author    Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-21 11:44:51 +0000
committer Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-21 11:44:51 +0000
commit    9e3c08db40b8916968b9f30096c7be3f00ce9647 (patch)
tree      a68f146d7fa01f0134297619fbe7e33db084e0aa /third_party/highway/hwy/tests/compress_test.cc
parent    Initial commit. (diff)
Adding upstream version 1:115.7.0.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/highway/hwy/tests/compress_test.cc')
-rw-r--r--  third_party/highway/hwy/tests/compress_test.cc | 833
1 file changed, 833 insertions(+), 0 deletions(-)
diff --git a/third_party/highway/hwy/tests/compress_test.cc b/third_party/highway/hwy/tests/compress_test.cc
new file mode 100644
index 0000000000..ae008b4dc4
--- /dev/null
+++ b/third_party/highway/hwy/tests/compress_test.cc
@@ -0,0 +1,833 @@
+// Copyright 2022 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h> // memset
+
+#include <array> // IWYU pragma: keep
+
+#include "hwy/base.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "tests/compress_test.cc"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
+#include "hwy/highway.h"
+#include "hwy/tests/test_util-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+// Set to 1 to print (regenerate) the tables used in the implementation
+// instead of running the tests.
+#define HWY_PRINT_TABLES 0
+
+#if !HWY_PRINT_TABLES || HWY_IDE
+
+template <class D, class DI, typename T = TFromD<D>, typename TI = TFromD<DI>>
+void CheckStored(D d, DI di, const char* op, size_t expected_pos,
+ size_t actual_pos, size_t num_to_check,
+ const AlignedFreeUniquePtr<T[]>& in,
+ const AlignedFreeUniquePtr<TI[]>& mask_lanes,
+ const AlignedFreeUniquePtr<T[]>& expected, const T* actual_u,
+ int line) {
+ if (expected_pos != actual_pos) {
+ hwy::Abort(__FILE__, line,
+ "%s: size mismatch for %s: expected %d, actual %d\n", op,
+ TypeName(T(), Lanes(d)).c_str(), static_cast<int>(expected_pos),
+ static_cast<int>(actual_pos));
+ }
+ // Modified from AssertVecEqual - we may not be checking all lanes.
+ for (size_t i = 0; i < num_to_check; ++i) {
+ if (!IsEqual(expected[i], actual_u[i])) {
+ const size_t N = Lanes(d);
+ fprintf(stderr, "%s: mismatch at i=%d of %d, line %d:\n\n", op,
+ static_cast<int>(i), static_cast<int>(num_to_check), line);
+ Print(di, "mask", Load(di, mask_lanes.get()), 0, N);
+ Print(d, "in", Load(d, in.get()), 0, N);
+ Print(d, "expect", Load(d, expected.get()), 0, num_to_check);
+ Print(d, "actual", Load(d, actual_u), 0, num_to_check);
+ HWY_ASSERT(false);
+ }
+ }
+}
+
+struct TestCompress {
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ RandomState rng;
+
+ using TI = MakeSigned<T>; // For mask > 0 comparison
+ using TU = MakeUnsigned<T>;
+ const Rebind<TI, D> di;
+ const size_t N = Lanes(d);
+
+ for (int frac : {0, 2, 3}) {
+ // For CompressStore
+ const size_t misalign = static_cast<size_t>(frac) * N / 4;
+
+ auto in_lanes = AllocateAligned<T>(N);
+ auto mask_lanes = AllocateAligned<TI>(N);
+ auto garbage = AllocateAligned<TU>(N);
+ auto expected = AllocateAligned<T>(N);
+ auto actual_a = AllocateAligned<T>(misalign + N);
+ T* actual_u = actual_a.get() + misalign;
+
+ const size_t bits_size = RoundUpTo((N + 7) / 8, 8);
+ auto bits = AllocateAligned<uint8_t>(bits_size);
+ memset(bits.get(), 0, bits_size); // for MSAN
+
+ // Each lane should have a chance of having mask=true.
+ for (size_t rep = 0; rep < AdjustedReps(200); ++rep) {
+ size_t expected_pos = 0;
+ for (size_t i = 0; i < N; ++i) {
+ const uint64_t r = Random32(&rng);
+ in_lanes[i] = T(); // cannot initialize float16_t directly.
+ CopyBytes<sizeof(T)>(&r, &in_lanes[i]); // not same size
+ mask_lanes[i] = (Random32(&rng) & 1024) ? TI(1) : TI(0);
+ if (mask_lanes[i] > 0) {
+ expected[expected_pos++] = in_lanes[i];
+ }
+ garbage[i] = static_cast<TU>(Random64(&rng));
+ }
+ size_t num_to_check;
+ if (CompressIsPartition<T>::value) {
+ // For non-native Compress, also check that mask=false lanes were
+ // moved to the back of the vector (highest indices).
+ size_t extra = expected_pos;
+ for (size_t i = 0; i < N; ++i) {
+ if (mask_lanes[i] == 0) {
+ expected[extra++] = in_lanes[i];
+ }
+ }
+ HWY_ASSERT(extra == N);
+ num_to_check = N;
+ } else {
+ // For native Compress, only the mask=true lanes are defined.
+ num_to_check = expected_pos;
+ }
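+      // For example, with N=4, in = {a,b,c,d} and mask = {1,0,1,0}:
+      // expected_pos is 2. A partition-style Compress is checked against all
+      // four lanes {a,c,b,d}, whereas otherwise only the first two compressed
+      // lanes {a,c} are checked.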
+
+ const auto in = Load(d, in_lanes.get());
+ const auto mask =
+ RebindMask(d, Gt(Load(di, mask_lanes.get()), Zero(di)));
+ StoreMaskBits(d, mask, bits.get());
+
+ // Compress
+ memset(actual_u, 0, N * sizeof(T));
+ StoreU(Compress(in, mask), d, actual_u);
+ CheckStored(d, di, "Compress", expected_pos, expected_pos, num_to_check,
+ in_lanes, mask_lanes, expected, actual_u, __LINE__);
+
+ // CompressNot
+ memset(actual_u, 0, N * sizeof(T));
+ StoreU(CompressNot(in, Not(mask)), d, actual_u);
+ CheckStored(d, di, "CompressNot", expected_pos, expected_pos,
+ num_to_check, in_lanes, mask_lanes, expected, actual_u,
+ __LINE__);
+
+ // CompressStore
+ memset(actual_u, 0, N * sizeof(T));
+ const size_t size1 = CompressStore(in, mask, d, actual_u);
+ // expected_pos instead of num_to_check because this op is not
+ // affected by CompressIsPartition.
+ CheckStored(d, di, "CompressStore", expected_pos, size1, expected_pos,
+ in_lanes, mask_lanes, expected, actual_u, __LINE__);
+
+ // CompressBlendedStore
+ memcpy(actual_u, garbage.get(), N * sizeof(T));
+ const size_t size2 = CompressBlendedStore(in, mask, d, actual_u);
+ // expected_pos instead of num_to_check because this op only writes
+ // the mask=true lanes.
+ CheckStored(d, di, "CompressBlendedStore", expected_pos, size2,
+ expected_pos, in_lanes, mask_lanes, expected, actual_u,
+ __LINE__);
+ // Subsequent lanes are untouched.
+ for (size_t i = size2; i < N; ++i) {
+#if HWY_COMPILER_MSVC && HWY_TARGET == HWY_AVX2
+ // TODO(eustas): re-enable when compiler is fixed
+#else
+ HWY_ASSERT_EQ(garbage[i], reinterpret_cast<TU*>(actual_u)[i]);
+#endif
+ }
+
+ // CompressBits
+ memset(actual_u, 0, N * sizeof(T));
+ StoreU(CompressBits(in, bits.get()), d, actual_u);
+ CheckStored(d, di, "CompressBits", expected_pos, expected_pos,
+ num_to_check, in_lanes, mask_lanes, expected, actual_u,
+ __LINE__);
+
+ // CompressBitsStore
+ memset(actual_u, 0, N * sizeof(T));
+ const size_t size3 = CompressBitsStore(in, bits.get(), d, actual_u);
+ // expected_pos instead of num_to_check because this op is not
+ // affected by CompressIsPartition.
+ CheckStored(d, di, "CompressBitsStore", expected_pos, size3,
+ expected_pos, in_lanes, mask_lanes, expected, actual_u,
+ __LINE__);
+ } // rep
+ } // frac
+ } // operator()
+};
+
+HWY_NOINLINE void TestAllCompress() {
+ ForAllTypes(ForPartialVectors<TestCompress>());
+}
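+
+// Illustrative usage sketch (not part of the test; the helper name is
+// hypothetical): stream compaction of positive lanes via CompressStore.
+// Assumes num is a multiple of Lanes(d) and out has room for num lanes,
+// because CompressStore may also overwrite lanes past the returned count.
+template <class D, typename T = TFromD<D>>
+size_t CopyIfPositive(D d, const T* HWY_RESTRICT in, size_t num,
+                      T* HWY_RESTRICT out) {
+  size_t out_pos = 0;
+  for (size_t i = 0; i < num; i += Lanes(d)) {
+    const auto v = LoadU(d, in + i);
+    // Packs the mask=true lanes to the front and returns how many there were.
+    out_pos += CompressStore(v, Gt(v, Zero(d)), d, out + out_pos);
+  }
+  return out_pos;
+}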
+
+struct TestCompressBlocks {
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+#if HWY_TARGET == HWY_SCALAR
+ (void)d;
+#else
+ static_assert(sizeof(T) == 8 && !IsSigned<T>(), "Should be u64");
+ RandomState rng;
+
+ using TI = MakeSigned<T>; // For mask > 0 comparison
+ const Rebind<TI, D> di;
+ const size_t N = Lanes(d);
+
+ auto in_lanes = AllocateAligned<T>(N);
+ auto mask_lanes = AllocateAligned<TI>(N);
+ auto expected = AllocateAligned<T>(N);
+ auto actual = AllocateAligned<T>(N);
+
+ // Each lane should have a chance of having mask=true.
+ for (size_t rep = 0; rep < AdjustedReps(200); ++rep) {
+ size_t expected_pos = 0;
+ for (size_t i = 0; i < N; i += 2) {
+ const uint64_t bits = Random32(&rng);
+ in_lanes[i + 1] = in_lanes[i] = T(); // cannot set float16_t directly.
+ CopyBytes<sizeof(T)>(&bits, &in_lanes[i]); // not same size
+ CopyBytes<sizeof(T)>(&bits, &in_lanes[i + 1]); // not same size
+ mask_lanes[i + 1] = mask_lanes[i] = TI{(Random32(&rng) & 8) ? 1 : 0};
+ if (mask_lanes[i] > 0) {
+ expected[expected_pos++] = in_lanes[i];
+ expected[expected_pos++] = in_lanes[i + 1];
+ }
+ }
+ size_t num_to_check;
+ if (CompressIsPartition<T>::value) {
+ // For non-native Compress, also check that mask=false lanes were
+ // moved to the back of the vector (highest indices).
+ size_t extra = expected_pos;
+ for (size_t i = 0; i < N; ++i) {
+ if (mask_lanes[i] == 0) {
+ expected[extra++] = in_lanes[i];
+ }
+ }
+ HWY_ASSERT(extra == N);
+ num_to_check = N;
+ } else {
+ // For native Compress, only the mask=true lanes are defined.
+ num_to_check = expected_pos;
+ }
+
+ const auto in = Load(d, in_lanes.get());
+ const auto mask = RebindMask(d, Gt(Load(di, mask_lanes.get()), Zero(di)));
+
+ // CompressBlocksNot
+ memset(actual.get(), 0, N * sizeof(T));
+ StoreU(CompressBlocksNot(in, Not(mask)), d, actual.get());
+ CheckStored(d, di, "CompressBlocksNot", expected_pos, expected_pos,
+ num_to_check, in_lanes, mask_lanes, expected, actual.get(),
+ __LINE__);
+ } // rep
+#endif // HWY_TARGET == HWY_SCALAR
+ } // operator()
+};
+
+HWY_NOINLINE void TestAllCompressBlocks() {
+ ForGE128Vectors<TestCompressBlocks>()(uint64_t());
+}
+
+#endif // !HWY_PRINT_TABLES
+
+#if HWY_PRINT_TABLES || HWY_IDE
+namespace detail { // for code folding
+
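+// For each of the 256 mask codes, prints the eight lane indices with the
+// mask=true lanes first (in ascending order), then the mask=false lanes.
+// E.g. code 0b00000101 prints 0,2,1,3,4,5,6,7.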
+void PrintCompress8x8Tables() {
+ printf("======================================= 8x8\n");
+ constexpr size_t N = 8;
+ for (uint64_t code = 0; code < (1ull << N); ++code) {
+ std::array<uint8_t, N> indices{0};
+ size_t pos = 0;
+ // All lanes where mask = true
+ for (size_t i = 0; i < N; ++i) {
+ if (code & (1ull << i)) {
+ indices[pos++] = i;
+ }
+ }
+ // All lanes where mask = false
+ for (size_t i = 0; i < N; ++i) {
+ if (!(code & (1ull << i))) {
+ indices[pos++] = i;
+ }
+ }
+ HWY_ASSERT(pos == N);
+
+ for (size_t i = 0; i < N; ++i) {
+ printf("%d,", indices[i]);
+ }
+ printf(code & 1 ? "//\n" : "/**/");
+ }
+ printf("\n");
+}
+
+void PrintCompress16x8Tables() {
+ printf("======================================= 16x8\n");
+ constexpr size_t N = 8; // 128-bit SIMD
+ for (uint64_t code = 0; code < (1ull << N); ++code) {
+ std::array<uint8_t, N> indices{0};
+ size_t pos = 0;
+ // All lanes where mask = true
+ for (size_t i = 0; i < N; ++i) {
+ if (code & (1ull << i)) {
+ indices[pos++] = i;
+ }
+ }
+ // All lanes where mask = false
+ for (size_t i = 0; i < N; ++i) {
+ if (!(code & (1ull << i))) {
+ indices[pos++] = i;
+ }
+ }
+ HWY_ASSERT(pos == N);
+
+ // Doubled (for converting lane to byte indices)
+ for (size_t i = 0; i < N; ++i) {
+ printf("%d,", 2 * indices[i]);
+ }
+ printf(code & 1 ? "//\n" : "/**/");
+ }
+ printf("\n");
+}
+
+void PrintCompressNot16x8Tables() {
+ printf("======================================= Not 16x8\n");
+ constexpr size_t N = 8; // 128-bit SIMD
+ for (uint64_t not_code = 0; not_code < (1ull << N); ++not_code) {
+ const uint64_t code = ~not_code;
+ std::array<uint8_t, N> indices{0};
+ size_t pos = 0;
+ // All lanes where mask = true
+ for (size_t i = 0; i < N; ++i) {
+ if (code & (1ull << i)) {
+ indices[pos++] = i;
+ }
+ }
+ // All lanes where mask = false
+ for (size_t i = 0; i < N; ++i) {
+ if (!(code & (1ull << i))) {
+ indices[pos++] = i;
+ }
+ }
+ HWY_ASSERT(pos == N);
+
+ // Doubled (for converting lane to byte indices)
+ for (size_t i = 0; i < N; ++i) {
+ printf("%d,", 2 * indices[i]);
+ }
+ printf(not_code & 1 ? "//\n" : "/**/");
+ }
+ printf("\n");
+}
+
+// Compressed to nibbles, unpacked via variable right shift. Also includes
+// FirstN bits in the nibble MSB.
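+// E.g. for code 0b00000101 (count=2) the lane order is 0,2,1,3,4,5,6,7; the
+// first two (FirstN) nibbles get bit 3 (value 8) set, so the packed entry is
+// printed as 0x765431a8.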
+void PrintCompress32x8Tables() {
+ printf("======================================= 32/64x8\n");
+ constexpr size_t N = 8; // AVX2 or 64-bit AVX3
+ for (uint64_t code = 0; code < (1ull << N); ++code) {
+ const size_t count = PopCount(code);
+ std::array<uint32_t, N> indices{0};
+ size_t pos = 0;
+ // All lanes where mask = true
+ for (size_t i = 0; i < N; ++i) {
+ if (code & (1ull << i)) {
+ indices[pos++] = i;
+ }
+ }
+ // All lanes where mask = false
+ for (size_t i = 0; i < N; ++i) {
+ if (!(code & (1ull << i))) {
+ indices[pos++] = i;
+ }
+ }
+ HWY_ASSERT(pos == N);
+
+ // Convert to nibbles
+ uint64_t packed = 0;
+ for (size_t i = 0; i < N; ++i) {
+ HWY_ASSERT(indices[i] < N);
+ if (i < count) {
+ indices[i] |= N;
+ HWY_ASSERT(indices[i] < 0x10);
+ }
+ packed += indices[i] << (i * 4);
+ }
+
+ HWY_ASSERT(packed < (1ull << (N * 4)));
+ printf("0x%08x,", static_cast<uint32_t>(packed));
+ }
+ printf("\n");
+}
+
+void PrintCompressNot32x8Tables() {
+ printf("======================================= Not 32/64x8\n");
+ constexpr size_t N = 8; // AVX2 or 64-bit AVX3
+ for (uint64_t not_code = 0; not_code < (1ull << N); ++not_code) {
+ const uint64_t code = ~not_code;
+ const size_t count = PopCount(code);
+ std::array<uint32_t, N> indices{0};
+ size_t pos = 0;
+ // All lanes where mask = true
+ for (size_t i = 0; i < N; ++i) {
+ if (code & (1ull << i)) {
+ indices[pos++] = i;
+ }
+ }
+ // All lanes where mask = false
+ for (size_t i = 0; i < N; ++i) {
+ if (!(code & (1ull << i))) {
+ indices[pos++] = i;
+ }
+ }
+ HWY_ASSERT(pos == N);
+
+ // Convert to nibbles
+ uint64_t packed = 0;
+ for (size_t i = 0; i < N; ++i) {
+ HWY_ASSERT(indices[i] < N);
+ if (i < count) {
+ indices[i] |= N;
+ HWY_ASSERT(indices[i] < 0x10);
+ }
+ packed += indices[i] << (i * 4);
+ }
+
+ HWY_ASSERT(packed < (1ull << (N * 4)));
+ printf("0x%08x,", static_cast<uint32_t>(packed));
+ }
+ printf("\n");
+}
+
+// Compressed to nibbles (for AVX3 64x4)
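+// E.g. for code 0b0101 the lane order is 0,2,1,3 and the entry is printed as
+// 0x00003120 (no FirstN bit here).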
+void PrintCompress64x4NibbleTables() {
+ printf("======================================= 64x4Nibble\n");
+ constexpr size_t N = 4; // AVX2
+ for (uint64_t code = 0; code < (1ull << N); ++code) {
+ std::array<uint32_t, N> indices{0};
+ size_t pos = 0;
+ // All lanes where mask = true
+ for (size_t i = 0; i < N; ++i) {
+ if (code & (1ull << i)) {
+ indices[pos++] = i;
+ }
+ }
+ // All lanes where mask = false
+ for (size_t i = 0; i < N; ++i) {
+ if (!(code & (1ull << i))) {
+ indices[pos++] = i;
+ }
+ }
+ HWY_ASSERT(pos == N);
+
+ // Convert to nibbles
+ uint64_t packed = 0;
+ for (size_t i = 0; i < N; ++i) {
+ HWY_ASSERT(indices[i] < N);
+ packed += indices[i] << (i * 4);
+ }
+
+ HWY_ASSERT(packed < (1ull << (N * 4)));
+ printf("0x%08x,", static_cast<uint32_t>(packed));
+ }
+ printf("\n");
+}
+
+void PrintCompressNot64x4NibbleTables() {
+ printf("======================================= Not 64x4Nibble\n");
+ constexpr size_t N = 4; // AVX2
+ for (uint64_t not_code = 0; not_code < (1ull << N); ++not_code) {
+ const uint64_t code = ~not_code;
+ std::array<uint32_t, N> indices{0};
+ size_t pos = 0;
+ // All lanes where mask = true
+ for (size_t i = 0; i < N; ++i) {
+ if (code & (1ull << i)) {
+ indices[pos++] = i;
+ }
+ }
+ // All lanes where mask = false
+ for (size_t i = 0; i < N; ++i) {
+ if (!(code & (1ull << i))) {
+ indices[pos++] = i;
+ }
+ }
+ HWY_ASSERT(pos == N);
+
+ // Convert to nibbles
+ uint64_t packed = 0;
+ for (size_t i = 0; i < N; ++i) {
+ HWY_ASSERT(indices[i] < N);
+ packed += indices[i] << (i * 4);
+ }
+
+ HWY_ASSERT(packed < (1ull << (N * 4)));
+ printf("0x%08x,", static_cast<uint32_t>(packed));
+ }
+ printf("\n");
+}
+
+void PrintCompressNot64x2NibbleTables() {
+ printf("======================================= Not 64x2Nibble\n");
+ constexpr size_t N = 2; // 128-bit
+ for (uint64_t not_code = 0; not_code < (1ull << N); ++not_code) {
+ const uint64_t code = ~not_code;
+ std::array<uint32_t, N> indices{0};
+ size_t pos = 0;
+ // All lanes where mask = true
+ for (size_t i = 0; i < N; ++i) {
+ if (code & (1ull << i)) {
+ indices[pos++] = i;
+ }
+ }
+ // All lanes where mask = false
+ for (size_t i = 0; i < N; ++i) {
+ if (!(code & (1ull << i))) {
+ indices[pos++] = i;
+ }
+ }
+ HWY_ASSERT(pos == N);
+
+ // Convert to nibbles
+ uint64_t packed = 0;
+ for (size_t i = 0; i < N; ++i) {
+ HWY_ASSERT(indices[i] < N);
+ packed += indices[i] << (i * 4);
+ }
+
+ HWY_ASSERT(packed < (1ull << (N * 4)));
+ printf("0x%08x,", static_cast<uint32_t>(packed));
+ }
+ printf("\n");
+}
+
+void PrintCompress64x4Tables() {
+ printf("======================================= 64x4 uncompressed\n");
+ constexpr size_t N = 4; // SVE_256
+ for (uint64_t code = 0; code < (1ull << N); ++code) {
+ std::array<size_t, N> indices{0};
+ size_t pos = 0;
+ // All lanes where mask = true
+ for (size_t i = 0; i < N; ++i) {
+ if (code & (1ull << i)) {
+ indices[pos++] = i;
+ }
+ }
+ // All lanes where mask = false
+ for (size_t i = 0; i < N; ++i) {
+ if (!(code & (1ull << i))) {
+ indices[pos++] = i;
+ }
+ }
+ HWY_ASSERT(pos == N);
+
+ // Store uncompressed indices because SVE TBL returns 0 if an index is out
+ // of bounds. On AVX3 we simply variable-shift because permute indices are
+ // interpreted modulo N. Compression is not worth the extra shift+AND
+ // because the table is anyway only 512 bytes.
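+    // E.g. for code 0b0101 this prints the plain lane order 0,2,1,3.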
+ for (size_t i = 0; i < N; ++i) {
+ printf("%d,", static_cast<int>(indices[i]));
+ }
+ }
+ printf("\n");
+}
+
+void PrintCompressNot64x4Tables() {
+ printf("======================================= Not 64x4 uncompressed\n");
+ constexpr size_t N = 4; // SVE_256
+ for (uint64_t not_code = 0; not_code < (1ull << N); ++not_code) {
+ const uint64_t code = ~not_code;
+ std::array<size_t, N> indices{0};
+ size_t pos = 0;
+ // All lanes where mask = true
+ for (size_t i = 0; i < N; ++i) {
+ if (code & (1ull << i)) {
+ indices[pos++] = i;
+ }
+ }
+ // All lanes where mask = false
+ for (size_t i = 0; i < N; ++i) {
+ if (!(code & (1ull << i))) {
+ indices[pos++] = i;
+ }
+ }
+ HWY_ASSERT(pos == N);
+
+ // Store uncompressed indices because SVE TBL returns 0 if an index is out
+ // of bounds. On AVX3 we simply variable-shift because permute indices are
+ // interpreted modulo N. Compression is not worth the extra shift+AND
+ // because the table is anyway only 512 bytes.
+ for (size_t i = 0; i < N; ++i) {
+ printf("%d,", static_cast<int>(indices[i]));
+ }
+ }
+ printf("\n");
+}
+
+// Same as above, but prints pairs of u32 indices (for AVX2). Also includes
+// FirstN bits in the nibble MSB.
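+// E.g. for code 0b0011 (count=2) the lane order is 0,1,2,3 and the printed
+// u32 index pairs are 8,9, 10,11, 4,5, 6,7 (the first two pairs carry the
+// FirstN bit, i.e. +8).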
+void PrintCompress64x4PairTables() {
+ printf("======================================= 64x4 u32 index\n");
+ constexpr size_t N = 4; // AVX2
+ for (uint64_t code = 0; code < (1ull << N); ++code) {
+ const size_t count = PopCount(code);
+ std::array<size_t, N> indices{0};
+ size_t pos = 0;
+ // All lanes where mask = true
+ for (size_t i = 0; i < N; ++i) {
+ if (code & (1ull << i)) {
+ indices[pos++] = i;
+ }
+ }
+ // All lanes where mask = false
+ for (size_t i = 0; i < N; ++i) {
+ if (!(code & (1ull << i))) {
+ indices[pos++] = i;
+ }
+ }
+ HWY_ASSERT(pos == N);
+
+ // Store uncompressed indices because SVE TBL returns 0 if an index is out
+ // of bounds. On AVX3 we simply variable-shift because permute indices are
+ // interpreted modulo N. Compression is not worth the extra shift+AND
+ // because the table is anyway only 512 bytes.
+ for (size_t i = 0; i < N; ++i) {
+ const int first_n_bit = i < count ? 8 : 0;
+ const int low = static_cast<int>(2 * indices[i]) + first_n_bit;
+ HWY_ASSERT(low < 0x10);
+ printf("%d, %d, ", low, low + 1);
+ }
+ }
+ printf("\n");
+}
+
+void PrintCompressNot64x4PairTables() {
+ printf("======================================= Not 64x4 u32 index\n");
+ constexpr size_t N = 4; // AVX2
+ for (uint64_t not_code = 0; not_code < (1ull << N); ++not_code) {
+ const uint64_t code = ~not_code;
+ const size_t count = PopCount(code);
+ std::array<size_t, N> indices{0};
+ size_t pos = 0;
+ // All lanes where mask = true
+ for (size_t i = 0; i < N; ++i) {
+ if (code & (1ull << i)) {
+ indices[pos++] = i;
+ }
+ }
+ // All lanes where mask = false
+ for (size_t i = 0; i < N; ++i) {
+ if (!(code & (1ull << i))) {
+ indices[pos++] = i;
+ }
+ }
+ HWY_ASSERT(pos == N);
+
+ // Store uncompressed indices because SVE TBL returns 0 if an index is out
+ // of bounds. On AVX3 we simply variable-shift because permute indices are
+ // interpreted modulo N. Compression is not worth the extra shift+AND
+ // because the table is anyway only 512 bytes.
+ for (size_t i = 0; i < N; ++i) {
+ const int first_n_bit = i < count ? 8 : 0;
+ const int low = static_cast<int>(2 * indices[i]) + first_n_bit;
+ HWY_ASSERT(low < 0x10);
+ printf("%d, %d, ", low, low + 1);
+ }
+ }
+ printf("\n");
+}
+
+// 4-tuple of byte indices
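+// E.g. for code 0b0101 the lane order is 0,2,1,3, so the printed byte indices
+// are 0,1,2,3, 8,9,10,11, 4,5,6,7, 12,13,14,15.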
+void PrintCompress32x4Tables() {
+ printf("======================================= 32x4\n");
+ using T = uint32_t;
+ constexpr size_t N = 4; // SSE4
+ for (uint64_t code = 0; code < (1ull << N); ++code) {
+ std::array<uint32_t, N> indices{0};
+ size_t pos = 0;
+ // All lanes where mask = true
+ for (size_t i = 0; i < N; ++i) {
+ if (code & (1ull << i)) {
+ indices[pos++] = i;
+ }
+ }
+ // All lanes where mask = false
+ for (size_t i = 0; i < N; ++i) {
+ if (!(code & (1ull << i))) {
+ indices[pos++] = i;
+ }
+ }
+ HWY_ASSERT(pos == N);
+
+ for (size_t i = 0; i < N; ++i) {
+ for (size_t idx_byte = 0; idx_byte < sizeof(T); ++idx_byte) {
+ printf("%d,", static_cast<int>(sizeof(T) * indices[i] + idx_byte));
+ }
+ }
+ }
+ printf("\n");
+}
+
+void PrintCompressNot32x4Tables() {
+ printf("======================================= Not 32x4\n");
+ using T = uint32_t;
+ constexpr size_t N = 4; // SSE4
+ for (uint64_t not_code = 0; not_code < (1ull << N); ++not_code) {
+ const uint64_t code = ~not_code;
+ std::array<uint32_t, N> indices{0};
+ size_t pos = 0;
+ // All lanes where mask = true
+ for (size_t i = 0; i < N; ++i) {
+ if (code & (1ull << i)) {
+ indices[pos++] = i;
+ }
+ }
+ // All lanes where mask = false
+ for (size_t i = 0; i < N; ++i) {
+ if (!(code & (1ull << i))) {
+ indices[pos++] = i;
+ }
+ }
+ HWY_ASSERT(pos == N);
+
+ for (size_t i = 0; i < N; ++i) {
+ for (size_t idx_byte = 0; idx_byte < sizeof(T); ++idx_byte) {
+ printf("%d,", static_cast<int>(sizeof(T) * indices[i] + idx_byte));
+ }
+ }
+ }
+ printf("\n");
+}
+
+// 8-tuple of byte indices
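+// E.g. for code 0b10 the lane order is 1,0, so the printed byte indices are
+// 8..15 followed by 0..7.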
+void PrintCompress64x2Tables() {
+ printf("======================================= 64x2\n");
+ using T = uint64_t;
+ constexpr size_t N = 2; // SSE4
+ for (uint64_t code = 0; code < (1ull << N); ++code) {
+ std::array<uint32_t, N> indices{0};
+ size_t pos = 0;
+ // All lanes where mask = true
+ for (size_t i = 0; i < N; ++i) {
+ if (code & (1ull << i)) {
+ indices[pos++] = i;
+ }
+ }
+ // All lanes where mask = false
+ for (size_t i = 0; i < N; ++i) {
+ if (!(code & (1ull << i))) {
+ indices[pos++] = i;
+ }
+ }
+ HWY_ASSERT(pos == N);
+
+ for (size_t i = 0; i < N; ++i) {
+ for (size_t idx_byte = 0; idx_byte < sizeof(T); ++idx_byte) {
+ printf("%d,", static_cast<int>(sizeof(T) * indices[i] + idx_byte));
+ }
+ }
+ }
+ printf("\n");
+}
+
+void PrintCompressNot64x2Tables() {
+ printf("======================================= Not 64x2\n");
+ using T = uint64_t;
+ constexpr size_t N = 2; // SSE4
+ for (uint64_t not_code = 0; not_code < (1ull << N); ++not_code) {
+ const uint64_t code = ~not_code;
+ std::array<uint32_t, N> indices{0};
+ size_t pos = 0;
+ // All lanes where mask = true
+ for (size_t i = 0; i < N; ++i) {
+ if (code & (1ull << i)) {
+ indices[pos++] = i;
+ }
+ }
+ // All lanes where mask = false
+ for (size_t i = 0; i < N; ++i) {
+ if (!(code & (1ull << i))) {
+ indices[pos++] = i;
+ }
+ }
+ HWY_ASSERT(pos == N);
+
+ for (size_t i = 0; i < N; ++i) {
+ for (size_t idx_byte = 0; idx_byte < sizeof(T); ++idx_byte) {
+ printf("%d,", static_cast<int>(sizeof(T) * indices[i] + idx_byte));
+ }
+ }
+ }
+ printf("\n");
+}
+
+} // namespace detail
+
+HWY_NOINLINE void PrintTables() {
+ // Only print once.
+#if HWY_TARGET == HWY_STATIC_TARGET
+ detail::PrintCompress32x8Tables();
+ detail::PrintCompressNot32x8Tables();
+ detail::PrintCompress64x4NibbleTables();
+ detail::PrintCompressNot64x4NibbleTables();
+ detail::PrintCompressNot64x2NibbleTables();
+ detail::PrintCompress64x4Tables();
+ detail::PrintCompressNot64x4Tables();
+ detail::PrintCompress32x4Tables();
+ detail::PrintCompressNot32x4Tables();
+ detail::PrintCompress64x2Tables();
+ detail::PrintCompressNot64x2Tables();
+ detail::PrintCompress64x4PairTables();
+ detail::PrintCompressNot64x4PairTables();
+ detail::PrintCompress16x8Tables();
+ detail::PrintCompress8x8Tables();
+ detail::PrintCompressNot16x8Tables();
+#endif
+}
+
+#endif // HWY_PRINT_TABLES
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace hwy {
+HWY_BEFORE_TEST(HwyCompressTest);
+#if HWY_PRINT_TABLES
+// Only print instead of running tests; this will be visible in the log.
+HWY_EXPORT_AND_TEST_P(HwyCompressTest, PrintTables);
+#else
+HWY_EXPORT_AND_TEST_P(HwyCompressTest, TestAllCompress);
+HWY_EXPORT_AND_TEST_P(HwyCompressTest, TestAllCompressBlocks);
+#endif
+} // namespace hwy
+
+#endif