// Copyright 2022 Google LLC // SPDX-License-Identifier: Apache-2.0 // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include #include #include // memset #include // IWYU pragma: keep #include "hwy/base.h" #undef HWY_TARGET_INCLUDE #define HWY_TARGET_INCLUDE "tests/compress_test.cc" #include "hwy/foreach_target.h" // IWYU pragma: keep #include "hwy/highway.h" #include "hwy/tests/test_util-inl.h" HWY_BEFORE_NAMESPACE(); namespace hwy { namespace HWY_NAMESPACE { // Regenerate tables used in the implementation, instead of testing. #define HWY_PRINT_TABLES 0 #if !HWY_PRINT_TABLES || HWY_IDE template , typename TI = TFromD> void CheckStored(D d, DI di, const char* op, size_t expected_pos, size_t actual_pos, size_t num_to_check, const AlignedFreeUniquePtr& in, const AlignedFreeUniquePtr& mask_lanes, const AlignedFreeUniquePtr& expected, const T* actual_u, int line) { if (expected_pos != actual_pos) { hwy::Abort(__FILE__, line, "%s: size mismatch for %s: expected %d, actual %d\n", op, TypeName(T(), Lanes(d)).c_str(), static_cast(expected_pos), static_cast(actual_pos)); } // Modified from AssertVecEqual - we may not be checking all lanes. for (size_t i = 0; i < num_to_check; ++i) { if (!IsEqual(expected[i], actual_u[i])) { const size_t N = Lanes(d); fprintf(stderr, "%s: mismatch at i=%d of %d, line %d:\n\n", op, static_cast(i), static_cast(num_to_check), line); Print(di, "mask", Load(di, mask_lanes.get()), 0, N); Print(d, "in", Load(d, in.get()), 0, N); Print(d, "expect", Load(d, expected.get()), 0, N); Print(d, "actual", Load(d, actual_u), 0, N); HWY_ASSERT(false); } } } struct TestCompress { template HWY_NOINLINE void operator()(T /*unused*/, D d) { RandomState rng; using TI = MakeSigned; // For mask > 0 comparison const Rebind di; const size_t N = Lanes(d); const T zero{0}; for (int frac : {0, 2, 3}) { // For CompressStore const size_t misalign = static_cast(frac) * N / 4; auto in_lanes = AllocateAligned(N); auto mask_lanes = AllocateAligned(N); auto expected = AllocateAligned(N); auto actual_a = AllocateAligned(misalign + N); T* actual_u = actual_a.get() + misalign; const size_t bits_size = RoundUpTo((N + 7) / 8, 8); auto bits = AllocateAligned(bits_size); memset(bits.get(), 0, bits_size); // for MSAN // Each lane should have a chance of having mask=true. for (size_t rep = 0; rep < AdjustedReps(200); ++rep) { size_t expected_pos = 0; for (size_t i = 0; i < N; ++i) { const uint64_t r = Random32(&rng); in_lanes[i] = T(); // cannot initialize float16_t directly. CopyBytes(&r, &in_lanes[i]); // not same size mask_lanes[i] = (Random32(&rng) & 1024) ? TI(1) : TI(0); if (mask_lanes[i] > 0) { expected[expected_pos++] = in_lanes[i]; } } size_t num_to_check; if (CompressIsPartition::value) { // For non-native Compress, also check that mask=false lanes were // moved to the back of the vector (highest indices). size_t extra = expected_pos; for (size_t i = 0; i < N; ++i) { if (mask_lanes[i] == 0) { expected[extra++] = in_lanes[i]; } } HWY_ASSERT(extra == N); num_to_check = N; } else { // For native Compress, only the mask=true lanes are defined. num_to_check = expected_pos; } const auto in = Load(d, in_lanes.get()); const auto mask = RebindMask(d, Gt(Load(di, mask_lanes.get()), Zero(di))); StoreMaskBits(d, mask, bits.get()); // Compress memset(actual_u, 0, N * sizeof(T)); StoreU(Compress(in, mask), d, actual_u); CheckStored(d, di, "Compress", expected_pos, expected_pos, num_to_check, in_lanes, mask_lanes, expected, actual_u, __LINE__); // CompressNot memset(actual_u, 0, N * sizeof(T)); StoreU(CompressNot(in, Not(mask)), d, actual_u); CheckStored(d, di, "CompressNot", expected_pos, expected_pos, num_to_check, in_lanes, mask_lanes, expected, actual_u, __LINE__); // CompressStore memset(actual_u, 0, N * sizeof(T)); const size_t size1 = CompressStore(in, mask, d, actual_u); // expected_pos instead of num_to_check because this op is not // affected by CompressIsPartition. CheckStored(d, di, "CompressStore", expected_pos, size1, expected_pos, in_lanes, mask_lanes, expected, actual_u, __LINE__); // CompressBlendedStore memset(actual_u, 0, N * sizeof(T)); const size_t size2 = CompressBlendedStore(in, mask, d, actual_u); // expected_pos instead of num_to_check because this op only writes // the mask=true lanes. CheckStored(d, di, "CompressBlendedStore", expected_pos, size2, expected_pos, in_lanes, mask_lanes, expected, actual_u, __LINE__); // Subsequent lanes are untouched. for (size_t i = size2; i < N; ++i) { HWY_ASSERT_EQ(zero, actual_u[i]); } // CompressBits memset(actual_u, 0, N * sizeof(T)); StoreU(CompressBits(in, bits.get()), d, actual_u); CheckStored(d, di, "CompressBits", expected_pos, expected_pos, num_to_check, in_lanes, mask_lanes, expected, actual_u, __LINE__); // CompressBitsStore memset(actual_u, 0, N * sizeof(T)); const size_t size3 = CompressBitsStore(in, bits.get(), d, actual_u); // expected_pos instead of num_to_check because this op is not // affected by CompressIsPartition. CheckStored(d, di, "CompressBitsStore", expected_pos, size3, expected_pos, in_lanes, mask_lanes, expected, actual_u, __LINE__); } // rep } // frac } // operator() }; HWY_NOINLINE void TestAllCompress() { ForUIF163264(ForPartialVectors()); } struct TestCompressBlocks { template HWY_NOINLINE void operator()(T /*unused*/, D d) { #if HWY_TARGET == HWY_SCALAR (void)d; #else static_assert(sizeof(T) == 8 && !IsSigned(), "Should be u64"); RandomState rng; using TI = MakeSigned; // For mask > 0 comparison const Rebind di; const size_t N = Lanes(d); auto in_lanes = AllocateAligned(N); auto mask_lanes = AllocateAligned(N); auto expected = AllocateAligned(N); auto actual = AllocateAligned(N); // Each lane should have a chance of having mask=true. for (size_t rep = 0; rep < AdjustedReps(200); ++rep) { size_t expected_pos = 0; for (size_t i = 0; i < N; i += 2) { const uint64_t bits = Random32(&rng); in_lanes[i + 1] = in_lanes[i] = T(); // cannot set float16_t directly. CopyBytes(&bits, &in_lanes[i]); // not same size CopyBytes(&bits, &in_lanes[i + 1]); // not same size mask_lanes[i + 1] = mask_lanes[i] = TI{(Random32(&rng) & 8) ? 1 : 0}; if (mask_lanes[i] > 0) { expected[expected_pos++] = in_lanes[i]; expected[expected_pos++] = in_lanes[i + 1]; } } size_t num_to_check; if (CompressIsPartition::value) { // For non-native Compress, also check that mask=false lanes were // moved to the back of the vector (highest indices). size_t extra = expected_pos; for (size_t i = 0; i < N; ++i) { if (mask_lanes[i] == 0) { expected[extra++] = in_lanes[i]; } } HWY_ASSERT(extra == N); num_to_check = N; } else { // For native Compress, only the mask=true lanes are defined. num_to_check = expected_pos; } const auto in = Load(d, in_lanes.get()); const auto mask = RebindMask(d, Gt(Load(di, mask_lanes.get()), Zero(di))); // CompressBlocksNot memset(actual.get(), 0, N * sizeof(T)); StoreU(CompressBlocksNot(in, Not(mask)), d, actual.get()); CheckStored(d, di, "CompressBlocksNot", expected_pos, expected_pos, num_to_check, in_lanes, mask_lanes, expected, actual.get(), __LINE__); } // rep #endif // HWY_TARGET == HWY_SCALAR } // operator() }; HWY_NOINLINE void TestAllCompressBlocks() { ForGE128Vectors()(uint64_t()); } #endif // !HWY_PRINT_TABLES #if HWY_PRINT_TABLES || HWY_IDE namespace detail { // for code folding void PrintCompress16x8Tables() { printf("======================================= 16x8\n"); constexpr size_t N = 8; // 128-bit SIMD for (uint64_t code = 0; code < (1ull << N); ++code) { std::array indices{0}; size_t pos = 0; // All lanes where mask = true for (size_t i = 0; i < N; ++i) { if (code & (1ull << i)) { indices[pos++] = i; } } // All lanes where mask = false for (size_t i = 0; i < N; ++i) { if (!(code & (1ull << i))) { indices[pos++] = i; } } HWY_ASSERT(pos == N); // Doubled (for converting lane to byte indices) for (size_t i = 0; i < N; ++i) { printf("%d,", 2 * indices[i]); } printf(code & 1 ? "//\n" : "/**/"); } printf("\n"); } void PrintCompressNot16x8Tables() { printf("======================================= Not 16x8\n"); constexpr size_t N = 8; // 128-bit SIMD for (uint64_t not_code = 0; not_code < (1ull << N); ++not_code) { const uint64_t code = ~not_code; std::array indices{0}; size_t pos = 0; // All lanes where mask = true for (size_t i = 0; i < N; ++i) { if (code & (1ull << i)) { indices[pos++] = i; } } // All lanes where mask = false for (size_t i = 0; i < N; ++i) { if (!(code & (1ull << i))) { indices[pos++] = i; } } HWY_ASSERT(pos == N); // Doubled (for converting lane to byte indices) for (size_t i = 0; i < N; ++i) { printf("%d,", 2 * indices[i]); } printf(not_code & 1 ? "//\n" : "/**/"); } printf("\n"); } // Compressed to nibbles, unpacked via variable right shift. Also includes // FirstN bits in the nibble MSB. void PrintCompress32x8Tables() { printf("======================================= 32/64x8\n"); constexpr size_t N = 8; // AVX2 or 64-bit AVX3 for (uint64_t code = 0; code < (1ull << N); ++code) { const size_t count = PopCount(code); std::array indices{0}; size_t pos = 0; // All lanes where mask = true for (size_t i = 0; i < N; ++i) { if (code & (1ull << i)) { indices[pos++] = i; } } // All lanes where mask = false for (size_t i = 0; i < N; ++i) { if (!(code & (1ull << i))) { indices[pos++] = i; } } HWY_ASSERT(pos == N); // Convert to nibbles uint64_t packed = 0; for (size_t i = 0; i < N; ++i) { HWY_ASSERT(indices[i] < N); if (i < count) { indices[i] |= N; HWY_ASSERT(indices[i] < 0x10); } packed += indices[i] << (i * 4); } HWY_ASSERT(packed < (1ull << (N * 4))); printf("0x%08x,", static_cast(packed)); } printf("\n"); } void PrintCompressNot32x8Tables() { printf("======================================= Not 32/64x8\n"); constexpr size_t N = 8; // AVX2 or 64-bit AVX3 for (uint64_t not_code = 0; not_code < (1ull << N); ++not_code) { const uint64_t code = ~not_code; const size_t count = PopCount(code); std::array indices{0}; size_t pos = 0; // All lanes where mask = true for (size_t i = 0; i < N; ++i) { if (code & (1ull << i)) { indices[pos++] = i; } } // All lanes where mask = false for (size_t i = 0; i < N; ++i) { if (!(code & (1ull << i))) { indices[pos++] = i; } } HWY_ASSERT(pos == N); // Convert to nibbles uint64_t packed = 0; for (size_t i = 0; i < N; ++i) { HWY_ASSERT(indices[i] < N); if (i < count) { indices[i] |= N; HWY_ASSERT(indices[i] < 0x10); } packed += indices[i] << (i * 4); } HWY_ASSERT(packed < (1ull << (N * 4))); printf("0x%08x,", static_cast(packed)); } printf("\n"); } // Compressed to nibbles (for AVX3 64x4) void PrintCompress64x4NibbleTables() { printf("======================================= 64x4Nibble\n"); constexpr size_t N = 4; // AVX2 for (uint64_t code = 0; code < (1ull << N); ++code) { std::array indices{0}; size_t pos = 0; // All lanes where mask = true for (size_t i = 0; i < N; ++i) { if (code & (1ull << i)) { indices[pos++] = i; } } // All lanes where mask = false for (size_t i = 0; i < N; ++i) { if (!(code & (1ull << i))) { indices[pos++] = i; } } HWY_ASSERT(pos == N); // Convert to nibbles uint64_t packed = 0; for (size_t i = 0; i < N; ++i) { HWY_ASSERT(indices[i] < N); packed += indices[i] << (i * 4); } HWY_ASSERT(packed < (1ull << (N * 4))); printf("0x%08x,", static_cast(packed)); } printf("\n"); } void PrintCompressNot64x4NibbleTables() { printf("======================================= Not 64x4Nibble\n"); constexpr size_t N = 4; // AVX2 for (uint64_t not_code = 0; not_code < (1ull << N); ++not_code) { const uint64_t code = ~not_code; std::array indices{0}; size_t pos = 0; // All lanes where mask = true for (size_t i = 0; i < N; ++i) { if (code & (1ull << i)) { indices[pos++] = i; } } // All lanes where mask = false for (size_t i = 0; i < N; ++i) { if (!(code & (1ull << i))) { indices[pos++] = i; } } HWY_ASSERT(pos == N); // Convert to nibbles uint64_t packed = 0; for (size_t i = 0; i < N; ++i) { HWY_ASSERT(indices[i] < N); packed += indices[i] << (i * 4); } HWY_ASSERT(packed < (1ull << (N * 4))); printf("0x%08x,", static_cast(packed)); } printf("\n"); } void PrintCompress64x4Tables() { printf("======================================= 64x4 uncompressed\n"); constexpr size_t N = 4; // SVE_256 for (uint64_t code = 0; code < (1ull << N); ++code) { std::array indices{0}; size_t pos = 0; // All lanes where mask = true for (size_t i = 0; i < N; ++i) { if (code & (1ull << i)) { indices[pos++] = i; } } // All lanes where mask = false for (size_t i = 0; i < N; ++i) { if (!(code & (1ull << i))) { indices[pos++] = i; } } HWY_ASSERT(pos == N); // Store uncompressed indices because SVE TBL returns 0 if an index is out // of bounds. On AVX3 we simply variable-shift because permute indices are // interpreted modulo N. Compression is not worth the extra shift+AND // because the table is anyway only 512 bytes. for (size_t i = 0; i < N; ++i) { printf("%d,", static_cast(indices[i])); } } printf("\n"); } void PrintCompressNot64x4Tables() { printf("======================================= Not 64x4 uncompressed\n"); constexpr size_t N = 4; // SVE_256 for (uint64_t not_code = 0; not_code < (1ull << N); ++not_code) { const uint64_t code = ~not_code; std::array indices{0}; size_t pos = 0; // All lanes where mask = true for (size_t i = 0; i < N; ++i) { if (code & (1ull << i)) { indices[pos++] = i; } } // All lanes where mask = false for (size_t i = 0; i < N; ++i) { if (!(code & (1ull << i))) { indices[pos++] = i; } } HWY_ASSERT(pos == N); // Store uncompressed indices because SVE TBL returns 0 if an index is out // of bounds. On AVX3 we simply variable-shift because permute indices are // interpreted modulo N. Compression is not worth the extra shift+AND // because the table is anyway only 512 bytes. for (size_t i = 0; i < N; ++i) { printf("%d,", static_cast(indices[i])); } } printf("\n"); } // Same as above, but prints pairs of u32 indices (for AVX2). Also includes // FirstN bits in the nibble MSB. void PrintCompress64x4PairTables() { printf("======================================= 64x4 u32 index\n"); constexpr size_t N = 4; // AVX2 for (uint64_t code = 0; code < (1ull << N); ++code) { const size_t count = PopCount(code); std::array indices{0}; size_t pos = 0; // All lanes where mask = true for (size_t i = 0; i < N; ++i) { if (code & (1ull << i)) { indices[pos++] = i; } } // All lanes where mask = false for (size_t i = 0; i < N; ++i) { if (!(code & (1ull << i))) { indices[pos++] = i; } } HWY_ASSERT(pos == N); // Store uncompressed indices because SVE TBL returns 0 if an index is out // of bounds. On AVX3 we simply variable-shift because permute indices are // interpreted modulo N. Compression is not worth the extra shift+AND // because the table is anyway only 512 bytes. for (size_t i = 0; i < N; ++i) { const int first_n_bit = i < count ? 8 : 0; const int low = static_cast(2 * indices[i]) + first_n_bit; HWY_ASSERT(low < 0x10); printf("%d, %d, ", low, low + 1); } } printf("\n"); } void PrintCompressNot64x4PairTables() { printf("======================================= Not 64x4 u32 index\n"); constexpr size_t N = 4; // AVX2 for (uint64_t not_code = 0; not_code < (1ull << N); ++not_code) { const uint64_t code = ~not_code; const size_t count = PopCount(code); std::array indices{0}; size_t pos = 0; // All lanes where mask = true for (size_t i = 0; i < N; ++i) { if (code & (1ull << i)) { indices[pos++] = i; } } // All lanes where mask = false for (size_t i = 0; i < N; ++i) { if (!(code & (1ull << i))) { indices[pos++] = i; } } HWY_ASSERT(pos == N); // Store uncompressed indices because SVE TBL returns 0 if an index is out // of bounds. On AVX3 we simply variable-shift because permute indices are // interpreted modulo N. Compression is not worth the extra shift+AND // because the table is anyway only 512 bytes. for (size_t i = 0; i < N; ++i) { const int first_n_bit = i < count ? 8 : 0; const int low = static_cast(2 * indices[i]) + first_n_bit; HWY_ASSERT(low < 0x10); printf("%d, %d, ", low, low + 1); } } printf("\n"); } // 4-tuple of byte indices void PrintCompress32x4Tables() { printf("======================================= 32x4\n"); using T = uint32_t; constexpr size_t N = 4; // SSE4 for (uint64_t code = 0; code < (1ull << N); ++code) { std::array indices{0}; size_t pos = 0; // All lanes where mask = true for (size_t i = 0; i < N; ++i) { if (code & (1ull << i)) { indices[pos++] = i; } } // All lanes where mask = false for (size_t i = 0; i < N; ++i) { if (!(code & (1ull << i))) { indices[pos++] = i; } } HWY_ASSERT(pos == N); for (size_t i = 0; i < N; ++i) { for (size_t idx_byte = 0; idx_byte < sizeof(T); ++idx_byte) { printf("%d,", static_cast(sizeof(T) * indices[i] + idx_byte)); } } } printf("\n"); } void PrintCompressNot32x4Tables() { printf("======================================= Not 32x4\n"); using T = uint32_t; constexpr size_t N = 4; // SSE4 for (uint64_t not_code = 0; not_code < (1ull << N); ++not_code) { const uint64_t code = ~not_code; std::array indices{0}; size_t pos = 0; // All lanes where mask = true for (size_t i = 0; i < N; ++i) { if (code & (1ull << i)) { indices[pos++] = i; } } // All lanes where mask = false for (size_t i = 0; i < N; ++i) { if (!(code & (1ull << i))) { indices[pos++] = i; } } HWY_ASSERT(pos == N); for (size_t i = 0; i < N; ++i) { for (size_t idx_byte = 0; idx_byte < sizeof(T); ++idx_byte) { printf("%d,", static_cast(sizeof(T) * indices[i] + idx_byte)); } } } printf("\n"); } // 8-tuple of byte indices void PrintCompress64x2Tables() { printf("======================================= 64x2\n"); using T = uint64_t; constexpr size_t N = 2; // SSE4 for (uint64_t code = 0; code < (1ull << N); ++code) { std::array indices{0}; size_t pos = 0; // All lanes where mask = true for (size_t i = 0; i < N; ++i) { if (code & (1ull << i)) { indices[pos++] = i; } } // All lanes where mask = false for (size_t i = 0; i < N; ++i) { if (!(code & (1ull << i))) { indices[pos++] = i; } } HWY_ASSERT(pos == N); for (size_t i = 0; i < N; ++i) { for (size_t idx_byte = 0; idx_byte < sizeof(T); ++idx_byte) { printf("%d,", static_cast(sizeof(T) * indices[i] + idx_byte)); } } } printf("\n"); } void PrintCompressNot64x2Tables() { printf("======================================= Not 64x2\n"); using T = uint64_t; constexpr size_t N = 2; // SSE4 for (uint64_t not_code = 0; not_code < (1ull << N); ++not_code) { const uint64_t code = ~not_code; std::array indices{0}; size_t pos = 0; // All lanes where mask = true for (size_t i = 0; i < N; ++i) { if (code & (1ull << i)) { indices[pos++] = i; } } // All lanes where mask = false for (size_t i = 0; i < N; ++i) { if (!(code & (1ull << i))) { indices[pos++] = i; } } HWY_ASSERT(pos == N); for (size_t i = 0; i < N; ++i) { for (size_t idx_byte = 0; idx_byte < sizeof(T); ++idx_byte) { printf("%d,", static_cast(sizeof(T) * indices[i] + idx_byte)); } } } printf("\n"); } } // namespace detail HWY_NOINLINE void PrintTables() { // Only print once. #if HWY_TARGET == HWY_STATIC_TARGET detail::PrintCompress32x8Tables(); detail::PrintCompressNot32x8Tables(); detail::PrintCompress64x4NibbleTables(); detail::PrintCompressNot64x4NibbleTables(); detail::PrintCompress64x4Tables(); detail::PrintCompressNot64x4Tables(); detail::PrintCompress32x4Tables(); detail::PrintCompressNot32x4Tables(); detail::PrintCompress64x2Tables(); detail::PrintCompressNot64x2Tables(); detail::PrintCompress64x4PairTables(); detail::PrintCompressNot64x4PairTables(); detail::PrintCompress16x8Tables(); detail::PrintCompressNot16x8Tables(); #endif } #endif // HWY_PRINT_TABLES // NOLINTNEXTLINE(google-readability-namespace-comments) } // namespace HWY_NAMESPACE } // namespace hwy HWY_AFTER_NAMESPACE(); #if HWY_ONCE namespace hwy { HWY_BEFORE_TEST(HwyCompressTest); #if HWY_PRINT_TABLES // Only print instead of running tests; this will be visible in the log. HWY_EXPORT_AND_TEST_P(HwyCompressTest, PrintTables); #else HWY_EXPORT_AND_TEST_P(HwyCompressTest, TestAllCompress); HWY_EXPORT_AND_TEST_P(HwyCompressTest, TestAllCompressBlocks); #endif } // namespace hwy #endif