author    Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-21 11:44:51 +0000
committer Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-21 11:44:51 +0000
commit    9e3c08db40b8916968b9f30096c7be3f00ce9647 (patch)
tree      a68f146d7fa01f0134297619fbe7e33db084e0aa /third_party/highway/hwy/tests/compress_test.cc
parent    Initial commit. (diff)
Adding upstream version 1:115.7.0.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/highway/hwy/tests/compress_test.cc')
-rw-r--r--  third_party/highway/hwy/tests/compress_test.cc | 833
1 file changed, 833 insertions(+), 0 deletions(-)
diff --git a/third_party/highway/hwy/tests/compress_test.cc b/third_party/highway/hwy/tests/compress_test.cc
new file mode 100644
index 0000000000..ae008b4dc4
--- /dev/null
+++ b/third_party/highway/hwy/tests/compress_test.cc
@@ -0,0 +1,833 @@
+// Copyright 2022 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h> // memset
+
+#include <array> // IWYU pragma: keep
+
+#include "hwy/base.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "tests/compress_test.cc"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
+#include "hwy/highway.h"
+#include "hwy/tests/test_util-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+// Set to 1 to print (regenerate) the tables used in the implementation
+// instead of running the tests.
+#define HWY_PRINT_TABLES 0
+
+#if !HWY_PRINT_TABLES || HWY_IDE
+
+template <class D, class DI, typename T = TFromD<D>, typename TI = TFromD<DI>>
+void CheckStored(D d, DI di, const char* op, size_t expected_pos,
+ size_t actual_pos, size_t num_to_check,
+ const AlignedFreeUniquePtr<T[]>& in,
+ const AlignedFreeUniquePtr<TI[]>& mask_lanes,
+ const AlignedFreeUniquePtr<T[]>& expected, const T* actual_u,
+ int line) {
+ if (expected_pos != actual_pos) {
+ hwy::Abort(__FILE__, line,
+ "%s: size mismatch for %s: expected %d, actual %d\n", op,
+ TypeName(T(), Lanes(d)).c_str(), static_cast<int>(expected_pos),
+ static_cast<int>(actual_pos));
+ }
+ // Modified from AssertVecEqual - we may not be checking all lanes.
+ for (size_t i = 0; i < num_to_check; ++i) {
+ if (!IsEqual(expected[i], actual_u[i])) {
+ const size_t N = Lanes(d);
+ fprintf(stderr, "%s: mismatch at i=%d of %d, line %d:\n\n", op,
+ static_cast<int>(i), static_cast<int>(num_to_check), line);
+ Print(di, "mask", Load(di, mask_lanes.get()), 0, N);
+ Print(d, "in", Load(d, in.get()), 0, N);
+ Print(d, "expect", Load(d, expected.get()), 0, num_to_check);
+ Print(d, "actual", Load(d, actual_u), 0, num_to_check);
+ HWY_ASSERT(false);
+ }
+ }
+}
+
+struct TestCompress {
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ RandomState rng;
+
+ using TI = MakeSigned<T>; // For mask > 0 comparison
+ using TU = MakeUnsigned<T>;
+ const Rebind<TI, D> di;
+ const size_t N = Lanes(d);
+
+ for (int frac : {0, 2, 3}) {
+ // For CompressStore
+ const size_t misalign = static_cast<size_t>(frac) * N / 4;
+
+ auto in_lanes = AllocateAligned<T>(N);
+ auto mask_lanes = AllocateAligned<TI>(N);
+ auto garbage = AllocateAligned<TU>(N);
+ auto expected = AllocateAligned<T>(N);
+ auto actual_a = AllocateAligned<T>(misalign + N);
+ T* actual_u = actual_a.get() + misalign;
+
+ const size_t bits_size = RoundUpTo((N + 7) / 8, 8);
+ auto bits = AllocateAligned<uint8_t>(bits_size);
+ memset(bits.get(), 0, bits_size); // for MSAN
+
+ // Each lane should have a chance of having mask=true.
+ for (size_t rep = 0; rep < AdjustedReps(200); ++rep) {
+ size_t expected_pos = 0;
+ for (size_t i = 0; i < N; ++i) {
+ const uint64_t r = Random32(&rng);
+ in_lanes[i] = T(); // cannot initialize float16_t directly.
+ CopyBytes<sizeof(T)>(&r, &in_lanes[i]); // not same size
+ mask_lanes[i] = (Random32(&rng) & 1024) ? TI(1) : TI(0);
+ if (mask_lanes[i] > 0) {
+ expected[expected_pos++] = in_lanes[i];
+ }
+ garbage[i] = static_cast<TU>(Random64(&rng));
+ }
+ size_t num_to_check;
+ if (CompressIsPartition<T>::value) {
+ // For non-native Compress, also check that mask=false lanes were
+ // moved to the back of the vector (highest indices).
+ size_t extra = expected_pos;
+ for (size_t i = 0; i < N; ++i) {
+ if (mask_lanes[i] == 0) {
+ expected[extra++] = in_lanes[i];
+ }
+ }
+ HWY_ASSERT(extra == N);
+ num_to_check = N;
+ } else {
+ // For native Compress, only the mask=true lanes are defined.
+ num_to_check = expected_pos;
+ }
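+      // For example, with N=4, in = {a,b,c,d} and mask = {1,0,1,0}:
+      // expected_pos is 2. A partition-style Compress is checked against all
+      // four lanes {a,c,b,d}, whereas otherwise only the first two compressed
+      // lanes {a,c} are checked.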
+
+ const auto in = Load(d, in_lanes.get());
+ const auto mask =
+ RebindMask(d, Gt(Load(di, mask_lanes.get()), Zero(di)));
+ StoreMaskBits(d, mask, bits.get());
+
+ // Compress
+ memset(actual_u, 0, N * sizeof(T));
+ StoreU(Compress(in, mask), d, actual_u);
+ CheckStored(d, di, "Compress", expected_pos, expected_pos, num_to_check,
+ in_lanes, mask_lanes, expected, actual_u, __LINE__);
+
+ // CompressNot
+ memset(actual_u, 0, N * sizeof(T));
+ StoreU(CompressNot(in, Not(mask)), d, actual_u);
+ CheckStored(d, di, "CompressNot", expected_pos, expected_pos,
+ num_to_check, in_lanes, mask_lanes, expected, actual_u,
+ __LINE__);
+
+ // CompressStore
+ memset(actual_u, 0, N * sizeof(T));
+ const size_t size1 = CompressStore(in, mask, d, actual_u);
+ // expected_pos instead of num_to_check because this op is not
+ // affected by CompressIsPartition.
+ CheckStored(d, di, "CompressStore", expected_pos, size1, expected_pos,
+ in_lanes, mask_lanes, expected, actual_u, __LINE__);
+
+ // CompressBlendedStore
+ memcpy(actual_u, garbage.get(), N * sizeof(T));
+ const size_t size2 = CompressBlendedStore(in, mask, d, actual_u);
+ // expected_pos instead of num_to_check because this op only writes
+ // the mask=true lanes.
+ CheckStored(d, di, "CompressBlendedStore", expected_pos, size2,
+ expected_pos, in_lanes, mask_lanes, expected, actual_u,
+ __LINE__);
+ // Subsequent lanes are untouched.
+ for (size_t i = size2; i < N; ++i) {
+#if HWY_COMPILER_MSVC && HWY_TARGET == HWY_AVX2
+ // TODO(eustas): re-enable when compiler is fixed
+#else
+ HWY_ASSERT_EQ(garbage[i], reinterpret_cast<TU*>(actual_u)[i]);
+#endif
+ }
+
+ // CompressBits
+ memset(actual_u, 0, N * sizeof(T));
+ StoreU(CompressBits(in, bits.get()), d, actual_u);
+ CheckStored(d, di, "CompressBits", expected_pos, expected_pos,
+ num_to_check, in_lanes, mask_lanes, expected, actual_u,
+ __LINE__);
+
+ // CompressBitsStore
+ memset(actual_u, 0, N * sizeof(T));
+ const size_t size3 = CompressBitsStore(in, bits.get(), d, actual_u);
+ // expected_pos instead of num_to_check because this op is not
+ // affected by CompressIsPartition.
+ CheckStored(d, di, "CompressBitsStore", expected_pos, size3,
+ expected_pos, in_lanes, mask_lanes, expected, actual_u,
+ __LINE__);
+ } // rep
+ } // frac
+ } // operator()
+};
+
+HWY_NOINLINE void TestAllCompress() {
+ ForAllTypes(ForPartialVectors<TestCompress>());
+}
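+
+// Illustrative usage sketch (not part of the test; the helper name is
+// hypothetical): stream compaction of positive lanes via CompressStore.
+// Assumes num is a multiple of Lanes(d) and out has room for num lanes,
+// because CompressStore may also overwrite lanes past the returned count.
+template <class D, typename T = TFromD<D>>
+size_t CopyIfPositive(D d, const T* HWY_RESTRICT in, size_t num,
+                      T* HWY_RESTRICT out) {
+  size_t out_pos = 0;
+  for (size_t i = 0; i < num; i += Lanes(d)) {
+    const auto v = LoadU(d, in + i);
+    // Packs the mask=true lanes to the front and returns how many there were.
+    out_pos += CompressStore(v, Gt(v, Zero(d)), d, out + out_pos);
+  }
+  return out_pos;
+}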
+
+struct TestCompressBlocks {
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+#if HWY_TARGET == HWY_SCALAR
+ (void)d;
+#else
+ static_assert(sizeof(T) == 8 && !IsSigned<T>(), "Should be u64");
+ RandomState rng;
+
+ using TI = MakeSigned<T>; // For mask > 0 comparison
+ const Rebind<TI, D> di;
+ const size_t N = Lanes(d);
+
+ auto in_lanes = AllocateAligned<T>(N);
+ auto mask_lanes = AllocateAligned<TI>(N);
+ auto expected = AllocateAligned<T>(N);
+ auto actual = AllocateAligned<T>(N);
+
+ // Each lane should have a chance of having mask=true.
+ for (size_t rep = 0; rep < AdjustedReps(200); ++rep) {
+ size_t expected_pos = 0;
+ for (size_t i = 0; i < N; i += 2) {
+ const uint64_t bits = Random32(&rng);
+ in_lanes[i + 1] = in_lanes[i] = T(); // cannot set float16_t directly.
+ CopyBytes<sizeof(T)>(&bits, &in_lanes[i]); // not same size
+ CopyBytes<sizeof(T)>(&bits, &in_lanes[i + 1]); // not same size
+ mask_lanes[i + 1] = mask_lanes[i] = TI{(Random32(&rng) & 8) ? 1 : 0};
+ if (mask_lanes[i] > 0) {
+ expected[expected_pos++] = in_lanes[i];
+ expected[expected_pos++] = in_lanes[i + 1];
+ }
+ }
+ size_t num_to_check;
+ if (CompressIsPartition<T>::value) {
+ // For non-native Compress, also check that mask=false lanes were
+ // moved to the back of the vector (highest indices).
+ size_t extra = expected_pos;
+ for (size_t i = 0; i < N; ++i) {
+ if (mask_lanes[i] == 0) {
+ expected[extra++] = in_lanes[i];
+ }
+ }
+ HWY_ASSERT(extra == N);
+ num_to_check = N;
+ } else {
+ // For native Compress, only the mask=true lanes are defined.
+ num_to_check = expected_pos;
+ }
+
+ const auto in = Load(d, in_lanes.get());
+ const auto mask = RebindMask(d, Gt(Load(di, mask_lanes.get()), Zero(di)));
+
+ // CompressBlocksNot
+ memset(actual.get(), 0, N * sizeof(T));
+ StoreU(CompressBlocksNot(in, Not(mask)), d, actual.get());
+ CheckStored(d, di, "CompressBlocksNot", expected_pos, expected_pos,
+ num_to_check, in_lanes, mask_lanes, expected, actual.get(),
+ __LINE__);
+ } // rep
+#endif // HWY_TARGET == HWY_SCALAR
+ } // operator()
+};
+
+HWY_NOINLINE void TestAllCompressBlocks() {
+ ForGE128Vectors<TestCompressBlocks>()(uint64_t());
+}
+
+#endif // !HWY_PRINT_TABLES
+
+#if HWY_PRINT_TABLES || HWY_IDE
+namespace detail { // for code folding
+
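+// For each of the 256 mask codes, prints the eight lane indices with the
+// mask=true lanes first (in ascending order), then the mask=false lanes.
+// E.g. code 0b00000101 prints 0,2,1,3,4,5,6,7.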
+void PrintCompress8x8Tables() {
+ printf("======================================= 8x8\n");
+ constexpr size_t N = 8;
+ for (uint64_t code = 0; code < (1ull << N); ++code) {
+ std::array<uint8_t, N> indices{0};
+ size_t pos = 0;
+ // All lanes where mask = true
+ for (size_t i = 0; i < N; ++i) {
+ if (code & (1ull << i)) {
+ indices[pos++] = i;
+ }
+ }
+ // All lanes where mask = false
+ for (size_t i = 0; i < N; ++i) {
+ if (!(code & (1ull << i))) {
+ indices[pos++] = i;
+ }
+ }
+ HWY_ASSERT(pos == N);
+
+ for (size_t i = 0; i < N; ++i) {
+ printf("%d,", indices[i]);
+ }
+ printf(code & 1 ? "//\n" : "/**/");
+ }
+ printf("\n");
+}
+
+void PrintCompress16x8Tables() {
+ printf("======================================= 16x8\n");
+ constexpr size_t N = 8; // 128-bit SIMD
+ for (uint64_t code = 0; code < (1ull << N); ++code) {
+ std::array<uint8_t, N> indices{0};
+ size_t pos = 0;
+ // All lanes where mask = true
+ for (size_t i = 0; i < N; ++i) {
+ if (code & (1ull << i)) {
+ indices[pos++] = i;
+ }
+ }
+ // All lanes where mask = false
+ for (size_t i = 0; i < N; ++i) {
+ if (!(code & (1ull << i))) {
+ indices[pos++] = i;
+ }
+ }
+ HWY_ASSERT(pos == N);
+
+ // Doubled (for converting lane to byte indices)
+ for (size_t i = 0; i < N; ++i) {
+ printf("%d,", 2 * indices[i]);
+ }
+ printf(code & 1 ? "//\n" : "/**/");
+ }
+ printf("\n");
+}
+
+void PrintCompressNot16x8Tables() {
+ printf("======================================= Not 16x8\n");
+ constexpr size_t N = 8; // 128-bit SIMD
+ for (uint64_t not_code = 0; not_code < (1ull << N); ++not_code) {
+ const uint64_t code = ~not_code;
+ std::array<uint8_t, N> indices{0};
+ size_t pos = 0;
+ // All lanes where mask = true
+ for (size_t i = 0; i < N; ++i) {
+ if (code & (1ull << i)) {
+ indices[pos++] = i;
+ }
+ }
+ // All lanes where mask = false
+ for (size_t i = 0; i < N; ++i) {
+ if (!(code & (1ull << i))) {
+ indices[pos++] = i;
+ }
+ }
+ HWY_ASSERT(pos == N);
+
+ // Doubled (for converting lane to byte indices)
+ for (size_t i = 0; i < N; ++i) {
+ printf("%d,", 2 * indices[i]);
+ }
+ printf(not_code & 1 ? "//\n" : "/**/");
+ }
+ printf("\n");
+}
+
+// Compressed to nibbles, unpacked via variable right shift. Also includes
+// FirstN bits in the nibble MSB.
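+// E.g. for code 0b00000101 (count=2) the lane order is 0,2,1,3,4,5,6,7; the
+// first two (FirstN) nibbles get bit 3 (value 8) set, so the packed entry is
+// printed as 0x765431a8.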
+void PrintCompress32x8Tables() {
+ printf("======================================= 32/64x8\n");
+ constexpr size_t N = 8; // AVX2 or 64-bit AVX3
+ for (uint64_t code = 0; code < (1ull << N); ++code) {
+ const size_t count = PopCount(code);
+ std::array<uint32_t, N> indices{0};
+ size_t pos = 0;
+ // All lanes where mask = true
+ for (size_t i = 0; i < N; ++i) {
+ if (code & (1ull << i)) {
+ indices[pos++] = i;
+ }
+ }
+ // All lanes where mask = false
+ for (size_t i = 0; i < N; ++i) {
+ if (!(code & (1ull << i))) {
+ indices[pos++] = i;
+ }
+ }
+ HWY_ASSERT(pos == N);
+
+ // Convert to nibbles
+ uint64_t packed = 0;
+ for (size_t i = 0; i < N; ++i) {
+ HWY_ASSERT(indices[i] < N);
+ if (i < count) {
+ indices[i] |= N;
+ HWY_ASSERT(indices[i] < 0x10);
+ }
+ packed += indices[i] << (i * 4);
+ }
+
+ HWY_ASSERT(packed < (1ull << (N * 4)));
+ printf("0x%08x,", static_cast<uint32_t>(packed));
+ }
+ printf("\n");
+}
+
+void PrintCompressNot32x8Tables() {
+ printf("======================================= Not 32/64x8\n");
+ constexpr size_t N = 8; // AVX2 or 64-bit AVX3
+ for (uint64_t not_code = 0; not_code < (1ull << N); ++not_code) {
+ const uint64_t code = ~not_code;
+ const size_t count = PopCount(code);
+ std::array<uint32_t, N> indices{0};
+ size_t pos = 0;
+ // All lanes where mask = true
+ for (size_t i = 0; i < N; ++i) {
+ if (code & (1ull << i)) {
+ indices[pos++] = i;
+ }
+ }
+ // All lanes where mask = false
+ for (size_t i = 0; i < N; ++i) {
+ if (!(code & (1ull << i))) {
+ indices[pos++] = i;
+ }
+ }
+ HWY_ASSERT(pos == N);
+
+ // Convert to nibbles
+ uint64_t packed = 0;
+ for (size_t i = 0; i < N; ++i) {
+ HWY_ASSERT(indices[i] < N);
+ if (i < count) {
+ indices[i] |= N;
+ HWY_ASSERT(indices[i] < 0x10);
+ }
+ packed += indices[i] << (i * 4);
+ }
+
+ HWY_ASSERT(packed < (1ull << (N * 4)));
+ printf("0x%08x,", static_cast<uint32_t>(packed));
+ }
+ printf("\n");
+}
+
+// Compressed to nibbles (for AVX3 64x4)
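+// E.g. for code 0b0101 the lane order is 0,2,1,3 and the entry is printed as
+// 0x00003120 (no FirstN bit here).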
+void PrintCompress64x4NibbleTables() {
+ printf("======================================= 64x4Nibble\n");
+ constexpr size_t N = 4; // AVX2
+ for (uint64_t code = 0; code < (1ull << N); ++code) {
+ std::array<uint32_t, N> indices{0};
+ size_t pos = 0;
+ // All lanes where mask = true
+ for (size_t i = 0; i < N; ++i) {
+ if (code & (1ull << i)) {
+ indices[pos++] = i;
+ }
+ }
+ // All lanes where mask = false
+ for (size_t i = 0; i < N; ++i) {
+ if (!(code & (1ull << i))) {
+ indices[pos++] = i;
+ }
+ }
+ HWY_ASSERT(pos == N);
+
+ // Convert to nibbles
+ uint64_t packed = 0;
+ for (size_t i = 0; i < N; ++i) {
+ HWY_ASSERT(indices[i] < N);
+ packed += indices[i] << (i * 4);
+ }
+
+ HWY_ASSERT(packed < (1ull << (N * 4)));
+ printf("0x%08x,", static_cast<uint32_t>(packed));
+ }
+ printf("\n");
+}
+
+void PrintCompressNot64x4NibbleTables() {
+ printf("======================================= Not 64x4Nibble\n");
+ constexpr size_t N = 4; // AVX2
+ for (uint64_t not_code = 0; not_code < (1ull << N); ++not_code) {
+ const uint64_t code = ~not_code;
+ std::array<uint32_t, N> indices{0};
+ size_t pos = 0;
+ // All lanes where mask = true
+ for (size_t i = 0; i < N; ++i) {
+ if (code & (1ull << i)) {
+ indices[pos++] = i;
+ }
+ }
+ // All lanes where mask = false
+ for (size_t i = 0; i < N; ++i) {
+ if (!(code & (1ull << i))) {
+ indices[pos++] = i;
+ }
+ }
+ HWY_ASSERT(pos == N);
+
+ // Convert to nibbles
+ uint64_t packed = 0;
+ for (size_t i = 0; i < N; ++i) {
+ HWY_ASSERT(indices[i] < N);
+ packed += indices[i] << (i * 4);
+ }
+
+ HWY_ASSERT(packed < (1ull << (N * 4)));
+ printf("0x%08x,", static_cast<uint32_t>(packed));
+ }
+ printf("\n");
+}
+
+void PrintCompressNot64x2NibbleTables() {
+ printf("======================================= Not 64x2Nibble\n");
+ constexpr size_t N = 2; // 128-bit
+ for (uint64_t not_code = 0; not_code < (1ull << N); ++not_code) {
+ const uint64_t code = ~not_code;
+ std::array<uint32_t, N> indices{0};
+ size_t pos = 0;
+ // All lanes where mask = true
+ for (size_t i = 0; i < N; ++i) {
+ if (code & (1ull << i)) {
+ indices[pos++] = i;
+ }
+ }
+ // All lanes where mask = false
+ for (size_t i = 0; i < N; ++i) {
+ if (!(code & (1ull << i))) {
+ indices[pos++] = i;
+ }
+ }
+ HWY_ASSERT(pos == N);
+
+ // Convert to nibbles
+ uint64_t packed = 0;
+ for (size_t i = 0; i < N; ++i) {
+ HWY_ASSERT(indices[i] < N);
+ packed += indices[i] << (i * 4);
+ }
+
+ HWY_ASSERT(packed < (1ull << (N * 4)));
+ printf("0x%08x,", static_cast<uint32_t>(packed));
+ }
+ printf("\n");
+}
+
+void PrintCompress64x4Tables() {
+ printf("======================================= 64x4 uncompressed\n");
+ constexpr size_t N = 4; // SVE_256
+ for (uint64_t code = 0; code < (1ull << N); ++code) {
+ std::array<size_t, N> indices{0};
+ size_t pos = 0;
+ // All lanes where mask = true
+ for (size_t i = 0; i < N; ++i) {
+ if (code & (1ull << i)) {
+ indices[pos++] = i;
+ }
+ }
+ // All lanes where mask = false
+ for (size_t i = 0; i < N; ++i) {
+ if (!(code & (1ull << i))) {
+ indices[pos++] = i;
+ }
+ }
+ HWY_ASSERT(pos == N);
+
+ // Store uncompressed indices because SVE TBL returns 0 if an index is out
+ // of bounds. On AVX3 we simply variable-shift because permute indices are
+ // interpreted modulo N. Compression is not worth the extra shift+AND
+ // because the table is anyway only 512 bytes.
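+    // E.g. for code 0b0101 this prints the plain lane order 0,2,1,3.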
+ for (size_t i = 0; i < N; ++i) {
+ printf("%d,", static_cast<int>(indices[i]));
+ }
+ }
+ printf("\n");
+}
+
+void PrintCompressNot64x4Tables() {
+ printf("======================================= Not 64x4 uncompressed\n");
+ constexpr size_t N = 4; // SVE_256
+ for (uint64_t not_code = 0; not_code < (1ull << N); ++not_code) {
+ const uint64_t code = ~not_code;
+ std::array<size_t, N> indices{0};
+ size_t pos = 0;
+ // All lanes where mask = true
+ for (size_t i = 0; i < N; ++i) {
+ if (code & (1ull << i)) {
+ indices[pos++] = i;
+ }
+ }
+ // All lanes where mask = false
+ for (size_t i = 0; i < N; ++i) {
+ if (!(code & (1ull << i))) {
+ indices[pos++] = i;
+ }
+ }
+ HWY_ASSERT(pos == N);
+
+ // Store uncompressed indices because SVE TBL returns 0 if an index is out
+ // of bounds. On AVX3 we simply variable-shift because permute indices are
+ // interpreted modulo N. Compression is not worth the extra shift+AND
+ // because the table is anyway only 512 bytes.
+ for (size_t i = 0; i < N; ++i) {
+ printf("%d,", static_cast<int>(indices[i]));
+ }
+ }
+ printf("\n");
+}
+
+// Same as above, but prints pairs of u32 indices (for AVX2). Also includes
+// FirstN bits in the nibble MSB.
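+// E.g. for code 0b0011 (count=2) the lane order is 0,1,2,3 and the printed
+// u32 index pairs are 8,9, 10,11, 4,5, 6,7 (the first two pairs carry the
+// FirstN bit, i.e. +8).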
+void PrintCompress64x4PairTables() {
+ printf("======================================= 64x4 u32 index\n");
+ constexpr size_t N = 4; // AVX2
+ for (uint64_t code = 0; code < (1ull << N); ++code) {
+ const size_t count = PopCount(code);
+ std::array<size_t, N> indices{0};
+ size_t pos = 0;
+ // All lanes where mask = true
+ for (size_t i = 0; i < N; ++i) {
+ if (code & (1ull << i)) {
+ indices[pos++] = i;
+ }
+ }
+ // All lanes where mask = false
+ for (size_t i = 0; i < N; ++i) {
+ if (!(code & (1ull << i))) {
+ indices[pos++] = i;
+ }
+ }
+ HWY_ASSERT(pos == N);
+
+ // Store uncompressed indices because SVE TBL returns 0 if an index is out
+ // of bounds. On AVX3 we simply variable-shift because permute indices are
+ // interpreted modulo N. Compression is not worth the extra shift+AND
+ // because the table is anyway only 512 bytes.
+ for (size_t i = 0; i < N; ++i) {
+ const int first_n_bit = i < count ? 8 : 0;
+ const int low = static_cast<int>(2 * indices[i]) + first_n_bit;
+ HWY_ASSERT(low < 0x10);
+ printf("%d, %d, ", low, low + 1);
+ }
+ }
+ printf("\n");
+}
+
+void PrintCompressNot64x4PairTables() {
+ printf("======================================= Not 64x4 u32 index\n");
+ constexpr size_t N = 4; // AVX2
+ for (uint64_t not_code = 0; not_code < (1ull << N); ++not_code) {
+ const uint64_t code = ~not_code;
+ const size_t count = PopCount(code);
+ std::array<size_t, N> indices{0};
+ size_t pos = 0;
+ // All lanes where mask = true
+ for (size_t i = 0; i < N; ++i) {
+ if (code & (1ull << i)) {
+ indices[pos++] = i;
+ }
+ }
+ // All lanes where mask = false
+ for (size_t i = 0; i < N; ++i) {
+ if (!(code & (1ull << i))) {
+ indices[pos++] = i;
+ }
+ }
+ HWY_ASSERT(pos == N);
+
+ // Store uncompressed indices because SVE TBL returns 0 if an index is out
+ // of bounds. On AVX3 we simply variable-shift because permute indices are
+ // interpreted modulo N. Compression is not worth the extra shift+AND
+ // because the table is anyway only 512 bytes.
+ for (size_t i = 0; i < N; ++i) {
+ const int first_n_bit = i < count ? 8 : 0;
+ const int low = static_cast<int>(2 * indices[i]) + first_n_bit;
+ HWY_ASSERT(low < 0x10);
+ printf("%d, %d, ", low, low + 1);
+ }
+ }
+ printf("\n");
+}
+
+// 4-tuple of byte indices
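+// E.g. for code 0b0101 the lane order is 0,2,1,3, so the printed byte indices
+// are 0,1,2,3, 8,9,10,11, 4,5,6,7, 12,13,14,15.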
+void PrintCompress32x4Tables() {
+ printf("======================================= 32x4\n");
+ using T = uint32_t;
+ constexpr size_t N = 4; // SSE4
+ for (uint64_t code = 0; code < (1ull << N); ++code) {
+ std::array<uint32_t, N> indices{0};
+ size_t pos = 0;
+ // All lanes where mask = true
+ for (size_t i = 0; i < N; ++i) {
+ if (code & (1ull << i)) {
+ indices[pos++] = i;
+ }
+ }
+ // All lanes where mask = false
+ for (size_t i = 0; i < N; ++i) {
+ if (!(code & (1ull << i))) {
+ indices[pos++] = i;
+ }
+ }
+ HWY_ASSERT(pos == N);
+
+ for (size_t i = 0; i < N; ++i) {
+ for (size_t idx_byte = 0; idx_byte < sizeof(T); ++idx_byte) {
+ printf("%d,", static_cast<int>(sizeof(T) * indices[i] + idx_byte));
+ }
+ }
+ }
+ printf("\n");
+}
+
+void PrintCompressNot32x4Tables() {
+ printf("======================================= Not 32x4\n");
+ using T = uint32_t;
+ constexpr size_t N = 4; // SSE4
+ for (uint64_t not_code = 0; not_code < (1ull << N); ++not_code) {
+ const uint64_t code = ~not_code;
+ std::array<uint32_t, N> indices{0};
+ size_t pos = 0;
+ // All lanes where mask = true
+ for (size_t i = 0; i < N; ++i) {
+ if (code & (1ull << i)) {
+ indices[pos++] = i;
+ }
+ }
+ // All lanes where mask = false
+ for (size_t i = 0; i < N; ++i) {
+ if (!(code & (1ull << i))) {
+ indices[pos++] = i;
+ }
+ }
+ HWY_ASSERT(pos == N);
+
+ for (size_t i = 0; i < N; ++i) {
+ for (size_t idx_byte = 0; idx_byte < sizeof(T); ++idx_byte) {
+ printf("%d,", static_cast<int>(sizeof(T) * indices[i] + idx_byte));
+ }
+ }
+ }
+ printf("\n");
+}
+
+// 8-tuple of byte indices
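+// E.g. for code 0b10 the lane order is 1,0, so the printed byte indices are
+// 8..15 followed by 0..7.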
+void PrintCompress64x2Tables() {
+ printf("======================================= 64x2\n");
+ using T = uint64_t;
+ constexpr size_t N = 2; // SSE4
+ for (uint64_t code = 0; code < (1ull << N); ++code) {
+ std::array<uint32_t, N> indices{0};
+ size_t pos = 0;
+ // All lanes where mask = true
+ for (size_t i = 0; i < N; ++i) {
+ if (code & (1ull << i)) {
+ indices[pos++] = i;
+ }
+ }
+ // All lanes where mask = false
+ for (size_t i = 0; i < N; ++i) {
+ if (!(code & (1ull << i))) {
+ indices[pos++] = i;
+ }
+ }
+ HWY_ASSERT(pos == N);
+
+ for (size_t i = 0; i < N; ++i) {
+ for (size_t idx_byte = 0; idx_byte < sizeof(T); ++idx_byte) {
+ printf("%d,", static_cast<int>(sizeof(T) * indices[i] + idx_byte));
+ }
+ }
+ }
+ printf("\n");
+}
+
+void PrintCompressNot64x2Tables() {
+ printf("======================================= Not 64x2\n");
+ using T = uint64_t;
+ constexpr size_t N = 2; // SSE4
+ for (uint64_t not_code = 0; not_code < (1ull << N); ++not_code) {
+ const uint64_t code = ~not_code;
+ std::array<uint32_t, N> indices{0};
+ size_t pos = 0;
+ // All lanes where mask = true
+ for (size_t i = 0; i < N; ++i) {
+ if (code & (1ull << i)) {
+ indices[pos++] = i;
+ }
+ }
+ // All lanes where mask = false
+ for (size_t i = 0; i < N; ++i) {
+ if (!(code & (1ull << i))) {
+ indices[pos++] = i;
+ }
+ }
+ HWY_ASSERT(pos == N);
+
+ for (size_t i = 0; i < N; ++i) {
+ for (size_t idx_byte = 0; idx_byte < sizeof(T); ++idx_byte) {
+ printf("%d,", static_cast<int>(sizeof(T) * indices[i] + idx_byte));
+ }
+ }
+ }
+ printf("\n");
+}
+
+} // namespace detail
+
+HWY_NOINLINE void PrintTables() {
+ // Only print once.
+#if HWY_TARGET == HWY_STATIC_TARGET
+ detail::PrintCompress32x8Tables();
+ detail::PrintCompressNot32x8Tables();
+ detail::PrintCompress64x4NibbleTables();
+ detail::PrintCompressNot64x4NibbleTables();
+ detail::PrintCompressNot64x2NibbleTables();
+ detail::PrintCompress64x4Tables();
+ detail::PrintCompressNot64x4Tables();
+ detail::PrintCompress32x4Tables();
+ detail::PrintCompressNot32x4Tables();
+ detail::PrintCompress64x2Tables();
+ detail::PrintCompressNot64x2Tables();
+ detail::PrintCompress64x4PairTables();
+ detail::PrintCompressNot64x4PairTables();
+ detail::PrintCompress16x8Tables();
+ detail::PrintCompress8x8Tables();
+ detail::PrintCompressNot16x8Tables();
+#endif
+}
+
+#endif // HWY_PRINT_TABLES
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace hwy {
+HWY_BEFORE_TEST(HwyCompressTest);
+#if HWY_PRINT_TABLES
+// Only print instead of running tests; this will be visible in the log.
+HWY_EXPORT_AND_TEST_P(HwyCompressTest, PrintTables);
+#else
+HWY_EXPORT_AND_TEST_P(HwyCompressTest, TestAllCompress);
+HWY_EXPORT_AND_TEST_P(HwyCompressTest, TestAllCompressBlocks);
+#endif
+} // namespace hwy
+
+#endif