From 26a029d407be480d791972afb5975cf62c9360a6 Mon Sep 17 00:00:00 2001
From: Daniel Baumann
Date: Fri, 19 Apr 2024 02:47:55 +0200
Subject: Adding upstream version 124.0.1.

Signed-off-by: Daniel Baumann
---
 third_party/highway/.bcr/metadata.template.json | 25 +
 third_party/highway/.bcr/presubmit.yml | 10 +
 third_party/highway/.bcr/source.template.json | 5 +
 third_party/highway/.clang-format | 1 +
 .../highway/.github/workflows/build_test.yml | 57 +
 third_party/highway/BUILD | 443 +
 third_party/highway/CMakeLists.txt | 595 ++
 third_party/highway/CMakeLists.txt.in | 15 +
 third_party/highway/CONTRIBUTING | 39 +
 third_party/highway/LICENSE | 201 +
 third_party/highway/MODULE.bazel | 6 +
 third_party/highway/README.md | 400 +
 third_party/highway/WORKSPACE | 38 +
 third_party/highway/cmake/FindAtomics.cmake | 56 +
 third_party/highway/debian/changelog | 181 +
 third_party/highway/debian/compat | 1 +
 third_party/highway/debian/control | 23 +
 third_party/highway/debian/copyright | 20 +
 third_party/highway/debian/rules | 6 +
 third_party/highway/debian/source/format | 1 +
 third_party/highway/docs/.gitignore | 3 +
 third_party/highway/docs/Makefile | 20 +
 third_party/highway/docs/_static/css/dark.css | 622 ++
 third_party/highway/docs/_static/css/toggle.css | 77 +
 third_party/highway/docs/_static/js/toggle.js | 38 +
 third_party/highway/docs/_templates/layout.html | 9 +
 third_party/highway/docs/_templates/versions.html | 62 +
 third_party/highway/docs/buildDocs.sh | 156 +
 third_party/highway/docs/conf.py | 159 +
 third_party/highway/docs/images/logo-32x32.ico | Bin 0 -> 15406 bytes
 third_party/highway/docs/images/logo.png | Bin 0 -> 54853 bytes
 third_party/highway/docs/index.rst | 13 +
 third_party/highway/docs/locales/zh/index.rst | 6 +
 third_party/highway/docs/make.bat | 35 +
 third_party/highway/docs/mm-converter.py | 108 +
 third_party/highway/hwy.gni | 53 +
 third_party/highway/hwy/aligned_allocator.cc | 152 +
 third_party/highway/hwy/aligned_allocator.h | 211 +
 third_party/highway/hwy/aligned_allocator_test.cc | 280 +
 third_party/highway/hwy/base.h | 1066 +++
 third_party/highway/hwy/base_test.cc | 310 +
 third_party/highway/hwy/cache_control.h | 108 +
 third_party/highway/hwy/contrib/algo/copy-inl.h | 136 +
 third_party/highway/hwy/contrib/algo/copy_test.cc | 204 +
 third_party/highway/hwy/contrib/algo/find-inl.h | 109 +
 third_party/highway/hwy/contrib/algo/find_test.cc | 223 +
 .../highway/hwy/contrib/algo/transform-inl.h | 262 +
 .../highway/hwy/contrib/algo/transform_test.cc | 381 +
 .../highway/hwy/contrib/bit_pack/bit_pack-inl.h | 2599 ++++++
 .../highway/hwy/contrib/bit_pack/bit_pack_test.cc | 206 +
 third_party/highway/hwy/contrib/dot/dot-inl.h | 252 +
 third_party/highway/hwy/contrib/dot/dot_test.cc | 168 +
 third_party/highway/hwy/contrib/image/image.cc | 145 +
 third_party/highway/hwy/contrib/image/image.h | 467 +
 .../highway/hwy/contrib/image/image_test.cc | 149 +
 third_party/highway/hwy/contrib/math/math-inl.h | 1285 +++
 third_party/highway/hwy/contrib/math/math_test.cc | 345 +
 third_party/highway/hwy/contrib/sort/BUILD | 210 +
 third_party/highway/hwy/contrib/sort/README.md | 109 +
 third_party/highway/hwy/contrib/sort/algo-inl.h | 553 ++
 .../highway/hwy/contrib/sort/bench_parallel.cc | 238 +
 third_party/highway/hwy/contrib/sort/bench_sort.cc | 367 +
 .../highway/hwy/contrib/sort/print_network.cc | 90 +
 third_party/highway/hwy/contrib/sort/result-inl.h | 140 +
 third_party/highway/hwy/contrib/sort/shared-inl.h | 154 +
 third_party/highway/hwy/contrib/sort/sort_test.cc | 650 ++
.../hwy/contrib/sort/sorting_networks-inl.h | 898 ++ third_party/highway/hwy/contrib/sort/traits-inl.h | 561 ++ .../highway/hwy/contrib/sort/traits128-inl.h | 529 ++ third_party/highway/hwy/contrib/sort/vqsort-inl.h | 1724 ++++ third_party/highway/hwy/contrib/sort/vqsort.cc | 124 + third_party/highway/hwy/contrib/sort/vqsort.h | 221 + .../highway/hwy/contrib/sort/vqsort_128a.cc | 59 + .../highway/hwy/contrib/sort/vqsort_128d.cc | 59 + .../highway/hwy/contrib/sort/vqsort_f32a.cc | 52 + .../highway/hwy/contrib/sort/vqsort_f32d.cc | 52 + .../highway/hwy/contrib/sort/vqsort_f64a.cc | 58 + .../highway/hwy/contrib/sort/vqsort_f64d.cc | 58 + .../highway/hwy/contrib/sort/vqsort_i16a.cc | 52 + .../highway/hwy/contrib/sort/vqsort_i16d.cc | 52 + .../highway/hwy/contrib/sort/vqsort_i32a.cc | 52 + .../highway/hwy/contrib/sort/vqsort_i32d.cc | 52 + .../highway/hwy/contrib/sort/vqsort_i64a.cc | 52 + .../highway/hwy/contrib/sort/vqsort_i64d.cc | 52 + .../highway/hwy/contrib/sort/vqsort_kv128a.cc | 62 + .../highway/hwy/contrib/sort/vqsort_kv128d.cc | 62 + .../highway/hwy/contrib/sort/vqsort_kv64a.cc | 62 + .../highway/hwy/contrib/sort/vqsort_kv64d.cc | 62 + .../highway/hwy/contrib/sort/vqsort_u16a.cc | 52 + .../highway/hwy/contrib/sort/vqsort_u16d.cc | 53 + .../highway/hwy/contrib/sort/vqsort_u32a.cc | 52 + .../highway/hwy/contrib/sort/vqsort_u32d.cc | 53 + .../highway/hwy/contrib/sort/vqsort_u64a.cc | 52 + .../highway/hwy/contrib/sort/vqsort_u64d.cc | 53 + third_party/highway/hwy/detect_compiler_arch.h | 281 + third_party/highway/hwy/detect_targets.h | 623 ++ third_party/highway/hwy/examples/benchmark.cc | 257 + third_party/highway/hwy/examples/skeleton-inl.h | 65 + third_party/highway/hwy/examples/skeleton.cc | 122 + third_party/highway/hwy/examples/skeleton.h | 34 + third_party/highway/hwy/examples/skeleton_test.cc | 111 + third_party/highway/hwy/foreach_target.h | 329 + third_party/highway/hwy/highway.h | 429 + third_party/highway/hwy/highway_export.h | 74 + third_party/highway/hwy/highway_test.cc | 491 ++ third_party/highway/hwy/hwy.version | 19 + third_party/highway/hwy/nanobenchmark.cc | 743 ++ third_party/highway/hwy/nanobenchmark.h | 194 + third_party/highway/hwy/nanobenchmark_test.cc | 94 + third_party/highway/hwy/ops/arm_neon-inl.h | 7725 +++++++++++++++++ third_party/highway/hwy/ops/arm_sve-inl.h | 4596 ++++++++++ third_party/highway/hwy/ops/emu128-inl.h | 2704 ++++++ third_party/highway/hwy/ops/generic_ops-inl.h | 3190 +++++++ third_party/highway/hwy/ops/ppc_vsx-inl.h | 4920 +++++++++++ third_party/highway/hwy/ops/rvv-inl.h | 4229 +++++++++ third_party/highway/hwy/ops/scalar-inl.h | 1845 ++++ third_party/highway/hwy/ops/set_macros-inl.h | 566 ++ third_party/highway/hwy/ops/shared-inl.h | 488 ++ third_party/highway/hwy/ops/tuple-inl.h | 86 + third_party/highway/hwy/ops/wasm_128-inl.h | 5060 +++++++++++ third_party/highway/hwy/ops/wasm_256-inl.h | 2030 +++++ third_party/highway/hwy/ops/x86_128-inl.h | 9038 ++++++++++++++++++++ third_party/highway/hwy/ops/x86_256-inl.h | 6476 ++++++++++++++ third_party/highway/hwy/ops/x86_512-inl.h | 5733 +++++++++++++ third_party/highway/hwy/per_target.cc | 42 + third_party/highway/hwy/per_target.h | 37 + third_party/highway/hwy/print-inl.h | 53 + third_party/highway/hwy/print.cc | 109 + third_party/highway/hwy/print.h | 73 + third_party/highway/hwy/targets.cc | 554 ++ third_party/highway/hwy/targets.h | 336 + third_party/highway/hwy/targets_test.cc | 157 + third_party/highway/hwy/tests/arithmetic_test.cc | 543 ++ .../highway/hwy/tests/blockwise_shift_test.cc | 
274 + third_party/highway/hwy/tests/blockwise_test.cc | 463 + third_party/highway/hwy/tests/combine_test.cc | 455 + third_party/highway/hwy/tests/compare_test.cc | 650 ++ third_party/highway/hwy/tests/compress_test.cc | 833 ++ third_party/highway/hwy/tests/convert_test.cc | 894 ++ third_party/highway/hwy/tests/count_test.cc | 293 + third_party/highway/hwy/tests/crypto_test.cc | 716 ++ third_party/highway/hwy/tests/demote_test.cc | 686 ++ third_party/highway/hwy/tests/expand_test.cc | 292 + third_party/highway/hwy/tests/float_test.cc | 359 + third_party/highway/hwy/tests/hwy_gtest.h | 157 + third_party/highway/hwy/tests/if_test.cc | 172 + third_party/highway/hwy/tests/interleaved_test.cc | 228 + third_party/highway/hwy/tests/list_targets.cc | 71 + third_party/highway/hwy/tests/logical_test.cc | 293 + third_party/highway/hwy/tests/mask_mem_test.cc | 205 + third_party/highway/hwy/tests/mask_test.cc | 336 + third_party/highway/hwy/tests/memory_test.cc | 345 + third_party/highway/hwy/tests/mul_test.cc | 576 ++ third_party/highway/hwy/tests/reduction_test.cc | 308 + third_party/highway/hwy/tests/reverse_test.cc | 303 + third_party/highway/hwy/tests/shift_test.cc | 419 + third_party/highway/hwy/tests/swizzle_test.cc | 367 + third_party/highway/hwy/tests/test_util-inl.h | 711 ++ third_party/highway/hwy/tests/test_util.cc | 116 + third_party/highway/hwy/tests/test_util.h | 155 + third_party/highway/hwy/tests/test_util_test.cc | 107 + third_party/highway/hwy/tests/tuple_test.cc | 75 + third_party/highway/libhwy-contrib.pc.in | 10 + third_party/highway/libhwy-test.pc.in | 11 + third_party/highway/libhwy.pc.in | 10 + third_party/highway/preamble.js.lds | 9 + third_party/highway/run_tests.bat | 20 + third_party/highway/run_tests.sh | 79 + 168 files changed, 95777 insertions(+) create mode 100644 third_party/highway/.bcr/metadata.template.json create mode 100644 third_party/highway/.bcr/presubmit.yml create mode 100644 third_party/highway/.bcr/source.template.json create mode 100644 third_party/highway/.clang-format create mode 100644 third_party/highway/.github/workflows/build_test.yml create mode 100644 third_party/highway/BUILD create mode 100644 third_party/highway/CMakeLists.txt create mode 100644 third_party/highway/CMakeLists.txt.in create mode 100644 third_party/highway/CONTRIBUTING create mode 100644 third_party/highway/LICENSE create mode 100644 third_party/highway/MODULE.bazel create mode 100644 third_party/highway/README.md create mode 100644 third_party/highway/WORKSPACE create mode 100644 third_party/highway/cmake/FindAtomics.cmake create mode 100644 third_party/highway/debian/changelog create mode 100644 third_party/highway/debian/compat create mode 100644 third_party/highway/debian/control create mode 100644 third_party/highway/debian/copyright create mode 100644 third_party/highway/debian/rules create mode 100644 third_party/highway/debian/source/format create mode 100644 third_party/highway/docs/.gitignore create mode 100644 third_party/highway/docs/Makefile create mode 100644 third_party/highway/docs/_static/css/dark.css create mode 100644 third_party/highway/docs/_static/css/toggle.css create mode 100644 third_party/highway/docs/_static/js/toggle.js create mode 100644 third_party/highway/docs/_templates/layout.html create mode 100644 third_party/highway/docs/_templates/versions.html create mode 100644 third_party/highway/docs/buildDocs.sh create mode 100644 third_party/highway/docs/conf.py create mode 100644 third_party/highway/docs/images/logo-32x32.ico create mode 100644 
third_party/highway/docs/images/logo.png create mode 100644 third_party/highway/docs/index.rst create mode 100644 third_party/highway/docs/locales/zh/index.rst create mode 100644 third_party/highway/docs/make.bat create mode 100644 third_party/highway/docs/mm-converter.py create mode 100644 third_party/highway/hwy.gni create mode 100644 third_party/highway/hwy/aligned_allocator.cc create mode 100644 third_party/highway/hwy/aligned_allocator.h create mode 100644 third_party/highway/hwy/aligned_allocator_test.cc create mode 100644 third_party/highway/hwy/base.h create mode 100644 third_party/highway/hwy/base_test.cc create mode 100644 third_party/highway/hwy/cache_control.h create mode 100644 third_party/highway/hwy/contrib/algo/copy-inl.h create mode 100644 third_party/highway/hwy/contrib/algo/copy_test.cc create mode 100644 third_party/highway/hwy/contrib/algo/find-inl.h create mode 100644 third_party/highway/hwy/contrib/algo/find_test.cc create mode 100644 third_party/highway/hwy/contrib/algo/transform-inl.h create mode 100644 third_party/highway/hwy/contrib/algo/transform_test.cc create mode 100644 third_party/highway/hwy/contrib/bit_pack/bit_pack-inl.h create mode 100644 third_party/highway/hwy/contrib/bit_pack/bit_pack_test.cc create mode 100644 third_party/highway/hwy/contrib/dot/dot-inl.h create mode 100644 third_party/highway/hwy/contrib/dot/dot_test.cc create mode 100644 third_party/highway/hwy/contrib/image/image.cc create mode 100644 third_party/highway/hwy/contrib/image/image.h create mode 100644 third_party/highway/hwy/contrib/image/image_test.cc create mode 100644 third_party/highway/hwy/contrib/math/math-inl.h create mode 100644 third_party/highway/hwy/contrib/math/math_test.cc create mode 100644 third_party/highway/hwy/contrib/sort/BUILD create mode 100644 third_party/highway/hwy/contrib/sort/README.md create mode 100644 third_party/highway/hwy/contrib/sort/algo-inl.h create mode 100644 third_party/highway/hwy/contrib/sort/bench_parallel.cc create mode 100644 third_party/highway/hwy/contrib/sort/bench_sort.cc create mode 100644 third_party/highway/hwy/contrib/sort/print_network.cc create mode 100644 third_party/highway/hwy/contrib/sort/result-inl.h create mode 100644 third_party/highway/hwy/contrib/sort/shared-inl.h create mode 100644 third_party/highway/hwy/contrib/sort/sort_test.cc create mode 100644 third_party/highway/hwy/contrib/sort/sorting_networks-inl.h create mode 100644 third_party/highway/hwy/contrib/sort/traits-inl.h create mode 100644 third_party/highway/hwy/contrib/sort/traits128-inl.h create mode 100644 third_party/highway/hwy/contrib/sort/vqsort-inl.h create mode 100644 third_party/highway/hwy/contrib/sort/vqsort.cc create mode 100644 third_party/highway/hwy/contrib/sort/vqsort.h create mode 100644 third_party/highway/hwy/contrib/sort/vqsort_128a.cc create mode 100644 third_party/highway/hwy/contrib/sort/vqsort_128d.cc create mode 100644 third_party/highway/hwy/contrib/sort/vqsort_f32a.cc create mode 100644 third_party/highway/hwy/contrib/sort/vqsort_f32d.cc create mode 100644 third_party/highway/hwy/contrib/sort/vqsort_f64a.cc create mode 100644 third_party/highway/hwy/contrib/sort/vqsort_f64d.cc create mode 100644 third_party/highway/hwy/contrib/sort/vqsort_i16a.cc create mode 100644 third_party/highway/hwy/contrib/sort/vqsort_i16d.cc create mode 100644 third_party/highway/hwy/contrib/sort/vqsort_i32a.cc create mode 100644 third_party/highway/hwy/contrib/sort/vqsort_i32d.cc create mode 100644 third_party/highway/hwy/contrib/sort/vqsort_i64a.cc create mode 
100644 third_party/highway/hwy/contrib/sort/vqsort_i64d.cc create mode 100644 third_party/highway/hwy/contrib/sort/vqsort_kv128a.cc create mode 100644 third_party/highway/hwy/contrib/sort/vqsort_kv128d.cc create mode 100644 third_party/highway/hwy/contrib/sort/vqsort_kv64a.cc create mode 100644 third_party/highway/hwy/contrib/sort/vqsort_kv64d.cc create mode 100644 third_party/highway/hwy/contrib/sort/vqsort_u16a.cc create mode 100644 third_party/highway/hwy/contrib/sort/vqsort_u16d.cc create mode 100644 third_party/highway/hwy/contrib/sort/vqsort_u32a.cc create mode 100644 third_party/highway/hwy/contrib/sort/vqsort_u32d.cc create mode 100644 third_party/highway/hwy/contrib/sort/vqsort_u64a.cc create mode 100644 third_party/highway/hwy/contrib/sort/vqsort_u64d.cc create mode 100644 third_party/highway/hwy/detect_compiler_arch.h create mode 100644 third_party/highway/hwy/detect_targets.h create mode 100644 third_party/highway/hwy/examples/benchmark.cc create mode 100644 third_party/highway/hwy/examples/skeleton-inl.h create mode 100644 third_party/highway/hwy/examples/skeleton.cc create mode 100644 third_party/highway/hwy/examples/skeleton.h create mode 100644 third_party/highway/hwy/examples/skeleton_test.cc create mode 100644 third_party/highway/hwy/foreach_target.h create mode 100644 third_party/highway/hwy/highway.h create mode 100644 third_party/highway/hwy/highway_export.h create mode 100644 third_party/highway/hwy/highway_test.cc create mode 100644 third_party/highway/hwy/hwy.version create mode 100644 third_party/highway/hwy/nanobenchmark.cc create mode 100644 third_party/highway/hwy/nanobenchmark.h create mode 100644 third_party/highway/hwy/nanobenchmark_test.cc create mode 100644 third_party/highway/hwy/ops/arm_neon-inl.h create mode 100644 third_party/highway/hwy/ops/arm_sve-inl.h create mode 100644 third_party/highway/hwy/ops/emu128-inl.h create mode 100644 third_party/highway/hwy/ops/generic_ops-inl.h create mode 100644 third_party/highway/hwy/ops/ppc_vsx-inl.h create mode 100644 third_party/highway/hwy/ops/rvv-inl.h create mode 100644 third_party/highway/hwy/ops/scalar-inl.h create mode 100644 third_party/highway/hwy/ops/set_macros-inl.h create mode 100644 third_party/highway/hwy/ops/shared-inl.h create mode 100644 third_party/highway/hwy/ops/tuple-inl.h create mode 100644 third_party/highway/hwy/ops/wasm_128-inl.h create mode 100644 third_party/highway/hwy/ops/wasm_256-inl.h create mode 100644 third_party/highway/hwy/ops/x86_128-inl.h create mode 100644 third_party/highway/hwy/ops/x86_256-inl.h create mode 100644 third_party/highway/hwy/ops/x86_512-inl.h create mode 100644 third_party/highway/hwy/per_target.cc create mode 100644 third_party/highway/hwy/per_target.h create mode 100644 third_party/highway/hwy/print-inl.h create mode 100644 third_party/highway/hwy/print.cc create mode 100644 third_party/highway/hwy/print.h create mode 100644 third_party/highway/hwy/targets.cc create mode 100644 third_party/highway/hwy/targets.h create mode 100644 third_party/highway/hwy/targets_test.cc create mode 100644 third_party/highway/hwy/tests/arithmetic_test.cc create mode 100644 third_party/highway/hwy/tests/blockwise_shift_test.cc create mode 100644 third_party/highway/hwy/tests/blockwise_test.cc create mode 100644 third_party/highway/hwy/tests/combine_test.cc create mode 100644 third_party/highway/hwy/tests/compare_test.cc create mode 100644 third_party/highway/hwy/tests/compress_test.cc create mode 100644 third_party/highway/hwy/tests/convert_test.cc create mode 100644 
third_party/highway/hwy/tests/count_test.cc create mode 100644 third_party/highway/hwy/tests/crypto_test.cc create mode 100644 third_party/highway/hwy/tests/demote_test.cc create mode 100644 third_party/highway/hwy/tests/expand_test.cc create mode 100644 third_party/highway/hwy/tests/float_test.cc create mode 100644 third_party/highway/hwy/tests/hwy_gtest.h create mode 100644 third_party/highway/hwy/tests/if_test.cc create mode 100644 third_party/highway/hwy/tests/interleaved_test.cc create mode 100644 third_party/highway/hwy/tests/list_targets.cc create mode 100644 third_party/highway/hwy/tests/logical_test.cc create mode 100644 third_party/highway/hwy/tests/mask_mem_test.cc create mode 100644 third_party/highway/hwy/tests/mask_test.cc create mode 100644 third_party/highway/hwy/tests/memory_test.cc create mode 100644 third_party/highway/hwy/tests/mul_test.cc create mode 100644 third_party/highway/hwy/tests/reduction_test.cc create mode 100644 third_party/highway/hwy/tests/reverse_test.cc create mode 100644 third_party/highway/hwy/tests/shift_test.cc create mode 100644 third_party/highway/hwy/tests/swizzle_test.cc create mode 100644 third_party/highway/hwy/tests/test_util-inl.h create mode 100644 third_party/highway/hwy/tests/test_util.cc create mode 100644 third_party/highway/hwy/tests/test_util.h create mode 100644 third_party/highway/hwy/tests/test_util_test.cc create mode 100644 third_party/highway/hwy/tests/tuple_test.cc create mode 100644 third_party/highway/libhwy-contrib.pc.in create mode 100644 third_party/highway/libhwy-test.pc.in create mode 100644 third_party/highway/libhwy.pc.in create mode 100644 third_party/highway/preamble.js.lds create mode 100644 third_party/highway/run_tests.bat create mode 100644 third_party/highway/run_tests.sh (limited to 'third_party/highway') diff --git a/third_party/highway/.bcr/metadata.template.json b/third_party/highway/.bcr/metadata.template.json new file mode 100644 index 0000000000..1a9a13645a --- /dev/null +++ b/third_party/highway/.bcr/metadata.template.json @@ -0,0 +1,25 @@ +{ + "homepage": "https://github.com/google/highway", + "maintainers": [ + { + "email": "janwas@google.com", + "github": "jan-wassenberg", + "name": "Jan Wassenberg" + }, + { + "email": "eustas@google.com", + "github": "eustas", + "name": "Eugene Kliuchnikov" + }, + { + "email": "rhettstucki@google.com", + "github": "rhettstucki", + "name": "Rhett Stucki" + } + ], + "repository": [ + "github:google/highway" + ], + "versions": [], + "yanked_versions": {} +} diff --git a/third_party/highway/.bcr/presubmit.yml b/third_party/highway/.bcr/presubmit.yml new file mode 100644 index 0000000000..03440834fe --- /dev/null +++ b/third_party/highway/.bcr/presubmit.yml @@ -0,0 +1,10 @@ +bcr_test_module: + module_path: "bzlmod" + matrix: + platform: ["debian10", "macos", "ubuntu2004", "windows"] + tasks: + run_tests: + name: "Run test module" + platform: ${{ platform }} + test_targets: + - ":hwy_ops_tests" diff --git a/third_party/highway/.bcr/source.template.json b/third_party/highway/.bcr/source.template.json new file mode 100644 index 0000000000..4f1481990e --- /dev/null +++ b/third_party/highway/.bcr/source.template.json @@ -0,0 +1,5 @@ +{ + "integrity": "", + "strip_prefix": "{REPO}-{VERSION}", + "url": "https://github.com/{OWNER}/{REPO}/archive/refs/tags/{TAG}.tar.gz" +} diff --git a/third_party/highway/.clang-format b/third_party/highway/.clang-format new file mode 100644 index 0000000000..f6cb8ad931 --- /dev/null +++ b/third_party/highway/.clang-format @@ -0,0 +1 @@ 
+BasedOnStyle: Google diff --git a/third_party/highway/.github/workflows/build_test.yml b/third_party/highway/.github/workflows/build_test.yml new file mode 100644 index 0000000000..bab1630bda --- /dev/null +++ b/third_party/highway/.github/workflows/build_test.yml @@ -0,0 +1,57 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: Build / test +on: [push, pull_request] +jobs: + cmake: + name: Build and test ${{ matrix.name }} + runs-on: ubuntu-18.04 + strategy: + matrix: + include: + - name: Clang-5.0 + extra_deps: clang-5.0 + c_compiler: clang-5.0 + cxx_compiler: clang++-5.0 + + - name: Clang-6.0 + extra_deps: clang-6.0 + c_compiler: clang-6.0 + cxx_compiler: clang++-6.0 + + steps: + - uses: actions/checkout@v2 + + - name: Install deps + run: sudo apt-get install ${{ matrix.extra_deps }} + + - name: Build and test + run: | + export CMAKE_BUILD_PARALLEL_LEVEL=2 + export CTEST_PARALLEL_LEVEL=2 + CXXFLAGS=-Werror CC=${{ matrix.c_compiler }} CXX=${{ matrix.cxx_compiler }} cmake -B out . + cmake --build out + ctest --test-dir out + + bazel: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - uses: bazelbuild/setup-bazelisk@v1 + - uses: actions/cache@v2 + with: + path: ~/.cache/bazel + key: bazel-${{ runner.os }} + - run: bazel build //... diff --git a/third_party/highway/BUILD b/third_party/highway/BUILD new file mode 100644 index 0000000000..92ca4721fc --- /dev/null +++ b/third_party/highway/BUILD @@ -0,0 +1,443 @@ +load("@rules_license//rules:license.bzl", "license") +load("@bazel_skylib//lib:selects.bzl", "selects") + +load("@rules_cc//cc:defs.bzl", "cc_test") +package( + default_applicable_licenses = ["//:license"], + default_visibility = ["//visibility:public"], +) + +license( + name = "license", + package_name = "highway", +) + +licenses(["notice"]) + +exports_files(["LICENSE"]) + +# Detect compiler: +config_setting( + name = "compiler_clang", + flag_values = {"@bazel_tools//tools/cpp:compiler": "clang"}, +) + +config_setting( + name = "compiler_clangcl", + flag_values = {"@bazel_tools//tools/cpp:compiler": "lexan"}, +) + +config_setting( + name = "compiler_msvc_actual", + flag_values = {"@bazel_tools//tools/cpp:compiler": "msvc"}, +) + +# The above is insufficient for Bazel on Windows, which does not seem to +# detect/set a compiler flag. This workaround prevents compile errors due to +# passing clang-only warning flags to MSVC. 
+config_setting( + name = "compiler_msvc_cpu", + values = { + "cpu": "x64_windows", + }, +) + +selects.config_setting_group( + name = "compiler_msvc", + match_any = [ + ":compiler_msvc_actual", + ":compiler_msvc_cpu", + ], +) + +config_setting( + name = "compiler_emscripten", + values = {"cpu": "wasm32"}, +) + +# See https://github.com/bazelbuild/bazel/issues/12707 +config_setting( + name = "compiler_gcc_bug", + flag_values = { + "@bazel_tools//tools/cpp:compiler": "compiler", + }, +) + +config_setting( + name = "compiler_gcc_actual", + flag_values = { + "@bazel_tools//tools/cpp:compiler": "gcc", + }, +) + +selects.config_setting_group( + name = "compiler_gcc", + match_any = [ + ":compiler_gcc_bug", + ":compiler_gcc_actual", + ], +) + +# Additional warnings for Clang OR GCC (skip for MSVC) +CLANG_GCC_COPTS = [ + "-Wunused-parameter", + "-Wunused-variable", + "-Wextra-semi", + "-Wunreachable-code", +] + +# Warnings supported by Clang and Clang-cl +CLANG_OR_CLANGCL_OPTS = CLANG_GCC_COPTS + [ + "-Wfloat-overflow-conversion", + "-Wfloat-zero-conversion", + "-Wfor-loop-analysis", + "-Wgnu-redeclared-enum", + "-Winfinite-recursion", + "-Wliteral-conversion", + "-Wno-c++98-compat", + "-Wno-unused-command-line-argument", + "-Wprivate-header", + "-Wself-assign", + "-Wstring-conversion", + "-Wtautological-overlap-compare", + "-Wthread-safety-analysis", + "-Wundefined-func-template", + "-Wunused-comparison", +] + +# Warnings only supported by Clang, but not Clang-cl +CLANG_ONLY_COPTS = CLANG_OR_CLANGCL_OPTS + [ + # Do not treat the third_party headers as system headers when building + # highway - the errors are pertinent. + "--no-system-header-prefix=third_party/highway", +] + +COPTS = select({ + ":compiler_msvc": [], + ":compiler_gcc": CLANG_GCC_COPTS, + ":compiler_clangcl": CLANG_OR_CLANGCL_OPTS, + # Default to clang because compiler detection only works in Bazel + "//conditions:default": CLANG_ONLY_COPTS, +}) + select({ + "@platforms//cpu:riscv64": [ + "-march=rv64gcv1p0", + "-menable-experimental-extensions", + ], + "//conditions:default": [ + ], +}) + +DEFINES = select({ + ":compiler_msvc": ["HWY_SHARED_DEFINE"], + ":compiler_clangcl": ["HWY_SHARED_DEFINE"], + "//conditions:default": [], +}) + +# Unused on Bazel builds, where this is not defined/known; Copybara replaces +# usages with an empty list. +COMPAT = [ + "//buildenv/target:non_prod", # includes mobile/vendor. +] + +# WARNING: changing flags such as HWY_DISABLED_TARGETS may break users without +# failing integration tests, if the machine running tests does not support the +# newly enabled instruction set, or the failure is only caught by sanitizers +# which do not run in CI. 
+ +cc_library( + name = "hwy", + srcs = [ + "hwy/aligned_allocator.cc", + "hwy/per_target.cc", + "hwy/print.cc", + "hwy/targets.cc", + ], + # Normal headers with include guards + hdrs = [ + "hwy/aligned_allocator.h", + "hwy/base.h", + "hwy/cache_control.h", + "hwy/detect_compiler_arch.h", # private + "hwy/print.h", + ], + compatible_with = [], + copts = COPTS, + defines = DEFINES, + local_defines = ["hwy_EXPORTS"], + textual_hdrs = [ + # These are textual because config macros influence them: + "hwy/detect_targets.h", # private + "hwy/targets.h", + # This .cc file #includes itself through foreach_target.h + "hwy/per_target.cc", + # End of list + "hwy/highway.h", # public + "hwy/foreach_target.h", # public + "hwy/per_target.h", # public + "hwy/print-inl.h", # public + "hwy/highway_export.h", # public + "hwy/ops/arm_neon-inl.h", + "hwy/ops/arm_sve-inl.h", + "hwy/ops/emu128-inl.h", + "hwy/ops/generic_ops-inl.h", + "hwy/ops/scalar-inl.h", + "hwy/ops/set_macros-inl.h", + "hwy/ops/shared-inl.h", + "hwy/ops/tuple-inl.h", + "hwy/ops/x86_128-inl.h", + "hwy/ops/x86_256-inl.h", + "hwy/ops/x86_512-inl.h", + # Select avoids recompiling native arch if only non-native changed + ] + select({ + ":compiler_emscripten": [ + "hwy/ops/wasm_128-inl.h", + "hwy/ops/wasm_256-inl.h", + ], + "//conditions:default": [], + }) + select({ + "@platforms//cpu:riscv64": ["hwy/ops/rvv-inl.h"], + "//conditions:default": [], + }), +) + +cc_library( + name = "algo", + compatible_with = [], + copts = COPTS, + textual_hdrs = [ + "hwy/contrib/algo/copy-inl.h", + "hwy/contrib/algo/find-inl.h", + "hwy/contrib/algo/transform-inl.h", + ], + deps = [ + ":hwy", + ], +) + +cc_library( + name = "bit_pack", + compatible_with = [], + copts = COPTS, + textual_hdrs = [ + "hwy/contrib/bit_pack/bit_pack-inl.h", + ], + deps = [ + ":hwy", + ], +) + +cc_library( + name = "dot", + compatible_with = [], + copts = COPTS, + textual_hdrs = [ + "hwy/contrib/dot/dot-inl.h", + ], + deps = [ + ":hwy", + ], +) + +cc_library( + name = "image", + srcs = [ + "hwy/contrib/image/image.cc", + ], + hdrs = [ + "hwy/contrib/image/image.h", + ], + compatible_with = [], + copts = COPTS, + local_defines = ["hwy_contrib_EXPORTS"], + deps = [ + ":hwy", + ], +) + +cc_library( + name = "math", + compatible_with = [], + copts = COPTS, + textual_hdrs = [ + "hwy/contrib/math/math-inl.h", + ], + deps = [ + ":hwy", + ], +) + +# Everything required for tests that use Highway. +cc_library( + name = "hwy_test_util", + srcs = ["hwy/tests/test_util.cc"], + hdrs = ["hwy/tests/test_util.h"], + compatible_with = [], + copts = COPTS, + local_defines = ["hwy_test_EXPORTS"], + textual_hdrs = [ + "hwy/tests/test_util-inl.h", + "hwy/tests/hwy_gtest.h", + ], + # Must not depend on a gtest variant, which can conflict with the + # GUNIT_INTERNAL_BUILD_MODE defined by the test. 
+ deps = [ + ":hwy", + ], +) + +cc_library( + name = "nanobenchmark", + srcs = ["hwy/nanobenchmark.cc"], + hdrs = ["hwy/nanobenchmark.h"], + compatible_with = [], + copts = COPTS, + local_defines = ["hwy_EXPORTS"], + deps = [":hwy"], +) + +cc_binary( + name = "benchmark", + srcs = ["hwy/examples/benchmark.cc"], + copts = COPTS, + deps = [ + ":hwy", + ":nanobenchmark", + ], +) + +cc_library( + name = "skeleton", + srcs = ["hwy/examples/skeleton.cc"], + hdrs = ["hwy/examples/skeleton.h"], + copts = COPTS, + local_defines = ["hwy_EXPORTS"], + textual_hdrs = ["hwy/examples/skeleton-inl.h"], + deps = [ + ":hwy", + ], +) + +cc_test( + name = "list_targets", + size = "small", + srcs = ["hwy/tests/list_targets.cc"], + deps = [":hwy"], +) + +# path, name +HWY_TESTS = [ + ("hwy/contrib/algo/", "copy_test"), + ("hwy/contrib/algo/", "find_test"), + ("hwy/contrib/algo/", "transform_test"), + ("hwy/contrib/bit_pack/", "bit_pack_test"), + ("hwy/contrib/dot/", "dot_test"), + ("hwy/contrib/image/", "image_test"), + ("hwy/contrib/math/", "math_test"), + # contrib/sort has its own BUILD, we also add sort_test to GUITAR_TESTS. + # To run bench_sort, specify --test=hwy/contrib/sort:bench_sort. + ("hwy/examples/", "skeleton_test"), + ("hwy/", "nanobenchmark_test"), + ("hwy/", "aligned_allocator_test"), + ("hwy/", "base_test"), + ("hwy/", "highway_test"), + ("hwy/", "targets_test"), + ("hwy/tests/", "arithmetic_test"), + ("hwy/tests/", "blockwise_shift_test"), + ("hwy/tests/", "blockwise_test"), + ("hwy/tests/", "combine_test"), + ("hwy/tests/", "compare_test"), + ("hwy/tests/", "compress_test"), + ("hwy/tests/", "convert_test"), + ("hwy/tests/", "count_test"), + ("hwy/tests/", "crypto_test"), + ("hwy/tests/", "demote_test"), + ("hwy/tests/", "expand_test"), + ("hwy/tests/", "float_test"), + ("hwy/tests/", "if_test"), + ("hwy/tests/", "interleaved_test"), + ("hwy/tests/", "logical_test"), + ("hwy/tests/", "mask_mem_test"), + ("hwy/tests/", "mask_test"), + ("hwy/tests/", "memory_test"), + ("hwy/tests/", "mul_test"), + ("hwy/tests/", "reduction_test"), + ("hwy/tests/", "reverse_test"), + ("hwy/tests/", "shift_test"), + ("hwy/tests/", "swizzle_test"), + ("hwy/tests/", "test_util_test"), + ("hwy/tests/", "tuple_test"), +] + +HWY_TEST_COPTS = select({ + ":compiler_msvc": [], + "//conditions:default": [ + # gTest triggers this warning (which is enabled by the + # extra-semi in COPTS), so we need to disable it here, + # but it's still enabled for :hwy. + "-Wno-c++98-compat-extra-semi", + ], +}) + +HWY_TEST_DEPS = [ + ":algo", + ":bit_pack", + ":dot", + ":hwy", + ":hwy_test_util", + ":image", + ":math", + ":nanobenchmark", + ":skeleton", + "//hwy/contrib/sort:vqsort", + "@com_google_googletest//:gtest_main", +] + +[ + [ + cc_test( + name = test, + size = "medium", + timeout = "long", # default moderate is not enough for math_test + srcs = [ + subdir + test + ".cc", + ], + copts = COPTS + HWY_TEST_COPTS, + features = select({ + "@platforms//cpu:riscv64": ["fully_static_link"], + "//conditions:default": [], + }), + linkopts = select({ + ":compiler_emscripten": [ + "-s ASSERTIONS=2", + "-s ENVIRONMENT=node,shell,web", + "-s ERROR_ON_UNDEFINED_SYMBOLS=1", + "-s DEMANGLE_SUPPORT=1", + "-s EXIT_RUNTIME=1", + "-s ALLOW_MEMORY_GROWTH=1", + "--pre-js $(location :preamble.js.lds)", + ], + "//conditions:default": [], + }), + linkstatic = select({ + "@platforms//cpu:riscv64": True, + "//conditions:default": False, + }), + local_defines = ["HWY_IS_TEST"], + # for test_suite. 
+ tags = ["hwy_ops_test"], + deps = HWY_TEST_DEPS + select({ + ":compiler_emscripten": [":preamble.js.lds"], + "//conditions:default": [], + }), + ), + ] + for subdir, test in HWY_TESTS +] + +# For manually building the tests we define here (:all does not work in --config=msvc) +test_suite( + name = "hwy_ops_tests", + tags = ["hwy_ops_test"], +) + +# Placeholder for integration test, do not remove diff --git a/third_party/highway/CMakeLists.txt b/third_party/highway/CMakeLists.txt new file mode 100644 index 0000000000..8685c04e7f --- /dev/null +++ b/third_party/highway/CMakeLists.txt @@ -0,0 +1,595 @@ +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +cmake_minimum_required(VERSION 3.10) + +# Set PIE flags for POSITION_INDEPENDENT_CODE targets, added in 3.14. +if(POLICY CMP0083) + cmake_policy(SET CMP0083 NEW) +endif() + +# Workaround for 3.19 raising error 'IMPORTED_LOCATION not set for imported +# target "GTest::gtest_main"'. +if(POLICY CMP0111) + cmake_policy(SET CMP0111 OLD) +endif() + +project(hwy VERSION 1.0.4) # Keep in sync with highway.h version +# `hwy` is lowercase to handle find_package() in Config mode: +set(namespace "${PROJECT_NAME}::") + +# Directly define the ABI version from the cmake project() version values: +set(LIBRARY_VERSION "${hwy_VERSION}") +set(LIBRARY_SOVERSION ${hwy_VERSION_MAJOR}) + +set(CMAKE_CXX_EXTENSIONS OFF) + +list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake") +# Search for Atomics implementation: +find_package(Atomics REQUIRED) + +# Enabled PIE binaries by default if supported. +include(CheckPIESupported OPTIONAL RESULT_VARIABLE CHECK_PIE_SUPPORTED) +if(CHECK_PIE_SUPPORTED) + check_pie_supported(LANGUAGES CXX) + if(CMAKE_CXX_LINK_PIE_SUPPORTED) + set(CMAKE_POSITION_INDEPENDENT_CODE TRUE) + endif() +endif() + +include(GNUInstallDirs) + +if (NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE RelWithDebInfo) +endif() + +# The following is only required with GCC < 6.1.0 or CLANG < 16.0 +set(HWY_CMAKE_ARM7 OFF CACHE BOOL "Set copts for Armv7 with NEON (requires vfpv4)?") + +# Unconditionally adding -Werror risks breaking the build when new warnings +# arise due to compiler/platform changes. Enable this in CI/tests. +set(HWY_WARNINGS_ARE_ERRORS OFF CACHE BOOL "Add -Werror flag?") + +set(HWY_ENABLE_CONTRIB ON CACHE BOOL "Include contrib/") +set(HWY_ENABLE_EXAMPLES ON CACHE BOOL "Build examples") +set(HWY_ENABLE_INSTALL ON CACHE BOOL "Install library") +set(HWY_ENABLE_TESTS ON CACHE BOOL "Enable HWY tests") + +include(CheckCXXSourceCompiles) +check_cxx_source_compiles( + "int main() { + #if !defined(__EMSCRIPTEN__) + static_assert(false, \"__EMSCRIPTEN__ is not defined\"); + #endif + return 0; + }" + HWY_EMSCRIPTEN +) + +check_cxx_source_compiles( + "int main() { + #if !defined(__riscv) + static_assert(false, \"__riscv is not defined\"); + #endif + return 0; + }" + HWY_RISCV +) + +if (HWY_ENABLE_CONTRIB) +# Glob all the traits so we don't need to modify this file when adding +# additional special cases. 
+file(GLOB HWY_CONTRIB_SOURCES "hwy/contrib/sort/vqsort_*.cc") +list(APPEND HWY_CONTRIB_SOURCES + hwy/contrib/dot/dot-inl.h + hwy/contrib/image/image.cc + hwy/contrib/image/image.h + hwy/contrib/math/math-inl.h + hwy/contrib/sort/shared-inl.h + hwy/contrib/sort/sorting_networks-inl.h + hwy/contrib/sort/traits-inl.h + hwy/contrib/sort/traits128-inl.h + hwy/contrib/sort/vqsort-inl.h + hwy/contrib/sort/vqsort.cc + hwy/contrib/sort/vqsort.h + hwy/contrib/algo/copy-inl.h + hwy/contrib/algo/find-inl.h + hwy/contrib/algo/transform-inl.h +) +endif() # HWY_ENABLE_CONTRIB + +set(HWY_SOURCES + hwy/aligned_allocator.cc + hwy/aligned_allocator.h + hwy/base.h + hwy/cache_control.h + hwy/detect_compiler_arch.h # private + hwy/detect_targets.h # private + hwy/foreach_target.h + hwy/highway.h + hwy/highway_export.h + hwy/nanobenchmark.cc + hwy/nanobenchmark.h + hwy/ops/arm_neon-inl.h + hwy/ops/arm_sve-inl.h + hwy/ops/emu128-inl.h + hwy/ops/generic_ops-inl.h + hwy/ops/ppc_vsx-inl.h + hwy/ops/rvv-inl.h + hwy/ops/scalar-inl.h + hwy/ops/set_macros-inl.h + hwy/ops/shared-inl.h + hwy/ops/wasm_128-inl.h + hwy/ops/tuple-inl.h + hwy/ops/x86_128-inl.h + hwy/ops/x86_256-inl.h + hwy/ops/x86_512-inl.h + hwy/per_target.cc + hwy/per_target.h + hwy/print-inl.h + hwy/print.cc + hwy/print.h + hwy/targets.cc + hwy/targets.h +) + +set(HWY_TEST_SOURCES + hwy/tests/hwy_gtest.h + hwy/tests/test_util-inl.h + hwy/tests/test_util.cc + hwy/tests/test_util.h +) + +if (MSVC) + set(HWY_FLAGS + # fix build error C1128 in blockwise*_test & arithmetic_test + /bigobj + ) +else() + set(HWY_FLAGS + # Avoid changing binaries based on the current time and date. + -Wno-builtin-macro-redefined + -D__DATE__="redacted" + -D__TIMESTAMP__="redacted" + -D__TIME__="redacted" + + # Optimizations + -fmerge-all-constants + + # Warnings + -Wall + -Wextra + # These are not included in Wall nor Wextra: + -Wconversion + -Wsign-conversion + -Wvla + -Wnon-virtual-dtor + ) + + if(${CMAKE_CXX_COMPILER_ID} MATCHES "Clang") + list(APPEND HWY_FLAGS + -Wfloat-overflow-conversion + -Wfloat-zero-conversion + -Wfor-loop-analysis + -Wgnu-redeclared-enum + -Winfinite-recursion + -Wself-assign + -Wstring-conversion + -Wtautological-overlap-compare + -Wthread-safety-analysis + -Wundefined-func-template + + -fno-cxx-exceptions + -fno-slp-vectorize + -fno-vectorize + + # Use color in messages + -fdiagnostics-show-option -fcolor-diagnostics + ) + if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 6.0) + list(APPEND HWY_FLAGS -Wc++2a-extensions) + endif() + endif() + + if (WIN32) + if(${CMAKE_CXX_COMPILER_ID} MATCHES "Clang") + list(APPEND HWY_FLAGS + -Wno-global-constructors + -Wno-language-extension-token + -Wno-used-but-marked-unused + -Wno-shadow-field-in-constructor + -Wno-unused-member-function + -Wno-unused-template + -Wno-c++98-compat-pedantic + -Wno-used-but-marked-unused + -Wno-zero-as-null-pointer-constant + ) + endif() + + list(APPEND HWY_FLAGS + -Wno-cast-align + -Wno-double-promotion + -Wno-float-equal + -Wno-format-nonliteral + -Wno-shadow + -Wno-sign-conversion + ) + else() + list(APPEND HWY_FLAGS + -fmath-errno + -fno-exceptions + ) + endif() # WIN32 + + if (HWY_CMAKE_ARM7) + list(APPEND HWY_FLAGS + -march=armv7-a + -mfpu=neon-vfpv4 + -mfloat-abi=hard # must match the toolchain specified as CXX= + -mfp16-format=ieee # required for vcvt_f32_f16 + ) + endif() # HWY_CMAKE_ARM7 + + if(HWY_RISCV) + # gcc(13) and recent clang both support V, but not yet runtime dispatch, so + # we add the gcv compiler flag, which then requires the CPU (now when using + # either 
compiler) to support V. + list(APPEND HWY_FLAGS -march=rv64gcv1p0) + if(${CMAKE_CXX_COMPILER_ID} MATCHES "Clang") + list(APPEND HWY_FLAGS -menable-experimental-extensions) + endif() + endif() + + if (HWY_WARNINGS_ARE_ERRORS) + list(APPEND HWY_FLAGS -Werror) + endif() + + # Prevent "wasm-ld: error: --shared-memory is disallowed by targets.cc.o + # because it was not compiled with 'atomics' or 'bulk-memory' features." + if (HWY_EMSCRIPTEN) + list(APPEND HWY_FLAGS -matomics) + endif() + +endif() # !MSVC + +include(CheckIncludeFile) +check_include_file(sys/auxv.h HAVE_SYS_AUXV_H) +check_include_file(asm/hwcap.h HAVE_ASM_HWCAP_H) +if (NOT HAVE_ASM_HWCAP_H) + list(APPEND HWY_FLAGS -DTOOLCHAIN_MISS_ASM_HWCAP_H) +endif() + +# By default prefer STATIC build (legacy behavior) +option(BUILD_SHARED_LIBS "Build shared libraries" OFF) +option(HWY_FORCE_STATIC_LIBS "Ignore BUILD_SHARED_LIBS" OFF) +# only expose shared/static options to advanced users: +mark_as_advanced(BUILD_SHARED_LIBS) +mark_as_advanced(HWY_FORCE_STATIC_LIBS) +# Define visibility settings globally: +set(CMAKE_CXX_VISIBILITY_PRESET hidden) +set(CMAKE_VISIBILITY_INLINES_HIDDEN 1) + +# Copy-cat "add_library" logic + add override. +set(HWY_LIBRARY_TYPE "SHARED") +if (NOT BUILD_SHARED_LIBS OR HWY_FORCE_STATIC_LIBS) + set(HWY_LIBRARY_TYPE "STATIC") +endif() + +# This preprocessor define will drive the build, also used in the *.pc files: +if("${HWY_LIBRARY_TYPE}" STREQUAL "SHARED") + set(DLLEXPORT_TO_DEFINE "HWY_SHARED_DEFINE") +else() + set(DLLEXPORT_TO_DEFINE "HWY_STATIC_DEFINE") +endif() + +add_library(hwy ${HWY_LIBRARY_TYPE} ${HWY_SOURCES}) +if(NOT HAVE_SYS_AUXV_H) + target_compile_definitions(hwy PUBLIC TOOLCHAIN_MISS_SYS_AUXV_H) +endif() +target_compile_definitions(hwy PUBLIC "${DLLEXPORT_TO_DEFINE}") +target_compile_options(hwy PRIVATE ${HWY_FLAGS}) +set_property(TARGET hwy PROPERTY POSITION_INDEPENDENT_CODE ON) +set_target_properties(hwy PROPERTIES VERSION ${LIBRARY_VERSION} SOVERSION ${LIBRARY_SOVERSION}) +target_include_directories(hwy PUBLIC + $ + $) +target_compile_features(hwy PUBLIC cxx_std_11) +set_target_properties(hwy PROPERTIES + LINK_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/hwy/hwy.version) +# For GCC __atomic_store_8, see #887 +target_link_libraries(hwy PRIVATE ${ATOMICS_LIBRARIES}) +# not supported by MSVC/Clang, safe to skip (we use DLLEXPORT annotations) +if(UNIX AND NOT APPLE) + set_property(TARGET hwy APPEND_STRING PROPERTY + LINK_FLAGS " -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/hwy/hwy.version") +endif() + +if (HWY_ENABLE_CONTRIB) +add_library(hwy_contrib ${HWY_LIBRARY_TYPE} ${HWY_CONTRIB_SOURCES}) +target_link_libraries(hwy_contrib hwy) +target_compile_options(hwy_contrib PRIVATE ${HWY_FLAGS}) +set_property(TARGET hwy_contrib PROPERTY POSITION_INDEPENDENT_CODE ON) +set_target_properties(hwy_contrib PROPERTIES VERSION ${LIBRARY_VERSION} SOVERSION ${LIBRARY_SOVERSION}) +target_include_directories(hwy_contrib PUBLIC + $ + $) +target_compile_features(hwy_contrib PUBLIC cxx_std_11) +set_target_properties(hwy_contrib PROPERTIES + LINK_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/hwy/hwy.version) +# not supported by MSVC/Clang, safe to skip (we use DLLEXPORT annotations) +if(UNIX AND NOT APPLE) + set_property(TARGET hwy_contrib APPEND_STRING PROPERTY + LINK_FLAGS " -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/hwy/hwy.version") +endif() +endif() # HWY_ENABLE_CONTRIB + +add_library(hwy_test ${HWY_LIBRARY_TYPE} ${HWY_TEST_SOURCES}) +target_link_libraries(hwy_test hwy) +target_compile_options(hwy_test PRIVATE ${HWY_FLAGS}) 
+set_property(TARGET hwy_test PROPERTY POSITION_INDEPENDENT_CODE ON) +set_target_properties(hwy_test PROPERTIES VERSION ${LIBRARY_VERSION} SOVERSION ${LIBRARY_SOVERSION}) +target_include_directories(hwy_test PUBLIC + $ + $) +target_compile_features(hwy_test PUBLIC cxx_std_11) +set_target_properties(hwy_test PROPERTIES + LINK_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/hwy/hwy.version) +# not supported by MSVC/Clang, safe to skip (we use DLLEXPORT annotations) +if(UNIX AND NOT APPLE) + set_property(TARGET hwy_test APPEND_STRING PROPERTY + LINK_FLAGS " -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/hwy/hwy.version") +endif() + +# -------------------------------------------------------- hwy_list_targets +# Generate a tool to print the compiled-in targets as defined by the current +# flags. This tool will print to stderr at build time, after building hwy. +add_executable(hwy_list_targets hwy/tests/list_targets.cc) +target_compile_options(hwy_list_targets PRIVATE ${HWY_FLAGS}) +target_link_libraries(hwy_list_targets hwy) +target_include_directories(hwy_list_targets PRIVATE + $) +# TARGET_FILE always returns the path to executable +# Naked target also not always could be run (due to the lack of '.\' prefix) +# Thus effective command to run should contain the full path +# and emulator prefix (if any). +if (NOT CMAKE_CROSSCOMPILING OR CMAKE_CROSSCOMPILING_EMULATOR) +add_custom_command(TARGET hwy_list_targets POST_BUILD + COMMAND ${CMAKE_CROSSCOMPILING_EMULATOR} $ || (exit 0)) +endif() + +# -------------------------------------------------------- +# Allow skipping the following sections for projects that do not need them: +# tests, examples, benchmarks and installation. + +# -------------------------------------------------------- install library +if (HWY_ENABLE_INSTALL) + +install(TARGETS hwy EXPORT hwy_targets + LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}" + ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}" + RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}") +# Install all the headers keeping the relative path to the current directory +# when installing them. +foreach (source ${HWY_SOURCES}) + if ("${source}" MATCHES "\.h$") + get_filename_component(dirname "${source}" DIRECTORY) + install(FILES "${source}" + DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/${dirname}") + endif() +endforeach() + +if (HWY_ENABLE_CONTRIB) +install(TARGETS hwy_contrib EXPORT hwy_targets + LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}" + ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}" + RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}") +# Install all the headers keeping the relative path to the current directory +# when installing them. +foreach (source ${HWY_CONTRIB_SOURCES}) + if ("${source}" MATCHES "\.h$") + get_filename_component(dirname "${source}" DIRECTORY) + install(FILES "${source}" + DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/${dirname}") + endif() +endforeach() +endif() # HWY_ENABLE_CONTRIB + +install(TARGETS hwy_test EXPORT hwy_targets + LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}" + ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}" + RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}") +# Install all the headers keeping the relative path to the current directory +# when installing them. +foreach (source ${HWY_TEST_SOURCES}) + if ("${source}" MATCHES "\.h$") + get_filename_component(dirname "${source}" DIRECTORY) + install(FILES "${source}" + DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/${dirname}") + endif() +endforeach() + +# Add a pkg-config file for libhwy and the contrib/test libraries. 
+set(HWY_LIBRARY_VERSION "${CMAKE_PROJECT_VERSION}") +set(HWY_PC_FILES libhwy.pc libhwy-test.pc) +if (HWY_ENABLE_CONTRIB) +list(APPEND HWY_PC_FILES libhwy-contrib.pc) +endif() # HWY_ENABLE_CONTRIB +foreach (pc ${HWY_PC_FILES}) + configure_file("${CMAKE_CURRENT_SOURCE_DIR}/${pc}.in" "${pc}" @ONLY) + install(FILES "${CMAKE_CURRENT_BINARY_DIR}/${pc}" + DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig") +endforeach() + +endif() # HWY_ENABLE_INSTALL +# -------------------------------------------------------- Examples +if (HWY_ENABLE_EXAMPLES) + +# Avoids mismatch between GTest's static CRT and our dynamic. +set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) + +# Programming exercise with integrated benchmark +add_executable(hwy_benchmark hwy/examples/benchmark.cc) +target_sources(hwy_benchmark PRIVATE + hwy/nanobenchmark.h) +# Try adding one of -DHWY_COMPILE_ONLY_SCALAR, -DHWY_COMPILE_ONLY_EMU128 or +# -DHWY_COMPILE_ONLY_STATIC to observe the difference in targets printed. +target_compile_options(hwy_benchmark PRIVATE ${HWY_FLAGS}) +target_link_libraries(hwy_benchmark PRIVATE hwy) +target_link_libraries(hwy_benchmark PRIVATE ${ATOMICS_LIBRARIES}) +set_target_properties(hwy_benchmark + PROPERTIES RUNTIME_OUTPUT_DIRECTORY "examples/") + +endif() # HWY_ENABLE_EXAMPLES +# -------------------------------------------------------- Tests + +include(CTest) + +if(BUILD_TESTING AND HWY_ENABLE_TESTS) +enable_testing() +include(GoogleTest) + +set(HWY_SYSTEM_GTEST OFF CACHE BOOL "Use pre-installed googletest?") +if(HWY_SYSTEM_GTEST) +find_package(GTest REQUIRED) +else() +# Download and unpack googletest at configure time +configure_file(CMakeLists.txt.in googletest-download/CMakeLists.txt) +execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" . + RESULT_VARIABLE result + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/googletest-download ) +if(result) + message(FATAL_ERROR "CMake step for googletest failed: ${result}") +endif() +execute_process(COMMAND ${CMAKE_COMMAND} --build . + RESULT_VARIABLE result + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/googletest-download ) +if(result) + message(FATAL_ERROR "Build step for googletest failed: ${result}") +endif() + +# Prevent overriding the parent project's compiler/linker +# settings on Windows +set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) + +# Add googletest directly to our build. This defines +# the gtest and gtest_main targets. 
+add_subdirectory(${CMAKE_CURRENT_BINARY_DIR}/googletest-src + ${CMAKE_CURRENT_BINARY_DIR}/googletest-build + EXCLUDE_FROM_ALL) +endif() # HWY_SYSTEM_GTEST + +set(HWY_TEST_FILES + hwy/contrib/algo/copy_test.cc + hwy/contrib/algo/find_test.cc + hwy/contrib/algo/transform_test.cc + hwy/aligned_allocator_test.cc + hwy/base_test.cc + hwy/highway_test.cc + hwy/nanobenchmark_test.cc + hwy/targets_test.cc + hwy/examples/skeleton_test.cc + hwy/tests/arithmetic_test.cc + hwy/tests/blockwise_shift_test.cc + hwy/tests/blockwise_test.cc + hwy/tests/combine_test.cc + hwy/tests/compare_test.cc + hwy/tests/compress_test.cc + hwy/tests/convert_test.cc + hwy/tests/count_test.cc + hwy/tests/crypto_test.cc + hwy/tests/demote_test.cc + hwy/tests/expand_test.cc + hwy/tests/float_test.cc + hwy/tests/if_test.cc + hwy/tests/interleaved_test.cc + hwy/tests/logical_test.cc + hwy/tests/mask_mem_test.cc + hwy/tests/mask_test.cc + hwy/tests/memory_test.cc + hwy/tests/mul_test.cc + hwy/tests/reduction_test.cc + hwy/tests/reverse_test.cc + hwy/tests/shift_test.cc + hwy/tests/swizzle_test.cc + hwy/tests/test_util_test.cc + hwy/tests/tuple_test.cc +) + +set(HWY_TEST_LIBS hwy hwy_test) + +if (HWY_ENABLE_CONTRIB) +list(APPEND HWY_TEST_LIBS hwy_contrib) + +list(APPEND HWY_TEST_FILES + hwy/contrib/dot/dot_test.cc + hwy/contrib/image/image_test.cc + # Disabled due to SIGILL in clang7 debug build during gtest discovery phase, + # not reproducible locally. Still tested via bazel build. + # hwy/contrib/math/math_test.cc + hwy/contrib/sort/sort_test.cc + hwy/contrib/sort/bench_sort.cc +) +endif() # HWY_ENABLE_CONTRIB + +if(HWY_SYSTEM_GTEST) + if (CMAKE_VERSION VERSION_LESS 3.20) + set(HWY_GTEST_LIBS GTest::GTest GTest::Main) + else() + set(HWY_GTEST_LIBS GTest::gtest GTest::gtest_main) + endif() +else() + set(HWY_GTEST_LIBS gtest gtest_main) +endif() + +file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/tests) +foreach (TESTFILE IN LISTS HWY_TEST_FILES) + # The TESTNAME is the name without the extension or directory. + get_filename_component(TESTNAME ${TESTFILE} NAME_WE) + add_executable(${TESTNAME} ${TESTFILE}) + target_compile_options(${TESTNAME} PRIVATE ${HWY_FLAGS}) + # Test all targets, not just the best/baseline. This changes the default + # policy to all-attainable; note that setting -DHWY_COMPILE_* directly can + # cause compile errors because only one may be set, and other CMakeLists.txt + # that include us may set them. + target_compile_options(${TESTNAME} PRIVATE -DHWY_IS_TEST=1) + + target_link_libraries(${TESTNAME} PRIVATE ${HWY_TEST_LIBS} ${HWY_GTEST_LIBS}) + # For GCC __atomic_store_8, see #887 + target_link_libraries(${TESTNAME} PRIVATE ${ATOMICS_LIBRARIES}) + # Output test targets in the test directory. + set_target_properties(${TESTNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "tests") + + if (HWY_EMSCRIPTEN) + set_target_properties(${TESTNAME} PROPERTIES LINK_FLAGS "-s SINGLE_FILE=1") + endif() + + if(${CMAKE_VERSION} VERSION_LESS "3.10.3") + gtest_discover_tests(${TESTNAME} TIMEOUT 60) + else () + gtest_discover_tests(${TESTNAME} DISCOVERY_TIMEOUT 60) + endif () +endforeach () + +# The skeleton test uses the skeleton library code. 
+target_sources(skeleton_test PRIVATE hwy/examples/skeleton.cc) + +endif() # BUILD_TESTING + +if (HWY_ENABLE_INSTALL) + # write hwy-config file to handle `Config` mode + include(CMakePackageConfigHelpers) + write_basic_package_version_file("${CMAKE_CURRENT_BINARY_DIR}/hwy-config-version.cmake" COMPATIBILITY SameMajorVersion) + install(FILES "${CMAKE_CURRENT_BINARY_DIR}/hwy-config-version.cmake" DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/hwy") + install(EXPORT hwy_targets NAMESPACE "${namespace}" FILE hwy-config.cmake DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/hwy") +endif() diff --git a/third_party/highway/CMakeLists.txt.in b/third_party/highway/CMakeLists.txt.in new file mode 100644 index 0000000000..a0260b82f7 --- /dev/null +++ b/third_party/highway/CMakeLists.txt.in @@ -0,0 +1,15 @@ +cmake_minimum_required(VERSION 2.8.12) + +project(googletest-download NONE) + +include(ExternalProject) +ExternalProject_Add(googletest + GIT_REPOSITORY https://github.com/google/googletest.git + GIT_TAG 43efa0a4efd40c78b9210d15373112081899a97c + SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/googletest-src" + BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/googletest-build" + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" + TEST_COMMAND "" +) diff --git a/third_party/highway/CONTRIBUTING b/third_party/highway/CONTRIBUTING new file mode 100644 index 0000000000..24e83ee578 --- /dev/null +++ b/third_party/highway/CONTRIBUTING @@ -0,0 +1,39 @@ +# How to Contribute + +We'd love to accept your patches and contributions to this project. +Pull requests are very welcome. Bug reports (via issue) are also appreciated. + +Looking for a starter project? See the [wishlist](g3doc/op_wishlist.md) for +some ideas for extensions and codegen improvements. + +There are just a few small guidelines you need to follow. + +## Contributor License Agreement + +Contributions to this project must be accompanied by a Contributor License +Agreement. You (or your employer) retain the copyright to your contribution; +this simply gives us permission to use and redistribute your contributions as +part of the project. Head over to to see +your current agreements on file or to sign a new one. + +You generally only need to submit a CLA once, so if you've already submitted one +(even if it was for a different project), you probably don't need to do it +again. + +## Code reviews + +All submissions, including submissions by project members, require review. We +use GitHub pull requests for this purpose. Consult +[GitHub Help](https://help.github.com/articles/about-pull-requests/) for more +information on using pull requests. + +## API changes + +Highway promises to be backwards-compatible to the current documented API. If +you would like to propose a change, please raise an issue to discuss how we can +retain compatibility. + +## Community Guidelines + +This project follows +[Google's Open Source Community Guidelines](https://opensource.google.com/conduct/). diff --git a/third_party/highway/LICENSE b/third_party/highway/LICENSE new file mode 100644 index 0000000000..f49a4e16e6 --- /dev/null +++ b/third_party/highway/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. 
+ + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
\ No newline at end of file diff --git a/third_party/highway/MODULE.bazel b/third_party/highway/MODULE.bazel new file mode 100644 index 0000000000..8cdf658a53 --- /dev/null +++ b/third_party/highway/MODULE.bazel @@ -0,0 +1,6 @@ +module(name = "highway", version = "1.0.4") + +bazel_dep(name = "bazel_skylib", version = "1.3.0") +bazel_dep(name = "googletest", version = "1.12.1") +bazel_dep(name = "rules_cc", version = "0.0.4") +bazel_dep(name = "rules_license", version = "0.0.4") diff --git a/third_party/highway/README.md b/third_party/highway/README.md new file mode 100644 index 0000000000..388ed9ad08 --- /dev/null +++ b/third_party/highway/README.md @@ -0,0 +1,400 @@ +# Efficient and performance-portable vector software + +[//]: # (placeholder, do not remove) + +Highway is a C++ library that provides portable SIMD/vector intrinsics. + +[Documentation](https://google.github.io/highway/en/master/) + +## Why + +We are passionate about high-performance software. We see major untapped +potential in CPUs (servers, mobile, desktops). Highway is for engineers who want +to reliably and economically push the boundaries of what is possible in +software. + +## How + +CPUs provide SIMD/vector instructions that apply the same operation to multiple +data items. This can reduce energy usage e.g. *fivefold* because fewer +instructions are executed. We also often see *5-10x* speedups. + +Highway makes SIMD/vector programming practical and workable according to these +guiding principles: + +**Does what you expect**: Highway is a C++ library with carefully-chosen +functions that map well to CPU instructions without extensive compiler +transformations. The resulting code is more predictable and robust to code +changes/compiler updates than autovectorization. + +**Works on widely-used platforms**: Highway supports four architectures; the +same application code can target eight instruction sets, including those with +'scalable' vectors (size unknown at compile time). Highway only requires C++11 +and supports four families of compilers. If you would like to use Highway on +other platforms, please raise an issue. + +**Flexible to deploy**: Applications using Highway can run on heterogeneous +clouds or client devices, choosing the best available instruction set at +runtime. Alternatively, developers may choose to target a single instruction set +without any runtime overhead. In both cases, the application code is the same +except for swapping `HWY_STATIC_DISPATCH` with `HWY_DYNAMIC_DISPATCH` plus one +line of code. + +**Suitable for a variety of domains**: Highway provides an extensive set of +operations, used for image processing (floating-point), compression, video +analysis, linear algebra, cryptography, sorting and random generation. We +recognise that new use-cases may require additional ops and are happy to add +them where it makes sense (e.g. no performance cliffs on some architectures). If +you would like to discuss, please file an issue. + +**Rewards data-parallel design**: Highway provides tools such as Gather, +MaskedLoad, and FixedTag to enable speedups for legacy data structures. However, +the biggest gains are unlocked by designing algorithms and data structures for +scalable vectors. Helpful techniques include batching, structure-of-array +layouts, and aligned/padded allocations. 
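+
+As an illustration of these principles, here is a minimal sketch of a
+data-parallel kernel using full (scalable) vectors. It is not taken from the
+library itself; the function and array names are hypothetical, and it assumes
+the arrays are padded to a multiple of the vector length:
+
+```
+#include "hwy/highway.h"
+
+namespace project {
+namespace HWY_NAMESPACE {
+namespace hn = hwy::HWY_NAMESPACE;
+
+// out[i] = mul[i] * scale + add[i]; arrays are padded to a multiple of
+// Lanes(d), so the final iteration may read/write padding.
+HWY_ATTR void MulAddLoop(const float* HWY_RESTRICT mul,
+                         const float* HWY_RESTRICT add, const float scale,
+                         const size_t size, float* HWY_RESTRICT out) {
+  const hn::ScalableTag<float> d;        // full vector of float lanes
+  const auto v_scale = hn::Set(d, scale);
+  for (size_t i = 0; i < size; i += hn::Lanes(d)) {
+    const auto v_mul = hn::Load(d, mul + i);
+    const auto v_add = hn::Load(d, add + i);
+    hn::Store(hn::MulAdd(v_mul, v_scale, v_add), d, out + i);
+  }
+}
+
+}  // namespace HWY_NAMESPACE
+}  // namespace project
+```
+
+The same loop body runs unchanged on every supported target because the
+vector length is obtained from `Lanes(d)` rather than hard-coded.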
+ +## Examples + +Online demos using Compiler Explorer: + +- [multiple targets with dynamic dispatch](https://gcc.godbolt.org/z/zP7MYe9Yf) + (recommended) +- [single target using -m flags](https://gcc.godbolt.org/z/rGnjMevKG) + +We observe that Highway is referenced in the following open source projects, +found via sourcegraph.com. Most are Github repositories. If you would like to +add your project or link to it directly, feel free to raise an issue or contact +us via the below email. + +* Browsers: Chromium (+Vivaldi), Firefox (+floorp / foxhound / librewolf / Waterfox) +* Cryptography: google/distributed_point_functions +* Image codecs: eustas/2im, [Grok JPEG 2000](https://github.com/GrokImageCompression/grok), [JPEG XL](https://github.com/libjxl/libjxl), OpenHTJ2K +* Image processing: cloudinary/ssimulacra2, m-ab-s/media-autobuild_suite +* Image viewers: AlienCowEatCake/ImageViewer, mirillis/jpegxl-wic, + [Lux panorama/image viewer](https://bitbucket.org/kfj/pv/) +* Information retrieval: [iresearch database index](https://github.com/iresearch-toolkit/iresearch/blob/e7638e7a4b99136ca41f82be6edccf01351a7223/core/utils/simd_utils.hpp), michaeljclark/zvec + +Other + +* [zimt](https://github.com/kfjahnke/zimt): C++11 template library to process n-dimensional arrays with multi-threaded SIMD code +* [vectorized Quicksort](https://github.com/google/highway/tree/master/hwy/contrib/sort) ([paper](https://arxiv.org/abs/2205.05982)) + +If you'd like to get Highway, in addition to cloning from this Github repository +or using it as a Git submodule, you can also find it in the following package +managers or repositories: alpinelinux, conan-io, conda-forge, DragonFlyBSD, +freebsd, ghostbsd, microsoft/vcpkg, MidnightBSD, NetBSD, openSUSE, opnsense, +Xilinx/Vitis_Libraries. + +## Current status + +### Targets + +Highway supports 19 targets, listed in alphabetical order of platform: + +- Any: `EMU128`, `SCALAR`; +- Arm: `NEON` (Armv7+), `SVE`, `SVE2`, `SVE_256`, `SVE2_128`; +- POWER: `PPC8` (v2.07), `PPC9` (v3.0), `PPC10` (v3.1B, not yet supported + due to compiler bugs, see #1207; also requires QEMU 7.2); +- RISC-V: `RVV` (1.0); +- WebAssembly: `WASM`, `WASM_EMU256` (a 2x unrolled version of wasm128, + enabled if `HWY_WANT_WASM2` is defined. This will remain supported until it + is potentially superseded by a future version of WASM.); +- x86: + - `SSE2` + - `SSSE3` (~Intel Core) + - `SSE4` (~Nehalem, also includes AES + CLMUL). + - `AVX2` (~Haswell, also includes BMI2 + F16 + FMA) + - `AVX3` (~Skylake, AVX-512F/BW/CD/DQ/VL) + - `AVX3_DL` (~Icelake, includes BitAlg + CLMUL + GFNI + VAES + VBMI + + VBMI2 + VNNI + VPOPCNT; requires opt-in by defining `HWY_WANT_AVX3_DL` + unless compiling for static dispatch), + - `AVX3_ZEN4` (like AVX3_DL but optimized for AMD Zen4; requires opt-in by + defining `HWY_WANT_AVX3_ZEN4` if compiling for static dispatch) + +SVE was initially tested using farm_sve (see acknowledgments). + +### Versioning + +Highway releases aim to follow the semver.org system (MAJOR.MINOR.PATCH), +incrementing MINOR after backward-compatible additions and PATCH after +backward-compatible fixes. We recommend using releases (rather than the Git tip) +because they are tested more extensively, see below. + +The current version 1.0 signals an increased focus on backwards compatibility. +Applications using documented functionality will remain compatible with future +updates that have the same major version number. 
+ +### Testing + +Continuous integration tests build with a recent version of Clang (running on +native x86, or QEMU for RISC-V and Arm) and MSVC 2019 (v19.28, running on native +x86). + +Before releases, we also test on x86 with Clang and GCC, and Armv7/8 via GCC +cross-compile. See the [testing process](g3doc/release_testing_process.md) for +details. + +### Related modules + +The `contrib` directory contains SIMD-related utilities: an image class with +aligned rows, a math library (16 functions already implemented, mostly +trigonometry), and functions for computing dot products and sorting. + +### Other libraries + +If you only require x86 support, you may also use Agner Fog's +[VCL vector class library](https://github.com/vectorclass). It includes many +functions including a complete math library. + +If you have existing code using x86/NEON intrinsics, you may be interested in +[SIMDe](https://github.com/simd-everywhere/simde), which emulates those +intrinsics using other platforms' intrinsics or autovectorization. + +## Installation + +This project uses CMake to generate and build. In a Debian-based system you can +install it via: + +```bash +sudo apt install cmake +``` + +Highway's unit tests use [googletest](https://github.com/google/googletest). +By default, Highway's CMake downloads this dependency at configuration time. +You can disable this by setting the `HWY_SYSTEM_GTEST` CMake variable to ON and +installing gtest separately: + +```bash +sudo apt install libgtest-dev +``` + +Running cross-compiled tests requires support from the OS, which on Debian is +provided by the `qemu-user-binfmt` package. + +To build Highway as a shared or static library (depending on BUILD_SHARED_LIBS), +the standard CMake workflow can be used: + +```bash +mkdir -p build && cd build +cmake .. +make -j && make test +``` + +Or you can run `run_tests.sh` (`run_tests.bat` on Windows). + +Bazel is also supported for building, but it is not as widely used/tested. + +When building for Armv7, a limitation of current compilers requires you to add +`-DHWY_CMAKE_ARM7:BOOL=ON` to the CMake command line; see #834 and #1032. We +understand that work is underway to remove this limitation. + +Building on 32-bit x86 is not officially supported, and AVX2/3 are disabled by +default there. Note that johnplatts has successfully built and run the Highway +tests on 32-bit x86, including AVX2/3, on GCC 7/8 and Clang 8/11/12. On Ubuntu +22.04, Clang 11 and 12, but not later versions, require extra compiler flags +`-m32 -isystem /usr/i686-linux-gnu/include`. Clang 10 and earlier require the +above plus `-isystem /usr/i686-linux-gnu/include/c++/12/i686-linux-gnu`. See +#1279. + +## Quick start + +You can use the `benchmark` inside examples/ as a starting point. + +A [quick-reference page](g3doc/quick_reference.md) briefly lists all operations +and their parameters, and the [instruction_matrix](g3doc/instruction_matrix.pdf) +indicates the number of instructions per operation. + +The [FAQ](g3doc/faq.md) answers questions about portability, API design and +where to find more information. + +We recommend using full SIMD vectors whenever possible for maximum performance +portability. To obtain them, pass a `ScalableTag` (or equivalently +`HWY_FULL(float)`) tag to functions such as `Zero/Set/Load`. There are two +alternatives for use-cases requiring an upper bound on the lanes: + +- For up to `N` lanes, specify `CappedTag` or the equivalent + `HWY_CAPPED(T, N)`. 
The actual number of lanes will be `N` rounded down to + the nearest power of two, such as 4 if `N` is 5, or 8 if `N` is 8. This is + useful for data structures such as a narrow matrix. A loop is still required + because vectors may actually have fewer than `N` lanes. + +- For exactly a power of two `N` lanes, specify `FixedTag`. The largest + supported `N` depends on the target, but is guaranteed to be at least + `16/sizeof(T)`. + +Due to ADL restrictions, user code calling Highway ops must either: +* Reside inside `namespace hwy { namespace HWY_NAMESPACE {`; or +* prefix each op with an alias such as `namespace hn = hwy::HWY_NAMESPACE; + hn::Add()`; or +* add using-declarations for each op used: `using hwy::HWY_NAMESPACE::Add;`. + +Additionally, each function that calls Highway ops (such as `Load`) must either +be prefixed with `HWY_ATTR`, OR reside between `HWY_BEFORE_NAMESPACE()` and +`HWY_AFTER_NAMESPACE()`. Lambda functions currently require `HWY_ATTR` before +their opening brace. + +The entry points into code using Highway differ slightly depending on whether +they use static or dynamic dispatch. + +* For static dispatch, `HWY_TARGET` will be the best available target among + `HWY_BASELINE_TARGETS`, i.e. those allowed for use by the compiler (see + [quick-reference](g3doc/quick_reference.md)). Functions inside + `HWY_NAMESPACE` can be called using `HWY_STATIC_DISPATCH(func)(args)` within + the same module they are defined in. You can call the function from other + modules by wrapping it in a regular function and declaring the regular + function in a header. + +* For dynamic dispatch, a table of function pointers is generated via the + `HWY_EXPORT` macro that is used by `HWY_DYNAMIC_DISPATCH(func)(args)` to + call the best function pointer for the current CPU's supported targets. A + module is automatically compiled for each target in `HWY_TARGETS` (see + [quick-reference](g3doc/quick_reference.md)) if `HWY_TARGET_INCLUDE` is + defined and `foreach_target.h` is included. + +When using dynamic dispatch, `foreach_target.h` is included from translation +units (.cc files), not headers. Headers containing vector code shared between +several translation units require a special include guard, for example the +following taken from `examples/skeleton-inl.h`: + +``` +#if defined(HIGHWAY_HWY_EXAMPLES_SKELETON_INL_H_) == defined(HWY_TARGET_TOGGLE) +#ifdef HIGHWAY_HWY_EXAMPLES_SKELETON_INL_H_ +#undef HIGHWAY_HWY_EXAMPLES_SKELETON_INL_H_ +#else +#define HIGHWAY_HWY_EXAMPLES_SKELETON_INL_H_ +#endif + +#include "hwy/highway.h" +// Your vector code +#endif +``` + +By convention, we name such headers `-inl.h` because their contents (often +function templates) are usually inlined. + +## Compiler flags + +Applications should be compiled with optimizations enabled - without inlining, +SIMD code may slow down by factors of 10 to 100. For clang and GCC, `-O2` is +generally sufficient. + +For MSVC, we recommend compiling with `/Gv` to allow non-inlined functions to +pass vector arguments in registers. If intending to use the AVX2 target together +with half-width vectors (e.g. for `PromoteTo`), it is also important to compile +with `/arch:AVX2`. This seems to be the only way to generate VEX-encoded SSE4 +instructions on MSVC. Otherwise, mixing VEX-encoded AVX2 instructions and +non-VEX SSE4 may cause severe performance degradation. Unfortunately, the +resulting binary will then require AVX2. 
Note that no such flag is needed for +clang and GCC because they support target-specific attributes, which we use to +ensure proper VEX code generation for AVX2 targets. + +## Strip-mining loops + +When vectorizing a loop, an important question is whether and how to deal with +a number of iterations ('trip count', denoted `count`) that does not evenly +divide the vector size `N = Lanes(d)`. For example, it may be necessary to avoid +writing past the end of an array. + +In this section, let `T` denote the element type and `d = ScalableTag`. +Assume the loop body is given as a function `template +void LoopBody(D d, size_t index, size_t max_n)`. + +"Strip-mining" is a technique for vectorizing a loop by transforming it into an +outer loop and inner loop, such that the number of iterations in the inner loop +matches the vector width. Then, the inner loop is replaced with vector +operations. + +Highway offers several strategies for loop vectorization: + +* Ensure all inputs/outputs are padded. Then the (outer) loop is simply + + ``` + for (size_t i = 0; i < count; i += N) LoopBody(d, i, 0); + ``` + Here, the template parameter and second function argument are not needed. + + This is the preferred option, unless `N` is in the thousands and vector + operations are pipelined with long latencies. This was the case for + supercomputers in the 90s, but nowadays ALUs are cheap and we see most + implementations split vectors into 1, 2 or 4 parts, so there is little cost + to processing entire vectors even if we do not need all their lanes. Indeed + this avoids the (potentially large) cost of predication or partial + loads/stores on older targets, and does not duplicate code. + +* Process whole vectors and include previously processed elements + in the last vector: + ``` + for (size_t i = 0; i < count; i += N) LoopBody(d, HWY_MIN(i, count - N), 0); + ``` + + This is the second preferred option provided that `count >= N` + and `LoopBody` is idempotent. Some elements might be processed twice, but + a single code path and full vectorization is usually worth it. Even if + `count < N`, it usually makes sense to pad inputs/outputs up to `N`. + +* Use the `Transform*` functions in hwy/contrib/algo/transform-inl.h. This + takes care of the loop and remainder handling and you simply define a + generic lambda function (C++14) or functor which receives the current vector + from the input/output array, plus optionally vectors from up to two extra + input arrays, and returns the value to write to the input/output array. + + Here is an example implementing the BLAS function SAXPY (`alpha * x + y`): + + ``` + Transform1(d, x, n, y, [](auto d, const auto v, const auto v1) HWY_ATTR { + return MulAdd(Set(d, alpha), v, v1); + }); + ``` + +* Process whole vectors as above, followed by a scalar loop: + + ``` + size_t i = 0; + for (; i + N <= count; i += N) LoopBody(d, i, 0); + for (; i < count; ++i) LoopBody(CappedTag(), i, 0); + ``` + The template parameter and second function arguments are again not needed. + + This avoids duplicating code, and is reasonable if `count` is large. + If `count` is small, the second loop may be slower than the next option. 
+ +* Process whole vectors as above, followed by a single call to a modified + `LoopBody` with masking: + + ``` + size_t i = 0; + for (; i + N <= count; i += N) { + LoopBody(d, i, 0); + } + if (i < count) { + LoopBody(d, i, count - i); + } + ``` + Now the template parameter and third function argument can be used inside + `LoopBody` to non-atomically 'blend' the first `num_remaining` lanes of `v` + with the previous contents of memory at subsequent locations: + `BlendedStore(v, FirstN(d, num_remaining), d, pointer);`. Similarly, + `MaskedLoad(FirstN(d, num_remaining), d, pointer)` loads the first + `num_remaining` elements and returns zero in other lanes. + + This is a good default when it is infeasible to ensure vectors are padded, + but is only safe `#if !HWY_MEM_OPS_MIGHT_FAULT`! + In contrast to the scalar loop, only a single final iteration is needed. + The increased code size from two loop bodies is expected to be worthwhile + because it avoids the cost of masking in all but the final iteration. + +## Additional resources + +* [Highway introduction (slides)](g3doc/highway_intro.pdf) +* [Overview of instructions per operation on different architectures](g3doc/instruction_matrix.pdf) +* [Design philosophy and comparison](g3doc/design_philosophy.md) +* [Implementation details](g3doc/impl_details.md) + +## Acknowledgments + +We have used [farm-sve](https://gitlab.inria.fr/bramas/farm-sve) by Berenger +Bramas; it has proved useful for checking the SVE port on an x86 development +machine. + +This is not an officially supported Google product. +Contact: janwas@google.com diff --git a/third_party/highway/WORKSPACE b/third_party/highway/WORKSPACE new file mode 100644 index 0000000000..71e0e69942 --- /dev/null +++ b/third_party/highway/WORKSPACE @@ -0,0 +1,38 @@ +workspace(name = "highway") + +load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive") +load("@bazel_tools//tools/build_defs/repo:utils.bzl", "maybe") + +maybe( + http_archive, + name = "com_google_googletest", + urls = ["https://github.com/google/googletest/archive/e2239ee6043f73722e7aa812a459f54a28552929.zip"], + sha256 = "8daa1a71395892f7c1ec5f7cb5b099a02e606be720d62f1a6a98f8f8898ec826", + strip_prefix = "googletest-e2239ee6043f73722e7aa812a459f54a28552929", +) + +# See https://google.github.io/googletest/quickstart-bazel.html +maybe( + http_archive, + name = "rules_cc", + urls = ["https://github.com/bazelbuild/rules_cc/archive/40548a2974f1aea06215272d9c2b47a14a24e556.zip"], + sha256 = "56ac9633c13d74cb71e0546f103ce1c58810e4a76aa8325da593ca4277908d72", + strip_prefix = "rules_cc-40548a2974f1aea06215272d9c2b47a14a24e556", +) + +# Need recent version for config_setting_group +maybe( + http_archive, + name = "bazel_skylib", + urls = ["https://github.com/bazelbuild/bazel-skylib/releases/download/0.9.0/bazel_skylib-0.9.0.tar.gz"], +) + +maybe( + http_archive, + name = "rules_license", + urls = [ + "https://github.com/bazelbuild/rules_license/releases/download/0.0.4/rules_license-0.0.4.tar.gz", + "https://mirror.bazel.build/github.com/bazelbuild/rules_license/releases/download/0.0.4/rules_license-0.0.4.tar.gz", + ], + sha256 = "6157e1e68378532d0241ecd15d3c45f6e5cfd98fc10846045509fb2a7cc9e381", +) diff --git a/third_party/highway/cmake/FindAtomics.cmake b/third_party/highway/cmake/FindAtomics.cmake new file mode 100644 index 0000000000..e866b73fac --- /dev/null +++ b/third_party/highway/cmake/FindAtomics.cmake @@ -0,0 +1,56 @@ +# Original issue: +# * https://gitlab.kitware.com/cmake/cmake/-/issues/23021#note_1098733 +# +# For 
reference: +# * https://gcc.gnu.org/wiki/Atomic/GCCMM +# +# riscv64 specific: +# * https://lists.debian.org/debian-riscv/2022/01/msg00009.html +# +# ATOMICS_FOUND - system has c++ atomics +# ATOMICS_LIBRARIES - libraries needed to use c++ atomics + +include(CheckCXXSourceCompiles) + +# RISC-V only has 32-bit and 64-bit atomic instructions. GCC is supposed +# to convert smaller atomics to those larger ones via masking and +# shifting like LLVM, but it’s a known bug that it does not. This means +# anything that wants to use atomics on 1-byte or 2-byte types needs +# -latomic, but not 4-byte or 8-byte (though it does no harm). +set(atomic_code + " + #include + #include + std::atomic n8 (0); // riscv64 + std::atomic n64 (0); // armel, mipsel, powerpc + int main() { + ++n8; + ++n64; + return 0; + }") + +# https://gitlab.kitware.com/cmake/cmake/-/issues/24063 +set(CMAKE_CXX_STANDARD 11) +check_cxx_source_compiles("${atomic_code}" ATOMICS_LOCK_FREE_INSTRUCTIONS) + +if(ATOMICS_LOCK_FREE_INSTRUCTIONS) + set(ATOMICS_FOUND TRUE) + set(ATOMICS_LIBRARIES) +else() + set(CMAKE_REQUIRED_LIBRARIES "-latomic") + check_cxx_source_compiles("${atomic_code}" ATOMICS_IN_LIBRARY) + set(CMAKE_REQUIRED_LIBRARIES) + if(ATOMICS_IN_LIBRARY) + set(ATOMICS_LIBRARY atomic) + include(FindPackageHandleStandardArgs) + find_package_handle_standard_args(Atomics DEFAULT_MSG ATOMICS_LIBRARY) + set(ATOMICS_LIBRARIES ${ATOMICS_LIBRARY}) + unset(ATOMICS_LIBRARY) + else() + if(Atomics_FIND_REQUIRED) + message(FATAL_ERROR "Neither lock free instructions nor -latomic found.") + endif() + endif() +endif() +unset(atomic_code) +unset(CMAKE_CXX_STANDARD) diff --git a/third_party/highway/debian/changelog b/third_party/highway/debian/changelog new file mode 100644 index 0000000000..c5fbf7db7c --- /dev/null +++ b/third_party/highway/debian/changelog @@ -0,0 +1,181 @@ +highway (1.0.4-1) UNRELEASED; urgency=medium + +* Add PPC8..10, SSE2, AVX3_ZEN4, NEON_WITHOUT_AES targets +* Add Expand, LoadExpand, integer AbsDiff, SumsOf8AbsDiff +* Improved Half/Twice support, codegen for Shift*Same +* Support Wasm in Godbolt +* Faster KV128 sorting +* Fix armv7 build config, CMake config mode +* Update RVV intrinsics for 1.0-draft + + -- Jan Wassenberg Fri, 17 Mar 2023 15:00:00 +0200 + +highway (1.0.3-1) UNRELEASED; urgency=medium + +* Add RearrangeToOddPlusEven, Xor3, 8-bit CompressStore, HWY_ASSUME +* Add contrib/bit_pack for 8/16-bit lanes +* Add WASM_EMU256 target +* Documentation improvements +* Allow opting out of C++ stdlib usage for Compiler Explorer +* Update for new RVV intrinsics; faster WASM min/max and extmul/q15mul +* Fix UB, GCC atomic + + -- Jan Wassenberg Thu, 19 Jan 2023 13:00:00 +0200 + +highway (1.0.2-1) UNRELEASED; urgency=medium + +* Add ExclusiveNeither, FindKnownFirstTrue, Ne128 +* Add 16-bit SumOfLanes/ReorderWidenMulAccumulate/ReorderDemote2To +* Faster sort for low-entropy input, improved pivot selection +* Add GN build system, Highway FAQ, k32v32 type to vqsort +* CMake: Support find_package(GTest), add rvv-inl.h, add HWY_ENABLE_TESTS +* Fix MIPS and C++20 build, Apple LLVM 10.3 detection, EMU128 AllTrue on RVV +* Fix missing exec_prefix, RVV build, warnings, libatomic linking +* Work around GCC 10.4 issue, disabled RDCYCLE, arm7 with vfpv3 +* Documentation/example improvements +* Support static dispatch to SVE2_128 and SVE_256 + + -- Jan Wassenberg Thu, 27 Oct 2022 17:00:00 +0200 + +highway (1.0.1-1) UNRELEASED; urgency=medium + +* Add Eq128, i64 Mul, unsigned->float ConvertTo +* Faster sort for few unique keys, more robust 
pivot selection +* Fix: floating-point generator for sort tests, Min/MaxOfLanes for i16 +* Fix: avoid always_inline in debug, link atomic +* GCC warnings: string.h, maybe-uninitialized, ignored-attributes +* GCC warnings: preprocessor int overflow, spurious use-after-free/overflow +* Doc: <=HWY_AVX3, Full32/64/128, how to use generic-inl + + -- Jan Wassenberg Tue, 23 Aug 2022 10:00:00 +0200 + +highway (1.0.0-1) UNRELEASED; urgency=medium + +* ABI change: 64-bit target values, more room for expansion +* Add CompressBlocksNot, CompressNot, Lt128Upper, Min/Max128Upper, TruncateTo +* Add HWY_SVE2_128 target +* Sort speedups especially for 128-bit +* Documentation clarifications +* Faster NEON CountTrue/FindFirstTrue/AllFalse/AllTrue +* Improved SVE codegen +* Fix u16x8 ConcatEven/Odd, SSSE3 i64 Lt +* MSVC 2017 workarounds +* Support for runtime dispatch on Arm/GCC/Linux + + -- Jan Wassenberg Wed, 27 Jul 2022 10:00:00 +0200 + +highway (0.17.0-1) UNRELEASED; urgency=medium + +* Add ExtractLane, InsertLane, IsInf, IsFinite, IsNaN +* Add StoreInterleaved2, LoadInterleaved2/3/4, BlendedStore, SafeFillN +* Add MulFixedPoint15, Or3 +* Add Copy[If], Find[If], Generate, Replace[If] algos +* Add HWY_EMU128 target (replaces HWY_SCALAR) +* HWY_RVV is feature-complete +* Add HWY_ENABLE_CONTRIB build flag, HWY_NATIVE_FMA, HWY_WANT_SSSE3/SSE4 macros +* Extend ConcatOdd/Even and StoreInterleaved* to all types +* Allow CappedTag +* Sort speedups: 2x for AVX2, 1.09x for AVX3; avoid x86 malloc +* Expand documentation +* Fix RDTSCP crash in nanobenchmark +* Fix XCR0 check (was ignoring AVX3 on ICL) +* Support Arm/RISC-V timers + + -- Jan Wassenberg Fri, 20 May 2022 10:00:00 +0200 + +highway (0.16.0-1) UNRELEASED; urgency=medium + + * Add contrib/sort (vectorized quicksort) + * Add IfNegativeThenElse, IfVecThenElse + * Add Reverse2,4,8, ReverseBlocks, DupEven/Odd, AESLastRound + * Add OrAnd, Min128, Max128, Lt128, SumsOf8 + * Support capped/partial vectors on RVV/SVE, int64 in WASM + * Support SVE2, shared library build + * Remove deprecated overloads without the required d arg (UpperHalf etc.) 
+ + -- Jan Wassenberg Thu, 03 Feb 2022 11:00:00 +0100 + +highway (0.15.0-1) UNRELEASED; urgency=medium + + * New ops: CompressBlendedStore, ConcatOdd/Even, IndicesFromVec + * New ops: OddEvenBlocks, SwapAdjacentBlocks, Reverse, RotateRight + * Add bf16, unsigned comparisons, more lane types for Reverse/TableLookupLanes + * Contrib: add sort(ing network) and dot(product) + * Targets: update RVV for LLVM, add experimental WASM2 + * Separate library hwy_test for test utils + * Add non-macro Simd<> aliases + * Fixes: const V& for GCC, AVX3 BZHI, POPCNT with AVX on MSVC, avoid %zu + + -- Jan Wassenberg Wed, 10 Nov 2021 10:00:00 +0100 + +highway (0.14.2-1) UNRELEASED; urgency=medium + + * Add MaskedLoad + * Fix non-glibc PPC, Windows GCC, MSVC 19.14 + * Opt-in for -Werror; separate design_philosophy.md + + -- Jan Wassenberg Tue, 24 Aug 2021 15:00:00 +0200 + +highway (0.14.1-1) UNRELEASED; urgency=medium + + * Add LoadMaskBits, CompressBits[Store] + * Fix CPU feature check (AES/F16C) and warnings + * Improved DASSERT - disabled in optimized builds + + -- Jan Wassenberg Tue, 17 Aug 2021 14:00:00 +0200 + +highway (0.14.0-1) UNRELEASED; urgency=medium + + * Add SVE, S-SSE3, AVX3_DL targets + * Support partial vectors in all ops + * Add PopulationCount, FindFirstTrue, Ne, TableLookupBytesOr0 + * Add AESRound, CLMul, MulOdd, HWY_CAP_FLOAT16 + + -- Jan Wassenberg Thu, 29 Jul 2021 15:00:00 +0200 + +highway (0.12.2-1) UNRELEASED; urgency=medium + + * fix scalar-only test and Windows macro conflict with Load/StoreFence + * replace deprecated wasm intrinsics + + -- Jan Wassenberg Mon, 31 May 2021 16:00:00 +0200 + +highway (0.12.1-1) UNRELEASED; urgency=medium + + * doc updates, ARM GCC support, fix s390/ppc, complete partial vectors + * fix warnings, faster ARM div/sqrt, separate hwy_contrib library + * add Abs(i64)/FirstN/Pause, enable AVX2 on MSVC + + -- Jan Wassenberg Wed, 19 May 2021 15:00:00 +0200 + +highway (0.12.0-1) UNRELEASED; urgency=medium + + * Add Shift*8, Compress16, emulated Scatter/Gather, StoreInterleaved3/4 + * Remove deprecated HWY_*_LANES, deprecate HWY_GATHER_LANES + * Proper IEEE rounding, reduce libstdc++ usage, inlined math + + -- Jan Wassenberg Thu, 15 Apr 2021 20:00:00 +0200 + +highway (0.11.1-1) UNRELEASED; urgency=medium + + * Fix clang7 asan error, finish f16 conversions and add test + + -- Jan Wassenberg Thu, 25 Feb 2021 16:00:00 +0200 + +highway (0.11.0-1) UNRELEASED; urgency=medium + + * Add RVV+mask logical ops, allow Shl/ShiftLeftSame on all targets, more math + + -- Jan Wassenberg Thu, 18 Feb 2021 20:00:00 +0200 + +highway (0.7.0-1) UNRELEASED; urgency=medium + + * Added API stability notice, Compress[Store], contrib/, SignBit, CopySign + + -- Jan Wassenberg Tue, 5 Jan 2021 17:00:00 +0200 + +highway (0.1-1) UNRELEASED; urgency=medium + + * Initial debian package. 
+ + -- Alex Deymo Mon, 19 Oct 2020 16:48:07 +0200 diff --git a/third_party/highway/debian/compat b/third_party/highway/debian/compat new file mode 100644 index 0000000000..f599e28b8a --- /dev/null +++ b/third_party/highway/debian/compat @@ -0,0 +1 @@ +10 diff --git a/third_party/highway/debian/control b/third_party/highway/debian/control new file mode 100644 index 0000000000..7c60ebc7f4 --- /dev/null +++ b/third_party/highway/debian/control @@ -0,0 +1,23 @@ +Source: highway +Maintainer: JPEG XL Maintainers +Section: misc +Priority: optional +Standards-Version: 3.9.8 +Build-Depends: cmake, + debhelper (>= 9), + libgtest-dev +Homepage: https://github.com/google/highway + +Package: libhwy-dev +Architecture: any +Section: libdevel +Depends: ${misc:Depends} +Description: Efficient and performance-portable SIMD wrapper (developer files) + This library provides type-safe and source-code portable wrappers over + existing platform-specific intrinsics. Its design aims for simplicity, + reliable efficiency across platforms, and immediate usability with current + compilers. + . + This package installs the development files. There's no runtime library + since most of Highway is implemented in headers and only a very small + static library is needed. diff --git a/third_party/highway/debian/copyright b/third_party/highway/debian/copyright new file mode 100644 index 0000000000..53ea57aa97 --- /dev/null +++ b/third_party/highway/debian/copyright @@ -0,0 +1,20 @@ +Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/ +Upstream-Name: highway + +Files: * +Copyright: 2020 Google LLC +License: Apache-2.0 + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + . + http://www.apache.org/licenses/LICENSE-2.0 + . + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + . + On Debian systems, the complete text of the Apache License, Version 2 + can be found in "/usr/share/common-licenses/Apache-2.0". diff --git a/third_party/highway/debian/rules b/third_party/highway/debian/rules new file mode 100644 index 0000000000..969fc120e8 --- /dev/null +++ b/third_party/highway/debian/rules @@ -0,0 +1,6 @@ +#!/usr/bin/make -f +%: + dh $@ --buildsystem=cmake + +override_dh_auto_configure: + dh_auto_configure -- -DHWY_SYSTEM_GTEST=ON diff --git a/third_party/highway/debian/source/format b/third_party/highway/debian/source/format new file mode 100644 index 0000000000..163aaf8d82 --- /dev/null +++ b/third_party/highway/debian/source/format @@ -0,0 +1 @@ +3.0 (quilt) diff --git a/third_party/highway/docs/.gitignore b/third_party/highway/docs/.gitignore new file mode 100644 index 0000000000..e87b171c2d --- /dev/null +++ b/third_party/highway/docs/.gitignore @@ -0,0 +1,3 @@ +*.swp +/_build +/doctrees diff --git a/third_party/highway/docs/Makefile b/third_party/highway/docs/Makefile new file mode 100644 index 0000000000..d4bb2cbb9e --- /dev/null +++ b/third_party/highway/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = . 
+BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/third_party/highway/docs/_static/css/dark.css b/third_party/highway/docs/_static/css/dark.css new file mode 100644 index 0000000000..da2235b5b0 --- /dev/null +++ b/third_party/highway/docs/_static/css/dark.css @@ -0,0 +1,622 @@ +/* links */ + +a, +a:visited { + color: #aaddff; +} + + +/* code directives */ + +.method dt, +.class dt, +.data dt, +.attribute dt, +.function dt, +.classmethod dt, +.exception dt, +.descclassname, +.descname { + background-color: #2d2d2d !important; +} + +.rst-content dl:not(.docutils) dt { + color: #aaddff; + background-color: #2d2d2d; + border-top: solid 3px #525252; + border-left: solid 3px #525252; +} + +em.property { + color: #888888; +} + + +/* tables */ + +.rst-content table.docutils thead { + color: #ddd; +} + +.rst-content table.docutils td { + border: 0px; +} + +.rst-content table.docutils:not(.field-list) tr:nth-child(2n-1) td { + background-color: #5a5a5a; +} + + +/* inlined code highlights */ + +.xref, +.py-meth, +.rst-content a code { + color: #aaddff !important; + font-weight: normal !important; +} + +.rst-content code { + color: #eee !important; + font-weight: normal !important; +} + +code.literal { + background-color: #2d2d2d !important; + border: 1px solid #6d6d6d !important; +} + +code.docutils.literal.notranslate { + color: #ddd; +} + + +/* notes, warnings, hints */ + +.hint .admonition-title { + background: #2aa87c !important; +} + +.warning .admonition-title { + background: #cc4444 !important; +} + +.admonition-title { + background: #3a7ca8 !important; +} + +.admonition, +.note { + background-color: #2d2d2d !important; +} + + +/* table of contents */ + +.wy-nav-content-wrap { + background-color: rgba(0, 0, 0, 0.6) !important; +} + +.sidebar { + background-color: #191919 !important; +} + +.sidebar-title { + background-color: #2b2b2b !important; +} + +.wy-menu-vertical a { + color: #ddd; +} + +.wy-menu-vertical code.docutils.literal.notranslate { + color: #404040; + background: none !important; + border: none !important; +} + +.wy-nav-content { + background: #3c3c3c; + color: #dddddd; +} + +.wy-menu-vertical li.on a, +.wy-menu-vertical li.current>a { + background: #a3a3a3; + border-bottom: 0px !important; + border-top: 0px !important; +} + +.wy-menu-vertical li.current { + background: #b3b3b3; +} + +.toc-backref { + color: grey !important; +} + +.highlight .hll { + background-color: #49483e +} + +.highlight { + background: #222; + color: #f8f8f2 +} + +.highlight .c { + color: #888 +} + + +/* Comment */ + +.highlight .err { + color: #960050; + background-color: #1e0010 +} + + +/* Error */ + +.highlight .k { + color: #66d9ef +} + + +/* Keyword */ + +.highlight .l { + color: #ae81ff +} + + +/* Literal */ + +.highlight .n { + color: #f8f8f2 +} + + +/* Name */ + +.highlight .o { + color: #f92672 +} + + +/* Operator */ + +.highlight .p { + color: #f8f8f2 +} + + +/* Punctuation */ + +.highlight .ch { + color: #888 +} + + +/* Comment.Hashbang */ + +.highlight .cm { + color: #888 +} + + +/* Comment.Multiline */ + +.highlight .cp { + color: #888 +} + + +/* Comment.Preproc */ + +.highlight .cpf { + color: #888 +} + + +/* 
Comment.PreprocFile */ + +.highlight .c1 { + color: #888 +} + + +/* Comment.Single */ + +.highlight .cs { + color: #888 +} + + +/* Comment.Special */ + +.highlight .gd { + color: #f92672 +} + + +/* Generic.Deleted */ + +.highlight .ge { + font-style: italic +} + + +/* Generic.Emph */ + +.highlight .gi { + color: #a6e22e +} + + +/* Generic.Inserted */ + +.highlight .gs { + font-weight: bold +} + + +/* Generic.Strong */ + +.highlight .gu { + color: #888 +} + + +/* Generic.Subheading */ + +.highlight .kc { + color: #66d9ef +} + + +/* Keyword.Constant */ + +.highlight .kd { + color: #66d9ef +} + + +/* Keyword.Declaration */ + +.highlight .kn { + color: #f92672 +} + + +/* Keyword.Namespace */ + +.highlight .kp { + color: #66d9ef +} + + +/* Keyword.Pseudo */ + +.highlight .kr { + color: #66d9ef +} + + +/* Keyword.Reserved */ + +.highlight .kt { + color: #66d9ef +} + + +/* Keyword.Type */ + +.highlight .ld { + color: #e6db74 +} + + +/* Literal.Date */ + +.highlight .m { + color: #ae81ff +} + + +/* Literal.Number */ + +.highlight .s { + color: #e6db74 +} + + +/* Literal.String */ + +.highlight .na { + color: #a6e22e +} + + +/* Name.Attribute */ + +.highlight .nb { + color: #f8f8f2 +} + + +/* Name.Builtin */ + +.highlight .nc { + color: #a6e22e +} + + +/* Name.Class */ + +.highlight .no { + color: #66d9ef +} + + +/* Name.Constant */ + +.highlight .nd { + color: #a6e22e +} + + +/* Name.Decorator */ + +.highlight .ni { + color: #f8f8f2 +} + + +/* Name.Entity */ + +.highlight .ne { + color: #a6e22e +} + + +/* Name.Exception */ + +.highlight .nf { + color: #a6e22e +} + + +/* Name.Function */ + +.highlight .nl { + color: #f8f8f2 +} + + +/* Name.Label */ + +.highlight .nn { + color: #f8f8f2 +} + + +/* Name.Namespace */ + +.highlight .nx { + color: #a6e22e +} + + +/* Name.Other */ + +.highlight .py { + color: #f8f8f2 +} + + +/* Name.Property */ + +.highlight .nt { + color: #f92672 +} + + +/* Name.Tag */ + +.highlight .nv { + color: #f8f8f2 +} + + +/* Name.Variable */ + +.highlight .ow { + color: #f92672 +} + + +/* Operator.Word */ + +.highlight .w { + color: #f8f8f2 +} + + +/* Text.Whitespace */ + +.highlight .mb { + color: #ae81ff +} + + +/* Literal.Number.Bin */ + +.highlight .mf { + color: #ae81ff +} + + +/* Literal.Number.Float */ + +.highlight .mh { + color: #ae81ff +} + + +/* Literal.Number.Hex */ + +.highlight .mi { + color: #ae81ff +} + + +/* Literal.Number.Integer */ + +.highlight .mo { + color: #ae81ff +} + + +/* Literal.Number.Oct */ + +.highlight .sa { + color: #e6db74 +} + + +/* Literal.String.Affix */ + +.highlight .sb { + color: #e6db74 +} + + +/* Literal.String.Backtick */ + +.highlight .sc { + color: #e6db74 +} + + +/* Literal.String.Char */ + +.highlight .dl { + color: #e6db74 +} + + +/* Literal.String.Delimiter */ + +.highlight .sd { + color: #e6db74 +} + + +/* Literal.String.Doc */ + +.highlight .s2 { + color: #e6db74 +} + + +/* Literal.String.Double */ + +.highlight .se { + color: #ae81ff +} + + +/* Literal.String.Escape */ + +.highlight .sh { + color: #e6db74 +} + + +/* Literal.String.Heredoc */ + +.highlight .si { + color: #e6db74 +} + + +/* Literal.String.Interpol */ + +.highlight .sx { + color: #e6db74 +} + + +/* Literal.String.Other */ + +.highlight .sr { + color: #e6db74 +} + + +/* Literal.String.Regex */ + +.highlight .s1 { + color: #e6db74 +} + + +/* Literal.String.Single */ + +.highlight .ss { + color: #e6db74 +} + + +/* Literal.String.Symbol */ + +.highlight .bp { + color: #f8f8f2 +} + + +/* Name.Builtin.Pseudo */ + +.highlight .fm { + color: #a6e22e +} + + +/* 
Name.Function.Magic */ + +.highlight .vc { + color: #f8f8f2 +} + + +/* Name.Variable.Class */ + +.highlight .vg { + color: #f8f8f2 +} + + +/* Name.Variable.Global */ + +.highlight .vi { + color: #f8f8f2 +} + + +/* Name.Variable.Instance */ + +.highlight .vm { + color: #f8f8f2 +} + + +/* Name.Variable.Magic */ + +.highlight .il { + color: #ae81ff +} + + +/* Literal.Number.Integer.Long */ diff --git a/third_party/highway/docs/_static/css/toggle.css b/third_party/highway/docs/_static/css/toggle.css new file mode 100644 index 0000000000..f79376af7d --- /dev/null +++ b/third_party/highway/docs/_static/css/toggle.css @@ -0,0 +1,77 @@ +input[type=checkbox] { + visibility: hidden; + height: 0; + width: 0; + margin: 0; +} + +.rst-versions .rst-current-version { + padding: 10px; + display: flex; + justify-content: space-between; +} + +.rst-versions .rst-current-version .fa-book, +.rst-versions .rst-current-version .fa-v, +.rst-versions .rst-current-version .fa-caret-down { + height: 24px; + line-height: 24px; + vertical-align: middle; +} + +.rst-versions .rst-current-version .fa-element { + width: 80px; + text-align: center; +} + +.rst-versions .rst-current-version .fa-book { + text-align: left; +} + +.rst-versions .rst-current-version .fa-v { + color: #27AE60; + text-align: right; +} + +label { + margin: 0 auto; + display: inline-block; + justify-content: center; + align-items: right; + border-radius: 100px; + position: relative; + cursor: pointer; + text-indent: -9999px; + width: 50px; + height: 21px; + background: #000; +} + +label:after { + border-radius: 50%; + position: absolute; + content: ''; + background: #fff; + width: 15px; + height: 15px; + top: 3px; + left: 3px; + transition: ease-in-out 200ms; +} + +input:checked+label { + background: #3a7ca8; +} + +input:checked+label:after { + left: calc(100% - 5px); + transform: translateX(-100%); +} + +html.transition, +html.transition *, +html.transition *:before, +html.transition *:after { + transition: ease-in-out 200ms !important; + transition-delay: 0 !important; +} diff --git a/third_party/highway/docs/_static/js/toggle.js b/third_party/highway/docs/_static/js/toggle.js new file mode 100644 index 0000000000..7cfe613f60 --- /dev/null +++ b/third_party/highway/docs/_static/js/toggle.js @@ -0,0 +1,38 @@ +document.addEventListener('DOMContentLoaded', function() { + + function toggleCssMode(isDay) { + var mode = (isDay ? 
"Day" : "Night"); + localStorage.setItem("css-mode", mode); + + var daysheet = $('link[href="_static/pygments.css"]')[0].sheet; + daysheet.disabled = !isDay; + + var nightsheet = $('link[href="_static/css/dark.css"]')[0]; + if (!isDay && nightsheet === undefined) { + var element = document.createElement("link"); + element.setAttribute("rel", "stylesheet"); + element.setAttribute("type", "text/css"); + element.setAttribute("href", "_static/css/dark.css"); + document.getElementsByTagName("head")[0].appendChild(element); + return; + } + if (nightsheet !== undefined) { + nightsheet.sheet.disabled = isDay; + } + } + + var initial = localStorage.getItem("css-mode") != "Night"; + var checkbox = document.querySelector('input[name=mode]'); + + toggleCssMode(initial); + checkbox.checked = initial; + + checkbox.addEventListener('change', function() { + document.documentElement.classList.add('transition'); + window.setTimeout(() => { + document.documentElement.classList.remove('transition'); + }, 1000) + toggleCssMode(this.checked); + }) + +}); diff --git a/third_party/highway/docs/_templates/layout.html b/third_party/highway/docs/_templates/layout.html new file mode 100644 index 0000000000..e8f2b7f759 --- /dev/null +++ b/third_party/highway/docs/_templates/layout.html @@ -0,0 +1,9 @@ +{% extends "!layout.html" %} + {% block footer %} {{ super() }} + + + +{% endblock %} + diff --git a/third_party/highway/docs/_templates/versions.html b/third_party/highway/docs/_templates/versions.html new file mode 100644 index 0000000000..e9abbcdafe --- /dev/null +++ b/third_party/highway/docs/_templates/versions.html @@ -0,0 +1,62 @@ +{% if READTHEDOCS or display_lower_left %} +{# Add rst-badge after rst-versions for small badge style. #} +
+ + Read the Docs + + + + + + + + v: {{ current_version }} + + +
+ {% if languages|length >= 1 %} +
+
{{ _('Languages') }}
+ {% for slug, url in languages %} + {% if slug == current_language %} {% endif %} +
{{ slug }}
+ {% if slug == current_language %}
{% endif %} + {% endfor %} +
+ {% endif %} + {% if versions|length >= 1 %} +
+
{{ _('Versions') }}
+ {% for slug, url in versions %} + {% if slug == current_version %} {% endif %} +
{{ slug }}
+ {% if slug == current_version %}
{% endif %} + {% endfor %} +
+ {% endif %} + {% if downloads|length >= 1 %} +
+
{{ _('Downloads') }}
+ {% for type, url in downloads %} +
{{ type }}
+ {% endfor %} +
+ {% endif %} + {% if READTHEDOCS %} +
+
{{ _('On Read the Docs') }}
+
+ {{ _('Project Home') }} +
+
+ {{ _('Builds') }} +
+
+ {% endif %} +
+ {% trans %}Free document hosting provided by GitHub Pages.{% endtrans %} + +
+
+{% endif %} + diff --git a/third_party/highway/docs/buildDocs.sh b/third_party/highway/docs/buildDocs.sh new file mode 100644 index 0000000000..0cd587794a --- /dev/null +++ b/third_party/highway/docs/buildDocs.sh @@ -0,0 +1,156 @@ +#!/bin/bash +set -x +################################################################################ +# File: buildDocs.sh +# Purpose: Script that builds our documentation using sphinx and updates GitHub +# Pages. This script is executed by: +# .github/workflows/docs_pages_workflow.yml +# +# Authors: Michael Altfield +# Created: 2020-07-17 +# Updated: 2020-07-23 +# Version: 0.2 +################################################################################ + +################### +# INSTALL DEPENDS # +################### + +apt-get update +apt-get -y install git rsync pandoc python3-sphinx python3-sphinx-rtd-theme python3-stemmer python3-git python3-pip python3-virtualenv python3-setuptools + +python3 -m pip install --upgrade rinohtype pygments sphinx-rtd-theme sphinx-tabs docutils==0.16 pandoc +python3 -m pip list + +# get rid of all these safe dir warnings +git config --global --add safe.directory '*' + +##################### +# DECLARE VARIABLES # +##################### + +pwd +ls -lah +export SOURCE_DATE_EPOCH=$(git log -1 --pretty=%ct) + +# make a new temp dir which will be our GitHub Pages docroot +docroot=`mktemp -d` + +export REPO_NAME="${GITHUB_REPOSITORY##*/}" + +############## +# BUILD DOCS # +############## + +# first, cleanup any old builds' static assets +make -C docs clean + +# get a list of branches, excluding 'HEAD' and 'gh-pages' +versions="`git for-each-ref '--format=%(refname:lstrip=-1)' refs/remotes/origin/ | grep -viE '^(HEAD|gh-pages)$'`" +ls +for current_version in ${versions}; do + + # make the current language available to conf.py + export current_version + git checkout ${current_version} + + echo "INFO: Building sites for ${current_version}" + + cd docs && python3 mm-converter.py + cd .. + + # skip this branch if it doesn't have our docs dir & sphinx config + if [ ! 
-e 'docs/conf.py' ]; then + echo -e "\tINFO: Couldn't find 'docs/conf.py' (skipped)" + continue + fi + + languages="en" + for current_language in ${languages}; do + + # make the current language available to conf.py + export current_language + + ########## + # BUILDS # + ########## + echo "INFO: Building for ${current_language}" + + # HTML # + sphinx-build -b html docs/ docs/_build/html/${current_language}/${current_version} -D language="${current_language}" + + # EPUB # + sphinx-build -b epub docs/ docs/_build/epub -D language="${current_language}" + mkdir -p "${docroot}/${current_language}/${current_version}" + cp "docs/_build/epub/target.epub" "${docroot}/${current_language}/${current_version}/${REPO_NAME}_${current_language}_${current_version}.epub" + + # copy the static assets produced by the above build into our docroot + rsync -av "docs/_build/html/" "${docroot}/" + + done + +done + +# return to master branch +git checkout master + +####################### +# Update GitHub Pages # +####################### + +git config --global user.name "${GITHUB_ACTOR}" +git config --global user.email "${GITHUB_ACTOR}@users.noreply.github.com" + +pushd "${docroot}" + +# don't bother maintaining history; just generate fresh +git init +git remote add deploy "https://token:${GITHUB_TOKEN}@github.com/${GITHUB_REPOSITORY}.git" +git checkout -b gh-pages + +# add .nojekyll to the root so that github won't 404 on content added to dirs +# that start with an underscore (_), such as our "_content" dir.. +touch .nojekyll + +# add redirect from the docroot to our default docs language/version +cat > index.html < + + + ${REPO_NAME} Docs + + + +

Please wait while you're redirected to our documentation.

+ + +EOF + +# Add README +cat > README.md <NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/third_party/highway/docs/mm-converter.py b/third_party/highway/docs/mm-converter.py new file mode 100644 index 0000000000..3b2f1ddf6a --- /dev/null +++ b/third_party/highway/docs/mm-converter.py @@ -0,0 +1,108 @@ +#!/usr/bin/env python3 + +# set correct links (pandoc did not deal with github links properly) + +import os +import re +import subprocess + +regex_pdf_links1 = re.compile(r'`(.*)\`__', + re.M | re.X) # Multiline and Verbose +regex_md_links = re.compile(r'`(.*)\`__', + re.M | re.X) # Multiline and Verbose +regex_md_links2 = re.compile(r'`(.*)\n(.*)\`__', + re.M | re.X) # Multiline and Verbose +regex_pdf_links2 = re.compile(r'`(.*)\n\s+(.*)\`__', + re.M | re.X) # Multiline and Verbose + +def remove_links_to_index2(data): + # remove liks to the index, they are useless in py4web docs + data = data + print(re.search(regex_pdf_links2, data)) + return re.sub(regex_pdf_links2, + r':download:`\1 \2`', + data) + +def remove_links_to_index(data): + # remove liks to the index, they are useless in py4web docs + data = data + print(re.search(regex_pdf_links1, data)) + return re.sub(regex_pdf_links1, + r':download:`\1`', + data) + +def rewrite_md_links(data): + # remove liks to the index, they are useless in py4web docs + data = data + print(re.search(regex_md_links, data)) + data = re.sub(regex_md_links, + r'`\1<\2.html>`__', + data) + data = re.sub(regex_md_links2, + r'`\1 \2<\3.html>`__', + data) + return data + + +docs_on_pages = [ + 'README.md', + 'quick_reference.md', + 'design_philosophy.md', + 'impl_details.md', + 'faq.md', + 'release_testing_process.md' +] + +def convert2md(file): + print(f" Working on file {file}") + file = os.path.join('g3doc', file) + data = open(file, 'r').read() + write_files(file, data) + +def write_files(file, data): + for extension in ['rst']: + ext_dir = os.getcwd() + md_dir = os.path.join(os.getcwd(), 'g3doc') + if not os.path.isdir(ext_dir): + os.mkdir(ext_dir) + ext_file = os.path.join(ext_dir , os.path.splitext(os.path.basename(file))[0] + "." 
+ extension) + md_file = os.path.join(md_dir , os.path.splitext(os.path.basename(file))[0] + ".md") + print(f'writing {ext_file}') + if os.path.exists(ext_file): + os.unlink(ext_file) + with open(ext_file, 'w') as handler: + write_format(extension, ext_file, handler, md_file, data) + + +def write_format(extension, ext_file, handler, md_file, data): + if extension =='md': + handler.write(data) + elif extension =='rst': + try: + subprocess.call(['pandoc', '-s', md_file, '-f', 'markdown', '-t', 'rst', '-o', ext_file]) + data = open(ext_file, 'r').read() + data = remove_links_to_index(data) + data = remove_links_to_index2(data) + data = rewrite_md_links(data) + handler.write(data) + # Open a file for writing + # with open('tmp.txt', 'w') as f: + # Call the subprocess and redirect the output to the file + # subprocess.call(['awk', '{ gsub(/ +#include +#include +#include // malloc + +#include +#include + +#include "hwy/base.h" + +namespace hwy { +namespace { + +#if HWY_ARCH_RVV && defined(__riscv_v_intrinsic) && __riscv_v_intrinsic >= 11000 +// Not actually an upper bound on the size, but this value prevents crossing a +// 4K boundary (relevant on Andes). +constexpr size_t kAlignment = HWY_MAX(HWY_ALIGNMENT, 4096); +#else +constexpr size_t kAlignment = HWY_ALIGNMENT; +#endif + +#if HWY_ARCH_X86 +// On x86, aliasing can only occur at multiples of 2K, but that's too wasteful +// if this is used for single-vector allocations. 256 is more reasonable. +constexpr size_t kAlias = kAlignment * 4; +#else +constexpr size_t kAlias = kAlignment; +#endif + +#pragma pack(push, 1) +struct AllocationHeader { + void* allocated; + size_t payload_size; +}; +#pragma pack(pop) + +// Returns a 'random' (cyclical) offset for AllocateAlignedBytes. +size_t NextAlignedOffset() { + static std::atomic next{0}; + constexpr uint32_t kGroups = kAlias / kAlignment; + const uint32_t group = next.fetch_add(1, std::memory_order_relaxed) % kGroups; + const size_t offset = kAlignment * group; + HWY_DASSERT((offset % kAlignment == 0) && offset <= kAlias); + return offset; +} + +} // namespace + +HWY_DLLEXPORT void* AllocateAlignedBytes(const size_t payload_size, + AllocPtr alloc_ptr, void* opaque_ptr) { + HWY_ASSERT(payload_size != 0); // likely a bug in caller + if (payload_size >= std::numeric_limits::max() / 2) { + HWY_DASSERT(false && "payload_size too large"); + return nullptr; + } + + size_t offset = NextAlignedOffset(); + + // What: | misalign | unused | AllocationHeader |payload + // Size: |<= kAlias | offset |payload_size + // ^allocated.^aligned.^header............^payload + // The header must immediately precede payload, which must remain aligned. + // To avoid wasting space, the header resides at the end of `unused`, + // which therefore cannot be empty (offset == 0). + if (offset == 0) { + offset = kAlignment; // = RoundUpTo(sizeof(AllocationHeader), kAlignment) + static_assert(sizeof(AllocationHeader) <= kAlignment, "Else: round up"); + } + + const size_t allocated_size = kAlias + offset + payload_size; + void* allocated; + if (alloc_ptr == nullptr) { + allocated = malloc(allocated_size); + } else { + allocated = (*alloc_ptr)(opaque_ptr, allocated_size); + } + if (allocated == nullptr) return nullptr; + // Always round up even if already aligned - we already asked for kAlias + // extra bytes and there's no way to give them back. 
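+  // Illustrative walkthrough with assumed values: if kAlias is 256, offset is
+  // 64 and malloc returned 0x1003, the next two lines yield
+  // aligned = (0x1003 + 0x100) & ~0xFF = 0x1100 and payload = 0x1140, which
+  // remains kAlignment-aligned, with the header stored just below it.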
+ uintptr_t aligned = reinterpret_cast(allocated) + kAlias; + static_assert((kAlias & (kAlias - 1)) == 0, "kAlias must be a power of 2"); + static_assert(kAlias >= kAlignment, "Cannot align to more than kAlias"); + aligned &= ~(kAlias - 1); + + const uintptr_t payload = aligned + offset; // still aligned + + // Stash `allocated` and payload_size inside header for FreeAlignedBytes(). + // The allocated_size can be reconstructed from the payload_size. + AllocationHeader* header = reinterpret_cast(payload) - 1; + header->allocated = allocated; + header->payload_size = payload_size; + + return HWY_ASSUME_ALIGNED(reinterpret_cast(payload), kAlignment); +} + +HWY_DLLEXPORT void FreeAlignedBytes(const void* aligned_pointer, + FreePtr free_ptr, void* opaque_ptr) { + if (aligned_pointer == nullptr) return; + + const uintptr_t payload = reinterpret_cast(aligned_pointer); + HWY_DASSERT(payload % kAlignment == 0); + const AllocationHeader* header = + reinterpret_cast(payload) - 1; + + if (free_ptr == nullptr) { + free(header->allocated); + } else { + (*free_ptr)(opaque_ptr, header->allocated); + } +} + +// static +HWY_DLLEXPORT void AlignedDeleter::DeleteAlignedArray(void* aligned_pointer, + FreePtr free_ptr, + void* opaque_ptr, + ArrayDeleter deleter) { + if (aligned_pointer == nullptr) return; + + const uintptr_t payload = reinterpret_cast(aligned_pointer); + HWY_DASSERT(payload % kAlignment == 0); + const AllocationHeader* header = + reinterpret_cast(payload) - 1; + + if (deleter) { + (*deleter)(aligned_pointer, header->payload_size); + } + + if (free_ptr == nullptr) { + free(header->allocated); + } else { + (*free_ptr)(opaque_ptr, header->allocated); + } +} + +} // namespace hwy diff --git a/third_party/highway/hwy/aligned_allocator.h b/third_party/highway/hwy/aligned_allocator.h new file mode 100644 index 0000000000..d0671a5774 --- /dev/null +++ b/third_party/highway/hwy/aligned_allocator.h @@ -0,0 +1,211 @@ +// Copyright 2020 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef HIGHWAY_HWY_ALIGNED_ALLOCATOR_H_ +#define HIGHWAY_HWY_ALIGNED_ALLOCATOR_H_ + +// Memory allocator with support for alignment and offsets. + +#include +#include + +#include "hwy/base.h" + +namespace hwy { + +// Minimum alignment of allocated memory for use in HWY_ASSUME_ALIGNED, which +// requires a literal. This matches typical L1 cache line sizes, which prevents +// false sharing. +#define HWY_ALIGNMENT 64 + +// Pointers to functions equivalent to malloc/free with an opaque void* passed +// to them. +using AllocPtr = void* (*)(void* opaque, size_t bytes); +using FreePtr = void (*)(void* opaque, void* memory); + +// Returns null or a pointer to at least `payload_size` (which can be zero) +// bytes of newly allocated memory, aligned to the larger of HWY_ALIGNMENT and +// the vector size. Calls `alloc` with the passed `opaque` pointer to obtain +// memory or malloc() if it is null. 
+HWY_DLLEXPORT void* AllocateAlignedBytes(size_t payload_size, + AllocPtr alloc_ptr, void* opaque_ptr); + +// Frees all memory. No effect if `aligned_pointer` == nullptr, otherwise it +// must have been returned from a previous call to `AllocateAlignedBytes`. +// Calls `free_ptr` with the passed `opaque_ptr` pointer to free the memory; if +// `free_ptr` function is null, uses the default free(). +HWY_DLLEXPORT void FreeAlignedBytes(const void* aligned_pointer, + FreePtr free_ptr, void* opaque_ptr); + +// Class that deletes the aligned pointer passed to operator() calling the +// destructor before freeing the pointer. This is equivalent to the +// std::default_delete but for aligned objects. For a similar deleter equivalent +// to free() for aligned memory see AlignedFreer(). +class AlignedDeleter { + public: + AlignedDeleter() : free_(nullptr), opaque_ptr_(nullptr) {} + AlignedDeleter(FreePtr free_ptr, void* opaque_ptr) + : free_(free_ptr), opaque_ptr_(opaque_ptr) {} + + template + void operator()(T* aligned_pointer) const { + return DeleteAlignedArray(aligned_pointer, free_, opaque_ptr_, + TypedArrayDeleter); + } + + private: + template + static void TypedArrayDeleter(void* ptr, size_t size_in_bytes) { + size_t elems = size_in_bytes / sizeof(T); + for (size_t i = 0; i < elems; i++) { + // Explicitly call the destructor on each element. + (static_cast(ptr) + i)->~T(); + } + } + + // Function prototype that calls the destructor for each element in a typed + // array. TypeArrayDeleter would match this prototype. + using ArrayDeleter = void (*)(void* t_ptr, size_t t_size); + + HWY_DLLEXPORT static void DeleteAlignedArray(void* aligned_pointer, + FreePtr free_ptr, + void* opaque_ptr, + ArrayDeleter deleter); + + FreePtr free_; + void* opaque_ptr_; +}; + +// Unique pointer to T with custom aligned deleter. This can be a single +// element U or an array of element if T is a U[]. The custom aligned deleter +// will call the destructor on U or each element of a U[] in the array case. +template +using AlignedUniquePtr = std::unique_ptr; + +// Aligned memory equivalent of make_unique using the custom allocators +// alloc/free with the passed `opaque` pointer. This function calls the +// constructor with the passed Args... and calls the destructor of the object +// when the AlignedUniquePtr is destroyed. +template +AlignedUniquePtr MakeUniqueAlignedWithAlloc(AllocPtr alloc, FreePtr free, + void* opaque, Args&&... args) { + T* ptr = static_cast(AllocateAlignedBytes(sizeof(T), alloc, opaque)); + return AlignedUniquePtr(new (ptr) T(std::forward(args)...), + AlignedDeleter(free, opaque)); +} + +// Similar to MakeUniqueAlignedWithAlloc but using the default alloc/free +// functions. +template +AlignedUniquePtr MakeUniqueAligned(Args&&... args) { + T* ptr = static_cast(AllocateAlignedBytes( + sizeof(T), /*alloc_ptr=*/nullptr, /*opaque_ptr=*/nullptr)); + return AlignedUniquePtr(new (ptr) T(std::forward(args)...), + AlignedDeleter()); +} + +// Helpers for array allocators (avoids overflow) +namespace detail { + +// Returns x such that 1u << x == n (if n is a power of two). +static inline constexpr size_t ShiftCount(size_t n) { + return (n <= 1) ? 
0 : 1 + ShiftCount(n / 2); +} + +template +T* AllocateAlignedItems(size_t items, AllocPtr alloc_ptr, void* opaque_ptr) { + constexpr size_t size = sizeof(T); + + constexpr bool is_pow2 = (size & (size - 1)) == 0; + constexpr size_t bits = ShiftCount(size); + static_assert(!is_pow2 || (1ull << bits) == size, "ShiftCount is incorrect"); + + const size_t bytes = is_pow2 ? items << bits : items * size; + const size_t check = is_pow2 ? bytes >> bits : bytes / size; + if (check != items) { + return nullptr; // overflowed + } + return static_cast(AllocateAlignedBytes(bytes, alloc_ptr, opaque_ptr)); +} + +} // namespace detail + +// Aligned memory equivalent of make_unique for array types using the +// custom allocators alloc/free. This function calls the constructor with the +// passed Args... on every created item. The destructor of each element will be +// called when the AlignedUniquePtr is destroyed. +template +AlignedUniquePtr MakeUniqueAlignedArrayWithAlloc( + size_t items, AllocPtr alloc, FreePtr free, void* opaque, Args&&... args) { + T* ptr = detail::AllocateAlignedItems(items, alloc, opaque); + if (ptr != nullptr) { + for (size_t i = 0; i < items; i++) { + new (ptr + i) T(std::forward(args)...); + } + } + return AlignedUniquePtr(ptr, AlignedDeleter(free, opaque)); +} + +template +AlignedUniquePtr MakeUniqueAlignedArray(size_t items, Args&&... args) { + return MakeUniqueAlignedArrayWithAlloc( + items, nullptr, nullptr, nullptr, std::forward(args)...); +} + +// Custom deleter for std::unique_ptr equivalent to using free() as a deleter +// but for aligned memory. +class AlignedFreer { + public: + // Pass address of this to ctor to skip deleting externally-owned memory. + static void DoNothing(void* /*opaque*/, void* /*aligned_pointer*/) {} + + AlignedFreer() : free_(nullptr), opaque_ptr_(nullptr) {} + AlignedFreer(FreePtr free_ptr, void* opaque_ptr) + : free_(free_ptr), opaque_ptr_(opaque_ptr) {} + + template + void operator()(T* aligned_pointer) const { + // TODO(deymo): assert that we are using a POD type T. + FreeAlignedBytes(aligned_pointer, free_, opaque_ptr_); + } + + private: + FreePtr free_; + void* opaque_ptr_; +}; + +// Unique pointer to single POD, or (if T is U[]) an array of POD. For non POD +// data use AlignedUniquePtr. +template +using AlignedFreeUniquePtr = std::unique_ptr; + +// Allocate an aligned and uninitialized array of POD values as a unique_ptr. +// Upon destruction of the unique_ptr the aligned array will be freed. +template +AlignedFreeUniquePtr AllocateAligned(const size_t items, AllocPtr alloc, + FreePtr free, void* opaque) { + return AlignedFreeUniquePtr( + detail::AllocateAlignedItems(items, alloc, opaque), + AlignedFreer(free, opaque)); +} + +// Same as previous AllocateAligned(), using default allocate/free functions. +template +AlignedFreeUniquePtr AllocateAligned(const size_t items) { + return AllocateAligned(items, nullptr, nullptr, nullptr); +} + +} // namespace hwy +#endif // HIGHWAY_HWY_ALIGNED_ALLOCATOR_H_ diff --git a/third_party/highway/hwy/aligned_allocator_test.cc b/third_party/highway/hwy/aligned_allocator_test.cc new file mode 100644 index 0000000000..5d678a0de6 --- /dev/null +++ b/third_party/highway/hwy/aligned_allocator_test.cc @@ -0,0 +1,280 @@ +// Copyright 2020 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "hwy/aligned_allocator.h" + +#include +#include +#include // malloc + +#include +#include +#include +#include + +#include "gtest/gtest.h" + +namespace { + +// Sample object that keeps track on an external counter of how many times was +// the explicit constructor and destructor called. +template +class SampleObject { + public: + SampleObject() { data_[0] = 'a'; } + explicit SampleObject(int* counter) : counter_(counter) { + if (counter) (*counter)++; + data_[0] = 'b'; + } + + ~SampleObject() { + if (counter_) (*counter_)--; + } + + static_assert(N > sizeof(int*), "SampleObject size too small."); + int* counter_ = nullptr; + char data_[N - sizeof(int*)]; +}; + +class FakeAllocator { + public: + // static AllocPtr and FreePtr member to be used with the aligned + // allocator. These functions calls the private non-static members. + static void* StaticAlloc(void* opaque, size_t bytes) { + return reinterpret_cast(opaque)->Alloc(bytes); + } + static void StaticFree(void* opaque, void* memory) { + return reinterpret_cast(opaque)->Free(memory); + } + + // Returns the number of pending allocations to be freed. + size_t PendingAllocs() { return allocs_.size(); } + + private: + void* Alloc(size_t bytes) { + void* ret = malloc(bytes); + allocs_.insert(ret); + return ret; + } + void Free(void* memory) { + if (!memory) return; + EXPECT_NE(allocs_.end(), allocs_.find(memory)); + allocs_.erase(memory); + free(memory); + } + + std::set allocs_; +}; + +} // namespace + +namespace hwy { + +class AlignedAllocatorTest : public testing::Test {}; + +TEST(AlignedAllocatorTest, FreeNullptr) { + // Calling free with a nullptr is always ok. + FreeAlignedBytes(/*aligned_pointer=*/nullptr, /*free_ptr=*/nullptr, + /*opaque_ptr=*/nullptr); +} + +TEST(AlignedAllocatorTest, Log2) { + EXPECT_EQ(0u, detail::ShiftCount(1)); + EXPECT_EQ(1u, detail::ShiftCount(2)); + EXPECT_EQ(3u, detail::ShiftCount(8)); +} + +// Allocator returns null when it detects overflow of items * sizeof(T). +TEST(AlignedAllocatorTest, Overflow) { + constexpr size_t max = ~size_t(0); + constexpr size_t msb = (max >> 1) + 1; + using Size5 = std::array; + using Size10 = std::array; + EXPECT_EQ(nullptr, + detail::AllocateAlignedItems(max / 2, nullptr, nullptr)); + EXPECT_EQ(nullptr, + detail::AllocateAlignedItems(max / 3, nullptr, nullptr)); + EXPECT_EQ(nullptr, + detail::AllocateAlignedItems(max / 4, nullptr, nullptr)); + EXPECT_EQ(nullptr, + detail::AllocateAlignedItems(msb, nullptr, nullptr)); + EXPECT_EQ(nullptr, + detail::AllocateAlignedItems(msb + 1, nullptr, nullptr)); + EXPECT_EQ(nullptr, + detail::AllocateAlignedItems(msb / 4, nullptr, nullptr)); +} + +TEST(AlignedAllocatorTest, AllocDefaultPointers) { + const size_t kSize = 7777; + void* ptr = AllocateAlignedBytes(kSize, /*alloc_ptr=*/nullptr, + /*opaque_ptr=*/nullptr); + ASSERT_NE(nullptr, ptr); + // Make sure the pointer is actually aligned. + EXPECT_EQ(0U, reinterpret_cast(ptr) % HWY_ALIGNMENT); + char* p = static_cast(ptr); + size_t ret = 0; + for (size_t i = 0; i < kSize; i++) { + // Performs a computation using p[] to prevent it being optimized away. 
+ p[i] = static_cast(i & 0x7F); + if (i) ret += static_cast(p[i] * p[i - 1]); + } + EXPECT_NE(0U, ret); + FreeAlignedBytes(ptr, /*free_ptr=*/nullptr, /*opaque_ptr=*/nullptr); +} + +TEST(AlignedAllocatorTest, EmptyAlignedUniquePtr) { + AlignedUniquePtr> ptr(nullptr, AlignedDeleter()); + AlignedUniquePtr[]> arr(nullptr, AlignedDeleter()); +} + +TEST(AlignedAllocatorTest, EmptyAlignedFreeUniquePtr) { + AlignedFreeUniquePtr> ptr(nullptr, AlignedFreer()); + AlignedFreeUniquePtr[]> arr(nullptr, AlignedFreer()); +} + +TEST(AlignedAllocatorTest, CustomAlloc) { + FakeAllocator fake_alloc; + + const size_t kSize = 7777; + void* ptr = + AllocateAlignedBytes(kSize, &FakeAllocator::StaticAlloc, &fake_alloc); + ASSERT_NE(nullptr, ptr); + // We should have only requested one alloc from the allocator. + EXPECT_EQ(1U, fake_alloc.PendingAllocs()); + // Make sure the pointer is actually aligned. + EXPECT_EQ(0U, reinterpret_cast(ptr) % HWY_ALIGNMENT); + FreeAlignedBytes(ptr, &FakeAllocator::StaticFree, &fake_alloc); + EXPECT_EQ(0U, fake_alloc.PendingAllocs()); +} + +TEST(AlignedAllocatorTest, MakeUniqueAlignedDefaultConstructor) { + { + auto ptr = MakeUniqueAligned>(); + // Default constructor sets the data_[0] to 'a'. + EXPECT_EQ('a', ptr->data_[0]); + EXPECT_EQ(nullptr, ptr->counter_); + } +} + +TEST(AlignedAllocatorTest, MakeUniqueAligned) { + int counter = 0; + { + // Creates the object, initializes it with the explicit constructor and + // returns an unique_ptr to it. + auto ptr = MakeUniqueAligned>(&counter); + EXPECT_EQ(1, counter); + // Custom constructor sets the data_[0] to 'b'. + EXPECT_EQ('b', ptr->data_[0]); + } + EXPECT_EQ(0, counter); +} + +TEST(AlignedAllocatorTest, MakeUniqueAlignedArray) { + int counter = 0; + { + // Creates the array of objects and initializes them with the explicit + // constructor. + auto arr = MakeUniqueAlignedArray>(7, &counter); + EXPECT_EQ(7, counter); + for (size_t i = 0; i < 7; i++) { + // Custom constructor sets the data_[0] to 'b'. + EXPECT_EQ('b', arr[i].data_[0]) << "Where i = " << i; + } + } + EXPECT_EQ(0, counter); +} + +TEST(AlignedAllocatorTest, AllocSingleInt) { + auto ptr = AllocateAligned(1); + ASSERT_NE(nullptr, ptr.get()); + EXPECT_EQ(0U, reinterpret_cast(ptr.get()) % HWY_ALIGNMENT); + // Force delete of the unique_ptr now to check that it doesn't crash. + ptr.reset(nullptr); + EXPECT_EQ(nullptr, ptr.get()); +} + +TEST(AlignedAllocatorTest, AllocMultipleInt) { + const size_t kSize = 7777; + auto ptr = AllocateAligned(kSize); + ASSERT_NE(nullptr, ptr.get()); + EXPECT_EQ(0U, reinterpret_cast(ptr.get()) % HWY_ALIGNMENT); + // ptr[i] is actually (*ptr.get())[i] which will use the operator[] of the + // underlying type chosen by AllocateAligned() for the std::unique_ptr. + EXPECT_EQ(&(ptr[0]) + 1, &(ptr[1])); + + size_t ret = 0; + for (size_t i = 0; i < kSize; i++) { + // Performs a computation using ptr[] to prevent it being optimized away. + ptr[i] = static_cast(i); + if (i) ret += ptr[i] * ptr[i - 1]; + } + EXPECT_NE(0U, ret); +} + +TEST(AlignedAllocatorTest, AllocateAlignedObjectWithoutDestructor) { + int counter = 0; + { + // This doesn't call the constructor. + auto obj = AllocateAligned>(1); + obj[0].counter_ = &counter; + } + // Destroying the unique_ptr shouldn't have called the destructor of the + // SampleObject<24>. 
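+  // (AllocateAligned hands out uninitialized storage and its AlignedFreer only
+  // releases the memory, so no destructor runs and counter stays at 0.)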
+ EXPECT_EQ(0, counter); +} + +TEST(AlignedAllocatorTest, MakeUniqueAlignedArrayWithCustomAlloc) { + FakeAllocator fake_alloc; + int counter = 0; + { + // Creates the array of objects and initializes them with the explicit + // constructor. + auto arr = MakeUniqueAlignedArrayWithAlloc>( + 7, FakeAllocator::StaticAlloc, FakeAllocator::StaticFree, &fake_alloc, + &counter); + ASSERT_NE(nullptr, arr.get()); + // An array should still only call a single allocation. + EXPECT_EQ(1u, fake_alloc.PendingAllocs()); + EXPECT_EQ(7, counter); + for (size_t i = 0; i < 7; i++) { + // Custom constructor sets the data_[0] to 'b'. + EXPECT_EQ('b', arr[i].data_[0]) << "Where i = " << i; + } + } + EXPECT_EQ(0, counter); + EXPECT_EQ(0u, fake_alloc.PendingAllocs()); +} + +TEST(AlignedAllocatorTest, DefaultInit) { + // The test is whether this compiles. Default-init is useful for output params + // and per-thread storage. + std::vector> ptrs; + std::vector> free_ptrs; + ptrs.resize(128); + free_ptrs.resize(128); + // The following is to prevent elision of the pointers. + std::mt19937 rng(129); // Emscripten lacks random_device. + std::uniform_int_distribution dist(0, 127); + ptrs[dist(rng)] = MakeUniqueAlignedArray(123); + free_ptrs[dist(rng)] = AllocateAligned(456); + // "Use" pointer without resorting to printf. 0 == 0. Can't shift by 64. + const auto addr1 = reinterpret_cast(ptrs[dist(rng)].get()); + const auto addr2 = reinterpret_cast(free_ptrs[dist(rng)].get()); + constexpr size_t kBits = sizeof(uintptr_t) * 8; + EXPECT_EQ((addr1 >> (kBits - 1)) >> (kBits - 1), + (addr2 >> (kBits - 1)) >> (kBits - 1)); +} + +} // namespace hwy diff --git a/third_party/highway/hwy/base.h b/third_party/highway/hwy/base.h new file mode 100644 index 0000000000..0ca755a8f8 --- /dev/null +++ b/third_party/highway/hwy/base.h @@ -0,0 +1,1066 @@ +// Copyright 2020 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef HIGHWAY_HWY_BASE_H_ +#define HIGHWAY_HWY_BASE_H_ + +// For SIMD module implementations and their callers, target-independent. + +// IWYU pragma: begin_exports +#include +#include + +#include "hwy/detect_compiler_arch.h" +#include "hwy/highway_export.h" + +// "IWYU pragma: keep" does not work for this include, so hide it from the IDE. 
+#if ((HWY_ARCH_X86 && !defined(HWY_NO_LIBCXX)) || HWY_COMPILER_MSVC) && !HWY_IDE +#include +#endif +// IWYU pragma: end_exports + +#if HWY_COMPILER_MSVC +#include // memcpy +#endif + +//------------------------------------------------------------------------------ +// Compiler-specific definitions + +#define HWY_STR_IMPL(macro) #macro +#define HWY_STR(macro) HWY_STR_IMPL(macro) + +#if HWY_COMPILER_MSVC + +#include + +#define HWY_RESTRICT __restrict +#define HWY_INLINE __forceinline +#define HWY_NOINLINE __declspec(noinline) +#define HWY_FLATTEN +#define HWY_NORETURN __declspec(noreturn) +#define HWY_LIKELY(expr) (expr) +#define HWY_UNLIKELY(expr) (expr) +#define HWY_PRAGMA(tokens) __pragma(tokens) +#define HWY_DIAGNOSTICS(tokens) HWY_PRAGMA(warning(tokens)) +#define HWY_DIAGNOSTICS_OFF(msc, gcc) HWY_DIAGNOSTICS(msc) +#define HWY_MAYBE_UNUSED +#define HWY_HAS_ASSUME_ALIGNED 0 +#if (_MSC_VER >= 1700) +#define HWY_MUST_USE_RESULT _Check_return_ +#else +#define HWY_MUST_USE_RESULT +#endif + +#else + +#define HWY_RESTRICT __restrict__ +// force inlining without optimization enabled creates very inefficient code +// that can cause compiler timeout +#ifdef __OPTIMIZE__ +#define HWY_INLINE inline __attribute__((always_inline)) +#else +#define HWY_INLINE inline +#endif +#define HWY_NOINLINE __attribute__((noinline)) +#define HWY_FLATTEN __attribute__((flatten)) +#define HWY_NORETURN __attribute__((noreturn)) +#define HWY_LIKELY(expr) __builtin_expect(!!(expr), 1) +#define HWY_UNLIKELY(expr) __builtin_expect(!!(expr), 0) +#define HWY_PRAGMA(tokens) _Pragma(#tokens) +#define HWY_DIAGNOSTICS(tokens) HWY_PRAGMA(GCC diagnostic tokens) +#define HWY_DIAGNOSTICS_OFF(msc, gcc) HWY_DIAGNOSTICS(gcc) +// Encountered "attribute list cannot appear here" when using the C++17 +// [[maybe_unused]], so only use the old style attribute for now. +#define HWY_MAYBE_UNUSED __attribute__((unused)) +#define HWY_MUST_USE_RESULT __attribute__((warn_unused_result)) + +#endif // !HWY_COMPILER_MSVC + +//------------------------------------------------------------------------------ +// Builtin/attributes + +// Enables error-checking of format strings. +#if HWY_HAS_ATTRIBUTE(__format__) +#define HWY_FORMAT(idx_fmt, idx_arg) \ + __attribute__((__format__(__printf__, idx_fmt, idx_arg))) +#else +#define HWY_FORMAT(idx_fmt, idx_arg) +#endif + +// Returns a void* pointer which the compiler then assumes is N-byte aligned. +// Example: float* HWY_RESTRICT aligned = (float*)HWY_ASSUME_ALIGNED(in, 32); +// +// The assignment semantics are required by GCC/Clang. ICC provides an in-place +// __assume_aligned, whereas MSVC's __assume appears unsuitable. +#if HWY_HAS_BUILTIN(__builtin_assume_aligned) +#define HWY_ASSUME_ALIGNED(ptr, align) __builtin_assume_aligned((ptr), (align)) +#else +#define HWY_ASSUME_ALIGNED(ptr, align) (ptr) /* not supported */ +#endif + +// Clang and GCC require attributes on each function into which SIMD intrinsics +// are inlined. Support both per-function annotation (HWY_ATTR) for lambdas and +// automatic annotation via pragmas. 
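+// Illustrative sketch (HWY_ATTR and Transform are defined in other Highway
+// headers; this is only an assumed example, not part of this file): a lambda
+// passed to SIMD helper code would carry the per-function annotation, e.g.
+//   Transform(d, data, count, [](auto d, auto v) HWY_ATTR { return v; });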
+#if HWY_COMPILER_CLANG +#define HWY_PUSH_ATTRIBUTES(targets_str) \ + HWY_PRAGMA(clang attribute push(__attribute__((target(targets_str))), \ + apply_to = function)) +#define HWY_POP_ATTRIBUTES HWY_PRAGMA(clang attribute pop) +#elif HWY_COMPILER_GCC +#define HWY_PUSH_ATTRIBUTES(targets_str) \ + HWY_PRAGMA(GCC push_options) HWY_PRAGMA(GCC target targets_str) +#define HWY_POP_ATTRIBUTES HWY_PRAGMA(GCC pop_options) +#else +#define HWY_PUSH_ATTRIBUTES(targets_str) +#define HWY_POP_ATTRIBUTES +#endif + +//------------------------------------------------------------------------------ +// Macros + +#define HWY_API static HWY_INLINE HWY_FLATTEN HWY_MAYBE_UNUSED + +#define HWY_CONCAT_IMPL(a, b) a##b +#define HWY_CONCAT(a, b) HWY_CONCAT_IMPL(a, b) + +#define HWY_MIN(a, b) ((a) < (b) ? (a) : (b)) +#define HWY_MAX(a, b) ((a) > (b) ? (a) : (b)) + +#if HWY_COMPILER_GCC_ACTUAL +// nielskm: GCC does not support '#pragma GCC unroll' without the factor. +#define HWY_UNROLL(factor) HWY_PRAGMA(GCC unroll factor) +#define HWY_DEFAULT_UNROLL HWY_UNROLL(4) +#elif HWY_COMPILER_CLANG || HWY_COMPILER_ICC || HWY_COMPILER_ICX +#define HWY_UNROLL(factor) HWY_PRAGMA(unroll factor) +#define HWY_DEFAULT_UNROLL HWY_UNROLL() +#else +#define HWY_UNROLL(factor) +#define HWY_DEFAULT_UNROLL +#endif + +// Tell a compiler that the expression always evaluates to true. +// The expression should be free from any side effects. +// Some older compilers may have trouble with complex expressions, therefore +// it is advisable to split multiple conditions into separate assume statements, +// and manually check the generated code. +// OK but could fail: +// HWY_ASSUME(x == 2 && y == 3); +// Better: +// HWY_ASSUME(x == 2); +// HWY_ASSUME(y == 3); +#if HWY_HAS_CPP_ATTRIBUTE(assume) +#define HWY_ASSUME(expr) [[assume(expr)]] +#elif HWY_COMPILER_MSVC || HWY_COMPILER_ICC +#define HWY_ASSUME(expr) __assume(expr) +// __builtin_assume() was added in clang 3.6. +#elif HWY_COMPILER_CLANG && HWY_HAS_BUILTIN(__builtin_assume) +#define HWY_ASSUME(expr) __builtin_assume(expr) +// __builtin_unreachable() was added in GCC 4.5, but __has_builtin() was added +// later, so check for the compiler version directly. +#elif HWY_COMPILER_GCC_ACTUAL >= 405 +#define HWY_ASSUME(expr) \ + ((expr) ? static_cast(0) : __builtin_unreachable()) +#else +#define HWY_ASSUME(expr) static_cast(0) +#endif + +// Compile-time fence to prevent undesirable code reordering. On Clang x86, the +// typical asm volatile("" : : : "memory") has no effect, whereas atomic fence +// does, without generating code. +#if HWY_ARCH_X86 && !defined(HWY_NO_LIBCXX) +#define HWY_FENCE std::atomic_thread_fence(std::memory_order_acq_rel) +#else +// TODO(janwas): investigate alternatives. On Arm, the above generates barriers. +#define HWY_FENCE +#endif + +// 4 instances of a given literal value, useful as input to LoadDup128. +#define HWY_REP4(literal) literal, literal, literal, literal + +#define HWY_ABORT(format, ...) \ + ::hwy::Abort(__FILE__, __LINE__, format, ##__VA_ARGS__) + +// Always enabled. 
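+// Example: HWY_ASSERT(ptr != nullptr); on failure this calls HWY_ABORT, which
+// reports the file, line and stringified condition, then terminates.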
+#define HWY_ASSERT(condition) \ + do { \ + if (!(condition)) { \ + HWY_ABORT("Assert %s", #condition); \ + } \ + } while (0) + +#if HWY_HAS_FEATURE(memory_sanitizer) || defined(MEMORY_SANITIZER) +#define HWY_IS_MSAN 1 +#else +#define HWY_IS_MSAN 0 +#endif + +#if HWY_HAS_FEATURE(address_sanitizer) || defined(ADDRESS_SANITIZER) +#define HWY_IS_ASAN 1 +#else +#define HWY_IS_ASAN 0 +#endif + +#if HWY_HAS_FEATURE(thread_sanitizer) || defined(THREAD_SANITIZER) +#define HWY_IS_TSAN 1 +#else +#define HWY_IS_TSAN 0 +#endif + +// MSAN may cause lengthy build times or false positives e.g. in AVX3 DemoteTo. +// You can disable MSAN by adding this attribute to the function that fails. +#if HWY_IS_MSAN +#define HWY_ATTR_NO_MSAN __attribute__((no_sanitize_memory)) +#else +#define HWY_ATTR_NO_MSAN +#endif + +// For enabling HWY_DASSERT and shortening tests in slower debug builds +#if !defined(HWY_IS_DEBUG_BUILD) +// Clang does not define NDEBUG, but it and GCC define __OPTIMIZE__, and recent +// MSVC defines NDEBUG (if not, could instead check _DEBUG). +#if (!defined(__OPTIMIZE__) && !defined(NDEBUG)) || HWY_IS_ASAN || \ + HWY_IS_MSAN || HWY_IS_TSAN || defined(__clang_analyzer__) +#define HWY_IS_DEBUG_BUILD 1 +#else +#define HWY_IS_DEBUG_BUILD 0 +#endif +#endif // HWY_IS_DEBUG_BUILD + +#if HWY_IS_DEBUG_BUILD +#define HWY_DASSERT(condition) HWY_ASSERT(condition) +#else +#define HWY_DASSERT(condition) \ + do { \ + } while (0) +#endif + +namespace hwy { + +//------------------------------------------------------------------------------ +// kMaxVectorSize (undocumented, pending removal) + +#if HWY_ARCH_X86 +static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 64; // AVX-512 +#elif HWY_ARCH_RVV && defined(__riscv_v_intrinsic) && \ + __riscv_v_intrinsic >= 11000 +// Not actually an upper bound on the size. +static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 4096; +#else +static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 16; +#endif + +//------------------------------------------------------------------------------ +// Alignment + +// Potentially useful for LoadDup128 and capped vectors. In other cases, arrays +// should be allocated dynamically via aligned_allocator.h because Lanes() may +// exceed the stack size. +#if HWY_ARCH_X86 +#define HWY_ALIGN_MAX alignas(64) +#elif HWY_ARCH_RVV && defined(__riscv_v_intrinsic) && \ + __riscv_v_intrinsic >= 11000 +#define HWY_ALIGN_MAX alignas(8) // only elements need be aligned +#else +#define HWY_ALIGN_MAX alignas(16) +#endif + +//------------------------------------------------------------------------------ +// Lane types + +// Match [u]int##_t naming scheme so rvv-inl.h macros can obtain the type name +// by concatenating base type and bits. + +#pragma pack(push, 1) + +// ACLE (https://gcc.gnu.org/onlinedocs/gcc/Half-Precision.html): +// always supported on Armv8, for Armv7 only if -mfp16-format is given. +#if ((HWY_ARCH_ARM_A64 || (__ARM_FP & 2)) && HWY_COMPILER_GCC) +using float16_t = __fp16; +// C11 extension ISO/IEC TS 18661-3:2015 but not supported on all targets. +// Required for Clang RVV if the float16 extension is used. +#elif HWY_ARCH_RVV && HWY_COMPILER_CLANG && defined(__riscv_zvfh) +using float16_t = _Float16; +// Otherwise emulate +#else +struct float16_t { + uint16_t bits; +}; +#endif + +struct bfloat16_t { + uint16_t bits; +}; + +#pragma pack(pop) + +using float32_t = float; +using float64_t = double; + +#pragma pack(push, 1) + +// Aligned 128-bit type. 
Cannot use __int128 because clang doesn't yet align it: +// https://reviews.llvm.org/D86310 +struct alignas(16) uint128_t { + uint64_t lo; // little-endian layout + uint64_t hi; +}; + +// 64 bit key plus 64 bit value. Faster than using uint128_t when only the key +// field is to be compared (Lt128Upper instead of Lt128). +struct alignas(16) K64V64 { + uint64_t value; // little-endian layout + uint64_t key; +}; + +// 32 bit key plus 32 bit value. Allows vqsort recursions to terminate earlier +// than when considering both to be a 64-bit key. +struct alignas(8) K32V32 { + uint32_t value; // little-endian layout + uint32_t key; +}; + +#pragma pack(pop) + +static inline HWY_MAYBE_UNUSED bool operator<(const uint128_t& a, + const uint128_t& b) { + return (a.hi == b.hi) ? a.lo < b.lo : a.hi < b.hi; +} +// Required for std::greater. +static inline HWY_MAYBE_UNUSED bool operator>(const uint128_t& a, + const uint128_t& b) { + return b < a; +} +static inline HWY_MAYBE_UNUSED bool operator==(const uint128_t& a, + const uint128_t& b) { + return a.lo == b.lo && a.hi == b.hi; +} + +static inline HWY_MAYBE_UNUSED bool operator<(const K64V64& a, + const K64V64& b) { + return a.key < b.key; +} +// Required for std::greater. +static inline HWY_MAYBE_UNUSED bool operator>(const K64V64& a, + const K64V64& b) { + return b < a; +} +static inline HWY_MAYBE_UNUSED bool operator==(const K64V64& a, + const K64V64& b) { + return a.key == b.key; +} + +static inline HWY_MAYBE_UNUSED bool operator<(const K32V32& a, + const K32V32& b) { + return a.key < b.key; +} +// Required for std::greater. +static inline HWY_MAYBE_UNUSED bool operator>(const K32V32& a, + const K32V32& b) { + return b < a; +} +static inline HWY_MAYBE_UNUSED bool operator==(const K32V32& a, + const K32V32& b) { + return a.key == b.key; +} + +//------------------------------------------------------------------------------ +// Controlling overload resolution (SFINAE) + +template +struct EnableIfT {}; +template <> +struct EnableIfT { + using type = void; +}; + +template +using EnableIf = typename EnableIfT::type; + +template +struct IsSameT { + enum { value = 0 }; +}; + +template +struct IsSameT { + enum { value = 1 }; +}; + +template +HWY_API constexpr bool IsSame() { + return IsSameT::value; +} + +template +struct IfT { + using type = Then; +}; + +template +struct IfT { + using type = Else; +}; + +template +using If = typename IfT::type; + +// Insert into template/function arguments to enable this overload only for +// vectors of exactly, at most (LE), or more than (GT) this many bytes. +// +// As an example, checking for a total size of 16 bytes will match both +// Simd and Simd. 
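+// Illustrative use (a sketch, not a declaration from this header): an overload
+// limited to vectors of at most 8 bytes could be written as
+//   template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 8)> ...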
+#define HWY_IF_V_SIZE(T, kN, bytes) \ + hwy::EnableIf* = nullptr +#define HWY_IF_V_SIZE_LE(T, kN, bytes) \ + hwy::EnableIf* = nullptr +#define HWY_IF_V_SIZE_GT(T, kN, bytes) \ + hwy::EnableIf<(kN * sizeof(T) > bytes)>* = nullptr + +#define HWY_IF_LANES(kN, lanes) hwy::EnableIf<(kN == lanes)>* = nullptr +#define HWY_IF_LANES_LE(kN, lanes) hwy::EnableIf<(kN <= lanes)>* = nullptr +#define HWY_IF_LANES_GT(kN, lanes) hwy::EnableIf<(kN > lanes)>* = nullptr + +#define HWY_IF_UNSIGNED(T) hwy::EnableIf()>* = nullptr +#define HWY_IF_SIGNED(T) \ + hwy::EnableIf() && !IsFloat() && !IsSpecialFloat()>* = \ + nullptr +#define HWY_IF_FLOAT(T) hwy::EnableIf()>* = nullptr +#define HWY_IF_NOT_FLOAT(T) hwy::EnableIf()>* = nullptr +#define HWY_IF_SPECIAL_FLOAT(T) \ + hwy::EnableIf()>* = nullptr +#define HWY_IF_NOT_SPECIAL_FLOAT(T) \ + hwy::EnableIf()>* = nullptr +#define HWY_IF_NOT_FLOAT_NOR_SPECIAL(T) \ + hwy::EnableIf() && !hwy::IsSpecialFloat()>* = nullptr + +#define HWY_IF_T_SIZE(T, bytes) hwy::EnableIf* = nullptr +#define HWY_IF_NOT_T_SIZE(T, bytes) \ + hwy::EnableIf* = nullptr +// bit_array = 0x102 means 1 or 8 bytes. There is no NONE_OF because it sounds +// too similar. If you want the opposite of this (2 or 4 bytes), ask for those +// bits explicitly (0x14) instead of attempting to 'negate' 0x102. +#define HWY_IF_T_SIZE_ONE_OF(T, bit_array) \ + hwy::EnableIf<((size_t{1} << sizeof(T)) & (bit_array)) != 0>* = nullptr + +// Use instead of HWY_IF_T_SIZE to avoid ambiguity with float/double +// overloads. +#define HWY_IF_UI32(T) \ + hwy::EnableIf() || IsSame()>* = nullptr +#define HWY_IF_UI64(T) \ + hwy::EnableIf() || IsSame()>* = nullptr + +#define HWY_IF_LANES_PER_BLOCK(T, N, LANES) \ + hwy::EnableIf* = nullptr + +// Empty struct used as a size tag type. +template +struct SizeTag {}; + +template +struct RemoveConstT { + using type = T; +}; +template +struct RemoveConstT { + using type = T; +}; + +template +using RemoveConst = typename RemoveConstT::type; + +template +struct RemoveRefT { + using type = T; +}; +template +struct RemoveRefT { + using type = T; +}; + +template +using RemoveRef = typename RemoveRefT::type; + +//------------------------------------------------------------------------------ +// Type relations + +namespace detail { + +template +struct Relations; +template <> +struct Relations { + using Unsigned = uint8_t; + using Signed = int8_t; + using Wide = uint16_t; + enum { is_signed = 0, is_float = 0 }; +}; +template <> +struct Relations { + using Unsigned = uint8_t; + using Signed = int8_t; + using Wide = int16_t; + enum { is_signed = 1, is_float = 0 }; +}; +template <> +struct Relations { + using Unsigned = uint16_t; + using Signed = int16_t; + using Wide = uint32_t; + using Narrow = uint8_t; + enum { is_signed = 0, is_float = 0 }; +}; +template <> +struct Relations { + using Unsigned = uint16_t; + using Signed = int16_t; + using Wide = int32_t; + using Narrow = int8_t; + enum { is_signed = 1, is_float = 0 }; +}; +template <> +struct Relations { + using Unsigned = uint32_t; + using Signed = int32_t; + using Float = float; + using Wide = uint64_t; + using Narrow = uint16_t; + enum { is_signed = 0, is_float = 0 }; +}; +template <> +struct Relations { + using Unsigned = uint32_t; + using Signed = int32_t; + using Float = float; + using Wide = int64_t; + using Narrow = int16_t; + enum { is_signed = 1, is_float = 0 }; +}; +template <> +struct Relations { + using Unsigned = uint64_t; + using Signed = int64_t; + using Float = double; + using Wide = uint128_t; + using Narrow = uint32_t; + enum 
{ is_signed = 0, is_float = 0 }; +}; +template <> +struct Relations { + using Unsigned = uint64_t; + using Signed = int64_t; + using Float = double; + using Narrow = int32_t; + enum { is_signed = 1, is_float = 0 }; +}; +template <> +struct Relations { + using Unsigned = uint128_t; + using Narrow = uint64_t; + enum { is_signed = 0, is_float = 0 }; +}; +template <> +struct Relations { + using Unsigned = uint16_t; + using Signed = int16_t; + using Float = float16_t; + using Wide = float; + enum { is_signed = 1, is_float = 1 }; +}; +template <> +struct Relations { + using Unsigned = uint16_t; + using Signed = int16_t; + using Wide = float; + enum { is_signed = 1, is_float = 1 }; +}; +template <> +struct Relations { + using Unsigned = uint32_t; + using Signed = int32_t; + using Float = float; + using Wide = double; + using Narrow = float16_t; + enum { is_signed = 1, is_float = 1 }; +}; +template <> +struct Relations { + using Unsigned = uint64_t; + using Signed = int64_t; + using Float = double; + using Narrow = float; + enum { is_signed = 1, is_float = 1 }; +}; + +template +struct TypeFromSize; +template <> +struct TypeFromSize<1> { + using Unsigned = uint8_t; + using Signed = int8_t; +}; +template <> +struct TypeFromSize<2> { + using Unsigned = uint16_t; + using Signed = int16_t; +}; +template <> +struct TypeFromSize<4> { + using Unsigned = uint32_t; + using Signed = int32_t; + using Float = float; +}; +template <> +struct TypeFromSize<8> { + using Unsigned = uint64_t; + using Signed = int64_t; + using Float = double; +}; +template <> +struct TypeFromSize<16> { + using Unsigned = uint128_t; +}; + +} // namespace detail + +// Aliases for types of a different category, but the same size. +template +using MakeUnsigned = typename detail::Relations::Unsigned; +template +using MakeSigned = typename detail::Relations::Signed; +template +using MakeFloat = typename detail::Relations::Float; + +// Aliases for types of the same category, but different size. +template +using MakeWide = typename detail::Relations::Wide; +template +using MakeNarrow = typename detail::Relations::Narrow; + +// Obtain type from its size [bytes]. +template +using UnsignedFromSize = typename detail::TypeFromSize::Unsigned; +template +using SignedFromSize = typename detail::TypeFromSize::Signed; +template +using FloatFromSize = typename detail::TypeFromSize::Float; + +// Avoid confusion with SizeTag where the parameter is a lane size. +using UnsignedTag = SizeTag<0>; +using SignedTag = SizeTag<0x100>; // integer +using FloatTag = SizeTag<0x200>; + +template > +constexpr auto TypeTag() -> hwy::SizeTag<((R::is_signed + R::is_float) << 8)> { + return hwy::SizeTag<((R::is_signed + R::is_float) << 8)>(); +} + +// For when we only want to distinguish FloatTag from everything else. +using NonFloatTag = SizeTag<0x400>; + +template > +constexpr auto IsFloatTag() -> hwy::SizeTag<(R::is_float ? 0x200 : 0x400)> { + return hwy::SizeTag<(R::is_float ? 0x200 : 0x400)>(); +} + +//------------------------------------------------------------------------------ +// Type traits + +template +HWY_API constexpr bool IsFloat() { + // Cannot use T(1.25) != T(1) for float16_t, which can only be converted to or + // from a float, not compared. + return IsSame() || IsSame(); +} + +// These types are often special-cased and not supported in all ops. 
+template +HWY_API constexpr bool IsSpecialFloat() { + return IsSame() || IsSame(); +} + +template +HWY_API constexpr bool IsSigned() { + return T(0) > T(-1); +} +template <> +constexpr bool IsSigned() { + return true; +} +template <> +constexpr bool IsSigned() { + return true; +} + +// Largest/smallest representable integer values. +template +HWY_API constexpr T LimitsMax() { + static_assert(!IsFloat(), "Only for integer types"); + using TU = MakeUnsigned; + return static_cast(IsSigned() ? (static_cast(~0ull) >> 1) + : static_cast(~0ull)); +} +template +HWY_API constexpr T LimitsMin() { + static_assert(!IsFloat(), "Only for integer types"); + return IsSigned() ? T(-1) - LimitsMax() : T(0); +} + +// Largest/smallest representable value (integer or float). This naming avoids +// confusion with numeric_limits::min() (the smallest positive value). +template +HWY_API constexpr T LowestValue() { + return LimitsMin(); +} +template <> +constexpr float LowestValue() { + return -3.402823466e+38F; +} +template <> +constexpr double LowestValue() { + return -1.7976931348623158e+308; +} + +template +HWY_API constexpr T HighestValue() { + return LimitsMax(); +} +template <> +constexpr float HighestValue() { + return 3.402823466e+38F; +} +template <> +constexpr double HighestValue() { + return 1.7976931348623158e+308; +} + +// Difference between 1.0 and the next representable value. +template +HWY_API constexpr T Epsilon() { + return 1; +} +template <> +constexpr float Epsilon() { + return 1.192092896e-7f; +} +template <> +constexpr double Epsilon() { + return 2.2204460492503131e-16; +} + +// Returns width in bits of the mantissa field in IEEE binary32/64. +template +constexpr int MantissaBits() { + static_assert(sizeof(T) == 0, "Only instantiate the specializations"); + return 0; +} +template <> +constexpr int MantissaBits() { + return 23; +} +template <> +constexpr int MantissaBits() { + return 52; +} + +// Returns the (left-shifted by one bit) IEEE binary32/64 representation with +// the largest possible (biased) exponent field. Used by IsInf. +template +constexpr MakeSigned MaxExponentTimes2() { + return -(MakeSigned{1} << (MantissaBits() + 1)); +} + +// Returns bitmask of the sign bit in IEEE binary32/64. +template +constexpr MakeUnsigned SignMask() { + return MakeUnsigned{1} << (sizeof(T) * 8 - 1); +} + +// Returns bitmask of the exponent field in IEEE binary32/64. +template +constexpr MakeUnsigned ExponentMask() { + return (~(MakeUnsigned{1} << MantissaBits()) + 1) & ~SignMask(); +} + +// Returns bitmask of the mantissa field in IEEE binary32/64. +template +constexpr MakeUnsigned MantissaMask() { + return (MakeUnsigned{1} << MantissaBits()) - 1; +} + +// Returns 1 << mantissa_bits as a floating-point number. All integers whose +// absolute value are less than this can be represented exactly. +template +constexpr T MantissaEnd() { + static_assert(sizeof(T) == 0, "Only instantiate the specializations"); + return 0; +} +template <> +constexpr float MantissaEnd() { + return 8388608.0f; // 1 << 23 +} +template <> +constexpr double MantissaEnd() { + // floating point literal with p52 requires C++17. + return 4503599627370496.0; // 1 << 52 +} + +// Returns width in bits of the exponent field in IEEE binary32/64. +template +constexpr int ExponentBits() { + // Exponent := remaining bits after deducting sign and mantissa. + return 8 * sizeof(T) - 1 - MantissaBits(); +} + +// Returns largest value of the biased exponent field in IEEE binary32/64, +// right-shifted so that the LSB is bit zero. 
Example: 0xFF for float. +// This is expressed as a signed integer for more efficient comparison. +template +constexpr MakeSigned MaxExponentField() { + return (MakeSigned{1} << ExponentBits()) - 1; +} + +//------------------------------------------------------------------------------ +// Helper functions + +template +constexpr inline T1 DivCeil(T1 a, T2 b) { + return (a + b - 1) / b; +} + +// Works for any `align`; if a power of two, compiler emits ADD+AND. +constexpr inline size_t RoundUpTo(size_t what, size_t align) { + return DivCeil(what, align) * align; +} + +// Undefined results for x == 0. +HWY_API size_t Num0BitsBelowLS1Bit_Nonzero32(const uint32_t x) { +#if HWY_COMPILER_MSVC + unsigned long index; // NOLINT + _BitScanForward(&index, x); + return index; +#else // HWY_COMPILER_MSVC + return static_cast(__builtin_ctz(x)); +#endif // HWY_COMPILER_MSVC +} + +HWY_API size_t Num0BitsBelowLS1Bit_Nonzero64(const uint64_t x) { +#if HWY_COMPILER_MSVC +#if HWY_ARCH_X86_64 + unsigned long index; // NOLINT + _BitScanForward64(&index, x); + return index; +#else // HWY_ARCH_X86_64 + // _BitScanForward64 not available + uint32_t lsb = static_cast(x & 0xFFFFFFFF); + unsigned long index; // NOLINT + if (lsb == 0) { + uint32_t msb = static_cast(x >> 32u); + _BitScanForward(&index, msb); + return 32 + index; + } else { + _BitScanForward(&index, lsb); + return index; + } +#endif // HWY_ARCH_X86_64 +#else // HWY_COMPILER_MSVC + return static_cast(__builtin_ctzll(x)); +#endif // HWY_COMPILER_MSVC +} + +// Undefined results for x == 0. +HWY_API size_t Num0BitsAboveMS1Bit_Nonzero32(const uint32_t x) { +#if HWY_COMPILER_MSVC + unsigned long index; // NOLINT + _BitScanReverse(&index, x); + return 31 - index; +#else // HWY_COMPILER_MSVC + return static_cast(__builtin_clz(x)); +#endif // HWY_COMPILER_MSVC +} + +HWY_API size_t Num0BitsAboveMS1Bit_Nonzero64(const uint64_t x) { +#if HWY_COMPILER_MSVC +#if HWY_ARCH_X86_64 + unsigned long index; // NOLINT + _BitScanReverse64(&index, x); + return 63 - index; +#else // HWY_ARCH_X86_64 + // _BitScanReverse64 not available + const uint32_t msb = static_cast(x >> 32u); + unsigned long index; // NOLINT + if (msb == 0) { + const uint32_t lsb = static_cast(x & 0xFFFFFFFF); + _BitScanReverse(&index, lsb); + return 63 - index; + } else { + _BitScanReverse(&index, msb); + return 31 - index; + } +#endif // HWY_ARCH_X86_64 +#else // HWY_COMPILER_MSVC + return static_cast(__builtin_clzll(x)); +#endif // HWY_COMPILER_MSVC +} + +HWY_API size_t PopCount(uint64_t x) { +#if HWY_COMPILER_GCC // includes clang + return static_cast(__builtin_popcountll(x)); + // This instruction has a separate feature flag, but is often called from + // non-SIMD code, so we don't want to require dynamic dispatch. It was first + // supported by Intel in Nehalem (SSE4.2), but MSVC only predefines a macro + // for AVX, so check for that. +#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64 && defined(__AVX__) + return _mm_popcnt_u64(x); +#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_32 && defined(__AVX__) + return _mm_popcnt_u32(static_cast(x & 0xFFFFFFFFu)) + + _mm_popcnt_u32(static_cast(x >> 32)); +#else + x -= ((x >> 1) & 0x5555555555555555ULL); + x = (((x >> 2) & 0x3333333333333333ULL) + (x & 0x3333333333333333ULL)); + x = (((x >> 4) + x) & 0x0F0F0F0F0F0F0F0FULL); + x += (x >> 8); + x += (x >> 16); + x += (x >> 32); + return static_cast(x & 0x7Fu); +#endif +} + +// Skip HWY_API due to GCC "function not considered for inlining". 
Previously +// such errors were caused by underlying type mismatches, but it's not clear +// what is still mismatched despite all the casts. +template +/*HWY_API*/ constexpr size_t FloorLog2(TI x) { + return x == TI{1} + ? 0 + : static_cast(FloorLog2(static_cast(x >> 1)) + 1); +} + +template +/*HWY_API*/ constexpr size_t CeilLog2(TI x) { + return x == TI{1} + ? 0 + : static_cast(FloorLog2(static_cast(x - 1)) + 1); +} + +template +HWY_INLINE constexpr T AddWithWraparound(hwy::FloatTag /*tag*/, T t, size_t n) { + return t + static_cast(n); +} + +template +HWY_INLINE constexpr T AddWithWraparound(hwy::NonFloatTag /*tag*/, T t, + size_t n) { + using TU = MakeUnsigned; + return static_cast( + static_cast(static_cast(t) + static_cast(n)) & + hwy::LimitsMax()); +} + +#if HWY_COMPILER_MSVC && HWY_ARCH_X86_64 +#pragma intrinsic(_umul128) +#endif + +// 64 x 64 = 128 bit multiplication +HWY_API uint64_t Mul128(uint64_t a, uint64_t b, uint64_t* HWY_RESTRICT upper) { +#if defined(__SIZEOF_INT128__) + __uint128_t product = (__uint128_t)a * (__uint128_t)b; + *upper = (uint64_t)(product >> 64); + return (uint64_t)(product & 0xFFFFFFFFFFFFFFFFULL); +#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64 + return _umul128(a, b, upper); +#else + constexpr uint64_t kLo32 = 0xFFFFFFFFU; + const uint64_t lo_lo = (a & kLo32) * (b & kLo32); + const uint64_t hi_lo = (a >> 32) * (b & kLo32); + const uint64_t lo_hi = (a & kLo32) * (b >> 32); + const uint64_t hi_hi = (a >> 32) * (b >> 32); + const uint64_t t = (lo_lo >> 32) + (hi_lo & kLo32) + lo_hi; + *upper = (hi_lo >> 32) + (t >> 32) + hi_hi; + return (t << 32) | (lo_lo & kLo32); +#endif +} + +#if HWY_COMPILER_MSVC +#pragma intrinsic(memcpy) +#pragma intrinsic(memset) +#endif + +// The source/destination must not overlap/alias. +template +HWY_API void CopyBytes(const From* from, To* to) { +#if HWY_COMPILER_MSVC + memcpy(to, from, kBytes); +#else + __builtin_memcpy( + static_cast(to), static_cast(from), kBytes); +#endif +} + +// Same as CopyBytes, but for same-sized objects; avoids a size argument. +template +HWY_API void CopySameSize(const From* HWY_RESTRICT from, To* HWY_RESTRICT to) { + static_assert(sizeof(From) == sizeof(To), ""); + CopyBytes(from, to); +} + +template +HWY_API void ZeroBytes(To* to) { +#if HWY_COMPILER_MSVC + memset(to, 0, kBytes); +#else + __builtin_memset(to, 0, kBytes); +#endif +} + +HWY_API float F32FromBF16(bfloat16_t bf) { + uint32_t bits = bf.bits; + bits <<= 16; + float f; + CopySameSize(&bits, &f); + return f; +} + +HWY_API bfloat16_t BF16FromF32(float f) { + uint32_t bits; + CopySameSize(&f, &bits); + bfloat16_t bf; + bf.bits = static_cast(bits >> 16); + return bf; +} + +HWY_DLLEXPORT HWY_NORETURN void HWY_FORMAT(3, 4) + Abort(const char* file, int line, const char* format, ...); + +// Prevents the compiler from eliding the computations that led to "output". +template +HWY_API void PreventElision(T&& output) { +#if HWY_COMPILER_MSVC + // MSVC does not support inline assembly anymore (and never supported GCC's + // RTL constraints). Self-assignment with #pragma optimize("off") might be + // expected to prevent elision, but it does not with MSVC 2015. Type-punning + // with volatile pointers generates inefficient code on MSVC 2017. + static std::atomic> dummy; + dummy.store(output, std::memory_order_relaxed); +#else + // Works by indicating to the compiler that "output" is being read and + // modified. The +r constraint avoids unnecessary writes to memory, but only + // works for built-in types (typically FuncOutput). 
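+  // The "memory" clobber below also discourages the compiler from caching or
+  // reordering memory accesses across this statement.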
+ asm volatile("" : "+r"(output) : : "memory"); +#endif +} + +} // namespace hwy + +#endif // HIGHWAY_HWY_BASE_H_ diff --git a/third_party/highway/hwy/base_test.cc b/third_party/highway/hwy/base_test.cc new file mode 100644 index 0000000000..474b10859b --- /dev/null +++ b/third_party/highway/hwy/base_test.cc @@ -0,0 +1,310 @@ +// Copyright 2019 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "base_test.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep +#include "hwy/highway.h" +#include "hwy/tests/test_util-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +HWY_NOINLINE void TestAllLimits() { + HWY_ASSERT_EQ(uint8_t{0}, LimitsMin()); + HWY_ASSERT_EQ(uint16_t{0}, LimitsMin()); + HWY_ASSERT_EQ(uint32_t{0}, LimitsMin()); + HWY_ASSERT_EQ(uint64_t{0}, LimitsMin()); + + HWY_ASSERT_EQ(int8_t{-128}, LimitsMin()); + HWY_ASSERT_EQ(int16_t{-32768}, LimitsMin()); + HWY_ASSERT_EQ(static_cast(0x80000000u), LimitsMin()); + HWY_ASSERT_EQ(static_cast(0x8000000000000000ull), + LimitsMin()); + + HWY_ASSERT_EQ(uint8_t{0xFF}, LimitsMax()); + HWY_ASSERT_EQ(uint16_t{0xFFFF}, LimitsMax()); + HWY_ASSERT_EQ(uint32_t{0xFFFFFFFFu}, LimitsMax()); + HWY_ASSERT_EQ(uint64_t{0xFFFFFFFFFFFFFFFFull}, LimitsMax()); + + HWY_ASSERT_EQ(int8_t{0x7F}, LimitsMax()); + HWY_ASSERT_EQ(int16_t{0x7FFF}, LimitsMax()); + HWY_ASSERT_EQ(int32_t{0x7FFFFFFFu}, LimitsMax()); + HWY_ASSERT_EQ(int64_t{0x7FFFFFFFFFFFFFFFull}, LimitsMax()); +} + +struct TestLowestHighest { + template + HWY_NOINLINE void operator()(T /*unused*/) const { + HWY_ASSERT_EQ(std::numeric_limits::lowest(), LowestValue()); + HWY_ASSERT_EQ(std::numeric_limits::max(), HighestValue()); + } +}; + +HWY_NOINLINE void TestAllLowestHighest() { ForAllTypes(TestLowestHighest()); } +struct TestIsUnsigned { + template + HWY_NOINLINE void operator()(T /*unused*/) const { + static_assert(!IsFloat(), "Expected !IsFloat"); + static_assert(!IsSigned(), "Expected !IsSigned"); + } +}; + +struct TestIsSigned { + template + HWY_NOINLINE void operator()(T /*unused*/) const { + static_assert(!IsFloat(), "Expected !IsFloat"); + static_assert(IsSigned(), "Expected IsSigned"); + } +}; + +struct TestIsFloat { + template + HWY_NOINLINE void operator()(T /*unused*/) const { + static_assert(IsFloat(), "Expected IsFloat"); + static_assert(IsSigned(), "Floats are also considered signed"); + } +}; + +HWY_NOINLINE void TestAllType() { + ForUnsignedTypes(TestIsUnsigned()); + ForSignedTypes(TestIsSigned()); + ForFloatTypes(TestIsFloat()); + + static_assert(sizeof(MakeUnsigned) == 16, ""); + static_assert(sizeof(MakeWide) == 16, "Expected uint128_t"); + static_assert(sizeof(MakeNarrow) == 8, "Expected uint64_t"); +} + +struct TestIsSame { + template + HWY_NOINLINE void operator()(T /*unused*/) const { + static_assert(IsSame(), "T == T"); + static_assert(!IsSame, MakeUnsigned>(), "S != U"); + static_assert(!IsSame, MakeSigned>(), "U != S"); + } 
+}; + +HWY_NOINLINE void TestAllIsSame() { ForAllTypes(TestIsSame()); } + +HWY_NOINLINE void TestAllBitScan() { + HWY_ASSERT_EQ(size_t{0}, Num0BitsAboveMS1Bit_Nonzero32(0x80000000u)); + HWY_ASSERT_EQ(size_t{0}, Num0BitsAboveMS1Bit_Nonzero32(0xFFFFFFFFu)); + HWY_ASSERT_EQ(size_t{1}, Num0BitsAboveMS1Bit_Nonzero32(0x40000000u)); + HWY_ASSERT_EQ(size_t{1}, Num0BitsAboveMS1Bit_Nonzero32(0x40108210u)); + HWY_ASSERT_EQ(size_t{30}, Num0BitsAboveMS1Bit_Nonzero32(2u)); + HWY_ASSERT_EQ(size_t{30}, Num0BitsAboveMS1Bit_Nonzero32(3u)); + HWY_ASSERT_EQ(size_t{31}, Num0BitsAboveMS1Bit_Nonzero32(1u)); + + HWY_ASSERT_EQ(size_t{0}, + Num0BitsAboveMS1Bit_Nonzero64(0x8000000000000000ull)); + HWY_ASSERT_EQ(size_t{0}, + Num0BitsAboveMS1Bit_Nonzero64(0xFFFFFFFFFFFFFFFFull)); + HWY_ASSERT_EQ(size_t{1}, + Num0BitsAboveMS1Bit_Nonzero64(0x4000000000000000ull)); + HWY_ASSERT_EQ(size_t{1}, + Num0BitsAboveMS1Bit_Nonzero64(0x4010821004200011ull)); + HWY_ASSERT_EQ(size_t{62}, Num0BitsAboveMS1Bit_Nonzero64(2ull)); + HWY_ASSERT_EQ(size_t{62}, Num0BitsAboveMS1Bit_Nonzero64(3ull)); + HWY_ASSERT_EQ(size_t{63}, Num0BitsAboveMS1Bit_Nonzero64(1ull)); + + HWY_ASSERT_EQ(size_t{0}, Num0BitsBelowLS1Bit_Nonzero32(1u)); + HWY_ASSERT_EQ(size_t{1}, Num0BitsBelowLS1Bit_Nonzero32(2u)); + HWY_ASSERT_EQ(size_t{30}, Num0BitsBelowLS1Bit_Nonzero32(0xC0000000u)); + HWY_ASSERT_EQ(size_t{31}, Num0BitsBelowLS1Bit_Nonzero32(0x80000000u)); + + HWY_ASSERT_EQ(size_t{0}, Num0BitsBelowLS1Bit_Nonzero64(1ull)); + HWY_ASSERT_EQ(size_t{1}, Num0BitsBelowLS1Bit_Nonzero64(2ull)); + HWY_ASSERT_EQ(size_t{62}, + Num0BitsBelowLS1Bit_Nonzero64(0xC000000000000000ull)); + HWY_ASSERT_EQ(size_t{63}, + Num0BitsBelowLS1Bit_Nonzero64(0x8000000000000000ull)); +} + +HWY_NOINLINE void TestAllPopCount() { + HWY_ASSERT_EQ(size_t{0}, PopCount(0u)); + HWY_ASSERT_EQ(size_t{1}, PopCount(1u)); + HWY_ASSERT_EQ(size_t{1}, PopCount(2u)); + HWY_ASSERT_EQ(size_t{2}, PopCount(3u)); + HWY_ASSERT_EQ(size_t{1}, PopCount(0x80000000u)); + HWY_ASSERT_EQ(size_t{31}, PopCount(0x7FFFFFFFu)); + HWY_ASSERT_EQ(size_t{32}, PopCount(0xFFFFFFFFu)); + + HWY_ASSERT_EQ(size_t{1}, PopCount(0x80000000ull)); + HWY_ASSERT_EQ(size_t{31}, PopCount(0x7FFFFFFFull)); + HWY_ASSERT_EQ(size_t{32}, PopCount(0xFFFFFFFFull)); + HWY_ASSERT_EQ(size_t{33}, PopCount(0x10FFFFFFFFull)); + HWY_ASSERT_EQ(size_t{63}, PopCount(0xFFFEFFFFFFFFFFFFull)); + HWY_ASSERT_EQ(size_t{64}, PopCount(0xFFFFFFFFFFFFFFFFull)); +} + +template +static HWY_INLINE T TestEndianGetIntegerVal(T val) { + static_assert(!IsFloat() && !IsSpecialFloat(), + "T must not be a floating-point type"); + using TU = MakeUnsigned; + static_assert(sizeof(T) == sizeof(TU), + "sizeof(T) == sizeof(TU) must be true"); + + uint8_t result_bytes[sizeof(T)]; + const TU val_u = static_cast(val); + + for (size_t i = 0; i < sizeof(T); i++) { +#if HWY_IS_BIG_ENDIAN + const size_t shift_amt = (sizeof(T) - 1 - i) * 8; +#else + const size_t shift_amt = i * 8; +#endif + result_bytes[i] = static_cast((val_u >> shift_amt) & 0xFF); + } + + T result; + CopyBytes(result_bytes, &result); + return result; +} + +template +static HWY_INLINE T TestEndianCreateValueFromBytes(Bytes&&... 
bytes) { + static_assert(sizeof(T) > 0, "sizeof(T) > 0 must be true"); + static_assert(sizeof...(Bytes) == sizeof(T), + "sizeof...(Bytes) == sizeof(T) must be true"); + + const uint8_t src_bytes[sizeof(T)]{static_cast(bytes)...}; + + T result; + CopyBytes(src_bytes, &result); + return result; +} + +#define HWY_TEST_ENDIAN_CHECK_INTEGER_VAL(val) \ + HWY_ASSERT_EQ(val, TestEndianGetIntegerVal(val)) + +HWY_NOINLINE void TestAllEndian() { + HWY_TEST_ENDIAN_CHECK_INTEGER_VAL(int8_t{0x01}); + HWY_TEST_ENDIAN_CHECK_INTEGER_VAL(uint8_t{0x01}); + HWY_TEST_ENDIAN_CHECK_INTEGER_VAL(int16_t{0x0102}); + HWY_TEST_ENDIAN_CHECK_INTEGER_VAL(uint16_t{0x0102}); + HWY_TEST_ENDIAN_CHECK_INTEGER_VAL(int32_t{0x01020304}); + HWY_TEST_ENDIAN_CHECK_INTEGER_VAL(uint32_t{0x01020304}); + HWY_TEST_ENDIAN_CHECK_INTEGER_VAL(int64_t{0x0102030405060708}); + HWY_TEST_ENDIAN_CHECK_INTEGER_VAL(uint64_t{0x0102030405060708}); + + HWY_TEST_ENDIAN_CHECK_INTEGER_VAL(int16_t{0x0201}); + HWY_TEST_ENDIAN_CHECK_INTEGER_VAL(uint16_t{0x0201}); + HWY_TEST_ENDIAN_CHECK_INTEGER_VAL(int32_t{0x04030201}); + HWY_TEST_ENDIAN_CHECK_INTEGER_VAL(uint32_t{0x04030201}); + HWY_TEST_ENDIAN_CHECK_INTEGER_VAL(int64_t{0x0807060504030201}); + HWY_TEST_ENDIAN_CHECK_INTEGER_VAL(uint64_t{0x0807060504030201}); + + HWY_ASSERT_EQ(HWY_IS_BIG_ENDIAN ? int16_t{0x0102} : int16_t{0x0201}, + TestEndianCreateValueFromBytes(0x01, 0x02)); + HWY_ASSERT_EQ(HWY_IS_BIG_ENDIAN ? uint16_t{0x0102} : uint16_t{0x0201}, + TestEndianCreateValueFromBytes(0x01, 0x02)); + HWY_ASSERT_EQ( + HWY_IS_BIG_ENDIAN ? int32_t{0x01020304} : int32_t{0x04030201}, + TestEndianCreateValueFromBytes(0x01, 0x02, 0x03, 0x04)); + HWY_ASSERT_EQ( + HWY_IS_BIG_ENDIAN ? uint32_t{0x01020304} : uint32_t{0x04030201}, + TestEndianCreateValueFromBytes(0x01, 0x02, 0x03, 0x04)); + HWY_ASSERT_EQ(HWY_IS_BIG_ENDIAN ? int64_t{0x0102030405060708} + : int64_t{0x0807060504030201}, + TestEndianCreateValueFromBytes( + 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08)); + HWY_ASSERT_EQ(HWY_IS_BIG_ENDIAN ? uint64_t{0x0102030405060708} + : uint64_t{0x0807060504030201}, + TestEndianCreateValueFromBytes( + 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08)); + + HWY_ASSERT_EQ(HWY_IS_BIG_ENDIAN ? int16_t{-0x5EFE} : int16_t{0x02A1}, + TestEndianCreateValueFromBytes(0xA1, 0x02)); + HWY_ASSERT_EQ( + HWY_IS_BIG_ENDIAN ? int32_t{-0x5E4D3CFC} : int32_t{0x04C3B2A1}, + TestEndianCreateValueFromBytes(0xA1, 0xB2, 0xC3, 0x04)); + HWY_ASSERT_EQ(HWY_IS_BIG_ENDIAN ? int64_t{-0x6E5D4C3B2A1908F8} + : int64_t{0x08F7E6D5C4B3A291}, + TestEndianCreateValueFromBytes( + 0x91, 0xA2, 0xB3, 0xC4, 0xD5, 0xE6, 0xF7, 0x08)); + + HWY_ASSERT_EQ(HWY_IS_LITTLE_ENDIAN ? int16_t{-0x5DFF} : int16_t{0x01A2}, + TestEndianCreateValueFromBytes(0x01, 0xA2)); + HWY_ASSERT_EQ( + HWY_IS_LITTLE_ENDIAN ? int32_t{-0x3B4C5DFF} : int32_t{0x01A2B3C4}, + TestEndianCreateValueFromBytes(0x01, 0xA2, 0xB3, 0xC4)); + HWY_ASSERT_EQ(HWY_IS_LITTLE_ENDIAN ? 
int64_t{-0x0718293A4B5C6DFF} + : int64_t{0x0192A3B4C5D6E7F8}, + TestEndianCreateValueFromBytes( + 0x01, 0x92, 0xA3, 0xB4, 0xC5, 0xD6, 0xE7, 0xF8)); + +#if HWY_IS_BIG_ENDIAN + HWY_ASSERT_EQ(1.0f, + TestEndianCreateValueFromBytes(0x3F, 0x80, 0x00, 0x00)); + HWY_ASSERT_EQ(15922433.0f, + TestEndianCreateValueFromBytes(0x4B, 0x72, 0xF5, 0x01)); + HWY_ASSERT_EQ(-12357485.0f, + TestEndianCreateValueFromBytes(0xCB, 0x3C, 0x8F, 0x6D)); +#else + HWY_ASSERT_EQ(1.0f, + TestEndianCreateValueFromBytes(0x00, 0x00, 0x80, 0x3F)); + HWY_ASSERT_EQ(15922433.0f, + TestEndianCreateValueFromBytes(0x01, 0xF5, 0x72, 0x4B)); + HWY_ASSERT_EQ(-12357485.0f, + TestEndianCreateValueFromBytes(0x6D, 0x8F, 0x3C, 0xCB)); +#endif + +#if HWY_HAVE_FLOAT64 +#if HWY_IS_BIG_ENDIAN + HWY_ASSERT_EQ(1.0, TestEndianCreateValueFromBytes( + 0x3F, 0xF0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00)); + HWY_ASSERT_EQ(8707235690688195.0, + TestEndianCreateValueFromBytes(0x43, 0x3E, 0xEF, 0x2F, + 0x4A, 0x51, 0xAE, 0xC3)); + HWY_ASSERT_EQ(-6815854340348452.0, + TestEndianCreateValueFromBytes(0xC3, 0x38, 0x36, 0xFB, + 0xC0, 0xCC, 0x1A, 0x24)); +#else + HWY_ASSERT_EQ(1.0, TestEndianCreateValueFromBytes( + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF0, 0x3F)); + HWY_ASSERT_EQ(8707235690688195.0, + TestEndianCreateValueFromBytes(0xC3, 0xAE, 0x51, 0x4A, + 0x2F, 0xEF, 0x3E, 0x43)); + HWY_ASSERT_EQ(-6815854340348452.0, + TestEndianCreateValueFromBytes(0x24, 0x1A, 0xCC, 0xC0, + 0xFB, 0x36, 0x38, 0xC3)); +#endif // HWY_IS_BIG_ENDIAN +#endif // HWY_HAVE_FLOAT64 +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE + +namespace hwy { +HWY_BEFORE_TEST(BaseTest); +HWY_EXPORT_AND_TEST_P(BaseTest, TestAllLimits); +HWY_EXPORT_AND_TEST_P(BaseTest, TestAllLowestHighest); +HWY_EXPORT_AND_TEST_P(BaseTest, TestAllType); +HWY_EXPORT_AND_TEST_P(BaseTest, TestAllIsSame); +HWY_EXPORT_AND_TEST_P(BaseTest, TestAllBitScan); +HWY_EXPORT_AND_TEST_P(BaseTest, TestAllPopCount); +HWY_EXPORT_AND_TEST_P(BaseTest, TestAllEndian); +} // namespace hwy + +#endif diff --git a/third_party/highway/hwy/cache_control.h b/third_party/highway/hwy/cache_control.h new file mode 100644 index 0000000000..6e7665dd29 --- /dev/null +++ b/third_party/highway/hwy/cache_control.h @@ -0,0 +1,108 @@ +// Copyright 2020 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef HIGHWAY_HWY_CACHE_CONTROL_H_ +#define HIGHWAY_HWY_CACHE_CONTROL_H_ + +#include "hwy/base.h" + +// Requires SSE2; fails to compile on 32-bit Clang 7 (see +// https://github.com/gperftools/gperftools/issues/946). +#if !defined(__SSE2__) || (HWY_COMPILER_CLANG && HWY_ARCH_X86_32) +#undef HWY_DISABLE_CACHE_CONTROL +#define HWY_DISABLE_CACHE_CONTROL +#endif + +// intrin.h is sufficient on MSVC and already included by base.h. 
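+// Other x86 compilers need the SSE headers included below so that _mm_lfence,
+// _mm_sfence, _mm_prefetch, _mm_clflush and _mm_pause are declared.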
+#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL) && !HWY_COMPILER_MSVC +#include // SSE2 +#include // _mm_prefetch +#endif + +namespace hwy { + +// Even if N*sizeof(T) is smaller, Stream may write a multiple of this size. +#define HWY_STREAM_MULTIPLE 16 + +// The following functions may also require an attribute. +#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL) && !HWY_COMPILER_MSVC +#define HWY_ATTR_CACHE __attribute__((target("sse2"))) +#else +#define HWY_ATTR_CACHE +#endif + +// Windows.h #defines this, which causes infinite recursion. Temporarily +// undefine to avoid conflict with our function. +// TODO(janwas): remove when this function is removed. +#pragma push_macro("LoadFence") +#undef LoadFence + +// Delays subsequent loads until prior loads are visible. Beware of potentially +// differing behavior across architectures and vendors: on Intel but not +// AMD CPUs, also serves as a full fence (waits for all prior instructions to +// complete). +HWY_INLINE HWY_ATTR_CACHE void LoadFence() { +#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL) + _mm_lfence(); +#endif +} + +// TODO(janwas): remove when this function is removed. (See above.) +#pragma pop_macro("LoadFence") + +// Ensures values written by previous `Stream` calls are visible on the current +// core. This is NOT sufficient for synchronizing across cores; when `Stream` +// outputs are to be consumed by other core(s), the producer must publish +// availability (e.g. via mutex or atomic_flag) after `FlushStream`. +HWY_INLINE HWY_ATTR_CACHE void FlushStream() { +#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL) + _mm_sfence(); +#endif +} + +// Optionally begins loading the cache line containing "p" to reduce latency of +// subsequent actual loads. +template +HWY_INLINE HWY_ATTR_CACHE void Prefetch(const T* p) { +#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL) + _mm_prefetch(reinterpret_cast(p), _MM_HINT_T0); +#elif HWY_COMPILER_GCC // includes clang + // Hint=0 (NTA) behavior differs, but skipping outer caches is probably not + // desirable, so use the default 3 (keep in caches). + __builtin_prefetch(p, /*write=*/0, /*hint=*/3); +#else + (void)p; +#endif +} + +// Invalidates and flushes the cache line containing "p", if possible. +HWY_INLINE HWY_ATTR_CACHE void FlushCacheline(const void* p) { +#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL) + _mm_clflush(p); +#else + (void)p; +#endif +} + +// When called inside a spin-loop, may reduce power consumption. +HWY_INLINE HWY_ATTR_CACHE void Pause() { +#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL) + _mm_pause(); +#endif +} + +} // namespace hwy + +#endif // HIGHWAY_HWY_CACHE_CONTROL_H_ diff --git a/third_party/highway/hwy/contrib/algo/copy-inl.h b/third_party/highway/hwy/contrib/algo/copy-inl.h new file mode 100644 index 0000000000..033cf8a626 --- /dev/null +++ b/third_party/highway/hwy/contrib/algo/copy-inl.h @@ -0,0 +1,136 @@ +// Copyright 2022 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +// Per-target include guard +#if defined(HIGHWAY_HWY_CONTRIB_ALGO_COPY_INL_H_) == \ + defined(HWY_TARGET_TOGGLE) +#ifdef HIGHWAY_HWY_CONTRIB_ALGO_COPY_INL_H_ +#undef HIGHWAY_HWY_CONTRIB_ALGO_COPY_INL_H_ +#else +#define HIGHWAY_HWY_CONTRIB_ALGO_COPY_INL_H_ +#endif + +#include "hwy/highway.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +// These functions avoid having to write a loop plus remainder handling in the +// (unfortunately still common) case where arrays are not aligned/padded. If the +// inputs are known to be aligned/padded, it is more efficient to write a single +// loop using Load(). We do not provide a CopyAlignedPadded because it +// would be more verbose than such a loop. + +// Fills `to`[0, `count`) with `value`. +template > +void Fill(D d, T value, size_t count, T* HWY_RESTRICT to) { + const size_t N = Lanes(d); + const Vec v = Set(d, value); + + size_t idx = 0; + for (; idx + N <= count; idx += N) { + StoreU(v, d, to + idx); + } + + // `count` was a multiple of the vector length `N`: already done. + if (HWY_UNLIKELY(idx == count)) return; + + const size_t remaining = count - idx; + HWY_DASSERT(0 != remaining && remaining < N); + SafeFillN(remaining, value, d, to + idx); +} + +// Copies `from`[0, `count`) to `to`, which must not overlap `from`. +template > +void Copy(D d, const T* HWY_RESTRICT from, size_t count, T* HWY_RESTRICT to) { + const size_t N = Lanes(d); + + size_t idx = 0; + for (; idx + N <= count; idx += N) { + const Vec v = LoadU(d, from + idx); + StoreU(v, d, to + idx); + } + + // `count` was a multiple of the vector length `N`: already done. + if (HWY_UNLIKELY(idx == count)) return; + + const size_t remaining = count - idx; + HWY_DASSERT(0 != remaining && remaining < N); + SafeCopyN(remaining, d, from + idx, to + idx); +} + +// For idx in [0, count) in ascending order, appends `from[idx]` to `to` if the +// corresponding mask element of `func(d, v)` is true. Returns the STL-style end +// of the newly written elements in `to`. +// +// `func` is either a functor with a templated operator()(d, v) returning a +// mask, or a generic lambda if using C++14. Due to apparent limitations of +// Clang on Windows, it is currently necessary to add HWY_ATTR before the +// opening { of the lambda to avoid errors about "function .. requires target". +// +// NOTE: this is only supported for 16-, 32- or 64-bit types. +// NOTE: Func may be called a second time for elements it has already seen, but +// these elements will not be written to `to` again. +template > +T* CopyIf(D d, const T* HWY_RESTRICT from, size_t count, T* HWY_RESTRICT to, + const Func& func) { + const size_t N = Lanes(d); + + size_t idx = 0; + for (; idx + N <= count; idx += N) { + const Vec v = LoadU(d, from + idx); + to += CompressBlendedStore(v, func(d, v), d, to); + } + + // `count` was a multiple of the vector length `N`: already done. + if (HWY_UNLIKELY(idx == count)) return to; + +#if HWY_MEM_OPS_MIGHT_FAULT + // Proceed one by one. 
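+  // (On these targets, masked loads/stores of a full vector may fault even on
+  // masked-off lanes, so single-lane vectors never touch memory past the end.)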
+ const CappedTag d1; + for (; idx < count; ++idx) { + using V1 = Vec; + // Workaround for -Waggressive-loop-optimizations on GCC 8 + // (iteration 2305843009213693951 invokes undefined behavior for T=i64) + const uintptr_t addr = reinterpret_cast(from); + const T* HWY_RESTRICT from_idx = + reinterpret_cast(addr + (idx * sizeof(T))); + const V1 v = LoadU(d1, from_idx); + // Avoid storing to `to` unless we know it should be kept - otherwise, we + // might overrun the end if it was allocated for the exact count. + if (CountTrue(d1, func(d1, v)) == 0) continue; + StoreU(v, d1, to); + to += 1; + } +#else + // Start index of the last unaligned whole vector, ending at the array end. + const size_t last = count - N; + // Number of elements before `from` or already written. + const size_t invalid = idx - last; + HWY_DASSERT(0 != invalid && invalid < N); + const Mask mask = Not(FirstN(d, invalid)); + const Vec v = MaskedLoad(mask, d, from + last); + to += CompressBlendedStore(v, And(mask, func(d, v)), d, to); +#endif + return to; +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#endif // HIGHWAY_HWY_CONTRIB_ALGO_COPY_INL_H_ diff --git a/third_party/highway/hwy/contrib/algo/copy_test.cc b/third_party/highway/hwy/contrib/algo/copy_test.cc new file mode 100644 index 0000000000..c74f6e9bd7 --- /dev/null +++ b/third_party/highway/hwy/contrib/algo/copy_test.cc @@ -0,0 +1,204 @@ +// Copyright 2022 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "hwy/aligned_allocator.h" + +// clang-format off +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "hwy/contrib/algo/copy_test.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep +#include "hwy/highway.h" +#include "hwy/contrib/algo/copy-inl.h" +#include "hwy/tests/test_util-inl.h" +// clang-format on + +// If your project requires C++14 or later, you can ignore this and pass lambdas +// directly to Transform, without requiring an lvalue as we do here for C++11. +#if __cplusplus < 201402L +#define HWY_GENERIC_LAMBDA 0 +#else +#define HWY_GENERIC_LAMBDA 1 +#endif + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +// Returns random integer in [0, 128), which fits in any lane type. +template +T Random7Bit(RandomState& rng) { + return static_cast(Random32(&rng) & 127); +} + +// In C++14, we can instead define these as generic lambdas next to where they +// are invoked. +#if !HWY_GENERIC_LAMBDA + +struct IsOdd { + template + Mask operator()(D d, V v) const { + return TestBit(v, Set(d, TFromD{1})); + } +}; + +#endif // !HWY_GENERIC_LAMBDA + +// Invokes Test (e.g. TestCopyIf) with all arg combinations. T comes from +// ForFloatTypes. 
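+// Each Test functor is invoked for counts in [0, 2N) and for several
+// misalignments of both the source and destination arrays.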
+template +struct ForeachCountAndMisalign { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) const { + RandomState rng; + const size_t N = Lanes(d); + const size_t misalignments[3] = {0, N / 4, 3 * N / 5}; + + for (size_t count = 0; count < 2 * N; ++count) { + for (size_t ma : misalignments) { + for (size_t mb : misalignments) { + Test()(d, count, ma, mb, rng); + } + } + } + } +}; + +struct TestFill { + template + void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b, + RandomState& rng) { + using T = TFromD; + // HWY_MAX prevents error when misalign == count == 0. + AlignedFreeUniquePtr pa = + AllocateAligned(HWY_MAX(1, misalign_a + count)); + AlignedFreeUniquePtr pb = AllocateAligned(misalign_b + count + 1); + HWY_ASSERT(pa && pb); + T* expected = pa.get() + misalign_a; + const T value = Random7Bit(rng); + for (size_t i = 0; i < count; ++i) { + expected[i] = value; + } + T* actual = pb.get() + misalign_b; + + actual[count] = T{0}; // sentinel + Fill(d, value, count, actual); + HWY_ASSERT_EQ(T{0}, actual[count]); // did not write past end + + const auto info = hwy::detail::MakeTypeInfo(); + const char* target_name = hwy::TargetName(HWY_TARGET); + hwy::detail::AssertArrayEqual(info, expected, actual, count, target_name, + __FILE__, __LINE__); + } +}; + +void TestAllFill() { + ForAllTypes(ForPartialVectors>()); +} + +struct TestCopy { + template + void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b, + RandomState& rng) { + using T = TFromD; + // Prevents error if size to allocate is zero. + AlignedFreeUniquePtr pa = + AllocateAligned(HWY_MAX(1, misalign_a + count)); + AlignedFreeUniquePtr pb = + AllocateAligned(HWY_MAX(1, misalign_b + count)); + HWY_ASSERT(pa && pb); + T* a = pa.get() + misalign_a; + for (size_t i = 0; i < count; ++i) { + a[i] = Random7Bit(rng); + } + T* b = pb.get() + misalign_b; + + Copy(d, a, count, b); + + const auto info = hwy::detail::MakeTypeInfo(); + const char* target_name = hwy::TargetName(HWY_TARGET); + hwy::detail::AssertArrayEqual(info, a, b, count, target_name, __FILE__, + __LINE__); + } +}; + +void TestAllCopy() { + ForAllTypes(ForPartialVectors>()); +} + +struct TestCopyIf { + template + void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b, + RandomState& rng) { + using T = TFromD; + const size_t padding = Lanes(ScalableTag()); + + // Prevents error if size to allocate is zero. 
+ AlignedFreeUniquePtr pa = + AllocateAligned(HWY_MAX(1, misalign_a + count)); + AlignedFreeUniquePtr pb = + AllocateAligned(HWY_MAX(1, misalign_b + count + padding)); + AlignedFreeUniquePtr expected = AllocateAligned(HWY_MAX(1, count)); + HWY_ASSERT(pa && pb && expected); + + T* a = pa.get() + misalign_a; + for (size_t i = 0; i < count; ++i) { + a[i] = Random7Bit(rng); + } + T* b = pb.get() + misalign_b; + + size_t num_odd = 0; + for (size_t i = 0; i < count; ++i) { + if (a[i] & 1) { + expected[num_odd++] = a[i]; + } + } + +#if HWY_GENERIC_LAMBDA + const auto is_odd = [](const auto d, const auto v) HWY_ATTR { + return TestBit(v, Set(d, TFromD{1})); + }; +#else + const IsOdd is_odd; +#endif + T* end = CopyIf(d, a, count, b, is_odd); + const size_t num_written = static_cast(end - b); + HWY_ASSERT_EQ(num_odd, num_written); + + const auto info = hwy::detail::MakeTypeInfo(); + const char* target_name = hwy::TargetName(HWY_TARGET); + hwy::detail::AssertArrayEqual(info, expected.get(), b, num_odd, target_name, + __FILE__, __LINE__); + } +}; + +void TestAllCopyIf() { + ForUI163264(ForPartialVectors>()); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE + +namespace hwy { +HWY_BEFORE_TEST(CopyTest); +HWY_EXPORT_AND_TEST_P(CopyTest, TestAllFill); +HWY_EXPORT_AND_TEST_P(CopyTest, TestAllCopy); +HWY_EXPORT_AND_TEST_P(CopyTest, TestAllCopyIf); +} // namespace hwy + +#endif diff --git a/third_party/highway/hwy/contrib/algo/find-inl.h b/third_party/highway/hwy/contrib/algo/find-inl.h new file mode 100644 index 0000000000..388842e988 --- /dev/null +++ b/third_party/highway/hwy/contrib/algo/find-inl.h @@ -0,0 +1,109 @@ +// Copyright 2022 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Per-target include guard +#if defined(HIGHWAY_HWY_CONTRIB_ALGO_FIND_INL_H_) == \ + defined(HWY_TARGET_TOGGLE) +#ifdef HIGHWAY_HWY_CONTRIB_ALGO_FIND_INL_H_ +#undef HIGHWAY_HWY_CONTRIB_ALGO_FIND_INL_H_ +#else +#define HIGHWAY_HWY_CONTRIB_ALGO_FIND_INL_H_ +#endif + +#include "hwy/highway.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +// Returns index of the first element equal to `value` in `in[0, count)`, or +// `count` if not found. +template > +size_t Find(D d, T value, const T* HWY_RESTRICT in, size_t count) { + const size_t N = Lanes(d); + const Vec broadcasted = Set(d, value); + + size_t i = 0; + for (; i + N <= count; i += N) { + const intptr_t pos = FindFirstTrue(d, Eq(broadcasted, LoadU(d, in + i))); + if (pos >= 0) return i + static_cast(pos); + } + + if (i != count) { +#if HWY_MEM_OPS_MIGHT_FAULT + // Scan single elements. 
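+    // Re-broadcast the search value into a single-lane vector; one-element
+    // loads cannot read past the end of `in`.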
+ const CappedTag d1; + using V1 = Vec; + const V1 broadcasted1 = Set(d1, GetLane(broadcasted)); + for (; i < count; ++i) { + if (AllTrue(d1, Eq(broadcasted1, LoadU(d1, in + i)))) { + return i; + } + } +#else + const size_t remaining = count - i; + HWY_DASSERT(0 != remaining && remaining < N); + const Mask mask = FirstN(d, remaining); + const Vec v = MaskedLoad(mask, d, in + i); + // Apply mask so that we don't 'find' the zero-padding from MaskedLoad. + const intptr_t pos = FindFirstTrue(d, And(Eq(broadcasted, v), mask)); + if (pos >= 0) return i + static_cast(pos); +#endif // HWY_MEM_OPS_MIGHT_FAULT + } + + return count; // not found +} + +// Returns index of the first element in `in[0, count)` for which `func(d, vec)` +// returns true, otherwise `count`. +template > +size_t FindIf(D d, const T* HWY_RESTRICT in, size_t count, const Func& func) { + const size_t N = Lanes(d); + + size_t i = 0; + for (; i + N <= count; i += N) { + const intptr_t pos = FindFirstTrue(d, func(d, LoadU(d, in + i))); + if (pos >= 0) return i + static_cast(pos); + } + + if (i != count) { +#if HWY_MEM_OPS_MIGHT_FAULT + // Scan single elements. + const CappedTag d1; + for (; i < count; ++i) { + if (AllTrue(d1, func(d1, LoadU(d1, in + i)))) { + return i; + } + } +#else + const size_t remaining = count - i; + HWY_DASSERT(0 != remaining && remaining < N); + const Mask mask = FirstN(d, remaining); + const Vec v = MaskedLoad(mask, d, in + i); + // Apply mask so that we don't 'find' the zero-padding from MaskedLoad. + const intptr_t pos = FindFirstTrue(d, And(func(d, v), mask)); + if (pos >= 0) return i + static_cast(pos); +#endif // HWY_MEM_OPS_MIGHT_FAULT + } + + return count; // not found +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#endif // HIGHWAY_HWY_CONTRIB_ALGO_FIND_INL_H_ diff --git a/third_party/highway/hwy/contrib/algo/find_test.cc b/third_party/highway/hwy/contrib/algo/find_test.cc new file mode 100644 index 0000000000..96f1fd640c --- /dev/null +++ b/third_party/highway/hwy/contrib/algo/find_test.cc @@ -0,0 +1,223 @@ +// Copyright 2022 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include // std::find_if +#include + +#include "hwy/aligned_allocator.h" +#include "hwy/base.h" +#include "hwy/print.h" + +// clang-format off +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "hwy/contrib/algo/find_test.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep +#include "hwy/highway.h" +#include "hwy/contrib/algo/find-inl.h" +#include "hwy/tests/test_util-inl.h" +// clang-format on + +// If your project requires C++14 or later, you can ignore this and pass lambdas +// directly to FindIf, without requiring an lvalue as we do here for C++11. 
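+//
+// For example, code inside HWY_NAMESPACE could find the first negative element
+// like so (a sketch only; `in` and `count` are caller-provided):
+//   const size_t pos = FindIf(d, in, count,
+//       [](const auto d, const auto v) HWY_ATTR { return Lt(v, Zero(d)); });
+// which returns `count` if no element matches.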
+#if __cplusplus < 201402L +#define HWY_GENERIC_LAMBDA 0 +#else +#define HWY_GENERIC_LAMBDA 1 +#endif + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +// Returns random number in [-8, 8) - we use knowledge of the range to Find() +// values we know are not present. +template +T Random(RandomState& rng) { + const int32_t bits = static_cast(Random32(&rng)) & 1023; + const double val = (bits - 512) / 64.0; + // Clamp negative to zero for unsigned types. + return static_cast(HWY_MAX(hwy::LowestValue(), val)); +} + +// In C++14, we can instead define these as generic lambdas next to where they +// are invoked. +#if !HWY_GENERIC_LAMBDA + +class GreaterThan { + public: + GreaterThan(int val) : val_(val) {} + template + Mask operator()(D d, V v) const { + return Gt(v, Set(d, static_cast>(val_))); + } + + private: + int val_; +}; + +#endif // !HWY_GENERIC_LAMBDA + +// Invokes Test (e.g. TestFind) with all arg combinations. +template +struct ForeachCountAndMisalign { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) const { + RandomState rng; + const size_t N = Lanes(d); + const size_t misalignments[3] = {0, N / 4, 3 * N / 5}; + + // Find() checks 8 vectors at a time, so we want to cover a fairly large + // range without oversampling (checking every possible count). + std::vector counts(AdjustedReps(512)); + for (size_t& count : counts) { + count = static_cast(rng()) % (16 * N + 1); + } + counts[0] = 0; // ensure we test count=0. + + for (size_t count : counts) { + for (size_t m : misalignments) { + Test()(d, count, m, rng); + } + } + } +}; + +struct TestFind { + template + void operator()(D d, size_t count, size_t misalign, RandomState& rng) { + using T = TFromD; + // Must allocate at least one even if count is zero. + AlignedFreeUniquePtr storage = + AllocateAligned(HWY_MAX(1, misalign + count)); + HWY_ASSERT(storage); + T* in = storage.get() + misalign; + for (size_t i = 0; i < count; ++i) { + in[i] = Random(rng); + } + + // For each position, search for that element (which we know is there) + for (size_t pos = 0; pos < count; ++pos) { + const size_t actual = Find(d, in[pos], in, count); + + // We may have found an earlier occurrence of the same value; ensure the + // value is the same, and that it is the first. + if (!IsEqual(in[pos], in[actual])) { + fprintf(stderr, "%s count %d, found %.15f at %d but wanted %.15f\n", + hwy::TypeName(T(), Lanes(d)).c_str(), static_cast(count), + static_cast(in[actual]), static_cast(actual), + static_cast(in[pos])); + HWY_ASSERT(false); + } + for (size_t i = 0; i < actual; ++i) { + if (IsEqual(in[i], in[pos])) { + fprintf(stderr, "%s count %d, found %f at %d but Find returned %d\n", + hwy::TypeName(T(), Lanes(d)).c_str(), static_cast(count), + static_cast(in[i]), static_cast(i), + static_cast(actual)); + HWY_ASSERT(false); + } + } + } + + // Also search for values we know not to be present (out of range) + HWY_ASSERT_EQ(count, Find(d, T{9}, in, count)); + HWY_ASSERT_EQ(count, Find(d, static_cast(-9), in, count)); + } +}; + +void TestAllFind() { + ForAllTypes(ForPartialVectors>()); +} + +struct TestFindIf { + template + void operator()(D d, size_t count, size_t misalign, RandomState& rng) { + using T = TFromD; + using TI = MakeSigned; + // Must allocate at least one even if count is zero. 
+ AlignedFreeUniquePtr storage = + AllocateAligned(HWY_MAX(1, misalign + count)); + HWY_ASSERT(storage); + T* in = storage.get() + misalign; + for (size_t i = 0; i < count; ++i) { + in[i] = Random(rng); + HWY_ASSERT(in[i] < 8); + HWY_ASSERT(!hwy::IsSigned() || static_cast(in[i]) >= -8); + } + + bool found_any = false; + bool not_found_any = false; + + // unsigned T would be promoted to signed and compare greater than any + // negative val, whereas Set() would just cast to an unsigned value and the + // comparison remains unsigned, so avoid negative numbers there. + const int min_val = IsSigned() ? -9 : 0; + // Includes out-of-range value 9 to test the not-found path. + for (int val = min_val; val <= 9; ++val) { +#if HWY_GENERIC_LAMBDA + const auto greater = [val](const auto d, const auto v) HWY_ATTR { + return Gt(v, Set(d, static_cast(val))); + }; +#else + const GreaterThan greater(val); +#endif + const size_t actual = FindIf(d, in, count, greater); + found_any |= actual < count; + not_found_any |= actual == count; + + const auto pos = std::find_if( + in, in + count, [val](T x) { return x > static_cast(val); }); + // Convert returned iterator to index. + const size_t expected = static_cast(pos - in); + if (expected != actual) { + fprintf(stderr, "%s count %d val %d, expected %d actual %d\n", + hwy::TypeName(T(), Lanes(d)).c_str(), static_cast(count), + val, static_cast(expected), static_cast(actual)); + hwy::detail::PrintArray(hwy::detail::MakeTypeInfo(), "in", in, count, + 0, count); + HWY_ASSERT(false); + } + } + + // We will always not-find something due to val=9. + HWY_ASSERT(not_found_any); + // We'll find something unless the input is empty or {0} - because 0 > i + // is false for all i=[0,9]. + if (count != 0 && in[0] != 0) { + HWY_ASSERT(found_any); + } + } +}; + +void TestAllFindIf() { + ForAllTypes(ForPartialVectors>()); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE + +namespace hwy { +HWY_BEFORE_TEST(FindTest); +HWY_EXPORT_AND_TEST_P(FindTest, TestAllFind); +HWY_EXPORT_AND_TEST_P(FindTest, TestAllFindIf); +} // namespace hwy + +#endif diff --git a/third_party/highway/hwy/contrib/algo/transform-inl.h b/third_party/highway/hwy/contrib/algo/transform-inl.h new file mode 100644 index 0000000000..3e830acb47 --- /dev/null +++ b/third_party/highway/hwy/contrib/algo/transform-inl.h @@ -0,0 +1,262 @@ +// Copyright 2022 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +// Per-target include guard +#if defined(HIGHWAY_HWY_CONTRIB_ALGO_TRANSFORM_INL_H_) == \ + defined(HWY_TARGET_TOGGLE) +#ifdef HIGHWAY_HWY_CONTRIB_ALGO_TRANSFORM_INL_H_ +#undef HIGHWAY_HWY_CONTRIB_ALGO_TRANSFORM_INL_H_ +#else +#define HIGHWAY_HWY_CONTRIB_ALGO_TRANSFORM_INL_H_ +#endif + +#include "hwy/highway.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +// These functions avoid having to write a loop plus remainder handling in the +// (unfortunately still common) case where arrays are not aligned/padded. If the +// inputs are known to be aligned/padded, it is more efficient to write a single +// loop using Load(). We do not provide a TransformAlignedPadded because it +// would be more verbose than such a loop. +// +// Func is either a functor with a templated operator()(d, v[, v1[, v2]]), or a +// generic lambda if using C++14. Due to apparent limitations of Clang on +// Windows, it is currently necessary to add HWY_ATTR before the opening { of +// the lambda to avoid errors about "always_inline function .. requires target". +// +// If HWY_MEM_OPS_MIGHT_FAULT, we use scalar code instead of masking. Otherwise, +// we used `MaskedLoad` and `BlendedStore` to read/write the final partial +// vector. + +// Fills `out[0, count)` with the vectors returned by `func(d, index_vec)`, +// where `index_vec` is `Vec>`. On the first call to `func`, +// the value of its lane i is i, and increases by `Lanes(d)` after every call. +// Note that some of these indices may be `>= count`, but the elements that +// `func` returns in those lanes will not be written to `out`. +template > +void Generate(D d, T* HWY_RESTRICT out, size_t count, const Func& func) { + const RebindToUnsigned du; + using TU = TFromD; + const size_t N = Lanes(d); + + size_t idx = 0; + Vec vidx = Iota(du, 0); + for (; idx + N <= count; idx += N) { + StoreU(func(d, vidx), d, out + idx); + vidx = Add(vidx, Set(du, static_cast(N))); + } + + // `count` was a multiple of the vector length `N`: already done. + if (HWY_UNLIKELY(idx == count)) return; + +#if HWY_MEM_OPS_MIGHT_FAULT + // Proceed one by one. + const CappedTag d1; + const RebindToUnsigned du1; + for (; idx < count; ++idx) { + StoreU(func(d1, Set(du1, static_cast(idx))), d1, out + idx); + } +#else + const size_t remaining = count - idx; + HWY_DASSERT(0 != remaining && remaining < N); + const Mask mask = FirstN(d, remaining); + BlendedStore(func(d, vidx), mask, d, out + idx); +#endif +} + +// Replaces `inout[idx]` with `func(d, inout[idx])`. Example usage: multiplying +// array elements by a constant. +template > +void Transform(D d, T* HWY_RESTRICT inout, size_t count, const Func& func) { + const size_t N = Lanes(d); + + size_t idx = 0; + for (; idx + N <= count; idx += N) { + const Vec v = LoadU(d, inout + idx); + StoreU(func(d, v), d, inout + idx); + } + + // `count` was a multiple of the vector length `N`: already done. + if (HWY_UNLIKELY(idx == count)) return; + +#if HWY_MEM_OPS_MIGHT_FAULT + // Proceed one by one. + const CappedTag d1; + for (; idx < count; ++idx) { + using V1 = Vec; + const V1 v = LoadU(d1, inout + idx); + StoreU(func(d1, v), d1, inout + idx); + } +#else + const size_t remaining = count - idx; + HWY_DASSERT(0 != remaining && remaining < N); + const Mask mask = FirstN(d, remaining); + const Vec v = MaskedLoad(mask, d, inout + idx); + BlendedStore(func(d, v), mask, d, inout + idx); +#endif +} + +// Replaces `inout[idx]` with `func(d, inout[idx], in1[idx])`. Example usage: +// multiplying array elements by those of another array. 
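+// For instance, elementwise multiplication of `inout` by `in1` (a sketch of a
+// call site, assuming C++14 generic lambdas):
+//   Transform1(d, inout, count, in1,
+//              [](const auto d, const auto v, const auto v1)
+//                  HWY_ATTR { return Mul(v, v1); });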
+template > +void Transform1(D d, T* HWY_RESTRICT inout, size_t count, + const T* HWY_RESTRICT in1, const Func& func) { + const size_t N = Lanes(d); + + size_t idx = 0; + for (; idx + N <= count; idx += N) { + const Vec v = LoadU(d, inout + idx); + const Vec v1 = LoadU(d, in1 + idx); + StoreU(func(d, v, v1), d, inout + idx); + } + + // `count` was a multiple of the vector length `N`: already done. + if (HWY_UNLIKELY(idx == count)) return; + +#if HWY_MEM_OPS_MIGHT_FAULT + // Proceed one by one. + const CappedTag d1; + for (; idx < count; ++idx) { + using V1 = Vec; + const V1 v = LoadU(d1, inout + idx); + const V1 v1 = LoadU(d1, in1 + idx); + StoreU(func(d1, v, v1), d1, inout + idx); + } +#else + const size_t remaining = count - idx; + HWY_DASSERT(0 != remaining && remaining < N); + const Mask mask = FirstN(d, remaining); + const Vec v = MaskedLoad(mask, d, inout + idx); + const Vec v1 = MaskedLoad(mask, d, in1 + idx); + BlendedStore(func(d, v, v1), mask, d, inout + idx); +#endif +} + +// Replaces `inout[idx]` with `func(d, inout[idx], in1[idx], in2[idx])`. Example +// usage: FMA of elements from three arrays, stored into the first array. +template > +void Transform2(D d, T* HWY_RESTRICT inout, size_t count, + const T* HWY_RESTRICT in1, const T* HWY_RESTRICT in2, + const Func& func) { + const size_t N = Lanes(d); + + size_t idx = 0; + for (; idx + N <= count; idx += N) { + const Vec v = LoadU(d, inout + idx); + const Vec v1 = LoadU(d, in1 + idx); + const Vec v2 = LoadU(d, in2 + idx); + StoreU(func(d, v, v1, v2), d, inout + idx); + } + + // `count` was a multiple of the vector length `N`: already done. + if (HWY_UNLIKELY(idx == count)) return; + +#if HWY_MEM_OPS_MIGHT_FAULT + // Proceed one by one. + const CappedTag d1; + for (; idx < count; ++idx) { + using V1 = Vec; + const V1 v = LoadU(d1, inout + idx); + const V1 v1 = LoadU(d1, in1 + idx); + const V1 v2 = LoadU(d1, in2 + idx); + StoreU(func(d1, v, v1, v2), d1, inout + idx); + } +#else + const size_t remaining = count - idx; + HWY_DASSERT(0 != remaining && remaining < N); + const Mask mask = FirstN(d, remaining); + const Vec v = MaskedLoad(mask, d, inout + idx); + const Vec v1 = MaskedLoad(mask, d, in1 + idx); + const Vec v2 = MaskedLoad(mask, d, in2 + idx); + BlendedStore(func(d, v, v1, v2), mask, d, inout + idx); +#endif +} + +template > +void Replace(D d, T* HWY_RESTRICT inout, size_t count, T new_t, T old_t) { + const size_t N = Lanes(d); + const Vec old_v = Set(d, old_t); + const Vec new_v = Set(d, new_t); + + size_t idx = 0; + for (; idx + N <= count; idx += N) { + Vec v = LoadU(d, inout + idx); + StoreU(IfThenElse(Eq(v, old_v), new_v, v), d, inout + idx); + } + + // `count` was a multiple of the vector length `N`: already done. + if (HWY_UNLIKELY(idx == count)) return; + +#if HWY_MEM_OPS_MIGHT_FAULT + // Proceed one by one. 
+ const CappedTag d1; + const Vec old_v1 = Set(d1, old_t); + const Vec new_v1 = Set(d1, new_t); + for (; idx < count; ++idx) { + using V1 = Vec; + const V1 v1 = LoadU(d1, inout + idx); + StoreU(IfThenElse(Eq(v1, old_v1), new_v1, v1), d1, inout + idx); + } +#else + const size_t remaining = count - idx; + HWY_DASSERT(0 != remaining && remaining < N); + const Mask mask = FirstN(d, remaining); + const Vec v = MaskedLoad(mask, d, inout + idx); + BlendedStore(IfThenElse(Eq(v, old_v), new_v, v), mask, d, inout + idx); +#endif +} + +template > +void ReplaceIf(D d, T* HWY_RESTRICT inout, size_t count, T new_t, + const Func& func) { + const size_t N = Lanes(d); + const Vec new_v = Set(d, new_t); + + size_t idx = 0; + for (; idx + N <= count; idx += N) { + Vec v = LoadU(d, inout + idx); + StoreU(IfThenElse(func(d, v), new_v, v), d, inout + idx); + } + + // `count` was a multiple of the vector length `N`: already done. + if (HWY_UNLIKELY(idx == count)) return; + +#if HWY_MEM_OPS_MIGHT_FAULT + // Proceed one by one. + const CappedTag d1; + const Vec new_v1 = Set(d1, new_t); + for (; idx < count; ++idx) { + using V1 = Vec; + const V1 v = LoadU(d1, inout + idx); + StoreU(IfThenElse(func(d1, v), new_v1, v), d1, inout + idx); + } +#else + const size_t remaining = count - idx; + HWY_DASSERT(0 != remaining && remaining < N); + const Mask mask = FirstN(d, remaining); + const Vec v = MaskedLoad(mask, d, inout + idx); + BlendedStore(IfThenElse(func(d, v), new_v, v), mask, d, inout + idx); +#endif +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#endif // HIGHWAY_HWY_CONTRIB_ALGO_TRANSFORM_INL_H_ diff --git a/third_party/highway/hwy/contrib/algo/transform_test.cc b/third_party/highway/hwy/contrib/algo/transform_test.cc new file mode 100644 index 0000000000..4726e82419 --- /dev/null +++ b/third_party/highway/hwy/contrib/algo/transform_test.cc @@ -0,0 +1,381 @@ +// Copyright 2022 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include // memcpy + +#include + +#include "hwy/aligned_allocator.h" + +// clang-format off +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "hwy/contrib/algo/transform_test.cc" //NOLINT +#include "hwy/foreach_target.h" // IWYU pragma: keep +#include "hwy/highway.h" +#include "hwy/contrib/algo/transform-inl.h" +#include "hwy/tests/test_util-inl.h" +// clang-format on + +// If your project requires C++14 or later, you can ignore this and pass lambdas +// directly to Transform, without requiring an lvalue as we do here for C++11. +#if __cplusplus < 201402L +#define HWY_GENERIC_LAMBDA 0 +#else +#define HWY_GENERIC_LAMBDA 1 +#endif + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +template +T Alpha() { + return static_cast(1.5); // arbitrary scalar +} + +// Returns random floating-point number in [-8, 8) to ensure computations do +// not exceed float32 precision. 
+template +T Random(RandomState& rng) { + const int32_t bits = static_cast(Random32(&rng)) & 1023; + const double val = (bits - 512) / 64.0; + // Clamp negative to zero for unsigned types. + return static_cast(HWY_MAX(hwy::LowestValue(), val)); +} + +// SCAL, AXPY names are from BLAS. +template +HWY_NOINLINE void SimpleSCAL(const T* x, T* out, size_t count) { + for (size_t i = 0; i < count; ++i) { + out[i] = Alpha() * x[i]; + } +} + +template +HWY_NOINLINE void SimpleAXPY(const T* x, const T* y, T* out, size_t count) { + for (size_t i = 0; i < count; ++i) { + out[i] = Alpha() * x[i] + y[i]; + } +} + +template +HWY_NOINLINE void SimpleFMA4(const T* x, const T* y, const T* z, T* out, + size_t count) { + for (size_t i = 0; i < count; ++i) { + out[i] = x[i] * y[i] + z[i]; + } +} + +// In C++14, we can instead define these as generic lambdas next to where they +// are invoked. +#if !HWY_GENERIC_LAMBDA + +// Generator that returns even numbers by doubling the output indices. +struct Gen2 { + template + Vec operator()(D d, VU vidx) const { + return BitCast(d, Add(vidx, vidx)); + } +}; + +struct SCAL { + template + Vec operator()(D d, V v) const { + using T = TFromD; + return Mul(Set(d, Alpha()), v); + } +}; + +struct AXPY { + template + Vec operator()(D d, V v, V v1) const { + using T = TFromD; + return MulAdd(Set(d, Alpha()), v, v1); + } +}; + +struct FMA4 { + template + Vec operator()(D /*d*/, V v, V v1, V v2) const { + return MulAdd(v, v1, v2); + } +}; + +#endif // !HWY_GENERIC_LAMBDA + +// Invokes Test (e.g. TestTransform1) with all arg combinations. T comes from +// ForFloatTypes. +template +struct ForeachCountAndMisalign { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) const { + RandomState rng; + const size_t N = Lanes(d); + const size_t misalignments[3] = {0, N / 4, 3 * N / 5}; + + for (size_t count = 0; count < 2 * N; ++count) { + for (size_t ma : misalignments) { + for (size_t mb : misalignments) { + Test()(d, count, ma, mb, rng); + } + } + } + } +}; + +// Output-only, no loads +struct TestGenerate { + template + void operator()(D d, size_t count, size_t misalign_a, size_t /*misalign_b*/, + RandomState& /*rng*/) { + using T = TFromD; + AlignedFreeUniquePtr pa = AllocateAligned(misalign_a + count + 1); + AlignedFreeUniquePtr expected = AllocateAligned(HWY_MAX(1, count)); + HWY_ASSERT(pa && expected); + + T* actual = pa.get() + misalign_a; + + for (size_t i = 0; i < count; ++i) { + expected[i] = static_cast(2 * i); + } + + // TODO(janwas): can we update the apply_to in HWY_PUSH_ATTRIBUTES so that + // the attribute also applies to lambdas? If so, remove HWY_ATTR. +#if HWY_GENERIC_LAMBDA + const auto gen2 = [](const auto d, const auto vidx) + HWY_ATTR { return BitCast(d, Add(vidx, vidx)); }; +#else + const Gen2 gen2; +#endif + actual[count] = T{0}; // sentinel + Generate(d, actual, count, gen2); + HWY_ASSERT_EQ(T{0}, actual[count]); // did not write past end + + const auto info = hwy::detail::MakeTypeInfo(); + const char* target_name = hwy::TargetName(HWY_TARGET); + hwy::detail::AssertArrayEqual(info, expected.get(), actual, count, + target_name, __FILE__, __LINE__); + } +}; + +// Zero extra input arrays +struct TestTransform { + template + void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b, + RandomState& rng) { + if (misalign_b != 0) return; + using T = TFromD; + // Prevents error if size to allocate is zero. 
+ AlignedFreeUniquePtr pa = + AllocateAligned(HWY_MAX(1, misalign_a + count)); + AlignedFreeUniquePtr expected = AllocateAligned(HWY_MAX(1, count)); + HWY_ASSERT(pa && expected); + + T* a = pa.get() + misalign_a; + for (size_t i = 0; i < count; ++i) { + a[i] = Random(rng); + } + + SimpleSCAL(a, expected.get(), count); + + // TODO(janwas): can we update the apply_to in HWY_PUSH_ATTRIBUTES so that + // the attribute also applies to lambdas? If so, remove HWY_ATTR. +#if HWY_GENERIC_LAMBDA + const auto scal = [](const auto d, const auto v) + HWY_ATTR { return Mul(Set(d, Alpha()), v); }; +#else + const SCAL scal; +#endif + Transform(d, a, count, scal); + + const auto info = hwy::detail::MakeTypeInfo(); + const char* target_name = hwy::TargetName(HWY_TARGET); + hwy::detail::AssertArrayEqual(info, expected.get(), a, count, target_name, + __FILE__, __LINE__); + } +}; + +// One extra input array +struct TestTransform1 { + template + void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b, + RandomState& rng) { + using T = TFromD; + // Prevents error if size to allocate is zero. + AlignedFreeUniquePtr pa = + AllocateAligned(HWY_MAX(1, misalign_a + count)); + AlignedFreeUniquePtr pb = + AllocateAligned(HWY_MAX(1, misalign_b + count)); + AlignedFreeUniquePtr expected = AllocateAligned(HWY_MAX(1, count)); + HWY_ASSERT(pa && pb && expected); + T* a = pa.get() + misalign_a; + T* b = pb.get() + misalign_b; + for (size_t i = 0; i < count; ++i) { + a[i] = Random(rng); + b[i] = Random(rng); + } + + SimpleAXPY(a, b, expected.get(), count); + +#if HWY_GENERIC_LAMBDA + const auto axpy = [](const auto d, const auto v, const auto v1) HWY_ATTR { + return MulAdd(Set(d, Alpha()), v, v1); + }; +#else + const AXPY axpy; +#endif + Transform1(d, a, count, b, axpy); + + const auto info = hwy::detail::MakeTypeInfo(); + const char* target_name = hwy::TargetName(HWY_TARGET); + hwy::detail::AssertArrayEqual(info, expected.get(), a, count, target_name, + __FILE__, __LINE__); + } +}; + +// Two extra input arrays +struct TestTransform2 { + template + void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b, + RandomState& rng) { + using T = TFromD; + // Prevents error if size to allocate is zero. 
+ AlignedFreeUniquePtr pa = + AllocateAligned(HWY_MAX(1, misalign_a + count)); + AlignedFreeUniquePtr pb = + AllocateAligned(HWY_MAX(1, misalign_b + count)); + AlignedFreeUniquePtr pc = + AllocateAligned(HWY_MAX(1, misalign_a + count)); + AlignedFreeUniquePtr expected = AllocateAligned(HWY_MAX(1, count)); + HWY_ASSERT(pa && pb && pc && expected); + T* a = pa.get() + misalign_a; + T* b = pb.get() + misalign_b; + T* c = pc.get() + misalign_a; + for (size_t i = 0; i < count; ++i) { + a[i] = Random(rng); + b[i] = Random(rng); + c[i] = Random(rng); + } + + SimpleFMA4(a, b, c, expected.get(), count); + +#if HWY_GENERIC_LAMBDA + const auto fma4 = [](auto /*d*/, auto v, auto v1, auto v2) + HWY_ATTR { return MulAdd(v, v1, v2); }; +#else + const FMA4 fma4; +#endif + Transform2(d, a, count, b, c, fma4); + + const auto info = hwy::detail::MakeTypeInfo(); + const char* target_name = hwy::TargetName(HWY_TARGET); + hwy::detail::AssertArrayEqual(info, expected.get(), a, count, target_name, + __FILE__, __LINE__); + } +}; + +template +class IfEq { + public: + IfEq(T val) : val_(val) {} + + template + Mask operator()(D d, V v) const { + return Eq(v, Set(d, val_)); + } + + private: + T val_; +}; + +struct TestReplace { + template + void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b, + RandomState& rng) { + if (misalign_b != 0) return; + if (count == 0) return; + using T = TFromD; + AlignedFreeUniquePtr pa = AllocateAligned(misalign_a + count); + AlignedFreeUniquePtr pb = AllocateAligned(count); + AlignedFreeUniquePtr expected = AllocateAligned(count); + HWY_ASSERT(pa && pb && expected); + + T* a = pa.get() + misalign_a; + for (size_t i = 0; i < count; ++i) { + a[i] = Random(rng); + } + + std::vector positions(AdjustedReps(count)); + for (size_t& pos : positions) { + pos = static_cast(rng()) % count; + } + + for (size_t pos = 0; pos < count; ++pos) { + const T old_t = a[pos]; + const T new_t = Random(rng); + for (size_t i = 0; i < count; ++i) { + expected[i] = IsEqual(a[i], old_t) ? new_t : a[i]; + } + + // Copy so ReplaceIf gets the same input (and thus also outputs expected) + memcpy(pb.get(), a, count * sizeof(T)); + + Replace(d, a, count, new_t, old_t); + HWY_ASSERT_ARRAY_EQ(expected.get(), a, count); + + ReplaceIf(d, pb.get(), count, new_t, IfEq(old_t)); + HWY_ASSERT_ARRAY_EQ(expected.get(), pb.get(), count); + } + } +}; + +void TestAllGenerate() { + // The test BitCast-s the indices, which does not work for floats. 
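+  // (Gen2 adds the unsigned index vector to itself and BitCasts the result to
+  // the lane type, so for float lanes the bit pattern of 2*i would not be the
+  // intended value.)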
+ ForIntegerTypes(ForPartialVectors>()); +} + +void TestAllTransform() { + ForFloatTypes(ForPartialVectors>()); +} + +void TestAllTransform1() { + ForFloatTypes(ForPartialVectors>()); +} + +void TestAllTransform2() { + ForFloatTypes(ForPartialVectors>()); +} + +void TestAllReplace() { + ForFloatTypes(ForPartialVectors>()); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE + +namespace hwy { +HWY_BEFORE_TEST(TransformTest); +HWY_EXPORT_AND_TEST_P(TransformTest, TestAllGenerate); +HWY_EXPORT_AND_TEST_P(TransformTest, TestAllTransform); +HWY_EXPORT_AND_TEST_P(TransformTest, TestAllTransform1); +HWY_EXPORT_AND_TEST_P(TransformTest, TestAllTransform2); +HWY_EXPORT_AND_TEST_P(TransformTest, TestAllReplace); +} // namespace hwy + +#endif diff --git a/third_party/highway/hwy/contrib/bit_pack/bit_pack-inl.h b/third_party/highway/hwy/contrib/bit_pack/bit_pack-inl.h new file mode 100644 index 0000000000..04d015453b --- /dev/null +++ b/third_party/highway/hwy/contrib/bit_pack/bit_pack-inl.h @@ -0,0 +1,2599 @@ +// Copyright 2022 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Per-target include guard +#if defined(HIGHWAY_HWY_CONTRIB_BIT_PACK_INL_H_) == \ + defined(HWY_TARGET_TOGGLE) +#ifdef HIGHWAY_HWY_CONTRIB_BIT_PACK_INL_H_ +#undef HIGHWAY_HWY_CONTRIB_BIT_PACK_INL_H_ +#else +#define HIGHWAY_HWY_CONTRIB_BIT_PACK_INL_H_ +#endif + +#include "hwy/highway.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +// The entry points are class templates specialized below for each number of +// bits. Each provides Pack and Unpack member functions which load (Pack) or +// store (Unpack) B raw vectors, and store (Pack) or load (Unpack) a number of +// packed vectors equal to kBits. B denotes the bits per lane: 8 for Pack8, 16 +// for Pack16, which is also the upper bound for kBits. +template // <= 8 +struct Pack8 {}; +template // <= 16 +struct Pack16 {}; + +template <> +struct Pack8<1> { + template + HWY_INLINE void Pack(D8 d8, const uint8_t* HWY_RESTRICT raw, + uint8_t* HWY_RESTRICT packed_out) const { + const RepartitionToWide d16; + using VU16 = Vec; + const size_t N8 = Lanes(d8); + // 16-bit shifts avoid masking (bits will not cross 8-bit lanes). 
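+    // Each byte of the single packed vector holds raw0 in bit 0 through raw7
+    // in bit 7.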
+ const VU16 raw0 = BitCast(d16, LoadU(d8, raw + 0 * N8)); + const VU16 raw1 = BitCast(d16, LoadU(d8, raw + 1 * N8)); + const VU16 raw2 = BitCast(d16, LoadU(d8, raw + 2 * N8)); + const VU16 raw3 = BitCast(d16, LoadU(d8, raw + 3 * N8)); + const VU16 raw4 = BitCast(d16, LoadU(d8, raw + 4 * N8)); + const VU16 raw5 = BitCast(d16, LoadU(d8, raw + 5 * N8)); + const VU16 raw6 = BitCast(d16, LoadU(d8, raw + 6 * N8)); + const VU16 raw7 = BitCast(d16, LoadU(d8, raw + 7 * N8)); + + const VU16 packed = + Xor3(Or(ShiftLeft<7>(raw7), ShiftLeft<6>(raw6)), + Xor3(ShiftLeft<5>(raw5), ShiftLeft<4>(raw4), ShiftLeft<3>(raw3)), + Xor3(ShiftLeft<2>(raw2), ShiftLeft<1>(raw1), raw0)); + StoreU(BitCast(d8, packed), d8, packed_out); + } + + template + HWY_INLINE void Unpack(D8 d8, const uint8_t* HWY_RESTRICT packed_in, + uint8_t* HWY_RESTRICT raw) const { + const RepartitionToWide d16; + using VU16 = Vec; + const size_t N8 = Lanes(d8); + const VU16 mask = Set(d16, 0x0101u); // LSB in each byte + + const VU16 packed = BitCast(d16, LoadU(d8, packed_in)); + + const VU16 raw0 = And(packed, mask); + StoreU(BitCast(d8, raw0), d8, raw + 0 * N8); + + const VU16 raw1 = And(ShiftRight<1>(packed), mask); + StoreU(BitCast(d8, raw1), d8, raw + 1 * N8); + + const VU16 raw2 = And(ShiftRight<2>(packed), mask); + StoreU(BitCast(d8, raw2), d8, raw + 2 * N8); + + const VU16 raw3 = And(ShiftRight<3>(packed), mask); + StoreU(BitCast(d8, raw3), d8, raw + 3 * N8); + + const VU16 raw4 = And(ShiftRight<4>(packed), mask); + StoreU(BitCast(d8, raw4), d8, raw + 4 * N8); + + const VU16 raw5 = And(ShiftRight<5>(packed), mask); + StoreU(BitCast(d8, raw5), d8, raw + 5 * N8); + + const VU16 raw6 = And(ShiftRight<6>(packed), mask); + StoreU(BitCast(d8, raw6), d8, raw + 6 * N8); + + const VU16 raw7 = And(ShiftRight<7>(packed), mask); + StoreU(BitCast(d8, raw7), d8, raw + 7 * N8); + } +}; // Pack8<1> + +template <> +struct Pack8<2> { + template + HWY_INLINE void Pack(D8 d8, const uint8_t* HWY_RESTRICT raw, + uint8_t* HWY_RESTRICT packed_out) const { + const RepartitionToWide d16; + using VU16 = Vec; + const size_t N8 = Lanes(d8); + // 16-bit shifts avoid masking (bits will not cross 8-bit lanes). 
+ const VU16 raw0 = BitCast(d16, LoadU(d8, raw + 0 * N8)); + const VU16 raw1 = BitCast(d16, LoadU(d8, raw + 1 * N8)); + const VU16 raw2 = BitCast(d16, LoadU(d8, raw + 2 * N8)); + const VU16 raw3 = BitCast(d16, LoadU(d8, raw + 3 * N8)); + const VU16 raw4 = BitCast(d16, LoadU(d8, raw + 4 * N8)); + const VU16 raw5 = BitCast(d16, LoadU(d8, raw + 5 * N8)); + const VU16 raw6 = BitCast(d16, LoadU(d8, raw + 6 * N8)); + const VU16 raw7 = BitCast(d16, LoadU(d8, raw + 7 * N8)); + + const VU16 packed0 = Xor3(ShiftLeft<6>(raw6), ShiftLeft<4>(raw4), + Or(ShiftLeft<2>(raw2), raw0)); + const VU16 packed1 = Xor3(ShiftLeft<6>(raw7), ShiftLeft<4>(raw5), + Or(ShiftLeft<2>(raw3), raw1)); + StoreU(BitCast(d8, packed0), d8, packed_out + 0 * N8); + StoreU(BitCast(d8, packed1), d8, packed_out + 1 * N8); + } + + template + HWY_INLINE void Unpack(D8 d8, const uint8_t* HWY_RESTRICT packed_in, + uint8_t* HWY_RESTRICT raw) const { + const RepartitionToWide d16; + using VU16 = Vec; + const size_t N8 = Lanes(d8); + const VU16 mask = Set(d16, 0x0303u); // Lowest 2 bits per byte + + const VU16 packed0 = BitCast(d16, LoadU(d8, packed_in + 0 * N8)); + const VU16 packed1 = BitCast(d16, LoadU(d8, packed_in + 1 * N8)); + + const VU16 raw0 = And(packed0, mask); + StoreU(BitCast(d8, raw0), d8, raw + 0 * N8); + + const VU16 raw1 = And(packed1, mask); + StoreU(BitCast(d8, raw1), d8, raw + 1 * N8); + + const VU16 raw2 = And(ShiftRight<2>(packed0), mask); + StoreU(BitCast(d8, raw2), d8, raw + 2 * N8); + + const VU16 raw3 = And(ShiftRight<2>(packed1), mask); + StoreU(BitCast(d8, raw3), d8, raw + 3 * N8); + + const VU16 raw4 = And(ShiftRight<4>(packed0), mask); + StoreU(BitCast(d8, raw4), d8, raw + 4 * N8); + + const VU16 raw5 = And(ShiftRight<4>(packed1), mask); + StoreU(BitCast(d8, raw5), d8, raw + 5 * N8); + + const VU16 raw6 = And(ShiftRight<6>(packed0), mask); + StoreU(BitCast(d8, raw6), d8, raw + 6 * N8); + + const VU16 raw7 = And(ShiftRight<6>(packed1), mask); + StoreU(BitCast(d8, raw7), d8, raw + 7 * N8); + } +}; // Pack8<2> + +template <> +struct Pack8<3> { + template + HWY_INLINE void Pack(D8 d8, const uint8_t* HWY_RESTRICT raw, + uint8_t* HWY_RESTRICT packed_out) const { + const RepartitionToWide d16; + using VU16 = Vec; + const size_t N8 = Lanes(d8); + const VU16 raw0 = BitCast(d16, LoadU(d8, raw + 0 * N8)); + const VU16 raw1 = BitCast(d16, LoadU(d8, raw + 1 * N8)); + const VU16 raw2 = BitCast(d16, LoadU(d8, raw + 2 * N8)); + const VU16 raw3 = BitCast(d16, LoadU(d8, raw + 3 * N8)); + const VU16 raw4 = BitCast(d16, LoadU(d8, raw + 4 * N8)); + const VU16 raw5 = BitCast(d16, LoadU(d8, raw + 5 * N8)); + const VU16 raw6 = BitCast(d16, LoadU(d8, raw + 6 * N8)); + const VU16 raw7 = BitCast(d16, LoadU(d8, raw + 7 * N8)); + + // The upper two bits of these three will be filled with packed3 (6 bits). 
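+    // packed0..2 each hold two 3-bit fields in their lower six bits; the six
+    // bits of packed3 are then spread across their upper bit-pairs below.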
+ VU16 packed0 = Or(ShiftLeft<3>(raw4), raw0); + VU16 packed1 = Or(ShiftLeft<3>(raw5), raw1); + VU16 packed2 = Or(ShiftLeft<3>(raw6), raw2); + const VU16 packed3 = Or(ShiftLeft<3>(raw7), raw3); + + const VU16 hi2 = Set(d16, 0xC0C0u); + packed0 = OrAnd(packed0, ShiftLeft<2>(packed3), hi2); + packed1 = OrAnd(packed1, ShiftLeft<4>(packed3), hi2); + packed2 = OrAnd(packed2, ShiftLeft<6>(packed3), hi2); + StoreU(BitCast(d8, packed0), d8, packed_out + 0 * N8); + StoreU(BitCast(d8, packed1), d8, packed_out + 1 * N8); + StoreU(BitCast(d8, packed2), d8, packed_out + 2 * N8); + } + + template + HWY_INLINE void Unpack(D8 d8, const uint8_t* HWY_RESTRICT packed_in, + uint8_t* HWY_RESTRICT raw) const { + const RepartitionToWide d16; + using VU16 = Vec; + const size_t N8 = Lanes(d8); + const VU16 mask = Set(d16, 0x0707u); // Lowest 3 bits per byte + + const VU16 packed0 = BitCast(d16, LoadU(d8, packed_in + 0 * N8)); + const VU16 packed1 = BitCast(d16, LoadU(d8, packed_in + 1 * N8)); + const VU16 packed2 = BitCast(d16, LoadU(d8, packed_in + 2 * N8)); + + const VU16 raw0 = And(packed0, mask); + StoreU(BitCast(d8, raw0), d8, raw + 0 * N8); + + const VU16 raw1 = And(packed1, mask); + StoreU(BitCast(d8, raw1), d8, raw + 1 * N8); + + const VU16 raw2 = And(packed2, mask); + StoreU(BitCast(d8, raw2), d8, raw + 2 * N8); + + const VU16 raw4 = And(ShiftRight<3>(packed0), mask); + StoreU(BitCast(d8, raw4), d8, raw + 4 * N8); + + const VU16 raw5 = And(ShiftRight<3>(packed1), mask); + StoreU(BitCast(d8, raw5), d8, raw + 5 * N8); + + const VU16 raw6 = And(ShiftRight<3>(packed2), mask); + StoreU(BitCast(d8, raw6), d8, raw + 6 * N8); + + // raw73 is the concatenation of the upper two bits in packed0..2. + const VU16 hi2 = Set(d16, 0xC0C0u); + const VU16 raw73 = Xor3(ShiftRight<6>(And(packed2, hi2)), // + ShiftRight<4>(And(packed1, hi2)), + ShiftRight<2>(And(packed0, hi2))); + + const VU16 raw3 = And(mask, raw73); + StoreU(BitCast(d8, raw3), d8, raw + 3 * N8); + + const VU16 raw7 = And(mask, ShiftRight<3>(raw73)); + StoreU(BitCast(d8, raw7), d8, raw + 7 * N8); + } +}; // Pack8<3> + +template <> +struct Pack8<4> { + template + HWY_INLINE void Pack(D8 d8, const uint8_t* HWY_RESTRICT raw, + uint8_t* HWY_RESTRICT packed_out) const { + const RepartitionToWide d16; + using VU16 = Vec; + const size_t N8 = Lanes(d8); + // 16-bit shifts avoid masking (bits will not cross 8-bit lanes). 
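+    // Sketch: each byte of packed0 holds raw2's nibble above raw0's nibble
+    // (and likewise for the other pairs). Since the inputs are < 16, the
+    // 16-bit ShiftLeft<4> cannot spill into the neighboring byte; on the
+    // Unpack side, ShiftRight<4> does pull in the neighbor's low nibble,
+    // which is why the 0x0F0F mask is still applied there.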
+ const VU16 raw0 = BitCast(d16, LoadU(d8, raw + 0 * N8)); + const VU16 raw1 = BitCast(d16, LoadU(d8, raw + 1 * N8)); + const VU16 raw2 = BitCast(d16, LoadU(d8, raw + 2 * N8)); + const VU16 raw3 = BitCast(d16, LoadU(d8, raw + 3 * N8)); + const VU16 raw4 = BitCast(d16, LoadU(d8, raw + 4 * N8)); + const VU16 raw5 = BitCast(d16, LoadU(d8, raw + 5 * N8)); + const VU16 raw6 = BitCast(d16, LoadU(d8, raw + 6 * N8)); + const VU16 raw7 = BitCast(d16, LoadU(d8, raw + 7 * N8)); + + const VU16 packed0 = Or(ShiftLeft<4>(raw2), raw0); + const VU16 packed1 = Or(ShiftLeft<4>(raw3), raw1); + const VU16 packed2 = Or(ShiftLeft<4>(raw6), raw4); + const VU16 packed3 = Or(ShiftLeft<4>(raw7), raw5); + + StoreU(BitCast(d8, packed0), d8, packed_out + 0 * N8); + StoreU(BitCast(d8, packed1), d8, packed_out + 1 * N8); + StoreU(BitCast(d8, packed2), d8, packed_out + 2 * N8); + StoreU(BitCast(d8, packed3), d8, packed_out + 3 * N8); + } + + template + HWY_INLINE void Unpack(D8 d8, const uint8_t* HWY_RESTRICT packed_in, + uint8_t* HWY_RESTRICT raw) const { + const RepartitionToWide d16; + using VU16 = Vec; + const size_t N8 = Lanes(d8); + const VU16 mask = Set(d16, 0x0F0Fu); // Lowest 4 bits per byte + + const VU16 packed0 = BitCast(d16, LoadU(d8, packed_in + 0 * N8)); + const VU16 packed1 = BitCast(d16, LoadU(d8, packed_in + 1 * N8)); + const VU16 packed2 = BitCast(d16, LoadU(d8, packed_in + 2 * N8)); + const VU16 packed3 = BitCast(d16, LoadU(d8, packed_in + 3 * N8)); + + const VU16 raw0 = And(packed0, mask); + StoreU(BitCast(d8, raw0), d8, raw + 0 * N8); + + const VU16 raw1 = And(packed1, mask); + StoreU(BitCast(d8, raw1), d8, raw + 1 * N8); + + const VU16 raw2 = And(ShiftRight<4>(packed0), mask); + StoreU(BitCast(d8, raw2), d8, raw + 2 * N8); + + const VU16 raw3 = And(ShiftRight<4>(packed1), mask); + StoreU(BitCast(d8, raw3), d8, raw + 3 * N8); + + const VU16 raw4 = And(packed2, mask); + StoreU(BitCast(d8, raw4), d8, raw + 4 * N8); + + const VU16 raw5 = And(packed3, mask); + StoreU(BitCast(d8, raw5), d8, raw + 5 * N8); + + const VU16 raw6 = And(ShiftRight<4>(packed2), mask); + StoreU(BitCast(d8, raw6), d8, raw + 6 * N8); + + const VU16 raw7 = And(ShiftRight<4>(packed3), mask); + StoreU(BitCast(d8, raw7), d8, raw + 7 * N8); + } +}; // Pack8<4> + +template <> +struct Pack8<5> { + template + HWY_INLINE void Pack(D8 d8, const uint8_t* HWY_RESTRICT raw, + uint8_t* HWY_RESTRICT packed_out) const { + const RepartitionToWide d16; + using VU16 = Vec; + const size_t N8 = Lanes(d8); + const VU16 raw0 = BitCast(d16, LoadU(d8, raw + 0 * N8)); + const VU16 raw1 = BitCast(d16, LoadU(d8, raw + 1 * N8)); + const VU16 raw2 = BitCast(d16, LoadU(d8, raw + 2 * N8)); + const VU16 raw3 = BitCast(d16, LoadU(d8, raw + 3 * N8)); + const VU16 raw4 = BitCast(d16, LoadU(d8, raw + 4 * N8)); + const VU16 raw5 = BitCast(d16, LoadU(d8, raw + 5 * N8)); + const VU16 raw6 = BitCast(d16, LoadU(d8, raw + 6 * N8)); + const VU16 raw7 = BitCast(d16, LoadU(d8, raw + 7 * N8)); + + // Fill upper three bits with upper bits from raw4..7. 
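+    // Per-byte layout sketch (MSB..LSB):
+    //   packed0..3: [raw4..7 bits 4:2 | raw0..3 (5b)]
+    //   packed4   : [raw7 1:0 | raw6 1:0 | raw5 1:0 | raw4 1:0]
+    // i.e. the top three bits of raw4..7 ride in the spare bits of the first
+    // four vectors; only their low two bits need the fifth vector.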
+ const VU16 hi3 = Set(d16, 0xE0E0u); + const VU16 packed0 = OrAnd(raw0, ShiftLeft<3>(raw4), hi3); + const VU16 packed1 = OrAnd(raw1, ShiftLeft<3>(raw5), hi3); + const VU16 packed2 = OrAnd(raw2, ShiftLeft<3>(raw6), hi3); + const VU16 packed3 = OrAnd(raw3, ShiftLeft<3>(raw7), hi3); + + StoreU(BitCast(d8, packed0), d8, packed_out + 0 * N8); + StoreU(BitCast(d8, packed1), d8, packed_out + 1 * N8); + StoreU(BitCast(d8, packed2), d8, packed_out + 2 * N8); + StoreU(BitCast(d8, packed3), d8, packed_out + 3 * N8); + + // Combine lower two bits of raw4..7 into packed4. + const VU16 lo2 = Set(d16, 0x0303u); + const VU16 packed4 = Or(And(raw4, lo2), Xor3(ShiftLeft<2>(And(raw5, lo2)), + ShiftLeft<4>(And(raw6, lo2)), + ShiftLeft<6>(And(raw7, lo2)))); + StoreU(BitCast(d8, packed4), d8, packed_out + 4 * N8); + } + + template + HWY_INLINE void Unpack(D8 d8, const uint8_t* HWY_RESTRICT packed_in, + uint8_t* HWY_RESTRICT raw) const { + const RepartitionToWide d16; + using VU16 = Vec; + const size_t N8 = Lanes(d8); + + const VU16 packed0 = BitCast(d16, LoadU(d8, packed_in + 0 * N8)); + const VU16 packed1 = BitCast(d16, LoadU(d8, packed_in + 1 * N8)); + const VU16 packed2 = BitCast(d16, LoadU(d8, packed_in + 2 * N8)); + const VU16 packed3 = BitCast(d16, LoadU(d8, packed_in + 3 * N8)); + const VU16 packed4 = BitCast(d16, LoadU(d8, packed_in + 4 * N8)); + + const VU16 mask = Set(d16, 0x1F1Fu); // Lowest 5 bits per byte + + const VU16 raw0 = And(packed0, mask); + StoreU(BitCast(d8, raw0), d8, raw + 0 * N8); + + const VU16 raw1 = And(packed1, mask); + StoreU(BitCast(d8, raw1), d8, raw + 1 * N8); + + const VU16 raw2 = And(packed2, mask); + StoreU(BitCast(d8, raw2), d8, raw + 2 * N8); + + const VU16 raw3 = And(packed3, mask); + StoreU(BitCast(d8, raw3), d8, raw + 3 * N8); + + // The upper bits are the top 3 bits shifted right by three. + const VU16 top4 = ShiftRight<3>(AndNot(mask, packed0)); + const VU16 top5 = ShiftRight<3>(AndNot(mask, packed1)); + const VU16 top6 = ShiftRight<3>(AndNot(mask, packed2)); + const VU16 top7 = ShiftRight<3>(AndNot(mask, packed3)); + + // Insert the lower 2 bits, which were concatenated into a byte. + const VU16 lo2 = Set(d16, 0x0303u); + const VU16 raw4 = OrAnd(top4, lo2, packed4); + const VU16 raw5 = OrAnd(top5, lo2, ShiftRight<2>(packed4)); + const VU16 raw6 = OrAnd(top6, lo2, ShiftRight<4>(packed4)); + const VU16 raw7 = OrAnd(top7, lo2, ShiftRight<6>(packed4)); + + StoreU(BitCast(d8, raw4), d8, raw + 4 * N8); + StoreU(BitCast(d8, raw5), d8, raw + 5 * N8); + StoreU(BitCast(d8, raw6), d8, raw + 6 * N8); + StoreU(BitCast(d8, raw7), d8, raw + 7 * N8); + } +}; // Pack8<5> + +template <> +struct Pack8<6> { + template + HWY_INLINE void Pack(D8 d8, const uint8_t* HWY_RESTRICT raw, + uint8_t* HWY_RESTRICT packed_out) const { + const RepartitionToWide d16; + using VU16 = Vec; + const size_t N8 = Lanes(d8); + const VU16 raw0 = BitCast(d16, LoadU(d8, raw + 0 * N8)); + const VU16 raw1 = BitCast(d16, LoadU(d8, raw + 1 * N8)); + const VU16 raw2 = BitCast(d16, LoadU(d8, raw + 2 * N8)); + const VU16 raw3 = BitCast(d16, LoadU(d8, raw + 3 * N8)); + const VU16 raw4 = BitCast(d16, LoadU(d8, raw + 4 * N8)); + const VU16 raw5 = BitCast(d16, LoadU(d8, raw + 5 * N8)); + const VU16 raw6 = BitCast(d16, LoadU(d8, raw + 6 * N8)); + const VU16 raw7 = BitCast(d16, LoadU(d8, raw + 7 * N8)); + + const VU16 hi2 = Set(d16, 0xC0C0u); + // Each triplet of these stores raw3/raw7 (6 bits) in the upper 2 bits. 
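+    // Per-byte layout sketch (MSB..LSB):
+    //   packed0: [raw3 5:4 | raw0 (6b)]    packed3: [raw7 5:4 | raw4 (6b)]
+    //   packed1: [raw3 3:2 | raw1 (6b)]    packed4: [raw7 3:2 | raw5 (6b)]
+    //   packed2: [raw3 1:0 | raw2 (6b)]    packed5: [raw7 1:0 | raw6 (6b)]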
+ const VU16 packed0 = OrAnd(raw0, ShiftLeft<2>(raw3), hi2); + const VU16 packed1 = OrAnd(raw1, ShiftLeft<4>(raw3), hi2); + const VU16 packed2 = OrAnd(raw2, ShiftLeft<6>(raw3), hi2); + const VU16 packed3 = OrAnd(raw4, ShiftLeft<2>(raw7), hi2); + const VU16 packed4 = OrAnd(raw5, ShiftLeft<4>(raw7), hi2); + const VU16 packed5 = OrAnd(raw6, ShiftLeft<6>(raw7), hi2); + + StoreU(BitCast(d8, packed0), d8, packed_out + 0 * N8); + StoreU(BitCast(d8, packed1), d8, packed_out + 1 * N8); + StoreU(BitCast(d8, packed2), d8, packed_out + 2 * N8); + StoreU(BitCast(d8, packed3), d8, packed_out + 3 * N8); + StoreU(BitCast(d8, packed4), d8, packed_out + 4 * N8); + StoreU(BitCast(d8, packed5), d8, packed_out + 5 * N8); + } + + template + HWY_INLINE void Unpack(D8 d8, const uint8_t* HWY_RESTRICT packed_in, + uint8_t* HWY_RESTRICT raw) const { + const RepartitionToWide d16; + using VU16 = Vec; + const size_t N8 = Lanes(d8); + const VU16 mask = Set(d16, 0x3F3Fu); // Lowest 6 bits per byte + + const VU16 packed0 = BitCast(d16, LoadU(d8, packed_in + 0 * N8)); + const VU16 packed1 = BitCast(d16, LoadU(d8, packed_in + 1 * N8)); + const VU16 packed2 = BitCast(d16, LoadU(d8, packed_in + 2 * N8)); + const VU16 packed3 = BitCast(d16, LoadU(d8, packed_in + 3 * N8)); + const VU16 packed4 = BitCast(d16, LoadU(d8, packed_in + 4 * N8)); + const VU16 packed5 = BitCast(d16, LoadU(d8, packed_in + 5 * N8)); + + const VU16 raw0 = And(packed0, mask); + StoreU(BitCast(d8, raw0), d8, raw + 0 * N8); + + const VU16 raw1 = And(packed1, mask); + StoreU(BitCast(d8, raw1), d8, raw + 1 * N8); + + const VU16 raw2 = And(packed2, mask); + StoreU(BitCast(d8, raw2), d8, raw + 2 * N8); + + const VU16 raw4 = And(packed3, mask); + StoreU(BitCast(d8, raw4), d8, raw + 4 * N8); + + const VU16 raw5 = And(packed4, mask); + StoreU(BitCast(d8, raw5), d8, raw + 5 * N8); + + const VU16 raw6 = And(packed5, mask); + StoreU(BitCast(d8, raw6), d8, raw + 6 * N8); + + // raw3/7 are the concatenation of the upper two bits in packed0..2. + const VU16 raw3 = Xor3(ShiftRight<6>(AndNot(mask, packed2)), + ShiftRight<4>(AndNot(mask, packed1)), + ShiftRight<2>(AndNot(mask, packed0))); + const VU16 raw7 = Xor3(ShiftRight<6>(AndNot(mask, packed5)), + ShiftRight<4>(AndNot(mask, packed4)), + ShiftRight<2>(AndNot(mask, packed3))); + StoreU(BitCast(d8, raw3), d8, raw + 3 * N8); + StoreU(BitCast(d8, raw7), d8, raw + 7 * N8); + } +}; // Pack8<6> + +template <> +struct Pack8<7> { + template + HWY_INLINE void Pack(D8 d8, const uint8_t* HWY_RESTRICT raw, + uint8_t* HWY_RESTRICT packed_out) const { + const RepartitionToWide d16; + using VU16 = Vec; + const size_t N8 = Lanes(d8); + const VU16 raw0 = BitCast(d16, LoadU(d8, raw + 0 * N8)); + const VU16 raw1 = BitCast(d16, LoadU(d8, raw + 1 * N8)); + const VU16 raw2 = BitCast(d16, LoadU(d8, raw + 2 * N8)); + const VU16 raw3 = BitCast(d16, LoadU(d8, raw + 3 * N8)); + const VU16 raw4 = BitCast(d16, LoadU(d8, raw + 4 * N8)); + const VU16 raw5 = BitCast(d16, LoadU(d8, raw + 5 * N8)); + const VU16 raw6 = BitCast(d16, LoadU(d8, raw + 6 * N8)); + // Inserted into top bit of packed0..6. 
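+    // raw7's seven bits are spread one per vector: packed0's MSB receives
+    // raw7 bit 6, packed1's MSB bit 5, ..., packed6's MSB bit 0.
+    // Add(raw7, raw7) below is simply ShiftLeft<1> written as an addition.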
+ const VU16 raw7 = BitCast(d16, LoadU(d8, raw + 7 * N8)); + + const VU16 hi1 = Set(d16, 0x8080u); + const VU16 packed0 = OrAnd(raw0, Add(raw7, raw7), hi1); + const VU16 packed1 = OrAnd(raw1, ShiftLeft<2>(raw7), hi1); + const VU16 packed2 = OrAnd(raw2, ShiftLeft<3>(raw7), hi1); + const VU16 packed3 = OrAnd(raw3, ShiftLeft<4>(raw7), hi1); + const VU16 packed4 = OrAnd(raw4, ShiftLeft<5>(raw7), hi1); + const VU16 packed5 = OrAnd(raw5, ShiftLeft<6>(raw7), hi1); + const VU16 packed6 = OrAnd(raw6, ShiftLeft<7>(raw7), hi1); + + StoreU(BitCast(d8, packed0), d8, packed_out + 0 * N8); + StoreU(BitCast(d8, packed1), d8, packed_out + 1 * N8); + StoreU(BitCast(d8, packed2), d8, packed_out + 2 * N8); + StoreU(BitCast(d8, packed3), d8, packed_out + 3 * N8); + StoreU(BitCast(d8, packed4), d8, packed_out + 4 * N8); + StoreU(BitCast(d8, packed5), d8, packed_out + 5 * N8); + StoreU(BitCast(d8, packed6), d8, packed_out + 6 * N8); + } + + template + HWY_INLINE void Unpack(D8 d8, const uint8_t* HWY_RESTRICT packed_in, + uint8_t* HWY_RESTRICT raw) const { + const RepartitionToWide d16; + using VU16 = Vec; + const size_t N8 = Lanes(d8); + + const VU16 packed0 = BitCast(d16, LoadU(d8, packed_in + 0 * N8)); + const VU16 packed1 = BitCast(d16, LoadU(d8, packed_in + 1 * N8)); + const VU16 packed2 = BitCast(d16, LoadU(d8, packed_in + 2 * N8)); + const VU16 packed3 = BitCast(d16, LoadU(d8, packed_in + 3 * N8)); + const VU16 packed4 = BitCast(d16, LoadU(d8, packed_in + 4 * N8)); + const VU16 packed5 = BitCast(d16, LoadU(d8, packed_in + 5 * N8)); + const VU16 packed6 = BitCast(d16, LoadU(d8, packed_in + 6 * N8)); + + const VU16 mask = Set(d16, 0x7F7Fu); // Lowest 7 bits per byte + + const VU16 raw0 = And(packed0, mask); + StoreU(BitCast(d8, raw0), d8, raw + 0 * N8); + + const VU16 raw1 = And(packed1, mask); + StoreU(BitCast(d8, raw1), d8, raw + 1 * N8); + + const VU16 raw2 = And(packed2, mask); + StoreU(BitCast(d8, raw2), d8, raw + 2 * N8); + + const VU16 raw3 = And(packed3, mask); + StoreU(BitCast(d8, raw3), d8, raw + 3 * N8); + + const VU16 raw4 = And(packed4, mask); + StoreU(BitCast(d8, raw4), d8, raw + 4 * N8); + + const VU16 raw5 = And(packed5, mask); + StoreU(BitCast(d8, raw5), d8, raw + 5 * N8); + + const VU16 raw6 = And(packed6, mask); + StoreU(BitCast(d8, raw6), d8, raw + 6 * N8); + + const VU16 p0 = Xor3(ShiftRight<7>(AndNot(mask, packed6)), + ShiftRight<6>(AndNot(mask, packed5)), + ShiftRight<5>(AndNot(mask, packed4))); + const VU16 p1 = Xor3(ShiftRight<4>(AndNot(mask, packed3)), + ShiftRight<3>(AndNot(mask, packed2)), + ShiftRight<2>(AndNot(mask, packed1))); + const VU16 raw7 = Xor3(ShiftRight<1>(AndNot(mask, packed0)), p0, p1); + StoreU(BitCast(d8, raw7), d8, raw + 7 * N8); + } +}; // Pack8<7> + +template <> +struct Pack8<8> { + template + HWY_INLINE void Pack(D8 d8, const uint8_t* HWY_RESTRICT raw, + uint8_t* HWY_RESTRICT packed_out) const { + using VU8 = Vec; + const size_t N8 = Lanes(d8); + const VU8 raw0 = LoadU(d8, raw + 0 * N8); + const VU8 raw1 = LoadU(d8, raw + 1 * N8); + const VU8 raw2 = LoadU(d8, raw + 2 * N8); + const VU8 raw3 = LoadU(d8, raw + 3 * N8); + const VU8 raw4 = LoadU(d8, raw + 4 * N8); + const VU8 raw5 = LoadU(d8, raw + 5 * N8); + const VU8 raw6 = LoadU(d8, raw + 6 * N8); + const VU8 raw7 = LoadU(d8, raw + 7 * N8); + + StoreU(raw0, d8, packed_out + 0 * N8); + StoreU(raw1, d8, packed_out + 1 * N8); + StoreU(raw2, d8, packed_out + 2 * N8); + StoreU(raw3, d8, packed_out + 3 * N8); + StoreU(raw4, d8, packed_out + 4 * N8); + StoreU(raw5, d8, packed_out + 5 * N8); + StoreU(raw6, d8, 
packed_out + 6 * N8); + StoreU(raw7, d8, packed_out + 7 * N8); + } + + template + HWY_INLINE void Unpack(D8 d8, const uint8_t* HWY_RESTRICT packed_in, + uint8_t* HWY_RESTRICT raw) const { + using VU8 = Vec; + const size_t N8 = Lanes(d8); + const VU8 raw0 = LoadU(d8, packed_in + 0 * N8); + const VU8 raw1 = LoadU(d8, packed_in + 1 * N8); + const VU8 raw2 = LoadU(d8, packed_in + 2 * N8); + const VU8 raw3 = LoadU(d8, packed_in + 3 * N8); + const VU8 raw4 = LoadU(d8, packed_in + 4 * N8); + const VU8 raw5 = LoadU(d8, packed_in + 5 * N8); + const VU8 raw6 = LoadU(d8, packed_in + 6 * N8); + const VU8 raw7 = LoadU(d8, packed_in + 7 * N8); + + StoreU(raw0, d8, raw + 0 * N8); + StoreU(raw1, d8, raw + 1 * N8); + StoreU(raw2, d8, raw + 2 * N8); + StoreU(raw3, d8, raw + 3 * N8); + StoreU(raw4, d8, raw + 4 * N8); + StoreU(raw5, d8, raw + 5 * N8); + StoreU(raw6, d8, raw + 6 * N8); + StoreU(raw7, d8, raw + 7 * N8); + } +}; // Pack8<8> + +template <> +struct Pack16<1> { + template + HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw, + uint16_t* HWY_RESTRICT packed_out) const { + using VU16 = Vec; + const size_t N = Lanes(d); + const VU16 raw0 = LoadU(d, raw + 0 * N); + const VU16 raw1 = LoadU(d, raw + 1 * N); + const VU16 raw2 = LoadU(d, raw + 2 * N); + const VU16 raw3 = LoadU(d, raw + 3 * N); + const VU16 raw4 = LoadU(d, raw + 4 * N); + const VU16 raw5 = LoadU(d, raw + 5 * N); + const VU16 raw6 = LoadU(d, raw + 6 * N); + const VU16 raw7 = LoadU(d, raw + 7 * N); + const VU16 raw8 = LoadU(d, raw + 8 * N); + const VU16 raw9 = LoadU(d, raw + 9 * N); + const VU16 rawA = LoadU(d, raw + 0xA * N); + const VU16 rawB = LoadU(d, raw + 0xB * N); + const VU16 rawC = LoadU(d, raw + 0xC * N); + const VU16 rawD = LoadU(d, raw + 0xD * N); + const VU16 rawE = LoadU(d, raw + 0xE * N); + const VU16 rawF = LoadU(d, raw + 0xF * N); + + const VU16 p0 = Xor3(ShiftLeft<2>(raw2), Add(raw1, raw1), raw0); + const VU16 p1 = + Xor3(ShiftLeft<5>(raw5), ShiftLeft<4>(raw4), ShiftLeft<3>(raw3)); + const VU16 p2 = + Xor3(ShiftLeft<8>(raw8), ShiftLeft<7>(raw7), ShiftLeft<6>(raw6)); + const VU16 p3 = + Xor3(ShiftLeft<0xB>(rawB), ShiftLeft<0xA>(rawA), ShiftLeft<9>(raw9)); + const VU16 p4 = + Xor3(ShiftLeft<0xE>(rawE), ShiftLeft<0xD>(rawD), ShiftLeft<0xC>(rawC)); + const VU16 packed = + Or(Xor3(ShiftLeft<0xF>(rawF), p0, p1), Xor3(p2, p3, p4)); + StoreU(packed, d, packed_out); + } + + template + HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in, + uint16_t* HWY_RESTRICT raw) const { + using VU16 = Vec; + const size_t N = Lanes(d); + const VU16 mask = Set(d, 1u); // Lowest bit + + const VU16 packed = LoadU(d, packed_in); + + const VU16 raw0 = And(packed, mask); + StoreU(raw0, d, raw + 0 * N); + + const VU16 raw1 = And(ShiftRight<1>(packed), mask); + StoreU(raw1, d, raw + 1 * N); + + const VU16 raw2 = And(ShiftRight<2>(packed), mask); + StoreU(raw2, d, raw + 2 * N); + + const VU16 raw3 = And(ShiftRight<3>(packed), mask); + StoreU(raw3, d, raw + 3 * N); + + const VU16 raw4 = And(ShiftRight<4>(packed), mask); + StoreU(raw4, d, raw + 4 * N); + + const VU16 raw5 = And(ShiftRight<5>(packed), mask); + StoreU(raw5, d, raw + 5 * N); + + const VU16 raw6 = And(ShiftRight<6>(packed), mask); + StoreU(raw6, d, raw + 6 * N); + + const VU16 raw7 = And(ShiftRight<7>(packed), mask); + StoreU(raw7, d, raw + 7 * N); + + const VU16 raw8 = And(ShiftRight<8>(packed), mask); + StoreU(raw8, d, raw + 8 * N); + + const VU16 raw9 = And(ShiftRight<9>(packed), mask); + StoreU(raw9, d, raw + 9 * N); + + const VU16 rawA = And(ShiftRight<0xA>(packed), 
mask); + StoreU(rawA, d, raw + 0xA * N); + + const VU16 rawB = And(ShiftRight<0xB>(packed), mask); + StoreU(rawB, d, raw + 0xB * N); + + const VU16 rawC = And(ShiftRight<0xC>(packed), mask); + StoreU(rawC, d, raw + 0xC * N); + + const VU16 rawD = And(ShiftRight<0xD>(packed), mask); + StoreU(rawD, d, raw + 0xD * N); + + const VU16 rawE = And(ShiftRight<0xE>(packed), mask); + StoreU(rawE, d, raw + 0xE * N); + + const VU16 rawF = ShiftRight<0xF>(packed); + StoreU(rawF, d, raw + 0xF * N); + } +}; // Pack16<1> + +template <> +struct Pack16<2> { + template + HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw, + uint16_t* HWY_RESTRICT packed_out) const { + using VU16 = Vec; + const size_t N = Lanes(d); + const VU16 raw0 = LoadU(d, raw + 0 * N); + const VU16 raw1 = LoadU(d, raw + 1 * N); + const VU16 raw2 = LoadU(d, raw + 2 * N); + const VU16 raw3 = LoadU(d, raw + 3 * N); + const VU16 raw4 = LoadU(d, raw + 4 * N); + const VU16 raw5 = LoadU(d, raw + 5 * N); + const VU16 raw6 = LoadU(d, raw + 6 * N); + const VU16 raw7 = LoadU(d, raw + 7 * N); + const VU16 raw8 = LoadU(d, raw + 8 * N); + const VU16 raw9 = LoadU(d, raw + 9 * N); + const VU16 rawA = LoadU(d, raw + 0xA * N); + const VU16 rawB = LoadU(d, raw + 0xB * N); + const VU16 rawC = LoadU(d, raw + 0xC * N); + const VU16 rawD = LoadU(d, raw + 0xD * N); + const VU16 rawE = LoadU(d, raw + 0xE * N); + const VU16 rawF = LoadU(d, raw + 0xF * N); + + VU16 packed0 = Xor3(ShiftLeft<4>(raw4), ShiftLeft<2>(raw2), raw0); + VU16 packed1 = Xor3(ShiftLeft<4>(raw5), ShiftLeft<2>(raw3), raw1); + packed0 = Xor3(packed0, ShiftLeft<8>(raw8), ShiftLeft<6>(raw6)); + packed1 = Xor3(packed1, ShiftLeft<8>(raw9), ShiftLeft<6>(raw7)); + + packed0 = Xor3(packed0, ShiftLeft<12>(rawC), ShiftLeft<10>(rawA)); + packed1 = Xor3(packed1, ShiftLeft<12>(rawD), ShiftLeft<10>(rawB)); + + packed0 = Or(packed0, ShiftLeft<14>(rawE)); + packed1 = Or(packed1, ShiftLeft<14>(rawF)); + StoreU(packed0, d, packed_out + 0 * N); + StoreU(packed1, d, packed_out + 1 * N); + } + + template + HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in, + uint16_t* HWY_RESTRICT raw) const { + using VU16 = Vec; + const size_t N = Lanes(d); + const VU16 mask = Set(d, 0x3u); // Lowest 2 bits + + const VU16 packed0 = LoadU(d, packed_in + 0 * N); + const VU16 packed1 = LoadU(d, packed_in + 1 * N); + + const VU16 raw0 = And(packed0, mask); + StoreU(raw0, d, raw + 0 * N); + + const VU16 raw1 = And(packed1, mask); + StoreU(raw1, d, raw + 1 * N); + + const VU16 raw2 = And(ShiftRight<2>(packed0), mask); + StoreU(raw2, d, raw + 2 * N); + + const VU16 raw3 = And(ShiftRight<2>(packed1), mask); + StoreU(raw3, d, raw + 3 * N); + + const VU16 raw4 = And(ShiftRight<4>(packed0), mask); + StoreU(raw4, d, raw + 4 * N); + + const VU16 raw5 = And(ShiftRight<4>(packed1), mask); + StoreU(raw5, d, raw + 5 * N); + + const VU16 raw6 = And(ShiftRight<6>(packed0), mask); + StoreU(raw6, d, raw + 6 * N); + + const VU16 raw7 = And(ShiftRight<6>(packed1), mask); + StoreU(raw7, d, raw + 7 * N); + + const VU16 raw8 = And(ShiftRight<8>(packed0), mask); + StoreU(raw8, d, raw + 8 * N); + + const VU16 raw9 = And(ShiftRight<8>(packed1), mask); + StoreU(raw9, d, raw + 9 * N); + + const VU16 rawA = And(ShiftRight<0xA>(packed0), mask); + StoreU(rawA, d, raw + 0xA * N); + + const VU16 rawB = And(ShiftRight<0xA>(packed1), mask); + StoreU(rawB, d, raw + 0xB * N); + + const VU16 rawC = And(ShiftRight<0xC>(packed0), mask); + StoreU(rawC, d, raw + 0xC * N); + + const VU16 rawD = And(ShiftRight<0xC>(packed1), mask); + StoreU(rawD, d, 
raw + 0xD * N); + + const VU16 rawE = ShiftRight<0xE>(packed0); + StoreU(rawE, d, raw + 0xE * N); + + const VU16 rawF = ShiftRight<0xE>(packed1); + StoreU(rawF, d, raw + 0xF * N); + } +}; // Pack16<2> + +template <> +struct Pack16<3> { + template + HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw, + uint16_t* HWY_RESTRICT packed_out) const { + using VU16 = Vec; + const size_t N = Lanes(d); + const VU16 raw0 = LoadU(d, raw + 0 * N); + const VU16 raw1 = LoadU(d, raw + 1 * N); + const VU16 raw2 = LoadU(d, raw + 2 * N); + const VU16 raw3 = LoadU(d, raw + 3 * N); + const VU16 raw4 = LoadU(d, raw + 4 * N); + const VU16 raw5 = LoadU(d, raw + 5 * N); + const VU16 raw6 = LoadU(d, raw + 6 * N); + const VU16 raw7 = LoadU(d, raw + 7 * N); + const VU16 raw8 = LoadU(d, raw + 8 * N); + const VU16 raw9 = LoadU(d, raw + 9 * N); + const VU16 rawA = LoadU(d, raw + 0xA * N); + const VU16 rawB = LoadU(d, raw + 0xB * N); + const VU16 rawC = LoadU(d, raw + 0xC * N); + const VU16 rawD = LoadU(d, raw + 0xD * N); + const VU16 rawE = LoadU(d, raw + 0xE * N); + const VU16 rawF = LoadU(d, raw + 0xF * N); + + // We can fit 15 raw vectors in three packed vectors (five each). + VU16 packed0 = Xor3(ShiftLeft<6>(raw6), ShiftLeft<3>(raw3), raw0); + VU16 packed1 = Xor3(ShiftLeft<6>(raw7), ShiftLeft<3>(raw4), raw1); + VU16 packed2 = Xor3(ShiftLeft<6>(raw8), ShiftLeft<3>(raw5), raw2); + + // rawF will be scattered into the upper bit of these three. + packed0 = Xor3(packed0, ShiftLeft<12>(rawC), ShiftLeft<9>(raw9)); + packed1 = Xor3(packed1, ShiftLeft<12>(rawD), ShiftLeft<9>(rawA)); + packed2 = Xor3(packed2, ShiftLeft<12>(rawE), ShiftLeft<9>(rawB)); + + const VU16 hi1 = Set(d, 0x8000u); + packed0 = Or(packed0, ShiftLeft<15>(rawF)); // MSB only, no mask + packed1 = OrAnd(packed1, ShiftLeft<14>(rawF), hi1); + packed2 = OrAnd(packed2, ShiftLeft<13>(rawF), hi1); + StoreU(packed0, d, packed_out + 0 * N); + StoreU(packed1, d, packed_out + 1 * N); + StoreU(packed2, d, packed_out + 2 * N); + } + + template + HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in, + uint16_t* HWY_RESTRICT raw) const { + using VU16 = Vec; + const size_t N = Lanes(d); + const VU16 mask = Set(d, 0x7u); // Lowest 3 bits + + const VU16 packed0 = LoadU(d, packed_in + 0 * N); + const VU16 packed1 = LoadU(d, packed_in + 1 * N); + const VU16 packed2 = LoadU(d, packed_in + 2 * N); + + const VU16 raw0 = And(mask, packed0); + StoreU(raw0, d, raw + 0 * N); + + const VU16 raw1 = And(mask, packed1); + StoreU(raw1, d, raw + 1 * N); + + const VU16 raw2 = And(mask, packed2); + StoreU(raw2, d, raw + 2 * N); + + const VU16 raw3 = And(mask, ShiftRight<3>(packed0)); + StoreU(raw3, d, raw + 3 * N); + + const VU16 raw4 = And(mask, ShiftRight<3>(packed1)); + StoreU(raw4, d, raw + 4 * N); + + const VU16 raw5 = And(mask, ShiftRight<3>(packed2)); + StoreU(raw5, d, raw + 5 * N); + + const VU16 raw6 = And(mask, ShiftRight<6>(packed0)); + StoreU(raw6, d, raw + 6 * N); + + const VU16 raw7 = And(mask, ShiftRight<6>(packed1)); + StoreU(raw7, d, raw + 7 * N); + + const VU16 raw8 = And(mask, ShiftRight<6>(packed2)); + StoreU(raw8, d, raw + 8 * N); + + const VU16 raw9 = And(mask, ShiftRight<9>(packed0)); + StoreU(raw9, d, raw + 9 * N); + + const VU16 rawA = And(mask, ShiftRight<9>(packed1)); + StoreU(rawA, d, raw + 0xA * N); + + const VU16 rawB = And(mask, ShiftRight<9>(packed2)); + StoreU(rawB, d, raw + 0xB * N); + + const VU16 rawC = And(mask, ShiftRight<12>(packed0)); + StoreU(rawC, d, raw + 0xC * N); + + const VU16 rawD = And(mask, ShiftRight<12>(packed1)); + 
StoreU(rawD, d, raw + 0xD * N); + + const VU16 rawE = And(mask, ShiftRight<12>(packed2)); + StoreU(rawE, d, raw + 0xE * N); + + // rawF is the concatenation of the upper bit of packed0..2. + const VU16 down0 = ShiftRight<15>(packed0); + const VU16 down1 = ShiftRight<15>(packed1); + const VU16 down2 = ShiftRight<15>(packed2); + const VU16 rawF = Xor3(ShiftLeft<2>(down2), Add(down1, down1), down0); + StoreU(rawF, d, raw + 0xF * N); + } +}; // Pack16<3> + +template <> +struct Pack16<4> { + template + HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw, + uint16_t* HWY_RESTRICT packed_out) const { + using VU16 = Vec; + const size_t N = Lanes(d); + const VU16 raw0 = LoadU(d, raw + 0 * N); + const VU16 raw1 = LoadU(d, raw + 1 * N); + const VU16 raw2 = LoadU(d, raw + 2 * N); + const VU16 raw3 = LoadU(d, raw + 3 * N); + const VU16 raw4 = LoadU(d, raw + 4 * N); + const VU16 raw5 = LoadU(d, raw + 5 * N); + const VU16 raw6 = LoadU(d, raw + 6 * N); + const VU16 raw7 = LoadU(d, raw + 7 * N); + const VU16 raw8 = LoadU(d, raw + 8 * N); + const VU16 raw9 = LoadU(d, raw + 9 * N); + const VU16 rawA = LoadU(d, raw + 0xA * N); + const VU16 rawB = LoadU(d, raw + 0xB * N); + const VU16 rawC = LoadU(d, raw + 0xC * N); + const VU16 rawD = LoadU(d, raw + 0xD * N); + const VU16 rawE = LoadU(d, raw + 0xE * N); + const VU16 rawF = LoadU(d, raw + 0xF * N); + + VU16 packed0 = Xor3(ShiftLeft<8>(raw4), ShiftLeft<4>(raw2), raw0); + VU16 packed1 = Xor3(ShiftLeft<8>(raw5), ShiftLeft<4>(raw3), raw1); + packed0 = Or(packed0, ShiftLeft<12>(raw6)); + packed1 = Or(packed1, ShiftLeft<12>(raw7)); + VU16 packed2 = Xor3(ShiftLeft<8>(rawC), ShiftLeft<4>(rawA), raw8); + VU16 packed3 = Xor3(ShiftLeft<8>(rawD), ShiftLeft<4>(rawB), raw9); + packed2 = Or(packed2, ShiftLeft<12>(rawE)); + packed3 = Or(packed3, ShiftLeft<12>(rawF)); + + StoreU(packed0, d, packed_out + 0 * N); + StoreU(packed1, d, packed_out + 1 * N); + StoreU(packed2, d, packed_out + 2 * N); + StoreU(packed3, d, packed_out + 3 * N); + } + + template + HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in, + uint16_t* HWY_RESTRICT raw) const { + using VU16 = Vec; + const size_t N = Lanes(d); + const VU16 mask = Set(d, 0xFu); // Lowest 4 bits + + const VU16 packed0 = LoadU(d, packed_in + 0 * N); + const VU16 packed1 = LoadU(d, packed_in + 1 * N); + const VU16 packed2 = LoadU(d, packed_in + 2 * N); + const VU16 packed3 = LoadU(d, packed_in + 3 * N); + + const VU16 raw0 = And(packed0, mask); + StoreU(raw0, d, raw + 0 * N); + + const VU16 raw1 = And(packed1, mask); + StoreU(raw1, d, raw + 1 * N); + + const VU16 raw2 = And(ShiftRight<4>(packed0), mask); + StoreU(raw2, d, raw + 2 * N); + + const VU16 raw3 = And(ShiftRight<4>(packed1), mask); + StoreU(raw3, d, raw + 3 * N); + + const VU16 raw4 = And(ShiftRight<8>(packed0), mask); + StoreU(raw4, d, raw + 4 * N); + + const VU16 raw5 = And(ShiftRight<8>(packed1), mask); + StoreU(raw5, d, raw + 5 * N); + + const VU16 raw6 = ShiftRight<12>(packed0); // no mask required + StoreU(raw6, d, raw + 6 * N); + + const VU16 raw7 = ShiftRight<12>(packed1); // no mask required + StoreU(raw7, d, raw + 7 * N); + + const VU16 raw8 = And(packed2, mask); + StoreU(raw8, d, raw + 8 * N); + + const VU16 raw9 = And(packed3, mask); + StoreU(raw9, d, raw + 9 * N); + + const VU16 rawA = And(ShiftRight<4>(packed2), mask); + StoreU(rawA, d, raw + 0xA * N); + + const VU16 rawB = And(ShiftRight<4>(packed3), mask); + StoreU(rawB, d, raw + 0xB * N); + + const VU16 rawC = And(ShiftRight<8>(packed2), mask); + StoreU(rawC, d, raw + 0xC * N); + + 
const VU16 rawD = And(ShiftRight<8>(packed3), mask); + StoreU(rawD, d, raw + 0xD * N); + + const VU16 rawE = ShiftRight<12>(packed2); // no mask required + StoreU(rawE, d, raw + 0xE * N); + + const VU16 rawF = ShiftRight<12>(packed3); // no mask required + StoreU(rawF, d, raw + 0xF * N); + } +}; // Pack16<4> + +template <> +struct Pack16<5> { + template + HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw, + uint16_t* HWY_RESTRICT packed_out) const { + using VU16 = Vec; + const size_t N = Lanes(d); + const VU16 raw0 = LoadU(d, raw + 0 * N); + const VU16 raw1 = LoadU(d, raw + 1 * N); + const VU16 raw2 = LoadU(d, raw + 2 * N); + const VU16 raw3 = LoadU(d, raw + 3 * N); + const VU16 raw4 = LoadU(d, raw + 4 * N); + const VU16 raw5 = LoadU(d, raw + 5 * N); + const VU16 raw6 = LoadU(d, raw + 6 * N); + const VU16 raw7 = LoadU(d, raw + 7 * N); + const VU16 raw8 = LoadU(d, raw + 8 * N); + const VU16 raw9 = LoadU(d, raw + 9 * N); + const VU16 rawA = LoadU(d, raw + 0xA * N); + const VU16 rawB = LoadU(d, raw + 0xB * N); + const VU16 rawC = LoadU(d, raw + 0xC * N); + const VU16 rawD = LoadU(d, raw + 0xD * N); + const VU16 rawE = LoadU(d, raw + 0xE * N); + const VU16 rawF = LoadU(d, raw + 0xF * N); + + // We can fit 15 raw vectors in five packed vectors (three each). + VU16 packed0 = Xor3(ShiftLeft<10>(rawA), ShiftLeft<5>(raw5), raw0); + VU16 packed1 = Xor3(ShiftLeft<10>(rawB), ShiftLeft<5>(raw6), raw1); + VU16 packed2 = Xor3(ShiftLeft<10>(rawC), ShiftLeft<5>(raw7), raw2); + VU16 packed3 = Xor3(ShiftLeft<10>(rawD), ShiftLeft<5>(raw8), raw3); + VU16 packed4 = Xor3(ShiftLeft<10>(rawE), ShiftLeft<5>(raw9), raw4); + + // rawF will be scattered into the upper bits of these five. + const VU16 hi1 = Set(d, 0x8000u); + packed0 = Or(packed0, ShiftLeft<15>(rawF)); // MSB only, no mask + packed1 = OrAnd(packed1, ShiftLeft<14>(rawF), hi1); + packed2 = OrAnd(packed2, ShiftLeft<13>(rawF), hi1); + packed3 = OrAnd(packed3, ShiftLeft<12>(rawF), hi1); + packed4 = OrAnd(packed4, ShiftLeft<11>(rawF), hi1); + + StoreU(packed0, d, packed_out + 0 * N); + StoreU(packed1, d, packed_out + 1 * N); + StoreU(packed2, d, packed_out + 2 * N); + StoreU(packed3, d, packed_out + 3 * N); + StoreU(packed4, d, packed_out + 4 * N); + } + + template + HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in, + uint16_t* HWY_RESTRICT raw) const { + using VU16 = Vec; + const size_t N = Lanes(d); + + const VU16 packed0 = LoadU(d, packed_in + 0 * N); + const VU16 packed1 = LoadU(d, packed_in + 1 * N); + const VU16 packed2 = LoadU(d, packed_in + 2 * N); + const VU16 packed3 = LoadU(d, packed_in + 3 * N); + const VU16 packed4 = LoadU(d, packed_in + 4 * N); + + const VU16 mask = Set(d, 0x1Fu); // Lowest 5 bits + + const VU16 raw0 = And(packed0, mask); + StoreU(raw0, d, raw + 0 * N); + + const VU16 raw1 = And(packed1, mask); + StoreU(raw1, d, raw + 1 * N); + + const VU16 raw2 = And(packed2, mask); + StoreU(raw2, d, raw + 2 * N); + + const VU16 raw3 = And(packed3, mask); + StoreU(raw3, d, raw + 3 * N); + + const VU16 raw4 = And(packed4, mask); + StoreU(raw4, d, raw + 4 * N); + + const VU16 raw5 = And(ShiftRight<5>(packed0), mask); + StoreU(raw5, d, raw + 5 * N); + + const VU16 raw6 = And(ShiftRight<5>(packed1), mask); + StoreU(raw6, d, raw + 6 * N); + + const VU16 raw7 = And(ShiftRight<5>(packed2), mask); + StoreU(raw7, d, raw + 7 * N); + + const VU16 raw8 = And(ShiftRight<5>(packed3), mask); + StoreU(raw8, d, raw + 8 * N); + + const VU16 raw9 = And(ShiftRight<5>(packed4), mask); + StoreU(raw9, d, raw + 9 * N); + + const VU16 rawA = 
And(ShiftRight<10>(packed0), mask); + StoreU(rawA, d, raw + 0xA * N); + + const VU16 rawB = And(ShiftRight<10>(packed1), mask); + StoreU(rawB, d, raw + 0xB * N); + + const VU16 rawC = And(ShiftRight<10>(packed2), mask); + StoreU(rawC, d, raw + 0xC * N); + + const VU16 rawD = And(ShiftRight<10>(packed3), mask); + StoreU(rawD, d, raw + 0xD * N); + + const VU16 rawE = And(ShiftRight<10>(packed4), mask); + StoreU(rawE, d, raw + 0xE * N); + + // rawF is the concatenation of the lower bit of packed0..4. + const VU16 down0 = ShiftRight<15>(packed0); + const VU16 down1 = ShiftRight<15>(packed1); + const VU16 hi1 = Set(d, 0x8000u); + const VU16 p0 = + Xor3(ShiftRight<13>(And(packed2, hi1)), Add(down1, down1), down0); + const VU16 rawF = Xor3(ShiftRight<11>(And(packed4, hi1)), + ShiftRight<12>(And(packed3, hi1)), p0); + StoreU(rawF, d, raw + 0xF * N); + } +}; // Pack16<5> + +template <> +struct Pack16<6> { + template + HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw, + uint16_t* HWY_RESTRICT packed_out) const { + using VU16 = Vec; + const size_t N = Lanes(d); + const VU16 raw0 = LoadU(d, raw + 0 * N); + const VU16 raw1 = LoadU(d, raw + 1 * N); + const VU16 raw2 = LoadU(d, raw + 2 * N); + const VU16 raw3 = LoadU(d, raw + 3 * N); + const VU16 raw4 = LoadU(d, raw + 4 * N); + const VU16 raw5 = LoadU(d, raw + 5 * N); + const VU16 raw6 = LoadU(d, raw + 6 * N); + const VU16 raw7 = LoadU(d, raw + 7 * N); + const VU16 raw8 = LoadU(d, raw + 8 * N); + const VU16 raw9 = LoadU(d, raw + 9 * N); + const VU16 rawA = LoadU(d, raw + 0xA * N); + const VU16 rawB = LoadU(d, raw + 0xB * N); + const VU16 rawC = LoadU(d, raw + 0xC * N); + const VU16 rawD = LoadU(d, raw + 0xD * N); + const VU16 rawE = LoadU(d, raw + 0xE * N); + const VU16 rawF = LoadU(d, raw + 0xF * N); + + const VU16 packed3 = Or(ShiftLeft<6>(raw7), raw3); + const VU16 packed7 = Or(ShiftLeft<6>(rawF), rawB); + // Three vectors, two 6-bit raw each; packed3 (12 bits) is spread over the + // four remainder bits at the top of each vector. 
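+    // 16-bit lane layout sketch (MSB..LSB):
+    //   packed0: [packed3 bits  3:0 | raw4 (6b) | raw0 (6b)]
+    //   packed1: [packed3 bits  7:4 | raw5 (6b) | raw1 (6b)]
+    //   packed2: [packed3 bits 11:8 | raw6 (6b) | raw2 (6b)]
+    // and the same pattern for packed4..6 with raw8..E and packed7.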
+ const VU16 packed0 = Xor3(ShiftLeft<12>(packed3), ShiftLeft<6>(raw4), raw0); + VU16 packed1 = Or(ShiftLeft<6>(raw5), raw1); + VU16 packed2 = Or(ShiftLeft<6>(raw6), raw2); + const VU16 packed4 = Xor3(ShiftLeft<12>(packed7), ShiftLeft<6>(rawC), raw8); + VU16 packed5 = Or(ShiftLeft<6>(rawD), raw9); + VU16 packed6 = Or(ShiftLeft<6>(rawE), rawA); + + const VU16 hi4 = Set(d, 0xF000u); + packed1 = OrAnd(packed1, ShiftLeft<8>(packed3), hi4); + packed2 = OrAnd(packed2, ShiftLeft<4>(packed3), hi4); + packed5 = OrAnd(packed5, ShiftLeft<8>(packed7), hi4); + packed6 = OrAnd(packed6, ShiftLeft<4>(packed7), hi4); + + StoreU(packed0, d, packed_out + 0 * N); + StoreU(packed1, d, packed_out + 1 * N); + StoreU(packed2, d, packed_out + 2 * N); + StoreU(packed4, d, packed_out + 3 * N); + StoreU(packed5, d, packed_out + 4 * N); + StoreU(packed6, d, packed_out + 5 * N); + } + + template + HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in, + uint16_t* HWY_RESTRICT raw) const { + using VU16 = Vec; + const size_t N = Lanes(d); + const VU16 mask = Set(d, 0x3Fu); // Lowest 6 bits + + const VU16 packed0 = LoadU(d, packed_in + 0 * N); + const VU16 packed1 = LoadU(d, packed_in + 1 * N); + const VU16 packed2 = LoadU(d, packed_in + 2 * N); + const VU16 packed4 = LoadU(d, packed_in + 3 * N); + const VU16 packed5 = LoadU(d, packed_in + 4 * N); + const VU16 packed6 = LoadU(d, packed_in + 5 * N); + + const VU16 raw0 = And(packed0, mask); + StoreU(raw0, d, raw + 0 * N); + + const VU16 raw1 = And(packed1, mask); + StoreU(raw1, d, raw + 1 * N); + + const VU16 raw2 = And(packed2, mask); + StoreU(raw2, d, raw + 2 * N); + + const VU16 raw4 = And(ShiftRight<6>(packed0), mask); + StoreU(raw4, d, raw + 4 * N); + + const VU16 raw5 = And(ShiftRight<6>(packed1), mask); + StoreU(raw5, d, raw + 5 * N); + + const VU16 raw6 = And(ShiftRight<6>(packed2), mask); + StoreU(raw6, d, raw + 6 * N); + + const VU16 raw8 = And(packed4, mask); + StoreU(raw8, d, raw + 8 * N); + + const VU16 raw9 = And(packed5, mask); + StoreU(raw9, d, raw + 9 * N); + + const VU16 rawA = And(packed6, mask); + StoreU(rawA, d, raw + 0xA * N); + + const VU16 rawC = And(ShiftRight<6>(packed4), mask); + StoreU(rawC, d, raw + 0xC * N); + + const VU16 rawD = And(ShiftRight<6>(packed5), mask); + StoreU(rawD, d, raw + 0xD * N); + + const VU16 rawE = And(ShiftRight<6>(packed6), mask); + StoreU(rawE, d, raw + 0xE * N); + + // packed3 is the concatenation of the four upper bits in packed0..2. 
+ const VU16 down0 = ShiftRight<12>(packed0); + const VU16 down4 = ShiftRight<12>(packed4); + const VU16 hi4 = Set(d, 0xF000u); + const VU16 packed3 = Xor3(ShiftRight<4>(And(packed2, hi4)), + ShiftRight<8>(And(packed1, hi4)), down0); + const VU16 packed7 = Xor3(ShiftRight<4>(And(packed6, hi4)), + ShiftRight<8>(And(packed5, hi4)), down4); + const VU16 raw3 = And(packed3, mask); + StoreU(raw3, d, raw + 3 * N); + + const VU16 rawB = And(packed7, mask); + StoreU(rawB, d, raw + 0xB * N); + + const VU16 raw7 = ShiftRight<6>(packed3); // upper bits already zero + StoreU(raw7, d, raw + 7 * N); + + const VU16 rawF = ShiftRight<6>(packed7); // upper bits already zero + StoreU(rawF, d, raw + 0xF * N); + } +}; // Pack16<6> + +template <> +struct Pack16<7> { + template + HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw, + uint16_t* HWY_RESTRICT packed_out) const { + using VU16 = Vec; + const size_t N = Lanes(d); + const VU16 raw0 = LoadU(d, raw + 0 * N); + const VU16 raw1 = LoadU(d, raw + 1 * N); + const VU16 raw2 = LoadU(d, raw + 2 * N); + const VU16 raw3 = LoadU(d, raw + 3 * N); + const VU16 raw4 = LoadU(d, raw + 4 * N); + const VU16 raw5 = LoadU(d, raw + 5 * N); + const VU16 raw6 = LoadU(d, raw + 6 * N); + const VU16 raw7 = LoadU(d, raw + 7 * N); + const VU16 raw8 = LoadU(d, raw + 8 * N); + const VU16 raw9 = LoadU(d, raw + 9 * N); + const VU16 rawA = LoadU(d, raw + 0xA * N); + const VU16 rawB = LoadU(d, raw + 0xB * N); + const VU16 rawC = LoadU(d, raw + 0xC * N); + const VU16 rawD = LoadU(d, raw + 0xD * N); + const VU16 rawE = LoadU(d, raw + 0xE * N); + const VU16 rawF = LoadU(d, raw + 0xF * N); + + const VU16 packed7 = Or(ShiftLeft<7>(rawF), raw7); + // Seven vectors, two 7-bit raw each; packed7 (14 bits) is spread over the + // two remainder bits at the top of each vector. 
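+    // 16-bit lane layout sketch, for K = 0..6:
+    //   packedK: [packed7 bits 2K+1:2K | raw(8+K) (7b) | rawK (7b)]
+    // so the 14-bit packed7 = (rawF << 7) | raw7 is fully covered by the
+    // seven spare bit-pairs.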
+ const VU16 packed0 = Xor3(ShiftLeft<14>(packed7), ShiftLeft<7>(raw8), raw0); + VU16 packed1 = Or(ShiftLeft<7>(raw9), raw1); + VU16 packed2 = Or(ShiftLeft<7>(rawA), raw2); + VU16 packed3 = Or(ShiftLeft<7>(rawB), raw3); + VU16 packed4 = Or(ShiftLeft<7>(rawC), raw4); + VU16 packed5 = Or(ShiftLeft<7>(rawD), raw5); + VU16 packed6 = Or(ShiftLeft<7>(rawE), raw6); + + const VU16 hi2 = Set(d, 0xC000u); + packed1 = OrAnd(packed1, ShiftLeft<12>(packed7), hi2); + packed2 = OrAnd(packed2, ShiftLeft<10>(packed7), hi2); + packed3 = OrAnd(packed3, ShiftLeft<8>(packed7), hi2); + packed4 = OrAnd(packed4, ShiftLeft<6>(packed7), hi2); + packed5 = OrAnd(packed5, ShiftLeft<4>(packed7), hi2); + packed6 = OrAnd(packed6, ShiftLeft<2>(packed7), hi2); + + StoreU(packed0, d, packed_out + 0 * N); + StoreU(packed1, d, packed_out + 1 * N); + StoreU(packed2, d, packed_out + 2 * N); + StoreU(packed3, d, packed_out + 3 * N); + StoreU(packed4, d, packed_out + 4 * N); + StoreU(packed5, d, packed_out + 5 * N); + StoreU(packed6, d, packed_out + 6 * N); + } + + template + HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in, + uint16_t* HWY_RESTRICT raw) const { + using VU16 = Vec; + const size_t N = Lanes(d); + + const VU16 packed0 = BitCast(d, LoadU(d, packed_in + 0 * N)); + const VU16 packed1 = BitCast(d, LoadU(d, packed_in + 1 * N)); + const VU16 packed2 = BitCast(d, LoadU(d, packed_in + 2 * N)); + const VU16 packed3 = BitCast(d, LoadU(d, packed_in + 3 * N)); + const VU16 packed4 = BitCast(d, LoadU(d, packed_in + 4 * N)); + const VU16 packed5 = BitCast(d, LoadU(d, packed_in + 5 * N)); + const VU16 packed6 = BitCast(d, LoadU(d, packed_in + 6 * N)); + + const VU16 mask = Set(d, 0x7Fu); // Lowest 7 bits + + const VU16 raw0 = And(packed0, mask); + StoreU(raw0, d, raw + 0 * N); + + const VU16 raw1 = And(packed1, mask); + StoreU(raw1, d, raw + 1 * N); + + const VU16 raw2 = And(packed2, mask); + StoreU(raw2, d, raw + 2 * N); + + const VU16 raw3 = And(packed3, mask); + StoreU(raw3, d, raw + 3 * N); + + const VU16 raw4 = And(packed4, mask); + StoreU(raw4, d, raw + 4 * N); + + const VU16 raw5 = And(packed5, mask); + StoreU(raw5, d, raw + 5 * N); + + const VU16 raw6 = And(packed6, mask); + StoreU(raw6, d, raw + 6 * N); + + const VU16 raw8 = And(ShiftRight<7>(packed0), mask); + StoreU(raw8, d, raw + 8 * N); + + const VU16 raw9 = And(ShiftRight<7>(packed1), mask); + StoreU(raw9, d, raw + 9 * N); + + const VU16 rawA = And(ShiftRight<7>(packed2), mask); + StoreU(rawA, d, raw + 0xA * N); + + const VU16 rawB = And(ShiftRight<7>(packed3), mask); + StoreU(rawB, d, raw + 0xB * N); + + const VU16 rawC = And(ShiftRight<7>(packed4), mask); + StoreU(rawC, d, raw + 0xC * N); + + const VU16 rawD = And(ShiftRight<7>(packed5), mask); + StoreU(rawD, d, raw + 0xD * N); + + const VU16 rawE = And(ShiftRight<7>(packed6), mask); + StoreU(rawE, d, raw + 0xE * N); + + // packed7 is the concatenation of the two upper bits in packed0..6. 
+ const VU16 down0 = ShiftRight<14>(packed0); + const VU16 hi2 = Set(d, 0xC000u); + const VU16 p0 = Xor3(ShiftRight<12>(And(packed1, hi2)), + ShiftRight<10>(And(packed2, hi2)), down0); + const VU16 p1 = Xor3(ShiftRight<8>(And(packed3, hi2)), // + ShiftRight<6>(And(packed4, hi2)), + ShiftRight<4>(And(packed5, hi2))); + const VU16 packed7 = Xor3(ShiftRight<2>(And(packed6, hi2)), p1, p0); + + const VU16 raw7 = And(packed7, mask); + StoreU(raw7, d, raw + 7 * N); + + const VU16 rawF = ShiftRight<7>(packed7); // upper bits already zero + StoreU(rawF, d, raw + 0xF * N); + } +}; // Pack16<7> + +template <> +struct Pack16<8> { + template + HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw, + uint16_t* HWY_RESTRICT packed_out) const { + using VU16 = Vec; + const size_t N = Lanes(d); + const VU16 raw0 = LoadU(d, raw + 0 * N); + const VU16 raw1 = LoadU(d, raw + 1 * N); + const VU16 raw2 = LoadU(d, raw + 2 * N); + const VU16 raw3 = LoadU(d, raw + 3 * N); + const VU16 raw4 = LoadU(d, raw + 4 * N); + const VU16 raw5 = LoadU(d, raw + 5 * N); + const VU16 raw6 = LoadU(d, raw + 6 * N); + const VU16 raw7 = LoadU(d, raw + 7 * N); + const VU16 raw8 = LoadU(d, raw + 8 * N); + const VU16 raw9 = LoadU(d, raw + 9 * N); + const VU16 rawA = LoadU(d, raw + 0xA * N); + const VU16 rawB = LoadU(d, raw + 0xB * N); + const VU16 rawC = LoadU(d, raw + 0xC * N); + const VU16 rawD = LoadU(d, raw + 0xD * N); + const VU16 rawE = LoadU(d, raw + 0xE * N); + const VU16 rawF = LoadU(d, raw + 0xF * N); + + // This is equivalent to ConcatEven with 8-bit lanes, but much more + // efficient on RVV and slightly less efficient on SVE2. + const VU16 packed0 = Or(ShiftLeft<8>(raw2), raw0); + const VU16 packed1 = Or(ShiftLeft<8>(raw3), raw1); + const VU16 packed2 = Or(ShiftLeft<8>(raw6), raw4); + const VU16 packed3 = Or(ShiftLeft<8>(raw7), raw5); + const VU16 packed4 = Or(ShiftLeft<8>(rawA), raw8); + const VU16 packed5 = Or(ShiftLeft<8>(rawB), raw9); + const VU16 packed6 = Or(ShiftLeft<8>(rawE), rawC); + const VU16 packed7 = Or(ShiftLeft<8>(rawF), rawD); + + StoreU(packed0, d, packed_out + 0 * N); + StoreU(packed1, d, packed_out + 1 * N); + StoreU(packed2, d, packed_out + 2 * N); + StoreU(packed3, d, packed_out + 3 * N); + StoreU(packed4, d, packed_out + 4 * N); + StoreU(packed5, d, packed_out + 5 * N); + StoreU(packed6, d, packed_out + 6 * N); + StoreU(packed7, d, packed_out + 7 * N); + } + + template + HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in, + uint16_t* HWY_RESTRICT raw) const { + using VU16 = Vec; + const size_t N = Lanes(d); + + const VU16 packed0 = BitCast(d, LoadU(d, packed_in + 0 * N)); + const VU16 packed1 = BitCast(d, LoadU(d, packed_in + 1 * N)); + const VU16 packed2 = BitCast(d, LoadU(d, packed_in + 2 * N)); + const VU16 packed3 = BitCast(d, LoadU(d, packed_in + 3 * N)); + const VU16 packed4 = BitCast(d, LoadU(d, packed_in + 4 * N)); + const VU16 packed5 = BitCast(d, LoadU(d, packed_in + 5 * N)); + const VU16 packed6 = BitCast(d, LoadU(d, packed_in + 6 * N)); + const VU16 packed7 = BitCast(d, LoadU(d, packed_in + 7 * N)); + const VU16 mask = Set(d, 0xFFu); // Lowest 8 bits + + const VU16 raw0 = And(packed0, mask); + StoreU(raw0, d, raw + 0 * N); + + const VU16 raw1 = And(packed1, mask); + StoreU(raw1, d, raw + 1 * N); + + const VU16 raw2 = ShiftRight<8>(packed0); // upper bits already zero + StoreU(raw2, d, raw + 2 * N); + + const VU16 raw3 = ShiftRight<8>(packed1); // upper bits already zero + StoreU(raw3, d, raw + 3 * N); + + const VU16 raw4 = And(packed2, mask); + StoreU(raw4, d, raw + 4 * 
N); + + const VU16 raw5 = And(packed3, mask); + StoreU(raw5, d, raw + 5 * N); + + const VU16 raw6 = ShiftRight<8>(packed2); // upper bits already zero + StoreU(raw6, d, raw + 6 * N); + + const VU16 raw7 = ShiftRight<8>(packed3); // upper bits already zero + StoreU(raw7, d, raw + 7 * N); + + const VU16 raw8 = And(packed4, mask); + StoreU(raw8, d, raw + 8 * N); + + const VU16 raw9 = And(packed5, mask); + StoreU(raw9, d, raw + 9 * N); + + const VU16 rawA = ShiftRight<8>(packed4); // upper bits already zero + StoreU(rawA, d, raw + 0xA * N); + + const VU16 rawB = ShiftRight<8>(packed5); // upper bits already zero + StoreU(rawB, d, raw + 0xB * N); + + const VU16 rawC = And(packed6, mask); + StoreU(rawC, d, raw + 0xC * N); + + const VU16 rawD = And(packed7, mask); + StoreU(rawD, d, raw + 0xD * N); + + const VU16 rawE = ShiftRight<8>(packed6); // upper bits already zero + StoreU(rawE, d, raw + 0xE * N); + + const VU16 rawF = ShiftRight<8>(packed7); // upper bits already zero + StoreU(rawF, d, raw + 0xF * N); + } +}; // Pack16<8> + +template <> +struct Pack16<9> { + template + HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw, + uint16_t* HWY_RESTRICT packed_out) const { + using VU16 = Vec; + const size_t N = Lanes(d); + const VU16 raw0 = LoadU(d, raw + 0 * N); + const VU16 raw1 = LoadU(d, raw + 1 * N); + const VU16 raw2 = LoadU(d, raw + 2 * N); + const VU16 raw3 = LoadU(d, raw + 3 * N); + const VU16 raw4 = LoadU(d, raw + 4 * N); + const VU16 raw5 = LoadU(d, raw + 5 * N); + const VU16 raw6 = LoadU(d, raw + 6 * N); + const VU16 raw7 = LoadU(d, raw + 7 * N); + const VU16 raw8 = LoadU(d, raw + 8 * N); + const VU16 raw9 = LoadU(d, raw + 9 * N); + const VU16 rawA = LoadU(d, raw + 0xA * N); + const VU16 rawB = LoadU(d, raw + 0xB * N); + const VU16 rawC = LoadU(d, raw + 0xC * N); + const VU16 rawD = LoadU(d, raw + 0xD * N); + const VU16 rawE = LoadU(d, raw + 0xE * N); + const VU16 rawF = LoadU(d, raw + 0xF * N); + // 8 vectors, each with 9+7 bits; top 2 bits are concatenated into packed8. + const VU16 packed0 = Or(ShiftLeft<9>(raw8), raw0); + const VU16 packed1 = Or(ShiftLeft<9>(raw9), raw1); + const VU16 packed2 = Or(ShiftLeft<9>(rawA), raw2); + const VU16 packed3 = Or(ShiftLeft<9>(rawB), raw3); + const VU16 packed4 = Or(ShiftLeft<9>(rawC), raw4); + const VU16 packed5 = Or(ShiftLeft<9>(rawD), raw5); + const VU16 packed6 = Or(ShiftLeft<9>(rawE), raw6); + const VU16 packed7 = Or(ShiftLeft<9>(rawF), raw7); + + // We could shift down, OR and shift up, but two shifts are typically more + // expensive than AND, shift into position, and OR (which can be further + // reduced via Xor3). 
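+    // The chosen form isolates the top two bits of each 9-bit value with a
+    // single And(rawK, mid2), then one shift drops them into their 2-bit
+    // slot of packed8 (raw8 -> bits 1:0, raw9 -> 3:2, ..., rawF -> 15:14).
+    // The alternative sketched above would spend two shifts per input,
+    // something like ShiftLeft<k>(ShiftRight<7>(rawK)), before the Or.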
+ const VU16 mid2 = Set(d, 0x180u); // top 2 in lower 9 + const VU16 part8 = ShiftRight<7>(And(raw8, mid2)); + const VU16 part9 = ShiftRight<5>(And(raw9, mid2)); + const VU16 partA = ShiftRight<3>(And(rawA, mid2)); + const VU16 partB = ShiftRight<1>(And(rawB, mid2)); + const VU16 partC = ShiftLeft<1>(And(rawC, mid2)); + const VU16 partD = ShiftLeft<3>(And(rawD, mid2)); + const VU16 partE = ShiftLeft<5>(And(rawE, mid2)); + const VU16 partF = ShiftLeft<7>(And(rawF, mid2)); + const VU16 packed8 = Xor3(Xor3(part8, part9, partA), + Xor3(partB, partC, partD), Or(partE, partF)); + + StoreU(packed0, d, packed_out + 0 * N); + StoreU(packed1, d, packed_out + 1 * N); + StoreU(packed2, d, packed_out + 2 * N); + StoreU(packed3, d, packed_out + 3 * N); + StoreU(packed4, d, packed_out + 4 * N); + StoreU(packed5, d, packed_out + 5 * N); + StoreU(packed6, d, packed_out + 6 * N); + StoreU(packed7, d, packed_out + 7 * N); + StoreU(packed8, d, packed_out + 8 * N); + } + + template + HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in, + uint16_t* HWY_RESTRICT raw) const { + using VU16 = Vec; + const size_t N = Lanes(d); + + const VU16 packed0 = BitCast(d, LoadU(d, packed_in + 0 * N)); + const VU16 packed1 = BitCast(d, LoadU(d, packed_in + 1 * N)); + const VU16 packed2 = BitCast(d, LoadU(d, packed_in + 2 * N)); + const VU16 packed3 = BitCast(d, LoadU(d, packed_in + 3 * N)); + const VU16 packed4 = BitCast(d, LoadU(d, packed_in + 4 * N)); + const VU16 packed5 = BitCast(d, LoadU(d, packed_in + 5 * N)); + const VU16 packed6 = BitCast(d, LoadU(d, packed_in + 6 * N)); + const VU16 packed7 = BitCast(d, LoadU(d, packed_in + 7 * N)); + const VU16 packed8 = BitCast(d, LoadU(d, packed_in + 8 * N)); + + const VU16 mask = Set(d, 0x1FFu); // Lowest 9 bits + + const VU16 raw0 = And(packed0, mask); + StoreU(raw0, d, raw + 0 * N); + + const VU16 raw1 = And(packed1, mask); + StoreU(raw1, d, raw + 1 * N); + + const VU16 raw2 = And(packed2, mask); + StoreU(raw2, d, raw + 2 * N); + + const VU16 raw3 = And(packed3, mask); + StoreU(raw3, d, raw + 3 * N); + + const VU16 raw4 = And(packed4, mask); + StoreU(raw4, d, raw + 4 * N); + + const VU16 raw5 = And(packed5, mask); + StoreU(raw5, d, raw + 5 * N); + + const VU16 raw6 = And(packed6, mask); + StoreU(raw6, d, raw + 6 * N); + + const VU16 raw7 = And(packed7, mask); + StoreU(raw7, d, raw + 7 * N); + + const VU16 mid2 = Set(d, 0x180u); // top 2 in lower 9 + const VU16 raw8 = + OrAnd(ShiftRight<9>(packed0), ShiftLeft<7>(packed8), mid2); + const VU16 raw9 = + OrAnd(ShiftRight<9>(packed1), ShiftLeft<5>(packed8), mid2); + const VU16 rawA = + OrAnd(ShiftRight<9>(packed2), ShiftLeft<3>(packed8), mid2); + const VU16 rawB = + OrAnd(ShiftRight<9>(packed3), ShiftLeft<1>(packed8), mid2); + const VU16 rawC = + OrAnd(ShiftRight<9>(packed4), ShiftRight<1>(packed8), mid2); + const VU16 rawD = + OrAnd(ShiftRight<9>(packed5), ShiftRight<3>(packed8), mid2); + const VU16 rawE = + OrAnd(ShiftRight<9>(packed6), ShiftRight<5>(packed8), mid2); + const VU16 rawF = + OrAnd(ShiftRight<9>(packed7), ShiftRight<7>(packed8), mid2); + + StoreU(raw8, d, raw + 8 * N); + StoreU(raw9, d, raw + 9 * N); + StoreU(rawA, d, raw + 0xA * N); + StoreU(rawB, d, raw + 0xB * N); + StoreU(rawC, d, raw + 0xC * N); + StoreU(rawD, d, raw + 0xD * N); + StoreU(rawE, d, raw + 0xE * N); + StoreU(rawF, d, raw + 0xF * N); + } +}; // Pack16<9> + +template <> +struct Pack16<10> { + template + HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw, + uint16_t* HWY_RESTRICT packed_out) const { + using VU16 = Vec; + const 
size_t N = Lanes(d); + const VU16 raw0 = LoadU(d, raw + 0 * N); + const VU16 raw1 = LoadU(d, raw + 1 * N); + const VU16 raw2 = LoadU(d, raw + 2 * N); + const VU16 raw3 = LoadU(d, raw + 3 * N); + const VU16 raw4 = LoadU(d, raw + 4 * N); + const VU16 raw5 = LoadU(d, raw + 5 * N); + const VU16 raw6 = LoadU(d, raw + 6 * N); + const VU16 raw7 = LoadU(d, raw + 7 * N); + const VU16 raw8 = LoadU(d, raw + 8 * N); + const VU16 raw9 = LoadU(d, raw + 9 * N); + const VU16 rawA = LoadU(d, raw + 0xA * N); + const VU16 rawB = LoadU(d, raw + 0xB * N); + const VU16 rawC = LoadU(d, raw + 0xC * N); + const VU16 rawD = LoadU(d, raw + 0xD * N); + const VU16 rawE = LoadU(d, raw + 0xE * N); + const VU16 rawF = LoadU(d, raw + 0xF * N); + + // 8 vectors, each with 10+6 bits; top 4 bits are concatenated into + // packed8 and packed9. + const VU16 packed0 = Or(ShiftLeft<10>(raw8), raw0); + const VU16 packed1 = Or(ShiftLeft<10>(raw9), raw1); + const VU16 packed2 = Or(ShiftLeft<10>(rawA), raw2); + const VU16 packed3 = Or(ShiftLeft<10>(rawB), raw3); + const VU16 packed4 = Or(ShiftLeft<10>(rawC), raw4); + const VU16 packed5 = Or(ShiftLeft<10>(rawD), raw5); + const VU16 packed6 = Or(ShiftLeft<10>(rawE), raw6); + const VU16 packed7 = Or(ShiftLeft<10>(rawF), raw7); + + // We could shift down, OR and shift up, but two shifts are typically more + // expensive than AND, shift into position, and OR (which can be further + // reduced via Xor3). + const VU16 mid4 = Set(d, 0x3C0u); // top 4 in lower 10 + const VU16 part8 = ShiftRight<6>(And(raw8, mid4)); + const VU16 part9 = ShiftRight<2>(And(raw9, mid4)); + const VU16 partA = ShiftLeft<2>(And(rawA, mid4)); + const VU16 partB = ShiftLeft<6>(And(rawB, mid4)); + const VU16 partC = ShiftRight<6>(And(rawC, mid4)); + const VU16 partD = ShiftRight<2>(And(rawD, mid4)); + const VU16 partE = ShiftLeft<2>(And(rawE, mid4)); + const VU16 partF = ShiftLeft<6>(And(rawF, mid4)); + const VU16 packed8 = Or(Xor3(part8, part9, partA), partB); + const VU16 packed9 = Or(Xor3(partC, partD, partE), partF); + + StoreU(packed0, d, packed_out + 0 * N); + StoreU(packed1, d, packed_out + 1 * N); + StoreU(packed2, d, packed_out + 2 * N); + StoreU(packed3, d, packed_out + 3 * N); + StoreU(packed4, d, packed_out + 4 * N); + StoreU(packed5, d, packed_out + 5 * N); + StoreU(packed6, d, packed_out + 6 * N); + StoreU(packed7, d, packed_out + 7 * N); + StoreU(packed8, d, packed_out + 8 * N); + StoreU(packed9, d, packed_out + 9 * N); + } + + template + HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in, + uint16_t* HWY_RESTRICT raw) const { + using VU16 = Vec; + const size_t N = Lanes(d); + + const VU16 packed0 = BitCast(d, LoadU(d, packed_in + 0 * N)); + const VU16 packed1 = BitCast(d, LoadU(d, packed_in + 1 * N)); + const VU16 packed2 = BitCast(d, LoadU(d, packed_in + 2 * N)); + const VU16 packed3 = BitCast(d, LoadU(d, packed_in + 3 * N)); + const VU16 packed4 = BitCast(d, LoadU(d, packed_in + 4 * N)); + const VU16 packed5 = BitCast(d, LoadU(d, packed_in + 5 * N)); + const VU16 packed6 = BitCast(d, LoadU(d, packed_in + 6 * N)); + const VU16 packed7 = BitCast(d, LoadU(d, packed_in + 7 * N)); + const VU16 packed8 = BitCast(d, LoadU(d, packed_in + 8 * N)); + const VU16 packed9 = BitCast(d, LoadU(d, packed_in + 9 * N)); + + const VU16 mask = Set(d, 0x3FFu); // Lowest 10 bits + + const VU16 raw0 = And(packed0, mask); + StoreU(raw0, d, raw + 0 * N); + + const VU16 raw1 = And(packed1, mask); + StoreU(raw1, d, raw + 1 * N); + + const VU16 raw2 = And(packed2, mask); + StoreU(raw2, d, raw + 2 * N); + + const 
VU16 raw3 = And(packed3, mask); + StoreU(raw3, d, raw + 3 * N); + + const VU16 raw4 = And(packed4, mask); + StoreU(raw4, d, raw + 4 * N); + + const VU16 raw5 = And(packed5, mask); + StoreU(raw5, d, raw + 5 * N); + + const VU16 raw6 = And(packed6, mask); + StoreU(raw6, d, raw + 6 * N); + + const VU16 raw7 = And(packed7, mask); + StoreU(raw7, d, raw + 7 * N); + + const VU16 mid4 = Set(d, 0x3C0u); // top 4 in lower 10 + const VU16 raw8 = + OrAnd(ShiftRight<10>(packed0), ShiftLeft<6>(packed8), mid4); + const VU16 raw9 = + OrAnd(ShiftRight<10>(packed1), ShiftLeft<2>(packed8), mid4); + const VU16 rawA = + OrAnd(ShiftRight<10>(packed2), ShiftRight<2>(packed8), mid4); + const VU16 rawB = + OrAnd(ShiftRight<10>(packed3), ShiftRight<6>(packed8), mid4); + const VU16 rawC = + OrAnd(ShiftRight<10>(packed4), ShiftLeft<6>(packed9), mid4); + const VU16 rawD = + OrAnd(ShiftRight<10>(packed5), ShiftLeft<2>(packed9), mid4); + const VU16 rawE = + OrAnd(ShiftRight<10>(packed6), ShiftRight<2>(packed9), mid4); + const VU16 rawF = + OrAnd(ShiftRight<10>(packed7), ShiftRight<6>(packed9), mid4); + + StoreU(raw8, d, raw + 8 * N); + StoreU(raw9, d, raw + 9 * N); + StoreU(rawA, d, raw + 0xA * N); + StoreU(rawB, d, raw + 0xB * N); + StoreU(rawC, d, raw + 0xC * N); + StoreU(rawD, d, raw + 0xD * N); + StoreU(rawE, d, raw + 0xE * N); + StoreU(rawF, d, raw + 0xF * N); + } +}; // Pack16<10> + +template <> +struct Pack16<11> { + template + HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw, + uint16_t* HWY_RESTRICT packed_out) const { + using VU16 = Vec; + const size_t N = Lanes(d); + const VU16 raw0 = LoadU(d, raw + 0 * N); + const VU16 raw1 = LoadU(d, raw + 1 * N); + const VU16 raw2 = LoadU(d, raw + 2 * N); + const VU16 raw3 = LoadU(d, raw + 3 * N); + const VU16 raw4 = LoadU(d, raw + 4 * N); + const VU16 raw5 = LoadU(d, raw + 5 * N); + const VU16 raw6 = LoadU(d, raw + 6 * N); + const VU16 raw7 = LoadU(d, raw + 7 * N); + const VU16 raw8 = LoadU(d, raw + 8 * N); + const VU16 raw9 = LoadU(d, raw + 9 * N); + const VU16 rawA = LoadU(d, raw + 0xA * N); + const VU16 rawB = LoadU(d, raw + 0xB * N); + const VU16 rawC = LoadU(d, raw + 0xC * N); + const VU16 rawD = LoadU(d, raw + 0xD * N); + const VU16 rawE = LoadU(d, raw + 0xE * N); + const VU16 rawF = LoadU(d, raw + 0xF * N); + + // It is not obvious what the optimal partitioning looks like. To reduce the + // number of constants, we want to minimize the number of distinct bit + // lengths. 11+5 also requires 6-bit remnants with 4-bit leftovers. + // 8+3 seems better: it is easier to scatter 3 bits into the MSBs. + const VU16 lo8 = Set(d, 0xFFu); + + // Lower 8 bits of all raw + const VU16 packed0 = OrAnd(ShiftLeft<8>(raw1), raw0, lo8); + const VU16 packed1 = OrAnd(ShiftLeft<8>(raw3), raw2, lo8); + const VU16 packed2 = OrAnd(ShiftLeft<8>(raw5), raw4, lo8); + const VU16 packed3 = OrAnd(ShiftLeft<8>(raw7), raw6, lo8); + const VU16 packed4 = OrAnd(ShiftLeft<8>(raw9), raw8, lo8); + const VU16 packed5 = OrAnd(ShiftLeft<8>(rawB), rawA, lo8); + const VU16 packed6 = OrAnd(ShiftLeft<8>(rawD), rawC, lo8); + const VU16 packed7 = OrAnd(ShiftLeft<8>(rawF), rawE, lo8); + + StoreU(packed0, d, packed_out + 0 * N); + StoreU(packed1, d, packed_out + 1 * N); + StoreU(packed2, d, packed_out + 2 * N); + StoreU(packed3, d, packed_out + 3 * N); + StoreU(packed4, d, packed_out + 4 * N); + StoreU(packed5, d, packed_out + 5 * N); + StoreU(packed6, d, packed_out + 6 * N); + StoreU(packed7, d, packed_out + 7 * N); + + // Three vectors, five 3bit remnants each, plus one 3bit in their MSB. 
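+    // Remnant layout sketch for the 16-bit lanes (MSB..LSB), where topX
+    // denotes bits 10:8 of rawX:
+    //   packed8: [rawF bit  8 | topC | top9 | top6 | top3 | top0]
+    //   packed9: [rawF bit  9 | topD | topA | top7 | top4 | top1]
+    //   packedA: [rawF bit 10 | topE | topB | top8 | top5 | top2]
+    // (one bit, then five 3-bit fields).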
+ const VU16 top0 = ShiftRight<8>(raw0); + const VU16 top1 = ShiftRight<8>(raw1); + const VU16 top2 = ShiftRight<8>(raw2); + // Insert top raw bits into 3-bit groups within packed8..A. Moving the + // mask along avoids masking each of raw0..E and enables OrAnd. + VU16 next = Set(d, 0x38u); // 0x7 << 3 + VU16 packed8 = OrAnd(top0, ShiftRight<5>(raw3), next); + VU16 packed9 = OrAnd(top1, ShiftRight<5>(raw4), next); + VU16 packedA = OrAnd(top2, ShiftRight<5>(raw5), next); + next = ShiftLeft<3>(next); + packed8 = OrAnd(packed8, ShiftRight<2>(raw6), next); + packed9 = OrAnd(packed9, ShiftRight<2>(raw7), next); + packedA = OrAnd(packedA, ShiftRight<2>(raw8), next); + next = ShiftLeft<3>(next); + packed8 = OrAnd(packed8, Add(raw9, raw9), next); + packed9 = OrAnd(packed9, Add(rawA, rawA), next); + packedA = OrAnd(packedA, Add(rawB, rawB), next); + next = ShiftLeft<3>(next); + packed8 = OrAnd(packed8, ShiftLeft<4>(rawC), next); + packed9 = OrAnd(packed9, ShiftLeft<4>(rawD), next); + packedA = OrAnd(packedA, ShiftLeft<4>(rawE), next); + + // Scatter upper 3 bits of rawF into the upper bits. + next = ShiftLeft<3>(next); // = 0x8000u + packed8 = OrAnd(packed8, ShiftLeft<7>(rawF), next); + packed9 = OrAnd(packed9, ShiftLeft<6>(rawF), next); + packedA = OrAnd(packedA, ShiftLeft<5>(rawF), next); + + StoreU(packed8, d, packed_out + 8 * N); + StoreU(packed9, d, packed_out + 9 * N); + StoreU(packedA, d, packed_out + 0xA * N); + } + + template + HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in, + uint16_t* HWY_RESTRICT raw) const { + using VU16 = Vec; + const size_t N = Lanes(d); + + const VU16 packed0 = BitCast(d, LoadU(d, packed_in + 0 * N)); + const VU16 packed1 = BitCast(d, LoadU(d, packed_in + 1 * N)); + const VU16 packed2 = BitCast(d, LoadU(d, packed_in + 2 * N)); + const VU16 packed3 = BitCast(d, LoadU(d, packed_in + 3 * N)); + const VU16 packed4 = BitCast(d, LoadU(d, packed_in + 4 * N)); + const VU16 packed5 = BitCast(d, LoadU(d, packed_in + 5 * N)); + const VU16 packed6 = BitCast(d, LoadU(d, packed_in + 6 * N)); + const VU16 packed7 = BitCast(d, LoadU(d, packed_in + 7 * N)); + const VU16 packed8 = BitCast(d, LoadU(d, packed_in + 8 * N)); + const VU16 packed9 = BitCast(d, LoadU(d, packed_in + 9 * N)); + const VU16 packedA = BitCast(d, LoadU(d, packed_in + 0xA * N)); + + const VU16 mask = Set(d, 0xFFu); // Lowest 8 bits + + const VU16 down0 = And(packed0, mask); + const VU16 down1 = ShiftRight<8>(packed0); + const VU16 down2 = And(packed1, mask); + const VU16 down3 = ShiftRight<8>(packed1); + const VU16 down4 = And(packed2, mask); + const VU16 down5 = ShiftRight<8>(packed2); + const VU16 down6 = And(packed3, mask); + const VU16 down7 = ShiftRight<8>(packed3); + const VU16 down8 = And(packed4, mask); + const VU16 down9 = ShiftRight<8>(packed4); + const VU16 downA = And(packed5, mask); + const VU16 downB = ShiftRight<8>(packed5); + const VU16 downC = And(packed6, mask); + const VU16 downD = ShiftRight<8>(packed6); + const VU16 downE = And(packed7, mask); + const VU16 downF = ShiftRight<8>(packed7); + + // Three bits from packed8..A, eight bits from down0..F. 
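+    // For example, raw0 = down0 | ((packed8 & 7) << 8): if raw0 was 0x7FF,
+    // then down0 = 0xFF, the low 3 bits of packed8 are 7, and
+    // 0xFF | (7 << 8) = 0x7FF again.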
+ const VU16 hi3 = Set(d, 0x700u); + const VU16 raw0 = OrAnd(down0, ShiftLeft<8>(packed8), hi3); + const VU16 raw1 = OrAnd(down1, ShiftLeft<8>(packed9), hi3); + const VU16 raw2 = OrAnd(down2, ShiftLeft<8>(packedA), hi3); + + const VU16 raw3 = OrAnd(down3, ShiftLeft<5>(packed8), hi3); + const VU16 raw4 = OrAnd(down4, ShiftLeft<5>(packed9), hi3); + const VU16 raw5 = OrAnd(down5, ShiftLeft<5>(packedA), hi3); + + const VU16 raw6 = OrAnd(down6, ShiftLeft<2>(packed8), hi3); + const VU16 raw7 = OrAnd(down7, ShiftLeft<2>(packed9), hi3); + const VU16 raw8 = OrAnd(down8, ShiftLeft<2>(packedA), hi3); + + const VU16 raw9 = OrAnd(down9, ShiftRight<1>(packed8), hi3); + const VU16 rawA = OrAnd(downA, ShiftRight<1>(packed9), hi3); + const VU16 rawB = OrAnd(downB, ShiftRight<1>(packedA), hi3); + + const VU16 rawC = OrAnd(downC, ShiftRight<4>(packed8), hi3); + const VU16 rawD = OrAnd(downD, ShiftRight<4>(packed9), hi3); + const VU16 rawE = OrAnd(downE, ShiftRight<4>(packedA), hi3); + + // Shift MSB into the top 3-of-11 and mask. + const VU16 rawF = Or(downF, Xor3(And(ShiftRight<7>(packed8), hi3), + And(ShiftRight<6>(packed9), hi3), + And(ShiftRight<5>(packedA), hi3))); + + StoreU(raw0, d, raw + 0 * N); + StoreU(raw1, d, raw + 1 * N); + StoreU(raw2, d, raw + 2 * N); + StoreU(raw3, d, raw + 3 * N); + StoreU(raw4, d, raw + 4 * N); + StoreU(raw5, d, raw + 5 * N); + StoreU(raw6, d, raw + 6 * N); + StoreU(raw7, d, raw + 7 * N); + StoreU(raw8, d, raw + 8 * N); + StoreU(raw9, d, raw + 9 * N); + StoreU(rawA, d, raw + 0xA * N); + StoreU(rawB, d, raw + 0xB * N); + StoreU(rawC, d, raw + 0xC * N); + StoreU(rawD, d, raw + 0xD * N); + StoreU(rawE, d, raw + 0xE * N); + StoreU(rawF, d, raw + 0xF * N); + } +}; // Pack16<11> + +template <> +struct Pack16<12> { + template + HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw, + uint16_t* HWY_RESTRICT packed_out) const { + using VU16 = Vec; + const size_t N = Lanes(d); + const VU16 raw0 = LoadU(d, raw + 0 * N); + const VU16 raw1 = LoadU(d, raw + 1 * N); + const VU16 raw2 = LoadU(d, raw + 2 * N); + const VU16 raw3 = LoadU(d, raw + 3 * N); + const VU16 raw4 = LoadU(d, raw + 4 * N); + const VU16 raw5 = LoadU(d, raw + 5 * N); + const VU16 raw6 = LoadU(d, raw + 6 * N); + const VU16 raw7 = LoadU(d, raw + 7 * N); + const VU16 raw8 = LoadU(d, raw + 8 * N); + const VU16 raw9 = LoadU(d, raw + 9 * N); + const VU16 rawA = LoadU(d, raw + 0xA * N); + const VU16 rawB = LoadU(d, raw + 0xB * N); + const VU16 rawC = LoadU(d, raw + 0xC * N); + const VU16 rawD = LoadU(d, raw + 0xD * N); + const VU16 rawE = LoadU(d, raw + 0xE * N); + const VU16 rawF = LoadU(d, raw + 0xF * N); + + // 8 vectors, each with 12+4 bits; top 8 bits are concatenated into + // packed8 to packedB. + const VU16 packed0 = Or(ShiftLeft<12>(raw8), raw0); + const VU16 packed1 = Or(ShiftLeft<12>(raw9), raw1); + const VU16 packed2 = Or(ShiftLeft<12>(rawA), raw2); + const VU16 packed3 = Or(ShiftLeft<12>(rawB), raw3); + const VU16 packed4 = Or(ShiftLeft<12>(rawC), raw4); + const VU16 packed5 = Or(ShiftLeft<12>(rawD), raw5); + const VU16 packed6 = Or(ShiftLeft<12>(rawE), raw6); + const VU16 packed7 = Or(ShiftLeft<12>(rawF), raw7); + + // Masking after shifting left enables OrAnd. 
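+    // Each of packed8..B combines the middle bits [11:4] of two raw vectors,
+    // e.g. packed8 = (raw8 >> 4) | ((raw9 << 4) & 0xFF00), i.e. raw8's
+    // middle byte in the low byte and raw9's middle byte in the high byte.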
+ const VU16 hi8 = Set(d, 0xFF00u); + const VU16 packed8 = OrAnd(ShiftRight<4>(raw8), ShiftLeft<4>(raw9), hi8); + const VU16 packed9 = OrAnd(ShiftRight<4>(rawA), ShiftLeft<4>(rawB), hi8); + const VU16 packedA = OrAnd(ShiftRight<4>(rawC), ShiftLeft<4>(rawD), hi8); + const VU16 packedB = OrAnd(ShiftRight<4>(rawE), ShiftLeft<4>(rawF), hi8); + StoreU(packed0, d, packed_out + 0 * N); + StoreU(packed1, d, packed_out + 1 * N); + StoreU(packed2, d, packed_out + 2 * N); + StoreU(packed3, d, packed_out + 3 * N); + StoreU(packed4, d, packed_out + 4 * N); + StoreU(packed5, d, packed_out + 5 * N); + StoreU(packed6, d, packed_out + 6 * N); + StoreU(packed7, d, packed_out + 7 * N); + StoreU(packed8, d, packed_out + 8 * N); + StoreU(packed9, d, packed_out + 9 * N); + StoreU(packedA, d, packed_out + 0xA * N); + StoreU(packedB, d, packed_out + 0xB * N); + } + + template + HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in, + uint16_t* HWY_RESTRICT raw) const { + using VU16 = Vec; + const size_t N = Lanes(d); + + const VU16 packed0 = BitCast(d, LoadU(d, packed_in + 0 * N)); + const VU16 packed1 = BitCast(d, LoadU(d, packed_in + 1 * N)); + const VU16 packed2 = BitCast(d, LoadU(d, packed_in + 2 * N)); + const VU16 packed3 = BitCast(d, LoadU(d, packed_in + 3 * N)); + const VU16 packed4 = BitCast(d, LoadU(d, packed_in + 4 * N)); + const VU16 packed5 = BitCast(d, LoadU(d, packed_in + 5 * N)); + const VU16 packed6 = BitCast(d, LoadU(d, packed_in + 6 * N)); + const VU16 packed7 = BitCast(d, LoadU(d, packed_in + 7 * N)); + const VU16 packed8 = BitCast(d, LoadU(d, packed_in + 8 * N)); + const VU16 packed9 = BitCast(d, LoadU(d, packed_in + 9 * N)); + const VU16 packedA = BitCast(d, LoadU(d, packed_in + 0xA * N)); + const VU16 packedB = BitCast(d, LoadU(d, packed_in + 0xB * N)); + + const VU16 mask = Set(d, 0xFFFu); // Lowest 12 bits + + const VU16 raw0 = And(packed0, mask); + StoreU(raw0, d, raw + 0 * N); + + const VU16 raw1 = And(packed1, mask); + StoreU(raw1, d, raw + 1 * N); + + const VU16 raw2 = And(packed2, mask); + StoreU(raw2, d, raw + 2 * N); + + const VU16 raw3 = And(packed3, mask); + StoreU(raw3, d, raw + 3 * N); + + const VU16 raw4 = And(packed4, mask); + StoreU(raw4, d, raw + 4 * N); + + const VU16 raw5 = And(packed5, mask); + StoreU(raw5, d, raw + 5 * N); + + const VU16 raw6 = And(packed6, mask); + StoreU(raw6, d, raw + 6 * N); + + const VU16 raw7 = And(packed7, mask); + StoreU(raw7, d, raw + 7 * N); + + const VU16 mid8 = Set(d, 0xFF0u); // upper 8 in lower 12 + const VU16 raw8 = + OrAnd(ShiftRight<12>(packed0), ShiftLeft<4>(packed8), mid8); + const VU16 raw9 = + OrAnd(ShiftRight<12>(packed1), ShiftRight<4>(packed8), mid8); + const VU16 rawA = + OrAnd(ShiftRight<12>(packed2), ShiftLeft<4>(packed9), mid8); + const VU16 rawB = + OrAnd(ShiftRight<12>(packed3), ShiftRight<4>(packed9), mid8); + const VU16 rawC = + OrAnd(ShiftRight<12>(packed4), ShiftLeft<4>(packedA), mid8); + const VU16 rawD = + OrAnd(ShiftRight<12>(packed5), ShiftRight<4>(packedA), mid8); + const VU16 rawE = + OrAnd(ShiftRight<12>(packed6), ShiftLeft<4>(packedB), mid8); + const VU16 rawF = + OrAnd(ShiftRight<12>(packed7), ShiftRight<4>(packedB), mid8); + StoreU(raw8, d, raw + 8 * N); + StoreU(raw9, d, raw + 9 * N); + StoreU(rawA, d, raw + 0xA * N); + StoreU(rawB, d, raw + 0xB * N); + StoreU(rawC, d, raw + 0xC * N); + StoreU(rawD, d, raw + 0xD * N); + StoreU(rawE, d, raw + 0xE * N); + StoreU(rawF, d, raw + 0xF * N); + } +}; // Pack16<12> + +template <> +struct Pack16<13> { + template + HWY_INLINE void Pack(D d, const uint16_t* 
HWY_RESTRICT raw, + uint16_t* HWY_RESTRICT packed_out) const { + using VU16 = Vec; + const size_t N = Lanes(d); + const VU16 raw0 = LoadU(d, raw + 0 * N); + const VU16 raw1 = LoadU(d, raw + 1 * N); + const VU16 raw2 = LoadU(d, raw + 2 * N); + const VU16 raw3 = LoadU(d, raw + 3 * N); + const VU16 raw4 = LoadU(d, raw + 4 * N); + const VU16 raw5 = LoadU(d, raw + 5 * N); + const VU16 raw6 = LoadU(d, raw + 6 * N); + const VU16 raw7 = LoadU(d, raw + 7 * N); + const VU16 raw8 = LoadU(d, raw + 8 * N); + const VU16 raw9 = LoadU(d, raw + 9 * N); + const VU16 rawA = LoadU(d, raw + 0xA * N); + const VU16 rawB = LoadU(d, raw + 0xB * N); + const VU16 rawC = LoadU(d, raw + 0xC * N); + const VU16 rawD = LoadU(d, raw + 0xD * N); + const VU16 rawE = LoadU(d, raw + 0xE * N); + const VU16 rawF = LoadU(d, raw + 0xF * N); + + // As with 11 bits, it is not obvious what the optimal partitioning looks + // like. We similarly go with an 8+5 split. + const VU16 lo8 = Set(d, 0xFFu); + + // Lower 8 bits of all raw + const VU16 packed0 = OrAnd(ShiftLeft<8>(raw1), raw0, lo8); + const VU16 packed1 = OrAnd(ShiftLeft<8>(raw3), raw2, lo8); + const VU16 packed2 = OrAnd(ShiftLeft<8>(raw5), raw4, lo8); + const VU16 packed3 = OrAnd(ShiftLeft<8>(raw7), raw6, lo8); + const VU16 packed4 = OrAnd(ShiftLeft<8>(raw9), raw8, lo8); + const VU16 packed5 = OrAnd(ShiftLeft<8>(rawB), rawA, lo8); + const VU16 packed6 = OrAnd(ShiftLeft<8>(rawD), rawC, lo8); + const VU16 packed7 = OrAnd(ShiftLeft<8>(rawF), rawE, lo8); + + StoreU(packed0, d, packed_out + 0 * N); + StoreU(packed1, d, packed_out + 1 * N); + StoreU(packed2, d, packed_out + 2 * N); + StoreU(packed3, d, packed_out + 3 * N); + StoreU(packed4, d, packed_out + 4 * N); + StoreU(packed5, d, packed_out + 5 * N); + StoreU(packed6, d, packed_out + 6 * N); + StoreU(packed7, d, packed_out + 7 * N); + + // Five vectors, three 5bit remnants each, plus one 5bit in their MSB. + const VU16 top0 = ShiftRight<8>(raw0); + const VU16 top1 = ShiftRight<8>(raw1); + const VU16 top2 = ShiftRight<8>(raw2); + const VU16 top3 = ShiftRight<8>(raw3); + const VU16 top4 = ShiftRight<8>(raw4); + + // Insert top raw bits into 5-bit groups within packed8..C. Moving the + // mask along avoids masking each of raw0..E and enables OrAnd. + VU16 next = Set(d, 0x3E0u); // 0x1F << 5 + VU16 packed8 = OrAnd(top0, ShiftRight<3>(raw5), next); + VU16 packed9 = OrAnd(top1, ShiftRight<3>(raw6), next); + VU16 packedA = OrAnd(top2, ShiftRight<3>(raw7), next); + VU16 packedB = OrAnd(top3, ShiftRight<3>(raw8), next); + VU16 packedC = OrAnd(top4, ShiftRight<3>(raw9), next); + next = ShiftLeft<5>(next); + packed8 = OrAnd(packed8, ShiftLeft<2>(rawA), next); + packed9 = OrAnd(packed9, ShiftLeft<2>(rawB), next); + packedA = OrAnd(packedA, ShiftLeft<2>(rawC), next); + packedB = OrAnd(packedB, ShiftLeft<2>(rawD), next); + packedC = OrAnd(packedC, ShiftLeft<2>(rawE), next); + + // Scatter upper 5 bits of rawF into the upper bits. 
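+    // Shifting the 16-bit mask left once more drops all but the MSB (the
+    // other set bits shift out of the lane), so each of packed8..C receives
+    // exactly one of rawF's bits 8..12 in its top bit.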
+    next = ShiftLeft<5>(next);  // = 0x8000u
+    packed8 = OrAnd(packed8, ShiftLeft<7>(rawF), next);
+    packed9 = OrAnd(packed9, ShiftLeft<6>(rawF), next);
+    packedA = OrAnd(packedA, ShiftLeft<5>(rawF), next);
+    packedB = OrAnd(packedB, ShiftLeft<4>(rawF), next);
+    packedC = OrAnd(packedC, ShiftLeft<3>(rawF), next);
+
+    StoreU(packed8, d, packed_out + 8 * N);
+    StoreU(packed9, d, packed_out + 9 * N);
+    StoreU(packedA, d, packed_out + 0xA * N);
+    StoreU(packedB, d, packed_out + 0xB * N);
+    StoreU(packedC, d, packed_out + 0xC * N);
+  }
+
+  template <class D>
+  HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
+                         uint16_t* HWY_RESTRICT raw) const {
+    using VU16 = Vec<decltype(d)>;
+    const size_t N = Lanes(d);
+
+    const VU16 packed0 = BitCast(d, LoadU(d, packed_in + 0 * N));
+    const VU16 packed1 = BitCast(d, LoadU(d, packed_in + 1 * N));
+    const VU16 packed2 = BitCast(d, LoadU(d, packed_in + 2 * N));
+    const VU16 packed3 = BitCast(d, LoadU(d, packed_in + 3 * N));
+    const VU16 packed4 = BitCast(d, LoadU(d, packed_in + 4 * N));
+    const VU16 packed5 = BitCast(d, LoadU(d, packed_in + 5 * N));
+    const VU16 packed6 = BitCast(d, LoadU(d, packed_in + 6 * N));
+    const VU16 packed7 = BitCast(d, LoadU(d, packed_in + 7 * N));
+    const VU16 packed8 = BitCast(d, LoadU(d, packed_in + 8 * N));
+    const VU16 packed9 = BitCast(d, LoadU(d, packed_in + 9 * N));
+    const VU16 packedA = BitCast(d, LoadU(d, packed_in + 0xA * N));
+    const VU16 packedB = BitCast(d, LoadU(d, packed_in + 0xB * N));
+    const VU16 packedC = BitCast(d, LoadU(d, packed_in + 0xC * N));
+
+    const VU16 mask = Set(d, 0xFFu);  // Lowest 8 bits
+
+    const VU16 down0 = And(packed0, mask);
+    const VU16 down1 = ShiftRight<8>(packed0);
+    const VU16 down2 = And(packed1, mask);
+    const VU16 down3 = ShiftRight<8>(packed1);
+    const VU16 down4 = And(packed2, mask);
+    const VU16 down5 = ShiftRight<8>(packed2);
+    const VU16 down6 = And(packed3, mask);
+    const VU16 down7 = ShiftRight<8>(packed3);
+    const VU16 down8 = And(packed4, mask);
+    const VU16 down9 = ShiftRight<8>(packed4);
+    const VU16 downA = And(packed5, mask);
+    const VU16 downB = ShiftRight<8>(packed5);
+    const VU16 downC = And(packed6, mask);
+    const VU16 downD = ShiftRight<8>(packed6);
+    const VU16 downE = And(packed7, mask);
+    const VU16 downF = ShiftRight<8>(packed7);
+
+    // Upper five bits from packed8..C, eight bits from down0..F.
+    const VU16 hi5 = Set(d, 0x1F00u);
+    const VU16 raw0 = OrAnd(down0, ShiftLeft<8>(packed8), hi5);
+    const VU16 raw1 = OrAnd(down1, ShiftLeft<8>(packed9), hi5);
+    const VU16 raw2 = OrAnd(down2, ShiftLeft<8>(packedA), hi5);
+    const VU16 raw3 = OrAnd(down3, ShiftLeft<8>(packedB), hi5);
+    const VU16 raw4 = OrAnd(down4, ShiftLeft<8>(packedC), hi5);
+
+    const VU16 raw5 = OrAnd(down5, ShiftLeft<3>(packed8), hi5);
+    const VU16 raw6 = OrAnd(down6, ShiftLeft<3>(packed9), hi5);
+    const VU16 raw7 = OrAnd(down7, ShiftLeft<3>(packedA), hi5);
+    const VU16 raw8 = OrAnd(down8, ShiftLeft<3>(packedB), hi5);
+    const VU16 raw9 = OrAnd(down9, ShiftLeft<3>(packedC), hi5);
+
+    const VU16 rawA = OrAnd(downA, ShiftRight<2>(packed8), hi5);
+    const VU16 rawB = OrAnd(downB, ShiftRight<2>(packed9), hi5);
+    const VU16 rawC = OrAnd(downC, ShiftRight<2>(packedA), hi5);
+    const VU16 rawD = OrAnd(downD, ShiftRight<2>(packedB), hi5);
+    const VU16 rawE = OrAnd(downE, ShiftRight<2>(packedC), hi5);
+
+    // Shift MSB into the top 5-of-13 and mask.
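+    // packed8..C each contributed one of rawF's bits 8..12 via their MSB;
+    // ShiftRight<7..3> realigns them, and because the masked pieces are
+    // disjoint, the Xor3/Or combination below behaves like a plain Or.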
+ const VU16 p0 = Xor3(And(ShiftRight<7>(packed8), hi5), // + And(ShiftRight<6>(packed9), hi5), + And(ShiftRight<5>(packedA), hi5)); + const VU16 p1 = Xor3(And(ShiftRight<4>(packedB), hi5), + And(ShiftRight<3>(packedC), hi5), downF); + const VU16 rawF = Or(p0, p1); + + StoreU(raw0, d, raw + 0 * N); + StoreU(raw1, d, raw + 1 * N); + StoreU(raw2, d, raw + 2 * N); + StoreU(raw3, d, raw + 3 * N); + StoreU(raw4, d, raw + 4 * N); + StoreU(raw5, d, raw + 5 * N); + StoreU(raw6, d, raw + 6 * N); + StoreU(raw7, d, raw + 7 * N); + StoreU(raw8, d, raw + 8 * N); + StoreU(raw9, d, raw + 9 * N); + StoreU(rawA, d, raw + 0xA * N); + StoreU(rawB, d, raw + 0xB * N); + StoreU(rawC, d, raw + 0xC * N); + StoreU(rawD, d, raw + 0xD * N); + StoreU(rawE, d, raw + 0xE * N); + StoreU(rawF, d, raw + 0xF * N); + } +}; // Pack16<13> + +template <> +struct Pack16<14> { + template + HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw, + uint16_t* HWY_RESTRICT packed_out) const { + using VU16 = Vec; + const size_t N = Lanes(d); + const VU16 raw0 = LoadU(d, raw + 0 * N); + const VU16 raw1 = LoadU(d, raw + 1 * N); + const VU16 raw2 = LoadU(d, raw + 2 * N); + const VU16 raw3 = LoadU(d, raw + 3 * N); + const VU16 raw4 = LoadU(d, raw + 4 * N); + const VU16 raw5 = LoadU(d, raw + 5 * N); + const VU16 raw6 = LoadU(d, raw + 6 * N); + const VU16 raw7 = LoadU(d, raw + 7 * N); + const VU16 raw8 = LoadU(d, raw + 8 * N); + const VU16 raw9 = LoadU(d, raw + 9 * N); + const VU16 rawA = LoadU(d, raw + 0xA * N); + const VU16 rawB = LoadU(d, raw + 0xB * N); + const VU16 rawC = LoadU(d, raw + 0xC * N); + const VU16 rawD = LoadU(d, raw + 0xD * N); + const VU16 rawE = LoadU(d, raw + 0xE * N); + const VU16 rawF = LoadU(d, raw + 0xF * N); + + // 14 vectors, each with 14+2 bits; two raw vectors are scattered + // across the upper 2 bits. 
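+    // rawE fills the upper 2 bits of packed0..6 (rawE bits [1:0] go to
+    // packed0, [3:2] to packed1, ..., [13:12] to packed6); rawF does the
+    // same for packed7..D.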
+ const VU16 hi2 = Set(d, 0xC000u); + const VU16 packed0 = Or(raw0, ShiftLeft<14>(rawE)); + const VU16 packed1 = OrAnd(raw1, ShiftLeft<12>(rawE), hi2); + const VU16 packed2 = OrAnd(raw2, ShiftLeft<10>(rawE), hi2); + const VU16 packed3 = OrAnd(raw3, ShiftLeft<8>(rawE), hi2); + const VU16 packed4 = OrAnd(raw4, ShiftLeft<6>(rawE), hi2); + const VU16 packed5 = OrAnd(raw5, ShiftLeft<4>(rawE), hi2); + const VU16 packed6 = OrAnd(raw6, ShiftLeft<2>(rawE), hi2); + const VU16 packed7 = Or(raw7, ShiftLeft<14>(rawF)); + const VU16 packed8 = OrAnd(raw8, ShiftLeft<12>(rawF), hi2); + const VU16 packed9 = OrAnd(raw9, ShiftLeft<10>(rawF), hi2); + const VU16 packedA = OrAnd(rawA, ShiftLeft<8>(rawF), hi2); + const VU16 packedB = OrAnd(rawB, ShiftLeft<6>(rawF), hi2); + const VU16 packedC = OrAnd(rawC, ShiftLeft<4>(rawF), hi2); + const VU16 packedD = OrAnd(rawD, ShiftLeft<2>(rawF), hi2); + + StoreU(packed0, d, packed_out + 0 * N); + StoreU(packed1, d, packed_out + 1 * N); + StoreU(packed2, d, packed_out + 2 * N); + StoreU(packed3, d, packed_out + 3 * N); + StoreU(packed4, d, packed_out + 4 * N); + StoreU(packed5, d, packed_out + 5 * N); + StoreU(packed6, d, packed_out + 6 * N); + StoreU(packed7, d, packed_out + 7 * N); + StoreU(packed8, d, packed_out + 8 * N); + StoreU(packed9, d, packed_out + 9 * N); + StoreU(packedA, d, packed_out + 0xA * N); + StoreU(packedB, d, packed_out + 0xB * N); + StoreU(packedC, d, packed_out + 0xC * N); + StoreU(packedD, d, packed_out + 0xD * N); + } + + template + HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in, + uint16_t* HWY_RESTRICT raw) const { + using VU16 = Vec; + const size_t N = Lanes(d); + + const VU16 packed0 = BitCast(d, LoadU(d, packed_in + 0 * N)); + const VU16 packed1 = BitCast(d, LoadU(d, packed_in + 1 * N)); + const VU16 packed2 = BitCast(d, LoadU(d, packed_in + 2 * N)); + const VU16 packed3 = BitCast(d, LoadU(d, packed_in + 3 * N)); + const VU16 packed4 = BitCast(d, LoadU(d, packed_in + 4 * N)); + const VU16 packed5 = BitCast(d, LoadU(d, packed_in + 5 * N)); + const VU16 packed6 = BitCast(d, LoadU(d, packed_in + 6 * N)); + const VU16 packed7 = BitCast(d, LoadU(d, packed_in + 7 * N)); + const VU16 packed8 = BitCast(d, LoadU(d, packed_in + 8 * N)); + const VU16 packed9 = BitCast(d, LoadU(d, packed_in + 9 * N)); + const VU16 packedA = BitCast(d, LoadU(d, packed_in + 0xA * N)); + const VU16 packedB = BitCast(d, LoadU(d, packed_in + 0xB * N)); + const VU16 packedC = BitCast(d, LoadU(d, packed_in + 0xC * N)); + const VU16 packedD = BitCast(d, LoadU(d, packed_in + 0xD * N)); + + const VU16 mask = Set(d, 0x3FFFu); // Lowest 14 bits + + const VU16 raw0 = And(packed0, mask); + StoreU(raw0, d, raw + 0 * N); + + const VU16 raw1 = And(packed1, mask); + StoreU(raw1, d, raw + 1 * N); + + const VU16 raw2 = And(packed2, mask); + StoreU(raw2, d, raw + 2 * N); + + const VU16 raw3 = And(packed3, mask); + StoreU(raw3, d, raw + 3 * N); + + const VU16 raw4 = And(packed4, mask); + StoreU(raw4, d, raw + 4 * N); + + const VU16 raw5 = And(packed5, mask); + StoreU(raw5, d, raw + 5 * N); + + const VU16 raw6 = And(packed6, mask); + StoreU(raw6, d, raw + 6 * N); + + const VU16 raw7 = And(packed7, mask); + StoreU(raw7, d, raw + 7 * N); + + const VU16 raw8 = And(packed8, mask); + StoreU(raw8, d, raw + 8 * N); + + const VU16 raw9 = And(packed9, mask); + StoreU(raw9, d, raw + 9 * N); + + const VU16 rawA = And(packedA, mask); + StoreU(rawA, d, raw + 0xA * N); + + const VU16 rawB = And(packedB, mask); + StoreU(rawB, d, raw + 0xB * N); + + const VU16 rawC = And(packedC, mask); + 
StoreU(rawC, d, raw + 0xC * N);
+
+    const VU16 rawD = And(packedD, mask);
+    StoreU(rawD, d, raw + 0xD * N);
+
+    // rawE is the concatenation of the top two bits in packed0..6.
+    const VU16 E0 = Xor3(ShiftRight<14>(packed0),  //
+                         ShiftRight<12>(AndNot(mask, packed1)),
+                         ShiftRight<10>(AndNot(mask, packed2)));
+    const VU16 E1 = Xor3(ShiftRight<8>(AndNot(mask, packed3)),
+                         ShiftRight<6>(AndNot(mask, packed4)),
+                         ShiftRight<4>(AndNot(mask, packed5)));
+    const VU16 rawE = Xor3(ShiftRight<2>(AndNot(mask, packed6)), E0, E1);
+    const VU16 F0 = Xor3(ShiftRight<14>(AndNot(mask, packed7)),
+                         ShiftRight<12>(AndNot(mask, packed8)),
+                         ShiftRight<10>(AndNot(mask, packed9)));
+    const VU16 F1 = Xor3(ShiftRight<8>(AndNot(mask, packedA)),
+                         ShiftRight<6>(AndNot(mask, packedB)),
+                         ShiftRight<4>(AndNot(mask, packedC)));
+    const VU16 rawF = Xor3(ShiftRight<2>(AndNot(mask, packedD)), F0, F1);
+    StoreU(rawE, d, raw + 0xE * N);
+    StoreU(rawF, d, raw + 0xF * N);
+  }
+};  // Pack16<14>
+
+template <>
+struct Pack16<15> {
+  template <class D>
+  HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
+                       uint16_t* HWY_RESTRICT packed_out) const {
+    using VU16 = Vec<decltype(d)>;
+    const size_t N = Lanes(d);
+    const VU16 raw0 = LoadU(d, raw + 0 * N);
+    const VU16 raw1 = LoadU(d, raw + 1 * N);
+    const VU16 raw2 = LoadU(d, raw + 2 * N);
+    const VU16 raw3 = LoadU(d, raw + 3 * N);
+    const VU16 raw4 = LoadU(d, raw + 4 * N);
+    const VU16 raw5 = LoadU(d, raw + 5 * N);
+    const VU16 raw6 = LoadU(d, raw + 6 * N);
+    const VU16 raw7 = LoadU(d, raw + 7 * N);
+    const VU16 raw8 = LoadU(d, raw + 8 * N);
+    const VU16 raw9 = LoadU(d, raw + 9 * N);
+    const VU16 rawA = LoadU(d, raw + 0xA * N);
+    const VU16 rawB = LoadU(d, raw + 0xB * N);
+    const VU16 rawC = LoadU(d, raw + 0xC * N);
+    const VU16 rawD = LoadU(d, raw + 0xD * N);
+    const VU16 rawE = LoadU(d, raw + 0xE * N);
+    const VU16 rawF = LoadU(d, raw + 0xF * N);
+
+    // 15 vectors, each with 15+1 bits; the last raw vector (rawF) is
+    // scattered across their upper bits.
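+    // rawF contributes one bit per packed vector: its bit 0 lands in the
+    // MSB of packed0 (via ShiftLeft<15>), bit 1 in packed1 (ShiftLeft<14>),
+    // ..., bit 14 in packedE (ShiftLeft<1>).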
+ const VU16 hi1 = Set(d, 0x8000u); + const VU16 packed0 = Or(raw0, ShiftLeft<15>(rawF)); + const VU16 packed1 = OrAnd(raw1, ShiftLeft<14>(rawF), hi1); + const VU16 packed2 = OrAnd(raw2, ShiftLeft<13>(rawF), hi1); + const VU16 packed3 = OrAnd(raw3, ShiftLeft<12>(rawF), hi1); + const VU16 packed4 = OrAnd(raw4, ShiftLeft<11>(rawF), hi1); + const VU16 packed5 = OrAnd(raw5, ShiftLeft<10>(rawF), hi1); + const VU16 packed6 = OrAnd(raw6, ShiftLeft<9>(rawF), hi1); + const VU16 packed7 = OrAnd(raw7, ShiftLeft<8>(rawF), hi1); + const VU16 packed8 = OrAnd(raw8, ShiftLeft<7>(rawF), hi1); + const VU16 packed9 = OrAnd(raw9, ShiftLeft<6>(rawF), hi1); + const VU16 packedA = OrAnd(rawA, ShiftLeft<5>(rawF), hi1); + const VU16 packedB = OrAnd(rawB, ShiftLeft<4>(rawF), hi1); + const VU16 packedC = OrAnd(rawC, ShiftLeft<3>(rawF), hi1); + const VU16 packedD = OrAnd(rawD, ShiftLeft<2>(rawF), hi1); + const VU16 packedE = OrAnd(rawE, ShiftLeft<1>(rawF), hi1); + + StoreU(packed0, d, packed_out + 0 * N); + StoreU(packed1, d, packed_out + 1 * N); + StoreU(packed2, d, packed_out + 2 * N); + StoreU(packed3, d, packed_out + 3 * N); + StoreU(packed4, d, packed_out + 4 * N); + StoreU(packed5, d, packed_out + 5 * N); + StoreU(packed6, d, packed_out + 6 * N); + StoreU(packed7, d, packed_out + 7 * N); + StoreU(packed8, d, packed_out + 8 * N); + StoreU(packed9, d, packed_out + 9 * N); + StoreU(packedA, d, packed_out + 0xA * N); + StoreU(packedB, d, packed_out + 0xB * N); + StoreU(packedC, d, packed_out + 0xC * N); + StoreU(packedD, d, packed_out + 0xD * N); + StoreU(packedE, d, packed_out + 0xE * N); + } + + template + HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in, + uint16_t* HWY_RESTRICT raw) const { + using VU16 = Vec; + const size_t N = Lanes(d); + + const VU16 packed0 = BitCast(d, LoadU(d, packed_in + 0 * N)); + const VU16 packed1 = BitCast(d, LoadU(d, packed_in + 1 * N)); + const VU16 packed2 = BitCast(d, LoadU(d, packed_in + 2 * N)); + const VU16 packed3 = BitCast(d, LoadU(d, packed_in + 3 * N)); + const VU16 packed4 = BitCast(d, LoadU(d, packed_in + 4 * N)); + const VU16 packed5 = BitCast(d, LoadU(d, packed_in + 5 * N)); + const VU16 packed6 = BitCast(d, LoadU(d, packed_in + 6 * N)); + const VU16 packed7 = BitCast(d, LoadU(d, packed_in + 7 * N)); + const VU16 packed8 = BitCast(d, LoadU(d, packed_in + 8 * N)); + const VU16 packed9 = BitCast(d, LoadU(d, packed_in + 9 * N)); + const VU16 packedA = BitCast(d, LoadU(d, packed_in + 0xA * N)); + const VU16 packedB = BitCast(d, LoadU(d, packed_in + 0xB * N)); + const VU16 packedC = BitCast(d, LoadU(d, packed_in + 0xC * N)); + const VU16 packedD = BitCast(d, LoadU(d, packed_in + 0xD * N)); + const VU16 packedE = BitCast(d, LoadU(d, packed_in + 0xE * N)); + + const VU16 mask = Set(d, 0x7FFFu); // Lowest 15 bits + + const VU16 raw0 = And(packed0, mask); + StoreU(raw0, d, raw + 0 * N); + + const VU16 raw1 = And(packed1, mask); + StoreU(raw1, d, raw + 1 * N); + + const VU16 raw2 = And(packed2, mask); + StoreU(raw2, d, raw + 2 * N); + + const VU16 raw3 = And(packed3, mask); + StoreU(raw3, d, raw + 3 * N); + + const VU16 raw4 = And(packed4, mask); + StoreU(raw4, d, raw + 4 * N); + + const VU16 raw5 = And(packed5, mask); + StoreU(raw5, d, raw + 5 * N); + + const VU16 raw6 = And(packed6, mask); + StoreU(raw6, d, raw + 6 * N); + + const VU16 raw7 = And(packed7, mask); + StoreU(raw7, d, raw + 7 * N); + + const VU16 raw8 = And(packed8, mask); + StoreU(raw8, d, raw + 8 * N); + + const VU16 raw9 = And(packed9, mask); + StoreU(raw9, d, raw + 9 * N); + + const VU16 rawA 
= And(packedA, mask); + StoreU(rawA, d, raw + 0xA * N); + + const VU16 rawB = And(packedB, mask); + StoreU(rawB, d, raw + 0xB * N); + + const VU16 rawC = And(packedC, mask); + StoreU(rawC, d, raw + 0xC * N); + + const VU16 rawD = And(packedD, mask); + StoreU(rawD, d, raw + 0xD * N); + + const VU16 rawE = And(packedE, mask); + StoreU(rawE, d, raw + 0xE * N); + + // rawF is the concatenation of the top bit in packed0..E. + const VU16 F0 = Xor3(ShiftRight<15>(packed0), // + ShiftRight<14>(AndNot(mask, packed1)), + ShiftRight<13>(AndNot(mask, packed2))); + const VU16 F1 = Xor3(ShiftRight<12>(AndNot(mask, packed3)), + ShiftRight<11>(AndNot(mask, packed4)), + ShiftRight<10>(AndNot(mask, packed5))); + const VU16 F2 = Xor3(ShiftRight<9>(AndNot(mask, packed6)), + ShiftRight<8>(AndNot(mask, packed7)), + ShiftRight<7>(AndNot(mask, packed8))); + const VU16 F3 = Xor3(ShiftRight<6>(AndNot(mask, packed9)), + ShiftRight<5>(AndNot(mask, packedA)), + ShiftRight<4>(AndNot(mask, packedB))); + const VU16 F4 = Xor3(ShiftRight<3>(AndNot(mask, packedC)), + ShiftRight<2>(AndNot(mask, packedD)), + ShiftRight<1>(AndNot(mask, packedE))); + const VU16 rawF = Xor3(F0, F1, Xor3(F2, F3, F4)); + StoreU(rawF, d, raw + 0xF * N); + } +}; // Pack16<15> + +template <> +struct Pack16<16> { + template + HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw, + uint16_t* HWY_RESTRICT packed_out) const { + using VU16 = Vec; + const size_t N = Lanes(d); + const VU16 raw0 = LoadU(d, raw + 0 * N); + const VU16 raw1 = LoadU(d, raw + 1 * N); + const VU16 raw2 = LoadU(d, raw + 2 * N); + const VU16 raw3 = LoadU(d, raw + 3 * N); + const VU16 raw4 = LoadU(d, raw + 4 * N); + const VU16 raw5 = LoadU(d, raw + 5 * N); + const VU16 raw6 = LoadU(d, raw + 6 * N); + const VU16 raw7 = LoadU(d, raw + 7 * N); + const VU16 raw8 = LoadU(d, raw + 8 * N); + const VU16 raw9 = LoadU(d, raw + 9 * N); + const VU16 rawA = LoadU(d, raw + 0xA * N); + const VU16 rawB = LoadU(d, raw + 0xB * N); + const VU16 rawC = LoadU(d, raw + 0xC * N); + const VU16 rawD = LoadU(d, raw + 0xD * N); + const VU16 rawE = LoadU(d, raw + 0xE * N); + const VU16 rawF = LoadU(d, raw + 0xF * N); + + StoreU(raw0, d, packed_out + 0 * N); + StoreU(raw1, d, packed_out + 1 * N); + StoreU(raw2, d, packed_out + 2 * N); + StoreU(raw3, d, packed_out + 3 * N); + StoreU(raw4, d, packed_out + 4 * N); + StoreU(raw5, d, packed_out + 5 * N); + StoreU(raw6, d, packed_out + 6 * N); + StoreU(raw7, d, packed_out + 7 * N); + StoreU(raw8, d, packed_out + 8 * N); + StoreU(raw9, d, packed_out + 9 * N); + StoreU(rawA, d, packed_out + 0xA * N); + StoreU(rawB, d, packed_out + 0xB * N); + StoreU(rawC, d, packed_out + 0xC * N); + StoreU(rawD, d, packed_out + 0xD * N); + StoreU(rawE, d, packed_out + 0xE * N); + StoreU(rawF, d, packed_out + 0xF * N); + } + + template + HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in, + uint16_t* HWY_RESTRICT raw) const { + using VU16 = Vec; + const size_t N = Lanes(d); + + const VU16 raw0 = BitCast(d, LoadU(d, packed_in + 0 * N)); + const VU16 raw1 = BitCast(d, LoadU(d, packed_in + 1 * N)); + const VU16 raw2 = BitCast(d, LoadU(d, packed_in + 2 * N)); + const VU16 raw3 = BitCast(d, LoadU(d, packed_in + 3 * N)); + const VU16 raw4 = BitCast(d, LoadU(d, packed_in + 4 * N)); + const VU16 raw5 = BitCast(d, LoadU(d, packed_in + 5 * N)); + const VU16 raw6 = BitCast(d, LoadU(d, packed_in + 6 * N)); + const VU16 raw7 = BitCast(d, LoadU(d, packed_in + 7 * N)); + const VU16 raw8 = BitCast(d, LoadU(d, packed_in + 8 * N)); + const VU16 raw9 = BitCast(d, LoadU(d, packed_in + 
9 * N)); + const VU16 rawA = BitCast(d, LoadU(d, packed_in + 0xA * N)); + const VU16 rawB = BitCast(d, LoadU(d, packed_in + 0xB * N)); + const VU16 rawC = BitCast(d, LoadU(d, packed_in + 0xC * N)); + const VU16 rawD = BitCast(d, LoadU(d, packed_in + 0xD * N)); + const VU16 rawE = BitCast(d, LoadU(d, packed_in + 0xE * N)); + const VU16 rawF = BitCast(d, LoadU(d, packed_in + 0xF * N)); + + StoreU(raw0, d, raw + 0 * N); + StoreU(raw1, d, raw + 1 * N); + StoreU(raw2, d, raw + 2 * N); + StoreU(raw3, d, raw + 3 * N); + StoreU(raw4, d, raw + 4 * N); + StoreU(raw5, d, raw + 5 * N); + StoreU(raw6, d, raw + 6 * N); + StoreU(raw7, d, raw + 7 * N); + StoreU(raw8, d, raw + 8 * N); + StoreU(raw9, d, raw + 9 * N); + StoreU(rawA, d, raw + 0xA * N); + StoreU(rawB, d, raw + 0xB * N); + StoreU(rawC, d, raw + 0xC * N); + StoreU(rawD, d, raw + 0xD * N); + StoreU(rawE, d, raw + 0xE * N); + StoreU(rawF, d, raw + 0xF * N); + } +}; // Pack16<16> + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#endif // HIGHWAY_HWY_CONTRIB_BIT_PACK_INL_H_ diff --git a/third_party/highway/hwy/contrib/bit_pack/bit_pack_test.cc b/third_party/highway/hwy/contrib/bit_pack/bit_pack_test.cc new file mode 100644 index 0000000000..4994e1489a --- /dev/null +++ b/third_party/highway/hwy/contrib/bit_pack/bit_pack_test.cc @@ -0,0 +1,206 @@ +// Copyright 2022 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include + +#include "hwy/aligned_allocator.h" +#include "hwy/base.h" +#include "hwy/nanobenchmark.h" + +// clang-format off +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "hwy/contrib/bit_pack/bit_pack_test.cc" // NOLINT +#include "hwy/foreach_target.h" // IWYU pragma: keep +#include "hwy/highway.h" +#include "hwy/contrib/bit_pack/bit_pack-inl.h" +#include "hwy/tests/test_util-inl.h" +// clang-format on + +#ifndef HWY_BIT_PACK_BENCHMARK +#define HWY_BIT_PACK_BENCHMARK 0 +#endif + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +// Used to prevent running benchmark (slow) for partial vectors and targets +// except the best available. Global, not per-target, hence must be outside +// HWY_NAMESPACE. Declare first because HWY_ONCE is only true after some code +// has been re-included. 
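+// foreach_target.h re-includes this file once per target, so the externs
+// below are visible in every pass while the definitions further down are
+// compiled only in the single pass where HWY_ONCE is true.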
+extern size_t last_bits;
+extern uint64_t best_target;
+#if HWY_ONCE
+size_t last_bits = 0;
+uint64_t best_target = ~0ull;
+#endif
+namespace HWY_NAMESPACE {
+
+template <typename T, size_t kBits>
+T Random(RandomState& rng) {
+  return static_cast<T>(Random32(&rng) & kBits);
+}
+
+template <typename T>
+class Checker {
+ public:
+  explicit Checker(size_t num) { raw_.reserve(num); }
+  void NotifyRaw(T raw) { raw_.push_back(raw); }
+
+  void NotifyRawOutput(size_t bits, T raw) {
+    if (raw_[num_verified_] != raw) {
+      HWY_ABORT("%zu bits: pos %zu of %zu, expected %.0f actual %.0f\n", bits,
+                num_verified_, raw_.size(),
+                static_cast<double>(raw_[num_verified_]),
+                static_cast<double>(raw));
+    }
+    ++num_verified_;
+  }
+
+ private:
+  std::vector<T> raw_;
+  size_t num_verified_ = 0;
+};
+
+template