From ef24de24a82fe681581cc130f342363c47c0969a Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Fri, 7 Jun 2024 07:48:48 +0200 Subject: Merging upstream version 1.75.0+dfsg1. Signed-off-by: Daniel Baumann --- vendor/bytecount/.cargo-checksum.json | 2 +- vendor/bytecount/Cargo.toml | 5 +- vendor/bytecount/README.md | 4 +- vendor/bytecount/benches/bench.rs | 69 ++++++++------- vendor/bytecount/ci/miri.sh | 5 +- vendor/bytecount/src/integer_simd.rs | 20 +++-- vendor/bytecount/src/lib.rs | 35 ++++++-- vendor/bytecount/src/naive.rs | 9 +- vendor/bytecount/src/simd/aarch64.rs | 157 ++++++++++++++++++++++++++++++++++ vendor/bytecount/src/simd/generic.rs | 20 +++-- vendor/bytecount/src/simd/mod.rs | 4 + vendor/bytecount/src/simd/x86_avx2.rs | 52 +++++------ vendor/bytecount/src/simd/x86_sse2.rs | 8 +- vendor/bytecount/tests/check.rs | 7 +- 14 files changed, 296 insertions(+), 101 deletions(-) mode change 100755 => 100644 vendor/bytecount/ci/miri.sh create mode 100644 vendor/bytecount/src/simd/aarch64.rs (limited to 'vendor/bytecount') diff --git a/vendor/bytecount/.cargo-checksum.json b/vendor/bytecount/.cargo-checksum.json index 9873561f5..dc54fec7c 100644 --- a/vendor/bytecount/.cargo-checksum.json +++ b/vendor/bytecount/.cargo-checksum.json @@ -1 +1 @@ -{"files":{"Cargo.toml":"6c717facc1c70c392d84a7016618b5dab0de6288bed23ca8487b53066a5b46d6","LICENSE.Apache2":"b40930bbcf80744c86c46a12bc9da056641d722716c378f5659b9e555ef833e1","LICENSE.MIT":"a5dea80c1f383cb5f80a6bb0da5e55a2beb9f24adb123ce6300af2cbaaa3bf65","README.md":"82e221d833de855297b37c2ab48547ede0c7db5a7c303dd9fc314f7250abe456","benches/bench.rs":"085734cc7b3c3f6da30eeab8bf98d95fc2a9c0886c63f93edd98df356674bf8e","ci/miri.sh":"fb8d52144352c6b09ac925fb089177d30f46ab8b17dd378d68e414c6b6b5b709","src/integer_simd.rs":"5cd97182cd5aea5c5eab1e1cb5d9d409f263de9083c8e69cd7f9d1dd379dfb52","src/lib.rs":"000a6591306852b9ec892995cd6ab4247b8a727f4befa9dd6a9851c20a5f6552","src/naive.rs":"068dae3fba7d721227bb7a9ed9bdfe4a12cfb373a9de4852d6a517630fabf0c8","src/simd/generic.rs":"2f01963147fa97b5dbd7a166975e6f6660d3197f4b6d0a78ed138da9f78a269e","src/simd/mod.rs":"8ce4aac79520bd0d49f6e48471bf32a2d6d3cd8a3a1c65e393f4f4fae6c7f502","src/simd/x86_avx2.rs":"0345d5fbb74d907e3e6cd0d361427ce75645e7abf4c6efa17ef20ee8a4d8fc18","src/simd/x86_sse2.rs":"e7caab115d77118e6a7bca8daea4a9b6d2d318bd3e76ddc23dc4f023aced1f77","tests/check.rs":"e5075559155a1aae6326932e68410996ca52e4140db4b26eb8b9f03a07241b87"},"package":"2c676a478f63e9fa2dd5368a42f28bba0d6c560b775f38583c8bbaa7fcd67c9c"} \ No newline at end of file +{"files":{"Cargo.toml":"f97173c1fe76d88af268e9f9b352b28778bb557c843d906f331c4835153ebd68","LICENSE.Apache2":"b40930bbcf80744c86c46a12bc9da056641d722716c378f5659b9e555ef833e1","LICENSE.MIT":"a5dea80c1f383cb5f80a6bb0da5e55a2beb9f24adb123ce6300af2cbaaa3bf65","README.md":"7635ab65fbcaf6a8c9be2352061d32ec36df029a04b4d7d1e2dc0451cc57a883","benches/bench.rs":"c40a5b875fc0620c5f2e16ab0f62d9b78d09bf9e6edd4c5b5e781dfbaf892030","ci/miri.sh":"d633bcd4e428b7d713c05b847c88a3e375dbc42ecf9c7d4fd375efccfeb83fac","src/integer_simd.rs":"a4f60db4a902d458d0bc67dd918bc940de351c73133a81f39de21f70af2b52d2","src/lib.rs":"6be1425d3dfbe104cdfe7273e5a9e014953a1d7eb172b21d911af6f5e6f09585","src/naive.rs":"ea613ac2342fd55cd6a2dc3e5b71f65cc6d65ae1a5c302da8270f2e8cf16d503","src/simd/aarch64.rs":"aef42ab239ecd39cb59c939ae9a8e78ff0a02dd40c6320d835639f8d1ca494df","src/simd/generic.rs":"107bd263df09fa91fb551b4e96ef64a604aa1819e55e50d777f15fbb8eb02aae","src/simd/mod.rs":"570af12dec20692b9f92d779c9b9d3923a4f80d18cf81f35e73cf29d8346054c","src/simd/x86_avx2.rs":"48adfbc7a8a21968d67d14f429df00cf83bceb26b055c89cb708fd300edf771e","src/simd/x86_sse2.rs":"cfeb0468dbeb51e3e12a058f93b4eebaa262f0d31a2dbe2ab57c088143f21baa","tests/check.rs":"599d9f9cd296ad08e099e176d9ec9dabfde9782429fd405d33395564e7507ad0"},"package":"ad152d03a2c813c80bb94fedbf3a3f02b28f793e39e7c214c8a0bcc196343de7"} \ No newline at end of file diff --git a/vendor/bytecount/Cargo.toml b/vendor/bytecount/Cargo.toml index 4197a82ca..0525fcbe6 100644 --- a/vendor/bytecount/Cargo.toml +++ b/vendor/bytecount/Cargo.toml @@ -12,7 +12,7 @@ [package] edition = "2018" name = "bytecount" -version = "0.6.3" +version = "0.6.4" authors = [ "Andre Bogus ", "Joshua Landau ", @@ -40,10 +40,9 @@ harness = false [dependencies.packed_simd] version = "0.3.8" optional = true -package = "packed_simd_2" [dev-dependencies.criterion] -version = "0.3" +version = "0.4" default-features = false [dev-dependencies.quickcheck] diff --git a/vendor/bytecount/README.md b/vendor/bytecount/README.md index 9ec89820f..91aa2101a 100644 --- a/vendor/bytecount/README.md +++ b/vendor/bytecount/README.md @@ -2,7 +2,7 @@ Counting bytes really fast -[![Build Status](https://travis-ci.org/llogiq/bytecount.svg?branch=master)](https://travis-ci.org/llogiq/bytecount) +[![Continuous integration](https://github.com/llogiq/bytecount/actions/workflows/ci.yml/badge.svg)](https://github.com/llogiq/bytecount/actions/workflows/ci.yml) [![Windows build status](https://ci.appveyor.com/api/projects/status/github/llogiq/bytecount?svg=true)](https://ci.appveyor.com/project/llogiq/bytecount) [![Current Version](https://img.shields.io/crates/v/bytecount.svg)](https://crates.io/crates/bytecount) [![License: Apache 2.0/MIT](https://img.shields.io/crates/l/bytecount.svg)](#license) @@ -12,7 +12,7 @@ The [newlinebench](https://github.com/llogiq/newlinebench) repository has furthe To use bytecount in your crate, if you have [cargo-edit](https://github.com/killercup/cargo-edit), just type `cargo add bytecount` in a terminal with the crate root as the current path. Otherwise you can manually edit your -`Cargo.toml` to add `bytecount = 0.6.3` to your `[dependencies]` section. +`Cargo.toml` to add `bytecount = 0.6.4` to your `[dependencies]` section. In your crate root (`lib.rs` or `main.rs`, depending on if you are writing a library or application), add `extern crate bytecount;`. Now you can simply use diff --git a/vendor/bytecount/benches/bench.rs b/vendor/bytecount/benches/bench.rs index 85d04dbb4..2e091fccc 100644 --- a/vendor/bytecount/benches/bench.rs +++ b/vendor/bytecount/benches/bench.rs @@ -1,17 +1,14 @@ #[macro_use] extern crate criterion; -extern crate rand; extern crate bytecount; +extern crate rand; +use criterion::{Bencher, BenchmarkId, Criterion}; +use rand::RngCore; use std::env; use std::time::Duration; -use rand::RngCore; -use criterion::{Bencher, Criterion, ParameterizedBenchmark}; -use bytecount::{ - count, naive_count, naive_count_32, - num_chars, naive_num_chars, -}; +use bytecount::{count, naive_count, naive_count_32, naive_num_chars, num_chars}; fn random_bytes(len: usize) -> Vec { let mut result = vec![0; len]; @@ -19,25 +16,29 @@ fn random_bytes(len: usize) -> Vec { result } -static COUNTS : &[usize] = &[0, 10, 20, 30, 40, 50, 60, 70, 80, 90, - 100, 120, 140, 170, 210, 250, 300, 400, 500, 600, 700, 800, 900, - 1000, 1_000, 1_200, 1_400, 1_700, 2_100, 2_500, 3_000, 4_000, - 5_000, 6_000, 7_000, 8_000, 9_000, 10_000, 12_000, 14_000, 17_000, - 21_000, 25_000, 30_000, 100_000, 1_000_000]; +static COUNTS: &[usize] = &[ + 0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 120, 140, 170, 210, 250, 300, 400, 500, 600, 700, + 800, 900, 1_000, 1_200, 1_400, 1_700, 2_100, 2_500, 3_000, 4_000, 5_000, 6_000, 7_000, 8_000, + 9_000, 10_000, 12_000, 14_000, 17_000, 21_000, 25_000, 30_000, 100_000, 1_000_000, +]; fn get_counts() -> Vec { - env::var("COUNTS").map( - |s| s.split(',').map( - |n| str::parse::(n).unwrap()).collect()) + env::var("COUNTS") + .map(|s| { + s.split(',') + .map(|n| str::parse::(n).unwrap()) + .collect() + }) .unwrap_or(COUNTS.to_owned()) } fn get_config() -> Criterion { if env::var("CI").is_ok() { - Criterion::default().nresamples(5_000) - .without_plots() - .measurement_time(Duration::new(2, 0)) - .warm_up_time(Duration::new(1, 0)) + Criterion::default() + .nresamples(5_000) + .without_plots() + .measurement_time(Duration::new(2, 0)) + .warm_up_time(Duration::new(1, 0)) } else { Criterion::default() } @@ -45,37 +46,43 @@ fn get_config() -> Criterion { fn bench_counts(criterion: &mut Criterion) { fn naive(b: &mut Bencher, s: &usize) { - let haystack = random_bytes(*s); + let haystack = random_bytes(*s); b.iter(|| naive_count(&haystack, 10)) } fn naive_32(b: &mut Bencher, s: &usize) { - let haystack = random_bytes(*s); + let haystack = random_bytes(*s); b.iter(|| naive_count_32(&haystack, 10)) } fn hyper(b: &mut Bencher, s: &usize) { - let haystack = random_bytes(*s); + let haystack = random_bytes(*s); b.iter(|| count(&haystack, 10)) } let counts = get_counts(); - criterion.bench("counts", - ParameterizedBenchmark::new("naive", naive, counts) - .with_function("naive_32", naive_32) - .with_function("hyper", hyper)); + let mut group = criterion.benchmark_group("counts"); + for count in counts { + group.throughput(criterion::Throughput::Bytes(count as u64)); + group.bench_with_input(BenchmarkId::new("naive", count), &count, naive); + group.bench_with_input(BenchmarkId::new("naive_32", count), &count, naive_32); + group.bench_with_input(BenchmarkId::new("hyper", count), &count, hyper); + } } fn bench_num_chars(criterion: &mut Criterion) { fn naive(b: &mut Bencher, s: &usize) { - let haystack = random_bytes(*s); + let haystack = random_bytes(*s); b.iter(|| naive_num_chars(&haystack)) } fn hyper(b: &mut Bencher, s: &usize) { - let haystack = random_bytes(*s); + let haystack = random_bytes(*s); b.iter(|| num_chars(&haystack)) } let counts = get_counts(); - criterion.bench("num_chars", - ParameterizedBenchmark::new("naive", naive, counts) - .with_function("hyper", hyper)); + let mut group = criterion.benchmark_group("num_chars"); + for count in counts { + group.throughput(criterion::Throughput::Bytes(count as u64)); + group.bench_with_input(BenchmarkId::new("naive", count), &count, naive); + group.bench_with_input(BenchmarkId::new("hyper", count), &count, hyper); + } } criterion_group!(name = count_bench; config = get_config(); targets = bench_counts); diff --git a/vendor/bytecount/ci/miri.sh b/vendor/bytecount/ci/miri.sh old mode 100755 new mode 100644 index 8704ebc1d..5cb6f82dc --- a/vendor/bytecount/ci/miri.sh +++ b/vendor/bytecount/ci/miri.sh @@ -9,7 +9,4 @@ rustup component add miri # Run tests cargo miri test -cargo miri test --target=mips64-unknown-linux-gnuabi64 # big-endian architecture - -# Restore old state in case Travis uses this cache for other jobs. -rustup default nightly +cargo miri test --target=mips64-unknown-linux-gnuabi64 # big-endian architecture \ No newline at end of file diff --git a/vendor/bytecount/src/integer_simd.rs b/vendor/bytecount/src/integer_simd.rs index 48f2ee8d9..060419462 100644 --- a/vendor/bytecount/src/integer_simd.rs +++ b/vendor/bytecount/src/integer_simd.rs @@ -13,7 +13,7 @@ unsafe fn usize_load_unchecked(bytes: &[u8], offset: usize) -> usize { ptr::copy_nonoverlapping( bytes.as_ptr().add(offset), &mut output as *mut usize as *mut u8, - mem::size_of::() + mem::size_of::(), ); output } @@ -65,11 +65,17 @@ pub fn chunk_count(haystack: &[u8], needle: u8) -> usize { // 8 let mut counts = 0; for i in 0..(haystack.len() - offset) / chunksize { - counts += bytewise_equal(usize_load_unchecked(haystack, offset + i * chunksize), needles); + counts += bytewise_equal( + usize_load_unchecked(haystack, offset + i * chunksize), + needles, + ); } if haystack.len() % 8 != 0 { let mask = usize::from_le(!(!0 >> ((haystack.len() % chunksize) * 8))); - counts += bytewise_equal(usize_load_unchecked(haystack, haystack.len() - chunksize), needles) & mask; + counts += bytewise_equal( + usize_load_unchecked(haystack, haystack.len() - chunksize), + needles, + ) & mask; } count += sum_usize(counts); @@ -98,11 +104,15 @@ pub fn chunk_num_chars(utf8_chars: &[u8]) -> usize { // 8 let mut counts = 0; for i in 0..(utf8_chars.len() - offset) / chunksize { - counts += is_leading_utf8_byte(usize_load_unchecked(utf8_chars, offset + i * chunksize)); + counts += + is_leading_utf8_byte(usize_load_unchecked(utf8_chars, offset + i * chunksize)); } if utf8_chars.len() % 8 != 0 { let mask = usize::from_le(!(!0 >> ((utf8_chars.len() % chunksize) * 8))); - counts += is_leading_utf8_byte(usize_load_unchecked(utf8_chars, utf8_chars.len() - chunksize)) & mask; + counts += is_leading_utf8_byte(usize_load_unchecked( + utf8_chars, + utf8_chars.len() - chunksize, + )) & mask; } count += sum_usize(counts); diff --git a/vendor/bytecount/src/lib.rs b/vendor/bytecount/src/lib.rs index ef4235c26..24f40182b 100644 --- a/vendor/bytecount/src/lib.rs +++ b/vendor/bytecount/src/lib.rs @@ -32,7 +32,6 @@ //! still on small strings. #![deny(missing_docs)] - #![cfg_attr(not(feature = "runtime-dispatch-simd"), no_std)] #[cfg(not(feature = "runtime-dispatch-simd"))] @@ -45,7 +44,11 @@ pub use naive::*; mod integer_simd; #[cfg(any( - all(feature = "runtime-dispatch-simd", any(target_arch = "x86", target_arch = "x86_64")), + all( + feature = "runtime-dispatch-simd", + any(target_arch = "x86", target_arch = "x86_64") + ), + target_arch = "aarch64", feature = "generic-simd" ))] mod simd; @@ -64,7 +67,9 @@ pub fn count(haystack: &[u8], needle: u8) -> usize { #[cfg(all(feature = "runtime-dispatch-simd", target_arch = "x86_64"))] { if is_x86_feature_detected!("avx2") { - unsafe { return simd::x86_avx2::chunk_count(haystack, needle); } + unsafe { + return simd::x86_avx2::chunk_count(haystack, needle); + } } } @@ -80,7 +85,15 @@ pub fn count(haystack: &[u8], needle: u8) -> usize { ))] { if is_x86_feature_detected!("sse2") { - unsafe { return simd::x86_sse2::chunk_count(haystack, needle); } + unsafe { + return simd::x86_sse2::chunk_count(haystack, needle); + } + } + } + #[cfg(all(target_arch = "aarch64", not(feature = "generic_simd")))] + { + unsafe { + return simd::aarch64::chunk_count(haystack, needle); } } } @@ -109,7 +122,9 @@ pub fn num_chars(utf8_chars: &[u8]) -> usize { #[cfg(all(feature = "runtime-dispatch-simd", target_arch = "x86_64"))] { if is_x86_feature_detected!("avx2") { - unsafe { return simd::x86_avx2::chunk_num_chars(utf8_chars); } + unsafe { + return simd::x86_avx2::chunk_num_chars(utf8_chars); + } } } @@ -125,7 +140,15 @@ pub fn num_chars(utf8_chars: &[u8]) -> usize { ))] { if is_x86_feature_detected!("sse2") { - unsafe { return simd::x86_sse2::chunk_num_chars(utf8_chars); } + unsafe { + return simd::x86_sse2::chunk_num_chars(utf8_chars); + } + } + } + #[cfg(all(target_arch = "aarch64", not(feature = "generic_simd")))] + { + unsafe { + return simd::aarch64::chunk_num_chars(utf8_chars); } } } diff --git a/vendor/bytecount/src/naive.rs b/vendor/bytecount/src/naive.rs index 315c4b675..e3f6cf6c6 100644 --- a/vendor/bytecount/src/naive.rs +++ b/vendor/bytecount/src/naive.rs @@ -22,7 +22,9 @@ pub fn naive_count_32(haystack: &[u8], needle: u8) -> usize { /// assert_eq!(number_of_spaces, 6); /// ``` pub fn naive_count(utf8_chars: &[u8], needle: u8) -> usize { - utf8_chars.iter().fold(0, |n, c| n + (*c == needle) as usize) + utf8_chars + .iter() + .fold(0, |n, c| n + (*c == needle) as usize) } /// Count the number of UTF-8 encoded Unicode codepoints in a slice of bytes, simple @@ -38,5 +40,8 @@ pub fn naive_count(utf8_chars: &[u8], needle: u8) -> usize { /// assert_eq!(char_count, 4); /// ``` pub fn naive_num_chars(utf8_chars: &[u8]) -> usize { - utf8_chars.iter().filter(|&&byte| (byte >> 6) != 0b10).count() + utf8_chars + .iter() + .filter(|&&byte| (byte >> 6) != 0b10) + .count() } diff --git a/vendor/bytecount/src/simd/aarch64.rs b/vendor/bytecount/src/simd/aarch64.rs new file mode 100644 index 000000000..6544355fa --- /dev/null +++ b/vendor/bytecount/src/simd/aarch64.rs @@ -0,0 +1,157 @@ +use core::arch::aarch64::{ + uint8x16_t, uint8x16x4_t, vaddlvq_u8, vandq_u8, vceqq_u8, vdupq_n_u8, vld1q_u8, vld1q_u8_x4, + vmvnq_u8, vsubq_u8, +}; + +const MASK: [u8; 32] = [ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, +]; + +#[target_feature(enable = "neon")] +unsafe fn u8x16_from_offset(slice: &[u8], offset: usize) -> uint8x16_t { + debug_assert!( + offset + 16 <= slice.len(), + "{} + 16 ≥ {}", + offset, + slice.len() + ); + vld1q_u8(slice.as_ptr().add(offset) as *const _) // TODO: does this need to be aligned? +} + +#[target_feature(enable = "neon")] +unsafe fn u8x16_x4_from_offset(slice: &[u8], offset: usize) -> uint8x16x4_t { + debug_assert!( + offset + 64 <= slice.len(), + "{} + 64 ≥ {}", + offset, + slice.len() + ); + vld1q_u8_x4(slice.as_ptr().add(offset) as *const _) +} + +#[target_feature(enable = "neon")] +unsafe fn sum(u8s: uint8x16_t) -> usize { + vaddlvq_u8(u8s) as usize +} + +#[target_feature(enable = "neon")] +pub unsafe fn chunk_count(haystack: &[u8], needle: u8) -> usize { + assert!(haystack.len() >= 16); + + let mut offset = 0; + let mut count = 0; + + let needles = vdupq_n_u8(needle); + + // 16320 + while haystack.len() >= offset + 64 * 255 { + let (mut count1, mut count2, mut count3, mut count4) = + (vdupq_n_u8(0), vdupq_n_u8(0), vdupq_n_u8(0), vdupq_n_u8(0)); + for _ in 0..255 { + let uint8x16x4_t(h1, h2, h3, h4) = u8x16_x4_from_offset(haystack, offset); + count1 = vsubq_u8(count1, vceqq_u8(h1, needles)); + count2 = vsubq_u8(count2, vceqq_u8(h2, needles)); + count3 = vsubq_u8(count3, vceqq_u8(h3, needles)); + count4 = vsubq_u8(count4, vceqq_u8(h4, needles)); + offset += 64; + } + count += sum(count1) + sum(count2) + sum(count3) + sum(count4); + } + + // 64 + let (mut count1, mut count2, mut count3, mut count4) = + (vdupq_n_u8(0), vdupq_n_u8(0), vdupq_n_u8(0), vdupq_n_u8(0)); + for _ in 0..(haystack.len() - offset) / 64 { + let uint8x16x4_t(h1, h2, h3, h4) = u8x16_x4_from_offset(haystack, offset); + count1 = vsubq_u8(count1, vceqq_u8(h1, needles)); + count2 = vsubq_u8(count2, vceqq_u8(h2, needles)); + count3 = vsubq_u8(count3, vceqq_u8(h3, needles)); + count4 = vsubq_u8(count4, vceqq_u8(h4, needles)); + offset += 64; + } + count += sum(count1) + sum(count2) + sum(count3) + sum(count4); + + let mut counts = vdupq_n_u8(0); + // 16 + for i in 0..(haystack.len() - offset) / 16 { + counts = vsubq_u8( + counts, + vceqq_u8(u8x16_from_offset(haystack, offset + i * 16), needles), + ); + } + if haystack.len() % 16 != 0 { + counts = vsubq_u8( + counts, + vandq_u8( + vceqq_u8(u8x16_from_offset(haystack, haystack.len() - 16), needles), + u8x16_from_offset(&MASK, haystack.len() % 16), + ), + ); + } + count + sum(counts) +} + +#[target_feature(enable = "neon")] +unsafe fn is_leading_utf8_byte(u8s: uint8x16_t) -> uint8x16_t { + vmvnq_u8(vceqq_u8( + vandq_u8(u8s, vdupq_n_u8(0b1100_0000)), + vdupq_n_u8(0b1000_0000), + )) +} + +#[target_feature(enable = "neon")] +pub unsafe fn chunk_num_chars(utf8_chars: &[u8]) -> usize { + assert!(utf8_chars.len() >= 16); + + let mut offset = 0; + let mut count = 0; + + // 4080 + while utf8_chars.len() >= offset + 16 * 255 { + let mut counts = vdupq_n_u8(0); + + for _ in 0..255 { + counts = vsubq_u8( + counts, + is_leading_utf8_byte(u8x16_from_offset(utf8_chars, offset)), + ); + offset += 16; + } + count += sum(counts); + } + + // 2048 + if utf8_chars.len() >= offset + 16 * 128 { + let mut counts = vdupq_n_u8(0); + for _ in 0..128 { + counts = vsubq_u8( + counts, + is_leading_utf8_byte(u8x16_from_offset(utf8_chars, offset)), + ); + offset += 16; + } + count += sum(counts); + } + + // 16 + let mut counts = vdupq_n_u8(0); + for i in 0..(utf8_chars.len() - offset) / 16 { + counts = vsubq_u8( + counts, + is_leading_utf8_byte(u8x16_from_offset(utf8_chars, offset + i * 16)), + ); + } + if utf8_chars.len() % 16 != 0 { + counts = vsubq_u8( + counts, + vandq_u8( + is_leading_utf8_byte(u8x16_from_offset(utf8_chars, utf8_chars.len() - 16)), + u8x16_from_offset(&MASK, utf8_chars.len() % 16), + ), + ); + } + count += sum(counts); + + count +} diff --git a/vendor/bytecount/src/simd/generic.rs b/vendor/bytecount/src/simd/generic.rs index 2031e730e..640ccd891 100644 --- a/vendor/bytecount/src/simd/generic.rs +++ b/vendor/bytecount/src/simd/generic.rs @@ -8,10 +8,9 @@ use std::mem; use self::packed_simd::{u8x32, u8x64, FromCast}; const MASK: [u8; 64] = [ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]; unsafe fn u8x64_from_offset(slice: &[u8], offset: usize) -> u8x64 { @@ -66,15 +65,17 @@ pub fn chunk_count(haystack: &[u8], needle: u8) -> usize { // 32 let mut counts = u8x32::splat(0); for i in 0..(haystack.len() - offset) / 32 { - counts -= u8x32::from_cast(u8x32_from_offset(haystack, offset + i * 32).eq(needles_x32)); + counts -= + u8x32::from_cast(u8x32_from_offset(haystack, offset + i * 32).eq(needles_x32)); } count += sum_x32(&counts); // Straggler; need to reset counts because prior loop can run 255 times counts = u8x32::splat(0); if haystack.len() % 32 != 0 { - counts -= u8x32::from_cast(u8x32_from_offset(haystack, haystack.len() - 32).eq(needles_x32)) & - u8x32_from_offset(&MASK, haystack.len() % 32); + counts -= + u8x32::from_cast(u8x32_from_offset(haystack, haystack.len() - 32).eq(needles_x32)) + & u8x32_from_offset(&MASK, haystack.len() % 32); } count += sum_x32(&counts); @@ -127,8 +128,9 @@ pub fn chunk_num_chars(utf8_chars: &[u8]) -> usize { // Straggler; need to reset counts because prior loop can run 255 times counts = u8x32::splat(0); if utf8_chars.len() % 32 != 0 { - counts -= is_leading_utf8_byte_x32(u8x32_from_offset(utf8_chars, utf8_chars.len() - 32)) & - u8x32_from_offset(&MASK, utf8_chars.len() % 32); + counts -= + is_leading_utf8_byte_x32(u8x32_from_offset(utf8_chars, utf8_chars.len() - 32)) + & u8x32_from_offset(&MASK, utf8_chars.len() % 32); } count += sum_x32(&counts); diff --git a/vendor/bytecount/src/simd/mod.rs b/vendor/bytecount/src/simd/mod.rs index d144e1847..fa9857546 100644 --- a/vendor/bytecount/src/simd/mod.rs +++ b/vendor/bytecount/src/simd/mod.rs @@ -15,3 +15,7 @@ pub mod x86_sse2; // Runtime feature detection is not available with no_std. #[cfg(all(feature = "runtime-dispatch-simd", target_arch = "x86_64"))] pub mod x86_avx2; + +/// Modern ARM machines are also quite capable thanks to NEON +#[cfg(target_arch = "aarch64")] +pub mod aarch64; diff --git a/vendor/bytecount/src/simd/x86_avx2.rs b/vendor/bytecount/src/simd/x86_avx2.rs index 90a55c0fb..ea191e278 100644 --- a/vendor/bytecount/src/simd/x86_avx2.rs +++ b/vendor/bytecount/src/simd/x86_avx2.rs @@ -1,14 +1,6 @@ use std::arch::x86_64::{ - __m256i, - _mm256_and_si256, - _mm256_cmpeq_epi8, - _mm256_extract_epi64, - _mm256_loadu_si256, - _mm256_sad_epu8, - _mm256_set1_epi8, - _mm256_setzero_si256, - _mm256_sub_epi8, - _mm256_xor_si256, + __m256i, _mm256_and_si256, _mm256_cmpeq_epi8, _mm256_extract_epi64, _mm256_loadu_si256, + _mm256_sad_epu8, _mm256_set1_epi8, _mm256_setzero_si256, _mm256_sub_epi8, _mm256_xor_si256, }; #[target_feature(enable = "avx2")] @@ -22,10 +14,9 @@ pub unsafe fn mm256_cmpneq_epi8(a: __m256i, b: __m256i) -> __m256i { } const MASK: [u8; 64] = [ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]; #[target_feature(enable = "avx2")] @@ -36,10 +27,10 @@ unsafe fn mm256_from_offset(slice: &[u8], offset: usize) -> __m256i { #[target_feature(enable = "avx2")] unsafe fn sum(u8s: &__m256i) -> usize { let sums = _mm256_sad_epu8(*u8s, _mm256_setzero_si256()); - ( - _mm256_extract_epi64(sums, 0) + _mm256_extract_epi64(sums, 1) + - _mm256_extract_epi64(sums, 2) + _mm256_extract_epi64(sums, 3) - ) as usize + (_mm256_extract_epi64(sums, 0) + + _mm256_extract_epi64(sums, 1) + + _mm256_extract_epi64(sums, 2) + + _mm256_extract_epi64(sums, 3)) as usize } #[target_feature(enable = "avx2")] @@ -57,7 +48,7 @@ pub unsafe fn chunk_count(haystack: &[u8], needle: u8) -> usize { for _ in 0..255 { counts = _mm256_sub_epi8( counts, - _mm256_cmpeq_epi8(mm256_from_offset(haystack, offset), needles) + _mm256_cmpeq_epi8(mm256_from_offset(haystack, offset), needles), ); offset += 32; } @@ -70,7 +61,7 @@ pub unsafe fn chunk_count(haystack: &[u8], needle: u8) -> usize { for _ in 0..128 { counts = _mm256_sub_epi8( counts, - _mm256_cmpeq_epi8(mm256_from_offset(haystack, offset), needles) + _mm256_cmpeq_epi8(mm256_from_offset(haystack, offset), needles), ); offset += 32; } @@ -82,7 +73,7 @@ pub unsafe fn chunk_count(haystack: &[u8], needle: u8) -> usize { for i in 0..(haystack.len() - offset) / 32 { counts = _mm256_sub_epi8( counts, - _mm256_cmpeq_epi8(mm256_from_offset(haystack, offset + i * 32), needles) + _mm256_cmpeq_epi8(mm256_from_offset(haystack, offset + i * 32), needles), ); } if haystack.len() % 32 != 0 { @@ -90,8 +81,8 @@ pub unsafe fn chunk_count(haystack: &[u8], needle: u8) -> usize { counts, _mm256_and_si256( _mm256_cmpeq_epi8(mm256_from_offset(haystack, haystack.len() - 32), needles), - mm256_from_offset(&MASK, haystack.len() % 32) - ) + mm256_from_offset(&MASK, haystack.len() % 32), + ), ); } count += sum(&counts); @@ -101,7 +92,10 @@ pub unsafe fn chunk_count(haystack: &[u8], needle: u8) -> usize { #[target_feature(enable = "avx2")] unsafe fn is_leading_utf8_byte(u8s: __m256i) -> __m256i { - mm256_cmpneq_epi8(_mm256_and_si256(u8s, _mm256_set1_epu8(0b1100_0000)), _mm256_set1_epu8(0b1000_0000)) + mm256_cmpneq_epi8( + _mm256_and_si256(u8s, _mm256_set1_epu8(0b1100_0000)), + _mm256_set1_epu8(0b1000_0000), + ) } #[target_feature(enable = "avx2")] @@ -118,7 +112,7 @@ pub unsafe fn chunk_num_chars(utf8_chars: &[u8]) -> usize { for _ in 0..255 { counts = _mm256_sub_epi8( counts, - is_leading_utf8_byte(mm256_from_offset(utf8_chars, offset)) + is_leading_utf8_byte(mm256_from_offset(utf8_chars, offset)), ); offset += 32; } @@ -131,7 +125,7 @@ pub unsafe fn chunk_num_chars(utf8_chars: &[u8]) -> usize { for _ in 0..128 { counts = _mm256_sub_epi8( counts, - is_leading_utf8_byte(mm256_from_offset(utf8_chars, offset)) + is_leading_utf8_byte(mm256_from_offset(utf8_chars, offset)), ); offset += 32; } @@ -143,7 +137,7 @@ pub unsafe fn chunk_num_chars(utf8_chars: &[u8]) -> usize { for i in 0..(utf8_chars.len() - offset) / 32 { counts = _mm256_sub_epi8( counts, - is_leading_utf8_byte(mm256_from_offset(utf8_chars, offset + i * 32)) + is_leading_utf8_byte(mm256_from_offset(utf8_chars, offset + i * 32)), ); } if utf8_chars.len() % 32 != 0 { @@ -151,8 +145,8 @@ pub unsafe fn chunk_num_chars(utf8_chars: &[u8]) -> usize { counts, _mm256_and_si256( is_leading_utf8_byte(mm256_from_offset(utf8_chars, utf8_chars.len() - 32)), - mm256_from_offset(&MASK, utf8_chars.len() % 32) - ) + mm256_from_offset(&MASK, utf8_chars.len() % 32), + ), ); } count += sum(&counts); diff --git a/vendor/bytecount/src/simd/x86_sse2.rs b/vendor/bytecount/src/simd/x86_sse2.rs index 63d295eae..59c7d8d1e 100644 --- a/vendor/bytecount/src/simd/x86_sse2.rs +++ b/vendor/bytecount/src/simd/x86_sse2.rs @@ -3,11 +3,12 @@ use std::arch::x86::{ __m128i, _mm_and_si128, _mm_cmpeq_epi8, - _mm_extract_epi32, + _mm_cvtsi128_si32, _mm_loadu_si128, _mm_sad_epu8, _mm_set1_epi8, _mm_setzero_si128, + _mm_shuffle_epi32, _mm_sub_epi8, _mm_xor_si128, }; @@ -17,11 +18,12 @@ use std::arch::x86_64::{ __m128i, _mm_and_si128, _mm_cmpeq_epi8, - _mm_extract_epi32, + _mm_cvtsi128_si32, _mm_loadu_si128, _mm_sad_epu8, _mm_set1_epi8, _mm_setzero_si128, + _mm_shuffle_epi32, _mm_sub_epi8, _mm_xor_si128, }; @@ -49,7 +51,7 @@ unsafe fn mm_from_offset(slice: &[u8], offset: usize) -> __m128i { #[target_feature(enable = "sse2")] unsafe fn sum(u8s: &__m128i) -> usize { let sums = _mm_sad_epu8(*u8s, _mm_setzero_si128()); - (_mm_extract_epi32(sums, 0) + _mm_extract_epi32(sums, 2)) as usize + (_mm_cvtsi128_si32(sums) + _mm_cvtsi128_si32(_mm_shuffle_epi32(sums, 0xaa))) as usize } #[target_feature(enable = "sse2")] diff --git a/vendor/bytecount/tests/check.rs b/vendor/bytecount/tests/check.rs index 147b466fc..5a999509a 100644 --- a/vendor/bytecount/tests/check.rs +++ b/vendor/bytecount/tests/check.rs @@ -3,10 +3,7 @@ extern crate bytecount; extern crate quickcheck; extern crate rand; -use bytecount::{ - count, naive_count, - num_chars, naive_num_chars, -}; +use bytecount::{count, naive_count, naive_num_chars, num_chars}; use rand::RngCore; fn random_bytes(len: usize) -> Vec { @@ -59,8 +56,6 @@ fn check_count_overflow_many() { } } - - quickcheck! { fn check_num_chars_correct(haystack: Vec) -> bool { num_chars(&haystack) == naive_num_chars(&haystack) -- cgit v1.2.3