summaryrefslogtreecommitdiffstats
path: root/vendor/bytecount
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/bytecount')
-rw-r--r--vendor/bytecount/.cargo-checksum.json2
-rw-r--r--vendor/bytecount/Cargo.toml5
-rw-r--r--vendor/bytecount/README.md4
-rw-r--r--vendor/bytecount/benches/bench.rs69
-rw-r--r--[-rwxr-xr-x]vendor/bytecount/ci/miri.sh5
-rw-r--r--vendor/bytecount/src/integer_simd.rs20
-rw-r--r--vendor/bytecount/src/lib.rs35
-rw-r--r--vendor/bytecount/src/naive.rs9
-rw-r--r--vendor/bytecount/src/simd/aarch64.rs157
-rw-r--r--vendor/bytecount/src/simd/generic.rs20
-rw-r--r--vendor/bytecount/src/simd/mod.rs4
-rw-r--r--vendor/bytecount/src/simd/x86_avx2.rs52
-rw-r--r--vendor/bytecount/src/simd/x86_sse2.rs8
-rw-r--r--vendor/bytecount/tests/check.rs7
14 files changed, 296 insertions, 101 deletions
diff --git a/vendor/bytecount/.cargo-checksum.json b/vendor/bytecount/.cargo-checksum.json
index 9873561f5..dc54fec7c 100644
--- a/vendor/bytecount/.cargo-checksum.json
+++ b/vendor/bytecount/.cargo-checksum.json
@@ -1 +1 @@
-{"files":{"Cargo.toml":"6c717facc1c70c392d84a7016618b5dab0de6288bed23ca8487b53066a5b46d6","LICENSE.Apache2":"b40930bbcf80744c86c46a12bc9da056641d722716c378f5659b9e555ef833e1","LICENSE.MIT":"a5dea80c1f383cb5f80a6bb0da5e55a2beb9f24adb123ce6300af2cbaaa3bf65","README.md":"82e221d833de855297b37c2ab48547ede0c7db5a7c303dd9fc314f7250abe456","benches/bench.rs":"085734cc7b3c3f6da30eeab8bf98d95fc2a9c0886c63f93edd98df356674bf8e","ci/miri.sh":"fb8d52144352c6b09ac925fb089177d30f46ab8b17dd378d68e414c6b6b5b709","src/integer_simd.rs":"5cd97182cd5aea5c5eab1e1cb5d9d409f263de9083c8e69cd7f9d1dd379dfb52","src/lib.rs":"000a6591306852b9ec892995cd6ab4247b8a727f4befa9dd6a9851c20a5f6552","src/naive.rs":"068dae3fba7d721227bb7a9ed9bdfe4a12cfb373a9de4852d6a517630fabf0c8","src/simd/generic.rs":"2f01963147fa97b5dbd7a166975e6f6660d3197f4b6d0a78ed138da9f78a269e","src/simd/mod.rs":"8ce4aac79520bd0d49f6e48471bf32a2d6d3cd8a3a1c65e393f4f4fae6c7f502","src/simd/x86_avx2.rs":"0345d5fbb74d907e3e6cd0d361427ce75645e7abf4c6efa17ef20ee8a4d8fc18","src/simd/x86_sse2.rs":"e7caab115d77118e6a7bca8daea4a9b6d2d318bd3e76ddc23dc4f023aced1f77","tests/check.rs":"e5075559155a1aae6326932e68410996ca52e4140db4b26eb8b9f03a07241b87"},"package":"2c676a478f63e9fa2dd5368a42f28bba0d6c560b775f38583c8bbaa7fcd67c9c"} \ No newline at end of file
+{"files":{"Cargo.toml":"f97173c1fe76d88af268e9f9b352b28778bb557c843d906f331c4835153ebd68","LICENSE.Apache2":"b40930bbcf80744c86c46a12bc9da056641d722716c378f5659b9e555ef833e1","LICENSE.MIT":"a5dea80c1f383cb5f80a6bb0da5e55a2beb9f24adb123ce6300af2cbaaa3bf65","README.md":"7635ab65fbcaf6a8c9be2352061d32ec36df029a04b4d7d1e2dc0451cc57a883","benches/bench.rs":"c40a5b875fc0620c5f2e16ab0f62d9b78d09bf9e6edd4c5b5e781dfbaf892030","ci/miri.sh":"d633bcd4e428b7d713c05b847c88a3e375dbc42ecf9c7d4fd375efccfeb83fac","src/integer_simd.rs":"a4f60db4a902d458d0bc67dd918bc940de351c73133a81f39de21f70af2b52d2","src/lib.rs":"6be1425d3dfbe104cdfe7273e5a9e014953a1d7eb172b21d911af6f5e6f09585","src/naive.rs":"ea613ac2342fd55cd6a2dc3e5b71f65cc6d65ae1a5c302da8270f2e8cf16d503","src/simd/aarch64.rs":"aef42ab239ecd39cb59c939ae9a8e78ff0a02dd40c6320d835639f8d1ca494df","src/simd/generic.rs":"107bd263df09fa91fb551b4e96ef64a604aa1819e55e50d777f15fbb8eb02aae","src/simd/mod.rs":"570af12dec20692b9f92d779c9b9d3923a4f80d18cf81f35e73cf29d8346054c","src/simd/x86_avx2.rs":"48adfbc7a8a21968d67d14f429df00cf83bceb26b055c89cb708fd300edf771e","src/simd/x86_sse2.rs":"cfeb0468dbeb51e3e12a058f93b4eebaa262f0d31a2dbe2ab57c088143f21baa","tests/check.rs":"599d9f9cd296ad08e099e176d9ec9dabfde9782429fd405d33395564e7507ad0"},"package":"ad152d03a2c813c80bb94fedbf3a3f02b28f793e39e7c214c8a0bcc196343de7"} \ No newline at end of file
diff --git a/vendor/bytecount/Cargo.toml b/vendor/bytecount/Cargo.toml
index 4197a82ca..0525fcbe6 100644
--- a/vendor/bytecount/Cargo.toml
+++ b/vendor/bytecount/Cargo.toml
@@ -12,7 +12,7 @@
[package]
edition = "2018"
name = "bytecount"
-version = "0.6.3"
+version = "0.6.4"
authors = [
"Andre Bogus <bogusandre@gmail.de>",
"Joshua Landau <joshua@landau.ws>",
@@ -40,10 +40,9 @@ harness = false
[dependencies.packed_simd]
version = "0.3.8"
optional = true
-package = "packed_simd_2"
[dev-dependencies.criterion]
-version = "0.3"
+version = "0.4"
default-features = false
[dev-dependencies.quickcheck]
diff --git a/vendor/bytecount/README.md b/vendor/bytecount/README.md
index 9ec89820f..91aa2101a 100644
--- a/vendor/bytecount/README.md
+++ b/vendor/bytecount/README.md
@@ -2,7 +2,7 @@
Counting bytes really fast
-[![Build Status](https://travis-ci.org/llogiq/bytecount.svg?branch=master)](https://travis-ci.org/llogiq/bytecount)
+[![Continuous integration](https://github.com/llogiq/bytecount/actions/workflows/ci.yml/badge.svg)](https://github.com/llogiq/bytecount/actions/workflows/ci.yml)
[![Windows build status](https://ci.appveyor.com/api/projects/status/github/llogiq/bytecount?svg=true)](https://ci.appveyor.com/project/llogiq/bytecount)
[![Current Version](https://img.shields.io/crates/v/bytecount.svg)](https://crates.io/crates/bytecount)
[![License: Apache 2.0/MIT](https://img.shields.io/crates/l/bytecount.svg)](#license)
@@ -12,7 +12,7 @@ The [newlinebench](https://github.com/llogiq/newlinebench) repository has furthe
To use bytecount in your crate, if you have [cargo-edit](https://github.com/killercup/cargo-edit), just type
`cargo add bytecount` in a terminal with the crate root as the current path. Otherwise you can manually edit your
-`Cargo.toml` to add `bytecount = 0.6.3` to your `[dependencies]` section.
+`Cargo.toml` to add `bytecount = 0.6.4` to your `[dependencies]` section.
In your crate root (`lib.rs` or `main.rs`, depending on if you are writing a
library or application), add `extern crate bytecount;`. Now you can simply use
diff --git a/vendor/bytecount/benches/bench.rs b/vendor/bytecount/benches/bench.rs
index 85d04dbb4..2e091fccc 100644
--- a/vendor/bytecount/benches/bench.rs
+++ b/vendor/bytecount/benches/bench.rs
@@ -1,17 +1,14 @@
#[macro_use]
extern crate criterion;
-extern crate rand;
extern crate bytecount;
+extern crate rand;
+use criterion::{Bencher, BenchmarkId, Criterion};
+use rand::RngCore;
use std::env;
use std::time::Duration;
-use rand::RngCore;
-use criterion::{Bencher, Criterion, ParameterizedBenchmark};
-use bytecount::{
- count, naive_count, naive_count_32,
- num_chars, naive_num_chars,
-};
+use bytecount::{count, naive_count, naive_count_32, naive_num_chars, num_chars};
fn random_bytes(len: usize) -> Vec<u8> {
let mut result = vec![0; len];
@@ -19,25 +16,29 @@ fn random_bytes(len: usize) -> Vec<u8> {
result
}
-static COUNTS : &[usize] = &[0, 10, 20, 30, 40, 50, 60, 70, 80, 90,
- 100, 120, 140, 170, 210, 250, 300, 400, 500, 600, 700, 800, 900,
- 1000, 1_000, 1_200, 1_400, 1_700, 2_100, 2_500, 3_000, 4_000,
- 5_000, 6_000, 7_000, 8_000, 9_000, 10_000, 12_000, 14_000, 17_000,
- 21_000, 25_000, 30_000, 100_000, 1_000_000];
+static COUNTS: &[usize] = &[
+ 0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 120, 140, 170, 210, 250, 300, 400, 500, 600, 700,
+ 800, 900, 1_000, 1_200, 1_400, 1_700, 2_100, 2_500, 3_000, 4_000, 5_000, 6_000, 7_000, 8_000,
+ 9_000, 10_000, 12_000, 14_000, 17_000, 21_000, 25_000, 30_000, 100_000, 1_000_000,
+];
fn get_counts() -> Vec<usize> {
- env::var("COUNTS").map(
- |s| s.split(',').map(
- |n| str::parse::<usize>(n).unwrap()).collect())
+ env::var("COUNTS")
+ .map(|s| {
+ s.split(',')
+ .map(|n| str::parse::<usize>(n).unwrap())
+ .collect()
+ })
.unwrap_or(COUNTS.to_owned())
}
fn get_config() -> Criterion {
if env::var("CI").is_ok() {
- Criterion::default().nresamples(5_000)
- .without_plots()
- .measurement_time(Duration::new(2, 0))
- .warm_up_time(Duration::new(1, 0))
+ Criterion::default()
+ .nresamples(5_000)
+ .without_plots()
+ .measurement_time(Duration::new(2, 0))
+ .warm_up_time(Duration::new(1, 0))
} else {
Criterion::default()
}
@@ -45,37 +46,43 @@ fn get_config() -> Criterion {
fn bench_counts(criterion: &mut Criterion) {
fn naive(b: &mut Bencher, s: &usize) {
- let haystack = random_bytes(*s);
+ let haystack = random_bytes(*s);
b.iter(|| naive_count(&haystack, 10))
}
fn naive_32(b: &mut Bencher, s: &usize) {
- let haystack = random_bytes(*s);
+ let haystack = random_bytes(*s);
b.iter(|| naive_count_32(&haystack, 10))
}
fn hyper(b: &mut Bencher, s: &usize) {
- let haystack = random_bytes(*s);
+ let haystack = random_bytes(*s);
b.iter(|| count(&haystack, 10))
}
let counts = get_counts();
- criterion.bench("counts",
- ParameterizedBenchmark::new("naive", naive, counts)
- .with_function("naive_32", naive_32)
- .with_function("hyper", hyper));
+ let mut group = criterion.benchmark_group("counts");
+ for count in counts {
+ group.throughput(criterion::Throughput::Bytes(count as u64));
+ group.bench_with_input(BenchmarkId::new("naive", count), &count, naive);
+ group.bench_with_input(BenchmarkId::new("naive_32", count), &count, naive_32);
+ group.bench_with_input(BenchmarkId::new("hyper", count), &count, hyper);
+ }
}
fn bench_num_chars(criterion: &mut Criterion) {
fn naive(b: &mut Bencher, s: &usize) {
- let haystack = random_bytes(*s);
+ let haystack = random_bytes(*s);
b.iter(|| naive_num_chars(&haystack))
}
fn hyper(b: &mut Bencher, s: &usize) {
- let haystack = random_bytes(*s);
+ let haystack = random_bytes(*s);
b.iter(|| num_chars(&haystack))
}
let counts = get_counts();
- criterion.bench("num_chars",
- ParameterizedBenchmark::new("naive", naive, counts)
- .with_function("hyper", hyper));
+ let mut group = criterion.benchmark_group("num_chars");
+ for count in counts {
+ group.throughput(criterion::Throughput::Bytes(count as u64));
+ group.bench_with_input(BenchmarkId::new("naive", count), &count, naive);
+ group.bench_with_input(BenchmarkId::new("hyper", count), &count, hyper);
+ }
}
criterion_group!(name = count_bench; config = get_config(); targets = bench_counts);
diff --git a/vendor/bytecount/ci/miri.sh b/vendor/bytecount/ci/miri.sh
index 8704ebc1d..5cb6f82dc 100755..100644
--- a/vendor/bytecount/ci/miri.sh
+++ b/vendor/bytecount/ci/miri.sh
@@ -9,7 +9,4 @@ rustup component add miri
# Run tests
cargo miri test
-cargo miri test --target=mips64-unknown-linux-gnuabi64 # big-endian architecture
-
-# Restore old state in case Travis uses this cache for other jobs.
-rustup default nightly
+cargo miri test --target=mips64-unknown-linux-gnuabi64 # big-endian architecture \ No newline at end of file
diff --git a/vendor/bytecount/src/integer_simd.rs b/vendor/bytecount/src/integer_simd.rs
index 48f2ee8d9..060419462 100644
--- a/vendor/bytecount/src/integer_simd.rs
+++ b/vendor/bytecount/src/integer_simd.rs
@@ -13,7 +13,7 @@ unsafe fn usize_load_unchecked(bytes: &[u8], offset: usize) -> usize {
ptr::copy_nonoverlapping(
bytes.as_ptr().add(offset),
&mut output as *mut usize as *mut u8,
- mem::size_of::<usize>()
+ mem::size_of::<usize>(),
);
output
}
@@ -65,11 +65,17 @@ pub fn chunk_count(haystack: &[u8], needle: u8) -> usize {
// 8
let mut counts = 0;
for i in 0..(haystack.len() - offset) / chunksize {
- counts += bytewise_equal(usize_load_unchecked(haystack, offset + i * chunksize), needles);
+ counts += bytewise_equal(
+ usize_load_unchecked(haystack, offset + i * chunksize),
+ needles,
+ );
}
if haystack.len() % 8 != 0 {
let mask = usize::from_le(!(!0 >> ((haystack.len() % chunksize) * 8)));
- counts += bytewise_equal(usize_load_unchecked(haystack, haystack.len() - chunksize), needles) & mask;
+ counts += bytewise_equal(
+ usize_load_unchecked(haystack, haystack.len() - chunksize),
+ needles,
+ ) & mask;
}
count += sum_usize(counts);
@@ -98,11 +104,15 @@ pub fn chunk_num_chars(utf8_chars: &[u8]) -> usize {
// 8
let mut counts = 0;
for i in 0..(utf8_chars.len() - offset) / chunksize {
- counts += is_leading_utf8_byte(usize_load_unchecked(utf8_chars, offset + i * chunksize));
+ counts +=
+ is_leading_utf8_byte(usize_load_unchecked(utf8_chars, offset + i * chunksize));
}
if utf8_chars.len() % 8 != 0 {
let mask = usize::from_le(!(!0 >> ((utf8_chars.len() % chunksize) * 8)));
- counts += is_leading_utf8_byte(usize_load_unchecked(utf8_chars, utf8_chars.len() - chunksize)) & mask;
+ counts += is_leading_utf8_byte(usize_load_unchecked(
+ utf8_chars,
+ utf8_chars.len() - chunksize,
+ )) & mask;
}
count += sum_usize(counts);
diff --git a/vendor/bytecount/src/lib.rs b/vendor/bytecount/src/lib.rs
index ef4235c26..24f40182b 100644
--- a/vendor/bytecount/src/lib.rs
+++ b/vendor/bytecount/src/lib.rs
@@ -32,7 +32,6 @@
//! still on small strings.
#![deny(missing_docs)]
-
#![cfg_attr(not(feature = "runtime-dispatch-simd"), no_std)]
#[cfg(not(feature = "runtime-dispatch-simd"))]
@@ -45,7 +44,11 @@ pub use naive::*;
mod integer_simd;
#[cfg(any(
- all(feature = "runtime-dispatch-simd", any(target_arch = "x86", target_arch = "x86_64")),
+ all(
+ feature = "runtime-dispatch-simd",
+ any(target_arch = "x86", target_arch = "x86_64")
+ ),
+ target_arch = "aarch64",
feature = "generic-simd"
))]
mod simd;
@@ -64,7 +67,9 @@ pub fn count(haystack: &[u8], needle: u8) -> usize {
#[cfg(all(feature = "runtime-dispatch-simd", target_arch = "x86_64"))]
{
if is_x86_feature_detected!("avx2") {
- unsafe { return simd::x86_avx2::chunk_count(haystack, needle); }
+ unsafe {
+ return simd::x86_avx2::chunk_count(haystack, needle);
+ }
}
}
@@ -80,7 +85,15 @@ pub fn count(haystack: &[u8], needle: u8) -> usize {
))]
{
if is_x86_feature_detected!("sse2") {
- unsafe { return simd::x86_sse2::chunk_count(haystack, needle); }
+ unsafe {
+ return simd::x86_sse2::chunk_count(haystack, needle);
+ }
+ }
+ }
+ #[cfg(all(target_arch = "aarch64", not(feature = "generic_simd")))]
+ {
+ unsafe {
+ return simd::aarch64::chunk_count(haystack, needle);
}
}
}
@@ -109,7 +122,9 @@ pub fn num_chars(utf8_chars: &[u8]) -> usize {
#[cfg(all(feature = "runtime-dispatch-simd", target_arch = "x86_64"))]
{
if is_x86_feature_detected!("avx2") {
- unsafe { return simd::x86_avx2::chunk_num_chars(utf8_chars); }
+ unsafe {
+ return simd::x86_avx2::chunk_num_chars(utf8_chars);
+ }
}
}
@@ -125,7 +140,15 @@ pub fn num_chars(utf8_chars: &[u8]) -> usize {
))]
{
if is_x86_feature_detected!("sse2") {
- unsafe { return simd::x86_sse2::chunk_num_chars(utf8_chars); }
+ unsafe {
+ return simd::x86_sse2::chunk_num_chars(utf8_chars);
+ }
+ }
+ }
+ #[cfg(all(target_arch = "aarch64", not(feature = "generic_simd")))]
+ {
+ unsafe {
+ return simd::aarch64::chunk_num_chars(utf8_chars);
}
}
}
diff --git a/vendor/bytecount/src/naive.rs b/vendor/bytecount/src/naive.rs
index 315c4b675..e3f6cf6c6 100644
--- a/vendor/bytecount/src/naive.rs
+++ b/vendor/bytecount/src/naive.rs
@@ -22,7 +22,9 @@ pub fn naive_count_32(haystack: &[u8], needle: u8) -> usize {
/// assert_eq!(number_of_spaces, 6);
/// ```
pub fn naive_count(utf8_chars: &[u8], needle: u8) -> usize {
- utf8_chars.iter().fold(0, |n, c| n + (*c == needle) as usize)
+ utf8_chars
+ .iter()
+ .fold(0, |n, c| n + (*c == needle) as usize)
}
/// Count the number of UTF-8 encoded Unicode codepoints in a slice of bytes, simple
@@ -38,5 +40,8 @@ pub fn naive_count(utf8_chars: &[u8], needle: u8) -> usize {
/// assert_eq!(char_count, 4);
/// ```
pub fn naive_num_chars(utf8_chars: &[u8]) -> usize {
- utf8_chars.iter().filter(|&&byte| (byte >> 6) != 0b10).count()
+ utf8_chars
+ .iter()
+ .filter(|&&byte| (byte >> 6) != 0b10)
+ .count()
}
diff --git a/vendor/bytecount/src/simd/aarch64.rs b/vendor/bytecount/src/simd/aarch64.rs
new file mode 100644
index 000000000..6544355fa
--- /dev/null
+++ b/vendor/bytecount/src/simd/aarch64.rs
@@ -0,0 +1,157 @@
+use core::arch::aarch64::{
+ uint8x16_t, uint8x16x4_t, vaddlvq_u8, vandq_u8, vceqq_u8, vdupq_n_u8, vld1q_u8, vld1q_u8_x4,
+ vmvnq_u8, vsubq_u8,
+};
+
+const MASK: [u8; 32] = [
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255,
+];
+
+#[target_feature(enable = "neon")]
+unsafe fn u8x16_from_offset(slice: &[u8], offset: usize) -> uint8x16_t {
+ debug_assert!(
+ offset + 16 <= slice.len(),
+ "{} + 16 ≥ {}",
+ offset,
+ slice.len()
+ );
+ vld1q_u8(slice.as_ptr().add(offset) as *const _) // TODO: does this need to be aligned?
+}
+
+#[target_feature(enable = "neon")]
+unsafe fn u8x16_x4_from_offset(slice: &[u8], offset: usize) -> uint8x16x4_t {
+ debug_assert!(
+ offset + 64 <= slice.len(),
+ "{} + 64 ≥ {}",
+ offset,
+ slice.len()
+ );
+ vld1q_u8_x4(slice.as_ptr().add(offset) as *const _)
+}
+
+#[target_feature(enable = "neon")]
+unsafe fn sum(u8s: uint8x16_t) -> usize {
+ vaddlvq_u8(u8s) as usize
+}
+
+#[target_feature(enable = "neon")]
+pub unsafe fn chunk_count(haystack: &[u8], needle: u8) -> usize {
+ assert!(haystack.len() >= 16);
+
+ let mut offset = 0;
+ let mut count = 0;
+
+ let needles = vdupq_n_u8(needle);
+
+ // 16320
+ while haystack.len() >= offset + 64 * 255 {
+ let (mut count1, mut count2, mut count3, mut count4) =
+ (vdupq_n_u8(0), vdupq_n_u8(0), vdupq_n_u8(0), vdupq_n_u8(0));
+ for _ in 0..255 {
+ let uint8x16x4_t(h1, h2, h3, h4) = u8x16_x4_from_offset(haystack, offset);
+ count1 = vsubq_u8(count1, vceqq_u8(h1, needles));
+ count2 = vsubq_u8(count2, vceqq_u8(h2, needles));
+ count3 = vsubq_u8(count3, vceqq_u8(h3, needles));
+ count4 = vsubq_u8(count4, vceqq_u8(h4, needles));
+ offset += 64;
+ }
+ count += sum(count1) + sum(count2) + sum(count3) + sum(count4);
+ }
+
+ // 64
+ let (mut count1, mut count2, mut count3, mut count4) =
+ (vdupq_n_u8(0), vdupq_n_u8(0), vdupq_n_u8(0), vdupq_n_u8(0));
+ for _ in 0..(haystack.len() - offset) / 64 {
+ let uint8x16x4_t(h1, h2, h3, h4) = u8x16_x4_from_offset(haystack, offset);
+ count1 = vsubq_u8(count1, vceqq_u8(h1, needles));
+ count2 = vsubq_u8(count2, vceqq_u8(h2, needles));
+ count3 = vsubq_u8(count3, vceqq_u8(h3, needles));
+ count4 = vsubq_u8(count4, vceqq_u8(h4, needles));
+ offset += 64;
+ }
+ count += sum(count1) + sum(count2) + sum(count3) + sum(count4);
+
+ let mut counts = vdupq_n_u8(0);
+ // 16
+ for i in 0..(haystack.len() - offset) / 16 {
+ counts = vsubq_u8(
+ counts,
+ vceqq_u8(u8x16_from_offset(haystack, offset + i * 16), needles),
+ );
+ }
+ if haystack.len() % 16 != 0 {
+ counts = vsubq_u8(
+ counts,
+ vandq_u8(
+ vceqq_u8(u8x16_from_offset(haystack, haystack.len() - 16), needles),
+ u8x16_from_offset(&MASK, haystack.len() % 16),
+ ),
+ );
+ }
+ count + sum(counts)
+}
+
+#[target_feature(enable = "neon")]
+unsafe fn is_leading_utf8_byte(u8s: uint8x16_t) -> uint8x16_t {
+ vmvnq_u8(vceqq_u8(
+ vandq_u8(u8s, vdupq_n_u8(0b1100_0000)),
+ vdupq_n_u8(0b1000_0000),
+ ))
+}
+
+#[target_feature(enable = "neon")]
+pub unsafe fn chunk_num_chars(utf8_chars: &[u8]) -> usize {
+ assert!(utf8_chars.len() >= 16);
+
+ let mut offset = 0;
+ let mut count = 0;
+
+ // 4080
+ while utf8_chars.len() >= offset + 16 * 255 {
+ let mut counts = vdupq_n_u8(0);
+
+ for _ in 0..255 {
+ counts = vsubq_u8(
+ counts,
+ is_leading_utf8_byte(u8x16_from_offset(utf8_chars, offset)),
+ );
+ offset += 16;
+ }
+ count += sum(counts);
+ }
+
+ // 2048
+ if utf8_chars.len() >= offset + 16 * 128 {
+ let mut counts = vdupq_n_u8(0);
+ for _ in 0..128 {
+ counts = vsubq_u8(
+ counts,
+ is_leading_utf8_byte(u8x16_from_offset(utf8_chars, offset)),
+ );
+ offset += 16;
+ }
+ count += sum(counts);
+ }
+
+ // 16
+ let mut counts = vdupq_n_u8(0);
+ for i in 0..(utf8_chars.len() - offset) / 16 {
+ counts = vsubq_u8(
+ counts,
+ is_leading_utf8_byte(u8x16_from_offset(utf8_chars, offset + i * 16)),
+ );
+ }
+ if utf8_chars.len() % 16 != 0 {
+ counts = vsubq_u8(
+ counts,
+ vandq_u8(
+ is_leading_utf8_byte(u8x16_from_offset(utf8_chars, utf8_chars.len() - 16)),
+ u8x16_from_offset(&MASK, utf8_chars.len() % 16),
+ ),
+ );
+ }
+ count += sum(counts);
+
+ count
+}
diff --git a/vendor/bytecount/src/simd/generic.rs b/vendor/bytecount/src/simd/generic.rs
index 2031e730e..640ccd891 100644
--- a/vendor/bytecount/src/simd/generic.rs
+++ b/vendor/bytecount/src/simd/generic.rs
@@ -8,10 +8,9 @@ use std::mem;
use self::packed_simd::{u8x32, u8x64, FromCast};
const MASK: [u8; 64] = [
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
- 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
];
unsafe fn u8x64_from_offset(slice: &[u8], offset: usize) -> u8x64 {
@@ -66,15 +65,17 @@ pub fn chunk_count(haystack: &[u8], needle: u8) -> usize {
// 32
let mut counts = u8x32::splat(0);
for i in 0..(haystack.len() - offset) / 32 {
- counts -= u8x32::from_cast(u8x32_from_offset(haystack, offset + i * 32).eq(needles_x32));
+ counts -=
+ u8x32::from_cast(u8x32_from_offset(haystack, offset + i * 32).eq(needles_x32));
}
count += sum_x32(&counts);
// Straggler; need to reset counts because prior loop can run 255 times
counts = u8x32::splat(0);
if haystack.len() % 32 != 0 {
- counts -= u8x32::from_cast(u8x32_from_offset(haystack, haystack.len() - 32).eq(needles_x32)) &
- u8x32_from_offset(&MASK, haystack.len() % 32);
+ counts -=
+ u8x32::from_cast(u8x32_from_offset(haystack, haystack.len() - 32).eq(needles_x32))
+ & u8x32_from_offset(&MASK, haystack.len() % 32);
}
count += sum_x32(&counts);
@@ -127,8 +128,9 @@ pub fn chunk_num_chars(utf8_chars: &[u8]) -> usize {
// Straggler; need to reset counts because prior loop can run 255 times
counts = u8x32::splat(0);
if utf8_chars.len() % 32 != 0 {
- counts -= is_leading_utf8_byte_x32(u8x32_from_offset(utf8_chars, utf8_chars.len() - 32)) &
- u8x32_from_offset(&MASK, utf8_chars.len() % 32);
+ counts -=
+ is_leading_utf8_byte_x32(u8x32_from_offset(utf8_chars, utf8_chars.len() - 32))
+ & u8x32_from_offset(&MASK, utf8_chars.len() % 32);
}
count += sum_x32(&counts);
diff --git a/vendor/bytecount/src/simd/mod.rs b/vendor/bytecount/src/simd/mod.rs
index d144e1847..fa9857546 100644
--- a/vendor/bytecount/src/simd/mod.rs
+++ b/vendor/bytecount/src/simd/mod.rs
@@ -15,3 +15,7 @@ pub mod x86_sse2;
// Runtime feature detection is not available with no_std.
#[cfg(all(feature = "runtime-dispatch-simd", target_arch = "x86_64"))]
pub mod x86_avx2;
+
+/// Modern ARM machines are also quite capable thanks to NEON
+#[cfg(target_arch = "aarch64")]
+pub mod aarch64;
diff --git a/vendor/bytecount/src/simd/x86_avx2.rs b/vendor/bytecount/src/simd/x86_avx2.rs
index 90a55c0fb..ea191e278 100644
--- a/vendor/bytecount/src/simd/x86_avx2.rs
+++ b/vendor/bytecount/src/simd/x86_avx2.rs
@@ -1,14 +1,6 @@
use std::arch::x86_64::{
- __m256i,
- _mm256_and_si256,
- _mm256_cmpeq_epi8,
- _mm256_extract_epi64,
- _mm256_loadu_si256,
- _mm256_sad_epu8,
- _mm256_set1_epi8,
- _mm256_setzero_si256,
- _mm256_sub_epi8,
- _mm256_xor_si256,
+ __m256i, _mm256_and_si256, _mm256_cmpeq_epi8, _mm256_extract_epi64, _mm256_loadu_si256,
+ _mm256_sad_epu8, _mm256_set1_epi8, _mm256_setzero_si256, _mm256_sub_epi8, _mm256_xor_si256,
};
#[target_feature(enable = "avx2")]
@@ -22,10 +14,9 @@ pub unsafe fn mm256_cmpneq_epi8(a: __m256i, b: __m256i) -> __m256i {
}
const MASK: [u8; 64] = [
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
- 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
];
#[target_feature(enable = "avx2")]
@@ -36,10 +27,10 @@ unsafe fn mm256_from_offset(slice: &[u8], offset: usize) -> __m256i {
#[target_feature(enable = "avx2")]
unsafe fn sum(u8s: &__m256i) -> usize {
let sums = _mm256_sad_epu8(*u8s, _mm256_setzero_si256());
- (
- _mm256_extract_epi64(sums, 0) + _mm256_extract_epi64(sums, 1) +
- _mm256_extract_epi64(sums, 2) + _mm256_extract_epi64(sums, 3)
- ) as usize
+ (_mm256_extract_epi64(sums, 0)
+ + _mm256_extract_epi64(sums, 1)
+ + _mm256_extract_epi64(sums, 2)
+ + _mm256_extract_epi64(sums, 3)) as usize
}
#[target_feature(enable = "avx2")]
@@ -57,7 +48,7 @@ pub unsafe fn chunk_count(haystack: &[u8], needle: u8) -> usize {
for _ in 0..255 {
counts = _mm256_sub_epi8(
counts,
- _mm256_cmpeq_epi8(mm256_from_offset(haystack, offset), needles)
+ _mm256_cmpeq_epi8(mm256_from_offset(haystack, offset), needles),
);
offset += 32;
}
@@ -70,7 +61,7 @@ pub unsafe fn chunk_count(haystack: &[u8], needle: u8) -> usize {
for _ in 0..128 {
counts = _mm256_sub_epi8(
counts,
- _mm256_cmpeq_epi8(mm256_from_offset(haystack, offset), needles)
+ _mm256_cmpeq_epi8(mm256_from_offset(haystack, offset), needles),
);
offset += 32;
}
@@ -82,7 +73,7 @@ pub unsafe fn chunk_count(haystack: &[u8], needle: u8) -> usize {
for i in 0..(haystack.len() - offset) / 32 {
counts = _mm256_sub_epi8(
counts,
- _mm256_cmpeq_epi8(mm256_from_offset(haystack, offset + i * 32), needles)
+ _mm256_cmpeq_epi8(mm256_from_offset(haystack, offset + i * 32), needles),
);
}
if haystack.len() % 32 != 0 {
@@ -90,8 +81,8 @@ pub unsafe fn chunk_count(haystack: &[u8], needle: u8) -> usize {
counts,
_mm256_and_si256(
_mm256_cmpeq_epi8(mm256_from_offset(haystack, haystack.len() - 32), needles),
- mm256_from_offset(&MASK, haystack.len() % 32)
- )
+ mm256_from_offset(&MASK, haystack.len() % 32),
+ ),
);
}
count += sum(&counts);
@@ -101,7 +92,10 @@ pub unsafe fn chunk_count(haystack: &[u8], needle: u8) -> usize {
#[target_feature(enable = "avx2")]
unsafe fn is_leading_utf8_byte(u8s: __m256i) -> __m256i {
- mm256_cmpneq_epi8(_mm256_and_si256(u8s, _mm256_set1_epu8(0b1100_0000)), _mm256_set1_epu8(0b1000_0000))
+ mm256_cmpneq_epi8(
+ _mm256_and_si256(u8s, _mm256_set1_epu8(0b1100_0000)),
+ _mm256_set1_epu8(0b1000_0000),
+ )
}
#[target_feature(enable = "avx2")]
@@ -118,7 +112,7 @@ pub unsafe fn chunk_num_chars(utf8_chars: &[u8]) -> usize {
for _ in 0..255 {
counts = _mm256_sub_epi8(
counts,
- is_leading_utf8_byte(mm256_from_offset(utf8_chars, offset))
+ is_leading_utf8_byte(mm256_from_offset(utf8_chars, offset)),
);
offset += 32;
}
@@ -131,7 +125,7 @@ pub unsafe fn chunk_num_chars(utf8_chars: &[u8]) -> usize {
for _ in 0..128 {
counts = _mm256_sub_epi8(
counts,
- is_leading_utf8_byte(mm256_from_offset(utf8_chars, offset))
+ is_leading_utf8_byte(mm256_from_offset(utf8_chars, offset)),
);
offset += 32;
}
@@ -143,7 +137,7 @@ pub unsafe fn chunk_num_chars(utf8_chars: &[u8]) -> usize {
for i in 0..(utf8_chars.len() - offset) / 32 {
counts = _mm256_sub_epi8(
counts,
- is_leading_utf8_byte(mm256_from_offset(utf8_chars, offset + i * 32))
+ is_leading_utf8_byte(mm256_from_offset(utf8_chars, offset + i * 32)),
);
}
if utf8_chars.len() % 32 != 0 {
@@ -151,8 +145,8 @@ pub unsafe fn chunk_num_chars(utf8_chars: &[u8]) -> usize {
counts,
_mm256_and_si256(
is_leading_utf8_byte(mm256_from_offset(utf8_chars, utf8_chars.len() - 32)),
- mm256_from_offset(&MASK, utf8_chars.len() % 32)
- )
+ mm256_from_offset(&MASK, utf8_chars.len() % 32),
+ ),
);
}
count += sum(&counts);
diff --git a/vendor/bytecount/src/simd/x86_sse2.rs b/vendor/bytecount/src/simd/x86_sse2.rs
index 63d295eae..59c7d8d1e 100644
--- a/vendor/bytecount/src/simd/x86_sse2.rs
+++ b/vendor/bytecount/src/simd/x86_sse2.rs
@@ -3,11 +3,12 @@ use std::arch::x86::{
__m128i,
_mm_and_si128,
_mm_cmpeq_epi8,
- _mm_extract_epi32,
+ _mm_cvtsi128_si32,
_mm_loadu_si128,
_mm_sad_epu8,
_mm_set1_epi8,
_mm_setzero_si128,
+ _mm_shuffle_epi32,
_mm_sub_epi8,
_mm_xor_si128,
};
@@ -17,11 +18,12 @@ use std::arch::x86_64::{
__m128i,
_mm_and_si128,
_mm_cmpeq_epi8,
- _mm_extract_epi32,
+ _mm_cvtsi128_si32,
_mm_loadu_si128,
_mm_sad_epu8,
_mm_set1_epi8,
_mm_setzero_si128,
+ _mm_shuffle_epi32,
_mm_sub_epi8,
_mm_xor_si128,
};
@@ -49,7 +51,7 @@ unsafe fn mm_from_offset(slice: &[u8], offset: usize) -> __m128i {
#[target_feature(enable = "sse2")]
unsafe fn sum(u8s: &__m128i) -> usize {
let sums = _mm_sad_epu8(*u8s, _mm_setzero_si128());
- (_mm_extract_epi32(sums, 0) + _mm_extract_epi32(sums, 2)) as usize
+ (_mm_cvtsi128_si32(sums) + _mm_cvtsi128_si32(_mm_shuffle_epi32(sums, 0xaa))) as usize
}
#[target_feature(enable = "sse2")]
diff --git a/vendor/bytecount/tests/check.rs b/vendor/bytecount/tests/check.rs
index 147b466fc..5a999509a 100644
--- a/vendor/bytecount/tests/check.rs
+++ b/vendor/bytecount/tests/check.rs
@@ -3,10 +3,7 @@ extern crate bytecount;
extern crate quickcheck;
extern crate rand;
-use bytecount::{
- count, naive_count,
- num_chars, naive_num_chars,
-};
+use bytecount::{count, naive_count, naive_num_chars, num_chars};
use rand::RngCore;
fn random_bytes(len: usize) -> Vec<u8> {
@@ -59,8 +56,6 @@ fn check_count_overflow_many() {
}
}
-
-
quickcheck! {
fn check_num_chars_correct(haystack: Vec<u8>) -> bool {
num_chars(&haystack) == naive_num_chars(&haystack)