diff options
Diffstat (limited to 'third_party/rust/memchr/src')
-rw-r--r-- | third_party/rust/memchr/src/c.rs | 44 | ||||
-rw-r--r-- | third_party/rust/memchr/src/fallback.rs | 330 | ||||
-rw-r--r-- | third_party/rust/memchr/src/iter.rs | 173 | ||||
-rw-r--r-- | third_party/rust/memchr/src/lib.rs | 451 | ||||
-rw-r--r-- | third_party/rust/memchr/src/naive.rs | 25 | ||||
-rw-r--r-- | third_party/rust/memchr/src/tests/iter.rs | 229 | ||||
-rw-r--r-- | third_party/rust/memchr/src/tests/memchr.rs | 131 | ||||
-rw-r--r-- | third_party/rust/memchr/src/tests/miri.rs | 19 | ||||
-rw-r--r-- | third_party/rust/memchr/src/tests/mod.rs | 362 | ||||
-rw-r--r-- | third_party/rust/memchr/src/x86/avx.rs | 703 | ||||
-rw-r--r-- | third_party/rust/memchr/src/x86/mod.rs | 119 | ||||
-rw-r--r-- | third_party/rust/memchr/src/x86/sse2.rs | 793 | ||||
-rw-r--r-- | third_party/rust/memchr/src/x86/sse42.rs | 75 |
13 files changed, 3454 insertions, 0 deletions
diff --git a/third_party/rust/memchr/src/c.rs b/third_party/rust/memchr/src/c.rs new file mode 100644 index 0000000000..63feca979c --- /dev/null +++ b/third_party/rust/memchr/src/c.rs @@ -0,0 +1,44 @@ +// This module defines safe wrappers around memchr (POSIX) and memrchr (GNU +// extension). + +#![allow(dead_code)] + +extern crate libc; + +use self::libc::{c_int, c_void, size_t}; + +pub fn memchr(needle: u8, haystack: &[u8]) -> Option<usize> { + let p = unsafe { + libc::memchr( + haystack.as_ptr() as *const c_void, + needle as c_int, + haystack.len() as size_t, + ) + }; + if p.is_null() { + None + } else { + Some(p as usize - (haystack.as_ptr() as usize)) + } +} + +// memrchr is a GNU extension. We know it's available on Linux, so start there. +#[cfg(target_os = "linux")] +pub fn memrchr(needle: u8, haystack: &[u8]) -> Option<usize> { + // GNU's memrchr() will - unlike memchr() - error if haystack is empty. + if haystack.is_empty() { + return None; + } + let p = unsafe { + libc::memrchr( + haystack.as_ptr() as *const c_void, + needle as c_int, + haystack.len() as size_t, + ) + }; + if p.is_null() { + None + } else { + Some(p as usize - (haystack.as_ptr() as usize)) + } +} diff --git a/third_party/rust/memchr/src/fallback.rs b/third_party/rust/memchr/src/fallback.rs new file mode 100644 index 0000000000..8bc32b27ba --- /dev/null +++ b/third_party/rust/memchr/src/fallback.rs @@ -0,0 +1,330 @@ +// This module defines pure Rust platform independent implementations of all +// the memchr routines. We do our best to make them fast. Some of them may even +// get auto-vectorized. + +use core::cmp; +use core::usize; + +#[cfg(target_pointer_width = "16")] +const USIZE_BYTES: usize = 2; + +#[cfg(target_pointer_width = "32")] +const USIZE_BYTES: usize = 4; + +#[cfg(target_pointer_width = "64")] +const USIZE_BYTES: usize = 8; + +// The number of bytes to loop at in one iteration of memchr/memrchr. +const LOOP_SIZE: usize = 2 * USIZE_BYTES; + +/// Return `true` if `x` contains any zero byte. +/// +/// From *Matters Computational*, J. Arndt +/// +/// "The idea is to subtract one from each of the bytes and then look for +/// bytes where the borrow propagated all the way to the most significant +/// bit." +#[inline(always)] +fn contains_zero_byte(x: usize) -> bool { + const LO_U64: u64 = 0x0101010101010101; + const HI_U64: u64 = 0x8080808080808080; + + const LO_USIZE: usize = LO_U64 as usize; + const HI_USIZE: usize = HI_U64 as usize; + + x.wrapping_sub(LO_USIZE) & !x & HI_USIZE != 0 +} + +/// Repeat the given byte into a word size number. That is, every 8 bits +/// is equivalent to the given byte. For example, if `b` is `\x4E` or +/// `01001110` in binary, then the returned value on a 32-bit system would be: +/// `01001110_01001110_01001110_01001110`. +#[inline(always)] +fn repeat_byte(b: u8) -> usize { + (b as usize) * (usize::MAX / 255) +} + +pub fn memchr(n1: u8, haystack: &[u8]) -> Option<usize> { + let vn1 = repeat_byte(n1); + let confirm = |byte| byte == n1; + let loop_size = cmp::min(LOOP_SIZE, haystack.len()); + let align = USIZE_BYTES - 1; + let start_ptr = haystack.as_ptr(); + let end_ptr = haystack[haystack.len()..].as_ptr(); + let mut ptr = start_ptr; + + unsafe { + if haystack.len() < USIZE_BYTES { + return forward_search(start_ptr, end_ptr, ptr, confirm); + } + + let chunk = (ptr as *const usize).read_unaligned(); + if contains_zero_byte(chunk ^ vn1) { + return forward_search(start_ptr, end_ptr, ptr, confirm); + } + + ptr = ptr.add(USIZE_BYTES - (start_ptr as usize & align)); + debug_assert!(ptr > start_ptr); + debug_assert!(end_ptr.sub(USIZE_BYTES) >= start_ptr); + while loop_size == LOOP_SIZE && ptr <= end_ptr.sub(loop_size) { + debug_assert_eq!(0, (ptr as usize) % USIZE_BYTES); + + let a = *(ptr as *const usize); + let b = *(ptr.add(USIZE_BYTES) as *const usize); + let eqa = contains_zero_byte(a ^ vn1); + let eqb = contains_zero_byte(b ^ vn1); + if eqa || eqb { + break; + } + ptr = ptr.add(LOOP_SIZE); + } + forward_search(start_ptr, end_ptr, ptr, confirm) + } +} + +/// Like `memchr`, but searches for two bytes instead of one. +pub fn memchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> { + let vn1 = repeat_byte(n1); + let vn2 = repeat_byte(n2); + let confirm = |byte| byte == n1 || byte == n2; + let align = USIZE_BYTES - 1; + let start_ptr = haystack.as_ptr(); + let end_ptr = haystack[haystack.len()..].as_ptr(); + let mut ptr = start_ptr; + + unsafe { + if haystack.len() < USIZE_BYTES { + return forward_search(start_ptr, end_ptr, ptr, confirm); + } + + let chunk = (ptr as *const usize).read_unaligned(); + let eq1 = contains_zero_byte(chunk ^ vn1); + let eq2 = contains_zero_byte(chunk ^ vn2); + if eq1 || eq2 { + return forward_search(start_ptr, end_ptr, ptr, confirm); + } + + ptr = ptr.add(USIZE_BYTES - (start_ptr as usize & align)); + debug_assert!(ptr > start_ptr); + debug_assert!(end_ptr.sub(USIZE_BYTES) >= start_ptr); + while ptr <= end_ptr.sub(USIZE_BYTES) { + debug_assert_eq!(0, (ptr as usize) % USIZE_BYTES); + + let chunk = *(ptr as *const usize); + let eq1 = contains_zero_byte(chunk ^ vn1); + let eq2 = contains_zero_byte(chunk ^ vn2); + if eq1 || eq2 { + break; + } + ptr = ptr.add(USIZE_BYTES); + } + forward_search(start_ptr, end_ptr, ptr, confirm) + } +} + +/// Like `memchr`, but searches for three bytes instead of one. +pub fn memchr3(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option<usize> { + let vn1 = repeat_byte(n1); + let vn2 = repeat_byte(n2); + let vn3 = repeat_byte(n3); + let confirm = |byte| byte == n1 || byte == n2 || byte == n3; + let align = USIZE_BYTES - 1; + let start_ptr = haystack.as_ptr(); + let end_ptr = haystack[haystack.len()..].as_ptr(); + let mut ptr = start_ptr; + + unsafe { + if haystack.len() < USIZE_BYTES { + return forward_search(start_ptr, end_ptr, ptr, confirm); + } + + let chunk = (ptr as *const usize).read_unaligned(); + let eq1 = contains_zero_byte(chunk ^ vn1); + let eq2 = contains_zero_byte(chunk ^ vn2); + let eq3 = contains_zero_byte(chunk ^ vn3); + if eq1 || eq2 || eq3 { + return forward_search(start_ptr, end_ptr, ptr, confirm); + } + + ptr = ptr.add(USIZE_BYTES - (start_ptr as usize & align)); + debug_assert!(ptr > start_ptr); + debug_assert!(end_ptr.sub(USIZE_BYTES) >= start_ptr); + while ptr <= end_ptr.sub(USIZE_BYTES) { + debug_assert_eq!(0, (ptr as usize) % USIZE_BYTES); + + let chunk = *(ptr as *const usize); + let eq1 = contains_zero_byte(chunk ^ vn1); + let eq2 = contains_zero_byte(chunk ^ vn2); + let eq3 = contains_zero_byte(chunk ^ vn3); + if eq1 || eq2 || eq3 { + break; + } + ptr = ptr.add(USIZE_BYTES); + } + forward_search(start_ptr, end_ptr, ptr, confirm) + } +} + +/// Return the last index matching the byte `x` in `text`. +pub fn memrchr(n1: u8, haystack: &[u8]) -> Option<usize> { + let vn1 = repeat_byte(n1); + let confirm = |byte| byte == n1; + let loop_size = cmp::min(LOOP_SIZE, haystack.len()); + let align = USIZE_BYTES - 1; + let start_ptr = haystack.as_ptr(); + let end_ptr = haystack[haystack.len()..].as_ptr(); + let mut ptr = end_ptr; + + unsafe { + if haystack.len() < USIZE_BYTES { + return reverse_search(start_ptr, end_ptr, ptr, confirm); + } + + let chunk = (ptr.sub(USIZE_BYTES) as *const usize).read_unaligned(); + if contains_zero_byte(chunk ^ vn1) { + return reverse_search(start_ptr, end_ptr, ptr, confirm); + } + + ptr = (end_ptr as usize & !align) as *const u8; + debug_assert!(start_ptr <= ptr && ptr <= end_ptr); + while loop_size == LOOP_SIZE && ptr >= start_ptr.add(loop_size) { + debug_assert_eq!(0, (ptr as usize) % USIZE_BYTES); + + let a = *(ptr.sub(2 * USIZE_BYTES) as *const usize); + let b = *(ptr.sub(1 * USIZE_BYTES) as *const usize); + let eqa = contains_zero_byte(a ^ vn1); + let eqb = contains_zero_byte(b ^ vn1); + if eqa || eqb { + break; + } + ptr = ptr.sub(loop_size); + } + reverse_search(start_ptr, end_ptr, ptr, confirm) + } +} + +/// Like `memrchr`, but searches for two bytes instead of one. +pub fn memrchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> { + let vn1 = repeat_byte(n1); + let vn2 = repeat_byte(n2); + let confirm = |byte| byte == n1 || byte == n2; + let align = USIZE_BYTES - 1; + let start_ptr = haystack.as_ptr(); + let end_ptr = haystack[haystack.len()..].as_ptr(); + let mut ptr = end_ptr; + + unsafe { + if haystack.len() < USIZE_BYTES { + return reverse_search(start_ptr, end_ptr, ptr, confirm); + } + + let chunk = (ptr.sub(USIZE_BYTES) as *const usize).read_unaligned(); + let eq1 = contains_zero_byte(chunk ^ vn1); + let eq2 = contains_zero_byte(chunk ^ vn2); + if eq1 || eq2 { + return reverse_search(start_ptr, end_ptr, ptr, confirm); + } + + ptr = (end_ptr as usize & !align) as *const u8; + debug_assert!(start_ptr <= ptr && ptr <= end_ptr); + while ptr >= start_ptr.add(USIZE_BYTES) { + debug_assert_eq!(0, (ptr as usize) % USIZE_BYTES); + + let chunk = *(ptr.sub(USIZE_BYTES) as *const usize); + let eq1 = contains_zero_byte(chunk ^ vn1); + let eq2 = contains_zero_byte(chunk ^ vn2); + if eq1 || eq2 { + break; + } + ptr = ptr.sub(USIZE_BYTES); + } + reverse_search(start_ptr, end_ptr, ptr, confirm) + } +} + +/// Like `memrchr`, but searches for three bytes instead of one. +pub fn memrchr3(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option<usize> { + let vn1 = repeat_byte(n1); + let vn2 = repeat_byte(n2); + let vn3 = repeat_byte(n3); + let confirm = |byte| byte == n1 || byte == n2 || byte == n3; + let align = USIZE_BYTES - 1; + let start_ptr = haystack.as_ptr(); + let end_ptr = haystack[haystack.len()..].as_ptr(); + let mut ptr = end_ptr; + + unsafe { + if haystack.len() < USIZE_BYTES { + return reverse_search(start_ptr, end_ptr, ptr, confirm); + } + + let chunk = (ptr.sub(USIZE_BYTES) as *const usize).read_unaligned(); + let eq1 = contains_zero_byte(chunk ^ vn1); + let eq2 = contains_zero_byte(chunk ^ vn2); + let eq3 = contains_zero_byte(chunk ^ vn3); + if eq1 || eq2 || eq3 { + return reverse_search(start_ptr, end_ptr, ptr, confirm); + } + + ptr = (end_ptr as usize & !align) as *const u8; + debug_assert!(start_ptr <= ptr && ptr <= end_ptr); + while ptr >= start_ptr.add(USIZE_BYTES) { + debug_assert_eq!(0, (ptr as usize) % USIZE_BYTES); + + let chunk = *(ptr.sub(USIZE_BYTES) as *const usize); + let eq1 = contains_zero_byte(chunk ^ vn1); + let eq2 = contains_zero_byte(chunk ^ vn2); + let eq3 = contains_zero_byte(chunk ^ vn3); + if eq1 || eq2 || eq3 { + break; + } + ptr = ptr.sub(USIZE_BYTES); + } + reverse_search(start_ptr, end_ptr, ptr, confirm) + } +} + +#[inline(always)] +unsafe fn forward_search<F: Fn(u8) -> bool>( + start_ptr: *const u8, + end_ptr: *const u8, + mut ptr: *const u8, + confirm: F, +) -> Option<usize> { + debug_assert!(start_ptr <= ptr); + debug_assert!(ptr <= end_ptr); + + while ptr < end_ptr { + if confirm(*ptr) { + return Some(sub(ptr, start_ptr)); + } + ptr = ptr.offset(1); + } + None +} + +#[inline(always)] +unsafe fn reverse_search<F: Fn(u8) -> bool>( + start_ptr: *const u8, + end_ptr: *const u8, + mut ptr: *const u8, + confirm: F, +) -> Option<usize> { + debug_assert!(start_ptr <= ptr); + debug_assert!(ptr <= end_ptr); + + while ptr > start_ptr { + ptr = ptr.offset(-1); + if confirm(*ptr) { + return Some(sub(ptr, start_ptr)); + } + } + None +} + +/// Subtract `b` from `a` and return the difference. `a` should be greater than +/// or equal to `b`. +fn sub(a: *const u8, b: *const u8) -> usize { + debug_assert!(a >= b); + (a as usize) - (b as usize) +} diff --git a/third_party/rust/memchr/src/iter.rs b/third_party/rust/memchr/src/iter.rs new file mode 100644 index 0000000000..6217ae4a09 --- /dev/null +++ b/third_party/rust/memchr/src/iter.rs @@ -0,0 +1,173 @@ +use {memchr, memchr2, memchr3, memrchr, memrchr2, memrchr3}; + +macro_rules! iter_next { + // Common code for the memchr iterators: + // update haystack and position and produce the index + // + // self: &mut Self where Self is the iterator + // search_result: Option<usize> which is the result of the corresponding + // memchr function. + // + // Returns Option<usize> (the next iterator element) + ($self_:expr, $search_result:expr) => { + $search_result.map(move |index| { + // split and take the remaining back half + $self_.haystack = $self_.haystack.split_at(index + 1).1; + let found_position = $self_.position + index; + $self_.position = found_position + 1; + found_position + }) + }; +} + +macro_rules! iter_next_back { + ($self_:expr, $search_result:expr) => { + $search_result.map(move |index| { + // split and take the remaining front half + $self_.haystack = $self_.haystack.split_at(index).0; + $self_.position + index + }) + }; +} + +/// An iterator for `memchr`. +pub struct Memchr<'a> { + needle: u8, + // The haystack to iterate over + haystack: &'a [u8], + // The index + position: usize, +} + +impl<'a> Memchr<'a> { + /// Creates a new iterator that yields all positions of needle in haystack. + #[inline] + pub fn new(needle: u8, haystack: &[u8]) -> Memchr { + Memchr { needle: needle, haystack: haystack, position: 0 } + } +} + +impl<'a> Iterator for Memchr<'a> { + type Item = usize; + + #[inline] + fn next(&mut self) -> Option<usize> { + iter_next!(self, memchr(self.needle, self.haystack)) + } + + #[inline] + fn size_hint(&self) -> (usize, Option<usize>) { + (0, Some(self.haystack.len())) + } +} + +impl<'a> DoubleEndedIterator for Memchr<'a> { + #[inline] + fn next_back(&mut self) -> Option<Self::Item> { + iter_next_back!(self, memrchr(self.needle, self.haystack)) + } +} + +/// An iterator for `memchr2`. +pub struct Memchr2<'a> { + needle1: u8, + needle2: u8, + // The haystack to iterate over + haystack: &'a [u8], + // The index + position: usize, +} + +impl<'a> Memchr2<'a> { + /// Creates a new iterator that yields all positions of needle in haystack. + #[inline] + pub fn new(needle1: u8, needle2: u8, haystack: &[u8]) -> Memchr2 { + Memchr2 { + needle1: needle1, + needle2: needle2, + haystack: haystack, + position: 0, + } + } +} + +impl<'a> Iterator for Memchr2<'a> { + type Item = usize; + + #[inline] + fn next(&mut self) -> Option<usize> { + iter_next!(self, memchr2(self.needle1, self.needle2, self.haystack)) + } + + #[inline] + fn size_hint(&self) -> (usize, Option<usize>) { + (0, Some(self.haystack.len())) + } +} + +impl<'a> DoubleEndedIterator for Memchr2<'a> { + #[inline] + fn next_back(&mut self) -> Option<Self::Item> { + iter_next_back!( + self, + memrchr2(self.needle1, self.needle2, self.haystack) + ) + } +} + +/// An iterator for `memchr3`. +pub struct Memchr3<'a> { + needle1: u8, + needle2: u8, + needle3: u8, + // The haystack to iterate over + haystack: &'a [u8], + // The index + position: usize, +} + +impl<'a> Memchr3<'a> { + /// Create a new `Memchr3` that's initialized to zero with a haystack + #[inline] + pub fn new( + needle1: u8, + needle2: u8, + needle3: u8, + haystack: &[u8], + ) -> Memchr3 { + Memchr3 { + needle1: needle1, + needle2: needle2, + needle3: needle3, + haystack: haystack, + position: 0, + } + } +} + +impl<'a> Iterator for Memchr3<'a> { + type Item = usize; + + #[inline] + fn next(&mut self) -> Option<usize> { + iter_next!( + self, + memchr3(self.needle1, self.needle2, self.needle3, self.haystack) + ) + } + + #[inline] + fn size_hint(&self) -> (usize, Option<usize>) { + (0, Some(self.haystack.len())) + } +} + +impl<'a> DoubleEndedIterator for Memchr3<'a> { + #[inline] + fn next_back(&mut self) -> Option<Self::Item> { + iter_next_back!( + self, + memrchr3(self.needle1, self.needle2, self.needle3, self.haystack) + ) + } +} diff --git a/third_party/rust/memchr/src/lib.rs b/third_party/rust/memchr/src/lib.rs new file mode 100644 index 0000000000..fed7108734 --- /dev/null +++ b/third_party/rust/memchr/src/lib.rs @@ -0,0 +1,451 @@ +/*! +The `memchr` crate provides heavily optimized routines for searching bytes. + +The `memchr` function is traditionally provided by libc, however, the +performance of `memchr` can vary significantly depending on the specific +implementation of libc that is used. They can range from manually tuned +Assembly implementations (like that found in GNU's libc) all the way to +non-vectorized C implementations (like that found in MUSL). + +To smooth out the differences between implementations of libc, at least +on `x86_64` for Rust 1.27+, this crate provides its own implementation of +`memchr` that should perform competitively with the one found in GNU's libc. +The implementation is in pure Rust and has no dependency on a C compiler or an +Assembler. + +Additionally, GNU libc also provides an extension, `memrchr`. This crate +provides its own implementation of `memrchr` as well, on top of `memchr2`, +`memchr3`, `memrchr2` and `memrchr3`. The difference between `memchr` and +`memchr2` is that that `memchr2` permits finding all occurrences of two bytes +instead of one. Similarly for `memchr3`. +*/ + +#![cfg_attr(not(feature = "std"), no_std)] +#![deny(missing_docs)] +#![doc(html_root_url = "https://docs.rs/memchr/2.0.0")] + +// Supporting 8-bit (or others) would be fine. If you need it, please submit a +// bug report at https://github.com/BurntSushi/rust-memchr +#[cfg(not(any( + target_pointer_width = "16", + target_pointer_width = "32", + target_pointer_width = "64" +)))] +compile_error!("memchr currently not supported on non-32 or non-64 bit"); + +#[cfg(feature = "std")] +extern crate core; + +#[cfg(all(test, all(not(miri), feature = "std")))] +#[macro_use] +extern crate quickcheck; + +use core::iter::Rev; + +pub use iter::{Memchr, Memchr2, Memchr3}; + +// N.B. If you're looking for the cfg knobs for libc, see build.rs. +#[cfg(memchr_libc)] +mod c; +#[allow(dead_code)] +mod fallback; +mod iter; +mod naive; +#[cfg(all(test, all(not(miri), feature = "std")))] +mod tests; +#[cfg(all(test, any(miri, not(feature = "std"))))] +#[path = "tests/miri.rs"] +mod tests; +#[cfg(all(not(miri), target_arch = "x86_64", memchr_runtime_simd))] +mod x86; + +/// An iterator over all occurrences of the needle in a haystack. +#[inline] +pub fn memchr_iter(needle: u8, haystack: &[u8]) -> Memchr { + Memchr::new(needle, haystack) +} + +/// An iterator over all occurrences of the needles in a haystack. +#[inline] +pub fn memchr2_iter(needle1: u8, needle2: u8, haystack: &[u8]) -> Memchr2 { + Memchr2::new(needle1, needle2, haystack) +} + +/// An iterator over all occurrences of the needles in a haystack. +#[inline] +pub fn memchr3_iter( + needle1: u8, + needle2: u8, + needle3: u8, + haystack: &[u8], +) -> Memchr3 { + Memchr3::new(needle1, needle2, needle3, haystack) +} + +/// An iterator over all occurrences of the needle in a haystack, in reverse. +#[inline] +pub fn memrchr_iter(needle: u8, haystack: &[u8]) -> Rev<Memchr> { + Memchr::new(needle, haystack).rev() +} + +/// An iterator over all occurrences of the needles in a haystack, in reverse. +#[inline] +pub fn memrchr2_iter( + needle1: u8, + needle2: u8, + haystack: &[u8], +) -> Rev<Memchr2> { + Memchr2::new(needle1, needle2, haystack).rev() +} + +/// An iterator over all occurrences of the needles in a haystack, in reverse. +#[inline] +pub fn memrchr3_iter( + needle1: u8, + needle2: u8, + needle3: u8, + haystack: &[u8], +) -> Rev<Memchr3> { + Memchr3::new(needle1, needle2, needle3, haystack).rev() +} + +/// Search for the first occurrence of a byte in a slice. +/// +/// This returns the index corresponding to the first occurrence of `needle` in +/// `haystack`, or `None` if one is not found. +/// +/// While this is operationally the same as something like +/// `haystack.iter().position(|&b| b == needle)`, `memchr` will use a highly +/// optimized routine that can be up to an order of magnitude faster in some +/// cases. +/// +/// # Example +/// +/// This shows how to find the first position of a byte in a byte string. +/// +/// ``` +/// use memchr::memchr; +/// +/// let haystack = b"the quick brown fox"; +/// assert_eq!(memchr(b'k', haystack), Some(8)); +/// ``` +#[inline] +pub fn memchr(needle: u8, haystack: &[u8]) -> Option<usize> { + #[cfg(miri)] + #[inline(always)] + fn imp(n1: u8, haystack: &[u8]) -> Option<usize> { + naive::memchr(n1, haystack) + } + + #[cfg(all(target_arch = "x86_64", memchr_runtime_simd, not(miri)))] + #[inline(always)] + fn imp(n1: u8, haystack: &[u8]) -> Option<usize> { + x86::memchr(n1, haystack) + } + + #[cfg(all( + memchr_libc, + not(all(target_arch = "x86_64", memchr_runtime_simd)), + not(miri), + ))] + #[inline(always)] + fn imp(n1: u8, haystack: &[u8]) -> Option<usize> { + c::memchr(n1, haystack) + } + + #[cfg(all( + not(memchr_libc), + not(all(target_arch = "x86_64", memchr_runtime_simd)), + not(miri), + ))] + #[inline(always)] + fn imp(n1: u8, haystack: &[u8]) -> Option<usize> { + fallback::memchr(n1, haystack) + } + + if haystack.is_empty() { + None + } else { + imp(needle, haystack) + } +} + +/// Like `memchr`, but searches for either of two bytes instead of just one. +/// +/// This returns the index corresponding to the first occurrence of `needle1` +/// or the first occurrence of `needle2` in `haystack` (whichever occurs +/// earlier), or `None` if neither one is found. +/// +/// While this is operationally the same as something like +/// `haystack.iter().position(|&b| b == needle1 || b == needle2)`, `memchr2` +/// will use a highly optimized routine that can be up to an order of magnitude +/// faster in some cases. +/// +/// # Example +/// +/// This shows how to find the first position of either of two bytes in a byte +/// string. +/// +/// ``` +/// use memchr::memchr2; +/// +/// let haystack = b"the quick brown fox"; +/// assert_eq!(memchr2(b'k', b'q', haystack), Some(4)); +/// ``` +#[inline] +pub fn memchr2(needle1: u8, needle2: u8, haystack: &[u8]) -> Option<usize> { + #[cfg(miri)] + #[inline(always)] + fn imp(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> { + naive::memchr2(n1, n2, haystack) + } + + #[cfg(all(target_arch = "x86_64", memchr_runtime_simd, not(miri)))] + #[inline(always)] + fn imp(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> { + x86::memchr2(n1, n2, haystack) + } + + #[cfg(all( + not(all(target_arch = "x86_64", memchr_runtime_simd)), + not(miri), + ))] + #[inline(always)] + fn imp(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> { + fallback::memchr2(n1, n2, haystack) + } + + if haystack.is_empty() { + None + } else { + imp(needle1, needle2, haystack) + } +} + +/// Like `memchr`, but searches for any of three bytes instead of just one. +/// +/// This returns the index corresponding to the first occurrence of `needle1`, +/// the first occurrence of `needle2`, or the first occurrence of `needle3` in +/// `haystack` (whichever occurs earliest), or `None` if none are found. +/// +/// While this is operationally the same as something like +/// `haystack.iter().position(|&b| b == needle1 || b == needle2 || +/// b == needle3)`, `memchr3` will use a highly optimized routine that can be +/// up to an order of magnitude faster in some cases. +/// +/// # Example +/// +/// This shows how to find the first position of any of three bytes in a byte +/// string. +/// +/// ``` +/// use memchr::memchr3; +/// +/// let haystack = b"the quick brown fox"; +/// assert_eq!(memchr3(b'k', b'q', b'e', haystack), Some(2)); +/// ``` +#[inline] +pub fn memchr3( + needle1: u8, + needle2: u8, + needle3: u8, + haystack: &[u8], +) -> Option<usize> { + #[cfg(miri)] + #[inline(always)] + fn imp(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option<usize> { + naive::memchr3(n1, n2, n3, haystack) + } + + #[cfg(all(target_arch = "x86_64", memchr_runtime_simd, not(miri)))] + #[inline(always)] + fn imp(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option<usize> { + x86::memchr3(n1, n2, n3, haystack) + } + + #[cfg(all( + not(all(target_arch = "x86_64", memchr_runtime_simd)), + not(miri), + ))] + #[inline(always)] + fn imp(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option<usize> { + fallback::memchr3(n1, n2, n3, haystack) + } + + if haystack.is_empty() { + None + } else { + imp(needle1, needle2, needle3, haystack) + } +} + +/// Search for the last occurrence of a byte in a slice. +/// +/// This returns the index corresponding to the last occurrence of `needle` in +/// `haystack`, or `None` if one is not found. +/// +/// While this is operationally the same as something like +/// `haystack.iter().rposition(|&b| b == needle)`, `memrchr` will use a highly +/// optimized routine that can be up to an order of magnitude faster in some +/// cases. +/// +/// # Example +/// +/// This shows how to find the last position of a byte in a byte string. +/// +/// ``` +/// use memchr::memrchr; +/// +/// let haystack = b"the quick brown fox"; +/// assert_eq!(memrchr(b'o', haystack), Some(17)); +/// ``` +#[inline] +pub fn memrchr(needle: u8, haystack: &[u8]) -> Option<usize> { + #[cfg(miri)] + #[inline(always)] + fn imp(n1: u8, haystack: &[u8]) -> Option<usize> { + naive::memrchr(n1, haystack) + } + + #[cfg(all(target_arch = "x86_64", memchr_runtime_simd, not(miri)))] + #[inline(always)] + fn imp(n1: u8, haystack: &[u8]) -> Option<usize> { + x86::memrchr(n1, haystack) + } + + #[cfg(all( + memchr_libc, + target_os = "linux", + not(all(target_arch = "x86_64", memchr_runtime_simd)), + not(miri) + ))] + #[inline(always)] + fn imp(n1: u8, haystack: &[u8]) -> Option<usize> { + c::memrchr(n1, haystack) + } + + #[cfg(all( + not(all(memchr_libc, target_os = "linux")), + not(all(target_arch = "x86_64", memchr_runtime_simd)), + not(miri), + ))] + #[inline(always)] + fn imp(n1: u8, haystack: &[u8]) -> Option<usize> { + fallback::memrchr(n1, haystack) + } + + if haystack.is_empty() { + None + } else { + imp(needle, haystack) + } +} + +/// Like `memrchr`, but searches for either of two bytes instead of just one. +/// +/// This returns the index corresponding to the last occurrence of `needle1` +/// or the last occurrence of `needle2` in `haystack` (whichever occurs later), +/// or `None` if neither one is found. +/// +/// While this is operationally the same as something like +/// `haystack.iter().rposition(|&b| b == needle1 || b == needle2)`, `memrchr2` +/// will use a highly optimized routine that can be up to an order of magnitude +/// faster in some cases. +/// +/// # Example +/// +/// This shows how to find the last position of either of two bytes in a byte +/// string. +/// +/// ``` +/// use memchr::memrchr2; +/// +/// let haystack = b"the quick brown fox"; +/// assert_eq!(memrchr2(b'k', b'q', haystack), Some(8)); +/// ``` +#[inline] +pub fn memrchr2(needle1: u8, needle2: u8, haystack: &[u8]) -> Option<usize> { + #[cfg(miri)] + #[inline(always)] + fn imp(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> { + naive::memrchr2(n1, n2, haystack) + } + + #[cfg(all(target_arch = "x86_64", memchr_runtime_simd, not(miri)))] + #[inline(always)] + fn imp(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> { + x86::memrchr2(n1, n2, haystack) + } + + #[cfg(all( + not(all(target_arch = "x86_64", memchr_runtime_simd)), + not(miri), + ))] + #[inline(always)] + fn imp(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> { + fallback::memrchr2(n1, n2, haystack) + } + + if haystack.is_empty() { + None + } else { + imp(needle1, needle2, haystack) + } +} + +/// Like `memrchr`, but searches for any of three bytes instead of just one. +/// +/// This returns the index corresponding to the last occurrence of `needle1`, +/// the last occurrence of `needle2`, or the last occurrence of `needle3` in +/// `haystack` (whichever occurs later), or `None` if none are found. +/// +/// While this is operationally the same as something like +/// `haystack.iter().rposition(|&b| b == needle1 || b == needle2 || +/// b == needle3)`, `memrchr3` will use a highly optimized routine that can be +/// up to an order of magnitude faster in some cases. +/// +/// # Example +/// +/// This shows how to find the last position of any of three bytes in a byte +/// string. +/// +/// ``` +/// use memchr::memrchr3; +/// +/// let haystack = b"the quick brown fox"; +/// assert_eq!(memrchr3(b'k', b'q', b'e', haystack), Some(8)); +/// ``` +#[inline] +pub fn memrchr3( + needle1: u8, + needle2: u8, + needle3: u8, + haystack: &[u8], +) -> Option<usize> { + #[cfg(miri)] + #[inline(always)] + fn imp(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option<usize> { + naive::memrchr3(n1, n2, n3, haystack) + } + + #[cfg(all(target_arch = "x86_64", memchr_runtime_simd, not(miri)))] + #[inline(always)] + fn imp(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option<usize> { + x86::memrchr3(n1, n2, n3, haystack) + } + + #[cfg(all( + not(all(target_arch = "x86_64", memchr_runtime_simd)), + not(miri), + ))] + #[inline(always)] + fn imp(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option<usize> { + fallback::memrchr3(n1, n2, n3, haystack) + } + + if haystack.is_empty() { + None + } else { + imp(needle1, needle2, needle3, haystack) + } +} diff --git a/third_party/rust/memchr/src/naive.rs b/third_party/rust/memchr/src/naive.rs new file mode 100644 index 0000000000..3f3053d481 --- /dev/null +++ b/third_party/rust/memchr/src/naive.rs @@ -0,0 +1,25 @@ +#![allow(dead_code)] + +pub fn memchr(n1: u8, haystack: &[u8]) -> Option<usize> { + haystack.iter().position(|&b| b == n1) +} + +pub fn memchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> { + haystack.iter().position(|&b| b == n1 || b == n2) +} + +pub fn memchr3(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option<usize> { + haystack.iter().position(|&b| b == n1 || b == n2 || b == n3) +} + +pub fn memrchr(n1: u8, haystack: &[u8]) -> Option<usize> { + haystack.iter().rposition(|&b| b == n1) +} + +pub fn memrchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> { + haystack.iter().rposition(|&b| b == n1 || b == n2) +} + +pub fn memrchr3(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option<usize> { + haystack.iter().rposition(|&b| b == n1 || b == n2 || b == n3) +} diff --git a/third_party/rust/memchr/src/tests/iter.rs b/third_party/rust/memchr/src/tests/iter.rs new file mode 100644 index 0000000000..8f335003b9 --- /dev/null +++ b/third_party/rust/memchr/src/tests/iter.rs @@ -0,0 +1,229 @@ +use tests::memchr_tests; +use {Memchr, Memchr2, Memchr3}; + +#[test] +fn memchr1_iter() { + for test in memchr_tests() { + test.iter_one(false, Memchr::new); + } +} + +#[test] +fn memchr2_iter() { + for test in memchr_tests() { + test.iter_two(false, Memchr2::new); + } +} + +#[test] +fn memchr3_iter() { + for test in memchr_tests() { + test.iter_three(false, Memchr3::new); + } +} + +#[test] +fn memrchr1_iter() { + for test in memchr_tests() { + test.iter_one(true, |n1, corpus| Memchr::new(n1, corpus).rev()); + } +} + +#[test] +fn memrchr2_iter() { + for test in memchr_tests() { + test.iter_two(true, |n1, n2, corpus| { + Memchr2::new(n1, n2, corpus).rev() + }) + } +} + +#[test] +fn memrchr3_iter() { + for test in memchr_tests() { + test.iter_three(true, |n1, n2, n3, corpus| { + Memchr3::new(n1, n2, n3, corpus).rev() + }) + } +} + +quickcheck! { + fn qc_memchr_double_ended_iter( + needle: u8, data: Vec<u8>, take_side: Vec<bool> + ) -> bool { + // make nonempty + let mut take_side = take_side; + if take_side.is_empty() { take_side.push(true) }; + + let iter = Memchr::new(needle, &data); + let all_found = double_ended_take( + iter, take_side.iter().cycle().cloned()); + + all_found.iter().cloned().eq(positions1(needle, &data)) + } + + fn qc_memchr2_double_ended_iter( + needle1: u8, needle2: u8, data: Vec<u8>, take_side: Vec<bool> + ) -> bool { + // make nonempty + let mut take_side = take_side; + if take_side.is_empty() { take_side.push(true) }; + + let iter = Memchr2::new(needle1, needle2, &data); + let all_found = double_ended_take( + iter, take_side.iter().cycle().cloned()); + + all_found.iter().cloned().eq(positions2(needle1, needle2, &data)) + } + + fn qc_memchr3_double_ended_iter( + needle1: u8, needle2: u8, needle3: u8, + data: Vec<u8>, take_side: Vec<bool> + ) -> bool { + // make nonempty + let mut take_side = take_side; + if take_side.is_empty() { take_side.push(true) }; + + let iter = Memchr3::new(needle1, needle2, needle3, &data); + let all_found = double_ended_take( + iter, take_side.iter().cycle().cloned()); + + all_found + .iter() + .cloned() + .eq(positions3(needle1, needle2, needle3, &data)) + } + + fn qc_memchr1_iter(data: Vec<u8>) -> bool { + let needle = 0; + let answer = positions1(needle, &data); + answer.eq(Memchr::new(needle, &data)) + } + + fn qc_memchr1_rev_iter(data: Vec<u8>) -> bool { + let needle = 0; + let answer = positions1(needle, &data); + answer.rev().eq(Memchr::new(needle, &data).rev()) + } + + fn qc_memchr2_iter(data: Vec<u8>) -> bool { + let needle1 = 0; + let needle2 = 1; + let answer = positions2(needle1, needle2, &data); + answer.eq(Memchr2::new(needle1, needle2, &data)) + } + + fn qc_memchr2_rev_iter(data: Vec<u8>) -> bool { + let needle1 = 0; + let needle2 = 1; + let answer = positions2(needle1, needle2, &data); + answer.rev().eq(Memchr2::new(needle1, needle2, &data).rev()) + } + + fn qc_memchr3_iter(data: Vec<u8>) -> bool { + let needle1 = 0; + let needle2 = 1; + let needle3 = 2; + let answer = positions3(needle1, needle2, needle3, &data); + answer.eq(Memchr3::new(needle1, needle2, needle3, &data)) + } + + fn qc_memchr3_rev_iter(data: Vec<u8>) -> bool { + let needle1 = 0; + let needle2 = 1; + let needle3 = 2; + let answer = positions3(needle1, needle2, needle3, &data); + answer.rev().eq(Memchr3::new(needle1, needle2, needle3, &data).rev()) + } + + fn qc_memchr1_iter_size_hint(data: Vec<u8>) -> bool { + // test that the size hint is within reasonable bounds + let needle = 0; + let mut iter = Memchr::new(needle, &data); + let mut real_count = data + .iter() + .filter(|&&elt| elt == needle) + .count(); + + while let Some(index) = iter.next() { + real_count -= 1; + let (lower, upper) = iter.size_hint(); + assert!(lower <= real_count); + assert!(upper.unwrap() >= real_count); + assert!(upper.unwrap() <= data.len() - index); + } + true + } +} + +// take items from a DEI, taking front for each true and back for each false. +// Return a vector with the concatenation of the fronts and the reverse of the +// backs. +fn double_ended_take<I, J>(mut iter: I, take_side: J) -> Vec<I::Item> +where + I: DoubleEndedIterator, + J: Iterator<Item = bool>, +{ + let mut found_front = Vec::new(); + let mut found_back = Vec::new(); + + for take_front in take_side { + if take_front { + if let Some(pos) = iter.next() { + found_front.push(pos); + } else { + break; + } + } else { + if let Some(pos) = iter.next_back() { + found_back.push(pos); + } else { + break; + } + }; + } + + let mut all_found = found_front; + all_found.extend(found_back.into_iter().rev()); + all_found +} + +// return an iterator of the 0-based indices of haystack that match the needle +fn positions1<'a>( + n1: u8, + haystack: &'a [u8], +) -> Box<dyn DoubleEndedIterator<Item = usize> + 'a> { + let it = haystack + .iter() + .enumerate() + .filter(move |&(_, &b)| b == n1) + .map(|t| t.0); + Box::new(it) +} + +fn positions2<'a>( + n1: u8, + n2: u8, + haystack: &'a [u8], +) -> Box<dyn DoubleEndedIterator<Item = usize> + 'a> { + let it = haystack + .iter() + .enumerate() + .filter(move |&(_, &b)| b == n1 || b == n2) + .map(|t| t.0); + Box::new(it) +} + +fn positions3<'a>( + n1: u8, + n2: u8, + n3: u8, + haystack: &'a [u8], +) -> Box<dyn DoubleEndedIterator<Item = usize> + 'a> { + let it = haystack + .iter() + .enumerate() + .filter(move |&(_, &b)| b == n1 || b == n2 || b == n3) + .map(|t| t.0); + Box::new(it) +} diff --git a/third_party/rust/memchr/src/tests/memchr.rs b/third_party/rust/memchr/src/tests/memchr.rs new file mode 100644 index 0000000000..87d3d14edd --- /dev/null +++ b/third_party/rust/memchr/src/tests/memchr.rs @@ -0,0 +1,131 @@ +use fallback; +use naive; +use {memchr, memchr2, memchr3, memrchr, memrchr2, memrchr3}; + +use tests::memchr_tests; + +#[test] +fn memchr1_find() { + for test in memchr_tests() { + test.one(false, memchr); + } +} + +#[test] +fn memchr1_fallback_find() { + for test in memchr_tests() { + test.one(false, fallback::memchr); + } +} + +#[test] +fn memchr2_find() { + for test in memchr_tests() { + test.two(false, memchr2); + } +} + +#[test] +fn memchr2_fallback_find() { + for test in memchr_tests() { + test.two(false, fallback::memchr2); + } +} + +#[test] +fn memchr3_find() { + for test in memchr_tests() { + test.three(false, memchr3); + } +} + +#[test] +fn memchr3_fallback_find() { + for test in memchr_tests() { + test.three(false, fallback::memchr3); + } +} + +#[test] +fn memrchr1_find() { + for test in memchr_tests() { + test.one(true, memrchr); + } +} + +#[test] +fn memrchr1_fallback_find() { + for test in memchr_tests() { + test.one(true, fallback::memrchr); + } +} + +#[test] +fn memrchr2_find() { + for test in memchr_tests() { + test.two(true, memrchr2); + } +} + +#[test] +fn memrchr2_fallback_find() { + for test in memchr_tests() { + test.two(true, fallback::memrchr2); + } +} + +#[test] +fn memrchr3_find() { + for test in memchr_tests() { + test.three(true, memrchr3); + } +} + +#[test] +fn memrchr3_fallback_find() { + for test in memchr_tests() { + test.three(true, fallback::memrchr3); + } +} + +quickcheck! { + fn qc_memchr1_matches_naive(n1: u8, corpus: Vec<u8>) -> bool { + memchr(n1, &corpus) == naive::memchr(n1, &corpus) + } +} + +quickcheck! { + fn qc_memchr2_matches_naive(n1: u8, n2: u8, corpus: Vec<u8>) -> bool { + memchr2(n1, n2, &corpus) == naive::memchr2(n1, n2, &corpus) + } +} + +quickcheck! { + fn qc_memchr3_matches_naive( + n1: u8, n2: u8, n3: u8, + corpus: Vec<u8> + ) -> bool { + memchr3(n1, n2, n3, &corpus) == naive::memchr3(n1, n2, n3, &corpus) + } +} + +quickcheck! { + fn qc_memrchr1_matches_naive(n1: u8, corpus: Vec<u8>) -> bool { + memrchr(n1, &corpus) == naive::memrchr(n1, &corpus) + } +} + +quickcheck! { + fn qc_memrchr2_matches_naive(n1: u8, n2: u8, corpus: Vec<u8>) -> bool { + memrchr2(n1, n2, &corpus) == naive::memrchr2(n1, n2, &corpus) + } +} + +quickcheck! { + fn qc_memrchr3_matches_naive( + n1: u8, n2: u8, n3: u8, + corpus: Vec<u8> + ) -> bool { + memrchr3(n1, n2, n3, &corpus) == naive::memrchr3(n1, n2, n3, &corpus) + } +} diff --git a/third_party/rust/memchr/src/tests/miri.rs b/third_party/rust/memchr/src/tests/miri.rs new file mode 100644 index 0000000000..879ef938ec --- /dev/null +++ b/third_party/rust/memchr/src/tests/miri.rs @@ -0,0 +1,19 @@ +// Simple tests using MIRI + +use crate::{memchr, memchr2, memchr3, memrchr, memrchr2, memrchr3}; + +#[test] +fn test_with_miri() { + assert_eq!(memchr(b'a', b"abcda"), Some(0)); + assert_eq!(memchr(b'z', b"abcda"), None); + assert_eq!(memchr2(b'a', b'z', b"abcda"), Some(0)); + assert_eq!(memchr2(b'z', b'y', b"abcda"), None); + assert_eq!(memchr3(b'a', b'z', b'b', b"abcda"), Some(0)); + assert_eq!(memchr3(b'z', b'y', b'x', b"abcda"), None); + assert_eq!(memrchr(b'a', b"abcda"), Some(4)); + assert_eq!(memrchr(b'z', b"abcda"), None); + assert_eq!(memrchr2(b'a', b'z', b"abcda"), Some(4)); + assert_eq!(memrchr2(b'z', b'y', b"abcda"), None); + assert_eq!(memrchr3(b'a', b'z', b'b', b"abcda"), Some(4)); + assert_eq!(memrchr3(b'z', b'y', b'x', b"abcda"), None); +} diff --git a/third_party/rust/memchr/src/tests/mod.rs b/third_party/rust/memchr/src/tests/mod.rs new file mode 100644 index 0000000000..82c1a248e9 --- /dev/null +++ b/third_party/rust/memchr/src/tests/mod.rs @@ -0,0 +1,362 @@ +use std::iter::repeat; + +mod iter; +mod memchr; + +#[cfg(target_endian = "little")] +#[test] +fn byte_order() { + eprintln!("LITTLE ENDIAN"); +} + +#[cfg(target_endian = "big")] +#[test] +fn byte_order() { + eprintln!("BIG ENDIAN"); +} + +/// Create a sequence of tests that should be run by memchr implementations. +fn memchr_tests() -> Vec<MemchrTest> { + let mut tests = Vec::new(); + for statict in MEMCHR_TESTS { + assert!(!statict.corpus.contains("%"), "% is not allowed in corpora"); + assert!(!statict.corpus.contains("#"), "# is not allowed in corpora"); + assert!(!statict.needles.contains(&b'%'), "% is an invalid needle"); + assert!(!statict.needles.contains(&b'#'), "# is an invalid needle"); + + let t = MemchrTest { + corpus: statict.corpus.to_string(), + needles: statict.needles.to_vec(), + positions: statict.positions.to_vec(), + }; + tests.push(t.clone()); + tests.extend(t.expand()); + } + tests +} + +/// A set of tests for memchr-like functions. +/// +/// These tests mostly try to cover the short string cases. We cover the longer +/// string cases via the benchmarks (which are tests themselves), via +/// quickcheck tests and via automatic expansion of each test case (by +/// increasing the corpus size). Finally, we cover different alignment cases +/// in the tests by varying the starting point of the slice. +const MEMCHR_TESTS: &[MemchrTestStatic] = &[ + // one needle (applied to memchr + memchr2 + memchr3) + MemchrTestStatic { corpus: "a", needles: &[b'a'], positions: &[0] }, + MemchrTestStatic { corpus: "aa", needles: &[b'a'], positions: &[0, 1] }, + MemchrTestStatic { + corpus: "aaa", + needles: &[b'a'], + positions: &[0, 1, 2], + }, + MemchrTestStatic { corpus: "", needles: &[b'a'], positions: &[] }, + MemchrTestStatic { corpus: "z", needles: &[b'a'], positions: &[] }, + MemchrTestStatic { corpus: "zz", needles: &[b'a'], positions: &[] }, + MemchrTestStatic { corpus: "zza", needles: &[b'a'], positions: &[2] }, + MemchrTestStatic { corpus: "zaza", needles: &[b'a'], positions: &[1, 3] }, + MemchrTestStatic { corpus: "zzza", needles: &[b'a'], positions: &[3] }, + MemchrTestStatic { corpus: "\x00a", needles: &[b'a'], positions: &[1] }, + MemchrTestStatic { corpus: "\x00", needles: &[b'\x00'], positions: &[0] }, + MemchrTestStatic { + corpus: "\x00\x00", + needles: &[b'\x00'], + positions: &[0, 1], + }, + MemchrTestStatic { + corpus: "\x00a\x00", + needles: &[b'\x00'], + positions: &[0, 2], + }, + MemchrTestStatic { + corpus: "zzzzzzzzzzzzzzzza", + needles: &[b'a'], + positions: &[16], + }, + MemchrTestStatic { + corpus: "zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzza", + needles: &[b'a'], + positions: &[32], + }, + // two needles (applied to memchr2 + memchr3) + MemchrTestStatic { + corpus: "az", + needles: &[b'a', b'z'], + positions: &[0, 1], + }, + MemchrTestStatic { + corpus: "az", + needles: &[b'a', b'z'], + positions: &[0, 1], + }, + MemchrTestStatic { corpus: "az", needles: &[b'x', b'y'], positions: &[] }, + MemchrTestStatic { corpus: "az", needles: &[b'a', b'y'], positions: &[0] }, + MemchrTestStatic { corpus: "az", needles: &[b'x', b'z'], positions: &[1] }, + MemchrTestStatic { + corpus: "yyyyaz", + needles: &[b'a', b'z'], + positions: &[4, 5], + }, + MemchrTestStatic { + corpus: "yyyyaz", + needles: &[b'z', b'a'], + positions: &[4, 5], + }, + // three needles (applied to memchr3) + MemchrTestStatic { + corpus: "xyz", + needles: &[b'x', b'y', b'z'], + positions: &[0, 1, 2], + }, + MemchrTestStatic { + corpus: "zxy", + needles: &[b'x', b'y', b'z'], + positions: &[0, 1, 2], + }, + MemchrTestStatic { + corpus: "zxy", + needles: &[b'x', b'a', b'z'], + positions: &[0, 1], + }, + MemchrTestStatic { + corpus: "zxy", + needles: &[b't', b'a', b'z'], + positions: &[0], + }, + MemchrTestStatic { + corpus: "yxz", + needles: &[b't', b'a', b'z'], + positions: &[2], + }, +]; + +/// A description of a test on a memchr like function. +#[derive(Clone, Debug)] +struct MemchrTest { + /// The thing to search. We use `&str` instead of `&[u8]` because they + /// are nicer to write in tests, and we don't miss much since memchr + /// doesn't care about UTF-8. + /// + /// Corpora cannot contain either '%' or '#'. We use these bytes when + /// expanding test cases into many test cases, and we assume they are not + /// used. If they are used, `memchr_tests` will panic. + corpus: String, + /// The needles to search for. This is intended to be an "alternation" of + /// needles. The number of needles may cause this test to be skipped for + /// some memchr variants. For example, a test with 2 needles cannot be used + /// to test `memchr`, but can be used to test `memchr2` and `memchr3`. + /// However, a test with only 1 needle can be used to test all of `memchr`, + /// `memchr2` and `memchr3`. We achieve this by filling in the needles with + /// bytes that we never used in the corpus (such as '#'). + needles: Vec<u8>, + /// The positions expected to match for all of the needles. + positions: Vec<usize>, +} + +/// Like MemchrTest, but easier to define as a constant. +#[derive(Clone, Debug)] +struct MemchrTestStatic { + corpus: &'static str, + needles: &'static [u8], + positions: &'static [usize], +} + +impl MemchrTest { + fn one<F: Fn(u8, &[u8]) -> Option<usize>>(&self, reverse: bool, f: F) { + let needles = match self.needles(1) { + None => return, + Some(needles) => needles, + }; + // We test different alignments here. Since some implementations use + // AVX2, which can read 32 bytes at a time, we test at least that. + // Moreover, with loop unrolling, we sometimes process 64 (sse2) or 128 + // (avx) bytes at a time, so we include that in our offsets as well. + // + // You might think this would cause most needles to not be found, but + // we actually expand our tests to include corpus sizes all the way up + // to >500 bytes, so we should exericse most branches. + for align in 0..130 { + let corpus = self.corpus(align); + assert_eq!( + self.positions(align, reverse).get(0).cloned(), + f(needles[0], corpus.as_bytes()), + "search for {:?} failed in: {:?} (len: {}, alignment: {})", + needles[0] as char, + corpus, + corpus.len(), + align + ); + } + } + + fn two<F: Fn(u8, u8, &[u8]) -> Option<usize>>(&self, reverse: bool, f: F) { + let needles = match self.needles(2) { + None => return, + Some(needles) => needles, + }; + for align in 0..130 { + let corpus = self.corpus(align); + assert_eq!( + self.positions(align, reverse).get(0).cloned(), + f(needles[0], needles[1], corpus.as_bytes()), + "search for {:?}|{:?} failed in: {:?} \ + (len: {}, alignment: {})", + needles[0] as char, + needles[1] as char, + corpus, + corpus.len(), + align + ); + } + } + + fn three<F: Fn(u8, u8, u8, &[u8]) -> Option<usize>>( + &self, + reverse: bool, + f: F, + ) { + let needles = match self.needles(3) { + None => return, + Some(needles) => needles, + }; + for align in 0..130 { + let corpus = self.corpus(align); + assert_eq!( + self.positions(align, reverse).get(0).cloned(), + f(needles[0], needles[1], needles[2], corpus.as_bytes()), + "search for {:?}|{:?}|{:?} failed in: {:?} \ + (len: {}, alignment: {})", + needles[0] as char, + needles[1] as char, + needles[2] as char, + corpus, + corpus.len(), + align + ); + } + } + + fn iter_one<'a, I, F>(&'a self, reverse: bool, f: F) + where + F: FnOnce(u8, &'a [u8]) -> I, + I: Iterator<Item = usize>, + { + if let Some(ns) = self.needles(1) { + self.iter(reverse, f(ns[0], self.corpus.as_bytes())); + } + } + + fn iter_two<'a, I, F>(&'a self, reverse: bool, f: F) + where + F: FnOnce(u8, u8, &'a [u8]) -> I, + I: Iterator<Item = usize>, + { + if let Some(ns) = self.needles(2) { + self.iter(reverse, f(ns[0], ns[1], self.corpus.as_bytes())); + } + } + + fn iter_three<'a, I, F>(&'a self, reverse: bool, f: F) + where + F: FnOnce(u8, u8, u8, &'a [u8]) -> I, + I: Iterator<Item = usize>, + { + if let Some(ns) = self.needles(3) { + self.iter(reverse, f(ns[0], ns[1], ns[2], self.corpus.as_bytes())); + } + } + + /// Test that the positions yielded by the given iterator match the + /// positions in this test. If reverse is true, then reverse the positions + /// before comparing them. + fn iter<I: Iterator<Item = usize>>(&self, reverse: bool, it: I) { + assert_eq!( + self.positions(0, reverse), + it.collect::<Vec<usize>>(), + r"search for {:?} failed in: {:?}", + self.needles.iter().map(|&b| b as char).collect::<Vec<char>>(), + self.corpus + ); + } + + /// Expand this test into many variations of the same test. + /// + /// In particular, this will generate more tests with larger corpus sizes. + /// The expected positions are updated to maintain the integrity of the + /// test. + /// + /// This is important in testing a memchr implementation, because there are + /// often different cases depending on the length of the corpus. + /// + /// Note that we extend the corpus by adding `%` bytes, which we + /// don't otherwise use as a needle. + fn expand(&self) -> Vec<MemchrTest> { + let mut more = Vec::new(); + + // Add bytes to the start of the corpus. + for i in 1..515 { + let mut t = self.clone(); + let mut new_corpus: String = repeat('%').take(i).collect(); + new_corpus.push_str(&t.corpus); + t.corpus = new_corpus; + t.positions = t.positions.into_iter().map(|p| p + i).collect(); + more.push(t); + } + // Add bytes to the end of the corpus. + for i in 1..515 { + let mut t = self.clone(); + let padding: String = repeat('%').take(i).collect(); + t.corpus.push_str(&padding); + more.push(t); + } + + more + } + + /// Return the corpus at the given alignment. + /// + /// If the alignment exceeds the length of the corpus, then this returns + /// an empty slice. + fn corpus(&self, align: usize) -> &str { + self.corpus.get(align..).unwrap_or("") + } + + /// Return exactly `count` needles from this test. If this test has less + /// than `count` needles, then add `#` until the number of needles + /// matches `count`. If this test has more than `count` needles, then + /// return `None` (because there is no way to use this test data for a + /// search using fewer needles). + fn needles(&self, count: usize) -> Option<Vec<u8>> { + if self.needles.len() > count { + return None; + } + + let mut needles = self.needles.to_vec(); + for _ in needles.len()..count { + // we assume # is never used in tests. + needles.push(b'#'); + } + Some(needles) + } + + /// Return the positions in this test, reversed if `reverse` is true. + /// + /// If alignment is given, then all positions greater than or equal to that + /// alignment are offset by the alignment. Positions less than the + /// alignment are dropped. + fn positions(&self, align: usize, reverse: bool) -> Vec<usize> { + let positions = if reverse { + let mut positions = self.positions.to_vec(); + positions.reverse(); + positions + } else { + self.positions.to_vec() + }; + positions + .into_iter() + .filter(|&p| p >= align) + .map(|p| p - align) + .collect() + } +} diff --git a/third_party/rust/memchr/src/x86/avx.rs b/third_party/rust/memchr/src/x86/avx.rs new file mode 100644 index 0000000000..e3d8e8902e --- /dev/null +++ b/third_party/rust/memchr/src/x86/avx.rs @@ -0,0 +1,703 @@ +use core::arch::x86_64::*; +use core::cmp; +use core::mem::size_of; + +use x86::sse2; + +const VECTOR_SIZE: usize = size_of::<__m256i>(); +const VECTOR_ALIGN: usize = VECTOR_SIZE - 1; + +// The number of bytes to loop at in one iteration of memchr/memrchr. +const LOOP_SIZE: usize = 4 * VECTOR_SIZE; + +// The number of bytes to loop at in one iteration of memchr2/memrchr2 and +// memchr3/memrchr3. There was no observable difference between 128 and 64 +// bytes in benchmarks. memchr3 in particular only gets a very slight speed up +// from the loop unrolling. +const LOOP_SIZE2: usize = 2 * VECTOR_SIZE; + +#[target_feature(enable = "avx2")] +pub unsafe fn memchr(n1: u8, haystack: &[u8]) -> Option<usize> { + // For a high level explanation for how this algorithm works, see the + // sse2 implementation. The avx implementation here is the same, but with + // 256-bit vectors instead of 128-bit vectors. + + let start_ptr = haystack.as_ptr(); + let end_ptr = haystack[haystack.len()..].as_ptr(); + let mut ptr = start_ptr; + + if haystack.len() < VECTOR_SIZE { + // For small haystacks, defer to the SSE2 implementation. Codegen + // suggests this completely avoids touching the AVX vectors. + return sse2::memchr(n1, haystack); + } + + let vn1 = _mm256_set1_epi8(n1 as i8); + let loop_size = cmp::min(LOOP_SIZE, haystack.len()); + if let Some(i) = forward_search1(start_ptr, end_ptr, ptr, vn1) { + return Some(i); + } + + ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN)); + debug_assert!(ptr > start_ptr && end_ptr.sub(VECTOR_SIZE) >= start_ptr); + while loop_size == LOOP_SIZE && ptr <= end_ptr.sub(loop_size) { + debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE); + + let a = _mm256_load_si256(ptr as *const __m256i); + let b = _mm256_load_si256(ptr.add(VECTOR_SIZE) as *const __m256i); + let c = _mm256_load_si256(ptr.add(2 * VECTOR_SIZE) as *const __m256i); + let d = _mm256_load_si256(ptr.add(3 * VECTOR_SIZE) as *const __m256i); + let eqa = _mm256_cmpeq_epi8(vn1, a); + let eqb = _mm256_cmpeq_epi8(vn1, b); + let eqc = _mm256_cmpeq_epi8(vn1, c); + let eqd = _mm256_cmpeq_epi8(vn1, d); + let or1 = _mm256_or_si256(eqa, eqb); + let or2 = _mm256_or_si256(eqc, eqd); + let or3 = _mm256_or_si256(or1, or2); + if _mm256_movemask_epi8(or3) != 0 { + let mut at = sub(ptr, start_ptr); + let mask = _mm256_movemask_epi8(eqa); + if mask != 0 { + return Some(at + forward_pos(mask)); + } + + at += VECTOR_SIZE; + let mask = _mm256_movemask_epi8(eqb); + if mask != 0 { + return Some(at + forward_pos(mask)); + } + + at += VECTOR_SIZE; + let mask = _mm256_movemask_epi8(eqc); + if mask != 0 { + return Some(at + forward_pos(mask)); + } + + at += VECTOR_SIZE; + let mask = _mm256_movemask_epi8(eqd); + debug_assert!(mask != 0); + return Some(at + forward_pos(mask)); + } + ptr = ptr.add(loop_size); + } + while ptr <= end_ptr.sub(VECTOR_SIZE) { + debug_assert!(sub(end_ptr, ptr) >= VECTOR_SIZE); + + if let Some(i) = forward_search1(start_ptr, end_ptr, ptr, vn1) { + return Some(i); + } + ptr = ptr.add(VECTOR_SIZE); + } + if ptr < end_ptr { + debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE); + ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr)); + debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE); + + return forward_search1(start_ptr, end_ptr, ptr, vn1); + } + None +} + +#[target_feature(enable = "avx2")] +pub unsafe fn memchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> { + let vn1 = _mm256_set1_epi8(n1 as i8); + let vn2 = _mm256_set1_epi8(n2 as i8); + let len = haystack.len(); + let loop_size = cmp::min(LOOP_SIZE2, len); + let start_ptr = haystack.as_ptr(); + let end_ptr = haystack[haystack.len()..].as_ptr(); + let mut ptr = start_ptr; + + if haystack.len() < VECTOR_SIZE { + while ptr < end_ptr { + if *ptr == n1 || *ptr == n2 { + return Some(sub(ptr, start_ptr)); + } + ptr = ptr.offset(1); + } + return None; + } + + if let Some(i) = forward_search2(start_ptr, end_ptr, ptr, vn1, vn2) { + return Some(i); + } + + ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN)); + debug_assert!(ptr > start_ptr && end_ptr.sub(VECTOR_SIZE) >= start_ptr); + while loop_size == LOOP_SIZE2 && ptr <= end_ptr.sub(loop_size) { + debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE); + + let a = _mm256_load_si256(ptr as *const __m256i); + let b = _mm256_load_si256(ptr.add(VECTOR_SIZE) as *const __m256i); + let eqa1 = _mm256_cmpeq_epi8(vn1, a); + let eqb1 = _mm256_cmpeq_epi8(vn1, b); + let eqa2 = _mm256_cmpeq_epi8(vn2, a); + let eqb2 = _mm256_cmpeq_epi8(vn2, b); + let or1 = _mm256_or_si256(eqa1, eqb1); + let or2 = _mm256_or_si256(eqa2, eqb2); + let or3 = _mm256_or_si256(or1, or2); + if _mm256_movemask_epi8(or3) != 0 { + let mut at = sub(ptr, start_ptr); + let mask1 = _mm256_movemask_epi8(eqa1); + let mask2 = _mm256_movemask_epi8(eqa2); + if mask1 != 0 || mask2 != 0 { + return Some(at + forward_pos2(mask1, mask2)); + } + + at += VECTOR_SIZE; + let mask1 = _mm256_movemask_epi8(eqb1); + let mask2 = _mm256_movemask_epi8(eqb2); + return Some(at + forward_pos2(mask1, mask2)); + } + ptr = ptr.add(loop_size); + } + while ptr <= end_ptr.sub(VECTOR_SIZE) { + if let Some(i) = forward_search2(start_ptr, end_ptr, ptr, vn1, vn2) { + return Some(i); + } + ptr = ptr.add(VECTOR_SIZE); + } + if ptr < end_ptr { + debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE); + ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr)); + debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE); + + return forward_search2(start_ptr, end_ptr, ptr, vn1, vn2); + } + None +} + +#[target_feature(enable = "avx2")] +pub unsafe fn memchr3( + n1: u8, + n2: u8, + n3: u8, + haystack: &[u8], +) -> Option<usize> { + let vn1 = _mm256_set1_epi8(n1 as i8); + let vn2 = _mm256_set1_epi8(n2 as i8); + let vn3 = _mm256_set1_epi8(n3 as i8); + let len = haystack.len(); + let loop_size = cmp::min(LOOP_SIZE2, len); + let start_ptr = haystack.as_ptr(); + let end_ptr = haystack[haystack.len()..].as_ptr(); + let mut ptr = start_ptr; + + if haystack.len() < VECTOR_SIZE { + while ptr < end_ptr { + if *ptr == n1 || *ptr == n2 || *ptr == n3 { + return Some(sub(ptr, start_ptr)); + } + ptr = ptr.offset(1); + } + return None; + } + + if let Some(i) = forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) { + return Some(i); + } + + ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN)); + debug_assert!(ptr > start_ptr && end_ptr.sub(VECTOR_SIZE) >= start_ptr); + while loop_size == LOOP_SIZE2 && ptr <= end_ptr.sub(loop_size) { + debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE); + + let a = _mm256_load_si256(ptr as *const __m256i); + let b = _mm256_load_si256(ptr.add(VECTOR_SIZE) as *const __m256i); + let eqa1 = _mm256_cmpeq_epi8(vn1, a); + let eqb1 = _mm256_cmpeq_epi8(vn1, b); + let eqa2 = _mm256_cmpeq_epi8(vn2, a); + let eqb2 = _mm256_cmpeq_epi8(vn2, b); + let eqa3 = _mm256_cmpeq_epi8(vn3, a); + let eqb3 = _mm256_cmpeq_epi8(vn3, b); + let or1 = _mm256_or_si256(eqa1, eqb1); + let or2 = _mm256_or_si256(eqa2, eqb2); + let or3 = _mm256_or_si256(eqa3, eqb3); + let or4 = _mm256_or_si256(or1, or2); + let or5 = _mm256_or_si256(or3, or4); + if _mm256_movemask_epi8(or5) != 0 { + let mut at = sub(ptr, start_ptr); + let mask1 = _mm256_movemask_epi8(eqa1); + let mask2 = _mm256_movemask_epi8(eqa2); + let mask3 = _mm256_movemask_epi8(eqa3); + if mask1 != 0 || mask2 != 0 || mask3 != 0 { + return Some(at + forward_pos3(mask1, mask2, mask3)); + } + + at += VECTOR_SIZE; + let mask1 = _mm256_movemask_epi8(eqb1); + let mask2 = _mm256_movemask_epi8(eqb2); + let mask3 = _mm256_movemask_epi8(eqb3); + return Some(at + forward_pos3(mask1, mask2, mask3)); + } + ptr = ptr.add(loop_size); + } + while ptr <= end_ptr.sub(VECTOR_SIZE) { + if let Some(i) = + forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) + { + return Some(i); + } + ptr = ptr.add(VECTOR_SIZE); + } + if ptr < end_ptr { + debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE); + ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr)); + debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE); + + return forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3); + } + None +} + +#[target_feature(enable = "avx2")] +pub unsafe fn memrchr(n1: u8, haystack: &[u8]) -> Option<usize> { + let vn1 = _mm256_set1_epi8(n1 as i8); + let len = haystack.len(); + let loop_size = cmp::min(LOOP_SIZE, len); + let start_ptr = haystack.as_ptr(); + let end_ptr = haystack[haystack.len()..].as_ptr(); + let mut ptr = end_ptr; + + if haystack.len() < VECTOR_SIZE { + while ptr > start_ptr { + ptr = ptr.offset(-1); + if *ptr == n1 { + return Some(sub(ptr, start_ptr)); + } + } + return None; + } + + ptr = ptr.sub(VECTOR_SIZE); + if let Some(i) = reverse_search1(start_ptr, end_ptr, ptr, vn1) { + return Some(i); + } + + ptr = (end_ptr as usize & !VECTOR_ALIGN) as *const u8; + debug_assert!(start_ptr <= ptr && ptr <= end_ptr); + while loop_size == LOOP_SIZE && ptr >= start_ptr.add(loop_size) { + debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE); + + ptr = ptr.sub(loop_size); + let a = _mm256_load_si256(ptr as *const __m256i); + let b = _mm256_load_si256(ptr.add(VECTOR_SIZE) as *const __m256i); + let c = _mm256_load_si256(ptr.add(2 * VECTOR_SIZE) as *const __m256i); + let d = _mm256_load_si256(ptr.add(3 * VECTOR_SIZE) as *const __m256i); + let eqa = _mm256_cmpeq_epi8(vn1, a); + let eqb = _mm256_cmpeq_epi8(vn1, b); + let eqc = _mm256_cmpeq_epi8(vn1, c); + let eqd = _mm256_cmpeq_epi8(vn1, d); + let or1 = _mm256_or_si256(eqa, eqb); + let or2 = _mm256_or_si256(eqc, eqd); + let or3 = _mm256_or_si256(or1, or2); + if _mm256_movemask_epi8(or3) != 0 { + let mut at = sub(ptr.add(3 * VECTOR_SIZE), start_ptr); + let mask = _mm256_movemask_epi8(eqd); + if mask != 0 { + return Some(at + reverse_pos(mask)); + } + + at -= VECTOR_SIZE; + let mask = _mm256_movemask_epi8(eqc); + if mask != 0 { + return Some(at + reverse_pos(mask)); + } + + at -= VECTOR_SIZE; + let mask = _mm256_movemask_epi8(eqb); + if mask != 0 { + return Some(at + reverse_pos(mask)); + } + + at -= VECTOR_SIZE; + let mask = _mm256_movemask_epi8(eqa); + debug_assert!(mask != 0); + return Some(at + reverse_pos(mask)); + } + } + while ptr >= start_ptr.add(VECTOR_SIZE) { + ptr = ptr.sub(VECTOR_SIZE); + if let Some(i) = reverse_search1(start_ptr, end_ptr, ptr, vn1) { + return Some(i); + } + } + if ptr > start_ptr { + debug_assert!(sub(ptr, start_ptr) < VECTOR_SIZE); + return reverse_search1(start_ptr, end_ptr, start_ptr, vn1); + } + None +} + +#[target_feature(enable = "avx2")] +pub unsafe fn memrchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> { + let vn1 = _mm256_set1_epi8(n1 as i8); + let vn2 = _mm256_set1_epi8(n2 as i8); + let len = haystack.len(); + let loop_size = cmp::min(LOOP_SIZE2, len); + let start_ptr = haystack.as_ptr(); + let end_ptr = haystack[haystack.len()..].as_ptr(); + let mut ptr = end_ptr; + + if haystack.len() < VECTOR_SIZE { + while ptr > start_ptr { + ptr = ptr.offset(-1); + if *ptr == n1 || *ptr == n2 { + return Some(sub(ptr, start_ptr)); + } + } + return None; + } + + ptr = ptr.sub(VECTOR_SIZE); + if let Some(i) = reverse_search2(start_ptr, end_ptr, ptr, vn1, vn2) { + return Some(i); + } + + ptr = (end_ptr as usize & !VECTOR_ALIGN) as *const u8; + debug_assert!(start_ptr <= ptr && ptr <= end_ptr); + while loop_size == LOOP_SIZE2 && ptr >= start_ptr.add(loop_size) { + debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE); + + ptr = ptr.sub(loop_size); + let a = _mm256_load_si256(ptr as *const __m256i); + let b = _mm256_load_si256(ptr.add(VECTOR_SIZE) as *const __m256i); + let eqa1 = _mm256_cmpeq_epi8(vn1, a); + let eqb1 = _mm256_cmpeq_epi8(vn1, b); + let eqa2 = _mm256_cmpeq_epi8(vn2, a); + let eqb2 = _mm256_cmpeq_epi8(vn2, b); + let or1 = _mm256_or_si256(eqa1, eqb1); + let or2 = _mm256_or_si256(eqa2, eqb2); + let or3 = _mm256_or_si256(or1, or2); + if _mm256_movemask_epi8(or3) != 0 { + let mut at = sub(ptr.add(VECTOR_SIZE), start_ptr); + let mask1 = _mm256_movemask_epi8(eqb1); + let mask2 = _mm256_movemask_epi8(eqb2); + if mask1 != 0 || mask2 != 0 { + return Some(at + reverse_pos2(mask1, mask2)); + } + + at -= VECTOR_SIZE; + let mask1 = _mm256_movemask_epi8(eqa1); + let mask2 = _mm256_movemask_epi8(eqa2); + return Some(at + reverse_pos2(mask1, mask2)); + } + } + while ptr >= start_ptr.add(VECTOR_SIZE) { + ptr = ptr.sub(VECTOR_SIZE); + if let Some(i) = reverse_search2(start_ptr, end_ptr, ptr, vn1, vn2) { + return Some(i); + } + } + if ptr > start_ptr { + debug_assert!(sub(ptr, start_ptr) < VECTOR_SIZE); + return reverse_search2(start_ptr, end_ptr, start_ptr, vn1, vn2); + } + None +} + +#[target_feature(enable = "avx2")] +pub unsafe fn memrchr3( + n1: u8, + n2: u8, + n3: u8, + haystack: &[u8], +) -> Option<usize> { + let vn1 = _mm256_set1_epi8(n1 as i8); + let vn2 = _mm256_set1_epi8(n2 as i8); + let vn3 = _mm256_set1_epi8(n3 as i8); + let len = haystack.len(); + let loop_size = cmp::min(LOOP_SIZE2, len); + let start_ptr = haystack.as_ptr(); + let end_ptr = haystack[haystack.len()..].as_ptr(); + let mut ptr = end_ptr; + + if haystack.len() < VECTOR_SIZE { + while ptr > start_ptr { + ptr = ptr.offset(-1); + if *ptr == n1 || *ptr == n2 || *ptr == n3 { + return Some(sub(ptr, start_ptr)); + } + } + return None; + } + + ptr = ptr.sub(VECTOR_SIZE); + if let Some(i) = reverse_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) { + return Some(i); + } + + ptr = (end_ptr as usize & !VECTOR_ALIGN) as *const u8; + debug_assert!(start_ptr <= ptr && ptr <= end_ptr); + while loop_size == LOOP_SIZE2 && ptr >= start_ptr.add(loop_size) { + debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE); + + ptr = ptr.sub(loop_size); + let a = _mm256_load_si256(ptr as *const __m256i); + let b = _mm256_load_si256(ptr.add(VECTOR_SIZE) as *const __m256i); + let eqa1 = _mm256_cmpeq_epi8(vn1, a); + let eqb1 = _mm256_cmpeq_epi8(vn1, b); + let eqa2 = _mm256_cmpeq_epi8(vn2, a); + let eqb2 = _mm256_cmpeq_epi8(vn2, b); + let eqa3 = _mm256_cmpeq_epi8(vn3, a); + let eqb3 = _mm256_cmpeq_epi8(vn3, b); + let or1 = _mm256_or_si256(eqa1, eqb1); + let or2 = _mm256_or_si256(eqa2, eqb2); + let or3 = _mm256_or_si256(eqa3, eqb3); + let or4 = _mm256_or_si256(or1, or2); + let or5 = _mm256_or_si256(or3, or4); + if _mm256_movemask_epi8(or5) != 0 { + let mut at = sub(ptr.add(VECTOR_SIZE), start_ptr); + let mask1 = _mm256_movemask_epi8(eqb1); + let mask2 = _mm256_movemask_epi8(eqb2); + let mask3 = _mm256_movemask_epi8(eqb3); + if mask1 != 0 || mask2 != 0 || mask3 != 0 { + return Some(at + reverse_pos3(mask1, mask2, mask3)); + } + + at -= VECTOR_SIZE; + let mask1 = _mm256_movemask_epi8(eqa1); + let mask2 = _mm256_movemask_epi8(eqa2); + let mask3 = _mm256_movemask_epi8(eqa3); + return Some(at + reverse_pos3(mask1, mask2, mask3)); + } + } + while ptr >= start_ptr.add(VECTOR_SIZE) { + ptr = ptr.sub(VECTOR_SIZE); + if let Some(i) = + reverse_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) + { + return Some(i); + } + } + if ptr > start_ptr { + debug_assert!(sub(ptr, start_ptr) < VECTOR_SIZE); + return reverse_search3(start_ptr, end_ptr, start_ptr, vn1, vn2, vn3); + } + None +} + +#[target_feature(enable = "avx2")] +unsafe fn forward_search1( + start_ptr: *const u8, + end_ptr: *const u8, + ptr: *const u8, + vn1: __m256i, +) -> Option<usize> { + debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE); + debug_assert!(start_ptr <= ptr); + debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE)); + + let chunk = _mm256_loadu_si256(ptr as *const __m256i); + let mask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(chunk, vn1)); + if mask != 0 { + Some(sub(ptr, start_ptr) + forward_pos(mask)) + } else { + None + } +} + +#[target_feature(enable = "avx2")] +unsafe fn forward_search2( + start_ptr: *const u8, + end_ptr: *const u8, + ptr: *const u8, + vn1: __m256i, + vn2: __m256i, +) -> Option<usize> { + debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE); + debug_assert!(start_ptr <= ptr); + debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE)); + + let chunk = _mm256_loadu_si256(ptr as *const __m256i); + let eq1 = _mm256_cmpeq_epi8(chunk, vn1); + let eq2 = _mm256_cmpeq_epi8(chunk, vn2); + if _mm256_movemask_epi8(_mm256_or_si256(eq1, eq2)) != 0 { + let mask1 = _mm256_movemask_epi8(eq1); + let mask2 = _mm256_movemask_epi8(eq2); + Some(sub(ptr, start_ptr) + forward_pos2(mask1, mask2)) + } else { + None + } +} + +#[target_feature(enable = "avx2")] +unsafe fn forward_search3( + start_ptr: *const u8, + end_ptr: *const u8, + ptr: *const u8, + vn1: __m256i, + vn2: __m256i, + vn3: __m256i, +) -> Option<usize> { + debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE); + debug_assert!(start_ptr <= ptr); + debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE)); + + let chunk = _mm256_loadu_si256(ptr as *const __m256i); + let eq1 = _mm256_cmpeq_epi8(chunk, vn1); + let eq2 = _mm256_cmpeq_epi8(chunk, vn2); + let eq3 = _mm256_cmpeq_epi8(chunk, vn3); + let or = _mm256_or_si256(eq1, eq2); + if _mm256_movemask_epi8(_mm256_or_si256(or, eq3)) != 0 { + let mask1 = _mm256_movemask_epi8(eq1); + let mask2 = _mm256_movemask_epi8(eq2); + let mask3 = _mm256_movemask_epi8(eq3); + Some(sub(ptr, start_ptr) + forward_pos3(mask1, mask2, mask3)) + } else { + None + } +} + +#[target_feature(enable = "avx2")] +unsafe fn reverse_search1( + start_ptr: *const u8, + end_ptr: *const u8, + ptr: *const u8, + vn1: __m256i, +) -> Option<usize> { + debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE); + debug_assert!(start_ptr <= ptr); + debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE)); + + let chunk = _mm256_loadu_si256(ptr as *const __m256i); + let mask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(vn1, chunk)); + if mask != 0 { + Some(sub(ptr, start_ptr) + reverse_pos(mask)) + } else { + None + } +} + +#[target_feature(enable = "avx2")] +unsafe fn reverse_search2( + start_ptr: *const u8, + end_ptr: *const u8, + ptr: *const u8, + vn1: __m256i, + vn2: __m256i, +) -> Option<usize> { + debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE); + debug_assert!(start_ptr <= ptr); + debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE)); + + let chunk = _mm256_loadu_si256(ptr as *const __m256i); + let eq1 = _mm256_cmpeq_epi8(chunk, vn1); + let eq2 = _mm256_cmpeq_epi8(chunk, vn2); + if _mm256_movemask_epi8(_mm256_or_si256(eq1, eq2)) != 0 { + let mask1 = _mm256_movemask_epi8(eq1); + let mask2 = _mm256_movemask_epi8(eq2); + Some(sub(ptr, start_ptr) + reverse_pos2(mask1, mask2)) + } else { + None + } +} + +#[target_feature(enable = "avx2")] +unsafe fn reverse_search3( + start_ptr: *const u8, + end_ptr: *const u8, + ptr: *const u8, + vn1: __m256i, + vn2: __m256i, + vn3: __m256i, +) -> Option<usize> { + debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE); + debug_assert!(start_ptr <= ptr); + debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE)); + + let chunk = _mm256_loadu_si256(ptr as *const __m256i); + let eq1 = _mm256_cmpeq_epi8(chunk, vn1); + let eq2 = _mm256_cmpeq_epi8(chunk, vn2); + let eq3 = _mm256_cmpeq_epi8(chunk, vn3); + let or = _mm256_or_si256(eq1, eq2); + if _mm256_movemask_epi8(_mm256_or_si256(or, eq3)) != 0 { + let mask1 = _mm256_movemask_epi8(eq1); + let mask2 = _mm256_movemask_epi8(eq2); + let mask3 = _mm256_movemask_epi8(eq3); + Some(sub(ptr, start_ptr) + reverse_pos3(mask1, mask2, mask3)) + } else { + None + } +} + +/// Compute the position of the first matching byte from the given mask. The +/// position returned is always in the range [0, 31]. +/// +/// The mask given is expected to be the result of _mm256_movemask_epi8. +fn forward_pos(mask: i32) -> usize { + // We are dealing with little endian here, where the most significant byte + // is at a higher address. That means the least significant bit that is set + // corresponds to the position of our first matching byte. That position + // corresponds to the number of zeros after the least significant bit. + mask.trailing_zeros() as usize +} + +/// Compute the position of the first matching byte from the given masks. The +/// position returned is always in the range [0, 31]. Each mask corresponds to +/// the equality comparison of a single byte. +/// +/// The masks given are expected to be the result of _mm256_movemask_epi8, +/// where at least one of the masks is non-zero (i.e., indicates a match). +fn forward_pos2(mask1: i32, mask2: i32) -> usize { + debug_assert!(mask1 != 0 || mask2 != 0); + + forward_pos(mask1 | mask2) +} + +/// Compute the position of the first matching byte from the given masks. The +/// position returned is always in the range [0, 31]. Each mask corresponds to +/// the equality comparison of a single byte. +/// +/// The masks given are expected to be the result of _mm256_movemask_epi8, +/// where at least one of the masks is non-zero (i.e., indicates a match). +fn forward_pos3(mask1: i32, mask2: i32, mask3: i32) -> usize { + debug_assert!(mask1 != 0 || mask2 != 0 || mask3 != 0); + + forward_pos(mask1 | mask2 | mask3) +} + +/// Compute the position of the last matching byte from the given mask. The +/// position returned is always in the range [0, 31]. +/// +/// The mask given is expected to be the result of _mm256_movemask_epi8. +fn reverse_pos(mask: i32) -> usize { + // We are dealing with little endian here, where the most significant byte + // is at a higher address. That means the most significant bit that is set + // corresponds to the position of our last matching byte. The position from + // the end of the mask is therefore the number of leading zeros in a 32 + // bit integer, and the position from the start of the mask is therefore + // 32 - (leading zeros) - 1. + VECTOR_SIZE - (mask as u32).leading_zeros() as usize - 1 +} + +/// Compute the position of the last matching byte from the given masks. The +/// position returned is always in the range [0, 31]. Each mask corresponds to +/// the equality comparison of a single byte. +/// +/// The masks given are expected to be the result of _mm256_movemask_epi8, +/// where at least one of the masks is non-zero (i.e., indicates a match). +fn reverse_pos2(mask1: i32, mask2: i32) -> usize { + debug_assert!(mask1 != 0 || mask2 != 0); + + reverse_pos(mask1 | mask2) +} + +/// Compute the position of the last matching byte from the given masks. The +/// position returned is always in the range [0, 31]. Each mask corresponds to +/// the equality comparison of a single byte. +/// +/// The masks given are expected to be the result of _mm256_movemask_epi8, +/// where at least one of the masks is non-zero (i.e., indicates a match). +fn reverse_pos3(mask1: i32, mask2: i32, mask3: i32) -> usize { + debug_assert!(mask1 != 0 || mask2 != 0 || mask3 != 0); + + reverse_pos(mask1 | mask2 | mask3) +} + +/// Subtract `b` from `a` and return the difference. `a` should be greater than +/// or equal to `b`. +fn sub(a: *const u8, b: *const u8) -> usize { + debug_assert!(a >= b); + (a as usize) - (b as usize) +} diff --git a/third_party/rust/memchr/src/x86/mod.rs b/third_party/rust/memchr/src/x86/mod.rs new file mode 100644 index 0000000000..855dc8b755 --- /dev/null +++ b/third_party/rust/memchr/src/x86/mod.rs @@ -0,0 +1,119 @@ +use fallback; + +// We only use AVX when we can detect at runtime whether it's available, which +// requires std. +#[cfg(feature = "std")] +mod avx; +mod sse2; + +// This macro employs a gcc-like "ifunc" trick where by upon first calling +// `memchr` (for example), CPU feature detection will be performed at runtime +// to determine the best implementation to use. After CPU feature detection +// is done, we replace `memchr`'s function pointer with the selection. Upon +// subsequent invocations, the CPU-specific routine is invoked directly, which +// skips the CPU feature detection and subsequent branch that's required. +// +// While this typically doesn't matter for rare occurrences or when used on +// larger haystacks, `memchr` can be called in tight loops where the overhead +// of this branch can actually add up *and is measurable*. This trick was +// necessary to bring this implementation up to glibc's speeds for the 'tiny' +// benchmarks, for example. +// +// At some point, I expect the Rust ecosystem will get a nice macro for doing +// exactly this, at which point, we can replace our hand-jammed version of it. +// +// N.B. The ifunc strategy does prevent function inlining of course, but on +// modern CPUs, you'll probably end up with the AVX2 implementation, which +// probably can't be inlined anyway---unless you've compiled your entire +// program with AVX2 enabled. However, even then, the various memchr +// implementations aren't exactly small, so inlining might not help anyway! +#[cfg(feature = "std")] +macro_rules! ifunc { + ($fnty:ty, $name:ident, $haystack:ident, $($needle:ident),+) => {{ + use std::mem; + use std::sync::atomic::{AtomicPtr, Ordering}; + + type FnRaw = *mut (); + + static FN: AtomicPtr<()> = AtomicPtr::new(detect as FnRaw); + + fn detect($($needle: u8),+, haystack: &[u8]) -> Option<usize> { + let fun = + if cfg!(memchr_runtime_avx) && is_x86_feature_detected!("avx2") { + avx::$name as FnRaw + } else if cfg!(memchr_runtime_sse2) { + sse2::$name as FnRaw + } else { + fallback::$name as FnRaw + }; + FN.store(fun as FnRaw, Ordering::Relaxed); + unsafe { + mem::transmute::<FnRaw, $fnty>(fun)($($needle),+, haystack) + } + } + + unsafe { + let fun = FN.load(Ordering::Relaxed); + mem::transmute::<FnRaw, $fnty>(fun)($($needle),+, $haystack) + } + }} +} + +// When std isn't available to provide runtime CPU feature detection, or if +// runtime CPU feature detection has been explicitly disabled, then just call +// our optimized SSE2 routine directly. SSE2 is avalbale on all x86_64 targets, +// so no CPU feature detection is necessary. +#[cfg(not(feature = "std"))] +macro_rules! ifunc { + ($fnty:ty, $name:ident, $haystack:ident, $($needle:ident),+) => {{ + if cfg!(memchr_runtime_sse2) { + unsafe { sse2::$name($($needle),+, $haystack) } + } else { + fallback::$name($($needle),+, $haystack) + } + }} +} + +#[inline(always)] +pub fn memchr(n1: u8, haystack: &[u8]) -> Option<usize> { + ifunc!(fn(u8, &[u8]) -> Option<usize>, memchr, haystack, n1) +} + +#[inline(always)] +pub fn memchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> { + ifunc!(fn(u8, u8, &[u8]) -> Option<usize>, memchr2, haystack, n1, n2) +} + +#[inline(always)] +pub fn memchr3(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option<usize> { + ifunc!( + fn(u8, u8, u8, &[u8]) -> Option<usize>, + memchr3, + haystack, + n1, + n2, + n3 + ) +} + +#[inline(always)] +pub fn memrchr(n1: u8, haystack: &[u8]) -> Option<usize> { + ifunc!(fn(u8, &[u8]) -> Option<usize>, memrchr, haystack, n1) +} + +#[inline(always)] +pub fn memrchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> { + ifunc!(fn(u8, u8, &[u8]) -> Option<usize>, memrchr2, haystack, n1, n2) +} + +#[inline(always)] +pub fn memrchr3(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option<usize> { + ifunc!( + fn(u8, u8, u8, &[u8]) -> Option<usize>, + memrchr3, + haystack, + n1, + n2, + n3 + ) +} diff --git a/third_party/rust/memchr/src/x86/sse2.rs b/third_party/rust/memchr/src/x86/sse2.rs new file mode 100644 index 0000000000..76f5a78c34 --- /dev/null +++ b/third_party/rust/memchr/src/x86/sse2.rs @@ -0,0 +1,793 @@ +use core::arch::x86_64::*; +use core::cmp; +use core::mem::size_of; + +const VECTOR_SIZE: usize = size_of::<__m128i>(); +const VECTOR_ALIGN: usize = VECTOR_SIZE - 1; + +// The number of bytes to loop at in one iteration of memchr/memrchr. +const LOOP_SIZE: usize = 4 * VECTOR_SIZE; + +// The number of bytes to loop at in one iteration of memchr2/memrchr2 and +// memchr3/memrchr3. There was no observable difference between 64 and 32 bytes +// in benchmarks. memchr3 in particular only gets a very slight speed up from +// the loop unrolling. +const LOOP_SIZE2: usize = 2 * VECTOR_SIZE; + +#[target_feature(enable = "sse2")] +pub unsafe fn memchr(n1: u8, haystack: &[u8]) -> Option<usize> { + // What follows is a fast SSE2-only algorithm to detect the position of + // `n1` in `haystack` if it exists. From what I know, this is the "classic" + // algorithm. I believe it can be found in places like glibc and Go's + // standard library. It appears to be well known and is elaborated on in + // more detail here: https://gms.tf/stdfind-and-memchr-optimizations.html + // + // While this routine is very long, the basic idea is actually very simple + // and can be expressed straight-forwardly in pseudo code: + // + // needle = (n1 << 15) | (n1 << 14) | ... | (n1 << 1) | n1 + // // Note: shift amount in bytes + // + // while i <= haystack.len() - 16: + // // A 16 byte vector. Each byte in chunk corresponds to a byte in + // // the haystack. + // chunk = haystack[i:i+16] + // // Compare bytes in needle with bytes in chunk. The result is a 16 + // // byte chunk where each byte is 0xFF if the corresponding bytes + // // in needle and chunk were equal, or 0x00 otherwise. + // eqs = cmpeq(needle, chunk) + // // Return a 32 bit integer where the most significant 16 bits + // // are always 0 and the lower 16 bits correspond to whether the + // // most significant bit in the correspond byte in `eqs` is set. + // // In other words, `mask as u16` has bit i set if and only if + // // needle[i] == chunk[i]. + // mask = movemask(eqs) + // + // // Mask is 0 if there is no match, and non-zero otherwise. + // if mask != 0: + // // trailing_zeros tells us the position of the least significant + // // bit that is set. + // return i + trailing_zeros(mask) + // + // // haystack length may not be a multiple of 16, so search the rest. + // while i < haystack.len(): + // if haystack[i] == n1: + // return i + // + // // No match found. + // return NULL + // + // In fact, we could loosely translate the above code to Rust line-for-line + // and it would be a pretty fast algorithm. But, we pull out all the stops + // to go as fast as possible: + // + // 1. We use aligned loads. That is, we do some finagling to make sure our + // primary loop not only proceeds in increments of 16 bytes, but that + // the address of haystack's pointer that we dereference is aligned to + // 16 bytes. 16 is a magic number here because it is the size of SSE2 + // 128-bit vector. (For the AVX2 algorithm, 32 is the magic number.) + // Therefore, to get aligned loads, our pointer's address must be evenly + // divisible by 16. + // 2. Our primary loop proceeds 64 bytes at a time instead of 16. It's + // kind of like loop unrolling, but we combine the equality comparisons + // using a vector OR such that we only need to extract a single mask to + // determine whether a match exists or not. If so, then we do some + // book-keeping to determine the precise location but otherwise mush on. + // 3. We use our "chunk" comparison routine in as many places as possible, + // even if it means using unaligned loads. In particular, if haystack + // starts with an unaligned address, then we do an unaligned load to + // search the first 16 bytes. We then start our primary loop at the + // smallest subsequent aligned address, which will actually overlap with + // previously searched bytes. But we're OK with that. We do a similar + // dance at the end of our primary loop. Finally, to avoid a + // byte-at-a-time loop at the end, we do a final 16 byte unaligned load + // that may overlap with a previous load. This is OK because it converts + // a loop into a small number of very fast vector instructions. + // + // The primary downside of this algorithm is that it's effectively + // completely unsafe. Therefore, we have to be super careful to avoid + // undefined behavior: + // + // 1. We use raw pointers everywhere. Not only does dereferencing a pointer + // require the pointer to be valid, but we actually can't even store the + // address of an invalid pointer (unless it's 1 past the end of + // haystack) without sacrificing performance. + // 2. _mm_loadu_si128 is used when you don't care about alignment, and + // _mm_load_si128 is used when you do care. You cannot use the latter + // on unaligned pointers. + // 3. We make liberal use of debug_assert! to check assumptions. + // 4. We make a concerted effort to stick with pointers instead of indices. + // Indices are nicer because there's less to worry about with them (see + // above about pointer offsets), but I could not get the compiler to + // produce as good of code as what the below produces. In any case, + // pointers are what we really care about here, and alignment is + // expressed a bit more naturally with them. + // + // In general, most of the algorithms in this crate have a similar + // structure to what you see below, so this comment applies fairly well to + // all of them. + + let vn1 = _mm_set1_epi8(n1 as i8); + let len = haystack.len(); + let loop_size = cmp::min(LOOP_SIZE, len); + let start_ptr = haystack.as_ptr(); + let end_ptr = haystack[haystack.len()..].as_ptr(); + let mut ptr = start_ptr; + + if haystack.len() < VECTOR_SIZE { + while ptr < end_ptr { + if *ptr == n1 { + return Some(sub(ptr, start_ptr)); + } + ptr = ptr.offset(1); + } + return None; + } + + if let Some(i) = forward_search1(start_ptr, end_ptr, ptr, vn1) { + return Some(i); + } + + ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN)); + debug_assert!(ptr > start_ptr && end_ptr.sub(VECTOR_SIZE) >= start_ptr); + while loop_size == LOOP_SIZE && ptr <= end_ptr.sub(loop_size) { + debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE); + + let a = _mm_load_si128(ptr as *const __m128i); + let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i); + let c = _mm_load_si128(ptr.add(2 * VECTOR_SIZE) as *const __m128i); + let d = _mm_load_si128(ptr.add(3 * VECTOR_SIZE) as *const __m128i); + let eqa = _mm_cmpeq_epi8(vn1, a); + let eqb = _mm_cmpeq_epi8(vn1, b); + let eqc = _mm_cmpeq_epi8(vn1, c); + let eqd = _mm_cmpeq_epi8(vn1, d); + let or1 = _mm_or_si128(eqa, eqb); + let or2 = _mm_or_si128(eqc, eqd); + let or3 = _mm_or_si128(or1, or2); + if _mm_movemask_epi8(or3) != 0 { + let mut at = sub(ptr, start_ptr); + let mask = _mm_movemask_epi8(eqa); + if mask != 0 { + return Some(at + forward_pos(mask)); + } + + at += VECTOR_SIZE; + let mask = _mm_movemask_epi8(eqb); + if mask != 0 { + return Some(at + forward_pos(mask)); + } + + at += VECTOR_SIZE; + let mask = _mm_movemask_epi8(eqc); + if mask != 0 { + return Some(at + forward_pos(mask)); + } + + at += VECTOR_SIZE; + let mask = _mm_movemask_epi8(eqd); + debug_assert!(mask != 0); + return Some(at + forward_pos(mask)); + } + ptr = ptr.add(loop_size); + } + while ptr <= end_ptr.sub(VECTOR_SIZE) { + debug_assert!(sub(end_ptr, ptr) >= VECTOR_SIZE); + + if let Some(i) = forward_search1(start_ptr, end_ptr, ptr, vn1) { + return Some(i); + } + ptr = ptr.add(VECTOR_SIZE); + } + if ptr < end_ptr { + debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE); + ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr)); + debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE); + + return forward_search1(start_ptr, end_ptr, ptr, vn1); + } + None +} + +#[target_feature(enable = "sse2")] +pub unsafe fn memchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> { + let vn1 = _mm_set1_epi8(n1 as i8); + let vn2 = _mm_set1_epi8(n2 as i8); + let len = haystack.len(); + let loop_size = cmp::min(LOOP_SIZE2, len); + let start_ptr = haystack.as_ptr(); + let end_ptr = haystack[haystack.len()..].as_ptr(); + let mut ptr = start_ptr; + + if haystack.len() < VECTOR_SIZE { + while ptr < end_ptr { + if *ptr == n1 || *ptr == n2 { + return Some(sub(ptr, start_ptr)); + } + ptr = ptr.offset(1); + } + return None; + } + + if let Some(i) = forward_search2(start_ptr, end_ptr, ptr, vn1, vn2) { + return Some(i); + } + + ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN)); + debug_assert!(ptr > start_ptr && end_ptr.sub(VECTOR_SIZE) >= start_ptr); + while loop_size == LOOP_SIZE2 && ptr <= end_ptr.sub(loop_size) { + debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE); + + let a = _mm_load_si128(ptr as *const __m128i); + let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i); + let eqa1 = _mm_cmpeq_epi8(vn1, a); + let eqb1 = _mm_cmpeq_epi8(vn1, b); + let eqa2 = _mm_cmpeq_epi8(vn2, a); + let eqb2 = _mm_cmpeq_epi8(vn2, b); + let or1 = _mm_or_si128(eqa1, eqb1); + let or2 = _mm_or_si128(eqa2, eqb2); + let or3 = _mm_or_si128(or1, or2); + if _mm_movemask_epi8(or3) != 0 { + let mut at = sub(ptr, start_ptr); + let mask1 = _mm_movemask_epi8(eqa1); + let mask2 = _mm_movemask_epi8(eqa2); + if mask1 != 0 || mask2 != 0 { + return Some(at + forward_pos2(mask1, mask2)); + } + + at += VECTOR_SIZE; + let mask1 = _mm_movemask_epi8(eqb1); + let mask2 = _mm_movemask_epi8(eqb2); + return Some(at + forward_pos2(mask1, mask2)); + } + ptr = ptr.add(loop_size); + } + while ptr <= end_ptr.sub(VECTOR_SIZE) { + if let Some(i) = forward_search2(start_ptr, end_ptr, ptr, vn1, vn2) { + return Some(i); + } + ptr = ptr.add(VECTOR_SIZE); + } + if ptr < end_ptr { + debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE); + ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr)); + debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE); + + return forward_search2(start_ptr, end_ptr, ptr, vn1, vn2); + } + None +} + +#[target_feature(enable = "sse2")] +pub unsafe fn memchr3( + n1: u8, + n2: u8, + n3: u8, + haystack: &[u8], +) -> Option<usize> { + let vn1 = _mm_set1_epi8(n1 as i8); + let vn2 = _mm_set1_epi8(n2 as i8); + let vn3 = _mm_set1_epi8(n3 as i8); + let len = haystack.len(); + let loop_size = cmp::min(LOOP_SIZE2, len); + let start_ptr = haystack.as_ptr(); + let end_ptr = haystack[haystack.len()..].as_ptr(); + let mut ptr = start_ptr; + + if haystack.len() < VECTOR_SIZE { + while ptr < end_ptr { + if *ptr == n1 || *ptr == n2 || *ptr == n3 { + return Some(sub(ptr, start_ptr)); + } + ptr = ptr.offset(1); + } + return None; + } + + if let Some(i) = forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) { + return Some(i); + } + + ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN)); + debug_assert!(ptr > start_ptr && end_ptr.sub(VECTOR_SIZE) >= start_ptr); + while loop_size == LOOP_SIZE2 && ptr <= end_ptr.sub(loop_size) { + debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE); + + let a = _mm_load_si128(ptr as *const __m128i); + let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i); + let eqa1 = _mm_cmpeq_epi8(vn1, a); + let eqb1 = _mm_cmpeq_epi8(vn1, b); + let eqa2 = _mm_cmpeq_epi8(vn2, a); + let eqb2 = _mm_cmpeq_epi8(vn2, b); + let eqa3 = _mm_cmpeq_epi8(vn3, a); + let eqb3 = _mm_cmpeq_epi8(vn3, b); + let or1 = _mm_or_si128(eqa1, eqb1); + let or2 = _mm_or_si128(eqa2, eqb2); + let or3 = _mm_or_si128(eqa3, eqb3); + let or4 = _mm_or_si128(or1, or2); + let or5 = _mm_or_si128(or3, or4); + if _mm_movemask_epi8(or5) != 0 { + let mut at = sub(ptr, start_ptr); + let mask1 = _mm_movemask_epi8(eqa1); + let mask2 = _mm_movemask_epi8(eqa2); + let mask3 = _mm_movemask_epi8(eqa3); + if mask1 != 0 || mask2 != 0 || mask3 != 0 { + return Some(at + forward_pos3(mask1, mask2, mask3)); + } + + at += VECTOR_SIZE; + let mask1 = _mm_movemask_epi8(eqb1); + let mask2 = _mm_movemask_epi8(eqb2); + let mask3 = _mm_movemask_epi8(eqb3); + return Some(at + forward_pos3(mask1, mask2, mask3)); + } + ptr = ptr.add(loop_size); + } + while ptr <= end_ptr.sub(VECTOR_SIZE) { + if let Some(i) = + forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) + { + return Some(i); + } + ptr = ptr.add(VECTOR_SIZE); + } + if ptr < end_ptr { + debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE); + ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr)); + debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE); + + return forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3); + } + None +} + +#[target_feature(enable = "sse2")] +pub unsafe fn memrchr(n1: u8, haystack: &[u8]) -> Option<usize> { + let vn1 = _mm_set1_epi8(n1 as i8); + let len = haystack.len(); + let loop_size = cmp::min(LOOP_SIZE, len); + let start_ptr = haystack.as_ptr(); + let end_ptr = haystack[haystack.len()..].as_ptr(); + let mut ptr = end_ptr; + + if haystack.len() < VECTOR_SIZE { + while ptr > start_ptr { + ptr = ptr.offset(-1); + if *ptr == n1 { + return Some(sub(ptr, start_ptr)); + } + } + return None; + } + + ptr = ptr.sub(VECTOR_SIZE); + if let Some(i) = reverse_search1(start_ptr, end_ptr, ptr, vn1) { + return Some(i); + } + + ptr = (end_ptr as usize & !VECTOR_ALIGN) as *const u8; + debug_assert!(start_ptr <= ptr && ptr <= end_ptr); + while loop_size == LOOP_SIZE && ptr >= start_ptr.add(loop_size) { + debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE); + + ptr = ptr.sub(loop_size); + let a = _mm_load_si128(ptr as *const __m128i); + let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i); + let c = _mm_load_si128(ptr.add(2 * VECTOR_SIZE) as *const __m128i); + let d = _mm_load_si128(ptr.add(3 * VECTOR_SIZE) as *const __m128i); + let eqa = _mm_cmpeq_epi8(vn1, a); + let eqb = _mm_cmpeq_epi8(vn1, b); + let eqc = _mm_cmpeq_epi8(vn1, c); + let eqd = _mm_cmpeq_epi8(vn1, d); + let or1 = _mm_or_si128(eqa, eqb); + let or2 = _mm_or_si128(eqc, eqd); + let or3 = _mm_or_si128(or1, or2); + if _mm_movemask_epi8(or3) != 0 { + let mut at = sub(ptr.add(3 * VECTOR_SIZE), start_ptr); + let mask = _mm_movemask_epi8(eqd); + if mask != 0 { + return Some(at + reverse_pos(mask)); + } + + at -= VECTOR_SIZE; + let mask = _mm_movemask_epi8(eqc); + if mask != 0 { + return Some(at + reverse_pos(mask)); + } + + at -= VECTOR_SIZE; + let mask = _mm_movemask_epi8(eqb); + if mask != 0 { + return Some(at + reverse_pos(mask)); + } + + at -= VECTOR_SIZE; + let mask = _mm_movemask_epi8(eqa); + debug_assert!(mask != 0); + return Some(at + reverse_pos(mask)); + } + } + while ptr >= start_ptr.add(VECTOR_SIZE) { + ptr = ptr.sub(VECTOR_SIZE); + if let Some(i) = reverse_search1(start_ptr, end_ptr, ptr, vn1) { + return Some(i); + } + } + if ptr > start_ptr { + debug_assert!(sub(ptr, start_ptr) < VECTOR_SIZE); + return reverse_search1(start_ptr, end_ptr, start_ptr, vn1); + } + None +} + +#[target_feature(enable = "sse2")] +pub unsafe fn memrchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> { + let vn1 = _mm_set1_epi8(n1 as i8); + let vn2 = _mm_set1_epi8(n2 as i8); + let len = haystack.len(); + let loop_size = cmp::min(LOOP_SIZE2, len); + let start_ptr = haystack.as_ptr(); + let end_ptr = haystack[haystack.len()..].as_ptr(); + let mut ptr = end_ptr; + + if haystack.len() < VECTOR_SIZE { + while ptr > start_ptr { + ptr = ptr.offset(-1); + if *ptr == n1 || *ptr == n2 { + return Some(sub(ptr, start_ptr)); + } + } + return None; + } + + ptr = ptr.sub(VECTOR_SIZE); + if let Some(i) = reverse_search2(start_ptr, end_ptr, ptr, vn1, vn2) { + return Some(i); + } + + ptr = (end_ptr as usize & !VECTOR_ALIGN) as *const u8; + debug_assert!(start_ptr <= ptr && ptr <= end_ptr); + while loop_size == LOOP_SIZE2 && ptr >= start_ptr.add(loop_size) { + debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE); + + ptr = ptr.sub(loop_size); + let a = _mm_load_si128(ptr as *const __m128i); + let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i); + let eqa1 = _mm_cmpeq_epi8(vn1, a); + let eqb1 = _mm_cmpeq_epi8(vn1, b); + let eqa2 = _mm_cmpeq_epi8(vn2, a); + let eqb2 = _mm_cmpeq_epi8(vn2, b); + let or1 = _mm_or_si128(eqa1, eqb1); + let or2 = _mm_or_si128(eqa2, eqb2); + let or3 = _mm_or_si128(or1, or2); + if _mm_movemask_epi8(or3) != 0 { + let mut at = sub(ptr.add(VECTOR_SIZE), start_ptr); + let mask1 = _mm_movemask_epi8(eqb1); + let mask2 = _mm_movemask_epi8(eqb2); + if mask1 != 0 || mask2 != 0 { + return Some(at + reverse_pos2(mask1, mask2)); + } + + at -= VECTOR_SIZE; + let mask1 = _mm_movemask_epi8(eqa1); + let mask2 = _mm_movemask_epi8(eqa2); + return Some(at + reverse_pos2(mask1, mask2)); + } + } + while ptr >= start_ptr.add(VECTOR_SIZE) { + ptr = ptr.sub(VECTOR_SIZE); + if let Some(i) = reverse_search2(start_ptr, end_ptr, ptr, vn1, vn2) { + return Some(i); + } + } + if ptr > start_ptr { + debug_assert!(sub(ptr, start_ptr) < VECTOR_SIZE); + return reverse_search2(start_ptr, end_ptr, start_ptr, vn1, vn2); + } + None +} + +#[target_feature(enable = "sse2")] +pub unsafe fn memrchr3( + n1: u8, + n2: u8, + n3: u8, + haystack: &[u8], +) -> Option<usize> { + let vn1 = _mm_set1_epi8(n1 as i8); + let vn2 = _mm_set1_epi8(n2 as i8); + let vn3 = _mm_set1_epi8(n3 as i8); + let len = haystack.len(); + let loop_size = cmp::min(LOOP_SIZE2, len); + let start_ptr = haystack.as_ptr(); + let end_ptr = haystack[haystack.len()..].as_ptr(); + let mut ptr = end_ptr; + + if haystack.len() < VECTOR_SIZE { + while ptr > start_ptr { + ptr = ptr.offset(-1); + if *ptr == n1 || *ptr == n2 || *ptr == n3 { + return Some(sub(ptr, start_ptr)); + } + } + return None; + } + + ptr = ptr.sub(VECTOR_SIZE); + if let Some(i) = reverse_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) { + return Some(i); + } + + ptr = (end_ptr as usize & !VECTOR_ALIGN) as *const u8; + debug_assert!(start_ptr <= ptr && ptr <= end_ptr); + while loop_size == LOOP_SIZE2 && ptr >= start_ptr.add(loop_size) { + debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE); + + ptr = ptr.sub(loop_size); + let a = _mm_load_si128(ptr as *const __m128i); + let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i); + let eqa1 = _mm_cmpeq_epi8(vn1, a); + let eqb1 = _mm_cmpeq_epi8(vn1, b); + let eqa2 = _mm_cmpeq_epi8(vn2, a); + let eqb2 = _mm_cmpeq_epi8(vn2, b); + let eqa3 = _mm_cmpeq_epi8(vn3, a); + let eqb3 = _mm_cmpeq_epi8(vn3, b); + let or1 = _mm_or_si128(eqa1, eqb1); + let or2 = _mm_or_si128(eqa2, eqb2); + let or3 = _mm_or_si128(eqa3, eqb3); + let or4 = _mm_or_si128(or1, or2); + let or5 = _mm_or_si128(or3, or4); + if _mm_movemask_epi8(or5) != 0 { + let mut at = sub(ptr.add(VECTOR_SIZE), start_ptr); + let mask1 = _mm_movemask_epi8(eqb1); + let mask2 = _mm_movemask_epi8(eqb2); + let mask3 = _mm_movemask_epi8(eqb3); + if mask1 != 0 || mask2 != 0 || mask3 != 0 { + return Some(at + reverse_pos3(mask1, mask2, mask3)); + } + + at -= VECTOR_SIZE; + let mask1 = _mm_movemask_epi8(eqa1); + let mask2 = _mm_movemask_epi8(eqa2); + let mask3 = _mm_movemask_epi8(eqa3); + return Some(at + reverse_pos3(mask1, mask2, mask3)); + } + } + while ptr >= start_ptr.add(VECTOR_SIZE) { + ptr = ptr.sub(VECTOR_SIZE); + if let Some(i) = + reverse_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) + { + return Some(i); + } + } + if ptr > start_ptr { + debug_assert!(sub(ptr, start_ptr) < VECTOR_SIZE); + return reverse_search3(start_ptr, end_ptr, start_ptr, vn1, vn2, vn3); + } + None +} + +#[target_feature(enable = "sse2")] +pub unsafe fn forward_search1( + start_ptr: *const u8, + end_ptr: *const u8, + ptr: *const u8, + vn1: __m128i, +) -> Option<usize> { + debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE); + debug_assert!(start_ptr <= ptr); + debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE)); + + let chunk = _mm_loadu_si128(ptr as *const __m128i); + let mask = _mm_movemask_epi8(_mm_cmpeq_epi8(chunk, vn1)); + if mask != 0 { + Some(sub(ptr, start_ptr) + forward_pos(mask)) + } else { + None + } +} + +#[target_feature(enable = "sse2")] +unsafe fn forward_search2( + start_ptr: *const u8, + end_ptr: *const u8, + ptr: *const u8, + vn1: __m128i, + vn2: __m128i, +) -> Option<usize> { + debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE); + debug_assert!(start_ptr <= ptr); + debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE)); + + let chunk = _mm_loadu_si128(ptr as *const __m128i); + let eq1 = _mm_cmpeq_epi8(chunk, vn1); + let eq2 = _mm_cmpeq_epi8(chunk, vn2); + if _mm_movemask_epi8(_mm_or_si128(eq1, eq2)) != 0 { + let mask1 = _mm_movemask_epi8(eq1); + let mask2 = _mm_movemask_epi8(eq2); + Some(sub(ptr, start_ptr) + forward_pos2(mask1, mask2)) + } else { + None + } +} + +#[target_feature(enable = "sse2")] +pub unsafe fn forward_search3( + start_ptr: *const u8, + end_ptr: *const u8, + ptr: *const u8, + vn1: __m128i, + vn2: __m128i, + vn3: __m128i, +) -> Option<usize> { + debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE); + debug_assert!(start_ptr <= ptr); + debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE)); + + let chunk = _mm_loadu_si128(ptr as *const __m128i); + let eq1 = _mm_cmpeq_epi8(chunk, vn1); + let eq2 = _mm_cmpeq_epi8(chunk, vn2); + let eq3 = _mm_cmpeq_epi8(chunk, vn3); + let or = _mm_or_si128(eq1, eq2); + if _mm_movemask_epi8(_mm_or_si128(or, eq3)) != 0 { + let mask1 = _mm_movemask_epi8(eq1); + let mask2 = _mm_movemask_epi8(eq2); + let mask3 = _mm_movemask_epi8(eq3); + Some(sub(ptr, start_ptr) + forward_pos3(mask1, mask2, mask3)) + } else { + None + } +} + +#[target_feature(enable = "sse2")] +unsafe fn reverse_search1( + start_ptr: *const u8, + end_ptr: *const u8, + ptr: *const u8, + vn1: __m128i, +) -> Option<usize> { + debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE); + debug_assert!(start_ptr <= ptr); + debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE)); + + let chunk = _mm_loadu_si128(ptr as *const __m128i); + let mask = _mm_movemask_epi8(_mm_cmpeq_epi8(vn1, chunk)); + if mask != 0 { + Some(sub(ptr, start_ptr) + reverse_pos(mask)) + } else { + None + } +} + +#[target_feature(enable = "sse2")] +unsafe fn reverse_search2( + start_ptr: *const u8, + end_ptr: *const u8, + ptr: *const u8, + vn1: __m128i, + vn2: __m128i, +) -> Option<usize> { + debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE); + debug_assert!(start_ptr <= ptr); + debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE)); + + let chunk = _mm_loadu_si128(ptr as *const __m128i); + let eq1 = _mm_cmpeq_epi8(chunk, vn1); + let eq2 = _mm_cmpeq_epi8(chunk, vn2); + if _mm_movemask_epi8(_mm_or_si128(eq1, eq2)) != 0 { + let mask1 = _mm_movemask_epi8(eq1); + let mask2 = _mm_movemask_epi8(eq2); + Some(sub(ptr, start_ptr) + reverse_pos2(mask1, mask2)) + } else { + None + } +} + +#[target_feature(enable = "sse2")] +unsafe fn reverse_search3( + start_ptr: *const u8, + end_ptr: *const u8, + ptr: *const u8, + vn1: __m128i, + vn2: __m128i, + vn3: __m128i, +) -> Option<usize> { + debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE); + debug_assert!(start_ptr <= ptr); + debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE)); + + let chunk = _mm_loadu_si128(ptr as *const __m128i); + let eq1 = _mm_cmpeq_epi8(chunk, vn1); + let eq2 = _mm_cmpeq_epi8(chunk, vn2); + let eq3 = _mm_cmpeq_epi8(chunk, vn3); + let or = _mm_or_si128(eq1, eq2); + if _mm_movemask_epi8(_mm_or_si128(or, eq3)) != 0 { + let mask1 = _mm_movemask_epi8(eq1); + let mask2 = _mm_movemask_epi8(eq2); + let mask3 = _mm_movemask_epi8(eq3); + Some(sub(ptr, start_ptr) + reverse_pos3(mask1, mask2, mask3)) + } else { + None + } +} + +/// Compute the position of the first matching byte from the given mask. The +/// position returned is always in the range [0, 15]. +/// +/// The mask given is expected to be the result of _mm_movemask_epi8. +fn forward_pos(mask: i32) -> usize { + // We are dealing with little endian here, where the most significant byte + // is at a higher address. That means the least significant bit that is set + // corresponds to the position of our first matching byte. That position + // corresponds to the number of zeros after the least significant bit. + mask.trailing_zeros() as usize +} + +/// Compute the position of the first matching byte from the given masks. The +/// position returned is always in the range [0, 15]. Each mask corresponds to +/// the equality comparison of a single byte. +/// +/// The masks given are expected to be the result of _mm_movemask_epi8, where +/// at least one of the masks is non-zero (i.e., indicates a match). +fn forward_pos2(mask1: i32, mask2: i32) -> usize { + debug_assert!(mask1 != 0 || mask2 != 0); + + forward_pos(mask1 | mask2) +} + +/// Compute the position of the first matching byte from the given masks. The +/// position returned is always in the range [0, 15]. Each mask corresponds to +/// the equality comparison of a single byte. +/// +/// The masks given are expected to be the result of _mm_movemask_epi8, where +/// at least one of the masks is non-zero (i.e., indicates a match). +fn forward_pos3(mask1: i32, mask2: i32, mask3: i32) -> usize { + debug_assert!(mask1 != 0 || mask2 != 0 || mask3 != 0); + + forward_pos(mask1 | mask2 | mask3) +} + +/// Compute the position of the last matching byte from the given mask. The +/// position returned is always in the range [0, 15]. +/// +/// The mask given is expected to be the result of _mm_movemask_epi8. +fn reverse_pos(mask: i32) -> usize { + // We are dealing with little endian here, where the most significant byte + // is at a higher address. That means the most significant bit that is set + // corresponds to the position of our last matching byte. The position from + // the end of the mask is therefore the number of leading zeros in a 16 + // bit integer, and the position from the start of the mask is therefore + // 16 - (leading zeros) - 1. + VECTOR_SIZE - (mask as u16).leading_zeros() as usize - 1 +} + +/// Compute the position of the last matching byte from the given masks. The +/// position returned is always in the range [0, 15]. Each mask corresponds to +/// the equality comparison of a single byte. +/// +/// The masks given are expected to be the result of _mm_movemask_epi8, where +/// at least one of the masks is non-zero (i.e., indicates a match). +fn reverse_pos2(mask1: i32, mask2: i32) -> usize { + debug_assert!(mask1 != 0 || mask2 != 0); + + reverse_pos(mask1 | mask2) +} + +/// Compute the position of the last matching byte from the given masks. The +/// position returned is always in the range [0, 15]. Each mask corresponds to +/// the equality comparison of a single byte. +/// +/// The masks given are expected to be the result of _mm_movemask_epi8, where +/// at least one of the masks is non-zero (i.e., indicates a match). +fn reverse_pos3(mask1: i32, mask2: i32, mask3: i32) -> usize { + debug_assert!(mask1 != 0 || mask2 != 0 || mask3 != 0); + + reverse_pos(mask1 | mask2 | mask3) +} + +/// Subtract `b` from `a` and return the difference. `a` should be greater than +/// or equal to `b`. +fn sub(a: *const u8, b: *const u8) -> usize { + debug_assert!(a >= b); + (a as usize) - (b as usize) +} diff --git a/third_party/rust/memchr/src/x86/sse42.rs b/third_party/rust/memchr/src/x86/sse42.rs new file mode 100644 index 0000000000..78a9b37973 --- /dev/null +++ b/third_party/rust/memchr/src/x86/sse42.rs @@ -0,0 +1,75 @@ +// This code is unused. PCMPESTRI is gratuitously slow. I imagine it might +// start winning with a hypothetical memchr4 (or greater). This technique might +// also be good for exposing searches over ranges of bytes, but that departs +// from the standard memchr API, so it's not clear whether we actually want +// that or not. +// +// N.B. PCMPISTRI appears to be about twice as fast as PCMPESTRI, which is kind +// of neat. Unfortunately, UTF-8 strings can contain NUL bytes, which means +// I don't see a way of effectively using PCMPISTRI unless there's some fast +// way to replace zero bytes with a byte that is not not a needle byte. + +use core::arch::x86_64::*; +use core::mem::size_of; + +use x86::sse2; + +const VECTOR_SIZE: usize = size_of::<__m128i>(); +const CONTROL_ANY: i32 = + _SIDD_UBYTE_OPS + | _SIDD_CMP_EQUAL_ANY + | _SIDD_POSITIVE_POLARITY + | _SIDD_LEAST_SIGNIFICANT; + +#[target_feature(enable = "sse4.2")] +pub unsafe fn memchr3( + n1: u8, n2: u8, n3: u8, + haystack: &[u8] +) -> Option<usize> { + let vn1 = _mm_set1_epi8(n1 as i8); + let vn2 = _mm_set1_epi8(n2 as i8); + let vn3 = _mm_set1_epi8(n3 as i8); + let vn = _mm_setr_epi8( + n1 as i8, n2 as i8, n3 as i8, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + ); + let len = haystack.len(); + let start_ptr = haystack.as_ptr(); + let end_ptr = haystack[haystack.len()..].as_ptr(); + let mut ptr = start_ptr; + + if haystack.len() < VECTOR_SIZE { + while ptr < end_ptr { + if *ptr == n1 || *ptr == n2 || *ptr == n3 { + return Some(sub(ptr, start_ptr)); + } + ptr = ptr.offset(1); + } + return None; + } + while ptr <= end_ptr.sub(VECTOR_SIZE) { + let chunk = _mm_loadu_si128(ptr as *const __m128i); + let res = _mm_cmpestri(vn, 3, chunk, 16, CONTROL_ANY); + if res < 16 { + return Some(sub(ptr, start_ptr) + res as usize); + } + ptr = ptr.add(VECTOR_SIZE); + } + if ptr < end_ptr { + debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE); + ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr)); + debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE); + + return sse2::forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3); + } + None +} + +/// Subtract `b` from `a` and return the difference. `a` should be greater than +/// or equal to `b`. +fn sub(a: *const u8, b: *const u8) -> usize { + debug_assert!(a >= b); + (a as usize) - (b as usize) +} |