Diffstat (limited to 'vendor/memchr/src/memchr')
-rw-r--r-- | vendor/memchr/src/memchr/c.rs | 44
-rw-r--r-- | vendor/memchr/src/memchr/fallback.rs | 329
-rw-r--r-- | vendor/memchr/src/memchr/iter.rs | 173
-rw-r--r-- | vendor/memchr/src/memchr/mod.rs | 410
-rw-r--r-- | vendor/memchr/src/memchr/naive.rs | 25
-rw-r--r-- | vendor/memchr/src/memchr/x86/avx.rs | 755
-rw-r--r-- | vendor/memchr/src/memchr/x86/mod.rs | 148
-rw-r--r-- | vendor/memchr/src/memchr/x86/sse2.rs | 791
-rw-r--r-- | vendor/memchr/src/memchr/x86/sse42.rs | 72
9 files changed, 0 insertions, 2747 deletions
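For context, the removed module is the crate's byte-search implementation: a pure-Rust fallback, optional libc wrappers, and runtime-dispatched x86 SIMD backends behind the public memchr/memrchr family. The deleted mod.rs below carries doc-tests for that API; condensed, and assuming the memchr crate as a dependency, they look like this:

use memchr::{memchr, memchr2, memrchr};

fn main() {
    let haystack = b"the quick brown fox";
    // First occurrence of a single byte ('k' is at index 8).
    assert_eq!(memchr(b'k', haystack), Some(8));
    // First occurrence of either byte ('q' at index 4 comes before 'k').
    assert_eq!(memchr2(b'k', b'q', haystack), Some(4));
    // Last occurrence of a single byte (the final 'o' in "fox").
    assert_eq!(memrchr(b'o', haystack), Some(17));
}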
diff --git a/vendor/memchr/src/memchr/c.rs b/vendor/memchr/src/memchr/c.rs deleted file mode 100644 index 608aabc98..000000000 --- a/vendor/memchr/src/memchr/c.rs +++ /dev/null @@ -1,44 +0,0 @@ -// This module defines safe wrappers around memchr (POSIX) and memrchr (GNU -// extension). - -#![allow(dead_code)] - -use libc::{c_int, c_void, size_t}; - -pub fn memchr(needle: u8, haystack: &[u8]) -> Option<usize> { - // SAFETY: This is safe to call since all pointers are valid. - let p = unsafe { - libc::memchr( - haystack.as_ptr() as *const c_void, - needle as c_int, - haystack.len() as size_t, - ) - }; - if p.is_null() { - None - } else { - Some(p as usize - (haystack.as_ptr() as usize)) - } -} - -// memrchr is a GNU extension. We know it's available on Linux at least. -#[cfg(target_os = "linux")] -pub fn memrchr(needle: u8, haystack: &[u8]) -> Option<usize> { - // GNU's memrchr() will - unlike memchr() - error if haystack is empty. - if haystack.is_empty() { - return None; - } - // SAFETY: This is safe to call since all pointers are valid. - let p = unsafe { - libc::memrchr( - haystack.as_ptr() as *const c_void, - needle as c_int, - haystack.len() as size_t, - ) - }; - if p.is_null() { - None - } else { - Some(p as usize - (haystack.as_ptr() as usize)) - } -} diff --git a/vendor/memchr/src/memchr/fallback.rs b/vendor/memchr/src/memchr/fallback.rs deleted file mode 100644 index b01f224fa..000000000 --- a/vendor/memchr/src/memchr/fallback.rs +++ /dev/null @@ -1,329 +0,0 @@ -// This module defines pure Rust platform independent implementations of all -// the memchr routines. We do our best to make them fast. Some of them may even -// get auto-vectorized. - -use core::{cmp, usize}; - -#[cfg(target_pointer_width = "16")] -const USIZE_BYTES: usize = 2; - -#[cfg(target_pointer_width = "32")] -const USIZE_BYTES: usize = 4; - -#[cfg(target_pointer_width = "64")] -const USIZE_BYTES: usize = 8; - -// The number of bytes to loop at in one iteration of memchr/memrchr. -const LOOP_SIZE: usize = 2 * USIZE_BYTES; - -/// Return `true` if `x` contains any zero byte. -/// -/// From *Matters Computational*, J. Arndt -/// -/// "The idea is to subtract one from each of the bytes and then look for -/// bytes where the borrow propagated all the way to the most significant -/// bit." -#[inline(always)] -fn contains_zero_byte(x: usize) -> bool { - const LO_U64: u64 = 0x0101010101010101; - const HI_U64: u64 = 0x8080808080808080; - - const LO_USIZE: usize = LO_U64 as usize; - const HI_USIZE: usize = HI_U64 as usize; - - x.wrapping_sub(LO_USIZE) & !x & HI_USIZE != 0 -} - -/// Repeat the given byte into a word size number. That is, every 8 bits -/// is equivalent to the given byte. For example, if `b` is `\x4E` or -/// `01001110` in binary, then the returned value on a 32-bit system would be: -/// `01001110_01001110_01001110_01001110`. 
-#[inline(always)] -fn repeat_byte(b: u8) -> usize { - (b as usize) * (usize::MAX / 255) -} - -pub fn memchr(n1: u8, haystack: &[u8]) -> Option<usize> { - let vn1 = repeat_byte(n1); - let confirm = |byte| byte == n1; - let loop_size = cmp::min(LOOP_SIZE, haystack.len()); - let align = USIZE_BYTES - 1; - let start_ptr = haystack.as_ptr(); - let mut ptr = start_ptr; - - unsafe { - let end_ptr = start_ptr.add(haystack.len()); - if haystack.len() < USIZE_BYTES { - return forward_search(start_ptr, end_ptr, ptr, confirm); - } - - let chunk = (ptr as *const usize).read_unaligned(); - if contains_zero_byte(chunk ^ vn1) { - return forward_search(start_ptr, end_ptr, ptr, confirm); - } - - ptr = ptr.add(USIZE_BYTES - (start_ptr as usize & align)); - debug_assert!(ptr > start_ptr); - debug_assert!(end_ptr.sub(USIZE_BYTES) >= start_ptr); - while loop_size == LOOP_SIZE && ptr <= end_ptr.sub(loop_size) { - debug_assert_eq!(0, (ptr as usize) % USIZE_BYTES); - - let a = *(ptr as *const usize); - let b = *(ptr.add(USIZE_BYTES) as *const usize); - let eqa = contains_zero_byte(a ^ vn1); - let eqb = contains_zero_byte(b ^ vn1); - if eqa || eqb { - break; - } - ptr = ptr.add(LOOP_SIZE); - } - forward_search(start_ptr, end_ptr, ptr, confirm) - } -} - -/// Like `memchr`, but searches for two bytes instead of one. -pub fn memchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> { - let vn1 = repeat_byte(n1); - let vn2 = repeat_byte(n2); - let confirm = |byte| byte == n1 || byte == n2; - let align = USIZE_BYTES - 1; - let start_ptr = haystack.as_ptr(); - let mut ptr = start_ptr; - - unsafe { - let end_ptr = start_ptr.add(haystack.len()); - if haystack.len() < USIZE_BYTES { - return forward_search(start_ptr, end_ptr, ptr, confirm); - } - - let chunk = (ptr as *const usize).read_unaligned(); - let eq1 = contains_zero_byte(chunk ^ vn1); - let eq2 = contains_zero_byte(chunk ^ vn2); - if eq1 || eq2 { - return forward_search(start_ptr, end_ptr, ptr, confirm); - } - - ptr = ptr.add(USIZE_BYTES - (start_ptr as usize & align)); - debug_assert!(ptr > start_ptr); - debug_assert!(end_ptr.sub(USIZE_BYTES) >= start_ptr); - while ptr <= end_ptr.sub(USIZE_BYTES) { - debug_assert_eq!(0, (ptr as usize) % USIZE_BYTES); - - let chunk = *(ptr as *const usize); - let eq1 = contains_zero_byte(chunk ^ vn1); - let eq2 = contains_zero_byte(chunk ^ vn2); - if eq1 || eq2 { - break; - } - ptr = ptr.add(USIZE_BYTES); - } - forward_search(start_ptr, end_ptr, ptr, confirm) - } -} - -/// Like `memchr`, but searches for three bytes instead of one. 
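// Aside (illustrative sketch, not part of the deleted file): the fallback routines
// above lean on the word-at-a-time trick from "Matters Computational" quoted in
// their comments. Restated on its own for a 64-bit word, the trick is:

const LO: u64 = 0x0101_0101_0101_0101;
const HI: u64 = 0x8080_8080_8080_8080;

/// True if any byte of `x` is zero: subtracting 1 borrows through the high bit
/// for bytes that started out as zero, and `!x & HI` filters out bytes whose
/// high bit was already set.
fn contains_zero_byte(x: u64) -> bool {
    x.wrapping_sub(LO) & !x & HI != 0
}

/// Broadcast a byte into every byte of the word, e.g. 0x4E -> 0x4E4E_4E4E_4E4E_4E4E.
fn repeat_byte(b: u8) -> u64 {
    (b as u64) * (u64::MAX / 255)
}

/// XOR zeroes exactly the bytes equal to `needle`, so the zero-byte test answers
/// "does this word contain the needle anywhere?".
fn word_has_byte(word: u64, needle: u8) -> bool {
    contains_zero_byte(word ^ repeat_byte(needle))
}

fn main() {
    let word = u64::from_le_bytes(*b"abcdefgh");
    assert!(word_has_byte(word, b'd'));
    assert!(!word_has_byte(word, b'z'));
}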
-pub fn memchr3(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option<usize> { - let vn1 = repeat_byte(n1); - let vn2 = repeat_byte(n2); - let vn3 = repeat_byte(n3); - let confirm = |byte| byte == n1 || byte == n2 || byte == n3; - let align = USIZE_BYTES - 1; - let start_ptr = haystack.as_ptr(); - let mut ptr = start_ptr; - - unsafe { - let end_ptr = start_ptr.add(haystack.len()); - if haystack.len() < USIZE_BYTES { - return forward_search(start_ptr, end_ptr, ptr, confirm); - } - - let chunk = (ptr as *const usize).read_unaligned(); - let eq1 = contains_zero_byte(chunk ^ vn1); - let eq2 = contains_zero_byte(chunk ^ vn2); - let eq3 = contains_zero_byte(chunk ^ vn3); - if eq1 || eq2 || eq3 { - return forward_search(start_ptr, end_ptr, ptr, confirm); - } - - ptr = ptr.add(USIZE_BYTES - (start_ptr as usize & align)); - debug_assert!(ptr > start_ptr); - debug_assert!(end_ptr.sub(USIZE_BYTES) >= start_ptr); - while ptr <= end_ptr.sub(USIZE_BYTES) { - debug_assert_eq!(0, (ptr as usize) % USIZE_BYTES); - - let chunk = *(ptr as *const usize); - let eq1 = contains_zero_byte(chunk ^ vn1); - let eq2 = contains_zero_byte(chunk ^ vn2); - let eq3 = contains_zero_byte(chunk ^ vn3); - if eq1 || eq2 || eq3 { - break; - } - ptr = ptr.add(USIZE_BYTES); - } - forward_search(start_ptr, end_ptr, ptr, confirm) - } -} - -/// Return the last index matching the byte `x` in `text`. -pub fn memrchr(n1: u8, haystack: &[u8]) -> Option<usize> { - let vn1 = repeat_byte(n1); - let confirm = |byte| byte == n1; - let loop_size = cmp::min(LOOP_SIZE, haystack.len()); - let align = USIZE_BYTES - 1; - let start_ptr = haystack.as_ptr(); - - unsafe { - let end_ptr = start_ptr.add(haystack.len()); - let mut ptr = end_ptr; - if haystack.len() < USIZE_BYTES { - return reverse_search(start_ptr, end_ptr, ptr, confirm); - } - - let chunk = (ptr.sub(USIZE_BYTES) as *const usize).read_unaligned(); - if contains_zero_byte(chunk ^ vn1) { - return reverse_search(start_ptr, end_ptr, ptr, confirm); - } - - ptr = (end_ptr as usize & !align) as *const u8; - debug_assert!(start_ptr <= ptr && ptr <= end_ptr); - while loop_size == LOOP_SIZE && ptr >= start_ptr.add(loop_size) { - debug_assert_eq!(0, (ptr as usize) % USIZE_BYTES); - - let a = *(ptr.sub(2 * USIZE_BYTES) as *const usize); - let b = *(ptr.sub(1 * USIZE_BYTES) as *const usize); - let eqa = contains_zero_byte(a ^ vn1); - let eqb = contains_zero_byte(b ^ vn1); - if eqa || eqb { - break; - } - ptr = ptr.sub(loop_size); - } - reverse_search(start_ptr, end_ptr, ptr, confirm) - } -} - -/// Like `memrchr`, but searches for two bytes instead of one. 
-pub fn memrchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> { - let vn1 = repeat_byte(n1); - let vn2 = repeat_byte(n2); - let confirm = |byte| byte == n1 || byte == n2; - let align = USIZE_BYTES - 1; - let start_ptr = haystack.as_ptr(); - - unsafe { - let end_ptr = start_ptr.add(haystack.len()); - let mut ptr = end_ptr; - if haystack.len() < USIZE_BYTES { - return reverse_search(start_ptr, end_ptr, ptr, confirm); - } - - let chunk = (ptr.sub(USIZE_BYTES) as *const usize).read_unaligned(); - let eq1 = contains_zero_byte(chunk ^ vn1); - let eq2 = contains_zero_byte(chunk ^ vn2); - if eq1 || eq2 { - return reverse_search(start_ptr, end_ptr, ptr, confirm); - } - - ptr = (end_ptr as usize & !align) as *const u8; - debug_assert!(start_ptr <= ptr && ptr <= end_ptr); - while ptr >= start_ptr.add(USIZE_BYTES) { - debug_assert_eq!(0, (ptr as usize) % USIZE_BYTES); - - let chunk = *(ptr.sub(USIZE_BYTES) as *const usize); - let eq1 = contains_zero_byte(chunk ^ vn1); - let eq2 = contains_zero_byte(chunk ^ vn2); - if eq1 || eq2 { - break; - } - ptr = ptr.sub(USIZE_BYTES); - } - reverse_search(start_ptr, end_ptr, ptr, confirm) - } -} - -/// Like `memrchr`, but searches for three bytes instead of one. -pub fn memrchr3(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option<usize> { - let vn1 = repeat_byte(n1); - let vn2 = repeat_byte(n2); - let vn3 = repeat_byte(n3); - let confirm = |byte| byte == n1 || byte == n2 || byte == n3; - let align = USIZE_BYTES - 1; - let start_ptr = haystack.as_ptr(); - - unsafe { - let end_ptr = start_ptr.add(haystack.len()); - let mut ptr = end_ptr; - if haystack.len() < USIZE_BYTES { - return reverse_search(start_ptr, end_ptr, ptr, confirm); - } - - let chunk = (ptr.sub(USIZE_BYTES) as *const usize).read_unaligned(); - let eq1 = contains_zero_byte(chunk ^ vn1); - let eq2 = contains_zero_byte(chunk ^ vn2); - let eq3 = contains_zero_byte(chunk ^ vn3); - if eq1 || eq2 || eq3 { - return reverse_search(start_ptr, end_ptr, ptr, confirm); - } - - ptr = (end_ptr as usize & !align) as *const u8; - debug_assert!(start_ptr <= ptr && ptr <= end_ptr); - while ptr >= start_ptr.add(USIZE_BYTES) { - debug_assert_eq!(0, (ptr as usize) % USIZE_BYTES); - - let chunk = *(ptr.sub(USIZE_BYTES) as *const usize); - let eq1 = contains_zero_byte(chunk ^ vn1); - let eq2 = contains_zero_byte(chunk ^ vn2); - let eq3 = contains_zero_byte(chunk ^ vn3); - if eq1 || eq2 || eq3 { - break; - } - ptr = ptr.sub(USIZE_BYTES); - } - reverse_search(start_ptr, end_ptr, ptr, confirm) - } -} - -#[inline(always)] -unsafe fn forward_search<F: Fn(u8) -> bool>( - start_ptr: *const u8, - end_ptr: *const u8, - mut ptr: *const u8, - confirm: F, -) -> Option<usize> { - debug_assert!(start_ptr <= ptr); - debug_assert!(ptr <= end_ptr); - - while ptr < end_ptr { - if confirm(*ptr) { - return Some(sub(ptr, start_ptr)); - } - ptr = ptr.offset(1); - } - None -} - -#[inline(always)] -unsafe fn reverse_search<F: Fn(u8) -> bool>( - start_ptr: *const u8, - end_ptr: *const u8, - mut ptr: *const u8, - confirm: F, -) -> Option<usize> { - debug_assert!(start_ptr <= ptr); - debug_assert!(ptr <= end_ptr); - - while ptr > start_ptr { - ptr = ptr.offset(-1); - if confirm(*ptr) { - return Some(sub(ptr, start_ptr)); - } - } - None -} - -/// Subtract `b` from `a` and return the difference. `a` should be greater than -/// or equal to `b`. 
-fn sub(a: *const u8, b: *const u8) -> usize { - debug_assert!(a >= b); - (a as usize) - (b as usize) -} diff --git a/vendor/memchr/src/memchr/iter.rs b/vendor/memchr/src/memchr/iter.rs deleted file mode 100644 index 16e203f63..000000000 --- a/vendor/memchr/src/memchr/iter.rs +++ /dev/null @@ -1,173 +0,0 @@ -use crate::{memchr, memchr2, memchr3, memrchr, memrchr2, memrchr3}; - -macro_rules! iter_next { - // Common code for the memchr iterators: - // update haystack and position and produce the index - // - // self: &mut Self where Self is the iterator - // search_result: Option<usize> which is the result of the corresponding - // memchr function. - // - // Returns Option<usize> (the next iterator element) - ($self_:expr, $search_result:expr) => { - $search_result.map(move |index| { - // split and take the remaining back half - $self_.haystack = $self_.haystack.split_at(index + 1).1; - let found_position = $self_.position + index; - $self_.position = found_position + 1; - found_position - }) - }; -} - -macro_rules! iter_next_back { - ($self_:expr, $search_result:expr) => { - $search_result.map(move |index| { - // split and take the remaining front half - $self_.haystack = $self_.haystack.split_at(index).0; - $self_.position + index - }) - }; -} - -/// An iterator for `memchr`. -pub struct Memchr<'a> { - needle: u8, - // The haystack to iterate over - haystack: &'a [u8], - // The index - position: usize, -} - -impl<'a> Memchr<'a> { - /// Creates a new iterator that yields all positions of needle in haystack. - #[inline] - pub fn new(needle: u8, haystack: &[u8]) -> Memchr<'_> { - Memchr { needle: needle, haystack: haystack, position: 0 } - } -} - -impl<'a> Iterator for Memchr<'a> { - type Item = usize; - - #[inline] - fn next(&mut self) -> Option<usize> { - iter_next!(self, memchr(self.needle, self.haystack)) - } - - #[inline] - fn size_hint(&self) -> (usize, Option<usize>) { - (0, Some(self.haystack.len())) - } -} - -impl<'a> DoubleEndedIterator for Memchr<'a> { - #[inline] - fn next_back(&mut self) -> Option<Self::Item> { - iter_next_back!(self, memrchr(self.needle, self.haystack)) - } -} - -/// An iterator for `memchr2`. -pub struct Memchr2<'a> { - needle1: u8, - needle2: u8, - // The haystack to iterate over - haystack: &'a [u8], - // The index - position: usize, -} - -impl<'a> Memchr2<'a> { - /// Creates a new iterator that yields all positions of needle in haystack. - #[inline] - pub fn new(needle1: u8, needle2: u8, haystack: &[u8]) -> Memchr2<'_> { - Memchr2 { - needle1: needle1, - needle2: needle2, - haystack: haystack, - position: 0, - } - } -} - -impl<'a> Iterator for Memchr2<'a> { - type Item = usize; - - #[inline] - fn next(&mut self) -> Option<usize> { - iter_next!(self, memchr2(self.needle1, self.needle2, self.haystack)) - } - - #[inline] - fn size_hint(&self) -> (usize, Option<usize>) { - (0, Some(self.haystack.len())) - } -} - -impl<'a> DoubleEndedIterator for Memchr2<'a> { - #[inline] - fn next_back(&mut self) -> Option<Self::Item> { - iter_next_back!( - self, - memrchr2(self.needle1, self.needle2, self.haystack) - ) - } -} - -/// An iterator for `memchr3`. 
-pub struct Memchr3<'a> { - needle1: u8, - needle2: u8, - needle3: u8, - // The haystack to iterate over - haystack: &'a [u8], - // The index - position: usize, -} - -impl<'a> Memchr3<'a> { - /// Create a new `Memchr3` that's initialized to zero with a haystack - #[inline] - pub fn new( - needle1: u8, - needle2: u8, - needle3: u8, - haystack: &[u8], - ) -> Memchr3<'_> { - Memchr3 { - needle1: needle1, - needle2: needle2, - needle3: needle3, - haystack: haystack, - position: 0, - } - } -} - -impl<'a> Iterator for Memchr3<'a> { - type Item = usize; - - #[inline] - fn next(&mut self) -> Option<usize> { - iter_next!( - self, - memchr3(self.needle1, self.needle2, self.needle3, self.haystack) - ) - } - - #[inline] - fn size_hint(&self) -> (usize, Option<usize>) { - (0, Some(self.haystack.len())) - } -} - -impl<'a> DoubleEndedIterator for Memchr3<'a> { - #[inline] - fn next_back(&mut self) -> Option<Self::Item> { - iter_next_back!( - self, - memrchr3(self.needle1, self.needle2, self.needle3, self.haystack) - ) - } -} diff --git a/vendor/memchr/src/memchr/mod.rs b/vendor/memchr/src/memchr/mod.rs deleted file mode 100644 index 09ce6ef3c..000000000 --- a/vendor/memchr/src/memchr/mod.rs +++ /dev/null @@ -1,410 +0,0 @@ -use core::iter::Rev; - -pub use self::iter::{Memchr, Memchr2, Memchr3}; - -// N.B. If you're looking for the cfg knobs for libc, see build.rs. -#[cfg(memchr_libc)] -mod c; -#[allow(dead_code)] -pub mod fallback; -mod iter; -pub mod naive; -#[cfg(all(not(miri), target_arch = "x86_64", memchr_runtime_simd))] -mod x86; - -/// An iterator over all occurrences of the needle in a haystack. -#[inline] -pub fn memchr_iter(needle: u8, haystack: &[u8]) -> Memchr<'_> { - Memchr::new(needle, haystack) -} - -/// An iterator over all occurrences of the needles in a haystack. -#[inline] -pub fn memchr2_iter(needle1: u8, needle2: u8, haystack: &[u8]) -> Memchr2<'_> { - Memchr2::new(needle1, needle2, haystack) -} - -/// An iterator over all occurrences of the needles in a haystack. -#[inline] -pub fn memchr3_iter( - needle1: u8, - needle2: u8, - needle3: u8, - haystack: &[u8], -) -> Memchr3<'_> { - Memchr3::new(needle1, needle2, needle3, haystack) -} - -/// An iterator over all occurrences of the needle in a haystack, in reverse. -#[inline] -pub fn memrchr_iter(needle: u8, haystack: &[u8]) -> Rev<Memchr<'_>> { - Memchr::new(needle, haystack).rev() -} - -/// An iterator over all occurrences of the needles in a haystack, in reverse. -#[inline] -pub fn memrchr2_iter( - needle1: u8, - needle2: u8, - haystack: &[u8], -) -> Rev<Memchr2<'_>> { - Memchr2::new(needle1, needle2, haystack).rev() -} - -/// An iterator over all occurrences of the needles in a haystack, in reverse. -#[inline] -pub fn memrchr3_iter( - needle1: u8, - needle2: u8, - needle3: u8, - haystack: &[u8], -) -> Rev<Memchr3<'_>> { - Memchr3::new(needle1, needle2, needle3, haystack).rev() -} - -/// Search for the first occurrence of a byte in a slice. -/// -/// This returns the index corresponding to the first occurrence of `needle` in -/// `haystack`, or `None` if one is not found. If an index is returned, it is -/// guaranteed to be less than `usize::MAX`. -/// -/// While this is operationally the same as something like -/// `haystack.iter().position(|&b| b == needle)`, `memchr` will use a highly -/// optimized routine that can be up to an order of magnitude faster in some -/// cases. -/// -/// # Example -/// -/// This shows how to find the first position of a byte in a byte string. 
-/// -/// ``` -/// use memchr::memchr; -/// -/// let haystack = b"the quick brown fox"; -/// assert_eq!(memchr(b'k', haystack), Some(8)); -/// ``` -#[inline] -pub fn memchr(needle: u8, haystack: &[u8]) -> Option<usize> { - #[cfg(miri)] - #[inline(always)] - fn imp(n1: u8, haystack: &[u8]) -> Option<usize> { - naive::memchr(n1, haystack) - } - - #[cfg(all(target_arch = "x86_64", memchr_runtime_simd, not(miri)))] - #[inline(always)] - fn imp(n1: u8, haystack: &[u8]) -> Option<usize> { - x86::memchr(n1, haystack) - } - - #[cfg(all( - memchr_libc, - not(all(target_arch = "x86_64", memchr_runtime_simd)), - not(miri), - ))] - #[inline(always)] - fn imp(n1: u8, haystack: &[u8]) -> Option<usize> { - c::memchr(n1, haystack) - } - - #[cfg(all( - not(memchr_libc), - not(all(target_arch = "x86_64", memchr_runtime_simd)), - not(miri), - ))] - #[inline(always)] - fn imp(n1: u8, haystack: &[u8]) -> Option<usize> { - fallback::memchr(n1, haystack) - } - - if haystack.is_empty() { - None - } else { - imp(needle, haystack) - } -} - -/// Like `memchr`, but searches for either of two bytes instead of just one. -/// -/// This returns the index corresponding to the first occurrence of `needle1` -/// or the first occurrence of `needle2` in `haystack` (whichever occurs -/// earlier), or `None` if neither one is found. If an index is returned, it is -/// guaranteed to be less than `usize::MAX`. -/// -/// While this is operationally the same as something like -/// `haystack.iter().position(|&b| b == needle1 || b == needle2)`, `memchr2` -/// will use a highly optimized routine that can be up to an order of magnitude -/// faster in some cases. -/// -/// # Example -/// -/// This shows how to find the first position of either of two bytes in a byte -/// string. -/// -/// ``` -/// use memchr::memchr2; -/// -/// let haystack = b"the quick brown fox"; -/// assert_eq!(memchr2(b'k', b'q', haystack), Some(4)); -/// ``` -#[inline] -pub fn memchr2(needle1: u8, needle2: u8, haystack: &[u8]) -> Option<usize> { - #[cfg(miri)] - #[inline(always)] - fn imp(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> { - naive::memchr2(n1, n2, haystack) - } - - #[cfg(all(target_arch = "x86_64", memchr_runtime_simd, not(miri)))] - #[inline(always)] - fn imp(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> { - x86::memchr2(n1, n2, haystack) - } - - #[cfg(all( - not(all(target_arch = "x86_64", memchr_runtime_simd)), - not(miri), - ))] - #[inline(always)] - fn imp(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> { - fallback::memchr2(n1, n2, haystack) - } - - if haystack.is_empty() { - None - } else { - imp(needle1, needle2, haystack) - } -} - -/// Like `memchr`, but searches for any of three bytes instead of just one. -/// -/// This returns the index corresponding to the first occurrence of `needle1`, -/// the first occurrence of `needle2`, or the first occurrence of `needle3` in -/// `haystack` (whichever occurs earliest), or `None` if none are found. If an -/// index is returned, it is guaranteed to be less than `usize::MAX`. -/// -/// While this is operationally the same as something like -/// `haystack.iter().position(|&b| b == needle1 || b == needle2 || -/// b == needle3)`, `memchr3` will use a highly optimized routine that can be -/// up to an order of magnitude faster in some cases. -/// -/// # Example -/// -/// This shows how to find the first position of any of three bytes in a byte -/// string. 
-/// -/// ``` -/// use memchr::memchr3; -/// -/// let haystack = b"the quick brown fox"; -/// assert_eq!(memchr3(b'k', b'q', b'e', haystack), Some(2)); -/// ``` -#[inline] -pub fn memchr3( - needle1: u8, - needle2: u8, - needle3: u8, - haystack: &[u8], -) -> Option<usize> { - #[cfg(miri)] - #[inline(always)] - fn imp(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option<usize> { - naive::memchr3(n1, n2, n3, haystack) - } - - #[cfg(all(target_arch = "x86_64", memchr_runtime_simd, not(miri)))] - #[inline(always)] - fn imp(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option<usize> { - x86::memchr3(n1, n2, n3, haystack) - } - - #[cfg(all( - not(all(target_arch = "x86_64", memchr_runtime_simd)), - not(miri), - ))] - #[inline(always)] - fn imp(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option<usize> { - fallback::memchr3(n1, n2, n3, haystack) - } - - if haystack.is_empty() { - None - } else { - imp(needle1, needle2, needle3, haystack) - } -} - -/// Search for the last occurrence of a byte in a slice. -/// -/// This returns the index corresponding to the last occurrence of `needle` in -/// `haystack`, or `None` if one is not found. If an index is returned, it is -/// guaranteed to be less than `usize::MAX`. -/// -/// While this is operationally the same as something like -/// `haystack.iter().rposition(|&b| b == needle)`, `memrchr` will use a highly -/// optimized routine that can be up to an order of magnitude faster in some -/// cases. -/// -/// # Example -/// -/// This shows how to find the last position of a byte in a byte string. -/// -/// ``` -/// use memchr::memrchr; -/// -/// let haystack = b"the quick brown fox"; -/// assert_eq!(memrchr(b'o', haystack), Some(17)); -/// ``` -#[inline] -pub fn memrchr(needle: u8, haystack: &[u8]) -> Option<usize> { - #[cfg(miri)] - #[inline(always)] - fn imp(n1: u8, haystack: &[u8]) -> Option<usize> { - naive::memrchr(n1, haystack) - } - - #[cfg(all(target_arch = "x86_64", memchr_runtime_simd, not(miri)))] - #[inline(always)] - fn imp(n1: u8, haystack: &[u8]) -> Option<usize> { - x86::memrchr(n1, haystack) - } - - #[cfg(all( - memchr_libc, - target_os = "linux", - not(all(target_arch = "x86_64", memchr_runtime_simd)), - not(miri) - ))] - #[inline(always)] - fn imp(n1: u8, haystack: &[u8]) -> Option<usize> { - c::memrchr(n1, haystack) - } - - #[cfg(all( - not(all(memchr_libc, target_os = "linux")), - not(all(target_arch = "x86_64", memchr_runtime_simd)), - not(miri), - ))] - #[inline(always)] - fn imp(n1: u8, haystack: &[u8]) -> Option<usize> { - fallback::memrchr(n1, haystack) - } - - if haystack.is_empty() { - None - } else { - imp(needle, haystack) - } -} - -/// Like `memrchr`, but searches for either of two bytes instead of just one. -/// -/// This returns the index corresponding to the last occurrence of `needle1` or -/// the last occurrence of `needle2` in `haystack` (whichever occurs later), or -/// `None` if neither one is found. If an index is returned, it is guaranteed -/// to be less than `usize::MAX`. -/// -/// While this is operationally the same as something like -/// `haystack.iter().rposition(|&b| b == needle1 || b == needle2)`, `memrchr2` -/// will use a highly optimized routine that can be up to an order of magnitude -/// faster in some cases. -/// -/// # Example -/// -/// This shows how to find the last position of either of two bytes in a byte -/// string. 
-/// -/// ``` -/// use memchr::memrchr2; -/// -/// let haystack = b"the quick brown fox"; -/// assert_eq!(memrchr2(b'k', b'q', haystack), Some(8)); -/// ``` -#[inline] -pub fn memrchr2(needle1: u8, needle2: u8, haystack: &[u8]) -> Option<usize> { - #[cfg(miri)] - #[inline(always)] - fn imp(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> { - naive::memrchr2(n1, n2, haystack) - } - - #[cfg(all(target_arch = "x86_64", memchr_runtime_simd, not(miri)))] - #[inline(always)] - fn imp(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> { - x86::memrchr2(n1, n2, haystack) - } - - #[cfg(all( - not(all(target_arch = "x86_64", memchr_runtime_simd)), - not(miri), - ))] - #[inline(always)] - fn imp(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> { - fallback::memrchr2(n1, n2, haystack) - } - - if haystack.is_empty() { - None - } else { - imp(needle1, needle2, haystack) - } -} - -/// Like `memrchr`, but searches for any of three bytes instead of just one. -/// -/// This returns the index corresponding to the last occurrence of `needle1`, -/// the last occurrence of `needle2`, or the last occurrence of `needle3` in -/// `haystack` (whichever occurs later), or `None` if none are found. If an -/// index is returned, it is guaranteed to be less than `usize::MAX`. -/// -/// While this is operationally the same as something like -/// `haystack.iter().rposition(|&b| b == needle1 || b == needle2 || -/// b == needle3)`, `memrchr3` will use a highly optimized routine that can be -/// up to an order of magnitude faster in some cases. -/// -/// # Example -/// -/// This shows how to find the last position of any of three bytes in a byte -/// string. -/// -/// ``` -/// use memchr::memrchr3; -/// -/// let haystack = b"the quick brown fox"; -/// assert_eq!(memrchr3(b'k', b'q', b'e', haystack), Some(8)); -/// ``` -#[inline] -pub fn memrchr3( - needle1: u8, - needle2: u8, - needle3: u8, - haystack: &[u8], -) -> Option<usize> { - #[cfg(miri)] - #[inline(always)] - fn imp(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option<usize> { - naive::memrchr3(n1, n2, n3, haystack) - } - - #[cfg(all(target_arch = "x86_64", memchr_runtime_simd, not(miri)))] - #[inline(always)] - fn imp(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option<usize> { - x86::memrchr3(n1, n2, n3, haystack) - } - - #[cfg(all( - not(all(target_arch = "x86_64", memchr_runtime_simd)), - not(miri), - ))] - #[inline(always)] - fn imp(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option<usize> { - fallback::memrchr3(n1, n2, n3, haystack) - } - - if haystack.is_empty() { - None - } else { - imp(needle1, needle2, needle3, haystack) - } -} diff --git a/vendor/memchr/src/memchr/naive.rs b/vendor/memchr/src/memchr/naive.rs deleted file mode 100644 index 3f3053d48..000000000 --- a/vendor/memchr/src/memchr/naive.rs +++ /dev/null @@ -1,25 +0,0 @@ -#![allow(dead_code)] - -pub fn memchr(n1: u8, haystack: &[u8]) -> Option<usize> { - haystack.iter().position(|&b| b == n1) -} - -pub fn memchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> { - haystack.iter().position(|&b| b == n1 || b == n2) -} - -pub fn memchr3(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option<usize> { - haystack.iter().position(|&b| b == n1 || b == n2 || b == n3) -} - -pub fn memrchr(n1: u8, haystack: &[u8]) -> Option<usize> { - haystack.iter().rposition(|&b| b == n1) -} - -pub fn memrchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> { - haystack.iter().rposition(|&b| b == n1 || b == n2) -} - -pub fn memrchr3(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option<usize> { - haystack.iter().rposition(|&b| b 
== n1 || b == n2 || b == n3) -} diff --git a/vendor/memchr/src/memchr/x86/avx.rs b/vendor/memchr/src/memchr/x86/avx.rs deleted file mode 100644 index 535123097..000000000 --- a/vendor/memchr/src/memchr/x86/avx.rs +++ /dev/null @@ -1,755 +0,0 @@ -use core::{arch::x86_64::*, cmp, mem::size_of}; - -use super::sse2; - -const VECTOR_SIZE: usize = size_of::<__m256i>(); -const VECTOR_ALIGN: usize = VECTOR_SIZE - 1; - -// The number of bytes to loop at in one iteration of memchr/memrchr. -const LOOP_SIZE: usize = 4 * VECTOR_SIZE; - -// The number of bytes to loop at in one iteration of memchr2/memrchr2 and -// memchr3/memrchr3. There was no observable difference between 128 and 64 -// bytes in benchmarks. memchr3 in particular only gets a very slight speed up -// from the loop unrolling. -const LOOP_SIZE2: usize = 2 * VECTOR_SIZE; - -#[target_feature(enable = "avx2")] -pub unsafe fn memchr(n1: u8, haystack: &[u8]) -> Option<usize> { - // For a high level explanation for how this algorithm works, see the - // sse2 implementation. The avx implementation here is the same, but with - // 256-bit vectors instead of 128-bit vectors. - - // This routine is called whenever a match is detected. It is specifically - // marked as unlineable because it improves the codegen of the unrolled - // loop below. Inlining this seems to cause codegen with some extra adds - // and a load that aren't necessary. This seems to result in about a 10% - // improvement for the memchr1/crate/huge/never benchmark. - // - // Interestingly, I couldn't observe a similar improvement for memrchr. - #[cold] - #[inline(never)] - #[target_feature(enable = "avx2")] - unsafe fn matched( - start_ptr: *const u8, - ptr: *const u8, - eqa: __m256i, - eqb: __m256i, - eqc: __m256i, - eqd: __m256i, - ) -> usize { - let mut at = sub(ptr, start_ptr); - let mask = _mm256_movemask_epi8(eqa); - if mask != 0 { - return at + forward_pos(mask); - } - - at += VECTOR_SIZE; - let mask = _mm256_movemask_epi8(eqb); - if mask != 0 { - return at + forward_pos(mask); - } - - at += VECTOR_SIZE; - let mask = _mm256_movemask_epi8(eqc); - if mask != 0 { - return at + forward_pos(mask); - } - - at += VECTOR_SIZE; - let mask = _mm256_movemask_epi8(eqd); - debug_assert!(mask != 0); - at + forward_pos(mask) - } - - let start_ptr = haystack.as_ptr(); - let end_ptr = start_ptr.add(haystack.len()); - let mut ptr = start_ptr; - - if haystack.len() < VECTOR_SIZE { - // For small haystacks, defer to the SSE2 implementation. Codegen - // suggests this completely avoids touching the AVX vectors. 
- return sse2::memchr(n1, haystack); - } - - let vn1 = _mm256_set1_epi8(n1 as i8); - let loop_size = cmp::min(LOOP_SIZE, haystack.len()); - if let Some(i) = forward_search1(start_ptr, end_ptr, ptr, vn1) { - return Some(i); - } - - ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN)); - debug_assert!(ptr > start_ptr && end_ptr.sub(VECTOR_SIZE) >= start_ptr); - while loop_size == LOOP_SIZE && ptr <= end_ptr.sub(loop_size) { - debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE); - - let a = _mm256_load_si256(ptr as *const __m256i); - let b = _mm256_load_si256(ptr.add(VECTOR_SIZE) as *const __m256i); - let c = _mm256_load_si256(ptr.add(2 * VECTOR_SIZE) as *const __m256i); - let d = _mm256_load_si256(ptr.add(3 * VECTOR_SIZE) as *const __m256i); - let eqa = _mm256_cmpeq_epi8(vn1, a); - let eqb = _mm256_cmpeq_epi8(vn1, b); - let eqc = _mm256_cmpeq_epi8(vn1, c); - let eqd = _mm256_cmpeq_epi8(vn1, d); - let or1 = _mm256_or_si256(eqa, eqb); - let or2 = _mm256_or_si256(eqc, eqd); - let or3 = _mm256_or_si256(or1, or2); - - if _mm256_movemask_epi8(or3) != 0 { - return Some(matched(start_ptr, ptr, eqa, eqb, eqc, eqd)); - } - ptr = ptr.add(loop_size); - } - while ptr <= end_ptr.sub(VECTOR_SIZE) { - debug_assert!(sub(end_ptr, ptr) >= VECTOR_SIZE); - - if let Some(i) = forward_search1(start_ptr, end_ptr, ptr, vn1) { - return Some(i); - } - ptr = ptr.add(VECTOR_SIZE); - } - if ptr < end_ptr { - debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE); - ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr)); - debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE); - - return forward_search1(start_ptr, end_ptr, ptr, vn1); - } - None -} - -#[target_feature(enable = "avx2")] -pub unsafe fn memchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> { - #[cold] - #[inline(never)] - #[target_feature(enable = "avx2")] - unsafe fn matched( - start_ptr: *const u8, - ptr: *const u8, - eqa1: __m256i, - eqa2: __m256i, - eqb1: __m256i, - eqb2: __m256i, - ) -> usize { - let mut at = sub(ptr, start_ptr); - let mask1 = _mm256_movemask_epi8(eqa1); - let mask2 = _mm256_movemask_epi8(eqa2); - if mask1 != 0 || mask2 != 0 { - return at + forward_pos2(mask1, mask2); - } - - at += VECTOR_SIZE; - let mask1 = _mm256_movemask_epi8(eqb1); - let mask2 = _mm256_movemask_epi8(eqb2); - at + forward_pos2(mask1, mask2) - } - - let vn1 = _mm256_set1_epi8(n1 as i8); - let vn2 = _mm256_set1_epi8(n2 as i8); - let len = haystack.len(); - let loop_size = cmp::min(LOOP_SIZE2, len); - let start_ptr = haystack.as_ptr(); - let end_ptr = start_ptr.add(haystack.len()); - let mut ptr = start_ptr; - - if haystack.len() < VECTOR_SIZE { - while ptr < end_ptr { - if *ptr == n1 || *ptr == n2 { - return Some(sub(ptr, start_ptr)); - } - ptr = ptr.offset(1); - } - return None; - } - - if let Some(i) = forward_search2(start_ptr, end_ptr, ptr, vn1, vn2) { - return Some(i); - } - - ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN)); - debug_assert!(ptr > start_ptr && end_ptr.sub(VECTOR_SIZE) >= start_ptr); - while loop_size == LOOP_SIZE2 && ptr <= end_ptr.sub(loop_size) { - debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE); - - let a = _mm256_load_si256(ptr as *const __m256i); - let b = _mm256_load_si256(ptr.add(VECTOR_SIZE) as *const __m256i); - let eqa1 = _mm256_cmpeq_epi8(vn1, a); - let eqb1 = _mm256_cmpeq_epi8(vn1, b); - let eqa2 = _mm256_cmpeq_epi8(vn2, a); - let eqb2 = _mm256_cmpeq_epi8(vn2, b); - let or1 = _mm256_or_si256(eqa1, eqb1); - let or2 = _mm256_or_si256(eqa2, eqb2); - let or3 = _mm256_or_si256(or1, or2); - if _mm256_movemask_epi8(or3) != 0 { - 
return Some(matched(start_ptr, ptr, eqa1, eqa2, eqb1, eqb2)); - } - ptr = ptr.add(loop_size); - } - while ptr <= end_ptr.sub(VECTOR_SIZE) { - if let Some(i) = forward_search2(start_ptr, end_ptr, ptr, vn1, vn2) { - return Some(i); - } - ptr = ptr.add(VECTOR_SIZE); - } - if ptr < end_ptr { - debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE); - ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr)); - debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE); - - return forward_search2(start_ptr, end_ptr, ptr, vn1, vn2); - } - None -} - -#[target_feature(enable = "avx2")] -pub unsafe fn memchr3( - n1: u8, - n2: u8, - n3: u8, - haystack: &[u8], -) -> Option<usize> { - #[cold] - #[inline(never)] - #[target_feature(enable = "avx2")] - unsafe fn matched( - start_ptr: *const u8, - ptr: *const u8, - eqa1: __m256i, - eqa2: __m256i, - eqa3: __m256i, - eqb1: __m256i, - eqb2: __m256i, - eqb3: __m256i, - ) -> usize { - let mut at = sub(ptr, start_ptr); - let mask1 = _mm256_movemask_epi8(eqa1); - let mask2 = _mm256_movemask_epi8(eqa2); - let mask3 = _mm256_movemask_epi8(eqa3); - if mask1 != 0 || mask2 != 0 || mask3 != 0 { - return at + forward_pos3(mask1, mask2, mask3); - } - - at += VECTOR_SIZE; - let mask1 = _mm256_movemask_epi8(eqb1); - let mask2 = _mm256_movemask_epi8(eqb2); - let mask3 = _mm256_movemask_epi8(eqb3); - at + forward_pos3(mask1, mask2, mask3) - } - - let vn1 = _mm256_set1_epi8(n1 as i8); - let vn2 = _mm256_set1_epi8(n2 as i8); - let vn3 = _mm256_set1_epi8(n3 as i8); - let len = haystack.len(); - let loop_size = cmp::min(LOOP_SIZE2, len); - let start_ptr = haystack.as_ptr(); - let end_ptr = start_ptr.add(haystack.len()); - let mut ptr = start_ptr; - - if haystack.len() < VECTOR_SIZE { - while ptr < end_ptr { - if *ptr == n1 || *ptr == n2 || *ptr == n3 { - return Some(sub(ptr, start_ptr)); - } - ptr = ptr.offset(1); - } - return None; - } - - if let Some(i) = forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) { - return Some(i); - } - - ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN)); - debug_assert!(ptr > start_ptr && end_ptr.sub(VECTOR_SIZE) >= start_ptr); - while loop_size == LOOP_SIZE2 && ptr <= end_ptr.sub(loop_size) { - debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE); - - let a = _mm256_load_si256(ptr as *const __m256i); - let b = _mm256_load_si256(ptr.add(VECTOR_SIZE) as *const __m256i); - let eqa1 = _mm256_cmpeq_epi8(vn1, a); - let eqb1 = _mm256_cmpeq_epi8(vn1, b); - let eqa2 = _mm256_cmpeq_epi8(vn2, a); - let eqb2 = _mm256_cmpeq_epi8(vn2, b); - let eqa3 = _mm256_cmpeq_epi8(vn3, a); - let eqb3 = _mm256_cmpeq_epi8(vn3, b); - let or1 = _mm256_or_si256(eqa1, eqb1); - let or2 = _mm256_or_si256(eqa2, eqb2); - let or3 = _mm256_or_si256(eqa3, eqb3); - let or4 = _mm256_or_si256(or1, or2); - let or5 = _mm256_or_si256(or3, or4); - if _mm256_movemask_epi8(or5) != 0 { - return Some(matched( - start_ptr, ptr, eqa1, eqa2, eqa3, eqb1, eqb2, eqb3, - )); - } - ptr = ptr.add(loop_size); - } - while ptr <= end_ptr.sub(VECTOR_SIZE) { - if let Some(i) = - forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) - { - return Some(i); - } - ptr = ptr.add(VECTOR_SIZE); - } - if ptr < end_ptr { - debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE); - ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr)); - debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE); - - return forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3); - } - None -} - -#[target_feature(enable = "avx2")] -pub unsafe fn memrchr(n1: u8, haystack: &[u8]) -> Option<usize> { - let vn1 = _mm256_set1_epi8(n1 as i8); - let len = haystack.len(); - let 
loop_size = cmp::min(LOOP_SIZE, len); - let start_ptr = haystack.as_ptr(); - let end_ptr = start_ptr.add(haystack.len()); - let mut ptr = end_ptr; - - if haystack.len() < VECTOR_SIZE { - while ptr > start_ptr { - ptr = ptr.offset(-1); - if *ptr == n1 { - return Some(sub(ptr, start_ptr)); - } - } - return None; - } - - ptr = ptr.sub(VECTOR_SIZE); - if let Some(i) = reverse_search1(start_ptr, end_ptr, ptr, vn1) { - return Some(i); - } - - ptr = (end_ptr as usize & !VECTOR_ALIGN) as *const u8; - debug_assert!(start_ptr <= ptr && ptr <= end_ptr); - while loop_size == LOOP_SIZE && ptr >= start_ptr.add(loop_size) { - debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE); - - ptr = ptr.sub(loop_size); - let a = _mm256_load_si256(ptr as *const __m256i); - let b = _mm256_load_si256(ptr.add(VECTOR_SIZE) as *const __m256i); - let c = _mm256_load_si256(ptr.add(2 * VECTOR_SIZE) as *const __m256i); - let d = _mm256_load_si256(ptr.add(3 * VECTOR_SIZE) as *const __m256i); - let eqa = _mm256_cmpeq_epi8(vn1, a); - let eqb = _mm256_cmpeq_epi8(vn1, b); - let eqc = _mm256_cmpeq_epi8(vn1, c); - let eqd = _mm256_cmpeq_epi8(vn1, d); - let or1 = _mm256_or_si256(eqa, eqb); - let or2 = _mm256_or_si256(eqc, eqd); - let or3 = _mm256_or_si256(or1, or2); - if _mm256_movemask_epi8(or3) != 0 { - let mut at = sub(ptr.add(3 * VECTOR_SIZE), start_ptr); - let mask = _mm256_movemask_epi8(eqd); - if mask != 0 { - return Some(at + reverse_pos(mask)); - } - - at -= VECTOR_SIZE; - let mask = _mm256_movemask_epi8(eqc); - if mask != 0 { - return Some(at + reverse_pos(mask)); - } - - at -= VECTOR_SIZE; - let mask = _mm256_movemask_epi8(eqb); - if mask != 0 { - return Some(at + reverse_pos(mask)); - } - - at -= VECTOR_SIZE; - let mask = _mm256_movemask_epi8(eqa); - debug_assert!(mask != 0); - return Some(at + reverse_pos(mask)); - } - } - while ptr >= start_ptr.add(VECTOR_SIZE) { - ptr = ptr.sub(VECTOR_SIZE); - if let Some(i) = reverse_search1(start_ptr, end_ptr, ptr, vn1) { - return Some(i); - } - } - if ptr > start_ptr { - debug_assert!(sub(ptr, start_ptr) < VECTOR_SIZE); - return reverse_search1(start_ptr, end_ptr, start_ptr, vn1); - } - None -} - -#[target_feature(enable = "avx2")] -pub unsafe fn memrchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> { - let vn1 = _mm256_set1_epi8(n1 as i8); - let vn2 = _mm256_set1_epi8(n2 as i8); - let len = haystack.len(); - let loop_size = cmp::min(LOOP_SIZE2, len); - let start_ptr = haystack.as_ptr(); - let end_ptr = start_ptr.add(haystack.len()); - let mut ptr = end_ptr; - - if haystack.len() < VECTOR_SIZE { - while ptr > start_ptr { - ptr = ptr.offset(-1); - if *ptr == n1 || *ptr == n2 { - return Some(sub(ptr, start_ptr)); - } - } - return None; - } - - ptr = ptr.sub(VECTOR_SIZE); - if let Some(i) = reverse_search2(start_ptr, end_ptr, ptr, vn1, vn2) { - return Some(i); - } - - ptr = (end_ptr as usize & !VECTOR_ALIGN) as *const u8; - debug_assert!(start_ptr <= ptr && ptr <= end_ptr); - while loop_size == LOOP_SIZE2 && ptr >= start_ptr.add(loop_size) { - debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE); - - ptr = ptr.sub(loop_size); - let a = _mm256_load_si256(ptr as *const __m256i); - let b = _mm256_load_si256(ptr.add(VECTOR_SIZE) as *const __m256i); - let eqa1 = _mm256_cmpeq_epi8(vn1, a); - let eqb1 = _mm256_cmpeq_epi8(vn1, b); - let eqa2 = _mm256_cmpeq_epi8(vn2, a); - let eqb2 = _mm256_cmpeq_epi8(vn2, b); - let or1 = _mm256_or_si256(eqa1, eqb1); - let or2 = _mm256_or_si256(eqa2, eqb2); - let or3 = _mm256_or_si256(or1, or2); - if _mm256_movemask_epi8(or3) != 0 { - let mut at = 
sub(ptr.add(VECTOR_SIZE), start_ptr); - let mask1 = _mm256_movemask_epi8(eqb1); - let mask2 = _mm256_movemask_epi8(eqb2); - if mask1 != 0 || mask2 != 0 { - return Some(at + reverse_pos2(mask1, mask2)); - } - - at -= VECTOR_SIZE; - let mask1 = _mm256_movemask_epi8(eqa1); - let mask2 = _mm256_movemask_epi8(eqa2); - return Some(at + reverse_pos2(mask1, mask2)); - } - } - while ptr >= start_ptr.add(VECTOR_SIZE) { - ptr = ptr.sub(VECTOR_SIZE); - if let Some(i) = reverse_search2(start_ptr, end_ptr, ptr, vn1, vn2) { - return Some(i); - } - } - if ptr > start_ptr { - debug_assert!(sub(ptr, start_ptr) < VECTOR_SIZE); - return reverse_search2(start_ptr, end_ptr, start_ptr, vn1, vn2); - } - None -} - -#[target_feature(enable = "avx2")] -pub unsafe fn memrchr3( - n1: u8, - n2: u8, - n3: u8, - haystack: &[u8], -) -> Option<usize> { - let vn1 = _mm256_set1_epi8(n1 as i8); - let vn2 = _mm256_set1_epi8(n2 as i8); - let vn3 = _mm256_set1_epi8(n3 as i8); - let len = haystack.len(); - let loop_size = cmp::min(LOOP_SIZE2, len); - let start_ptr = haystack.as_ptr(); - let end_ptr = start_ptr.add(haystack.len()); - let mut ptr = end_ptr; - - if haystack.len() < VECTOR_SIZE { - while ptr > start_ptr { - ptr = ptr.offset(-1); - if *ptr == n1 || *ptr == n2 || *ptr == n3 { - return Some(sub(ptr, start_ptr)); - } - } - return None; - } - - ptr = ptr.sub(VECTOR_SIZE); - if let Some(i) = reverse_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) { - return Some(i); - } - - ptr = (end_ptr as usize & !VECTOR_ALIGN) as *const u8; - debug_assert!(start_ptr <= ptr && ptr <= end_ptr); - while loop_size == LOOP_SIZE2 && ptr >= start_ptr.add(loop_size) { - debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE); - - ptr = ptr.sub(loop_size); - let a = _mm256_load_si256(ptr as *const __m256i); - let b = _mm256_load_si256(ptr.add(VECTOR_SIZE) as *const __m256i); - let eqa1 = _mm256_cmpeq_epi8(vn1, a); - let eqb1 = _mm256_cmpeq_epi8(vn1, b); - let eqa2 = _mm256_cmpeq_epi8(vn2, a); - let eqb2 = _mm256_cmpeq_epi8(vn2, b); - let eqa3 = _mm256_cmpeq_epi8(vn3, a); - let eqb3 = _mm256_cmpeq_epi8(vn3, b); - let or1 = _mm256_or_si256(eqa1, eqb1); - let or2 = _mm256_or_si256(eqa2, eqb2); - let or3 = _mm256_or_si256(eqa3, eqb3); - let or4 = _mm256_or_si256(or1, or2); - let or5 = _mm256_or_si256(or3, or4); - if _mm256_movemask_epi8(or5) != 0 { - let mut at = sub(ptr.add(VECTOR_SIZE), start_ptr); - let mask1 = _mm256_movemask_epi8(eqb1); - let mask2 = _mm256_movemask_epi8(eqb2); - let mask3 = _mm256_movemask_epi8(eqb3); - if mask1 != 0 || mask2 != 0 || mask3 != 0 { - return Some(at + reverse_pos3(mask1, mask2, mask3)); - } - - at -= VECTOR_SIZE; - let mask1 = _mm256_movemask_epi8(eqa1); - let mask2 = _mm256_movemask_epi8(eqa2); - let mask3 = _mm256_movemask_epi8(eqa3); - return Some(at + reverse_pos3(mask1, mask2, mask3)); - } - } - while ptr >= start_ptr.add(VECTOR_SIZE) { - ptr = ptr.sub(VECTOR_SIZE); - if let Some(i) = - reverse_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) - { - return Some(i); - } - } - if ptr > start_ptr { - debug_assert!(sub(ptr, start_ptr) < VECTOR_SIZE); - return reverse_search3(start_ptr, end_ptr, start_ptr, vn1, vn2, vn3); - } - None -} - -#[target_feature(enable = "avx2")] -unsafe fn forward_search1( - start_ptr: *const u8, - end_ptr: *const u8, - ptr: *const u8, - vn1: __m256i, -) -> Option<usize> { - debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE); - debug_assert!(start_ptr <= ptr); - debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE)); - - let chunk = _mm256_loadu_si256(ptr as *const __m256i); - let mask = 
_mm256_movemask_epi8(_mm256_cmpeq_epi8(chunk, vn1)); - if mask != 0 { - Some(sub(ptr, start_ptr) + forward_pos(mask)) - } else { - None - } -} - -#[target_feature(enable = "avx2")] -unsafe fn forward_search2( - start_ptr: *const u8, - end_ptr: *const u8, - ptr: *const u8, - vn1: __m256i, - vn2: __m256i, -) -> Option<usize> { - debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE); - debug_assert!(start_ptr <= ptr); - debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE)); - - let chunk = _mm256_loadu_si256(ptr as *const __m256i); - let eq1 = _mm256_cmpeq_epi8(chunk, vn1); - let eq2 = _mm256_cmpeq_epi8(chunk, vn2); - if _mm256_movemask_epi8(_mm256_or_si256(eq1, eq2)) != 0 { - let mask1 = _mm256_movemask_epi8(eq1); - let mask2 = _mm256_movemask_epi8(eq2); - Some(sub(ptr, start_ptr) + forward_pos2(mask1, mask2)) - } else { - None - } -} - -#[target_feature(enable = "avx2")] -unsafe fn forward_search3( - start_ptr: *const u8, - end_ptr: *const u8, - ptr: *const u8, - vn1: __m256i, - vn2: __m256i, - vn3: __m256i, -) -> Option<usize> { - debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE); - debug_assert!(start_ptr <= ptr); - debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE)); - - let chunk = _mm256_loadu_si256(ptr as *const __m256i); - let eq1 = _mm256_cmpeq_epi8(chunk, vn1); - let eq2 = _mm256_cmpeq_epi8(chunk, vn2); - let eq3 = _mm256_cmpeq_epi8(chunk, vn3); - let or = _mm256_or_si256(eq1, eq2); - if _mm256_movemask_epi8(_mm256_or_si256(or, eq3)) != 0 { - let mask1 = _mm256_movemask_epi8(eq1); - let mask2 = _mm256_movemask_epi8(eq2); - let mask3 = _mm256_movemask_epi8(eq3); - Some(sub(ptr, start_ptr) + forward_pos3(mask1, mask2, mask3)) - } else { - None - } -} - -#[target_feature(enable = "avx2")] -unsafe fn reverse_search1( - start_ptr: *const u8, - end_ptr: *const u8, - ptr: *const u8, - vn1: __m256i, -) -> Option<usize> { - debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE); - debug_assert!(start_ptr <= ptr); - debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE)); - - let chunk = _mm256_loadu_si256(ptr as *const __m256i); - let mask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(vn1, chunk)); - if mask != 0 { - Some(sub(ptr, start_ptr) + reverse_pos(mask)) - } else { - None - } -} - -#[target_feature(enable = "avx2")] -unsafe fn reverse_search2( - start_ptr: *const u8, - end_ptr: *const u8, - ptr: *const u8, - vn1: __m256i, - vn2: __m256i, -) -> Option<usize> { - debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE); - debug_assert!(start_ptr <= ptr); - debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE)); - - let chunk = _mm256_loadu_si256(ptr as *const __m256i); - let eq1 = _mm256_cmpeq_epi8(chunk, vn1); - let eq2 = _mm256_cmpeq_epi8(chunk, vn2); - if _mm256_movemask_epi8(_mm256_or_si256(eq1, eq2)) != 0 { - let mask1 = _mm256_movemask_epi8(eq1); - let mask2 = _mm256_movemask_epi8(eq2); - Some(sub(ptr, start_ptr) + reverse_pos2(mask1, mask2)) - } else { - None - } -} - -#[target_feature(enable = "avx2")] -unsafe fn reverse_search3( - start_ptr: *const u8, - end_ptr: *const u8, - ptr: *const u8, - vn1: __m256i, - vn2: __m256i, - vn3: __m256i, -) -> Option<usize> { - debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE); - debug_assert!(start_ptr <= ptr); - debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE)); - - let chunk = _mm256_loadu_si256(ptr as *const __m256i); - let eq1 = _mm256_cmpeq_epi8(chunk, vn1); - let eq2 = _mm256_cmpeq_epi8(chunk, vn2); - let eq3 = _mm256_cmpeq_epi8(chunk, vn3); - let or = _mm256_or_si256(eq1, eq2); - if _mm256_movemask_epi8(_mm256_or_si256(or, eq3)) != 0 { - let mask1 = 
_mm256_movemask_epi8(eq1); - let mask2 = _mm256_movemask_epi8(eq2); - let mask3 = _mm256_movemask_epi8(eq3); - Some(sub(ptr, start_ptr) + reverse_pos3(mask1, mask2, mask3)) - } else { - None - } -} - -/// Compute the position of the first matching byte from the given mask. The -/// position returned is always in the range [0, 31]. -/// -/// The mask given is expected to be the result of _mm256_movemask_epi8. -fn forward_pos(mask: i32) -> usize { - // We are dealing with little endian here, where the most significant byte - // is at a higher address. That means the least significant bit that is set - // corresponds to the position of our first matching byte. That position - // corresponds to the number of zeros after the least significant bit. - mask.trailing_zeros() as usize -} - -/// Compute the position of the first matching byte from the given masks. The -/// position returned is always in the range [0, 31]. Each mask corresponds to -/// the equality comparison of a single byte. -/// -/// The masks given are expected to be the result of _mm256_movemask_epi8, -/// where at least one of the masks is non-zero (i.e., indicates a match). -fn forward_pos2(mask1: i32, mask2: i32) -> usize { - debug_assert!(mask1 != 0 || mask2 != 0); - - forward_pos(mask1 | mask2) -} - -/// Compute the position of the first matching byte from the given masks. The -/// position returned is always in the range [0, 31]. Each mask corresponds to -/// the equality comparison of a single byte. -/// -/// The masks given are expected to be the result of _mm256_movemask_epi8, -/// where at least one of the masks is non-zero (i.e., indicates a match). -fn forward_pos3(mask1: i32, mask2: i32, mask3: i32) -> usize { - debug_assert!(mask1 != 0 || mask2 != 0 || mask3 != 0); - - forward_pos(mask1 | mask2 | mask3) -} - -/// Compute the position of the last matching byte from the given mask. The -/// position returned is always in the range [0, 31]. -/// -/// The mask given is expected to be the result of _mm256_movemask_epi8. -fn reverse_pos(mask: i32) -> usize { - // We are dealing with little endian here, where the most significant byte - // is at a higher address. That means the most significant bit that is set - // corresponds to the position of our last matching byte. The position from - // the end of the mask is therefore the number of leading zeros in a 32 - // bit integer, and the position from the start of the mask is therefore - // 32 - (leading zeros) - 1. - VECTOR_SIZE - (mask as u32).leading_zeros() as usize - 1 -} - -/// Compute the position of the last matching byte from the given masks. The -/// position returned is always in the range [0, 31]. Each mask corresponds to -/// the equality comparison of a single byte. -/// -/// The masks given are expected to be the result of _mm256_movemask_epi8, -/// where at least one of the masks is non-zero (i.e., indicates a match). -fn reverse_pos2(mask1: i32, mask2: i32) -> usize { - debug_assert!(mask1 != 0 || mask2 != 0); - - reverse_pos(mask1 | mask2) -} - -/// Compute the position of the last matching byte from the given masks. The -/// position returned is always in the range [0, 31]. Each mask corresponds to -/// the equality comparison of a single byte. -/// -/// The masks given are expected to be the result of _mm256_movemask_epi8, -/// where at least one of the masks is non-zero (i.e., indicates a match). 
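// Aside (illustrative sketch, not part of the deleted file): the masks produced by
// _mm256_movemask_epi8 carry one bit per byte lane, so turning a mask into a match
// position is ordinary bit arithmetic, exactly as the comments above describe.
// With 32 lanes:

const LANES: u32 = 32;

/// Position of the first matching lane: the number of zeros below the lowest set bit.
fn forward_pos(mask: u32) -> u32 {
    debug_assert!(mask != 0);
    mask.trailing_zeros()
}

/// Position of the last matching lane: LANES - leading_zeros - 1.
fn reverse_pos(mask: u32) -> u32 {
    debug_assert!(mask != 0);
    LANES - mask.leading_zeros() - 1
}

fn main() {
    // Pretend lanes 3 and 20 compared equal to the needle.
    let mask = (1u32 << 20) | (1 << 3);
    assert_eq!(forward_pos(mask), 3);
    assert_eq!(reverse_pos(mask), 20);
}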
-fn reverse_pos3(mask1: i32, mask2: i32, mask3: i32) -> usize { - debug_assert!(mask1 != 0 || mask2 != 0 || mask3 != 0); - - reverse_pos(mask1 | mask2 | mask3) -} - -/// Subtract `b` from `a` and return the difference. `a` should be greater than -/// or equal to `b`. -fn sub(a: *const u8, b: *const u8) -> usize { - debug_assert!(a >= b); - (a as usize) - (b as usize) -} diff --git a/vendor/memchr/src/memchr/x86/mod.rs b/vendor/memchr/src/memchr/x86/mod.rs deleted file mode 100644 index aec35dbff..000000000 --- a/vendor/memchr/src/memchr/x86/mod.rs +++ /dev/null @@ -1,148 +0,0 @@ -use super::fallback; - -// We only use AVX when we can detect at runtime whether it's available, which -// requires std. -#[cfg(feature = "std")] -mod avx; -mod sse2; - -/// This macro employs a gcc-like "ifunc" trick where by upon first calling -/// `memchr` (for example), CPU feature detection will be performed at runtime -/// to determine the best implementation to use. After CPU feature detection -/// is done, we replace `memchr`'s function pointer with the selection. Upon -/// subsequent invocations, the CPU-specific routine is invoked directly, which -/// skips the CPU feature detection and subsequent branch that's required. -/// -/// While this typically doesn't matter for rare occurrences or when used on -/// larger haystacks, `memchr` can be called in tight loops where the overhead -/// of this branch can actually add up *and is measurable*. This trick was -/// necessary to bring this implementation up to glibc's speeds for the 'tiny' -/// benchmarks, for example. -/// -/// At some point, I expect the Rust ecosystem will get a nice macro for doing -/// exactly this, at which point, we can replace our hand-jammed version of it. -/// -/// N.B. The ifunc strategy does prevent function inlining of course, but -/// on modern CPUs, you'll probably end up with the AVX2 implementation, -/// which probably can't be inlined anyway---unless you've compiled your -/// entire program with AVX2 enabled. However, even then, the various memchr -/// implementations aren't exactly small, so inlining might not help anyway! -/// -/// # Safety -/// -/// Callers must ensure that fnty is function pointer type. -#[cfg(feature = "std")] -macro_rules! unsafe_ifunc { - ($fnty:ty, $name:ident, $haystack:ident, $($needle:ident),+) => {{ - use std::{mem, sync::atomic::{AtomicPtr, Ordering}}; - - type FnRaw = *mut (); - - static FN: AtomicPtr<()> = AtomicPtr::new(detect as FnRaw); - - fn detect($($needle: u8),+, haystack: &[u8]) -> Option<usize> { - let fun = - if cfg!(memchr_runtime_avx) && is_x86_feature_detected!("avx2") { - avx::$name as FnRaw - } else if cfg!(memchr_runtime_sse2) { - sse2::$name as FnRaw - } else { - fallback::$name as FnRaw - }; - FN.store(fun as FnRaw, Ordering::Relaxed); - // SAFETY: By virtue of the caller contract, $fnty is a function - // pointer, which is always safe to transmute with a *mut (). - // Also, if 'fun is the AVX routine, then it is guaranteed to be - // supported since we checked the avx2 feature. - unsafe { - mem::transmute::<FnRaw, $fnty>(fun)($($needle),+, haystack) - } - } - - // SAFETY: By virtue of the caller contract, $fnty is a function - // pointer, which is always safe to transmute with a *mut (). Also, if - // 'fun is the AVX routine, then it is guaranteed to be supported since - // we checked the avx2 feature. 
- unsafe { - let fun = FN.load(Ordering::Relaxed); - mem::transmute::<FnRaw, $fnty>(fun)($($needle),+, $haystack) - } - }} -} - -/// When std isn't available to provide runtime CPU feature detection, or if -/// runtime CPU feature detection has been explicitly disabled, then just -/// call our optimized SSE2 routine directly. SSE2 is avalbale on all x86_64 -/// targets, so no CPU feature detection is necessary. -/// -/// # Safety -/// -/// There are no safety requirements for this definition of the macro. It is -/// safe for all inputs since it is restricted to either the fallback routine -/// or the SSE routine, which is always safe to call on x86_64. -#[cfg(not(feature = "std"))] -macro_rules! unsafe_ifunc { - ($fnty:ty, $name:ident, $haystack:ident, $($needle:ident),+) => {{ - if cfg!(memchr_runtime_sse2) { - unsafe { sse2::$name($($needle),+, $haystack) } - } else { - fallback::$name($($needle),+, $haystack) - } - }} -} - -#[inline(always)] -pub fn memchr(n1: u8, haystack: &[u8]) -> Option<usize> { - unsafe_ifunc!(fn(u8, &[u8]) -> Option<usize>, memchr, haystack, n1) -} - -#[inline(always)] -pub fn memchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> { - unsafe_ifunc!( - fn(u8, u8, &[u8]) -> Option<usize>, - memchr2, - haystack, - n1, - n2 - ) -} - -#[inline(always)] -pub fn memchr3(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option<usize> { - unsafe_ifunc!( - fn(u8, u8, u8, &[u8]) -> Option<usize>, - memchr3, - haystack, - n1, - n2, - n3 - ) -} - -#[inline(always)] -pub fn memrchr(n1: u8, haystack: &[u8]) -> Option<usize> { - unsafe_ifunc!(fn(u8, &[u8]) -> Option<usize>, memrchr, haystack, n1) -} - -#[inline(always)] -pub fn memrchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> { - unsafe_ifunc!( - fn(u8, u8, &[u8]) -> Option<usize>, - memrchr2, - haystack, - n1, - n2 - ) -} - -#[inline(always)] -pub fn memrchr3(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option<usize> { - unsafe_ifunc!( - fn(u8, u8, u8, &[u8]) -> Option<usize>, - memrchr3, - haystack, - n1, - n2, - n3 - ) -} diff --git a/vendor/memchr/src/memchr/x86/sse2.rs b/vendor/memchr/src/memchr/x86/sse2.rs deleted file mode 100644 index b7b3a9328..000000000 --- a/vendor/memchr/src/memchr/x86/sse2.rs +++ /dev/null @@ -1,791 +0,0 @@ -use core::{arch::x86_64::*, cmp, mem::size_of}; - -const VECTOR_SIZE: usize = size_of::<__m128i>(); -const VECTOR_ALIGN: usize = VECTOR_SIZE - 1; - -// The number of bytes to loop at in one iteration of memchr/memrchr. -const LOOP_SIZE: usize = 4 * VECTOR_SIZE; - -// The number of bytes to loop at in one iteration of memchr2/memrchr2 and -// memchr3/memrchr3. There was no observable difference between 64 and 32 bytes -// in benchmarks. memchr3 in particular only gets a very slight speed up from -// the loop unrolling. -const LOOP_SIZE2: usize = 2 * VECTOR_SIZE; - -#[target_feature(enable = "sse2")] -pub unsafe fn memchr(n1: u8, haystack: &[u8]) -> Option<usize> { - // What follows is a fast SSE2-only algorithm to detect the position of - // `n1` in `haystack` if it exists. From what I know, this is the "classic" - // algorithm. I believe it can be found in places like glibc and Go's - // standard library. It appears to be well known and is elaborated on in - // more detail here: https://gms.tf/stdfind-and-memchr-optimizations.html - // - // While this routine is very long, the basic idea is actually very simple - // and can be expressed straight-forwardly in pseudo code: - // - // needle = (n1 << 15) | (n1 << 14) | ... 
| (n1 << 1) | n1 - // // Note: shift amount in bytes - // - // while i <= haystack.len() - 16: - // // A 16 byte vector. Each byte in chunk corresponds to a byte in - // // the haystack. - // chunk = haystack[i:i+16] - // // Compare bytes in needle with bytes in chunk. The result is a 16 - // // byte chunk where each byte is 0xFF if the corresponding bytes - // // in needle and chunk were equal, or 0x00 otherwise. - // eqs = cmpeq(needle, chunk) - // // Return a 32 bit integer where the most significant 16 bits - // // are always 0 and the lower 16 bits correspond to whether the - // // most significant bit in the correspond byte in `eqs` is set. - // // In other words, `mask as u16` has bit i set if and only if - // // needle[i] == chunk[i]. - // mask = movemask(eqs) - // - // // Mask is 0 if there is no match, and non-zero otherwise. - // if mask != 0: - // // trailing_zeros tells us the position of the least significant - // // bit that is set. - // return i + trailing_zeros(mask) - // - // // haystack length may not be a multiple of 16, so search the rest. - // while i < haystack.len(): - // if haystack[i] == n1: - // return i - // - // // No match found. - // return NULL - // - // In fact, we could loosely translate the above code to Rust line-for-line - // and it would be a pretty fast algorithm. But, we pull out all the stops - // to go as fast as possible: - // - // 1. We use aligned loads. That is, we do some finagling to make sure our - // primary loop not only proceeds in increments of 16 bytes, but that - // the address of haystack's pointer that we dereference is aligned to - // 16 bytes. 16 is a magic number here because it is the size of SSE2 - // 128-bit vector. (For the AVX2 algorithm, 32 is the magic number.) - // Therefore, to get aligned loads, our pointer's address must be evenly - // divisible by 16. - // 2. Our primary loop proceeds 64 bytes at a time instead of 16. It's - // kind of like loop unrolling, but we combine the equality comparisons - // using a vector OR such that we only need to extract a single mask to - // determine whether a match exists or not. If so, then we do some - // book-keeping to determine the precise location but otherwise mush on. - // 3. We use our "chunk" comparison routine in as many places as possible, - // even if it means using unaligned loads. In particular, if haystack - // starts with an unaligned address, then we do an unaligned load to - // search the first 16 bytes. We then start our primary loop at the - // smallest subsequent aligned address, which will actually overlap with - // previously searched bytes. But we're OK with that. We do a similar - // dance at the end of our primary loop. Finally, to avoid a - // byte-at-a-time loop at the end, we do a final 16 byte unaligned load - // that may overlap with a previous load. This is OK because it converts - // a loop into a small number of very fast vector instructions. - // - // The primary downside of this algorithm is that it's effectively - // completely unsafe. Therefore, we have to be super careful to avoid - // undefined behavior: - // - // 1. We use raw pointers everywhere. Not only does dereferencing a pointer - // require the pointer to be valid, but we actually can't even store the - // address of an invalid pointer (unless it's 1 past the end of - // haystack) without sacrificing performance. - // 2. _mm_loadu_si128 is used when you don't care about alignment, and - // _mm_load_si128 is used when you do care. You cannot use the latter - // on unaligned pointers. 
- // 3. We make liberal use of debug_assert! to check assumptions. - // 4. We make a concerted effort to stick with pointers instead of indices. - // Indices are nicer because there's less to worry about with them (see - // above about pointer offsets), but I could not get the compiler to - // produce as good of code as what the below produces. In any case, - // pointers are what we really care about here, and alignment is - // expressed a bit more naturally with them. - // - // In general, most of the algorithms in this crate have a similar - // structure to what you see below, so this comment applies fairly well to - // all of them. - - let vn1 = _mm_set1_epi8(n1 as i8); - let len = haystack.len(); - let loop_size = cmp::min(LOOP_SIZE, len); - let start_ptr = haystack.as_ptr(); - let end_ptr = start_ptr.add(haystack.len()); - let mut ptr = start_ptr; - - if haystack.len() < VECTOR_SIZE { - while ptr < end_ptr { - if *ptr == n1 { - return Some(sub(ptr, start_ptr)); - } - ptr = ptr.offset(1); - } - return None; - } - - if let Some(i) = forward_search1(start_ptr, end_ptr, ptr, vn1) { - return Some(i); - } - - ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN)); - debug_assert!(ptr > start_ptr && end_ptr.sub(VECTOR_SIZE) >= start_ptr); - while loop_size == LOOP_SIZE && ptr <= end_ptr.sub(loop_size) { - debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE); - - let a = _mm_load_si128(ptr as *const __m128i); - let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i); - let c = _mm_load_si128(ptr.add(2 * VECTOR_SIZE) as *const __m128i); - let d = _mm_load_si128(ptr.add(3 * VECTOR_SIZE) as *const __m128i); - let eqa = _mm_cmpeq_epi8(vn1, a); - let eqb = _mm_cmpeq_epi8(vn1, b); - let eqc = _mm_cmpeq_epi8(vn1, c); - let eqd = _mm_cmpeq_epi8(vn1, d); - let or1 = _mm_or_si128(eqa, eqb); - let or2 = _mm_or_si128(eqc, eqd); - let or3 = _mm_or_si128(or1, or2); - if _mm_movemask_epi8(or3) != 0 { - let mut at = sub(ptr, start_ptr); - let mask = _mm_movemask_epi8(eqa); - if mask != 0 { - return Some(at + forward_pos(mask)); - } - - at += VECTOR_SIZE; - let mask = _mm_movemask_epi8(eqb); - if mask != 0 { - return Some(at + forward_pos(mask)); - } - - at += VECTOR_SIZE; - let mask = _mm_movemask_epi8(eqc); - if mask != 0 { - return Some(at + forward_pos(mask)); - } - - at += VECTOR_SIZE; - let mask = _mm_movemask_epi8(eqd); - debug_assert!(mask != 0); - return Some(at + forward_pos(mask)); - } - ptr = ptr.add(loop_size); - } - while ptr <= end_ptr.sub(VECTOR_SIZE) { - debug_assert!(sub(end_ptr, ptr) >= VECTOR_SIZE); - - if let Some(i) = forward_search1(start_ptr, end_ptr, ptr, vn1) { - return Some(i); - } - ptr = ptr.add(VECTOR_SIZE); - } - if ptr < end_ptr { - debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE); - ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr)); - debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE); - - return forward_search1(start_ptr, end_ptr, ptr, vn1); - } - None -} - -#[target_feature(enable = "sse2")] -pub unsafe fn memchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> { - let vn1 = _mm_set1_epi8(n1 as i8); - let vn2 = _mm_set1_epi8(n2 as i8); - let len = haystack.len(); - let loop_size = cmp::min(LOOP_SIZE2, len); - let start_ptr = haystack.as_ptr(); - let end_ptr = start_ptr.add(haystack.len()); - let mut ptr = start_ptr; - - if haystack.len() < VECTOR_SIZE { - while ptr < end_ptr { - if *ptr == n1 || *ptr == n2 { - return Some(sub(ptr, start_ptr)); - } - ptr = ptr.offset(1); - } - return None; - } - - if let Some(i) = forward_search2(start_ptr, end_ptr, ptr, vn1, vn2) { 
- return Some(i); - } - - ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN)); - debug_assert!(ptr > start_ptr && end_ptr.sub(VECTOR_SIZE) >= start_ptr); - while loop_size == LOOP_SIZE2 && ptr <= end_ptr.sub(loop_size) { - debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE); - - let a = _mm_load_si128(ptr as *const __m128i); - let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i); - let eqa1 = _mm_cmpeq_epi8(vn1, a); - let eqb1 = _mm_cmpeq_epi8(vn1, b); - let eqa2 = _mm_cmpeq_epi8(vn2, a); - let eqb2 = _mm_cmpeq_epi8(vn2, b); - let or1 = _mm_or_si128(eqa1, eqb1); - let or2 = _mm_or_si128(eqa2, eqb2); - let or3 = _mm_or_si128(or1, or2); - if _mm_movemask_epi8(or3) != 0 { - let mut at = sub(ptr, start_ptr); - let mask1 = _mm_movemask_epi8(eqa1); - let mask2 = _mm_movemask_epi8(eqa2); - if mask1 != 0 || mask2 != 0 { - return Some(at + forward_pos2(mask1, mask2)); - } - - at += VECTOR_SIZE; - let mask1 = _mm_movemask_epi8(eqb1); - let mask2 = _mm_movemask_epi8(eqb2); - return Some(at + forward_pos2(mask1, mask2)); - } - ptr = ptr.add(loop_size); - } - while ptr <= end_ptr.sub(VECTOR_SIZE) { - if let Some(i) = forward_search2(start_ptr, end_ptr, ptr, vn1, vn2) { - return Some(i); - } - ptr = ptr.add(VECTOR_SIZE); - } - if ptr < end_ptr { - debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE); - ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr)); - debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE); - - return forward_search2(start_ptr, end_ptr, ptr, vn1, vn2); - } - None -} - -#[target_feature(enable = "sse2")] -pub unsafe fn memchr3( - n1: u8, - n2: u8, - n3: u8, - haystack: &[u8], -) -> Option<usize> { - let vn1 = _mm_set1_epi8(n1 as i8); - let vn2 = _mm_set1_epi8(n2 as i8); - let vn3 = _mm_set1_epi8(n3 as i8); - let len = haystack.len(); - let loop_size = cmp::min(LOOP_SIZE2, len); - let start_ptr = haystack.as_ptr(); - let end_ptr = start_ptr.add(haystack.len()); - let mut ptr = start_ptr; - - if haystack.len() < VECTOR_SIZE { - while ptr < end_ptr { - if *ptr == n1 || *ptr == n2 || *ptr == n3 { - return Some(sub(ptr, start_ptr)); - } - ptr = ptr.offset(1); - } - return None; - } - - if let Some(i) = forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) { - return Some(i); - } - - ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN)); - debug_assert!(ptr > start_ptr && end_ptr.sub(VECTOR_SIZE) >= start_ptr); - while loop_size == LOOP_SIZE2 && ptr <= end_ptr.sub(loop_size) { - debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE); - - let a = _mm_load_si128(ptr as *const __m128i); - let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i); - let eqa1 = _mm_cmpeq_epi8(vn1, a); - let eqb1 = _mm_cmpeq_epi8(vn1, b); - let eqa2 = _mm_cmpeq_epi8(vn2, a); - let eqb2 = _mm_cmpeq_epi8(vn2, b); - let eqa3 = _mm_cmpeq_epi8(vn3, a); - let eqb3 = _mm_cmpeq_epi8(vn3, b); - let or1 = _mm_or_si128(eqa1, eqb1); - let or2 = _mm_or_si128(eqa2, eqb2); - let or3 = _mm_or_si128(eqa3, eqb3); - let or4 = _mm_or_si128(or1, or2); - let or5 = _mm_or_si128(or3, or4); - if _mm_movemask_epi8(or5) != 0 { - let mut at = sub(ptr, start_ptr); - let mask1 = _mm_movemask_epi8(eqa1); - let mask2 = _mm_movemask_epi8(eqa2); - let mask3 = _mm_movemask_epi8(eqa3); - if mask1 != 0 || mask2 != 0 || mask3 != 0 { - return Some(at + forward_pos3(mask1, mask2, mask3)); - } - - at += VECTOR_SIZE; - let mask1 = _mm_movemask_epi8(eqb1); - let mask2 = _mm_movemask_epi8(eqb2); - let mask3 = _mm_movemask_epi8(eqb3); - return Some(at + forward_pos3(mask1, mask2, mask3)); - } - ptr = ptr.add(loop_size); - } - while ptr <= 
end_ptr.sub(VECTOR_SIZE) { - if let Some(i) = - forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) - { - return Some(i); - } - ptr = ptr.add(VECTOR_SIZE); - } - if ptr < end_ptr { - debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE); - ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr)); - debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE); - - return forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3); - } - None -} - -#[target_feature(enable = "sse2")] -pub unsafe fn memrchr(n1: u8, haystack: &[u8]) -> Option<usize> { - let vn1 = _mm_set1_epi8(n1 as i8); - let len = haystack.len(); - let loop_size = cmp::min(LOOP_SIZE, len); - let start_ptr = haystack.as_ptr(); - let end_ptr = start_ptr.add(haystack.len()); - let mut ptr = end_ptr; - - if haystack.len() < VECTOR_SIZE { - while ptr > start_ptr { - ptr = ptr.offset(-1); - if *ptr == n1 { - return Some(sub(ptr, start_ptr)); - } - } - return None; - } - - ptr = ptr.sub(VECTOR_SIZE); - if let Some(i) = reverse_search1(start_ptr, end_ptr, ptr, vn1) { - return Some(i); - } - - ptr = (end_ptr as usize & !VECTOR_ALIGN) as *const u8; - debug_assert!(start_ptr <= ptr && ptr <= end_ptr); - while loop_size == LOOP_SIZE && ptr >= start_ptr.add(loop_size) { - debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE); - - ptr = ptr.sub(loop_size); - let a = _mm_load_si128(ptr as *const __m128i); - let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i); - let c = _mm_load_si128(ptr.add(2 * VECTOR_SIZE) as *const __m128i); - let d = _mm_load_si128(ptr.add(3 * VECTOR_SIZE) as *const __m128i); - let eqa = _mm_cmpeq_epi8(vn1, a); - let eqb = _mm_cmpeq_epi8(vn1, b); - let eqc = _mm_cmpeq_epi8(vn1, c); - let eqd = _mm_cmpeq_epi8(vn1, d); - let or1 = _mm_or_si128(eqa, eqb); - let or2 = _mm_or_si128(eqc, eqd); - let or3 = _mm_or_si128(or1, or2); - if _mm_movemask_epi8(or3) != 0 { - let mut at = sub(ptr.add(3 * VECTOR_SIZE), start_ptr); - let mask = _mm_movemask_epi8(eqd); - if mask != 0 { - return Some(at + reverse_pos(mask)); - } - - at -= VECTOR_SIZE; - let mask = _mm_movemask_epi8(eqc); - if mask != 0 { - return Some(at + reverse_pos(mask)); - } - - at -= VECTOR_SIZE; - let mask = _mm_movemask_epi8(eqb); - if mask != 0 { - return Some(at + reverse_pos(mask)); - } - - at -= VECTOR_SIZE; - let mask = _mm_movemask_epi8(eqa); - debug_assert!(mask != 0); - return Some(at + reverse_pos(mask)); - } - } - while ptr >= start_ptr.add(VECTOR_SIZE) { - ptr = ptr.sub(VECTOR_SIZE); - if let Some(i) = reverse_search1(start_ptr, end_ptr, ptr, vn1) { - return Some(i); - } - } - if ptr > start_ptr { - debug_assert!(sub(ptr, start_ptr) < VECTOR_SIZE); - return reverse_search1(start_ptr, end_ptr, start_ptr, vn1); - } - None -} - -#[target_feature(enable = "sse2")] -pub unsafe fn memrchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> { - let vn1 = _mm_set1_epi8(n1 as i8); - let vn2 = _mm_set1_epi8(n2 as i8); - let len = haystack.len(); - let loop_size = cmp::min(LOOP_SIZE2, len); - let start_ptr = haystack.as_ptr(); - let end_ptr = start_ptr.add(haystack.len()); - let mut ptr = end_ptr; - - if haystack.len() < VECTOR_SIZE { - while ptr > start_ptr { - ptr = ptr.offset(-1); - if *ptr == n1 || *ptr == n2 { - return Some(sub(ptr, start_ptr)); - } - } - return None; - } - - ptr = ptr.sub(VECTOR_SIZE); - if let Some(i) = reverse_search2(start_ptr, end_ptr, ptr, vn1, vn2) { - return Some(i); - } - - ptr = (end_ptr as usize & !VECTOR_ALIGN) as *const u8; - debug_assert!(start_ptr <= ptr && ptr <= end_ptr); - while loop_size == LOOP_SIZE2 && ptr >= start_ptr.add(loop_size) { - 
debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE); - - ptr = ptr.sub(loop_size); - let a = _mm_load_si128(ptr as *const __m128i); - let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i); - let eqa1 = _mm_cmpeq_epi8(vn1, a); - let eqb1 = _mm_cmpeq_epi8(vn1, b); - let eqa2 = _mm_cmpeq_epi8(vn2, a); - let eqb2 = _mm_cmpeq_epi8(vn2, b); - let or1 = _mm_or_si128(eqa1, eqb1); - let or2 = _mm_or_si128(eqa2, eqb2); - let or3 = _mm_or_si128(or1, or2); - if _mm_movemask_epi8(or3) != 0 { - let mut at = sub(ptr.add(VECTOR_SIZE), start_ptr); - let mask1 = _mm_movemask_epi8(eqb1); - let mask2 = _mm_movemask_epi8(eqb2); - if mask1 != 0 || mask2 != 0 { - return Some(at + reverse_pos2(mask1, mask2)); - } - - at -= VECTOR_SIZE; - let mask1 = _mm_movemask_epi8(eqa1); - let mask2 = _mm_movemask_epi8(eqa2); - return Some(at + reverse_pos2(mask1, mask2)); - } - } - while ptr >= start_ptr.add(VECTOR_SIZE) { - ptr = ptr.sub(VECTOR_SIZE); - if let Some(i) = reverse_search2(start_ptr, end_ptr, ptr, vn1, vn2) { - return Some(i); - } - } - if ptr > start_ptr { - debug_assert!(sub(ptr, start_ptr) < VECTOR_SIZE); - return reverse_search2(start_ptr, end_ptr, start_ptr, vn1, vn2); - } - None -} - -#[target_feature(enable = "sse2")] -pub unsafe fn memrchr3( - n1: u8, - n2: u8, - n3: u8, - haystack: &[u8], -) -> Option<usize> { - let vn1 = _mm_set1_epi8(n1 as i8); - let vn2 = _mm_set1_epi8(n2 as i8); - let vn3 = _mm_set1_epi8(n3 as i8); - let len = haystack.len(); - let loop_size = cmp::min(LOOP_SIZE2, len); - let start_ptr = haystack.as_ptr(); - let end_ptr = start_ptr.add(haystack.len()); - let mut ptr = end_ptr; - - if haystack.len() < VECTOR_SIZE { - while ptr > start_ptr { - ptr = ptr.offset(-1); - if *ptr == n1 || *ptr == n2 || *ptr == n3 { - return Some(sub(ptr, start_ptr)); - } - } - return None; - } - - ptr = ptr.sub(VECTOR_SIZE); - if let Some(i) = reverse_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) { - return Some(i); - } - - ptr = (end_ptr as usize & !VECTOR_ALIGN) as *const u8; - debug_assert!(start_ptr <= ptr && ptr <= end_ptr); - while loop_size == LOOP_SIZE2 && ptr >= start_ptr.add(loop_size) { - debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE); - - ptr = ptr.sub(loop_size); - let a = _mm_load_si128(ptr as *const __m128i); - let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i); - let eqa1 = _mm_cmpeq_epi8(vn1, a); - let eqb1 = _mm_cmpeq_epi8(vn1, b); - let eqa2 = _mm_cmpeq_epi8(vn2, a); - let eqb2 = _mm_cmpeq_epi8(vn2, b); - let eqa3 = _mm_cmpeq_epi8(vn3, a); - let eqb3 = _mm_cmpeq_epi8(vn3, b); - let or1 = _mm_or_si128(eqa1, eqb1); - let or2 = _mm_or_si128(eqa2, eqb2); - let or3 = _mm_or_si128(eqa3, eqb3); - let or4 = _mm_or_si128(or1, or2); - let or5 = _mm_or_si128(or3, or4); - if _mm_movemask_epi8(or5) != 0 { - let mut at = sub(ptr.add(VECTOR_SIZE), start_ptr); - let mask1 = _mm_movemask_epi8(eqb1); - let mask2 = _mm_movemask_epi8(eqb2); - let mask3 = _mm_movemask_epi8(eqb3); - if mask1 != 0 || mask2 != 0 || mask3 != 0 { - return Some(at + reverse_pos3(mask1, mask2, mask3)); - } - - at -= VECTOR_SIZE; - let mask1 = _mm_movemask_epi8(eqa1); - let mask2 = _mm_movemask_epi8(eqa2); - let mask3 = _mm_movemask_epi8(eqa3); - return Some(at + reverse_pos3(mask1, mask2, mask3)); - } - } - while ptr >= start_ptr.add(VECTOR_SIZE) { - ptr = ptr.sub(VECTOR_SIZE); - if let Some(i) = - reverse_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) - { - return Some(i); - } - } - if ptr > start_ptr { - debug_assert!(sub(ptr, start_ptr) < VECTOR_SIZE); - return reverse_search3(start_ptr, end_ptr, 
start_ptr, vn1, vn2, vn3); - } - None -} - -#[target_feature(enable = "sse2")] -pub unsafe fn forward_search1( - start_ptr: *const u8, - end_ptr: *const u8, - ptr: *const u8, - vn1: __m128i, -) -> Option<usize> { - debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE); - debug_assert!(start_ptr <= ptr); - debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE)); - - let chunk = _mm_loadu_si128(ptr as *const __m128i); - let mask = _mm_movemask_epi8(_mm_cmpeq_epi8(chunk, vn1)); - if mask != 0 { - Some(sub(ptr, start_ptr) + forward_pos(mask)) - } else { - None - } -} - -#[target_feature(enable = "sse2")] -unsafe fn forward_search2( - start_ptr: *const u8, - end_ptr: *const u8, - ptr: *const u8, - vn1: __m128i, - vn2: __m128i, -) -> Option<usize> { - debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE); - debug_assert!(start_ptr <= ptr); - debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE)); - - let chunk = _mm_loadu_si128(ptr as *const __m128i); - let eq1 = _mm_cmpeq_epi8(chunk, vn1); - let eq2 = _mm_cmpeq_epi8(chunk, vn2); - if _mm_movemask_epi8(_mm_or_si128(eq1, eq2)) != 0 { - let mask1 = _mm_movemask_epi8(eq1); - let mask2 = _mm_movemask_epi8(eq2); - Some(sub(ptr, start_ptr) + forward_pos2(mask1, mask2)) - } else { - None - } -} - -#[target_feature(enable = "sse2")] -pub unsafe fn forward_search3( - start_ptr: *const u8, - end_ptr: *const u8, - ptr: *const u8, - vn1: __m128i, - vn2: __m128i, - vn3: __m128i, -) -> Option<usize> { - debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE); - debug_assert!(start_ptr <= ptr); - debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE)); - - let chunk = _mm_loadu_si128(ptr as *const __m128i); - let eq1 = _mm_cmpeq_epi8(chunk, vn1); - let eq2 = _mm_cmpeq_epi8(chunk, vn2); - let eq3 = _mm_cmpeq_epi8(chunk, vn3); - let or = _mm_or_si128(eq1, eq2); - if _mm_movemask_epi8(_mm_or_si128(or, eq3)) != 0 { - let mask1 = _mm_movemask_epi8(eq1); - let mask2 = _mm_movemask_epi8(eq2); - let mask3 = _mm_movemask_epi8(eq3); - Some(sub(ptr, start_ptr) + forward_pos3(mask1, mask2, mask3)) - } else { - None - } -} - -#[target_feature(enable = "sse2")] -unsafe fn reverse_search1( - start_ptr: *const u8, - end_ptr: *const u8, - ptr: *const u8, - vn1: __m128i, -) -> Option<usize> { - debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE); - debug_assert!(start_ptr <= ptr); - debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE)); - - let chunk = _mm_loadu_si128(ptr as *const __m128i); - let mask = _mm_movemask_epi8(_mm_cmpeq_epi8(vn1, chunk)); - if mask != 0 { - Some(sub(ptr, start_ptr) + reverse_pos(mask)) - } else { - None - } -} - -#[target_feature(enable = "sse2")] -unsafe fn reverse_search2( - start_ptr: *const u8, - end_ptr: *const u8, - ptr: *const u8, - vn1: __m128i, - vn2: __m128i, -) -> Option<usize> { - debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE); - debug_assert!(start_ptr <= ptr); - debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE)); - - let chunk = _mm_loadu_si128(ptr as *const __m128i); - let eq1 = _mm_cmpeq_epi8(chunk, vn1); - let eq2 = _mm_cmpeq_epi8(chunk, vn2); - if _mm_movemask_epi8(_mm_or_si128(eq1, eq2)) != 0 { - let mask1 = _mm_movemask_epi8(eq1); - let mask2 = _mm_movemask_epi8(eq2); - Some(sub(ptr, start_ptr) + reverse_pos2(mask1, mask2)) - } else { - None - } -} - -#[target_feature(enable = "sse2")] -unsafe fn reverse_search3( - start_ptr: *const u8, - end_ptr: *const u8, - ptr: *const u8, - vn1: __m128i, - vn2: __m128i, - vn3: __m128i, -) -> Option<usize> { - debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE); - debug_assert!(start_ptr <= ptr); - debug_assert!(ptr <= 
end_ptr.sub(VECTOR_SIZE)); - - let chunk = _mm_loadu_si128(ptr as *const __m128i); - let eq1 = _mm_cmpeq_epi8(chunk, vn1); - let eq2 = _mm_cmpeq_epi8(chunk, vn2); - let eq3 = _mm_cmpeq_epi8(chunk, vn3); - let or = _mm_or_si128(eq1, eq2); - if _mm_movemask_epi8(_mm_or_si128(or, eq3)) != 0 { - let mask1 = _mm_movemask_epi8(eq1); - let mask2 = _mm_movemask_epi8(eq2); - let mask3 = _mm_movemask_epi8(eq3); - Some(sub(ptr, start_ptr) + reverse_pos3(mask1, mask2, mask3)) - } else { - None - } -} - -/// Compute the position of the first matching byte from the given mask. The -/// position returned is always in the range [0, 15]. -/// -/// The mask given is expected to be the result of _mm_movemask_epi8. -fn forward_pos(mask: i32) -> usize { - // We are dealing with little endian here, where the most significant byte - // is at a higher address. That means the least significant bit that is set - // corresponds to the position of our first matching byte. That position - // corresponds to the number of zeros after the least significant bit. - mask.trailing_zeros() as usize -} - -/// Compute the position of the first matching byte from the given masks. The -/// position returned is always in the range [0, 15]. Each mask corresponds to -/// the equality comparison of a single byte. -/// -/// The masks given are expected to be the result of _mm_movemask_epi8, where -/// at least one of the masks is non-zero (i.e., indicates a match). -fn forward_pos2(mask1: i32, mask2: i32) -> usize { - debug_assert!(mask1 != 0 || mask2 != 0); - - forward_pos(mask1 | mask2) -} - -/// Compute the position of the first matching byte from the given masks. The -/// position returned is always in the range [0, 15]. Each mask corresponds to -/// the equality comparison of a single byte. -/// -/// The masks given are expected to be the result of _mm_movemask_epi8, where -/// at least one of the masks is non-zero (i.e., indicates a match). -fn forward_pos3(mask1: i32, mask2: i32, mask3: i32) -> usize { - debug_assert!(mask1 != 0 || mask2 != 0 || mask3 != 0); - - forward_pos(mask1 | mask2 | mask3) -} - -/// Compute the position of the last matching byte from the given mask. The -/// position returned is always in the range [0, 15]. -/// -/// The mask given is expected to be the result of _mm_movemask_epi8. -fn reverse_pos(mask: i32) -> usize { - // We are dealing with little endian here, where the most significant byte - // is at a higher address. That means the most significant bit that is set - // corresponds to the position of our last matching byte. The position from - // the end of the mask is therefore the number of leading zeros in a 16 - // bit integer, and the position from the start of the mask is therefore - // 16 - (leading zeros) - 1. - VECTOR_SIZE - (mask as u16).leading_zeros() as usize - 1 -} - -/// Compute the position of the last matching byte from the given masks. The -/// position returned is always in the range [0, 15]. Each mask corresponds to -/// the equality comparison of a single byte. -/// -/// The masks given are expected to be the result of _mm_movemask_epi8, where -/// at least one of the masks is non-zero (i.e., indicates a match). -fn reverse_pos2(mask1: i32, mask2: i32) -> usize { - debug_assert!(mask1 != 0 || mask2 != 0); - - reverse_pos(mask1 | mask2) -} - -/// Compute the position of the last matching byte from the given masks. The -/// position returned is always in the range [0, 15]. Each mask corresponds to -/// the equality comparison of a single byte. 
-///
-/// The masks given are expected to be the result of _mm_movemask_epi8, where
-/// at least one of the masks is non-zero (i.e., indicates a match).
-fn reverse_pos3(mask1: i32, mask2: i32, mask3: i32) -> usize {
-    debug_assert!(mask1 != 0 || mask2 != 0 || mask3 != 0);
-
-    reverse_pos(mask1 | mask2 | mask3)
-}
-
-/// Subtract `b` from `a` and return the difference. `a` should be greater than
-/// or equal to `b`.
-fn sub(a: *const u8, b: *const u8) -> usize {
-    debug_assert!(a >= b);
-    (a as usize) - (b as usize)
-}
diff --git a/vendor/memchr/src/memchr/x86/sse42.rs b/vendor/memchr/src/memchr/x86/sse42.rs
deleted file mode 100644
index da38e50c2..000000000
--- a/vendor/memchr/src/memchr/x86/sse42.rs
+++ /dev/null
@@ -1,72 +0,0 @@
-// This code is unused. PCMPESTRI is gratuitously slow. I imagine it might
-// start winning with a hypothetical memchr4 (or greater). This technique might
-// also be good for exposing searches over ranges of bytes, but that departs
-// from the standard memchr API, so it's not clear whether we actually want
-// that or not.
-//
-// N.B. PCMPISTRI appears to be about twice as fast as PCMPESTRI, which is kind
-// of neat. Unfortunately, UTF-8 strings can contain NUL bytes, which means
-// I don't see a way of effectively using PCMPISTRI unless there's some fast
-// way to replace zero bytes with a byte that is not a needle byte.
-
-use core::{arch::x86_64::*, mem::size_of};
-
-use x86::sse2;
-
-const VECTOR_SIZE: usize = size_of::<__m128i>();
-const CONTROL_ANY: i32 = _SIDD_UBYTE_OPS
-    | _SIDD_CMP_EQUAL_ANY
-    | _SIDD_POSITIVE_POLARITY
-    | _SIDD_LEAST_SIGNIFICANT;
-
-#[target_feature(enable = "sse4.2")]
-pub unsafe fn memchr3(
-    n1: u8,
-    n2: u8,
-    n3: u8,
-    haystack: &[u8],
-) -> Option<usize> {
-    let vn1 = _mm_set1_epi8(n1 as i8);
-    let vn2 = _mm_set1_epi8(n2 as i8);
-    let vn3 = _mm_set1_epi8(n3 as i8);
-    let vn = _mm_setr_epi8(
-        n1 as i8, n2 as i8, n3 as i8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    );
-    let len = haystack.len();
-    let start_ptr = haystack.as_ptr();
-    let end_ptr = haystack[haystack.len()..].as_ptr();
-    let mut ptr = start_ptr;
-
-    if haystack.len() < VECTOR_SIZE {
-        while ptr < end_ptr {
-            if *ptr == n1 || *ptr == n2 || *ptr == n3 {
-                return Some(sub(ptr, start_ptr));
-            }
-            ptr = ptr.offset(1);
-        }
-        return None;
-    }
-    while ptr <= end_ptr.sub(VECTOR_SIZE) {
-        let chunk = _mm_loadu_si128(ptr as *const __m128i);
-        let res = _mm_cmpestri(vn, 3, chunk, 16, CONTROL_ANY);
-        if res < 16 {
-            return Some(sub(ptr, start_ptr) + res as usize);
-        }
-        ptr = ptr.add(VECTOR_SIZE);
-    }
-    if ptr < end_ptr {
-        debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE);
-        ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr));
-        debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE);
-
-        return sse2::forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3);
-    }
-    None
-}
-
-/// Subtract `b` from `a` and return the difference. `a` should be greater than
-/// or equal to `b`.
-fn sub(a: *const u8, b: *const u8) -> usize {
-    debug_assert!(a >= b);
-    (a as usize) - (b as usize)
-}
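The `unsafe_ifunc!` macro deleted from `x86/mod.rs` above caches the result of runtime CPU feature detection in an `AtomicPtr`, so only the very first call pays for `is_x86_feature_detected!` and the selection branch. Below is a minimal standalone sketch of that caching pattern, assuming x86_64 with `std`; the names `find_avx2`, `find_fallback`, and `find` are illustrative stand-ins rather than the crate's API, and the sketch skips the `target_feature`/`unsafe` handling the real macro performs.

```rust
use std::mem;
use std::sync::atomic::{AtomicPtr, Ordering};

/// The concrete signature being dispatched on.
type FindFn = fn(u8, &[u8]) -> Option<usize>;
/// Function pointers are stored type-erased as `*mut ()`.
type FnRaw = *mut ();

/// Stand-in for an AVX2-accelerated routine (illustrative only).
fn find_avx2(needle: u8, haystack: &[u8]) -> Option<usize> {
    haystack.iter().position(|&b| b == needle)
}

/// Stand-in for the portable fallback routine (illustrative only).
fn find_fallback(needle: u8, haystack: &[u8]) -> Option<usize> {
    haystack.iter().position(|&b| b == needle)
}

// Starts out pointing at `detect`; after the first call it points at the
// chosen implementation, so later calls skip feature detection entirely.
static FN: AtomicPtr<()> = AtomicPtr::new(detect as FnRaw);

fn detect(needle: u8, haystack: &[u8]) -> Option<usize> {
    let chosen: FindFn = if is_x86_feature_detected!("avx2") {
        find_avx2
    } else {
        find_fallback
    };
    FN.store(chosen as FnRaw, Ordering::Relaxed);
    chosen(needle, haystack)
}

pub fn find(needle: u8, haystack: &[u8]) -> Option<usize> {
    let fun = FN.load(Ordering::Relaxed);
    // SAFETY: `FN` only ever holds `detect`, `find_avx2` or `find_fallback`,
    // all of which have the `FindFn` signature.
    unsafe { mem::transmute::<FnRaw, FindFn>(fun)(needle, haystack) }
}
```

Relaxed ordering is sufficient here: if two threads race on the first call, each simply runs detection again and stores the same pointer.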
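The core of every SSE2 routine deleted above is the same three-step chunk test described in the long comment in `sse2.rs`: broadcast the needle with `_mm_set1_epi8`, compare 16 haystack bytes at once with `_mm_cmpeq_epi8`, and collapse the result to a 16-bit mask with `_mm_movemask_epi8`, from which `forward_pos` (trailing zeros) or `reverse_pos` (leading zeros) reads the match position. The sketch below shows one such chunk test in isolation; it is an illustration only and omits the alignment handling and the 64-byte unrolled loop of the deleted implementation.

```rust
use core::arch::x86_64::*;

const VECTOR_SIZE: usize = 16; // size_of::<__m128i>()

/// Index of the first matching byte encoded in a movemask result.
fn forward_pos(mask: i32) -> usize {
    // Bit i of the mask is set iff byte i of the chunk matched, so the
    // lowest set bit gives the earliest match.
    mask.trailing_zeros() as usize
}

/// Index of the last matching byte encoded in a movemask result.
fn reverse_pos(mask: i32) -> usize {
    // The highest set bit gives the latest match; counting leading zeros
    // of the 16-bit mask locates it from the top.
    VECTOR_SIZE - (mask as u16).leading_zeros() as usize - 1
}

/// Search one 16-byte chunk of `haystack` starting at `offset` for `needle`.
///
/// # Safety
///
/// `haystack[offset..]` must contain at least 16 bytes; SSE2 is always
/// available on x86_64.
#[target_feature(enable = "sse2")]
unsafe fn chunk_find(needle: u8, haystack: &[u8], offset: usize) -> Option<usize> {
    debug_assert!(haystack.len() >= offset + VECTOR_SIZE);
    let vn = _mm_set1_epi8(needle as i8);
    // Unaligned load of 16 haystack bytes.
    let chunk = _mm_loadu_si128(haystack.as_ptr().add(offset) as *const __m128i);
    // Each matching byte becomes 0xFF; movemask packs the high bits into
    // the low 16 bits of an i32.
    let mask = _mm_movemask_epi8(_mm_cmpeq_epi8(vn, chunk));
    if mask != 0 {
        Some(offset + forward_pos(mask))
    } else {
        None
    }
}

fn main() {
    let haystack = b"the quick brown fox jumps over the dog";
    // SAFETY: the haystack is longer than 16 bytes and SSE2 is guaranteed
    // on x86_64.
    let hit = unsafe { chunk_find(b'q', haystack, 0) };
    assert_eq!(hit, Some(4));
    // Bits 4 and 10 set: the last match is at byte 10.
    assert_eq!(reverse_pos(0b0000_0100_0001_0000), 10);
}
```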
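The alignment setup in those loops is a single line of pointer arithmetic: after one unaligned chunk test at the start, the pointer is bumped by `VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN)` so that every subsequent `_mm_load_si128` hits a 16-byte-aligned address. A small sketch of that calculation on plain addresses follows; `bump_to_alignment` is a hypothetical helper, not a function from the deleted code.

```rust
const VECTOR_SIZE: usize = 16;
const VECTOR_ALIGN: usize = VECTOR_SIZE - 1;

/// Number of bytes to advance from `addr` so the result is 16-byte aligned.
/// The bump is always in the range [1, 16], which is why the first chunk is
/// searched with an unaligned load before the bump: any bytes skipped here
/// have already been examined.
fn bump_to_alignment(addr: usize) -> usize {
    VECTOR_SIZE - (addr & VECTOR_ALIGN)
}

#[test]
fn bump_lands_on_aligned_addresses() {
    for addr in [0x1000usize, 0x1001, 0x100f, 0x1010] {
        let bumped = addr + bump_to_alignment(addr);
        assert_eq!(bumped % VECTOR_SIZE, 0);
        assert!(bumped > addr && bumped <= addr + VECTOR_SIZE);
    }
}
```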
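Both the SSE2 and SSE4.2 routines finish with the same tail trick: rather than a byte-at-a-time loop over the final `len % 16` bytes, the pointer is stepped back so that one last unaligned 16-byte load ends exactly at `end_ptr`, overlapping bytes that were already searched. The safe-Rust sketch below imitates the idea with slices and a hypothetical `find_windowed` helper; the deleted code does the same thing with raw pointers and vector loads.

```rust
/// Return the index of the first occurrence of `needle`, scanning the
/// haystack in fixed 16-byte windows with one overlapping window at the end.
fn find_windowed(needle: u8, haystack: &[u8]) -> Option<usize> {
    const WINDOW: usize = 16;
    // Small inputs don't have a full window; fall back to a plain scan.
    if haystack.len() < WINDOW {
        return haystack.iter().position(|&b| b == needle);
    }
    let mut i = 0;
    // Whole windows. In the real code each window is one SSE2 chunk test.
    while i + WINDOW <= haystack.len() {
        if let Some(p) = haystack[i..i + WINDOW].iter().position(|&b| b == needle) {
            return Some(i + p);
        }
        i += WINDOW;
    }
    // Tail: shift the last window back so it ends at haystack.len(). It
    // overlaps bytes already examined, which is harmless because any match
    // in the overlap would have been reported by an earlier window.
    if i < haystack.len() {
        let start = haystack.len() - WINDOW;
        return haystack[start..]
            .iter()
            .position(|&b| b == needle)
            .map(|p| start + p);
    }
    None
}

#[test]
fn tail_window_overlaps_correctly() {
    // 20 bytes of 'a' with the needle placed in the 4-byte tail, so it is
    // only reachable through the overlapping final window.
    let mut hay = [b'a'; 20];
    hay[18] = b'X';
    assert_eq!(find_windowed(b'X', &hay), Some(18));
    assert_eq!(find_windowed(b'Z', &hay), None);
}
```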