diff options
Diffstat (limited to 'third_party/rust/memchr/src/memmem/rabinkarp.rs')
-rw-r--r-- | third_party/rust/memchr/src/memmem/rabinkarp.rs | 233 |
1 files changed, 233 insertions, 0 deletions
diff --git a/third_party/rust/memchr/src/memmem/rabinkarp.rs b/third_party/rust/memchr/src/memmem/rabinkarp.rs new file mode 100644 index 0000000000..daa4015d5f --- /dev/null +++ b/third_party/rust/memchr/src/memmem/rabinkarp.rs @@ -0,0 +1,233 @@ +/* +This module implements the classical Rabin-Karp substring search algorithm, +with no extra frills. While its use would seem to break our time complexity +guarantee of O(m+n) (RK's time complexity is O(mn)), we are careful to only +ever use RK on a constant subset of haystacks. The main point here is that +RK has good latency properties for small needles/haystacks. It's very quick +to compute a needle hash and zip through the haystack when compared to +initializing Two-Way, for example. And this is especially useful for cases +where the haystack is just too short for vector instructions to do much good. + +The hashing function used here is the same one recommended by ESMAJ. + +Another choice instead of Rabin-Karp would be Shift-Or. But its latency +isn't quite as good since its preprocessing time is a bit more expensive +(both in practice and in theory). However, perhaps Shift-Or has a place +somewhere else for short patterns. I think the main problem is that it +requires space proportional to the alphabet and the needle. If we, for +example, supported needles up to length 16, then the total table size would be +len(alphabet)*size_of::<u16>()==512 bytes. Which isn't exactly small, and it's +probably bad to put that on the stack. So ideally, we'd throw it on the heap, +but we'd really like to write as much code without using alloc/std as possible. +But maybe it's worth the special casing. It's a TODO to benchmark. + +Wikipedia has a decent explanation, if a bit heavy on the theory: +https://en.wikipedia.org/wiki/Rabin%E2%80%93Karp_algorithm + +But ESMAJ provides something a bit more concrete: +http://www-igm.univ-mlv.fr/~lecroq/string/node5.html + +Finally, aho-corasick uses Rabin-Karp for multiple pattern match in some cases: +https://github.com/BurntSushi/aho-corasick/blob/3852632f10587db0ff72ef29e88d58bf305a0946/src/packed/rabinkarp.rs +*/ + +/// Whether RK is believed to be very fast for the given needle/haystack. +pub(crate) fn is_fast(haystack: &[u8], _needle: &[u8]) -> bool { + haystack.len() < 16 +} + +/// Search for the first occurrence of needle in haystack using Rabin-Karp. +pub(crate) fn find(haystack: &[u8], needle: &[u8]) -> Option<usize> { + find_with(&NeedleHash::forward(needle), haystack, needle) +} + +/// Search for the first occurrence of needle in haystack using Rabin-Karp with +/// a pre-computed needle hash. +pub(crate) fn find_with( + nhash: &NeedleHash, + mut haystack: &[u8], + needle: &[u8], +) -> Option<usize> { + if haystack.len() < needle.len() { + return None; + } + let start = haystack.as_ptr() as usize; + let mut hash = Hash::from_bytes_fwd(&haystack[..needle.len()]); + // N.B. I've experimented with unrolling this loop, but couldn't realize + // any obvious gains. + loop { + if nhash.eq(hash) && is_prefix(haystack, needle) { + return Some(haystack.as_ptr() as usize - start); + } + if needle.len() >= haystack.len() { + return None; + } + hash.roll(&nhash, haystack[0], haystack[needle.len()]); + haystack = &haystack[1..]; + } +} + +/// Search for the last occurrence of needle in haystack using Rabin-Karp. +pub(crate) fn rfind(haystack: &[u8], needle: &[u8]) -> Option<usize> { + rfind_with(&NeedleHash::reverse(needle), haystack, needle) +} + +/// Search for the last occurrence of needle in haystack using Rabin-Karp with +/// a pre-computed needle hash. +pub(crate) fn rfind_with( + nhash: &NeedleHash, + mut haystack: &[u8], + needle: &[u8], +) -> Option<usize> { + if haystack.len() < needle.len() { + return None; + } + let mut hash = + Hash::from_bytes_rev(&haystack[haystack.len() - needle.len()..]); + loop { + if nhash.eq(hash) && is_suffix(haystack, needle) { + return Some(haystack.len() - needle.len()); + } + if needle.len() >= haystack.len() { + return None; + } + hash.roll( + &nhash, + haystack[haystack.len() - 1], + haystack[haystack.len() - needle.len() - 1], + ); + haystack = &haystack[..haystack.len() - 1]; + } +} + +/// A hash derived from a needle. +#[derive(Clone, Copy, Debug, Default)] +pub(crate) struct NeedleHash { + /// The actual hash. + hash: Hash, + /// The factor needed to multiply a byte by in order to subtract it from + /// the hash. It is defined to be 2^(n-1) (using wrapping exponentiation), + /// where n is the length of the needle. This is how we "remove" a byte + /// from the hash once the hash window rolls past it. + hash_2pow: u32, +} + +impl NeedleHash { + /// Create a new Rabin-Karp hash for the given needle for use in forward + /// searching. + pub(crate) fn forward(needle: &[u8]) -> NeedleHash { + let mut nh = NeedleHash { hash: Hash::new(), hash_2pow: 1 }; + if needle.is_empty() { + return nh; + } + nh.hash.add(needle[0]); + for &b in needle.iter().skip(1) { + nh.hash.add(b); + nh.hash_2pow = nh.hash_2pow.wrapping_shl(1); + } + nh + } + + /// Create a new Rabin-Karp hash for the given needle for use in reverse + /// searching. + pub(crate) fn reverse(needle: &[u8]) -> NeedleHash { + let mut nh = NeedleHash { hash: Hash::new(), hash_2pow: 1 }; + if needle.is_empty() { + return nh; + } + nh.hash.add(needle[needle.len() - 1]); + for &b in needle.iter().rev().skip(1) { + nh.hash.add(b); + nh.hash_2pow = nh.hash_2pow.wrapping_shl(1); + } + nh + } + + /// Return true if the hashes are equivalent. + fn eq(&self, hash: Hash) -> bool { + self.hash == hash + } +} + +/// A Rabin-Karp hash. This might represent the hash of a needle, or the hash +/// of a rolling window in the haystack. +#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)] +pub(crate) struct Hash(u32); + +impl Hash { + /// Create a new hash that represents the empty string. + pub(crate) fn new() -> Hash { + Hash(0) + } + + /// Create a new hash from the bytes given for use in forward searches. + pub(crate) fn from_bytes_fwd(bytes: &[u8]) -> Hash { + let mut hash = Hash::new(); + for &b in bytes { + hash.add(b); + } + hash + } + + /// Create a new hash from the bytes given for use in reverse searches. + fn from_bytes_rev(bytes: &[u8]) -> Hash { + let mut hash = Hash::new(); + for &b in bytes.iter().rev() { + hash.add(b); + } + hash + } + + /// Add 'new' and remove 'old' from this hash. The given needle hash should + /// correspond to the hash computed for the needle being searched for. + /// + /// This is meant to be used when the rolling window of the haystack is + /// advanced. + fn roll(&mut self, nhash: &NeedleHash, old: u8, new: u8) { + self.del(nhash, old); + self.add(new); + } + + /// Add a byte to this hash. + fn add(&mut self, byte: u8) { + self.0 = self.0.wrapping_shl(1).wrapping_add(byte as u32); + } + + /// Remove a byte from this hash. The given needle hash should correspond + /// to the hash computed for the needle being searched for. + fn del(&mut self, nhash: &NeedleHash, byte: u8) { + let factor = nhash.hash_2pow; + self.0 = self.0.wrapping_sub((byte as u32).wrapping_mul(factor)); + } +} + +/// Returns true if the given needle is a prefix of the given haystack. +/// +/// We forcefully don't inline the is_prefix call and hint at the compiler that +/// it is unlikely to be called. This causes the inner rabinkarp loop above +/// to be a bit tighter and leads to some performance improvement. See the +/// memmem/krate/prebuilt/sliceslice-words/words benchmark. +#[cold] +#[inline(never)] +fn is_prefix(haystack: &[u8], needle: &[u8]) -> bool { + crate::memmem::util::is_prefix(haystack, needle) +} + +/// Returns true if the given needle is a suffix of the given haystack. +/// +/// See is_prefix for why this is forcefully not inlined. +#[cold] +#[inline(never)] +fn is_suffix(haystack: &[u8], needle: &[u8]) -> bool { + crate::memmem::util::is_suffix(haystack, needle) +} + +#[cfg(test)] +mod simpletests { + define_memmem_simple_tests!(super::find, super::rfind); +} + +#[cfg(all(test, feature = "std", not(miri)))] +mod proptests { + define_memmem_quickcheck_tests!(super::find, super::rfind); +} |