summaryrefslogtreecommitdiffstats
path: root/third_party/rust/memchr/src/memmem/rabinkarp.rs
diff options
context:
space:
mode:
Diffstat (limited to 'third_party/rust/memchr/src/memmem/rabinkarp.rs')
-rw-r--r--third_party/rust/memchr/src/memmem/rabinkarp.rs233
1 files changed, 233 insertions, 0 deletions
diff --git a/third_party/rust/memchr/src/memmem/rabinkarp.rs b/third_party/rust/memchr/src/memmem/rabinkarp.rs
new file mode 100644
index 0000000000..daa4015d5f
--- /dev/null
+++ b/third_party/rust/memchr/src/memmem/rabinkarp.rs
@@ -0,0 +1,233 @@
+/*
+This module implements the classical Rabin-Karp substring search algorithm,
+with no extra frills. While its use would seem to break our time complexity
+guarantee of O(m+n) (RK's time complexity is O(mn)), we are careful to only
+ever use RK on a constant subset of haystacks. The main point here is that
+RK has good latency properties for small needles/haystacks. It's very quick
+to compute a needle hash and zip through the haystack when compared to
+initializing Two-Way, for example. And this is especially useful for cases
+where the haystack is just too short for vector instructions to do much good.
+
+The hashing function used here is the same one recommended by ESMAJ.
+
+Another choice instead of Rabin-Karp would be Shift-Or. But its latency
+isn't quite as good since its preprocessing time is a bit more expensive
+(both in practice and in theory). However, perhaps Shift-Or has a place
+somewhere else for short patterns. I think the main problem is that it
+requires space proportional to the alphabet and the needle. If we, for
+example, supported needles up to length 16, then the total table size would be
+len(alphabet)*size_of::<u16>()==512 bytes. Which isn't exactly small, and it's
+probably bad to put that on the stack. So ideally, we'd throw it on the heap,
+but we'd really like to write as much code without using alloc/std as possible.
+But maybe it's worth the special casing. It's a TODO to benchmark.
+
+Wikipedia has a decent explanation, if a bit heavy on the theory:
+https://en.wikipedia.org/wiki/Rabin%E2%80%93Karp_algorithm
+
+But ESMAJ provides something a bit more concrete:
+http://www-igm.univ-mlv.fr/~lecroq/string/node5.html
+
+Finally, aho-corasick uses Rabin-Karp for multiple pattern match in some cases:
+https://github.com/BurntSushi/aho-corasick/blob/3852632f10587db0ff72ef29e88d58bf305a0946/src/packed/rabinkarp.rs
+*/
+
+/// Whether RK is believed to be very fast for the given needle/haystack.
+pub(crate) fn is_fast(haystack: &[u8], _needle: &[u8]) -> bool {
+ haystack.len() < 16
+}
+
+/// Search for the first occurrence of needle in haystack using Rabin-Karp.
+pub(crate) fn find(haystack: &[u8], needle: &[u8]) -> Option<usize> {
+ find_with(&NeedleHash::forward(needle), haystack, needle)
+}
+
+/// Search for the first occurrence of needle in haystack using Rabin-Karp with
+/// a pre-computed needle hash.
+pub(crate) fn find_with(
+ nhash: &NeedleHash,
+ mut haystack: &[u8],
+ needle: &[u8],
+) -> Option<usize> {
+ if haystack.len() < needle.len() {
+ return None;
+ }
+ let start = haystack.as_ptr() as usize;
+ let mut hash = Hash::from_bytes_fwd(&haystack[..needle.len()]);
+ // N.B. I've experimented with unrolling this loop, but couldn't realize
+ // any obvious gains.
+ loop {
+ if nhash.eq(hash) && is_prefix(haystack, needle) {
+ return Some(haystack.as_ptr() as usize - start);
+ }
+ if needle.len() >= haystack.len() {
+ return None;
+ }
+ hash.roll(&nhash, haystack[0], haystack[needle.len()]);
+ haystack = &haystack[1..];
+ }
+}
+
+/// Search for the last occurrence of needle in haystack using Rabin-Karp.
+pub(crate) fn rfind(haystack: &[u8], needle: &[u8]) -> Option<usize> {
+ rfind_with(&NeedleHash::reverse(needle), haystack, needle)
+}
+
+/// Search for the last occurrence of needle in haystack using Rabin-Karp with
+/// a pre-computed needle hash.
+pub(crate) fn rfind_with(
+ nhash: &NeedleHash,
+ mut haystack: &[u8],
+ needle: &[u8],
+) -> Option<usize> {
+ if haystack.len() < needle.len() {
+ return None;
+ }
+ let mut hash =
+ Hash::from_bytes_rev(&haystack[haystack.len() - needle.len()..]);
+ loop {
+ if nhash.eq(hash) && is_suffix(haystack, needle) {
+ return Some(haystack.len() - needle.len());
+ }
+ if needle.len() >= haystack.len() {
+ return None;
+ }
+ hash.roll(
+ &nhash,
+ haystack[haystack.len() - 1],
+ haystack[haystack.len() - needle.len() - 1],
+ );
+ haystack = &haystack[..haystack.len() - 1];
+ }
+}
+
+/// A hash derived from a needle.
+#[derive(Clone, Copy, Debug, Default)]
+pub(crate) struct NeedleHash {
+ /// The actual hash.
+ hash: Hash,
+ /// The factor needed to multiply a byte by in order to subtract it from
+ /// the hash. It is defined to be 2^(n-1) (using wrapping exponentiation),
+ /// where n is the length of the needle. This is how we "remove" a byte
+ /// from the hash once the hash window rolls past it.
+ hash_2pow: u32,
+}
+
+impl NeedleHash {
+ /// Create a new Rabin-Karp hash for the given needle for use in forward
+ /// searching.
+ pub(crate) fn forward(needle: &[u8]) -> NeedleHash {
+ let mut nh = NeedleHash { hash: Hash::new(), hash_2pow: 1 };
+ if needle.is_empty() {
+ return nh;
+ }
+ nh.hash.add(needle[0]);
+ for &b in needle.iter().skip(1) {
+ nh.hash.add(b);
+ nh.hash_2pow = nh.hash_2pow.wrapping_shl(1);
+ }
+ nh
+ }
+
+ /// Create a new Rabin-Karp hash for the given needle for use in reverse
+ /// searching.
+ pub(crate) fn reverse(needle: &[u8]) -> NeedleHash {
+ let mut nh = NeedleHash { hash: Hash::new(), hash_2pow: 1 };
+ if needle.is_empty() {
+ return nh;
+ }
+ nh.hash.add(needle[needle.len() - 1]);
+ for &b in needle.iter().rev().skip(1) {
+ nh.hash.add(b);
+ nh.hash_2pow = nh.hash_2pow.wrapping_shl(1);
+ }
+ nh
+ }
+
+ /// Return true if the hashes are equivalent.
+ fn eq(&self, hash: Hash) -> bool {
+ self.hash == hash
+ }
+}
+
+/// A Rabin-Karp hash. This might represent the hash of a needle, or the hash
+/// of a rolling window in the haystack.
+#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
+pub(crate) struct Hash(u32);
+
+impl Hash {
+ /// Create a new hash that represents the empty string.
+ pub(crate) fn new() -> Hash {
+ Hash(0)
+ }
+
+ /// Create a new hash from the bytes given for use in forward searches.
+ pub(crate) fn from_bytes_fwd(bytes: &[u8]) -> Hash {
+ let mut hash = Hash::new();
+ for &b in bytes {
+ hash.add(b);
+ }
+ hash
+ }
+
+ /// Create a new hash from the bytes given for use in reverse searches.
+ fn from_bytes_rev(bytes: &[u8]) -> Hash {
+ let mut hash = Hash::new();
+ for &b in bytes.iter().rev() {
+ hash.add(b);
+ }
+ hash
+ }
+
+ /// Add 'new' and remove 'old' from this hash. The given needle hash should
+ /// correspond to the hash computed for the needle being searched for.
+ ///
+ /// This is meant to be used when the rolling window of the haystack is
+ /// advanced.
+ fn roll(&mut self, nhash: &NeedleHash, old: u8, new: u8) {
+ self.del(nhash, old);
+ self.add(new);
+ }
+
+ /// Add a byte to this hash.
+ fn add(&mut self, byte: u8) {
+ self.0 = self.0.wrapping_shl(1).wrapping_add(byte as u32);
+ }
+
+ /// Remove a byte from this hash. The given needle hash should correspond
+ /// to the hash computed for the needle being searched for.
+ fn del(&mut self, nhash: &NeedleHash, byte: u8) {
+ let factor = nhash.hash_2pow;
+ self.0 = self.0.wrapping_sub((byte as u32).wrapping_mul(factor));
+ }
+}
+
+/// Returns true if the given needle is a prefix of the given haystack.
+///
+/// We forcefully don't inline the is_prefix call and hint at the compiler that
+/// it is unlikely to be called. This causes the inner rabinkarp loop above
+/// to be a bit tighter and leads to some performance improvement. See the
+/// memmem/krate/prebuilt/sliceslice-words/words benchmark.
+#[cold]
+#[inline(never)]
+fn is_prefix(haystack: &[u8], needle: &[u8]) -> bool {
+ crate::memmem::util::is_prefix(haystack, needle)
+}
+
+/// Returns true if the given needle is a suffix of the given haystack.
+///
+/// See is_prefix for why this is forcefully not inlined.
+#[cold]
+#[inline(never)]
+fn is_suffix(haystack: &[u8], needle: &[u8]) -> bool {
+ crate::memmem::util::is_suffix(haystack, needle)
+}
+
+#[cfg(test)]
+mod simpletests {
+ define_memmem_simple_tests!(super::find, super::rfind);
+}
+
+#[cfg(all(test, feature = "std", not(miri)))]
+mod proptests {
+ define_memmem_quickcheck_tests!(super::find, super::rfind);
+}