diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 19:33:14 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 19:33:14 +0000 |
commit | 36d22d82aa202bb199967e9512281e9a53db42c9 (patch) | |
tree | 105e8c98ddea1c1e4784a60a5a6410fa416be2de /third_party/rust/memchr/src/memmem/rarebytes.rs | |
parent | Initial commit. (diff) | |
download | firefox-esr-36d22d82aa202bb199967e9512281e9a53db42c9.tar.xz firefox-esr-36d22d82aa202bb199967e9512281e9a53db42c9.zip |
Adding upstream version 115.7.0esr.upstream/115.7.0esr
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/rust/memchr/src/memmem/rarebytes.rs')
-rw-r--r-- | third_party/rust/memchr/src/memmem/rarebytes.rs | 136 |
1 files changed, 136 insertions, 0 deletions
diff --git a/third_party/rust/memchr/src/memmem/rarebytes.rs b/third_party/rust/memchr/src/memmem/rarebytes.rs new file mode 100644 index 0000000000..fb33f68945 --- /dev/null +++ b/third_party/rust/memchr/src/memmem/rarebytes.rs @@ -0,0 +1,136 @@ +/// A heuristic frequency based detection of rare bytes for substring search. +/// +/// This detector attempts to pick out two bytes in a needle that are predicted +/// to occur least frequently. The purpose is to use these bytes to implement +/// fast candidate search using vectorized code. +/// +/// A set of offsets is only computed for needles of length 2 or greater. +/// Smaller needles should be special cased by the substring search algorithm +/// in use. (e.g., Use memchr for single byte needles.) +/// +/// Note that we use `u8` to represent the offsets of the rare bytes in a +/// needle to reduce space usage. This means that rare byte occurring after the +/// first 255 bytes in a needle will never be used. +#[derive(Clone, Copy, Debug, Default)] +pub(crate) struct RareNeedleBytes { + /// The leftmost offset of the rarest byte in the needle, according to + /// pre-computed frequency analysis. The "leftmost offset" means that + /// rare1i <= i for all i where needle[i] == needle[rare1i]. + rare1i: u8, + /// The leftmost offset of the second rarest byte in the needle, according + /// to pre-computed frequency analysis. The "leftmost offset" means that + /// rare2i <= i for all i where needle[i] == needle[rare2i]. + /// + /// The second rarest byte is used as a type of guard for quickly detecting + /// a mismatch if the first byte matches. This is a hedge against + /// pathological cases where the pre-computed frequency analysis may be + /// off. (But of course, does not prevent *all* pathological cases.) + /// + /// In general, rare1i != rare2i by construction, although there is no hard + /// requirement that they be different. However, since the case of a single + /// byte needle is handled specially by memchr itself, rare2i generally + /// always should be different from rare1i since it would otherwise be + /// ineffective as a guard. + rare2i: u8, +} + +impl RareNeedleBytes { + /// Create a new pair of rare needle bytes with the given offsets. This is + /// only used in tests for generating input data. + #[cfg(all(test, feature = "std"))] + pub(crate) fn new(rare1i: u8, rare2i: u8) -> RareNeedleBytes { + RareNeedleBytes { rare1i, rare2i } + } + + /// Detect the leftmost offsets of the two rarest bytes in the given + /// needle. + pub(crate) fn forward(needle: &[u8]) -> RareNeedleBytes { + if needle.len() <= 1 || needle.len() > core::u8::MAX as usize { + // For needles bigger than u8::MAX, our offsets aren't big enough. + // (We make our offsets small to reduce stack copying.) + // If you have a use case for it, please file an issue. In that + // case, we should probably just adjust the routine below to pick + // some rare bytes from the first 255 bytes of the needle. + // + // Also note that for needles of size 0 or 1, they are special + // cased in Two-Way. + // + // TODO: Benchmar this. + return RareNeedleBytes { rare1i: 0, rare2i: 0 }; + } + + // Find the rarest two bytes. We make them distinct by construction. + let (mut rare1, mut rare1i) = (needle[0], 0); + let (mut rare2, mut rare2i) = (needle[1], 1); + if rank(rare2) < rank(rare1) { + core::mem::swap(&mut rare1, &mut rare2); + core::mem::swap(&mut rare1i, &mut rare2i); + } + for (i, &b) in needle.iter().enumerate().skip(2) { + if rank(b) < rank(rare1) { + rare2 = rare1; + rare2i = rare1i; + rare1 = b; + rare1i = i as u8; + } else if b != rare1 && rank(b) < rank(rare2) { + rare2 = b; + rare2i = i as u8; + } + } + // While not strictly required, we really don't want these to be + // equivalent. If they were, it would reduce the effectiveness of + // candidate searching using these rare bytes by increasing the rate of + // false positives. + assert_ne!(rare1i, rare2i); + RareNeedleBytes { rare1i, rare2i } + } + + /// Return the rare bytes in the given needle in the forward direction. + /// The needle given must be the same one given to the RareNeedleBytes + /// constructor. + pub(crate) fn as_rare_bytes(&self, needle: &[u8]) -> (u8, u8) { + (needle[self.rare1i as usize], needle[self.rare2i as usize]) + } + + /// Return the rare offsets such that the first offset is always <= to the + /// second offset. This is useful when the caller doesn't care whether + /// rare1 is rarer than rare2, but just wants to ensure that they are + /// ordered with respect to one another. + #[cfg(memchr_runtime_simd)] + pub(crate) fn as_rare_ordered_usize(&self) -> (usize, usize) { + let (rare1i, rare2i) = self.as_rare_ordered_u8(); + (rare1i as usize, rare2i as usize) + } + + /// Like as_rare_ordered_usize, but returns the offsets as their native + /// u8 values. + #[cfg(memchr_runtime_simd)] + pub(crate) fn as_rare_ordered_u8(&self) -> (u8, u8) { + if self.rare1i <= self.rare2i { + (self.rare1i, self.rare2i) + } else { + (self.rare2i, self.rare1i) + } + } + + /// Return the rare offsets as usize values in the order in which they were + /// constructed. rare1, for example, is constructed as the "rarer" byte, + /// and thus, callers may want to treat it differently from rare2. + pub(crate) fn as_rare_usize(&self) -> (usize, usize) { + (self.rare1i as usize, self.rare2i as usize) + } + + /// Return the byte frequency rank of each byte. The higher the rank, the + /// more frequency the byte is predicted to be. The needle given must be + /// the same one given to the RareNeedleBytes constructor. + pub(crate) fn as_ranks(&self, needle: &[u8]) -> (usize, usize) { + let (b1, b2) = self.as_rare_bytes(needle); + (rank(b1), rank(b2)) + } +} + +/// Return the heuristical frequency rank of the given byte. A lower rank +/// means the byte is believed to occur less frequently. +fn rank(b: u8) -> usize { + crate::memmem::byte_frequencies::BYTE_FREQUENCIES[b as usize] as usize +} |