From 4547b622d8d29df964fa2914213088b148c498fc Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Wed, 17 Apr 2024 14:18:32 +0200 Subject: Merging upstream version 1.67.1+dfsg1. Signed-off-by: Daniel Baumann --- vendor/regex-automata/src/dfa/accel.rs | 507 +++ vendor/regex-automata/src/dfa/automaton.rs | 1903 ++++++++++ vendor/regex-automata/src/dfa/dense.rs | 4470 ++++++++++++++++++++++++ vendor/regex-automata/src/dfa/determinize.rs | 547 +++ vendor/regex-automata/src/dfa/error.rs | 162 + vendor/regex-automata/src/dfa/minimize.rs | 461 +++ vendor/regex-automata/src/dfa/mod.rs | 363 ++ vendor/regex-automata/src/dfa/regex.rs | 2146 ++++++++++++ vendor/regex-automata/src/dfa/search.rs | 493 +++ vendor/regex-automata/src/dfa/search_unsafe.rs | 321 ++ vendor/regex-automata/src/dfa/sparse.rs | 2283 ++++++++++++ vendor/regex-automata/src/dfa/special.rs | 477 +++ vendor/regex-automata/src/dfa/transducer.rs | 207 ++ 13 files changed, 14340 insertions(+) create mode 100644 vendor/regex-automata/src/dfa/accel.rs create mode 100644 vendor/regex-automata/src/dfa/automaton.rs create mode 100644 vendor/regex-automata/src/dfa/dense.rs create mode 100644 vendor/regex-automata/src/dfa/determinize.rs create mode 100644 vendor/regex-automata/src/dfa/error.rs create mode 100644 vendor/regex-automata/src/dfa/minimize.rs create mode 100644 vendor/regex-automata/src/dfa/mod.rs create mode 100644 vendor/regex-automata/src/dfa/regex.rs create mode 100644 vendor/regex-automata/src/dfa/search.rs create mode 100644 vendor/regex-automata/src/dfa/search_unsafe.rs create mode 100644 vendor/regex-automata/src/dfa/sparse.rs create mode 100644 vendor/regex-automata/src/dfa/special.rs create mode 100644 vendor/regex-automata/src/dfa/transducer.rs (limited to 'vendor/regex-automata/src/dfa') diff --git a/vendor/regex-automata/src/dfa/accel.rs b/vendor/regex-automata/src/dfa/accel.rs new file mode 100644 index 000000000..dbfeb7932 --- /dev/null +++ b/vendor/regex-automata/src/dfa/accel.rs @@ -0,0 +1,507 @@ +// This module defines some core types for dealing with accelerated DFA states. +// Briefly, a DFA state can be "accelerated" if all of its transitions except +// for a few loop back to itself. This directly implies that the only way out +// of such a state is if a byte corresponding to one of those non-loopback +// transitions is found. Such states are often found in simple repetitions in +// non-Unicode regexes. For example, consider '(?-u)[^a]+a'. We can look at its +// DFA with regex-cli: +// +// $ regex-cli debug dfa dense '(?-u)[^a]+a' -BbC +// dense::DFA( +// D 000000: +// Q 000001: +// *000002: +// A 000003: \x00-` => 3, a => 5, b-\xFF => 3 +// >000004: \x00-` => 3, a => 4, b-\xFF => 3 +// 000005: \x00-\xFF => 2, EOI => 2 +// ) +// +// In particular, state 3 is accelerated (shown via the 'A' indicator) since +// the only way to leave that state once entered is to see an 'a' byte. If +// there is a long run of non-'a' bytes, then using something like 'memchr' +// to find the next 'a' byte can be significantly faster than just using the +// standard byte-at-a-time state machine. +// +// Unfortunately, this optimization rarely applies when Unicode is enabled. +// For example, patterns like '[^a]' don't actually match any byte that isn't +// 'a', but rather, any UTF-8 encoding of a Unicode scalar value that isn't +// 'a'. This makes the state machine much more complex---far beyond a single +// state---and removes the ability to easily accelerate it. 
(Because if the +// machine sees a non-UTF-8 sequence, then the machine won't match through it.) +// +// In practice, we only consider accelerating states that have 3 or fewer +// non-loop transitions. At a certain point, you get diminishing returns, but +// also because that's what the memchr crate supports. The structures below +// hard-code this assumption and provide (de)serialization APIs for use inside +// a DFA. +// +// And finally, note that there is some trickery involved in making it very +// fast to not only check whether a state is accelerated at search time, but +// also to access the bytes to search for to implement the acceleration itself. +// dfa/special.rs provides more detail, but the short story is that all +// accelerated states appear contiguously in a DFA. This means we can represent +// the ID space of all accelerated DFA states with a single range. So given +// a state ID, we can determine whether it's accelerated via +// +// min_accel_id <= id <= max_accel_id +// +// And find its corresponding accelerator with: +// +// accels.get((id - min_accel_id) / dfa_stride) + +use core::convert::{TryFrom, TryInto}; + +#[cfg(feature = "alloc")] +use alloc::{vec, vec::Vec}; + +use crate::util::bytes::{self, DeserializeError, Endian, SerializeError}; + +/// The base type used to represent a collection of accelerators. +/// +/// While an `Accel` is represented as a fixed size array of bytes, a +/// *collection* of `Accel`s (called `Accels`) is represented internally as a +/// slice of u32. While it's a bit unnatural to do this and costs us a bit of +/// fairly low-risk not-safe code, it lets us remove the need for a second type +/// parameter in the definition of dense::DFA. (Which really wants everything +/// to be a slice of u32.) +type AccelTy = u32; + +/// The size of the unit of representation for accelerators. +/// +/// ACCEL_CAP *must* be a multiple of this size. +const ACCEL_TY_SIZE: usize = core::mem::size_of::(); + +/// The maximum length in bytes that a single Accel can be. This is distinct +/// from the capacity of an accelerator in that the length represents only the +/// bytes that should be read. +const ACCEL_LEN: usize = 4; + +/// The capacity of each accelerator, in bytes. We set this to 8 since it's a +/// multiple of 4 (our ID size) and because it gives us a little wiggle room +/// if we want to support more accel bytes in the future without a breaking +/// change. +/// +/// This MUST be a multiple of ACCEL_TY_SIZE. +const ACCEL_CAP: usize = 8; + +/// Search for between 1 and 3 needle bytes in the given haystack, starting the +/// search at the given position. If `needles` has a length other than 1-3, +/// then this panics. +#[inline(always)] +pub(crate) fn find_fwd( + needles: &[u8], + haystack: &[u8], + at: usize, +) -> Option { + let bs = needles; + let i = match needles.len() { + 1 => memchr::memchr(bs[0], &haystack[at..])?, + 2 => memchr::memchr2(bs[0], bs[1], &haystack[at..])?, + 3 => memchr::memchr3(bs[0], bs[1], bs[2], &haystack[at..])?, + 0 => panic!("cannot find with empty needles"), + n => panic!("invalid needles length: {}", n), + }; + Some(at + i) +} + +/// Search for between 1 and 3 needle bytes in the given haystack in reverse, +/// starting the search at the given position. If `needles` has a length other +/// than 1-3, then this panics. 
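+// Editorial sketch (not part of the upstream source): putting the module
+// comment's formulas together, a search loop that knows the accelerated
+// state ID range and the DFA's stride can test for acceleration and then
+// use `find_fwd` above to skip ahead, roughly like this:
+//
+//     if min_accel_id <= id && id <= max_accel_id {
+//         let needles = accels.needles((id - min_accel_id) / dfa_stride);
+//         match find_fwd(needles, haystack, at) {
+//             Some(next) => at = next,   // jump to the next candidate byte
+//             None => break,             // no accelerated byte remains
+//         }
+//     }
+//
+// Here `id`, `min_accel_id`, `max_accel_id`, `dfa_stride`, `accels`,
+// `haystack` and `at` are assumed to come from the surrounding DFA search
+// routine; the actual search implementations live in dfa/search.rs.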
+#[inline(always)] +pub(crate) fn find_rev( + needles: &[u8], + haystack: &[u8], + at: usize, +) -> Option { + let bs = needles; + match needles.len() { + 1 => memchr::memrchr(bs[0], &haystack[..at]), + 2 => memchr::memrchr2(bs[0], bs[1], &haystack[..at]), + 3 => memchr::memrchr3(bs[0], bs[1], bs[2], &haystack[..at]), + 0 => panic!("cannot find with empty needles"), + n => panic!("invalid needles length: {}", n), + } +} + +/// Represents the accelerators for all accelerated states in a dense DFA. +/// +/// The `A` type parameter represents the type of the underlying bytes. +/// Generally, this is either `&[AccelTy]` or `Vec`. +#[derive(Clone)] +pub(crate) struct Accels { + /// A length prefixed slice of contiguous accelerators. See the top comment + /// in this module for more details on how we can jump from a DFA's state + /// ID to an accelerator in this list. + /// + /// The first 4 bytes always correspond to the number of accelerators + /// that follow. + accels: A, +} + +#[cfg(feature = "alloc")] +impl Accels> { + /// Create an empty sequence of accelerators for a DFA. + pub fn empty() -> Accels> { + Accels { accels: vec![0] } + } + + /// Add an accelerator to this sequence. + /// + /// This adds to the accelerator to the end of the sequence and therefore + /// should be done in correspondence with its state in the DFA. + /// + /// This panics if this results in more accelerators than AccelTy::MAX. + pub fn add(&mut self, accel: Accel) { + self.accels.extend_from_slice(&accel.as_accel_tys()); + let len = self.len(); + self.set_len(len + 1); + } + + /// Set the number of accelerators in this sequence, which is encoded in + /// the first 4 bytes of the underlying bytes. + fn set_len(&mut self, new_len: usize) { + // The only way an accelerator gets added is if a state exists for + // it, and if a state exists, then its index is guaranteed to be + // representable by a AccelTy by virtue of the guarantees provided by + // StateID. + let new_len = AccelTy::try_from(new_len).unwrap(); + self.accels[0] = new_len; + } +} + +impl<'a> Accels<&'a [AccelTy]> { + /// Deserialize a sequence of accelerators from the given bytes. If there + /// was a problem deserializing, then an error is returned. + /// + /// This is guaranteed to run in constant time. This does not guarantee + /// that every accelerator in the returned collection is valid. Thus, + /// accessing one may panic, or not-safe code that relies on accelerators + /// being correct my result in UB. + /// + /// Callers may check the validity of every accelerator with the `validate` + /// method. + pub unsafe fn from_bytes_unchecked( + mut slice: &'a [u8], + ) -> Result<(Accels<&'a [AccelTy]>, usize), DeserializeError> { + let slice_start = slice.as_ptr() as usize; + + let (count, _) = + bytes::try_read_u32_as_usize(slice, "accelerators count")?; + // The accelerator count is part of the accel_tys slice that + // we deserialize. This is perhaps a bit idiosyncratic. It would + // probably be better to split out the count into a real field. 
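+        // Editorial note (illustrative, not from upstream): the serialized
+        // layout is one AccelTy (u32) holding the accelerator count, followed
+        // by ACCEL_CAP = 8 bytes per accelerator, i.e. ACCEL_CAP /
+        // ACCEL_TY_SIZE = 2 AccelTys each. That is why the arithmetic below
+        // expects `count * 2 + 1` AccelTys in total.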
+ + let accel_tys_count = bytes::add( + bytes::mul(count, 2, "total number of accelerator accel_tys")?, + 1, + "total number of accel_tys", + )?; + let accel_tys_len = bytes::mul( + ACCEL_TY_SIZE, + accel_tys_count, + "total number of bytes in accelerators", + )?; + bytes::check_slice_len(slice, accel_tys_len, "accelerators")?; + bytes::check_alignment::(slice)?; + let accel_tys = &slice[..accel_tys_len]; + slice = &slice[accel_tys_len..]; + // SAFETY: We've checked the length and alignment above, and since + // slice is just bytes, we can safely cast to a slice of &[AccelTy]. + #[allow(unused_unsafe)] + let accels = unsafe { + core::slice::from_raw_parts( + accel_tys.as_ptr() as *const AccelTy, + accel_tys_count, + ) + }; + Ok((Accels { accels }, slice.as_ptr() as usize - slice_start)) + } +} + +impl> Accels { + /// Return an owned version of the accelerators. + #[cfg(feature = "alloc")] + pub fn to_owned(&self) -> Accels> { + Accels { accels: self.accels.as_ref().to_vec() } + } + + /// Return a borrowed version of the accelerators. + pub fn as_ref(&self) -> Accels<&[AccelTy]> { + Accels { accels: self.accels.as_ref() } + } + + /// Return the bytes representing the serialization of the accelerators. + pub fn as_bytes(&self) -> &[u8] { + let accels = self.accels.as_ref(); + // SAFETY: This is safe because accels is a just a slice of AccelTy, + // and u8 always has a smaller alignment. + unsafe { + core::slice::from_raw_parts( + accels.as_ptr() as *const u8, + accels.len() * ACCEL_TY_SIZE, + ) + } + } + + /// Returns the memory usage, in bytes, of these accelerators. + /// + /// The memory usage is computed based on the number of bytes used to + /// represent all of the accelerators. + /// + /// This does **not** include the stack size used by this value. + pub fn memory_usage(&self) -> usize { + self.as_bytes().len() + } + + /// Return the bytes to search for corresponding to the accelerator in this + /// sequence at index `i`. If no such accelerator exists, then this panics. + /// + /// The significance of the index is that it should be in correspondence + /// with the index of the corresponding DFA. That is, accelerated DFA + /// states are stored contiguously in the DFA and have an ordering implied + /// by their respective state IDs. The state's index in that sequence + /// corresponds to the index of its corresponding accelerator. + #[inline(always)] + pub fn needles(&self, i: usize) -> &[u8] { + if i >= self.len() { + panic!("invalid accelerator index {}", i); + } + let bytes = self.as_bytes(); + let offset = ACCEL_TY_SIZE + i * ACCEL_CAP; + let len = bytes[offset] as usize; + &bytes[offset + 1..offset + 1 + len] + } + + /// Return the total number of accelerators in this sequence. + pub fn len(&self) -> usize { + // This should never panic since deserialization checks that the + // length can fit into a usize. + usize::try_from(self.accels.as_ref()[0]).unwrap() + } + + /// Return the accelerator in this sequence at index `i`. If no such + /// accelerator exists, then this returns None. + /// + /// See the docs for `needles` on the significance of the index. + fn get(&self, i: usize) -> Option { + if i >= self.len() { + return None; + } + let offset = ACCEL_TY_SIZE + i * ACCEL_CAP; + let accel = Accel::from_slice(&self.as_bytes()[offset..]) + .expect("Accels must contain valid accelerators"); + Some(accel) + } + + /// Returns an iterator of accelerators in this sequence. 
+ fn iter(&self) -> IterAccels<'_, A> { + IterAccels { accels: self, i: 0 } + } + + /// Writes these accelerators to the given byte buffer using the indicated + /// endianness. If the given buffer is too small, then an error is + /// returned. Upon success, the total number of bytes written is returned. + /// The number of bytes written is guaranteed to be a multiple of 8. + pub fn write_to( + &self, + dst: &mut [u8], + ) -> Result { + let nwrite = self.write_to_len(); + assert_eq!( + nwrite % ACCEL_TY_SIZE, + 0, + "expected accelerator bytes written to be a multiple of {}", + ACCEL_TY_SIZE, + ); + if dst.len() < nwrite { + return Err(SerializeError::buffer_too_small("accelerators")); + } + + // The number of accelerators can never exceed AccelTy::MAX. + E::write_u32(AccelTy::try_from(self.len()).unwrap(), dst); + // The actual accelerators are just raw bytes and thus their endianness + // is irrelevant. So we can copy them as bytes. + dst[ACCEL_TY_SIZE..nwrite] + .copy_from_slice(&self.as_bytes()[ACCEL_TY_SIZE..nwrite]); + Ok(nwrite) + } + + /// Validates that every accelerator in this collection can be successfully + /// deserialized as a valid accelerator. + pub fn validate(&self) -> Result<(), DeserializeError> { + for chunk in self.as_bytes()[ACCEL_TY_SIZE..].chunks(ACCEL_CAP) { + let _ = Accel::from_slice(chunk)?; + } + Ok(()) + } + + /// Returns the total number of bytes written by `write_to`. + pub fn write_to_len(&self) -> usize { + self.as_bytes().len() + } +} + +impl> core::fmt::Debug for Accels { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + write!(f, "Accels(")?; + let mut list = f.debug_list(); + for a in self.iter() { + list.entry(&a); + } + list.finish()?; + write!(f, ")") + } +} + +#[derive(Debug)] +struct IterAccels<'a, A: AsRef<[AccelTy]>> { + accels: &'a Accels, + i: usize, +} + +impl<'a, A: AsRef<[AccelTy]>> Iterator for IterAccels<'a, A> { + type Item = Accel; + + fn next(&mut self) -> Option { + let accel = self.accels.get(self.i)?; + self.i += 1; + Some(accel) + } +} + +/// Accel represents a structure for determining how to "accelerate" a DFA +/// state. +/// +/// Namely, it contains zero or more bytes that must be seen in order for the +/// DFA to leave the state it is associated with. In practice, the actual range +/// is 1 to 3 bytes. +/// +/// The purpose of acceleration is to identify states whose vast majority +/// of transitions are just loops back to the same state. For example, +/// in the regex `(?-u)^[^a]+b`, the corresponding DFA will have a state +/// (corresponding to `[^a]+`) where all transitions *except* for `a` and +/// `b` loop back to itself. Thus, this state can be "accelerated" by simply +/// looking for the next occurrence of either `a` or `b` instead of explicitly +/// following transitions. (In this case, `b` transitions to the next state +/// where as `a` would transition to the dead state.) +#[derive(Clone)] +pub(crate) struct Accel { + /// The first byte is the length. Subsequent bytes are the accelerated + /// bytes. + /// + /// Note that we make every accelerator 8 bytes as a slightly wasteful + /// way of making sure alignment is always correct for state ID sizes of + /// 1, 2, 4 and 8. This should be okay since accelerated states aren't + /// particularly common, especially when Unicode is enabled. + bytes: [u8; ACCEL_CAP], +} + +impl Accel { + /// Returns an empty accel, where no bytes are accelerated. 
+ #[cfg(feature = "alloc")] + pub fn new() -> Accel { + Accel { bytes: [0; ACCEL_CAP] } + } + + /// Returns a verified accelerator derived from the beginning of the given + /// slice. + /// + /// If the slice is not long enough or contains invalid bytes for an + /// accelerator, then this returns an error. + pub fn from_slice(mut slice: &[u8]) -> Result { + slice = &slice[..core::cmp::min(ACCEL_LEN, slice.len())]; + let bytes = slice + .try_into() + .map_err(|_| DeserializeError::buffer_too_small("accelerator"))?; + Accel::from_bytes(bytes) + } + + /// Returns a verified accelerator derived from raw bytes. + /// + /// If the given bytes are invalid, then this returns an error. + fn from_bytes(bytes: [u8; 4]) -> Result { + if bytes[0] as usize >= ACCEL_LEN { + return Err(DeserializeError::generic( + "accelerator bytes cannot have length more than 3", + )); + } + Ok(Accel::from_bytes_unchecked(bytes)) + } + + /// Returns an accelerator derived from raw bytes. + /// + /// This does not check whether the given bytes are valid. Invalid bytes + /// cannot sacrifice memory safety, but may result in panics or silent + /// logic bugs. + fn from_bytes_unchecked(bytes: [u8; 4]) -> Accel { + Accel { bytes: [bytes[0], bytes[1], bytes[2], bytes[3], 0, 0, 0, 0] } + } + + /// Attempts to add the given byte to this accelerator. If the accelerator + /// is already full then this returns false. Otherwise, returns true. + /// + /// If the given byte is already in this accelerator, then it panics. + #[cfg(feature = "alloc")] + pub fn add(&mut self, byte: u8) -> bool { + if self.len() >= 3 { + return false; + } + assert!( + !self.contains(byte), + "accelerator already contains {:?}", + crate::util::DebugByte(byte) + ); + self.bytes[self.len() + 1] = byte; + self.bytes[0] += 1; + true + } + + /// Return the number of bytes in this accelerator. + pub fn len(&self) -> usize { + self.bytes[0] as usize + } + + /// Returns true if and only if there are no bytes in this accelerator. + #[cfg(feature = "alloc")] + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Returns the slice of bytes to accelerate. + /// + /// If this accelerator is empty, then this returns an empty slice. + fn needles(&self) -> &[u8] { + &self.bytes[1..1 + self.len()] + } + + /// Returns true if and only if this accelerator will accelerate the given + /// byte. + #[cfg(feature = "alloc")] + fn contains(&self, byte: u8) -> bool { + self.needles().iter().position(|&b| b == byte).is_some() + } + + /// Returns the accelerator bytes as an array of AccelTys. + #[cfg(feature = "alloc")] + fn as_accel_tys(&self) -> [AccelTy; 2] { + assert_eq!(ACCEL_CAP, 8); + // These unwraps are OK since ACCEL_CAP is set to 8. 
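+        // Editorial note (illustrative, not from upstream): native-endian
+        // conversion is fine here because an accelerator is plain byte data
+        // (a length byte plus needle bytes); `Accels::write_to` copies these
+        // bytes verbatim, so no endianness fixup is ever needed for them.
+        // Only the leading accelerator count is written with an explicit
+        // endianness.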
+ let first = + AccelTy::from_ne_bytes(self.bytes[0..4].try_into().unwrap()); + let second = + AccelTy::from_ne_bytes(self.bytes[4..8].try_into().unwrap()); + [first, second] + } +} + +impl core::fmt::Debug for Accel { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + write!(f, "Accel(")?; + let mut set = f.debug_set(); + for &b in self.needles() { + set.entry(&crate::util::DebugByte(b)); + } + set.finish()?; + write!(f, ")") + } +} diff --git a/vendor/regex-automata/src/dfa/automaton.rs b/vendor/regex-automata/src/dfa/automaton.rs new file mode 100644 index 000000000..08bd6722a --- /dev/null +++ b/vendor/regex-automata/src/dfa/automaton.rs @@ -0,0 +1,1903 @@ +use crate::{ + dfa::search, + util::{ + id::{PatternID, StateID}, + matchtypes::{HalfMatch, MatchError}, + prefilter, + }, +}; + +/// A trait describing the interface of a deterministic finite automaton (DFA). +/// +/// The complexity of this trait probably means that it's unlikely for others +/// to implement it. The primary purpose of the trait is to provide for a way +/// of abstracting over different types of DFAs. In this crate, that means +/// dense DFAs and sparse DFAs. (Dense DFAs are fast but memory hungry, where +/// as sparse DFAs are slower but come with a smaller memory footprint. But +/// they otherwise provide exactly equivalent expressive power.) For example, a +/// [`dfa::regex::Regex`](crate::dfa::regex::Regex) is generic over this trait. +/// +/// Normally, a DFA's execution model is very simple. You might have a single +/// start state, zero or more final or "match" states and a function that +/// transitions from one state to the next given the next byte of input. +/// Unfortunately, the interface described by this trait is significantly +/// more complicated than this. The complexity has a number of different +/// reasons, mostly motivated by performance, functionality or space savings: +/// +/// * A DFA can search for multiple patterns simultaneously. This +/// means extra information is returned when a match occurs. Namely, +/// a match is not just an offset, but an offset plus a pattern ID. +/// [`Automaton::pattern_count`] returns the number of patterns compiled into +/// the DFA, [`Automaton::match_count`] returns the total number of patterns +/// that match in a particular state and [`Automaton::match_pattern`] permits +/// iterating over the patterns that match in a particular state. +/// * A DFA can have multiple start states, and the choice of which start +/// state to use depends on the content of the string being searched and +/// position of the search, as well as whether the search is an anchored +/// search for a specific pattern in the DFA. Moreover, computing the start +/// state also depends on whether you're doing a forward or a reverse search. +/// [`Automaton::start_state_forward`] and [`Automaton::start_state_reverse`] +/// are used to compute the start state for forward and reverse searches, +/// respectively. +/// * All matches are delayed by one byte to support things like `$` and `\b` +/// at the end of a pattern. Therefore, every use of a DFA is required to use +/// [`Automaton::next_eoi_state`] +/// at the end of the search to compute the final transition. +/// * For optimization reasons, some states are treated specially. Every +/// state is either special or not, which can be determined via the +/// [`Automaton::is_special_state`] method. If it's special, then the state +/// must be at least one of a few possible types of states. 
(Note that some +/// types can overlap, for example, a match state can also be an accel state. +/// But some types can't. If a state is a dead state, then it can never be any +/// other type of state.) Those types are: +/// * A dead state. A dead state means the DFA will never enter a match +/// state. This can be queried via the [`Automaton::is_dead_state`] method. +/// * A quit state. A quit state occurs if the DFA had to stop the search +/// prematurely for some reason. This can be queried via the +/// [`Automaton::is_quit_state`] method. +/// * A match state. A match state occurs when a match is found. When a DFA +/// enters a match state, the search may stop immediately (when looking +/// for the earliest match), or it may continue to find the leftmost-first +/// match. This can be queried via the [`Automaton::is_match_state`] +/// method. +/// * A start state. A start state is where a search begins. For every +/// search, there is exactly one start state that is used, however, a +/// DFA may contain many start states. When the search is in a start +/// state, it may use a prefilter to quickly skip to candidate matches +/// without executing the DFA on every byte. This can be queried via the +/// [`Automaton::is_start_state`] method. +/// * An accel state. An accel state is a state that is accelerated. +/// That is, it is a state where _most_ of its transitions loop back to +/// itself and only a small number of transitions lead to other states. +/// This kind of state is said to be accelerated because a search routine +/// can quickly look for the bytes leading out of the state instead of +/// continuing to execute the DFA on each byte. This can be queried via the +/// [`Automaton::is_accel_state`] method. And the bytes that lead out of +/// the state can be queried via the [`Automaton::accelerator`] method. +/// +/// There are a number of provided methods on this trait that implement +/// efficient searching (for forwards and backwards) with a DFA using all of +/// the above features of this trait. In particular, given the complexity of +/// all these features, implementing a search routine in this trait is not +/// straight forward. If you need to do this for specialized reasons, then +/// it's recommended to look at the source of this crate. It is intentionally +/// well commented to help with this. With that said, it is possible to +/// somewhat simplify the search routine. For example, handling accelerated +/// states is strictly optional, since it is always correct to assume that +/// `Automaton::is_accel_state` returns false. However, one complex part of +/// writing a search routine using this trait is handling the 1-byte delay of a +/// match. That is not optional. +/// +/// # Safety +/// +/// This trait is unsafe to implement because DFA searching may rely on the +/// correctness of the implementation for memory safety. For example, DFA +/// searching may use explicit bounds check elision, which will in turn rely +/// on the correctness of every function that returns a state ID. +/// +/// When implementing this trait, one must uphold the documented correctness +/// guarantees. Otherwise, undefined behavior may occur. +pub unsafe trait Automaton { + /// Transitions from the current state to the next state, given the next + /// byte of input. + /// + /// Implementations must guarantee that the returned ID is always a valid + /// ID when `current` refers to a valid ID. Moreover, the transition + /// function must be defined for all possible values of `input`. 
+ /// + /// # Panics + /// + /// If the given ID does not refer to a valid state, then this routine + /// may panic but it also may not panic and instead return an invalid ID. + /// However, if the caller provides an invalid ID then this must never + /// sacrifice memory safety. + /// + /// # Example + /// + /// This shows a simplistic example for walking a DFA for a given haystack + /// by using the `next_state` method. + /// + /// ``` + /// use regex_automata::dfa::{Automaton, dense}; + /// + /// let dfa = dense::DFA::new(r"[a-z]+r")?; + /// let haystack = "bar".as_bytes(); + /// + /// // The start state is determined by inspecting the position and the + /// // initial bytes of the haystack. + /// let mut state = dfa.start_state_forward( + /// None, haystack, 0, haystack.len(), + /// ); + /// // Walk all the bytes in the haystack. + /// for &b in haystack { + /// state = dfa.next_state(state, b); + /// } + /// // Matches are always delayed by 1 byte, so we must explicitly walk the + /// // special "EOI" transition at the end of the search. + /// state = dfa.next_eoi_state(state); + /// assert!(dfa.is_match_state(state)); + /// + /// # Ok::<(), Box>(()) + /// ``` + fn next_state(&self, current: StateID, input: u8) -> StateID; + + /// Transitions from the current state to the next state, given the next + /// byte of input. + /// + /// Unlike [`Automaton::next_state`], implementations may implement this + /// more efficiently by assuming that the `current` state ID is valid. + /// Typically, this manifests by eliding bounds checks. + /// + /// # Safety + /// + /// Callers of this method must guarantee that `current` refers to a valid + /// state ID. If `current` is not a valid state ID for this automaton, then + /// calling this routine may result in undefined behavior. + /// + /// If `current` is valid, then implementations must guarantee that the ID + /// returned is valid for all possible values of `input`. + unsafe fn next_state_unchecked( + &self, + current: StateID, + input: u8, + ) -> StateID; + + /// Transitions from the current state to the next state for the special + /// EOI symbol. + /// + /// Implementations must guarantee that the returned ID is always a valid + /// ID when `current` refers to a valid ID. + /// + /// This routine must be called at the end of every search in a correct + /// implementation of search. Namely, DFAs in this crate delay matches + /// by one byte in order to support look-around operators. Thus, after + /// reaching the end of a haystack, a search implementation must follow one + /// last EOI transition. + /// + /// It is best to think of EOI as an additional symbol in the alphabet of + /// a DFA that is distinct from every other symbol. That is, the alphabet + /// of DFAs in this crate has a logical size of 257 instead of 256, where + /// 256 corresponds to every possible inhabitant of `u8`. (In practice, the + /// physical alphabet size may be smaller because of alphabet compression + /// via equivalence classes, but EOI is always represented somehow in the + /// alphabet.) + /// + /// # Panics + /// + /// If the given ID does not refer to a valid state, then this routine + /// may panic but it also may not panic and instead return an invalid ID. + /// However, if the caller provides an invalid ID then this must never + /// sacrifice memory safety. + /// + /// # Example + /// + /// This shows a simplistic example for walking a DFA for a given haystack, + /// and then finishing the search with the final EOI transition. 
+ /// + /// ``` + /// use regex_automata::dfa::{Automaton, dense}; + /// + /// let dfa = dense::DFA::new(r"[a-z]+r")?; + /// let haystack = "bar".as_bytes(); + /// + /// // The start state is determined by inspecting the position and the + /// // initial bytes of the haystack. + /// let mut state = dfa.start_state_forward( + /// None, haystack, 0, haystack.len(), + /// ); + /// // Walk all the bytes in the haystack. + /// for &b in haystack { + /// state = dfa.next_state(state, b); + /// } + /// // Matches are always delayed by 1 byte, so we must explicitly walk + /// // the special "EOI" transition at the end of the search. Without this + /// // final transition, the assert below will fail since the DFA will not + /// // have entered a match state yet! + /// state = dfa.next_eoi_state(state); + /// assert!(dfa.is_match_state(state)); + /// + /// # Ok::<(), Box>(()) + /// ``` + fn next_eoi_state(&self, current: StateID) -> StateID; + + /// Return the ID of the start state for this DFA when executing a forward + /// search. + /// + /// Unlike typical DFA implementations, the start state for DFAs in this + /// crate is dependent on a few different factors: + /// + /// * The pattern ID, if present. When the underlying DFA has been compiled + /// with multiple patterns _and_ the DFA has been configured to compile + /// an anchored start state for each pattern, then a pattern ID may be + /// specified to execute an anchored search for that specific pattern. + /// If `pattern_id` is invalid or if the DFA doesn't have start states + /// compiled for each pattern, then implementations must panic. DFAs in + /// this crate can be configured to compile start states for each pattern + /// via + /// [`dense::Config::starts_for_each_pattern`](crate::dfa::dense::Config::starts_for_each_pattern). + /// * When `start > 0`, the byte at index `start - 1` may influence the + /// start state if the regex uses `^` or `\b`. + /// * Similarly, when `start == 0`, it may influence the start state when + /// the regex uses `^` or `\A`. + /// * Currently, `end` is unused. + /// * Whether the search is a forward or reverse search. This routine can + /// only be used for forward searches. + /// + /// # Panics + /// + /// Implementations must panic if `start..end` is not a valid sub-slice of + /// `bytes`. Implementations must also panic if `pattern_id` is non-None + /// and does not refer to a valid pattern, or if the DFA was not compiled + /// with anchored start states for each pattern. + fn start_state_forward( + &self, + pattern_id: Option, + bytes: &[u8], + start: usize, + end: usize, + ) -> StateID; + + /// Return the ID of the start state for this DFA when executing a reverse + /// search. + /// + /// Unlike typical DFA implementations, the start state for DFAs in this + /// crate is dependent on a few different factors: + /// + /// * The pattern ID, if present. When the underlying DFA has been compiled + /// with multiple patterns _and_ the DFA has been configured to compile an + /// anchored start state for each pattern, then a pattern ID may be + /// specified to execute an anchored search for that specific pattern. If + /// `pattern_id` is invalid or if the DFA doesn't have start states compiled + /// for each pattern, then implementations must panic. DFAs in this crate + /// can be configured to compile start states for each pattern via + /// [`dense::Config::starts_for_each_pattern`](crate::dfa::dense::Config::starts_for_each_pattern). 
+ /// * When `end < bytes.len()`, the byte at index `end` may influence the + /// start state if the regex uses `$` or `\b`. + /// * Similarly, when `end == bytes.len()`, it may influence the start + /// state when the regex uses `$` or `\z`. + /// * Currently, `start` is unused. + /// * Whether the search is a forward or reverse search. This routine can + /// only be used for reverse searches. + /// + /// # Panics + /// + /// Implementations must panic if `start..end` is not a valid sub-slice of + /// `bytes`. Implementations must also panic if `pattern_id` is non-None + /// and does not refer to a valid pattern, or if the DFA was not compiled + /// with anchored start states for each pattern. + fn start_state_reverse( + &self, + pattern_id: Option, + bytes: &[u8], + start: usize, + end: usize, + ) -> StateID; + + /// Returns true if and only if the given identifier corresponds to a + /// "special" state. A special state is one or more of the following: + /// a dead state, a quit state, a match state, a start state or an + /// accelerated state. + /// + /// A correct implementation _may_ always return false for states that + /// are either start states or accelerated states, since that information + /// is only intended to be used for optimization purposes. Correct + /// implementations must return true if the state is a dead, quit or match + /// state. This is because search routines using this trait must be able + /// to rely on `is_special_state` as an indicator that a state may need + /// special treatment. (For example, when a search routine sees a dead + /// state, it must terminate.) + /// + /// This routine permits search implementations to use a single branch to + /// check whether a state needs special attention before executing the next + /// transition. The example below shows how to do this. + /// + /// # Example + /// + /// This example shows how `is_special_state` can be used to implement a + /// correct search routine with minimal branching. In particular, this + /// search routine implements "leftmost" matching, which means that it + /// doesn't immediately stop once a match is found. Instead, it continues + /// until it reaches a dead state. + /// + /// ``` + /// use regex_automata::{ + /// dfa::{Automaton, dense}, + /// HalfMatch, MatchError, PatternID, + /// }; + /// + /// fn find_leftmost_first( + /// dfa: &A, + /// haystack: &[u8], + /// ) -> Result, MatchError> { + /// // The start state is determined by inspecting the position and the + /// // initial bytes of the haystack. Note that start states can never + /// // be match states (since DFAs in this crate delay matches by 1 + /// // byte), so we don't need to check if the start state is a match. + /// let mut state = dfa.start_state_forward( + /// None, haystack, 0, haystack.len(), + /// ); + /// let mut last_match = None; + /// // Walk all the bytes in the haystack. We can quit early if we see + /// // a dead or a quit state. The former means the automaton will + /// // never transition to any other state. The latter means that the + /// // automaton entered a condition in which its search failed. 
+ /// for (i, &b) in haystack.iter().enumerate() { + /// state = dfa.next_state(state, b); + /// if dfa.is_special_state(state) { + /// if dfa.is_match_state(state) { + /// last_match = Some(HalfMatch::new( + /// dfa.match_pattern(state, 0), + /// i, + /// )); + /// } else if dfa.is_dead_state(state) { + /// return Ok(last_match); + /// } else if dfa.is_quit_state(state) { + /// // It is possible to enter into a quit state after + /// // observing a match has occurred. In that case, we + /// // should return the match instead of an error. + /// if last_match.is_some() { + /// return Ok(last_match); + /// } + /// return Err(MatchError::Quit { byte: b, offset: i }); + /// } + /// // Implementors may also want to check for start or accel + /// // states and handle them differently for performance + /// // reasons. But it is not necessary for correctness. + /// } + /// } + /// // Matches are always delayed by 1 byte, so we must explicitly walk + /// // the special "EOI" transition at the end of the search. + /// state = dfa.next_eoi_state(state); + /// if dfa.is_match_state(state) { + /// last_match = Some(HalfMatch::new( + /// dfa.match_pattern(state, 0), + /// haystack.len(), + /// )); + /// } + /// Ok(last_match) + /// } + /// + /// // We use a greedy '+' operator to show how the search doesn't just + /// // stop once a match is detected. It continues extending the match. + /// // Using '[a-z]+?' would also work as expected and stop the search + /// // early. Greediness is built into the automaton. + /// let dfa = dense::DFA::new(r"[a-z]+")?; + /// let haystack = "123 foobar 4567".as_bytes(); + /// let mat = find_leftmost_first(&dfa, haystack)?.unwrap(); + /// assert_eq!(mat.pattern().as_usize(), 0); + /// assert_eq!(mat.offset(), 10); + /// + /// // Here's another example that tests our handling of the special EOI + /// // transition. This will fail to find a match if we don't call + /// // 'next_eoi_state' at the end of the search since the match isn't + /// // found until the final byte in the haystack. + /// let dfa = dense::DFA::new(r"[0-9]{4}")?; + /// let haystack = "123 foobar 4567".as_bytes(); + /// let mat = find_leftmost_first(&dfa, haystack)?.unwrap(); + /// assert_eq!(mat.pattern().as_usize(), 0); + /// assert_eq!(mat.offset(), 15); + /// + /// // And note that our search implementation above automatically works + /// // with multi-DFAs. Namely, `dfa.match_pattern(match_state, 0)` selects + /// // the appropriate pattern ID for us. + /// let dfa = dense::DFA::new_many(&[r"[a-z]+", r"[0-9]+"])?; + /// let haystack = "123 foobar 4567".as_bytes(); + /// let mat = find_leftmost_first(&dfa, haystack)?.unwrap(); + /// assert_eq!(mat.pattern().as_usize(), 1); + /// assert_eq!(mat.offset(), 3); + /// let mat = find_leftmost_first(&dfa, &haystack[3..])?.unwrap(); + /// assert_eq!(mat.pattern().as_usize(), 0); + /// assert_eq!(mat.offset(), 7); + /// let mat = find_leftmost_first(&dfa, &haystack[10..])?.unwrap(); + /// assert_eq!(mat.pattern().as_usize(), 1); + /// assert_eq!(mat.offset(), 5); + /// + /// # Ok::<(), Box>(()) + /// ``` + fn is_special_state(&self, id: StateID) -> bool; + + /// Returns true if and only if the given identifier corresponds to a dead + /// state. When a DFA enters a dead state, it is impossible to leave. That + /// is, every transition on a dead state by definition leads back to the + /// same dead state. + /// + /// In practice, the dead state always corresponds to the identifier `0`. + /// Moreover, in practice, there is only one dead state. 
+ /// + /// The existence of a dead state is not strictly required in the classical + /// model of finite state machines, where one generally only cares about + /// the question of whether an input sequence matches or not. Dead states + /// are not needed to answer that question, since one can immediately quit + /// as soon as one enters a final or "match" state. However, we don't just + /// care about matches but also care about the location of matches, and + /// more specifically, care about semantics like "greedy" matching. + /// + /// For example, given the pattern `a+` and the input `aaaz`, the dead + /// state won't be entered until the state machine reaches `z` in the + /// input, at which point, the search routine can quit. But without the + /// dead state, the search routine wouldn't know when to quit. In a + /// classical representation, the search routine would stop after seeing + /// the first `a` (which is when the search would enter a match state). But + /// this wouldn't implement "greedy" matching where `a+` matches as many + /// `a`'s as possible. + /// + /// # Example + /// + /// See the example for [`Automaton::is_special_state`] for how to use this + /// method correctly. + fn is_dead_state(&self, id: StateID) -> bool; + + /// Returns true if and only if the given identifier corresponds to a quit + /// state. A quit state is like a dead state (it has no transitions other + /// than to itself), except it indicates that the DFA failed to complete + /// the search. When this occurs, callers can neither accept or reject that + /// a match occurred. + /// + /// In practice, the quit state always corresponds to the state immediately + /// following the dead state. (Which is not usually represented by `1`, + /// since state identifiers are pre-multiplied by the state machine's + /// alphabet stride, and the alphabet stride varies between DFAs.) + /// + /// By default, state machines created by this crate will never enter a + /// quit state. Since entering a quit state is the only way for a DFA + /// in this crate to fail at search time, it follows that the default + /// configuration can never produce a match error. Nevertheless, handling + /// quit states is necessary to correctly support all configurations in + /// this crate. + /// + /// The typical way in which a quit state can occur is when heuristic + /// support for Unicode word boundaries is enabled via the + /// [`dense::Config::unicode_word_boundary`](crate::dfa::dense::Config::unicode_word_boundary) + /// option. But other options, like the lower level + /// [`dense::Config::quit`](crate::dfa::dense::Config::quit) + /// configuration, can also result in a quit state being entered. The + /// purpose of the quit state is to provide a way to execute a fast DFA + /// in common cases while delegating to slower routines when the DFA quits. + /// + /// The default search implementations provided by this crate will return + /// a [`MatchError::Quit`](crate::MatchError::Quit) error when a quit state + /// is entered. + /// + /// # Example + /// + /// See the example for [`Automaton::is_special_state`] for how to use this + /// method correctly. + fn is_quit_state(&self, id: StateID) -> bool; + + /// Returns true if and only if the given identifier corresponds to a + /// match state. A match state is also referred to as a "final" state and + /// indicates that a match has been found. 
+ /// + /// If all you care about is whether a particular pattern matches in the + /// input sequence, then a search routine can quit early as soon as the + /// machine enters a match state. However, if you're looking for the + /// standard "leftmost-first" match location, then search _must_ continue + /// until either the end of the input or until the machine enters a dead + /// state. (Since either condition implies that no other useful work can + /// be done.) Namely, when looking for the location of a match, then + /// search implementations should record the most recent location in + /// which a match state was entered, but otherwise continue executing the + /// search as normal. (The search may even leave the match state.) Once + /// the termination condition is reached, the most recently recorded match + /// location should be returned. + /// + /// Finally, one additional power given to match states in this crate + /// is that they are always associated with a specific pattern in order + /// to support multi-DFAs. See [`Automaton::match_pattern`] for more + /// details and an example for how to query the pattern associated with a + /// particular match state. + /// + /// # Example + /// + /// See the example for [`Automaton::is_special_state`] for how to use this + /// method correctly. + fn is_match_state(&self, id: StateID) -> bool; + + /// Returns true if and only if the given identifier corresponds to a + /// start state. A start state is a state in which a DFA begins a search. + /// All searches begin in a start state. Moreover, since all matches are + /// delayed by one byte, a start state can never be a match state. + /// + /// The main role of a start state is, as mentioned, to be a starting + /// point for a DFA. This starting point is determined via one of + /// [`Automaton::start_state_forward`] or + /// [`Automaton::start_state_reverse`], depending on whether one is doing + /// a forward or a reverse search, respectively. + /// + /// A secondary use of start states is for prefix acceleration. Namely, + /// while executing a search, if one detects that you're in a start state, + /// then it may be faster to look for the next match of a prefix of the + /// pattern, if one exists. If a prefix exists and since all matches must + /// begin with that prefix, then skipping ahead to occurrences of that + /// prefix may be much faster than executing the DFA. + /// + /// # Example + /// + /// This example shows how to implement your own search routine that does + /// a prefix search whenever the search enters a start state. + /// + /// Note that you do not need to implement your own search routine to + /// make use of prefilters like this. The search routines provided + /// by this crate already implement prefilter support via the + /// [`Prefilter`](crate::util::prefilter::Prefilter) trait. The various + /// `find_*_at` routines on this trait support the `Prefilter` trait + /// through [`Scanner`](crate::util::prefilter::Scanner)s. This example is + /// meant to show how you might deal with prefilters in a simplified case + /// if you are implementing your own search routine. + /// + /// ``` + /// use regex_automata::{ + /// MatchError, PatternID, + /// dfa::{Automaton, dense}, + /// HalfMatch, + /// }; + /// + /// fn find_byte(slice: &[u8], at: usize, byte: u8) -> Option { + /// // Would be faster to use the memchr crate, but this is still + /// // faster than running through the DFA. 
+ /// slice[at..].iter().position(|&b| b == byte).map(|i| at + i) + /// } + /// + /// fn find_leftmost_first( + /// dfa: &A, + /// haystack: &[u8], + /// prefix_byte: Option, + /// ) -> Result, MatchError> { + /// // See the Automaton::is_special_state example for similar code + /// // with more comments. + /// + /// let mut state = dfa.start_state_forward( + /// None, haystack, 0, haystack.len(), + /// ); + /// let mut last_match = None; + /// let mut pos = 0; + /// while pos < haystack.len() { + /// let b = haystack[pos]; + /// state = dfa.next_state(state, b); + /// pos += 1; + /// if dfa.is_special_state(state) { + /// if dfa.is_match_state(state) { + /// last_match = Some(HalfMatch::new( + /// dfa.match_pattern(state, 0), + /// pos - 1, + /// )); + /// } else if dfa.is_dead_state(state) { + /// return Ok(last_match); + /// } else if dfa.is_quit_state(state) { + /// // It is possible to enter into a quit state after + /// // observing a match has occurred. In that case, we + /// // should return the match instead of an error. + /// if last_match.is_some() { + /// return Ok(last_match); + /// } + /// return Err(MatchError::Quit { + /// byte: b, offset: pos - 1, + /// }); + /// } else if dfa.is_start_state(state) { + /// // If we're in a start state and know all matches begin + /// // with a particular byte, then we can quickly skip to + /// // candidate matches without running the DFA through + /// // every byte inbetween. + /// if let Some(prefix_byte) = prefix_byte { + /// pos = match find_byte(haystack, pos, prefix_byte) { + /// Some(pos) => pos, + /// None => break, + /// }; + /// } + /// } + /// } + /// } + /// // Matches are always delayed by 1 byte, so we must explicitly walk + /// // the special "EOI" transition at the end of the search. + /// state = dfa.next_eoi_state(state); + /// if dfa.is_match_state(state) { + /// last_match = Some(HalfMatch::new( + /// dfa.match_pattern(state, 0), + /// haystack.len(), + /// )); + /// } + /// Ok(last_match) + /// } + /// + /// // In this example, it's obvious that all occurrences of our pattern + /// // begin with 'Z', so we pass in 'Z'. + /// let dfa = dense::DFA::new(r"Z[a-z]+")?; + /// let haystack = "123 foobar Zbaz quux".as_bytes(); + /// let mat = find_leftmost_first(&dfa, haystack, Some(b'Z'))?.unwrap(); + /// assert_eq!(mat.pattern().as_usize(), 0); + /// assert_eq!(mat.offset(), 15); + /// + /// // But note that we don't need to pass in a prefix byte. If we don't, + /// // then the search routine does no acceleration. + /// let mat = find_leftmost_first(&dfa, haystack, None)?.unwrap(); + /// assert_eq!(mat.pattern().as_usize(), 0); + /// assert_eq!(mat.offset(), 15); + /// + /// // However, if we pass an incorrect byte, then the prefix search will + /// // result in incorrect results. + /// assert_eq!(find_leftmost_first(&dfa, haystack, Some(b'X'))?, None); + /// + /// # Ok::<(), Box>(()) + /// ``` + fn is_start_state(&self, id: StateID) -> bool; + + /// Returns true if and only if the given identifier corresponds to an + /// accelerated state. + /// + /// An accelerated state is a special optimization + /// trick implemented by this crate. Namely, if + /// [`dense::Config::accelerate`](crate::dfa::dense::Config::accelerate) is + /// enabled (and it is by default), then DFAs generated by this crate will + /// tag states meeting certain characteristics as accelerated. States meet + /// this criteria whenever most of their transitions are self-transitions. + /// That is, transitions that loop back to the same state. 
When a small + /// number of transitions aren't self-transitions, then it follows that + /// there are only a small number of bytes that can cause the DFA to leave + /// that state. Thus, there is an opportunity to look for those bytes + /// using more optimized routines rather than continuing to run through + /// the DFA. This trick is similar to the prefilter idea described in + /// the documentation of [`Automaton::is_start_state`] with two main + /// differences: + /// + /// 1. It is more limited since acceleration only applies to single bytes. + /// This means states are rarely accelerated when Unicode mode is enabled + /// (which is enabled by default). + /// 2. It can occur anywhere in the DFA, which increases optimization + /// opportunities. + /// + /// Like the prefilter idea, the main downside (and a possible reason to + /// disable it) is that it can lead to worse performance in some cases. + /// Namely, if a state is accelerated for very common bytes, then the + /// overhead of checking for acceleration and using the more optimized + /// routines to look for those bytes can cause overall performance to be + /// worse than if acceleration wasn't enabled at all. + /// + /// A simple example of a regex that has an accelerated state is + /// `(?-u)[^a]+a`. Namely, the `[^a]+` sub-expression gets compiled down + /// into a single state where all transitions except for `a` loop back to + /// itself, and where `a` is the only transition (other than the special + /// EOI transition) that goes to some other state. Thus, this state can + /// be accelerated and implemented more efficiently by calling an + /// optimized routine like `memchr` with `a` as the needle. Notice that + /// the `(?-u)` to disable Unicode is necessary here, as without it, + /// `[^a]` will match any UTF-8 encoding of any Unicode scalar value other + /// than `a`. This more complicated expression compiles down to many DFA + /// states and the simple acceleration optimization is no longer available. + /// + /// Typically, this routine is used to guard calls to + /// [`Automaton::accelerator`], which returns the accelerated bytes for + /// the specified state. + fn is_accel_state(&self, id: StateID) -> bool; + + /// Returns the total number of patterns compiled into this DFA. + /// + /// In the case of a DFA that contains no patterns, this must return `0`. + /// + /// # Example + /// + /// This example shows the pattern count for a DFA that never matches: + /// + /// ``` + /// use regex_automata::dfa::{Automaton, dense::DFA}; + /// + /// let dfa: DFA> = DFA::never_match()?; + /// assert_eq!(dfa.pattern_count(), 0); + /// # Ok::<(), Box>(()) + /// ``` + /// + /// And another example for a DFA that matches at every position: + /// + /// ``` + /// use regex_automata::dfa::{Automaton, dense::DFA}; + /// + /// let dfa: DFA> = DFA::always_match()?; + /// assert_eq!(dfa.pattern_count(), 1); + /// # Ok::<(), Box>(()) + /// ``` + /// + /// And finally, a DFA that was constructed from multiple patterns: + /// + /// ``` + /// use regex_automata::dfa::{Automaton, dense::DFA}; + /// + /// let dfa = DFA::new_many(&["[0-9]+", "[a-z]+", "[A-Z]+"])?; + /// assert_eq!(dfa.pattern_count(), 3); + /// # Ok::<(), Box>(()) + /// ``` + fn pattern_count(&self) -> usize; + + /// Returns the total number of patterns that match in this state. + /// + /// If the given state is not a match state, then implementations may + /// panic. 
+ /// + /// If the DFA was compiled with one pattern, then this must necessarily + /// always return `1` for all match states. + /// + /// Implementations must guarantee that [`Automaton::match_pattern`] can + /// be called with indices up to (but not including) the count returned by + /// this routine without panicking. + /// + /// # Panics + /// + /// Implementations are permitted to panic if the provided state ID does + /// not correspond to a match state. + /// + /// # Example + /// + /// This example shows a simple instance of implementing overlapping + /// matches. In particular, it shows not only how to determine how many + /// patterns have matched in a particular state, but also how to access + /// which specific patterns have matched. + /// + /// Notice that we must use [`MatchKind::All`](crate::MatchKind::All) + /// when building the DFA. If we used + /// [`MatchKind::LeftmostFirst`](crate::MatchKind::LeftmostFirst) + /// instead, then the DFA would not be constructed in a way that supports + /// overlapping matches. (It would only report a single pattern that + /// matches at any particular point in time.) + /// + /// Another thing to take note of is the patterns used and the order in + /// which the pattern IDs are reported. In the example below, pattern `3` + /// is yielded first. Why? Because it corresponds to the match that + /// appears first. Namely, the `@` symbol is part of `\S+` but not part + /// of any of the other patterns. Since the `\S+` pattern has a match that + /// starts to the left of any other pattern, its ID is returned before any + /// other. + /// + /// ``` + /// use regex_automata::{ + /// dfa::{Automaton, dense}, + /// MatchKind, + /// }; + /// + /// let dfa = dense::Builder::new() + /// .configure(dense::Config::new().match_kind(MatchKind::All)) + /// .build_many(&[ + /// r"\w+", r"[a-z]+", r"[A-Z]+", r"\S+", + /// ])?; + /// let haystack = "@bar".as_bytes(); + /// + /// // The start state is determined by inspecting the position and the + /// // initial bytes of the haystack. + /// let mut state = dfa.start_state_forward( + /// None, haystack, 0, haystack.len(), + /// ); + /// // Walk all the bytes in the haystack. + /// for &b in haystack { + /// state = dfa.next_state(state, b); + /// } + /// state = dfa.next_eoi_state(state); + /// + /// assert!(dfa.is_match_state(state)); + /// assert_eq!(dfa.match_count(state), 3); + /// // The following calls are guaranteed to not panic since `match_count` + /// // returned `3` above. + /// assert_eq!(dfa.match_pattern(state, 0).as_usize(), 3); + /// assert_eq!(dfa.match_pattern(state, 1).as_usize(), 0); + /// assert_eq!(dfa.match_pattern(state, 2).as_usize(), 1); + /// + /// # Ok::<(), Box>(()) + /// ``` + fn match_count(&self, id: StateID) -> usize; + + /// Returns the pattern ID corresponding to the given match index in the + /// given state. + /// + /// See [`Automaton::match_count`] for an example of how to use this + /// method correctly. Note that if you know your DFA is compiled with a + /// single pattern, then this routine is never necessary since it will + /// always return a pattern ID of `0` for an index of `0` when `id` + /// corresponds to a match state. + /// + /// Typically, this routine is used when implementing an overlapping + /// search, as the example for `Automaton::match_count` does. + /// + /// # Panics + /// + /// If the state ID is not a match state or if the match index is out + /// of bounds for the given state, then this routine may either panic + /// or produce an incorrect result. 
If the state ID is correct and the + /// match index is correct, then this routine must always produce a valid + /// `PatternID`. + fn match_pattern(&self, id: StateID, index: usize) -> PatternID; + + /// Return a slice of bytes to accelerate for the given state, if possible. + /// + /// If the given state has no accelerator, then an empty slice must be + /// returned. If `Automaton::is_accel_state` returns true for the given + /// ID, then this routine _must_ return a non-empty slice, but it is not + /// required to do so. + /// + /// If the given ID is not a valid state ID for this automaton, then + /// implementations may panic or produce incorrect results. + /// + /// See [`Automaton::is_accel_state`] for more details on state + /// acceleration. + /// + /// By default, this method will always return an empty slice. + /// + /// # Example + /// + /// This example shows a contrived case in which we build a regex that we + /// know is accelerated and extract the accelerator from a state. + /// + /// ``` + /// use regex_automata::{ + /// nfa::thompson, + /// dfa::{Automaton, dense}, + /// util::id::StateID, + /// SyntaxConfig, + /// }; + /// + /// let dfa = dense::Builder::new() + /// // We disable Unicode everywhere and permit the regex to match + /// // invalid UTF-8. e.g., `[^abc]` matches `\xFF`, which is not valid + /// // UTF-8. + /// .syntax(SyntaxConfig::new().unicode(false).utf8(false)) + /// // This makes the implicit `(?s:.)*?` prefix added to the regex + /// // match through arbitrary bytes instead of being UTF-8 aware. This + /// // isn't necessary to get acceleration to work in this case, but + /// // it does make the DFA substantially simpler. + /// .thompson(thompson::Config::new().utf8(false)) + /// .build("[^abc]+a")?; + /// + /// // Here we just pluck out the state that we know is accelerated. + /// // While the stride calculations are something that can be relied + /// // on by callers, the specific position of the accelerated state is + /// // implementation defined. + /// // + /// // N.B. We get '3' by inspecting the state machine using 'regex-cli'. + /// // e.g., try `regex-cli debug dfa dense '[^abc]+a' -BbUC`. + /// let id = StateID::new(3 * dfa.stride()).unwrap(); + /// let accelerator = dfa.accelerator(id); + /// // The `[^abc]+` sub-expression permits [a, b, c] to be accelerated. + /// assert_eq!(accelerator, &[b'a', b'b', b'c']); + /// # Ok::<(), Box>(()) + /// ``` + fn accelerator(&self, _id: StateID) -> &[u8] { + &[] + } + + /// Executes a forward search and returns the end position of the first + /// match that is found as early as possible. If no match exists, then + /// `None` is returned. + /// + /// This routine stops scanning input as soon as the search observes a + /// match state. This is useful for implementing boolean `is_match`-like + /// routines, where as little work is done as possible. + /// + /// See [`Automaton::find_earliest_fwd_at`] for additional functionality, + /// such as providing a prefilter, a specific pattern to match and the + /// bounds of the search within the haystack. This routine is meant as + /// a convenience for common cases where the additional functionality is + /// not needed. + /// + /// # Errors + /// + /// This routine only errors if the search could not complete. For + /// DFAs generated by this crate, this only occurs in a non-default + /// configuration where quit bytes are used or Unicode word boundaries are + /// heuristically enabled. 
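For instance, here is a hedged sketch of how such an error surfaces to a caller. It reuses the quit-byte configuration from the `Config::quit` example later in this module; the expected `byte`/`offset` values are illustrative and simply reflect where the first `\n` sits in the haystack.

```
use regex_automata::{dfa::{Automaton, dense}, MatchError};

// '\n' is configured as a quit byte, so the search gives up as soon as
// it sees one, even though '\p{any}' would otherwise match it.
let dfa = dense::Builder::new()
    .configure(dense::Config::new().quit(b'\n', true))
    .build(r"foo\p{any}+bar")?;

// The search reads 'f', 'o', 'o' and then hits the quit byte at offset 3.
let got = dfa.find_earliest_fwd(b"foo\nbar");
assert_eq!(Err(MatchError::Quit { byte: b'\n', offset: 3 }), got);
# Ok::<(), Box<dyn std::error::Error>>(())
```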
+ /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// # Example + /// + /// This example shows how to use this method with a + /// [`dense::DFA`](crate::dfa::dense::DFA). In particular, it demonstrates + /// how the position returned might differ from what one might expect when + /// executing a traditional leftmost search. + /// + /// ``` + /// use regex_automata::{ + /// dfa::{Automaton, dense}, + /// HalfMatch, + /// }; + /// + /// let dfa = dense::DFA::new("foo[0-9]+")?; + /// // Normally, the end of the leftmost first match here would be 8, + /// // corresponding to the end of the input. But the "earliest" semantics + /// // this routine cause it to stop as soon as a match is known, which + /// // occurs once 'foo[0-9]' has matched. + /// let expected = HalfMatch::must(0, 4); + /// assert_eq!(Some(expected), dfa.find_earliest_fwd(b"foo12345")?); + /// + /// let dfa = dense::DFA::new("abc|a")?; + /// // Normally, the end of the leftmost first match here would be 3, + /// // but the shortest match semantics detect a match earlier. + /// let expected = HalfMatch::must(0, 1); + /// assert_eq!(Some(expected), dfa.find_earliest_fwd(b"abc")?); + /// # Ok::<(), Box>(()) + /// ``` + #[inline] + fn find_earliest_fwd( + &self, + bytes: &[u8], + ) -> Result, MatchError> { + self.find_earliest_fwd_at(None, None, bytes, 0, bytes.len()) + } + + /// Executes a reverse search and returns the start position of the first + /// match that is found as early as possible. If no match exists, then + /// `None` is returned. + /// + /// This routine stops scanning input as soon as the search observes a + /// match state. + /// + /// Note that while it is not technically necessary to build a reverse + /// automaton to use a reverse search, it is likely that you'll want to do + /// so. Namely, the typical use of a reverse search is to find the starting + /// location of a match once its end is discovered from a forward search. A + /// reverse DFA automaton can be built by configuring the intermediate NFA + /// to be reversed via + /// [`nfa::thompson::Config::reverse`](crate::nfa::thompson::Config::reverse). + /// + /// # Errors + /// + /// This routine only errors if the search could not complete. For + /// DFAs generated by this crate, this only occurs in a non-default + /// configuration where quit bytes are used or Unicode word boundaries are + /// heuristically enabled. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// # Example + /// + /// This example shows how to use this method with a + /// [`dense::DFA`](crate::dfa::dense::DFA). In particular, it demonstrates + /// how the position returned might differ from what one might expect when + /// executing a traditional leftmost reverse search. + /// + /// ``` + /// use regex_automata::{ + /// nfa::thompson, + /// dfa::{Automaton, dense}, + /// HalfMatch, + /// }; + /// + /// let dfa = dense::Builder::new() + /// .thompson(thompson::Config::new().reverse(true)) + /// .build("[a-z]+[0-9]+")?; + /// // Normally, the end of the leftmost first match here would be 0, + /// // corresponding to the beginning of the input. But the "earliest" + /// // semantics of this routine cause it to stop as soon as a match is + /// // known, which occurs once '[a-z][0-9]+' has matched. 
+ /// let expected = HalfMatch::must(0, 2); + /// assert_eq!(Some(expected), dfa.find_earliest_rev(b"foo12345")?); + /// + /// let dfa = dense::Builder::new() + /// .thompson(thompson::Config::new().reverse(true)) + /// .build("abc|c")?; + /// // Normally, the end of the leftmost first match here would be 0, + /// // but the shortest match semantics detect a match earlier. + /// let expected = HalfMatch::must(0, 2); + /// assert_eq!(Some(expected), dfa.find_earliest_rev(b"abc")?); + /// # Ok::<(), Box>(()) + /// ``` + #[inline] + fn find_earliest_rev( + &self, + bytes: &[u8], + ) -> Result, MatchError> { + self.find_earliest_rev_at(None, bytes, 0, bytes.len()) + } + + /// Executes a forward search and returns the end position of the leftmost + /// match that is found. If no match exists, then `None` is returned. + /// + /// # Errors + /// + /// This routine only errors if the search could not complete. For + /// DFAs generated by this crate, this only occurs in a non-default + /// configuration where quit bytes are used or Unicode word boundaries are + /// heuristically enabled. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// # Notes for implementors + /// + /// Implementors of this trait are not required to implement any particular + /// match semantics (such as leftmost-first), which are instead manifest in + /// the DFA's transitions. + /// + /// In particular, this method must continue searching even after it enters + /// a match state. The search should only terminate once it has reached + /// the end of the input or when it has entered a dead or quit state. Upon + /// termination, the position of the last byte seen while still in a match + /// state is returned. + /// + /// Since this trait provides an implementation for this method by default, + /// it's unlikely that one will need to implement this. + /// + /// # Example + /// + /// This example shows how to use this method with a + /// [`dense::DFA`](crate::dfa::dense::DFA). By default, a dense DFA uses + /// "leftmost first" match semantics. + /// + /// Leftmost first match semantics corresponds to the match with the + /// smallest starting offset, but where the end offset is determined by + /// preferring earlier branches in the original regular expression. For + /// example, `Sam|Samwise` will match `Sam` in `Samwise`, but `Samwise|Sam` + /// will match `Samwise` in `Samwise`. + /// + /// Generally speaking, the "leftmost first" match is how most backtracking + /// regular expressions tend to work. This is in contrast to POSIX-style + /// regular expressions that yield "leftmost longest" matches. Namely, + /// both `Sam|Samwise` and `Samwise|Sam` match `Samwise` when using + /// leftmost longest semantics. (This crate does not currently support + /// leftmost longest semantics.) + /// + /// ``` + /// use regex_automata::{ + /// dfa::{Automaton, dense}, + /// HalfMatch, + /// }; + /// + /// let dfa = dense::DFA::new("foo[0-9]+")?; + /// let expected = HalfMatch::must(0, 8); + /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?); + /// + /// // Even though a match is found after reading the first byte (`a`), + /// // the leftmost first match semantics demand that we find the earliest + /// // match that prefers earlier parts of the pattern over latter parts. 
+ /// let dfa = dense::DFA::new("abc|a")?; + /// let expected = HalfMatch::must(0, 3); + /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"abc")?); + /// + /// # Ok::<(), Box>(()) + /// ``` + #[inline] + fn find_leftmost_fwd( + &self, + bytes: &[u8], + ) -> Result, MatchError> { + self.find_leftmost_fwd_at(None, None, bytes, 0, bytes.len()) + } + + /// Executes a reverse search and returns the start of the position of the + /// leftmost match that is found. If no match exists, then `None` is + /// returned. + /// + /// # Errors + /// + /// This routine only errors if the search could not complete. For + /// DFAs generated by this crate, this only occurs in a non-default + /// configuration where quit bytes are used or Unicode word boundaries are + /// heuristically enabled. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// # Notes for implementors + /// + /// Implementors of this trait are not required to implement any particular + /// match semantics (such as leftmost-first), which are instead manifest in + /// the DFA's transitions. + /// + /// In particular, this method must continue searching even after it enters + /// a match state. The search should only terminate once it has reached + /// the end of the input or when it has entered a dead or quit state. Upon + /// termination, the position of the last byte seen while still in a match + /// state is returned. + /// + /// Since this trait provides an implementation for this method by default, + /// it's unlikely that one will need to implement this. + /// + /// # Example + /// + /// This example shows how to use this method with a + /// [`dense::DFA`](crate::dfa::dense::DFA). In particular, this routine + /// is principally useful when used in conjunction with the + /// [`nfa::thompson::Config::reverse`](crate::nfa::thompson::Config::reverse) + /// configuration. In general, it's unlikely to be correct to use both + /// `find_leftmost_fwd` and `find_leftmost_rev` with the same DFA since any + /// particular DFA will only support searching in one direction with + /// respect to the pattern. + /// + /// ``` + /// use regex_automata::{ + /// nfa::thompson, + /// dfa::{Automaton, dense}, + /// HalfMatch, + /// }; + /// + /// let dfa = dense::Builder::new() + /// .thompson(thompson::Config::new().reverse(true)) + /// .build("foo[0-9]+")?; + /// let expected = HalfMatch::must(0, 0); + /// assert_eq!(Some(expected), dfa.find_leftmost_rev(b"foo12345")?); + /// + /// // Even though a match is found after reading the last byte (`c`), + /// // the leftmost first match semantics demand that we find the earliest + /// // match that prefers earlier parts of the pattern over latter parts. + /// let dfa = dense::Builder::new() + /// .thompson(thompson::Config::new().reverse(true)) + /// .build("abc|c")?; + /// let expected = HalfMatch::must(0, 0); + /// assert_eq!(Some(expected), dfa.find_leftmost_rev(b"abc")?); + /// + /// # Ok::<(), Box>(()) + /// ``` + #[inline] + fn find_leftmost_rev( + &self, + bytes: &[u8], + ) -> Result, MatchError> { + self.find_leftmost_rev_at(None, bytes, 0, bytes.len()) + } + + /// Executes an overlapping forward search and returns the end position of + /// matches as they are found. If no match exists, then `None` is returned. + /// + /// This routine is principally only useful when searching for multiple + /// patterns on inputs where multiple patterns may match the same regions + /// of text. 
In particular, callers must preserve the automaton's search + /// state from prior calls so that the implementation knows where the last + /// match occurred. + /// + /// # Errors + /// + /// This routine only errors if the search could not complete. For + /// DFAs generated by this crate, this only occurs in a non-default + /// configuration where quit bytes are used or Unicode word boundaries are + /// heuristically enabled. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// # Example + /// + /// This example shows how to run a basic overlapping search with a + /// [`dense::DFA`](crate::dfa::dense::DFA). Notice that we build the + /// automaton with a `MatchKind::All` configuration. Overlapping searches + /// are unlikely to work as one would expect when using the default + /// `MatchKind::LeftmostFirst` match semantics, since leftmost-first + /// matching is fundamentally incompatible with overlapping searches. + /// Namely, overlapping searches need to report matches as they are seen, + /// where as leftmost-first searches will continue searching even after a + /// match has been observed in order to find the conventional end position + /// of the match. More concretely, leftmost-first searches use dead states + /// to terminate a search after a specific match can no longer be extended. + /// Overlapping searches instead do the opposite by continuing the search + /// to find totally new matches (potentially of other patterns). + /// + /// ``` + /// use regex_automata::{ + /// dfa::{Automaton, OverlappingState, dense}, + /// HalfMatch, + /// MatchKind, + /// }; + /// + /// let dfa = dense::Builder::new() + /// .configure(dense::Config::new().match_kind(MatchKind::All)) + /// .build_many(&[r"\w+$", r"\S+$"])?; + /// let haystack = "@foo".as_bytes(); + /// let mut state = OverlappingState::start(); + /// + /// let expected = Some(HalfMatch::must(1, 4)); + /// let got = dfa.find_overlapping_fwd(haystack, &mut state)?; + /// assert_eq!(expected, got); + /// + /// // The first pattern also matches at the same position, so re-running + /// // the search will yield another match. Notice also that the first + /// // pattern is returned after the second. This is because the second + /// // pattern begins its match before the first, is therefore an earlier + /// // match and is thus reported first. + /// let expected = Some(HalfMatch::must(0, 4)); + /// let got = dfa.find_overlapping_fwd(haystack, &mut state)?; + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box>(()) + /// ``` + #[inline] + fn find_overlapping_fwd( + &self, + bytes: &[u8], + state: &mut OverlappingState, + ) -> Result, MatchError> { + self.find_overlapping_fwd_at(None, None, bytes, 0, bytes.len(), state) + } + + /// Executes a forward search and returns the end position of the first + /// match that is found as early as possible. If no match exists, then + /// `None` is returned. + /// + /// This routine stops scanning input as soon as the search observes a + /// match state. This is useful for implementing boolean `is_match`-like + /// routines, where as little work is done as possible. + /// + /// This is like [`Automaton::find_earliest_fwd`], except it provides some + /// additional control over how the search is executed: + /// + /// * `pre` is a prefilter scanner that, when given, is used whenever the + /// DFA enters its starting state. This is meant to speed up searches where + /// one or a small number of literal prefixes are known. 
+ /// * `pattern_id` specifies a specific pattern in the DFA to run an + /// anchored search for. If not given, then a search for any pattern is + /// performed. For DFAs built by this crate, + /// [`dense::Config::starts_for_each_pattern`](crate::dfa::dense::Config::starts_for_each_pattern) + /// must be enabled to use this functionality. + /// * `start` and `end` permit searching a specific region of the haystack + /// `bytes`. This is useful when implementing an iterator over matches + /// within the same haystack, which cannot be done correctly by simply + /// providing a subslice of `bytes`. (Because the existence of look-around + /// operations such as `\b`, `^` and `$` need to take the surrounding + /// context into account. This cannot be done if the haystack doesn't + /// contain it.) + /// + /// The examples below demonstrate each of these additional parameters. + /// + /// # Errors + /// + /// This routine only errors if the search could not complete. For + /// DFAs generated by this crate, this only occurs in a non-default + /// configuration where quit bytes are used or Unicode word boundaries are + /// heuristically enabled. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// # Panics + /// + /// This routine must panic if a `pattern_id` is given and the underlying + /// DFA does not support specific pattern searches. + /// + /// It must also panic if the given haystack range is not valid. + /// + /// # Example: prefilter + /// + /// This example shows how to provide a prefilter for a pattern where all + /// matches start with a `z` byte. + /// + /// ``` + /// use regex_automata::{ + /// dfa::{Automaton, dense}, + /// util::prefilter::{Candidate, Prefilter, Scanner, State}, + /// HalfMatch, + /// }; + /// + /// #[derive(Debug)] + /// pub struct ZPrefilter; + /// + /// impl Prefilter for ZPrefilter { + /// fn next_candidate( + /// &self, + /// _: &mut State, + /// haystack: &[u8], + /// at: usize, + /// ) -> Candidate { + /// // Try changing b'z' to b'q' and observe this test fail since + /// // the prefilter will skip right over the match. + /// match haystack.iter().position(|&b| b == b'z') { + /// None => Candidate::None, + /// Some(i) => Candidate::PossibleStartOfMatch(at + i), + /// } + /// } + /// + /// fn heap_bytes(&self) -> usize { + /// 0 + /// } + /// } + /// + /// let dfa = dense::DFA::new("z[0-9]{3}")?; + /// let haystack = "foobar z123 q123".as_bytes(); + /// // A scanner executes a prefilter while tracking some state that helps + /// // determine whether a prefilter is still "effective" or not. + /// let mut scanner = Scanner::new(&ZPrefilter); + /// + /// let expected = Some(HalfMatch::must(0, 11)); + /// let got = dfa.find_earliest_fwd_at( + /// Some(&mut scanner), + /// None, + /// haystack, + /// 0, + /// haystack.len(), + /// )?; + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box>(()) + /// ``` + /// + /// # Example: specific pattern search + /// + /// This example shows how to build a multi-DFA that permits searching for + /// specific patterns. 
+ /// + /// ``` + /// use regex_automata::{ + /// dfa::{Automaton, dense}, + /// HalfMatch, + /// PatternID, + /// }; + /// + /// let dfa = dense::Builder::new() + /// .configure(dense::Config::new().starts_for_each_pattern(true)) + /// .build_many(&["[a-z0-9]{6}", "[a-z][a-z0-9]{5}"])?; + /// let haystack = "foo123".as_bytes(); + /// + /// // Since we are using the default leftmost-first match and both + /// // patterns match at the same starting position, only the first pattern + /// // will be returned in this case when doing a search for any of the + /// // patterns. + /// let expected = Some(HalfMatch::must(0, 6)); + /// let got = dfa.find_earliest_fwd_at( + /// None, + /// None, + /// haystack, + /// 0, + /// haystack.len(), + /// )?; + /// assert_eq!(expected, got); + /// + /// // But if we want to check whether some other pattern matches, then we + /// // can provide its pattern ID. + /// let expected = Some(HalfMatch::must(1, 6)); + /// let got = dfa.find_earliest_fwd_at( + /// None, + /// Some(PatternID::must(1)), + /// haystack, + /// 0, + /// haystack.len(), + /// )?; + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box>(()) + /// ``` + /// + /// # Example: specifying the bounds of a search + /// + /// This example shows how providing the bounds of a search can produce + /// different results than simply sub-slicing the haystack. + /// + /// ``` + /// use regex_automata::{ + /// dfa::{Automaton, dense}, + /// HalfMatch, + /// }; + /// + /// // N.B. We disable Unicode here so that we use a simple ASCII word + /// // boundary. Alternatively, we could enable heuristic support for + /// // Unicode word boundaries. + /// let dfa = dense::DFA::new(r"(?-u)\b[0-9]{3}\b")?; + /// let haystack = "foo123bar".as_bytes(); + /// + /// // Since we sub-slice the haystack, the search doesn't know about the + /// // larger context and assumes that `123` is surrounded by word + /// // boundaries. And of course, the match position is reported relative + /// // to the sub-slice as well, which means we get `3` instead of `6`. + /// let expected = Some(HalfMatch::must(0, 3)); + /// let got = dfa.find_earliest_fwd_at( + /// None, + /// None, + /// &haystack[3..6], + /// 0, + /// haystack[3..6].len(), + /// )?; + /// assert_eq!(expected, got); + /// + /// // But if we provide the bounds of the search within the context of the + /// // entire haystack, then the search can take the surrounding context + /// // into account. (And if we did find a match, it would be reported + /// // as a valid offset into `haystack` instead of its sub-slice.) + /// let expected = None; + /// let got = dfa.find_earliest_fwd_at( + /// None, + /// None, + /// haystack, + /// 3, + /// 6, + /// )?; + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box>(()) + /// ``` + #[inline] + fn find_earliest_fwd_at( + &self, + pre: Option<&mut prefilter::Scanner>, + pattern_id: Option, + bytes: &[u8], + start: usize, + end: usize, + ) -> Result, MatchError> { + search::find_earliest_fwd(pre, self, pattern_id, bytes, start, end) + } + + /// Executes a reverse search and returns the start position of the first + /// match that is found as early as possible. If no match exists, then + /// `None` is returned. + /// + /// This routine stops scanning input as soon as the search observes a + /// match state. + /// + /// This is like [`Automaton::find_earliest_rev`], except it provides some + /// additional control over how the search is executed. 
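As a quick illustration, the following hedged sketch reuses the reverse DFA from the `find_earliest_rev` example above. Passing explicit bounds is the only new ingredient; narrowing them changes the result. (The expected value for the full haystack matches that earlier example.)

```
use regex_automata::{
    nfa::thompson,
    dfa::{Automaton, dense},
    HalfMatch,
};

let dfa = dense::Builder::new()
    .thompson(thompson::Config::new().reverse(true))
    .build("abc|c")?;

// Searching the whole haystack is equivalent to `find_earliest_rev`.
let expected = Some(HalfMatch::must(0, 2));
assert_eq!(expected, dfa.find_earliest_rev_at(None, b"abc", 0, 3)?);

// Restricting the search to the first two bytes ("ab") removes the
// only match, since neither reversed alternative can start there.
assert_eq!(None, dfa.find_earliest_rev_at(None, b"abc", 0, 2)?);
# Ok::<(), Box<dyn std::error::Error>>(())
```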
See the + /// documentation of [`Automaton::find_earliest_fwd_at`] for more details + /// on the additional parameters along with examples of their usage. + /// + /// # Errors + /// + /// This routine only errors if the search could not complete. For + /// DFAs generated by this crate, this only occurs in a non-default + /// configuration where quit bytes are used or Unicode word boundaries are + /// heuristically enabled. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// # Panics + /// + /// This routine must panic if a `pattern_id` is given and the underlying + /// DFA does not support specific pattern searches. + /// + /// It must also panic if the given haystack range is not valid. + #[inline] + fn find_earliest_rev_at( + &self, + pattern_id: Option, + bytes: &[u8], + start: usize, + end: usize, + ) -> Result, MatchError> { + search::find_earliest_rev(self, pattern_id, bytes, start, end) + } + + /// Executes a forward search and returns the end position of the leftmost + /// match that is found. If no match exists, then `None` is returned. + /// + /// This is like [`Automaton::find_leftmost_fwd`], except it provides some + /// additional control over how the search is executed. See the + /// documentation of [`Automaton::find_earliest_fwd_at`] for more details + /// on the additional parameters along with examples of their usage. + /// + /// # Errors + /// + /// This routine only errors if the search could not complete. For + /// DFAs generated by this crate, this only occurs in a non-default + /// configuration where quit bytes are used or Unicode word boundaries are + /// heuristically enabled. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// # Panics + /// + /// This routine must panic if a `pattern_id` is given and the underlying + /// DFA does not support specific pattern searches. + /// + /// It must also panic if the given haystack range is not valid. + #[inline] + fn find_leftmost_fwd_at( + &self, + pre: Option<&mut prefilter::Scanner>, + pattern_id: Option, + bytes: &[u8], + start: usize, + end: usize, + ) -> Result, MatchError> { + search::find_leftmost_fwd(pre, self, pattern_id, bytes, start, end) + } + + /// Executes a reverse search and returns the start of the position of the + /// leftmost match that is found. If no match exists, then `None` is + /// returned. + /// + /// This is like [`Automaton::find_leftmost_rev`], except it provides some + /// additional control over how the search is executed. See the + /// documentation of [`Automaton::find_earliest_fwd_at`] for more details + /// on the additional parameters along with examples of their usage. + /// + /// # Errors + /// + /// This routine only errors if the search could not complete. For + /// DFAs generated by this crate, this only occurs in a non-default + /// configuration where quit bytes are used or Unicode word boundaries are + /// heuristically enabled. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// # Panics + /// + /// This routine must panic if a `pattern_id` is given and the underlying + /// DFA does not support specific pattern searches. + /// + /// It must also panic if the given haystack range is not valid. 
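Since this routine has no example of its own, here is a hedged sketch of the typical forward-then-reverse composition described above. It mirrors the reverse-automaton example in the `Config::match_kind` documentation below: the pattern `[a-z]+` reads the same in both directions, so the reverse DFA here only needs `anchored` and `MatchKind::All`; for asymmetric patterns you would also reverse the NFA via `nfa::thompson::Config::reverse`.

```
use regex_automata::{dfa::{Automaton, dense}, HalfMatch, MatchKind};

let haystack = "123foobar456".as_bytes();
let pattern = r"[a-z]+";

let dfa_fwd = dense::DFA::new(pattern)?;
let dfa_rev = dense::Builder::new()
    .configure(dense::Config::new()
        .anchored(true)
        .match_kind(MatchKind::All)
    )
    .build(pattern)?;

// Find where the match ends going forward, then search backwards from
// that end position to recover where the match starts.
let end = dfa_fwd.find_leftmost_fwd(haystack)?.unwrap();
let start = dfa_rev
    .find_leftmost_rev_at(None, haystack, 0, end.offset())?
    .unwrap();
assert_eq!(HalfMatch::must(0, 9), end);
assert_eq!(HalfMatch::must(0, 3), start);
# Ok::<(), Box<dyn std::error::Error>>(())
```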
+ #[inline] + fn find_leftmost_rev_at( + &self, + pattern_id: Option, + bytes: &[u8], + start: usize, + end: usize, + ) -> Result, MatchError> { + search::find_leftmost_rev(self, pattern_id, bytes, start, end) + } + + /// Executes an overlapping forward search and returns the end position of + /// matches as they are found. If no match exists, then `None` is returned. + /// + /// This routine is principally only useful when searching for multiple + /// patterns on inputs where multiple patterns may match the same regions + /// of text. In particular, callers must preserve the automaton's search + /// state from prior calls so that the implementation knows where the last + /// match occurred. + /// + /// This is like [`Automaton::find_overlapping_fwd`], except it provides + /// some additional control over how the search is executed. See the + /// documentation of [`Automaton::find_earliest_fwd_at`] for more details + /// on the additional parameters along with examples of their usage. + /// + /// When using this routine to implement an iterator of overlapping + /// matches, the `start` of the search should always be set to the end + /// of the last match. If more patterns match at the previous location, + /// then they will be immediately returned. (This is tracked by the given + /// overlapping state.) Otherwise, the search continues at the starting + /// position given. + /// + /// If for some reason you want the search to forget about its previous + /// state and restart the search at a particular position, then setting the + /// state to [`OverlappingState::start`] will accomplish that. + /// + /// # Errors + /// + /// This routine only errors if the search could not complete. For + /// DFAs generated by this crate, this only occurs in a non-default + /// configuration where quit bytes are used or Unicode word boundaries are + /// heuristically enabled. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// # Panics + /// + /// This routine must panic if a `pattern_id` is given and the underlying + /// DFA does not support specific pattern searches. + /// + /// It must also panic if the given haystack range is not valid. 
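To make the iteration protocol above concrete, here is a hedged sketch of a loop that collects every overlapping match by feeding the end of each match back in as the next `start`, while reusing the same `OverlappingState` across calls. The patterns, haystack and expected offsets are illustrative.

```
use regex_automata::{
    dfa::{Automaton, OverlappingState, dense},
    HalfMatch, MatchKind,
};

let dfa = dense::Builder::new()
    .configure(dense::Config::new().match_kind(MatchKind::All))
    .build_many(&["sam", "samwise"])?;
let haystack = "samwise".as_bytes();

let mut state = OverlappingState::start();
let mut start = 0;
let mut matches = vec![];
while let Some(m) = dfa.find_overlapping_fwd_at(
    None, None, haystack, start, haystack.len(), &mut state,
)? {
    // Resume the next search at the end of the match just found. The
    // overlapping state remembers which patterns at this position have
    // already been reported, so nothing is reported twice.
    start = m.offset();
    matches.push(m);
}
// 'sam' (pattern 0) is seen first, then 'samwise' (pattern 1).
assert_eq!(matches, vec![HalfMatch::must(0, 3), HalfMatch::must(1, 7)]);
# Ok::<(), Box<dyn std::error::Error>>(())
```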
+ #[inline] + fn find_overlapping_fwd_at( + &self, + pre: Option<&mut prefilter::Scanner>, + pattern_id: Option, + bytes: &[u8], + start: usize, + end: usize, + state: &mut OverlappingState, + ) -> Result, MatchError> { + search::find_overlapping_fwd( + pre, self, pattern_id, bytes, start, end, state, + ) + } +} + +unsafe impl<'a, T: Automaton> Automaton for &'a T { + #[inline] + fn next_state(&self, current: StateID, input: u8) -> StateID { + (**self).next_state(current, input) + } + + #[inline] + unsafe fn next_state_unchecked( + &self, + current: StateID, + input: u8, + ) -> StateID { + (**self).next_state_unchecked(current, input) + } + + #[inline] + fn next_eoi_state(&self, current: StateID) -> StateID { + (**self).next_eoi_state(current) + } + + #[inline] + fn start_state_forward( + &self, + pattern_id: Option, + bytes: &[u8], + start: usize, + end: usize, + ) -> StateID { + (**self).start_state_forward(pattern_id, bytes, start, end) + } + + #[inline] + fn start_state_reverse( + &self, + pattern_id: Option, + bytes: &[u8], + start: usize, + end: usize, + ) -> StateID { + (**self).start_state_reverse(pattern_id, bytes, start, end) + } + + #[inline] + fn is_special_state(&self, id: StateID) -> bool { + (**self).is_special_state(id) + } + + #[inline] + fn is_dead_state(&self, id: StateID) -> bool { + (**self).is_dead_state(id) + } + + #[inline] + fn is_quit_state(&self, id: StateID) -> bool { + (**self).is_quit_state(id) + } + + #[inline] + fn is_match_state(&self, id: StateID) -> bool { + (**self).is_match_state(id) + } + + #[inline] + fn is_start_state(&self, id: StateID) -> bool { + (**self).is_start_state(id) + } + + #[inline] + fn is_accel_state(&self, id: StateID) -> bool { + (**self).is_accel_state(id) + } + + #[inline] + fn pattern_count(&self) -> usize { + (**self).pattern_count() + } + + #[inline] + fn match_count(&self, id: StateID) -> usize { + (**self).match_count(id) + } + + #[inline] + fn match_pattern(&self, id: StateID, index: usize) -> PatternID { + (**self).match_pattern(id, index) + } + + #[inline] + fn accelerator(&self, id: StateID) -> &[u8] { + (**self).accelerator(id) + } + + #[inline] + fn find_earliest_fwd( + &self, + bytes: &[u8], + ) -> Result, MatchError> { + (**self).find_earliest_fwd(bytes) + } + + #[inline] + fn find_earliest_rev( + &self, + bytes: &[u8], + ) -> Result, MatchError> { + (**self).find_earliest_rev(bytes) + } + + #[inline] + fn find_leftmost_fwd( + &self, + bytes: &[u8], + ) -> Result, MatchError> { + (**self).find_leftmost_fwd(bytes) + } + + #[inline] + fn find_leftmost_rev( + &self, + bytes: &[u8], + ) -> Result, MatchError> { + (**self).find_leftmost_rev(bytes) + } + + #[inline] + fn find_overlapping_fwd( + &self, + bytes: &[u8], + state: &mut OverlappingState, + ) -> Result, MatchError> { + (**self).find_overlapping_fwd(bytes, state) + } + + #[inline] + fn find_earliest_fwd_at( + &self, + pre: Option<&mut prefilter::Scanner>, + pattern_id: Option, + bytes: &[u8], + start: usize, + end: usize, + ) -> Result, MatchError> { + (**self).find_earliest_fwd_at(pre, pattern_id, bytes, start, end) + } + + #[inline] + fn find_earliest_rev_at( + &self, + pattern_id: Option, + bytes: &[u8], + start: usize, + end: usize, + ) -> Result, MatchError> { + (**self).find_earliest_rev_at(pattern_id, bytes, start, end) + } + + #[inline] + fn find_leftmost_fwd_at( + &self, + pre: Option<&mut prefilter::Scanner>, + pattern_id: Option, + bytes: &[u8], + start: usize, + end: usize, + ) -> Result, MatchError> { + (**self).find_leftmost_fwd_at(pre, pattern_id, 
bytes, start, end) + } + + #[inline] + fn find_leftmost_rev_at( + &self, + pattern_id: Option, + bytes: &[u8], + start: usize, + end: usize, + ) -> Result, MatchError> { + (**self).find_leftmost_rev_at(pattern_id, bytes, start, end) + } + + #[inline] + fn find_overlapping_fwd_at( + &self, + pre: Option<&mut prefilter::Scanner>, + pattern_id: Option, + bytes: &[u8], + start: usize, + end: usize, + state: &mut OverlappingState, + ) -> Result, MatchError> { + (**self) + .find_overlapping_fwd_at(pre, pattern_id, bytes, start, end, state) + } +} + +/// Represents the current state of an overlapping search. +/// +/// This is used for overlapping searches since they need to know something +/// about the previous search. For example, when multiple patterns match at the +/// same position, this state tracks the last reported pattern so that the next +/// search knows whether to report another matching pattern or continue with +/// the search at the next position. Additionally, it also tracks which state +/// the last search call terminated in. +/// +/// This type provides no introspection capabilities. The only thing a caller +/// can do is construct it and pass it around to permit search routines to use +/// it to track state. +/// +/// Callers should always provide a fresh state constructed via +/// [`OverlappingState::start`] when starting a new search. Reusing state from +/// a previous search may result in incorrect results. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct OverlappingState { + /// The state ID of the state at which the search was in when the call + /// terminated. When this is a match state, `last_match` must be set to a + /// non-None value. + /// + /// A `None` value indicates the start state of the corresponding + /// automaton. We cannot use the actual ID, since any one automaton may + /// have many start states, and which one is in use depends on several + /// search-time factors. + id: Option, + /// Information associated with a match when `id` corresponds to a match + /// state. + last_match: Option, +} + +/// Internal state about the last match that occurred. This records both the +/// offset of the match and the match index. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub(crate) struct StateMatch { + /// The index into the matching patterns for the current match state. + pub(crate) match_index: usize, + /// The offset in the haystack at which the match occurred. This is used + /// when reporting multiple matches at the same offset. That is, when + /// an overlapping search runs, the first thing it checks is whether it's + /// already in a match state, and if so, whether there are more patterns + /// to report as matches in that state. If so, it increments `match_index` + /// and returns the pattern and this offset. Once `match_index` exceeds the + /// number of matching patterns in the current state, the search continues. + pub(crate) offset: usize, +} + +impl OverlappingState { + /// Create a new overlapping state that begins at the start state of any + /// automaton. + pub fn start() -> OverlappingState { + OverlappingState { id: None, last_match: None } + } + + pub(crate) fn id(&self) -> Option { + self.id + } + + pub(crate) fn set_id(&mut self, id: StateID) { + self.id = Some(id); + } + + pub(crate) fn last_match(&mut self) -> Option<&mut StateMatch> { + self.last_match.as_mut() + } + + pub(crate) fn set_last_match(&mut self, last_match: StateMatch) { + self.last_match = Some(last_match); + } +} + +/// Write a prefix "state" indicator for fmt::Debug impls. 
+/// +/// Specifically, this tries to succinctly distinguish the different types of +/// states: dead states, quit states, accelerated states, start states and +/// match states. It even accounts for the possible overlappings of different +/// state types. +pub(crate) fn fmt_state_indicator( + f: &mut core::fmt::Formatter<'_>, + dfa: A, + id: StateID, +) -> core::fmt::Result { + if dfa.is_dead_state(id) { + write!(f, "D")?; + if dfa.is_start_state(id) { + write!(f, ">")?; + } else { + write!(f, " ")?; + } + } else if dfa.is_quit_state(id) { + write!(f, "Q ")?; + } else if dfa.is_start_state(id) { + if dfa.is_accel_state(id) { + write!(f, "A>")?; + } else { + write!(f, " >")?; + } + } else if dfa.is_match_state(id) { + if dfa.is_accel_state(id) { + write!(f, "A*")?; + } else { + write!(f, " *")?; + } + } else if dfa.is_accel_state(id) { + write!(f, "A ")?; + } else { + write!(f, " ")?; + } + Ok(()) +} diff --git a/vendor/regex-automata/src/dfa/dense.rs b/vendor/regex-automata/src/dfa/dense.rs new file mode 100644 index 000000000..07c135098 --- /dev/null +++ b/vendor/regex-automata/src/dfa/dense.rs @@ -0,0 +1,4470 @@ +/*! +Types and routines specific to dense DFAs. + +This module is the home of [`dense::DFA`](DFA). + +This module also contains a [`dense::Builder`](Builder) and a +[`dense::Config`](Config) for configuring and building a dense DFA. +*/ + +#[cfg(feature = "alloc")] +use core::cmp; +use core::{convert::TryFrom, fmt, iter, mem::size_of, slice}; + +#[cfg(feature = "alloc")] +use alloc::{ + collections::{BTreeMap, BTreeSet}, + vec, + vec::Vec, +}; + +#[cfg(feature = "alloc")] +use crate::{ + dfa::{ + accel::Accel, determinize, error::Error, minimize::Minimizer, sparse, + }, + nfa::thompson, + util::alphabet::ByteSet, + MatchKind, +}; +use crate::{ + dfa::{ + accel::Accels, + automaton::{fmt_state_indicator, Automaton}, + special::Special, + DEAD, + }, + util::{ + alphabet::{self, ByteClasses}, + bytes::{self, DeserializeError, Endian, SerializeError}, + id::{PatternID, StateID}, + start::Start, + }, +}; + +/// The label that is pre-pended to a serialized DFA. +const LABEL: &str = "rust-regex-automata-dfa-dense"; + +/// The format version of dense regexes. This version gets incremented when a +/// change occurs. A change may not necessarily be a breaking change, but the +/// version does permit good error messages in the case where a breaking change +/// is made. +const VERSION: u32 = 2; + +/// The configuration used for compiling a dense DFA. +/// +/// A dense DFA configuration is a simple data object that is typically used +/// with [`dense::Builder::configure`](self::Builder::configure). +/// +/// The default configuration guarantees that a search will _never_ return a +/// [`MatchError`](crate::MatchError) for any haystack or pattern. Setting a +/// quit byte with [`Config::quit`] or enabling heuristic support for Unicode +/// word boundaries with [`Config::unicode_word_boundary`] can in turn cause a +/// search to return an error. See the corresponding configuration options for +/// more details on when those error conditions arise. +#[cfg(feature = "alloc")] +#[derive(Clone, Copy, Debug, Default)] +pub struct Config { + // As with other configuration types in this crate, we put all our knobs + // in options so that we can distinguish between "default" and "not set." + // This makes it possible to easily combine multiple configurations + // without default values overwriting explicitly specified values. See the + // 'overwrite' method. 
+ // + // For docs on the fields below, see the corresponding method setters. + anchored: Option, + accelerate: Option, + minimize: Option, + match_kind: Option, + starts_for_each_pattern: Option, + byte_classes: Option, + unicode_word_boundary: Option, + quit: Option, + dfa_size_limit: Option>, + determinize_size_limit: Option>, +} + +#[cfg(feature = "alloc")] +impl Config { + /// Return a new default dense DFA compiler configuration. + pub fn new() -> Config { + Config::default() + } + + /// Set whether matching must be anchored at the beginning of the input. + /// + /// When enabled, a match must begin at the start of a search. When + /// disabled, the DFA will act as if the pattern started with a `(?s:.)*?`, + /// which enables a match to appear anywhere. + /// + /// Note that if you want to run both anchored and unanchored + /// searches without building multiple automatons, you can enable the + /// [`Config::starts_for_each_pattern`] configuration instead. This will + /// permit unanchored any-pattern searches and pattern-specific anchored + /// searches. See the documentation for that configuration for an example. + /// + /// By default this is disabled. + /// + /// **WARNING:** this is subtly different than using a `^` at the start of + /// your regex. A `^` forces a regex to match exclusively at the start of + /// input, regardless of where you begin your search. In contrast, enabling + /// this option will allow your regex to match anywhere in your input, + /// but the match must start at the beginning of a search. (Most of the + /// higher level convenience search routines make "start of input" and + /// "start of search" equivalent, but some routines allow treating these as + /// orthogonal.) + /// + /// For example, consider the haystack `aba` and the following searches: + /// + /// 1. The regex `^a` is compiled with `anchored=false` and searches + /// `aba` starting at position `2`. Since `^` requires the match to + /// start at the beginning of the input and `2 > 0`, no match is found. + /// 2. The regex `a` is compiled with `anchored=true` and searches `aba` + /// starting at position `2`. This reports a match at `[2, 3]` since + /// the match starts where the search started. Since there is no `^`, + /// there is no requirement for the match to start at the beginning of + /// the input. + /// 3. The regex `a` is compiled with `anchored=true` and searches `aba` + /// starting at position `1`. Since `b` corresponds to position `1` and + /// since the regex is anchored, it finds no match. + /// 4. The regex `a` is compiled with `anchored=false` and searches `aba` + /// startting at position `1`. Since the regex is neither anchored nor + /// starts with `^`, the regex is compiled with an implicit `(?s:.)*?` + /// prefix that permits it to match anywhere. Thus, it reports a match + /// at `[2, 3]`. + /// + /// # Example + /// + /// This demonstrates the differences between an anchored search and + /// a pattern that begins with `^` (as described in the above warning + /// message). + /// + /// ``` + /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch}; + /// + /// let haystack = "aba".as_bytes(); + /// + /// let dfa = dense::Builder::new() + /// .configure(dense::Config::new().anchored(false)) // default + /// .build(r"^a")?; + /// let got = dfa.find_leftmost_fwd_at(None, None, haystack, 2, 3)?; + /// // No match is found because 2 is not the beginning of the haystack, + /// // which is what ^ requires. 
+ /// let expected = None; + /// assert_eq!(expected, got); + /// + /// let dfa = dense::Builder::new() + /// .configure(dense::Config::new().anchored(true)) + /// .build(r"a")?; + /// let got = dfa.find_leftmost_fwd_at(None, None, haystack, 2, 3)?; + /// // An anchored search can still match anywhere in the haystack, it just + /// // must begin at the start of the search which is '2' in this case. + /// let expected = Some(HalfMatch::must(0, 3)); + /// assert_eq!(expected, got); + /// + /// let dfa = dense::Builder::new() + /// .configure(dense::Config::new().anchored(true)) + /// .build(r"a")?; + /// let got = dfa.find_leftmost_fwd_at(None, None, haystack, 1, 3)?; + /// // No match is found since we start searching at offset 1 which + /// // corresponds to 'b'. Since there is no '(?s:.)*?' prefix, no match + /// // is found. + /// let expected = None; + /// assert_eq!(expected, got); + /// + /// let dfa = dense::Builder::new() + /// .configure(dense::Config::new().anchored(false)) // default + /// .build(r"a")?; + /// let got = dfa.find_leftmost_fwd_at(None, None, haystack, 1, 3)?; + /// // Since anchored=false, an implicit '(?s:.)*?' prefix was added to the + /// // pattern. Even though the search starts at 'b', the 'match anything' + /// // prefix allows the search to match 'a'. + /// let expected = Some(HalfMatch::must(0, 3)); + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box>(()) + /// ``` + pub fn anchored(mut self, yes: bool) -> Config { + self.anchored = Some(yes); + self + } + + /// Enable state acceleration. + /// + /// When enabled, DFA construction will analyze each state to determine + /// whether it is eligible for simple acceleration. Acceleration typically + /// occurs when most of a state's transitions loop back to itself, leaving + /// only a select few bytes that will exit the state. When this occurs, + /// other routines like `memchr` can be used to look for those bytes which + /// may be much faster than traversing the DFA. + /// + /// Callers may elect to disable this if consistent performance is more + /// desirable than variable performance. Namely, acceleration can sometimes + /// make searching slower than it otherwise would be if the transitions + /// that leave accelerated states are traversed frequently. + /// + /// See [`Automaton::accelerator`](crate::dfa::Automaton::accelerator) for + /// an example. + /// + /// This is enabled by default. + pub fn accelerate(mut self, yes: bool) -> Config { + self.accelerate = Some(yes); + self + } + + /// Minimize the DFA. + /// + /// When enabled, the DFA built will be minimized such that it is as small + /// as possible. + /// + /// Whether one enables minimization or not depends on the types of costs + /// you're willing to pay and how much you care about its benefits. In + /// particular, minimization has worst case `O(n*k*logn)` time and `O(k*n)` + /// space, where `n` is the number of DFA states and `k` is the alphabet + /// size. In practice, minimization can be quite costly in terms of both + /// space and time, so it should only be done if you're willing to wait + /// longer to produce a DFA. In general, you might want a minimal DFA in + /// the following circumstances: + /// + /// 1. You would like to optimize for the size of the automaton. This can + /// manifest in one of two ways. 
Firstly, if you're converting the + /// DFA into Rust code (or a table embedded in the code), then a minimal + /// DFA will translate into a corresponding reduction in code size, and + /// thus, also the final compiled binary size. Secondly, if you are + /// building many DFAs and putting them on the heap, you'll be able to + /// fit more if they are smaller. Note though that building a minimal + /// DFA itself requires additional space; you only realize the space + /// savings once the minimal DFA is constructed (at which point, the + /// space used for minimization is freed). + /// 2. You've observed that a smaller DFA results in faster match + /// performance. Naively, this isn't guaranteed since there is no + /// inherent difference between matching with a bigger-than-minimal + /// DFA and a minimal DFA. However, a smaller DFA may make use of your + /// CPU's cache more efficiently. + /// 3. You are trying to establish an equivalence between regular + /// languages. The standard method for this is to build a minimal DFA + /// for each language and then compare them. If the DFAs are equivalent + /// (up to state renaming), then the languages are equivalent. + /// + /// Typically, minimization only makes sense as an offline process. That + /// is, one might minimize a DFA before serializing it to persistent + /// storage. In practical terms, minimization can take around an order of + /// magnitude more time than compiling the initial DFA via determinization. + /// + /// This option is disabled by default. + pub fn minimize(mut self, yes: bool) -> Config { + self.minimize = Some(yes); + self + } + + /// Set the desired match semantics. + /// + /// The default is [`MatchKind::LeftmostFirst`], which corresponds to the + /// match semantics of Perl-like regex engines. That is, when multiple + /// patterns would match at the same leftmost position, the pattern that + /// appears first in the concrete syntax is chosen. + /// + /// Currently, the only other kind of match semantics supported is + /// [`MatchKind::All`]. This corresponds to classical DFA construction + /// where all possible matches are added to the DFA. + /// + /// Typically, `All` is used when one wants to execute an overlapping + /// search and `LeftmostFirst` otherwise. In particular, it rarely makes + /// sense to use `All` with the various "leftmost" find routines, since the + /// leftmost routines depend on the `LeftmostFirst` automata construction + /// strategy. Specifically, `LeftmostFirst` adds dead states to the DFA + /// as a way to terminate the search and report a match. `LeftmostFirst` + /// also supports non-greedy matches using this strategy where as `All` + /// does not. + /// + /// # Example: overlapping search + /// + /// This example shows the typical use of `MatchKind::All`, which is to + /// report overlapping matches. + /// + /// ``` + /// use regex_automata::{ + /// dfa::{Automaton, OverlappingState, dense}, + /// HalfMatch, MatchKind, + /// }; + /// + /// let dfa = dense::Builder::new() + /// .configure(dense::Config::new().match_kind(MatchKind::All)) + /// .build_many(&[r"\w+$", r"\S+$"])?; + /// let haystack = "@foo".as_bytes(); + /// let mut state = OverlappingState::start(); + /// + /// let expected = Some(HalfMatch::must(1, 4)); + /// let got = dfa.find_overlapping_fwd(haystack, &mut state)?; + /// assert_eq!(expected, got); + /// + /// // The first pattern also matches at the same position, so re-running + /// // the search will yield another match. 
Notice also that the first + /// // pattern is returned after the second. This is because the second + /// // pattern begins its match before the first, is therefore an earlier + /// // match and is thus reported first. + /// let expected = Some(HalfMatch::must(0, 4)); + /// let got = dfa.find_overlapping_fwd(haystack, &mut state)?; + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box>(()) + /// ``` + /// + /// # Example: reverse automaton to find start of match + /// + /// Another example for using `MatchKind::All` is for constructing a + /// reverse automaton to find the start of a match. `All` semantics are + /// used for this in order to find the longest possible match, which + /// corresponds to the leftmost starting position. + /// + /// Note that if you need the starting position then + /// [`dfa::regex::Regex`](crate::dfa::regex::Regex) will handle this for + /// you, so it's usually not necessary to do this yourself. + /// + /// ``` + /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch, MatchKind}; + /// + /// let haystack = "123foobar456".as_bytes(); + /// let pattern = r"[a-z]+"; + /// + /// let dfa_fwd = dense::DFA::new(pattern)?; + /// let dfa_rev = dense::Builder::new() + /// .configure(dense::Config::new() + /// .anchored(true) + /// .match_kind(MatchKind::All) + /// ) + /// .build(pattern)?; + /// let expected_fwd = HalfMatch::must(0, 9); + /// let expected_rev = HalfMatch::must(0, 3); + /// let got_fwd = dfa_fwd.find_leftmost_fwd(haystack)?.unwrap(); + /// // Here we don't specify the pattern to search for since there's only + /// // one pattern and we're doing a leftmost search. But if this were an + /// // overlapping search, you'd need to specify the pattern that matched + /// // in the forward direction. (Otherwise, you might wind up finding the + /// // starting position of a match of some other pattern.) That in turn + /// // requires building the reverse automaton with starts_for_each_pattern + /// // enabled. Indeed, this is what Regex does internally. + /// let got_rev = dfa_rev.find_leftmost_rev_at( + /// None, haystack, 0, got_fwd.offset(), + /// )?.unwrap(); + /// assert_eq!(expected_fwd, got_fwd); + /// assert_eq!(expected_rev, got_rev); + /// + /// # Ok::<(), Box>(()) + /// ``` + pub fn match_kind(mut self, kind: MatchKind) -> Config { + self.match_kind = Some(kind); + self + } + + /// Whether to compile a separate start state for each pattern in the + /// automaton. + /// + /// When enabled, a separate **anchored** start state is added for each + /// pattern in the DFA. When this start state is used, then the DFA will + /// only search for matches for the pattern specified, even if there are + /// other patterns in the DFA. + /// + /// The main downside of this option is that it can potentially increase + /// the size of the DFA and/or increase the time it takes to build the DFA. + /// + /// There are a few reasons one might want to enable this (it's disabled + /// by default): + /// + /// 1. When looking for the start of an overlapping match (using a + /// reverse DFA), doing it correctly requires starting the reverse search + /// using the starting state of the pattern that matched in the forward + /// direction. Indeed, when building a [`Regex`](crate::dfa::regex::Regex), + /// it will automatically enable this option when building the reverse DFA + /// internally. + /// 2. 
When you want to use a DFA with multiple patterns to both search + /// for matches of any pattern or to search for anchored matches of one + /// particular pattern while using the same DFA. (Otherwise, you would need + /// to compile a new DFA for each pattern.) + /// 3. Since the start states added for each pattern are anchored, if you + /// compile an unanchored DFA with one pattern while also enabling this + /// option, then you can use the same DFA to perform anchored or unanchored + /// searches. The latter you get with the standard search APIs. The former + /// you get from the various `_at` search methods that allow you specify a + /// pattern ID to search for. + /// + /// By default this is disabled. + /// + /// # Example + /// + /// This example shows how to use this option to permit the same DFA to + /// run both anchored and unanchored searches for a single pattern. + /// + /// ``` + /// use regex_automata::{ + /// dfa::{Automaton, dense}, + /// HalfMatch, PatternID, + /// }; + /// + /// let dfa = dense::Builder::new() + /// .configure(dense::Config::new().starts_for_each_pattern(true)) + /// .build(r"foo[0-9]+")?; + /// let haystack = b"quux foo123"; + /// + /// // Here's a normal unanchored search. Notice that we use 'None' for the + /// // pattern ID. Since the DFA was built as an unanchored machine, it + /// // use its default unanchored starting state. + /// let expected = HalfMatch::must(0, 11); + /// assert_eq!(Some(expected), dfa.find_leftmost_fwd_at( + /// None, None, haystack, 0, haystack.len(), + /// )?); + /// // But now if we explicitly specify the pattern to search ('0' being + /// // the only pattern in the DFA), then it will use the starting state + /// // for that specific pattern which is always anchored. Since the + /// // pattern doesn't have a match at the beginning of the haystack, we + /// // find nothing. + /// assert_eq!(None, dfa.find_leftmost_fwd_at( + /// None, Some(PatternID::must(0)), haystack, 0, haystack.len(), + /// )?); + /// // And finally, an anchored search is not the same as putting a '^' at + /// // beginning of the pattern. An anchored search can only match at the + /// // beginning of the *search*, which we can change: + /// assert_eq!(Some(expected), dfa.find_leftmost_fwd_at( + /// None, Some(PatternID::must(0)), haystack, 5, haystack.len(), + /// )?); + /// + /// # Ok::<(), Box>(()) + /// ``` + pub fn starts_for_each_pattern(mut self, yes: bool) -> Config { + self.starts_for_each_pattern = Some(yes); + self + } + + /// Whether to attempt to shrink the size of the DFA's alphabet or not. + /// + /// This option is enabled by default and should never be disabled unless + /// one is debugging a generated DFA. + /// + /// When enabled, the DFA will use a map from all possible bytes to their + /// corresponding equivalence class. Each equivalence class represents a + /// set of bytes that does not discriminate between a match and a non-match + /// in the DFA. For example, the pattern `[ab]+` has at least two + /// equivalence classes: a set containing `a` and `b` and a set containing + /// every byte except for `a` and `b`. `a` and `b` are in the same + /// equivalence classes because they never discriminate between a match + /// and a non-match. + /// + /// The advantage of this map is that the size of the transition table + /// can be reduced drastically from `#states * 256 * sizeof(StateID)` to + /// `#states * k * sizeof(StateID)` where `k` is the number of equivalence + /// classes (rounded up to the nearest power of 2). 
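For a rough sense of scale, a hedged back-of-the-envelope sketch using the formula above (the state count, the 4-byte state ID size and the class count of 8 are illustrative numbers, not measurements):

```
// 1,000 DFA states, 4-byte state IDs, and bytes that collapse into 8
// equivalence classes.
let states = 1_000usize;
let id_size = core::mem::size_of::<u32>();

let without_classes = states * 256 * id_size; // 1,024,000 bytes (~1MB)
let with_classes = states * 8 * id_size; // 32,000 bytes (~32KB)

assert_eq!(1_024_000, without_classes);
assert_eq!(32_000, with_classes);
```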
As a result, total + /// space usage can decrease substantially. Moreover, since a smaller + /// alphabet is used, DFA compilation becomes faster as well. + /// + /// **WARNING:** This is only useful for debugging DFAs. Disabling this + /// does not yield any speed advantages. Namely, even when this is + /// disabled, a byte class map is still used while searching. The only + /// difference is that every byte will be forced into its own distinct + /// equivalence class. This is useful for debugging the actual generated + /// transitions because it lets one see the transitions defined on actual + /// bytes instead of the equivalence classes. + pub fn byte_classes(mut self, yes: bool) -> Config { + self.byte_classes = Some(yes); + self + } + + /// Heuristically enable Unicode word boundaries. + /// + /// When set, this will attempt to implement Unicode word boundaries as if + /// they were ASCII word boundaries. This only works when the search input + /// is ASCII only. If a non-ASCII byte is observed while searching, then a + /// [`MatchError::Quit`](crate::MatchError::Quit) error is returned. + /// + /// A possible alternative to enabling this option is to simply use an + /// ASCII word boundary, e.g., via `(?-u:\b)`. The main reason to use this + /// option is if you absolutely need Unicode support. This option lets one + /// use a fast search implementation (a DFA) for some potentially very + /// common cases, while providing the option to fall back to some other + /// regex engine to handle the general case when an error is returned. + /// + /// If the pattern provided has no Unicode word boundary in it, then this + /// option has no effect. (That is, quitting on a non-ASCII byte only + /// occurs when this option is enabled _and_ a Unicode word boundary is + /// present in the pattern.) + /// + /// This is almost equivalent to setting all non-ASCII bytes to be quit + /// bytes. The only difference is that this will cause non-ASCII bytes to + /// be quit bytes _only_ when a Unicode word boundary is present in the + /// pattern. + /// + /// When enabling this option, callers _must_ be prepared to handle + /// a [`MatchError`](crate::MatchError) error during search. + /// When using a [`Regex`](crate::dfa::regex::Regex), this corresponds + /// to using the `try_` suite of methods. Alternatively, if + /// callers can guarantee that their input is ASCII only, then a + /// [`MatchError::Quit`](crate::MatchError::Quit) error will never be + /// returned while searching. + /// + /// This is disabled by default. + /// + /// # Example + /// + /// This example shows how to heuristically enable Unicode word boundaries + /// in a pattern. It also shows what happens when a search comes across a + /// non-ASCII byte. + /// + /// ``` + /// use regex_automata::{ + /// dfa::{Automaton, dense}, + /// HalfMatch, MatchError, MatchKind, + /// }; + /// + /// let dfa = dense::Builder::new() + /// .configure(dense::Config::new().unicode_word_boundary(true)) + /// .build(r"\b[0-9]+\b")?; + /// + /// // The match occurs before the search ever observes the snowman + /// // character, so no error occurs. + /// let haystack = "foo 123 ☃".as_bytes(); + /// let expected = Some(HalfMatch::must(0, 7)); + /// let got = dfa.find_leftmost_fwd(haystack)?; + /// assert_eq!(expected, got); + /// + /// // Notice that this search fails, even though the snowman character + /// // occurs after the ending match offset. 
This is because search + /// // routines read one byte past the end of the search to account for + /// // look-around, and indeed, this is required here to determine whether + /// // the trailing \b matches. + /// let haystack = "foo 123☃".as_bytes(); + /// let expected = MatchError::Quit { byte: 0xE2, offset: 7 }; + /// let got = dfa.find_leftmost_fwd(haystack); + /// assert_eq!(Err(expected), got); + /// + /// # Ok::<(), Box>(()) + /// ``` + pub fn unicode_word_boundary(mut self, yes: bool) -> Config { + // We have a separate option for this instead of just setting the + // appropriate quit bytes here because we don't want to set quit bytes + // for every regex. We only want to set them when the regex contains a + // Unicode word boundary. + self.unicode_word_boundary = Some(yes); + self + } + + /// Add a "quit" byte to the DFA. + /// + /// When a quit byte is seen during search time, then search will return + /// a [`MatchError::Quit`](crate::MatchError::Quit) error indicating the + /// offset at which the search stopped. + /// + /// A quit byte will always overrule any other aspects of a regex. For + /// example, if the `x` byte is added as a quit byte and the regex `\w` is + /// used, then observing `x` will cause the search to quit immediately + /// despite the fact that `x` is in the `\w` class. + /// + /// This mechanism is primarily useful for heuristically enabling certain + /// features like Unicode word boundaries in a DFA. Namely, if the input + /// to search is ASCII, then a Unicode word boundary can be implemented + /// via an ASCII word boundary with no change in semantics. Thus, a DFA + /// can attempt to match a Unicode word boundary but give up as soon as it + /// observes a non-ASCII byte. Indeed, if callers set all non-ASCII bytes + /// to be quit bytes, then Unicode word boundaries will be permitted when + /// building DFAs. Of course, callers should enable + /// [`Config::unicode_word_boundary`] if they want this behavior instead. + /// (The advantage being that non-ASCII quit bytes will only be added if a + /// Unicode word boundary is in the pattern.) + /// + /// When enabling this option, callers _must_ be prepared to handle a + /// [`MatchError`](crate::MatchError) error during search. When using a + /// [`Regex`](crate::dfa::regex::Regex), this corresponds to using the + /// `try_` suite of methods. + /// + /// By default, there are no quit bytes set. + /// + /// # Panics + /// + /// This panics if heuristic Unicode word boundaries are enabled and any + /// non-ASCII byte is removed from the set of quit bytes. Namely, enabling + /// Unicode word boundaries requires setting every non-ASCII byte to a quit + /// byte. So if the caller attempts to undo any of that, then this will + /// panic. + /// + /// # Example + /// + /// This example shows how to cause a search to terminate if it sees a + /// `\n` byte. This could be useful if, for example, you wanted to prevent + /// a user supplied pattern from matching across a line boundary. + /// + /// ``` + /// use regex_automata::{ + /// dfa::{Automaton, dense}, + /// HalfMatch, MatchError, + /// }; + /// + /// let dfa = dense::Builder::new() + /// .configure(dense::Config::new().quit(b'\n', true)) + /// .build(r"foo\p{any}+bar")?; + /// + /// let haystack = "foo\nbar".as_bytes(); + /// // Normally this would produce a match, since \p{any} contains '\n'. + /// // But since we instructed the automaton to enter a quit state if a + /// // '\n' is observed, this produces a match error instead. 
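+    /// // (0x0A is the byte value of '\n', and 3 is the offset of that byte
+    /// // in "foo\nbar", i.e., the position at which the search stopped.)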
+ /// let expected = MatchError::Quit { byte: 0x0A, offset: 3 }; + /// let got = dfa.find_leftmost_fwd(haystack).unwrap_err(); + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box>(()) + /// ``` + pub fn quit(mut self, byte: u8, yes: bool) -> Config { + if self.get_unicode_word_boundary() && !byte.is_ascii() && !yes { + panic!( + "cannot set non-ASCII byte to be non-quit when \ + Unicode word boundaries are enabled" + ); + } + if self.quit.is_none() { + self.quit = Some(ByteSet::empty()); + } + if yes { + self.quit.as_mut().unwrap().add(byte); + } else { + self.quit.as_mut().unwrap().remove(byte); + } + self + } + + /// Set a size limit on the total heap used by a DFA. + /// + /// This size limit is expressed in bytes and is applied during + /// determinization of an NFA into a DFA. If the DFA's heap usage, and only + /// the DFA, exceeds this configured limit, then determinization is stopped + /// and an error is returned. + /// + /// This limit does not apply to auxiliary storage used during + /// determinization that isn't part of the generated DFA. + /// + /// This limit is only applied during determinization. Currently, there is + /// no way to post-pone this check to after minimization if minimization + /// was enabled. + /// + /// The total limit on heap used during determinization is the sum of the + /// DFA and determinization size limits. + /// + /// The default is no limit. + /// + /// # Example + /// + /// This example shows a DFA that fails to build because of a configured + /// size limit. This particular example also serves as a cautionary tale + /// demonstrating just how big DFAs with large Unicode character classes + /// can get. + /// + /// ``` + /// use regex_automata::dfa::{dense, Automaton}; + /// + /// // 3MB isn't enough! + /// dense::Builder::new() + /// .configure(dense::Config::new().dfa_size_limit(Some(3_000_000))) + /// .build(r"\w{20}") + /// .unwrap_err(); + /// + /// // ... but 4MB probably is! + /// // (Note that DFA sizes aren't necessarily stable between releases.) + /// let dfa = dense::Builder::new() + /// .configure(dense::Config::new().dfa_size_limit(Some(4_000_000))) + /// .build(r"\w{20}")?; + /// let haystack = "A".repeat(20).into_bytes(); + /// assert!(dfa.find_leftmost_fwd(&haystack)?.is_some()); + /// + /// # Ok::<(), Box>(()) + /// ``` + /// + /// While one needs a little more than 3MB to represent `\w{20}`, it + /// turns out that you only need a little more than 4KB to represent + /// `(?-u:\w{20})`. So only use Unicode if you need it! + pub fn dfa_size_limit(mut self, bytes: Option) -> Config { + self.dfa_size_limit = Some(bytes); + self + } + + /// Set a size limit on the total heap used by determinization. + /// + /// This size limit is expressed in bytes and is applied during + /// determinization of an NFA into a DFA. If the heap used for auxiliary + /// storage during determinization (memory that is not in the DFA but + /// necessary for building the DFA) exceeds this configured limit, then + /// determinization is stopped and an error is returned. + /// + /// This limit does not apply to heap used by the DFA itself. + /// + /// The total limit on heap used during determinization is the sum of the + /// DFA and determinization size limits. + /// + /// The default is no limit. + /// + /// # Example + /// + /// This example shows a DFA that fails to build because of a + /// configured size limit on the amount of heap space used by + /// determinization. 
This particular example complements the example for + /// [`Config::dfa_size_limit`] by demonstrating that not only does Unicode + /// potentially make DFAs themselves big, but it also results in more + /// auxiliary storage during determinization. (Although, auxiliary storage + /// is still not as much as the DFA itself.) + /// + /// ``` + /// use regex_automata::dfa::{dense, Automaton}; + /// + /// // 300KB isn't enough! + /// dense::Builder::new() + /// .configure(dense::Config::new() + /// .determinize_size_limit(Some(300_000)) + /// ) + /// .build(r"\w{20}") + /// .unwrap_err(); + /// + /// // ... but 400KB probably is! + /// // (Note that auxiliary storage sizes aren't necessarily stable between + /// // releases.) + /// let dfa = dense::Builder::new() + /// .configure(dense::Config::new() + /// .determinize_size_limit(Some(400_000)) + /// ) + /// .build(r"\w{20}")?; + /// let haystack = "A".repeat(20).into_bytes(); + /// assert!(dfa.find_leftmost_fwd(&haystack)?.is_some()); + /// + /// # Ok::<(), Box>(()) + /// ``` + pub fn determinize_size_limit(mut self, bytes: Option) -> Config { + self.determinize_size_limit = Some(bytes); + self + } + + /// Returns whether this configuration has enabled anchored searches. + pub fn get_anchored(&self) -> bool { + self.anchored.unwrap_or(false) + } + + /// Returns whether this configuration has enabled simple state + /// acceleration. + pub fn get_accelerate(&self) -> bool { + self.accelerate.unwrap_or(true) + } + + /// Returns whether this configuration has enabled the expensive process + /// of minimizing a DFA. + pub fn get_minimize(&self) -> bool { + self.minimize.unwrap_or(false) + } + + /// Returns the match semantics set in this configuration. + pub fn get_match_kind(&self) -> MatchKind { + self.match_kind.unwrap_or(MatchKind::LeftmostFirst) + } + + /// Returns whether this configuration has enabled anchored starting states + /// for every pattern in the DFA. + pub fn get_starts_for_each_pattern(&self) -> bool { + self.starts_for_each_pattern.unwrap_or(false) + } + + /// Returns whether this configuration has enabled byte classes or not. + /// This is typically a debugging oriented option, as disabling it confers + /// no speed benefit. + pub fn get_byte_classes(&self) -> bool { + self.byte_classes.unwrap_or(true) + } + + /// Returns whether this configuration has enabled heuristic Unicode word + /// boundary support. When enabled, it is possible for a search to return + /// an error. + pub fn get_unicode_word_boundary(&self) -> bool { + self.unicode_word_boundary.unwrap_or(false) + } + + /// Returns whether this configuration will instruct the DFA to enter a + /// quit state whenever the given byte is seen during a search. When at + /// least one byte has this enabled, it is possible for a search to return + /// an error. + pub fn get_quit(&self, byte: u8) -> bool { + self.quit.map_or(false, |q| q.contains(byte)) + } + + /// Returns the DFA size limit of this configuration if one was set. + /// The size limit is total number of bytes on the heap that a DFA is + /// permitted to use. If the DFA exceeds this limit during construction, + /// then construction is stopped and an error is returned. + pub fn get_dfa_size_limit(&self) -> Option { + self.dfa_size_limit.unwrap_or(None) + } + + /// Returns the determinization size limit of this configuration if one + /// was set. The size limit is total number of bytes on the heap that + /// determinization is permitted to use. 
If determinization exceeds this + /// limit during construction, then construction is stopped and an error is + /// returned. + /// + /// This is different from the DFA size limit in that this only applies to + /// the auxiliary storage used during determinization. Once determinization + /// is complete, this memory is freed. + /// + /// The limit on the total heap memory used is the sum of the DFA and + /// determinization size limits. + pub fn get_determinize_size_limit(&self) -> Option { + self.determinize_size_limit.unwrap_or(None) + } + + /// Overwrite the default configuration such that the options in `o` are + /// always used. If an option in `o` is not set, then the corresponding + /// option in `self` is used. If it's not set in `self` either, then it + /// remains not set. + pub(crate) fn overwrite(self, o: Config) -> Config { + Config { + anchored: o.anchored.or(self.anchored), + accelerate: o.accelerate.or(self.accelerate), + minimize: o.minimize.or(self.minimize), + match_kind: o.match_kind.or(self.match_kind), + starts_for_each_pattern: o + .starts_for_each_pattern + .or(self.starts_for_each_pattern), + byte_classes: o.byte_classes.or(self.byte_classes), + unicode_word_boundary: o + .unicode_word_boundary + .or(self.unicode_word_boundary), + quit: o.quit.or(self.quit), + dfa_size_limit: o.dfa_size_limit.or(self.dfa_size_limit), + determinize_size_limit: o + .determinize_size_limit + .or(self.determinize_size_limit), + } + } +} + +/// A builder for constructing a deterministic finite automaton from regular +/// expressions. +/// +/// This builder provides two main things: +/// +/// 1. It provides a few different `build` routines for actually constructing +/// a DFA from different kinds of inputs. The most convenient is +/// [`Builder::build`], which builds a DFA directly from a pattern string. The +/// most flexible is [`Builder::build_from_nfa`], which builds a DFA straight +/// from an NFA. +/// 2. The builder permits configuring a number of things. +/// [`Builder::configure`] is used with [`Config`] to configure aspects of +/// the DFA and the construction process itself. [`Builder::syntax`] and +/// [`Builder::thompson`] permit configuring the regex parser and Thompson NFA +/// construction, respectively. The syntax and thompson configurations only +/// apply when building from a pattern string. +/// +/// This builder always constructs a *single* DFA. As such, this builder +/// can only be used to construct regexes that either detect the presence +/// of a match or find the end location of a match. A single DFA cannot +/// produce both the start and end of a match. For that information, use a +/// [`Regex`](crate::dfa::regex::Regex), which can be similarly configured +/// using [`regex::Builder`](crate::dfa::regex::Builder). The main reason to +/// use a DFA directly is if the end location of a match is enough for your use +/// case. Namely, a `Regex` will construct two DFAs instead of one, since a +/// second reverse DFA is needed to find the start of a match. +/// +/// Note that if one wants to build a sparse DFA, you must first build a dense +/// DFA and convert that to a sparse DFA. There is no way to build a sparse +/// DFA without first building a dense DFA. +/// +/// # Example +/// +/// This example shows how to build a minimized DFA that completely disables +/// Unicode. That is: +/// +/// * Things such as `\w`, `.` and `\b` are no longer Unicode-aware. 
`\w` +/// and `\b` are ASCII-only while `.` matches any byte except for `\n` +/// (instead of any UTF-8 encoding of a Unicode scalar value except for +/// `\n`). Things that are Unicode only, such as `\pL`, are not allowed. +/// * The pattern itself is permitted to match invalid UTF-8. For example, +/// things like `[^a]` that match any byte except for `a` are permitted. +/// * Unanchored patterns can search through invalid UTF-8. That is, for +/// unanchored patterns, the implicit prefix is `(?s-u:.)*?` instead of +/// `(?s:.)*?`. +/// +/// ``` +/// use regex_automata::{ +/// dfa::{Automaton, dense}, +/// nfa::thompson, +/// HalfMatch, SyntaxConfig, +/// }; +/// +/// let dfa = dense::Builder::new() +/// .configure(dense::Config::new().minimize(false)) +/// .syntax(SyntaxConfig::new().unicode(false).utf8(false)) +/// .thompson(thompson::Config::new().utf8(false)) +/// .build(r"foo[^b]ar.*")?; +/// +/// let haystack = b"\xFEfoo\xFFar\xE2\x98\xFF\n"; +/// let expected = Some(HalfMatch::must(0, 10)); +/// let got = dfa.find_leftmost_fwd(haystack)?; +/// assert_eq!(expected, got); +/// +/// # Ok::<(), Box>(()) +/// ``` +#[cfg(feature = "alloc")] +#[derive(Clone, Debug)] +pub struct Builder { + config: Config, + thompson: thompson::Builder, +} + +#[cfg(feature = "alloc")] +impl Builder { + /// Create a new dense DFA builder with the default configuration. + pub fn new() -> Builder { + Builder { + config: Config::default(), + thompson: thompson::Builder::new(), + } + } + + /// Build a DFA from the given pattern. + /// + /// If there was a problem parsing or compiling the pattern, then an error + /// is returned. + pub fn build(&self, pattern: &str) -> Result { + self.build_many(&[pattern]) + } + + /// Build a DFA from the given patterns. + /// + /// When matches are returned, the pattern ID corresponds to the index of + /// the pattern in the slice given. + pub fn build_many>( + &self, + patterns: &[P], + ) -> Result { + let nfa = self.thompson.build_many(patterns).map_err(Error::nfa)?; + self.build_from_nfa(&nfa) + } + + /// Build a DFA from the given NFA. + /// + /// # Example + /// + /// This example shows how to build a DFA if you already have an NFA in + /// hand. + /// + /// ``` + /// use regex_automata::{ + /// dfa::{Automaton, dense}, + /// nfa::thompson, + /// HalfMatch, + /// }; + /// + /// let haystack = "foo123bar".as_bytes(); + /// + /// // This shows how to set non-default options for building an NFA. + /// let nfa = thompson::Builder::new() + /// .configure(thompson::Config::new().shrink(false)) + /// .build(r"[0-9]+")?; + /// let dfa = dense::Builder::new().build_from_nfa(&nfa)?; + /// let expected = Some(HalfMatch::must(0, 6)); + /// let got = dfa.find_leftmost_fwd(haystack)?; + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box>(()) + /// ``` + pub fn build_from_nfa( + &self, + nfa: &thompson::NFA, + ) -> Result { + let mut quit = self.config.quit.unwrap_or(ByteSet::empty()); + if self.config.get_unicode_word_boundary() + && nfa.has_word_boundary_unicode() + { + for b in 0x80..=0xFF { + quit.add(b); + } + } + let classes = if !self.config.get_byte_classes() { + // DFAs will always use the equivalence class map, but enabling + // this option is useful for debugging. Namely, this will cause all + // transitions to be defined over their actual bytes instead of an + // opaque equivalence class identifier. The former is much easier + // to grok as a human. 
+ ByteClasses::singletons() + } else { + let mut set = nfa.byte_class_set().clone(); + // It is important to distinguish any "quit" bytes from all other + // bytes. Otherwise, a non-quit byte may end up in the same class + // as a quit byte, and thus cause the DFA stop when it shouldn't. + if !quit.is_empty() { + set.add_set(&quit); + } + set.byte_classes() + }; + + let mut dfa = DFA::initial( + classes, + nfa.pattern_len(), + self.config.get_starts_for_each_pattern(), + )?; + determinize::Config::new() + .anchored(self.config.get_anchored()) + .match_kind(self.config.get_match_kind()) + .quit(quit) + .dfa_size_limit(self.config.get_dfa_size_limit()) + .determinize_size_limit(self.config.get_determinize_size_limit()) + .run(nfa, &mut dfa)?; + if self.config.get_minimize() { + dfa.minimize(); + } + if self.config.get_accelerate() { + dfa.accelerate(); + } + Ok(dfa) + } + + /// Apply the given dense DFA configuration options to this builder. + pub fn configure(&mut self, config: Config) -> &mut Builder { + self.config = self.config.overwrite(config); + self + } + + /// Set the syntax configuration for this builder using + /// [`SyntaxConfig`](crate::SyntaxConfig). + /// + /// This permits setting things like case insensitivity, Unicode and multi + /// line mode. + /// + /// These settings only apply when constructing a DFA directly from a + /// pattern. + pub fn syntax( + &mut self, + config: crate::util::syntax::SyntaxConfig, + ) -> &mut Builder { + self.thompson.syntax(config); + self + } + + /// Set the Thompson NFA configuration for this builder using + /// [`nfa::thompson::Config`](crate::nfa::thompson::Config). + /// + /// This permits setting things like whether the DFA should match the regex + /// in reverse or if additional time should be spent shrinking the size of + /// the NFA. + /// + /// These settings only apply when constructing a DFA directly from a + /// pattern. + pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder { + self.thompson.configure(config); + self + } +} + +#[cfg(feature = "alloc")] +impl Default for Builder { + fn default() -> Builder { + Builder::new() + } +} + +/// A convenience alias for an owned DFA. We use this particular instantiation +/// a lot in this crate, so it's worth giving it a name. This instantiation +/// is commonly used for mutable APIs on the DFA while building it. The main +/// reason for making DFAs generic is no_std support, and more generally, +/// making it possible to load a DFA from an arbitrary slice of bytes. +#[cfg(feature = "alloc")] +pub(crate) type OwnedDFA = DFA>; + +/// A dense table-based deterministic finite automaton (DFA). +/// +/// All dense DFAs have one or more start states, zero or more match states +/// and a transition table that maps the current state and the current byte +/// of input to the next state. A DFA can use this information to implement +/// fast searching. In particular, the use of a dense DFA generally makes the +/// trade off that match speed is the most valuable characteristic, even if +/// building the DFA may take significant time *and* space. (More concretely, +/// building a DFA takes time and space that is exponential in the size of the +/// pattern in the worst case.) As such, the processing of every byte of input +/// is done with a small constant number of operations that does not vary with +/// the pattern, its size or the size of the alphabet. If your needs don't line +/// up with this trade off, then a dense DFA may not be an adequate solution to +/// your problem. 
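To put the size half of this trade-off in concrete terms, here is a minimal sketch (an illustration, not part of the vendored sources) comparing the heap usage of a Unicode-aware pattern against its ASCII-only equivalent. It relies only on `dense::DFA::new` and `DFA::memory_usage` as defined in this module; the exact byte counts are illustrative and vary between releases and platforms.

```rust
use regex_automata::dfa::dense;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // A Unicode-aware \w must cover the UTF-8 encodings of every word
    // codepoint, so its dense transition table dwarfs the ASCII-only one.
    let unicode = dense::DFA::new(r"\w{3}")?;
    let ascii = dense::DFA::new(r"(?-u)\w{3}")?;
    assert!(unicode.memory_usage() > ascii.memory_usage());
    println!(
        "unicode \\w{{3}}: {} bytes, ascii \\w{{3}}: {} bytes",
        unicode.memory_usage(),
        ascii.memory_usage(),
    );
    Ok(())
}
```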
+/// +/// In contrast, a [`sparse::DFA`] makes the opposite +/// trade off: it uses less space but will execute a variable number of +/// instructions per byte at match time, which makes it slower for matching. +/// (Note that space usage is still exponential in the size of the pattern in +/// the worst case.) +/// +/// A DFA can be built using the default configuration via the +/// [`DFA::new`] constructor. Otherwise, one can +/// configure various aspects via [`dense::Builder`](Builder). +/// +/// A single DFA fundamentally supports the following operations: +/// +/// 1. Detection of a match. +/// 2. Location of the end of a match. +/// 3. In the case of a DFA with multiple patterns, which pattern matched is +/// reported as well. +/// +/// A notable absence from the above list of capabilities is the location of +/// the *start* of a match. In order to provide both the start and end of +/// a match, *two* DFAs are required. This functionality is provided by a +/// [`Regex`](crate::dfa::regex::Regex). +/// +/// # Type parameters +/// +/// A `DFA` has one type parameter, `T`, which is used to represent state IDs, +/// pattern IDs and accelerators. `T` is typically a `Vec` or a `&[u32]`. +/// +/// # The `Automaton` trait +/// +/// This type implements the [`Automaton`] trait, which means it can be used +/// for searching. For example: +/// +/// ``` +/// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch}; +/// +/// let dfa = DFA::new("foo[0-9]+")?; +/// let expected = HalfMatch::must(0, 8); +/// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?); +/// # Ok::<(), Box>(()) +/// ``` +#[derive(Clone)] +pub struct DFA { + /// The transition table for this DFA. This includes the transitions + /// themselves, along with the stride, number of states and the equivalence + /// class mapping. + tt: TransitionTable, + /// The set of starting state identifiers for this DFA. The starting state + /// IDs act as pointers into the transition table. The specific starting + /// state chosen for each search is dependent on the context at which the + /// search begins. + st: StartTable, + /// The set of match states and the patterns that match for each + /// corresponding match state. + /// + /// This structure is technically only needed because of support for + /// multi-regexes. Namely, multi-regexes require answering not just whether + /// a match exists, but _which_ patterns match. So we need to store the + /// matching pattern IDs for each match state. We do this even when there + /// is only one pattern for the sake of simplicity. In practice, this uses + /// up very little space for the case of on pattern. + ms: MatchStates, + /// Information about which states are "special." Special states are states + /// that are dead, quit, matching, starting or accelerated. For more info, + /// see the docs for `Special`. + special: Special, + /// The accelerators for this DFA. + /// + /// If a state is accelerated, then there exist only a small number of + /// bytes that can cause the DFA to leave the state. This permits searching + /// to use optimized routines to find those specific bytes instead of using + /// the transition table. + /// + /// All accelerated states exist in a contiguous range in the DFA's + /// transition table. See dfa/special.rs for more details on how states are + /// arranged. + accels: Accels, +} + +#[cfg(feature = "alloc")] +impl OwnedDFA { + /// Parse the given regular expression using a default configuration and + /// return the corresponding DFA. 
+ /// + /// If you want a non-default configuration, then use the + /// [`dense::Builder`](Builder) to set your own configuration. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch}; + /// + /// let dfa = dense::DFA::new("foo[0-9]+bar")?; + /// let expected = HalfMatch::must(0, 11); + /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345bar")?); + /// # Ok::<(), Box>(()) + /// ``` + pub fn new(pattern: &str) -> Result { + Builder::new().build(pattern) + } + + /// Parse the given regular expressions using a default configuration and + /// return the corresponding multi-DFA. + /// + /// If you want a non-default configuration, then use the + /// [`dense::Builder`](Builder) to set your own configuration. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch}; + /// + /// let dfa = dense::DFA::new_many(&["[0-9]+", "[a-z]+"])?; + /// let expected = HalfMatch::must(1, 3); + /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345bar")?); + /// # Ok::<(), Box>(()) + /// ``` + pub fn new_many>(patterns: &[P]) -> Result { + Builder::new().build_many(patterns) + } +} + +#[cfg(feature = "alloc")] +impl OwnedDFA { + /// Create a new DFA that matches every input. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch}; + /// + /// let dfa = dense::DFA::always_match()?; + /// + /// let expected = HalfMatch::must(0, 0); + /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"")?); + /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo")?); + /// # Ok::<(), Box>(()) + /// ``` + pub fn always_match() -> Result { + let nfa = thompson::NFA::always_match(); + Builder::new().build_from_nfa(&nfa) + } + + /// Create a new DFA that never matches any input. + /// + /// # Example + /// + /// ``` + /// use regex_automata::dfa::{Automaton, dense}; + /// + /// let dfa = dense::DFA::never_match()?; + /// assert_eq!(None, dfa.find_leftmost_fwd(b"")?); + /// assert_eq!(None, dfa.find_leftmost_fwd(b"foo")?); + /// # Ok::<(), Box>(()) + /// ``` + pub fn never_match() -> Result { + let nfa = thompson::NFA::never_match(); + Builder::new().build_from_nfa(&nfa) + } + + /// Create an initial DFA with the given equivalence classes, pattern count + /// and whether anchored starting states are enabled for each pattern. An + /// initial DFA can be further mutated via determinization. + fn initial( + classes: ByteClasses, + pattern_count: usize, + starts_for_each_pattern: bool, + ) -> Result { + let start_pattern_count = + if starts_for_each_pattern { pattern_count } else { 0 }; + Ok(DFA { + tt: TransitionTable::minimal(classes), + st: StartTable::dead(start_pattern_count)?, + ms: MatchStates::empty(pattern_count), + special: Special::new(), + accels: Accels::empty(), + }) + } +} + +impl> DFA { + /// Cheaply return a borrowed version of this dense DFA. Specifically, + /// the DFA returned always uses `&[u32]` for its transition table. + pub fn as_ref(&self) -> DFA<&'_ [u32]> { + DFA { + tt: self.tt.as_ref(), + st: self.st.as_ref(), + ms: self.ms.as_ref(), + special: self.special, + accels: self.accels(), + } + } + + /// Return an owned version of this sparse DFA. Specifically, the DFA + /// returned always uses `Vec` for its transition table. + /// + /// Effectively, this returns a dense DFA whose transition table lives on + /// the heap. 
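The `as_ref`/`to_owned` pair described above can be exercised directly; this is a minimal sketch (not part of the vendored sources) that assumes only the `as_ref`, `to_owned` and `find_leftmost_fwd` APIs shown in this module, with an arbitrary pattern and haystack.

```rust
use regex_automata::{dfa::{Automaton, dense}, HalfMatch};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let owned = dense::DFA::new("foo[0-9]+")?;
    // A borrowed view uses `&[u32]` for its tables but searches identically.
    let borrowed: dense::DFA<&[u32]> = owned.as_ref();
    let expected = HalfMatch::must(0, 8);
    assert_eq!(Some(expected), borrowed.find_leftmost_fwd(b"foo12345")?);
    // `to_owned` copies the borrowed view back into a heap-backed DFA.
    let owned_again = borrowed.to_owned();
    assert_eq!(Some(expected), owned_again.find_leftmost_fwd(b"foo12345")?);
    Ok(())
}
```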
+ #[cfg(feature = "alloc")] + pub fn to_owned(&self) -> OwnedDFA { + DFA { + tt: self.tt.to_owned(), + st: self.st.to_owned(), + ms: self.ms.to_owned(), + special: self.special, + accels: self.accels().to_owned(), + } + } + + /// Returns true only if this DFA has starting states for each pattern. + /// + /// When a DFA has starting states for each pattern, then a search with the + /// DFA can be configured to only look for anchored matches of a specific + /// pattern. Specifically, APIs like [`Automaton::find_earliest_fwd_at`] + /// can accept a non-None `pattern_id` if and only if this method returns + /// true. Otherwise, calling `find_earliest_fwd_at` will panic. + /// + /// Note that if the DFA has no patterns, this always returns false. + pub fn has_starts_for_each_pattern(&self) -> bool { + self.st.patterns > 0 + } + + /// Returns the total number of elements in the alphabet for this DFA. + /// + /// That is, this returns the total number of transitions that each state + /// in this DFA must have. Typically, a normal byte oriented DFA would + /// always have an alphabet size of 256, corresponding to the number of + /// unique values in a single byte. However, this implementation has two + /// peculiarities that impact the alphabet length: + /// + /// * Every state has a special "EOI" transition that is only followed + /// after the end of some haystack is reached. This EOI transition is + /// necessary to account for one byte of look-ahead when implementing + /// things like `\b` and `$`. + /// * Bytes are grouped into equivalence classes such that no two bytes in + /// the same class can distinguish a match from a non-match. For example, + /// in the regex `^[a-z]+$`, the ASCII bytes `a-z` could all be in the + /// same equivalence class. This leads to a massive space savings. + /// + /// Note though that the alphabet length does _not_ necessarily equal the + /// total stride space taken up by a single DFA state in the transition + /// table. Namely, for performance reasons, the stride is always the + /// smallest power of two that is greater than or equal to the alphabet + /// length. For this reason, [`DFA::stride`] or [`DFA::stride2`] are + /// often more useful. The alphabet length is typically useful only for + /// informational purposes. + pub fn alphabet_len(&self) -> usize { + self.tt.alphabet_len() + } + + /// Returns the total stride for every state in this DFA, expressed as the + /// exponent of a power of 2. The stride is the amount of space each state + /// takes up in the transition table, expressed as a number of transitions. + /// (Unused transitions map to dead states.) + /// + /// The stride of a DFA is always equivalent to the smallest power of 2 + /// that is greater than or equal to the DFA's alphabet length. This + /// definition uses extra space, but permits faster translation between + /// premultiplied state identifiers and contiguous indices (by using shifts + /// instead of relying on integer division). + /// + /// For example, if the DFA's stride is 16 transitions, then its `stride2` + /// is `4` since `2^4 = 16`. + /// + /// The minimum `stride2` value is `1` (corresponding to a stride of `2`) + /// while the maximum `stride2` value is `9` (corresponding to a stride of + /// `512`). The maximum is not `8` since the maximum alphabet size is `257` + /// when accounting for the special EOI transition. However, an alphabet + /// length of that size is exceptionally rare since the alphabet is shrunk + /// into equivalence classes. 
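The relationship between `alphabet_len`, `stride` and `stride2` described above can be checked with a small sketch (not part of the vendored sources); it assumes only the three accessors defined in this module and an arbitrary pattern.

```rust
use regex_automata::dfa::dense;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let dfa = dense::DFA::new("foo[0-9]+")?;
    // The stride is the smallest power of two that can hold the alphabet
    // (equivalence classes plus the special EOI transition), and stride2
    // is that same quantity expressed as an exponent of two.
    assert_eq!(dfa.stride(), 1usize << dfa.stride2());
    assert!(dfa.stride() >= dfa.alphabet_len());
    // The alphabet can never exceed 257 entries (256 byte classes + EOI).
    assert!(dfa.alphabet_len() <= 257);
    Ok(())
}
```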
+ pub fn stride2(&self) -> usize { + self.tt.stride2 + } + + /// Returns the total stride for every state in this DFA. This corresponds + /// to the total number of transitions used by each state in this DFA's + /// transition table. + /// + /// Please see [`DFA::stride2`] for more information. In particular, this + /// returns the stride as the number of transitions, where as `stride2` + /// returns it as the exponent of a power of 2. + pub fn stride(&self) -> usize { + self.tt.stride() + } + + /// Returns the "universal" start state for this DFA. + /// + /// A universal start state occurs only when all of the starting states + /// for this DFA are precisely the same. This occurs when there are no + /// look-around assertions at the beginning (or end for a reverse DFA) of + /// the pattern. + /// + /// Using this as a starting state for a DFA without a universal starting + /// state has unspecified behavior. This condition is not checked, so the + /// caller must guarantee it themselves. + pub(crate) fn universal_start_state(&self) -> StateID { + // We choose 'NonWordByte' for no particular reason, other than + // the fact that this is the 'main' starting configuration used in + // determinization. But in essence, it doesn't really matter. + // + // Also, we might consider exposing this routine, but it seems + // a little tricky to use correctly. Maybe if we also expose a + // 'has_universal_start_state' method? + self.st.start(Start::NonWordByte, None) + } + + /// Returns the memory usage, in bytes, of this DFA. + /// + /// The memory usage is computed based on the number of bytes used to + /// represent this DFA. + /// + /// This does **not** include the stack size used up by this DFA. To + /// compute that, use `std::mem::size_of::()`. + pub fn memory_usage(&self) -> usize { + self.tt.memory_usage() + + self.st.memory_usage() + + self.ms.memory_usage() + + self.accels.memory_usage() + } +} + +/// Routines for converting a dense DFA to other representations, such as +/// sparse DFAs or raw bytes suitable for persistent storage. +impl> DFA { + /// Convert this dense DFA to a sparse DFA. + /// + /// If a `StateID` is too small to represent all states in the sparse + /// DFA, then this returns an error. In most cases, if a dense DFA is + /// constructable with `StateID` then a sparse DFA will be as well. + /// However, it is not guaranteed. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch}; + /// + /// let dense = dense::DFA::new("foo[0-9]+")?; + /// let sparse = dense.to_sparse()?; + /// + /// let expected = HalfMatch::must(0, 8); + /// assert_eq!(Some(expected), sparse.find_leftmost_fwd(b"foo12345")?); + /// # Ok::<(), Box>(()) + /// ``` + #[cfg(feature = "alloc")] + pub fn to_sparse(&self) -> Result>, Error> { + sparse::DFA::from_dense(self) + } + + /// Serialize this DFA as raw bytes to a `Vec` in little endian + /// format. Upon success, the `Vec` and the initial padding length are + /// returned. + /// + /// The written bytes are guaranteed to be deserialized correctly and + /// without errors in a semver compatible release of this crate by a + /// `DFA`'s deserialization APIs (assuming all other criteria for the + /// deserialization APIs has been satisfied): + /// + /// * [`DFA::from_bytes`] + /// * [`DFA::from_bytes_unchecked`] + /// + /// The padding returned is non-zero if the returned `Vec` starts at + /// an address that does not have the same alignment as `u32`. 
The padding + /// corresponds to the number of leading bytes written to the returned + /// `Vec`. + /// + /// # Example + /// + /// This example shows how to serialize and deserialize a DFA: + /// + /// ``` + /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch}; + /// + /// // Compile our original DFA. + /// let original_dfa = DFA::new("foo[0-9]+")?; + /// + /// // N.B. We use native endianness here to make the example work, but + /// // using to_bytes_little_endian would work on a little endian target. + /// let (buf, _) = original_dfa.to_bytes_native_endian(); + /// // Even if buf has initial padding, DFA::from_bytes will automatically + /// // ignore it. + /// let dfa: DFA<&[u32]> = DFA::from_bytes(&buf)?.0; + /// + /// let expected = HalfMatch::must(0, 8); + /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?); + /// # Ok::<(), Box>(()) + /// ``` + #[cfg(feature = "alloc")] + pub fn to_bytes_little_endian(&self) -> (Vec, usize) { + self.to_bytes::() + } + + /// Serialize this DFA as raw bytes to a `Vec` in big endian + /// format. Upon success, the `Vec` and the initial padding length are + /// returned. + /// + /// The written bytes are guaranteed to be deserialized correctly and + /// without errors in a semver compatible release of this crate by a + /// `DFA`'s deserialization APIs (assuming all other criteria for the + /// deserialization APIs has been satisfied): + /// + /// * [`DFA::from_bytes`] + /// * [`DFA::from_bytes_unchecked`] + /// + /// The padding returned is non-zero if the returned `Vec` starts at + /// an address that does not have the same alignment as `u32`. The padding + /// corresponds to the number of leading bytes written to the returned + /// `Vec`. + /// + /// # Example + /// + /// This example shows how to serialize and deserialize a DFA: + /// + /// ``` + /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch}; + /// + /// // Compile our original DFA. + /// let original_dfa = DFA::new("foo[0-9]+")?; + /// + /// // N.B. We use native endianness here to make the example work, but + /// // using to_bytes_big_endian would work on a big endian target. + /// let (buf, _) = original_dfa.to_bytes_native_endian(); + /// // Even if buf has initial padding, DFA::from_bytes will automatically + /// // ignore it. + /// let dfa: DFA<&[u32]> = DFA::from_bytes(&buf)?.0; + /// + /// let expected = HalfMatch::must(0, 8); + /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?); + /// # Ok::<(), Box>(()) + /// ``` + #[cfg(feature = "alloc")] + pub fn to_bytes_big_endian(&self) -> (Vec, usize) { + self.to_bytes::() + } + + /// Serialize this DFA as raw bytes to a `Vec` in native endian + /// format. Upon success, the `Vec` and the initial padding length are + /// returned. + /// + /// The written bytes are guaranteed to be deserialized correctly and + /// without errors in a semver compatible release of this crate by a + /// `DFA`'s deserialization APIs (assuming all other criteria for the + /// deserialization APIs has been satisfied): + /// + /// * [`DFA::from_bytes`] + /// * [`DFA::from_bytes_unchecked`] + /// + /// The padding returned is non-zero if the returned `Vec` starts at + /// an address that does not have the same alignment as `u32`. The padding + /// corresponds to the number of leading bytes written to the returned + /// `Vec`. 
+ /// + /// Generally speaking, native endian format should only be used when + /// you know that the target you're compiling the DFA for matches the + /// endianness of the target on which you're compiling DFA. For example, + /// if serialization and deserialization happen in the same process or on + /// the same machine. Otherwise, when serializing a DFA for use in a + /// portable environment, you'll almost certainly want to serialize _both_ + /// a little endian and a big endian version and then load the correct one + /// based on the target's configuration. + /// + /// # Example + /// + /// This example shows how to serialize and deserialize a DFA: + /// + /// ``` + /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch}; + /// + /// // Compile our original DFA. + /// let original_dfa = DFA::new("foo[0-9]+")?; + /// + /// let (buf, _) = original_dfa.to_bytes_native_endian(); + /// // Even if buf has initial padding, DFA::from_bytes will automatically + /// // ignore it. + /// let dfa: DFA<&[u32]> = DFA::from_bytes(&buf)?.0; + /// + /// let expected = HalfMatch::must(0, 8); + /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?); + /// # Ok::<(), Box>(()) + /// ``` + #[cfg(feature = "alloc")] + pub fn to_bytes_native_endian(&self) -> (Vec, usize) { + self.to_bytes::() + } + + /// The implementation of the public `to_bytes` serialization methods, + /// which is generic over endianness. + #[cfg(feature = "alloc")] + fn to_bytes(&self) -> (Vec, usize) { + let len = self.write_to_len(); + let (mut buf, padding) = bytes::alloc_aligned_buffer::(len); + // This should always succeed since the only possible serialization + // error is providing a buffer that's too small, but we've ensured that + // `buf` is big enough here. + self.as_ref().write_to::(&mut buf[padding..]).unwrap(); + (buf, padding) + } + + /// Serialize this DFA as raw bytes to the given slice, in little endian + /// format. Upon success, the total number of bytes written to `dst` is + /// returned. + /// + /// The written bytes are guaranteed to be deserialized correctly and + /// without errors in a semver compatible release of this crate by a + /// `DFA`'s deserialization APIs (assuming all other criteria for the + /// deserialization APIs has been satisfied): + /// + /// * [`DFA::from_bytes`] + /// * [`DFA::from_bytes_unchecked`] + /// + /// Note that unlike the various `to_byte_*` routines, this does not write + /// any padding. Callers are responsible for handling alignment correctly. + /// + /// # Errors + /// + /// This returns an error if the given destination slice is not big enough + /// to contain the full serialized DFA. If an error occurs, then nothing + /// is written to `dst`. + /// + /// # Example + /// + /// This example shows how to serialize and deserialize a DFA without + /// dynamic memory allocation. + /// + /// ``` + /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch}; + /// + /// // Compile our original DFA. + /// let original_dfa = DFA::new("foo[0-9]+")?; + /// + /// // Create a 4KB buffer on the stack to store our serialized DFA. + /// let mut buf = [0u8; 4 * (1<<10)]; + /// // N.B. We use native endianness here to make the example work, but + /// // using write_to_little_endian would work on a little endian target. 
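+    /// // The buffer must hold at least `original_dfa.write_to_len()` bytes,
+    /// // otherwise serialization returns an error.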
+ /// let written = original_dfa.write_to_native_endian(&mut buf)?; + /// let dfa: DFA<&[u32]> = DFA::from_bytes(&buf[..written])?.0; + /// + /// let expected = HalfMatch::must(0, 8); + /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?); + /// # Ok::<(), Box>(()) + /// ``` + pub fn write_to_little_endian( + &self, + dst: &mut [u8], + ) -> Result { + self.as_ref().write_to::(dst) + } + + /// Serialize this DFA as raw bytes to the given slice, in big endian + /// format. Upon success, the total number of bytes written to `dst` is + /// returned. + /// + /// The written bytes are guaranteed to be deserialized correctly and + /// without errors in a semver compatible release of this crate by a + /// `DFA`'s deserialization APIs (assuming all other criteria for the + /// deserialization APIs has been satisfied): + /// + /// * [`DFA::from_bytes`] + /// * [`DFA::from_bytes_unchecked`] + /// + /// Note that unlike the various `to_byte_*` routines, this does not write + /// any padding. Callers are responsible for handling alignment correctly. + /// + /// # Errors + /// + /// This returns an error if the given destination slice is not big enough + /// to contain the full serialized DFA. If an error occurs, then nothing + /// is written to `dst`. + /// + /// # Example + /// + /// This example shows how to serialize and deserialize a DFA without + /// dynamic memory allocation. + /// + /// ``` + /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch}; + /// + /// // Compile our original DFA. + /// let original_dfa = DFA::new("foo[0-9]+")?; + /// + /// // Create a 4KB buffer on the stack to store our serialized DFA. + /// let mut buf = [0u8; 4 * (1<<10)]; + /// // N.B. We use native endianness here to make the example work, but + /// // using write_to_big_endian would work on a big endian target. + /// let written = original_dfa.write_to_native_endian(&mut buf)?; + /// let dfa: DFA<&[u32]> = DFA::from_bytes(&buf[..written])?.0; + /// + /// let expected = HalfMatch::must(0, 8); + /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?); + /// # Ok::<(), Box>(()) + /// ``` + pub fn write_to_big_endian( + &self, + dst: &mut [u8], + ) -> Result { + self.as_ref().write_to::(dst) + } + + /// Serialize this DFA as raw bytes to the given slice, in native endian + /// format. Upon success, the total number of bytes written to `dst` is + /// returned. + /// + /// The written bytes are guaranteed to be deserialized correctly and + /// without errors in a semver compatible release of this crate by a + /// `DFA`'s deserialization APIs (assuming all other criteria for the + /// deserialization APIs has been satisfied): + /// + /// * [`DFA::from_bytes`] + /// * [`DFA::from_bytes_unchecked`] + /// + /// Generally speaking, native endian format should only be used when + /// you know that the target you're compiling the DFA for matches the + /// endianness of the target on which you're compiling DFA. For example, + /// if serialization and deserialization happen in the same process or on + /// the same machine. Otherwise, when serializing a DFA for use in a + /// portable environment, you'll almost certainly want to serialize _both_ + /// a little endian and a big endian version and then load the correct one + /// based on the target's configuration. + /// + /// Note that unlike the various `to_byte_*` routines, this does not write + /// any padding. Callers are responsible for handling alignment correctly. 
+ /// + /// # Errors + /// + /// This returns an error if the given destination slice is not big enough + /// to contain the full serialized DFA. If an error occurs, then nothing + /// is written to `dst`. + /// + /// # Example + /// + /// This example shows how to serialize and deserialize a DFA without + /// dynamic memory allocation. + /// + /// ``` + /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch}; + /// + /// // Compile our original DFA. + /// let original_dfa = DFA::new("foo[0-9]+")?; + /// + /// // Create a 4KB buffer on the stack to store our serialized DFA. + /// let mut buf = [0u8; 4 * (1<<10)]; + /// let written = original_dfa.write_to_native_endian(&mut buf)?; + /// let dfa: DFA<&[u32]> = DFA::from_bytes(&buf[..written])?.0; + /// + /// let expected = HalfMatch::must(0, 8); + /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?); + /// # Ok::<(), Box>(()) + /// ``` + pub fn write_to_native_endian( + &self, + dst: &mut [u8], + ) -> Result { + self.as_ref().write_to::(dst) + } + + /// Return the total number of bytes required to serialize this DFA. + /// + /// This is useful for determining the size of the buffer required to pass + /// to one of the serialization routines: + /// + /// * [`DFA::write_to_little_endian`] + /// * [`DFA::write_to_big_endian`] + /// * [`DFA::write_to_native_endian`] + /// + /// Passing a buffer smaller than the size returned by this method will + /// result in a serialization error. Serialization routines are guaranteed + /// to succeed when the buffer is big enough. + /// + /// # Example + /// + /// This example shows how to dynamically allocate enough room to serialize + /// a DFA. + /// + /// ``` + /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch}; + /// + /// // Compile our original DFA. + /// let original_dfa = DFA::new("foo[0-9]+")?; + /// + /// let mut buf = vec![0; original_dfa.write_to_len()]; + /// let written = original_dfa.write_to_native_endian(&mut buf)?; + /// let dfa: DFA<&[u32]> = DFA::from_bytes(&buf[..written])?.0; + /// + /// let expected = HalfMatch::must(0, 8); + /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?); + /// # Ok::<(), Box>(()) + /// ``` + /// + /// Note that this example isn't actually guaranteed to work! In + /// particular, if `buf` is not aligned to a 4-byte boundary, then the + /// `DFA::from_bytes` call will fail. If you need this to work, then you + /// either need to deal with adding some initial padding yourself, or use + /// one of the `to_bytes` methods, which will do it for you. + pub fn write_to_len(&self) -> usize { + bytes::write_label_len(LABEL) + + bytes::write_endianness_check_len() + + bytes::write_version_len() + + size_of::() // unused, intended for future flexibility + + self.tt.write_to_len() + + self.st.write_to_len() + + self.ms.write_to_len() + + self.special.write_to_len() + + self.accels.write_to_len() + } +} + +impl<'a> DFA<&'a [u32]> { + /// Safely deserialize a DFA with a specific state identifier + /// representation. Upon success, this returns both the deserialized DFA + /// and the number of bytes read from the given slice. Namely, the contents + /// of the slice beyond the DFA are not read. + /// + /// Deserializing a DFA using this routine will never allocate heap memory. + /// For safety purposes, the DFA's transition table will be verified such + /// that every transition points to a valid state. 
If this verification is + /// too costly, then a [`DFA::from_bytes_unchecked`] API is provided, which + /// will always execute in constant time. + /// + /// The bytes given must be generated by one of the serialization APIs + /// of a `DFA` using a semver compatible release of this crate. Those + /// include: + /// + /// * [`DFA::to_bytes_little_endian`] + /// * [`DFA::to_bytes_big_endian`] + /// * [`DFA::to_bytes_native_endian`] + /// * [`DFA::write_to_little_endian`] + /// * [`DFA::write_to_big_endian`] + /// * [`DFA::write_to_native_endian`] + /// + /// The `to_bytes` methods allocate and return a `Vec` for you, along + /// with handling alignment correctly. The `write_to` methods do not + /// allocate and write to an existing slice (which may be on the stack). + /// Since deserialization always uses the native endianness of the target + /// platform, the serialization API you use should match the endianness of + /// the target platform. (It's often a good idea to generate serialized + /// DFAs for both forms of endianness and then load the correct one based + /// on endianness.) + /// + /// # Errors + /// + /// Generally speaking, it's easier to state the conditions in which an + /// error is _not_ returned. All of the following must be true: + /// + /// * The bytes given must be produced by one of the serialization APIs + /// on this DFA, as mentioned above. + /// * The endianness of the target platform matches the endianness used to + /// serialized the provided DFA. + /// * The slice given must have the same alignment as `u32`. + /// + /// If any of the above are not true, then an error will be returned. + /// + /// # Panics + /// + /// This routine will never panic for any input. + /// + /// # Example + /// + /// This example shows how to serialize a DFA to raw bytes, deserialize it + /// and then use it for searching. + /// + /// ``` + /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch}; + /// + /// let initial = DFA::new("foo[0-9]+")?; + /// let (bytes, _) = initial.to_bytes_native_endian(); + /// let dfa: DFA<&[u32]> = DFA::from_bytes(&bytes)?.0; + /// + /// let expected = HalfMatch::must(0, 8); + /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?); + /// # Ok::<(), Box>(()) + /// ``` + /// + /// # Example: dealing with alignment and padding + /// + /// In the above example, we used the `to_bytes_native_endian` method to + /// serialize a DFA, but we ignored part of its return value corresponding + /// to padding added to the beginning of the serialized DFA. This is OK + /// because deserialization will skip this initial padding. What matters + /// is that the address immediately following the padding has an alignment + /// that matches `u32`. That is, the following is an equivalent but + /// alternative way to write the above example: + /// + /// ``` + /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch}; + /// + /// let initial = DFA::new("foo[0-9]+")?; + /// // Serialization returns the number of leading padding bytes added to + /// // the returned Vec. + /// let (bytes, pad) = initial.to_bytes_native_endian(); + /// let dfa: DFA<&[u32]> = DFA::from_bytes(&bytes[pad..])?.0; + /// + /// let expected = HalfMatch::must(0, 8); + /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?); + /// # Ok::<(), Box>(()) + /// ``` + /// + /// This padding is necessary because Rust's standard library does + /// not expose any safe and robust way of creating a `Vec` with a + /// guaranteed alignment other than 1. 
Now, in practice, the underlying + /// allocator is likely to provide a `Vec` that meets our alignment + /// requirements, which means `pad` is zero in practice most of the time. + /// + /// The purpose of exposing the padding like this is flexibility for the + /// caller. For example, if one wants to embed a serialized DFA into a + /// compiled program, then it's important to guarantee that it starts at a + /// `u32`-aligned address. The simplest way to do this is to discard the + /// padding bytes and set it up so that the serialized DFA itself begins at + /// a properly aligned address. We can show this in two parts. The first + /// part is serializing the DFA to a file: + /// + /// ```no_run + /// use regex_automata::dfa::{Automaton, dense::DFA}; + /// + /// let dfa = DFA::new("foo[0-9]+")?; + /// + /// let (bytes, pad) = dfa.to_bytes_big_endian(); + /// // Write the contents of the DFA *without* the initial padding. + /// std::fs::write("foo.bigendian.dfa", &bytes[pad..])?; + /// + /// // Do it again, but this time for little endian. + /// let (bytes, pad) = dfa.to_bytes_little_endian(); + /// std::fs::write("foo.littleendian.dfa", &bytes[pad..])?; + /// # Ok::<(), Box>(()) + /// ``` + /// + /// And now the second part is embedding the DFA into the compiled program + /// and deserializing it at runtime on first use. We use conditional + /// compilation to choose the correct endianness. + /// + /// ```no_run + /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch}; + /// + /// type S = u32; + /// type DFA = dense::DFA<&'static [S]>; + /// + /// fn get_foo() -> &'static DFA { + /// use std::cell::Cell; + /// use std::mem::MaybeUninit; + /// use std::sync::Once; + /// + /// // This struct with a generic B is used to permit unsizing + /// // coercions, specifically, where B winds up being a [u8]. We also + /// // need repr(C) to guarantee that _align comes first, which forces + /// // a correct alignment. + /// #[repr(C)] + /// struct Aligned { + /// _align: [S; 0], + /// bytes: B, + /// } + /// + /// # const _: &str = stringify! { + /// // This assignment is made possible (implicitly) via the + /// // CoerceUnsized trait. + /// static ALIGNED: &Aligned<[u8]> = &Aligned { + /// _align: [], + /// #[cfg(target_endian = "big")] + /// bytes: *include_bytes!("foo.bigendian.dfa"), + /// #[cfg(target_endian = "little")] + /// bytes: *include_bytes!("foo.littleendian.dfa"), + /// }; + /// # }; + /// # static ALIGNED: &Aligned<[u8]> = &Aligned { + /// # _align: [], + /// # bytes: [], + /// # }; + /// + /// struct Lazy(Cell>); + /// // SAFETY: This is safe because DFA impls Sync. + /// unsafe impl Sync for Lazy {} + /// + /// static INIT: Once = Once::new(); + /// static DFA: Lazy = Lazy(Cell::new(MaybeUninit::uninit())); + /// + /// INIT.call_once(|| { + /// let (dfa, _) = DFA::from_bytes(&ALIGNED.bytes) + /// .expect("serialized DFA should be valid"); + /// // SAFETY: This is guaranteed to only execute once, and all + /// // we do with the pointer is write the DFA to it. + /// unsafe { + /// (*DFA.0.as_ptr()).as_mut_ptr().write(dfa); + /// } + /// }); + /// // SAFETY: DFA is guaranteed to by initialized via INIT and is + /// // stored in static memory. 
+ /// unsafe { + /// let dfa = (*DFA.0.as_ptr()).as_ptr(); + /// std::mem::transmute::<*const DFA, &'static DFA>(dfa) + /// } + /// } + /// + /// let dfa = get_foo(); + /// let expected = HalfMatch::must(0, 8); + /// assert_eq!(Ok(Some(expected)), dfa.find_leftmost_fwd(b"foo12345")); + /// ``` + /// + /// Alternatively, consider using + /// [`lazy_static`](https://crates.io/crates/lazy_static) + /// or + /// [`once_cell`](https://crates.io/crates/once_cell), + /// which will guarantee safety for you. You will still need to use the + /// `Aligned` trick above to force correct alignment, but this is safe to + /// do and `from_bytes` will return an error if you get it wrong. + pub fn from_bytes( + slice: &'a [u8], + ) -> Result<(DFA<&'a [u32]>, usize), DeserializeError> { + // SAFETY: This is safe because we validate both the transition table, + // start state ID list and the match states below. If either validation + // fails, then we return an error. + let (dfa, nread) = unsafe { DFA::from_bytes_unchecked(slice)? }; + dfa.tt.validate()?; + dfa.st.validate(&dfa.tt)?; + dfa.ms.validate(&dfa)?; + dfa.accels.validate()?; + // N.B. dfa.special doesn't have a way to do unchecked deserialization, + // so it has already been validated. + Ok((dfa, nread)) + } + + /// Deserialize a DFA with a specific state identifier representation in + /// constant time by omitting the verification of the validity of the + /// transition table and other data inside the DFA. + /// + /// This is just like [`DFA::from_bytes`], except it can potentially return + /// a DFA that exhibits undefined behavior if its transition table contains + /// invalid state identifiers. + /// + /// This routine is useful if you need to deserialize a DFA cheaply + /// and cannot afford the transition table validation performed by + /// `from_bytes`. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch}; + /// + /// let initial = DFA::new("foo[0-9]+")?; + /// let (bytes, _) = initial.to_bytes_native_endian(); + /// // SAFETY: This is guaranteed to be safe since the bytes given come + /// // directly from a compatible serialization routine. + /// let dfa: DFA<&[u32]> = unsafe { DFA::from_bytes_unchecked(&bytes)?.0 }; + /// + /// let expected = HalfMatch::must(0, 8); + /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?); + /// # Ok::<(), Box>(()) + /// ``` + pub unsafe fn from_bytes_unchecked( + slice: &'a [u8], + ) -> Result<(DFA<&'a [u32]>, usize), DeserializeError> { + let mut nr = 0; + + nr += bytes::skip_initial_padding(slice); + bytes::check_alignment::(&slice[nr..])?; + nr += bytes::read_label(&slice[nr..], LABEL)?; + nr += bytes::read_endianness_check(&slice[nr..])?; + nr += bytes::read_version(&slice[nr..], VERSION)?; + + let _unused = bytes::try_read_u32(&slice[nr..], "unused space")?; + nr += size_of::(); + + let (tt, nread) = TransitionTable::from_bytes_unchecked(&slice[nr..])?; + nr += nread; + + let (st, nread) = StartTable::from_bytes_unchecked(&slice[nr..])?; + nr += nread; + + let (ms, nread) = MatchStates::from_bytes_unchecked(&slice[nr..])?; + nr += nread; + + let (special, nread) = Special::from_bytes(&slice[nr..])?; + nr += nread; + special.validate_state_count(tt.count(), tt.stride2)?; + + let (accels, nread) = Accels::from_bytes_unchecked(&slice[nr..])?; + nr += nread; + + Ok((DFA { tt, st, ms, special, accels }, nr)) + } + + /// The implementation of the public `write_to` serialization methods, + /// which is generic over endianness. 
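The deserializer above reads a label, an endianness check and a version number before any DFA component. A common way to implement such an endianness check, shown here as a free-standing sketch under an assumed header format (the marker value and error strings are illustrative, not regex-automata's actual wire format), is to write a known multi-byte constant in native byte order and verify that it round-trips on load.

```rust
// Hypothetical marker written by the serializer in its native byte order.
const ENDIAN_MARKER: u32 = 0xFEED_FACE;

fn write_endian_check(dst: &mut Vec<u8>) {
    dst.extend_from_slice(&ENDIAN_MARKER.to_ne_bytes());
}

fn read_endian_check(src: &[u8]) -> Result<usize, String> {
    if src.len() < 4 {
        return Err("truncated header".to_string());
    }
    let word = u32::from_ne_bytes([src[0], src[1], src[2], src[3]]);
    if word == ENDIAN_MARKER {
        Ok(4) // number of header bytes consumed
    } else if word.swap_bytes() == ENDIAN_MARKER {
        Err("data was serialized with the opposite endianness".to_string())
    } else {
        Err("unrecognized header; not a serialized DFA?".to_string())
    }
}

fn main() {
    let mut buf = Vec::new();
    write_endian_check(&mut buf);
    assert_eq!(Ok(4), read_endian_check(&buf));
}
```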
+ /// + /// This is defined only for &[u32] to reduce binary size/compilation time. + fn write_to( + &self, + mut dst: &mut [u8], + ) -> Result { + let nwrite = self.write_to_len(); + if dst.len() < nwrite { + return Err(SerializeError::buffer_too_small("dense DFA")); + } + dst = &mut dst[..nwrite]; + + let mut nw = 0; + nw += bytes::write_label(LABEL, &mut dst[nw..])?; + nw += bytes::write_endianness_check::(&mut dst[nw..])?; + nw += bytes::write_version::(VERSION, &mut dst[nw..])?; + nw += { + // Currently unused, intended for future flexibility + E::write_u32(0, &mut dst[nw..]); + size_of::() + }; + nw += self.tt.write_to::(&mut dst[nw..])?; + nw += self.st.write_to::(&mut dst[nw..])?; + nw += self.ms.write_to::(&mut dst[nw..])?; + nw += self.special.write_to::(&mut dst[nw..])?; + nw += self.accels.write_to::(&mut dst[nw..])?; + Ok(nw) + } +} + +/// The following methods implement mutable routines on the internal +/// representation of a DFA. As such, we must fix the first type parameter to a +/// `Vec` since a generic `T: AsRef<[u32]>` does not permit mutation. We +/// can get away with this because these methods are internal to the crate and +/// are exclusively used during construction of the DFA. +#[cfg(feature = "alloc")] +impl OwnedDFA { + /// Add a start state of this DFA. + pub(crate) fn set_start_state( + &mut self, + index: Start, + pattern_id: Option, + id: StateID, + ) { + assert!(self.tt.is_valid(id), "invalid start state"); + self.st.set_start(index, pattern_id, id); + } + + /// Set the given transition to this DFA. Both the `from` and `to` states + /// must already exist. + pub(crate) fn set_transition( + &mut self, + from: StateID, + byte: alphabet::Unit, + to: StateID, + ) { + self.tt.set(from, byte, to); + } + + /// An an empty state (a state where all transitions lead to a dead state) + /// and return its identifier. The identifier returned is guaranteed to + /// not point to any other existing state. + /// + /// If adding a state would exceed `StateID::LIMIT`, then this returns an + /// error. + pub(crate) fn add_empty_state(&mut self) -> Result { + self.tt.add_empty_state() + } + + /// Swap the two states given in the transition table. + /// + /// This routine does not do anything to check the correctness of this + /// swap. Callers must ensure that other states pointing to id1 and id2 are + /// updated appropriately. + pub(crate) fn swap_states(&mut self, id1: StateID, id2: StateID) { + self.tt.swap(id1, id2); + } + + /// Truncate the states in this DFA to the given count. + /// + /// This routine does not do anything to check the correctness of this + /// truncation. Callers must ensure that other states pointing to truncated + /// states are updated appropriately. + pub(crate) fn truncate_states(&mut self, count: usize) { + self.tt.truncate(count); + } + + /// Return a mutable representation of the state corresponding to the given + /// id. This is useful for implementing routines that manipulate DFA states + /// (e.g., swapping states). + pub(crate) fn state_mut(&mut self, id: StateID) -> StateMut<'_> { + self.tt.state_mut(id) + } + + /// Minimize this DFA in place using Hopcroft's algorithm. + pub(crate) fn minimize(&mut self) { + Minimizer::new(self).run(); + } + + /// Updates the match state pattern ID map to use the one provided. + /// + /// This is useful when it's convenient to manipulate matching states + /// (and their corresponding pattern IDs) as a map. 
In particular, the + /// representation used by a DFA for this map is not amenable to mutation, + /// so if things need to be changed (like when shuffling states), it's + /// often easier to work with the map form. + pub(crate) fn set_pattern_map( + &mut self, + map: &BTreeMap>, + ) -> Result<(), Error> { + self.ms = self.ms.new_with_map(map)?; + Ok(()) + } + + /// Find states that have a small number of non-loop transitions and mark + /// them as candidates for acceleration during search. + pub(crate) fn accelerate(&mut self) { + // dead and quit states can never be accelerated. + if self.state_count() <= 2 { + return; + } + + // Go through every state and record their accelerator, if possible. + let mut accels = BTreeMap::new(); + // Count the number of accelerated match, start and non-match/start + // states. + let (mut cmatch, mut cstart, mut cnormal) = (0, 0, 0); + for state in self.states() { + if let Some(accel) = state.accelerate(self.byte_classes()) { + accels.insert(state.id(), accel); + if self.is_match_state(state.id()) { + cmatch += 1; + } else if self.is_start_state(state.id()) { + cstart += 1; + } else { + assert!(!self.is_dead_state(state.id())); + assert!(!self.is_quit_state(state.id())); + cnormal += 1; + } + } + } + // If no states were able to be accelerated, then we're done. + if accels.is_empty() { + return; + } + let original_accels_len = accels.len(); + + // A remapper keeps track of state ID changes. Once we're done + // shuffling, the remapper is used to rewrite all transitions in the + // DFA based on the new positions of states. + let mut remapper = Remapper::from_dfa(self); + + // As we swap states, if they are match states, we need to swap their + // pattern ID lists too (for multi-regexes). We do this by converting + // the lists to an easily swappable map, and then convert back to + // MatchStates once we're done. + let mut new_matches = self.ms.to_map(self); + + // There is at least one state that gets accelerated, so these are + // guaranteed to get set to sensible values below. + self.special.min_accel = StateID::MAX; + self.special.max_accel = StateID::ZERO; + let update_special_accel = + |special: &mut Special, accel_id: StateID| { + special.min_accel = cmp::min(special.min_accel, accel_id); + special.max_accel = cmp::max(special.max_accel, accel_id); + }; + + // Start by shuffling match states. Any match states that are + // accelerated get moved to the end of the match state range. + if cmatch > 0 && self.special.matches() { + // N.B. special.{min,max}_match do not need updating, since the + // range/number of match states does not change. Only the ordering + // of match states may change. + let mut next_id = self.special.max_match; + let mut cur_id = next_id; + while cur_id >= self.special.min_match { + if let Some(accel) = accels.remove(&cur_id) { + accels.insert(next_id, accel); + update_special_accel(&mut self.special, next_id); + + // No need to do any actual swapping for equivalent IDs. + if cur_id != next_id { + remapper.swap(self, cur_id, next_id); + + // Swap pattern IDs for match states. + let cur_pids = new_matches.remove(&cur_id).unwrap(); + let next_pids = new_matches.remove(&next_id).unwrap(); + new_matches.insert(cur_id, next_pids); + new_matches.insert(next_id, cur_pids); + } + next_id = self.tt.prev_state_id(next_id); + } + cur_id = self.tt.prev_state_id(cur_id); + } + } + + // This is where it gets tricky. Without acceleration, start states + // normally come right after match states. 
But we want accelerated + // states to be a single contiguous range (to make it very fast + // to determine whether a state *is* accelerated), while also keeping + // match and starting states as contiguous ranges for the same reason. + // So what we do here is shuffle states such that it looks like this: + // + // DQMMMMAAAAASSSSSSNNNNNNN + // | | + // |---------| + // accelerated states + // + // Where: + // D - dead state + // Q - quit state + // M - match state (may be accelerated) + // A - normal state that is accelerated + // S - start state (may be accelerated) + // N - normal state that is NOT accelerated + // + // We implement this by shuffling states, which is done by a sequence + // of pairwise swaps. We start by looking at all normal states to be + // accelerated. When we find one, we swap it with the earliest starting + // state, and then swap that with the earliest normal state. This + // preserves the contiguous property. + // + // Once we're done looking for accelerated normal states, now we look + // for accelerated starting states by moving them to the beginning + // of the starting state range (just like we moved accelerated match + // states to the end of the matching state range). + // + // For a more detailed/different perspective on this, see the docs + // in dfa/special.rs. + if cnormal > 0 { + // our next available starting and normal states for swapping. + let mut next_start_id = self.special.min_start; + let mut cur_id = self.from_index(self.state_count() - 1); + // This is guaranteed to exist since cnormal > 0. + let mut next_norm_id = + self.tt.next_state_id(self.special.max_start); + while cur_id >= next_norm_id { + if let Some(accel) = accels.remove(&cur_id) { + remapper.swap(self, next_start_id, cur_id); + remapper.swap(self, next_norm_id, cur_id); + // Keep our accelerator map updated with new IDs if the + // states we swapped were also accelerated. + if let Some(accel2) = accels.remove(&next_norm_id) { + accels.insert(cur_id, accel2); + } + if let Some(accel2) = accels.remove(&next_start_id) { + accels.insert(next_norm_id, accel2); + } + accels.insert(next_start_id, accel); + update_special_accel(&mut self.special, next_start_id); + // Our start range shifts one to the right now. + self.special.min_start = + self.tt.next_state_id(self.special.min_start); + self.special.max_start = + self.tt.next_state_id(self.special.max_start); + next_start_id = self.tt.next_state_id(next_start_id); + next_norm_id = self.tt.next_state_id(next_norm_id); + } + // This is pretty tricky, but if our 'next_norm_id' state also + // happened to be accelerated, then the result is that it is + // now in the position of cur_id, so we need to consider it + // again. This loop is still guaranteed to terminate though, + // because when accels contains cur_id, we're guaranteed to + // increment next_norm_id even if cur_id remains unchanged. + if !accels.contains_key(&cur_id) { + cur_id = self.tt.prev_state_id(cur_id); + } + } + } + // Just like we did for match states, but we want to move accelerated + // start states to the beginning of the range instead of the end. + if cstart > 0 { + // N.B. special.{min,max}_start do not need updating, since the + // range/number of start states does not change at this point. Only + // the ordering of start states may change. 
+ let mut next_id = self.special.min_start; + let mut cur_id = next_id; + while cur_id <= self.special.max_start { + if let Some(accel) = accels.remove(&cur_id) { + remapper.swap(self, cur_id, next_id); + accels.insert(next_id, accel); + update_special_accel(&mut self.special, next_id); + next_id = self.tt.next_state_id(next_id); + } + cur_id = self.tt.next_state_id(cur_id); + } + } + + // Remap all transitions in our DFA and assert some things. + remapper.remap(self); + // This unwrap is OK because acceleration never changes the number of + // match states or patterns in those match states. Since acceleration + // runs after the pattern map has been set at least once, we know that + // our match states cannot error. + self.set_pattern_map(&new_matches).unwrap(); + self.special.set_max(); + self.special.validate().expect("special state ranges should validate"); + self.special + .validate_state_count(self.state_count(), self.stride2()) + .expect( + "special state ranges should be consistent with state count", + ); + assert_eq!( + self.special.accel_len(self.stride()), + // We record the number of accelerated states initially detected + // since the accels map is itself mutated in the process above. + // If mutated incorrectly, its size may change, and thus can't be + // trusted as a source of truth of how many accelerated states we + // expected there to be. + original_accels_len, + "mismatch with expected number of accelerated states", + ); + + // And finally record our accelerators. We kept our accels map updated + // as we shuffled states above, so the accelerators should now + // correspond to a contiguous range in the state ID space. (Which we + // assert.) + let mut prev: Option = None; + for (id, accel) in accels { + assert!(prev.map_or(true, |p| self.tt.next_state_id(p) == id)); + prev = Some(id); + self.accels.add(accel); + } + } + + /// Shuffle the states in this DFA so that starting states, match + /// states and accelerated states are all contiguous. + /// + /// See dfa/special.rs for more details. + pub(crate) fn shuffle( + &mut self, + mut matches: BTreeMap>, + ) -> Result<(), Error> { + // The determinizer always adds a quit state and it is always second. + self.special.quit_id = self.from_index(1); + // If all we have are the dead and quit states, then we're done and + // the DFA will never produce a match. + if self.state_count() <= 2 { + self.special.set_max(); + return Ok(()); + } + + // Collect all our start states into a convenient set and confirm there + // is no overlap with match states. In the classicl DFA construction, + // start states can be match states. But because of look-around, we + // delay all matches by a byte, which prevents start states from being + // match states. + let mut is_start: BTreeSet = BTreeSet::new(); + for (start_id, _, _) in self.starts() { + // While there's nothing theoretically wrong with setting a start + // state to a dead ID (indeed, it could be an optimization!), the + // shuffling code below assumes that start states aren't dead. If + // this assumption is violated, the dead state could be shuffled + // to a new location, which must never happen. So if we do want + // to allow start states to be dead, then this assert should be + // removed and the code below fixed. + // + // N.B. Minimization can cause start states to be dead, but that + // happens after states are shuffled, so it's OK. Also, start + // states are dead for the DFA that never matches anything, but + // in that case, there are no states to shuffle. 
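The comments above describe the two ingredients of acceleration: spotting states whose transitions almost all loop back to themselves, and shuffling the flagged states so that they form one contiguous ID range. A toy sketch of both ideas on plain vectors follows; the names, the flat `usize` IDs and the absence of premultiplication are simplifications for illustration, not the crate's real representation.

```rust
use std::collections::BTreeSet;

/// A row can be accelerated when at most three byte classes lead anywhere
/// other than back to the state itself; those few "escape" classes are what
/// a memchr-style scan would look for.
fn accel_classes(state: usize, row: &[usize]) -> Option<Vec<usize>> {
    let escapes: Vec<usize> = row
        .iter()
        .enumerate()
        .filter(|&(_, &to)| to != state)
        .map(|(class, _)| class)
        .collect();
    if (1..=3).contains(&escapes.len()) {
        Some(escapes)
    } else {
        None
    }
}

/// Move every flagged ID into the front of `ids` with pairwise swaps. After
/// the call the flagged IDs occupy the contiguous prefix `ids[..moved]`, so
/// "is this ID flagged?" becomes a range check instead of a set lookup.
fn partition_to_front(ids: &mut [usize], flagged: &BTreeSet<usize>) -> usize {
    let mut moved = 0;
    for i in 0..ids.len() {
        if flagged.contains(&ids[i]) {
            ids.swap(i, moved);
            moved += 1;
        }
    }
    moved
}

fn main() {
    // State 2 loops on every class except class 7: a good candidate.
    let row = vec![2usize, 2, 2, 2, 2, 2, 2, 5, 2];
    assert_eq!(Some(vec![7usize]), accel_classes(2, &row));

    let mut ids = vec![0usize, 1, 2, 3, 4, 5];
    let flagged: BTreeSet<usize> = vec![2, 4].into_iter().collect();
    let moved = partition_to_front(&mut ids, &flagged);
    assert_eq!(2, moved);
    assert!(ids[..moved].iter().all(|id| flagged.contains(id)));
}
```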
+ assert_ne!(start_id, DEAD, "start state cannot be dead"); + assert!( + !matches.contains_key(&start_id), + "{:?} is both a start and a match state, which is not allowed", + start_id, + ); + is_start.insert(start_id); + } + + // We implement shuffling by a sequence of pairwise swaps of states. + // Since we have a number of things referencing states via their + // IDs and swapping them changes their IDs, we need to record every + // swap we make so that we can remap IDs. The remapper handles this + // book-keeping for us. + let mut remapper = Remapper::from_dfa(self); + + // Shuffle matching states. + if matches.is_empty() { + self.special.min_match = DEAD; + self.special.max_match = DEAD; + } else { + // The determinizer guarantees that the first two states are the + // dead and quit states, respectively. We want our match states to + // come right after quit. + let mut next_id = self.from_index(2); + let mut new_matches = BTreeMap::new(); + self.special.min_match = next_id; + for (id, pids) in matches { + remapper.swap(self, next_id, id); + new_matches.insert(next_id, pids); + // If we swapped a start state, then update our set. + if is_start.contains(&next_id) { + is_start.remove(&next_id); + is_start.insert(id); + } + next_id = self.tt.next_state_id(next_id); + } + matches = new_matches; + self.special.max_match = cmp::max( + self.special.min_match, + self.tt.prev_state_id(next_id), + ); + } + + // Shuffle starting states. + { + let mut next_id = self.from_index(2); + if self.special.matches() { + next_id = self.tt.next_state_id(self.special.max_match); + } + self.special.min_start = next_id; + for id in is_start { + remapper.swap(self, next_id, id); + next_id = self.tt.next_state_id(next_id); + } + self.special.max_start = cmp::max( + self.special.min_start, + self.tt.prev_state_id(next_id), + ); + } + + // Finally remap all transitions in our DFA. + remapper.remap(self); + self.set_pattern_map(&matches)?; + self.special.set_max(); + self.special.validate().expect("special state ranges should validate"); + self.special + .validate_state_count(self.state_count(), self.stride2()) + .expect( + "special state ranges should be consistent with state count", + ); + Ok(()) + } +} + +/// A variety of generic internal methods for accessing DFA internals. +impl> DFA { + /// Return the byte classes used by this DFA. + pub(crate) fn byte_classes(&self) -> &ByteClasses { + &self.tt.classes + } + + /// Return the info about special states. + pub(crate) fn special(&self) -> &Special { + &self.special + } + + /// Return the info about special states as a mutable borrow. + #[cfg(feature = "alloc")] + pub(crate) fn special_mut(&mut self) -> &mut Special { + &mut self.special + } + + /// Returns an iterator over all states in this DFA. + /// + /// This iterator yields a tuple for each state. The first element of the + /// tuple corresponds to a state's identifier, and the second element + /// corresponds to the state itself (comprised of its transitions). + pub(crate) fn states(&self) -> StateIter<'_, T> { + self.tt.states() + } + + /// Return the total number of states in this DFA. Every DFA has at least + /// 1 state, even the empty DFA. + pub(crate) fn state_count(&self) -> usize { + self.tt.count() + } + + /// Return an iterator over all pattern IDs for the given match state. + /// + /// If the given state is not a match state, then this panics. 
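The `Remapper` referred to throughout the shuffling code is essentially a record of where every state ends up, applied once at the end so that stored transition targets stay meaningful. A rough standalone sketch of that bookkeeping, with invented names and a toy table (the real type also has to cope with premultiplied IDs and start/match metadata), might look like this:

```rust
/// Toy transition table: `table[s]` holds state `s`'s targets, one per class.
struct ToyDfa {
    table: Vec<Vec<usize>>,
}

/// Records where each original state ends up, so all targets can be rewritten
/// once after shuffling instead of chasing every reference per swap.
struct Remap {
    map: Vec<usize>, // map[old_id] = final location of that state
}

impl Remap {
    fn new(state_count: usize) -> Remap {
        Remap { map: (0..state_count).collect() }
    }

    /// Swap two states' rows and remember the exchange of positions.
    fn swap(&mut self, dfa: &mut ToyDfa, a: usize, b: usize) {
        dfa.table.swap(a, b);
        // Any old ID that previously lived at `a` now lives at `b`, and
        // vice versa.
        for new in self.map.iter_mut() {
            if *new == a {
                *new = b;
            } else if *new == b {
                *new = a;
            }
        }
    }

    /// Rewrite every stored transition target to its final ID.
    fn remap(&self, dfa: &mut ToyDfa) {
        for row in dfa.table.iter_mut() {
            for to in row.iter_mut() {
                *to = self.map[*to];
            }
        }
    }
}

fn main() {
    // Three states; state 0 points at state 2 on class 0.
    let mut dfa = ToyDfa { table: vec![vec![2, 0], vec![1, 1], vec![2, 2]] };
    let mut remap = Remap::new(3);
    remap.swap(&mut dfa, 1, 2); // move state 2 into slot 1
    remap.remap(&mut dfa);
    // The transition that used to target state 2 now targets slot 1.
    assert_eq!(1, dfa.table[0][0]);
}
```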
+ #[cfg(feature = "alloc")] + pub(crate) fn pattern_id_slice(&self, id: StateID) -> &[PatternID] { + assert!(self.is_match_state(id)); + self.ms.pattern_id_slice(self.match_state_index(id)) + } + + /// Return the total number of pattern IDs for the given match state. + /// + /// If the given state is not a match state, then this panics. + pub(crate) fn match_pattern_len(&self, id: StateID) -> usize { + assert!(self.is_match_state(id)); + self.ms.pattern_len(self.match_state_index(id)) + } + + /// Returns the total number of patterns matched by this DFA. + pub(crate) fn pattern_count(&self) -> usize { + self.ms.patterns + } + + /// Returns a map from match state ID to a list of pattern IDs that match + /// in that state. + #[cfg(feature = "alloc")] + pub(crate) fn pattern_map(&self) -> BTreeMap> { + self.ms.to_map(self) + } + + /// Returns the ID of the quit state for this DFA. + #[cfg(feature = "alloc")] + pub(crate) fn quit_id(&self) -> StateID { + self.from_index(1) + } + + /// Convert the given state identifier to the state's index. The state's + /// index corresponds to the position in which it appears in the transition + /// table. When a DFA is NOT premultiplied, then a state's identifier is + /// also its index. When a DFA is premultiplied, then a state's identifier + /// is equal to `index * alphabet_len`. This routine reverses that. + pub(crate) fn to_index(&self, id: StateID) -> usize { + self.tt.to_index(id) + } + + /// Convert an index to a state (in the range 0..self.state_count()) to an + /// actual state identifier. + /// + /// This is useful when using a `Vec` as an efficient map keyed by state + /// to some other information (such as a remapped state ID). + #[cfg(feature = "alloc")] + pub(crate) fn from_index(&self, index: usize) -> StateID { + self.tt.from_index(index) + } + + /// Return the table of state IDs for this DFA's start states. + pub(crate) fn starts(&self) -> StartStateIter<'_> { + self.st.iter() + } + + /// Returns the index of the match state for the given ID. If the + /// given ID does not correspond to a match state, then this may + /// panic or produce an incorrect result. + fn match_state_index(&self, id: StateID) -> usize { + debug_assert!(self.is_match_state(id)); + // This is one of the places where we rely on the fact that match + // states are contiguous in the transition table. Namely, that the + // first match state ID always corresponds to dfa.special.min_start. + // From there, since we know the stride, we can compute the overall + // index of any match state given the match state's ID. + let min = self.special().min_match.as_usize(); + // CORRECTNESS: We're allowed to produce an incorrect result or panic, + // so both the subtraction and the unchecked StateID construction is + // OK. + self.to_index(StateID::new_unchecked(id.as_usize() - min)) + } + + /// Returns the index of the accelerator state for the given ID. If the + /// given ID does not correspond to an accelerator state, then this may + /// panic or produce an incorrect result. + fn accelerator_index(&self, id: StateID) -> usize { + let min = self.special().min_accel.as_usize(); + // CORRECTNESS: We're allowed to produce an incorrect result or panic, + // so both the subtraction and the unchecked StateID construction is + // OK. + self.to_index(StateID::new_unchecked(id.as_usize() - min)) + } + + /// Return the accelerators for this DFA. + fn accels(&self) -> Accels<&[u32]> { + self.accels.as_ref() + } + + /// Return this DFA's transition table as a slice. 
+ fn trans(&self) -> &[StateID] { + self.tt.table() + } +} + +impl> fmt::Debug for DFA { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + writeln!(f, "dense::DFA(")?; + for state in self.states() { + fmt_state_indicator(f, self, state.id())?; + let id = if f.alternate() { + state.id().as_usize() + } else { + self.to_index(state.id()) + }; + write!(f, "{:06?}: ", id)?; + state.fmt(f)?; + write!(f, "\n")?; + } + writeln!(f, "")?; + for (i, (start_id, sty, pid)) in self.starts().enumerate() { + let id = if f.alternate() { + start_id.as_usize() + } else { + self.to_index(start_id) + }; + if i % self.st.stride == 0 { + match pid { + None => writeln!(f, "START-GROUP(ALL)")?, + Some(pid) => { + writeln!(f, "START_GROUP(pattern: {:?})", pid)? + } + } + } + writeln!(f, " {:?} => {:06?}", sty, id)?; + } + if self.pattern_count() > 1 { + writeln!(f, "")?; + for i in 0..self.ms.count() { + let id = self.ms.match_state_id(self, i); + let id = if f.alternate() { + id.as_usize() + } else { + self.to_index(id) + }; + write!(f, "MATCH({:06?}): ", id)?; + for (i, &pid) in self.ms.pattern_id_slice(i).iter().enumerate() + { + if i > 0 { + write!(f, ", ")?; + } + write!(f, "{:?}", pid)?; + } + writeln!(f, "")?; + } + } + writeln!(f, "state count: {:?}", self.state_count())?; + writeln!(f, "pattern count: {:?}", self.pattern_count())?; + writeln!(f, ")")?; + Ok(()) + } +} + +unsafe impl> Automaton for DFA { + #[inline] + fn is_special_state(&self, id: StateID) -> bool { + self.special.is_special_state(id) + } + + #[inline] + fn is_dead_state(&self, id: StateID) -> bool { + self.special.is_dead_state(id) + } + + #[inline] + fn is_quit_state(&self, id: StateID) -> bool { + self.special.is_quit_state(id) + } + + #[inline] + fn is_match_state(&self, id: StateID) -> bool { + self.special.is_match_state(id) + } + + #[inline] + fn is_start_state(&self, id: StateID) -> bool { + self.special.is_start_state(id) + } + + #[inline] + fn is_accel_state(&self, id: StateID) -> bool { + self.special.is_accel_state(id) + } + + #[inline] + fn next_state(&self, current: StateID, input: u8) -> StateID { + let input = self.byte_classes().get(input); + let o = current.as_usize() + usize::from(input); + self.trans()[o] + } + + #[inline] + unsafe fn next_state_unchecked( + &self, + current: StateID, + input: u8, + ) -> StateID { + let input = self.byte_classes().get_unchecked(input); + let o = current.as_usize() + usize::from(input); + *self.trans().get_unchecked(o) + } + + #[inline] + fn next_eoi_state(&self, current: StateID) -> StateID { + let eoi = self.byte_classes().eoi().as_usize(); + let o = current.as_usize() + eoi; + self.trans()[o] + } + + #[inline] + fn pattern_count(&self) -> usize { + self.ms.patterns + } + + #[inline] + fn match_count(&self, id: StateID) -> usize { + self.match_pattern_len(id) + } + + #[inline] + fn match_pattern(&self, id: StateID, match_index: usize) -> PatternID { + // This is an optimization for the very common case of a DFA with a + // single pattern. This conditional avoids a somewhat more costly path + // that finds the pattern ID from the state machine, which requires + // a bit of slicing/pointer-chasing. This optimization tends to only + // matter when matches are frequent. 
+ if self.ms.patterns == 1 { + return PatternID::ZERO; + } + let state_index = self.match_state_index(id); + self.ms.pattern_id(state_index, match_index) + } + + #[inline] + fn start_state_forward( + &self, + pattern_id: Option, + bytes: &[u8], + start: usize, + end: usize, + ) -> StateID { + let index = Start::from_position_fwd(bytes, start, end); + self.st.start(index, pattern_id) + } + + #[inline] + fn start_state_reverse( + &self, + pattern_id: Option, + bytes: &[u8], + start: usize, + end: usize, + ) -> StateID { + let index = Start::from_position_rev(bytes, start, end); + self.st.start(index, pattern_id) + } + + #[inline(always)] + fn accelerator(&self, id: StateID) -> &[u8] { + if !self.is_accel_state(id) { + return &[]; + } + self.accels.needles(self.accelerator_index(id)) + } +} + +/// The transition table portion of a dense DFA. +/// +/// The transition table is the core part of the DFA in that it describes how +/// to move from one state to another based on the input sequence observed. +#[derive(Clone)] +pub(crate) struct TransitionTable { + /// A contiguous region of memory representing the transition table in + /// row-major order. The representation is dense. That is, every state + /// has precisely the same number of transitions. The maximum number of + /// transitions per state is 257 (256 for each possible byte value, plus 1 + /// for the special EOI transition). If a DFA has been instructed to use + /// byte classes (the default), then the number of transitions is usually + /// substantially fewer. + /// + /// In practice, T is either `Vec` or `&[u32]`. + table: T, + /// A set of equivalence classes, where a single equivalence class + /// represents a set of bytes that never discriminate between a match + /// and a non-match in the DFA. Each equivalence class corresponds to a + /// single character in this DFA's alphabet, where the maximum number of + /// characters is 257 (each possible value of a byte plus the special + /// EOI transition). Consequently, the number of equivalence classes + /// corresponds to the number of transitions for each DFA state. Note + /// though that the *space* used by each DFA state in the transition table + /// may be larger. The total space used by each DFA state is known as the + /// stride. + /// + /// The only time the number of equivalence classes is fewer than 257 is if + /// the DFA's kind uses byte classes (which is the default). Equivalence + /// classes should generally only be disabled when debugging, so that + /// the transitions themselves aren't obscured. Disabling them has no + /// other benefit, since the equivalence class map is always used while + /// searching. In the vast majority of cases, the number of equivalence + /// classes is substantially smaller than 257, particularly when large + /// Unicode classes aren't used. + classes: ByteClasses, + /// The stride of each DFA state, expressed as a power-of-two exponent. + /// + /// The stride of a DFA corresponds to the total amount of space used by + /// each DFA state in the transition table. This may be bigger than the + /// size of a DFA's alphabet, since the stride is always the smallest + /// power of two greater than or equal to the alphabet size. + /// + /// While this wastes space, this avoids the need for integer division + /// to convert between premultiplied state IDs and their corresponding + /// indices. Instead, we can use simple bit-shifts. + /// + /// See the docs for the `stride2` method for more details. 
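The `Automaton` methods implemented above are enough to drive a search by hand. The sketch below uses only calls that appear in this impl (`start_state_forward`, `next_state`, `next_eoi_state`, `is_match_state`, `is_dead_state`) together with the one-byte match delay noted earlier; it ignores quit states and per-pattern start states, and the function name is invented. The crate's own search routines such as `find_leftmost_fwd` remain the right tool for real use.

```rust
use regex_automata::dfa::{dense, Automaton};

// Report the end offset of the final match state entered, or None. Matches
// are delayed by one byte in these DFAs, so a match signalled after
// consuming haystack[i] ends at offset i, and the EOI transition can reveal
// a match ending at the end of the haystack.
fn find_last_match_end<A: Automaton>(dfa: &A, haystack: &[u8]) -> Option<usize> {
    let mut state = dfa.start_state_forward(None, haystack, 0, haystack.len());
    let mut last = None;
    for (i, &byte) in haystack.iter().enumerate() {
        state = dfa.next_state(state, byte);
        if dfa.is_match_state(state) {
            last = Some(i);
        } else if dfa.is_dead_state(state) {
            // A dead state never leaves itself, so no further match is
            // possible.
            return last;
        }
    }
    state = dfa.next_eoi_state(state);
    if dfa.is_match_state(state) {
        last = Some(haystack.len());
    }
    last
}

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let dfa = dense::DFA::new("foo[0-9]+")?;
    // Same offset as the crate's find_leftmost_fwd in the examples above.
    assert_eq!(Some(8), find_last_match_end(&dfa, b"foo12345"));
    Ok(())
}
```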
+ /// + /// The minimum `stride2` value is `1` (corresponding to a stride of `2`) + /// while the maximum `stride2` value is `9` (corresponding to a stride of + /// `512`). The maximum is not `8` since the maximum alphabet size is `257` + /// when accounting for the special EOI transition. However, an alphabet + /// length of that size is exceptionally rare since the alphabet is shrunk + /// into equivalence classes. + stride2: usize, +} + +impl<'a> TransitionTable<&'a [u32]> { + /// Deserialize a transition table starting at the beginning of `slice`. + /// Upon success, return the total number of bytes read along with the + /// transition table. + /// + /// If there was a problem deserializing any part of the transition table, + /// then this returns an error. Notably, if the given slice does not have + /// the same alignment as `StateID`, then this will return an error (among + /// other possible errors). + /// + /// This is guaranteed to execute in constant time. + /// + /// # Safety + /// + /// This routine is not safe because it does not check the valdity of the + /// transition table itself. In particular, the transition table can be + /// quite large, so checking its validity can be somewhat expensive. An + /// invalid transition table is not safe because other code may rely on the + /// transition table being correct (such as explicit bounds check elision). + /// Therefore, an invalid transition table can lead to undefined behavior. + /// + /// Callers that use this function must either pass on the safety invariant + /// or guarantee that the bytes given contain a valid transition table. + /// This guarantee is upheld by the bytes written by `write_to`. + unsafe fn from_bytes_unchecked( + mut slice: &'a [u8], + ) -> Result<(TransitionTable<&'a [u32]>, usize), DeserializeError> { + let slice_start = slice.as_ptr() as usize; + + let (count, nr) = bytes::try_read_u32_as_usize(slice, "state count")?; + slice = &slice[nr..]; + + let (stride2, nr) = bytes::try_read_u32_as_usize(slice, "stride2")?; + slice = &slice[nr..]; + + let (classes, nr) = ByteClasses::from_bytes(slice)?; + slice = &slice[nr..]; + + // The alphabet length (determined by the byte class map) cannot be + // bigger than the stride (total space used by each DFA state). + if stride2 > 9 { + return Err(DeserializeError::generic( + "dense DFA has invalid stride2 (too big)", + )); + } + // It also cannot be zero, since even a DFA that never matches anything + // has a non-zero number of states with at least two equivalence + // classes: one for all 256 byte values and another for the EOI + // sentinel. + if stride2 < 1 { + return Err(DeserializeError::generic( + "dense DFA has invalid stride2 (too small)", + )); + } + // This is OK since 1 <= stride2 <= 9. + let stride = + 1usize.checked_shl(u32::try_from(stride2).unwrap()).unwrap(); + if classes.alphabet_len() > stride { + return Err(DeserializeError::generic( + "alphabet size cannot be bigger than transition table stride", + )); + } + + let trans_count = + bytes::shl(count, stride2, "dense table transition count")?; + let table_bytes_len = bytes::mul( + trans_count, + StateID::SIZE, + "dense table state byte count", + )?; + bytes::check_slice_len(slice, table_bytes_len, "transition table")?; + bytes::check_alignment::(slice)?; + let table_bytes = &slice[..table_bytes_len]; + slice = &slice[table_bytes_len..]; + // SAFETY: Since StateID is always representable as a u32, all we need + // to do is ensure that we have the proper length and alignment. 
We've + // checked both above, so the cast below is safe. + // + // N.B. This is the only not-safe code in this function, so we mark + // it explicitly to call it out, even though it is technically + // superfluous. + #[allow(unused_unsafe)] + let table = unsafe { + core::slice::from_raw_parts( + table_bytes.as_ptr() as *const u32, + trans_count, + ) + }; + let tt = TransitionTable { table, classes, stride2 }; + Ok((tt, slice.as_ptr() as usize - slice_start)) + } +} + +#[cfg(feature = "alloc")] +impl TransitionTable> { + /// Create a minimal transition table with just two states: a dead state + /// and a quit state. The alphabet length and stride of the transition + /// table is determined by the given set of equivalence classes. + fn minimal(classes: ByteClasses) -> TransitionTable> { + let mut tt = TransitionTable { + table: vec![], + classes, + stride2: classes.stride2(), + }; + // Two states, regardless of alphabet size, can always fit into u32. + tt.add_empty_state().unwrap(); // dead state + tt.add_empty_state().unwrap(); // quit state + tt + } + + /// Set a transition in this table. Both the `from` and `to` states must + /// already exist, otherwise this panics. `unit` should correspond to the + /// transition out of `from` to set to `to`. + fn set(&mut self, from: StateID, unit: alphabet::Unit, to: StateID) { + assert!(self.is_valid(from), "invalid 'from' state"); + assert!(self.is_valid(to), "invalid 'to' state"); + self.table[from.as_usize() + self.classes.get_by_unit(unit)] = + to.as_u32(); + } + + /// Add an empty state (a state where all transitions lead to a dead state) + /// and return its identifier. The identifier returned is guaranteed to + /// not point to any other existing state. + /// + /// If adding a state would exhaust the state identifier space, then this + /// returns an error. + fn add_empty_state(&mut self) -> Result { + // Normally, to get a fresh state identifier, we would just + // take the index of the next state added to the transition + // table. However, we actually perform an optimization here + // that premultiplies state IDs by the stride, such that they + // point immediately at the beginning of their transitions in + // the transition table. This avoids an extra multiplication + // instruction for state lookup at search time. + // + // Premultiplied identifiers means that instead of your matching + // loop looking something like this: + // + // state = dfa.start + // for byte in haystack: + // next = dfa.transitions[state * stride + byte] + // if dfa.is_match(next): + // return true + // return false + // + // it can instead look like this: + // + // state = dfa.start + // for byte in haystack: + // next = dfa.transitions[state + byte] + // if dfa.is_match(next): + // return true + // return false + // + // In other words, we save a multiplication instruction in the + // critical path. This turns out to be a decent performance win. + // The cost of using premultiplied state ids is that they can + // require a bigger state id representation. (And they also make + // the code a bit more complex, especially during minimization and + // when reshuffling states, as one needs to convert back and forth + // between state IDs and state indices.) + // + // To do this, we simply take the index of the state into the + // entire transition table, rather than the index of the state + // itself. e.g., If the stride is 64, then the ID of the 3rd state + // is 192, not 2. 
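A small self-contained illustration of the arithmetic described in the comment above, using plain integers and invented names: the stride is the smallest power of two no smaller than the alphabet, a state's premultiplied ID is its index shifted left by `stride2`, and a transition lookup reduces to an add plus an index.

```rust
/// Smallest power-of-two exponent such that `1 << stride2 >= alphabet_len`.
/// Mirrors the constraint that `1 <= stride2 <= 9` for a dense DFA, since
/// the alphabet never exceeds 257 equivalence classes.
fn stride2_for(alphabet_len: usize) -> usize {
    let mut stride2 = 1;
    while (1 << stride2) < alphabet_len {
        stride2 += 1;
    }
    assert!(stride2 <= 9);
    stride2
}

fn main() {
    let alphabet_len = 5; // e.g. 4 byte classes plus EOI
    let stride2 = stride2_for(alphabet_len);
    let stride = 1 << stride2; // 8: wastes a little space, avoids division

    // Premultiplied IDs: the ID of the state at index `i` is `i << stride2`,
    // so it points directly at the start of that state's row.
    let index = 3;
    let id = index << stride2;
    assert_eq!(24, id);
    assert_eq!(index, id >> stride2);

    // A transition lookup is then a single add: no multiply in the hot loop.
    let table = vec![0u32; 6 * stride]; // six states' worth of transitions
    let class = 2; // equivalence class of the current input byte
    let next = table[id + class];
    assert_eq!(0, next); // everything still points at the dead state here
}
```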
+ let next = self.table.len(); + let id = StateID::new(next).map_err(|_| Error::too_many_states())?; + self.table.extend(iter::repeat(0).take(self.stride())); + Ok(id) + } + + /// Swap the two states given in this transition table. + /// + /// This routine does not do anything to check the correctness of this + /// swap. Callers must ensure that other states pointing to id1 and id2 are + /// updated appropriately. + /// + /// Both id1 and id2 must point to valid states, otherwise this panics. + fn swap(&mut self, id1: StateID, id2: StateID) { + assert!(self.is_valid(id1), "invalid 'id1' state: {:?}", id1); + assert!(self.is_valid(id2), "invalid 'id2' state: {:?}", id2); + // We only need to swap the parts of the state that are used. So if the + // stride is 64, but the alphabet length is only 33, then we save a lot + // of work. + for b in 0..self.classes.alphabet_len() { + self.table.swap(id1.as_usize() + b, id2.as_usize() + b); + } + } + + /// Truncate the states in this transition table to the given count. + /// + /// This routine does not do anything to check the correctness of this + /// truncation. Callers must ensure that other states pointing to truncated + /// states are updated appropriately. + fn truncate(&mut self, count: usize) { + self.table.truncate(count << self.stride2); + } + + /// Return a mutable representation of the state corresponding to the given + /// id. This is useful for implementing routines that manipulate DFA states + /// (e.g., swapping states). + fn state_mut(&mut self, id: StateID) -> StateMut<'_> { + let alphabet_len = self.alphabet_len(); + let i = id.as_usize(); + StateMut { + id, + stride2: self.stride2, + transitions: &mut self.table_mut()[i..i + alphabet_len], + } + } +} + +impl> TransitionTable { + /// Writes a serialized form of this transition table to the buffer given. + /// If the buffer is too small, then an error is returned. To determine + /// how big the buffer must be, use `write_to_len`. + fn write_to( + &self, + mut dst: &mut [u8], + ) -> Result { + let nwrite = self.write_to_len(); + if dst.len() < nwrite { + return Err(SerializeError::buffer_too_small("transition table")); + } + dst = &mut dst[..nwrite]; + + // write state count + // Unwrap is OK since number of states is guaranteed to fit in a u32. + E::write_u32(u32::try_from(self.count()).unwrap(), dst); + dst = &mut dst[size_of::()..]; + + // write state stride (as power of 2) + // Unwrap is OK since stride2 is guaranteed to be <= 9. + E::write_u32(u32::try_from(self.stride2).unwrap(), dst); + dst = &mut dst[size_of::()..]; + + // write byte class map + let n = self.classes.write_to(dst)?; + dst = &mut dst[n..]; + + // write actual transitions + for &sid in self.table() { + let n = bytes::write_state_id::(sid, &mut dst); + dst = &mut dst[n..]; + } + Ok(nwrite) + } + + /// Returns the number of bytes the serialized form of this transition + /// table will use. + fn write_to_len(&self) -> usize { + size_of::() // state count + + size_of::() // stride2 + + self.classes.write_to_len() + + (self.table().len() * StateID::SIZE) + } + + /// Validates that every state ID in this transition table is valid. + /// + /// That is, every state ID can be used to correctly index a state in this + /// table. 
+ fn validate(&self) -> Result<(), DeserializeError> { + for state in self.states() { + for (_, to) in state.transitions() { + if !self.is_valid(to) { + return Err(DeserializeError::generic( + "found invalid state ID in transition table", + )); + } + } + } + Ok(()) + } + + /// Converts this transition table to a borrowed value. + fn as_ref(&self) -> TransitionTable<&'_ [u32]> { + TransitionTable { + table: self.table.as_ref(), + classes: self.classes.clone(), + stride2: self.stride2, + } + } + + /// Converts this transition table to an owned value. + #[cfg(feature = "alloc")] + fn to_owned(&self) -> TransitionTable> { + TransitionTable { + table: self.table.as_ref().to_vec(), + classes: self.classes.clone(), + stride2: self.stride2, + } + } + + /// Return the state for the given ID. If the given ID is not valid, then + /// this panics. + fn state(&self, id: StateID) -> State<'_> { + assert!(self.is_valid(id)); + + let i = id.as_usize(); + State { + id, + stride2: self.stride2, + transitions: &self.table()[i..i + self.alphabet_len()], + } + } + + /// Returns an iterator over all states in this transition table. + /// + /// This iterator yields a tuple for each state. The first element of the + /// tuple corresponds to a state's identifier, and the second element + /// corresponds to the state itself (comprised of its transitions). + fn states(&self) -> StateIter<'_, T> { + StateIter { + tt: self, + it: self.table().chunks(self.stride()).enumerate(), + } + } + + /// Convert a state identifier to an index to a state (in the range + /// 0..self.count()). + /// + /// This is useful when using a `Vec` as an efficient map keyed by state + /// to some other information (such as a remapped state ID). + /// + /// If the given ID is not valid, then this may panic or produce an + /// incorrect index. + fn to_index(&self, id: StateID) -> usize { + id.as_usize() >> self.stride2 + } + + /// Convert an index to a state (in the range 0..self.count()) to an actual + /// state identifier. + /// + /// This is useful when using a `Vec` as an efficient map keyed by state + /// to some other information (such as a remapped state ID). + /// + /// If the given index is not in the specified range, then this may panic + /// or produce an incorrect state ID. + fn from_index(&self, index: usize) -> StateID { + // CORRECTNESS: If the given index is not valid, then it is not + // required for this to panic or return a valid state ID. + StateID::new_unchecked(index << self.stride2) + } + + /// Returns the state ID for the state immediately following the one given. + /// + /// This does not check whether the state ID returned is invalid. In fact, + /// if the state ID given is the last state in this DFA, then the state ID + /// returned is guaranteed to be invalid. + #[cfg(feature = "alloc")] + fn next_state_id(&self, id: StateID) -> StateID { + self.from_index(self.to_index(id).checked_add(1).unwrap()) + } + + /// Returns the state ID for the state immediately preceding the one given. + /// + /// If the dead ID given (which is zero), then this panics. + #[cfg(feature = "alloc")] + fn prev_state_id(&self, id: StateID) -> StateID { + self.from_index(self.to_index(id).checked_sub(1).unwrap()) + } + + /// Returns the table as a slice of state IDs. + fn table(&self) -> &[StateID] { + let integers = self.table.as_ref(); + // SAFETY: This is safe because StateID is guaranteed to be + // representable as a u32. 
+ unsafe { + core::slice::from_raw_parts( + integers.as_ptr() as *const StateID, + integers.len(), + ) + } + } + + /// Returns the total number of states in this transition table. + /// + /// Note that a DFA always has at least two states: the dead and quit + /// states. In particular, the dead state always has ID 0 and is + /// correspondingly always the first state. The dead state is never a match + /// state. + fn count(&self) -> usize { + self.table().len() >> self.stride2 + } + + /// Returns the total stride for every state in this DFA. This corresponds + /// to the total number of transitions used by each state in this DFA's + /// transition table. + fn stride(&self) -> usize { + 1 << self.stride2 + } + + /// Returns the total number of elements in the alphabet for this + /// transition table. This is always less than or equal to `self.stride()`. + /// It is only equal when the alphabet length is a power of 2. Otherwise, + /// it is always strictly less. + fn alphabet_len(&self) -> usize { + self.classes.alphabet_len() + } + + /// Returns true if and only if the given state ID is valid for this + /// transition table. Validity in this context means that the given ID can + /// be used as a valid offset with `self.stride()` to index this transition + /// table. + fn is_valid(&self, id: StateID) -> bool { + let id = id.as_usize(); + id < self.table().len() && id % self.stride() == 0 + } + + /// Return the memory usage, in bytes, of this transition table. + /// + /// This does not include the size of a `TransitionTable` value itself. + fn memory_usage(&self) -> usize { + self.table().len() * StateID::SIZE + } +} + +#[cfg(feature = "alloc")] +impl> TransitionTable { + /// Returns the table as a slice of state IDs. + fn table_mut(&mut self) -> &mut [StateID] { + let integers = self.table.as_mut(); + // SAFETY: This is safe because StateID is guaranteed to be + // representable as a u32. + unsafe { + core::slice::from_raw_parts_mut( + integers.as_mut_ptr() as *mut StateID, + integers.len(), + ) + } + } +} + +/// The set of all possible starting states in a DFA. +/// +/// The set of starting states corresponds to the possible choices one can make +/// in terms of starting a DFA. That is, before following the first transition, +/// you first need to select the state that you start in. +/// +/// Normally, a DFA converted from an NFA that has a single starting state +/// would itself just have one starting state. However, our support for look +/// around generally requires more starting states. The correct starting state +/// is chosen based on certain properties of the position at which we begin +/// our search. +/// +/// Before listing those properties, we first must define two terms: +/// +/// * `haystack` - The bytes to execute the search. The search always starts +/// at the beginning of `haystack` and ends before or at the end of +/// `haystack`. +/// * `context` - The (possibly empty) bytes surrounding `haystack`. `haystack` +/// must be contained within `context` such that `context` is at least as big +/// as `haystack`. +/// +/// This split is crucial for dealing with look-around. For example, consider +/// the context `foobarbaz`, the haystack `bar` and the regex `^bar$`. This +/// regex should _not_ match the haystack since `bar` does not appear at the +/// beginning of the input. Similarly, the regex `\Bbar\B` should match the +/// haystack because `bar` is not surrounded by word boundaries. 
But a search +/// that does not take context into account would not permit `\B` to match +/// since the beginning of any string matches a word boundary. Similarly, a +/// search that does not take context into account when searching `^bar$` in +/// the haystack `bar` would produce a match when it shouldn't. +/// +/// Thus, it follows that the starting state is chosen based on the following +/// criteria, derived from the position at which the search starts in the +/// `context` (corresponding to the start of `haystack`): +/// +/// 1. If the search starts at the beginning of `context`, then the `Text` +/// start state is used. (Since `^` corresponds to +/// `hir::Anchor::StartText`.) +/// 2. If the search starts at a position immediately following a line +/// terminator, then the `Line` start state is used. (Since `(?m:^)` +/// corresponds to `hir::Anchor::StartLine`.) +/// 3. If the search starts at a position immediately following a byte +/// classified as a "word" character (`[_0-9a-zA-Z]`), then the `WordByte` +/// start state is used. (Since `(?-u:\b)` corresponds to a word boundary.) +/// 4. Otherwise, if the search starts at a position immediately following +/// a byte that is not classified as a "word" character (`[^_0-9a-zA-Z]`), +/// then the `NonWordByte` start state is used. (Since `(?-u:\B)` +/// corresponds to a not-word-boundary.) +/// +/// (N.B. Unicode word boundaries are not supported by the DFA because they +/// require multi-byte look-around and this is difficult to support in a DFA.) +/// +/// To further complicate things, we also support constructing individual +/// anchored start states for each pattern in the DFA. (Which is required to +/// implement overlapping regexes correctly, but is also generally useful.) +/// Thus, when individual start states for each pattern are enabled, then the +/// total number of start states represented is `4 + (4 * #patterns)`, where +/// the 4 comes from each of the 4 possibilities above. The first 4 represents +/// the starting states for the entire DFA, which support searching for +/// multiple patterns simultaneously (possibly unanchored). +/// +/// If individual start states are disabled, then this will only store 4 +/// start states. Typically, individual start states are only enabled when +/// constructing the reverse DFA for regex matching. But they are also useful +/// for building DFAs that can search for a specific pattern or even to support +/// both anchored and unanchored searches with the same DFA. +/// +/// Note though that while the start table always has either `4` or +/// `4 + (4 * #patterns)` starting state *ids*, the total number of states +/// might be considerably smaller. That is, many of the IDs may be duplicative. +/// (For example, if a regex doesn't have a `\b` sub-pattern, then there's no +/// reason to generate a unique starting state for handling word boundaries. +/// Similarly for start/end anchors.) +#[derive(Clone)] +pub(crate) struct StartTable { + /// The initial start state IDs. + /// + /// In practice, T is either `Vec` or `&[u32]`. + /// + /// The first `stride` (currently always 4) entries always correspond to + /// the start states for the entire DFA. After that, there are + /// `stride * patterns` state IDs, where `patterns` may be zero in the + /// case of a DFA with no patterns or in the case where the DFA was built + /// without enabling starting states for each pattern. + table: T, + /// The number of starting state IDs per pattern. 
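Read operationally, the four cases above amount to classifying the byte immediately before the search start within the surrounding context. A sketch with an invented `StartKind` enum follows; the real crate performs this classification through its `Start` type, so the names and shape here are illustrative only.

```rust
/// Which of the four start configurations applies, mirroring the list above.
#[derive(Debug, PartialEq)]
enum StartKind {
    Text,        // search begins at the start of the context
    Line,        // previous byte is a line terminator
    WordByte,    // previous byte is [_0-9a-zA-Z]
    NonWordByte, // anything else
}

fn is_word_byte(b: u8) -> bool {
    b == b'_' || b.is_ascii_alphanumeric()
}

/// Classify position `start` within `context` (the haystack plus any
/// surrounding bytes) in order to pick the appropriate start state.
fn classify_start(context: &[u8], start: usize) -> StartKind {
    if start == 0 {
        StartKind::Text
    } else if context[start - 1] == b'\n' {
        StartKind::Line
    } else if is_word_byte(context[start - 1]) {
        StartKind::WordByte
    } else {
        StartKind::NonWordByte
    }
}

fn main() {
    let context = b"foo\nbar baz";
    assert_eq!(StartKind::Text, classify_start(context, 0));
    assert_eq!(StartKind::Line, classify_start(context, 4)); // after "foo\n"
    assert_eq!(StartKind::WordByte, classify_start(context, 2)); // after "fo"
    assert_eq!(StartKind::NonWordByte, classify_start(context, 8)); // after ' '
}
```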
+ stride: usize,
+ /// The total number of patterns for which starting states are encoded.
+ /// This may be zero for non-empty DFAs when the DFA was built without
+ /// start states for each pattern. Thus, one cannot use this field to
+ /// say how many patterns are in the DFA in all cases. It is specific to
+ /// how many patterns are represented in this start table.
+ patterns: usize,
+}
+
+#[cfg(feature = "alloc")]
+impl StartTable<Vec<u32>> {
+ /// Create a valid set of start states all pointing to the dead state.
+ ///
+ /// When the corresponding DFA is constructed with start states for each
+ /// pattern, then `patterns` should be the number of patterns. Otherwise,
+ /// it should be zero.
+ ///
+ /// If the total table size could exceed the allocatable limit, then this
+ /// returns an error. In practice, this is unlikely to occur, since
+ /// allocation would almost certainly have failed long before it got to
+ /// this point.
+ fn dead(patterns: usize) -> Result<StartTable<Vec<u32>>, Error> {
+ assert!(patterns <= PatternID::LIMIT);
+ let stride = Start::count();
+ let pattern_starts_len = match stride.checked_mul(patterns) {
+ Some(x) => x,
+ None => return Err(Error::too_many_start_states()),
+ };
+ let table_len = match stride.checked_add(pattern_starts_len) {
+ Some(x) => x,
+ None => return Err(Error::too_many_start_states()),
+ };
+ if table_len > core::isize::MAX as usize {
+ return Err(Error::too_many_start_states());
+ }
+ let table = vec![DEAD.as_u32(); table_len];
+ Ok(StartTable { table, stride, patterns })
+ }
+}
+
+impl<'a> StartTable<&'a [u32]> {
+ /// Deserialize a table of start state IDs starting at the beginning of
+ /// `slice`. Upon success, return the total number of bytes read along with
+ /// the table of starting state IDs.
+ ///
+ /// If there was a problem deserializing any part of the starting IDs,
+ /// then this returns an error. Notably, if the given slice does not have
+ /// the same alignment as `StateID`, then this will return an error (among
+ /// other possible errors).
+ ///
+ /// This is guaranteed to execute in constant time.
+ ///
+ /// # Safety
+ ///
+ /// This routine is not safe because it does not check the validity of the
+ /// starting state IDs themselves. In particular, the number of starting
+ /// IDs can vary, so it's possible that checking their
+ /// validity cannot be done in constant time. An invalid starting state
+ /// ID is not safe because other code may rely on the starting IDs being
+ /// correct (such as explicit bounds check elision). Therefore, an invalid
+ /// start ID can lead to undefined behavior.
+ ///
+ /// Callers that use this function must either pass on the safety invariant
+ /// or guarantee that the bytes given contain valid starting state IDs.
+ /// This guarantee is upheld by the bytes written by `write_to`.
+ unsafe fn from_bytes_unchecked( + mut slice: &'a [u8], + ) -> Result<(StartTable<&'a [u32]>, usize), DeserializeError> { + let slice_start = slice.as_ptr() as usize; + + let (stride, nr) = + bytes::try_read_u32_as_usize(slice, "start table stride")?; + slice = &slice[nr..]; + + let (patterns, nr) = + bytes::try_read_u32_as_usize(slice, "start table patterns")?; + slice = &slice[nr..]; + + if stride != Start::count() { + return Err(DeserializeError::generic( + "invalid starting table stride", + )); + } + if patterns > PatternID::LIMIT { + return Err(DeserializeError::generic( + "invalid number of patterns", + )); + } + let pattern_table_size = + bytes::mul(stride, patterns, "invalid pattern count")?; + // Our start states always start with a single stride of start states + // for the entire automaton which permit it to match any pattern. What + // follows it are an optional set of start states for each pattern. + let start_state_count = bytes::add( + stride, + pattern_table_size, + "invalid 'any' pattern starts size", + )?; + let table_bytes_len = bytes::mul( + start_state_count, + StateID::SIZE, + "pattern table bytes length", + )?; + bytes::check_slice_len(slice, table_bytes_len, "start ID table")?; + bytes::check_alignment::(slice)?; + let table_bytes = &slice[..table_bytes_len]; + slice = &slice[table_bytes_len..]; + // SAFETY: Since StateID is always representable as a u32, all we need + // to do is ensure that we have the proper length and alignment. We've + // checked both above, so the cast below is safe. + // + // N.B. This is the only not-safe code in this function, so we mark + // it explicitly to call it out, even though it is technically + // superfluous. + #[allow(unused_unsafe)] + let table = unsafe { + core::slice::from_raw_parts( + table_bytes.as_ptr() as *const u32, + start_state_count, + ) + }; + let st = StartTable { table, stride, patterns }; + Ok((st, slice.as_ptr() as usize - slice_start)) + } +} + +impl> StartTable { + /// Writes a serialized form of this start table to the buffer given. If + /// the buffer is too small, then an error is returned. To determine how + /// big the buffer must be, use `write_to_len`. + fn write_to( + &self, + mut dst: &mut [u8], + ) -> Result { + let nwrite = self.write_to_len(); + if dst.len() < nwrite { + return Err(SerializeError::buffer_too_small( + "starting table ids", + )); + } + dst = &mut dst[..nwrite]; + + // write stride + // Unwrap is OK since the stride is always 4 (currently). + E::write_u32(u32::try_from(self.stride).unwrap(), dst); + dst = &mut dst[size_of::()..]; + // write pattern count + // Unwrap is OK since number of patterns is guaranteed to fit in a u32. + E::write_u32(u32::try_from(self.patterns).unwrap(), dst); + dst = &mut dst[size_of::()..]; + // write start IDs + for &sid in self.table() { + let n = bytes::write_state_id::(sid, &mut dst); + dst = &mut dst[n..]; + } + Ok(nwrite) + } + + /// Returns the number of bytes the serialized form of this start ID table + /// will use. + fn write_to_len(&self) -> usize { + size_of::() // stride + + size_of::() // # patterns + + (self.table().len() * StateID::SIZE) + } + + /// Validates that every state ID in this start table is valid by checking + /// it against the given transition table (which must be for the same DFA). + /// + /// That is, every state ID can be used to correctly index a state. 
+ fn validate( + &self, + tt: &TransitionTable, + ) -> Result<(), DeserializeError> { + for &id in self.table() { + if !tt.is_valid(id) { + return Err(DeserializeError::generic( + "found invalid starting state ID", + )); + } + } + Ok(()) + } + + /// Converts this start list to a borrowed value. + fn as_ref(&self) -> StartTable<&'_ [u32]> { + StartTable { + table: self.table.as_ref(), + stride: self.stride, + patterns: self.patterns, + } + } + + /// Converts this start list to an owned value. + #[cfg(feature = "alloc")] + fn to_owned(&self) -> StartTable> { + StartTable { + table: self.table.as_ref().to_vec(), + stride: self.stride, + patterns: self.patterns, + } + } + + /// Return the start state for the given start index and pattern ID. If the + /// pattern ID is None, then the corresponding start state for the entire + /// DFA is returned. If the pattern ID is not None, then the corresponding + /// starting state for the given pattern is returned. If this start table + /// does not have individual starting states for each pattern, then this + /// panics. + fn start(&self, index: Start, pattern_id: Option) -> StateID { + let start_index = index.as_usize(); + let index = match pattern_id { + None => start_index, + Some(pid) => { + let pid = pid.as_usize(); + assert!(pid < self.patterns, "invalid pattern ID {:?}", pid); + self.stride + (self.stride * pid) + start_index + } + }; + self.table()[index] + } + + /// Returns an iterator over all start state IDs in this table. + /// + /// Each item is a triple of: start state ID, the start state type and the + /// pattern ID (if any). + fn iter(&self) -> StartStateIter<'_> { + StartStateIter { st: self.as_ref(), i: 0 } + } + + /// Returns the table as a slice of state IDs. + fn table(&self) -> &[StateID] { + let integers = self.table.as_ref(); + // SAFETY: This is safe because StateID is guaranteed to be + // representable as a u32. + unsafe { + core::slice::from_raw_parts( + integers.as_ptr() as *const StateID, + integers.len(), + ) + } + } + + /// Return the memory usage, in bytes, of this start list. + /// + /// This does not include the size of a `StartList` value itself. + fn memory_usage(&self) -> usize { + self.table().len() * StateID::SIZE + } +} + +#[cfg(feature = "alloc")] +impl> StartTable { + /// Set the start state for the given index and pattern. + /// + /// If the pattern ID or state ID are not valid, then this will panic. + fn set_start( + &mut self, + index: Start, + pattern_id: Option, + id: StateID, + ) { + let start_index = index.as_usize(); + let index = match pattern_id { + None => start_index, + Some(pid) => self + .stride + .checked_mul(pid.as_usize()) + .unwrap() + .checked_add(self.stride) + .unwrap() + .checked_add(start_index) + .unwrap(), + }; + self.table_mut()[index] = id; + } + + /// Returns the table as a mutable slice of state IDs. + fn table_mut(&mut self) -> &mut [StateID] { + let integers = self.table.as_mut(); + // SAFETY: This is safe because StateID is guaranteed to be + // representable as a u32. + unsafe { + core::slice::from_raw_parts_mut( + integers.as_mut_ptr() as *mut StateID, + integers.len(), + ) + } + } +} + +/// An iterator over start state IDs. +/// +/// This iterator yields a triple of start state ID, the start state type +/// and the pattern ID (if any). The pattern ID is None for start states +/// corresponding to the entire DFA and non-None for start states corresponding +/// to a specific pattern. The latter only occurs when the DFA is compiled with +/// start states for each pattern. 
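+///
+/// As an illustrative sketch (assuming a hypothetical stride of 4, i.e.,
+/// four starting configurations): items 0..4 are the start states for the
+/// entire DFA and carry a pattern ID of `None`, items 4..8 belong to
+/// pattern 0, items 8..12 to pattern 1, and so on, mirroring the layout of
+/// the start table itself.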
+pub(crate) struct StartStateIter<'a> { + st: StartTable<&'a [u32]>, + i: usize, +} + +impl<'a> Iterator for StartStateIter<'a> { + type Item = (StateID, Start, Option); + + fn next(&mut self) -> Option<(StateID, Start, Option)> { + let i = self.i; + let table = self.st.table(); + if i >= table.len() { + return None; + } + self.i += 1; + + // This unwrap is okay since the stride of the starting state table + // must always match the number of start state types. + let start_type = Start::from_usize(i % self.st.stride).unwrap(); + let pid = if i < self.st.stride { + None + } else { + Some( + PatternID::new((i - self.st.stride) / self.st.stride).unwrap(), + ) + }; + Some((table[i], start_type, pid)) + } +} + +/// This type represents that patterns that should be reported whenever a DFA +/// enters a match state. This structure exists to support DFAs that search for +/// matches for multiple regexes. +/// +/// This structure relies on the fact that all match states in a DFA occur +/// contiguously in the DFA's transition table. (See dfa/special.rs for a more +/// detailed breakdown of the representation.) Namely, when a match occurs, we +/// know its state ID. Since we know the start and end of the contiguous region +/// of match states, we can use that to compute the position at which the match +/// state occurs. That in turn is used as an offset into this structure. +#[derive(Clone, Debug)] +struct MatchStates { + /// Slices is a flattened sequence of pairs, where each pair points to a + /// sub-slice of pattern_ids. The first element of the pair is an offset + /// into pattern_ids and the second element of the pair is the number + /// of 32-bit pattern IDs starting at that position. That is, each pair + /// corresponds to a single DFA match state and its corresponding match + /// IDs. The number of pairs always corresponds to the number of distinct + /// DFA match states. + /// + /// In practice, T is either Vec or &[u32]. + slices: T, + /// A flattened sequence of pattern IDs for each DFA match state. The only + /// way to correctly read this sequence is indirectly via `slices`. + /// + /// In practice, T is either Vec or &[u32]. + pattern_ids: T, + /// The total number of unique patterns represented by these match states. + patterns: usize, +} + +impl<'a> MatchStates<&'a [u32]> { + unsafe fn from_bytes_unchecked( + mut slice: &'a [u8], + ) -> Result<(MatchStates<&'a [u32]>, usize), DeserializeError> { + let slice_start = slice.as_ptr() as usize; + + // Read the total number of match states. + let (count, nr) = + bytes::try_read_u32_as_usize(slice, "match state count")?; + slice = &slice[nr..]; + + // Read the slice start/length pairs. + let pair_count = bytes::mul(2, count, "match state offset pairs")?; + let slices_bytes_len = bytes::mul( + pair_count, + PatternID::SIZE, + "match state slice offset byte length", + )?; + bytes::check_slice_len(slice, slices_bytes_len, "match state slices")?; + bytes::check_alignment::(slice)?; + let slices_bytes = &slice[..slices_bytes_len]; + slice = &slice[slices_bytes_len..]; + // SAFETY: Since PatternID is always representable as a u32, all we + // need to do is ensure that we have the proper length and alignment. + // We've checked both above, so the cast below is safe. + // + // N.B. This is one of the few not-safe snippets in this function, so + // we mark it explicitly to call it out, even though it is technically + // superfluous. 
+ #[allow(unused_unsafe)] + let slices = unsafe { + core::slice::from_raw_parts( + slices_bytes.as_ptr() as *const u32, + pair_count, + ) + }; + + // Read the total number of unique pattern IDs (which is always 1 more + // than the maximum pattern ID in this automaton, since pattern IDs are + // handed out contiguously starting at 0). + let (patterns, nr) = + bytes::try_read_u32_as_usize(slice, "pattern count")?; + slice = &slice[nr..]; + + // Now read the pattern ID count. We don't need to store this + // explicitly, but we need it to know how many pattern IDs to read. + let (idcount, nr) = + bytes::try_read_u32_as_usize(slice, "pattern ID count")?; + slice = &slice[nr..]; + + // Read the actual pattern IDs. + let pattern_ids_len = + bytes::mul(idcount, PatternID::SIZE, "pattern ID byte length")?; + bytes::check_slice_len(slice, pattern_ids_len, "match pattern IDs")?; + bytes::check_alignment::(slice)?; + let pattern_ids_bytes = &slice[..pattern_ids_len]; + slice = &slice[pattern_ids_len..]; + // SAFETY: Since PatternID is always representable as a u32, all we + // need to do is ensure that we have the proper length and alignment. + // We've checked both above, so the cast below is safe. + // + // N.B. This is one of the few not-safe snippets in this function, so + // we mark it explicitly to call it out, even though it is technically + // superfluous. + #[allow(unused_unsafe)] + let pattern_ids = unsafe { + core::slice::from_raw_parts( + pattern_ids_bytes.as_ptr() as *const u32, + idcount, + ) + }; + + let ms = MatchStates { slices, pattern_ids, patterns }; + Ok((ms, slice.as_ptr() as usize - slice_start)) + } +} + +#[cfg(feature = "alloc")] +impl MatchStates> { + fn empty(pattern_count: usize) -> MatchStates> { + assert!(pattern_count <= PatternID::LIMIT); + MatchStates { + slices: vec![], + pattern_ids: vec![], + patterns: pattern_count, + } + } + + fn new( + matches: &BTreeMap>, + pattern_count: usize, + ) -> Result>, Error> { + let mut m = MatchStates::empty(pattern_count); + for (_, pids) in matches.iter() { + let start = PatternID::new(m.pattern_ids.len()) + .map_err(|_| Error::too_many_match_pattern_ids())?; + m.slices.push(start.as_u32()); + // This is always correct since the number of patterns in a single + // match state can never exceed maximum number of allowable + // patterns. Why? Because a pattern can only appear once in a + // particular match state, by construction. (And since our pattern + // ID limit is one less than u32::MAX, we're guaranteed that the + // length fits in a u32.) + m.slices.push(u32::try_from(pids.len()).unwrap()); + for &pid in pids { + m.pattern_ids.push(pid.as_u32()); + } + } + m.patterns = pattern_count; + Ok(m) + } + + fn new_with_map( + &self, + matches: &BTreeMap>, + ) -> Result>, Error> { + MatchStates::new(matches, self.patterns) + } +} + +impl> MatchStates { + /// Writes a serialized form of these match states to the buffer given. If + /// the buffer is too small, then an error is returned. To determine how + /// big the buffer must be, use `write_to_len`. + fn write_to( + &self, + mut dst: &mut [u8], + ) -> Result { + let nwrite = self.write_to_len(); + if dst.len() < nwrite { + return Err(SerializeError::buffer_too_small("match states")); + } + dst = &mut dst[..nwrite]; + + // write state ID count + // Unwrap is OK since number of states is guaranteed to fit in a u32. 
+ E::write_u32(u32::try_from(self.count()).unwrap(), dst); + dst = &mut dst[size_of::()..]; + + // write slice offset pairs + for &pid in self.slices() { + let n = bytes::write_pattern_id::(pid, &mut dst); + dst = &mut dst[n..]; + } + + // write unique pattern ID count + // Unwrap is OK since number of patterns is guaranteed to fit in a u32. + E::write_u32(u32::try_from(self.patterns).unwrap(), dst); + dst = &mut dst[size_of::()..]; + + // write pattern ID count + // Unwrap is OK since we check at construction (and deserialization) + // that the number of patterns is representable as a u32. + E::write_u32(u32::try_from(self.pattern_ids().len()).unwrap(), dst); + dst = &mut dst[size_of::()..]; + + // write pattern IDs + for &pid in self.pattern_ids() { + let n = bytes::write_pattern_id::(pid, &mut dst); + dst = &mut dst[n..]; + } + + Ok(nwrite) + } + + /// Returns the number of bytes the serialized form of this transition + /// table will use. + fn write_to_len(&self) -> usize { + size_of::() // match state count + + (self.slices().len() * PatternID::SIZE) + + size_of::() // unique pattern ID count + + size_of::() // pattern ID count + + (self.pattern_ids().len() * PatternID::SIZE) + } + + /// Valides that the match state info is itself internally consistent and + /// consistent with the recorded match state region in the given DFA. + fn validate(&self, dfa: &DFA) -> Result<(), DeserializeError> { + if self.count() != dfa.special.match_len(dfa.stride()) { + return Err(DeserializeError::generic( + "match state count mismatch", + )); + } + for si in 0..self.count() { + let start = self.slices()[si * 2].as_usize(); + let len = self.slices()[si * 2 + 1].as_usize(); + if start >= self.pattern_ids().len() { + return Err(DeserializeError::generic( + "invalid pattern ID start offset", + )); + } + if start + len > self.pattern_ids().len() { + return Err(DeserializeError::generic( + "invalid pattern ID length", + )); + } + for mi in 0..len { + let pid = self.pattern_id(si, mi); + if pid.as_usize() >= self.patterns { + return Err(DeserializeError::generic( + "invalid pattern ID", + )); + } + } + } + Ok(()) + } + + /// Converts these match states back into their map form. This is useful + /// when shuffling states, as the normal MatchStates representation is not + /// amenable to easy state swapping. But with this map, to swap id1 and + /// id2, all you need to do is: + /// + /// if let Some(pids) = map.remove(&id1) { + /// map.insert(id2, pids); + /// } + /// + /// Once shuffling is done, use MatchStates::new to convert back. + #[cfg(feature = "alloc")] + fn to_map(&self, dfa: &DFA) -> BTreeMap> { + let mut map = BTreeMap::new(); + for i in 0..self.count() { + let mut pids = vec![]; + for j in 0..self.pattern_len(i) { + pids.push(self.pattern_id(i, j)); + } + map.insert(self.match_state_id(dfa, i), pids); + } + map + } + + /// Converts these match states to a borrowed value. + fn as_ref(&self) -> MatchStates<&'_ [u32]> { + MatchStates { + slices: self.slices.as_ref(), + pattern_ids: self.pattern_ids.as_ref(), + patterns: self.patterns, + } + } + + /// Converts these match states to an owned value. + #[cfg(feature = "alloc")] + fn to_owned(&self) -> MatchStates> { + MatchStates { + slices: self.slices.as_ref().to_vec(), + pattern_ids: self.pattern_ids.as_ref().to_vec(), + patterns: self.patterns, + } + } + + /// Returns the match state ID given the match state index. (Where the + /// first match state corresponds to index 0.) + /// + /// This panics if there is no match state at the given index. 
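+    ///
+    /// As a rough worked example with hypothetical values: if the DFA's
+    /// stride is 16 (so `stride2 == 4`) and `special.min_match` is 48,
+    /// then match state index 2 maps to state ID `48 + (2 << 4) = 80`.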
+ fn match_state_id(&self, dfa: &DFA, index: usize) -> StateID { + assert!(dfa.special.matches(), "no match states to index"); + // This is one of the places where we rely on the fact that match + // states are contiguous in the transition table. Namely, that the + // first match state ID always corresponds to dfa.special.min_start. + // From there, since we know the stride, we can compute the ID of any + // match state given its index. + let stride2 = u32::try_from(dfa.stride2()).unwrap(); + let offset = index.checked_shl(stride2).unwrap(); + let id = dfa.special.min_match.as_usize().checked_add(offset).unwrap(); + let sid = StateID::new(id).unwrap(); + assert!(dfa.is_match_state(sid)); + sid + } + + /// Returns the pattern ID at the given match index for the given match + /// state. + /// + /// The match state index is the state index minus the state index of the + /// first match state in the DFA. + /// + /// The match index is the index of the pattern ID for the given state. + /// The index must be less than `self.pattern_len(state_index)`. + fn pattern_id(&self, state_index: usize, match_index: usize) -> PatternID { + self.pattern_id_slice(state_index)[match_index] + } + + /// Returns the number of patterns in the given match state. + /// + /// The match state index is the state index minus the state index of the + /// first match state in the DFA. + fn pattern_len(&self, state_index: usize) -> usize { + self.slices()[state_index * 2 + 1].as_usize() + } + + /// Returns all of the pattern IDs for the given match state index. + /// + /// The match state index is the state index minus the state index of the + /// first match state in the DFA. + fn pattern_id_slice(&self, state_index: usize) -> &[PatternID] { + let start = self.slices()[state_index * 2].as_usize(); + let len = self.pattern_len(state_index); + &self.pattern_ids()[start..start + len] + } + + /// Returns the pattern ID offset slice of u32 as a slice of PatternID. + fn slices(&self) -> &[PatternID] { + let integers = self.slices.as_ref(); + // SAFETY: This is safe because PatternID is guaranteed to be + // representable as a u32. + unsafe { + core::slice::from_raw_parts( + integers.as_ptr() as *const PatternID, + integers.len(), + ) + } + } + + /// Returns the total number of match states. + fn count(&self) -> usize { + assert_eq!(0, self.slices().len() % 2); + self.slices().len() / 2 + } + + /// Returns the pattern ID slice of u32 as a slice of PatternID. + fn pattern_ids(&self) -> &[PatternID] { + let integers = self.pattern_ids.as_ref(); + // SAFETY: This is safe because PatternID is guaranteed to be + // representable as a u32. + unsafe { + core::slice::from_raw_parts( + integers.as_ptr() as *const PatternID, + integers.len(), + ) + } + } + + /// Return the memory usage, in bytes, of these match pairs. + fn memory_usage(&self) -> usize { + (self.slices().len() + self.pattern_ids().len()) * PatternID::SIZE + } +} + +/// An iterator over all states in a DFA. +/// +/// This iterator yields a tuple for each state. The first element of the +/// tuple corresponds to a state's identifier, and the second element +/// corresponds to the state itself (comprised of its transitions). +/// +/// `'a` corresponding to the lifetime of original DFA, `T` corresponds to +/// the type of the transition table itself. 
+pub(crate) struct StateIter<'a, T> { + tt: &'a TransitionTable, + it: iter::Enumerate>, +} + +impl<'a, T: AsRef<[u32]>> Iterator for StateIter<'a, T> { + type Item = State<'a>; + + fn next(&mut self) -> Option> { + self.it.next().map(|(index, _)| { + let id = self.tt.from_index(index); + self.tt.state(id) + }) + } +} + +/// An immutable representation of a single DFA state. +/// +/// `'a` correspondings to the lifetime of a DFA's transition table. +pub(crate) struct State<'a> { + id: StateID, + stride2: usize, + transitions: &'a [StateID], +} + +impl<'a> State<'a> { + /// Return an iterator over all transitions in this state. This yields + /// a number of transitions equivalent to the alphabet length of the + /// corresponding DFA. + /// + /// Each transition is represented by a tuple. The first element is + /// the input byte for that transition and the second element is the + /// transitions itself. + pub(crate) fn transitions(&self) -> StateTransitionIter<'_> { + StateTransitionIter { + len: self.transitions.len(), + it: self.transitions.iter().enumerate(), + } + } + + /// Return an iterator over a sparse representation of the transitions in + /// this state. Only non-dead transitions are returned. + /// + /// The "sparse" representation in this case corresponds to a sequence of + /// triples. The first two elements of the triple comprise an inclusive + /// byte range while the last element corresponds to the transition taken + /// for all bytes in the range. + /// + /// This is somewhat more condensed than the classical sparse + /// representation (where you have an element for every non-dead + /// transition), but in practice, checking if a byte is in a range is very + /// cheap and using ranges tends to conserve quite a bit more space. + pub(crate) fn sparse_transitions(&self) -> StateSparseTransitionIter<'_> { + StateSparseTransitionIter { dense: self.transitions(), cur: None } + } + + /// Returns the identifier for this state. + pub(crate) fn id(&self) -> StateID { + self.id + } + + /// Analyzes this state to determine whether it can be accelerated. If so, + /// it returns an accelerator that contains at least one byte. + #[cfg(feature = "alloc")] + fn accelerate(&self, classes: &ByteClasses) -> Option { + // We just try to add bytes to our accelerator. Once adding fails + // (because we've added too many bytes), then give up. + let mut accel = Accel::new(); + for (class, id) in self.transitions() { + if id == self.id() { + continue; + } + for unit in classes.elements(class) { + if let Some(byte) = unit.as_u8() { + if !accel.add(byte) { + return None; + } + } + } + } + if accel.is_empty() { + None + } else { + Some(accel) + } + } +} + +impl<'a> fmt::Debug for State<'a> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + for (i, (start, end, id)) in self.sparse_transitions().enumerate() { + let index = if f.alternate() { + id.as_usize() + } else { + id.as_usize() >> self.stride2 + }; + if i > 0 { + write!(f, ", ")?; + } + if start == end { + write!(f, "{:?} => {:?}", start, index)?; + } else { + write!(f, "{:?}-{:?} => {:?}", start, end, index)?; + } + } + Ok(()) + } +} + +/// A mutable representation of a single DFA state. +/// +/// `'a` correspondings to the lifetime of a DFA's transition table. +#[cfg(feature = "alloc")] +pub(crate) struct StateMut<'a> { + id: StateID, + stride2: usize, + transitions: &'a mut [StateID], +} + +#[cfg(feature = "alloc")] +impl<'a> StateMut<'a> { + /// Return an iterator over all transitions in this state. 
This yields + /// a number of transitions equivalent to the alphabet length of the + /// corresponding DFA. + /// + /// Each transition is represented by a tuple. The first element is the + /// input byte for that transition and the second element is a mutable + /// reference to the transition itself. + pub(crate) fn iter_mut(&mut self) -> StateTransitionIterMut<'_> { + StateTransitionIterMut { + len: self.transitions.len(), + it: self.transitions.iter_mut().enumerate(), + } + } +} + +#[cfg(feature = "alloc")] +impl<'a> fmt::Debug for StateMut<'a> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt::Debug::fmt( + &State { + id: self.id, + stride2: self.stride2, + transitions: self.transitions, + }, + f, + ) + } +} + +/// An iterator over all transitions in a single DFA state. This yields +/// a number of transitions equivalent to the alphabet length of the +/// corresponding DFA. +/// +/// Each transition is represented by a tuple. The first element is the input +/// byte for that transition and the second element is the transition itself. +#[derive(Debug)] +pub(crate) struct StateTransitionIter<'a> { + len: usize, + it: iter::Enumerate>, +} + +impl<'a> Iterator for StateTransitionIter<'a> { + type Item = (alphabet::Unit, StateID); + + fn next(&mut self) -> Option<(alphabet::Unit, StateID)> { + self.it.next().map(|(i, &id)| { + let unit = if i + 1 == self.len { + alphabet::Unit::eoi(i) + } else { + let b = u8::try_from(i) + .expect("raw byte alphabet is never exceeded"); + alphabet::Unit::u8(b) + }; + (unit, id) + }) + } +} + +/// A mutable iterator over all transitions in a DFA state. +/// +/// Each transition is represented by a tuple. The first element is the +/// input byte for that transition and the second element is a mutable +/// reference to the transition itself. +#[cfg(feature = "alloc")] +#[derive(Debug)] +pub(crate) struct StateTransitionIterMut<'a> { + len: usize, + it: iter::Enumerate>, +} + +#[cfg(feature = "alloc")] +impl<'a> Iterator for StateTransitionIterMut<'a> { + type Item = (alphabet::Unit, &'a mut StateID); + + fn next(&mut self) -> Option<(alphabet::Unit, &'a mut StateID)> { + self.it.next().map(|(i, id)| { + let unit = if i + 1 == self.len { + alphabet::Unit::eoi(i) + } else { + let b = u8::try_from(i) + .expect("raw byte alphabet is never exceeded"); + alphabet::Unit::u8(b) + }; + (unit, id) + }) + } +} + +/// An iterator over all non-DEAD transitions in a single DFA state using a +/// sparse representation. +/// +/// Each transition is represented by a triple. The first two elements of the +/// triple comprise an inclusive byte range while the last element corresponds +/// to the transition taken for all bytes in the range. +/// +/// As a convenience, this always returns `alphabet::Unit` values of the same +/// type. That is, you'll never get a (byte, EOI) or a (EOI, byte). Only (byte, +/// byte) and (EOI, EOI) values are yielded. 
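+///
+/// As an illustrative (hypothetical) example: a dense run of transitions
+/// `(a, 3), (b, 3), (c, 3), (d, 5)` is yielded as the two triples
+/// `(a, c, 3)` and `(d, d, 5)`, while any transitions to the DEAD state
+/// are omitted entirely.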
+#[derive(Debug)] +pub(crate) struct StateSparseTransitionIter<'a> { + dense: StateTransitionIter<'a>, + cur: Option<(alphabet::Unit, alphabet::Unit, StateID)>, +} + +impl<'a> Iterator for StateSparseTransitionIter<'a> { + type Item = (alphabet::Unit, alphabet::Unit, StateID); + + fn next(&mut self) -> Option<(alphabet::Unit, alphabet::Unit, StateID)> { + while let Some((unit, next)) = self.dense.next() { + let (prev_start, prev_end, prev_next) = match self.cur { + Some(t) => t, + None => { + self.cur = Some((unit, unit, next)); + continue; + } + }; + if prev_next == next && !unit.is_eoi() { + self.cur = Some((prev_start, unit, prev_next)); + } else { + self.cur = Some((unit, unit, next)); + if prev_next != DEAD { + return Some((prev_start, prev_end, prev_next)); + } + } + } + if let Some((start, end, next)) = self.cur.take() { + if next != DEAD { + return Some((start, end, next)); + } + } + None + } +} + +/// An iterator over pattern IDs for a single match state. +#[derive(Debug)] +pub(crate) struct PatternIDIter<'a>(slice::Iter<'a, PatternID>); + +impl<'a> Iterator for PatternIDIter<'a> { + type Item = PatternID; + + fn next(&mut self) -> Option { + self.0.next().copied() + } +} + +/// Remapper is an abstraction the manages the remapping of state IDs in a +/// dense DFA. This is useful when one wants to shuffle states into different +/// positions in the DFA. +/// +/// One of the key complexities this manages is the ability to correctly move +/// one state multiple times. +/// +/// Once shuffling is complete, `remap` should be called, which will rewrite +/// all pertinent transitions to updated state IDs. +#[cfg(feature = "alloc")] +#[derive(Debug)] +struct Remapper { + /// A map from the index of a state to its pre-multiplied identifier. + /// + /// When a state is swapped with another, then their corresponding + /// locations in this map are also swapped. Thus, its new position will + /// still point to its old pre-multiplied StateID. + /// + /// While there is a bit more to it, this then allows us to rewrite the + /// state IDs in a DFA's transition table in a single pass. This is done + /// by iterating over every ID in this map, then iterating over each + /// transition for the state at that ID and re-mapping the transition from + /// `old_id` to `map[dfa.to_index(old_id)]`. That is, we find the position + /// in this map where `old_id` *started*, and set it to where it ended up + /// after all swaps have been completed. + map: Vec, +} + +#[cfg(feature = "alloc")] +impl Remapper { + fn from_dfa(dfa: &OwnedDFA) -> Remapper { + Remapper { + map: (0..dfa.state_count()).map(|i| dfa.from_index(i)).collect(), + } + } + + fn swap(&mut self, dfa: &mut OwnedDFA, id1: StateID, id2: StateID) { + dfa.swap_states(id1, id2); + self.map.swap(dfa.to_index(id1), dfa.to_index(id2)); + } + + fn remap(mut self, dfa: &mut OwnedDFA) { + // Update the map to account for states that have been swapped + // multiple times. For example, if (A, C) and (C, G) are swapped, then + // transitions previously pointing to A should now point to G. But if + // we don't update our map, they will erroneously be set to C. All we + // do is follow the swaps in our map until we see our original state + // ID. 
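+        //
+        // As a concrete (hypothetical) trace: with states A, C and G and
+        // swaps (A, C) followed by (C, G), the state that started at A now
+        // lives in G's slot. Starting at A's entry in the old map and
+        // chasing entries until A itself comes back around leaves us at G,
+        // which is exactly what gets recorded for A below.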
+ let oldmap = self.map.clone(); + for i in 0..dfa.state_count() { + let cur_id = dfa.from_index(i); + let mut new = oldmap[i]; + if cur_id == new { + continue; + } + loop { + let id = oldmap[dfa.to_index(new)]; + if cur_id == id { + self.map[i] = new; + break; + } + new = id; + } + } + + // To work around the borrow checker for converting state IDs to + // indices. We cannot borrow self while mutably iterating over a + // state's transitions. Otherwise, we'd just use dfa.to_index(..). + let stride2 = dfa.stride2(); + let to_index = |id: StateID| -> usize { id.as_usize() >> stride2 }; + + // Now that we've finished shuffling, we need to remap all of our + // transitions. We don't need to handle re-mapping accelerated states + // since `accels` is only populated after shuffling. + for &id in self.map.iter() { + for (_, next_id) in dfa.state_mut(id).iter_mut() { + *next_id = self.map[to_index(*next_id)]; + } + } + for start_id in dfa.st.table_mut().iter_mut() { + *start_id = self.map[to_index(*start_id)]; + } + } +} + +#[cfg(all(test, feature = "alloc"))] +mod tests { + use super::*; + + #[test] + fn errors_with_unicode_word_boundary() { + let pattern = r"\b"; + assert!(Builder::new().build(pattern).is_err()); + } + + #[test] + fn roundtrip_never_match() { + let dfa = DFA::never_match().unwrap(); + let (buf, _) = dfa.to_bytes_native_endian(); + let dfa: DFA<&[u32]> = DFA::from_bytes(&buf).unwrap().0; + + assert_eq!(None, dfa.find_leftmost_fwd(b"foo12345").unwrap()); + } + + #[test] + fn roundtrip_always_match() { + use crate::HalfMatch; + + let dfa = DFA::always_match().unwrap(); + let (buf, _) = dfa.to_bytes_native_endian(); + let dfa: DFA<&[u32]> = DFA::from_bytes(&buf).unwrap().0; + + assert_eq!( + Some(HalfMatch::must(0, 0)), + dfa.find_leftmost_fwd(b"foo12345").unwrap() + ); + } +} diff --git a/vendor/regex-automata/src/dfa/determinize.rs b/vendor/regex-automata/src/dfa/determinize.rs new file mode 100644 index 000000000..61603481b --- /dev/null +++ b/vendor/regex-automata/src/dfa/determinize.rs @@ -0,0 +1,547 @@ +use alloc::{ + collections::BTreeMap, + vec::{self, Vec}, +}; + +use crate::{ + dfa::{dense, Error, DEAD}, + nfa::thompson, + util::{ + self, + alphabet::{self, ByteSet}, + determinize::{State, StateBuilderEmpty, StateBuilderNFA}, + id::{PatternID, StateID}, + matchtypes::MatchKind, + sparse_set::{SparseSet, SparseSets}, + start::Start, + }, +}; + +/// A builder for configuring and running a DFA determinizer. +#[derive(Clone, Debug)] +pub(crate) struct Config { + anchored: bool, + match_kind: MatchKind, + quit: ByteSet, + dfa_size_limit: Option, + determinize_size_limit: Option, +} + +impl Config { + /// Create a new default config for a determinizer. The determinizer may be + /// configured before calling `run`. + pub fn new() -> Config { + Config { + anchored: false, + match_kind: MatchKind::LeftmostFirst, + quit: ByteSet::empty(), + dfa_size_limit: None, + determinize_size_limit: None, + } + } + + /// Run determinization on the given NFA and write the resulting DFA into + /// the one given. The DFA given should be initialized but otherwise empty. + /// "Initialized" means that it is setup to handle the NFA's byte classes, + /// number of patterns and whether to build start states for each pattern. 
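+    ///
+    /// A rough internal usage sketch (assuming an NFA and an initialized
+    /// DFA are already in hand) looks like
+    /// `Config::new().anchored(true).run(&nfa, &mut dfa)?`.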
+ pub fn run( + &self, + nfa: &thompson::NFA, + dfa: &mut dense::OwnedDFA, + ) -> Result<(), Error> { + let dead = State::dead(); + let quit = State::dead(); + let mut cache = StateMap::default(); + // We only insert the dead state here since its representation is + // identical to the quit state. And we never want anything pointing + // to the quit state other than specific transitions derived from the + // determinizer's configured "quit" bytes. + // + // We do put the quit state into 'builder_states' below. This ensures + // that a proper DFA state ID is allocated for it, and that no other + // DFA state uses the "location after the DEAD state." That is, it + // is assumed that the quit state is always the state immediately + // following the DEAD state. + cache.insert(dead.clone(), DEAD); + + let runner = Runner { + config: self.clone(), + nfa, + dfa, + builder_states: alloc::vec![dead, quit], + cache, + memory_usage_state: 0, + sparses: SparseSets::new(nfa.len()), + stack: alloc::vec![], + scratch_state_builder: StateBuilderEmpty::new(), + }; + runner.run() + } + + /// Whether to build an anchored DFA or not. When disabled (the default), + /// the unanchored prefix from the NFA is used to start the DFA. Otherwise, + /// the anchored start state of the NFA is used to start the DFA. + pub fn anchored(&mut self, yes: bool) -> &mut Config { + self.anchored = yes; + self + } + + /// The match semantics to use for determinization. + /// + /// MatchKind::All corresponds to the standard textbook construction. + /// All possible match states are represented in the DFA. + /// MatchKind::LeftmostFirst permits greediness and otherwise tries to + /// simulate the match semantics of backtracking regex engines. Namely, + /// only a subset of match states are built, and dead states are used to + /// stop searches with an unanchored prefix. + /// + /// The default is MatchKind::LeftmostFirst. + pub fn match_kind(&mut self, kind: MatchKind) -> &mut Config { + self.match_kind = kind; + self + } + + /// The set of bytes to use that will cause the DFA to enter a quit state, + /// stop searching and return an error. By default, this is empty. + pub fn quit(&mut self, set: ByteSet) -> &mut Config { + self.quit = set; + self + } + + /// The limit, in bytes of the heap, that the DFA is permitted to use. This + /// does not include the auxiliary heap storage used by determinization. + pub fn dfa_size_limit(&mut self, bytes: Option) -> &mut Config { + self.dfa_size_limit = bytes; + self + } + + /// The limit, in bytes of the heap, that determinization itself is allowed + /// to use. This does not include the size of the DFA being built. + pub fn determinize_size_limit( + &mut self, + bytes: Option, + ) -> &mut Config { + self.determinize_size_limit = bytes; + self + } +} + +/// The actual implementation of determinization that converts an NFA to a DFA +/// through powerset construction. +/// +/// This determinizer roughly follows the typical powerset construction, where +/// each DFA state is comprised of one or more NFA states. In the worst case, +/// there is one DFA state for every possible combination of NFA states. In +/// practice, this only happens in certain conditions, typically when there are +/// bounded repetitions. +/// +/// The main differences between this implementation and typical deteminization +/// are that this implementation delays matches by one state and hackily makes +/// look-around work. Comments below attempt to explain this. 
+/// +/// The lifetime variable `'a` refers to the lifetime of the NFA or DFA, +/// whichever is shorter. +#[derive(Debug)] +struct Runner<'a> { + /// The configuration used to initialize determinization. + config: Config, + /// The NFA we're converting into a DFA. + nfa: &'a thompson::NFA, + /// The DFA we're building. + dfa: &'a mut dense::OwnedDFA, + /// Each DFA state being built is defined as an *ordered* set of NFA + /// states, along with some meta facts about the ordered set of NFA states. + /// + /// This is never empty. The first state is always a dummy state such that + /// a state id == 0 corresponds to a dead state. The second state is always + /// the quit state. + /// + /// Why do we have states in both a `Vec` and in a cache map below? + /// Well, they serve two different roles based on access patterns. + /// `builder_states` is the canonical home of each state, and provides + /// constant random access by a DFA state's ID. The cache map below, on + /// the other hand, provides a quick way of searching for identical DFA + /// states by using the DFA state as a key in the map. Of course, we use + /// reference counting to avoid actually duplicating the state's data + /// itself. (Although this has never been benchmarked.) Note that the cache + /// map does not give us full minimization; it just lets us avoid some very + /// obvious redundant states. + /// + /// Note that the index into this Vec isn't quite the DFA's state ID. + /// Rather, it's just an index. To get the state ID, you have to multiply + /// it by the DFA's stride. That's done by self.dfa.from_index. And the + /// inverse is self.dfa.to_index. + /// + /// Moreover, DFA states don't usually retain the IDs assigned to them + /// by their position in this Vec. After determinization completes, + /// states are shuffled around to support other optimizations. See the + /// sibling 'special' module for more details on that. (The reason for + /// mentioning this is that if you print out the DFA for debugging during + /// determinization, and then print out the final DFA after it is fully + /// built, then the state IDs likely won't match up.) + builder_states: Vec, + /// A cache of DFA states that already exist and can be easily looked up + /// via ordered sets of NFA states. + /// + /// See `builder_states` docs for why we store states in two different + /// ways. + cache: StateMap, + /// The memory usage, in bytes, used by builder_states and cache. We track + /// this as new states are added since states use a variable amount of + /// heap. Tracking this as we add states makes it possible to compute the + /// total amount of memory used by the determinizer in constant time. + memory_usage_state: usize, + /// A pair of sparse sets for tracking ordered sets of NFA state IDs. + /// These are reused throughout determinization. A bounded sparse set + /// gives us constant time insertion, membership testing and clearing. + sparses: SparseSets, + /// Scratch space for a stack of NFA states to visit, for depth first + /// visiting without recursion. + stack: Vec, + /// Scratch space for storing an ordered sequence of NFA states, for + /// amortizing allocation. This is principally useful for when we avoid + /// adding a new DFA state since it already exists. In order to detect this + /// case though, we still need an ordered set of NFA state IDs. So we use + /// this space to stage that ordered set before we know whether we need to + /// create a new DFA state or not. 
+ scratch_state_builder: StateBuilderEmpty, +} + +/// A map from states to state identifiers. When using std, we use a standard +/// hashmap, since it's a bit faster for this use case. (Other maps, like +/// one's based on FNV, have not yet been benchmarked.) +/// +/// The main purpose of this map is to reuse states where possible. This won't +/// fully minimize the DFA, but it works well in a lot of cases. +#[cfg(feature = "std")] +type StateMap = std::collections::HashMap; +#[cfg(not(feature = "std"))] +type StateMap = BTreeMap; + +impl<'a> Runner<'a> { + /// Build the DFA. If there was a problem constructing the DFA (e.g., if + /// the chosen state identifier representation is too small), then an error + /// is returned. + fn run(mut self) -> Result<(), Error> { + if self.nfa.has_word_boundary_unicode() + && !self.config.quit.contains_range(0x80, 0xFF) + { + return Err(Error::unsupported_dfa_word_boundary_unicode()); + } + + // A sequence of "representative" bytes drawn from each equivalence + // class. These representative bytes are fed to the NFA to compute + // state transitions. This allows us to avoid re-computing state + // transitions for bytes that are guaranteed to produce identical + // results. + let representatives: Vec = + self.dfa.byte_classes().representatives().collect(); + // The set of all DFA state IDs that still need to have their + // transitions set. We start by seeding this with all starting states. + let mut uncompiled = alloc::vec![]; + self.add_all_starts(&mut uncompiled)?; + while let Some(dfa_id) = uncompiled.pop() { + for &unit in &representatives { + if unit.as_u8().map_or(false, |b| self.config.quit.contains(b)) + { + continue; + } + // In many cases, the state we transition to has already been + // computed. 'cached_state' will do the minimal amount of work + // to check this, and if it exists, immediately return an + // already existing state ID. + let (next_dfa_id, is_new) = self.cached_state(dfa_id, unit)?; + self.dfa.set_transition(dfa_id, unit, next_dfa_id); + // If the state ID we got back is newly created, then we need + // to compile it, so add it to our uncompiled frontier. + if is_new { + uncompiled.push(next_dfa_id); + } + } + } + trace!( + "determinization complete, memory usage: {}, dense DFA size: {}", + self.memory_usage(), + self.dfa.memory_usage(), + ); + + // A map from DFA state ID to one or more NFA match IDs. Each NFA match + // ID corresponds to a distinct regex pattern that matches in the state + // corresponding to the key. + let mut matches: BTreeMap> = BTreeMap::new(); + self.cache.clear(); + #[allow(unused_variables)] + let mut total_pat_count = 0; + for (i, state) in self.builder_states.into_iter().enumerate() { + if let Some(pat_ids) = state.match_pattern_ids() { + let id = self.dfa.from_index(i); + total_pat_count += pat_ids.len(); + matches.insert(id, pat_ids); + } + } + log! { + use core::mem::size_of; + let per_elem = size_of::() + size_of::>(); + let pats = total_pat_count * size_of::(); + let mem = (matches.len() * per_elem) + pats; + log::trace!("matches map built, memory usage: {}", mem); + } + // At this point, we shuffle the "special" states in the final DFA. + // This permits a DFA's match loop to detect a match condition (among + // other things) by merely inspecting the current state's identifier, + // and avoids the need for any additional auxiliary storage. + self.dfa.shuffle(matches)?; + Ok(()) + } + + /// Return the identifier for the next DFA state given an existing DFA + /// state and an input byte. 
If the next DFA state already exists, then + /// return its identifier from the cache. Otherwise, build the state, cache + /// it and return its identifier. + /// + /// This routine returns a boolean indicating whether a new state was + /// built. If a new state is built, then the caller needs to add it to its + /// frontier of uncompiled DFA states to compute transitions for. + fn cached_state( + &mut self, + dfa_id: StateID, + unit: alphabet::Unit, + ) -> Result<(StateID, bool), Error> { + // Compute the set of all reachable NFA states, including epsilons. + let empty_builder = self.get_state_builder(); + let builder = util::determinize::next( + self.nfa, + self.config.match_kind, + &mut self.sparses, + &mut self.stack, + &self.builder_states[self.dfa.to_index(dfa_id)], + unit, + empty_builder, + ); + self.maybe_add_state(builder) + } + + /// Compute the set of DFA start states and add their identifiers in + /// 'dfa_state_ids' (no duplicates are added). + fn add_all_starts( + &mut self, + dfa_state_ids: &mut Vec, + ) -> Result<(), Error> { + // Always add the (possibly unanchored) start states for matching any + // of the patterns in this DFA. + self.add_start_group(None, dfa_state_ids)?; + // We only need to compute anchored start states for each pattern if it + // was requested to do so. + if self.dfa.has_starts_for_each_pattern() { + for pid in PatternID::iter(self.dfa.pattern_count()) { + self.add_start_group(Some(pid), dfa_state_ids)?; + } + } + Ok(()) + } + + /// Add a group of start states for the given match pattern ID. Any new + /// DFA states added are pushed on to 'dfa_state_ids'. (No duplicates are + /// pushed.) + /// + /// When pattern_id is None, then this will compile a group of unanchored + /// start states (if the DFA is unanchored). When the pattern_id is + /// present, then this will compile a group of anchored start states that + /// only match the given pattern. + fn add_start_group( + &mut self, + pattern_id: Option, + dfa_state_ids: &mut Vec, + ) -> Result<(), Error> { + let nfa_start = match pattern_id { + Some(pid) => self.nfa.start_pattern(pid), + None if self.config.anchored => self.nfa.start_anchored(), + None => self.nfa.start_unanchored(), + }; + + // When compiling start states, we're careful not to build additional + // states that aren't necessary. For example, if the NFA has no word + // boundary assertion, then there's no reason to have distinct start + // states for 'NonWordByte' and 'WordByte' starting configurations. + // Instead, the 'WordByte' starting configuration can just point + // directly to the start state for the 'NonWordByte' config. 
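+        //
+        // For example (hypothetically), a pattern like '(?-u)abc' has no
+        // word boundary and no anchors, so all four starting
+        // configurations below end up sharing the single start state
+        // computed for Start::NonWordByte.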
+ + let (id, is_new) = + self.add_one_start(nfa_start, Start::NonWordByte)?; + self.dfa.set_start_state(Start::NonWordByte, pattern_id, id); + if is_new { + dfa_state_ids.push(id); + } + + if !self.nfa.has_word_boundary() { + self.dfa.set_start_state(Start::WordByte, pattern_id, id); + } else { + let (id, is_new) = + self.add_one_start(nfa_start, Start::WordByte)?; + self.dfa.set_start_state(Start::WordByte, pattern_id, id); + if is_new { + dfa_state_ids.push(id); + } + } + if !self.nfa.has_any_anchor() { + self.dfa.set_start_state(Start::Text, pattern_id, id); + self.dfa.set_start_state(Start::Line, pattern_id, id); + } else { + let (id, is_new) = self.add_one_start(nfa_start, Start::Text)?; + self.dfa.set_start_state(Start::Text, pattern_id, id); + if is_new { + dfa_state_ids.push(id); + } + + let (id, is_new) = self.add_one_start(nfa_start, Start::Line)?; + self.dfa.set_start_state(Start::Line, pattern_id, id); + if is_new { + dfa_state_ids.push(id); + } + } + + Ok(()) + } + + /// Add a new DFA start state corresponding to the given starting NFA + /// state, and the starting search configuration. (The starting search + /// configuration essentially tells us which look-behind assertions are + /// true for this particular state.) + /// + /// The boolean returned indicates whether the state ID returned is a newly + /// created state, or a previously cached state. + fn add_one_start( + &mut self, + nfa_start: StateID, + start: Start, + ) -> Result<(StateID, bool), Error> { + // Compute the look-behind assertions that are true in this starting + // configuration, and the determine the epsilon closure. While + // computing the epsilon closure, we only follow condiional epsilon + // transitions that satisfy the look-behind assertions in 'facts'. + let mut builder_matches = self.get_state_builder().into_matches(); + util::determinize::set_lookbehind_from_start( + &start, + &mut builder_matches, + ); + self.sparses.set1.clear(); + util::determinize::epsilon_closure( + self.nfa, + nfa_start, + *builder_matches.look_have(), + &mut self.stack, + &mut self.sparses.set1, + ); + let mut builder = builder_matches.into_nfa(); + util::determinize::add_nfa_states( + &self.nfa, + &self.sparses.set1, + &mut builder, + ); + self.maybe_add_state(builder) + } + + /// Adds the given state to the DFA being built depending on whether it + /// already exists in this determinizer's cache. + /// + /// If it does exist, then the memory used by 'state' is put back into the + /// determinizer and the previously created state's ID is returned. (Along + /// with 'false', indicating that no new state was added.) + /// + /// If it does not exist, then the state is added to the DFA being built + /// and a fresh ID is allocated (if ID allocation fails, then an error is + /// returned) and returned. (Along with 'true', indicating that a new state + /// was added.) + fn maybe_add_state( + &mut self, + builder: StateBuilderNFA, + ) -> Result<(StateID, bool), Error> { + if let Some(&cached_id) = self.cache.get(builder.as_bytes()) { + // Since we have a cached state, put the constructed state's + // memory back into our scratch space, so that it can be reused. + self.put_state_builder(builder); + return Ok((cached_id, false)); + } + self.add_state(builder).map(|sid| (sid, true)) + } + + /// Add the given state to the DFA and make it available in the cache. + /// + /// The state initially has no transitions. 
That is, it transitions to the + /// dead state for all possible inputs, and transitions to the quit state + /// for all quit bytes. + /// + /// If adding the state would exceed the maximum value for StateID, then an + /// error is returned. + fn add_state( + &mut self, + builder: StateBuilderNFA, + ) -> Result { + let id = self.dfa.add_empty_state()?; + if !self.config.quit.is_empty() { + for b in self.config.quit.iter() { + self.dfa.set_transition( + id, + alphabet::Unit::u8(b), + self.dfa.quit_id(), + ); + } + } + let state = builder.to_state(); + // States use reference counting internally, so we only need to count + // their memroy usage once. + self.memory_usage_state += state.memory_usage(); + self.builder_states.push(state.clone()); + self.cache.insert(state, id); + self.put_state_builder(builder); + if let Some(limit) = self.config.dfa_size_limit { + if self.dfa.memory_usage() > limit { + return Err(Error::dfa_exceeded_size_limit(limit)); + } + } + if let Some(limit) = self.config.determinize_size_limit { + if self.memory_usage() > limit { + return Err(Error::determinize_exceeded_size_limit(limit)); + } + } + Ok(id) + } + + /// Returns a state builder from this determinizer that might have existing + /// capacity. This helps avoid allocs in cases where a state is built that + /// turns out to already be cached. + /// + /// Callers must put the state builder back with 'put_state_builder', + /// otherwise the allocation reuse won't work. + fn get_state_builder(&mut self) -> StateBuilderEmpty { + core::mem::replace( + &mut self.scratch_state_builder, + StateBuilderEmpty::new(), + ) + } + + /// Puts the given state builder back into this determinizer for reuse. + /// + /// Note that building a 'State' from a builder always creates a new + /// alloc, so callers should always put the builder back. + fn put_state_builder(&mut self, builder: StateBuilderNFA) { + let _ = core::mem::replace( + &mut self.scratch_state_builder, + builder.clear(), + ); + } + + /// Return the memory usage, in bytes, of this determinizer at the current + /// point in time. This does not include memory used by the NFA or the + /// dense DFA itself. + fn memory_usage(&self) -> usize { + use core::mem::size_of; + + self.builder_states.len() * size_of::() + // Maps likely use more memory than this, but it's probably close. + + self.cache.len() * (size_of::() + size_of::()) + + self.memory_usage_state + + self.stack.capacity() * size_of::() + + self.scratch_state_builder.capacity() + } +} diff --git a/vendor/regex-automata/src/dfa/error.rs b/vendor/regex-automata/src/dfa/error.rs new file mode 100644 index 000000000..6497a4cff --- /dev/null +++ b/vendor/regex-automata/src/dfa/error.rs @@ -0,0 +1,162 @@ +use crate::{ + nfa, + util::{ + id::{PatternID, StateID}, + start::Start, + }, +}; + +/// An error that occurred during the construction of a DFA. +/// +/// This error does not provide many introspection capabilities. There are +/// generally only two things you can do with it: +/// +/// * Obtain a human readable message via its `std::fmt::Display` impl. +/// * Access an underlying [`nfa::thompson::Error`] type from its `source` +/// method via the `std::error::Error` trait. This error only occurs when using +/// convenience routines for building a DFA directly from a pattern string. +/// +/// When the `std` feature is enabled, this implements the `std::error::Error` +/// trait. +#[derive(Clone, Debug)] +pub struct Error { + kind: ErrorKind, +} + +/// The kind of error that occurred during the construction of a DFA. 
+/// +/// Note that this error is non-exhaustive. Adding new variants is not +/// considered a breaking change. +#[derive(Clone, Debug)] +enum ErrorKind { + /// An error that occurred while constructing an NFA as a precursor step + /// before a DFA is compiled. + NFA(nfa::thompson::Error), + /// An error that occurred because an unsupported regex feature was used. + /// The message string describes which unsupported feature was used. + /// + /// The primary regex feature that is unsupported by DFAs is the Unicode + /// word boundary look-around assertion (`\b`). This can be worked around + /// by either using an ASCII word boundary (`(?-u:\b)`) or by enabling the + /// [`dense::Builder::allow_unicode_word_boundary`](dense/struct.Builder.html#method.allow_unicode_word_boundary) + /// option when building a DFA. + Unsupported(&'static str), + /// An error that occurs if too many states are produced while building a + /// DFA. + TooManyStates, + /// An error that occurs if too many start states are needed while building + /// a DFA. + /// + /// This is a kind of oddball error that occurs when building a DFA with + /// start states enabled for each pattern and enough patterns to cause + /// the table of start states to overflow `usize`. + TooManyStartStates, + /// This is another oddball error that can occur if there are too many + /// patterns spread out across too many match states. + TooManyMatchPatternIDs, + /// An error that occurs if the DFA got too big during determinization. + DFAExceededSizeLimit { limit: usize }, + /// An error that occurs if auxiliary storage (not the DFA) used during + /// determinization got too big. + DeterminizeExceededSizeLimit { limit: usize }, +} + +impl Error { + /// Return the kind of this error. + fn kind(&self) -> &ErrorKind { + &self.kind + } + + pub(crate) fn nfa(err: nfa::thompson::Error) -> Error { + Error { kind: ErrorKind::NFA(err) } + } + + pub(crate) fn unsupported_dfa_word_boundary_unicode() -> Error { + let msg = "cannot build DFAs for regexes with Unicode word \ + boundaries; switch to ASCII word boundaries, or \ + heuristically enable Unicode word boundaries or use a \ + different regex engine"; + Error { kind: ErrorKind::Unsupported(msg) } + } + + pub(crate) fn too_many_states() -> Error { + Error { kind: ErrorKind::TooManyStates } + } + + pub(crate) fn too_many_start_states() -> Error { + Error { kind: ErrorKind::TooManyStartStates } + } + + pub(crate) fn too_many_match_pattern_ids() -> Error { + Error { kind: ErrorKind::TooManyMatchPatternIDs } + } + + pub(crate) fn dfa_exceeded_size_limit(limit: usize) -> Error { + Error { kind: ErrorKind::DFAExceededSizeLimit { limit } } + } + + pub(crate) fn determinize_exceeded_size_limit(limit: usize) -> Error { + Error { kind: ErrorKind::DeterminizeExceededSizeLimit { limit } } + } +} + +#[cfg(feature = "std")] +impl std::error::Error for Error { + fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { + match self.kind() { + ErrorKind::NFA(ref err) => Some(err), + ErrorKind::Unsupported(_) => None, + ErrorKind::TooManyStates => None, + ErrorKind::TooManyStartStates => None, + ErrorKind::TooManyMatchPatternIDs => None, + ErrorKind::DFAExceededSizeLimit { .. } => None, + ErrorKind::DeterminizeExceededSizeLimit { .. 
} => None, + } + } +} + +impl core::fmt::Display for Error { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + match self.kind() { + ErrorKind::NFA(_) => write!(f, "error building NFA"), + ErrorKind::Unsupported(ref msg) => { + write!(f, "unsupported regex feature for DFAs: {}", msg) + } + ErrorKind::TooManyStates => write!( + f, + "number of DFA states exceeds limit of {}", + StateID::LIMIT, + ), + ErrorKind::TooManyStartStates => { + let stride = Start::count(); + // The start table has `stride` entries for starting states for + // the entire DFA, and then `stride` entries for each pattern + // if start states for each pattern are enabled (which is the + // only way this error can occur). Thus, the total number of + // patterns that can fit in the table is `stride` less than + // what we can allocate. + let limit = ((core::isize::MAX as usize) - stride) / stride; + write!( + f, + "compiling DFA with start states exceeds pattern \ + pattern limit of {}", + limit, + ) + } + ErrorKind::TooManyMatchPatternIDs => write!( + f, + "compiling DFA with total patterns in all match states \ + exceeds limit of {}", + PatternID::LIMIT, + ), + ErrorKind::DFAExceededSizeLimit { limit } => write!( + f, + "DFA exceeded size limit of {:?} during determinization", + limit, + ), + ErrorKind::DeterminizeExceededSizeLimit { limit } => { + write!(f, "determinization exceeded size limit of {:?}", limit) + } + } + } +} diff --git a/vendor/regex-automata/src/dfa/minimize.rs b/vendor/regex-automata/src/dfa/minimize.rs new file mode 100644 index 000000000..80e2f4e73 --- /dev/null +++ b/vendor/regex-automata/src/dfa/minimize.rs @@ -0,0 +1,461 @@ +use core::{cell::RefCell, fmt, mem}; + +use alloc::{collections::BTreeMap, rc::Rc, vec, vec::Vec}; + +use crate::{ + dfa::{automaton::Automaton, dense, DEAD}, + util::{ + alphabet, + id::{PatternID, StateID}, + }, +}; + +/// An implementation of Hopcroft's algorithm for minimizing DFAs. +/// +/// The algorithm implemented here is mostly taken from Wikipedia: +/// https://en.wikipedia.org/wiki/DFA_minimization#Hopcroft's_algorithm +/// +/// This code has had some light optimization attention paid to it, +/// particularly in the form of reducing allocation as much as possible. +/// However, it is still generally slow. Future optimization work should +/// probably focus on the bigger picture rather than micro-optimizations. For +/// example: +/// +/// 1. Figure out how to more intelligently create initial partitions. That is, +/// Hopcroft's algorithm starts by creating two partitions of DFA states +/// that are known to NOT be equivalent: match states and non-match states. +/// The algorithm proceeds by progressively refining these partitions into +/// smaller partitions. If we could start with more partitions, then we +/// could reduce the amount of work that Hopcroft's algorithm needs to do. +/// 2. For every partition that we visit, we find all incoming transitions to +/// every state in the partition for *every* element in the alphabet. (This +/// is why using byte classes can significantly decrease minimization times, +/// since byte classes shrink the alphabet.) This is quite costly and there +/// is perhaps some redundant work being performed depending on the specific +/// states in the set. For example, we might be able to only visit some +/// elements of the alphabet based on the transitions. +/// 3. Move parts of minimization into determinization. If minimization has +/// fewer states to deal with, then it should run faster. 
A prime example +/// of this might be large Unicode classes, which are generated in way that +/// can create a lot of redundant states. (Some work has been done on this +/// point during NFA compilation via the algorithm described in the +/// "Incremental Construction of MinimalAcyclic Finite-State Automata" +/// paper.) +pub(crate) struct Minimizer<'a> { + dfa: &'a mut dense::OwnedDFA, + in_transitions: Vec>>, + partitions: Vec, + waiting: Vec, +} + +impl<'a> fmt::Debug for Minimizer<'a> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("Minimizer") + .field("dfa", &self.dfa) + .field("in_transitions", &self.in_transitions) + .field("partitions", &self.partitions) + .field("waiting", &self.waiting) + .finish() + } +} + +/// A set of states. A state set makes up a single partition in Hopcroft's +/// algorithm. +/// +/// It is represented by an ordered set of state identifiers. We use shared +/// ownership so that a single state set can be in both the set of partitions +/// and in the set of waiting sets simultaneously without an additional +/// allocation. Generally, once a state set is built, it becomes immutable. +/// +/// We use this representation because it avoids the overhead of more +/// traditional set data structures (HashSet/BTreeSet), and also because +/// computing intersection/subtraction on this representation is especially +/// fast. +#[derive(Clone, Debug, Eq, PartialEq, PartialOrd, Ord)] +struct StateSet { + ids: Rc>>, +} + +impl<'a> Minimizer<'a> { + pub fn new(dfa: &'a mut dense::OwnedDFA) -> Minimizer<'a> { + let in_transitions = Minimizer::incoming_transitions(dfa); + let partitions = Minimizer::initial_partitions(dfa); + let waiting = partitions.clone(); + Minimizer { dfa, in_transitions, partitions, waiting } + } + + pub fn run(mut self) { + let stride2 = self.dfa.stride2(); + let as_state_id = |index: usize| -> StateID { + StateID::new(index << stride2).unwrap() + }; + let as_index = |id: StateID| -> usize { id.as_usize() >> stride2 }; + + let mut incoming = StateSet::empty(); + let mut scratch1 = StateSet::empty(); + let mut scratch2 = StateSet::empty(); + let mut newparts = vec![]; + + // This loop is basically Hopcroft's algorithm. Everything else is just + // shuffling data around to fit our representation. + while let Some(set) = self.waiting.pop() { + for b in self.dfa.byte_classes().iter() { + self.find_incoming_to(b, &set, &mut incoming); + // If incoming is empty, then the intersection with any other + // set must also be empty. So 'newparts' just ends up being + // 'self.partitions'. So there's no need to go through the loop + // below. + // + // This actually turns out to be rather large optimization. On + // the order of making minimization 4-5x faster. It's likely + // that the vast majority of all states have very few incoming + // transitions. 
+ if incoming.is_empty() { + continue; + } + + for p in 0..self.partitions.len() { + self.partitions[p].intersection(&incoming, &mut scratch1); + if scratch1.is_empty() { + newparts.push(self.partitions[p].clone()); + continue; + } + + self.partitions[p].subtract(&incoming, &mut scratch2); + if scratch2.is_empty() { + newparts.push(self.partitions[p].clone()); + continue; + } + + let (x, y) = + (scratch1.deep_clone(), scratch2.deep_clone()); + newparts.push(x.clone()); + newparts.push(y.clone()); + match self.find_waiting(&self.partitions[p]) { + Some(i) => { + self.waiting[i] = x; + self.waiting.push(y); + } + None => { + if x.len() <= y.len() { + self.waiting.push(x); + } else { + self.waiting.push(y); + } + } + } + } + newparts = mem::replace(&mut self.partitions, newparts); + newparts.clear(); + } + } + + // At this point, we now have a minimal partitioning of states, where + // each partition is an equivalence class of DFA states. Now we need to + // use this partioning to update the DFA to only contain one state for + // each partition. + + // Create a map from DFA state ID to the representative ID of the + // equivalence class to which it belongs. The representative ID of an + // equivalence class of states is the minimum ID in that class. + let mut state_to_part = vec![DEAD; self.dfa.state_count()]; + for p in &self.partitions { + p.iter(|id| state_to_part[as_index(id)] = p.min()); + } + + // Generate a new contiguous sequence of IDs for minimal states, and + // create a map from equivalence IDs to the new IDs. Thus, the new + // minimal ID of *any* state in the unminimized DFA can be obtained + // with minimals_ids[state_to_part[old_id]]. + let mut minimal_ids = vec![DEAD; self.dfa.state_count()]; + let mut new_index = 0; + for state in self.dfa.states() { + if state_to_part[as_index(state.id())] == state.id() { + minimal_ids[as_index(state.id())] = as_state_id(new_index); + new_index += 1; + } + } + // The total number of states in the minimal DFA. + let minimal_count = new_index; + // Convenience function for remapping state IDs. This takes an old ID, + // looks up its Hopcroft partition and then maps that to the new ID + // range. + let remap = |old| minimal_ids[as_index(state_to_part[as_index(old)])]; + + // Re-map this DFA in place such that the only states remaining + // correspond to the representative states of every equivalence class. + for id in (0..self.dfa.state_count()).map(as_state_id) { + // If this state isn't a representative for an equivalence class, + // then we skip it since it won't appear in the minimal DFA. + if state_to_part[as_index(id)] != id { + continue; + } + for (_, next) in self.dfa.state_mut(id).iter_mut() { + *next = remap(*next); + } + self.dfa.swap_states(id, minimal_ids[as_index(id)]); + } + // Trim off all unused states from the pre-minimized DFA. This + // represents all states that were merged into a non-singleton + // equivalence class of states, and appeared after the first state + // in each such class. (Because the state with the smallest ID in each + // equivalence class is its representative ID.) + self.dfa.truncate_states(minimal_count); + + // Update the new start states, which is now just the minimal ID of + // whatever state the old start state was collapsed into. Also, we + // collect everything before-hand to work around the borrow checker. + // We're already allocating so much that this is probably fine. If this + // turns out to be costly, then I guess add a `starts_mut` iterator. 
+ let starts: Vec<_> = self.dfa.starts().collect(); + for (old_start_id, start_type, pid) in starts { + self.dfa.set_start_state(start_type, pid, remap(old_start_id)); + } + + // Update the match state pattern ID list for multi-regexes. All we + // need to do is remap the match state IDs. The pattern ID lists are + // always the same as they were since match states with distinct + // pattern ID lists are always considered distinct states. + let mut pmap = BTreeMap::new(); + for (match_id, pattern_ids) in self.dfa.pattern_map() { + let new_id = remap(match_id); + pmap.insert(new_id, pattern_ids); + } + // This unwrap is OK because minimization never increases the number of + // match states or patterns in those match states. Since minimization + // runs after the pattern map has already been set at least once, we + // know that our match states cannot error. + self.dfa.set_pattern_map(&pmap).unwrap(); + + // In order to update the ID of the maximum match state, we need to + // find the maximum ID among all of the match states in the minimized + // DFA. This is not necessarily the new ID of the unminimized maximum + // match state, since that could have been collapsed with a much + // earlier match state. Therefore, to find the new max match state, + // we iterate over all previous match states, find their corresponding + // new minimal ID, and take the maximum of those. + let old = self.dfa.special().clone(); + let new = self.dfa.special_mut(); + // ... but only remap if we had match states. + if old.matches() { + new.min_match = StateID::MAX; + new.max_match = StateID::ZERO; + for i in as_index(old.min_match)..=as_index(old.max_match) { + let new_id = remap(as_state_id(i)); + if new_id < new.min_match { + new.min_match = new_id; + } + if new_id > new.max_match { + new.max_match = new_id; + } + } + } + // ... same, but for start states. + if old.starts() { + new.min_start = StateID::MAX; + new.max_start = StateID::ZERO; + for i in as_index(old.min_start)..=as_index(old.max_start) { + let new_id = remap(as_state_id(i)); + if new_id == DEAD { + continue; + } + if new_id < new.min_start { + new.min_start = new_id; + } + if new_id > new.max_start { + new.max_start = new_id; + } + } + if new.max_start == DEAD { + new.min_start = DEAD; + } + } + new.quit_id = remap(new.quit_id); + new.set_max(); + } + + fn find_waiting(&self, set: &StateSet) -> Option { + self.waiting.iter().position(|s| s == set) + } + + fn find_incoming_to( + &self, + b: alphabet::Unit, + set: &StateSet, + incoming: &mut StateSet, + ) { + incoming.clear(); + set.iter(|id| { + for &inid in + &self.in_transitions[self.dfa.to_index(id)][b.as_usize()] + { + incoming.add(inid); + } + }); + incoming.canonicalize(); + } + + fn initial_partitions(dfa: &dense::OwnedDFA) -> Vec { + // For match states, we know that two match states with different + // pattern ID lists will *always* be distinct, so we can partition them + // initially based on that. 
+ let mut matching: BTreeMap, StateSet> = BTreeMap::new(); + let mut is_quit = StateSet::empty(); + let mut no_match = StateSet::empty(); + for state in dfa.states() { + if dfa.is_match_state(state.id()) { + let mut pids = vec![]; + for i in 0..dfa.match_count(state.id()) { + pids.push(dfa.match_pattern(state.id(), i)); + } + matching + .entry(pids) + .or_insert(StateSet::empty()) + .add(state.id()); + } else if dfa.is_quit_state(state.id()) { + is_quit.add(state.id()); + } else { + no_match.add(state.id()); + } + } + + let mut sets: Vec = + matching.into_iter().map(|(_, set)| set).collect(); + sets.push(no_match); + sets.push(is_quit); + sets + } + + fn incoming_transitions(dfa: &dense::OwnedDFA) -> Vec>> { + let mut incoming = vec![]; + for _ in dfa.states() { + incoming.push(vec![vec![]; dfa.alphabet_len()]); + } + for state in dfa.states() { + for (b, next) in state.transitions() { + incoming[dfa.to_index(next)][b.as_usize()].push(state.id()); + } + } + incoming + } +} + +impl StateSet { + fn empty() -> StateSet { + StateSet { ids: Rc::new(RefCell::new(vec![])) } + } + + fn add(&mut self, id: StateID) { + self.ids.borrow_mut().push(id); + } + + fn min(&self) -> StateID { + self.ids.borrow()[0] + } + + fn canonicalize(&mut self) { + self.ids.borrow_mut().sort(); + self.ids.borrow_mut().dedup(); + } + + fn clear(&mut self) { + self.ids.borrow_mut().clear(); + } + + fn len(&self) -> usize { + self.ids.borrow().len() + } + + fn is_empty(&self) -> bool { + self.len() == 0 + } + + fn deep_clone(&self) -> StateSet { + let ids = self.ids.borrow().iter().cloned().collect(); + StateSet { ids: Rc::new(RefCell::new(ids)) } + } + + fn iter(&self, mut f: F) { + for &id in self.ids.borrow().iter() { + f(id); + } + } + + fn intersection(&self, other: &StateSet, dest: &mut StateSet) { + dest.clear(); + if self.is_empty() || other.is_empty() { + return; + } + + let (seta, setb) = (self.ids.borrow(), other.ids.borrow()); + let (mut ita, mut itb) = (seta.iter().cloned(), setb.iter().cloned()); + let (mut a, mut b) = (ita.next().unwrap(), itb.next().unwrap()); + loop { + if a == b { + dest.add(a); + a = match ita.next() { + None => break, + Some(a) => a, + }; + b = match itb.next() { + None => break, + Some(b) => b, + }; + } else if a < b { + a = match ita.next() { + None => break, + Some(a) => a, + }; + } else { + b = match itb.next() { + None => break, + Some(b) => b, + }; + } + } + } + + fn subtract(&self, other: &StateSet, dest: &mut StateSet) { + dest.clear(); + if self.is_empty() || other.is_empty() { + self.iter(|s| dest.add(s)); + return; + } + + let (seta, setb) = (self.ids.borrow(), other.ids.borrow()); + let (mut ita, mut itb) = (seta.iter().cloned(), setb.iter().cloned()); + let (mut a, mut b) = (ita.next().unwrap(), itb.next().unwrap()); + loop { + if a == b { + a = match ita.next() { + None => break, + Some(a) => a, + }; + b = match itb.next() { + None => { + dest.add(a); + break; + } + Some(b) => b, + }; + } else if a < b { + dest.add(a); + a = match ita.next() { + None => break, + Some(a) => a, + }; + } else { + b = match itb.next() { + None => { + dest.add(a); + break; + } + Some(b) => b, + }; + } + } + for a in ita { + dest.add(a); + } + } +} diff --git a/vendor/regex-automata/src/dfa/mod.rs b/vendor/regex-automata/src/dfa/mod.rs new file mode 100644 index 000000000..6f9fe605e --- /dev/null +++ b/vendor/regex-automata/src/dfa/mod.rs @@ -0,0 +1,363 @@ +/*! +A module for building and searching with determinstic finite automata (DFAs). 
+ +Like other modules in this crate, DFAs support a rich regex syntax with Unicode +features. DFAs also have extensive options for configuring the best space vs +time trade off for your use case and provides support for cheap deserialization +of automata for use in `no_std` environments. + +If you're looking for lazy DFAs that build themselves incrementally during +search, then please see the top-level [`hybrid` module](crate::hybrid). + +# Overview + +This section gives a brief overview of the primary types in this module: + +* A [`regex::Regex`] provides a way to search for matches of a regular +expression using DFAs. This includes iterating over matches with both the start +and end positions of each match. +* A [`dense::DFA`] provides low level access to a DFA that uses a dense +representation (uses lots of space, but fast searching). +* A [`sparse::DFA`] provides the same API as a `dense::DFA`, but uses a sparse +representation (uses less space, but slower searching). +* An [`Automaton`] trait that defines an interface that both dense and sparse +DFAs implement. (A `regex::Regex` is generic over this trait.) +* Both dense DFAs and sparse DFAs support serialization to raw bytes (e.g., +[`dense::DFA::to_bytes_little_endian`]) and cheap deserialization (e.g., +[`dense::DFA::from_bytes`]). + +# Example: basic regex searching + +This example shows how to compile a regex using the default configuration +and then use it to find matches in a byte string: + +``` +use regex_automata::{MultiMatch, dfa::regex::Regex}; + +let re = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}")?; +let text = b"2018-12-24 2016-10-08"; +let matches: Vec = re.find_leftmost_iter(text).collect(); +assert_eq!(matches, vec![ + MultiMatch::must(0, 0, 10), + MultiMatch::must(0, 11, 21), +]); +# Ok::<(), Box>(()) +``` + +# Example: searching with regex sets + +The DFAs in this module all fully support searching with multiple regexes +simultaneously. You can use this support with standard leftmost-first style +searching to find non-overlapping matches: + +``` +use regex_automata::{MultiMatch, dfa::regex::Regex}; + +let re = Regex::new_many(&[r"\w+", r"\S+"])?; +let text = b"@foo bar"; +let matches: Vec = re.find_leftmost_iter(text).collect(); +assert_eq!(matches, vec![ + MultiMatch::must(1, 0, 4), + MultiMatch::must(0, 5, 8), +]); +# Ok::<(), Box>(()) +``` + +Or use overlapping style searches to find all possible occurrences: + +``` +use regex_automata::{MatchKind, MultiMatch, dfa::{dense, regex::Regex}}; + +// N.B. For overlapping searches, we need the underlying DFA to report all +// possible matches. +let re = Regex::builder() + .dense(dense::Config::new().match_kind(MatchKind::All)) + .build_many(&[r"\w{3}", r"\S{3}"])?; +let text = b"@foo bar"; +let matches: Vec = re.find_overlapping_iter(text).collect(); +assert_eq!(matches, vec![ + MultiMatch::must(1, 0, 3), + MultiMatch::must(0, 1, 4), + MultiMatch::must(1, 1, 4), + MultiMatch::must(0, 5, 8), + MultiMatch::must(1, 5, 8), +]); +# Ok::<(), Box>(()) +``` + +# Example: use sparse DFAs + +By default, compiling a regex will use dense DFAs internally. This uses more +memory, but executes searches more quickly. If you can abide slower searches +(somewhere around 3-5x), then sparse DFAs might make more sense since they can +use significantly less space. 
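+
+One rough way to see this trade off for a particular pattern is to build a
+dense DFA, convert it to a sparse DFA and compare the two. (This is only a
+minimal sketch; it assumes the `memory_usage` routines on dense and sparse
+DFAs, which report the heap memory used by each automaton.)
+
+```
+use regex_automata::dfa::dense;
+
+let dense_dfa = dense::DFA::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}")?;
+let sparse_dfa = dense_dfa.to_sparse()?;
+// For this pattern, the sparse representation should need no more (and
+// usually considerably less) heap memory than the dense one.
+assert!(sparse_dfa.memory_usage() <= dense_dfa.memory_usage());
+# Ok::<(), Box<dyn std::error::Error>>(())
+```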
+ +Using sparse DFAs is as easy as using `Regex::new_sparse` instead of +`Regex::new`: + +``` +use regex_automata::{MultiMatch, dfa::regex::Regex}; + +let re = Regex::new_sparse(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap(); +let text = b"2018-12-24 2016-10-08"; +let matches: Vec = re.find_leftmost_iter(text).collect(); +assert_eq!(matches, vec![ + MultiMatch::must(0, 0, 10), + MultiMatch::must(0, 11, 21), +]); +# Ok::<(), Box>(()) +``` + +If you already have dense DFAs for some reason, they can be converted to sparse +DFAs and used to build a new `Regex`. For example: + +``` +use regex_automata::{MultiMatch, dfa::regex::Regex}; + +let dense_re = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap(); +let sparse_re = Regex::builder().build_from_dfas( + dense_re.forward().to_sparse()?, + dense_re.reverse().to_sparse()?, +); +let text = b"2018-12-24 2016-10-08"; +let matches: Vec = sparse_re.find_leftmost_iter(text).collect(); +assert_eq!(matches, vec![ + MultiMatch::must(0, 0, 10), + MultiMatch::must(0, 11, 21), +]); +# Ok::<(), Box>(()) +``` + +# Example: deserialize a DFA + +This shows how to first serialize a DFA into raw bytes, and then deserialize +those raw bytes back into a DFA. While this particular example is a +bit contrived, this same technique can be used in your program to +deserialize a DFA at start up time or by memory mapping a file. + +``` +use regex_automata::{MultiMatch, dfa::{dense, regex::Regex}}; + +let re1 = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap(); +// serialize both the forward and reverse DFAs, see note below +let (fwd_bytes, fwd_pad) = re1.forward().to_bytes_native_endian(); +let (rev_bytes, rev_pad) = re1.reverse().to_bytes_native_endian(); +// now deserialize both---we need to specify the correct type! +let fwd: dense::DFA<&[u32]> = dense::DFA::from_bytes(&fwd_bytes[fwd_pad..])?.0; +let rev: dense::DFA<&[u32]> = dense::DFA::from_bytes(&rev_bytes[rev_pad..])?.0; +// finally, reconstruct our regex +let re2 = Regex::builder().build_from_dfas(fwd, rev); + +// we can use it like normal +let text = b"2018-12-24 2016-10-08"; +let matches: Vec = re2.find_leftmost_iter(text).collect(); +assert_eq!(matches, vec![ + MultiMatch::must(0, 0, 10), + MultiMatch::must(0, 11, 21), +]); +# Ok::<(), Box>(()) +``` + +There are a few points worth noting here: + +* We need to extract the raw DFAs used by the regex and serialize those. You +can build the DFAs manually yourself using [`dense::Builder`], but using +the DFAs from a `Regex` guarantees that the DFAs are built correctly. (In +particular, a `Regex` constructs a reverse DFA for finding the starting +location of matches.) +* To convert the DFA to raw bytes, we use the `to_bytes_native_endian` method. +In practice, you'll want to use either [`dense::DFA::to_bytes_little_endian`] +or [`dense::DFA::to_bytes_big_endian`], depending on which platform you're +deserializing your DFA from. If you intend to deserialize on either platform, +then you'll need to serialize both and deserialize the right one depending on +your target's endianness. +* Safely deserializing a DFA requires verifying the raw bytes, particularly if +they are untrusted, since an invalid DFA could cause logical errors, panics +or even undefined behavior. This verification step requires visiting all of +the transitions in the DFA, which can be costly. If cheaper verification is +desired, then [`dense::DFA::from_bytes_unchecked`] is available that only does +verification that can be performed in constant time. 
However, one can only use +this routine if the caller can guarantee that the bytes provided encoded a +valid DFA. + +The same process can be achieved with sparse DFAs as well: + +``` +use regex_automata::{MultiMatch, dfa::{sparse, regex::Regex}}; + +let re1 = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap(); +// serialize both +let fwd_bytes = re1.forward().to_sparse()?.to_bytes_native_endian(); +let rev_bytes = re1.reverse().to_sparse()?.to_bytes_native_endian(); +// now deserialize both---we need to specify the correct type! +let fwd: sparse::DFA<&[u8]> = sparse::DFA::from_bytes(&fwd_bytes)?.0; +let rev: sparse::DFA<&[u8]> = sparse::DFA::from_bytes(&rev_bytes)?.0; +// finally, reconstruct our regex +let re2 = Regex::builder().build_from_dfas(fwd, rev); + +// we can use it like normal +let text = b"2018-12-24 2016-10-08"; +let matches: Vec = re2.find_leftmost_iter(text).collect(); +assert_eq!(matches, vec![ + MultiMatch::must(0, 0, 10), + MultiMatch::must(0, 11, 21), +]); +# Ok::<(), Box>(()) +``` + +Note that unlike dense DFAs, sparse DFAs have no alignment requirements. +Conversely, dense DFAs must be be aligned to the same alignment as a +[`StateID`](crate::util::id::StateID). + +# Support for `no_std` and `alloc`-only + +This crate comes with `alloc` and `std` features that are enabled by default. +When the `alloc` or `std` features are enabled, the API of this module will +include the facilities necessary for compiling, serializing, deserializing +and searching with DFAs. When only the `alloc` feature is enabled, then +implementations of the `std::error::Error` trait are dropped, but everything +else generally remains the same. When both the `alloc` and `std` features are +disabled, the API of this module will shrink such that it only includes the +facilities necessary for deserializing and searching with DFAs. + +The intended workflow for `no_std` environments is thus as follows: + +* Write a program with the `alloc` or `std` features that compiles and +serializes a regular expression. You may need to serialize both little and big +endian versions of each DFA. (So that's 4 DFAs in total for each regex.) +* In your `no_std` environment, follow the examples above for deserializing +your previously serialized DFAs into regexes. You can then search with them as +you would any regex. + +Deserialization can happen anywhere. For example, with bytes embedded into a +binary or with a file memory mapped at runtime. + +TODO: Include link to `regex-cli` here pointing out how to generate Rust code +for deserializing DFAs. + +# Syntax + +This module supports the same syntax as the `regex` crate, since they share the +same parser. You can find an exhaustive list of supported syntax in the +[documentation for the `regex` crate](https://docs.rs/regex/1/regex/#syntax). + +There are two things that are not supported by the DFAs in this module: + +* Capturing groups. The DFAs (and [`Regex`](regex::Regex)es built on top +of them) can only find the offsets of an entire match, but cannot resolve +the offsets of each capturing group. This is because DFAs do not have the +expressive power necessary. +* Unicode word boundaries. These present particularly difficult challenges for +DFA construction and would result in an explosion in the number of states. +One can enable [`dense::Config::unicode_word_boundary`] though, which provides +heuristic support for Unicode word boundaries that only works on ASCII text. +Otherwise, one can use `(?-u:\b)` for an ASCII word boundary, which will work +on any input. 
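+
+As a small illustration of the second restriction, the sketch below assumes
+the default configuration, where attempting to build a DFA for a Unicode-aware
+`\b` fails, while the ASCII variant compiles and matches as expected:
+
+```
+use regex_automata::dfa::regex::Regex;
+
+// A Unicode word boundary cannot be compiled into a DFA by default...
+assert!(Regex::new(r"\bfoo\b").is_err());
+// ...but the ASCII-only word boundary works on any input.
+let re = Regex::new(r"(?-u:\b)foo(?-u:\b)")?;
+assert!(re.is_match(b"a foo b"));
+# Ok::<(), Box<dyn std::error::Error>>(())
+```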
+ +There are no plans to lift either of these limitations. + +Note that these restrictions are identical to the restrictions on lazy DFAs. + +# Differences with general purpose regexes + +The main goal of the [`regex`](https://docs.rs/regex) crate is to serve as a +general purpose regular expression engine. It aims to automatically balance low +compile times, fast search times and low memory usage, while also providing +a convenient API for users. In contrast, this module provides a lower level +regular expression interface based exclusively on DFAs that is a bit less +convenient while providing more explicit control over memory usage and search +times. + +Here are some specific negative differences: + +* **Compilation can take an exponential amount of time and space** in the size +of the regex pattern. While most patterns do not exhibit worst case exponential +time, such patterns do exist. For example, `[01]*1[01]{N}` will build a DFA +with approximately `2^(N+2)` states. For this reason, untrusted patterns should +not be compiled with this module. (In the future, the API may expose an option +to return an error if the DFA gets too big.) +* This module does not support sub-match extraction via capturing groups, which +can be achieved with the regex crate's "captures" API. +* While the regex crate doesn't necessarily sport fast compilation times, +the regexes in this module are almost universally slow to compile, especially +when they contain large Unicode character classes. For example, on my system, +compiling `\w{50}` takes about 1 second and almost 15MB of memory! (Compiling +a sparse regex takes about the same time but only uses about 1.2MB of +memory.) Conversly, compiling the same regex without Unicode support, e.g., +`(?-u)\w{50}`, takes under 1 millisecond and about 15KB of memory. For this +reason, you should only use Unicode character classes if you absolutely need +them! (They are enabled by default though.) +* This module does not support Unicode word boundaries. ASCII word bondaries +may be used though by disabling Unicode or selectively doing so in the syntax, +e.g., `(?-u:\b)`. There is also an option to +[heuristically enable Unicode word boundaries](crate::dfa::dense::Config::unicode_word_boundary), +where the corresponding DFA will give up if any non-ASCII byte is seen. +* As a lower level API, this module does not do literal optimizations +automatically. Although it does provide hooks in its API to make use of the +[`Prefilter`](crate::util::prefilter::Prefilter) trait. Missing literal +optimizations means that searches may run much slower than what you're +accustomed to, although, it does provide more predictable and consistent +performance. +* There is no `&str` API like in the regex crate. In this module, all APIs +operate on `&[u8]`. By default, match indices are guaranteed to fall on UTF-8 +boundaries, unless any of [`SyntaxConfig::utf8`](crate::SyntaxConfig::utf8), +[`nfa::thompson::Config::utf8`](crate::nfa::thompson::Config::utf8) or +[`regex::Config::utf8`] are disabled. + +With some of the downsides out of the way, here are some positive differences: + +* Both dense and sparse DFAs can be serialized to raw bytes, and then cheaply +deserialized. Deserialization can be done in constant time with the unchecked +APIs, since searching can be performed directly on the raw serialized bytes of +a DFA. +* This module was specifically designed so that the searching phase of a +DFA has minimal runtime requirements, and can therefore be used in `no_std` +environments. 
While `no_std` environments cannot compile regexes, they can +deserialize pre-compiled regexes. +* Since this module builds DFAs ahead of time, it will generally out-perform +the `regex` crate on equivalent tasks. The performance difference is likely +not large. However, because of a complex set of optimizations in the regex +crate (like literal optimizations), an accurate performance comparison may be +difficult to do. +* Sparse DFAs provide a way to build a DFA ahead of time that sacrifices search +performance a small amount, but uses much less storage space. Potentially even +less than what the regex crate uses. +* This module exposes DFAs directly, such as [`dense::DFA`] and +[`sparse::DFA`], which enables one to do less work in some cases. For example, +if you only need the end of a match and not the start of a match, then you can +use a DFA directly without building a `Regex`, which always requires a second +DFA to find the start of a match. +* This module provides more control over memory usage. Aside from choosing +between dense and sparse DFAs, one can also choose a smaller state identifier +representation to use less space. Also, one can enable DFA minimization +via [`dense::Config::minimize`], but it can increase compilation times +dramatically. +*/ + +pub use crate::dfa::automaton::{Automaton, OverlappingState}; +#[cfg(feature = "alloc")] +pub use crate::dfa::error::Error; + +/// This is an alias for a state ID of zero. It has special significance +/// because it always corresponds to the first state in a DFA, and the first +/// state in a DFA is always "dead." That is, the dead state always has all +/// of its transitions set to itself. Moreover, the dead state is used as a +/// sentinel for various things. e.g., In search, reaching a dead state means +/// that the search must stop. +const DEAD: crate::util::id::StateID = crate::util::id::StateID::ZERO; + +mod accel; +mod automaton; +pub mod dense; +#[cfg(feature = "alloc")] +mod determinize; +#[cfg(feature = "alloc")] +pub(crate) mod error; +#[cfg(feature = "alloc")] +mod minimize; +pub mod regex; +mod search; +pub mod sparse; +mod special; +#[cfg(feature = "transducer")] +mod transducer; diff --git a/vendor/regex-automata/src/dfa/regex.rs b/vendor/regex-automata/src/dfa/regex.rs new file mode 100644 index 000000000..d0917e17d --- /dev/null +++ b/vendor/regex-automata/src/dfa/regex.rs @@ -0,0 +1,2146 @@ +/*! +A DFA-backed `Regex`. + +This module provides [`Regex`], which is defined generically over the +[`Automaton`] trait. A `Regex` implements convenience routines you might have +come to expect, such as finding the start/end of a match and iterating over +all non-overlapping matches. This `Regex` type is limited in its capabilities +to what a DFA can provide. Therefore, APIs involving capturing groups, for +example, are not provided. + +Internally, a `Regex` is composed of two DFAs. One is a "forward" DFA that +finds the end offset of a match, where as the other is a "reverse" DFA that +find the start offset of a match. + +See the [parent module](crate::dfa) for examples. +*/ + +#[cfg(feature = "alloc")] +use alloc::vec::Vec; + +use crate::{ + dfa::automaton::{Automaton, OverlappingState}, + util::prefilter::{self, Prefilter}, + MatchError, MultiMatch, +}; +#[cfg(feature = "alloc")] +use crate::{ + dfa::{dense, error::Error, sparse}, + nfa::thompson, + util::matchtypes::MatchKind, +}; + +// When the alloc feature is enabled, the regex type sets its A type parameter +// to default to an owned dense DFA. 
But without alloc, we set no default. This +// makes things a lot more convenient in the common case, since writing out the +// DFA types is pretty annoying. +// +// Since we have two different definitions but only want to write one doc +// string, we use a macro to capture the doc and other attributes once and then +// repeat them for each definition. +macro_rules! define_regex_type { + ($(#[$doc:meta])*) => { + #[cfg(feature = "alloc")] + $(#[$doc])* + pub struct Regex { + prefilter: Option
<P>
, + forward: A, + reverse: A, + utf8: bool, + } + + #[cfg(not(feature = "alloc"))] + $(#[$doc])* + pub struct Regex { + prefilter: Option
<P>
, + forward: A, + reverse: A, + utf8: bool, + } + }; +} + +define_regex_type!( + /// A regular expression that uses deterministic finite automata for fast + /// searching. + /// + /// A regular expression is comprised of two DFAs, a "forward" DFA and a + /// "reverse" DFA. The forward DFA is responsible for detecting the end of + /// a match while the reverse DFA is responsible for detecting the start + /// of a match. Thus, in order to find the bounds of any given match, a + /// forward search must first be run followed by a reverse search. A match + /// found by the forward DFA guarantees that the reverse DFA will also find + /// a match. + /// + /// The type of the DFA used by a `Regex` corresponds to the `A` type + /// parameter, which must satisfy the [`Automaton`] trait. Typically, + /// `A` is either a [`dense::DFA`](crate::dfa::dense::DFA) or a + /// [`sparse::DFA`](crate::dfa::sparse::DFA), where dense DFAs use more + /// memory but search faster, while sparse DFAs use less memory but search + /// more slowly. + /// + /// By default, a regex's automaton type parameter is set to + /// `dense::DFA>` when the `alloc` feature is enabled. For most + /// in-memory work loads, this is the most convenient type that gives the + /// best search performance. When the `alloc` feature is disabled, no + /// default type is used. + /// + /// A `Regex` also has a `P` type parameter, which is used to select the + /// prefilter used during search. By default, no prefilter is enabled by + /// setting the type to default to [`prefilter::None`]. A prefilter can be + /// enabled by using the [`Regex::prefilter`] method. + /// + /// # When should I use this? + /// + /// Generally speaking, if you can afford the overhead of building a full + /// DFA for your regex, and you don't need things like capturing groups, + /// then this is a good choice if you're looking to optimize for matching + /// speed. Note however that its speed may be worse than a general purpose + /// regex engine if you don't select a good [prefilter]. + /// + /// # Earliest vs Leftmost vs Overlapping + /// + /// The search routines exposed on a `Regex` reflect three different ways + /// of searching: + /// + /// * "earliest" means to stop as soon as a match has been detected. + /// * "leftmost" means to continue matching until the underlying + /// automaton cannot advance. This reflects "standard" searching you + /// might be used to in other regex engines. e.g., This permits + /// non-greedy and greedy searching to work as you would expect. + /// * "overlapping" means to find all possible matches, even if they + /// overlap. + /// + /// Generally speaking, when doing an overlapping search, you'll want to + /// build your regex DFAs with [`MatchKind::All`] semantics. Using + /// [`MatchKind::LeftmostFirst`] semantics with overlapping searches is + /// likely to lead to odd behavior since `LeftmostFirst` specifically omits + /// some matches that can never be reported due to its semantics. + /// + /// The following example shows the differences between how these different + /// types of searches impact looking for matches of `[a-z]+` in the + /// haystack `abc`. + /// + /// ``` + /// use regex_automata::{dfa::{self, dense}, MatchKind, MultiMatch}; + /// + /// let pattern = r"[a-z]+"; + /// let haystack = "abc".as_bytes(); + /// + /// // With leftmost-first semantics, we test "earliest" and "leftmost". 
+ /// let re = dfa::regex::Builder::new() + /// .dense(dense::Config::new().match_kind(MatchKind::LeftmostFirst)) + /// .build(pattern)?; + /// + /// // "earliest" searching isn't impacted by greediness + /// let mut it = re.find_earliest_iter(haystack); + /// assert_eq!(Some(MultiMatch::must(0, 0, 1)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 1, 2)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 2, 3)), it.next()); + /// assert_eq!(None, it.next()); + /// + /// // "leftmost" searching supports greediness (and non-greediness) + /// let mut it = re.find_leftmost_iter(haystack); + /// assert_eq!(Some(MultiMatch::must(0, 0, 3)), it.next()); + /// assert_eq!(None, it.next()); + /// + /// // For overlapping, we want "all" match kind semantics. + /// let re = dfa::regex::Builder::new() + /// .dense(dense::Config::new().match_kind(MatchKind::All)) + /// .build(pattern)?; + /// + /// // In the overlapping search, we find all three possible matches + /// // starting at the beginning of the haystack. + /// let mut it = re.find_overlapping_iter(haystack); + /// assert_eq!(Some(MultiMatch::must(0, 0, 1)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 0, 2)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 0, 3)), it.next()); + /// assert_eq!(None, it.next()); + /// + /// # Ok::<(), Box>(()) + /// ``` + /// + /// # Sparse DFAs + /// + /// Since a `Regex` is generic over the [`Automaton`] trait, it can be + /// used with any kind of DFA. While this crate constructs dense DFAs by + /// default, it is easy enough to build corresponding sparse DFAs, and then + /// build a regex from them: + /// + /// ``` + /// use regex_automata::dfa::regex::Regex; + /// + /// // First, build a regex that uses dense DFAs. + /// let dense_re = Regex::new("foo[0-9]+")?; + /// + /// // Second, build sparse DFAs from the forward and reverse dense DFAs. + /// let fwd = dense_re.forward().to_sparse()?; + /// let rev = dense_re.reverse().to_sparse()?; + /// + /// // Third, build a new regex from the constituent sparse DFAs. + /// let sparse_re = Regex::builder().build_from_dfas(fwd, rev); + /// + /// // A regex that uses sparse DFAs can be used just like with dense DFAs. + /// assert_eq!(true, sparse_re.is_match(b"foo123")); + /// + /// # Ok::<(), Box>(()) + /// ``` + /// + /// Alternatively, one can use a [`Builder`] to construct a sparse DFA + /// more succinctly. (Note though that dense DFAs are still constructed + /// first internally, and then converted to sparse DFAs, as in the example + /// above.) + /// + /// ``` + /// use regex_automata::dfa::regex::Regex; + /// + /// let sparse_re = Regex::builder().build_sparse(r"foo[0-9]+")?; + /// // A regex that uses sparse DFAs can be used just like with dense DFAs. + /// assert!(sparse_re.is_match(b"foo123")); + /// + /// # Ok::<(), Box>(()) + /// ``` + /// + /// # Fallibility + /// + /// In non-default configurations, the DFAs generated in this module may + /// return an error during a search. (Currently, the only way this happens + /// is if quit bytes are added or Unicode word boundaries are heuristically + /// enabled, both of which are turned off by default.) For convenience, the + /// main search routines, like [`find_leftmost`](Regex::find_leftmost), + /// will panic if an error occurs. However, if you need to use DFAs + /// which may produce an error at search time, then there are fallible + /// equivalents of all search routines. 
For example, for `find_leftmost`, + /// its fallible analog is [`try_find_leftmost`](Regex::try_find_leftmost). + /// The routines prefixed with `try_` return `Result, + /// MatchError>`, where as the infallible routines simply return + /// `Option`. + /// + /// # Example + /// + /// This example shows how to cause a search to terminate if it sees a + /// `\n` byte, and handle the error returned. This could be useful if, for + /// example, you wanted to prevent a user supplied pattern from matching + /// across a line boundary. + /// + /// ``` + /// use regex_automata::{dfa::{self, regex::Regex}, MatchError}; + /// + /// let re = Regex::builder() + /// .dense(dfa::dense::Config::new().quit(b'\n', true)) + /// .build(r"foo\p{any}+bar")?; + /// + /// let haystack = "foo\nbar".as_bytes(); + /// // Normally this would produce a match, since \p{any} contains '\n'. + /// // But since we instructed the automaton to enter a quit state if a + /// // '\n' is observed, this produces a match error instead. + /// let expected = MatchError::Quit { byte: 0x0A, offset: 3 }; + /// let got = re.try_find_leftmost(haystack).unwrap_err(); + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box>(()) + /// ``` + #[derive(Clone, Debug)] +); + +#[cfg(feature = "alloc")] +impl Regex { + /// Parse the given regular expression using the default configuration and + /// return the corresponding regex. + /// + /// If you want a non-default configuration, then use the [`Builder`] to + /// set your own configuration. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{MultiMatch, dfa::regex::Regex}; + /// + /// let re = Regex::new("foo[0-9]+bar")?; + /// assert_eq!( + /// Some(MultiMatch::must(0, 3, 14)), + /// re.find_leftmost(b"zzzfoo12345barzzz"), + /// ); + /// # Ok::<(), Box>(()) + /// ``` + pub fn new(pattern: &str) -> Result { + Builder::new().build(pattern) + } + + /// Like `new`, but parses multiple patterns into a single "regex set." + /// This similarly uses the default regex configuration. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{MultiMatch, dfa::regex::Regex}; + /// + /// let re = Regex::new_many(&["[a-z]+", "[0-9]+"])?; + /// + /// let mut it = re.find_leftmost_iter(b"abc 1 foo 4567 0 quux"); + /// assert_eq!(Some(MultiMatch::must(0, 0, 3)), it.next()); + /// assert_eq!(Some(MultiMatch::must(1, 4, 5)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 6, 9)), it.next()); + /// assert_eq!(Some(MultiMatch::must(1, 10, 14)), it.next()); + /// assert_eq!(Some(MultiMatch::must(1, 15, 16)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 17, 21)), it.next()); + /// assert_eq!(None, it.next()); + /// # Ok::<(), Box>(()) + /// ``` + pub fn new_many>(patterns: &[P]) -> Result { + Builder::new().build_many(patterns) + } +} + +#[cfg(feature = "alloc")] +impl Regex>> { + /// Parse the given regular expression using the default configuration, + /// except using sparse DFAs, and return the corresponding regex. + /// + /// If you want a non-default configuration, then use the [`Builder`] to + /// set your own configuration. 
+ /// + /// # Example + /// + /// ``` + /// use regex_automata::{MultiMatch, dfa::regex::Regex}; + /// + /// let re = Regex::new_sparse("foo[0-9]+bar")?; + /// assert_eq!( + /// Some(MultiMatch::must(0, 3, 14)), + /// re.find_leftmost(b"zzzfoo12345barzzz"), + /// ); + /// # Ok::<(), Box>(()) + /// ``` + pub fn new_sparse( + pattern: &str, + ) -> Result>>, Error> { + Builder::new().build_sparse(pattern) + } + + /// Like `new`, but parses multiple patterns into a single "regex set" + /// using sparse DFAs. This otherwise similarly uses the default regex + /// configuration. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{MultiMatch, dfa::regex::Regex}; + /// + /// let re = Regex::new_many_sparse(&["[a-z]+", "[0-9]+"])?; + /// + /// let mut it = re.find_leftmost_iter(b"abc 1 foo 4567 0 quux"); + /// assert_eq!(Some(MultiMatch::must(0, 0, 3)), it.next()); + /// assert_eq!(Some(MultiMatch::must(1, 4, 5)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 6, 9)), it.next()); + /// assert_eq!(Some(MultiMatch::must(1, 10, 14)), it.next()); + /// assert_eq!(Some(MultiMatch::must(1, 15, 16)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 17, 21)), it.next()); + /// assert_eq!(None, it.next()); + /// # Ok::<(), Box>(()) + /// ``` + pub fn new_many_sparse>( + patterns: &[P], + ) -> Result>>, Error> { + Builder::new().build_many_sparse(patterns) + } +} + +/// Convenience routines for regex construction. +#[cfg(feature = "alloc")] +impl Regex { + /// Return a default configuration for a `Regex`. + /// + /// This is a convenience routine to avoid needing to import the `Config` + /// type when customizing the construction of a regex. + /// + /// # Example + /// + /// This example shows how to disable UTF-8 mode for `Regex` iteration. + /// When UTF-8 mode is disabled, the position immediately following an + /// empty match is where the next search begins, instead of the next + /// position of a UTF-8 encoded codepoint. + /// + /// ``` + /// use regex_automata::{dfa::regex::Regex, MultiMatch}; + /// + /// let re = Regex::builder() + /// .configure(Regex::config().utf8(false)) + /// .build(r"")?; + /// let haystack = "a☃z".as_bytes(); + /// let mut it = re.find_leftmost_iter(haystack); + /// assert_eq!(Some(MultiMatch::must(0, 0, 0)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 1, 1)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 2, 2)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 3, 3)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 4, 4)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 5, 5)), it.next()); + /// assert_eq!(None, it.next()); + /// + /// # Ok::<(), Box>(()) + /// ``` + pub fn config() -> Config { + Config::new() + } + + /// Return a builder for configuring the construction of a `Regex`. + /// + /// This is a convenience routine to avoid needing to import the + /// [`Builder`] type in common cases. + /// + /// # Example + /// + /// This example shows how to use the builder to disable UTF-8 mode + /// everywhere. 
+ /// + /// ``` + /// use regex_automata::{ + /// dfa::regex::Regex, + /// nfa::thompson, + /// MultiMatch, SyntaxConfig, + /// }; + /// + /// let re = Regex::builder() + /// .configure(Regex::config().utf8(false)) + /// .syntax(SyntaxConfig::new().utf8(false)) + /// .thompson(thompson::Config::new().utf8(false)) + /// .build(r"foo(?-u:[^b])ar.*")?; + /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n"; + /// let expected = Some(MultiMatch::must(0, 1, 9)); + /// let got = re.find_leftmost(haystack); + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box>(()) + /// ``` + pub fn builder() -> Builder { + Builder::new() + } +} + +/// Standard search routines for finding and iterating over matches. +impl Regex { + /// Returns true if and only if this regex matches the given haystack. + /// + /// This routine may short circuit if it knows that scanning future input + /// will never lead to a different result. In particular, if the underlying + /// DFA enters a match state or a dead state, then this routine will return + /// `true` or `false`, respectively, without inspecting any future input. + /// + /// # Panics + /// + /// If the underlying DFAs return an error, then this routine panics. This + /// only occurs in non-default configurations where quit bytes are used or + /// Unicode word boundaries are heuristically enabled. + /// + /// The fallible version of this routine is + /// [`try_is_match`](Regex::try_is_match). + /// + /// # Example + /// + /// ``` + /// use regex_automata::dfa::regex::Regex; + /// + /// let re = Regex::new("foo[0-9]+bar")?; + /// assert_eq!(true, re.is_match(b"foo12345bar")); + /// assert_eq!(false, re.is_match(b"foobar")); + /// # Ok::<(), Box>(()) + /// ``` + pub fn is_match(&self, haystack: &[u8]) -> bool { + self.is_match_at(haystack, 0, haystack.len()) + } + + /// Returns the first position at which a match is found. + /// + /// This routine stops scanning input in precisely the same circumstances + /// as `is_match`. The key difference is that this routine returns the + /// position at which it stopped scanning input if and only if a match + /// was found. If no match is found, then `None` is returned. + /// + /// # Panics + /// + /// If the underlying DFAs return an error, then this routine panics. This + /// only occurs in non-default configurations where quit bytes are used or + /// Unicode word boundaries are heuristically enabled. + /// + /// The fallible version of this routine is + /// [`try_find_earliest`](Regex::try_find_earliest). + /// + /// # Example + /// + /// ``` + /// use regex_automata::{MultiMatch, dfa::regex::Regex}; + /// + /// // Normally, the leftmost first match would greedily consume as many + /// // decimal digits as it could. But a match is detected as soon as one + /// // digit is seen. + /// let re = Regex::new("foo[0-9]+")?; + /// assert_eq!( + /// Some(MultiMatch::must(0, 0, 4)), + /// re.find_earliest(b"foo12345"), + /// ); + /// + /// // Normally, the end of the leftmost first match here would be 3, + /// // but the "earliest" match semantics detect a match earlier. + /// let re = Regex::new("abc|a")?; + /// assert_eq!(Some(MultiMatch::must(0, 0, 1)), re.find_earliest(b"abc")); + /// # Ok::<(), Box>(()) + /// ``` + pub fn find_earliest(&self, haystack: &[u8]) -> Option { + self.find_earliest_at(haystack, 0, haystack.len()) + } + + /// Returns the start and end offset of the leftmost match. If no match + /// exists, then `None` is returned. 
+ /// + /// # Panics + /// + /// If the underlying DFAs return an error, then this routine panics. This + /// only occurs in non-default configurations where quit bytes are used or + /// Unicode word boundaries are heuristically enabled. + /// + /// The fallible version of this routine is + /// [`try_find_leftmost`](Regex::try_find_leftmost). + /// + /// # Example + /// + /// ``` + /// use regex_automata::{MultiMatch, dfa::regex::Regex}; + /// + /// // Greediness is applied appropriately when compared to find_earliest. + /// let re = Regex::new("foo[0-9]+")?; + /// assert_eq!( + /// Some(MultiMatch::must(0, 3, 11)), + /// re.find_leftmost(b"zzzfoo12345zzz"), + /// ); + /// + /// // Even though a match is found after reading the first byte (`a`), + /// // the default leftmost-first match semantics demand that we find the + /// // earliest match that prefers earlier parts of the pattern over latter + /// // parts. + /// let re = Regex::new("abc|a")?; + /// assert_eq!(Some(MultiMatch::must(0, 0, 3)), re.find_leftmost(b"abc")); + /// # Ok::<(), Box>(()) + /// ``` + pub fn find_leftmost(&self, haystack: &[u8]) -> Option { + self.find_leftmost_at(haystack, 0, haystack.len()) + } + + /// Search for the first overlapping match in `haystack`. + /// + /// This routine is principally useful when searching for multiple patterns + /// on inputs where multiple patterns may match the same regions of text. + /// In particular, callers must preserve the automaton's search state from + /// prior calls so that the implementation knows where the last match + /// occurred and which pattern was reported. + /// + /// # Panics + /// + /// If the underlying DFAs return an error, then this routine panics. This + /// only occurs in non-default configurations where quit bytes are used or + /// Unicode word boundaries are heuristically enabled. + /// + /// The fallible version of this routine is + /// [`try_find_overlapping`](Regex::try_find_overlapping). + /// + /// # Example + /// + /// This example shows how to run an overlapping search with multiple + /// regexes. + /// + /// ``` + /// use regex_automata::{dfa::{self, regex::Regex}, MatchKind, MultiMatch}; + /// + /// let re = Regex::builder() + /// .dense(dfa::dense::Config::new().match_kind(MatchKind::All)) + /// .build_many(&[r"\w+$", r"\S+$"])?; + /// let haystack = "@foo".as_bytes(); + /// let mut state = dfa::OverlappingState::start(); + /// + /// let expected = Some(MultiMatch::must(1, 0, 4)); + /// let got = re.find_overlapping(haystack, &mut state); + /// assert_eq!(expected, got); + /// + /// // The first pattern also matches at the same position, so re-running + /// // the search will yield another match. Notice also that the first + /// // pattern is returned after the second. This is because the second + /// // pattern begins its match before the first, is therefore an earlier + /// // match and is thus reported first. + /// let expected = Some(MultiMatch::must(0, 1, 4)); + /// let got = re.find_overlapping(haystack, &mut state); + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box>(()) + /// ``` + pub fn find_overlapping( + &self, + haystack: &[u8], + state: &mut OverlappingState, + ) -> Option { + self.find_overlapping_at(haystack, 0, haystack.len(), state) + } + + /// Returns an iterator over all non-overlapping "earliest" matches. + /// + /// Match positions are reported as soon as a match is known to occur, even + /// if the standard leftmost match would be longer. 
+ /// + /// # Panics + /// + /// If the underlying DFAs return an error during iteration, then iteration + /// panics. This only occurs in non-default configurations where quit bytes + /// are used or Unicode word boundaries are heuristically enabled. + /// + /// The fallible version of this routine is + /// [`try_find_earliest_iter`](Regex::try_find_earliest_iter). + /// + /// # Example + /// + /// This example shows how to run an "earliest" iterator. + /// + /// ``` + /// use regex_automata::{dfa::regex::Regex, MultiMatch}; + /// + /// let re = Regex::new("[0-9]+")?; + /// let haystack = "123".as_bytes(); + /// + /// // Normally, a standard leftmost iterator would return a single + /// // match, but since "earliest" detects matches earlier, we get + /// // three matches. + /// let mut it = re.find_earliest_iter(haystack); + /// assert_eq!(Some(MultiMatch::must(0, 0, 1)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 1, 2)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 2, 3)), it.next()); + /// assert_eq!(None, it.next()); + /// + /// # Ok::<(), Box>(()) + /// ``` + pub fn find_earliest_iter<'r, 't>( + &'r self, + haystack: &'t [u8], + ) -> FindEarliestMatches<'r, 't, A, P> { + FindEarliestMatches::new(self, haystack) + } + + /// Returns an iterator over all non-overlapping leftmost matches in the + /// given bytes. If no match exists, then the iterator yields no elements. + /// + /// This corresponds to the "standard" regex search iterator. + /// + /// # Panics + /// + /// If the underlying DFAs return an error during iteration, then iteration + /// panics. This only occurs in non-default configurations where quit bytes + /// are used or Unicode word boundaries are heuristically enabled. + /// + /// The fallible version of this routine is + /// [`try_find_leftmost_iter`](Regex::try_find_leftmost_iter). + /// + /// # Example + /// + /// ``` + /// use regex_automata::{MultiMatch, dfa::regex::Regex}; + /// + /// let re = Regex::new("foo[0-9]+")?; + /// let text = b"foo1 foo12 foo123"; + /// let matches: Vec = re.find_leftmost_iter(text).collect(); + /// assert_eq!(matches, vec![ + /// MultiMatch::must(0, 0, 4), + /// MultiMatch::must(0, 5, 10), + /// MultiMatch::must(0, 11, 17), + /// ]); + /// # Ok::<(), Box>(()) + /// ``` + pub fn find_leftmost_iter<'r, 't>( + &'r self, + haystack: &'t [u8], + ) -> FindLeftmostMatches<'r, 't, A, P> { + FindLeftmostMatches::new(self, haystack) + } + + /// Returns an iterator over all overlapping matches in the given haystack. + /// + /// This routine is principally useful when searching for multiple patterns + /// on inputs where multiple patterns may match the same regions of text. + /// The iterator takes care of handling the overlapping state that must be + /// threaded through every search. + /// + /// # Panics + /// + /// If the underlying DFAs return an error during iteration, then iteration + /// panics. This only occurs in non-default configurations where quit bytes + /// are used or Unicode word boundaries are heuristically enabled. + /// + /// The fallible version of this routine is + /// [`try_find_overlapping_iter`](Regex::try_find_overlapping_iter). + /// + /// # Example + /// + /// This example shows how to run an overlapping search with multiple + /// regexes. 
+ /// + /// ``` + /// use regex_automata::{dfa::{self, regex::Regex}, MatchKind, MultiMatch}; + /// + /// let re = Regex::builder() + /// .dense(dfa::dense::Config::new().match_kind(MatchKind::All)) + /// .build_many(&[r"\w+$", r"\S+$"])?; + /// let haystack = "@foo".as_bytes(); + /// + /// let mut it = re.find_overlapping_iter(haystack); + /// assert_eq!(Some(MultiMatch::must(1, 0, 4)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 1, 4)), it.next()); + /// assert_eq!(None, it.next()); + /// + /// # Ok::<(), Box>(()) + /// ``` + pub fn find_overlapping_iter<'r, 't>( + &'r self, + haystack: &'t [u8], + ) -> FindOverlappingMatches<'r, 't, A, P> { + FindOverlappingMatches::new(self, haystack) + } +} + +/// Lower level infallible search routines that permit controlling where +/// the search starts and ends in a particular sequence. This is useful for +/// executing searches that need to take surrounding context into account. This +/// is required for correctly implementing iteration because of look-around +/// operators (`^`, `$`, `\b`). +impl Regex { + /// Returns true if and only if this regex matches the given haystack. + /// + /// This routine may short circuit if it knows that scanning future input + /// will never lead to a different result. In particular, if the underlying + /// DFA enters a match state or a dead state, then this routine will return + /// `true` or `false`, respectively, without inspecting any future input. + /// + /// # Searching a substring of the haystack + /// + /// Being an "at" search routine, this permits callers to search a + /// substring of `haystack` by specifying a range in `haystack`. + /// Why expose this as an API instead of just asking callers to use + /// `&input[start..end]`? The reason is that regex matching often wants + /// to take the surrounding context into account in order to handle + /// look-around (`^`, `$` and `\b`). + /// + /// # Panics + /// + /// If the underlying DFAs return an error, then this routine panics. This + /// only occurs in non-default configurations where quit bytes are used or + /// Unicode word boundaries are heuristically enabled. + /// + /// The fallible version of this routine is + /// [`try_is_match_at`](Regex::try_is_match_at). + pub fn is_match_at( + &self, + haystack: &[u8], + start: usize, + end: usize, + ) -> bool { + self.try_is_match_at(haystack, start, end).unwrap() + } + + /// Returns the first position at which a match is found. + /// + /// This routine stops scanning input in precisely the same circumstances + /// as `is_match`. The key difference is that this routine returns the + /// position at which it stopped scanning input if and only if a match + /// was found. If no match is found, then `None` is returned. + /// + /// # Searching a substring of the haystack + /// + /// Being an "at" search routine, this permits callers to search a + /// substring of `haystack` by specifying a range in `haystack`. + /// Why expose this as an API instead of just asking callers to use + /// `&input[start..end]`? The reason is that regex matching often wants + /// to take the surrounding context into account in order to handle + /// look-around (`^`, `$` and `\b`). + /// + /// This is useful when implementing an iterator over matches + /// within the same haystack, which cannot be done correctly by simply + /// providing a subslice of `haystack`. + /// + /// # Panics + /// + /// If the underlying DFAs return an error, then this routine panics. 
This + /// only occurs in non-default configurations where quit bytes are used or + /// Unicode word boundaries are heuristically enabled. + /// + /// The fallible version of this routine is + /// [`try_find_earliest_at`](Regex::try_find_earliest_at). + pub fn find_earliest_at( + &self, + haystack: &[u8], + start: usize, + end: usize, + ) -> Option { + self.try_find_earliest_at(haystack, start, end).unwrap() + } + + /// Returns the same as `find_leftmost`, but starts the search at the given + /// offset. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, if the DFA is anchored, then + /// a match can only occur when `start == 0`. + /// + /// # Searching a substring of the haystack + /// + /// Being an "at" search routine, this permits callers to search a + /// substring of `haystack` by specifying a range in `haystack`. + /// Why expose this as an API instead of just asking callers to use + /// `&input[start..end]`? The reason is that regex matching often wants + /// to take the surrounding context into account in order to handle + /// look-around (`^`, `$` and `\b`). + /// + /// This is useful when implementing an iterator over matches within the + /// same haystack, which cannot be done correctly by simply providing a + /// subslice of `haystack`. + /// + /// # Panics + /// + /// If the underlying DFAs return an error, then this routine panics. This + /// only occurs in non-default configurations where quit bytes are used or + /// Unicode word boundaries are heuristically enabled. + /// + /// The fallible version of this routine is + /// [`try_find_leftmost_at`](Regex::try_find_leftmost_at). + pub fn find_leftmost_at( + &self, + haystack: &[u8], + start: usize, + end: usize, + ) -> Option { + self.try_find_leftmost_at(haystack, start, end).unwrap() + } + + /// Search for the first overlapping match within a given range of + /// `haystack`. + /// + /// This routine is principally useful when searching for multiple patterns + /// on inputs where multiple patterns may match the same regions of text. + /// In particular, callers must preserve the automaton's search state from + /// prior calls so that the implementation knows where the last match + /// occurred and which pattern was reported. + /// + /// # Searching a substring of the haystack + /// + /// Being an "at" search routine, this permits callers to search a + /// substring of `haystack` by specifying a range in `haystack`. + /// Why expose this as an API instead of just asking callers to use + /// `&input[start..end]`? The reason is that regex matching often wants + /// to take the surrounding context into account in order to handle + /// look-around (`^`, `$` and `\b`). + /// + /// This is useful when implementing an iterator over matches + /// within the same haystack, which cannot be done correctly by simply + /// providing a subslice of `haystack`. + /// + /// # Panics + /// + /// If the underlying DFAs return an error, then this routine panics. This + /// only occurs in non-default configurations where quit bytes are used or + /// Unicode word boundaries are heuristically enabled. + /// + /// The fallible version of this routine is + /// [`try_find_overlapping_at`](Regex::try_find_overlapping_at). + pub fn find_overlapping_at( + &self, + haystack: &[u8], + start: usize, + end: usize, + state: &mut OverlappingState, + ) -> Option { + self.try_find_overlapping_at(haystack, start, end, state).unwrap() + } +} + +/// Fallible search routines. 
These may return an error when the underlying +/// DFAs have been configured in a way that permits them to fail during a +/// search. +/// +/// Errors during search only occur when the DFA has been explicitly +/// configured to do so, usually by specifying one or more "quit" bytes or by +/// heuristically enabling Unicode word boundaries. +/// +/// Errors will never be returned using the default configuration. So these +/// fallible routines are only needed for particular configurations. +impl Regex { + /// Returns true if and only if this regex matches the given haystack. + /// + /// This routine may short circuit if it knows that scanning future input + /// will never lead to a different result. In particular, if the underlying + /// DFA enters a match state or a dead state, then this routine will return + /// `true` or `false`, respectively, without inspecting any future input. + /// + /// # Errors + /// + /// This routine only errors if the search could not complete. For + /// DFA-based regexes, this only occurs in a non-default configuration + /// where quit bytes are used or Unicode word boundaries are heuristically + /// enabled. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// The infallible (panics on error) version of this routine is + /// [`is_match`](Regex::is_match). + pub fn try_is_match(&self, haystack: &[u8]) -> Result { + self.try_is_match_at(haystack, 0, haystack.len()) + } + + /// Returns the first position at which a match is found. + /// + /// This routine stops scanning input in precisely the same circumstances + /// as `is_match`. The key difference is that this routine returns the + /// position at which it stopped scanning input if and only if a match + /// was found. If no match is found, then `None` is returned. + /// + /// # Errors + /// + /// This routine only errors if the search could not complete. For + /// DFA-based regexes, this only occurs in a non-default configuration + /// where quit bytes are used or Unicode word boundaries are heuristically + /// enabled. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// The infallible (panics on error) version of this routine is + /// [`find_earliest`](Regex::find_earliest). + pub fn try_find_earliest( + &self, + haystack: &[u8], + ) -> Result, MatchError> { + self.try_find_earliest_at(haystack, 0, haystack.len()) + } + + /// Returns the start and end offset of the leftmost match. If no match + /// exists, then `None` is returned. + /// + /// # Errors + /// + /// This routine only errors if the search could not complete. For + /// DFA-based regexes, this only occurs in a non-default configuration + /// where quit bytes are used or Unicode word boundaries are heuristically + /// enabled. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// The infallible (panics on error) version of this routine is + /// [`find_leftmost`](Regex::find_leftmost). + pub fn try_find_leftmost( + &self, + haystack: &[u8], + ) -> Result, MatchError> { + self.try_find_leftmost_at(haystack, 0, haystack.len()) + } + + /// Search for the first overlapping match in `haystack`. + /// + /// This routine is principally useful when searching for multiple patterns + /// on inputs where multiple patterns may match the same regions of text. 
+ /// In particular, callers must preserve the automaton's search state from + /// prior calls so that the implementation knows where the last match + /// occurred and which pattern was reported. + /// + /// # Errors + /// + /// This routine only errors if the search could not complete. For + /// DFA-based regexes, this only occurs in a non-default configuration + /// where quit bytes are used or Unicode word boundaries are heuristically + /// enabled. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// The infallible (panics on error) version of this routine is + /// [`find_overlapping`](Regex::find_overlapping). + pub fn try_find_overlapping( + &self, + haystack: &[u8], + state: &mut OverlappingState, + ) -> Result, MatchError> { + self.try_find_overlapping_at(haystack, 0, haystack.len(), state) + } + + /// Returns an iterator over all non-overlapping "earliest" matches. + /// + /// Match positions are reported as soon as a match is known to occur, even + /// if the standard leftmost match would be longer. + /// + /// # Errors + /// + /// This iterator only yields errors if the search could not complete. For + /// DFA-based regexes, this only occurs in a non-default configuration + /// where quit bytes are used or Unicode word boundaries are heuristically + /// enabled. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// The infallible (panics on error) version of this routine is + /// [`find_earliest_iter`](Regex::find_earliest_iter). + pub fn try_find_earliest_iter<'r, 't>( + &'r self, + haystack: &'t [u8], + ) -> TryFindEarliestMatches<'r, 't, A, P> { + TryFindEarliestMatches::new(self, haystack) + } + + /// Returns an iterator over all non-overlapping leftmost matches in the + /// given bytes. If no match exists, then the iterator yields no elements. + /// + /// This corresponds to the "standard" regex search iterator. + /// + /// # Errors + /// + /// This iterator only yields errors if the search could not complete. For + /// DFA-based regexes, this only occurs in a non-default configuration + /// where quit bytes are used or Unicode word boundaries are heuristically + /// enabled. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// The infallible (panics on error) version of this routine is + /// [`find_leftmost_iter`](Regex::find_leftmost_iter). + pub fn try_find_leftmost_iter<'r, 't>( + &'r self, + haystack: &'t [u8], + ) -> TryFindLeftmostMatches<'r, 't, A, P> { + TryFindLeftmostMatches::new(self, haystack) + } + + /// Returns an iterator over all overlapping matches in the given haystack. + /// + /// This routine is principally useful when searching for multiple patterns + /// on inputs where multiple patterns may match the same regions of text. + /// The iterator takes care of handling the overlapping state that must be + /// threaded through every search. + /// + /// # Errors + /// + /// This iterator only yields errors if the search could not complete. For + /// DFA-based regexes, this only occurs in a non-default configuration + /// where quit bytes are used or Unicode word boundaries are heuristically + /// enabled. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// The infallible (panics on error) version of this routine is + /// [`find_overlapping_iter`](Regex::find_overlapping_iter). 
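A brief editorial sketch (not from the upstream docs) of consuming this fallible overlapping iterator. It reuses the two patterns, the haystack and the expected matches from the infallible `find_overlapping_iter` example earlier in this file; with a default configuration the error arm never fires, but a quit byte or a heuristically enabled Unicode word boundary could trigger it.

```rust
use regex_automata::{dfa::{self, regex::Regex}, MatchKind, MultiMatch};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let re = Regex::builder()
        .dense(dfa::dense::Config::new().match_kind(MatchKind::All))
        .build_many(&[r"\w+$", r"\S+$"])?;
    let haystack = "@foo".as_bytes();

    // Each item is a `Result`, so a failed search surfaces as a value
    // instead of a panic.
    let mut matches = vec![];
    for result in re.try_find_overlapping_iter(haystack) {
        match result {
            Ok(m) => matches.push(m),
            Err(err) => {
                return Err(format!("search could not complete: {}", err).into());
            }
        }
    }
    assert_eq!(matches, vec![
        MultiMatch::must(1, 0, 4),
        MultiMatch::must(0, 1, 4),
    ]);
    Ok(())
}
```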
+ pub fn try_find_overlapping_iter<'r, 't>( + &'r self, + haystack: &'t [u8], + ) -> TryFindOverlappingMatches<'r, 't, A, P> { + TryFindOverlappingMatches::new(self, haystack) + } +} + +/// Lower level fallible search routines that permit controlling where the +/// search starts and ends in a particular sequence. +impl Regex { + /// Returns true if and only if this regex matches the given haystack. + /// + /// This routine may short circuit if it knows that scanning future input + /// will never lead to a different result. In particular, if the underlying + /// DFA enters a match state or a dead state, then this routine will return + /// `true` or `false`, respectively, without inspecting any future input. + /// + /// # Searching a substring of the haystack + /// + /// Being an "at" search routine, this permits callers to search a + /// substring of `haystack` by specifying a range in `haystack`. + /// Why expose this as an API instead of just asking callers to use + /// `&input[start..end]`? The reason is that regex matching often wants + /// to take the surrounding context into account in order to handle + /// look-around (`^`, `$` and `\b`). + /// + /// # Errors + /// + /// This routine only errors if the search could not complete. For + /// DFA-based regexes, this only occurs in a non-default configuration + /// where quit bytes are used, Unicode word boundaries are heuristically + /// enabled or limits are set on the number of times the lazy DFA's cache + /// may be cleared. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// The infallible (panics on error) version of this routine is + /// [`is_match_at`](Regex::is_match_at). + pub fn try_is_match_at( + &self, + haystack: &[u8], + start: usize, + end: usize, + ) -> Result { + self.forward() + .find_earliest_fwd_at( + self.scanner().as_mut(), + None, + haystack, + start, + end, + ) + .map(|x| x.is_some()) + } + + /// Returns the first position at which a match is found. + /// + /// This routine stops scanning input in precisely the same circumstances + /// as `is_match`. The key difference is that this routine returns the + /// position at which it stopped scanning input if and only if a match + /// was found. If no match is found, then `None` is returned. + /// + /// # Searching a substring of the haystack + /// + /// Being an "at" search routine, this permits callers to search a + /// substring of `haystack` by specifying a range in `haystack`. + /// Why expose this as an API instead of just asking callers to use + /// `&input[start..end]`? The reason is that regex matching often wants + /// to take the surrounding context into account in order to handle + /// look-around (`^`, `$` and `\b`). + /// + /// This is useful when implementing an iterator over matches + /// within the same haystack, which cannot be done correctly by simply + /// providing a subslice of `haystack`. + /// + /// # Errors + /// + /// This routine only errors if the search could not complete. For + /// DFA-based regexes, this only occurs in a non-default configuration + /// where quit bytes are used or Unicode word boundaries are heuristically + /// enabled. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// The infallible (panics on error) version of this routine is + /// [`find_earliest_at`](Regex::find_earliest_at). 
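A small usage sketch (editorial, not upstream) of the lower level "at" routines, assuming only the API shown in this file. The `start`/`end` range restricts which bytes are searched, while the `find_*_at` variants report offsets relative to the full haystack; `map_err` to a `String` is used so the example relies only on `MatchError`'s `Display` impl.

```rust
use regex_automata::dfa::regex::Regex;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let re = Regex::new("[0-9]+")?;
    let haystack = b"abc 123 xyz";

    // Only bytes in the range 8..11 ("xyz") are considered, so no match.
    let matched = re
        .try_is_match_at(haystack, 8, 11)
        .map_err(|e| e.to_string())?;
    assert!(!matched);

    // Searching the range 4..7 ("123") does find a match.
    let matched = re
        .try_is_match_at(haystack, 4, 7)
        .map_err(|e| e.to_string())?;
    assert!(matched);
    Ok(())
}
```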
+ pub fn try_find_earliest_at( + &self, + haystack: &[u8], + start: usize, + end: usize, + ) -> Result, MatchError> { + self.try_find_earliest_at_imp( + self.scanner().as_mut(), + haystack, + start, + end, + ) + } + + /// The implementation of "earliest" searching, where a prefilter scanner + /// may be given. + fn try_find_earliest_at_imp( + &self, + pre: Option<&mut prefilter::Scanner>, + haystack: &[u8], + start: usize, + end: usize, + ) -> Result, MatchError> { + // N.B. We use `&&A` here to call `Automaton` methods, which ensures + // that we always use the `impl Automaton for &A` for calling methods. + // Since this is the usual way that automata are used, this helps + // reduce the number of monomorphized copies of the search code. + let (fwd, rev) = (self.forward(), self.reverse()); + let end = match (&fwd) + .find_earliest_fwd_at(pre, None, haystack, start, end)? + { + None => return Ok(None), + Some(end) => end, + }; + // N.B. The only time we need to tell the reverse searcher the pattern + // to match is in the overlapping case, since it's ambiguous. In the + // leftmost case, I have tentatively convinced myself that it isn't + // necessary and the reverse search will always find the same pattern + // to match as the forward search. But I lack a rigorous proof. + let start = (&rev) + .find_earliest_rev_at(None, haystack, start, end.offset())? + .expect("reverse search must match if forward search does"); + assert_eq!( + start.pattern(), + end.pattern(), + "forward and reverse search must match same pattern" + ); + assert!(start.offset() <= end.offset()); + Ok(Some(MultiMatch::new(end.pattern(), start.offset(), end.offset()))) + } + + /// Returns the start and end offset of the leftmost match. If no match + /// exists, then `None` is returned. + /// + /// # Searching a substring of the haystack + /// + /// Being an "at" search routine, this permits callers to search a + /// substring of `haystack` by specifying a range in `haystack`. + /// Why expose this as an API instead of just asking callers to use + /// `&input[start..end]`? The reason is that regex matching often wants + /// to take the surrounding context into account in order to handle + /// look-around (`^`, `$` and `\b`). + /// + /// This is useful when implementing an iterator over matches + /// within the same haystack, which cannot be done correctly by simply + /// providing a subslice of `haystack`. + /// + /// # Errors + /// + /// This routine only errors if the search could not complete. For + /// DFA-based regexes, this only occurs in a non-default configuration + /// where quit bytes are used or Unicode word boundaries are heuristically + /// enabled. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// The infallible (panics on error) version of this routine is + /// [`find_leftmost_at`](Regex::find_leftmost_at). + pub fn try_find_leftmost_at( + &self, + haystack: &[u8], + start: usize, + end: usize, + ) -> Result, MatchError> { + self.try_find_leftmost_at_imp( + self.scanner().as_mut(), + haystack, + start, + end, + ) + } + + /// The implementation of leftmost searching, where a prefilter scanner + /// may be given. + fn try_find_leftmost_at_imp( + &self, + scanner: Option<&mut prefilter::Scanner>, + haystack: &[u8], + start: usize, + end: usize, + ) -> Result, MatchError> { + // N.B. We use `&&A` here to call `Automaton` methods, which ensures + // that we always use the `impl Automaton for &A` for calling methods. 
+ // Since this is the usual way that automata are used, this helps + // reduce the number of monomorphized copies of the search code. + let (fwd, rev) = (self.forward(), self.reverse()); + let end = match (&fwd) + .find_leftmost_fwd_at(scanner, None, haystack, start, end)? + { + None => return Ok(None), + Some(end) => end, + }; + // N.B. The only time we need to tell the reverse searcher the pattern + // to match is in the overlapping case, since it's ambiguous. In the + // leftmost case, I have tentatively convinced myself that it isn't + // necessary and the reverse search will always find the same pattern + // to match as the forward search. But I lack a rigorous proof. Why not + // just provide the pattern anyway? Well, if it is needed, then leaving + // it out gives us a chance to find a witness. + let start = (&rev) + .find_leftmost_rev_at(None, haystack, start, end.offset())? + .expect("reverse search must match if forward search does"); + assert_eq!( + start.pattern(), + end.pattern(), + "forward and reverse search must match same pattern", + ); + assert!(start.offset() <= end.offset()); + Ok(Some(MultiMatch::new(end.pattern(), start.offset(), end.offset()))) + } + + /// Search for the first overlapping match within a given range of + /// `haystack`. + /// + /// This routine is principally useful when searching for multiple patterns + /// on inputs where multiple patterns may match the same regions of text. + /// In particular, callers must preserve the automaton's search state from + /// prior calls so that the implementation knows where the last match + /// occurred and which pattern was reported. + /// + /// # Searching a substring of the haystack + /// + /// Being an "at" search routine, this permits callers to search a + /// substring of `haystack` by specifying a range in `haystack`. + /// Why expose this as an API instead of just asking callers to use + /// `&input[start..end]`? The reason is that regex matching often wants + /// to take the surrounding context into account in order to handle + /// look-around (`^`, `$` and `\b`). + /// + /// This is useful when implementing an iterator over matches + /// within the same haystack, which cannot be done correctly by simply + /// providing a subslice of `haystack`. + /// + /// # Errors + /// + /// This routine only errors if the search could not complete. For + /// DFA-based regexes, this only occurs in a non-default configuration + /// where quit bytes are used or Unicode word boundaries are heuristically + /// enabled. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// The infallible (panics on error) version of this routine is + /// [`find_overlapping_at`](Regex::find_overlapping_at). + pub fn try_find_overlapping_at( + &self, + haystack: &[u8], + start: usize, + end: usize, + state: &mut OverlappingState, + ) -> Result, MatchError> { + self.try_find_overlapping_at_imp( + self.scanner().as_mut(), + haystack, + start, + end, + state, + ) + } + + /// The implementation of overlapping search at a given range in + /// `haystack`, where `scanner` is a prefilter (if active) and `state` is + /// the current state of the search. + fn try_find_overlapping_at_imp( + &self, + scanner: Option<&mut prefilter::Scanner>, + haystack: &[u8], + start: usize, + end: usize, + state: &mut OverlappingState, + ) -> Result, MatchError> { + // N.B. We use `&&A` here to call `Automaton` methods, which ensures + // that we always use the `impl Automaton for &A` for calling methods. 
+ // Since this is the usual way that automata are used, this helps + // reduce the number of monomorphized copies of the search code. + let (fwd, rev) = (self.forward(), self.reverse()); + // TODO: Decide whether it's worth making this assert work. It doesn't + // work currently because 'has_starts_for_each_pattern' isn't on the + // Automaton trait. Without this assert, we still get a panic, but it's + // a bit more inscrutable. + // assert!( + // rev.has_starts_for_each_pattern(), + // "overlapping searches require that the reverse DFA is \ + // compiled with the 'starts_for_each_pattern' option", + // ); + let end = match (&fwd).find_overlapping_fwd_at( + scanner, None, haystack, start, end, state, + )? { + None => return Ok(None), + Some(end) => end, + }; + // Unlike the leftmost cases, the reverse overlapping search may match + // a different pattern than the forward search. See test failures when + // using `None` instead of `Some(end.pattern())` below. Thus, we must + // run our reverse search using the pattern that matched in the forward + // direction. + let start = (&rev) + .find_leftmost_rev_at( + Some(end.pattern()), + haystack, + 0, + end.offset(), + )? + .expect("reverse search must match if forward search does"); + assert!(start.offset() <= end.offset()); + assert_eq!(start.pattern(), end.pattern()); + Ok(Some(MultiMatch::new(end.pattern(), start.offset(), end.offset()))) + } +} + +/// Non-search APIs for querying information about the regex and setting a +/// prefilter. +impl Regex { + /// Attach the given prefilter to this regex. + pub fn with_prefilter(self, prefilter: Q) -> Regex { + Regex { + prefilter: Some(prefilter), + forward: self.forward, + reverse: self.reverse, + utf8: self.utf8, + } + } + + /// Remove any prefilter from this regex. + pub fn without_prefilter(self) -> Regex { + Regex { + prefilter: None, + forward: self.forward, + reverse: self.reverse, + utf8: self.utf8, + } + } + + /// Return the underlying DFA responsible for forward matching. + /// + /// This is useful for accessing the underlying DFA and converting it to + /// some other format or size. See the [`Builder::build_from_dfas`] docs + /// for an example of where this might be useful. + pub fn forward(&self) -> &A { + &self.forward + } + + /// Return the underlying DFA responsible for reverse matching. + /// + /// This is useful for accessing the underlying DFA and converting it to + /// some other format or size. See the [`Builder::build_from_dfas`] docs + /// for an example of where this might be useful. + pub fn reverse(&self) -> &A { + &self.reverse + } + + /// Returns the total number of patterns matched by this regex. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{MultiMatch, dfa::regex::Regex}; + /// + /// let re = Regex::new_many(&[r"[a-z]+", r"[0-9]+", r"\w+"])?; + /// assert_eq!(3, re.pattern_count()); + /// # Ok::<(), Box>(()) + /// ``` + pub fn pattern_count(&self) -> usize { + assert_eq!( + self.forward().pattern_count(), + self.reverse().pattern_count() + ); + self.forward().pattern_count() + } + + /// Convenience function for returning this regex's prefilter as a trait + /// object. + /// + /// If this regex doesn't have a prefilter, then `None` is returned. + pub fn prefilter(&self) -> Option<&dyn Prefilter> { + match self.prefilter { + None => None, + Some(ref x) => Some(&*x), + } + } + + /// Convenience function for returning a prefilter scanner. 
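A short sketch (editorial) of the prefilter accessors above. It relies only on behavior visible in this file: `build_from_dfas` constructs regexes with `prefilter: None`, so a freshly built regex reports no prefilter, and `without_prefilter` leaves the underlying DFAs intact.

```rust
use regex_automata::dfa::regex::Regex;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // A freshly built regex has no prefilter attached.
    let re = Regex::new("foo[0-9]+")?;
    assert!(re.prefilter().is_none());

    // Removing a (non-existent) prefilter is a no-op; the forward and
    // reverse DFAs still drive the search.
    let re = re.without_prefilter();
    assert!(re.is_match(b"foo123"));
    assert_eq!(1, re.pattern_count());
    Ok(())
}
```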
+ fn scanner(&self) -> Option { + self.prefilter().map(prefilter::Scanner::new) + } +} + +/// An iterator over all non-overlapping earliest matches for a particular +/// infallible search. +/// +/// The iterator yields a [`MultiMatch`] value until no more matches could be +/// found. If the underlying search returns an error, then this panics. +/// +/// `A` is the type used to represent the underlying DFAs used by the regex, +/// while `P` is the type of prefilter used, if any. The lifetime variables are +/// as follows: +/// +/// * `'r` is the lifetime of the regular expression itself. +/// * `'t` is the lifetime of the text being searched. +#[derive(Clone, Debug)] +pub struct FindEarliestMatches<'r, 't, A, P>( + TryFindEarliestMatches<'r, 't, A, P>, +); + +impl<'r, 't, A: Automaton, P: Prefilter> FindEarliestMatches<'r, 't, A, P> { + fn new( + re: &'r Regex, + text: &'t [u8], + ) -> FindEarliestMatches<'r, 't, A, P> { + FindEarliestMatches(TryFindEarliestMatches::new(re, text)) + } +} + +impl<'r, 't, A: Automaton, P: Prefilter> Iterator + for FindEarliestMatches<'r, 't, A, P> +{ + type Item = MultiMatch; + + fn next(&mut self) -> Option { + next_unwrap(self.0.next()) + } +} + +/// An iterator over all non-overlapping leftmost matches for a particular +/// infallible search. +/// +/// The iterator yields a [`MultiMatch`] value until no more matches could be +/// found. If the underlying search returns an error, then this panics. +/// +/// `A` is the type used to represent the underlying DFAs used by the regex, +/// while `P` is the type of prefilter used, if any. The lifetime variables are +/// as follows: +/// +/// * `'r` is the lifetime of the regular expression itself. +/// * `'t` is the lifetime of the text being searched. +#[derive(Clone, Debug)] +pub struct FindLeftmostMatches<'r, 't, A, P>( + TryFindLeftmostMatches<'r, 't, A, P>, +); + +impl<'r, 't, A: Automaton, P: Prefilter> FindLeftmostMatches<'r, 't, A, P> { + fn new( + re: &'r Regex, + text: &'t [u8], + ) -> FindLeftmostMatches<'r, 't, A, P> { + FindLeftmostMatches(TryFindLeftmostMatches::new(re, text)) + } +} + +impl<'r, 't, A: Automaton, P: Prefilter> Iterator + for FindLeftmostMatches<'r, 't, A, P> +{ + type Item = MultiMatch; + + fn next(&mut self) -> Option { + next_unwrap(self.0.next()) + } +} + +/// An iterator over all overlapping matches for a particular infallible +/// search. +/// +/// The iterator yields a [`MultiMatch`] value until no more matches could be +/// found. If the underlying search returns an error, then this panics. +/// +/// `A` is the type used to represent the underlying DFAs used by the regex, +/// while `P` is the type of prefilter used, if any. The lifetime variables are +/// as follows: +/// +/// * `'r` is the lifetime of the regular expression itself. +/// * `'t` is the lifetime of the text being searched. +#[derive(Clone, Debug)] +pub struct FindOverlappingMatches<'r, 't, A: Automaton, P>( + TryFindOverlappingMatches<'r, 't, A, P>, +); + +impl<'r, 't, A: Automaton, P: Prefilter> FindOverlappingMatches<'r, 't, A, P> { + fn new( + re: &'r Regex, + text: &'t [u8], + ) -> FindOverlappingMatches<'r, 't, A, P> { + FindOverlappingMatches(TryFindOverlappingMatches::new(re, text)) + } +} + +impl<'r, 't, A: Automaton, P: Prefilter> Iterator + for FindOverlappingMatches<'r, 't, A, P> +{ + type Item = MultiMatch; + + fn next(&mut self) -> Option { + next_unwrap(self.0.next()) + } +} + +/// An iterator over all non-overlapping earliest matches for a particular +/// fallible search. 
+/// +/// The iterator yields a [`MultiMatch`] value until no more matches could be +/// found. +/// +/// `A` is the type used to represent the underlying DFAs used by the regex, +/// while `P` is the type of prefilter used, if any. The lifetime variables are +/// as follows: +/// +/// * `'r` is the lifetime of the regular expression itself. +/// * `'t` is the lifetime of the text being searched. +#[derive(Clone, Debug)] +pub struct TryFindEarliestMatches<'r, 't, A, P> { + re: &'r Regex, + scanner: Option>, + text: &'t [u8], + last_end: usize, + last_match: Option, +} + +impl<'r, 't, A: Automaton, P: Prefilter> TryFindEarliestMatches<'r, 't, A, P> { + fn new( + re: &'r Regex, + text: &'t [u8], + ) -> TryFindEarliestMatches<'r, 't, A, P> { + let scanner = re.scanner(); + TryFindEarliestMatches { + re, + scanner, + text, + last_end: 0, + last_match: None, + } + } +} + +impl<'r, 't, A: Automaton, P: Prefilter> Iterator + for TryFindEarliestMatches<'r, 't, A, P> +{ + type Item = Result; + + fn next(&mut self) -> Option> { + if self.last_end > self.text.len() { + return None; + } + let result = self.re.try_find_earliest_at_imp( + self.scanner.as_mut(), + self.text, + self.last_end, + self.text.len(), + ); + let m = match result { + Err(err) => return Some(Err(err)), + Ok(None) => return None, + Ok(Some(m)) => m, + }; + if m.is_empty() { + // This is an empty match. To ensure we make progress, start + // the next search at the smallest possible starting position + // of the next match following this one. + self.last_end = if self.re.utf8 { + crate::util::next_utf8(self.text, m.end()) + } else { + m.end() + 1 + }; + // Don't accept empty matches immediately following a match. + // Just move on to the next match. + if Some(m.end()) == self.last_match { + return self.next(); + } + } else { + self.last_end = m.end(); + } + self.last_match = Some(m.end()); + Some(Ok(m)) + } +} + +/// An iterator over all non-overlapping leftmost matches for a particular +/// fallible search. +/// +/// The iterator yields a [`MultiMatch`] value until no more matches could be +/// found. +/// +/// `A` is the type used to represent the underlying DFAs used by the regex, +/// while `P` is the type of prefilter used, if any. The lifetime variables are +/// as follows: +/// +/// * `'r` is the lifetime of the regular expression itself. +/// * `'t` is the lifetime of the text being searched. +#[derive(Clone, Debug)] +pub struct TryFindLeftmostMatches<'r, 't, A, P> { + re: &'r Regex, + scanner: Option>, + text: &'t [u8], + last_end: usize, + last_match: Option, +} + +impl<'r, 't, A: Automaton, P: Prefilter> TryFindLeftmostMatches<'r, 't, A, P> { + fn new( + re: &'r Regex, + text: &'t [u8], + ) -> TryFindLeftmostMatches<'r, 't, A, P> { + let scanner = re.scanner(); + TryFindLeftmostMatches { + re, + scanner, + text, + last_end: 0, + last_match: None, + } + } +} + +impl<'r, 't, A: Automaton, P: Prefilter> Iterator + for TryFindLeftmostMatches<'r, 't, A, P> +{ + type Item = Result; + + fn next(&mut self) -> Option> { + if self.last_end > self.text.len() { + return None; + } + let result = self.re.try_find_leftmost_at_imp( + self.scanner.as_mut(), + self.text, + self.last_end, + self.text.len(), + ); + let m = match result { + Err(err) => return Some(Err(err)), + Ok(None) => return None, + Ok(Some(m)) => m, + }; + if m.is_empty() { + // This is an empty match. To ensure we make progress, start + // the next search at the smallest possible starting position + // of the next match following this one. 
+ self.last_end = if self.re.utf8 { + crate::util::next_utf8(self.text, m.end()) + } else { + m.end() + 1 + }; + // Don't accept empty matches immediately following a match. + // Just move on to the next match. + if Some(m.end()) == self.last_match { + return self.next(); + } + } else { + self.last_end = m.end(); + } + self.last_match = Some(m.end()); + Some(Ok(m)) + } +} + +/// An iterator over all overlapping matches for a particular fallible search. +/// +/// The iterator yields a [`MultiMatch`] value until no more matches could be +/// found. +/// +/// `A` is the type used to represent the underlying DFAs used by the regex, +/// while `P` is the type of prefilter used, if any. The lifetime variables are +/// as follows: +/// +/// * `'r` is the lifetime of the regular expression itself. +/// * `'t` is the lifetime of the text being searched. +#[derive(Clone, Debug)] +pub struct TryFindOverlappingMatches<'r, 't, A: Automaton, P> { + re: &'r Regex, + scanner: Option>, + text: &'t [u8], + last_end: usize, + state: OverlappingState, +} + +impl<'r, 't, A: Automaton, P: Prefilter> + TryFindOverlappingMatches<'r, 't, A, P> +{ + fn new( + re: &'r Regex, + text: &'t [u8], + ) -> TryFindOverlappingMatches<'r, 't, A, P> { + let scanner = re.scanner(); + TryFindOverlappingMatches { + re, + scanner, + text, + last_end: 0, + state: OverlappingState::start(), + } + } +} + +impl<'r, 't, A: Automaton, P: Prefilter> Iterator + for TryFindOverlappingMatches<'r, 't, A, P> +{ + type Item = Result; + + fn next(&mut self) -> Option> { + if self.last_end > self.text.len() { + return None; + } + let result = self.re.try_find_overlapping_at_imp( + self.scanner.as_mut(), + self.text, + self.last_end, + self.text.len(), + &mut self.state, + ); + let m = match result { + Err(err) => return Some(Err(err)), + Ok(None) => return None, + Ok(Some(m)) => m, + }; + // Unlike the non-overlapping case, we're OK with empty matches at this + // level. In particular, the overlapping search algorithm is itself + // responsible for ensuring that progress is always made. + self.last_end = m.end(); + Some(Ok(m)) + } +} + +/// The configuration used for compiling a DFA-backed regex. +/// +/// A regex configuration is a simple data object that is typically used with +/// [`Builder::configure`]. +#[cfg(feature = "alloc")] +#[derive(Clone, Copy, Debug, Default)] +pub struct Config { + utf8: Option, +} + +#[cfg(feature = "alloc")] +impl Config { + /// Return a new default regex compiler configuration. + pub fn new() -> Config { + Config::default() + } + + /// Whether to enable UTF-8 mode or not. + /// + /// When UTF-8 mode is enabled (the default) and an empty match is seen, + /// the iterators on [`Regex`] will always start the next search at the + /// next UTF-8 encoded codepoint when searching valid UTF-8. When UTF-8 + /// mode is disabled, such searches are begun at the next byte offset. + /// + /// If this mode is enabled and invalid UTF-8 is given to search, then + /// behavior is unspecified. + /// + /// Generally speaking, one should enable this when + /// [`SyntaxConfig::utf8`](crate::SyntaxConfig::utf8) + /// and + /// [`thompson::Config::utf8`](crate::nfa::thompson::Config::utf8) + /// are enabled, and disable it otherwise. + /// + /// # Example + /// + /// This example demonstrates the differences between when this option is + /// enabled and disabled. The differences only arise when the regex can + /// return matches of length zero. + /// + /// In this first snippet, we show the results when UTF-8 mode is disabled. 
+ /// + /// ``` + /// use regex_automata::{dfa::regex::Regex, MultiMatch}; + /// + /// let re = Regex::builder() + /// .configure(Regex::config().utf8(false)) + /// .build(r"")?; + /// let haystack = "a☃z".as_bytes(); + /// let mut it = re.find_leftmost_iter(haystack); + /// assert_eq!(Some(MultiMatch::must(0, 0, 0)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 1, 1)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 2, 2)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 3, 3)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 4, 4)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 5, 5)), it.next()); + /// assert_eq!(None, it.next()); + /// + /// # Ok::<(), Box>(()) + /// ``` + /// + /// And in this snippet, we execute the same search on the same haystack, + /// but with UTF-8 mode enabled. Notice that byte offsets that would + /// otherwise split the encoding of `☃` are not returned. + /// + /// ``` + /// use regex_automata::{dfa::regex::Regex, MultiMatch}; + /// + /// let re = Regex::builder() + /// .configure(Regex::config().utf8(true)) + /// .build(r"")?; + /// let haystack = "a☃z".as_bytes(); + /// let mut it = re.find_leftmost_iter(haystack); + /// assert_eq!(Some(MultiMatch::must(0, 0, 0)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 1, 1)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 4, 4)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 5, 5)), it.next()); + /// assert_eq!(None, it.next()); + /// + /// # Ok::<(), Box>(()) + /// ``` + pub fn utf8(mut self, yes: bool) -> Config { + self.utf8 = Some(yes); + self + } + + /// Returns true if and only if this configuration has UTF-8 mode enabled. + /// + /// When UTF-8 mode is enabled and an empty match is seen, the iterators on + /// [`Regex`] will always start the next search at the next UTF-8 encoded + /// codepoint. When UTF-8 mode is disabled, such searches are begun at the + /// next byte offset. + pub fn get_utf8(&self) -> bool { + self.utf8.unwrap_or(true) + } + + /// Overwrite the default configuration such that the options in `o` are + /// always used. If an option in `o` is not set, then the corresponding + /// option in `self` is used. If it's not set in `self` either, then it + /// remains not set. + pub(crate) fn overwrite(self, o: Config) -> Config { + Config { utf8: o.utf8.or(self.utf8) } + } +} + +/// A builder for a regex based on deterministic finite automatons. +/// +/// This builder permits configuring options for the syntax of a pattern, the +/// NFA construction, the DFA construction and finally the regex searching +/// itself. This builder is different from a general purpose regex builder in +/// that it permits fine grain configuration of the construction process. The +/// trade off for this is complexity, and the possibility of setting a +/// configuration that might not make sense. For example, there are three +/// different UTF-8 modes: +/// +/// * [`SyntaxConfig::utf8`](crate::SyntaxConfig::utf8) controls whether the +/// pattern itself can contain sub-expressions that match invalid UTF-8. +/// * [`nfa::thompson::Config::utf8`](crate::nfa::thompson::Config::utf8) +/// controls whether the implicit unanchored prefix added to the NFA can +/// match through invalid UTF-8 or not. +/// * [`Config::utf8`] controls how the regex iterators themselves advance +/// the starting position of the next search when a match with zero length is +/// found. +/// +/// Generally speaking, callers will want to either enable all of these or +/// disable all of these. 
+/// +/// Internally, building a regex requires building two DFAs, where one is +/// responsible for finding the end of a match and the other is responsible +/// for finding the start of a match. If you only need to detect whether +/// something matched, or only the end of a match, then you should use a +/// [`dense::Builder`] to construct a single DFA, which is cheaper than +/// building two DFAs. +/// +/// # Build methods +/// +/// This builder has a few "build" methods. In general, it's the result of +/// combining the following parameters: +/// +/// * Building one or many regexes. +/// * Building a regex with dense or sparse DFAs. +/// +/// The simplest "build" method is [`Builder::build`]. It accepts a single +/// pattern and builds a dense DFA using `usize` for the state identifier +/// representation. +/// +/// The most general "build" method is [`Builder::build_many`], which permits +/// building a regex that searches for multiple patterns simultaneously while +/// using a specific state identifier representation. +/// +/// The most flexible "build" method, but hardest to use, is +/// [`Builder::build_from_dfas`]. This exposes the fact that a [`Regex`] is +/// just a pair of DFAs, and this method allows you to specify those DFAs +/// exactly. +/// +/// # Example +/// +/// This example shows how to disable UTF-8 mode in the syntax, the NFA and +/// the regex itself. This is generally what you want for matching on +/// arbitrary bytes. +/// +/// ``` +/// use regex_automata::{ +/// dfa::regex::Regex, nfa::thompson, MultiMatch, SyntaxConfig +/// }; +/// +/// let re = Regex::builder() +/// .configure(Regex::config().utf8(false)) +/// .syntax(SyntaxConfig::new().utf8(false)) +/// .thompson(thompson::Config::new().utf8(false)) +/// .build(r"foo(?-u:[^b])ar.*")?; +/// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n"; +/// let expected = Some(MultiMatch::must(0, 1, 9)); +/// let got = re.find_leftmost(haystack); +/// assert_eq!(expected, got); +/// // Notice that `(?-u:[^b])` matches invalid UTF-8, +/// // but the subsequent `.*` does not! Disabling UTF-8 +/// // on the syntax permits this. Notice also that the +/// // search was unanchored and skipped over invalid UTF-8. +/// // Disabling UTF-8 on the Thompson NFA permits this. +/// // +/// // N.B. This example does not show the impact of +/// // disabling UTF-8 mode on Config, since that +/// // only impacts regexes that can produce matches of +/// // length 0. +/// assert_eq!(b"foo\xFFarzz", &haystack[got.unwrap().range()]); +/// +/// # Ok::<(), Box>(()) +/// ``` +#[cfg(feature = "alloc")] +#[derive(Clone, Debug)] +pub struct Builder { + config: Config, + dfa: dense::Builder, +} + +#[cfg(feature = "alloc")] +impl Builder { + /// Create a new regex builder with the default configuration. + pub fn new() -> Builder { + Builder { config: Config::default(), dfa: dense::Builder::new() } + } + + /// Build a regex from the given pattern. + /// + /// If there was a problem parsing or compiling the pattern, then an error + /// is returned. + pub fn build(&self, pattern: &str) -> Result { + self.build_many(&[pattern]) + } + + /// Build a regex from the given pattern using sparse DFAs. + /// + /// If there was a problem parsing or compiling the pattern, then an error + /// is returned. + pub fn build_sparse( + &self, + pattern: &str, + ) -> Result>>, Error> { + self.build_many_sparse(&[pattern]) + } + + /// Build a regex from the given patterns. 
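Since `build_many` has no example of its own here, a brief editorial sketch follows; it assumes only the builder and `MultiMatch` API already used elsewhere in this file. The first argument to `MultiMatch::must` identifies which of the supplied patterns matched.

```rust
use regex_automata::{dfa::regex::Regex, MultiMatch};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let re = Regex::builder().build_many(&[r"[a-z]+", r"[0-9]+"])?;
    let haystack = b"abc 123";

    let matches: Vec<MultiMatch> = re.find_leftmost_iter(haystack).collect();
    assert_eq!(matches, vec![
        MultiMatch::must(0, 0, 3), // pattern 0: "abc"
        MultiMatch::must(1, 4, 7), // pattern 1: "123"
    ]);
    Ok(())
}
```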
+ pub fn build_many>( + &self, + patterns: &[P], + ) -> Result { + let forward = self.dfa.build_many(patterns)?; + let reverse = self + .dfa + .clone() + .configure( + dense::Config::new() + .anchored(true) + .match_kind(MatchKind::All) + .starts_for_each_pattern(true), + ) + .thompson(thompson::Config::new().reverse(true)) + .build_many(patterns)?; + Ok(self.build_from_dfas(forward, reverse)) + } + + /// Build a sparse regex from the given patterns. + pub fn build_many_sparse>( + &self, + patterns: &[P], + ) -> Result>>, Error> { + let re = self.build_many(patterns)?; + let forward = re.forward().to_sparse()?; + let reverse = re.reverse().to_sparse()?; + Ok(self.build_from_dfas(forward, reverse)) + } + + /// Build a regex from its component forward and reverse DFAs. + /// + /// This is useful when deserializing a regex from some arbitrary + /// memory region. This is also useful for building regexes from other + /// types of DFAs. + /// + /// If you're building the DFAs from scratch instead of building new DFAs + /// from other DFAs, then you'll need to make sure that the reverse DFA is + /// configured correctly to match the intended semantics. Namely: + /// + /// * It should be anchored. + /// * It should use [`MatchKind::All`] semantics. + /// * It should match in reverse. + /// * It should have anchored start states compiled for each pattern. + /// * Otherwise, its configuration should match the forward DFA. + /// + /// If these conditions are satisfied, then behavior of searches is + /// unspecified. + /// + /// Note that when using this constructor, only the configuration from + /// [`Config`] is applied. The only configuration settings on this builder + /// only apply when the builder owns the construction of the DFAs + /// themselves. + /// + /// # Example + /// + /// This example is a bit a contrived. The usual use of these methods + /// would involve serializing `initial_re` somewhere and then deserializing + /// it later to build a regex. But in this case, we do everything in + /// memory. + /// + /// ``` + /// use regex_automata::dfa::regex::Regex; + /// + /// let initial_re = Regex::new("foo[0-9]+")?; + /// assert_eq!(true, initial_re.is_match(b"foo123")); + /// + /// let (fwd, rev) = (initial_re.forward(), initial_re.reverse()); + /// let re = Regex::builder().build_from_dfas(fwd, rev); + /// assert_eq!(true, re.is_match(b"foo123")); + /// # Ok::<(), Box>(()) + /// ``` + /// + /// This example shows how to build a `Regex` that uses sparse DFAs instead + /// of dense DFAs without using one of the convenience `build_sparse` + /// routines: + /// + /// ``` + /// use regex_automata::dfa::regex::Regex; + /// + /// let initial_re = Regex::new("foo[0-9]+")?; + /// assert_eq!(true, initial_re.is_match(b"foo123")); + /// + /// let fwd = initial_re.forward().to_sparse()?; + /// let rev = initial_re.reverse().to_sparse()?; + /// let re = Regex::builder().build_from_dfas(fwd, rev); + /// assert_eq!(true, re.is_match(b"foo123")); + /// # Ok::<(), Box>(()) + /// ``` + pub fn build_from_dfas( + &self, + forward: A, + reverse: A, + ) -> Regex { + let utf8 = self.config.get_utf8(); + Regex { prefilter: None, forward, reverse, utf8 } + } + + /// Apply the given regex configuration options to this builder. + pub fn configure(&mut self, config: Config) -> &mut Builder { + self.config = self.config.overwrite(config); + self + } + + /// Set the syntax configuration for this builder using + /// [`SyntaxConfig`](crate::SyntaxConfig). 
+ /// + /// This permits setting things like case insensitivity, Unicode and multi + /// line mode. + pub fn syntax( + &mut self, + config: crate::util::syntax::SyntaxConfig, + ) -> &mut Builder { + self.dfa.syntax(config); + self + } + + /// Set the Thompson NFA configuration for this builder using + /// [`nfa::thompson::Config`](thompson::Config). + /// + /// This permits setting things like whether additional time should be + /// spent shrinking the size of the NFA. + pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder { + self.dfa.thompson(config); + self + } + + /// Set the dense DFA compilation configuration for this builder using + /// [`dense::Config`](dense::Config). + /// + /// This permits setting things like whether the underlying DFAs should + /// be minimized. + pub fn dense(&mut self, config: dense::Config) -> &mut Builder { + self.dfa.configure(config); + self + } +} + +#[cfg(feature = "alloc")] +impl Default for Builder { + fn default() -> Builder { + Builder::new() + } +} + +#[inline(always)] +fn next_unwrap( + item: Option>, +) -> Option { + match item { + None => None, + Some(Ok(m)) => Some(m), + Some(Err(err)) => panic!( + "unexpected regex search error: {}\n\ + to handle search errors, use try_ methods", + err, + ), + } +} diff --git a/vendor/regex-automata/src/dfa/search.rs b/vendor/regex-automata/src/dfa/search.rs new file mode 100644 index 000000000..492414981 --- /dev/null +++ b/vendor/regex-automata/src/dfa/search.rs @@ -0,0 +1,493 @@ +use crate::{ + dfa::{ + accel, + automaton::{Automaton, OverlappingState, StateMatch}, + }, + util::{ + id::{PatternID, StateID}, + matchtypes::HalfMatch, + prefilter, MATCH_OFFSET, + }, + MatchError, +}; + +#[inline(never)] +pub fn find_earliest_fwd( + pre: Option<&mut prefilter::Scanner>, + dfa: &A, + pattern_id: Option, + bytes: &[u8], + start: usize, + end: usize, +) -> Result, MatchError> { + // Searching with a pattern ID is always anchored, so we should never use + // a prefilter. + if pre.is_some() && pattern_id.is_none() { + find_fwd(pre, true, dfa, pattern_id, bytes, start, end) + } else { + find_fwd(None, true, dfa, pattern_id, bytes, start, end) + } +} + +#[inline(never)] +pub fn find_leftmost_fwd( + pre: Option<&mut prefilter::Scanner>, + dfa: &A, + pattern_id: Option, + bytes: &[u8], + start: usize, + end: usize, +) -> Result, MatchError> { + // Searching with a pattern ID is always anchored, so we should never use + // a prefilter. + if pre.is_some() && pattern_id.is_none() { + find_fwd(pre, false, dfa, pattern_id, bytes, start, end) + } else { + find_fwd(None, false, dfa, pattern_id, bytes, start, end) + } +} + +/// This is marked as `inline(always)` specifically because it supports +/// multiple modes of searching. Namely, the 'pre' and 'earliest' parameters +/// getting inlined eliminate some critical branches. To avoid bloating binary +/// size, we only call this function in a fixed number of places. +#[inline(always)] +fn find_fwd( + mut pre: Option<&mut prefilter::Scanner>, + earliest: bool, + dfa: &A, + pattern_id: Option, + haystack: &[u8], + start: usize, + end: usize, +) -> Result, MatchError> { + assert!(start <= end); + assert!(start <= haystack.len()); + assert!(end <= haystack.len()); + + // Why do this? This lets 'bytes[at]' work without bounds checks below. + // It seems the assert on 'end <= haystack.len()' above is otherwise + // not enough. Why not just make 'bytes' scoped this way anyway? 
Well, + // 'eoi_fwd' (below) might actually want to try to access the byte at 'end' + // for resolving look-ahead. + let bytes = &haystack[..end]; + + let mut state = init_fwd(dfa, pattern_id, haystack, start, end)?; + let mut last_match = None; + let mut at = start; + if let Some(ref mut pre) = pre { + // If a prefilter doesn't report false positives, then we don't need to + // touch the DFA at all. However, since all matches include the pattern + // ID, and the prefilter infrastructure doesn't report pattern IDs, we + // limit this optimization to cases where there is exactly one pattern. + // In that case, any match must be the 0th pattern. + if dfa.pattern_count() == 1 && !pre.reports_false_positives() { + return Ok(pre.next_candidate(bytes, at).into_option().map( + |offset| HalfMatch { pattern: PatternID::ZERO, offset }, + )); + } else if pre.is_effective(at) { + match pre.next_candidate(bytes, at).into_option() { + None => return Ok(None), + Some(i) => { + at = i; + } + } + } + } + while at < end { + let byte = bytes[at]; + state = dfa.next_state(state, byte); + at += 1; + if dfa.is_special_state(state) { + if dfa.is_start_state(state) { + if let Some(ref mut pre) = pre { + if pre.is_effective(at) { + match pre.next_candidate(bytes, at).into_option() { + None => return Ok(None), + Some(i) => { + at = i; + } + } + } + } else if dfa.is_accel_state(state) { + let needles = dfa.accelerator(state); + at = accel::find_fwd(needles, bytes, at) + .unwrap_or(bytes.len()); + } + } else if dfa.is_match_state(state) { + last_match = Some(HalfMatch { + pattern: dfa.match_pattern(state, 0), + offset: at - MATCH_OFFSET, + }); + if earliest { + return Ok(last_match); + } + if dfa.is_accel_state(state) { + let needles = dfa.accelerator(state); + at = accel::find_fwd(needles, bytes, at) + .unwrap_or(bytes.len()); + } + } else if dfa.is_accel_state(state) { + let needs = dfa.accelerator(state); + at = accel::find_fwd(needs, bytes, at).unwrap_or(bytes.len()); + } else if dfa.is_dead_state(state) { + return Ok(last_match); + } else { + debug_assert!(dfa.is_quit_state(state)); + if last_match.is_some() { + return Ok(last_match); + } + return Err(MatchError::Quit { byte, offset: at - 1 }); + } + } + while at < end && dfa.next_state(state, bytes[at]) == state { + at += 1; + } + } + Ok(eoi_fwd(dfa, haystack, end, &mut state)?.or(last_match)) +} + +#[inline(never)] +pub fn find_earliest_rev( + dfa: &A, + pattern_id: Option, + bytes: &[u8], + start: usize, + end: usize, +) -> Result, MatchError> { + find_rev(true, dfa, pattern_id, bytes, start, end) +} + +#[inline(never)] +pub fn find_leftmost_rev( + dfa: &A, + pattern_id: Option, + bytes: &[u8], + start: usize, + end: usize, +) -> Result, MatchError> { + find_rev(false, dfa, pattern_id, bytes, start, end) +} + +/// This is marked as `inline(always)` specifically because it supports +/// multiple modes of searching. Namely, the 'earliest' boolean getting inlined +/// permits eliminating a few crucial branches. 
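The `inline(always)` rationale above describes a common specialization pattern: one always-inlined worker takes compile-time-constant flags, and a fixed set of `inline(never)` entry points instantiate it, so each copy has the flag branches folded away without duplicating code at every call site. A generic, hedged sketch of that pattern (not taken from this crate):

```rust
// Always-inlined worker: `earliest` becomes a constant in each caller,
// so the `if earliest` branch disappears after inlining.
#[inline(always)]
fn find_byte(haystack: &[u8], needle: u8, earliest: bool) -> Option<usize> {
    let mut last = None;
    for (i, &b) in haystack.iter().enumerate() {
        if b == needle {
            last = Some(i);
            if earliest {
                return last;
            }
        }
    }
    last
}

// A fixed number of non-inlined entry points keeps binary size in check.
#[inline(never)]
fn find_byte_earliest(haystack: &[u8], needle: u8) -> Option<usize> {
    find_byte(haystack, needle, true)
}

#[inline(never)]
fn find_byte_last(haystack: &[u8], needle: u8) -> Option<usize> {
    find_byte(haystack, needle, false)
}

fn main() {
    assert_eq!(Some(1), find_byte_earliest(b"abcb", b'b'));
    assert_eq!(Some(3), find_byte_last(b"abcb", b'b'));
}
```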
+#[inline(always)] +fn find_rev( + earliest: bool, + dfa: &A, + pattern_id: Option, + bytes: &[u8], + start: usize, + end: usize, +) -> Result, MatchError> { + assert!(start <= end); + assert!(start <= bytes.len()); + assert!(end <= bytes.len()); + + let mut state = init_rev(dfa, pattern_id, bytes, start, end)?; + let mut last_match = None; + let mut at = end; + while at > start { + at -= 1; + while at > start && dfa.next_state(state, bytes[at]) == state { + at -= 1; + } + + let byte = bytes[at]; + state = dfa.next_state(state, byte); + if dfa.is_special_state(state) { + if dfa.is_start_state(state) { + if dfa.is_accel_state(state) { + let needles = dfa.accelerator(state); + at = accel::find_rev(needles, bytes, at) + .map(|i| i + 1) + .unwrap_or(0); + } + } else if dfa.is_match_state(state) { + last_match = Some(HalfMatch { + pattern: dfa.match_pattern(state, 0), + offset: at + MATCH_OFFSET, + }); + if earliest { + return Ok(last_match); + } + if dfa.is_accel_state(state) { + let needles = dfa.accelerator(state); + at = accel::find_rev(needles, bytes, at) + .map(|i| i + 1) + .unwrap_or(0); + } + } else if dfa.is_accel_state(state) { + let needles = dfa.accelerator(state); + at = accel::find_rev(needles, bytes, at) + .map(|i| i + 1) + .unwrap_or(0); + } else if dfa.is_dead_state(state) { + return Ok(last_match); + } else { + debug_assert!(dfa.is_quit_state(state)); + if last_match.is_some() { + return Ok(last_match); + } + return Err(MatchError::Quit { byte, offset: at }); + } + } + } + Ok(eoi_rev(dfa, bytes, start, state)?.or(last_match)) +} + +#[inline(never)] +pub fn find_overlapping_fwd( + pre: Option<&mut prefilter::Scanner>, + dfa: &A, + pattern_id: Option, + bytes: &[u8], + start: usize, + end: usize, + caller_state: &mut OverlappingState, +) -> Result, MatchError> { + // Searching with a pattern ID is always anchored, so we should only ever + // use a prefilter when no pattern ID is given. + if pre.is_some() && pattern_id.is_none() { + find_overlapping_fwd_imp( + pre, + dfa, + pattern_id, + bytes, + start, + end, + caller_state, + ) + } else { + find_overlapping_fwd_imp( + None, + dfa, + pattern_id, + bytes, + start, + end, + caller_state, + ) + } +} + +/// This is marked as `inline(always)` specifically because it supports +/// multiple modes of searching. Namely, the 'pre' prefilter getting inlined +/// permits eliminating a few crucial branches and reduces code size when it is +/// not used. +#[inline(always)] +fn find_overlapping_fwd_imp( + mut pre: Option<&mut prefilter::Scanner>, + dfa: &A, + pattern_id: Option, + bytes: &[u8], + mut start: usize, + end: usize, + caller_state: &mut OverlappingState, +) -> Result, MatchError> { + assert!(start <= end); + assert!(start <= bytes.len()); + assert!(end <= bytes.len()); + + let mut state = match caller_state.id() { + None => init_fwd(dfa, pattern_id, bytes, start, end)?, + Some(id) => { + if let Some(last) = caller_state.last_match() { + let match_count = dfa.match_count(id); + if last.match_index < match_count { + let m = HalfMatch { + pattern: dfa.match_pattern(id, last.match_index), + offset: last.offset, + }; + last.match_index += 1; + return Ok(Some(m)); + } + } + + // This is a subtle but critical detail. If the caller provides a + // non-None state ID, then it must be the case that the state ID + // corresponds to one set by this function. The state ID therefore + // corresponds to a match state, a dead state or some other state. 
+ // However, "some other" state _only_ occurs when the input has + // been exhausted because the only way to stop before then is to + // see a match or a dead/quit state. + // + // If the input is exhausted or if it's a dead state, then + // incrementing the starting position has no relevance on + // correctness, since the loop below will either not execute + // at all or will immediately stop due to being in a dead state. + // (Once in a dead state it is impossible to leave it.) + // + // Therefore, the only case we need to consider is when + // caller_state is a match state. In this case, since our machines + // support the ability to delay a match by a certain number of + // bytes (to support look-around), it follows that we actually + // consumed that many additional bytes on our previous search. When + // the caller resumes their search to find subsequent matches, they + // will use the ending location from the previous match as the next + // starting point, which is `MATCH_OFFSET` bytes PRIOR to where + // we scanned to on the previous search. Therefore, we need to + // compensate by bumping `start` up by `MATCH_OFFSET` bytes. + // + // Incidentally, since MATCH_OFFSET is non-zero, this also makes + // dealing with empty matches convenient. Namely, callers needn't + // special case them when implementing an iterator. Instead, this + // ensures that forward progress is always made. + start += MATCH_OFFSET; + id + } + }; + + let mut at = start; + while at < end { + let byte = bytes[at]; + state = dfa.next_state(state, byte); + at += 1; + if dfa.is_special_state(state) { + caller_state.set_id(state); + if dfa.is_start_state(state) { + if let Some(ref mut pre) = pre { + if pre.is_effective(at) { + match pre.next_candidate(bytes, at).into_option() { + None => return Ok(None), + Some(i) => { + at = i; + } + } + } + } else if dfa.is_accel_state(state) { + let needles = dfa.accelerator(state); + at = accel::find_fwd(needles, bytes, at) + .unwrap_or(bytes.len()); + } + } else if dfa.is_match_state(state) { + let offset = at - MATCH_OFFSET; + caller_state + .set_last_match(StateMatch { match_index: 1, offset }); + return Ok(Some(HalfMatch { + pattern: dfa.match_pattern(state, 0), + offset, + })); + } else if dfa.is_accel_state(state) { + let needs = dfa.accelerator(state); + at = accel::find_fwd(needs, bytes, at).unwrap_or(bytes.len()); + } else if dfa.is_dead_state(state) { + return Ok(None); + } else { + debug_assert!(dfa.is_quit_state(state)); + return Err(MatchError::Quit { byte, offset: at - 1 }); + } + } + } + + let result = eoi_fwd(dfa, bytes, end, &mut state); + caller_state.set_id(state); + if let Ok(Some(ref last_match)) = result { + caller_state.set_last_match(StateMatch { + match_index: 1, + offset: last_match.offset(), + }); + } + result +} + +fn init_fwd( + dfa: &A, + pattern_id: Option, + bytes: &[u8], + start: usize, + end: usize, +) -> Result { + let state = dfa.start_state_forward(pattern_id, bytes, start, end); + // Start states can never be match states, since all matches are delayed + // by 1 byte. + assert!(!dfa.is_match_state(state)); + Ok(state) +} + +fn init_rev( + dfa: &A, + pattern_id: Option, + bytes: &[u8], + start: usize, + end: usize, +) -> Result { + let state = dfa.start_state_reverse(pattern_id, bytes, start, end); + // Start states can never be match states, since all matches are delayed + // by 1 byte. 
+ assert!(!dfa.is_match_state(state)); + Ok(state) +} + +fn eoi_fwd( + dfa: &A, + bytes: &[u8], + end: usize, + state: &mut StateID, +) -> Result, MatchError> { + match bytes.get(end) { + Some(&b) => { + *state = dfa.next_state(*state, b); + if dfa.is_match_state(*state) { + Ok(Some(HalfMatch { + pattern: dfa.match_pattern(*state, 0), + offset: end, + })) + } else { + Ok(None) + } + } + None => { + *state = dfa.next_eoi_state(*state); + if dfa.is_match_state(*state) { + Ok(Some(HalfMatch { + pattern: dfa.match_pattern(*state, 0), + offset: bytes.len(), + })) + } else { + Ok(None) + } + } + } +} + +fn eoi_rev( + dfa: &A, + bytes: &[u8], + start: usize, + state: StateID, +) -> Result, MatchError> { + if start > 0 { + let state = dfa.next_state(state, bytes[start - 1]); + if dfa.is_match_state(state) { + Ok(Some(HalfMatch { + pattern: dfa.match_pattern(state, 0), + offset: start, + })) + } else { + Ok(None) + } + } else { + let state = dfa.next_eoi_state(state); + if dfa.is_match_state(state) { + Ok(Some(HalfMatch { + pattern: dfa.match_pattern(state, 0), + offset: 0, + })) + } else { + Ok(None) + } + } +} + +// Currently unused, but is useful to keep around. This was originally used +// when the code above used raw pointers for its main loop. +// /// Returns the distance between the given pointer and the start of `bytes`. +// /// This assumes that the given pointer points to somewhere in the `bytes` +// /// slice given. +// fn offset(bytes: &[u8], p: *const u8) -> usize { +// debug_assert!(bytes.as_ptr() <= p); +// debug_assert!(bytes[bytes.len()..].as_ptr() >= p); +// ((p as isize) - (bytes.as_ptr() as isize)) as usize +// } diff --git a/vendor/regex-automata/src/dfa/search_unsafe.rs b/vendor/regex-automata/src/dfa/search_unsafe.rs new file mode 100644 index 000000000..ea1c29ff7 --- /dev/null +++ b/vendor/regex-automata/src/dfa/search_unsafe.rs @@ -0,0 +1,321 @@ +use crate::dfa::automaton::{Automaton, State}; +use crate::MatchError; + +/// This is marked as `inline(always)` specifically because it supports +/// multiple modes of searching. Namely, the 'earliest' boolean getting inlined +/// permits eliminating a few crucial branches. 
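+///
+/// The search runs over `bytes[start..end]`; as the assertions at the top of
+/// the function check, callers must guarantee `start <= end` and
+/// `end <= bytes.len()`.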
+#[inline(always)] +pub fn find_fwd( + dfa: &A, + bytes: &[u8], + start: usize, + end: usize, + earliest: bool, +) -> Result, MatchError> { + assert!(start <= end); + assert!(start <= bytes.len()); + assert!(end <= bytes.len()); + + let (mut state, mut last_match) = init_fwd(dfa, bytes, start, end)?; + if earliest && last_match.is_some() { + return Ok(last_match); + } + + let mut at = start; + while at < end { + let byte = bytes[at]; + state = dfa.next_state(state, byte); + at += 1; + if dfa.is_special_state(state) { + if dfa.is_dead_state(state) { + return Ok(last_match); + } else if dfa.is_quit_state(state) { + return Err(MatchError::Quit { byte, offset: at - 1 }); + } + last_match = Some(at - dfa.match_offset()); + if earliest { + return Ok(last_match); + } + } + } + /* + unsafe { + let mut p = bytes.as_ptr().add(start); + while p < bytes[end..].as_ptr() { + let byte = *p; + state = dfa.next_state_unchecked(state, byte); + p = p.add(1); + if dfa.is_special_state(state) { + if dfa.is_dead_state(state) { + return Ok(last_match); + } else if dfa.is_quit_state(state) { + return Err(MatchError::Quit { + byte, + offset: offset(bytes, p) - 1, + }); + } + last_match = Some(offset(bytes, p) - dfa.match_offset()); + if earliest { + return Ok(last_match); + } + } + } + } + */ + Ok(eof_fwd(dfa, bytes, end, &mut state)?.or(last_match)) +} + +/// This is marked as `inline(always)` specifically because it supports +/// multiple modes of searching. Namely, the 'earliest' boolean getting inlined +/// permits eliminating a few crucial branches. +#[inline(always)] +pub fn find_rev( + dfa: &A, + bytes: &[u8], + start: usize, + end: usize, + earliest: bool, +) -> Result, MatchError> { + assert!(start <= end); + assert!(start <= bytes.len()); + assert!(end <= bytes.len()); + + let (mut state, mut last_match) = init_rev(dfa, bytes, start, end)?; + if earliest && last_match.is_some() { + return Ok(last_match); + } + + let mut at = end; + while at > start { + at -= 1; + let byte = bytes[at]; + state = dfa.next_state(state, byte); + if dfa.is_special_state(state) { + if dfa.is_dead_state(state) { + return Ok(last_match); + } else if dfa.is_quit_state(state) { + return Err(MatchError::Quit { byte, offset: at }); + } + last_match = Some(at + dfa.match_offset()); + if earliest { + return Ok(last_match); + } + } + } + /* + unsafe { + let mut p = bytes.as_ptr().add(end); + while p > bytes[start..].as_ptr() { + p = p.sub(1); + let byte = *p; + state = dfa.next_state_unchecked(state, byte); + if dfa.is_special_state(state) { + if dfa.is_dead_state(state) { + return Ok(last_match); + } else if dfa.is_quit_state(state) { + return Err(MatchError::Quit { + byte, + offset: offset(bytes, p), + }); + } + last_match = Some(offset(bytes, p) + dfa.match_offset()); + if earliest { + return Ok(last_match); + } + } + } + } + */ + Ok(eof_rev(dfa, state, bytes, start)?.or(last_match)) +} + +pub fn find_overlapping_fwd( + dfa: &A, + bytes: &[u8], + mut start: usize, + end: usize, + caller_state: &mut State, +) -> Result, MatchError> { + assert!(start <= end); + assert!(start <= bytes.len()); + assert!(end <= bytes.len()); + + let (mut state, mut last_match) = match caller_state.as_option() { + None => init_fwd(dfa, bytes, start, end)?, + Some(id) => { + // This is a subtle but critical detail. If the caller provides a + // non-None state ID, then it must be the case that the state ID + // corresponds to one set by this function. The state ID therefore + // corresponds to a match state, a dead state or some other state. 
+ // However, "some other" state _only_ occurs when the input has + // been exhausted because the only way to stop before then is to + // see a match or a dead/quit state. + // + // If the input is exhausted or if it's a dead state, then + // incrementing the starting position has no relevance on + // correctness, since the loop below will either not execute + // at all or will immediately stop due to being in a dead state. + // (Once in a dead state it is impossible to leave it.) + // + // Therefore, the only case we need to consider is when + // caller_state is a match state. In this case, since our machines + // support the ability to delay a match by a certain number of + // bytes (to support look-around), it follows that we actually + // consumed that many additional bytes on our previous search. When + // the caller resumes their search to find subsequent matches, they + // will use the ending location from the previous match as the next + // starting point, which is `match_offset` bytes PRIOR to where + // we scanned to on the previous search. Therefore, we need to + // compensate by bumping `start` up by `match_offset` bytes. + start += dfa.match_offset(); + // Since match_offset could be any arbitrary value and we use + // `start` in pointer arithmetic below, we check that we are still + // in bounds. Otherwise, we could materialize a pointer that is + // more than one past the end point of `bytes`, which is UB. + if start > end { + return Ok(None); + } + (id, None) + } + }; + if last_match.is_some() { + caller_state.set(state); + return Ok(last_match); + } + + let mut at = start; + while at < end { + let byte = bytes[at]; + state = dfa.next_state(state, byte); + at += 1; + if dfa.is_special_state(state) { + caller_state.set(state); + if dfa.is_dead_state(state) { + return Ok(None); + } else if dfa.is_quit_state(state) { + return Err(MatchError::Quit { byte, offset: at - 1 }); + } else { + return Ok(Some(at - dfa.match_offset())); + } + } + } + /* + // SAFETY: Other than the normal pointer arithmetic happening here, a + // unique aspect of safety for this function is the fact that the caller + // can provide the state that the search routine will start with. If this + // state were invalid, it would be possible to incorrectly index the + // transition table. We however prevent this from happening by guaranteeing + // that State is valid. Namely, callers cannot mutate a State. All they can + // do is create a "start" state or otherwise reuse a previously set state. + // Since callers can't mutate a state, it follows that a previously set + // state can only be retrieved by crate internal functions. Therefore, our + // use of it is safe since this code will only ever set the provided state + // to a valid state. 
+ unsafe { + let mut p = bytes.as_ptr().add(start); + while p < bytes[end..].as_ptr() { + let byte = *p; + state = dfa.next_state_unchecked(state, byte); + p = p.add(1); + if dfa.is_special_state(state) { + caller_state.set(state); + return if dfa.is_dead_state(state) { + Ok(None) + } else if dfa.is_quit_state(state) { + Err(MatchError::Quit { byte, offset: offset(bytes, p) - 1 }) + } else { + Ok(Some(offset(bytes, p) - dfa.match_offset())) + }; + } + } + } + */ + + let result = eof_fwd(dfa, bytes, end, &mut state); + caller_state.set(state); + result +} + +fn init_fwd( + dfa: &A, + bytes: &[u8], + start: usize, + end: usize, +) -> Result<(A::ID, Option), MatchError> { + let state = dfa.start_state_forward(bytes, start, end); + if dfa.is_match_state(state) { + Ok((state, Some(start - dfa.match_offset()))) + } else { + Ok((state, None)) + } +} + +fn init_rev( + dfa: &A, + bytes: &[u8], + start: usize, + end: usize, +) -> Result<(A::ID, Option), MatchError> { + let state = dfa.start_state_reverse(bytes, start, end); + if dfa.is_match_state(state) { + Ok((state, Some(end + dfa.match_offset()))) + } else { + Ok((state, None)) + } +} + +fn eof_fwd( + dfa: &A, + bytes: &[u8], + end: usize, + state: &mut A::ID, +) -> Result, MatchError> { + match bytes.get(end) { + Some(&b) => { + *state = dfa.next_state(*state, b); + if dfa.is_match_state(*state) { + Ok(Some(end)) + } else { + Ok(None) + } + } + None => { + *state = dfa.next_eof_state(*state); + if dfa.is_match_state(*state) { + Ok(Some(bytes.len())) + } else { + Ok(None) + } + } + } +} + +fn eof_rev( + dfa: &A, + state: A::ID, + bytes: &[u8], + start: usize, +) -> Result, MatchError> { + if start > 0 { + if dfa.is_match_state(dfa.next_state(state, bytes[start - 1])) { + Ok(Some(start)) + } else { + Ok(None) + } + } else { + if dfa.is_match_state(dfa.next_eof_state(state)) { + Ok(Some(0)) + } else { + Ok(None) + } + } +} + +/// Returns the distance between the given pointer and the start of `bytes`. +/// This assumes that the given pointer points to somewhere in the `bytes` +/// slice given. +fn offset(bytes: &[u8], p: *const u8) -> usize { + debug_assert!(bytes.as_ptr() <= p); + debug_assert!(bytes[bytes.len()..].as_ptr() >= p); + ((p as isize) - (bytes.as_ptr() as isize)) as usize +} diff --git a/vendor/regex-automata/src/dfa/sparse.rs b/vendor/regex-automata/src/dfa/sparse.rs new file mode 100644 index 000000000..346606987 --- /dev/null +++ b/vendor/regex-automata/src/dfa/sparse.rs @@ -0,0 +1,2283 @@ +/*! +Types and routines specific to sparse DFAs. + +This module is the home of [`sparse::DFA`](DFA). + +Unlike the [`dense`](super::dense) module, this module does not contain a +builder or configuration specific for sparse DFAs. Instead, the intended +way to build a sparse DFA is either by using a default configuration with +its constructor [`sparse::DFA::new`](DFA::new), or by first configuring the +construction of a dense DFA with [`dense::Builder`](super::dense::Builder) +and then calling [`dense::DFA::to_sparse`](super::dense::DFA::to_sparse). For +example, this configures a sparse DFA to do an overlapping search: + +``` +use regex_automata::{ + dfa::{Automaton, OverlappingState, dense}, + HalfMatch, MatchKind, +}; + +let dense_re = dense::Builder::new() + .configure(dense::Config::new().match_kind(MatchKind::All)) + .build(r"Samwise|Sam")?; +let sparse_re = dense_re.to_sparse()?; + +// Setup our haystack and initial start state. +let haystack = b"Samwise"; +let mut state = OverlappingState::start(); + +// First, 'Sam' will match. 
+let end1 = sparse_re.find_overlapping_fwd_at( + None, None, haystack, 0, haystack.len(), &mut state, +)?; +assert_eq!(end1, Some(HalfMatch::must(0, 3))); + +// And now 'Samwise' will match. +let end2 = sparse_re.find_overlapping_fwd_at( + None, None, haystack, 3, haystack.len(), &mut state, +)?; +assert_eq!(end2, Some(HalfMatch::must(0, 7))); +# Ok::<(), Box>(()) +``` +*/ + +#[cfg(feature = "alloc")] +use core::iter; +use core::{ + convert::{TryFrom, TryInto}, + fmt, + mem::size_of, +}; + +#[cfg(feature = "alloc")] +use alloc::{collections::BTreeSet, vec, vec::Vec}; + +#[cfg(feature = "alloc")] +use crate::dfa::{dense, error::Error}; +use crate::{ + dfa::{ + automaton::{fmt_state_indicator, Automaton}, + special::Special, + DEAD, + }, + util::{ + alphabet::ByteClasses, + bytes::{self, DeserializeError, Endian, SerializeError}, + id::{PatternID, StateID}, + start::Start, + DebugByte, + }, +}; + +const LABEL: &str = "rust-regex-automata-dfa-sparse"; +const VERSION: u32 = 2; + +/// A sparse deterministic finite automaton (DFA) with variable sized states. +/// +/// In contrast to a [dense::DFA](crate::dfa::dense::DFA), a sparse DFA uses +/// a more space efficient representation for its transitions. Consequently, +/// sparse DFAs may use much less memory than dense DFAs, but this comes at a +/// price. In particular, reading the more space efficient transitions takes +/// more work, and consequently, searching using a sparse DFA is typically +/// slower than a dense DFA. +/// +/// A sparse DFA can be built using the default configuration via the +/// [`DFA::new`] constructor. Otherwise, one can configure various aspects +/// of a dense DFA via [`dense::Builder`](crate::dfa::dense::Builder), +/// and then convert a dense DFA to a sparse DFA using +/// [`dense::DFA::to_sparse`](crate::dfa::dense::DFA::to_sparse). +/// +/// In general, a sparse DFA supports all the same search operations as a dense +/// DFA. +/// +/// Making the choice between a dense and sparse DFA depends on your specific +/// work load. If you can sacrifice a bit of search time performance, then a +/// sparse DFA might be the best choice. In particular, while sparse DFAs are +/// probably always slower than dense DFAs, you may find that they are easily +/// fast enough for your purposes! +/// +/// # Type parameters +/// +/// A `DFA` has one type parameter, `T`, which is used to represent the parts +/// of a sparse DFA. `T` is typically a `Vec` or a `&[u8]`. +/// +/// # The `Automaton` trait +/// +/// This type implements the [`Automaton`] trait, which means it can be used +/// for searching. For example: +/// +/// ``` +/// use regex_automata::{ +/// dfa::{Automaton, sparse::DFA}, +/// HalfMatch, +/// }; +/// +/// let dfa = DFA::new("foo[0-9]+")?; +/// let expected = HalfMatch::must(0, 8); +/// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?); +/// # Ok::<(), Box>(()) +/// ``` +#[derive(Clone)] +pub struct DFA { + // When compared to a dense DFA, a sparse DFA *looks* a lot simpler + // representation-wise. In reality, it is perhaps more complicated. Namely, + // in a dense DFA, all information needs to be very cheaply accessible + // using only state IDs. In a sparse DFA however, each state uses a + // variable amount of space because each state encodes more information + // than just its transitions. Each state also includes an accelerator if + // one exists, along with the matching pattern IDs if the state is a match + // state. 
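+    //
+    // (Sketch of the per-state encoding, as decoded by Transitions::state
+    // below: a u16 transition count whose high bit marks a match state, then
+    // `ntrans` inclusive byte-range pairs, then `ntrans` state IDs, then, for
+    // match states only, a u32-length-prefixed list of pattern IDs, and
+    // finally a length-prefixed accelerator holding the bytes to search for,
+    // if any.)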
+ // + // That is, a lot of the complexity is pushed down into how each state + // itself is represented. + trans: Transitions, + starts: StartTable, + special: Special, +} + +#[cfg(feature = "alloc")] +impl DFA> { + /// Parse the given regular expression using a default configuration and + /// return the corresponding sparse DFA. + /// + /// If you want a non-default configuration, then use + /// the [`dense::Builder`](crate::dfa::dense::Builder) + /// to set your own configuration, and then call + /// [`dense::DFA::to_sparse`](crate::dfa::dense::DFA::to_sparse) to create + /// a sparse DFA. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{ + /// dfa::{Automaton, sparse}, + /// HalfMatch, + /// }; + /// + /// let dfa = sparse::DFA::new("foo[0-9]+bar")?; + /// + /// let expected = HalfMatch::must(0, 11); + /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345bar")?); + /// # Ok::<(), Box>(()) + /// ``` + pub fn new(pattern: &str) -> Result>, Error> { + dense::Builder::new() + .build(pattern) + .and_then(|dense| dense.to_sparse()) + } + + /// Parse the given regular expressions using a default configuration and + /// return the corresponding multi-DFA. + /// + /// If you want a non-default configuration, then use + /// the [`dense::Builder`](crate::dfa::dense::Builder) + /// to set your own configuration, and then call + /// [`dense::DFA::to_sparse`](crate::dfa::dense::DFA::to_sparse) to create + /// a sparse DFA. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{ + /// dfa::{Automaton, sparse}, + /// HalfMatch, + /// }; + /// + /// let dfa = sparse::DFA::new_many(&["[0-9]+", "[a-z]+"])?; + /// let expected = HalfMatch::must(1, 3); + /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345bar")?); + /// # Ok::<(), Box>(()) + /// ``` + pub fn new_many>( + patterns: &[P], + ) -> Result>, Error> { + dense::Builder::new() + .build_many(patterns) + .and_then(|dense| dense.to_sparse()) + } +} + +#[cfg(feature = "alloc")] +impl DFA> { + /// Create a new DFA that matches every input. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{ + /// dfa::{Automaton, sparse}, + /// HalfMatch, + /// }; + /// + /// let dfa = sparse::DFA::always_match()?; + /// + /// let expected = HalfMatch::must(0, 0); + /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"")?); + /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo")?); + /// # Ok::<(), Box>(()) + /// ``` + pub fn always_match() -> Result>, Error> { + dense::DFA::always_match()?.to_sparse() + } + + /// Create a new sparse DFA that never matches any input. + /// + /// # Example + /// + /// ``` + /// use regex_automata::dfa::{Automaton, sparse}; + /// + /// let dfa = sparse::DFA::never_match()?; + /// assert_eq!(None, dfa.find_leftmost_fwd(b"")?); + /// assert_eq!(None, dfa.find_leftmost_fwd(b"foo")?); + /// # Ok::<(), Box>(()) + /// ``` + pub fn never_match() -> Result>, Error> { + dense::DFA::never_match()?.to_sparse() + } + + /// The implementation for constructing a sparse DFA from a dense DFA. + pub(crate) fn from_dense>( + dfa: &dense::DFA, + ) -> Result>, Error> { + // In order to build the transition table, we need to be able to write + // state identifiers for each of the "next" transitions in each state. + // Our state identifiers correspond to the byte offset in the + // transition table at which the state is encoded. Therefore, we do not + // actually know what the state identifiers are until we've allocated + // exactly as much space as we need for each state. 
Thus, construction + // of the transition table happens in two passes. + // + // In the first pass, we fill out the shell of each state, which + // includes the transition count, the input byte ranges and zero-filled + // space for the transitions and accelerators, if present. In this + // first pass, we also build up a map from the state identifier index + // of the dense DFA to the state identifier in this sparse DFA. + // + // In the second pass, we fill in the transitions based on the map + // built in the first pass. + + // The capacity given here reflects a minimum. (Well, the true minimum + // is likely even bigger, but hopefully this saves a few reallocs.) + let mut sparse = Vec::with_capacity(StateID::SIZE * dfa.state_count()); + // This maps state indices from the dense DFA to StateIDs in the sparse + // DFA. We build out this map on the first pass, and then use it in the + // second pass to back-fill our transitions. + let mut remap: Vec = vec![DEAD; dfa.state_count()]; + for state in dfa.states() { + let pos = sparse.len(); + + remap[dfa.to_index(state.id())] = + StateID::new(pos).map_err(|_| Error::too_many_states())?; + // zero-filled space for the transition count + sparse.push(0); + sparse.push(0); + + let mut transition_count = 0; + for (unit1, unit2, _) in state.sparse_transitions() { + match (unit1.as_u8(), unit2.as_u8()) { + (Some(b1), Some(b2)) => { + transition_count += 1; + sparse.push(b1); + sparse.push(b2); + } + (None, None) => {} + (Some(_), None) | (None, Some(_)) => { + // can never occur because sparse_transitions never + // groups EOI with any other transition. + unreachable!() + } + } + } + // Add dummy EOI transition. This is never actually read while + // searching, but having space equivalent to the total number + // of transitions is convenient. Otherwise, we'd need to track + // a different number of transitions for the byte ranges as for + // the 'next' states. + // + // N.B. The loop above is not guaranteed to yield the EOI + // transition, since it may point to a DEAD state. By putting + // it here, we always write the EOI transition, and thus + // guarantee that our transition count is >0. Why do we always + // need the EOI transition? Because in order to implement + // Automaton::next_eoi_state, this lets us just ask for the last + // transition. There are probably other/better ways to do this. + transition_count += 1; + sparse.push(0); + sparse.push(0); + + // Check some assumptions about transition count. + assert_ne!( + transition_count, 0, + "transition count should be non-zero", + ); + assert!( + transition_count <= 257, + "expected transition count {} to be <= 257", + transition_count, + ); + + // Fill in the transition count. + // Since transition count is always <= 257, we use the most + // significant bit to indicate whether this is a match state or + // not. + let ntrans = if dfa.is_match_state(state.id()) { + transition_count | (1 << 15) + } else { + transition_count + }; + bytes::NE::write_u16(ntrans, &mut sparse[pos..]); + + // zero-fill the actual transitions. + // Unwraps are OK since transition_count <= 257 and our minimum + // support usize size is 16-bits. + let zeros = usize::try_from(transition_count) + .unwrap() + .checked_mul(StateID::SIZE) + .unwrap(); + sparse.extend(iter::repeat(0).take(zeros)); + + // If this is a match state, write the pattern IDs matched by this + // state. + if dfa.is_match_state(state.id()) { + let plen = dfa.match_pattern_len(state.id()); + // Write the actual pattern IDs with a u32 length prefix. 
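+                // (For illustration, the bytes written for this portion are
+                // [u32: plen][plen pattern IDs, 4 bytes each], all in native
+                // endian, mirroring what Transitions::state reads back.)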
+ // First, zero-fill space. + let mut pos = sparse.len(); + // Unwraps are OK since it's guaranteed that plen <= + // PatternID::LIMIT, which is in turn guaranteed to fit into a + // u32. + let zeros = size_of::() + .checked_mul(plen) + .unwrap() + .checked_add(size_of::()) + .unwrap(); + sparse.extend(iter::repeat(0).take(zeros)); + + // Now write the length prefix. + bytes::NE::write_u32( + // Will never fail since u32::MAX is invalid pattern ID. + // Thus, the number of pattern IDs is representable by a + // u32. + plen.try_into().expect("pattern ID count fits in u32"), + &mut sparse[pos..], + ); + pos += size_of::(); + + // Now write the pattern IDs. + for &pid in dfa.pattern_id_slice(state.id()) { + pos += bytes::write_pattern_id::( + pid, + &mut sparse[pos..], + ); + } + } + + // And now add the accelerator, if one exists. An accelerator is + // at most 4 bytes and at least 1 byte. The first byte is the + // length, N. N bytes follow the length. The set of bytes that + // follow correspond (exhaustively) to the bytes that must be seen + // to leave this state. + let accel = dfa.accelerator(state.id()); + sparse.push(accel.len().try_into().unwrap()); + sparse.extend_from_slice(accel); + } + + let mut new = DFA { + trans: Transitions { + sparse, + classes: dfa.byte_classes().clone(), + count: dfa.state_count(), + patterns: dfa.pattern_count(), + }, + starts: StartTable::from_dense_dfa(dfa, &remap)?, + special: dfa.special().remap(|id| remap[dfa.to_index(id)]), + }; + // And here's our second pass. Iterate over all of the dense states + // again, and update the transitions in each of the states in the + // sparse DFA. + for old_state in dfa.states() { + let new_id = remap[dfa.to_index(old_state.id())]; + let mut new_state = new.trans.state_mut(new_id); + let sparse = old_state.sparse_transitions(); + for (i, (_, _, next)) in sparse.enumerate() { + let next = remap[dfa.to_index(next)]; + new_state.set_next_at(i, next); + } + } + trace!( + "created sparse DFA, memory usage: {} (dense memory usage: {})", + new.memory_usage(), + dfa.memory_usage(), + ); + Ok(new) + } +} + +impl> DFA { + /// Cheaply return a borrowed version of this sparse DFA. Specifically, the + /// DFA returned always uses `&[u8]` for its transitions. + pub fn as_ref<'a>(&'a self) -> DFA<&'a [u8]> { + DFA { + trans: self.trans.as_ref(), + starts: self.starts.as_ref(), + special: self.special, + } + } + + /// Return an owned version of this sparse DFA. Specifically, the DFA + /// returned always uses `Vec` for its transitions. + /// + /// Effectively, this returns a sparse DFA whose transitions live on the + /// heap. + #[cfg(feature = "alloc")] + pub fn to_owned(&self) -> DFA> { + DFA { + trans: self.trans.to_owned(), + starts: self.starts.to_owned(), + special: self.special, + } + } + + /// Returns the memory usage, in bytes, of this DFA. + /// + /// The memory usage is computed based on the number of bytes used to + /// represent this DFA. + /// + /// This does **not** include the stack size used up by this DFA. To + /// compute that, use `std::mem::size_of::()`. + pub fn memory_usage(&self) -> usize { + self.trans.memory_usage() + self.starts.memory_usage() + } + + /// Returns true only if this DFA has starting states for each pattern. + /// + /// When a DFA has starting states for each pattern, then a search with the + /// DFA can be configured to only look for anchored matches of a specific + /// pattern. 
Specifically, APIs like [`Automaton::find_earliest_fwd_at`] + /// can accept a non-None `pattern_id` if and only if this method returns + /// true. Otherwise, calling `find_earliest_fwd_at` will panic. + /// + /// Note that if the DFA is empty, this always returns false. + pub fn has_starts_for_each_pattern(&self) -> bool { + self.starts.patterns > 0 + } +} + +/// Routines for converting a sparse DFA to other representations, such as raw +/// bytes suitable for persistent storage. +impl> DFA { + /// Serialize this DFA as raw bytes to a `Vec` in little endian + /// format. + /// + /// The written bytes are guaranteed to be deserialized correctly and + /// without errors in a semver compatible release of this crate by a + /// `DFA`'s deserialization APIs (assuming all other criteria for the + /// deserialization APIs has been satisfied): + /// + /// * [`DFA::from_bytes`] + /// * [`DFA::from_bytes_unchecked`] + /// + /// Note that unlike a [`dense::DFA`](crate::dfa::dense::DFA)'s + /// serialization methods, this does not add any initial padding to the + /// returned bytes. Padding isn't required for sparse DFAs since they have + /// no alignment requirements. + /// + /// # Example + /// + /// This example shows how to serialize and deserialize a DFA: + /// + /// ``` + /// use regex_automata::{ + /// dfa::{Automaton, sparse::DFA}, + /// HalfMatch, + /// }; + /// + /// // Compile our original DFA. + /// let original_dfa = DFA::new("foo[0-9]+")?; + /// + /// // N.B. We use native endianness here to make the example work, but + /// // using to_bytes_little_endian would work on a little endian target. + /// let buf = original_dfa.to_bytes_native_endian(); + /// // Even if buf has initial padding, DFA::from_bytes will automatically + /// // ignore it. + /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf)?.0; + /// + /// let expected = HalfMatch::must(0, 8); + /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?); + /// # Ok::<(), Box>(()) + /// ``` + #[cfg(feature = "alloc")] + pub fn to_bytes_little_endian(&self) -> Vec { + self.to_bytes::() + } + + /// Serialize this DFA as raw bytes to a `Vec` in big endian + /// format. + /// + /// The written bytes are guaranteed to be deserialized correctly and + /// without errors in a semver compatible release of this crate by a + /// `DFA`'s deserialization APIs (assuming all other criteria for the + /// deserialization APIs has been satisfied): + /// + /// * [`DFA::from_bytes`] + /// * [`DFA::from_bytes_unchecked`] + /// + /// Note that unlike a [`dense::DFA`](crate::dfa::dense::DFA)'s + /// serialization methods, this does not add any initial padding to the + /// returned bytes. Padding isn't required for sparse DFAs since they have + /// no alignment requirements. + /// + /// # Example + /// + /// This example shows how to serialize and deserialize a DFA: + /// + /// ``` + /// use regex_automata::{ + /// dfa::{Automaton, sparse::DFA}, + /// HalfMatch, + /// }; + /// + /// // Compile our original DFA. + /// let original_dfa = DFA::new("foo[0-9]+")?; + /// + /// // N.B. We use native endianness here to make the example work, but + /// // using to_bytes_big_endian would work on a big endian target. + /// let buf = original_dfa.to_bytes_native_endian(); + /// // Even if buf has initial padding, DFA::from_bytes will automatically + /// // ignore it. 
+ /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf)?.0; + /// + /// let expected = HalfMatch::must(0, 8); + /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?); + /// # Ok::<(), Box>(()) + /// ``` + #[cfg(feature = "alloc")] + pub fn to_bytes_big_endian(&self) -> Vec { + self.to_bytes::() + } + + /// Serialize this DFA as raw bytes to a `Vec` in native endian + /// format. + /// + /// The written bytes are guaranteed to be deserialized correctly and + /// without errors in a semver compatible release of this crate by a + /// `DFA`'s deserialization APIs (assuming all other criteria for the + /// deserialization APIs has been satisfied): + /// + /// * [`DFA::from_bytes`] + /// * [`DFA::from_bytes_unchecked`] + /// + /// Note that unlike a [`dense::DFA`](crate::dfa::dense::DFA)'s + /// serialization methods, this does not add any initial padding to the + /// returned bytes. Padding isn't required for sparse DFAs since they have + /// no alignment requirements. + /// + /// Generally speaking, native endian format should only be used when + /// you know that the target you're compiling the DFA for matches the + /// endianness of the target on which you're compiling DFA. For example, + /// if serialization and deserialization happen in the same process or on + /// the same machine. Otherwise, when serializing a DFA for use in a + /// portable environment, you'll almost certainly want to serialize _both_ + /// a little endian and a big endian version and then load the correct one + /// based on the target's configuration. + /// + /// # Example + /// + /// This example shows how to serialize and deserialize a DFA: + /// + /// ``` + /// use regex_automata::{ + /// dfa::{Automaton, sparse::DFA}, + /// HalfMatch, + /// }; + /// + /// // Compile our original DFA. + /// let original_dfa = DFA::new("foo[0-9]+")?; + /// + /// let buf = original_dfa.to_bytes_native_endian(); + /// // Even if buf has initial padding, DFA::from_bytes will automatically + /// // ignore it. + /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf)?.0; + /// + /// let expected = HalfMatch::must(0, 8); + /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?); + /// # Ok::<(), Box>(()) + /// ``` + #[cfg(feature = "alloc")] + pub fn to_bytes_native_endian(&self) -> Vec { + self.to_bytes::() + } + + /// The implementation of the public `to_bytes` serialization methods, + /// which is generic over endianness. + #[cfg(feature = "alloc")] + fn to_bytes(&self) -> Vec { + let mut buf = vec![0; self.write_to_len()]; + // This should always succeed since the only possible serialization + // error is providing a buffer that's too small, but we've ensured that + // `buf` is big enough here. + self.write_to::(&mut buf).unwrap(); + buf + } + + /// Serialize this DFA as raw bytes to the given slice, in little endian + /// format. Upon success, the total number of bytes written to `dst` is + /// returned. + /// + /// The written bytes are guaranteed to be deserialized correctly and + /// without errors in a semver compatible release of this crate by a + /// `DFA`'s deserialization APIs (assuming all other criteria for the + /// deserialization APIs has been satisfied): + /// + /// * [`DFA::from_bytes`] + /// * [`DFA::from_bytes_unchecked`] + /// + /// # Errors + /// + /// This returns an error if the given destination slice is not big enough + /// to contain the full serialized DFA. If an error occurs, then nothing + /// is written to `dst`. 
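+    /// To size `dst` correctly up front, use [`DFA::write_to_len`].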
+ /// + /// # Example + /// + /// This example shows how to serialize and deserialize a DFA without + /// dynamic memory allocation. + /// + /// ``` + /// use regex_automata::{ + /// dfa::{Automaton, sparse::DFA}, + /// HalfMatch, + /// }; + /// + /// // Compile our original DFA. + /// let original_dfa = DFA::new("foo[0-9]+")?; + /// + /// // Create a 4KB buffer on the stack to store our serialized DFA. + /// let mut buf = [0u8; 4 * (1<<10)]; + /// // N.B. We use native endianness here to make the example work, but + /// // using write_to_little_endian would work on a little endian target. + /// let written = original_dfa.write_to_native_endian(&mut buf)?; + /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf[..written])?.0; + /// + /// let expected = HalfMatch::must(0, 8); + /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?); + /// # Ok::<(), Box>(()) + /// ``` + pub fn write_to_little_endian( + &self, + dst: &mut [u8], + ) -> Result { + self.write_to::(dst) + } + + /// Serialize this DFA as raw bytes to the given slice, in big endian + /// format. Upon success, the total number of bytes written to `dst` is + /// returned. + /// + /// The written bytes are guaranteed to be deserialized correctly and + /// without errors in a semver compatible release of this crate by a + /// `DFA`'s deserialization APIs (assuming all other criteria for the + /// deserialization APIs has been satisfied): + /// + /// * [`DFA::from_bytes`] + /// * [`DFA::from_bytes_unchecked`] + /// + /// # Errors + /// + /// This returns an error if the given destination slice is not big enough + /// to contain the full serialized DFA. If an error occurs, then nothing + /// is written to `dst`. + /// + /// # Example + /// + /// This example shows how to serialize and deserialize a DFA without + /// dynamic memory allocation. + /// + /// ``` + /// use regex_automata::{ + /// dfa::{Automaton, sparse::DFA}, + /// HalfMatch, + /// }; + /// + /// // Compile our original DFA. + /// let original_dfa = DFA::new("foo[0-9]+")?; + /// + /// // Create a 4KB buffer on the stack to store our serialized DFA. + /// let mut buf = [0u8; 4 * (1<<10)]; + /// // N.B. We use native endianness here to make the example work, but + /// // using write_to_big_endian would work on a big endian target. + /// let written = original_dfa.write_to_native_endian(&mut buf)?; + /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf[..written])?.0; + /// + /// let expected = HalfMatch::must(0, 8); + /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?); + /// # Ok::<(), Box>(()) + /// ``` + pub fn write_to_big_endian( + &self, + dst: &mut [u8], + ) -> Result { + self.write_to::(dst) + } + + /// Serialize this DFA as raw bytes to the given slice, in native endian + /// format. Upon success, the total number of bytes written to `dst` is + /// returned. + /// + /// The written bytes are guaranteed to be deserialized correctly and + /// without errors in a semver compatible release of this crate by a + /// `DFA`'s deserialization APIs (assuming all other criteria for the + /// deserialization APIs has been satisfied): + /// + /// * [`DFA::from_bytes`] + /// * [`DFA::from_bytes_unchecked`] + /// + /// Generally speaking, native endian format should only be used when + /// you know that the target you're compiling the DFA for matches the + /// endianness of the target on which you're compiling DFA. For example, + /// if serialization and deserialization happen in the same process or on + /// the same machine. 
Otherwise, when serializing a DFA for use in a + /// portable environment, you'll almost certainly want to serialize _both_ + /// a little endian and a big endian version and then load the correct one + /// based on the target's configuration. + /// + /// # Errors + /// + /// This returns an error if the given destination slice is not big enough + /// to contain the full serialized DFA. If an error occurs, then nothing + /// is written to `dst`. + /// + /// # Example + /// + /// This example shows how to serialize and deserialize a DFA without + /// dynamic memory allocation. + /// + /// ``` + /// use regex_automata::{ + /// dfa::{Automaton, sparse::DFA}, + /// HalfMatch, + /// }; + /// + /// // Compile our original DFA. + /// let original_dfa = DFA::new("foo[0-9]+")?; + /// + /// // Create a 4KB buffer on the stack to store our serialized DFA. + /// let mut buf = [0u8; 4 * (1<<10)]; + /// let written = original_dfa.write_to_native_endian(&mut buf)?; + /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf[..written])?.0; + /// + /// let expected = HalfMatch::must(0, 8); + /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?); + /// # Ok::<(), Box>(()) + /// ``` + pub fn write_to_native_endian( + &self, + dst: &mut [u8], + ) -> Result { + self.write_to::(dst) + } + + /// The implementation of the public `write_to` serialization methods, + /// which is generic over endianness. + fn write_to( + &self, + dst: &mut [u8], + ) -> Result { + let mut nw = 0; + nw += bytes::write_label(LABEL, &mut dst[nw..])?; + nw += bytes::write_endianness_check::(&mut dst[nw..])?; + nw += bytes::write_version::(VERSION, &mut dst[nw..])?; + nw += { + // Currently unused, intended for future flexibility + E::write_u32(0, &mut dst[nw..]); + size_of::() + }; + nw += self.trans.write_to::(&mut dst[nw..])?; + nw += self.starts.write_to::(&mut dst[nw..])?; + nw += self.special.write_to::(&mut dst[nw..])?; + Ok(nw) + } + + /// Return the total number of bytes required to serialize this DFA. + /// + /// This is useful for determining the size of the buffer required to pass + /// to one of the serialization routines: + /// + /// * [`DFA::write_to_little_endian`] + /// * [`DFA::write_to_big_endian`] + /// * [`DFA::write_to_native_endian`] + /// + /// Passing a buffer smaller than the size returned by this method will + /// result in a serialization error. + /// + /// # Example + /// + /// This example shows how to dynamically allocate enough room to serialize + /// a sparse DFA. + /// + /// ``` + /// use regex_automata::{ + /// dfa::{Automaton, sparse::DFA}, + /// HalfMatch, + /// }; + /// + /// // Compile our original DFA. + /// let original_dfa = DFA::new("foo[0-9]+")?; + /// + /// let mut buf = vec![0; original_dfa.write_to_len()]; + /// let written = original_dfa.write_to_native_endian(&mut buf)?; + /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf[..written])?.0; + /// + /// let expected = HalfMatch::must(0, 8); + /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?); + /// # Ok::<(), Box>(()) + /// ``` + pub fn write_to_len(&self) -> usize { + bytes::write_label_len(LABEL) + + bytes::write_endianness_check_len() + + bytes::write_version_len() + + size_of::() // unused, intended for future flexibility + + self.trans.write_to_len() + + self.starts.write_to_len() + + self.special.write_to_len() + } +} + +impl<'a> DFA<&'a [u8]> { + /// Safely deserialize a sparse DFA with a specific state identifier + /// representation. 
Upon success, this returns both the deserialized DFA + /// and the number of bytes read from the given slice. Namely, the contents + /// of the slice beyond the DFA are not read. + /// + /// Deserializing a DFA using this routine will never allocate heap memory. + /// For safety purposes, the DFA's transitions will be verified such that + /// every transition points to a valid state. If this verification is too + /// costly, then a [`DFA::from_bytes_unchecked`] API is provided, which + /// will always execute in constant time. + /// + /// The bytes given must be generated by one of the serialization APIs + /// of a `DFA` using a semver compatible release of this crate. Those + /// include: + /// + /// * [`DFA::to_bytes_little_endian`] + /// * [`DFA::to_bytes_big_endian`] + /// * [`DFA::to_bytes_native_endian`] + /// * [`DFA::write_to_little_endian`] + /// * [`DFA::write_to_big_endian`] + /// * [`DFA::write_to_native_endian`] + /// + /// The `to_bytes` methods allocate and return a `Vec` for you. The + /// `write_to` methods do not allocate and write to an existing slice + /// (which may be on the stack). Since deserialization always uses the + /// native endianness of the target platform, the serialization API you use + /// should match the endianness of the target platform. (It's often a good + /// idea to generate serialized DFAs for both forms of endianness and then + /// load the correct one based on endianness.) + /// + /// # Errors + /// + /// Generally speaking, it's easier to state the conditions in which an + /// error is _not_ returned. All of the following must be true: + /// + /// * The bytes given must be produced by one of the serialization APIs + /// on this DFA, as mentioned above. + /// * The endianness of the target platform matches the endianness used to + /// serialized the provided DFA. + /// + /// If any of the above are not true, then an error will be returned. + /// + /// Note that unlike deserializing a + /// [`dense::DFA`](crate::dfa::dense::DFA), deserializing a sparse DFA has + /// no alignment requirements. That is, an alignment of `1` is valid. + /// + /// # Panics + /// + /// This routine will never panic for any input. + /// + /// # Example + /// + /// This example shows how to serialize a DFA to raw bytes, deserialize it + /// and then use it for searching. + /// + /// ``` + /// use regex_automata::{ + /// dfa::{Automaton, sparse::DFA}, + /// HalfMatch, + /// }; + /// + /// let initial = DFA::new("foo[0-9]+")?; + /// let bytes = initial.to_bytes_native_endian(); + /// let dfa: DFA<&[u8]> = DFA::from_bytes(&bytes)?.0; + /// + /// let expected = HalfMatch::must(0, 8); + /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?); + /// # Ok::<(), Box>(()) + /// ``` + /// + /// # Example: loading a DFA from static memory + /// + /// One use case this library supports is the ability to serialize a + /// DFA to disk and then use `include_bytes!` to store it in a compiled + /// Rust program. Those bytes can then be cheaply deserialized into a + /// `DFA` structure at runtime and used for searching without having to + /// re-compile the DFA (which can be quite costly). + /// + /// We can show this in two parts. The first part is serializing the DFA to + /// a file: + /// + /// ```no_run + /// use regex_automata::dfa::{Automaton, sparse::DFA}; + /// + /// let dfa = DFA::new("foo[0-9]+")?; + /// + /// // Write a big endian serialized version of this DFA to a file. 
+ /// let bytes = dfa.to_bytes_big_endian(); + /// std::fs::write("foo.bigendian.dfa", &bytes)?; + /// + /// // Do it again, but this time for little endian. + /// let bytes = dfa.to_bytes_little_endian(); + /// std::fs::write("foo.littleendian.dfa", &bytes)?; + /// # Ok::<(), Box>(()) + /// ``` + /// + /// And now the second part is embedding the DFA into the compiled program + /// and deserializing it at runtime on first use. We use conditional + /// compilation to choose the correct endianness. As mentioned above, we + /// do not need to employ any special tricks to ensure a proper alignment, + /// since a sparse DFA has no alignment requirements. + /// + /// ```no_run + /// use regex_automata::{ + /// dfa::{Automaton, sparse}, + /// HalfMatch, + /// }; + /// + /// type DFA = sparse::DFA<&'static [u8]>; + /// + /// fn get_foo() -> &'static DFA { + /// use std::cell::Cell; + /// use std::mem::MaybeUninit; + /// use std::sync::Once; + /// + /// # const _: &str = stringify! { + /// #[cfg(target_endian = "big")] + /// static BYTES: &[u8] = include_bytes!("foo.bigendian.dfa"); + /// #[cfg(target_endian = "little")] + /// static BYTES: &[u8] = include_bytes!("foo.littleendian.dfa"); + /// # }; + /// # static BYTES: &[u8] = b""; + /// + /// struct Lazy(Cell>); + /// // SAFETY: This is safe because DFA impls Sync. + /// unsafe impl Sync for Lazy {} + /// + /// static INIT: Once = Once::new(); + /// static DFA: Lazy = Lazy(Cell::new(MaybeUninit::uninit())); + /// + /// INIT.call_once(|| { + /// let (dfa, _) = DFA::from_bytes(BYTES) + /// .expect("serialized DFA should be valid"); + /// // SAFETY: This is guaranteed to only execute once, and all + /// // we do with the pointer is write the DFA to it. + /// unsafe { + /// (*DFA.0.as_ptr()).as_mut_ptr().write(dfa); + /// } + /// }); + /// // SAFETY: DFA is guaranteed to by initialized via INIT and is + /// // stored in static memory. + /// unsafe { + /// let dfa = (*DFA.0.as_ptr()).as_ptr(); + /// std::mem::transmute::<*const DFA, &'static DFA>(dfa) + /// } + /// } + /// + /// let dfa = get_foo(); + /// let expected = HalfMatch::must(0, 8); + /// assert_eq!(Ok(Some(expected)), dfa.find_leftmost_fwd(b"foo12345")); + /// ``` + /// + /// Alternatively, consider using + /// [`lazy_static`](https://crates.io/crates/lazy_static) + /// or + /// [`once_cell`](https://crates.io/crates/once_cell), + /// which will guarantee safety for you. + pub fn from_bytes( + slice: &'a [u8], + ) -> Result<(DFA<&'a [u8]>, usize), DeserializeError> { + // SAFETY: This is safe because we validate both the sparse transitions + // (by trying to decode every state) and start state ID list below. If + // either validation fails, then we return an error. + let (dfa, nread) = unsafe { DFA::from_bytes_unchecked(slice)? }; + dfa.trans.validate()?; + dfa.starts.validate(&dfa.trans)?; + // N.B. dfa.special doesn't have a way to do unchecked deserialization, + // so it has already been validated. + Ok((dfa, nread)) + } + + /// Deserialize a DFA with a specific state identifier representation in + /// constant time by omitting the verification of the validity of the + /// sparse transitions. + /// + /// This is just like [`DFA::from_bytes`], except it can potentially return + /// a DFA that exhibits undefined behavior if its transitions contains + /// invalid state identifiers. + /// + /// This routine is useful if you need to deserialize a DFA cheaply and + /// cannot afford the transition validation performed by `from_bytes`. 
+ /// + /// # Safety + /// + /// This routine is unsafe because it permits callers to provide + /// arbitrary transitions with possibly incorrect state identifiers. While + /// the various serialization routines will never return an incorrect + /// DFA, there is no guarantee that the bytes provided here + /// are correct. While `from_bytes_unchecked` will still do several forms + /// of basic validation, this routine does not check that the transitions + /// themselves are correct. Given an incorrect transition table, it is + /// possible for the search routines to access out-of-bounds memory because + /// of explicit bounds check elision. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{ + /// dfa::{Automaton, sparse::DFA}, + /// HalfMatch, + /// }; + /// + /// let initial = DFA::new("foo[0-9]+")?; + /// let bytes = initial.to_bytes_native_endian(); + /// // SAFETY: This is guaranteed to be safe since the bytes given come + /// // directly from a compatible serialization routine. + /// let dfa: DFA<&[u8]> = unsafe { DFA::from_bytes_unchecked(&bytes)?.0 }; + /// + /// let expected = HalfMatch::must(0, 8); + /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?); + /// # Ok::<(), Box>(()) + /// ``` + pub unsafe fn from_bytes_unchecked( + slice: &'a [u8], + ) -> Result<(DFA<&'a [u8]>, usize), DeserializeError> { + let mut nr = 0; + + nr += bytes::read_label(&slice[nr..], LABEL)?; + nr += bytes::read_endianness_check(&slice[nr..])?; + nr += bytes::read_version(&slice[nr..], VERSION)?; + + let _unused = bytes::try_read_u32(&slice[nr..], "unused space")?; + nr += size_of::(); + + let (trans, nread) = Transitions::from_bytes_unchecked(&slice[nr..])?; + nr += nread; + + let (starts, nread) = StartTable::from_bytes_unchecked(&slice[nr..])?; + nr += nread; + + let (special, nread) = Special::from_bytes(&slice[nr..])?; + nr += nread; + if special.max.as_usize() >= trans.sparse().len() { + return Err(DeserializeError::generic( + "max should not be greater than or equal to sparse bytes", + )); + } + + Ok((DFA { trans, starts, special }, nr)) + } +} + +impl> fmt::Debug for DFA { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + writeln!(f, "sparse::DFA(")?; + for state in self.trans.states() { + fmt_state_indicator(f, self, state.id())?; + writeln!(f, "{:06?}: {:?}", state.id(), state)?; + } + writeln!(f, "")?; + for (i, (start_id, sty, pid)) in self.starts.iter().enumerate() { + if i % self.starts.stride == 0 { + match pid { + None => writeln!(f, "START-GROUP(ALL)")?, + Some(pid) => { + writeln!(f, "START_GROUP(pattern: {:?})", pid)? 
+ } + } + } + writeln!(f, " {:?} => {:06?}", sty, start_id.as_usize())?; + } + writeln!(f, "state count: {:?}", self.trans.count)?; + writeln!(f, ")")?; + Ok(()) + } +} + +unsafe impl> Automaton for DFA { + #[inline] + fn is_special_state(&self, id: StateID) -> bool { + self.special.is_special_state(id) + } + + #[inline] + fn is_dead_state(&self, id: StateID) -> bool { + self.special.is_dead_state(id) + } + + #[inline] + fn is_quit_state(&self, id: StateID) -> bool { + self.special.is_quit_state(id) + } + + #[inline] + fn is_match_state(&self, id: StateID) -> bool { + self.special.is_match_state(id) + } + + #[inline] + fn is_start_state(&self, id: StateID) -> bool { + self.special.is_start_state(id) + } + + #[inline] + fn is_accel_state(&self, id: StateID) -> bool { + self.special.is_accel_state(id) + } + + // This is marked as inline to help dramatically boost sparse searching, + // which decodes each state it enters to follow the next transition. + #[inline(always)] + fn next_state(&self, current: StateID, input: u8) -> StateID { + let input = self.trans.classes.get(input); + self.trans.state(current).next(input) + } + + #[inline] + unsafe fn next_state_unchecked( + &self, + current: StateID, + input: u8, + ) -> StateID { + self.next_state(current, input) + } + + #[inline] + fn next_eoi_state(&self, current: StateID) -> StateID { + self.trans.state(current).next_eoi() + } + + #[inline] + fn pattern_count(&self) -> usize { + self.trans.patterns + } + + #[inline] + fn match_count(&self, id: StateID) -> usize { + self.trans.state(id).pattern_count() + } + + #[inline] + fn match_pattern(&self, id: StateID, match_index: usize) -> PatternID { + // This is an optimization for the very common case of a DFA with a + // single pattern. This conditional avoids a somewhat more costly path + // that finds the pattern ID from the state machine, which requires + // a bit of slicing/pointer-chasing. This optimization tends to only + // matter when matches are frequent. + if self.trans.patterns == 1 { + return PatternID::ZERO; + } + self.trans.state(id).pattern_id(match_index) + } + + #[inline] + fn start_state_forward( + &self, + pattern_id: Option, + bytes: &[u8], + start: usize, + end: usize, + ) -> StateID { + let index = Start::from_position_fwd(bytes, start, end); + self.starts.start(index, pattern_id) + } + + #[inline] + fn start_state_reverse( + &self, + pattern_id: Option, + bytes: &[u8], + start: usize, + end: usize, + ) -> StateID { + let index = Start::from_position_rev(bytes, start, end); + self.starts.start(index, pattern_id) + } + + #[inline] + fn accelerator(&self, id: StateID) -> &[u8] { + self.trans.state(id).accelerator() + } +} + +/// The transition table portion of a sparse DFA. +/// +/// The transition table is the core part of the DFA in that it describes how +/// to move from one state to another based on the input sequence observed. +/// +/// Unlike a typical dense table based DFA, states in a sparse transition +/// table have variable size. That is, states with more transitions use more +/// space than states with fewer transitions. This means that finding the next +/// transition takes more work than with a dense DFA, but also typically uses +/// much less space. +#[derive(Clone)] +struct Transitions { + /// The raw encoding of each state in this DFA. + /// + /// Each state has the following information: + /// + /// * A set of transitions to subsequent states. Transitions to the dead + /// state are omitted. 
+ /// * If the state can be accelerated, then any additional accelerator + /// information. + /// * If the state is a match state, then the state contains all pattern + /// IDs that match when in that state. + /// + /// To decode a state, use Transitions::state. + /// + /// In practice, T is either Vec or &[u8]. + sparse: T, + /// A set of equivalence classes, where a single equivalence class + /// represents a set of bytes that never discriminate between a match + /// and a non-match in the DFA. Each equivalence class corresponds to a + /// single character in this DFA's alphabet, where the maximum number of + /// characters is 257 (each possible value of a byte plus the special + /// EOI transition). Consequently, the number of equivalence classes + /// corresponds to the number of transitions for each DFA state. Note + /// though that the *space* used by each DFA state in the transition table + /// may be larger. The total space used by each DFA state is known as the + /// stride and is documented above. + /// + /// The only time the number of equivalence classes is fewer than 257 is + /// if the DFA's kind uses byte classes which is the default. Equivalence + /// classes should generally only be disabled when debugging, so that + /// the transitions themselves aren't obscured. Disabling them has no + /// other benefit, since the equivalence class map is always used while + /// searching. In the vast majority of cases, the number of equivalence + /// classes is substantially smaller than 257, particularly when large + /// Unicode classes aren't used. + /// + /// N.B. Equivalence classes aren't particularly useful in a sparse DFA + /// in the current implementation, since equivalence classes generally tend + /// to correspond to continuous ranges of bytes that map to the same + /// transition. So in a sparse DFA, equivalence classes don't really lead + /// to a space savings. In the future, it would be good to try and remove + /// them from sparse DFAs entirely, but requires a bit of work since sparse + /// DFAs are built from dense DFAs, which are in turn built on top of + /// equivalence classes. + classes: ByteClasses, + /// The total number of states in this DFA. Note that a DFA always has at + /// least one state---the dead state---even the empty DFA. In particular, + /// the dead state always has ID 0 and is correspondingly always the first + /// state. The dead state is never a match state. + count: usize, + /// The total number of unique patterns represented by these match states. 
+ patterns: usize, +} + +impl<'a> Transitions<&'a [u8]> { + unsafe fn from_bytes_unchecked( + mut slice: &'a [u8], + ) -> Result<(Transitions<&'a [u8]>, usize), DeserializeError> { + let slice_start = slice.as_ptr() as usize; + + let (state_count, nr) = + bytes::try_read_u32_as_usize(&slice, "state count")?; + slice = &slice[nr..]; + + let (pattern_count, nr) = + bytes::try_read_u32_as_usize(&slice, "pattern count")?; + slice = &slice[nr..]; + + let (classes, nr) = ByteClasses::from_bytes(&slice)?; + slice = &slice[nr..]; + + let (len, nr) = + bytes::try_read_u32_as_usize(&slice, "sparse transitions length")?; + slice = &slice[nr..]; + + bytes::check_slice_len(slice, len, "sparse states byte length")?; + let sparse = &slice[..len]; + slice = &slice[len..]; + + let trans = Transitions { + sparse, + classes, + count: state_count, + patterns: pattern_count, + }; + Ok((trans, slice.as_ptr() as usize - slice_start)) + } +} + +impl> Transitions { + /// Writes a serialized form of this transition table to the buffer given. + /// If the buffer is too small, then an error is returned. To determine + /// how big the buffer must be, use `write_to_len`. + fn write_to( + &self, + mut dst: &mut [u8], + ) -> Result { + let nwrite = self.write_to_len(); + if dst.len() < nwrite { + return Err(SerializeError::buffer_too_small( + "sparse transition table", + )); + } + dst = &mut dst[..nwrite]; + + // write state count + E::write_u32(u32::try_from(self.count).unwrap(), dst); + dst = &mut dst[size_of::()..]; + + // write pattern count + E::write_u32(u32::try_from(self.patterns).unwrap(), dst); + dst = &mut dst[size_of::()..]; + + // write byte class map + let n = self.classes.write_to(dst)?; + dst = &mut dst[n..]; + + // write number of bytes in sparse transitions + E::write_u32(u32::try_from(self.sparse().len()).unwrap(), dst); + dst = &mut dst[size_of::()..]; + + // write actual transitions + dst.copy_from_slice(self.sparse()); + Ok(nwrite) + } + + /// Returns the number of bytes the serialized form of this transition + /// table will use. + fn write_to_len(&self) -> usize { + size_of::() // state count + + size_of::() // pattern count + + self.classes.write_to_len() + + size_of::() // sparse transitions length + + self.sparse().len() + } + + /// Validates that every state ID in this transition table is valid. + /// + /// That is, every state ID can be used to correctly index a state in this + /// table. + fn validate(&self) -> Result<(), DeserializeError> { + // In order to validate everything, we not only need to make sure we + // can decode every state, but that every transition in every state + // points to a valid state. There are many duplicative transitions, so + // we record state IDs that we've verified so that we don't redo the + // decoding work. + // + // Except, when in no_std mode, we don't have dynamic memory allocation + // available to us, so we skip this optimization. It's not clear + // whether doing something more clever is worth it just yet. If you're + // profiling this code and need it to run faster, please file an issue. 
+ // + // ---AG + struct Seen { + #[cfg(feature = "alloc")] + set: BTreeSet, + #[cfg(not(feature = "alloc"))] + set: core::marker::PhantomData, + } + + #[cfg(feature = "alloc")] + impl Seen { + fn new() -> Seen { + Seen { set: BTreeSet::new() } + } + fn insert(&mut self, id: StateID) { + self.set.insert(id); + } + fn contains(&self, id: &StateID) -> bool { + self.set.contains(id) + } + } + + #[cfg(not(feature = "alloc"))] + impl Seen { + fn new() -> Seen { + Seen { set: core::marker::PhantomData } + } + fn insert(&mut self, _id: StateID) {} + fn contains(&self, _id: &StateID) -> bool { + false + } + } + + let mut verified: Seen = Seen::new(); + // We need to make sure that we decode the correct number of states. + // Otherwise, an empty set of transitions would validate even if the + // recorded state count is non-empty. + let mut count = 0; + // We can't use the self.states() iterator because it assumes the state + // encodings are valid. It could panic if they aren't. + let mut id = DEAD; + while id.as_usize() < self.sparse().len() { + let state = self.try_state(id)?; + verified.insert(id); + // The next ID should be the offset immediately following `state`. + id = StateID::new(bytes::add( + id.as_usize(), + state.bytes_len(), + "next state ID offset", + )?) + .map_err(|err| { + DeserializeError::state_id_error(err, "next state ID offset") + })?; + count += 1; + + // Now check that all transitions in this state are correct. + for i in 0..state.ntrans { + let to = state.next_at(i); + if verified.contains(&to) { + continue; + } + let _ = self.try_state(to)?; + verified.insert(id); + } + } + if count != self.count { + return Err(DeserializeError::generic( + "mismatching sparse state count", + )); + } + Ok(()) + } + + /// Converts these transitions to a borrowed value. + fn as_ref(&self) -> Transitions<&'_ [u8]> { + Transitions { + sparse: self.sparse(), + classes: self.classes.clone(), + count: self.count, + patterns: self.patterns, + } + } + + /// Converts these transitions to an owned value. + #[cfg(feature = "alloc")] + fn to_owned(&self) -> Transitions> { + Transitions { + sparse: self.sparse().to_vec(), + classes: self.classes.clone(), + count: self.count, + patterns: self.patterns, + } + } + + /// Return a convenient representation of the given state. + /// + /// This panics if the state is invalid. + /// + /// This is marked as inline to help dramatically boost sparse searching, + /// which decodes each state it enters to follow the next transition. Other + /// functions involved are also inlined, which should hopefully eliminate + /// a lot of the extraneous decoding that is never needed just to follow + /// the next transition. + #[inline(always)] + fn state(&self, id: StateID) -> State<'_> { + let mut state = &self.sparse()[id.as_usize()..]; + let mut ntrans = bytes::read_u16(&state) as usize; + let is_match = (1 << 15) & ntrans != 0; + ntrans &= !(1 << 15); + state = &state[2..]; + + let (input_ranges, state) = state.split_at(ntrans * 2); + let (next, state) = state.split_at(ntrans * StateID::SIZE); + let (pattern_ids, state) = if is_match { + let npats = bytes::read_u32(&state) as usize; + state[4..].split_at(npats * 4) + } else { + (&[][..], state) + }; + + let accel_len = state[0] as usize; + let accel = &state[1..accel_len + 1]; + State { id, is_match, ntrans, input_ranges, next, pattern_ids, accel } + } + + /// Like `state`, but will return an error if the state encoding is + /// invalid. 
This is useful for verifying states after deserialization, + /// which is required for a safe deserialization API. + /// + /// Note that this only verifies that this state is decodable and that + /// all of its data is consistent. It does not verify that its state ID + /// transitions point to valid states themselves, nor does it verify that + /// every pattern ID is valid. + fn try_state(&self, id: StateID) -> Result, DeserializeError> { + if id.as_usize() > self.sparse().len() { + return Err(DeserializeError::generic("invalid sparse state ID")); + } + let mut state = &self.sparse()[id.as_usize()..]; + // Encoding format starts with a u16 that stores the total number of + // transitions in this state. + let (mut ntrans, _) = + bytes::try_read_u16_as_usize(state, "state transition count")?; + let is_match = ((1 << 15) & ntrans) != 0; + ntrans &= !(1 << 15); + state = &state[2..]; + if ntrans > 257 || ntrans == 0 { + return Err(DeserializeError::generic("invalid transition count")); + } + + // Each transition has two pieces: an inclusive range of bytes on which + // it is defined, and the state ID that those bytes transition to. The + // pairs come first, followed by a corresponding sequence of state IDs. + let input_ranges_len = ntrans.checked_mul(2).unwrap(); + bytes::check_slice_len(state, input_ranges_len, "sparse byte pairs")?; + let (input_ranges, state) = state.split_at(input_ranges_len); + // Every range should be of the form A-B, where A<=B. + for pair in input_ranges.chunks(2) { + let (start, end) = (pair[0], pair[1]); + if start > end { + return Err(DeserializeError::generic("invalid input range")); + } + } + + // And now extract the corresponding sequence of state IDs. We leave + // this sequence as a &[u8] instead of a &[S] because sparse DFAs do + // not have any alignment requirements. + let next_len = ntrans + .checked_mul(self.id_len()) + .expect("state size * #trans should always fit in a usize"); + bytes::check_slice_len(state, next_len, "sparse trans state IDs")?; + let (next, state) = state.split_at(next_len); + // We can at least verify that every state ID is in bounds. + for idbytes in next.chunks(self.id_len()) { + let (id, _) = + bytes::read_state_id(idbytes, "sparse state ID in try_state")?; + bytes::check_slice_len( + self.sparse(), + id.as_usize(), + "invalid sparse state ID", + )?; + } + + // If this is a match state, then read the pattern IDs for this state. + // Pattern IDs is a u32-length prefixed sequence of native endian + // encoded 32-bit integers. + let (pattern_ids, state) = if is_match { + let (npats, nr) = + bytes::try_read_u32_as_usize(state, "pattern ID count")?; + let state = &state[nr..]; + + let pattern_ids_len = + bytes::mul(npats, 4, "sparse pattern ID byte length")?; + bytes::check_slice_len( + state, + pattern_ids_len, + "sparse pattern IDs", + )?; + let (pattern_ids, state) = state.split_at(pattern_ids_len); + for patbytes in pattern_ids.chunks(PatternID::SIZE) { + bytes::read_pattern_id( + patbytes, + "sparse pattern ID in try_state", + )?; + } + (pattern_ids, state) + } else { + (&[][..], state) + }; + + // Now read this state's accelerator info. The first byte is the length + // of the accelerator, which is typically 0 (for no acceleration) but + // is no bigger than 3. The length indicates the number of bytes that + // follow, where each byte corresponds to a transition out of this + // state. 
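+        //
+        // As a purely illustrative sketch (not taken from any real DFA), a
+        // non-match state with two transitions and no accelerator would be
+        // laid out roughly as:
+        //
+        //   u16      ntrans = 2, with the high bit (the match flag) clear
+        //   4 x u8   input range pairs, one inclusive (start, end) per transition
+        //   2 x u32  native endian state IDs, one per transition
+        //   u8       accelerator length = 0
+        //
+        // A match state would additionally carry a u32 pattern count and that
+        // many u32 pattern IDs between the state IDs and the accelerator
+        // length byte.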
+ if state.is_empty() { + return Err(DeserializeError::generic("no accelerator length")); + } + let (accel_len, state) = (state[0] as usize, &state[1..]); + + if accel_len > 3 { + return Err(DeserializeError::generic( + "sparse invalid accelerator length", + )); + } + bytes::check_slice_len( + state, + accel_len, + "sparse corrupt accelerator length", + )?; + let (accel, _) = (&state[..accel_len], &state[accel_len..]); + + Ok(State { + id, + is_match, + ntrans, + input_ranges, + next, + pattern_ids, + accel, + }) + } + + /// Return an iterator over all of the states in this DFA. + /// + /// The iterator returned yields tuples, where the first element is the + /// state ID and the second element is the state itself. + fn states(&self) -> StateIter<'_, T> { + StateIter { trans: self, id: DEAD.as_usize() } + } + + /// Returns the sparse transitions as raw bytes. + fn sparse(&self) -> &[u8] { + self.sparse.as_ref() + } + + /// Returns the number of bytes represented by a single state ID. + fn id_len(&self) -> usize { + StateID::SIZE + } + + /// Return the memory usage, in bytes, of these transitions. + /// + /// This does not include the size of a `Transitions` value itself. + fn memory_usage(&self) -> usize { + self.sparse().len() + } +} + +#[cfg(feature = "alloc")] +impl> Transitions { + /// Return a convenient mutable representation of the given state. + /// This panics if the state is invalid. + fn state_mut(&mut self, id: StateID) -> StateMut<'_> { + let mut state = &mut self.sparse_mut()[id.as_usize()..]; + let mut ntrans = bytes::read_u16(&state) as usize; + let is_match = (1 << 15) & ntrans != 0; + ntrans &= !(1 << 15); + state = &mut state[2..]; + + let (input_ranges, state) = state.split_at_mut(ntrans * 2); + let (next, state) = state.split_at_mut(ntrans * StateID::SIZE); + let (pattern_ids, state) = if is_match { + let npats = bytes::read_u32(&state) as usize; + state[4..].split_at_mut(npats * 4) + } else { + (&mut [][..], state) + }; + + let accel_len = state[0] as usize; + let accel = &mut state[1..accel_len + 1]; + StateMut { + id, + is_match, + ntrans, + input_ranges, + next, + pattern_ids, + accel, + } + } + + /// Returns the sparse transitions as raw mutable bytes. + fn sparse_mut(&mut self) -> &mut [u8] { + self.sparse.as_mut() + } +} + +/// The set of all possible starting states in a DFA. +/// +/// See the eponymous type in the `dense` module for more details. This type +/// is very similar to `dense::StartTable`, except that its underlying +/// representation is `&[u8]` instead of `&[S]`. (The latter would require +/// sparse DFAs to be aligned, which is explicitly something we do not require +/// because we don't really need it.) +#[derive(Clone)] +struct StartTable { + /// The initial start state IDs as a contiguous table of native endian + /// encoded integers, represented by `S`. + /// + /// In practice, T is either Vec or &[u8] and has no alignment + /// requirements. + /// + /// The first `stride` (currently always 4) entries always correspond to + /// the start states for the entire DFA. After that, there are + /// `stride * patterns` state IDs, where `patterns` may be zero in the + /// case of a DFA with no patterns or in the case where the DFA was built + /// without enabling starting states for each pattern. + table: T, + /// The number of starting state IDs per pattern. + stride: usize, + /// The total number of patterns for which starting states are encoded. 
+ /// This may be zero for non-empty DFAs when the DFA was built without + /// start states for each pattern. + patterns: usize, +} + +#[cfg(feature = "alloc")] +impl StartTable> { + fn new(patterns: usize) -> StartTable> { + let stride = Start::count(); + // This is OK since the only way we're here is if a dense DFA could be + // constructed successfully, which uses the same space. + let len = stride + .checked_mul(patterns) + .unwrap() + .checked_add(stride) + .unwrap() + .checked_mul(StateID::SIZE) + .unwrap(); + StartTable { table: vec![0; len], stride, patterns } + } + + fn from_dense_dfa>( + dfa: &dense::DFA, + remap: &[StateID], + ) -> Result>, Error> { + // Unless the DFA has start states compiled for each pattern, then + // as far as the starting state table is concerned, there are zero + // patterns to account for. It will instead only store starting states + // for the entire DFA. + let start_pattern_count = if dfa.has_starts_for_each_pattern() { + dfa.pattern_count() + } else { + 0 + }; + let mut sl = StartTable::new(start_pattern_count); + for (old_start_id, sty, pid) in dfa.starts() { + let new_start_id = remap[dfa.to_index(old_start_id)]; + sl.set_start(sty, pid, new_start_id); + } + Ok(sl) + } +} + +impl<'a> StartTable<&'a [u8]> { + unsafe fn from_bytes_unchecked( + mut slice: &'a [u8], + ) -> Result<(StartTable<&'a [u8]>, usize), DeserializeError> { + let slice_start = slice.as_ptr() as usize; + + let (stride, nr) = + bytes::try_read_u32_as_usize(slice, "sparse start table stride")?; + slice = &slice[nr..]; + + let (patterns, nr) = bytes::try_read_u32_as_usize( + slice, + "sparse start table patterns", + )?; + slice = &slice[nr..]; + + if stride != Start::count() { + return Err(DeserializeError::generic( + "invalid sparse starting table stride", + )); + } + if patterns > PatternID::LIMIT { + return Err(DeserializeError::generic( + "sparse invalid number of patterns", + )); + } + let pattern_table_size = + bytes::mul(stride, patterns, "sparse invalid pattern count")?; + // Our start states always start with a single stride of start states + // for the entire automaton which permit it to match any pattern. What + // follows it are an optional set of start states for each pattern. + let start_state_count = bytes::add( + stride, + pattern_table_size, + "sparse invalid 'any' pattern starts size", + )?; + let table_bytes_len = bytes::mul( + start_state_count, + StateID::SIZE, + "sparse pattern table bytes length", + )?; + bytes::check_slice_len( + slice, + table_bytes_len, + "sparse start ID table", + )?; + let table_bytes = &slice[..table_bytes_len]; + slice = &slice[table_bytes_len..]; + + let sl = StartTable { table: table_bytes, stride, patterns }; + Ok((sl, slice.as_ptr() as usize - slice_start)) + } +} + +impl> StartTable { + fn write_to( + &self, + mut dst: &mut [u8], + ) -> Result { + let nwrite = self.write_to_len(); + if dst.len() < nwrite { + return Err(SerializeError::buffer_too_small( + "sparse starting table ids", + )); + } + dst = &mut dst[..nwrite]; + + // write stride + E::write_u32(u32::try_from(self.stride).unwrap(), dst); + dst = &mut dst[size_of::()..]; + // write pattern count + E::write_u32(u32::try_from(self.patterns).unwrap(), dst); + dst = &mut dst[size_of::()..]; + // write start IDs + dst.copy_from_slice(self.table()); + Ok(nwrite) + } + + /// Returns the number of bytes the serialized form of this transition + /// table will use. 
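+    ///
+    /// For this start table, that is two u32 header values (the stride and
+    /// the pattern count) followed by the raw bytes of the start state ID
+    /// table itself.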
+ fn write_to_len(&self) -> usize { + size_of::() // stride + + size_of::() // # patterns + + self.table().len() + } + + /// Validates that every starting state ID in this table is valid. + /// + /// That is, every starting state ID can be used to correctly decode a + /// state in the DFA's sparse transitions. + fn validate( + &self, + trans: &Transitions, + ) -> Result<(), DeserializeError> { + for (id, _, _) in self.iter() { + let _ = trans.try_state(id)?; + } + Ok(()) + } + + /// Converts this start list to a borrowed value. + fn as_ref(&self) -> StartTable<&'_ [u8]> { + StartTable { + table: self.table(), + stride: self.stride, + patterns: self.patterns, + } + } + + /// Converts this start list to an owned value. + #[cfg(feature = "alloc")] + fn to_owned(&self) -> StartTable> { + StartTable { + table: self.table().to_vec(), + stride: self.stride, + patterns: self.patterns, + } + } + + /// Return the start state for the given index and pattern ID. If the + /// pattern ID is None, then the corresponding start state for the entire + /// DFA is returned. If the pattern ID is not None, then the corresponding + /// starting state for the given pattern is returned. If this start table + /// does not have individual starting states for each pattern, then this + /// panics. + fn start(&self, index: Start, pattern_id: Option) -> StateID { + let start_index = index.as_usize(); + let index = match pattern_id { + None => start_index, + Some(pid) => { + let pid = pid.as_usize(); + assert!(pid < self.patterns, "invalid pattern ID {:?}", pid); + self.stride + .checked_mul(pid) + .unwrap() + .checked_add(self.stride) + .unwrap() + .checked_add(start_index) + .unwrap() + } + }; + let start = index * StateID::SIZE; + // This OK since we're allowed to assume that the start table contains + // valid StateIDs. + bytes::read_state_id_unchecked(&self.table()[start..]).0 + } + + /// Return an iterator over all start IDs in this table. + fn iter(&self) -> StartStateIter<'_, T> { + StartStateIter { st: self, i: 0 } + } + + /// Returns the total number of start state IDs in this table. + fn len(&self) -> usize { + self.table().len() / StateID::SIZE + } + + /// Returns the table as a raw slice of bytes. + fn table(&self) -> &[u8] { + self.table.as_ref() + } + + /// Return the memory usage, in bytes, of this start list. + /// + /// This does not include the size of a `StartTable` value itself. + fn memory_usage(&self) -> usize { + self.table().len() + } +} + +#[cfg(feature = "alloc")] +impl> StartTable { + /// Set the start state for the given index and pattern. + /// + /// If the pattern ID or state ID are not valid, then this will panic. + fn set_start( + &mut self, + index: Start, + pattern_id: Option, + id: StateID, + ) { + let start_index = index.as_usize(); + let index = match pattern_id { + None => start_index, + Some(pid) => { + let pid = pid.as_usize(); + assert!(pid < self.patterns, "invalid pattern ID {:?}", pid); + self.stride + .checked_mul(pid) + .unwrap() + .checked_add(self.stride) + .unwrap() + .checked_add(start_index) + .unwrap() + } + }; + let start = index * StateID::SIZE; + let end = start + StateID::SIZE; + bytes::write_state_id::( + id, + &mut self.table.as_mut()[start..end], + ); + } +} + +/// An iterator over all state state IDs in a sparse DFA. 
+struct StartStateIter<'a, T> { + st: &'a StartTable, + i: usize, +} + +impl<'a, T: AsRef<[u8]>> Iterator for StartStateIter<'a, T> { + type Item = (StateID, Start, Option); + + fn next(&mut self) -> Option<(StateID, Start, Option)> { + let i = self.i; + if i >= self.st.len() { + return None; + } + self.i += 1; + + // This unwrap is okay since the stride of any DFA must always match + // the number of start state types. + let start_type = Start::from_usize(i % self.st.stride).unwrap(); + let pid = if i < self.st.stride { + // This means we don't have start states for each pattern. + None + } else { + // These unwraps are OK since we may assume our table and stride + // is correct. + let pid = i + .checked_sub(self.st.stride) + .unwrap() + .checked_div(self.st.stride) + .unwrap(); + Some(PatternID::new(pid).unwrap()) + }; + let start = i * StateID::SIZE; + let end = start + StateID::SIZE; + let bytes = self.st.table()[start..end].try_into().unwrap(); + // This is OK since we're allowed to assume that any IDs in this start + // table are correct and valid for this DFA. + let id = StateID::from_ne_bytes_unchecked(bytes); + Some((id, start_type, pid)) + } +} + +impl<'a, T> fmt::Debug for StartStateIter<'a, T> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + f.debug_struct("StartStateIter").field("i", &self.i).finish() + } +} + +/// An iterator over all states in a sparse DFA. +/// +/// This iterator yields tuples, where the first element is the state ID and +/// the second element is the state itself. +struct StateIter<'a, T> { + trans: &'a Transitions, + id: usize, +} + +impl<'a, T: AsRef<[u8]>> Iterator for StateIter<'a, T> { + type Item = State<'a>; + + fn next(&mut self) -> Option> { + if self.id >= self.trans.sparse().len() { + return None; + } + let state = self.trans.state(StateID::new_unchecked(self.id)); + self.id = self.id + state.bytes_len(); + Some(state) + } +} + +impl<'a, T> fmt::Debug for StateIter<'a, T> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + f.debug_struct("StateIter").field("id", &self.id).finish() + } +} + +/// A representation of a sparse DFA state that can be cheaply materialized +/// from a state identifier. +#[derive(Clone)] +struct State<'a> { + /// The identifier of this state. + id: StateID, + /// Whether this is a match state or not. + is_match: bool, + /// The number of transitions in this state. + ntrans: usize, + /// Pairs of input ranges, where there is one pair for each transition. + /// Each pair specifies an inclusive start and end byte range for the + /// corresponding transition. + input_ranges: &'a [u8], + /// Transitions to the next state. This slice contains native endian + /// encoded state identifiers, with `S` as the representation. Thus, there + /// are `ntrans * size_of::()` bytes in this slice. + next: &'a [u8], + /// If this is a match state, then this contains the pattern IDs that match + /// when the DFA is in this state. + /// + /// This is a contiguous sequence of 32-bit native endian encoded integers. + pattern_ids: &'a [u8], + /// An accelerator for this state, if present. If this state has no + /// accelerator, then this is an empty slice. When non-empty, this slice + /// has length at most 3 and corresponds to the exhaustive set of bytes + /// that must be seen in order to transition out of this state. + accel: &'a [u8], +} + +impl<'a> State<'a> { + /// Searches for the next transition given an input byte. If no such + /// transition could be found, then a dead state is returned. 
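+    ///
+    /// As a purely illustrative example, if a state's byte transitions were
+    /// `a-a => 5` and `c-z => 7` (with the final transition reserved for
+    /// EOI), then `next(b'd')` would scan the range pairs in order, skip
+    /// `a-a`, match `c-z` and return state `7`, while `next(b'!')` would
+    /// fall through every pair and return the dead state.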
+ /// + /// This is marked as inline to help dramatically boost sparse searching, + /// which decodes each state it enters to follow the next transition. + #[inline(always)] + fn next(&self, input: u8) -> StateID { + // This straight linear search was observed to be much better than + // binary search on ASCII haystacks, likely because a binary search + // visits the ASCII case last but a linear search sees it first. A + // binary search does do a little better on non-ASCII haystacks, but + // not by much. There might be a better trade off lurking here. + for i in 0..(self.ntrans - 1) { + let (start, end) = self.range(i); + if start <= input && input <= end { + return self.next_at(i); + } + // We could bail early with an extra branch: if input < b1, then + // we know we'll never find a matching transition. Interestingly, + // this extra branch seems to not help performance, or will even + // hurt it. It's likely very dependent on the DFA itself and what + // is being searched. + } + DEAD + } + + /// Returns the next state ID for the special EOI transition. + fn next_eoi(&self) -> StateID { + self.next_at(self.ntrans - 1) + } + + /// Returns the identifier for this state. + fn id(&self) -> StateID { + self.id + } + + /// Returns the inclusive input byte range for the ith transition in this + /// state. + fn range(&self, i: usize) -> (u8, u8) { + (self.input_ranges[i * 2], self.input_ranges[i * 2 + 1]) + } + + /// Returns the next state for the ith transition in this state. + fn next_at(&self, i: usize) -> StateID { + let start = i * StateID::SIZE; + let end = start + StateID::SIZE; + let bytes = self.next[start..end].try_into().unwrap(); + StateID::from_ne_bytes_unchecked(bytes) + } + + /// Returns the pattern ID for the given match index. If the match index + /// is invalid, then this panics. + fn pattern_id(&self, match_index: usize) -> PatternID { + let start = match_index * PatternID::SIZE; + bytes::read_pattern_id_unchecked(&self.pattern_ids[start..]).0 + } + + /// Returns the total number of pattern IDs for this state. This is always + /// zero when `is_match` is false. + fn pattern_count(&self) -> usize { + assert_eq!(0, self.pattern_ids.len() % 4); + self.pattern_ids.len() / 4 + } + + /// Return the total number of bytes that this state consumes in its + /// encoded form. + fn bytes_len(&self) -> usize { + let mut len = 2 + + (self.ntrans * 2) + + (self.ntrans * StateID::SIZE) + + (1 + self.accel.len()); + if self.is_match { + len += size_of::() + self.pattern_ids.len(); + } + len + } + + /// Return an accelerator for this state. + fn accelerator(&self) -> &'a [u8] { + self.accel + } +} + +impl<'a> fmt::Debug for State<'a> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let mut printed = false; + for i in 0..(self.ntrans - 1) { + let next = self.next_at(i); + if next == DEAD { + continue; + } + + if printed { + write!(f, ", ")?; + } + let (start, end) = self.range(i); + if start == end { + write!(f, "{:?} => {:?}", DebugByte(start), next)?; + } else { + write!( + f, + "{:?}-{:?} => {:?}", + DebugByte(start), + DebugByte(end), + next, + )?; + } + printed = true; + } + let eoi = self.next_at(self.ntrans - 1); + if eoi != DEAD { + if printed { + write!(f, ", ")?; + } + write!(f, "EOI => {:?}", eoi)?; + } + Ok(()) + } +} + +/// A representation of a mutable sparse DFA state that can be cheaply +/// materialized from a state identifier. +#[cfg(feature = "alloc")] +struct StateMut<'a> { + /// The identifier of this state. 
+ id: StateID, + /// Whether this is a match state or not. + is_match: bool, + /// The number of transitions in this state. + ntrans: usize, + /// Pairs of input ranges, where there is one pair for each transition. + /// Each pair specifies an inclusive start and end byte range for the + /// corresponding transition. + input_ranges: &'a mut [u8], + /// Transitions to the next state. This slice contains native endian + /// encoded state identifiers, with `S` as the representation. Thus, there + /// are `ntrans * size_of::()` bytes in this slice. + next: &'a mut [u8], + /// If this is a match state, then this contains the pattern IDs that match + /// when the DFA is in this state. + /// + /// This is a contiguous sequence of 32-bit native endian encoded integers. + pattern_ids: &'a [u8], + /// An accelerator for this state, if present. If this state has no + /// accelerator, then this is an empty slice. When non-empty, this slice + /// has length at most 3 and corresponds to the exhaustive set of bytes + /// that must be seen in order to transition out of this state. + accel: &'a mut [u8], +} + +#[cfg(feature = "alloc")] +impl<'a> StateMut<'a> { + /// Sets the ith transition to the given state. + fn set_next_at(&mut self, i: usize, next: StateID) { + let start = i * StateID::SIZE; + let end = start + StateID::SIZE; + bytes::write_state_id::(next, &mut self.next[start..end]); + } +} + +#[cfg(feature = "alloc")] +impl<'a> fmt::Debug for StateMut<'a> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let state = State { + id: self.id, + is_match: self.is_match, + ntrans: self.ntrans, + input_ranges: self.input_ranges, + next: self.next, + pattern_ids: self.pattern_ids, + accel: self.accel, + }; + fmt::Debug::fmt(&state, f) + } +} + +/// A binary search routine specialized specifically to a sparse DFA state's +/// transitions. Specifically, the transitions are defined as a set of pairs +/// of input bytes that delineate an inclusive range of bytes. If the input +/// byte is in the range, then the corresponding transition is a match. +/// +/// This binary search accepts a slice of these pairs and returns the position +/// of the matching pair (the ith transition), or None if no matching pair +/// could be found. +/// +/// Note that this routine is not currently used since it was observed to +/// either decrease performance when searching ASCII, or did not provide enough +/// of a boost on non-ASCII haystacks to be worth it. However, we leave it here +/// for posterity in case we can find a way to use it. +/// +/// In theory, we could use the standard library's search routine if we could +/// cast a `&[u8]` to a `&[(u8, u8)]`, but I don't believe this is currently +/// guaranteed to be safe and is thus UB (since I don't think the in-memory +/// representation of `(u8, u8)` has been nailed down). One could define a +/// repr(C) type, but the casting doesn't seem justified. 
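+///
+/// As a purely illustrative example of the expected layout, the range pairs
+/// for transitions `a-a` and `c-z` would be packed as
+/// `&[b'a', b'a', b'c', b'z']`; `binary_search_ranges(ranges, b'd')` would
+/// then return `Some(1)` (the index of the `c-z` pair), while a needle of
+/// `b'!'` would return `None`.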
+#[allow(dead_code)] +#[inline(always)] +fn binary_search_ranges(ranges: &[u8], needle: u8) -> Option { + debug_assert!(ranges.len() % 2 == 0, "ranges must have even length"); + debug_assert!(ranges.len() <= 512, "ranges should be short"); + + let (mut left, mut right) = (0, ranges.len() / 2); + while left < right { + let mid = (left + right) / 2; + let (b1, b2) = (ranges[mid * 2], ranges[mid * 2 + 1]); + if needle < b1 { + right = mid; + } else if needle > b2 { + left = mid + 1; + } else { + return Some(mid); + } + } + None +} diff --git a/vendor/regex-automata/src/dfa/special.rs b/vendor/regex-automata/src/dfa/special.rs new file mode 100644 index 000000000..3db95a707 --- /dev/null +++ b/vendor/regex-automata/src/dfa/special.rs @@ -0,0 +1,477 @@ +use crate::{ + dfa::DEAD, + util::{ + bytes::{self, DeserializeError, Endian, SerializeError}, + id::StateID, + }, +}; + +macro_rules! err { + ($msg:expr) => { + return Err(DeserializeError::generic($msg)); + }; +} + +// Special represents the identifiers in a DFA that correspond to "special" +// states. If a state is one or more of the following, then it is considered +// special: +// +// * dead - A non-matching state where all outgoing transitions lead back to +// itself. There is only one of these, regardless of whether minimization +// has run. The dead state always has an ID of 0. i.e., It is always the +// first state in a DFA. +// * quit - A state that is entered whenever a byte is seen that should cause +// a DFA to give up and stop searching. This results in a MatchError::Quit +// error being returned at search time. The default configuration for a DFA +// has no quit bytes, which means this state is unreachable by default, +// although it is always present for reasons of implementation simplicity. +// This state is only reachable when the caller configures the DFA to quit +// on certain bytes. There is always exactly one of these states and it +// is always the second state. (Its actual ID depends on the size of the +// alphabet in dense DFAs, since state IDs are premultiplied in order to +// allow them to be used directly as indices into the transition table.) +// * match - An accepting state, i.e., indicative of a match. There may be +// zero or more of these states. +// * accelerated - A state where all of its outgoing transitions, except a +// few, loop back to itself. These states are candidates for acceleration +// via memchr during search. There may be zero or more of these states. +// * start - A non-matching state that indicates where the automaton should +// start during a search. There is always at least one starting state and +// all are guaranteed to be non-match states. (A start state cannot be a +// match state because the DFAs in this crate delay all matches by one byte. +// So every search that finds a match must move through one transition to +// some other match state, even when searching an empty string.) +// +// These are not mutually exclusive categories. Namely, the following +// overlappings can occur: +// +// * {dead, start} - If a DFA can never lead to a match and it is minimized, +// then it will typically compile to something where all starting IDs point +// to the DFA's dead state. +// * {match, accelerated} - It is possible for a match state to have the +// majority of its transitions loop back to itself, which means it's +// possible for a match state to be accelerated. +// * {start, accelerated} - Similarly, it is possible for a start state to be +// accelerated. 
Note that it is possible for an accelerated state to be
+// neither a match nor a start state. Also note that just because both match
+// and start states overlap with accelerated states does not mean that
+// match and start states overlap with each other. In fact, they are
+// guaranteed not to overlap.
+//
+// As a special mention, every DFA always has a dead and a quit state, even
+// though from the perspective of the DFA, they are equivalent. (Indeed,
+// minimization special cases them to ensure they don't get merged.) The
+// purpose of keeping them distinct is to use the quit state as a sentinel to
+// distinguish between whether a search finished successfully without finding
+// anything or whether it gave up before finishing.
+//
+// So the main problem we want to solve here is the *fast* detection of whether
+// a state is special or not. And we also want to do this while storing as
+// little extra data as possible. AND we want to be able to quickly determine
+// which of the categories above a state falls into if it is special.
+//
+// We achieve this by essentially shuffling all special states to the beginning
+// of a DFA. That is, all special states appear before every other non-special
+// state. By representing special states this way, we can determine whether a
+// state is special or not by a single comparison, where special.max is the
+// identifier of the last special state in the DFA:
+//
+//     if current_state <= special.max:
+//         ... do something with special state
+//
+// The only thing left to do is to determine what kind of special state
+// it is, because what we do next depends on that. Since special states
+// are typically rare, we can afford to do a bit more extra work, but we'd
+// still like this to be as fast as possible. The trick we employ here is to
+// continue shuffling states even within the special state range, such that
+// one contiguous region corresponds to match states, another to start states
+// and then an overlapping range to accelerated states. At a high level, our
+// special state detection might look like this (for leftmost searching, where
+// we continue searching even after seeing a match):
+//
+//     byte = input[offset]
+//     current_state = next_state(current_state, byte)
+//     offset += 1
+//     if current_state <= special.max:
+//         if current_state == 0:
+//             # We can never leave a dead state, so this always marks the
+//             # end of our search.
+//             return last_match
+//         if current_state == special.quit_id:
+//             # A quit state means we give up. If the DFA has no quit state,
+//             # then special.quit_id == 0 == dead, which is handled by the
+//             # conditional above.
+//             return Err(MatchError::Quit { byte, offset: offset - 1 })
+//         if special.min_match <= current_state <= special.max_match:
+//             last_match = Some(offset)
+//             if special.min_accel <= current_state <= special.max_accel:
+//                 offset = accelerate(input, offset)
+//                 last_match = Some(offset)
+//         elif special.min_start <= current_state <= special.max_start:
+//             offset = prefilter.find(input, offset)
+//             if special.min_accel <= current_state <= special.max_accel:
+//                 offset = accelerate(input, offset)
+//         elif special.min_accel <= current_state <= special.max_accel:
+//             offset = accelerate(input, offset)
+//
+// There are some small details left out of the logic above. For example,
+// in order to accelerate a state, we need to know which bytes to search for.
+// This in turn implies some extra data we need to store in the DFA.
To keep +// things compact, we would ideally only store +// +// N = special.max_accel - special.min_accel + 1 +// +// items. But state IDs are premultiplied, which means they are not contiguous. +// So in order to take a state ID and index an array of accelerated structures, +// we need to do: +// +// i = (state_id - special.min_accel) / stride +// +// (N.B. 'stride' is always a power of 2, so the above can be implemented via +// '(state_id - special.min_accel) >> stride2', where 'stride2' is x in +// 2^x=stride.) +// +// Moreover, some of these specialty categories may be empty. For example, +// DFAs are not required to have any match states or any accelerated states. +// In that case, the lower and upper bounds are both set to 0 (the dead state +// ID) and the first `current_state == 0` check subsumes cases where the +// ranges are empty. +// +// Loop unrolling, if applicable, has also been left out of the logic above. +// +// Graphically, the ranges look like this, where asterisks indicate ranges +// that can be empty. Each 'x' is a state. +// +// quit +// dead| +// || +// xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx +// | | | | start | | +// | |-------------| |-------| | +// | match* | | | | +// | | | | | +// | |----------| | | +// | accel* | | +// | | | +// | | | +// |----------------------------|------------------------ +// special non-special* +#[derive(Clone, Copy, Debug)] +pub struct Special { + /// The identifier of the last special state in a DFA. A state is special + /// if and only if its identifier is less than or equal to `max`. + pub max: StateID, + /// The identifier of the quit state in a DFA. (There is no analogous field + /// for the dead state since the dead state's ID is always zero, regardless + /// of state ID size.) + pub quit_id: StateID, + /// The identifier of the first match state. + pub min_match: StateID, + /// The identifier of the last match state. + pub max_match: StateID, + /// The identifier of the first accelerated state. + pub min_accel: StateID, + /// The identifier of the last accelerated state. + pub max_accel: StateID, + /// The identifier of the first start state. + pub min_start: StateID, + /// The identifier of the last start state. + pub max_start: StateID, +} + +impl Special { + /// Creates a new set of special ranges for a DFA. All ranges are initially + /// set to only contain the dead state. This is interpreted as an empty + /// range. + #[cfg(feature = "alloc")] + pub fn new() -> Special { + Special { + max: DEAD, + quit_id: DEAD, + min_match: DEAD, + max_match: DEAD, + min_accel: DEAD, + max_accel: DEAD, + min_start: DEAD, + max_start: DEAD, + } + } + + /// Remaps all of the special state identifiers using the function given. + #[cfg(feature = "alloc")] + pub fn remap(&self, map: impl Fn(StateID) -> StateID) -> Special { + Special { + max: map(self.max), + quit_id: map(self.quit_id), + min_match: map(self.min_match), + max_match: map(self.max_match), + min_accel: map(self.min_accel), + max_accel: map(self.max_accel), + min_start: map(self.min_start), + max_start: map(self.max_start), + } + } + + /// Deserialize the given bytes into special state ranges. If the slice + /// given is not big enough, then this returns an error. Similarly, if + /// any of the expected invariants around special state ranges aren't + /// upheld, an error is returned. Note that this does not guarantee that + /// the information returned is correct. 
+ /// + /// Upon success, this returns the number of bytes read in addition to the + /// special state IDs themselves. + pub fn from_bytes( + mut slice: &[u8], + ) -> Result<(Special, usize), DeserializeError> { + bytes::check_slice_len(slice, 8 * StateID::SIZE, "special states")?; + + let mut nread = 0; + let mut read_id = |what| -> Result { + let (id, nr) = bytes::try_read_state_id(slice, what)?; + nread += nr; + slice = &slice[StateID::SIZE..]; + Ok(id) + }; + + let max = read_id("special max id")?; + let quit_id = read_id("special quit id")?; + let min_match = read_id("special min match id")?; + let max_match = read_id("special max match id")?; + let min_accel = read_id("special min accel id")?; + let max_accel = read_id("special max accel id")?; + let min_start = read_id("special min start id")?; + let max_start = read_id("special max start id")?; + + let special = Special { + max, + quit_id, + min_match, + max_match, + min_accel, + max_accel, + min_start, + max_start, + }; + special.validate()?; + assert_eq!(nread, special.write_to_len()); + Ok((special, nread)) + } + + /// Validate that the information describing special states satisfies + /// all known invariants. + pub fn validate(&self) -> Result<(), DeserializeError> { + // Check that both ends of the range are DEAD or neither are. + if self.min_match == DEAD && self.max_match != DEAD { + err!("min_match is DEAD, but max_match is not"); + } + if self.min_match != DEAD && self.max_match == DEAD { + err!("max_match is DEAD, but min_match is not"); + } + if self.min_accel == DEAD && self.max_accel != DEAD { + err!("min_accel is DEAD, but max_accel is not"); + } + if self.min_accel != DEAD && self.max_accel == DEAD { + err!("max_accel is DEAD, but min_accel is not"); + } + if self.min_start == DEAD && self.max_start != DEAD { + err!("min_start is DEAD, but max_start is not"); + } + if self.min_start != DEAD && self.max_start == DEAD { + err!("max_start is DEAD, but min_start is not"); + } + + // Check that ranges are well formed. + if self.min_match > self.max_match { + err!("min_match should not be greater than max_match"); + } + if self.min_accel > self.max_accel { + err!("min_accel should not be greater than max_accel"); + } + if self.min_start > self.max_start { + err!("min_start should not be greater than max_start"); + } + + // Check that ranges are ordered with respect to one another. + if self.matches() && self.quit_id >= self.min_match { + err!("quit_id should not be greater than min_match"); + } + if self.accels() && self.quit_id >= self.min_accel { + err!("quit_id should not be greater than min_accel"); + } + if self.starts() && self.quit_id >= self.min_start { + err!("quit_id should not be greater than min_start"); + } + if self.matches() && self.accels() && self.min_accel < self.min_match { + err!("min_match should not be greater than min_accel"); + } + if self.matches() && self.starts() && self.min_start < self.min_match { + err!("min_match should not be greater than min_start"); + } + if self.accels() && self.starts() && self.min_start < self.min_accel { + err!("min_accel should not be greater than min_start"); + } + + // Check that max is at least as big as everything else. 
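+        //
+        // (As a summary of the intent, not an additional check: for whichever
+        // categories are present, the checks in this function amount to
+        // requiring
+        //
+        //     quit_id < min_match <= min_accel <= min_start
+        //
+        // with `max` at least as large as every other ID recorded here, and
+        // with empty categories collapsed to DEAD and skipped.)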
+ if self.max < self.quit_id { + err!("quit_id should not be greater than max"); + } + if self.max < self.max_match { + err!("max_match should not be greater than max"); + } + if self.max < self.max_accel { + err!("max_accel should not be greater than max"); + } + if self.max < self.max_start { + err!("max_start should not be greater than max"); + } + + Ok(()) + } + + /// Validate that the special state information is compatible with the + /// given state count. + pub fn validate_state_count( + &self, + count: usize, + stride2: usize, + ) -> Result<(), DeserializeError> { + // We assume that 'validate' has already passed, so we know that 'max' + // is truly the max. So all we need to check is that the max state + // ID is less than the state ID count. The max legal value here is + // count-1, which occurs when there are no non-special states. + if (self.max.as_usize() >> stride2) >= count { + err!("max should not be greater than or equal to state count"); + } + Ok(()) + } + + /// Write the IDs and ranges for special states to the given byte buffer. + /// The buffer given must have enough room to store all data, otherwise + /// this will return an error. The number of bytes written is returned + /// on success. The number of bytes written is guaranteed to be a multiple + /// of 8. + pub fn write_to( + &self, + dst: &mut [u8], + ) -> Result { + use crate::util::bytes::write_state_id as write; + + if dst.len() < self.write_to_len() { + return Err(SerializeError::buffer_too_small("special state ids")); + } + + let mut nwrite = 0; + nwrite += write::(self.max, &mut dst[nwrite..]); + nwrite += write::(self.quit_id, &mut dst[nwrite..]); + nwrite += write::(self.min_match, &mut dst[nwrite..]); + nwrite += write::(self.max_match, &mut dst[nwrite..]); + nwrite += write::(self.min_accel, &mut dst[nwrite..]); + nwrite += write::(self.max_accel, &mut dst[nwrite..]); + nwrite += write::(self.min_start, &mut dst[nwrite..]); + nwrite += write::(self.max_start, &mut dst[nwrite..]); + + assert_eq!( + self.write_to_len(), + nwrite, + "expected to write certain number of bytes", + ); + assert_eq!( + nwrite % 8, + 0, + "expected to write multiple of 8 bytes for special states", + ); + Ok(nwrite) + } + + /// Returns the total number of bytes written by `write_to`. + pub fn write_to_len(&self) -> usize { + 8 * StateID::SIZE + } + + /// Sets the maximum special state ID based on the current values. This + /// should be used once all possible state IDs are set. + #[cfg(feature = "alloc")] + pub fn set_max(&mut self) { + use core::cmp::max; + self.max = max( + self.quit_id, + max(self.max_match, max(self.max_accel, self.max_start)), + ); + } + + /// Returns true if and only if the given state ID is a special state. + #[inline] + pub fn is_special_state(&self, id: StateID) -> bool { + id <= self.max + } + + /// Returns true if and only if the given state ID is a dead state. + #[inline] + pub fn is_dead_state(&self, id: StateID) -> bool { + id == DEAD + } + + /// Returns true if and only if the given state ID is a quit state. + #[inline] + pub fn is_quit_state(&self, id: StateID) -> bool { + !self.is_dead_state(id) && self.quit_id == id + } + + /// Returns true if and only if the given state ID is a match state. + #[inline] + pub fn is_match_state(&self, id: StateID) -> bool { + !self.is_dead_state(id) && self.min_match <= id && id <= self.max_match + } + + /// Returns true if and only if the given state ID is an accel state. 
+ #[inline] + pub fn is_accel_state(&self, id: StateID) -> bool { + !self.is_dead_state(id) && self.min_accel <= id && id <= self.max_accel + } + + /// Returns true if and only if the given state ID is a start state. + #[inline] + pub fn is_start_state(&self, id: StateID) -> bool { + !self.is_dead_state(id) && self.min_start <= id && id <= self.max_start + } + + /// Returns the total number of match states for a dense table based DFA. + #[inline] + pub fn match_len(&self, stride: usize) -> usize { + if self.matches() { + (self.max_match.as_usize() - self.min_match.as_usize() + stride) + / stride + } else { + 0 + } + } + + /// Returns true if and only if there is at least one match state. + #[inline] + pub fn matches(&self) -> bool { + self.min_match != DEAD + } + + /// Returns the total number of accel states. + #[cfg(feature = "alloc")] + pub fn accel_len(&self, stride: usize) -> usize { + if self.accels() { + (self.max_accel.as_usize() - self.min_accel.as_usize() + stride) + / stride + } else { + 0 + } + } + + /// Returns true if and only if there is at least one accel state. + #[inline] + pub fn accels(&self) -> bool { + self.min_accel != DEAD + } + + /// Returns true if and only if there is at least one start state. + #[inline] + pub fn starts(&self) -> bool { + self.min_start != DEAD + } +} diff --git a/vendor/regex-automata/src/dfa/transducer.rs b/vendor/regex-automata/src/dfa/transducer.rs new file mode 100644 index 000000000..58b34e00a --- /dev/null +++ b/vendor/regex-automata/src/dfa/transducer.rs @@ -0,0 +1,207 @@ +use crate::{ + dfa::{automaton::Automaton, dense, sparse}, + util::id::StateID, +}; + +impl> fst::Automaton for dense::DFA { + type State = StateID; + + #[inline] + fn start(&self) -> StateID { + self.start_state_forward(None, &[], 0, 0) + } + + #[inline] + fn is_match(&self, state: &StateID) -> bool { + self.is_match_state(*state) + } + + #[inline] + fn accept(&self, state: &StateID, byte: u8) -> StateID { + if fst::Automaton::is_match(self, state) { + return *state; + } + self.next_state(*state, byte) + } + + #[inline] + fn accept_eof(&self, state: &StateID) -> Option { + if fst::Automaton::is_match(self, state) { + return Some(*state); + } + Some(self.next_eoi_state(*state)) + } + + #[inline] + fn can_match(&self, state: &StateID) -> bool { + !self.is_dead_state(*state) + } +} + +impl> fst::Automaton for sparse::DFA { + type State = StateID; + + #[inline] + fn start(&self) -> StateID { + self.start_state_forward(None, &[], 0, 0) + } + + #[inline] + fn is_match(&self, state: &StateID) -> bool { + self.is_match_state(*state) + } + + #[inline] + fn accept(&self, state: &StateID, byte: u8) -> StateID { + if fst::Automaton::is_match(self, state) { + return *state; + } + self.next_state(*state, byte) + } + + #[inline] + fn accept_eof(&self, state: &StateID) -> Option { + if fst::Automaton::is_match(self, state) { + return Some(*state); + } + Some(self.next_eoi_state(*state)) + } + + #[inline] + fn can_match(&self, state: &StateID) -> bool { + !self.is_dead_state(*state) + } +} + +#[cfg(test)] +mod tests { + use bstr::BString; + use fst::{Automaton, IntoStreamer, Set, Streamer}; + + use crate::dfa::{dense, sparse}; + + fn search>( + set: &Set, + aut: A, + ) -> Vec { + let mut stream = set.search(aut).into_stream(); + + let mut results = vec![]; + while let Some(key) = stream.next() { + results.push(BString::from(key)); + } + results + } + + #[test] + fn dense_anywhere() { + let set = + Set::from_iter(&["a", "bar", "baz", "wat", "xba", "xbax", "z"]) + .unwrap(); + let dfa = 
dense::DFA::new("ba.*").unwrap(); + let got = search(&set, &dfa); + assert_eq!(got, vec!["bar", "baz", "xba", "xbax"]); + } + + #[test] + fn dense_anchored() { + let set = + Set::from_iter(&["a", "bar", "baz", "wat", "xba", "xbax", "z"]) + .unwrap(); + let dfa = dense::Builder::new() + .configure(dense::Config::new().anchored(true)) + .build("ba.*") + .unwrap(); + let got = search(&set, &dfa); + assert_eq!(got, vec!["bar", "baz"]); + } + + #[test] + fn dense_assertions_start() { + let set = + Set::from_iter(&["a", "bar", "baz", "wat", "xba", "xbax", "z"]) + .unwrap(); + let dfa = dense::Builder::new().build("^ba.*").unwrap(); + let got = search(&set, &dfa); + assert_eq!(got, vec!["bar", "baz"]); + } + + #[test] + fn dense_assertions_end() { + let set = + Set::from_iter(&["a", "bar", "bax", "wat", "xba", "xbax", "z"]) + .unwrap(); + let dfa = dense::Builder::new().build(".*x$").unwrap(); + let got = search(&set, &dfa); + assert_eq!(got, vec!["bax", "xbax"]); + } + + #[test] + fn dense_assertions_word() { + let set = + Set::from_iter(&["foo", "foox", "xfoo", "zzz foo zzz"]).unwrap(); + let dfa = dense::Builder::new().build(r"(?-u)\bfoo\b").unwrap(); + let got = search(&set, &dfa); + assert_eq!(got, vec!["foo", "zzz foo zzz"]); + } + + #[test] + fn sparse_anywhere() { + let set = + Set::from_iter(&["a", "bar", "baz", "wat", "xba", "xbax", "z"]) + .unwrap(); + let dfa = sparse::DFA::new("ba.*").unwrap(); + let got = search(&set, &dfa); + assert_eq!(got, vec!["bar", "baz", "xba", "xbax"]); + } + + #[test] + fn sparse_anchored() { + let set = + Set::from_iter(&["a", "bar", "baz", "wat", "xba", "xbax", "z"]) + .unwrap(); + let dfa = dense::Builder::new() + .configure(dense::Config::new().anchored(true)) + .build("ba.*") + .unwrap() + .to_sparse() + .unwrap(); + let got = search(&set, &dfa); + assert_eq!(got, vec!["bar", "baz"]); + } + + #[test] + fn sparse_assertions_start() { + let set = + Set::from_iter(&["a", "bar", "baz", "wat", "xba", "xbax", "z"]) + .unwrap(); + let dfa = + dense::Builder::new().build("^ba.*").unwrap().to_sparse().unwrap(); + let got = search(&set, &dfa); + assert_eq!(got, vec!["bar", "baz"]); + } + + #[test] + fn sparse_assertions_end() { + let set = + Set::from_iter(&["a", "bar", "bax", "wat", "xba", "xbax", "z"]) + .unwrap(); + let dfa = + dense::Builder::new().build(".*x$").unwrap().to_sparse().unwrap(); + let got = search(&set, &dfa); + assert_eq!(got, vec!["bax", "xbax"]); + } + + #[test] + fn sparse_assertions_word() { + let set = + Set::from_iter(&["foo", "foox", "xfoo", "zzz foo zzz"]).unwrap(); + let dfa = dense::Builder::new() + .build(r"(?-u)\bfoo\b") + .unwrap() + .to_sparse() + .unwrap(); + let got = search(&set, &dfa); + assert_eq!(got, vec!["foo", "zzz foo zzz"]); + } +} -- cgit v1.2.3