diff options
Diffstat (limited to 'third_party/rust/regex-automata/src')
70 files changed, 62740 insertions, 0 deletions
diff --git a/third_party/rust/regex-automata/src/dfa/accel.rs b/third_party/rust/regex-automata/src/dfa/accel.rs new file mode 100644 index 0000000000..5ea2423dd0 --- /dev/null +++ b/third_party/rust/regex-automata/src/dfa/accel.rs @@ -0,0 +1,516 @@ +// This module defines some core types for dealing with accelerated DFA states. +// Briefly, a DFA state can be "accelerated" if all of its transitions except +// for a few loop back to itself. This directly implies that the only way out +// of such a state is if a byte corresponding to one of those non-loopback +// transitions is found. Such states are often found in simple repetitions in +// non-Unicode regexes. For example, consider '(?-u)[^a]+a'. We can look at its +// DFA with regex-cli: +// +// $ regex-cli debug dfa dense '(?-u)[^a]+a' -BbC +// dense::DFA( +// D 000000: +// Q 000001: +// *000002: +// A 000003: \x00-` => 3, a => 5, b-\xFF => 3 +// >000004: \x00-` => 3, a => 4, b-\xFF => 3 +// 000005: \x00-\xFF => 2, EOI => 2 +// ) +// +// In particular, state 3 is accelerated (shown via the 'A' indicator) since +// the only way to leave that state once entered is to see an 'a' byte. If +// there is a long run of non-'a' bytes, then using something like 'memchr' +// to find the next 'a' byte can be significantly faster than just using the +// standard byte-at-a-time state machine. +// +// Unfortunately, this optimization rarely applies when Unicode is enabled. +// For example, patterns like '[^a]' don't actually match any byte that isn't +// 'a', but rather, any UTF-8 encoding of a Unicode scalar value that isn't +// 'a'. This makes the state machine much more complex---far beyond a single +// state---and removes the ability to easily accelerate it. (Because if the +// machine sees a non-UTF-8 sequence, then the machine won't match through it.) +// +// In practice, we only consider accelerating states that have 3 or fewer +// non-loop transitions. At a certain point, you get diminishing returns, but +// also because that's what the memchr crate supports. The structures below +// hard-code this assumption and provide (de)serialization APIs for use inside +// a DFA. +// +// And finally, note that there is some trickery involved in making it very +// fast to not only check whether a state is accelerated at search time, but +// also to access the bytes to search for to implement the acceleration itself. +// dfa/special.rs provides more detail, but the short story is that all +// accelerated states appear contiguously in a DFA. This means we can represent +// the ID space of all accelerated DFA states with a single range. So given +// a state ID, we can determine whether it's accelerated via +// +// min_accel_id <= id <= max_accel_id +// +// And find its corresponding accelerator with: +// +// accels.get((id - min_accel_id) / dfa_stride) + +#[cfg(feature = "dfa-build")] +use alloc::{vec, vec::Vec}; + +use crate::util::{ + int::Pointer, + memchr, + wire::{self, DeserializeError, Endian, SerializeError}, +}; + +/// The base type used to represent a collection of accelerators. +/// +/// While an `Accel` is represented as a fixed size array of bytes, a +/// *collection* of `Accel`s (called `Accels`) is represented internally as a +/// slice of u32. While it's a bit unnatural to do this and costs us a bit of +/// fairly low-risk not-safe code, it lets us remove the need for a second type +/// parameter in the definition of dense::DFA. (Which really wants everything +/// to be a slice of u32.) +type AccelTy = u32; + +/// The size of the unit of representation for accelerators. +/// +/// ACCEL_CAP *must* be a multiple of this size. +const ACCEL_TY_SIZE: usize = core::mem::size_of::<AccelTy>(); + +/// The maximum length in bytes that a single Accel can be. This is distinct +/// from the capacity of an accelerator in that the length represents only the +/// bytes that should be read. +const ACCEL_LEN: usize = 4; + +/// The capacity of each accelerator, in bytes. We set this to 8 since it's a +/// multiple of 4 (our ID size) and because it gives us a little wiggle room +/// if we want to support more accel bytes in the future without a breaking +/// change. +/// +/// This MUST be a multiple of ACCEL_TY_SIZE. +const ACCEL_CAP: usize = 8; + +/// Search for between 1 and 3 needle bytes in the given haystack, starting the +/// search at the given position. If `needles` has a length other than 1-3, +/// then this panics. +#[cfg_attr(feature = "perf-inline", inline(always))] +pub(crate) fn find_fwd( + needles: &[u8], + haystack: &[u8], + at: usize, +) -> Option<usize> { + let bs = needles; + let i = match needles.len() { + 1 => memchr::memchr(bs[0], &haystack[at..])?, + 2 => memchr::memchr2(bs[0], bs[1], &haystack[at..])?, + 3 => memchr::memchr3(bs[0], bs[1], bs[2], &haystack[at..])?, + 0 => panic!("cannot find with empty needles"), + n => panic!("invalid needles length: {}", n), + }; + Some(at + i) +} + +/// Search for between 1 and 3 needle bytes in the given haystack in reverse, +/// starting the search at the given position. If `needles` has a length other +/// than 1-3, then this panics. +#[cfg_attr(feature = "perf-inline", inline(always))] +pub(crate) fn find_rev( + needles: &[u8], + haystack: &[u8], + at: usize, +) -> Option<usize> { + let bs = needles; + match needles.len() { + 1 => memchr::memrchr(bs[0], &haystack[..at]), + 2 => memchr::memrchr2(bs[0], bs[1], &haystack[..at]), + 3 => memchr::memrchr3(bs[0], bs[1], bs[2], &haystack[..at]), + 0 => panic!("cannot find with empty needles"), + n => panic!("invalid needles length: {}", n), + } +} + +/// Represents the accelerators for all accelerated states in a dense DFA. +/// +/// The `A` type parameter represents the type of the underlying bytes. +/// Generally, this is either `&[AccelTy]` or `Vec<AccelTy>`. +#[derive(Clone)] +pub(crate) struct Accels<A> { + /// A length prefixed slice of contiguous accelerators. See the top comment + /// in this module for more details on how we can jump from a DFA's state + /// ID to an accelerator in this list. + /// + /// The first 4 bytes always correspond to the number of accelerators + /// that follow. + accels: A, +} + +#[cfg(feature = "dfa-build")] +impl Accels<Vec<AccelTy>> { + /// Create an empty sequence of accelerators for a DFA. + pub fn empty() -> Accels<Vec<AccelTy>> { + Accels { accels: vec![0] } + } + + /// Add an accelerator to this sequence. + /// + /// This adds to the accelerator to the end of the sequence and therefore + /// should be done in correspondence with its state in the DFA. + /// + /// This panics if this results in more accelerators than AccelTy::MAX. + pub fn add(&mut self, accel: Accel) { + self.accels.extend_from_slice(&accel.as_accel_tys()); + let len = self.len(); + self.set_len(len + 1); + } + + /// Set the number of accelerators in this sequence, which is encoded in + /// the first 4 bytes of the underlying bytes. + fn set_len(&mut self, new_len: usize) { + // The only way an accelerator gets added is if a state exists for + // it, and if a state exists, then its index is guaranteed to be + // representable by a AccelTy by virtue of the guarantees provided by + // StateID. + let new_len = AccelTy::try_from(new_len).unwrap(); + self.accels[0] = new_len; + } +} + +impl<'a> Accels<&'a [AccelTy]> { + /// Deserialize a sequence of accelerators from the given bytes. If there + /// was a problem deserializing, then an error is returned. + /// + /// This is guaranteed to run in constant time. This does not guarantee + /// that every accelerator in the returned collection is valid. Thus, + /// accessing one may panic, or not-safe code that relies on accelerators + /// being correct my result in UB. + /// + /// Callers may check the validity of every accelerator with the `validate` + /// method. + pub fn from_bytes_unchecked( + mut slice: &'a [u8], + ) -> Result<(Accels<&'a [AccelTy]>, usize), DeserializeError> { + let slice_start = slice.as_ptr().as_usize(); + + let (accel_len, _) = + wire::try_read_u32_as_usize(slice, "accelerators length")?; + // The accelerator length is part of the accel_tys slice that + // we deserialize. This is perhaps a bit idiosyncratic. It would + // probably be better to split out the length into a real field. + + let accel_tys_len = wire::add( + wire::mul(accel_len, 2, "total number of accelerator accel_tys")?, + 1, + "total number of accel_tys", + )?; + let accel_tys_bytes_len = wire::mul( + ACCEL_TY_SIZE, + accel_tys_len, + "total number of bytes in accelerators", + )?; + wire::check_slice_len(slice, accel_tys_bytes_len, "accelerators")?; + wire::check_alignment::<AccelTy>(slice)?; + let accel_tys = &slice[..accel_tys_bytes_len]; + slice = &slice[accel_tys_bytes_len..]; + // SAFETY: We've checked the length and alignment above, and since + // slice is just bytes and AccelTy is just a u32, we can safely cast to + // a slice of &[AccelTy]. + let accels = unsafe { + core::slice::from_raw_parts( + accel_tys.as_ptr().cast::<AccelTy>(), + accel_tys_len, + ) + }; + Ok((Accels { accels }, slice.as_ptr().as_usize() - slice_start)) + } +} + +impl<A: AsRef<[AccelTy]>> Accels<A> { + /// Return an owned version of the accelerators. + #[cfg(feature = "alloc")] + pub fn to_owned(&self) -> Accels<alloc::vec::Vec<AccelTy>> { + Accels { accels: self.accels.as_ref().to_vec() } + } + + /// Return a borrowed version of the accelerators. + pub fn as_ref(&self) -> Accels<&[AccelTy]> { + Accels { accels: self.accels.as_ref() } + } + + /// Return the bytes representing the serialization of the accelerators. + pub fn as_bytes(&self) -> &[u8] { + let accels = self.accels.as_ref(); + // SAFETY: This is safe because accels is a just a slice of AccelTy, + // and u8 always has a smaller alignment. + unsafe { + core::slice::from_raw_parts( + accels.as_ptr().cast::<u8>(), + accels.len() * ACCEL_TY_SIZE, + ) + } + } + + /// Returns the memory usage, in bytes, of these accelerators. + /// + /// The memory usage is computed based on the number of bytes used to + /// represent all of the accelerators. + /// + /// This does **not** include the stack size used by this value. + pub fn memory_usage(&self) -> usize { + self.as_bytes().len() + } + + /// Return the bytes to search for corresponding to the accelerator in this + /// sequence at index `i`. If no such accelerator exists, then this panics. + /// + /// The significance of the index is that it should be in correspondence + /// with the index of the corresponding DFA. That is, accelerated DFA + /// states are stored contiguously in the DFA and have an ordering implied + /// by their respective state IDs. The state's index in that sequence + /// corresponds to the index of its corresponding accelerator. + #[cfg_attr(feature = "perf-inline", inline(always))] + pub fn needles(&self, i: usize) -> &[u8] { + if i >= self.len() { + panic!("invalid accelerator index {}", i); + } + let bytes = self.as_bytes(); + let offset = ACCEL_TY_SIZE + i * ACCEL_CAP; + let len = usize::from(bytes[offset]); + &bytes[offset + 1..offset + 1 + len] + } + + /// Return the total number of accelerators in this sequence. + pub fn len(&self) -> usize { + // This should never panic since deserialization checks that the + // length can fit into a usize. + usize::try_from(self.accels.as_ref()[0]).unwrap() + } + + /// Return the accelerator in this sequence at index `i`. If no such + /// accelerator exists, then this returns None. + /// + /// See the docs for `needles` on the significance of the index. + fn get(&self, i: usize) -> Option<Accel> { + if i >= self.len() { + return None; + } + let offset = ACCEL_TY_SIZE + i * ACCEL_CAP; + let accel = Accel::from_slice(&self.as_bytes()[offset..]) + .expect("Accels must contain valid accelerators"); + Some(accel) + } + + /// Returns an iterator of accelerators in this sequence. + fn iter(&self) -> IterAccels<'_, A> { + IterAccels { accels: self, i: 0 } + } + + /// Writes these accelerators to the given byte buffer using the indicated + /// endianness. If the given buffer is too small, then an error is + /// returned. Upon success, the total number of bytes written is returned. + /// The number of bytes written is guaranteed to be a multiple of 8. + pub fn write_to<E: Endian>( + &self, + dst: &mut [u8], + ) -> Result<usize, SerializeError> { + let nwrite = self.write_to_len(); + assert_eq!( + nwrite % ACCEL_TY_SIZE, + 0, + "expected accelerator bytes written to be a multiple of {}", + ACCEL_TY_SIZE, + ); + if dst.len() < nwrite { + return Err(SerializeError::buffer_too_small("accelerators")); + } + + // The number of accelerators can never exceed AccelTy::MAX. + E::write_u32(AccelTy::try_from(self.len()).unwrap(), dst); + // The actual accelerators are just raw bytes and thus their endianness + // is irrelevant. So we can copy them as bytes. + dst[ACCEL_TY_SIZE..nwrite] + .copy_from_slice(&self.as_bytes()[ACCEL_TY_SIZE..nwrite]); + Ok(nwrite) + } + + /// Validates that every accelerator in this collection can be successfully + /// deserialized as a valid accelerator. + pub fn validate(&self) -> Result<(), DeserializeError> { + for chunk in self.as_bytes()[ACCEL_TY_SIZE..].chunks(ACCEL_CAP) { + let _ = Accel::from_slice(chunk)?; + } + Ok(()) + } + + /// Returns the total number of bytes written by `write_to`. + pub fn write_to_len(&self) -> usize { + self.as_bytes().len() + } +} + +impl<A: AsRef<[AccelTy]>> core::fmt::Debug for Accels<A> { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + write!(f, "Accels(")?; + let mut list = f.debug_list(); + for a in self.iter() { + list.entry(&a); + } + list.finish()?; + write!(f, ")") + } +} + +#[derive(Debug)] +struct IterAccels<'a, A: AsRef<[AccelTy]>> { + accels: &'a Accels<A>, + i: usize, +} + +impl<'a, A: AsRef<[AccelTy]>> Iterator for IterAccels<'a, A> { + type Item = Accel; + + fn next(&mut self) -> Option<Accel> { + let accel = self.accels.get(self.i)?; + self.i += 1; + Some(accel) + } +} + +/// Accel represents a structure for determining how to "accelerate" a DFA +/// state. +/// +/// Namely, it contains zero or more bytes that must be seen in order for the +/// DFA to leave the state it is associated with. In practice, the actual range +/// is 1 to 3 bytes. +/// +/// The purpose of acceleration is to identify states whose vast majority +/// of transitions are just loops back to the same state. For example, +/// in the regex `(?-u)^[^a]+b`, the corresponding DFA will have a state +/// (corresponding to `[^a]+`) where all transitions *except* for `a` and +/// `b` loop back to itself. Thus, this state can be "accelerated" by simply +/// looking for the next occurrence of either `a` or `b` instead of explicitly +/// following transitions. (In this case, `b` transitions to the next state +/// where as `a` would transition to the dead state.) +#[derive(Clone)] +pub(crate) struct Accel { + /// The first byte is the length. Subsequent bytes are the accelerated + /// bytes. + /// + /// Note that we make every accelerator 8 bytes as a slightly wasteful + /// way of making sure alignment is always correct for state ID sizes of + /// 1, 2, 4 and 8. This should be okay since accelerated states aren't + /// particularly common, especially when Unicode is enabled. + bytes: [u8; ACCEL_CAP], +} + +impl Accel { + /// Returns an empty accel, where no bytes are accelerated. + #[cfg(feature = "dfa-build")] + pub fn new() -> Accel { + Accel { bytes: [0; ACCEL_CAP] } + } + + /// Returns a verified accelerator derived from the beginning of the given + /// slice. + /// + /// If the slice is not long enough or contains invalid bytes for an + /// accelerator, then this returns an error. + pub fn from_slice(mut slice: &[u8]) -> Result<Accel, DeserializeError> { + slice = &slice[..core::cmp::min(ACCEL_LEN, slice.len())]; + let bytes = slice + .try_into() + .map_err(|_| DeserializeError::buffer_too_small("accelerator"))?; + Accel::from_bytes(bytes) + } + + /// Returns a verified accelerator derived from raw bytes. + /// + /// If the given bytes are invalid, then this returns an error. + fn from_bytes(bytes: [u8; 4]) -> Result<Accel, DeserializeError> { + if usize::from(bytes[0]) >= ACCEL_LEN { + return Err(DeserializeError::generic( + "accelerator bytes cannot have length more than 3", + )); + } + Ok(Accel::from_bytes_unchecked(bytes)) + } + + /// Returns an accelerator derived from raw bytes. + /// + /// This does not check whether the given bytes are valid. Invalid bytes + /// cannot sacrifice memory safety, but may result in panics or silent + /// logic bugs. + fn from_bytes_unchecked(bytes: [u8; 4]) -> Accel { + Accel { bytes: [bytes[0], bytes[1], bytes[2], bytes[3], 0, 0, 0, 0] } + } + + /// Attempts to add the given byte to this accelerator. If the accelerator + /// is already full or thinks the byte is a poor accelerator, then this + /// returns false. Otherwise, returns true. + /// + /// If the given byte is already in this accelerator, then it panics. + #[cfg(feature = "dfa-build")] + pub fn add(&mut self, byte: u8) -> bool { + if self.len() >= 3 { + return false; + } + // As a special case, we totally reject trying to accelerate a state + // with an ASCII space. In most cases, it occurs very frequently, and + // tends to result in worse overall performance. + if byte == b' ' { + return false; + } + assert!( + !self.contains(byte), + "accelerator already contains {:?}", + crate::util::escape::DebugByte(byte) + ); + self.bytes[self.len() + 1] = byte; + self.bytes[0] += 1; + true + } + + /// Return the number of bytes in this accelerator. + pub fn len(&self) -> usize { + usize::from(self.bytes[0]) + } + + /// Returns true if and only if there are no bytes in this accelerator. + #[cfg(feature = "dfa-build")] + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Returns the slice of bytes to accelerate. + /// + /// If this accelerator is empty, then this returns an empty slice. + fn needles(&self) -> &[u8] { + &self.bytes[1..1 + self.len()] + } + + /// Returns true if and only if this accelerator will accelerate the given + /// byte. + #[cfg(feature = "dfa-build")] + fn contains(&self, byte: u8) -> bool { + self.needles().iter().position(|&b| b == byte).is_some() + } + + /// Returns the accelerator bytes as an array of AccelTys. + #[cfg(feature = "dfa-build")] + fn as_accel_tys(&self) -> [AccelTy; 2] { + assert_eq!(ACCEL_CAP, 8); + // These unwraps are OK since ACCEL_CAP is set to 8. + let first = + AccelTy::from_ne_bytes(self.bytes[0..4].try_into().unwrap()); + let second = + AccelTy::from_ne_bytes(self.bytes[4..8].try_into().unwrap()); + [first, second] + } +} + +impl core::fmt::Debug for Accel { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + write!(f, "Accel(")?; + let mut set = f.debug_set(); + for &b in self.needles() { + set.entry(&crate::util::escape::DebugByte(b)); + } + set.finish()?; + write!(f, ")") + } +} diff --git a/third_party/rust/regex-automata/src/dfa/automaton.rs b/third_party/rust/regex-automata/src/dfa/automaton.rs new file mode 100644 index 0000000000..7e2be9a151 --- /dev/null +++ b/third_party/rust/regex-automata/src/dfa/automaton.rs @@ -0,0 +1,2120 @@ +#[cfg(feature = "alloc")] +use crate::util::search::PatternSet; +use crate::{ + dfa::search, + util::{ + empty, + prefilter::Prefilter, + primitives::{PatternID, StateID}, + search::{Anchored, HalfMatch, Input, MatchError}, + }, +}; + +/// A trait describing the interface of a deterministic finite automaton (DFA). +/// +/// The complexity of this trait probably means that it's unlikely for others +/// to implement it. The primary purpose of the trait is to provide for a way +/// of abstracting over different types of DFAs. In this crate, that means +/// dense DFAs and sparse DFAs. (Dense DFAs are fast but memory hungry, where +/// as sparse DFAs are slower but come with a smaller memory footprint. But +/// they otherwise provide exactly equivalent expressive power.) For example, a +/// [`dfa::regex::Regex`](crate::dfa::regex::Regex) is generic over this trait. +/// +/// Normally, a DFA's execution model is very simple. You might have a single +/// start state, zero or more final or "match" states and a function that +/// transitions from one state to the next given the next byte of input. +/// Unfortunately, the interface described by this trait is significantly +/// more complicated than this. The complexity has a number of different +/// reasons, mostly motivated by performance, functionality or space savings: +/// +/// * A DFA can search for multiple patterns simultaneously. This +/// means extra information is returned when a match occurs. Namely, +/// a match is not just an offset, but an offset plus a pattern ID. +/// [`Automaton::pattern_len`] returns the number of patterns compiled into +/// the DFA, [`Automaton::match_len`] returns the total number of patterns +/// that match in a particular state and [`Automaton::match_pattern`] permits +/// iterating over the patterns that match in a particular state. +/// * A DFA can have multiple start states, and the choice of which start +/// state to use depends on the content of the string being searched and +/// position of the search, as well as whether the search is an anchored +/// search for a specific pattern in the DFA. Moreover, computing the start +/// state also depends on whether you're doing a forward or a reverse search. +/// [`Automaton::start_state_forward`] and [`Automaton::start_state_reverse`] +/// are used to compute the start state for forward and reverse searches, +/// respectively. +/// * All matches are delayed by one byte to support things like `$` and `\b` +/// at the end of a pattern. Therefore, every use of a DFA is required to use +/// [`Automaton::next_eoi_state`] +/// at the end of the search to compute the final transition. +/// * For optimization reasons, some states are treated specially. Every +/// state is either special or not, which can be determined via the +/// [`Automaton::is_special_state`] method. If it's special, then the state +/// must be at least one of a few possible types of states. (Note that some +/// types can overlap, for example, a match state can also be an accel state. +/// But some types can't. If a state is a dead state, then it can never be any +/// other type of state.) Those types are: +/// * A dead state. A dead state means the DFA will never enter a match +/// state. This can be queried via the [`Automaton::is_dead_state`] method. +/// * A quit state. A quit state occurs if the DFA had to stop the search +/// prematurely for some reason. This can be queried via the +/// [`Automaton::is_quit_state`] method. +/// * A match state. A match state occurs when a match is found. When a DFA +/// enters a match state, the search may stop immediately (when looking +/// for the earliest match), or it may continue to find the leftmost-first +/// match. This can be queried via the [`Automaton::is_match_state`] +/// method. +/// * A start state. A start state is where a search begins. For every +/// search, there is exactly one start state that is used, however, a +/// DFA may contain many start states. When the search is in a start +/// state, it may use a prefilter to quickly skip to candidate matches +/// without executing the DFA on every byte. This can be queried via the +/// [`Automaton::is_start_state`] method. +/// * An accel state. An accel state is a state that is accelerated. +/// That is, it is a state where _most_ of its transitions loop back to +/// itself and only a small number of transitions lead to other states. +/// This kind of state is said to be accelerated because a search routine +/// can quickly look for the bytes leading out of the state instead of +/// continuing to execute the DFA on each byte. This can be queried via the +/// [`Automaton::is_accel_state`] method. And the bytes that lead out of +/// the state can be queried via the [`Automaton::accelerator`] method. +/// +/// There are a number of provided methods on this trait that implement +/// efficient searching (for forwards and backwards) with a DFA using +/// all of the above features of this trait. In particular, given the +/// complexity of all these features, implementing a search routine in +/// this trait can be a little subtle. With that said, it is possible to +/// somewhat simplify the search routine. For example, handling accelerated +/// states is strictly optional, since it is always correct to assume that +/// `Automaton::is_accel_state` returns false. However, one complex part of +/// writing a search routine using this trait is handling the 1-byte delay of a +/// match. That is not optional. +/// +/// # Safety +/// +/// This trait is not safe to implement so that code may rely on the +/// correctness of implementations of this trait to avoid undefined behavior. +/// The primary correctness guarantees are: +/// +/// * `Automaton::start_state` always returns a valid state ID or an error or +/// panics. +/// * `Automaton::next_state`, when given a valid state ID, always returns +/// a valid state ID for all values of `anchored` and `byte`, or otherwise +/// panics. +/// +/// In general, the rest of the methods on `Automaton` need to uphold their +/// contracts as well. For example, `Automaton::is_dead` should only returns +/// true if the given state ID is actually a dead state. +pub unsafe trait Automaton { + /// Transitions from the current state to the next state, given the next + /// byte of input. + /// + /// Implementations must guarantee that the returned ID is always a valid + /// ID when `current` refers to a valid ID. Moreover, the transition + /// function must be defined for all possible values of `input`. + /// + /// # Panics + /// + /// If the given ID does not refer to a valid state, then this routine + /// may panic but it also may not panic and instead return an invalid ID. + /// However, if the caller provides an invalid ID then this must never + /// sacrifice memory safety. + /// + /// # Example + /// + /// This shows a simplistic example for walking a DFA for a given haystack + /// by using the `next_state` method. + /// + /// ``` + /// use regex_automata::{dfa::{Automaton, dense}, Input}; + /// + /// let dfa = dense::DFA::new(r"[a-z]+r")?; + /// let haystack = "bar".as_bytes(); + /// + /// // The start state is determined by inspecting the position and the + /// // initial bytes of the haystack. + /// let mut state = dfa.start_state_forward(&Input::new(haystack))?; + /// // Walk all the bytes in the haystack. + /// for &b in haystack { + /// state = dfa.next_state(state, b); + /// } + /// // Matches are always delayed by 1 byte, so we must explicitly walk the + /// // special "EOI" transition at the end of the search. + /// state = dfa.next_eoi_state(state); + /// assert!(dfa.is_match_state(state)); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + fn next_state(&self, current: StateID, input: u8) -> StateID; + + /// Transitions from the current state to the next state, given the next + /// byte of input. + /// + /// Unlike [`Automaton::next_state`], implementations may implement this + /// more efficiently by assuming that the `current` state ID is valid. + /// Typically, this manifests by eliding bounds checks. + /// + /// # Safety + /// + /// Callers of this method must guarantee that `current` refers to a valid + /// state ID. If `current` is not a valid state ID for this automaton, then + /// calling this routine may result in undefined behavior. + /// + /// If `current` is valid, then implementations must guarantee that the ID + /// returned is valid for all possible values of `input`. + unsafe fn next_state_unchecked( + &self, + current: StateID, + input: u8, + ) -> StateID; + + /// Transitions from the current state to the next state for the special + /// EOI symbol. + /// + /// Implementations must guarantee that the returned ID is always a valid + /// ID when `current` refers to a valid ID. + /// + /// This routine must be called at the end of every search in a correct + /// implementation of search. Namely, DFAs in this crate delay matches + /// by one byte in order to support look-around operators. Thus, after + /// reaching the end of a haystack, a search implementation must follow one + /// last EOI transition. + /// + /// It is best to think of EOI as an additional symbol in the alphabet of + /// a DFA that is distinct from every other symbol. That is, the alphabet + /// of DFAs in this crate has a logical size of 257 instead of 256, where + /// 256 corresponds to every possible inhabitant of `u8`. (In practice, the + /// physical alphabet size may be smaller because of alphabet compression + /// via equivalence classes, but EOI is always represented somehow in the + /// alphabet.) + /// + /// # Panics + /// + /// If the given ID does not refer to a valid state, then this routine + /// may panic but it also may not panic and instead return an invalid ID. + /// However, if the caller provides an invalid ID then this must never + /// sacrifice memory safety. + /// + /// # Example + /// + /// This shows a simplistic example for walking a DFA for a given haystack, + /// and then finishing the search with the final EOI transition. + /// + /// ``` + /// use regex_automata::{dfa::{Automaton, dense}, Input}; + /// + /// let dfa = dense::DFA::new(r"[a-z]+r")?; + /// let haystack = "bar".as_bytes(); + /// + /// // The start state is determined by inspecting the position and the + /// // initial bytes of the haystack. + /// // + /// // The unwrap is OK because we aren't requesting a start state for a + /// // specific pattern. + /// let mut state = dfa.start_state_forward(&Input::new(haystack))?; + /// // Walk all the bytes in the haystack. + /// for &b in haystack { + /// state = dfa.next_state(state, b); + /// } + /// // Matches are always delayed by 1 byte, so we must explicitly walk + /// // the special "EOI" transition at the end of the search. Without this + /// // final transition, the assert below will fail since the DFA will not + /// // have entered a match state yet! + /// state = dfa.next_eoi_state(state); + /// assert!(dfa.is_match_state(state)); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + fn next_eoi_state(&self, current: StateID) -> StateID; + + /// Return the ID of the start state for this lazy DFA when executing a + /// forward search. + /// + /// Unlike typical DFA implementations, the start state for DFAs in this + /// crate is dependent on a few different factors: + /// + /// * The [`Anchored`] mode of the search. Unanchored, anchored and + /// anchored searches for a specific [`PatternID`] all use different start + /// states. + /// * The position at which the search begins, via [`Input::start`]. This + /// and the byte immediately preceding the start of the search (if one + /// exists) influence which look-behind assertions are true at the start + /// of the search. This in turn influences which start state is selected. + /// * Whether the search is a forward or reverse search. This routine can + /// only be used for forward searches. + /// + /// # Errors + /// + /// This may return a [`MatchError`] if the search needs to give up + /// when determining the start state (for example, if it sees a "quit" + /// byte). This can also return an error if the given `Input` contains an + /// unsupported [`Anchored`] configuration. + fn start_state_forward( + &self, + input: &Input<'_>, + ) -> Result<StateID, MatchError>; + + /// Return the ID of the start state for this lazy DFA when executing a + /// reverse search. + /// + /// Unlike typical DFA implementations, the start state for DFAs in this + /// crate is dependent on a few different factors: + /// + /// * The [`Anchored`] mode of the search. Unanchored, anchored and + /// anchored searches for a specific [`PatternID`] all use different start + /// states. + /// * The position at which the search begins, via [`Input::start`]. This + /// and the byte immediately preceding the start of the search (if one + /// exists) influence which look-behind assertions are true at the start + /// of the search. This in turn influences which start state is selected. + /// * Whether the search is a forward or reverse search. This routine can + /// only be used for reverse searches. + /// + /// # Errors + /// + /// This may return a [`MatchError`] if the search needs to give up + /// when determining the start state (for example, if it sees a "quit" + /// byte). This can also return an error if the given `Input` contains an + /// unsupported [`Anchored`] configuration. + fn start_state_reverse( + &self, + input: &Input<'_>, + ) -> Result<StateID, MatchError>; + + /// If this DFA has a universal starting state for the given anchor mode + /// and the DFA supports universal starting states, then this returns that + /// state's identifier. + /// + /// A DFA is said to have a universal starting state when the starting + /// state is invariant with respect to the haystack. Usually, the starting + /// state is chosen depending on the bytes immediately surrounding the + /// starting position of a search. However, the starting state only differs + /// when one or more of the patterns in the DFA have look-around assertions + /// in its prefix. + /// + /// Stated differently, if none of the patterns in a DFA have look-around + /// assertions in their prefix, then the DFA has a universal starting state + /// and _may_ be returned by this method. + /// + /// It is always correct for implementations to return `None`, and indeed, + /// this is what the default implementation does. When this returns `None`, + /// callers must use either `start_state_forward` or `start_state_reverse` + /// to get the starting state. + /// + /// # Use case + /// + /// There are a few reasons why one might want to use this: + /// + /// * If you know your regex patterns have no look-around assertions in + /// their prefix, then calling this routine is likely cheaper and perhaps + /// more semantically meaningful. + /// * When implementing prefilter support in a DFA regex implementation, + /// it is necessary to re-compute the start state after a candidate + /// is returned from the prefilter. However, this is only needed when + /// there isn't a universal start state. When one exists, one can avoid + /// re-computing the start state. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{ + /// dfa::{Automaton, dense::DFA}, + /// Anchored, + /// }; + /// + /// // There are no look-around assertions in the prefixes of any of the + /// // patterns, so we get a universal start state. + /// let dfa = DFA::new_many(&["[0-9]+", "[a-z]+$", "[A-Z]+"])?; + /// assert!(dfa.universal_start_state(Anchored::No).is_some()); + /// assert!(dfa.universal_start_state(Anchored::Yes).is_some()); + /// + /// // One of the patterns has a look-around assertion in its prefix, + /// // so this means there is no longer a universal start state. + /// let dfa = DFA::new_many(&["[0-9]+", "^[a-z]+$", "[A-Z]+"])?; + /// assert!(!dfa.universal_start_state(Anchored::No).is_some()); + /// assert!(!dfa.universal_start_state(Anchored::Yes).is_some()); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + fn universal_start_state(&self, _mode: Anchored) -> Option<StateID> { + None + } + + /// Returns true if and only if the given identifier corresponds to a + /// "special" state. A special state is one or more of the following: + /// a dead state, a quit state, a match state, a start state or an + /// accelerated state. + /// + /// A correct implementation _may_ always return false for states that + /// are either start states or accelerated states, since that information + /// is only intended to be used for optimization purposes. Correct + /// implementations must return true if the state is a dead, quit or match + /// state. This is because search routines using this trait must be able + /// to rely on `is_special_state` as an indicator that a state may need + /// special treatment. (For example, when a search routine sees a dead + /// state, it must terminate.) + /// + /// This routine permits search implementations to use a single branch to + /// check whether a state needs special attention before executing the next + /// transition. The example below shows how to do this. + /// + /// # Example + /// + /// This example shows how `is_special_state` can be used to implement a + /// correct search routine with minimal branching. In particular, this + /// search routine implements "leftmost" matching, which means that it + /// doesn't immediately stop once a match is found. Instead, it continues + /// until it reaches a dead state. + /// + /// ``` + /// use regex_automata::{ + /// dfa::{Automaton, dense}, + /// HalfMatch, MatchError, Input, + /// }; + /// + /// fn find<A: Automaton>( + /// dfa: &A, + /// haystack: &[u8], + /// ) -> Result<Option<HalfMatch>, MatchError> { + /// // The start state is determined by inspecting the position and the + /// // initial bytes of the haystack. Note that start states can never + /// // be match states (since DFAs in this crate delay matches by 1 + /// // byte), so we don't need to check if the start state is a match. + /// let mut state = dfa.start_state_forward(&Input::new(haystack))?; + /// let mut last_match = None; + /// // Walk all the bytes in the haystack. We can quit early if we see + /// // a dead or a quit state. The former means the automaton will + /// // never transition to any other state. The latter means that the + /// // automaton entered a condition in which its search failed. + /// for (i, &b) in haystack.iter().enumerate() { + /// state = dfa.next_state(state, b); + /// if dfa.is_special_state(state) { + /// if dfa.is_match_state(state) { + /// last_match = Some(HalfMatch::new( + /// dfa.match_pattern(state, 0), + /// i, + /// )); + /// } else if dfa.is_dead_state(state) { + /// return Ok(last_match); + /// } else if dfa.is_quit_state(state) { + /// // It is possible to enter into a quit state after + /// // observing a match has occurred. In that case, we + /// // should return the match instead of an error. + /// if last_match.is_some() { + /// return Ok(last_match); + /// } + /// return Err(MatchError::quit(b, i)); + /// } + /// // Implementors may also want to check for start or accel + /// // states and handle them differently for performance + /// // reasons. But it is not necessary for correctness. + /// } + /// } + /// // Matches are always delayed by 1 byte, so we must explicitly walk + /// // the special "EOI" transition at the end of the search. + /// state = dfa.next_eoi_state(state); + /// if dfa.is_match_state(state) { + /// last_match = Some(HalfMatch::new( + /// dfa.match_pattern(state, 0), + /// haystack.len(), + /// )); + /// } + /// Ok(last_match) + /// } + /// + /// // We use a greedy '+' operator to show how the search doesn't just + /// // stop once a match is detected. It continues extending the match. + /// // Using '[a-z]+?' would also work as expected and stop the search + /// // early. Greediness is built into the automaton. + /// let dfa = dense::DFA::new(r"[a-z]+")?; + /// let haystack = "123 foobar 4567".as_bytes(); + /// let mat = find(&dfa, haystack)?.unwrap(); + /// assert_eq!(mat.pattern().as_usize(), 0); + /// assert_eq!(mat.offset(), 10); + /// + /// // Here's another example that tests our handling of the special EOI + /// // transition. This will fail to find a match if we don't call + /// // 'next_eoi_state' at the end of the search since the match isn't + /// // found until the final byte in the haystack. + /// let dfa = dense::DFA::new(r"[0-9]{4}")?; + /// let haystack = "123 foobar 4567".as_bytes(); + /// let mat = find(&dfa, haystack)?.unwrap(); + /// assert_eq!(mat.pattern().as_usize(), 0); + /// assert_eq!(mat.offset(), 15); + /// + /// // And note that our search implementation above automatically works + /// // with multi-DFAs. Namely, `dfa.match_pattern(match_state, 0)` selects + /// // the appropriate pattern ID for us. + /// let dfa = dense::DFA::new_many(&[r"[a-z]+", r"[0-9]+"])?; + /// let haystack = "123 foobar 4567".as_bytes(); + /// let mat = find(&dfa, haystack)?.unwrap(); + /// assert_eq!(mat.pattern().as_usize(), 1); + /// assert_eq!(mat.offset(), 3); + /// let mat = find(&dfa, &haystack[3..])?.unwrap(); + /// assert_eq!(mat.pattern().as_usize(), 0); + /// assert_eq!(mat.offset(), 7); + /// let mat = find(&dfa, &haystack[10..])?.unwrap(); + /// assert_eq!(mat.pattern().as_usize(), 1); + /// assert_eq!(mat.offset(), 5); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + fn is_special_state(&self, id: StateID) -> bool; + + /// Returns true if and only if the given identifier corresponds to a dead + /// state. When a DFA enters a dead state, it is impossible to leave. That + /// is, every transition on a dead state by definition leads back to the + /// same dead state. + /// + /// In practice, the dead state always corresponds to the identifier `0`. + /// Moreover, in practice, there is only one dead state. + /// + /// The existence of a dead state is not strictly required in the classical + /// model of finite state machines, where one generally only cares about + /// the question of whether an input sequence matches or not. Dead states + /// are not needed to answer that question, since one can immediately quit + /// as soon as one enters a final or "match" state. However, we don't just + /// care about matches but also care about the location of matches, and + /// more specifically, care about semantics like "greedy" matching. + /// + /// For example, given the pattern `a+` and the input `aaaz`, the dead + /// state won't be entered until the state machine reaches `z` in the + /// input, at which point, the search routine can quit. But without the + /// dead state, the search routine wouldn't know when to quit. In a + /// classical representation, the search routine would stop after seeing + /// the first `a` (which is when the search would enter a match state). But + /// this wouldn't implement "greedy" matching where `a+` matches as many + /// `a`'s as possible. + /// + /// # Example + /// + /// See the example for [`Automaton::is_special_state`] for how to use this + /// method correctly. + fn is_dead_state(&self, id: StateID) -> bool; + + /// Returns true if and only if the given identifier corresponds to a quit + /// state. A quit state is like a dead state (it has no transitions other + /// than to itself), except it indicates that the DFA failed to complete + /// the search. When this occurs, callers can neither accept or reject that + /// a match occurred. + /// + /// In practice, the quit state always corresponds to the state immediately + /// following the dead state. (Which is not usually represented by `1`, + /// since state identifiers are pre-multiplied by the state machine's + /// alphabet stride, and the alphabet stride varies between DFAs.) + /// + /// The typical way in which a quit state can occur is when heuristic + /// support for Unicode word boundaries is enabled via the + /// [`dense::Config::unicode_word_boundary`](crate::dfa::dense::Config::unicode_word_boundary) + /// option. But other options, like the lower level + /// [`dense::Config::quit`](crate::dfa::dense::Config::quit) + /// configuration, can also result in a quit state being entered. The + /// purpose of the quit state is to provide a way to execute a fast DFA + /// in common cases while delegating to slower routines when the DFA quits. + /// + /// The default search implementations provided by this crate will return a + /// [`MatchError::quit`] error when a quit state is entered. + /// + /// # Example + /// + /// See the example for [`Automaton::is_special_state`] for how to use this + /// method correctly. + fn is_quit_state(&self, id: StateID) -> bool; + + /// Returns true if and only if the given identifier corresponds to a + /// match state. A match state is also referred to as a "final" state and + /// indicates that a match has been found. + /// + /// If all you care about is whether a particular pattern matches in the + /// input sequence, then a search routine can quit early as soon as the + /// machine enters a match state. However, if you're looking for the + /// standard "leftmost-first" match location, then search _must_ continue + /// until either the end of the input or until the machine enters a dead + /// state. (Since either condition implies that no other useful work can + /// be done.) Namely, when looking for the location of a match, then + /// search implementations should record the most recent location in + /// which a match state was entered, but otherwise continue executing the + /// search as normal. (The search may even leave the match state.) Once + /// the termination condition is reached, the most recently recorded match + /// location should be returned. + /// + /// Finally, one additional power given to match states in this crate + /// is that they are always associated with a specific pattern in order + /// to support multi-DFAs. See [`Automaton::match_pattern`] for more + /// details and an example for how to query the pattern associated with a + /// particular match state. + /// + /// # Example + /// + /// See the example for [`Automaton::is_special_state`] for how to use this + /// method correctly. + fn is_match_state(&self, id: StateID) -> bool; + + /// Returns true only if the given identifier corresponds to a start + /// state + /// + /// A start state is a state in which a DFA begins a search. + /// All searches begin in a start state. Moreover, since all matches are + /// delayed by one byte, a start state can never be a match state. + /// + /// The main role of a start state is, as mentioned, to be a starting + /// point for a DFA. This starting point is determined via one of + /// [`Automaton::start_state_forward`] or + /// [`Automaton::start_state_reverse`], depending on whether one is doing + /// a forward or a reverse search, respectively. + /// + /// A secondary use of start states is for prefix acceleration. Namely, + /// while executing a search, if one detects that you're in a start state, + /// then it may be faster to look for the next match of a prefix of the + /// pattern, if one exists. If a prefix exists and since all matches must + /// begin with that prefix, then skipping ahead to occurrences of that + /// prefix may be much faster than executing the DFA. + /// + /// As mentioned in the documentation for + /// [`is_special_state`](Automaton::is_special_state) implementations + /// _may_ always return false, even if the given identifier is a start + /// state. This is because knowing whether a state is a start state or not + /// is not necessary for correctness and is only treated as a potential + /// performance optimization. (For example, the implementations of this + /// trait in this crate will only return true when the given identifier + /// corresponds to a start state and when [specialization of start + /// states](crate::dfa::dense::Config::specialize_start_states) was enabled + /// during DFA construction. If start state specialization is disabled + /// (which is the default), then this method will always return false.) + /// + /// # Example + /// + /// This example shows how to implement your own search routine that does + /// a prefix search whenever the search enters a start state. + /// + /// Note that you do not need to implement your own search routine + /// to make use of prefilters like this. The search routines + /// provided by this crate already implement prefilter support via + /// the [`Prefilter`](crate::util::prefilter::Prefilter) trait. + /// A prefilter can be added to your search configuration with + /// [`dense::Config::prefilter`](crate::dfa::dense::Config::prefilter) for + /// dense and sparse DFAs in this crate. + /// + /// This example is meant to show how you might deal with prefilters in a + /// simplified case if you are implementing your own search routine. + /// + /// ``` + /// use regex_automata::{ + /// dfa::{Automaton, dense}, + /// HalfMatch, MatchError, Input, + /// }; + /// + /// fn find_byte(slice: &[u8], at: usize, byte: u8) -> Option<usize> { + /// // Would be faster to use the memchr crate, but this is still + /// // faster than running through the DFA. + /// slice[at..].iter().position(|&b| b == byte).map(|i| at + i) + /// } + /// + /// fn find<A: Automaton>( + /// dfa: &A, + /// haystack: &[u8], + /// prefix_byte: Option<u8>, + /// ) -> Result<Option<HalfMatch>, MatchError> { + /// // See the Automaton::is_special_state example for similar code + /// // with more comments. + /// + /// let mut state = dfa.start_state_forward(&Input::new(haystack))?; + /// let mut last_match = None; + /// let mut pos = 0; + /// while pos < haystack.len() { + /// let b = haystack[pos]; + /// state = dfa.next_state(state, b); + /// pos += 1; + /// if dfa.is_special_state(state) { + /// if dfa.is_match_state(state) { + /// last_match = Some(HalfMatch::new( + /// dfa.match_pattern(state, 0), + /// pos - 1, + /// )); + /// } else if dfa.is_dead_state(state) { + /// return Ok(last_match); + /// } else if dfa.is_quit_state(state) { + /// // It is possible to enter into a quit state after + /// // observing a match has occurred. In that case, we + /// // should return the match instead of an error. + /// if last_match.is_some() { + /// return Ok(last_match); + /// } + /// return Err(MatchError::quit(b, pos - 1)); + /// } else if dfa.is_start_state(state) { + /// // If we're in a start state and know all matches begin + /// // with a particular byte, then we can quickly skip to + /// // candidate matches without running the DFA through + /// // every byte inbetween. + /// if let Some(prefix_byte) = prefix_byte { + /// pos = match find_byte(haystack, pos, prefix_byte) { + /// Some(pos) => pos, + /// None => break, + /// }; + /// } + /// } + /// } + /// } + /// // Matches are always delayed by 1 byte, so we must explicitly walk + /// // the special "EOI" transition at the end of the search. + /// state = dfa.next_eoi_state(state); + /// if dfa.is_match_state(state) { + /// last_match = Some(HalfMatch::new( + /// dfa.match_pattern(state, 0), + /// haystack.len(), + /// )); + /// } + /// Ok(last_match) + /// } + /// + /// // In this example, it's obvious that all occurrences of our pattern + /// // begin with 'Z', so we pass in 'Z'. Note also that we need to + /// // enable start state specialization, or else it won't be possible to + /// // detect start states during a search. ('is_start_state' would always + /// // return false.) + /// let dfa = dense::DFA::builder() + /// .configure(dense::DFA::config().specialize_start_states(true)) + /// .build(r"Z[a-z]+")?; + /// let haystack = "123 foobar Zbaz quux".as_bytes(); + /// let mat = find(&dfa, haystack, Some(b'Z'))?.unwrap(); + /// assert_eq!(mat.pattern().as_usize(), 0); + /// assert_eq!(mat.offset(), 15); + /// + /// // But note that we don't need to pass in a prefix byte. If we don't, + /// // then the search routine does no acceleration. + /// let mat = find(&dfa, haystack, None)?.unwrap(); + /// assert_eq!(mat.pattern().as_usize(), 0); + /// assert_eq!(mat.offset(), 15); + /// + /// // However, if we pass an incorrect byte, then the prefix search will + /// // result in incorrect results. + /// assert_eq!(find(&dfa, haystack, Some(b'X'))?, None); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + fn is_start_state(&self, id: StateID) -> bool; + + /// Returns true if and only if the given identifier corresponds to an + /// accelerated state. + /// + /// An accelerated state is a special optimization + /// trick implemented by this crate. Namely, if + /// [`dense::Config::accelerate`](crate::dfa::dense::Config::accelerate) is + /// enabled (and it is by default), then DFAs generated by this crate will + /// tag states meeting certain characteristics as accelerated. States meet + /// this criteria whenever most of their transitions are self-transitions. + /// That is, transitions that loop back to the same state. When a small + /// number of transitions aren't self-transitions, then it follows that + /// there are only a small number of bytes that can cause the DFA to leave + /// that state. Thus, there is an opportunity to look for those bytes + /// using more optimized routines rather than continuing to run through + /// the DFA. This trick is similar to the prefilter idea described in + /// the documentation of [`Automaton::is_start_state`] with two main + /// differences: + /// + /// 1. It is more limited since acceleration only applies to single bytes. + /// This means states are rarely accelerated when Unicode mode is enabled + /// (which is enabled by default). + /// 2. It can occur anywhere in the DFA, which increases optimization + /// opportunities. + /// + /// Like the prefilter idea, the main downside (and a possible reason to + /// disable it) is that it can lead to worse performance in some cases. + /// Namely, if a state is accelerated for very common bytes, then the + /// overhead of checking for acceleration and using the more optimized + /// routines to look for those bytes can cause overall performance to be + /// worse than if acceleration wasn't enabled at all. + /// + /// A simple example of a regex that has an accelerated state is + /// `(?-u)[^a]+a`. Namely, the `[^a]+` sub-expression gets compiled down + /// into a single state where all transitions except for `a` loop back to + /// itself, and where `a` is the only transition (other than the special + /// EOI transition) that goes to some other state. Thus, this state can + /// be accelerated and implemented more efficiently by calling an + /// optimized routine like `memchr` with `a` as the needle. Notice that + /// the `(?-u)` to disable Unicode is necessary here, as without it, + /// `[^a]` will match any UTF-8 encoding of any Unicode scalar value other + /// than `a`. This more complicated expression compiles down to many DFA + /// states and the simple acceleration optimization is no longer available. + /// + /// Typically, this routine is used to guard calls to + /// [`Automaton::accelerator`], which returns the accelerated bytes for + /// the specified state. + fn is_accel_state(&self, id: StateID) -> bool; + + /// Returns the total number of patterns compiled into this DFA. + /// + /// In the case of a DFA that contains no patterns, this must return `0`. + /// + /// # Example + /// + /// This example shows the pattern length for a DFA that never matches: + /// + /// ``` + /// use regex_automata::dfa::{Automaton, dense::DFA}; + /// + /// let dfa: DFA<Vec<u32>> = DFA::never_match()?; + /// assert_eq!(dfa.pattern_len(), 0); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// And another example for a DFA that matches at every position: + /// + /// ``` + /// use regex_automata::dfa::{Automaton, dense::DFA}; + /// + /// let dfa: DFA<Vec<u32>> = DFA::always_match()?; + /// assert_eq!(dfa.pattern_len(), 1); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// And finally, a DFA that was constructed from multiple patterns: + /// + /// ``` + /// use regex_automata::dfa::{Automaton, dense::DFA}; + /// + /// let dfa = DFA::new_many(&["[0-9]+", "[a-z]+", "[A-Z]+"])?; + /// assert_eq!(dfa.pattern_len(), 3); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + fn pattern_len(&self) -> usize; + + /// Returns the total number of patterns that match in this state. + /// + /// If the given state is not a match state, then implementations may + /// panic. + /// + /// If the DFA was compiled with one pattern, then this must necessarily + /// always return `1` for all match states. + /// + /// Implementations must guarantee that [`Automaton::match_pattern`] can be + /// called with indices up to (but not including) the length returned by + /// this routine without panicking. + /// + /// # Panics + /// + /// Implementations are permitted to panic if the provided state ID does + /// not correspond to a match state. + /// + /// # Example + /// + /// This example shows a simple instance of implementing overlapping + /// matches. In particular, it shows not only how to determine how many + /// patterns have matched in a particular state, but also how to access + /// which specific patterns have matched. + /// + /// Notice that we must use + /// [`MatchKind::All`](crate::MatchKind::All) + /// when building the DFA. If we used + /// [`MatchKind::LeftmostFirst`](crate::MatchKind::LeftmostFirst) + /// instead, then the DFA would not be constructed in a way that + /// supports overlapping matches. (It would only report a single pattern + /// that matches at any particular point in time.) + /// + /// Another thing to take note of is the patterns used and the order in + /// which the pattern IDs are reported. In the example below, pattern `3` + /// is yielded first. Why? Because it corresponds to the match that + /// appears first. Namely, the `@` symbol is part of `\S+` but not part + /// of any of the other patterns. Since the `\S+` pattern has a match that + /// starts to the left of any other pattern, its ID is returned before any + /// other. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{dfa::{Automaton, dense}, Input, MatchKind}; + /// + /// let dfa = dense::Builder::new() + /// .configure(dense::Config::new().match_kind(MatchKind::All)) + /// .build_many(&[ + /// r"[[:word:]]+", r"[a-z]+", r"[A-Z]+", r"[[:^space:]]+", + /// ])?; + /// let haystack = "@bar".as_bytes(); + /// + /// // The start state is determined by inspecting the position and the + /// // initial bytes of the haystack. + /// let mut state = dfa.start_state_forward(&Input::new(haystack))?; + /// // Walk all the bytes in the haystack. + /// for &b in haystack { + /// state = dfa.next_state(state, b); + /// } + /// state = dfa.next_eoi_state(state); + /// + /// assert!(dfa.is_match_state(state)); + /// assert_eq!(dfa.match_len(state), 3); + /// // The following calls are guaranteed to not panic since `match_len` + /// // returned `3` above. + /// assert_eq!(dfa.match_pattern(state, 0).as_usize(), 3); + /// assert_eq!(dfa.match_pattern(state, 1).as_usize(), 0); + /// assert_eq!(dfa.match_pattern(state, 2).as_usize(), 1); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + fn match_len(&self, id: StateID) -> usize; + + /// Returns the pattern ID corresponding to the given match index in the + /// given state. + /// + /// See [`Automaton::match_len`] for an example of how to use this + /// method correctly. Note that if you know your DFA is compiled with a + /// single pattern, then this routine is never necessary since it will + /// always return a pattern ID of `0` for an index of `0` when `id` + /// corresponds to a match state. + /// + /// Typically, this routine is used when implementing an overlapping + /// search, as the example for `Automaton::match_len` does. + /// + /// # Panics + /// + /// If the state ID is not a match state or if the match index is out + /// of bounds for the given state, then this routine may either panic + /// or produce an incorrect result. If the state ID is correct and the + /// match index is correct, then this routine must always produce a valid + /// `PatternID`. + fn match_pattern(&self, id: StateID, index: usize) -> PatternID; + + /// Returns true if and only if this automaton can match the empty string. + /// When it returns false, all possible matches are guaranteed to have a + /// non-zero length. + /// + /// This is useful as cheap way to know whether code needs to handle the + /// case of a zero length match. This is particularly important when UTF-8 + /// modes are enabled, as when UTF-8 mode is enabled, empty matches that + /// split a codepoint must never be reported. This extra handling can + /// sometimes be costly, and since regexes matching an empty string are + /// somewhat rare, it can be beneficial to treat such regexes specially. + /// + /// # Example + /// + /// This example shows a few different DFAs and whether they match the + /// empty string or not. Notice the empty string isn't merely a matter + /// of a string of length literally `0`, but rather, whether a match can + /// occur between specific pairs of bytes. + /// + /// ``` + /// use regex_automata::{dfa::{dense::DFA, Automaton}, util::syntax}; + /// + /// // The empty regex matches the empty string. + /// let dfa = DFA::new("")?; + /// assert!(dfa.has_empty(), "empty matches empty"); + /// // The '+' repetition operator requires at least one match, and so + /// // does not match the empty string. + /// let dfa = DFA::new("a+")?; + /// assert!(!dfa.has_empty(), "+ does not match empty"); + /// // But the '*' repetition operator does. + /// let dfa = DFA::new("a*")?; + /// assert!(dfa.has_empty(), "* does match empty"); + /// // And wrapping '+' in an operator that can match an empty string also + /// // causes it to match the empty string too. + /// let dfa = DFA::new("(a+)*")?; + /// assert!(dfa.has_empty(), "+ inside of * matches empty"); + /// + /// // If a regex is just made of a look-around assertion, even if the + /// // assertion requires some kind of non-empty string around it (such as + /// // \b), then it is still treated as if it matches the empty string. + /// // Namely, if a match occurs of just a look-around assertion, then the + /// // match returned is empty. + /// let dfa = DFA::builder() + /// .configure(DFA::config().unicode_word_boundary(true)) + /// .syntax(syntax::Config::new().utf8(false)) + /// .build(r"^$\A\z\b\B(?-u:\b\B)")?; + /// assert!(dfa.has_empty(), "assertions match empty"); + /// // Even when an assertion is wrapped in a '+', it still matches the + /// // empty string. + /// let dfa = DFA::new(r"^+")?; + /// assert!(dfa.has_empty(), "+ of an assertion matches empty"); + /// + /// // An alternation with even one branch that can match the empty string + /// // is also said to match the empty string overall. + /// let dfa = DFA::new("foo|(bar)?|quux")?; + /// assert!(dfa.has_empty(), "alternations can match empty"); + /// + /// // An NFA that matches nothing does not match the empty string. + /// let dfa = DFA::new("[a&&b]")?; + /// assert!(!dfa.has_empty(), "never matching means not matching empty"); + /// // But if it's wrapped in something that doesn't require a match at + /// // all, then it can match the empty string! + /// let dfa = DFA::new("[a&&b]*")?; + /// assert!(dfa.has_empty(), "* on never-match still matches empty"); + /// // Since a '+' requires a match, using it on something that can never + /// // match will itself produce a regex that can never match anything, + /// // and thus does not match the empty string. + /// let dfa = DFA::new("[a&&b]+")?; + /// assert!(!dfa.has_empty(), "+ on never-match still matches nothing"); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + fn has_empty(&self) -> bool; + + /// Whether UTF-8 mode is enabled for this DFA or not. + /// + /// When UTF-8 mode is enabled, all matches reported by a DFA are + /// guaranteed to correspond to spans of valid UTF-8. This includes + /// zero-width matches. For example, the DFA must guarantee that the empty + /// regex will not match at the positions between code units in the UTF-8 + /// encoding of a single codepoint. + /// + /// See [`thompson::Config::utf8`](crate::nfa::thompson::Config::utf8) for + /// more information. + /// + /// # Example + /// + /// This example shows how UTF-8 mode can impact the match spans that may + /// be reported in certain cases. + /// + /// ``` + /// use regex_automata::{ + /// dfa::{dense::DFA, Automaton}, + /// nfa::thompson, + /// HalfMatch, Input, + /// }; + /// + /// // UTF-8 mode is enabled by default. + /// let re = DFA::new("")?; + /// assert!(re.is_utf8()); + /// let mut input = Input::new("☃"); + /// let got = re.try_search_fwd(&input)?; + /// assert_eq!(Some(HalfMatch::must(0, 0)), got); + /// + /// // Even though an empty regex matches at 1..1, our next match is + /// // 3..3 because 1..1 and 2..2 split the snowman codepoint (which is + /// // three bytes long). + /// input.set_start(1); + /// let got = re.try_search_fwd(&input)?; + /// assert_eq!(Some(HalfMatch::must(0, 3)), got); + /// + /// // But if we disable UTF-8, then we'll get matches at 1..1 and 2..2: + /// let re = DFA::builder() + /// .thompson(thompson::Config::new().utf8(false)) + /// .build("")?; + /// assert!(!re.is_utf8()); + /// let got = re.try_search_fwd(&input)?; + /// assert_eq!(Some(HalfMatch::must(0, 1)), got); + /// + /// input.set_start(2); + /// let got = re.try_search_fwd(&input)?; + /// assert_eq!(Some(HalfMatch::must(0, 2)), got); + /// + /// input.set_start(3); + /// let got = re.try_search_fwd(&input)?; + /// assert_eq!(Some(HalfMatch::must(0, 3)), got); + /// + /// input.set_start(4); + /// let got = re.try_search_fwd(&input)?; + /// assert_eq!(None, got); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + fn is_utf8(&self) -> bool; + + /// Returns true if and only if this DFA is limited to returning matches + /// whose start position is `0`. + /// + /// Note that if you're using DFAs provided by + /// this crate, then this is _orthogonal_ to + /// [`Config::start_kind`](crate::dfa::dense::Config::start_kind). + /// + /// This is useful in some cases because if a DFA is limited to producing + /// matches that start at offset `0`, then a reverse search is never + /// required for finding the start of a match. + /// + /// # Example + /// + /// ``` + /// use regex_automata::dfa::{dense::DFA, Automaton}; + /// + /// // The empty regex matches anywhere + /// let dfa = DFA::new("")?; + /// assert!(!dfa.is_always_start_anchored(), "empty matches anywhere"); + /// // 'a' matches anywhere. + /// let dfa = DFA::new("a")?; + /// assert!(!dfa.is_always_start_anchored(), "'a' matches anywhere"); + /// // '^' only matches at offset 0! + /// let dfa = DFA::new("^a")?; + /// assert!(dfa.is_always_start_anchored(), "'^a' matches only at 0"); + /// // But '(?m:^)' matches at 0 but at other offsets too. + /// let dfa = DFA::new("(?m:^)a")?; + /// assert!(!dfa.is_always_start_anchored(), "'(?m:^)a' matches anywhere"); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + fn is_always_start_anchored(&self) -> bool; + + /// Return a slice of bytes to accelerate for the given state, if possible. + /// + /// If the given state has no accelerator, then an empty slice must be + /// returned. If `Automaton::is_accel_state` returns true for the given ID, + /// then this routine _must_ return a non-empty slice. But note that it is + /// not required for an implementation of this trait to ever return `true` + /// for `is_accel_state`, even if the state _could_ be accelerated. That + /// is, acceleration is an optional optimization. But the return values of + /// `is_accel_state` and `accelerator` must be in sync. + /// + /// If the given ID is not a valid state ID for this automaton, then + /// implementations may panic or produce incorrect results. + /// + /// See [`Automaton::is_accel_state`] for more details on state + /// acceleration. + /// + /// By default, this method will always return an empty slice. + /// + /// # Example + /// + /// This example shows a contrived case in which we build a regex that we + /// know is accelerated and extract the accelerator from a state. + /// + /// ``` + /// use regex_automata::{ + /// dfa::{Automaton, dense}, + /// util::{primitives::StateID, syntax}, + /// }; + /// + /// let dfa = dense::Builder::new() + /// // We disable Unicode everywhere and permit the regex to match + /// // invalid UTF-8. e.g., [^abc] matches \xFF, which is not valid + /// // UTF-8. If we left Unicode enabled, [^abc] would match any UTF-8 + /// // encoding of any Unicode scalar value except for 'a', 'b' or 'c'. + /// // That translates to a much more complicated DFA, and also + /// // inhibits the 'accelerator' optimization that we are trying to + /// // demonstrate in this example. + /// .syntax(syntax::Config::new().unicode(false).utf8(false)) + /// .build("[^abc]+a")?; + /// + /// // Here we just pluck out the state that we know is accelerated. + /// // While the stride calculations are something that can be relied + /// // on by callers, the specific position of the accelerated state is + /// // implementation defined. + /// // + /// // N.B. We get '3' by inspecting the state machine using 'regex-cli'. + /// // e.g., try `regex-cli debug dfa dense '[^abc]+a' -BbUC`. + /// let id = StateID::new(3 * dfa.stride()).unwrap(); + /// let accelerator = dfa.accelerator(id); + /// // The `[^abc]+` sub-expression permits [a, b, c] to be accelerated. + /// assert_eq!(accelerator, &[b'a', b'b', b'c']); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + fn accelerator(&self, _id: StateID) -> &[u8] { + &[] + } + + /// Returns the prefilter associated with a DFA, if one exists. + /// + /// The default implementation of this trait always returns `None`. And + /// indeed, it is always correct to return `None`. + /// + /// For DFAs in this crate, a prefilter can be attached to a DFA via + /// [`dense::Config::prefilter`](crate::dfa::dense::Config::prefilter). + /// + /// Do note that prefilters are not serialized by DFAs in this crate. + /// So if you deserialize a DFA that had a prefilter attached to it + /// at serialization time, then it will not have a prefilter after + /// deserialization. + #[inline] + fn get_prefilter(&self) -> Option<&Prefilter> { + None + } + + /// Executes a forward search and returns the end position of the leftmost + /// match that is found. If no match exists, then `None` is returned. + /// + /// In particular, this method continues searching even after it enters + /// a match state. The search only terminates once it has reached the + /// end of the input or when it has entered a dead or quit state. Upon + /// termination, the position of the last byte seen while still in a match + /// state is returned. + /// + /// # Errors + /// + /// This routine errors if the search could not complete. This can occur + /// in a number of circumstances: + /// + /// * The configuration of the DFA may permit it to "quit" the search. + /// For example, setting quit bytes or enabling heuristic support for + /// Unicode word boundaries. The default configuration does not enable any + /// option that could result in the DFA quitting. + /// * When the provided `Input` configuration is not supported. For + /// example, by providing an unsupported anchor mode. + /// + /// When a search returns an error, callers cannot know whether a match + /// exists or not. + /// + /// # Notes for implementors + /// + /// Implementors of this trait are not required to implement any particular + /// match semantics (such as leftmost-first), which are instead manifest in + /// the DFA's transitions. But this search routine should behave as a + /// general "leftmost" search. + /// + /// In particular, this method must continue searching even after it enters + /// a match state. The search should only terminate once it has reached + /// the end of the input or when it has entered a dead or quit state. Upon + /// termination, the position of the last byte seen while still in a match + /// state is returned. + /// + /// Since this trait provides an implementation for this method by default, + /// it's unlikely that one will need to implement this. + /// + /// # Example + /// + /// This example shows how to use this method with a + /// [`dense::DFA`](crate::dfa::dense::DFA). + /// + /// ``` + /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch, Input}; + /// + /// let dfa = dense::DFA::new("foo[0-9]+")?; + /// let expected = Some(HalfMatch::must(0, 8)); + /// assert_eq!(expected, dfa.try_search_fwd(&Input::new(b"foo12345"))?); + /// + /// // Even though a match is found after reading the first byte (`a`), + /// // the leftmost first match semantics demand that we find the earliest + /// // match that prefers earlier parts of the pattern over latter parts. + /// let dfa = dense::DFA::new("abc|a")?; + /// let expected = Some(HalfMatch::must(0, 3)); + /// assert_eq!(expected, dfa.try_search_fwd(&Input::new(b"abc"))?); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// # Example: specific pattern search + /// + /// This example shows how to build a multi-DFA that permits searching for + /// specific patterns. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{ + /// dfa::{Automaton, dense}, + /// Anchored, HalfMatch, PatternID, Input, + /// }; + /// + /// let dfa = dense::Builder::new() + /// .configure(dense::Config::new().starts_for_each_pattern(true)) + /// .build_many(&["[a-z0-9]{6}", "[a-z][a-z0-9]{5}"])?; + /// let haystack = "foo123".as_bytes(); + /// + /// // Since we are using the default leftmost-first match and both + /// // patterns match at the same starting position, only the first pattern + /// // will be returned in this case when doing a search for any of the + /// // patterns. + /// let expected = Some(HalfMatch::must(0, 6)); + /// let got = dfa.try_search_fwd(&Input::new(haystack))?; + /// assert_eq!(expected, got); + /// + /// // But if we want to check whether some other pattern matches, then we + /// // can provide its pattern ID. + /// let input = Input::new(haystack) + /// .anchored(Anchored::Pattern(PatternID::must(1))); + /// let expected = Some(HalfMatch::must(1, 6)); + /// let got = dfa.try_search_fwd(&input)?; + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// # Example: specifying the bounds of a search + /// + /// This example shows how providing the bounds of a search can produce + /// different results than simply sub-slicing the haystack. + /// + /// ``` + /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch, Input}; + /// + /// // N.B. We disable Unicode here so that we use a simple ASCII word + /// // boundary. Alternatively, we could enable heuristic support for + /// // Unicode word boundaries. + /// let dfa = dense::DFA::new(r"(?-u)\b[0-9]{3}\b")?; + /// let haystack = "foo123bar".as_bytes(); + /// + /// // Since we sub-slice the haystack, the search doesn't know about the + /// // larger context and assumes that `123` is surrounded by word + /// // boundaries. And of course, the match position is reported relative + /// // to the sub-slice as well, which means we get `3` instead of `6`. + /// let input = Input::new(&haystack[3..6]); + /// let expected = Some(HalfMatch::must(0, 3)); + /// let got = dfa.try_search_fwd(&input)?; + /// assert_eq!(expected, got); + /// + /// // But if we provide the bounds of the search within the context of the + /// // entire haystack, then the search can take the surrounding context + /// // into account. (And if we did find a match, it would be reported + /// // as a valid offset into `haystack` instead of its sub-slice.) + /// let input = Input::new(haystack).range(3..6); + /// let expected = None; + /// let got = dfa.try_search_fwd(&input)?; + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + fn try_search_fwd( + &self, + input: &Input<'_>, + ) -> Result<Option<HalfMatch>, MatchError> { + let utf8empty = self.has_empty() && self.is_utf8(); + let hm = match search::find_fwd(&self, input)? { + None => return Ok(None), + Some(hm) if !utf8empty => return Ok(Some(hm)), + Some(hm) => hm, + }; + // We get to this point when we know our DFA can match the empty string + // AND when UTF-8 mode is enabled. In this case, we skip any matches + // whose offset splits a codepoint. Such a match is necessarily a + // zero-width match, because UTF-8 mode requires the underlying NFA + // to be built such that all non-empty matches span valid UTF-8. + // Therefore, any match that ends in the middle of a codepoint cannot + // be part of a span of valid UTF-8 and thus must be an empty match. + // In such cases, we skip it, so as not to report matches that split a + // codepoint. + // + // Note that this is not a checked assumption. Callers *can* provide an + // NFA with UTF-8 mode enabled but produces non-empty matches that span + // invalid UTF-8. But doing so is documented to result in unspecified + // behavior. + empty::skip_splits_fwd(input, hm, hm.offset(), |input| { + let got = search::find_fwd(&self, input)?; + Ok(got.map(|hm| (hm, hm.offset()))) + }) + } + + /// Executes a reverse search and returns the start of the position of the + /// leftmost match that is found. If no match exists, then `None` is + /// returned. + /// + /// # Errors + /// + /// This routine errors if the search could not complete. This can occur + /// in a number of circumstances: + /// + /// * The configuration of the DFA may permit it to "quit" the search. + /// For example, setting quit bytes or enabling heuristic support for + /// Unicode word boundaries. The default configuration does not enable any + /// option that could result in the DFA quitting. + /// * When the provided `Input` configuration is not supported. For + /// example, by providing an unsupported anchor mode. + /// + /// When a search returns an error, callers cannot know whether a match + /// exists or not. + /// + /// # Example + /// + /// This example shows how to use this method with a + /// [`dense::DFA`](crate::dfa::dense::DFA). In particular, this + /// routine is principally useful when used in conjunction with the + /// [`nfa::thompson::Config::reverse`](crate::nfa::thompson::Config::reverse) + /// configuration. In general, it's unlikely to be correct to use + /// both `try_search_fwd` and `try_search_rev` with the same DFA since + /// any particular DFA will only support searching in one direction with + /// respect to the pattern. + /// + /// ``` + /// use regex_automata::{ + /// nfa::thompson, + /// dfa::{Automaton, dense}, + /// HalfMatch, Input, + /// }; + /// + /// let dfa = dense::Builder::new() + /// .thompson(thompson::Config::new().reverse(true)) + /// .build("foo[0-9]+")?; + /// let expected = Some(HalfMatch::must(0, 0)); + /// assert_eq!(expected, dfa.try_search_rev(&Input::new(b"foo12345"))?); + /// + /// // Even though a match is found after reading the last byte (`c`), + /// // the leftmost first match semantics demand that we find the earliest + /// // match that prefers earlier parts of the pattern over latter parts. + /// let dfa = dense::Builder::new() + /// .thompson(thompson::Config::new().reverse(true)) + /// .build("abc|c")?; + /// let expected = Some(HalfMatch::must(0, 0)); + /// assert_eq!(expected, dfa.try_search_rev(&Input::new(b"abc"))?); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// # Example: UTF-8 mode + /// + /// This examples demonstrates that UTF-8 mode applies to reverse + /// DFAs. When UTF-8 mode is enabled in the underlying NFA, then all + /// matches reported must correspond to valid UTF-8 spans. This includes + /// prohibiting zero-width matches that split a codepoint. + /// + /// UTF-8 mode is enabled by default. Notice below how the only zero-width + /// matches reported are those at UTF-8 boundaries: + /// + /// ``` + /// use regex_automata::{ + /// dfa::{dense::DFA, Automaton}, + /// nfa::thompson, + /// HalfMatch, Input, MatchKind, + /// }; + /// + /// let dfa = DFA::builder() + /// .thompson(thompson::Config::new().reverse(true)) + /// .build(r"")?; + /// + /// // Run the reverse DFA to collect all matches. + /// let mut input = Input::new("☃"); + /// let mut matches = vec![]; + /// loop { + /// match dfa.try_search_rev(&input)? { + /// None => break, + /// Some(hm) => { + /// matches.push(hm); + /// if hm.offset() == 0 || input.end() == 0 { + /// break; + /// } else if hm.offset() < input.end() { + /// input.set_end(hm.offset()); + /// } else { + /// // This is only necessary to handle zero-width + /// // matches, which of course occur in this example. + /// // Without this, the search would never advance + /// // backwards beyond the initial match. + /// input.set_end(input.end() - 1); + /// } + /// } + /// } + /// } + /// + /// // No matches split a codepoint. + /// let expected = vec![ + /// HalfMatch::must(0, 3), + /// HalfMatch::must(0, 0), + /// ]; + /// assert_eq!(expected, matches); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// Now let's look at the same example, but with UTF-8 mode on the + /// original NFA disabled (which results in disabling UTF-8 mode on the + /// DFA): + /// + /// ``` + /// use regex_automata::{ + /// dfa::{dense::DFA, Automaton}, + /// nfa::thompson, + /// HalfMatch, Input, MatchKind, + /// }; + /// + /// let dfa = DFA::builder() + /// .thompson(thompson::Config::new().reverse(true).utf8(false)) + /// .build(r"")?; + /// + /// // Run the reverse DFA to collect all matches. + /// let mut input = Input::new("☃"); + /// let mut matches = vec![]; + /// loop { + /// match dfa.try_search_rev(&input)? { + /// None => break, + /// Some(hm) => { + /// matches.push(hm); + /// if hm.offset() == 0 || input.end() == 0 { + /// break; + /// } else if hm.offset() < input.end() { + /// input.set_end(hm.offset()); + /// } else { + /// // This is only necessary to handle zero-width + /// // matches, which of course occur in this example. + /// // Without this, the search would never advance + /// // backwards beyond the initial match. + /// input.set_end(input.end() - 1); + /// } + /// } + /// } + /// } + /// + /// // No matches split a codepoint. + /// let expected = vec![ + /// HalfMatch::must(0, 3), + /// HalfMatch::must(0, 2), + /// HalfMatch::must(0, 1), + /// HalfMatch::must(0, 0), + /// ]; + /// assert_eq!(expected, matches); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + fn try_search_rev( + &self, + input: &Input<'_>, + ) -> Result<Option<HalfMatch>, MatchError> { + let utf8empty = self.has_empty() && self.is_utf8(); + let hm = match search::find_rev(self, input)? { + None => return Ok(None), + Some(hm) if !utf8empty => return Ok(Some(hm)), + Some(hm) => hm, + }; + empty::skip_splits_rev(input, hm, hm.offset(), |input| { + let got = search::find_rev(self, input)?; + Ok(got.map(|hm| (hm, hm.offset()))) + }) + } + + /// Executes an overlapping forward search. Matches, if one exists, can be + /// obtained via the [`OverlappingState::get_match`] method. + /// + /// This routine is principally only useful when searching for multiple + /// patterns on inputs where multiple patterns may match the same regions + /// of text. In particular, callers must preserve the automaton's search + /// state from prior calls so that the implementation knows where the last + /// match occurred. + /// + /// When using this routine to implement an iterator of overlapping + /// matches, the `start` of the search should always be set to the end + /// of the last match. If more patterns match at the previous location, + /// then they will be immediately returned. (This is tracked by the given + /// overlapping state.) Otherwise, the search continues at the starting + /// position given. + /// + /// If for some reason you want the search to forget about its previous + /// state and restart the search at a particular position, then setting the + /// state to [`OverlappingState::start`] will accomplish that. + /// + /// # Errors + /// + /// This routine errors if the search could not complete. This can occur + /// in a number of circumstances: + /// + /// * The configuration of the DFA may permit it to "quit" the search. + /// For example, setting quit bytes or enabling heuristic support for + /// Unicode word boundaries. The default configuration does not enable any + /// option that could result in the DFA quitting. + /// * When the provided `Input` configuration is not supported. For + /// example, by providing an unsupported anchor mode. + /// + /// When a search returns an error, callers cannot know whether a match + /// exists or not. + /// + /// # Example + /// + /// This example shows how to run a basic overlapping search with a + /// [`dense::DFA`](crate::dfa::dense::DFA). Notice that we build the + /// automaton with a `MatchKind::All` configuration. Overlapping searches + /// are unlikely to work as one would expect when using the default + /// `MatchKind::LeftmostFirst` match semantics, since leftmost-first + /// matching is fundamentally incompatible with overlapping searches. + /// Namely, overlapping searches need to report matches as they are seen, + /// where as leftmost-first searches will continue searching even after a + /// match has been observed in order to find the conventional end position + /// of the match. More concretely, leftmost-first searches use dead states + /// to terminate a search after a specific match can no longer be extended. + /// Overlapping searches instead do the opposite by continuing the search + /// to find totally new matches (potentially of other patterns). + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{ + /// dfa::{Automaton, OverlappingState, dense}, + /// HalfMatch, Input, MatchKind, + /// }; + /// + /// let dfa = dense::Builder::new() + /// .configure(dense::Config::new().match_kind(MatchKind::All)) + /// .build_many(&[r"[[:word:]]+$", r"[[:^space:]]+$"])?; + /// let haystack = "@foo"; + /// let mut state = OverlappingState::start(); + /// + /// let expected = Some(HalfMatch::must(1, 4)); + /// dfa.try_search_overlapping_fwd(&Input::new(haystack), &mut state)?; + /// assert_eq!(expected, state.get_match()); + /// + /// // The first pattern also matches at the same position, so re-running + /// // the search will yield another match. Notice also that the first + /// // pattern is returned after the second. This is because the second + /// // pattern begins its match before the first, is therefore an earlier + /// // match and is thus reported first. + /// let expected = Some(HalfMatch::must(0, 4)); + /// dfa.try_search_overlapping_fwd(&Input::new(haystack), &mut state)?; + /// assert_eq!(expected, state.get_match()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + fn try_search_overlapping_fwd( + &self, + input: &Input<'_>, + state: &mut OverlappingState, + ) -> Result<(), MatchError> { + let utf8empty = self.has_empty() && self.is_utf8(); + search::find_overlapping_fwd(self, input, state)?; + match state.get_match() { + None => Ok(()), + Some(_) if !utf8empty => Ok(()), + Some(_) => skip_empty_utf8_splits_overlapping( + input, + state, + |input, state| { + search::find_overlapping_fwd(self, input, state) + }, + ), + } + } + + /// Executes a reverse overlapping forward search. Matches, if one exists, + /// can be obtained via the [`OverlappingState::get_match`] method. + /// + /// When using this routine to implement an iterator of overlapping + /// matches, the `start` of the search should remain invariant throughout + /// iteration. The `OverlappingState` given to the search will keep track + /// of the current position of the search. (This is because multiple + /// matches may be reported at the same position, so only the search + /// implementation itself knows when to advance the position.) + /// + /// If for some reason you want the search to forget about its previous + /// state and restart the search at a particular position, then setting the + /// state to [`OverlappingState::start`] will accomplish that. + /// + /// # Errors + /// + /// This routine errors if the search could not complete. This can occur + /// in a number of circumstances: + /// + /// * The configuration of the DFA may permit it to "quit" the search. + /// For example, setting quit bytes or enabling heuristic support for + /// Unicode word boundaries. The default configuration does not enable any + /// option that could result in the DFA quitting. + /// * When the provided `Input` configuration is not supported. For + /// example, by providing an unsupported anchor mode. + /// + /// When a search returns an error, callers cannot know whether a match + /// exists or not. + /// + /// # Example: UTF-8 mode + /// + /// This examples demonstrates that UTF-8 mode applies to reverse + /// DFAs. When UTF-8 mode is enabled in the underlying NFA, then all + /// matches reported must correspond to valid UTF-8 spans. This includes + /// prohibiting zero-width matches that split a codepoint. + /// + /// UTF-8 mode is enabled by default. Notice below how the only zero-width + /// matches reported are those at UTF-8 boundaries: + /// + /// ``` + /// use regex_automata::{ + /// dfa::{dense::DFA, Automaton, OverlappingState}, + /// nfa::thompson, + /// HalfMatch, Input, MatchKind, + /// }; + /// + /// let dfa = DFA::builder() + /// .configure(DFA::config().match_kind(MatchKind::All)) + /// .thompson(thompson::Config::new().reverse(true)) + /// .build_many(&[r"", r"☃"])?; + /// + /// // Run the reverse DFA to collect all matches. + /// let input = Input::new("☃"); + /// let mut state = OverlappingState::start(); + /// let mut matches = vec![]; + /// loop { + /// dfa.try_search_overlapping_rev(&input, &mut state)?; + /// match state.get_match() { + /// None => break, + /// Some(hm) => matches.push(hm), + /// } + /// } + /// + /// // No matches split a codepoint. + /// let expected = vec![ + /// HalfMatch::must(0, 3), + /// HalfMatch::must(1, 0), + /// HalfMatch::must(0, 0), + /// ]; + /// assert_eq!(expected, matches); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// Now let's look at the same example, but with UTF-8 mode on the + /// original NFA disabled (which results in disabling UTF-8 mode on the + /// DFA): + /// + /// ``` + /// use regex_automata::{ + /// dfa::{dense::DFA, Automaton, OverlappingState}, + /// nfa::thompson, + /// HalfMatch, Input, MatchKind, + /// }; + /// + /// let dfa = DFA::builder() + /// .configure(DFA::config().match_kind(MatchKind::All)) + /// .thompson(thompson::Config::new().reverse(true).utf8(false)) + /// .build_many(&[r"", r"☃"])?; + /// + /// // Run the reverse DFA to collect all matches. + /// let input = Input::new("☃"); + /// let mut state = OverlappingState::start(); + /// let mut matches = vec![]; + /// loop { + /// dfa.try_search_overlapping_rev(&input, &mut state)?; + /// match state.get_match() { + /// None => break, + /// Some(hm) => matches.push(hm), + /// } + /// } + /// + /// // Now *all* positions match, even within a codepoint, + /// // because we lifted the requirement that matches + /// // correspond to valid UTF-8 spans. + /// let expected = vec![ + /// HalfMatch::must(0, 3), + /// HalfMatch::must(0, 2), + /// HalfMatch::must(0, 1), + /// HalfMatch::must(1, 0), + /// HalfMatch::must(0, 0), + /// ]; + /// assert_eq!(expected, matches); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + fn try_search_overlapping_rev( + &self, + input: &Input<'_>, + state: &mut OverlappingState, + ) -> Result<(), MatchError> { + let utf8empty = self.has_empty() && self.is_utf8(); + search::find_overlapping_rev(self, input, state)?; + match state.get_match() { + None => Ok(()), + Some(_) if !utf8empty => Ok(()), + Some(_) => skip_empty_utf8_splits_overlapping( + input, + state, + |input, state| { + search::find_overlapping_rev(self, input, state) + }, + ), + } + } + + /// Writes the set of patterns that match anywhere in the given search + /// configuration to `patset`. If multiple patterns match at the same + /// position and the underlying DFA supports overlapping matches, then all + /// matching patterns are written to the given set. + /// + /// Unless all of the patterns in this DFA are anchored, then generally + /// speaking, this will visit every byte in the haystack. + /// + /// This search routine *does not* clear the pattern set. This gives some + /// flexibility to the caller (e.g., running multiple searches with the + /// same pattern set), but does make the API bug-prone if you're reusing + /// the same pattern set for multiple searches but intended them to be + /// independent. + /// + /// If a pattern ID matched but the given `PatternSet` does not have + /// sufficient capacity to store it, then it is not inserted and silently + /// dropped. + /// + /// # Errors + /// + /// This routine errors if the search could not complete. This can occur + /// in a number of circumstances: + /// + /// * The configuration of the DFA may permit it to "quit" the search. + /// For example, setting quit bytes or enabling heuristic support for + /// Unicode word boundaries. The default configuration does not enable any + /// option that could result in the DFA quitting. + /// * When the provided `Input` configuration is not supported. For + /// example, by providing an unsupported anchor mode. + /// + /// When a search returns an error, callers cannot know whether a match + /// exists or not. + /// + /// # Example + /// + /// This example shows how to find all matching patterns in a haystack, + /// even when some patterns match at the same position as other patterns. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{ + /// dfa::{Automaton, dense::DFA}, + /// Input, MatchKind, PatternSet, + /// }; + /// + /// let patterns = &[ + /// r"[[:word:]]+", + /// r"[0-9]+", + /// r"[[:alpha:]]+", + /// r"foo", + /// r"bar", + /// r"barfoo", + /// r"foobar", + /// ]; + /// let dfa = DFA::builder() + /// .configure(DFA::config().match_kind(MatchKind::All)) + /// .build_many(patterns)?; + /// + /// let input = Input::new("foobar"); + /// let mut patset = PatternSet::new(dfa.pattern_len()); + /// dfa.try_which_overlapping_matches(&input, &mut patset)?; + /// let expected = vec![0, 2, 3, 4, 6]; + /// let got: Vec<usize> = patset.iter().map(|p| p.as_usize()).collect(); + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[cfg(feature = "alloc")] + #[inline] + fn try_which_overlapping_matches( + &self, + input: &Input<'_>, + patset: &mut PatternSet, + ) -> Result<(), MatchError> { + let mut state = OverlappingState::start(); + while let Some(m) = { + self.try_search_overlapping_fwd(input, &mut state)?; + state.get_match() + } { + let _ = patset.insert(m.pattern()); + // There's nothing left to find, so we can stop. Or the caller + // asked us to. + if patset.is_full() || input.get_earliest() { + break; + } + } + Ok(()) + } +} + +unsafe impl<'a, A: Automaton + ?Sized> Automaton for &'a A { + #[inline] + fn next_state(&self, current: StateID, input: u8) -> StateID { + (**self).next_state(current, input) + } + + #[inline] + unsafe fn next_state_unchecked( + &self, + current: StateID, + input: u8, + ) -> StateID { + (**self).next_state_unchecked(current, input) + } + + #[inline] + fn next_eoi_state(&self, current: StateID) -> StateID { + (**self).next_eoi_state(current) + } + + #[inline] + fn start_state_forward( + &self, + input: &Input<'_>, + ) -> Result<StateID, MatchError> { + (**self).start_state_forward(input) + } + + #[inline] + fn start_state_reverse( + &self, + input: &Input<'_>, + ) -> Result<StateID, MatchError> { + (**self).start_state_reverse(input) + } + + #[inline] + fn universal_start_state(&self, mode: Anchored) -> Option<StateID> { + (**self).universal_start_state(mode) + } + + #[inline] + fn is_special_state(&self, id: StateID) -> bool { + (**self).is_special_state(id) + } + + #[inline] + fn is_dead_state(&self, id: StateID) -> bool { + (**self).is_dead_state(id) + } + + #[inline] + fn is_quit_state(&self, id: StateID) -> bool { + (**self).is_quit_state(id) + } + + #[inline] + fn is_match_state(&self, id: StateID) -> bool { + (**self).is_match_state(id) + } + + #[inline] + fn is_start_state(&self, id: StateID) -> bool { + (**self).is_start_state(id) + } + + #[inline] + fn is_accel_state(&self, id: StateID) -> bool { + (**self).is_accel_state(id) + } + + #[inline] + fn pattern_len(&self) -> usize { + (**self).pattern_len() + } + + #[inline] + fn match_len(&self, id: StateID) -> usize { + (**self).match_len(id) + } + + #[inline] + fn match_pattern(&self, id: StateID, index: usize) -> PatternID { + (**self).match_pattern(id, index) + } + + #[inline] + fn has_empty(&self) -> bool { + (**self).has_empty() + } + + #[inline] + fn is_utf8(&self) -> bool { + (**self).is_utf8() + } + + #[inline] + fn is_always_start_anchored(&self) -> bool { + (**self).is_always_start_anchored() + } + + #[inline] + fn accelerator(&self, id: StateID) -> &[u8] { + (**self).accelerator(id) + } + + #[inline] + fn get_prefilter(&self) -> Option<&Prefilter> { + (**self).get_prefilter() + } + + #[inline] + fn try_search_fwd( + &self, + input: &Input<'_>, + ) -> Result<Option<HalfMatch>, MatchError> { + (**self).try_search_fwd(input) + } + + #[inline] + fn try_search_rev( + &self, + input: &Input<'_>, + ) -> Result<Option<HalfMatch>, MatchError> { + (**self).try_search_rev(input) + } + + #[inline] + fn try_search_overlapping_fwd( + &self, + input: &Input<'_>, + state: &mut OverlappingState, + ) -> Result<(), MatchError> { + (**self).try_search_overlapping_fwd(input, state) + } + + #[inline] + fn try_search_overlapping_rev( + &self, + input: &Input<'_>, + state: &mut OverlappingState, + ) -> Result<(), MatchError> { + (**self).try_search_overlapping_rev(input, state) + } + + #[cfg(feature = "alloc")] + #[inline] + fn try_which_overlapping_matches( + &self, + input: &Input<'_>, + patset: &mut PatternSet, + ) -> Result<(), MatchError> { + (**self).try_which_overlapping_matches(input, patset) + } +} + +/// Represents the current state of an overlapping search. +/// +/// This is used for overlapping searches since they need to know something +/// about the previous search. For example, when multiple patterns match at the +/// same position, this state tracks the last reported pattern so that the next +/// search knows whether to report another matching pattern or continue with +/// the search at the next position. Additionally, it also tracks which state +/// the last search call terminated in. +/// +/// This type provides little introspection capabilities. The only thing a +/// caller can do is construct it and pass it around to permit search routines +/// to use it to track state, and also ask whether a match has been found. +/// +/// Callers should always provide a fresh state constructed via +/// [`OverlappingState::start`] when starting a new search. Reusing state from +/// a previous search may result in incorrect results. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct OverlappingState { + /// The match reported by the most recent overlapping search to use this + /// state. + /// + /// If a search does not find any matches, then it is expected to clear + /// this value. + pub(crate) mat: Option<HalfMatch>, + /// The state ID of the state at which the search was in when the call + /// terminated. When this is a match state, `last_match` must be set to a + /// non-None value. + /// + /// A `None` value indicates the start state of the corresponding + /// automaton. We cannot use the actual ID, since any one automaton may + /// have many start states, and which one is in use depends on several + /// search-time factors. + pub(crate) id: Option<StateID>, + /// The position of the search. + /// + /// When `id` is None (i.e., we are starting a search), this is set to + /// the beginning of the search as given by the caller regardless of its + /// current value. Subsequent calls to an overlapping search pick up at + /// this offset. + pub(crate) at: usize, + /// The index into the matching patterns of the next match to report if the + /// current state is a match state. Note that this may be 1 greater than + /// the total number of matches to report for the current match state. (In + /// which case, no more matches should be reported at the current position + /// and the search should advance to the next position.) + pub(crate) next_match_index: Option<usize>, + /// This is set to true when a reverse overlapping search has entered its + /// EOI transitions. + /// + /// This isn't used in a forward search because it knows to stop once the + /// position exceeds the end of the search range. In a reverse search, + /// since we use unsigned offsets, we don't "know" once we've gone past + /// `0`. So the only way to detect it is with this extra flag. The reverse + /// overlapping search knows to terminate specifically after it has + /// reported all matches after following the EOI transition. + pub(crate) rev_eoi: bool, +} + +impl OverlappingState { + /// Create a new overlapping state that begins at the start state of any + /// automaton. + pub fn start() -> OverlappingState { + OverlappingState { + mat: None, + id: None, + at: 0, + next_match_index: None, + rev_eoi: false, + } + } + + /// Return the match result of the most recent search to execute with this + /// state. + /// + /// A searches will clear this result automatically, such that if no + /// match is found, this will correctly report `None`. + pub fn get_match(&self) -> Option<HalfMatch> { + self.mat + } +} + +/// Runs the given overlapping `search` function (forwards or backwards) until +/// a match is found whose offset does not split a codepoint. +/// +/// This is *not* always correct to call. It should only be called when the DFA +/// has UTF-8 mode enabled *and* it can produce zero-width matches. Calling +/// this when both of those things aren't true might result in legitimate +/// matches getting skipped. +#[cold] +#[inline(never)] +fn skip_empty_utf8_splits_overlapping<F>( + input: &Input<'_>, + state: &mut OverlappingState, + mut search: F, +) -> Result<(), MatchError> +where + F: FnMut(&Input<'_>, &mut OverlappingState) -> Result<(), MatchError>, +{ + // Note that this routine works for forwards and reverse searches + // even though there's no code here to handle those cases. That's + // because overlapping searches drive themselves to completion via + // `OverlappingState`. So all we have to do is push it until no matches are + // found. + + let mut hm = match state.get_match() { + None => return Ok(()), + Some(hm) => hm, + }; + if input.get_anchored().is_anchored() { + if !input.is_char_boundary(hm.offset()) { + state.mat = None; + } + return Ok(()); + } + while !input.is_char_boundary(hm.offset()) { + search(input, state)?; + hm = match state.get_match() { + None => return Ok(()), + Some(hm) => hm, + }; + } + Ok(()) +} + +/// Write a prefix "state" indicator for fmt::Debug impls. +/// +/// Specifically, this tries to succinctly distinguish the different types of +/// states: dead states, quit states, accelerated states, start states and +/// match states. It even accounts for the possible overlappings of different +/// state types. +pub(crate) fn fmt_state_indicator<A: Automaton>( + f: &mut core::fmt::Formatter<'_>, + dfa: A, + id: StateID, +) -> core::fmt::Result { + if dfa.is_dead_state(id) { + write!(f, "D")?; + if dfa.is_start_state(id) { + write!(f, ">")?; + } else { + write!(f, " ")?; + } + } else if dfa.is_quit_state(id) { + write!(f, "Q ")?; + } else if dfa.is_start_state(id) { + if dfa.is_accel_state(id) { + write!(f, "A>")?; + } else { + write!(f, " >")?; + } + } else if dfa.is_match_state(id) { + if dfa.is_accel_state(id) { + write!(f, "A*")?; + } else { + write!(f, " *")?; + } + } else if dfa.is_accel_state(id) { + write!(f, "A ")?; + } else { + write!(f, " ")?; + } + Ok(()) +} + +#[cfg(all(test, feature = "syntax", feature = "dfa-build"))] +mod tests { + // A basic test ensuring that our Automaton trait is object safe. (This is + // the main reason why we don't define the search routines as generic over + // Into<Input>.) + #[test] + fn object_safe() { + use crate::{ + dfa::{dense, Automaton}, + HalfMatch, Input, + }; + + let dfa = dense::DFA::new("abc").unwrap(); + let dfa: &dyn Automaton = &dfa; + assert_eq!( + Ok(Some(HalfMatch::must(0, 6))), + dfa.try_search_fwd(&Input::new(b"xyzabcxyz")), + ); + } +} diff --git a/third_party/rust/regex-automata/src/dfa/dense.rs b/third_party/rust/regex-automata/src/dfa/dense.rs new file mode 100644 index 0000000000..6da865f977 --- /dev/null +++ b/third_party/rust/regex-automata/src/dfa/dense.rs @@ -0,0 +1,5139 @@ +/*! +Types and routines specific to dense DFAs. + +This module is the home of [`dense::DFA`](DFA). + +This module also contains a [`dense::Builder`](Builder) and a +[`dense::Config`](Config) for building and configuring a dense DFA. +*/ + +#[cfg(feature = "dfa-build")] +use core::cmp; +use core::{convert::TryFrom, fmt, iter, mem::size_of, slice}; + +#[cfg(feature = "dfa-build")] +use alloc::{ + collections::{BTreeMap, BTreeSet}, + vec, + vec::Vec, +}; + +#[cfg(feature = "dfa-build")] +use crate::{ + dfa::{ + accel::Accel, determinize, minimize::Minimizer, remapper::Remapper, + sparse, + }, + nfa::thompson, + util::{look::LookMatcher, search::MatchKind}, +}; +use crate::{ + dfa::{ + accel::Accels, + automaton::{fmt_state_indicator, Automaton}, + special::Special, + start::StartKind, + DEAD, + }, + util::{ + alphabet::{self, ByteClasses, ByteSet}, + int::{Pointer, Usize}, + prefilter::Prefilter, + primitives::{PatternID, StateID}, + search::{Anchored, Input, MatchError}, + start::{Start, StartByteMap}, + wire::{self, DeserializeError, Endian, SerializeError}, + }, +}; + +/// The label that is pre-pended to a serialized DFA. +const LABEL: &str = "rust-regex-automata-dfa-dense"; + +/// The format version of dense regexes. This version gets incremented when a +/// change occurs. A change may not necessarily be a breaking change, but the +/// version does permit good error messages in the case where a breaking change +/// is made. +const VERSION: u32 = 2; + +/// The configuration used for compiling a dense DFA. +/// +/// As a convenience, [`DFA::config`] is an alias for [`Config::new`]. The +/// advantage of the former is that it often lets you avoid importing the +/// `Config` type directly. +/// +/// A dense DFA configuration is a simple data object that is typically used +/// with [`dense::Builder::configure`](self::Builder::configure). +/// +/// The default configuration guarantees that a search will never return +/// a "quit" error, although it is possible for a search to fail if +/// [`Config::starts_for_each_pattern`] wasn't enabled (which it is not by +/// default) and an [`Anchored::Pattern`] mode is requested via [`Input`]. +#[cfg(feature = "dfa-build")] +#[derive(Clone, Debug, Default)] +pub struct Config { + // As with other configuration types in this crate, we put all our knobs + // in options so that we can distinguish between "default" and "not set." + // This makes it possible to easily combine multiple configurations + // without default values overwriting explicitly specified values. See the + // 'overwrite' method. + // + // For docs on the fields below, see the corresponding method setters. + accelerate: Option<bool>, + pre: Option<Option<Prefilter>>, + minimize: Option<bool>, + match_kind: Option<MatchKind>, + start_kind: Option<StartKind>, + starts_for_each_pattern: Option<bool>, + byte_classes: Option<bool>, + unicode_word_boundary: Option<bool>, + quitset: Option<ByteSet>, + specialize_start_states: Option<bool>, + dfa_size_limit: Option<Option<usize>>, + determinize_size_limit: Option<Option<usize>>, +} + +#[cfg(feature = "dfa-build")] +impl Config { + /// Return a new default dense DFA compiler configuration. + pub fn new() -> Config { + Config::default() + } + + /// Enable state acceleration. + /// + /// When enabled, DFA construction will analyze each state to determine + /// whether it is eligible for simple acceleration. Acceleration typically + /// occurs when most of a state's transitions loop back to itself, leaving + /// only a select few bytes that will exit the state. When this occurs, + /// other routines like `memchr` can be used to look for those bytes which + /// may be much faster than traversing the DFA. + /// + /// Callers may elect to disable this if consistent performance is more + /// desirable than variable performance. Namely, acceleration can sometimes + /// make searching slower than it otherwise would be if the transitions + /// that leave accelerated states are traversed frequently. + /// + /// See [`Automaton::accelerator`](crate::dfa::Automaton::accelerator) for + /// an example. + /// + /// This is enabled by default. + pub fn accelerate(mut self, yes: bool) -> Config { + self.accelerate = Some(yes); + self + } + + /// Set a prefilter to be used whenever a start state is entered. + /// + /// A [`Prefilter`] in this context is meant to accelerate searches by + /// looking for literal prefixes that every match for the corresponding + /// pattern (or patterns) must start with. Once a prefilter produces a + /// match, the underlying search routine continues on to try and confirm + /// the match. + /// + /// Be warned that setting a prefilter does not guarantee that the search + /// will be faster. While it's usually a good bet, if the prefilter + /// produces a lot of false positive candidates (i.e., positions matched + /// by the prefilter but not by the regex), then the overall result can + /// be slower than if you had just executed the regex engine without any + /// prefilters. + /// + /// Note that unless [`Config::specialize_start_states`] has been + /// explicitly set, then setting this will also enable (when `pre` is + /// `Some`) or disable (when `pre` is `None`) start state specialization. + /// This occurs because without start state specialization, a prefilter + /// is likely to be less effective. And without a prefilter, start state + /// specialization is usually pointless. + /// + /// **WARNING:** Note that prefilters are not preserved as part of + /// serialization. Serializing a DFA will drop its prefilter. + /// + /// By default no prefilter is set. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{ + /// dfa::{dense::DFA, Automaton}, + /// util::prefilter::Prefilter, + /// Input, HalfMatch, MatchKind, + /// }; + /// + /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["foo", "bar"]); + /// let re = DFA::builder() + /// .configure(DFA::config().prefilter(pre)) + /// .build(r"(foo|bar)[a-z]+")?; + /// let input = Input::new("foo1 barfox bar"); + /// assert_eq!( + /// Some(HalfMatch::must(0, 11)), + /// re.try_search_fwd(&input)?, + /// ); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// Be warned though that an incorrect prefilter can lead to incorrect + /// results! + /// + /// ``` + /// use regex_automata::{ + /// dfa::{dense::DFA, Automaton}, + /// util::prefilter::Prefilter, + /// Input, HalfMatch, MatchKind, + /// }; + /// + /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["foo", "car"]); + /// let re = DFA::builder() + /// .configure(DFA::config().prefilter(pre)) + /// .build(r"(foo|bar)[a-z]+")?; + /// let input = Input::new("foo1 barfox bar"); + /// assert_eq!( + /// // No match reported even though there clearly is one! + /// None, + /// re.try_search_fwd(&input)?, + /// ); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn prefilter(mut self, pre: Option<Prefilter>) -> Config { + self.pre = Some(pre); + if self.specialize_start_states.is_none() { + self.specialize_start_states = + Some(self.get_prefilter().is_some()); + } + self + } + + /// Minimize the DFA. + /// + /// When enabled, the DFA built will be minimized such that it is as small + /// as possible. + /// + /// Whether one enables minimization or not depends on the types of costs + /// you're willing to pay and how much you care about its benefits. In + /// particular, minimization has worst case `O(n*k*logn)` time and `O(k*n)` + /// space, where `n` is the number of DFA states and `k` is the alphabet + /// size. In practice, minimization can be quite costly in terms of both + /// space and time, so it should only be done if you're willing to wait + /// longer to produce a DFA. In general, you might want a minimal DFA in + /// the following circumstances: + /// + /// 1. You would like to optimize for the size of the automaton. This can + /// manifest in one of two ways. Firstly, if you're converting the + /// DFA into Rust code (or a table embedded in the code), then a minimal + /// DFA will translate into a corresponding reduction in code size, and + /// thus, also the final compiled binary size. Secondly, if you are + /// building many DFAs and putting them on the heap, you'll be able to + /// fit more if they are smaller. Note though that building a minimal + /// DFA itself requires additional space; you only realize the space + /// savings once the minimal DFA is constructed (at which point, the + /// space used for minimization is freed). + /// 2. You've observed that a smaller DFA results in faster match + /// performance. Naively, this isn't guaranteed since there is no + /// inherent difference between matching with a bigger-than-minimal + /// DFA and a minimal DFA. However, a smaller DFA may make use of your + /// CPU's cache more efficiently. + /// 3. You are trying to establish an equivalence between regular + /// languages. The standard method for this is to build a minimal DFA + /// for each language and then compare them. If the DFAs are equivalent + /// (up to state renaming), then the languages are equivalent. + /// + /// Typically, minimization only makes sense as an offline process. That + /// is, one might minimize a DFA before serializing it to persistent + /// storage. In practical terms, minimization can take around an order of + /// magnitude more time than compiling the initial DFA via determinization. + /// + /// This option is disabled by default. + pub fn minimize(mut self, yes: bool) -> Config { + self.minimize = Some(yes); + self + } + + /// Set the desired match semantics. + /// + /// The default is [`MatchKind::LeftmostFirst`], which corresponds to the + /// match semantics of Perl-like regex engines. That is, when multiple + /// patterns would match at the same leftmost position, the pattern that + /// appears first in the concrete syntax is chosen. + /// + /// Currently, the only other kind of match semantics supported is + /// [`MatchKind::All`]. This corresponds to classical DFA construction + /// where all possible matches are added to the DFA. + /// + /// Typically, `All` is used when one wants to execute an overlapping + /// search and `LeftmostFirst` otherwise. In particular, it rarely makes + /// sense to use `All` with the various "leftmost" find routines, since the + /// leftmost routines depend on the `LeftmostFirst` automata construction + /// strategy. Specifically, `LeftmostFirst` adds dead states to the DFA + /// as a way to terminate the search and report a match. `LeftmostFirst` + /// also supports non-greedy matches using this strategy where as `All` + /// does not. + /// + /// # Example: overlapping search + /// + /// This example shows the typical use of `MatchKind::All`, which is to + /// report overlapping matches. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{ + /// dfa::{Automaton, OverlappingState, dense}, + /// HalfMatch, Input, MatchKind, + /// }; + /// + /// let dfa = dense::Builder::new() + /// .configure(dense::Config::new().match_kind(MatchKind::All)) + /// .build_many(&[r"\w+$", r"\S+$"])?; + /// let input = Input::new("@foo"); + /// let mut state = OverlappingState::start(); + /// + /// let expected = Some(HalfMatch::must(1, 4)); + /// dfa.try_search_overlapping_fwd(&input, &mut state)?; + /// assert_eq!(expected, state.get_match()); + /// + /// // The first pattern also matches at the same position, so re-running + /// // the search will yield another match. Notice also that the first + /// // pattern is returned after the second. This is because the second + /// // pattern begins its match before the first, is therefore an earlier + /// // match and is thus reported first. + /// let expected = Some(HalfMatch::must(0, 4)); + /// dfa.try_search_overlapping_fwd(&input, &mut state)?; + /// assert_eq!(expected, state.get_match()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// # Example: reverse automaton to find start of match + /// + /// Another example for using `MatchKind::All` is for constructing a + /// reverse automaton to find the start of a match. `All` semantics are + /// used for this in order to find the longest possible match, which + /// corresponds to the leftmost starting position. + /// + /// Note that if you need the starting position then + /// [`dfa::regex::Regex`](crate::dfa::regex::Regex) will handle this for + /// you, so it's usually not necessary to do this yourself. + /// + /// ``` + /// use regex_automata::{ + /// dfa::{dense, Automaton, StartKind}, + /// nfa::thompson::NFA, + /// Anchored, HalfMatch, Input, MatchKind, + /// }; + /// + /// let haystack = "123foobar456".as_bytes(); + /// let pattern = r"[a-z]+r"; + /// + /// let dfa_fwd = dense::DFA::new(pattern)?; + /// let dfa_rev = dense::Builder::new() + /// .thompson(NFA::config().reverse(true)) + /// .configure(dense::Config::new() + /// // This isn't strictly necessary since both anchored and + /// // unanchored searches are supported by default. But since + /// // finding the start-of-match only requires anchored searches, + /// // we can get rid of the unanchored configuration and possibly + /// // slim down our DFA considerably. + /// .start_kind(StartKind::Anchored) + /// .match_kind(MatchKind::All) + /// ) + /// .build(pattern)?; + /// let expected_fwd = HalfMatch::must(0, 9); + /// let expected_rev = HalfMatch::must(0, 3); + /// let got_fwd = dfa_fwd.try_search_fwd(&Input::new(haystack))?.unwrap(); + /// // Here we don't specify the pattern to search for since there's only + /// // one pattern and we're doing a leftmost search. But if this were an + /// // overlapping search, you'd need to specify the pattern that matched + /// // in the forward direction. (Otherwise, you might wind up finding the + /// // starting position of a match of some other pattern.) That in turn + /// // requires building the reverse automaton with starts_for_each_pattern + /// // enabled. Indeed, this is what Regex does internally. + /// let input = Input::new(haystack) + /// .range(..got_fwd.offset()) + /// .anchored(Anchored::Yes); + /// let got_rev = dfa_rev.try_search_rev(&input)?.unwrap(); + /// assert_eq!(expected_fwd, got_fwd); + /// assert_eq!(expected_rev, got_rev); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn match_kind(mut self, kind: MatchKind) -> Config { + self.match_kind = Some(kind); + self + } + + /// The type of starting state configuration to use for a DFA. + /// + /// By default, the starting state configuration is [`StartKind::Both`]. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{ + /// dfa::{dense::DFA, Automaton, StartKind}, + /// Anchored, HalfMatch, Input, + /// }; + /// + /// let haystack = "quux foo123"; + /// let expected = HalfMatch::must(0, 11); + /// + /// // By default, DFAs support both anchored and unanchored searches. + /// let dfa = DFA::new(r"[0-9]+")?; + /// let input = Input::new(haystack); + /// assert_eq!(Some(expected), dfa.try_search_fwd(&input)?); + /// + /// // But if we only need anchored searches, then we can build a DFA + /// // that only supports anchored searches. This leads to a smaller DFA + /// // (potentially significantly smaller in some cases), but a DFA that + /// // will panic if you try to use it with an unanchored search. + /// let dfa = DFA::builder() + /// .configure(DFA::config().start_kind(StartKind::Anchored)) + /// .build(r"[0-9]+")?; + /// let input = Input::new(haystack) + /// .range(8..) + /// .anchored(Anchored::Yes); + /// assert_eq!(Some(expected), dfa.try_search_fwd(&input)?); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn start_kind(mut self, kind: StartKind) -> Config { + self.start_kind = Some(kind); + self + } + + /// Whether to compile a separate start state for each pattern in the + /// automaton. + /// + /// When enabled, a separate **anchored** start state is added for each + /// pattern in the DFA. When this start state is used, then the DFA will + /// only search for matches for the pattern specified, even if there are + /// other patterns in the DFA. + /// + /// The main downside of this option is that it can potentially increase + /// the size of the DFA and/or increase the time it takes to build the DFA. + /// + /// There are a few reasons one might want to enable this (it's disabled + /// by default): + /// + /// 1. When looking for the start of an overlapping match (using a + /// reverse DFA), doing it correctly requires starting the reverse search + /// using the starting state of the pattern that matched in the forward + /// direction. Indeed, when building a [`Regex`](crate::dfa::regex::Regex), + /// it will automatically enable this option when building the reverse DFA + /// internally. + /// 2. When you want to use a DFA with multiple patterns to both search + /// for matches of any pattern or to search for anchored matches of one + /// particular pattern while using the same DFA. (Otherwise, you would need + /// to compile a new DFA for each pattern.) + /// 3. Since the start states added for each pattern are anchored, if you + /// compile an unanchored DFA with one pattern while also enabling this + /// option, then you can use the same DFA to perform anchored or unanchored + /// searches. The latter you get with the standard search APIs. The former + /// you get from the various `_at` search methods that allow you specify a + /// pattern ID to search for. + /// + /// By default this is disabled. + /// + /// # Example + /// + /// This example shows how to use this option to permit the same DFA to + /// run both anchored and unanchored searches for a single pattern. + /// + /// ``` + /// use regex_automata::{ + /// dfa::{dense, Automaton}, + /// Anchored, HalfMatch, PatternID, Input, + /// }; + /// + /// let dfa = dense::Builder::new() + /// .configure(dense::Config::new().starts_for_each_pattern(true)) + /// .build(r"foo[0-9]+")?; + /// let haystack = "quux foo123"; + /// + /// // Here's a normal unanchored search. Notice that we use 'None' for the + /// // pattern ID. Since the DFA was built as an unanchored machine, it + /// // use its default unanchored starting state. + /// let expected = HalfMatch::must(0, 11); + /// let input = Input::new(haystack); + /// assert_eq!(Some(expected), dfa.try_search_fwd(&input)?); + /// // But now if we explicitly specify the pattern to search ('0' being + /// // the only pattern in the DFA), then it will use the starting state + /// // for that specific pattern which is always anchored. Since the + /// // pattern doesn't have a match at the beginning of the haystack, we + /// // find nothing. + /// let input = Input::new(haystack) + /// .anchored(Anchored::Pattern(PatternID::must(0))); + /// assert_eq!(None, dfa.try_search_fwd(&input)?); + /// // And finally, an anchored search is not the same as putting a '^' at + /// // beginning of the pattern. An anchored search can only match at the + /// // beginning of the *search*, which we can change: + /// let input = Input::new(haystack) + /// .anchored(Anchored::Pattern(PatternID::must(0))) + /// .range(5..); + /// assert_eq!(Some(expected), dfa.try_search_fwd(&input)?); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn starts_for_each_pattern(mut self, yes: bool) -> Config { + self.starts_for_each_pattern = Some(yes); + self + } + + /// Whether to attempt to shrink the size of the DFA's alphabet or not. + /// + /// This option is enabled by default and should never be disabled unless + /// one is debugging a generated DFA. + /// + /// When enabled, the DFA will use a map from all possible bytes to their + /// corresponding equivalence class. Each equivalence class represents a + /// set of bytes that does not discriminate between a match and a non-match + /// in the DFA. For example, the pattern `[ab]+` has at least two + /// equivalence classes: a set containing `a` and `b` and a set containing + /// every byte except for `a` and `b`. `a` and `b` are in the same + /// equivalence class because they never discriminate between a match and a + /// non-match. + /// + /// The advantage of this map is that the size of the transition table + /// can be reduced drastically from `#states * 256 * sizeof(StateID)` to + /// `#states * k * sizeof(StateID)` where `k` is the number of equivalence + /// classes (rounded up to the nearest power of 2). As a result, total + /// space usage can decrease substantially. Moreover, since a smaller + /// alphabet is used, DFA compilation becomes faster as well. + /// + /// **WARNING:** This is only useful for debugging DFAs. Disabling this + /// does not yield any speed advantages. Namely, even when this is + /// disabled, a byte class map is still used while searching. The only + /// difference is that every byte will be forced into its own distinct + /// equivalence class. This is useful for debugging the actual generated + /// transitions because it lets one see the transitions defined on actual + /// bytes instead of the equivalence classes. + pub fn byte_classes(mut self, yes: bool) -> Config { + self.byte_classes = Some(yes); + self + } + + /// Heuristically enable Unicode word boundaries. + /// + /// When set, this will attempt to implement Unicode word boundaries as if + /// they were ASCII word boundaries. This only works when the search input + /// is ASCII only. If a non-ASCII byte is observed while searching, then a + /// [`MatchError::quit`](crate::MatchError::quit) error is returned. + /// + /// A possible alternative to enabling this option is to simply use an + /// ASCII word boundary, e.g., via `(?-u:\b)`. The main reason to use this + /// option is if you absolutely need Unicode support. This option lets one + /// use a fast search implementation (a DFA) for some potentially very + /// common cases, while providing the option to fall back to some other + /// regex engine to handle the general case when an error is returned. + /// + /// If the pattern provided has no Unicode word boundary in it, then this + /// option has no effect. (That is, quitting on a non-ASCII byte only + /// occurs when this option is enabled _and_ a Unicode word boundary is + /// present in the pattern.) + /// + /// This is almost equivalent to setting all non-ASCII bytes to be quit + /// bytes. The only difference is that this will cause non-ASCII bytes to + /// be quit bytes _only_ when a Unicode word boundary is present in the + /// pattern. + /// + /// When enabling this option, callers _must_ be prepared to handle + /// a [`MatchError`](crate::MatchError) error during search. + /// When using a [`Regex`](crate::dfa::regex::Regex), this corresponds + /// to using the `try_` suite of methods. Alternatively, if + /// callers can guarantee that their input is ASCII only, then a + /// [`MatchError::quit`](crate::MatchError::quit) error will never be + /// returned while searching. + /// + /// This is disabled by default. + /// + /// # Example + /// + /// This example shows how to heuristically enable Unicode word boundaries + /// in a pattern. It also shows what happens when a search comes across a + /// non-ASCII byte. + /// + /// ``` + /// use regex_automata::{ + /// dfa::{Automaton, dense}, + /// HalfMatch, Input, MatchError, + /// }; + /// + /// let dfa = dense::Builder::new() + /// .configure(dense::Config::new().unicode_word_boundary(true)) + /// .build(r"\b[0-9]+\b")?; + /// + /// // The match occurs before the search ever observes the snowman + /// // character, so no error occurs. + /// let haystack = "foo 123 ☃".as_bytes(); + /// let expected = Some(HalfMatch::must(0, 7)); + /// let got = dfa.try_search_fwd(&Input::new(haystack))?; + /// assert_eq!(expected, got); + /// + /// // Notice that this search fails, even though the snowman character + /// // occurs after the ending match offset. This is because search + /// // routines read one byte past the end of the search to account for + /// // look-around, and indeed, this is required here to determine whether + /// // the trailing \b matches. + /// let haystack = "foo 123 ☃".as_bytes(); + /// let expected = MatchError::quit(0xE2, 8); + /// let got = dfa.try_search_fwd(&Input::new(haystack)); + /// assert_eq!(Err(expected), got); + /// + /// // Another example is executing a search where the span of the haystack + /// // we specify is all ASCII, but there is non-ASCII just before it. This + /// // correctly also reports an error. + /// let input = Input::new("β123").range(2..); + /// let expected = MatchError::quit(0xB2, 1); + /// let got = dfa.try_search_fwd(&input); + /// assert_eq!(Err(expected), got); + /// + /// // And similarly for the trailing word boundary. + /// let input = Input::new("123β").range(..3); + /// let expected = MatchError::quit(0xCE, 3); + /// let got = dfa.try_search_fwd(&input); + /// assert_eq!(Err(expected), got); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn unicode_word_boundary(mut self, yes: bool) -> Config { + // We have a separate option for this instead of just setting the + // appropriate quit bytes here because we don't want to set quit bytes + // for every regex. We only want to set them when the regex contains a + // Unicode word boundary. + self.unicode_word_boundary = Some(yes); + self + } + + /// Add a "quit" byte to the DFA. + /// + /// When a quit byte is seen during search time, then search will return + /// a [`MatchError::quit`](crate::MatchError::quit) error indicating the + /// offset at which the search stopped. + /// + /// A quit byte will always overrule any other aspects of a regex. For + /// example, if the `x` byte is added as a quit byte and the regex `\w` is + /// used, then observing `x` will cause the search to quit immediately + /// despite the fact that `x` is in the `\w` class. + /// + /// This mechanism is primarily useful for heuristically enabling certain + /// features like Unicode word boundaries in a DFA. Namely, if the input + /// to search is ASCII, then a Unicode word boundary can be implemented + /// via an ASCII word boundary with no change in semantics. Thus, a DFA + /// can attempt to match a Unicode word boundary but give up as soon as it + /// observes a non-ASCII byte. Indeed, if callers set all non-ASCII bytes + /// to be quit bytes, then Unicode word boundaries will be permitted when + /// building DFAs. Of course, callers should enable + /// [`Config::unicode_word_boundary`] if they want this behavior instead. + /// (The advantage being that non-ASCII quit bytes will only be added if a + /// Unicode word boundary is in the pattern.) + /// + /// When enabling this option, callers _must_ be prepared to handle a + /// [`MatchError`](crate::MatchError) error during search. When using a + /// [`Regex`](crate::dfa::regex::Regex), this corresponds to using the + /// `try_` suite of methods. + /// + /// By default, there are no quit bytes set. + /// + /// # Panics + /// + /// This panics if heuristic Unicode word boundaries are enabled and any + /// non-ASCII byte is removed from the set of quit bytes. Namely, enabling + /// Unicode word boundaries requires setting every non-ASCII byte to a quit + /// byte. So if the caller attempts to undo any of that, then this will + /// panic. + /// + /// # Example + /// + /// This example shows how to cause a search to terminate if it sees a + /// `\n` byte. This could be useful if, for example, you wanted to prevent + /// a user supplied pattern from matching across a line boundary. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{dfa::{Automaton, dense}, Input, MatchError}; + /// + /// let dfa = dense::Builder::new() + /// .configure(dense::Config::new().quit(b'\n', true)) + /// .build(r"foo\p{any}+bar")?; + /// + /// let haystack = "foo\nbar".as_bytes(); + /// // Normally this would produce a match, since \p{any} contains '\n'. + /// // But since we instructed the automaton to enter a quit state if a + /// // '\n' is observed, this produces a match error instead. + /// let expected = MatchError::quit(b'\n', 3); + /// let got = dfa.try_search_fwd(&Input::new(haystack)).unwrap_err(); + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn quit(mut self, byte: u8, yes: bool) -> Config { + if self.get_unicode_word_boundary() && !byte.is_ascii() && !yes { + panic!( + "cannot set non-ASCII byte to be non-quit when \ + Unicode word boundaries are enabled" + ); + } + if self.quitset.is_none() { + self.quitset = Some(ByteSet::empty()); + } + if yes { + self.quitset.as_mut().unwrap().add(byte); + } else { + self.quitset.as_mut().unwrap().remove(byte); + } + self + } + + /// Enable specializing start states in the DFA. + /// + /// When start states are specialized, an implementor of a search routine + /// using a lazy DFA can tell when the search has entered a starting state. + /// When start states aren't specialized, then it is impossible to know + /// whether the search has entered a start state. + /// + /// Ideally, this option wouldn't need to exist and we could always + /// specialize start states. The problem is that start states can be quite + /// active. This in turn means that an efficient search routine is likely + /// to ping-pong between a heavily optimized hot loop that handles most + /// states and to a less optimized specialized handling of start states. + /// This causes branches to get heavily mispredicted and overall can + /// materially decrease throughput. Therefore, specializing start states + /// should only be enabled when it is needed. + /// + /// Knowing whether a search is in a start state is typically useful when a + /// prefilter is active for the search. A prefilter is typically only run + /// when in a start state and a prefilter can greatly accelerate a search. + /// Therefore, the possible cost of specializing start states is worth it + /// in this case. Otherwise, if you have no prefilter, there is likely no + /// reason to specialize start states. + /// + /// This is disabled by default, but note that it is automatically + /// enabled (or disabled) if [`Config::prefilter`] is set. Namely, unless + /// `specialize_start_states` has already been set, [`Config::prefilter`] + /// will automatically enable or disable it based on whether a prefilter + /// is present or not, respectively. This is done because a prefilter's + /// effectiveness is rooted in being executed whenever the DFA is in a + /// start state, and that's only possible to do when they are specialized. + /// + /// Note that it is plausibly reasonable to _disable_ this option + /// explicitly while _enabling_ a prefilter. In that case, a prefilter + /// will still be run at the beginning of a search, but never again. This + /// in theory could strike a good balance if you're in a situation where a + /// prefilter is likely to produce many false positive candidates. + /// + /// # Example + /// + /// This example shows how to enable start state specialization and then + /// shows how to check whether a state is a start state or not. + /// + /// ``` + /// use regex_automata::{dfa::{Automaton, dense::DFA}, Input}; + /// + /// let dfa = DFA::builder() + /// .configure(DFA::config().specialize_start_states(true)) + /// .build(r"[a-z]+")?; + /// + /// let haystack = "123 foobar 4567".as_bytes(); + /// let sid = dfa.start_state_forward(&Input::new(haystack))?; + /// // The ID returned by 'start_state_forward' will always be tagged as + /// // a start state when start state specialization is enabled. + /// assert!(dfa.is_special_state(sid)); + /// assert!(dfa.is_start_state(sid)); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// Compare the above with the default DFA configuration where start states + /// are _not_ specialized. In this case, the start state is not tagged at + /// all: + /// + /// ``` + /// use regex_automata::{dfa::{Automaton, dense::DFA}, Input}; + /// + /// let dfa = DFA::new(r"[a-z]+")?; + /// + /// let haystack = "123 foobar 4567"; + /// let sid = dfa.start_state_forward(&Input::new(haystack))?; + /// // Start states are not special in the default configuration! + /// assert!(!dfa.is_special_state(sid)); + /// assert!(!dfa.is_start_state(sid)); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn specialize_start_states(mut self, yes: bool) -> Config { + self.specialize_start_states = Some(yes); + self + } + + /// Set a size limit on the total heap used by a DFA. + /// + /// This size limit is expressed in bytes and is applied during + /// determinization of an NFA into a DFA. If the DFA's heap usage, and only + /// the DFA, exceeds this configured limit, then determinization is stopped + /// and an error is returned. + /// + /// This limit does not apply to auxiliary storage used during + /// determinization that isn't part of the generated DFA. + /// + /// This limit is only applied during determinization. Currently, there is + /// no way to post-pone this check to after minimization if minimization + /// was enabled. + /// + /// The total limit on heap used during determinization is the sum of the + /// DFA and determinization size limits. + /// + /// The default is no limit. + /// + /// # Example + /// + /// This example shows a DFA that fails to build because of a configured + /// size limit. This particular example also serves as a cautionary tale + /// demonstrating just how big DFAs with large Unicode character classes + /// can get. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{dfa::{dense, Automaton}, Input}; + /// + /// // 6MB isn't enough! + /// dense::Builder::new() + /// .configure(dense::Config::new().dfa_size_limit(Some(6_000_000))) + /// .build(r"\w{20}") + /// .unwrap_err(); + /// + /// // ... but 7MB probably is! + /// // (Note that DFA sizes aren't necessarily stable between releases.) + /// let dfa = dense::Builder::new() + /// .configure(dense::Config::new().dfa_size_limit(Some(7_000_000))) + /// .build(r"\w{20}")?; + /// let haystack = "A".repeat(20).into_bytes(); + /// assert!(dfa.try_search_fwd(&Input::new(&haystack))?.is_some()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// While one needs a little more than 6MB to represent `\w{20}`, it + /// turns out that you only need a little more than 6KB to represent + /// `(?-u:\w{20})`. So only use Unicode if you need it! + /// + /// As with [`Config::determinize_size_limit`], the size of a DFA is + /// influenced by other factors, such as what start state configurations + /// to support. For example, if you only need unanchored searches and not + /// anchored searches, then configuring the DFA to only support unanchored + /// searches can reduce its size. By default, DFAs support both unanchored + /// and anchored searches. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{dfa::{dense, Automaton, StartKind}, Input}; + /// + /// // 3MB isn't enough! + /// dense::Builder::new() + /// .configure(dense::Config::new() + /// .dfa_size_limit(Some(3_000_000)) + /// .start_kind(StartKind::Unanchored) + /// ) + /// .build(r"\w{20}") + /// .unwrap_err(); + /// + /// // ... but 4MB probably is! + /// // (Note that DFA sizes aren't necessarily stable between releases.) + /// let dfa = dense::Builder::new() + /// .configure(dense::Config::new() + /// .dfa_size_limit(Some(4_000_000)) + /// .start_kind(StartKind::Unanchored) + /// ) + /// .build(r"\w{20}")?; + /// let haystack = "A".repeat(20).into_bytes(); + /// assert!(dfa.try_search_fwd(&Input::new(&haystack))?.is_some()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn dfa_size_limit(mut self, bytes: Option<usize>) -> Config { + self.dfa_size_limit = Some(bytes); + self + } + + /// Set a size limit on the total heap used by determinization. + /// + /// This size limit is expressed in bytes and is applied during + /// determinization of an NFA into a DFA. If the heap used for auxiliary + /// storage during determinization (memory that is not in the DFA but + /// necessary for building the DFA) exceeds this configured limit, then + /// determinization is stopped and an error is returned. + /// + /// This limit does not apply to heap used by the DFA itself. + /// + /// The total limit on heap used during determinization is the sum of the + /// DFA and determinization size limits. + /// + /// The default is no limit. + /// + /// # Example + /// + /// This example shows a DFA that fails to build because of a + /// configured size limit on the amount of heap space used by + /// determinization. This particular example complements the example for + /// [`Config::dfa_size_limit`] by demonstrating that not only does Unicode + /// potentially make DFAs themselves big, but it also results in more + /// auxiliary storage during determinization. (Although, auxiliary storage + /// is still not as much as the DFA itself.) + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// # if !cfg!(target_pointer_width = "64") { return Ok(()); } // see #1039 + /// use regex_automata::{dfa::{dense, Automaton}, Input}; + /// + /// // 600KB isn't enough! + /// dense::Builder::new() + /// .configure(dense::Config::new() + /// .determinize_size_limit(Some(600_000)) + /// ) + /// .build(r"\w{20}") + /// .unwrap_err(); + /// + /// // ... but 700KB probably is! + /// // (Note that auxiliary storage sizes aren't necessarily stable between + /// // releases.) + /// let dfa = dense::Builder::new() + /// .configure(dense::Config::new() + /// .determinize_size_limit(Some(700_000)) + /// ) + /// .build(r"\w{20}")?; + /// let haystack = "A".repeat(20).into_bytes(); + /// assert!(dfa.try_search_fwd(&Input::new(&haystack))?.is_some()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// Note that some parts of the configuration on a DFA can have a + /// big impact on how big the DFA is, and thus, how much memory is + /// used. For example, the default setting for [`Config::start_kind`] is + /// [`StartKind::Both`]. But if you only need an anchored search, for + /// example, then it can be much cheaper to build a DFA that only supports + /// anchored searches. (Running an unanchored search with it would panic.) + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// # if !cfg!(target_pointer_width = "64") { return Ok(()); } // see #1039 + /// use regex_automata::{ + /// dfa::{dense, Automaton, StartKind}, + /// Anchored, Input, + /// }; + /// + /// // 200KB isn't enough! + /// dense::Builder::new() + /// .configure(dense::Config::new() + /// .determinize_size_limit(Some(200_000)) + /// .start_kind(StartKind::Anchored) + /// ) + /// .build(r"\w{20}") + /// .unwrap_err(); + /// + /// // ... but 300KB probably is! + /// // (Note that auxiliary storage sizes aren't necessarily stable between + /// // releases.) + /// let dfa = dense::Builder::new() + /// .configure(dense::Config::new() + /// .determinize_size_limit(Some(300_000)) + /// .start_kind(StartKind::Anchored) + /// ) + /// .build(r"\w{20}")?; + /// let haystack = "A".repeat(20).into_bytes(); + /// let input = Input::new(&haystack).anchored(Anchored::Yes); + /// assert!(dfa.try_search_fwd(&input)?.is_some()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn determinize_size_limit(mut self, bytes: Option<usize>) -> Config { + self.determinize_size_limit = Some(bytes); + self + } + + /// Returns whether this configuration has enabled simple state + /// acceleration. + pub fn get_accelerate(&self) -> bool { + self.accelerate.unwrap_or(true) + } + + /// Returns the prefilter attached to this configuration, if any. + pub fn get_prefilter(&self) -> Option<&Prefilter> { + self.pre.as_ref().unwrap_or(&None).as_ref() + } + + /// Returns whether this configuration has enabled the expensive process + /// of minimizing a DFA. + pub fn get_minimize(&self) -> bool { + self.minimize.unwrap_or(false) + } + + /// Returns the match semantics set in this configuration. + pub fn get_match_kind(&self) -> MatchKind { + self.match_kind.unwrap_or(MatchKind::LeftmostFirst) + } + + /// Returns the starting state configuration for a DFA. + pub fn get_starts(&self) -> StartKind { + self.start_kind.unwrap_or(StartKind::Both) + } + + /// Returns whether this configuration has enabled anchored starting states + /// for every pattern in the DFA. + pub fn get_starts_for_each_pattern(&self) -> bool { + self.starts_for_each_pattern.unwrap_or(false) + } + + /// Returns whether this configuration has enabled byte classes or not. + /// This is typically a debugging oriented option, as disabling it confers + /// no speed benefit. + pub fn get_byte_classes(&self) -> bool { + self.byte_classes.unwrap_or(true) + } + + /// Returns whether this configuration has enabled heuristic Unicode word + /// boundary support. When enabled, it is possible for a search to return + /// an error. + pub fn get_unicode_word_boundary(&self) -> bool { + self.unicode_word_boundary.unwrap_or(false) + } + + /// Returns whether this configuration will instruct the DFA to enter a + /// quit state whenever the given byte is seen during a search. When at + /// least one byte has this enabled, it is possible for a search to return + /// an error. + pub fn get_quit(&self, byte: u8) -> bool { + self.quitset.map_or(false, |q| q.contains(byte)) + } + + /// Returns whether this configuration will instruct the DFA to + /// "specialize" start states. When enabled, the DFA will mark start states + /// as "special" so that search routines using the DFA can detect when + /// it's in a start state and do some kind of optimization (like run a + /// prefilter). + pub fn get_specialize_start_states(&self) -> bool { + self.specialize_start_states.unwrap_or(false) + } + + /// Returns the DFA size limit of this configuration if one was set. + /// The size limit is total number of bytes on the heap that a DFA is + /// permitted to use. If the DFA exceeds this limit during construction, + /// then construction is stopped and an error is returned. + pub fn get_dfa_size_limit(&self) -> Option<usize> { + self.dfa_size_limit.unwrap_or(None) + } + + /// Returns the determinization size limit of this configuration if one + /// was set. The size limit is total number of bytes on the heap that + /// determinization is permitted to use. If determinization exceeds this + /// limit during construction, then construction is stopped and an error is + /// returned. + /// + /// This is different from the DFA size limit in that this only applies to + /// the auxiliary storage used during determinization. Once determinization + /// is complete, this memory is freed. + /// + /// The limit on the total heap memory used is the sum of the DFA and + /// determinization size limits. + pub fn get_determinize_size_limit(&self) -> Option<usize> { + self.determinize_size_limit.unwrap_or(None) + } + + /// Overwrite the default configuration such that the options in `o` are + /// always used. If an option in `o` is not set, then the corresponding + /// option in `self` is used. If it's not set in `self` either, then it + /// remains not set. + pub(crate) fn overwrite(&self, o: Config) -> Config { + Config { + accelerate: o.accelerate.or(self.accelerate), + pre: o.pre.or_else(|| self.pre.clone()), + minimize: o.minimize.or(self.minimize), + match_kind: o.match_kind.or(self.match_kind), + start_kind: o.start_kind.or(self.start_kind), + starts_for_each_pattern: o + .starts_for_each_pattern + .or(self.starts_for_each_pattern), + byte_classes: o.byte_classes.or(self.byte_classes), + unicode_word_boundary: o + .unicode_word_boundary + .or(self.unicode_word_boundary), + quitset: o.quitset.or(self.quitset), + specialize_start_states: o + .specialize_start_states + .or(self.specialize_start_states), + dfa_size_limit: o.dfa_size_limit.or(self.dfa_size_limit), + determinize_size_limit: o + .determinize_size_limit + .or(self.determinize_size_limit), + } + } +} + +/// A builder for constructing a deterministic finite automaton from regular +/// expressions. +/// +/// This builder provides two main things: +/// +/// 1. It provides a few different `build` routines for actually constructing +/// a DFA from different kinds of inputs. The most convenient is +/// [`Builder::build`], which builds a DFA directly from a pattern string. The +/// most flexible is [`Builder::build_from_nfa`], which builds a DFA straight +/// from an NFA. +/// 2. The builder permits configuring a number of things. +/// [`Builder::configure`] is used with [`Config`] to configure aspects of +/// the DFA and the construction process itself. [`Builder::syntax`] and +/// [`Builder::thompson`] permit configuring the regex parser and Thompson NFA +/// construction, respectively. The syntax and thompson configurations only +/// apply when building from a pattern string. +/// +/// This builder always constructs a *single* DFA. As such, this builder +/// can only be used to construct regexes that either detect the presence +/// of a match or find the end location of a match. A single DFA cannot +/// produce both the start and end of a match. For that information, use a +/// [`Regex`](crate::dfa::regex::Regex), which can be similarly configured +/// using [`regex::Builder`](crate::dfa::regex::Builder). The main reason to +/// use a DFA directly is if the end location of a match is enough for your use +/// case. Namely, a `Regex` will construct two DFAs instead of one, since a +/// second reverse DFA is needed to find the start of a match. +/// +/// Note that if one wants to build a sparse DFA, you must first build a dense +/// DFA and convert that to a sparse DFA. There is no way to build a sparse +/// DFA without first building a dense DFA. +/// +/// # Example +/// +/// This example shows how to build a minimized DFA that completely disables +/// Unicode. That is: +/// +/// * Things such as `\w`, `.` and `\b` are no longer Unicode-aware. `\w` +/// and `\b` are ASCII-only while `.` matches any byte except for `\n` +/// (instead of any UTF-8 encoding of a Unicode scalar value except for +/// `\n`). Things that are Unicode only, such as `\pL`, are not allowed. +/// * The pattern itself is permitted to match invalid UTF-8. For example, +/// things like `[^a]` that match any byte except for `a` are permitted. +/// +/// ``` +/// use regex_automata::{ +/// dfa::{Automaton, dense}, +/// util::syntax, +/// HalfMatch, Input, +/// }; +/// +/// let dfa = dense::Builder::new() +/// .configure(dense::Config::new().minimize(false)) +/// .syntax(syntax::Config::new().unicode(false).utf8(false)) +/// .build(r"foo[^b]ar.*")?; +/// +/// let haystack = b"\xFEfoo\xFFar\xE2\x98\xFF\n"; +/// let expected = Some(HalfMatch::must(0, 10)); +/// let got = dfa.try_search_fwd(&Input::new(haystack))?; +/// assert_eq!(expected, got); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +#[cfg(feature = "dfa-build")] +#[derive(Clone, Debug)] +pub struct Builder { + config: Config, + #[cfg(feature = "syntax")] + thompson: thompson::Compiler, +} + +#[cfg(feature = "dfa-build")] +impl Builder { + /// Create a new dense DFA builder with the default configuration. + pub fn new() -> Builder { + Builder { + config: Config::default(), + #[cfg(feature = "syntax")] + thompson: thompson::Compiler::new(), + } + } + + /// Build a DFA from the given pattern. + /// + /// If there was a problem parsing or compiling the pattern, then an error + /// is returned. + #[cfg(feature = "syntax")] + pub fn build(&self, pattern: &str) -> Result<OwnedDFA, BuildError> { + self.build_many(&[pattern]) + } + + /// Build a DFA from the given patterns. + /// + /// When matches are returned, the pattern ID corresponds to the index of + /// the pattern in the slice given. + #[cfg(feature = "syntax")] + pub fn build_many<P: AsRef<str>>( + &self, + patterns: &[P], + ) -> Result<OwnedDFA, BuildError> { + let nfa = self + .thompson + .clone() + // We can always forcefully disable captures because DFAs do not + // support them. + .configure( + thompson::Config::new() + .which_captures(thompson::WhichCaptures::None), + ) + .build_many(patterns) + .map_err(BuildError::nfa)?; + self.build_from_nfa(&nfa) + } + + /// Build a DFA from the given NFA. + /// + /// # Example + /// + /// This example shows how to build a DFA if you already have an NFA in + /// hand. + /// + /// ``` + /// use regex_automata::{ + /// dfa::{Automaton, dense}, + /// nfa::thompson::NFA, + /// HalfMatch, Input, + /// }; + /// + /// let haystack = "foo123bar".as_bytes(); + /// + /// // This shows how to set non-default options for building an NFA. + /// let nfa = NFA::compiler() + /// .configure(NFA::config().shrink(true)) + /// .build(r"[0-9]+")?; + /// let dfa = dense::Builder::new().build_from_nfa(&nfa)?; + /// let expected = Some(HalfMatch::must(0, 6)); + /// let got = dfa.try_search_fwd(&Input::new(haystack))?; + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn build_from_nfa( + &self, + nfa: &thompson::NFA, + ) -> Result<OwnedDFA, BuildError> { + let mut quitset = self.config.quitset.unwrap_or(ByteSet::empty()); + if self.config.get_unicode_word_boundary() + && nfa.look_set_any().contains_word_unicode() + { + for b in 0x80..=0xFF { + quitset.add(b); + } + } + let classes = if !self.config.get_byte_classes() { + // DFAs will always use the equivalence class map, but enabling + // this option is useful for debugging. Namely, this will cause all + // transitions to be defined over their actual bytes instead of an + // opaque equivalence class identifier. The former is much easier + // to grok as a human. + ByteClasses::singletons() + } else { + let mut set = nfa.byte_class_set().clone(); + // It is important to distinguish any "quit" bytes from all other + // bytes. Otherwise, a non-quit byte may end up in the same class + // as a quit byte, and thus cause the DFA stop when it shouldn't. + // + // Test case: + // + // regex-cli find hybrid regex -w @conn.json.1000x.log \ + // '^#' '\b10\.55\.182\.100\b' + if !quitset.is_empty() { + set.add_set(&quitset); + } + set.byte_classes() + }; + + let mut dfa = DFA::initial( + classes, + nfa.pattern_len(), + self.config.get_starts(), + nfa.look_matcher(), + self.config.get_starts_for_each_pattern(), + self.config.get_prefilter().map(|p| p.clone()), + quitset, + Flags::from_nfa(&nfa), + )?; + determinize::Config::new() + .match_kind(self.config.get_match_kind()) + .quit(quitset) + .dfa_size_limit(self.config.get_dfa_size_limit()) + .determinize_size_limit(self.config.get_determinize_size_limit()) + .run(nfa, &mut dfa)?; + if self.config.get_minimize() { + dfa.minimize(); + } + if self.config.get_accelerate() { + dfa.accelerate(); + } + // The state shuffling done before this point always assumes that start + // states should be marked as "special," even though it isn't the + // default configuration. State shuffling is complex enough as it is, + // so it's simpler to just "fix" our special state ID ranges to not + // include starting states after-the-fact. + if !self.config.get_specialize_start_states() { + dfa.special.set_no_special_start_states(); + } + // Look for and set the universal starting states. + dfa.set_universal_starts(); + Ok(dfa) + } + + /// Apply the given dense DFA configuration options to this builder. + pub fn configure(&mut self, config: Config) -> &mut Builder { + self.config = self.config.overwrite(config); + self + } + + /// Set the syntax configuration for this builder using + /// [`syntax::Config`](crate::util::syntax::Config). + /// + /// This permits setting things like case insensitivity, Unicode and multi + /// line mode. + /// + /// These settings only apply when constructing a DFA directly from a + /// pattern. + #[cfg(feature = "syntax")] + pub fn syntax( + &mut self, + config: crate::util::syntax::Config, + ) -> &mut Builder { + self.thompson.syntax(config); + self + } + + /// Set the Thompson NFA configuration for this builder using + /// [`nfa::thompson::Config`](crate::nfa::thompson::Config). + /// + /// This permits setting things like whether the DFA should match the regex + /// in reverse or if additional time should be spent shrinking the size of + /// the NFA. + /// + /// These settings only apply when constructing a DFA directly from a + /// pattern. + #[cfg(feature = "syntax")] + pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder { + self.thompson.configure(config); + self + } +} + +#[cfg(feature = "dfa-build")] +impl Default for Builder { + fn default() -> Builder { + Builder::new() + } +} + +/// A convenience alias for an owned DFA. We use this particular instantiation +/// a lot in this crate, so it's worth giving it a name. This instantiation +/// is commonly used for mutable APIs on the DFA while building it. The main +/// reason for making DFAs generic is no_std support, and more generally, +/// making it possible to load a DFA from an arbitrary slice of bytes. +#[cfg(feature = "alloc")] +pub(crate) type OwnedDFA = DFA<alloc::vec::Vec<u32>>; + +/// A dense table-based deterministic finite automaton (DFA). +/// +/// All dense DFAs have one or more start states, zero or more match states +/// and a transition table that maps the current state and the current byte +/// of input to the next state. A DFA can use this information to implement +/// fast searching. In particular, the use of a dense DFA generally makes the +/// trade off that match speed is the most valuable characteristic, even if +/// building the DFA may take significant time *and* space. (More concretely, +/// building a DFA takes time and space that is exponential in the size of the +/// pattern in the worst case.) As such, the processing of every byte of input +/// is done with a small constant number of operations that does not vary with +/// the pattern, its size or the size of the alphabet. If your needs don't line +/// up with this trade off, then a dense DFA may not be an adequate solution to +/// your problem. +/// +/// In contrast, a [`sparse::DFA`] makes the opposite +/// trade off: it uses less space but will execute a variable number of +/// instructions per byte at match time, which makes it slower for matching. +/// (Note that space usage is still exponential in the size of the pattern in +/// the worst case.) +/// +/// A DFA can be built using the default configuration via the +/// [`DFA::new`] constructor. Otherwise, one can +/// configure various aspects via [`dense::Builder`](Builder). +/// +/// A single DFA fundamentally supports the following operations: +/// +/// 1. Detection of a match. +/// 2. Location of the end of a match. +/// 3. In the case of a DFA with multiple patterns, which pattern matched is +/// reported as well. +/// +/// A notable absence from the above list of capabilities is the location of +/// the *start* of a match. In order to provide both the start and end of +/// a match, *two* DFAs are required. This functionality is provided by a +/// [`Regex`](crate::dfa::regex::Regex). +/// +/// # Type parameters +/// +/// A `DFA` has one type parameter, `T`, which is used to represent state IDs, +/// pattern IDs and accelerators. `T` is typically a `Vec<u32>` or a `&[u32]`. +/// +/// # The `Automaton` trait +/// +/// This type implements the [`Automaton`] trait, which means it can be used +/// for searching. For example: +/// +/// ``` +/// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch, Input}; +/// +/// let dfa = DFA::new("foo[0-9]+")?; +/// let expected = HalfMatch::must(0, 8); +/// assert_eq!(Some(expected), dfa.try_search_fwd(&Input::new("foo12345"))?); +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +#[derive(Clone)] +pub struct DFA<T> { + /// The transition table for this DFA. This includes the transitions + /// themselves, along with the stride, number of states and the equivalence + /// class mapping. + tt: TransitionTable<T>, + /// The set of starting state identifiers for this DFA. The starting state + /// IDs act as pointers into the transition table. The specific starting + /// state chosen for each search is dependent on the context at which the + /// search begins. + st: StartTable<T>, + /// The set of match states and the patterns that match for each + /// corresponding match state. + /// + /// This structure is technically only needed because of support for + /// multi-regexes. Namely, multi-regexes require answering not just whether + /// a match exists, but _which_ patterns match. So we need to store the + /// matching pattern IDs for each match state. We do this even when there + /// is only one pattern for the sake of simplicity. In practice, this uses + /// up very little space for the case of one pattern. + ms: MatchStates<T>, + /// Information about which states are "special." Special states are states + /// that are dead, quit, matching, starting or accelerated. For more info, + /// see the docs for `Special`. + special: Special, + /// The accelerators for this DFA. + /// + /// If a state is accelerated, then there exist only a small number of + /// bytes that can cause the DFA to leave the state. This permits searching + /// to use optimized routines to find those specific bytes instead of using + /// the transition table. + /// + /// All accelerated states exist in a contiguous range in the DFA's + /// transition table. See dfa/special.rs for more details on how states are + /// arranged. + accels: Accels<T>, + /// Any prefilter attached to this DFA. + /// + /// Note that currently prefilters are not serialized. When deserializing + /// a DFA from bytes, this is always set to `None`. + pre: Option<Prefilter>, + /// The set of "quit" bytes for this DFA. + /// + /// This is only used when computing the start state for a particular + /// position in a haystack. Namely, in the case where there is a quit + /// byte immediately before the start of the search, this set needs to be + /// explicitly consulted. In all other cases, quit bytes are detected by + /// the DFA itself, by transitioning all quit bytes to a special "quit + /// state." + quitset: ByteSet, + /// Various flags describing the behavior of this DFA. + flags: Flags, +} + +#[cfg(feature = "dfa-build")] +impl OwnedDFA { + /// Parse the given regular expression using a default configuration and + /// return the corresponding DFA. + /// + /// If you want a non-default configuration, then use the + /// [`dense::Builder`](Builder) to set your own configuration. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch, Input}; + /// + /// let dfa = dense::DFA::new("foo[0-9]+bar")?; + /// let expected = Some(HalfMatch::must(0, 11)); + /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345bar"))?); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[cfg(feature = "syntax")] + pub fn new(pattern: &str) -> Result<OwnedDFA, BuildError> { + Builder::new().build(pattern) + } + + /// Parse the given regular expressions using a default configuration and + /// return the corresponding multi-DFA. + /// + /// If you want a non-default configuration, then use the + /// [`dense::Builder`](Builder) to set your own configuration. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch, Input}; + /// + /// let dfa = dense::DFA::new_many(&["[0-9]+", "[a-z]+"])?; + /// let expected = Some(HalfMatch::must(1, 3)); + /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345bar"))?); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[cfg(feature = "syntax")] + pub fn new_many<P: AsRef<str>>( + patterns: &[P], + ) -> Result<OwnedDFA, BuildError> { + Builder::new().build_many(patterns) + } +} + +#[cfg(feature = "dfa-build")] +impl OwnedDFA { + /// Create a new DFA that matches every input. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch, Input}; + /// + /// let dfa = dense::DFA::always_match()?; + /// + /// let expected = Some(HalfMatch::must(0, 0)); + /// assert_eq!(expected, dfa.try_search_fwd(&Input::new(""))?); + /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo"))?); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn always_match() -> Result<OwnedDFA, BuildError> { + let nfa = thompson::NFA::always_match(); + Builder::new().build_from_nfa(&nfa) + } + + /// Create a new DFA that never matches any input. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{dfa::{Automaton, dense}, Input}; + /// + /// let dfa = dense::DFA::never_match()?; + /// assert_eq!(None, dfa.try_search_fwd(&Input::new(""))?); + /// assert_eq!(None, dfa.try_search_fwd(&Input::new("foo"))?); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn never_match() -> Result<OwnedDFA, BuildError> { + let nfa = thompson::NFA::never_match(); + Builder::new().build_from_nfa(&nfa) + } + + /// Create an initial DFA with the given equivalence classes, pattern + /// length and whether anchored starting states are enabled for each + /// pattern. An initial DFA can be further mutated via determinization. + fn initial( + classes: ByteClasses, + pattern_len: usize, + starts: StartKind, + lookm: &LookMatcher, + starts_for_each_pattern: bool, + pre: Option<Prefilter>, + quitset: ByteSet, + flags: Flags, + ) -> Result<OwnedDFA, BuildError> { + let start_pattern_len = + if starts_for_each_pattern { Some(pattern_len) } else { None }; + Ok(DFA { + tt: TransitionTable::minimal(classes), + st: StartTable::dead(starts, lookm, start_pattern_len)?, + ms: MatchStates::empty(pattern_len), + special: Special::new(), + accels: Accels::empty(), + pre, + quitset, + flags, + }) + } +} + +#[cfg(feature = "dfa-build")] +impl DFA<&[u32]> { + /// Return a new default dense DFA compiler configuration. + /// + /// This is a convenience routine to avoid needing to import the [`Config`] + /// type when customizing the construction of a dense DFA. + pub fn config() -> Config { + Config::new() + } + + /// Create a new dense DFA builder with the default configuration. + /// + /// This is a convenience routine to avoid needing to import the + /// [`Builder`] type in common cases. + pub fn builder() -> Builder { + Builder::new() + } +} + +impl<T: AsRef<[u32]>> DFA<T> { + /// Cheaply return a borrowed version of this dense DFA. Specifically, + /// the DFA returned always uses `&[u32]` for its transition table. + pub fn as_ref(&self) -> DFA<&'_ [u32]> { + DFA { + tt: self.tt.as_ref(), + st: self.st.as_ref(), + ms: self.ms.as_ref(), + special: self.special, + accels: self.accels(), + pre: self.pre.clone(), + quitset: self.quitset, + flags: self.flags, + } + } + + /// Return an owned version of this sparse DFA. Specifically, the DFA + /// returned always uses `Vec<u32>` for its transition table. + /// + /// Effectively, this returns a dense DFA whose transition table lives on + /// the heap. + #[cfg(feature = "alloc")] + pub fn to_owned(&self) -> OwnedDFA { + DFA { + tt: self.tt.to_owned(), + st: self.st.to_owned(), + ms: self.ms.to_owned(), + special: self.special, + accels: self.accels().to_owned(), + pre: self.pre.clone(), + quitset: self.quitset, + flags: self.flags, + } + } + + /// Returns the starting state configuration for this DFA. + /// + /// The default is [`StartKind::Both`], which means the DFA supports both + /// unanchored and anchored searches. However, this can generally lead to + /// bigger DFAs. Therefore, a DFA might be compiled with support for just + /// unanchored or anchored searches. In that case, running a search with + /// an unsupported configuration will panic. + pub fn start_kind(&self) -> StartKind { + self.st.kind + } + + /// Returns the start byte map used for computing the `Start` configuration + /// at the beginning of a search. + pub(crate) fn start_map(&self) -> &StartByteMap { + &self.st.start_map + } + + /// Returns true only if this DFA has starting states for each pattern. + /// + /// When a DFA has starting states for each pattern, then a search with the + /// DFA can be configured to only look for anchored matches of a specific + /// pattern. Specifically, APIs like [`Automaton::try_search_fwd`] can + /// accept a non-None `pattern_id` if and only if this method returns true. + /// Otherwise, calling `try_search_fwd` will panic. + /// + /// Note that if the DFA has no patterns, this always returns false. + pub fn starts_for_each_pattern(&self) -> bool { + self.st.pattern_len.is_some() + } + + /// Returns the equivalence classes that make up the alphabet for this DFA. + /// + /// Unless [`Config::byte_classes`] was disabled, it is possible that + /// multiple distinct bytes are grouped into the same equivalence class + /// if it is impossible for them to discriminate between a match and a + /// non-match. This has the effect of reducing the overall alphabet size + /// and in turn potentially substantially reducing the size of the DFA's + /// transition table. + /// + /// The downside of using equivalence classes like this is that every state + /// transition will automatically use this map to convert an arbitrary + /// byte to its corresponding equivalence class. In practice this has a + /// negligible impact on performance. + pub fn byte_classes(&self) -> &ByteClasses { + &self.tt.classes + } + + /// Returns the total number of elements in the alphabet for this DFA. + /// + /// That is, this returns the total number of transitions that each state + /// in this DFA must have. Typically, a normal byte oriented DFA would + /// always have an alphabet size of 256, corresponding to the number of + /// unique values in a single byte. However, this implementation has two + /// peculiarities that impact the alphabet length: + /// + /// * Every state has a special "EOI" transition that is only followed + /// after the end of some haystack is reached. This EOI transition is + /// necessary to account for one byte of look-ahead when implementing + /// things like `\b` and `$`. + /// * Bytes are grouped into equivalence classes such that no two bytes in + /// the same class can distinguish a match from a non-match. For example, + /// in the regex `^[a-z]+$`, the ASCII bytes `a-z` could all be in the + /// same equivalence class. This leads to a massive space savings. + /// + /// Note though that the alphabet length does _not_ necessarily equal the + /// total stride space taken up by a single DFA state in the transition + /// table. Namely, for performance reasons, the stride is always the + /// smallest power of two that is greater than or equal to the alphabet + /// length. For this reason, [`DFA::stride`] or [`DFA::stride2`] are + /// often more useful. The alphabet length is typically useful only for + /// informational purposes. + pub fn alphabet_len(&self) -> usize { + self.tt.alphabet_len() + } + + /// Returns the total stride for every state in this DFA, expressed as the + /// exponent of a power of 2. The stride is the amount of space each state + /// takes up in the transition table, expressed as a number of transitions. + /// (Unused transitions map to dead states.) + /// + /// The stride of a DFA is always equivalent to the smallest power of 2 + /// that is greater than or equal to the DFA's alphabet length. This + /// definition uses extra space, but permits faster translation between + /// premultiplied state identifiers and contiguous indices (by using shifts + /// instead of relying on integer division). + /// + /// For example, if the DFA's stride is 16 transitions, then its `stride2` + /// is `4` since `2^4 = 16`. + /// + /// The minimum `stride2` value is `1` (corresponding to a stride of `2`) + /// while the maximum `stride2` value is `9` (corresponding to a stride of + /// `512`). The maximum is not `8` since the maximum alphabet size is `257` + /// when accounting for the special EOI transition. However, an alphabet + /// length of that size is exceptionally rare since the alphabet is shrunk + /// into equivalence classes. + pub fn stride2(&self) -> usize { + self.tt.stride2 + } + + /// Returns the total stride for every state in this DFA. This corresponds + /// to the total number of transitions used by each state in this DFA's + /// transition table. + /// + /// Please see [`DFA::stride2`] for more information. In particular, this + /// returns the stride as the number of transitions, where as `stride2` + /// returns it as the exponent of a power of 2. + pub fn stride(&self) -> usize { + self.tt.stride() + } + + /// Returns the memory usage, in bytes, of this DFA. + /// + /// The memory usage is computed based on the number of bytes used to + /// represent this DFA. + /// + /// This does **not** include the stack size used up by this DFA. To + /// compute that, use `std::mem::size_of::<dense::DFA>()`. + pub fn memory_usage(&self) -> usize { + self.tt.memory_usage() + + self.st.memory_usage() + + self.ms.memory_usage() + + self.accels.memory_usage() + } +} + +/// Routines for converting a dense DFA to other representations, such as +/// sparse DFAs or raw bytes suitable for persistent storage. +impl<T: AsRef<[u32]>> DFA<T> { + /// Convert this dense DFA to a sparse DFA. + /// + /// If a `StateID` is too small to represent all states in the sparse + /// DFA, then this returns an error. In most cases, if a dense DFA is + /// constructable with `StateID` then a sparse DFA will be as well. + /// However, it is not guaranteed. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch, Input}; + /// + /// let dense = dense::DFA::new("foo[0-9]+")?; + /// let sparse = dense.to_sparse()?; + /// + /// let expected = Some(HalfMatch::must(0, 8)); + /// assert_eq!(expected, sparse.try_search_fwd(&Input::new("foo12345"))?); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[cfg(feature = "dfa-build")] + pub fn to_sparse(&self) -> Result<sparse::DFA<Vec<u8>>, BuildError> { + sparse::DFA::from_dense(self) + } + + /// Serialize this DFA as raw bytes to a `Vec<u8>` in little endian + /// format. Upon success, the `Vec<u8>` and the initial padding length are + /// returned. + /// + /// The written bytes are guaranteed to be deserialized correctly and + /// without errors in a semver compatible release of this crate by a + /// `DFA`'s deserialization APIs (assuming all other criteria for the + /// deserialization APIs has been satisfied): + /// + /// * [`DFA::from_bytes`] + /// * [`DFA::from_bytes_unchecked`] + /// + /// The padding returned is non-zero if the returned `Vec<u8>` starts at + /// an address that does not have the same alignment as `u32`. The padding + /// corresponds to the number of leading bytes written to the returned + /// `Vec<u8>`. + /// + /// # Example + /// + /// This example shows how to serialize and deserialize a DFA: + /// + /// ``` + /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch, Input}; + /// + /// // Compile our original DFA. + /// let original_dfa = DFA::new("foo[0-9]+")?; + /// + /// // N.B. We use native endianness here to make the example work, but + /// // using to_bytes_little_endian would work on a little endian target. + /// let (buf, _) = original_dfa.to_bytes_native_endian(); + /// // Even if buf has initial padding, DFA::from_bytes will automatically + /// // ignore it. + /// let dfa: DFA<&[u32]> = DFA::from_bytes(&buf)?.0; + /// + /// let expected = Some(HalfMatch::must(0, 8)); + /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[cfg(feature = "dfa-build")] + pub fn to_bytes_little_endian(&self) -> (Vec<u8>, usize) { + self.to_bytes::<wire::LE>() + } + + /// Serialize this DFA as raw bytes to a `Vec<u8>` in big endian + /// format. Upon success, the `Vec<u8>` and the initial padding length are + /// returned. + /// + /// The written bytes are guaranteed to be deserialized correctly and + /// without errors in a semver compatible release of this crate by a + /// `DFA`'s deserialization APIs (assuming all other criteria for the + /// deserialization APIs has been satisfied): + /// + /// * [`DFA::from_bytes`] + /// * [`DFA::from_bytes_unchecked`] + /// + /// The padding returned is non-zero if the returned `Vec<u8>` starts at + /// an address that does not have the same alignment as `u32`. The padding + /// corresponds to the number of leading bytes written to the returned + /// `Vec<u8>`. + /// + /// # Example + /// + /// This example shows how to serialize and deserialize a DFA: + /// + /// ``` + /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch, Input}; + /// + /// // Compile our original DFA. + /// let original_dfa = DFA::new("foo[0-9]+")?; + /// + /// // N.B. We use native endianness here to make the example work, but + /// // using to_bytes_big_endian would work on a big endian target. + /// let (buf, _) = original_dfa.to_bytes_native_endian(); + /// // Even if buf has initial padding, DFA::from_bytes will automatically + /// // ignore it. + /// let dfa: DFA<&[u32]> = DFA::from_bytes(&buf)?.0; + /// + /// let expected = Some(HalfMatch::must(0, 8)); + /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[cfg(feature = "dfa-build")] + pub fn to_bytes_big_endian(&self) -> (Vec<u8>, usize) { + self.to_bytes::<wire::BE>() + } + + /// Serialize this DFA as raw bytes to a `Vec<u8>` in native endian + /// format. Upon success, the `Vec<u8>` and the initial padding length are + /// returned. + /// + /// The written bytes are guaranteed to be deserialized correctly and + /// without errors in a semver compatible release of this crate by a + /// `DFA`'s deserialization APIs (assuming all other criteria for the + /// deserialization APIs has been satisfied): + /// + /// * [`DFA::from_bytes`] + /// * [`DFA::from_bytes_unchecked`] + /// + /// The padding returned is non-zero if the returned `Vec<u8>` starts at + /// an address that does not have the same alignment as `u32`. The padding + /// corresponds to the number of leading bytes written to the returned + /// `Vec<u8>`. + /// + /// Generally speaking, native endian format should only be used when + /// you know that the target you're compiling the DFA for matches the + /// endianness of the target on which you're compiling DFA. For example, + /// if serialization and deserialization happen in the same process or on + /// the same machine. Otherwise, when serializing a DFA for use in a + /// portable environment, you'll almost certainly want to serialize _both_ + /// a little endian and a big endian version and then load the correct one + /// based on the target's configuration. + /// + /// # Example + /// + /// This example shows how to serialize and deserialize a DFA: + /// + /// ``` + /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch, Input}; + /// + /// // Compile our original DFA. + /// let original_dfa = DFA::new("foo[0-9]+")?; + /// + /// let (buf, _) = original_dfa.to_bytes_native_endian(); + /// // Even if buf has initial padding, DFA::from_bytes will automatically + /// // ignore it. + /// let dfa: DFA<&[u32]> = DFA::from_bytes(&buf)?.0; + /// + /// let expected = Some(HalfMatch::must(0, 8)); + /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[cfg(feature = "dfa-build")] + pub fn to_bytes_native_endian(&self) -> (Vec<u8>, usize) { + self.to_bytes::<wire::NE>() + } + + /// The implementation of the public `to_bytes` serialization methods, + /// which is generic over endianness. + #[cfg(feature = "dfa-build")] + fn to_bytes<E: Endian>(&self) -> (Vec<u8>, usize) { + let len = self.write_to_len(); + let (mut buf, padding) = wire::alloc_aligned_buffer::<u32>(len); + // This should always succeed since the only possible serialization + // error is providing a buffer that's too small, but we've ensured that + // `buf` is big enough here. + self.as_ref().write_to::<E>(&mut buf[padding..]).unwrap(); + (buf, padding) + } + + /// Serialize this DFA as raw bytes to the given slice, in little endian + /// format. Upon success, the total number of bytes written to `dst` is + /// returned. + /// + /// The written bytes are guaranteed to be deserialized correctly and + /// without errors in a semver compatible release of this crate by a + /// `DFA`'s deserialization APIs (assuming all other criteria for the + /// deserialization APIs has been satisfied): + /// + /// * [`DFA::from_bytes`] + /// * [`DFA::from_bytes_unchecked`] + /// + /// Note that unlike the various `to_byte_*` routines, this does not write + /// any padding. Callers are responsible for handling alignment correctly. + /// + /// # Errors + /// + /// This returns an error if the given destination slice is not big enough + /// to contain the full serialized DFA. If an error occurs, then nothing + /// is written to `dst`. + /// + /// # Example + /// + /// This example shows how to serialize and deserialize a DFA without + /// dynamic memory allocation. + /// + /// ``` + /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch, Input}; + /// + /// // Compile our original DFA. + /// let original_dfa = DFA::new("foo[0-9]+")?; + /// + /// // Create a 4KB buffer on the stack to store our serialized DFA. We + /// // need to use a special type to force the alignment of our [u8; N] + /// // array to be aligned to a 4 byte boundary. Otherwise, deserializing + /// // the DFA may fail because of an alignment mismatch. + /// #[repr(C)] + /// struct Aligned<B: ?Sized> { + /// _align: [u32; 0], + /// bytes: B, + /// } + /// let mut buf = Aligned { _align: [], bytes: [0u8; 4 * (1<<10)] }; + /// // N.B. We use native endianness here to make the example work, but + /// // using write_to_little_endian would work on a little endian target. + /// let written = original_dfa.write_to_native_endian(&mut buf.bytes)?; + /// let dfa: DFA<&[u32]> = DFA::from_bytes(&buf.bytes[..written])?.0; + /// + /// let expected = Some(HalfMatch::must(0, 8)); + /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn write_to_little_endian( + &self, + dst: &mut [u8], + ) -> Result<usize, SerializeError> { + self.as_ref().write_to::<wire::LE>(dst) + } + + /// Serialize this DFA as raw bytes to the given slice, in big endian + /// format. Upon success, the total number of bytes written to `dst` is + /// returned. + /// + /// The written bytes are guaranteed to be deserialized correctly and + /// without errors in a semver compatible release of this crate by a + /// `DFA`'s deserialization APIs (assuming all other criteria for the + /// deserialization APIs has been satisfied): + /// + /// * [`DFA::from_bytes`] + /// * [`DFA::from_bytes_unchecked`] + /// + /// Note that unlike the various `to_byte_*` routines, this does not write + /// any padding. Callers are responsible for handling alignment correctly. + /// + /// # Errors + /// + /// This returns an error if the given destination slice is not big enough + /// to contain the full serialized DFA. If an error occurs, then nothing + /// is written to `dst`. + /// + /// # Example + /// + /// This example shows how to serialize and deserialize a DFA without + /// dynamic memory allocation. + /// + /// ``` + /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch, Input}; + /// + /// // Compile our original DFA. + /// let original_dfa = DFA::new("foo[0-9]+")?; + /// + /// // Create a 4KB buffer on the stack to store our serialized DFA. We + /// // need to use a special type to force the alignment of our [u8; N] + /// // array to be aligned to a 4 byte boundary. Otherwise, deserializing + /// // the DFA may fail because of an alignment mismatch. + /// #[repr(C)] + /// struct Aligned<B: ?Sized> { + /// _align: [u32; 0], + /// bytes: B, + /// } + /// let mut buf = Aligned { _align: [], bytes: [0u8; 4 * (1<<10)] }; + /// // N.B. We use native endianness here to make the example work, but + /// // using write_to_big_endian would work on a big endian target. + /// let written = original_dfa.write_to_native_endian(&mut buf.bytes)?; + /// let dfa: DFA<&[u32]> = DFA::from_bytes(&buf.bytes[..written])?.0; + /// + /// let expected = Some(HalfMatch::must(0, 8)); + /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn write_to_big_endian( + &self, + dst: &mut [u8], + ) -> Result<usize, SerializeError> { + self.as_ref().write_to::<wire::BE>(dst) + } + + /// Serialize this DFA as raw bytes to the given slice, in native endian + /// format. Upon success, the total number of bytes written to `dst` is + /// returned. + /// + /// The written bytes are guaranteed to be deserialized correctly and + /// without errors in a semver compatible release of this crate by a + /// `DFA`'s deserialization APIs (assuming all other criteria for the + /// deserialization APIs has been satisfied): + /// + /// * [`DFA::from_bytes`] + /// * [`DFA::from_bytes_unchecked`] + /// + /// Generally speaking, native endian format should only be used when + /// you know that the target you're compiling the DFA for matches the + /// endianness of the target on which you're compiling DFA. For example, + /// if serialization and deserialization happen in the same process or on + /// the same machine. Otherwise, when serializing a DFA for use in a + /// portable environment, you'll almost certainly want to serialize _both_ + /// a little endian and a big endian version and then load the correct one + /// based on the target's configuration. + /// + /// Note that unlike the various `to_byte_*` routines, this does not write + /// any padding. Callers are responsible for handling alignment correctly. + /// + /// # Errors + /// + /// This returns an error if the given destination slice is not big enough + /// to contain the full serialized DFA. If an error occurs, then nothing + /// is written to `dst`. + /// + /// # Example + /// + /// This example shows how to serialize and deserialize a DFA without + /// dynamic memory allocation. + /// + /// ``` + /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch, Input}; + /// + /// // Compile our original DFA. + /// let original_dfa = DFA::new("foo[0-9]+")?; + /// + /// // Create a 4KB buffer on the stack to store our serialized DFA. We + /// // need to use a special type to force the alignment of our [u8; N] + /// // array to be aligned to a 4 byte boundary. Otherwise, deserializing + /// // the DFA may fail because of an alignment mismatch. + /// #[repr(C)] + /// struct Aligned<B: ?Sized> { + /// _align: [u32; 0], + /// bytes: B, + /// } + /// let mut buf = Aligned { _align: [], bytes: [0u8; 4 * (1<<10)] }; + /// let written = original_dfa.write_to_native_endian(&mut buf.bytes)?; + /// let dfa: DFA<&[u32]> = DFA::from_bytes(&buf.bytes[..written])?.0; + /// + /// let expected = Some(HalfMatch::must(0, 8)); + /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn write_to_native_endian( + &self, + dst: &mut [u8], + ) -> Result<usize, SerializeError> { + self.as_ref().write_to::<wire::NE>(dst) + } + + /// Return the total number of bytes required to serialize this DFA. + /// + /// This is useful for determining the size of the buffer required to pass + /// to one of the serialization routines: + /// + /// * [`DFA::write_to_little_endian`] + /// * [`DFA::write_to_big_endian`] + /// * [`DFA::write_to_native_endian`] + /// + /// Passing a buffer smaller than the size returned by this method will + /// result in a serialization error. Serialization routines are guaranteed + /// to succeed when the buffer is big enough. + /// + /// # Example + /// + /// This example shows how to dynamically allocate enough room to serialize + /// a DFA. + /// + /// ``` + /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch, Input}; + /// + /// let original_dfa = DFA::new("foo[0-9]+")?; + /// + /// let mut buf = vec![0; original_dfa.write_to_len()]; + /// // This is guaranteed to succeed, because the only serialization error + /// // that can occur is when the provided buffer is too small. But + /// // write_to_len guarantees a correct size. + /// let written = original_dfa.write_to_native_endian(&mut buf).unwrap(); + /// // But this is not guaranteed to succeed! In particular, + /// // deserialization requires proper alignment for &[u32], but our buffer + /// // was allocated as a &[u8] whose required alignment is smaller than + /// // &[u32]. However, it's likely to work in practice because of how most + /// // allocators work. So if you write code like this, make sure to either + /// // handle the error correctly and/or run it under Miri since Miri will + /// // likely provoke the error by returning Vec<u8> buffers with alignment + /// // less than &[u32]. + /// let dfa: DFA<&[u32]> = match DFA::from_bytes(&buf[..written]) { + /// // As mentioned above, it is legal for an error to be returned + /// // here. It is quite difficult to get a Vec<u8> with a guaranteed + /// // alignment equivalent to Vec<u32>. + /// Err(_) => return Ok(()), + /// Ok((dfa, _)) => dfa, + /// }; + /// + /// let expected = Some(HalfMatch::must(0, 8)); + /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// Note that this example isn't actually guaranteed to work! In + /// particular, if `buf` is not aligned to a 4-byte boundary, then the + /// `DFA::from_bytes` call will fail. If you need this to work, then you + /// either need to deal with adding some initial padding yourself, or use + /// one of the `to_bytes` methods, which will do it for you. + pub fn write_to_len(&self) -> usize { + wire::write_label_len(LABEL) + + wire::write_endianness_check_len() + + wire::write_version_len() + + size_of::<u32>() // unused, intended for future flexibility + + self.flags.write_to_len() + + self.tt.write_to_len() + + self.st.write_to_len() + + self.ms.write_to_len() + + self.special.write_to_len() + + self.accels.write_to_len() + + self.quitset.write_to_len() + } +} + +impl<'a> DFA<&'a [u32]> { + /// Safely deserialize a DFA with a specific state identifier + /// representation. Upon success, this returns both the deserialized DFA + /// and the number of bytes read from the given slice. Namely, the contents + /// of the slice beyond the DFA are not read. + /// + /// Deserializing a DFA using this routine will never allocate heap memory. + /// For safety purposes, the DFA's transition table will be verified such + /// that every transition points to a valid state. If this verification is + /// too costly, then a [`DFA::from_bytes_unchecked`] API is provided, which + /// will always execute in constant time. + /// + /// The bytes given must be generated by one of the serialization APIs + /// of a `DFA` using a semver compatible release of this crate. Those + /// include: + /// + /// * [`DFA::to_bytes_little_endian`] + /// * [`DFA::to_bytes_big_endian`] + /// * [`DFA::to_bytes_native_endian`] + /// * [`DFA::write_to_little_endian`] + /// * [`DFA::write_to_big_endian`] + /// * [`DFA::write_to_native_endian`] + /// + /// The `to_bytes` methods allocate and return a `Vec<u8>` for you, along + /// with handling alignment correctly. The `write_to` methods do not + /// allocate and write to an existing slice (which may be on the stack). + /// Since deserialization always uses the native endianness of the target + /// platform, the serialization API you use should match the endianness of + /// the target platform. (It's often a good idea to generate serialized + /// DFAs for both forms of endianness and then load the correct one based + /// on endianness.) + /// + /// # Errors + /// + /// Generally speaking, it's easier to state the conditions in which an + /// error is _not_ returned. All of the following must be true: + /// + /// * The bytes given must be produced by one of the serialization APIs + /// on this DFA, as mentioned above. + /// * The endianness of the target platform matches the endianness used to + /// serialized the provided DFA. + /// * The slice given must have the same alignment as `u32`. + /// + /// If any of the above are not true, then an error will be returned. + /// + /// # Panics + /// + /// This routine will never panic for any input. + /// + /// # Example + /// + /// This example shows how to serialize a DFA to raw bytes, deserialize it + /// and then use it for searching. + /// + /// ``` + /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch, Input}; + /// + /// let initial = DFA::new("foo[0-9]+")?; + /// let (bytes, _) = initial.to_bytes_native_endian(); + /// let dfa: DFA<&[u32]> = DFA::from_bytes(&bytes)?.0; + /// + /// let expected = Some(HalfMatch::must(0, 8)); + /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// # Example: dealing with alignment and padding + /// + /// In the above example, we used the `to_bytes_native_endian` method to + /// serialize a DFA, but we ignored part of its return value corresponding + /// to padding added to the beginning of the serialized DFA. This is OK + /// because deserialization will skip this initial padding. What matters + /// is that the address immediately following the padding has an alignment + /// that matches `u32`. That is, the following is an equivalent but + /// alternative way to write the above example: + /// + /// ``` + /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch, Input}; + /// + /// let initial = DFA::new("foo[0-9]+")?; + /// // Serialization returns the number of leading padding bytes added to + /// // the returned Vec<u8>. + /// let (bytes, pad) = initial.to_bytes_native_endian(); + /// let dfa: DFA<&[u32]> = DFA::from_bytes(&bytes[pad..])?.0; + /// + /// let expected = Some(HalfMatch::must(0, 8)); + /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// This padding is necessary because Rust's standard library does + /// not expose any safe and robust way of creating a `Vec<u8>` with a + /// guaranteed alignment other than 1. Now, in practice, the underlying + /// allocator is likely to provide a `Vec<u8>` that meets our alignment + /// requirements, which means `pad` is zero in practice most of the time. + /// + /// The purpose of exposing the padding like this is flexibility for the + /// caller. For example, if one wants to embed a serialized DFA into a + /// compiled program, then it's important to guarantee that it starts at a + /// `u32`-aligned address. The simplest way to do this is to discard the + /// padding bytes and set it up so that the serialized DFA itself begins at + /// a properly aligned address. We can show this in two parts. The first + /// part is serializing the DFA to a file: + /// + /// ```no_run + /// use regex_automata::dfa::dense::DFA; + /// + /// let dfa = DFA::new("foo[0-9]+")?; + /// + /// let (bytes, pad) = dfa.to_bytes_big_endian(); + /// // Write the contents of the DFA *without* the initial padding. + /// std::fs::write("foo.bigendian.dfa", &bytes[pad..])?; + /// + /// // Do it again, but this time for little endian. + /// let (bytes, pad) = dfa.to_bytes_little_endian(); + /// std::fs::write("foo.littleendian.dfa", &bytes[pad..])?; + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// And now the second part is embedding the DFA into the compiled program + /// and deserializing it at runtime on first use. We use conditional + /// compilation to choose the correct endianness. + /// + /// ```no_run + /// use regex_automata::{ + /// dfa::{Automaton, dense::DFA}, + /// util::{lazy::Lazy, wire::AlignAs}, + /// HalfMatch, Input, + /// }; + /// + /// // This crate provides its own "lazy" type, kind of like + /// // lazy_static! or once_cell::sync::Lazy. But it works in no-alloc + /// // no-std environments and let's us write this using completely + /// // safe code. + /// static RE: Lazy<DFA<&'static [u32]>> = Lazy::new(|| { + /// # const _: &str = stringify! { + /// // This assignment is made possible (implicitly) via the + /// // CoerceUnsized trait. This is what guarantees that our + /// // bytes are stored in memory on a 4 byte boundary. You + /// // *must* do this or something equivalent for correct + /// // deserialization. + /// static ALIGNED: &AlignAs<[u8], u32> = &AlignAs { + /// _align: [], + /// #[cfg(target_endian = "big")] + /// bytes: *include_bytes!("foo.bigendian.dfa"), + /// #[cfg(target_endian = "little")] + /// bytes: *include_bytes!("foo.littleendian.dfa"), + /// }; + /// # }; + /// # static ALIGNED: &AlignAs<[u8], u32> = &AlignAs { + /// # _align: [], + /// # bytes: [], + /// # }; + /// + /// let (dfa, _) = DFA::from_bytes(&ALIGNED.bytes) + /// .expect("serialized DFA should be valid"); + /// dfa + /// }); + /// + /// let expected = Ok(Some(HalfMatch::must(0, 8))); + /// assert_eq!(expected, RE.try_search_fwd(&Input::new("foo12345"))); + /// ``` + /// + /// An alternative to [`util::lazy::Lazy`](crate::util::lazy::Lazy) + /// is [`lazy_static`](https://crates.io/crates/lazy_static) or + /// [`once_cell`](https://crates.io/crates/once_cell), which provide + /// stronger guarantees (like the initialization function only being + /// executed once). And `once_cell` in particular provides a more + /// expressive API. But a `Lazy` value from this crate is likely just fine + /// in most circumstances. + /// + /// Note that regardless of which initialization method you use, you + /// will still need to use the [`AlignAs`](crate::util::wire::AlignAs) + /// trick above to force correct alignment, but this is safe to do and + /// `from_bytes` will return an error if you get it wrong. + pub fn from_bytes( + slice: &'a [u8], + ) -> Result<(DFA<&'a [u32]>, usize), DeserializeError> { + // SAFETY: This is safe because we validate the transition table, start + // table, match states and accelerators below. If any validation fails, + // then we return an error. + let (dfa, nread) = unsafe { DFA::from_bytes_unchecked(slice)? }; + dfa.tt.validate(&dfa.special)?; + dfa.st.validate(&dfa.tt)?; + dfa.ms.validate(&dfa)?; + dfa.accels.validate()?; + // N.B. dfa.special doesn't have a way to do unchecked deserialization, + // so it has already been validated. + Ok((dfa, nread)) + } + + /// Deserialize a DFA with a specific state identifier representation in + /// constant time by omitting the verification of the validity of the + /// transition table and other data inside the DFA. + /// + /// This is just like [`DFA::from_bytes`], except it can potentially return + /// a DFA that exhibits undefined behavior if its transition table contains + /// invalid state identifiers. + /// + /// This routine is useful if you need to deserialize a DFA cheaply + /// and cannot afford the transition table validation performed by + /// `from_bytes`. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch, Input}; + /// + /// let initial = DFA::new("foo[0-9]+")?; + /// let (bytes, _) = initial.to_bytes_native_endian(); + /// // SAFETY: This is guaranteed to be safe since the bytes given come + /// // directly from a compatible serialization routine. + /// let dfa: DFA<&[u32]> = unsafe { DFA::from_bytes_unchecked(&bytes)?.0 }; + /// + /// let expected = Some(HalfMatch::must(0, 8)); + /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub unsafe fn from_bytes_unchecked( + slice: &'a [u8], + ) -> Result<(DFA<&'a [u32]>, usize), DeserializeError> { + let mut nr = 0; + + nr += wire::skip_initial_padding(slice); + wire::check_alignment::<StateID>(&slice[nr..])?; + nr += wire::read_label(&slice[nr..], LABEL)?; + nr += wire::read_endianness_check(&slice[nr..])?; + nr += wire::read_version(&slice[nr..], VERSION)?; + + let _unused = wire::try_read_u32(&slice[nr..], "unused space")?; + nr += size_of::<u32>(); + + let (flags, nread) = Flags::from_bytes(&slice[nr..])?; + nr += nread; + + let (tt, nread) = TransitionTable::from_bytes_unchecked(&slice[nr..])?; + nr += nread; + + let (st, nread) = StartTable::from_bytes_unchecked(&slice[nr..])?; + nr += nread; + + let (ms, nread) = MatchStates::from_bytes_unchecked(&slice[nr..])?; + nr += nread; + + let (special, nread) = Special::from_bytes(&slice[nr..])?; + nr += nread; + special.validate_state_len(tt.len(), tt.stride2)?; + + let (accels, nread) = Accels::from_bytes_unchecked(&slice[nr..])?; + nr += nread; + + let (quitset, nread) = ByteSet::from_bytes(&slice[nr..])?; + nr += nread; + + // Prefilters don't support serialization, so they're always absent. + let pre = None; + Ok((DFA { tt, st, ms, special, accels, pre, quitset, flags }, nr)) + } + + /// The implementation of the public `write_to` serialization methods, + /// which is generic over endianness. + /// + /// This is defined only for &[u32] to reduce binary size/compilation time. + fn write_to<E: Endian>( + &self, + mut dst: &mut [u8], + ) -> Result<usize, SerializeError> { + let nwrite = self.write_to_len(); + if dst.len() < nwrite { + return Err(SerializeError::buffer_too_small("dense DFA")); + } + dst = &mut dst[..nwrite]; + + let mut nw = 0; + nw += wire::write_label(LABEL, &mut dst[nw..])?; + nw += wire::write_endianness_check::<E>(&mut dst[nw..])?; + nw += wire::write_version::<E>(VERSION, &mut dst[nw..])?; + nw += { + // Currently unused, intended for future flexibility + E::write_u32(0, &mut dst[nw..]); + size_of::<u32>() + }; + nw += self.flags.write_to::<E>(&mut dst[nw..])?; + nw += self.tt.write_to::<E>(&mut dst[nw..])?; + nw += self.st.write_to::<E>(&mut dst[nw..])?; + nw += self.ms.write_to::<E>(&mut dst[nw..])?; + nw += self.special.write_to::<E>(&mut dst[nw..])?; + nw += self.accels.write_to::<E>(&mut dst[nw..])?; + nw += self.quitset.write_to::<E>(&mut dst[nw..])?; + Ok(nw) + } +} + +// The following methods implement mutable routines on the internal +// representation of a DFA. As such, we must fix the first type parameter to a +// `Vec<u32>` since a generic `T: AsRef<[u32]>` does not permit mutation. We +// can get away with this because these methods are internal to the crate and +// are exclusively used during construction of the DFA. +#[cfg(feature = "dfa-build")] +impl OwnedDFA { + /// Add a start state of this DFA. + pub(crate) fn set_start_state( + &mut self, + anchored: Anchored, + start: Start, + id: StateID, + ) { + assert!(self.tt.is_valid(id), "invalid start state"); + self.st.set_start(anchored, start, id); + } + + /// Set the given transition to this DFA. Both the `from` and `to` states + /// must already exist. + pub(crate) fn set_transition( + &mut self, + from: StateID, + byte: alphabet::Unit, + to: StateID, + ) { + self.tt.set(from, byte, to); + } + + /// An an empty state (a state where all transitions lead to a dead state) + /// and return its identifier. The identifier returned is guaranteed to + /// not point to any other existing state. + /// + /// If adding a state would exceed `StateID::LIMIT`, then this returns an + /// error. + pub(crate) fn add_empty_state(&mut self) -> Result<StateID, BuildError> { + self.tt.add_empty_state() + } + + /// Swap the two states given in the transition table. + /// + /// This routine does not do anything to check the correctness of this + /// swap. Callers must ensure that other states pointing to id1 and id2 are + /// updated appropriately. + pub(crate) fn swap_states(&mut self, id1: StateID, id2: StateID) { + self.tt.swap(id1, id2); + } + + /// Remap all of the state identifiers in this DFA according to the map + /// function given. This includes all transitions and all starting state + /// identifiers. + pub(crate) fn remap(&mut self, map: impl Fn(StateID) -> StateID) { + // We could loop over each state ID and call 'remap_state' here, but + // this is more direct: just map every transition directly. This + // technically might do a little extra work since the alphabet length + // is likely less than the stride, but if that is indeed an issue we + // should benchmark it and fix it. + for sid in self.tt.table_mut().iter_mut() { + *sid = map(*sid); + } + for sid in self.st.table_mut().iter_mut() { + *sid = map(*sid); + } + } + + /// Remap the transitions for the state given according to the function + /// given. This applies the given map function to every transition in the + /// given state and changes the transition in place to the result of the + /// map function for that transition. + pub(crate) fn remap_state( + &mut self, + id: StateID, + map: impl Fn(StateID) -> StateID, + ) { + self.tt.remap(id, map); + } + + /// Truncate the states in this DFA to the given length. + /// + /// This routine does not do anything to check the correctness of this + /// truncation. Callers must ensure that other states pointing to truncated + /// states are updated appropriately. + pub(crate) fn truncate_states(&mut self, len: usize) { + self.tt.truncate(len); + } + + /// Minimize this DFA in place using Hopcroft's algorithm. + pub(crate) fn minimize(&mut self) { + Minimizer::new(self).run(); + } + + /// Updates the match state pattern ID map to use the one provided. + /// + /// This is useful when it's convenient to manipulate matching states + /// (and their corresponding pattern IDs) as a map. In particular, the + /// representation used by a DFA for this map is not amenable to mutation, + /// so if things need to be changed (like when shuffling states), it's + /// often easier to work with the map form. + pub(crate) fn set_pattern_map( + &mut self, + map: &BTreeMap<StateID, Vec<PatternID>>, + ) -> Result<(), BuildError> { + self.ms = self.ms.new_with_map(map)?; + Ok(()) + } + + /// Find states that have a small number of non-loop transitions and mark + /// them as candidates for acceleration during search. + pub(crate) fn accelerate(&mut self) { + // dead and quit states can never be accelerated. + if self.state_len() <= 2 { + return; + } + + // Go through every state and record their accelerator, if possible. + let mut accels = BTreeMap::new(); + // Count the number of accelerated match, start and non-match/start + // states. + let (mut cmatch, mut cstart, mut cnormal) = (0, 0, 0); + for state in self.states() { + if let Some(accel) = state.accelerate(self.byte_classes()) { + debug!( + "accelerating full DFA state {}: {:?}", + state.id().as_usize(), + accel, + ); + accels.insert(state.id(), accel); + if self.is_match_state(state.id()) { + cmatch += 1; + } else if self.is_start_state(state.id()) { + cstart += 1; + } else { + assert!(!self.is_dead_state(state.id())); + assert!(!self.is_quit_state(state.id())); + cnormal += 1; + } + } + } + // If no states were able to be accelerated, then we're done. + if accels.is_empty() { + return; + } + let original_accels_len = accels.len(); + + // A remapper keeps track of state ID changes. Once we're done + // shuffling, the remapper is used to rewrite all transitions in the + // DFA based on the new positions of states. + let mut remapper = Remapper::new(self); + + // As we swap states, if they are match states, we need to swap their + // pattern ID lists too (for multi-regexes). We do this by converting + // the lists to an easily swappable map, and then convert back to + // MatchStates once we're done. + let mut new_matches = self.ms.to_map(self); + + // There is at least one state that gets accelerated, so these are + // guaranteed to get set to sensible values below. + self.special.min_accel = StateID::MAX; + self.special.max_accel = StateID::ZERO; + let update_special_accel = + |special: &mut Special, accel_id: StateID| { + special.min_accel = cmp::min(special.min_accel, accel_id); + special.max_accel = cmp::max(special.max_accel, accel_id); + }; + + // Start by shuffling match states. Any match states that are + // accelerated get moved to the end of the match state range. + if cmatch > 0 && self.special.matches() { + // N.B. special.{min,max}_match do not need updating, since the + // range/number of match states does not change. Only the ordering + // of match states may change. + let mut next_id = self.special.max_match; + let mut cur_id = next_id; + while cur_id >= self.special.min_match { + if let Some(accel) = accels.remove(&cur_id) { + accels.insert(next_id, accel); + update_special_accel(&mut self.special, next_id); + + // No need to do any actual swapping for equivalent IDs. + if cur_id != next_id { + remapper.swap(self, cur_id, next_id); + + // Swap pattern IDs for match states. + let cur_pids = new_matches.remove(&cur_id).unwrap(); + let next_pids = new_matches.remove(&next_id).unwrap(); + new_matches.insert(cur_id, next_pids); + new_matches.insert(next_id, cur_pids); + } + next_id = self.tt.prev_state_id(next_id); + } + cur_id = self.tt.prev_state_id(cur_id); + } + } + + // This is where it gets tricky. Without acceleration, start states + // normally come right after match states. But we want accelerated + // states to be a single contiguous range (to make it very fast + // to determine whether a state *is* accelerated), while also keeping + // match and starting states as contiguous ranges for the same reason. + // So what we do here is shuffle states such that it looks like this: + // + // DQMMMMAAAAASSSSSSNNNNNNN + // | | + // |---------| + // accelerated states + // + // Where: + // D - dead state + // Q - quit state + // M - match state (may be accelerated) + // A - normal state that is accelerated + // S - start state (may be accelerated) + // N - normal state that is NOT accelerated + // + // We implement this by shuffling states, which is done by a sequence + // of pairwise swaps. We start by looking at all normal states to be + // accelerated. When we find one, we swap it with the earliest starting + // state, and then swap that with the earliest normal state. This + // preserves the contiguous property. + // + // Once we're done looking for accelerated normal states, now we look + // for accelerated starting states by moving them to the beginning + // of the starting state range (just like we moved accelerated match + // states to the end of the matching state range). + // + // For a more detailed/different perspective on this, see the docs + // in dfa/special.rs. + if cnormal > 0 { + // our next available starting and normal states for swapping. + let mut next_start_id = self.special.min_start; + let mut cur_id = self.to_state_id(self.state_len() - 1); + // This is guaranteed to exist since cnormal > 0. + let mut next_norm_id = + self.tt.next_state_id(self.special.max_start); + while cur_id >= next_norm_id { + if let Some(accel) = accels.remove(&cur_id) { + remapper.swap(self, next_start_id, cur_id); + remapper.swap(self, next_norm_id, cur_id); + // Keep our accelerator map updated with new IDs if the + // states we swapped were also accelerated. + if let Some(accel2) = accels.remove(&next_norm_id) { + accels.insert(cur_id, accel2); + } + if let Some(accel2) = accels.remove(&next_start_id) { + accels.insert(next_norm_id, accel2); + } + accels.insert(next_start_id, accel); + update_special_accel(&mut self.special, next_start_id); + // Our start range shifts one to the right now. + self.special.min_start = + self.tt.next_state_id(self.special.min_start); + self.special.max_start = + self.tt.next_state_id(self.special.max_start); + next_start_id = self.tt.next_state_id(next_start_id); + next_norm_id = self.tt.next_state_id(next_norm_id); + } + // This is pretty tricky, but if our 'next_norm_id' state also + // happened to be accelerated, then the result is that it is + // now in the position of cur_id, so we need to consider it + // again. This loop is still guaranteed to terminate though, + // because when accels contains cur_id, we're guaranteed to + // increment next_norm_id even if cur_id remains unchanged. + if !accels.contains_key(&cur_id) { + cur_id = self.tt.prev_state_id(cur_id); + } + } + } + // Just like we did for match states, but we want to move accelerated + // start states to the beginning of the range instead of the end. + if cstart > 0 { + // N.B. special.{min,max}_start do not need updating, since the + // range/number of start states does not change at this point. Only + // the ordering of start states may change. + let mut next_id = self.special.min_start; + let mut cur_id = next_id; + while cur_id <= self.special.max_start { + if let Some(accel) = accels.remove(&cur_id) { + remapper.swap(self, cur_id, next_id); + accels.insert(next_id, accel); + update_special_accel(&mut self.special, next_id); + next_id = self.tt.next_state_id(next_id); + } + cur_id = self.tt.next_state_id(cur_id); + } + } + + // Remap all transitions in our DFA and assert some things. + remapper.remap(self); + // This unwrap is OK because acceleration never changes the number of + // match states or patterns in those match states. Since acceleration + // runs after the pattern map has been set at least once, we know that + // our match states cannot error. + self.set_pattern_map(&new_matches).unwrap(); + self.special.set_max(); + self.special.validate().expect("special state ranges should validate"); + self.special + .validate_state_len(self.state_len(), self.stride2()) + .expect( + "special state ranges should be consistent with state length", + ); + assert_eq!( + self.special.accel_len(self.stride()), + // We record the number of accelerated states initially detected + // since the accels map is itself mutated in the process above. + // If mutated incorrectly, its size may change, and thus can't be + // trusted as a source of truth of how many accelerated states we + // expected there to be. + original_accels_len, + "mismatch with expected number of accelerated states", + ); + + // And finally record our accelerators. We kept our accels map updated + // as we shuffled states above, so the accelerators should now + // correspond to a contiguous range in the state ID space. (Which we + // assert.) + let mut prev: Option<StateID> = None; + for (id, accel) in accels { + assert!(prev.map_or(true, |p| self.tt.next_state_id(p) == id)); + prev = Some(id); + self.accels.add(accel); + } + } + + /// Shuffle the states in this DFA so that starting states, match + /// states and accelerated states are all contiguous. + /// + /// See dfa/special.rs for more details. + pub(crate) fn shuffle( + &mut self, + mut matches: BTreeMap<StateID, Vec<PatternID>>, + ) -> Result<(), BuildError> { + // The determinizer always adds a quit state and it is always second. + self.special.quit_id = self.to_state_id(1); + // If all we have are the dead and quit states, then we're done and + // the DFA will never produce a match. + if self.state_len() <= 2 { + self.special.set_max(); + return Ok(()); + } + + // Collect all our non-DEAD start states into a convenient set and + // confirm there is no overlap with match states. In the classicl DFA + // construction, start states can be match states. But because of + // look-around, we delay all matches by a byte, which prevents start + // states from being match states. + let mut is_start: BTreeSet<StateID> = BTreeSet::new(); + for (start_id, _, _) in self.starts() { + // If a starting configuration points to a DEAD state, then we + // don't want to shuffle it. The DEAD state is always the first + // state with ID=0. So we can just leave it be. + if start_id == DEAD { + continue; + } + assert!( + !matches.contains_key(&start_id), + "{:?} is both a start and a match state, which is not allowed", + start_id, + ); + is_start.insert(start_id); + } + + // We implement shuffling by a sequence of pairwise swaps of states. + // Since we have a number of things referencing states via their + // IDs and swapping them changes their IDs, we need to record every + // swap we make so that we can remap IDs. The remapper handles this + // book-keeping for us. + let mut remapper = Remapper::new(self); + + // Shuffle matching states. + if matches.is_empty() { + self.special.min_match = DEAD; + self.special.max_match = DEAD; + } else { + // The determinizer guarantees that the first two states are the + // dead and quit states, respectively. We want our match states to + // come right after quit. + let mut next_id = self.to_state_id(2); + let mut new_matches = BTreeMap::new(); + self.special.min_match = next_id; + for (id, pids) in matches { + remapper.swap(self, next_id, id); + new_matches.insert(next_id, pids); + // If we swapped a start state, then update our set. + if is_start.contains(&next_id) { + is_start.remove(&next_id); + is_start.insert(id); + } + next_id = self.tt.next_state_id(next_id); + } + matches = new_matches; + self.special.max_match = cmp::max( + self.special.min_match, + self.tt.prev_state_id(next_id), + ); + } + + // Shuffle starting states. + { + let mut next_id = self.to_state_id(2); + if self.special.matches() { + next_id = self.tt.next_state_id(self.special.max_match); + } + self.special.min_start = next_id; + for id in is_start { + remapper.swap(self, next_id, id); + next_id = self.tt.next_state_id(next_id); + } + self.special.max_start = cmp::max( + self.special.min_start, + self.tt.prev_state_id(next_id), + ); + } + + // Finally remap all transitions in our DFA. + remapper.remap(self); + self.set_pattern_map(&matches)?; + self.special.set_max(); + self.special.validate().expect("special state ranges should validate"); + self.special + .validate_state_len(self.state_len(), self.stride2()) + .expect( + "special state ranges should be consistent with state length", + ); + Ok(()) + } + + /// Checks whether there are universal start states (both anchored and + /// unanchored), and if so, sets the relevant fields to the start state + /// IDs. + /// + /// Universal start states occur precisely when the all patterns in the + /// DFA have no look-around assertions in their prefix. + fn set_universal_starts(&mut self) { + assert_eq!(6, Start::len(), "expected 6 start configurations"); + + let start_id = |dfa: &mut OwnedDFA, inp: &Input<'_>, start: Start| { + // This OK because we only call 'start' under conditions + // in which we know it will succeed. + dfa.st.start(inp, start).expect("valid Input configuration") + }; + if self.start_kind().has_unanchored() { + let inp = Input::new("").anchored(Anchored::No); + let sid = start_id(self, &inp, Start::NonWordByte); + if sid == start_id(self, &inp, Start::WordByte) + && sid == start_id(self, &inp, Start::Text) + && sid == start_id(self, &inp, Start::LineLF) + && sid == start_id(self, &inp, Start::LineCR) + && sid == start_id(self, &inp, Start::CustomLineTerminator) + { + self.st.universal_start_unanchored = Some(sid); + } + } + if self.start_kind().has_anchored() { + let inp = Input::new("").anchored(Anchored::Yes); + let sid = start_id(self, &inp, Start::NonWordByte); + if sid == start_id(self, &inp, Start::WordByte) + && sid == start_id(self, &inp, Start::Text) + && sid == start_id(self, &inp, Start::LineLF) + && sid == start_id(self, &inp, Start::LineCR) + && sid == start_id(self, &inp, Start::CustomLineTerminator) + { + self.st.universal_start_anchored = Some(sid); + } + } + } +} + +// A variety of generic internal methods for accessing DFA internals. +impl<T: AsRef<[u32]>> DFA<T> { + /// Return the info about special states. + pub(crate) fn special(&self) -> &Special { + &self.special + } + + /// Return the info about special states as a mutable borrow. + #[cfg(feature = "dfa-build")] + pub(crate) fn special_mut(&mut self) -> &mut Special { + &mut self.special + } + + /// Returns the quit set (may be empty) used by this DFA. + pub(crate) fn quitset(&self) -> &ByteSet { + &self.quitset + } + + /// Returns the flags for this DFA. + pub(crate) fn flags(&self) -> &Flags { + &self.flags + } + + /// Returns an iterator over all states in this DFA. + /// + /// This iterator yields a tuple for each state. The first element of the + /// tuple corresponds to a state's identifier, and the second element + /// corresponds to the state itself (comprised of its transitions). + pub(crate) fn states(&self) -> StateIter<'_, T> { + self.tt.states() + } + + /// Return the total number of states in this DFA. Every DFA has at least + /// 1 state, even the empty DFA. + pub(crate) fn state_len(&self) -> usize { + self.tt.len() + } + + /// Return an iterator over all pattern IDs for the given match state. + /// + /// If the given state is not a match state, then this panics. + #[cfg(feature = "dfa-build")] + pub(crate) fn pattern_id_slice(&self, id: StateID) -> &[PatternID] { + assert!(self.is_match_state(id)); + self.ms.pattern_id_slice(self.match_state_index(id)) + } + + /// Return the total number of pattern IDs for the given match state. + /// + /// If the given state is not a match state, then this panics. + pub(crate) fn match_pattern_len(&self, id: StateID) -> usize { + assert!(self.is_match_state(id)); + self.ms.pattern_len(self.match_state_index(id)) + } + + /// Returns the total number of patterns matched by this DFA. + pub(crate) fn pattern_len(&self) -> usize { + self.ms.pattern_len + } + + /// Returns a map from match state ID to a list of pattern IDs that match + /// in that state. + #[cfg(feature = "dfa-build")] + pub(crate) fn pattern_map(&self) -> BTreeMap<StateID, Vec<PatternID>> { + self.ms.to_map(self) + } + + /// Returns the ID of the quit state for this DFA. + #[cfg(feature = "dfa-build")] + pub(crate) fn quit_id(&self) -> StateID { + self.to_state_id(1) + } + + /// Convert the given state identifier to the state's index. The state's + /// index corresponds to the position in which it appears in the transition + /// table. When a DFA is NOT premultiplied, then a state's identifier is + /// also its index. When a DFA is premultiplied, then a state's identifier + /// is equal to `index * alphabet_len`. This routine reverses that. + pub(crate) fn to_index(&self, id: StateID) -> usize { + self.tt.to_index(id) + } + + /// Convert an index to a state (in the range 0..self.state_len()) to an + /// actual state identifier. + /// + /// This is useful when using a `Vec<T>` as an efficient map keyed by state + /// to some other information (such as a remapped state ID). + #[cfg(feature = "dfa-build")] + pub(crate) fn to_state_id(&self, index: usize) -> StateID { + self.tt.to_state_id(index) + } + + /// Return the table of state IDs for this DFA's start states. + pub(crate) fn starts(&self) -> StartStateIter<'_> { + self.st.iter() + } + + /// Returns the index of the match state for the given ID. If the + /// given ID does not correspond to a match state, then this may + /// panic or produce an incorrect result. + #[cfg_attr(feature = "perf-inline", inline(always))] + fn match_state_index(&self, id: StateID) -> usize { + debug_assert!(self.is_match_state(id)); + // This is one of the places where we rely on the fact that match + // states are contiguous in the transition table. Namely, that the + // first match state ID always corresponds to dfa.special.min_match. + // From there, since we know the stride, we can compute the overall + // index of any match state given the match state's ID. + let min = self.special().min_match.as_usize(); + // CORRECTNESS: We're allowed to produce an incorrect result or panic, + // so both the subtraction and the unchecked StateID construction is + // OK. + self.to_index(StateID::new_unchecked(id.as_usize() - min)) + } + + /// Returns the index of the accelerator state for the given ID. If the + /// given ID does not correspond to an accelerator state, then this may + /// panic or produce an incorrect result. + fn accelerator_index(&self, id: StateID) -> usize { + let min = self.special().min_accel.as_usize(); + // CORRECTNESS: We're allowed to produce an incorrect result or panic, + // so both the subtraction and the unchecked StateID construction is + // OK. + self.to_index(StateID::new_unchecked(id.as_usize() - min)) + } + + /// Return the accelerators for this DFA. + fn accels(&self) -> Accels<&[u32]> { + self.accels.as_ref() + } + + /// Return this DFA's transition table as a slice. + fn trans(&self) -> &[StateID] { + self.tt.table() + } +} + +impl<T: AsRef<[u32]>> fmt::Debug for DFA<T> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + writeln!(f, "dense::DFA(")?; + for state in self.states() { + fmt_state_indicator(f, self, state.id())?; + let id = if f.alternate() { + state.id().as_usize() + } else { + self.to_index(state.id()) + }; + write!(f, "{:06?}: ", id)?; + state.fmt(f)?; + write!(f, "\n")?; + } + writeln!(f, "")?; + for (i, (start_id, anchored, sty)) in self.starts().enumerate() { + let id = if f.alternate() { + start_id.as_usize() + } else { + self.to_index(start_id) + }; + if i % self.st.stride == 0 { + match anchored { + Anchored::No => writeln!(f, "START-GROUP(unanchored)")?, + Anchored::Yes => writeln!(f, "START-GROUP(anchored)")?, + Anchored::Pattern(pid) => { + writeln!(f, "START_GROUP(pattern: {:?})", pid)? + } + } + } + writeln!(f, " {:?} => {:06?}", sty, id)?; + } + if self.pattern_len() > 1 { + writeln!(f, "")?; + for i in 0..self.ms.len() { + let id = self.ms.match_state_id(self, i); + let id = if f.alternate() { + id.as_usize() + } else { + self.to_index(id) + }; + write!(f, "MATCH({:06?}): ", id)?; + for (i, &pid) in self.ms.pattern_id_slice(i).iter().enumerate() + { + if i > 0 { + write!(f, ", ")?; + } + write!(f, "{:?}", pid)?; + } + writeln!(f, "")?; + } + } + writeln!(f, "state length: {:?}", self.state_len())?; + writeln!(f, "pattern length: {:?}", self.pattern_len())?; + writeln!(f, "flags: {:?}", self.flags)?; + writeln!(f, ")")?; + Ok(()) + } +} + +// SAFETY: We assert that our implementation of each method is correct. +unsafe impl<T: AsRef<[u32]>> Automaton for DFA<T> { + #[cfg_attr(feature = "perf-inline", inline(always))] + fn is_special_state(&self, id: StateID) -> bool { + self.special.is_special_state(id) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn is_dead_state(&self, id: StateID) -> bool { + self.special.is_dead_state(id) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn is_quit_state(&self, id: StateID) -> bool { + self.special.is_quit_state(id) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn is_match_state(&self, id: StateID) -> bool { + self.special.is_match_state(id) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn is_start_state(&self, id: StateID) -> bool { + self.special.is_start_state(id) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn is_accel_state(&self, id: StateID) -> bool { + self.special.is_accel_state(id) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn next_state(&self, current: StateID, input: u8) -> StateID { + let input = self.byte_classes().get(input); + let o = current.as_usize() + usize::from(input); + self.trans()[o] + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + unsafe fn next_state_unchecked( + &self, + current: StateID, + byte: u8, + ) -> StateID { + // We don't (or shouldn't) need an unchecked variant for the byte + // class mapping, since bound checks should be omitted automatically + // by virtue of its representation. If this ends up not being true as + // confirmed by codegen, please file an issue. ---AG + let class = self.byte_classes().get(byte); + let o = current.as_usize() + usize::from(class); + let next = *self.trans().get_unchecked(o); + next + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn next_eoi_state(&self, current: StateID) -> StateID { + let eoi = self.byte_classes().eoi().as_usize(); + let o = current.as_usize() + eoi; + self.trans()[o] + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn pattern_len(&self) -> usize { + self.ms.pattern_len + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn match_len(&self, id: StateID) -> usize { + self.match_pattern_len(id) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn match_pattern(&self, id: StateID, match_index: usize) -> PatternID { + // This is an optimization for the very common case of a DFA with a + // single pattern. This conditional avoids a somewhat more costly path + // that finds the pattern ID from the state machine, which requires + // a bit of slicing/pointer-chasing. This optimization tends to only + // matter when matches are frequent. + if self.ms.pattern_len == 1 { + return PatternID::ZERO; + } + let state_index = self.match_state_index(id); + self.ms.pattern_id(state_index, match_index) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn has_empty(&self) -> bool { + self.flags.has_empty + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn is_utf8(&self) -> bool { + self.flags.is_utf8 + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn is_always_start_anchored(&self) -> bool { + self.flags.is_always_start_anchored + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn start_state_forward( + &self, + input: &Input<'_>, + ) -> Result<StateID, MatchError> { + if !self.quitset.is_empty() && input.start() > 0 { + let offset = input.start() - 1; + let byte = input.haystack()[offset]; + if self.quitset.contains(byte) { + return Err(MatchError::quit(byte, offset)); + } + } + let start = self.st.start_map.fwd(&input); + self.st.start(input, start) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn start_state_reverse( + &self, + input: &Input<'_>, + ) -> Result<StateID, MatchError> { + if !self.quitset.is_empty() && input.end() < input.haystack().len() { + let offset = input.end(); + let byte = input.haystack()[offset]; + if self.quitset.contains(byte) { + return Err(MatchError::quit(byte, offset)); + } + } + let start = self.st.start_map.rev(&input); + self.st.start(input, start) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn universal_start_state(&self, mode: Anchored) -> Option<StateID> { + match mode { + Anchored::No => self.st.universal_start_unanchored, + Anchored::Yes => self.st.universal_start_anchored, + Anchored::Pattern(_) => None, + } + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn accelerator(&self, id: StateID) -> &[u8] { + if !self.is_accel_state(id) { + return &[]; + } + self.accels.needles(self.accelerator_index(id)) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn get_prefilter(&self) -> Option<&Prefilter> { + self.pre.as_ref() + } +} + +/// The transition table portion of a dense DFA. +/// +/// The transition table is the core part of the DFA in that it describes how +/// to move from one state to another based on the input sequence observed. +#[derive(Clone)] +pub(crate) struct TransitionTable<T> { + /// A contiguous region of memory representing the transition table in + /// row-major order. The representation is dense. That is, every state + /// has precisely the same number of transitions. The maximum number of + /// transitions per state is 257 (256 for each possible byte value, plus 1 + /// for the special EOI transition). If a DFA has been instructed to use + /// byte classes (the default), then the number of transitions is usually + /// substantially fewer. + /// + /// In practice, T is either `Vec<u32>` or `&[u32]`. + table: T, + /// A set of equivalence classes, where a single equivalence class + /// represents a set of bytes that never discriminate between a match + /// and a non-match in the DFA. Each equivalence class corresponds to a + /// single character in this DFA's alphabet, where the maximum number of + /// characters is 257 (each possible value of a byte plus the special + /// EOI transition). Consequently, the number of equivalence classes + /// corresponds to the number of transitions for each DFA state. Note + /// though that the *space* used by each DFA state in the transition table + /// may be larger. The total space used by each DFA state is known as the + /// stride. + /// + /// The only time the number of equivalence classes is fewer than 257 is if + /// the DFA's kind uses byte classes (which is the default). Equivalence + /// classes should generally only be disabled when debugging, so that + /// the transitions themselves aren't obscured. Disabling them has no + /// other benefit, since the equivalence class map is always used while + /// searching. In the vast majority of cases, the number of equivalence + /// classes is substantially smaller than 257, particularly when large + /// Unicode classes aren't used. + classes: ByteClasses, + /// The stride of each DFA state, expressed as a power-of-two exponent. + /// + /// The stride of a DFA corresponds to the total amount of space used by + /// each DFA state in the transition table. This may be bigger than the + /// size of a DFA's alphabet, since the stride is always the smallest + /// power of two greater than or equal to the alphabet size. + /// + /// While this wastes space, this avoids the need for integer division + /// to convert between premultiplied state IDs and their corresponding + /// indices. Instead, we can use simple bit-shifts. + /// + /// See the docs for the `stride2` method for more details. + /// + /// The minimum `stride2` value is `1` (corresponding to a stride of `2`) + /// while the maximum `stride2` value is `9` (corresponding to a stride of + /// `512`). The maximum is not `8` since the maximum alphabet size is `257` + /// when accounting for the special EOI transition. However, an alphabet + /// length of that size is exceptionally rare since the alphabet is shrunk + /// into equivalence classes. + stride2: usize, +} + +impl<'a> TransitionTable<&'a [u32]> { + /// Deserialize a transition table starting at the beginning of `slice`. + /// Upon success, return the total number of bytes read along with the + /// transition table. + /// + /// If there was a problem deserializing any part of the transition table, + /// then this returns an error. Notably, if the given slice does not have + /// the same alignment as `StateID`, then this will return an error (among + /// other possible errors). + /// + /// This is guaranteed to execute in constant time. + /// + /// # Safety + /// + /// This routine is not safe because it does not check the validity of the + /// transition table itself. In particular, the transition table can be + /// quite large, so checking its validity can be somewhat expensive. An + /// invalid transition table is not safe because other code may rely on the + /// transition table being correct (such as explicit bounds check elision). + /// Therefore, an invalid transition table can lead to undefined behavior. + /// + /// Callers that use this function must either pass on the safety invariant + /// or guarantee that the bytes given contain a valid transition table. + /// This guarantee is upheld by the bytes written by `write_to`. + unsafe fn from_bytes_unchecked( + mut slice: &'a [u8], + ) -> Result<(TransitionTable<&'a [u32]>, usize), DeserializeError> { + let slice_start = slice.as_ptr().as_usize(); + + let (state_len, nr) = + wire::try_read_u32_as_usize(slice, "state length")?; + slice = &slice[nr..]; + + let (stride2, nr) = wire::try_read_u32_as_usize(slice, "stride2")?; + slice = &slice[nr..]; + + let (classes, nr) = ByteClasses::from_bytes(slice)?; + slice = &slice[nr..]; + + // The alphabet length (determined by the byte class map) cannot be + // bigger than the stride (total space used by each DFA state). + if stride2 > 9 { + return Err(DeserializeError::generic( + "dense DFA has invalid stride2 (too big)", + )); + } + // It also cannot be zero, since even a DFA that never matches anything + // has a non-zero number of states with at least two equivalence + // classes: one for all 256 byte values and another for the EOI + // sentinel. + if stride2 < 1 { + return Err(DeserializeError::generic( + "dense DFA has invalid stride2 (too small)", + )); + } + // This is OK since 1 <= stride2 <= 9. + let stride = + 1usize.checked_shl(u32::try_from(stride2).unwrap()).unwrap(); + if classes.alphabet_len() > stride { + return Err(DeserializeError::generic( + "alphabet size cannot be bigger than transition table stride", + )); + } + + let trans_len = + wire::shl(state_len, stride2, "dense table transition length")?; + let table_bytes_len = wire::mul( + trans_len, + StateID::SIZE, + "dense table state byte length", + )?; + wire::check_slice_len(slice, table_bytes_len, "transition table")?; + wire::check_alignment::<StateID>(slice)?; + let table_bytes = &slice[..table_bytes_len]; + slice = &slice[table_bytes_len..]; + // SAFETY: Since StateID is always representable as a u32, all we need + // to do is ensure that we have the proper length and alignment. We've + // checked both above, so the cast below is safe. + // + // N.B. This is the only not-safe code in this function. + let table = core::slice::from_raw_parts( + table_bytes.as_ptr().cast::<u32>(), + trans_len, + ); + let tt = TransitionTable { table, classes, stride2 }; + Ok((tt, slice.as_ptr().as_usize() - slice_start)) + } +} + +#[cfg(feature = "dfa-build")] +impl TransitionTable<Vec<u32>> { + /// Create a minimal transition table with just two states: a dead state + /// and a quit state. The alphabet length and stride of the transition + /// table is determined by the given set of equivalence classes. + fn minimal(classes: ByteClasses) -> TransitionTable<Vec<u32>> { + let mut tt = TransitionTable { + table: vec![], + classes, + stride2: classes.stride2(), + }; + // Two states, regardless of alphabet size, can always fit into u32. + tt.add_empty_state().unwrap(); // dead state + tt.add_empty_state().unwrap(); // quit state + tt + } + + /// Set a transition in this table. Both the `from` and `to` states must + /// already exist, otherwise this panics. `unit` should correspond to the + /// transition out of `from` to set to `to`. + fn set(&mut self, from: StateID, unit: alphabet::Unit, to: StateID) { + assert!(self.is_valid(from), "invalid 'from' state"); + assert!(self.is_valid(to), "invalid 'to' state"); + self.table[from.as_usize() + self.classes.get_by_unit(unit)] = + to.as_u32(); + } + + /// Add an empty state (a state where all transitions lead to a dead state) + /// and return its identifier. The identifier returned is guaranteed to + /// not point to any other existing state. + /// + /// If adding a state would exhaust the state identifier space, then this + /// returns an error. + fn add_empty_state(&mut self) -> Result<StateID, BuildError> { + // Normally, to get a fresh state identifier, we would just + // take the index of the next state added to the transition + // table. However, we actually perform an optimization here + // that premultiplies state IDs by the stride, such that they + // point immediately at the beginning of their transitions in + // the transition table. This avoids an extra multiplication + // instruction for state lookup at search time. + // + // Premultiplied identifiers means that instead of your matching + // loop looking something like this: + // + // state = dfa.start + // for byte in haystack: + // next = dfa.transitions[state * stride + byte] + // if dfa.is_match(next): + // return true + // return false + // + // it can instead look like this: + // + // state = dfa.start + // for byte in haystack: + // next = dfa.transitions[state + byte] + // if dfa.is_match(next): + // return true + // return false + // + // In other words, we save a multiplication instruction in the + // critical path. This turns out to be a decent performance win. + // The cost of using premultiplied state ids is that they can + // require a bigger state id representation. (And they also make + // the code a bit more complex, especially during minimization and + // when reshuffling states, as one needs to convert back and forth + // between state IDs and state indices.) + // + // To do this, we simply take the index of the state into the + // entire transition table, rather than the index of the state + // itself. e.g., If the stride is 64, then the ID of the 3rd state + // is 192, not 2. + let next = self.table.len(); + let id = + StateID::new(next).map_err(|_| BuildError::too_many_states())?; + self.table.extend(iter::repeat(0).take(self.stride())); + Ok(id) + } + + /// Swap the two states given in this transition table. + /// + /// This routine does not do anything to check the correctness of this + /// swap. Callers must ensure that other states pointing to id1 and id2 are + /// updated appropriately. + /// + /// Both id1 and id2 must point to valid states, otherwise this panics. + fn swap(&mut self, id1: StateID, id2: StateID) { + assert!(self.is_valid(id1), "invalid 'id1' state: {:?}", id1); + assert!(self.is_valid(id2), "invalid 'id2' state: {:?}", id2); + // We only need to swap the parts of the state that are used. So if the + // stride is 64, but the alphabet length is only 33, then we save a lot + // of work. + for b in 0..self.classes.alphabet_len() { + self.table.swap(id1.as_usize() + b, id2.as_usize() + b); + } + } + + /// Remap the transitions for the state given according to the function + /// given. This applies the given map function to every transition in the + /// given state and changes the transition in place to the result of the + /// map function for that transition. + fn remap(&mut self, id: StateID, map: impl Fn(StateID) -> StateID) { + for byte in 0..self.alphabet_len() { + let i = id.as_usize() + byte; + let next = self.table()[i]; + self.table_mut()[id.as_usize() + byte] = map(next); + } + } + + /// Truncate the states in this transition table to the given length. + /// + /// This routine does not do anything to check the correctness of this + /// truncation. Callers must ensure that other states pointing to truncated + /// states are updated appropriately. + fn truncate(&mut self, len: usize) { + self.table.truncate(len << self.stride2); + } +} + +impl<T: AsRef<[u32]>> TransitionTable<T> { + /// Writes a serialized form of this transition table to the buffer given. + /// If the buffer is too small, then an error is returned. To determine + /// how big the buffer must be, use `write_to_len`. + fn write_to<E: Endian>( + &self, + mut dst: &mut [u8], + ) -> Result<usize, SerializeError> { + let nwrite = self.write_to_len(); + if dst.len() < nwrite { + return Err(SerializeError::buffer_too_small("transition table")); + } + dst = &mut dst[..nwrite]; + + // write state length + // Unwrap is OK since number of states is guaranteed to fit in a u32. + E::write_u32(u32::try_from(self.len()).unwrap(), dst); + dst = &mut dst[size_of::<u32>()..]; + + // write state stride (as power of 2) + // Unwrap is OK since stride2 is guaranteed to be <= 9. + E::write_u32(u32::try_from(self.stride2).unwrap(), dst); + dst = &mut dst[size_of::<u32>()..]; + + // write byte class map + let n = self.classes.write_to(dst)?; + dst = &mut dst[n..]; + + // write actual transitions + for &sid in self.table() { + let n = wire::write_state_id::<E>(sid, &mut dst); + dst = &mut dst[n..]; + } + Ok(nwrite) + } + + /// Returns the number of bytes the serialized form of this transition + /// table will use. + fn write_to_len(&self) -> usize { + size_of::<u32>() // state length + + size_of::<u32>() // stride2 + + self.classes.write_to_len() + + (self.table().len() * StateID::SIZE) + } + + /// Validates that every state ID in this transition table is valid. + /// + /// That is, every state ID can be used to correctly index a state in this + /// table. + fn validate(&self, sp: &Special) -> Result<(), DeserializeError> { + for state in self.states() { + // We check that the ID itself is well formed. That is, if it's + // a special state then it must actually be a quit, dead, accel, + // match or start state. + if sp.is_special_state(state.id()) { + let is_actually_special = sp.is_dead_state(state.id()) + || sp.is_quit_state(state.id()) + || sp.is_match_state(state.id()) + || sp.is_start_state(state.id()) + || sp.is_accel_state(state.id()); + if !is_actually_special { + // This is kind of a cryptic error message... + return Err(DeserializeError::generic( + "found dense state tagged as special but \ + wasn't actually special", + )); + } + } + for (_, to) in state.transitions() { + if !self.is_valid(to) { + return Err(DeserializeError::generic( + "found invalid state ID in transition table", + )); + } + } + } + Ok(()) + } + + /// Converts this transition table to a borrowed value. + fn as_ref(&self) -> TransitionTable<&'_ [u32]> { + TransitionTable { + table: self.table.as_ref(), + classes: self.classes.clone(), + stride2: self.stride2, + } + } + + /// Converts this transition table to an owned value. + #[cfg(feature = "alloc")] + fn to_owned(&self) -> TransitionTable<alloc::vec::Vec<u32>> { + TransitionTable { + table: self.table.as_ref().to_vec(), + classes: self.classes.clone(), + stride2: self.stride2, + } + } + + /// Return the state for the given ID. If the given ID is not valid, then + /// this panics. + fn state(&self, id: StateID) -> State<'_> { + assert!(self.is_valid(id)); + + let i = id.as_usize(); + State { + id, + stride2: self.stride2, + transitions: &self.table()[i..i + self.alphabet_len()], + } + } + + /// Returns an iterator over all states in this transition table. + /// + /// This iterator yields a tuple for each state. The first element of the + /// tuple corresponds to a state's identifier, and the second element + /// corresponds to the state itself (comprised of its transitions). + fn states(&self) -> StateIter<'_, T> { + StateIter { + tt: self, + it: self.table().chunks(self.stride()).enumerate(), + } + } + + /// Convert a state identifier to an index to a state (in the range + /// 0..self.len()). + /// + /// This is useful when using a `Vec<T>` as an efficient map keyed by state + /// to some other information (such as a remapped state ID). + /// + /// If the given ID is not valid, then this may panic or produce an + /// incorrect index. + fn to_index(&self, id: StateID) -> usize { + id.as_usize() >> self.stride2 + } + + /// Convert an index to a state (in the range 0..self.len()) to an actual + /// state identifier. + /// + /// This is useful when using a `Vec<T>` as an efficient map keyed by state + /// to some other information (such as a remapped state ID). + /// + /// If the given index is not in the specified range, then this may panic + /// or produce an incorrect state ID. + fn to_state_id(&self, index: usize) -> StateID { + // CORRECTNESS: If the given index is not valid, then it is not + // required for this to panic or return a valid state ID. + StateID::new_unchecked(index << self.stride2) + } + + /// Returns the state ID for the state immediately following the one given. + /// + /// This does not check whether the state ID returned is invalid. In fact, + /// if the state ID given is the last state in this DFA, then the state ID + /// returned is guaranteed to be invalid. + #[cfg(feature = "dfa-build")] + fn next_state_id(&self, id: StateID) -> StateID { + self.to_state_id(self.to_index(id).checked_add(1).unwrap()) + } + + /// Returns the state ID for the state immediately preceding the one given. + /// + /// If the dead ID given (which is zero), then this panics. + #[cfg(feature = "dfa-build")] + fn prev_state_id(&self, id: StateID) -> StateID { + self.to_state_id(self.to_index(id).checked_sub(1).unwrap()) + } + + /// Returns the table as a slice of state IDs. + fn table(&self) -> &[StateID] { + wire::u32s_to_state_ids(self.table.as_ref()) + } + + /// Returns the total number of states in this transition table. + /// + /// Note that a DFA always has at least two states: the dead and quit + /// states. In particular, the dead state always has ID 0 and is + /// correspondingly always the first state. The dead state is never a match + /// state. + fn len(&self) -> usize { + self.table().len() >> self.stride2 + } + + /// Returns the total stride for every state in this DFA. This corresponds + /// to the total number of transitions used by each state in this DFA's + /// transition table. + fn stride(&self) -> usize { + 1 << self.stride2 + } + + /// Returns the total number of elements in the alphabet for this + /// transition table. This is always less than or equal to `self.stride()`. + /// It is only equal when the alphabet length is a power of 2. Otherwise, + /// it is always strictly less. + fn alphabet_len(&self) -> usize { + self.classes.alphabet_len() + } + + /// Returns true if and only if the given state ID is valid for this + /// transition table. Validity in this context means that the given ID can + /// be used as a valid offset with `self.stride()` to index this transition + /// table. + fn is_valid(&self, id: StateID) -> bool { + let id = id.as_usize(); + id < self.table().len() && id % self.stride() == 0 + } + + /// Return the memory usage, in bytes, of this transition table. + /// + /// This does not include the size of a `TransitionTable` value itself. + fn memory_usage(&self) -> usize { + self.table().len() * StateID::SIZE + } +} + +#[cfg(feature = "dfa-build")] +impl<T: AsMut<[u32]>> TransitionTable<T> { + /// Returns the table as a slice of state IDs. + fn table_mut(&mut self) -> &mut [StateID] { + wire::u32s_to_state_ids_mut(self.table.as_mut()) + } +} + +/// The set of all possible starting states in a DFA. +/// +/// The set of starting states corresponds to the possible choices one can make +/// in terms of starting a DFA. That is, before following the first transition, +/// you first need to select the state that you start in. +/// +/// Normally, a DFA converted from an NFA that has a single starting state +/// would itself just have one starting state. However, our support for look +/// around generally requires more starting states. The correct starting state +/// is chosen based on certain properties of the position at which we begin +/// our search. +/// +/// Before listing those properties, we first must define two terms: +/// +/// * `haystack` - The bytes to execute the search. The search always starts +/// at the beginning of `haystack` and ends before or at the end of +/// `haystack`. +/// * `context` - The (possibly empty) bytes surrounding `haystack`. `haystack` +/// must be contained within `context` such that `context` is at least as big +/// as `haystack`. +/// +/// This split is crucial for dealing with look-around. For example, consider +/// the context `foobarbaz`, the haystack `bar` and the regex `^bar$`. This +/// regex should _not_ match the haystack since `bar` does not appear at the +/// beginning of the input. Similarly, the regex `\Bbar\B` should match the +/// haystack because `bar` is not surrounded by word boundaries. But a search +/// that does not take context into account would not permit `\B` to match +/// since the beginning of any string matches a word boundary. Similarly, a +/// search that does not take context into account when searching `^bar$` in +/// the haystack `bar` would produce a match when it shouldn't. +/// +/// Thus, it follows that the starting state is chosen based on the following +/// criteria, derived from the position at which the search starts in the +/// `context` (corresponding to the start of `haystack`): +/// +/// 1. If the search starts at the beginning of `context`, then the `Text` +/// start state is used. (Since `^` corresponds to +/// `hir::Anchor::Start`.) +/// 2. If the search starts at a position immediately following a line +/// terminator, then the `Line` start state is used. (Since `(?m:^)` +/// corresponds to `hir::Anchor::StartLF`.) +/// 3. If the search starts at a position immediately following a byte +/// classified as a "word" character (`[_0-9a-zA-Z]`), then the `WordByte` +/// start state is used. (Since `(?-u:\b)` corresponds to a word boundary.) +/// 4. Otherwise, if the search starts at a position immediately following +/// a byte that is not classified as a "word" character (`[^_0-9a-zA-Z]`), +/// then the `NonWordByte` start state is used. (Since `(?-u:\B)` +/// corresponds to a not-word-boundary.) +/// +/// (N.B. Unicode word boundaries are not supported by the DFA because they +/// require multi-byte look-around and this is difficult to support in a DFA.) +/// +/// To further complicate things, we also support constructing individual +/// anchored start states for each pattern in the DFA. (Which is required to +/// implement overlapping regexes correctly, but is also generally useful.) +/// Thus, when individual start states for each pattern are enabled, then the +/// total number of start states represented is `4 + (4 * #patterns)`, where +/// the 4 comes from each of the 4 possibilities above. The first 4 represents +/// the starting states for the entire DFA, which support searching for +/// multiple patterns simultaneously (possibly unanchored). +/// +/// If individual start states are disabled, then this will only store 4 +/// start states. Typically, individual start states are only enabled when +/// constructing the reverse DFA for regex matching. But they are also useful +/// for building DFAs that can search for a specific pattern or even to support +/// both anchored and unanchored searches with the same DFA. +/// +/// Note though that while the start table always has either `4` or +/// `4 + (4 * #patterns)` starting state *ids*, the total number of states +/// might be considerably smaller. That is, many of the IDs may be duplicative. +/// (For example, if a regex doesn't have a `\b` sub-pattern, then there's no +/// reason to generate a unique starting state for handling word boundaries. +/// Similarly for start/end anchors.) +#[derive(Clone)] +pub(crate) struct StartTable<T> { + /// The initial start state IDs. + /// + /// In practice, T is either `Vec<u32>` or `&[u32]`. + /// + /// The first `2 * stride` (currently always 8) entries always correspond + /// to the starts states for the entire DFA, with the first 4 entries being + /// for unanchored searches and the second 4 entries being for anchored + /// searches. To keep things simple, we always use 8 entries even if the + /// `StartKind` is not both. + /// + /// After that, there are `stride * patterns` state IDs, where `patterns` + /// may be zero in the case of a DFA with no patterns or in the case where + /// the DFA was built without enabling starting states for each pattern. + table: T, + /// The starting state configuration supported. When 'both', both + /// unanchored and anchored searches work. When 'unanchored', anchored + /// searches panic. When 'anchored', unanchored searches panic. + kind: StartKind, + /// The start state configuration for every possible byte. + start_map: StartByteMap, + /// The number of starting state IDs per pattern. + stride: usize, + /// The total number of patterns for which starting states are encoded. + /// This is `None` for DFAs that were built without start states for each + /// pattern. Thus, one cannot use this field to say how many patterns + /// are in the DFA in all cases. It is specific to how many patterns are + /// represented in this start table. + pattern_len: Option<usize>, + /// The universal starting state for unanchored searches. This is only + /// present when the DFA supports unanchored searches and when all starting + /// state IDs for an unanchored search are equivalent. + universal_start_unanchored: Option<StateID>, + /// The universal starting state for anchored searches. This is only + /// present when the DFA supports anchored searches and when all starting + /// state IDs for an anchored search are equivalent. + universal_start_anchored: Option<StateID>, +} + +#[cfg(feature = "dfa-build")] +impl StartTable<Vec<u32>> { + /// Create a valid set of start states all pointing to the dead state. + /// + /// When the corresponding DFA is constructed with start states for each + /// pattern, then `patterns` should be the number of patterns. Otherwise, + /// it should be zero. + /// + /// If the total table size could exceed the allocatable limit, then this + /// returns an error. In practice, this is unlikely to be able to occur, + /// since it's likely that allocation would have failed long before it got + /// to this point. + fn dead( + kind: StartKind, + lookm: &LookMatcher, + pattern_len: Option<usize>, + ) -> Result<StartTable<Vec<u32>>, BuildError> { + if let Some(len) = pattern_len { + assert!(len <= PatternID::LIMIT); + } + let stride = Start::len(); + // OK because 2*4 is never going to overflow anything. + let starts_len = stride.checked_mul(2).unwrap(); + let pattern_starts_len = + match stride.checked_mul(pattern_len.unwrap_or(0)) { + Some(x) => x, + None => return Err(BuildError::too_many_start_states()), + }; + let table_len = match starts_len.checked_add(pattern_starts_len) { + Some(x) => x, + None => return Err(BuildError::too_many_start_states()), + }; + if let Err(_) = isize::try_from(table_len) { + return Err(BuildError::too_many_start_states()); + } + let table = vec![DEAD.as_u32(); table_len]; + let start_map = StartByteMap::new(lookm); + Ok(StartTable { + table, + kind, + start_map, + stride, + pattern_len, + universal_start_unanchored: None, + universal_start_anchored: None, + }) + } +} + +impl<'a> StartTable<&'a [u32]> { + /// Deserialize a table of start state IDs starting at the beginning of + /// `slice`. Upon success, return the total number of bytes read along with + /// the table of starting state IDs. + /// + /// If there was a problem deserializing any part of the starting IDs, + /// then this returns an error. Notably, if the given slice does not have + /// the same alignment as `StateID`, then this will return an error (among + /// other possible errors). + /// + /// This is guaranteed to execute in constant time. + /// + /// # Safety + /// + /// This routine is not safe because it does not check the validity of the + /// starting state IDs themselves. In particular, the number of starting + /// IDs can be of variable length, so it's possible that checking their + /// validity cannot be done in constant time. An invalid starting state + /// ID is not safe because other code may rely on the starting IDs being + /// correct (such as explicit bounds check elision). Therefore, an invalid + /// start ID can lead to undefined behavior. + /// + /// Callers that use this function must either pass on the safety invariant + /// or guarantee that the bytes given contain valid starting state IDs. + /// This guarantee is upheld by the bytes written by `write_to`. + unsafe fn from_bytes_unchecked( + mut slice: &'a [u8], + ) -> Result<(StartTable<&'a [u32]>, usize), DeserializeError> { + let slice_start = slice.as_ptr().as_usize(); + + let (kind, nr) = StartKind::from_bytes(slice)?; + slice = &slice[nr..]; + + let (start_map, nr) = StartByteMap::from_bytes(slice)?; + slice = &slice[nr..]; + + let (stride, nr) = + wire::try_read_u32_as_usize(slice, "start table stride")?; + slice = &slice[nr..]; + if stride != Start::len() { + return Err(DeserializeError::generic( + "invalid starting table stride", + )); + } + + let (maybe_pattern_len, nr) = + wire::try_read_u32_as_usize(slice, "start table patterns")?; + slice = &slice[nr..]; + let pattern_len = if maybe_pattern_len.as_u32() == u32::MAX { + None + } else { + Some(maybe_pattern_len) + }; + if pattern_len.map_or(false, |len| len > PatternID::LIMIT) { + return Err(DeserializeError::generic( + "invalid number of patterns", + )); + } + + let (universal_unanchored, nr) = + wire::try_read_u32(slice, "universal unanchored start")?; + slice = &slice[nr..]; + let universal_start_unanchored = if universal_unanchored == u32::MAX { + None + } else { + Some(StateID::try_from(universal_unanchored).map_err(|e| { + DeserializeError::state_id_error( + e, + "universal unanchored start", + ) + })?) + }; + + let (universal_anchored, nr) = + wire::try_read_u32(slice, "universal anchored start")?; + slice = &slice[nr..]; + let universal_start_anchored = if universal_anchored == u32::MAX { + None + } else { + Some(StateID::try_from(universal_anchored).map_err(|e| { + DeserializeError::state_id_error(e, "universal anchored start") + })?) + }; + + let pattern_table_size = wire::mul( + stride, + pattern_len.unwrap_or(0), + "invalid pattern length", + )?; + // Our start states always start with a two stride of start states for + // the entire automaton. The first stride is for unanchored starting + // states and the second stride is for anchored starting states. What + // follows it are an optional set of start states for each pattern. + let start_state_len = wire::add( + wire::mul(2, stride, "start state stride too big")?, + pattern_table_size, + "invalid 'any' pattern starts size", + )?; + let table_bytes_len = wire::mul( + start_state_len, + StateID::SIZE, + "pattern table bytes length", + )?; + wire::check_slice_len(slice, table_bytes_len, "start ID table")?; + wire::check_alignment::<StateID>(slice)?; + let table_bytes = &slice[..table_bytes_len]; + slice = &slice[table_bytes_len..]; + // SAFETY: Since StateID is always representable as a u32, all we need + // to do is ensure that we have the proper length and alignment. We've + // checked both above, so the cast below is safe. + // + // N.B. This is the only not-safe code in this function. + let table = core::slice::from_raw_parts( + table_bytes.as_ptr().cast::<u32>(), + start_state_len, + ); + let st = StartTable { + table, + kind, + start_map, + stride, + pattern_len, + universal_start_unanchored, + universal_start_anchored, + }; + Ok((st, slice.as_ptr().as_usize() - slice_start)) + } +} + +impl<T: AsRef<[u32]>> StartTable<T> { + /// Writes a serialized form of this start table to the buffer given. If + /// the buffer is too small, then an error is returned. To determine how + /// big the buffer must be, use `write_to_len`. + fn write_to<E: Endian>( + &self, + mut dst: &mut [u8], + ) -> Result<usize, SerializeError> { + let nwrite = self.write_to_len(); + if dst.len() < nwrite { + return Err(SerializeError::buffer_too_small( + "starting table ids", + )); + } + dst = &mut dst[..nwrite]; + + // write start kind + let nw = self.kind.write_to::<E>(dst)?; + dst = &mut dst[nw..]; + // write start byte map + let nw = self.start_map.write_to(dst)?; + dst = &mut dst[nw..]; + // write stride + // Unwrap is OK since the stride is always 4 (currently). + E::write_u32(u32::try_from(self.stride).unwrap(), dst); + dst = &mut dst[size_of::<u32>()..]; + // write pattern length + // Unwrap is OK since number of patterns is guaranteed to fit in a u32. + E::write_u32( + u32::try_from(self.pattern_len.unwrap_or(0xFFFF_FFFF)).unwrap(), + dst, + ); + dst = &mut dst[size_of::<u32>()..]; + // write universal start unanchored state id, u32::MAX if absent + E::write_u32( + self.universal_start_unanchored + .map_or(u32::MAX, |sid| sid.as_u32()), + dst, + ); + dst = &mut dst[size_of::<u32>()..]; + // write universal start anchored state id, u32::MAX if absent + E::write_u32( + self.universal_start_anchored.map_or(u32::MAX, |sid| sid.as_u32()), + dst, + ); + dst = &mut dst[size_of::<u32>()..]; + // write start IDs + for &sid in self.table() { + let n = wire::write_state_id::<E>(sid, &mut dst); + dst = &mut dst[n..]; + } + Ok(nwrite) + } + + /// Returns the number of bytes the serialized form of this start ID table + /// will use. + fn write_to_len(&self) -> usize { + self.kind.write_to_len() + + self.start_map.write_to_len() + + size_of::<u32>() // stride + + size_of::<u32>() // # patterns + + size_of::<u32>() // universal unanchored start + + size_of::<u32>() // universal anchored start + + (self.table().len() * StateID::SIZE) + } + + /// Validates that every state ID in this start table is valid by checking + /// it against the given transition table (which must be for the same DFA). + /// + /// That is, every state ID can be used to correctly index a state. + fn validate( + &self, + tt: &TransitionTable<T>, + ) -> Result<(), DeserializeError> { + if !self.universal_start_unanchored.map_or(true, |s| tt.is_valid(s)) { + return Err(DeserializeError::generic( + "found invalid universal unanchored starting state ID", + )); + } + if !self.universal_start_anchored.map_or(true, |s| tt.is_valid(s)) { + return Err(DeserializeError::generic( + "found invalid universal anchored starting state ID", + )); + } + for &id in self.table() { + if !tt.is_valid(id) { + return Err(DeserializeError::generic( + "found invalid starting state ID", + )); + } + } + Ok(()) + } + + /// Converts this start list to a borrowed value. + fn as_ref(&self) -> StartTable<&'_ [u32]> { + StartTable { + table: self.table.as_ref(), + kind: self.kind, + start_map: self.start_map.clone(), + stride: self.stride, + pattern_len: self.pattern_len, + universal_start_unanchored: self.universal_start_unanchored, + universal_start_anchored: self.universal_start_anchored, + } + } + + /// Converts this start list to an owned value. + #[cfg(feature = "alloc")] + fn to_owned(&self) -> StartTable<alloc::vec::Vec<u32>> { + StartTable { + table: self.table.as_ref().to_vec(), + kind: self.kind, + start_map: self.start_map.clone(), + stride: self.stride, + pattern_len: self.pattern_len, + universal_start_unanchored: self.universal_start_unanchored, + universal_start_anchored: self.universal_start_anchored, + } + } + + /// Return the start state for the given input and starting configuration. + /// This returns an error if the input configuration is not supported by + /// this DFA. For example, requesting an unanchored search when the DFA was + /// not built with unanchored starting states. Or asking for an anchored + /// pattern search with an invalid pattern ID or on a DFA that was not + /// built with start states for each pattern. + #[cfg_attr(feature = "perf-inline", inline(always))] + fn start( + &self, + input: &Input<'_>, + start: Start, + ) -> Result<StateID, MatchError> { + let start_index = start.as_usize(); + let mode = input.get_anchored(); + let index = match mode { + Anchored::No => { + if !self.kind.has_unanchored() { + return Err(MatchError::unsupported_anchored(mode)); + } + start_index + } + Anchored::Yes => { + if !self.kind.has_anchored() { + return Err(MatchError::unsupported_anchored(mode)); + } + self.stride + start_index + } + Anchored::Pattern(pid) => { + let len = match self.pattern_len { + None => { + return Err(MatchError::unsupported_anchored(mode)) + } + Some(len) => len, + }; + if pid.as_usize() >= len { + return Ok(DEAD); + } + (2 * self.stride) + + (self.stride * pid.as_usize()) + + start_index + } + }; + Ok(self.table()[index]) + } + + /// Returns an iterator over all start state IDs in this table. + /// + /// Each item is a triple of: start state ID, the start state type and the + /// pattern ID (if any). + fn iter(&self) -> StartStateIter<'_> { + StartStateIter { st: self.as_ref(), i: 0 } + } + + /// Returns the table as a slice of state IDs. + fn table(&self) -> &[StateID] { + wire::u32s_to_state_ids(self.table.as_ref()) + } + + /// Return the memory usage, in bytes, of this start list. + /// + /// This does not include the size of a `StartList` value itself. + fn memory_usage(&self) -> usize { + self.table().len() * StateID::SIZE + } +} + +#[cfg(feature = "dfa-build")] +impl<T: AsMut<[u32]>> StartTable<T> { + /// Set the start state for the given index and pattern. + /// + /// If the pattern ID or state ID are not valid, then this will panic. + fn set_start(&mut self, anchored: Anchored, start: Start, id: StateID) { + let start_index = start.as_usize(); + let index = match anchored { + Anchored::No => start_index, + Anchored::Yes => self.stride + start_index, + Anchored::Pattern(pid) => { + let pid = pid.as_usize(); + let len = self + .pattern_len + .expect("start states for each pattern enabled"); + assert!(pid < len, "invalid pattern ID {:?}", pid); + self.stride + .checked_mul(pid) + .unwrap() + .checked_add(self.stride.checked_mul(2).unwrap()) + .unwrap() + .checked_add(start_index) + .unwrap() + } + }; + self.table_mut()[index] = id; + } + + /// Returns the table as a mutable slice of state IDs. + fn table_mut(&mut self) -> &mut [StateID] { + wire::u32s_to_state_ids_mut(self.table.as_mut()) + } +} + +/// An iterator over start state IDs. +/// +/// This iterator yields a triple of start state ID, the anchored mode and the +/// start state type. If a pattern ID is relevant, then the anchored mode will +/// contain it. Start states with an anchored mode containing a pattern ID will +/// only occur when the DFA was compiled with start states for each pattern +/// (which is disabled by default). +pub(crate) struct StartStateIter<'a> { + st: StartTable<&'a [u32]>, + i: usize, +} + +impl<'a> Iterator for StartStateIter<'a> { + type Item = (StateID, Anchored, Start); + + fn next(&mut self) -> Option<(StateID, Anchored, Start)> { + let i = self.i; + let table = self.st.table(); + if i >= table.len() { + return None; + } + self.i += 1; + + // This unwrap is okay since the stride of the starting state table + // must always match the number of start state types. + let start_type = Start::from_usize(i % self.st.stride).unwrap(); + let anchored = if i < self.st.stride { + Anchored::No + } else if i < (2 * self.st.stride) { + Anchored::Yes + } else { + let pid = (i - (2 * self.st.stride)) / self.st.stride; + Anchored::Pattern(PatternID::new(pid).unwrap()) + }; + Some((table[i], anchored, start_type)) + } +} + +/// This type represents that patterns that should be reported whenever a DFA +/// enters a match state. This structure exists to support DFAs that search for +/// matches for multiple regexes. +/// +/// This structure relies on the fact that all match states in a DFA occur +/// contiguously in the DFA's transition table. (See dfa/special.rs for a more +/// detailed breakdown of the representation.) Namely, when a match occurs, we +/// know its state ID. Since we know the start and end of the contiguous region +/// of match states, we can use that to compute the position at which the match +/// state occurs. That in turn is used as an offset into this structure. +#[derive(Clone, Debug)] +struct MatchStates<T> { + /// Slices is a flattened sequence of pairs, where each pair points to a + /// sub-slice of pattern_ids. The first element of the pair is an offset + /// into pattern_ids and the second element of the pair is the number + /// of 32-bit pattern IDs starting at that position. That is, each pair + /// corresponds to a single DFA match state and its corresponding match + /// IDs. The number of pairs always corresponds to the number of distinct + /// DFA match states. + /// + /// In practice, T is either Vec<u32> or &[u32]. + slices: T, + /// A flattened sequence of pattern IDs for each DFA match state. The only + /// way to correctly read this sequence is indirectly via `slices`. + /// + /// In practice, T is either Vec<u32> or &[u32]. + pattern_ids: T, + /// The total number of unique patterns represented by these match states. + pattern_len: usize, +} + +impl<'a> MatchStates<&'a [u32]> { + unsafe fn from_bytes_unchecked( + mut slice: &'a [u8], + ) -> Result<(MatchStates<&'a [u32]>, usize), DeserializeError> { + let slice_start = slice.as_ptr().as_usize(); + + // Read the total number of match states. + let (state_len, nr) = + wire::try_read_u32_as_usize(slice, "match state length")?; + slice = &slice[nr..]; + + // Read the slice start/length pairs. + let pair_len = wire::mul(2, state_len, "match state offset pairs")?; + let slices_bytes_len = wire::mul( + pair_len, + PatternID::SIZE, + "match state slice offset byte length", + )?; + wire::check_slice_len(slice, slices_bytes_len, "match state slices")?; + wire::check_alignment::<PatternID>(slice)?; + let slices_bytes = &slice[..slices_bytes_len]; + slice = &slice[slices_bytes_len..]; + // SAFETY: Since PatternID is always representable as a u32, all we + // need to do is ensure that we have the proper length and alignment. + // We've checked both above, so the cast below is safe. + // + // N.B. This is one of the few not-safe snippets in this function, + // so we mark it explicitly to call it out. + let slices = core::slice::from_raw_parts( + slices_bytes.as_ptr().cast::<u32>(), + pair_len, + ); + + // Read the total number of unique pattern IDs (which is always 1 more + // than the maximum pattern ID in this automaton, since pattern IDs are + // handed out contiguously starting at 0). + let (pattern_len, nr) = + wire::try_read_u32_as_usize(slice, "pattern length")?; + slice = &slice[nr..]; + + // Now read the pattern ID length. We don't need to store this + // explicitly, but we need it to know how many pattern IDs to read. + let (idlen, nr) = + wire::try_read_u32_as_usize(slice, "pattern ID length")?; + slice = &slice[nr..]; + + // Read the actual pattern IDs. + let pattern_ids_len = + wire::mul(idlen, PatternID::SIZE, "pattern ID byte length")?; + wire::check_slice_len(slice, pattern_ids_len, "match pattern IDs")?; + wire::check_alignment::<PatternID>(slice)?; + let pattern_ids_bytes = &slice[..pattern_ids_len]; + slice = &slice[pattern_ids_len..]; + // SAFETY: Since PatternID is always representable as a u32, all we + // need to do is ensure that we have the proper length and alignment. + // We've checked both above, so the cast below is safe. + // + // N.B. This is one of the few not-safe snippets in this function, + // so we mark it explicitly to call it out. + let pattern_ids = core::slice::from_raw_parts( + pattern_ids_bytes.as_ptr().cast::<u32>(), + idlen, + ); + + let ms = MatchStates { slices, pattern_ids, pattern_len }; + Ok((ms, slice.as_ptr().as_usize() - slice_start)) + } +} + +#[cfg(feature = "dfa-build")] +impl MatchStates<Vec<u32>> { + fn empty(pattern_len: usize) -> MatchStates<Vec<u32>> { + assert!(pattern_len <= PatternID::LIMIT); + MatchStates { slices: vec![], pattern_ids: vec![], pattern_len } + } + + fn new( + matches: &BTreeMap<StateID, Vec<PatternID>>, + pattern_len: usize, + ) -> Result<MatchStates<Vec<u32>>, BuildError> { + let mut m = MatchStates::empty(pattern_len); + for (_, pids) in matches.iter() { + let start = PatternID::new(m.pattern_ids.len()) + .map_err(|_| BuildError::too_many_match_pattern_ids())?; + m.slices.push(start.as_u32()); + // This is always correct since the number of patterns in a single + // match state can never exceed maximum number of allowable + // patterns. Why? Because a pattern can only appear once in a + // particular match state, by construction. (And since our pattern + // ID limit is one less than u32::MAX, we're guaranteed that the + // length fits in a u32.) + m.slices.push(u32::try_from(pids.len()).unwrap()); + for &pid in pids { + m.pattern_ids.push(pid.as_u32()); + } + } + m.pattern_len = pattern_len; + Ok(m) + } + + fn new_with_map( + &self, + matches: &BTreeMap<StateID, Vec<PatternID>>, + ) -> Result<MatchStates<Vec<u32>>, BuildError> { + MatchStates::new(matches, self.pattern_len) + } +} + +impl<T: AsRef<[u32]>> MatchStates<T> { + /// Writes a serialized form of these match states to the buffer given. If + /// the buffer is too small, then an error is returned. To determine how + /// big the buffer must be, use `write_to_len`. + fn write_to<E: Endian>( + &self, + mut dst: &mut [u8], + ) -> Result<usize, SerializeError> { + let nwrite = self.write_to_len(); + if dst.len() < nwrite { + return Err(SerializeError::buffer_too_small("match states")); + } + dst = &mut dst[..nwrite]; + + // write state ID length + // Unwrap is OK since number of states is guaranteed to fit in a u32. + E::write_u32(u32::try_from(self.len()).unwrap(), dst); + dst = &mut dst[size_of::<u32>()..]; + + // write slice offset pairs + for &pid in self.slices() { + let n = wire::write_pattern_id::<E>(pid, &mut dst); + dst = &mut dst[n..]; + } + + // write unique pattern ID length + // Unwrap is OK since number of patterns is guaranteed to fit in a u32. + E::write_u32(u32::try_from(self.pattern_len).unwrap(), dst); + dst = &mut dst[size_of::<u32>()..]; + + // write pattern ID length + // Unwrap is OK since we check at construction (and deserialization) + // that the number of patterns is representable as a u32. + E::write_u32(u32::try_from(self.pattern_ids().len()).unwrap(), dst); + dst = &mut dst[size_of::<u32>()..]; + + // write pattern IDs + for &pid in self.pattern_ids() { + let n = wire::write_pattern_id::<E>(pid, &mut dst); + dst = &mut dst[n..]; + } + + Ok(nwrite) + } + + /// Returns the number of bytes the serialized form of these match states + /// will use. + fn write_to_len(&self) -> usize { + size_of::<u32>() // match state length + + (self.slices().len() * PatternID::SIZE) + + size_of::<u32>() // unique pattern ID length + + size_of::<u32>() // pattern ID length + + (self.pattern_ids().len() * PatternID::SIZE) + } + + /// Valides that the match state info is itself internally consistent and + /// consistent with the recorded match state region in the given DFA. + fn validate(&self, dfa: &DFA<T>) -> Result<(), DeserializeError> { + if self.len() != dfa.special.match_len(dfa.stride()) { + return Err(DeserializeError::generic( + "match state length mismatch", + )); + } + for si in 0..self.len() { + let start = self.slices()[si * 2].as_usize(); + let len = self.slices()[si * 2 + 1].as_usize(); + if start >= self.pattern_ids().len() { + return Err(DeserializeError::generic( + "invalid pattern ID start offset", + )); + } + if start + len > self.pattern_ids().len() { + return Err(DeserializeError::generic( + "invalid pattern ID length", + )); + } + for mi in 0..len { + let pid = self.pattern_id(si, mi); + if pid.as_usize() >= self.pattern_len { + return Err(DeserializeError::generic( + "invalid pattern ID", + )); + } + } + } + Ok(()) + } + + /// Converts these match states back into their map form. This is useful + /// when shuffling states, as the normal MatchStates representation is not + /// amenable to easy state swapping. But with this map, to swap id1 and + /// id2, all you need to do is: + /// + /// if let Some(pids) = map.remove(&id1) { + /// map.insert(id2, pids); + /// } + /// + /// Once shuffling is done, use MatchStates::new to convert back. + #[cfg(feature = "dfa-build")] + fn to_map(&self, dfa: &DFA<T>) -> BTreeMap<StateID, Vec<PatternID>> { + let mut map = BTreeMap::new(); + for i in 0..self.len() { + let mut pids = vec![]; + for j in 0..self.pattern_len(i) { + pids.push(self.pattern_id(i, j)); + } + map.insert(self.match_state_id(dfa, i), pids); + } + map + } + + /// Converts these match states to a borrowed value. + fn as_ref(&self) -> MatchStates<&'_ [u32]> { + MatchStates { + slices: self.slices.as_ref(), + pattern_ids: self.pattern_ids.as_ref(), + pattern_len: self.pattern_len, + } + } + + /// Converts these match states to an owned value. + #[cfg(feature = "alloc")] + fn to_owned(&self) -> MatchStates<alloc::vec::Vec<u32>> { + MatchStates { + slices: self.slices.as_ref().to_vec(), + pattern_ids: self.pattern_ids.as_ref().to_vec(), + pattern_len: self.pattern_len, + } + } + + /// Returns the match state ID given the match state index. (Where the + /// first match state corresponds to index 0.) + /// + /// This panics if there is no match state at the given index. + fn match_state_id(&self, dfa: &DFA<T>, index: usize) -> StateID { + assert!(dfa.special.matches(), "no match states to index"); + // This is one of the places where we rely on the fact that match + // states are contiguous in the transition table. Namely, that the + // first match state ID always corresponds to dfa.special.min_start. + // From there, since we know the stride, we can compute the ID of any + // match state given its index. + let stride2 = u32::try_from(dfa.stride2()).unwrap(); + let offset = index.checked_shl(stride2).unwrap(); + let id = dfa.special.min_match.as_usize().checked_add(offset).unwrap(); + let sid = StateID::new(id).unwrap(); + assert!(dfa.is_match_state(sid)); + sid + } + + /// Returns the pattern ID at the given match index for the given match + /// state. + /// + /// The match state index is the state index minus the state index of the + /// first match state in the DFA. + /// + /// The match index is the index of the pattern ID for the given state. + /// The index must be less than `self.pattern_len(state_index)`. + #[cfg_attr(feature = "perf-inline", inline(always))] + fn pattern_id(&self, state_index: usize, match_index: usize) -> PatternID { + self.pattern_id_slice(state_index)[match_index] + } + + /// Returns the number of patterns in the given match state. + /// + /// The match state index is the state index minus the state index of the + /// first match state in the DFA. + #[cfg_attr(feature = "perf-inline", inline(always))] + fn pattern_len(&self, state_index: usize) -> usize { + self.slices()[state_index * 2 + 1].as_usize() + } + + /// Returns all of the pattern IDs for the given match state index. + /// + /// The match state index is the state index minus the state index of the + /// first match state in the DFA. + #[cfg_attr(feature = "perf-inline", inline(always))] + fn pattern_id_slice(&self, state_index: usize) -> &[PatternID] { + let start = self.slices()[state_index * 2].as_usize(); + let len = self.pattern_len(state_index); + &self.pattern_ids()[start..start + len] + } + + /// Returns the pattern ID offset slice of u32 as a slice of PatternID. + #[cfg_attr(feature = "perf-inline", inline(always))] + fn slices(&self) -> &[PatternID] { + wire::u32s_to_pattern_ids(self.slices.as_ref()) + } + + /// Returns the total number of match states. + #[cfg_attr(feature = "perf-inline", inline(always))] + fn len(&self) -> usize { + assert_eq!(0, self.slices().len() % 2); + self.slices().len() / 2 + } + + /// Returns the pattern ID slice of u32 as a slice of PatternID. + #[cfg_attr(feature = "perf-inline", inline(always))] + fn pattern_ids(&self) -> &[PatternID] { + wire::u32s_to_pattern_ids(self.pattern_ids.as_ref()) + } + + /// Return the memory usage, in bytes, of these match pairs. + fn memory_usage(&self) -> usize { + (self.slices().len() + self.pattern_ids().len()) * PatternID::SIZE + } +} + +/// A common set of flags for both dense and sparse DFAs. This primarily +/// centralizes the serialization format of these flags at a bitset. +#[derive(Clone, Copy, Debug)] +pub(crate) struct Flags { + /// Whether the DFA can match the empty string. When this is false, all + /// matches returned by this DFA are guaranteed to have non-zero length. + pub(crate) has_empty: bool, + /// Whether the DFA should only produce matches with spans that correspond + /// to valid UTF-8. This also includes omitting any zero-width matches that + /// split the UTF-8 encoding of a codepoint. + pub(crate) is_utf8: bool, + /// Whether the DFA is always anchored or not, regardless of `Input` + /// configuration. This is useful for avoiding a reverse scan even when + /// executing unanchored searches. + pub(crate) is_always_start_anchored: bool, +} + +impl Flags { + /// Creates a set of flags for a DFA from an NFA. + /// + /// N.B. This constructor was defined at the time of writing because all + /// of the flags are derived directly from the NFA. If this changes in the + /// future, we might be more thoughtful about how the `Flags` value is + /// itself built. + #[cfg(feature = "dfa-build")] + fn from_nfa(nfa: &thompson::NFA) -> Flags { + Flags { + has_empty: nfa.has_empty(), + is_utf8: nfa.is_utf8(), + is_always_start_anchored: nfa.is_always_start_anchored(), + } + } + + /// Deserializes the flags from the given slice. On success, this also + /// returns the number of bytes read from the slice. + pub(crate) fn from_bytes( + slice: &[u8], + ) -> Result<(Flags, usize), DeserializeError> { + let (bits, nread) = wire::try_read_u32(slice, "flag bitset")?; + let flags = Flags { + has_empty: bits & (1 << 0) != 0, + is_utf8: bits & (1 << 1) != 0, + is_always_start_anchored: bits & (1 << 2) != 0, + }; + Ok((flags, nread)) + } + + /// Writes these flags to the given byte slice. If the buffer is too small, + /// then an error is returned. To determine how big the buffer must be, + /// use `write_to_len`. + pub(crate) fn write_to<E: Endian>( + &self, + dst: &mut [u8], + ) -> Result<usize, SerializeError> { + fn bool_to_int(b: bool) -> u32 { + if b { + 1 + } else { + 0 + } + } + + let nwrite = self.write_to_len(); + if dst.len() < nwrite { + return Err(SerializeError::buffer_too_small("flag bitset")); + } + let bits = (bool_to_int(self.has_empty) << 0) + | (bool_to_int(self.is_utf8) << 1) + | (bool_to_int(self.is_always_start_anchored) << 2); + E::write_u32(bits, dst); + Ok(nwrite) + } + + /// Returns the number of bytes the serialized form of these flags + /// will use. + pub(crate) fn write_to_len(&self) -> usize { + size_of::<u32>() + } +} + +/// An iterator over all states in a DFA. +/// +/// This iterator yields a tuple for each state. The first element of the +/// tuple corresponds to a state's identifier, and the second element +/// corresponds to the state itself (comprised of its transitions). +/// +/// `'a` corresponding to the lifetime of original DFA, `T` corresponds to +/// the type of the transition table itself. +pub(crate) struct StateIter<'a, T> { + tt: &'a TransitionTable<T>, + it: iter::Enumerate<slice::Chunks<'a, StateID>>, +} + +impl<'a, T: AsRef<[u32]>> Iterator for StateIter<'a, T> { + type Item = State<'a>; + + fn next(&mut self) -> Option<State<'a>> { + self.it.next().map(|(index, _)| { + let id = self.tt.to_state_id(index); + self.tt.state(id) + }) + } +} + +/// An immutable representation of a single DFA state. +/// +/// `'a` correspondings to the lifetime of a DFA's transition table. +pub(crate) struct State<'a> { + id: StateID, + stride2: usize, + transitions: &'a [StateID], +} + +impl<'a> State<'a> { + /// Return an iterator over all transitions in this state. This yields + /// a number of transitions equivalent to the alphabet length of the + /// corresponding DFA. + /// + /// Each transition is represented by a tuple. The first element is + /// the input byte for that transition and the second element is the + /// transitions itself. + pub(crate) fn transitions(&self) -> StateTransitionIter<'_> { + StateTransitionIter { + len: self.transitions.len(), + it: self.transitions.iter().enumerate(), + } + } + + /// Return an iterator over a sparse representation of the transitions in + /// this state. Only non-dead transitions are returned. + /// + /// The "sparse" representation in this case corresponds to a sequence of + /// triples. The first two elements of the triple comprise an inclusive + /// byte range while the last element corresponds to the transition taken + /// for all bytes in the range. + /// + /// This is somewhat more condensed than the classical sparse + /// representation (where you have an element for every non-dead + /// transition), but in practice, checking if a byte is in a range is very + /// cheap and using ranges tends to conserve quite a bit more space. + pub(crate) fn sparse_transitions(&self) -> StateSparseTransitionIter<'_> { + StateSparseTransitionIter { dense: self.transitions(), cur: None } + } + + /// Returns the identifier for this state. + pub(crate) fn id(&self) -> StateID { + self.id + } + + /// Analyzes this state to determine whether it can be accelerated. If so, + /// it returns an accelerator that contains at least one byte. + #[cfg(feature = "dfa-build")] + fn accelerate(&self, classes: &ByteClasses) -> Option<Accel> { + // We just try to add bytes to our accelerator. Once adding fails + // (because we've added too many bytes), then give up. + let mut accel = Accel::new(); + for (class, id) in self.transitions() { + if id == self.id() { + continue; + } + for unit in classes.elements(class) { + if let Some(byte) = unit.as_u8() { + if !accel.add(byte) { + return None; + } + } + } + } + if accel.is_empty() { + None + } else { + Some(accel) + } + } +} + +impl<'a> fmt::Debug for State<'a> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + for (i, (start, end, sid)) in self.sparse_transitions().enumerate() { + let id = if f.alternate() { + sid.as_usize() + } else { + sid.as_usize() >> self.stride2 + }; + if i > 0 { + write!(f, ", ")?; + } + if start == end { + write!(f, "{:?} => {:?}", start, id)?; + } else { + write!(f, "{:?}-{:?} => {:?}", start, end, id)?; + } + } + Ok(()) + } +} + +/// An iterator over all transitions in a single DFA state. This yields +/// a number of transitions equivalent to the alphabet length of the +/// corresponding DFA. +/// +/// Each transition is represented by a tuple. The first element is the input +/// byte for that transition and the second element is the transition itself. +#[derive(Debug)] +pub(crate) struct StateTransitionIter<'a> { + len: usize, + it: iter::Enumerate<slice::Iter<'a, StateID>>, +} + +impl<'a> Iterator for StateTransitionIter<'a> { + type Item = (alphabet::Unit, StateID); + + fn next(&mut self) -> Option<(alphabet::Unit, StateID)> { + self.it.next().map(|(i, &id)| { + let unit = if i + 1 == self.len { + alphabet::Unit::eoi(i) + } else { + let b = u8::try_from(i) + .expect("raw byte alphabet is never exceeded"); + alphabet::Unit::u8(b) + }; + (unit, id) + }) + } +} + +/// An iterator over all non-DEAD transitions in a single DFA state using a +/// sparse representation. +/// +/// Each transition is represented by a triple. The first two elements of the +/// triple comprise an inclusive byte range while the last element corresponds +/// to the transition taken for all bytes in the range. +/// +/// As a convenience, this always returns `alphabet::Unit` values of the same +/// type. That is, you'll never get a (byte, EOI) or a (EOI, byte). Only (byte, +/// byte) and (EOI, EOI) values are yielded. +#[derive(Debug)] +pub(crate) struct StateSparseTransitionIter<'a> { + dense: StateTransitionIter<'a>, + cur: Option<(alphabet::Unit, alphabet::Unit, StateID)>, +} + +impl<'a> Iterator for StateSparseTransitionIter<'a> { + type Item = (alphabet::Unit, alphabet::Unit, StateID); + + fn next(&mut self) -> Option<(alphabet::Unit, alphabet::Unit, StateID)> { + while let Some((unit, next)) = self.dense.next() { + let (prev_start, prev_end, prev_next) = match self.cur { + Some(t) => t, + None => { + self.cur = Some((unit, unit, next)); + continue; + } + }; + if prev_next == next && !unit.is_eoi() { + self.cur = Some((prev_start, unit, prev_next)); + } else { + self.cur = Some((unit, unit, next)); + if prev_next != DEAD { + return Some((prev_start, prev_end, prev_next)); + } + } + } + if let Some((start, end, next)) = self.cur.take() { + if next != DEAD { + return Some((start, end, next)); + } + } + None + } +} + +/// An error that occurred during the construction of a DFA. +/// +/// This error does not provide many introspection capabilities. There are +/// generally only two things you can do with it: +/// +/// * Obtain a human readable message via its `std::fmt::Display` impl. +/// * Access an underlying [`nfa::thompson::BuildError`](thompson::BuildError) +/// type from its `source` method via the `std::error::Error` trait. This error +/// only occurs when using convenience routines for building a DFA directly +/// from a pattern string. +/// +/// When the `std` feature is enabled, this implements the `std::error::Error` +/// trait. +#[cfg(feature = "dfa-build")] +#[derive(Clone, Debug)] +pub struct BuildError { + kind: BuildErrorKind, +} + +/// The kind of error that occurred during the construction of a DFA. +/// +/// Note that this error is non-exhaustive. Adding new variants is not +/// considered a breaking change. +#[cfg(feature = "dfa-build")] +#[derive(Clone, Debug)] +enum BuildErrorKind { + /// An error that occurred while constructing an NFA as a precursor step + /// before a DFA is compiled. + NFA(thompson::BuildError), + /// An error that occurred because an unsupported regex feature was used. + /// The message string describes which unsupported feature was used. + /// + /// The primary regex feature that is unsupported by DFAs is the Unicode + /// word boundary look-around assertion (`\b`). This can be worked around + /// by either using an ASCII word boundary (`(?-u:\b)`) or by enabling + /// Unicode word boundaries when building a DFA. + Unsupported(&'static str), + /// An error that occurs if too many states are produced while building a + /// DFA. + TooManyStates, + /// An error that occurs if too many start states are needed while building + /// a DFA. + /// + /// This is a kind of oddball error that occurs when building a DFA with + /// start states enabled for each pattern and enough patterns to cause + /// the table of start states to overflow `usize`. + TooManyStartStates, + /// This is another oddball error that can occur if there are too many + /// patterns spread out across too many match states. + TooManyMatchPatternIDs, + /// An error that occurs if the DFA got too big during determinization. + DFAExceededSizeLimit { limit: usize }, + /// An error that occurs if auxiliary storage (not the DFA) used during + /// determinization got too big. + DeterminizeExceededSizeLimit { limit: usize }, +} + +#[cfg(feature = "dfa-build")] +impl BuildError { + /// Return the kind of this error. + fn kind(&self) -> &BuildErrorKind { + &self.kind + } + + pub(crate) fn nfa(err: thompson::BuildError) -> BuildError { + BuildError { kind: BuildErrorKind::NFA(err) } + } + + pub(crate) fn unsupported_dfa_word_boundary_unicode() -> BuildError { + let msg = "cannot build DFAs for regexes with Unicode word \ + boundaries; switch to ASCII word boundaries, or \ + heuristically enable Unicode word boundaries or use a \ + different regex engine"; + BuildError { kind: BuildErrorKind::Unsupported(msg) } + } + + pub(crate) fn too_many_states() -> BuildError { + BuildError { kind: BuildErrorKind::TooManyStates } + } + + pub(crate) fn too_many_start_states() -> BuildError { + BuildError { kind: BuildErrorKind::TooManyStartStates } + } + + pub(crate) fn too_many_match_pattern_ids() -> BuildError { + BuildError { kind: BuildErrorKind::TooManyMatchPatternIDs } + } + + pub(crate) fn dfa_exceeded_size_limit(limit: usize) -> BuildError { + BuildError { kind: BuildErrorKind::DFAExceededSizeLimit { limit } } + } + + pub(crate) fn determinize_exceeded_size_limit(limit: usize) -> BuildError { + BuildError { + kind: BuildErrorKind::DeterminizeExceededSizeLimit { limit }, + } + } +} + +#[cfg(all(feature = "std", feature = "dfa-build"))] +impl std::error::Error for BuildError { + fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { + match self.kind() { + BuildErrorKind::NFA(ref err) => Some(err), + _ => None, + } + } +} + +#[cfg(feature = "dfa-build")] +impl core::fmt::Display for BuildError { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + match self.kind() { + BuildErrorKind::NFA(_) => write!(f, "error building NFA"), + BuildErrorKind::Unsupported(ref msg) => { + write!(f, "unsupported regex feature for DFAs: {}", msg) + } + BuildErrorKind::TooManyStates => write!( + f, + "number of DFA states exceeds limit of {}", + StateID::LIMIT, + ), + BuildErrorKind::TooManyStartStates => { + let stride = Start::len(); + // The start table has `stride` entries for starting states for + // the entire DFA, and then `stride` entries for each pattern + // if start states for each pattern are enabled (which is the + // only way this error can occur). Thus, the total number of + // patterns that can fit in the table is `stride` less than + // what we can allocate. + let max = usize::try_from(core::isize::MAX).unwrap(); + let limit = (max - stride) / stride; + write!( + f, + "compiling DFA with start states exceeds pattern \ + pattern limit of {}", + limit, + ) + } + BuildErrorKind::TooManyMatchPatternIDs => write!( + f, + "compiling DFA with total patterns in all match states \ + exceeds limit of {}", + PatternID::LIMIT, + ), + BuildErrorKind::DFAExceededSizeLimit { limit } => write!( + f, + "DFA exceeded size limit of {:?} during determinization", + limit, + ), + BuildErrorKind::DeterminizeExceededSizeLimit { limit } => { + write!(f, "determinization exceeded size limit of {:?}", limit) + } + } + } +} + +#[cfg(all(test, feature = "syntax", feature = "dfa-build"))] +mod tests { + use super::*; + + #[test] + fn errors_with_unicode_word_boundary() { + let pattern = r"\b"; + assert!(Builder::new().build(pattern).is_err()); + } + + #[test] + fn roundtrip_never_match() { + let dfa = DFA::never_match().unwrap(); + let (buf, _) = dfa.to_bytes_native_endian(); + let dfa: DFA<&[u32]> = DFA::from_bytes(&buf).unwrap().0; + + assert_eq!(None, dfa.try_search_fwd(&Input::new("foo12345")).unwrap()); + } + + #[test] + fn roundtrip_always_match() { + use crate::HalfMatch; + + let dfa = DFA::always_match().unwrap(); + let (buf, _) = dfa.to_bytes_native_endian(); + let dfa: DFA<&[u32]> = DFA::from_bytes(&buf).unwrap().0; + + assert_eq!( + Some(HalfMatch::must(0, 0)), + dfa.try_search_fwd(&Input::new("foo12345")).unwrap() + ); + } + + // See the analogous test in src/hybrid/dfa.rs. + #[test] + fn heuristic_unicode_reverse() { + let dfa = DFA::builder() + .configure(DFA::config().unicode_word_boundary(true)) + .thompson(thompson::Config::new().reverse(true)) + .build(r"\b[0-9]+\b") + .unwrap(); + + let input = Input::new("β123").range(2..); + let expected = MatchError::quit(0xB2, 1); + let got = dfa.try_search_rev(&input); + assert_eq!(Err(expected), got); + + let input = Input::new("123β").range(..3); + let expected = MatchError::quit(0xCE, 3); + let got = dfa.try_search_rev(&input); + assert_eq!(Err(expected), got); + } +} diff --git a/third_party/rust/regex-automata/src/dfa/determinize.rs b/third_party/rust/regex-automata/src/dfa/determinize.rs new file mode 100644 index 0000000000..19f99f5d64 --- /dev/null +++ b/third_party/rust/regex-automata/src/dfa/determinize.rs @@ -0,0 +1,599 @@ +use alloc::{collections::BTreeMap, vec::Vec}; + +use crate::{ + dfa::{ + dense::{self, BuildError}, + DEAD, + }, + nfa::thompson, + util::{ + self, + alphabet::{self, ByteSet}, + determinize::{State, StateBuilderEmpty, StateBuilderNFA}, + primitives::{PatternID, StateID}, + search::{Anchored, MatchKind}, + sparse_set::SparseSets, + start::Start, + }, +}; + +/// A builder for configuring and running a DFA determinizer. +#[derive(Clone, Debug)] +pub(crate) struct Config { + match_kind: MatchKind, + quit: ByteSet, + dfa_size_limit: Option<usize>, + determinize_size_limit: Option<usize>, +} + +impl Config { + /// Create a new default config for a determinizer. The determinizer may be + /// configured before calling `run`. + pub fn new() -> Config { + Config { + match_kind: MatchKind::LeftmostFirst, + quit: ByteSet::empty(), + dfa_size_limit: None, + determinize_size_limit: None, + } + } + + /// Run determinization on the given NFA and write the resulting DFA into + /// the one given. The DFA given should be initialized but otherwise empty. + /// "Initialized" means that it is setup to handle the NFA's byte classes, + /// number of patterns and whether to build start states for each pattern. + pub fn run( + &self, + nfa: &thompson::NFA, + dfa: &mut dense::OwnedDFA, + ) -> Result<(), BuildError> { + let dead = State::dead(); + let quit = State::dead(); + let mut cache = StateMap::default(); + // We only insert the dead state here since its representation is + // identical to the quit state. And we never want anything pointing + // to the quit state other than specific transitions derived from the + // determinizer's configured "quit" bytes. + // + // We do put the quit state into 'builder_states' below. This ensures + // that a proper DFA state ID is allocated for it, and that no other + // DFA state uses the "location after the DEAD state." That is, it + // is assumed that the quit state is always the state immediately + // following the DEAD state. + cache.insert(dead.clone(), DEAD); + + let runner = Runner { + config: self.clone(), + nfa, + dfa, + builder_states: alloc::vec![dead, quit], + cache, + memory_usage_state: 0, + sparses: SparseSets::new(nfa.states().len()), + stack: alloc::vec![], + scratch_state_builder: StateBuilderEmpty::new(), + }; + runner.run() + } + + /// The match semantics to use for determinization. + /// + /// MatchKind::All corresponds to the standard textbook construction. + /// All possible match states are represented in the DFA. + /// MatchKind::LeftmostFirst permits greediness and otherwise tries to + /// simulate the match semantics of backtracking regex engines. Namely, + /// only a subset of match states are built, and dead states are used to + /// stop searches with an unanchored prefix. + /// + /// The default is MatchKind::LeftmostFirst. + pub fn match_kind(&mut self, kind: MatchKind) -> &mut Config { + self.match_kind = kind; + self + } + + /// The set of bytes to use that will cause the DFA to enter a quit state, + /// stop searching and return an error. By default, this is empty. + pub fn quit(&mut self, set: ByteSet) -> &mut Config { + self.quit = set; + self + } + + /// The limit, in bytes of the heap, that the DFA is permitted to use. This + /// does not include the auxiliary heap storage used by determinization. + pub fn dfa_size_limit(&mut self, bytes: Option<usize>) -> &mut Config { + self.dfa_size_limit = bytes; + self + } + + /// The limit, in bytes of the heap, that determinization itself is allowed + /// to use. This does not include the size of the DFA being built. + pub fn determinize_size_limit( + &mut self, + bytes: Option<usize>, + ) -> &mut Config { + self.determinize_size_limit = bytes; + self + } +} + +/// The actual implementation of determinization that converts an NFA to a DFA +/// through powerset construction. +/// +/// This determinizer roughly follows the typical powerset construction, where +/// each DFA state is comprised of one or more NFA states. In the worst case, +/// there is one DFA state for every possible combination of NFA states. In +/// practice, this only happens in certain conditions, typically when there are +/// bounded repetitions. +/// +/// The main differences between this implementation and typical deteminization +/// are that this implementation delays matches by one state and hackily makes +/// look-around work. Comments below attempt to explain this. +/// +/// The lifetime variable `'a` refers to the lifetime of the NFA or DFA, +/// whichever is shorter. +#[derive(Debug)] +struct Runner<'a> { + /// The configuration used to initialize determinization. + config: Config, + /// The NFA we're converting into a DFA. + nfa: &'a thompson::NFA, + /// The DFA we're building. + dfa: &'a mut dense::OwnedDFA, + /// Each DFA state being built is defined as an *ordered* set of NFA + /// states, along with some meta facts about the ordered set of NFA states. + /// + /// This is never empty. The first state is always a dummy state such that + /// a state id == 0 corresponds to a dead state. The second state is always + /// the quit state. + /// + /// Why do we have states in both a `Vec` and in a cache map below? + /// Well, they serve two different roles based on access patterns. + /// `builder_states` is the canonical home of each state, and provides + /// constant random access by a DFA state's ID. The cache map below, on + /// the other hand, provides a quick way of searching for identical DFA + /// states by using the DFA state as a key in the map. Of course, we use + /// reference counting to avoid actually duplicating the state's data + /// itself. (Although this has never been benchmarked.) Note that the cache + /// map does not give us full minimization; it just lets us avoid some very + /// obvious redundant states. + /// + /// Note that the index into this Vec isn't quite the DFA's state ID. + /// Rather, it's just an index. To get the state ID, you have to multiply + /// it by the DFA's stride. That's done by self.dfa.from_index. And the + /// inverse is self.dfa.to_index. + /// + /// Moreover, DFA states don't usually retain the IDs assigned to them + /// by their position in this Vec. After determinization completes, + /// states are shuffled around to support other optimizations. See the + /// sibling 'special' module for more details on that. (The reason for + /// mentioning this is that if you print out the DFA for debugging during + /// determinization, and then print out the final DFA after it is fully + /// built, then the state IDs likely won't match up.) + builder_states: Vec<State>, + /// A cache of DFA states that already exist and can be easily looked up + /// via ordered sets of NFA states. + /// + /// See `builder_states` docs for why we store states in two different + /// ways. + cache: StateMap, + /// The memory usage, in bytes, used by builder_states and cache. We track + /// this as new states are added since states use a variable amount of + /// heap. Tracking this as we add states makes it possible to compute the + /// total amount of memory used by the determinizer in constant time. + memory_usage_state: usize, + /// A pair of sparse sets for tracking ordered sets of NFA state IDs. + /// These are reused throughout determinization. A bounded sparse set + /// gives us constant time insertion, membership testing and clearing. + sparses: SparseSets, + /// Scratch space for a stack of NFA states to visit, for depth first + /// visiting without recursion. + stack: Vec<StateID>, + /// Scratch space for storing an ordered sequence of NFA states, for + /// amortizing allocation. This is principally useful for when we avoid + /// adding a new DFA state since it already exists. In order to detect this + /// case though, we still need an ordered set of NFA state IDs. So we use + /// this space to stage that ordered set before we know whether we need to + /// create a new DFA state or not. + scratch_state_builder: StateBuilderEmpty, +} + +/// A map from states to state identifiers. When using std, we use a standard +/// hashmap, since it's a bit faster for this use case. (Other maps, like +/// one's based on FNV, have not yet been benchmarked.) +/// +/// The main purpose of this map is to reuse states where possible. This won't +/// fully minimize the DFA, but it works well in a lot of cases. +#[cfg(feature = "std")] +type StateMap = std::collections::HashMap<State, StateID>; +#[cfg(not(feature = "std"))] +type StateMap = BTreeMap<State, StateID>; + +impl<'a> Runner<'a> { + /// Build the DFA. If there was a problem constructing the DFA (e.g., if + /// the chosen state identifier representation is too small), then an error + /// is returned. + fn run(mut self) -> Result<(), BuildError> { + if self.nfa.look_set_any().contains_word_unicode() + && !self.config.quit.contains_range(0x80, 0xFF) + { + return Err(BuildError::unsupported_dfa_word_boundary_unicode()); + } + + // A sequence of "representative" bytes drawn from each equivalence + // class. These representative bytes are fed to the NFA to compute + // state transitions. This allows us to avoid re-computing state + // transitions for bytes that are guaranteed to produce identical + // results. Since computing the representatives needs to do a little + // work, we do it once here because we'll be iterating over them a lot. + let representatives: Vec<alphabet::Unit> = + self.dfa.byte_classes().representatives(..).collect(); + // The set of all DFA state IDs that still need to have their + // transitions set. We start by seeding this with all starting states. + let mut uncompiled = alloc::vec![]; + self.add_all_starts(&mut uncompiled)?; + while let Some(dfa_id) = uncompiled.pop() { + for &unit in &representatives { + if unit.as_u8().map_or(false, |b| self.config.quit.contains(b)) + { + continue; + } + // In many cases, the state we transition to has already been + // computed. 'cached_state' will do the minimal amount of work + // to check this, and if it exists, immediately return an + // already existing state ID. + let (next_dfa_id, is_new) = self.cached_state(dfa_id, unit)?; + self.dfa.set_transition(dfa_id, unit, next_dfa_id); + // If the state ID we got back is newly created, then we need + // to compile it, so add it to our uncompiled frontier. + if is_new { + uncompiled.push(next_dfa_id); + } + } + } + debug!( + "determinization complete, memory usage: {}, \ + dense DFA size: {}, \ + is reverse? {}", + self.memory_usage(), + self.dfa.memory_usage(), + self.nfa.is_reverse(), + ); + + // A map from DFA state ID to one or more NFA match IDs. Each NFA match + // ID corresponds to a distinct regex pattern that matches in the state + // corresponding to the key. + let mut matches: BTreeMap<StateID, Vec<PatternID>> = BTreeMap::new(); + self.cache.clear(); + #[cfg(feature = "logging")] + let mut total_pat_len = 0; + for (i, state) in self.builder_states.into_iter().enumerate() { + if let Some(pat_ids) = state.match_pattern_ids() { + let id = self.dfa.to_state_id(i); + log! { + total_pat_len += pat_ids.len(); + } + matches.insert(id, pat_ids); + } + } + log! { + use core::mem::size_of; + let per_elem = size_of::<StateID>() + size_of::<Vec<PatternID>>(); + let pats = total_pat_len * size_of::<PatternID>(); + let mem = (matches.len() * per_elem) + pats; + log::debug!("matches map built, memory usage: {}", mem); + } + // At this point, we shuffle the "special" states in the final DFA. + // This permits a DFA's match loop to detect a match condition (among + // other things) by merely inspecting the current state's identifier, + // and avoids the need for any additional auxiliary storage. + self.dfa.shuffle(matches)?; + Ok(()) + } + + /// Return the identifier for the next DFA state given an existing DFA + /// state and an input byte. If the next DFA state already exists, then + /// return its identifier from the cache. Otherwise, build the state, cache + /// it and return its identifier. + /// + /// This routine returns a boolean indicating whether a new state was + /// built. If a new state is built, then the caller needs to add it to its + /// frontier of uncompiled DFA states to compute transitions for. + fn cached_state( + &mut self, + dfa_id: StateID, + unit: alphabet::Unit, + ) -> Result<(StateID, bool), BuildError> { + // Compute the set of all reachable NFA states, including epsilons. + let empty_builder = self.get_state_builder(); + let builder = util::determinize::next( + self.nfa, + self.config.match_kind, + &mut self.sparses, + &mut self.stack, + &self.builder_states[self.dfa.to_index(dfa_id)], + unit, + empty_builder, + ); + self.maybe_add_state(builder) + } + + /// Compute the set of DFA start states and add their identifiers in + /// 'dfa_state_ids' (no duplicates are added). + fn add_all_starts( + &mut self, + dfa_state_ids: &mut Vec<StateID>, + ) -> Result<(), BuildError> { + // These should be the first states added. + assert!(dfa_state_ids.is_empty()); + // We only want to add (un)anchored starting states that is consistent + // with our DFA's configuration. Unconditionally adding both (although + // it is the default) can make DFAs quite a bit bigger. + if self.dfa.start_kind().has_unanchored() { + self.add_start_group(Anchored::No, dfa_state_ids)?; + } + if self.dfa.start_kind().has_anchored() { + self.add_start_group(Anchored::Yes, dfa_state_ids)?; + } + // I previously has an 'assert' here checking that either + // 'dfa_state_ids' was non-empty, or the NFA had zero patterns. But it + // turns out this isn't always true. For example, the NFA might have + // one or more patterns but where all such patterns are just 'fail' + // states. These will ultimately just compile down to DFA dead states, + // and since the dead state was added earlier, no new DFA states are + // added. And thus, it is valid and okay for 'dfa_state_ids' to be + // empty even if there are a non-zero number of patterns in the NFA. + + // We only need to compute anchored start states for each pattern if it + // was requested to do so. + if self.dfa.starts_for_each_pattern() { + for pid in self.nfa.patterns() { + self.add_start_group(Anchored::Pattern(pid), dfa_state_ids)?; + } + } + Ok(()) + } + + /// Add a group of start states for the given match pattern ID. Any new + /// DFA states added are pushed on to 'dfa_state_ids'. (No duplicates are + /// pushed.) + /// + /// When pattern_id is None, then this will compile a group of unanchored + /// start states (if the DFA is unanchored). When the pattern_id is + /// present, then this will compile a group of anchored start states that + /// only match the given pattern. + /// + /// This panics if `anchored` corresponds to an invalid pattern ID. + fn add_start_group( + &mut self, + anchored: Anchored, + dfa_state_ids: &mut Vec<StateID>, + ) -> Result<(), BuildError> { + let nfa_start = match anchored { + Anchored::No => self.nfa.start_unanchored(), + Anchored::Yes => self.nfa.start_anchored(), + Anchored::Pattern(pid) => { + self.nfa.start_pattern(pid).expect("valid pattern ID") + } + }; + + // When compiling start states, we're careful not to build additional + // states that aren't necessary. For example, if the NFA has no word + // boundary assertion, then there's no reason to have distinct start + // states for 'NonWordByte' and 'WordByte' starting configurations. + // Instead, the 'WordByte' starting configuration can just point + // directly to the start state for the 'NonWordByte' config. + // + // Note though that we only need to care about assertions in the prefix + // of an NFA since this only concerns the starting states. (Actually, + // the most precisely thing we could do it is look at the prefix + // assertions of each pattern when 'anchored == Anchored::Pattern', + // and then only compile extra states if the prefix is non-empty.) But + // we settle for simplicity here instead of absolute minimalism. It is + // somewhat rare, after all, for multiple patterns in the same regex to + // have different prefix look-arounds. + + let (id, is_new) = + self.add_one_start(nfa_start, Start::NonWordByte)?; + self.dfa.set_start_state(anchored, Start::NonWordByte, id); + if is_new { + dfa_state_ids.push(id); + } + + if !self.nfa.look_set_prefix_any().contains_word() { + self.dfa.set_start_state(anchored, Start::WordByte, id); + } else { + let (id, is_new) = + self.add_one_start(nfa_start, Start::WordByte)?; + self.dfa.set_start_state(anchored, Start::WordByte, id); + if is_new { + dfa_state_ids.push(id); + } + } + if !self.nfa.look_set_prefix_any().contains_anchor() { + self.dfa.set_start_state(anchored, Start::Text, id); + self.dfa.set_start_state(anchored, Start::LineLF, id); + self.dfa.set_start_state(anchored, Start::LineCR, id); + self.dfa.set_start_state( + anchored, + Start::CustomLineTerminator, + id, + ); + } else { + let (id, is_new) = self.add_one_start(nfa_start, Start::Text)?; + self.dfa.set_start_state(anchored, Start::Text, id); + if is_new { + dfa_state_ids.push(id); + } + + let (id, is_new) = self.add_one_start(nfa_start, Start::LineLF)?; + self.dfa.set_start_state(anchored, Start::LineLF, id); + if is_new { + dfa_state_ids.push(id); + } + + let (id, is_new) = self.add_one_start(nfa_start, Start::LineCR)?; + self.dfa.set_start_state(anchored, Start::LineCR, id); + if is_new { + dfa_state_ids.push(id); + } + + let (id, is_new) = + self.add_one_start(nfa_start, Start::CustomLineTerminator)?; + self.dfa.set_start_state( + anchored, + Start::CustomLineTerminator, + id, + ); + if is_new { + dfa_state_ids.push(id); + } + } + + Ok(()) + } + + /// Add a new DFA start state corresponding to the given starting NFA + /// state, and the starting search configuration. (The starting search + /// configuration essentially tells us which look-behind assertions are + /// true for this particular state.) + /// + /// The boolean returned indicates whether the state ID returned is a newly + /// created state, or a previously cached state. + fn add_one_start( + &mut self, + nfa_start: StateID, + start: Start, + ) -> Result<(StateID, bool), BuildError> { + // Compute the look-behind assertions that are true in this starting + // configuration, and the determine the epsilon closure. While + // computing the epsilon closure, we only follow condiional epsilon + // transitions that satisfy the look-behind assertions in 'look_have'. + let mut builder_matches = self.get_state_builder().into_matches(); + util::determinize::set_lookbehind_from_start( + self.nfa, + &start, + &mut builder_matches, + ); + self.sparses.set1.clear(); + util::determinize::epsilon_closure( + self.nfa, + nfa_start, + builder_matches.look_have(), + &mut self.stack, + &mut self.sparses.set1, + ); + let mut builder = builder_matches.into_nfa(); + util::determinize::add_nfa_states( + &self.nfa, + &self.sparses.set1, + &mut builder, + ); + self.maybe_add_state(builder) + } + + /// Adds the given state to the DFA being built depending on whether it + /// already exists in this determinizer's cache. + /// + /// If it does exist, then the memory used by 'state' is put back into the + /// determinizer and the previously created state's ID is returned. (Along + /// with 'false', indicating that no new state was added.) + /// + /// If it does not exist, then the state is added to the DFA being built + /// and a fresh ID is allocated (if ID allocation fails, then an error is + /// returned) and returned. (Along with 'true', indicating that a new state + /// was added.) + fn maybe_add_state( + &mut self, + builder: StateBuilderNFA, + ) -> Result<(StateID, bool), BuildError> { + if let Some(&cached_id) = self.cache.get(builder.as_bytes()) { + // Since we have a cached state, put the constructed state's + // memory back into our scratch space, so that it can be reused. + self.put_state_builder(builder); + return Ok((cached_id, false)); + } + self.add_state(builder).map(|sid| (sid, true)) + } + + /// Add the given state to the DFA and make it available in the cache. + /// + /// The state initially has no transitions. That is, it transitions to the + /// dead state for all possible inputs, and transitions to the quit state + /// for all quit bytes. + /// + /// If adding the state would exceed the maximum value for StateID, then an + /// error is returned. + fn add_state( + &mut self, + builder: StateBuilderNFA, + ) -> Result<StateID, BuildError> { + let id = self.dfa.add_empty_state()?; + if !self.config.quit.is_empty() { + for b in self.config.quit.iter() { + self.dfa.set_transition( + id, + alphabet::Unit::u8(b), + self.dfa.quit_id(), + ); + } + } + let state = builder.to_state(); + // States use reference counting internally, so we only need to count + // their memory usage once. + self.memory_usage_state += state.memory_usage(); + self.builder_states.push(state.clone()); + self.cache.insert(state, id); + self.put_state_builder(builder); + if let Some(limit) = self.config.dfa_size_limit { + if self.dfa.memory_usage() > limit { + return Err(BuildError::dfa_exceeded_size_limit(limit)); + } + } + if let Some(limit) = self.config.determinize_size_limit { + if self.memory_usage() > limit { + return Err(BuildError::determinize_exceeded_size_limit( + limit, + )); + } + } + Ok(id) + } + + /// Returns a state builder from this determinizer that might have existing + /// capacity. This helps avoid allocs in cases where a state is built that + /// turns out to already be cached. + /// + /// Callers must put the state builder back with 'put_state_builder', + /// otherwise the allocation reuse won't work. + fn get_state_builder(&mut self) -> StateBuilderEmpty { + core::mem::replace( + &mut self.scratch_state_builder, + StateBuilderEmpty::new(), + ) + } + + /// Puts the given state builder back into this determinizer for reuse. + /// + /// Note that building a 'State' from a builder always creates a new + /// alloc, so callers should always put the builder back. + fn put_state_builder(&mut self, builder: StateBuilderNFA) { + let _ = core::mem::replace( + &mut self.scratch_state_builder, + builder.clear(), + ); + } + + /// Return the memory usage, in bytes, of this determinizer at the current + /// point in time. This does not include memory used by the NFA or the + /// dense DFA itself. + fn memory_usage(&self) -> usize { + use core::mem::size_of; + + self.builder_states.len() * size_of::<State>() + // Maps likely use more memory than this, but it's probably close. + + self.cache.len() * (size_of::<State>() + size_of::<StateID>()) + + self.memory_usage_state + + self.stack.capacity() * size_of::<StateID>() + + self.scratch_state_builder.capacity() + } +} diff --git a/third_party/rust/regex-automata/src/dfa/minimize.rs b/third_party/rust/regex-automata/src/dfa/minimize.rs new file mode 100644 index 0000000000..fea925bdc6 --- /dev/null +++ b/third_party/rust/regex-automata/src/dfa/minimize.rs @@ -0,0 +1,463 @@ +use core::{cell::RefCell, fmt, mem}; + +use alloc::{collections::BTreeMap, rc::Rc, vec, vec::Vec}; + +use crate::{ + dfa::{automaton::Automaton, dense, DEAD}, + util::{ + alphabet, + primitives::{PatternID, StateID}, + }, +}; + +/// An implementation of Hopcroft's algorithm for minimizing DFAs. +/// +/// The algorithm implemented here is mostly taken from Wikipedia: +/// https://en.wikipedia.org/wiki/DFA_minimization#Hopcroft's_algorithm +/// +/// This code has had some light optimization attention paid to it, +/// particularly in the form of reducing allocation as much as possible. +/// However, it is still generally slow. Future optimization work should +/// probably focus on the bigger picture rather than micro-optimizations. For +/// example: +/// +/// 1. Figure out how to more intelligently create initial partitions. That is, +/// Hopcroft's algorithm starts by creating two partitions of DFA states +/// that are known to NOT be equivalent: match states and non-match states. +/// The algorithm proceeds by progressively refining these partitions into +/// smaller partitions. If we could start with more partitions, then we +/// could reduce the amount of work that Hopcroft's algorithm needs to do. +/// 2. For every partition that we visit, we find all incoming transitions to +/// every state in the partition for *every* element in the alphabet. (This +/// is why using byte classes can significantly decrease minimization times, +/// since byte classes shrink the alphabet.) This is quite costly and there +/// is perhaps some redundant work being performed depending on the specific +/// states in the set. For example, we might be able to only visit some +/// elements of the alphabet based on the transitions. +/// 3. Move parts of minimization into determinization. If minimization has +/// fewer states to deal with, then it should run faster. A prime example +/// of this might be large Unicode classes, which are generated in way that +/// can create a lot of redundant states. (Some work has been done on this +/// point during NFA compilation via the algorithm described in the +/// "Incremental Construction of MinimalAcyclic Finite-State Automata" +/// paper.) +pub(crate) struct Minimizer<'a> { + dfa: &'a mut dense::OwnedDFA, + in_transitions: Vec<Vec<Vec<StateID>>>, + partitions: Vec<StateSet>, + waiting: Vec<StateSet>, +} + +impl<'a> fmt::Debug for Minimizer<'a> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("Minimizer") + .field("dfa", &self.dfa) + .field("in_transitions", &self.in_transitions) + .field("partitions", &self.partitions) + .field("waiting", &self.waiting) + .finish() + } +} + +/// A set of states. A state set makes up a single partition in Hopcroft's +/// algorithm. +/// +/// It is represented by an ordered set of state identifiers. We use shared +/// ownership so that a single state set can be in both the set of partitions +/// and in the set of waiting sets simultaneously without an additional +/// allocation. Generally, once a state set is built, it becomes immutable. +/// +/// We use this representation because it avoids the overhead of more +/// traditional set data structures (HashSet/BTreeSet), and also because +/// computing intersection/subtraction on this representation is especially +/// fast. +#[derive(Clone, Debug, Eq, PartialEq, PartialOrd, Ord)] +struct StateSet { + ids: Rc<RefCell<Vec<StateID>>>, +} + +impl<'a> Minimizer<'a> { + pub fn new(dfa: &'a mut dense::OwnedDFA) -> Minimizer<'a> { + let in_transitions = Minimizer::incoming_transitions(dfa); + let partitions = Minimizer::initial_partitions(dfa); + let waiting = partitions.clone(); + Minimizer { dfa, in_transitions, partitions, waiting } + } + + pub fn run(mut self) { + let stride2 = self.dfa.stride2(); + let as_state_id = |index: usize| -> StateID { + StateID::new(index << stride2).unwrap() + }; + let as_index = |id: StateID| -> usize { id.as_usize() >> stride2 }; + + let mut incoming = StateSet::empty(); + let mut scratch1 = StateSet::empty(); + let mut scratch2 = StateSet::empty(); + let mut newparts = vec![]; + + // This loop is basically Hopcroft's algorithm. Everything else is just + // shuffling data around to fit our representation. + while let Some(set) = self.waiting.pop() { + for b in self.dfa.byte_classes().iter() { + self.find_incoming_to(b, &set, &mut incoming); + // If incoming is empty, then the intersection with any other + // set must also be empty. So 'newparts' just ends up being + // 'self.partitions'. So there's no need to go through the loop + // below. + // + // This actually turns out to be rather large optimization. On + // the order of making minimization 4-5x faster. It's likely + // that the vast majority of all states have very few incoming + // transitions. + if incoming.is_empty() { + continue; + } + + for p in 0..self.partitions.len() { + self.partitions[p].intersection(&incoming, &mut scratch1); + if scratch1.is_empty() { + newparts.push(self.partitions[p].clone()); + continue; + } + + self.partitions[p].subtract(&incoming, &mut scratch2); + if scratch2.is_empty() { + newparts.push(self.partitions[p].clone()); + continue; + } + + let (x, y) = + (scratch1.deep_clone(), scratch2.deep_clone()); + newparts.push(x.clone()); + newparts.push(y.clone()); + match self.find_waiting(&self.partitions[p]) { + Some(i) => { + self.waiting[i] = x; + self.waiting.push(y); + } + None => { + if x.len() <= y.len() { + self.waiting.push(x); + } else { + self.waiting.push(y); + } + } + } + } + newparts = mem::replace(&mut self.partitions, newparts); + newparts.clear(); + } + } + + // At this point, we now have a minimal partitioning of states, where + // each partition is an equivalence class of DFA states. Now we need to + // use this partitioning to update the DFA to only contain one state for + // each partition. + + // Create a map from DFA state ID to the representative ID of the + // equivalence class to which it belongs. The representative ID of an + // equivalence class of states is the minimum ID in that class. + let mut state_to_part = vec![DEAD; self.dfa.state_len()]; + for p in &self.partitions { + p.iter(|id| state_to_part[as_index(id)] = p.min()); + } + + // Generate a new contiguous sequence of IDs for minimal states, and + // create a map from equivalence IDs to the new IDs. Thus, the new + // minimal ID of *any* state in the unminimized DFA can be obtained + // with minimals_ids[state_to_part[old_id]]. + let mut minimal_ids = vec![DEAD; self.dfa.state_len()]; + let mut new_index = 0; + for state in self.dfa.states() { + if state_to_part[as_index(state.id())] == state.id() { + minimal_ids[as_index(state.id())] = as_state_id(new_index); + new_index += 1; + } + } + // The total number of states in the minimal DFA. + let minimal_count = new_index; + // Convenience function for remapping state IDs. This takes an old ID, + // looks up its Hopcroft partition and then maps that to the new ID + // range. + let remap = |old| minimal_ids[as_index(state_to_part[as_index(old)])]; + + // Re-map this DFA in place such that the only states remaining + // correspond to the representative states of every equivalence class. + for id in (0..self.dfa.state_len()).map(as_state_id) { + // If this state isn't a representative for an equivalence class, + // then we skip it since it won't appear in the minimal DFA. + if state_to_part[as_index(id)] != id { + continue; + } + self.dfa.remap_state(id, remap); + self.dfa.swap_states(id, minimal_ids[as_index(id)]); + } + // Trim off all unused states from the pre-minimized DFA. This + // represents all states that were merged into a non-singleton + // equivalence class of states, and appeared after the first state + // in each such class. (Because the state with the smallest ID in each + // equivalence class is its representative ID.) + self.dfa.truncate_states(minimal_count); + + // Update the new start states, which is now just the minimal ID of + // whatever state the old start state was collapsed into. Also, we + // collect everything before-hand to work around the borrow checker. + // We're already allocating so much that this is probably fine. If this + // turns out to be costly, then I guess add a `starts_mut` iterator. + let starts: Vec<_> = self.dfa.starts().collect(); + for (old_start_id, anchored, start_type) in starts { + self.dfa.set_start_state( + anchored, + start_type, + remap(old_start_id), + ); + } + + // Update the match state pattern ID list for multi-regexes. All we + // need to do is remap the match state IDs. The pattern ID lists are + // always the same as they were since match states with distinct + // pattern ID lists are always considered distinct states. + let mut pmap = BTreeMap::new(); + for (match_id, pattern_ids) in self.dfa.pattern_map() { + let new_id = remap(match_id); + pmap.insert(new_id, pattern_ids); + } + // This unwrap is OK because minimization never increases the number of + // match states or patterns in those match states. Since minimization + // runs after the pattern map has already been set at least once, we + // know that our match states cannot error. + self.dfa.set_pattern_map(&pmap).unwrap(); + + // In order to update the ID of the maximum match state, we need to + // find the maximum ID among all of the match states in the minimized + // DFA. This is not necessarily the new ID of the unminimized maximum + // match state, since that could have been collapsed with a much + // earlier match state. Therefore, to find the new max match state, + // we iterate over all previous match states, find their corresponding + // new minimal ID, and take the maximum of those. + let old = self.dfa.special().clone(); + let new = self.dfa.special_mut(); + // ... but only remap if we had match states. + if old.matches() { + new.min_match = StateID::MAX; + new.max_match = StateID::ZERO; + for i in as_index(old.min_match)..=as_index(old.max_match) { + let new_id = remap(as_state_id(i)); + if new_id < new.min_match { + new.min_match = new_id; + } + if new_id > new.max_match { + new.max_match = new_id; + } + } + } + // ... same, but for start states. + if old.starts() { + new.min_start = StateID::MAX; + new.max_start = StateID::ZERO; + for i in as_index(old.min_start)..=as_index(old.max_start) { + let new_id = remap(as_state_id(i)); + if new_id == DEAD { + continue; + } + if new_id < new.min_start { + new.min_start = new_id; + } + if new_id > new.max_start { + new.max_start = new_id; + } + } + if new.max_start == DEAD { + new.min_start = DEAD; + } + } + new.quit_id = remap(new.quit_id); + new.set_max(); + } + + fn find_waiting(&self, set: &StateSet) -> Option<usize> { + self.waiting.iter().position(|s| s == set) + } + + fn find_incoming_to( + &self, + b: alphabet::Unit, + set: &StateSet, + incoming: &mut StateSet, + ) { + incoming.clear(); + set.iter(|id| { + for &inid in + &self.in_transitions[self.dfa.to_index(id)][b.as_usize()] + { + incoming.add(inid); + } + }); + incoming.canonicalize(); + } + + fn initial_partitions(dfa: &dense::OwnedDFA) -> Vec<StateSet> { + // For match states, we know that two match states with different + // pattern ID lists will *always* be distinct, so we can partition them + // initially based on that. + let mut matching: BTreeMap<Vec<PatternID>, StateSet> = BTreeMap::new(); + let mut is_quit = StateSet::empty(); + let mut no_match = StateSet::empty(); + for state in dfa.states() { + if dfa.is_match_state(state.id()) { + let mut pids = vec![]; + for i in 0..dfa.match_len(state.id()) { + pids.push(dfa.match_pattern(state.id(), i)); + } + matching + .entry(pids) + .or_insert(StateSet::empty()) + .add(state.id()); + } else if dfa.is_quit_state(state.id()) { + is_quit.add(state.id()); + } else { + no_match.add(state.id()); + } + } + + let mut sets: Vec<StateSet> = + matching.into_iter().map(|(_, set)| set).collect(); + sets.push(no_match); + sets.push(is_quit); + sets + } + + fn incoming_transitions(dfa: &dense::OwnedDFA) -> Vec<Vec<Vec<StateID>>> { + let mut incoming = vec![]; + for _ in dfa.states() { + incoming.push(vec![vec![]; dfa.alphabet_len()]); + } + for state in dfa.states() { + for (b, next) in state.transitions() { + incoming[dfa.to_index(next)][b.as_usize()].push(state.id()); + } + } + incoming + } +} + +impl StateSet { + fn empty() -> StateSet { + StateSet { ids: Rc::new(RefCell::new(vec![])) } + } + + fn add(&mut self, id: StateID) { + self.ids.borrow_mut().push(id); + } + + fn min(&self) -> StateID { + self.ids.borrow()[0] + } + + fn canonicalize(&mut self) { + self.ids.borrow_mut().sort(); + self.ids.borrow_mut().dedup(); + } + + fn clear(&mut self) { + self.ids.borrow_mut().clear(); + } + + fn len(&self) -> usize { + self.ids.borrow().len() + } + + fn is_empty(&self) -> bool { + self.len() == 0 + } + + fn deep_clone(&self) -> StateSet { + let ids = self.ids.borrow().iter().cloned().collect(); + StateSet { ids: Rc::new(RefCell::new(ids)) } + } + + fn iter<F: FnMut(StateID)>(&self, mut f: F) { + for &id in self.ids.borrow().iter() { + f(id); + } + } + + fn intersection(&self, other: &StateSet, dest: &mut StateSet) { + dest.clear(); + if self.is_empty() || other.is_empty() { + return; + } + + let (seta, setb) = (self.ids.borrow(), other.ids.borrow()); + let (mut ita, mut itb) = (seta.iter().cloned(), setb.iter().cloned()); + let (mut a, mut b) = (ita.next().unwrap(), itb.next().unwrap()); + loop { + if a == b { + dest.add(a); + a = match ita.next() { + None => break, + Some(a) => a, + }; + b = match itb.next() { + None => break, + Some(b) => b, + }; + } else if a < b { + a = match ita.next() { + None => break, + Some(a) => a, + }; + } else { + b = match itb.next() { + None => break, + Some(b) => b, + }; + } + } + } + + fn subtract(&self, other: &StateSet, dest: &mut StateSet) { + dest.clear(); + if self.is_empty() || other.is_empty() { + self.iter(|s| dest.add(s)); + return; + } + + let (seta, setb) = (self.ids.borrow(), other.ids.borrow()); + let (mut ita, mut itb) = (seta.iter().cloned(), setb.iter().cloned()); + let (mut a, mut b) = (ita.next().unwrap(), itb.next().unwrap()); + loop { + if a == b { + a = match ita.next() { + None => break, + Some(a) => a, + }; + b = match itb.next() { + None => { + dest.add(a); + break; + } + Some(b) => b, + }; + } else if a < b { + dest.add(a); + a = match ita.next() { + None => break, + Some(a) => a, + }; + } else { + b = match itb.next() { + None => { + dest.add(a); + break; + } + Some(b) => b, + }; + } + } + for a in ita { + dest.add(a); + } + } +} diff --git a/third_party/rust/regex-automata/src/dfa/mod.rs b/third_party/rust/regex-automata/src/dfa/mod.rs new file mode 100644 index 0000000000..4bb8704352 --- /dev/null +++ b/third_party/rust/regex-automata/src/dfa/mod.rs @@ -0,0 +1,360 @@ +/*! +A module for building and searching with deterministic finite automata (DFAs). + +Like other modules in this crate, DFAs support a rich regex syntax with Unicode +features. DFAs also have extensive options for configuring the best space vs +time trade off for your use case and provides support for cheap deserialization +of automata for use in `no_std` environments. + +If you're looking for lazy DFAs that build themselves incrementally during +search, then please see the top-level [`hybrid` module](crate::hybrid). + +# Overview + +This section gives a brief overview of the primary types in this module: + +* A [`regex::Regex`] provides a way to search for matches of a regular +expression using DFAs. This includes iterating over matches with both the start +and end positions of each match. +* A [`dense::DFA`] provides low level access to a DFA that uses a dense +representation (uses lots of space, but fast searching). +* A [`sparse::DFA`] provides the same API as a `dense::DFA`, but uses a sparse +representation (uses less space, but slower searching). +* An [`Automaton`] trait that defines an interface that both dense and sparse +DFAs implement. (A `regex::Regex` is generic over this trait.) +* Both dense DFAs and sparse DFAs support serialization to raw bytes (e.g., +[`dense::DFA::to_bytes_little_endian`]) and cheap deserialization (e.g., +[`dense::DFA::from_bytes`]). + +There is also a [`onepass`] module that provides a [one-pass +DFA](onepass::DFA). The unique advantage of this DFA is that, for the class +of regexes it can be built with, it supports reporting the spans of matching +capturing groups. It is the only DFA in this crate capable of such a thing. + +# Example: basic regex searching + +This example shows how to compile a regex using the default configuration +and then use it to find matches in a byte string: + +``` +use regex_automata::{Match, dfa::regex::Regex}; + +let re = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}")?; +let text = b"2018-12-24 2016-10-08"; +let matches: Vec<Match> = re.find_iter(text).collect(); +assert_eq!(matches, vec![ + Match::must(0, 0..10), + Match::must(0, 11..21), +]); +# Ok::<(), Box<dyn std::error::Error>>(()) +``` + +# Example: searching with regex sets + +The DFAs in this module all fully support searching with multiple regexes +simultaneously. You can use this support with standard leftmost-first style +searching to find non-overlapping matches: + +``` +# if cfg!(miri) { return Ok(()); } // miri takes too long +use regex_automata::{Match, dfa::regex::Regex}; + +let re = Regex::new_many(&[r"\w+", r"\S+"])?; +let text = b"@foo bar"; +let matches: Vec<Match> = re.find_iter(text).collect(); +assert_eq!(matches, vec![ + Match::must(1, 0..4), + Match::must(0, 5..8), +]); +# Ok::<(), Box<dyn std::error::Error>>(()) +``` + +# Example: use sparse DFAs + +By default, compiling a regex will use dense DFAs internally. This uses more +memory, but executes searches more quickly. If you can abide slower searches +(somewhere around 3-5x), then sparse DFAs might make more sense since they can +use significantly less space. + +Using sparse DFAs is as easy as using `Regex::new_sparse` instead of +`Regex::new`: + +``` +use regex_automata::{Match, dfa::regex::Regex}; + +let re = Regex::new_sparse(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap(); +let text = b"2018-12-24 2016-10-08"; +let matches: Vec<Match> = re.find_iter(text).collect(); +assert_eq!(matches, vec![ + Match::must(0, 0..10), + Match::must(0, 11..21), +]); +# Ok::<(), Box<dyn std::error::Error>>(()) +``` + +If you already have dense DFAs for some reason, they can be converted to sparse +DFAs and used to build a new `Regex`. For example: + +``` +use regex_automata::{Match, dfa::regex::Regex}; + +let dense_re = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap(); +let sparse_re = Regex::builder().build_from_dfas( + dense_re.forward().to_sparse()?, + dense_re.reverse().to_sparse()?, +); +let text = b"2018-12-24 2016-10-08"; +let matches: Vec<Match> = sparse_re.find_iter(text).collect(); +assert_eq!(matches, vec![ + Match::must(0, 0..10), + Match::must(0, 11..21), +]); +# Ok::<(), Box<dyn std::error::Error>>(()) +``` + +# Example: deserialize a DFA + +This shows how to first serialize a DFA into raw bytes, and then deserialize +those raw bytes back into a DFA. While this particular example is a +bit contrived, this same technique can be used in your program to +deserialize a DFA at start up time or by memory mapping a file. + +``` +use regex_automata::{Match, dfa::{dense, regex::Regex}}; + +let re1 = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap(); +// serialize both the forward and reverse DFAs, see note below +let (fwd_bytes, fwd_pad) = re1.forward().to_bytes_native_endian(); +let (rev_bytes, rev_pad) = re1.reverse().to_bytes_native_endian(); +// now deserialize both---we need to specify the correct type! +let fwd: dense::DFA<&[u32]> = dense::DFA::from_bytes(&fwd_bytes[fwd_pad..])?.0; +let rev: dense::DFA<&[u32]> = dense::DFA::from_bytes(&rev_bytes[rev_pad..])?.0; +// finally, reconstruct our regex +let re2 = Regex::builder().build_from_dfas(fwd, rev); + +// we can use it like normal +let text = b"2018-12-24 2016-10-08"; +let matches: Vec<Match> = re2.find_iter(text).collect(); +assert_eq!(matches, vec![ + Match::must(0, 0..10), + Match::must(0, 11..21), +]); +# Ok::<(), Box<dyn std::error::Error>>(()) +``` + +There are a few points worth noting here: + +* We need to extract the raw DFAs used by the regex and serialize those. You +can build the DFAs manually yourself using [`dense::Builder`], but using +the DFAs from a `Regex` guarantees that the DFAs are built correctly. (In +particular, a `Regex` constructs a reverse DFA for finding the starting +location of matches.) +* To convert the DFA to raw bytes, we use the `to_bytes_native_endian` method. +In practice, you'll want to use either [`dense::DFA::to_bytes_little_endian`] +or [`dense::DFA::to_bytes_big_endian`], depending on which platform you're +deserializing your DFA from. If you intend to deserialize on either platform, +then you'll need to serialize both and deserialize the right one depending on +your target's endianness. +* Safely deserializing a DFA requires verifying the raw bytes, particularly if +they are untrusted, since an invalid DFA could cause logical errors, panics +or even undefined behavior. This verification step requires visiting all of +the transitions in the DFA, which can be costly. If cheaper verification is +desired, then [`dense::DFA::from_bytes_unchecked`] is available that only does +verification that can be performed in constant time. However, one can only use +this routine if the caller can guarantee that the bytes provided encoded a +valid DFA. + +The same process can be achieved with sparse DFAs as well: + +``` +use regex_automata::{Match, dfa::{sparse, regex::Regex}}; + +let re1 = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap(); +// serialize both +let fwd_bytes = re1.forward().to_sparse()?.to_bytes_native_endian(); +let rev_bytes = re1.reverse().to_sparse()?.to_bytes_native_endian(); +// now deserialize both---we need to specify the correct type! +let fwd: sparse::DFA<&[u8]> = sparse::DFA::from_bytes(&fwd_bytes)?.0; +let rev: sparse::DFA<&[u8]> = sparse::DFA::from_bytes(&rev_bytes)?.0; +// finally, reconstruct our regex +let re2 = Regex::builder().build_from_dfas(fwd, rev); + +// we can use it like normal +let text = b"2018-12-24 2016-10-08"; +let matches: Vec<Match> = re2.find_iter(text).collect(); +assert_eq!(matches, vec![ + Match::must(0, 0..10), + Match::must(0, 11..21), +]); +# Ok::<(), Box<dyn std::error::Error>>(()) +``` + +Note that unlike dense DFAs, sparse DFAs have no alignment requirements. +Conversely, dense DFAs must be be aligned to the same alignment as a +[`StateID`](crate::util::primitives::StateID). + +# Support for `no_std` and `alloc`-only + +This crate comes with `alloc` and `std` features that are enabled by default. +When the `alloc` or `std` features are enabled, the API of this module will +include the facilities necessary for compiling, serializing, deserializing +and searching with DFAs. When only the `alloc` feature is enabled, then +implementations of the `std::error::Error` trait are dropped, but everything +else generally remains the same. When both the `alloc` and `std` features are +disabled, the API of this module will shrink such that it only includes the +facilities necessary for deserializing and searching with DFAs. + +The intended workflow for `no_std` environments is thus as follows: + +* Write a program with the `alloc` or `std` features that compiles and +serializes a regular expression. You may need to serialize both little and big +endian versions of each DFA. (So that's 4 DFAs in total for each regex.) +* In your `no_std` environment, follow the examples above for deserializing +your previously serialized DFAs into regexes. You can then search with them as +you would any regex. + +Deserialization can happen anywhere. For example, with bytes embedded into a +binary or with a file memory mapped at runtime. + +The `regex-cli` command (found in the same repository as this crate) can be +used to serialize DFAs to files and generate Rust code to read them. + +# Syntax + +This module supports the same syntax as the `regex` crate, since they share the +same parser. You can find an exhaustive list of supported syntax in the +[documentation for the `regex` crate](https://docs.rs/regex/1/regex/#syntax). + +There are two things that are not supported by the DFAs in this module: + +* Capturing groups. The DFAs (and [`Regex`](regex::Regex)es built on top +of them) can only find the offsets of an entire match, but cannot resolve +the offsets of each capturing group. This is because DFAs do not have the +expressive power necessary. +* Unicode word boundaries. These present particularly difficult challenges for +DFA construction and would result in an explosion in the number of states. +One can enable [`dense::Config::unicode_word_boundary`] though, which provides +heuristic support for Unicode word boundaries that only works on ASCII text. +Otherwise, one can use `(?-u:\b)` for an ASCII word boundary, which will work +on any input. + +There are no plans to lift either of these limitations. + +Note that these restrictions are identical to the restrictions on lazy DFAs. + +# Differences with general purpose regexes + +The main goal of the [`regex`](https://docs.rs/regex) crate is to serve as a +general purpose regular expression engine. It aims to automatically balance low +compile times, fast search times and low memory usage, while also providing +a convenient API for users. In contrast, this module provides a lower level +regular expression interface based exclusively on DFAs that is a bit less +convenient while providing more explicit control over memory usage and search +times. + +Here are some specific negative differences: + +* **Compilation can take an exponential amount of time and space** in the size +of the regex pattern. While most patterns do not exhibit worst case exponential +time, such patterns do exist. For example, `[01]*1[01]{N}` will build a DFA +with approximately `2^(N+2)` states. For this reason, untrusted patterns should +not be compiled with this module. (In the future, the API may expose an option +to return an error if the DFA gets too big.) +* This module does not support sub-match extraction via capturing groups, which +can be achieved with the regex crate's "captures" API. +* While the regex crate doesn't necessarily sport fast compilation times, +the regexes in this module are almost universally slow to compile, especially +when they contain large Unicode character classes. For example, on my system, +compiling `\w{50}` takes about 1 second and almost 15MB of memory! (Compiling +a sparse regex takes about the same time but only uses about 1.2MB of +memory.) Conversely, compiling the same regex without Unicode support, e.g., +`(?-u)\w{50}`, takes under 1 millisecond and about 15KB of memory. For this +reason, you should only use Unicode character classes if you absolutely need +them! (They are enabled by default though.) +* This module does not support Unicode word boundaries. ASCII word bondaries +may be used though by disabling Unicode or selectively doing so in the syntax, +e.g., `(?-u:\b)`. There is also an option to +[heuristically enable Unicode word boundaries](crate::dfa::dense::Config::unicode_word_boundary), +where the corresponding DFA will give up if any non-ASCII byte is seen. +* As a lower level API, this module does not do literal optimizations +automatically. Although it does provide hooks in its API to make use of the +[`Prefilter`](crate::util::prefilter::Prefilter) trait. Missing literal +optimizations means that searches may run much slower than what you're +accustomed to, although, it does provide more predictable and consistent +performance. +* There is no `&str` API like in the regex crate. In this module, all APIs +operate on `&[u8]`. By default, match indices are +guaranteed to fall on UTF-8 boundaries, unless either of +[`syntax::Config::utf8`](crate::util::syntax::Config::utf8) or +[`thompson::Config::utf8`](crate::nfa::thompson::Config::utf8) are disabled. + +With some of the downsides out of the way, here are some positive differences: + +* Both dense and sparse DFAs can be serialized to raw bytes, and then cheaply +deserialized. Deserialization can be done in constant time with the unchecked +APIs, since searching can be performed directly on the raw serialized bytes of +a DFA. +* This module was specifically designed so that the searching phase of a +DFA has minimal runtime requirements, and can therefore be used in `no_std` +environments. While `no_std` environments cannot compile regexes, they can +deserialize pre-compiled regexes. +* Since this module builds DFAs ahead of time, it will generally out-perform +the `regex` crate on equivalent tasks. The performance difference is likely +not large. However, because of a complex set of optimizations in the regex +crate (like literal optimizations), an accurate performance comparison may be +difficult to do. +* Sparse DFAs provide a way to build a DFA ahead of time that sacrifices search +performance a small amount, but uses much less storage space. Potentially even +less than what the regex crate uses. +* This module exposes DFAs directly, such as [`dense::DFA`] and +[`sparse::DFA`], which enables one to do less work in some cases. For example, +if you only need the end of a match and not the start of a match, then you can +use a DFA directly without building a `Regex`, which always requires a second +DFA to find the start of a match. +* This module provides more control over memory usage. Aside from choosing +between dense and sparse DFAs, one can also choose a smaller state identifier +representation to use less space. Also, one can enable DFA minimization +via [`dense::Config::minimize`], but it can increase compilation times +dramatically. +*/ + +#[cfg(feature = "dfa-search")] +pub use crate::dfa::{ + automaton::{Automaton, OverlappingState}, + start::StartKind, +}; + +/// This is an alias for a state ID of zero. It has special significance +/// because it always corresponds to the first state in a DFA, and the first +/// state in a DFA is always "dead." That is, the dead state always has all +/// of its transitions set to itself. Moreover, the dead state is used as a +/// sentinel for various things. e.g., In search, reaching a dead state means +/// that the search must stop. +const DEAD: crate::util::primitives::StateID = + crate::util::primitives::StateID::ZERO; + +#[cfg(feature = "dfa-search")] +pub mod dense; +#[cfg(feature = "dfa-onepass")] +pub mod onepass; +#[cfg(feature = "dfa-search")] +pub mod regex; +#[cfg(feature = "dfa-search")] +pub mod sparse; + +#[cfg(feature = "dfa-search")] +pub(crate) mod accel; +#[cfg(feature = "dfa-search")] +mod automaton; +#[cfg(feature = "dfa-build")] +mod determinize; +#[cfg(feature = "dfa-build")] +mod minimize; +#[cfg(any(feature = "dfa-build", feature = "dfa-onepass"))] +mod remapper; +#[cfg(feature = "dfa-search")] +mod search; +#[cfg(feature = "dfa-search")] +mod special; +#[cfg(feature = "dfa-search")] +mod start; diff --git a/third_party/rust/regex-automata/src/dfa/onepass.rs b/third_party/rust/regex-automata/src/dfa/onepass.rs new file mode 100644 index 0000000000..44691d0c8a --- /dev/null +++ b/third_party/rust/regex-automata/src/dfa/onepass.rs @@ -0,0 +1,3188 @@ +/*! +A DFA that can return spans for matching capturing groups. + +This module is the home of a [one-pass DFA](DFA). + +This module also contains a [`Builder`] and a [`Config`] for building and +configuring a one-pass DFA. +*/ + +// A note on naming and credit: +// +// As far as I know, Russ Cox came up with the practical vision and +// implementation of a "one-pass regex engine." He mentions and describes it +// briefly in the third article of his regexp article series: +// https://swtch.com/~rsc/regexp/regexp3.html +// +// Cox's implementation is in RE2, and the implementation below is most +// heavily inspired by RE2's. The key thing they have in common is that +// their transitions are defined over an alphabet of bytes. In contrast, +// Go's regex engine also has a one-pass engine, but its transitions are +// more firmly rooted on Unicode codepoints. The ideas are the same, but the +// implementations are different. +// +// RE2 tends to call this a "one-pass NFA." Here, we call it a "one-pass DFA." +// They're both true in their own ways: +// +// * The "one-pass" criterion is generally a property of the NFA itself. In +// particular, it is said that an NFA is one-pass if, after each byte of input +// during a search, there is at most one "VM thread" remaining to take for the +// next byte of input. That is, there is never any ambiguity as to the path to +// take through the NFA during a search. +// +// * On the other hand, once a one-pass NFA has its representation converted +// to something where a constant number of instructions is used for each byte +// of input, the implementation looks a lot more like a DFA. It's technically +// more powerful than a DFA since it has side effects (storing offsets inside +// of slots activated by a transition), but it is far closer to a DFA than an +// NFA simulation. +// +// Thus, in this crate, we call it a one-pass DFA. + +use alloc::{vec, vec::Vec}; + +use crate::{ + dfa::{remapper::Remapper, DEAD}, + nfa::thompson::{self, NFA}, + util::{ + alphabet::ByteClasses, + captures::Captures, + escape::DebugByte, + int::{Usize, U32, U64, U8}, + look::{Look, LookSet, UnicodeWordBoundaryError}, + primitives::{NonMaxUsize, PatternID, StateID}, + search::{Anchored, Input, Match, MatchError, MatchKind, Span}, + sparse_set::SparseSet, + }, +}; + +/// The configuration used for building a [one-pass DFA](DFA). +/// +/// A one-pass DFA configuration is a simple data object that is typically used +/// with [`Builder::configure`]. It can be cheaply cloned. +/// +/// A default configuration can be created either with `Config::new`, or +/// perhaps more conveniently, with [`DFA::config`]. +#[derive(Clone, Debug, Default)] +pub struct Config { + match_kind: Option<MatchKind>, + starts_for_each_pattern: Option<bool>, + byte_classes: Option<bool>, + size_limit: Option<Option<usize>>, +} + +impl Config { + /// Return a new default one-pass DFA configuration. + pub fn new() -> Config { + Config::default() + } + + /// Set the desired match semantics. + /// + /// The default is [`MatchKind::LeftmostFirst`], which corresponds to the + /// match semantics of Perl-like regex engines. That is, when multiple + /// patterns would match at the same leftmost position, the pattern that + /// appears first in the concrete syntax is chosen. + /// + /// Currently, the only other kind of match semantics supported is + /// [`MatchKind::All`]. This corresponds to "classical DFA" construction + /// where all possible matches are visited. + /// + /// When it comes to the one-pass DFA, it is rarer for preference order and + /// "longest match" to actually disagree. Since if they did disagree, then + /// the regex typically isn't one-pass. For example, searching `Samwise` + /// for `Sam|Samwise` will report `Sam` for leftmost-first matching and + /// `Samwise` for "longest match" or "all" matching. However, this regex is + /// not one-pass if taken literally. The equivalent regex, `Sam(?:|wise)` + /// is one-pass and `Sam|Samwise` may be optimized to it. + /// + /// The other main difference is that "all" match semantics don't support + /// non-greedy matches. "All" match semantics always try to match as much + /// as possible. + pub fn match_kind(mut self, kind: MatchKind) -> Config { + self.match_kind = Some(kind); + self + } + + /// Whether to compile a separate start state for each pattern in the + /// one-pass DFA. + /// + /// When enabled, a separate **anchored** start state is added for each + /// pattern in the DFA. When this start state is used, then the DFA will + /// only search for matches for the pattern specified, even if there are + /// other patterns in the DFA. + /// + /// The main downside of this option is that it can potentially increase + /// the size of the DFA and/or increase the time it takes to build the DFA. + /// + /// You might want to enable this option when you want to both search for + /// anchored matches of any pattern or to search for anchored matches of + /// one particular pattern while using the same DFA. (Otherwise, you would + /// need to compile a new DFA for each pattern.) + /// + /// By default this is disabled. + /// + /// # Example + /// + /// This example shows how to build a multi-regex and then search for + /// matches for a any of the patterns or matches for a specific pattern. + /// + /// ``` + /// use regex_automata::{ + /// dfa::onepass::DFA, Anchored, Input, Match, PatternID, + /// }; + /// + /// let re = DFA::builder() + /// .configure(DFA::config().starts_for_each_pattern(true)) + /// .build_many(&["[a-z]+", "[0-9]+"])?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// let haystack = "123abc"; + /// let input = Input::new(haystack).anchored(Anchored::Yes); + /// + /// // A normal multi-pattern search will show pattern 1 matches. + /// re.try_search(&mut cache, &input, &mut caps)?; + /// assert_eq!(Some(Match::must(1, 0..3)), caps.get_match()); + /// + /// // If we only want to report pattern 0 matches, then we'll get no + /// // match here. + /// let input = input.anchored(Anchored::Pattern(PatternID::must(0))); + /// re.try_search(&mut cache, &input, &mut caps)?; + /// assert_eq!(None, caps.get_match()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn starts_for_each_pattern(mut self, yes: bool) -> Config { + self.starts_for_each_pattern = Some(yes); + self + } + + /// Whether to attempt to shrink the size of the DFA's alphabet or not. + /// + /// This option is enabled by default and should never be disabled unless + /// one is debugging a one-pass DFA. + /// + /// When enabled, the DFA will use a map from all possible bytes to their + /// corresponding equivalence class. Each equivalence class represents a + /// set of bytes that does not discriminate between a match and a non-match + /// in the DFA. For example, the pattern `[ab]+` has at least two + /// equivalence classes: a set containing `a` and `b` and a set containing + /// every byte except for `a` and `b`. `a` and `b` are in the same + /// equivalence class because they never discriminate between a match and a + /// non-match. + /// + /// The advantage of this map is that the size of the transition table + /// can be reduced drastically from (approximately) `#states * 256 * + /// sizeof(StateID)` to `#states * k * sizeof(StateID)` where `k` is the + /// number of equivalence classes (rounded up to the nearest power of 2). + /// As a result, total space usage can decrease substantially. Moreover, + /// since a smaller alphabet is used, DFA compilation becomes faster as + /// well. + /// + /// **WARNING:** This is only useful for debugging DFAs. Disabling this + /// does not yield any speed advantages. Namely, even when this is + /// disabled, a byte class map is still used while searching. The only + /// difference is that every byte will be forced into its own distinct + /// equivalence class. This is useful for debugging the actual generated + /// transitions because it lets one see the transitions defined on actual + /// bytes instead of the equivalence classes. + pub fn byte_classes(mut self, yes: bool) -> Config { + self.byte_classes = Some(yes); + self + } + + /// Set a size limit on the total heap used by a one-pass DFA. + /// + /// This size limit is expressed in bytes and is applied during + /// construction of a one-pass DFA. If the DFA's heap usage exceeds + /// this configured limit, then construction is stopped and an error is + /// returned. + /// + /// The default is no limit. + /// + /// # Example + /// + /// This example shows a one-pass DFA that fails to build because of + /// a configured size limit. This particular example also serves as a + /// cautionary tale demonstrating just how big DFAs with large Unicode + /// character classes can get. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{dfa::onepass::DFA, Match}; + /// + /// // 6MB isn't enough! + /// DFA::builder() + /// .configure(DFA::config().size_limit(Some(6_000_000))) + /// .build(r"\w{20}") + /// .unwrap_err(); + /// + /// // ... but 7MB probably is! + /// // (Note that DFA sizes aren't necessarily stable between releases.) + /// let re = DFA::builder() + /// .configure(DFA::config().size_limit(Some(7_000_000))) + /// .build(r"\w{20}")?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// let haystack = "A".repeat(20); + /// re.captures(&mut cache, &haystack, &mut caps); + /// assert_eq!(Some(Match::must(0, 0..20)), caps.get_match()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// While one needs a little more than 3MB to represent `\w{20}`, it + /// turns out that you only need a little more than 4KB to represent + /// `(?-u:\w{20})`. So only use Unicode if you need it! + pub fn size_limit(mut self, limit: Option<usize>) -> Config { + self.size_limit = Some(limit); + self + } + + /// Returns the match semantics set in this configuration. + pub fn get_match_kind(&self) -> MatchKind { + self.match_kind.unwrap_or(MatchKind::LeftmostFirst) + } + + /// Returns whether this configuration has enabled anchored starting states + /// for every pattern in the DFA. + pub fn get_starts_for_each_pattern(&self) -> bool { + self.starts_for_each_pattern.unwrap_or(false) + } + + /// Returns whether this configuration has enabled byte classes or not. + /// This is typically a debugging oriented option, as disabling it confers + /// no speed benefit. + pub fn get_byte_classes(&self) -> bool { + self.byte_classes.unwrap_or(true) + } + + /// Returns the DFA size limit of this configuration if one was set. + /// The size limit is total number of bytes on the heap that a DFA is + /// permitted to use. If the DFA exceeds this limit during construction, + /// then construction is stopped and an error is returned. + pub fn get_size_limit(&self) -> Option<usize> { + self.size_limit.unwrap_or(None) + } + + /// Overwrite the default configuration such that the options in `o` are + /// always used. If an option in `o` is not set, then the corresponding + /// option in `self` is used. If it's not set in `self` either, then it + /// remains not set. + pub(crate) fn overwrite(&self, o: Config) -> Config { + Config { + match_kind: o.match_kind.or(self.match_kind), + starts_for_each_pattern: o + .starts_for_each_pattern + .or(self.starts_for_each_pattern), + byte_classes: o.byte_classes.or(self.byte_classes), + size_limit: o.size_limit.or(self.size_limit), + } + } +} + +/// A builder for a [one-pass DFA](DFA). +/// +/// This builder permits configuring options for the syntax of a pattern, the +/// NFA construction and the DFA construction. This builder is different from a +/// general purpose regex builder in that it permits fine grain configuration +/// of the construction process. The trade off for this is complexity, and +/// the possibility of setting a configuration that might not make sense. For +/// example, there are two different UTF-8 modes: +/// +/// * [`syntax::Config::utf8`](crate::util::syntax::Config::utf8) controls +/// whether the pattern itself can contain sub-expressions that match invalid +/// UTF-8. +/// * [`thompson::Config::utf8`] controls whether empty matches that split a +/// Unicode codepoint are reported or not. +/// +/// Generally speaking, callers will want to either enable all of these or +/// disable all of these. +/// +/// # Example +/// +/// This example shows how to disable UTF-8 mode in the syntax and the NFA. +/// This is generally what you want for matching on arbitrary bytes. +/// +/// ``` +/// # if cfg!(miri) { return Ok(()); } // miri takes too long +/// use regex_automata::{ +/// dfa::onepass::DFA, +/// nfa::thompson, +/// util::syntax, +/// Match, +/// }; +/// +/// let re = DFA::builder() +/// .syntax(syntax::Config::new().utf8(false)) +/// .thompson(thompson::Config::new().utf8(false)) +/// .build(r"foo(?-u:[^b])ar.*")?; +/// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); +/// +/// let haystack = b"foo\xFFarzz\xE2\x98\xFF\n"; +/// re.captures(&mut cache, haystack, &mut caps); +/// // Notice that `(?-u:[^b])` matches invalid UTF-8, +/// // but the subsequent `.*` does not! Disabling UTF-8 +/// // on the syntax permits this. +/// // +/// // N.B. This example does not show the impact of +/// // disabling UTF-8 mode on a one-pass DFA Config, +/// // since that only impacts regexes that can +/// // produce matches of length 0. +/// assert_eq!(Some(Match::must(0, 0..8)), caps.get_match()); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +#[derive(Clone, Debug)] +pub struct Builder { + config: Config, + #[cfg(feature = "syntax")] + thompson: thompson::Compiler, +} + +impl Builder { + /// Create a new one-pass DFA builder with the default configuration. + pub fn new() -> Builder { + Builder { + config: Config::default(), + #[cfg(feature = "syntax")] + thompson: thompson::Compiler::new(), + } + } + + /// Build a one-pass DFA from the given pattern. + /// + /// If there was a problem parsing or compiling the pattern, then an error + /// is returned. + #[cfg(feature = "syntax")] + pub fn build(&self, pattern: &str) -> Result<DFA, BuildError> { + self.build_many(&[pattern]) + } + + /// Build a one-pass DFA from the given patterns. + /// + /// When matches are returned, the pattern ID corresponds to the index of + /// the pattern in the slice given. + #[cfg(feature = "syntax")] + pub fn build_many<P: AsRef<str>>( + &self, + patterns: &[P], + ) -> Result<DFA, BuildError> { + let nfa = + self.thompson.build_many(patterns).map_err(BuildError::nfa)?; + self.build_from_nfa(nfa) + } + + /// Build a DFA from the given NFA. + /// + /// # Example + /// + /// This example shows how to build a DFA if you already have an NFA in + /// hand. + /// + /// ``` + /// use regex_automata::{dfa::onepass::DFA, nfa::thompson::NFA, Match}; + /// + /// // This shows how to set non-default options for building an NFA. + /// let nfa = NFA::compiler() + /// .configure(NFA::config().shrink(true)) + /// .build(r"[a-z0-9]+")?; + /// let re = DFA::builder().build_from_nfa(nfa)?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// re.captures(&mut cache, "foo123bar", &mut caps); + /// assert_eq!(Some(Match::must(0, 0..9)), caps.get_match()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn build_from_nfa(&self, nfa: NFA) -> Result<DFA, BuildError> { + // Why take ownership if we're just going to pass a reference to the + // NFA to our internal builder? Well, the first thing to note is that + // an NFA uses reference counting internally, so either choice is going + // to be cheap. So there isn't much cost either way. + // + // The real reason is that a one-pass DFA, semantically, shares + // ownership of an NFA. This is unlike other DFAs that don't share + // ownership of an NFA at all, primarily because they want to be + // self-contained in order to support cheap (de)serialization. + // + // But then why pass a '&nfa' below if we want to share ownership? + // Well, it turns out that using a '&NFA' in our internal builder + // separates its lifetime from the DFA we're building, and this turns + // out to make code a bit more composable. e.g., We can iterate over + // things inside the NFA while borrowing the builder as mutable because + // we know the NFA cannot be mutated. So TL;DR --- this weirdness is + // "because borrow checker." + InternalBuilder::new(self.config.clone(), &nfa).build() + } + + /// Apply the given one-pass DFA configuration options to this builder. + pub fn configure(&mut self, config: Config) -> &mut Builder { + self.config = self.config.overwrite(config); + self + } + + /// Set the syntax configuration for this builder using + /// [`syntax::Config`](crate::util::syntax::Config). + /// + /// This permits setting things like case insensitivity, Unicode and multi + /// line mode. + /// + /// These settings only apply when constructing a one-pass DFA directly + /// from a pattern. + #[cfg(feature = "syntax")] + pub fn syntax( + &mut self, + config: crate::util::syntax::Config, + ) -> &mut Builder { + self.thompson.syntax(config); + self + } + + /// Set the Thompson NFA configuration for this builder using + /// [`nfa::thompson::Config`](crate::nfa::thompson::Config). + /// + /// This permits setting things like whether additional time should be + /// spent shrinking the size of the NFA. + /// + /// These settings only apply when constructing a DFA directly from a + /// pattern. + #[cfg(feature = "syntax")] + pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder { + self.thompson.configure(config); + self + } +} + +/// An internal builder for encapsulating the state necessary to build a +/// one-pass DFA. Typical use is just `InternalBuilder::new(..).build()`. +/// +/// There is no separate pass for determining whether the NFA is one-pass or +/// not. We just try to build the DFA. If during construction we discover that +/// it is not one-pass, we bail out. This is likely to lead to some undesirable +/// expense in some cases, so it might make sense to try an identify common +/// patterns in the NFA that make it definitively not one-pass. That way, we +/// can avoid ever trying to build a one-pass DFA in the first place. For +/// example, '\w*\s' is not one-pass, and since '\w' is Unicode-aware by +/// default, it's probably not a trivial cost to try and build a one-pass DFA +/// for it and then fail. +/// +/// Note that some (immutable) fields are duplicated here. For example, the +/// 'nfa' and 'classes' fields are both in the 'DFA'. They are the same thing, +/// but we duplicate them because it makes composition easier below. Otherwise, +/// since the borrow checker can't see through method calls, the mutable borrow +/// we use to mutate the DFA winds up preventing borrowing from any other part +/// of the DFA, even though we aren't mutating those parts. We only do this +/// because the duplication is cheap. +#[derive(Debug)] +struct InternalBuilder<'a> { + /// The DFA we're building. + dfa: DFA, + /// An unordered collection of NFA state IDs that we haven't yet tried to + /// build into a DFA state yet. + /// + /// This collection does not ultimately wind up including every NFA state + /// ID. Instead, each ID represents a "start" state for a sub-graph of the + /// NFA. The set of NFA states we then use to build a DFA state consists + /// of that "start" state and all states reachable from it via epsilon + /// transitions. + uncompiled_nfa_ids: Vec<StateID>, + /// A map from NFA state ID to DFA state ID. This is useful for easily + /// determining whether an NFA state has been used as a "starting" point + /// to build a DFA state yet. If it hasn't, then it is mapped to DEAD, + /// and since DEAD is specially added and never corresponds to any NFA + /// state, it follows that a mapping to DEAD implies the NFA state has + /// no corresponding DFA state yet. + nfa_to_dfa_id: Vec<StateID>, + /// A stack used to traverse the NFA states that make up a single DFA + /// state. Traversal occurs until the stack is empty, and we only push to + /// the stack when the state ID isn't in 'seen'. Actually, even more than + /// that, if we try to push something on to this stack that is already in + /// 'seen', then we bail out on construction completely, since it implies + /// that the NFA is not one-pass. + stack: Vec<(StateID, Epsilons)>, + /// The set of NFA states that we've visited via 'stack'. + seen: SparseSet, + /// Whether a match NFA state has been observed while constructing a + /// one-pass DFA state. Once a match state is seen, assuming we are using + /// leftmost-first match semantics, then we don't add any more transitions + /// to the DFA state we're building. + matched: bool, + /// The config passed to the builder. + /// + /// This is duplicated in dfa.config. + config: Config, + /// The NFA we're building a one-pass DFA from. + /// + /// This is duplicated in dfa.nfa. + nfa: &'a NFA, + /// The equivalence classes that make up the alphabet for this DFA> + /// + /// This is duplicated in dfa.classes. + classes: ByteClasses, +} + +impl<'a> InternalBuilder<'a> { + /// Create a new builder with an initial empty DFA. + fn new(config: Config, nfa: &'a NFA) -> InternalBuilder { + let classes = if !config.get_byte_classes() { + // A one-pass DFA will always use the equivalence class map, but + // enabling this option is useful for debugging. Namely, this will + // cause all transitions to be defined over their actual bytes + // instead of an opaque equivalence class identifier. The former is + // much easier to grok as a human. + ByteClasses::singletons() + } else { + nfa.byte_classes().clone() + }; + // Normally a DFA alphabet includes the EOI symbol, but we don't need + // that in the one-pass DFA since we handle look-around explicitly + // without encoding it into the DFA. Thus, we don't need to delay + // matches by 1 byte. However, we reuse the space that *would* be used + // by the EOI transition by putting match information there (like which + // pattern matches and which look-around assertions need to hold). So + // this means our real alphabet length is 1 fewer than what the byte + // classes report, since we don't use EOI. + let alphabet_len = classes.alphabet_len().checked_sub(1).unwrap(); + let stride2 = classes.stride2(); + let dfa = DFA { + config: config.clone(), + nfa: nfa.clone(), + table: vec![], + starts: vec![], + // Since one-pass DFAs have a smaller state ID max than + // StateID::MAX, it follows that StateID::MAX is a valid initial + // value for min_match_id since no state ID can ever be greater + // than it. In the case of a one-pass DFA with no match states, the + // min_match_id will keep this sentinel value. + min_match_id: StateID::MAX, + classes: classes.clone(), + alphabet_len, + stride2, + pateps_offset: alphabet_len, + // OK because PatternID::MAX*2 is guaranteed not to overflow. + explicit_slot_start: nfa.pattern_len().checked_mul(2).unwrap(), + }; + InternalBuilder { + dfa, + uncompiled_nfa_ids: vec![], + nfa_to_dfa_id: vec![DEAD; nfa.states().len()], + stack: vec![], + seen: SparseSet::new(nfa.states().len()), + matched: false, + config, + nfa, + classes, + } + } + + /// Build the DFA from the NFA given to this builder. If the NFA is not + /// one-pass, then return an error. An error may also be returned if a + /// particular limit is exceeded. (Some limits, like the total heap memory + /// used, are configurable. Others, like the total patterns or slots, are + /// hard-coded based on representational limitations.) + fn build(mut self) -> Result<DFA, BuildError> { + self.nfa.look_set_any().available().map_err(BuildError::word)?; + for look in self.nfa.look_set_any().iter() { + // This is a future incompatibility check where if we add any + // more look-around assertions, then the one-pass DFA either + // needs to reject them (what we do here) or it needs to have its + // Transition representation modified to be capable of storing the + // new assertions. + if look.as_repr() > Look::WordUnicodeNegate.as_repr() { + return Err(BuildError::unsupported_look(look)); + } + } + if self.nfa.pattern_len().as_u64() > PatternEpsilons::PATTERN_ID_LIMIT + { + return Err(BuildError::too_many_patterns( + PatternEpsilons::PATTERN_ID_LIMIT, + )); + } + if self.nfa.group_info().explicit_slot_len() > Slots::LIMIT { + return Err(BuildError::not_one_pass( + "too many explicit capturing groups (max is 16)", + )); + } + assert_eq!(DEAD, self.add_empty_state()?); + + // This is where the explicit slots start. We care about this because + // we only need to track explicit slots. The implicit slots---two for + // each pattern---are tracked as part of the search routine itself. + let explicit_slot_start = self.nfa.pattern_len() * 2; + self.add_start_state(None, self.nfa.start_anchored())?; + if self.config.get_starts_for_each_pattern() { + for pid in self.nfa.patterns() { + self.add_start_state( + Some(pid), + self.nfa.start_pattern(pid).unwrap(), + )?; + } + } + // NOTE: One wonders what the effects of treating 'uncompiled_nfa_ids' + // as a stack are. It is really an unordered *set* of NFA state IDs. + // If it, for example, in practice led to discovering whether a regex + // was or wasn't one-pass later than if we processed NFA state IDs in + // ascending order, then that would make this routine more costly in + // the somewhat common case of a regex that isn't one-pass. + while let Some(nfa_id) = self.uncompiled_nfa_ids.pop() { + let dfa_id = self.nfa_to_dfa_id[nfa_id]; + // Once we see a match, we keep going, but don't add any new + // transitions. Normally we'd just stop, but we have to keep + // going in order to verify that our regex is actually one-pass. + self.matched = false; + // The NFA states we've already explored for this DFA state. + self.seen.clear(); + // The NFA states to explore via epsilon transitions. If we ever + // try to push an NFA state that we've already seen, then the NFA + // is not one-pass because it implies there are multiple epsilon + // transition paths that lead to the same NFA state. In other + // words, there is ambiguity. + self.stack_push(nfa_id, Epsilons::empty())?; + while let Some((id, epsilons)) = self.stack.pop() { + match *self.nfa.state(id) { + thompson::State::ByteRange { ref trans } => { + self.compile_transition(dfa_id, trans, epsilons)?; + } + thompson::State::Sparse(ref sparse) => { + for trans in sparse.transitions.iter() { + self.compile_transition(dfa_id, trans, epsilons)?; + } + } + thompson::State::Dense(ref dense) => { + for trans in dense.iter() { + self.compile_transition(dfa_id, &trans, epsilons)?; + } + } + thompson::State::Look { look, next } => { + let looks = epsilons.looks().insert(look); + self.stack_push(next, epsilons.set_looks(looks))?; + } + thompson::State::Union { ref alternates } => { + for &sid in alternates.iter().rev() { + self.stack_push(sid, epsilons)?; + } + } + thompson::State::BinaryUnion { alt1, alt2 } => { + self.stack_push(alt2, epsilons)?; + self.stack_push(alt1, epsilons)?; + } + thompson::State::Capture { next, slot, .. } => { + let slot = slot.as_usize(); + let epsilons = if slot < explicit_slot_start { + // If this is an implicit slot, we don't care + // about it, since we handle implicit slots in + // the search routine. We can get away with that + // because there are 2 implicit slots for every + // pattern. + epsilons + } else { + // Offset our explicit slots so that they start + // at index 0. + let offset = slot - explicit_slot_start; + epsilons.set_slots(epsilons.slots().insert(offset)) + }; + self.stack_push(next, epsilons)?; + } + thompson::State::Fail => { + continue; + } + thompson::State::Match { pattern_id } => { + // If we found two different paths to a match state + // for the same DFA state, then we have ambiguity. + // Thus, it's not one-pass. + if self.matched { + return Err(BuildError::not_one_pass( + "multiple epsilon transitions to match state", + )); + } + self.matched = true; + // Shove the matching pattern ID and the 'epsilons' + // into the current DFA state's pattern epsilons. The + // 'epsilons' includes the slots we need to capture + // before reporting the match and also the conditional + // epsilon transitions we need to check before we can + // report a match. + self.dfa.set_pattern_epsilons( + dfa_id, + PatternEpsilons::empty() + .set_pattern_id(pattern_id) + .set_epsilons(epsilons), + ); + // N.B. It is tempting to just bail out here when + // compiling a leftmost-first DFA, since we will never + // compile any more transitions in that case. But we + // actually need to keep going in order to verify that + // we actually have a one-pass regex. e.g., We might + // see more Match states (e.g., for other patterns) + // that imply that we don't have a one-pass regex. + // So instead, we mark that we've found a match and + // continue on. When we go to compile a new DFA state, + // we just skip that part. But otherwise check that the + // one-pass property is upheld. + } + } + } + } + self.shuffle_states(); + Ok(self.dfa) + } + + /// Shuffle all match states to the end of the transition table and set + /// 'min_match_id' to the ID of the first such match state. + /// + /// The point of this is to make it extremely cheap to determine whether + /// a state is a match state or not. We need to check on this on every + /// transition during a search, so it being cheap is important. This + /// permits us to check it by simply comparing two state identifiers, as + /// opposed to looking for the pattern ID in the state's `PatternEpsilons`. + /// (Which requires a memory load and some light arithmetic.) + fn shuffle_states(&mut self) { + let mut remapper = Remapper::new(&self.dfa); + let mut next_dest = self.dfa.last_state_id(); + for i in (0..self.dfa.state_len()).rev() { + let id = StateID::must(i); + let is_match = + self.dfa.pattern_epsilons(id).pattern_id().is_some(); + if !is_match { + continue; + } + remapper.swap(&mut self.dfa, next_dest, id); + self.dfa.min_match_id = next_dest; + next_dest = self.dfa.prev_state_id(next_dest).expect( + "match states should be a proper subset of all states", + ); + } + remapper.remap(&mut self.dfa); + } + + /// Compile the given NFA transition into the DFA state given. + /// + /// 'Epsilons' corresponds to any conditional epsilon transitions that need + /// to be satisfied to follow this transition, and any slots that need to + /// be saved if the transition is followed. + /// + /// If this transition indicates that the NFA is not one-pass, then + /// this returns an error. (This occurs, for example, if the DFA state + /// already has a transition defined for the same input symbols as the + /// given transition, *and* the result of the old and new transitions is + /// different.) + fn compile_transition( + &mut self, + dfa_id: StateID, + trans: &thompson::Transition, + epsilons: Epsilons, + ) -> Result<(), BuildError> { + let next_dfa_id = self.add_dfa_state_for_nfa_state(trans.next)?; + for byte in self + .classes + .representatives(trans.start..=trans.end) + .filter_map(|r| r.as_u8()) + { + let oldtrans = self.dfa.transition(dfa_id, byte); + let newtrans = + Transition::new(self.matched, next_dfa_id, epsilons); + // If the old transition points to the DEAD state, then we know + // 'byte' has not been mapped to any transition for this DFA state + // yet. So set it unconditionally. Otherwise, we require that the + // old and new transitions are equivalent. Otherwise, there is + // ambiguity and thus the regex is not one-pass. + if oldtrans.state_id() == DEAD { + self.dfa.set_transition(dfa_id, byte, newtrans); + } else if oldtrans != newtrans { + return Err(BuildError::not_one_pass( + "conflicting transition", + )); + } + } + Ok(()) + } + + /// Add a start state to the DFA corresponding to the given NFA starting + /// state ID. + /// + /// If adding a state would blow any limits (configured or hard-coded), + /// then an error is returned. + /// + /// If the starting state is an anchored state for a particular pattern, + /// then callers must provide the pattern ID for that starting state. + /// Callers must also ensure that the first starting state added is the + /// start state for all patterns, and then each anchored starting state for + /// each pattern (if necessary) added in order. Otherwise, this panics. + fn add_start_state( + &mut self, + pid: Option<PatternID>, + nfa_id: StateID, + ) -> Result<StateID, BuildError> { + match pid { + // With no pid, this should be the start state for all patterns + // and thus be the first one. + None => assert!(self.dfa.starts.is_empty()), + // With a pid, we want it to be at self.dfa.starts[pid+1]. + Some(pid) => assert!(self.dfa.starts.len() == pid.one_more()), + } + let dfa_id = self.add_dfa_state_for_nfa_state(nfa_id)?; + self.dfa.starts.push(dfa_id); + Ok(dfa_id) + } + + /// Add a new DFA state corresponding to the given NFA state. If adding a + /// state would blow any limits (configured or hard-coded), then an error + /// is returned. If a DFA state already exists for the given NFA state, + /// then that DFA state's ID is returned and no new states are added. + /// + /// It is not expected that this routine is called for every NFA state. + /// Instead, an NFA state ID will usually correspond to the "start" state + /// for a sub-graph of the NFA, where all states in the sub-graph are + /// reachable via epsilon transitions (conditional or unconditional). That + /// sub-graph of NFA states is ultimately what produces a single DFA state. + fn add_dfa_state_for_nfa_state( + &mut self, + nfa_id: StateID, + ) -> Result<StateID, BuildError> { + // If we've already built a DFA state for the given NFA state, then + // just return that. We definitely do not want to have more than one + // DFA state in existence for the same NFA state, since all but one of + // them will likely become unreachable. And at least some of them are + // likely to wind up being incomplete. + let existing_dfa_id = self.nfa_to_dfa_id[nfa_id]; + if existing_dfa_id != DEAD { + return Ok(existing_dfa_id); + } + // If we don't have any DFA state yet, add it and then add the given + // NFA state to the list of states to explore. + let dfa_id = self.add_empty_state()?; + self.nfa_to_dfa_id[nfa_id] = dfa_id; + self.uncompiled_nfa_ids.push(nfa_id); + Ok(dfa_id) + } + + /// Unconditionally add a new empty DFA state. If adding it would exceed + /// any limits (configured or hard-coded), then an error is returned. The + /// ID of the new state is returned on success. + /// + /// The added state is *not* a match state. + fn add_empty_state(&mut self) -> Result<StateID, BuildError> { + let state_limit = Transition::STATE_ID_LIMIT; + // Note that unlike dense and lazy DFAs, we specifically do NOT + // premultiply our state IDs here. The reason is that we want to pack + // our state IDs into 64-bit transitions with other info, so the fewer + // the bits we use for state IDs the better. If we premultiply, then + // our state ID space shrinks. We justify this by the assumption that + // a one-pass DFA is just already doing a fair bit more work than a + // normal DFA anyway, so an extra multiplication to compute a state + // transition doesn't seem like a huge deal. + let next_id = self.dfa.table.len() >> self.dfa.stride2(); + let id = StateID::new(next_id) + .map_err(|_| BuildError::too_many_states(state_limit))?; + if id.as_u64() > Transition::STATE_ID_LIMIT { + return Err(BuildError::too_many_states(state_limit)); + } + self.dfa + .table + .extend(core::iter::repeat(Transition(0)).take(self.dfa.stride())); + // The default empty value for 'PatternEpsilons' is sadly not all + // zeroes. Instead, a special sentinel is used to indicate that there + // is no pattern. So we need to explicitly set the pattern epsilons to + // the correct "empty" PatternEpsilons. + self.dfa.set_pattern_epsilons(id, PatternEpsilons::empty()); + if let Some(size_limit) = self.config.get_size_limit() { + if self.dfa.memory_usage() > size_limit { + return Err(BuildError::exceeded_size_limit(size_limit)); + } + } + Ok(id) + } + + /// Push the given NFA state ID and its corresponding epsilons (slots and + /// conditional epsilon transitions) on to a stack for use in a depth first + /// traversal of a sub-graph of the NFA. + /// + /// If the given NFA state ID has already been pushed on to the stack, then + /// it indicates the regex is not one-pass and this correspondingly returns + /// an error. + fn stack_push( + &mut self, + nfa_id: StateID, + epsilons: Epsilons, + ) -> Result<(), BuildError> { + // If we already have seen a match and we are compiling a leftmost + // first DFA, then we shouldn't add any more states to look at. This is + // effectively how preference order and non-greediness is implemented. + // if !self.config.get_match_kind().continue_past_first_match() + // && self.matched + // { + // return Ok(()); + // } + if !self.seen.insert(nfa_id) { + return Err(BuildError::not_one_pass( + "multiple epsilon transitions to same state", + )); + } + self.stack.push((nfa_id, epsilons)); + Ok(()) + } +} + +/// A one-pass DFA for executing a subset of anchored regex searches while +/// resolving capturing groups. +/// +/// A one-pass DFA can be built from an NFA that is one-pass. An NFA is +/// one-pass when there is never any ambiguity about how to continue a search. +/// For example, `a*a` is not one-pass becuase during a search, it's not +/// possible to know whether to continue matching the `a*` or to move on to +/// the single `a`. However, `a*b` is one-pass, because for every byte in the +/// input, it's always clear when to move on from `a*` to `b`. +/// +/// # Only anchored searches are supported +/// +/// In this crate, especially for DFAs, unanchored searches are implemented by +/// treating the pattern as if it had a `(?s-u:.)*?` prefix. While the prefix +/// is one-pass on its own, adding anything after it, e.g., `(?s-u:.)*?a` will +/// make the overall pattern not one-pass. Why? Because the `(?s-u:.)` matches +/// any byte, and there is therefore ambiguity as to when the prefix should +/// stop matching and something else should start matching. +/// +/// Therefore, one-pass DFAs do not support unanchored searches. In addition +/// to many regexes simply not being one-pass, it implies that one-pass DFAs +/// have limited utility. With that said, when a one-pass DFA can be used, it +/// can potentially provide a dramatic speed up over alternatives like the +/// [`BoundedBacktracker`](crate::nfa::thompson::backtrack::BoundedBacktracker) +/// and the [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM). In particular, +/// a one-pass DFA is the only DFA capable of reporting the spans of matching +/// capturing groups. +/// +/// To clarify, when we say that unanchored searches are not supported, what +/// that actually means is: +/// +/// * The high level routines, [`DFA::is_match`] and [`DFA::captures`], always +/// do anchored searches. +/// * Since iterators are most useful in the context of unanchored searches, +/// there is no `DFA::captures_iter` method. +/// * For lower level routines like [`DFA::try_search`], an error will be +/// returned if the given [`Input`] is configured to do an unanchored search or +/// search for an invalid pattern ID. (Note that an [`Input`] is configured to +/// do an unanchored search by default, so just giving a `Input::new` is +/// guaranteed to return an error.) +/// +/// # Other limitations +/// +/// In addition to the [configurable heap limit](Config::size_limit) and +/// the requirement that a regex pattern be one-pass, there are some other +/// limitations: +/// +/// * There is an internal limit on the total number of explicit capturing +/// groups that appear across all patterns. It is somewhat small and there is +/// no way to configure it. If your pattern(s) exceed this limit, then building +/// a one-pass DFA will fail. +/// * If the number of patterns exceeds an internal unconfigurable limit, then +/// building a one-pass DFA will fail. This limit is quite large and you're +/// unlikely to hit it. +/// * If the total number of states exceeds an internal unconfigurable limit, +/// then building a one-pass DFA will fail. This limit is quite large and +/// you're unlikely to hit it. +/// +/// # Other examples of regexes that aren't one-pass +/// +/// One particularly unfortunate example is that enabling Unicode can cause +/// regexes that were one-pass to no longer be one-pass. Consider the regex +/// `(?-u)\w*\s` for example. It is one-pass because there is exactly no +/// overlap between the ASCII definitions of `\w` and `\s`. But `\w*\s` +/// (i.e., with Unicode enabled) is *not* one-pass because `\w` and `\s` get +/// translated to UTF-8 automatons. And while the *codepoints* in `\w` and `\s` +/// do not overlap, the underlying UTF-8 encodings do. Indeed, because of the +/// overlap between UTF-8 automata, the use of Unicode character classes will +/// tend to vastly increase the likelihood of a regex not being one-pass. +/// +/// # How does one know if a regex is one-pass or not? +/// +/// At the time of writing, the only way to know is to try and build a one-pass +/// DFA. The one-pass property is checked while constructing the DFA. +/// +/// This does mean that you might potentially waste some CPU cycles and memory +/// by optimistically trying to build a one-pass DFA. But this is currently the +/// only way. In the future, building a one-pass DFA might be able to use some +/// heuristics to detect common violations of the one-pass property and bail +/// more quickly. +/// +/// # Resource usage +/// +/// Unlike a general DFA, a one-pass DFA has stricter bounds on its resource +/// usage. Namely, construction of a one-pass DFA has a time and space +/// complexity of `O(n)`, where `n ~ nfa.states().len()`. (A general DFA's time +/// and space complexity is `O(2^n)`.) This smaller time bound is achieved +/// because there is at most one DFA state created for each NFA state. If +/// additional DFA states would be required, then the pattern is not one-pass +/// and construction will fail. +/// +/// Note though that currently, this DFA uses a fully dense representation. +/// This means that while its space complexity is no worse than an NFA, it may +/// in practice use more memory because of higher constant factors. The reason +/// for this trade off is two-fold. Firstly, a dense representation makes the +/// search faster. Secondly, the bigger an NFA, the more unlikely it is to be +/// one-pass. Therefore, most one-pass DFAs are usually pretty small. +/// +/// # Example +/// +/// This example shows that the one-pass DFA implements Unicode word boundaries +/// correctly while simultaneously reporting spans for capturing groups that +/// participate in a match. (This is the only DFA that implements full support +/// for Unicode word boundaries.) +/// +/// ``` +/// # if cfg!(miri) { return Ok(()); } // miri takes too long +/// use regex_automata::{dfa::onepass::DFA, Match, Span}; +/// +/// let re = DFA::new(r"\b(?P<first>\w+)[[:space:]]+(?P<last>\w+)\b")?; +/// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); +/// +/// re.captures(&mut cache, "Шерлок Холмс", &mut caps); +/// assert_eq!(Some(Match::must(0, 0..23)), caps.get_match()); +/// assert_eq!(Some(Span::from(0..12)), caps.get_group_by_name("first")); +/// assert_eq!(Some(Span::from(13..23)), caps.get_group_by_name("last")); +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +/// +/// # Example: iteration +/// +/// Unlike other regex engines in this crate, this one does not provide +/// iterator search functions. This is because a one-pass DFA only supports +/// anchored searches, and so iterator functions are generally not applicable. +/// +/// However, if you know that all of your matches are +/// directly adjacent, then an iterator can be used. The +/// [`util::iter::Searcher`](crate::util::iter::Searcher) type can be used for +/// this purpose: +/// +/// ``` +/// # if cfg!(miri) { return Ok(()); } // miri takes too long +/// use regex_automata::{ +/// dfa::onepass::DFA, +/// util::iter::Searcher, +/// Anchored, Input, Span, +/// }; +/// +/// let re = DFA::new(r"\w(\d)\w")?; +/// let (mut cache, caps) = (re.create_cache(), re.create_captures()); +/// let input = Input::new("a1zb2yc3x").anchored(Anchored::Yes); +/// +/// let mut it = Searcher::new(input).into_captures_iter(caps, |input, caps| { +/// Ok(re.try_search(&mut cache, input, caps)?) +/// }).infallible(); +/// let caps0 = it.next().unwrap(); +/// assert_eq!(Some(Span::from(1..2)), caps0.get_group(1)); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +#[derive(Clone)] +pub struct DFA { + /// The configuration provided by the caller. + config: Config, + /// The NFA used to build this DFA. + /// + /// NOTE: We probably don't need to store the NFA here, but we use enough + /// bits from it that it's convenient to do so. And there really isn't much + /// cost to doing so either, since an NFA is reference counted internally. + nfa: NFA, + /// The transition table. Given a state ID 's' and a byte of haystack 'b', + /// the next state is `table[sid + classes[byte]]`. + /// + /// The stride of this table (i.e., the number of columns) is always + /// a power of 2, even if the alphabet length is smaller. This makes + /// converting between state IDs and state indices very cheap. + /// + /// Note that the stride always includes room for one extra "transition" + /// that isn't actually a transition. It is a 'PatternEpsilons' that is + /// used for match states only. Because of this, the maximum number of + /// active columns in the transition table is 257, which means the maximum + /// stride is 512 (the next power of 2 greater than or equal to 257). + table: Vec<Transition>, + /// The DFA state IDs of the starting states. + /// + /// `starts[0]` is always present and corresponds to the starting state + /// when searching for matches of any pattern in the DFA. + /// + /// `starts[i]` where i>0 corresponds to the starting state for the pattern + /// ID 'i-1'. These starting states are optional. + starts: Vec<StateID>, + /// Every state ID >= this value corresponds to a match state. + /// + /// This is what a search uses to detect whether a state is a match state + /// or not. It requires only a simple comparison instead of bit-unpacking + /// the PatternEpsilons from every state. + min_match_id: StateID, + /// The alphabet of this DFA, split into equivalence classes. Bytes in the + /// same equivalence class can never discriminate between a match and a + /// non-match. + classes: ByteClasses, + /// The number of elements in each state in the transition table. This may + /// be less than the stride, since the stride is always a power of 2 and + /// the alphabet length can be anything up to and including 256. + alphabet_len: usize, + /// The number of columns in the transition table, expressed as a power of + /// 2. + stride2: usize, + /// The offset at which the PatternEpsilons for a match state is stored in + /// the transition table. + /// + /// PERF: One wonders whether it would be better to put this in a separate + /// allocation, since only match states have a non-empty PatternEpsilons + /// and the number of match states tends be dwarfed by the number of + /// non-match states. So this would save '8*len(non_match_states)' for each + /// DFA. The question is whether moving this to a different allocation will + /// lead to a perf hit during searches. You might think dealing with match + /// states is rare, but some regexes spend a lot of time in match states + /// gobbling up input. But... match state handling is already somewhat + /// expensive, so maybe this wouldn't do much? Either way, it's worth + /// experimenting. + pateps_offset: usize, + /// The first explicit slot index. This refers to the first slot appearing + /// immediately after the last implicit slot. It is always 'patterns.len() + /// * 2'. + /// + /// We record this because we only store the explicit slots in our DFA + /// transition table that need to be saved. Implicit slots are handled + /// automatically as part of the search. + explicit_slot_start: usize, +} + +impl DFA { + /// Parse the given regular expression using the default configuration and + /// return the corresponding one-pass DFA. + /// + /// If you want a non-default configuration, then use the [`Builder`] to + /// set your own configuration. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{dfa::onepass::DFA, Match}; + /// + /// let re = DFA::new("foo[0-9]+bar")?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// + /// re.captures(&mut cache, "foo12345barzzz", &mut caps); + /// assert_eq!(Some(Match::must(0, 0..11)), caps.get_match()); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[cfg(feature = "syntax")] + #[inline] + pub fn new(pattern: &str) -> Result<DFA, BuildError> { + DFA::builder().build(pattern) + } + + /// Like `new`, but parses multiple patterns into a single "multi regex." + /// This similarly uses the default regex configuration. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{dfa::onepass::DFA, Match}; + /// + /// let re = DFA::new_many(&["[a-z]+", "[0-9]+"])?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// + /// re.captures(&mut cache, "abc123", &mut caps); + /// assert_eq!(Some(Match::must(0, 0..3)), caps.get_match()); + /// + /// re.captures(&mut cache, "123abc", &mut caps); + /// assert_eq!(Some(Match::must(1, 0..3)), caps.get_match()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[cfg(feature = "syntax")] + #[inline] + pub fn new_many<P: AsRef<str>>(patterns: &[P]) -> Result<DFA, BuildError> { + DFA::builder().build_many(patterns) + } + + /// Like `new`, but builds a one-pass DFA directly from an NFA. This is + /// useful if you already have an NFA, or even if you hand-assembled the + /// NFA. + /// + /// # Example + /// + /// This shows how to hand assemble a regular expression via its HIR, + /// compile an NFA from it and build a one-pass DFA from the NFA. + /// + /// ``` + /// use regex_automata::{ + /// dfa::onepass::DFA, + /// nfa::thompson::NFA, + /// Match, + /// }; + /// use regex_syntax::hir::{Hir, Class, ClassBytes, ClassBytesRange}; + /// + /// let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![ + /// ClassBytesRange::new(b'0', b'9'), + /// ClassBytesRange::new(b'A', b'Z'), + /// ClassBytesRange::new(b'_', b'_'), + /// ClassBytesRange::new(b'a', b'z'), + /// ]))); + /// + /// let config = NFA::config().nfa_size_limit(Some(1_000)); + /// let nfa = NFA::compiler().configure(config).build_from_hir(&hir)?; + /// + /// let re = DFA::new_from_nfa(nfa)?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// let expected = Some(Match::must(0, 0..1)); + /// re.captures(&mut cache, "A", &mut caps); + /// assert_eq!(expected, caps.get_match()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn new_from_nfa(nfa: NFA) -> Result<DFA, BuildError> { + DFA::builder().build_from_nfa(nfa) + } + + /// Create a new one-pass DFA that matches every input. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{dfa::onepass::DFA, Match}; + /// + /// let dfa = DFA::always_match()?; + /// let mut cache = dfa.create_cache(); + /// let mut caps = dfa.create_captures(); + /// + /// let expected = Match::must(0, 0..0); + /// dfa.captures(&mut cache, "", &mut caps); + /// assert_eq!(Some(expected), caps.get_match()); + /// dfa.captures(&mut cache, "foo", &mut caps); + /// assert_eq!(Some(expected), caps.get_match()); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn always_match() -> Result<DFA, BuildError> { + let nfa = thompson::NFA::always_match(); + Builder::new().build_from_nfa(nfa) + } + + /// Create a new one-pass DFA that never matches any input. + /// + /// # Example + /// + /// ``` + /// use regex_automata::dfa::onepass::DFA; + /// + /// let dfa = DFA::never_match()?; + /// let mut cache = dfa.create_cache(); + /// let mut caps = dfa.create_captures(); + /// + /// dfa.captures(&mut cache, "", &mut caps); + /// assert_eq!(None, caps.get_match()); + /// dfa.captures(&mut cache, "foo", &mut caps); + /// assert_eq!(None, caps.get_match()); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn never_match() -> Result<DFA, BuildError> { + let nfa = thompson::NFA::never_match(); + Builder::new().build_from_nfa(nfa) + } + + /// Return a default configuration for a DFA. + /// + /// This is a convenience routine to avoid needing to import the `Config` + /// type when customizing the construction of a DFA. + /// + /// # Example + /// + /// This example shows how to change the match semantics of this DFA from + /// its default "leftmost first" to "all." When using "all," non-greediness + /// doesn't apply and neither does preference order matching. Instead, the + /// longest match possible is always returned. (Although, by construction, + /// it's impossible for a one-pass DFA to have a different answer for + /// "preference order" vs "longest match.") + /// + /// ``` + /// use regex_automata::{dfa::onepass::DFA, Match, MatchKind}; + /// + /// let re = DFA::builder() + /// .configure(DFA::config().match_kind(MatchKind::All)) + /// .build(r"(abc)+?")?; + /// let mut cache = re.create_cache(); + /// let mut caps = re.create_captures(); + /// + /// re.captures(&mut cache, "abcabc", &mut caps); + /// // Normally, the non-greedy repetition would give us a 0..3 match. + /// assert_eq!(Some(Match::must(0, 0..6)), caps.get_match()); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn config() -> Config { + Config::new() + } + + /// Return a builder for configuring the construction of a DFA. + /// + /// This is a convenience routine to avoid needing to import the + /// [`Builder`] type in common cases. + /// + /// # Example + /// + /// This example shows how to use the builder to disable UTF-8 mode. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{ + /// dfa::onepass::DFA, + /// nfa::thompson, + /// util::syntax, + /// Match, + /// }; + /// + /// let re = DFA::builder() + /// .syntax(syntax::Config::new().utf8(false)) + /// .thompson(thompson::Config::new().utf8(false)) + /// .build(r"foo(?-u:[^b])ar.*")?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// + /// let haystack = b"foo\xFFarzz\xE2\x98\xFF\n"; + /// let expected = Some(Match::must(0, 0..8)); + /// re.captures(&mut cache, haystack, &mut caps); + /// assert_eq!(expected, caps.get_match()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn builder() -> Builder { + Builder::new() + } + + /// Create a new empty set of capturing groups that is guaranteed to be + /// valid for the search APIs on this DFA. + /// + /// A `Captures` value created for a specific DFA cannot be used with any + /// other DFA. + /// + /// This is a convenience function for [`Captures::all`]. See the + /// [`Captures`] documentation for an explanation of its alternative + /// constructors that permit the DFA to do less work during a search, and + /// thus might make it faster. + #[inline] + pub fn create_captures(&self) -> Captures { + Captures::all(self.nfa.group_info().clone()) + } + + /// Create a new cache for this DFA. + /// + /// The cache returned should only be used for searches for this + /// DFA. If you want to reuse the cache for another DFA, then you + /// must call [`Cache::reset`] with that DFA (or, equivalently, + /// [`DFA::reset_cache`]). + #[inline] + pub fn create_cache(&self) -> Cache { + Cache::new(self) + } + + /// Reset the given cache such that it can be used for searching with the + /// this DFA (and only this DFA). + /// + /// A cache reset permits reusing memory already allocated in this cache + /// with a different DFA. + /// + /// # Example + /// + /// This shows how to re-purpose a cache for use with a different DFA. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{dfa::onepass::DFA, Match}; + /// + /// let re1 = DFA::new(r"\w")?; + /// let re2 = DFA::new(r"\W")?; + /// let mut caps1 = re1.create_captures(); + /// let mut caps2 = re2.create_captures(); + /// + /// let mut cache = re1.create_cache(); + /// assert_eq!( + /// Some(Match::must(0, 0..2)), + /// { re1.captures(&mut cache, "Δ", &mut caps1); caps1.get_match() }, + /// ); + /// + /// // Using 'cache' with re2 is not allowed. It may result in panics or + /// // incorrect results. In order to re-purpose the cache, we must reset + /// // it with the one-pass DFA we'd like to use it with. + /// // + /// // Similarly, after this reset, using the cache with 're1' is also not + /// // allowed. + /// re2.reset_cache(&mut cache); + /// assert_eq!( + /// Some(Match::must(0, 0..3)), + /// { re2.captures(&mut cache, "☃", &mut caps2); caps2.get_match() }, + /// ); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn reset_cache(&self, cache: &mut Cache) { + cache.reset(self); + } + + /// Return the config for this one-pass DFA. + #[inline] + pub fn get_config(&self) -> &Config { + &self.config + } + + /// Returns a reference to the underlying NFA. + #[inline] + pub fn get_nfa(&self) -> &NFA { + &self.nfa + } + + /// Returns the total number of patterns compiled into this DFA. + /// + /// In the case of a DFA that contains no patterns, this returns `0`. + #[inline] + pub fn pattern_len(&self) -> usize { + self.get_nfa().pattern_len() + } + + /// Returns the total number of states in this one-pass DFA. + /// + /// Note that unlike dense or sparse DFAs, a one-pass DFA does not expose + /// a low level DFA API. Therefore, this routine has little use other than + /// being informational. + #[inline] + pub fn state_len(&self) -> usize { + self.table.len() >> self.stride2() + } + + /// Returns the total number of elements in the alphabet for this DFA. + /// + /// That is, this returns the total number of transitions that each + /// state in this DFA must have. The maximum alphabet size is 256, which + /// corresponds to each possible byte value. + /// + /// The alphabet size may be less than 256 though, and unless + /// [`Config::byte_classes`] is disabled, it is typically must less than + /// 256. Namely, bytes are grouped into equivalence classes such that no + /// two bytes in the same class can distinguish a match from a non-match. + /// For example, in the regex `^[a-z]+$`, the ASCII bytes `a-z` could + /// all be in the same equivalence class. This leads to a massive space + /// savings. + /// + /// Note though that the alphabet length does _not_ necessarily equal the + /// total stride space taken up by a single DFA state in the transition + /// table. Namely, for performance reasons, the stride is always the + /// smallest power of two that is greater than or equal to the alphabet + /// length. For this reason, [`DFA::stride`] or [`DFA::stride2`] are + /// often more useful. The alphabet length is typically useful only for + /// informational purposes. + /// + /// Note also that unlike dense or sparse DFAs, a one-pass DFA does + /// not have a special end-of-input (EOI) transition. This is because + /// a one-pass DFA handles look-around assertions explicitly (like the + /// [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM)) and does not build + /// them into the transitions of the DFA. + #[inline] + pub fn alphabet_len(&self) -> usize { + self.alphabet_len + } + + /// Returns the total stride for every state in this DFA, expressed as the + /// exponent of a power of 2. The stride is the amount of space each state + /// takes up in the transition table, expressed as a number of transitions. + /// (Unused transitions map to dead states.) + /// + /// The stride of a DFA is always equivalent to the smallest power of + /// 2 that is greater than or equal to the DFA's alphabet length. This + /// definition uses extra space, but possibly permits faster translation + /// between state identifiers and their corresponding offsets in this DFA's + /// transition table. + /// + /// For example, if the DFA's stride is 16 transitions, then its `stride2` + /// is `4` since `2^4 = 16`. + /// + /// The minimum `stride2` value is `1` (corresponding to a stride of `2`) + /// while the maximum `stride2` value is `9` (corresponding to a stride + /// of `512`). The maximum in theory should be `8`, but because of some + /// implementation quirks that may be relaxed in the future, it is one more + /// than `8`. (Do note that a maximal stride is incredibly rare, as it + /// would imply that there is almost no redundant in the regex pattern.) + /// + /// Note that unlike dense or sparse DFAs, a one-pass DFA does not expose + /// a low level DFA API. Therefore, this routine has little use other than + /// being informational. + #[inline] + pub fn stride2(&self) -> usize { + self.stride2 + } + + /// Returns the total stride for every state in this DFA. This corresponds + /// to the total number of transitions used by each state in this DFA's + /// transition table. + /// + /// Please see [`DFA::stride2`] for more information. In particular, this + /// returns the stride as the number of transitions, where as `stride2` + /// returns it as the exponent of a power of 2. + /// + /// Note that unlike dense or sparse DFAs, a one-pass DFA does not expose + /// a low level DFA API. Therefore, this routine has little use other than + /// being informational. + #[inline] + pub fn stride(&self) -> usize { + 1 << self.stride2() + } + + /// Returns the memory usage, in bytes, of this DFA. + /// + /// The memory usage is computed based on the number of bytes used to + /// represent this DFA. + /// + /// This does **not** include the stack size used up by this DFA. To + /// compute that, use `std::mem::size_of::<onepass::DFA>()`. + #[inline] + pub fn memory_usage(&self) -> usize { + use core::mem::size_of; + + self.table.len() * size_of::<Transition>() + + self.starts.len() * size_of::<StateID>() + } +} + +impl DFA { + /// Executes an anchored leftmost forward search, and returns true if and + /// only if this one-pass DFA matches the given haystack. + /// + /// This routine may short circuit if it knows that scanning future + /// input will never lead to a different result. In particular, if the + /// underlying DFA enters a match state, then this routine will return + /// `true` immediately without inspecting any future input. (Consider how + /// this might make a difference given the regex `a+` on the haystack + /// `aaaaaaaaaaaaaaa`. This routine can stop after it sees the first `a`, + /// but routines like `find` need to continue searching because `+` is + /// greedy by default.) + /// + /// The given `Input` is forcefully set to use [`Anchored::Yes`] if the + /// given configuration was [`Anchored::No`] (which is the default). + /// + /// # Panics + /// + /// This routine panics if the search could not complete. This can occur + /// in the following circumstances: + /// + /// * When the provided `Input` configuration is not supported. For + /// example, by providing an unsupported anchor mode. Concretely, + /// this occurs when using [`Anchored::Pattern`] without enabling + /// [`Config::starts_for_each_pattern`]. + /// + /// When a search panics, callers cannot know whether a match exists or + /// not. + /// + /// Use [`DFA::try_search`] if you want to handle these panics as error + /// values instead. + /// + /// # Example + /// + /// This shows basic usage: + /// + /// ``` + /// use regex_automata::dfa::onepass::DFA; + /// + /// let re = DFA::new("foo[0-9]+bar")?; + /// let mut cache = re.create_cache(); + /// + /// assert!(re.is_match(&mut cache, "foo12345bar")); + /// assert!(!re.is_match(&mut cache, "foobar")); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// # Example: consistency with search APIs + /// + /// `is_match` is guaranteed to return `true` whenever `captures` returns + /// a match. This includes searches that are executed entirely within a + /// codepoint: + /// + /// ``` + /// use regex_automata::{dfa::onepass::DFA, Input}; + /// + /// let re = DFA::new("a*")?; + /// let mut cache = re.create_cache(); + /// + /// assert!(!re.is_match(&mut cache, Input::new("☃").span(1..2))); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// Notice that when UTF-8 mode is disabled, then the above reports a + /// match because the restriction against zero-width matches that split a + /// codepoint has been lifted: + /// + /// ``` + /// use regex_automata::{dfa::onepass::DFA, nfa::thompson::NFA, Input}; + /// + /// let re = DFA::builder() + /// .thompson(NFA::config().utf8(false)) + /// .build("a*")?; + /// let mut cache = re.create_cache(); + /// + /// assert!(re.is_match(&mut cache, Input::new("☃").span(1..2))); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn is_match<'h, I: Into<Input<'h>>>( + &self, + cache: &mut Cache, + input: I, + ) -> bool { + let mut input = input.into().earliest(true); + if matches!(input.get_anchored(), Anchored::No) { + input.set_anchored(Anchored::Yes); + } + self.try_search_slots(cache, &input, &mut []).unwrap().is_some() + } + + /// Executes an anchored leftmost forward search, and returns a `Match` if + /// and only if this one-pass DFA matches the given haystack. + /// + /// This routine only includes the overall match span. To get access to the + /// individual spans of each capturing group, use [`DFA::captures`]. + /// + /// The given `Input` is forcefully set to use [`Anchored::Yes`] if the + /// given configuration was [`Anchored::No`] (which is the default). + /// + /// # Panics + /// + /// This routine panics if the search could not complete. This can occur + /// in the following circumstances: + /// + /// * When the provided `Input` configuration is not supported. For + /// example, by providing an unsupported anchor mode. Concretely, + /// this occurs when using [`Anchored::Pattern`] without enabling + /// [`Config::starts_for_each_pattern`]. + /// + /// When a search panics, callers cannot know whether a match exists or + /// not. + /// + /// Use [`DFA::try_search`] if you want to handle these panics as error + /// values instead. + /// + /// # Example + /// + /// Leftmost first match semantics corresponds to the match with the + /// smallest starting offset, but where the end offset is determined by + /// preferring earlier branches in the original regular expression. For + /// example, `Sam|Samwise` will match `Sam` in `Samwise`, but `Samwise|Sam` + /// will match `Samwise` in `Samwise`. + /// + /// Generally speaking, the "leftmost first" match is how most backtracking + /// regular expressions tend to work. This is in contrast to POSIX-style + /// regular expressions that yield "leftmost longest" matches. Namely, + /// both `Sam|Samwise` and `Samwise|Sam` match `Samwise` when using + /// leftmost longest semantics. (This crate does not currently support + /// leftmost longest semantics.) + /// + /// ``` + /// use regex_automata::{dfa::onepass::DFA, Match}; + /// + /// let re = DFA::new("foo[0-9]+")?; + /// let mut cache = re.create_cache(); + /// let expected = Match::must(0, 0..8); + /// assert_eq!(Some(expected), re.find(&mut cache, "foo12345")); + /// + /// // Even though a match is found after reading the first byte (`a`), + /// // the leftmost first match semantics demand that we find the earliest + /// // match that prefers earlier parts of the pattern over later parts. + /// let re = DFA::new("abc|a")?; + /// let mut cache = re.create_cache(); + /// let expected = Match::must(0, 0..3); + /// assert_eq!(Some(expected), re.find(&mut cache, "abc")); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn find<'h, I: Into<Input<'h>>>( + &self, + cache: &mut Cache, + input: I, + ) -> Option<Match> { + let mut input = input.into(); + if matches!(input.get_anchored(), Anchored::No) { + input.set_anchored(Anchored::Yes); + } + if self.get_nfa().pattern_len() == 1 { + let mut slots = [None, None]; + let pid = + self.try_search_slots(cache, &input, &mut slots).unwrap()?; + let start = slots[0].unwrap().get(); + let end = slots[1].unwrap().get(); + return Some(Match::new(pid, Span { start, end })); + } + let ginfo = self.get_nfa().group_info(); + let slots_len = ginfo.implicit_slot_len(); + let mut slots = vec![None; slots_len]; + let pid = self.try_search_slots(cache, &input, &mut slots).unwrap()?; + let start = slots[pid.as_usize() * 2].unwrap().get(); + let end = slots[pid.as_usize() * 2 + 1].unwrap().get(); + Some(Match::new(pid, Span { start, end })) + } + + /// Executes an anchored leftmost forward search and writes the spans + /// of capturing groups that participated in a match into the provided + /// [`Captures`] value. If no match was found, then [`Captures::is_match`] + /// is guaranteed to return `false`. + /// + /// The given `Input` is forcefully set to use [`Anchored::Yes`] if the + /// given configuration was [`Anchored::No`] (which is the default). + /// + /// # Panics + /// + /// This routine panics if the search could not complete. This can occur + /// in the following circumstances: + /// + /// * When the provided `Input` configuration is not supported. For + /// example, by providing an unsupported anchor mode. Concretely, + /// this occurs when using [`Anchored::Pattern`] without enabling + /// [`Config::starts_for_each_pattern`]. + /// + /// When a search panics, callers cannot know whether a match exists or + /// not. + /// + /// Use [`DFA::try_search`] if you want to handle these panics as error + /// values instead. + /// + /// # Example + /// + /// This shows a simple example of a one-pass regex that extracts + /// capturing group spans. + /// + /// ``` + /// use regex_automata::{dfa::onepass::DFA, Match, Span}; + /// + /// let re = DFA::new( + /// // Notice that we use ASCII here. The corresponding Unicode regex + /// // is sadly not one-pass. + /// "(?P<first>[[:alpha:]]+)[[:space:]]+(?P<last>[[:alpha:]]+)", + /// )?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// + /// re.captures(&mut cache, "Bruce Springsteen", &mut caps); + /// assert_eq!(Some(Match::must(0, 0..17)), caps.get_match()); + /// assert_eq!(Some(Span::from(0..5)), caps.get_group(1)); + /// assert_eq!(Some(Span::from(6..17)), caps.get_group_by_name("last")); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn captures<'h, I: Into<Input<'h>>>( + &self, + cache: &mut Cache, + input: I, + caps: &mut Captures, + ) { + let mut input = input.into(); + if matches!(input.get_anchored(), Anchored::No) { + input.set_anchored(Anchored::Yes); + } + self.try_search(cache, &input, caps).unwrap(); + } + + /// Executes an anchored leftmost forward search and writes the spans + /// of capturing groups that participated in a match into the provided + /// [`Captures`] value. If no match was found, then [`Captures::is_match`] + /// is guaranteed to return `false`. + /// + /// The differences with [`DFA::captures`] are: + /// + /// 1. This returns an error instead of panicking if the search fails. + /// 2. Accepts an `&Input` instead of a `Into<Input>`. This permits reusing + /// the same input for multiple searches, which _may_ be important for + /// latency. + /// 3. This does not automatically change the [`Anchored`] mode from `No` + /// to `Yes`. Instead, if [`Input::anchored`] is `Anchored::No`, then an + /// error is returned. + /// + /// # Errors + /// + /// This routine errors if the search could not complete. This can occur + /// in the following circumstances: + /// + /// * When the provided `Input` configuration is not supported. For + /// example, by providing an unsupported anchor mode. Concretely, + /// this occurs when using [`Anchored::Pattern`] without enabling + /// [`Config::starts_for_each_pattern`]. + /// + /// When a search returns an error, callers cannot know whether a match + /// exists or not. + /// + /// # Example: specific pattern search + /// + /// This example shows how to build a multi-regex that permits searching + /// for specific patterns. Note that this is somewhat less useful than + /// in other regex engines, since a one-pass DFA by definition has no + /// ambiguity about which pattern can match at a position. That is, if it + /// were possible for two different patterns to match at the same starting + /// position, then the multi-regex would not be one-pass and construction + /// would have failed. + /// + /// Nevertheless, this can still be useful if you only care about matches + /// for a specific pattern, and want the DFA to report "no match" even if + /// some other pattern would have matched. + /// + /// Note that in order to make use of this functionality, + /// [`Config::starts_for_each_pattern`] must be enabled. It is disabled + /// by default since it may result in higher memory usage. + /// + /// ``` + /// use regex_automata::{ + /// dfa::onepass::DFA, Anchored, Input, Match, PatternID, + /// }; + /// + /// let re = DFA::builder() + /// .configure(DFA::config().starts_for_each_pattern(true)) + /// .build_many(&["[a-z]+", "[0-9]+"])?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// let haystack = "123abc"; + /// let input = Input::new(haystack).anchored(Anchored::Yes); + /// + /// // A normal multi-pattern search will show pattern 1 matches. + /// re.try_search(&mut cache, &input, &mut caps)?; + /// assert_eq!(Some(Match::must(1, 0..3)), caps.get_match()); + /// + /// // If we only want to report pattern 0 matches, then we'll get no + /// // match here. + /// let input = input.anchored(Anchored::Pattern(PatternID::must(0))); + /// re.try_search(&mut cache, &input, &mut caps)?; + /// assert_eq!(None, caps.get_match()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// # Example: specifying the bounds of a search + /// + /// This example shows how providing the bounds of a search can produce + /// different results than simply sub-slicing the haystack. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{dfa::onepass::DFA, Anchored, Input, Match}; + /// + /// // one-pass DFAs fully support Unicode word boundaries! + /// // A sad joke is that a Unicode aware regex like \w+\s is not one-pass. + /// // :-( + /// let re = DFA::new(r"\b[0-9]{3}\b")?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// let haystack = "foo123bar"; + /// + /// // Since we sub-slice the haystack, the search doesn't know about + /// // the larger context and assumes that `123` is surrounded by word + /// // boundaries. And of course, the match position is reported relative + /// // to the sub-slice as well, which means we get `0..3` instead of + /// // `3..6`. + /// let expected = Some(Match::must(0, 0..3)); + /// let input = Input::new(&haystack[3..6]).anchored(Anchored::Yes); + /// re.try_search(&mut cache, &input, &mut caps)?; + /// assert_eq!(expected, caps.get_match()); + /// + /// // But if we provide the bounds of the search within the context of the + /// // entire haystack, then the search can take the surrounding context + /// // into account. (And if we did find a match, it would be reported + /// // as a valid offset into `haystack` instead of its sub-slice.) + /// let expected = None; + /// let input = Input::new(haystack).range(3..6).anchored(Anchored::Yes); + /// re.try_search(&mut cache, &input, &mut caps)?; + /// assert_eq!(expected, caps.get_match()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn try_search( + &self, + cache: &mut Cache, + input: &Input<'_>, + caps: &mut Captures, + ) -> Result<(), MatchError> { + let pid = self.try_search_slots(cache, input, caps.slots_mut())?; + caps.set_pattern(pid); + Ok(()) + } + + /// Executes an anchored leftmost forward search and writes the spans + /// of capturing groups that participated in a match into the provided + /// `slots`, and returns the matching pattern ID. The contents of the + /// slots for patterns other than the matching pattern are unspecified. If + /// no match was found, then `None` is returned and the contents of all + /// `slots` is unspecified. + /// + /// This is like [`DFA::try_search`], but it accepts a raw slots slice + /// instead of a `Captures` value. This is useful in contexts where you + /// don't want or need to allocate a `Captures`. + /// + /// It is legal to pass _any_ number of slots to this routine. If the regex + /// engine would otherwise write a slot offset that doesn't fit in the + /// provided slice, then it is simply skipped. In general though, there are + /// usually three slice lengths you might want to use: + /// + /// * An empty slice, if you only care about which pattern matched. + /// * A slice with + /// [`pattern_len() * 2`](crate::dfa::onepass::DFA::pattern_len) + /// slots, if you only care about the overall match spans for each matching + /// pattern. + /// * A slice with + /// [`slot_len()`](crate::util::captures::GroupInfo::slot_len) slots, which + /// permits recording match offsets for every capturing group in every + /// pattern. + /// + /// # Errors + /// + /// This routine errors if the search could not complete. This can occur + /// in the following circumstances: + /// + /// * When the provided `Input` configuration is not supported. For + /// example, by providing an unsupported anchor mode. Concretely, + /// this occurs when using [`Anchored::Pattern`] without enabling + /// [`Config::starts_for_each_pattern`]. + /// + /// When a search returns an error, callers cannot know whether a match + /// exists or not. + /// + /// # Example + /// + /// This example shows how to find the overall match offsets in a + /// multi-pattern search without allocating a `Captures` value. Indeed, we + /// can put our slots right on the stack. + /// + /// ``` + /// use regex_automata::{dfa::onepass::DFA, Anchored, Input, PatternID}; + /// + /// let re = DFA::new_many(&[ + /// r"[a-zA-Z]+", + /// r"[0-9]+", + /// ])?; + /// let mut cache = re.create_cache(); + /// let input = Input::new("123").anchored(Anchored::Yes); + /// + /// // We only care about the overall match offsets here, so we just + /// // allocate two slots for each pattern. Each slot records the start + /// // and end of the match. + /// let mut slots = [None; 4]; + /// let pid = re.try_search_slots(&mut cache, &input, &mut slots)?; + /// assert_eq!(Some(PatternID::must(1)), pid); + /// + /// // The overall match offsets are always at 'pid * 2' and 'pid * 2 + 1'. + /// // See 'GroupInfo' for more details on the mapping between groups and + /// // slot indices. + /// let slot_start = pid.unwrap().as_usize() * 2; + /// let slot_end = slot_start + 1; + /// assert_eq!(Some(0), slots[slot_start].map(|s| s.get())); + /// assert_eq!(Some(3), slots[slot_end].map(|s| s.get())); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn try_search_slots( + &self, + cache: &mut Cache, + input: &Input<'_>, + slots: &mut [Option<NonMaxUsize>], + ) -> Result<Option<PatternID>, MatchError> { + let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8(); + if !utf8empty { + return self.try_search_slots_imp(cache, input, slots); + } + // See PikeVM::try_search_slots for why we do this. + let min = self.get_nfa().group_info().implicit_slot_len(); + if slots.len() >= min { + return self.try_search_slots_imp(cache, input, slots); + } + if self.get_nfa().pattern_len() == 1 { + let mut enough = [None, None]; + let got = self.try_search_slots_imp(cache, input, &mut enough)?; + // This is OK because we know `enough_slots` is strictly bigger + // than `slots`, otherwise this special case isn't reached. + slots.copy_from_slice(&enough[..slots.len()]); + return Ok(got); + } + let mut enough = vec![None; min]; + let got = self.try_search_slots_imp(cache, input, &mut enough)?; + // This is OK because we know `enough_slots` is strictly bigger than + // `slots`, otherwise this special case isn't reached. + slots.copy_from_slice(&enough[..slots.len()]); + Ok(got) + } + + #[inline(never)] + fn try_search_slots_imp( + &self, + cache: &mut Cache, + input: &Input<'_>, + slots: &mut [Option<NonMaxUsize>], + ) -> Result<Option<PatternID>, MatchError> { + let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8(); + match self.search_imp(cache, input, slots)? { + None => return Ok(None), + Some(pid) if !utf8empty => return Ok(Some(pid)), + Some(pid) => { + // These slot indices are always correct because we know our + // 'pid' is valid and thus we know that the slot indices for it + // are valid. + let slot_start = pid.as_usize().wrapping_mul(2); + let slot_end = slot_start.wrapping_add(1); + // OK because we know we have a match and we know our caller + // provided slots are big enough (which we make true above if + // the caller didn't). Namely, we're only here when 'utf8empty' + // is true, and when that's true, we require slots for every + // pattern. + let start = slots[slot_start].unwrap().get(); + let end = slots[slot_end].unwrap().get(); + // If our match splits a codepoint, then we cannot report is + // as a match. And since one-pass DFAs only support anchored + // searches, we don't try to skip ahead to find the next match. + // We can just quit with nothing. + if start == end && !input.is_char_boundary(start) { + return Ok(None); + } + Ok(Some(pid)) + } + } + } +} + +impl DFA { + fn search_imp( + &self, + cache: &mut Cache, + input: &Input<'_>, + slots: &mut [Option<NonMaxUsize>], + ) -> Result<Option<PatternID>, MatchError> { + // PERF: Some ideas. I ran out of steam after my initial impl to try + // many of these. + // + // 1) Try doing more state shuffling. Right now, all we do is push + // match states to the end of the transition table so that we can do + // 'if sid >= self.min_match_id' to know whether we're in a match + // state or not. But what about doing something like dense DFAs and + // pushing dead, match and states with captures/looks all toward the + // beginning of the transition table. Then we could do 'if sid <= + // self.max_special_id', in which case, we need to do some special + // handling of some sort. Otherwise, we get the happy path, just + // like in a DFA search. The main argument against this is that the + // one-pass DFA is likely to be used most often with capturing groups + // and if capturing groups are common, then this might wind up being a + // pessimization. + // + // 2) Consider moving 'PatternEpsilons' out of the transition table. + // It is only needed for match states and usually a small minority of + // states are match states. Therefore, we're using an extra 'u64' for + // most states. + // + // 3) I played around with the match state handling and it seems like + // there is probably a lot left on the table for improvement. The + // key tension is that the 'find_match' routine is a giant mess, but + // splitting it out into a non-inlineable function is a non-starter + // because the match state might consume input, so 'find_match' COULD + // be called quite a lot, and a function call at that point would trash + // perf. In theory, we could detect whether a match state consumes + // input and then specialize our search routine based on that. In that + // case, maybe an extra function call is OK, but even then, it might be + // too much of a latency hit. Another idea is to just try and figure + // out how to reduce the code size of 'find_match'. RE2 has a trick + // here where the match handling isn't done if we know the next byte of + // input yields a match too. Maybe we adopt that? + // + // This just might be a tricky DFA to optimize. + + if input.is_done() { + return Ok(None); + } + // We unfortunately have a bit of book-keeping to do to set things + // up. We do have to setup our cache and clear all of our slots. In + // particular, clearing the slots is necessary for the case where we + // report a match, but one of the capturing groups didn't participate + // in the match but had a span set from a previous search. That would + // be bad. In theory, we could avoid all this slot clearing if we knew + // that every slot was always activated for every match. Then we would + // know they would always be overwritten when a match is found. + let explicit_slots_len = core::cmp::min( + Slots::LIMIT, + slots.len().saturating_sub(self.explicit_slot_start), + ); + cache.setup_search(explicit_slots_len); + for slot in cache.explicit_slots() { + *slot = None; + } + for slot in slots.iter_mut() { + *slot = None; + } + // We set the starting slots for every pattern up front. This does + // increase our latency somewhat, but it avoids having to do it every + // time we see a match state (which could be many times in a single + // search if the match state consumes input). + for pid in self.nfa.patterns() { + let i = pid.as_usize() * 2; + if i >= slots.len() { + break; + } + slots[i] = NonMaxUsize::new(input.start()); + } + let mut pid = None; + let mut next_sid = match input.get_anchored() { + Anchored::Yes => self.start(), + Anchored::Pattern(pid) => self.start_pattern(pid)?, + Anchored::No => { + // If the regex is itself always anchored, then we're fine, + // even if the search is configured to be unanchored. + if !self.nfa.is_always_start_anchored() { + return Err(MatchError::unsupported_anchored( + Anchored::No, + )); + } + self.start() + } + }; + let leftmost_first = + matches!(self.config.get_match_kind(), MatchKind::LeftmostFirst); + for at in input.start()..input.end() { + let sid = next_sid; + let trans = self.transition(sid, input.haystack()[at]); + next_sid = trans.state_id(); + let epsilons = trans.epsilons(); + if sid >= self.min_match_id { + if self.find_match(cache, input, at, sid, slots, &mut pid) { + if input.get_earliest() + || (leftmost_first && trans.match_wins()) + { + return Ok(pid); + } + } + } + if sid == DEAD + || (!epsilons.looks().is_empty() + && !self.nfa.look_matcher().matches_set_inline( + epsilons.looks(), + input.haystack(), + at, + )) + { + return Ok(pid); + } + epsilons.slots().apply(at, cache.explicit_slots()); + } + if next_sid >= self.min_match_id { + self.find_match( + cache, + input, + input.end(), + next_sid, + slots, + &mut pid, + ); + } + Ok(pid) + } + + /// Assumes 'sid' is a match state and looks for whether a match can + /// be reported. If so, appropriate offsets are written to 'slots' and + /// 'matched_pid' is set to the matching pattern ID. + /// + /// Even when 'sid' is a match state, it's possible that a match won't + /// be reported. For example, when the conditional epsilon transitions + /// leading to the match state aren't satisfied at the given position in + /// the haystack. + #[cfg_attr(feature = "perf-inline", inline(always))] + fn find_match( + &self, + cache: &mut Cache, + input: &Input<'_>, + at: usize, + sid: StateID, + slots: &mut [Option<NonMaxUsize>], + matched_pid: &mut Option<PatternID>, + ) -> bool { + debug_assert!(sid >= self.min_match_id); + let pateps = self.pattern_epsilons(sid); + let epsilons = pateps.epsilons(); + if !epsilons.looks().is_empty() + && !self.nfa.look_matcher().matches_set_inline( + epsilons.looks(), + input.haystack(), + at, + ) + { + return false; + } + let pid = pateps.pattern_id_unchecked(); + // This calculation is always correct because we know our 'pid' is + // valid and thus we know that the slot indices for it are valid. + let slot_end = pid.as_usize().wrapping_mul(2).wrapping_add(1); + // Set the implicit 'end' slot for the matching pattern. (The 'start' + // slot was set at the beginning of the search.) + if slot_end < slots.len() { + slots[slot_end] = NonMaxUsize::new(at); + } + // If the caller provided enough room, copy the previously recorded + // explicit slots from our scratch space to the caller provided slots. + // We *also* need to set any explicit slots that are active as part of + // the path to the match state. + if self.explicit_slot_start < slots.len() { + // NOTE: The 'cache.explicit_slots()' slice is setup at the + // beginning of every search such that it is guaranteed to return a + // slice of length equivalent to 'slots[explicit_slot_start..]'. + slots[self.explicit_slot_start..] + .copy_from_slice(cache.explicit_slots()); + epsilons.slots().apply(at, &mut slots[self.explicit_slot_start..]); + } + *matched_pid = Some(pid); + true + } +} + +impl DFA { + /// Returns the anchored start state for matching any pattern in this DFA. + fn start(&self) -> StateID { + self.starts[0] + } + + /// Returns the anchored start state for matching the given pattern. If + /// 'starts_for_each_pattern' + /// was not enabled, then this returns an error. If the given pattern is + /// not in this DFA, then `Ok(None)` is returned. + fn start_pattern(&self, pid: PatternID) -> Result<StateID, MatchError> { + if !self.config.get_starts_for_each_pattern() { + return Err(MatchError::unsupported_anchored(Anchored::Pattern( + pid, + ))); + } + // 'starts' always has non-zero length. The first entry is always the + // anchored starting state for all patterns, and the following entries + // are optional and correspond to the anchored starting states for + // patterns at pid+1. Thus, starts.len()-1 corresponds to the total + // number of patterns that one can explicitly search for. (And it may + // be zero.) + Ok(self.starts.get(pid.one_more()).copied().unwrap_or(DEAD)) + } + + /// Returns the transition from the given state ID and byte of input. The + /// transition includes the next state ID, the slots that should be saved + /// and any conditional epsilon transitions that must be satisfied in order + /// to take this transition. + fn transition(&self, sid: StateID, byte: u8) -> Transition { + let offset = sid.as_usize() << self.stride2(); + let class = self.classes.get(byte).as_usize(); + self.table[offset + class] + } + + /// Set the transition from the given state ID and byte of input to the + /// transition given. + fn set_transition(&mut self, sid: StateID, byte: u8, to: Transition) { + let offset = sid.as_usize() << self.stride2(); + let class = self.classes.get(byte).as_usize(); + self.table[offset + class] = to; + } + + /// Return an iterator of "sparse" transitions for the given state ID. + /// "sparse" in this context means that consecutive transitions that are + /// equivalent are returned as one group, and transitions to the DEAD state + /// are ignored. + /// + /// This winds up being useful for debug printing, since it's much terser + /// to display runs of equivalent transitions than the transition for every + /// possible byte value. Indeed, in practice, it's very common for runs + /// of equivalent transitions to appear. + fn sparse_transitions(&self, sid: StateID) -> SparseTransitionIter<'_> { + let start = sid.as_usize() << self.stride2(); + let end = start + self.alphabet_len(); + SparseTransitionIter { + it: self.table[start..end].iter().enumerate(), + cur: None, + } + } + + /// Return the pattern epsilons for the given state ID. + /// + /// If the given state ID does not correspond to a match state ID, then the + /// pattern epsilons returned is empty. + fn pattern_epsilons(&self, sid: StateID) -> PatternEpsilons { + let offset = sid.as_usize() << self.stride2(); + PatternEpsilons(self.table[offset + self.pateps_offset].0) + } + + /// Set the pattern epsilons for the given state ID. + fn set_pattern_epsilons(&mut self, sid: StateID, pateps: PatternEpsilons) { + let offset = sid.as_usize() << self.stride2(); + self.table[offset + self.pateps_offset] = Transition(pateps.0); + } + + /// Returns the state ID prior to the one given. This returns None if the + /// given ID is the first DFA state. + fn prev_state_id(&self, id: StateID) -> Option<StateID> { + if id == DEAD { + None + } else { + // CORRECTNESS: Since 'id' is not the first state, subtracting 1 + // is always valid. + Some(StateID::new_unchecked(id.as_usize().checked_sub(1).unwrap())) + } + } + + /// Returns the state ID of the last state in this DFA's transition table. + /// "last" in this context means the last state to appear in memory, i.e., + /// the one with the greatest ID. + fn last_state_id(&self) -> StateID { + // CORRECTNESS: A DFA table is always non-empty since it always at + // least contains a DEAD state. Since every state has the same stride, + // we can just compute what the "next" state ID would have been and + // then subtract 1 from it. + StateID::new_unchecked( + (self.table.len() >> self.stride2()).checked_sub(1).unwrap(), + ) + } + + /// Move the transitions from 'id1' to 'id2' and vice versa. + /// + /// WARNING: This does not update the rest of the transition table to have + /// transitions to 'id1' changed to 'id2' and vice versa. This merely moves + /// the states in memory. + pub(super) fn swap_states(&mut self, id1: StateID, id2: StateID) { + let o1 = id1.as_usize() << self.stride2(); + let o2 = id2.as_usize() << self.stride2(); + for b in 0..self.stride() { + self.table.swap(o1 + b, o2 + b); + } + } + + /// Map all state IDs in this DFA (transition table + start states) + /// according to the closure given. + pub(super) fn remap(&mut self, map: impl Fn(StateID) -> StateID) { + for i in 0..self.state_len() { + let offset = i << self.stride2(); + for b in 0..self.alphabet_len() { + let next = self.table[offset + b].state_id(); + self.table[offset + b].set_state_id(map(next)); + } + } + for i in 0..self.starts.len() { + self.starts[i] = map(self.starts[i]); + } + } +} + +impl core::fmt::Debug for DFA { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + fn debug_state_transitions( + f: &mut core::fmt::Formatter, + dfa: &DFA, + sid: StateID, + ) -> core::fmt::Result { + for (i, (start, end, trans)) in + dfa.sparse_transitions(sid).enumerate() + { + let next = trans.state_id(); + if i > 0 { + write!(f, ", ")?; + } + if start == end { + write!( + f, + "{:?} => {:?}", + DebugByte(start), + next.as_usize(), + )?; + } else { + write!( + f, + "{:?}-{:?} => {:?}", + DebugByte(start), + DebugByte(end), + next.as_usize(), + )?; + } + if trans.match_wins() { + write!(f, " (MW)")?; + } + if !trans.epsilons().is_empty() { + write!(f, " ({:?})", trans.epsilons())?; + } + } + Ok(()) + } + + writeln!(f, "onepass::DFA(")?; + for index in 0..self.state_len() { + let sid = StateID::must(index); + let pateps = self.pattern_epsilons(sid); + if sid == DEAD { + write!(f, "D ")?; + } else if pateps.pattern_id().is_some() { + write!(f, "* ")?; + } else { + write!(f, " ")?; + } + write!(f, "{:06?}", sid.as_usize())?; + if !pateps.is_empty() { + write!(f, " ({:?})", pateps)?; + } + write!(f, ": ")?; + debug_state_transitions(f, self, sid)?; + write!(f, "\n")?; + } + writeln!(f, "")?; + for (i, &sid) in self.starts.iter().enumerate() { + if i == 0 { + writeln!(f, "START(ALL): {:?}", sid.as_usize())?; + } else { + writeln!( + f, + "START(pattern: {:?}): {:?}", + i - 1, + sid.as_usize(), + )?; + } + } + writeln!(f, "state length: {:?}", self.state_len())?; + writeln!(f, "pattern length: {:?}", self.pattern_len())?; + writeln!(f, ")")?; + Ok(()) + } +} + +/// An iterator over groups of consecutive equivalent transitions in a single +/// state. +#[derive(Debug)] +struct SparseTransitionIter<'a> { + it: core::iter::Enumerate<core::slice::Iter<'a, Transition>>, + cur: Option<(u8, u8, Transition)>, +} + +impl<'a> Iterator for SparseTransitionIter<'a> { + type Item = (u8, u8, Transition); + + fn next(&mut self) -> Option<(u8, u8, Transition)> { + while let Some((b, &trans)) = self.it.next() { + // Fine because we'll never have more than u8::MAX transitions in + // one state. + let b = b.as_u8(); + let (prev_start, prev_end, prev_trans) = match self.cur { + Some(t) => t, + None => { + self.cur = Some((b, b, trans)); + continue; + } + }; + if prev_trans == trans { + self.cur = Some((prev_start, b, prev_trans)); + } else { + self.cur = Some((b, b, trans)); + if prev_trans.state_id() != DEAD { + return Some((prev_start, prev_end, prev_trans)); + } + } + } + if let Some((start, end, trans)) = self.cur.take() { + if trans.state_id() != DEAD { + return Some((start, end, trans)); + } + } + None + } +} + +/// A cache represents mutable state that a one-pass [`DFA`] requires during a +/// search. +/// +/// For a given one-pass DFA, its corresponding cache may be created either via +/// [`DFA::create_cache`], or via [`Cache::new`]. They are equivalent in every +/// way, except the former does not require explicitly importing `Cache`. +/// +/// A particular `Cache` is coupled with the one-pass DFA from which it was +/// created. It may only be used with that one-pass DFA. A cache and its +/// allocations may be re-purposed via [`Cache::reset`], in which case, it can +/// only be used with the new one-pass DFA (and not the old one). +#[derive(Clone, Debug)] +pub struct Cache { + /// Scratch space used to store slots during a search. Basically, we use + /// the caller provided slots to store slots known when a match occurs. + /// But after a match occurs, we might continue a search but ultimately + /// fail to extend the match. When continuing the search, we need some + /// place to store candidate capture offsets without overwriting the slot + /// offsets recorded for the most recently seen match. + explicit_slots: Vec<Option<NonMaxUsize>>, + /// The number of slots in the caller-provided 'Captures' value for the + /// current search. This is always at most 'explicit_slots.len()', but + /// might be less than it, if the caller provided fewer slots to fill. + explicit_slot_len: usize, +} + +impl Cache { + /// Create a new [`onepass::DFA`](DFA) cache. + /// + /// A potentially more convenient routine to create a cache is + /// [`DFA::create_cache`], as it does not require also importing the + /// `Cache` type. + /// + /// If you want to reuse the returned `Cache` with some other one-pass DFA, + /// then you must call [`Cache::reset`] with the desired one-pass DFA. + pub fn new(re: &DFA) -> Cache { + let mut cache = Cache { explicit_slots: vec![], explicit_slot_len: 0 }; + cache.reset(re); + cache + } + + /// Reset this cache such that it can be used for searching with a + /// different [`onepass::DFA`](DFA). + /// + /// A cache reset permits reusing memory already allocated in this cache + /// with a different one-pass DFA. + /// + /// # Example + /// + /// This shows how to re-purpose a cache for use with a different one-pass + /// DFA. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{dfa::onepass::DFA, Match}; + /// + /// let re1 = DFA::new(r"\w")?; + /// let re2 = DFA::new(r"\W")?; + /// let mut caps1 = re1.create_captures(); + /// let mut caps2 = re2.create_captures(); + /// + /// let mut cache = re1.create_cache(); + /// assert_eq!( + /// Some(Match::must(0, 0..2)), + /// { re1.captures(&mut cache, "Δ", &mut caps1); caps1.get_match() }, + /// ); + /// + /// // Using 'cache' with re2 is not allowed. It may result in panics or + /// // incorrect results. In order to re-purpose the cache, we must reset + /// // it with the one-pass DFA we'd like to use it with. + /// // + /// // Similarly, after this reset, using the cache with 're1' is also not + /// // allowed. + /// re2.reset_cache(&mut cache); + /// assert_eq!( + /// Some(Match::must(0, 0..3)), + /// { re2.captures(&mut cache, "☃", &mut caps2); caps2.get_match() }, + /// ); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn reset(&mut self, re: &DFA) { + let explicit_slot_len = re.get_nfa().group_info().explicit_slot_len(); + self.explicit_slots.resize(explicit_slot_len, None); + self.explicit_slot_len = explicit_slot_len; + } + + /// Returns the heap memory usage, in bytes, of this cache. + /// + /// This does **not** include the stack size used up by this cache. To + /// compute that, use `std::mem::size_of::<Cache>()`. + pub fn memory_usage(&self) -> usize { + self.explicit_slots.len() * core::mem::size_of::<Option<NonMaxUsize>>() + } + + fn explicit_slots(&mut self) -> &mut [Option<NonMaxUsize>] { + &mut self.explicit_slots[..self.explicit_slot_len] + } + + fn setup_search(&mut self, explicit_slot_len: usize) { + self.explicit_slot_len = explicit_slot_len; + } +} + +/// Represents a single transition in a one-pass DFA. +/// +/// The high 24 bits corresponds to the state ID. The low 48 bits corresponds +/// to the transition epsilons, which contains the slots that should be saved +/// when this transition is followed and the conditional epsilon transitions +/// that must be satisfied in order to follow this transition. +#[derive(Clone, Copy, Eq, PartialEq)] +struct Transition(u64); + +impl Transition { + const STATE_ID_BITS: u64 = 21; + const STATE_ID_SHIFT: u64 = 64 - Transition::STATE_ID_BITS; + const STATE_ID_LIMIT: u64 = 1 << Transition::STATE_ID_BITS; + const MATCH_WINS_SHIFT: u64 = 64 - (Transition::STATE_ID_BITS + 1); + const INFO_MASK: u64 = 0x000003FF_FFFFFFFF; + + /// Return a new transition to the given state ID with the given epsilons. + fn new(match_wins: bool, sid: StateID, epsilons: Epsilons) -> Transition { + let match_wins = + if match_wins { 1 << Transition::MATCH_WINS_SHIFT } else { 0 }; + let sid = sid.as_u64() << Transition::STATE_ID_SHIFT; + Transition(sid | match_wins | epsilons.0) + } + + /// Returns true if and only if this transition points to the DEAD state. + fn is_dead(self) -> bool { + self.state_id() == DEAD + } + + /// Return whether this transition has a "match wins" property. + /// + /// When a transition has this property, it means that if a match has been + /// found and the search uses leftmost-first semantics, then that match + /// should be returned immediately instead of continuing on. + /// + /// The "match wins" name comes from RE2, which uses a pretty much + /// identical mechanism for implementing leftmost-first semantics. + fn match_wins(&self) -> bool { + (self.0 >> Transition::MATCH_WINS_SHIFT & 1) == 1 + } + + /// Return the "next" state ID that this transition points to. + fn state_id(&self) -> StateID { + // OK because a Transition has a valid StateID in its upper bits by + // construction. The cast to usize is also correct, even on 16-bit + // targets because, again, we know the upper bits is a valid StateID, + // which can never overflow usize on any supported target. + StateID::new_unchecked( + (self.0 >> Transition::STATE_ID_SHIFT).as_usize(), + ) + } + + /// Set the "next" state ID in this transition. + fn set_state_id(&mut self, sid: StateID) { + *self = Transition::new(self.match_wins(), sid, self.epsilons()); + } + + /// Return the epsilons embedded in this transition. + fn epsilons(&self) -> Epsilons { + Epsilons(self.0 & Transition::INFO_MASK) + } +} + +impl core::fmt::Debug for Transition { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + if self.is_dead() { + return write!(f, "0"); + } + write!(f, "{}", self.state_id().as_usize())?; + if self.match_wins() { + write!(f, "-MW")?; + } + if !self.epsilons().is_empty() { + write!(f, "-{:?}", self.epsilons())?; + } + Ok(()) + } +} + +/// A representation of a match state's pattern ID along with the epsilons for +/// when a match occurs. +/// +/// A match state in a one-pass DFA, unlike in a more general DFA, has exactly +/// one pattern ID. If it had more, then the original NFA would not have been +/// one-pass. +/// +/// The "epsilons" part of this corresponds to what was found in the epsilon +/// transitions between the transition taken in the last byte of input and the +/// ultimate match state. This might include saving slots and/or conditional +/// epsilon transitions that must be satisfied before one can report the match. +/// +/// Technically, every state has room for a 'PatternEpsilons', but it is only +/// ever non-empty for match states. +#[derive(Clone, Copy)] +struct PatternEpsilons(u64); + +impl PatternEpsilons { + const PATTERN_ID_BITS: u64 = 22; + const PATTERN_ID_SHIFT: u64 = 64 - PatternEpsilons::PATTERN_ID_BITS; + // A sentinel value indicating that this is not a match state. We don't + // use 0 since 0 is a valid pattern ID. + const PATTERN_ID_NONE: u64 = 0x00000000_003FFFFF; + const PATTERN_ID_LIMIT: u64 = PatternEpsilons::PATTERN_ID_NONE; + const PATTERN_ID_MASK: u64 = 0xFFFFFC00_00000000; + const EPSILONS_MASK: u64 = 0x000003FF_FFFFFFFF; + + /// Return a new empty pattern epsilons that has no pattern ID and has no + /// epsilons. This is suitable for non-match states. + fn empty() -> PatternEpsilons { + PatternEpsilons( + PatternEpsilons::PATTERN_ID_NONE + << PatternEpsilons::PATTERN_ID_SHIFT, + ) + } + + /// Whether this pattern epsilons is empty or not. It's empty when it has + /// no pattern ID and an empty epsilons. + fn is_empty(self) -> bool { + self.pattern_id().is_none() && self.epsilons().is_empty() + } + + /// Return the pattern ID in this pattern epsilons if one exists. + fn pattern_id(self) -> Option<PatternID> { + let pid = self.0 >> PatternEpsilons::PATTERN_ID_SHIFT; + if pid == PatternEpsilons::PATTERN_ID_LIMIT { + None + } else { + Some(PatternID::new_unchecked(pid.as_usize())) + } + } + + /// Returns the pattern ID without checking whether it's valid. If this is + /// called and there is no pattern ID in this `PatternEpsilons`, then this + /// will likely produce an incorrect result or possibly even a panic or + /// an overflow. But safety will not be violated. + /// + /// This is useful when you know a particular state is a match state. If + /// it's a match state, then it must have a pattern ID. + fn pattern_id_unchecked(self) -> PatternID { + let pid = self.0 >> PatternEpsilons::PATTERN_ID_SHIFT; + PatternID::new_unchecked(pid.as_usize()) + } + + /// Return a new pattern epsilons with the given pattern ID, but the same + /// epsilons. + fn set_pattern_id(self, pid: PatternID) -> PatternEpsilons { + PatternEpsilons( + (pid.as_u64() << PatternEpsilons::PATTERN_ID_SHIFT) + | (self.0 & PatternEpsilons::EPSILONS_MASK), + ) + } + + /// Return the epsilons part of this pattern epsilons. + fn epsilons(self) -> Epsilons { + Epsilons(self.0 & PatternEpsilons::EPSILONS_MASK) + } + + /// Return a new pattern epsilons with the given epsilons, but the same + /// pattern ID. + fn set_epsilons(self, epsilons: Epsilons) -> PatternEpsilons { + PatternEpsilons( + (self.0 & PatternEpsilons::PATTERN_ID_MASK) + | u64::from(epsilons.0), + ) + } +} + +impl core::fmt::Debug for PatternEpsilons { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + if self.is_empty() { + return write!(f, "N/A"); + } + if let Some(pid) = self.pattern_id() { + write!(f, "{}", pid.as_usize())?; + } + if !self.epsilons().is_empty() { + if self.pattern_id().is_some() { + write!(f, "/")?; + } + write!(f, "{:?}", self.epsilons())?; + } + Ok(()) + } +} + +/// Epsilons represents all of the NFA epsilons transitions that went into a +/// single transition in a single DFA state. In this case, it only represents +/// the epsilon transitions that have some kind of non-consuming side effect: +/// either the transition requires storing the current position of the search +/// into a slot, or the transition is conditional and requires the current +/// position in the input to satisfy an assertion before the transition may be +/// taken. +/// +/// This folds the cumulative effect of a group of NFA states (all connected +/// by epsilon transitions) down into a single set of bits. While these bits +/// can represent all possible conditional epsilon transitions, it only permits +/// storing up to a somewhat small number of slots. +/// +/// Epsilons is represented as a 42-bit integer. For example, it is packed into +/// the lower 42 bits of a `Transition`. (Where the high 22 bits contains a +/// `StateID` and a special "match wins" property.) +#[derive(Clone, Copy)] +struct Epsilons(u64); + +impl Epsilons { + const SLOT_MASK: u64 = 0x000003FF_FFFFFC00; + const SLOT_SHIFT: u64 = 10; + const LOOK_MASK: u64 = 0x00000000_000003FF; + + /// Create a new empty epsilons. It has no slots and no assertions that + /// need to be satisfied. + fn empty() -> Epsilons { + Epsilons(0) + } + + /// Returns true if this epsilons contains no slots and no assertions. + fn is_empty(self) -> bool { + self.0 == 0 + } + + /// Returns the slot epsilon transitions. + fn slots(self) -> Slots { + Slots((self.0 >> Epsilons::SLOT_SHIFT).low_u32()) + } + + /// Set the slot epsilon transitions. + fn set_slots(self, slots: Slots) -> Epsilons { + Epsilons( + (u64::from(slots.0) << Epsilons::SLOT_SHIFT) + | (self.0 & Epsilons::LOOK_MASK), + ) + } + + /// Return the set of look-around assertions in these epsilon transitions. + fn looks(self) -> LookSet { + LookSet { bits: (self.0 & Epsilons::LOOK_MASK).low_u16() } + } + + /// Set the look-around assertions on these epsilon transitions. + fn set_looks(self, look_set: LookSet) -> Epsilons { + Epsilons((self.0 & Epsilons::SLOT_MASK) | u64::from(look_set.bits)) + } +} + +impl core::fmt::Debug for Epsilons { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + let mut wrote = false; + if !self.slots().is_empty() { + write!(f, "{:?}", self.slots())?; + wrote = true; + } + if !self.looks().is_empty() { + if wrote { + write!(f, "/")?; + } + write!(f, "{:?}", self.looks())?; + wrote = true; + } + if !wrote { + write!(f, "N/A")?; + } + Ok(()) + } +} + +/// The set of epsilon transitions indicating that the current position in a +/// search should be saved to a slot. +/// +/// This *only* represents explicit slots. So for example, the pattern +/// `[a-z]+([0-9]+)([a-z]+)` has: +/// +/// * 3 capturing groups, thus 6 slots. +/// * 1 implicit capturing group, thus 2 implicit slots. +/// * 2 explicit capturing groups, thus 4 explicit slots. +/// +/// While implicit slots are represented by epsilon transitions in an NFA, we +/// do not explicitly represent them here. Instead, implicit slots are assumed +/// to be present and handled automatically in the search code. Therefore, +/// that means we only need to represent explicit slots in our epsilon +/// transitions. +/// +/// Its representation is a bit set. The bit 'i' is set if and only if there +/// exists an explicit slot at index 'c', where 'c = (#patterns * 2) + i'. That +/// is, the bit 'i' corresponds to the first explicit slot and the first +/// explicit slot appears immediately following the last implicit slot. (If +/// this is confusing, see `GroupInfo` for more details on how slots works.) +/// +/// A single `Slots` represents all the active slots in a sub-graph of an NFA, +/// where all the states are connected by epsilon transitions. In effect, when +/// traversing the one-pass DFA during a search, all slots set in a particular +/// transition must be captured by recording the current search position. +/// +/// The API of `Slots` requires the caller to handle the explicit slot offset. +/// That is, a `Slots` doesn't know where the explicit slots start for a +/// particular NFA. Thus, if the callers see's the bit 'i' is set, then they +/// need to do the arithmetic above to find 'c', which is the real actual slot +/// index in the corresponding NFA. +#[derive(Clone, Copy)] +struct Slots(u32); + +impl Slots { + const LIMIT: usize = 32; + + /// Insert the slot at the given bit index. + fn insert(self, slot: usize) -> Slots { + debug_assert!(slot < Slots::LIMIT); + Slots(self.0 | (1 << slot.as_u32())) + } + + /// Remove the slot at the given bit index. + fn remove(self, slot: usize) -> Slots { + debug_assert!(slot < Slots::LIMIT); + Slots(self.0 & !(1 << slot.as_u32())) + } + + /// Returns true if and only if this set contains no slots. + fn is_empty(self) -> bool { + self.0 == 0 + } + + /// Returns an iterator over all of the set bits in this set. + fn iter(self) -> SlotsIter { + SlotsIter { slots: self } + } + + /// For the position `at` in the current haystack, copy it to + /// `caller_explicit_slots` for all slots that are in this set. + /// + /// Callers may pass a slice of any length. Slots in this set bigger than + /// the length of the given explicit slots are simply skipped. + /// + /// The slice *must* correspond only to the explicit slots and the first + /// element of the slice must always correspond to the first explicit slot + /// in the corresponding NFA. + fn apply( + self, + at: usize, + caller_explicit_slots: &mut [Option<NonMaxUsize>], + ) { + if self.is_empty() { + return; + } + let at = NonMaxUsize::new(at); + for slot in self.iter() { + if slot >= caller_explicit_slots.len() { + break; + } + caller_explicit_slots[slot] = at; + } + } +} + +impl core::fmt::Debug for Slots { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + write!(f, "S")?; + for slot in self.iter() { + write!(f, "-{:?}", slot)?; + } + Ok(()) + } +} + +/// An iterator over all of the bits set in a slot set. +/// +/// This returns the bit index that is set, so callers may need to offset it +/// to get the actual NFA slot index. +#[derive(Debug)] +struct SlotsIter { + slots: Slots, +} + +impl Iterator for SlotsIter { + type Item = usize; + + fn next(&mut self) -> Option<usize> { + // Number of zeroes here is always <= u8::MAX, and so fits in a usize. + let slot = self.slots.0.trailing_zeros().as_usize(); + if slot >= Slots::LIMIT { + return None; + } + self.slots = self.slots.remove(slot); + Some(slot) + } +} + +/// An error that occurred during the construction of a one-pass DFA. +/// +/// This error does not provide many introspection capabilities. There are +/// generally only two things you can do with it: +/// +/// * Obtain a human readable message via its `std::fmt::Display` impl. +/// * Access an underlying [`thompson::BuildError`] type from its `source` +/// method via the `std::error::Error` trait. This error only occurs when using +/// convenience routines for building a one-pass DFA directly from a pattern +/// string. +/// +/// When the `std` feature is enabled, this implements the `std::error::Error` +/// trait. +#[derive(Clone, Debug)] +pub struct BuildError { + kind: BuildErrorKind, +} + +/// The kind of error that occurred during the construction of a one-pass DFA. +#[derive(Clone, Debug)] +enum BuildErrorKind { + NFA(crate::nfa::thompson::BuildError), + Word(UnicodeWordBoundaryError), + TooManyStates { limit: u64 }, + TooManyPatterns { limit: u64 }, + UnsupportedLook { look: Look }, + ExceededSizeLimit { limit: usize }, + NotOnePass { msg: &'static str }, +} + +impl BuildError { + fn nfa(err: crate::nfa::thompson::BuildError) -> BuildError { + BuildError { kind: BuildErrorKind::NFA(err) } + } + + fn word(err: UnicodeWordBoundaryError) -> BuildError { + BuildError { kind: BuildErrorKind::Word(err) } + } + + fn too_many_states(limit: u64) -> BuildError { + BuildError { kind: BuildErrorKind::TooManyStates { limit } } + } + + fn too_many_patterns(limit: u64) -> BuildError { + BuildError { kind: BuildErrorKind::TooManyPatterns { limit } } + } + + fn unsupported_look(look: Look) -> BuildError { + BuildError { kind: BuildErrorKind::UnsupportedLook { look } } + } + + fn exceeded_size_limit(limit: usize) -> BuildError { + BuildError { kind: BuildErrorKind::ExceededSizeLimit { limit } } + } + + fn not_one_pass(msg: &'static str) -> BuildError { + BuildError { kind: BuildErrorKind::NotOnePass { msg } } + } +} + +#[cfg(feature = "std")] +impl std::error::Error for BuildError { + fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { + use self::BuildErrorKind::*; + + match self.kind { + NFA(ref err) => Some(err), + Word(ref err) => Some(err), + _ => None, + } + } +} + +impl core::fmt::Display for BuildError { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + use self::BuildErrorKind::*; + + match self.kind { + NFA(_) => write!(f, "error building NFA"), + Word(_) => write!(f, "NFA contains Unicode word boundary"), + TooManyStates { limit } => write!( + f, + "one-pass DFA exceeded a limit of {:?} for number of states", + limit, + ), + TooManyPatterns { limit } => write!( + f, + "one-pass DFA exceeded a limit of {:?} for number of patterns", + limit, + ), + UnsupportedLook { look } => write!( + f, + "one-pass DFA does not support the {:?} assertion", + look, + ), + ExceededSizeLimit { limit } => write!( + f, + "one-pass DFA exceeded size limit of {:?} during building", + limit, + ), + NotOnePass { msg } => write!( + f, + "one-pass DFA could not be built because \ + pattern is not one-pass: {}", + msg, + ), + } + } +} + +#[cfg(all(test, feature = "syntax"))] +mod tests { + use alloc::string::ToString; + + use super::*; + + #[test] + fn fail_conflicting_transition() { + let predicate = |err: &str| err.contains("conflicting transition"); + + let err = DFA::new(r"a*[ab]").unwrap_err().to_string(); + assert!(predicate(&err), "{}", err); + } + + #[test] + fn fail_multiple_epsilon() { + let predicate = |err: &str| { + err.contains("multiple epsilon transitions to same state") + }; + + let err = DFA::new(r"(^|$)a").unwrap_err().to_string(); + assert!(predicate(&err), "{}", err); + } + + #[test] + fn fail_multiple_match() { + let predicate = |err: &str| { + err.contains("multiple epsilon transitions to match state") + }; + + let err = DFA::new_many(&[r"^", r"$"]).unwrap_err().to_string(); + assert!(predicate(&err), "{}", err); + } + + // This test is meant to build a one-pass regex with the maximum number of + // possible slots. + // + // NOTE: Remember that the slot limit only applies to explicit capturing + // groups. Any number of implicit capturing groups is supported (up to the + // maximum number of supported patterns), since implicit groups are handled + // by the search loop itself. + #[test] + fn max_slots() { + // One too many... + let pat = r"(a)(b)(c)(d)(e)(f)(g)(h)(i)(j)(k)(l)(m)(n)(o)(p)(q)"; + assert!(DFA::new(pat).is_err()); + // Just right. + let pat = r"(a)(b)(c)(d)(e)(f)(g)(h)(i)(j)(k)(l)(m)(n)(o)(p)"; + assert!(DFA::new(pat).is_ok()); + } + + // This test ensures that the one-pass DFA works with all look-around + // assertions that we expect it to work with. + // + // The utility of this test is that each one-pass transition has a small + // amount of space to store look-around assertions. Currently, there is + // logic in the one-pass constructor to ensure there aren't more than ten + // possible assertions. And indeed, there are only ten possible assertions + // (at time of writing), so this is okay. But conceivably, more assertions + // could be added. So we check that things at least work with what we + // expect them to work with. + #[test] + fn assertions() { + // haystack anchors + assert!(DFA::new(r"^").is_ok()); + assert!(DFA::new(r"$").is_ok()); + + // line anchors + assert!(DFA::new(r"(?m)^").is_ok()); + assert!(DFA::new(r"(?m)$").is_ok()); + assert!(DFA::new(r"(?Rm)^").is_ok()); + assert!(DFA::new(r"(?Rm)$").is_ok()); + + // word boundaries + if cfg!(feature = "unicode-word-boundary") { + assert!(DFA::new(r"\b").is_ok()); + assert!(DFA::new(r"\B").is_ok()); + } + assert!(DFA::new(r"(?-u)\b").is_ok()); + assert!(DFA::new(r"(?-u)\B").is_ok()); + } + + #[cfg(not(miri))] // takes too long on miri + #[test] + fn is_one_pass() { + use crate::util::syntax; + + assert!(DFA::new(r"a*b").is_ok()); + if cfg!(feature = "unicode-perl") { + assert!(DFA::new(r"\w").is_ok()); + } + assert!(DFA::new(r"(?-u)\w*\s").is_ok()); + assert!(DFA::new(r"(?s:.)*?").is_ok()); + assert!(DFA::builder() + .syntax(syntax::Config::new().utf8(false)) + .build(r"(?s-u:.)*?") + .is_ok()); + } + + #[test] + fn is_not_one_pass() { + assert!(DFA::new(r"a*a").is_err()); + assert!(DFA::new(r"(?s-u:.)*?").is_err()); + assert!(DFA::new(r"(?s:.)*?a").is_err()); + } + + #[cfg(not(miri))] + #[test] + fn is_not_one_pass_bigger() { + assert!(DFA::new(r"\w*\s").is_err()); + } +} diff --git a/third_party/rust/regex-automata/src/dfa/regex.rs b/third_party/rust/regex-automata/src/dfa/regex.rs new file mode 100644 index 0000000000..f39c1c055c --- /dev/null +++ b/third_party/rust/regex-automata/src/dfa/regex.rs @@ -0,0 +1,871 @@ +/*! +A DFA-backed `Regex`. + +This module provides [`Regex`], which is defined generically over the +[`Automaton`] trait. A `Regex` implements convenience routines you might have +come to expect, such as finding the start/end of a match and iterating over +all non-overlapping matches. This `Regex` type is limited in its capabilities +to what a DFA can provide. Therefore, APIs involving capturing groups, for +example, are not provided. + +Internally, a `Regex` is composed of two DFAs. One is a "forward" DFA that +finds the end offset of a match, where as the other is a "reverse" DFA that +find the start offset of a match. + +See the [parent module](crate::dfa) for examples. +*/ + +#[cfg(feature = "alloc")] +use alloc::vec::Vec; + +#[cfg(feature = "dfa-build")] +use crate::dfa::dense::BuildError; +use crate::{ + dfa::{automaton::Automaton, dense}, + util::{iter, search::Input}, + Anchored, Match, MatchError, +}; +#[cfg(feature = "alloc")] +use crate::{ + dfa::{sparse, StartKind}, + util::search::MatchKind, +}; + +// When the alloc feature is enabled, the regex type sets its A type parameter +// to default to an owned dense DFA. But without alloc, we set no default. This +// makes things a lot more convenient in the common case, since writing out the +// DFA types is pretty annoying. +// +// Since we have two different definitions but only want to write one doc +// string, we use a macro to capture the doc and other attributes once and then +// repeat them for each definition. +macro_rules! define_regex_type { + ($(#[$doc:meta])*) => { + #[cfg(feature = "alloc")] + $(#[$doc])* + pub struct Regex<A = dense::OwnedDFA> { + forward: A, + reverse: A, + } + + #[cfg(not(feature = "alloc"))] + $(#[$doc])* + pub struct Regex<A> { + forward: A, + reverse: A, + } + }; +} + +define_regex_type!( + /// A regular expression that uses deterministic finite automata for fast + /// searching. + /// + /// A regular expression is comprised of two DFAs, a "forward" DFA and a + /// "reverse" DFA. The forward DFA is responsible for detecting the end of + /// a match while the reverse DFA is responsible for detecting the start + /// of a match. Thus, in order to find the bounds of any given match, a + /// forward search must first be run followed by a reverse search. A match + /// found by the forward DFA guarantees that the reverse DFA will also find + /// a match. + /// + /// The type of the DFA used by a `Regex` corresponds to the `A` type + /// parameter, which must satisfy the [`Automaton`] trait. Typically, + /// `A` is either a [`dense::DFA`](crate::dfa::dense::DFA) or a + /// [`sparse::DFA`](crate::dfa::sparse::DFA), where dense DFAs use more + /// memory but search faster, while sparse DFAs use less memory but search + /// more slowly. + /// + /// # Crate features + /// + /// Note that despite what the documentation auto-generates, the _only_ + /// crate feature needed to use this type is `dfa-search`. You do _not_ + /// need to enable the `alloc` feature. + /// + /// By default, a regex's automaton type parameter is set to + /// `dense::DFA<Vec<u32>>` when the `alloc` feature is enabled. For most + /// in-memory work loads, this is the most convenient type that gives the + /// best search performance. When the `alloc` feature is disabled, no + /// default type is used. + /// + /// # When should I use this? + /// + /// Generally speaking, if you can afford the overhead of building a full + /// DFA for your regex, and you don't need things like capturing groups, + /// then this is a good choice if you're looking to optimize for matching + /// speed. Note however that its speed may be worse than a general purpose + /// regex engine if you don't provide a [`dense::Config::prefilter`] to the + /// underlying DFA. + /// + /// # Sparse DFAs + /// + /// Since a `Regex` is generic over the [`Automaton`] trait, it can be + /// used with any kind of DFA. While this crate constructs dense DFAs by + /// default, it is easy enough to build corresponding sparse DFAs, and then + /// build a regex from them: + /// + /// ``` + /// use regex_automata::dfa::regex::Regex; + /// + /// // First, build a regex that uses dense DFAs. + /// let dense_re = Regex::new("foo[0-9]+")?; + /// + /// // Second, build sparse DFAs from the forward and reverse dense DFAs. + /// let fwd = dense_re.forward().to_sparse()?; + /// let rev = dense_re.reverse().to_sparse()?; + /// + /// // Third, build a new regex from the constituent sparse DFAs. + /// let sparse_re = Regex::builder().build_from_dfas(fwd, rev); + /// + /// // A regex that uses sparse DFAs can be used just like with dense DFAs. + /// assert_eq!(true, sparse_re.is_match(b"foo123")); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// Alternatively, one can use a [`Builder`] to construct a sparse DFA + /// more succinctly. (Note though that dense DFAs are still constructed + /// first internally, and then converted to sparse DFAs, as in the example + /// above.) + /// + /// ``` + /// use regex_automata::dfa::regex::Regex; + /// + /// let sparse_re = Regex::builder().build_sparse(r"foo[0-9]+")?; + /// // A regex that uses sparse DFAs can be used just like with dense DFAs. + /// assert!(sparse_re.is_match(b"foo123")); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// # Fallibility + /// + /// Most of the search routines defined on this type will _panic_ when the + /// underlying search fails. This might be because the DFA gave up because + /// it saw a quit byte, whether configured explicitly or via heuristic + /// Unicode word boundary support, although neither are enabled by default. + /// Or it might fail because an invalid `Input` configuration is given, + /// for example, with an unsupported [`Anchored`] mode. + /// + /// If you need to handle these error cases instead of allowing them to + /// trigger a panic, then the lower level [`Regex::try_search`] provides + /// a fallible API that never panics. + /// + /// # Example + /// + /// This example shows how to cause a search to terminate if it sees a + /// `\n` byte, and handle the error returned. This could be useful if, for + /// example, you wanted to prevent a user supplied pattern from matching + /// across a line boundary. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{dfa::{self, regex::Regex}, Input, MatchError}; + /// + /// let re = Regex::builder() + /// .dense(dfa::dense::Config::new().quit(b'\n', true)) + /// .build(r"foo\p{any}+bar")?; + /// + /// let input = Input::new("foo\nbar"); + /// // Normally this would produce a match, since \p{any} contains '\n'. + /// // But since we instructed the automaton to enter a quit state if a + /// // '\n' is observed, this produces a match error instead. + /// let expected = MatchError::quit(b'\n', 3); + /// let got = re.try_search(&input).unwrap_err(); + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[derive(Clone, Debug)] +); + +#[cfg(all(feature = "syntax", feature = "dfa-build"))] +impl Regex { + /// Parse the given regular expression using the default configuration and + /// return the corresponding regex. + /// + /// If you want a non-default configuration, then use the [`Builder`] to + /// set your own configuration. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{Match, dfa::regex::Regex}; + /// + /// let re = Regex::new("foo[0-9]+bar")?; + /// assert_eq!( + /// Some(Match::must(0, 3..14)), + /// re.find(b"zzzfoo12345barzzz"), + /// ); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn new(pattern: &str) -> Result<Regex, BuildError> { + Builder::new().build(pattern) + } + + /// Like `new`, but parses multiple patterns into a single "regex set." + /// This similarly uses the default regex configuration. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{Match, dfa::regex::Regex}; + /// + /// let re = Regex::new_many(&["[a-z]+", "[0-9]+"])?; + /// + /// let mut it = re.find_iter(b"abc 1 foo 4567 0 quux"); + /// assert_eq!(Some(Match::must(0, 0..3)), it.next()); + /// assert_eq!(Some(Match::must(1, 4..5)), it.next()); + /// assert_eq!(Some(Match::must(0, 6..9)), it.next()); + /// assert_eq!(Some(Match::must(1, 10..14)), it.next()); + /// assert_eq!(Some(Match::must(1, 15..16)), it.next()); + /// assert_eq!(Some(Match::must(0, 17..21)), it.next()); + /// assert_eq!(None, it.next()); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn new_many<P: AsRef<str>>( + patterns: &[P], + ) -> Result<Regex, BuildError> { + Builder::new().build_many(patterns) + } +} + +#[cfg(all(feature = "syntax", feature = "dfa-build"))] +impl Regex<sparse::DFA<Vec<u8>>> { + /// Parse the given regular expression using the default configuration, + /// except using sparse DFAs, and return the corresponding regex. + /// + /// If you want a non-default configuration, then use the [`Builder`] to + /// set your own configuration. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{Match, dfa::regex::Regex}; + /// + /// let re = Regex::new_sparse("foo[0-9]+bar")?; + /// assert_eq!( + /// Some(Match::must(0, 3..14)), + /// re.find(b"zzzfoo12345barzzz"), + /// ); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn new_sparse( + pattern: &str, + ) -> Result<Regex<sparse::DFA<Vec<u8>>>, BuildError> { + Builder::new().build_sparse(pattern) + } + + /// Like `new`, but parses multiple patterns into a single "regex set" + /// using sparse DFAs. This otherwise similarly uses the default regex + /// configuration. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{Match, dfa::regex::Regex}; + /// + /// let re = Regex::new_many_sparse(&["[a-z]+", "[0-9]+"])?; + /// + /// let mut it = re.find_iter(b"abc 1 foo 4567 0 quux"); + /// assert_eq!(Some(Match::must(0, 0..3)), it.next()); + /// assert_eq!(Some(Match::must(1, 4..5)), it.next()); + /// assert_eq!(Some(Match::must(0, 6..9)), it.next()); + /// assert_eq!(Some(Match::must(1, 10..14)), it.next()); + /// assert_eq!(Some(Match::must(1, 15..16)), it.next()); + /// assert_eq!(Some(Match::must(0, 17..21)), it.next()); + /// assert_eq!(None, it.next()); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn new_many_sparse<P: AsRef<str>>( + patterns: &[P], + ) -> Result<Regex<sparse::DFA<Vec<u8>>>, BuildError> { + Builder::new().build_many_sparse(patterns) + } +} + +/// Convenience routines for regex construction. +impl Regex<dense::DFA<&'static [u32]>> { + /// Return a builder for configuring the construction of a `Regex`. + /// + /// This is a convenience routine to avoid needing to import the + /// [`Builder`] type in common cases. + /// + /// # Example + /// + /// This example shows how to use the builder to disable UTF-8 mode + /// everywhere. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{ + /// dfa::regex::Regex, nfa::thompson, util::syntax, Match, + /// }; + /// + /// let re = Regex::builder() + /// .syntax(syntax::Config::new().utf8(false)) + /// .thompson(thompson::Config::new().utf8(false)) + /// .build(r"foo(?-u:[^b])ar.*")?; + /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n"; + /// let expected = Some(Match::must(0, 1..9)); + /// let got = re.find(haystack); + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn builder() -> Builder { + Builder::new() + } +} + +/// Standard search routines for finding and iterating over matches. +impl<A: Automaton> Regex<A> { + /// Returns true if and only if this regex matches the given haystack. + /// + /// This routine may short circuit if it knows that scanning future input + /// will never lead to a different result. In particular, if the underlying + /// DFA enters a match state or a dead state, then this routine will return + /// `true` or `false`, respectively, without inspecting any future input. + /// + /// # Panics + /// + /// This routine panics if the search could not complete. This can occur + /// in a number of circumstances: + /// + /// * The configuration of the DFA may permit it to "quit" the search. + /// For example, setting quit bytes or enabling heuristic support for + /// Unicode word boundaries. The default configuration does not enable any + /// option that could result in the DFA quitting. + /// * When the provided `Input` configuration is not supported. For + /// example, by providing an unsupported anchor mode. + /// + /// When a search panics, callers cannot know whether a match exists or + /// not. + /// + /// Use [`Regex::try_search`] if you want to handle these error conditions. + /// + /// # Example + /// + /// ``` + /// use regex_automata::dfa::regex::Regex; + /// + /// let re = Regex::new("foo[0-9]+bar")?; + /// assert_eq!(true, re.is_match("foo12345bar")); + /// assert_eq!(false, re.is_match("foobar")); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn is_match<'h, I: Into<Input<'h>>>(&self, input: I) -> bool { + // Not only can we do an "earliest" search, but we can avoid doing a + // reverse scan too. + let input = input.into().earliest(true); + self.forward().try_search_fwd(&input).map(|x| x.is_some()).unwrap() + } + + /// Returns the start and end offset of the leftmost match. If no match + /// exists, then `None` is returned. + /// + /// # Panics + /// + /// This routine panics if the search could not complete. This can occur + /// in a number of circumstances: + /// + /// * The configuration of the DFA may permit it to "quit" the search. + /// For example, setting quit bytes or enabling heuristic support for + /// Unicode word boundaries. The default configuration does not enable any + /// option that could result in the DFA quitting. + /// * When the provided `Input` configuration is not supported. For + /// example, by providing an unsupported anchor mode. + /// + /// When a search panics, callers cannot know whether a match exists or + /// not. + /// + /// Use [`Regex::try_search`] if you want to handle these error conditions. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{Match, dfa::regex::Regex}; + /// + /// // Greediness is applied appropriately. + /// let re = Regex::new("foo[0-9]+")?; + /// assert_eq!(Some(Match::must(0, 3..11)), re.find("zzzfoo12345zzz")); + /// + /// // Even though a match is found after reading the first byte (`a`), + /// // the default leftmost-first match semantics demand that we find the + /// // earliest match that prefers earlier parts of the pattern over latter + /// // parts. + /// let re = Regex::new("abc|a")?; + /// assert_eq!(Some(Match::must(0, 0..3)), re.find("abc")); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn find<'h, I: Into<Input<'h>>>(&self, input: I) -> Option<Match> { + self.try_search(&input.into()).unwrap() + } + + /// Returns an iterator over all non-overlapping leftmost matches in the + /// given bytes. If no match exists, then the iterator yields no elements. + /// + /// This corresponds to the "standard" regex search iterator. + /// + /// # Panics + /// + /// If the search returns an error during iteration, then iteration + /// panics. See [`Regex::find`] for the panic conditions. + /// + /// Use [`Regex::try_search`] with + /// [`util::iter::Searcher`](crate::util::iter::Searcher) if you want to + /// handle these error conditions. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{Match, dfa::regex::Regex}; + /// + /// let re = Regex::new("foo[0-9]+")?; + /// let text = "foo1 foo12 foo123"; + /// let matches: Vec<Match> = re.find_iter(text).collect(); + /// assert_eq!(matches, vec![ + /// Match::must(0, 0..4), + /// Match::must(0, 5..10), + /// Match::must(0, 11..17), + /// ]); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn find_iter<'r, 'h, I: Into<Input<'h>>>( + &'r self, + input: I, + ) -> FindMatches<'r, 'h, A> { + let it = iter::Searcher::new(input.into()); + FindMatches { re: self, it } + } +} + +/// Lower level fallible search routines that permit controlling where the +/// search starts and ends in a particular sequence. +impl<A: Automaton> Regex<A> { + /// Returns the start and end offset of the leftmost match. If no match + /// exists, then `None` is returned. + /// + /// This is like [`Regex::find`] but with two differences: + /// + /// 1. It is not generic over `Into<Input>` and instead accepts a + /// `&Input`. This permits reusing the same `Input` for multiple searches + /// without needing to create a new one. This _may_ help with latency. + /// 2. It returns an error if the search could not complete where as + /// [`Regex::find`] will panic. + /// + /// # Errors + /// + /// This routine errors if the search could not complete. This can occur + /// in the following circumstances: + /// + /// * The configuration of the DFA may permit it to "quit" the search. + /// For example, setting quit bytes or enabling heuristic support for + /// Unicode word boundaries. The default configuration does not enable any + /// option that could result in the DFA quitting. + /// * When the provided `Input` configuration is not supported. For + /// example, by providing an unsupported anchor mode. + /// + /// When a search returns an error, callers cannot know whether a match + /// exists or not. + #[inline] + pub fn try_search( + &self, + input: &Input<'_>, + ) -> Result<Option<Match>, MatchError> { + let (fwd, rev) = (self.forward(), self.reverse()); + let end = match fwd.try_search_fwd(input)? { + None => return Ok(None), + Some(end) => end, + }; + // This special cases an empty match at the beginning of the search. If + // our end matches our start, then since a reverse DFA can't match past + // the start, it must follow that our starting position is also our end + // position. So short circuit and skip the reverse search. + if input.start() == end.offset() { + return Ok(Some(Match::new( + end.pattern(), + end.offset()..end.offset(), + ))); + } + // We can also skip the reverse search if we know our search was + // anchored. This occurs either when the input config is anchored or + // when we know the regex itself is anchored. In this case, we know the + // start of the match, if one is found, must be the start of the + // search. + if self.is_anchored(input) { + return Ok(Some(Match::new( + end.pattern(), + input.start()..end.offset(), + ))); + } + // N.B. I have tentatively convinced myself that it isn't necessary + // to specify the specific pattern for the reverse search since the + // reverse search will always find the same pattern to match as the + // forward search. But I lack a rigorous proof. Why not just provide + // the pattern anyway? Well, if it is needed, then leaving it out + // gives us a chance to find a witness. (Also, if we don't need to + // specify the pattern, then we don't need to build the reverse DFA + // with 'starts_for_each_pattern' enabled.) + // + // We also need to be careful to disable 'earliest' for the reverse + // search, since it could be enabled for the forward search. In the + // reverse case, to satisfy "leftmost" criteria, we need to match + // as much as we can. We also need to be careful to make the search + // anchored. We don't want the reverse search to report any matches + // other than the one beginning at the end of our forward search. + let revsearch = input + .clone() + .span(input.start()..end.offset()) + .anchored(Anchored::Yes) + .earliest(false); + let start = rev + .try_search_rev(&revsearch)? + .expect("reverse search must match if forward search does"); + assert_eq!( + start.pattern(), + end.pattern(), + "forward and reverse search must match same pattern", + ); + assert!(start.offset() <= end.offset()); + Ok(Some(Match::new(end.pattern(), start.offset()..end.offset()))) + } + + /// Returns true if either the given input specifies an anchored search + /// or if the underlying DFA is always anchored. + fn is_anchored(&self, input: &Input<'_>) -> bool { + match input.get_anchored() { + Anchored::No => self.forward().is_always_start_anchored(), + Anchored::Yes | Anchored::Pattern(_) => true, + } + } +} + +/// Non-search APIs for querying information about the regex and setting a +/// prefilter. +impl<A: Automaton> Regex<A> { + /// Return the underlying DFA responsible for forward matching. + /// + /// This is useful for accessing the underlying DFA and converting it to + /// some other format or size. See the [`Builder::build_from_dfas`] docs + /// for an example of where this might be useful. + pub fn forward(&self) -> &A { + &self.forward + } + + /// Return the underlying DFA responsible for reverse matching. + /// + /// This is useful for accessing the underlying DFA and converting it to + /// some other format or size. See the [`Builder::build_from_dfas`] docs + /// for an example of where this might be useful. + pub fn reverse(&self) -> &A { + &self.reverse + } + + /// Returns the total number of patterns matched by this regex. + /// + /// # Example + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::dfa::regex::Regex; + /// + /// let re = Regex::new_many(&[r"[a-z]+", r"[0-9]+", r"\w+"])?; + /// assert_eq!(3, re.pattern_len()); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn pattern_len(&self) -> usize { + assert_eq!(self.forward().pattern_len(), self.reverse().pattern_len()); + self.forward().pattern_len() + } +} + +/// An iterator over all non-overlapping matches for an infallible search. +/// +/// The iterator yields a [`Match`] value until no more matches could be found. +/// If the underlying regex engine returns an error, then a panic occurs. +/// +/// The type parameters are as follows: +/// +/// * `A` represents the type of the underlying DFA that implements the +/// [`Automaton`] trait. +/// +/// The lifetime parameters are as follows: +/// +/// * `'h` represents the lifetime of the haystack being searched. +/// * `'r` represents the lifetime of the regex object itself. +/// +/// This iterator can be created with the [`Regex::find_iter`] method. +#[derive(Debug)] +pub struct FindMatches<'r, 'h, A> { + re: &'r Regex<A>, + it: iter::Searcher<'h>, +} + +impl<'r, 'h, A: Automaton> Iterator for FindMatches<'r, 'h, A> { + type Item = Match; + + #[inline] + fn next(&mut self) -> Option<Match> { + let FindMatches { re, ref mut it } = *self; + it.advance(|input| re.try_search(input)) + } +} + +/// A builder for a regex based on deterministic finite automatons. +/// +/// This builder permits configuring options for the syntax of a pattern, the +/// NFA construction, the DFA construction and finally the regex searching +/// itself. This builder is different from a general purpose regex builder in +/// that it permits fine grain configuration of the construction process. The +/// trade off for this is complexity, and the possibility of setting a +/// configuration that might not make sense. For example, there are two +/// different UTF-8 modes: +/// +/// * [`syntax::Config::utf8`](crate::util::syntax::Config::utf8) controls +/// whether the pattern itself can contain sub-expressions that match invalid +/// UTF-8. +/// * [`thompson::Config::utf8`](crate::nfa::thompson::Config::utf8) controls +/// how the regex iterators themselves advance the starting position of the +/// next search when a match with zero length is found. +/// +/// Generally speaking, callers will want to either enable all of these or +/// disable all of these. +/// +/// Internally, building a regex requires building two DFAs, where one is +/// responsible for finding the end of a match and the other is responsible +/// for finding the start of a match. If you only need to detect whether +/// something matched, or only the end of a match, then you should use a +/// [`dense::Builder`] to construct a single DFA, which is cheaper than +/// building two DFAs. +/// +/// # Build methods +/// +/// This builder has a few "build" methods. In general, it's the result of +/// combining the following parameters: +/// +/// * Building one or many regexes. +/// * Building a regex with dense or sparse DFAs. +/// +/// The simplest "build" method is [`Builder::build`]. It accepts a single +/// pattern and builds a dense DFA using `usize` for the state identifier +/// representation. +/// +/// The most general "build" method is [`Builder::build_many`], which permits +/// building a regex that searches for multiple patterns simultaneously while +/// using a specific state identifier representation. +/// +/// The most flexible "build" method, but hardest to use, is +/// [`Builder::build_from_dfas`]. This exposes the fact that a [`Regex`] is +/// just a pair of DFAs, and this method allows you to specify those DFAs +/// exactly. +/// +/// # Example +/// +/// This example shows how to disable UTF-8 mode in the syntax and the regex +/// itself. This is generally what you want for matching on arbitrary bytes. +/// +/// ``` +/// # if cfg!(miri) { return Ok(()); } // miri takes too long +/// use regex_automata::{ +/// dfa::regex::Regex, nfa::thompson, util::syntax, Match, +/// }; +/// +/// let re = Regex::builder() +/// .syntax(syntax::Config::new().utf8(false)) +/// .thompson(thompson::Config::new().utf8(false)) +/// .build(r"foo(?-u:[^b])ar.*")?; +/// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n"; +/// let expected = Some(Match::must(0, 1..9)); +/// let got = re.find(haystack); +/// assert_eq!(expected, got); +/// // Notice that `(?-u:[^b])` matches invalid UTF-8, +/// // but the subsequent `.*` does not! Disabling UTF-8 +/// // on the syntax permits this. +/// assert_eq!(b"foo\xFFarzz", &haystack[got.unwrap().range()]); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +#[derive(Clone, Debug)] +pub struct Builder { + #[cfg(feature = "dfa-build")] + dfa: dense::Builder, +} + +impl Builder { + /// Create a new regex builder with the default configuration. + pub fn new() -> Builder { + Builder { + #[cfg(feature = "dfa-build")] + dfa: dense::Builder::new(), + } + } + + /// Build a regex from the given pattern. + /// + /// If there was a problem parsing or compiling the pattern, then an error + /// is returned. + #[cfg(all(feature = "syntax", feature = "dfa-build"))] + pub fn build(&self, pattern: &str) -> Result<Regex, BuildError> { + self.build_many(&[pattern]) + } + + /// Build a regex from the given pattern using sparse DFAs. + /// + /// If there was a problem parsing or compiling the pattern, then an error + /// is returned. + #[cfg(all(feature = "syntax", feature = "dfa-build"))] + pub fn build_sparse( + &self, + pattern: &str, + ) -> Result<Regex<sparse::DFA<Vec<u8>>>, BuildError> { + self.build_many_sparse(&[pattern]) + } + + /// Build a regex from the given patterns. + #[cfg(all(feature = "syntax", feature = "dfa-build"))] + pub fn build_many<P: AsRef<str>>( + &self, + patterns: &[P], + ) -> Result<Regex, BuildError> { + let forward = self.dfa.build_many(patterns)?; + let reverse = self + .dfa + .clone() + .configure( + dense::Config::new() + .prefilter(None) + .specialize_start_states(false) + .start_kind(StartKind::Anchored) + .match_kind(MatchKind::All), + ) + .thompson(crate::nfa::thompson::Config::new().reverse(true)) + .build_many(patterns)?; + Ok(self.build_from_dfas(forward, reverse)) + } + + /// Build a sparse regex from the given patterns. + #[cfg(all(feature = "syntax", feature = "dfa-build"))] + pub fn build_many_sparse<P: AsRef<str>>( + &self, + patterns: &[P], + ) -> Result<Regex<sparse::DFA<Vec<u8>>>, BuildError> { + let re = self.build_many(patterns)?; + let forward = re.forward().to_sparse()?; + let reverse = re.reverse().to_sparse()?; + Ok(self.build_from_dfas(forward, reverse)) + } + + /// Build a regex from its component forward and reverse DFAs. + /// + /// This is useful when deserializing a regex from some arbitrary + /// memory region. This is also useful for building regexes from other + /// types of DFAs. + /// + /// If you're building the DFAs from scratch instead of building new DFAs + /// from other DFAs, then you'll need to make sure that the reverse DFA is + /// configured correctly to match the intended semantics. Namely: + /// + /// * It should be anchored. + /// * It should use [`MatchKind::All`] semantics. + /// * It should match in reverse. + /// * Otherwise, its configuration should match the forward DFA. + /// + /// If these conditions aren't satisfied, then the behavior of searches is + /// unspecified. + /// + /// Note that when using this constructor, no configuration is applied. + /// Since this routine provides the DFAs to the builder, there is no + /// opportunity to apply other configuration options. + /// + /// # Example + /// + /// This example is a bit a contrived. The usual use of these methods + /// would involve serializing `initial_re` somewhere and then deserializing + /// it later to build a regex. But in this case, we do everything in + /// memory. + /// + /// ``` + /// use regex_automata::dfa::regex::Regex; + /// + /// let initial_re = Regex::new("foo[0-9]+")?; + /// assert_eq!(true, initial_re.is_match(b"foo123")); + /// + /// let (fwd, rev) = (initial_re.forward(), initial_re.reverse()); + /// let re = Regex::builder().build_from_dfas(fwd, rev); + /// assert_eq!(true, re.is_match(b"foo123")); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// This example shows how to build a `Regex` that uses sparse DFAs instead + /// of dense DFAs without using one of the convenience `build_sparse` + /// routines: + /// + /// ``` + /// use regex_automata::dfa::regex::Regex; + /// + /// let initial_re = Regex::new("foo[0-9]+")?; + /// assert_eq!(true, initial_re.is_match(b"foo123")); + /// + /// let fwd = initial_re.forward().to_sparse()?; + /// let rev = initial_re.reverse().to_sparse()?; + /// let re = Regex::builder().build_from_dfas(fwd, rev); + /// assert_eq!(true, re.is_match(b"foo123")); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn build_from_dfas<A: Automaton>( + &self, + forward: A, + reverse: A, + ) -> Regex<A> { + Regex { forward, reverse } + } + + /// Set the syntax configuration for this builder using + /// [`syntax::Config`](crate::util::syntax::Config). + /// + /// This permits setting things like case insensitivity, Unicode and multi + /// line mode. + #[cfg(all(feature = "syntax", feature = "dfa-build"))] + pub fn syntax( + &mut self, + config: crate::util::syntax::Config, + ) -> &mut Builder { + self.dfa.syntax(config); + self + } + + /// Set the Thompson NFA configuration for this builder using + /// [`nfa::thompson::Config`](crate::nfa::thompson::Config). + /// + /// This permits setting things like whether additional time should be + /// spent shrinking the size of the NFA. + #[cfg(all(feature = "syntax", feature = "dfa-build"))] + pub fn thompson( + &mut self, + config: crate::nfa::thompson::Config, + ) -> &mut Builder { + self.dfa.thompson(config); + self + } + + /// Set the dense DFA compilation configuration for this builder using + /// [`dense::Config`](dense::Config). + /// + /// This permits setting things like whether the underlying DFAs should + /// be minimized. + #[cfg(feature = "dfa-build")] + pub fn dense(&mut self, config: dense::Config) -> &mut Builder { + self.dfa.configure(config); + self + } +} + +impl Default for Builder { + fn default() -> Builder { + Builder::new() + } +} diff --git a/third_party/rust/regex-automata/src/dfa/remapper.rs b/third_party/rust/regex-automata/src/dfa/remapper.rs new file mode 100644 index 0000000000..6e49646721 --- /dev/null +++ b/third_party/rust/regex-automata/src/dfa/remapper.rs @@ -0,0 +1,242 @@ +use alloc::vec::Vec; + +use crate::util::primitives::StateID; + +/// Remappable is a tightly coupled abstraction that facilitates remapping +/// state identifiers in DFAs. +/// +/// The main idea behind remapping state IDs is that DFAs often need to check +/// if a certain state is a "special" state of some kind (like a match state) +/// during a search. Since this is extremely perf critical code, we want this +/// check to be as fast as possible. Partitioning state IDs into, for example, +/// into "non-match" and "match" states means one can tell if a state is a +/// match state via a simple comparison of the state ID. +/// +/// The issue is that during the DFA construction process, it's not +/// particularly easy to partition the states. Instead, the simplest thing is +/// to often just do a pass over all of the states and shuffle them into their +/// desired partitionings. To do that, we need a mechanism for swapping states. +/// Hence, this abstraction. +/// +/// Normally, for such little code, I would just duplicate it. But this is a +/// key optimization and the implementation is a bit subtle. So the abstraction +/// is basically a ham-fisted attempt at DRY. The only place we use this is in +/// the dense and one-pass DFAs. +/// +/// See also src/dfa/special.rs for a more detailed explanation of how dense +/// DFAs are partitioned. +pub(super) trait Remappable: core::fmt::Debug { + /// Return the total number of states. + fn state_len(&self) -> usize; + /// Return the power-of-2 exponent that yields the stride. The pertinent + /// laws here are, where N=stride2: 2^N=stride and len(alphabet) <= stride. + fn stride2(&self) -> usize; + /// Swap the states pointed to by the given IDs. The underlying finite + /// state machine should be mutated such that all of the transitions in + /// `id1` are now in the memory region where the transitions for `id2` + /// were, and all of the transitions in `id2` are now in the memory region + /// where the transitions for `id1` were. + /// + /// Essentially, this "moves" `id1` to `id2` and `id2` to `id1`. + /// + /// It is expected that, after calling this, the underlying value will be + /// left in an inconsistent state, since any other transitions pointing to, + /// e.g., `id1` need to be updated to point to `id2`, since that's where + /// `id1` moved to. + /// + /// In order to "fix" the underlying inconsistent state, a `Remapper` + /// should be used to guarantee that `remap` is called at the appropriate + /// time. + fn swap_states(&mut self, id1: StateID, id2: StateID); + /// This must remap every single state ID in the underlying value according + /// to the function given. For example, in a DFA, this should remap every + /// transition and every starting state ID. + fn remap(&mut self, map: impl Fn(StateID) -> StateID); +} + +/// Remapper is an abstraction the manages the remapping of state IDs in a +/// finite state machine. This is useful when one wants to shuffle states into +/// different positions in the machine. +/// +/// One of the key complexities this manages is the ability to correctly move +/// one state multiple times. +/// +/// Once shuffling is complete, `remap` must be called, which will rewrite +/// all pertinent transitions to updated state IDs. Neglecting to call `remap` +/// will almost certainly result in a corrupt machine. +#[derive(Debug)] +pub(super) struct Remapper { + /// A map from the index of a state to its pre-multiplied identifier. + /// + /// When a state is swapped with another, then their corresponding + /// locations in this map are also swapped. Thus, its new position will + /// still point to its old pre-multiplied StateID. + /// + /// While there is a bit more to it, this then allows us to rewrite the + /// state IDs in a DFA's transition table in a single pass. This is done + /// by iterating over every ID in this map, then iterating over each + /// transition for the state at that ID and re-mapping the transition from + /// `old_id` to `map[dfa.to_index(old_id)]`. That is, we find the position + /// in this map where `old_id` *started*, and set it to where it ended up + /// after all swaps have been completed. + map: Vec<StateID>, + /// A mapper from state index to state ID (and back). + idxmap: IndexMapper, +} + +impl Remapper { + /// Create a new remapper from the given remappable implementation. The + /// remapper can then be used to swap states. The remappable value given + /// here must the same one given to `swap` and `remap`. + pub(super) fn new(r: &impl Remappable) -> Remapper { + let idxmap = IndexMapper { stride2: r.stride2() }; + let map = (0..r.state_len()).map(|i| idxmap.to_state_id(i)).collect(); + Remapper { map, idxmap } + } + + /// Swap two states. Once this is called, callers must follow through to + /// call `remap`, or else it's possible for the underlying remappable + /// value to be in a corrupt state. + pub(super) fn swap( + &mut self, + r: &mut impl Remappable, + id1: StateID, + id2: StateID, + ) { + if id1 == id2 { + return; + } + r.swap_states(id1, id2); + self.map.swap(self.idxmap.to_index(id1), self.idxmap.to_index(id2)); + } + + /// Complete the remapping process by rewriting all state IDs in the + /// remappable value according to the swaps performed. + pub(super) fn remap(mut self, r: &mut impl Remappable) { + // Update the map to account for states that have been swapped + // multiple times. For example, if (A, C) and (C, G) are swapped, then + // transitions previously pointing to A should now point to G. But if + // we don't update our map, they will erroneously be set to C. All we + // do is follow the swaps in our map until we see our original state + // ID. + // + // The intuition here is to think about how changes are made to the + // map: only through pairwise swaps. That means that starting at any + // given state, it is always possible to find the loop back to that + // state by following the swaps represented in the map (which might be + // 0 swaps). + // + // We are also careful to clone the map before starting in order to + // freeze it. We use the frozen map to find our loops, since we need to + // update our map as well. Without freezing it, our updates could break + // the loops referenced above and produce incorrect results. + let oldmap = self.map.clone(); + for i in 0..r.state_len() { + let cur_id = self.idxmap.to_state_id(i); + let mut new_id = oldmap[i]; + if cur_id == new_id { + continue; + } + loop { + let id = oldmap[self.idxmap.to_index(new_id)]; + if cur_id == id { + self.map[i] = new_id; + break; + } + new_id = id; + } + } + r.remap(|next| self.map[self.idxmap.to_index(next)]); + } +} + +/// A simple type for mapping between state indices and state IDs. +/// +/// The reason why this exists is because state IDs are "premultiplied." That +/// is, in order to get to the transitions for a particular state, one need +/// only use the state ID as-is, instead of having to multiple it by transition +/// table's stride. +/// +/// The downside of this is that it's inconvenient to map between state IDs +/// using a dense map, e.g., Vec<StateID>. That's because state IDs look like +/// `0`, `0+stride`, `0+2*stride`, `0+3*stride`, etc., instead of `0`, `1`, +/// `2`, `3`, etc. +/// +/// Since our state IDs are premultiplied, we can convert back-and-forth +/// between IDs and indices by simply unmultiplying the IDs and multiplying the +/// indices. +#[derive(Debug)] +struct IndexMapper { + /// The power of 2 corresponding to the stride of the corresponding + /// transition table. 'id >> stride2' de-multiplies an ID while 'index << + /// stride2' pre-multiplies an index to an ID. + stride2: usize, +} + +impl IndexMapper { + /// Convert a state ID to a state index. + fn to_index(&self, id: StateID) -> usize { + id.as_usize() >> self.stride2 + } + + /// Convert a state index to a state ID. + fn to_state_id(&self, index: usize) -> StateID { + // CORRECTNESS: If the given index is not valid, then it is not + // required for this to panic or return a valid state ID. We'll "just" + // wind up with panics or silent logic errors at some other point. + StateID::new_unchecked(index << self.stride2) + } +} + +#[cfg(feature = "dfa-build")] +mod dense { + use crate::{dfa::dense::OwnedDFA, util::primitives::StateID}; + + use super::Remappable; + + impl Remappable for OwnedDFA { + fn state_len(&self) -> usize { + OwnedDFA::state_len(self) + } + + fn stride2(&self) -> usize { + OwnedDFA::stride2(self) + } + + fn swap_states(&mut self, id1: StateID, id2: StateID) { + OwnedDFA::swap_states(self, id1, id2) + } + + fn remap(&mut self, map: impl Fn(StateID) -> StateID) { + OwnedDFA::remap(self, map) + } + } +} + +#[cfg(feature = "dfa-onepass")] +mod onepass { + use crate::{dfa::onepass::DFA, util::primitives::StateID}; + + use super::Remappable; + + impl Remappable for DFA { + fn state_len(&self) -> usize { + DFA::state_len(self) + } + + fn stride2(&self) -> usize { + // We don't do pre-multiplication for the one-pass DFA, so + // returning 0 has the effect of making state IDs and state indices + // equivalent. + 0 + } + + fn swap_states(&mut self, id1: StateID, id2: StateID) { + DFA::swap_states(self, id1, id2) + } + + fn remap(&mut self, map: impl Fn(StateID) -> StateID) { + DFA::remap(self, map) + } + } +} diff --git a/third_party/rust/regex-automata/src/dfa/search.rs b/third_party/rust/regex-automata/src/dfa/search.rs new file mode 100644 index 0000000000..8c012a5944 --- /dev/null +++ b/third_party/rust/regex-automata/src/dfa/search.rs @@ -0,0 +1,654 @@ +use crate::{ + dfa::{ + accel, + automaton::{Automaton, OverlappingState}, + }, + util::{ + prefilter::Prefilter, + primitives::StateID, + search::{Anchored, HalfMatch, Input, Span}, + }, + MatchError, +}; + +#[inline(never)] +pub fn find_fwd<A: Automaton + ?Sized>( + dfa: &A, + input: &Input<'_>, +) -> Result<Option<HalfMatch>, MatchError> { + if input.is_done() { + return Ok(None); + } + let pre = if input.get_anchored().is_anchored() { + None + } else { + dfa.get_prefilter() + }; + // Searching with a pattern ID is always anchored, so we should never use + // a prefilter. + if pre.is_some() { + if input.get_earliest() { + find_fwd_imp(dfa, input, pre, true) + } else { + find_fwd_imp(dfa, input, pre, false) + } + } else { + if input.get_earliest() { + find_fwd_imp(dfa, input, None, true) + } else { + find_fwd_imp(dfa, input, None, false) + } + } +} + +#[cfg_attr(feature = "perf-inline", inline(always))] +fn find_fwd_imp<A: Automaton + ?Sized>( + dfa: &A, + input: &Input<'_>, + pre: Option<&'_ Prefilter>, + earliest: bool, +) -> Result<Option<HalfMatch>, MatchError> { + // See 'prefilter_restart' docs for explanation. + let universal_start = dfa.universal_start_state(Anchored::No).is_some(); + let mut mat = None; + let mut sid = init_fwd(dfa, input)?; + let mut at = input.start(); + // This could just be a closure, but then I think it would be unsound + // because it would need to be safe to invoke. This way, the lack of safety + // is clearer in the code below. + macro_rules! next_unchecked { + ($sid:expr, $at:expr) => {{ + let byte = *input.haystack().get_unchecked($at); + dfa.next_state_unchecked($sid, byte) + }}; + } + + if let Some(ref pre) = pre { + let span = Span::from(at..input.end()); + // If a prefilter doesn't report false positives, then we don't need to + // touch the DFA at all. However, since all matches include the pattern + // ID, and the prefilter infrastructure doesn't report pattern IDs, we + // limit this optimization to cases where there is exactly one pattern. + // In that case, any match must be the 0th pattern. + match pre.find(input.haystack(), span) { + None => return Ok(mat), + Some(ref span) => { + at = span.start; + if !universal_start { + sid = prefilter_restart(dfa, &input, at)?; + } + } + } + } + while at < input.end() { + // SAFETY: There are two safety invariants we need to uphold here in + // the loops below: that 'sid' and 'prev_sid' are valid state IDs + // for this DFA, and that 'at' is a valid index into 'haystack'. + // For the former, we rely on the invariant that next_state* and + // start_state_forward always returns a valid state ID (given a valid + // state ID in the former case). For the latter safety invariant, we + // always guard unchecked access with a check that 'at' is less than + // 'end', where 'end <= haystack.len()'. In the unrolled loop below, we + // ensure that 'at' is always in bounds. + // + // PERF: See a similar comment in src/hybrid/search.rs that justifies + // this extra work to make the search loop fast. The same reasoning and + // benchmarks apply here. + let mut prev_sid; + while at < input.end() { + prev_sid = unsafe { next_unchecked!(sid, at) }; + if dfa.is_special_state(prev_sid) || at + 3 >= input.end() { + core::mem::swap(&mut prev_sid, &mut sid); + break; + } + at += 1; + + sid = unsafe { next_unchecked!(prev_sid, at) }; + if dfa.is_special_state(sid) { + break; + } + at += 1; + + prev_sid = unsafe { next_unchecked!(sid, at) }; + if dfa.is_special_state(prev_sid) { + core::mem::swap(&mut prev_sid, &mut sid); + break; + } + at += 1; + + sid = unsafe { next_unchecked!(prev_sid, at) }; + if dfa.is_special_state(sid) { + break; + } + at += 1; + } + if dfa.is_special_state(sid) { + if dfa.is_start_state(sid) { + if let Some(ref pre) = pre { + let span = Span::from(at..input.end()); + match pre.find(input.haystack(), span) { + None => return Ok(mat), + Some(ref span) => { + // We want to skip any update to 'at' below + // at the end of this iteration and just + // jump immediately back to the next state + // transition at the leading position of the + // candidate match. + // + // ... but only if we actually made progress + // with our prefilter, otherwise if the start + // state has a self-loop, we can get stuck. + if span.start > at { + at = span.start; + if !universal_start { + sid = prefilter_restart(dfa, &input, at)?; + } + continue; + } + } + } + } else if dfa.is_accel_state(sid) { + let needles = dfa.accelerator(sid); + at = accel::find_fwd(needles, input.haystack(), at + 1) + .unwrap_or(input.end()); + continue; + } + } else if dfa.is_match_state(sid) { + let pattern = dfa.match_pattern(sid, 0); + mat = Some(HalfMatch::new(pattern, at)); + if earliest { + return Ok(mat); + } + if dfa.is_accel_state(sid) { + let needles = dfa.accelerator(sid); + at = accel::find_fwd(needles, input.haystack(), at + 1) + .unwrap_or(input.end()); + continue; + } + } else if dfa.is_accel_state(sid) { + let needs = dfa.accelerator(sid); + at = accel::find_fwd(needs, input.haystack(), at + 1) + .unwrap_or(input.end()); + continue; + } else if dfa.is_dead_state(sid) { + return Ok(mat); + } else { + // It's important that this is a debug_assert, since this can + // actually be tripped even if DFA::from_bytes succeeds and + // returns a supposedly valid DFA. + debug_assert!(dfa.is_quit_state(sid)); + return Err(MatchError::quit(input.haystack()[at], at)); + } + } + at += 1; + } + eoi_fwd(dfa, input, &mut sid, &mut mat)?; + Ok(mat) +} + +#[inline(never)] +pub fn find_rev<A: Automaton + ?Sized>( + dfa: &A, + input: &Input<'_>, +) -> Result<Option<HalfMatch>, MatchError> { + if input.is_done() { + return Ok(None); + } + if input.get_earliest() { + find_rev_imp(dfa, input, true) + } else { + find_rev_imp(dfa, input, false) + } +} + +#[cfg_attr(feature = "perf-inline", inline(always))] +fn find_rev_imp<A: Automaton + ?Sized>( + dfa: &A, + input: &Input<'_>, + earliest: bool, +) -> Result<Option<HalfMatch>, MatchError> { + let mut mat = None; + let mut sid = init_rev(dfa, input)?; + // In reverse search, the loop below can't handle the case of searching an + // empty slice. Ideally we could write something congruent to the forward + // search, i.e., 'while at >= start', but 'start' might be 0. Since we use + // an unsigned offset, 'at >= 0' is trivially always true. We could avoid + // this extra case handling by using a signed offset, but Rust makes it + // annoying to do. So... We just handle the empty case separately. + if input.start() == input.end() { + eoi_rev(dfa, input, &mut sid, &mut mat)?; + return Ok(mat); + } + + let mut at = input.end() - 1; + macro_rules! next_unchecked { + ($sid:expr, $at:expr) => {{ + let byte = *input.haystack().get_unchecked($at); + dfa.next_state_unchecked($sid, byte) + }}; + } + loop { + // SAFETY: See comments in 'find_fwd' for a safety argument. + let mut prev_sid; + while at >= input.start() { + prev_sid = unsafe { next_unchecked!(sid, at) }; + if dfa.is_special_state(prev_sid) + || at <= input.start().saturating_add(3) + { + core::mem::swap(&mut prev_sid, &mut sid); + break; + } + at -= 1; + + sid = unsafe { next_unchecked!(prev_sid, at) }; + if dfa.is_special_state(sid) { + break; + } + at -= 1; + + prev_sid = unsafe { next_unchecked!(sid, at) }; + if dfa.is_special_state(prev_sid) { + core::mem::swap(&mut prev_sid, &mut sid); + break; + } + at -= 1; + + sid = unsafe { next_unchecked!(prev_sid, at) }; + if dfa.is_special_state(sid) { + break; + } + at -= 1; + } + if dfa.is_special_state(sid) { + if dfa.is_start_state(sid) { + if dfa.is_accel_state(sid) { + let needles = dfa.accelerator(sid); + at = accel::find_rev(needles, input.haystack(), at) + .map(|i| i + 1) + .unwrap_or(input.start()); + } + } else if dfa.is_match_state(sid) { + let pattern = dfa.match_pattern(sid, 0); + // Since reverse searches report the beginning of a match + // and the beginning is inclusive (not exclusive like the + // end of a match), we add 1 to make it inclusive. + mat = Some(HalfMatch::new(pattern, at + 1)); + if earliest { + return Ok(mat); + } + if dfa.is_accel_state(sid) { + let needles = dfa.accelerator(sid); + at = accel::find_rev(needles, input.haystack(), at) + .map(|i| i + 1) + .unwrap_or(input.start()); + } + } else if dfa.is_accel_state(sid) { + let needles = dfa.accelerator(sid); + // If the accelerator returns nothing, why don't we quit the + // search? Well, if the accelerator doesn't find anything, that + // doesn't mean we don't have a match. It just means that we + // can't leave the current state given one of the 255 possible + // byte values. However, there might be an EOI transition. So + // we set 'at' to the end of the haystack, which will cause + // this loop to stop and fall down into the EOI transition. + at = accel::find_rev(needles, input.haystack(), at) + .map(|i| i + 1) + .unwrap_or(input.start()); + } else if dfa.is_dead_state(sid) { + return Ok(mat); + } else { + debug_assert!(dfa.is_quit_state(sid)); + return Err(MatchError::quit(input.haystack()[at], at)); + } + } + if at == input.start() { + break; + } + at -= 1; + } + eoi_rev(dfa, input, &mut sid, &mut mat)?; + Ok(mat) +} + +#[inline(never)] +pub fn find_overlapping_fwd<A: Automaton + ?Sized>( + dfa: &A, + input: &Input<'_>, + state: &mut OverlappingState, +) -> Result<(), MatchError> { + state.mat = None; + if input.is_done() { + return Ok(()); + } + let pre = if input.get_anchored().is_anchored() { + None + } else { + dfa.get_prefilter() + }; + if pre.is_some() { + find_overlapping_fwd_imp(dfa, input, pre, state) + } else { + find_overlapping_fwd_imp(dfa, input, None, state) + } +} + +#[cfg_attr(feature = "perf-inline", inline(always))] +fn find_overlapping_fwd_imp<A: Automaton + ?Sized>( + dfa: &A, + input: &Input<'_>, + pre: Option<&'_ Prefilter>, + state: &mut OverlappingState, +) -> Result<(), MatchError> { + // See 'prefilter_restart' docs for explanation. + let universal_start = dfa.universal_start_state(Anchored::No).is_some(); + let mut sid = match state.id { + None => { + state.at = input.start(); + init_fwd(dfa, input)? + } + Some(sid) => { + if let Some(match_index) = state.next_match_index { + let match_len = dfa.match_len(sid); + if match_index < match_len { + state.next_match_index = Some(match_index + 1); + let pattern = dfa.match_pattern(sid, match_index); + state.mat = Some(HalfMatch::new(pattern, state.at)); + return Ok(()); + } + } + // Once we've reported all matches at a given position, we need to + // advance the search to the next position. + state.at += 1; + if state.at > input.end() { + return Ok(()); + } + sid + } + }; + + // NOTE: We don't optimize the crap out of this routine primarily because + // it seems like most find_overlapping searches will have higher match + // counts, and thus, throughput is perhaps not as important. But if you + // have a use case for something faster, feel free to file an issue. + while state.at < input.end() { + sid = dfa.next_state(sid, input.haystack()[state.at]); + if dfa.is_special_state(sid) { + state.id = Some(sid); + if dfa.is_start_state(sid) { + if let Some(ref pre) = pre { + let span = Span::from(state.at..input.end()); + match pre.find(input.haystack(), span) { + None => return Ok(()), + Some(ref span) => { + if span.start > state.at { + state.at = span.start; + if !universal_start { + sid = prefilter_restart( + dfa, &input, state.at, + )?; + } + continue; + } + } + } + } else if dfa.is_accel_state(sid) { + let needles = dfa.accelerator(sid); + state.at = accel::find_fwd( + needles, + input.haystack(), + state.at + 1, + ) + .unwrap_or(input.end()); + continue; + } + } else if dfa.is_match_state(sid) { + state.next_match_index = Some(1); + let pattern = dfa.match_pattern(sid, 0); + state.mat = Some(HalfMatch::new(pattern, state.at)); + return Ok(()); + } else if dfa.is_accel_state(sid) { + let needs = dfa.accelerator(sid); + // If the accelerator returns nothing, why don't we quit the + // search? Well, if the accelerator doesn't find anything, that + // doesn't mean we don't have a match. It just means that we + // can't leave the current state given one of the 255 possible + // byte values. However, there might be an EOI transition. So + // we set 'at' to the end of the haystack, which will cause + // this loop to stop and fall down into the EOI transition. + state.at = + accel::find_fwd(needs, input.haystack(), state.at + 1) + .unwrap_or(input.end()); + continue; + } else if dfa.is_dead_state(sid) { + return Ok(()); + } else { + debug_assert!(dfa.is_quit_state(sid)); + return Err(MatchError::quit( + input.haystack()[state.at], + state.at, + )); + } + } + state.at += 1; + } + + let result = eoi_fwd(dfa, input, &mut sid, &mut state.mat); + state.id = Some(sid); + if state.mat.is_some() { + // '1' is always correct here since if we get to this point, this + // always corresponds to the first (index '0') match discovered at + // this position. So the next match to report at this position (if + // it exists) is at index '1'. + state.next_match_index = Some(1); + } + result +} + +#[inline(never)] +pub(crate) fn find_overlapping_rev<A: Automaton + ?Sized>( + dfa: &A, + input: &Input<'_>, + state: &mut OverlappingState, +) -> Result<(), MatchError> { + state.mat = None; + if input.is_done() { + return Ok(()); + } + let mut sid = match state.id { + None => { + let sid = init_rev(dfa, input)?; + state.id = Some(sid); + if input.start() == input.end() { + state.rev_eoi = true; + } else { + state.at = input.end() - 1; + } + sid + } + Some(sid) => { + if let Some(match_index) = state.next_match_index { + let match_len = dfa.match_len(sid); + if match_index < match_len { + state.next_match_index = Some(match_index + 1); + let pattern = dfa.match_pattern(sid, match_index); + state.mat = Some(HalfMatch::new(pattern, state.at)); + return Ok(()); + } + } + // Once we've reported all matches at a given position, we need + // to advance the search to the next position. However, if we've + // already followed the EOI transition, then we know we're done + // with the search and there cannot be any more matches to report. + if state.rev_eoi { + return Ok(()); + } else if state.at == input.start() { + // At this point, we should follow the EOI transition. This + // will cause us the skip the main loop below and fall through + // to the final 'eoi_rev' transition. + state.rev_eoi = true; + } else { + // We haven't hit the end of the search yet, so move on. + state.at -= 1; + } + sid + } + }; + while !state.rev_eoi { + sid = dfa.next_state(sid, input.haystack()[state.at]); + if dfa.is_special_state(sid) { + state.id = Some(sid); + if dfa.is_start_state(sid) { + if dfa.is_accel_state(sid) { + let needles = dfa.accelerator(sid); + state.at = + accel::find_rev(needles, input.haystack(), state.at) + .map(|i| i + 1) + .unwrap_or(input.start()); + } + } else if dfa.is_match_state(sid) { + state.next_match_index = Some(1); + let pattern = dfa.match_pattern(sid, 0); + state.mat = Some(HalfMatch::new(pattern, state.at + 1)); + return Ok(()); + } else if dfa.is_accel_state(sid) { + let needles = dfa.accelerator(sid); + // If the accelerator returns nothing, why don't we quit the + // search? Well, if the accelerator doesn't find anything, that + // doesn't mean we don't have a match. It just means that we + // can't leave the current state given one of the 255 possible + // byte values. However, there might be an EOI transition. So + // we set 'at' to the end of the haystack, which will cause + // this loop to stop and fall down into the EOI transition. + state.at = + accel::find_rev(needles, input.haystack(), state.at) + .map(|i| i + 1) + .unwrap_or(input.start()); + } else if dfa.is_dead_state(sid) { + return Ok(()); + } else { + debug_assert!(dfa.is_quit_state(sid)); + return Err(MatchError::quit( + input.haystack()[state.at], + state.at, + )); + } + } + if state.at == input.start() { + break; + } + state.at -= 1; + } + + let result = eoi_rev(dfa, input, &mut sid, &mut state.mat); + state.rev_eoi = true; + state.id = Some(sid); + if state.mat.is_some() { + // '1' is always correct here since if we get to this point, this + // always corresponds to the first (index '0') match discovered at + // this position. So the next match to report at this position (if + // it exists) is at index '1'. + state.next_match_index = Some(1); + } + result +} + +#[cfg_attr(feature = "perf-inline", inline(always))] +fn init_fwd<A: Automaton + ?Sized>( + dfa: &A, + input: &Input<'_>, +) -> Result<StateID, MatchError> { + let sid = dfa.start_state_forward(input)?; + // Start states can never be match states, since all matches are delayed + // by 1 byte. + debug_assert!(!dfa.is_match_state(sid)); + Ok(sid) +} + +#[cfg_attr(feature = "perf-inline", inline(always))] +fn init_rev<A: Automaton + ?Sized>( + dfa: &A, + input: &Input<'_>, +) -> Result<StateID, MatchError> { + let sid = dfa.start_state_reverse(input)?; + // Start states can never be match states, since all matches are delayed + // by 1 byte. + debug_assert!(!dfa.is_match_state(sid)); + Ok(sid) +} + +#[cfg_attr(feature = "perf-inline", inline(always))] +fn eoi_fwd<A: Automaton + ?Sized>( + dfa: &A, + input: &Input<'_>, + sid: &mut StateID, + mat: &mut Option<HalfMatch>, +) -> Result<(), MatchError> { + let sp = input.get_span(); + match input.haystack().get(sp.end) { + Some(&b) => { + *sid = dfa.next_state(*sid, b); + if dfa.is_match_state(*sid) { + let pattern = dfa.match_pattern(*sid, 0); + *mat = Some(HalfMatch::new(pattern, sp.end)); + } else if dfa.is_quit_state(*sid) { + return Err(MatchError::quit(b, sp.end)); + } + } + None => { + *sid = dfa.next_eoi_state(*sid); + if dfa.is_match_state(*sid) { + let pattern = dfa.match_pattern(*sid, 0); + *mat = Some(HalfMatch::new(pattern, input.haystack().len())); + } + // N.B. We don't have to check 'is_quit' here because the EOI + // transition can never lead to a quit state. + debug_assert!(!dfa.is_quit_state(*sid)); + } + } + Ok(()) +} + +#[cfg_attr(feature = "perf-inline", inline(always))] +fn eoi_rev<A: Automaton + ?Sized>( + dfa: &A, + input: &Input<'_>, + sid: &mut StateID, + mat: &mut Option<HalfMatch>, +) -> Result<(), MatchError> { + let sp = input.get_span(); + if sp.start > 0 { + let byte = input.haystack()[sp.start - 1]; + *sid = dfa.next_state(*sid, byte); + if dfa.is_match_state(*sid) { + let pattern = dfa.match_pattern(*sid, 0); + *mat = Some(HalfMatch::new(pattern, sp.start)); + } else if dfa.is_quit_state(*sid) { + return Err(MatchError::quit(byte, sp.start - 1)); + } + } else { + *sid = dfa.next_eoi_state(*sid); + if dfa.is_match_state(*sid) { + let pattern = dfa.match_pattern(*sid, 0); + *mat = Some(HalfMatch::new(pattern, 0)); + } + // N.B. We don't have to check 'is_quit' here because the EOI + // transition can never lead to a quit state. + debug_assert!(!dfa.is_quit_state(*sid)); + } + Ok(()) +} + +/// Re-compute the starting state that a DFA should be in after finding a +/// prefilter candidate match at the position `at`. +/// +/// The function with the same name has a bit more docs in hybrid/search.rs. +#[cfg_attr(feature = "perf-inline", inline(always))] +fn prefilter_restart<A: Automaton + ?Sized>( + dfa: &A, + input: &Input<'_>, + at: usize, +) -> Result<StateID, MatchError> { + let mut input = input.clone(); + input.set_start(at); + init_fwd(dfa, &input) +} diff --git a/third_party/rust/regex-automata/src/dfa/sparse.rs b/third_party/rust/regex-automata/src/dfa/sparse.rs new file mode 100644 index 0000000000..5d8ec23408 --- /dev/null +++ b/third_party/rust/regex-automata/src/dfa/sparse.rs @@ -0,0 +1,2656 @@ +/*! +Types and routines specific to sparse DFAs. + +This module is the home of [`sparse::DFA`](DFA). + +Unlike the [`dense`](super::dense) module, this module does not contain a +builder or configuration specific for sparse DFAs. Instead, the intended +way to build a sparse DFA is either by using a default configuration with +its constructor [`sparse::DFA::new`](DFA::new), or by first configuring the +construction of a dense DFA with [`dense::Builder`](super::dense::Builder) +and then calling [`dense::DFA::to_sparse`](super::dense::DFA::to_sparse). For +example, this configures a sparse DFA to do an overlapping search: + +``` +use regex_automata::{ + dfa::{Automaton, OverlappingState, dense}, + HalfMatch, Input, MatchKind, +}; + +let dense_re = dense::Builder::new() + .configure(dense::Config::new().match_kind(MatchKind::All)) + .build(r"Samwise|Sam")?; +let sparse_re = dense_re.to_sparse()?; + +// Setup our haystack and initial start state. +let input = Input::new("Samwise"); +let mut state = OverlappingState::start(); + +// First, 'Sam' will match. +sparse_re.try_search_overlapping_fwd(&input, &mut state)?; +assert_eq!(Some(HalfMatch::must(0, 3)), state.get_match()); + +// And now 'Samwise' will match. +sparse_re.try_search_overlapping_fwd(&input, &mut state)?; +assert_eq!(Some(HalfMatch::must(0, 7)), state.get_match()); +# Ok::<(), Box<dyn std::error::Error>>(()) +``` +*/ + +#[cfg(feature = "dfa-build")] +use core::iter; +use core::{ + convert::{TryFrom, TryInto}, + fmt, + mem::size_of, +}; + +#[cfg(feature = "dfa-build")] +use alloc::{vec, vec::Vec}; + +#[cfg(feature = "dfa-build")] +use crate::dfa::dense::{self, BuildError}; +use crate::{ + dfa::{ + automaton::{fmt_state_indicator, Automaton}, + dense::Flags, + special::Special, + StartKind, DEAD, + }, + util::{ + alphabet::{ByteClasses, ByteSet}, + escape::DebugByte, + int::{Pointer, Usize, U16, U32}, + prefilter::Prefilter, + primitives::{PatternID, StateID}, + search::{Anchored, Input, MatchError}, + start::{Start, StartByteMap}, + wire::{self, DeserializeError, Endian, SerializeError}, + }, +}; + +const LABEL: &str = "rust-regex-automata-dfa-sparse"; +const VERSION: u32 = 2; + +/// A sparse deterministic finite automaton (DFA) with variable sized states. +/// +/// In contrast to a [dense::DFA](crate::dfa::dense::DFA), a sparse DFA uses +/// a more space efficient representation for its transitions. Consequently, +/// sparse DFAs may use much less memory than dense DFAs, but this comes at a +/// price. In particular, reading the more space efficient transitions takes +/// more work, and consequently, searching using a sparse DFA is typically +/// slower than a dense DFA. +/// +/// A sparse DFA can be built using the default configuration via the +/// [`DFA::new`] constructor. Otherwise, one can configure various aspects +/// of a dense DFA via [`dense::Builder`](crate::dfa::dense::Builder), +/// and then convert a dense DFA to a sparse DFA using +/// [`dense::DFA::to_sparse`](crate::dfa::dense::DFA::to_sparse). +/// +/// In general, a sparse DFA supports all the same search operations as a dense +/// DFA. +/// +/// Making the choice between a dense and sparse DFA depends on your specific +/// work load. If you can sacrifice a bit of search time performance, then a +/// sparse DFA might be the best choice. In particular, while sparse DFAs are +/// probably always slower than dense DFAs, you may find that they are easily +/// fast enough for your purposes! +/// +/// # Type parameters +/// +/// A `DFA` has one type parameter, `T`, which is used to represent the parts +/// of a sparse DFA. `T` is typically a `Vec<u8>` or a `&[u8]`. +/// +/// # The `Automaton` trait +/// +/// This type implements the [`Automaton`] trait, which means it can be used +/// for searching. For example: +/// +/// ``` +/// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input}; +/// +/// let dfa = DFA::new("foo[0-9]+")?; +/// let expected = Some(HalfMatch::must(0, 8)); +/// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?); +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +#[derive(Clone)] +pub struct DFA<T> { + // When compared to a dense DFA, a sparse DFA *looks* a lot simpler + // representation-wise. In reality, it is perhaps more complicated. Namely, + // in a dense DFA, all information needs to be very cheaply accessible + // using only state IDs. In a sparse DFA however, each state uses a + // variable amount of space because each state encodes more information + // than just its transitions. Each state also includes an accelerator if + // one exists, along with the matching pattern IDs if the state is a match + // state. + // + // That is, a lot of the complexity is pushed down into how each state + // itself is represented. + tt: Transitions<T>, + st: StartTable<T>, + special: Special, + pre: Option<Prefilter>, + quitset: ByteSet, + flags: Flags, +} + +#[cfg(feature = "dfa-build")] +impl DFA<Vec<u8>> { + /// Parse the given regular expression using a default configuration and + /// return the corresponding sparse DFA. + /// + /// If you want a non-default configuration, then use + /// the [`dense::Builder`](crate::dfa::dense::Builder) + /// to set your own configuration, and then call + /// [`dense::DFA::to_sparse`](crate::dfa::dense::DFA::to_sparse) to create + /// a sparse DFA. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{dfa::{Automaton, sparse}, HalfMatch, Input}; + /// + /// let dfa = sparse::DFA::new("foo[0-9]+bar")?; + /// + /// let expected = Some(HalfMatch::must(0, 11)); + /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345bar"))?); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[cfg(feature = "syntax")] + pub fn new(pattern: &str) -> Result<DFA<Vec<u8>>, BuildError> { + dense::Builder::new() + .build(pattern) + .and_then(|dense| dense.to_sparse()) + } + + /// Parse the given regular expressions using a default configuration and + /// return the corresponding multi-DFA. + /// + /// If you want a non-default configuration, then use + /// the [`dense::Builder`](crate::dfa::dense::Builder) + /// to set your own configuration, and then call + /// [`dense::DFA::to_sparse`](crate::dfa::dense::DFA::to_sparse) to create + /// a sparse DFA. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{dfa::{Automaton, sparse}, HalfMatch, Input}; + /// + /// let dfa = sparse::DFA::new_many(&["[0-9]+", "[a-z]+"])?; + /// let expected = Some(HalfMatch::must(1, 3)); + /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345bar"))?); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[cfg(feature = "syntax")] + pub fn new_many<P: AsRef<str>>( + patterns: &[P], + ) -> Result<DFA<Vec<u8>>, BuildError> { + dense::Builder::new() + .build_many(patterns) + .and_then(|dense| dense.to_sparse()) + } +} + +#[cfg(feature = "dfa-build")] +impl DFA<Vec<u8>> { + /// Create a new DFA that matches every input. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{ + /// dfa::{Automaton, sparse}, + /// HalfMatch, Input, + /// }; + /// + /// let dfa = sparse::DFA::always_match()?; + /// + /// let expected = Some(HalfMatch::must(0, 0)); + /// assert_eq!(expected, dfa.try_search_fwd(&Input::new(""))?); + /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo"))?); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn always_match() -> Result<DFA<Vec<u8>>, BuildError> { + dense::DFA::always_match()?.to_sparse() + } + + /// Create a new sparse DFA that never matches any input. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{dfa::{Automaton, sparse}, Input}; + /// + /// let dfa = sparse::DFA::never_match()?; + /// assert_eq!(None, dfa.try_search_fwd(&Input::new(""))?); + /// assert_eq!(None, dfa.try_search_fwd(&Input::new("foo"))?); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn never_match() -> Result<DFA<Vec<u8>>, BuildError> { + dense::DFA::never_match()?.to_sparse() + } + + /// The implementation for constructing a sparse DFA from a dense DFA. + pub(crate) fn from_dense<T: AsRef<[u32]>>( + dfa: &dense::DFA<T>, + ) -> Result<DFA<Vec<u8>>, BuildError> { + // In order to build the transition table, we need to be able to write + // state identifiers for each of the "next" transitions in each state. + // Our state identifiers correspond to the byte offset in the + // transition table at which the state is encoded. Therefore, we do not + // actually know what the state identifiers are until we've allocated + // exactly as much space as we need for each state. Thus, construction + // of the transition table happens in two passes. + // + // In the first pass, we fill out the shell of each state, which + // includes the transition length, the input byte ranges and + // zero-filled space for the transitions and accelerators, if present. + // In this first pass, we also build up a map from the state identifier + // index of the dense DFA to the state identifier in this sparse DFA. + // + // In the second pass, we fill in the transitions based on the map + // built in the first pass. + + // The capacity given here reflects a minimum. (Well, the true minimum + // is likely even bigger, but hopefully this saves a few reallocs.) + let mut sparse = Vec::with_capacity(StateID::SIZE * dfa.state_len()); + // This maps state indices from the dense DFA to StateIDs in the sparse + // DFA. We build out this map on the first pass, and then use it in the + // second pass to back-fill our transitions. + let mut remap: Vec<StateID> = vec![DEAD; dfa.state_len()]; + for state in dfa.states() { + let pos = sparse.len(); + + remap[dfa.to_index(state.id())] = StateID::new(pos) + .map_err(|_| BuildError::too_many_states())?; + // zero-filled space for the transition length + sparse.push(0); + sparse.push(0); + + let mut transition_len = 0; + for (unit1, unit2, _) in state.sparse_transitions() { + match (unit1.as_u8(), unit2.as_u8()) { + (Some(b1), Some(b2)) => { + transition_len += 1; + sparse.push(b1); + sparse.push(b2); + } + (None, None) => {} + (Some(_), None) | (None, Some(_)) => { + // can never occur because sparse_transitions never + // groups EOI with any other transition. + unreachable!() + } + } + } + // Add dummy EOI transition. This is never actually read while + // searching, but having space equivalent to the total number + // of transitions is convenient. Otherwise, we'd need to track + // a different number of transitions for the byte ranges as for + // the 'next' states. + // + // N.B. The loop above is not guaranteed to yield the EOI + // transition, since it may point to a DEAD state. By putting + // it here, we always write the EOI transition, and thus + // guarantee that our transition length is >0. Why do we always + // need the EOI transition? Because in order to implement + // Automaton::next_eoi_state, this lets us just ask for the last + // transition. There are probably other/better ways to do this. + transition_len += 1; + sparse.push(0); + sparse.push(0); + + // Check some assumptions about transition length. + assert_ne!( + transition_len, 0, + "transition length should be non-zero", + ); + assert!( + transition_len <= 257, + "expected transition length {} to be <= 257", + transition_len, + ); + + // Fill in the transition length. + // Since transition length is always <= 257, we use the most + // significant bit to indicate whether this is a match state or + // not. + let ntrans = if dfa.is_match_state(state.id()) { + transition_len | (1 << 15) + } else { + transition_len + }; + wire::NE::write_u16(ntrans, &mut sparse[pos..]); + + // zero-fill the actual transitions. + // Unwraps are OK since transition_length <= 257 and our minimum + // support usize size is 16-bits. + let zeros = usize::try_from(transition_len) + .unwrap() + .checked_mul(StateID::SIZE) + .unwrap(); + sparse.extend(iter::repeat(0).take(zeros)); + + // If this is a match state, write the pattern IDs matched by this + // state. + if dfa.is_match_state(state.id()) { + let plen = dfa.match_pattern_len(state.id()); + // Write the actual pattern IDs with a u32 length prefix. + // First, zero-fill space. + let mut pos = sparse.len(); + // Unwraps are OK since it's guaranteed that plen <= + // PatternID::LIMIT, which is in turn guaranteed to fit into a + // u32. + let zeros = size_of::<u32>() + .checked_mul(plen) + .unwrap() + .checked_add(size_of::<u32>()) + .unwrap(); + sparse.extend(iter::repeat(0).take(zeros)); + + // Now write the length prefix. + wire::NE::write_u32( + // Will never fail since u32::MAX is invalid pattern ID. + // Thus, the number of pattern IDs is representable by a + // u32. + plen.try_into().expect("pattern ID length fits in u32"), + &mut sparse[pos..], + ); + pos += size_of::<u32>(); + + // Now write the pattern IDs. + for &pid in dfa.pattern_id_slice(state.id()) { + pos += wire::write_pattern_id::<wire::NE>( + pid, + &mut sparse[pos..], + ); + } + } + + // And now add the accelerator, if one exists. An accelerator is + // at most 4 bytes and at least 1 byte. The first byte is the + // length, N. N bytes follow the length. The set of bytes that + // follow correspond (exhaustively) to the bytes that must be seen + // to leave this state. + let accel = dfa.accelerator(state.id()); + sparse.push(accel.len().try_into().unwrap()); + sparse.extend_from_slice(accel); + } + + let mut new = DFA { + tt: Transitions { + sparse, + classes: dfa.byte_classes().clone(), + state_len: dfa.state_len(), + pattern_len: dfa.pattern_len(), + }, + st: StartTable::from_dense_dfa(dfa, &remap)?, + special: dfa.special().remap(|id| remap[dfa.to_index(id)]), + pre: dfa.get_prefilter().map(|p| p.clone()), + quitset: dfa.quitset().clone(), + flags: dfa.flags().clone(), + }; + // And here's our second pass. Iterate over all of the dense states + // again, and update the transitions in each of the states in the + // sparse DFA. + for old_state in dfa.states() { + let new_id = remap[dfa.to_index(old_state.id())]; + let mut new_state = new.tt.state_mut(new_id); + let sparse = old_state.sparse_transitions(); + for (i, (_, _, next)) in sparse.enumerate() { + let next = remap[dfa.to_index(next)]; + new_state.set_next_at(i, next); + } + } + debug!( + "created sparse DFA, memory usage: {} (dense memory usage: {})", + new.memory_usage(), + dfa.memory_usage(), + ); + Ok(new) + } +} + +impl<T: AsRef<[u8]>> DFA<T> { + /// Cheaply return a borrowed version of this sparse DFA. Specifically, the + /// DFA returned always uses `&[u8]` for its transitions. + pub fn as_ref<'a>(&'a self) -> DFA<&'a [u8]> { + DFA { + tt: self.tt.as_ref(), + st: self.st.as_ref(), + special: self.special, + pre: self.pre.clone(), + quitset: self.quitset, + flags: self.flags, + } + } + + /// Return an owned version of this sparse DFA. Specifically, the DFA + /// returned always uses `Vec<u8>` for its transitions. + /// + /// Effectively, this returns a sparse DFA whose transitions live on the + /// heap. + #[cfg(feature = "alloc")] + pub fn to_owned(&self) -> DFA<alloc::vec::Vec<u8>> { + DFA { + tt: self.tt.to_owned(), + st: self.st.to_owned(), + special: self.special, + pre: self.pre.clone(), + quitset: self.quitset, + flags: self.flags, + } + } + + /// Returns the starting state configuration for this DFA. + /// + /// The default is [`StartKind::Both`], which means the DFA supports both + /// unanchored and anchored searches. However, this can generally lead to + /// bigger DFAs. Therefore, a DFA might be compiled with support for just + /// unanchored or anchored searches. In that case, running a search with + /// an unsupported configuration will panic. + pub fn start_kind(&self) -> StartKind { + self.st.kind + } + + /// Returns true only if this DFA has starting states for each pattern. + /// + /// When a DFA has starting states for each pattern, then a search with the + /// DFA can be configured to only look for anchored matches of a specific + /// pattern. Specifically, APIs like [`Automaton::try_search_fwd`] can + /// accept a [`Anchored::Pattern`] if and only if this method returns true. + /// Otherwise, an error will be returned. + /// + /// Note that if the DFA is empty, this always returns false. + pub fn starts_for_each_pattern(&self) -> bool { + self.st.pattern_len.is_some() + } + + /// Returns the equivalence classes that make up the alphabet for this DFA. + /// + /// Unless [`dense::Config::byte_classes`] was disabled, it is possible + /// that multiple distinct bytes are grouped into the same equivalence + /// class if it is impossible for them to discriminate between a match and + /// a non-match. This has the effect of reducing the overall alphabet size + /// and in turn potentially substantially reducing the size of the DFA's + /// transition table. + /// + /// The downside of using equivalence classes like this is that every state + /// transition will automatically use this map to convert an arbitrary + /// byte to its corresponding equivalence class. In practice this has a + /// negligible impact on performance. + pub fn byte_classes(&self) -> &ByteClasses { + &self.tt.classes + } + + /// Returns the memory usage, in bytes, of this DFA. + /// + /// The memory usage is computed based on the number of bytes used to + /// represent this DFA. + /// + /// This does **not** include the stack size used up by this DFA. To + /// compute that, use `std::mem::size_of::<sparse::DFA>()`. + pub fn memory_usage(&self) -> usize { + self.tt.memory_usage() + self.st.memory_usage() + } +} + +/// Routines for converting a sparse DFA to other representations, such as raw +/// bytes suitable for persistent storage. +impl<T: AsRef<[u8]>> DFA<T> { + /// Serialize this DFA as raw bytes to a `Vec<u8>` in little endian + /// format. + /// + /// The written bytes are guaranteed to be deserialized correctly and + /// without errors in a semver compatible release of this crate by a + /// `DFA`'s deserialization APIs (assuming all other criteria for the + /// deserialization APIs has been satisfied): + /// + /// * [`DFA::from_bytes`] + /// * [`DFA::from_bytes_unchecked`] + /// + /// Note that unlike a [`dense::DFA`](crate::dfa::dense::DFA)'s + /// serialization methods, this does not add any initial padding to the + /// returned bytes. Padding isn't required for sparse DFAs since they have + /// no alignment requirements. + /// + /// # Example + /// + /// This example shows how to serialize and deserialize a DFA: + /// + /// ``` + /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input}; + /// + /// // Compile our original DFA. + /// let original_dfa = DFA::new("foo[0-9]+")?; + /// + /// // N.B. We use native endianness here to make the example work, but + /// // using to_bytes_little_endian would work on a little endian target. + /// let buf = original_dfa.to_bytes_native_endian(); + /// // Even if buf has initial padding, DFA::from_bytes will automatically + /// // ignore it. + /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf)?.0; + /// + /// let expected = Some(HalfMatch::must(0, 8)); + /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[cfg(feature = "dfa-build")] + pub fn to_bytes_little_endian(&self) -> Vec<u8> { + self.to_bytes::<wire::LE>() + } + + /// Serialize this DFA as raw bytes to a `Vec<u8>` in big endian + /// format. + /// + /// The written bytes are guaranteed to be deserialized correctly and + /// without errors in a semver compatible release of this crate by a + /// `DFA`'s deserialization APIs (assuming all other criteria for the + /// deserialization APIs has been satisfied): + /// + /// * [`DFA::from_bytes`] + /// * [`DFA::from_bytes_unchecked`] + /// + /// Note that unlike a [`dense::DFA`](crate::dfa::dense::DFA)'s + /// serialization methods, this does not add any initial padding to the + /// returned bytes. Padding isn't required for sparse DFAs since they have + /// no alignment requirements. + /// + /// # Example + /// + /// This example shows how to serialize and deserialize a DFA: + /// + /// ``` + /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input}; + /// + /// // Compile our original DFA. + /// let original_dfa = DFA::new("foo[0-9]+")?; + /// + /// // N.B. We use native endianness here to make the example work, but + /// // using to_bytes_big_endian would work on a big endian target. + /// let buf = original_dfa.to_bytes_native_endian(); + /// // Even if buf has initial padding, DFA::from_bytes will automatically + /// // ignore it. + /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf)?.0; + /// + /// let expected = Some(HalfMatch::must(0, 8)); + /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[cfg(feature = "dfa-build")] + pub fn to_bytes_big_endian(&self) -> Vec<u8> { + self.to_bytes::<wire::BE>() + } + + /// Serialize this DFA as raw bytes to a `Vec<u8>` in native endian + /// format. + /// + /// The written bytes are guaranteed to be deserialized correctly and + /// without errors in a semver compatible release of this crate by a + /// `DFA`'s deserialization APIs (assuming all other criteria for the + /// deserialization APIs has been satisfied): + /// + /// * [`DFA::from_bytes`] + /// * [`DFA::from_bytes_unchecked`] + /// + /// Note that unlike a [`dense::DFA`](crate::dfa::dense::DFA)'s + /// serialization methods, this does not add any initial padding to the + /// returned bytes. Padding isn't required for sparse DFAs since they have + /// no alignment requirements. + /// + /// Generally speaking, native endian format should only be used when + /// you know that the target you're compiling the DFA for matches the + /// endianness of the target on which you're compiling DFA. For example, + /// if serialization and deserialization happen in the same process or on + /// the same machine. Otherwise, when serializing a DFA for use in a + /// portable environment, you'll almost certainly want to serialize _both_ + /// a little endian and a big endian version and then load the correct one + /// based on the target's configuration. + /// + /// # Example + /// + /// This example shows how to serialize and deserialize a DFA: + /// + /// ``` + /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input}; + /// + /// // Compile our original DFA. + /// let original_dfa = DFA::new("foo[0-9]+")?; + /// + /// let buf = original_dfa.to_bytes_native_endian(); + /// // Even if buf has initial padding, DFA::from_bytes will automatically + /// // ignore it. + /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf)?.0; + /// + /// let expected = Some(HalfMatch::must(0, 8)); + /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[cfg(feature = "dfa-build")] + pub fn to_bytes_native_endian(&self) -> Vec<u8> { + self.to_bytes::<wire::NE>() + } + + /// The implementation of the public `to_bytes` serialization methods, + /// which is generic over endianness. + #[cfg(feature = "dfa-build")] + fn to_bytes<E: Endian>(&self) -> Vec<u8> { + let mut buf = vec![0; self.write_to_len()]; + // This should always succeed since the only possible serialization + // error is providing a buffer that's too small, but we've ensured that + // `buf` is big enough here. + self.write_to::<E>(&mut buf).unwrap(); + buf + } + + /// Serialize this DFA as raw bytes to the given slice, in little endian + /// format. Upon success, the total number of bytes written to `dst` is + /// returned. + /// + /// The written bytes are guaranteed to be deserialized correctly and + /// without errors in a semver compatible release of this crate by a + /// `DFA`'s deserialization APIs (assuming all other criteria for the + /// deserialization APIs has been satisfied): + /// + /// * [`DFA::from_bytes`] + /// * [`DFA::from_bytes_unchecked`] + /// + /// # Errors + /// + /// This returns an error if the given destination slice is not big enough + /// to contain the full serialized DFA. If an error occurs, then nothing + /// is written to `dst`. + /// + /// # Example + /// + /// This example shows how to serialize and deserialize a DFA without + /// dynamic memory allocation. + /// + /// ``` + /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input}; + /// + /// // Compile our original DFA. + /// let original_dfa = DFA::new("foo[0-9]+")?; + /// + /// // Create a 4KB buffer on the stack to store our serialized DFA. + /// let mut buf = [0u8; 4 * (1<<10)]; + /// // N.B. We use native endianness here to make the example work, but + /// // using write_to_little_endian would work on a little endian target. + /// let written = original_dfa.write_to_native_endian(&mut buf)?; + /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf[..written])?.0; + /// + /// let expected = Some(HalfMatch::must(0, 8)); + /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn write_to_little_endian( + &self, + dst: &mut [u8], + ) -> Result<usize, SerializeError> { + self.write_to::<wire::LE>(dst) + } + + /// Serialize this DFA as raw bytes to the given slice, in big endian + /// format. Upon success, the total number of bytes written to `dst` is + /// returned. + /// + /// The written bytes are guaranteed to be deserialized correctly and + /// without errors in a semver compatible release of this crate by a + /// `DFA`'s deserialization APIs (assuming all other criteria for the + /// deserialization APIs has been satisfied): + /// + /// * [`DFA::from_bytes`] + /// * [`DFA::from_bytes_unchecked`] + /// + /// # Errors + /// + /// This returns an error if the given destination slice is not big enough + /// to contain the full serialized DFA. If an error occurs, then nothing + /// is written to `dst`. + /// + /// # Example + /// + /// This example shows how to serialize and deserialize a DFA without + /// dynamic memory allocation. + /// + /// ``` + /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input}; + /// + /// // Compile our original DFA. + /// let original_dfa = DFA::new("foo[0-9]+")?; + /// + /// // Create a 4KB buffer on the stack to store our serialized DFA. + /// let mut buf = [0u8; 4 * (1<<10)]; + /// // N.B. We use native endianness here to make the example work, but + /// // using write_to_big_endian would work on a big endian target. + /// let written = original_dfa.write_to_native_endian(&mut buf)?; + /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf[..written])?.0; + /// + /// let expected = Some(HalfMatch::must(0, 8)); + /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn write_to_big_endian( + &self, + dst: &mut [u8], + ) -> Result<usize, SerializeError> { + self.write_to::<wire::BE>(dst) + } + + /// Serialize this DFA as raw bytes to the given slice, in native endian + /// format. Upon success, the total number of bytes written to `dst` is + /// returned. + /// + /// The written bytes are guaranteed to be deserialized correctly and + /// without errors in a semver compatible release of this crate by a + /// `DFA`'s deserialization APIs (assuming all other criteria for the + /// deserialization APIs has been satisfied): + /// + /// * [`DFA::from_bytes`] + /// * [`DFA::from_bytes_unchecked`] + /// + /// Generally speaking, native endian format should only be used when + /// you know that the target you're compiling the DFA for matches the + /// endianness of the target on which you're compiling DFA. For example, + /// if serialization and deserialization happen in the same process or on + /// the same machine. Otherwise, when serializing a DFA for use in a + /// portable environment, you'll almost certainly want to serialize _both_ + /// a little endian and a big endian version and then load the correct one + /// based on the target's configuration. + /// + /// # Errors + /// + /// This returns an error if the given destination slice is not big enough + /// to contain the full serialized DFA. If an error occurs, then nothing + /// is written to `dst`. + /// + /// # Example + /// + /// This example shows how to serialize and deserialize a DFA without + /// dynamic memory allocation. + /// + /// ``` + /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input}; + /// + /// // Compile our original DFA. + /// let original_dfa = DFA::new("foo[0-9]+")?; + /// + /// // Create a 4KB buffer on the stack to store our serialized DFA. + /// let mut buf = [0u8; 4 * (1<<10)]; + /// let written = original_dfa.write_to_native_endian(&mut buf)?; + /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf[..written])?.0; + /// + /// let expected = Some(HalfMatch::must(0, 8)); + /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn write_to_native_endian( + &self, + dst: &mut [u8], + ) -> Result<usize, SerializeError> { + self.write_to::<wire::NE>(dst) + } + + /// The implementation of the public `write_to` serialization methods, + /// which is generic over endianness. + fn write_to<E: Endian>( + &self, + dst: &mut [u8], + ) -> Result<usize, SerializeError> { + let mut nw = 0; + nw += wire::write_label(LABEL, &mut dst[nw..])?; + nw += wire::write_endianness_check::<E>(&mut dst[nw..])?; + nw += wire::write_version::<E>(VERSION, &mut dst[nw..])?; + nw += { + // Currently unused, intended for future flexibility + E::write_u32(0, &mut dst[nw..]); + size_of::<u32>() + }; + nw += self.flags.write_to::<E>(&mut dst[nw..])?; + nw += self.tt.write_to::<E>(&mut dst[nw..])?; + nw += self.st.write_to::<E>(&mut dst[nw..])?; + nw += self.special.write_to::<E>(&mut dst[nw..])?; + nw += self.quitset.write_to::<E>(&mut dst[nw..])?; + Ok(nw) + } + + /// Return the total number of bytes required to serialize this DFA. + /// + /// This is useful for determining the size of the buffer required to pass + /// to one of the serialization routines: + /// + /// * [`DFA::write_to_little_endian`] + /// * [`DFA::write_to_big_endian`] + /// * [`DFA::write_to_native_endian`] + /// + /// Passing a buffer smaller than the size returned by this method will + /// result in a serialization error. + /// + /// # Example + /// + /// This example shows how to dynamically allocate enough room to serialize + /// a sparse DFA. + /// + /// ``` + /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input}; + /// + /// // Compile our original DFA. + /// let original_dfa = DFA::new("foo[0-9]+")?; + /// + /// let mut buf = vec![0; original_dfa.write_to_len()]; + /// let written = original_dfa.write_to_native_endian(&mut buf)?; + /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf[..written])?.0; + /// + /// let expected = Some(HalfMatch::must(0, 8)); + /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn write_to_len(&self) -> usize { + wire::write_label_len(LABEL) + + wire::write_endianness_check_len() + + wire::write_version_len() + + size_of::<u32>() // unused, intended for future flexibility + + self.flags.write_to_len() + + self.tt.write_to_len() + + self.st.write_to_len() + + self.special.write_to_len() + + self.quitset.write_to_len() + } +} + +impl<'a> DFA<&'a [u8]> { + /// Safely deserialize a sparse DFA with a specific state identifier + /// representation. Upon success, this returns both the deserialized DFA + /// and the number of bytes read from the given slice. Namely, the contents + /// of the slice beyond the DFA are not read. + /// + /// Deserializing a DFA using this routine will never allocate heap memory. + /// For safety purposes, the DFA's transitions will be verified such that + /// every transition points to a valid state. If this verification is too + /// costly, then a [`DFA::from_bytes_unchecked`] API is provided, which + /// will always execute in constant time. + /// + /// The bytes given must be generated by one of the serialization APIs + /// of a `DFA` using a semver compatible release of this crate. Those + /// include: + /// + /// * [`DFA::to_bytes_little_endian`] + /// * [`DFA::to_bytes_big_endian`] + /// * [`DFA::to_bytes_native_endian`] + /// * [`DFA::write_to_little_endian`] + /// * [`DFA::write_to_big_endian`] + /// * [`DFA::write_to_native_endian`] + /// + /// The `to_bytes` methods allocate and return a `Vec<u8>` for you. The + /// `write_to` methods do not allocate and write to an existing slice + /// (which may be on the stack). Since deserialization always uses the + /// native endianness of the target platform, the serialization API you use + /// should match the endianness of the target platform. (It's often a good + /// idea to generate serialized DFAs for both forms of endianness and then + /// load the correct one based on endianness.) + /// + /// # Errors + /// + /// Generally speaking, it's easier to state the conditions in which an + /// error is _not_ returned. All of the following must be true: + /// + /// * The bytes given must be produced by one of the serialization APIs + /// on this DFA, as mentioned above. + /// * The endianness of the target platform matches the endianness used to + /// serialized the provided DFA. + /// + /// If any of the above are not true, then an error will be returned. + /// + /// Note that unlike deserializing a + /// [`dense::DFA`](crate::dfa::dense::DFA), deserializing a sparse DFA has + /// no alignment requirements. That is, an alignment of `1` is valid. + /// + /// # Panics + /// + /// This routine will never panic for any input. + /// + /// # Example + /// + /// This example shows how to serialize a DFA to raw bytes, deserialize it + /// and then use it for searching. + /// + /// ``` + /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input}; + /// + /// let initial = DFA::new("foo[0-9]+")?; + /// let bytes = initial.to_bytes_native_endian(); + /// let dfa: DFA<&[u8]> = DFA::from_bytes(&bytes)?.0; + /// + /// let expected = Some(HalfMatch::must(0, 8)); + /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// # Example: loading a DFA from static memory + /// + /// One use case this library supports is the ability to serialize a + /// DFA to disk and then use `include_bytes!` to store it in a compiled + /// Rust program. Those bytes can then be cheaply deserialized into a + /// `DFA` structure at runtime and used for searching without having to + /// re-compile the DFA (which can be quite costly). + /// + /// We can show this in two parts. The first part is serializing the DFA to + /// a file: + /// + /// ```no_run + /// use regex_automata::dfa::sparse::DFA; + /// + /// let dfa = DFA::new("foo[0-9]+")?; + /// + /// // Write a big endian serialized version of this DFA to a file. + /// let bytes = dfa.to_bytes_big_endian(); + /// std::fs::write("foo.bigendian.dfa", &bytes)?; + /// + /// // Do it again, but this time for little endian. + /// let bytes = dfa.to_bytes_little_endian(); + /// std::fs::write("foo.littleendian.dfa", &bytes)?; + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// And now the second part is embedding the DFA into the compiled program + /// and deserializing it at runtime on first use. We use conditional + /// compilation to choose the correct endianness. We do not need to employ + /// any special tricks to ensure a proper alignment, since a sparse DFA has + /// no alignment requirements. + /// + /// ```no_run + /// use regex_automata::{ + /// dfa::{Automaton, sparse::DFA}, + /// util::lazy::Lazy, + /// HalfMatch, Input, + /// }; + /// + /// // This crate provides its own "lazy" type, kind of like + /// // lazy_static! or once_cell::sync::Lazy. But it works in no-alloc + /// // no-std environments and let's us write this using completely + /// // safe code. + /// static RE: Lazy<DFA<&'static [u8]>> = Lazy::new(|| { + /// # const _: &str = stringify! { + /// #[cfg(target_endian = "big")] + /// static BYTES: &[u8] = include_bytes!("foo.bigendian.dfa"); + /// #[cfg(target_endian = "little")] + /// static BYTES: &[u8] = include_bytes!("foo.littleendian.dfa"); + /// # }; + /// # static BYTES: &[u8] = b""; + /// + /// let (dfa, _) = DFA::from_bytes(BYTES) + /// .expect("serialized DFA should be valid"); + /// dfa + /// }); + /// + /// let expected = Ok(Some(HalfMatch::must(0, 8))); + /// assert_eq!(expected, RE.try_search_fwd(&Input::new("foo12345"))); + /// ``` + /// + /// Alternatively, consider using + /// [`lazy_static`](https://crates.io/crates/lazy_static) + /// or + /// [`once_cell`](https://crates.io/crates/once_cell), + /// which will guarantee safety for you. + pub fn from_bytes( + slice: &'a [u8], + ) -> Result<(DFA<&'a [u8]>, usize), DeserializeError> { + // SAFETY: This is safe because we validate both the sparse transitions + // (by trying to decode every state) and start state ID list below. If + // either validation fails, then we return an error. + let (dfa, nread) = unsafe { DFA::from_bytes_unchecked(slice)? }; + dfa.tt.validate(&dfa.special)?; + dfa.st.validate(&dfa.special, &dfa.tt)?; + // N.B. dfa.special doesn't have a way to do unchecked deserialization, + // so it has already been validated. + Ok((dfa, nread)) + } + + /// Deserialize a DFA with a specific state identifier representation in + /// constant time by omitting the verification of the validity of the + /// sparse transitions. + /// + /// This is just like [`DFA::from_bytes`], except it can potentially return + /// a DFA that exhibits undefined behavior if its transitions contains + /// invalid state identifiers. + /// + /// This routine is useful if you need to deserialize a DFA cheaply and + /// cannot afford the transition validation performed by `from_bytes`. + /// + /// # Safety + /// + /// This routine is not safe because it permits callers to provide + /// arbitrary transitions with possibly incorrect state identifiers. While + /// the various serialization routines will never return an incorrect + /// DFA, there is no guarantee that the bytes provided here are correct. + /// While `from_bytes_unchecked` will still do several forms of basic + /// validation, this routine does not check that the transitions themselves + /// are correct. Given an incorrect transition table, it is possible for + /// the search routines to access out-of-bounds memory because of explicit + /// bounds check elision. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input}; + /// + /// let initial = DFA::new("foo[0-9]+")?; + /// let bytes = initial.to_bytes_native_endian(); + /// // SAFETY: This is guaranteed to be safe since the bytes given come + /// // directly from a compatible serialization routine. + /// let dfa: DFA<&[u8]> = unsafe { DFA::from_bytes_unchecked(&bytes)?.0 }; + /// + /// let expected = Some(HalfMatch::must(0, 8)); + /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub unsafe fn from_bytes_unchecked( + slice: &'a [u8], + ) -> Result<(DFA<&'a [u8]>, usize), DeserializeError> { + let mut nr = 0; + + nr += wire::read_label(&slice[nr..], LABEL)?; + nr += wire::read_endianness_check(&slice[nr..])?; + nr += wire::read_version(&slice[nr..], VERSION)?; + + let _unused = wire::try_read_u32(&slice[nr..], "unused space")?; + nr += size_of::<u32>(); + + let (flags, nread) = Flags::from_bytes(&slice[nr..])?; + nr += nread; + + let (tt, nread) = Transitions::from_bytes_unchecked(&slice[nr..])?; + nr += nread; + + let (st, nread) = StartTable::from_bytes_unchecked(&slice[nr..])?; + nr += nread; + + let (special, nread) = Special::from_bytes(&slice[nr..])?; + nr += nread; + if special.max.as_usize() >= tt.sparse().len() { + return Err(DeserializeError::generic( + "max should not be greater than or equal to sparse bytes", + )); + } + + let (quitset, nread) = ByteSet::from_bytes(&slice[nr..])?; + nr += nread; + + // Prefilters don't support serialization, so they're always absent. + let pre = None; + Ok((DFA { tt, st, special, pre, quitset, flags }, nr)) + } +} + +impl<T: AsRef<[u8]>> fmt::Debug for DFA<T> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + writeln!(f, "sparse::DFA(")?; + for state in self.tt.states() { + fmt_state_indicator(f, self, state.id())?; + writeln!(f, "{:06?}: {:?}", state.id().as_usize(), state)?; + } + writeln!(f, "")?; + for (i, (start_id, anchored, sty)) in self.st.iter().enumerate() { + if i % self.st.stride == 0 { + match anchored { + Anchored::No => writeln!(f, "START-GROUP(unanchored)")?, + Anchored::Yes => writeln!(f, "START-GROUP(anchored)")?, + Anchored::Pattern(pid) => writeln!( + f, + "START_GROUP(pattern: {:?})", + pid.as_usize() + )?, + } + } + writeln!(f, " {:?} => {:06?}", sty, start_id.as_usize())?; + } + writeln!(f, "state length: {:?}", self.tt.state_len)?; + writeln!(f, "pattern length: {:?}", self.pattern_len())?; + writeln!(f, "flags: {:?}", self.flags)?; + writeln!(f, ")")?; + Ok(()) + } +} + +// SAFETY: We assert that our implementation of each method is correct. +unsafe impl<T: AsRef<[u8]>> Automaton for DFA<T> { + #[inline] + fn is_special_state(&self, id: StateID) -> bool { + self.special.is_special_state(id) + } + + #[inline] + fn is_dead_state(&self, id: StateID) -> bool { + self.special.is_dead_state(id) + } + + #[inline] + fn is_quit_state(&self, id: StateID) -> bool { + self.special.is_quit_state(id) + } + + #[inline] + fn is_match_state(&self, id: StateID) -> bool { + self.special.is_match_state(id) + } + + #[inline] + fn is_start_state(&self, id: StateID) -> bool { + self.special.is_start_state(id) + } + + #[inline] + fn is_accel_state(&self, id: StateID) -> bool { + self.special.is_accel_state(id) + } + + // This is marked as inline to help dramatically boost sparse searching, + // which decodes each state it enters to follow the next transition. + #[cfg_attr(feature = "perf-inline", inline(always))] + fn next_state(&self, current: StateID, input: u8) -> StateID { + let input = self.tt.classes.get(input); + self.tt.state(current).next(input) + } + + #[inline] + unsafe fn next_state_unchecked( + &self, + current: StateID, + input: u8, + ) -> StateID { + self.next_state(current, input) + } + + #[inline] + fn next_eoi_state(&self, current: StateID) -> StateID { + self.tt.state(current).next_eoi() + } + + #[inline] + fn pattern_len(&self) -> usize { + self.tt.pattern_len + } + + #[inline] + fn match_len(&self, id: StateID) -> usize { + self.tt.state(id).pattern_len() + } + + #[inline] + fn match_pattern(&self, id: StateID, match_index: usize) -> PatternID { + // This is an optimization for the very common case of a DFA with a + // single pattern. This conditional avoids a somewhat more costly path + // that finds the pattern ID from the state machine, which requires + // a bit of slicing/pointer-chasing. This optimization tends to only + // matter when matches are frequent. + if self.tt.pattern_len == 1 { + return PatternID::ZERO; + } + self.tt.state(id).pattern_id(match_index) + } + + #[inline] + fn has_empty(&self) -> bool { + self.flags.has_empty + } + + #[inline] + fn is_utf8(&self) -> bool { + self.flags.is_utf8 + } + + #[inline] + fn is_always_start_anchored(&self) -> bool { + self.flags.is_always_start_anchored + } + + #[inline] + fn start_state_forward( + &self, + input: &Input<'_>, + ) -> Result<StateID, MatchError> { + if !self.quitset.is_empty() && input.start() > 0 { + let offset = input.start() - 1; + let byte = input.haystack()[offset]; + if self.quitset.contains(byte) { + return Err(MatchError::quit(byte, offset)); + } + } + let start = self.st.start_map.fwd(&input); + self.st.start(input, start) + } + + #[inline] + fn start_state_reverse( + &self, + input: &Input<'_>, + ) -> Result<StateID, MatchError> { + if !self.quitset.is_empty() && input.end() < input.haystack().len() { + let offset = input.end(); + let byte = input.haystack()[offset]; + if self.quitset.contains(byte) { + return Err(MatchError::quit(byte, offset)); + } + } + let start = self.st.start_map.rev(&input); + self.st.start(input, start) + } + + #[inline] + fn universal_start_state(&self, mode: Anchored) -> Option<StateID> { + match mode { + Anchored::No => self.st.universal_start_unanchored, + Anchored::Yes => self.st.universal_start_anchored, + Anchored::Pattern(_) => None, + } + } + + #[inline] + fn accelerator(&self, id: StateID) -> &[u8] { + self.tt.state(id).accelerator() + } + + #[inline] + fn get_prefilter(&self) -> Option<&Prefilter> { + self.pre.as_ref() + } +} + +/// The transition table portion of a sparse DFA. +/// +/// The transition table is the core part of the DFA in that it describes how +/// to move from one state to another based on the input sequence observed. +/// +/// Unlike a typical dense table based DFA, states in a sparse transition +/// table have variable size. That is, states with more transitions use more +/// space than states with fewer transitions. This means that finding the next +/// transition takes more work than with a dense DFA, but also typically uses +/// much less space. +#[derive(Clone)] +struct Transitions<T> { + /// The raw encoding of each state in this DFA. + /// + /// Each state has the following information: + /// + /// * A set of transitions to subsequent states. Transitions to the dead + /// state are omitted. + /// * If the state can be accelerated, then any additional accelerator + /// information. + /// * If the state is a match state, then the state contains all pattern + /// IDs that match when in that state. + /// + /// To decode a state, use Transitions::state. + /// + /// In practice, T is either Vec<u8> or &[u8]. + sparse: T, + /// A set of equivalence classes, where a single equivalence class + /// represents a set of bytes that never discriminate between a match + /// and a non-match in the DFA. Each equivalence class corresponds to a + /// single character in this DFA's alphabet, where the maximum number of + /// characters is 257 (each possible value of a byte plus the special + /// EOI transition). Consequently, the number of equivalence classes + /// corresponds to the number of transitions for each DFA state. Note + /// though that the *space* used by each DFA state in the transition table + /// may be larger. The total space used by each DFA state is known as the + /// stride and is documented above. + /// + /// The only time the number of equivalence classes is fewer than 257 is + /// if the DFA's kind uses byte classes which is the default. Equivalence + /// classes should generally only be disabled when debugging, so that + /// the transitions themselves aren't obscured. Disabling them has no + /// other benefit, since the equivalence class map is always used while + /// searching. In the vast majority of cases, the number of equivalence + /// classes is substantially smaller than 257, particularly when large + /// Unicode classes aren't used. + /// + /// N.B. Equivalence classes aren't particularly useful in a sparse DFA + /// in the current implementation, since equivalence classes generally tend + /// to correspond to continuous ranges of bytes that map to the same + /// transition. So in a sparse DFA, equivalence classes don't really lead + /// to a space savings. In the future, it would be good to try and remove + /// them from sparse DFAs entirely, but requires a bit of work since sparse + /// DFAs are built from dense DFAs, which are in turn built on top of + /// equivalence classes. + classes: ByteClasses, + /// The total number of states in this DFA. Note that a DFA always has at + /// least one state---the dead state---even the empty DFA. In particular, + /// the dead state always has ID 0 and is correspondingly always the first + /// state. The dead state is never a match state. + state_len: usize, + /// The total number of unique patterns represented by these match states. + pattern_len: usize, +} + +impl<'a> Transitions<&'a [u8]> { + unsafe fn from_bytes_unchecked( + mut slice: &'a [u8], + ) -> Result<(Transitions<&'a [u8]>, usize), DeserializeError> { + let slice_start = slice.as_ptr().as_usize(); + + let (state_len, nr) = + wire::try_read_u32_as_usize(&slice, "state length")?; + slice = &slice[nr..]; + + let (pattern_len, nr) = + wire::try_read_u32_as_usize(&slice, "pattern length")?; + slice = &slice[nr..]; + + let (classes, nr) = ByteClasses::from_bytes(&slice)?; + slice = &slice[nr..]; + + let (len, nr) = + wire::try_read_u32_as_usize(&slice, "sparse transitions length")?; + slice = &slice[nr..]; + + wire::check_slice_len(slice, len, "sparse states byte length")?; + let sparse = &slice[..len]; + slice = &slice[len..]; + + let trans = Transitions { sparse, classes, state_len, pattern_len }; + Ok((trans, slice.as_ptr().as_usize() - slice_start)) + } +} + +impl<T: AsRef<[u8]>> Transitions<T> { + /// Writes a serialized form of this transition table to the buffer given. + /// If the buffer is too small, then an error is returned. To determine + /// how big the buffer must be, use `write_to_len`. + fn write_to<E: Endian>( + &self, + mut dst: &mut [u8], + ) -> Result<usize, SerializeError> { + let nwrite = self.write_to_len(); + if dst.len() < nwrite { + return Err(SerializeError::buffer_too_small( + "sparse transition table", + )); + } + dst = &mut dst[..nwrite]; + + // write state length + E::write_u32(u32::try_from(self.state_len).unwrap(), dst); + dst = &mut dst[size_of::<u32>()..]; + + // write pattern length + E::write_u32(u32::try_from(self.pattern_len).unwrap(), dst); + dst = &mut dst[size_of::<u32>()..]; + + // write byte class map + let n = self.classes.write_to(dst)?; + dst = &mut dst[n..]; + + // write number of bytes in sparse transitions + E::write_u32(u32::try_from(self.sparse().len()).unwrap(), dst); + dst = &mut dst[size_of::<u32>()..]; + + // write actual transitions + let mut id = DEAD; + while id.as_usize() < self.sparse().len() { + let state = self.state(id); + let n = state.write_to::<E>(&mut dst)?; + dst = &mut dst[n..]; + // The next ID is the offset immediately following `state`. + id = StateID::new(id.as_usize() + state.write_to_len()).unwrap(); + } + Ok(nwrite) + } + + /// Returns the number of bytes the serialized form of this transition + /// table will use. + fn write_to_len(&self) -> usize { + size_of::<u32>() // state length + + size_of::<u32>() // pattern length + + self.classes.write_to_len() + + size_of::<u32>() // sparse transitions length + + self.sparse().len() + } + + /// Validates that every state ID in this transition table is valid. + /// + /// That is, every state ID can be used to correctly index a state in this + /// table. + fn validate(&self, sp: &Special) -> Result<(), DeserializeError> { + // In order to validate everything, we not only need to make sure we + // can decode every state, but that every transition in every state + // points to a valid state. There are many duplicative transitions, so + // we record state IDs that we've verified so that we don't redo the + // decoding work. + // + // Except, when in no_std mode, we don't have dynamic memory allocation + // available to us, so we skip this optimization. It's not clear + // whether doing something more clever is worth it just yet. If you're + // profiling this code and need it to run faster, please file an issue. + // + // OK, so we also use this to record the set of valid state IDs. Since + // it is possible for a transition to point to an invalid state ID that + // still (somehow) deserializes to a valid state. So we need to make + // sure our transitions are limited to actually correct state IDs. + // The problem is, I'm not sure how to do this verification step in + // no-std no-alloc mode. I think we'd *have* to store the set of valid + // state IDs in the DFA itself. For now, we don't do this verification + // in no-std no-alloc mode. The worst thing that can happen is an + // incorrect result. But no panics or memory safety problems should + // result. Because we still do validate that the state itself is + // "valid" in the sense that everything it points to actually exists. + // + // ---AG + struct Seen { + #[cfg(feature = "alloc")] + set: alloc::collections::BTreeSet<StateID>, + #[cfg(not(feature = "alloc"))] + set: core::marker::PhantomData<StateID>, + } + + #[cfg(feature = "alloc")] + impl Seen { + fn new() -> Seen { + Seen { set: alloc::collections::BTreeSet::new() } + } + fn insert(&mut self, id: StateID) { + self.set.insert(id); + } + fn contains(&self, id: &StateID) -> bool { + self.set.contains(id) + } + } + + #[cfg(not(feature = "alloc"))] + impl Seen { + fn new() -> Seen { + Seen { set: core::marker::PhantomData } + } + fn insert(&mut self, _id: StateID) {} + fn contains(&self, _id: &StateID) -> bool { + false + } + } + + let mut verified: Seen = Seen::new(); + // We need to make sure that we decode the correct number of states. + // Otherwise, an empty set of transitions would validate even if the + // recorded state length is non-empty. + let mut len = 0; + // We can't use the self.states() iterator because it assumes the state + // encodings are valid. It could panic if they aren't. + let mut id = DEAD; + while id.as_usize() < self.sparse().len() { + // Before we even decode the state, we check that the ID itself + // is well formed. That is, if it's a special state then it must + // actually be a quit, dead, accel, match or start state. + if sp.is_special_state(id) { + let is_actually_special = sp.is_dead_state(id) + || sp.is_quit_state(id) + || sp.is_match_state(id) + || sp.is_start_state(id) + || sp.is_accel_state(id); + if !is_actually_special { + // This is kind of a cryptic error message... + return Err(DeserializeError::generic( + "found sparse state tagged as special but \ + wasn't actually special", + )); + } + } + let state = self.try_state(sp, id)?; + verified.insert(id); + // The next ID should be the offset immediately following `state`. + id = StateID::new(wire::add( + id.as_usize(), + state.write_to_len(), + "next state ID offset", + )?) + .map_err(|err| { + DeserializeError::state_id_error(err, "next state ID offset") + })?; + len += 1; + } + // Now that we've checked that all top-level states are correct and + // importantly, collected a set of valid state IDs, we have all the + // information we need to check that all transitions are correct too. + // + // Note that we can't use `valid_ids` to iterate because it will + // be empty in no-std no-alloc contexts. (And yes, that means our + // verification isn't quite as good.) We can use `self.states()` + // though at least, since we know that all states can at least be + // decoded and traversed correctly. + for state in self.states() { + // Check that all transitions in this state are correct. + for i in 0..state.ntrans { + let to = state.next_at(i); + // For no-alloc, we just check that the state can decode. It is + // technically possible that the state ID could still point to + // a non-existent state even if it decodes (fuzzing proved this + // to be true), but it shouldn't result in any memory unsafety + // or panics in non-debug mode. + #[cfg(not(feature = "alloc"))] + { + let _ = self.try_state(sp, to)?; + } + #[cfg(feature = "alloc")] + { + if !verified.contains(&to) { + return Err(DeserializeError::generic( + "found transition that points to a \ + non-existent state", + )); + } + } + } + } + if len != self.state_len { + return Err(DeserializeError::generic( + "mismatching sparse state length", + )); + } + Ok(()) + } + + /// Converts these transitions to a borrowed value. + fn as_ref(&self) -> Transitions<&'_ [u8]> { + Transitions { + sparse: self.sparse(), + classes: self.classes.clone(), + state_len: self.state_len, + pattern_len: self.pattern_len, + } + } + + /// Converts these transitions to an owned value. + #[cfg(feature = "alloc")] + fn to_owned(&self) -> Transitions<alloc::vec::Vec<u8>> { + Transitions { + sparse: self.sparse().to_vec(), + classes: self.classes.clone(), + state_len: self.state_len, + pattern_len: self.pattern_len, + } + } + + /// Return a convenient representation of the given state. + /// + /// This panics if the state is invalid. + /// + /// This is marked as inline to help dramatically boost sparse searching, + /// which decodes each state it enters to follow the next transition. Other + /// functions involved are also inlined, which should hopefully eliminate + /// a lot of the extraneous decoding that is never needed just to follow + /// the next transition. + #[cfg_attr(feature = "perf-inline", inline(always))] + fn state(&self, id: StateID) -> State<'_> { + let mut state = &self.sparse()[id.as_usize()..]; + let mut ntrans = wire::read_u16(&state).as_usize(); + let is_match = (1 << 15) & ntrans != 0; + ntrans &= !(1 << 15); + state = &state[2..]; + + let (input_ranges, state) = state.split_at(ntrans * 2); + let (next, state) = state.split_at(ntrans * StateID::SIZE); + let (pattern_ids, state) = if is_match { + let npats = wire::read_u32(&state).as_usize(); + state[4..].split_at(npats * 4) + } else { + (&[][..], state) + }; + + let accel_len = usize::from(state[0]); + let accel = &state[1..accel_len + 1]; + State { id, is_match, ntrans, input_ranges, next, pattern_ids, accel } + } + + /// Like `state`, but will return an error if the state encoding is + /// invalid. This is useful for verifying states after deserialization, + /// which is required for a safe deserialization API. + /// + /// Note that this only verifies that this state is decodable and that + /// all of its data is consistent. It does not verify that its state ID + /// transitions point to valid states themselves, nor does it verify that + /// every pattern ID is valid. + fn try_state( + &self, + sp: &Special, + id: StateID, + ) -> Result<State<'_>, DeserializeError> { + if id.as_usize() > self.sparse().len() { + return Err(DeserializeError::generic( + "invalid caller provided sparse state ID", + )); + } + let mut state = &self.sparse()[id.as_usize()..]; + // Encoding format starts with a u16 that stores the total number of + // transitions in this state. + let (mut ntrans, _) = + wire::try_read_u16_as_usize(state, "state transition length")?; + let is_match = ((1 << 15) & ntrans) != 0; + ntrans &= !(1 << 15); + state = &state[2..]; + if ntrans > 257 || ntrans == 0 { + return Err(DeserializeError::generic( + "invalid transition length", + )); + } + if is_match && !sp.is_match_state(id) { + return Err(DeserializeError::generic( + "state marked as match but not in match ID range", + )); + } else if !is_match && sp.is_match_state(id) { + return Err(DeserializeError::generic( + "state in match ID range but not marked as match state", + )); + } + + // Each transition has two pieces: an inclusive range of bytes on which + // it is defined, and the state ID that those bytes transition to. The + // pairs come first, followed by a corresponding sequence of state IDs. + let input_ranges_len = ntrans.checked_mul(2).unwrap(); + wire::check_slice_len(state, input_ranges_len, "sparse byte pairs")?; + let (input_ranges, state) = state.split_at(input_ranges_len); + // Every range should be of the form A-B, where A<=B. + for pair in input_ranges.chunks(2) { + let (start, end) = (pair[0], pair[1]); + if start > end { + return Err(DeserializeError::generic("invalid input range")); + } + } + + // And now extract the corresponding sequence of state IDs. We leave + // this sequence as a &[u8] instead of a &[S] because sparse DFAs do + // not have any alignment requirements. + let next_len = ntrans + .checked_mul(self.id_len()) + .expect("state size * #trans should always fit in a usize"); + wire::check_slice_len(state, next_len, "sparse trans state IDs")?; + let (next, state) = state.split_at(next_len); + // We can at least verify that every state ID is in bounds. + for idbytes in next.chunks(self.id_len()) { + let (id, _) = + wire::read_state_id(idbytes, "sparse state ID in try_state")?; + wire::check_slice_len( + self.sparse(), + id.as_usize(), + "invalid sparse state ID", + )?; + } + + // If this is a match state, then read the pattern IDs for this state. + // Pattern IDs is a u32-length prefixed sequence of native endian + // encoded 32-bit integers. + let (pattern_ids, state) = if is_match { + let (npats, nr) = + wire::try_read_u32_as_usize(state, "pattern ID length")?; + let state = &state[nr..]; + if npats == 0 { + return Err(DeserializeError::generic( + "state marked as a match, but has no pattern IDs", + )); + } + + let pattern_ids_len = + wire::mul(npats, 4, "sparse pattern ID byte length")?; + wire::check_slice_len( + state, + pattern_ids_len, + "sparse pattern IDs", + )?; + let (pattern_ids, state) = state.split_at(pattern_ids_len); + for patbytes in pattern_ids.chunks(PatternID::SIZE) { + wire::read_pattern_id( + patbytes, + "sparse pattern ID in try_state", + )?; + } + (pattern_ids, state) + } else { + (&[][..], state) + }; + + // Now read this state's accelerator info. The first byte is the length + // of the accelerator, which is typically 0 (for no acceleration) but + // is no bigger than 3. The length indicates the number of bytes that + // follow, where each byte corresponds to a transition out of this + // state. + if state.is_empty() { + return Err(DeserializeError::generic("no accelerator length")); + } + let (accel_len, state) = (usize::from(state[0]), &state[1..]); + + if accel_len > 3 { + return Err(DeserializeError::generic( + "sparse invalid accelerator length", + )); + } else if accel_len == 0 && sp.is_accel_state(id) { + return Err(DeserializeError::generic( + "got no accelerators in state, but in accelerator ID range", + )); + } else if accel_len > 0 && !sp.is_accel_state(id) { + return Err(DeserializeError::generic( + "state in accelerator ID range, but has no accelerators", + )); + } + + wire::check_slice_len( + state, + accel_len, + "sparse corrupt accelerator length", + )?; + let (accel, _) = (&state[..accel_len], &state[accel_len..]); + + let state = State { + id, + is_match, + ntrans, + input_ranges, + next, + pattern_ids, + accel, + }; + if sp.is_quit_state(state.next_at(state.ntrans - 1)) { + return Err(DeserializeError::generic( + "state with EOI transition to quit state is illegal", + )); + } + Ok(state) + } + + /// Return an iterator over all of the states in this DFA. + /// + /// The iterator returned yields tuples, where the first element is the + /// state ID and the second element is the state itself. + fn states(&self) -> StateIter<'_, T> { + StateIter { trans: self, id: DEAD.as_usize() } + } + + /// Returns the sparse transitions as raw bytes. + fn sparse(&self) -> &[u8] { + self.sparse.as_ref() + } + + /// Returns the number of bytes represented by a single state ID. + fn id_len(&self) -> usize { + StateID::SIZE + } + + /// Return the memory usage, in bytes, of these transitions. + /// + /// This does not include the size of a `Transitions` value itself. + fn memory_usage(&self) -> usize { + self.sparse().len() + } +} + +#[cfg(feature = "dfa-build")] +impl<T: AsMut<[u8]>> Transitions<T> { + /// Return a convenient mutable representation of the given state. + /// This panics if the state is invalid. + fn state_mut(&mut self, id: StateID) -> StateMut<'_> { + let mut state = &mut self.sparse_mut()[id.as_usize()..]; + let mut ntrans = wire::read_u16(&state).as_usize(); + let is_match = (1 << 15) & ntrans != 0; + ntrans &= !(1 << 15); + state = &mut state[2..]; + + let (input_ranges, state) = state.split_at_mut(ntrans * 2); + let (next, state) = state.split_at_mut(ntrans * StateID::SIZE); + let (pattern_ids, state) = if is_match { + let npats = wire::read_u32(&state).as_usize(); + state[4..].split_at_mut(npats * 4) + } else { + (&mut [][..], state) + }; + + let accel_len = usize::from(state[0]); + let accel = &mut state[1..accel_len + 1]; + StateMut { + id, + is_match, + ntrans, + input_ranges, + next, + pattern_ids, + accel, + } + } + + /// Returns the sparse transitions as raw mutable bytes. + fn sparse_mut(&mut self) -> &mut [u8] { + self.sparse.as_mut() + } +} + +/// The set of all possible starting states in a DFA. +/// +/// See the eponymous type in the `dense` module for more details. This type +/// is very similar to `dense::StartTable`, except that its underlying +/// representation is `&[u8]` instead of `&[S]`. (The latter would require +/// sparse DFAs to be aligned, which is explicitly something we do not require +/// because we don't really need it.) +#[derive(Clone)] +struct StartTable<T> { + /// The initial start state IDs as a contiguous table of native endian + /// encoded integers, represented by `S`. + /// + /// In practice, T is either Vec<u8> or &[u8] and has no alignment + /// requirements. + /// + /// The first `2 * stride` (currently always 8) entries always correspond + /// to the starts states for the entire DFA, with the first 4 entries being + /// for unanchored searches and the second 4 entries being for anchored + /// searches. To keep things simple, we always use 8 entries even if the + /// `StartKind` is not both. + /// + /// After that, there are `stride * patterns` state IDs, where `patterns` + /// may be zero in the case of a DFA with no patterns or in the case where + /// the DFA was built without enabling starting states for each pattern. + table: T, + /// The starting state configuration supported. When 'both', both + /// unanchored and anchored searches work. When 'unanchored', anchored + /// searches panic. When 'anchored', unanchored searches panic. + kind: StartKind, + /// The start state configuration for every possible byte. + start_map: StartByteMap, + /// The number of starting state IDs per pattern. + stride: usize, + /// The total number of patterns for which starting states are encoded. + /// This is `None` for DFAs that were built without start states for each + /// pattern. Thus, one cannot use this field to say how many patterns + /// are in the DFA in all cases. It is specific to how many patterns are + /// represented in this start table. + pattern_len: Option<usize>, + /// The universal starting state for unanchored searches. This is only + /// present when the DFA supports unanchored searches and when all starting + /// state IDs for an unanchored search are equivalent. + universal_start_unanchored: Option<StateID>, + /// The universal starting state for anchored searches. This is only + /// present when the DFA supports anchored searches and when all starting + /// state IDs for an anchored search are equivalent. + universal_start_anchored: Option<StateID>, +} + +#[cfg(feature = "dfa-build")] +impl StartTable<Vec<u8>> { + fn new<T: AsRef<[u32]>>( + dfa: &dense::DFA<T>, + pattern_len: Option<usize>, + ) -> StartTable<Vec<u8>> { + let stride = Start::len(); + // This is OK since the only way we're here is if a dense DFA could be + // constructed successfully, which uses the same space. + let len = stride + .checked_mul(pattern_len.unwrap_or(0)) + .unwrap() + .checked_add(stride.checked_mul(2).unwrap()) + .unwrap() + .checked_mul(StateID::SIZE) + .unwrap(); + StartTable { + table: vec![0; len], + kind: dfa.start_kind(), + start_map: dfa.start_map().clone(), + stride, + pattern_len, + universal_start_unanchored: dfa + .universal_start_state(Anchored::No), + universal_start_anchored: dfa.universal_start_state(Anchored::Yes), + } + } + + fn from_dense_dfa<T: AsRef<[u32]>>( + dfa: &dense::DFA<T>, + remap: &[StateID], + ) -> Result<StartTable<Vec<u8>>, BuildError> { + // Unless the DFA has start states compiled for each pattern, then + // as far as the starting state table is concerned, there are zero + // patterns to account for. It will instead only store starting states + // for the entire DFA. + let start_pattern_len = if dfa.starts_for_each_pattern() { + Some(dfa.pattern_len()) + } else { + None + }; + let mut sl = StartTable::new(dfa, start_pattern_len); + for (old_start_id, anchored, sty) in dfa.starts() { + let new_start_id = remap[dfa.to_index(old_start_id)]; + sl.set_start(anchored, sty, new_start_id); + } + Ok(sl) + } +} + +impl<'a> StartTable<&'a [u8]> { + unsafe fn from_bytes_unchecked( + mut slice: &'a [u8], + ) -> Result<(StartTable<&'a [u8]>, usize), DeserializeError> { + let slice_start = slice.as_ptr().as_usize(); + + let (kind, nr) = StartKind::from_bytes(slice)?; + slice = &slice[nr..]; + + let (start_map, nr) = StartByteMap::from_bytes(slice)?; + slice = &slice[nr..]; + + let (stride, nr) = + wire::try_read_u32_as_usize(slice, "sparse start table stride")?; + slice = &slice[nr..]; + if stride != Start::len() { + return Err(DeserializeError::generic( + "invalid sparse starting table stride", + )); + } + + let (maybe_pattern_len, nr) = + wire::try_read_u32_as_usize(slice, "sparse start table patterns")?; + slice = &slice[nr..]; + let pattern_len = if maybe_pattern_len.as_u32() == u32::MAX { + None + } else { + Some(maybe_pattern_len) + }; + if pattern_len.map_or(false, |len| len > PatternID::LIMIT) { + return Err(DeserializeError::generic( + "sparse invalid number of patterns", + )); + } + + let (universal_unanchored, nr) = + wire::try_read_u32(slice, "universal unanchored start")?; + slice = &slice[nr..]; + let universal_start_unanchored = if universal_unanchored == u32::MAX { + None + } else { + Some(StateID::try_from(universal_unanchored).map_err(|e| { + DeserializeError::state_id_error( + e, + "universal unanchored start", + ) + })?) + }; + + let (universal_anchored, nr) = + wire::try_read_u32(slice, "universal anchored start")?; + slice = &slice[nr..]; + let universal_start_anchored = if universal_anchored == u32::MAX { + None + } else { + Some(StateID::try_from(universal_anchored).map_err(|e| { + DeserializeError::state_id_error(e, "universal anchored start") + })?) + }; + + let pattern_table_size = wire::mul( + stride, + pattern_len.unwrap_or(0), + "sparse invalid pattern length", + )?; + // Our start states always start with a single stride of start states + // for the entire automaton which permit it to match any pattern. What + // follows it are an optional set of start states for each pattern. + let start_state_len = wire::add( + wire::mul(2, stride, "start state stride too big")?, + pattern_table_size, + "sparse invalid 'any' pattern starts size", + )?; + let table_bytes_len = wire::mul( + start_state_len, + StateID::SIZE, + "sparse pattern table bytes length", + )?; + wire::check_slice_len( + slice, + table_bytes_len, + "sparse start ID table", + )?; + let table = &slice[..table_bytes_len]; + slice = &slice[table_bytes_len..]; + + let sl = StartTable { + table, + kind, + start_map, + stride, + pattern_len, + universal_start_unanchored, + universal_start_anchored, + }; + Ok((sl, slice.as_ptr().as_usize() - slice_start)) + } +} + +impl<T: AsRef<[u8]>> StartTable<T> { + fn write_to<E: Endian>( + &self, + mut dst: &mut [u8], + ) -> Result<usize, SerializeError> { + let nwrite = self.write_to_len(); + if dst.len() < nwrite { + return Err(SerializeError::buffer_too_small( + "sparse starting table ids", + )); + } + dst = &mut dst[..nwrite]; + + // write start kind + let nw = self.kind.write_to::<E>(dst)?; + dst = &mut dst[nw..]; + // write start byte map + let nw = self.start_map.write_to(dst)?; + dst = &mut dst[nw..]; + // write stride + E::write_u32(u32::try_from(self.stride).unwrap(), dst); + dst = &mut dst[size_of::<u32>()..]; + // write pattern length + E::write_u32( + u32::try_from(self.pattern_len.unwrap_or(0xFFFF_FFFF)).unwrap(), + dst, + ); + dst = &mut dst[size_of::<u32>()..]; + // write universal start unanchored state id, u32::MAX if absent + E::write_u32( + self.universal_start_unanchored + .map_or(u32::MAX, |sid| sid.as_u32()), + dst, + ); + dst = &mut dst[size_of::<u32>()..]; + // write universal start anchored state id, u32::MAX if absent + E::write_u32( + self.universal_start_anchored.map_or(u32::MAX, |sid| sid.as_u32()), + dst, + ); + dst = &mut dst[size_of::<u32>()..]; + // write start IDs + for (sid, _, _) in self.iter() { + E::write_u32(sid.as_u32(), dst); + dst = &mut dst[StateID::SIZE..]; + } + Ok(nwrite) + } + + /// Returns the number of bytes the serialized form of this transition + /// table will use. + fn write_to_len(&self) -> usize { + self.kind.write_to_len() + + self.start_map.write_to_len() + + size_of::<u32>() // stride + + size_of::<u32>() // # patterns + + size_of::<u32>() // universal unanchored start + + size_of::<u32>() // universal anchored start + + self.table().len() + } + + /// Validates that every starting state ID in this table is valid. + /// + /// That is, every starting state ID can be used to correctly decode a + /// state in the DFA's sparse transitions. + fn validate( + &self, + sp: &Special, + trans: &Transitions<T>, + ) -> Result<(), DeserializeError> { + for (id, _, _) in self.iter() { + if sp.is_match_state(id) { + return Err(DeserializeError::generic( + "start states cannot be match states", + )); + } + // Confirm that the start state points to a valid state. + let state = trans.try_state(sp, id)?; + // And like for the transition table, confirm that the transitions + // on all start states themselves point to a valid state. + // + // It'd probably be better to integrate this validation with the + // transition table, or otherwise store a sorted sequence of all + // valid state IDs in the sparse DFA itself. That way, we could + // check that every pointer to a state corresponds precisely to a + // correct and valid state. + for i in 0..state.ntrans { + let to = state.next_at(i); + let _ = trans.try_state(sp, to)?; + } + } + Ok(()) + } + + /// Converts this start list to a borrowed value. + fn as_ref(&self) -> StartTable<&'_ [u8]> { + StartTable { + table: self.table(), + kind: self.kind, + start_map: self.start_map.clone(), + stride: self.stride, + pattern_len: self.pattern_len, + universal_start_unanchored: self.universal_start_unanchored, + universal_start_anchored: self.universal_start_anchored, + } + } + + /// Converts this start list to an owned value. + #[cfg(feature = "alloc")] + fn to_owned(&self) -> StartTable<alloc::vec::Vec<u8>> { + StartTable { + table: self.table().to_vec(), + kind: self.kind, + start_map: self.start_map.clone(), + stride: self.stride, + pattern_len: self.pattern_len, + universal_start_unanchored: self.universal_start_unanchored, + universal_start_anchored: self.universal_start_anchored, + } + } + + /// Return the start state for the given index and pattern ID. If the + /// pattern ID is None, then the corresponding start state for the entire + /// DFA is returned. If the pattern ID is not None, then the corresponding + /// starting state for the given pattern is returned. If this start table + /// does not have individual starting states for each pattern, then this + /// panics. + fn start( + &self, + input: &Input<'_>, + start: Start, + ) -> Result<StateID, MatchError> { + let start_index = start.as_usize(); + let mode = input.get_anchored(); + let index = match mode { + Anchored::No => { + if !self.kind.has_unanchored() { + return Err(MatchError::unsupported_anchored(mode)); + } + start_index + } + Anchored::Yes => { + if !self.kind.has_anchored() { + return Err(MatchError::unsupported_anchored(mode)); + } + self.stride + start_index + } + Anchored::Pattern(pid) => { + let len = match self.pattern_len { + None => { + return Err(MatchError::unsupported_anchored(mode)) + } + Some(len) => len, + }; + if pid.as_usize() >= len { + return Ok(DEAD); + } + (2 * self.stride) + + (self.stride * pid.as_usize()) + + start_index + } + }; + let start = index * StateID::SIZE; + // This OK since we're allowed to assume that the start table contains + // valid StateIDs. + Ok(wire::read_state_id_unchecked(&self.table()[start..]).0) + } + + /// Return an iterator over all start IDs in this table. + fn iter(&self) -> StartStateIter<'_, T> { + StartStateIter { st: self, i: 0 } + } + + /// Returns the total number of start state IDs in this table. + fn len(&self) -> usize { + self.table().len() / StateID::SIZE + } + + /// Returns the table as a raw slice of bytes. + fn table(&self) -> &[u8] { + self.table.as_ref() + } + + /// Return the memory usage, in bytes, of this start list. + /// + /// This does not include the size of a `StartTable` value itself. + fn memory_usage(&self) -> usize { + self.table().len() + } +} + +#[cfg(feature = "dfa-build")] +impl<T: AsMut<[u8]>> StartTable<T> { + /// Set the start state for the given index and pattern. + /// + /// If the pattern ID or state ID are not valid, then this will panic. + fn set_start(&mut self, anchored: Anchored, start: Start, id: StateID) { + let start_index = start.as_usize(); + let index = match anchored { + Anchored::No => start_index, + Anchored::Yes => self.stride + start_index, + Anchored::Pattern(pid) => { + let pid = pid.as_usize(); + let len = self + .pattern_len + .expect("start states for each pattern enabled"); + assert!(pid < len, "invalid pattern ID {:?}", pid); + self.stride + .checked_mul(pid) + .unwrap() + .checked_add(self.stride.checked_mul(2).unwrap()) + .unwrap() + .checked_add(start_index) + .unwrap() + } + }; + let start = index * StateID::SIZE; + let end = start + StateID::SIZE; + wire::write_state_id::<wire::NE>( + id, + &mut self.table.as_mut()[start..end], + ); + } +} + +/// An iterator over all state state IDs in a sparse DFA. +struct StartStateIter<'a, T> { + st: &'a StartTable<T>, + i: usize, +} + +impl<'a, T: AsRef<[u8]>> Iterator for StartStateIter<'a, T> { + type Item = (StateID, Anchored, Start); + + fn next(&mut self) -> Option<(StateID, Anchored, Start)> { + let i = self.i; + if i >= self.st.len() { + return None; + } + self.i += 1; + + // This unwrap is okay since the stride of any DFA must always match + // the number of start state types. + let start_type = Start::from_usize(i % self.st.stride).unwrap(); + let anchored = if i < self.st.stride { + Anchored::No + } else if i < (2 * self.st.stride) { + Anchored::Yes + } else { + let pid = (i - (2 * self.st.stride)) / self.st.stride; + Anchored::Pattern(PatternID::new(pid).unwrap()) + }; + let start = i * StateID::SIZE; + let end = start + StateID::SIZE; + let bytes = self.st.table()[start..end].try_into().unwrap(); + // This is OK since we're allowed to assume that any IDs in this start + // table are correct and valid for this DFA. + let id = StateID::from_ne_bytes_unchecked(bytes); + Some((id, anchored, start_type)) + } +} + +impl<'a, T> fmt::Debug for StartStateIter<'a, T> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + f.debug_struct("StartStateIter").field("i", &self.i).finish() + } +} + +/// An iterator over all states in a sparse DFA. +/// +/// This iterator yields tuples, where the first element is the state ID and +/// the second element is the state itself. +struct StateIter<'a, T> { + trans: &'a Transitions<T>, + id: usize, +} + +impl<'a, T: AsRef<[u8]>> Iterator for StateIter<'a, T> { + type Item = State<'a>; + + fn next(&mut self) -> Option<State<'a>> { + if self.id >= self.trans.sparse().len() { + return None; + } + let state = self.trans.state(StateID::new_unchecked(self.id)); + self.id = self.id + state.write_to_len(); + Some(state) + } +} + +impl<'a, T> fmt::Debug for StateIter<'a, T> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + f.debug_struct("StateIter").field("id", &self.id).finish() + } +} + +/// A representation of a sparse DFA state that can be cheaply materialized +/// from a state identifier. +#[derive(Clone)] +struct State<'a> { + /// The identifier of this state. + id: StateID, + /// Whether this is a match state or not. + is_match: bool, + /// The number of transitions in this state. + ntrans: usize, + /// Pairs of input ranges, where there is one pair for each transition. + /// Each pair specifies an inclusive start and end byte range for the + /// corresponding transition. + input_ranges: &'a [u8], + /// Transitions to the next state. This slice contains native endian + /// encoded state identifiers, with `S` as the representation. Thus, there + /// are `ntrans * size_of::<S>()` bytes in this slice. + next: &'a [u8], + /// If this is a match state, then this contains the pattern IDs that match + /// when the DFA is in this state. + /// + /// This is a contiguous sequence of 32-bit native endian encoded integers. + pattern_ids: &'a [u8], + /// An accelerator for this state, if present. If this state has no + /// accelerator, then this is an empty slice. When non-empty, this slice + /// has length at most 3 and corresponds to the exhaustive set of bytes + /// that must be seen in order to transition out of this state. + accel: &'a [u8], +} + +impl<'a> State<'a> { + /// Searches for the next transition given an input byte. If no such + /// transition could be found, then a dead state is returned. + /// + /// This is marked as inline to help dramatically boost sparse searching, + /// which decodes each state it enters to follow the next transition. + #[cfg_attr(feature = "perf-inline", inline(always))] + fn next(&self, input: u8) -> StateID { + // This straight linear search was observed to be much better than + // binary search on ASCII haystacks, likely because a binary search + // visits the ASCII case last but a linear search sees it first. A + // binary search does do a little better on non-ASCII haystacks, but + // not by much. There might be a better trade off lurking here. + for i in 0..(self.ntrans - 1) { + let (start, end) = self.range(i); + if start <= input && input <= end { + return self.next_at(i); + } + // We could bail early with an extra branch: if input < b1, then + // we know we'll never find a matching transition. Interestingly, + // this extra branch seems to not help performance, or will even + // hurt it. It's likely very dependent on the DFA itself and what + // is being searched. + } + DEAD + } + + /// Returns the next state ID for the special EOI transition. + fn next_eoi(&self) -> StateID { + self.next_at(self.ntrans - 1) + } + + /// Returns the identifier for this state. + fn id(&self) -> StateID { + self.id + } + + /// Returns the inclusive input byte range for the ith transition in this + /// state. + fn range(&self, i: usize) -> (u8, u8) { + (self.input_ranges[i * 2], self.input_ranges[i * 2 + 1]) + } + + /// Returns the next state for the ith transition in this state. + fn next_at(&self, i: usize) -> StateID { + let start = i * StateID::SIZE; + let end = start + StateID::SIZE; + let bytes = self.next[start..end].try_into().unwrap(); + StateID::from_ne_bytes_unchecked(bytes) + } + + /// Returns the pattern ID for the given match index. If the match index + /// is invalid, then this panics. + fn pattern_id(&self, match_index: usize) -> PatternID { + let start = match_index * PatternID::SIZE; + wire::read_pattern_id_unchecked(&self.pattern_ids[start..]).0 + } + + /// Returns the total number of pattern IDs for this state. This is always + /// zero when `is_match` is false. + fn pattern_len(&self) -> usize { + assert_eq!(0, self.pattern_ids.len() % 4); + self.pattern_ids.len() / 4 + } + + /// Return an accelerator for this state. + fn accelerator(&self) -> &'a [u8] { + self.accel + } + + /// Write the raw representation of this state to the given buffer using + /// the given endianness. + fn write_to<E: Endian>( + &self, + mut dst: &mut [u8], + ) -> Result<usize, SerializeError> { + let nwrite = self.write_to_len(); + if dst.len() < nwrite { + return Err(SerializeError::buffer_too_small( + "sparse state transitions", + )); + } + + let ntrans = + if self.is_match { self.ntrans | (1 << 15) } else { self.ntrans }; + E::write_u16(u16::try_from(ntrans).unwrap(), dst); + dst = &mut dst[size_of::<u16>()..]; + + dst[..self.input_ranges.len()].copy_from_slice(self.input_ranges); + dst = &mut dst[self.input_ranges.len()..]; + + for i in 0..self.ntrans { + E::write_u32(self.next_at(i).as_u32(), dst); + dst = &mut dst[StateID::SIZE..]; + } + + if self.is_match { + E::write_u32(u32::try_from(self.pattern_len()).unwrap(), dst); + dst = &mut dst[size_of::<u32>()..]; + for i in 0..self.pattern_len() { + let pid = self.pattern_id(i); + E::write_u32(pid.as_u32(), dst); + dst = &mut dst[PatternID::SIZE..]; + } + } + + dst[0] = u8::try_from(self.accel.len()).unwrap(); + dst[1..][..self.accel.len()].copy_from_slice(self.accel); + + Ok(nwrite) + } + + /// Return the total number of bytes that this state consumes in its + /// encoded form. + fn write_to_len(&self) -> usize { + let mut len = 2 + + (self.ntrans * 2) + + (self.ntrans * StateID::SIZE) + + (1 + self.accel.len()); + if self.is_match { + len += size_of::<u32>() + self.pattern_ids.len(); + } + len + } +} + +impl<'a> fmt::Debug for State<'a> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let mut printed = false; + for i in 0..(self.ntrans - 1) { + let next = self.next_at(i); + if next == DEAD { + continue; + } + + if printed { + write!(f, ", ")?; + } + let (start, end) = self.range(i); + if start == end { + write!(f, "{:?} => {:?}", DebugByte(start), next.as_usize())?; + } else { + write!( + f, + "{:?}-{:?} => {:?}", + DebugByte(start), + DebugByte(end), + next.as_usize(), + )?; + } + printed = true; + } + let eoi = self.next_at(self.ntrans - 1); + if eoi != DEAD { + if printed { + write!(f, ", ")?; + } + write!(f, "EOI => {:?}", eoi.as_usize())?; + } + Ok(()) + } +} + +/// A representation of a mutable sparse DFA state that can be cheaply +/// materialized from a state identifier. +#[cfg(feature = "dfa-build")] +struct StateMut<'a> { + /// The identifier of this state. + id: StateID, + /// Whether this is a match state or not. + is_match: bool, + /// The number of transitions in this state. + ntrans: usize, + /// Pairs of input ranges, where there is one pair for each transition. + /// Each pair specifies an inclusive start and end byte range for the + /// corresponding transition. + input_ranges: &'a mut [u8], + /// Transitions to the next state. This slice contains native endian + /// encoded state identifiers, with `S` as the representation. Thus, there + /// are `ntrans * size_of::<S>()` bytes in this slice. + next: &'a mut [u8], + /// If this is a match state, then this contains the pattern IDs that match + /// when the DFA is in this state. + /// + /// This is a contiguous sequence of 32-bit native endian encoded integers. + pattern_ids: &'a [u8], + /// An accelerator for this state, if present. If this state has no + /// accelerator, then this is an empty slice. When non-empty, this slice + /// has length at most 3 and corresponds to the exhaustive set of bytes + /// that must be seen in order to transition out of this state. + accel: &'a mut [u8], +} + +#[cfg(feature = "dfa-build")] +impl<'a> StateMut<'a> { + /// Sets the ith transition to the given state. + fn set_next_at(&mut self, i: usize, next: StateID) { + let start = i * StateID::SIZE; + let end = start + StateID::SIZE; + wire::write_state_id::<wire::NE>(next, &mut self.next[start..end]); + } +} + +#[cfg(feature = "dfa-build")] +impl<'a> fmt::Debug for StateMut<'a> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let state = State { + id: self.id, + is_match: self.is_match, + ntrans: self.ntrans, + input_ranges: self.input_ranges, + next: self.next, + pattern_ids: self.pattern_ids, + accel: self.accel, + }; + fmt::Debug::fmt(&state, f) + } +} + +/* +/// A binary search routine specialized specifically to a sparse DFA state's +/// transitions. Specifically, the transitions are defined as a set of pairs +/// of input bytes that delineate an inclusive range of bytes. If the input +/// byte is in the range, then the corresponding transition is a match. +/// +/// This binary search accepts a slice of these pairs and returns the position +/// of the matching pair (the ith transition), or None if no matching pair +/// could be found. +/// +/// Note that this routine is not currently used since it was observed to +/// either decrease performance when searching ASCII, or did not provide enough +/// of a boost on non-ASCII haystacks to be worth it. However, we leave it here +/// for posterity in case we can find a way to use it. +/// +/// In theory, we could use the standard library's search routine if we could +/// cast a `&[u8]` to a `&[(u8, u8)]`, but I don't believe this is currently +/// guaranteed to be safe and is thus UB (since I don't think the in-memory +/// representation of `(u8, u8)` has been nailed down). One could define a +/// repr(C) type, but the casting doesn't seem justified. +#[cfg_attr(feature = "perf-inline", inline(always))] +fn binary_search_ranges(ranges: &[u8], needle: u8) -> Option<usize> { + debug_assert!(ranges.len() % 2 == 0, "ranges must have even length"); + debug_assert!(ranges.len() <= 512, "ranges should be short"); + + let (mut left, mut right) = (0, ranges.len() / 2); + while left < right { + let mid = (left + right) / 2; + let (b1, b2) = (ranges[mid * 2], ranges[mid * 2 + 1]); + if needle < b1 { + right = mid; + } else if needle > b2 { + left = mid + 1; + } else { + return Some(mid); + } + } + None +} +*/ + +#[cfg(all(test, feature = "syntax", feature = "dfa-build"))] +mod tests { + use crate::{ + dfa::{dense::DFA, Automaton}, + nfa::thompson, + Input, MatchError, + }; + + // See the analogous test in src/hybrid/dfa.rs and src/dfa/dense.rs. + #[test] + fn heuristic_unicode_forward() { + let dfa = DFA::builder() + .configure(DFA::config().unicode_word_boundary(true)) + .thompson(thompson::Config::new().reverse(true)) + .build(r"\b[0-9]+\b") + .unwrap() + .to_sparse() + .unwrap(); + + let input = Input::new("β123").range(2..); + let expected = MatchError::quit(0xB2, 1); + let got = dfa.try_search_fwd(&input); + assert_eq!(Err(expected), got); + + let input = Input::new("123β").range(..3); + let expected = MatchError::quit(0xCE, 3); + let got = dfa.try_search_fwd(&input); + assert_eq!(Err(expected), got); + } + + // See the analogous test in src/hybrid/dfa.rs and src/dfa/dense.rs. + #[test] + fn heuristic_unicode_reverse() { + let dfa = DFA::builder() + .configure(DFA::config().unicode_word_boundary(true)) + .thompson(thompson::Config::new().reverse(true)) + .build(r"\b[0-9]+\b") + .unwrap() + .to_sparse() + .unwrap(); + + let input = Input::new("β123").range(2..); + let expected = MatchError::quit(0xB2, 1); + let got = dfa.try_search_rev(&input); + assert_eq!(Err(expected), got); + + let input = Input::new("123β").range(..3); + let expected = MatchError::quit(0xCE, 3); + let got = dfa.try_search_rev(&input); + assert_eq!(Err(expected), got); + } +} diff --git a/third_party/rust/regex-automata/src/dfa/special.rs b/third_party/rust/regex-automata/src/dfa/special.rs new file mode 100644 index 0000000000..a831df5c55 --- /dev/null +++ b/third_party/rust/regex-automata/src/dfa/special.rs @@ -0,0 +1,494 @@ +use crate::{ + dfa::DEAD, + util::{ + primitives::StateID, + wire::{self, DeserializeError, Endian, SerializeError}, + }, +}; + +macro_rules! err { + ($msg:expr) => { + return Err(DeserializeError::generic($msg)); + }; +} + +// Special represents the identifiers in a DFA that correspond to "special" +// states. If a state is one or more of the following, then it is considered +// special: +// +// * dead - A non-matching state where all outgoing transitions lead back to +// itself. There is only one of these, regardless of whether minimization +// has run. The dead state always has an ID of 0. i.e., It is always the +// first state in a DFA. +// * quit - A state that is entered whenever a byte is seen that should cause +// a DFA to give up and stop searching. This results in a MatchError::quit +// error being returned at search time. The default configuration for a DFA +// has no quit bytes, which means this state is unreachable by default, +// although it is always present for reasons of implementation simplicity. +// This state is only reachable when the caller configures the DFA to quit +// on certain bytes. There is always exactly one of these states and it +// is always the second state. (Its actual ID depends on the size of the +// alphabet in dense DFAs, since state IDs are premultiplied in order to +// allow them to be used directly as indices into the transition table.) +// * match - An accepting state, i.e., indicative of a match. There may be +// zero or more of these states. +// * accelerated - A state where all of its outgoing transitions, except a +// few, loop back to itself. These states are candidates for acceleration +// via memchr during search. There may be zero or more of these states. +// * start - A non-matching state that indicates where the automaton should +// start during a search. There is always at least one starting state and +// all are guaranteed to be non-match states. (A start state cannot be a +// match state because the DFAs in this crate delay all matches by one byte. +// So every search that finds a match must move through one transition to +// some other match state, even when searching an empty string.) +// +// These are not mutually exclusive categories. Namely, the following +// overlappings can occur: +// +// * {dead, start} - If a DFA can never lead to a match and it is minimized, +// then it will typically compile to something where all starting IDs point +// to the DFA's dead state. +// * {match, accelerated} - It is possible for a match state to have the +// majority of its transitions loop back to itself, which means it's +// possible for a match state to be accelerated. +// * {start, accelerated} - Similarly, it is possible for a start state to be +// accelerated. Note that it is possible for an accelerated state to be +// neither a match or a start state. Also note that just because both match +// and start states overlap with accelerated states does not mean that +// match and start states overlap with each other. In fact, they are +// guaranteed not to overlap. +// +// As a special mention, every DFA always has a dead and a quit state, even +// though from the perspective of the DFA, they are equivalent. (Indeed, +// minimization special cases them to ensure they don't get merged.) The +// purpose of keeping them distinct is to use the quit state as a sentinel to +// distguish between whether a search finished successfully without finding +// anything or whether it gave up before finishing. +// +// So the main problem we want to solve here is the *fast* detection of whether +// a state is special or not. And we also want to do this while storing as +// little extra data as possible. AND we want to be able to quickly determine +// which categories a state falls into above if it is special. +// +// We achieve this by essentially shuffling all special states to the beginning +// of a DFA. That is, all special states appear before every other non-special +// state. By representing special states this way, we can determine whether a +// state is special or not by a single comparison, where special.max is the +// identifier of the last special state in the DFA: +// +// if current_state <= special.max: +// ... do something with special state +// +// The only thing left to do is to determine what kind of special state +// it is. Because what we do next depends on that. Since special states +// are typically rare, we can afford to do a bit more extra work, but we'd +// still like this to be as fast as possible. The trick we employ here is to +// continue shuffling states even within the special state range. Such that +// one contiguous region corresponds to match states, another for start states +// and then an overlapping range for accelerated states. At a high level, our +// special state detection might look like this (for leftmost searching, where +// we continue searching even after seeing a match): +// +// byte = input[offset] +// current_state = next_state(current_state, byte) +// offset += 1 +// if current_state <= special.max: +// if current_state == 0: +// # We can never leave a dead state, so this always marks the +// # end of our search. +// return last_match +// if current_state == special.quit_id: +// # A quit state means we give up. If he DFA has no quit state, +// # then special.quit_id == 0 == dead, which is handled by the +// # conditional above. +// return Err(MatchError::quit { byte, offset: offset - 1 }) +// if special.min_match <= current_state <= special.max_match: +// last_match = Some(offset) +// if special.min_accel <= current_state <= special.max_accel: +// offset = accelerate(input, offset) +// last_match = Some(offset) +// elif special.min_start <= current_state <= special.max_start: +// offset = prefilter.find(input, offset) +// if special.min_accel <= current_state <= special.max_accel: +// offset = accelerate(input, offset) +// elif special.min_accel <= current_state <= special.max_accel: +// offset = accelerate(input, offset) +// +// There are some small details left out of the logic above. For example, +// in order to accelerate a state, we need to know which bytes to search for. +// This in turn implies some extra data we need to store in the DFA. To keep +// things compact, we would ideally only store +// +// N = special.max_accel - special.min_accel + 1 +// +// items. But state IDs are premultiplied, which means they are not contiguous. +// So in order to take a state ID and index an array of accelerated structures, +// we need to do: +// +// i = (state_id - special.min_accel) / stride +// +// (N.B. 'stride' is always a power of 2, so the above can be implemented via +// '(state_id - special.min_accel) >> stride2', where 'stride2' is x in +// 2^x=stride.) +// +// Moreover, some of these specialty categories may be empty. For example, +// DFAs are not required to have any match states or any accelerated states. +// In that case, the lower and upper bounds are both set to 0 (the dead state +// ID) and the first `current_state == 0` check subsumes cases where the +// ranges are empty. +// +// Loop unrolling, if applicable, has also been left out of the logic above. +// +// Graphically, the ranges look like this, where asterisks indicate ranges +// that can be empty. Each 'x' is a state. +// +// quit +// dead| +// || +// xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx +// | | | | start | | +// | |-------------| |-------| | +// | match* | | | | +// | | | | | +// | |----------| | | +// | accel* | | +// | | | +// | | | +// |----------------------------|------------------------ +// special non-special* +#[derive(Clone, Copy, Debug)] +pub(crate) struct Special { + /// The identifier of the last special state in a DFA. A state is special + /// if and only if its identifier is less than or equal to `max`. + pub(crate) max: StateID, + /// The identifier of the quit state in a DFA. (There is no analogous field + /// for the dead state since the dead state's ID is always zero, regardless + /// of state ID size.) + pub(crate) quit_id: StateID, + /// The identifier of the first match state. + pub(crate) min_match: StateID, + /// The identifier of the last match state. + pub(crate) max_match: StateID, + /// The identifier of the first accelerated state. + pub(crate) min_accel: StateID, + /// The identifier of the last accelerated state. + pub(crate) max_accel: StateID, + /// The identifier of the first start state. + pub(crate) min_start: StateID, + /// The identifier of the last start state. + pub(crate) max_start: StateID, +} + +impl Special { + /// Creates a new set of special ranges for a DFA. All ranges are initially + /// set to only contain the dead state. This is interpreted as an empty + /// range. + #[cfg(feature = "dfa-build")] + pub(crate) fn new() -> Special { + Special { + max: DEAD, + quit_id: DEAD, + min_match: DEAD, + max_match: DEAD, + min_accel: DEAD, + max_accel: DEAD, + min_start: DEAD, + max_start: DEAD, + } + } + + /// Remaps all of the special state identifiers using the function given. + #[cfg(feature = "dfa-build")] + pub(crate) fn remap(&self, map: impl Fn(StateID) -> StateID) -> Special { + Special { + max: map(self.max), + quit_id: map(self.quit_id), + min_match: map(self.min_match), + max_match: map(self.max_match), + min_accel: map(self.min_accel), + max_accel: map(self.max_accel), + min_start: map(self.min_start), + max_start: map(self.max_start), + } + } + + /// Deserialize the given bytes into special state ranges. If the slice + /// given is not big enough, then this returns an error. Similarly, if + /// any of the expected invariants around special state ranges aren't + /// upheld, an error is returned. Note that this does not guarantee that + /// the information returned is correct. + /// + /// Upon success, this returns the number of bytes read in addition to the + /// special state IDs themselves. + pub(crate) fn from_bytes( + mut slice: &[u8], + ) -> Result<(Special, usize), DeserializeError> { + wire::check_slice_len(slice, 8 * StateID::SIZE, "special states")?; + + let mut nread = 0; + let mut read_id = |what| -> Result<StateID, DeserializeError> { + let (id, nr) = wire::try_read_state_id(slice, what)?; + nread += nr; + slice = &slice[StateID::SIZE..]; + Ok(id) + }; + + let max = read_id("special max id")?; + let quit_id = read_id("special quit id")?; + let min_match = read_id("special min match id")?; + let max_match = read_id("special max match id")?; + let min_accel = read_id("special min accel id")?; + let max_accel = read_id("special max accel id")?; + let min_start = read_id("special min start id")?; + let max_start = read_id("special max start id")?; + + let special = Special { + max, + quit_id, + min_match, + max_match, + min_accel, + max_accel, + min_start, + max_start, + }; + special.validate()?; + assert_eq!(nread, special.write_to_len()); + Ok((special, nread)) + } + + /// Validate that the information describing special states satisfies + /// all known invariants. + pub(crate) fn validate(&self) -> Result<(), DeserializeError> { + // Check that both ends of the range are DEAD or neither are. + if self.min_match == DEAD && self.max_match != DEAD { + err!("min_match is DEAD, but max_match is not"); + } + if self.min_match != DEAD && self.max_match == DEAD { + err!("max_match is DEAD, but min_match is not"); + } + if self.min_accel == DEAD && self.max_accel != DEAD { + err!("min_accel is DEAD, but max_accel is not"); + } + if self.min_accel != DEAD && self.max_accel == DEAD { + err!("max_accel is DEAD, but min_accel is not"); + } + if self.min_start == DEAD && self.max_start != DEAD { + err!("min_start is DEAD, but max_start is not"); + } + if self.min_start != DEAD && self.max_start == DEAD { + err!("max_start is DEAD, but min_start is not"); + } + + // Check that ranges are well formed. + if self.min_match > self.max_match { + err!("min_match should not be greater than max_match"); + } + if self.min_accel > self.max_accel { + err!("min_accel should not be greater than max_accel"); + } + if self.min_start > self.max_start { + err!("min_start should not be greater than max_start"); + } + + // Check that ranges are ordered with respect to one another. + if self.matches() && self.quit_id >= self.min_match { + err!("quit_id should not be greater than min_match"); + } + if self.accels() && self.quit_id >= self.min_accel { + err!("quit_id should not be greater than min_accel"); + } + if self.starts() && self.quit_id >= self.min_start { + err!("quit_id should not be greater than min_start"); + } + if self.matches() && self.accels() && self.min_accel < self.min_match { + err!("min_match should not be greater than min_accel"); + } + if self.matches() && self.starts() && self.min_start < self.min_match { + err!("min_match should not be greater than min_start"); + } + if self.accels() && self.starts() && self.min_start < self.min_accel { + err!("min_accel should not be greater than min_start"); + } + + // Check that max is at least as big as everything else. + if self.max < self.quit_id { + err!("quit_id should not be greater than max"); + } + if self.max < self.max_match { + err!("max_match should not be greater than max"); + } + if self.max < self.max_accel { + err!("max_accel should not be greater than max"); + } + if self.max < self.max_start { + err!("max_start should not be greater than max"); + } + + Ok(()) + } + + /// Validate that the special state information is compatible with the + /// given state len. + pub(crate) fn validate_state_len( + &self, + len: usize, + stride2: usize, + ) -> Result<(), DeserializeError> { + // We assume that 'validate' has already passed, so we know that 'max' + // is truly the max. So all we need to check is that the max state ID + // is less than the state ID len. The max legal value here is len-1, + // which occurs when there are no non-special states. + if (self.max.as_usize() >> stride2) >= len { + err!("max should not be greater than or equal to state length"); + } + Ok(()) + } + + /// Write the IDs and ranges for special states to the given byte buffer. + /// The buffer given must have enough room to store all data, otherwise + /// this will return an error. The number of bytes written is returned + /// on success. The number of bytes written is guaranteed to be a multiple + /// of 8. + pub(crate) fn write_to<E: Endian>( + &self, + dst: &mut [u8], + ) -> Result<usize, SerializeError> { + use crate::util::wire::write_state_id as write; + + if dst.len() < self.write_to_len() { + return Err(SerializeError::buffer_too_small("special state ids")); + } + + let mut nwrite = 0; + nwrite += write::<E>(self.max, &mut dst[nwrite..]); + nwrite += write::<E>(self.quit_id, &mut dst[nwrite..]); + nwrite += write::<E>(self.min_match, &mut dst[nwrite..]); + nwrite += write::<E>(self.max_match, &mut dst[nwrite..]); + nwrite += write::<E>(self.min_accel, &mut dst[nwrite..]); + nwrite += write::<E>(self.max_accel, &mut dst[nwrite..]); + nwrite += write::<E>(self.min_start, &mut dst[nwrite..]); + nwrite += write::<E>(self.max_start, &mut dst[nwrite..]); + + assert_eq!( + self.write_to_len(), + nwrite, + "expected to write certain number of bytes", + ); + assert_eq!( + nwrite % 8, + 0, + "expected to write multiple of 8 bytes for special states", + ); + Ok(nwrite) + } + + /// Returns the total number of bytes written by `write_to`. + pub(crate) fn write_to_len(&self) -> usize { + 8 * StateID::SIZE + } + + /// Sets the maximum special state ID based on the current values. This + /// should be used once all possible state IDs are set. + #[cfg(feature = "dfa-build")] + pub(crate) fn set_max(&mut self) { + use core::cmp::max; + self.max = max( + self.quit_id, + max(self.max_match, max(self.max_accel, self.max_start)), + ); + } + + /// Sets the maximum special state ID such that starting states are not + /// considered "special." This also marks the min/max starting states as + /// DEAD such that 'is_start_state' always returns false, even if the state + /// is actually a starting state. + /// + /// This is useful when there is no prefilter set. It will avoid + /// ping-ponging between the hot path in the DFA search code and the start + /// state handling code, which is typically only useful for executing a + /// prefilter. + #[cfg(feature = "dfa-build")] + pub(crate) fn set_no_special_start_states(&mut self) { + use core::cmp::max; + self.max = max(self.quit_id, max(self.max_match, self.max_accel)); + self.min_start = DEAD; + self.max_start = DEAD; + } + + /// Returns true if and only if the given state ID is a special state. + #[inline] + pub(crate) fn is_special_state(&self, id: StateID) -> bool { + id <= self.max + } + + /// Returns true if and only if the given state ID is a dead state. + #[inline] + pub(crate) fn is_dead_state(&self, id: StateID) -> bool { + id == DEAD + } + + /// Returns true if and only if the given state ID is a quit state. + #[inline] + pub(crate) fn is_quit_state(&self, id: StateID) -> bool { + !self.is_dead_state(id) && self.quit_id == id + } + + /// Returns true if and only if the given state ID is a match state. + #[inline] + pub(crate) fn is_match_state(&self, id: StateID) -> bool { + !self.is_dead_state(id) && self.min_match <= id && id <= self.max_match + } + + /// Returns true if and only if the given state ID is an accel state. + #[inline] + pub(crate) fn is_accel_state(&self, id: StateID) -> bool { + !self.is_dead_state(id) && self.min_accel <= id && id <= self.max_accel + } + + /// Returns true if and only if the given state ID is a start state. + #[inline] + pub(crate) fn is_start_state(&self, id: StateID) -> bool { + !self.is_dead_state(id) && self.min_start <= id && id <= self.max_start + } + + /// Returns the total number of match states for a dense table based DFA. + #[inline] + pub(crate) fn match_len(&self, stride: usize) -> usize { + if self.matches() { + (self.max_match.as_usize() - self.min_match.as_usize() + stride) + / stride + } else { + 0 + } + } + + /// Returns true if and only if there is at least one match state. + #[inline] + pub(crate) fn matches(&self) -> bool { + self.min_match != DEAD + } + + /// Returns the total number of accel states. + #[cfg(feature = "dfa-build")] + pub(crate) fn accel_len(&self, stride: usize) -> usize { + if self.accels() { + (self.max_accel.as_usize() - self.min_accel.as_usize() + stride) + / stride + } else { + 0 + } + } + + /// Returns true if and only if there is at least one accel state. + #[inline] + pub(crate) fn accels(&self) -> bool { + self.min_accel != DEAD + } + + /// Returns true if and only if there is at least one start state. + #[inline] + pub(crate) fn starts(&self) -> bool { + self.min_start != DEAD + } +} diff --git a/third_party/rust/regex-automata/src/dfa/start.rs b/third_party/rust/regex-automata/src/dfa/start.rs new file mode 100644 index 0000000000..fddc702df5 --- /dev/null +++ b/third_party/rust/regex-automata/src/dfa/start.rs @@ -0,0 +1,74 @@ +use core::mem::size_of; + +use crate::util::wire::{self, DeserializeError, Endian, SerializeError}; + +/// The kind of anchored starting configurations to support in a DFA. +/// +/// Fully compiled DFAs need to be explicitly configured as to which anchored +/// starting configurations to support. The reason for not just supporting +/// everything unconditionally is that it can use more resources (such as +/// memory and build time). The downside of this is that if you try to execute +/// a search using an [`Anchored`](crate::Anchored) mode that is not supported +/// by the DFA, then the search will return an error. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum StartKind { + /// Support both anchored and unanchored searches. + Both, + /// Support only unanchored searches. Requesting an anchored search will + /// panic. + /// + /// Note that even if an unanchored search is requested, the pattern itself + /// may still be anchored. For example, `^abc` will only match `abc` at the + /// start of a haystack. This will remain true, even if the regex engine + /// only supported unanchored searches. + Unanchored, + /// Support only anchored searches. Requesting an unanchored search will + /// panic. + Anchored, +} + +impl StartKind { + pub(crate) fn from_bytes( + slice: &[u8], + ) -> Result<(StartKind, usize), DeserializeError> { + wire::check_slice_len(slice, size_of::<u32>(), "start kind bytes")?; + let (n, nr) = wire::try_read_u32(slice, "start kind integer")?; + match n { + 0 => Ok((StartKind::Both, nr)), + 1 => Ok((StartKind::Unanchored, nr)), + 2 => Ok((StartKind::Anchored, nr)), + _ => Err(DeserializeError::generic("unrecognized start kind")), + } + } + + pub(crate) fn write_to<E: Endian>( + &self, + dst: &mut [u8], + ) -> Result<usize, SerializeError> { + let nwrite = self.write_to_len(); + if dst.len() < nwrite { + return Err(SerializeError::buffer_too_small("start kind")); + } + let n = match *self { + StartKind::Both => 0, + StartKind::Unanchored => 1, + StartKind::Anchored => 2, + }; + E::write_u32(n, dst); + Ok(nwrite) + } + + pub(crate) fn write_to_len(&self) -> usize { + size_of::<u32>() + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn has_unanchored(&self) -> bool { + matches!(*self, StartKind::Both | StartKind::Unanchored) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn has_anchored(&self) -> bool { + matches!(*self, StartKind::Both | StartKind::Anchored) + } +} diff --git a/third_party/rust/regex-automata/src/hybrid/dfa.rs b/third_party/rust/regex-automata/src/hybrid/dfa.rs new file mode 100644 index 0000000000..67261c1a30 --- /dev/null +++ b/third_party/rust/regex-automata/src/hybrid/dfa.rs @@ -0,0 +1,4381 @@ +/*! +Types and routines specific to lazy DFAs. + +This module is the home of [`hybrid::dfa::DFA`](DFA). + +This module also contains a [`hybrid::dfa::Builder`](Builder) and a +[`hybrid::dfa::Config`](Config) for configuring and building a lazy DFA. +*/ + +use core::{iter, mem::size_of}; + +use alloc::vec::Vec; + +use crate::{ + hybrid::{ + error::{BuildError, CacheError}, + id::{LazyStateID, LazyStateIDError}, + search, + }, + nfa::thompson, + util::{ + alphabet::{self, ByteClasses, ByteSet}, + determinize::{self, State, StateBuilderEmpty, StateBuilderNFA}, + empty, + prefilter::Prefilter, + primitives::{PatternID, StateID as NFAStateID}, + search::{ + Anchored, HalfMatch, Input, MatchError, MatchKind, PatternSet, + }, + sparse_set::SparseSets, + start::{Start, StartByteMap}, + }, +}; + +/// The minimum number of states that a lazy DFA's cache size must support. +/// +/// This is checked at time of construction to ensure that at least some small +/// number of states can fit in the given capacity allotment. If we can't fit +/// at least this number of states, then the thinking is that it's pretty +/// senseless to use the lazy DFA. More to the point, parts of the code do +/// assume that the cache can fit at least some small number of states. +const MIN_STATES: usize = SENTINEL_STATES + 2; + +/// The number of "sentinel" states that get added to every lazy DFA. +/// +/// These are special states indicating status conditions of a search: unknown, +/// dead and quit. These states in particular also use zero NFA states, so +/// their memory usage is quite small. This is relevant for computing the +/// minimum memory needed for a lazy DFA cache. +const SENTINEL_STATES: usize = 3; + +/// A hybrid NFA/DFA (also called a "lazy DFA") for regex searching. +/// +/// A lazy DFA is a DFA that builds itself at search time. It otherwise has +/// very similar characteristics as a [`dense::DFA`](crate::dfa::dense::DFA). +/// Indeed, both support precisely the same regex features with precisely the +/// same semantics. +/// +/// Where as a `dense::DFA` must be completely built to handle any input before +/// it may be used for search, a lazy DFA starts off effectively empty. During +/// a search, a lazy DFA will build itself depending on whether it has already +/// computed the next transition or not. If it has, then it looks a lot like +/// a `dense::DFA` internally: it does a very fast table based access to find +/// the next transition. Otherwise, if the state hasn't been computed, then it +/// does determinization _for that specific transition_ to compute the next DFA +/// state. +/// +/// The main selling point of a lazy DFA is that, in practice, it has +/// the performance profile of a `dense::DFA` without the weakness of it +/// taking worst case exponential time to build. Indeed, for each byte of +/// input, the lazy DFA will construct as most one new DFA state. Thus, a +/// lazy DFA achieves worst case `O(mn)` time for regex search (where `m ~ +/// pattern.len()` and `n ~ haystack.len()`). +/// +/// The main downsides of a lazy DFA are: +/// +/// 1. It requires mutable "cache" space during search. This is where the +/// transition table, among other things, is stored. +/// 2. In pathological cases (e.g., if the cache is too small), it will run +/// out of room and either require a bigger cache capacity or will repeatedly +/// clear the cache and thus repeatedly regenerate DFA states. Overall, this +/// will tend to be slower than a typical NFA simulation. +/// +/// # Capabilities +/// +/// Like a `dense::DFA`, a single lazy DFA fundamentally supports the following +/// operations: +/// +/// 1. Detection of a match. +/// 2. Location of the end of a match. +/// 3. In the case of a lazy DFA with multiple patterns, which pattern matched +/// is reported as well. +/// +/// A notable absence from the above list of capabilities is the location of +/// the *start* of a match. In order to provide both the start and end of +/// a match, *two* lazy DFAs are required. This functionality is provided by a +/// [`Regex`](crate::hybrid::regex::Regex). +/// +/// # Example +/// +/// This shows how to build a lazy DFA with the default configuration and +/// execute a search. Notice how, in contrast to a `dense::DFA`, we must create +/// a cache and pass it to our search routine. +/// +/// ``` +/// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input}; +/// +/// let dfa = DFA::new("foo[0-9]+")?; +/// let mut cache = dfa.create_cache(); +/// +/// let expected = Some(HalfMatch::must(0, 8)); +/// assert_eq!(expected, dfa.try_search_fwd( +/// &mut cache, &Input::new("foo12345"))?, +/// ); +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +#[derive(Clone, Debug)] +pub struct DFA { + config: Config, + nfa: thompson::NFA, + stride2: usize, + start_map: StartByteMap, + classes: ByteClasses, + quitset: ByteSet, + cache_capacity: usize, +} + +impl DFA { + /// Parse the given regular expression using a default configuration and + /// return the corresponding lazy DFA. + /// + /// If you want a non-default configuration, then use the [`Builder`] to + /// set your own configuration. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input}; + /// + /// let dfa = DFA::new("foo[0-9]+bar")?; + /// let mut cache = dfa.create_cache(); + /// + /// let expected = HalfMatch::must(0, 11); + /// assert_eq!( + /// Some(expected), + /// dfa.try_search_fwd(&mut cache, &Input::new("foo12345bar"))?, + /// ); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[cfg(feature = "syntax")] + pub fn new(pattern: &str) -> Result<DFA, BuildError> { + DFA::builder().build(pattern) + } + + /// Parse the given regular expressions using a default configuration and + /// return the corresponding lazy multi-DFA. + /// + /// If you want a non-default configuration, then use the [`Builder`] to + /// set your own configuration. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input}; + /// + /// let dfa = DFA::new_many(&["[0-9]+", "[a-z]+"])?; + /// let mut cache = dfa.create_cache(); + /// + /// let expected = HalfMatch::must(1, 3); + /// assert_eq!( + /// Some(expected), + /// dfa.try_search_fwd(&mut cache, &Input::new("foo12345bar"))?, + /// ); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[cfg(feature = "syntax")] + pub fn new_many<P: AsRef<str>>(patterns: &[P]) -> Result<DFA, BuildError> { + DFA::builder().build_many(patterns) + } + + /// Create a new lazy DFA that matches every input. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input}; + /// + /// let dfa = DFA::always_match()?; + /// let mut cache = dfa.create_cache(); + /// + /// let expected = HalfMatch::must(0, 0); + /// assert_eq!(Some(expected), dfa.try_search_fwd( + /// &mut cache, &Input::new(""))?, + /// ); + /// assert_eq!(Some(expected), dfa.try_search_fwd( + /// &mut cache, &Input::new("foo"))?, + /// ); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn always_match() -> Result<DFA, BuildError> { + let nfa = thompson::NFA::always_match(); + Builder::new().build_from_nfa(nfa) + } + + /// Create a new lazy DFA that never matches any input. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{hybrid::dfa::DFA, Input}; + /// + /// let dfa = DFA::never_match()?; + /// let mut cache = dfa.create_cache(); + /// + /// assert_eq!(None, dfa.try_search_fwd(&mut cache, &Input::new(""))?); + /// assert_eq!(None, dfa.try_search_fwd(&mut cache, &Input::new("foo"))?); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn never_match() -> Result<DFA, BuildError> { + let nfa = thompson::NFA::never_match(); + Builder::new().build_from_nfa(nfa) + } + + /// Return a default configuration for a `DFA`. + /// + /// This is a convenience routine to avoid needing to import the [`Config`] + /// type when customizing the construction of a lazy DFA. + /// + /// # Example + /// + /// This example shows how to build a lazy DFA that heuristically supports + /// Unicode word boundaries. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, MatchError, Input}; + /// + /// let re = DFA::builder() + /// .configure(DFA::config().unicode_word_boundary(true)) + /// .build(r"\b\w+\b")?; + /// let mut cache = re.create_cache(); + /// + /// // Since our haystack is all ASCII, the DFA search sees then and knows + /// // it is legal to interpret Unicode word boundaries as ASCII word + /// // boundaries. + /// let input = Input::new("!!foo!!"); + /// let expected = HalfMatch::must(0, 5); + /// assert_eq!(Some(expected), re.try_search_fwd(&mut cache, &input)?); + /// + /// // But if our haystack contains non-ASCII, then the search will fail + /// // with an error. + /// let input = Input::new("!!βββ!!"); + /// let expected = MatchError::quit(b'\xCE', 2); + /// assert_eq!(Err(expected), re.try_search_fwd(&mut cache, &input)); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn config() -> Config { + Config::new() + } + + /// Return a builder for configuring the construction of a `Regex`. + /// + /// This is a convenience routine to avoid needing to import the + /// [`Builder`] type in common cases. + /// + /// # Example + /// + /// This example shows how to use the builder to disable UTF-8 mode + /// everywhere for lazy DFAs. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{hybrid::dfa::DFA, util::syntax, HalfMatch, Input}; + /// + /// let re = DFA::builder() + /// .syntax(syntax::Config::new().utf8(false)) + /// .build(r"foo(?-u:[^b])ar.*")?; + /// let mut cache = re.create_cache(); + /// + /// let input = Input::new(b"\xFEfoo\xFFarzz\xE2\x98\xFF\n"); + /// let expected = Some(HalfMatch::must(0, 9)); + /// let got = re.try_search_fwd(&mut cache, &input)?; + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn builder() -> Builder { + Builder::new() + } + + /// Create a new cache for this lazy DFA. + /// + /// The cache returned should only be used for searches for this + /// lazy DFA. If you want to reuse the cache for another DFA, then + /// you must call [`Cache::reset`] with that DFA (or, equivalently, + /// [`DFA::reset_cache`]). + pub fn create_cache(&self) -> Cache { + Cache::new(self) + } + + /// Reset the given cache such that it can be used for searching with the + /// this lazy DFA (and only this DFA). + /// + /// A cache reset permits reusing memory already allocated in this cache + /// with a different lazy DFA. + /// + /// Resetting a cache sets its "clear count" to 0. This is relevant if the + /// lazy DFA has been configured to "give up" after it has cleared the + /// cache a certain number of times. + /// + /// Any lazy state ID generated by the cache prior to resetting it is + /// invalid after the reset. + /// + /// # Example + /// + /// This shows how to re-purpose a cache for use with a different DFA. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input}; + /// + /// let dfa1 = DFA::new(r"\w")?; + /// let dfa2 = DFA::new(r"\W")?; + /// + /// let mut cache = dfa1.create_cache(); + /// assert_eq!( + /// Some(HalfMatch::must(0, 2)), + /// dfa1.try_search_fwd(&mut cache, &Input::new("Δ"))?, + /// ); + /// + /// // Using 'cache' with dfa2 is not allowed. It may result in panics or + /// // incorrect results. In order to re-purpose the cache, we must reset + /// // it with the DFA we'd like to use it with. + /// // + /// // Similarly, after this reset, using the cache with 'dfa1' is also not + /// // allowed. + /// dfa2.reset_cache(&mut cache); + /// assert_eq!( + /// Some(HalfMatch::must(0, 3)), + /// dfa2.try_search_fwd(&mut cache, &Input::new("☃"))?, + /// ); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn reset_cache(&self, cache: &mut Cache) { + Lazy::new(self, cache).reset_cache() + } + + /// Returns the total number of patterns compiled into this lazy DFA. + /// + /// In the case of a DFA that contains no patterns, this returns `0`. + /// + /// # Example + /// + /// This example shows the pattern length for a DFA that never matches: + /// + /// ``` + /// use regex_automata::hybrid::dfa::DFA; + /// + /// let dfa = DFA::never_match()?; + /// assert_eq!(dfa.pattern_len(), 0); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// And another example for a DFA that matches at every position: + /// + /// ``` + /// use regex_automata::hybrid::dfa::DFA; + /// + /// let dfa = DFA::always_match()?; + /// assert_eq!(dfa.pattern_len(), 1); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// And finally, a DFA that was constructed from multiple patterns: + /// + /// ``` + /// use regex_automata::hybrid::dfa::DFA; + /// + /// let dfa = DFA::new_many(&["[0-9]+", "[a-z]+", "[A-Z]+"])?; + /// assert_eq!(dfa.pattern_len(), 3); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn pattern_len(&self) -> usize { + self.nfa.pattern_len() + } + + /// Returns the equivalence classes that make up the alphabet for this DFA. + /// + /// Unless [`Config::byte_classes`] was disabled, it is possible that + /// multiple distinct bytes are grouped into the same equivalence class + /// if it is impossible for them to discriminate between a match and a + /// non-match. This has the effect of reducing the overall alphabet size + /// and in turn potentially substantially reducing the size of the DFA's + /// transition table. + /// + /// The downside of using equivalence classes like this is that every state + /// transition will automatically use this map to convert an arbitrary + /// byte to its corresponding equivalence class. In practice this has a + /// negligible impact on performance. + pub fn byte_classes(&self) -> &ByteClasses { + &self.classes + } + + /// Returns this lazy DFA's configuration. + pub fn get_config(&self) -> &Config { + &self.config + } + + /// Returns a reference to the underlying NFA. + pub fn get_nfa(&self) -> &thompson::NFA { + &self.nfa + } + + /// Returns the stride, as a base-2 exponent, required for these + /// equivalence classes. + /// + /// The stride is always the smallest power of 2 that is greater than or + /// equal to the alphabet length. This is done so that converting between + /// state IDs and indices can be done with shifts alone, which is much + /// faster than integer division. + fn stride2(&self) -> usize { + self.stride2 + } + + /// Returns the total stride for every state in this lazy DFA. This + /// corresponds to the total number of transitions used by each state in + /// this DFA's transition table. + fn stride(&self) -> usize { + 1 << self.stride2() + } + + /// Returns the memory usage, in bytes, of this lazy DFA. + /// + /// This does **not** include the stack size used up by this lazy DFA. To + /// compute that, use `std::mem::size_of::<DFA>()`. This also does not + /// include the size of the `Cache` used. + /// + /// This also does not include any heap memory used by the NFA inside of + /// this hybrid NFA/DFA. This is because the NFA's ownership is shared, and + /// thus not owned by this hybrid NFA/DFA. More practically, several regex + /// engines in this crate embed an NFA, and reporting the NFA's memory + /// usage in all of them would likely result in reporting higher heap + /// memory than is actually used. + pub fn memory_usage(&self) -> usize { + // The only thing that uses heap memory in a DFA is the NFA. But the + // NFA has shared ownership, so reporting its memory as part of the + // hybrid DFA is likely to lead to double-counting the NFA memory + // somehow. In particular, this DFA does not really own an NFA, so + // including it in the DFA's memory usage doesn't seem semantically + // correct. + 0 + } +} + +impl DFA { + /// Executes a forward search and returns the end position of the leftmost + /// match that is found. If no match exists, then `None` is returned. + /// + /// In particular, this method continues searching even after it enters + /// a match state. The search only terminates once it has reached the + /// end of the input or when it has entered a dead or quit state. Upon + /// termination, the position of the last byte seen while still in a match + /// state is returned. + /// + /// # Errors + /// + /// This routine errors if the search could not complete. This can occur + /// in a number of circumstances: + /// + /// * The configuration of the lazy DFA may permit it to "quit" the search. + /// For example, setting quit bytes or enabling heuristic support for + /// Unicode word boundaries. The default configuration does not enable any + /// option that could result in the lazy DFA quitting. + /// * The configuration of the lazy DFA may also permit it to "give up" + /// on a search if it makes ineffective use of its transition table + /// cache. The default configuration does not enable this by default, + /// although it is typically a good idea to. + /// * When the provided `Input` configuration is not supported. For + /// example, by providing an unsupported anchor mode. + /// + /// When a search returns an error, callers cannot know whether a match + /// exists or not. + /// + /// # Example + /// + /// This example shows how to run a basic search. + /// + /// ``` + /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input}; + /// + /// let dfa = DFA::new("foo[0-9]+")?; + /// let mut cache = dfa.create_cache(); + /// let expected = HalfMatch::must(0, 8); + /// assert_eq!(Some(expected), dfa.try_search_fwd( + /// &mut cache, &Input::new("foo12345"))?, + /// ); + /// + /// // Even though a match is found after reading the first byte (`a`), + /// // the leftmost first match semantics demand that we find the earliest + /// // match that prefers earlier parts of the pattern over later parts. + /// let dfa = DFA::new("abc|a")?; + /// let mut cache = dfa.create_cache(); + /// let expected = HalfMatch::must(0, 3); + /// assert_eq!(Some(expected), dfa.try_search_fwd( + /// &mut cache, &Input::new("abc"))?, + /// ); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// # Example: specific pattern search + /// + /// This example shows how to build a lazy multi-DFA that permits searching + /// for specific patterns. + /// + /// ``` + /// use regex_automata::{ + /// hybrid::dfa::DFA, + /// Anchored, HalfMatch, PatternID, Input, + /// }; + /// + /// let dfa = DFA::builder() + /// .configure(DFA::config().starts_for_each_pattern(true)) + /// .build_many(&["[a-z0-9]{6}", "[a-z][a-z0-9]{5}"])?; + /// let mut cache = dfa.create_cache(); + /// let haystack = "foo123"; + /// + /// // Since we are using the default leftmost-first match and both + /// // patterns match at the same starting position, only the first pattern + /// // will be returned in this case when doing a search for any of the + /// // patterns. + /// let expected = Some(HalfMatch::must(0, 6)); + /// let got = dfa.try_search_fwd(&mut cache, &Input::new(haystack))?; + /// assert_eq!(expected, got); + /// + /// // But if we want to check whether some other pattern matches, then we + /// // can provide its pattern ID. + /// let expected = Some(HalfMatch::must(1, 6)); + /// let input = Input::new(haystack) + /// .anchored(Anchored::Pattern(PatternID::must(1))); + /// let got = dfa.try_search_fwd(&mut cache, &input)?; + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// # Example: specifying the bounds of a search + /// + /// This example shows how providing the bounds of a search can produce + /// different results than simply sub-slicing the haystack. + /// + /// ``` + /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input}; + /// + /// // N.B. We disable Unicode here so that we use a simple ASCII word + /// // boundary. Alternatively, we could enable heuristic support for + /// // Unicode word boundaries since our haystack is pure ASCII. + /// let dfa = DFA::new(r"(?-u)\b[0-9]{3}\b")?; + /// let mut cache = dfa.create_cache(); + /// let haystack = "foo123bar"; + /// + /// // Since we sub-slice the haystack, the search doesn't know about the + /// // larger context and assumes that `123` is surrounded by word + /// // boundaries. And of course, the match position is reported relative + /// // to the sub-slice as well, which means we get `3` instead of `6`. + /// let expected = Some(HalfMatch::must(0, 3)); + /// let got = dfa.try_search_fwd( + /// &mut cache, + /// &Input::new(&haystack[3..6]), + /// )?; + /// assert_eq!(expected, got); + /// + /// // But if we provide the bounds of the search within the context of the + /// // entire haystack, then the search can take the surrounding context + /// // into account. (And if we did find a match, it would be reported + /// // as a valid offset into `haystack` instead of its sub-slice.) + /// let expected = None; + /// let got = dfa.try_search_fwd( + /// &mut cache, + /// &Input::new(haystack).range(3..6), + /// )?; + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn try_search_fwd( + &self, + cache: &mut Cache, + input: &Input<'_>, + ) -> Result<Option<HalfMatch>, MatchError> { + let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8(); + let hm = match search::find_fwd(self, cache, input)? { + None => return Ok(None), + Some(hm) if !utf8empty => return Ok(Some(hm)), + Some(hm) => hm, + }; + // We get to this point when we know our DFA can match the empty string + // AND when UTF-8 mode is enabled. In this case, we skip any matches + // whose offset splits a codepoint. Such a match is necessarily a + // zero-width match, because UTF-8 mode requires the underlying NFA + // to be built such that all non-empty matches span valid UTF-8. + // Therefore, any match that ends in the middle of a codepoint cannot + // be part of a span of valid UTF-8 and thus must be an empty match. + // In such cases, we skip it, so as not to report matches that split a + // codepoint. + // + // Note that this is not a checked assumption. Callers *can* provide an + // NFA with UTF-8 mode enabled but produces non-empty matches that span + // invalid UTF-8. But doing so is documented to result in unspecified + // behavior. + empty::skip_splits_fwd(input, hm, hm.offset(), |input| { + let got = search::find_fwd(self, cache, input)?; + Ok(got.map(|hm| (hm, hm.offset()))) + }) + } + + /// Executes a reverse search and returns the start of the position of the + /// leftmost match that is found. If no match exists, then `None` is + /// returned. + /// + /// # Errors + /// + /// This routine errors if the search could not complete. This can occur + /// in a number of circumstances: + /// + /// * The configuration of the lazy DFA may permit it to "quit" the search. + /// For example, setting quit bytes or enabling heuristic support for + /// Unicode word boundaries. The default configuration does not enable any + /// option that could result in the lazy DFA quitting. + /// * The configuration of the lazy DFA may also permit it to "give up" + /// on a search if it makes ineffective use of its transition table + /// cache. The default configuration does not enable this by default, + /// although it is typically a good idea to. + /// * When the provided `Input` configuration is not supported. For + /// example, by providing an unsupported anchor mode. + /// + /// When a search returns an error, callers cannot know whether a match + /// exists or not. + /// + /// # Example + /// + /// This routine is principally useful when used in + /// conjunction with the + /// [`nfa::thompson::Config::reverse`](crate::nfa::thompson::Config::reverse) + /// configuration. In general, it's unlikely to be correct to use both + /// `try_search_fwd` and `try_search_rev` with the same DFA since any + /// particular DFA will only support searching in one direction with + /// respect to the pattern. + /// + /// ``` + /// use regex_automata::{ + /// nfa::thompson, + /// hybrid::dfa::DFA, + /// HalfMatch, Input, + /// }; + /// + /// let dfa = DFA::builder() + /// .thompson(thompson::Config::new().reverse(true)) + /// .build("foo[0-9]+")?; + /// let mut cache = dfa.create_cache(); + /// let expected = HalfMatch::must(0, 0); + /// assert_eq!( + /// Some(expected), + /// dfa.try_search_rev(&mut cache, &Input::new("foo12345"))?, + /// ); + /// + /// // Even though a match is found after reading the last byte (`c`), + /// // the leftmost first match semantics demand that we find the earliest + /// // match that prefers earlier parts of the pattern over latter parts. + /// let dfa = DFA::builder() + /// .thompson(thompson::Config::new().reverse(true)) + /// .build("abc|c")?; + /// let mut cache = dfa.create_cache(); + /// let expected = HalfMatch::must(0, 0); + /// assert_eq!(Some(expected), dfa.try_search_rev( + /// &mut cache, &Input::new("abc"))?, + /// ); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// # Example: UTF-8 mode + /// + /// This examples demonstrates that UTF-8 mode applies to reverse + /// DFAs. When UTF-8 mode is enabled in the underlying NFA, then all + /// matches reported must correspond to valid UTF-8 spans. This includes + /// prohibiting zero-width matches that split a codepoint. + /// + /// UTF-8 mode is enabled by default. Notice below how the only zero-width + /// matches reported are those at UTF-8 boundaries: + /// + /// ``` + /// use regex_automata::{ + /// hybrid::dfa::DFA, + /// nfa::thompson, + /// HalfMatch, Input, MatchKind, + /// }; + /// + /// let dfa = DFA::builder() + /// .thompson(thompson::Config::new().reverse(true)) + /// .build(r"")?; + /// let mut cache = dfa.create_cache(); + /// + /// // Run the reverse DFA to collect all matches. + /// let mut input = Input::new("☃"); + /// let mut matches = vec![]; + /// loop { + /// match dfa.try_search_rev(&mut cache, &input)? { + /// None => break, + /// Some(hm) => { + /// matches.push(hm); + /// if hm.offset() == 0 || input.end() == 0 { + /// break; + /// } else if hm.offset() < input.end() { + /// input.set_end(hm.offset()); + /// } else { + /// // This is only necessary to handle zero-width + /// // matches, which of course occur in this example. + /// // Without this, the search would never advance + /// // backwards beyond the initial match. + /// input.set_end(input.end() - 1); + /// } + /// } + /// } + /// } + /// + /// // No matches split a codepoint. + /// let expected = vec![ + /// HalfMatch::must(0, 3), + /// HalfMatch::must(0, 0), + /// ]; + /// assert_eq!(expected, matches); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// Now let's look at the same example, but with UTF-8 mode on the + /// underlying NFA disabled: + /// + /// ``` + /// use regex_automata::{ + /// hybrid::dfa::DFA, + /// nfa::thompson, + /// HalfMatch, Input, MatchKind, + /// }; + /// + /// let dfa = DFA::builder() + /// .thompson(thompson::Config::new().reverse(true).utf8(false)) + /// .build(r"")?; + /// let mut cache = dfa.create_cache(); + /// + /// // Run the reverse DFA to collect all matches. + /// let mut input = Input::new("☃"); + /// let mut matches = vec![]; + /// loop { + /// match dfa.try_search_rev(&mut cache, &input)? { + /// None => break, + /// Some(hm) => { + /// matches.push(hm); + /// if hm.offset() == 0 || input.end() == 0 { + /// break; + /// } else if hm.offset() < input.end() { + /// input.set_end(hm.offset()); + /// } else { + /// // This is only necessary to handle zero-width + /// // matches, which of course occur in this example. + /// // Without this, the search would never advance + /// // backwards beyond the initial match. + /// input.set_end(input.end() - 1); + /// } + /// } + /// } + /// } + /// + /// // No matches split a codepoint. + /// let expected = vec![ + /// HalfMatch::must(0, 3), + /// HalfMatch::must(0, 2), + /// HalfMatch::must(0, 1), + /// HalfMatch::must(0, 0), + /// ]; + /// assert_eq!(expected, matches); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn try_search_rev( + &self, + cache: &mut Cache, + input: &Input<'_>, + ) -> Result<Option<HalfMatch>, MatchError> { + let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8(); + let hm = match search::find_rev(self, cache, input)? { + None => return Ok(None), + Some(hm) if !utf8empty => return Ok(Some(hm)), + Some(hm) => hm, + }; + empty::skip_splits_rev(input, hm, hm.offset(), |input| { + let got = search::find_rev(self, cache, input)?; + Ok(got.map(|hm| (hm, hm.offset()))) + }) + } + + /// Executes an overlapping forward search and returns the end position of + /// matches as they are found. If no match exists, then `None` is returned. + /// + /// This routine is principally only useful when searching for multiple + /// patterns on inputs where multiple patterns may match the same regions + /// of text. In particular, callers must preserve the automaton's search + /// state from prior calls so that the implementation knows where the last + /// match occurred. + /// + /// When using this routine to implement an iterator of overlapping + /// matches, the `start` of the search should remain invariant throughout + /// iteration. The `OverlappingState` given to the search will keep track + /// of the current position of the search. (This is because multiple + /// matches may be reported at the same position, so only the search + /// implementation itself knows when to advance the position.) + /// + /// If for some reason you want the search to forget about its previous + /// state and restart the search at a particular position, then setting the + /// state to [`OverlappingState::start`] will accomplish that. + /// + /// # Errors + /// + /// This routine errors if the search could not complete. This can occur + /// in a number of circumstances: + /// + /// * The configuration of the lazy DFA may permit it to "quit" the search. + /// For example, setting quit bytes or enabling heuristic support for + /// Unicode word boundaries. The default configuration does not enable any + /// option that could result in the lazy DFA quitting. + /// * The configuration of the lazy DFA may also permit it to "give up" + /// on a search if it makes ineffective use of its transition table + /// cache. The default configuration does not enable this by default, + /// although it is typically a good idea to. + /// * When the provided `Input` configuration is not supported. For + /// example, by providing an unsupported anchor mode. + /// + /// When a search returns an error, callers cannot know whether a match + /// exists or not. + /// + /// # Example + /// + /// This example shows how to run a basic overlapping search. Notice + /// that we build the automaton with a `MatchKind::All` configuration. + /// Overlapping searches are unlikely to work as one would expect when + /// using the default `MatchKind::LeftmostFirst` match semantics, since + /// leftmost-first matching is fundamentally incompatible with overlapping + /// searches. Namely, overlapping searches need to report matches as they + /// are seen, where as leftmost-first searches will continue searching even + /// after a match has been observed in order to find the conventional end + /// position of the match. More concretely, leftmost-first searches use + /// dead states to terminate a search after a specific match can no longer + /// be extended. Overlapping searches instead do the opposite by continuing + /// the search to find totally new matches (potentially of other patterns). + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{ + /// hybrid::dfa::{DFA, OverlappingState}, + /// HalfMatch, Input, MatchKind, + /// }; + /// + /// let dfa = DFA::builder() + /// .configure(DFA::config().match_kind(MatchKind::All)) + /// .build_many(&[r"\w+$", r"\S+$"])?; + /// let mut cache = dfa.create_cache(); + /// + /// let haystack = "@foo"; + /// let mut state = OverlappingState::start(); + /// + /// let expected = Some(HalfMatch::must(1, 4)); + /// dfa.try_search_overlapping_fwd( + /// &mut cache, &Input::new(haystack), &mut state, + /// )?; + /// assert_eq!(expected, state.get_match()); + /// + /// // The first pattern also matches at the same position, so re-running + /// // the search will yield another match. Notice also that the first + /// // pattern is returned after the second. This is because the second + /// // pattern begins its match before the first, is therefore an earlier + /// // match and is thus reported first. + /// let expected = Some(HalfMatch::must(0, 4)); + /// dfa.try_search_overlapping_fwd( + /// &mut cache, &Input::new(haystack), &mut state, + /// )?; + /// assert_eq!(expected, state.get_match()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn try_search_overlapping_fwd( + &self, + cache: &mut Cache, + input: &Input<'_>, + state: &mut OverlappingState, + ) -> Result<(), MatchError> { + let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8(); + search::find_overlapping_fwd(self, cache, input, state)?; + match state.get_match() { + None => Ok(()), + Some(_) if !utf8empty => Ok(()), + Some(_) => skip_empty_utf8_splits_overlapping( + input, + state, + |input, state| { + search::find_overlapping_fwd(self, cache, input, state) + }, + ), + } + } + + /// Executes a reverse overlapping search and returns the start of the + /// position of the leftmost match that is found. If no match exists, then + /// `None` is returned. + /// + /// When using this routine to implement an iterator of overlapping + /// matches, the `start` of the search should remain invariant throughout + /// iteration. The `OverlappingState` given to the search will keep track + /// of the current position of the search. (This is because multiple + /// matches may be reported at the same position, so only the search + /// implementation itself knows when to advance the position.) + /// + /// If for some reason you want the search to forget about its previous + /// state and restart the search at a particular position, then setting the + /// state to [`OverlappingState::start`] will accomplish that. + /// + /// # Errors + /// + /// This routine errors if the search could not complete. This can occur + /// in a number of circumstances: + /// + /// * The configuration of the lazy DFA may permit it to "quit" the search. + /// For example, setting quit bytes or enabling heuristic support for + /// Unicode word boundaries. The default configuration does not enable any + /// option that could result in the lazy DFA quitting. + /// * The configuration of the lazy DFA may also permit it to "give up" + /// on a search if it makes ineffective use of its transition table + /// cache. The default configuration does not enable this by default, + /// although it is typically a good idea to. + /// * When the provided `Input` configuration is not supported. For + /// example, by providing an unsupported anchor mode. + /// + /// When a search returns an error, callers cannot know whether a match + /// exists or not. + /// + /// # Example: UTF-8 mode + /// + /// This examples demonstrates that UTF-8 mode applies to reverse + /// DFAs. When UTF-8 mode is enabled in the underlying NFA, then all + /// matches reported must correspond to valid UTF-8 spans. This includes + /// prohibiting zero-width matches that split a codepoint. + /// + /// UTF-8 mode is enabled by default. Notice below how the only zero-width + /// matches reported are those at UTF-8 boundaries: + /// + /// ``` + /// use regex_automata::{ + /// hybrid::dfa::{DFA, OverlappingState}, + /// nfa::thompson, + /// HalfMatch, Input, MatchKind, + /// }; + /// + /// let dfa = DFA::builder() + /// .configure(DFA::config().match_kind(MatchKind::All)) + /// .thompson(thompson::Config::new().reverse(true)) + /// .build_many(&[r"", r"☃"])?; + /// let mut cache = dfa.create_cache(); + /// + /// // Run the reverse DFA to collect all matches. + /// let input = Input::new("☃"); + /// let mut state = OverlappingState::start(); + /// let mut matches = vec![]; + /// loop { + /// dfa.try_search_overlapping_rev(&mut cache, &input, &mut state)?; + /// match state.get_match() { + /// None => break, + /// Some(hm) => matches.push(hm), + /// } + /// } + /// + /// // No matches split a codepoint. + /// let expected = vec![ + /// HalfMatch::must(0, 3), + /// HalfMatch::must(1, 0), + /// HalfMatch::must(0, 0), + /// ]; + /// assert_eq!(expected, matches); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// Now let's look at the same example, but with UTF-8 mode on the + /// underlying NFA disabled: + /// + /// ``` + /// use regex_automata::{ + /// hybrid::dfa::{DFA, OverlappingState}, + /// nfa::thompson, + /// HalfMatch, Input, MatchKind, + /// }; + /// + /// let dfa = DFA::builder() + /// .configure(DFA::config().match_kind(MatchKind::All)) + /// .thompson(thompson::Config::new().reverse(true).utf8(false)) + /// .build_many(&[r"", r"☃"])?; + /// let mut cache = dfa.create_cache(); + /// + /// // Run the reverse DFA to collect all matches. + /// let input = Input::new("☃"); + /// let mut state = OverlappingState::start(); + /// let mut matches = vec![]; + /// loop { + /// dfa.try_search_overlapping_rev(&mut cache, &input, &mut state)?; + /// match state.get_match() { + /// None => break, + /// Some(hm) => matches.push(hm), + /// } + /// } + /// + /// // Now *all* positions match, even within a codepoint, + /// // because we lifted the requirement that matches + /// // correspond to valid UTF-8 spans. + /// let expected = vec![ + /// HalfMatch::must(0, 3), + /// HalfMatch::must(0, 2), + /// HalfMatch::must(0, 1), + /// HalfMatch::must(1, 0), + /// HalfMatch::must(0, 0), + /// ]; + /// assert_eq!(expected, matches); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn try_search_overlapping_rev( + &self, + cache: &mut Cache, + input: &Input<'_>, + state: &mut OverlappingState, + ) -> Result<(), MatchError> { + let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8(); + search::find_overlapping_rev(self, cache, input, state)?; + match state.get_match() { + None => Ok(()), + Some(_) if !utf8empty => Ok(()), + Some(_) => skip_empty_utf8_splits_overlapping( + input, + state, + |input, state| { + search::find_overlapping_rev(self, cache, input, state) + }, + ), + } + } + + /// Writes the set of patterns that match anywhere in the given search + /// configuration to `patset`. If multiple patterns match at the same + /// position and the underlying DFA supports overlapping matches, then all + /// matching patterns are written to the given set. + /// + /// Unless all of the patterns in this DFA are anchored, then generally + /// speaking, this will visit every byte in the haystack. + /// + /// This search routine *does not* clear the pattern set. This gives some + /// flexibility to the caller (e.g., running multiple searches with the + /// same pattern set), but does make the API bug-prone if you're reusing + /// the same pattern set for multiple searches but intended them to be + /// independent. + /// + /// If a pattern ID matched but the given `PatternSet` does not have + /// sufficient capacity to store it, then it is not inserted and silently + /// dropped. + /// + /// # Errors + /// + /// This routine errors if the search could not complete. This can occur + /// in a number of circumstances: + /// + /// * The configuration of the lazy DFA may permit it to "quit" the search. + /// For example, setting quit bytes or enabling heuristic support for + /// Unicode word boundaries. The default configuration does not enable any + /// option that could result in the lazy DFA quitting. + /// * The configuration of the lazy DFA may also permit it to "give up" + /// on a search if it makes ineffective use of its transition table + /// cache. The default configuration does not enable this by default, + /// although it is typically a good idea to. + /// * When the provided `Input` configuration is not supported. For + /// example, by providing an unsupported anchor mode. + /// + /// When a search returns an error, callers cannot know whether a match + /// exists or not. + /// + /// # Example + /// + /// This example shows how to find all matching patterns in a haystack, + /// even when some patterns match at the same position as other patterns. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{ + /// hybrid::dfa::DFA, + /// Input, MatchKind, PatternSet, + /// }; + /// + /// let patterns = &[ + /// r"\w+", r"\d+", r"\pL+", r"foo", r"bar", r"barfoo", r"foobar", + /// ]; + /// let dfa = DFA::builder() + /// .configure(DFA::config().match_kind(MatchKind::All)) + /// .build_many(patterns)?; + /// let mut cache = dfa.create_cache(); + /// + /// let input = Input::new("foobar"); + /// let mut patset = PatternSet::new(dfa.pattern_len()); + /// dfa.try_which_overlapping_matches(&mut cache, &input, &mut patset)?; + /// let expected = vec![0, 2, 3, 4, 6]; + /// let got: Vec<usize> = patset.iter().map(|p| p.as_usize()).collect(); + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn try_which_overlapping_matches( + &self, + cache: &mut Cache, + input: &Input<'_>, + patset: &mut PatternSet, + ) -> Result<(), MatchError> { + let mut state = OverlappingState::start(); + while let Some(m) = { + self.try_search_overlapping_fwd(cache, input, &mut state)?; + state.get_match() + } { + let _ = patset.try_insert(m.pattern()); + // There's nothing left to find, so we can stop. Or the caller + // asked us to. + if patset.is_full() || input.get_earliest() { + break; + } + } + Ok(()) + } +} + +impl DFA { + /// Transitions from the current state to the next state, given the next + /// byte of input. + /// + /// The given cache is used to either reuse pre-computed state + /// transitions, or to store this newly computed transition for future + /// reuse. Thus, this routine guarantees that it will never return a state + /// ID that has an "unknown" tag. + /// + /// # State identifier validity + /// + /// The only valid value for `current` is the lazy state ID returned + /// by the most recent call to `next_state`, `next_state_untagged`, + /// `next_state_untagged_unchecked`, `start_state_forward` or + /// `state_state_reverse` for the given `cache`. Any state ID returned from + /// prior calls to these routines (with the same `cache`) is considered + /// invalid (even if it gives an appearance of working). State IDs returned + /// from _any_ prior call for different `cache` values are also always + /// invalid. + /// + /// The returned ID is always a valid ID when `current` refers to a valid + /// ID. Moreover, this routine is defined for all possible values of + /// `input`. + /// + /// These validity rules are not checked, even in debug mode. Callers are + /// required to uphold these rules themselves. + /// + /// Violating these state ID validity rules will not sacrifice memory + /// safety, but _may_ produce an incorrect result or a panic. + /// + /// # Panics + /// + /// If the given ID does not refer to a valid state, then this routine + /// may panic but it also may not panic and instead return an invalid or + /// incorrect ID. + /// + /// # Example + /// + /// This shows a simplistic example for walking a lazy DFA for a given + /// haystack by using the `next_state` method. + /// + /// ``` + /// use regex_automata::{hybrid::dfa::DFA, Input}; + /// + /// let dfa = DFA::new(r"[a-z]+r")?; + /// let mut cache = dfa.create_cache(); + /// let haystack = "bar".as_bytes(); + /// + /// // The start state is determined by inspecting the position and the + /// // initial bytes of the haystack. + /// let mut sid = dfa.start_state_forward( + /// &mut cache, &Input::new(haystack), + /// )?; + /// // Walk all the bytes in the haystack. + /// for &b in haystack { + /// sid = dfa.next_state(&mut cache, sid, b)?; + /// } + /// // Matches are always delayed by 1 byte, so we must explicitly walk the + /// // special "EOI" transition at the end of the search. + /// sid = dfa.next_eoi_state(&mut cache, sid)?; + /// assert!(sid.is_match()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn next_state( + &self, + cache: &mut Cache, + current: LazyStateID, + input: u8, + ) -> Result<LazyStateID, CacheError> { + let class = usize::from(self.classes.get(input)); + let offset = current.as_usize_untagged() + class; + let sid = cache.trans[offset]; + if !sid.is_unknown() { + return Ok(sid); + } + let unit = alphabet::Unit::u8(input); + Lazy::new(self, cache).cache_next_state(current, unit) + } + + /// Transitions from the current state to the next state, given the next + /// byte of input and a state ID that is not tagged. + /// + /// The only reason to use this routine is performance. In particular, the + /// `next_state` method needs to do some additional checks, among them is + /// to account for identifiers to states that are not yet computed. In + /// such a case, the transition is computed on the fly. However, if it is + /// known that the `current` state ID is untagged, then these checks can be + /// omitted. + /// + /// Since this routine does not compute states on the fly, it does not + /// modify the cache and thus cannot return an error. Consequently, `cache` + /// does not need to be mutable and it is possible for this routine to + /// return a state ID corresponding to the special "unknown" state. In + /// this case, it is the caller's responsibility to use the prior state + /// ID and `input` with `next_state` in order to force the computation of + /// the unknown transition. Otherwise, trying to use the "unknown" state + /// ID will just result in transitioning back to itself, and thus never + /// terminating. (This is technically a special exemption to the state ID + /// validity rules, but is permissible since this routine is guarateed to + /// never mutate the given `cache`, and thus the identifier is guaranteed + /// to remain valid.) + /// + /// See [`LazyStateID`] for more details on what it means for a state ID + /// to be tagged. Also, see + /// [`next_state_untagged_unchecked`](DFA::next_state_untagged_unchecked) + /// for this same idea, but with bounds checks forcefully elided. + /// + /// # State identifier validity + /// + /// The only valid value for `current` is an **untagged** lazy + /// state ID returned by the most recent call to `next_state`, + /// `next_state_untagged`, `next_state_untagged_unchecked`, + /// `start_state_forward` or `state_state_reverse` for the given `cache`. + /// Any state ID returned from prior calls to these routines (with the + /// same `cache`) is considered invalid (even if it gives an appearance + /// of working). State IDs returned from _any_ prior call for different + /// `cache` values are also always invalid. + /// + /// The returned ID is always a valid ID when `current` refers to a valid + /// ID, although it may be tagged. Moreover, this routine is defined for + /// all possible values of `input`. + /// + /// Not all validity rules are checked, even in debug mode. Callers are + /// required to uphold these rules themselves. + /// + /// Violating these state ID validity rules will not sacrifice memory + /// safety, but _may_ produce an incorrect result or a panic. + /// + /// # Panics + /// + /// If the given ID does not refer to a valid state, then this routine + /// may panic but it also may not panic and instead return an invalid or + /// incorrect ID. + /// + /// # Example + /// + /// This shows a simplistic example for walking a lazy DFA for a given + /// haystack by using the `next_state_untagged` method where possible. + /// + /// ``` + /// use regex_automata::{hybrid::dfa::DFA, Input}; + /// + /// let dfa = DFA::new(r"[a-z]+r")?; + /// let mut cache = dfa.create_cache(); + /// let haystack = "bar".as_bytes(); + /// + /// // The start state is determined by inspecting the position and the + /// // initial bytes of the haystack. + /// let mut sid = dfa.start_state_forward( + /// &mut cache, &Input::new(haystack), + /// )?; + /// // Walk all the bytes in the haystack. + /// let mut at = 0; + /// while at < haystack.len() { + /// if sid.is_tagged() { + /// sid = dfa.next_state(&mut cache, sid, haystack[at])?; + /// } else { + /// let mut prev_sid = sid; + /// // We attempt to chew through as much as we can while moving + /// // through untagged state IDs. Thus, the transition function + /// // does less work on average per byte. (Unrolling this loop + /// // may help even more.) + /// while at < haystack.len() { + /// prev_sid = sid; + /// sid = dfa.next_state_untagged( + /// &mut cache, sid, haystack[at], + /// ); + /// at += 1; + /// if sid.is_tagged() { + /// break; + /// } + /// } + /// // We must ensure that we never proceed to the next iteration + /// // with an unknown state ID. If we don't account for this + /// // case, then search isn't guaranteed to terminate since all + /// // transitions on unknown states loop back to itself. + /// if sid.is_unknown() { + /// sid = dfa.next_state( + /// &mut cache, prev_sid, haystack[at - 1], + /// )?; + /// } + /// } + /// } + /// // Matches are always delayed by 1 byte, so we must explicitly walk the + /// // special "EOI" transition at the end of the search. + /// sid = dfa.next_eoi_state(&mut cache, sid)?; + /// assert!(sid.is_match()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn next_state_untagged( + &self, + cache: &Cache, + current: LazyStateID, + input: u8, + ) -> LazyStateID { + debug_assert!(!current.is_tagged()); + let class = usize::from(self.classes.get(input)); + let offset = current.as_usize_unchecked() + class; + cache.trans[offset] + } + + /// Transitions from the current state to the next state, eliding bounds + /// checks, given the next byte of input and a state ID that is not tagged. + /// + /// The only reason to use this routine is performance. In particular, the + /// `next_state` method needs to do some additional checks, among them is + /// to account for identifiers to states that are not yet computed. In + /// such a case, the transition is computed on the fly. However, if it is + /// known that the `current` state ID is untagged, then these checks can be + /// omitted. + /// + /// Since this routine does not compute states on the fly, it does not + /// modify the cache and thus cannot return an error. Consequently, `cache` + /// does not need to be mutable and it is possible for this routine to + /// return a state ID corresponding to the special "unknown" state. In + /// this case, it is the caller's responsibility to use the prior state + /// ID and `input` with `next_state` in order to force the computation of + /// the unknown transition. Otherwise, trying to use the "unknown" state + /// ID will just result in transitioning back to itself, and thus never + /// terminating. (This is technically a special exemption to the state ID + /// validity rules, but is permissible since this routine is guarateed to + /// never mutate the given `cache`, and thus the identifier is guaranteed + /// to remain valid.) + /// + /// See [`LazyStateID`] for more details on what it means for a state ID + /// to be tagged. Also, see + /// [`next_state_untagged`](DFA::next_state_untagged) + /// for this same idea, but with memory safety guaranteed by retaining + /// bounds checks. + /// + /// # State identifier validity + /// + /// The only valid value for `current` is an **untagged** lazy + /// state ID returned by the most recent call to `next_state`, + /// `next_state_untagged`, `next_state_untagged_unchecked`, + /// `start_state_forward` or `state_state_reverse` for the given `cache`. + /// Any state ID returned from prior calls to these routines (with the + /// same `cache`) is considered invalid (even if it gives an appearance + /// of working). State IDs returned from _any_ prior call for different + /// `cache` values are also always invalid. + /// + /// The returned ID is always a valid ID when `current` refers to a valid + /// ID, although it may be tagged. Moreover, this routine is defined for + /// all possible values of `input`. + /// + /// Not all validity rules are checked, even in debug mode. Callers are + /// required to uphold these rules themselves. + /// + /// Violating these state ID validity rules will not sacrifice memory + /// safety, but _may_ produce an incorrect result or a panic. + /// + /// # Safety + /// + /// Callers of this method must guarantee that `current` refers to a valid + /// state ID according to the rules described above. If `current` is not a + /// valid state ID for this automaton, then calling this routine may result + /// in undefined behavior. + /// + /// If `current` is valid, then the ID returned is valid for all possible + /// values of `input`. + #[inline] + pub unsafe fn next_state_untagged_unchecked( + &self, + cache: &Cache, + current: LazyStateID, + input: u8, + ) -> LazyStateID { + debug_assert!(!current.is_tagged()); + let class = usize::from(self.classes.get(input)); + let offset = current.as_usize_unchecked() + class; + *cache.trans.get_unchecked(offset) + } + + /// Transitions from the current state to the next state for the special + /// EOI symbol. + /// + /// The given cache is used to either reuse pre-computed state + /// transitions, or to store this newly computed transition for future + /// reuse. Thus, this routine guarantees that it will never return a state + /// ID that has an "unknown" tag. + /// + /// This routine must be called at the end of every search in a correct + /// implementation of search. Namely, lazy DFAs in this crate delay matches + /// by one byte in order to support look-around operators. Thus, after + /// reaching the end of a haystack, a search implementation must follow one + /// last EOI transition. + /// + /// It is best to think of EOI as an additional symbol in the alphabet of a + /// DFA that is distinct from every other symbol. That is, the alphabet of + /// lazy DFAs in this crate has a logical size of 257 instead of 256, where + /// 256 corresponds to every possible inhabitant of `u8`. (In practice, the + /// physical alphabet size may be smaller because of alphabet compression + /// via equivalence classes, but EOI is always represented somehow in the + /// alphabet.) + /// + /// # State identifier validity + /// + /// The only valid value for `current` is the lazy state ID returned + /// by the most recent call to `next_state`, `next_state_untagged`, + /// `next_state_untagged_unchecked`, `start_state_forward` or + /// `state_state_reverse` for the given `cache`. Any state ID returned from + /// prior calls to these routines (with the same `cache`) is considered + /// invalid (even if it gives an appearance of working). State IDs returned + /// from _any_ prior call for different `cache` values are also always + /// invalid. + /// + /// The returned ID is always a valid ID when `current` refers to a valid + /// ID. + /// + /// These validity rules are not checked, even in debug mode. Callers are + /// required to uphold these rules themselves. + /// + /// Violating these state ID validity rules will not sacrifice memory + /// safety, but _may_ produce an incorrect result or a panic. + /// + /// # Panics + /// + /// If the given ID does not refer to a valid state, then this routine + /// may panic but it also may not panic and instead return an invalid or + /// incorrect ID. + /// + /// # Example + /// + /// This shows a simplistic example for walking a DFA for a given haystack, + /// and then finishing the search with the final EOI transition. + /// + /// ``` + /// use regex_automata::{hybrid::dfa::DFA, Input}; + /// + /// let dfa = DFA::new(r"[a-z]+r")?; + /// let mut cache = dfa.create_cache(); + /// let haystack = "bar".as_bytes(); + /// + /// // The start state is determined by inspecting the position and the + /// // initial bytes of the haystack. + /// let mut sid = dfa.start_state_forward( + /// &mut cache, &Input::new(haystack), + /// )?; + /// // Walk all the bytes in the haystack. + /// for &b in haystack { + /// sid = dfa.next_state(&mut cache, sid, b)?; + /// } + /// // Matches are always delayed by 1 byte, so we must explicitly walk + /// // the special "EOI" transition at the end of the search. Without this + /// // final transition, the assert below will fail since the DFA will not + /// // have entered a match state yet! + /// sid = dfa.next_eoi_state(&mut cache, sid)?; + /// assert!(sid.is_match()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn next_eoi_state( + &self, + cache: &mut Cache, + current: LazyStateID, + ) -> Result<LazyStateID, CacheError> { + let eoi = self.classes.eoi().as_usize(); + let offset = current.as_usize_untagged() + eoi; + let sid = cache.trans[offset]; + if !sid.is_unknown() { + return Ok(sid); + } + let unit = self.classes.eoi(); + Lazy::new(self, cache).cache_next_state(current, unit) + } + + /// Return the ID of the start state for this lazy DFA when executing a + /// forward search. + /// + /// Unlike typical DFA implementations, the start state for DFAs in this + /// crate is dependent on a few different factors: + /// + /// * The [`Anchored`] mode of the search. Unanchored, anchored and + /// anchored searches for a specific [`PatternID`] all use different start + /// states. + /// * The position at which the search begins, via [`Input::start`]. This + /// and the byte immediately preceding the start of the search (if one + /// exists) influence which look-behind assertions are true at the start + /// of the search. This in turn influences which start state is selected. + /// * Whether the search is a forward or reverse search. This routine can + /// only be used for forward searches. + /// + /// # Errors + /// + /// This may return a [`MatchError`] (not a [`CacheError`]!) if the search + /// needs to give up when determining the start state (for example, if + /// it sees a "quit" byte or if the cache has been cleared too many + /// times). This can also return an error if the given `Input` contains an + /// unsupported [`Anchored`] configuration. + #[cfg_attr(feature = "perf-inline", inline(always))] + pub fn start_state_forward( + &self, + cache: &mut Cache, + input: &Input<'_>, + ) -> Result<LazyStateID, MatchError> { + if !self.quitset.is_empty() && input.start() > 0 { + let offset = input.start() - 1; + let byte = input.haystack()[offset]; + if self.quitset.contains(byte) { + return Err(MatchError::quit(byte, offset)); + } + } + let start_type = self.start_map.fwd(input); + let start = LazyRef::new(self, cache) + .get_cached_start_id(input, start_type)?; + if !start.is_unknown() { + return Ok(start); + } + Lazy::new(self, cache).cache_start_group(input, start_type) + } + + /// Return the ID of the start state for this lazy DFA when executing a + /// reverse search. + /// + /// Unlike typical DFA implementations, the start state for DFAs in this + /// crate is dependent on a few different factors: + /// + /// * The [`Anchored`] mode of the search. Unanchored, anchored and + /// anchored searches for a specific [`PatternID`] all use different start + /// states. + /// * The position at which the search begins, via [`Input::start`]. This + /// and the byte immediately preceding the start of the search (if one + /// exists) influence which look-behind assertions are true at the start + /// of the search. This in turn influences which start state is selected. + /// * Whether the search is a forward or reverse search. This routine can + /// only be used for reverse searches. + /// + /// # Errors + /// + /// This may return a [`MatchError`] (not a [`CacheError`]!) if the search + /// needs to give up when determining the start state (for example, if + /// it sees a "quit" byte or if the cache has been cleared too many + /// times). This can also return an error if the given `Input` contains an + /// unsupported [`Anchored`] configuration. + #[cfg_attr(feature = "perf-inline", inline(always))] + pub fn start_state_reverse( + &self, + cache: &mut Cache, + input: &Input<'_>, + ) -> Result<LazyStateID, MatchError> { + if !self.quitset.is_empty() && input.end() < input.haystack().len() { + let offset = input.end(); + let byte = input.haystack()[offset]; + if self.quitset.contains(byte) { + return Err(MatchError::quit(byte, offset)); + } + } + let start_type = self.start_map.rev(input); + let start = LazyRef::new(self, cache) + .get_cached_start_id(input, start_type)?; + if !start.is_unknown() { + return Ok(start); + } + Lazy::new(self, cache).cache_start_group(input, start_type) + } + + /// Returns the total number of patterns that match in this state. + /// + /// If the lazy DFA was compiled with one pattern, then this must + /// necessarily always return `1` for all match states. + /// + /// A lazy DFA guarantees that [`DFA::match_pattern`] can be called with + /// indices up to (but not including) the length returned by this routine + /// without panicking. + /// + /// # Panics + /// + /// If the given state is not a match state, then this may either panic + /// or return an incorrect result. + /// + /// # Example + /// + /// This example shows a simple instance of implementing overlapping + /// matches. In particular, it shows not only how to determine how many + /// patterns have matched in a particular state, but also how to access + /// which specific patterns have matched. + /// + /// Notice that we must use [`MatchKind::All`] when building the DFA. If we + /// used [`MatchKind::LeftmostFirst`] instead, then the DFA would not be + /// constructed in a way that supports overlapping matches. (It would only + /// report a single pattern that matches at any particular point in time.) + /// + /// Another thing to take note of is the patterns used and the order in + /// which the pattern IDs are reported. In the example below, pattern `3` + /// is yielded first. Why? Because it corresponds to the match that + /// appears first. Namely, the `@` symbol is part of `\S+` but not part + /// of any of the other patterns. Since the `\S+` pattern has a match that + /// starts to the left of any other pattern, its ID is returned before any + /// other. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{hybrid::dfa::DFA, Input, MatchKind}; + /// + /// let dfa = DFA::builder() + /// .configure(DFA::config().match_kind(MatchKind::All)) + /// .build_many(&[ + /// r"\w+", r"[a-z]+", r"[A-Z]+", r"\S+", + /// ])?; + /// let mut cache = dfa.create_cache(); + /// let haystack = "@bar".as_bytes(); + /// + /// // The start state is determined by inspecting the position and the + /// // initial bytes of the haystack. + /// let mut sid = dfa.start_state_forward( + /// &mut cache, &Input::new(haystack), + /// )?; + /// // Walk all the bytes in the haystack. + /// for &b in haystack { + /// sid = dfa.next_state(&mut cache, sid, b)?; + /// } + /// sid = dfa.next_eoi_state(&mut cache, sid)?; + /// + /// assert!(sid.is_match()); + /// assert_eq!(dfa.match_len(&mut cache, sid), 3); + /// // The following calls are guaranteed to not panic since `match_len` + /// // returned `3` above. + /// assert_eq!(dfa.match_pattern(&mut cache, sid, 0).as_usize(), 3); + /// assert_eq!(dfa.match_pattern(&mut cache, sid, 1).as_usize(), 0); + /// assert_eq!(dfa.match_pattern(&mut cache, sid, 2).as_usize(), 1); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn match_len(&self, cache: &Cache, id: LazyStateID) -> usize { + assert!(id.is_match()); + LazyRef::new(self, cache).get_cached_state(id).match_len() + } + + /// Returns the pattern ID corresponding to the given match index in the + /// given state. + /// + /// See [`DFA::match_len`] for an example of how to use this method + /// correctly. Note that if you know your lazy DFA is configured with a + /// single pattern, then this routine is never necessary since it will + /// always return a pattern ID of `0` for an index of `0` when `id` + /// corresponds to a match state. + /// + /// Typically, this routine is used when implementing an overlapping + /// search, as the example for `DFA::match_len` does. + /// + /// # Panics + /// + /// If the state ID is not a match state or if the match index is out + /// of bounds for the given state, then this routine may either panic + /// or produce an incorrect result. If the state ID is correct and the + /// match index is correct, then this routine always produces a valid + /// `PatternID`. + #[inline] + pub fn match_pattern( + &self, + cache: &Cache, + id: LazyStateID, + match_index: usize, + ) -> PatternID { + // This is an optimization for the very common case of a DFA with a + // single pattern. This conditional avoids a somewhat more costly path + // that finds the pattern ID from the corresponding `State`, which + // requires a bit of slicing/pointer-chasing. This optimization tends + // to only matter when matches are frequent. + if self.pattern_len() == 1 { + return PatternID::ZERO; + } + LazyRef::new(self, cache) + .get_cached_state(id) + .match_pattern(match_index) + } +} + +/// A cache represents a partially computed DFA. +/// +/// A cache is the key component that differentiates a classical DFA and a +/// hybrid NFA/DFA (also called a "lazy DFA"). Where a classical DFA builds a +/// complete transition table that can handle all possible inputs, a hybrid +/// NFA/DFA starts with an empty transition table and builds only the parts +/// required during search. The parts that are built are stored in a cache. For +/// this reason, a cache is a required parameter for nearly every operation on +/// a [`DFA`]. +/// +/// Caches can be created from their corresponding DFA via +/// [`DFA::create_cache`]. A cache can only be used with either the DFA that +/// created it, or the DFA that was most recently used to reset it with +/// [`Cache::reset`]. Using a cache with any other DFA may result in panics +/// or incorrect results. +#[derive(Clone, Debug)] +pub struct Cache { + // N.B. If you're looking to understand how determinization works, it + // is probably simpler to first grok src/dfa/determinize.rs, since that + // doesn't have the "laziness" component. + /// The transition table. + /// + /// Given a `current` LazyStateID and an `input` byte, the next state can + /// be computed via `trans[untagged(current) + equiv_class(input)]`. Notice + /// that no multiplication is used. That's because state identifiers are + /// "premultiplied." + /// + /// Note that the next state may be the "unknown" state. In this case, the + /// next state is not known and determinization for `current` on `input` + /// must be performed. + trans: Vec<LazyStateID>, + /// The starting states for this DFA. + /// + /// These are computed lazily. Initially, these are all set to "unknown" + /// lazy state IDs. + /// + /// When 'starts_for_each_pattern' is disabled (the default), then the size + /// of this is constrained to the possible starting configurations based + /// on the search parameters. (At time of writing, that's 4.) However, + /// when starting states for each pattern is enabled, then there are N + /// additional groups of starting states, where each group reflects the + /// different possible configurations and N is the number of patterns. + starts: Vec<LazyStateID>, + /// A sequence of NFA/DFA powerset states that have been computed for this + /// lazy DFA. This sequence is indexable by untagged LazyStateIDs. (Every + /// tagged LazyStateID can be used to index this sequence by converting it + /// to its untagged form.) + states: Vec<State>, + /// A map from states to their corresponding IDs. This map may be accessed + /// via the raw byte representation of a state, which means that a `State` + /// does not need to be allocated to determine whether it already exists + /// in this map. Indeed, the existence of such a state is what determines + /// whether we allocate a new `State` or not. + /// + /// The higher level idea here is that we do just enough determinization + /// for a state to check whether we've already computed it. If we have, + /// then we can save a little (albeit not much) work. The real savings is + /// in memory usage. If we never checked for trivially duplicate states, + /// then our memory usage would explode to unreasonable levels. + states_to_id: StateMap, + /// Sparse sets used to track which NFA states have been visited during + /// various traversals. + sparses: SparseSets, + /// Scratch space for traversing the NFA graph. (We use space on the heap + /// instead of the call stack.) + stack: Vec<NFAStateID>, + /// Scratch space for building a NFA/DFA powerset state. This is used to + /// help amortize allocation since not every powerset state generated is + /// added to the cache. In particular, if it already exists in the cache, + /// then there is no need to allocate a new `State` for it. + scratch_state_builder: StateBuilderEmpty, + /// A simple abstraction for handling the saving of at most a single state + /// across a cache clearing. This is required for correctness. Namely, if + /// adding a new state after clearing the cache fails, then the caller + /// must retain the ability to continue using the state ID given. The + /// state corresponding to the state ID is what we preserve across cache + /// clearings. + state_saver: StateSaver, + /// The memory usage, in bytes, used by 'states' and 'states_to_id'. We + /// track this as new states are added since states use a variable amount + /// of heap. Tracking this as we add states makes it possible to compute + /// the total amount of memory used by the determinizer in constant time. + memory_usage_state: usize, + /// The number of times the cache has been cleared. When a minimum cache + /// clear count is set, then the cache will return an error instead of + /// clearing the cache if the count has been exceeded. + clear_count: usize, + /// The total number of bytes searched since the last time this cache was + /// cleared, not including the current search. + /// + /// This can be added to the length of the current search to get the true + /// total number of bytes searched. + /// + /// This is generally only non-zero when the + /// `Cache::search_{start,update,finish}` APIs are used to track search + /// progress. + bytes_searched: usize, + /// The progress of the current search. + /// + /// This is only non-`None` when callers utlize the `Cache::search_start`, + /// `Cache::search_update` and `Cache::search_finish` APIs. + /// + /// The purpose of recording search progress is to be able to make a + /// determination about the efficiency of the cache. Namely, by keeping + /// track of the + progress: Option<SearchProgress>, +} + +impl Cache { + /// Create a new cache for the given lazy DFA. + /// + /// The cache returned should only be used for searches for the given DFA. + /// If you want to reuse the cache for another DFA, then you must call + /// [`Cache::reset`] with that DFA. + pub fn new(dfa: &DFA) -> Cache { + let mut cache = Cache { + trans: alloc::vec![], + starts: alloc::vec![], + states: alloc::vec![], + states_to_id: StateMap::new(), + sparses: SparseSets::new(dfa.get_nfa().states().len()), + stack: alloc::vec![], + scratch_state_builder: StateBuilderEmpty::new(), + state_saver: StateSaver::none(), + memory_usage_state: 0, + clear_count: 0, + bytes_searched: 0, + progress: None, + }; + debug!("pre-init lazy DFA cache size: {}", cache.memory_usage()); + Lazy { dfa, cache: &mut cache }.init_cache(); + debug!("post-init lazy DFA cache size: {}", cache.memory_usage()); + cache + } + + /// Reset this cache such that it can be used for searching with the given + /// lazy DFA (and only that DFA). + /// + /// A cache reset permits reusing memory already allocated in this cache + /// with a different lazy DFA. + /// + /// Resetting a cache sets its "clear count" to 0. This is relevant if the + /// lazy DFA has been configured to "give up" after it has cleared the + /// cache a certain number of times. + /// + /// Any lazy state ID generated by the cache prior to resetting it is + /// invalid after the reset. + /// + /// # Example + /// + /// This shows how to re-purpose a cache for use with a different DFA. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input}; + /// + /// let dfa1 = DFA::new(r"\w")?; + /// let dfa2 = DFA::new(r"\W")?; + /// + /// let mut cache = dfa1.create_cache(); + /// assert_eq!( + /// Some(HalfMatch::must(0, 2)), + /// dfa1.try_search_fwd(&mut cache, &Input::new("Δ"))?, + /// ); + /// + /// // Using 'cache' with dfa2 is not allowed. It may result in panics or + /// // incorrect results. In order to re-purpose the cache, we must reset + /// // it with the DFA we'd like to use it with. + /// // + /// // Similarly, after this reset, using the cache with 'dfa1' is also not + /// // allowed. + /// cache.reset(&dfa2); + /// assert_eq!( + /// Some(HalfMatch::must(0, 3)), + /// dfa2.try_search_fwd(&mut cache, &Input::new("☃"))?, + /// ); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn reset(&mut self, dfa: &DFA) { + Lazy::new(dfa, self).reset_cache() + } + + /// Initializes a new search starting at the given position. + /// + /// If a previous search was unfinished, then it is finished automatically + /// and a new search is begun. + /// + /// Note that keeping track of search progress is _not necessary_ + /// for correct implementations of search using a lazy DFA. Keeping + /// track of search progress is only necessary if you want the + /// [`Config::minimum_bytes_per_state`] configuration knob to work. + #[inline] + pub fn search_start(&mut self, at: usize) { + // If a previous search wasn't marked as finished, then finish it + // now automatically. + if let Some(p) = self.progress.take() { + self.bytes_searched += p.len(); + } + self.progress = Some(SearchProgress { start: at, at }); + } + + /// Updates the current search to indicate that it has search to the + /// current position. + /// + /// No special care needs to be taken for reverse searches. Namely, the + /// position given may be _less than_ the starting position of the search. + /// + /// # Panics + /// + /// This panics if no search has been started by [`Cache::search_start`]. + #[inline] + pub fn search_update(&mut self, at: usize) { + let p = + self.progress.as_mut().expect("no in-progress search to update"); + p.at = at; + } + + /// Indicates that a search has finished at the given position. + /// + /// # Panics + /// + /// This panics if no search has been started by [`Cache::search_start`]. + #[inline] + pub fn search_finish(&mut self, at: usize) { + let mut p = + self.progress.take().expect("no in-progress search to finish"); + p.at = at; + self.bytes_searched += p.len(); + } + + /// Returns the total number of bytes that have been searched since this + /// cache was last cleared. + /// + /// This is useful for determining the efficiency of the cache. For + /// example, the lazy DFA uses this value in conjunction with the + /// [`Config::minimum_bytes_per_state`] knob to help determine whether it + /// should quit searching. + /// + /// This always returns `0` if search progress isn't being tracked. Note + /// that the lazy DFA search routines in this crate always track search + /// progress. + pub fn search_total_len(&self) -> usize { + self.bytes_searched + self.progress.as_ref().map_or(0, |p| p.len()) + } + + /// Returns the total number of times this cache has been cleared since it + /// was either created or last reset. + /// + /// This is useful for informational purposes or if you want to change + /// search strategies based on the number of times the cache has been + /// cleared. + pub fn clear_count(&self) -> usize { + self.clear_count + } + + /// Returns the heap memory usage, in bytes, of this cache. + /// + /// This does **not** include the stack size used up by this cache. To + /// compute that, use `std::mem::size_of::<Cache>()`. + pub fn memory_usage(&self) -> usize { + const ID_SIZE: usize = size_of::<LazyStateID>(); + const STATE_SIZE: usize = size_of::<State>(); + + // NOTE: If you make changes to the below, then + // 'minimum_cache_capacity' should be updated correspondingly. + + self.trans.len() * ID_SIZE + + self.starts.len() * ID_SIZE + + self.states.len() * STATE_SIZE + // Maps likely use more memory than this, but it's probably close. + + self.states_to_id.len() * (STATE_SIZE + ID_SIZE) + + self.sparses.memory_usage() + + self.stack.capacity() * ID_SIZE + + self.scratch_state_builder.capacity() + // Heap memory used by 'State' in both 'states' and 'states_to_id'. + + self.memory_usage_state + } +} + +/// Keeps track of the progress of the current search. +/// +/// This is updated via the `Cache::search_{start,update,finish}` APIs to +/// record how many bytes have been searched. This permits computing a +/// heuristic that represents the efficiency of a cache, and thus helps inform +/// whether the lazy DFA should give up or not. +#[derive(Clone, Debug)] +struct SearchProgress { + start: usize, + at: usize, +} + +impl SearchProgress { + /// Returns the length, in bytes, of this search so far. + /// + /// This automatically handles the case of a reverse search, where `at` + /// is likely to be less than `start`. + fn len(&self) -> usize { + if self.start <= self.at { + self.at - self.start + } else { + self.start - self.at + } + } +} + +/// A map from states to state identifiers. When using std, we use a standard +/// hashmap, since it's a bit faster for this use case. (Other maps, like +/// one's based on FNV, have not yet been benchmarked.) +/// +/// The main purpose of this map is to reuse states where possible. This won't +/// fully minimize the DFA, but it works well in a lot of cases. +#[cfg(feature = "std")] +type StateMap = std::collections::HashMap<State, LazyStateID>; +#[cfg(not(feature = "std"))] +type StateMap = alloc::collections::BTreeMap<State, LazyStateID>; + +/// A type that groups methods that require the base NFA/DFA and writable +/// access to the cache. +#[derive(Debug)] +struct Lazy<'i, 'c> { + dfa: &'i DFA, + cache: &'c mut Cache, +} + +impl<'i, 'c> Lazy<'i, 'c> { + /// Creates a new 'Lazy' wrapper for a DFA and its corresponding cache. + fn new(dfa: &'i DFA, cache: &'c mut Cache) -> Lazy<'i, 'c> { + Lazy { dfa, cache } + } + + /// Return an immutable view by downgrading a writable cache to a read-only + /// cache. + fn as_ref<'a>(&'a self) -> LazyRef<'i, 'a> { + LazyRef::new(self.dfa, self.cache) + } + + /// This is marked as 'inline(never)' to avoid bloating methods on 'DFA' + /// like 'next_state' and 'next_eoi_state' that are called in critical + /// areas. The idea is to let the optimizer focus on the other areas of + /// those methods as the hot path. + /// + /// Here's an example that justifies 'inline(never)' + /// + /// ```ignore + /// regex-cli find hybrid dfa \ + /// @all-codepoints-utf8-100x '\pL{100}' --cache-capacity 10000000 + /// ``` + /// + /// Where 'all-codepoints-utf8-100x' is the UTF-8 encoding of every + /// codepoint, in sequence, repeated 100 times. + /// + /// With 'inline(never)' hyperfine reports 1.1s per run. With + /// 'inline(always)', hyperfine reports 1.23s. So that's a 10% improvement. + #[cold] + #[inline(never)] + fn cache_next_state( + &mut self, + mut current: LazyStateID, + unit: alphabet::Unit, + ) -> Result<LazyStateID, CacheError> { + let stride2 = self.dfa.stride2(); + let empty_builder = self.get_state_builder(); + let builder = determinize::next( + self.dfa.get_nfa(), + self.dfa.get_config().get_match_kind(), + &mut self.cache.sparses, + &mut self.cache.stack, + &self.cache.states[current.as_usize_untagged() >> stride2], + unit, + empty_builder, + ); + let save_state = !self.as_ref().state_builder_fits_in_cache(&builder); + if save_state { + self.save_state(current); + } + let next = self.add_builder_state(builder, |sid| sid)?; + if save_state { + current = self.saved_state_id(); + } + // This is the payoff. The next time 'next_state' is called with this + // state and alphabet unit, it will find this transition and avoid + // having to re-determinize this transition. + self.set_transition(current, unit, next); + Ok(next) + } + + /// Compute and cache the starting state for the given pattern ID (if + /// present) and the starting configuration. + /// + /// This panics if a pattern ID is given and the DFA isn't configured to + /// build anchored start states for each pattern. + /// + /// This will never return an unknown lazy state ID. + /// + /// If caching this state would otherwise result in a cache that has been + /// cleared too many times, then an error is returned. + #[cold] + #[inline(never)] + fn cache_start_group( + &mut self, + input: &Input<'_>, + start: Start, + ) -> Result<LazyStateID, MatchError> { + let mode = input.get_anchored(); + let nfa_start_id = match mode { + Anchored::No => self.dfa.get_nfa().start_unanchored(), + Anchored::Yes => self.dfa.get_nfa().start_anchored(), + Anchored::Pattern(pid) => { + if !self.dfa.get_config().get_starts_for_each_pattern() { + return Err(MatchError::unsupported_anchored(mode)); + } + match self.dfa.get_nfa().start_pattern(pid) { + None => return Ok(self.as_ref().dead_id()), + Some(sid) => sid, + } + } + }; + + let id = self + .cache_start_one(nfa_start_id, start) + .map_err(|_| MatchError::gave_up(input.start()))?; + self.set_start_state(input, start, id); + Ok(id) + } + + /// Compute and cache the starting state for the given NFA state ID and the + /// starting configuration. The NFA state ID might be one of the following: + /// + /// 1) An unanchored start state to match any pattern. + /// 2) An anchored start state to match any pattern. + /// 3) An anchored start state for a particular pattern. + /// + /// This will never return an unknown lazy state ID. + /// + /// If caching this state would otherwise result in a cache that has been + /// cleared too many times, then an error is returned. + fn cache_start_one( + &mut self, + nfa_start_id: NFAStateID, + start: Start, + ) -> Result<LazyStateID, CacheError> { + let mut builder_matches = self.get_state_builder().into_matches(); + determinize::set_lookbehind_from_start( + self.dfa.get_nfa(), + &start, + &mut builder_matches, + ); + self.cache.sparses.set1.clear(); + determinize::epsilon_closure( + self.dfa.get_nfa(), + nfa_start_id, + builder_matches.look_have(), + &mut self.cache.stack, + &mut self.cache.sparses.set1, + ); + let mut builder = builder_matches.into_nfa(); + determinize::add_nfa_states( + &self.dfa.get_nfa(), + &self.cache.sparses.set1, + &mut builder, + ); + let tag_starts = self.dfa.get_config().get_specialize_start_states(); + self.add_builder_state(builder, |id| { + if tag_starts { + id.to_start() + } else { + id + } + }) + } + + /// Either add the given builder state to this cache, or return an ID to an + /// equivalent state already in this cache. + /// + /// In the case where no equivalent state exists, the idmap function given + /// may be used to transform the identifier allocated. This is useful if + /// the caller needs to tag the ID with additional information. + /// + /// This will never return an unknown lazy state ID. + /// + /// If caching this state would otherwise result in a cache that has been + /// cleared too many times, then an error is returned. + fn add_builder_state( + &mut self, + builder: StateBuilderNFA, + idmap: impl Fn(LazyStateID) -> LazyStateID, + ) -> Result<LazyStateID, CacheError> { + if let Some(&cached_id) = + self.cache.states_to_id.get(builder.as_bytes()) + { + // Since we have a cached state, put the constructed state's + // memory back into our scratch space, so that it can be reused. + self.put_state_builder(builder); + return Ok(cached_id); + } + let result = self.add_state(builder.to_state(), idmap); + self.put_state_builder(builder); + result + } + + /// Allocate a new state ID and add the given state to this cache. + /// + /// The idmap function given may be used to transform the identifier + /// allocated. This is useful if the caller needs to tag the ID with + /// additional information. + /// + /// This will never return an unknown lazy state ID. + /// + /// If caching this state would otherwise result in a cache that has been + /// cleared too many times, then an error is returned. + fn add_state( + &mut self, + state: State, + idmap: impl Fn(LazyStateID) -> LazyStateID, + ) -> Result<LazyStateID, CacheError> { + if !self.as_ref().state_fits_in_cache(&state) { + self.try_clear_cache()?; + } + // It's important for this to come second, since the above may clear + // the cache. If we clear the cache after ID generation, then the ID + // is likely bunk since it would have been generated based on a larger + // transition table. + let mut id = idmap(self.next_state_id()?); + if state.is_match() { + id = id.to_match(); + } + // Add room in the transition table. Since this is a fresh state, all + // of its transitions are unknown. + self.cache.trans.extend( + iter::repeat(self.as_ref().unknown_id()).take(self.dfa.stride()), + ); + // When we add a sentinel state, we never want to set any quit + // transitions. Technically, this is harmless, since sentinel states + // have all of their transitions set to loop back to themselves. But + // when creating sentinel states before the quit sentinel state, + // this will try to call 'set_transition' on a state ID that doesn't + // actually exist yet, which isn't allowed. So we just skip doing so + // entirely. + if !self.dfa.quitset.is_empty() && !self.as_ref().is_sentinel(id) { + let quit_id = self.as_ref().quit_id(); + for b in self.dfa.quitset.iter() { + self.set_transition(id, alphabet::Unit::u8(b), quit_id); + } + } + self.cache.memory_usage_state += state.memory_usage(); + self.cache.states.push(state.clone()); + self.cache.states_to_id.insert(state, id); + Ok(id) + } + + /// Allocate a new state ID. + /// + /// This will never return an unknown lazy state ID. + /// + /// If caching this state would otherwise result in a cache that has been + /// cleared too many times, then an error is returned. + fn next_state_id(&mut self) -> Result<LazyStateID, CacheError> { + let sid = match LazyStateID::new(self.cache.trans.len()) { + Ok(sid) => sid, + Err(_) => { + self.try_clear_cache()?; + // This has to pass since we check that ID capacity at + // construction time can fit at least MIN_STATES states. + LazyStateID::new(self.cache.trans.len()).unwrap() + } + }; + Ok(sid) + } + + /// Attempt to clear the cache used by this lazy DFA. + /// + /// If clearing the cache exceeds the minimum number of required cache + /// clearings, then this will return a cache error. In this case, + /// callers should bubble this up as the cache can't be used until it is + /// reset. Implementations of search should convert this error into a + /// [`MatchError::gave_up`]. + /// + /// If 'self.state_saver' is set to save a state, then this state is + /// persisted through cache clearing. Otherwise, the cache is returned to + /// its state after initialization with two exceptions: its clear count + /// is incremented and some of its memory likely has additional capacity. + /// That is, clearing a cache does _not_ release memory. + /// + /// Otherwise, any lazy state ID generated by the cache prior to resetting + /// it is invalid after the reset. + fn try_clear_cache(&mut self) -> Result<(), CacheError> { + let c = self.dfa.get_config(); + if let Some(min_count) = c.get_minimum_cache_clear_count() { + if self.cache.clear_count >= min_count { + if let Some(min_bytes_per) = c.get_minimum_bytes_per_state() { + let len = self.cache.search_total_len(); + let min_bytes = + min_bytes_per.saturating_mul(self.cache.states.len()); + // If we've searched 0 bytes then probably something has + // gone wrong and the lazy DFA search implementation isn't + // correctly updating the search progress state. + if len == 0 { + trace!( + "number of bytes searched is 0, but \ + a minimum bytes per state searched ({}) is \ + enabled, maybe Cache::search_update \ + is not being used?", + min_bytes_per, + ); + } + if len < min_bytes { + trace!( + "lazy DFA cache has been cleared {} times, \ + which exceeds the limit of {}, \ + AND its bytes searched per state is less \ + than the configured minimum of {}, \ + therefore lazy DFA is giving up \ + (bytes searched since cache clear = {}, \ + number of states = {})", + self.cache.clear_count, + min_count, + min_bytes_per, + len, + self.cache.states.len(), + ); + return Err(CacheError::bad_efficiency()); + } else { + trace!( + "lazy DFA cache has been cleared {} times, \ + which exceeds the limit of {}, \ + AND its bytes searched per state is greater \ + than the configured minimum of {}, \ + therefore lazy DFA is continuing! \ + (bytes searched since cache clear = {}, \ + number of states = {})", + self.cache.clear_count, + min_count, + min_bytes_per, + len, + self.cache.states.len(), + ); + } + } else { + trace!( + "lazy DFA cache has been cleared {} times, \ + which exceeds the limit of {}, \ + since there is no configured bytes per state \ + minimum, lazy DFA is giving up", + self.cache.clear_count, + min_count, + ); + return Err(CacheError::too_many_cache_clears()); + } + } + } + self.clear_cache(); + Ok(()) + } + + /// Clears _and_ resets the cache. Resetting the cache means that no + /// states are persisted and the clear count is reset to 0. No heap memory + /// is released. + /// + /// Note that the caller may reset a cache with a different DFA than what + /// it was created from. In which case, the cache can now be used with the + /// new DFA (and not the old DFA). + fn reset_cache(&mut self) { + self.cache.state_saver = StateSaver::none(); + self.clear_cache(); + // If a new DFA is used, it might have a different number of NFA + // states, so we need to make sure our sparse sets have the appropriate + // size. + self.cache.sparses.resize(self.dfa.get_nfa().states().len()); + self.cache.clear_count = 0; + self.cache.progress = None; + } + + /// Clear the cache used by this lazy DFA. + /// + /// If 'self.state_saver' is set to save a state, then this state is + /// persisted through cache clearing. Otherwise, the cache is returned to + /// its state after initialization with two exceptions: its clear count + /// is incremented and some of its memory likely has additional capacity. + /// That is, clearing a cache does _not_ release memory. + /// + /// Otherwise, any lazy state ID generated by the cache prior to resetting + /// it is invalid after the reset. + fn clear_cache(&mut self) { + self.cache.trans.clear(); + self.cache.starts.clear(); + self.cache.states.clear(); + self.cache.states_to_id.clear(); + self.cache.memory_usage_state = 0; + self.cache.clear_count += 1; + self.cache.bytes_searched = 0; + if let Some(ref mut progress) = self.cache.progress { + progress.start = progress.at; + } + trace!( + "lazy DFA cache has been cleared (count: {})", + self.cache.clear_count + ); + self.init_cache(); + // If the state we want to save is one of the sentinel + // (unknown/dead/quit) states, then 'init_cache' adds those back, and + // their identifier values remains invariant. So there's no need to add + // it again. (And indeed, doing so would be incorrect!) + if let Some((old_id, state)) = self.cache.state_saver.take_to_save() { + // If the state is one of the special sentinel states, then it is + // automatically added by cache initialization and its ID always + // remains the same. With that said, this should never occur since + // the sentinel states are all loop states back to themselves. So + // we should never be in a position where we're attempting to save + // a sentinel state since we never compute transitions out of a + // sentinel state. + assert!( + !self.as_ref().is_sentinel(old_id), + "cannot save sentinel state" + ); + let new_id = self + .add_state(state, |id| { + if old_id.is_start() { + // We don't need to consult the + // 'specialize_start_states' config knob here, because + // if it's disabled, old_id.is_start() will never + // return true. + id.to_start() + } else { + id + } + }) + // The unwrap here is OK because lazy DFA creation ensures that + // we have room in the cache to add MIN_STATES states. Since + // 'init_cache' above adds 3, this adds a 4th. + .expect("adding one state after cache clear must work"); + self.cache.state_saver = StateSaver::Saved(new_id); + } + } + + /// Initialize this cache from emptiness to a place where it can be used + /// for search. + /// + /// This is called both at cache creation time and after the cache has been + /// cleared. + /// + /// Primarily, this adds the three sentinel states and allocates some + /// initial memory. + fn init_cache(&mut self) { + // Why multiply by 2 here? Because we make room for both the unanchored + // and anchored start states. Unanchored is first and then anchored. + let mut starts_len = Start::len().checked_mul(2).unwrap(); + // ... but if we also want start states for every pattern, we make room + // for that too. + if self.dfa.get_config().get_starts_for_each_pattern() { + starts_len += Start::len() * self.dfa.pattern_len(); + } + self.cache + .starts + .extend(iter::repeat(self.as_ref().unknown_id()).take(starts_len)); + // This is the set of NFA states that corresponds to each of our three + // sentinel states: the empty set. + let dead = State::dead(); + // This sets up some states that we use as sentinels that are present + // in every DFA. While it would be technically possible to implement + // this DFA without explicitly putting these states in the transition + // table, this is convenient to do to make `next_state` correct for all + // valid state IDs without needing explicit conditionals to special + // case these sentinel states. + // + // All three of these states are "dead" states. That is, all of + // them transition only to themselves. So once you enter one of + // these states, it's impossible to leave them. Thus, any correct + // search routine must explicitly check for these state types. (Sans + // `unknown`, since that is only used internally to represent missing + // states.) + let unk_id = + self.add_state(dead.clone(), |id| id.to_unknown()).unwrap(); + let dead_id = self.add_state(dead.clone(), |id| id.to_dead()).unwrap(); + let quit_id = self.add_state(dead.clone(), |id| id.to_quit()).unwrap(); + assert_eq!(unk_id, self.as_ref().unknown_id()); + assert_eq!(dead_id, self.as_ref().dead_id()); + assert_eq!(quit_id, self.as_ref().quit_id()); + // The idea here is that if you start in an unknown/dead/quit state and + // try to transition on them, then you should end up where you started. + self.set_all_transitions(unk_id, unk_id); + self.set_all_transitions(dead_id, dead_id); + self.set_all_transitions(quit_id, quit_id); + // All of these states are technically equivalent from the FSM + // perspective, so putting all three of them in the cache isn't + // possible. (They are distinct merely because we use their + // identifiers as sentinels to mean something, as indicated by the + // names.) Moreover, we wouldn't want to do that. Unknown and quit + // states are special in that they are artificial constructions + // this implementation. But dead states are a natural part of + // determinization. When you reach a point in the NFA where you cannot + // go anywhere else, a dead state will naturally arise and we MUST + // reuse the canonical dead state that we've created here. Why? Because + // it is the state ID that tells the search routine whether a state is + // dead or not, and thus, whether to stop the search. Having a bunch of + // distinct dead states would be quite wasteful! + self.cache.states_to_id.insert(dead, dead_id); + } + + /// Save the state corresponding to the ID given such that the state + /// persists through a cache clearing. + /// + /// While the state may persist, the ID may not. In order to discover the + /// new state ID, one must call 'saved_state_id' after a cache clearing. + fn save_state(&mut self, id: LazyStateID) { + let state = self.as_ref().get_cached_state(id).clone(); + self.cache.state_saver = StateSaver::ToSave { id, state }; + } + + /// Returns the updated lazy state ID for a state that was persisted + /// through a cache clearing. + /// + /// It is only correct to call this routine when both a state has been + /// saved and the cache has just been cleared. Otherwise, this panics. + fn saved_state_id(&mut self) -> LazyStateID { + self.cache + .state_saver + .take_saved() + .expect("state saver does not have saved state ID") + } + + /// Set all transitions on the state 'from' to 'to'. + fn set_all_transitions(&mut self, from: LazyStateID, to: LazyStateID) { + for unit in self.dfa.classes.representatives(..) { + self.set_transition(from, unit, to); + } + } + + /// Set the transition on 'from' for 'unit' to 'to'. + /// + /// This panics if either 'from' or 'to' is invalid. + /// + /// All unit values are OK. + fn set_transition( + &mut self, + from: LazyStateID, + unit: alphabet::Unit, + to: LazyStateID, + ) { + assert!(self.as_ref().is_valid(from), "invalid 'from' id: {:?}", from); + assert!(self.as_ref().is_valid(to), "invalid 'to' id: {:?}", to); + let offset = + from.as_usize_untagged() + self.dfa.classes.get_by_unit(unit); + self.cache.trans[offset] = to; + } + + /// Set the start ID for the given pattern ID (if given) and starting + /// configuration to the ID given. + /// + /// This panics if 'id' is not valid or if a pattern ID is given and + /// 'starts_for_each_pattern' is not enabled. + fn set_start_state( + &mut self, + input: &Input<'_>, + start: Start, + id: LazyStateID, + ) { + assert!(self.as_ref().is_valid(id)); + let start_index = start.as_usize(); + let index = match input.get_anchored() { + Anchored::No => start_index, + Anchored::Yes => Start::len() + start_index, + Anchored::Pattern(pid) => { + assert!( + self.dfa.get_config().get_starts_for_each_pattern(), + "attempted to search for a specific pattern \ + without enabling starts_for_each_pattern", + ); + let pid = pid.as_usize(); + (2 * Start::len()) + (Start::len() * pid) + start_index + } + }; + self.cache.starts[index] = id; + } + + /// Returns a state builder from this DFA that might have existing + /// capacity. This helps avoid allocs in cases where a state is built that + /// turns out to already be cached. + /// + /// Callers must put the state builder back with 'put_state_builder', + /// otherwise the allocation reuse won't work. + fn get_state_builder(&mut self) -> StateBuilderEmpty { + core::mem::replace( + &mut self.cache.scratch_state_builder, + StateBuilderEmpty::new(), + ) + } + + /// Puts the given state builder back into this DFA for reuse. + /// + /// Note that building a 'State' from a builder always creates a new alloc, + /// so callers should always put the builder back. + fn put_state_builder(&mut self, builder: StateBuilderNFA) { + let _ = core::mem::replace( + &mut self.cache.scratch_state_builder, + builder.clear(), + ); + } +} + +/// A type that groups methods that require the base NFA/DFA and read-only +/// access to the cache. +#[derive(Debug)] +struct LazyRef<'i, 'c> { + dfa: &'i DFA, + cache: &'c Cache, +} + +impl<'i, 'c> LazyRef<'i, 'c> { + /// Creates a new 'Lazy' wrapper for a DFA and its corresponding cache. + fn new(dfa: &'i DFA, cache: &'c Cache) -> LazyRef<'i, 'c> { + LazyRef { dfa, cache } + } + + /// Return the ID of the start state for the given configuration. + /// + /// If the start state has not yet been computed, then this returns an + /// unknown lazy state ID. + #[cfg_attr(feature = "perf-inline", inline(always))] + fn get_cached_start_id( + &self, + input: &Input<'_>, + start: Start, + ) -> Result<LazyStateID, MatchError> { + let start_index = start.as_usize(); + let mode = input.get_anchored(); + let index = match mode { + Anchored::No => start_index, + Anchored::Yes => Start::len() + start_index, + Anchored::Pattern(pid) => { + if !self.dfa.get_config().get_starts_for_each_pattern() { + return Err(MatchError::unsupported_anchored(mode)); + } + if pid.as_usize() >= self.dfa.pattern_len() { + return Ok(self.dead_id()); + } + (2 * Start::len()) + + (Start::len() * pid.as_usize()) + + start_index + } + }; + Ok(self.cache.starts[index]) + } + + /// Return the cached NFA/DFA powerset state for the given ID. + /// + /// This panics if the given ID does not address a valid state. + fn get_cached_state(&self, sid: LazyStateID) -> &State { + let index = sid.as_usize_untagged() >> self.dfa.stride2(); + &self.cache.states[index] + } + + /// Returns true if and only if the given ID corresponds to a "sentinel" + /// state. + /// + /// A sentinel state is a state that signifies a special condition of + /// search, and where every transition maps back to itself. See LazyStateID + /// for more details. Note that start and match states are _not_ sentinels + /// since they may otherwise be real states with non-trivial transitions. + /// The purposes of sentinel states is purely to indicate something. Their + /// transitions are not meant to be followed. + fn is_sentinel(&self, id: LazyStateID) -> bool { + id == self.unknown_id() || id == self.dead_id() || id == self.quit_id() + } + + /// Returns the ID of the unknown state for this lazy DFA. + fn unknown_id(&self) -> LazyStateID { + // This unwrap is OK since 0 is always a valid state ID. + LazyStateID::new(0).unwrap().to_unknown() + } + + /// Returns the ID of the dead state for this lazy DFA. + fn dead_id(&self) -> LazyStateID { + // This unwrap is OK since the maximum value here is 1 * 512 = 512, + // which is <= 2047 (the maximum state ID on 16-bit systems). Where + // 512 is the worst case for our equivalence classes (every byte is a + // distinct class). + LazyStateID::new(1 << self.dfa.stride2()).unwrap().to_dead() + } + + /// Returns the ID of the quit state for this lazy DFA. + fn quit_id(&self) -> LazyStateID { + // This unwrap is OK since the maximum value here is 2 * 512 = 1024, + // which is <= 2047 (the maximum state ID on 16-bit systems). Where + // 512 is the worst case for our equivalence classes (every byte is a + // distinct class). + LazyStateID::new(2 << self.dfa.stride2()).unwrap().to_quit() + } + + /// Returns true if and only if the given ID is valid. + /// + /// An ID is valid if it is both a valid index into the transition table + /// and is a multiple of the DFA's stride. + fn is_valid(&self, id: LazyStateID) -> bool { + let id = id.as_usize_untagged(); + id < self.cache.trans.len() && id % self.dfa.stride() == 0 + } + + /// Returns true if adding the state given would fit in this cache. + fn state_fits_in_cache(&self, state: &State) -> bool { + let needed = self.cache.memory_usage() + + self.memory_usage_for_one_more_state(state.memory_usage()); + trace!( + "lazy DFA cache capacity check: {:?} ?<=? {:?}", + needed, + self.dfa.cache_capacity + ); + needed <= self.dfa.cache_capacity + } + + /// Returns true if adding the state to be built by the given builder would + /// fit in this cache. + fn state_builder_fits_in_cache(&self, state: &StateBuilderNFA) -> bool { + let needed = self.cache.memory_usage() + + self.memory_usage_for_one_more_state(state.as_bytes().len()); + needed <= self.dfa.cache_capacity + } + + /// Returns the additional memory usage, in bytes, required to add one more + /// state to this cache. The given size should be the heap size, in bytes, + /// that would be used by the new state being added. + fn memory_usage_for_one_more_state( + &self, + state_heap_size: usize, + ) -> usize { + const ID_SIZE: usize = size_of::<LazyStateID>(); + const STATE_SIZE: usize = size_of::<State>(); + + self.dfa.stride() * ID_SIZE // additional space needed in trans table + + STATE_SIZE // space in cache.states + + (STATE_SIZE + ID_SIZE) // space in cache.states_to_id + + state_heap_size // heap memory used by state itself + } +} + +/// A simple type that encapsulates the saving of a state ID through a cache +/// clearing. +/// +/// A state ID can be marked for saving with ToSave, while a state ID can be +/// saved itself with Saved. +#[derive(Clone, Debug)] +enum StateSaver { + /// An empty state saver. In this case, no states (other than the special + /// sentinel states) are preserved after clearing the cache. + None, + /// An ID of a state (and the state itself) that should be preserved after + /// the lazy DFA's cache has been cleared. After clearing, the updated ID + /// is stored in 'Saved' since it may have changed. + ToSave { id: LazyStateID, state: State }, + /// An ID that of a state that has been persisted through a lazy DFA + /// cache clearing. The ID recorded here corresponds to an ID that was + /// once marked as ToSave. The IDs are likely not equivalent even though + /// the states they point to are. + Saved(LazyStateID), +} + +impl StateSaver { + /// Create an empty state saver. + fn none() -> StateSaver { + StateSaver::None + } + + /// Replace this state saver with an empty saver, and if this saver is a + /// request to save a state, return that request. + fn take_to_save(&mut self) -> Option<(LazyStateID, State)> { + match core::mem::replace(self, StateSaver::None) { + StateSaver::None | StateSaver::Saved(_) => None, + StateSaver::ToSave { id, state } => Some((id, state)), + } + } + + /// Replace this state saver with an empty saver, and if this saver is a + /// saved state (or a request to save a state), return that state's ID. + /// + /// The idea here is that a request to save a state isn't necessarily + /// honored because it might not be needed. e.g., Some higher level code + /// might request a state to be saved on the off chance that the cache gets + /// cleared when a new state is added at a lower level. But if that new + /// state is never added, then the cache is never cleared and the state and + /// its ID remain unchanged. + fn take_saved(&mut self) -> Option<LazyStateID> { + match core::mem::replace(self, StateSaver::None) { + StateSaver::None => None, + StateSaver::Saved(id) | StateSaver::ToSave { id, .. } => Some(id), + } + } +} + +/// The configuration used for building a lazy DFA. +/// +/// As a convenience, [`DFA::config`] is an alias for [`Config::new`]. The +/// advantage of the former is that it often lets you avoid importing the +/// `Config` type directly. +/// +/// A lazy DFA configuration is a simple data object that is typically used +/// with [`Builder::configure`]. +/// +/// The default configuration guarantees that a search will never return a +/// "gave up" or "quit" error, although it is possible for a search to fail +/// if [`Config::starts_for_each_pattern`] wasn't enabled (which it is not by +/// default) and an [`Anchored::Pattern`] mode is requested via [`Input`]. +#[derive(Clone, Debug, Default)] +pub struct Config { + // As with other configuration types in this crate, we put all our knobs + // in options so that we can distinguish between "default" and "not set." + // This makes it possible to easily combine multiple configurations + // without default values overwriting explicitly specified values. See the + // 'overwrite' method. + // + // For docs on the fields below, see the corresponding method setters. + match_kind: Option<MatchKind>, + pre: Option<Option<Prefilter>>, + starts_for_each_pattern: Option<bool>, + byte_classes: Option<bool>, + unicode_word_boundary: Option<bool>, + quitset: Option<ByteSet>, + specialize_start_states: Option<bool>, + cache_capacity: Option<usize>, + skip_cache_capacity_check: Option<bool>, + minimum_cache_clear_count: Option<Option<usize>>, + minimum_bytes_per_state: Option<Option<usize>>, +} + +impl Config { + /// Return a new default lazy DFA builder configuration. + pub fn new() -> Config { + Config::default() + } + + /// Set the desired match semantics. + /// + /// The default is [`MatchKind::LeftmostFirst`], which corresponds to the + /// match semantics of Perl-like regex engines. That is, when multiple + /// patterns would match at the same leftmost position, the pattern that + /// appears first in the concrete syntax is chosen. + /// + /// Currently, the only other kind of match semantics supported is + /// [`MatchKind::All`]. This corresponds to classical DFA construction + /// where all possible matches are added to the lazy DFA. + /// + /// Typically, `All` is used when one wants to execute an overlapping + /// search and `LeftmostFirst` otherwise. In particular, it rarely makes + /// sense to use `All` with the various "leftmost" find routines, since the + /// leftmost routines depend on the `LeftmostFirst` automata construction + /// strategy. Specifically, `LeftmostFirst` adds dead states to the + /// lazy DFA as a way to terminate the search and report a match. + /// `LeftmostFirst` also supports non-greedy matches using this strategy + /// where as `All` does not. + /// + /// # Example: overlapping search + /// + /// This example shows the typical use of `MatchKind::All`, which is to + /// report overlapping matches. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{ + /// hybrid::dfa::{DFA, OverlappingState}, + /// HalfMatch, Input, MatchKind, + /// }; + /// + /// let dfa = DFA::builder() + /// .configure(DFA::config().match_kind(MatchKind::All)) + /// .build_many(&[r"\w+$", r"\S+$"])?; + /// let mut cache = dfa.create_cache(); + /// let haystack = "@foo"; + /// let mut state = OverlappingState::start(); + /// + /// let expected = Some(HalfMatch::must(1, 4)); + /// dfa.try_search_overlapping_fwd( + /// &mut cache, &Input::new(haystack), &mut state, + /// )?; + /// assert_eq!(expected, state.get_match()); + /// + /// // The first pattern also matches at the same position, so re-running + /// // the search will yield another match. Notice also that the first + /// // pattern is returned after the second. This is because the second + /// // pattern begins its match before the first, is therefore an earlier + /// // match and is thus reported first. + /// let expected = Some(HalfMatch::must(0, 4)); + /// dfa.try_search_overlapping_fwd( + /// &mut cache, &Input::new(haystack), &mut state, + /// )?; + /// assert_eq!(expected, state.get_match()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// # Example: reverse automaton to find start of match + /// + /// Another example for using `MatchKind::All` is for constructing a + /// reverse automaton to find the start of a match. `All` semantics are + /// used for this in order to find the longest possible match, which + /// corresponds to the leftmost starting position. + /// + /// Note that if you need the starting position then + /// [`hybrid::regex::Regex`](crate::hybrid::regex::Regex) will handle this + /// for you, so it's usually not necessary to do this yourself. + /// + /// ``` + /// use regex_automata::{ + /// hybrid::dfa::DFA, + /// nfa::thompson::NFA, + /// Anchored, HalfMatch, Input, MatchKind, + /// }; + /// + /// let input = Input::new("123foobar456"); + /// let pattern = r"[a-z]+r"; + /// + /// let dfa_fwd = DFA::new(pattern)?; + /// let dfa_rev = DFA::builder() + /// .thompson(NFA::config().reverse(true)) + /// .configure(DFA::config().match_kind(MatchKind::All)) + /// .build(pattern)?; + /// let mut cache_fwd = dfa_fwd.create_cache(); + /// let mut cache_rev = dfa_rev.create_cache(); + /// + /// let expected_fwd = HalfMatch::must(0, 9); + /// let expected_rev = HalfMatch::must(0, 3); + /// let got_fwd = dfa_fwd.try_search_fwd(&mut cache_fwd, &input)?.unwrap(); + /// // Here we don't specify the pattern to search for since there's only + /// // one pattern and we're doing a leftmost search. But if this were an + /// // overlapping search, you'd need to specify the pattern that matched + /// // in the forward direction. (Otherwise, you might wind up finding the + /// // starting position of a match of some other pattern.) That in turn + /// // requires building the reverse automaton with starts_for_each_pattern + /// // enabled. + /// let input = input + /// .clone() + /// .range(..got_fwd.offset()) + /// .anchored(Anchored::Yes); + /// let got_rev = dfa_rev.try_search_rev(&mut cache_rev, &input)?.unwrap(); + /// assert_eq!(expected_fwd, got_fwd); + /// assert_eq!(expected_rev, got_rev); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn match_kind(mut self, kind: MatchKind) -> Config { + self.match_kind = Some(kind); + self + } + + /// Set a prefilter to be used whenever a start state is entered. + /// + /// A [`Prefilter`] in this context is meant to accelerate searches by + /// looking for literal prefixes that every match for the corresponding + /// pattern (or patterns) must start with. Once a prefilter produces a + /// match, the underlying search routine continues on to try and confirm + /// the match. + /// + /// Be warned that setting a prefilter does not guarantee that the search + /// will be faster. While it's usually a good bet, if the prefilter + /// produces a lot of false positive candidates (i.e., positions matched + /// by the prefilter but not by the regex), then the overall result can + /// be slower than if you had just executed the regex engine without any + /// prefilters. + /// + /// Note that unless [`Config::specialize_start_states`] has been + /// explicitly set, then setting this will also enable (when `pre` is + /// `Some`) or disable (when `pre` is `None`) start state specialization. + /// This occurs because without start state specialization, a prefilter + /// is likely to be less effective. And without a prefilter, start state + /// specialization is usually pointless. + /// + /// By default no prefilter is set. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{ + /// hybrid::dfa::DFA, + /// util::prefilter::Prefilter, + /// Input, HalfMatch, MatchKind, + /// }; + /// + /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["foo", "bar"]); + /// let re = DFA::builder() + /// .configure(DFA::config().prefilter(pre)) + /// .build(r"(foo|bar)[a-z]+")?; + /// let mut cache = re.create_cache(); + /// let input = Input::new("foo1 barfox bar"); + /// assert_eq!( + /// Some(HalfMatch::must(0, 11)), + /// re.try_search_fwd(&mut cache, &input)?, + /// ); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// Be warned though that an incorrect prefilter can lead to incorrect + /// results! + /// + /// ``` + /// use regex_automata::{ + /// hybrid::dfa::DFA, + /// util::prefilter::Prefilter, + /// Input, HalfMatch, MatchKind, + /// }; + /// + /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["foo", "car"]); + /// let re = DFA::builder() + /// .configure(DFA::config().prefilter(pre)) + /// .build(r"(foo|bar)[a-z]+")?; + /// let mut cache = re.create_cache(); + /// let input = Input::new("foo1 barfox bar"); + /// assert_eq!( + /// // No match reported even though there clearly is one! + /// None, + /// re.try_search_fwd(&mut cache, &input)?, + /// ); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn prefilter(mut self, pre: Option<Prefilter>) -> Config { + self.pre = Some(pre); + if self.specialize_start_states.is_none() { + self.specialize_start_states = + Some(self.get_prefilter().is_some()); + } + self + } + + /// Whether to compile a separate start state for each pattern in the + /// lazy DFA. + /// + /// When enabled, a separate **anchored** start state is added for each + /// pattern in the lazy DFA. When this start state is used, then the DFA + /// will only search for matches for the pattern specified, even if there + /// are other patterns in the DFA. + /// + /// The main downside of this option is that it can potentially increase + /// the size of the DFA and/or increase the time it takes to build the + /// DFA at search time. However, since this is configuration for a lazy + /// DFA, these states aren't actually built unless they're used. Enabling + /// this isn't necessarily free, however, as it may result in higher cache + /// usage. + /// + /// There are a few reasons one might want to enable this (it's disabled + /// by default): + /// + /// 1. When looking for the start of an overlapping match (using a reverse + /// DFA), doing it correctly requires starting the reverse search using the + /// starting state of the pattern that matched in the forward direction. + /// Indeed, when building a [`Regex`](crate::hybrid::regex::Regex), it + /// will automatically enable this option when building the reverse DFA + /// internally. + /// 2. When you want to use a DFA with multiple patterns to both search + /// for matches of any pattern or to search for anchored matches of one + /// particular pattern while using the same DFA. (Otherwise, you would need + /// to compile a new DFA for each pattern.) + /// + /// By default this is disabled. + /// + /// # Example + /// + /// This example shows how to use this option to permit the same lazy DFA + /// to run both general searches for any pattern and anchored searches for + /// a specific pattern. + /// + /// ``` + /// use regex_automata::{ + /// hybrid::dfa::DFA, + /// Anchored, HalfMatch, Input, PatternID, + /// }; + /// + /// let dfa = DFA::builder() + /// .configure(DFA::config().starts_for_each_pattern(true)) + /// .build_many(&[r"[a-z0-9]{6}", r"[a-z][a-z0-9]{5}"])?; + /// let mut cache = dfa.create_cache(); + /// let haystack = "bar foo123"; + /// + /// // Here's a normal unanchored search that looks for any pattern. + /// let expected = HalfMatch::must(0, 10); + /// let input = Input::new(haystack); + /// assert_eq!(Some(expected), dfa.try_search_fwd(&mut cache, &input)?); + /// // We can also do a normal anchored search for any pattern. Since it's + /// // an anchored search, we position the start of the search where we + /// // know the match will begin. + /// let expected = HalfMatch::must(0, 10); + /// let input = Input::new(haystack).range(4..); + /// assert_eq!(Some(expected), dfa.try_search_fwd(&mut cache, &input)?); + /// // Since we compiled anchored start states for each pattern, we can + /// // also look for matches of other patterns explicitly, even if a + /// // different pattern would have normally matched. + /// let expected = HalfMatch::must(1, 10); + /// let input = Input::new(haystack) + /// .range(4..) + /// .anchored(Anchored::Pattern(PatternID::must(1))); + /// assert_eq!(Some(expected), dfa.try_search_fwd(&mut cache, &input)?); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn starts_for_each_pattern(mut self, yes: bool) -> Config { + self.starts_for_each_pattern = Some(yes); + self + } + + /// Whether to attempt to shrink the size of the lazy DFA's alphabet or + /// not. + /// + /// This option is enabled by default and should never be disabled unless + /// one is debugging the lazy DFA. + /// + /// When enabled, the lazy DFA will use a map from all possible bytes + /// to their corresponding equivalence class. Each equivalence class + /// represents a set of bytes that does not discriminate between a match + /// and a non-match in the DFA. For example, the pattern `[ab]+` has at + /// least two equivalence classes: a set containing `a` and `b` and a set + /// containing every byte except for `a` and `b`. `a` and `b` are in the + /// same equivalence classes because they never discriminate between a + /// match and a non-match. + /// + /// The advantage of this map is that the size of the transition table + /// can be reduced drastically from `#states * 256 * sizeof(LazyStateID)` + /// to `#states * k * sizeof(LazyStateID)` where `k` is the number of + /// equivalence classes (rounded up to the nearest power of 2). As a + /// result, total space usage can decrease substantially. Moreover, since a + /// smaller alphabet is used, DFA compilation during search becomes faster + /// as well since it will potentially be able to reuse a single transition + /// for multiple bytes. + /// + /// **WARNING:** This is only useful for debugging lazy DFAs. Disabling + /// this does not yield any speed advantages. Namely, even when this is + /// disabled, a byte class map is still used while searching. The only + /// difference is that every byte will be forced into its own distinct + /// equivalence class. This is useful for debugging the actual generated + /// transitions because it lets one see the transitions defined on actual + /// bytes instead of the equivalence classes. + pub fn byte_classes(mut self, yes: bool) -> Config { + self.byte_classes = Some(yes); + self + } + + /// Heuristically enable Unicode word boundaries. + /// + /// When set, this will attempt to implement Unicode word boundaries as if + /// they were ASCII word boundaries. This only works when the search input + /// is ASCII only. If a non-ASCII byte is observed while searching, then a + /// [`MatchError::quit`] error is returned. + /// + /// A possible alternative to enabling this option is to simply use an + /// ASCII word boundary, e.g., via `(?-u:\b)`. The main reason to use this + /// option is if you absolutely need Unicode support. This option lets one + /// use a fast search implementation (a DFA) for some potentially very + /// common cases, while providing the option to fall back to some other + /// regex engine to handle the general case when an error is returned. + /// + /// If the pattern provided has no Unicode word boundary in it, then this + /// option has no effect. (That is, quitting on a non-ASCII byte only + /// occurs when this option is enabled _and_ a Unicode word boundary is + /// present in the pattern.) + /// + /// This is almost equivalent to setting all non-ASCII bytes to be quit + /// bytes. The only difference is that this will cause non-ASCII bytes to + /// be quit bytes _only_ when a Unicode word boundary is present in the + /// pattern. + /// + /// When enabling this option, callers _must_ be prepared to handle + /// a [`MatchError`](crate::MatchError) error during search. + /// When using a [`Regex`](crate::hybrid::regex::Regex), this + /// corresponds to using the `try_` suite of methods. Alternatively, + /// if callers can guarantee that their input is ASCII only, then a + /// [`MatchError::quit`] error will never be returned while searching. + /// + /// This is disabled by default. + /// + /// # Example + /// + /// This example shows how to heuristically enable Unicode word boundaries + /// in a pattern. It also shows what happens when a search comes across a + /// non-ASCII byte. + /// + /// ``` + /// use regex_automata::{ + /// hybrid::dfa::DFA, + /// HalfMatch, Input, MatchError, + /// }; + /// + /// let dfa = DFA::builder() + /// .configure(DFA::config().unicode_word_boundary(true)) + /// .build(r"\b[0-9]+\b")?; + /// let mut cache = dfa.create_cache(); + /// + /// // The match occurs before the search ever observes the snowman + /// // character, so no error occurs. + /// let haystack = "foo 123 ☃"; + /// let expected = Some(HalfMatch::must(0, 7)); + /// let got = dfa.try_search_fwd(&mut cache, &Input::new(haystack))?; + /// assert_eq!(expected, got); + /// + /// // Notice that this search fails, even though the snowman character + /// // occurs after the ending match offset. This is because search + /// // routines read one byte past the end of the search to account for + /// // look-around, and indeed, this is required here to determine whether + /// // the trailing \b matches. + /// let haystack = "foo 123 ☃"; + /// let expected = MatchError::quit(0xE2, 8); + /// let got = dfa.try_search_fwd(&mut cache, &Input::new(haystack)); + /// assert_eq!(Err(expected), got); + /// + /// // Another example is executing a search where the span of the haystack + /// // we specify is all ASCII, but there is non-ASCII just before it. This + /// // correctly also reports an error. + /// let input = Input::new("β123").range(2..); + /// let expected = MatchError::quit(0xB2, 1); + /// let got = dfa.try_search_fwd(&mut cache, &input); + /// assert_eq!(Err(expected), got); + /// + /// // And similarly for the trailing word boundary. + /// let input = Input::new("123β").range(..3); + /// let expected = MatchError::quit(0xCE, 3); + /// let got = dfa.try_search_fwd(&mut cache, &input); + /// assert_eq!(Err(expected), got); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn unicode_word_boundary(mut self, yes: bool) -> Config { + // We have a separate option for this instead of just setting the + // appropriate quit bytes here because we don't want to set quit bytes + // for every regex. We only want to set them when the regex contains a + // Unicode word boundary. + self.unicode_word_boundary = Some(yes); + self + } + + /// Add a "quit" byte to the lazy DFA. + /// + /// When a quit byte is seen during search time, then search will return a + /// [`MatchError::quit`] error indicating the offset at which the search + /// stopped. + /// + /// A quit byte will always overrule any other aspects of a regex. For + /// example, if the `x` byte is added as a quit byte and the regex `\w` is + /// used, then observing `x` will cause the search to quit immediately + /// despite the fact that `x` is in the `\w` class. + /// + /// This mechanism is primarily useful for heuristically enabling certain + /// features like Unicode word boundaries in a DFA. Namely, if the input + /// to search is ASCII, then a Unicode word boundary can be implemented + /// via an ASCII word boundary with no change in semantics. Thus, a DFA + /// can attempt to match a Unicode word boundary but give up as soon as it + /// observes a non-ASCII byte. Indeed, if callers set all non-ASCII bytes + /// to be quit bytes, then Unicode word boundaries will be permitted when + /// building lazy DFAs. Of course, callers should enable + /// [`Config::unicode_word_boundary`] if they want this behavior instead. + /// (The advantage being that non-ASCII quit bytes will only be added if a + /// Unicode word boundary is in the pattern.) + /// + /// When enabling this option, callers _must_ be prepared to handle a + /// [`MatchError`](crate::MatchError) error during search. When using a + /// [`Regex`](crate::hybrid::regex::Regex), this corresponds to using the + /// `try_` suite of methods. + /// + /// By default, there are no quit bytes set. + /// + /// # Panics + /// + /// This panics if heuristic Unicode word boundaries are enabled and any + /// non-ASCII byte is removed from the set of quit bytes. Namely, enabling + /// Unicode word boundaries requires setting every non-ASCII byte to a quit + /// byte. So if the caller attempts to undo any of that, then this will + /// panic. + /// + /// # Example + /// + /// This example shows how to cause a search to terminate if it sees a + /// `\n` byte. This could be useful if, for example, you wanted to prevent + /// a user supplied pattern from matching across a line boundary. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{hybrid::dfa::DFA, MatchError, Input}; + /// + /// let dfa = DFA::builder() + /// .configure(DFA::config().quit(b'\n', true)) + /// .build(r"foo\p{any}+bar")?; + /// let mut cache = dfa.create_cache(); + /// + /// let haystack = "foo\nbar"; + /// // Normally this would produce a match, since \p{any} contains '\n'. + /// // But since we instructed the automaton to enter a quit state if a + /// // '\n' is observed, this produces a match error instead. + /// let expected = MatchError::quit(b'\n', 3); + /// let got = dfa.try_search_fwd( + /// &mut cache, + /// &Input::new(haystack), + /// ).unwrap_err(); + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn quit(mut self, byte: u8, yes: bool) -> Config { + if self.get_unicode_word_boundary() && !byte.is_ascii() && !yes { + panic!( + "cannot set non-ASCII byte to be non-quit when \ + Unicode word boundaries are enabled" + ); + } + if self.quitset.is_none() { + self.quitset = Some(ByteSet::empty()); + } + if yes { + self.quitset.as_mut().unwrap().add(byte); + } else { + self.quitset.as_mut().unwrap().remove(byte); + } + self + } + + /// Enable specializing start states in the lazy DFA. + /// + /// When start states are specialized, an implementor of a search routine + /// using a lazy DFA can tell when the search has entered a starting state. + /// When start states aren't specialized, then it is impossible to know + /// whether the search has entered a start state. + /// + /// Ideally, this option wouldn't need to exist and we could always + /// specialize start states. The problem is that start states can be quite + /// active. This in turn means that an efficient search routine is likely + /// to ping-pong between a heavily optimized hot loop that handles most + /// states and to a less optimized specialized handling of start states. + /// This causes branches to get heavily mispredicted and overall can + /// materially decrease throughput. Therefore, specializing start states + /// should only be enabled when it is needed. + /// + /// Knowing whether a search is in a start state is typically useful when a + /// prefilter is active for the search. A prefilter is typically only run + /// when in a start state and a prefilter can greatly accelerate a search. + /// Therefore, the possible cost of specializing start states is worth it + /// in this case. Otherwise, if you have no prefilter, there is likely no + /// reason to specialize start states. + /// + /// This is disabled by default, but note that it is automatically + /// enabled (or disabled) if [`Config::prefilter`] is set. Namely, unless + /// `specialize_start_states` has already been set, [`Config::prefilter`] + /// will automatically enable or disable it based on whether a prefilter + /// is present or not, respectively. This is done because a prefilter's + /// effectiveness is rooted in being executed whenever the DFA is in a + /// start state, and that's only possible to do when they are specialized. + /// + /// Note that it is plausibly reasonable to _disable_ this option + /// explicitly while _enabling_ a prefilter. In that case, a prefilter + /// will still be run at the beginning of a search, but never again. This + /// in theory could strike a good balance if you're in a situation where a + /// prefilter is likely to produce many false positive candidates. + /// + /// # Example + /// + /// This example shows how to enable start state specialization and then + /// shows how to check whether a state is a start state or not. + /// + /// ``` + /// use regex_automata::{hybrid::dfa::DFA, MatchError, Input}; + /// + /// let dfa = DFA::builder() + /// .configure(DFA::config().specialize_start_states(true)) + /// .build(r"[a-z]+")?; + /// let mut cache = dfa.create_cache(); + /// + /// let haystack = "123 foobar 4567".as_bytes(); + /// let sid = dfa.start_state_forward(&mut cache, &Input::new(haystack))?; + /// // The ID returned by 'start_state_forward' will always be tagged as + /// // a start state when start state specialization is enabled. + /// assert!(sid.is_tagged()); + /// assert!(sid.is_start()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// Compare the above with the default lazy DFA configuration where + /// start states are _not_ specialized. In this case, the start state + /// is not tagged and `sid.is_start()` returns false. + /// + /// ``` + /// use regex_automata::{hybrid::dfa::DFA, MatchError, Input}; + /// + /// let dfa = DFA::new(r"[a-z]+")?; + /// let mut cache = dfa.create_cache(); + /// + /// let haystack = "123 foobar 4567".as_bytes(); + /// let sid = dfa.start_state_forward(&mut cache, &Input::new(haystack))?; + /// // Start states are not tagged in the default configuration! + /// assert!(!sid.is_tagged()); + /// assert!(!sid.is_start()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn specialize_start_states(mut self, yes: bool) -> Config { + self.specialize_start_states = Some(yes); + self + } + + /// Sets the maximum amount of heap memory, in bytes, to allocate to the + /// cache for use during a lazy DFA search. If the lazy DFA would otherwise + /// use more heap memory, then, depending on other configuration knobs, + /// either stop the search and return an error or clear the cache and + /// continue the search. + /// + /// The default cache capacity is some "reasonable" number that will + /// accommodate most regular expressions. You may find that if you need + /// to build a large DFA then it may be necessary to increase the cache + /// capacity. + /// + /// Note that while building a lazy DFA will do a "minimum" check to ensure + /// the capacity is big enough, this is more or less about correctness. + /// If the cache is bigger than the minimum but still "too small," then the + /// lazy DFA could wind up spending a lot of time clearing the cache and + /// recomputing transitions, thus negating the performance benefits of a + /// lazy DFA. Thus, setting the cache capacity is mostly an experimental + /// endeavor. For most common patterns, however, the default should be + /// sufficient. + /// + /// For more details on how the lazy DFA's cache is used, see the + /// documentation for [`Cache`]. + /// + /// # Example + /// + /// This example shows what happens if the configured cache capacity is + /// too small. In such cases, one can override the cache capacity to make + /// it bigger. Alternatively, one might want to use less memory by setting + /// a smaller cache capacity. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input}; + /// + /// let pattern = r"\p{L}{1000}"; + /// + /// // The default cache capacity is likely too small to deal with regexes + /// // that are very large. Large repetitions of large Unicode character + /// // classes are a common way to make very large regexes. + /// let _ = DFA::new(pattern).unwrap_err(); + /// // Bump up the capacity to something bigger. + /// let dfa = DFA::builder() + /// .configure(DFA::config().cache_capacity(100 * (1<<20))) // 100 MB + /// .build(pattern)?; + /// let mut cache = dfa.create_cache(); + /// + /// let haystack = "ͰͲͶͿΆΈΉΊΌΎΏΑΒΓΔΕΖΗΘΙ".repeat(50); + /// let expected = Some(HalfMatch::must(0, 2000)); + /// let got = dfa.try_search_fwd(&mut cache, &Input::new(&haystack))?; + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn cache_capacity(mut self, bytes: usize) -> Config { + self.cache_capacity = Some(bytes); + self + } + + /// Configures construction of a lazy DFA to use the minimum cache capacity + /// if the configured capacity is otherwise too small for the provided NFA. + /// + /// This is useful if you never want lazy DFA construction to fail because + /// of a capacity that is too small. + /// + /// In general, this option is typically not a good idea. In particular, + /// while a minimum cache capacity does permit the lazy DFA to function + /// where it otherwise couldn't, it's plausible that it may not function + /// well if it's constantly running out of room. In that case, the speed + /// advantages of the lazy DFA may be negated. On the other hand, the + /// "minimum" cache capacity computed may not be completely accurate and + /// could actually be bigger than what is really necessary. Therefore, it + /// is plausible that using the minimum cache capacity could still result + /// in very good performance. + /// + /// This is disabled by default. + /// + /// # Example + /// + /// This example shows what happens if the configured cache capacity is + /// too small. In such cases, one could override the capacity explicitly. + /// An alternative, demonstrated here, let's us force construction to use + /// the minimum cache capacity if the configured capacity is otherwise + /// too small. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input}; + /// + /// let pattern = r"\p{L}{1000}"; + /// + /// // The default cache capacity is likely too small to deal with regexes + /// // that are very large. Large repetitions of large Unicode character + /// // classes are a common way to make very large regexes. + /// let _ = DFA::new(pattern).unwrap_err(); + /// // Configure construction such it automatically selects the minimum + /// // cache capacity if it would otherwise be too small. + /// let dfa = DFA::builder() + /// .configure(DFA::config().skip_cache_capacity_check(true)) + /// .build(pattern)?; + /// let mut cache = dfa.create_cache(); + /// + /// let haystack = "ͰͲͶͿΆΈΉΊΌΎΏΑΒΓΔΕΖΗΘΙ".repeat(50); + /// let expected = Some(HalfMatch::must(0, 2000)); + /// let got = dfa.try_search_fwd(&mut cache, &Input::new(&haystack))?; + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn skip_cache_capacity_check(mut self, yes: bool) -> Config { + self.skip_cache_capacity_check = Some(yes); + self + } + + /// Configure a lazy DFA search to quit after a certain number of cache + /// clearings. + /// + /// When a minimum is set, then a lazy DFA search will *possibly* "give + /// up" after the minimum number of cache clearings has occurred. This is + /// typically useful in scenarios where callers want to detect whether the + /// lazy DFA search is "efficient" or not. If the cache is cleared too many + /// times, this is a good indicator that it is not efficient, and thus, the + /// caller may wish to use some other regex engine. + /// + /// Note that the number of times a cache is cleared is a property of + /// the cache itself. Thus, if a cache is used in a subsequent search + /// with a similarly configured lazy DFA, then it could cause the + /// search to "give up" if the cache needed to be cleared, depending + /// on its internal count and configured minimum. The cache clear + /// count can only be reset to `0` via [`DFA::reset_cache`] (or + /// [`Regex::reset_cache`](crate::hybrid::regex::Regex::reset_cache) if + /// you're using the `Regex` API). + /// + /// By default, no minimum is configured. Thus, a lazy DFA search will + /// never give up due to cache clearings. If you do set this option, you + /// might consider also setting [`Config::minimum_bytes_per_state`] in + /// order for the lazy DFA to take efficiency into account before giving + /// up. + /// + /// # Example + /// + /// This example uses a somewhat pathological configuration to demonstrate + /// the _possible_ behavior of cache clearing and how it might result + /// in a search that returns an error. + /// + /// It is important to note that the precise mechanics of how and when + /// a cache gets cleared is an implementation detail. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{hybrid::dfa::DFA, Input, MatchError, MatchErrorKind}; + /// + /// // This is a carefully chosen regex. The idea is to pick one + /// // that requires some decent number of states (hence the bounded + /// // repetition). But we specifically choose to create a class with an + /// // ASCII letter and a non-ASCII letter so that we can check that no new + /// // states are created once the cache is full. Namely, if we fill up the + /// // cache on a haystack of 'a's, then in order to match one 'β', a new + /// // state will need to be created since a 'β' is encoded with multiple + /// // bytes. Since there's no room for this state, the search should quit + /// // at the very first position. + /// let pattern = r"[aβ]{100}"; + /// let dfa = DFA::builder() + /// .configure( + /// // Configure it so that we have the minimum cache capacity + /// // possible. And that if any clearings occur, the search quits. + /// DFA::config() + /// .skip_cache_capacity_check(true) + /// .cache_capacity(0) + /// .minimum_cache_clear_count(Some(0)), + /// ) + /// .build(pattern)?; + /// let mut cache = dfa.create_cache(); + /// + /// // Our search will give up before reaching the end! + /// let haystack = "a".repeat(101).into_bytes(); + /// let result = dfa.try_search_fwd(&mut cache, &Input::new(&haystack)); + /// assert!(matches!( + /// *result.unwrap_err().kind(), + /// MatchErrorKind::GaveUp { .. }, + /// )); + /// + /// // Now that we know the cache is full, if we search a haystack that we + /// // know will require creating at least one new state, it should not + /// // be able to make much progress. + /// let haystack = "β".repeat(101).into_bytes(); + /// let result = dfa.try_search_fwd(&mut cache, &Input::new(&haystack)); + /// assert!(matches!( + /// *result.unwrap_err().kind(), + /// MatchErrorKind::GaveUp { .. }, + /// )); + /// + /// // If we reset the cache, then we should be able to create more states + /// // and make more progress with searching for betas. + /// cache.reset(&dfa); + /// let haystack = "β".repeat(101).into_bytes(); + /// let result = dfa.try_search_fwd(&mut cache, &Input::new(&haystack)); + /// assert!(matches!( + /// *result.unwrap_err().kind(), + /// MatchErrorKind::GaveUp { .. }, + /// )); + /// + /// // ... switching back to ASCII still makes progress since it just needs + /// // to set transitions on existing states! + /// let haystack = "a".repeat(101).into_bytes(); + /// let result = dfa.try_search_fwd(&mut cache, &Input::new(&haystack)); + /// assert!(matches!( + /// *result.unwrap_err().kind(), + /// MatchErrorKind::GaveUp { .. }, + /// )); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn minimum_cache_clear_count(mut self, min: Option<usize>) -> Config { + self.minimum_cache_clear_count = Some(min); + self + } + + /// Configure a lazy DFA search to quit only when its efficiency drops + /// below the given minimum. + /// + /// The efficiency of the cache is determined by the number of DFA states + /// compiled per byte of haystack searched. For example, if the efficiency + /// is 2, then it means the lazy DFA is creating a new DFA state after + /// searching approximately 2 bytes in a haystack. Generally speaking, 2 + /// is quite bad and it's likely that even a slower regex engine like the + /// [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM) would be faster. + /// + /// This has no effect if [`Config::minimum_cache_clear_count`] is not set. + /// Namely, this option only kicks in when the cache has been cleared more + /// than the minimum number. If no minimum is set, then the cache is simply + /// cleared whenever it fills up and it is impossible for the lazy DFA to + /// quit due to ineffective use of the cache. + /// + /// In general, if one is setting [`Config::minimum_cache_clear_count`], + /// then one should probably also set this knob as well. The reason is + /// that the absolute number of times the cache is cleared is generally + /// not a great predictor of efficiency. For example, if a new DFA state + /// is created for every 1,000 bytes searched, then it wouldn't be hard + /// for the cache to get cleared more than `N` times and then cause the + /// lazy DFA to quit. But a new DFA state every 1,000 bytes is likely quite + /// good from a performance perspective, and it's likely that the lazy + /// DFA should continue searching, even if it requires clearing the cache + /// occasionally. + /// + /// Finally, note that if you're implementing your own lazy DFA search + /// routine and also want this efficiency check to work correctly, then + /// you'll need to use the following routines to record search progress: + /// + /// * Call [`Cache::search_start`] at the beginning of every search. + /// * Call [`Cache::search_update`] whenever [`DFA::next_state`] is + /// called. + /// * Call [`Cache::search_finish`] before completing a search. (It is + /// not strictly necessary to call this when an error is returned, as + /// `Cache::search_start` will automatically finish the previous search + /// for you. But calling it where possible before returning helps improve + /// the accuracy of how many bytes have actually been searched.) + pub fn minimum_bytes_per_state(mut self, min: Option<usize>) -> Config { + self.minimum_bytes_per_state = Some(min); + self + } + + /// Returns the match semantics set in this configuration. + pub fn get_match_kind(&self) -> MatchKind { + self.match_kind.unwrap_or(MatchKind::LeftmostFirst) + } + + /// Returns the prefilter set in this configuration, if one at all. + pub fn get_prefilter(&self) -> Option<&Prefilter> { + self.pre.as_ref().unwrap_or(&None).as_ref() + } + + /// Returns whether this configuration has enabled anchored starting states + /// for every pattern in the DFA. + pub fn get_starts_for_each_pattern(&self) -> bool { + self.starts_for_each_pattern.unwrap_or(false) + } + + /// Returns whether this configuration has enabled byte classes or not. + /// This is typically a debugging oriented option, as disabling it confers + /// no speed benefit. + pub fn get_byte_classes(&self) -> bool { + self.byte_classes.unwrap_or(true) + } + + /// Returns whether this configuration has enabled heuristic Unicode word + /// boundary support. When enabled, it is possible for a search to return + /// an error. + pub fn get_unicode_word_boundary(&self) -> bool { + self.unicode_word_boundary.unwrap_or(false) + } + + /// Returns whether this configuration will instruct the lazy DFA to enter + /// a quit state whenever the given byte is seen during a search. When at + /// least one byte has this enabled, it is possible for a search to return + /// an error. + pub fn get_quit(&self, byte: u8) -> bool { + self.quitset.map_or(false, |q| q.contains(byte)) + } + + /// Returns whether this configuration will instruct the lazy DFA to + /// "specialize" start states. When enabled, the lazy DFA will tag start + /// states so that search routines using the lazy DFA can detect when + /// it's in a start state and do some kind of optimization (like run a + /// prefilter). + pub fn get_specialize_start_states(&self) -> bool { + self.specialize_start_states.unwrap_or(false) + } + + /// Returns the cache capacity set on this configuration. + pub fn get_cache_capacity(&self) -> usize { + self.cache_capacity.unwrap_or(2 * (1 << 20)) + } + + /// Returns whether the cache capacity check should be skipped. + pub fn get_skip_cache_capacity_check(&self) -> bool { + self.skip_cache_capacity_check.unwrap_or(false) + } + + /// Returns, if set, the minimum number of times the cache must be cleared + /// before a lazy DFA search can give up. When no minimum is set, then a + /// search will never quit and will always clear the cache whenever it + /// fills up. + pub fn get_minimum_cache_clear_count(&self) -> Option<usize> { + self.minimum_cache_clear_count.unwrap_or(None) + } + + /// Returns, if set, the minimum number of bytes per state that need to be + /// processed in order for the lazy DFA to keep going. If the minimum falls + /// below this number (and the cache has been cleared a minimum number of + /// times), then the lazy DFA will return a "gave up" error. + pub fn get_minimum_bytes_per_state(&self) -> Option<usize> { + self.minimum_bytes_per_state.unwrap_or(None) + } + + /// Returns the minimum lazy DFA cache capacity required for the given NFA. + /// + /// The cache capacity required for a particular NFA may change without + /// notice. Callers should not rely on it being stable. + /// + /// This is useful for informational purposes, but can also be useful for + /// other reasons. For example, if one wants to check the minimum cache + /// capacity themselves or if one wants to set the capacity based on the + /// minimum. + /// + /// This may return an error if this configuration does not support all of + /// the instructions used in the given NFA. For example, if the NFA has a + /// Unicode word boundary but this configuration does not enable heuristic + /// support for Unicode word boundaries. + pub fn get_minimum_cache_capacity( + &self, + nfa: &thompson::NFA, + ) -> Result<usize, BuildError> { + let quitset = self.quit_set_from_nfa(nfa)?; + let classes = self.byte_classes_from_nfa(nfa, &quitset); + let starts = self.get_starts_for_each_pattern(); + Ok(minimum_cache_capacity(nfa, &classes, starts)) + } + + /// Returns the byte class map used during search from the given NFA. + /// + /// If byte classes are disabled on this configuration, then a map is + /// returned that puts each byte in its own equivalent class. + fn byte_classes_from_nfa( + &self, + nfa: &thompson::NFA, + quit: &ByteSet, + ) -> ByteClasses { + if !self.get_byte_classes() { + // The lazy DFA will always use the equivalence class map, but + // enabling this option is useful for debugging. Namely, this will + // cause all transitions to be defined over their actual bytes + // instead of an opaque equivalence class identifier. The former is + // much easier to grok as a human. + ByteClasses::singletons() + } else { + let mut set = nfa.byte_class_set().clone(); + // It is important to distinguish any "quit" bytes from all other + // bytes. Otherwise, a non-quit byte may end up in the same class + // as a quit byte, and thus cause the DFA stop when it shouldn't. + // + // Test case: + // + // regex-cli find hybrid regex -w @conn.json.1000x.log \ + // '^#' '\b10\.55\.182\.100\b' + if !quit.is_empty() { + set.add_set(&quit); + } + set.byte_classes() + } + } + + /// Return the quit set for this configuration and the given NFA. + /// + /// This may return an error if the NFA is incompatible with this + /// configuration's quit set. For example, if the NFA has a Unicode word + /// boundary and the quit set doesn't include non-ASCII bytes. + fn quit_set_from_nfa( + &self, + nfa: &thompson::NFA, + ) -> Result<ByteSet, BuildError> { + let mut quit = self.quitset.unwrap_or(ByteSet::empty()); + if nfa.look_set_any().contains_word_unicode() { + if self.get_unicode_word_boundary() { + for b in 0x80..=0xFF { + quit.add(b); + } + } else { + // If heuristic support for Unicode word boundaries wasn't + // enabled, then we can still check if our quit set is correct. + // If the caller set their quit bytes in a way that causes the + // DFA to quit on at least all non-ASCII bytes, then that's all + // we need for heuristic support to work. + if !quit.contains_range(0x80, 0xFF) { + return Err( + BuildError::unsupported_dfa_word_boundary_unicode(), + ); + } + } + } + Ok(quit) + } + + /// Overwrite the default configuration such that the options in `o` are + /// always used. If an option in `o` is not set, then the corresponding + /// option in `self` is used. If it's not set in `self` either, then it + /// remains not set. + fn overwrite(&self, o: Config) -> Config { + Config { + match_kind: o.match_kind.or(self.match_kind), + pre: o.pre.or_else(|| self.pre.clone()), + starts_for_each_pattern: o + .starts_for_each_pattern + .or(self.starts_for_each_pattern), + byte_classes: o.byte_classes.or(self.byte_classes), + unicode_word_boundary: o + .unicode_word_boundary + .or(self.unicode_word_boundary), + quitset: o.quitset.or(self.quitset), + specialize_start_states: o + .specialize_start_states + .or(self.specialize_start_states), + cache_capacity: o.cache_capacity.or(self.cache_capacity), + skip_cache_capacity_check: o + .skip_cache_capacity_check + .or(self.skip_cache_capacity_check), + minimum_cache_clear_count: o + .minimum_cache_clear_count + .or(self.minimum_cache_clear_count), + minimum_bytes_per_state: o + .minimum_bytes_per_state + .or(self.minimum_bytes_per_state), + } + } +} + +/// A builder for constructing a lazy deterministic finite automaton from +/// regular expressions. +/// +/// As a convenience, [`DFA::builder`] is an alias for [`Builder::new`]. The +/// advantage of the former is that it often lets you avoid importing the +/// `Builder` type directly. +/// +/// This builder provides two main things: +/// +/// 1. It provides a few different `build` routines for actually constructing +/// a DFA from different kinds of inputs. The most convenient is +/// [`Builder::build`], which builds a DFA directly from a pattern string. The +/// most flexible is [`Builder::build_from_nfa`], which builds a DFA straight +/// from an NFA. +/// 2. The builder permits configuring a number of things. +/// [`Builder::configure`] is used with [`Config`] to configure aspects of +/// the DFA and the construction process itself. [`Builder::syntax`] and +/// [`Builder::thompson`] permit configuring the regex parser and Thompson NFA +/// construction, respectively. The syntax and thompson configurations only +/// apply when building from a pattern string. +/// +/// This builder always constructs a *single* lazy DFA. As such, this builder +/// can only be used to construct regexes that either detect the presence +/// of a match or find the end location of a match. A single DFA cannot +/// produce both the start and end of a match. For that information, use a +/// [`Regex`](crate::hybrid::regex::Regex), which can be similarly configured +/// using [`regex::Builder`](crate::hybrid::regex::Builder). The main reason +/// to use a DFA directly is if the end location of a match is enough for your +/// use case. Namely, a `Regex` will construct two lazy DFAs instead of one, +/// since a second reverse DFA is needed to find the start of a match. +/// +/// # Example +/// +/// This example shows how to build a lazy DFA that uses a tiny cache capacity +/// and completely disables Unicode. That is: +/// +/// * Things such as `\w`, `.` and `\b` are no longer Unicode-aware. `\w` +/// and `\b` are ASCII-only while `.` matches any byte except for `\n` +/// (instead of any UTF-8 encoding of a Unicode scalar value except for +/// `\n`). Things that are Unicode only, such as `\pL`, are not allowed. +/// * The pattern itself is permitted to match invalid UTF-8. For example, +/// things like `[^a]` that match any byte except for `a` are permitted. +/// +/// ``` +/// use regex_automata::{ +/// hybrid::dfa::DFA, +/// nfa::thompson, +/// util::syntax, +/// HalfMatch, Input, +/// }; +/// +/// let dfa = DFA::builder() +/// .configure(DFA::config().cache_capacity(5_000)) +/// .thompson(thompson::Config::new().utf8(false)) +/// .syntax(syntax::Config::new().unicode(false).utf8(false)) +/// .build(r"foo[^b]ar.*")?; +/// let mut cache = dfa.create_cache(); +/// +/// let haystack = b"\xFEfoo\xFFar\xE2\x98\xFF\n"; +/// let expected = Some(HalfMatch::must(0, 10)); +/// let got = dfa.try_search_fwd(&mut cache, &Input::new(haystack))?; +/// assert_eq!(expected, got); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +#[derive(Clone, Debug)] +pub struct Builder { + config: Config, + #[cfg(feature = "syntax")] + thompson: thompson::Compiler, +} + +impl Builder { + /// Create a new lazy DFA builder with the default configuration. + pub fn new() -> Builder { + Builder { + config: Config::default(), + #[cfg(feature = "syntax")] + thompson: thompson::Compiler::new(), + } + } + + /// Build a lazy DFA from the given pattern. + /// + /// If there was a problem parsing or compiling the pattern, then an error + /// is returned. + #[cfg(feature = "syntax")] + pub fn build(&self, pattern: &str) -> Result<DFA, BuildError> { + self.build_many(&[pattern]) + } + + /// Build a lazy DFA from the given patterns. + /// + /// When matches are returned, the pattern ID corresponds to the index of + /// the pattern in the slice given. + #[cfg(feature = "syntax")] + pub fn build_many<P: AsRef<str>>( + &self, + patterns: &[P], + ) -> Result<DFA, BuildError> { + let nfa = self + .thompson + .clone() + // We can always forcefully disable captures because DFAs do not + // support them. + .configure( + thompson::Config::new() + .which_captures(thompson::WhichCaptures::None), + ) + .build_many(patterns) + .map_err(BuildError::nfa)?; + self.build_from_nfa(nfa) + } + + /// Build a DFA from the given NFA. + /// + /// Note that this requires owning a `thompson::NFA`. While this may force + /// you to clone the NFA, such a clone is not a deep clone. Namely, NFAs + /// are defined internally to support shared ownership such that cloning is + /// very cheap. + /// + /// # Example + /// + /// This example shows how to build a lazy DFA if you already have an NFA + /// in hand. + /// + /// ``` + /// use regex_automata::{ + /// hybrid::dfa::DFA, + /// nfa::thompson, + /// HalfMatch, Input, + /// }; + /// + /// let haystack = "foo123bar"; + /// + /// // This shows how to set non-default options for building an NFA. + /// let nfa = thompson::Compiler::new() + /// .configure(thompson::Config::new().shrink(true)) + /// .build(r"[0-9]+")?; + /// let dfa = DFA::builder().build_from_nfa(nfa)?; + /// let mut cache = dfa.create_cache(); + /// let expected = Some(HalfMatch::must(0, 6)); + /// let got = dfa.try_search_fwd(&mut cache, &Input::new(haystack))?; + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn build_from_nfa( + &self, + nfa: thompson::NFA, + ) -> Result<DFA, BuildError> { + let quitset = self.config.quit_set_from_nfa(&nfa)?; + let classes = self.config.byte_classes_from_nfa(&nfa, &quitset); + // Check that we can fit at least a few states into our cache, + // otherwise it's pretty senseless to use the lazy DFA. This does have + // a possible failure mode though. This assumes the maximum size of a + // state in powerset space (so, the total number of NFA states), which + // may never actually materialize, and could be quite a bit larger + // than the actual biggest state. If this turns out to be a problem, + // we could expose a knob that disables this check. But if so, we have + // to be careful not to panic in other areas of the code (the cache + // clearing and init code) that tend to assume some minimum useful + // cache capacity. + let min_cache = minimum_cache_capacity( + &nfa, + &classes, + self.config.get_starts_for_each_pattern(), + ); + let mut cache_capacity = self.config.get_cache_capacity(); + if cache_capacity < min_cache { + // When the caller has asked us to skip the cache capacity check, + // then we simply force the cache capacity to its minimum amount + // and mush on. + if self.config.get_skip_cache_capacity_check() { + debug!( + "given capacity ({}) is too small, \ + since skip_cache_capacity_check is enabled, \ + setting cache capacity to minimum ({})", + cache_capacity, min_cache, + ); + cache_capacity = min_cache; + } else { + return Err(BuildError::insufficient_cache_capacity( + min_cache, + cache_capacity, + )); + } + } + // We also need to check that we can fit at least some small number + // of states in our state ID space. This is unlikely to trigger in + // >=32-bit systems, but 16-bit systems have a pretty small state ID + // space since a number of bits are used up as sentinels. + if let Err(err) = minimum_lazy_state_id(&classes) { + return Err(BuildError::insufficient_state_id_capacity(err)); + } + let stride2 = classes.stride2(); + let start_map = StartByteMap::new(nfa.look_matcher()); + Ok(DFA { + config: self.config.clone(), + nfa, + stride2, + start_map, + classes, + quitset, + cache_capacity, + }) + } + + /// Apply the given lazy DFA configuration options to this builder. + pub fn configure(&mut self, config: Config) -> &mut Builder { + self.config = self.config.overwrite(config); + self + } + + /// Set the syntax configuration for this builder using + /// [`syntax::Config`](crate::util::syntax::Config). + /// + /// This permits setting things like case insensitivity, Unicode and multi + /// line mode. + /// + /// These settings only apply when constructing a lazy DFA directly from a + /// pattern. + #[cfg(feature = "syntax")] + pub fn syntax( + &mut self, + config: crate::util::syntax::Config, + ) -> &mut Builder { + self.thompson.syntax(config); + self + } + + /// Set the Thompson NFA configuration for this builder using + /// [`nfa::thompson::Config`](crate::nfa::thompson::Config). + /// + /// This permits setting things like whether the DFA should match the regex + /// in reverse or if additional time should be spent shrinking the size of + /// the NFA. + /// + /// These settings only apply when constructing a DFA directly from a + /// pattern. + #[cfg(feature = "syntax")] + pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder { + self.thompson.configure(config); + self + } +} + +/// Represents the current state of an overlapping search. +/// +/// This is used for overlapping searches since they need to know something +/// about the previous search. For example, when multiple patterns match at the +/// same position, this state tracks the last reported pattern so that the next +/// search knows whether to report another matching pattern or continue with +/// the search at the next position. Additionally, it also tracks which state +/// the last search call terminated in. +/// +/// This type provides little introspection capabilities. The only thing a +/// caller can do is construct it and pass it around to permit search routines +/// to use it to track state, and also ask whether a match has been found. +/// +/// Callers should always provide a fresh state constructed via +/// [`OverlappingState::start`] when starting a new search. Reusing state from +/// a previous search may result in incorrect results. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct OverlappingState { + /// The match reported by the most recent overlapping search to use this + /// state. + /// + /// If a search does not find any matches, then it is expected to clear + /// this value. + pub(crate) mat: Option<HalfMatch>, + /// The state ID of the state at which the search was in when the call + /// terminated. When this is a match state, `last_match` must be set to a + /// non-None value. + /// + /// A `None` value indicates the start state of the corresponding + /// automaton. We cannot use the actual ID, since any one automaton may + /// have many start states, and which one is in use depends on several + /// search-time factors. + pub(crate) id: Option<LazyStateID>, + /// The position of the search. + /// + /// When `id` is None (i.e., we are starting a search), this is set to + /// the beginning of the search as given by the caller regardless of its + /// current value. Subsequent calls to an overlapping search pick up at + /// this offset. + pub(crate) at: usize, + /// The index into the matching patterns of the next match to report if the + /// current state is a match state. Note that this may be 1 greater than + /// the total number of matches to report for the current match state. (In + /// which case, no more matches should be reported at the current position + /// and the search should advance to the next position.) + pub(crate) next_match_index: Option<usize>, + /// This is set to true when a reverse overlapping search has entered its + /// EOI transitions. + /// + /// This isn't used in a forward search because it knows to stop once the + /// position exceeds the end of the search range. In a reverse search, + /// since we use unsigned offsets, we don't "know" once we've gone past + /// `0`. So the only way to detect it is with this extra flag. The reverse + /// overlapping search knows to terminate specifically after it has + /// reported all matches after following the EOI transition. + pub(crate) rev_eoi: bool, +} + +impl OverlappingState { + /// Create a new overlapping state that begins at the start state of any + /// automaton. + pub fn start() -> OverlappingState { + OverlappingState { + mat: None, + id: None, + at: 0, + next_match_index: None, + rev_eoi: false, + } + } + + /// Return the match result of the most recent search to execute with this + /// state. + /// + /// A searches will clear this result automatically, such that if no + /// match is found, this will correctly report `None`. + pub fn get_match(&self) -> Option<HalfMatch> { + self.mat + } +} + +/// Runs the given overlapping `search` function (forwards or backwards) until +/// a match is found whose offset does not split a codepoint. +/// +/// This is *not* always correct to call. It should only be called when the +/// underlying NFA has UTF-8 mode enabled *and* it can produce zero-width +/// matches. Calling this when both of those things aren't true might result +/// in legitimate matches getting skipped. +#[cold] +#[inline(never)] +fn skip_empty_utf8_splits_overlapping<F>( + input: &Input<'_>, + state: &mut OverlappingState, + mut search: F, +) -> Result<(), MatchError> +where + F: FnMut(&Input<'_>, &mut OverlappingState) -> Result<(), MatchError>, +{ + // Note that this routine works for forwards and reverse searches + // even though there's no code here to handle those cases. That's + // because overlapping searches drive themselves to completion via + // `OverlappingState`. So all we have to do is push it until no matches are + // found. + + let mut hm = match state.get_match() { + None => return Ok(()), + Some(hm) => hm, + }; + if input.get_anchored().is_anchored() { + if !input.is_char_boundary(hm.offset()) { + state.mat = None; + } + return Ok(()); + } + while !input.is_char_boundary(hm.offset()) { + search(input, state)?; + hm = match state.get_match() { + None => return Ok(()), + Some(hm) => hm, + }; + } + Ok(()) +} + +/// Based on the minimum number of states required for a useful lazy DFA cache, +/// this returns the minimum lazy state ID that must be representable. +/// +/// It's not likely for this to have any impact 32-bit systems (or higher), but +/// on 16-bit systems, the lazy state ID space is quite constrained and thus +/// may be insufficient if our MIN_STATES value is (for some reason) too high. +fn minimum_lazy_state_id( + classes: &ByteClasses, +) -> Result<LazyStateID, LazyStateIDError> { + let stride = 1 << classes.stride2(); + let min_state_index = MIN_STATES.checked_sub(1).unwrap(); + LazyStateID::new(min_state_index * stride) +} + +/// Based on the minimum number of states required for a useful lazy DFA cache, +/// this returns a heuristic minimum number of bytes of heap space required. +/// +/// This is a "heuristic" because the minimum it returns is likely bigger than +/// the true minimum. Namely, it assumes that each powerset NFA/DFA state uses +/// the maximum number of NFA states (all of them). This is likely bigger +/// than what is required in practice. Computing the true minimum effectively +/// requires determinization, which is probably too much work to do for a +/// simple check like this. +/// +/// One of the issues with this approach IMO is that it requires that this +/// be in sync with the calculation above for computing how much heap memory +/// the DFA cache uses. If we get it wrong, it's possible for example for the +/// minimum to be smaller than the computed heap memory, and thus, it may be +/// the case that we can't add the required minimum number of states. That in +/// turn will make lazy DFA panic because we assume that we can add at least a +/// minimum number of states. +/// +/// Another approach would be to always allow the minimum number of states to +/// be added to the lazy DFA cache, even if it exceeds the configured cache +/// limit. This does mean that the limit isn't really a limit in all cases, +/// which is unfortunate. But it does at least guarantee that the lazy DFA can +/// always make progress, even if it is slow. (This approach is very similar to +/// enabling the 'skip_cache_capacity_check' config knob, except it wouldn't +/// rely on cache size calculation. Instead, it would just always permit a +/// minimum number of states to be added.) +fn minimum_cache_capacity( + nfa: &thompson::NFA, + classes: &ByteClasses, + starts_for_each_pattern: bool, +) -> usize { + const ID_SIZE: usize = size_of::<LazyStateID>(); + const STATE_SIZE: usize = size_of::<State>(); + + let stride = 1 << classes.stride2(); + let states_len = nfa.states().len(); + let sparses = 2 * states_len * NFAStateID::SIZE; + let trans = MIN_STATES * stride * ID_SIZE; + + let mut starts = Start::len() * ID_SIZE; + if starts_for_each_pattern { + starts += (Start::len() * nfa.pattern_len()) * ID_SIZE; + } + + // The min number of states HAS to be at least 4: we have 3 sentinel states + // and then we need space for one more when we save a state after clearing + // the cache. We also need space for one more, otherwise we get stuck in a + // loop where we try to add a 5th state, which gets rejected, which clears + // the cache, which adds back a saved state (4th total state) which then + // tries to add the 5th state again. + assert!(MIN_STATES >= 5, "minimum number of states has to be at least 5"); + // The minimum number of non-sentinel states. We consider this separately + // because sentinel states are much smaller in that they contain no NFA + // states. Given our aggressive calculation here, it's worth being more + // precise with the number of states we need. + let non_sentinel = MIN_STATES.checked_sub(SENTINEL_STATES).unwrap(); + + // Every `State` has 5 bytes for flags, 4 bytes (max) for the number of + // patterns, followed by 32-bit encodings of patterns and then delta + // varint encodings of NFA state IDs. We use the worst case (which isn't + // technically possible) of 5 bytes for each NFA state ID. + // + // HOWEVER, three of the states needed by a lazy DFA are just the sentinel + // unknown, dead and quit states. Those states have a known size and it is + // small. + let dead_state_size = State::dead().memory_usage(); + let max_state_size = 5 + 4 + (nfa.pattern_len() * 4) + (states_len * 5); + let states = (SENTINEL_STATES * (STATE_SIZE + dead_state_size)) + + (non_sentinel * (STATE_SIZE + max_state_size)); + // NOTE: We don't double count heap memory used by State for this map since + // we use reference counting to avoid doubling memory usage. (This tends to + // be where most memory is allocated in the cache.) + let states_to_sid = (MIN_STATES * STATE_SIZE) + (MIN_STATES * ID_SIZE); + let stack = states_len * NFAStateID::SIZE; + let scratch_state_builder = max_state_size; + + trans + + starts + + states + + states_to_sid + + sparses + + stack + + scratch_state_builder +} + +#[cfg(all(test, feature = "syntax"))] +mod tests { + use super::*; + + // Tests that we handle heuristic Unicode word boundary support in reverse + // DFAs in the specific case of contextual searches. + // + // I wrote this test when I discovered a bug in how heuristic word + // boundaries were handled. Namely, that the starting state selection + // didn't consider the DFA's quit byte set when looking at the byte + // immediately before the start of the search (or immediately after the + // end of the search in the case of a reverse search). As a result, it was + // possible for '\bfoo\b' to match 'β123' because the trailing \xB2 byte + // in the 'β' codepoint would be treated as a non-word character. But of + // course, this search should trigger the DFA to quit, since there is a + // non-ASCII byte in consideration. + // + // Thus, I fixed 'start_state_{forward,reverse}' to check the quit byte set + // if it wasn't empty. The forward case is tested in the doc test for the + // Config::unicode_word_boundary API. We test the reverse case here, which + // is sufficiently niche that it doesn't really belong in a doc test. + #[test] + fn heuristic_unicode_reverse() { + let dfa = DFA::builder() + .configure(DFA::config().unicode_word_boundary(true)) + .thompson(thompson::Config::new().reverse(true)) + .build(r"\b[0-9]+\b") + .unwrap(); + let mut cache = dfa.create_cache(); + + let input = Input::new("β123").range(2..); + let expected = MatchError::quit(0xB2, 1); + let got = dfa.try_search_rev(&mut cache, &input); + assert_eq!(Err(expected), got); + + let input = Input::new("123β").range(..3); + let expected = MatchError::quit(0xCE, 3); + let got = dfa.try_search_rev(&mut cache, &input); + assert_eq!(Err(expected), got); + } +} diff --git a/third_party/rust/regex-automata/src/hybrid/error.rs b/third_party/rust/regex-automata/src/hybrid/error.rs new file mode 100644 index 0000000000..604daf3c38 --- /dev/null +++ b/third_party/rust/regex-automata/src/hybrid/error.rs @@ -0,0 +1,139 @@ +use crate::{hybrid::id::LazyStateIDError, nfa}; + +/// An error that occurs when initial construction of a lazy DFA fails. +/// +/// A build error can occur when insufficient cache capacity is configured or +/// if something about the NFA is unsupported. (For example, if one attempts +/// to build a lazy DFA without heuristic Unicode support but with an NFA that +/// contains a Unicode word boundary.) +/// +/// This error does not provide many introspection capabilities. There are +/// generally only two things you can do with it: +/// +/// * Obtain a human readable message via its `std::fmt::Display` impl. +/// * Access an underlying +/// [`nfa::thompson::BuildError`](crate::nfa::thompson::BuildError) +/// type from its `source` method via the `std::error::Error` trait. This error +/// only occurs when using convenience routines for building a lazy DFA +/// directly from a pattern string. +/// +/// When the `std` feature is enabled, this implements the `std::error::Error` +/// trait. +#[derive(Clone, Debug)] +pub struct BuildError { + kind: BuildErrorKind, +} + +#[derive(Clone, Debug)] +enum BuildErrorKind { + NFA(nfa::thompson::BuildError), + InsufficientCacheCapacity { minimum: usize, given: usize }, + InsufficientStateIDCapacity { err: LazyStateIDError }, + Unsupported(&'static str), +} + +impl BuildError { + pub(crate) fn nfa(err: nfa::thompson::BuildError) -> BuildError { + BuildError { kind: BuildErrorKind::NFA(err) } + } + + pub(crate) fn insufficient_cache_capacity( + minimum: usize, + given: usize, + ) -> BuildError { + BuildError { + kind: BuildErrorKind::InsufficientCacheCapacity { minimum, given }, + } + } + + pub(crate) fn insufficient_state_id_capacity( + err: LazyStateIDError, + ) -> BuildError { + BuildError { + kind: BuildErrorKind::InsufficientStateIDCapacity { err }, + } + } + + pub(crate) fn unsupported_dfa_word_boundary_unicode() -> BuildError { + let msg = "cannot build lazy DFAs for regexes with Unicode word \ + boundaries; switch to ASCII word boundaries, or \ + heuristically enable Unicode word boundaries or use a \ + different regex engine"; + BuildError { kind: BuildErrorKind::Unsupported(msg) } + } +} + +#[cfg(feature = "std")] +impl std::error::Error for BuildError { + fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { + match self.kind { + BuildErrorKind::NFA(ref err) => Some(err), + _ => None, + } + } +} + +impl core::fmt::Display for BuildError { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + match self.kind { + BuildErrorKind::NFA(_) => write!(f, "error building NFA"), + BuildErrorKind::InsufficientCacheCapacity { minimum, given } => { + write!( + f, + "given cache capacity ({}) is smaller than \ + minimum required ({})", + given, minimum, + ) + } + BuildErrorKind::InsufficientStateIDCapacity { ref err } => { + err.fmt(f) + } + BuildErrorKind::Unsupported(ref msg) => { + write!(f, "unsupported regex feature for DFAs: {}", msg) + } + } + } +} + +/// An error that occurs when cache usage has become inefficient. +/// +/// One of the weaknesses of a lazy DFA is that it may need to clear its +/// cache repeatedly if it's not big enough. If this happens too much, then it +/// can slow searching down significantly. A mitigation to this is to use +/// heuristics to detect whether the cache is being used efficiently or not. +/// If not, then a lazy DFA can return a `CacheError`. +/// +/// The default configuration of a lazy DFA in this crate is +/// set such that a `CacheError` will never occur. Instead, +/// callers must opt into this behavior with settings like +/// [`dfa::Config::minimum_cache_clear_count`](crate::hybrid::dfa::Config::minimum_cache_clear_count) +/// and +/// [`dfa::Config::minimum_bytes_per_state`](crate::hybrid::dfa::Config::minimum_bytes_per_state). +/// +/// When the `std` feature is enabled, this implements the `std::error::Error` +/// trait. +#[derive(Clone, Debug)] +pub struct CacheError(()); + +impl CacheError { + pub(crate) fn too_many_cache_clears() -> CacheError { + CacheError(()) + } + + pub(crate) fn bad_efficiency() -> CacheError { + CacheError(()) + } +} + +#[cfg(feature = "std")] +impl std::error::Error for CacheError { + fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { + None + } +} + +impl core::fmt::Display for CacheError { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + write!(f, "lazy DFA cache has been cleared too many times") + } +} diff --git a/third_party/rust/regex-automata/src/hybrid/id.rs b/third_party/rust/regex-automata/src/hybrid/id.rs new file mode 100644 index 0000000000..662e3c98f0 --- /dev/null +++ b/third_party/rust/regex-automata/src/hybrid/id.rs @@ -0,0 +1,354 @@ +/// A state identifier specifically tailored for lazy DFAs. +/// +/// A lazy state ID logically represents a pointer to a DFA state. In practice, +/// by limiting the number of DFA states it can address, it reserves some +/// bits of its representation to encode some additional information. That +/// additional information is called a "tag." That tag is used to record +/// whether the state it points to is an unknown, dead, quit, start or match +/// state. +/// +/// When implementing a low level search routine with a lazy DFA, it is +/// necessary to query the type of the current state to know what to do: +/// +/// * **Unknown** - The state has not yet been computed. The +/// parameters used to get this state ID must be re-passed to +/// [`DFA::next_state`](crate::hybrid::dfa::DFA::next_state), which will never +/// return an unknown state ID. +/// * **Dead** - A dead state only has transitions to itself. It indicates that +/// the search cannot do anything else and should stop with whatever result it +/// has. +/// * **Quit** - A quit state indicates that the automaton could not answer +/// whether a match exists or not. Correct search implementations must return a +/// [`MatchError::quit`](crate::MatchError::quit) when a DFA enters a quit +/// state. +/// * **Start** - A start state is a state in which a search can begin. +/// Lazy DFAs usually have more than one start state. Branching on +/// this isn't required for correctness, but a common optimization is +/// to run a prefilter when a search enters a start state. Note that +/// start states are *not* tagged automatically, and one must enable the +/// [`Config::specialize_start_states`](crate::hybrid::dfa::Config::specialize_start_states) +/// setting for start states to be tagged. The reason for this is +/// that a DFA search loop is usually written to execute a prefilter once it +/// enters a start state. But if there is no prefilter, this handling can be +/// quite diastrous as the DFA may ping-pong between the special handling code +/// and a possible optimized hot path for handling untagged states. When start +/// states aren't specialized, then they are untagged and remain in the hot +/// path. +/// * **Match** - A match state indicates that a match has been found. +/// Depending on the semantics of your search implementation, it may either +/// continue until the end of the haystack or a dead state, or it might quit +/// and return the match immediately. +/// +/// As an optimization, the [`is_tagged`](LazyStateID::is_tagged) predicate +/// can be used to determine if a tag exists at all. This is useful to avoid +/// branching on all of the above types for every byte searched. +/// +/// # Example +/// +/// This example shows how `LazyStateID` can be used to implement a correct +/// search routine with minimal branching. In particular, this search routine +/// implements "leftmost" matching, which means that it doesn't immediately +/// stop once a match is found. Instead, it continues until it reaches a dead +/// state. +/// +/// Notice also how a correct search implementation deals with +/// [`CacheError`](crate::hybrid::CacheError)s returned by some of +/// the lazy DFA routines. When a `CacheError` occurs, it returns +/// [`MatchError::gave_up`](crate::MatchError::gave_up). +/// +/// ``` +/// use regex_automata::{ +/// hybrid::dfa::{Cache, DFA}, +/// HalfMatch, MatchError, Input, +/// }; +/// +/// fn find_leftmost_first( +/// dfa: &DFA, +/// cache: &mut Cache, +/// haystack: &[u8], +/// ) -> Result<Option<HalfMatch>, MatchError> { +/// // The start state is determined by inspecting the position and the +/// // initial bytes of the haystack. Note that start states can never +/// // be match states (since DFAs in this crate delay matches by 1 +/// // byte), so we don't need to check if the start state is a match. +/// let mut sid = dfa.start_state_forward( +/// cache, +/// &Input::new(haystack), +/// )?; +/// let mut last_match = None; +/// // Walk all the bytes in the haystack. We can quit early if we see +/// // a dead or a quit state. The former means the automaton will +/// // never transition to any other state. The latter means that the +/// // automaton entered a condition in which its search failed. +/// for (i, &b) in haystack.iter().enumerate() { +/// sid = dfa +/// .next_state(cache, sid, b) +/// .map_err(|_| MatchError::gave_up(i))?; +/// if sid.is_tagged() { +/// if sid.is_match() { +/// last_match = Some(HalfMatch::new( +/// dfa.match_pattern(cache, sid, 0), +/// i, +/// )); +/// } else if sid.is_dead() { +/// return Ok(last_match); +/// } else if sid.is_quit() { +/// // It is possible to enter into a quit state after +/// // observing a match has occurred. In that case, we +/// // should return the match instead of an error. +/// if last_match.is_some() { +/// return Ok(last_match); +/// } +/// return Err(MatchError::quit(b, i)); +/// } +/// // Implementors may also want to check for start states and +/// // handle them differently for performance reasons. But it is +/// // not necessary for correctness. Note that in order to check +/// // for start states, you'll need to enable the +/// // 'specialize_start_states' config knob, otherwise start +/// // states will not be tagged. +/// } +/// } +/// // Matches are always delayed by 1 byte, so we must explicitly walk +/// // the special "EOI" transition at the end of the search. +/// sid = dfa +/// .next_eoi_state(cache, sid) +/// .map_err(|_| MatchError::gave_up(haystack.len()))?; +/// if sid.is_match() { +/// last_match = Some(HalfMatch::new( +/// dfa.match_pattern(cache, sid, 0), +/// haystack.len(), +/// )); +/// } +/// Ok(last_match) +/// } +/// +/// // We use a greedy '+' operator to show how the search doesn't just stop +/// // once a match is detected. It continues extending the match. Using +/// // '[a-z]+?' would also work as expected and stop the search early. +/// // Greediness is built into the automaton. +/// let dfa = DFA::new(r"[a-z]+")?; +/// let mut cache = dfa.create_cache(); +/// let haystack = "123 foobar 4567".as_bytes(); +/// let mat = find_leftmost_first(&dfa, &mut cache, haystack)?.unwrap(); +/// assert_eq!(mat.pattern().as_usize(), 0); +/// assert_eq!(mat.offset(), 10); +/// +/// // Here's another example that tests our handling of the special +/// // EOI transition. This will fail to find a match if we don't call +/// // 'next_eoi_state' at the end of the search since the match isn't found +/// // until the final byte in the haystack. +/// let dfa = DFA::new(r"[0-9]{4}")?; +/// let mut cache = dfa.create_cache(); +/// let haystack = "123 foobar 4567".as_bytes(); +/// let mat = find_leftmost_first(&dfa, &mut cache, haystack)?.unwrap(); +/// assert_eq!(mat.pattern().as_usize(), 0); +/// assert_eq!(mat.offset(), 15); +/// +/// // And note that our search implementation above automatically works +/// // with multi-DFAs. Namely, `dfa.match_pattern(match_state, 0)` selects +/// // the appropriate pattern ID for us. +/// let dfa = DFA::new_many(&[r"[a-z]+", r"[0-9]+"])?; +/// let mut cache = dfa.create_cache(); +/// let haystack = "123 foobar 4567".as_bytes(); +/// let mat = find_leftmost_first(&dfa, &mut cache, haystack)?.unwrap(); +/// assert_eq!(mat.pattern().as_usize(), 1); +/// assert_eq!(mat.offset(), 3); +/// let mat = find_leftmost_first(&dfa, &mut cache, &haystack[3..])?.unwrap(); +/// assert_eq!(mat.pattern().as_usize(), 0); +/// assert_eq!(mat.offset(), 7); +/// let mat = find_leftmost_first(&dfa, &mut cache, &haystack[10..])?.unwrap(); +/// assert_eq!(mat.pattern().as_usize(), 1); +/// assert_eq!(mat.offset(), 5); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +#[derive( + Clone, Copy, Debug, Default, Eq, Hash, PartialEq, PartialOrd, Ord, +)] +pub struct LazyStateID(u32); + +impl LazyStateID { + #[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))] + const MAX_BIT: usize = 31; + + #[cfg(target_pointer_width = "16")] + const MAX_BIT: usize = 15; + + const MASK_UNKNOWN: usize = 1 << (LazyStateID::MAX_BIT); + const MASK_DEAD: usize = 1 << (LazyStateID::MAX_BIT - 1); + const MASK_QUIT: usize = 1 << (LazyStateID::MAX_BIT - 2); + const MASK_START: usize = 1 << (LazyStateID::MAX_BIT - 3); + const MASK_MATCH: usize = 1 << (LazyStateID::MAX_BIT - 4); + const MAX: usize = LazyStateID::MASK_MATCH - 1; + + /// Create a new lazy state ID. + /// + /// If the given identifier exceeds [`LazyStateID::MAX`], then this returns + /// an error. + #[inline] + pub(crate) fn new(id: usize) -> Result<LazyStateID, LazyStateIDError> { + if id > LazyStateID::MAX { + let attempted = u64::try_from(id).unwrap(); + return Err(LazyStateIDError { attempted }); + } + Ok(LazyStateID::new_unchecked(id)) + } + + /// Create a new lazy state ID without checking whether the given value + /// exceeds [`LazyStateID::MAX`]. + /// + /// While this is unchecked, providing an incorrect value must never + /// sacrifice memory safety. + #[inline] + const fn new_unchecked(id: usize) -> LazyStateID { + // FIXME: Use as_u32() once const functions in traits are stable. + LazyStateID(id as u32) + } + + /// Return this lazy state ID as an untagged `usize`. + /// + /// If this lazy state ID is tagged, then the usize returned is the state + /// ID without the tag. If the ID was not tagged, then the usize returned + /// is equivalent to the state ID. + #[inline] + pub(crate) fn as_usize_untagged(&self) -> usize { + self.as_usize_unchecked() & LazyStateID::MAX + } + + /// Return this lazy state ID as its raw internal `usize` value, which may + /// be tagged (and thus greater than LazyStateID::MAX). + #[inline] + pub(crate) const fn as_usize_unchecked(&self) -> usize { + // FIXME: Use as_usize() once const functions in traits are stable. + self.0 as usize + } + + #[inline] + pub(crate) const fn to_unknown(&self) -> LazyStateID { + LazyStateID::new_unchecked( + self.as_usize_unchecked() | LazyStateID::MASK_UNKNOWN, + ) + } + + #[inline] + pub(crate) const fn to_dead(&self) -> LazyStateID { + LazyStateID::new_unchecked( + self.as_usize_unchecked() | LazyStateID::MASK_DEAD, + ) + } + + #[inline] + pub(crate) const fn to_quit(&self) -> LazyStateID { + LazyStateID::new_unchecked( + self.as_usize_unchecked() | LazyStateID::MASK_QUIT, + ) + } + + /// Return this lazy state ID as a state ID that is tagged as a start + /// state. + #[inline] + pub(crate) const fn to_start(&self) -> LazyStateID { + LazyStateID::new_unchecked( + self.as_usize_unchecked() | LazyStateID::MASK_START, + ) + } + + /// Return this lazy state ID as a lazy state ID that is tagged as a match + /// state. + #[inline] + pub(crate) const fn to_match(&self) -> LazyStateID { + LazyStateID::new_unchecked( + self.as_usize_unchecked() | LazyStateID::MASK_MATCH, + ) + } + + /// Return true if and only if this lazy state ID is tagged. + /// + /// When a lazy state ID is tagged, then one can conclude that it is one + /// of a match, start, dead, quit or unknown state. + #[inline] + pub const fn is_tagged(&self) -> bool { + self.as_usize_unchecked() > LazyStateID::MAX + } + + /// Return true if and only if this represents a lazy state ID that is + /// "unknown." That is, the state has not yet been created. When a caller + /// sees this state ID, it generally means that a state has to be computed + /// in order to proceed. + #[inline] + pub const fn is_unknown(&self) -> bool { + self.as_usize_unchecked() & LazyStateID::MASK_UNKNOWN > 0 + } + + /// Return true if and only if this represents a dead state. A dead state + /// is a state that can never transition to any other state except the + /// dead state. When a dead state is seen, it generally indicates that a + /// search should stop. + #[inline] + pub const fn is_dead(&self) -> bool { + self.as_usize_unchecked() & LazyStateID::MASK_DEAD > 0 + } + + /// Return true if and only if this represents a quit state. A quit state + /// is a state that is representationally equivalent to a dead state, + /// except it indicates the automaton has reached a point at which it can + /// no longer determine whether a match exists or not. In general, this + /// indicates an error during search and the caller must either pass this + /// error up or use a different search technique. + #[inline] + pub const fn is_quit(&self) -> bool { + self.as_usize_unchecked() & LazyStateID::MASK_QUIT > 0 + } + + /// Return true if and only if this lazy state ID has been tagged as a + /// start state. + /// + /// Note that if + /// [`Config::specialize_start_states`](crate::hybrid::dfa::Config) is + /// disabled (which is the default), then this will always return false + /// since start states won't be tagged. + #[inline] + pub const fn is_start(&self) -> bool { + self.as_usize_unchecked() & LazyStateID::MASK_START > 0 + } + + /// Return true if and only if this lazy state ID has been tagged as a + /// match state. + #[inline] + pub const fn is_match(&self) -> bool { + self.as_usize_unchecked() & LazyStateID::MASK_MATCH > 0 + } +} + +/// This error occurs when a lazy state ID could not be constructed. +/// +/// This occurs when given an integer exceeding the maximum lazy state ID +/// value. +/// +/// When the `std` feature is enabled, this implements the `Error` trait. +#[derive(Clone, Debug, Eq, PartialEq)] +pub(crate) struct LazyStateIDError { + attempted: u64, +} + +impl LazyStateIDError { + /// Returns the value that failed to constructed a lazy state ID. + pub(crate) fn attempted(&self) -> u64 { + self.attempted + } +} + +#[cfg(feature = "std")] +impl std::error::Error for LazyStateIDError {} + +impl core::fmt::Display for LazyStateIDError { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + write!( + f, + "failed to create LazyStateID from {:?}, which exceeds {:?}", + self.attempted(), + LazyStateID::MAX, + ) + } +} diff --git a/third_party/rust/regex-automata/src/hybrid/mod.rs b/third_party/rust/regex-automata/src/hybrid/mod.rs new file mode 100644 index 0000000000..44e67e1299 --- /dev/null +++ b/third_party/rust/regex-automata/src/hybrid/mod.rs @@ -0,0 +1,144 @@ +/*! +A module for building and searching with lazy deterministic finite automata +(DFAs). + +Like other modules in this crate, lazy DFAs support a rich regex syntax with +Unicode features. The key feature of a lazy DFA is that it builds itself +incrementally during search, and never uses more than a configured capacity of +memory. Thus, when searching with a lazy DFA, one must supply a mutable "cache" +in which the actual DFA's transition table is stored. + +If you're looking for fully compiled DFAs, then please see the top-level +[`dfa` module](crate::dfa). + +# Overview + +This section gives a brief overview of the primary types in this module: + +* A [`regex::Regex`] provides a way to search for matches of a regular +expression using lazy DFAs. This includes iterating over matches with both the +start and end positions of each match. +* A [`dfa::DFA`] provides direct low level access to a lazy DFA. + +# Example: basic regex searching + +This example shows how to compile a regex using the default configuration +and then use it to find matches in a byte string: + +``` +use regex_automata::{hybrid::regex::Regex, Match}; + +let re = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}")?; +let mut cache = re.create_cache(); + +let haystack = "2018-12-24 2016-10-08"; +let matches: Vec<Match> = re.find_iter(&mut cache, haystack).collect(); +assert_eq!(matches, vec![ + Match::must(0, 0..10), + Match::must(0, 11..21), +]); +# Ok::<(), Box<dyn std::error::Error>>(()) +``` + +# Example: searching with multiple regexes + +The lazy DFAs in this module all fully support searching with multiple regexes +simultaneously. You can use this support with standard leftmost-first style +searching to find non-overlapping matches: + +``` +# if cfg!(miri) { return Ok(()); } // miri takes too long +use regex_automata::{hybrid::regex::Regex, Match}; + +let re = Regex::new_many(&[r"\w+", r"\S+"])?; +let mut cache = re.create_cache(); + +let haystack = "@foo bar"; +let matches: Vec<Match> = re.find_iter(&mut cache, haystack).collect(); +assert_eq!(matches, vec![ + Match::must(1, 0..4), + Match::must(0, 5..8), +]); +# Ok::<(), Box<dyn std::error::Error>>(()) +``` + +# When should I use this? + +Generally speaking, if you can abide the use of mutable state during search, +and you don't need things like capturing groups or Unicode word boundary +support in non-ASCII text, then a lazy DFA is likely a robust choice with +respect to both search speed and memory usage. Note however that its speed +may be worse than a general purpose regex engine if you don't select a good +[prefilter](crate::util::prefilter). + +If you know ahead of time that your pattern would result in a very large DFA +if it was fully compiled, it may be better to use an NFA simulation instead +of a lazy DFA. Either that, or increase the cache capacity of your lazy DFA +to something that is big enough to hold the state machine (likely through +experimentation). The issue here is that if the cache is too small, then it +could wind up being reset too frequently and this might decrease searching +speed significantly. + +# Differences with fully compiled DFAs + +A [`hybrid::regex::Regex`](crate::hybrid::regex::Regex) and a +[`dfa::regex::Regex`](crate::dfa::regex::Regex) both have the same capabilities +(and similarly for their underlying DFAs), but they achieve them through +different means. The main difference is that a hybrid or "lazy" regex builds +its DFA lazily during search, where as a fully compiled regex will build its +DFA at construction time. While building a DFA at search time might sound like +it's slow, it tends to work out where most bytes seen during a search will +reuse pre-built parts of the DFA and thus can be almost as fast as a fully +compiled DFA. The main downside is that searching requires mutable space to +store the DFA, and, in the worst case, a search can result in a new state being +created for each byte seen, which would make searching quite a bit slower. + +A fully compiled DFA never has to worry about searches being slower once +it's built. (Aside from, say, the transition table being so large that it +is subject to harsh CPU cache effects.) However, of course, building a full +DFA can be quite time consuming and memory hungry. Particularly when large +Unicode character classes are used, which tend to translate into very large +DFAs. + +A lazy DFA strikes a nice balance _in practice_, particularly in the +presence of Unicode mode, by only building what is needed. It avoids the +worst case exponential time complexity of DFA compilation by guaranteeing that +it will only build at most one state per byte searched. While the worst +case here can lead to a very high constant, it will never be exponential. + +# Syntax + +This module supports the same syntax as the `regex` crate, since they share the +same parser. You can find an exhaustive list of supported syntax in the +[documentation for the `regex` crate](https://docs.rs/regex/1/regex/#syntax). + +There are two things that are not supported by the lazy DFAs in this module: + +* Capturing groups. The DFAs (and [`Regex`](regex::Regex)es built on top +of them) can only find the offsets of an entire match, but cannot resolve +the offsets of each capturing group. This is because DFAs do not have the +expressive power necessary. Note that it is okay to build a lazy DFA from an +NFA that contains capture groups. The capture groups will simply be ignored. +* Unicode word boundaries. These present particularly difficult challenges for +DFA construction and would result in an explosion in the number of states. +One can enable [`dfa::Config::unicode_word_boundary`] though, which provides +heuristic support for Unicode word boundaries that only works on ASCII text. +Otherwise, one can use `(?-u:\b)` for an ASCII word boundary, which will work +on any input. + +There are no plans to lift either of these limitations. + +Note that these restrictions are identical to the restrictions on fully +compiled DFAs. +*/ + +pub use self::{ + error::{BuildError, CacheError}, + id::LazyStateID, +}; + +pub mod dfa; +mod error; +mod id; +pub mod regex; +mod search; diff --git a/third_party/rust/regex-automata/src/hybrid/regex.rs b/third_party/rust/regex-automata/src/hybrid/regex.rs new file mode 100644 index 0000000000..75667daf91 --- /dev/null +++ b/third_party/rust/regex-automata/src/hybrid/regex.rs @@ -0,0 +1,895 @@ +/*! +A lazy DFA backed `Regex`. + +This module provides a [`Regex`] backed by a lazy DFA. A `Regex` implements +convenience routines you might have come to expect, such as finding a match +and iterating over all non-overlapping matches. This `Regex` type is limited +in its capabilities to what a lazy DFA can provide. Therefore, APIs involving +capturing groups, for example, are not provided. + +Internally, a `Regex` is composed of two DFAs. One is a "forward" DFA that +finds the end offset of a match, where as the other is a "reverse" DFA that +find the start offset of a match. + +See the [parent module](crate::hybrid) for examples. +*/ + +use crate::{ + hybrid::{ + dfa::{self, DFA}, + error::BuildError, + }, + nfa::thompson, + util::{ + iter, + search::{Anchored, Input, Match, MatchError, MatchKind}, + }, +}; + +/// A regular expression that uses hybrid NFA/DFAs (also called "lazy DFAs") +/// for searching. +/// +/// A regular expression is comprised of two lazy DFAs, a "forward" DFA and a +/// "reverse" DFA. The forward DFA is responsible for detecting the end of +/// a match while the reverse DFA is responsible for detecting the start +/// of a match. Thus, in order to find the bounds of any given match, a +/// forward search must first be run followed by a reverse search. A match +/// found by the forward DFA guarantees that the reverse DFA will also find +/// a match. +/// +/// # Fallibility +/// +/// Most of the search routines defined on this type will _panic_ when the +/// underlying search fails. This might be because the DFA gave up because it +/// saw a quit byte, whether configured explicitly or via heuristic Unicode +/// word boundary support, although neither are enabled by default. It might +/// also fail if the underlying DFA determines it isn't making effective use of +/// the cache (which also never happens by default). Or it might fail because +/// an invalid `Input` configuration is given, for example, with an unsupported +/// [`Anchored`] mode. +/// +/// If you need to handle these error cases instead of allowing them to trigger +/// a panic, then the lower level [`Regex::try_search`] provides a fallible API +/// that never panics. +/// +/// # Example +/// +/// This example shows how to cause a search to terminate if it sees a +/// `\n` byte, and handle the error returned. This could be useful if, for +/// example, you wanted to prevent a user supplied pattern from matching +/// across a line boundary. +/// +/// ``` +/// # if cfg!(miri) { return Ok(()); } // miri takes too long +/// use regex_automata::{hybrid::{dfa, regex::Regex}, Input, MatchError}; +/// +/// let re = Regex::builder() +/// .dfa(dfa::Config::new().quit(b'\n', true)) +/// .build(r"foo\p{any}+bar")?; +/// let mut cache = re.create_cache(); +/// +/// let input = Input::new("foo\nbar"); +/// // Normally this would produce a match, since \p{any} contains '\n'. +/// // But since we instructed the automaton to enter a quit state if a +/// // '\n' is observed, this produces a match error instead. +/// let expected = MatchError::quit(b'\n', 3); +/// let got = re.try_search(&mut cache, &input).unwrap_err(); +/// assert_eq!(expected, got); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +#[derive(Debug)] +pub struct Regex { + /// The forward lazy DFA. This can only find the end of a match. + forward: DFA, + /// The reverse lazy DFA. This can only find the start of a match. + /// + /// This is built with 'all' match semantics (instead of leftmost-first) + /// so that it always finds the longest possible match (which corresponds + /// to the leftmost starting position). It is also compiled as an anchored + /// matcher and has 'starts_for_each_pattern' enabled. Including starting + /// states for each pattern is necessary to ensure that we only look for + /// matches of a pattern that matched in the forward direction. Otherwise, + /// we might wind up finding the "leftmost" starting position of a totally + /// different pattern! + reverse: DFA, +} + +/// Convenience routines for regex and cache construction. +impl Regex { + /// Parse the given regular expression using the default configuration and + /// return the corresponding regex. + /// + /// If you want a non-default configuration, then use the [`Builder`] to + /// set your own configuration. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{hybrid::regex::Regex, Match}; + /// + /// let re = Regex::new("foo[0-9]+bar")?; + /// let mut cache = re.create_cache(); + /// assert_eq!( + /// Some(Match::must(0, 3..14)), + /// re.find(&mut cache, "zzzfoo12345barzzz"), + /// ); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[cfg(feature = "syntax")] + pub fn new(pattern: &str) -> Result<Regex, BuildError> { + Regex::builder().build(pattern) + } + + /// Like `new`, but parses multiple patterns into a single "multi regex." + /// This similarly uses the default regex configuration. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{hybrid::regex::Regex, Match}; + /// + /// let re = Regex::new_many(&["[a-z]+", "[0-9]+"])?; + /// let mut cache = re.create_cache(); + /// + /// let mut it = re.find_iter(&mut cache, "abc 1 foo 4567 0 quux"); + /// assert_eq!(Some(Match::must(0, 0..3)), it.next()); + /// assert_eq!(Some(Match::must(1, 4..5)), it.next()); + /// assert_eq!(Some(Match::must(0, 6..9)), it.next()); + /// assert_eq!(Some(Match::must(1, 10..14)), it.next()); + /// assert_eq!(Some(Match::must(1, 15..16)), it.next()); + /// assert_eq!(Some(Match::must(0, 17..21)), it.next()); + /// assert_eq!(None, it.next()); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[cfg(feature = "syntax")] + pub fn new_many<P: AsRef<str>>( + patterns: &[P], + ) -> Result<Regex, BuildError> { + Regex::builder().build_many(patterns) + } + + /// Return a builder for configuring the construction of a `Regex`. + /// + /// This is a convenience routine to avoid needing to import the + /// [`Builder`] type in common cases. + /// + /// # Example + /// + /// This example shows how to use the builder to disable UTF-8 mode + /// everywhere. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{ + /// hybrid::regex::Regex, nfa::thompson, util::syntax, Match, + /// }; + /// + /// let re = Regex::builder() + /// .syntax(syntax::Config::new().utf8(false)) + /// .thompson(thompson::Config::new().utf8(false)) + /// .build(r"foo(?-u:[^b])ar.*")?; + /// let mut cache = re.create_cache(); + /// + /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n"; + /// let expected = Some(Match::must(0, 1..9)); + /// let got = re.find(&mut cache, haystack); + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn builder() -> Builder { + Builder::new() + } + + /// Create a new cache for this `Regex`. + /// + /// The cache returned should only be used for searches for this + /// `Regex`. If you want to reuse the cache for another `Regex`, then + /// you must call [`Cache::reset`] with that `Regex` (or, equivalently, + /// [`Regex::reset_cache`]). + pub fn create_cache(&self) -> Cache { + Cache::new(self) + } + + /// Reset the given cache such that it can be used for searching with the + /// this `Regex` (and only this `Regex`). + /// + /// A cache reset permits reusing memory already allocated in this cache + /// with a different `Regex`. + /// + /// Resetting a cache sets its "clear count" to 0. This is relevant if the + /// `Regex` has been configured to "give up" after it has cleared the cache + /// a certain number of times. + /// + /// # Example + /// + /// This shows how to re-purpose a cache for use with a different `Regex`. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{hybrid::regex::Regex, Match}; + /// + /// let re1 = Regex::new(r"\w")?; + /// let re2 = Regex::new(r"\W")?; + /// + /// let mut cache = re1.create_cache(); + /// assert_eq!( + /// Some(Match::must(0, 0..2)), + /// re1.find(&mut cache, "Δ"), + /// ); + /// + /// // Using 'cache' with re2 is not allowed. It may result in panics or + /// // incorrect results. In order to re-purpose the cache, we must reset + /// // it with the Regex we'd like to use it with. + /// // + /// // Similarly, after this reset, using the cache with 're1' is also not + /// // allowed. + /// re2.reset_cache(&mut cache); + /// assert_eq!( + /// Some(Match::must(0, 0..3)), + /// re2.find(&mut cache, "☃"), + /// ); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn reset_cache(&self, cache: &mut Cache) { + self.forward().reset_cache(&mut cache.forward); + self.reverse().reset_cache(&mut cache.reverse); + } +} + +/// Standard infallible search routines for finding and iterating over matches. +impl Regex { + /// Returns true if and only if this regex matches the given haystack. + /// + /// This routine may short circuit if it knows that scanning future input + /// will never lead to a different result. In particular, if the underlying + /// DFA enters a match state or a dead state, then this routine will return + /// `true` or `false`, respectively, without inspecting any future input. + /// + /// # Panics + /// + /// This routine panics if the search could not complete. This can occur + /// in a number of circumstances: + /// + /// * The configuration of the lazy DFA may permit it to "quit" the search. + /// For example, setting quit bytes or enabling heuristic support for + /// Unicode word boundaries. The default configuration does not enable any + /// option that could result in the lazy DFA quitting. + /// * The configuration of the lazy DFA may also permit it to "give up" + /// on a search if it makes ineffective use of its transition table + /// cache. The default configuration does not enable this by default, + /// although it is typically a good idea to. + /// * When the provided `Input` configuration is not supported. For + /// example, by providing an unsupported anchor mode. + /// + /// When a search panics, callers cannot know whether a match exists or + /// not. + /// + /// Use [`Regex::try_search`] if you want to handle these error conditions. + /// + /// # Example + /// + /// ``` + /// use regex_automata::hybrid::regex::Regex; + /// + /// let re = Regex::new("foo[0-9]+bar")?; + /// let mut cache = re.create_cache(); + /// + /// assert!(re.is_match(&mut cache, "foo12345bar")); + /// assert!(!re.is_match(&mut cache, "foobar")); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn is_match<'h, I: Into<Input<'h>>>( + &self, + cache: &mut Cache, + input: I, + ) -> bool { + // Not only can we do an "earliest" search, but we can avoid doing a + // reverse scan too. + self.forward() + .try_search_fwd(&mut cache.forward, &input.into().earliest(true)) + .unwrap() + .is_some() + } + + /// Returns the start and end offset of the leftmost match. If no match + /// exists, then `None` is returned. + /// + /// # Panics + /// + /// This routine panics if the search could not complete. This can occur + /// in a number of circumstances: + /// + /// * The configuration of the lazy DFA may permit it to "quit" the search. + /// For example, setting quit bytes or enabling heuristic support for + /// Unicode word boundaries. The default configuration does not enable any + /// option that could result in the lazy DFA quitting. + /// * The configuration of the lazy DFA may also permit it to "give up" + /// on a search if it makes ineffective use of its transition table + /// cache. The default configuration does not enable this by default, + /// although it is typically a good idea to. + /// * When the provided `Input` configuration is not supported. For + /// example, by providing an unsupported anchor mode. + /// + /// When a search panics, callers cannot know whether a match exists or + /// not. + /// + /// Use [`Regex::try_search`] if you want to handle these error conditions. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{Match, hybrid::regex::Regex}; + /// + /// let re = Regex::new("foo[0-9]+")?; + /// let mut cache = re.create_cache(); + /// assert_eq!( + /// Some(Match::must(0, 3..11)), + /// re.find(&mut cache, "zzzfoo12345zzz"), + /// ); + /// + /// // Even though a match is found after reading the first byte (`a`), + /// // the default leftmost-first match semantics demand that we find the + /// // earliest match that prefers earlier parts of the pattern over latter + /// // parts. + /// let re = Regex::new("abc|a")?; + /// let mut cache = re.create_cache(); + /// assert_eq!(Some(Match::must(0, 0..3)), re.find(&mut cache, "abc")); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn find<'h, I: Into<Input<'h>>>( + &self, + cache: &mut Cache, + input: I, + ) -> Option<Match> { + self.try_search(cache, &input.into()).unwrap() + } + + /// Returns an iterator over all non-overlapping leftmost matches in the + /// given bytes. If no match exists, then the iterator yields no elements. + /// + /// # Panics + /// + /// This routine panics if the search could not complete. This can occur + /// in a number of circumstances: + /// + /// * The configuration of the lazy DFA may permit it to "quit" the search. + /// For example, setting quit bytes or enabling heuristic support for + /// Unicode word boundaries. The default configuration does not enable any + /// option that could result in the lazy DFA quitting. + /// * The configuration of the lazy DFA may also permit it to "give up" + /// on a search if it makes ineffective use of its transition table + /// cache. The default configuration does not enable this by default, + /// although it is typically a good idea to. + /// * When the provided `Input` configuration is not supported. For + /// example, by providing an unsupported anchor mode. + /// + /// When a search panics, callers cannot know whether a match exists or + /// not. + /// + /// The above conditions also apply to the iterator returned as well. For + /// example, if the lazy DFA gives up or quits during a search using this + /// method, then a panic will occur during iteration. + /// + /// Use [`Regex::try_search`] with [`util::iter::Searcher`](iter::Searcher) + /// if you want to handle these error conditions. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{hybrid::regex::Regex, Match}; + /// + /// let re = Regex::new("foo[0-9]+")?; + /// let mut cache = re.create_cache(); + /// + /// let text = "foo1 foo12 foo123"; + /// let matches: Vec<Match> = re.find_iter(&mut cache, text).collect(); + /// assert_eq!(matches, vec![ + /// Match::must(0, 0..4), + /// Match::must(0, 5..10), + /// Match::must(0, 11..17), + /// ]); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn find_iter<'r, 'c, 'h, I: Into<Input<'h>>>( + &'r self, + cache: &'c mut Cache, + input: I, + ) -> FindMatches<'r, 'c, 'h> { + let it = iter::Searcher::new(input.into()); + FindMatches { re: self, cache, it } + } +} + +/// Lower level "search" primitives that accept a `&Input` for cheap reuse +/// and return an error if one occurs instead of panicking. +impl Regex { + /// Returns the start and end offset of the leftmost match. If no match + /// exists, then `None` is returned. + /// + /// This is like [`Regex::find`] but with two differences: + /// + /// 1. It is not generic over `Into<Input>` and instead accepts a + /// `&Input`. This permits reusing the same `Input` for multiple searches + /// without needing to create a new one. This _may_ help with latency. + /// 2. It returns an error if the search could not complete where as + /// [`Regex::find`] will panic. + /// + /// # Errors + /// + /// This routine errors if the search could not complete. This can occur + /// in a number of circumstances: + /// + /// * The configuration of the lazy DFA may permit it to "quit" the search. + /// For example, setting quit bytes or enabling heuristic support for + /// Unicode word boundaries. The default configuration does not enable any + /// option that could result in the lazy DFA quitting. + /// * The configuration of the lazy DFA may also permit it to "give up" + /// on a search if it makes ineffective use of its transition table + /// cache. The default configuration does not enable this by default, + /// although it is typically a good idea to. + /// * When the provided `Input` configuration is not supported. For + /// example, by providing an unsupported anchor mode. + /// + /// When a search returns an error, callers cannot know whether a match + /// exists or not. + #[inline] + pub fn try_search( + &self, + cache: &mut Cache, + input: &Input<'_>, + ) -> Result<Option<Match>, MatchError> { + let (fcache, rcache) = (&mut cache.forward, &mut cache.reverse); + let end = match self.forward().try_search_fwd(fcache, input)? { + None => return Ok(None), + Some(end) => end, + }; + // This special cases an empty match at the beginning of the search. If + // our end matches our start, then since a reverse DFA can't match past + // the start, it must follow that our starting position is also our end + // position. So short circuit and skip the reverse search. + if input.start() == end.offset() { + return Ok(Some(Match::new( + end.pattern(), + end.offset()..end.offset(), + ))); + } + // We can also skip the reverse search if we know our search was + // anchored. This occurs either when the input config is anchored or + // when we know the regex itself is anchored. In this case, we know the + // start of the match, if one is found, must be the start of the + // search. + if self.is_anchored(input) { + return Ok(Some(Match::new( + end.pattern(), + input.start()..end.offset(), + ))); + } + // N.B. I have tentatively convinced myself that it isn't necessary + // to specify the specific pattern for the reverse search since the + // reverse search will always find the same pattern to match as the + // forward search. But I lack a rigorous proof. Why not just provide + // the pattern anyway? Well, if it is needed, then leaving it out + // gives us a chance to find a witness. (Also, if we don't need to + // specify the pattern, then we don't need to build the reverse DFA + // with 'starts_for_each_pattern' enabled. It doesn't matter too much + // for the lazy DFA, but does make the overall DFA bigger.) + // + // We also need to be careful to disable 'earliest' for the reverse + // search, since it could be enabled for the forward search. In the + // reverse case, to satisfy "leftmost" criteria, we need to match as + // much as we can. We also need to be careful to make the search + // anchored. We don't want the reverse search to report any matches + // other than the one beginning at the end of our forward search. + let revsearch = input + .clone() + .span(input.start()..end.offset()) + .anchored(Anchored::Yes) + .earliest(false); + let start = self + .reverse() + .try_search_rev(rcache, &revsearch)? + .expect("reverse search must match if forward search does"); + debug_assert_eq!( + start.pattern(), + end.pattern(), + "forward and reverse search must match same pattern", + ); + debug_assert!(start.offset() <= end.offset()); + Ok(Some(Match::new(end.pattern(), start.offset()..end.offset()))) + } + + /// Returns true if either the given input specifies an anchored search + /// or if the underlying NFA is always anchored. + fn is_anchored(&self, input: &Input<'_>) -> bool { + match input.get_anchored() { + Anchored::No => { + self.forward().get_nfa().is_always_start_anchored() + } + Anchored::Yes | Anchored::Pattern(_) => true, + } + } +} + +/// Non-search APIs for querying information about the regex and setting a +/// prefilter. +impl Regex { + /// Return the underlying lazy DFA responsible for forward matching. + /// + /// This is useful for accessing the underlying lazy DFA and using it + /// directly if the situation calls for it. + pub fn forward(&self) -> &DFA { + &self.forward + } + + /// Return the underlying lazy DFA responsible for reverse matching. + /// + /// This is useful for accessing the underlying lazy DFA and using it + /// directly if the situation calls for it. + pub fn reverse(&self) -> &DFA { + &self.reverse + } + + /// Returns the total number of patterns matched by this regex. + /// + /// # Example + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::hybrid::regex::Regex; + /// + /// let re = Regex::new_many(&[r"[a-z]+", r"[0-9]+", r"\w+"])?; + /// assert_eq!(3, re.pattern_len()); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn pattern_len(&self) -> usize { + assert_eq!(self.forward().pattern_len(), self.reverse().pattern_len()); + self.forward().pattern_len() + } +} + +/// An iterator over all non-overlapping matches for an infallible search. +/// +/// The iterator yields a [`Match`] value until no more matches could be found. +/// If the underlying regex engine returns an error, then a panic occurs. +/// +/// The lifetime parameters are as follows: +/// +/// * `'r` represents the lifetime of the regex object. +/// * `'h` represents the lifetime of the haystack being searched. +/// * `'c` represents the lifetime of the regex cache. +/// +/// This iterator can be created with the [`Regex::find_iter`] method. +#[derive(Debug)] +pub struct FindMatches<'r, 'c, 'h> { + re: &'r Regex, + cache: &'c mut Cache, + it: iter::Searcher<'h>, +} + +impl<'r, 'c, 'h> Iterator for FindMatches<'r, 'c, 'h> { + type Item = Match; + + #[inline] + fn next(&mut self) -> Option<Match> { + let FindMatches { re, ref mut cache, ref mut it } = *self; + it.advance(|input| re.try_search(cache, input)) + } +} + +/// A cache represents a partially computed forward and reverse DFA. +/// +/// A cache is the key component that differentiates a classical DFA and a +/// hybrid NFA/DFA (also called a "lazy DFA"). Where a classical DFA builds a +/// complete transition table that can handle all possible inputs, a hybrid +/// NFA/DFA starts with an empty transition table and builds only the parts +/// required during search. The parts that are built are stored in a cache. For +/// this reason, a cache is a required parameter for nearly every operation on +/// a [`Regex`]. +/// +/// Caches can be created from their corresponding `Regex` via +/// [`Regex::create_cache`]. A cache can only be used with either the `Regex` +/// that created it, or the `Regex` that was most recently used to reset it +/// with [`Cache::reset`]. Using a cache with any other `Regex` may result in +/// panics or incorrect results. +#[derive(Debug, Clone)] +pub struct Cache { + forward: dfa::Cache, + reverse: dfa::Cache, +} + +impl Cache { + /// Create a new cache for the given `Regex`. + /// + /// The cache returned should only be used for searches for the given + /// `Regex`. If you want to reuse the cache for another `Regex`, then you + /// must call [`Cache::reset`] with that `Regex`. + pub fn new(re: &Regex) -> Cache { + let forward = dfa::Cache::new(re.forward()); + let reverse = dfa::Cache::new(re.reverse()); + Cache { forward, reverse } + } + + /// Reset this cache such that it can be used for searching with the given + /// `Regex` (and only that `Regex`). + /// + /// A cache reset permits reusing memory already allocated in this cache + /// with a different `Regex`. + /// + /// Resetting a cache sets its "clear count" to 0. This is relevant if the + /// `Regex` has been configured to "give up" after it has cleared the cache + /// a certain number of times. + /// + /// # Example + /// + /// This shows how to re-purpose a cache for use with a different `Regex`. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{hybrid::regex::Regex, Match}; + /// + /// let re1 = Regex::new(r"\w")?; + /// let re2 = Regex::new(r"\W")?; + /// + /// let mut cache = re1.create_cache(); + /// assert_eq!( + /// Some(Match::must(0, 0..2)), + /// re1.find(&mut cache, "Δ"), + /// ); + /// + /// // Using 'cache' with re2 is not allowed. It may result in panics or + /// // incorrect results. In order to re-purpose the cache, we must reset + /// // it with the Regex we'd like to use it with. + /// // + /// // Similarly, after this reset, using the cache with 're1' is also not + /// // allowed. + /// cache.reset(&re2); + /// assert_eq!( + /// Some(Match::must(0, 0..3)), + /// re2.find(&mut cache, "☃"), + /// ); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn reset(&mut self, re: &Regex) { + self.forward.reset(re.forward()); + self.reverse.reset(re.reverse()); + } + + /// Return a reference to the forward cache. + pub fn forward(&mut self) -> &dfa::Cache { + &self.forward + } + + /// Return a reference to the reverse cache. + pub fn reverse(&mut self) -> &dfa::Cache { + &self.reverse + } + + /// Return a mutable reference to the forward cache. + /// + /// If you need mutable references to both the forward and reverse caches, + /// then use [`Cache::as_parts_mut`]. + pub fn forward_mut(&mut self) -> &mut dfa::Cache { + &mut self.forward + } + + /// Return a mutable reference to the reverse cache. + /// + /// If you need mutable references to both the forward and reverse caches, + /// then use [`Cache::as_parts_mut`]. + pub fn reverse_mut(&mut self) -> &mut dfa::Cache { + &mut self.reverse + } + + /// Return references to the forward and reverse caches, respectively. + pub fn as_parts(&self) -> (&dfa::Cache, &dfa::Cache) { + (&self.forward, &self.reverse) + } + + /// Return mutable references to the forward and reverse caches, + /// respectively. + pub fn as_parts_mut(&mut self) -> (&mut dfa::Cache, &mut dfa::Cache) { + (&mut self.forward, &mut self.reverse) + } + + /// Returns the heap memory usage, in bytes, as a sum of the forward and + /// reverse lazy DFA caches. + /// + /// This does **not** include the stack size used up by this cache. To + /// compute that, use `std::mem::size_of::<Cache>()`. + pub fn memory_usage(&self) -> usize { + self.forward.memory_usage() + self.reverse.memory_usage() + } +} + +/// A builder for a regex based on a hybrid NFA/DFA. +/// +/// This builder permits configuring options for the syntax of a pattern, the +/// NFA construction, the lazy DFA construction and finally the regex searching +/// itself. This builder is different from a general purpose regex builder +/// in that it permits fine grain configuration of the construction process. +/// The trade off for this is complexity, and the possibility of setting a +/// configuration that might not make sense. For example, there are two +/// different UTF-8 modes: +/// +/// * [`syntax::Config::utf8`](crate::util::syntax::Config::utf8) controls +/// whether the pattern itself can contain sub-expressions that match invalid +/// UTF-8. +/// * [`thompson::Config::utf8`] controls how the regex iterators themselves +/// advance the starting position of the next search when a match with zero +/// length is found. +/// +/// Generally speaking, callers will want to either enable all of these or +/// disable all of these. +/// +/// Internally, building a regex requires building two hybrid NFA/DFAs, +/// where one is responsible for finding the end of a match and the other is +/// responsible for finding the start of a match. If you only need to detect +/// whether something matched, or only the end of a match, then you should use +/// a [`dfa::Builder`] to construct a single hybrid NFA/DFA, which is cheaper +/// than building two of them. +/// +/// # Example +/// +/// This example shows how to disable UTF-8 mode in the syntax and the regex +/// itself. This is generally what you want for matching on arbitrary bytes. +/// +/// ``` +/// # if cfg!(miri) { return Ok(()); } // miri takes too long +/// use regex_automata::{ +/// hybrid::regex::Regex, nfa::thompson, util::syntax, Match, +/// }; +/// +/// let re = Regex::builder() +/// .syntax(syntax::Config::new().utf8(false)) +/// .thompson(thompson::Config::new().utf8(false)) +/// .build(r"foo(?-u:[^b])ar.*")?; +/// let mut cache = re.create_cache(); +/// +/// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n"; +/// let expected = Some(Match::must(0, 1..9)); +/// let got = re.find(&mut cache, haystack); +/// assert_eq!(expected, got); +/// // Notice that `(?-u:[^b])` matches invalid UTF-8, +/// // but the subsequent `.*` does not! Disabling UTF-8 +/// // on the syntax permits this. +/// assert_eq!(b"foo\xFFarzz", &haystack[got.unwrap().range()]); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +#[derive(Clone, Debug)] +pub struct Builder { + dfa: dfa::Builder, +} + +impl Builder { + /// Create a new regex builder with the default configuration. + pub fn new() -> Builder { + Builder { dfa: DFA::builder() } + } + + /// Build a regex from the given pattern. + /// + /// If there was a problem parsing or compiling the pattern, then an error + /// is returned. + #[cfg(feature = "syntax")] + pub fn build(&self, pattern: &str) -> Result<Regex, BuildError> { + self.build_many(&[pattern]) + } + + /// Build a regex from the given patterns. + #[cfg(feature = "syntax")] + pub fn build_many<P: AsRef<str>>( + &self, + patterns: &[P], + ) -> Result<Regex, BuildError> { + let forward = self.dfa.build_many(patterns)?; + let reverse = self + .dfa + .clone() + .configure( + DFA::config() + .prefilter(None) + .specialize_start_states(false) + .match_kind(MatchKind::All), + ) + .thompson(thompson::Config::new().reverse(true)) + .build_many(patterns)?; + Ok(self.build_from_dfas(forward, reverse)) + } + + /// Build a regex from its component forward and reverse hybrid NFA/DFAs. + /// + /// This is useful when you've built a forward and reverse lazy DFA + /// separately, and want to combine them into a single regex. Once build, + /// the individual DFAs given can still be accessed via [`Regex::forward`] + /// and [`Regex::reverse`]. + /// + /// It is important that the reverse lazy DFA be compiled under the + /// following conditions: + /// + /// * It should use [`MatchKind::All`] semantics. + /// * It should match in reverse. + /// * Otherwise, its configuration should match the forward DFA. + /// + /// If these conditions aren't satisfied, then the behavior of searches is + /// unspecified. + /// + /// Note that when using this constructor, no configuration is applied. + /// Since this routine provides the DFAs to the builder, there is no + /// opportunity to apply other configuration options. + /// + /// # Example + /// + /// This shows how to build individual lazy forward and reverse DFAs, and + /// then combine them into a single `Regex`. + /// + /// ``` + /// use regex_automata::{ + /// hybrid::{dfa::DFA, regex::Regex}, + /// nfa::thompson, + /// MatchKind, + /// }; + /// + /// let fwd = DFA::new(r"foo[0-9]+")?; + /// let rev = DFA::builder() + /// .configure(DFA::config().match_kind(MatchKind::All)) + /// .thompson(thompson::Config::new().reverse(true)) + /// .build(r"foo[0-9]+")?; + /// + /// let re = Regex::builder().build_from_dfas(fwd, rev); + /// let mut cache = re.create_cache(); + /// assert_eq!(true, re.is_match(&mut cache, "foo123")); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn build_from_dfas(&self, forward: DFA, reverse: DFA) -> Regex { + Regex { forward, reverse } + } + + /// Set the syntax configuration for this builder using + /// [`syntax::Config`](crate::util::syntax::Config). + /// + /// This permits setting things like case insensitivity, Unicode and multi + /// line mode. + #[cfg(feature = "syntax")] + pub fn syntax( + &mut self, + config: crate::util::syntax::Config, + ) -> &mut Builder { + self.dfa.syntax(config); + self + } + + /// Set the Thompson NFA configuration for this builder using + /// [`nfa::thompson::Config`](thompson::Config). + /// + /// This permits setting things like whether additional time should be + /// spent shrinking the size of the NFA. + #[cfg(feature = "syntax")] + pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder { + self.dfa.thompson(config); + self + } + + /// Set the lazy DFA compilation configuration for this builder using + /// [`dfa::Config`](dfa::Config). + /// + /// This permits setting things like whether Unicode word boundaries should + /// be heuristically supported or settings how the behavior of the cache. + pub fn dfa(&mut self, config: dfa::Config) -> &mut Builder { + self.dfa.configure(config); + self + } +} + +impl Default for Builder { + fn default() -> Builder { + Builder::new() + } +} diff --git a/third_party/rust/regex-automata/src/hybrid/search.rs b/third_party/rust/regex-automata/src/hybrid/search.rs new file mode 100644 index 0000000000..f232836854 --- /dev/null +++ b/third_party/rust/regex-automata/src/hybrid/search.rs @@ -0,0 +1,802 @@ +use crate::{ + hybrid::{ + dfa::{Cache, OverlappingState, DFA}, + id::LazyStateID, + }, + util::{ + prefilter::Prefilter, + search::{HalfMatch, Input, MatchError, Span}, + }, +}; + +#[inline(never)] +pub(crate) fn find_fwd( + dfa: &DFA, + cache: &mut Cache, + input: &Input<'_>, +) -> Result<Option<HalfMatch>, MatchError> { + if input.is_done() { + return Ok(None); + } + let pre = if input.get_anchored().is_anchored() { + None + } else { + dfa.get_config().get_prefilter() + }; + // So what we do here is specialize four different versions of 'find_fwd': + // one for each of the combinations for 'has prefilter' and 'is earliest + // search'. The reason for doing this is that both of these things require + // branches and special handling in some code that can be very hot, + // and shaving off as much as we can when we don't need it tends to be + // beneficial in ad hoc benchmarks. To see these differences, you often + // need a query with a high match count. In other words, specializing these + // four routines *tends* to help latency more than throughput. + if pre.is_some() { + if input.get_earliest() { + find_fwd_imp(dfa, cache, input, pre, true) + } else { + find_fwd_imp(dfa, cache, input, pre, false) + } + } else { + if input.get_earliest() { + find_fwd_imp(dfa, cache, input, None, true) + } else { + find_fwd_imp(dfa, cache, input, None, false) + } + } +} + +#[cfg_attr(feature = "perf-inline", inline(always))] +fn find_fwd_imp( + dfa: &DFA, + cache: &mut Cache, + input: &Input<'_>, + pre: Option<&'_ Prefilter>, + earliest: bool, +) -> Result<Option<HalfMatch>, MatchError> { + // See 'prefilter_restart' docs for explanation. + let universal_start = dfa.get_nfa().look_set_prefix_any().is_empty(); + let mut mat = None; + let mut sid = init_fwd(dfa, cache, input)?; + let mut at = input.start(); + // This could just be a closure, but then I think it would be unsound + // because it would need to be safe to invoke. This way, the lack of safety + // is clearer in the code below. + macro_rules! next_unchecked { + ($sid:expr, $at:expr) => {{ + let byte = *input.haystack().get_unchecked($at); + dfa.next_state_untagged_unchecked(cache, $sid, byte) + }}; + } + + if let Some(ref pre) = pre { + let span = Span::from(at..input.end()); + match pre.find(input.haystack(), span) { + None => return Ok(mat), + Some(ref span) => { + at = span.start; + if !universal_start { + sid = prefilter_restart(dfa, cache, &input, at)?; + } + } + } + } + cache.search_start(at); + while at < input.end() { + if sid.is_tagged() { + cache.search_update(at); + sid = dfa + .next_state(cache, sid, input.haystack()[at]) + .map_err(|_| gave_up(at))?; + } else { + // SAFETY: There are two safety invariants we need to uphold + // here in the loops below: that 'sid' and 'prev_sid' are valid + // state IDs for this DFA, and that 'at' is a valid index into + // 'haystack'. For the former, we rely on the invariant that + // next_state* and start_state_forward always returns a valid state + // ID (given a valid state ID in the former case), and that we are + // only at this place in the code if 'sid' is untagged. Moreover, + // every call to next_state_untagged_unchecked below is guarded by + // a check that sid is untagged. For the latter safety invariant, + // we always guard unchecked access with a check that 'at' is less + // than 'end', where 'end <= haystack.len()'. In the unrolled loop + // below, we ensure that 'at' is always in bounds. + // + // PERF: For justification of omitting bounds checks, it gives us a + // ~10% bump in search time. This was used for a benchmark: + // + // regex-cli find hybrid dfa @bigfile '(?m)^.+$' -UBb + // + // PERF: For justification for the loop unrolling, we use a few + // different tests: + // + // regex-cli find hybrid dfa @$bigfile '\w{50}' -UBb + // regex-cli find hybrid dfa @$bigfile '(?m)^.+$' -UBb + // regex-cli find hybrid dfa @$bigfile 'ZQZQZQZQ' -UBb + // + // And there are three different configurations: + // + // nounroll: this entire 'else' block vanishes and we just + // always use 'dfa.next_state(..)'. + // unroll1: just the outer loop below + // unroll2: just the inner loop below + // unroll3: both the outer and inner loops below + // + // This results in a matrix of timings for each of the above + // regexes with each of the above unrolling configurations: + // + // '\w{50}' '(?m)^.+$' 'ZQZQZQZQ' + // nounroll 1.51s 2.34s 1.51s + // unroll1 1.53s 2.32s 1.56s + // unroll2 2.22s 1.50s 0.61s + // unroll3 1.67s 1.45s 0.61s + // + // Ideally we'd be able to find a configuration that yields the + // best time for all regexes, but alas we settle for unroll3 that + // gives us *almost* the best for '\w{50}' and the best for the + // other two regexes. + // + // So what exactly is going on here? The first unrolling (grouping + // together runs of untagged transitions) specifically targets + // our choice of representation. The second unrolling (grouping + // together runs of self-transitions) specifically targets a common + // DFA topology. Let's dig in a little bit by looking at our + // regexes: + // + // '\w{50}': This regex spends a lot of time outside of the DFA's + // start state matching some part of the '\w' repetition. This + // means that it's a bit of a worst case for loop unrolling that + // targets self-transitions since the self-transitions in '\w{50}' + // are not particularly active for this haystack. However, the + // first unrolling (grouping together untagged transitions) + // does apply quite well here since very few transitions hit + // match/dead/quit/unknown states. It is however worth mentioning + // that if start states are configured to be tagged (which you + // typically want to do if you have a prefilter), then this regex + // actually slows way down because it is constantly ping-ponging + // out of the unrolled loop and into the handling of a tagged start + // state below. But when start states aren't tagged, the unrolled + // loop stays hot. (This is why it's imperative that start state + // tagging be disabled when there isn't a prefilter!) + // + // '(?m)^.+$': There are two important aspects of this regex: 1) + // on this haystack, its match count is very high, much higher + // than the other two regex and 2) it spends the vast majority + // of its time matching '.+'. Since Unicode mode is disabled, + // this corresponds to repeatedly following self transitions for + // the vast majority of the input. This does benefit from the + // untagged unrolling since most of the transitions will be to + // untagged states, but the untagged unrolling does more work than + // what is actually required. Namely, it has to keep track of the + // previous and next state IDs, which I guess requires a bit more + // shuffling. This is supported by the fact that nounroll+unroll1 + // are both slower than unroll2+unroll3, where the latter has a + // loop unrolling that specifically targets self-transitions. + // + // 'ZQZQZQZQ': This one is very similar to '(?m)^.+$' because it + // spends the vast majority of its time in self-transitions for + // the (implicit) unanchored prefix. The main difference with + // '(?m)^.+$' is that it has a much lower match count. So there + // isn't much time spent in the overhead of reporting matches. This + // is the primary explainer in the perf difference here. We include + // this regex and the former to make sure we have comparison points + // with high and low match counts. + // + // NOTE: I used 'OpenSubtitles2018.raw.sample.en' for 'bigfile'. + // + // NOTE: In a follow-up, it turns out that the "inner" loop + // mentioned above was a pretty big pessimization in some other + // cases. Namely, it resulted in too much ping-ponging into and out + // of the loop, which resulted in nearly ~2x regressions in search + // time when compared to the originally lazy DFA in the regex crate. + // So I've removed the second loop unrolling that targets the + // self-transition case. + let mut prev_sid = sid; + while at < input.end() { + prev_sid = unsafe { next_unchecked!(sid, at) }; + if prev_sid.is_tagged() || at + 3 >= input.end() { + core::mem::swap(&mut prev_sid, &mut sid); + break; + } + at += 1; + + sid = unsafe { next_unchecked!(prev_sid, at) }; + if sid.is_tagged() { + break; + } + at += 1; + + prev_sid = unsafe { next_unchecked!(sid, at) }; + if prev_sid.is_tagged() { + core::mem::swap(&mut prev_sid, &mut sid); + break; + } + at += 1; + + sid = unsafe { next_unchecked!(prev_sid, at) }; + if sid.is_tagged() { + break; + } + at += 1; + } + // If we quit out of the code above with an unknown state ID at + // any point, then we need to re-compute that transition using + // 'next_state', which will do NFA powerset construction for us. + if sid.is_unknown() { + cache.search_update(at); + sid = dfa + .next_state(cache, prev_sid, input.haystack()[at]) + .map_err(|_| gave_up(at))?; + } + } + if sid.is_tagged() { + if sid.is_start() { + if let Some(ref pre) = pre { + let span = Span::from(at..input.end()); + match pre.find(input.haystack(), span) { + None => { + cache.search_finish(span.end); + return Ok(mat); + } + Some(ref span) => { + // We want to skip any update to 'at' below + // at the end of this iteration and just + // jump immediately back to the next state + // transition at the leading position of the + // candidate match. + // + // ... but only if we actually made progress + // with our prefilter, otherwise if the start + // state has a self-loop, we can get stuck. + if span.start > at { + at = span.start; + if !universal_start { + sid = prefilter_restart( + dfa, cache, &input, at, + )?; + } + continue; + } + } + } + } + } else if sid.is_match() { + let pattern = dfa.match_pattern(cache, sid, 0); + // Since slice ranges are inclusive at the beginning and + // exclusive at the end, and since forward searches report + // the end, we can return 'at' as-is. This only works because + // matches are delayed by 1 byte. So by the time we observe a + // match, 'at' has already been set to 1 byte past the actual + // match location, which is precisely the exclusive ending + // bound of the match. + mat = Some(HalfMatch::new(pattern, at)); + if earliest { + cache.search_finish(at); + return Ok(mat); + } + } else if sid.is_dead() { + cache.search_finish(at); + return Ok(mat); + } else if sid.is_quit() { + cache.search_finish(at); + return Err(MatchError::quit(input.haystack()[at], at)); + } else { + debug_assert!(sid.is_unknown()); + unreachable!("sid being unknown is a bug"); + } + } + at += 1; + } + eoi_fwd(dfa, cache, input, &mut sid, &mut mat)?; + cache.search_finish(input.end()); + Ok(mat) +} + +#[inline(never)] +pub(crate) fn find_rev( + dfa: &DFA, + cache: &mut Cache, + input: &Input<'_>, +) -> Result<Option<HalfMatch>, MatchError> { + if input.is_done() { + return Ok(None); + } + if input.get_earliest() { + find_rev_imp(dfa, cache, input, true) + } else { + find_rev_imp(dfa, cache, input, false) + } +} + +#[cfg_attr(feature = "perf-inline", inline(always))] +fn find_rev_imp( + dfa: &DFA, + cache: &mut Cache, + input: &Input<'_>, + earliest: bool, +) -> Result<Option<HalfMatch>, MatchError> { + let mut mat = None; + let mut sid = init_rev(dfa, cache, input)?; + // In reverse search, the loop below can't handle the case of searching an + // empty slice. Ideally we could write something congruent to the forward + // search, i.e., 'while at >= start', but 'start' might be 0. Since we use + // an unsigned offset, 'at >= 0' is trivially always true. We could avoid + // this extra case handling by using a signed offset, but Rust makes it + // annoying to do. So... We just handle the empty case separately. + if input.start() == input.end() { + eoi_rev(dfa, cache, input, &mut sid, &mut mat)?; + return Ok(mat); + } + + let mut at = input.end() - 1; + macro_rules! next_unchecked { + ($sid:expr, $at:expr) => {{ + let byte = *input.haystack().get_unchecked($at); + dfa.next_state_untagged_unchecked(cache, $sid, byte) + }}; + } + cache.search_start(at); + loop { + if sid.is_tagged() { + cache.search_update(at); + sid = dfa + .next_state(cache, sid, input.haystack()[at]) + .map_err(|_| gave_up(at))?; + } else { + // SAFETY: See comments in 'find_fwd' for a safety argument. + // + // PERF: The comments in 'find_fwd' also provide a justification + // from a performance perspective as to 1) why we elide bounds + // checks and 2) why we do a specialized version of unrolling + // below. The reverse search does have a slightly different + // consideration in that most reverse searches tend to be + // anchored and on shorter haystacks. However, this still makes a + // difference. Take this command for example: + // + // regex-cli find hybrid regex @$bigfile '(?m)^.+$' -UBb + // + // (Notice that we use 'find hybrid regex', not 'find hybrid dfa' + // like in the justification for the forward direction. The 'regex' + // sub-command will find start-of-match and thus run the reverse + // direction.) + // + // Without unrolling below, the above command takes around 3.76s. + // But with the unrolling below, we get down to 2.55s. If we keep + // the unrolling but add in bounds checks, then we get 2.86s. + // + // NOTE: I used 'OpenSubtitles2018.raw.sample.en' for 'bigfile'. + let mut prev_sid = sid; + while at >= input.start() { + prev_sid = unsafe { next_unchecked!(sid, at) }; + if prev_sid.is_tagged() + || at <= input.start().saturating_add(3) + { + core::mem::swap(&mut prev_sid, &mut sid); + break; + } + at -= 1; + + sid = unsafe { next_unchecked!(prev_sid, at) }; + if sid.is_tagged() { + break; + } + at -= 1; + + prev_sid = unsafe { next_unchecked!(sid, at) }; + if prev_sid.is_tagged() { + core::mem::swap(&mut prev_sid, &mut sid); + break; + } + at -= 1; + + sid = unsafe { next_unchecked!(prev_sid, at) }; + if sid.is_tagged() { + break; + } + at -= 1; + } + // If we quit out of the code above with an unknown state ID at + // any point, then we need to re-compute that transition using + // 'next_state', which will do NFA powerset construction for us. + if sid.is_unknown() { + cache.search_update(at); + sid = dfa + .next_state(cache, prev_sid, input.haystack()[at]) + .map_err(|_| gave_up(at))?; + } + } + if sid.is_tagged() { + if sid.is_start() { + // do nothing + } else if sid.is_match() { + let pattern = dfa.match_pattern(cache, sid, 0); + // Since reverse searches report the beginning of a match + // and the beginning is inclusive (not exclusive like the + // end of a match), we add 1 to make it inclusive. + mat = Some(HalfMatch::new(pattern, at + 1)); + if earliest { + cache.search_finish(at); + return Ok(mat); + } + } else if sid.is_dead() { + cache.search_finish(at); + return Ok(mat); + } else if sid.is_quit() { + cache.search_finish(at); + return Err(MatchError::quit(input.haystack()[at], at)); + } else { + debug_assert!(sid.is_unknown()); + unreachable!("sid being unknown is a bug"); + } + } + if at == input.start() { + break; + } + at -= 1; + } + cache.search_finish(input.start()); + eoi_rev(dfa, cache, input, &mut sid, &mut mat)?; + Ok(mat) +} + +#[inline(never)] +pub(crate) fn find_overlapping_fwd( + dfa: &DFA, + cache: &mut Cache, + input: &Input<'_>, + state: &mut OverlappingState, +) -> Result<(), MatchError> { + state.mat = None; + if input.is_done() { + return Ok(()); + } + let pre = if input.get_anchored().is_anchored() { + None + } else { + dfa.get_config().get_prefilter() + }; + if pre.is_some() { + find_overlapping_fwd_imp(dfa, cache, input, pre, state) + } else { + find_overlapping_fwd_imp(dfa, cache, input, None, state) + } +} + +#[cfg_attr(feature = "perf-inline", inline(always))] +fn find_overlapping_fwd_imp( + dfa: &DFA, + cache: &mut Cache, + input: &Input<'_>, + pre: Option<&'_ Prefilter>, + state: &mut OverlappingState, +) -> Result<(), MatchError> { + // See 'prefilter_restart' docs for explanation. + let universal_start = dfa.get_nfa().look_set_prefix_any().is_empty(); + let mut sid = match state.id { + None => { + state.at = input.start(); + init_fwd(dfa, cache, input)? + } + Some(sid) => { + if let Some(match_index) = state.next_match_index { + let match_len = dfa.match_len(cache, sid); + if match_index < match_len { + state.next_match_index = Some(match_index + 1); + let pattern = dfa.match_pattern(cache, sid, match_index); + state.mat = Some(HalfMatch::new(pattern, state.at)); + return Ok(()); + } + } + // Once we've reported all matches at a given position, we need to + // advance the search to the next position. + state.at += 1; + if state.at > input.end() { + return Ok(()); + } + sid + } + }; + + // NOTE: We don't optimize the crap out of this routine primarily because + // it seems like most overlapping searches will have higher match counts, + // and thus, throughput is perhaps not as important. But if you have a use + // case for something faster, feel free to file an issue. + cache.search_start(state.at); + while state.at < input.end() { + sid = dfa + .next_state(cache, sid, input.haystack()[state.at]) + .map_err(|_| gave_up(state.at))?; + if sid.is_tagged() { + state.id = Some(sid); + if sid.is_start() { + if let Some(ref pre) = pre { + let span = Span::from(state.at..input.end()); + match pre.find(input.haystack(), span) { + None => return Ok(()), + Some(ref span) => { + if span.start > state.at { + state.at = span.start; + if !universal_start { + sid = prefilter_restart( + dfa, cache, &input, state.at, + )?; + } + continue; + } + } + } + } + } else if sid.is_match() { + state.next_match_index = Some(1); + let pattern = dfa.match_pattern(cache, sid, 0); + state.mat = Some(HalfMatch::new(pattern, state.at)); + cache.search_finish(state.at); + return Ok(()); + } else if sid.is_dead() { + cache.search_finish(state.at); + return Ok(()); + } else if sid.is_quit() { + cache.search_finish(state.at); + return Err(MatchError::quit( + input.haystack()[state.at], + state.at, + )); + } else { + debug_assert!(sid.is_unknown()); + unreachable!("sid being unknown is a bug"); + } + } + state.at += 1; + cache.search_update(state.at); + } + + let result = eoi_fwd(dfa, cache, input, &mut sid, &mut state.mat); + state.id = Some(sid); + if state.mat.is_some() { + // '1' is always correct here since if we get to this point, this + // always corresponds to the first (index '0') match discovered at + // this position. So the next match to report at this position (if + // it exists) is at index '1'. + state.next_match_index = Some(1); + } + cache.search_finish(input.end()); + result +} + +#[inline(never)] +pub(crate) fn find_overlapping_rev( + dfa: &DFA, + cache: &mut Cache, + input: &Input<'_>, + state: &mut OverlappingState, +) -> Result<(), MatchError> { + state.mat = None; + if input.is_done() { + return Ok(()); + } + let mut sid = match state.id { + None => { + let sid = init_rev(dfa, cache, input)?; + state.id = Some(sid); + if input.start() == input.end() { + state.rev_eoi = true; + } else { + state.at = input.end() - 1; + } + sid + } + Some(sid) => { + if let Some(match_index) = state.next_match_index { + let match_len = dfa.match_len(cache, sid); + if match_index < match_len { + state.next_match_index = Some(match_index + 1); + let pattern = dfa.match_pattern(cache, sid, match_index); + state.mat = Some(HalfMatch::new(pattern, state.at)); + return Ok(()); + } + } + // Once we've reported all matches at a given position, we need + // to advance the search to the next position. However, if we've + // already followed the EOI transition, then we know we're done + // with the search and there cannot be any more matches to report. + if state.rev_eoi { + return Ok(()); + } else if state.at == input.start() { + // At this point, we should follow the EOI transition. This + // will cause us the skip the main loop below and fall through + // to the final 'eoi_rev' transition. + state.rev_eoi = true; + } else { + // We haven't hit the end of the search yet, so move on. + state.at -= 1; + } + sid + } + }; + cache.search_start(state.at); + while !state.rev_eoi { + sid = dfa + .next_state(cache, sid, input.haystack()[state.at]) + .map_err(|_| gave_up(state.at))?; + if sid.is_tagged() { + state.id = Some(sid); + if sid.is_start() { + // do nothing + } else if sid.is_match() { + state.next_match_index = Some(1); + let pattern = dfa.match_pattern(cache, sid, 0); + state.mat = Some(HalfMatch::new(pattern, state.at + 1)); + cache.search_finish(state.at); + return Ok(()); + } else if sid.is_dead() { + cache.search_finish(state.at); + return Ok(()); + } else if sid.is_quit() { + cache.search_finish(state.at); + return Err(MatchError::quit( + input.haystack()[state.at], + state.at, + )); + } else { + debug_assert!(sid.is_unknown()); + unreachable!("sid being unknown is a bug"); + } + } + if state.at == input.start() { + break; + } + state.at -= 1; + cache.search_update(state.at); + } + + let result = eoi_rev(dfa, cache, input, &mut sid, &mut state.mat); + state.rev_eoi = true; + state.id = Some(sid); + if state.mat.is_some() { + // '1' is always correct here since if we get to this point, this + // always corresponds to the first (index '0') match discovered at + // this position. So the next match to report at this position (if + // it exists) is at index '1'. + state.next_match_index = Some(1); + } + cache.search_finish(input.start()); + result +} + +#[cfg_attr(feature = "perf-inline", inline(always))] +fn init_fwd( + dfa: &DFA, + cache: &mut Cache, + input: &Input<'_>, +) -> Result<LazyStateID, MatchError> { + let sid = dfa.start_state_forward(cache, input)?; + // Start states can never be match states, since all matches are delayed + // by 1 byte. + debug_assert!(!sid.is_match()); + Ok(sid) +} + +#[cfg_attr(feature = "perf-inline", inline(always))] +fn init_rev( + dfa: &DFA, + cache: &mut Cache, + input: &Input<'_>, +) -> Result<LazyStateID, MatchError> { + let sid = dfa.start_state_reverse(cache, input)?; + // Start states can never be match states, since all matches are delayed + // by 1 byte. + debug_assert!(!sid.is_match()); + Ok(sid) +} + +#[cfg_attr(feature = "perf-inline", inline(always))] +fn eoi_fwd( + dfa: &DFA, + cache: &mut Cache, + input: &Input<'_>, + sid: &mut LazyStateID, + mat: &mut Option<HalfMatch>, +) -> Result<(), MatchError> { + let sp = input.get_span(); + match input.haystack().get(sp.end) { + Some(&b) => { + *sid = + dfa.next_state(cache, *sid, b).map_err(|_| gave_up(sp.end))?; + if sid.is_match() { + let pattern = dfa.match_pattern(cache, *sid, 0); + *mat = Some(HalfMatch::new(pattern, sp.end)); + } else if sid.is_quit() { + return Err(MatchError::quit(b, sp.end)); + } + } + None => { + *sid = dfa + .next_eoi_state(cache, *sid) + .map_err(|_| gave_up(input.haystack().len()))?; + if sid.is_match() { + let pattern = dfa.match_pattern(cache, *sid, 0); + *mat = Some(HalfMatch::new(pattern, input.haystack().len())); + } + // N.B. We don't have to check 'is_quit' here because the EOI + // transition can never lead to a quit state. + debug_assert!(!sid.is_quit()); + } + } + Ok(()) +} + +#[cfg_attr(feature = "perf-inline", inline(always))] +fn eoi_rev( + dfa: &DFA, + cache: &mut Cache, + input: &Input<'_>, + sid: &mut LazyStateID, + mat: &mut Option<HalfMatch>, +) -> Result<(), MatchError> { + let sp = input.get_span(); + if sp.start > 0 { + let byte = input.haystack()[sp.start - 1]; + *sid = dfa + .next_state(cache, *sid, byte) + .map_err(|_| gave_up(sp.start))?; + if sid.is_match() { + let pattern = dfa.match_pattern(cache, *sid, 0); + *mat = Some(HalfMatch::new(pattern, sp.start)); + } else if sid.is_quit() { + return Err(MatchError::quit(byte, sp.start - 1)); + } + } else { + *sid = + dfa.next_eoi_state(cache, *sid).map_err(|_| gave_up(sp.start))?; + if sid.is_match() { + let pattern = dfa.match_pattern(cache, *sid, 0); + *mat = Some(HalfMatch::new(pattern, 0)); + } + // N.B. We don't have to check 'is_quit' here because the EOI + // transition can never lead to a quit state. + debug_assert!(!sid.is_quit()); + } + Ok(()) +} + +/// Re-compute the starting state that a DFA should be in after finding a +/// prefilter candidate match at the position `at`. +/// +/// It is always correct to call this, but not always necessary. Namely, +/// whenever the DFA has a universal start state, the DFA can remain in the +/// start state that it was in when it ran the prefilter. Why? Because in that +/// case, there is only one start state. +/// +/// When does a DFA have a universal start state? In precisely cases where +/// it has no look-around assertions in its prefix. So for example, `\bfoo` +/// does not have a universal start state because the start state depends on +/// whether the byte immediately before the start position is a word byte or +/// not. However, `foo\b` does have a universal start state because the word +/// boundary does not appear in the pattern's prefix. +/// +/// So... most cases don't need this, but when a pattern doesn't have a +/// universal start state, then after a prefilter candidate has been found, the +/// current state *must* be re-litigated as if computing the start state at the +/// beginning of the search because it might change. That is, not all start +/// states are created equal. +/// +/// Why avoid it? Because while it's not super expensive, it isn't a trivial +/// operation to compute the start state. It is much better to avoid it and +/// just state in the current state if you know it to be correct. +#[cfg_attr(feature = "perf-inline", inline(always))] +fn prefilter_restart( + dfa: &DFA, + cache: &mut Cache, + input: &Input<'_>, + at: usize, +) -> Result<LazyStateID, MatchError> { + let mut input = input.clone(); + input.set_start(at); + init_fwd(dfa, cache, &input) +} + +/// A convenience routine for constructing a "gave up" match error. +#[cfg_attr(feature = "perf-inline", inline(always))] +fn gave_up(offset: usize) -> MatchError { + MatchError::gave_up(offset) +} diff --git a/third_party/rust/regex-automata/src/lib.rs b/third_party/rust/regex-automata/src/lib.rs new file mode 100644 index 0000000000..62260a5aea --- /dev/null +++ b/third_party/rust/regex-automata/src/lib.rs @@ -0,0 +1,648 @@ +/*! +This crate exposes a variety of regex engines used by the `regex` crate. +It provides a vast, sprawling and "expert" level API to each regex engine. +The regex engines provided by this crate focus heavily on finite automata +implementations and specifically guarantee worst case `O(m * n)` time +complexity for all searches. (Where `m ~ len(regex)` and `n ~ len(haystack)`.) + +The primary goal of this crate is to serve as an implementation detail for the +`regex` crate. A secondary goal is to make its internals available for use by +others. + +# Table of contents + +* [Should I be using this crate?](#should-i-be-using-this-crate) gives some +reasons for and against using this crate. +* [Examples](#examples) provides a small selection of things you can do with +this crate. +* [Available regex engines](#available-regex-engines) provides a hyperlinked +list of all regex engines in this crate. +* [API themes](#api-themes) discusses common elements used throughout this +crate. +* [Crate features](#crate-features) documents the extensive list of Cargo +features available. + +# Should I be using this crate? + +If you find yourself here because you just want to use regexes, then you should +first check out whether the [`regex` crate](https://docs.rs/regex) meets +your needs. It provides a streamlined and difficult-to-misuse API for regex +searching. + +If you're here because there is something specific you want to do that can't +be easily done with `regex` crate, then you are perhaps in the right place. +It's most likely that the first stop you'll want to make is to explore the +[`meta` regex APIs](meta). Namely, the `regex` crate is just a light wrapper +over a [`meta::Regex`], so its API will probably be the easiest to transition +to. In contrast to the `regex` crate, the `meta::Regex` API supports more +search parameters and does multi-pattern searches. However, it isn't quite as +ergonomic. + +Otherwise, the following is an inexhaustive list of reasons to use this crate: + +* You want to analyze or use a [Thompson `NFA`](nfa::thompson::NFA) directly. +* You want more powerful multi-pattern search than what is provided by +`RegexSet` in the `regex` crate. All regex engines in this crate support +multi-pattern searches. +* You want to use one of the `regex` crate's internal engines directly because +of some interesting configuration that isn't possible via the `regex` crate. +For example, a [lazy DFA's configuration](hybrid::dfa::Config) exposes a +dizzying number of options for controlling its execution. +* You want to use the lower level search APIs. For example, both the [lazy +DFA](hybrid::dfa) and [fully compiled DFAs](dfa) support searching by exploring +the automaton one state at a time. This might be useful, for example, for +stream searches or searches of strings stored in non-contiguous in memory. +* You want to build a fully compiled DFA and then [use zero-copy +deserialization](dfa::dense::DFA::from_bytes) to load it into memory and use +it for searching. This use case is supported in core-only no-std/no-alloc +environments. +* You want to run [anchored searches](Input::anchored) without using the `^` +anchor in your regex pattern. +* You need to work-around contention issues with +sharing a regex across multiple threads. The +[`meta::Regex::search_with`](meta::Regex::search_with) API permits bypassing +any kind of synchronization at all by requiring the caller to provide the +mutable scratch spaced needed during a search. +* You want to build your own regex engine on top of the `regex` crate's +infrastructure. + +# Examples + +This section tries to identify a few interesting things you can do with this +crate and demonstrates them. + +### Multi-pattern searches with capture groups + +One of the more frustrating limitations of `RegexSet` in the `regex` crate +(at the time of writing) is that it doesn't report match positions. With this +crate, multi-pattern support was intentionally designed in from the beginning, +which means it works in all regex engines and even for capture groups as well. + +This example shows how to search for matches of multiple regexes, where each +regex uses the same capture group names to parse different key-value formats. + +``` +use regex_automata::{meta::Regex, PatternID}; + +let re = Regex::new_many(&[ + r#"(?m)^(?<key>[[:word:]]+)=(?<val>[[:word:]]+)$"#, + r#"(?m)^(?<key>[[:word:]]+)="(?<val>[^"]+)"$"#, + r#"(?m)^(?<key>[[:word:]]+)='(?<val>[^']+)'$"#, + r#"(?m)^(?<key>[[:word:]]+):\s*(?<val>[[:word:]]+)$"#, +])?; +let hay = r#" +best_album="Blow Your Face Out" +best_quote='"then as it was, then again it will be"' +best_year=1973 +best_simpsons_episode: HOMR +"#; +let mut kvs = vec![]; +for caps in re.captures_iter(hay) { + // N.B. One could use capture indices '1' and '2' here + // as well. Capture indices are local to each pattern. + // (Just like names are.) + let key = &hay[caps.get_group_by_name("key").unwrap()]; + let val = &hay[caps.get_group_by_name("val").unwrap()]; + kvs.push((key, val)); +} +assert_eq!(kvs, vec![ + ("best_album", "Blow Your Face Out"), + ("best_quote", "\"then as it was, then again it will be\""), + ("best_year", "1973"), + ("best_simpsons_episode", "HOMR"), +]); + +# Ok::<(), Box<dyn std::error::Error>>(()) +``` + +### Build a full DFA and walk it manually + +One of the regex engines in this crate is a fully compiled DFA. It takes worst +case exponential time to build, but once built, it can be easily explored and +used for searches. Here's a simple example that uses its lower level APIs to +implement a simple anchored search by hand. + +``` +use regex_automata::{dfa::{Automaton, dense}, Input}; + +let dfa = dense::DFA::new(r"(?-u)\b[A-Z]\w+z\b")?; +let haystack = "Quartz"; + +// The start state is determined by inspecting the position and the +// initial bytes of the haystack. +let mut state = dfa.start_state_forward(&Input::new(haystack))?; +// Walk all the bytes in the haystack. +for &b in haystack.as_bytes().iter() { + state = dfa.next_state(state, b); +} +// DFAs in this crate require an explicit +// end-of-input transition if a search reaches +// the end of a haystack. +state = dfa.next_eoi_state(state); +assert!(dfa.is_match_state(state)); + +# Ok::<(), Box<dyn std::error::Error>>(()) +``` + +Or do the same with a lazy DFA that avoids exponential worst case compile time, +but requires mutable scratch space to lazily build the DFA during the search. + +``` +use regex_automata::{hybrid::dfa::DFA, Input}; + +let dfa = DFA::new(r"(?-u)\b[A-Z]\w+z\b")?; +let mut cache = dfa.create_cache(); +let hay = "Quartz"; + +// The start state is determined by inspecting the position and the +// initial bytes of the haystack. +let mut state = dfa.start_state_forward(&mut cache, &Input::new(hay))?; +// Walk all the bytes in the haystack. +for &b in hay.as_bytes().iter() { + state = dfa.next_state(&mut cache, state, b)?; +} +// DFAs in this crate require an explicit +// end-of-input transition if a search reaches +// the end of a haystack. +state = dfa.next_eoi_state(&mut cache, state)?; +assert!(state.is_match()); + +# Ok::<(), Box<dyn std::error::Error>>(()) +``` + +### Find all overlapping matches + +This example shows how to build a DFA and use it to find all possible matches, +including overlapping matches. A similar example will work with a lazy DFA as +well. This also works with multiple patterns and will report all matches at the +same position where multiple patterns match. + +``` +use regex_automata::{ + dfa::{dense, Automaton, OverlappingState}, + Input, MatchKind, +}; + +let dfa = dense::DFA::builder() + .configure(dense::DFA::config().match_kind(MatchKind::All)) + .build(r"(?-u)\w{3,}")?; +let input = Input::new("homer marge bart lisa maggie"); +let mut state = OverlappingState::start(); + +let mut matches = vec![]; +while let Some(hm) = { + dfa.try_search_overlapping_fwd(&input, &mut state)?; + state.get_match() +} { + matches.push(hm.offset()); +} +assert_eq!(matches, vec![ + 3, 4, 5, // hom, home, homer + 9, 10, 11, // mar, marg, marge + 15, 16, // bar, bart + 20, 21, // lis, lisa + 25, 26, 27, 28, // mag, magg, maggi, maggie +]); + +# Ok::<(), Box<dyn std::error::Error>>(()) +``` + +# Available regex engines + +The following is a complete list of all regex engines provided by this crate, +along with a very brief description of it and why you might want to use it. + +* [`dfa::regex::Regex`] is a regex engine that works on top of either +[dense](dfa::dense) or [sparse](dfa::sparse) fully compiled DFAs. You might +use a DFA if you need the fastest possible regex engine in this crate and can +afford the exorbitant memory usage usually required by DFAs. Low level APIs on +fully compiled DFAs are provided by the [`Automaton` trait](dfa::Automaton). +Fully compiled dense DFAs can handle all regexes except for searching a regex +with a Unicode word boundary on non-ASCII haystacks. A fully compiled DFA based +regex can only report the start and end of each match. +* [`hybrid::regex::Regex`] is a regex engine that works on top of a lazily +built DFA. Its performance profile is very similar to that of fully compiled +DFAs, but can be slower in some pathological cases. Fully compiled DFAs are +also amenable to more optimizations, such as state acceleration, that aren't +available in a lazy DFA. You might use this lazy DFA if you can't abide the +worst case exponential compile time of a full DFA, but still want the DFA +search performance in the vast majority of cases. A lazy DFA based regex can +only report the start and end of each match. +* [`dfa::onepass::DFA`] is a regex engine that is implemented as a DFA, but +can report the matches of each capture group in addition to the start and end +of each match. The catch is that it only works on a somewhat small subset of +regexes known as "one-pass." You'll want to use this for cases when you need +capture group matches and the regex is one-pass since it is likely to be faster +than any alternative. A one-pass DFA can handle all types of regexes, but does +have some reasonable limits on the number of capture groups it can handle. +* [`nfa::thompson::backtrack::BoundedBacktracker`] is a regex engine that uses +backtracking, but keeps track of the work it has done to avoid catastrophic +backtracking. Like the one-pass DFA, it provides the matches of each capture +group. It retains the `O(m * n)` worst case time bound. This tends to be slower +than the one-pass DFA regex engine, but faster than the PikeVM. It can handle +all types of regexes, but usually only works well with small haystacks and +small regexes due to the memory required to avoid redoing work. +* [`nfa::thompson::pikevm::PikeVM`] is a regex engine that can handle all +regexes, of all sizes and provides capture group matches. It tends to be a tool +of last resort because it is also usually the slowest regex engine. +* [`meta::Regex`] is the meta regex engine that combines *all* of the above +engines into one. The reason for this is that each of the engines above have +their own caveats such as, "only handles a subset of regexes" or "is generally +slow." The meta regex engine accounts for all of these caveats and composes +the engines in a way that attempts to mitigate each engine's weaknesses while +emphasizing its strengths. For example, it will attempt to run a lazy DFA even +if it might fail. In which case, it will restart the search with a likely +slower but more capable regex engine. The meta regex engine is what you should +default to. Use one of the above engines directly only if you have a specific +reason to. + +# API themes + +While each regex engine has its own APIs and configuration options, there are +some general themes followed by all of them. + +### The `Input` abstraction + +Most search routines in this crate accept anything that implements +`Into<Input>`. Both `&str` and `&[u8]` haystacks satisfy this constraint, which +means that things like `engine.search("foo")` will work as you would expect. + +By virtue of accepting an `Into<Input>` though, callers can provide more than +just a haystack. Indeed, the [`Input`] type has more details, but briefly, +callers can use it to configure various aspects of the search: + +* The span of the haystack to search via [`Input::span`] or [`Input::range`], +which might be a substring of the haystack. +* Whether to run an anchored search or not via [`Input::anchored`]. This +permits one to require matches to start at the same offset that the search +started. +* Whether to ask the regex engine to stop as soon as a match is seen via +[`Input::earliest`]. This can be used to find the offset of a match as soon +as it is known without waiting for the full leftmost-first match to be found. +This can also be used to avoid the worst case `O(m * n^2)` time complexity +of iteration. + +Some lower level search routines accept an `&Input` for performance reasons. +In which case, `&Input::new("haystack")` can be used for a simple search. + +### Error reporting + +Most, but not all, regex engines in this crate can fail to execute a search. +When a search fails, callers cannot determine whether or not a match exists. +That is, the result is indeterminate. + +Search failure, in all cases in this crate, is represented by a [`MatchError`]. +Routines that can fail start with the `try_` prefix in their name. For example, +[`hybrid::regex::Regex::try_search`] can fail for a number of reasons. +Conversely, routines that either can't fail or can panic on failure lack the +`try_` prefix. For example, [`hybrid::regex::Regex::find`] will panic in +cases where [`hybrid::regex::Regex::try_search`] would return an error, and +[`meta::Regex::find`] will never panic. Therefore, callers need to pay close +attention to the panicking conditions in the documentation. + +In most cases, the reasons that a search fails are either predictable or +configurable, albeit at some additional cost. + +An example of predictable failure is +[`BoundedBacktracker::try_search`](nfa::thompson::backtrack::BoundedBacktracker::try_search). +Namely, it fails whenever the multiplication of the haystack, the regex and some +constant exceeds the +[configured visited capacity](nfa::thompson::backtrack::Config::visited_capacity). +Callers can predict the failure in terms of haystack length via the +[`BoundedBacktracker::max_haystack_len`](nfa::thompson::backtrack::BoundedBacktracker::max_haystack_len) +method. While this form of failure is technically avoidable by increasing the +visited capacity, it isn't practical to do so for all inputs because the +memory usage required for larger haystacks becomes impractically large. So in +practice, if one is using the bounded backtracker, you really do have to deal +with the failure. + +An example of configurable failure happens when one enables heuristic support +for Unicode word boundaries in a DFA. Namely, since the DFAs in this crate +(except for the one-pass DFA) do not support Unicode word boundaries on +non-ASCII haystacks, building a DFA from an NFA that contains a Unicode word +boundary will itself fail. However, one can configure DFAs to still be built in +this case by +[configuring heuristic support for Unicode word boundaries](hybrid::dfa::Config::unicode_word_boundary). +If the NFA the DFA is built from contains a Unicode word boundary, then the +DFA will still be built, but special transitions will be added to every state +that cause the DFA to fail if any non-ASCII byte is seen. This failure happens +at search time and it requires the caller to opt into this. + +There are other ways for regex engines to fail in this crate, but the above +two should represent the general theme of failures one can find. Dealing +with these failures is, in part, one the responsibilities of the [meta regex +engine](meta). Notice, for example, that the meta regex engine exposes an API +that never returns an error nor panics. It carefully manages all of the ways +in which the regex engines can fail and either avoids the predictable ones +entirely (e.g., the bounded backtracker) or reacts to configured failures by +falling back to a different engine (e.g., the lazy DFA quitting because it saw +a non-ASCII byte). + +### Configuration and Builders + +Most of the regex engines in this crate come with two types to facilitate +building the regex engine: a `Config` and a `Builder`. A `Config` is usually +specific to that particular regex engine, but other objects such as parsing and +NFA compilation have `Config` types too. A `Builder` is the thing responsible +for taking inputs (either pattern strings or already-parsed patterns or even +NFAs directly) and turning them into an actual regex engine that can be used +for searching. + +The main reason why building a regex engine is a bit complicated is because +of the desire to permit composition with de-coupled components. For example, +you might want to [manually construct a Thompson NFA](nfa::thompson::Builder) +and then build a regex engine from it without ever using a regex parser +at all. On the other hand, you might also want to build a regex engine directly +from the concrete syntax. This demonstrates why regex engine construction is +so flexible: it needs to support not just convenient construction, but also +construction from parts built elsewhere. + +This is also in turn why there are many different `Config` structs in this +crate. Let's look more closely at an example: [`hybrid::regex::Builder`]. It +accepts three different `Config` types for configuring construction of a lazy +DFA regex: + +* [`hybrid::regex::Builder::syntax`] accepts a +[`util::syntax::Config`] for configuring the options found in the +[`regex-syntax`](regex_syntax) crate. For example, whether to match +case insensitively. +* [`hybrid::regex::Builder::thompson`] accepts a [`nfa::thompson::Config`] for +configuring construction of a [Thompson NFA](nfa::thompson::NFA). For example, +whether to build an NFA that matches the reverse language described by the +regex. +* [`hybrid::regex::Builder::dfa`] accept a [`hybrid::dfa::Config`] for +configuring construction of the pair of underlying lazy DFAs that make up the +lazy DFA regex engine. For example, changing the capacity of the cache used to +store the transition table. + +The lazy DFA regex engine uses all three of those configuration objects for +methods like [`hybrid::regex::Builder::build`], which accepts a pattern +string containing the concrete syntax of your regex. It uses the syntax +configuration to parse it into an AST and translate it into an HIR. Then the +NFA configuration when compiling the HIR into an NFA. And then finally the DFA +configuration when lazily determinizing the NFA into a DFA. + +Notice though that the builder also has a +[`hybrid::regex::Builder::build_from_dfas`] constructor. This permits callers +to build the underlying pair of lazy DFAs themselves (one for the forward +searching to find the end of a match and one for the reverse searching to find +the start of a match), and then build the regex engine from them. The lazy +DFAs, in turn, have their own builder that permits [construction directly from +a Thompson NFA](hybrid::dfa::Builder::build_from_nfa). Continuing down the +rabbit hole, a Thompson NFA has its own compiler that permits [construction +directly from an HIR](nfa::thompson::Compiler::build_from_hir). The lazy DFA +regex engine builder lets you follow this rabbit hole all the way down, but +also provides convenience routines that do it for you when you don't need +precise control over every component. + +The [meta regex engine](meta) is a good example of something that utilizes the +full flexibility of these builders. It often needs not only precise control +over each component, but also shares them across multiple regex engines. +(Most sharing is done by internal reference accounting. For example, an +[`NFA`](nfa::thompson::NFA) is reference counted internally which makes cloning +cheap.) + +### Size limits + +Unlike the `regex` crate, the `regex-automata` crate specifically does not +enable any size limits by default. That means users of this crate need to +be quite careful when using untrusted patterns. Namely, because bounded +repetitions can grow exponentially by stacking them, it is possible to build a +very large internal regex object from just a small pattern string. For example, +the NFA built from the pattern `a{10}{10}{10}{10}{10}{10}{10}` is over 240MB. + +There are multiple size limit options in this crate. If one or more size limits +are relevant for the object you're building, they will be configurable via +methods on a corresponding `Config` type. + +# Crate features + +This crate has a dizzying number of features. The main idea is to be able to +control how much stuff you pull in for your specific use case, since the full +crate is quite large and can dramatically increase compile times and binary +size. + +The most barebones but useful configuration is to disable all default features +and enable only `dfa-search`. This will bring in just the DFA deserialization +and search routines without any dependency on `std` or `alloc`. This does +require generating and serializing a DFA, and then storing it somewhere, but +it permits regex searches in freestanding or embedded environments. + +Because there are so many features, they are split into a few groups. + +The default set of features is: `std`, `syntax`, `perf`, `unicode`, `meta`, +`nfa`, `dfa` and `hybrid`. Basically, the default is to enable everything +except for development related features like `logging`. + +### Ecosystem features + +* **std** - Enables use of the standard library. In terms of APIs, this usually +just means that error types implement the `std::error::Error` trait. Otherwise, +`std` sometimes enables the code to be faster, for example, using a `HashMap` +instead of a `BTreeMap`. (The `std` feature matters more for dependencies like +`aho-corasick` and `memchr`, where `std` is required to enable certain classes +of SIMD optimizations.) Enabling `std` automatically enables `alloc`. +* **alloc** - Enables use of the `alloc` library. This is required for most +APIs in this crate. The main exception is deserializing and searching with +fully compiled DFAs. +* **logging** - Adds a dependency on the `log` crate and makes this crate emit +log messages of varying degrees of utility. The log messages are especially +useful in trying to understand what the meta regex engine is doing. + +### Performance features + +* **perf** - Enables all of the below features. +* **perf-inline** - When enabled, `inline(always)` is used in (many) strategic +locations to help performance at the expense of longer compile times and +increased binary size. +* **perf-literal** - Enables all literal related optimizations. + * **perf-literal-substring** - Enables all single substring literal + optimizations. This includes adding a dependency on the `memchr` crate. + * **perf-literal-multisubstring** - Enables all multiple substring literal + optimizations. This includes adding a dependency on the `aho-corasick` + crate. + +### Unicode features + +* **unicode** - + Enables all Unicode features. This feature is enabled by default, and will + always cover all Unicode features, even if more are added in the future. +* **unicode-age** - + Provide the data for the + [Unicode `Age` property](https://www.unicode.org/reports/tr44/tr44-24.html#Character_Age). + This makes it possible to use classes like `\p{Age:6.0}` to refer to all + codepoints first introduced in Unicode 6.0 +* **unicode-bool** - + Provide the data for numerous Unicode boolean properties. The full list + is not included here, but contains properties like `Alphabetic`, `Emoji`, + `Lowercase`, `Math`, `Uppercase` and `White_Space`. +* **unicode-case** - + Provide the data for case insensitive matching using + [Unicode's "simple loose matches" specification](https://www.unicode.org/reports/tr18/#Simple_Loose_Matches). +* **unicode-gencat** - + Provide the data for + [Unicode general categories](https://www.unicode.org/reports/tr44/tr44-24.html#General_Category_Values). + This includes, but is not limited to, `Decimal_Number`, `Letter`, + `Math_Symbol`, `Number` and `Punctuation`. +* **unicode-perl** - + Provide the data for supporting the Unicode-aware Perl character classes, + corresponding to `\w`, `\s` and `\d`. This is also necessary for using + Unicode-aware word boundary assertions. Note that if this feature is + disabled, the `\s` and `\d` character classes are still available if the + `unicode-bool` and `unicode-gencat` features are enabled, respectively. +* **unicode-script** - + Provide the data for + [Unicode scripts and script extensions](https://www.unicode.org/reports/tr24/). + This includes, but is not limited to, `Arabic`, `Cyrillic`, `Hebrew`, + `Latin` and `Thai`. +* **unicode-segment** - + Provide the data necessary to provide the properties used to implement the + [Unicode text segmentation algorithms](https://www.unicode.org/reports/tr29/). + This enables using classes like `\p{gcb=Extend}`, `\p{wb=Katakana}` and + `\p{sb=ATerm}`. +* **unicode-word-boundary** - + Enables support for Unicode word boundaries, i.e., `\b`, in regexes. When + this and `unicode-perl` are enabled, then data tables from `regex-syntax` are + used to implement Unicode word boundaries. However, if `regex-syntax` isn't + enabled as a dependency then one can still enable this feature. It will + cause `regex-automata` to bundle its own data table that would otherwise be + redundant with `regex-syntax`'s table. + +### Regex engine features + +* **syntax** - Enables a dependency on `regex-syntax`. This makes APIs +for building regex engines from pattern strings available. Without the +`regex-syntax` dependency, the only way to build a regex engine is generally +to deserialize a previously built DFA or to hand assemble an NFA using its +[builder API](nfa::thompson::Builder). Once you have an NFA, you can build any +of the regex engines in this crate. The `syntax` feature also enables `alloc`. +* **meta** - Enables the meta regex engine. This also enables the `syntax` and +`nfa-pikevm` features, as both are the minimal requirements needed. The meta +regex engine benefits from enabling any of the other regex engines and will +use them automatically when appropriate. +* **nfa** - Enables all NFA related features below. + * **nfa-thompson** - Enables the Thompson NFA APIs. This enables `alloc`. + * **nfa-pikevm** - Enables the PikeVM regex engine. This enables + `nfa-thompson`. + * **nfa-backtrack** - Enables the bounded backtracker regex engine. This + enables `nfa-thompson`. +* **dfa** - Enables all DFA related features below. + * **dfa-build** - Enables APIs for determinizing DFAs from NFAs. This + enables `nfa-thompson` and `dfa-search`. + * **dfa-search** - Enables APIs for searching with DFAs. + * **dfa-onepass** - Enables the one-pass DFA API. This enables + `nfa-thompson`. +* **hybrid** - Enables the hybrid NFA/DFA or "lazy DFA" regex engine. This +enables `alloc` and `nfa-thompson`. + +*/ + +// We are no_std. +#![no_std] +// All APIs need docs! +#![deny(missing_docs)] +// Some intra-doc links are broken when certain features are disabled, so we +// only bleat about it when most (all?) features are enabled. But when we do, +// we block the build. Links need to work. +#![cfg_attr( + all( + feature = "std", + feature = "nfa", + feature = "dfa", + feature = "hybrid" + ), + deny(rustdoc::broken_intra_doc_links) +)] +// Broken rustdoc links are very easy to come by when you start disabling +// features. Namely, features tend to change imports, and imports change what's +// available to link to. +// +// Basically, we just don't support rustdoc for anything other than the maximal +// feature configuration. Other configurations will work, they just won't be +// perfect. +// +// So here, we specifically allow them so we don't even get warned about them. +#![cfg_attr( + not(all( + feature = "std", + feature = "nfa", + feature = "dfa", + feature = "hybrid" + )), + allow(rustdoc::broken_intra_doc_links) +)] +// Kinda similar, but eliminating all of the dead code and unused import +// warnings for every feature combo is a fool's errand. Instead, we just +// suppress those, but still let them through in a common configuration when we +// build most of everything. +// +// This does actually suggest that when features are disabled, we are actually +// compiling more code than we need to be. And this is perhaps not so great +// because disabling features is usually done in order to reduce compile times +// by reducing the amount of code one compiles... However, usually, most of the +// time this dead code is a relatively small amount from the 'util' module. +// But... I confess... There isn't a ton of visibility on this. +// +// I'm happy to try to address this in a different way, but "let's annotate +// every function in 'util' with some non-local combination of features" just +// cannot be the way forward. +#![cfg_attr( + not(all( + feature = "std", + feature = "nfa", + feature = "dfa", + feature = "hybrid", + feature = "perf-literal-substring", + feature = "perf-literal-multisubstring", + )), + allow(dead_code, unused_imports, unused_variables) +)] +// We generally want all types to impl Debug. +#![warn(missing_debug_implementations)] +// No clue why this thing is still unstable because it's pretty amazing. This +// adds Cargo feature annotations to items in the rustdoc output. Which is +// sadly hugely beneficial for this crate due to the number of features. +#![cfg_attr(docsrs, feature(doc_auto_cfg))] + +// I have literally never tested this crate on 16-bit, so it is quite +// suspicious to advertise support for it. But... the regex crate, at time +// of writing, at least claims to support it by not doing any conditional +// compilation based on the target pointer width. So I guess I remain +// consistent with that here. +// +// If you are here because you're on a 16-bit system and you were somehow using +// the regex crate previously, please file an issue. Please be prepared to +// provide some kind of reproduction or carve out some path to getting 16-bit +// working in CI. (Via qemu?) +#[cfg(not(any( + target_pointer_width = "16", + target_pointer_width = "32", + target_pointer_width = "64" +)))] +compile_error!("not supported on non-{16,32,64}, please file an issue"); + +#[cfg(any(test, feature = "std"))] +extern crate std; + +#[cfg(feature = "alloc")] +extern crate alloc; + +#[cfg(doctest)] +doc_comment::doctest!("../README.md"); + +#[doc(inline)] +pub use crate::util::primitives::PatternID; +pub use crate::util::search::*; + +#[macro_use] +mod macros; + +#[cfg(any(feature = "dfa-search", feature = "dfa-onepass"))] +pub mod dfa; +#[cfg(feature = "hybrid")] +pub mod hybrid; +#[cfg(feature = "meta")] +pub mod meta; +#[cfg(feature = "nfa-thompson")] +pub mod nfa; +pub mod util; diff --git a/third_party/rust/regex-automata/src/macros.rs b/third_party/rust/regex-automata/src/macros.rs new file mode 100644 index 0000000000..31b4ca3816 --- /dev/null +++ b/third_party/rust/regex-automata/src/macros.rs @@ -0,0 +1,20 @@ +// Some feature combinations result in some of these macros never being used. +// Which is fine. Just squash the warnings. +#![allow(unused_macros)] + +macro_rules! log { + ($($tt:tt)*) => { + #[cfg(feature = "logging")] + { + $($tt)* + } + } +} + +macro_rules! debug { + ($($tt:tt)*) => { log!(log::debug!($($tt)*)) } +} + +macro_rules! trace { + ($($tt:tt)*) => { log!(log::trace!($($tt)*)) } +} diff --git a/third_party/rust/regex-automata/src/meta/error.rs b/third_party/rust/regex-automata/src/meta/error.rs new file mode 100644 index 0000000000..ea9a3160e0 --- /dev/null +++ b/third_party/rust/regex-automata/src/meta/error.rs @@ -0,0 +1,241 @@ +use regex_syntax::{ast, hir}; + +use crate::{nfa, util::search::MatchError, PatternID}; + +/// An error that occurs when construction of a `Regex` fails. +/// +/// A build error is generally a result of one of two possible failure +/// modes. First is a parse or syntax error in the concrete syntax of a +/// pattern. Second is that the construction of the underlying regex matcher +/// fails, usually because it gets too big with respect to limits like +/// [`Config::nfa_size_limit`](crate::meta::Config::nfa_size_limit). +/// +/// This error provides very little introspection capabilities. You can: +/// +/// * Ask for the [`PatternID`] of the pattern that caused an error, if one +/// is available. This is available for things like syntax errors, but not for +/// cases where build limits are exceeded. +/// * Ask for the underlying syntax error, but only if the error is a syntax +/// error. +/// * Ask for a human readable message corresponding to the underlying error. +/// * The `BuildError::source` method (from the `std::error::Error` +/// trait implementation) may be used to query for an underlying error if one +/// exists. There are no API guarantees about which error is returned. +/// +/// When the `std` feature is enabled, this implements `std::error::Error`. +#[derive(Clone, Debug)] +pub struct BuildError { + kind: BuildErrorKind, +} + +#[derive(Clone, Debug)] +enum BuildErrorKind { + Syntax { pid: PatternID, err: regex_syntax::Error }, + NFA(nfa::thompson::BuildError), +} + +impl BuildError { + /// If it is known which pattern ID caused this build error to occur, then + /// this method returns it. + /// + /// Some errors are not associated with a particular pattern. However, any + /// errors that occur as part of parsing a pattern are guaranteed to be + /// associated with a pattern ID. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{meta::Regex, PatternID}; + /// + /// let err = Regex::new_many(&["a", "b", r"\p{Foo}", "c"]).unwrap_err(); + /// assert_eq!(Some(PatternID::must(2)), err.pattern()); + /// ``` + pub fn pattern(&self) -> Option<PatternID> { + match self.kind { + BuildErrorKind::Syntax { pid, .. } => Some(pid), + _ => None, + } + } + + /// If this error occurred because the regex exceeded the configured size + /// limit before being built, then this returns the configured size limit. + /// + /// The limit returned is what was configured, and corresponds to the + /// maximum amount of heap usage in bytes. + pub fn size_limit(&self) -> Option<usize> { + match self.kind { + BuildErrorKind::NFA(ref err) => err.size_limit(), + _ => None, + } + } + + /// If this error corresponds to a syntax error, then a reference to it is + /// returned by this method. + pub fn syntax_error(&self) -> Option<®ex_syntax::Error> { + match self.kind { + BuildErrorKind::Syntax { ref err, .. } => Some(err), + _ => None, + } + } + + pub(crate) fn ast(pid: PatternID, err: ast::Error) -> BuildError { + let err = regex_syntax::Error::from(err); + BuildError { kind: BuildErrorKind::Syntax { pid, err } } + } + + pub(crate) fn hir(pid: PatternID, err: hir::Error) -> BuildError { + let err = regex_syntax::Error::from(err); + BuildError { kind: BuildErrorKind::Syntax { pid, err } } + } + + pub(crate) fn nfa(err: nfa::thompson::BuildError) -> BuildError { + BuildError { kind: BuildErrorKind::NFA(err) } + } +} + +#[cfg(feature = "std")] +impl std::error::Error for BuildError { + fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { + match self.kind { + BuildErrorKind::Syntax { ref err, .. } => Some(err), + BuildErrorKind::NFA(ref err) => Some(err), + } + } +} + +impl core::fmt::Display for BuildError { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + match self.kind { + BuildErrorKind::Syntax { pid, .. } => { + write!(f, "error parsing pattern {}", pid.as_usize()) + } + BuildErrorKind::NFA(_) => write!(f, "error building NFA"), + } + } +} + +/// An error that occurs when a search should be retried. +/// +/// This retry error distinguishes between two different failure modes. +/// +/// The first is one where potential quadratic behavior has been detected. +/// In this case, whatever optimization that led to this behavior should be +/// stopped, and the next best strategy should be used. +/// +/// The second indicates that the underlying regex engine has failed for some +/// reason. This usually occurs because either a lazy DFA's cache has become +/// ineffective or because a non-ASCII byte has been seen *and* a Unicode word +/// boundary was used in one of the patterns. In this failure case, a different +/// regex engine that won't fail in these ways (PikeVM, backtracker or the +/// one-pass DFA) should be used. +/// +/// This is an internal error only and should never bleed into the public +/// API. +#[derive(Debug)] +pub(crate) enum RetryError { + Quadratic(RetryQuadraticError), + Fail(RetryFailError), +} + +#[cfg(feature = "std")] +impl std::error::Error for RetryError {} + +impl core::fmt::Display for RetryError { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + match *self { + RetryError::Quadratic(ref err) => err.fmt(f), + RetryError::Fail(ref err) => err.fmt(f), + } + } +} + +impl From<MatchError> for RetryError { + fn from(merr: MatchError) -> RetryError { + RetryError::Fail(RetryFailError::from(merr)) + } +} + +/// An error that occurs when potential quadratic behavior has been detected +/// when applying either the "reverse suffix" or "reverse inner" optimizations. +/// +/// When this error occurs, callers should abandon the "reverse" optimization +/// and use a normal forward search. +#[derive(Debug)] +pub(crate) struct RetryQuadraticError(()); + +impl RetryQuadraticError { + pub(crate) fn new() -> RetryQuadraticError { + RetryQuadraticError(()) + } +} + +#[cfg(feature = "std")] +impl std::error::Error for RetryQuadraticError {} + +impl core::fmt::Display for RetryQuadraticError { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + write!(f, "regex engine gave up to avoid quadratic behavior") + } +} + +impl From<RetryQuadraticError> for RetryError { + fn from(err: RetryQuadraticError) -> RetryError { + RetryError::Quadratic(err) + } +} + +/// An error that occurs when a regex engine "gives up" for some reason before +/// finishing a search. Usually this occurs because of heuristic Unicode word +/// boundary support or because of ineffective cache usage in the lazy DFA. +/// +/// When this error occurs, callers should retry the regex search with a +/// different regex engine. +/// +/// Note that this has convenient `From` impls that will automatically +/// convert a `MatchError` into this error. This works because the meta +/// regex engine internals guarantee that errors like `HaystackTooLong` and +/// `UnsupportedAnchored` will never occur. The only errors left are `Quit` and +/// `GaveUp`, which both correspond to this "failure" error. +#[derive(Debug)] +pub(crate) struct RetryFailError { + offset: usize, +} + +impl RetryFailError { + pub(crate) fn from_offset(offset: usize) -> RetryFailError { + RetryFailError { offset } + } +} + +#[cfg(feature = "std")] +impl std::error::Error for RetryFailError {} + +impl core::fmt::Display for RetryFailError { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + write!(f, "regex engine failed at offset {:?}", self.offset) + } +} + +impl From<RetryFailError> for RetryError { + fn from(err: RetryFailError) -> RetryError { + RetryError::Fail(err) + } +} + +impl From<MatchError> for RetryFailError { + fn from(merr: MatchError) -> RetryFailError { + use crate::util::search::MatchErrorKind::*; + + match *merr.kind() { + Quit { offset, .. } => RetryFailError::from_offset(offset), + GaveUp { offset } => RetryFailError::from_offset(offset), + // These can never occur because we avoid them by construction + // or with higher level control flow logic. For example, the + // backtracker's wrapper will never hand out a backtracker engine + // when the haystack would be too long. + HaystackTooLong { .. } | UnsupportedAnchored { .. } => { + unreachable!("found impossible error in meta engine: {}", merr) + } + } + } +} diff --git a/third_party/rust/regex-automata/src/meta/limited.rs b/third_party/rust/regex-automata/src/meta/limited.rs new file mode 100644 index 0000000000..192a2625e4 --- /dev/null +++ b/third_party/rust/regex-automata/src/meta/limited.rs @@ -0,0 +1,267 @@ +/*! +This module defines two bespoke reverse DFA searching routines. (One for the +lazy DFA and one for the fully compiled DFA.) These routines differ from the +usual ones by permitting the caller to specify a minimum starting position. +That is, the search will begin at `input.end()` and will usually stop at +`input.start()`, unless `min_start > input.start()`, in which case, the search +will stop at `min_start`. + +In other words, this lets you say, "no, the search must not extend past this +point, even if it's within the bounds of the given `Input`." And if the search +*does* want to go past that point, it stops and returns a "may be quadratic" +error, which indicates that the caller should retry using some other technique. + +These routines specifically exist to protect against quadratic behavior when +employing the "reverse suffix" and "reverse inner" optimizations. Without the +backstop these routines provide, it is possible for parts of the haystack to +get re-scanned over and over again. The backstop not only prevents this, but +*tells you when it is happening* so that you can change the strategy. + +Why can't we just use the normal search routines? We could use the normal +search routines and just set the start bound on the provided `Input` to our +`min_start` position. The problem here is that it's impossible to distinguish +between "no match because we reached the end of input" and "determined there +was no match well before the end of input." The former case is what we care +about with respect to quadratic behavior. The latter case is totally fine. + +Why don't we modify the normal search routines to report the position at which +the search stops? I considered this, and I still wonder if it is indeed the +right thing to do. However, I think the straight-forward thing to do there +would be to complicate the return type signature of almost every search routine +in this crate, which I really do not want to do. It therefore might make more +sense to provide a richer way for search routines to report meta data, but that +was beyond my bandwidth to work on at the time of writing. + +See the 'opt/reverse-inner' and 'opt/reverse-suffix' benchmarks in rebar for a +real demonstration of how quadratic behavior is mitigated. +*/ + +use crate::{ + meta::error::{RetryError, RetryQuadraticError}, + HalfMatch, Input, MatchError, +}; + +#[cfg(feature = "dfa-build")] +pub(crate) fn dfa_try_search_half_rev( + dfa: &crate::dfa::dense::DFA<alloc::vec::Vec<u32>>, + input: &Input<'_>, + min_start: usize, +) -> Result<Option<HalfMatch>, RetryError> { + use crate::dfa::Automaton; + + let mut mat = None; + let mut sid = dfa.start_state_reverse(input)?; + if input.start() == input.end() { + dfa_eoi_rev(dfa, input, &mut sid, &mut mat)?; + return Ok(mat); + } + let mut at = input.end() - 1; + loop { + sid = dfa.next_state(sid, input.haystack()[at]); + if dfa.is_special_state(sid) { + if dfa.is_match_state(sid) { + let pattern = dfa.match_pattern(sid, 0); + // Since reverse searches report the beginning of a + // match and the beginning is inclusive (not exclusive + // like the end of a match), we add 1 to make it + // inclusive. + mat = Some(HalfMatch::new(pattern, at + 1)); + } else if dfa.is_dead_state(sid) { + return Ok(mat); + } else if dfa.is_quit_state(sid) { + if mat.is_some() { + return Ok(mat); + } + return Err(MatchError::quit(input.haystack()[at], at).into()); + } + } + if at == input.start() { + break; + } + at -= 1; + if at < min_start { + trace!( + "reached position {} which is before the previous literal \ + match, quitting to avoid quadratic behavior", + at, + ); + return Err(RetryError::Quadratic(RetryQuadraticError::new())); + } + } + let was_dead = dfa.is_dead_state(sid); + dfa_eoi_rev(dfa, input, &mut sid, &mut mat)?; + // If we reach the beginning of the search and we could otherwise still + // potentially keep matching if there was more to match, then we actually + // return an error to indicate giving up on this optimization. Why? Because + // we can't prove that the real match begins at where we would report it. + // + // This only happens when all of the following are true: + // + // 1) We reach the starting point of our search span. + // 2) The match we found is before the starting point. + // 3) The FSM reports we could possibly find a longer match. + // + // We need (1) because otherwise the search stopped before the starting + // point and there is no possible way to find a more leftmost position. + // + // We need (2) because if the match found has an offset equal to the minimum + // possible offset, then there is no possible more leftmost match. + // + // We need (3) because if the FSM couldn't continue anyway (i.e., it's in + // a dead state), then we know we couldn't find anything more leftmost + // than what we have. (We have to check the state we were in prior to the + // EOI transition since the EOI transition will usually bring us to a dead + // state by virtue of it represents the end-of-input.) + if at == input.start() + && mat.map_or(false, |m| m.offset() > input.start()) + && !was_dead + { + trace!( + "reached beginning of search at offset {} without hitting \ + a dead state, quitting to avoid potential false positive match", + at, + ); + return Err(RetryError::Quadratic(RetryQuadraticError::new())); + } + Ok(mat) +} + +#[cfg(feature = "hybrid")] +pub(crate) fn hybrid_try_search_half_rev( + dfa: &crate::hybrid::dfa::DFA, + cache: &mut crate::hybrid::dfa::Cache, + input: &Input<'_>, + min_start: usize, +) -> Result<Option<HalfMatch>, RetryError> { + let mut mat = None; + let mut sid = dfa.start_state_reverse(cache, input)?; + if input.start() == input.end() { + hybrid_eoi_rev(dfa, cache, input, &mut sid, &mut mat)?; + return Ok(mat); + } + let mut at = input.end() - 1; + loop { + sid = dfa + .next_state(cache, sid, input.haystack()[at]) + .map_err(|_| MatchError::gave_up(at))?; + if sid.is_tagged() { + if sid.is_match() { + let pattern = dfa.match_pattern(cache, sid, 0); + // Since reverse searches report the beginning of a + // match and the beginning is inclusive (not exclusive + // like the end of a match), we add 1 to make it + // inclusive. + mat = Some(HalfMatch::new(pattern, at + 1)); + } else if sid.is_dead() { + return Ok(mat); + } else if sid.is_quit() { + if mat.is_some() { + return Ok(mat); + } + return Err(MatchError::quit(input.haystack()[at], at).into()); + } + } + if at == input.start() { + break; + } + at -= 1; + if at < min_start { + trace!( + "reached position {} which is before the previous literal \ + match, quitting to avoid quadratic behavior", + at, + ); + return Err(RetryError::Quadratic(RetryQuadraticError::new())); + } + } + let was_dead = sid.is_dead(); + hybrid_eoi_rev(dfa, cache, input, &mut sid, &mut mat)?; + // See the comments in the full DFA routine above for why we need this. + if at == input.start() + && mat.map_or(false, |m| m.offset() > input.start()) + && !was_dead + { + trace!( + "reached beginning of search at offset {} without hitting \ + a dead state, quitting to avoid potential false positive match", + at, + ); + return Err(RetryError::Quadratic(RetryQuadraticError::new())); + } + Ok(mat) +} + +#[cfg(feature = "dfa-build")] +#[cfg_attr(feature = "perf-inline", inline(always))] +fn dfa_eoi_rev( + dfa: &crate::dfa::dense::DFA<alloc::vec::Vec<u32>>, + input: &Input<'_>, + sid: &mut crate::util::primitives::StateID, + mat: &mut Option<HalfMatch>, +) -> Result<(), MatchError> { + use crate::dfa::Automaton; + + let sp = input.get_span(); + if sp.start > 0 { + let byte = input.haystack()[sp.start - 1]; + *sid = dfa.next_state(*sid, byte); + if dfa.is_match_state(*sid) { + let pattern = dfa.match_pattern(*sid, 0); + *mat = Some(HalfMatch::new(pattern, sp.start)); + } else if dfa.is_quit_state(*sid) { + if mat.is_some() { + return Ok(()); + } + return Err(MatchError::quit(byte, sp.start - 1)); + } + } else { + *sid = dfa.next_eoi_state(*sid); + if dfa.is_match_state(*sid) { + let pattern = dfa.match_pattern(*sid, 0); + *mat = Some(HalfMatch::new(pattern, 0)); + } + // N.B. We don't have to check 'is_quit' here because the EOI + // transition can never lead to a quit state. + debug_assert!(!dfa.is_quit_state(*sid)); + } + Ok(()) +} + +#[cfg(feature = "hybrid")] +#[cfg_attr(feature = "perf-inline", inline(always))] +fn hybrid_eoi_rev( + dfa: &crate::hybrid::dfa::DFA, + cache: &mut crate::hybrid::dfa::Cache, + input: &Input<'_>, + sid: &mut crate::hybrid::LazyStateID, + mat: &mut Option<HalfMatch>, +) -> Result<(), MatchError> { + let sp = input.get_span(); + if sp.start > 0 { + let byte = input.haystack()[sp.start - 1]; + *sid = dfa + .next_state(cache, *sid, byte) + .map_err(|_| MatchError::gave_up(sp.start))?; + if sid.is_match() { + let pattern = dfa.match_pattern(cache, *sid, 0); + *mat = Some(HalfMatch::new(pattern, sp.start)); + } else if sid.is_quit() { + if mat.is_some() { + return Ok(()); + } + return Err(MatchError::quit(byte, sp.start - 1)); + } + } else { + *sid = dfa + .next_eoi_state(cache, *sid) + .map_err(|_| MatchError::gave_up(sp.start))?; + if sid.is_match() { + let pattern = dfa.match_pattern(cache, *sid, 0); + *mat = Some(HalfMatch::new(pattern, 0)); + } + // N.B. We don't have to check 'is_quit' here because the EOI + // transition can never lead to a quit state. + debug_assert!(!sid.is_quit()); + } + Ok(()) +} diff --git a/third_party/rust/regex-automata/src/meta/literal.rs b/third_party/rust/regex-automata/src/meta/literal.rs new file mode 100644 index 0000000000..a68b93b7a6 --- /dev/null +++ b/third_party/rust/regex-automata/src/meta/literal.rs @@ -0,0 +1,81 @@ +use alloc::{vec, vec::Vec}; + +use regex_syntax::hir::Hir; + +use crate::{meta::regex::RegexInfo, util::search::MatchKind}; + +/// Pull out an alternation of literals from the given sequence of HIR +/// expressions. +/// +/// There are numerous ways for this to fail. Generally, this only applies +/// to regexes of the form 'foo|bar|baz|...|quux'. It can also fail if there +/// are "too few" alternates, in which case, the regex engine is likely faster. +/// +/// And currently, this only returns something when 'hirs.len() == 1'. +pub(crate) fn alternation_literals( + info: &RegexInfo, + hirs: &[&Hir], +) -> Option<Vec<Vec<u8>>> { + use regex_syntax::hir::{HirKind, Literal}; + + // Might as well skip the work below if we know we can't build an + // Aho-Corasick searcher. + if !cfg!(feature = "perf-literal-multisubstring") { + return None; + } + // This is pretty hacky, but basically, if `is_alternation_literal` is + // true, then we can make several assumptions about the structure of our + // HIR. This is what justifies the `unreachable!` statements below. + if hirs.len() != 1 + || !info.props()[0].look_set().is_empty() + || info.props()[0].explicit_captures_len() > 0 + || !info.props()[0].is_alternation_literal() + || info.config().get_match_kind() != MatchKind::LeftmostFirst + { + return None; + } + let hir = &hirs[0]; + let alts = match *hir.kind() { + HirKind::Alternation(ref alts) => alts, + _ => return None, // one literal isn't worth it + }; + + let mut lits = vec![]; + for alt in alts { + let mut lit = vec![]; + match *alt.kind() { + HirKind::Literal(Literal(ref bytes)) => { + lit.extend_from_slice(bytes) + } + HirKind::Concat(ref exprs) => { + for e in exprs { + match *e.kind() { + HirKind::Literal(Literal(ref bytes)) => { + lit.extend_from_slice(bytes); + } + _ => unreachable!("expected literal, got {:?}", e), + } + } + } + _ => unreachable!("expected literal or concat, got {:?}", alt), + } + lits.push(lit); + } + // Why do this? Well, when the number of literals is small, it's likely + // that we'll use the lazy DFA which is in turn likely to be faster than + // Aho-Corasick in such cases. Primarily because Aho-Corasick doesn't have + // a "lazy DFA" but either a contiguous NFA or a full DFA. We rarely use + // the latter because it is so hungry (in time and space), and the former + // is decently fast, but not as fast as a well oiled lazy DFA. + // + // However, once the number starts getting large, the lazy DFA is likely + // to start thrashing because of the modest default cache size. When + // exactly does this happen? Dunno. But at whatever point that is (we make + // a guess below based on ad hoc benchmarking), we'll want to cut over to + // Aho-Corasick, where even the contiguous NFA is likely to do much better. + if lits.len() < 3000 { + debug!("skipping Aho-Corasick because there are too few literals"); + return None; + } + Some(lits) +} diff --git a/third_party/rust/regex-automata/src/meta/mod.rs b/third_party/rust/regex-automata/src/meta/mod.rs new file mode 100644 index 0000000000..01f430fcb7 --- /dev/null +++ b/third_party/rust/regex-automata/src/meta/mod.rs @@ -0,0 +1,62 @@ +/*! +Provides a regex matcher that composes several other regex matchers +automatically. + +This module is home to a meta [`Regex`], which provides a convenient high +level API for executing regular expressions in linear time. + +# Comparison with the `regex` crate + +A meta `Regex` is the implementation used directly by the `regex` crate. +Indeed, the `regex` crate API is essentially just a light wrapper over a meta +`Regex`. This means that if you need the full flexibility offered by this +API, then you should be able to switch to using this API directly without +any changes in match semantics or syntax. However, there are some API level +differences: + +* The `regex` crate API returns match objects that include references to the +haystack itself, which in turn makes it easy to access the matching strings +without having to slice the haystack yourself. In contrast, a meta `Regex` +returns match objects that only have offsets in them. +* At time of writing, a meta `Regex` doesn't have some of the convenience +routines that the `regex` crate has, such as replacements. Note though that +[`Captures::interpolate_string`](crate::util::captures::Captures::interpolate_string) +will handle the replacement string interpolation for you. +* A meta `Regex` supports the [`Input`](crate::Input) abstraction, which +provides a way to configure a search in more ways than is supported by the +`regex` crate. For example, [`Input::anchored`](crate::Input::anchored) can +be used to run an anchored search, regardless of whether the pattern is itself +anchored with a `^`. +* A meta `Regex` supports multi-pattern searching everywhere. +Indeed, every [`Match`](crate::Match) returned by the search APIs +include a [`PatternID`](crate::PatternID) indicating which pattern +matched. In the single pattern case, all matches correspond to +[`PatternID::ZERO`](crate::PatternID::ZERO). In contrast, the `regex` crate +has distinct `Regex` and a `RegexSet` APIs. The former only supports a single +pattern, while the latter supports multiple patterns but cannot report the +offsets of a match. +* A meta `Regex` provides the explicit capability of bypassing its internal +memory pool for automatically acquiring mutable scratch space required by its +internal regex engines. Namely, a [`Cache`] can be explicitly provided to lower +level routines such as [`Regex::search_with`]. + +*/ + +pub use self::{ + error::BuildError, + regex::{ + Builder, Cache, CapturesMatches, Config, FindMatches, Regex, Split, + SplitN, + }, +}; + +mod error; +#[cfg(any(feature = "dfa-build", feature = "hybrid"))] +mod limited; +mod literal; +mod regex; +mod reverse_inner; +#[cfg(any(feature = "dfa-build", feature = "hybrid"))] +mod stopat; +mod strategy; +mod wrappers; diff --git a/third_party/rust/regex-automata/src/meta/regex.rs b/third_party/rust/regex-automata/src/meta/regex.rs new file mode 100644 index 0000000000..3a04b14d88 --- /dev/null +++ b/third_party/rust/regex-automata/src/meta/regex.rs @@ -0,0 +1,3649 @@ +use core::{ + borrow::Borrow, + panic::{RefUnwindSafe, UnwindSafe}, +}; + +use alloc::{boxed::Box, sync::Arc, vec, vec::Vec}; + +use regex_syntax::{ + ast, + hir::{self, Hir}, +}; + +use crate::{ + meta::{ + error::BuildError, + strategy::{self, Strategy}, + wrappers, + }, + nfa::thompson::WhichCaptures, + util::{ + captures::{Captures, GroupInfo}, + iter, + pool::{Pool, PoolGuard}, + prefilter::Prefilter, + primitives::{NonMaxUsize, PatternID}, + search::{HalfMatch, Input, Match, MatchKind, PatternSet, Span}, + }, +}; + +/// A type alias for our pool of meta::Cache that fixes the type parameters to +/// what we use for the meta regex below. +type CachePool = Pool<Cache, CachePoolFn>; + +/// Same as above, but for the guard returned by a pool. +type CachePoolGuard<'a> = PoolGuard<'a, Cache, CachePoolFn>; + +/// The type of the closure we use to create new caches. We need to spell out +/// all of the marker traits or else we risk leaking !MARKER impls. +type CachePoolFn = + Box<dyn Fn() -> Cache + Send + Sync + UnwindSafe + RefUnwindSafe>; + +/// A regex matcher that works by composing several other regex matchers +/// automatically. +/// +/// In effect, a meta regex papers over a lot of the quirks or performance +/// problems in each of the regex engines in this crate. Its goal is to provide +/// an infallible and simple API that "just does the right thing" in the common +/// case. +/// +/// A meta regex is the implementation of a `Regex` in the `regex` crate. +/// Indeed, the `regex` crate API is essentially just a light wrapper over +/// this type. This includes the `regex` crate's `RegexSet` API! +/// +/// # Composition +/// +/// This is called a "meta" matcher precisely because it uses other regex +/// matchers to provide a convenient high level regex API. Here are some +/// examples of how other regex matchers are composed: +/// +/// * When calling [`Regex::captures`], instead of immediately +/// running a slower but more capable regex engine like the +/// [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM), the meta regex engine +/// will usually first look for the bounds of a match with a higher throughput +/// regex engine like a [lazy DFA](crate::hybrid). Only when a match is found +/// is a slower engine like `PikeVM` used to find the matching span for each +/// capture group. +/// * While higher throughout engines like the lazy DFA cannot handle +/// Unicode word boundaries in general, they can still be used on pure ASCII +/// haystacks by pretending that Unicode word boundaries are just plain ASCII +/// word boundaries. However, if a haystack is not ASCII, the meta regex engine +/// will automatically switch to a (possibly slower) regex engine that supports +/// Unicode word boundaries in general. +/// * In some cases where a regex pattern is just a simple literal or a small +/// set of literals, an actual regex engine won't be used at all. Instead, +/// substring or multi-substring search algorithms will be employed. +/// +/// There are many other forms of composition happening too, but the above +/// should give a general idea. In particular, it may perhaps be surprising +/// that *multiple* regex engines might get executed for a single search. That +/// is, the decision of what regex engine to use is not _just_ based on the +/// pattern, but also based on the dynamic execution of the search itself. +/// +/// The primary reason for this composition is performance. The fundamental +/// tension is that the faster engines tend to be less capable, and the more +/// capable engines tend to be slower. +/// +/// Note that the forms of composition that are allowed are determined by +/// compile time crate features and configuration. For example, if the `hybrid` +/// feature isn't enabled, or if [`Config::hybrid`] has been disabled, then the +/// meta regex engine will never use a lazy DFA. +/// +/// # Synchronization and cloning +/// +/// Most of the regex engines in this crate require some kind of mutable +/// "scratch" space to read and write from while performing a search. Since +/// a meta regex composes these regex engines, a meta regex also requires +/// mutable scratch space. This scratch space is called a [`Cache`]. +/// +/// Most regex engines _also_ usually have a read-only component, typically +/// a [Thompson `NFA`](crate::nfa::thompson::NFA). +/// +/// In order to make the `Regex` API convenient, most of the routines hide +/// the fact that a `Cache` is needed at all. To achieve this, a [memory +/// pool](crate::util::pool::Pool) is used internally to retrieve `Cache` +/// values in a thread safe way that also permits reuse. This in turn implies +/// that every such search call requires some form of synchronization. Usually +/// this synchronization is fast enough to not notice, but in some cases, it +/// can be a bottleneck. This typically occurs when all of the following are +/// true: +/// +/// * The same `Regex` is shared across multiple threads simultaneously, +/// usually via a [`util::lazy::Lazy`](crate::util::lazy::Lazy) or something +/// similar from the `once_cell` or `lazy_static` crates. +/// * The primary unit of work in each thread is a regex search. +/// * Searches are run on very short haystacks. +/// +/// This particular case can lead to high contention on the pool used by a +/// `Regex` internally, which can in turn increase latency to a noticeable +/// effect. This cost can be mitigated in one of the following ways: +/// +/// * Use a distinct copy of a `Regex` in each thread, usually by cloning it. +/// Cloning a `Regex` _does not_ do a deep copy of its read-only component. +/// But it does lead to each `Regex` having its own memory pool, which in +/// turn eliminates the problem of contention. In general, this technique should +/// not result in any additional memory usage when compared to sharing the same +/// `Regex` across multiple threads simultaneously. +/// * Use lower level APIs, like [`Regex::search_with`], which permit passing +/// a `Cache` explicitly. In this case, it is up to you to determine how best +/// to provide a `Cache`. For example, you might put a `Cache` in thread-local +/// storage if your use case allows for it. +/// +/// Overall, this is an issue that happens rarely in practice, but it can +/// happen. +/// +/// # Warning: spin-locks may be used in alloc-only mode +/// +/// When this crate is built without the `std` feature and the high level APIs +/// on a `Regex` are used, then a spin-lock will be used to synchronize access +/// to an internal pool of `Cache` values. This may be undesirable because +/// a spin-lock is [effectively impossible to implement correctly in user +/// space][spinlocks-are-bad]. That is, more concretely, the spin-lock could +/// result in a deadlock. +/// +/// [spinlocks-are-bad]: https://matklad.github.io/2020/01/02/spinlocks-considered-harmful.html +/// +/// If one wants to avoid the use of spin-locks when the `std` feature is +/// disabled, then you must use APIs that accept a `Cache` value explicitly. +/// For example, [`Regex::search_with`]. +/// +/// # Example +/// +/// ``` +/// use regex_automata::meta::Regex; +/// +/// let re = Regex::new(r"^[0-9]{4}-[0-9]{2}-[0-9]{2}$")?; +/// assert!(re.is_match("2010-03-14")); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +/// +/// # Example: anchored search +/// +/// This example shows how to use [`Input::anchored`] to run an anchored +/// search, even when the regex pattern itself isn't anchored. An anchored +/// search guarantees that if a match is found, then the start offset of the +/// match corresponds to the offset at which the search was started. +/// +/// ``` +/// use regex_automata::{meta::Regex, Anchored, Input, Match}; +/// +/// let re = Regex::new(r"\bfoo\b")?; +/// let input = Input::new("xx foo xx").range(3..).anchored(Anchored::Yes); +/// // The offsets are in terms of the original haystack. +/// assert_eq!(Some(Match::must(0, 3..6)), re.find(input)); +/// +/// // Notice that no match occurs here, because \b still takes the +/// // surrounding context into account, even if it means looking back +/// // before the start of your search. +/// let hay = "xxfoo xx"; +/// let input = Input::new(hay).range(2..).anchored(Anchored::Yes); +/// assert_eq!(None, re.find(input)); +/// // Indeed, you cannot achieve the above by simply slicing the +/// // haystack itself, since the regex engine can't see the +/// // surrounding context. This is why 'Input' permits setting +/// // the bounds of a search! +/// let input = Input::new(&hay[2..]).anchored(Anchored::Yes); +/// // WRONG! +/// assert_eq!(Some(Match::must(0, 0..3)), re.find(input)); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +/// +/// # Example: earliest search +/// +/// This example shows how to use [`Input::earliest`] to run a search that +/// might stop before finding the typical leftmost match. +/// +/// ``` +/// use regex_automata::{meta::Regex, Anchored, Input, Match}; +/// +/// let re = Regex::new(r"[a-z]{3}|b")?; +/// let input = Input::new("abc").earliest(true); +/// assert_eq!(Some(Match::must(0, 1..2)), re.find(input)); +/// +/// // Note that "earliest" isn't really a match semantic unto itself. +/// // Instead, it is merely an instruction to whatever regex engine +/// // gets used internally to quit as soon as it can. For example, +/// // this regex uses a different search technique, and winds up +/// // producing a different (but valid) match! +/// let re = Regex::new(r"abc|b")?; +/// let input = Input::new("abc").earliest(true); +/// assert_eq!(Some(Match::must(0, 0..3)), re.find(input)); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +/// +/// # Example: change the line terminator +/// +/// This example shows how to enable multi-line mode by default and change +/// the line terminator to the NUL byte: +/// +/// ``` +/// use regex_automata::{meta::Regex, util::syntax, Match}; +/// +/// let re = Regex::builder() +/// .syntax(syntax::Config::new().multi_line(true)) +/// .configure(Regex::config().line_terminator(b'\x00')) +/// .build(r"^foo$")?; +/// let hay = "\x00foo\x00"; +/// assert_eq!(Some(Match::must(0, 1..4)), re.find(hay)); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +#[derive(Debug)] +pub struct Regex { + /// The actual regex implementation. + imp: Arc<RegexI>, + /// A thread safe pool of caches. + /// + /// For the higher level search APIs, a `Cache` is automatically plucked + /// from this pool before running a search. The lower level `with` methods + /// permit the caller to provide their own cache, thereby bypassing + /// accesses to this pool. + /// + /// Note that we put this outside the `Arc` so that cloning a `Regex` + /// results in creating a fresh `CachePool`. This in turn permits callers + /// to clone regexes into separate threads where each such regex gets + /// the pool's "thread owner" optimization. Otherwise, if one shares the + /// `Regex` directly, then the pool will go through a slower mutex path for + /// all threads except for the "owner." + pool: CachePool, +} + +/// The internal implementation of `Regex`, split out so that it can be wrapped +/// in an `Arc`. +#[derive(Debug)] +struct RegexI { + /// The core matching engine. + /// + /// Why is this reference counted when RegexI is already wrapped in an Arc? + /// Well, we need to capture this in a closure to our `Pool` below in order + /// to create new `Cache` values when needed. So since it needs to be in + /// two places, we make it reference counted. + /// + /// We make `RegexI` itself reference counted too so that `Regex` itself + /// stays extremely small and very cheap to clone. + strat: Arc<dyn Strategy>, + /// Metadata about the regexes driving the strategy. The metadata is also + /// usually stored inside the strategy too, but we put it here as well + /// so that we can get quick access to it (without virtual calls) before + /// executing the regex engine. For example, we use this metadata to + /// detect a subset of cases where we know a match is impossible, and can + /// thus avoid calling into the strategy at all. + /// + /// Since `RegexInfo` is stored in multiple places, it is also reference + /// counted. + info: RegexInfo, +} + +/// Convenience constructors for a `Regex` using the default configuration. +impl Regex { + /// Builds a `Regex` from a single pattern string using the default + /// configuration. + /// + /// If there was a problem parsing the pattern or a problem turning it into + /// a regex matcher, then an error is returned. + /// + /// If you want to change the configuration of a `Regex`, use a [`Builder`] + /// with a [`Config`]. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{meta::Regex, Match}; + /// + /// let re = Regex::new(r"(?Rm)^foo$")?; + /// let hay = "\r\nfoo\r\n"; + /// assert_eq!(Some(Match::must(0, 2..5)), re.find(hay)); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn new(pattern: &str) -> Result<Regex, BuildError> { + Regex::builder().build(pattern) + } + + /// Builds a `Regex` from many pattern strings using the default + /// configuration. + /// + /// If there was a problem parsing any of the patterns or a problem turning + /// them into a regex matcher, then an error is returned. + /// + /// If you want to change the configuration of a `Regex`, use a [`Builder`] + /// with a [`Config`]. + /// + /// # Example: simple lexer + /// + /// This simplistic example leverages the multi-pattern support to build a + /// simple little lexer. The pattern ID in the match tells you which regex + /// matched, which in turn might be used to map back to the "type" of the + /// token returned by the lexer. + /// + /// ``` + /// use regex_automata::{meta::Regex, Match}; + /// + /// let re = Regex::new_many(&[ + /// r"[[:space:]]", + /// r"[A-Za-z0-9][A-Za-z0-9_]+", + /// r"->", + /// r".", + /// ])?; + /// let haystack = "fn is_boss(bruce: i32, springsteen: String) -> bool;"; + /// let matches: Vec<Match> = re.find_iter(haystack).collect(); + /// assert_eq!(matches, vec![ + /// Match::must(1, 0..2), // 'fn' + /// Match::must(0, 2..3), // ' ' + /// Match::must(1, 3..10), // 'is_boss' + /// Match::must(3, 10..11), // '(' + /// Match::must(1, 11..16), // 'bruce' + /// Match::must(3, 16..17), // ':' + /// Match::must(0, 17..18), // ' ' + /// Match::must(1, 18..21), // 'i32' + /// Match::must(3, 21..22), // ',' + /// Match::must(0, 22..23), // ' ' + /// Match::must(1, 23..34), // 'springsteen' + /// Match::must(3, 34..35), // ':' + /// Match::must(0, 35..36), // ' ' + /// Match::must(1, 36..42), // 'String' + /// Match::must(3, 42..43), // ')' + /// Match::must(0, 43..44), // ' ' + /// Match::must(2, 44..46), // '->' + /// Match::must(0, 46..47), // ' ' + /// Match::must(1, 47..51), // 'bool' + /// Match::must(3, 51..52), // ';' + /// ]); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// One can write a lexer like the above using a regex like + /// `(?P<space>[[:space:]])|(?P<ident>[A-Za-z0-9][A-Za-z0-9_]+)|...`, + /// but then you need to ask whether capture group matched to determine + /// which branch in the regex matched, and thus, which token the match + /// corresponds to. In contrast, the above example includes the pattern ID + /// in the match. There's no need to use capture groups at all. + /// + /// # Example: finding the pattern that caused an error + /// + /// When a syntax error occurs, it is possible to ask which pattern + /// caused the syntax error. + /// + /// ``` + /// use regex_automata::{meta::Regex, PatternID}; + /// + /// let err = Regex::new_many(&["a", "b", r"\p{Foo}", "c"]).unwrap_err(); + /// assert_eq!(Some(PatternID::must(2)), err.pattern()); + /// ``` + /// + /// # Example: zero patterns is valid + /// + /// Building a regex with zero patterns results in a regex that never + /// matches anything. Because this routine is generic, passing an empty + /// slice usually requires a turbo-fish (or something else to help type + /// inference). + /// + /// ``` + /// use regex_automata::{meta::Regex, util::syntax, Match}; + /// + /// let re = Regex::new_many::<&str>(&[])?; + /// assert_eq!(None, re.find("")); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn new_many<P: AsRef<str>>( + patterns: &[P], + ) -> Result<Regex, BuildError> { + Regex::builder().build_many(patterns) + } + + /// Return a default configuration for a `Regex`. + /// + /// This is a convenience routine to avoid needing to import the [`Config`] + /// type when customizing the construction of a `Regex`. + /// + /// # Example: lower the NFA size limit + /// + /// In some cases, the default size limit might be too big. The size limit + /// can be lowered, which will prevent large regex patterns from compiling. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::meta::Regex; + /// + /// let result = Regex::builder() + /// .configure(Regex::config().nfa_size_limit(Some(20 * (1<<10)))) + /// // Not even 20KB is enough to build a single large Unicode class! + /// .build(r"\pL"); + /// assert!(result.is_err()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn config() -> Config { + Config::new() + } + + /// Return a builder for configuring the construction of a `Regex`. + /// + /// This is a convenience routine to avoid needing to import the + /// [`Builder`] type in common cases. + /// + /// # Example: change the line terminator + /// + /// This example shows how to enable multi-line mode by default and change + /// the line terminator to the NUL byte: + /// + /// ``` + /// use regex_automata::{meta::Regex, util::syntax, Match}; + /// + /// let re = Regex::builder() + /// .syntax(syntax::Config::new().multi_line(true)) + /// .configure(Regex::config().line_terminator(b'\x00')) + /// .build(r"^foo$")?; + /// let hay = "\x00foo\x00"; + /// assert_eq!(Some(Match::must(0, 1..4)), re.find(hay)); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn builder() -> Builder { + Builder::new() + } +} + +/// High level convenience routines for using a regex to search a haystack. +impl Regex { + /// Returns true if and only if this regex matches the given haystack. + /// + /// This routine may short circuit if it knows that scanning future input + /// will never lead to a different result. (Consider how this might make + /// a difference given the regex `a+` on the haystack `aaaaaaaaaaaaaaa`. + /// This routine _may_ stop after it sees the first `a`, but routines like + /// `find` need to continue searching because `+` is greedy by default.) + /// + /// # Example + /// + /// ``` + /// use regex_automata::meta::Regex; + /// + /// let re = Regex::new("foo[0-9]+bar")?; + /// + /// assert!(re.is_match("foo12345bar")); + /// assert!(!re.is_match("foobar")); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// # Example: consistency with search APIs + /// + /// `is_match` is guaranteed to return `true` whenever `find` returns a + /// match. This includes searches that are executed entirely within a + /// codepoint: + /// + /// ``` + /// use regex_automata::{meta::Regex, Input}; + /// + /// let re = Regex::new("a*")?; + /// + /// // This doesn't match because the default configuration bans empty + /// // matches from splitting a codepoint. + /// assert!(!re.is_match(Input::new("☃").span(1..2))); + /// assert_eq!(None, re.find(Input::new("☃").span(1..2))); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// Notice that when UTF-8 mode is disabled, then the above reports a + /// match because the restriction against zero-width matches that split a + /// codepoint has been lifted: + /// + /// ``` + /// use regex_automata::{meta::Regex, Input, Match}; + /// + /// let re = Regex::builder() + /// .configure(Regex::config().utf8_empty(false)) + /// .build("a*")?; + /// + /// assert!(re.is_match(Input::new("☃").span(1..2))); + /// assert_eq!( + /// Some(Match::must(0, 1..1)), + /// re.find(Input::new("☃").span(1..2)), + /// ); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// A similar idea applies when using line anchors with CRLF mode enabled, + /// which prevents them from matching between a `\r` and a `\n`. + /// + /// ``` + /// use regex_automata::{meta::Regex, Input, Match}; + /// + /// let re = Regex::new(r"(?Rm:$)")?; + /// assert!(!re.is_match(Input::new("\r\n").span(1..1))); + /// // A regular line anchor, which only considers \n as a + /// // line terminator, will match. + /// let re = Regex::new(r"(?m:$)")?; + /// assert!(re.is_match(Input::new("\r\n").span(1..1))); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn is_match<'h, I: Into<Input<'h>>>(&self, input: I) -> bool { + let input = input.into().earliest(true); + if self.imp.info.is_impossible(&input) { + return false; + } + let mut guard = self.pool.get(); + let result = self.imp.strat.is_match(&mut guard, &input); + // See 'Regex::search' for why we put the guard back explicitly. + PoolGuard::put(guard); + result + } + + /// Executes a leftmost search and returns the first match that is found, + /// if one exists. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{meta::Regex, Match}; + /// + /// let re = Regex::new("foo[0-9]+")?; + /// assert_eq!(Some(Match::must(0, 0..8)), re.find("foo12345")); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn find<'h, I: Into<Input<'h>>>(&self, input: I) -> Option<Match> { + self.search(&input.into()) + } + + /// Executes a leftmost forward search and writes the spans of capturing + /// groups that participated in a match into the provided [`Captures`] + /// value. If no match was found, then [`Captures::is_match`] is guaranteed + /// to return `false`. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{meta::Regex, Span}; + /// + /// let re = Regex::new(r"^([0-9]{4})-([0-9]{2})-([0-9]{2})$")?; + /// let mut caps = re.create_captures(); + /// + /// re.captures("2010-03-14", &mut caps); + /// assert!(caps.is_match()); + /// assert_eq!(Some(Span::from(0..4)), caps.get_group(1)); + /// assert_eq!(Some(Span::from(5..7)), caps.get_group(2)); + /// assert_eq!(Some(Span::from(8..10)), caps.get_group(3)); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn captures<'h, I: Into<Input<'h>>>( + &self, + input: I, + caps: &mut Captures, + ) { + self.search_captures(&input.into(), caps) + } + + /// Returns an iterator over all non-overlapping leftmost matches in + /// the given haystack. If no match exists, then the iterator yields no + /// elements. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{meta::Regex, Match}; + /// + /// let re = Regex::new("foo[0-9]+")?; + /// let haystack = "foo1 foo12 foo123"; + /// let matches: Vec<Match> = re.find_iter(haystack).collect(); + /// assert_eq!(matches, vec![ + /// Match::must(0, 0..4), + /// Match::must(0, 5..10), + /// Match::must(0, 11..17), + /// ]); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn find_iter<'r, 'h, I: Into<Input<'h>>>( + &'r self, + input: I, + ) -> FindMatches<'r, 'h> { + let cache = self.pool.get(); + let it = iter::Searcher::new(input.into()); + FindMatches { re: self, cache, it } + } + + /// Returns an iterator over all non-overlapping `Captures` values. If no + /// match exists, then the iterator yields no elements. + /// + /// This yields the same matches as [`Regex::find_iter`], but it includes + /// the spans of all capturing groups that participate in each match. + /// + /// **Tip:** See [`util::iter::Searcher`](crate::util::iter::Searcher) for + /// how to correctly iterate over all matches in a haystack while avoiding + /// the creation of a new `Captures` value for every match. (Which you are + /// forced to do with an `Iterator`.) + /// + /// # Example + /// + /// ``` + /// use regex_automata::{meta::Regex, Span}; + /// + /// let re = Regex::new("foo(?P<numbers>[0-9]+)")?; + /// + /// let haystack = "foo1 foo12 foo123"; + /// let matches: Vec<Span> = re + /// .captures_iter(haystack) + /// // The unwrap is OK since 'numbers' matches if the pattern matches. + /// .map(|caps| caps.get_group_by_name("numbers").unwrap()) + /// .collect(); + /// assert_eq!(matches, vec![ + /// Span::from(3..4), + /// Span::from(8..10), + /// Span::from(14..17), + /// ]); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn captures_iter<'r, 'h, I: Into<Input<'h>>>( + &'r self, + input: I, + ) -> CapturesMatches<'r, 'h> { + let cache = self.pool.get(); + let caps = self.create_captures(); + let it = iter::Searcher::new(input.into()); + CapturesMatches { re: self, cache, caps, it } + } + + /// Returns an iterator of spans of the haystack given, delimited by a + /// match of the regex. Namely, each element of the iterator corresponds to + /// a part of the haystack that *isn't* matched by the regular expression. + /// + /// # Example + /// + /// To split a string delimited by arbitrary amounts of spaces or tabs: + /// + /// ``` + /// use regex_automata::meta::Regex; + /// + /// let re = Regex::new(r"[ \t]+")?; + /// let hay = "a b \t c\td e"; + /// let fields: Vec<&str> = re.split(hay).map(|span| &hay[span]).collect(); + /// assert_eq!(fields, vec!["a", "b", "c", "d", "e"]); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// # Example: more cases + /// + /// Basic usage: + /// + /// ``` + /// use regex_automata::meta::Regex; + /// + /// let re = Regex::new(r" ")?; + /// let hay = "Mary had a little lamb"; + /// let got: Vec<&str> = re.split(hay).map(|sp| &hay[sp]).collect(); + /// assert_eq!(got, vec!["Mary", "had", "a", "little", "lamb"]); + /// + /// let re = Regex::new(r"X")?; + /// let hay = ""; + /// let got: Vec<&str> = re.split(hay).map(|sp| &hay[sp]).collect(); + /// assert_eq!(got, vec![""]); + /// + /// let re = Regex::new(r"X")?; + /// let hay = "lionXXtigerXleopard"; + /// let got: Vec<&str> = re.split(hay).map(|sp| &hay[sp]).collect(); + /// assert_eq!(got, vec!["lion", "", "tiger", "leopard"]); + /// + /// let re = Regex::new(r"::")?; + /// let hay = "lion::tiger::leopard"; + /// let got: Vec<&str> = re.split(hay).map(|sp| &hay[sp]).collect(); + /// assert_eq!(got, vec!["lion", "tiger", "leopard"]); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// If a haystack contains multiple contiguous matches, you will end up + /// with empty spans yielded by the iterator: + /// + /// ``` + /// use regex_automata::meta::Regex; + /// + /// let re = Regex::new(r"X")?; + /// let hay = "XXXXaXXbXc"; + /// let got: Vec<&str> = re.split(hay).map(|sp| &hay[sp]).collect(); + /// assert_eq!(got, vec!["", "", "", "", "a", "", "b", "c"]); + /// + /// let re = Regex::new(r"/")?; + /// let hay = "(///)"; + /// let got: Vec<&str> = re.split(hay).map(|sp| &hay[sp]).collect(); + /// assert_eq!(got, vec!["(", "", "", ")"]); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// Separators at the start or end of a haystack are neighbored by empty + /// spans. + /// + /// ``` + /// use regex_automata::meta::Regex; + /// + /// let re = Regex::new(r"0")?; + /// let hay = "010"; + /// let got: Vec<&str> = re.split(hay).map(|sp| &hay[sp]).collect(); + /// assert_eq!(got, vec!["", "1", ""]); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// When the empty string is used as a regex, it splits at every valid + /// UTF-8 boundary by default (which includes the beginning and end of the + /// haystack): + /// + /// ``` + /// use regex_automata::meta::Regex; + /// + /// let re = Regex::new(r"")?; + /// let hay = "rust"; + /// let got: Vec<&str> = re.split(hay).map(|sp| &hay[sp]).collect(); + /// assert_eq!(got, vec!["", "r", "u", "s", "t", ""]); + /// + /// // Splitting by an empty string is UTF-8 aware by default! + /// let re = Regex::new(r"")?; + /// let hay = "☃"; + /// let got: Vec<&str> = re.split(hay).map(|sp| &hay[sp]).collect(); + /// assert_eq!(got, vec!["", "☃", ""]); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// But note that UTF-8 mode for empty strings can be disabled, which will + /// then result in a match at every byte offset in the haystack, + /// including between every UTF-8 code unit. + /// + /// ``` + /// use regex_automata::meta::Regex; + /// + /// let re = Regex::builder() + /// .configure(Regex::config().utf8_empty(false)) + /// .build(r"")?; + /// let hay = "☃".as_bytes(); + /// let got: Vec<&[u8]> = re.split(hay).map(|sp| &hay[sp]).collect(); + /// assert_eq!(got, vec![ + /// // Writing byte string slices is just brutal. The problem is that + /// // b"foo" has type &[u8; 3] instead of &[u8]. + /// &[][..], &[b'\xE2'][..], &[b'\x98'][..], &[b'\x83'][..], &[][..], + /// ]); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// Contiguous separators (commonly shows up with whitespace), can lead to + /// possibly surprising behavior. For example, this code is correct: + /// + /// ``` + /// use regex_automata::meta::Regex; + /// + /// let re = Regex::new(r" ")?; + /// let hay = " a b c"; + /// let got: Vec<&str> = re.split(hay).map(|sp| &hay[sp]).collect(); + /// assert_eq!(got, vec!["", "", "", "", "a", "", "b", "c"]); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// It does *not* give you `["a", "b", "c"]`. For that behavior, you'd want + /// to match contiguous space characters: + /// + /// ``` + /// use regex_automata::meta::Regex; + /// + /// let re = Regex::new(r" +")?; + /// let hay = " a b c"; + /// let got: Vec<&str> = re.split(hay).map(|sp| &hay[sp]).collect(); + /// // N.B. This does still include a leading empty span because ' +' + /// // matches at the beginning of the haystack. + /// assert_eq!(got, vec!["", "a", "b", "c"]); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn split<'r, 'h, I: Into<Input<'h>>>( + &'r self, + input: I, + ) -> Split<'r, 'h> { + Split { finder: self.find_iter(input), last: 0 } + } + + /// Returns an iterator of at most `limit` spans of the haystack given, + /// delimited by a match of the regex. (A `limit` of `0` will return no + /// spans.) Namely, each element of the iterator corresponds to a part + /// of the haystack that *isn't* matched by the regular expression. The + /// remainder of the haystack that is not split will be the last element in + /// the iterator. + /// + /// # Example + /// + /// Get the first two words in some haystack: + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::meta::Regex; + /// + /// let re = Regex::new(r"\W+").unwrap(); + /// let hay = "Hey! How are you?"; + /// let fields: Vec<&str> = + /// re.splitn(hay, 3).map(|span| &hay[span]).collect(); + /// assert_eq!(fields, vec!["Hey", "How", "are you?"]); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// # Examples: more cases + /// + /// ``` + /// use regex_automata::meta::Regex; + /// + /// let re = Regex::new(r" ")?; + /// let hay = "Mary had a little lamb"; + /// let got: Vec<&str> = re.splitn(hay, 3).map(|sp| &hay[sp]).collect(); + /// assert_eq!(got, vec!["Mary", "had", "a little lamb"]); + /// + /// let re = Regex::new(r"X")?; + /// let hay = ""; + /// let got: Vec<&str> = re.splitn(hay, 3).map(|sp| &hay[sp]).collect(); + /// assert_eq!(got, vec![""]); + /// + /// let re = Regex::new(r"X")?; + /// let hay = "lionXXtigerXleopard"; + /// let got: Vec<&str> = re.splitn(hay, 3).map(|sp| &hay[sp]).collect(); + /// assert_eq!(got, vec!["lion", "", "tigerXleopard"]); + /// + /// let re = Regex::new(r"::")?; + /// let hay = "lion::tiger::leopard"; + /// let got: Vec<&str> = re.splitn(hay, 2).map(|sp| &hay[sp]).collect(); + /// assert_eq!(got, vec!["lion", "tiger::leopard"]); + /// + /// let re = Regex::new(r"X")?; + /// let hay = "abcXdef"; + /// let got: Vec<&str> = re.splitn(hay, 1).map(|sp| &hay[sp]).collect(); + /// assert_eq!(got, vec!["abcXdef"]); + /// + /// let re = Regex::new(r"X")?; + /// let hay = "abcdef"; + /// let got: Vec<&str> = re.splitn(hay, 2).map(|sp| &hay[sp]).collect(); + /// assert_eq!(got, vec!["abcdef"]); + /// + /// let re = Regex::new(r"X")?; + /// let hay = "abcXdef"; + /// let got: Vec<&str> = re.splitn(hay, 0).map(|sp| &hay[sp]).collect(); + /// assert!(got.is_empty()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn splitn<'r, 'h, I: Into<Input<'h>>>( + &'r self, + input: I, + limit: usize, + ) -> SplitN<'r, 'h> { + SplitN { splits: self.split(input), limit } + } +} + +/// Lower level search routines that give more control. +impl Regex { + /// Returns the start and end offset of the leftmost match. If no match + /// exists, then `None` is returned. + /// + /// This is like [`Regex::find`] but, but it accepts a concrete `&Input` + /// instead of an `Into<Input>`. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{meta::Regex, Input, Match}; + /// + /// let re = Regex::new(r"Samwise|Sam")?; + /// let input = Input::new( + /// "one of the chief characters, Samwise the Brave", + /// ); + /// assert_eq!(Some(Match::must(0, 29..36)), re.search(&input)); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn search(&self, input: &Input<'_>) -> Option<Match> { + if self.imp.info.is_impossible(input) { + return None; + } + let mut guard = self.pool.get(); + let result = self.imp.strat.search(&mut guard, input); + // We do this dance with the guard and explicitly put it back in the + // pool because it seems to result in better codegen. If we let the + // guard's Drop impl put it back in the pool, then functions like + // ptr::drop_in_place get called and they *don't* get inlined. This + // isn't usually a big deal, but in latency sensitive benchmarks the + // extra function call can matter. + // + // I used `rebar measure -f '^grep/every-line$' -e meta` to measure + // the effects here. + // + // Note that this doesn't eliminate the latency effects of using the + // pool. There is still some (minor) cost for the "thread owner" of the + // pool. (i.e., The thread that first calls a regex search routine.) + // However, for other threads using the regex, the pool access can be + // quite expensive as it goes through a mutex. Callers can avoid this + // by either cloning the Regex (which creates a distinct copy of the + // pool), or callers can use the lower level APIs that accept a 'Cache' + // directly and do their own handling. + PoolGuard::put(guard); + result + } + + /// Returns the end offset of the leftmost match. If no match exists, then + /// `None` is returned. + /// + /// This is distinct from [`Regex::search`] in that it only returns the end + /// of a match and not the start of the match. Depending on a variety of + /// implementation details, this _may_ permit the regex engine to do less + /// overall work. For example, if a DFA is being used to execute a search, + /// then the start of a match usually requires running a separate DFA in + /// reverse to the find the start of a match. If one only needs the end of + /// a match, then the separate reverse scan to find the start of a match + /// can be skipped. (Note that the reverse scan is avoided even when using + /// `Regex::search` when possible, for example, in the case of an anchored + /// search.) + /// + /// # Example + /// + /// ``` + /// use regex_automata::{meta::Regex, Input, HalfMatch}; + /// + /// let re = Regex::new(r"Samwise|Sam")?; + /// let input = Input::new( + /// "one of the chief characters, Samwise the Brave", + /// ); + /// assert_eq!(Some(HalfMatch::must(0, 36)), re.search_half(&input)); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn search_half(&self, input: &Input<'_>) -> Option<HalfMatch> { + if self.imp.info.is_impossible(input) { + return None; + } + let mut guard = self.pool.get(); + let result = self.imp.strat.search_half(&mut guard, input); + // See 'Regex::search' for why we put the guard back explicitly. + PoolGuard::put(guard); + result + } + + /// Executes a leftmost forward search and writes the spans of capturing + /// groups that participated in a match into the provided [`Captures`] + /// value. If no match was found, then [`Captures::is_match`] is guaranteed + /// to return `false`. + /// + /// This is like [`Regex::captures`], but it accepts a concrete `&Input` + /// instead of an `Into<Input>`. + /// + /// # Example: specific pattern search + /// + /// This example shows how to build a multi-pattern `Regex` that permits + /// searching for specific patterns. + /// + /// ``` + /// use regex_automata::{ + /// meta::Regex, + /// Anchored, Match, PatternID, Input, + /// }; + /// + /// let re = Regex::new_many(&["[a-z0-9]{6}", "[a-z][a-z0-9]{5}"])?; + /// let mut caps = re.create_captures(); + /// let haystack = "foo123"; + /// + /// // Since we are using the default leftmost-first match and both + /// // patterns match at the same starting position, only the first pattern + /// // will be returned in this case when doing a search for any of the + /// // patterns. + /// let expected = Some(Match::must(0, 0..6)); + /// re.search_captures(&Input::new(haystack), &mut caps); + /// assert_eq!(expected, caps.get_match()); + /// + /// // But if we want to check whether some other pattern matches, then we + /// // can provide its pattern ID. + /// let expected = Some(Match::must(1, 0..6)); + /// let input = Input::new(haystack) + /// .anchored(Anchored::Pattern(PatternID::must(1))); + /// re.search_captures(&input, &mut caps); + /// assert_eq!(expected, caps.get_match()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// # Example: specifying the bounds of a search + /// + /// This example shows how providing the bounds of a search can produce + /// different results than simply sub-slicing the haystack. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{meta::Regex, Match, Input}; + /// + /// let re = Regex::new(r"\b[0-9]{3}\b")?; + /// let mut caps = re.create_captures(); + /// let haystack = "foo123bar"; + /// + /// // Since we sub-slice the haystack, the search doesn't know about + /// // the larger context and assumes that `123` is surrounded by word + /// // boundaries. And of course, the match position is reported relative + /// // to the sub-slice as well, which means we get `0..3` instead of + /// // `3..6`. + /// let expected = Some(Match::must(0, 0..3)); + /// let input = Input::new(&haystack[3..6]); + /// re.search_captures(&input, &mut caps); + /// assert_eq!(expected, caps.get_match()); + /// + /// // But if we provide the bounds of the search within the context of the + /// // entire haystack, then the search can take the surrounding context + /// // into account. (And if we did find a match, it would be reported + /// // as a valid offset into `haystack` instead of its sub-slice.) + /// let expected = None; + /// let input = Input::new(haystack).range(3..6); + /// re.search_captures(&input, &mut caps); + /// assert_eq!(expected, caps.get_match()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn search_captures(&self, input: &Input<'_>, caps: &mut Captures) { + caps.set_pattern(None); + let pid = self.search_slots(input, caps.slots_mut()); + caps.set_pattern(pid); + } + + /// Executes a leftmost forward search and writes the spans of capturing + /// groups that participated in a match into the provided `slots`, and + /// returns the matching pattern ID. The contents of the slots for patterns + /// other than the matching pattern are unspecified. If no match was found, + /// then `None` is returned and the contents of `slots` is unspecified. + /// + /// This is like [`Regex::search`], but it accepts a raw slots slice + /// instead of a `Captures` value. This is useful in contexts where you + /// don't want or need to allocate a `Captures`. + /// + /// It is legal to pass _any_ number of slots to this routine. If the regex + /// engine would otherwise write a slot offset that doesn't fit in the + /// provided slice, then it is simply skipped. In general though, there are + /// usually three slice lengths you might want to use: + /// + /// * An empty slice, if you only care about which pattern matched. + /// * A slice with [`pattern_len() * 2`](Regex::pattern_len) slots, if you + /// only care about the overall match spans for each matching pattern. + /// * A slice with + /// [`slot_len()`](crate::util::captures::GroupInfo::slot_len) slots, which + /// permits recording match offsets for every capturing group in every + /// pattern. + /// + /// # Example + /// + /// This example shows how to find the overall match offsets in a + /// multi-pattern search without allocating a `Captures` value. Indeed, we + /// can put our slots right on the stack. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{meta::Regex, PatternID, Input}; + /// + /// let re = Regex::new_many(&[ + /// r"\pL+", + /// r"\d+", + /// ])?; + /// let input = Input::new("!@#123"); + /// + /// // We only care about the overall match offsets here, so we just + /// // allocate two slots for each pattern. Each slot records the start + /// // and end of the match. + /// let mut slots = [None; 4]; + /// let pid = re.search_slots(&input, &mut slots); + /// assert_eq!(Some(PatternID::must(1)), pid); + /// + /// // The overall match offsets are always at 'pid * 2' and 'pid * 2 + 1'. + /// // See 'GroupInfo' for more details on the mapping between groups and + /// // slot indices. + /// let slot_start = pid.unwrap().as_usize() * 2; + /// let slot_end = slot_start + 1; + /// assert_eq!(Some(3), slots[slot_start].map(|s| s.get())); + /// assert_eq!(Some(6), slots[slot_end].map(|s| s.get())); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn search_slots( + &self, + input: &Input<'_>, + slots: &mut [Option<NonMaxUsize>], + ) -> Option<PatternID> { + if self.imp.info.is_impossible(input) { + return None; + } + let mut guard = self.pool.get(); + let result = self.imp.strat.search_slots(&mut guard, input, slots); + // See 'Regex::search' for why we put the guard back explicitly. + PoolGuard::put(guard); + result + } + + /// Writes the set of patterns that match anywhere in the given search + /// configuration to `patset`. If multiple patterns match at the same + /// position and this `Regex` was configured with [`MatchKind::All`] + /// semantics, then all matching patterns are written to the given set. + /// + /// Unless all of the patterns in this `Regex` are anchored, then generally + /// speaking, this will scan the entire haystack. + /// + /// This search routine *does not* clear the pattern set. This gives some + /// flexibility to the caller (e.g., running multiple searches with the + /// same pattern set), but does make the API bug-prone if you're reusing + /// the same pattern set for multiple searches but intended them to be + /// independent. + /// + /// If a pattern ID matched but the given `PatternSet` does not have + /// sufficient capacity to store it, then it is not inserted and silently + /// dropped. + /// + /// # Example + /// + /// This example shows how to find all matching patterns in a haystack, + /// even when some patterns match at the same position as other patterns. + /// It is important that we configure the `Regex` with [`MatchKind::All`] + /// semantics here, or else overlapping matches will not be reported. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{meta::Regex, Input, MatchKind, PatternSet}; + /// + /// let patterns = &[ + /// r"\w+", r"\d+", r"\pL+", r"foo", r"bar", r"barfoo", r"foobar", + /// ]; + /// let re = Regex::builder() + /// .configure(Regex::config().match_kind(MatchKind::All)) + /// .build_many(patterns)?; + /// + /// let input = Input::new("foobar"); + /// let mut patset = PatternSet::new(re.pattern_len()); + /// re.which_overlapping_matches(&input, &mut patset); + /// let expected = vec![0, 2, 3, 4, 6]; + /// let got: Vec<usize> = patset.iter().map(|p| p.as_usize()).collect(); + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn which_overlapping_matches( + &self, + input: &Input<'_>, + patset: &mut PatternSet, + ) { + if self.imp.info.is_impossible(input) { + return; + } + let mut guard = self.pool.get(); + let result = self + .imp + .strat + .which_overlapping_matches(&mut guard, input, patset); + // See 'Regex::search' for why we put the guard back explicitly. + PoolGuard::put(guard); + result + } +} + +/// Lower level search routines that give more control, and require the caller +/// to provide an explicit [`Cache`] parameter. +impl Regex { + /// This is like [`Regex::search`], but requires the caller to + /// explicitly pass a [`Cache`]. + /// + /// # Why pass a `Cache` explicitly? + /// + /// Passing a `Cache` explicitly will bypass the use of an internal memory + /// pool used by `Regex` to get a `Cache` for a search. The use of this + /// pool can be slower in some cases when a `Regex` is used from multiple + /// threads simultaneously. Typically, performance only becomes an issue + /// when there is heavy contention, which in turn usually only occurs + /// when each thread's primary unit of work is a regex search on a small + /// haystack. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{meta::Regex, Input, Match}; + /// + /// let re = Regex::new(r"Samwise|Sam")?; + /// let mut cache = re.create_cache(); + /// let input = Input::new( + /// "one of the chief characters, Samwise the Brave", + /// ); + /// assert_eq!( + /// Some(Match::must(0, 29..36)), + /// re.search_with(&mut cache, &input), + /// ); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn search_with( + &self, + cache: &mut Cache, + input: &Input<'_>, + ) -> Option<Match> { + if self.imp.info.is_impossible(input) { + return None; + } + self.imp.strat.search(cache, input) + } + + /// This is like [`Regex::search_half`], but requires the caller to + /// explicitly pass a [`Cache`]. + /// + /// # Why pass a `Cache` explicitly? + /// + /// Passing a `Cache` explicitly will bypass the use of an internal memory + /// pool used by `Regex` to get a `Cache` for a search. The use of this + /// pool can be slower in some cases when a `Regex` is used from multiple + /// threads simultaneously. Typically, performance only becomes an issue + /// when there is heavy contention, which in turn usually only occurs + /// when each thread's primary unit of work is a regex search on a small + /// haystack. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{meta::Regex, Input, HalfMatch}; + /// + /// let re = Regex::new(r"Samwise|Sam")?; + /// let mut cache = re.create_cache(); + /// let input = Input::new( + /// "one of the chief characters, Samwise the Brave", + /// ); + /// assert_eq!( + /// Some(HalfMatch::must(0, 36)), + /// re.search_half_with(&mut cache, &input), + /// ); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn search_half_with( + &self, + cache: &mut Cache, + input: &Input<'_>, + ) -> Option<HalfMatch> { + if self.imp.info.is_impossible(input) { + return None; + } + self.imp.strat.search_half(cache, input) + } + + /// This is like [`Regex::search_captures`], but requires the caller to + /// explicitly pass a [`Cache`]. + /// + /// # Why pass a `Cache` explicitly? + /// + /// Passing a `Cache` explicitly will bypass the use of an internal memory + /// pool used by `Regex` to get a `Cache` for a search. The use of this + /// pool can be slower in some cases when a `Regex` is used from multiple + /// threads simultaneously. Typically, performance only becomes an issue + /// when there is heavy contention, which in turn usually only occurs + /// when each thread's primary unit of work is a regex search on a small + /// haystack. + /// + /// # Example: specific pattern search + /// + /// This example shows how to build a multi-pattern `Regex` that permits + /// searching for specific patterns. + /// + /// ``` + /// use regex_automata::{ + /// meta::Regex, + /// Anchored, Match, PatternID, Input, + /// }; + /// + /// let re = Regex::new_many(&["[a-z0-9]{6}", "[a-z][a-z0-9]{5}"])?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// let haystack = "foo123"; + /// + /// // Since we are using the default leftmost-first match and both + /// // patterns match at the same starting position, only the first pattern + /// // will be returned in this case when doing a search for any of the + /// // patterns. + /// let expected = Some(Match::must(0, 0..6)); + /// re.search_captures_with(&mut cache, &Input::new(haystack), &mut caps); + /// assert_eq!(expected, caps.get_match()); + /// + /// // But if we want to check whether some other pattern matches, then we + /// // can provide its pattern ID. + /// let expected = Some(Match::must(1, 0..6)); + /// let input = Input::new(haystack) + /// .anchored(Anchored::Pattern(PatternID::must(1))); + /// re.search_captures_with(&mut cache, &input, &mut caps); + /// assert_eq!(expected, caps.get_match()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// # Example: specifying the bounds of a search + /// + /// This example shows how providing the bounds of a search can produce + /// different results than simply sub-slicing the haystack. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{meta::Regex, Match, Input}; + /// + /// let re = Regex::new(r"\b[0-9]{3}\b")?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// let haystack = "foo123bar"; + /// + /// // Since we sub-slice the haystack, the search doesn't know about + /// // the larger context and assumes that `123` is surrounded by word + /// // boundaries. And of course, the match position is reported relative + /// // to the sub-slice as well, which means we get `0..3` instead of + /// // `3..6`. + /// let expected = Some(Match::must(0, 0..3)); + /// let input = Input::new(&haystack[3..6]); + /// re.search_captures_with(&mut cache, &input, &mut caps); + /// assert_eq!(expected, caps.get_match()); + /// + /// // But if we provide the bounds of the search within the context of the + /// // entire haystack, then the search can take the surrounding context + /// // into account. (And if we did find a match, it would be reported + /// // as a valid offset into `haystack` instead of its sub-slice.) + /// let expected = None; + /// let input = Input::new(haystack).range(3..6); + /// re.search_captures_with(&mut cache, &input, &mut caps); + /// assert_eq!(expected, caps.get_match()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn search_captures_with( + &self, + cache: &mut Cache, + input: &Input<'_>, + caps: &mut Captures, + ) { + caps.set_pattern(None); + let pid = self.search_slots_with(cache, input, caps.slots_mut()); + caps.set_pattern(pid); + } + + /// This is like [`Regex::search_slots`], but requires the caller to + /// explicitly pass a [`Cache`]. + /// + /// # Why pass a `Cache` explicitly? + /// + /// Passing a `Cache` explicitly will bypass the use of an internal memory + /// pool used by `Regex` to get a `Cache` for a search. The use of this + /// pool can be slower in some cases when a `Regex` is used from multiple + /// threads simultaneously. Typically, performance only becomes an issue + /// when there is heavy contention, which in turn usually only occurs + /// when each thread's primary unit of work is a regex search on a small + /// haystack. + /// + /// # Example + /// + /// This example shows how to find the overall match offsets in a + /// multi-pattern search without allocating a `Captures` value. Indeed, we + /// can put our slots right on the stack. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{meta::Regex, PatternID, Input}; + /// + /// let re = Regex::new_many(&[ + /// r"\pL+", + /// r"\d+", + /// ])?; + /// let mut cache = re.create_cache(); + /// let input = Input::new("!@#123"); + /// + /// // We only care about the overall match offsets here, so we just + /// // allocate two slots for each pattern. Each slot records the start + /// // and end of the match. + /// let mut slots = [None; 4]; + /// let pid = re.search_slots_with(&mut cache, &input, &mut slots); + /// assert_eq!(Some(PatternID::must(1)), pid); + /// + /// // The overall match offsets are always at 'pid * 2' and 'pid * 2 + 1'. + /// // See 'GroupInfo' for more details on the mapping between groups and + /// // slot indices. + /// let slot_start = pid.unwrap().as_usize() * 2; + /// let slot_end = slot_start + 1; + /// assert_eq!(Some(3), slots[slot_start].map(|s| s.get())); + /// assert_eq!(Some(6), slots[slot_end].map(|s| s.get())); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn search_slots_with( + &self, + cache: &mut Cache, + input: &Input<'_>, + slots: &mut [Option<NonMaxUsize>], + ) -> Option<PatternID> { + if self.imp.info.is_impossible(input) { + return None; + } + self.imp.strat.search_slots(cache, input, slots) + } + + /// This is like [`Regex::which_overlapping_matches`], but requires the + /// caller to explicitly pass a [`Cache`]. + /// + /// Passing a `Cache` explicitly will bypass the use of an internal memory + /// pool used by `Regex` to get a `Cache` for a search. The use of this + /// pool can be slower in some cases when a `Regex` is used from multiple + /// threads simultaneously. Typically, performance only becomes an issue + /// when there is heavy contention, which in turn usually only occurs + /// when each thread's primary unit of work is a regex search on a small + /// haystack. + /// + /// # Why pass a `Cache` explicitly? + /// + /// # Example + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{meta::Regex, Input, MatchKind, PatternSet}; + /// + /// let patterns = &[ + /// r"\w+", r"\d+", r"\pL+", r"foo", r"bar", r"barfoo", r"foobar", + /// ]; + /// let re = Regex::builder() + /// .configure(Regex::config().match_kind(MatchKind::All)) + /// .build_many(patterns)?; + /// let mut cache = re.create_cache(); + /// + /// let input = Input::new("foobar"); + /// let mut patset = PatternSet::new(re.pattern_len()); + /// re.which_overlapping_matches_with(&mut cache, &input, &mut patset); + /// let expected = vec![0, 2, 3, 4, 6]; + /// let got: Vec<usize> = patset.iter().map(|p| p.as_usize()).collect(); + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn which_overlapping_matches_with( + &self, + cache: &mut Cache, + input: &Input<'_>, + patset: &mut PatternSet, + ) { + if self.imp.info.is_impossible(input) { + return; + } + self.imp.strat.which_overlapping_matches(cache, input, patset) + } +} + +/// Various non-search routines for querying properties of a `Regex` and +/// convenience routines for creating [`Captures`] and [`Cache`] values. +impl Regex { + /// Creates a new object for recording capture group offsets. This is used + /// in search APIs like [`Regex::captures`] and [`Regex::search_captures`]. + /// + /// This is a convenience routine for + /// `Captures::all(re.group_info().clone())`. Callers may build other types + /// of `Captures` values that record less information (and thus require + /// less work from the regex engine) using [`Captures::matches`] and + /// [`Captures::empty`]. + /// + /// # Example + /// + /// This shows some alternatives to [`Regex::create_captures`]: + /// + /// ``` + /// use regex_automata::{ + /// meta::Regex, + /// util::captures::Captures, + /// Match, PatternID, Span, + /// }; + /// + /// let re = Regex::new(r"(?<first>[A-Z][a-z]+) (?<last>[A-Z][a-z]+)")?; + /// + /// // This is equivalent to Regex::create_captures. It stores matching + /// // offsets for all groups in the regex. + /// let mut all = Captures::all(re.group_info().clone()); + /// re.captures("Bruce Springsteen", &mut all); + /// assert_eq!(Some(Match::must(0, 0..17)), all.get_match()); + /// assert_eq!(Some(Span::from(0..5)), all.get_group_by_name("first")); + /// assert_eq!(Some(Span::from(6..17)), all.get_group_by_name("last")); + /// + /// // In this version, we only care about the implicit groups, which + /// // means offsets for the explicit groups will be unavailable. It can + /// // sometimes be faster to ask for fewer groups, since the underlying + /// // regex engine needs to do less work to keep track of them. + /// let mut matches = Captures::matches(re.group_info().clone()); + /// re.captures("Bruce Springsteen", &mut matches); + /// // We still get the overall match info. + /// assert_eq!(Some(Match::must(0, 0..17)), matches.get_match()); + /// // But now the explicit groups are unavailable. + /// assert_eq!(None, matches.get_group_by_name("first")); + /// assert_eq!(None, matches.get_group_by_name("last")); + /// + /// // Finally, in this version, we don't ask to keep track of offsets for + /// // *any* groups. All we get back is whether a match occurred, and if + /// // so, the ID of the pattern that matched. + /// let mut empty = Captures::empty(re.group_info().clone()); + /// re.captures("Bruce Springsteen", &mut empty); + /// // it's a match! + /// assert!(empty.is_match()); + /// // for pattern ID 0 + /// assert_eq!(Some(PatternID::ZERO), empty.pattern()); + /// // Match offsets are unavailable. + /// assert_eq!(None, empty.get_match()); + /// // And of course, explicit groups are unavailable too. + /// assert_eq!(None, empty.get_group_by_name("first")); + /// assert_eq!(None, empty.get_group_by_name("last")); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn create_captures(&self) -> Captures { + Captures::all(self.group_info().clone()) + } + + /// Creates a new cache for use with lower level search APIs like + /// [`Regex::search_with`]. + /// + /// The cache returned should only be used for searches for this `Regex`. + /// If you want to reuse the cache for another `Regex`, then you must call + /// [`Cache::reset`] with that `Regex`. + /// + /// This is a convenience routine for [`Cache::new`]. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{meta::Regex, Input, Match}; + /// + /// let re = Regex::new(r"(?-u)m\w+\s+m\w+")?; + /// let mut cache = re.create_cache(); + /// let input = Input::new("crazy janey and her mission man"); + /// assert_eq!( + /// Some(Match::must(0, 20..31)), + /// re.search_with(&mut cache, &input), + /// ); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn create_cache(&self) -> Cache { + self.imp.strat.create_cache() + } + + /// Returns the total number of patterns in this regex. + /// + /// The standard [`Regex::new`] constructor always results in a `Regex` + /// with a single pattern, but [`Regex::new_many`] permits building a + /// multi-pattern regex. + /// + /// A `Regex` guarantees that the maximum possible `PatternID` returned in + /// any match is `Regex::pattern_len() - 1`. In the case where the number + /// of patterns is `0`, a match is impossible. + /// + /// # Example + /// + /// ``` + /// use regex_automata::meta::Regex; + /// + /// let re = Regex::new(r"(?m)^[a-z]$")?; + /// assert_eq!(1, re.pattern_len()); + /// + /// let re = Regex::new_many::<&str>(&[])?; + /// assert_eq!(0, re.pattern_len()); + /// + /// let re = Regex::new_many(&["a", "b", "c"])?; + /// assert_eq!(3, re.pattern_len()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn pattern_len(&self) -> usize { + self.imp.info.pattern_len() + } + + /// Returns the total number of capturing groups. + /// + /// This includes the implicit capturing group corresponding to the + /// entire match. Therefore, the minimum value returned is `1`. + /// + /// # Example + /// + /// This shows a few patterns and how many capture groups they have. + /// + /// ``` + /// use regex_automata::meta::Regex; + /// + /// let len = |pattern| { + /// Regex::new(pattern).map(|re| re.captures_len()) + /// }; + /// + /// assert_eq!(1, len("a")?); + /// assert_eq!(2, len("(a)")?); + /// assert_eq!(3, len("(a)|(b)")?); + /// assert_eq!(5, len("(a)(b)|(c)(d)")?); + /// assert_eq!(2, len("(a)|b")?); + /// assert_eq!(2, len("a|(b)")?); + /// assert_eq!(2, len("(b)*")?); + /// assert_eq!(2, len("(b)+")?); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// # Example: multiple patterns + /// + /// This routine also works for multiple patterns. The total number is + /// the sum of the capture groups of each pattern. + /// + /// ``` + /// use regex_automata::meta::Regex; + /// + /// let len = |patterns| { + /// Regex::new_many(patterns).map(|re| re.captures_len()) + /// }; + /// + /// assert_eq!(2, len(&["a", "b"])?); + /// assert_eq!(4, len(&["(a)", "(b)"])?); + /// assert_eq!(6, len(&["(a)|(b)", "(c)|(d)"])?); + /// assert_eq!(8, len(&["(a)(b)|(c)(d)", "(x)(y)"])?); + /// assert_eq!(3, len(&["(a)", "b"])?); + /// assert_eq!(3, len(&["a", "(b)"])?); + /// assert_eq!(4, len(&["(a)", "(b)*"])?); + /// assert_eq!(4, len(&["(a)+", "(b)+"])?); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn captures_len(&self) -> usize { + self.imp + .info + .props_union() + .explicit_captures_len() + .saturating_add(self.pattern_len()) + } + + /// Returns the total number of capturing groups that appear in every + /// possible match. + /// + /// If the number of capture groups can vary depending on the match, then + /// this returns `None`. That is, a value is only returned when the number + /// of matching groups is invariant or "static." + /// + /// Note that like [`Regex::captures_len`], this **does** include the + /// implicit capturing group corresponding to the entire match. Therefore, + /// when a non-None value is returned, it is guaranteed to be at least `1`. + /// Stated differently, a return value of `Some(0)` is impossible. + /// + /// # Example + /// + /// This shows a few cases where a static number of capture groups is + /// available and a few cases where it is not. + /// + /// ``` + /// use regex_automata::meta::Regex; + /// + /// let len = |pattern| { + /// Regex::new(pattern).map(|re| re.static_captures_len()) + /// }; + /// + /// assert_eq!(Some(1), len("a")?); + /// assert_eq!(Some(2), len("(a)")?); + /// assert_eq!(Some(2), len("(a)|(b)")?); + /// assert_eq!(Some(3), len("(a)(b)|(c)(d)")?); + /// assert_eq!(None, len("(a)|b")?); + /// assert_eq!(None, len("a|(b)")?); + /// assert_eq!(None, len("(b)*")?); + /// assert_eq!(Some(2), len("(b)+")?); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// # Example: multiple patterns + /// + /// This property extends to regexes with multiple patterns as well. In + /// order for their to be a static number of capture groups in this case, + /// every pattern must have the same static number. + /// + /// ``` + /// use regex_automata::meta::Regex; + /// + /// let len = |patterns| { + /// Regex::new_many(patterns).map(|re| re.static_captures_len()) + /// }; + /// + /// assert_eq!(Some(1), len(&["a", "b"])?); + /// assert_eq!(Some(2), len(&["(a)", "(b)"])?); + /// assert_eq!(Some(2), len(&["(a)|(b)", "(c)|(d)"])?); + /// assert_eq!(Some(3), len(&["(a)(b)|(c)(d)", "(x)(y)"])?); + /// assert_eq!(None, len(&["(a)", "b"])?); + /// assert_eq!(None, len(&["a", "(b)"])?); + /// assert_eq!(None, len(&["(a)", "(b)*"])?); + /// assert_eq!(Some(2), len(&["(a)+", "(b)+"])?); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn static_captures_len(&self) -> Option<usize> { + self.imp + .info + .props_union() + .static_explicit_captures_len() + .map(|len| len.saturating_add(1)) + } + + /// Return information about the capture groups in this `Regex`. + /// + /// A `GroupInfo` is an immutable object that can be cheaply cloned. It + /// is responsible for maintaining a mapping between the capture groups + /// in the concrete syntax of zero or more regex patterns and their + /// internal representation used by some of the regex matchers. It is also + /// responsible for maintaining a mapping between the name of each group + /// (if one exists) and its corresponding group index. + /// + /// A `GroupInfo` is ultimately what is used to build a [`Captures`] value, + /// which is some mutable space where group offsets are stored as a result + /// of a search. + /// + /// # Example + /// + /// This shows some alternatives to [`Regex::create_captures`]: + /// + /// ``` + /// use regex_automata::{ + /// meta::Regex, + /// util::captures::Captures, + /// Match, PatternID, Span, + /// }; + /// + /// let re = Regex::new(r"(?<first>[A-Z][a-z]+) (?<last>[A-Z][a-z]+)")?; + /// + /// // This is equivalent to Regex::create_captures. It stores matching + /// // offsets for all groups in the regex. + /// let mut all = Captures::all(re.group_info().clone()); + /// re.captures("Bruce Springsteen", &mut all); + /// assert_eq!(Some(Match::must(0, 0..17)), all.get_match()); + /// assert_eq!(Some(Span::from(0..5)), all.get_group_by_name("first")); + /// assert_eq!(Some(Span::from(6..17)), all.get_group_by_name("last")); + /// + /// // In this version, we only care about the implicit groups, which + /// // means offsets for the explicit groups will be unavailable. It can + /// // sometimes be faster to ask for fewer groups, since the underlying + /// // regex engine needs to do less work to keep track of them. + /// let mut matches = Captures::matches(re.group_info().clone()); + /// re.captures("Bruce Springsteen", &mut matches); + /// // We still get the overall match info. + /// assert_eq!(Some(Match::must(0, 0..17)), matches.get_match()); + /// // But now the explicit groups are unavailable. + /// assert_eq!(None, matches.get_group_by_name("first")); + /// assert_eq!(None, matches.get_group_by_name("last")); + /// + /// // Finally, in this version, we don't ask to keep track of offsets for + /// // *any* groups. All we get back is whether a match occurred, and if + /// // so, the ID of the pattern that matched. + /// let mut empty = Captures::empty(re.group_info().clone()); + /// re.captures("Bruce Springsteen", &mut empty); + /// // it's a match! + /// assert!(empty.is_match()); + /// // for pattern ID 0 + /// assert_eq!(Some(PatternID::ZERO), empty.pattern()); + /// // Match offsets are unavailable. + /// assert_eq!(None, empty.get_match()); + /// // And of course, explicit groups are unavailable too. + /// assert_eq!(None, empty.get_group_by_name("first")); + /// assert_eq!(None, empty.get_group_by_name("last")); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn group_info(&self) -> &GroupInfo { + self.imp.strat.group_info() + } + + /// Returns the configuration object used to build this `Regex`. + /// + /// If no configuration object was explicitly passed, then the + /// configuration returned represents the default. + #[inline] + pub fn get_config(&self) -> &Config { + self.imp.info.config() + } + + /// Returns true if this regex has a high chance of being "accelerated." + /// + /// The precise meaning of "accelerated" is specifically left unspecified, + /// but the general meaning is that the search is a high likelihood of + /// running faster than than a character-at-a-time loop inside a standard + /// regex engine. + /// + /// When a regex is accelerated, it is only a *probabilistic* claim. That + /// is, just because the regex is believed to be accelerated, that doesn't + /// mean it will definitely execute searches very fast. Similarly, if a + /// regex is *not* accelerated, that is also a probabilistic claim. That + /// is, a regex for which `is_accelerated` returns `false` could still run + /// searches more quickly than a regex for which `is_accelerated` returns + /// `true`. + /// + /// Whether a regex is marked as accelerated or not is dependent on + /// implementations details that may change in a semver compatible release. + /// That is, a regex that is accelerated in a `x.y.1` release might not be + /// accelerated in a `x.y.2` release. + /// + /// Basically, the value of acceleration boils down to a hedge: a hodge + /// podge of internal heuristics combine to make a probabilistic guess + /// that this regex search may run "fast." The value in knowing this from + /// a caller's perspective is that it may act as a signal that no further + /// work should be done to accelerate a search. For example, a grep-like + /// tool might try to do some extra work extracting literals from a regex + /// to create its own heuristic acceleration strategies. But it might + /// choose to defer to this crate's acceleration strategy if one exists. + /// This routine permits querying whether such a strategy is active for a + /// particular regex. + /// + /// # Example + /// + /// ``` + /// use regex_automata::meta::Regex; + /// + /// // A simple literal is very likely to be accelerated. + /// let re = Regex::new(r"foo")?; + /// assert!(re.is_accelerated()); + /// + /// // A regex with no literals is likely to not be accelerated. + /// let re = Regex::new(r"\w")?; + /// assert!(!re.is_accelerated()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn is_accelerated(&self) -> bool { + self.imp.strat.is_accelerated() + } + + /// Return the total approximate heap memory, in bytes, used by this `Regex`. + /// + /// Note that currently, there is no high level configuration for setting + /// a limit on the specific value returned by this routine. Instead, the + /// following routines can be used to control heap memory at a bit of a + /// lower level: + /// + /// * [`Config::nfa_size_limit`] controls how big _any_ of the NFAs are + /// allowed to be. + /// * [`Config::onepass_size_limit`] controls how big the one-pass DFA is + /// allowed to be. + /// * [`Config::hybrid_cache_capacity`] controls how much memory the lazy + /// DFA is permitted to allocate to store its transition table. + /// * [`Config::dfa_size_limit`] controls how big a fully compiled DFA is + /// allowed to be. + /// * [`Config::dfa_state_limit`] controls the conditions under which the + /// meta regex engine will even attempt to build a fully compiled DFA. + #[inline] + pub fn memory_usage(&self) -> usize { + self.imp.strat.memory_usage() + } +} + +impl Clone for Regex { + fn clone(&self) -> Regex { + let imp = Arc::clone(&self.imp); + let pool = { + let strat = Arc::clone(&imp.strat); + let create: CachePoolFn = Box::new(move || strat.create_cache()); + Pool::new(create) + }; + Regex { imp, pool } + } +} + +#[derive(Clone, Debug)] +pub(crate) struct RegexInfo(Arc<RegexInfoI>); + +#[derive(Clone, Debug)] +struct RegexInfoI { + config: Config, + props: Vec<hir::Properties>, + props_union: hir::Properties, +} + +impl RegexInfo { + fn new(config: Config, hirs: &[&Hir]) -> RegexInfo { + // Collect all of the properties from each of the HIRs, and also + // union them into one big set of properties representing all HIRs + // as if they were in one big alternation. + let mut props = vec![]; + for hir in hirs.iter() { + props.push(hir.properties().clone()); + } + let props_union = hir::Properties::union(&props); + + RegexInfo(Arc::new(RegexInfoI { config, props, props_union })) + } + + pub(crate) fn config(&self) -> &Config { + &self.0.config + } + + pub(crate) fn props(&self) -> &[hir::Properties] { + &self.0.props + } + + pub(crate) fn props_union(&self) -> &hir::Properties { + &self.0.props_union + } + + pub(crate) fn pattern_len(&self) -> usize { + self.props().len() + } + + pub(crate) fn memory_usage(&self) -> usize { + self.props().iter().map(|p| p.memory_usage()).sum::<usize>() + + self.props_union().memory_usage() + } + + /// Returns true when the search is guaranteed to be anchored. That is, + /// when a match is reported, its offset is guaranteed to correspond to + /// the start of the search. + /// + /// This includes returning true when `input` _isn't_ anchored but the + /// underlying regex is. + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn is_anchored_start(&self, input: &Input<'_>) -> bool { + input.get_anchored().is_anchored() || self.is_always_anchored_start() + } + + /// Returns true when this regex is always anchored to the start of a + /// search. And in particular, that regardless of an `Input` configuration, + /// if any match is reported it must start at `0`. + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn is_always_anchored_start(&self) -> bool { + use regex_syntax::hir::Look; + self.props_union().look_set_prefix().contains(Look::Start) + } + + /// Returns true when this regex is always anchored to the end of a + /// search. And in particular, that regardless of an `Input` configuration, + /// if any match is reported it must end at the end of the haystack. + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn is_always_anchored_end(&self) -> bool { + use regex_syntax::hir::Look; + self.props_union().look_set_suffix().contains(Look::End) + } + + /// Returns true if and only if it is known that a match is impossible + /// for the given input. This is useful for short-circuiting and avoiding + /// running the regex engine if it's known no match can be reported. + /// + /// Note that this doesn't necessarily detect every possible case. For + /// example, when `pattern_len() == 0`, a match is impossible, but that + /// case is so rare that it's fine to be handled by the regex engine + /// itself. That is, it's not worth the cost of adding it here in order to + /// make it a little faster. The reason is that this is called for every + /// search. so there is some cost to adding checks here. Arguably, some of + /// the checks that are here already probably shouldn't be here... + #[cfg_attr(feature = "perf-inline", inline(always))] + fn is_impossible(&self, input: &Input<'_>) -> bool { + // The underlying regex is anchored, so if we don't start the search + // at position 0, a match is impossible, because the anchor can only + // match at position 0. + if input.start() > 0 && self.is_always_anchored_start() { + return true; + } + // Same idea, but for the end anchor. + if input.end() < input.haystack().len() + && self.is_always_anchored_end() + { + return true; + } + // If the haystack is smaller than the minimum length required, then + // we know there can be no match. + let minlen = match self.props_union().minimum_len() { + None => return false, + Some(minlen) => minlen, + }; + if input.get_span().len() < minlen { + return true; + } + // Same idea as minimum, but for maximum. This is trickier. We can + // only apply the maximum when we know the entire span that we're + // searching *has* to match according to the regex (and possibly the + // input configuration). If we know there is too much for the regex + // to match, we can bail early. + // + // I don't think we can apply the maximum otherwise unfortunately. + if self.is_anchored_start(input) && self.is_always_anchored_end() { + let maxlen = match self.props_union().maximum_len() { + None => return false, + Some(maxlen) => maxlen, + }; + if input.get_span().len() > maxlen { + return true; + } + } + false + } +} + +/// An iterator over all non-overlapping matches. +/// +/// The iterator yields a [`Match`] value until no more matches could be found. +/// +/// The lifetime parameters are as follows: +/// +/// * `'r` represents the lifetime of the `Regex` that produced this iterator. +/// * `'h` represents the lifetime of the haystack being searched. +/// +/// This iterator can be created with the [`Regex::find_iter`] method. +#[derive(Debug)] +pub struct FindMatches<'r, 'h> { + re: &'r Regex, + cache: CachePoolGuard<'r>, + it: iter::Searcher<'h>, +} + +impl<'r, 'h> FindMatches<'r, 'h> { + /// Returns the `Regex` value that created this iterator. + #[inline] + pub fn regex(&self) -> &'r Regex { + self.re + } + + /// Returns the current `Input` associated with this iterator. + /// + /// The `start` position on the given `Input` may change during iteration, + /// but all other values are guaranteed to remain invariant. + #[inline] + pub fn input<'s>(&'s self) -> &'s Input<'h> { + self.it.input() + } +} + +impl<'r, 'h> Iterator for FindMatches<'r, 'h> { + type Item = Match; + + #[inline] + fn next(&mut self) -> Option<Match> { + let FindMatches { re, ref mut cache, ref mut it } = *self; + it.advance(|input| Ok(re.search_with(cache, input))) + } + + #[inline] + fn count(self) -> usize { + // If all we care about is a count of matches, then we only need to + // find the end position of each match. This can give us a 2x perf + // boost in some cases, because it avoids needing to do a reverse scan + // to find the start of a match. + let FindMatches { re, mut cache, it } = self; + // This does the deref for PoolGuard once instead of every iter. + let cache = &mut *cache; + it.into_half_matches_iter( + |input| Ok(re.search_half_with(cache, input)), + ) + .count() + } +} + +impl<'r, 'h> core::iter::FusedIterator for FindMatches<'r, 'h> {} + +/// An iterator over all non-overlapping leftmost matches with their capturing +/// groups. +/// +/// The iterator yields a [`Captures`] value until no more matches could be +/// found. +/// +/// The lifetime parameters are as follows: +/// +/// * `'r` represents the lifetime of the `Regex` that produced this iterator. +/// * `'h` represents the lifetime of the haystack being searched. +/// +/// This iterator can be created with the [`Regex::captures_iter`] method. +#[derive(Debug)] +pub struct CapturesMatches<'r, 'h> { + re: &'r Regex, + cache: CachePoolGuard<'r>, + caps: Captures, + it: iter::Searcher<'h>, +} + +impl<'r, 'h> CapturesMatches<'r, 'h> { + /// Returns the `Regex` value that created this iterator. + #[inline] + pub fn regex(&self) -> &'r Regex { + self.re + } + + /// Returns the current `Input` associated with this iterator. + /// + /// The `start` position on the given `Input` may change during iteration, + /// but all other values are guaranteed to remain invariant. + #[inline] + pub fn input<'s>(&'s self) -> &'s Input<'h> { + self.it.input() + } +} + +impl<'r, 'h> Iterator for CapturesMatches<'r, 'h> { + type Item = Captures; + + #[inline] + fn next(&mut self) -> Option<Captures> { + // Splitting 'self' apart seems necessary to appease borrowck. + let CapturesMatches { re, ref mut cache, ref mut caps, ref mut it } = + *self; + let _ = it.advance(|input| { + re.search_captures_with(cache, input, caps); + Ok(caps.get_match()) + }); + if caps.is_match() { + Some(caps.clone()) + } else { + None + } + } + + #[inline] + fn count(self) -> usize { + let CapturesMatches { re, mut cache, it, .. } = self; + // This does the deref for PoolGuard once instead of every iter. + let cache = &mut *cache; + it.into_half_matches_iter( + |input| Ok(re.search_half_with(cache, input)), + ) + .count() + } +} + +impl<'r, 'h> core::iter::FusedIterator for CapturesMatches<'r, 'h> {} + +/// Yields all substrings delimited by a regular expression match. +/// +/// The spans correspond to the offsets between matches. +/// +/// The lifetime parameters are as follows: +/// +/// * `'r` represents the lifetime of the `Regex` that produced this iterator. +/// * `'h` represents the lifetime of the haystack being searched. +/// +/// This iterator can be created with the [`Regex::split`] method. +#[derive(Debug)] +pub struct Split<'r, 'h> { + finder: FindMatches<'r, 'h>, + last: usize, +} + +impl<'r, 'h> Split<'r, 'h> { + /// Returns the current `Input` associated with this iterator. + /// + /// The `start` position on the given `Input` may change during iteration, + /// but all other values are guaranteed to remain invariant. + #[inline] + pub fn input<'s>(&'s self) -> &'s Input<'h> { + self.finder.input() + } +} + +impl<'r, 'h> Iterator for Split<'r, 'h> { + type Item = Span; + + fn next(&mut self) -> Option<Span> { + match self.finder.next() { + None => { + let len = self.finder.it.input().haystack().len(); + if self.last > len { + None + } else { + let span = Span::from(self.last..len); + self.last = len + 1; // Next call will return None + Some(span) + } + } + Some(m) => { + let span = Span::from(self.last..m.start()); + self.last = m.end(); + Some(span) + } + } + } +} + +impl<'r, 'h> core::iter::FusedIterator for Split<'r, 'h> {} + +/// Yields at most `N` spans delimited by a regular expression match. +/// +/// The spans correspond to the offsets between matches. The last span will be +/// whatever remains after splitting. +/// +/// The lifetime parameters are as follows: +/// +/// * `'r` represents the lifetime of the `Regex` that produced this iterator. +/// * `'h` represents the lifetime of the haystack being searched. +/// +/// This iterator can be created with the [`Regex::splitn`] method. +#[derive(Debug)] +pub struct SplitN<'r, 'h> { + splits: Split<'r, 'h>, + limit: usize, +} + +impl<'r, 'h> SplitN<'r, 'h> { + /// Returns the current `Input` associated with this iterator. + /// + /// The `start` position on the given `Input` may change during iteration, + /// but all other values are guaranteed to remain invariant. + #[inline] + pub fn input<'s>(&'s self) -> &'s Input<'h> { + self.splits.input() + } +} + +impl<'r, 'h> Iterator for SplitN<'r, 'h> { + type Item = Span; + + fn next(&mut self) -> Option<Span> { + if self.limit == 0 { + return None; + } + + self.limit -= 1; + if self.limit > 0 { + return self.splits.next(); + } + + let len = self.splits.finder.it.input().haystack().len(); + if self.splits.last > len { + // We've already returned all substrings. + None + } else { + // self.n == 0, so future calls will return None immediately + Some(Span::from(self.splits.last..len)) + } + } + + fn size_hint(&self) -> (usize, Option<usize>) { + (0, Some(self.limit)) + } +} + +impl<'r, 'h> core::iter::FusedIterator for SplitN<'r, 'h> {} + +/// Represents mutable scratch space used by regex engines during a search. +/// +/// Most of the regex engines in this crate require some kind of +/// mutable state in order to execute a search. This mutable state is +/// explicitly separated from the the core regex object (such as a +/// [`thompson::NFA`](crate::nfa::thompson::NFA)) so that the read-only regex +/// object can be shared across multiple threads simultaneously without any +/// synchronization. Conversely, a `Cache` must either be duplicated if using +/// the same `Regex` from multiple threads, or else there must be some kind of +/// synchronization that guarantees exclusive access while it's in use by one +/// thread. +/// +/// A `Regex` attempts to do this synchronization for you by using a thread +/// pool internally. Its size scales roughly with the number of simultaneous +/// regex searches. +/// +/// For cases where one does not want to rely on a `Regex`'s internal thread +/// pool, lower level routines such as [`Regex::search_with`] are provided +/// that permit callers to pass a `Cache` into the search routine explicitly. +/// +/// General advice is that the thread pool is often more than good enough. +/// However, it may be possible to observe the effects of its latency, +/// especially when searching many small haystacks from many threads +/// simultaneously. +/// +/// Caches can be created from their corresponding `Regex` via +/// [`Regex::create_cache`]. A cache can only be used with either the `Regex` +/// that created it, or the `Regex` that was most recently used to reset it +/// with [`Cache::reset`]. Using a cache with any other `Regex` may result in +/// panics or incorrect results. +/// +/// # Example +/// +/// ``` +/// use regex_automata::{meta::Regex, Input, Match}; +/// +/// let re = Regex::new(r"(?-u)m\w+\s+m\w+")?; +/// let mut cache = re.create_cache(); +/// let input = Input::new("crazy janey and her mission man"); +/// assert_eq!( +/// Some(Match::must(0, 20..31)), +/// re.search_with(&mut cache, &input), +/// ); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +#[derive(Debug, Clone)] +pub struct Cache { + pub(crate) capmatches: Captures, + pub(crate) pikevm: wrappers::PikeVMCache, + pub(crate) backtrack: wrappers::BoundedBacktrackerCache, + pub(crate) onepass: wrappers::OnePassCache, + pub(crate) hybrid: wrappers::HybridCache, + pub(crate) revhybrid: wrappers::ReverseHybridCache, +} + +impl Cache { + /// Creates a new `Cache` for use with this regex. + /// + /// The cache returned should only be used for searches for the given + /// `Regex`. If you want to reuse the cache for another `Regex`, then you + /// must call [`Cache::reset`] with that `Regex`. + pub fn new(re: &Regex) -> Cache { + re.create_cache() + } + + /// Reset this cache such that it can be used for searching with the given + /// `Regex` (and only that `Regex`). + /// + /// A cache reset permits potentially reusing memory already allocated in + /// this cache with a different `Regex`. + /// + /// # Example + /// + /// This shows how to re-purpose a cache for use with a different `Regex`. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{meta::Regex, Match, Input}; + /// + /// let re1 = Regex::new(r"\w")?; + /// let re2 = Regex::new(r"\W")?; + /// + /// let mut cache = re1.create_cache(); + /// assert_eq!( + /// Some(Match::must(0, 0..2)), + /// re1.search_with(&mut cache, &Input::new("Δ")), + /// ); + /// + /// // Using 'cache' with re2 is not allowed. It may result in panics or + /// // incorrect results. In order to re-purpose the cache, we must reset + /// // it with the Regex we'd like to use it with. + /// // + /// // Similarly, after this reset, using the cache with 're1' is also not + /// // allowed. + /// cache.reset(&re2); + /// assert_eq!( + /// Some(Match::must(0, 0..3)), + /// re2.search_with(&mut cache, &Input::new("☃")), + /// ); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn reset(&mut self, re: &Regex) { + re.imp.strat.reset_cache(self) + } + + /// Returns the heap memory usage, in bytes, of this cache. + /// + /// This does **not** include the stack size used up by this cache. To + /// compute that, use `std::mem::size_of::<Cache>()`. + pub fn memory_usage(&self) -> usize { + let mut bytes = 0; + bytes += self.pikevm.memory_usage(); + bytes += self.backtrack.memory_usage(); + bytes += self.onepass.memory_usage(); + bytes += self.hybrid.memory_usage(); + bytes += self.revhybrid.memory_usage(); + bytes + } +} + +/// An object describing the configuration of a `Regex`. +/// +/// This configuration only includes options for the +/// non-syntax behavior of a `Regex`, and can be applied via the +/// [`Builder::configure`] method. For configuring the syntax options, see +/// [`util::syntax::Config`](crate::util::syntax::Config). +/// +/// # Example: lower the NFA size limit +/// +/// In some cases, the default size limit might be too big. The size limit can +/// be lowered, which will prevent large regex patterns from compiling. +/// +/// ``` +/// # if cfg!(miri) { return Ok(()); } // miri takes too long +/// use regex_automata::meta::Regex; +/// +/// let result = Regex::builder() +/// .configure(Regex::config().nfa_size_limit(Some(20 * (1<<10)))) +/// // Not even 20KB is enough to build a single large Unicode class! +/// .build(r"\pL"); +/// assert!(result.is_err()); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +#[derive(Clone, Debug, Default)] +pub struct Config { + // As with other configuration types in this crate, we put all our knobs + // in options so that we can distinguish between "default" and "not set." + // This makes it possible to easily combine multiple configurations + // without default values overwriting explicitly specified values. See the + // 'overwrite' method. + // + // For docs on the fields below, see the corresponding method setters. + match_kind: Option<MatchKind>, + utf8_empty: Option<bool>, + autopre: Option<bool>, + pre: Option<Option<Prefilter>>, + which_captures: Option<WhichCaptures>, + nfa_size_limit: Option<Option<usize>>, + onepass_size_limit: Option<Option<usize>>, + hybrid_cache_capacity: Option<usize>, + hybrid: Option<bool>, + dfa: Option<bool>, + dfa_size_limit: Option<Option<usize>>, + dfa_state_limit: Option<Option<usize>>, + onepass: Option<bool>, + backtrack: Option<bool>, + byte_classes: Option<bool>, + line_terminator: Option<u8>, +} + +impl Config { + /// Create a new configuration object for a `Regex`. + pub fn new() -> Config { + Config::default() + } + + /// Set the match semantics for a `Regex`. + /// + /// The default value is [`MatchKind::LeftmostFirst`]. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{meta::Regex, Match, MatchKind}; + /// + /// // By default, leftmost-first semantics are used, which + /// // disambiguates matches at the same position by selecting + /// // the one that corresponds earlier in the pattern. + /// let re = Regex::new("sam|samwise")?; + /// assert_eq!(Some(Match::must(0, 0..3)), re.find("samwise")); + /// + /// // But with 'all' semantics, match priority is ignored + /// // and all match states are included. When coupled with + /// // a leftmost search, the search will report the last + /// // possible match. + /// let re = Regex::builder() + /// .configure(Regex::config().match_kind(MatchKind::All)) + /// .build("sam|samwise")?; + /// assert_eq!(Some(Match::must(0, 0..7)), re.find("samwise")); + /// // Beware that this can lead to skipping matches! + /// // Usually 'all' is used for anchored reverse searches + /// // only, or for overlapping searches. + /// assert_eq!(Some(Match::must(0, 4..11)), re.find("sam samwise")); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn match_kind(self, kind: MatchKind) -> Config { + Config { match_kind: Some(kind), ..self } + } + + /// Toggles whether empty matches are permitted to occur between the code + /// units of a UTF-8 encoded codepoint. + /// + /// This should generally be enabled when search a `&str` or anything that + /// you otherwise know is valid UTF-8. It should be disabled in all other + /// cases. Namely, if the haystack is not valid UTF-8 and this is enabled, + /// then behavior is unspecified. + /// + /// By default, this is enabled. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{meta::Regex, Match}; + /// + /// let re = Regex::new("")?; + /// let got: Vec<Match> = re.find_iter("☃").collect(); + /// // Matches only occur at the beginning and end of the snowman. + /// assert_eq!(got, vec![ + /// Match::must(0, 0..0), + /// Match::must(0, 3..3), + /// ]); + /// + /// let re = Regex::builder() + /// .configure(Regex::config().utf8_empty(false)) + /// .build("")?; + /// let got: Vec<Match> = re.find_iter("☃").collect(); + /// // Matches now occur at every position! + /// assert_eq!(got, vec![ + /// Match::must(0, 0..0), + /// Match::must(0, 1..1), + /// Match::must(0, 2..2), + /// Match::must(0, 3..3), + /// ]); + /// + /// Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn utf8_empty(self, yes: bool) -> Config { + Config { utf8_empty: Some(yes), ..self } + } + + /// Toggles whether automatic prefilter support is enabled. + /// + /// If this is disabled and [`Config::prefilter`] is not set, then the + /// meta regex engine will not use any prefilters. This can sometimes + /// be beneficial in cases where you know (or have measured) that the + /// prefilter leads to overall worse search performance. + /// + /// By default, this is enabled. + /// + /// # Example + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{meta::Regex, Match}; + /// + /// let re = Regex::builder() + /// .configure(Regex::config().auto_prefilter(false)) + /// .build(r"Bruce \w+")?; + /// let hay = "Hello Bruce Springsteen!"; + /// assert_eq!(Some(Match::must(0, 6..23)), re.find(hay)); + /// + /// Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn auto_prefilter(self, yes: bool) -> Config { + Config { autopre: Some(yes), ..self } + } + + /// Overrides and sets the prefilter to use inside a `Regex`. + /// + /// This permits one to forcefully set a prefilter in cases where the + /// caller knows better than whatever the automatic prefilter logic is + /// capable of. + /// + /// By default, this is set to `None` and an automatic prefilter will be + /// used if one could be built. (Assuming [`Config::auto_prefilter`] is + /// enabled, which it is by default.) + /// + /// # Example + /// + /// This example shows how to set your own prefilter. In the case of a + /// pattern like `Bruce \w+`, the automatic prefilter is likely to be + /// constructed in a way that it will look for occurrences of `Bruce `. + /// In most cases, this is the best choice. But in some cases, it may be + /// the case that running `memchr` on `B` is the best choice. One can + /// achieve that behavior by overriding the automatic prefilter logic + /// and providing a prefilter that just matches `B`. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{ + /// meta::Regex, + /// util::prefilter::Prefilter, + /// Match, MatchKind, + /// }; + /// + /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["B"]) + /// .expect("a prefilter"); + /// let re = Regex::builder() + /// .configure(Regex::config().prefilter(Some(pre))) + /// .build(r"Bruce \w+")?; + /// let hay = "Hello Bruce Springsteen!"; + /// assert_eq!(Some(Match::must(0, 6..23)), re.find(hay)); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// # Example: incorrect prefilters can lead to incorrect results! + /// + /// Be warned that setting an incorrect prefilter can lead to missed + /// matches. So if you use this option, ensure your prefilter can _never_ + /// report false negatives. (A false positive is, on the other hand, quite + /// okay and generally unavoidable.) + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{ + /// meta::Regex, + /// util::prefilter::Prefilter, + /// Match, MatchKind, + /// }; + /// + /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["Z"]) + /// .expect("a prefilter"); + /// let re = Regex::builder() + /// .configure(Regex::config().prefilter(Some(pre))) + /// .build(r"Bruce \w+")?; + /// let hay = "Hello Bruce Springsteen!"; + /// // Oops! No match found, but there should be one! + /// assert_eq!(None, re.find(hay)); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn prefilter(self, pre: Option<Prefilter>) -> Config { + Config { pre: Some(pre), ..self } + } + + /// Configures what kinds of groups are compiled as "capturing" in the + /// underlying regex engine. + /// + /// This is set to [`WhichCaptures::All`] by default. Callers may wish to + /// use [`WhichCaptures::Implicit`] in cases where one wants avoid the + /// overhead of capture states for explicit groups. + /// + /// Note that another approach to avoiding the overhead of capture groups + /// is by using non-capturing groups in the regex pattern. That is, + /// `(?:a)` instead of `(a)`. This option is useful when you can't control + /// the concrete syntax but know that you don't need the underlying capture + /// states. For example, using `WhichCaptures::Implicit` will behave as if + /// all explicit capturing groups in the pattern were non-capturing. + /// + /// Setting this to `WhichCaptures::None` is usually not the right thing to + /// do. When no capture states are compiled, some regex engines (such as + /// the `PikeVM`) won't be able to report match offsets. This will manifest + /// as no match being found. + /// + /// # Example + /// + /// This example demonstrates how the results of capture groups can change + /// based on this option. First we show the default (all capture groups in + /// the pattern are capturing): + /// + /// ``` + /// use regex_automata::{meta::Regex, Match, Span}; + /// + /// let re = Regex::new(r"foo([0-9]+)bar")?; + /// let hay = "foo123bar"; + /// + /// let mut caps = re.create_captures(); + /// re.captures(hay, &mut caps); + /// assert_eq!(Some(Span::from(0..9)), caps.get_group(0)); + /// assert_eq!(Some(Span::from(3..6)), caps.get_group(1)); + /// + /// Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// And now we show the behavior when we only include implicit capture + /// groups. In this case, we can only find the overall match span, but the + /// spans of any other explicit group don't exist because they are treated + /// as non-capturing. (In effect, when `WhichCaptures::Implicit` is used, + /// there is no real point in using [`Regex::captures`] since it will never + /// be able to report more information than [`Regex::find`].) + /// + /// ``` + /// use regex_automata::{ + /// meta::Regex, + /// nfa::thompson::WhichCaptures, + /// Match, + /// Span, + /// }; + /// + /// let re = Regex::builder() + /// .configure(Regex::config().which_captures(WhichCaptures::Implicit)) + /// .build(r"foo([0-9]+)bar")?; + /// let hay = "foo123bar"; + /// + /// let mut caps = re.create_captures(); + /// re.captures(hay, &mut caps); + /// assert_eq!(Some(Span::from(0..9)), caps.get_group(0)); + /// assert_eq!(None, caps.get_group(1)); + /// + /// Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn which_captures(mut self, which_captures: WhichCaptures) -> Config { + self.which_captures = Some(which_captures); + self + } + + /// Sets the size limit, in bytes, to enforce on the construction of every + /// NFA build by the meta regex engine. + /// + /// Setting it to `None` disables the limit. This is not recommended if + /// you're compiling untrusted patterns. + /// + /// Note that this limit is applied to _each_ NFA built, and if any of + /// them excceed the limit, then construction will fail. This limit does + /// _not_ correspond to the total memory used by all NFAs in the meta regex + /// engine. + /// + /// This defaults to some reasonable number that permits most reasonable + /// patterns. + /// + /// # Example + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::meta::Regex; + /// + /// let result = Regex::builder() + /// .configure(Regex::config().nfa_size_limit(Some(20 * (1<<10)))) + /// // Not even 20KB is enough to build a single large Unicode class! + /// .build(r"\pL"); + /// assert!(result.is_err()); + /// + /// // But notice that building such a regex with the exact same limit + /// // can succeed depending on other aspects of the configuration. For + /// // example, a single *forward* NFA will (at time of writing) fit into + /// // the 20KB limit, but a *reverse* NFA of the same pattern will not. + /// // So if one configures a meta regex such that a reverse NFA is never + /// // needed and thus never built, then the 20KB limit will be enough for + /// // a pattern like \pL! + /// let result = Regex::builder() + /// .configure(Regex::config() + /// .nfa_size_limit(Some(20 * (1<<10))) + /// // The DFAs are the only thing that (currently) need a reverse + /// // NFA. So if both are disabled, the meta regex engine will + /// // skip building the reverse NFA. Note that this isn't an API + /// // guarantee. A future semver compatible version may introduce + /// // new use cases for a reverse NFA. + /// .hybrid(false) + /// .dfa(false) + /// ) + /// // Not even 20KB is enough to build a single large Unicode class! + /// .build(r"\pL"); + /// assert!(result.is_ok()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn nfa_size_limit(self, limit: Option<usize>) -> Config { + Config { nfa_size_limit: Some(limit), ..self } + } + + /// Sets the size limit, in bytes, for the one-pass DFA. + /// + /// Setting it to `None` disables the limit. Disabling the limit is + /// strongly discouraged when compiling untrusted patterns. Even if the + /// patterns are trusted, it still may not be a good idea, since a one-pass + /// DFA can use a lot of memory. With that said, as the size of a regex + /// increases, the likelihood of it being one-pass likely decreases. + /// + /// This defaults to some reasonable number that permits most reasonable + /// one-pass patterns. + /// + /// # Example + /// + /// This shows how to set the one-pass DFA size limit. Note that since + /// a one-pass DFA is an optional component of the meta regex engine, + /// this size limit only impacts what is built internally and will never + /// determine whether a `Regex` itself fails to build. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::meta::Regex; + /// + /// let result = Regex::builder() + /// .configure(Regex::config().onepass_size_limit(Some(2 * (1<<20)))) + /// .build(r"\pL{5}"); + /// assert!(result.is_ok()); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn onepass_size_limit(self, limit: Option<usize>) -> Config { + Config { onepass_size_limit: Some(limit), ..self } + } + + /// Set the cache capacity, in bytes, for the lazy DFA. + /// + /// The cache capacity of the lazy DFA determines approximately how much + /// heap memory it is allowed to use to store its state transitions. The + /// state transitions are computed at search time, and if the cache fills + /// up it, it is cleared. At this point, any previously generated state + /// transitions are lost and are re-generated if they're needed again. + /// + /// This sort of cache filling and clearing works quite well _so long as + /// cache clearing happens infrequently_. If it happens too often, then the + /// meta regex engine will stop using the lazy DFA and switch over to a + /// different regex engine. + /// + /// In cases where the cache is cleared too often, it may be possible to + /// give the cache more space and reduce (or eliminate) how often it is + /// cleared. Similarly, sometimes a regex is so big that the lazy DFA isn't + /// used at all if its cache capacity isn't big enough. + /// + /// The capacity set here is a _limit_ on how much memory is used. The + /// actual memory used is only allocated as it's needed. + /// + /// Determining the right value for this is a little tricky and will likely + /// required some profiling. Enabling the `logging` feature and setting the + /// log level to `trace` will also tell you how often the cache is being + /// cleared. + /// + /// # Example + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::meta::Regex; + /// + /// let result = Regex::builder() + /// .configure(Regex::config().hybrid_cache_capacity(20 * (1<<20))) + /// .build(r"\pL{5}"); + /// assert!(result.is_ok()); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn hybrid_cache_capacity(self, limit: usize) -> Config { + Config { hybrid_cache_capacity: Some(limit), ..self } + } + + /// Sets the size limit, in bytes, for heap memory used for a fully + /// compiled DFA. + /// + /// **NOTE:** If you increase this, you'll likely also need to increase + /// [`Config::dfa_state_limit`]. + /// + /// In contrast to the lazy DFA, building a full DFA requires computing + /// all of its state transitions up front. This can be a very expensive + /// process, and runs in worst case `2^n` time and space (where `n` is + /// proportional to the size of the regex). However, a full DFA unlocks + /// some additional optimization opportunities. + /// + /// Because full DFAs can be so expensive, the default limits for them are + /// incredibly small. Generally speaking, if your regex is moderately big + /// or if you're using Unicode features (`\w` is Unicode-aware by default + /// for example), then you can expect that the meta regex engine won't even + /// attempt to build a DFA for it. + /// + /// If this and [`Config::dfa_state_limit`] are set to `None`, then the + /// meta regex will not use any sort of limits when deciding whether to + /// build a DFA. This in turn makes construction of a `Regex` take + /// worst case exponential time and space. Even short patterns can result + /// in huge space blow ups. So it is strongly recommended to keep some kind + /// of limit set! + /// + /// The default is set to a small number that permits some simple regexes + /// to get compiled into DFAs in reasonable time. + /// + /// # Example + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::meta::Regex; + /// + /// let result = Regex::builder() + /// // 100MB is much bigger than the default. + /// .configure(Regex::config() + /// .dfa_size_limit(Some(100 * (1<<20))) + /// // We don't care about size too much here, so just + /// // remove the NFA state limit altogether. + /// .dfa_state_limit(None)) + /// .build(r"\pL{5}"); + /// assert!(result.is_ok()); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn dfa_size_limit(self, limit: Option<usize>) -> Config { + Config { dfa_size_limit: Some(limit), ..self } + } + + /// Sets a limit on the total number of NFA states, beyond which, a full + /// DFA is not attempted to be compiled. + /// + /// This limit works in concert with [`Config::dfa_size_limit`]. Namely, + /// where as `Config::dfa_size_limit` is applied by attempting to construct + /// a DFA, this limit is used to avoid the attempt in the first place. This + /// is useful to avoid hefty initialization costs associated with building + /// a DFA for cases where it is obvious the DFA will ultimately be too big. + /// + /// By default, this is set to a very small number. + /// + /// # Example + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::meta::Regex; + /// + /// let result = Regex::builder() + /// .configure(Regex::config() + /// // Sometimes the default state limit rejects DFAs even + /// // if they would fit in the size limit. Here, we disable + /// // the check on the number of NFA states and just rely on + /// // the size limit. + /// .dfa_state_limit(None)) + /// .build(r"(?-u)\w{30}"); + /// assert!(result.is_ok()); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn dfa_state_limit(self, limit: Option<usize>) -> Config { + Config { dfa_state_limit: Some(limit), ..self } + } + + /// Whether to attempt to shrink the size of the alphabet for the regex + /// pattern or not. When enabled, the alphabet is shrunk into a set of + /// equivalence classes, where every byte in the same equivalence class + /// cannot discriminate between a match or non-match. + /// + /// **WARNING:** This is only useful for debugging DFAs. Disabling this + /// does not yield any speed advantages. Indeed, disabling it can result + /// in much higher memory usage. Disabling byte classes is useful for + /// debugging the actual generated transitions because it lets one see the + /// transitions defined on actual bytes instead of the equivalence classes. + /// + /// This option is enabled by default and should never be disabled unless + /// one is debugging the meta regex engine's internals. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{meta::Regex, Match}; + /// + /// let re = Regex::builder() + /// .configure(Regex::config().byte_classes(false)) + /// .build(r"[a-z]+")?; + /// let hay = "!!quux!!"; + /// assert_eq!(Some(Match::must(0, 2..6)), re.find(hay)); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn byte_classes(self, yes: bool) -> Config { + Config { byte_classes: Some(yes), ..self } + } + + /// Set the line terminator to be used by the `^` and `$` anchors in + /// multi-line mode. + /// + /// This option has no effect when CRLF mode is enabled. That is, + /// regardless of this setting, `(?Rm:^)` and `(?Rm:$)` will always treat + /// `\r` and `\n` as line terminators (and will never match between a `\r` + /// and a `\n`). + /// + /// By default, `\n` is the line terminator. + /// + /// **Warning**: This does not change the behavior of `.`. To do that, + /// you'll need to configure the syntax option + /// [`syntax::Config::line_terminator`](crate::util::syntax::Config::line_terminator) + /// in addition to this. Otherwise, `.` will continue to match any + /// character other than `\n`. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{meta::Regex, util::syntax, Match}; + /// + /// let re = Regex::builder() + /// .syntax(syntax::Config::new().multi_line(true)) + /// .configure(Regex::config().line_terminator(b'\x00')) + /// .build(r"^foo$")?; + /// let hay = "\x00foo\x00"; + /// assert_eq!(Some(Match::must(0, 1..4)), re.find(hay)); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn line_terminator(self, byte: u8) -> Config { + Config { line_terminator: Some(byte), ..self } + } + + /// Toggle whether the hybrid NFA/DFA (also known as the "lazy DFA") should + /// be available for use by the meta regex engine. + /// + /// Enabling this does not necessarily mean that the lazy DFA will + /// definitely be used. It just means that it will be _available_ for use + /// if the meta regex engine thinks it will be useful. + /// + /// When the `hybrid` crate feature is enabled, then this is enabled by + /// default. Otherwise, if the crate feature is disabled, then this is + /// always disabled, regardless of its setting by the caller. + pub fn hybrid(self, yes: bool) -> Config { + Config { hybrid: Some(yes), ..self } + } + + /// Toggle whether a fully compiled DFA should be available for use by the + /// meta regex engine. + /// + /// Enabling this does not necessarily mean that a DFA will definitely be + /// used. It just means that it will be _available_ for use if the meta + /// regex engine thinks it will be useful. + /// + /// When the `dfa-build` crate feature is enabled, then this is enabled by + /// default. Otherwise, if the crate feature is disabled, then this is + /// always disabled, regardless of its setting by the caller. + pub fn dfa(self, yes: bool) -> Config { + Config { dfa: Some(yes), ..self } + } + + /// Toggle whether a one-pass DFA should be available for use by the meta + /// regex engine. + /// + /// Enabling this does not necessarily mean that a one-pass DFA will + /// definitely be used. It just means that it will be _available_ for + /// use if the meta regex engine thinks it will be useful. (Indeed, a + /// one-pass DFA can only be used when the regex is one-pass. See the + /// [`dfa::onepass`](crate::dfa::onepass) module for more details.) + /// + /// When the `dfa-onepass` crate feature is enabled, then this is enabled + /// by default. Otherwise, if the crate feature is disabled, then this is + /// always disabled, regardless of its setting by the caller. + pub fn onepass(self, yes: bool) -> Config { + Config { onepass: Some(yes), ..self } + } + + /// Toggle whether a bounded backtracking regex engine should be available + /// for use by the meta regex engine. + /// + /// Enabling this does not necessarily mean that a bounded backtracker will + /// definitely be used. It just means that it will be _available_ for use + /// if the meta regex engine thinks it will be useful. + /// + /// When the `nfa-backtrack` crate feature is enabled, then this is enabled + /// by default. Otherwise, if the crate feature is disabled, then this is + /// always disabled, regardless of its setting by the caller. + pub fn backtrack(self, yes: bool) -> Config { + Config { backtrack: Some(yes), ..self } + } + + /// Returns the match kind on this configuration, as set by + /// [`Config::match_kind`]. + /// + /// If it was not explicitly set, then a default value is returned. + pub fn get_match_kind(&self) -> MatchKind { + self.match_kind.unwrap_or(MatchKind::LeftmostFirst) + } + + /// Returns whether empty matches must fall on valid UTF-8 boundaries, as + /// set by [`Config::utf8_empty`]. + /// + /// If it was not explicitly set, then a default value is returned. + pub fn get_utf8_empty(&self) -> bool { + self.utf8_empty.unwrap_or(true) + } + + /// Returns whether automatic prefilters are enabled, as set by + /// [`Config::auto_prefilter`]. + /// + /// If it was not explicitly set, then a default value is returned. + pub fn get_auto_prefilter(&self) -> bool { + self.autopre.unwrap_or(true) + } + + /// Returns a manually set prefilter, if one was set by + /// [`Config::prefilter`]. + /// + /// If it was not explicitly set, then a default value is returned. + pub fn get_prefilter(&self) -> Option<&Prefilter> { + self.pre.as_ref().unwrap_or(&None).as_ref() + } + + /// Returns the capture configuration, as set by + /// [`Config::which_captures`]. + /// + /// If it was not explicitly set, then a default value is returned. + pub fn get_which_captures(&self) -> WhichCaptures { + self.which_captures.unwrap_or(WhichCaptures::All) + } + + /// Returns NFA size limit, as set by [`Config::nfa_size_limit`]. + /// + /// If it was not explicitly set, then a default value is returned. + pub fn get_nfa_size_limit(&self) -> Option<usize> { + self.nfa_size_limit.unwrap_or(Some(10 * (1 << 20))) + } + + /// Returns one-pass DFA size limit, as set by + /// [`Config::onepass_size_limit`]. + /// + /// If it was not explicitly set, then a default value is returned. + pub fn get_onepass_size_limit(&self) -> Option<usize> { + self.onepass_size_limit.unwrap_or(Some(1 * (1 << 20))) + } + + /// Returns hybrid NFA/DFA cache capacity, as set by + /// [`Config::hybrid_cache_capacity`]. + /// + /// If it was not explicitly set, then a default value is returned. + pub fn get_hybrid_cache_capacity(&self) -> usize { + self.hybrid_cache_capacity.unwrap_or(2 * (1 << 20)) + } + + /// Returns DFA size limit, as set by [`Config::dfa_size_limit`]. + /// + /// If it was not explicitly set, then a default value is returned. + pub fn get_dfa_size_limit(&self) -> Option<usize> { + // The default for this is VERY small because building a full DFA is + // ridiculously costly. But for regexes that are very small, it can be + // beneficial to use a full DFA. In particular, a full DFA can enable + // additional optimizations via something called "accelerated" states. + // Namely, when there's a state with only a few outgoing transitions, + // we can temporary suspend walking the transition table and use memchr + // for just those outgoing transitions to skip ahead very quickly. + // + // Generally speaking, if Unicode is enabled in your regex and you're + // using some kind of Unicode feature, then it's going to blow this + // size limit. Moreover, Unicode tends to defeat the "accelerated" + // state optimization too, so it's a double whammy. + // + // We also use a limit on the number of NFA states to avoid even + // starting the DFA construction process. Namely, DFA construction + // itself could make lots of initial allocs proportional to the size + // of the NFA, and if the NFA is large, it doesn't make sense to pay + // that cost if we know it's likely to be blown by a large margin. + self.dfa_size_limit.unwrap_or(Some(40 * (1 << 10))) + } + + /// Returns DFA size limit in terms of the number of states in the NFA, as + /// set by [`Config::dfa_state_limit`]. + /// + /// If it was not explicitly set, then a default value is returned. + pub fn get_dfa_state_limit(&self) -> Option<usize> { + // Again, as with the size limit, we keep this very small. + self.dfa_state_limit.unwrap_or(Some(30)) + } + + /// Returns whether byte classes are enabled, as set by + /// [`Config::byte_classes`]. + /// + /// If it was not explicitly set, then a default value is returned. + pub fn get_byte_classes(&self) -> bool { + self.byte_classes.unwrap_or(true) + } + + /// Returns the line terminator for this configuration, as set by + /// [`Config::line_terminator`]. + /// + /// If it was not explicitly set, then a default value is returned. + pub fn get_line_terminator(&self) -> u8 { + self.line_terminator.unwrap_or(b'\n') + } + + /// Returns whether the hybrid NFA/DFA regex engine may be used, as set by + /// [`Config::hybrid`]. + /// + /// If it was not explicitly set, then a default value is returned. + pub fn get_hybrid(&self) -> bool { + #[cfg(feature = "hybrid")] + { + self.hybrid.unwrap_or(true) + } + #[cfg(not(feature = "hybrid"))] + { + false + } + } + + /// Returns whether the DFA regex engine may be used, as set by + /// [`Config::dfa`]. + /// + /// If it was not explicitly set, then a default value is returned. + pub fn get_dfa(&self) -> bool { + #[cfg(feature = "dfa-build")] + { + self.dfa.unwrap_or(true) + } + #[cfg(not(feature = "dfa-build"))] + { + false + } + } + + /// Returns whether the one-pass DFA regex engine may be used, as set by + /// [`Config::onepass`]. + /// + /// If it was not explicitly set, then a default value is returned. + pub fn get_onepass(&self) -> bool { + #[cfg(feature = "dfa-onepass")] + { + self.onepass.unwrap_or(true) + } + #[cfg(not(feature = "dfa-onepass"))] + { + false + } + } + + /// Returns whether the bounded backtracking regex engine may be used, as + /// set by [`Config::backtrack`]. + /// + /// If it was not explicitly set, then a default value is returned. + pub fn get_backtrack(&self) -> bool { + #[cfg(feature = "nfa-backtrack")] + { + self.backtrack.unwrap_or(true) + } + #[cfg(not(feature = "nfa-backtrack"))] + { + false + } + } + + /// Overwrite the default configuration such that the options in `o` are + /// always used. If an option in `o` is not set, then the corresponding + /// option in `self` is used. If it's not set in `self` either, then it + /// remains not set. + pub(crate) fn overwrite(&self, o: Config) -> Config { + Config { + match_kind: o.match_kind.or(self.match_kind), + utf8_empty: o.utf8_empty.or(self.utf8_empty), + autopre: o.autopre.or(self.autopre), + pre: o.pre.or_else(|| self.pre.clone()), + which_captures: o.which_captures.or(self.which_captures), + nfa_size_limit: o.nfa_size_limit.or(self.nfa_size_limit), + onepass_size_limit: o + .onepass_size_limit + .or(self.onepass_size_limit), + hybrid_cache_capacity: o + .hybrid_cache_capacity + .or(self.hybrid_cache_capacity), + hybrid: o.hybrid.or(self.hybrid), + dfa: o.dfa.or(self.dfa), + dfa_size_limit: o.dfa_size_limit.or(self.dfa_size_limit), + dfa_state_limit: o.dfa_state_limit.or(self.dfa_state_limit), + onepass: o.onepass.or(self.onepass), + backtrack: o.backtrack.or(self.backtrack), + byte_classes: o.byte_classes.or(self.byte_classes), + line_terminator: o.line_terminator.or(self.line_terminator), + } + } +} + +/// A builder for configuring and constructing a `Regex`. +/// +/// The builder permits configuring two different aspects of a `Regex`: +/// +/// * [`Builder::configure`] will set high-level configuration options as +/// described by a [`Config`]. +/// * [`Builder::syntax`] will set the syntax level configuration options +/// as described by a [`util::syntax::Config`](crate::util::syntax::Config). +/// This only applies when building a `Regex` from pattern strings. +/// +/// Once configured, the builder can then be used to construct a `Regex` from +/// one of 4 different inputs: +/// +/// * [`Builder::build`] creates a regex from a single pattern string. +/// * [`Builder::build_many`] creates a regex from many pattern strings. +/// * [`Builder::build_from_hir`] creates a regex from a +/// [`regex-syntax::Hir`](Hir) expression. +/// * [`Builder::build_many_from_hir`] creates a regex from many +/// [`regex-syntax::Hir`](Hir) expressions. +/// +/// The latter two methods in particular provide a way to construct a fully +/// feature regular expression matcher directly from an `Hir` expression +/// without having to first convert it to a string. (This is in contrast to the +/// top-level `regex` crate which intentionally provides no such API in order +/// to avoid making `regex-syntax` a public dependency.) +/// +/// As a convenience, this builder may be created via [`Regex::builder`], which +/// may help avoid an extra import. +/// +/// # Example: change the line terminator +/// +/// This example shows how to enable multi-line mode by default and change the +/// line terminator to the NUL byte: +/// +/// ``` +/// use regex_automata::{meta::Regex, util::syntax, Match}; +/// +/// let re = Regex::builder() +/// .syntax(syntax::Config::new().multi_line(true)) +/// .configure(Regex::config().line_terminator(b'\x00')) +/// .build(r"^foo$")?; +/// let hay = "\x00foo\x00"; +/// assert_eq!(Some(Match::must(0, 1..4)), re.find(hay)); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +/// +/// # Example: disable UTF-8 requirement +/// +/// By default, regex patterns are required to match UTF-8. This includes +/// regex patterns that can produce matches of length zero. In the case of an +/// empty match, by default, matches will not appear between the code units of +/// a UTF-8 encoded codepoint. +/// +/// However, it can be useful to disable this requirement, particularly if +/// you're searching things like `&[u8]` that are not known to be valid UTF-8. +/// +/// ``` +/// use regex_automata::{meta::Regex, util::syntax, Match}; +/// +/// let mut builder = Regex::builder(); +/// // Disables the requirement that non-empty matches match UTF-8. +/// builder.syntax(syntax::Config::new().utf8(false)); +/// // Disables the requirement that empty matches match UTF-8 boundaries. +/// builder.configure(Regex::config().utf8_empty(false)); +/// +/// // We can match raw bytes via \xZZ syntax, but we need to disable +/// // Unicode mode to do that. We could disable it everywhere, or just +/// // selectively, as shown here. +/// let re = builder.build(r"(?-u:\xFF)foo(?-u:\xFF)")?; +/// let hay = b"\xFFfoo\xFF"; +/// assert_eq!(Some(Match::must(0, 0..5)), re.find(hay)); +/// +/// // We can also match between code units. +/// let re = builder.build(r"")?; +/// let hay = "☃"; +/// assert_eq!(re.find_iter(hay).collect::<Vec<Match>>(), vec![ +/// Match::must(0, 0..0), +/// Match::must(0, 1..1), +/// Match::must(0, 2..2), +/// Match::must(0, 3..3), +/// ]); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +#[derive(Clone, Debug)] +pub struct Builder { + config: Config, + ast: ast::parse::ParserBuilder, + hir: hir::translate::TranslatorBuilder, +} + +impl Builder { + /// Creates a new builder for configuring and constructing a [`Regex`]. + pub fn new() -> Builder { + Builder { + config: Config::default(), + ast: ast::parse::ParserBuilder::new(), + hir: hir::translate::TranslatorBuilder::new(), + } + } + + /// Builds a `Regex` from a single pattern string. + /// + /// If there was a problem parsing the pattern or a problem turning it into + /// a regex matcher, then an error is returned. + /// + /// # Example + /// + /// This example shows how to configure syntax options. + /// + /// ``` + /// use regex_automata::{meta::Regex, util::syntax, Match}; + /// + /// let re = Regex::builder() + /// .syntax(syntax::Config::new().crlf(true).multi_line(true)) + /// .build(r"^foo$")?; + /// let hay = "\r\nfoo\r\n"; + /// assert_eq!(Some(Match::must(0, 2..5)), re.find(hay)); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn build(&self, pattern: &str) -> Result<Regex, BuildError> { + self.build_many(&[pattern]) + } + + /// Builds a `Regex` from many pattern strings. + /// + /// If there was a problem parsing any of the patterns or a problem turning + /// them into a regex matcher, then an error is returned. + /// + /// # Example: finding the pattern that caused an error + /// + /// When a syntax error occurs, it is possible to ask which pattern + /// caused the syntax error. + /// + /// ``` + /// use regex_automata::{meta::Regex, PatternID}; + /// + /// let err = Regex::builder() + /// .build_many(&["a", "b", r"\p{Foo}", "c"]) + /// .unwrap_err(); + /// assert_eq!(Some(PatternID::must(2)), err.pattern()); + /// ``` + /// + /// # Example: zero patterns is valid + /// + /// Building a regex with zero patterns results in a regex that never + /// matches anything. Because this routine is generic, passing an empty + /// slice usually requires a turbo-fish (or something else to help type + /// inference). + /// + /// ``` + /// use regex_automata::{meta::Regex, util::syntax, Match}; + /// + /// let re = Regex::builder() + /// .build_many::<&str>(&[])?; + /// assert_eq!(None, re.find("")); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn build_many<P: AsRef<str>>( + &self, + patterns: &[P], + ) -> Result<Regex, BuildError> { + use crate::util::primitives::IteratorIndexExt; + log! { + debug!("building meta regex with {} patterns:", patterns.len()); + for (pid, p) in patterns.iter().with_pattern_ids() { + let p = p.as_ref(); + // We might split a grapheme with this truncation logic, but + // that's fine. We at least avoid splitting a codepoint. + let maxoff = p + .char_indices() + .map(|(i, ch)| i + ch.len_utf8()) + .take(1000) + .last() + .unwrap_or(0); + if maxoff < p.len() { + debug!("{:?}: {}[... snip ...]", pid, &p[..maxoff]); + } else { + debug!("{:?}: {}", pid, p); + } + } + } + let (mut asts, mut hirs) = (vec![], vec![]); + for (pid, p) in patterns.iter().with_pattern_ids() { + let ast = self + .ast + .build() + .parse(p.as_ref()) + .map_err(|err| BuildError::ast(pid, err))?; + asts.push(ast); + } + for ((pid, p), ast) in + patterns.iter().with_pattern_ids().zip(asts.iter()) + { + let hir = self + .hir + .build() + .translate(p.as_ref(), ast) + .map_err(|err| BuildError::hir(pid, err))?; + hirs.push(hir); + } + self.build_many_from_hir(&hirs) + } + + /// Builds a `Regex` directly from an `Hir` expression. + /// + /// This is useful if you needed to parse a pattern string into an `Hir` + /// for other reasons (such as analysis or transformations). This routine + /// permits building a `Regex` directly from the `Hir` expression instead + /// of first converting the `Hir` back to a pattern string. + /// + /// When using this method, any options set via [`Builder::syntax`] are + /// ignored. Namely, the syntax options only apply when parsing a pattern + /// string, which isn't relevant here. + /// + /// If there was a problem building the underlying regex matcher for the + /// given `Hir`, then an error is returned. + /// + /// # Example + /// + /// This example shows how one can hand-construct an `Hir` expression and + /// build a regex from it without doing any parsing at all. + /// + /// ``` + /// use { + /// regex_automata::{meta::Regex, Match}, + /// regex_syntax::hir::{Hir, Look}, + /// }; + /// + /// // (?Rm)^foo$ + /// let hir = Hir::concat(vec![ + /// Hir::look(Look::StartCRLF), + /// Hir::literal("foo".as_bytes()), + /// Hir::look(Look::EndCRLF), + /// ]); + /// let re = Regex::builder() + /// .build_from_hir(&hir)?; + /// let hay = "\r\nfoo\r\n"; + /// assert_eq!(Some(Match::must(0, 2..5)), re.find(hay)); + /// + /// Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn build_from_hir(&self, hir: &Hir) -> Result<Regex, BuildError> { + self.build_many_from_hir(&[hir]) + } + + /// Builds a `Regex` directly from many `Hir` expressions. + /// + /// This is useful if you needed to parse pattern strings into `Hir` + /// expressions for other reasons (such as analysis or transformations). + /// This routine permits building a `Regex` directly from the `Hir` + /// expressions instead of first converting the `Hir` expressions back to + /// pattern strings. + /// + /// When using this method, any options set via [`Builder::syntax`] are + /// ignored. Namely, the syntax options only apply when parsing a pattern + /// string, which isn't relevant here. + /// + /// If there was a problem building the underlying regex matcher for the + /// given `Hir` expressions, then an error is returned. + /// + /// Note that unlike [`Builder::build_many`], this can only fail as a + /// result of building the underlying matcher. In that case, there is + /// no single `Hir` expression that can be isolated as a reason for the + /// failure. So if this routine fails, it's not possible to determine which + /// `Hir` expression caused the failure. + /// + /// # Example + /// + /// This example shows how one can hand-construct multiple `Hir` + /// expressions and build a single regex from them without doing any + /// parsing at all. + /// + /// ``` + /// use { + /// regex_automata::{meta::Regex, Match}, + /// regex_syntax::hir::{Hir, Look}, + /// }; + /// + /// // (?Rm)^foo$ + /// let hir1 = Hir::concat(vec![ + /// Hir::look(Look::StartCRLF), + /// Hir::literal("foo".as_bytes()), + /// Hir::look(Look::EndCRLF), + /// ]); + /// // (?Rm)^bar$ + /// let hir2 = Hir::concat(vec![ + /// Hir::look(Look::StartCRLF), + /// Hir::literal("bar".as_bytes()), + /// Hir::look(Look::EndCRLF), + /// ]); + /// let re = Regex::builder() + /// .build_many_from_hir(&[&hir1, &hir2])?; + /// let hay = "\r\nfoo\r\nbar"; + /// let got: Vec<Match> = re.find_iter(hay).collect(); + /// let expected = vec![ + /// Match::must(0, 2..5), + /// Match::must(1, 7..10), + /// ]; + /// assert_eq!(expected, got); + /// + /// Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn build_many_from_hir<H: Borrow<Hir>>( + &self, + hirs: &[H], + ) -> Result<Regex, BuildError> { + let config = self.config.clone(); + // We collect the HIRs into a vec so we can write internal routines + // with '&[&Hir]'. i.e., Don't use generics everywhere to keep code + // bloat down.. + let hirs: Vec<&Hir> = hirs.iter().map(|hir| hir.borrow()).collect(); + let info = RegexInfo::new(config, &hirs); + let strat = strategy::new(&info, &hirs)?; + let pool = { + let strat = Arc::clone(&strat); + let create: CachePoolFn = Box::new(move || strat.create_cache()); + Pool::new(create) + }; + Ok(Regex { imp: Arc::new(RegexI { strat, info }), pool }) + } + + /// Configure the behavior of a `Regex`. + /// + /// This configuration controls non-syntax options related to the behavior + /// of a `Regex`. This includes things like whether empty matches can split + /// a codepoint, prefilters, line terminators and a long list of options + /// for configuring which regex engines the meta regex engine will be able + /// to use internally. + /// + /// # Example + /// + /// This example shows how to disable UTF-8 empty mode. This will permit + /// empty matches to occur between the UTF-8 encoding of a codepoint. + /// + /// ``` + /// use regex_automata::{meta::Regex, Match}; + /// + /// let re = Regex::new("")?; + /// let got: Vec<Match> = re.find_iter("☃").collect(); + /// // Matches only occur at the beginning and end of the snowman. + /// assert_eq!(got, vec![ + /// Match::must(0, 0..0), + /// Match::must(0, 3..3), + /// ]); + /// + /// let re = Regex::builder() + /// .configure(Regex::config().utf8_empty(false)) + /// .build("")?; + /// let got: Vec<Match> = re.find_iter("☃").collect(); + /// // Matches now occur at every position! + /// assert_eq!(got, vec![ + /// Match::must(0, 0..0), + /// Match::must(0, 1..1), + /// Match::must(0, 2..2), + /// Match::must(0, 3..3), + /// ]); + /// + /// Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn configure(&mut self, config: Config) -> &mut Builder { + self.config = self.config.overwrite(config); + self + } + + /// Configure the syntax options when parsing a pattern string while + /// building a `Regex`. + /// + /// These options _only_ apply when [`Builder::build`] or [`Builder::build_many`] + /// are used. The other build methods accept `Hir` values, which have + /// already been parsed. + /// + /// # Example + /// + /// This example shows how to enable case insensitive mode. + /// + /// ``` + /// use regex_automata::{meta::Regex, util::syntax, Match}; + /// + /// let re = Regex::builder() + /// .syntax(syntax::Config::new().case_insensitive(true)) + /// .build(r"δ")?; + /// assert_eq!(Some(Match::must(0, 0..2)), re.find(r"Δ")); + /// + /// Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn syntax( + &mut self, + config: crate::util::syntax::Config, + ) -> &mut Builder { + config.apply_ast(&mut self.ast); + config.apply_hir(&mut self.hir); + self + } +} + +#[cfg(test)] +mod tests { + use super::*; + + // I found this in the course of building out the benchmark suite for + // rebar. + #[test] + fn regression() { + env_logger::init(); + + let re = Regex::new(r"[a-zA-Z]+ing").unwrap(); + assert_eq!(1, re.find_iter("tingling").count()); + } +} diff --git a/third_party/rust/regex-automata/src/meta/reverse_inner.rs b/third_party/rust/regex-automata/src/meta/reverse_inner.rs new file mode 100644 index 0000000000..3d78779f6f --- /dev/null +++ b/third_party/rust/regex-automata/src/meta/reverse_inner.rs @@ -0,0 +1,220 @@ +/*! +A module dedicated to plucking inner literals out of a regex pattern, and +then constructing a prefilter for them. We also include a regex pattern +"prefix" that corresponds to the bits of the regex that need to match before +the literals do. The reverse inner optimization then proceeds by looking for +matches of the inner literal(s), and then doing a reverse search of the prefix +from the start of the literal match to find the overall start position of the +match. + +The essential invariant we want to uphold here is that the literals we return +reflect a set where *at least* one of them must match in order for the overall +regex to match. We also need to maintain the invariant that the regex prefix +returned corresponds to the entirety of the regex up until the literals we +return. + +This somewhat limits what we can do. That is, if we a regex like +`\w+(@!|%%)\w+`, then we can pluck the `{@!, %%}` out and build a prefilter +from it. Then we just need to compile `\w+` in reverse. No fuss no muss. But if +we have a regex like \d+@!|\w+%%`, then we get kind of stymied. Technically, +we could still extract `{@!, %%}`, and it is true that at least of them must +match. But then, what is our regex prefix? Again, in theory, that could be +`\d+|\w+`, but that's not quite right, because the `\d+` only matches when `@!` +matches, and `\w+` only matches when `%%` matches. + +All of that is technically possible to do, but it seemingly requires a lot of +sophistication and machinery. Probably the way to tackle that is with some kind +of formalism and approach this problem more generally. + +For now, the code below basically just looks for a top-level concatenation. +And if it can find one, it looks for literals in each of the direct child +sub-expressions of that concatenation. If some good ones are found, we return +those and a concatenation of the Hir expressions seen up to that point. +*/ + +use alloc::vec::Vec; + +use regex_syntax::hir::{self, literal, Hir, HirKind}; + +use crate::{util::prefilter::Prefilter, MatchKind}; + +/// Attempts to extract an "inner" prefilter from the given HIR expressions. If +/// one was found, then a concatenation of the HIR expressions that precede it +/// is returned. +/// +/// The idea here is that the prefilter returned can be used to find candidate +/// matches. And then the HIR returned can be used to build a reverse regex +/// matcher, which will find the start of the candidate match. Finally, the +/// match still has to be confirmed with a normal anchored forward scan to find +/// the end position of the match. +/// +/// Note that this assumes leftmost-first match semantics, so callers must +/// not call this otherwise. +pub(crate) fn extract(hirs: &[&Hir]) -> Option<(Hir, Prefilter)> { + if hirs.len() != 1 { + debug!( + "skipping reverse inner optimization since it only \ + supports 1 pattern, {} were given", + hirs.len(), + ); + return None; + } + let mut concat = match top_concat(hirs[0]) { + Some(concat) => concat, + None => { + debug!( + "skipping reverse inner optimization because a top-level \ + concatenation could not found", + ); + return None; + } + }; + // We skip the first HIR because if it did have a prefix prefilter in it, + // we probably wouldn't be here looking for an inner prefilter. + for i in 1..concat.len() { + let hir = &concat[i]; + let pre = match prefilter(hir) { + None => continue, + Some(pre) => pre, + }; + // Even if we got a prefilter, if it isn't consider "fast," then we + // probably don't want to bother with it. Namely, since the reverse + // inner optimization requires some overhead, it likely only makes + // sense if the prefilter scan itself is (believed) to be much faster + // than the regex engine. + if !pre.is_fast() { + debug!( + "skipping extracted inner prefilter because \ + it probably isn't fast" + ); + continue; + } + let concat_suffix = Hir::concat(concat.split_off(i)); + let concat_prefix = Hir::concat(concat); + // Look for a prefilter again. Why? Because above we only looked for + // a prefilter on the individual 'hir', but we might be able to find + // something better and more discriminatory by looking at the entire + // suffix. We don't do this above to avoid making this loop worst case + // quadratic in the length of 'concat'. + let pre2 = match prefilter(&concat_suffix) { + None => pre, + Some(pre2) => { + if pre2.is_fast() { + pre2 + } else { + pre + } + } + }; + return Some((concat_prefix, pre2)); + } + debug!( + "skipping reverse inner optimization because a top-level \ + sub-expression with a fast prefilter could not be found" + ); + None +} + +/// Attempt to extract a prefilter from an HIR expression. +/// +/// We do a little massaging here to do our best that the prefilter we get out +/// of this is *probably* fast. Basically, the false positive rate has a much +/// higher impact for things like the reverse inner optimization because more +/// work needs to potentially be done for each candidate match. +/// +/// Note that this assumes leftmost-first match semantics, so callers must +/// not call this otherwise. +fn prefilter(hir: &Hir) -> Option<Prefilter> { + let mut extractor = literal::Extractor::new(); + extractor.kind(literal::ExtractKind::Prefix); + let mut prefixes = extractor.extract(hir); + debug!( + "inner prefixes (len={:?}) extracted before optimization: {:?}", + prefixes.len(), + prefixes + ); + // Since these are inner literals, we know they cannot be exact. But the + // extractor doesn't know this. We mark them as inexact because this might + // impact literal optimization. Namely, optimization weights "all literals + // are exact" as very high, because it presumes that any match results in + // an overall match. But of course, that is not the case here. + // + // In practice, this avoids plucking out a ASCII-only \s as an alternation + // of single-byte whitespace characters. + prefixes.make_inexact(); + prefixes.optimize_for_prefix_by_preference(); + debug!( + "inner prefixes (len={:?}) extracted after optimization: {:?}", + prefixes.len(), + prefixes + ); + prefixes + .literals() + .and_then(|lits| Prefilter::new(MatchKind::LeftmostFirst, lits)) +} + +/// Looks for a "top level" HirKind::Concat item in the given HIR. This will +/// try to return one even if it's embedded in a capturing group, but is +/// otherwise pretty conservative in what is returned. +/// +/// The HIR returned is a complete copy of the concat with all capturing +/// groups removed. In effect, the concat returned is "flattened" with respect +/// to capturing groups. This makes the detection logic above for prefixes +/// a bit simpler, and it works because 1) capturing groups never influence +/// whether a match occurs or not and 2) capturing groups are not used when +/// doing the reverse inner search to find the start of the match. +fn top_concat(mut hir: &Hir) -> Option<Vec<Hir>> { + loop { + hir = match hir.kind() { + HirKind::Empty + | HirKind::Literal(_) + | HirKind::Class(_) + | HirKind::Look(_) + | HirKind::Repetition(_) + | HirKind::Alternation(_) => return None, + HirKind::Capture(hir::Capture { ref sub, .. }) => sub, + HirKind::Concat(ref subs) => { + // We are careful to only do the flattening/copy when we know + // we have a "top level" concat we can inspect. This avoids + // doing extra work in cases where we definitely won't use it. + // (This might still be wasted work if we can't go on to find + // some literals to extract.) + let concat = + Hir::concat(subs.iter().map(|h| flatten(h)).collect()); + return match concat.into_kind() { + HirKind::Concat(xs) => Some(xs), + // It is actually possible for this case to occur, because + // 'Hir::concat' might simplify the expression to the point + // that concatenations are actually removed. One wonders + // whether this leads to other cases where we should be + // extracting literals, but in theory, I believe if we do + // get here, then it means that a "real" prefilter failed + // to be extracted and we should probably leave well enough + // alone. (A "real" prefilter is unbothered by "top-level + // concats" and "capturing groups.") + _ => return None, + }; + } + }; + } +} + +/// Returns a copy of the given HIR but with all capturing groups removed. +fn flatten(hir: &Hir) -> Hir { + match hir.kind() { + HirKind::Empty => Hir::empty(), + HirKind::Literal(hir::Literal(ref x)) => Hir::literal(x.clone()), + HirKind::Class(ref x) => Hir::class(x.clone()), + HirKind::Look(ref x) => Hir::look(x.clone()), + HirKind::Repetition(ref x) => Hir::repetition(x.with(flatten(&x.sub))), + // This is the interesting case. We just drop the group information + // entirely and use the child HIR itself. + HirKind::Capture(hir::Capture { ref sub, .. }) => flatten(sub), + HirKind::Alternation(ref xs) => { + Hir::alternation(xs.iter().map(|x| flatten(x)).collect()) + } + HirKind::Concat(ref xs) => { + Hir::concat(xs.iter().map(|x| flatten(x)).collect()) + } + } +} diff --git a/third_party/rust/regex-automata/src/meta/stopat.rs b/third_party/rust/regex-automata/src/meta/stopat.rs new file mode 100644 index 0000000000..e8d716689c --- /dev/null +++ b/third_party/rust/regex-automata/src/meta/stopat.rs @@ -0,0 +1,224 @@ +/*! +This module defines two bespoke forward DFA search routines. One for the lazy +DFA and one for the fully compiled DFA. These routines differ from the normal +ones by reporting the position at which the search terminates when a match +*isn't* found. + +This position at which a search terminates is useful in contexts where the meta +regex engine runs optimizations that could go quadratic if we aren't careful. +Namely, a regex search *could* scan to the end of the haystack only to report a +non-match. If the caller doesn't know that the search scanned to the end of the +haystack, it might restart the search at the next literal candidate it finds +and repeat the process. + +Providing the caller with the position at which the search stopped provides a +way for the caller to determine the point at which subsequent scans should not +pass. This is principally used in the "reverse inner" optimization, which works +like this: + +1. Look for a match of an inner literal. Say, 'Z' in '\w+Z\d+'. +2. At the spot where 'Z' matches, do a reverse anchored search from there for +'\w+'. +3. If the reverse search matches, it corresponds to the start position of a +(possible) match. At this point, do a forward anchored search to find the end +position. If an end position is found, then we have a match and we know its +bounds. + +If the forward anchored search in (3) searches the entire rest of the haystack +but reports a non-match, then a naive implementation of the above will continue +back at step 1 looking for more candidates. There might still be a match to be +found! It's possible. But we already scanned the whole haystack. So if we keep +repeating the process, then we might wind up taking quadratic time in the size +of the haystack, which is not great. + +So if the forward anchored search in (3) reports the position at which it +stops, then we can detect whether quadratic behavior might be occurring in +steps (1) and (2). For (1), it occurs if the literal candidate found occurs +*before* the end of the previous search in (3), since that means we're now +going to look for another match in a place where the forward search has already +scanned. It is *correct* to do so, but our technique has become inefficient. +For (2), quadratic behavior occurs similarly when its reverse search extends +past the point where the previous forward search in (3) terminated. Indeed, to +implement (2), we use the sibling 'limited' module for ensuring our reverse +scan doesn't go further than we want. + +See the 'opt/reverse-inner' benchmarks in rebar for a real demonstration of +how quadratic behavior is mitigated. +*/ + +use crate::{meta::error::RetryFailError, HalfMatch, Input, MatchError}; + +#[cfg(feature = "dfa-build")] +pub(crate) fn dfa_try_search_half_fwd( + dfa: &crate::dfa::dense::DFA<alloc::vec::Vec<u32>>, + input: &Input<'_>, +) -> Result<Result<HalfMatch, usize>, RetryFailError> { + use crate::dfa::{accel, Automaton}; + + let mut mat = None; + let mut sid = dfa.start_state_forward(input)?; + let mut at = input.start(); + while at < input.end() { + sid = dfa.next_state(sid, input.haystack()[at]); + if dfa.is_special_state(sid) { + if dfa.is_match_state(sid) { + let pattern = dfa.match_pattern(sid, 0); + mat = Some(HalfMatch::new(pattern, at)); + if input.get_earliest() { + return Ok(mat.ok_or(at)); + } + if dfa.is_accel_state(sid) { + let needs = dfa.accelerator(sid); + at = accel::find_fwd(needs, input.haystack(), at) + .unwrap_or(input.end()); + continue; + } + } else if dfa.is_accel_state(sid) { + let needs = dfa.accelerator(sid); + at = accel::find_fwd(needs, input.haystack(), at) + .unwrap_or(input.end()); + continue; + } else if dfa.is_dead_state(sid) { + return Ok(mat.ok_or(at)); + } else if dfa.is_quit_state(sid) { + if mat.is_some() { + return Ok(mat.ok_or(at)); + } + return Err(MatchError::quit(input.haystack()[at], at).into()); + } else { + // Ideally we wouldn't use a DFA that specialized start states + // and thus 'is_start_state()' could never be true here, but in + // practice we reuse the DFA created for the full regex which + // will specialize start states whenever there is a prefilter. + debug_assert!(dfa.is_start_state(sid)); + } + } + at += 1; + } + dfa_eoi_fwd(dfa, input, &mut sid, &mut mat)?; + Ok(mat.ok_or(at)) +} + +#[cfg(feature = "hybrid")] +pub(crate) fn hybrid_try_search_half_fwd( + dfa: &crate::hybrid::dfa::DFA, + cache: &mut crate::hybrid::dfa::Cache, + input: &Input<'_>, +) -> Result<Result<HalfMatch, usize>, RetryFailError> { + let mut mat = None; + let mut sid = dfa.start_state_forward(cache, input)?; + let mut at = input.start(); + while at < input.end() { + sid = dfa + .next_state(cache, sid, input.haystack()[at]) + .map_err(|_| MatchError::gave_up(at))?; + if sid.is_tagged() { + if sid.is_match() { + let pattern = dfa.match_pattern(cache, sid, 0); + mat = Some(HalfMatch::new(pattern, at)); + if input.get_earliest() { + return Ok(mat.ok_or(at)); + } + } else if sid.is_dead() { + return Ok(mat.ok_or(at)); + } else if sid.is_quit() { + if mat.is_some() { + return Ok(mat.ok_or(at)); + } + return Err(MatchError::quit(input.haystack()[at], at).into()); + } else { + // We should NEVER get an unknown state ID back from + // dfa.next_state(). + debug_assert!(!sid.is_unknown()); + // Ideally we wouldn't use a lazy DFA that specialized start + // states and thus 'sid.is_start()' could never be true here, + // but in practice we reuse the lazy DFA created for the full + // regex which will specialize start states whenever there is + // a prefilter. + debug_assert!(sid.is_start()); + } + } + at += 1; + } + hybrid_eoi_fwd(dfa, cache, input, &mut sid, &mut mat)?; + Ok(mat.ok_or(at)) +} + +#[cfg(feature = "dfa-build")] +#[cfg_attr(feature = "perf-inline", inline(always))] +fn dfa_eoi_fwd( + dfa: &crate::dfa::dense::DFA<alloc::vec::Vec<u32>>, + input: &Input<'_>, + sid: &mut crate::util::primitives::StateID, + mat: &mut Option<HalfMatch>, +) -> Result<(), MatchError> { + use crate::dfa::Automaton; + + let sp = input.get_span(); + match input.haystack().get(sp.end) { + Some(&b) => { + *sid = dfa.next_state(*sid, b); + if dfa.is_match_state(*sid) { + let pattern = dfa.match_pattern(*sid, 0); + *mat = Some(HalfMatch::new(pattern, sp.end)); + } else if dfa.is_quit_state(*sid) { + if mat.is_some() { + return Ok(()); + } + return Err(MatchError::quit(b, sp.end)); + } + } + None => { + *sid = dfa.next_eoi_state(*sid); + if dfa.is_match_state(*sid) { + let pattern = dfa.match_pattern(*sid, 0); + *mat = Some(HalfMatch::new(pattern, input.haystack().len())); + } + // N.B. We don't have to check 'is_quit' here because the EOI + // transition can never lead to a quit state. + debug_assert!(!dfa.is_quit_state(*sid)); + } + } + Ok(()) +} + +#[cfg(feature = "hybrid")] +#[cfg_attr(feature = "perf-inline", inline(always))] +fn hybrid_eoi_fwd( + dfa: &crate::hybrid::dfa::DFA, + cache: &mut crate::hybrid::dfa::Cache, + input: &Input<'_>, + sid: &mut crate::hybrid::LazyStateID, + mat: &mut Option<HalfMatch>, +) -> Result<(), MatchError> { + let sp = input.get_span(); + match input.haystack().get(sp.end) { + Some(&b) => { + *sid = dfa + .next_state(cache, *sid, b) + .map_err(|_| MatchError::gave_up(sp.end))?; + if sid.is_match() { + let pattern = dfa.match_pattern(cache, *sid, 0); + *mat = Some(HalfMatch::new(pattern, sp.end)); + } else if sid.is_quit() { + if mat.is_some() { + return Ok(()); + } + return Err(MatchError::quit(b, sp.end)); + } + } + None => { + *sid = dfa + .next_eoi_state(cache, *sid) + .map_err(|_| MatchError::gave_up(input.haystack().len()))?; + if sid.is_match() { + let pattern = dfa.match_pattern(cache, *sid, 0); + *mat = Some(HalfMatch::new(pattern, input.haystack().len())); + } + // N.B. We don't have to check 'is_quit' here because the EOI + // transition can never lead to a quit state. + debug_assert!(!sid.is_quit()); + } + } + Ok(()) +} diff --git a/third_party/rust/regex-automata/src/meta/strategy.rs b/third_party/rust/regex-automata/src/meta/strategy.rs new file mode 100644 index 0000000000..ea6c6ab576 --- /dev/null +++ b/third_party/rust/regex-automata/src/meta/strategy.rs @@ -0,0 +1,1908 @@ +use core::{ + fmt::Debug, + panic::{RefUnwindSafe, UnwindSafe}, +}; + +use alloc::sync::Arc; + +use regex_syntax::hir::{literal, Hir}; + +use crate::{ + meta::{ + error::{BuildError, RetryError, RetryFailError, RetryQuadraticError}, + regex::{Cache, RegexInfo}, + reverse_inner, wrappers, + }, + nfa::thompson::{self, WhichCaptures, NFA}, + util::{ + captures::{Captures, GroupInfo}, + look::LookMatcher, + prefilter::{self, Prefilter, PrefilterI}, + primitives::{NonMaxUsize, PatternID}, + search::{Anchored, HalfMatch, Input, Match, MatchKind, PatternSet}, + }, +}; + +/// A trait that represents a single meta strategy. Its main utility is in +/// providing a way to do dynamic dispatch over a few choices. +/// +/// Why dynamic dispatch? I actually don't have a super compelling reason, and +/// importantly, I have not benchmarked it with the main alternative: an enum. +/// I went with dynamic dispatch initially because the regex engine search code +/// really can't be inlined into caller code in most cases because it's just +/// too big. In other words, it is already expected that every regex search +/// will entail at least the cost of a function call. +/// +/// I do wonder whether using enums would result in better codegen overall +/// though. It's a worthwhile experiment to try. Probably the most interesting +/// benchmark to run in such a case would be one with a high match count. That +/// is, a benchmark to test the overall latency of a search call. +pub(super) trait Strategy: + Debug + Send + Sync + RefUnwindSafe + UnwindSafe + 'static +{ + fn group_info(&self) -> &GroupInfo; + + fn create_cache(&self) -> Cache; + + fn reset_cache(&self, cache: &mut Cache); + + fn is_accelerated(&self) -> bool; + + fn memory_usage(&self) -> usize; + + fn search(&self, cache: &mut Cache, input: &Input<'_>) -> Option<Match>; + + fn search_half( + &self, + cache: &mut Cache, + input: &Input<'_>, + ) -> Option<HalfMatch>; + + fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool; + + fn search_slots( + &self, + cache: &mut Cache, + input: &Input<'_>, + slots: &mut [Option<NonMaxUsize>], + ) -> Option<PatternID>; + + fn which_overlapping_matches( + &self, + cache: &mut Cache, + input: &Input<'_>, + patset: &mut PatternSet, + ); +} + +pub(super) fn new( + info: &RegexInfo, + hirs: &[&Hir], +) -> Result<Arc<dyn Strategy>, BuildError> { + // At this point, we're committed to a regex engine of some kind. So pull + // out a prefilter if we can, which will feed to each of the constituent + // regex engines. + let pre = if info.is_always_anchored_start() { + // PERF: I'm not sure we necessarily want to do this... We may want to + // run a prefilter for quickly rejecting in some cases. The problem + // is that anchored searches overlap quite a bit with the use case + // of "run a regex on every line to extract data." In that case, the + // regex always matches, so running a prefilter doesn't really help us + // there. The main place where a prefilter helps in an anchored search + // is if the anchored search is not expected to match frequently. That + // is, the prefilter gives us a way to possibly reject a haystack very + // quickly. + // + // Maybe we should do use a prefilter, but only for longer haystacks? + // Or maybe we should only use a prefilter when we think it's "fast"? + // + // Interestingly, I think we currently lack the infrastructure for + // disabling a prefilter based on haystack length. That would probably + // need to be a new 'Input' option. (Interestingly, an 'Input' used to + // carry a 'Prefilter' with it, but I moved away from that.) + debug!("skipping literal extraction since regex is anchored"); + None + } else if let Some(pre) = info.config().get_prefilter() { + debug!( + "skipping literal extraction since the caller provided a prefilter" + ); + Some(pre.clone()) + } else if info.config().get_auto_prefilter() { + let kind = info.config().get_match_kind(); + let prefixes = crate::util::prefilter::prefixes(kind, hirs); + // If we can build a full `Strategy` from just the extracted prefixes, + // then we can short-circuit and avoid building a regex engine at all. + if let Some(pre) = Pre::from_prefixes(info, &prefixes) { + debug!( + "found that the regex can be broken down to a literal \ + search, avoiding the regex engine entirely", + ); + return Ok(pre); + } + // This now attempts another short-circuit of the regex engine: if we + // have a huge alternation of just plain literals, then we can just use + // Aho-Corasick for that and avoid the regex engine entirely. + // + // You might think this case would just be handled by + // `Pre::from_prefixes`, but that technique relies on heuristic literal + // extraction from the corresponding `Hir`. That works, but part of + // heuristics limit the size and number of literals returned. This case + // will specifically handle patterns with very large alternations. + // + // One wonders if we should just roll this our heuristic literal + // extraction, and then I think this case could disappear entirely. + if let Some(pre) = Pre::from_alternation_literals(info, hirs) { + debug!( + "found plain alternation of literals, \ + avoiding regex engine entirely and using Aho-Corasick" + ); + return Ok(pre); + } + prefixes.literals().and_then(|strings| { + debug!( + "creating prefilter from {} literals: {:?}", + strings.len(), + strings, + ); + Prefilter::new(kind, strings) + }) + } else { + debug!("skipping literal extraction since prefilters were disabled"); + None + }; + let mut core = Core::new(info.clone(), pre.clone(), hirs)?; + // Now that we have our core regex engines built, there are a few cases + // where we can do a little bit better than just a normal "search forward + // and maybe use a prefilter when in a start state." However, these cases + // may not always work or otherwise build on top of the Core searcher. + // For example, the reverse anchored optimization seems like it might + // always work, but only the DFAs support reverse searching and the DFAs + // might give up or quit for reasons. If we had, e.g., a PikeVM that + // supported reverse searching, then we could avoid building a full Core + // engine for this case. + core = match ReverseAnchored::new(core) { + Err(core) => core, + Ok(ra) => { + debug!("using reverse anchored strategy"); + return Ok(Arc::new(ra)); + } + }; + core = match ReverseSuffix::new(core, hirs) { + Err(core) => core, + Ok(rs) => { + debug!("using reverse suffix strategy"); + return Ok(Arc::new(rs)); + } + }; + core = match ReverseInner::new(core, hirs) { + Err(core) => core, + Ok(ri) => { + debug!("using reverse inner strategy"); + return Ok(Arc::new(ri)); + } + }; + debug!("using core strategy"); + Ok(Arc::new(core)) +} + +#[derive(Clone, Debug)] +struct Pre<P> { + pre: P, + group_info: GroupInfo, +} + +impl<P: PrefilterI> Pre<P> { + fn new(pre: P) -> Arc<dyn Strategy> { + // The only thing we support when we use prefilters directly as a + // strategy is the start and end of the overall match for a single + // pattern. In other words, exactly one implicit capturing group. Which + // is exactly what we use here for a GroupInfo. + let group_info = GroupInfo::new([[None::<&str>]]).unwrap(); + Arc::new(Pre { pre, group_info }) + } +} + +// This is a little weird, but we don't actually care about the type parameter +// here because we're selecting which underlying prefilter to use. So we just +// define it on an arbitrary type. +impl Pre<()> { + /// Given a sequence of prefixes, attempt to return a full `Strategy` using + /// just the prefixes. + /// + /// Basically, this occurs when the prefixes given not just prefixes, + /// but an enumeration of the entire language matched by the regular + /// expression. + /// + /// A number of other conditions need to be true too. For example, there + /// can be only one pattern, the number of explicit capture groups is 0, no + /// look-around assertions and so on. + /// + /// Note that this ignores `Config::get_auto_prefilter` because if this + /// returns something, then it isn't a prefilter but a matcher itself. + /// Therefore, it shouldn't suffer from the problems typical to prefilters + /// (such as a high false positive rate). + fn from_prefixes( + info: &RegexInfo, + prefixes: &literal::Seq, + ) -> Option<Arc<dyn Strategy>> { + let kind = info.config().get_match_kind(); + // Check to see if our prefixes are exact, which means we might be + // able to bypass the regex engine entirely and just rely on literal + // searches. + if !prefixes.is_exact() { + return None; + } + // We also require that we have a single regex pattern. Namely, + // we reuse the prefilter infrastructure to implement search and + // prefilters only report spans. Prefilters don't know about pattern + // IDs. The multi-regex case isn't a lost cause, we might still use + // Aho-Corasick and we might still just use a regular prefilter, but + // that's done below. + if info.pattern_len() != 1 { + return None; + } + // We can't have any capture groups either. The literal engines don't + // know how to deal with things like '(foo)(bar)'. In that case, a + // prefilter will just be used and then the regex engine will resolve + // the capture groups. + if info.props()[0].explicit_captures_len() != 0 { + return None; + } + // We also require that it has zero look-around assertions. Namely, + // literal extraction treats look-around assertions as if they match + // *every* empty string. But of course, that isn't true. So for + // example, 'foo\bquux' never matches anything, but 'fooquux' is + // extracted from that as an exact literal. Such cases should just run + // the regex engine. 'fooquux' will be used as a normal prefilter, and + // then the regex engine will try to look for an actual match. + if !info.props()[0].look_set().is_empty() { + return None; + } + // Finally, currently, our prefilters are all oriented around + // leftmost-first match semantics, so don't try to use them if the + // caller asked for anything else. + if kind != MatchKind::LeftmostFirst { + return None; + } + // The above seems like a lot of requirements to meet, but it applies + // to a lot of cases. 'foo', '[abc][123]' and 'foo|bar|quux' all meet + // the above criteria, for example. + // + // Note that this is effectively a latency optimization. If we didn't + // do this, then the extracted literals would still get bundled into + // a prefilter, and every regex engine capable of running unanchored + // searches supports prefilters. So this optimization merely sidesteps + // having to run the regex engine at all to confirm the match. Thus, it + // decreases the latency of a match. + + // OK because we know the set is exact and thus finite. + let prefixes = prefixes.literals().unwrap(); + debug!( + "trying to bypass regex engine by creating \ + prefilter from {} literals: {:?}", + prefixes.len(), + prefixes, + ); + let choice = match prefilter::Choice::new(kind, prefixes) { + Some(choice) => choice, + None => { + debug!( + "regex bypass failed because no prefilter could be built" + ); + return None; + } + }; + let strat: Arc<dyn Strategy> = match choice { + prefilter::Choice::Memchr(pre) => Pre::new(pre), + prefilter::Choice::Memchr2(pre) => Pre::new(pre), + prefilter::Choice::Memchr3(pre) => Pre::new(pre), + prefilter::Choice::Memmem(pre) => Pre::new(pre), + prefilter::Choice::Teddy(pre) => Pre::new(pre), + prefilter::Choice::ByteSet(pre) => Pre::new(pre), + prefilter::Choice::AhoCorasick(pre) => Pre::new(pre), + }; + Some(strat) + } + + /// Attempts to extract an alternation of literals, and if it's deemed + /// worth doing, returns an Aho-Corasick prefilter as a strategy. + /// + /// And currently, this only returns something when 'hirs.len() == 1'. This + /// could in theory do something if there are multiple HIRs where all of + /// them are alternation of literals, but I haven't had the time to go down + /// that path yet. + fn from_alternation_literals( + info: &RegexInfo, + hirs: &[&Hir], + ) -> Option<Arc<dyn Strategy>> { + use crate::util::prefilter::AhoCorasick; + + let lits = crate::meta::literal::alternation_literals(info, hirs)?; + let ac = AhoCorasick::new(MatchKind::LeftmostFirst, &lits)?; + Some(Pre::new(ac)) + } +} + +// This implements Strategy for anything that implements PrefilterI. +// +// Note that this must only be used for regexes of length 1. Multi-regexes +// don't work here. The prefilter interface only provides the span of a match +// and not the pattern ID. (I did consider making it more expressive, but I +// couldn't figure out how to tie everything together elegantly.) Thus, so long +// as the regex only contains one pattern, we can simply assume that a match +// corresponds to PatternID::ZERO. And indeed, that's what we do here. +// +// In practice, since this impl is used to report matches directly and thus +// completely bypasses the regex engine, we only wind up using this under the +// following restrictions: +// +// * There must be only one pattern. As explained above. +// * The literal sequence must be finite and only contain exact literals. +// * There must not be any look-around assertions. If there are, the literals +// extracted might be exact, but a match doesn't necessarily imply an overall +// match. As a trivial example, 'foo\bbar' does not match 'foobar'. +// * The pattern must not have any explicit capturing groups. If it does, the +// caller might expect them to be resolved. e.g., 'foo(bar)'. +// +// So when all of those things are true, we use a prefilter directly as a +// strategy. +// +// In the case where the number of patterns is more than 1, we don't use this +// but do use a special Aho-Corasick strategy if all of the regexes are just +// simple literals or alternations of literals. (We also use the Aho-Corasick +// strategy when len(patterns)==1 if the number of literals is large. In that +// case, literal extraction gives up and will return an infinite set.) +impl<P: PrefilterI> Strategy for Pre<P> { + fn group_info(&self) -> &GroupInfo { + &self.group_info + } + + fn create_cache(&self) -> Cache { + Cache { + capmatches: Captures::all(self.group_info().clone()), + pikevm: wrappers::PikeVMCache::none(), + backtrack: wrappers::BoundedBacktrackerCache::none(), + onepass: wrappers::OnePassCache::none(), + hybrid: wrappers::HybridCache::none(), + revhybrid: wrappers::ReverseHybridCache::none(), + } + } + + fn reset_cache(&self, _cache: &mut Cache) {} + + fn is_accelerated(&self) -> bool { + self.pre.is_fast() + } + + fn memory_usage(&self) -> usize { + self.pre.memory_usage() + } + + fn search(&self, _cache: &mut Cache, input: &Input<'_>) -> Option<Match> { + if input.is_done() { + return None; + } + if input.get_anchored().is_anchored() { + return self + .pre + .prefix(input.haystack(), input.get_span()) + .map(|sp| Match::new(PatternID::ZERO, sp)); + } + self.pre + .find(input.haystack(), input.get_span()) + .map(|sp| Match::new(PatternID::ZERO, sp)) + } + + fn search_half( + &self, + cache: &mut Cache, + input: &Input<'_>, + ) -> Option<HalfMatch> { + self.search(cache, input).map(|m| HalfMatch::new(m.pattern(), m.end())) + } + + fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool { + self.search(cache, input).is_some() + } + + fn search_slots( + &self, + cache: &mut Cache, + input: &Input<'_>, + slots: &mut [Option<NonMaxUsize>], + ) -> Option<PatternID> { + let m = self.search(cache, input)?; + if let Some(slot) = slots.get_mut(0) { + *slot = NonMaxUsize::new(m.start()); + } + if let Some(slot) = slots.get_mut(1) { + *slot = NonMaxUsize::new(m.end()); + } + Some(m.pattern()) + } + + fn which_overlapping_matches( + &self, + cache: &mut Cache, + input: &Input<'_>, + patset: &mut PatternSet, + ) { + if self.search(cache, input).is_some() { + patset.insert(PatternID::ZERO); + } + } +} + +#[derive(Debug)] +struct Core { + info: RegexInfo, + pre: Option<Prefilter>, + nfa: NFA, + nfarev: Option<NFA>, + pikevm: wrappers::PikeVM, + backtrack: wrappers::BoundedBacktracker, + onepass: wrappers::OnePass, + hybrid: wrappers::Hybrid, + dfa: wrappers::DFA, +} + +impl Core { + fn new( + info: RegexInfo, + pre: Option<Prefilter>, + hirs: &[&Hir], + ) -> Result<Core, BuildError> { + let mut lookm = LookMatcher::new(); + lookm.set_line_terminator(info.config().get_line_terminator()); + let thompson_config = thompson::Config::new() + .utf8(info.config().get_utf8_empty()) + .nfa_size_limit(info.config().get_nfa_size_limit()) + .shrink(false) + .which_captures(info.config().get_which_captures()) + .look_matcher(lookm); + let nfa = thompson::Compiler::new() + .configure(thompson_config.clone()) + .build_many_from_hir(hirs) + .map_err(BuildError::nfa)?; + // It's possible for the PikeVM or the BB to fail to build, even though + // at this point, we already have a full NFA in hand. They can fail + // when a Unicode word boundary is used but where Unicode word boundary + // support is disabled at compile time, thus making it impossible to + // match. (Construction can also fail if the NFA was compiled without + // captures, but we always enable that above.) + let pikevm = wrappers::PikeVM::new(&info, pre.clone(), &nfa)?; + let backtrack = + wrappers::BoundedBacktracker::new(&info, pre.clone(), &nfa)?; + // The onepass engine can of course fail to build, but we expect it to + // fail in many cases because it is an optimization that doesn't apply + // to all regexes. The 'OnePass' wrapper encapsulates this failure (and + // logs a message if it occurs). + let onepass = wrappers::OnePass::new(&info, &nfa); + // We try to encapsulate whether a particular regex engine should be + // used within each respective wrapper, but the DFAs need a reverse NFA + // to build itself, and we really do not want to build a reverse NFA if + // we know we aren't going to use the lazy DFA. So we do a config check + // up front, which is in practice the only way we won't try to use the + // DFA. + let (nfarev, hybrid, dfa) = + if !info.config().get_hybrid() && !info.config().get_dfa() { + (None, wrappers::Hybrid::none(), wrappers::DFA::none()) + } else { + // FIXME: Technically, we don't quite yet KNOW that we need + // a reverse NFA. It's possible for the DFAs below to both + // fail to build just based on the forward NFA. In which case, + // building the reverse NFA was totally wasted work. But... + // fixing this requires breaking DFA construction apart into + // two pieces: one for the forward part and another for the + // reverse part. Quite annoying. Making it worse, when building + // both DFAs fails, it's quite likely that the NFA is large and + // that it will take quite some time to build the reverse NFA + // too. So... it's really probably worth it to do this! + let nfarev = thompson::Compiler::new() + // Currently, reverse NFAs don't support capturing groups, + // so we MUST disable them. But even if we didn't have to, + // we would, because nothing in this crate does anything + // useful with capturing groups in reverse. And of course, + // the lazy DFA ignores capturing groups in all cases. + .configure( + thompson_config + .clone() + .which_captures(WhichCaptures::None) + .reverse(true), + ) + .build_many_from_hir(hirs) + .map_err(BuildError::nfa)?; + let dfa = if !info.config().get_dfa() { + wrappers::DFA::none() + } else { + wrappers::DFA::new(&info, pre.clone(), &nfa, &nfarev) + }; + let hybrid = if !info.config().get_hybrid() { + wrappers::Hybrid::none() + } else if dfa.is_some() { + debug!("skipping lazy DFA because we have a full DFA"); + wrappers::Hybrid::none() + } else { + wrappers::Hybrid::new(&info, pre.clone(), &nfa, &nfarev) + }; + (Some(nfarev), hybrid, dfa) + }; + Ok(Core { + info, + pre, + nfa, + nfarev, + pikevm, + backtrack, + onepass, + hybrid, + dfa, + }) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn try_search_mayfail( + &self, + cache: &mut Cache, + input: &Input<'_>, + ) -> Option<Result<Option<Match>, RetryFailError>> { + if let Some(e) = self.dfa.get(input) { + trace!("using full DFA for search at {:?}", input.get_span()); + Some(e.try_search(input)) + } else if let Some(e) = self.hybrid.get(input) { + trace!("using lazy DFA for search at {:?}", input.get_span()); + Some(e.try_search(&mut cache.hybrid, input)) + } else { + None + } + } + + fn search_nofail( + &self, + cache: &mut Cache, + input: &Input<'_>, + ) -> Option<Match> { + let caps = &mut cache.capmatches; + caps.set_pattern(None); + // We manually inline 'try_search_slots_nofail' here because we need to + // borrow from 'cache.capmatches' in this method, but if we do, then + // we can't pass 'cache' wholesale to to 'try_slots_no_hybrid'. It's a + // classic example of how the borrow checker inhibits decomposition. + // There are of course work-arounds (more types and/or interior + // mutability), but that's more annoying than this IMO. + let pid = if let Some(ref e) = self.onepass.get(input) { + trace!("using OnePass for search at {:?}", input.get_span()); + e.search_slots(&mut cache.onepass, input, caps.slots_mut()) + } else if let Some(ref e) = self.backtrack.get(input) { + trace!( + "using BoundedBacktracker for search at {:?}", + input.get_span() + ); + e.search_slots(&mut cache.backtrack, input, caps.slots_mut()) + } else { + trace!("using PikeVM for search at {:?}", input.get_span()); + let e = self.pikevm.get(); + e.search_slots(&mut cache.pikevm, input, caps.slots_mut()) + }; + caps.set_pattern(pid); + caps.get_match() + } + + fn search_half_nofail( + &self, + cache: &mut Cache, + input: &Input<'_>, + ) -> Option<HalfMatch> { + // Only the lazy/full DFA returns half-matches, since the DFA requires + // a reverse scan to find the start position. These fallback regex + // engines can find the start and end in a single pass, so we just do + // that and throw away the start offset to conform to the API. + let m = self.search_nofail(cache, input)?; + Some(HalfMatch::new(m.pattern(), m.end())) + } + + fn search_slots_nofail( + &self, + cache: &mut Cache, + input: &Input<'_>, + slots: &mut [Option<NonMaxUsize>], + ) -> Option<PatternID> { + if let Some(ref e) = self.onepass.get(input) { + trace!( + "using OnePass for capture search at {:?}", + input.get_span() + ); + e.search_slots(&mut cache.onepass, input, slots) + } else if let Some(ref e) = self.backtrack.get(input) { + trace!( + "using BoundedBacktracker for capture search at {:?}", + input.get_span() + ); + e.search_slots(&mut cache.backtrack, input, slots) + } else { + trace!( + "using PikeVM for capture search at {:?}", + input.get_span() + ); + let e = self.pikevm.get(); + e.search_slots(&mut cache.pikevm, input, slots) + } + } + + fn is_match_nofail(&self, cache: &mut Cache, input: &Input<'_>) -> bool { + if let Some(ref e) = self.onepass.get(input) { + trace!( + "using OnePass for is-match search at {:?}", + input.get_span() + ); + e.search_slots(&mut cache.onepass, input, &mut []).is_some() + } else if let Some(ref e) = self.backtrack.get(input) { + trace!( + "using BoundedBacktracker for is-match search at {:?}", + input.get_span() + ); + e.is_match(&mut cache.backtrack, input) + } else { + trace!( + "using PikeVM for is-match search at {:?}", + input.get_span() + ); + let e = self.pikevm.get(); + e.is_match(&mut cache.pikevm, input) + } + } + + fn is_capture_search_needed(&self, slots_len: usize) -> bool { + slots_len > self.nfa.group_info().implicit_slot_len() + } +} + +impl Strategy for Core { + #[cfg_attr(feature = "perf-inline", inline(always))] + fn group_info(&self) -> &GroupInfo { + self.nfa.group_info() + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn create_cache(&self) -> Cache { + Cache { + capmatches: Captures::all(self.group_info().clone()), + pikevm: self.pikevm.create_cache(), + backtrack: self.backtrack.create_cache(), + onepass: self.onepass.create_cache(), + hybrid: self.hybrid.create_cache(), + revhybrid: wrappers::ReverseHybridCache::none(), + } + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn reset_cache(&self, cache: &mut Cache) { + cache.pikevm.reset(&self.pikevm); + cache.backtrack.reset(&self.backtrack); + cache.onepass.reset(&self.onepass); + cache.hybrid.reset(&self.hybrid); + } + + fn is_accelerated(&self) -> bool { + self.pre.as_ref().map_or(false, |pre| pre.is_fast()) + } + + fn memory_usage(&self) -> usize { + self.info.memory_usage() + + self.pre.as_ref().map_or(0, |pre| pre.memory_usage()) + + self.nfa.memory_usage() + + self.nfarev.as_ref().map_or(0, |nfa| nfa.memory_usage()) + + self.onepass.memory_usage() + + self.dfa.memory_usage() + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn search(&self, cache: &mut Cache, input: &Input<'_>) -> Option<Match> { + // We manually inline try_search_mayfail here because letting the + // compiler do it seems to produce pretty crappy codegen. + return if let Some(e) = self.dfa.get(input) { + trace!("using full DFA for full search at {:?}", input.get_span()); + match e.try_search(input) { + Ok(x) => x, + Err(_err) => { + trace!("full DFA search failed: {}", _err); + self.search_nofail(cache, input) + } + } + } else if let Some(e) = self.hybrid.get(input) { + trace!("using lazy DFA for full search at {:?}", input.get_span()); + match e.try_search(&mut cache.hybrid, input) { + Ok(x) => x, + Err(_err) => { + trace!("lazy DFA search failed: {}", _err); + self.search_nofail(cache, input) + } + } + } else { + self.search_nofail(cache, input) + }; + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn search_half( + &self, + cache: &mut Cache, + input: &Input<'_>, + ) -> Option<HalfMatch> { + // The main difference with 'search' is that if we're using a DFA, we + // can use a single forward scan without needing to run the reverse + // DFA. + if let Some(e) = self.dfa.get(input) { + trace!("using full DFA for half search at {:?}", input.get_span()); + match e.try_search_half_fwd(input) { + Ok(x) => x, + Err(_err) => { + trace!("full DFA half search failed: {}", _err); + self.search_half_nofail(cache, input) + } + } + } else if let Some(e) = self.hybrid.get(input) { + trace!("using lazy DFA for half search at {:?}", input.get_span()); + match e.try_search_half_fwd(&mut cache.hybrid, input) { + Ok(x) => x, + Err(_err) => { + trace!("lazy DFA half search failed: {}", _err); + self.search_half_nofail(cache, input) + } + } + } else { + self.search_half_nofail(cache, input) + } + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool { + if let Some(e) = self.dfa.get(input) { + trace!( + "using full DFA for is-match search at {:?}", + input.get_span() + ); + match e.try_search_half_fwd(input) { + Ok(x) => x.is_some(), + Err(_err) => { + trace!("full DFA half search failed: {}", _err); + self.is_match_nofail(cache, input) + } + } + } else if let Some(e) = self.hybrid.get(input) { + trace!( + "using lazy DFA for is-match search at {:?}", + input.get_span() + ); + match e.try_search_half_fwd(&mut cache.hybrid, input) { + Ok(x) => x.is_some(), + Err(_err) => { + trace!("lazy DFA half search failed: {}", _err); + self.is_match_nofail(cache, input) + } + } + } else { + self.is_match_nofail(cache, input) + } + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn search_slots( + &self, + cache: &mut Cache, + input: &Input<'_>, + slots: &mut [Option<NonMaxUsize>], + ) -> Option<PatternID> { + // Even if the regex has explicit capture groups, if the caller didn't + // provide any explicit slots, then it doesn't make sense to try and do + // extra work to get offsets for those slots. Ideally the caller should + // realize this and not call this routine in the first place, but alas, + // we try to save the caller from themselves if they do. + if !self.is_capture_search_needed(slots.len()) { + trace!("asked for slots unnecessarily, trying fast path"); + let m = self.search(cache, input)?; + copy_match_to_slots(m, slots); + return Some(m.pattern()); + } + // If the onepass DFA is available for this search (which only happens + // when it's anchored), then skip running a fallible DFA. The onepass + // DFA isn't as fast as a full or lazy DFA, but it is typically quite + // a bit faster than the backtracker or the PikeVM. So it isn't as + // advantageous to try and do a full/lazy DFA scan first. + // + // We still theorize that it's better to do a full/lazy DFA scan, even + // when it's anchored, because it's usually much faster and permits us + // to say "no match" much more quickly. This does hurt the case of, + // say, parsing each line in a log file into capture groups, because + // in that case, the line always matches. So the lazy DFA scan is + // usually just wasted work. But, the lazy DFA is usually quite fast + // and doesn't cost too much here. + if self.onepass.get(&input).is_some() { + return self.search_slots_nofail(cache, &input, slots); + } + let m = match self.try_search_mayfail(cache, input) { + Some(Ok(Some(m))) => m, + Some(Ok(None)) => return None, + Some(Err(_err)) => { + trace!("fast capture search failed: {}", _err); + return self.search_slots_nofail(cache, input, slots); + } + None => { + return self.search_slots_nofail(cache, input, slots); + } + }; + // At this point, now that we've found the bounds of the + // match, we need to re-run something that can resolve + // capturing groups. But we only need to run on it on the + // match bounds and not the entire haystack. + trace!( + "match found at {}..{} in capture search, \ + using another engine to find captures", + m.start(), + m.end(), + ); + let input = input + .clone() + .span(m.start()..m.end()) + .anchored(Anchored::Pattern(m.pattern())); + Some( + self.search_slots_nofail(cache, &input, slots) + .expect("should find a match"), + ) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn which_overlapping_matches( + &self, + cache: &mut Cache, + input: &Input<'_>, + patset: &mut PatternSet, + ) { + if let Some(e) = self.dfa.get(input) { + trace!( + "using full DFA for overlapping search at {:?}", + input.get_span() + ); + let _err = match e.try_which_overlapping_matches(input, patset) { + Ok(()) => return, + Err(err) => err, + }; + trace!("fast overlapping search failed: {}", _err); + } else if let Some(e) = self.hybrid.get(input) { + trace!( + "using lazy DFA for overlapping search at {:?}", + input.get_span() + ); + let _err = match e.try_which_overlapping_matches( + &mut cache.hybrid, + input, + patset, + ) { + Ok(()) => { + return; + } + Err(err) => err, + }; + trace!("fast overlapping search failed: {}", _err); + } + trace!( + "using PikeVM for overlapping search at {:?}", + input.get_span() + ); + let e = self.pikevm.get(); + e.which_overlapping_matches(&mut cache.pikevm, input, patset) + } +} + +#[derive(Debug)] +struct ReverseAnchored { + core: Core, +} + +impl ReverseAnchored { + fn new(core: Core) -> Result<ReverseAnchored, Core> { + if !core.info.is_always_anchored_end() { + debug!( + "skipping reverse anchored optimization because \ + the regex is not always anchored at the end" + ); + return Err(core); + } + // Note that the caller can still request an anchored search even when + // the regex isn't anchored at the start. We detect that case in the + // search routines below and just fallback to the core engine. This + // is fine because both searches are anchored. It's just a matter of + // picking one. Falling back to the core engine is a little simpler, + // since if we used the reverse anchored approach, we'd have to add an + // extra check to ensure the match reported starts at the place where + // the caller requested the search to start. + if core.info.is_always_anchored_start() { + debug!( + "skipping reverse anchored optimization because \ + the regex is also anchored at the start" + ); + return Err(core); + } + // Only DFAs can do reverse searches (currently), so we need one of + // them in order to do this optimization. It's possible (although + // pretty unlikely) that we have neither and need to give up. + if !core.hybrid.is_some() && !core.dfa.is_some() { + debug!( + "skipping reverse anchored optimization because \ + we don't have a lazy DFA or a full DFA" + ); + return Err(core); + } + Ok(ReverseAnchored { core }) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn try_search_half_anchored_rev( + &self, + cache: &mut Cache, + input: &Input<'_>, + ) -> Result<Option<HalfMatch>, RetryFailError> { + // We of course always want an anchored search. In theory, the + // underlying regex engines should automatically enable anchored + // searches since the regex is itself anchored, but this more clearly + // expresses intent and is always correct. + let input = input.clone().anchored(Anchored::Yes); + if let Some(e) = self.core.dfa.get(&input) { + trace!( + "using full DFA for reverse anchored search at {:?}", + input.get_span() + ); + e.try_search_half_rev(&input) + } else if let Some(e) = self.core.hybrid.get(&input) { + trace!( + "using lazy DFA for reverse anchored search at {:?}", + input.get_span() + ); + e.try_search_half_rev(&mut cache.hybrid, &input) + } else { + unreachable!("ReverseAnchored always has a DFA") + } + } +} + +// Note that in this impl, we don't check that 'input.end() == +// input.haystack().len()'. In particular, when that condition is false, a +// match is always impossible because we know that the regex is always anchored +// at the end (or else 'ReverseAnchored' won't be built). We don't check that +// here because the 'Regex' wrapper actually does that for us in all cases. +// Thus, in this impl, we can actually assume that the end position in 'input' +// is equivalent to the length of the haystack. +impl Strategy for ReverseAnchored { + #[cfg_attr(feature = "perf-inline", inline(always))] + fn group_info(&self) -> &GroupInfo { + self.core.group_info() + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn create_cache(&self) -> Cache { + self.core.create_cache() + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn reset_cache(&self, cache: &mut Cache) { + self.core.reset_cache(cache); + } + + fn is_accelerated(&self) -> bool { + // Since this is anchored at the end, a reverse anchored search is + // almost certainly guaranteed to result in a much faster search than + // a standard forward search. + true + } + + fn memory_usage(&self) -> usize { + self.core.memory_usage() + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn search(&self, cache: &mut Cache, input: &Input<'_>) -> Option<Match> { + if input.get_anchored().is_anchored() { + return self.core.search(cache, input); + } + match self.try_search_half_anchored_rev(cache, input) { + Err(_err) => { + trace!("fast reverse anchored search failed: {}", _err); + self.core.search_nofail(cache, input) + } + Ok(None) => None, + Ok(Some(hm)) => { + Some(Match::new(hm.pattern(), hm.offset()..input.end())) + } + } + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn search_half( + &self, + cache: &mut Cache, + input: &Input<'_>, + ) -> Option<HalfMatch> { + if input.get_anchored().is_anchored() { + return self.core.search_half(cache, input); + } + match self.try_search_half_anchored_rev(cache, input) { + Err(_err) => { + trace!("fast reverse anchored search failed: {}", _err); + self.core.search_half_nofail(cache, input) + } + Ok(None) => None, + Ok(Some(hm)) => { + // Careful here! 'try_search_half' is a *forward* search that + // only cares about the *end* position of a match. But + // 'hm.offset()' is actually the start of the match. So we + // actually just throw that away here and, since we know we + // have a match, return the only possible position at which a + // match can occur: input.end(). + Some(HalfMatch::new(hm.pattern(), input.end())) + } + } + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool { + if input.get_anchored().is_anchored() { + return self.core.is_match(cache, input); + } + match self.try_search_half_anchored_rev(cache, input) { + Err(_err) => { + trace!("fast reverse anchored search failed: {}", _err); + self.core.is_match_nofail(cache, input) + } + Ok(None) => false, + Ok(Some(_)) => true, + } + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn search_slots( + &self, + cache: &mut Cache, + input: &Input<'_>, + slots: &mut [Option<NonMaxUsize>], + ) -> Option<PatternID> { + if input.get_anchored().is_anchored() { + return self.core.search_slots(cache, input, slots); + } + match self.try_search_half_anchored_rev(cache, input) { + Err(_err) => { + trace!("fast reverse anchored search failed: {}", _err); + self.core.search_slots_nofail(cache, input, slots) + } + Ok(None) => None, + Ok(Some(hm)) => { + if !self.core.is_capture_search_needed(slots.len()) { + trace!("asked for slots unnecessarily, skipping captures"); + let m = Match::new(hm.pattern(), hm.offset()..input.end()); + copy_match_to_slots(m, slots); + return Some(m.pattern()); + } + let start = hm.offset(); + let input = input + .clone() + .span(start..input.end()) + .anchored(Anchored::Pattern(hm.pattern())); + self.core.search_slots_nofail(cache, &input, slots) + } + } + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn which_overlapping_matches( + &self, + cache: &mut Cache, + input: &Input<'_>, + patset: &mut PatternSet, + ) { + // It seems like this could probably benefit from a reverse anchored + // optimization, perhaps by doing an overlapping reverse search (which + // the DFAs do support). I haven't given it much thought though, and + // I'm currently focus more on the single pattern case. + self.core.which_overlapping_matches(cache, input, patset) + } +} + +#[derive(Debug)] +struct ReverseSuffix { + core: Core, + pre: Prefilter, +} + +impl ReverseSuffix { + fn new(core: Core, hirs: &[&Hir]) -> Result<ReverseSuffix, Core> { + if !core.info.config().get_auto_prefilter() { + debug!( + "skipping reverse suffix optimization because \ + automatic prefilters are disabled" + ); + return Err(core); + } + // Like the reverse inner optimization, we don't do this for regexes + // that are always anchored. It could lead to scanning too much, but + // could say "no match" much more quickly than running the regex + // engine if the initial literal scan doesn't match. With that said, + // the reverse suffix optimization has lower overhead, since it only + // requires a reverse scan after a literal match to confirm or reject + // the match. (Although, in the case of confirmation, it then needs to + // do another forward scan to find the end position.) + // + // Note that the caller can still request an anchored search even + // when the regex isn't anchored. We detect that case in the search + // routines below and just fallback to the core engine. Currently this + // optimization assumes all searches are unanchored, so if we do want + // to enable this optimization for anchored searches, it will need a + // little work to support it. + if core.info.is_always_anchored_start() { + debug!( + "skipping reverse suffix optimization because \ + the regex is always anchored at the start", + ); + return Err(core); + } + // Only DFAs can do reverse searches (currently), so we need one of + // them in order to do this optimization. It's possible (although + // pretty unlikely) that we have neither and need to give up. + if !core.hybrid.is_some() && !core.dfa.is_some() { + debug!( + "skipping reverse suffix optimization because \ + we don't have a lazy DFA or a full DFA" + ); + return Err(core); + } + if core.pre.as_ref().map_or(false, |p| p.is_fast()) { + debug!( + "skipping reverse suffix optimization because \ + we already have a prefilter that we think is fast" + ); + return Err(core); + } + let kind = core.info.config().get_match_kind(); + let suffixes = crate::util::prefilter::suffixes(kind, hirs); + let lcs = match suffixes.longest_common_suffix() { + None => { + debug!( + "skipping reverse suffix optimization because \ + a longest common suffix could not be found", + ); + return Err(core); + } + Some(lcs) if lcs.is_empty() => { + debug!( + "skipping reverse suffix optimization because \ + the longest common suffix is the empty string", + ); + return Err(core); + } + Some(lcs) => lcs, + }; + let pre = match Prefilter::new(kind, &[lcs]) { + Some(pre) => pre, + None => { + debug!( + "skipping reverse suffix optimization because \ + a prefilter could not be constructed from the \ + longest common suffix", + ); + return Err(core); + } + }; + if !pre.is_fast() { + debug!( + "skipping reverse suffix optimization because \ + while we have a suffix prefilter, it is not \ + believed to be 'fast'" + ); + return Err(core); + } + Ok(ReverseSuffix { core, pre }) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn try_search_half_start( + &self, + cache: &mut Cache, + input: &Input<'_>, + ) -> Result<Option<HalfMatch>, RetryError> { + let mut span = input.get_span(); + let mut min_start = 0; + loop { + let litmatch = match self.pre.find(input.haystack(), span) { + None => return Ok(None), + Some(span) => span, + }; + trace!("reverse suffix scan found suffix match at {:?}", litmatch); + let revinput = input + .clone() + .anchored(Anchored::Yes) + .span(input.start()..litmatch.end); + match self + .try_search_half_rev_limited(cache, &revinput, min_start)? + { + None => { + if span.start >= span.end { + break; + } + span.start = litmatch.start.checked_add(1).unwrap(); + } + Some(hm) => return Ok(Some(hm)), + } + min_start = litmatch.end; + } + Ok(None) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn try_search_half_fwd( + &self, + cache: &mut Cache, + input: &Input<'_>, + ) -> Result<Option<HalfMatch>, RetryFailError> { + if let Some(e) = self.core.dfa.get(&input) { + trace!( + "using full DFA for forward reverse suffix search at {:?}", + input.get_span() + ); + e.try_search_half_fwd(&input) + } else if let Some(e) = self.core.hybrid.get(&input) { + trace!( + "using lazy DFA for forward reverse suffix search at {:?}", + input.get_span() + ); + e.try_search_half_fwd(&mut cache.hybrid, &input) + } else { + unreachable!("ReverseSuffix always has a DFA") + } + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn try_search_half_rev_limited( + &self, + cache: &mut Cache, + input: &Input<'_>, + min_start: usize, + ) -> Result<Option<HalfMatch>, RetryError> { + if let Some(e) = self.core.dfa.get(&input) { + trace!( + "using full DFA for reverse suffix search at {:?}, \ + but will be stopped at {} to avoid quadratic behavior", + input.get_span(), + min_start, + ); + e.try_search_half_rev_limited(&input, min_start) + } else if let Some(e) = self.core.hybrid.get(&input) { + trace!( + "using lazy DFA for reverse inner search at {:?}, \ + but will be stopped at {} to avoid quadratic behavior", + input.get_span(), + min_start, + ); + e.try_search_half_rev_limited(&mut cache.hybrid, &input, min_start) + } else { + unreachable!("ReverseSuffix always has a DFA") + } + } +} + +impl Strategy for ReverseSuffix { + #[cfg_attr(feature = "perf-inline", inline(always))] + fn group_info(&self) -> &GroupInfo { + self.core.group_info() + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn create_cache(&self) -> Cache { + self.core.create_cache() + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn reset_cache(&self, cache: &mut Cache) { + self.core.reset_cache(cache); + } + + fn is_accelerated(&self) -> bool { + self.pre.is_fast() + } + + fn memory_usage(&self) -> usize { + self.core.memory_usage() + self.pre.memory_usage() + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn search(&self, cache: &mut Cache, input: &Input<'_>) -> Option<Match> { + if input.get_anchored().is_anchored() { + return self.core.search(cache, input); + } + match self.try_search_half_start(cache, input) { + Err(RetryError::Quadratic(_err)) => { + trace!("reverse suffix optimization failed: {}", _err); + self.core.search(cache, input) + } + Err(RetryError::Fail(_err)) => { + trace!("reverse suffix reverse fast search failed: {}", _err); + self.core.search_nofail(cache, input) + } + Ok(None) => None, + Ok(Some(hm_start)) => { + let fwdinput = input + .clone() + .anchored(Anchored::Pattern(hm_start.pattern())) + .span(hm_start.offset()..input.end()); + match self.try_search_half_fwd(cache, &fwdinput) { + Err(_err) => { + trace!( + "reverse suffix forward fast search failed: {}", + _err + ); + self.core.search_nofail(cache, input) + } + Ok(None) => { + unreachable!( + "suffix match plus reverse match implies \ + there must be a match", + ) + } + Ok(Some(hm_end)) => Some(Match::new( + hm_start.pattern(), + hm_start.offset()..hm_end.offset(), + )), + } + } + } + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn search_half( + &self, + cache: &mut Cache, + input: &Input<'_>, + ) -> Option<HalfMatch> { + if input.get_anchored().is_anchored() { + return self.core.search_half(cache, input); + } + match self.try_search_half_start(cache, input) { + Err(RetryError::Quadratic(_err)) => { + trace!("reverse suffix half optimization failed: {}", _err); + self.core.search_half(cache, input) + } + Err(RetryError::Fail(_err)) => { + trace!( + "reverse suffix reverse fast half search failed: {}", + _err + ); + self.core.search_half_nofail(cache, input) + } + Ok(None) => None, + Ok(Some(hm_start)) => { + // This is a bit subtle. It is tempting to just stop searching + // at this point and return a half-match with an offset + // corresponding to where the suffix was found. But the suffix + // match does not necessarily correspond to the end of the + // proper leftmost-first match. Consider /[a-z]+ing/ against + // 'tingling'. The first suffix match is the first 'ing', and + // the /[a-z]+/ matches the 't'. So if we stopped here, then + // we'd report 'ting' as the match. But 'tingling' is the + // correct match because of greediness. + let fwdinput = input + .clone() + .anchored(Anchored::Pattern(hm_start.pattern())) + .span(hm_start.offset()..input.end()); + match self.try_search_half_fwd(cache, &fwdinput) { + Err(_err) => { + trace!( + "reverse suffix forward fast search failed: {}", + _err + ); + self.core.search_half_nofail(cache, input) + } + Ok(None) => { + unreachable!( + "suffix match plus reverse match implies \ + there must be a match", + ) + } + Ok(Some(hm_end)) => Some(hm_end), + } + } + } + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool { + if input.get_anchored().is_anchored() { + return self.core.is_match(cache, input); + } + match self.try_search_half_start(cache, input) { + Err(RetryError::Quadratic(_err)) => { + trace!("reverse suffix half optimization failed: {}", _err); + self.core.is_match_nofail(cache, input) + } + Err(RetryError::Fail(_err)) => { + trace!( + "reverse suffix reverse fast half search failed: {}", + _err + ); + self.core.is_match_nofail(cache, input) + } + Ok(None) => false, + Ok(Some(_)) => true, + } + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn search_slots( + &self, + cache: &mut Cache, + input: &Input<'_>, + slots: &mut [Option<NonMaxUsize>], + ) -> Option<PatternID> { + if input.get_anchored().is_anchored() { + return self.core.search_slots(cache, input, slots); + } + if !self.core.is_capture_search_needed(slots.len()) { + trace!("asked for slots unnecessarily, trying fast path"); + let m = self.search(cache, input)?; + copy_match_to_slots(m, slots); + return Some(m.pattern()); + } + let hm_start = match self.try_search_half_start(cache, input) { + Err(RetryError::Quadratic(_err)) => { + trace!( + "reverse suffix captures optimization failed: {}", + _err + ); + return self.core.search_slots(cache, input, slots); + } + Err(RetryError::Fail(_err)) => { + trace!( + "reverse suffix reverse fast captures search failed: {}", + _err + ); + return self.core.search_slots_nofail(cache, input, slots); + } + Ok(None) => return None, + Ok(Some(hm_start)) => hm_start, + }; + trace!( + "match found at {}..{} in capture search, \ + using another engine to find captures", + hm_start.offset(), + input.end(), + ); + let start = hm_start.offset(); + let input = input + .clone() + .span(start..input.end()) + .anchored(Anchored::Pattern(hm_start.pattern())); + self.core.search_slots_nofail(cache, &input, slots) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn which_overlapping_matches( + &self, + cache: &mut Cache, + input: &Input<'_>, + patset: &mut PatternSet, + ) { + self.core.which_overlapping_matches(cache, input, patset) + } +} + +#[derive(Debug)] +struct ReverseInner { + core: Core, + preinner: Prefilter, + nfarev: NFA, + hybrid: wrappers::ReverseHybrid, + dfa: wrappers::ReverseDFA, +} + +impl ReverseInner { + fn new(core: Core, hirs: &[&Hir]) -> Result<ReverseInner, Core> { + if !core.info.config().get_auto_prefilter() { + debug!( + "skipping reverse inner optimization because \ + automatic prefilters are disabled" + ); + return Err(core); + } + // Currently we hard-code the assumption of leftmost-first match + // semantics. This isn't a huge deal because 'all' semantics tend to + // only be used for forward overlapping searches with multiple regexes, + // and this optimization only supports a single pattern at the moment. + if core.info.config().get_match_kind() != MatchKind::LeftmostFirst { + debug!( + "skipping reverse inner optimization because \ + match kind is {:?} but this only supports leftmost-first", + core.info.config().get_match_kind(), + ); + return Err(core); + } + // It's likely that a reverse inner scan has too much overhead for it + // to be worth it when the regex is anchored at the start. It is + // possible for it to be quite a bit faster if the initial literal + // scan fails to detect a match, in which case, we can say "no match" + // very quickly. But this could be undesirable, e.g., scanning too far + // or when the literal scan matches. If it matches, then confirming the + // match requires a reverse scan followed by a forward scan to confirm + // or reject, which is a fair bit of work. + // + // Note that the caller can still request an anchored search even + // when the regex isn't anchored. We detect that case in the search + // routines below and just fallback to the core engine. Currently this + // optimization assumes all searches are unanchored, so if we do want + // to enable this optimization for anchored searches, it will need a + // little work to support it. + if core.info.is_always_anchored_start() { + debug!( + "skipping reverse inner optimization because \ + the regex is always anchored at the start", + ); + return Err(core); + } + // Only DFAs can do reverse searches (currently), so we need one of + // them in order to do this optimization. It's possible (although + // pretty unlikely) that we have neither and need to give up. + if !core.hybrid.is_some() && !core.dfa.is_some() { + debug!( + "skipping reverse inner optimization because \ + we don't have a lazy DFA or a full DFA" + ); + return Err(core); + } + if core.pre.as_ref().map_or(false, |p| p.is_fast()) { + debug!( + "skipping reverse inner optimization because \ + we already have a prefilter that we think is fast" + ); + return Err(core); + } else if core.pre.is_some() { + debug!( + "core engine has a prefix prefilter, but it is \ + probably not fast, so continuing with attempt to \ + use reverse inner prefilter" + ); + } + let (concat_prefix, preinner) = match reverse_inner::extract(hirs) { + Some(x) => x, + // N.B. the 'extract' function emits debug messages explaining + // why we bailed out here. + None => return Err(core), + }; + debug!("building reverse NFA for prefix before inner literal"); + let mut lookm = LookMatcher::new(); + lookm.set_line_terminator(core.info.config().get_line_terminator()); + let thompson_config = thompson::Config::new() + .reverse(true) + .utf8(core.info.config().get_utf8_empty()) + .nfa_size_limit(core.info.config().get_nfa_size_limit()) + .shrink(false) + .which_captures(WhichCaptures::None) + .look_matcher(lookm); + let result = thompson::Compiler::new() + .configure(thompson_config) + .build_from_hir(&concat_prefix); + let nfarev = match result { + Ok(nfarev) => nfarev, + Err(_err) => { + debug!( + "skipping reverse inner optimization because the \ + reverse NFA failed to build: {}", + _err, + ); + return Err(core); + } + }; + debug!("building reverse DFA for prefix before inner literal"); + let dfa = if !core.info.config().get_dfa() { + wrappers::ReverseDFA::none() + } else { + wrappers::ReverseDFA::new(&core.info, &nfarev) + }; + let hybrid = if !core.info.config().get_hybrid() { + wrappers::ReverseHybrid::none() + } else if dfa.is_some() { + debug!( + "skipping lazy DFA for reverse inner optimization \ + because we have a full DFA" + ); + wrappers::ReverseHybrid::none() + } else { + wrappers::ReverseHybrid::new(&core.info, &nfarev) + }; + Ok(ReverseInner { core, preinner, nfarev, hybrid, dfa }) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn try_search_full( + &self, + cache: &mut Cache, + input: &Input<'_>, + ) -> Result<Option<Match>, RetryError> { + let mut span = input.get_span(); + let mut min_match_start = 0; + let mut min_pre_start = 0; + loop { + let litmatch = match self.preinner.find(input.haystack(), span) { + None => return Ok(None), + Some(span) => span, + }; + if litmatch.start < min_pre_start { + trace!( + "found inner prefilter match at {:?}, which starts \ + before the end of the last forward scan at {}, \ + quitting to avoid quadratic behavior", + litmatch, + min_pre_start, + ); + return Err(RetryError::Quadratic(RetryQuadraticError::new())); + } + trace!("reverse inner scan found inner match at {:?}", litmatch); + let revinput = input + .clone() + .anchored(Anchored::Yes) + .span(input.start()..litmatch.start); + // Note that in addition to the literal search above scanning past + // our minimum start point, this routine can also return an error + // as a result of detecting possible quadratic behavior if the + // reverse scan goes past the minimum start point. That is, the + // literal search might not, but the reverse regex search for the + // prefix might! + match self.try_search_half_rev_limited( + cache, + &revinput, + min_match_start, + )? { + None => { + if span.start >= span.end { + break; + } + span.start = litmatch.start.checked_add(1).unwrap(); + } + Some(hm_start) => { + let fwdinput = input + .clone() + .anchored(Anchored::Pattern(hm_start.pattern())) + .span(hm_start.offset()..input.end()); + match self.try_search_half_fwd_stopat(cache, &fwdinput)? { + Err(stopat) => { + min_pre_start = stopat; + span.start = + litmatch.start.checked_add(1).unwrap(); + } + Ok(hm_end) => { + return Ok(Some(Match::new( + hm_start.pattern(), + hm_start.offset()..hm_end.offset(), + ))) + } + } + } + } + min_match_start = litmatch.end; + } + Ok(None) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn try_search_half_fwd_stopat( + &self, + cache: &mut Cache, + input: &Input<'_>, + ) -> Result<Result<HalfMatch, usize>, RetryFailError> { + if let Some(e) = self.core.dfa.get(&input) { + trace!( + "using full DFA for forward reverse inner search at {:?}", + input.get_span() + ); + e.try_search_half_fwd_stopat(&input) + } else if let Some(e) = self.core.hybrid.get(&input) { + trace!( + "using lazy DFA for forward reverse inner search at {:?}", + input.get_span() + ); + e.try_search_half_fwd_stopat(&mut cache.hybrid, &input) + } else { + unreachable!("ReverseInner always has a DFA") + } + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn try_search_half_rev_limited( + &self, + cache: &mut Cache, + input: &Input<'_>, + min_start: usize, + ) -> Result<Option<HalfMatch>, RetryError> { + if let Some(e) = self.dfa.get(&input) { + trace!( + "using full DFA for reverse inner search at {:?}, \ + but will be stopped at {} to avoid quadratic behavior", + input.get_span(), + min_start, + ); + e.try_search_half_rev_limited(&input, min_start) + } else if let Some(e) = self.hybrid.get(&input) { + trace!( + "using lazy DFA for reverse inner search at {:?}, \ + but will be stopped at {} to avoid quadratic behavior", + input.get_span(), + min_start, + ); + e.try_search_half_rev_limited( + &mut cache.revhybrid, + &input, + min_start, + ) + } else { + unreachable!("ReverseInner always has a DFA") + } + } +} + +impl Strategy for ReverseInner { + #[cfg_attr(feature = "perf-inline", inline(always))] + fn group_info(&self) -> &GroupInfo { + self.core.group_info() + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn create_cache(&self) -> Cache { + let mut cache = self.core.create_cache(); + cache.revhybrid = self.hybrid.create_cache(); + cache + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn reset_cache(&self, cache: &mut Cache) { + self.core.reset_cache(cache); + cache.revhybrid.reset(&self.hybrid); + } + + fn is_accelerated(&self) -> bool { + self.preinner.is_fast() + } + + fn memory_usage(&self) -> usize { + self.core.memory_usage() + + self.preinner.memory_usage() + + self.nfarev.memory_usage() + + self.dfa.memory_usage() + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn search(&self, cache: &mut Cache, input: &Input<'_>) -> Option<Match> { + if input.get_anchored().is_anchored() { + return self.core.search(cache, input); + } + match self.try_search_full(cache, input) { + Err(RetryError::Quadratic(_err)) => { + trace!("reverse inner optimization failed: {}", _err); + self.core.search(cache, input) + } + Err(RetryError::Fail(_err)) => { + trace!("reverse inner fast search failed: {}", _err); + self.core.search_nofail(cache, input) + } + Ok(matornot) => matornot, + } + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn search_half( + &self, + cache: &mut Cache, + input: &Input<'_>, + ) -> Option<HalfMatch> { + if input.get_anchored().is_anchored() { + return self.core.search_half(cache, input); + } + match self.try_search_full(cache, input) { + Err(RetryError::Quadratic(_err)) => { + trace!("reverse inner half optimization failed: {}", _err); + self.core.search_half(cache, input) + } + Err(RetryError::Fail(_err)) => { + trace!("reverse inner fast half search failed: {}", _err); + self.core.search_half_nofail(cache, input) + } + Ok(None) => None, + Ok(Some(m)) => Some(HalfMatch::new(m.pattern(), m.end())), + } + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool { + if input.get_anchored().is_anchored() { + return self.core.is_match(cache, input); + } + match self.try_search_full(cache, input) { + Err(RetryError::Quadratic(_err)) => { + trace!("reverse inner half optimization failed: {}", _err); + self.core.is_match_nofail(cache, input) + } + Err(RetryError::Fail(_err)) => { + trace!("reverse inner fast half search failed: {}", _err); + self.core.is_match_nofail(cache, input) + } + Ok(None) => false, + Ok(Some(_)) => true, + } + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn search_slots( + &self, + cache: &mut Cache, + input: &Input<'_>, + slots: &mut [Option<NonMaxUsize>], + ) -> Option<PatternID> { + if input.get_anchored().is_anchored() { + return self.core.search_slots(cache, input, slots); + } + if !self.core.is_capture_search_needed(slots.len()) { + trace!("asked for slots unnecessarily, trying fast path"); + let m = self.search(cache, input)?; + copy_match_to_slots(m, slots); + return Some(m.pattern()); + } + let m = match self.try_search_full(cache, input) { + Err(RetryError::Quadratic(_err)) => { + trace!("reverse inner captures optimization failed: {}", _err); + return self.core.search_slots(cache, input, slots); + } + Err(RetryError::Fail(_err)) => { + trace!("reverse inner fast captures search failed: {}", _err); + return self.core.search_slots_nofail(cache, input, slots); + } + Ok(None) => return None, + Ok(Some(m)) => m, + }; + trace!( + "match found at {}..{} in capture search, \ + using another engine to find captures", + m.start(), + m.end(), + ); + let input = input + .clone() + .span(m.start()..m.end()) + .anchored(Anchored::Pattern(m.pattern())); + self.core.search_slots_nofail(cache, &input, slots) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn which_overlapping_matches( + &self, + cache: &mut Cache, + input: &Input<'_>, + patset: &mut PatternSet, + ) { + self.core.which_overlapping_matches(cache, input, patset) + } +} + +/// Copies the offsets in the given match to the corresponding positions in +/// `slots`. +/// +/// In effect, this sets the slots corresponding to the implicit group for the +/// pattern in the given match. If the indices for the corresponding slots do +/// not exist, then no slots are set. +/// +/// This is useful when the caller provides slots (or captures), but you use a +/// regex engine that doesn't operate on slots (like a lazy DFA). This function +/// lets you map the match you get back to the slots provided by the caller. +#[cfg_attr(feature = "perf-inline", inline(always))] +fn copy_match_to_slots(m: Match, slots: &mut [Option<NonMaxUsize>]) { + let slot_start = m.pattern().as_usize() * 2; + let slot_end = slot_start + 1; + if let Some(slot) = slots.get_mut(slot_start) { + *slot = NonMaxUsize::new(m.start()); + } + if let Some(slot) = slots.get_mut(slot_end) { + *slot = NonMaxUsize::new(m.end()); + } +} diff --git a/third_party/rust/regex-automata/src/meta/wrappers.rs b/third_party/rust/regex-automata/src/meta/wrappers.rs new file mode 100644 index 0000000000..08110d9bb8 --- /dev/null +++ b/third_party/rust/regex-automata/src/meta/wrappers.rs @@ -0,0 +1,1348 @@ +/*! +This module contains a boat load of wrappers around each of our internal regex +engines. They encapsulate a few things: + +1. The wrappers manage the conditional existence of the regex engine. Namely, +the PikeVM is the only required regex engine. The rest are optional. These +wrappers present a uniform API regardless of which engines are available. And +availability might be determined by compile time features or by dynamic +configuration via `meta::Config`. Encapsulating the conditional compilation +features is in particular a huge simplification for the higher level code that +composes these engines. +2. The wrappers manage construction of each engine, including skipping it if +the engine is unavailable or configured to not be used. +3. The wrappers manage whether an engine *can* be used for a particular +search configuration. For example, `BoundedBacktracker::get` only returns a +backtracking engine when the haystack is bigger than the maximum supported +length. The wrappers also sometimes take a position on when an engine *ought* +to be used, but only in cases where the logic is extremely local to the engine +itself. Otherwise, things like "choose between the backtracker and the one-pass +DFA" are managed by the higher level meta strategy code. + +There are also corresponding wrappers for the various `Cache` types for each +regex engine that needs them. If an engine is unavailable or not used, then a +cache for it will *not* actually be allocated. +*/ + +use alloc::vec::Vec; + +use crate::{ + meta::{ + error::{BuildError, RetryError, RetryFailError}, + regex::RegexInfo, + }, + nfa::thompson::{pikevm, NFA}, + util::{prefilter::Prefilter, primitives::NonMaxUsize}, + HalfMatch, Input, Match, MatchKind, PatternID, PatternSet, +}; + +#[cfg(feature = "dfa-build")] +use crate::dfa; +#[cfg(feature = "dfa-onepass")] +use crate::dfa::onepass; +#[cfg(feature = "hybrid")] +use crate::hybrid; +#[cfg(feature = "nfa-backtrack")] +use crate::nfa::thompson::backtrack; + +#[derive(Debug)] +pub(crate) struct PikeVM(PikeVMEngine); + +impl PikeVM { + pub(crate) fn new( + info: &RegexInfo, + pre: Option<Prefilter>, + nfa: &NFA, + ) -> Result<PikeVM, BuildError> { + PikeVMEngine::new(info, pre, nfa).map(PikeVM) + } + + pub(crate) fn create_cache(&self) -> PikeVMCache { + PikeVMCache::new(self) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn get(&self) -> &PikeVMEngine { + &self.0 + } +} + +#[derive(Debug)] +pub(crate) struct PikeVMEngine(pikevm::PikeVM); + +impl PikeVMEngine { + pub(crate) fn new( + info: &RegexInfo, + pre: Option<Prefilter>, + nfa: &NFA, + ) -> Result<PikeVMEngine, BuildError> { + let pikevm_config = pikevm::Config::new() + .match_kind(info.config().get_match_kind()) + .prefilter(pre); + let engine = pikevm::Builder::new() + .configure(pikevm_config) + .build_from_nfa(nfa.clone()) + .map_err(BuildError::nfa)?; + debug!("PikeVM built"); + Ok(PikeVMEngine(engine)) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn is_match( + &self, + cache: &mut PikeVMCache, + input: &Input<'_>, + ) -> bool { + self.0.is_match(cache.0.as_mut().unwrap(), input.clone()) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn search_slots( + &self, + cache: &mut PikeVMCache, + input: &Input<'_>, + slots: &mut [Option<NonMaxUsize>], + ) -> Option<PatternID> { + self.0.search_slots(cache.0.as_mut().unwrap(), input, slots) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn which_overlapping_matches( + &self, + cache: &mut PikeVMCache, + input: &Input<'_>, + patset: &mut PatternSet, + ) { + self.0.which_overlapping_matches( + cache.0.as_mut().unwrap(), + input, + patset, + ) + } +} + +#[derive(Clone, Debug)] +pub(crate) struct PikeVMCache(Option<pikevm::Cache>); + +impl PikeVMCache { + pub(crate) fn none() -> PikeVMCache { + PikeVMCache(None) + } + + pub(crate) fn new(builder: &PikeVM) -> PikeVMCache { + PikeVMCache(Some(builder.get().0.create_cache())) + } + + pub(crate) fn reset(&mut self, builder: &PikeVM) { + self.0.as_mut().unwrap().reset(&builder.get().0); + } + + pub(crate) fn memory_usage(&self) -> usize { + self.0.as_ref().map_or(0, |c| c.memory_usage()) + } +} + +#[derive(Debug)] +pub(crate) struct BoundedBacktracker(Option<BoundedBacktrackerEngine>); + +impl BoundedBacktracker { + pub(crate) fn new( + info: &RegexInfo, + pre: Option<Prefilter>, + nfa: &NFA, + ) -> Result<BoundedBacktracker, BuildError> { + BoundedBacktrackerEngine::new(info, pre, nfa).map(BoundedBacktracker) + } + + pub(crate) fn create_cache(&self) -> BoundedBacktrackerCache { + BoundedBacktrackerCache::new(self) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn get( + &self, + input: &Input<'_>, + ) -> Option<&BoundedBacktrackerEngine> { + let engine = self.0.as_ref()?; + // It is difficult to make the backtracker give up early if it is + // guaranteed to eventually wind up in a match state. This is because + // of the greedy nature of a backtracker: it just blindly mushes + // forward. Every other regex engine is able to give up more quickly, + // so even if the backtracker might be able to zip through faster than + // (say) the PikeVM, we prefer the theoretical benefit that some other + // engine might be able to scan much less of the haystack than the + // backtracker. + // + // Now, if the haystack is really short already, then we allow the + // backtracker to run. (This hasn't been litigated quantitatively with + // benchmarks. Just a hunch.) + if input.get_earliest() && input.haystack().len() > 128 { + return None; + } + // If the backtracker is just going to return an error because the + // haystack is too long, then obviously do not use it. + if input.get_span().len() > engine.max_haystack_len() { + return None; + } + Some(engine) + } +} + +#[derive(Debug)] +pub(crate) struct BoundedBacktrackerEngine( + #[cfg(feature = "nfa-backtrack")] backtrack::BoundedBacktracker, + #[cfg(not(feature = "nfa-backtrack"))] (), +); + +impl BoundedBacktrackerEngine { + pub(crate) fn new( + info: &RegexInfo, + pre: Option<Prefilter>, + nfa: &NFA, + ) -> Result<Option<BoundedBacktrackerEngine>, BuildError> { + #[cfg(feature = "nfa-backtrack")] + { + if !info.config().get_backtrack() + || info.config().get_match_kind() != MatchKind::LeftmostFirst + { + return Ok(None); + } + let backtrack_config = backtrack::Config::new().prefilter(pre); + let engine = backtrack::Builder::new() + .configure(backtrack_config) + .build_from_nfa(nfa.clone()) + .map_err(BuildError::nfa)?; + debug!("BoundedBacktracker built"); + Ok(Some(BoundedBacktrackerEngine(engine))) + } + #[cfg(not(feature = "nfa-backtrack"))] + { + Ok(None) + } + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn is_match( + &self, + cache: &mut BoundedBacktrackerCache, + input: &Input<'_>, + ) -> bool { + #[cfg(feature = "nfa-backtrack")] + { + // OK because we only permit access to this engine when we know + // the haystack is short enough for the backtracker to run without + // reporting an error. + self.0 + .try_is_match(cache.0.as_mut().unwrap(), input.clone()) + .unwrap() + } + #[cfg(not(feature = "nfa-backtrack"))] + { + // Impossible to reach because this engine is never constructed + // if the requisite features aren't enabled. + unreachable!() + } + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn search_slots( + &self, + cache: &mut BoundedBacktrackerCache, + input: &Input<'_>, + slots: &mut [Option<NonMaxUsize>], + ) -> Option<PatternID> { + #[cfg(feature = "nfa-backtrack")] + { + // OK because we only permit access to this engine when we know + // the haystack is short enough for the backtracker to run without + // reporting an error. + self.0 + .try_search_slots(cache.0.as_mut().unwrap(), input, slots) + .unwrap() + } + #[cfg(not(feature = "nfa-backtrack"))] + { + // Impossible to reach because this engine is never constructed + // if the requisite features aren't enabled. + unreachable!() + } + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn max_haystack_len(&self) -> usize { + #[cfg(feature = "nfa-backtrack")] + { + self.0.max_haystack_len() + } + #[cfg(not(feature = "nfa-backtrack"))] + { + // Impossible to reach because this engine is never constructed + // if the requisite features aren't enabled. + unreachable!() + } + } +} + +#[derive(Clone, Debug)] +pub(crate) struct BoundedBacktrackerCache( + #[cfg(feature = "nfa-backtrack")] Option<backtrack::Cache>, + #[cfg(not(feature = "nfa-backtrack"))] (), +); + +impl BoundedBacktrackerCache { + pub(crate) fn none() -> BoundedBacktrackerCache { + #[cfg(feature = "nfa-backtrack")] + { + BoundedBacktrackerCache(None) + } + #[cfg(not(feature = "nfa-backtrack"))] + { + BoundedBacktrackerCache(()) + } + } + + pub(crate) fn new( + builder: &BoundedBacktracker, + ) -> BoundedBacktrackerCache { + #[cfg(feature = "nfa-backtrack")] + { + BoundedBacktrackerCache( + builder.0.as_ref().map(|e| e.0.create_cache()), + ) + } + #[cfg(not(feature = "nfa-backtrack"))] + { + BoundedBacktrackerCache(()) + } + } + + pub(crate) fn reset(&mut self, builder: &BoundedBacktracker) { + #[cfg(feature = "nfa-backtrack")] + if let Some(ref e) = builder.0 { + self.0.as_mut().unwrap().reset(&e.0); + } + } + + pub(crate) fn memory_usage(&self) -> usize { + #[cfg(feature = "nfa-backtrack")] + { + self.0.as_ref().map_or(0, |c| c.memory_usage()) + } + #[cfg(not(feature = "nfa-backtrack"))] + { + 0 + } + } +} + +#[derive(Debug)] +pub(crate) struct OnePass(Option<OnePassEngine>); + +impl OnePass { + pub(crate) fn new(info: &RegexInfo, nfa: &NFA) -> OnePass { + OnePass(OnePassEngine::new(info, nfa)) + } + + pub(crate) fn create_cache(&self) -> OnePassCache { + OnePassCache::new(self) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn get(&self, input: &Input<'_>) -> Option<&OnePassEngine> { + let engine = self.0.as_ref()?; + if !input.get_anchored().is_anchored() + && !engine.get_nfa().is_always_start_anchored() + { + return None; + } + Some(engine) + } + + pub(crate) fn memory_usage(&self) -> usize { + self.0.as_ref().map_or(0, |e| e.memory_usage()) + } +} + +#[derive(Debug)] +pub(crate) struct OnePassEngine( + #[cfg(feature = "dfa-onepass")] onepass::DFA, + #[cfg(not(feature = "dfa-onepass"))] (), +); + +impl OnePassEngine { + pub(crate) fn new(info: &RegexInfo, nfa: &NFA) -> Option<OnePassEngine> { + #[cfg(feature = "dfa-onepass")] + { + if !info.config().get_onepass() { + return None; + } + // In order to even attempt building a one-pass DFA, we require + // that we either have at least one explicit capturing group or + // there's a Unicode word boundary somewhere. If we don't have + // either of these things, then the lazy DFA will almost certainly + // be useable and be much faster. The only case where it might + // not is if the lazy DFA isn't utilizing its cache effectively, + // but in those cases, the underlying regex is almost certainly + // not one-pass or is too big to fit within the current one-pass + // implementation limits. + if info.props_union().explicit_captures_len() == 0 + && !info.props_union().look_set().contains_word_unicode() + { + debug!("not building OnePass because it isn't worth it"); + return None; + } + let onepass_config = onepass::Config::new() + .match_kind(info.config().get_match_kind()) + // Like for the lazy DFA, we unconditionally enable this + // because it doesn't cost much and makes the API more + // flexible. + .starts_for_each_pattern(true) + .byte_classes(info.config().get_byte_classes()) + .size_limit(info.config().get_onepass_size_limit()); + let result = onepass::Builder::new() + .configure(onepass_config) + .build_from_nfa(nfa.clone()); + let engine = match result { + Ok(engine) => engine, + Err(_err) => { + debug!("OnePass failed to build: {}", _err); + return None; + } + }; + debug!("OnePass built, {} bytes", engine.memory_usage()); + Some(OnePassEngine(engine)) + } + #[cfg(not(feature = "dfa-onepass"))] + { + None + } + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn search_slots( + &self, + cache: &mut OnePassCache, + input: &Input<'_>, + slots: &mut [Option<NonMaxUsize>], + ) -> Option<PatternID> { + #[cfg(feature = "dfa-onepass")] + { + // OK because we only permit getting a OnePassEngine when we know + // the search is anchored and thus an error cannot occur. + self.0 + .try_search_slots(cache.0.as_mut().unwrap(), input, slots) + .unwrap() + } + #[cfg(not(feature = "dfa-onepass"))] + { + // Impossible to reach because this engine is never constructed + // if the requisite features aren't enabled. + unreachable!() + } + } + + pub(crate) fn memory_usage(&self) -> usize { + #[cfg(feature = "dfa-onepass")] + { + self.0.memory_usage() + } + #[cfg(not(feature = "dfa-onepass"))] + { + // Impossible to reach because this engine is never constructed + // if the requisite features aren't enabled. + unreachable!() + } + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn get_nfa(&self) -> &NFA { + #[cfg(feature = "dfa-onepass")] + { + self.0.get_nfa() + } + #[cfg(not(feature = "dfa-onepass"))] + { + // Impossible to reach because this engine is never constructed + // if the requisite features aren't enabled. + unreachable!() + } + } +} + +#[derive(Clone, Debug)] +pub(crate) struct OnePassCache( + #[cfg(feature = "dfa-onepass")] Option<onepass::Cache>, + #[cfg(not(feature = "dfa-onepass"))] (), +); + +impl OnePassCache { + pub(crate) fn none() -> OnePassCache { + #[cfg(feature = "dfa-onepass")] + { + OnePassCache(None) + } + #[cfg(not(feature = "dfa-onepass"))] + { + OnePassCache(()) + } + } + + pub(crate) fn new(builder: &OnePass) -> OnePassCache { + #[cfg(feature = "dfa-onepass")] + { + OnePassCache(builder.0.as_ref().map(|e| e.0.create_cache())) + } + #[cfg(not(feature = "dfa-onepass"))] + { + OnePassCache(()) + } + } + + pub(crate) fn reset(&mut self, builder: &OnePass) { + #[cfg(feature = "dfa-onepass")] + if let Some(ref e) = builder.0 { + self.0.as_mut().unwrap().reset(&e.0); + } + } + + pub(crate) fn memory_usage(&self) -> usize { + #[cfg(feature = "dfa-onepass")] + { + self.0.as_ref().map_or(0, |c| c.memory_usage()) + } + #[cfg(not(feature = "dfa-onepass"))] + { + 0 + } + } +} + +#[derive(Debug)] +pub(crate) struct Hybrid(Option<HybridEngine>); + +impl Hybrid { + pub(crate) fn none() -> Hybrid { + Hybrid(None) + } + + pub(crate) fn new( + info: &RegexInfo, + pre: Option<Prefilter>, + nfa: &NFA, + nfarev: &NFA, + ) -> Hybrid { + Hybrid(HybridEngine::new(info, pre, nfa, nfarev)) + } + + pub(crate) fn create_cache(&self) -> HybridCache { + HybridCache::new(self) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn get(&self, _input: &Input<'_>) -> Option<&HybridEngine> { + let engine = self.0.as_ref()?; + Some(engine) + } + + pub(crate) fn is_some(&self) -> bool { + self.0.is_some() + } +} + +#[derive(Debug)] +pub(crate) struct HybridEngine( + #[cfg(feature = "hybrid")] hybrid::regex::Regex, + #[cfg(not(feature = "hybrid"))] (), +); + +impl HybridEngine { + pub(crate) fn new( + info: &RegexInfo, + pre: Option<Prefilter>, + nfa: &NFA, + nfarev: &NFA, + ) -> Option<HybridEngine> { + #[cfg(feature = "hybrid")] + { + if !info.config().get_hybrid() { + return None; + } + let dfa_config = hybrid::dfa::Config::new() + .match_kind(info.config().get_match_kind()) + .prefilter(pre.clone()) + // Enabling this is necessary for ensuring we can service any + // kind of 'Input' search without error. For the lazy DFA, + // this is not particularly costly, since the start states are + // generated lazily. + .starts_for_each_pattern(true) + .byte_classes(info.config().get_byte_classes()) + .unicode_word_boundary(true) + .specialize_start_states(pre.is_some()) + .cache_capacity(info.config().get_hybrid_cache_capacity()) + // This makes it possible for building a lazy DFA to + // fail even though the NFA has already been built. Namely, + // if the cache capacity is too small to fit some minimum + // number of states (which is small, like 4 or 5), then the + // DFA will refuse to build. + // + // We shouldn't enable this to make building always work, since + // this could cause the allocation of a cache bigger than the + // provided capacity amount. + // + // This is effectively the only reason why building a lazy DFA + // could fail. If it does, then we simply suppress the error + // and return None. + .skip_cache_capacity_check(false) + // This and enabling heuristic Unicode word boundary support + // above make it so the lazy DFA can quit at match time. + .minimum_cache_clear_count(Some(3)) + .minimum_bytes_per_state(Some(10)); + let result = hybrid::dfa::Builder::new() + .configure(dfa_config.clone()) + .build_from_nfa(nfa.clone()); + let fwd = match result { + Ok(fwd) => fwd, + Err(_err) => { + debug!("forward lazy DFA failed to build: {}", _err); + return None; + } + }; + let result = hybrid::dfa::Builder::new() + .configure( + dfa_config + .clone() + .match_kind(MatchKind::All) + .prefilter(None) + .specialize_start_states(false), + ) + .build_from_nfa(nfarev.clone()); + let rev = match result { + Ok(rev) => rev, + Err(_err) => { + debug!("reverse lazy DFA failed to build: {}", _err); + return None; + } + }; + let engine = + hybrid::regex::Builder::new().build_from_dfas(fwd, rev); + debug!("lazy DFA built"); + Some(HybridEngine(engine)) + } + #[cfg(not(feature = "hybrid"))] + { + None + } + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn try_search( + &self, + cache: &mut HybridCache, + input: &Input<'_>, + ) -> Result<Option<Match>, RetryFailError> { + #[cfg(feature = "hybrid")] + { + let cache = cache.0.as_mut().unwrap(); + self.0.try_search(cache, input).map_err(|e| e.into()) + } + #[cfg(not(feature = "hybrid"))] + { + // Impossible to reach because this engine is never constructed + // if the requisite features aren't enabled. + unreachable!() + } + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn try_search_half_fwd( + &self, + cache: &mut HybridCache, + input: &Input<'_>, + ) -> Result<Option<HalfMatch>, RetryFailError> { + #[cfg(feature = "hybrid")] + { + let fwd = self.0.forward(); + let mut fwdcache = cache.0.as_mut().unwrap().as_parts_mut().0; + fwd.try_search_fwd(&mut fwdcache, input).map_err(|e| e.into()) + } + #[cfg(not(feature = "hybrid"))] + { + // Impossible to reach because this engine is never constructed + // if the requisite features aren't enabled. + unreachable!() + } + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn try_search_half_fwd_stopat( + &self, + cache: &mut HybridCache, + input: &Input<'_>, + ) -> Result<Result<HalfMatch, usize>, RetryFailError> { + #[cfg(feature = "hybrid")] + { + let dfa = self.0.forward(); + let mut cache = cache.0.as_mut().unwrap().as_parts_mut().0; + crate::meta::stopat::hybrid_try_search_half_fwd( + dfa, &mut cache, input, + ) + } + #[cfg(not(feature = "hybrid"))] + { + // Impossible to reach because this engine is never constructed + // if the requisite features aren't enabled. + unreachable!() + } + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn try_search_half_rev( + &self, + cache: &mut HybridCache, + input: &Input<'_>, + ) -> Result<Option<HalfMatch>, RetryFailError> { + #[cfg(feature = "hybrid")] + { + let rev = self.0.reverse(); + let mut revcache = cache.0.as_mut().unwrap().as_parts_mut().1; + rev.try_search_rev(&mut revcache, input).map_err(|e| e.into()) + } + #[cfg(not(feature = "hybrid"))] + { + // Impossible to reach because this engine is never constructed + // if the requisite features aren't enabled. + unreachable!() + } + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn try_search_half_rev_limited( + &self, + cache: &mut HybridCache, + input: &Input<'_>, + min_start: usize, + ) -> Result<Option<HalfMatch>, RetryError> { + #[cfg(feature = "hybrid")] + { + let dfa = self.0.reverse(); + let mut cache = cache.0.as_mut().unwrap().as_parts_mut().1; + crate::meta::limited::hybrid_try_search_half_rev( + dfa, &mut cache, input, min_start, + ) + } + #[cfg(not(feature = "hybrid"))] + { + // Impossible to reach because this engine is never constructed + // if the requisite features aren't enabled. + unreachable!() + } + } + + #[inline] + pub(crate) fn try_which_overlapping_matches( + &self, + cache: &mut HybridCache, + input: &Input<'_>, + patset: &mut PatternSet, + ) -> Result<(), RetryFailError> { + #[cfg(feature = "hybrid")] + { + let fwd = self.0.forward(); + let mut fwdcache = cache.0.as_mut().unwrap().as_parts_mut().0; + fwd.try_which_overlapping_matches(&mut fwdcache, input, patset) + .map_err(|e| e.into()) + } + #[cfg(not(feature = "hybrid"))] + { + // Impossible to reach because this engine is never constructed + // if the requisite features aren't enabled. + unreachable!() + } + } +} + +#[derive(Clone, Debug)] +pub(crate) struct HybridCache( + #[cfg(feature = "hybrid")] Option<hybrid::regex::Cache>, + #[cfg(not(feature = "hybrid"))] (), +); + +impl HybridCache { + pub(crate) fn none() -> HybridCache { + #[cfg(feature = "hybrid")] + { + HybridCache(None) + } + #[cfg(not(feature = "hybrid"))] + { + HybridCache(()) + } + } + + pub(crate) fn new(builder: &Hybrid) -> HybridCache { + #[cfg(feature = "hybrid")] + { + HybridCache(builder.0.as_ref().map(|e| e.0.create_cache())) + } + #[cfg(not(feature = "hybrid"))] + { + HybridCache(()) + } + } + + pub(crate) fn reset(&mut self, builder: &Hybrid) { + #[cfg(feature = "hybrid")] + if let Some(ref e) = builder.0 { + self.0.as_mut().unwrap().reset(&e.0); + } + } + + pub(crate) fn memory_usage(&self) -> usize { + #[cfg(feature = "hybrid")] + { + self.0.as_ref().map_or(0, |c| c.memory_usage()) + } + #[cfg(not(feature = "hybrid"))] + { + 0 + } + } +} + +#[derive(Debug)] +pub(crate) struct DFA(Option<DFAEngine>); + +impl DFA { + pub(crate) fn none() -> DFA { + DFA(None) + } + + pub(crate) fn new( + info: &RegexInfo, + pre: Option<Prefilter>, + nfa: &NFA, + nfarev: &NFA, + ) -> DFA { + DFA(DFAEngine::new(info, pre, nfa, nfarev)) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn get(&self, _input: &Input<'_>) -> Option<&DFAEngine> { + let engine = self.0.as_ref()?; + Some(engine) + } + + pub(crate) fn is_some(&self) -> bool { + self.0.is_some() + } + + pub(crate) fn memory_usage(&self) -> usize { + self.0.as_ref().map_or(0, |e| e.memory_usage()) + } +} + +#[derive(Debug)] +pub(crate) struct DFAEngine( + #[cfg(feature = "dfa-build")] dfa::regex::Regex, + #[cfg(not(feature = "dfa-build"))] (), +); + +impl DFAEngine { + pub(crate) fn new( + info: &RegexInfo, + pre: Option<Prefilter>, + nfa: &NFA, + nfarev: &NFA, + ) -> Option<DFAEngine> { + #[cfg(feature = "dfa-build")] + { + if !info.config().get_dfa() { + return None; + } + // If our NFA is anything but small, don't even bother with a DFA. + if let Some(state_limit) = info.config().get_dfa_state_limit() { + if nfa.states().len() > state_limit { + debug!( + "skipping full DFA because NFA has {} states, \ + which exceeds the heuristic limit of {}", + nfa.states().len(), + state_limit, + ); + return None; + } + } + // We cut the size limit in four because the total heap used by + // DFA construction is determinization aux memory and the DFA + // itself, and those things are configured independently in the + // lower level DFA builder API. And then split that in two because + // of forward and reverse DFAs. + let size_limit = info.config().get_dfa_size_limit().map(|n| n / 4); + let dfa_config = dfa::dense::Config::new() + .match_kind(info.config().get_match_kind()) + .prefilter(pre.clone()) + // Enabling this is necessary for ensuring we can service any + // kind of 'Input' search without error. For the full DFA, this + // can be quite costly. But since we have such a small bound + // on the size of the DFA, in practice, any multl-regexes are + // probably going to blow the limit anyway. + .starts_for_each_pattern(true) + .byte_classes(info.config().get_byte_classes()) + .unicode_word_boundary(true) + .specialize_start_states(pre.is_some()) + .determinize_size_limit(size_limit) + .dfa_size_limit(size_limit); + let result = dfa::dense::Builder::new() + .configure(dfa_config.clone()) + .build_from_nfa(&nfa); + let fwd = match result { + Ok(fwd) => fwd, + Err(_err) => { + debug!("forward full DFA failed to build: {}", _err); + return None; + } + }; + let result = dfa::dense::Builder::new() + .configure( + dfa_config + .clone() + // We never need unanchored reverse searches, so + // there's no point in building it into the DFA, which + // WILL take more space. (This isn't done for the lazy + // DFA because the DFA is, well, lazy. It doesn't pay + // the cost for supporting unanchored searches unless + // you actually do an unanchored search, which we + // don't.) + .start_kind(dfa::StartKind::Anchored) + .match_kind(MatchKind::All) + .prefilter(None) + .specialize_start_states(false), + ) + .build_from_nfa(&nfarev); + let rev = match result { + Ok(rev) => rev, + Err(_err) => { + debug!("reverse full DFA failed to build: {}", _err); + return None; + } + }; + let engine = dfa::regex::Builder::new().build_from_dfas(fwd, rev); + debug!( + "fully compiled forward and reverse DFAs built, {} bytes", + engine.forward().memory_usage() + + engine.reverse().memory_usage(), + ); + Some(DFAEngine(engine)) + } + #[cfg(not(feature = "dfa-build"))] + { + None + } + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn try_search( + &self, + input: &Input<'_>, + ) -> Result<Option<Match>, RetryFailError> { + #[cfg(feature = "dfa-build")] + { + self.0.try_search(input).map_err(|e| e.into()) + } + #[cfg(not(feature = "dfa-build"))] + { + // Impossible to reach because this engine is never constructed + // if the requisite features aren't enabled. + unreachable!() + } + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn try_search_half_fwd( + &self, + input: &Input<'_>, + ) -> Result<Option<HalfMatch>, RetryFailError> { + #[cfg(feature = "dfa-build")] + { + use crate::dfa::Automaton; + self.0.forward().try_search_fwd(input).map_err(|e| e.into()) + } + #[cfg(not(feature = "dfa-build"))] + { + // Impossible to reach because this engine is never constructed + // if the requisite features aren't enabled. + unreachable!() + } + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn try_search_half_fwd_stopat( + &self, + input: &Input<'_>, + ) -> Result<Result<HalfMatch, usize>, RetryFailError> { + #[cfg(feature = "dfa-build")] + { + let dfa = self.0.forward(); + crate::meta::stopat::dfa_try_search_half_fwd(dfa, input) + } + #[cfg(not(feature = "dfa-build"))] + { + // Impossible to reach because this engine is never constructed + // if the requisite features aren't enabled. + unreachable!() + } + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn try_search_half_rev( + &self, + input: &Input<'_>, + ) -> Result<Option<HalfMatch>, RetryFailError> { + #[cfg(feature = "dfa-build")] + { + use crate::dfa::Automaton; + self.0.reverse().try_search_rev(&input).map_err(|e| e.into()) + } + #[cfg(not(feature = "dfa-build"))] + { + // Impossible to reach because this engine is never constructed + // if the requisite features aren't enabled. + unreachable!() + } + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn try_search_half_rev_limited( + &self, + input: &Input<'_>, + min_start: usize, + ) -> Result<Option<HalfMatch>, RetryError> { + #[cfg(feature = "dfa-build")] + { + let dfa = self.0.reverse(); + crate::meta::limited::dfa_try_search_half_rev( + dfa, input, min_start, + ) + } + #[cfg(not(feature = "dfa-build"))] + { + // Impossible to reach because this engine is never constructed + // if the requisite features aren't enabled. + unreachable!() + } + } + + #[inline] + pub(crate) fn try_which_overlapping_matches( + &self, + input: &Input<'_>, + patset: &mut PatternSet, + ) -> Result<(), RetryFailError> { + #[cfg(feature = "dfa-build")] + { + use crate::dfa::Automaton; + self.0 + .forward() + .try_which_overlapping_matches(input, patset) + .map_err(|e| e.into()) + } + #[cfg(not(feature = "dfa-build"))] + { + // Impossible to reach because this engine is never constructed + // if the requisite features aren't enabled. + unreachable!() + } + } + + pub(crate) fn memory_usage(&self) -> usize { + #[cfg(feature = "dfa-build")] + { + self.0.forward().memory_usage() + self.0.reverse().memory_usage() + } + #[cfg(not(feature = "dfa-build"))] + { + // Impossible to reach because this engine is never constructed + // if the requisite features aren't enabled. + unreachable!() + } + } +} + +#[derive(Debug)] +pub(crate) struct ReverseHybrid(Option<ReverseHybridEngine>); + +impl ReverseHybrid { + pub(crate) fn none() -> ReverseHybrid { + ReverseHybrid(None) + } + + pub(crate) fn new(info: &RegexInfo, nfarev: &NFA) -> ReverseHybrid { + ReverseHybrid(ReverseHybridEngine::new(info, nfarev)) + } + + pub(crate) fn create_cache(&self) -> ReverseHybridCache { + ReverseHybridCache::new(self) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn get( + &self, + _input: &Input<'_>, + ) -> Option<&ReverseHybridEngine> { + let engine = self.0.as_ref()?; + Some(engine) + } +} + +#[derive(Debug)] +pub(crate) struct ReverseHybridEngine( + #[cfg(feature = "hybrid")] hybrid::dfa::DFA, + #[cfg(not(feature = "hybrid"))] (), +); + +impl ReverseHybridEngine { + pub(crate) fn new( + info: &RegexInfo, + nfarev: &NFA, + ) -> Option<ReverseHybridEngine> { + #[cfg(feature = "hybrid")] + { + if !info.config().get_hybrid() { + return None; + } + // Since we only use this for reverse searches, we can hard-code + // a number of things like match semantics, prefilters, starts + // for each pattern and so on. + let dfa_config = hybrid::dfa::Config::new() + .match_kind(MatchKind::All) + .prefilter(None) + .starts_for_each_pattern(false) + .byte_classes(info.config().get_byte_classes()) + .unicode_word_boundary(true) + .specialize_start_states(false) + .cache_capacity(info.config().get_hybrid_cache_capacity()) + .skip_cache_capacity_check(false) + .minimum_cache_clear_count(Some(3)) + .minimum_bytes_per_state(Some(10)); + let result = hybrid::dfa::Builder::new() + .configure(dfa_config) + .build_from_nfa(nfarev.clone()); + let rev = match result { + Ok(rev) => rev, + Err(_err) => { + debug!("lazy reverse DFA failed to build: {}", _err); + return None; + } + }; + debug!("lazy reverse DFA built"); + Some(ReverseHybridEngine(rev)) + } + #[cfg(not(feature = "hybrid"))] + { + None + } + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn try_search_half_rev_limited( + &self, + cache: &mut ReverseHybridCache, + input: &Input<'_>, + min_start: usize, + ) -> Result<Option<HalfMatch>, RetryError> { + #[cfg(feature = "hybrid")] + { + let dfa = &self.0; + let mut cache = cache.0.as_mut().unwrap(); + crate::meta::limited::hybrid_try_search_half_rev( + dfa, &mut cache, input, min_start, + ) + } + #[cfg(not(feature = "hybrid"))] + { + // Impossible to reach because this engine is never constructed + // if the requisite features aren't enabled. + unreachable!() + } + } +} + +#[derive(Clone, Debug)] +pub(crate) struct ReverseHybridCache( + #[cfg(feature = "hybrid")] Option<hybrid::dfa::Cache>, + #[cfg(not(feature = "hybrid"))] (), +); + +impl ReverseHybridCache { + pub(crate) fn none() -> ReverseHybridCache { + #[cfg(feature = "hybrid")] + { + ReverseHybridCache(None) + } + #[cfg(not(feature = "hybrid"))] + { + ReverseHybridCache(()) + } + } + + pub(crate) fn new(builder: &ReverseHybrid) -> ReverseHybridCache { + #[cfg(feature = "hybrid")] + { + ReverseHybridCache(builder.0.as_ref().map(|e| e.0.create_cache())) + } + #[cfg(not(feature = "hybrid"))] + { + ReverseHybridCache(()) + } + } + + pub(crate) fn reset(&mut self, builder: &ReverseHybrid) { + #[cfg(feature = "hybrid")] + if let Some(ref e) = builder.0 { + self.0.as_mut().unwrap().reset(&e.0); + } + } + + pub(crate) fn memory_usage(&self) -> usize { + #[cfg(feature = "hybrid")] + { + self.0.as_ref().map_or(0, |c| c.memory_usage()) + } + #[cfg(not(feature = "hybrid"))] + { + 0 + } + } +} + +#[derive(Debug)] +pub(crate) struct ReverseDFA(Option<ReverseDFAEngine>); + +impl ReverseDFA { + pub(crate) fn none() -> ReverseDFA { + ReverseDFA(None) + } + + pub(crate) fn new(info: &RegexInfo, nfarev: &NFA) -> ReverseDFA { + ReverseDFA(ReverseDFAEngine::new(info, nfarev)) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn get(&self, _input: &Input<'_>) -> Option<&ReverseDFAEngine> { + let engine = self.0.as_ref()?; + Some(engine) + } + + pub(crate) fn is_some(&self) -> bool { + self.0.is_some() + } + + pub(crate) fn memory_usage(&self) -> usize { + self.0.as_ref().map_or(0, |e| e.memory_usage()) + } +} + +#[derive(Debug)] +pub(crate) struct ReverseDFAEngine( + #[cfg(feature = "dfa-build")] dfa::dense::DFA<Vec<u32>>, + #[cfg(not(feature = "dfa-build"))] (), +); + +impl ReverseDFAEngine { + pub(crate) fn new( + info: &RegexInfo, + nfarev: &NFA, + ) -> Option<ReverseDFAEngine> { + #[cfg(feature = "dfa-build")] + { + if !info.config().get_dfa() { + return None; + } + // If our NFA is anything but small, don't even bother with a DFA. + if let Some(state_limit) = info.config().get_dfa_state_limit() { + if nfarev.states().len() > state_limit { + debug!( + "skipping full reverse DFA because NFA has {} states, \ + which exceeds the heuristic limit of {}", + nfarev.states().len(), + state_limit, + ); + return None; + } + } + // We cut the size limit in two because the total heap used by DFA + // construction is determinization aux memory and the DFA itself, + // and those things are configured independently in the lower level + // DFA builder API. + let size_limit = info.config().get_dfa_size_limit().map(|n| n / 2); + // Since we only use this for reverse searches, we can hard-code + // a number of things like match semantics, prefilters, starts + // for each pattern and so on. We also disable acceleration since + // it's incompatible with limited searches (which is the only + // operation we support for this kind of engine at the moment). + let dfa_config = dfa::dense::Config::new() + .match_kind(MatchKind::All) + .prefilter(None) + .accelerate(false) + .start_kind(dfa::StartKind::Anchored) + .starts_for_each_pattern(false) + .byte_classes(info.config().get_byte_classes()) + .unicode_word_boundary(true) + .specialize_start_states(false) + .determinize_size_limit(size_limit) + .dfa_size_limit(size_limit); + let result = dfa::dense::Builder::new() + .configure(dfa_config) + .build_from_nfa(&nfarev); + let rev = match result { + Ok(rev) => rev, + Err(_err) => { + debug!("full reverse DFA failed to build: {}", _err); + return None; + } + }; + debug!( + "fully compiled reverse DFA built, {} bytes", + rev.memory_usage() + ); + Some(ReverseDFAEngine(rev)) + } + #[cfg(not(feature = "dfa-build"))] + { + None + } + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn try_search_half_rev_limited( + &self, + input: &Input<'_>, + min_start: usize, + ) -> Result<Option<HalfMatch>, RetryError> { + #[cfg(feature = "dfa-build")] + { + let dfa = &self.0; + crate::meta::limited::dfa_try_search_half_rev( + dfa, input, min_start, + ) + } + #[cfg(not(feature = "dfa-build"))] + { + // Impossible to reach because this engine is never constructed + // if the requisite features aren't enabled. + unreachable!() + } + } + + pub(crate) fn memory_usage(&self) -> usize { + #[cfg(feature = "dfa-build")] + { + self.0.memory_usage() + } + #[cfg(not(feature = "dfa-build"))] + { + // Impossible to reach because this engine is never constructed + // if the requisite features aren't enabled. + unreachable!() + } + } +} diff --git a/third_party/rust/regex-automata/src/nfa/mod.rs b/third_party/rust/regex-automata/src/nfa/mod.rs new file mode 100644 index 0000000000..0c36f598af --- /dev/null +++ b/third_party/rust/regex-automata/src/nfa/mod.rs @@ -0,0 +1,55 @@ +/*! +Provides non-deterministic finite automata (NFA) and regex engines that use +them. + +While NFAs and DFAs (deterministic finite automata) have equivalent *theoretical* +power, their usage in practice tends to result in different engineering trade +offs. While this isn't meant to be a comprehensive treatment of the topic, here +are a few key trade offs that are, at minimum, true for this crate: + +* NFAs tend to be represented sparsely where as DFAs are represented densely. +Sparse representations use less memory, but are slower to traverse. Conversely, +dense representations use more memory, but are faster to traverse. (Sometimes +these lines are blurred. For example, an `NFA` might choose to represent a +particular state in a dense fashion, and a DFA can be built using a sparse +representation via [`sparse::DFA`](crate::dfa::sparse::DFA). +* NFAs have espilon transitions and DFAs don't. In practice, this means that +handling a single byte in a haystack with an NFA at search time may require +visiting multiple NFA states. In a DFA, each byte only requires visiting +a single state. Stated differently, NFAs require a variable number of CPU +instructions to process one byte in a haystack where as a DFA uses a constant +number of CPU instructions to process one byte. +* NFAs are generally easier to amend with secondary storage. For example, the +[`thompson::pikevm::PikeVM`] uses an NFA to match, but also uses additional +memory beyond the model of a finite state machine to track offsets for matching +capturing groups. Conversely, the most a DFA can do is report the offset (and +pattern ID) at which a match occurred. This is generally why we also compile +DFAs in reverse, so that we can run them after finding the end of a match to +also find the start of a match. +* NFAs take worst case linear time to build, but DFAs take worst case +exponential time to build. The [hybrid NFA/DFA](crate::hybrid) mitigates this +challenge for DFAs in many practical cases. + +There are likely other differences, but the bottom line is that NFAs tend to be +more memory efficient and give easier opportunities for increasing expressive +power, where as DFAs are faster to search with. + +# Why only a Thompson NFA? + +Currently, the only kind of NFA we support in this crate is a [Thompson +NFA](https://en.wikipedia.org/wiki/Thompson%27s_construction). This refers +to a specific construction algorithm that takes the syntax of a regex +pattern and converts it to an NFA. Specifically, it makes gratuitous use of +epsilon transitions in order to keep its structure simple. In exchange, its +construction time is linear in the size of the regex. A Thompson NFA also makes +the guarantee that given any state and a character in a haystack, there is at +most one transition defined for it. (Although there may be many epsilon +transitions.) + +It possible that other types of NFAs will be added in the future, such as a +[Glushkov NFA](https://en.wikipedia.org/wiki/Glushkov%27s_construction_algorithm). +But currently, this crate only provides a Thompson NFA. +*/ + +#[cfg(feature = "nfa-thompson")] +pub mod thompson; diff --git a/third_party/rust/regex-automata/src/nfa/thompson/backtrack.rs b/third_party/rust/regex-automata/src/nfa/thompson/backtrack.rs new file mode 100644 index 0000000000..eba037c1d0 --- /dev/null +++ b/third_party/rust/regex-automata/src/nfa/thompson/backtrack.rs @@ -0,0 +1,1884 @@ +/*! +An NFA backed bounded backtracker for executing regex searches with capturing +groups. + +This module provides a [`BoundedBacktracker`] that works by simulating an NFA +using the classical backtracking algorithm with a twist: it avoids redoing +work that it has done before and thereby avoids worst case exponential time. +In exchange, it can only be used on "short" haystacks. Its advantage is that +is can be faster than the [`PikeVM`](thompson::pikevm::PikeVM) in many cases +because it does less book-keeping. +*/ + +use alloc::{vec, vec::Vec}; + +use crate::{ + nfa::thompson::{self, BuildError, State, NFA}, + util::{ + captures::Captures, + empty, iter, + prefilter::Prefilter, + primitives::{NonMaxUsize, PatternID, SmallIndex, StateID}, + search::{Anchored, HalfMatch, Input, Match, MatchError, Span}, + }, +}; + +/// Returns the minimum visited capacity for the given haystack. +/// +/// This function can be used as the argument to [`Config::visited_capacity`] +/// in order to guarantee that a backtracking search for the given `input` +/// won't return an error when using a [`BoundedBacktracker`] built from the +/// given `NFA`. +/// +/// This routine exists primarily as a way to test that the bounded backtracker +/// works correctly when its capacity is set to the smallest possible amount. +/// Still, it may be useful in cases where you know you want to use the bounded +/// backtracker for a specific input, and just need to know what visited +/// capacity to provide to make it work. +/// +/// Be warned that this number could be quite large as it is multiplicative in +/// the size the given NFA and haystack. +pub fn min_visited_capacity(nfa: &NFA, input: &Input<'_>) -> usize { + div_ceil(nfa.states().len() * (input.get_span().len() + 1), 8) +} + +/// The configuration used for building a bounded backtracker. +/// +/// A bounded backtracker configuration is a simple data object that is +/// typically used with [`Builder::configure`]. +#[derive(Clone, Debug, Default)] +pub struct Config { + pre: Option<Option<Prefilter>>, + visited_capacity: Option<usize>, +} + +impl Config { + /// Return a new default regex configuration. + pub fn new() -> Config { + Config::default() + } + + /// Set a prefilter to be used whenever a start state is entered. + /// + /// A [`Prefilter`] in this context is meant to accelerate searches by + /// looking for literal prefixes that every match for the corresponding + /// pattern (or patterns) must start with. Once a prefilter produces a + /// match, the underlying search routine continues on to try and confirm + /// the match. + /// + /// Be warned that setting a prefilter does not guarantee that the search + /// will be faster. While it's usually a good bet, if the prefilter + /// produces a lot of false positive candidates (i.e., positions matched + /// by the prefilter but not by the regex), then the overall result can + /// be slower than if you had just executed the regex engine without any + /// prefilters. + /// + /// By default no prefilter is set. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{ + /// nfa::thompson::backtrack::BoundedBacktracker, + /// util::prefilter::Prefilter, + /// Input, Match, MatchKind, + /// }; + /// + /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["foo", "bar"]); + /// let re = BoundedBacktracker::builder() + /// .configure(BoundedBacktracker::config().prefilter(pre)) + /// .build(r"(foo|bar)[a-z]+")?; + /// let mut cache = re.create_cache(); + /// let input = Input::new("foo1 barfox bar"); + /// assert_eq!( + /// Some(Match::must(0, 5..11)), + /// re.try_find(&mut cache, input)?, + /// ); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// Be warned though that an incorrect prefilter can lead to incorrect + /// results! + /// + /// ``` + /// use regex_automata::{ + /// nfa::thompson::backtrack::BoundedBacktracker, + /// util::prefilter::Prefilter, + /// Input, HalfMatch, MatchKind, + /// }; + /// + /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["foo", "car"]); + /// let re = BoundedBacktracker::builder() + /// .configure(BoundedBacktracker::config().prefilter(pre)) + /// .build(r"(foo|bar)[a-z]+")?; + /// let mut cache = re.create_cache(); + /// let input = Input::new("foo1 barfox bar"); + /// // No match reported even though there clearly is one! + /// assert_eq!(None, re.try_find(&mut cache, input)?); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn prefilter(mut self, pre: Option<Prefilter>) -> Config { + self.pre = Some(pre); + self + } + + /// Set the visited capacity used to bound backtracking. + /// + /// The visited capacity represents the amount of heap memory (in bytes) to + /// allocate toward tracking which parts of the backtracking search have + /// been done before. The heap memory needed for any particular search is + /// proportional to `haystack.len() * nfa.states().len()`, which an be + /// quite large. Therefore, the bounded backtracker is typically only able + /// to run on shorter haystacks. + /// + /// For a given regex, increasing the visited capacity means that the + /// maximum haystack length that can be searched is increased. The + /// [`BoundedBacktracker::max_haystack_len`] method returns that maximum. + /// + /// The default capacity is a reasonable but empirically chosen size. + /// + /// # Example + /// + /// As with other regex engines, Unicode is what tends to make the bounded + /// backtracker less useful by making the maximum haystack length quite + /// small. If necessary, increasing the visited capacity using this routine + /// will increase the maximum haystack length at the cost of using more + /// memory. + /// + /// Note though that the specific maximum values here are not an API + /// guarantee. The default visited capacity is subject to change and not + /// covered by semver. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::nfa::thompson::backtrack::BoundedBacktracker; + /// + /// // Unicode inflates the size of the underlying NFA quite a bit, and + /// // thus means that the backtracker can only handle smaller haystacks, + /// // assuming that the visited capacity remains unchanged. + /// let re = BoundedBacktracker::new(r"\w+")?; + /// assert!(re.max_haystack_len() <= 7_000); + /// // But we can increase the visited capacity to handle bigger haystacks! + /// let re = BoundedBacktracker::builder() + /// .configure(BoundedBacktracker::config().visited_capacity(1<<20)) + /// .build(r"\w+")?; + /// assert!(re.max_haystack_len() >= 25_000); + /// assert!(re.max_haystack_len() <= 28_000); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn visited_capacity(mut self, capacity: usize) -> Config { + self.visited_capacity = Some(capacity); + self + } + + /// Returns the prefilter set in this configuration, if one at all. + pub fn get_prefilter(&self) -> Option<&Prefilter> { + self.pre.as_ref().unwrap_or(&None).as_ref() + } + + /// Returns the configured visited capacity. + /// + /// Note that the actual capacity used may be slightly bigger than the + /// configured capacity. + pub fn get_visited_capacity(&self) -> usize { + const DEFAULT: usize = 256 * (1 << 10); // 256 KB + self.visited_capacity.unwrap_or(DEFAULT) + } + + /// Overwrite the default configuration such that the options in `o` are + /// always used. If an option in `o` is not set, then the corresponding + /// option in `self` is used. If it's not set in `self` either, then it + /// remains not set. + pub(crate) fn overwrite(&self, o: Config) -> Config { + Config { + pre: o.pre.or_else(|| self.pre.clone()), + visited_capacity: o.visited_capacity.or(self.visited_capacity), + } + } +} + +/// A builder for a bounded backtracker. +/// +/// This builder permits configuring options for the syntax of a pattern, the +/// NFA construction and the `BoundedBacktracker` construction. This builder +/// is different from a general purpose regex builder in that it permits fine +/// grain configuration of the construction process. The trade off for this is +/// complexity, and the possibility of setting a configuration that might not +/// make sense. For example, there are two different UTF-8 modes: +/// +/// * [`syntax::Config::utf8`](crate::util::syntax::Config::utf8) controls +/// whether the pattern itself can contain sub-expressions that match invalid +/// UTF-8. +/// * [`thompson::Config::utf8`] controls how the regex iterators themselves +/// advance the starting position of the next search when a match with zero +/// length is found. +/// +/// Generally speaking, callers will want to either enable all of these or +/// disable all of these. +/// +/// # Example +/// +/// This example shows how to disable UTF-8 mode in the syntax and the regex +/// itself. This is generally what you want for matching on arbitrary bytes. +/// +/// ``` +/// use regex_automata::{ +/// nfa::thompson::{self, backtrack::BoundedBacktracker}, +/// util::syntax, +/// Match, +/// }; +/// +/// let re = BoundedBacktracker::builder() +/// .syntax(syntax::Config::new().utf8(false)) +/// .thompson(thompson::Config::new().utf8(false)) +/// .build(r"foo(?-u:[^b])ar.*")?; +/// let mut cache = re.create_cache(); +/// +/// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n"; +/// let expected = Some(Ok(Match::must(0, 1..9))); +/// let got = re.try_find_iter(&mut cache, haystack).next(); +/// assert_eq!(expected, got); +/// // Notice that `(?-u:[^b])` matches invalid UTF-8, +/// // but the subsequent `.*` does not! Disabling UTF-8 +/// // on the syntax permits this. +/// // +/// // N.B. This example does not show the impact of +/// // disabling UTF-8 mode on a BoundedBacktracker Config, since that +/// // only impacts regexes that can produce matches of +/// // length 0. +/// assert_eq!(b"foo\xFFarzz", &haystack[got.unwrap()?.range()]); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +#[derive(Clone, Debug)] +pub struct Builder { + config: Config, + #[cfg(feature = "syntax")] + thompson: thompson::Compiler, +} + +impl Builder { + /// Create a new BoundedBacktracker builder with its default configuration. + pub fn new() -> Builder { + Builder { + config: Config::default(), + #[cfg(feature = "syntax")] + thompson: thompson::Compiler::new(), + } + } + + /// Build a `BoundedBacktracker` from the given pattern. + /// + /// If there was a problem parsing or compiling the pattern, then an error + /// is returned. + #[cfg(feature = "syntax")] + pub fn build( + &self, + pattern: &str, + ) -> Result<BoundedBacktracker, BuildError> { + self.build_many(&[pattern]) + } + + /// Build a `BoundedBacktracker` from the given patterns. + #[cfg(feature = "syntax")] + pub fn build_many<P: AsRef<str>>( + &self, + patterns: &[P], + ) -> Result<BoundedBacktracker, BuildError> { + let nfa = self.thompson.build_many(patterns)?; + self.build_from_nfa(nfa) + } + + /// Build a `BoundedBacktracker` directly from its NFA. + /// + /// Note that when using this method, any configuration that applies to the + /// construction of the NFA itself will of course be ignored, since the NFA + /// given here is already built. + pub fn build_from_nfa( + &self, + nfa: NFA, + ) -> Result<BoundedBacktracker, BuildError> { + nfa.look_set_any().available().map_err(BuildError::word)?; + Ok(BoundedBacktracker { config: self.config.clone(), nfa }) + } + + /// Apply the given `BoundedBacktracker` configuration options to this + /// builder. + pub fn configure(&mut self, config: Config) -> &mut Builder { + self.config = self.config.overwrite(config); + self + } + + /// Set the syntax configuration for this builder using + /// [`syntax::Config`](crate::util::syntax::Config). + /// + /// This permits setting things like case insensitivity, Unicode and multi + /// line mode. + /// + /// These settings only apply when constructing a `BoundedBacktracker` + /// directly from a pattern. + #[cfg(feature = "syntax")] + pub fn syntax( + &mut self, + config: crate::util::syntax::Config, + ) -> &mut Builder { + self.thompson.syntax(config); + self + } + + /// Set the Thompson NFA configuration for this builder using + /// [`nfa::thompson::Config`](crate::nfa::thompson::Config). + /// + /// This permits setting things like if additional time should be spent + /// shrinking the size of the NFA. + /// + /// These settings only apply when constructing a `BoundedBacktracker` + /// directly from a pattern. + #[cfg(feature = "syntax")] + pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder { + self.thompson.configure(config); + self + } +} + +/// A backtracking regex engine that bounds its execution to avoid exponential +/// blow-up. +/// +/// This regex engine only implements leftmost-first match semantics and +/// only supports leftmost searches. It effectively does the same thing as a +/// [`PikeVM`](thompson::pikevm::PikeVM), but typically does it faster because +/// it doesn't have to worry about copying capturing group spans for most NFA +/// states. Instead, the backtracker can maintain one set of captures (provided +/// by the caller) and never needs to copy them. In exchange, the backtracker +/// bounds itself to ensure it doesn't exhibit worst case exponential time. +/// This results in the backtracker only being able to handle short haystacks +/// given reasonable memory usage. +/// +/// # Searches may return an error! +/// +/// By design, this backtracking regex engine is bounded. This bound is +/// implemented by not visiting any combination of NFA state ID and position +/// in a haystack more than once. Thus, the total memory required to bound +/// backtracking is proportional to `haystack.len() * nfa.states().len()`. +/// This can obviously get quite large, since large haystacks aren't terribly +/// uncommon. To avoid using exorbitant memory, the capacity is bounded by +/// a fixed limit set via [`Config::visited_capacity`]. Thus, if the total +/// capacity required for a particular regex and a haystack exceeds this +/// capacity, then the search routine will return an error. +/// +/// Unlike other regex engines that may return an error at search time (like +/// the DFA or the hybrid NFA/DFA), there is no way to guarantee that a bounded +/// backtracker will work for every haystack. Therefore, this regex engine +/// _only_ exposes fallible search routines to avoid the footgun of panicking +/// when running a search on a haystack that is too big. +/// +/// If one wants to use the fallible search APIs without handling the +/// error, the only way to guarantee an error won't occur from the +/// haystack length is to ensure the haystack length does not exceed +/// [`BoundedBacktracker::max_haystack_len`]. +/// +/// # Example: Unicode word boundaries +/// +/// This example shows that the bounded backtracker implements Unicode word +/// boundaries correctly by default. +/// +/// ``` +/// # if cfg!(miri) { return Ok(()); } // miri takes too long +/// use regex_automata::{nfa::thompson::backtrack::BoundedBacktracker, Match}; +/// +/// let re = BoundedBacktracker::new(r"\b\w+\b")?; +/// let mut cache = re.create_cache(); +/// +/// let mut it = re.try_find_iter(&mut cache, "Шерлок Холмс"); +/// assert_eq!(Some(Ok(Match::must(0, 0..12))), it.next()); +/// assert_eq!(Some(Ok(Match::must(0, 13..23))), it.next()); +/// assert_eq!(None, it.next()); +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +/// +/// # Example: multiple regex patterns +/// +/// The bounded backtracker supports searching for multiple patterns +/// simultaneously, just like other regex engines. Note though that because it +/// uses a backtracking strategy, this regex engine is unlikely to scale well +/// as more patterns are added. But then again, as more patterns are added, the +/// maximum haystack length allowed will also shorten (assuming the visited +/// capacity remains invariant). +/// +/// ``` +/// use regex_automata::{nfa::thompson::backtrack::BoundedBacktracker, Match}; +/// +/// let re = BoundedBacktracker::new_many(&["[a-z]+", "[0-9]+"])?; +/// let mut cache = re.create_cache(); +/// +/// let mut it = re.try_find_iter(&mut cache, "abc 1 foo 4567 0 quux"); +/// assert_eq!(Some(Ok(Match::must(0, 0..3))), it.next()); +/// assert_eq!(Some(Ok(Match::must(1, 4..5))), it.next()); +/// assert_eq!(Some(Ok(Match::must(0, 6..9))), it.next()); +/// assert_eq!(Some(Ok(Match::must(1, 10..14))), it.next()); +/// assert_eq!(Some(Ok(Match::must(1, 15..16))), it.next()); +/// assert_eq!(Some(Ok(Match::must(0, 17..21))), it.next()); +/// assert_eq!(None, it.next()); +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +#[derive(Clone, Debug)] +pub struct BoundedBacktracker { + config: Config, + nfa: NFA, +} + +impl BoundedBacktracker { + /// Parse the given regular expression using the default configuration and + /// return the corresponding `BoundedBacktracker`. + /// + /// If you want a non-default configuration, then use the [`Builder`] to + /// set your own configuration. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{ + /// nfa::thompson::backtrack::BoundedBacktracker, + /// Match, + /// }; + /// + /// let re = BoundedBacktracker::new("foo[0-9]+bar")?; + /// let mut cache = re.create_cache(); + /// assert_eq!( + /// Some(Ok(Match::must(0, 3..14))), + /// re.try_find_iter(&mut cache, "zzzfoo12345barzzz").next(), + /// ); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[cfg(feature = "syntax")] + pub fn new(pattern: &str) -> Result<BoundedBacktracker, BuildError> { + BoundedBacktracker::builder().build(pattern) + } + + /// Like `new`, but parses multiple patterns into a single "multi regex." + /// This similarly uses the default regex configuration. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{ + /// nfa::thompson::backtrack::BoundedBacktracker, + /// Match, + /// }; + /// + /// let re = BoundedBacktracker::new_many(&["[a-z]+", "[0-9]+"])?; + /// let mut cache = re.create_cache(); + /// + /// let mut it = re.try_find_iter(&mut cache, "abc 1 foo 4567 0 quux"); + /// assert_eq!(Some(Ok(Match::must(0, 0..3))), it.next()); + /// assert_eq!(Some(Ok(Match::must(1, 4..5))), it.next()); + /// assert_eq!(Some(Ok(Match::must(0, 6..9))), it.next()); + /// assert_eq!(Some(Ok(Match::must(1, 10..14))), it.next()); + /// assert_eq!(Some(Ok(Match::must(1, 15..16))), it.next()); + /// assert_eq!(Some(Ok(Match::must(0, 17..21))), it.next()); + /// assert_eq!(None, it.next()); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[cfg(feature = "syntax")] + pub fn new_many<P: AsRef<str>>( + patterns: &[P], + ) -> Result<BoundedBacktracker, BuildError> { + BoundedBacktracker::builder().build_many(patterns) + } + + /// # Example + /// + /// This shows how to hand assemble a regular expression via its HIR, + /// compile an NFA from it and build a BoundedBacktracker from the NFA. + /// + /// ``` + /// use regex_automata::{ + /// nfa::thompson::{NFA, backtrack::BoundedBacktracker}, + /// Match, + /// }; + /// use regex_syntax::hir::{Hir, Class, ClassBytes, ClassBytesRange}; + /// + /// let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![ + /// ClassBytesRange::new(b'0', b'9'), + /// ClassBytesRange::new(b'A', b'Z'), + /// ClassBytesRange::new(b'_', b'_'), + /// ClassBytesRange::new(b'a', b'z'), + /// ]))); + /// + /// let config = NFA::config().nfa_size_limit(Some(1_000)); + /// let nfa = NFA::compiler().configure(config).build_from_hir(&hir)?; + /// + /// let re = BoundedBacktracker::new_from_nfa(nfa)?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// let expected = Some(Match::must(0, 3..4)); + /// re.try_captures(&mut cache, "!@#A#@!", &mut caps)?; + /// assert_eq!(expected, caps.get_match()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn new_from_nfa(nfa: NFA) -> Result<BoundedBacktracker, BuildError> { + BoundedBacktracker::builder().build_from_nfa(nfa) + } + + /// Create a new `BoundedBacktracker` that matches every input. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{ + /// nfa::thompson::backtrack::BoundedBacktracker, + /// Match, + /// }; + /// + /// let re = BoundedBacktracker::always_match()?; + /// let mut cache = re.create_cache(); + /// + /// let expected = Some(Ok(Match::must(0, 0..0))); + /// assert_eq!(expected, re.try_find_iter(&mut cache, "").next()); + /// assert_eq!(expected, re.try_find_iter(&mut cache, "foo").next()); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn always_match() -> Result<BoundedBacktracker, BuildError> { + let nfa = thompson::NFA::always_match(); + BoundedBacktracker::new_from_nfa(nfa) + } + + /// Create a new `BoundedBacktracker` that never matches any input. + /// + /// # Example + /// + /// ``` + /// use regex_automata::nfa::thompson::backtrack::BoundedBacktracker; + /// + /// let re = BoundedBacktracker::never_match()?; + /// let mut cache = re.create_cache(); + /// + /// assert_eq!(None, re.try_find_iter(&mut cache, "").next()); + /// assert_eq!(None, re.try_find_iter(&mut cache, "foo").next()); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn never_match() -> Result<BoundedBacktracker, BuildError> { + let nfa = thompson::NFA::never_match(); + BoundedBacktracker::new_from_nfa(nfa) + } + + /// Return a default configuration for a `BoundedBacktracker`. + /// + /// This is a convenience routine to avoid needing to import the `Config` + /// type when customizing the construction of a `BoundedBacktracker`. + /// + /// # Example + /// + /// This example shows how to disable UTF-8 mode. When UTF-8 mode is + /// disabled, zero-width matches that split a codepoint are allowed. + /// Otherwise they are never reported. + /// + /// In the code below, notice that `""` is permitted to match positions + /// that split the encoding of a codepoint. + /// + /// ``` + /// use regex_automata::{ + /// nfa::thompson::{self, backtrack::BoundedBacktracker}, + /// Match, + /// }; + /// + /// let re = BoundedBacktracker::builder() + /// .thompson(thompson::Config::new().utf8(false)) + /// .build(r"")?; + /// let mut cache = re.create_cache(); + /// + /// let haystack = "a☃z"; + /// let mut it = re.try_find_iter(&mut cache, haystack); + /// assert_eq!(Some(Ok(Match::must(0, 0..0))), it.next()); + /// assert_eq!(Some(Ok(Match::must(0, 1..1))), it.next()); + /// assert_eq!(Some(Ok(Match::must(0, 2..2))), it.next()); + /// assert_eq!(Some(Ok(Match::must(0, 3..3))), it.next()); + /// assert_eq!(Some(Ok(Match::must(0, 4..4))), it.next()); + /// assert_eq!(Some(Ok(Match::must(0, 5..5))), it.next()); + /// assert_eq!(None, it.next()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn config() -> Config { + Config::new() + } + + /// Return a builder for configuring the construction of a + /// `BoundedBacktracker`. + /// + /// This is a convenience routine to avoid needing to import the + /// [`Builder`] type in common cases. + /// + /// # Example + /// + /// This example shows how to use the builder to disable UTF-8 mode + /// everywhere. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{ + /// nfa::thompson::{self, backtrack::BoundedBacktracker}, + /// util::syntax, + /// Match, + /// }; + /// + /// let re = BoundedBacktracker::builder() + /// .syntax(syntax::Config::new().utf8(false)) + /// .thompson(thompson::Config::new().utf8(false)) + /// .build(r"foo(?-u:[^b])ar.*")?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// + /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n"; + /// let expected = Some(Match::must(0, 1..9)); + /// re.try_captures(&mut cache, haystack, &mut caps)?; + /// assert_eq!(expected, caps.get_match()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn builder() -> Builder { + Builder::new() + } + + /// Create a new cache for this regex. + /// + /// The cache returned should only be used for searches for this + /// regex. If you want to reuse the cache for another regex, then you + /// must call [`Cache::reset`] with that regex (or, equivalently, + /// [`BoundedBacktracker::reset_cache`]). + pub fn create_cache(&self) -> Cache { + Cache::new(self) + } + + /// Create a new empty set of capturing groups that is guaranteed to be + /// valid for the search APIs on this `BoundedBacktracker`. + /// + /// A `Captures` value created for a specific `BoundedBacktracker` cannot + /// be used with any other `BoundedBacktracker`. + /// + /// This is a convenience function for [`Captures::all`]. See the + /// [`Captures`] documentation for an explanation of its alternative + /// constructors that permit the `BoundedBacktracker` to do less work + /// during a search, and thus might make it faster. + pub fn create_captures(&self) -> Captures { + Captures::all(self.get_nfa().group_info().clone()) + } + + /// Reset the given cache such that it can be used for searching with the + /// this `BoundedBacktracker` (and only this `BoundedBacktracker`). + /// + /// A cache reset permits reusing memory already allocated in this cache + /// with a different `BoundedBacktracker`. + /// + /// # Example + /// + /// This shows how to re-purpose a cache for use with a different + /// `BoundedBacktracker`. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{ + /// nfa::thompson::backtrack::BoundedBacktracker, + /// Match, + /// }; + /// + /// let re1 = BoundedBacktracker::new(r"\w")?; + /// let re2 = BoundedBacktracker::new(r"\W")?; + /// + /// let mut cache = re1.create_cache(); + /// assert_eq!( + /// Some(Ok(Match::must(0, 0..2))), + /// re1.try_find_iter(&mut cache, "Δ").next(), + /// ); + /// + /// // Using 'cache' with re2 is not allowed. It may result in panics or + /// // incorrect results. In order to re-purpose the cache, we must reset + /// // it with the BoundedBacktracker we'd like to use it with. + /// // + /// // Similarly, after this reset, using the cache with 're1' is also not + /// // allowed. + /// cache.reset(&re2); + /// assert_eq!( + /// Some(Ok(Match::must(0, 0..3))), + /// re2.try_find_iter(&mut cache, "☃").next(), + /// ); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn reset_cache(&self, cache: &mut Cache) { + cache.reset(self); + } + + /// Returns the total number of patterns compiled into this + /// `BoundedBacktracker`. + /// + /// In the case of a `BoundedBacktracker` that contains no patterns, this + /// returns `0`. + /// + /// # Example + /// + /// This example shows the pattern length for a `BoundedBacktracker` that + /// never matches: + /// + /// ``` + /// use regex_automata::nfa::thompson::backtrack::BoundedBacktracker; + /// + /// let re = BoundedBacktracker::never_match()?; + /// assert_eq!(re.pattern_len(), 0); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// And another example for a `BoundedBacktracker` that matches at every + /// position: + /// + /// ``` + /// use regex_automata::nfa::thompson::backtrack::BoundedBacktracker; + /// + /// let re = BoundedBacktracker::always_match()?; + /// assert_eq!(re.pattern_len(), 1); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// And finally, a `BoundedBacktracker` that was constructed from multiple + /// patterns: + /// + /// ``` + /// use regex_automata::nfa::thompson::backtrack::BoundedBacktracker; + /// + /// let re = BoundedBacktracker::new_many(&["[0-9]+", "[a-z]+", "[A-Z]+"])?; + /// assert_eq!(re.pattern_len(), 3); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn pattern_len(&self) -> usize { + self.nfa.pattern_len() + } + + /// Return the config for this `BoundedBacktracker`. + #[inline] + pub fn get_config(&self) -> &Config { + &self.config + } + + /// Returns a reference to the underlying NFA. + #[inline] + pub fn get_nfa(&self) -> &NFA { + &self.nfa + } + + /// Returns the maximum haystack length supported by this backtracker. + /// + /// This routine is a function of both [`Config::visited_capacity`] and the + /// internal size of the backtracker's NFA. + /// + /// # Example + /// + /// This example shows how the maximum haystack length can vary depending + /// on the size of the regex itself. Note though that the specific maximum + /// values here are not an API guarantee. The default visited capacity is + /// subject to change and not covered by semver. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{ + /// nfa::thompson::backtrack::BoundedBacktracker, + /// Match, MatchError, + /// }; + /// + /// // If you're only using ASCII, you get a big budget. + /// let re = BoundedBacktracker::new(r"(?-u)\w+")?; + /// let mut cache = re.create_cache(); + /// assert_eq!(re.max_haystack_len(), 299_592); + /// // Things work up to the max. + /// let mut haystack = "a".repeat(299_592); + /// let expected = Some(Ok(Match::must(0, 0..299_592))); + /// assert_eq!(expected, re.try_find_iter(&mut cache, &haystack).next()); + /// // But you'll get an error if you provide a haystack that's too big. + /// // Notice that we use the 'try_find_iter' routine instead, which + /// // yields Result<Match, MatchError> instead of Match. + /// haystack.push('a'); + /// let expected = Some(Err(MatchError::haystack_too_long(299_593))); + /// assert_eq!(expected, re.try_find_iter(&mut cache, &haystack).next()); + /// + /// // Unicode inflates the size of the underlying NFA quite a bit, and + /// // thus means that the backtracker can only handle smaller haystacks, + /// // assuming that the visited capacity remains unchanged. + /// let re = BoundedBacktracker::new(r"\w+")?; + /// assert!(re.max_haystack_len() <= 7_000); + /// // But we can increase the visited capacity to handle bigger haystacks! + /// let re = BoundedBacktracker::builder() + /// .configure(BoundedBacktracker::config().visited_capacity(1<<20)) + /// .build(r"\w+")?; + /// assert!(re.max_haystack_len() >= 25_000); + /// assert!(re.max_haystack_len() <= 28_000); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn max_haystack_len(&self) -> usize { + // The capacity given in the config is "bytes of heap memory," but the + // capacity we use here is "number of bits." So convert the capacity in + // bytes to the capacity in bits. + let capacity = 8 * self.get_config().get_visited_capacity(); + let blocks = div_ceil(capacity, Visited::BLOCK_SIZE); + let real_capacity = blocks * Visited::BLOCK_SIZE; + (real_capacity / self.nfa.states().len()) - 1 + } +} + +impl BoundedBacktracker { + /// Returns true if and only if this regex matches the given haystack. + /// + /// In the case of a backtracking regex engine, and unlike most other + /// regex engines in this crate, short circuiting isn't practical. However, + /// this routine may still be faster because it instructs backtracking to + /// not keep track of any capturing groups. + /// + /// # Errors + /// + /// This routine only errors if the search could not complete. For this + /// backtracking regex engine, this only occurs when the haystack length + /// exceeds [`BoundedBacktracker::max_haystack_len`]. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// # Example + /// + /// ``` + /// use regex_automata::nfa::thompson::backtrack::BoundedBacktracker; + /// + /// let re = BoundedBacktracker::new("foo[0-9]+bar")?; + /// let mut cache = re.create_cache(); + /// + /// assert!(re.try_is_match(&mut cache, "foo12345bar")?); + /// assert!(!re.try_is_match(&mut cache, "foobar")?); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// # Example: consistency with search APIs + /// + /// `is_match` is guaranteed to return `true` whenever `find` returns a + /// match. This includes searches that are executed entirely within a + /// codepoint: + /// + /// ``` + /// use regex_automata::{ + /// nfa::thompson::backtrack::BoundedBacktracker, + /// Input, + /// }; + /// + /// let re = BoundedBacktracker::new("a*")?; + /// let mut cache = re.create_cache(); + /// + /// assert!(!re.try_is_match(&mut cache, Input::new("☃").span(1..2))?); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// Notice that when UTF-8 mode is disabled, then the above reports a + /// match because the restriction against zero-width matches that split a + /// codepoint has been lifted: + /// + /// ``` + /// use regex_automata::{ + /// nfa::thompson::{backtrack::BoundedBacktracker, NFA}, + /// Input, + /// }; + /// + /// let re = BoundedBacktracker::builder() + /// .thompson(NFA::config().utf8(false)) + /// .build("a*")?; + /// let mut cache = re.create_cache(); + /// + /// assert!(re.try_is_match(&mut cache, Input::new("☃").span(1..2))?); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn try_is_match<'h, I: Into<Input<'h>>>( + &self, + cache: &mut Cache, + input: I, + ) -> Result<bool, MatchError> { + let input = input.into().earliest(true); + self.try_search_slots(cache, &input, &mut []).map(|pid| pid.is_some()) + } + + /// Executes a leftmost forward search and returns a `Match` if one exists. + /// + /// This routine only includes the overall match span. To get + /// access to the individual spans of each capturing group, use + /// [`BoundedBacktracker::try_captures`]. + /// + /// # Errors + /// + /// This routine only errors if the search could not complete. For this + /// backtracking regex engine, this only occurs when the haystack length + /// exceeds [`BoundedBacktracker::max_haystack_len`]. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{ + /// nfa::thompson::backtrack::BoundedBacktracker, + /// Match, + /// }; + /// + /// let re = BoundedBacktracker::new("foo[0-9]+")?; + /// let mut cache = re.create_cache(); + /// let expected = Match::must(0, 0..8); + /// assert_eq!(Some(expected), re.try_find(&mut cache, "foo12345")?); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn try_find<'h, I: Into<Input<'h>>>( + &self, + cache: &mut Cache, + input: I, + ) -> Result<Option<Match>, MatchError> { + let input = input.into(); + if self.get_nfa().pattern_len() == 1 { + let mut slots = [None, None]; + let pid = match self.try_search_slots(cache, &input, &mut slots)? { + None => return Ok(None), + Some(pid) => pid, + }; + let start = match slots[0] { + None => return Ok(None), + Some(s) => s.get(), + }; + let end = match slots[1] { + None => return Ok(None), + Some(s) => s.get(), + }; + return Ok(Some(Match::new(pid, Span { start, end }))); + } + let ginfo = self.get_nfa().group_info(); + let slots_len = ginfo.implicit_slot_len(); + let mut slots = vec![None; slots_len]; + let pid = match self.try_search_slots(cache, &input, &mut slots)? { + None => return Ok(None), + Some(pid) => pid, + }; + let start = match slots[pid.as_usize() * 2] { + None => return Ok(None), + Some(s) => s.get(), + }; + let end = match slots[pid.as_usize() * 2 + 1] { + None => return Ok(None), + Some(s) => s.get(), + }; + Ok(Some(Match::new(pid, Span { start, end }))) + } + + /// Executes a leftmost forward search and writes the spans of capturing + /// groups that participated in a match into the provided [`Captures`] + /// value. If no match was found, then [`Captures::is_match`] is guaranteed + /// to return `false`. + /// + /// # Errors + /// + /// This routine only errors if the search could not complete. For this + /// backtracking regex engine, this only occurs when the haystack length + /// exceeds [`BoundedBacktracker::max_haystack_len`]. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{ + /// nfa::thompson::backtrack::BoundedBacktracker, + /// Span, + /// }; + /// + /// let re = BoundedBacktracker::new( + /// r"^([0-9]{4})-([0-9]{2})-([0-9]{2})$", + /// )?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// + /// re.try_captures(&mut cache, "2010-03-14", &mut caps)?; + /// assert!(caps.is_match()); + /// assert_eq!(Some(Span::from(0..4)), caps.get_group(1)); + /// assert_eq!(Some(Span::from(5..7)), caps.get_group(2)); + /// assert_eq!(Some(Span::from(8..10)), caps.get_group(3)); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn try_captures<'h, I: Into<Input<'h>>>( + &self, + cache: &mut Cache, + input: I, + caps: &mut Captures, + ) -> Result<(), MatchError> { + self.try_search(cache, &input.into(), caps) + } + + /// Returns an iterator over all non-overlapping leftmost matches in the + /// given bytes. If no match exists, then the iterator yields no elements. + /// + /// If the regex engine returns an error at any point, then the iterator + /// will yield that error. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{ + /// nfa::thompson::backtrack::BoundedBacktracker, + /// Match, MatchError, + /// }; + /// + /// let re = BoundedBacktracker::new("foo[0-9]+")?; + /// let mut cache = re.create_cache(); + /// + /// let text = "foo1 foo12 foo123"; + /// let result: Result<Vec<Match>, MatchError> = re + /// .try_find_iter(&mut cache, text) + /// .collect(); + /// let matches = result?; + /// assert_eq!(matches, vec![ + /// Match::must(0, 0..4), + /// Match::must(0, 5..10), + /// Match::must(0, 11..17), + /// ]); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn try_find_iter<'r, 'c, 'h, I: Into<Input<'h>>>( + &'r self, + cache: &'c mut Cache, + input: I, + ) -> TryFindMatches<'r, 'c, 'h> { + let caps = Captures::matches(self.get_nfa().group_info().clone()); + let it = iter::Searcher::new(input.into()); + TryFindMatches { re: self, cache, caps, it } + } + + /// Returns an iterator over all non-overlapping `Captures` values. If no + /// match exists, then the iterator yields no elements. + /// + /// This yields the same matches as [`BoundedBacktracker::try_find_iter`], + /// but it includes the spans of all capturing groups that participate in + /// each match. + /// + /// If the regex engine returns an error at any point, then the iterator + /// will yield that error. + /// + /// **Tip:** See [`util::iter::Searcher`](crate::util::iter::Searcher) for + /// how to correctly iterate over all matches in a haystack while avoiding + /// the creation of a new `Captures` value for every match. (Which you are + /// forced to do with an `Iterator`.) + /// + /// # Example + /// + /// ``` + /// use regex_automata::{ + /// nfa::thompson::backtrack::BoundedBacktracker, + /// Span, + /// }; + /// + /// let re = BoundedBacktracker::new("foo(?P<numbers>[0-9]+)")?; + /// let mut cache = re.create_cache(); + /// + /// let text = "foo1 foo12 foo123"; + /// let mut spans = vec![]; + /// for result in re.try_captures_iter(&mut cache, text) { + /// let caps = result?; + /// // The unwrap is OK since 'numbers' matches if the pattern matches. + /// spans.push(caps.get_group_by_name("numbers").unwrap()); + /// } + /// assert_eq!(spans, vec![ + /// Span::from(3..4), + /// Span::from(8..10), + /// Span::from(14..17), + /// ]); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn try_captures_iter<'r, 'c, 'h, I: Into<Input<'h>>>( + &'r self, + cache: &'c mut Cache, + input: I, + ) -> TryCapturesMatches<'r, 'c, 'h> { + let caps = self.create_captures(); + let it = iter::Searcher::new(input.into()); + TryCapturesMatches { re: self, cache, caps, it } + } +} + +impl BoundedBacktracker { + /// Executes a leftmost forward search and writes the spans of capturing + /// groups that participated in a match into the provided [`Captures`] + /// value. If no match was found, then [`Captures::is_match`] is guaranteed + /// to return `false`. + /// + /// This is like [`BoundedBacktracker::try_captures`], but it accepts a + /// concrete `&Input` instead of an `Into<Input>`. + /// + /// # Errors + /// + /// This routine only errors if the search could not complete. For this + /// backtracking regex engine, this only occurs when the haystack length + /// exceeds [`BoundedBacktracker::max_haystack_len`]. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// # Example: specific pattern search + /// + /// This example shows how to build a multi bounded backtracker that + /// permits searching for specific patterns. + /// + /// ``` + /// use regex_automata::{ + /// nfa::thompson::backtrack::BoundedBacktracker, + /// Anchored, Input, Match, PatternID, + /// }; + /// + /// let re = BoundedBacktracker::new_many(&[ + /// "[a-z0-9]{6}", + /// "[a-z][a-z0-9]{5}", + /// ])?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// let haystack = "foo123"; + /// + /// // Since we are using the default leftmost-first match and both + /// // patterns match at the same starting position, only the first pattern + /// // will be returned in this case when doing a search for any of the + /// // patterns. + /// let expected = Some(Match::must(0, 0..6)); + /// re.try_search(&mut cache, &Input::new(haystack), &mut caps)?; + /// assert_eq!(expected, caps.get_match()); + /// + /// // But if we want to check whether some other pattern matches, then we + /// // can provide its pattern ID. + /// let expected = Some(Match::must(1, 0..6)); + /// let input = Input::new(haystack) + /// .anchored(Anchored::Pattern(PatternID::must(1))); + /// re.try_search(&mut cache, &input, &mut caps)?; + /// assert_eq!(expected, caps.get_match()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// # Example: specifying the bounds of a search + /// + /// This example shows how providing the bounds of a search can produce + /// different results than simply sub-slicing the haystack. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{ + /// nfa::thompson::backtrack::BoundedBacktracker, + /// Match, Input, + /// }; + /// + /// let re = BoundedBacktracker::new(r"\b[0-9]{3}\b")?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// let haystack = "foo123bar"; + /// + /// // Since we sub-slice the haystack, the search doesn't know about + /// // the larger context and assumes that `123` is surrounded by word + /// // boundaries. And of course, the match position is reported relative + /// // to the sub-slice as well, which means we get `0..3` instead of + /// // `3..6`. + /// let expected = Some(Match::must(0, 0..3)); + /// re.try_search(&mut cache, &Input::new(&haystack[3..6]), &mut caps)?; + /// assert_eq!(expected, caps.get_match()); + /// + /// // But if we provide the bounds of the search within the context of the + /// // entire haystack, then the search can take the surrounding context + /// // into account. (And if we did find a match, it would be reported + /// // as a valid offset into `haystack` instead of its sub-slice.) + /// let expected = None; + /// re.try_search( + /// &mut cache, &Input::new(haystack).range(3..6), &mut caps, + /// )?; + /// assert_eq!(expected, caps.get_match()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn try_search( + &self, + cache: &mut Cache, + input: &Input<'_>, + caps: &mut Captures, + ) -> Result<(), MatchError> { + caps.set_pattern(None); + let pid = self.try_search_slots(cache, input, caps.slots_mut())?; + caps.set_pattern(pid); + Ok(()) + } + + /// Executes a leftmost forward search and writes the spans of capturing + /// groups that participated in a match into the provided `slots`, and + /// returns the matching pattern ID. The contents of the slots for patterns + /// other than the matching pattern are unspecified. If no match was found, + /// then `None` is returned and the contents of all `slots` is unspecified. + /// + /// This is like [`BoundedBacktracker::try_search`], but it accepts a raw + /// slots slice instead of a `Captures` value. This is useful in contexts + /// where you don't want or need to allocate a `Captures`. + /// + /// It is legal to pass _any_ number of slots to this routine. If the regex + /// engine would otherwise write a slot offset that doesn't fit in the + /// provided slice, then it is simply skipped. In general though, there are + /// usually three slice lengths you might want to use: + /// + /// * An empty slice, if you only care about which pattern matched. + /// * A slice with + /// [`pattern_len() * 2`](crate::nfa::thompson::NFA::pattern_len) + /// slots, if you only care about the overall match spans for each matching + /// pattern. + /// * A slice with + /// [`slot_len()`](crate::util::captures::GroupInfo::slot_len) slots, which + /// permits recording match offsets for every capturing group in every + /// pattern. + /// + /// # Errors + /// + /// This routine only errors if the search could not complete. For this + /// backtracking regex engine, this only occurs when the haystack length + /// exceeds [`BoundedBacktracker::max_haystack_len`]. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// # Example + /// + /// This example shows how to find the overall match offsets in a + /// multi-pattern search without allocating a `Captures` value. Indeed, we + /// can put our slots right on the stack. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{ + /// nfa::thompson::backtrack::BoundedBacktracker, + /// PatternID, Input, + /// }; + /// + /// let re = BoundedBacktracker::new_many(&[ + /// r"\pL+", + /// r"\d+", + /// ])?; + /// let mut cache = re.create_cache(); + /// let input = Input::new("!@#123"); + /// + /// // We only care about the overall match offsets here, so we just + /// // allocate two slots for each pattern. Each slot records the start + /// // and end of the match. + /// let mut slots = [None; 4]; + /// let pid = re.try_search_slots(&mut cache, &input, &mut slots)?; + /// assert_eq!(Some(PatternID::must(1)), pid); + /// + /// // The overall match offsets are always at 'pid * 2' and 'pid * 2 + 1'. + /// // See 'GroupInfo' for more details on the mapping between groups and + /// // slot indices. + /// let slot_start = pid.unwrap().as_usize() * 2; + /// let slot_end = slot_start + 1; + /// assert_eq!(Some(3), slots[slot_start].map(|s| s.get())); + /// assert_eq!(Some(6), slots[slot_end].map(|s| s.get())); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn try_search_slots( + &self, + cache: &mut Cache, + input: &Input<'_>, + slots: &mut [Option<NonMaxUsize>], + ) -> Result<Option<PatternID>, MatchError> { + let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8(); + if !utf8empty { + let maybe_hm = self.try_search_slots_imp(cache, input, slots)?; + return Ok(maybe_hm.map(|hm| hm.pattern())); + } + // See PikeVM::try_search_slots for why we do this. + let min = self.get_nfa().group_info().implicit_slot_len(); + if slots.len() >= min { + let maybe_hm = self.try_search_slots_imp(cache, input, slots)?; + return Ok(maybe_hm.map(|hm| hm.pattern())); + } + if self.get_nfa().pattern_len() == 1 { + let mut enough = [None, None]; + let got = self.try_search_slots_imp(cache, input, &mut enough)?; + // This is OK because we know `enough_slots` is strictly bigger + // than `slots`, otherwise this special case isn't reached. + slots.copy_from_slice(&enough[..slots.len()]); + return Ok(got.map(|hm| hm.pattern())); + } + let mut enough = vec![None; min]; + let got = self.try_search_slots_imp(cache, input, &mut enough)?; + // This is OK because we know `enough_slots` is strictly bigger than + // `slots`, otherwise this special case isn't reached. + slots.copy_from_slice(&enough[..slots.len()]); + Ok(got.map(|hm| hm.pattern())) + } + + /// This is the actual implementation of `try_search_slots_imp` that + /// doesn't account for the special case when 1) the NFA has UTF-8 mode + /// enabled, 2) the NFA can match the empty string and 3) the caller has + /// provided an insufficient number of slots to record match offsets. + #[inline(never)] + fn try_search_slots_imp( + &self, + cache: &mut Cache, + input: &Input<'_>, + slots: &mut [Option<NonMaxUsize>], + ) -> Result<Option<HalfMatch>, MatchError> { + let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8(); + let hm = match self.search_imp(cache, input, slots)? { + None => return Ok(None), + Some(hm) if !utf8empty => return Ok(Some(hm)), + Some(hm) => hm, + }; + empty::skip_splits_fwd(input, hm, hm.offset(), |input| { + Ok(self + .search_imp(cache, input, slots)? + .map(|hm| (hm, hm.offset()))) + }) + } + + /// The implementation of standard leftmost backtracking search. + /// + /// Capturing group spans are written to 'caps', but only if requested. + /// 'caps' can be one of three things: 1) totally empty, in which case, we + /// only report the pattern that matched or 2) only has slots for recording + /// the overall match offsets for any pattern or 3) has all slots available + /// for recording the spans of any groups participating in a match. + fn search_imp( + &self, + cache: &mut Cache, + input: &Input<'_>, + slots: &mut [Option<NonMaxUsize>], + ) -> Result<Option<HalfMatch>, MatchError> { + // Unlike in the PikeVM, we write our capturing group spans directly + // into the caller's captures groups. So we have to make sure we're + // starting with a blank slate first. In the PikeVM, we avoid this + // by construction: the spans that are copied to every slot in the + // 'Captures' value already account for presence/absence. In this + // backtracker, we write directly into the caller provided slots, where + // as in the PikeVM, we write into scratch space first and only copy + // them to the caller provided slots when a match is found. + for slot in slots.iter_mut() { + *slot = None; + } + cache.setup_search(&self, input)?; + if input.is_done() { + return Ok(None); + } + let (anchored, start_id) = match input.get_anchored() { + // Only way we're unanchored is if both the caller asked for an + // unanchored search *and* the pattern is itself not anchored. + Anchored::No => ( + self.nfa.is_always_start_anchored(), + // We always use the anchored starting state here, even if + // doing an unanchored search. The "unanchored" part of it is + // implemented in the loop below, by simply trying the next + // byte offset if the previous backtracking exploration failed. + self.nfa.start_anchored(), + ), + Anchored::Yes => (true, self.nfa.start_anchored()), + Anchored::Pattern(pid) => match self.nfa.start_pattern(pid) { + None => return Ok(None), + Some(sid) => (true, sid), + }, + }; + if anchored { + let at = input.start(); + return Ok(self.backtrack(cache, input, at, start_id, slots)); + } + let pre = self.get_config().get_prefilter(); + let mut at = input.start(); + while at <= input.end() { + if let Some(ref pre) = pre { + let span = Span::from(at..input.end()); + match pre.find(input.haystack(), span) { + None => break, + Some(ref span) => at = span.start, + } + } + if let Some(hm) = self.backtrack(cache, input, at, start_id, slots) + { + return Ok(Some(hm)); + } + at += 1; + } + Ok(None) + } + + /// Look for a match starting at `at` in `input` and write the matching + /// pattern ID and group spans to `caps`. The search uses `start_id` as its + /// starting state in the underlying NFA. + /// + /// If no match was found, then the caller should increment `at` and try + /// at the next position. + #[cfg_attr(feature = "perf-inline", inline(always))] + fn backtrack( + &self, + cache: &mut Cache, + input: &Input<'_>, + at: usize, + start_id: StateID, + slots: &mut [Option<NonMaxUsize>], + ) -> Option<HalfMatch> { + cache.stack.push(Frame::Step { sid: start_id, at }); + while let Some(frame) = cache.stack.pop() { + match frame { + Frame::Step { sid, at } => { + if let Some(hm) = self.step(cache, input, sid, at, slots) { + return Some(hm); + } + } + Frame::RestoreCapture { slot, offset } => { + slots[slot] = offset; + } + } + } + None + } + + // LAMENTATION: The actual backtracking search is implemented in about + // 75 lines below. Yet this file is over 2,000 lines long. What have I + // done? + + /// Execute a "step" in the backtracing algorithm. + /// + /// A "step" is somewhat of a misnomer, because this routine keeps going + /// until it either runs out of things to try or fins a match. In the + /// former case, it may have pushed some things on to the backtracking + /// stack, in which case, those will be tried next as part of the + /// 'backtrack' routine above. + #[cfg_attr(feature = "perf-inline", inline(always))] + fn step( + &self, + cache: &mut Cache, + input: &Input<'_>, + mut sid: StateID, + mut at: usize, + slots: &mut [Option<NonMaxUsize>], + ) -> Option<HalfMatch> { + loop { + if !cache.visited.insert(sid, at - input.start()) { + return None; + } + match *self.nfa.state(sid) { + State::ByteRange { ref trans } => { + // Why do we need this? Unlike other regex engines in this + // crate, the backtracker can steam roll ahead in the + // haystack outside of the main loop over the bytes in the + // haystack. While 'trans.matches()' below handles the case + // of 'at' being out of bounds of 'input.haystack()', we + // also need to handle the case of 'at' going out of bounds + // of the span the caller asked to search. + // + // We should perhaps make the 'trans.matches()' API accept + // an '&Input' instead of a '&[u8]'. Or at least, add a new + // API that does it. + if at >= input.end() { + return None; + } + if !trans.matches(input.haystack(), at) { + return None; + } + sid = trans.next; + at += 1; + } + State::Sparse(ref sparse) => { + if at >= input.end() { + return None; + } + sid = sparse.matches(input.haystack(), at)?; + at += 1; + } + State::Dense(ref dense) => { + if at >= input.end() { + return None; + } + sid = dense.matches(input.haystack(), at)?; + at += 1; + } + State::Look { look, next } => { + // OK because we don't permit building a searcher with a + // Unicode word boundary if the requisite Unicode data is + // unavailable. + if !self.nfa.look_matcher().matches_inline( + look, + input.haystack(), + at, + ) { + return None; + } + sid = next; + } + State::Union { ref alternates } => { + sid = match alternates.get(0) { + None => return None, + Some(&sid) => sid, + }; + cache.stack.extend( + alternates[1..] + .iter() + .copied() + .rev() + .map(|sid| Frame::Step { sid, at }), + ); + } + State::BinaryUnion { alt1, alt2 } => { + sid = alt1; + cache.stack.push(Frame::Step { sid: alt2, at }); + } + State::Capture { next, slot, .. } => { + if slot.as_usize() < slots.len() { + cache.stack.push(Frame::RestoreCapture { + slot, + offset: slots[slot], + }); + slots[slot] = NonMaxUsize::new(at); + } + sid = next; + } + State::Fail => return None, + State::Match { pattern_id } => { + return Some(HalfMatch::new(pattern_id, at)); + } + } + } + } +} + +/// An iterator over all non-overlapping matches for a fallible search. +/// +/// The iterator yields a `Result<Match, MatchError` value until no more +/// matches could be found. +/// +/// The lifetime parameters are as follows: +/// +/// * `'r` represents the lifetime of the BoundedBacktracker. +/// * `'c` represents the lifetime of the BoundedBacktracker's cache. +/// * `'h` represents the lifetime of the haystack being searched. +/// +/// This iterator can be created with the [`BoundedBacktracker::try_find_iter`] +/// method. +#[derive(Debug)] +pub struct TryFindMatches<'r, 'c, 'h> { + re: &'r BoundedBacktracker, + cache: &'c mut Cache, + caps: Captures, + it: iter::Searcher<'h>, +} + +impl<'r, 'c, 'h> Iterator for TryFindMatches<'r, 'c, 'h> { + type Item = Result<Match, MatchError>; + + #[inline] + fn next(&mut self) -> Option<Result<Match, MatchError>> { + // Splitting 'self' apart seems necessary to appease borrowck. + let TryFindMatches { re, ref mut cache, ref mut caps, ref mut it } = + *self; + it.try_advance(|input| { + re.try_search(cache, input, caps)?; + Ok(caps.get_match()) + }) + .transpose() + } +} + +/// An iterator over all non-overlapping leftmost matches, with their capturing +/// groups, for a fallible search. +/// +/// The iterator yields a `Result<Captures, MatchError>` value until no more +/// matches could be found. +/// +/// The lifetime parameters are as follows: +/// +/// * `'r` represents the lifetime of the BoundedBacktracker. +/// * `'c` represents the lifetime of the BoundedBacktracker's cache. +/// * `'h` represents the lifetime of the haystack being searched. +/// +/// This iterator can be created with the +/// [`BoundedBacktracker::try_captures_iter`] method. +#[derive(Debug)] +pub struct TryCapturesMatches<'r, 'c, 'h> { + re: &'r BoundedBacktracker, + cache: &'c mut Cache, + caps: Captures, + it: iter::Searcher<'h>, +} + +impl<'r, 'c, 'h> Iterator for TryCapturesMatches<'r, 'c, 'h> { + type Item = Result<Captures, MatchError>; + + #[inline] + fn next(&mut self) -> Option<Result<Captures, MatchError>> { + // Splitting 'self' apart seems necessary to appease borrowck. + let TryCapturesMatches { re, ref mut cache, ref mut caps, ref mut it } = + *self; + let _ = it + .try_advance(|input| { + re.try_search(cache, input, caps)?; + Ok(caps.get_match()) + }) + .transpose()?; + if caps.is_match() { + Some(Ok(caps.clone())) + } else { + None + } + } +} + +/// A cache represents mutable state that a [`BoundedBacktracker`] requires +/// during a search. +/// +/// For a given [`BoundedBacktracker`], its corresponding cache may be created +/// either via [`BoundedBacktracker::create_cache`], or via [`Cache::new`]. +/// They are equivalent in every way, except the former does not require +/// explicitly importing `Cache`. +/// +/// A particular `Cache` is coupled with the [`BoundedBacktracker`] from which +/// it was created. It may only be used with that `BoundedBacktracker`. A cache +/// and its allocations may be re-purposed via [`Cache::reset`], in which case, +/// it can only be used with the new `BoundedBacktracker` (and not the old +/// one). +#[derive(Clone, Debug)] +pub struct Cache { + /// Stack used on the heap for doing backtracking instead of the + /// traditional recursive approach. We don't want recursion because then + /// we're likely to hit a stack overflow for bigger regexes. + stack: Vec<Frame>, + /// The set of (StateID, HaystackOffset) pairs that have been visited + /// by the backtracker within a single search. If such a pair has been + /// visited, then we avoid doing the work for that pair again. This is + /// what "bounds" the backtracking and prevents it from having worst case + /// exponential time. + visited: Visited, +} + +impl Cache { + /// Create a new [`BoundedBacktracker`] cache. + /// + /// A potentially more convenient routine to create a cache is + /// [`BoundedBacktracker::create_cache`], as it does not require also + /// importing the `Cache` type. + /// + /// If you want to reuse the returned `Cache` with some other + /// `BoundedBacktracker`, then you must call [`Cache::reset`] with the + /// desired `BoundedBacktracker`. + pub fn new(re: &BoundedBacktracker) -> Cache { + Cache { stack: vec![], visited: Visited::new(re) } + } + + /// Reset this cache such that it can be used for searching with different + /// [`BoundedBacktracker`]. + /// + /// A cache reset permits reusing memory already allocated in this cache + /// with a different `BoundedBacktracker`. + /// + /// # Example + /// + /// This shows how to re-purpose a cache for use with a different + /// `BoundedBacktracker`. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{ + /// nfa::thompson::backtrack::BoundedBacktracker, + /// Match, + /// }; + /// + /// let re1 = BoundedBacktracker::new(r"\w")?; + /// let re2 = BoundedBacktracker::new(r"\W")?; + /// + /// let mut cache = re1.create_cache(); + /// assert_eq!( + /// Some(Ok(Match::must(0, 0..2))), + /// re1.try_find_iter(&mut cache, "Δ").next(), + /// ); + /// + /// // Using 'cache' with re2 is not allowed. It may result in panics or + /// // incorrect results. In order to re-purpose the cache, we must reset + /// // it with the BoundedBacktracker we'd like to use it with. + /// // + /// // Similarly, after this reset, using the cache with 're1' is also not + /// // allowed. + /// cache.reset(&re2); + /// assert_eq!( + /// Some(Ok(Match::must(0, 0..3))), + /// re2.try_find_iter(&mut cache, "☃").next(), + /// ); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn reset(&mut self, re: &BoundedBacktracker) { + self.visited.reset(re); + } + + /// Returns the heap memory usage, in bytes, of this cache. + /// + /// This does **not** include the stack size used up by this cache. To + /// compute that, use `std::mem::size_of::<Cache>()`. + pub fn memory_usage(&self) -> usize { + self.stack.len() * core::mem::size_of::<Frame>() + + self.visited.memory_usage() + } + + /// Clears this cache. This should be called at the start of every search + /// to ensure we start with a clean slate. + /// + /// This also sets the length of the capturing groups used in the current + /// search. This permits an optimization where by 'SlotTable::for_state' + /// only returns the number of slots equivalent to the number of slots + /// given in the 'Captures' value. This may be less than the total number + /// of possible slots, e.g., when one only wants to track overall match + /// offsets. This in turn permits less copying of capturing group spans + /// in the BoundedBacktracker. + fn setup_search( + &mut self, + re: &BoundedBacktracker, + input: &Input<'_>, + ) -> Result<(), MatchError> { + self.stack.clear(); + self.visited.setup_search(re, input)?; + Ok(()) + } +} + +/// Represents a stack frame on the heap while doing backtracking. +/// +/// Instead of using explicit recursion for backtracking, we use a stack on +/// the heap to keep track of things that we want to explore if the current +/// backtracking branch turns out to not lead to a match. +#[derive(Clone, Debug)] +enum Frame { + /// Look for a match starting at `sid` and the given position in the + /// haystack. + Step { sid: StateID, at: usize }, + /// Reset the given `slot` to the given `offset` (which might be `None`). + /// This effectively gives a "scope" to capturing groups, such that an + /// offset for a particular group only gets returned if the match goes + /// through that capturing group. If backtracking ends up going down a + /// different branch that results in a different offset (or perhaps none at + /// all), then this "restore capture" frame will cause the offset to get + /// reset. + RestoreCapture { slot: SmallIndex, offset: Option<NonMaxUsize> }, +} + +/// A bitset that keeps track of whether a particular (StateID, offset) has +/// been considered during backtracking. If it has already been visited, then +/// backtracking skips it. This is what gives backtracking its "bound." +#[derive(Clone, Debug)] +struct Visited { + /// The actual underlying bitset. Each element in the bitset corresponds + /// to a particular (StateID, offset) pair. States correspond to the rows + /// and the offsets correspond to the columns. + /// + /// If our underlying NFA has N states and the haystack we're searching + /// has M bytes, then we have N*(M+1) entries in our bitset table. The + /// M+1 occurs because our matches are delayed by one byte (to support + /// look-around), and so we need to handle the end position itself rather + /// than stopping just before the end. (If there is no end position, then + /// it's treated as "end-of-input," which is matched by things like '$'.) + /// + /// Given BITS=N*(M+1), we wind up with div_ceil(BITS, sizeof(usize)) + /// blocks. + /// + /// We use 'usize' to represent our blocks because it makes some of the + /// arithmetic in 'insert' a bit nicer. For example, if we used 'u32' for + /// our block, we'd either need to cast u32s to usizes or usizes to u32s. + bitset: Vec<usize>, + /// The stride represents one plus length of the haystack we're searching + /// (as described above). The stride must be initialized for each search. + stride: usize, +} + +impl Visited { + /// The size of each block, in bits. + const BLOCK_SIZE: usize = 8 * core::mem::size_of::<usize>(); + + /// Create a new visited set for the given backtracker. + /// + /// The set is ready to use, but must be setup at the beginning of each + /// search by calling `setup_search`. + fn new(re: &BoundedBacktracker) -> Visited { + let mut visited = Visited { bitset: vec![], stride: 0 }; + visited.reset(re); + visited + } + + /// Insert the given (StateID, offset) pair into this set. If it already + /// exists, then this is a no-op and it returns false. Otherwise this + /// returns true. + fn insert(&mut self, sid: StateID, at: usize) -> bool { + let table_index = sid.as_usize() * self.stride + at; + let block_index = table_index / Visited::BLOCK_SIZE; + let bit = table_index % Visited::BLOCK_SIZE; + let block_with_bit = 1 << bit; + if self.bitset[block_index] & block_with_bit != 0 { + return false; + } + self.bitset[block_index] |= block_with_bit; + true + } + + /// Reset this visited set to work with the given bounded backtracker. + fn reset(&mut self, _: &BoundedBacktracker) { + self.bitset.truncate(0); + } + + /// Setup this visited set to work for a search using the given NFA + /// and input configuration. The NFA must be the same NFA used by the + /// BoundedBacktracker given to Visited::reset. Failing to call this might + /// result in panics or silently incorrect search behavior. + fn setup_search( + &mut self, + re: &BoundedBacktracker, + input: &Input<'_>, + ) -> Result<(), MatchError> { + // Our haystack length is only the length of the span of the entire + // haystack that we'll be searching. + let haylen = input.get_span().len(); + let err = || MatchError::haystack_too_long(haylen); + // Our stride is one more than the length of the input because our main + // search loop includes the position at input.end(). (And it does this + // because matches are delayed by one byte to account for look-around.) + self.stride = haylen + 1; + let needed_capacity = + match re.get_nfa().states().len().checked_mul(self.stride) { + None => return Err(err()), + Some(capacity) => capacity, + }; + let max_capacity = 8 * re.get_config().get_visited_capacity(); + if needed_capacity > max_capacity { + return Err(err()); + } + let needed_blocks = div_ceil(needed_capacity, Visited::BLOCK_SIZE); + self.bitset.truncate(needed_blocks); + for block in self.bitset.iter_mut() { + *block = 0; + } + if needed_blocks > self.bitset.len() { + self.bitset.resize(needed_blocks, 0); + } + Ok(()) + } + + /// Return the heap memory usage, in bytes, of this visited set. + fn memory_usage(&self) -> usize { + self.bitset.len() * core::mem::size_of::<usize>() + } +} + +/// Integer division, but rounds up instead of down. +fn div_ceil(lhs: usize, rhs: usize) -> usize { + if lhs % rhs == 0 { + lhs / rhs + } else { + (lhs / rhs) + 1 + } +} diff --git a/third_party/rust/regex-automata/src/nfa/thompson/builder.rs b/third_party/rust/regex-automata/src/nfa/thompson/builder.rs new file mode 100644 index 0000000000..b57e5bc0f3 --- /dev/null +++ b/third_party/rust/regex-automata/src/nfa/thompson/builder.rs @@ -0,0 +1,1337 @@ +use core::mem; + +use alloc::{sync::Arc, vec, vec::Vec}; + +use crate::{ + nfa::thompson::{ + error::BuildError, + nfa::{self, SparseTransitions, Transition, NFA}, + }, + util::{ + look::{Look, LookMatcher}, + primitives::{IteratorIndexExt, PatternID, SmallIndex, StateID}, + }, +}; + +/// An intermediate NFA state used during construction. +/// +/// During construction of an NFA, it is often convenient to work with states +/// that are amenable to mutation and other carry more information than we +/// otherwise need once an NFA has been built. This type represents those +/// needs. +/// +/// Once construction is finished, the builder will convert these states to a +/// [`nfa::thompson::State`](crate::nfa::thompson::State). This conversion not +/// only results in a simpler representation, but in some cases, entire classes +/// of states are completely removed (such as [`State::Empty`]). +#[derive(Clone, Debug, Eq, PartialEq)] +enum State { + /// An empty state whose only purpose is to forward the automaton to + /// another state via an unconditional epsilon transition. + /// + /// Unconditional epsilon transitions are quite useful during the + /// construction of an NFA, as they permit the insertion of no-op + /// placeholders that make it easier to compose NFA sub-graphs. When + /// the Thompson NFA builder produces a final NFA, all unconditional + /// epsilon transitions are removed, and state identifiers are remapped + /// accordingly. + Empty { + /// The next state that this state should transition to. + next: StateID, + }, + /// A state that only transitions to another state if the current input + /// byte is in a particular range of bytes. + ByteRange { trans: Transition }, + /// A state with possibly many transitions, represented in a sparse + /// fashion. Transitions must be ordered lexicographically by input range + /// and be non-overlapping. As such, this may only be used when every + /// transition has equal priority. (In practice, this is only used for + /// encoding large UTF-8 automata.) In contrast, a `Union` state has each + /// alternate in order of priority. Priority is used to implement greedy + /// matching and also alternations themselves, e.g., `abc|a` where `abc` + /// has priority over `a`. + /// + /// To clarify, it is possible to remove `Sparse` and represent all things + /// that `Sparse` is used for via `Union`. But this creates a more bloated + /// NFA with more epsilon transitions than is necessary in the special case + /// of character classes. + Sparse { transitions: Vec<Transition> }, + /// A conditional epsilon transition satisfied via some sort of + /// look-around. + Look { look: Look, next: StateID }, + /// An empty state that records the start of a capture location. This is an + /// unconditional epsilon transition like `Empty`, except it can be used to + /// record position information for a captue group when using the NFA for + /// search. + CaptureStart { + /// The ID of the pattern that this capture was defined. + pattern_id: PatternID, + /// The capture group index that this capture state corresponds to. + /// The capture group index is always relative to its corresponding + /// pattern. Therefore, in the presence of multiple patterns, both the + /// pattern ID and the capture group index are required to uniquely + /// identify a capturing group. + group_index: SmallIndex, + /// The next state that this state should transition to. + next: StateID, + }, + /// An empty state that records the end of a capture location. This is an + /// unconditional epsilon transition like `Empty`, except it can be used to + /// record position information for a captue group when using the NFA for + /// search. + CaptureEnd { + /// The ID of the pattern that this capture was defined. + pattern_id: PatternID, + /// The capture group index that this capture state corresponds to. + /// The capture group index is always relative to its corresponding + /// pattern. Therefore, in the presence of multiple patterns, both the + /// pattern ID and the capture group index are required to uniquely + /// identify a capturing group. + group_index: SmallIndex, + /// The next state that this state should transition to. + next: StateID, + }, + /// An alternation such that there exists an epsilon transition to all + /// states in `alternates`, where matches found via earlier transitions + /// are preferred over later transitions. + Union { alternates: Vec<StateID> }, + /// An alternation such that there exists an epsilon transition to all + /// states in `alternates`, where matches found via later transitions are + /// preferred over earlier transitions. + /// + /// This "reverse" state exists for convenience during compilation that + /// permits easy construction of non-greedy combinations of NFA states. At + /// the end of compilation, Union and UnionReverse states are merged into + /// one Union type of state, where the latter has its epsilon transitions + /// reversed to reflect the priority inversion. + /// + /// The "convenience" here arises from the fact that as new states are + /// added to the list of `alternates`, we would like that add operation + /// to be amortized constant time. But if we used a `Union`, we'd need to + /// prepend the state, which takes O(n) time. There are other approaches we + /// could use to solve this, but this seems simple enough. + UnionReverse { alternates: Vec<StateID> }, + /// A state that cannot be transitioned out of. This is useful for cases + /// where you want to prevent matching from occurring. For example, if your + /// regex parser permits empty character classes, then one could choose a + /// `Fail` state to represent it. + Fail, + /// A match state. There is at most one such occurrence of this state in + /// an NFA for each pattern compiled into the NFA. At time of writing, a + /// match state is always produced for every pattern given, but in theory, + /// if a pattern can never lead to a match, then the match state could be + /// omitted. + /// + /// `pattern_id` refers to the ID of the pattern itself, which corresponds + /// to the pattern's index (starting at 0). + Match { pattern_id: PatternID }, +} + +impl State { + /// If this state is an unconditional espilon transition, then this returns + /// the target of the transition. + fn goto(&self) -> Option<StateID> { + match *self { + State::Empty { next } => Some(next), + State::Union { ref alternates } if alternates.len() == 1 => { + Some(alternates[0]) + } + State::UnionReverse { ref alternates } + if alternates.len() == 1 => + { + Some(alternates[0]) + } + _ => None, + } + } + + /// Returns the heap memory usage, in bytes, of this state. + fn memory_usage(&self) -> usize { + match *self { + State::Empty { .. } + | State::ByteRange { .. } + | State::Look { .. } + | State::CaptureStart { .. } + | State::CaptureEnd { .. } + | State::Fail + | State::Match { .. } => 0, + State::Sparse { ref transitions } => { + transitions.len() * mem::size_of::<Transition>() + } + State::Union { ref alternates } => { + alternates.len() * mem::size_of::<StateID>() + } + State::UnionReverse { ref alternates } => { + alternates.len() * mem::size_of::<StateID>() + } + } + } +} + +/// An abstraction for building Thompson NFAs by hand. +/// +/// A builder is what a [`thompson::Compiler`](crate::nfa::thompson::Compiler) +/// uses internally to translate a regex's high-level intermediate +/// representation into an [`NFA`]. +/// +/// The primary function of this builder is to abstract away the internal +/// representation of an NFA and make it difficult to produce NFAs are that +/// internally invalid or inconsistent. This builder also provides a way to +/// add "empty" states (which can be thought of as unconditional epsilon +/// transitions), despite the fact that [`thompson::State`](nfa::State) does +/// not have any "empty" representation. The advantage of "empty" states is +/// that they make the code for constructing a Thompson NFA logically simpler. +/// +/// Many of the routines on this builder may panic or return errors. Generally +/// speaking, panics occur when an invalid sequence of method calls were made, +/// where as an error occurs if things get too big. (Where "too big" might mean +/// exhausting identifier space or using up too much heap memory in accordance +/// with the configured [`size_limit`](Builder::set_size_limit).) +/// +/// # Overview +/// +/// ## Adding multiple patterns +/// +/// Each pattern you add to an NFA should correspond to a pair of +/// [`Builder::start_pattern`] and [`Builder::finish_pattern`] calls, with +/// calls inbetween that add NFA states for that pattern. NFA states may be +/// added without first calling `start_pattern`, with the exception of adding +/// capturing states. +/// +/// ## Adding NFA states +/// +/// Here is a very brief overview of each of the methods that add NFA states. +/// Every method adds a single state. +/// +/// * [`add_empty`](Builder::add_empty): Add a state with a single +/// unconditional epsilon transition to another state. +/// * [`add_union`](Builder::add_union): Adds a state with unconditional +/// epsilon transitions to two or more states, with earlier transitions +/// preferred over later ones. +/// * [`add_union_reverse`](Builder::add_union_reverse): Adds a state with +/// unconditional epsilon transitions to two or more states, with later +/// transitions preferred over earlier ones. +/// * [`add_range`](Builder::add_range): Adds a state with a single transition +/// to another state that can only be followed if the current input byte is +/// within the range given. +/// * [`add_sparse`](Builder::add_sparse): Adds a state with two or more +/// range transitions to other states, where a transition is only followed +/// if the current input byte is within one of the ranges. All transitions +/// in this state have equal priority, and the corresponding ranges must be +/// non-overlapping. +/// * [`add_look`](Builder::add_look): Adds a state with a single *conditional* +/// epsilon transition to another state, where the condition depends on a +/// limited look-around property. +/// * [`add_capture_start`](Builder::add_capture_start): Adds a state with +/// a single unconditional epsilon transition that also instructs an NFA +/// simulation to record the current input position to a specific location in +/// memory. This is intended to represent the starting location of a capturing +/// group. +/// * [`add_capture_end`](Builder::add_capture_end): Adds a state with +/// a single unconditional epsilon transition that also instructs an NFA +/// simulation to record the current input position to a specific location in +/// memory. This is intended to represent the ending location of a capturing +/// group. +/// * [`add_fail`](Builder::add_fail): Adds a state that never transitions to +/// another state. +/// * [`add_match`](Builder::add_match): Add a state that indicates a match has +/// been found for a particular pattern. A match state is a final state with +/// no outgoing transitions. +/// +/// ## Setting transitions between NFA states +/// +/// The [`Builder::patch`] method creates a transition from one state to the +/// next. If the `from` state corresponds to a state that supports multiple +/// outgoing transitions (such as "union"), then this adds the corresponding +/// transition. Otherwise, it sets the single transition. (This routine panics +/// if `from` corresponds to a state added by `add_sparse`, since sparse states +/// need more specialized handling.) +/// +/// # Example +/// +/// This annotated example shows how to hand construct the regex `[a-z]+` +/// (without an unanchored prefix). +/// +/// ``` +/// use regex_automata::{ +/// nfa::thompson::{pikevm::PikeVM, Builder, Transition}, +/// util::primitives::StateID, +/// Match, +/// }; +/// +/// let mut builder = Builder::new(); +/// // Before adding NFA states for our pattern, we need to tell the builder +/// // that we are starting the pattern. +/// builder.start_pattern()?; +/// // Since we use the Pike VM below for searching, we need to add capturing +/// // states. If you're just going to build a DFA from the NFA, then capturing +/// // states do not need to be added. +/// let start = builder.add_capture_start(StateID::ZERO, 0, None)?; +/// let range = builder.add_range(Transition { +/// // We don't know the state ID of the 'next' state yet, so we just fill +/// // in a dummy 'ZERO' value. +/// start: b'a', end: b'z', next: StateID::ZERO, +/// })?; +/// // This state will point back to 'range', but also enable us to move ahead. +/// // That is, this implements the '+' repetition operator. We add 'range' and +/// // then 'end' below to this alternation. +/// let alt = builder.add_union(vec![])?; +/// // The final state before the match state, which serves to capture the +/// // end location of the match. +/// let end = builder.add_capture_end(StateID::ZERO, 0)?; +/// // The match state for our pattern. +/// let mat = builder.add_match()?; +/// // Now we fill in the transitions between states. +/// builder.patch(start, range)?; +/// builder.patch(range, alt)?; +/// // If we added 'end' before 'range', then we'd implement non-greedy +/// // matching, i.e., '+?'. +/// builder.patch(alt, range)?; +/// builder.patch(alt, end)?; +/// builder.patch(end, mat)?; +/// // We must explicitly finish pattern and provide the starting state ID for +/// // this particular pattern. +/// builder.finish_pattern(start)?; +/// // Finally, when we build the NFA, we provide the anchored and unanchored +/// // starting state IDs. Since we didn't bother with an unanchored prefix +/// // here, we only support anchored searching. Thus, both starting states are +/// // the same. +/// let nfa = builder.build(start, start)?; +/// +/// // Now build a Pike VM from our NFA, and use it for searching. This shows +/// // how we can use a regex engine without ever worrying about syntax! +/// let re = PikeVM::new_from_nfa(nfa)?; +/// let mut cache = re.create_cache(); +/// let mut caps = re.create_captures(); +/// let expected = Some(Match::must(0, 0..3)); +/// re.captures(&mut cache, "foo0", &mut caps); +/// assert_eq!(expected, caps.get_match()); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +#[derive(Clone, Debug, Default)] +pub struct Builder { + /// The ID of the pattern that we're currently building. + /// + /// Callers are required to set (and unset) this by calling + /// {start,finish}_pattern. Otherwise, most methods will panic. + pattern_id: Option<PatternID>, + /// A sequence of intermediate NFA states. Once a state is added to this + /// sequence, it is assigned a state ID equivalent to its index. Once a + /// state is added, it is still expected to be mutated, e.g., to set its + /// transition to a state that didn't exist at the time it was added. + states: Vec<State>, + /// The starting states for each individual pattern. Starting at any + /// of these states will result in only an anchored search for the + /// corresponding pattern. The vec is indexed by pattern ID. When the NFA + /// contains a single regex, then `start_pattern[0]` and `start_anchored` + /// are always equivalent. + start_pattern: Vec<StateID>, + /// A map from pattern ID to capture group index to name. (If no name + /// exists, then a None entry is present. Thus, all capturing groups are + /// present in this mapping.) + /// + /// The outer vec is indexed by pattern ID, while the inner vec is indexed + /// by capture index offset for the corresponding pattern. + /// + /// The first capture group for each pattern is always unnamed and is thus + /// always None. + captures: Vec<Vec<Option<Arc<str>>>>, + /// The combined memory used by each of the 'State's in 'states'. This + /// only includes heap usage by each state, and not the size of the state + /// itself. In other words, this tracks heap memory used that isn't + /// captured via `size_of::<State>() * states.len()`. + memory_states: usize, + /// Whether this NFA only matches UTF-8 and whether regex engines using + /// this NFA for searching should report empty matches that split a + /// codepoint. + utf8: bool, + /// Whether this NFA should be matched in reverse or not. + reverse: bool, + /// The matcher to use for look-around assertions. + look_matcher: LookMatcher, + /// A size limit to respect when building an NFA. If the total heap memory + /// of the intermediate NFA states exceeds (or would exceed) this amount, + /// then an error is returned. + size_limit: Option<usize>, +} + +impl Builder { + /// Create a new builder for hand-assembling NFAs. + pub fn new() -> Builder { + Builder::default() + } + + /// Clear this builder. + /// + /// Clearing removes all state associated with building an NFA, but does + /// not reset configuration (such as size limits and whether the NFA + /// should only match UTF-8). After clearing, the builder can be reused to + /// assemble an entirely new NFA. + pub fn clear(&mut self) { + self.pattern_id = None; + self.states.clear(); + self.start_pattern.clear(); + self.captures.clear(); + self.memory_states = 0; + } + + /// Assemble a [`NFA`] from the states added so far. + /// + /// After building an NFA, more states may be added and `build` may be + /// called again. To reuse a builder to produce an entirely new NFA from + /// scratch, call the [`clear`](Builder::clear) method first. + /// + /// `start_anchored` refers to the ID of the starting state that anchored + /// searches should use. That is, searches who matches are limited to the + /// starting position of the search. + /// + /// `start_unanchored` refers to the ID of the starting state that + /// unanchored searches should use. This permits searches to report matches + /// that start after the beginning of the search. In cases where unanchored + /// searches are not supported, the unanchored starting state ID must be + /// the same as the anchored starting state ID. + /// + /// # Errors + /// + /// This returns an error if there was a problem producing the final NFA. + /// In particular, this might include an error if the capturing groups + /// added to this builder violate any of the invariants documented on + /// [`GroupInfo`](crate::util::captures::GroupInfo). + /// + /// # Panics + /// + /// If `start_pattern` was called, then `finish_pattern` must be called + /// before `build`, otherwise this panics. + /// + /// This may panic for other invalid uses of a builder. For example, if + /// a "start capture" state was added without a corresponding "end capture" + /// state. + pub fn build( + &self, + start_anchored: StateID, + start_unanchored: StateID, + ) -> Result<NFA, BuildError> { + assert!(self.pattern_id.is_none(), "must call 'finish_pattern' first"); + debug!( + "intermediate NFA compilation via builder is complete, \ + intermediate NFA size: {} states, {} bytes on heap", + self.states.len(), + self.memory_usage(), + ); + + let mut nfa = nfa::Inner::default(); + nfa.set_utf8(self.utf8); + nfa.set_reverse(self.reverse); + nfa.set_look_matcher(self.look_matcher.clone()); + // A set of compiler internal state IDs that correspond to states + // that are exclusively epsilon transitions, i.e., goto instructions, + // combined with the state that they point to. This is used to + // record said states while transforming the compiler's internal NFA + // representation to the external form. + let mut empties = vec![]; + // A map used to re-map state IDs when translating this builder's + // internal NFA state representation to the final NFA representation. + let mut remap = vec![]; + remap.resize(self.states.len(), StateID::ZERO); + + nfa.set_starts(start_anchored, start_unanchored, &self.start_pattern); + nfa.set_captures(&self.captures).map_err(BuildError::captures)?; + // The idea here is to convert our intermediate states to their final + // form. The only real complexity here is the process of converting + // transitions, which are expressed in terms of state IDs. The new + // set of states will be smaller because of partial epsilon removal, + // so the state IDs will not be the same. + for (sid, state) in self.states.iter().with_state_ids() { + match *state { + State::Empty { next } => { + // Since we're removing empty states, we need to handle + // them later since we don't yet know which new state this + // empty state will be mapped to. + empties.push((sid, next)); + } + State::ByteRange { trans } => { + remap[sid] = nfa.add(nfa::State::ByteRange { trans }); + } + State::Sparse { ref transitions } => { + remap[sid] = match transitions.len() { + 0 => nfa.add(nfa::State::Fail), + 1 => nfa.add(nfa::State::ByteRange { + trans: transitions[0], + }), + _ => { + let transitions = + transitions.to_vec().into_boxed_slice(); + let sparse = SparseTransitions { transitions }; + nfa.add(nfa::State::Sparse(sparse)) + } + } + } + State::Look { look, next } => { + remap[sid] = nfa.add(nfa::State::Look { look, next }); + } + State::CaptureStart { pattern_id, group_index, next } => { + // We can't remove this empty state because of the side + // effect of capturing an offset for this capture slot. + let slot = nfa + .group_info() + .slot(pattern_id, group_index.as_usize()) + .expect("invalid capture index"); + let slot = + SmallIndex::new(slot).expect("a small enough slot"); + remap[sid] = nfa.add(nfa::State::Capture { + next, + pattern_id, + group_index, + slot, + }); + } + State::CaptureEnd { pattern_id, group_index, next } => { + // We can't remove this empty state because of the side + // effect of capturing an offset for this capture slot. + // Also, this always succeeds because we check that all + // slot indices are valid for all capture indices when they + // are initially added. + let slot = nfa + .group_info() + .slot(pattern_id, group_index.as_usize()) + .expect("invalid capture index") + .checked_add(1) + .unwrap(); + let slot = + SmallIndex::new(slot).expect("a small enough slot"); + remap[sid] = nfa.add(nfa::State::Capture { + next, + pattern_id, + group_index, + slot, + }); + } + State::Union { ref alternates } => { + if alternates.is_empty() { + remap[sid] = nfa.add(nfa::State::Fail); + } else if alternates.len() == 1 { + empties.push((sid, alternates[0])); + remap[sid] = alternates[0]; + } else if alternates.len() == 2 { + remap[sid] = nfa.add(nfa::State::BinaryUnion { + alt1: alternates[0], + alt2: alternates[1], + }); + } else { + let alternates = + alternates.to_vec().into_boxed_slice(); + remap[sid] = nfa.add(nfa::State::Union { alternates }); + } + } + State::UnionReverse { ref alternates } => { + if alternates.is_empty() { + remap[sid] = nfa.add(nfa::State::Fail); + } else if alternates.len() == 1 { + empties.push((sid, alternates[0])); + remap[sid] = alternates[0]; + } else if alternates.len() == 2 { + remap[sid] = nfa.add(nfa::State::BinaryUnion { + alt1: alternates[1], + alt2: alternates[0], + }); + } else { + let mut alternates = + alternates.to_vec().into_boxed_slice(); + alternates.reverse(); + remap[sid] = nfa.add(nfa::State::Union { alternates }); + } + } + State::Fail => { + remap[sid] = nfa.add(nfa::State::Fail); + } + State::Match { pattern_id } => { + remap[sid] = nfa.add(nfa::State::Match { pattern_id }); + } + } + } + // Some of the new states still point to empty state IDs, so we need to + // follow each of them and remap the empty state IDs to their non-empty + // state IDs. + // + // We also keep track of which states we've already mapped. This helps + // avoid quadratic behavior in a long chain of empty states. For + // example, in 'a{0}{50000}'. + let mut remapped = vec![false; self.states.len()]; + for &(empty_id, empty_next) in empties.iter() { + if remapped[empty_id] { + continue; + } + // empty states can point to other empty states, forming a chain. + // So we must follow the chain until the end, which must end at + // a non-empty state, and therefore, a state that is correctly + // remapped. We are guaranteed to terminate because our compiler + // never builds a loop among only empty states. + let mut new_next = empty_next; + while let Some(next) = self.states[new_next].goto() { + new_next = next; + } + remap[empty_id] = remap[new_next]; + remapped[empty_id] = true; + + // Now that we've remapped the main 'empty_id' above, we re-follow + // the chain from above and remap every empty state we found along + // the way to our ultimate non-empty target. We are careful to set + // 'remapped' to true for each such state. We thus will not need + // to re-compute this chain for any subsequent empty states in + // 'empties' that are part of this chain. + let mut next2 = empty_next; + while let Some(next) = self.states[next2].goto() { + remap[next2] = remap[new_next]; + remapped[next2] = true; + next2 = next; + } + } + // Finally remap all of the state IDs. + nfa.remap(&remap); + let final_nfa = nfa.into_nfa(); + debug!( + "NFA compilation via builder complete, \ + final NFA size: {} states, {} bytes on heap, \ + has empty? {:?}, utf8? {:?}", + final_nfa.states().len(), + final_nfa.memory_usage(), + final_nfa.has_empty(), + final_nfa.is_utf8(), + ); + Ok(final_nfa) + } + + /// Start the assembly of a pattern in this NFA. + /// + /// Upon success, this returns the identifier for the new pattern. + /// Identifiers start at `0` and are incremented by 1 for each new pattern. + /// + /// It is necessary to call this routine before adding capturing states. + /// Otherwise, any other NFA state may be added before starting a pattern. + /// + /// # Errors + /// + /// If the pattern identifier space is exhausted, then this returns an + /// error. + /// + /// # Panics + /// + /// If this is called while assembling another pattern (i.e., before + /// `finish_pattern` is called), then this panics. + pub fn start_pattern(&mut self) -> Result<PatternID, BuildError> { + assert!(self.pattern_id.is_none(), "must call 'finish_pattern' first"); + + let proposed = self.start_pattern.len(); + let pid = PatternID::new(proposed) + .map_err(|_| BuildError::too_many_patterns(proposed))?; + self.pattern_id = Some(pid); + // This gets filled in when 'finish_pattern' is called. + self.start_pattern.push(StateID::ZERO); + Ok(pid) + } + + /// Finish the assembly of a pattern in this NFA. + /// + /// Upon success, this returns the identifier for the new pattern. + /// Identifiers start at `0` and are incremented by 1 for each new + /// pattern. This is the same identifier returned by the corresponding + /// `start_pattern` call. + /// + /// Note that `start_pattern` and `finish_pattern` pairs cannot be + /// interleaved or nested. A correct `finish_pattern` call _always_ + /// corresponds to the most recently called `start_pattern` routine. + /// + /// # Errors + /// + /// This currently never returns an error, but this is subject to change. + /// + /// # Panics + /// + /// If this is called without a corresponding `start_pattern` call, then + /// this panics. + pub fn finish_pattern( + &mut self, + start_id: StateID, + ) -> Result<PatternID, BuildError> { + let pid = self.current_pattern_id(); + self.start_pattern[pid] = start_id; + self.pattern_id = None; + Ok(pid) + } + + /// Returns the pattern identifier of the current pattern. + /// + /// # Panics + /// + /// If this doesn't occur after a `start_pattern` call and before the + /// corresponding `finish_pattern` call, then this panics. + pub fn current_pattern_id(&self) -> PatternID { + self.pattern_id.expect("must call 'start_pattern' first") + } + + /// Returns the number of patterns added to this builder so far. + /// + /// This only includes patterns that have had `finish_pattern` called + /// for them. + pub fn pattern_len(&self) -> usize { + self.start_pattern.len() + } + + /// Add an "empty" NFA state. + /// + /// An "empty" NFA state is a state with a single unconditional epsilon + /// transition to another NFA state. Such empty states are removed before + /// building the final [`NFA`] (which has no such "empty" states), but they + /// can be quite useful in the construction process of an NFA. + /// + /// # Errors + /// + /// This returns an error if the state identifier space is exhausted, or if + /// the configured heap size limit has been exceeded. + pub fn add_empty(&mut self) -> Result<StateID, BuildError> { + self.add(State::Empty { next: StateID::ZERO }) + } + + /// Add a "union" NFA state. + /// + /// A "union" NFA state that contains zero or more unconditional epsilon + /// transitions to other NFA states. The order of these transitions + /// reflects a priority order where earlier transitions are preferred over + /// later transitions. + /// + /// Callers may provide an empty set of alternates to this method call, and + /// then later add transitions via `patch`. At final build time, a "union" + /// state with no alternates is converted to a "fail" state, and a "union" + /// state with exactly one alternate is treated as if it were an "empty" + /// state. + /// + /// # Errors + /// + /// This returns an error if the state identifier space is exhausted, or if + /// the configured heap size limit has been exceeded. + pub fn add_union( + &mut self, + alternates: Vec<StateID>, + ) -> Result<StateID, BuildError> { + self.add(State::Union { alternates }) + } + + /// Add a "reverse union" NFA state. + /// + /// A "reverse union" NFA state contains zero or more unconditional epsilon + /// transitions to other NFA states. The order of these transitions + /// reflects a priority order where later transitions are preferred + /// over earlier transitions. This is an inverted priority order when + /// compared to `add_union`. This is useful, for example, for implementing + /// non-greedy repetition operators. + /// + /// Callers may provide an empty set of alternates to this method call, and + /// then later add transitions via `patch`. At final build time, a "reverse + /// union" state with no alternates is converted to a "fail" state, and a + /// "reverse union" state with exactly one alternate is treated as if it + /// were an "empty" state. + /// + /// # Errors + /// + /// This returns an error if the state identifier space is exhausted, or if + /// the configured heap size limit has been exceeded. + pub fn add_union_reverse( + &mut self, + alternates: Vec<StateID>, + ) -> Result<StateID, BuildError> { + self.add(State::UnionReverse { alternates }) + } + + /// Add a "range" NFA state. + /// + /// A "range" NFA state is a state with one outgoing transition to another + /// state, where that transition may only be followed if the current input + /// byte falls between a range of bytes given. + /// + /// # Errors + /// + /// This returns an error if the state identifier space is exhausted, or if + /// the configured heap size limit has been exceeded. + pub fn add_range( + &mut self, + trans: Transition, + ) -> Result<StateID, BuildError> { + self.add(State::ByteRange { trans }) + } + + /// Add a "sparse" NFA state. + /// + /// A "sparse" NFA state contains zero or more outgoing transitions, where + /// the transition to be followed (if any) is chosen based on whether the + /// current input byte falls in the range of one such transition. The + /// transitions given *must* be non-overlapping and in ascending order. (A + /// "sparse" state with no transitions is equivalent to a "fail" state.) + /// + /// A "sparse" state is like adding a "union" state and pointing it at a + /// bunch of "range" states, except that the different alternates have + /// equal priority. + /// + /// Note that a "sparse" state is the only state that cannot be patched. + /// This is because a "sparse" state has many transitions, each of which + /// may point to a different NFA state. Moreover, adding more such + /// transitions requires more than just an NFA state ID to point to. It + /// also requires a byte range. The `patch` routine does not support the + /// additional information required. Therefore, callers must ensure that + /// all outgoing transitions for this state are included when `add_sparse` + /// is called. There is no way to add more later. + /// + /// # Errors + /// + /// This returns an error if the state identifier space is exhausted, or if + /// the configured heap size limit has been exceeded. + /// + /// # Panics + /// + /// This routine _may_ panic if the transitions given overlap or are not + /// in ascending order. + pub fn add_sparse( + &mut self, + transitions: Vec<Transition>, + ) -> Result<StateID, BuildError> { + self.add(State::Sparse { transitions }) + } + + /// Add a "look" NFA state. + /// + /// A "look" NFA state corresponds to a state with exactly one + /// *conditional* epsilon transition to another NFA state. Namely, it + /// represents one of a small set of simplistic look-around operators. + /// + /// Callers may provide a "dummy" state ID (typically [`StateID::ZERO`]), + /// and then change it later with [`patch`](Builder::patch). + /// + /// # Errors + /// + /// This returns an error if the state identifier space is exhausted, or if + /// the configured heap size limit has been exceeded. + pub fn add_look( + &mut self, + next: StateID, + look: Look, + ) -> Result<StateID, BuildError> { + self.add(State::Look { look, next }) + } + + /// Add a "start capture" NFA state. + /// + /// A "start capture" NFA state corresponds to a state with exactly one + /// outgoing unconditional epsilon transition to another state. Unlike + /// "empty" states, a "start capture" state also carries with it an + /// instruction for saving the current position of input to a particular + /// location in memory. NFA simulations, like the Pike VM, may use this + /// information to report the match locations of capturing groups in a + /// regex pattern. + /// + /// If the corresponding capturing group has a name, then callers should + /// include it here. + /// + /// Callers may provide a "dummy" state ID (typically [`StateID::ZERO`]), + /// and then change it later with [`patch`](Builder::patch). + /// + /// Note that unlike `start_pattern`/`finish_pattern`, capturing start and + /// end states may be interleaved. Indeed, it is typical for many "start + /// capture" NFA states to appear before the first "end capture" state. + /// + /// # Errors + /// + /// This returns an error if the state identifier space is exhausted, or if + /// the configured heap size limit has been exceeded or if the given + /// capture index overflows `usize`. + /// + /// While the above are the only conditions in which this routine can + /// currently return an error, it is possible to call this method with an + /// inputs that results in the final `build()` step failing to produce an + /// NFA. For example, if one adds two distinct capturing groups with the + /// same name, then that will result in `build()` failing with an error. + /// + /// See the [`GroupInfo`](crate::util::captures::GroupInfo) type for + /// more information on what qualifies as valid capturing groups. + /// + /// # Example + /// + /// This example shows that an error occurs when one tries to add multiple + /// capturing groups with the same name to the same pattern. + /// + /// ``` + /// use regex_automata::{ + /// nfa::thompson::Builder, + /// util::primitives::StateID, + /// }; + /// + /// let name = Some(std::sync::Arc::from("foo")); + /// let mut builder = Builder::new(); + /// builder.start_pattern()?; + /// // 0th capture group should always be unnamed. + /// let start = builder.add_capture_start(StateID::ZERO, 0, None)?; + /// // OK + /// builder.add_capture_start(StateID::ZERO, 1, name.clone())?; + /// // This is not OK, but 'add_capture_start' still succeeds. We don't + /// // get an error until we call 'build' below. Without this call, the + /// // call to 'build' below would succeed. + /// builder.add_capture_start(StateID::ZERO, 2, name.clone())?; + /// // Finish our pattern so we can try to build the NFA. + /// builder.finish_pattern(start)?; + /// let result = builder.build(start, start); + /// assert!(result.is_err()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// However, adding multiple capturing groups with the same name to + /// distinct patterns is okay: + /// + /// ``` + /// use std::sync::Arc; + /// + /// use regex_automata::{ + /// nfa::thompson::{pikevm::PikeVM, Builder, Transition}, + /// util::{ + /// captures::Captures, + /// primitives::{PatternID, StateID}, + /// }, + /// Span, + /// }; + /// + /// // Hand-compile the patterns '(?P<foo>[a-z])' and '(?P<foo>[A-Z])'. + /// let mut builder = Builder::new(); + /// // We compile them to support an unanchored search, which requires + /// // adding an implicit '(?s-u:.)*?' prefix before adding either pattern. + /// let unanchored_prefix = builder.add_union_reverse(vec![])?; + /// let any = builder.add_range(Transition { + /// start: b'\x00', end: b'\xFF', next: StateID::ZERO, + /// })?; + /// builder.patch(unanchored_prefix, any)?; + /// builder.patch(any, unanchored_prefix)?; + /// + /// // Compile an alternation that permits matching multiple patterns. + /// let alt = builder.add_union(vec![])?; + /// builder.patch(unanchored_prefix, alt)?; + /// + /// // Compile '(?P<foo>[a-z]+)'. + /// builder.start_pattern()?; + /// let start0 = builder.add_capture_start(StateID::ZERO, 0, None)?; + /// // N.B. 0th capture group must always be unnamed. + /// let foo_start0 = builder.add_capture_start( + /// StateID::ZERO, 1, Some(Arc::from("foo")), + /// )?; + /// let lowercase = builder.add_range(Transition { + /// start: b'a', end: b'z', next: StateID::ZERO, + /// })?; + /// let foo_end0 = builder.add_capture_end(StateID::ZERO, 1)?; + /// let end0 = builder.add_capture_end(StateID::ZERO, 0)?; + /// let match0 = builder.add_match()?; + /// builder.patch(start0, foo_start0)?; + /// builder.patch(foo_start0, lowercase)?; + /// builder.patch(lowercase, foo_end0)?; + /// builder.patch(foo_end0, end0)?; + /// builder.patch(end0, match0)?; + /// builder.finish_pattern(start0)?; + /// + /// // Compile '(?P<foo>[A-Z]+)'. + /// builder.start_pattern()?; + /// let start1 = builder.add_capture_start(StateID::ZERO, 0, None)?; + /// // N.B. 0th capture group must always be unnamed. + /// let foo_start1 = builder.add_capture_start( + /// StateID::ZERO, 1, Some(Arc::from("foo")), + /// )?; + /// let uppercase = builder.add_range(Transition { + /// start: b'A', end: b'Z', next: StateID::ZERO, + /// })?; + /// let foo_end1 = builder.add_capture_end(StateID::ZERO, 1)?; + /// let end1 = builder.add_capture_end(StateID::ZERO, 0)?; + /// let match1 = builder.add_match()?; + /// builder.patch(start1, foo_start1)?; + /// builder.patch(foo_start1, uppercase)?; + /// builder.patch(uppercase, foo_end1)?; + /// builder.patch(foo_end1, end1)?; + /// builder.patch(end1, match1)?; + /// builder.finish_pattern(start1)?; + /// + /// // Now add the patterns to our alternation that we started above. + /// builder.patch(alt, start0)?; + /// builder.patch(alt, start1)?; + /// + /// // Finally build the NFA. The first argument is the anchored starting + /// // state (the pattern alternation) where as the second is the + /// // unanchored starting state (the unanchored prefix). + /// let nfa = builder.build(alt, unanchored_prefix)?; + /// + /// // Now build a Pike VM from our NFA and access the 'foo' capture + /// // group regardless of which pattern matched, since it is defined + /// // for both patterns. + /// let vm = PikeVM::new_from_nfa(nfa)?; + /// let mut cache = vm.create_cache(); + /// let caps: Vec<Captures> = + /// vm.captures_iter(&mut cache, "0123aAaAA").collect(); + /// assert_eq!(5, caps.len()); + /// + /// assert_eq!(Some(PatternID::must(0)), caps[0].pattern()); + /// assert_eq!(Some(Span::from(4..5)), caps[0].get_group_by_name("foo")); + /// + /// assert_eq!(Some(PatternID::must(1)), caps[1].pattern()); + /// assert_eq!(Some(Span::from(5..6)), caps[1].get_group_by_name("foo")); + /// + /// assert_eq!(Some(PatternID::must(0)), caps[2].pattern()); + /// assert_eq!(Some(Span::from(6..7)), caps[2].get_group_by_name("foo")); + /// + /// assert_eq!(Some(PatternID::must(1)), caps[3].pattern()); + /// assert_eq!(Some(Span::from(7..8)), caps[3].get_group_by_name("foo")); + /// + /// assert_eq!(Some(PatternID::must(1)), caps[4].pattern()); + /// assert_eq!(Some(Span::from(8..9)), caps[4].get_group_by_name("foo")); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn add_capture_start( + &mut self, + next: StateID, + group_index: u32, + name: Option<Arc<str>>, + ) -> Result<StateID, BuildError> { + let pid = self.current_pattern_id(); + let group_index = match SmallIndex::try_from(group_index) { + Err(_) => { + return Err(BuildError::invalid_capture_index(group_index)) + } + Ok(group_index) => group_index, + }; + // Make sure we have space to insert our (pid,index)|-->name mapping. + if pid.as_usize() >= self.captures.len() { + for _ in 0..=(pid.as_usize() - self.captures.len()) { + self.captures.push(vec![]); + } + } + // In the case where 'group_index < self.captures[pid].len()', it means + // that we are adding a duplicate capture group. This is somewhat + // weird, but permissible because the capture group itself can be + // repeated in the syntax. For example, '([a-z]){4}' will produce 4 + // capture groups. In practice, only the last will be set at search + // time when a match occurs. For duplicates, we don't need to push + // anything other than a CaptureStart NFA state. + if group_index.as_usize() >= self.captures[pid].len() { + // For discontiguous indices, push placeholders for earlier capture + // groups that weren't explicitly added. + for _ in 0..(group_index.as_usize() - self.captures[pid].len()) { + self.captures[pid].push(None); + } + self.captures[pid].push(name); + } + self.add(State::CaptureStart { pattern_id: pid, group_index, next }) + } + + /// Add a "end capture" NFA state. + /// + /// A "end capture" NFA state corresponds to a state with exactly one + /// outgoing unconditional epsilon transition to another state. Unlike + /// "empty" states, a "end capture" state also carries with it an + /// instruction for saving the current position of input to a particular + /// location in memory. NFA simulations, like the Pike VM, may use this + /// information to report the match locations of capturing groups in a + /// + /// Callers may provide a "dummy" state ID (typically [`StateID::ZERO`]), + /// and then change it later with [`patch`](Builder::patch). + /// + /// Note that unlike `start_pattern`/`finish_pattern`, capturing start and + /// end states may be interleaved. Indeed, it is typical for many "start + /// capture" NFA states to appear before the first "end capture" state. + /// + /// # Errors + /// + /// This returns an error if the state identifier space is exhausted, or if + /// the configured heap size limit has been exceeded or if the given + /// capture index overflows `usize`. + /// + /// While the above are the only conditions in which this routine can + /// currently return an error, it is possible to call this method with an + /// inputs that results in the final `build()` step failing to produce an + /// NFA. For example, if one adds two distinct capturing groups with the + /// same name, then that will result in `build()` failing with an error. + /// + /// See the [`GroupInfo`](crate::util::captures::GroupInfo) type for + /// more information on what qualifies as valid capturing groups. + pub fn add_capture_end( + &mut self, + next: StateID, + group_index: u32, + ) -> Result<StateID, BuildError> { + let pid = self.current_pattern_id(); + let group_index = match SmallIndex::try_from(group_index) { + Err(_) => { + return Err(BuildError::invalid_capture_index(group_index)) + } + Ok(group_index) => group_index, + }; + self.add(State::CaptureEnd { pattern_id: pid, group_index, next }) + } + + /// Adds a "fail" NFA state. + /// + /// A "fail" state is simply a state that has no outgoing transitions. It + /// acts as a way to cause a search to stop without reporting a match. + /// For example, one way to represent an NFA with zero patterns is with a + /// single "fail" state. + /// + /// # Errors + /// + /// This returns an error if the state identifier space is exhausted, or if + /// the configured heap size limit has been exceeded. + pub fn add_fail(&mut self) -> Result<StateID, BuildError> { + self.add(State::Fail) + } + + /// Adds a "match" NFA state. + /// + /// A "match" state has no outgoing transitions (just like a "fail" + /// state), but it has special significance in that if a search enters + /// this state, then a match has been found. The match state that is added + /// automatically has the current pattern ID associated with it. This is + /// used to report the matching pattern ID at search time. + /// + /// # Errors + /// + /// This returns an error if the state identifier space is exhausted, or if + /// the configured heap size limit has been exceeded. + /// + /// # Panics + /// + /// This must be called after a `start_pattern` call but before the + /// corresponding `finish_pattern` call. Otherwise, it panics. + pub fn add_match(&mut self) -> Result<StateID, BuildError> { + let pattern_id = self.current_pattern_id(); + let sid = self.add(State::Match { pattern_id })?; + Ok(sid) + } + + /// The common implementation of "add a state." It handles the common + /// error cases of state ID exhausting (by owning state ID allocation) and + /// whether the size limit has been exceeded. + fn add(&mut self, state: State) -> Result<StateID, BuildError> { + let id = StateID::new(self.states.len()) + .map_err(|_| BuildError::too_many_states(self.states.len()))?; + self.memory_states += state.memory_usage(); + self.states.push(state); + self.check_size_limit()?; + Ok(id) + } + + /// Add a transition from one state to another. + /// + /// This routine is called "patch" since it is very common to add the + /// states you want, typically with "dummy" state ID transitions, and then + /// "patch" in the real state IDs later. This is because you don't always + /// know all of the necessary state IDs to add because they might not + /// exist yet. + /// + /// # Errors + /// + /// This may error if patching leads to an increase in heap usage beyond + /// the configured size limit. Heap usage only grows when patching adds a + /// new transition (as in the case of a "union" state). + /// + /// # Panics + /// + /// This panics if `from` corresponds to a "sparse" state. When "sparse" + /// states are added, there is no way to patch them after-the-fact. (If you + /// have a use case where this would be helpful, please file an issue. It + /// will likely require a new API.) + pub fn patch( + &mut self, + from: StateID, + to: StateID, + ) -> Result<(), BuildError> { + let old_memory_states = self.memory_states; + match self.states[from] { + State::Empty { ref mut next } => { + *next = to; + } + State::ByteRange { ref mut trans } => { + trans.next = to; + } + State::Sparse { .. } => { + panic!("cannot patch from a sparse NFA state") + } + State::Look { ref mut next, .. } => { + *next = to; + } + State::Union { ref mut alternates } => { + alternates.push(to); + self.memory_states += mem::size_of::<StateID>(); + } + State::UnionReverse { ref mut alternates } => { + alternates.push(to); + self.memory_states += mem::size_of::<StateID>(); + } + State::CaptureStart { ref mut next, .. } => { + *next = to; + } + State::CaptureEnd { ref mut next, .. } => { + *next = to; + } + State::Fail => {} + State::Match { .. } => {} + } + if old_memory_states != self.memory_states { + self.check_size_limit()?; + } + Ok(()) + } + + /// Set whether the NFA produced by this builder should only match UTF-8. + /// + /// This should be set when both of the following are true: + /// + /// 1. The caller guarantees that the NFA created by this build will only + /// report non-empty matches with spans that are valid UTF-8. + /// 2. The caller desires regex engines using this NFA to avoid reporting + /// empty matches with a span that splits a valid UTF-8 encoded codepoint. + /// + /// Property (1) is not checked. Instead, this requires the caller to + /// promise that it is true. Property (2) corresponds to the behavior of + /// regex engines using the NFA created by this builder. Namely, there + /// is no way in the NFA's graph itself to say that empty matches found + /// by, for example, the regex `a*` will fall on valid UTF-8 boundaries. + /// Instead, this option is used to communicate the UTF-8 semantic to regex + /// engines that will typically implement it as a post-processing step by + /// filtering out empty matches that don't fall on UTF-8 boundaries. + /// + /// If you're building an NFA from an HIR (and not using a + /// [`thompson::Compiler`](crate::nfa::thompson::Compiler)), then you can + /// use the [`syntax::Config::utf8`](crate::util::syntax::Config::utf8) + /// option to guarantee that if the HIR detects a non-empty match, then it + /// is guaranteed to be valid UTF-8. + /// + /// Note that property (2) does *not* specify the behavior of executing + /// a search on a haystack that is not valid UTF-8. Therefore, if you're + /// *not* running this NFA on strings that are guaranteed to be valid + /// UTF-8, you almost certainly do not want to enable this option. + /// Similarly, if you are running the NFA on strings that *are* guaranteed + /// to be valid UTF-8, then you almost certainly want to enable this option + /// unless you can guarantee that your NFA will never produce a zero-width + /// match. + /// + /// It is disabled by default. + pub fn set_utf8(&mut self, yes: bool) { + self.utf8 = yes; + } + + /// Returns whether UTF-8 mode is enabled for this builder. + /// + /// See [`Builder::set_utf8`] for more details about what "UTF-8 mode" is. + pub fn get_utf8(&self) -> bool { + self.utf8 + } + + /// Sets whether the NFA produced by this builder should be matched in + /// reverse or not. Generally speaking, when enabled, the NFA produced + /// should be matched by moving backwards through a haystack, from a higher + /// memory address to a lower memory address. + /// + /// See also [`NFA::is_reverse`] for more details. + /// + /// This is disabled by default, which means NFAs are by default matched + /// in the forward direction. + pub fn set_reverse(&mut self, yes: bool) { + self.reverse = yes; + } + + /// Returns whether reverse mode is enabled for this builder. + /// + /// See [`Builder::set_reverse`] for more details about what "reverse mode" + /// is. + pub fn get_reverse(&self) -> bool { + self.reverse + } + + /// Sets the look-around matcher that should be used for the resulting NFA. + /// + /// A look-around matcher can be used to configure how look-around + /// assertions are matched. For example, a matcher might carry + /// configuration that changes the line terminator used for `(?m:^)` and + /// `(?m:$)` assertions. + pub fn set_look_matcher(&mut self, m: LookMatcher) { + self.look_matcher = m; + } + + /// Returns the look-around matcher used for this builder. + /// + /// If a matcher was not explicitly set, then `LookMatcher::default()` is + /// returned. + pub fn get_look_matcher(&self) -> &LookMatcher { + &self.look_matcher + } + + /// Set the size limit on this builder. + /// + /// Setting the size limit will also check whether the NFA built so far + /// fits within the given size limit. If it doesn't, then an error is + /// returned. + /// + /// By default, there is no configured size limit. + pub fn set_size_limit( + &mut self, + limit: Option<usize>, + ) -> Result<(), BuildError> { + self.size_limit = limit; + self.check_size_limit() + } + + /// Return the currently configured size limit. + /// + /// By default, this returns `None`, which corresponds to no configured + /// size limit. + pub fn get_size_limit(&self) -> Option<usize> { + self.size_limit + } + + /// Returns the heap memory usage, in bytes, used by the NFA states added + /// so far. + /// + /// Note that this is an approximation of how big the final NFA will be. + /// In practice, the final NFA will likely be a bit smaller because of + /// its simpler state representation. (For example, using things like + /// `Box<[StateID]>` instead of `Vec<StateID>`.) + pub fn memory_usage(&self) -> usize { + self.states.len() * mem::size_of::<State>() + self.memory_states + } + + fn check_size_limit(&self) -> Result<(), BuildError> { + if let Some(limit) = self.size_limit { + if self.memory_usage() > limit { + return Err(BuildError::exceeded_size_limit(limit)); + } + } + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + // This asserts that a builder state doesn't have its size changed. It is + // *really* easy to accidentally increase the size, and thus potentially + // dramatically increase the memory usage of NFA builder. + // + // This assert doesn't mean we absolutely cannot increase the size of a + // builder state. We can. It's just here to make sure we do it knowingly + // and intentionally. + // + // A builder state is unfortunately a little bigger than an NFA state, + // since we really want to support adding things to a pre-existing state. + // i.e., We use Vec<thing> instead of Box<[thing]>. So we end up using an + // extra 8 bytes per state. Sad, but at least it gets freed once the NFA + // is built. + #[test] + fn state_has_small_size() { + #[cfg(target_pointer_width = "64")] + assert_eq!(32, core::mem::size_of::<State>()); + #[cfg(target_pointer_width = "32")] + assert_eq!(16, core::mem::size_of::<State>()); + } +} diff --git a/third_party/rust/regex-automata/src/nfa/thompson/compiler.rs b/third_party/rust/regex-automata/src/nfa/thompson/compiler.rs new file mode 100644 index 0000000000..065e9ef270 --- /dev/null +++ b/third_party/rust/regex-automata/src/nfa/thompson/compiler.rs @@ -0,0 +1,2257 @@ +use core::{borrow::Borrow, cell::RefCell}; + +use alloc::{sync::Arc, vec, vec::Vec}; + +use regex_syntax::{ + hir::{self, Hir}, + utf8::{Utf8Range, Utf8Sequences}, + ParserBuilder, +}; + +use crate::{ + nfa::thompson::{ + builder::Builder, + error::BuildError, + literal_trie::LiteralTrie, + map::{Utf8BoundedMap, Utf8SuffixKey, Utf8SuffixMap}, + nfa::{Transition, NFA}, + range_trie::RangeTrie, + }, + util::{ + look::{Look, LookMatcher}, + primitives::{PatternID, StateID}, + }, +}; + +/// The configuration used for a Thompson NFA compiler. +#[derive(Clone, Debug, Default)] +pub struct Config { + utf8: Option<bool>, + reverse: Option<bool>, + nfa_size_limit: Option<Option<usize>>, + shrink: Option<bool>, + which_captures: Option<WhichCaptures>, + look_matcher: Option<LookMatcher>, + #[cfg(test)] + unanchored_prefix: Option<bool>, +} + +impl Config { + /// Return a new default Thompson NFA compiler configuration. + pub fn new() -> Config { + Config::default() + } + + /// Whether to enable UTF-8 mode during search or not. + /// + /// A regex engine is said to be in UTF-8 mode when it guarantees that + /// all matches returned by it have spans consisting of only valid UTF-8. + /// That is, it is impossible for a match span to be returned that + /// contains any invalid UTF-8. + /// + /// UTF-8 mode generally consists of two things: + /// + /// 1. Whether the NFA's states are constructed such that all paths to a + /// match state that consume at least one byte always correspond to valid + /// UTF-8. + /// 2. Whether all paths to a match state that do _not_ consume any bytes + /// should always correspond to valid UTF-8 boundaries. + /// + /// (1) is a guarantee made by whoever constructs the NFA. + /// If you're parsing a regex from its concrete syntax, then + /// [`syntax::Config::utf8`](crate::util::syntax::Config::utf8) can make + /// this guarantee for you. It does it by returning an error if the regex + /// pattern could every report a non-empty match span that contains invalid + /// UTF-8. So long as `syntax::Config::utf8` mode is enabled and your regex + /// successfully parses, then you're guaranteed that the corresponding NFA + /// will only ever report non-empty match spans containing valid UTF-8. + /// + /// (2) is a trickier guarantee because it cannot be enforced by the NFA + /// state graph itself. Consider, for example, the regex `a*`. It matches + /// the empty strings in `☃` at positions `0`, `1`, `2` and `3`, where + /// positions `1` and `2` occur within the UTF-8 encoding of a codepoint, + /// and thus correspond to invalid UTF-8 boundaries. Therefore, this + /// guarantee must be made at a higher level than the NFA state graph + /// itself. This crate deals with this case in each regex engine. Namely, + /// when a zero-width match that splits a codepoint is found and UTF-8 + /// mode enabled, then it is ignored and the engine moves on looking for + /// the next match. + /// + /// Thus, UTF-8 mode is both a promise that the NFA built only reports + /// non-empty matches that are valid UTF-8, and an *instruction* to regex + /// engines that empty matches that split codepoints should be banned. + /// + /// Because UTF-8 mode is fundamentally about avoiding invalid UTF-8 spans, + /// it only makes sense to enable this option when you *know* your haystack + /// is valid UTF-8. (For example, a `&str`.) Enabling UTF-8 mode and + /// searching a haystack that contains invalid UTF-8 leads to **unspecified + /// behavior**. + /// + /// Therefore, it may make sense to enable `syntax::Config::utf8` while + /// simultaneously *disabling* this option. That would ensure all non-empty + /// match spans are valid UTF-8, but that empty match spans may still split + /// a codepoint or match at other places that aren't valid UTF-8. + /// + /// In general, this mode is only relevant if your regex can match the + /// empty string. Most regexes don't. + /// + /// This is enabled by default. + /// + /// # Example + /// + /// This example shows how UTF-8 mode can impact the match spans that may + /// be reported in certain cases. + /// + /// ``` + /// use regex_automata::{ + /// nfa::thompson::{self, pikevm::PikeVM}, + /// Match, Input, + /// }; + /// + /// let re = PikeVM::new("")?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// + /// // UTF-8 mode is enabled by default. + /// let mut input = Input::new("☃"); + /// re.search(&mut cache, &input, &mut caps); + /// assert_eq!(Some(Match::must(0, 0..0)), caps.get_match()); + /// + /// // Even though an empty regex matches at 1..1, our next match is + /// // 3..3 because 1..1 and 2..2 split the snowman codepoint (which is + /// // three bytes long). + /// input.set_start(1); + /// re.search(&mut cache, &input, &mut caps); + /// assert_eq!(Some(Match::must(0, 3..3)), caps.get_match()); + /// + /// // But if we disable UTF-8, then we'll get matches at 1..1 and 2..2: + /// let re = PikeVM::builder() + /// .thompson(thompson::Config::new().utf8(false)) + /// .build("")?; + /// re.search(&mut cache, &input, &mut caps); + /// assert_eq!(Some(Match::must(0, 1..1)), caps.get_match()); + /// + /// input.set_start(2); + /// re.search(&mut cache, &input, &mut caps); + /// assert_eq!(Some(Match::must(0, 2..2)), caps.get_match()); + /// + /// input.set_start(3); + /// re.search(&mut cache, &input, &mut caps); + /// assert_eq!(Some(Match::must(0, 3..3)), caps.get_match()); + /// + /// input.set_start(4); + /// re.search(&mut cache, &input, &mut caps); + /// assert_eq!(None, caps.get_match()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn utf8(mut self, yes: bool) -> Config { + self.utf8 = Some(yes); + self + } + + /// Reverse the NFA. + /// + /// A NFA reversal is performed by reversing all of the concatenated + /// sub-expressions in the original pattern, recursively. (Look around + /// operators are also inverted.) The resulting NFA can be used to match + /// the pattern starting from the end of a string instead of the beginning + /// of a string. + /// + /// Reversing the NFA is useful for building a reverse DFA, which is most + /// useful for finding the start of a match after its ending position has + /// been found. NFA execution engines typically do not work on reverse + /// NFAs. For example, currently, the Pike VM reports the starting location + /// of matches without a reverse NFA. + /// + /// Currently, enabling this setting requires disabling the + /// [`captures`](Config::captures) setting. If both are enabled, then the + /// compiler will return an error. It is expected that this limitation will + /// be lifted in the future. + /// + /// This is disabled by default. + /// + /// # Example + /// + /// This example shows how to build a DFA from a reverse NFA, and then use + /// the DFA to search backwards. + /// + /// ``` + /// use regex_automata::{ + /// dfa::{self, Automaton}, + /// nfa::thompson::{NFA, WhichCaptures}, + /// HalfMatch, Input, + /// }; + /// + /// let dfa = dfa::dense::Builder::new() + /// .thompson(NFA::config() + /// .which_captures(WhichCaptures::None) + /// .reverse(true) + /// ) + /// .build("baz[0-9]+")?; + /// let expected = Some(HalfMatch::must(0, 3)); + /// assert_eq!( + /// expected, + /// dfa.try_search_rev(&Input::new("foobaz12345bar"))?, + /// ); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn reverse(mut self, yes: bool) -> Config { + self.reverse = Some(yes); + self + } + + /// Sets an approximate size limit on the total heap used by the NFA being + /// compiled. + /// + /// This permits imposing constraints on the size of a compiled NFA. This + /// may be useful in contexts where the regex pattern is untrusted and one + /// wants to avoid using too much memory. + /// + /// This size limit does not apply to auxiliary heap used during + /// compilation that is not part of the built NFA. + /// + /// Note that this size limit is applied during compilation in order for + /// the limit to prevent too much heap from being used. However, the + /// implementation may use an intermediate NFA representation that is + /// otherwise slightly bigger than the final public form. Since the size + /// limit may be applied to an intermediate representation, there is not + /// necessarily a precise correspondence between the configured size limit + /// and the heap usage of the final NFA. + /// + /// There is no size limit by default. + /// + /// # Example + /// + /// This example demonstrates how Unicode mode can greatly increase the + /// size of the NFA. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::nfa::thompson::NFA; + /// + /// // 300KB isn't enough! + /// NFA::compiler() + /// .configure(NFA::config().nfa_size_limit(Some(300_000))) + /// .build(r"\w{20}") + /// .unwrap_err(); + /// + /// // ... but 400KB probably is. + /// let nfa = NFA::compiler() + /// .configure(NFA::config().nfa_size_limit(Some(400_000))) + /// .build(r"\w{20}")?; + /// + /// assert_eq!(nfa.pattern_len(), 1); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn nfa_size_limit(mut self, bytes: Option<usize>) -> Config { + self.nfa_size_limit = Some(bytes); + self + } + + /// Apply best effort heuristics to shrink the NFA at the expense of more + /// time/memory. + /// + /// Generally speaking, if one is using an NFA to compile a DFA, then the + /// extra time used to shrink the NFA will be more than made up for during + /// DFA construction (potentially by a lot). In other words, enabling this + /// can substantially decrease the overall amount of time it takes to build + /// a DFA. + /// + /// A reason to keep this disabled is if you want to compile an NFA and + /// start using it as quickly as possible without needing to build a DFA, + /// and you don't mind using a bit of extra memory for the NFA. e.g., for + /// an NFA simulation or for a lazy DFA. + /// + /// NFA shrinking is currently most useful when compiling a reverse + /// NFA with large Unicode character classes. In particular, it trades + /// additional CPU time during NFA compilation in favor of generating fewer + /// NFA states. + /// + /// This is disabled by default because it can increase compile times + /// quite a bit if you aren't building a full DFA. + /// + /// # Example + /// + /// This example shows that NFA shrinking can lead to substantial space + /// savings in some cases. Notice that, as noted above, we build a reverse + /// DFA and use a pattern with a large Unicode character class. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::nfa::thompson::{NFA, WhichCaptures}; + /// + /// // Currently we have to disable captures when enabling reverse NFA. + /// let config = NFA::config() + /// .which_captures(WhichCaptures::None) + /// .reverse(true); + /// let not_shrunk = NFA::compiler() + /// .configure(config.clone().shrink(false)) + /// .build(r"\w")?; + /// let shrunk = NFA::compiler() + /// .configure(config.clone().shrink(true)) + /// .build(r"\w")?; + /// + /// // While a specific shrink factor is not guaranteed, the savings can be + /// // considerable in some cases. + /// assert!(shrunk.states().len() * 2 < not_shrunk.states().len()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn shrink(mut self, yes: bool) -> Config { + self.shrink = Some(yes); + self + } + + /// Whether to include 'Capture' states in the NFA. + /// + /// Currently, enabling this setting requires disabling the + /// [`reverse`](Config::reverse) setting. If both are enabled, then the + /// compiler will return an error. It is expected that this limitation will + /// be lifted in the future. + /// + /// This is enabled by default. + /// + /// # Example + /// + /// This example demonstrates that some regex engines, like the Pike VM, + /// require capturing states to be present in the NFA to report match + /// offsets. + /// + /// (Note that since this method is deprecated, the example below uses + /// [`Config::which_captures`] to disable capture states.) + /// + /// ``` + /// use regex_automata::nfa::thompson::{ + /// pikevm::PikeVM, + /// NFA, + /// WhichCaptures, + /// }; + /// + /// let re = PikeVM::builder() + /// .thompson(NFA::config().which_captures(WhichCaptures::None)) + /// .build(r"[a-z]+")?; + /// let mut cache = re.create_cache(); + /// + /// assert!(re.is_match(&mut cache, "abc")); + /// assert_eq!(None, re.find(&mut cache, "abc")); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[deprecated(since = "0.3.5", note = "use which_captures instead")] + pub fn captures(self, yes: bool) -> Config { + self.which_captures(if yes { + WhichCaptures::All + } else { + WhichCaptures::None + }) + } + + /// Configures what kinds of capture groups are compiled into + /// [`State::Capture`](crate::nfa::thompson::State::Capture) states in a + /// Thompson NFA. + /// + /// Currently, using any option except for [`WhichCaptures::None`] requires + /// disabling the [`reverse`](Config::reverse) setting. If both are + /// enabled, then the compiler will return an error. It is expected that + /// this limitation will be lifted in the future. + /// + /// This is set to [`WhichCaptures::All`] by default. Callers may wish to + /// use [`WhichCaptures::Implicit`] in cases where one wants avoid the + /// overhead of capture states for explicit groups. Usually this occurs + /// when one wants to use the `PikeVM` only for determining the overall + /// match. Otherwise, the `PikeVM` could use much more memory than is + /// necessary. + /// + /// # Example + /// + /// This example demonstrates that some regex engines, like the Pike VM, + /// require capturing states to be present in the NFA to report match + /// offsets. + /// + /// ``` + /// use regex_automata::nfa::thompson::{ + /// pikevm::PikeVM, + /// NFA, + /// WhichCaptures, + /// }; + /// + /// let re = PikeVM::builder() + /// .thompson(NFA::config().which_captures(WhichCaptures::None)) + /// .build(r"[a-z]+")?; + /// let mut cache = re.create_cache(); + /// + /// assert!(re.is_match(&mut cache, "abc")); + /// assert_eq!(None, re.find(&mut cache, "abc")); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// The same applies to the bounded backtracker: + /// + /// ``` + /// use regex_automata::nfa::thompson::{ + /// backtrack::BoundedBacktracker, + /// NFA, + /// WhichCaptures, + /// }; + /// + /// let re = BoundedBacktracker::builder() + /// .thompson(NFA::config().which_captures(WhichCaptures::None)) + /// .build(r"[a-z]+")?; + /// let mut cache = re.create_cache(); + /// + /// assert!(re.try_is_match(&mut cache, "abc")?); + /// assert_eq!(None, re.try_find(&mut cache, "abc")?); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn which_captures(mut self, which_captures: WhichCaptures) -> Config { + self.which_captures = Some(which_captures); + self + } + + /// Sets the look-around matcher that should be used with this NFA. + /// + /// A look-around matcher determines how to match look-around assertions. + /// In particular, some assertions are configurable. For example, the + /// `(?m:^)` and `(?m:$)` assertions can have their line terminator changed + /// from the default of `\n` to any other byte. + /// + /// # Example + /// + /// This shows how to change the line terminator for multi-line assertions. + /// + /// ``` + /// use regex_automata::{ + /// nfa::thompson::{self, pikevm::PikeVM}, + /// util::look::LookMatcher, + /// Match, Input, + /// }; + /// + /// let mut lookm = LookMatcher::new(); + /// lookm.set_line_terminator(b'\x00'); + /// + /// let re = PikeVM::builder() + /// .thompson(thompson::Config::new().look_matcher(lookm)) + /// .build(r"(?m)^[a-z]+$")?; + /// let mut cache = re.create_cache(); + /// + /// // Multi-line assertions now use NUL as a terminator. + /// assert_eq!( + /// Some(Match::must(0, 1..4)), + /// re.find(&mut cache, b"\x00abc\x00"), + /// ); + /// // ... and \n is no longer recognized as a terminator. + /// assert_eq!( + /// None, + /// re.find(&mut cache, b"\nabc\n"), + /// ); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn look_matcher(mut self, m: LookMatcher) -> Config { + self.look_matcher = Some(m); + self + } + + /// Whether to compile an unanchored prefix into this NFA. + /// + /// This is enabled by default. It is made available for tests only to make + /// it easier to unit test the output of the compiler. + #[cfg(test)] + fn unanchored_prefix(mut self, yes: bool) -> Config { + self.unanchored_prefix = Some(yes); + self + } + + /// Returns whether this configuration has enabled UTF-8 mode. + pub fn get_utf8(&self) -> bool { + self.utf8.unwrap_or(true) + } + + /// Returns whether this configuration has enabled reverse NFA compilation. + pub fn get_reverse(&self) -> bool { + self.reverse.unwrap_or(false) + } + + /// Return the configured NFA size limit, if it exists, in the number of + /// bytes of heap used. + pub fn get_nfa_size_limit(&self) -> Option<usize> { + self.nfa_size_limit.unwrap_or(None) + } + + /// Return whether NFA shrinking is enabled. + pub fn get_shrink(&self) -> bool { + self.shrink.unwrap_or(false) + } + + /// Return whether NFA compilation is configured to produce capture states. + #[deprecated(since = "0.3.5", note = "use get_which_captures instead")] + pub fn get_captures(&self) -> bool { + self.get_which_captures().is_any() + } + + /// Return what kinds of capture states will be compiled into an NFA. + pub fn get_which_captures(&self) -> WhichCaptures { + self.which_captures.unwrap_or(WhichCaptures::All) + } + + /// Return the look-around matcher for this NFA. + pub fn get_look_matcher(&self) -> LookMatcher { + self.look_matcher.clone().unwrap_or(LookMatcher::default()) + } + + /// Return whether NFA compilation is configured to include an unanchored + /// prefix. + /// + /// This is always false when not in test mode. + fn get_unanchored_prefix(&self) -> bool { + #[cfg(test)] + { + self.unanchored_prefix.unwrap_or(true) + } + #[cfg(not(test))] + { + true + } + } + + /// Overwrite the default configuration such that the options in `o` are + /// always used. If an option in `o` is not set, then the corresponding + /// option in `self` is used. If it's not set in `self` either, then it + /// remains not set. + pub(crate) fn overwrite(&self, o: Config) -> Config { + Config { + utf8: o.utf8.or(self.utf8), + reverse: o.reverse.or(self.reverse), + nfa_size_limit: o.nfa_size_limit.or(self.nfa_size_limit), + shrink: o.shrink.or(self.shrink), + which_captures: o.which_captures.or(self.which_captures), + look_matcher: o.look_matcher.or_else(|| self.look_matcher.clone()), + #[cfg(test)] + unanchored_prefix: o.unanchored_prefix.or(self.unanchored_prefix), + } + } +} + +/// A configuration indicating which kinds of +/// [`State::Capture`](crate::nfa::thompson::State::Capture) states to include. +/// +/// This configuration can be used with [`Config::which_captures`] to control +/// which capture states are compiled into a Thompson NFA. +/// +/// The default configuration is [`WhichCaptures::All`]. +#[derive(Clone, Copy, Debug)] +pub enum WhichCaptures { + /// All capture states, including those corresponding to both implicit and + /// explicit capture groups, are included in the Thompson NFA. + All, + /// Only capture states corresponding to implicit capture groups are + /// included. Implicit capture groups appear in every pattern implicitly + /// and correspond to the overall match of a pattern. + /// + /// This is useful when one only cares about the overall match of a + /// pattern. By excluding capture states from explicit capture groups, + /// one might be able to reduce the memory usage of a multi-pattern regex + /// substantially if it was otherwise written to have many explicit capture + /// groups. + Implicit, + /// No capture states are compiled into the Thompson NFA. + /// + /// This is useful when capture states are either not needed (for example, + /// if one is only trying to build a DFA) or if they aren't supported (for + /// example, a reverse NFA). + None, +} + +impl Default for WhichCaptures { + fn default() -> WhichCaptures { + WhichCaptures::All + } +} + +impl WhichCaptures { + /// Returns true if this configuration indicates that no capture states + /// should be produced in an NFA. + pub fn is_none(&self) -> bool { + matches!(*self, WhichCaptures::None) + } + + /// Returns true if this configuration indicates that some capture states + /// should be added to an NFA. Note that this might only include capture + /// states for implicit capture groups. + pub fn is_any(&self) -> bool { + !self.is_none() + } +} + +/* +This compiler below uses Thompson's construction algorithm. The compiler takes +a regex-syntax::Hir as input and emits an NFA graph as output. The NFA graph +is structured in a way that permits it to be executed by a virtual machine and +also used to efficiently build a DFA. + +The compiler deals with a slightly expanded set of NFA states than what is +in a final NFA (as exhibited by builder::State and nfa::State). Notably a +compiler state includes an empty node that has exactly one unconditional +epsilon transition to the next state. In other words, it's a "goto" instruction +if one views Thompson's NFA as a set of bytecode instructions. These goto +instructions are removed in a subsequent phase before returning the NFA to the +caller. The purpose of these empty nodes is that they make the construction +algorithm substantially simpler to implement. We remove them before returning +to the caller because they can represent substantial overhead when traversing +the NFA graph (either while searching using the NFA directly or while building +a DFA). + +In the future, it would be nice to provide a Glushkov compiler as well, as it +would work well as a bit-parallel NFA for smaller regexes. But the Thompson +construction is one I'm more familiar with and seems more straight-forward to +deal with when it comes to large Unicode character classes. + +Internally, the compiler uses interior mutability to improve composition in the +face of the borrow checker. In particular, we'd really like to be able to write +things like this: + + self.c_concat(exprs.iter().map(|e| self.c(e))) + +Which elegantly uses iterators to build up a sequence of compiled regex +sub-expressions and then hands it off to the concatenating compiler routine. +Without interior mutability, the borrow checker won't let us borrow `self` +mutably both inside and outside the closure at the same time. +*/ + +/// A builder for compiling an NFA from a regex's high-level intermediate +/// representation (HIR). +/// +/// This compiler provides a way to translate a parsed regex pattern into an +/// NFA state graph. The NFA state graph can either be used directly to execute +/// a search (e.g., with a Pike VM), or it can be further used to build a DFA. +/// +/// This compiler provides APIs both for compiling regex patterns directly from +/// their concrete syntax, or via a [`regex_syntax::hir::Hir`]. +/// +/// This compiler has various options that may be configured via +/// [`thompson::Config`](Config). +/// +/// Note that a compiler is not the same as a [`thompson::Builder`](Builder). +/// A `Builder` provides a lower level API that is uncoupled from a regex +/// pattern's concrete syntax or even its HIR. Instead, it permits stitching +/// together an NFA by hand. See its docs for examples. +/// +/// # Example: compilation from concrete syntax +/// +/// This shows how to compile an NFA from a pattern string while setting a size +/// limit on how big the NFA is allowed to be (in terms of bytes of heap used). +/// +/// ``` +/// use regex_automata::{ +/// nfa::thompson::{NFA, pikevm::PikeVM}, +/// Match, +/// }; +/// +/// let config = NFA::config().nfa_size_limit(Some(1_000)); +/// let nfa = NFA::compiler().configure(config).build(r"(?-u)\w")?; +/// +/// let re = PikeVM::new_from_nfa(nfa)?; +/// let mut cache = re.create_cache(); +/// let mut caps = re.create_captures(); +/// let expected = Some(Match::must(0, 3..4)); +/// re.captures(&mut cache, "!@#A#@!", &mut caps); +/// assert_eq!(expected, caps.get_match()); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +/// +/// # Example: compilation from HIR +/// +/// This shows how to hand assemble a regular expression via its HIR, and then +/// compile an NFA directly from it. +/// +/// ``` +/// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match}; +/// use regex_syntax::hir::{Hir, Class, ClassBytes, ClassBytesRange}; +/// +/// let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![ +/// ClassBytesRange::new(b'0', b'9'), +/// ClassBytesRange::new(b'A', b'Z'), +/// ClassBytesRange::new(b'_', b'_'), +/// ClassBytesRange::new(b'a', b'z'), +/// ]))); +/// +/// let config = NFA::config().nfa_size_limit(Some(1_000)); +/// let nfa = NFA::compiler().configure(config).build_from_hir(&hir)?; +/// +/// let re = PikeVM::new_from_nfa(nfa)?; +/// let mut cache = re.create_cache(); +/// let mut caps = re.create_captures(); +/// let expected = Some(Match::must(0, 3..4)); +/// re.captures(&mut cache, "!@#A#@!", &mut caps); +/// assert_eq!(expected, caps.get_match()); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +#[derive(Clone, Debug)] +pub struct Compiler { + /// A regex parser, used when compiling an NFA directly from a pattern + /// string. + parser: ParserBuilder, + /// The compiler configuration. + config: Config, + /// The builder for actually constructing an NFA. This provides a + /// convenient abstraction for writing a compiler. + builder: RefCell<Builder>, + /// State used for compiling character classes to UTF-8 byte automata. + /// State is not retained between character class compilations. This just + /// serves to amortize allocation to the extent possible. + utf8_state: RefCell<Utf8State>, + /// State used for arranging character classes in reverse into a trie. + trie_state: RefCell<RangeTrie>, + /// State used for caching common suffixes when compiling reverse UTF-8 + /// automata (for Unicode character classes). + utf8_suffix: RefCell<Utf8SuffixMap>, +} + +impl Compiler { + /// Create a new NFA builder with its default configuration. + pub fn new() -> Compiler { + Compiler { + parser: ParserBuilder::new(), + config: Config::default(), + builder: RefCell::new(Builder::new()), + utf8_state: RefCell::new(Utf8State::new()), + trie_state: RefCell::new(RangeTrie::new()), + utf8_suffix: RefCell::new(Utf8SuffixMap::new(1000)), + } + } + + /// Compile the given regular expression pattern into an NFA. + /// + /// If there was a problem parsing the regex, then that error is returned. + /// + /// Otherwise, if there was a problem building the NFA, then an error is + /// returned. The only error that can occur is if the compiled regex would + /// exceed the size limits configured on this builder, or if any part of + /// the NFA would exceed the integer representations used. (For example, + /// too many states might plausibly occur on a 16-bit target.) + /// + /// # Example + /// + /// ``` + /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match}; + /// + /// let config = NFA::config().nfa_size_limit(Some(1_000)); + /// let nfa = NFA::compiler().configure(config).build(r"(?-u)\w")?; + /// + /// let re = PikeVM::new_from_nfa(nfa)?; + /// let mut cache = re.create_cache(); + /// let mut caps = re.create_captures(); + /// let expected = Some(Match::must(0, 3..4)); + /// re.captures(&mut cache, "!@#A#@!", &mut caps); + /// assert_eq!(expected, caps.get_match()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn build(&self, pattern: &str) -> Result<NFA, BuildError> { + self.build_many(&[pattern]) + } + + /// Compile the given regular expression patterns into a single NFA. + /// + /// When matches are returned, the pattern ID corresponds to the index of + /// the pattern in the slice given. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match}; + /// + /// let config = NFA::config().nfa_size_limit(Some(1_000)); + /// let nfa = NFA::compiler().configure(config).build_many(&[ + /// r"(?-u)\s", + /// r"(?-u)\w", + /// ])?; + /// + /// let re = PikeVM::new_from_nfa(nfa)?; + /// let mut cache = re.create_cache(); + /// let mut caps = re.create_captures(); + /// let expected = Some(Match::must(1, 1..2)); + /// re.captures(&mut cache, "!A! !A!", &mut caps); + /// assert_eq!(expected, caps.get_match()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn build_many<P: AsRef<str>>( + &self, + patterns: &[P], + ) -> Result<NFA, BuildError> { + let mut hirs = vec![]; + for p in patterns { + hirs.push( + self.parser + .build() + .parse(p.as_ref()) + .map_err(BuildError::syntax)?, + ); + debug!("parsed: {:?}", p.as_ref()); + } + self.build_many_from_hir(&hirs) + } + + /// Compile the given high level intermediate representation of a regular + /// expression into an NFA. + /// + /// If there was a problem building the NFA, then an error is returned. The + /// only error that can occur is if the compiled regex would exceed the + /// size limits configured on this builder, or if any part of the NFA would + /// exceed the integer representations used. (For example, too many states + /// might plausibly occur on a 16-bit target.) + /// + /// # Example + /// + /// ``` + /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match}; + /// use regex_syntax::hir::{Hir, Class, ClassBytes, ClassBytesRange}; + /// + /// let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![ + /// ClassBytesRange::new(b'0', b'9'), + /// ClassBytesRange::new(b'A', b'Z'), + /// ClassBytesRange::new(b'_', b'_'), + /// ClassBytesRange::new(b'a', b'z'), + /// ]))); + /// + /// let config = NFA::config().nfa_size_limit(Some(1_000)); + /// let nfa = NFA::compiler().configure(config).build_from_hir(&hir)?; + /// + /// let re = PikeVM::new_from_nfa(nfa)?; + /// let mut cache = re.create_cache(); + /// let mut caps = re.create_captures(); + /// let expected = Some(Match::must(0, 3..4)); + /// re.captures(&mut cache, "!@#A#@!", &mut caps); + /// assert_eq!(expected, caps.get_match()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn build_from_hir(&self, expr: &Hir) -> Result<NFA, BuildError> { + self.build_many_from_hir(&[expr]) + } + + /// Compile the given high level intermediate representations of regular + /// expressions into a single NFA. + /// + /// When matches are returned, the pattern ID corresponds to the index of + /// the pattern in the slice given. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match}; + /// use regex_syntax::hir::{Hir, Class, ClassBytes, ClassBytesRange}; + /// + /// let hirs = &[ + /// Hir::class(Class::Bytes(ClassBytes::new(vec![ + /// ClassBytesRange::new(b'\t', b'\r'), + /// ClassBytesRange::new(b' ', b' '), + /// ]))), + /// Hir::class(Class::Bytes(ClassBytes::new(vec![ + /// ClassBytesRange::new(b'0', b'9'), + /// ClassBytesRange::new(b'A', b'Z'), + /// ClassBytesRange::new(b'_', b'_'), + /// ClassBytesRange::new(b'a', b'z'), + /// ]))), + /// ]; + /// + /// let config = NFA::config().nfa_size_limit(Some(1_000)); + /// let nfa = NFA::compiler().configure(config).build_many_from_hir(hirs)?; + /// + /// let re = PikeVM::new_from_nfa(nfa)?; + /// let mut cache = re.create_cache(); + /// let mut caps = re.create_captures(); + /// let expected = Some(Match::must(1, 1..2)); + /// re.captures(&mut cache, "!A! !A!", &mut caps); + /// assert_eq!(expected, caps.get_match()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn build_many_from_hir<H: Borrow<Hir>>( + &self, + exprs: &[H], + ) -> Result<NFA, BuildError> { + self.compile(exprs) + } + + /// Apply the given NFA configuration options to this builder. + /// + /// # Example + /// + /// ``` + /// use regex_automata::nfa::thompson::NFA; + /// + /// let config = NFA::config().nfa_size_limit(Some(1_000)); + /// let nfa = NFA::compiler().configure(config).build(r"(?-u)\w")?; + /// assert_eq!(nfa.pattern_len(), 1); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn configure(&mut self, config: Config) -> &mut Compiler { + self.config = self.config.overwrite(config); + self + } + + /// Set the syntax configuration for this builder using + /// [`syntax::Config`](crate::util::syntax::Config). + /// + /// This permits setting things like case insensitivity, Unicode and multi + /// line mode. + /// + /// This syntax configuration only applies when an NFA is built directly + /// from a pattern string. If an NFA is built from an HIR, then all syntax + /// settings are ignored. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{nfa::thompson::NFA, util::syntax}; + /// + /// let syntax_config = syntax::Config::new().unicode(false); + /// let nfa = NFA::compiler().syntax(syntax_config).build(r"\w")?; + /// // If Unicode were enabled, the number of states would be much bigger. + /// assert!(nfa.states().len() < 15); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn syntax( + &mut self, + config: crate::util::syntax::Config, + ) -> &mut Compiler { + config.apply(&mut self.parser); + self + } +} + +impl Compiler { + /// Compile the sequence of HIR expressions given. Pattern IDs are + /// allocated starting from 0, in correspondence with the slice given. + /// + /// It is legal to provide an empty slice. In that case, the NFA returned + /// has no patterns and will never match anything. + fn compile<H: Borrow<Hir>>(&self, exprs: &[H]) -> Result<NFA, BuildError> { + if exprs.len() > PatternID::LIMIT { + return Err(BuildError::too_many_patterns(exprs.len())); + } + if self.config.get_reverse() + && self.config.get_which_captures().is_any() + { + return Err(BuildError::unsupported_captures()); + } + + self.builder.borrow_mut().clear(); + self.builder.borrow_mut().set_utf8(self.config.get_utf8()); + self.builder.borrow_mut().set_reverse(self.config.get_reverse()); + self.builder + .borrow_mut() + .set_look_matcher(self.config.get_look_matcher()); + self.builder + .borrow_mut() + .set_size_limit(self.config.get_nfa_size_limit())?; + + // We always add an unanchored prefix unless we were specifically told + // not to (for tests only), or if we know that the regex is anchored + // for all matches. When an unanchored prefix is not added, then the + // NFA's anchored and unanchored start states are equivalent. + let all_anchored = exprs.iter().all(|e| { + e.borrow() + .properties() + .look_set_prefix() + .contains(hir::Look::Start) + }); + let anchored = !self.config.get_unanchored_prefix() || all_anchored; + let unanchored_prefix = if anchored { + self.c_empty()? + } else { + self.c_at_least(&Hir::dot(hir::Dot::AnyByte), false, 0)? + }; + + let compiled = self.c_alt_iter(exprs.iter().map(|e| { + let _ = self.start_pattern()?; + let one = self.c_cap(0, None, e.borrow())?; + let match_state_id = self.add_match()?; + self.patch(one.end, match_state_id)?; + let _ = self.finish_pattern(one.start)?; + Ok(ThompsonRef { start: one.start, end: match_state_id }) + }))?; + self.patch(unanchored_prefix.end, compiled.start)?; + let nfa = self + .builder + .borrow_mut() + .build(compiled.start, unanchored_prefix.start)?; + + debug!("HIR-to-NFA compilation complete, config: {:?}", self.config); + Ok(nfa) + } + + /// Compile an arbitrary HIR expression. + fn c(&self, expr: &Hir) -> Result<ThompsonRef, BuildError> { + use regex_syntax::hir::{Class, HirKind::*}; + + match *expr.kind() { + Empty => self.c_empty(), + Literal(hir::Literal(ref bytes)) => self.c_literal(bytes), + Class(Class::Bytes(ref c)) => self.c_byte_class(c), + Class(Class::Unicode(ref c)) => self.c_unicode_class(c), + Look(ref look) => self.c_look(look), + Repetition(ref rep) => self.c_repetition(rep), + Capture(ref c) => self.c_cap(c.index, c.name.as_deref(), &c.sub), + Concat(ref es) => self.c_concat(es.iter().map(|e| self.c(e))), + Alternation(ref es) => self.c_alt_slice(es), + } + } + + /// Compile a concatenation of the sub-expressions yielded by the given + /// iterator. If the iterator yields no elements, then this compiles down + /// to an "empty" state that always matches. + /// + /// If the compiler is in reverse mode, then the expressions given are + /// automatically compiled in reverse. + fn c_concat<I>(&self, mut it: I) -> Result<ThompsonRef, BuildError> + where + I: DoubleEndedIterator<Item = Result<ThompsonRef, BuildError>>, + { + let first = if self.is_reverse() { it.next_back() } else { it.next() }; + let ThompsonRef { start, mut end } = match first { + Some(result) => result?, + None => return self.c_empty(), + }; + loop { + let next = + if self.is_reverse() { it.next_back() } else { it.next() }; + let compiled = match next { + Some(result) => result?, + None => break, + }; + self.patch(end, compiled.start)?; + end = compiled.end; + } + Ok(ThompsonRef { start, end }) + } + + /// Compile an alternation of the given HIR values. + /// + /// This is like 'c_alt_iter', but it accepts a slice of HIR values instead + /// of an iterator of compiled NFA subgraphs. The point of accepting a + /// slice here is that it opens up some optimization opportunities. For + /// example, if all of the HIR values are literals, then this routine might + /// re-shuffle them to make NFA epsilon closures substantially faster. + fn c_alt_slice(&self, exprs: &[Hir]) -> Result<ThompsonRef, BuildError> { + // self.c_alt_iter(exprs.iter().map(|e| self.c(e))) + let literal_count = exprs + .iter() + .filter(|e| { + matches!(*e.kind(), hir::HirKind::Literal(hir::Literal(_))) + }) + .count(); + if literal_count <= 1 || literal_count < exprs.len() { + return self.c_alt_iter(exprs.iter().map(|e| self.c(e))); + } + + let mut trie = if self.is_reverse() { + LiteralTrie::reverse() + } else { + LiteralTrie::forward() + }; + for expr in exprs.iter() { + let literal = match *expr.kind() { + hir::HirKind::Literal(hir::Literal(ref bytes)) => bytes, + _ => unreachable!(), + }; + trie.add(literal)?; + } + trie.compile(&mut self.builder.borrow_mut()) + } + + /// Compile an alternation, where each element yielded by the given + /// iterator represents an item in the alternation. If the iterator yields + /// no elements, then this compiles down to a "fail" state. + /// + /// In an alternation, expressions appearing earlier are "preferred" at + /// match time over expressions appearing later. At least, this is true + /// when using "leftmost first" match semantics. (If "leftmost longest" are + /// ever added in the future, then this preference order of priority would + /// not apply in that mode.) + fn c_alt_iter<I>(&self, mut it: I) -> Result<ThompsonRef, BuildError> + where + I: Iterator<Item = Result<ThompsonRef, BuildError>>, + { + let first = match it.next() { + None => return self.c_fail(), + Some(result) => result?, + }; + let second = match it.next() { + None => return Ok(first), + Some(result) => result?, + }; + + let union = self.add_union()?; + let end = self.add_empty()?; + self.patch(union, first.start)?; + self.patch(first.end, end)?; + self.patch(union, second.start)?; + self.patch(second.end, end)?; + for result in it { + let compiled = result?; + self.patch(union, compiled.start)?; + self.patch(compiled.end, end)?; + } + Ok(ThompsonRef { start: union, end }) + } + + /// Compile the given capture sub-expression. `expr` should be the + /// sub-expression contained inside the capture. If "capture" states are + /// enabled, then they are added as appropriate. + /// + /// This accepts the pieces of a capture instead of a `hir::Capture` so + /// that it's easy to manufacture a "fake" group when necessary, e.g., for + /// adding the entire pattern as if it were a group in order to create + /// appropriate "capture" states in the NFA. + fn c_cap( + &self, + index: u32, + name: Option<&str>, + expr: &Hir, + ) -> Result<ThompsonRef, BuildError> { + match self.config.get_which_captures() { + // No capture states means we always skip them. + WhichCaptures::None => return self.c(expr), + // Implicit captures states means we only add when index==0 since + // index==0 implies the group is implicit. + WhichCaptures::Implicit if index > 0 => return self.c(expr), + _ => {} + } + + let start = self.add_capture_start(index, name)?; + let inner = self.c(expr)?; + let end = self.add_capture_end(index)?; + self.patch(start, inner.start)?; + self.patch(inner.end, end)?; + Ok(ThompsonRef { start, end }) + } + + /// Compile the given repetition expression. This handles all types of + /// repetitions and greediness. + fn c_repetition( + &self, + rep: &hir::Repetition, + ) -> Result<ThompsonRef, BuildError> { + match (rep.min, rep.max) { + (0, Some(1)) => self.c_zero_or_one(&rep.sub, rep.greedy), + (min, None) => self.c_at_least(&rep.sub, rep.greedy, min), + (min, Some(max)) if min == max => self.c_exactly(&rep.sub, min), + (min, Some(max)) => self.c_bounded(&rep.sub, rep.greedy, min, max), + } + } + + /// Compile the given expression such that it matches at least `min` times, + /// but no more than `max` times. + /// + /// When `greedy` is true, then the preference is for the expression to + /// match as much as possible. Otherwise, it will match as little as + /// possible. + fn c_bounded( + &self, + expr: &Hir, + greedy: bool, + min: u32, + max: u32, + ) -> Result<ThompsonRef, BuildError> { + let prefix = self.c_exactly(expr, min)?; + if min == max { + return Ok(prefix); + } + + // It is tempting here to compile the rest here as a concatenation + // of zero-or-one matches. i.e., for `a{2,5}`, compile it as if it + // were `aaa?a?a?`. The problem here is that it leads to this program: + // + // >000000: 61 => 01 + // 000001: 61 => 02 + // 000002: union(03, 04) + // 000003: 61 => 04 + // 000004: union(05, 06) + // 000005: 61 => 06 + // 000006: union(07, 08) + // 000007: 61 => 08 + // 000008: MATCH + // + // And effectively, once you hit state 2, the epsilon closure will + // include states 3, 5, 6, 7 and 8, which is quite a bit. It is better + // to instead compile it like so: + // + // >000000: 61 => 01 + // 000001: 61 => 02 + // 000002: union(03, 08) + // 000003: 61 => 04 + // 000004: union(05, 08) + // 000005: 61 => 06 + // 000006: union(07, 08) + // 000007: 61 => 08 + // 000008: MATCH + // + // So that the epsilon closure of state 2 is now just 3 and 8. + let empty = self.add_empty()?; + let mut prev_end = prefix.end; + for _ in min..max { + let union = if greedy { + self.add_union() + } else { + self.add_union_reverse() + }?; + let compiled = self.c(expr)?; + self.patch(prev_end, union)?; + self.patch(union, compiled.start)?; + self.patch(union, empty)?; + prev_end = compiled.end; + } + self.patch(prev_end, empty)?; + Ok(ThompsonRef { start: prefix.start, end: empty }) + } + + /// Compile the given expression such that it may be matched `n` or more + /// times, where `n` can be any integer. (Although a particularly large + /// integer is likely to run afoul of any configured size limits.) + /// + /// When `greedy` is true, then the preference is for the expression to + /// match as much as possible. Otherwise, it will match as little as + /// possible. + fn c_at_least( + &self, + expr: &Hir, + greedy: bool, + n: u32, + ) -> Result<ThompsonRef, BuildError> { + if n == 0 { + // When the expression cannot match the empty string, then we + // can get away with something much simpler: just one 'alt' + // instruction that optionally repeats itself. But if the expr + // can match the empty string... see below. + if expr.properties().minimum_len().map_or(false, |len| len > 0) { + let union = if greedy { + self.add_union() + } else { + self.add_union_reverse() + }?; + let compiled = self.c(expr)?; + self.patch(union, compiled.start)?; + self.patch(compiled.end, union)?; + return Ok(ThompsonRef { start: union, end: union }); + } + + // What's going on here? Shouldn't x* be simpler than this? It + // turns out that when implementing leftmost-first (Perl-like) + // match semantics, x* results in an incorrect preference order + // when computing the transitive closure of states if and only if + // 'x' can match the empty string. So instead, we compile x* as + // (x+)?, which preserves the correct preference order. + // + // See: https://github.com/rust-lang/regex/issues/779 + let compiled = self.c(expr)?; + let plus = if greedy { + self.add_union() + } else { + self.add_union_reverse() + }?; + self.patch(compiled.end, plus)?; + self.patch(plus, compiled.start)?; + + let question = if greedy { + self.add_union() + } else { + self.add_union_reverse() + }?; + let empty = self.add_empty()?; + self.patch(question, compiled.start)?; + self.patch(question, empty)?; + self.patch(plus, empty)?; + Ok(ThompsonRef { start: question, end: empty }) + } else if n == 1 { + let compiled = self.c(expr)?; + let union = if greedy { + self.add_union() + } else { + self.add_union_reverse() + }?; + self.patch(compiled.end, union)?; + self.patch(union, compiled.start)?; + Ok(ThompsonRef { start: compiled.start, end: union }) + } else { + let prefix = self.c_exactly(expr, n - 1)?; + let last = self.c(expr)?; + let union = if greedy { + self.add_union() + } else { + self.add_union_reverse() + }?; + self.patch(prefix.end, last.start)?; + self.patch(last.end, union)?; + self.patch(union, last.start)?; + Ok(ThompsonRef { start: prefix.start, end: union }) + } + } + + /// Compile the given expression such that it may be matched zero or one + /// times. + /// + /// When `greedy` is true, then the preference is for the expression to + /// match as much as possible. Otherwise, it will match as little as + /// possible. + fn c_zero_or_one( + &self, + expr: &Hir, + greedy: bool, + ) -> Result<ThompsonRef, BuildError> { + let union = + if greedy { self.add_union() } else { self.add_union_reverse() }?; + let compiled = self.c(expr)?; + let empty = self.add_empty()?; + self.patch(union, compiled.start)?; + self.patch(union, empty)?; + self.patch(compiled.end, empty)?; + Ok(ThompsonRef { start: union, end: empty }) + } + + /// Compile the given HIR expression exactly `n` times. + fn c_exactly( + &self, + expr: &Hir, + n: u32, + ) -> Result<ThompsonRef, BuildError> { + let it = (0..n).map(|_| self.c(expr)); + self.c_concat(it) + } + + /// Compile the given byte oriented character class. + /// + /// This uses "sparse" states to represent an alternation between ranges in + /// this character class. We can use "sparse" states instead of stitching + /// together a "union" state because all ranges in a character class have + /// equal priority *and* are non-overlapping (thus, only one can match, so + /// there's never a question of priority in the first place). This saves a + /// fair bit of overhead when traversing an NFA. + /// + /// This routine compiles an empty character class into a "fail" state. + fn c_byte_class( + &self, + cls: &hir::ClassBytes, + ) -> Result<ThompsonRef, BuildError> { + let end = self.add_empty()?; + let mut trans = Vec::with_capacity(cls.ranges().len()); + for r in cls.iter() { + trans.push(Transition { + start: r.start(), + end: r.end(), + next: end, + }); + } + Ok(ThompsonRef { start: self.add_sparse(trans)?, end }) + } + + /// Compile the given Unicode character class. + /// + /// This routine specifically tries to use various types of compression, + /// since UTF-8 automata of large classes can get quite large. The specific + /// type of compression used depends on forward vs reverse compilation, and + /// whether NFA shrinking is enabled or not. + /// + /// Aside from repetitions causing lots of repeat group, this is like the + /// single most expensive part of regex compilation. Therefore, a large part + /// of the expense of compilation may be reduce by disabling Unicode in the + /// pattern. + /// + /// This routine compiles an empty character class into a "fail" state. + fn c_unicode_class( + &self, + cls: &hir::ClassUnicode, + ) -> Result<ThompsonRef, BuildError> { + // If all we have are ASCII ranges wrapped in a Unicode package, then + // there is zero reason to bring out the big guns. We can fit all ASCII + // ranges within a single sparse state. + if cls.is_ascii() { + let end = self.add_empty()?; + let mut trans = Vec::with_capacity(cls.ranges().len()); + for r in cls.iter() { + // The unwraps below are OK because we've verified that this + // class only contains ASCII codepoints. + trans.push(Transition { + // FIXME(1.59): use the 'TryFrom<char> for u8' impl. + start: u8::try_from(u32::from(r.start())).unwrap(), + end: u8::try_from(u32::from(r.end())).unwrap(), + next: end, + }); + } + Ok(ThompsonRef { start: self.add_sparse(trans)?, end }) + } else if self.is_reverse() { + if !self.config.get_shrink() { + // When we don't want to spend the extra time shrinking, we + // compile the UTF-8 automaton in reverse using something like + // the "naive" approach, but will attempt to re-use common + // suffixes. + self.c_unicode_class_reverse_with_suffix(cls) + } else { + // When we want to shrink our NFA for reverse UTF-8 automata, + // we cannot feed UTF-8 sequences directly to the UTF-8 + // compiler, since the UTF-8 compiler requires all sequences + // to be lexicographically sorted. Instead, we organize our + // sequences into a range trie, which can then output our + // sequences in the correct order. Unfortunately, building the + // range trie is fairly expensive (but not nearly as expensive + // as building a DFA). Hence the reason why the 'shrink' option + // exists, so that this path can be toggled off. For example, + // we might want to turn this off if we know we won't be + // compiling a DFA. + let mut trie = self.trie_state.borrow_mut(); + trie.clear(); + + for rng in cls.iter() { + for mut seq in Utf8Sequences::new(rng.start(), rng.end()) { + seq.reverse(); + trie.insert(seq.as_slice()); + } + } + let mut builder = self.builder.borrow_mut(); + let mut utf8_state = self.utf8_state.borrow_mut(); + let mut utf8c = + Utf8Compiler::new(&mut *builder, &mut *utf8_state)?; + trie.iter(|seq| { + utf8c.add(&seq)?; + Ok(()) + })?; + utf8c.finish() + } + } else { + // In the forward direction, we always shrink our UTF-8 automata + // because we can stream it right into the UTF-8 compiler. There + // is almost no downside (in either memory or time) to using this + // approach. + let mut builder = self.builder.borrow_mut(); + let mut utf8_state = self.utf8_state.borrow_mut(); + let mut utf8c = + Utf8Compiler::new(&mut *builder, &mut *utf8_state)?; + for rng in cls.iter() { + for seq in Utf8Sequences::new(rng.start(), rng.end()) { + utf8c.add(seq.as_slice())?; + } + } + utf8c.finish() + } + + // For reference, the code below is the "naive" version of compiling a + // UTF-8 automaton. It is deliciously simple (and works for both the + // forward and reverse cases), but will unfortunately produce very + // large NFAs. When compiling a forward automaton, the size difference + // can sometimes be an order of magnitude. For example, the '\w' regex + // will generate about ~3000 NFA states using the naive approach below, + // but only 283 states when using the approach above. This is because + // the approach above actually compiles a *minimal* (or near minimal, + // because of the bounded hashmap for reusing equivalent states) UTF-8 + // automaton. + // + // The code below is kept as a reference point in order to make it + // easier to understand the higher level goal here. Although, it will + // almost certainly bit-rot, so keep that in mind. Also, if you try to + // use it, some of the tests in this module will fail because they look + // for terser byte code produce by the more optimized handling above. + // But the integration test suite should still pass. + // + // One good example of the substantial difference this can make is to + // compare and contrast performance of the Pike VM when the code below + // is active vs the code above. Here's an example to try: + // + // regex-cli find match pikevm -b -p '(?m)^\w{20}' -y '@$smallishru' + // + // With Unicode classes generated below, this search takes about 45s on + // my machine. But with the compressed version above, the search takes + // only around 1.4s. The NFA is also 20% smaller. This is in part due + // to the compression, but also because of the utilization of 'sparse' + // NFA states. They lead to much less state shuffling during the NFA + // search. + /* + let it = cls + .iter() + .flat_map(|rng| Utf8Sequences::new(rng.start(), rng.end())) + .map(|seq| { + let it = seq + .as_slice() + .iter() + .map(|rng| self.c_range(rng.start, rng.end)); + self.c_concat(it) + }); + self.c_alt_iter(it) + */ + } + + /// Compile the given Unicode character class in reverse with suffix + /// caching. + /// + /// This is a "quick" way to compile large Unicode classes into reverse + /// UTF-8 automata while doing a small amount of compression on that + /// automata by reusing common suffixes. + /// + /// A more comprehensive compression scheme can be accomplished by using + /// a range trie to efficiently sort a reverse sequence of UTF-8 byte + /// rqanges, and then use Daciuk's algorithm via `Utf8Compiler`. + /// + /// This is the technique used when "NFA shrinking" is disabled. + /// + /// (This also tries to use "sparse" states where possible, just like + /// `c_byte_class` does.) + fn c_unicode_class_reverse_with_suffix( + &self, + cls: &hir::ClassUnicode, + ) -> Result<ThompsonRef, BuildError> { + // N.B. It would likely be better to cache common *prefixes* in the + // reverse direction, but it's not quite clear how to do that. The + // advantage of caching suffixes is that it does give us a win, and + // has a very small additional overhead. + let mut cache = self.utf8_suffix.borrow_mut(); + cache.clear(); + + let union = self.add_union()?; + let alt_end = self.add_empty()?; + for urng in cls.iter() { + for seq in Utf8Sequences::new(urng.start(), urng.end()) { + let mut end = alt_end; + for brng in seq.as_slice() { + let key = Utf8SuffixKey { + from: end, + start: brng.start, + end: brng.end, + }; + let hash = cache.hash(&key); + if let Some(id) = cache.get(&key, hash) { + end = id; + continue; + } + + let compiled = self.c_range(brng.start, brng.end)?; + self.patch(compiled.end, end)?; + end = compiled.start; + cache.set(key, hash, end); + } + self.patch(union, end)?; + } + } + Ok(ThompsonRef { start: union, end: alt_end }) + } + + /// Compile the given HIR look-around assertion to an NFA look-around + /// assertion. + fn c_look(&self, anchor: &hir::Look) -> Result<ThompsonRef, BuildError> { + let look = match *anchor { + hir::Look::Start => Look::Start, + hir::Look::End => Look::End, + hir::Look::StartLF => Look::StartLF, + hir::Look::EndLF => Look::EndLF, + hir::Look::StartCRLF => Look::StartCRLF, + hir::Look::EndCRLF => Look::EndCRLF, + hir::Look::WordAscii => Look::WordAscii, + hir::Look::WordAsciiNegate => Look::WordAsciiNegate, + hir::Look::WordUnicode => Look::WordUnicode, + hir::Look::WordUnicodeNegate => Look::WordUnicodeNegate, + }; + let id = self.add_look(look)?; + Ok(ThompsonRef { start: id, end: id }) + } + + /// Compile the given byte string to a concatenation of bytes. + fn c_literal(&self, bytes: &[u8]) -> Result<ThompsonRef, BuildError> { + self.c_concat(bytes.iter().copied().map(|b| self.c_range(b, b))) + } + + /// Compile a "range" state with one transition that may only be followed + /// if the input byte is in the (inclusive) range given. + /// + /// Both the `start` and `end` locations point to the state created. + /// Callers will likely want to keep the `start`, but patch the `end` to + /// point to some other state. + fn c_range(&self, start: u8, end: u8) -> Result<ThompsonRef, BuildError> { + let id = self.add_range(start, end)?; + Ok(ThompsonRef { start: id, end: id }) + } + + /// Compile an "empty" state with one unconditional epsilon transition. + /// + /// Both the `start` and `end` locations point to the state created. + /// Callers will likely want to keep the `start`, but patch the `end` to + /// point to some other state. + fn c_empty(&self) -> Result<ThompsonRef, BuildError> { + let id = self.add_empty()?; + Ok(ThompsonRef { start: id, end: id }) + } + + /// Compile a "fail" state that can never have any outgoing transitions. + fn c_fail(&self) -> Result<ThompsonRef, BuildError> { + let id = self.add_fail()?; + Ok(ThompsonRef { start: id, end: id }) + } + + // The below helpers are meant to be simple wrappers around the + // corresponding Builder methods. For the most part, they let us write + // 'self.add_foo()' instead of 'self.builder.borrow_mut().add_foo()', where + // the latter is a mouthful. Some of the methods do inject a little bit + // of extra logic. e.g., Flipping look-around operators when compiling in + // reverse mode. + + fn patch(&self, from: StateID, to: StateID) -> Result<(), BuildError> { + self.builder.borrow_mut().patch(from, to) + } + + fn start_pattern(&self) -> Result<PatternID, BuildError> { + self.builder.borrow_mut().start_pattern() + } + + fn finish_pattern( + &self, + start_id: StateID, + ) -> Result<PatternID, BuildError> { + self.builder.borrow_mut().finish_pattern(start_id) + } + + fn add_empty(&self) -> Result<StateID, BuildError> { + self.builder.borrow_mut().add_empty() + } + + fn add_range(&self, start: u8, end: u8) -> Result<StateID, BuildError> { + self.builder.borrow_mut().add_range(Transition { + start, + end, + next: StateID::ZERO, + }) + } + + fn add_sparse( + &self, + ranges: Vec<Transition>, + ) -> Result<StateID, BuildError> { + self.builder.borrow_mut().add_sparse(ranges) + } + + fn add_look(&self, mut look: Look) -> Result<StateID, BuildError> { + if self.is_reverse() { + look = look.reversed(); + } + self.builder.borrow_mut().add_look(StateID::ZERO, look) + } + + fn add_union(&self) -> Result<StateID, BuildError> { + self.builder.borrow_mut().add_union(vec![]) + } + + fn add_union_reverse(&self) -> Result<StateID, BuildError> { + self.builder.borrow_mut().add_union_reverse(vec![]) + } + + fn add_capture_start( + &self, + capture_index: u32, + name: Option<&str>, + ) -> Result<StateID, BuildError> { + let name = name.map(|n| Arc::from(n)); + self.builder.borrow_mut().add_capture_start( + StateID::ZERO, + capture_index, + name, + ) + } + + fn add_capture_end( + &self, + capture_index: u32, + ) -> Result<StateID, BuildError> { + self.builder.borrow_mut().add_capture_end(StateID::ZERO, capture_index) + } + + fn add_fail(&self) -> Result<StateID, BuildError> { + self.builder.borrow_mut().add_fail() + } + + fn add_match(&self) -> Result<StateID, BuildError> { + self.builder.borrow_mut().add_match() + } + + fn is_reverse(&self) -> bool { + self.config.get_reverse() + } +} + +/// A value that represents the result of compiling a sub-expression of a +/// regex's HIR. Specifically, this represents a sub-graph of the NFA that +/// has an initial state at `start` and a final state at `end`. +#[derive(Clone, Copy, Debug)] +pub(crate) struct ThompsonRef { + pub(crate) start: StateID, + pub(crate) end: StateID, +} + +/// A UTF-8 compiler based on Daciuk's algorithm for compilining minimal DFAs +/// from a lexicographically sorted sequence of strings in linear time. +/// +/// The trick here is that any Unicode codepoint range can be converted to +/// a sequence of byte ranges that form a UTF-8 automaton. Connecting them +/// together via an alternation is trivial, and indeed, it works. However, +/// there is a lot of redundant structure in many UTF-8 automatons. Since our +/// UTF-8 ranges are in lexicographic order, we can use Daciuk's algorithm +/// to build nearly minimal DFAs in linear time. (They are guaranteed to be +/// minimal because we use a bounded cache of previously build DFA states.) +/// +/// The drawback is that this sadly doesn't work for reverse automata, since +/// the ranges are no longer in lexicographic order. For that, we invented the +/// range trie (which gets its own module). Once a range trie is built, we then +/// use this same Utf8Compiler to build a reverse UTF-8 automaton. +/// +/// The high level idea is described here: +/// https://blog.burntsushi.net/transducers/#finite-state-machines-as-data-structures +/// +/// There is also another implementation of this in the `fst` crate. +#[derive(Debug)] +struct Utf8Compiler<'a> { + builder: &'a mut Builder, + state: &'a mut Utf8State, + target: StateID, +} + +#[derive(Clone, Debug)] +struct Utf8State { + compiled: Utf8BoundedMap, + uncompiled: Vec<Utf8Node>, +} + +#[derive(Clone, Debug)] +struct Utf8Node { + trans: Vec<Transition>, + last: Option<Utf8LastTransition>, +} + +#[derive(Clone, Debug)] +struct Utf8LastTransition { + start: u8, + end: u8, +} + +impl Utf8State { + fn new() -> Utf8State { + Utf8State { compiled: Utf8BoundedMap::new(10_000), uncompiled: vec![] } + } + + fn clear(&mut self) { + self.compiled.clear(); + self.uncompiled.clear(); + } +} + +impl<'a> Utf8Compiler<'a> { + fn new( + builder: &'a mut Builder, + state: &'a mut Utf8State, + ) -> Result<Utf8Compiler<'a>, BuildError> { + let target = builder.add_empty()?; + state.clear(); + let mut utf8c = Utf8Compiler { builder, state, target }; + utf8c.add_empty(); + Ok(utf8c) + } + + fn finish(&mut self) -> Result<ThompsonRef, BuildError> { + self.compile_from(0)?; + let node = self.pop_root(); + let start = self.compile(node)?; + Ok(ThompsonRef { start, end: self.target }) + } + + fn add(&mut self, ranges: &[Utf8Range]) -> Result<(), BuildError> { + let prefix_len = ranges + .iter() + .zip(&self.state.uncompiled) + .take_while(|&(range, node)| { + node.last.as_ref().map_or(false, |t| { + (t.start, t.end) == (range.start, range.end) + }) + }) + .count(); + assert!(prefix_len < ranges.len()); + self.compile_from(prefix_len)?; + self.add_suffix(&ranges[prefix_len..]); + Ok(()) + } + + fn compile_from(&mut self, from: usize) -> Result<(), BuildError> { + let mut next = self.target; + while from + 1 < self.state.uncompiled.len() { + let node = self.pop_freeze(next); + next = self.compile(node)?; + } + self.top_last_freeze(next); + Ok(()) + } + + fn compile( + &mut self, + node: Vec<Transition>, + ) -> Result<StateID, BuildError> { + let hash = self.state.compiled.hash(&node); + if let Some(id) = self.state.compiled.get(&node, hash) { + return Ok(id); + } + let id = self.builder.add_sparse(node.clone())?; + self.state.compiled.set(node, hash, id); + Ok(id) + } + + fn add_suffix(&mut self, ranges: &[Utf8Range]) { + assert!(!ranges.is_empty()); + let last = self + .state + .uncompiled + .len() + .checked_sub(1) + .expect("non-empty nodes"); + assert!(self.state.uncompiled[last].last.is_none()); + self.state.uncompiled[last].last = Some(Utf8LastTransition { + start: ranges[0].start, + end: ranges[0].end, + }); + for r in &ranges[1..] { + self.state.uncompiled.push(Utf8Node { + trans: vec![], + last: Some(Utf8LastTransition { start: r.start, end: r.end }), + }); + } + } + + fn add_empty(&mut self) { + self.state.uncompiled.push(Utf8Node { trans: vec![], last: None }); + } + + fn pop_freeze(&mut self, next: StateID) -> Vec<Transition> { + let mut uncompiled = self.state.uncompiled.pop().unwrap(); + uncompiled.set_last_transition(next); + uncompiled.trans + } + + fn pop_root(&mut self) -> Vec<Transition> { + assert_eq!(self.state.uncompiled.len(), 1); + assert!(self.state.uncompiled[0].last.is_none()); + self.state.uncompiled.pop().expect("non-empty nodes").trans + } + + fn top_last_freeze(&mut self, next: StateID) { + let last = self + .state + .uncompiled + .len() + .checked_sub(1) + .expect("non-empty nodes"); + self.state.uncompiled[last].set_last_transition(next); + } +} + +impl Utf8Node { + fn set_last_transition(&mut self, next: StateID) { + if let Some(last) = self.last.take() { + self.trans.push(Transition { + start: last.start, + end: last.end, + next, + }); + } + } +} + +#[cfg(test)] +mod tests { + use alloc::{vec, vec::Vec}; + + use crate::{ + nfa::thompson::{SparseTransitions, State, Transition, NFA}, + util::primitives::{PatternID, SmallIndex, StateID}, + }; + + use super::*; + + fn build(pattern: &str) -> NFA { + NFA::compiler() + .configure( + NFA::config() + .which_captures(WhichCaptures::None) + .unanchored_prefix(false), + ) + .build(pattern) + .unwrap() + } + + fn pid(id: usize) -> PatternID { + PatternID::new(id).unwrap() + } + + fn sid(id: usize) -> StateID { + StateID::new(id).unwrap() + } + + fn s_byte(byte: u8, next: usize) -> State { + let next = sid(next); + let trans = Transition { start: byte, end: byte, next }; + State::ByteRange { trans } + } + + fn s_range(start: u8, end: u8, next: usize) -> State { + let next = sid(next); + let trans = Transition { start, end, next }; + State::ByteRange { trans } + } + + fn s_sparse(transitions: &[(u8, u8, usize)]) -> State { + let transitions = transitions + .iter() + .map(|&(start, end, next)| Transition { + start, + end, + next: sid(next), + }) + .collect(); + State::Sparse(SparseTransitions { transitions }) + } + + fn s_bin_union(alt1: usize, alt2: usize) -> State { + State::BinaryUnion { alt1: sid(alt1), alt2: sid(alt2) } + } + + fn s_union(alts: &[usize]) -> State { + State::Union { + alternates: alts + .iter() + .map(|&id| sid(id)) + .collect::<Vec<StateID>>() + .into_boxed_slice(), + } + } + + fn s_cap(next: usize, pattern: usize, index: usize, slot: usize) -> State { + State::Capture { + next: sid(next), + pattern_id: pid(pattern), + group_index: SmallIndex::new(index).unwrap(), + slot: SmallIndex::new(slot).unwrap(), + } + } + + fn s_fail() -> State { + State::Fail + } + + fn s_match(id: usize) -> State { + State::Match { pattern_id: pid(id) } + } + + // Test that building an unanchored NFA has an appropriate `(?s:.)*?` + // prefix. + #[test] + fn compile_unanchored_prefix() { + let nfa = NFA::compiler() + .configure(NFA::config().which_captures(WhichCaptures::None)) + .build(r"a") + .unwrap(); + assert_eq!( + nfa.states(), + &[ + s_bin_union(2, 1), + s_range(0, 255, 0), + s_byte(b'a', 3), + s_match(0), + ] + ); + } + + #[test] + fn compile_empty() { + assert_eq!(build("").states(), &[s_match(0),]); + } + + #[test] + fn compile_literal() { + assert_eq!(build("a").states(), &[s_byte(b'a', 1), s_match(0),]); + assert_eq!( + build("ab").states(), + &[s_byte(b'a', 1), s_byte(b'b', 2), s_match(0),] + ); + assert_eq!( + build("☃").states(), + &[s_byte(0xE2, 1), s_byte(0x98, 2), s_byte(0x83, 3), s_match(0)] + ); + + // Check that non-UTF-8 literals work. + let nfa = NFA::compiler() + .configure( + NFA::config() + .which_captures(WhichCaptures::None) + .unanchored_prefix(false), + ) + .syntax(crate::util::syntax::Config::new().utf8(false)) + .build(r"(?-u)\xFF") + .unwrap(); + assert_eq!(nfa.states(), &[s_byte(b'\xFF', 1), s_match(0),]); + } + + #[test] + fn compile_class_ascii() { + assert_eq!( + build(r"[a-z]").states(), + &[s_range(b'a', b'z', 1), s_match(0),] + ); + assert_eq!( + build(r"[x-za-c]").states(), + &[s_sparse(&[(b'a', b'c', 1), (b'x', b'z', 1)]), s_match(0)] + ); + } + + #[test] + #[cfg(not(miri))] + fn compile_class_unicode() { + assert_eq!( + build(r"[\u03B1-\u03B4]").states(), + &[s_range(0xB1, 0xB4, 2), s_byte(0xCE, 0), s_match(0)] + ); + assert_eq!( + build(r"[\u03B1-\u03B4\u{1F919}-\u{1F91E}]").states(), + &[ + s_range(0xB1, 0xB4, 5), + s_range(0x99, 0x9E, 5), + s_byte(0xA4, 1), + s_byte(0x9F, 2), + s_sparse(&[(0xCE, 0xCE, 0), (0xF0, 0xF0, 3)]), + s_match(0), + ] + ); + assert_eq!( + build(r"[a-z☃]").states(), + &[ + s_byte(0x83, 3), + s_byte(0x98, 0), + s_sparse(&[(b'a', b'z', 3), (0xE2, 0xE2, 1)]), + s_match(0), + ] + ); + } + + #[test] + fn compile_repetition() { + assert_eq!( + build(r"a?").states(), + &[s_bin_union(1, 2), s_byte(b'a', 2), s_match(0),] + ); + assert_eq!( + build(r"a??").states(), + &[s_bin_union(2, 1), s_byte(b'a', 2), s_match(0),] + ); + } + + #[test] + fn compile_group() { + assert_eq!( + build(r"ab+").states(), + &[s_byte(b'a', 1), s_byte(b'b', 2), s_bin_union(1, 3), s_match(0)] + ); + assert_eq!( + build(r"(ab)").states(), + &[s_byte(b'a', 1), s_byte(b'b', 2), s_match(0)] + ); + assert_eq!( + build(r"(ab)+").states(), + &[s_byte(b'a', 1), s_byte(b'b', 2), s_bin_union(0, 3), s_match(0)] + ); + } + + #[test] + fn compile_alternation() { + assert_eq!( + build(r"a|b").states(), + &[s_range(b'a', b'b', 1), s_match(0)] + ); + assert_eq!( + build(r"ab|cd").states(), + &[ + s_byte(b'b', 3), + s_byte(b'd', 3), + s_sparse(&[(b'a', b'a', 0), (b'c', b'c', 1)]), + s_match(0) + ], + ); + assert_eq!( + build(r"|b").states(), + &[s_byte(b'b', 2), s_bin_union(2, 0), s_match(0)] + ); + assert_eq!( + build(r"a|").states(), + &[s_byte(b'a', 2), s_bin_union(0, 2), s_match(0)] + ); + } + + // This tests the use of a non-binary union, i.e., a state with more than + // 2 unconditional epsilon transitions. The only place they tend to appear + // is in reverse NFAs when shrinking is disabled. Otherwise, 'binary-union' + // and 'sparse' tend to cover all other cases of alternation. + #[test] + fn compile_non_binary_union() { + let nfa = NFA::compiler() + .configure( + NFA::config() + .which_captures(WhichCaptures::None) + .reverse(true) + .shrink(false) + .unanchored_prefix(false), + ) + .build(r"[\u1000\u2000\u3000]") + .unwrap(); + assert_eq!( + nfa.states(), + &[ + s_union(&[3, 6, 9]), + s_byte(0xE1, 10), + s_byte(0x80, 1), + s_byte(0x80, 2), + s_byte(0xE2, 10), + s_byte(0x80, 4), + s_byte(0x80, 5), + s_byte(0xE3, 10), + s_byte(0x80, 7), + s_byte(0x80, 8), + s_match(0), + ] + ); + } + + #[test] + fn compile_many_start_pattern() { + let nfa = NFA::compiler() + .configure( + NFA::config() + .which_captures(WhichCaptures::None) + .unanchored_prefix(false), + ) + .build_many(&["a", "b"]) + .unwrap(); + assert_eq!( + nfa.states(), + &[ + s_byte(b'a', 1), + s_match(0), + s_byte(b'b', 3), + s_match(1), + s_bin_union(0, 2), + ] + ); + assert_eq!(nfa.start_anchored().as_usize(), 4); + assert_eq!(nfa.start_unanchored().as_usize(), 4); + // Test that the start states for each individual pattern are correct. + assert_eq!(nfa.start_pattern(pid(0)).unwrap(), sid(0)); + assert_eq!(nfa.start_pattern(pid(1)).unwrap(), sid(2)); + } + + // This tests that our compiler can handle an empty character class. At the + // time of writing, the regex parser forbids it, so the only way to test it + // is to provide a hand written HIR. + #[test] + fn empty_class_bytes() { + use regex_syntax::hir::{Class, ClassBytes, Hir}; + + let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![]))); + let config = NFA::config() + .which_captures(WhichCaptures::None) + .unanchored_prefix(false); + let nfa = + NFA::compiler().configure(config).build_from_hir(&hir).unwrap(); + assert_eq!(nfa.states(), &[s_fail(), s_match(0)]); + } + + // Like empty_class_bytes, but for a Unicode class. + #[test] + fn empty_class_unicode() { + use regex_syntax::hir::{Class, ClassUnicode, Hir}; + + let hir = Hir::class(Class::Unicode(ClassUnicode::new(vec![]))); + let config = NFA::config() + .which_captures(WhichCaptures::None) + .unanchored_prefix(false); + let nfa = + NFA::compiler().configure(config).build_from_hir(&hir).unwrap(); + assert_eq!(nfa.states(), &[s_fail(), s_match(0)]); + } + + #[test] + fn compile_captures_all() { + let nfa = NFA::compiler() + .configure( + NFA::config() + .unanchored_prefix(false) + .which_captures(WhichCaptures::All), + ) + .build("a(b)c") + .unwrap(); + assert_eq!( + nfa.states(), + &[ + s_cap(1, 0, 0, 0), + s_byte(b'a', 2), + s_cap(3, 0, 1, 2), + s_byte(b'b', 4), + s_cap(5, 0, 1, 3), + s_byte(b'c', 6), + s_cap(7, 0, 0, 1), + s_match(0) + ] + ); + let ginfo = nfa.group_info(); + assert_eq!(2, ginfo.all_group_len()); + } + + #[test] + fn compile_captures_implicit() { + let nfa = NFA::compiler() + .configure( + NFA::config() + .unanchored_prefix(false) + .which_captures(WhichCaptures::Implicit), + ) + .build("a(b)c") + .unwrap(); + assert_eq!( + nfa.states(), + &[ + s_cap(1, 0, 0, 0), + s_byte(b'a', 2), + s_byte(b'b', 3), + s_byte(b'c', 4), + s_cap(5, 0, 0, 1), + s_match(0) + ] + ); + let ginfo = nfa.group_info(); + assert_eq!(1, ginfo.all_group_len()); + } + + #[test] + fn compile_captures_none() { + let nfa = NFA::compiler() + .configure( + NFA::config() + .unanchored_prefix(false) + .which_captures(WhichCaptures::None), + ) + .build("a(b)c") + .unwrap(); + assert_eq!( + nfa.states(), + &[s_byte(b'a', 1), s_byte(b'b', 2), s_byte(b'c', 3), s_match(0)] + ); + let ginfo = nfa.group_info(); + assert_eq!(0, ginfo.all_group_len()); + } +} diff --git a/third_party/rust/regex-automata/src/nfa/thompson/error.rs b/third_party/rust/regex-automata/src/nfa/thompson/error.rs new file mode 100644 index 0000000000..3c2fa8a215 --- /dev/null +++ b/third_party/rust/regex-automata/src/nfa/thompson/error.rs @@ -0,0 +1,185 @@ +use crate::util::{ + captures, look, + primitives::{PatternID, StateID}, +}; + +/// An error that can occurred during the construction of a thompson NFA. +/// +/// This error does not provide many introspection capabilities. There are +/// generally only two things you can do with it: +/// +/// * Obtain a human readable message via its `std::fmt::Display` impl. +/// * Access an underlying [`regex_syntax::Error`] type from its `source` +/// method via the `std::error::Error` trait. This error only occurs when using +/// convenience routines for building an NFA directly from a pattern string. +/// +/// Otherwise, errors typically occur when a limit has been breeched. For +/// example, if the total heap usage of the compiled NFA exceeds the limit +/// set by [`Config::nfa_size_limit`](crate::nfa::thompson::Config), then +/// building the NFA will fail. +#[derive(Clone, Debug)] +pub struct BuildError { + kind: BuildErrorKind, +} + +/// The kind of error that occurred during the construction of a thompson NFA. +#[derive(Clone, Debug)] +enum BuildErrorKind { + /// An error that occurred while parsing a regular expression. Note that + /// this error may be printed over multiple lines, and is generally + /// intended to be end user readable on its own. + #[cfg(feature = "syntax")] + Syntax(regex_syntax::Error), + /// An error that occurs if the capturing groups provided to an NFA builder + /// do not satisfy the documented invariants. For example, things like + /// too many groups, missing groups, having the first (zeroth) group be + /// named or duplicate group names within the same pattern. + Captures(captures::GroupInfoError), + /// An error that occurs when an NFA contains a Unicode word boundary, but + /// where the crate was compiled without the necessary data for dealing + /// with Unicode word boundaries. + Word(look::UnicodeWordBoundaryError), + /// An error that occurs if too many patterns were given to the NFA + /// compiler. + TooManyPatterns { + /// The number of patterns given, which exceeds the limit. + given: usize, + /// The limit on the number of patterns. + limit: usize, + }, + /// An error that occurs if too states are produced while building an NFA. + TooManyStates { + /// The minimum number of states that are desired, which exceeds the + /// limit. + given: usize, + /// The limit on the number of states. + limit: usize, + }, + /// An error that occurs when NFA compilation exceeds a configured heap + /// limit. + ExceededSizeLimit { + /// The configured limit, in bytes. + limit: usize, + }, + /// An error that occurs when an invalid capture group index is added to + /// the NFA. An "invalid" index can be one that would otherwise overflow + /// a `usize` on the current target. + InvalidCaptureIndex { + /// The invalid index that was given. + index: u32, + }, + /// An error that occurs when one tries to build a reverse NFA with + /// captures enabled. Currently, this isn't supported, but we probably + /// should support it at some point. + #[cfg(feature = "syntax")] + UnsupportedCaptures, +} + +impl BuildError { + /// If this error occurred because the NFA exceeded the configured size + /// limit before being built, then this returns the configured size limit. + /// + /// The limit returned is what was configured, and corresponds to the + /// maximum amount of heap usage in bytes. + pub fn size_limit(&self) -> Option<usize> { + match self.kind { + BuildErrorKind::ExceededSizeLimit { limit } => Some(limit), + _ => None, + } + } + + fn kind(&self) -> &BuildErrorKind { + &self.kind + } + + #[cfg(feature = "syntax")] + pub(crate) fn syntax(err: regex_syntax::Error) -> BuildError { + BuildError { kind: BuildErrorKind::Syntax(err) } + } + + pub(crate) fn captures(err: captures::GroupInfoError) -> BuildError { + BuildError { kind: BuildErrorKind::Captures(err) } + } + + pub(crate) fn word(err: look::UnicodeWordBoundaryError) -> BuildError { + BuildError { kind: BuildErrorKind::Word(err) } + } + + pub(crate) fn too_many_patterns(given: usize) -> BuildError { + let limit = PatternID::LIMIT; + BuildError { kind: BuildErrorKind::TooManyPatterns { given, limit } } + } + + pub(crate) fn too_many_states(given: usize) -> BuildError { + let limit = StateID::LIMIT; + BuildError { kind: BuildErrorKind::TooManyStates { given, limit } } + } + + pub(crate) fn exceeded_size_limit(limit: usize) -> BuildError { + BuildError { kind: BuildErrorKind::ExceededSizeLimit { limit } } + } + + pub(crate) fn invalid_capture_index(index: u32) -> BuildError { + BuildError { kind: BuildErrorKind::InvalidCaptureIndex { index } } + } + + #[cfg(feature = "syntax")] + pub(crate) fn unsupported_captures() -> BuildError { + BuildError { kind: BuildErrorKind::UnsupportedCaptures } + } +} + +#[cfg(feature = "std")] +impl std::error::Error for BuildError { + fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { + match self.kind() { + #[cfg(feature = "syntax")] + BuildErrorKind::Syntax(ref err) => Some(err), + BuildErrorKind::Captures(ref err) => Some(err), + _ => None, + } + } +} + +impl core::fmt::Display for BuildError { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + match self.kind() { + #[cfg(feature = "syntax")] + BuildErrorKind::Syntax(_) => write!(f, "error parsing regex"), + BuildErrorKind::Captures(_) => { + write!(f, "error with capture groups") + } + BuildErrorKind::Word(_) => { + write!(f, "NFA contains Unicode word boundary") + } + BuildErrorKind::TooManyPatterns { given, limit } => write!( + f, + "attempted to compile {} patterns, \ + which exceeds the limit of {}", + given, limit, + ), + BuildErrorKind::TooManyStates { given, limit } => write!( + f, + "attempted to compile {} NFA states, \ + which exceeds the limit of {}", + given, limit, + ), + BuildErrorKind::ExceededSizeLimit { limit } => write!( + f, + "heap usage during NFA compilation exceeded limit of {}", + limit, + ), + BuildErrorKind::InvalidCaptureIndex { index } => write!( + f, + "capture group index {} is invalid (too big or discontinuous)", + index, + ), + #[cfg(feature = "syntax")] + BuildErrorKind::UnsupportedCaptures => write!( + f, + "currently captures must be disabled when compiling \ + a reverse NFA", + ), + } + } +} diff --git a/third_party/rust/regex-automata/src/nfa/thompson/literal_trie.rs b/third_party/rust/regex-automata/src/nfa/thompson/literal_trie.rs new file mode 100644 index 0000000000..7ed129afd1 --- /dev/null +++ b/third_party/rust/regex-automata/src/nfa/thompson/literal_trie.rs @@ -0,0 +1,528 @@ +use core::mem; + +use alloc::{vec, vec::Vec}; + +use crate::{ + nfa::thompson::{self, compiler::ThompsonRef, BuildError, Builder}, + util::primitives::{IteratorIndexExt, StateID}, +}; + +/// A trie that preserves leftmost-first match semantics. +/// +/// This is a purpose-built data structure for optimizing 'lit1|lit2|..|litN' +/// patterns. It can *only* handle alternations of literals, which makes it +/// somewhat restricted in its scope, but literal alternations are fairly +/// common. +/// +/// At a 5,000 foot level, the main idea of this trie is make an alternation of +/// literals look more like a DFA than an NFA via epsilon removal. +/// +/// More precisely, the main issue is in how alternations are compiled into +/// a Thompson NFA. Namely, each alternation gets a single NFA "union" state +/// with an epsilon transition for every branch of the alternation pointing to +/// an NFA state corresponding to the start of that branch. The main problem +/// with this representation is the cost of computing an epsilon closure. Once +/// you hit the alternation's start state, it acts as a sort of "clog" that +/// requires you to traverse all of the epsilon transitions to compute the full +/// closure. +/// +/// While fixing such clogs in the general case is pretty tricky without going +/// to a DFA (or perhaps a Glushkov NFA, but that comes with other problems). +/// But at least in the case of an alternation of literals, we can convert +/// that to a prefix trie without too much cost. In theory, that's all you +/// really need to do: build the trie and then compile it to a Thompson NFA. +/// For example, if you have the pattern 'bar|baz|foo', then using a trie, it +/// is transformed to something like 'b(a(r|z))|f'. This reduces the clog by +/// reducing the number of epsilon transitions out of the alternation's start +/// state from 3 to 2 (it actually gets down to 1 when you use a sparse state, +/// which we do below). It's a small effect here, but when your alternation is +/// huge, the savings is also huge. +/// +/// And that is... essentially what a LiteralTrie does. But there is one +/// hiccup. Consider a regex like 'sam|samwise'. How does a prefix trie compile +/// that when leftmost-first semantics are used? If 'sam|samwise' was the +/// entire regex, then you could just drop the 'samwise' branch entirely since +/// it is impossible to match ('sam' will always take priority, and since it +/// is a prefix of 'samwise', 'samwise' will never match). But what about the +/// regex '\b(sam|samwise)\b'? In that case, you can't remove 'samwise' because +/// it might match when 'sam' doesn't fall on a word boundary. +/// +/// The main idea is that 'sam|samwise' can be translated to 'sam(?:|wise)', +/// which is a precisely equivalent regex that also gets rid of the clog. +/// +/// Another example is 'zapper|z|zap'. That gets translated to +/// 'z(?:apper||ap)'. +/// +/// We accomplish this by giving each state in the trie multiple "chunks" of +/// transitions. Each chunk barrier represents a match. The idea is that once +/// you know a match occurs, none of the transitions after the match can be +/// re-ordered and mixed in with the transitions before the match. Otherwise, +/// the match semantics could be changed. +/// +/// See the 'State' data type for a bit more detail. +/// +/// Future work: +/// +/// * In theory, it would be nice to generalize the idea of removing clogs and +/// apply it to the NFA graph itself. Then this could in theory work for +/// case insensitive alternations of literals, or even just alternations where +/// each branch starts with a non-epsilon transition. +/// * Could we instead use the Aho-Corasick algorithm here? The aho-corasick +/// crate deals with leftmost-first matches correctly, but I think this implies +/// encoding failure transitions into a Thompson NFA somehow. Which seems fine, +/// because failure transitions are just unconditional epsilon transitions? +/// * Or perhaps even better, could we use an aho_corasick::AhoCorasick +/// directly? At time of writing, 0.7 is the current version of the +/// aho-corasick crate, and that definitely cannot be used as-is. But if we +/// expose the underlying finite state machine API, then could we use it? That +/// would be super. If we could figure that out, it might also lend itself to +/// more general composition of finite state machines. +#[derive(Clone)] +pub(crate) struct LiteralTrie { + /// The set of trie states. Each state contains one or more chunks, where + /// each chunk is a sparse set of transitions to other states. A leaf state + /// is always a match state that contains only empty chunks (i.e., no + /// transitions). + states: Vec<State>, + /// Whether to add literals in reverse to the trie. Useful when building + /// a reverse NFA automaton. + rev: bool, +} + +impl LiteralTrie { + /// Create a new literal trie that adds literals in the forward direction. + pub(crate) fn forward() -> LiteralTrie { + let root = State::default(); + LiteralTrie { states: vec![root], rev: false } + } + + /// Create a new literal trie that adds literals in reverse. + pub(crate) fn reverse() -> LiteralTrie { + let root = State::default(); + LiteralTrie { states: vec![root], rev: true } + } + + /// Add the given literal to this trie. + /// + /// If the literal could not be added because the `StateID` space was + /// exhausted, then an error is returned. If an error returns, the trie + /// is in an unspecified state. + pub(crate) fn add(&mut self, bytes: &[u8]) -> Result<(), BuildError> { + let mut prev = StateID::ZERO; + let mut it = bytes.iter().copied(); + while let Some(b) = if self.rev { it.next_back() } else { it.next() } { + prev = self.get_or_add_state(prev, b)?; + } + self.states[prev].add_match(); + Ok(()) + } + + /// If the given transition is defined, then return the next state ID. + /// Otherwise, add the transition to `from` and point it to a new state. + /// + /// If a new state ID could not be allocated, then an error is returned. + fn get_or_add_state( + &mut self, + from: StateID, + byte: u8, + ) -> Result<StateID, BuildError> { + let active = self.states[from].active_chunk(); + match active.binary_search_by_key(&byte, |t| t.byte) { + Ok(i) => Ok(active[i].next), + Err(i) => { + // Add a new state and get its ID. + let next = StateID::new(self.states.len()).map_err(|_| { + BuildError::too_many_states(self.states.len()) + })?; + self.states.push(State::default()); + // Offset our position to account for all transitions and not + // just the ones in the active chunk. + let i = self.states[from].active_chunk_start() + i; + let t = Transition { byte, next }; + self.states[from].transitions.insert(i, t); + Ok(next) + } + } + } + + /// Compile this literal trie to the NFA builder given. + /// + /// This forwards any errors that may occur while using the given builder. + pub(crate) fn compile( + &self, + builder: &mut Builder, + ) -> Result<ThompsonRef, BuildError> { + // Compilation proceeds via depth-first traversal of the trie. + // + // This is overall pretty brutal. The recursive version of this is + // deliciously simple. (See 'compile_to_hir' below for what it might + // look like.) But recursion on a trie means your call stack grows + // in accordance with the longest literal, which just does not seem + // appropriate. So we push the call stack to the heap. But as a result, + // the trie traversal becomes pretty brutal because we essentially + // have to encode the state of a double for-loop into an explicit call + // frame. If someone can simplify this without using recursion, that'd + // be great. + + // 'end' is our match state for this trie, but represented in the the + // NFA. Any time we see a match in the trie, we insert a transition + // from the current state we're in to 'end'. + let end = builder.add_empty()?; + let mut stack = vec![]; + let mut f = Frame::new(&self.states[StateID::ZERO]); + loop { + if let Some(t) = f.transitions.next() { + if self.states[t.next].is_leaf() { + f.sparse.push(thompson::Transition { + start: t.byte, + end: t.byte, + next: end, + }); + } else { + f.sparse.push(thompson::Transition { + start: t.byte, + end: t.byte, + // This is a little funny, but when the frame we create + // below completes, it will pop this parent frame off + // and modify this transition to point to the correct + // state. + next: StateID::ZERO, + }); + stack.push(f); + f = Frame::new(&self.states[t.next]); + } + continue; + } + // At this point, we have visited all transitions in f.chunk, so + // add it as a sparse NFA state. Unless the chunk was empty, in + // which case, we don't do anything. + if !f.sparse.is_empty() { + let chunk_id = if f.sparse.len() == 1 { + builder.add_range(f.sparse.pop().unwrap())? + } else { + let sparse = mem::replace(&mut f.sparse, vec![]); + builder.add_sparse(sparse)? + }; + f.union.push(chunk_id); + } + // Now we need to look to see if there are other chunks to visit. + if let Some(chunk) = f.chunks.next() { + // If we're here, it means we're on the second (or greater) + // chunk, which implies there is a match at this point. So + // connect this state to the final end state. + f.union.push(end); + // Advance to the next chunk. + f.transitions = chunk.iter(); + continue; + } + // Now that we are out of chunks, we have completely visited + // this state. So turn our union of chunks into an NFA union + // state, and add that union state to the parent state's current + // sparse state. (If there is no parent, we're done.) + let start = builder.add_union(f.union)?; + match stack.pop() { + None => { + return Ok(ThompsonRef { start, end }); + } + Some(mut parent) => { + // OK because the only way a frame gets pushed on to the + // stack (aside from the root) is when a transition has + // been added to 'sparse'. + parent.sparse.last_mut().unwrap().next = start; + f = parent; + } + } + } + } + + /// Converts this trie to an equivalent HIR expression. + /// + /// We don't actually use this, but it's useful for tests. In particular, + /// it provides a (somewhat) human readable representation of the trie + /// itself. + #[cfg(test)] + fn compile_to_hir(&self) -> regex_syntax::hir::Hir { + self.compile_state_to_hir(StateID::ZERO) + } + + /// The recursive implementation of 'to_hir'. + /// + /// Notice how simple this is compared to 'compile' above. 'compile' could + /// be similarly simple, but we opt to not use recursion in order to avoid + /// overflowing the stack in the case of a longer literal. + #[cfg(test)] + fn compile_state_to_hir(&self, sid: StateID) -> regex_syntax::hir::Hir { + use regex_syntax::hir::Hir; + + let mut alt = vec![]; + for (i, chunk) in self.states[sid].chunks().enumerate() { + if i > 0 { + alt.push(Hir::empty()); + } + if chunk.is_empty() { + continue; + } + let mut chunk_alt = vec![]; + for t in chunk.iter() { + chunk_alt.push(Hir::concat(vec![ + Hir::literal(vec![t.byte]), + self.compile_state_to_hir(t.next), + ])); + } + alt.push(Hir::alternation(chunk_alt)); + } + Hir::alternation(alt) + } +} + +impl core::fmt::Debug for LiteralTrie { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + writeln!(f, "LiteralTrie(")?; + for (sid, state) in self.states.iter().with_state_ids() { + writeln!(f, "{:06?}: {:?}", sid.as_usize(), state)?; + } + writeln!(f, ")")?; + Ok(()) + } +} + +/// An explicit stack frame used for traversing the trie without using +/// recursion. +/// +/// Each frame is tied to the traversal of a single trie state. The frame is +/// dropped once the entire state (and all of its children) have been visited. +/// The "output" of compiling a state is the 'union' vector, which is turn +/// converted to a NFA union state. Each branch of the union corresponds to a +/// chunk in the trie state. +/// +/// 'sparse' corresponds to the set of transitions for a particular chunk in a +/// trie state. It is ultimately converted to an NFA sparse state. The 'sparse' +/// field, after being converted to a sparse NFA state, is reused for any +/// subsequent chunks in the trie state, if any exist. +#[derive(Debug)] +struct Frame<'a> { + /// The remaining chunks to visit for a trie state. + chunks: StateChunksIter<'a>, + /// The transitions of the current chunk that we're iterating over. Since + /// every trie state has at least one chunk, every frame is initialized + /// with the first chunk's transitions ready to be consumed. + transitions: core::slice::Iter<'a, Transition>, + /// The NFA state IDs pointing to the start of each chunk compiled by + /// this trie state. This ultimately gets converted to an NFA union once + /// the entire trie state (and all of its children) have been compiled. + /// The order of these matters for leftmost-first match semantics, since + /// earlier matches in the union are preferred over later ones. + union: Vec<StateID>, + /// The actual NFA transitions for a single chunk in a trie state. This + /// gets converted to an NFA sparse state, and its corresponding NFA state + /// ID should get added to 'union'. + sparse: Vec<thompson::Transition>, +} + +impl<'a> Frame<'a> { + /// Create a new stack frame for trie traversal. This initializes the + /// 'transitions' iterator to the transitions for the first chunk, with the + /// 'chunks' iterator being every chunk after the first one. + fn new(state: &'a State) -> Frame<'a> { + let mut chunks = state.chunks(); + // every state has at least 1 chunk + let chunk = chunks.next().unwrap(); + let transitions = chunk.iter(); + Frame { chunks, transitions, union: vec![], sparse: vec![] } + } +} + +/// A state in a trie. +/// +/// This uses a sparse representation. Since we don't use literal tries +/// for searching, and ultimately (and compilation requires visiting every +/// transition anyway), we use a sparse representation for transitions. This +/// means we save on memory, at the expense of 'LiteralTrie::add' being perhaps +/// a bit slower. +/// +/// While 'transitions' is pretty standard as far as tries goes, the 'chunks' +/// piece here is more unusual. In effect, 'chunks' defines a partitioning +/// of 'transitions', where each chunk corresponds to a distinct set of +/// transitions. The key invariant is that a transition in one chunk cannot +/// be moved to another chunk. This is the secret sauce that preserve +/// leftmost-first match semantics. +/// +/// A new chunk is added whenever we mark a state as a match state. Once a +/// new chunk is added, the old active chunk is frozen and is never mutated +/// again. The new chunk becomes the active chunk, which is defined as +/// '&transitions[chunks.last().map_or(0, |c| c.1)..]'. Thus, a state where +/// 'chunks' is empty actually contains one chunk. Thus, every state contains +/// at least one (possibly empty) chunk. +/// +/// A "leaf" state is a state that has no outgoing transitions (so +/// 'transitions' is empty). Note that there is no way for a leaf state to be a +/// non-matching state. (Although while building the trie, within 'add', a leaf +/// state may exist while not containing any matches. But this invariant is +/// only broken within 'add'. Once 'add' returns, the invariant is upheld.) +#[derive(Clone, Default)] +struct State { + transitions: Vec<Transition>, + chunks: Vec<(usize, usize)>, +} + +impl State { + /// Mark this state as a match state and freeze the active chunk such that + /// it can not be further mutated. + fn add_match(&mut self) { + // This is not strictly necessary, but there's no point in recording + // another match by adding another chunk if the state has no + // transitions. Note though that we only skip this if we already know + // this is a match state, which is only true if 'chunks' is not empty. + // Basically, if we didn't do this, nothing semantically would change, + // but we'd end up pushing another chunk and potentially triggering an + // alloc. + if self.transitions.is_empty() && !self.chunks.is_empty() { + return; + } + let chunk_start = self.active_chunk_start(); + let chunk_end = self.transitions.len(); + self.chunks.push((chunk_start, chunk_end)); + } + + /// Returns true if and only if this state is a leaf state. That is, a + /// state that has no outgoing transitions. + fn is_leaf(&self) -> bool { + self.transitions.is_empty() + } + + /// Returns an iterator over all of the chunks (including the currently + /// active chunk) in this state. Since the active chunk is included, the + /// iterator is guaranteed to always yield at least one chunk (although the + /// chunk may be empty). + fn chunks(&self) -> StateChunksIter<'_> { + StateChunksIter { + transitions: &*self.transitions, + chunks: self.chunks.iter(), + active: Some(self.active_chunk()), + } + } + + /// Returns the active chunk as a slice of transitions. + fn active_chunk(&self) -> &[Transition] { + let start = self.active_chunk_start(); + &self.transitions[start..] + } + + /// Returns the index into 'transitions' where the active chunk starts. + fn active_chunk_start(&self) -> usize { + self.chunks.last().map_or(0, |&(_, end)| end) + } +} + +impl core::fmt::Debug for State { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + let mut spacing = " "; + for (i, chunk) in self.chunks().enumerate() { + if i > 0 { + write!(f, "{}MATCH", spacing)?; + } + spacing = ""; + for (j, t) in chunk.iter().enumerate() { + spacing = " "; + if j == 0 && i > 0 { + write!(f, " ")?; + } else if j > 0 { + write!(f, ", ")?; + } + write!(f, "{:?}", t)?; + } + } + Ok(()) + } +} + +/// An iterator over all of the chunks in a state, including the active chunk. +/// +/// This iterator is created by `State::chunks`. We name this iterator so that +/// we can include it in the `Frame` type for non-recursive trie traversal. +#[derive(Debug)] +struct StateChunksIter<'a> { + transitions: &'a [Transition], + chunks: core::slice::Iter<'a, (usize, usize)>, + active: Option<&'a [Transition]>, +} + +impl<'a> Iterator for StateChunksIter<'a> { + type Item = &'a [Transition]; + + fn next(&mut self) -> Option<&'a [Transition]> { + if let Some(&(start, end)) = self.chunks.next() { + return Some(&self.transitions[start..end]); + } + if let Some(chunk) = self.active.take() { + return Some(chunk); + } + None + } +} + +/// A single transition in a trie to another state. +#[derive(Clone, Copy)] +struct Transition { + byte: u8, + next: StateID, +} + +impl core::fmt::Debug for Transition { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + write!( + f, + "{:?} => {}", + crate::util::escape::DebugByte(self.byte), + self.next.as_usize() + ) + } +} + +#[cfg(test)] +mod tests { + use bstr::B; + use regex_syntax::hir::Hir; + + use super::*; + + #[test] + fn zap() { + let mut trie = LiteralTrie::forward(); + trie.add(b"zapper").unwrap(); + trie.add(b"z").unwrap(); + trie.add(b"zap").unwrap(); + + let got = trie.compile_to_hir(); + let expected = Hir::concat(vec![ + Hir::literal(B("z")), + Hir::alternation(vec![ + Hir::literal(B("apper")), + Hir::empty(), + Hir::literal(B("ap")), + ]), + ]); + assert_eq!(expected, got); + } + + #[test] + fn maker() { + let mut trie = LiteralTrie::forward(); + trie.add(b"make").unwrap(); + trie.add(b"maple").unwrap(); + trie.add(b"maker").unwrap(); + + let got = trie.compile_to_hir(); + let expected = Hir::concat(vec![ + Hir::literal(B("ma")), + Hir::alternation(vec![ + Hir::concat(vec![ + Hir::literal(B("ke")), + Hir::alternation(vec![Hir::empty(), Hir::literal(B("r"))]), + ]), + Hir::literal(B("ple")), + ]), + ]); + assert_eq!(expected, got); + } +} diff --git a/third_party/rust/regex-automata/src/nfa/thompson/map.rs b/third_party/rust/regex-automata/src/nfa/thompson/map.rs new file mode 100644 index 0000000000..c36ce53866 --- /dev/null +++ b/third_party/rust/regex-automata/src/nfa/thompson/map.rs @@ -0,0 +1,296 @@ +// This module contains a couple simple and purpose built hash maps. The key +// trade off they make is that they serve as caches rather than true maps. That +// is, inserting a new entry may cause eviction of another entry. This gives +// us two things. First, there's less overhead associated with inserts and +// lookups. Secondly, it lets us control our memory usage. +// +// These maps are used in some fairly hot code when generating NFA states for +// large Unicode character classes. +// +// Instead of exposing a rich hashmap entry API, we just permit the caller to +// produce a hash of the key directly. The hash can then be reused for both +// lookups and insertions at the cost of leaking abstraction a bit. But these +// are for internal use only, so it's fine. +// +// The Utf8BoundedMap is used for Daciuk's algorithm for constructing a +// (almost) minimal DFA for large Unicode character classes in linear time. +// (Daciuk's algorithm is always used when compiling forward NFAs. For reverse +// NFAs, it's only used when the compiler is configured to 'shrink' the NFA, +// since there's a bit more expense in the reverse direction.) +// +// The Utf8SuffixMap is used when compiling large Unicode character classes for +// reverse NFAs when 'shrink' is disabled. Specifically, it augments the naive +// construction of UTF-8 automata by caching common suffixes. This doesn't +// get the same space savings as Daciuk's algorithm, but it's basically as +// fast as the naive approach and typically winds up using less memory (since +// it generates smaller NFAs) despite the presence of the cache. +// +// These maps effectively represent caching mechanisms for sparse and +// byte-range NFA states, respectively. The former represents a single NFA +// state with many transitions of equivalent priority while the latter +// represents a single NFA state with a single transition. (Neither state ever +// has or is an epsilon transition.) Thus, they have different key types. It's +// likely we could make one generic map, but the machinery didn't seem worth +// it. They are simple enough. + +use alloc::{vec, vec::Vec}; + +use crate::{ + nfa::thompson::Transition, + util::{ + int::{Usize, U64}, + primitives::StateID, + }, +}; + +// Basic FNV-1a hash constants as described in: +// https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function +const PRIME: u64 = 1099511628211; +const INIT: u64 = 14695981039346656037; + +/// A bounded hash map where the key is a sequence of NFA transitions and the +/// value is a pre-existing NFA state ID. +/// +/// std's hashmap can be used for this, however, this map has two important +/// advantages. Firstly, it has lower overhead. Secondly, it permits us to +/// control our memory usage by limited the number of slots. In general, the +/// cost here is that this map acts as a cache. That is, inserting a new entry +/// may remove an old entry. We are okay with this, since it does not impact +/// correctness in the cases where it is used. The only effect that dropping +/// states from the cache has is that the resulting NFA generated may be bigger +/// than it otherwise would be. +/// +/// This improves benchmarks that compile large Unicode character classes, +/// since it makes the generation of (almost) minimal UTF-8 automaton faster. +/// Specifically, one could observe the difference with std's hashmap via +/// something like the following benchmark: +/// +/// hyperfine "regex-cli debug nfa thompson --quiet --reverse '\w{90} ecurB'" +/// +/// But to observe that difference, you'd have to modify the code to use +/// std's hashmap. +/// +/// It is quite possible that there is a better way to approach this problem. +/// For example, if there happens to be a very common state that collides with +/// a lot of less frequent states, then we could wind up with very poor caching +/// behavior. Alas, the effectiveness of this cache has not been measured. +/// Instead, ad hoc experiments suggest that it is "good enough." Additional +/// smarts (such as an LRU eviction policy) have to be weighed against the +/// amount of extra time they cost. +#[derive(Clone, Debug)] +pub struct Utf8BoundedMap { + /// The current version of this map. Only entries with matching versions + /// are considered during lookups. If an entry is found with a mismatched + /// version, then the map behaves as if the entry does not exist. + /// + /// This makes it possible to clear the map by simply incrementing the + /// version number instead of actually deallocating any storage. + version: u16, + /// The total number of entries this map can store. + capacity: usize, + /// The actual entries, keyed by hash. Collisions between different states + /// result in the old state being dropped. + map: Vec<Utf8BoundedEntry>, +} + +/// An entry in this map. +#[derive(Clone, Debug, Default)] +struct Utf8BoundedEntry { + /// The version of the map used to produce this entry. If this entry's + /// version does not match the current version of the map, then the map + /// should behave as if this entry does not exist. + version: u16, + /// The key, which is a sorted sequence of non-overlapping NFA transitions. + key: Vec<Transition>, + /// The state ID corresponding to the state containing the transitions in + /// this entry. + val: StateID, +} + +impl Utf8BoundedMap { + /// Create a new bounded map with the given capacity. The map will never + /// grow beyond the given size. + /// + /// Note that this does not allocate. Instead, callers must call `clear` + /// before using this map. `clear` will allocate space if necessary. + /// + /// This avoids the need to pay for the allocation of this map when + /// compiling regexes that lack large Unicode character classes. + pub fn new(capacity: usize) -> Utf8BoundedMap { + assert!(capacity > 0); + Utf8BoundedMap { version: 0, capacity, map: vec![] } + } + + /// Clear this map of all entries, but permit the reuse of allocation + /// if possible. + /// + /// This must be called before the map can be used. + pub fn clear(&mut self) { + if self.map.is_empty() { + self.map = vec![Utf8BoundedEntry::default(); self.capacity]; + } else { + self.version = self.version.wrapping_add(1); + // If we loop back to version 0, then we forcefully clear the + // entire map. Otherwise, it might be possible to incorrectly + // match entries used to generate other NFAs. + if self.version == 0 { + self.map = vec![Utf8BoundedEntry::default(); self.capacity]; + } + } + } + + /// Return a hash of the given transitions. + pub fn hash(&self, key: &[Transition]) -> usize { + let mut h = INIT; + for t in key { + h = (h ^ u64::from(t.start)).wrapping_mul(PRIME); + h = (h ^ u64::from(t.end)).wrapping_mul(PRIME); + h = (h ^ t.next.as_u64()).wrapping_mul(PRIME); + } + (h % self.map.len().as_u64()).as_usize() + } + + /// Retrieve the cached state ID corresponding to the given key. The hash + /// given must have been computed with `hash` using the same key value. + /// + /// If there is no cached state with the given transitions, then None is + /// returned. + pub fn get(&mut self, key: &[Transition], hash: usize) -> Option<StateID> { + let entry = &self.map[hash]; + if entry.version != self.version { + return None; + } + // There may be a hash collision, so we need to confirm real equality. + if entry.key != key { + return None; + } + Some(entry.val) + } + + /// Add a cached state to this map with the given key. Callers should + /// ensure that `state_id` points to a state that contains precisely the + /// NFA transitions given. + /// + /// `hash` must have been computed using the `hash` method with the same + /// key. + pub fn set( + &mut self, + key: Vec<Transition>, + hash: usize, + state_id: StateID, + ) { + self.map[hash] = + Utf8BoundedEntry { version: self.version, key, val: state_id }; + } +} + +/// A cache of suffixes used to modestly compress UTF-8 automata for large +/// Unicode character classes. +#[derive(Clone, Debug)] +pub struct Utf8SuffixMap { + /// The current version of this map. Only entries with matching versions + /// are considered during lookups. If an entry is found with a mismatched + /// version, then the map behaves as if the entry does not exist. + version: u16, + /// The total number of entries this map can store. + capacity: usize, + /// The actual entries, keyed by hash. Collisions between different states + /// result in the old state being dropped. + map: Vec<Utf8SuffixEntry>, +} + +/// A key that uniquely identifies an NFA state. It is a triple that represents +/// a transition from one state for a particular byte range. +#[derive(Clone, Debug, Default, Eq, PartialEq)] +pub struct Utf8SuffixKey { + pub from: StateID, + pub start: u8, + pub end: u8, +} + +/// An entry in this map. +#[derive(Clone, Debug, Default)] +struct Utf8SuffixEntry { + /// The version of the map used to produce this entry. If this entry's + /// version does not match the current version of the map, then the map + /// should behave as if this entry does not exist. + version: u16, + /// The key, which consists of a transition in a particular state. + key: Utf8SuffixKey, + /// The identifier that the transition in the key maps to. + val: StateID, +} + +impl Utf8SuffixMap { + /// Create a new bounded map with the given capacity. The map will never + /// grow beyond the given size. + /// + /// Note that this does not allocate. Instead, callers must call `clear` + /// before using this map. `clear` will allocate space if necessary. + /// + /// This avoids the need to pay for the allocation of this map when + /// compiling regexes that lack large Unicode character classes. + pub fn new(capacity: usize) -> Utf8SuffixMap { + assert!(capacity > 0); + Utf8SuffixMap { version: 0, capacity, map: vec![] } + } + + /// Clear this map of all entries, but permit the reuse of allocation + /// if possible. + /// + /// This must be called before the map can be used. + pub fn clear(&mut self) { + if self.map.is_empty() { + self.map = vec![Utf8SuffixEntry::default(); self.capacity]; + } else { + self.version = self.version.wrapping_add(1); + if self.version == 0 { + self.map = vec![Utf8SuffixEntry::default(); self.capacity]; + } + } + } + + /// Return a hash of the given transition. + pub fn hash(&self, key: &Utf8SuffixKey) -> usize { + // Basic FNV-1a hash as described: + // https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function + const PRIME: u64 = 1099511628211; + const INIT: u64 = 14695981039346656037; + + let mut h = INIT; + h = (h ^ key.from.as_u64()).wrapping_mul(PRIME); + h = (h ^ u64::from(key.start)).wrapping_mul(PRIME); + h = (h ^ u64::from(key.end)).wrapping_mul(PRIME); + (h % self.map.len().as_u64()).as_usize() + } + + /// Retrieve the cached state ID corresponding to the given key. The hash + /// given must have been computed with `hash` using the same key value. + /// + /// If there is no cached state with the given key, then None is returned. + pub fn get( + &mut self, + key: &Utf8SuffixKey, + hash: usize, + ) -> Option<StateID> { + let entry = &self.map[hash]; + if entry.version != self.version { + return None; + } + if key != &entry.key { + return None; + } + Some(entry.val) + } + + /// Add a cached state to this map with the given key. Callers should + /// ensure that `state_id` points to a state that contains precisely the + /// NFA transition given. + /// + /// `hash` must have been computed using the `hash` method with the same + /// key. + pub fn set(&mut self, key: Utf8SuffixKey, hash: usize, state_id: StateID) { + self.map[hash] = + Utf8SuffixEntry { version: self.version, key, val: state_id }; + } +} diff --git a/third_party/rust/regex-automata/src/nfa/thompson/mod.rs b/third_party/rust/regex-automata/src/nfa/thompson/mod.rs new file mode 100644 index 0000000000..cf426736dc --- /dev/null +++ b/third_party/rust/regex-automata/src/nfa/thompson/mod.rs @@ -0,0 +1,81 @@ +/*! +Defines a Thompson NFA and provides the [`PikeVM`](pikevm::PikeVM) and +[`BoundedBacktracker`](backtrack::BoundedBacktracker) regex engines. + +A Thompson NFA (non-deterministic finite automaton) is arguably _the_ central +data type in this library. It is the result of what is commonly referred to as +"regex compilation." That is, turning a regex pattern from its concrete syntax +string into something that can run a search looks roughly like this: + +* A `&str` is parsed into a [`regex-syntax::ast::Ast`](regex_syntax::ast::Ast). +* An `Ast` is translated into a [`regex-syntax::hir::Hir`](regex_syntax::hir::Hir). +* An `Hir` is compiled into a [`NFA`]. +* The `NFA` is then used to build one of a few different regex engines: + * An `NFA` is used directly in the `PikeVM` and `BoundedBacktracker` engines. + * An `NFA` is used by a [hybrid NFA/DFA](crate::hybrid) to build out a DFA's + transition table at search time. + * An `NFA`, assuming it is one-pass, is used to build a full + [one-pass DFA](crate::dfa::onepass) ahead of time. + * An `NFA` is used to build a [full DFA](crate::dfa) ahead of time. + +The [`meta`](crate::meta) regex engine makes all of these choices for you based +on various criteria. However, if you have a lower level use case, _you_ can +build any of the above regex engines and use them directly. But you must start +here by building an `NFA`. + +# Details + +It is perhaps worth expanding a bit more on what it means to go through the +`&str`->`Ast`->`Hir`->`NFA` process. + +* Parsing a string into an `Ast` gives it a structured representation. +Crucially, the size and amount of work done in this step is proportional to the +size of the original string. No optimization or Unicode handling is done at +this point. This means that parsing into an `Ast` has very predictable costs. +Moreover, an `Ast` can be roundtripped back to its original pattern string as +written. +* Translating an `Ast` into an `Hir` is a process by which the structured +representation is simplified down to its most fundamental components. +Translation deals with flags such as case insensitivity by converting things +like `(?i:a)` to `[Aa]`. Translation is also where Unicode tables are consulted +to resolve things like `\p{Emoji}` and `\p{Greek}`. It also flattens each +character class, regardless of how deeply nested it is, into a single sequence +of non-overlapping ranges. All the various literal forms are thrown out in +favor of one common representation. Overall, the `Hir` is small enough to fit +into your head and makes analysis and other tasks much simpler. +* Compiling an `Hir` into an `NFA` formulates the regex into a finite state +machine whose transitions are defined over bytes. For example, an `Hir` might +have a Unicode character class corresponding to a sequence of ranges defined +in terms of `char`. Compilation is then responsible for turning those ranges +into a UTF-8 automaton. That is, an automaton that matches the UTF-8 encoding +of just the codepoints specified by those ranges. Otherwise, the main job of +an `NFA` is to serve as a byte-code of sorts for a virtual machine. It can be +seen as a sequence of instructions for how to match a regex. +*/ + +#[cfg(feature = "nfa-backtrack")] +pub mod backtrack; +mod builder; +#[cfg(feature = "syntax")] +mod compiler; +mod error; +#[cfg(feature = "syntax")] +mod literal_trie; +#[cfg(feature = "syntax")] +mod map; +mod nfa; +#[cfg(feature = "nfa-pikevm")] +pub mod pikevm; +#[cfg(feature = "syntax")] +mod range_trie; + +pub use self::{ + builder::Builder, + error::BuildError, + nfa::{ + DenseTransitions, PatternIter, SparseTransitions, State, Transition, + NFA, + }, +}; +#[cfg(feature = "syntax")] +pub use compiler::{Compiler, Config, WhichCaptures}; diff --git a/third_party/rust/regex-automata/src/nfa/thompson/nfa.rs b/third_party/rust/regex-automata/src/nfa/thompson/nfa.rs new file mode 100644 index 0000000000..2108fa3381 --- /dev/null +++ b/third_party/rust/regex-automata/src/nfa/thompson/nfa.rs @@ -0,0 +1,2101 @@ +use core::{fmt, mem}; + +use alloc::{boxed::Box, format, string::String, sync::Arc, vec, vec::Vec}; + +#[cfg(feature = "syntax")] +use crate::nfa::thompson::{ + compiler::{Compiler, Config}, + error::BuildError, +}; +use crate::{ + nfa::thompson::builder::Builder, + util::{ + alphabet::{self, ByteClassSet, ByteClasses}, + captures::{GroupInfo, GroupInfoError}, + look::{Look, LookMatcher, LookSet}, + primitives::{ + IteratorIndexExt, PatternID, PatternIDIter, SmallIndex, StateID, + }, + sparse_set::SparseSet, + }, +}; + +/// A byte oriented Thompson non-deterministic finite automaton (NFA). +/// +/// A Thompson NFA is a finite state machine that permits unconditional epsilon +/// transitions, but guarantees that there exists at most one non-epsilon +/// transition for each element in the alphabet for each state. +/// +/// An NFA may be used directly for searching, for analysis or to build +/// a deterministic finite automaton (DFA). +/// +/// # Cheap clones +/// +/// Since an NFA is a core data type in this crate that many other regex +/// engines are based on top of, it is convenient to give ownership of an NFA +/// to said regex engines. Because of this, an NFA uses reference counting +/// internally. Therefore, it is cheap to clone and it is encouraged to do so. +/// +/// # Capabilities +/// +/// Using an NFA for searching via the +/// [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM) provides the most amount +/// of "power" of any regex engine in this crate. Namely, it supports the +/// following in all cases: +/// +/// 1. Detection of a match. +/// 2. Location of a match, including both the start and end offset, in a +/// single pass of the haystack. +/// 3. Location of matching capturing groups. +/// 4. Handles multiple patterns, including (1)-(3) when multiple patterns are +/// present. +/// +/// # Capturing Groups +/// +/// Groups refer to parenthesized expressions inside a regex pattern. They look +/// like this, where `exp` is an arbitrary regex: +/// +/// * `(exp)` - An unnamed capturing group. +/// * `(?P<name>exp)` or `(?<name>exp)` - A named capturing group. +/// * `(?:exp)` - A non-capturing group. +/// * `(?i:exp)` - A non-capturing group that sets flags. +/// +/// Only the first two forms are said to be _capturing_. Capturing +/// means that the last position at which they match is reportable. The +/// [`Captures`](crate::util::captures::Captures) type provides convenient +/// access to the match positions of capturing groups, which includes looking +/// up capturing groups by their name. +/// +/// # Byte oriented +/// +/// This NFA is byte oriented, which means that all of its transitions are +/// defined on bytes. In other words, the alphabet of an NFA consists of the +/// 256 different byte values. +/// +/// While DFAs nearly demand that they be byte oriented for performance +/// reasons, an NFA could conceivably be *Unicode codepoint* oriented. Indeed, +/// a previous version of this NFA supported both byte and codepoint oriented +/// modes. A codepoint oriented mode can work because an NFA fundamentally uses +/// a sparse representation of transitions, which works well with the large +/// sparse space of Unicode codepoints. +/// +/// Nevertheless, this NFA is only byte oriented. This choice is primarily +/// driven by implementation simplicity, and also in part memory usage. In +/// practice, performance between the two is roughly comparable. However, +/// building a DFA (including a hybrid DFA) really wants a byte oriented NFA. +/// So if we do have a codepoint oriented NFA, then we also need to generate +/// byte oriented NFA in order to build an hybrid NFA/DFA. Thus, by only +/// generating byte oriented NFAs, we can produce one less NFA. In other words, +/// if we made our NFA codepoint oriented, we'd need to *also* make it support +/// a byte oriented mode, which is more complicated. But a byte oriented mode +/// can support everything. +/// +/// # Differences with DFAs +/// +/// At the theoretical level, the precise difference between an NFA and a DFA +/// is that, in a DFA, for every state, an input symbol unambiguously refers +/// to a single transition _and_ that an input symbol is required for each +/// transition. At a practical level, this permits DFA implementations to be +/// implemented at their core with a small constant number of CPU instructions +/// for each byte of input searched. In practice, this makes them quite a bit +/// faster than NFAs _in general_. Namely, in order to execute a search for any +/// Thompson NFA, one needs to keep track of a _set_ of states, and execute +/// the possible transitions on all of those states for each input symbol. +/// Overall, this results in much more overhead. To a first approximation, one +/// can expect DFA searches to be about an order of magnitude faster. +/// +/// So why use an NFA at all? The main advantage of an NFA is that it takes +/// linear time (in the size of the pattern string after repetitions have been +/// expanded) to build and linear memory usage. A DFA, on the other hand, may +/// take exponential time and/or space to build. Even in non-pathological +/// cases, DFAs often take quite a bit more memory than their NFA counterparts, +/// _especially_ if large Unicode character classes are involved. Of course, +/// an NFA also provides additional capabilities. For example, it can match +/// Unicode word boundaries on non-ASCII text and resolve the positions of +/// capturing groups. +/// +/// Note that a [`hybrid::regex::Regex`](crate::hybrid::regex::Regex) strikes a +/// good balance between an NFA and a DFA. It avoids the exponential build time +/// of a DFA while maintaining its fast search time. The downside of a hybrid +/// NFA/DFA is that in some cases it can be slower at search time than the NFA. +/// (It also has less functionality than a pure NFA. It cannot handle Unicode +/// word boundaries on non-ASCII text and cannot resolve capturing groups.) +/// +/// # Example +/// +/// This shows how to build an NFA with the default configuration and execute a +/// search using the Pike VM. +/// +/// ``` +/// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; +/// +/// let re = PikeVM::new(r"foo[0-9]+")?; +/// let mut cache = re.create_cache(); +/// let mut caps = re.create_captures(); +/// +/// let expected = Some(Match::must(0, 0..8)); +/// re.captures(&mut cache, b"foo12345", &mut caps); +/// assert_eq!(expected, caps.get_match()); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +/// +/// # Example: resolving capturing groups +/// +/// This example shows how to parse some simple dates and extract the +/// components of each date via capturing groups. +/// +/// ``` +/// # if cfg!(miri) { return Ok(()); } // miri takes too long +/// use regex_automata::{ +/// nfa::thompson::pikevm::PikeVM, +/// util::captures::Captures, +/// }; +/// +/// let vm = PikeVM::new(r"(?P<y>\d{4})-(?P<m>\d{2})-(?P<d>\d{2})")?; +/// let mut cache = vm.create_cache(); +/// +/// let haystack = "2012-03-14, 2013-01-01 and 2014-07-05"; +/// let all: Vec<Captures> = vm.captures_iter( +/// &mut cache, haystack.as_bytes() +/// ).collect(); +/// // There should be a total of 3 matches. +/// assert_eq!(3, all.len()); +/// // The year from the second match is '2013'. +/// let span = all[1].get_group_by_name("y").unwrap(); +/// assert_eq!("2013", &haystack[span]); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +/// +/// This example shows that only the last match of a capturing group is +/// reported, even if it had to match multiple times for an overall match +/// to occur. +/// +/// ``` +/// use regex_automata::{nfa::thompson::pikevm::PikeVM, Span}; +/// +/// let re = PikeVM::new(r"([a-z]){4}")?; +/// let mut cache = re.create_cache(); +/// let mut caps = re.create_captures(); +/// +/// let haystack = b"quux"; +/// re.captures(&mut cache, haystack, &mut caps); +/// assert!(caps.is_match()); +/// assert_eq!(Some(Span::from(3..4)), caps.get_group(1)); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +#[derive(Clone)] +pub struct NFA( + // We make NFAs reference counted primarily for two reasons. First is that + // the NFA type itself is quite large (at least 0.5KB), and so it makes + // sense to put it on the heap by default anyway. Second is that, for Arc + // specifically, this enables cheap clones. This tends to be useful because + // several structures (the backtracker, the Pike VM, the hybrid NFA/DFA) + // all want to hang on to an NFA for use during search time. We could + // provide the NFA at search time via a function argument, but this makes + // for an unnecessarily annoying API. Instead, we just let each structure + // share ownership of the NFA. Using a deep clone would not be smart, since + // the NFA can use quite a bit of heap space. + Arc<Inner>, +); + +impl NFA { + /// Parse the given regular expression using a default configuration and + /// build an NFA from it. + /// + /// If you want a non-default configuration, then use the NFA + /// [`Compiler`] with a [`Config`]. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; + /// + /// let re = PikeVM::new(r"foo[0-9]+")?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// + /// let expected = Some(Match::must(0, 0..8)); + /// re.captures(&mut cache, b"foo12345", &mut caps); + /// assert_eq!(expected, caps.get_match()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[cfg(feature = "syntax")] + pub fn new(pattern: &str) -> Result<NFA, BuildError> { + NFA::compiler().build(pattern) + } + + /// Parse the given regular expressions using a default configuration and + /// build a multi-NFA from them. + /// + /// If you want a non-default configuration, then use the NFA + /// [`Compiler`] with a [`Config`]. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; + /// + /// let re = PikeVM::new_many(&["[0-9]+", "[a-z]+"])?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// + /// let expected = Some(Match::must(1, 0..3)); + /// re.captures(&mut cache, b"foo12345bar", &mut caps); + /// assert_eq!(expected, caps.get_match()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[cfg(feature = "syntax")] + pub fn new_many<P: AsRef<str>>(patterns: &[P]) -> Result<NFA, BuildError> { + NFA::compiler().build_many(patterns) + } + + /// Returns an NFA with a single regex pattern that always matches at every + /// position. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match}; + /// + /// let re = PikeVM::new_from_nfa(NFA::always_match())?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// + /// let expected = Some(Match::must(0, 0..0)); + /// re.captures(&mut cache, b"", &mut caps); + /// assert_eq!(expected, caps.get_match()); + /// re.captures(&mut cache, b"foo", &mut caps); + /// assert_eq!(expected, caps.get_match()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn always_match() -> NFA { + // We could use NFA::new("") here and we'd get the same semantics, but + // hand-assembling the NFA (as below) does the same thing with a fewer + // number of states. It also avoids needing the 'syntax' feature + // enabled. + // + // Technically all we need is the "match" state, but we add the + // "capture" states so that the PikeVM can use this NFA. + // + // The unwraps below are OK because we add so few states that they will + // never exhaust any default limits in any environment. + let mut builder = Builder::new(); + let pid = builder.start_pattern().unwrap(); + assert_eq!(pid.as_usize(), 0); + let start_id = + builder.add_capture_start(StateID::ZERO, 0, None).unwrap(); + let end_id = builder.add_capture_end(StateID::ZERO, 0).unwrap(); + let match_id = builder.add_match().unwrap(); + builder.patch(start_id, end_id).unwrap(); + builder.patch(end_id, match_id).unwrap(); + let pid = builder.finish_pattern(start_id).unwrap(); + assert_eq!(pid.as_usize(), 0); + builder.build(start_id, start_id).unwrap() + } + + /// Returns an NFA that never matches at any position. + /// + /// This is a convenience routine for creating an NFA with zero patterns. + /// + /// # Example + /// + /// ``` + /// use regex_automata::nfa::thompson::{NFA, pikevm::PikeVM}; + /// + /// let re = PikeVM::new_from_nfa(NFA::never_match())?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// + /// re.captures(&mut cache, b"", &mut caps); + /// assert!(!caps.is_match()); + /// re.captures(&mut cache, b"foo", &mut caps); + /// assert!(!caps.is_match()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn never_match() -> NFA { + // This always succeeds because it only requires one NFA state, which + // will never exhaust any (default) limits. + let mut builder = Builder::new(); + let sid = builder.add_fail().unwrap(); + builder.build(sid, sid).unwrap() + } + + /// Return a default configuration for an `NFA`. + /// + /// This is a convenience routine to avoid needing to import the `Config` + /// type when customizing the construction of an NFA. + /// + /// # Example + /// + /// This example shows how to build an NFA with a small size limit that + /// results in a compilation error for any regex that tries to use more + /// heap memory than the configured limit. + /// + /// ``` + /// use regex_automata::nfa::thompson::{NFA, pikevm::PikeVM}; + /// + /// let result = PikeVM::builder() + /// .thompson(NFA::config().nfa_size_limit(Some(1_000))) + /// // Remember, \w is Unicode-aware by default and thus huge. + /// .build(r"\w+"); + /// assert!(result.is_err()); + /// ``` + #[cfg(feature = "syntax")] + pub fn config() -> Config { + Config::new() + } + + /// Return a compiler for configuring the construction of an `NFA`. + /// + /// This is a convenience routine to avoid needing to import the + /// [`Compiler`] type in common cases. + /// + /// # Example + /// + /// This example shows how to build an NFA that is permitted match invalid + /// UTF-8. Without the additional syntax configuration here, compilation of + /// `(?-u:.)` would fail because it is permitted to match invalid UTF-8. + /// + /// ``` + /// use regex_automata::{ + /// nfa::thompson::pikevm::PikeVM, + /// util::syntax, + /// Match, + /// }; + /// + /// let re = PikeVM::builder() + /// .syntax(syntax::Config::new().utf8(false)) + /// .build(r"[a-z]+(?-u:.)")?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// + /// let expected = Some(Match::must(0, 1..5)); + /// re.captures(&mut cache, b"\xFFabc\xFF", &mut caps); + /// assert_eq!(expected, caps.get_match()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[cfg(feature = "syntax")] + pub fn compiler() -> Compiler { + Compiler::new() + } + + /// Returns an iterator over all pattern identifiers in this NFA. + /// + /// Pattern IDs are allocated in sequential order starting from zero, + /// where the order corresponds to the order of patterns provided to the + /// [`NFA::new_many`] constructor. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{nfa::thompson::NFA, PatternID}; + /// + /// let nfa = NFA::new_many(&["[0-9]+", "[a-z]+", "[A-Z]+"])?; + /// let pids: Vec<PatternID> = nfa.patterns().collect(); + /// assert_eq!(pids, vec![ + /// PatternID::must(0), + /// PatternID::must(1), + /// PatternID::must(2), + /// ]); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn patterns(&self) -> PatternIter<'_> { + PatternIter { + it: PatternID::iter(self.pattern_len()), + _marker: core::marker::PhantomData, + } + } + + /// Returns the total number of regex patterns in this NFA. + /// + /// This may return zero if the NFA was constructed with no patterns. In + /// this case, the NFA can never produce a match for any input. + /// + /// This is guaranteed to be no bigger than [`PatternID::LIMIT`] because + /// NFA construction will fail if too many patterns are added. + /// + /// It is always true that `nfa.patterns().count() == nfa.pattern_len()`. + /// + /// # Example + /// + /// ``` + /// use regex_automata::nfa::thompson::NFA; + /// + /// let nfa = NFA::new_many(&["[0-9]+", "[a-z]+", "[A-Z]+"])?; + /// assert_eq!(3, nfa.pattern_len()); + /// + /// let nfa = NFA::never_match(); + /// assert_eq!(0, nfa.pattern_len()); + /// + /// let nfa = NFA::always_match(); + /// assert_eq!(1, nfa.pattern_len()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn pattern_len(&self) -> usize { + self.0.start_pattern.len() + } + + /// Return the state identifier of the initial anchored state of this NFA. + /// + /// The returned identifier is guaranteed to be a valid index into the + /// slice returned by [`NFA::states`], and is also a valid argument to + /// [`NFA::state`]. + /// + /// # Example + /// + /// This example shows a somewhat contrived example where we can easily + /// predict the anchored starting state. + /// + /// ``` + /// use regex_automata::nfa::thompson::{NFA, State, WhichCaptures}; + /// + /// let nfa = NFA::compiler() + /// .configure(NFA::config().which_captures(WhichCaptures::None)) + /// .build("a")?; + /// let state = nfa.state(nfa.start_anchored()); + /// match *state { + /// State::ByteRange { trans } => { + /// assert_eq!(b'a', trans.start); + /// assert_eq!(b'a', trans.end); + /// } + /// _ => unreachable!("unexpected state"), + /// } + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn start_anchored(&self) -> StateID { + self.0.start_anchored + } + + /// Return the state identifier of the initial unanchored state of this + /// NFA. + /// + /// This is equivalent to the identifier returned by + /// [`NFA::start_anchored`] when the NFA has no unanchored starting state. + /// + /// The returned identifier is guaranteed to be a valid index into the + /// slice returned by [`NFA::states`], and is also a valid argument to + /// [`NFA::state`]. + /// + /// # Example + /// + /// This example shows that the anchored and unanchored starting states + /// are equivalent when an anchored NFA is built. + /// + /// ``` + /// use regex_automata::nfa::thompson::NFA; + /// + /// let nfa = NFA::new("^a")?; + /// assert_eq!(nfa.start_anchored(), nfa.start_unanchored()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn start_unanchored(&self) -> StateID { + self.0.start_unanchored + } + + /// Return the state identifier of the initial anchored state for the given + /// pattern, or `None` if there is no pattern corresponding to the given + /// identifier. + /// + /// If one uses the starting state for a particular pattern, then the only + /// match that can be returned is for the corresponding pattern. + /// + /// The returned identifier is guaranteed to be a valid index into the + /// slice returned by [`NFA::states`], and is also a valid argument to + /// [`NFA::state`]. + /// + /// # Errors + /// + /// If the pattern doesn't exist in this NFA, then this returns an error. + /// This occurs when `pid.as_usize() >= nfa.pattern_len()`. + /// + /// # Example + /// + /// This example shows that the anchored and unanchored starting states + /// are equivalent when an anchored NFA is built. + /// + /// ``` + /// use regex_automata::{nfa::thompson::NFA, PatternID}; + /// + /// let nfa = NFA::new_many(&["^a", "^b"])?; + /// // The anchored and unanchored states for the entire NFA are the same, + /// // since all of the patterns are anchored. + /// assert_eq!(nfa.start_anchored(), nfa.start_unanchored()); + /// // But the anchored starting states for each pattern are distinct, + /// // because these starting states can only lead to matches for the + /// // corresponding pattern. + /// let anchored = Some(nfa.start_anchored()); + /// assert_ne!(anchored, nfa.start_pattern(PatternID::must(0))); + /// assert_ne!(anchored, nfa.start_pattern(PatternID::must(1))); + /// // Requesting a pattern not in the NFA will result in None: + /// assert_eq!(None, nfa.start_pattern(PatternID::must(2))); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn start_pattern(&self, pid: PatternID) -> Option<StateID> { + self.0.start_pattern.get(pid.as_usize()).copied() + } + + /// Get the byte class set for this NFA. + /// + /// A byte class set is a partitioning of this NFA's alphabet into + /// equivalence classes. Any two bytes in the same equivalence class are + /// guaranteed to never discriminate between a match or a non-match. (The + /// partitioning may not be minimal.) + /// + /// Byte classes are used internally by this crate when building DFAs. + /// Namely, among other optimizations, they enable a space optimization + /// where the DFA's internal alphabet is defined over the equivalence + /// classes of bytes instead of all possible byte values. The former is + /// often quite a bit smaller than the latter, which permits the DFA to use + /// less space for its transition table. + #[inline] + pub(crate) fn byte_class_set(&self) -> &ByteClassSet { + &self.0.byte_class_set + } + + /// Get the byte classes for this NFA. + /// + /// Byte classes represent a partitioning of this NFA's alphabet into + /// equivalence classes. Any two bytes in the same equivalence class are + /// guaranteed to never discriminate between a match or a non-match. (The + /// partitioning may not be minimal.) + /// + /// Byte classes are used internally by this crate when building DFAs. + /// Namely, among other optimizations, they enable a space optimization + /// where the DFA's internal alphabet is defined over the equivalence + /// classes of bytes instead of all possible byte values. The former is + /// often quite a bit smaller than the latter, which permits the DFA to use + /// less space for its transition table. + /// + /// # Example + /// + /// This example shows how to query the class of various bytes. + /// + /// ``` + /// use regex_automata::nfa::thompson::NFA; + /// + /// let nfa = NFA::new("[a-z]+")?; + /// let classes = nfa.byte_classes(); + /// // 'a' and 'z' are in the same class for this regex. + /// assert_eq!(classes.get(b'a'), classes.get(b'z')); + /// // But 'a' and 'A' are not. + /// assert_ne!(classes.get(b'a'), classes.get(b'A')); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn byte_classes(&self) -> &ByteClasses { + &self.0.byte_classes + } + + /// Return a reference to the NFA state corresponding to the given ID. + /// + /// This is a convenience routine for `nfa.states()[id]`. + /// + /// # Panics + /// + /// This panics when the given identifier does not reference a valid state. + /// That is, when `id.as_usize() >= nfa.states().len()`. + /// + /// # Example + /// + /// The anchored state for a pattern will typically correspond to a + /// capturing state for that pattern. (Although, this is not an API + /// guarantee!) + /// + /// ``` + /// use regex_automata::{nfa::thompson::{NFA, State}, PatternID}; + /// + /// let nfa = NFA::new("a")?; + /// let state = nfa.state(nfa.start_pattern(PatternID::ZERO).unwrap()); + /// match *state { + /// State::Capture { slot, .. } => { + /// assert_eq!(0, slot.as_usize()); + /// } + /// _ => unreachable!("unexpected state"), + /// } + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn state(&self, id: StateID) -> &State { + &self.states()[id] + } + + /// Returns a slice of all states in this NFA. + /// + /// The slice returned is indexed by `StateID`. This provides a convenient + /// way to access states while following transitions among those states. + /// + /// # Example + /// + /// This demonstrates that disabling UTF-8 mode can shrink the size of the + /// NFA considerably in some cases, especially when using Unicode character + /// classes. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::nfa::thompson::NFA; + /// + /// let nfa_unicode = NFA::new(r"\w")?; + /// let nfa_ascii = NFA::new(r"(?-u)\w")?; + /// // Yes, a factor of 45 difference. No lie. + /// assert!(40 * nfa_ascii.states().len() < nfa_unicode.states().len()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn states(&self) -> &[State] { + &self.0.states + } + + /// Returns the capturing group info for this NFA. + /// + /// The [`GroupInfo`] provides a way to map to and from capture index + /// and capture name for each pattern. It also provides a mapping from + /// each of the capturing groups in every pattern to their corresponding + /// slot offsets encoded in [`State::Capture`] states. + /// + /// Note that `GroupInfo` uses reference counting internally, such that + /// cloning a `GroupInfo` is very cheap. + /// + /// # Example + /// + /// This example shows how to get a list of all capture group names for + /// a particular pattern. + /// + /// ``` + /// use regex_automata::{nfa::thompson::NFA, PatternID}; + /// + /// let nfa = NFA::new(r"(a)(?P<foo>b)(c)(d)(?P<bar>e)")?; + /// // The first is the implicit group that is always unnammed. The next + /// // 5 groups are the explicit groups found in the concrete syntax above. + /// let expected = vec![None, None, Some("foo"), None, None, Some("bar")]; + /// let got: Vec<Option<&str>> = + /// nfa.group_info().pattern_names(PatternID::ZERO).collect(); + /// assert_eq!(expected, got); + /// + /// // Using an invalid pattern ID will result in nothing yielded. + /// let got = nfa.group_info().pattern_names(PatternID::must(999)).count(); + /// assert_eq!(0, got); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn group_info(&self) -> &GroupInfo { + &self.0.group_info() + } + + /// Returns true if and only if this NFA has at least one + /// [`Capture`](State::Capture) in its sequence of states. + /// + /// This is useful as a way to perform a quick test before attempting + /// something that does or does not require capture states. For example, + /// some regex engines (like the PikeVM) require capture states in order to + /// work at all. + /// + /// # Example + /// + /// This example shows a few different NFAs and whether they have captures + /// or not. + /// + /// ``` + /// use regex_automata::nfa::thompson::{NFA, WhichCaptures}; + /// + /// // Obviously has capture states. + /// let nfa = NFA::new("(a)")?; + /// assert!(nfa.has_capture()); + /// + /// // Less obviously has capture states, because every pattern has at + /// // least one anonymous capture group corresponding to the match for the + /// // entire pattern. + /// let nfa = NFA::new("a")?; + /// assert!(nfa.has_capture()); + /// + /// // Other than hand building your own NFA, this is the only way to build + /// // an NFA without capturing groups. In general, you should only do this + /// // if you don't intend to use any of the NFA-oriented regex engines. + /// // Overall, capturing groups don't have many downsides. Although they + /// // can add a bit of noise to simple NFAs, so it can be nice to disable + /// // them for debugging purposes. + /// // + /// // Notice that 'has_capture' is false here even when we have an + /// // explicit capture group in the pattern. + /// let nfa = NFA::compiler() + /// .configure(NFA::config().which_captures(WhichCaptures::None)) + /// .build("(a)")?; + /// assert!(!nfa.has_capture()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn has_capture(&self) -> bool { + self.0.has_capture + } + + /// Returns true if and only if this NFA can match the empty string. + /// When it returns false, all possible matches are guaranteed to have a + /// non-zero length. + /// + /// This is useful as cheap way to know whether code needs to handle the + /// case of a zero length match. This is particularly important when UTF-8 + /// modes are enabled, as when UTF-8 mode is enabled, empty matches that + /// split a codepoint must never be reported. This extra handling can + /// sometimes be costly, and since regexes matching an empty string are + /// somewhat rare, it can be beneficial to treat such regexes specially. + /// + /// # Example + /// + /// This example shows a few different NFAs and whether they match the + /// empty string or not. Notice the empty string isn't merely a matter + /// of a string of length literally `0`, but rather, whether a match can + /// occur between specific pairs of bytes. + /// + /// ``` + /// use regex_automata::{nfa::thompson::NFA, util::syntax}; + /// + /// // The empty regex matches the empty string. + /// let nfa = NFA::new("")?; + /// assert!(nfa.has_empty(), "empty matches empty"); + /// // The '+' repetition operator requires at least one match, and so + /// // does not match the empty string. + /// let nfa = NFA::new("a+")?; + /// assert!(!nfa.has_empty(), "+ does not match empty"); + /// // But the '*' repetition operator does. + /// let nfa = NFA::new("a*")?; + /// assert!(nfa.has_empty(), "* does match empty"); + /// // And wrapping '+' in an operator that can match an empty string also + /// // causes it to match the empty string too. + /// let nfa = NFA::new("(a+)*")?; + /// assert!(nfa.has_empty(), "+ inside of * matches empty"); + /// + /// // If a regex is just made of a look-around assertion, even if the + /// // assertion requires some kind of non-empty string around it (such as + /// // \b), then it is still treated as if it matches the empty string. + /// // Namely, if a match occurs of just a look-around assertion, then the + /// // match returned is empty. + /// let nfa = NFA::compiler() + /// .syntax(syntax::Config::new().utf8(false)) + /// .build(r"^$\A\z\b\B(?-u:\b\B)")?; + /// assert!(nfa.has_empty(), "assertions match empty"); + /// // Even when an assertion is wrapped in a '+', it still matches the + /// // empty string. + /// let nfa = NFA::new(r"\b+")?; + /// assert!(nfa.has_empty(), "+ of an assertion matches empty"); + /// + /// // An alternation with even one branch that can match the empty string + /// // is also said to match the empty string overall. + /// let nfa = NFA::new("foo|(bar)?|quux")?; + /// assert!(nfa.has_empty(), "alternations can match empty"); + /// + /// // An NFA that matches nothing does not match the empty string. + /// let nfa = NFA::new("[a&&b]")?; + /// assert!(!nfa.has_empty(), "never matching means not matching empty"); + /// // But if it's wrapped in something that doesn't require a match at + /// // all, then it can match the empty string! + /// let nfa = NFA::new("[a&&b]*")?; + /// assert!(nfa.has_empty(), "* on never-match still matches empty"); + /// // Since a '+' requires a match, using it on something that can never + /// // match will itself produce a regex that can never match anything, + /// // and thus does not match the empty string. + /// let nfa = NFA::new("[a&&b]+")?; + /// assert!(!nfa.has_empty(), "+ on never-match still matches nothing"); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn has_empty(&self) -> bool { + self.0.has_empty + } + + /// Whether UTF-8 mode is enabled for this NFA or not. + /// + /// When UTF-8 mode is enabled, all matches reported by a regex engine + /// derived from this NFA are guaranteed to correspond to spans of valid + /// UTF-8. This includes zero-width matches. For example, the regex engine + /// must guarantee that the empty regex will not match at the positions + /// between code units in the UTF-8 encoding of a single codepoint. + /// + /// See [`Config::utf8`] for more information. + /// + /// This is enabled by default. + /// + /// # Example + /// + /// This example shows how UTF-8 mode can impact the match spans that may + /// be reported in certain cases. + /// + /// ``` + /// use regex_automata::{ + /// nfa::thompson::{self, pikevm::PikeVM}, + /// Match, Input, + /// }; + /// + /// let re = PikeVM::new("")?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// + /// // UTF-8 mode is enabled by default. + /// let mut input = Input::new("☃"); + /// re.search(&mut cache, &input, &mut caps); + /// assert_eq!(Some(Match::must(0, 0..0)), caps.get_match()); + /// + /// // Even though an empty regex matches at 1..1, our next match is + /// // 3..3 because 1..1 and 2..2 split the snowman codepoint (which is + /// // three bytes long). + /// input.set_start(1); + /// re.search(&mut cache, &input, &mut caps); + /// assert_eq!(Some(Match::must(0, 3..3)), caps.get_match()); + /// + /// // But if we disable UTF-8, then we'll get matches at 1..1 and 2..2: + /// let re = PikeVM::builder() + /// .thompson(thompson::Config::new().utf8(false)) + /// .build("")?; + /// re.search(&mut cache, &input, &mut caps); + /// assert_eq!(Some(Match::must(0, 1..1)), caps.get_match()); + /// + /// input.set_start(2); + /// re.search(&mut cache, &input, &mut caps); + /// assert_eq!(Some(Match::must(0, 2..2)), caps.get_match()); + /// + /// input.set_start(3); + /// re.search(&mut cache, &input, &mut caps); + /// assert_eq!(Some(Match::must(0, 3..3)), caps.get_match()); + /// + /// input.set_start(4); + /// re.search(&mut cache, &input, &mut caps); + /// assert_eq!(None, caps.get_match()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn is_utf8(&self) -> bool { + self.0.utf8 + } + + /// Returns true when this NFA is meant to be matched in reverse. + /// + /// Generally speaking, when this is true, it means the NFA is supposed to + /// be used in conjunction with moving backwards through the haystack. That + /// is, from a higher memory address to a lower memory address. + /// + /// It is often the case that lower level routines dealing with an NFA + /// don't need to care about whether it is "meant" to be matched in reverse + /// or not. However, there are some specific cases where it matters. For + /// example, the implementation of CRLF-aware `^` and `$` line anchors + /// needs to know whether the search is in the forward or reverse + /// direction. In the forward direction, neither `^` nor `$` should match + /// when a `\r` has been seen previously and a `\n` is next. However, in + /// the reverse direction, neither `^` nor `$` should match when a `\n` + /// has been seen previously and a `\r` is next. This fundamentally changes + /// how the state machine is constructed, and thus needs to be altered + /// based on the direction of the search. + /// + /// This is automatically set when using a [`Compiler`] with a configuration + /// where [`Config::reverse`] is enabled. If you're building your own NFA + /// by hand via a [`Builder`] + #[inline] + pub fn is_reverse(&self) -> bool { + self.0.reverse + } + + /// Returns true if and only if all starting states for this NFA correspond + /// to the beginning of an anchored search. + /// + /// Typically, an NFA will have both an anchored and an unanchored starting + /// state. Namely, because it tends to be useful to have both and the cost + /// of having an unanchored starting state is almost zero (for an NFA). + /// However, if all patterns in the NFA are themselves anchored, then even + /// the unanchored starting state will correspond to an anchored search + /// since the pattern doesn't permit anything else. + /// + /// # Example + /// + /// This example shows a few different scenarios where this method's + /// return value varies. + /// + /// ``` + /// use regex_automata::nfa::thompson::NFA; + /// + /// // The unanchored starting state permits matching this pattern anywhere + /// // in a haystack, instead of just at the beginning. + /// let nfa = NFA::new("a")?; + /// assert!(!nfa.is_always_start_anchored()); + /// + /// // In this case, the pattern is itself anchored, so there is no way + /// // to run an unanchored search. + /// let nfa = NFA::new("^a")?; + /// assert!(nfa.is_always_start_anchored()); + /// + /// // When multiline mode is enabled, '^' can match at the start of a line + /// // in addition to the start of a haystack, so an unanchored search is + /// // actually possible. + /// let nfa = NFA::new("(?m)^a")?; + /// assert!(!nfa.is_always_start_anchored()); + /// + /// // Weird cases also work. A pattern is only considered anchored if all + /// // matches may only occur at the start of a haystack. + /// let nfa = NFA::new("(^a)|a")?; + /// assert!(!nfa.is_always_start_anchored()); + /// + /// // When multiple patterns are present, if they are all anchored, then + /// // the NFA is always anchored too. + /// let nfa = NFA::new_many(&["^a", "^b", "^c"])?; + /// assert!(nfa.is_always_start_anchored()); + /// + /// // But if one pattern is unanchored, then the NFA must permit an + /// // unanchored search. + /// let nfa = NFA::new_many(&["^a", "b", "^c"])?; + /// assert!(!nfa.is_always_start_anchored()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn is_always_start_anchored(&self) -> bool { + self.start_anchored() == self.start_unanchored() + } + + /// Returns the look-around matcher associated with this NFA. + /// + /// A look-around matcher determines how to match look-around assertions. + /// In particular, some assertions are configurable. For example, the + /// `(?m:^)` and `(?m:$)` assertions can have their line terminator changed + /// from the default of `\n` to any other byte. + /// + /// If the NFA was built using a [`Compiler`], then this matcher + /// can be set via the [`Config::look_matcher`] configuration + /// knob. Otherwise, if you've built an NFA by hand, it is set via + /// [`Builder::set_look_matcher`]. + /// + /// # Example + /// + /// This shows how to change the line terminator for multi-line assertions. + /// + /// ``` + /// use regex_automata::{ + /// nfa::thompson::{self, pikevm::PikeVM}, + /// util::look::LookMatcher, + /// Match, Input, + /// }; + /// + /// let mut lookm = LookMatcher::new(); + /// lookm.set_line_terminator(b'\x00'); + /// + /// let re = PikeVM::builder() + /// .thompson(thompson::Config::new().look_matcher(lookm)) + /// .build(r"(?m)^[a-z]+$")?; + /// let mut cache = re.create_cache(); + /// + /// // Multi-line assertions now use NUL as a terminator. + /// assert_eq!( + /// Some(Match::must(0, 1..4)), + /// re.find(&mut cache, b"\x00abc\x00"), + /// ); + /// // ... and \n is no longer recognized as a terminator. + /// assert_eq!( + /// None, + /// re.find(&mut cache, b"\nabc\n"), + /// ); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn look_matcher(&self) -> &LookMatcher { + &self.0.look_matcher + } + + /// Returns the union of all look-around assertions used throughout this + /// NFA. When the returned set is empty, it implies that the NFA has no + /// look-around assertions and thus zero conditional epsilon transitions. + /// + /// This is useful in some cases enabling optimizations. It is not + /// unusual, for example, for optimizations to be of the form, "for any + /// regex with zero conditional epsilon transitions, do ..." where "..." + /// is some kind of optimization. + /// + /// This isn't only helpful for optimizations either. Sometimes look-around + /// assertions are difficult to support. For example, many of the DFAs in + /// this crate don't support Unicode word boundaries or handle them using + /// heuristics. Handling that correctly typically requires some kind of + /// cheap check of whether the NFA has a Unicode word boundary in the first + /// place. + /// + /// # Example + /// + /// This example shows how this routine varies based on the regex pattern: + /// + /// ``` + /// use regex_automata::{nfa::thompson::NFA, util::look::Look}; + /// + /// // No look-around at all. + /// let nfa = NFA::new("a")?; + /// assert!(nfa.look_set_any().is_empty()); + /// + /// // When multiple patterns are present, since this returns the union, + /// // it will include look-around assertions that only appear in one + /// // pattern. + /// let nfa = NFA::new_many(&["a", "b", "a^b", "c"])?; + /// assert!(nfa.look_set_any().contains(Look::Start)); + /// + /// // Some groups of assertions have various shortcuts. For example: + /// let nfa = NFA::new(r"(?-u:\b)")?; + /// assert!(nfa.look_set_any().contains_word()); + /// assert!(!nfa.look_set_any().contains_word_unicode()); + /// assert!(nfa.look_set_any().contains_word_ascii()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn look_set_any(&self) -> LookSet { + self.0.look_set_any + } + + /// Returns the union of all prefix look-around assertions for every + /// pattern in this NFA. When the returned set is empty, it implies none of + /// the patterns require moving through a conditional epsilon transition + /// before inspecting the first byte in the haystack. + /// + /// This can be useful for determining what kinds of assertions need to be + /// satisfied at the beginning of a search. For example, typically DFAs + /// in this crate will build a distinct starting state for each possible + /// starting configuration that might result in look-around assertions + /// being satisfied differently. However, if the set returned here is + /// empty, then you know that the start state is invariant because there + /// are no conditional epsilon transitions to consider. + /// + /// # Example + /// + /// This example shows how this routine varies based on the regex pattern: + /// + /// ``` + /// use regex_automata::{nfa::thompson::NFA, util::look::Look}; + /// + /// // No look-around at all. + /// let nfa = NFA::new("a")?; + /// assert!(nfa.look_set_prefix_any().is_empty()); + /// + /// // When multiple patterns are present, since this returns the union, + /// // it will include look-around assertions that only appear in one + /// // pattern. But it will only include assertions that are in the prefix + /// // of a pattern. For example, this includes '^' but not '$' even though + /// // '$' does appear. + /// let nfa = NFA::new_many(&["a", "b", "^ab$", "c"])?; + /// assert!(nfa.look_set_prefix_any().contains(Look::Start)); + /// assert!(!nfa.look_set_prefix_any().contains(Look::End)); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn look_set_prefix_any(&self) -> LookSet { + self.0.look_set_prefix_any + } + + // FIXME: The `look_set_prefix_all` computation was not correct, and it + // seemed a little tricky to fix it. Since I wasn't actually using it for + // anything, I just decided to remove it in the run up to the regex 1.9 + // release. If you need this, please file an issue. + /* + /// Returns the intersection of all prefix look-around assertions for every + /// pattern in this NFA. When the returned set is empty, it implies at + /// least one of the patterns does not require moving through a conditional + /// epsilon transition before inspecting the first byte in the haystack. + /// Conversely, when the set contains an assertion, it implies that every + /// pattern in the NFA also contains that assertion in its prefix. + /// + /// This can be useful for determining what kinds of assertions need to be + /// satisfied at the beginning of a search. For example, if you know that + /// [`Look::Start`] is in the prefix intersection set returned here, then + /// you know that all searches, regardless of input configuration, will be + /// anchored. + /// + /// # Example + /// + /// This example shows how this routine varies based on the regex pattern: + /// + /// ``` + /// use regex_automata::{nfa::thompson::NFA, util::look::Look}; + /// + /// // No look-around at all. + /// let nfa = NFA::new("a")?; + /// assert!(nfa.look_set_prefix_all().is_empty()); + /// + /// // When multiple patterns are present, since this returns the + /// // intersection, it will only include assertions present in every + /// // prefix, and only the prefix. + /// let nfa = NFA::new_many(&["^a$", "^b$", "$^ab$", "^c$"])?; + /// assert!(nfa.look_set_prefix_all().contains(Look::Start)); + /// assert!(!nfa.look_set_prefix_all().contains(Look::End)); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn look_set_prefix_all(&self) -> LookSet { + self.0.look_set_prefix_all + } + */ + + /// Returns the memory usage, in bytes, of this NFA. + /// + /// This does **not** include the stack size used up by this NFA. To + /// compute that, use `std::mem::size_of::<NFA>()`. + /// + /// # Example + /// + /// This example shows that large Unicode character classes can use quite + /// a bit of memory. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::nfa::thompson::NFA; + /// + /// let nfa_unicode = NFA::new(r"\w")?; + /// let nfa_ascii = NFA::new(r"(?-u:\w)")?; + /// + /// assert!(10 * nfa_ascii.memory_usage() < nfa_unicode.memory_usage()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn memory_usage(&self) -> usize { + use core::mem::size_of; + + size_of::<Inner>() // allocated on the heap via Arc + + self.0.states.len() * size_of::<State>() + + self.0.start_pattern.len() * size_of::<StateID>() + + self.0.group_info.memory_usage() + + self.0.memory_extra + } +} + +impl fmt::Debug for NFA { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + self.0.fmt(f) + } +} + +/// The "inner" part of the NFA. We split this part out so that we can easily +/// wrap it in an `Arc` above in the definition of `NFA`. +/// +/// See builder.rs for the code that actually builds this type. This module +/// does provide (internal) mutable methods for adding things to this +/// NFA before finalizing it, but the high level construction process is +/// controlled by the builder abstraction. (Which is complicated enough to +/// get its own module.) +#[derive(Default)] +pub(super) struct Inner { + /// The state sequence. This sequence is guaranteed to be indexable by all + /// starting state IDs, and it is also guaranteed to contain at most one + /// `Match` state for each pattern compiled into this NFA. (A pattern may + /// not have a corresponding `Match` state if a `Match` state is impossible + /// to reach.) + states: Vec<State>, + /// The anchored starting state of this NFA. + start_anchored: StateID, + /// The unanchored starting state of this NFA. + start_unanchored: StateID, + /// The starting states for each individual pattern. Starting at any + /// of these states will result in only an anchored search for the + /// corresponding pattern. The vec is indexed by pattern ID. When the NFA + /// contains a single regex, then `start_pattern[0]` and `start_anchored` + /// are always equivalent. + start_pattern: Vec<StateID>, + /// Info about the capturing groups in this NFA. This is responsible for + /// mapping groups to slots, mapping groups to names and names to groups. + group_info: GroupInfo, + /// A representation of equivalence classes over the transitions in this + /// NFA. Two bytes in the same equivalence class must not discriminate + /// between a match or a non-match. This map can be used to shrink the + /// total size of a DFA's transition table with a small match-time cost. + /// + /// Note that the NFA's transitions are *not* defined in terms of these + /// equivalence classes. The NFA's transitions are defined on the original + /// byte values. For the most part, this is because they wouldn't really + /// help the NFA much since the NFA already uses a sparse representation + /// to represent transitions. Byte classes are most effective in a dense + /// representation. + byte_class_set: ByteClassSet, + /// This is generated from `byte_class_set`, and essentially represents the + /// same thing but supports different access patterns. Namely, this permits + /// looking up the equivalence class of a byte very cheaply. + /// + /// Ideally we would just store this, but because of annoying code + /// structure reasons, we keep both this and `byte_class_set` around for + /// now. I think I would prefer that `byte_class_set` were computed in the + /// `Builder`, but right now, we compute it as states are added to the + /// `NFA`. + byte_classes: ByteClasses, + /// Whether this NFA has a `Capture` state anywhere. + has_capture: bool, + /// When the empty string is in the language matched by this NFA. + has_empty: bool, + /// Whether UTF-8 mode is enabled for this NFA. Briefly, this means that + /// all non-empty matches produced by this NFA correspond to spans of valid + /// UTF-8, and any empty matches produced by this NFA that split a UTF-8 + /// encoded codepoint should be filtered out by the corresponding regex + /// engine. + utf8: bool, + /// Whether this NFA is meant to be matched in reverse or not. + reverse: bool, + /// The matcher to be used for look-around assertions. + look_matcher: LookMatcher, + /// The union of all look-around assertions that occur anywhere within + /// this NFA. If this set is empty, then it means there are precisely zero + /// conditional epsilon transitions in the NFA. + look_set_any: LookSet, + /// The union of all look-around assertions that occur as a zero-length + /// prefix for any of the patterns in this NFA. + look_set_prefix_any: LookSet, + /* + /// The intersection of all look-around assertions that occur as a + /// zero-length prefix for any of the patterns in this NFA. + look_set_prefix_all: LookSet, + */ + /// Heap memory used indirectly by NFA states and other things (like the + /// various capturing group representations above). Since each state + /// might use a different amount of heap, we need to keep track of this + /// incrementally. + memory_extra: usize, +} + +impl Inner { + /// Runs any last finalization bits and turns this into a full NFA. + pub(super) fn into_nfa(mut self) -> NFA { + self.byte_classes = self.byte_class_set.byte_classes(); + // Do epsilon closure from the start state of every pattern in order + // to compute various properties such as look-around assertions and + // whether the empty string can be matched. + let mut stack = vec![]; + let mut seen = SparseSet::new(self.states.len()); + for &start_id in self.start_pattern.iter() { + stack.push(start_id); + seen.clear(); + // let mut prefix_all = LookSet::full(); + let mut prefix_any = LookSet::empty(); + while let Some(sid) = stack.pop() { + if !seen.insert(sid) { + continue; + } + match self.states[sid] { + State::ByteRange { .. } + | State::Dense { .. } + | State::Fail => continue, + State::Sparse(_) => { + // This snippet below will rewrite this sparse state + // as a dense state. By doing it here, we apply this + // optimization to all hot "sparse" states since these + // are the states that are reachable from the start + // state via an epsilon closure. + // + // Unfortunately, this optimization did not seem to + // help much in some very limited ad hoc benchmarking. + // + // I left the 'Dense' state type in place in case we + // want to revisit this, but I suspect the real way + // to make forward progress is a more fundamental + // rearchitecting of how data in the NFA is laid out. + // I think we should consider a single contiguous + // allocation instead of all this indirection and + // potential heap allocations for every state. But this + // is a large re-design and will require API breaking + // changes. + // self.memory_extra -= self.states[sid].memory_usage(); + // let trans = DenseTransitions::from_sparse(sparse); + // self.states[sid] = State::Dense(trans); + // self.memory_extra += self.states[sid].memory_usage(); + continue; + } + State::Match { .. } => self.has_empty = true, + State::Look { look, next } => { + prefix_any = prefix_any.insert(look); + stack.push(next); + } + State::Union { ref alternates } => { + // Order doesn't matter here, since we're just dealing + // with look-around sets. But if we do richer analysis + // here that needs to care about preference order, then + // this should be done in reverse. + stack.extend(alternates.iter()); + } + State::BinaryUnion { alt1, alt2 } => { + stack.push(alt2); + stack.push(alt1); + } + State::Capture { next, .. } => { + stack.push(next); + } + } + } + self.look_set_prefix_any = + self.look_set_prefix_any.union(prefix_any); + } + NFA(Arc::new(self)) + } + + /// Returns the capturing group info for this NFA. + pub(super) fn group_info(&self) -> &GroupInfo { + &self.group_info + } + + /// Add the given state to this NFA after allocating a fresh identifier for + /// it. + /// + /// This panics if too many states are added such that a fresh identifier + /// could not be created. (Currently, the only caller of this routine is + /// a `Builder`, and it upholds this invariant.) + pub(super) fn add(&mut self, state: State) -> StateID { + match state { + State::ByteRange { ref trans } => { + self.byte_class_set.set_range(trans.start, trans.end); + } + State::Sparse(ref sparse) => { + for trans in sparse.transitions.iter() { + self.byte_class_set.set_range(trans.start, trans.end); + } + } + State::Dense { .. } => unreachable!(), + State::Look { look, .. } => { + self.look_matcher + .add_to_byteset(look, &mut self.byte_class_set); + self.look_set_any = self.look_set_any.insert(look); + } + State::Capture { .. } => { + self.has_capture = true; + } + State::Union { .. } + | State::BinaryUnion { .. } + | State::Fail + | State::Match { .. } => {} + } + + let id = StateID::new(self.states.len()).unwrap(); + self.memory_extra += state.memory_usage(); + self.states.push(state); + id + } + + /// Set the starting state identifiers for this NFA. + /// + /// `start_anchored` and `start_unanchored` may be equivalent. When they + /// are, then the NFA can only execute anchored searches. This might + /// occur, for example, for patterns that are unconditionally anchored. + /// e.g., `^foo`. + pub(super) fn set_starts( + &mut self, + start_anchored: StateID, + start_unanchored: StateID, + start_pattern: &[StateID], + ) { + self.start_anchored = start_anchored; + self.start_unanchored = start_unanchored; + self.start_pattern = start_pattern.to_vec(); + } + + /// Sets the UTF-8 mode of this NFA. + pub(super) fn set_utf8(&mut self, yes: bool) { + self.utf8 = yes; + } + + /// Sets the reverse mode of this NFA. + pub(super) fn set_reverse(&mut self, yes: bool) { + self.reverse = yes; + } + + /// Sets the look-around assertion matcher for this NFA. + pub(super) fn set_look_matcher(&mut self, m: LookMatcher) { + self.look_matcher = m; + } + + /// Set the capturing groups for this NFA. + /// + /// The given slice should contain the capturing groups for each pattern, + /// The capturing groups in turn should correspond to the total number of + /// capturing groups in the pattern, including the anonymous first capture + /// group for each pattern. If a capturing group does have a name, then it + /// should be provided as a Arc<str>. + /// + /// This returns an error if a corresponding `GroupInfo` could not be + /// built. + pub(super) fn set_captures( + &mut self, + captures: &[Vec<Option<Arc<str>>>], + ) -> Result<(), GroupInfoError> { + self.group_info = GroupInfo::new( + captures.iter().map(|x| x.iter().map(|y| y.as_ref())), + )?; + Ok(()) + } + + /// Remap the transitions in every state of this NFA using the given map. + /// The given map should be indexed according to state ID namespace used by + /// the transitions of the states currently in this NFA. + /// + /// This is particularly useful to the NFA builder, since it is convenient + /// to add NFA states in order to produce their final IDs. Then, after all + /// of the intermediate "empty" states (unconditional epsilon transitions) + /// have been removed from the builder's representation, we can re-map all + /// of the transitions in the states already added to their final IDs. + pub(super) fn remap(&mut self, old_to_new: &[StateID]) { + for state in &mut self.states { + state.remap(old_to_new); + } + self.start_anchored = old_to_new[self.start_anchored]; + self.start_unanchored = old_to_new[self.start_unanchored]; + for id in self.start_pattern.iter_mut() { + *id = old_to_new[*id]; + } + } +} + +impl fmt::Debug for Inner { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + writeln!(f, "thompson::NFA(")?; + for (sid, state) in self.states.iter().with_state_ids() { + let status = if sid == self.start_anchored { + '^' + } else if sid == self.start_unanchored { + '>' + } else { + ' ' + }; + writeln!(f, "{}{:06?}: {:?}", status, sid.as_usize(), state)?; + } + let pattern_len = self.start_pattern.len(); + if pattern_len > 1 { + writeln!(f, "")?; + for pid in 0..pattern_len { + let sid = self.start_pattern[pid]; + writeln!(f, "START({:06?}): {:?}", pid, sid.as_usize())?; + } + } + writeln!(f, "")?; + writeln!( + f, + "transition equivalence classes: {:?}", + self.byte_classes, + )?; + writeln!(f, ")")?; + Ok(()) + } +} + +/// A state in an NFA. +/// +/// In theory, it can help to conceptualize an `NFA` as a graph consisting of +/// `State`s. Each `State` contains its complete set of outgoing transitions. +/// +/// In practice, it can help to conceptualize an `NFA` as a sequence of +/// instructions for a virtual machine. Each `State` says what to do and where +/// to go next. +/// +/// Strictly speaking, the practical interpretation is the most correct one, +/// because of the [`Capture`](State::Capture) state. Namely, a `Capture` +/// state always forwards execution to another state unconditionally. Its only +/// purpose is to cause a side effect: the recording of the current input +/// position at a particular location in memory. In this sense, an `NFA` +/// has more power than a theoretical non-deterministic finite automaton. +/// +/// For most uses of this crate, it is likely that one may never even need to +/// be aware of this type at all. The main use cases for looking at `State`s +/// directly are if you need to write your own search implementation or if you +/// need to do some kind of analysis on the NFA. +#[derive(Clone, Eq, PartialEq)] +pub enum State { + /// A state with a single transition that can only be taken if the current + /// input symbol is in a particular range of bytes. + ByteRange { + /// The transition from this state to the next. + trans: Transition, + }, + /// A state with possibly many transitions represented in a sparse fashion. + /// Transitions are non-overlapping and ordered lexicographically by input + /// range. + /// + /// In practice, this is used for encoding UTF-8 automata. Its presence is + /// primarily an optimization that avoids many additional unconditional + /// epsilon transitions (via [`Union`](State::Union) states), and thus + /// decreases the overhead of traversing the NFA. This can improve both + /// matching time and DFA construction time. + Sparse(SparseTransitions), + /// A dense representation of a state with multiple transitions. + Dense(DenseTransitions), + /// A conditional epsilon transition satisfied via some sort of + /// look-around. Look-around is limited to anchor and word boundary + /// assertions. + /// + /// Look-around states are meant to be evaluated while performing epsilon + /// closure (computing the set of states reachable from a particular state + /// via only epsilon transitions). If the current position in the haystack + /// satisfies the look-around assertion, then you're permitted to follow + /// that epsilon transition. + Look { + /// The look-around assertion that must be satisfied before moving + /// to `next`. + look: Look, + /// The state to transition to if the look-around assertion is + /// satisfied. + next: StateID, + }, + /// An alternation such that there exists an epsilon transition to all + /// states in `alternates`, where matches found via earlier transitions + /// are preferred over later transitions. + Union { + /// An ordered sequence of unconditional epsilon transitions to other + /// states. Transitions earlier in the sequence are preferred over + /// transitions later in the sequence. + alternates: Box<[StateID]>, + }, + /// An alternation such that there exists precisely two unconditional + /// epsilon transitions, where matches found via `alt1` are preferred over + /// matches found via `alt2`. + /// + /// This state exists as a common special case of Union where there are + /// only two alternates. In this case, we don't need any allocations to + /// represent the state. This saves a bit of memory and also saves an + /// additional memory access when traversing the NFA. + BinaryUnion { + /// An unconditional epsilon transition to another NFA state. This + /// is preferred over `alt2`. + alt1: StateID, + /// An unconditional epsilon transition to another NFA state. Matches + /// reported via this transition should only be reported if no matches + /// were found by following `alt1`. + alt2: StateID, + }, + /// An empty state that records a capture location. + /// + /// From the perspective of finite automata, this is precisely equivalent + /// to an unconditional epsilon transition, but serves the purpose of + /// instructing NFA simulations to record additional state when the finite + /// state machine passes through this epsilon transition. + /// + /// `slot` in this context refers to the specific capture group slot + /// offset that is being recorded. Each capturing group has two slots + /// corresponding to the start and end of the matching portion of that + /// group. + /// + /// The pattern ID and capture group index are also included in this state + /// in case they are useful. But mostly, all you'll need is `next` and + /// `slot`. + Capture { + /// The state to transition to, unconditionally. + next: StateID, + /// The pattern ID that this capture belongs to. + pattern_id: PatternID, + /// The capture group index that this capture belongs to. Capture group + /// indices are local to each pattern. For example, when capturing + /// groups are enabled, every pattern has a capture group at index + /// `0`. + group_index: SmallIndex, + /// The slot index for this capture. Every capturing group has two + /// slots: one for the start haystack offset and one for the end + /// haystack offset. Unlike capture group indices, slot indices are + /// global across all patterns in this NFA. That is, each slot belongs + /// to a single pattern, but there is only one slot at index `i`. + slot: SmallIndex, + }, + /// A state that cannot be transitioned out of. This is useful for cases + /// where you want to prevent matching from occurring. For example, if your + /// regex parser permits empty character classes, then one could choose + /// a `Fail` state to represent them. (An empty character class can be + /// thought of as an empty set. Since nothing is in an empty set, they can + /// never match anything.) + Fail, + /// A match state. There is at least one such occurrence of this state for + /// each regex that can match that is in this NFA. + Match { + /// The matching pattern ID. + pattern_id: PatternID, + }, +} + +impl State { + /// Returns true if and only if this state contains one or more epsilon + /// transitions. + /// + /// In practice, a state has no outgoing transitions (like `Match`), has + /// only non-epsilon transitions (like `ByteRange`) or has only epsilon + /// transitions (like `Union`). + /// + /// # Example + /// + /// ``` + /// use regex_automata::{ + /// nfa::thompson::{State, Transition}, + /// util::primitives::{PatternID, StateID, SmallIndex}, + /// }; + /// + /// // Capture states are epsilon transitions. + /// let state = State::Capture { + /// next: StateID::ZERO, + /// pattern_id: PatternID::ZERO, + /// group_index: SmallIndex::ZERO, + /// slot: SmallIndex::ZERO, + /// }; + /// assert!(state.is_epsilon()); + /// + /// // ByteRange states are not. + /// let state = State::ByteRange { + /// trans: Transition { start: b'a', end: b'z', next: StateID::ZERO }, + /// }; + /// assert!(!state.is_epsilon()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn is_epsilon(&self) -> bool { + match *self { + State::ByteRange { .. } + | State::Sparse { .. } + | State::Dense { .. } + | State::Fail + | State::Match { .. } => false, + State::Look { .. } + | State::Union { .. } + | State::BinaryUnion { .. } + | State::Capture { .. } => true, + } + } + + /// Returns the heap memory usage of this NFA state in bytes. + fn memory_usage(&self) -> usize { + match *self { + State::ByteRange { .. } + | State::Look { .. } + | State::BinaryUnion { .. } + | State::Capture { .. } + | State::Match { .. } + | State::Fail => 0, + State::Sparse(SparseTransitions { ref transitions }) => { + transitions.len() * mem::size_of::<Transition>() + } + State::Dense { .. } => 256 * mem::size_of::<StateID>(), + State::Union { ref alternates } => { + alternates.len() * mem::size_of::<StateID>() + } + } + } + + /// Remap the transitions in this state using the given map. Namely, the + /// given map should be indexed according to the transitions currently + /// in this state. + /// + /// This is used during the final phase of the NFA compiler, which turns + /// its intermediate NFA into the final NFA. + fn remap(&mut self, remap: &[StateID]) { + match *self { + State::ByteRange { ref mut trans } => { + trans.next = remap[trans.next] + } + State::Sparse(SparseTransitions { ref mut transitions }) => { + for t in transitions.iter_mut() { + t.next = remap[t.next]; + } + } + State::Dense(DenseTransitions { ref mut transitions }) => { + for sid in transitions.iter_mut() { + *sid = remap[*sid]; + } + } + State::Look { ref mut next, .. } => *next = remap[*next], + State::Union { ref mut alternates } => { + for alt in alternates.iter_mut() { + *alt = remap[*alt]; + } + } + State::BinaryUnion { ref mut alt1, ref mut alt2 } => { + *alt1 = remap[*alt1]; + *alt2 = remap[*alt2]; + } + State::Capture { ref mut next, .. } => *next = remap[*next], + State::Fail => {} + State::Match { .. } => {} + } + } +} + +impl fmt::Debug for State { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match *self { + State::ByteRange { ref trans } => trans.fmt(f), + State::Sparse(SparseTransitions { ref transitions }) => { + let rs = transitions + .iter() + .map(|t| format!("{:?}", t)) + .collect::<Vec<String>>() + .join(", "); + write!(f, "sparse({})", rs) + } + State::Dense(ref dense) => { + write!(f, "dense(")?; + for (i, t) in dense.iter().enumerate() { + if i > 0 { + write!(f, ", ")?; + } + write!(f, "{:?}", t)?; + } + write!(f, ")") + } + State::Look { ref look, next } => { + write!(f, "{:?} => {:?}", look, next.as_usize()) + } + State::Union { ref alternates } => { + let alts = alternates + .iter() + .map(|id| format!("{:?}", id.as_usize())) + .collect::<Vec<String>>() + .join(", "); + write!(f, "union({})", alts) + } + State::BinaryUnion { alt1, alt2 } => { + write!( + f, + "binary-union({}, {})", + alt1.as_usize(), + alt2.as_usize() + ) + } + State::Capture { next, pattern_id, group_index, slot } => { + write!( + f, + "capture(pid={:?}, group={:?}, slot={:?}) => {:?}", + pattern_id.as_usize(), + group_index.as_usize(), + slot.as_usize(), + next.as_usize(), + ) + } + State::Fail => write!(f, "FAIL"), + State::Match { pattern_id } => { + write!(f, "MATCH({:?})", pattern_id.as_usize()) + } + } + } +} + +/// A sequence of transitions used to represent a sparse state. +/// +/// This is the primary representation of a [`Sparse`](State::Sparse) state. +/// It corresponds to a sorted sequence of transitions with non-overlapping +/// byte ranges. If the byte at the current position in the haystack matches +/// one of the byte ranges, then the finite state machine should take the +/// corresponding transition. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct SparseTransitions { + /// The sorted sequence of non-overlapping transitions. + pub transitions: Box<[Transition]>, +} + +impl SparseTransitions { + /// This follows the matching transition for a particular byte. + /// + /// The matching transition is found by looking for a matching byte + /// range (there is at most one) corresponding to the position `at` in + /// `haystack`. + /// + /// If `at >= haystack.len()`, then this returns `None`. + #[inline] + pub fn matches(&self, haystack: &[u8], at: usize) -> Option<StateID> { + haystack.get(at).and_then(|&b| self.matches_byte(b)) + } + + /// This follows the matching transition for any member of the alphabet. + /// + /// The matching transition is found by looking for a matching byte + /// range (there is at most one) corresponding to the position `at` in + /// `haystack`. If the given alphabet unit is [`EOI`](alphabet::Unit::eoi), + /// then this always returns `None`. + #[inline] + pub(crate) fn matches_unit( + &self, + unit: alphabet::Unit, + ) -> Option<StateID> { + unit.as_u8().map_or(None, |byte| self.matches_byte(byte)) + } + + /// This follows the matching transition for a particular byte. + /// + /// The matching transition is found by looking for a matching byte range + /// (there is at most one) corresponding to the byte given. + #[inline] + pub fn matches_byte(&self, byte: u8) -> Option<StateID> { + for t in self.transitions.iter() { + if t.start > byte { + break; + } else if t.matches_byte(byte) { + return Some(t.next); + } + } + None + + /* + // This is an alternative implementation that uses binary search. In + // some ad hoc experiments, like + // + // smallishru=OpenSubtitles2018.raw.sample.smallish.ru + // regex-cli find nfa thompson pikevm -b "@$smallishru" '\b\w+\b' + // + // I could not observe any improvement, and in fact, things seemed to + // be a bit slower. I can see an improvement in at least one benchmark: + // + // allcpssmall=all-codepoints-utf8-10x + // regex-cli find nfa thompson pikevm @$allcpssmall '\pL{100}' + // + // Where total search time goes from 3.2s to 2.4s when using binary + // search. + self.transitions + .binary_search_by(|t| { + if t.end < byte { + core::cmp::Ordering::Less + } else if t.start > byte { + core::cmp::Ordering::Greater + } else { + core::cmp::Ordering::Equal + } + }) + .ok() + .map(|i| self.transitions[i].next) + */ + } +} + +/// A sequence of transitions used to represent a dense state. +/// +/// This is the primary representation of a [`Dense`](State::Dense) state. It +/// provides constant time matching. That is, given a byte in a haystack and +/// a `DenseTransitions`, one can determine if the state matches in constant +/// time. +/// +/// This is in contrast to `SparseTransitions`, whose time complexity is +/// necessarily bigger than constant time. Also in contrast, `DenseTransitions` +/// usually requires (much) more heap memory. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct DenseTransitions { + /// A dense representation of this state's transitions on the heap. This + /// always has length 256. + pub transitions: Box<[StateID]>, +} + +impl DenseTransitions { + /// This follows the matching transition for a particular byte. + /// + /// The matching transition is found by looking for a transition that + /// doesn't correspond to `StateID::ZERO` for the byte `at` the given + /// position in `haystack`. + /// + /// If `at >= haystack.len()`, then this returns `None`. + #[inline] + pub fn matches(&self, haystack: &[u8], at: usize) -> Option<StateID> { + haystack.get(at).and_then(|&b| self.matches_byte(b)) + } + + /// This follows the matching transition for any member of the alphabet. + /// + /// The matching transition is found by looking for a transition that + /// doesn't correspond to `StateID::ZERO` for the byte `at` the given + /// position in `haystack`. + /// + /// If `at >= haystack.len()` or if the given alphabet unit is + /// [`EOI`](alphabet::Unit::eoi), then this returns `None`. + #[inline] + pub(crate) fn matches_unit( + &self, + unit: alphabet::Unit, + ) -> Option<StateID> { + unit.as_u8().map_or(None, |byte| self.matches_byte(byte)) + } + + /// This follows the matching transition for a particular byte. + /// + /// The matching transition is found by looking for a transition that + /// doesn't correspond to `StateID::ZERO` for the given `byte`. + /// + /// If `at >= haystack.len()`, then this returns `None`. + #[inline] + pub fn matches_byte(&self, byte: u8) -> Option<StateID> { + let next = self.transitions[usize::from(byte)]; + if next == StateID::ZERO { + None + } else { + Some(next) + } + } + + /* + /// The dense state optimization isn't currently enabled, so permit a + /// little bit of dead code. + pub(crate) fn from_sparse(sparse: &SparseTransitions) -> DenseTransitions { + let mut dense = vec![StateID::ZERO; 256]; + for t in sparse.transitions.iter() { + for b in t.start..=t.end { + dense[usize::from(b)] = t.next; + } + } + DenseTransitions { transitions: dense.into_boxed_slice() } + } + */ + + /// Returns an iterator over all transitions that don't point to + /// `StateID::ZERO`. + pub(crate) fn iter(&self) -> impl Iterator<Item = Transition> + '_ { + use crate::util::int::Usize; + self.transitions + .iter() + .enumerate() + .filter(|&(_, &sid)| sid != StateID::ZERO) + .map(|(byte, &next)| Transition { + start: byte.as_u8(), + end: byte.as_u8(), + next, + }) + } +} + +/// A single transition to another state. +/// +/// This transition may only be followed if the current byte in the haystack +/// falls in the inclusive range of bytes specified. +#[derive(Clone, Copy, Eq, Hash, PartialEq)] +pub struct Transition { + /// The inclusive start of the byte range. + pub start: u8, + /// The inclusive end of the byte range. + pub end: u8, + /// The identifier of the state to transition to. + pub next: StateID, +} + +impl Transition { + /// Returns true if the position `at` in `haystack` falls in this + /// transition's range of bytes. + /// + /// If `at >= haystack.len()`, then this returns `false`. + pub fn matches(&self, haystack: &[u8], at: usize) -> bool { + haystack.get(at).map_or(false, |&b| self.matches_byte(b)) + } + + /// Returns true if the given alphabet unit falls in this transition's + /// range of bytes. If the given unit is [`EOI`](alphabet::Unit::eoi), then + /// this returns `false`. + pub fn matches_unit(&self, unit: alphabet::Unit) -> bool { + unit.as_u8().map_or(false, |byte| self.matches_byte(byte)) + } + + /// Returns true if the given byte falls in this transition's range of + /// bytes. + pub fn matches_byte(&self, byte: u8) -> bool { + self.start <= byte && byte <= self.end + } +} + +impl fmt::Debug for Transition { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + use crate::util::escape::DebugByte; + + let Transition { start, end, next } = *self; + if self.start == self.end { + write!(f, "{:?} => {:?}", DebugByte(start), next.as_usize()) + } else { + write!( + f, + "{:?}-{:?} => {:?}", + DebugByte(start), + DebugByte(end), + next.as_usize(), + ) + } + } +} + +/// An iterator over all pattern IDs in an NFA. +/// +/// This iterator is created by [`NFA::patterns`]. +/// +/// The lifetime parameter `'a` refers to the lifetime of the NFA from which +/// this pattern iterator was created. +#[derive(Debug)] +pub struct PatternIter<'a> { + it: PatternIDIter, + /// We explicitly associate a lifetime with this iterator even though we + /// don't actually borrow anything from the NFA. We do this for backward + /// compatibility purposes. If we ever do need to borrow something from + /// the NFA, then we can and just get rid of this marker without breaking + /// the public API. + _marker: core::marker::PhantomData<&'a ()>, +} + +impl<'a> Iterator for PatternIter<'a> { + type Item = PatternID; + + fn next(&mut self) -> Option<PatternID> { + self.it.next() + } +} + +#[cfg(all(test, feature = "nfa-pikevm"))] +mod tests { + use super::*; + use crate::{nfa::thompson::pikevm::PikeVM, Input}; + + // This asserts that an NFA state doesn't have its size changed. It is + // *really* easy to accidentally increase the size, and thus potentially + // dramatically increase the memory usage of every NFA. + // + // This assert doesn't mean we absolutely cannot increase the size of an + // NFA state. We can. It's just here to make sure we do it knowingly and + // intentionally. + #[test] + fn state_has_small_size() { + #[cfg(target_pointer_width = "64")] + assert_eq!(24, core::mem::size_of::<State>()); + #[cfg(target_pointer_width = "32")] + assert_eq!(20, core::mem::size_of::<State>()); + } + + #[test] + fn always_match() { + let re = PikeVM::new_from_nfa(NFA::always_match()).unwrap(); + let mut cache = re.create_cache(); + let mut caps = re.create_captures(); + let mut find = |haystack, start, end| { + let input = Input::new(haystack).range(start..end); + re.search(&mut cache, &input, &mut caps); + caps.get_match().map(|m| m.end()) + }; + + assert_eq!(Some(0), find("", 0, 0)); + assert_eq!(Some(0), find("a", 0, 1)); + assert_eq!(Some(1), find("a", 1, 1)); + assert_eq!(Some(0), find("ab", 0, 2)); + assert_eq!(Some(1), find("ab", 1, 2)); + assert_eq!(Some(2), find("ab", 2, 2)); + } + + #[test] + fn never_match() { + let re = PikeVM::new_from_nfa(NFA::never_match()).unwrap(); + let mut cache = re.create_cache(); + let mut caps = re.create_captures(); + let mut find = |haystack, start, end| { + let input = Input::new(haystack).range(start..end); + re.search(&mut cache, &input, &mut caps); + caps.get_match().map(|m| m.end()) + }; + + assert_eq!(None, find("", 0, 0)); + assert_eq!(None, find("a", 0, 1)); + assert_eq!(None, find("a", 1, 1)); + assert_eq!(None, find("ab", 0, 2)); + assert_eq!(None, find("ab", 1, 2)); + assert_eq!(None, find("ab", 2, 2)); + } +} diff --git a/third_party/rust/regex-automata/src/nfa/thompson/pikevm.rs b/third_party/rust/regex-automata/src/nfa/thompson/pikevm.rs new file mode 100644 index 0000000000..0128c151ae --- /dev/null +++ b/third_party/rust/regex-automata/src/nfa/thompson/pikevm.rs @@ -0,0 +1,2359 @@ +/*! +An NFA backed Pike VM for executing regex searches with capturing groups. + +This module provides a [`PikeVM`] that works by simulating an NFA and +resolving all spans of capturing groups that participate in a match. +*/ + +#[cfg(feature = "internal-instrument-pikevm")] +use core::cell::RefCell; + +use alloc::{vec, vec::Vec}; + +use crate::{ + nfa::thompson::{self, BuildError, State, NFA}, + util::{ + captures::Captures, + empty, iter, + prefilter::Prefilter, + primitives::{NonMaxUsize, PatternID, SmallIndex, StateID}, + search::{ + Anchored, HalfMatch, Input, Match, MatchKind, PatternSet, Span, + }, + sparse_set::SparseSet, + }, +}; + +/// A simple macro for conditionally executing instrumentation logic when +/// the 'trace' log level is enabled. This is a compile-time no-op when the +/// 'internal-instrument-pikevm' feature isn't enabled. The intent here is that +/// this makes it easier to avoid doing extra work when instrumentation isn't +/// enabled. +/// +/// This macro accepts a closure of type `|&mut Counters|`. The closure can +/// then increment counters (or whatever) in accordance with what one wants +/// to track. +macro_rules! instrument { + ($fun:expr) => { + #[cfg(feature = "internal-instrument-pikevm")] + { + let fun: &mut dyn FnMut(&mut Counters) = &mut $fun; + COUNTERS.with(|c: &RefCell<Counters>| fun(&mut *c.borrow_mut())); + } + }; +} + +#[cfg(feature = "internal-instrument-pikevm")] +std::thread_local! { + /// Effectively global state used to keep track of instrumentation + /// counters. The "proper" way to do this is to thread it through the + /// PikeVM, but it makes the code quite icky. Since this is just a + /// debugging feature, we're content to relegate it to thread local + /// state. When instrumentation is enabled, the counters are reset at the + /// beginning of every search and printed (with the 'trace' log level) at + /// the end of every search. + static COUNTERS: RefCell<Counters> = RefCell::new(Counters::empty()); +} + +/// The configuration used for building a [`PikeVM`]. +/// +/// A PikeVM configuration is a simple data object that is typically used with +/// [`Builder::configure`]. It can be cheaply cloned. +/// +/// A default configuration can be created either with `Config::new`, or +/// perhaps more conveniently, with [`PikeVM::config`]. +#[derive(Clone, Debug, Default)] +pub struct Config { + match_kind: Option<MatchKind>, + pre: Option<Option<Prefilter>>, +} + +impl Config { + /// Return a new default PikeVM configuration. + pub fn new() -> Config { + Config::default() + } + + /// Set the desired match semantics. + /// + /// The default is [`MatchKind::LeftmostFirst`], which corresponds to the + /// match semantics of Perl-like regex engines. That is, when multiple + /// patterns would match at the same leftmost position, the pattern that + /// appears first in the concrete syntax is chosen. + /// + /// Currently, the only other kind of match semantics supported is + /// [`MatchKind::All`]. This corresponds to "classical DFA" construction + /// where all possible matches are visited in the NFA by the `PikeVM`. + /// + /// Typically, `All` is used when one wants to execute an overlapping + /// search and `LeftmostFirst` otherwise. In particular, it rarely makes + /// sense to use `All` with the various "leftmost" find routines, since the + /// leftmost routines depend on the `LeftmostFirst` automata construction + /// strategy. Specifically, `LeftmostFirst` results in the `PikeVM` + /// simulating dead states as a way to terminate the search and report a + /// match. `LeftmostFirst` also supports non-greedy matches using this + /// strategy where as `All` does not. + pub fn match_kind(mut self, kind: MatchKind) -> Config { + self.match_kind = Some(kind); + self + } + + /// Set a prefilter to be used whenever a start state is entered. + /// + /// A [`Prefilter`] in this context is meant to accelerate searches by + /// looking for literal prefixes that every match for the corresponding + /// pattern (or patterns) must start with. Once a prefilter produces a + /// match, the underlying search routine continues on to try and confirm + /// the match. + /// + /// Be warned that setting a prefilter does not guarantee that the search + /// will be faster. While it's usually a good bet, if the prefilter + /// produces a lot of false positive candidates (i.e., positions matched + /// by the prefilter but not by the regex), then the overall result can + /// be slower than if you had just executed the regex engine without any + /// prefilters. + /// + /// By default no prefilter is set. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{ + /// nfa::thompson::pikevm::PikeVM, + /// util::prefilter::Prefilter, + /// Input, Match, MatchKind, + /// }; + /// + /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["foo", "bar"]); + /// let re = PikeVM::builder() + /// .configure(PikeVM::config().prefilter(pre)) + /// .build(r"(foo|bar)[a-z]+")?; + /// let mut cache = re.create_cache(); + /// let input = Input::new("foo1 barfox bar"); + /// assert_eq!(Some(Match::must(0, 5..11)), re.find(&mut cache, input)); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// Be warned though that an incorrect prefilter can lead to incorrect + /// results! + /// + /// ``` + /// use regex_automata::{ + /// nfa::thompson::pikevm::PikeVM, + /// util::prefilter::Prefilter, + /// Input, HalfMatch, MatchKind, + /// }; + /// + /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["foo", "car"]); + /// let re = PikeVM::builder() + /// .configure(PikeVM::config().prefilter(pre)) + /// .build(r"(foo|bar)[a-z]+")?; + /// let mut cache = re.create_cache(); + /// let input = Input::new("foo1 barfox bar"); + /// // No match reported even though there clearly is one! + /// assert_eq!(None, re.find(&mut cache, input)); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn prefilter(mut self, pre: Option<Prefilter>) -> Config { + self.pre = Some(pre); + self + } + + /// Returns the match semantics set in this configuration. + pub fn get_match_kind(&self) -> MatchKind { + self.match_kind.unwrap_or(MatchKind::LeftmostFirst) + } + + /// Returns the prefilter set in this configuration, if one at all. + pub fn get_prefilter(&self) -> Option<&Prefilter> { + self.pre.as_ref().unwrap_or(&None).as_ref() + } + + /// Overwrite the default configuration such that the options in `o` are + /// always used. If an option in `o` is not set, then the corresponding + /// option in `self` is used. If it's not set in `self` either, then it + /// remains not set. + pub(crate) fn overwrite(&self, o: Config) -> Config { + Config { + match_kind: o.match_kind.or(self.match_kind), + pre: o.pre.or_else(|| self.pre.clone()), + } + } +} + +/// A builder for a `PikeVM`. +/// +/// This builder permits configuring options for the syntax of a pattern, +/// the NFA construction and the `PikeVM` construction. This builder is +/// different from a general purpose regex builder in that it permits fine +/// grain configuration of the construction process. The trade off for this is +/// complexity, and the possibility of setting a configuration that might not +/// make sense. For example, there are two different UTF-8 modes: +/// +/// * [`util::syntax::Config::utf8`](crate::util::syntax::Config::utf8) +/// controls whether the pattern itself can contain sub-expressions that match +/// invalid UTF-8. +/// * [`thompson::Config::utf8`] controls whether empty matches that split a +/// Unicode codepoint are reported or not. +/// +/// Generally speaking, callers will want to either enable all of these or +/// disable all of these. +/// +/// # Example +/// +/// This example shows how to disable UTF-8 mode in the syntax and the regex +/// itself. This is generally what you want for matching on arbitrary bytes. +/// +/// ``` +/// use regex_automata::{ +/// nfa::thompson::{self, pikevm::PikeVM}, +/// util::syntax, +/// Match, +/// }; +/// +/// let re = PikeVM::builder() +/// .syntax(syntax::Config::new().utf8(false)) +/// .thompson(thompson::Config::new().utf8(false)) +/// .build(r"foo(?-u:[^b])ar.*")?; +/// let mut cache = re.create_cache(); +/// +/// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n"; +/// let expected = Some(Match::must(0, 1..9)); +/// let got = re.find_iter(&mut cache, haystack).next(); +/// assert_eq!(expected, got); +/// // Notice that `(?-u:[^b])` matches invalid UTF-8, +/// // but the subsequent `.*` does not! Disabling UTF-8 +/// // on the syntax permits this. +/// // +/// // N.B. This example does not show the impact of +/// // disabling UTF-8 mode on a PikeVM Config, since that +/// // only impacts regexes that can produce matches of +/// // length 0. +/// assert_eq!(b"foo\xFFarzz", &haystack[got.unwrap().range()]); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +#[derive(Clone, Debug)] +pub struct Builder { + config: Config, + #[cfg(feature = "syntax")] + thompson: thompson::Compiler, +} + +impl Builder { + /// Create a new PikeVM builder with its default configuration. + pub fn new() -> Builder { + Builder { + config: Config::default(), + #[cfg(feature = "syntax")] + thompson: thompson::Compiler::new(), + } + } + + /// Build a `PikeVM` from the given pattern. + /// + /// If there was a problem parsing or compiling the pattern, then an error + /// is returned. + #[cfg(feature = "syntax")] + pub fn build(&self, pattern: &str) -> Result<PikeVM, BuildError> { + self.build_many(&[pattern]) + } + + /// Build a `PikeVM` from the given patterns. + #[cfg(feature = "syntax")] + pub fn build_many<P: AsRef<str>>( + &self, + patterns: &[P], + ) -> Result<PikeVM, BuildError> { + let nfa = self.thompson.build_many(patterns)?; + self.build_from_nfa(nfa) + } + + /// Build a `PikeVM` directly from its NFA. + /// + /// Note that when using this method, any configuration that applies to the + /// construction of the NFA itself will of course be ignored, since the NFA + /// given here is already built. + pub fn build_from_nfa(&self, nfa: NFA) -> Result<PikeVM, BuildError> { + nfa.look_set_any().available().map_err(BuildError::word)?; + Ok(PikeVM { config: self.config.clone(), nfa }) + } + + /// Apply the given `PikeVM` configuration options to this builder. + pub fn configure(&mut self, config: Config) -> &mut Builder { + self.config = self.config.overwrite(config); + self + } + + /// Set the syntax configuration for this builder using + /// [`syntax::Config`](crate::util::syntax::Config). + /// + /// This permits setting things like case insensitivity, Unicode and multi + /// line mode. + /// + /// These settings only apply when constructing a PikeVM directly from a + /// pattern. + #[cfg(feature = "syntax")] + pub fn syntax( + &mut self, + config: crate::util::syntax::Config, + ) -> &mut Builder { + self.thompson.syntax(config); + self + } + + /// Set the Thompson NFA configuration for this builder using + /// [`nfa::thompson::Config`](crate::nfa::thompson::Config). + /// + /// This permits setting things like if additional time should be spent + /// shrinking the size of the NFA. + /// + /// These settings only apply when constructing a PikeVM directly from a + /// pattern. + #[cfg(feature = "syntax")] + pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder { + self.thompson.configure(config); + self + } +} + +/// A virtual machine for executing regex searches with capturing groups. +/// +/// # Infallible APIs +/// +/// Unlike most other regex engines in this crate, a `PikeVM` never returns an +/// error at search time. It supports all [`Anchored`] configurations, never +/// quits and works on haystacks of arbitrary length. +/// +/// There are two caveats to mention though: +/// +/// * If an invalid pattern ID is given to a search via [`Anchored::Pattern`], +/// then the PikeVM will report "no match." This is consistent with all other +/// regex engines in this crate. +/// * When using [`PikeVM::which_overlapping_matches`] with a [`PatternSet`] +/// that has insufficient capacity to store all valid pattern IDs, then if a +/// match occurs for a `PatternID` that cannot be inserted, it is silently +/// dropped as if it did not match. +/// +/// # Advice +/// +/// The `PikeVM` is generally the most "powerful" regex engine in this crate. +/// "Powerful" in this context means that it can handle any regular expression +/// that is parseable by `regex-syntax` and any size haystack. Regretably, +/// the `PikeVM` is also simultaneously often the _slowest_ regex engine in +/// practice. This results in an annoying situation where one generally tries +/// to pick any other regex engine (or perhaps none at all) before being +/// forced to fall back to a `PikeVM`. +/// +/// For example, a common strategy for dealing with capturing groups is to +/// actually look for the overall match of the regex using a faster regex +/// engine, like a [lazy DFA](crate::hybrid::regex::Regex). Once the overall +/// match is found, one can then run the `PikeVM` on just the match span to +/// find the spans of the capturing groups. In this way, the faster regex +/// engine does the majority of the work, while the `PikeVM` only lends its +/// power in a more limited role. +/// +/// Unfortunately, this isn't always possible because the faster regex engines +/// don't support all of the regex features in `regex-syntax`. This notably +/// includes (and is currently limited to) Unicode word boundaries. So if +/// your pattern has Unicode word boundaries, you typically can't use a +/// DFA-based regex engine at all (unless you [enable heuristic support for +/// it](crate::hybrid::dfa::Config::unicode_word_boundary)). (The [one-pass +/// DFA](crate::dfa::onepass::DFA) can handle Unicode word boundaries for +/// anchored searches only, but in a cruel sort of joke, many Unicode features +/// tend to result in making the regex _not_ one-pass.) +/// +/// # Example +/// +/// This example shows that the `PikeVM` implements Unicode word boundaries +/// correctly by default. +/// +/// ``` +/// # if cfg!(miri) { return Ok(()); } // miri takes too long +/// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; +/// +/// let re = PikeVM::new(r"\b\w+\b")?; +/// let mut cache = re.create_cache(); +/// +/// let mut it = re.find_iter(&mut cache, "Шерлок Холмс"); +/// assert_eq!(Some(Match::must(0, 0..12)), it.next()); +/// assert_eq!(Some(Match::must(0, 13..23)), it.next()); +/// assert_eq!(None, it.next()); +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +#[derive(Clone, Debug)] +pub struct PikeVM { + config: Config, + nfa: NFA, +} + +impl PikeVM { + /// Parse the given regular expression using the default configuration and + /// return the corresponding `PikeVM`. + /// + /// If you want a non-default configuration, then use the [`Builder`] to + /// set your own configuration. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; + /// + /// let re = PikeVM::new("foo[0-9]+bar")?; + /// let mut cache = re.create_cache(); + /// assert_eq!( + /// Some(Match::must(0, 3..14)), + /// re.find_iter(&mut cache, "zzzfoo12345barzzz").next(), + /// ); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[cfg(feature = "syntax")] + pub fn new(pattern: &str) -> Result<PikeVM, BuildError> { + PikeVM::builder().build(pattern) + } + + /// Like `new`, but parses multiple patterns into a single "multi regex." + /// This similarly uses the default regex configuration. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; + /// + /// let re = PikeVM::new_many(&["[a-z]+", "[0-9]+"])?; + /// let mut cache = re.create_cache(); + /// + /// let mut it = re.find_iter(&mut cache, "abc 1 foo 4567 0 quux"); + /// assert_eq!(Some(Match::must(0, 0..3)), it.next()); + /// assert_eq!(Some(Match::must(1, 4..5)), it.next()); + /// assert_eq!(Some(Match::must(0, 6..9)), it.next()); + /// assert_eq!(Some(Match::must(1, 10..14)), it.next()); + /// assert_eq!(Some(Match::must(1, 15..16)), it.next()); + /// assert_eq!(Some(Match::must(0, 17..21)), it.next()); + /// assert_eq!(None, it.next()); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[cfg(feature = "syntax")] + pub fn new_many<P: AsRef<str>>( + patterns: &[P], + ) -> Result<PikeVM, BuildError> { + PikeVM::builder().build_many(patterns) + } + + /// Like `new`, but builds a PikeVM directly from an NFA. This is useful + /// if you already have an NFA, or even if you hand-assembled the NFA. + /// + /// # Example + /// + /// This shows how to hand assemble a regular expression via its HIR, + /// compile an NFA from it and build a PikeVM from the NFA. + /// + /// ``` + /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match}; + /// use regex_syntax::hir::{Hir, Class, ClassBytes, ClassBytesRange}; + /// + /// let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![ + /// ClassBytesRange::new(b'0', b'9'), + /// ClassBytesRange::new(b'A', b'Z'), + /// ClassBytesRange::new(b'_', b'_'), + /// ClassBytesRange::new(b'a', b'z'), + /// ]))); + /// + /// let config = NFA::config().nfa_size_limit(Some(1_000)); + /// let nfa = NFA::compiler().configure(config).build_from_hir(&hir)?; + /// + /// let re = PikeVM::new_from_nfa(nfa)?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// let expected = Some(Match::must(0, 3..4)); + /// re.captures(&mut cache, "!@#A#@!", &mut caps); + /// assert_eq!(expected, caps.get_match()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn new_from_nfa(nfa: NFA) -> Result<PikeVM, BuildError> { + PikeVM::builder().build_from_nfa(nfa) + } + + /// Create a new `PikeVM` that matches every input. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; + /// + /// let re = PikeVM::always_match()?; + /// let mut cache = re.create_cache(); + /// + /// let expected = Match::must(0, 0..0); + /// assert_eq!(Some(expected), re.find_iter(&mut cache, "").next()); + /// assert_eq!(Some(expected), re.find_iter(&mut cache, "foo").next()); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn always_match() -> Result<PikeVM, BuildError> { + let nfa = thompson::NFA::always_match(); + PikeVM::new_from_nfa(nfa) + } + + /// Create a new `PikeVM` that never matches any input. + /// + /// # Example + /// + /// ``` + /// use regex_automata::nfa::thompson::pikevm::PikeVM; + /// + /// let re = PikeVM::never_match()?; + /// let mut cache = re.create_cache(); + /// + /// assert_eq!(None, re.find_iter(&mut cache, "").next()); + /// assert_eq!(None, re.find_iter(&mut cache, "foo").next()); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn never_match() -> Result<PikeVM, BuildError> { + let nfa = thompson::NFA::never_match(); + PikeVM::new_from_nfa(nfa) + } + + /// Return a default configuration for a `PikeVM`. + /// + /// This is a convenience routine to avoid needing to import the `Config` + /// type when customizing the construction of a `PikeVM`. + /// + /// # Example + /// + /// This example shows how to disable UTF-8 mode. When UTF-8 mode is + /// disabled, zero-width matches that split a codepoint are allowed. + /// Otherwise they are never reported. + /// + /// In the code below, notice that `""` is permitted to match positions + /// that split the encoding of a codepoint. + /// + /// ``` + /// use regex_automata::{nfa::thompson::{self, pikevm::PikeVM}, Match}; + /// + /// let re = PikeVM::builder() + /// .thompson(thompson::Config::new().utf8(false)) + /// .build(r"")?; + /// let mut cache = re.create_cache(); + /// + /// let haystack = "a☃z"; + /// let mut it = re.find_iter(&mut cache, haystack); + /// assert_eq!(Some(Match::must(0, 0..0)), it.next()); + /// assert_eq!(Some(Match::must(0, 1..1)), it.next()); + /// assert_eq!(Some(Match::must(0, 2..2)), it.next()); + /// assert_eq!(Some(Match::must(0, 3..3)), it.next()); + /// assert_eq!(Some(Match::must(0, 4..4)), it.next()); + /// assert_eq!(Some(Match::must(0, 5..5)), it.next()); + /// assert_eq!(None, it.next()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn config() -> Config { + Config::new() + } + + /// Return a builder for configuring the construction of a `PikeVM`. + /// + /// This is a convenience routine to avoid needing to import the + /// [`Builder`] type in common cases. + /// + /// # Example + /// + /// This example shows how to use the builder to disable UTF-8 mode + /// everywhere. + /// + /// ``` + /// use regex_automata::{ + /// nfa::thompson::{self, pikevm::PikeVM}, + /// util::syntax, + /// Match, + /// }; + /// + /// let re = PikeVM::builder() + /// .syntax(syntax::Config::new().utf8(false)) + /// .thompson(thompson::Config::new().utf8(false)) + /// .build(r"foo(?-u:[^b])ar.*")?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// + /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n"; + /// let expected = Some(Match::must(0, 1..9)); + /// re.captures(&mut cache, haystack, &mut caps); + /// assert_eq!(expected, caps.get_match()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn builder() -> Builder { + Builder::new() + } + + /// Create a new empty set of capturing groups that is guaranteed to be + /// valid for the search APIs on this `PikeVM`. + /// + /// A `Captures` value created for a specific `PikeVM` cannot be used with + /// any other `PikeVM`. + /// + /// This is a convenience function for [`Captures::all`]. See the + /// [`Captures`] documentation for an explanation of its alternative + /// constructors that permit the `PikeVM` to do less work during a search, + /// and thus might make it faster. + pub fn create_captures(&self) -> Captures { + Captures::all(self.get_nfa().group_info().clone()) + } + + /// Create a new cache for this `PikeVM`. + /// + /// The cache returned should only be used for searches for this + /// `PikeVM`. If you want to reuse the cache for another `PikeVM`, then + /// you must call [`Cache::reset`] with that `PikeVM` (or, equivalently, + /// [`PikeVM::reset_cache`]). + pub fn create_cache(&self) -> Cache { + Cache::new(self) + } + + /// Reset the given cache such that it can be used for searching with the + /// this `PikeVM` (and only this `PikeVM`). + /// + /// A cache reset permits reusing memory already allocated in this cache + /// with a different `PikeVM`. + /// + /// # Example + /// + /// This shows how to re-purpose a cache for use with a different `PikeVM`. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; + /// + /// let re1 = PikeVM::new(r"\w")?; + /// let re2 = PikeVM::new(r"\W")?; + /// + /// let mut cache = re1.create_cache(); + /// assert_eq!( + /// Some(Match::must(0, 0..2)), + /// re1.find_iter(&mut cache, "Δ").next(), + /// ); + /// + /// // Using 'cache' with re2 is not allowed. It may result in panics or + /// // incorrect results. In order to re-purpose the cache, we must reset + /// // it with the PikeVM we'd like to use it with. + /// // + /// // Similarly, after this reset, using the cache with 're1' is also not + /// // allowed. + /// re2.reset_cache(&mut cache); + /// assert_eq!( + /// Some(Match::must(0, 0..3)), + /// re2.find_iter(&mut cache, "☃").next(), + /// ); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn reset_cache(&self, cache: &mut Cache) { + cache.reset(self); + } + + /// Returns the total number of patterns compiled into this `PikeVM`. + /// + /// In the case of a `PikeVM` that contains no patterns, this returns `0`. + /// + /// # Example + /// + /// This example shows the pattern length for a `PikeVM` that never + /// matches: + /// + /// ``` + /// use regex_automata::nfa::thompson::pikevm::PikeVM; + /// + /// let re = PikeVM::never_match()?; + /// assert_eq!(re.pattern_len(), 0); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// And another example for a `PikeVM` that matches at every position: + /// + /// ``` + /// use regex_automata::nfa::thompson::pikevm::PikeVM; + /// + /// let re = PikeVM::always_match()?; + /// assert_eq!(re.pattern_len(), 1); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// And finally, a `PikeVM` that was constructed from multiple patterns: + /// + /// ``` + /// use regex_automata::nfa::thompson::pikevm::PikeVM; + /// + /// let re = PikeVM::new_many(&["[0-9]+", "[a-z]+", "[A-Z]+"])?; + /// assert_eq!(re.pattern_len(), 3); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn pattern_len(&self) -> usize { + self.nfa.pattern_len() + } + + /// Return the config for this `PikeVM`. + #[inline] + pub fn get_config(&self) -> &Config { + &self.config + } + + /// Returns a reference to the underlying NFA. + #[inline] + pub fn get_nfa(&self) -> &NFA { + &self.nfa + } +} + +impl PikeVM { + /// Returns true if and only if this `PikeVM` matches the given haystack. + /// + /// This routine may short circuit if it knows that scanning future + /// input will never lead to a different result. In particular, if the + /// underlying NFA enters a match state, then this routine will return + /// `true` immediately without inspecting any future input. (Consider how + /// this might make a difference given the regex `a+` on the haystack + /// `aaaaaaaaaaaaaaa`. This routine can stop after it sees the first `a`, + /// but routines like `find` need to continue searching because `+` is + /// greedy by default.) + /// + /// # Example + /// + /// This shows basic usage: + /// + /// ``` + /// use regex_automata::nfa::thompson::pikevm::PikeVM; + /// + /// let re = PikeVM::new("foo[0-9]+bar")?; + /// let mut cache = re.create_cache(); + /// + /// assert!(re.is_match(&mut cache, "foo12345bar")); + /// assert!(!re.is_match(&mut cache, "foobar")); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// # Example: consistency with search APIs + /// + /// `is_match` is guaranteed to return `true` whenever `find` returns a + /// match. This includes searches that are executed entirely within a + /// codepoint: + /// + /// ``` + /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Input}; + /// + /// let re = PikeVM::new("a*")?; + /// let mut cache = re.create_cache(); + /// + /// assert!(!re.is_match(&mut cache, Input::new("☃").span(1..2))); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// Notice that when UTF-8 mode is disabled, then the above reports a + /// match because the restriction against zero-width matches that split a + /// codepoint has been lifted: + /// + /// ``` + /// use regex_automata::{nfa::thompson::{pikevm::PikeVM, NFA}, Input}; + /// + /// let re = PikeVM::builder() + /// .thompson(NFA::config().utf8(false)) + /// .build("a*")?; + /// let mut cache = re.create_cache(); + /// + /// assert!(re.is_match(&mut cache, Input::new("☃").span(1..2))); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn is_match<'h, I: Into<Input<'h>>>( + &self, + cache: &mut Cache, + input: I, + ) -> bool { + let input = input.into().earliest(true); + self.search_slots(cache, &input, &mut []).is_some() + } + + /// Executes a leftmost forward search and returns a `Match` if one exists. + /// + /// This routine only includes the overall match span. To get access to the + /// individual spans of each capturing group, use [`PikeVM::captures`]. + /// + /// # Example + /// + /// Leftmost first match semantics corresponds to the match with the + /// smallest starting offset, but where the end offset is determined by + /// preferring earlier branches in the original regular expression. For + /// example, `Sam|Samwise` will match `Sam` in `Samwise`, but `Samwise|Sam` + /// will match `Samwise` in `Samwise`. + /// + /// Generally speaking, the "leftmost first" match is how most backtracking + /// regular expressions tend to work. This is in contrast to POSIX-style + /// regular expressions that yield "leftmost longest" matches. Namely, + /// both `Sam|Samwise` and `Samwise|Sam` match `Samwise` when using + /// leftmost longest semantics. (This crate does not currently support + /// leftmost longest semantics.) + /// + /// ``` + /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; + /// + /// let re = PikeVM::new("foo[0-9]+")?; + /// let mut cache = re.create_cache(); + /// let expected = Match::must(0, 0..8); + /// assert_eq!(Some(expected), re.find(&mut cache, "foo12345")); + /// + /// // Even though a match is found after reading the first byte (`a`), + /// // the leftmost first match semantics demand that we find the earliest + /// // match that prefers earlier parts of the pattern over later parts. + /// let re = PikeVM::new("abc|a")?; + /// let mut cache = re.create_cache(); + /// let expected = Match::must(0, 0..3); + /// assert_eq!(Some(expected), re.find(&mut cache, "abc")); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn find<'h, I: Into<Input<'h>>>( + &self, + cache: &mut Cache, + input: I, + ) -> Option<Match> { + let input = input.into(); + if self.get_nfa().pattern_len() == 1 { + let mut slots = [None, None]; + let pid = self.search_slots(cache, &input, &mut slots)?; + let start = slots[0]?.get(); + let end = slots[1]?.get(); + return Some(Match::new(pid, Span { start, end })); + } + let ginfo = self.get_nfa().group_info(); + let slots_len = ginfo.implicit_slot_len(); + let mut slots = vec![None; slots_len]; + let pid = self.search_slots(cache, &input, &mut slots)?; + let start = slots[pid.as_usize() * 2]?.get(); + let end = slots[pid.as_usize() * 2 + 1]?.get(); + Some(Match::new(pid, Span { start, end })) + } + + /// Executes a leftmost forward search and writes the spans of capturing + /// groups that participated in a match into the provided [`Captures`] + /// value. If no match was found, then [`Captures::is_match`] is guaranteed + /// to return `false`. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Span}; + /// + /// let re = PikeVM::new(r"^([0-9]{4})-([0-9]{2})-([0-9]{2})$")?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// + /// re.captures(&mut cache, "2010-03-14", &mut caps); + /// assert!(caps.is_match()); + /// assert_eq!(Some(Span::from(0..4)), caps.get_group(1)); + /// assert_eq!(Some(Span::from(5..7)), caps.get_group(2)); + /// assert_eq!(Some(Span::from(8..10)), caps.get_group(3)); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn captures<'h, I: Into<Input<'h>>>( + &self, + cache: &mut Cache, + input: I, + caps: &mut Captures, + ) { + self.search(cache, &input.into(), caps) + } + + /// Returns an iterator over all non-overlapping leftmost matches in the + /// given bytes. If no match exists, then the iterator yields no elements. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; + /// + /// let re = PikeVM::new("foo[0-9]+")?; + /// let mut cache = re.create_cache(); + /// + /// let text = "foo1 foo12 foo123"; + /// let matches: Vec<Match> = re.find_iter(&mut cache, text).collect(); + /// assert_eq!(matches, vec![ + /// Match::must(0, 0..4), + /// Match::must(0, 5..10), + /// Match::must(0, 11..17), + /// ]); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn find_iter<'r, 'c, 'h, I: Into<Input<'h>>>( + &'r self, + cache: &'c mut Cache, + input: I, + ) -> FindMatches<'r, 'c, 'h> { + let caps = Captures::matches(self.get_nfa().group_info().clone()); + let it = iter::Searcher::new(input.into()); + FindMatches { re: self, cache, caps, it } + } + + /// Returns an iterator over all non-overlapping `Captures` values. If no + /// match exists, then the iterator yields no elements. + /// + /// This yields the same matches as [`PikeVM::find_iter`], but it includes + /// the spans of all capturing groups that participate in each match. + /// + /// **Tip:** See [`util::iter::Searcher`](crate::util::iter::Searcher) for + /// how to correctly iterate over all matches in a haystack while avoiding + /// the creation of a new `Captures` value for every match. (Which you are + /// forced to do with an `Iterator`.) + /// + /// # Example + /// + /// ``` + /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Span}; + /// + /// let re = PikeVM::new("foo(?P<numbers>[0-9]+)")?; + /// let mut cache = re.create_cache(); + /// + /// let text = "foo1 foo12 foo123"; + /// let matches: Vec<Span> = re + /// .captures_iter(&mut cache, text) + /// // The unwrap is OK since 'numbers' matches if the pattern matches. + /// .map(|caps| caps.get_group_by_name("numbers").unwrap()) + /// .collect(); + /// assert_eq!(matches, vec![ + /// Span::from(3..4), + /// Span::from(8..10), + /// Span::from(14..17), + /// ]); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn captures_iter<'r, 'c, 'h, I: Into<Input<'h>>>( + &'r self, + cache: &'c mut Cache, + input: I, + ) -> CapturesMatches<'r, 'c, 'h> { + let caps = self.create_captures(); + let it = iter::Searcher::new(input.into()); + CapturesMatches { re: self, cache, caps, it } + } +} + +impl PikeVM { + /// Executes a leftmost forward search and writes the spans of capturing + /// groups that participated in a match into the provided [`Captures`] + /// value. If no match was found, then [`Captures::is_match`] is guaranteed + /// to return `false`. + /// + /// This is like [`PikeVM::captures`], but it accepts a concrete `&Input` + /// instead of an `Into<Input>`. + /// + /// # Example: specific pattern search + /// + /// This example shows how to build a multi-PikeVM that permits searching + /// for specific patterns. + /// + /// ``` + /// use regex_automata::{ + /// nfa::thompson::pikevm::PikeVM, + /// Anchored, Match, PatternID, Input, + /// }; + /// + /// let re = PikeVM::new_many(&["[a-z0-9]{6}", "[a-z][a-z0-9]{5}"])?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// let haystack = "foo123"; + /// + /// // Since we are using the default leftmost-first match and both + /// // patterns match at the same starting position, only the first pattern + /// // will be returned in this case when doing a search for any of the + /// // patterns. + /// let expected = Some(Match::must(0, 0..6)); + /// re.search(&mut cache, &Input::new(haystack), &mut caps); + /// assert_eq!(expected, caps.get_match()); + /// + /// // But if we want to check whether some other pattern matches, then we + /// // can provide its pattern ID. + /// let expected = Some(Match::must(1, 0..6)); + /// let input = Input::new(haystack) + /// .anchored(Anchored::Pattern(PatternID::must(1))); + /// re.search(&mut cache, &input, &mut caps); + /// assert_eq!(expected, caps.get_match()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// # Example: specifying the bounds of a search + /// + /// This example shows how providing the bounds of a search can produce + /// different results than simply sub-slicing the haystack. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match, Input}; + /// + /// let re = PikeVM::new(r"\b[0-9]{3}\b")?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// let haystack = "foo123bar"; + /// + /// // Since we sub-slice the haystack, the search doesn't know about + /// // the larger context and assumes that `123` is surrounded by word + /// // boundaries. And of course, the match position is reported relative + /// // to the sub-slice as well, which means we get `0..3` instead of + /// // `3..6`. + /// let expected = Some(Match::must(0, 0..3)); + /// re.search(&mut cache, &Input::new(&haystack[3..6]), &mut caps); + /// assert_eq!(expected, caps.get_match()); + /// + /// // But if we provide the bounds of the search within the context of the + /// // entire haystack, then the search can take the surrounding context + /// // into account. (And if we did find a match, it would be reported + /// // as a valid offset into `haystack` instead of its sub-slice.) + /// let expected = None; + /// let input = Input::new(haystack).range(3..6); + /// re.search(&mut cache, &input, &mut caps); + /// assert_eq!(expected, caps.get_match()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn search( + &self, + cache: &mut Cache, + input: &Input<'_>, + caps: &mut Captures, + ) { + caps.set_pattern(None); + let pid = self.search_slots(cache, input, caps.slots_mut()); + caps.set_pattern(pid); + } + + /// Executes a leftmost forward search and writes the spans of capturing + /// groups that participated in a match into the provided `slots`, and + /// returns the matching pattern ID. The contents of the slots for patterns + /// other than the matching pattern are unspecified. If no match was found, + /// then `None` is returned and the contents of `slots` is unspecified. + /// + /// This is like [`PikeVM::search`], but it accepts a raw slots slice + /// instead of a `Captures` value. This is useful in contexts where you + /// don't want or need to allocate a `Captures`. + /// + /// It is legal to pass _any_ number of slots to this routine. If the regex + /// engine would otherwise write a slot offset that doesn't fit in the + /// provided slice, then it is simply skipped. In general though, there are + /// usually three slice lengths you might want to use: + /// + /// * An empty slice, if you only care about which pattern matched. + /// * A slice with + /// [`pattern_len() * 2`](crate::nfa::thompson::NFA::pattern_len) + /// slots, if you only care about the overall match spans for each matching + /// pattern. + /// * A slice with + /// [`slot_len()`](crate::util::captures::GroupInfo::slot_len) slots, which + /// permits recording match offsets for every capturing group in every + /// pattern. + /// + /// # Example + /// + /// This example shows how to find the overall match offsets in a + /// multi-pattern search without allocating a `Captures` value. Indeed, we + /// can put our slots right on the stack. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{nfa::thompson::pikevm::PikeVM, PatternID, Input}; + /// + /// let re = PikeVM::new_many(&[ + /// r"\pL+", + /// r"\d+", + /// ])?; + /// let mut cache = re.create_cache(); + /// let input = Input::new("!@#123"); + /// + /// // We only care about the overall match offsets here, so we just + /// // allocate two slots for each pattern. Each slot records the start + /// // and end of the match. + /// let mut slots = [None; 4]; + /// let pid = re.search_slots(&mut cache, &input, &mut slots); + /// assert_eq!(Some(PatternID::must(1)), pid); + /// + /// // The overall match offsets are always at 'pid * 2' and 'pid * 2 + 1'. + /// // See 'GroupInfo' for more details on the mapping between groups and + /// // slot indices. + /// let slot_start = pid.unwrap().as_usize() * 2; + /// let slot_end = slot_start + 1; + /// assert_eq!(Some(3), slots[slot_start].map(|s| s.get())); + /// assert_eq!(Some(6), slots[slot_end].map(|s| s.get())); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn search_slots( + &self, + cache: &mut Cache, + input: &Input<'_>, + slots: &mut [Option<NonMaxUsize>], + ) -> Option<PatternID> { + let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8(); + if !utf8empty { + let hm = self.search_slots_imp(cache, input, slots)?; + return Some(hm.pattern()); + } + // There is an unfortunate special case where if the regex can + // match the empty string and UTF-8 mode is enabled, the search + // implementation requires that the slots have at least as much space + // to report the bounds of any match. This is so zero-width matches + // that split a codepoint can be filtered out. + // + // Note that if utf8empty is true, we specialize the case for when + // the number of patterns is 1. In that case, we can just use a stack + // allocation. Otherwise we resort to a heap allocation, which we + // convince ourselves we're fine with due to the pathological nature of + // this case. + let min = self.get_nfa().group_info().implicit_slot_len(); + if slots.len() >= min { + let hm = self.search_slots_imp(cache, input, slots)?; + return Some(hm.pattern()); + } + if self.get_nfa().pattern_len() == 1 { + let mut enough = [None, None]; + let got = self.search_slots_imp(cache, input, &mut enough); + // This is OK because we know `enough` is strictly bigger than + // `slots`, otherwise this special case isn't reached. + slots.copy_from_slice(&enough[..slots.len()]); + return got.map(|hm| hm.pattern()); + } + let mut enough = vec![None; min]; + let got = self.search_slots_imp(cache, input, &mut enough); + // This is OK because we know `enough` is strictly bigger than `slots`, + // otherwise this special case isn't reached. + slots.copy_from_slice(&enough[..slots.len()]); + got.map(|hm| hm.pattern()) + } + + /// This is the actual implementation of `search_slots_imp` that + /// doesn't account for the special case when 1) the NFA has UTF-8 mode + /// enabled, 2) the NFA can match the empty string and 3) the caller has + /// provided an insufficient number of slots to record match offsets. + #[inline(never)] + fn search_slots_imp( + &self, + cache: &mut Cache, + input: &Input<'_>, + slots: &mut [Option<NonMaxUsize>], + ) -> Option<HalfMatch> { + let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8(); + let hm = match self.search_imp(cache, input, slots) { + None => return None, + Some(hm) if !utf8empty => return Some(hm), + Some(hm) => hm, + }; + empty::skip_splits_fwd(input, hm, hm.offset(), |input| { + Ok(self + .search_imp(cache, input, slots) + .map(|hm| (hm, hm.offset()))) + }) + // OK because the PikeVM never errors. + .unwrap() + } + + /// Writes the set of patterns that match anywhere in the given search + /// configuration to `patset`. If multiple patterns match at the same + /// position and this `PikeVM` was configured with [`MatchKind::All`] + /// semantics, then all matching patterns are written to the given set. + /// + /// Unless all of the patterns in this `PikeVM` are anchored, then + /// generally speaking, this will visit every byte in the haystack. + /// + /// This search routine *does not* clear the pattern set. This gives some + /// flexibility to the caller (e.g., running multiple searches with the + /// same pattern set), but does make the API bug-prone if you're reusing + /// the same pattern set for multiple searches but intended them to be + /// independent. + /// + /// If a pattern ID matched but the given `PatternSet` does not have + /// sufficient capacity to store it, then it is not inserted and silently + /// dropped. + /// + /// # Example + /// + /// This example shows how to find all matching patterns in a haystack, + /// even when some patterns match at the same position as other patterns. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{ + /// nfa::thompson::pikevm::PikeVM, + /// Input, MatchKind, PatternSet, + /// }; + /// + /// let patterns = &[ + /// r"\w+", r"\d+", r"\pL+", r"foo", r"bar", r"barfoo", r"foobar", + /// ]; + /// let re = PikeVM::builder() + /// .configure(PikeVM::config().match_kind(MatchKind::All)) + /// .build_many(patterns)?; + /// let mut cache = re.create_cache(); + /// + /// let input = Input::new("foobar"); + /// let mut patset = PatternSet::new(re.pattern_len()); + /// re.which_overlapping_matches(&mut cache, &input, &mut patset); + /// let expected = vec![0, 2, 3, 4, 6]; + /// let got: Vec<usize> = patset.iter().map(|p| p.as_usize()).collect(); + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn which_overlapping_matches( + &self, + cache: &mut Cache, + input: &Input<'_>, + patset: &mut PatternSet, + ) { + self.which_overlapping_imp(cache, input, patset) + } +} + +impl PikeVM { + /// The implementation of standard leftmost search. + /// + /// Capturing group spans are written to `slots`, but only if requested. + /// `slots` can be any length. Any slot in the NFA that is activated but + /// which is out of bounds for the given `slots` is ignored. + fn search_imp( + &self, + cache: &mut Cache, + input: &Input<'_>, + slots: &mut [Option<NonMaxUsize>], + ) -> Option<HalfMatch> { + cache.setup_search(slots.len()); + if input.is_done() { + return None; + } + // Why do we even care about this? Well, in our 'Captures' + // representation, we use usize::MAX as a sentinel to indicate "no + // match." This isn't problematic so long as our haystack doesn't have + // a maximal length. Byte slices are guaranteed by Rust to have a + // length that fits into isize, and so this assert should always pass. + // But we put it here to make our assumption explicit. + assert!( + input.haystack().len() < core::usize::MAX, + "byte slice lengths must be less than usize MAX", + ); + instrument!(|c| c.reset(&self.nfa)); + + // Whether we want to visit all match states instead of emulating the + // 'leftmost' semantics of typical backtracking regex engines. + let allmatches = + self.config.get_match_kind().continue_past_first_match(); + let (anchored, start_id) = match self.start_config(input) { + None => return None, + Some(config) => config, + }; + + let pre = + if anchored { None } else { self.get_config().get_prefilter() }; + let Cache { ref mut stack, ref mut curr, ref mut next } = cache; + let mut hm = None; + // Yes, our search doesn't end at input.end(), but includes it. This + // is necessary because matches are delayed by one byte, just like + // how the DFA engines work. The delay is used to handle look-behind + // assertions. In the case of the PikeVM, the delay is implemented + // by not considering a match to exist until it is visited in + // 'steps'. Technically, we know a match exists in the previous + // iteration via 'epsilon_closure'. (It's the same thing in NFA-to-DFA + // determinization. We don't mark a DFA state as a match state if it + // contains an NFA match state, but rather, whether the DFA state was + // generated by a transition from a DFA state that contains an NFA + // match state.) + let mut at = input.start(); + while at <= input.end() { + // If we have no states left to visit, then there are some cases + // where we know we can quit early or even skip ahead. + if curr.set.is_empty() { + // We have a match and we haven't been instructed to continue + // on even after finding a match, so we can quit. + if hm.is_some() && !allmatches { + break; + } + // If we're running an anchored search and we've advanced + // beyond the start position with no other states to try, then + // we will never observe a match and thus can stop. + if anchored && at > input.start() { + break; + } + // If there no states left to explore at this position and we + // know we can't terminate early, then we are effectively at + // the starting state of the NFA. If we fell through here, + // we'd end up adding our '(?s-u:.)*?' prefix and it would be + // the only thing in 'curr'. So we might as well just skip + // ahead until we find something that we know might advance us + // forward. + if let Some(ref pre) = pre { + let span = Span::from(at..input.end()); + match pre.find(input.haystack(), span) { + None => break, + Some(ref span) => at = span.start, + } + } + } + // Instead of using the NFA's unanchored start state, we actually + // always use its anchored starting state. As a result, when doing + // an unanchored search, we need to simulate our own '(?s-u:.)*?' + // prefix, to permit a match to appear anywhere. + // + // Now, we don't *have* to do things this way. We could use the + // NFA's unanchored starting state and do one 'epsilon_closure' + // call from that starting state before the main loop here. And + // that is just as correct. However, it turns out to be slower + // than our approach here because it slightly increases the cost + // of processing each byte by requiring us to visit more NFA + // states to deal with the additional NFA states in the unanchored + // prefix. By simulating it explicitly here, we lower those costs + // substantially. The cost is itself small, but it adds up for + // large haystacks. + // + // In order to simulate the '(?s-u:.)*?' prefix---which is not + // greedy---we are careful not to perform an epsilon closure on + // the start state if we already have a match. Namely, if we + // did otherwise, we would never reach a terminating condition + // because there would always be additional states to process. + // In effect, the exclusion of running 'epsilon_closure' when + // we have a match corresponds to the "dead" states we have in + // our DFA regex engines. Namely, in a DFA, match states merely + // instruct the search execution to record the current offset as + // the most recently seen match. It is the dead state that actually + // indicates when to stop the search (other than EOF or quit + // states). + // + // However, when 'allmatches' is true, the caller has asked us to + // leave in every possible match state. This tends not to make a + // whole lot of sense in unanchored searches, because it means the + // search really cannot terminate until EOF. And often, in that + // case, you wind up skipping over a bunch of matches and are left + // with the "last" match. Arguably, it just doesn't make a lot of + // sense to run a 'leftmost' search (which is what this routine is) + // with 'allmatches' set to true. But the DFAs support it and this + // matches their behavior. (Generally, 'allmatches' is useful for + // overlapping searches or leftmost anchored searches to find the + // longest possible match by ignoring match priority.) + // + // Additionally, when we're running an anchored search, this + // epsilon closure should only be computed at the beginning of the + // search. If we re-computed it at every position, we would be + // simulating an unanchored search when we were tasked to perform + // an anchored search. + if (!hm.is_some() || allmatches) + && (!anchored || at == input.start()) + { + // Since we are adding to the 'curr' active states and since + // this is for the start ID, we use a slots slice that is + // guaranteed to have the right length but where every element + // is absent. This is exactly what we want, because this + // epsilon closure is responsible for simulating an unanchored + // '(?s:.)*?' prefix. It is specifically outside of any + // capturing groups, and thus, using slots that are always + // absent is correct. + // + // Note though that we can't just use '&mut []' here, since + // this epsilon closure may traverse through 'Captures' epsilon + // transitions, and thus must be able to write offsets to the + // slots given which are later copied to slot values in 'curr'. + let slots = next.slot_table.all_absent(); + self.epsilon_closure(stack, slots, curr, input, at, start_id); + } + if let Some(pid) = self.nexts(stack, curr, next, input, at, slots) + { + hm = Some(HalfMatch::new(pid, at)); + } + // Unless the caller asked us to return early, we need to mush on + // to see if we can extend our match. (But note that 'nexts' will + // quit right after seeing a match when match_kind==LeftmostFirst, + // as is consistent with leftmost-first match priority.) + if input.get_earliest() && hm.is_some() { + break; + } + core::mem::swap(curr, next); + next.set.clear(); + at += 1; + } + instrument!(|c| c.eprint(&self.nfa)); + hm + } + + /// The implementation for the 'which_overlapping_matches' API. Basically, + /// we do a single scan through the entire haystack (unless our regex + /// or search is anchored) and record every pattern that matched. In + /// particular, when MatchKind::All is used, this supports overlapping + /// matches. So if we have the regexes 'sam' and 'samwise', they will + /// *both* be reported in the pattern set when searching the haystack + /// 'samwise'. + fn which_overlapping_imp( + &self, + cache: &mut Cache, + input: &Input<'_>, + patset: &mut PatternSet, + ) { + // NOTE: This is effectively a copy of 'search_imp' above, but with no + // captures support and instead writes patterns that matched directly + // to 'patset'. See that routine for better commentary about what's + // going on in this routine. We probably could unify the routines using + // generics or more helper routines, but I'm not sure it's worth it. + // + // NOTE: We somewhat go out of our way here to support things like + // 'input.get_earliest()' and 'leftmost-first' match semantics. Neither + // of those seem particularly relevant to this routine, but they are + // both supported by the DFA analogs of this routine by construction + // and composition, so it seems like good sense to have the PikeVM + // match that behavior. + + cache.setup_search(0); + if input.is_done() { + return; + } + assert!( + input.haystack().len() < core::usize::MAX, + "byte slice lengths must be less than usize MAX", + ); + instrument!(|c| c.reset(&self.nfa)); + + let allmatches = + self.config.get_match_kind().continue_past_first_match(); + let (anchored, start_id) = match self.start_config(input) { + None => return, + Some(config) => config, + }; + + let Cache { ref mut stack, ref mut curr, ref mut next } = cache; + for at in input.start()..=input.end() { + let any_matches = !patset.is_empty(); + if curr.set.is_empty() { + if any_matches && !allmatches { + break; + } + if anchored && at > input.start() { + break; + } + } + if !any_matches || allmatches { + let slots = &mut []; + self.epsilon_closure(stack, slots, curr, input, at, start_id); + } + self.nexts_overlapping(stack, curr, next, input, at, patset); + // If we found a match and filled our set, then there is no more + // additional info that we can provide. Thus, we can quit. We also + // quit if the caller asked us to stop at the earliest point that + // we know a match exists. + if patset.is_full() || input.get_earliest() { + break; + } + core::mem::swap(curr, next); + next.set.clear(); + } + instrument!(|c| c.eprint(&self.nfa)); + } + + /// Process the active states in 'curr' to find the states (written to + /// 'next') we should process for the next byte in the haystack. + /// + /// 'stack' is used to perform a depth first traversal of the NFA when + /// computing an epsilon closure. + /// + /// When a match is found, the slots for that match state (in 'curr') are + /// copied to 'caps'. Moreover, once a match is seen, processing for 'curr' + /// stops (unless the PikeVM was configured with MatchKind::All semantics). + #[cfg_attr(feature = "perf-inline", inline(always))] + fn nexts( + &self, + stack: &mut Vec<FollowEpsilon>, + curr: &mut ActiveStates, + next: &mut ActiveStates, + input: &Input<'_>, + at: usize, + slots: &mut [Option<NonMaxUsize>], + ) -> Option<PatternID> { + instrument!(|c| c.record_state_set(&curr.set)); + let mut pid = None; + let ActiveStates { ref set, ref mut slot_table } = *curr; + for sid in set.iter() { + pid = match self.next(stack, slot_table, next, input, at, sid) { + None => continue, + Some(pid) => Some(pid), + }; + slots.copy_from_slice(slot_table.for_state(sid)); + if !self.config.get_match_kind().continue_past_first_match() { + break; + } + } + pid + } + + /// Like 'nexts', but for the overlapping case. This doesn't write any + /// slots, and instead just writes which pattern matched in 'patset'. + #[cfg_attr(feature = "perf-inline", inline(always))] + fn nexts_overlapping( + &self, + stack: &mut Vec<FollowEpsilon>, + curr: &mut ActiveStates, + next: &mut ActiveStates, + input: &Input<'_>, + at: usize, + patset: &mut PatternSet, + ) { + instrument!(|c| c.record_state_set(&curr.set)); + let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8(); + let ActiveStates { ref set, ref mut slot_table } = *curr; + for sid in set.iter() { + let pid = match self.next(stack, slot_table, next, input, at, sid) + { + None => continue, + Some(pid) => pid, + }; + // This handles the case of finding a zero-width match that splits + // a codepoint. Namely, if we're in UTF-8 mode AND we know we can + // match the empty string, then the only valid way of getting to + // this point with an offset that splits a codepoint is when we + // have an empty match. Such matches, in UTF-8 mode, must not be + // reported. So we just skip them here and pretend as if we did + // not see a match. + if utf8empty && !input.is_char_boundary(at) { + continue; + } + let _ = patset.try_insert(pid); + if !self.config.get_match_kind().continue_past_first_match() { + break; + } + } + } + + /// Starting from 'sid', if the position 'at' in the 'input' haystack has a + /// transition defined out of 'sid', then add the state transitioned to and + /// its epsilon closure to the 'next' set of states to explore. + /// + /// 'stack' is used by the epsilon closure computation to perform a depth + /// first traversal of the NFA. + /// + /// 'curr_slot_table' should be the table of slots for the current set of + /// states being explored. If there is a transition out of 'sid', then + /// sid's row in the slot table is used to perform the epsilon closure. + #[cfg_attr(feature = "perf-inline", inline(always))] + fn next( + &self, + stack: &mut Vec<FollowEpsilon>, + curr_slot_table: &mut SlotTable, + next: &mut ActiveStates, + input: &Input<'_>, + at: usize, + sid: StateID, + ) -> Option<PatternID> { + instrument!(|c| c.record_step(sid)); + match *self.nfa.state(sid) { + State::Fail + | State::Look { .. } + | State::Union { .. } + | State::BinaryUnion { .. } + | State::Capture { .. } => None, + State::ByteRange { ref trans } => { + if trans.matches(input.haystack(), at) { + let slots = curr_slot_table.for_state(sid); + // OK because 'at <= haystack.len() < usize::MAX', so + // adding 1 will never wrap. + let at = at.wrapping_add(1); + self.epsilon_closure( + stack, slots, next, input, at, trans.next, + ); + } + None + } + State::Sparse(ref sparse) => { + if let Some(next_sid) = sparse.matches(input.haystack(), at) { + let slots = curr_slot_table.for_state(sid); + // OK because 'at <= haystack.len() < usize::MAX', so + // adding 1 will never wrap. + let at = at.wrapping_add(1); + self.epsilon_closure( + stack, slots, next, input, at, next_sid, + ); + } + None + } + State::Dense(ref dense) => { + if let Some(next_sid) = dense.matches(input.haystack(), at) { + let slots = curr_slot_table.for_state(sid); + // OK because 'at <= haystack.len() < usize::MAX', so + // adding 1 will never wrap. + let at = at.wrapping_add(1); + self.epsilon_closure( + stack, slots, next, input, at, next_sid, + ); + } + None + } + State::Match { pattern_id } => Some(pattern_id), + } + } + + /// Compute the epsilon closure of 'sid', writing the closure into 'next' + /// while copying slot values from 'curr_slots' into corresponding states + /// in 'next'. 'curr_slots' should be the slot values corresponding to + /// 'sid'. + /// + /// The given 'stack' is used to perform a depth first traversal of the + /// NFA by recursively following all epsilon transitions out of 'sid'. + /// Conditional epsilon transitions are followed if and only if they are + /// satisfied for the position 'at' in the 'input' haystack. + /// + /// While this routine may write to 'curr_slots', once it returns, any + /// writes are undone and the original values (even if absent) are + /// restored. + #[cfg_attr(feature = "perf-inline", inline(always))] + fn epsilon_closure( + &self, + stack: &mut Vec<FollowEpsilon>, + curr_slots: &mut [Option<NonMaxUsize>], + next: &mut ActiveStates, + input: &Input<'_>, + at: usize, + sid: StateID, + ) { + instrument!(|c| { + c.record_closure(sid); + c.record_stack_push(sid); + }); + stack.push(FollowEpsilon::Explore(sid)); + while let Some(frame) = stack.pop() { + match frame { + FollowEpsilon::RestoreCapture { slot, offset: pos } => { + curr_slots[slot] = pos; + } + FollowEpsilon::Explore(sid) => { + self.epsilon_closure_explore( + stack, curr_slots, next, input, at, sid, + ); + } + } + } + } + + /// Explore all of the epsilon transitions out of 'sid'. This is mostly + /// split out from 'epsilon_closure' in order to clearly delineate + /// the actual work of computing an epsilon closure from the stack + /// book-keeping. + /// + /// This will push any additional explorations needed on to 'stack'. + /// + /// 'curr_slots' should refer to the slots for the currently active NFA + /// state. That is, the current state we are stepping through. These + /// slots are mutated in place as new 'Captures' states are traversed + /// during epsilon closure, but the slots are restored to their original + /// values once the full epsilon closure is completed. The ultimate use of + /// 'curr_slots' is to copy them to the corresponding 'next_slots', so that + /// the capturing group spans are forwarded from the currently active state + /// to the next. + /// + /// 'next' refers to the next set of active states. Computing an epsilon + /// closure may increase the next set of active states. + /// + /// 'input' refers to the caller's input configuration and 'at' refers to + /// the current position in the haystack. These are used to check whether + /// conditional epsilon transitions (like look-around) are satisfied at + /// the current position. If they aren't, then the epsilon closure won't + /// include them. + #[cfg_attr(feature = "perf-inline", inline(always))] + fn epsilon_closure_explore( + &self, + stack: &mut Vec<FollowEpsilon>, + curr_slots: &mut [Option<NonMaxUsize>], + next: &mut ActiveStates, + input: &Input<'_>, + at: usize, + mut sid: StateID, + ) { + // We can avoid pushing some state IDs on to our stack in precisely + // the cases where a 'push(x)' would be immediately followed by a 'x + // = pop()'. This is achieved by this outer-loop. We simply set 'sid' + // to be the next state ID we want to explore once we're done with + // our initial exploration. In practice, this avoids a lot of stack + // thrashing. + loop { + instrument!(|c| c.record_set_insert(sid)); + // Record this state as part of our next set of active states. If + // we've already explored it, then no need to do it again. + if !next.set.insert(sid) { + return; + } + match *self.nfa.state(sid) { + State::Fail + | State::Match { .. } + | State::ByteRange { .. } + | State::Sparse { .. } + | State::Dense { .. } => { + next.slot_table.for_state(sid).copy_from_slice(curr_slots); + return; + } + State::Look { look, next } => { + // OK because we don't permit building a searcher with a + // Unicode word boundary if the requisite Unicode data is + // unavailable. + if !self.nfa.look_matcher().matches_inline( + look, + input.haystack(), + at, + ) { + return; + } + sid = next; + } + State::Union { ref alternates } => { + sid = match alternates.get(0) { + None => return, + Some(&sid) => sid, + }; + instrument!(|c| { + for &alt in &alternates[1..] { + c.record_stack_push(alt); + } + }); + stack.extend( + alternates[1..] + .iter() + .copied() + .rev() + .map(FollowEpsilon::Explore), + ); + } + State::BinaryUnion { alt1, alt2 } => { + sid = alt1; + instrument!(|c| c.record_stack_push(sid)); + stack.push(FollowEpsilon::Explore(alt2)); + } + State::Capture { next, slot, .. } => { + // There's no need to do anything with slots that + // ultimately won't be copied into the caller-provided + // 'Captures' value. So we just skip dealing with them at + // all. + if slot.as_usize() < curr_slots.len() { + instrument!(|c| c.record_stack_push(sid)); + stack.push(FollowEpsilon::RestoreCapture { + slot, + offset: curr_slots[slot], + }); + // OK because length of a slice must fit into an isize. + curr_slots[slot] = Some(NonMaxUsize::new(at).unwrap()); + } + sid = next; + } + } + } + } + + /// Return the starting configuration of a PikeVM search. + /// + /// The "start config" is basically whether the search should be anchored + /// or not and the NFA state ID at which to begin the search. The state ID + /// returned always corresponds to an anchored starting state even when the + /// search is unanchored. This is because the PikeVM search loop deals with + /// unanchored searches with an explicit epsilon closure out of the start + /// state. + /// + /// This routine accounts for both the caller's `Input` configuration + /// and the pattern itself. For example, even if the caller asks for an + /// unanchored search, if the pattern itself is anchored, then this will + /// always return 'true' because implementing an unanchored search in that + /// case would be incorrect. + /// + /// Similarly, if the caller requests an anchored search for a particular + /// pattern, then the starting state ID returned will reflect that. + /// + /// If a pattern ID is given in the input configuration that is not in + /// this regex, then `None` is returned. + fn start_config(&self, input: &Input<'_>) -> Option<(bool, StateID)> { + match input.get_anchored() { + // Only way we're unanchored is if both the caller asked for an + // unanchored search *and* the pattern is itself not anchored. + Anchored::No => Some(( + self.nfa.is_always_start_anchored(), + self.nfa.start_anchored(), + )), + Anchored::Yes => Some((true, self.nfa.start_anchored())), + Anchored::Pattern(pid) => { + Some((true, self.nfa.start_pattern(pid)?)) + } + } + } +} + +/// An iterator over all non-overlapping matches for a particular search. +/// +/// The iterator yields a [`Match`] value until no more matches could be found. +/// +/// The lifetime parameters are as follows: +/// +/// * `'r` represents the lifetime of the PikeVM. +/// * `'c` represents the lifetime of the PikeVM's cache. +/// * `'h` represents the lifetime of the haystack being searched. +/// +/// This iterator can be created with the [`PikeVM::find_iter`] method. +#[derive(Debug)] +pub struct FindMatches<'r, 'c, 'h> { + re: &'r PikeVM, + cache: &'c mut Cache, + caps: Captures, + it: iter::Searcher<'h>, +} + +impl<'r, 'c, 'h> Iterator for FindMatches<'r, 'c, 'h> { + type Item = Match; + + #[inline] + fn next(&mut self) -> Option<Match> { + // Splitting 'self' apart seems necessary to appease borrowck. + let FindMatches { re, ref mut cache, ref mut caps, ref mut it } = + *self; + // 'advance' converts errors into panics, which is OK here because + // the PikeVM can never return an error. + it.advance(|input| { + re.search(cache, input, caps); + Ok(caps.get_match()) + }) + } +} + +/// An iterator over all non-overlapping leftmost matches, with their capturing +/// groups, for a particular search. +/// +/// The iterator yields a [`Captures`] value until no more matches could be +/// found. +/// +/// The lifetime parameters are as follows: +/// +/// * `'r` represents the lifetime of the PikeVM. +/// * `'c` represents the lifetime of the PikeVM's cache. +/// * `'h` represents the lifetime of the haystack being searched. +/// +/// This iterator can be created with the [`PikeVM::captures_iter`] method. +#[derive(Debug)] +pub struct CapturesMatches<'r, 'c, 'h> { + re: &'r PikeVM, + cache: &'c mut Cache, + caps: Captures, + it: iter::Searcher<'h>, +} + +impl<'r, 'c, 'h> Iterator for CapturesMatches<'r, 'c, 'h> { + type Item = Captures; + + #[inline] + fn next(&mut self) -> Option<Captures> { + // Splitting 'self' apart seems necessary to appease borrowck. + let CapturesMatches { re, ref mut cache, ref mut caps, ref mut it } = + *self; + // 'advance' converts errors into panics, which is OK here because + // the PikeVM can never return an error. + it.advance(|input| { + re.search(cache, input, caps); + Ok(caps.get_match()) + }); + if caps.is_match() { + Some(caps.clone()) + } else { + None + } + } +} + +/// A cache represents mutable state that a [`PikeVM`] requires during a +/// search. +/// +/// For a given [`PikeVM`], its corresponding cache may be created either via +/// [`PikeVM::create_cache`], or via [`Cache::new`]. They are equivalent in +/// every way, except the former does not require explicitly importing `Cache`. +/// +/// A particular `Cache` is coupled with the [`PikeVM`] from which it +/// was created. It may only be used with that `PikeVM`. A cache and its +/// allocations may be re-purposed via [`Cache::reset`], in which case, it can +/// only be used with the new `PikeVM` (and not the old one). +#[derive(Clone, Debug)] +pub struct Cache { + /// Stack used while computing epsilon closure. This effectively lets us + /// move what is more naturally expressed through recursion to a stack + /// on the heap. + stack: Vec<FollowEpsilon>, + /// The current active states being explored for the current byte in the + /// haystack. + curr: ActiveStates, + /// The next set of states we're building that will be explored for the + /// next byte in the haystack. + next: ActiveStates, +} + +impl Cache { + /// Create a new [`PikeVM`] cache. + /// + /// A potentially more convenient routine to create a cache is + /// [`PikeVM::create_cache`], as it does not require also importing the + /// `Cache` type. + /// + /// If you want to reuse the returned `Cache` with some other `PikeVM`, + /// then you must call [`Cache::reset`] with the desired `PikeVM`. + pub fn new(re: &PikeVM) -> Cache { + Cache { + stack: vec![], + curr: ActiveStates::new(re), + next: ActiveStates::new(re), + } + } + + /// Reset this cache such that it can be used for searching with a + /// different [`PikeVM`]. + /// + /// A cache reset permits reusing memory already allocated in this cache + /// with a different `PikeVM`. + /// + /// # Example + /// + /// This shows how to re-purpose a cache for use with a different `PikeVM`. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; + /// + /// let re1 = PikeVM::new(r"\w")?; + /// let re2 = PikeVM::new(r"\W")?; + /// + /// let mut cache = re1.create_cache(); + /// assert_eq!( + /// Some(Match::must(0, 0..2)), + /// re1.find_iter(&mut cache, "Δ").next(), + /// ); + /// + /// // Using 'cache' with re2 is not allowed. It may result in panics or + /// // incorrect results. In order to re-purpose the cache, we must reset + /// // it with the PikeVM we'd like to use it with. + /// // + /// // Similarly, after this reset, using the cache with 're1' is also not + /// // allowed. + /// cache.reset(&re2); + /// assert_eq!( + /// Some(Match::must(0, 0..3)), + /// re2.find_iter(&mut cache, "☃").next(), + /// ); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn reset(&mut self, re: &PikeVM) { + self.curr.reset(re); + self.next.reset(re); + } + + /// Returns the heap memory usage, in bytes, of this cache. + /// + /// This does **not** include the stack size used up by this cache. To + /// compute that, use `std::mem::size_of::<Cache>()`. + pub fn memory_usage(&self) -> usize { + use core::mem::size_of; + (self.stack.len() * size_of::<FollowEpsilon>()) + + self.curr.memory_usage() + + self.next.memory_usage() + } + + /// Clears this cache. This should be called at the start of every search + /// to ensure we start with a clean slate. + /// + /// This also sets the length of the capturing groups used in the current + /// search. This permits an optimization where by 'SlotTable::for_state' + /// only returns the number of slots equivalent to the number of slots + /// given in the 'Captures' value. This may be less than the total number + /// of possible slots, e.g., when one only wants to track overall match + /// offsets. This in turn permits less copying of capturing group spans + /// in the PikeVM. + fn setup_search(&mut self, captures_slot_len: usize) { + self.stack.clear(); + self.curr.setup_search(captures_slot_len); + self.next.setup_search(captures_slot_len); + } +} + +/// A set of active states used to "simulate" the execution of an NFA via the +/// PikeVM. +/// +/// There are two sets of these used during NFA simulation. One set corresponds +/// to the "current" set of states being traversed for the current position +/// in a haystack. The other set corresponds to the "next" set of states being +/// built, which will become the new "current" set for the next position in the +/// haystack. These two sets correspond to CLIST and NLIST in Thompson's +/// original paper regexes: https://dl.acm.org/doi/pdf/10.1145/363347.363387 +/// +/// In addition to representing a set of NFA states, this also maintains slot +/// values for each state. These slot values are what turn the NFA simulation +/// into the "Pike VM." Namely, they track capturing group values for each +/// state. During the computation of epsilon closure, we copy slot values from +/// states in the "current" set to the "next" set. Eventually, once a match +/// is found, the slot values for that match state are what we write to the +/// caller provided 'Captures' value. +#[derive(Clone, Debug)] +struct ActiveStates { + /// The set of active NFA states. This set preserves insertion order, which + /// is critical for simulating the match semantics of backtracking regex + /// engines. + set: SparseSet, + /// The slots for every NFA state, where each slot stores a (possibly + /// absent) offset. Every capturing group has two slots. One for a start + /// offset and one for an end offset. + slot_table: SlotTable, +} + +impl ActiveStates { + /// Create a new set of active states for the given PikeVM. The active + /// states returned may only be used with the given PikeVM. (Use 'reset' + /// to re-purpose the allocation for a different PikeVM.) + fn new(re: &PikeVM) -> ActiveStates { + let mut active = ActiveStates { + set: SparseSet::new(0), + slot_table: SlotTable::new(), + }; + active.reset(re); + active + } + + /// Reset this set of active states such that it can be used with the given + /// PikeVM (and only that PikeVM). + fn reset(&mut self, re: &PikeVM) { + self.set.resize(re.get_nfa().states().len()); + self.slot_table.reset(re); + } + + /// Return the heap memory usage, in bytes, used by this set of active + /// states. + /// + /// This does not include the stack size of this value. + fn memory_usage(&self) -> usize { + self.set.memory_usage() + self.slot_table.memory_usage() + } + + /// Setup this set of active states for a new search. The given slot + /// length should be the number of slots in a caller provided 'Captures' + /// (and may be zero). + fn setup_search(&mut self, captures_slot_len: usize) { + self.set.clear(); + self.slot_table.setup_search(captures_slot_len); + } +} + +/// A table of slots, where each row represent a state in an NFA. Thus, the +/// table has room for storing slots for every single state in an NFA. +/// +/// This table is represented with a single contiguous allocation. In general, +/// the notion of "capturing group" doesn't really exist at this level of +/// abstraction, hence the name "slot" instead. (Indeed, every capturing group +/// maps to a pair of slots, one for the start offset and one for the end +/// offset.) Slots are indexed by the 'Captures' NFA state. +/// +/// N.B. Not every state actually needs a row of slots. Namely, states that +/// only have epsilon transitions currently never have anything written to +/// their rows in this table. Thus, the table is somewhat wasteful in its heap +/// usage. However, it is important to maintain fast random access by state +/// ID, which means one giant table tends to work well. RE2 takes a different +/// approach here and allocates each row as its own reference counted thing. +/// I explored such a strategy at one point here, but couldn't get it to work +/// well using entirely safe code. (To the ambitious reader: I encourage you to +/// re-litigate that experiment.) I very much wanted to stick to safe code, but +/// could be convinced otherwise if there was a solid argument and the safety +/// was encapsulated well. +#[derive(Clone, Debug)] +struct SlotTable { + /// The actual table of offsets. + table: Vec<Option<NonMaxUsize>>, + /// The number of slots per state, i.e., the table's stride or the length + /// of each row. + slots_per_state: usize, + /// The number of slots in the caller-provided 'Captures' value for the + /// current search. Setting this to 'slots_per_state' is always correct, + /// but may be wasteful. + slots_for_captures: usize, +} + +impl SlotTable { + /// Create a new slot table. + /// + /// One should call 'reset' with the corresponding PikeVM before use. + fn new() -> SlotTable { + SlotTable { table: vec![], slots_for_captures: 0, slots_per_state: 0 } + } + + /// Reset this slot table such that it can be used with the given PikeVM + /// (and only that PikeVM). + fn reset(&mut self, re: &PikeVM) { + let nfa = re.get_nfa(); + self.slots_per_state = nfa.group_info().slot_len(); + // This is always correct, but may be reduced for a particular search + // if a 'Captures' has fewer slots, e.g., none at all or only slots + // for tracking the overall match instead of all slots for every + // group. + self.slots_for_captures = core::cmp::max( + self.slots_per_state, + nfa.pattern_len().checked_mul(2).unwrap(), + ); + let len = nfa + .states() + .len() + .checked_mul(self.slots_per_state) + // Add space to account for scratch space used during a search. + .and_then(|x| x.checked_add(self.slots_for_captures)) + // It seems like this could actually panic on legitimate inputs on + // 32-bit targets, and very likely to panic on 16-bit. Should we + // somehow convert this to an error? What about something similar + // for the lazy DFA cache? If you're tripping this assert, please + // file a bug. + .expect("slot table length doesn't overflow"); + // This happens about as often as a regex is compiled, so it probably + // should be at debug level, but I found it quite distracting and not + // particularly useful. + trace!( + "resizing PikeVM active states table to {} entries \ + (slots_per_state={})", + len, + self.slots_per_state, + ); + self.table.resize(len, None); + } + + /// Return the heap memory usage, in bytes, used by this slot table. + /// + /// This does not include the stack size of this value. + fn memory_usage(&self) -> usize { + self.table.len() * core::mem::size_of::<Option<NonMaxUsize>>() + } + + /// Perform any per-search setup for this slot table. + /// + /// In particular, this sets the length of the number of slots used in the + /// 'Captures' given by the caller (if any at all). This number may be + /// smaller than the total number of slots available, e.g., when the caller + /// is only interested in tracking the overall match and not the spans of + /// every matching capturing group. Only tracking the overall match can + /// save a substantial amount of time copying capturing spans during a + /// search. + fn setup_search(&mut self, captures_slot_len: usize) { + self.slots_for_captures = captures_slot_len; + } + + /// Return a mutable slice of the slots for the given state. + /// + /// Note that the length of the slice returned may be less than the total + /// number of slots available for this state. In particular, the length + /// always matches the number of slots indicated via 'setup_search'. + fn for_state(&mut self, sid: StateID) -> &mut [Option<NonMaxUsize>] { + let i = sid.as_usize() * self.slots_per_state; + &mut self.table[i..i + self.slots_for_captures] + } + + /// Return a slice of slots of appropriate length where every slot offset + /// is guaranteed to be absent. This is useful in cases where you need to + /// compute an epsilon closure outside of the user supplied regex, and thus + /// never want it to have any capturing slots set. + fn all_absent(&mut self) -> &mut [Option<NonMaxUsize>] { + let i = self.table.len() - self.slots_for_captures; + &mut self.table[i..i + self.slots_for_captures] + } +} + +/// Represents a stack frame for use while computing an epsilon closure. +/// +/// (An "epsilon closure" refers to the set of reachable NFA states from a +/// single state without consuming any input. That is, the set of all epsilon +/// transitions not only from that single state, but from every other state +/// reachable by an epsilon transition as well. This is why it's called a +/// "closure." Computing an epsilon closure is also done during DFA +/// determinization! Compare and contrast the epsilon closure here in this +/// PikeVM and the one used for determinization in crate::util::determinize.) +/// +/// Computing the epsilon closure in a Thompson NFA proceeds via a depth +/// first traversal over all epsilon transitions from a particular state. +/// (A depth first traversal is important because it emulates the same priority +/// of matches that is typically found in backtracking regex engines.) This +/// depth first traversal is naturally expressed using recursion, but to avoid +/// a call stack size proportional to the size of a regex, we put our stack on +/// the heap instead. +/// +/// This stack thus consists of call frames. The typical call frame is +/// `Explore`, which instructs epsilon closure to explore the epsilon +/// transitions from that state. (Subsequent epsilon transitions are then +/// pushed on to the stack as more `Explore` frames.) If the state ID being +/// explored has no epsilon transitions, then the capturing group slots are +/// copied from the original state that sparked the epsilon closure (from the +/// 'step' routine) to the state ID being explored. This way, capturing group +/// slots are forwarded from the previous state to the next. +/// +/// The other stack frame, `RestoreCaptures`, instructs the epsilon closure to +/// set the position for a particular slot back to some particular offset. This +/// frame is pushed when `Explore` sees a `Capture` transition. `Explore` will +/// set the offset of the slot indicated in `Capture` to the current offset, +/// and then push the old offset on to the stack as a `RestoreCapture` frame. +/// Thus, the new offset is only used until the epsilon closure reverts back to +/// the `RestoreCapture` frame. In effect, this gives the `Capture` epsilon +/// transition its "scope" to only states that come "after" it during depth +/// first traversal. +#[derive(Clone, Debug)] +enum FollowEpsilon { + /// Explore the epsilon transitions from a state ID. + Explore(StateID), + /// Reset the given `slot` to the given `offset` (which might be `None`). + RestoreCapture { slot: SmallIndex, offset: Option<NonMaxUsize> }, +} + +/// A set of counters that "instruments" a PikeVM search. To enable this, you +/// must enable the 'internal-instrument-pikevm' feature. Then run your Rust +/// program with RUST_LOG=regex_automata::nfa::thompson::pikevm=trace set in +/// the environment. The metrics collected will be dumped automatically for +/// every search executed by the PikeVM. +/// +/// NOTE: When 'internal-instrument-pikevm' is enabled, it will likely cause an +/// absolute decrease in wall-clock performance, even if the 'trace' log level +/// isn't enabled. (Although, we do try to avoid extra costs when 'trace' isn't +/// enabled.) The main point of instrumentation is to get counts of various +/// events that occur during the PikeVM's execution. +/// +/// This is a somewhat hacked together collection of metrics that are useful +/// to gather from a PikeVM search. In particular, it lets us scrutinize the +/// performance profile of a search beyond what general purpose profiling tools +/// give us. Namely, we orient the profiling data around the specific states of +/// the NFA. +/// +/// In other words, this lets us see which parts of the NFA graph are most +/// frequently activated. This then provides direction for optimization +/// opportunities. +/// +/// The really sad part about this is that it absolutely clutters up the PikeVM +/// implementation. :'( Another approach would be to just manually add this +/// code in whenever I want this kind of profiling data, but it's complicated +/// and tedious enough that I went with this approach... for now. +/// +/// When instrumentation is enabled (which also turns on 'logging'), then a +/// `Counters` is initialized for every search and `trace`'d just before the +/// search returns to the caller. +/// +/// Tip: When debugging performance problems with the PikeVM, it's best to try +/// to work with an NFA that is as small as possible. Otherwise the state graph +/// is likely to be too big to digest. +#[cfg(feature = "internal-instrument-pikevm")] +#[derive(Clone, Debug)] +struct Counters { + /// The number of times the NFA is in a particular permutation of states. + state_sets: alloc::collections::BTreeMap<Vec<StateID>, u64>, + /// The number of times 'step' is called for a particular state ID (which + /// indexes this array). + steps: Vec<u64>, + /// The number of times an epsilon closure was computed for a state. + closures: Vec<u64>, + /// The number of times a particular state ID is pushed on to a stack while + /// computing an epsilon closure. + stack_pushes: Vec<u64>, + /// The number of times a particular state ID is inserted into a sparse set + /// while computing an epsilon closure. + set_inserts: Vec<u64>, +} + +#[cfg(feature = "internal-instrument-pikevm")] +impl Counters { + fn empty() -> Counters { + Counters { + state_sets: alloc::collections::BTreeMap::new(), + steps: vec![], + closures: vec![], + stack_pushes: vec![], + set_inserts: vec![], + } + } + + fn reset(&mut self, nfa: &NFA) { + let len = nfa.states().len(); + + self.state_sets.clear(); + + self.steps.clear(); + self.steps.resize(len, 0); + + self.closures.clear(); + self.closures.resize(len, 0); + + self.stack_pushes.clear(); + self.stack_pushes.resize(len, 0); + + self.set_inserts.clear(); + self.set_inserts.resize(len, 0); + } + + fn eprint(&self, nfa: &NFA) { + trace!("===== START PikeVM Instrumentation Output ====="); + // We take the top-K most occurring state sets. Otherwise the output + // is likely to be overwhelming. And we probably only care about the + // most frequently occurring ones anyway. + const LIMIT: usize = 20; + let mut set_counts = + self.state_sets.iter().collect::<Vec<(&Vec<StateID>, &u64)>>(); + set_counts.sort_by_key(|(_, &count)| core::cmp::Reverse(count)); + trace!("## PikeVM frequency of state sets (top {})", LIMIT); + for (set, count) in set_counts.iter().take(LIMIT) { + trace!("{:?}: {}", set, count); + } + if set_counts.len() > LIMIT { + trace!( + "... {} sets omitted (out of {} total)", + set_counts.len() - LIMIT, + set_counts.len(), + ); + } + + trace!(""); + trace!("## PikeVM total frequency of events"); + trace!( + "steps: {}, closures: {}, stack-pushes: {}, set-inserts: {}", + self.steps.iter().copied().sum::<u64>(), + self.closures.iter().copied().sum::<u64>(), + self.stack_pushes.iter().copied().sum::<u64>(), + self.set_inserts.iter().copied().sum::<u64>(), + ); + + trace!(""); + trace!("## PikeVM frequency of events broken down by state"); + for sid in 0..self.steps.len() { + trace!( + "{:06}: steps: {}, closures: {}, \ + stack-pushes: {}, set-inserts: {}", + sid, + self.steps[sid], + self.closures[sid], + self.stack_pushes[sid], + self.set_inserts[sid], + ); + } + + trace!(""); + trace!("## NFA debug display"); + trace!("{:?}", nfa); + trace!("===== END PikeVM Instrumentation Output ====="); + } + + fn record_state_set(&mut self, set: &SparseSet) { + let set = set.iter().collect::<Vec<StateID>>(); + *self.state_sets.entry(set).or_insert(0) += 1; + } + + fn record_step(&mut self, sid: StateID) { + self.steps[sid] += 1; + } + + fn record_closure(&mut self, sid: StateID) { + self.closures[sid] += 1; + } + + fn record_stack_push(&mut self, sid: StateID) { + self.stack_pushes[sid] += 1; + } + + fn record_set_insert(&mut self, sid: StateID) { + self.set_inserts[sid] += 1; + } +} diff --git a/third_party/rust/regex-automata/src/nfa/thompson/range_trie.rs b/third_party/rust/regex-automata/src/nfa/thompson/range_trie.rs new file mode 100644 index 0000000000..2d43a5b6f7 --- /dev/null +++ b/third_party/rust/regex-automata/src/nfa/thompson/range_trie.rs @@ -0,0 +1,1055 @@ +/* +I've called the primary data structure in this module a "range trie." As far +as I can tell, there is no prior art on a data structure like this, however, +it's likely someone somewhere has built something like it. Searching for +"range trie" turns up the paper "Range Tries for Scalable Address Lookup," +but it does not appear relevant. + +The range trie is just like a trie in that it is a special case of a +deterministic finite state machine. It has states and each state has a set +of transitions to other states. It is acyclic, and, like a normal trie, +it makes no attempt to reuse common suffixes among its elements. The key +difference between a normal trie and a range trie below is that a range trie +operates on *contiguous sequences* of bytes instead of singleton bytes. +One could say say that our alphabet is ranges of bytes instead of bytes +themselves, except a key part of range trie construction is splitting ranges +apart to ensure there is at most one transition that can be taken for any +byte in a given state. + +I've tried to explain the details of how the range trie works below, so +for now, we are left with trying to understand what problem we're trying to +solve. Which is itself fairly involved! + +At the highest level, here's what we want to do. We want to convert a +sequence of Unicode codepoints into a finite state machine whose transitions +are over *bytes* and *not* Unicode codepoints. We want this because it makes +said finite state machines much smaller and much faster to execute. As a +simple example, consider a byte oriented automaton for all Unicode scalar +values (0x00 through 0x10FFFF, not including surrogate codepoints): + + [00-7F] + [C2-DF][80-BF] + [E0-E0][A0-BF][80-BF] + [E1-EC][80-BF][80-BF] + [ED-ED][80-9F][80-BF] + [EE-EF][80-BF][80-BF] + [F0-F0][90-BF][80-BF][80-BF] + [F1-F3][80-BF][80-BF][80-BF] + [F4-F4][80-8F][80-BF][80-BF] + +(These byte ranges are generated via the regex-syntax::utf8 module, which +was based on Russ Cox's code in RE2, which was in turn based on Ken +Thompson's implementation of the same idea in his Plan9 implementation of +grep.) + +It should be fairly straight-forward to see how one could compile this into +a DFA. The sequences are sorted and non-overlapping. Essentially, you could +build a trie from this fairly easy. The problem comes when your initial +range (in this case, 0x00-0x10FFFF) isn't so nice. For example, the class +represented by '\w' contains only a tenth of the codepoints that +0x00-0x10FFFF contains, but if we were to write out the byte based ranges +as we did above, the list would stretch to 892 entries! This turns into +quite a large NFA with a few thousand states. Turning this beast into a DFA +takes quite a bit of time. We are thus left with trying to trim down the +number of states we produce as early as possible. + +One approach (used by RE2 and still by the regex crate, at time of writing) +is to try to find common suffixes while building NFA states for the above +and reuse them. This is very cheap to do and one can control precisely how +much extra memory you want to use for the cache. + +Another approach, however, is to reuse an algorithm for constructing a +*minimal* DFA from a sorted sequence of inputs. I don't want to go into +the full details here, but I explain it in more depth in my blog post on +FSTs[1]. Note that the algorithm was not invented by me, but was published +in paper by Daciuk et al. in 2000 called "Incremental Construction of +MinimalAcyclic Finite-State Automata." Like the suffix cache approach above, +it is also possible to control the amount of extra memory one uses, although +this usually comes with the cost of sacrificing true minimality. (But it's +typically close enough with a reasonably sized cache of states.) + +The catch is that Daciuk's algorithm only works if you add your keys in +lexicographic ascending order. In our case, since we're dealing with ranges, +we also need the additional requirement that ranges are either equivalent +or do not overlap at all. For example, if one were given the following byte +ranges: + + [BC-BF][80-BF] + [BC-BF][90-BF] + +Then Daciuk's algorithm would not work, since there is nothing to handle the +fact that the ranges overlap. They would need to be split apart. Thankfully, +Thompson's algorithm for producing byte ranges for Unicode codepoint ranges +meets both of our requirements. (A proof for this eludes me, but it appears +true.) + +... however, we would also like to be able to compile UTF-8 automata in +reverse. We want this because in order to find the starting location of a +match using a DFA, we need to run a second DFA---a reversed version of the +forward DFA---backwards to discover the match location. Unfortunately, if +we reverse our byte sequences for 0x00-0x10FFFF, we get sequences that are +can overlap, even if they are sorted: + + [00-7F] + [80-BF][80-9F][ED-ED] + [80-BF][80-BF][80-8F][F4-F4] + [80-BF][80-BF][80-BF][F1-F3] + [80-BF][80-BF][90-BF][F0-F0] + [80-BF][80-BF][E1-EC] + [80-BF][80-BF][EE-EF] + [80-BF][A0-BF][E0-E0] + [80-BF][C2-DF] + +For example, '[80-BF][80-BF][EE-EF]' and '[80-BF][A0-BF][E0-E0]' have +overlapping ranges between '[80-BF]' and '[A0-BF]'. Thus, there is no +simple way to apply Daciuk's algorithm. + +And thus, the range trie was born. The range trie's only purpose is to take +sequences of byte ranges like the ones above, collect them into a trie and then +spit them out in a sorted fashion with no overlapping ranges. For example, +0x00-0x10FFFF gets translated to: + + [0-7F] + [80-BF][80-9F][80-8F][F1-F3] + [80-BF][80-9F][80-8F][F4] + [80-BF][80-9F][90-BF][F0] + [80-BF][80-9F][90-BF][F1-F3] + [80-BF][80-9F][E1-EC] + [80-BF][80-9F][ED] + [80-BF][80-9F][EE-EF] + [80-BF][A0-BF][80-8F][F1-F3] + [80-BF][A0-BF][80-8F][F4] + [80-BF][A0-BF][90-BF][F0] + [80-BF][A0-BF][90-BF][F1-F3] + [80-BF][A0-BF][E0] + [80-BF][A0-BF][E1-EC] + [80-BF][A0-BF][EE-EF] + [80-BF][C2-DF] + +We've thus satisfied our requirements for running Daciuk's algorithm. All +sequences of ranges are sorted, and any corresponding ranges are either +exactly equivalent or non-overlapping. + +In effect, a range trie is building a DFA from a sequence of arbitrary byte +ranges. But it uses an algorithm custom tailored to its input, so it is not as +costly as traditional DFA construction. While it is still quite a bit more +costly than the forward case (which only needs Daciuk's algorithm), it winds +up saving a substantial amount of time if one is doing a full DFA powerset +construction later by virtue of producing a much much smaller NFA. + +[1] - https://blog.burntsushi.net/transducers/ +[2] - https://www.mitpressjournals.org/doi/pdfplus/10.1162/089120100561601 +*/ + +use core::{cell::RefCell, convert::TryFrom, fmt, mem, ops::RangeInclusive}; + +use alloc::{format, string::String, vec, vec::Vec}; + +use regex_syntax::utf8::Utf8Range; + +use crate::util::primitives::StateID; + +/// There is only one final state in this trie. Every sequence of byte ranges +/// added shares the same final state. +const FINAL: StateID = StateID::ZERO; + +/// The root state of the trie. +const ROOT: StateID = StateID::new_unchecked(1); + +/// A range trie represents an ordered set of sequences of bytes. +/// +/// A range trie accepts as input a sequence of byte ranges and merges +/// them into the existing set such that the trie can produce a sorted +/// non-overlapping sequence of byte ranges. The sequence emitted corresponds +/// precisely to the sequence of bytes matched by the given keys, although the +/// byte ranges themselves may be split at different boundaries. +/// +/// The order complexity of this data structure seems difficult to analyze. +/// If the size of a byte is held as a constant, then insertion is clearly +/// O(n) where n is the number of byte ranges in the input key. However, if +/// k=256 is our alphabet size, then insertion could be O(k^2 * n). In +/// particular it seems possible for pathological inputs to cause insertion +/// to do a lot of work. However, for what we use this data structure for, +/// there should be no pathological inputs since the ultimate source is always +/// a sorted set of Unicode scalar value ranges. +/// +/// Internally, this trie is setup like a finite state machine. Note though +/// that it is acyclic. +#[derive(Clone)] +pub struct RangeTrie { + /// The states in this trie. The first is always the shared final state. + /// The second is always the root state. Otherwise, there is no + /// particular order. + states: Vec<State>, + /// A free-list of states. When a range trie is cleared, all of its states + /// are added to this list. Creating a new state reuses states from this + /// list before allocating a new one. + free: Vec<State>, + /// A stack for traversing this trie to yield sequences of byte ranges in + /// lexicographic order. + iter_stack: RefCell<Vec<NextIter>>, + /// A buffer that stores the current sequence during iteration. + iter_ranges: RefCell<Vec<Utf8Range>>, + /// A stack used for traversing the trie in order to (deeply) duplicate + /// a state. States are recursively duplicated when ranges are split. + dupe_stack: Vec<NextDupe>, + /// A stack used for traversing the trie during insertion of a new + /// sequence of byte ranges. + insert_stack: Vec<NextInsert>, +} + +/// A single state in this trie. +#[derive(Clone)] +struct State { + /// A sorted sequence of non-overlapping transitions to other states. Each + /// transition corresponds to a single range of bytes. + transitions: Vec<Transition>, +} + +/// A transition is a single range of bytes. If a particular byte is in this +/// range, then the corresponding machine may transition to the state pointed +/// to by `next_id`. +#[derive(Clone)] +struct Transition { + /// The byte range. + range: Utf8Range, + /// The next state to transition to. + next_id: StateID, +} + +impl RangeTrie { + /// Create a new empty range trie. + pub fn new() -> RangeTrie { + let mut trie = RangeTrie { + states: vec![], + free: vec![], + iter_stack: RefCell::new(vec![]), + iter_ranges: RefCell::new(vec![]), + dupe_stack: vec![], + insert_stack: vec![], + }; + trie.clear(); + trie + } + + /// Clear this range trie such that it is empty. Clearing a range trie + /// and reusing it can beneficial because this may reuse allocations. + pub fn clear(&mut self) { + self.free.extend(self.states.drain(..)); + self.add_empty(); // final + self.add_empty(); // root + } + + /// Iterate over all of the sequences of byte ranges in this trie, and + /// call the provided function for each sequence. Iteration occurs in + /// lexicographic order. + pub fn iter<E, F: FnMut(&[Utf8Range]) -> Result<(), E>>( + &self, + mut f: F, + ) -> Result<(), E> { + let mut stack = self.iter_stack.borrow_mut(); + stack.clear(); + let mut ranges = self.iter_ranges.borrow_mut(); + ranges.clear(); + + // We do iteration in a way that permits us to use a single buffer + // for our keys. We iterate in a depth first fashion, while being + // careful to expand our frontier as we move deeper in the trie. + stack.push(NextIter { state_id: ROOT, tidx: 0 }); + while let Some(NextIter { mut state_id, mut tidx }) = stack.pop() { + // This could be implemented more simply without an inner loop + // here, but at the cost of more stack pushes. + loop { + let state = self.state(state_id); + // If we've visited all transitions in this state, then pop + // back to the parent state. + if tidx >= state.transitions.len() { + ranges.pop(); + break; + } + + let t = &state.transitions[tidx]; + ranges.push(t.range); + if t.next_id == FINAL { + f(&ranges)?; + ranges.pop(); + tidx += 1; + } else { + // Expand our frontier. Once we come back to this state + // via the stack, start in on the next transition. + stack.push(NextIter { state_id, tidx: tidx + 1 }); + // Otherwise, move to the first transition of the next + // state. + state_id = t.next_id; + tidx = 0; + } + } + } + Ok(()) + } + + /// Inserts a new sequence of ranges into this trie. + /// + /// The sequence given must be non-empty and must not have a length + /// exceeding 4. + pub fn insert(&mut self, ranges: &[Utf8Range]) { + assert!(!ranges.is_empty()); + assert!(ranges.len() <= 4); + + let mut stack = mem::replace(&mut self.insert_stack, vec![]); + stack.clear(); + + stack.push(NextInsert::new(ROOT, ranges)); + while let Some(next) = stack.pop() { + let (state_id, ranges) = (next.state_id(), next.ranges()); + assert!(!ranges.is_empty()); + + let (mut new, rest) = (ranges[0], &ranges[1..]); + + // i corresponds to the position of the existing transition on + // which we are operating. Typically, the result is to remove the + // transition and replace it with two or more new transitions + // corresponding to the partitions generated by splitting the + // 'new' with the ith transition's range. + let mut i = self.state(state_id).find(new); + + // In this case, there is no overlap *and* the new range is greater + // than all existing ranges. So we can just add it to the end. + if i == self.state(state_id).transitions.len() { + let next_id = NextInsert::push(self, &mut stack, rest); + self.add_transition(state_id, new, next_id); + continue; + } + + // The need for this loop is a bit subtle, buf basically, after + // we've handled the partitions from our initial split, it's + // possible that there will be a partition leftover that overlaps + // with a subsequent transition. If so, then we have to repeat + // the split process again with the leftovers and that subsequent + // transition. + 'OUTER: loop { + let old = self.state(state_id).transitions[i].clone(); + let split = match Split::new(old.range, new) { + Some(split) => split, + None => { + let next_id = NextInsert::push(self, &mut stack, rest); + self.add_transition_at(i, state_id, new, next_id); + continue; + } + }; + let splits = split.as_slice(); + // If we only have one partition, then the ranges must be + // equivalent. There's nothing to do here for this state, so + // just move on to the next one. + if splits.len() == 1 { + // ... but only if we have anything left to do. + if !rest.is_empty() { + stack.push(NextInsert::new(old.next_id, rest)); + } + break; + } + // At this point, we know that 'split' is non-empty and there + // must be some overlap AND that the two ranges are not + // equivalent. Therefore, the existing range MUST be removed + // and split up somehow. Instead of actually doing the removal + // and then a subsequent insertion---with all the memory + // shuffling that entails---we simply overwrite the transition + // at position `i` for the first new transition we want to + // insert. After that, we're forced to do expensive inserts. + let mut first = true; + let mut add_trans = + |trie: &mut RangeTrie, pos, from, range, to| { + if first { + trie.set_transition_at(pos, from, range, to); + first = false; + } else { + trie.add_transition_at(pos, from, range, to); + } + }; + for (j, &srange) in splits.iter().enumerate() { + match srange { + SplitRange::Old(r) => { + // Deep clone the state pointed to by the ith + // transition. This is always necessary since 'old' + // is always coupled with at least a 'both' + // partition. We don't want any new changes made + // via the 'both' partition to impact the part of + // the transition that doesn't overlap with the + // new range. + let dup_id = self.duplicate(old.next_id); + add_trans(self, i, state_id, r, dup_id); + } + SplitRange::New(r) => { + // This is a bit subtle, but if this happens to be + // the last partition in our split, it is possible + // that this overlaps with a subsequent transition. + // If it does, then we must repeat the whole + // splitting process over again with `r` and the + // subsequent transition. + { + let trans = &self.state(state_id).transitions; + if j + 1 == splits.len() + && i < trans.len() + && intersects(r, trans[i].range) + { + new = r; + continue 'OUTER; + } + } + + // ... otherwise, setup exploration for a new + // empty state and add a brand new transition for + // this new range. + let next_id = + NextInsert::push(self, &mut stack, rest); + add_trans(self, i, state_id, r, next_id); + } + SplitRange::Both(r) => { + // Continue adding the remaining ranges on this + // path and update the transition with the new + // range. + if !rest.is_empty() { + stack.push(NextInsert::new(old.next_id, rest)); + } + add_trans(self, i, state_id, r, old.next_id); + } + } + i += 1; + } + // If we've reached this point, then we know that there are + // no subsequent transitions with any overlap. Therefore, we + // can stop processing this range and move on to the next one. + break; + } + } + self.insert_stack = stack; + } + + pub fn add_empty(&mut self) -> StateID { + let id = match StateID::try_from(self.states.len()) { + Ok(id) => id, + Err(_) => { + // This generally should not happen since a range trie is + // only ever used to compile a single sequence of Unicode + // scalar values. If we ever got to this point, we would, at + // *minimum*, be using 96GB in just the range trie alone. + panic!("too many sequences added to range trie"); + } + }; + // If we have some free states available, then use them to avoid + // more allocations. + if let Some(mut state) = self.free.pop() { + state.clear(); + self.states.push(state); + } else { + self.states.push(State { transitions: vec![] }); + } + id + } + + /// Performs a deep clone of the given state and returns the duplicate's + /// state ID. + /// + /// A "deep clone" in this context means that the state given along with + /// recursively all states that it points to are copied. Once complete, + /// the given state ID and the returned state ID share nothing. + /// + /// This is useful during range trie insertion when a new range overlaps + /// with an existing range that is bigger than the new one. The part + /// of the existing range that does *not* overlap with the new one is + /// duplicated so that adding the new range to the overlap doesn't disturb + /// the non-overlapping portion. + /// + /// There's one exception: if old_id is the final state, then it is not + /// duplicated and the same final state is returned. This is because all + /// final states in this trie are equivalent. + fn duplicate(&mut self, old_id: StateID) -> StateID { + if old_id == FINAL { + return FINAL; + } + + let mut stack = mem::replace(&mut self.dupe_stack, vec![]); + stack.clear(); + + let new_id = self.add_empty(); + // old_id is the state we're cloning and new_id is the ID of the + // duplicated state for old_id. + stack.push(NextDupe { old_id, new_id }); + while let Some(NextDupe { old_id, new_id }) = stack.pop() { + for i in 0..self.state(old_id).transitions.len() { + let t = self.state(old_id).transitions[i].clone(); + if t.next_id == FINAL { + // All final states are the same, so there's no need to + // duplicate it. + self.add_transition(new_id, t.range, FINAL); + continue; + } + + let new_child_id = self.add_empty(); + self.add_transition(new_id, t.range, new_child_id); + stack.push(NextDupe { + old_id: t.next_id, + new_id: new_child_id, + }); + } + } + self.dupe_stack = stack; + new_id + } + + /// Adds the given transition to the given state. + /// + /// Callers must ensure that all previous transitions in this state + /// are lexicographically smaller than the given range. + fn add_transition( + &mut self, + from_id: StateID, + range: Utf8Range, + next_id: StateID, + ) { + self.state_mut(from_id) + .transitions + .push(Transition { range, next_id }); + } + + /// Like `add_transition`, except this inserts the transition just before + /// the ith transition. + fn add_transition_at( + &mut self, + i: usize, + from_id: StateID, + range: Utf8Range, + next_id: StateID, + ) { + self.state_mut(from_id) + .transitions + .insert(i, Transition { range, next_id }); + } + + /// Overwrites the transition at position i with the given transition. + fn set_transition_at( + &mut self, + i: usize, + from_id: StateID, + range: Utf8Range, + next_id: StateID, + ) { + self.state_mut(from_id).transitions[i] = Transition { range, next_id }; + } + + /// Return an immutable borrow for the state with the given ID. + fn state(&self, id: StateID) -> &State { + &self.states[id] + } + + /// Return a mutable borrow for the state with the given ID. + fn state_mut(&mut self, id: StateID) -> &mut State { + &mut self.states[id] + } +} + +impl State { + /// Find the position at which the given range should be inserted in this + /// state. + /// + /// The position returned is always in the inclusive range + /// [0, transitions.len()]. If 'transitions.len()' is returned, then the + /// given range overlaps with no other range in this state *and* is greater + /// than all of them. + /// + /// For all other possible positions, the given range either overlaps + /// with the transition at that position or is otherwise less than it + /// with no overlap (and is greater than the previous transition). In the + /// former case, careful attention must be paid to inserting this range + /// as a new transition. In the latter case, the range can be inserted as + /// a new transition at the given position without disrupting any other + /// transitions. + fn find(&self, range: Utf8Range) -> usize { + /// Returns the position `i` at which `pred(xs[i])` first returns true + /// such that for all `j >= i`, `pred(xs[j]) == true`. If `pred` never + /// returns true, then `xs.len()` is returned. + /// + /// We roll our own binary search because it doesn't seem like the + /// standard library's binary search can be used here. Namely, if + /// there is an overlapping range, then we want to find the first such + /// occurrence, but there may be many. Or at least, it's not quite + /// clear to me how to do it. + fn binary_search<T, F>(xs: &[T], mut pred: F) -> usize + where + F: FnMut(&T) -> bool, + { + let (mut left, mut right) = (0, xs.len()); + while left < right { + // Overflow is impossible because xs.len() <= 256. + let mid = (left + right) / 2; + if pred(&xs[mid]) { + right = mid; + } else { + left = mid + 1; + } + } + left + } + + // Benchmarks suggest that binary search is just a bit faster than + // straight linear search. Specifically when using the debug tool: + // + // hyperfine "regex-cli debug nfa thompson --quiet --reverse '\w{90} ecurB'" + binary_search(&self.transitions, |t| range.start <= t.range.end) + } + + /// Clear this state such that it has zero transitions. + fn clear(&mut self) { + self.transitions.clear(); + } +} + +/// The next state to process during duplication. +#[derive(Clone, Debug)] +struct NextDupe { + /// The state we want to duplicate. + old_id: StateID, + /// The ID of the new state that is a duplicate of old_id. + new_id: StateID, +} + +/// The next state (and its corresponding transition) that we want to visit +/// during iteration in lexicographic order. +#[derive(Clone, Debug)] +struct NextIter { + state_id: StateID, + tidx: usize, +} + +/// The next state to process during insertion and any remaining ranges that we +/// want to add for a particular sequence of ranges. The first such instance +/// is always the root state along with all ranges given. +#[derive(Clone, Debug)] +struct NextInsert { + /// The next state to begin inserting ranges. This state should be the + /// state at which `ranges[0]` should be inserted. + state_id: StateID, + /// The ranges to insert. We used a fixed-size array here to avoid an + /// allocation. + ranges: [Utf8Range; 4], + /// The number of valid ranges in the above array. + len: u8, +} + +impl NextInsert { + /// Create the next item to visit. The given state ID should correspond + /// to the state at which the first range in the given slice should be + /// inserted. The slice given must not be empty and it must be no longer + /// than 4. + fn new(state_id: StateID, ranges: &[Utf8Range]) -> NextInsert { + let len = ranges.len(); + assert!(len > 0); + assert!(len <= 4); + + let mut tmp = [Utf8Range { start: 0, end: 0 }; 4]; + tmp[..len].copy_from_slice(ranges); + NextInsert { state_id, ranges: tmp, len: u8::try_from(len).unwrap() } + } + + /// Push a new empty state to visit along with any remaining ranges that + /// still need to be inserted. The ID of the new empty state is returned. + /// + /// If ranges is empty, then no new state is created and FINAL is returned. + fn push( + trie: &mut RangeTrie, + stack: &mut Vec<NextInsert>, + ranges: &[Utf8Range], + ) -> StateID { + if ranges.is_empty() { + FINAL + } else { + let next_id = trie.add_empty(); + stack.push(NextInsert::new(next_id, ranges)); + next_id + } + } + + /// Return the ID of the state to visit. + fn state_id(&self) -> StateID { + self.state_id + } + + /// Return the remaining ranges to insert. + fn ranges(&self) -> &[Utf8Range] { + &self.ranges[..usize::try_from(self.len).unwrap()] + } +} + +/// Split represents a partitioning of two ranges into one or more ranges. This +/// is the secret sauce that makes a range trie work, as it's what tells us +/// how to deal with two overlapping but unequal ranges during insertion. +/// +/// Essentially, either two ranges overlap or they don't. If they don't, then +/// handling insertion is easy: just insert the new range into its +/// lexicographically correct position. Since it does not overlap with anything +/// else, no other transitions are impacted by the new range. +/// +/// If they do overlap though, there are generally three possible cases to +/// handle: +/// +/// 1. The part where the two ranges actually overlap. i.e., The intersection. +/// 2. The part of the existing range that is not in the the new range. +/// 3. The part of the new range that is not in the old range. +/// +/// (1) is guaranteed to always occur since all overlapping ranges have a +/// non-empty intersection. If the two ranges are not equivalent, then at +/// least one of (2) or (3) is guaranteed to occur as well. In some cases, +/// e.g., `[0-4]` and `[4-9]`, all three cases will occur. +/// +/// This `Split` type is responsible for providing (1), (2) and (3) for any +/// possible pair of byte ranges. +/// +/// As for insertion, for the overlap in (1), the remaining ranges to insert +/// should be added by following the corresponding transition. However, this +/// should only be done for the overlapping parts of the range. If there was +/// a part of the existing range that was not in the new range, then that +/// existing part must be split off from the transition and duplicated. The +/// remaining parts of the overlap can then be added to using the new ranges +/// without disturbing the existing range. +/// +/// Handling the case for the part of a new range that is not in an existing +/// range is seemingly easy. Just treat it as if it were a non-overlapping +/// range. The problem here is that if this new non-overlapping range occurs +/// after both (1) and (2), then it's possible that it can overlap with the +/// next transition in the current state. If it does, then the whole process +/// must be repeated! +/// +/// # Details of the 3 cases +/// +/// The following details the various cases that are implemented in code +/// below. It's plausible that the number of cases is not actually minimal, +/// but it's important for this code to remain at least somewhat readable. +/// +/// Given [a,b] and [x,y], where a <= b, x <= y, b < 256 and y < 256, we define +/// the follow distinct relationships where at least one must apply. The order +/// of these matters, since multiple can match. The first to match applies. +/// +/// 1. b < x <=> [a,b] < [x,y] +/// 2. y < a <=> [x,y] < [a,b] +/// +/// In the case of (1) and (2), these are the only cases where there is no +/// overlap. Or otherwise, the intersection of [a,b] and [x,y] is empty. In +/// order to compute the intersection, one can do [max(a,x), min(b,y)]. The +/// intersection in all of the following cases is non-empty. +/// +/// 3. a = x && b = y <=> [a,b] == [x,y] +/// 4. a = x && b < y <=> [x,y] right-extends [a,b] +/// 5. b = y && a > x <=> [x,y] left-extends [a,b] +/// 6. x = a && y < b <=> [a,b] right-extends [x,y] +/// 7. y = b && x > a <=> [a,b] left-extends [x,y] +/// 8. a > x && b < y <=> [x,y] covers [a,b] +/// 9. x > a && y < b <=> [a,b] covers [x,y] +/// 10. b = x && a < y <=> [a,b] is left-adjacent to [x,y] +/// 11. y = a && x < b <=> [x,y] is left-adjacent to [a,b] +/// 12. b > x && b < y <=> [a,b] left-overlaps [x,y] +/// 13. y > a && y < b <=> [x,y] left-overlaps [a,b] +/// +/// In cases 3-13, we can form rules that partition the ranges into a +/// non-overlapping ordered sequence of ranges: +/// +/// 3. [a,b] +/// 4. [a,b], [b+1,y] +/// 5. [x,a-1], [a,b] +/// 6. [x,y], [y+1,b] +/// 7. [a,x-1], [x,y] +/// 8. [x,a-1], [a,b], [b+1,y] +/// 9. [a,x-1], [x,y], [y+1,b] +/// 10. [a,b-1], [b,b], [b+1,y] +/// 11. [x,y-1], [y,y], [y+1,b] +/// 12. [a,x-1], [x,b], [b+1,y] +/// 13. [x,a-1], [a,y], [y+1,b] +/// +/// In the code below, we go a step further and identify each of the above +/// outputs as belonging either to the overlap of the two ranges or to one +/// of [a,b] or [x,y] exclusively. +#[derive(Clone, Debug, Eq, PartialEq)] +struct Split { + partitions: [SplitRange; 3], + len: usize, +} + +/// A tagged range indicating how it was derived from a pair of ranges. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +enum SplitRange { + Old(Utf8Range), + New(Utf8Range), + Both(Utf8Range), +} + +impl Split { + /// Create a partitioning of the given ranges. + /// + /// If the given ranges have an empty intersection, then None is returned. + fn new(o: Utf8Range, n: Utf8Range) -> Option<Split> { + let range = |r: RangeInclusive<u8>| Utf8Range { + start: *r.start(), + end: *r.end(), + }; + let old = |r| SplitRange::Old(range(r)); + let new = |r| SplitRange::New(range(r)); + let both = |r| SplitRange::Both(range(r)); + + // Use same names as the comment above to make it easier to compare. + let (a, b, x, y) = (o.start, o.end, n.start, n.end); + + if b < x || y < a { + // case 1, case 2 + None + } else if a == x && b == y { + // case 3 + Some(Split::parts1(both(a..=b))) + } else if a == x && b < y { + // case 4 + Some(Split::parts2(both(a..=b), new(b + 1..=y))) + } else if b == y && a > x { + // case 5 + Some(Split::parts2(new(x..=a - 1), both(a..=b))) + } else if x == a && y < b { + // case 6 + Some(Split::parts2(both(x..=y), old(y + 1..=b))) + } else if y == b && x > a { + // case 7 + Some(Split::parts2(old(a..=x - 1), both(x..=y))) + } else if a > x && b < y { + // case 8 + Some(Split::parts3(new(x..=a - 1), both(a..=b), new(b + 1..=y))) + } else if x > a && y < b { + // case 9 + Some(Split::parts3(old(a..=x - 1), both(x..=y), old(y + 1..=b))) + } else if b == x && a < y { + // case 10 + Some(Split::parts3(old(a..=b - 1), both(b..=b), new(b + 1..=y))) + } else if y == a && x < b { + // case 11 + Some(Split::parts3(new(x..=y - 1), both(y..=y), old(y + 1..=b))) + } else if b > x && b < y { + // case 12 + Some(Split::parts3(old(a..=x - 1), both(x..=b), new(b + 1..=y))) + } else if y > a && y < b { + // case 13 + Some(Split::parts3(new(x..=a - 1), both(a..=y), old(y + 1..=b))) + } else { + unreachable!() + } + } + + /// Create a new split with a single partition. This only occurs when two + /// ranges are equivalent. + fn parts1(r1: SplitRange) -> Split { + // This value doesn't matter since it is never accessed. + let nada = SplitRange::Old(Utf8Range { start: 0, end: 0 }); + Split { partitions: [r1, nada, nada], len: 1 } + } + + /// Create a new split with two partitions. + fn parts2(r1: SplitRange, r2: SplitRange) -> Split { + // This value doesn't matter since it is never accessed. + let nada = SplitRange::Old(Utf8Range { start: 0, end: 0 }); + Split { partitions: [r1, r2, nada], len: 2 } + } + + /// Create a new split with three partitions. + fn parts3(r1: SplitRange, r2: SplitRange, r3: SplitRange) -> Split { + Split { partitions: [r1, r2, r3], len: 3 } + } + + /// Return the partitions in this split as a slice. + fn as_slice(&self) -> &[SplitRange] { + &self.partitions[..self.len] + } +} + +impl fmt::Debug for RangeTrie { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + writeln!(f, "")?; + for (i, state) in self.states.iter().enumerate() { + let status = if i == FINAL.as_usize() { '*' } else { ' ' }; + writeln!(f, "{}{:06}: {:?}", status, i, state)?; + } + Ok(()) + } +} + +impl fmt::Debug for State { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let rs = self + .transitions + .iter() + .map(|t| format!("{:?}", t)) + .collect::<Vec<String>>() + .join(", "); + write!(f, "{}", rs) + } +} + +impl fmt::Debug for Transition { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + if self.range.start == self.range.end { + write!( + f, + "{:02X} => {:02X}", + self.range.start, + self.next_id.as_usize(), + ) + } else { + write!( + f, + "{:02X}-{:02X} => {:02X}", + self.range.start, + self.range.end, + self.next_id.as_usize(), + ) + } + } +} + +/// Returns true if and only if the given ranges intersect. +fn intersects(r1: Utf8Range, r2: Utf8Range) -> bool { + !(r1.end < r2.start || r2.end < r1.start) +} + +#[cfg(test)] +mod tests { + use core::ops::RangeInclusive; + + use regex_syntax::utf8::Utf8Range; + + use super::*; + + fn r(range: RangeInclusive<u8>) -> Utf8Range { + Utf8Range { start: *range.start(), end: *range.end() } + } + + fn split_maybe( + old: RangeInclusive<u8>, + new: RangeInclusive<u8>, + ) -> Option<Split> { + Split::new(r(old), r(new)) + } + + fn split( + old: RangeInclusive<u8>, + new: RangeInclusive<u8>, + ) -> Vec<SplitRange> { + split_maybe(old, new).unwrap().as_slice().to_vec() + } + + #[test] + fn no_splits() { + // case 1 + assert_eq!(None, split_maybe(0..=1, 2..=3)); + // case 2 + assert_eq!(None, split_maybe(2..=3, 0..=1)); + } + + #[test] + fn splits() { + let range = |r: RangeInclusive<u8>| Utf8Range { + start: *r.start(), + end: *r.end(), + }; + let old = |r| SplitRange::Old(range(r)); + let new = |r| SplitRange::New(range(r)); + let both = |r| SplitRange::Both(range(r)); + + // case 3 + assert_eq!(split(0..=0, 0..=0), vec![both(0..=0)]); + assert_eq!(split(9..=9, 9..=9), vec![both(9..=9)]); + + // case 4 + assert_eq!(split(0..=5, 0..=6), vec![both(0..=5), new(6..=6)]); + assert_eq!(split(0..=5, 0..=8), vec![both(0..=5), new(6..=8)]); + assert_eq!(split(5..=5, 5..=8), vec![both(5..=5), new(6..=8)]); + + // case 5 + assert_eq!(split(1..=5, 0..=5), vec![new(0..=0), both(1..=5)]); + assert_eq!(split(3..=5, 0..=5), vec![new(0..=2), both(3..=5)]); + assert_eq!(split(5..=5, 0..=5), vec![new(0..=4), both(5..=5)]); + + // case 6 + assert_eq!(split(0..=6, 0..=5), vec![both(0..=5), old(6..=6)]); + assert_eq!(split(0..=8, 0..=5), vec![both(0..=5), old(6..=8)]); + assert_eq!(split(5..=8, 5..=5), vec![both(5..=5), old(6..=8)]); + + // case 7 + assert_eq!(split(0..=5, 1..=5), vec![old(0..=0), both(1..=5)]); + assert_eq!(split(0..=5, 3..=5), vec![old(0..=2), both(3..=5)]); + assert_eq!(split(0..=5, 5..=5), vec![old(0..=4), both(5..=5)]); + + // case 8 + assert_eq!( + split(3..=6, 2..=7), + vec![new(2..=2), both(3..=6), new(7..=7)], + ); + assert_eq!( + split(3..=6, 1..=8), + vec![new(1..=2), both(3..=6), new(7..=8)], + ); + + // case 9 + assert_eq!( + split(2..=7, 3..=6), + vec![old(2..=2), both(3..=6), old(7..=7)], + ); + assert_eq!( + split(1..=8, 3..=6), + vec![old(1..=2), both(3..=6), old(7..=8)], + ); + + // case 10 + assert_eq!( + split(3..=6, 6..=7), + vec![old(3..=5), both(6..=6), new(7..=7)], + ); + assert_eq!( + split(3..=6, 6..=8), + vec![old(3..=5), both(6..=6), new(7..=8)], + ); + assert_eq!( + split(5..=6, 6..=7), + vec![old(5..=5), both(6..=6), new(7..=7)], + ); + + // case 11 + assert_eq!( + split(6..=7, 3..=6), + vec![new(3..=5), both(6..=6), old(7..=7)], + ); + assert_eq!( + split(6..=8, 3..=6), + vec![new(3..=5), both(6..=6), old(7..=8)], + ); + assert_eq!( + split(6..=7, 5..=6), + vec![new(5..=5), both(6..=6), old(7..=7)], + ); + + // case 12 + assert_eq!( + split(3..=7, 5..=9), + vec![old(3..=4), both(5..=7), new(8..=9)], + ); + assert_eq!( + split(3..=5, 4..=6), + vec![old(3..=3), both(4..=5), new(6..=6)], + ); + + // case 13 + assert_eq!( + split(5..=9, 3..=7), + vec![new(3..=4), both(5..=7), old(8..=9)], + ); + assert_eq!( + split(4..=6, 3..=5), + vec![new(3..=3), both(4..=5), old(6..=6)], + ); + } + + // Arguably there should be more tests here, but in practice, this data + // structure is well covered by the huge number of regex tests. +} diff --git a/third_party/rust/regex-automata/src/util/alphabet.rs b/third_party/rust/regex-automata/src/util/alphabet.rs new file mode 100644 index 0000000000..22b5a76446 --- /dev/null +++ b/third_party/rust/regex-automata/src/util/alphabet.rs @@ -0,0 +1,1139 @@ +/*! +This module provides APIs for dealing with the alphabets of finite state +machines. + +There are two principal types in this module, [`ByteClasses`] and [`Unit`]. +The former defines the alphabet of a finite state machine while the latter +represents an element of that alphabet. + +To a first approximation, the alphabet of all automata in this crate is just +a `u8`. Namely, every distinct byte value. All 256 of them. In practice, this +can be quite wasteful when building a transition table for a DFA, since it +requires storing a state identifier for each element in the alphabet. Instead, +we collapse the alphabet of an automaton down into equivalence classes, where +every byte in the same equivalence class never discriminates between a match or +a non-match from any other byte in the same class. For example, in the regex +`[a-z]+`, then you could consider it having an alphabet consisting of two +equivalence classes: `a-z` and everything else. In terms of the transitions on +an automaton, it doesn't actually require representing every distinct byte. +Just the equivalence classes. + +The downside of equivalence classes is that, of course, searching a haystack +deals with individual byte values. Those byte values need to be mapped to +their corresponding equivalence class. This is what `ByteClasses` does. In +practice, doing this for every state transition has negligible impact on modern +CPUs. Moreover, it helps make more efficient use of the CPU cache by (possibly +considerably) shrinking the size of the transition table. + +One last hiccup concerns `Unit`. Namely, because of look-around and how the +DFAs in this crate work, we need to add a sentinel value to our alphabet +of equivalence classes that represents the "end" of a search. We call that +sentinel [`Unit::eoi`] or "end of input." Thus, a `Unit` is either an +equivalence class corresponding to a set of bytes, or it is a special "end of +input" sentinel. + +In general, you should not expect to need either of these types unless you're +doing lower level shenanigans with DFAs, or even building your own DFAs. +(Although, you don't have to use these types to build your own DFAs of course.) +For example, if you're walking a DFA's state graph, it's probably useful to +make use of [`ByteClasses`] to visit each element in the DFA's alphabet instead +of just visiting every distinct `u8` value. The latter isn't necessarily wrong, +but it could be potentially very wasteful. +*/ +use crate::util::{ + escape::DebugByte, + wire::{self, DeserializeError, SerializeError}, +}; + +/// Unit represents a single unit of haystack for DFA based regex engines. +/// +/// It is not expected for consumers of this crate to need to use this type +/// unless they are implementing their own DFA. And even then, it's not +/// required: implementors may use other techniques to handle haystack units. +/// +/// Typically, a single unit of haystack for a DFA would be a single byte. +/// However, for the DFAs in this crate, matches are delayed by a single byte +/// in order to handle look-ahead assertions (`\b`, `$` and `\z`). Thus, once +/// we have consumed the haystack, we must run the DFA through one additional +/// transition using a unit that indicates the haystack has ended. +/// +/// There is no way to represent a sentinel with a `u8` since all possible +/// values *may* be valid haystack units to a DFA, therefore this type +/// explicitly adds room for a sentinel value. +/// +/// The sentinel EOI value is always its own equivalence class and is +/// ultimately represented by adding 1 to the maximum equivalence class value. +/// So for example, the regex `^[a-z]+$` might be split into the following +/// equivalence classes: +/// +/// ```text +/// 0 => [\x00-`] +/// 1 => [a-z] +/// 2 => [{-\xFF] +/// 3 => [EOI] +/// ``` +/// +/// Where EOI is the special sentinel value that is always in its own +/// singleton equivalence class. +#[derive(Clone, Copy, Eq, PartialEq, PartialOrd, Ord)] +pub struct Unit(UnitKind); + +#[derive(Clone, Copy, Eq, PartialEq, PartialOrd, Ord)] +enum UnitKind { + /// Represents a byte value, or more typically, an equivalence class + /// represented as a byte value. + U8(u8), + /// Represents the "end of input" sentinel. We regretably use a `u16` + /// here since the maximum sentinel value is `256`. Thankfully, we don't + /// actually store a `Unit` anywhere, so this extra space shouldn't be too + /// bad. + EOI(u16), +} + +impl Unit { + /// Create a new haystack unit from a byte value. + /// + /// All possible byte values are legal. However, when creating a haystack + /// unit for a specific DFA, one should be careful to only construct units + /// that are in that DFA's alphabet. Namely, one way to compact a DFA's + /// in-memory representation is to collapse its transitions to a set of + /// equivalence classes into a set of all possible byte values. If a DFA + /// uses equivalence classes instead of byte values, then the byte given + /// here should be the equivalence class. + pub fn u8(byte: u8) -> Unit { + Unit(UnitKind::U8(byte)) + } + + /// Create a new "end of input" haystack unit. + /// + /// The value given is the sentinel value used by this unit to represent + /// the "end of input." The value should be the total number of equivalence + /// classes in the corresponding alphabet. Its maximum value is `256`, + /// which occurs when every byte is its own equivalence class. + /// + /// # Panics + /// + /// This panics when `num_byte_equiv_classes` is greater than `256`. + pub fn eoi(num_byte_equiv_classes: usize) -> Unit { + assert!( + num_byte_equiv_classes <= 256, + "max number of byte-based equivalent classes is 256, but got {}", + num_byte_equiv_classes, + ); + Unit(UnitKind::EOI(u16::try_from(num_byte_equiv_classes).unwrap())) + } + + /// If this unit is not an "end of input" sentinel, then returns its + /// underlying byte value. Otherwise return `None`. + pub fn as_u8(self) -> Option<u8> { + match self.0 { + UnitKind::U8(b) => Some(b), + UnitKind::EOI(_) => None, + } + } + + /// If this unit is an "end of input" sentinel, then return the underlying + /// sentinel value that was given to [`Unit::eoi`]. Otherwise return + /// `None`. + pub fn as_eoi(self) -> Option<u16> { + match self.0 { + UnitKind::U8(_) => None, + UnitKind::EOI(sentinel) => Some(sentinel), + } + } + + /// Return this unit as a `usize`, regardless of whether it is a byte value + /// or an "end of input" sentinel. In the latter case, the underlying + /// sentinel value given to [`Unit::eoi`] is returned. + pub fn as_usize(self) -> usize { + match self.0 { + UnitKind::U8(b) => usize::from(b), + UnitKind::EOI(eoi) => usize::from(eoi), + } + } + + /// Returns true if and only of this unit is a byte value equivalent to the + /// byte given. This always returns false when this is an "end of input" + /// sentinel. + pub fn is_byte(self, byte: u8) -> bool { + self.as_u8().map_or(false, |b| b == byte) + } + + /// Returns true when this unit represents an "end of input" sentinel. + pub fn is_eoi(self) -> bool { + self.as_eoi().is_some() + } + + /// Returns true when this unit corresponds to an ASCII word byte. + /// + /// This always returns false when this unit represents an "end of input" + /// sentinel. + pub fn is_word_byte(self) -> bool { + self.as_u8().map_or(false, crate::util::utf8::is_word_byte) + } +} + +impl core::fmt::Debug for Unit { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + match self.0 { + UnitKind::U8(b) => write!(f, "{:?}", DebugByte(b)), + UnitKind::EOI(_) => write!(f, "EOI"), + } + } +} + +/// A representation of byte oriented equivalence classes. +/// +/// This is used in a DFA to reduce the size of the transition table. This can +/// have a particularly large impact not only on the total size of a dense DFA, +/// but also on compile times. +/// +/// The essential idea here is that the alphabet of a DFA is shrunk from the +/// usual 256 distinct byte values down to a set of equivalence classes. The +/// guarantee you get is that any byte belonging to the same equivalence class +/// can be treated as if it were any other byte in the same class, and the +/// result of a search wouldn't change. +/// +/// # Example +/// +/// This example shows how to get byte classes from an +/// [`NFA`](crate::nfa::thompson::NFA) and ask for the class of various bytes. +/// +/// ``` +/// use regex_automata::nfa::thompson::NFA; +/// +/// let nfa = NFA::new("[a-z]+")?; +/// let classes = nfa.byte_classes(); +/// // 'a' and 'z' are in the same class for this regex. +/// assert_eq!(classes.get(b'a'), classes.get(b'z')); +/// // But 'a' and 'A' are not. +/// assert_ne!(classes.get(b'a'), classes.get(b'A')); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +#[derive(Clone, Copy)] +pub struct ByteClasses([u8; 256]); + +impl ByteClasses { + /// Creates a new set of equivalence classes where all bytes are mapped to + /// the same class. + #[inline] + pub fn empty() -> ByteClasses { + ByteClasses([0; 256]) + } + + /// Creates a new set of equivalence classes where each byte belongs to + /// its own equivalence class. + #[inline] + pub fn singletons() -> ByteClasses { + let mut classes = ByteClasses::empty(); + for b in 0..=255 { + classes.set(b, b); + } + classes + } + + /// Deserializes a byte class map from the given slice. If the slice is of + /// insufficient length or otherwise contains an impossible mapping, then + /// an error is returned. Upon success, the number of bytes read along with + /// the map are returned. The number of bytes read is always a multiple of + /// 8. + pub(crate) fn from_bytes( + slice: &[u8], + ) -> Result<(ByteClasses, usize), DeserializeError> { + wire::check_slice_len(slice, 256, "byte class map")?; + let mut classes = ByteClasses::empty(); + for (b, &class) in slice[..256].iter().enumerate() { + classes.set(u8::try_from(b).unwrap(), class); + } + // We specifically don't use 'classes.iter()' here because that + // iterator depends on 'classes.alphabet_len()' being correct. But that + // is precisely the thing we're trying to verify below! + for &b in classes.0.iter() { + if usize::from(b) >= classes.alphabet_len() { + return Err(DeserializeError::generic( + "found equivalence class greater than alphabet len", + )); + } + } + Ok((classes, 256)) + } + + /// Writes this byte class map to the given byte buffer. if the given + /// buffer is too small, then an error is returned. Upon success, the total + /// number of bytes written is returned. The number of bytes written is + /// guaranteed to be a multiple of 8. + pub(crate) fn write_to( + &self, + mut dst: &mut [u8], + ) -> Result<usize, SerializeError> { + let nwrite = self.write_to_len(); + if dst.len() < nwrite { + return Err(SerializeError::buffer_too_small("byte class map")); + } + for b in 0..=255 { + dst[0] = self.get(b); + dst = &mut dst[1..]; + } + Ok(nwrite) + } + + /// Returns the total number of bytes written by `write_to`. + pub(crate) fn write_to_len(&self) -> usize { + 256 + } + + /// Set the equivalence class for the given byte. + #[inline] + pub fn set(&mut self, byte: u8, class: u8) { + self.0[usize::from(byte)] = class; + } + + /// Get the equivalence class for the given byte. + #[inline] + pub fn get(&self, byte: u8) -> u8 { + self.0[usize::from(byte)] + } + + /// Get the equivalence class for the given haystack unit and return the + /// class as a `usize`. + #[inline] + pub fn get_by_unit(&self, unit: Unit) -> usize { + match unit.0 { + UnitKind::U8(b) => usize::from(self.get(b)), + UnitKind::EOI(b) => usize::from(b), + } + } + + /// Create a unit that represents the "end of input" sentinel based on the + /// number of equivalence classes. + #[inline] + pub fn eoi(&self) -> Unit { + // The alphabet length already includes the EOI sentinel, hence why + // we subtract 1. + Unit::eoi(self.alphabet_len().checked_sub(1).unwrap()) + } + + /// Return the total number of elements in the alphabet represented by + /// these equivalence classes. Equivalently, this returns the total number + /// of equivalence classes. + #[inline] + pub fn alphabet_len(&self) -> usize { + // Add one since the number of equivalence classes is one bigger than + // the last one. But add another to account for the final EOI class + // that isn't explicitly represented. + usize::from(self.0[255]) + 1 + 1 + } + + /// Returns the stride, as a base-2 exponent, required for these + /// equivalence classes. + /// + /// The stride is always the smallest power of 2 that is greater than or + /// equal to the alphabet length, and the `stride2` returned here is the + /// exponent applied to `2` to get the smallest power. This is done so that + /// converting between premultiplied state IDs and indices can be done with + /// shifts alone, which is much faster than integer division. + #[inline] + pub fn stride2(&self) -> usize { + let zeros = self.alphabet_len().next_power_of_two().trailing_zeros(); + usize::try_from(zeros).unwrap() + } + + /// Returns true if and only if every byte in this class maps to its own + /// equivalence class. Equivalently, there are 257 equivalence classes + /// and each class contains either exactly one byte or corresponds to the + /// singleton class containing the "end of input" sentinel. + #[inline] + pub fn is_singleton(&self) -> bool { + self.alphabet_len() == 257 + } + + /// Returns an iterator over all equivalence classes in this set. + #[inline] + pub fn iter(&self) -> ByteClassIter<'_> { + ByteClassIter { classes: self, i: 0 } + } + + /// Returns an iterator over a sequence of representative bytes from each + /// equivalence class within the range of bytes given. + /// + /// When the given range is unbounded on both sides, the iterator yields + /// exactly N items, where N is equivalent to the number of equivalence + /// classes. Each item is an arbitrary byte drawn from each equivalence + /// class. + /// + /// This is useful when one is determinizing an NFA and the NFA's alphabet + /// hasn't been converted to equivalence classes. Picking an arbitrary byte + /// from each equivalence class then permits a full exploration of the NFA + /// instead of using every possible byte value and thus potentially saves + /// quite a lot of redundant work. + /// + /// # Example + /// + /// This shows an example of what a complete sequence of representatives + /// might look like from a real example. + /// + /// ``` + /// use regex_automata::{nfa::thompson::NFA, util::alphabet::Unit}; + /// + /// let nfa = NFA::new("[a-z]+")?; + /// let classes = nfa.byte_classes(); + /// let reps: Vec<Unit> = classes.representatives(..).collect(); + /// // Note that the specific byte values yielded are not guaranteed! + /// let expected = vec![ + /// Unit::u8(b'\x00'), + /// Unit::u8(b'a'), + /// Unit::u8(b'{'), + /// Unit::eoi(3), + /// ]; + /// assert_eq!(expected, reps); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// Note though, that you can ask for an arbitrary range of bytes, and only + /// representatives for that range will be returned: + /// + /// ``` + /// use regex_automata::{nfa::thompson::NFA, util::alphabet::Unit}; + /// + /// let nfa = NFA::new("[a-z]+")?; + /// let classes = nfa.byte_classes(); + /// let reps: Vec<Unit> = classes.representatives(b'A'..=b'z').collect(); + /// // Note that the specific byte values yielded are not guaranteed! + /// let expected = vec![ + /// Unit::u8(b'A'), + /// Unit::u8(b'a'), + /// ]; + /// assert_eq!(expected, reps); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn representatives<R: core::ops::RangeBounds<u8>>( + &self, + range: R, + ) -> ByteClassRepresentatives<'_> { + use core::ops::Bound; + + let cur_byte = match range.start_bound() { + Bound::Included(&i) => usize::from(i), + Bound::Excluded(&i) => usize::from(i).checked_add(1).unwrap(), + Bound::Unbounded => 0, + }; + let end_byte = match range.end_bound() { + Bound::Included(&i) => { + Some(usize::from(i).checked_add(1).unwrap()) + } + Bound::Excluded(&i) => Some(usize::from(i)), + Bound::Unbounded => None, + }; + assert_ne!( + cur_byte, + usize::MAX, + "start range must be less than usize::MAX", + ); + ByteClassRepresentatives { + classes: self, + cur_byte, + end_byte, + last_class: None, + } + } + + /// Returns an iterator of the bytes in the given equivalence class. + /// + /// This is useful when one needs to know the actual bytes that belong to + /// an equivalence class. For example, conceptually speaking, accelerating + /// a DFA state occurs when a state only has a few outgoing transitions. + /// But in reality, what is required is that there are only a small + /// number of distinct bytes that can lead to an outgoing transition. The + /// difference is that any one transition can correspond to an equivalence + /// class which may contains many bytes. Therefore, DFA state acceleration + /// considers the actual elements in each equivalence class of each + /// outgoing transition. + /// + /// # Example + /// + /// This shows an example of how to get all of the elements in an + /// equivalence class. + /// + /// ``` + /// use regex_automata::{nfa::thompson::NFA, util::alphabet::Unit}; + /// + /// let nfa = NFA::new("[a-z]+")?; + /// let classes = nfa.byte_classes(); + /// let elements: Vec<Unit> = classes.elements(Unit::u8(1)).collect(); + /// let expected: Vec<Unit> = (b'a'..=b'z').map(Unit::u8).collect(); + /// assert_eq!(expected, elements); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn elements(&self, class: Unit) -> ByteClassElements { + ByteClassElements { classes: self, class, byte: 0 } + } + + /// Returns an iterator of byte ranges in the given equivalence class. + /// + /// That is, a sequence of contiguous ranges are returned. Typically, every + /// class maps to a single contiguous range. + fn element_ranges(&self, class: Unit) -> ByteClassElementRanges { + ByteClassElementRanges { elements: self.elements(class), range: None } + } +} + +impl Default for ByteClasses { + fn default() -> ByteClasses { + ByteClasses::singletons() + } +} + +impl core::fmt::Debug for ByteClasses { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + if self.is_singleton() { + write!(f, "ByteClasses({{singletons}})") + } else { + write!(f, "ByteClasses(")?; + for (i, class) in self.iter().enumerate() { + if i > 0 { + write!(f, ", ")?; + } + write!(f, "{:?} => [", class.as_usize())?; + for (start, end) in self.element_ranges(class) { + if start == end { + write!(f, "{:?}", start)?; + } else { + write!(f, "{:?}-{:?}", start, end)?; + } + } + write!(f, "]")?; + } + write!(f, ")") + } + } +} + +/// An iterator over each equivalence class. +/// +/// The last element in this iterator always corresponds to [`Unit::eoi`]. +/// +/// This is created by the [`ByteClasses::iter`] method. +/// +/// The lifetime `'a` refers to the lifetime of the byte classes that this +/// iterator was created from. +#[derive(Debug)] +pub struct ByteClassIter<'a> { + classes: &'a ByteClasses, + i: usize, +} + +impl<'a> Iterator for ByteClassIter<'a> { + type Item = Unit; + + fn next(&mut self) -> Option<Unit> { + if self.i + 1 == self.classes.alphabet_len() { + self.i += 1; + Some(self.classes.eoi()) + } else if self.i < self.classes.alphabet_len() { + let class = u8::try_from(self.i).unwrap(); + self.i += 1; + Some(Unit::u8(class)) + } else { + None + } + } +} + +/// An iterator over representative bytes from each equivalence class. +/// +/// This is created by the [`ByteClasses::representatives`] method. +/// +/// The lifetime `'a` refers to the lifetime of the byte classes that this +/// iterator was created from. +#[derive(Debug)] +pub struct ByteClassRepresentatives<'a> { + classes: &'a ByteClasses, + cur_byte: usize, + end_byte: Option<usize>, + last_class: Option<u8>, +} + +impl<'a> Iterator for ByteClassRepresentatives<'a> { + type Item = Unit; + + fn next(&mut self) -> Option<Unit> { + while self.cur_byte < self.end_byte.unwrap_or(256) { + let byte = u8::try_from(self.cur_byte).unwrap(); + let class = self.classes.get(byte); + self.cur_byte += 1; + + if self.last_class != Some(class) { + self.last_class = Some(class); + return Some(Unit::u8(byte)); + } + } + if self.cur_byte != usize::MAX && self.end_byte.is_none() { + // Using usize::MAX as a sentinel is OK because we ban usize::MAX + // from appearing as a start bound in iterator construction. But + // why do it this way? Well, we want to return the EOI class + // whenever the end of the given range is unbounded because EOI + // isn't really a "byte" per se, so the only way it should be + // excluded is if there is a bounded end to the range. Therefore, + // when the end is unbounded, we just need to know whether we've + // reported EOI or not. When we do, we set cur_byte to a value it + // can never otherwise be. + self.cur_byte = usize::MAX; + return Some(self.classes.eoi()); + } + None + } +} + +/// An iterator over all elements in an equivalence class. +/// +/// This is created by the [`ByteClasses::elements`] method. +/// +/// The lifetime `'a` refers to the lifetime of the byte classes that this +/// iterator was created from. +#[derive(Debug)] +pub struct ByteClassElements<'a> { + classes: &'a ByteClasses, + class: Unit, + byte: usize, +} + +impl<'a> Iterator for ByteClassElements<'a> { + type Item = Unit; + + fn next(&mut self) -> Option<Unit> { + while self.byte < 256 { + let byte = u8::try_from(self.byte).unwrap(); + self.byte += 1; + if self.class.is_byte(self.classes.get(byte)) { + return Some(Unit::u8(byte)); + } + } + if self.byte < 257 { + self.byte += 1; + if self.class.is_eoi() { + return Some(Unit::eoi(256)); + } + } + None + } +} + +/// An iterator over all elements in an equivalence class expressed as a +/// sequence of contiguous ranges. +#[derive(Debug)] +struct ByteClassElementRanges<'a> { + elements: ByteClassElements<'a>, + range: Option<(Unit, Unit)>, +} + +impl<'a> Iterator for ByteClassElementRanges<'a> { + type Item = (Unit, Unit); + + fn next(&mut self) -> Option<(Unit, Unit)> { + loop { + let element = match self.elements.next() { + None => return self.range.take(), + Some(element) => element, + }; + match self.range.take() { + None => { + self.range = Some((element, element)); + } + Some((start, end)) => { + if end.as_usize() + 1 != element.as_usize() + || element.is_eoi() + { + self.range = Some((element, element)); + return Some((start, end)); + } + self.range = Some((start, element)); + } + } + } + } +} + +/// A partitioning of bytes into equivalence classes. +/// +/// A byte class set keeps track of an *approximation* of equivalence classes +/// of bytes during NFA construction. That is, every byte in an equivalence +/// class cannot discriminate between a match and a non-match. +/// +/// For example, in the regex `[ab]+`, the bytes `a` and `b` would be in the +/// same equivalence class because it never matters whether an `a` or a `b` is +/// seen, and no combination of `a`s and `b`s in the text can discriminate a +/// match. +/// +/// Note though that this does not compute the minimal set of equivalence +/// classes. For example, in the regex `[ac]+`, both `a` and `c` are in the +/// same equivalence class for the same reason that `a` and `b` are in the +/// same equivalence class in the aforementioned regex. However, in this +/// implementation, `a` and `c` are put into distinct equivalence classes. The +/// reason for this is implementation complexity. In the future, we should +/// endeavor to compute the minimal equivalence classes since they can have a +/// rather large impact on the size of the DFA. (Doing this will likely require +/// rethinking how equivalence classes are computed, including changing the +/// representation here, which is only able to group contiguous bytes into the +/// same equivalence class.) +#[cfg(feature = "alloc")] +#[derive(Clone, Debug)] +pub(crate) struct ByteClassSet(ByteSet); + +#[cfg(feature = "alloc")] +impl Default for ByteClassSet { + fn default() -> ByteClassSet { + ByteClassSet::empty() + } +} + +#[cfg(feature = "alloc")] +impl ByteClassSet { + /// Create a new set of byte classes where all bytes are part of the same + /// equivalence class. + pub(crate) fn empty() -> Self { + ByteClassSet(ByteSet::empty()) + } + + /// Indicate the the range of byte given (inclusive) can discriminate a + /// match between it and all other bytes outside of the range. + pub(crate) fn set_range(&mut self, start: u8, end: u8) { + debug_assert!(start <= end); + if start > 0 { + self.0.add(start - 1); + } + self.0.add(end); + } + + /// Add the contiguous ranges in the set given to this byte class set. + pub(crate) fn add_set(&mut self, set: &ByteSet) { + for (start, end) in set.iter_ranges() { + self.set_range(start, end); + } + } + + /// Convert this boolean set to a map that maps all byte values to their + /// corresponding equivalence class. The last mapping indicates the largest + /// equivalence class identifier (which is never bigger than 255). + pub(crate) fn byte_classes(&self) -> ByteClasses { + let mut classes = ByteClasses::empty(); + let mut class = 0u8; + let mut b = 0u8; + loop { + classes.set(b, class); + if b == 255 { + break; + } + if self.0.contains(b) { + class = class.checked_add(1).unwrap(); + } + b = b.checked_add(1).unwrap(); + } + classes + } +} + +/// A simple set of bytes that is reasonably cheap to copy and allocation free. +#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)] +pub(crate) struct ByteSet { + bits: BitSet, +} + +/// The representation of a byte set. Split out so that we can define a +/// convenient Debug impl for it while keeping "ByteSet" in the output. +#[derive(Clone, Copy, Default, Eq, PartialEq)] +struct BitSet([u128; 2]); + +impl ByteSet { + /// Create an empty set of bytes. + pub(crate) fn empty() -> ByteSet { + ByteSet { bits: BitSet([0; 2]) } + } + + /// Add a byte to this set. + /// + /// If the given byte already belongs to this set, then this is a no-op. + pub(crate) fn add(&mut self, byte: u8) { + let bucket = byte / 128; + let bit = byte % 128; + self.bits.0[usize::from(bucket)] |= 1 << bit; + } + + /// Remove a byte from this set. + /// + /// If the given byte is not in this set, then this is a no-op. + pub(crate) fn remove(&mut self, byte: u8) { + let bucket = byte / 128; + let bit = byte % 128; + self.bits.0[usize::from(bucket)] &= !(1 << bit); + } + + /// Return true if and only if the given byte is in this set. + pub(crate) fn contains(&self, byte: u8) -> bool { + let bucket = byte / 128; + let bit = byte % 128; + self.bits.0[usize::from(bucket)] & (1 << bit) > 0 + } + + /// Return true if and only if the given inclusive range of bytes is in + /// this set. + pub(crate) fn contains_range(&self, start: u8, end: u8) -> bool { + (start..=end).all(|b| self.contains(b)) + } + + /// Returns an iterator over all bytes in this set. + pub(crate) fn iter(&self) -> ByteSetIter { + ByteSetIter { set: self, b: 0 } + } + + /// Returns an iterator over all contiguous ranges of bytes in this set. + pub(crate) fn iter_ranges(&self) -> ByteSetRangeIter { + ByteSetRangeIter { set: self, b: 0 } + } + + /// Return true if and only if this set is empty. + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn is_empty(&self) -> bool { + self.bits.0 == [0, 0] + } + + /// Deserializes a byte set from the given slice. If the slice is of + /// incorrect length or is otherwise malformed, then an error is returned. + /// Upon success, the number of bytes read along with the set are returned. + /// The number of bytes read is always a multiple of 8. + pub(crate) fn from_bytes( + slice: &[u8], + ) -> Result<(ByteSet, usize), DeserializeError> { + use core::mem::size_of; + + wire::check_slice_len(slice, 2 * size_of::<u128>(), "byte set")?; + let mut nread = 0; + let (low, nr) = wire::try_read_u128(slice, "byte set low bucket")?; + nread += nr; + let (high, nr) = wire::try_read_u128(slice, "byte set high bucket")?; + nread += nr; + Ok((ByteSet { bits: BitSet([low, high]) }, nread)) + } + + /// Writes this byte set to the given byte buffer. If the given buffer is + /// too small, then an error is returned. Upon success, the total number of + /// bytes written is returned. The number of bytes written is guaranteed to + /// be a multiple of 8. + pub(crate) fn write_to<E: crate::util::wire::Endian>( + &self, + dst: &mut [u8], + ) -> Result<usize, SerializeError> { + use core::mem::size_of; + + let nwrite = self.write_to_len(); + if dst.len() < nwrite { + return Err(SerializeError::buffer_too_small("byte set")); + } + let mut nw = 0; + E::write_u128(self.bits.0[0], &mut dst[nw..]); + nw += size_of::<u128>(); + E::write_u128(self.bits.0[1], &mut dst[nw..]); + nw += size_of::<u128>(); + assert_eq!(nwrite, nw, "expected to write certain number of bytes",); + assert_eq!( + nw % 8, + 0, + "expected to write multiple of 8 bytes for byte set", + ); + Ok(nw) + } + + /// Returns the total number of bytes written by `write_to`. + pub(crate) fn write_to_len(&self) -> usize { + 2 * core::mem::size_of::<u128>() + } +} + +impl core::fmt::Debug for BitSet { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + let mut fmtd = f.debug_set(); + for b in 0u8..=255 { + if (ByteSet { bits: *self }).contains(b) { + fmtd.entry(&b); + } + } + fmtd.finish() + } +} + +#[derive(Debug)] +pub(crate) struct ByteSetIter<'a> { + set: &'a ByteSet, + b: usize, +} + +impl<'a> Iterator for ByteSetIter<'a> { + type Item = u8; + + fn next(&mut self) -> Option<u8> { + while self.b <= 255 { + let b = u8::try_from(self.b).unwrap(); + self.b += 1; + if self.set.contains(b) { + return Some(b); + } + } + None + } +} + +#[derive(Debug)] +pub(crate) struct ByteSetRangeIter<'a> { + set: &'a ByteSet, + b: usize, +} + +impl<'a> Iterator for ByteSetRangeIter<'a> { + type Item = (u8, u8); + + fn next(&mut self) -> Option<(u8, u8)> { + let asu8 = |n: usize| u8::try_from(n).unwrap(); + while self.b <= 255 { + let start = asu8(self.b); + self.b += 1; + if !self.set.contains(start) { + continue; + } + + let mut end = start; + while self.b <= 255 && self.set.contains(asu8(self.b)) { + end = asu8(self.b); + self.b += 1; + } + return Some((start, end)); + } + None + } +} + +#[cfg(all(test, feature = "alloc"))] +mod tests { + use alloc::{vec, vec::Vec}; + + use super::*; + + #[test] + fn byte_classes() { + let mut set = ByteClassSet::empty(); + set.set_range(b'a', b'z'); + + let classes = set.byte_classes(); + assert_eq!(classes.get(0), 0); + assert_eq!(classes.get(1), 0); + assert_eq!(classes.get(2), 0); + assert_eq!(classes.get(b'a' - 1), 0); + assert_eq!(classes.get(b'a'), 1); + assert_eq!(classes.get(b'm'), 1); + assert_eq!(classes.get(b'z'), 1); + assert_eq!(classes.get(b'z' + 1), 2); + assert_eq!(classes.get(254), 2); + assert_eq!(classes.get(255), 2); + + let mut set = ByteClassSet::empty(); + set.set_range(0, 2); + set.set_range(4, 6); + let classes = set.byte_classes(); + assert_eq!(classes.get(0), 0); + assert_eq!(classes.get(1), 0); + assert_eq!(classes.get(2), 0); + assert_eq!(classes.get(3), 1); + assert_eq!(classes.get(4), 2); + assert_eq!(classes.get(5), 2); + assert_eq!(classes.get(6), 2); + assert_eq!(classes.get(7), 3); + assert_eq!(classes.get(255), 3); + } + + #[test] + fn full_byte_classes() { + let mut set = ByteClassSet::empty(); + for b in 0u8..=255 { + set.set_range(b, b); + } + assert_eq!(set.byte_classes().alphabet_len(), 257); + } + + #[test] + fn elements_typical() { + let mut set = ByteClassSet::empty(); + set.set_range(b'b', b'd'); + set.set_range(b'g', b'm'); + set.set_range(b'z', b'z'); + let classes = set.byte_classes(); + // class 0: \x00-a + // class 1: b-d + // class 2: e-f + // class 3: g-m + // class 4: n-y + // class 5: z-z + // class 6: \x7B-\xFF + // class 7: EOI + assert_eq!(classes.alphabet_len(), 8); + + let elements = classes.elements(Unit::u8(0)).collect::<Vec<_>>(); + assert_eq!(elements.len(), 98); + assert_eq!(elements[0], Unit::u8(b'\x00')); + assert_eq!(elements[97], Unit::u8(b'a')); + + let elements = classes.elements(Unit::u8(1)).collect::<Vec<_>>(); + assert_eq!( + elements, + vec![Unit::u8(b'b'), Unit::u8(b'c'), Unit::u8(b'd')], + ); + + let elements = classes.elements(Unit::u8(2)).collect::<Vec<_>>(); + assert_eq!(elements, vec![Unit::u8(b'e'), Unit::u8(b'f')],); + + let elements = classes.elements(Unit::u8(3)).collect::<Vec<_>>(); + assert_eq!( + elements, + vec![ + Unit::u8(b'g'), + Unit::u8(b'h'), + Unit::u8(b'i'), + Unit::u8(b'j'), + Unit::u8(b'k'), + Unit::u8(b'l'), + Unit::u8(b'm'), + ], + ); + + let elements = classes.elements(Unit::u8(4)).collect::<Vec<_>>(); + assert_eq!(elements.len(), 12); + assert_eq!(elements[0], Unit::u8(b'n')); + assert_eq!(elements[11], Unit::u8(b'y')); + + let elements = classes.elements(Unit::u8(5)).collect::<Vec<_>>(); + assert_eq!(elements, vec![Unit::u8(b'z')]); + + let elements = classes.elements(Unit::u8(6)).collect::<Vec<_>>(); + assert_eq!(elements.len(), 133); + assert_eq!(elements[0], Unit::u8(b'\x7B')); + assert_eq!(elements[132], Unit::u8(b'\xFF')); + + let elements = classes.elements(Unit::eoi(7)).collect::<Vec<_>>(); + assert_eq!(elements, vec![Unit::eoi(256)]); + } + + #[test] + fn elements_singletons() { + let classes = ByteClasses::singletons(); + assert_eq!(classes.alphabet_len(), 257); + + let elements = classes.elements(Unit::u8(b'a')).collect::<Vec<_>>(); + assert_eq!(elements, vec![Unit::u8(b'a')]); + + let elements = classes.elements(Unit::eoi(5)).collect::<Vec<_>>(); + assert_eq!(elements, vec![Unit::eoi(256)]); + } + + #[test] + fn elements_empty() { + let classes = ByteClasses::empty(); + assert_eq!(classes.alphabet_len(), 2); + + let elements = classes.elements(Unit::u8(0)).collect::<Vec<_>>(); + assert_eq!(elements.len(), 256); + assert_eq!(elements[0], Unit::u8(b'\x00')); + assert_eq!(elements[255], Unit::u8(b'\xFF')); + + let elements = classes.elements(Unit::eoi(1)).collect::<Vec<_>>(); + assert_eq!(elements, vec![Unit::eoi(256)]); + } + + #[test] + fn representatives() { + let mut set = ByteClassSet::empty(); + set.set_range(b'b', b'd'); + set.set_range(b'g', b'm'); + set.set_range(b'z', b'z'); + let classes = set.byte_classes(); + + let got: Vec<Unit> = classes.representatives(..).collect(); + let expected = vec![ + Unit::u8(b'\x00'), + Unit::u8(b'b'), + Unit::u8(b'e'), + Unit::u8(b'g'), + Unit::u8(b'n'), + Unit::u8(b'z'), + Unit::u8(b'\x7B'), + Unit::eoi(7), + ]; + assert_eq!(expected, got); + + let got: Vec<Unit> = classes.representatives(..0).collect(); + assert!(got.is_empty()); + let got: Vec<Unit> = classes.representatives(1..1).collect(); + assert!(got.is_empty()); + let got: Vec<Unit> = classes.representatives(255..255).collect(); + assert!(got.is_empty()); + + // A weird case that is the only guaranteed to way to get an iterator + // of just the EOI class by excluding all possible byte values. + let got: Vec<Unit> = classes + .representatives(( + core::ops::Bound::Excluded(255), + core::ops::Bound::Unbounded, + )) + .collect(); + let expected = vec![Unit::eoi(7)]; + assert_eq!(expected, got); + + let got: Vec<Unit> = classes.representatives(..=255).collect(); + let expected = vec![ + Unit::u8(b'\x00'), + Unit::u8(b'b'), + Unit::u8(b'e'), + Unit::u8(b'g'), + Unit::u8(b'n'), + Unit::u8(b'z'), + Unit::u8(b'\x7B'), + ]; + assert_eq!(expected, got); + + let got: Vec<Unit> = classes.representatives(b'b'..=b'd').collect(); + let expected = vec![Unit::u8(b'b')]; + assert_eq!(expected, got); + + let got: Vec<Unit> = classes.representatives(b'a'..=b'd').collect(); + let expected = vec![Unit::u8(b'a'), Unit::u8(b'b')]; + assert_eq!(expected, got); + + let got: Vec<Unit> = classes.representatives(b'b'..=b'e').collect(); + let expected = vec![Unit::u8(b'b'), Unit::u8(b'e')]; + assert_eq!(expected, got); + + let got: Vec<Unit> = classes.representatives(b'A'..=b'Z').collect(); + let expected = vec![Unit::u8(b'A')]; + assert_eq!(expected, got); + + let got: Vec<Unit> = classes.representatives(b'A'..=b'z').collect(); + let expected = vec![ + Unit::u8(b'A'), + Unit::u8(b'b'), + Unit::u8(b'e'), + Unit::u8(b'g'), + Unit::u8(b'n'), + Unit::u8(b'z'), + ]; + assert_eq!(expected, got); + + let got: Vec<Unit> = classes.representatives(b'z'..).collect(); + let expected = vec![Unit::u8(b'z'), Unit::u8(b'\x7B'), Unit::eoi(7)]; + assert_eq!(expected, got); + + let got: Vec<Unit> = classes.representatives(b'z'..=0xFF).collect(); + let expected = vec![Unit::u8(b'z'), Unit::u8(b'\x7B')]; + assert_eq!(expected, got); + } +} diff --git a/third_party/rust/regex-automata/src/util/captures.rs b/third_party/rust/regex-automata/src/util/captures.rs new file mode 100644 index 0000000000..cd3a5f8f7b --- /dev/null +++ b/third_party/rust/regex-automata/src/util/captures.rs @@ -0,0 +1,2547 @@ +/*! +Provides types for dealing with capturing groups. + +Capturing groups refer to sub-patterns of regexes that some regex engines can +report matching offsets for. For example, matching `[a-z]([0-9]+)` against +`a789` would give `a789` as the overall match (for the implicit capturing group +at index `0`) and `789` as the match for the capturing group `([0-9]+)` (an +explicit capturing group at index `1`). + +Not all regex engines can report match offsets for capturing groups. Indeed, +to a first approximation, regex engines that can report capturing group offsets +tend to be quite a bit slower than regex engines that can't. This is because +tracking capturing groups at search time usually requires more "power" that +in turn adds overhead. + +Other regex implementations might call capturing groups "submatches." + +# Overview + +The main types in this module are: + +* [`Captures`] records the capturing group offsets found during a search. It +provides convenience routines for looking up capturing group offsets by either +index or name. +* [`GroupInfo`] records the mapping between capturing groups and "slots," +where the latter are how capturing groups are recorded during a regex search. +This also keeps a mapping from capturing group name to index, and capture +group index to name. A `GroupInfo` is used by `Captures` internally to +provide a convenient API. It is unlikely that you'll use a `GroupInfo` +directly, but for example, if you've compiled an Thompson NFA, then you can use +[`thompson::NFA::group_info`](crate::nfa::thompson::NFA::group_info) to get its +underlying `GroupInfo`. +*/ + +use alloc::{string::String, sync::Arc, vec, vec::Vec}; + +use crate::util::{ + interpolate, + primitives::{ + NonMaxUsize, PatternID, PatternIDError, PatternIDIter, SmallIndex, + }, + search::{Match, Span}, +}; + +/// The span offsets of capturing groups after a match has been found. +/// +/// This type represents the output of regex engines that can report the +/// offsets at which capturing groups matches or "submatches" occur. For +/// example, the [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM). When a match +/// occurs, it will at minimum contain the [`PatternID`] of the pattern that +/// matched. Depending upon how it was constructed, it may also contain the +/// start/end offsets of the entire match of the pattern and the start/end +/// offsets of each capturing group that participated in the match. +/// +/// Values of this type are always created for a specific [`GroupInfo`]. It is +/// unspecified behavior to use a `Captures` value in a search with any regex +/// engine that has a different `GroupInfo` than the one the `Captures` were +/// created with. +/// +/// # Constructors +/// +/// There are three constructors for this type that control what kind of +/// information is available upon a match: +/// +/// * [`Captures::all`]: Will store overall pattern match offsets in addition +/// to the offsets of capturing groups that participated in the match. +/// * [`Captures::matches`]: Will store only the overall pattern +/// match offsets. The offsets of capturing groups (even ones that participated +/// in the match) are not available. +/// * [`Captures::empty`]: Will only store the pattern ID that matched. No +/// match offsets are available at all. +/// +/// If you aren't sure which to choose, then pick the first one. The first one +/// is what convenience routines like, +/// [`PikeVM::create_captures`](crate::nfa::thompson::pikevm::PikeVM::create_captures), +/// will use automatically. +/// +/// The main difference between these choices is performance. Namely, if you +/// ask for _less_ information, then the execution of regex search may be able +/// to run more quickly. +/// +/// # Notes +/// +/// It is worth pointing out that this type is not coupled to any one specific +/// regex engine. Instead, its coupling is with [`GroupInfo`], which is the +/// thing that is responsible for mapping capturing groups to "slot" offsets. +/// Slot offsets are indices into a single sequence of memory at which matching +/// haystack offsets for the corresponding group are written by regex engines. +/// +/// # Example +/// +/// This example shows how to parse a simple date and extract the components of +/// the date via capturing groups: +/// +/// ``` +/// use regex_automata::{nfa::thompson::pikevm::PikeVM, Span}; +/// +/// let re = PikeVM::new(r"^([0-9]{4})-([0-9]{2})-([0-9]{2})$")?; +/// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); +/// +/// re.captures(&mut cache, "2010-03-14", &mut caps); +/// assert!(caps.is_match()); +/// assert_eq!(Some(Span::from(0..4)), caps.get_group(1)); +/// assert_eq!(Some(Span::from(5..7)), caps.get_group(2)); +/// assert_eq!(Some(Span::from(8..10)), caps.get_group(3)); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +/// +/// # Example: named capturing groups +/// +/// This example is like the one above, but leverages the ability to name +/// capturing groups in order to make the code a bit clearer: +/// +/// ``` +/// use regex_automata::{nfa::thompson::pikevm::PikeVM, Span}; +/// +/// let re = PikeVM::new(r"^(?P<y>[0-9]{4})-(?P<m>[0-9]{2})-(?P<d>[0-9]{2})$")?; +/// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); +/// +/// re.captures(&mut cache, "2010-03-14", &mut caps); +/// assert!(caps.is_match()); +/// assert_eq!(Some(Span::from(0..4)), caps.get_group_by_name("y")); +/// assert_eq!(Some(Span::from(5..7)), caps.get_group_by_name("m")); +/// assert_eq!(Some(Span::from(8..10)), caps.get_group_by_name("d")); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +#[derive(Clone)] +pub struct Captures { + /// The group info that these capture groups are coupled to. This is what + /// gives the "convenience" of the `Captures` API. Namely, it provides the + /// slot mapping and the name|-->index mapping for capture lookups by name. + group_info: GroupInfo, + /// The ID of the pattern that matched. Regex engines must set this to + /// None when no match occurs. + pid: Option<PatternID>, + /// The slot values, i.e., submatch offsets. + /// + /// In theory, the smallest sequence of slots would be something like + /// `max(groups(pattern) for pattern in regex) * 2`, but instead, we use + /// `sum(groups(pattern) for pattern in regex) * 2`. Why? + /// + /// Well, the former could be used in theory, because we don't generally + /// have any overlapping APIs that involve capturing groups. Therefore, + /// there's technically never any need to have slots set for multiple + /// patterns. However, this might change some day, in which case, we would + /// need to have slots available. + /// + /// The other reason is that during the execution of some regex engines, + /// there exists a point in time where multiple slots for different + /// patterns may be written to before knowing which pattern has matched. + /// Therefore, the regex engines themselves, in order to support multiple + /// patterns correctly, must have all slots available. If `Captures` + /// doesn't have all slots available, then regex engines can't write + /// directly into the caller provided `Captures` and must instead write + /// into some other storage and then copy the slots involved in the match + /// at the end of the search. + /// + /// So overall, at least as of the time of writing, it seems like the path + /// of least resistance is to just require allocating all possible slots + /// instead of the conceptual minimum. Another way to justify this is that + /// the most common case is a single pattern, in which case, there is no + /// inefficiency here since the 'max' and 'sum' calculations above are + /// equivalent in that case. + /// + /// N.B. The mapping from group index to slot is maintained by `GroupInfo` + /// and is considered an API guarantee. See `GroupInfo` for more details on + /// that mapping. + /// + /// N.B. `Option<NonMaxUsize>` has the same size as a `usize`. + slots: Vec<Option<NonMaxUsize>>, +} + +impl Captures { + /// Create new storage for the offsets of all matching capturing groups. + /// + /// This routine provides the most information for matches---namely, the + /// spans of matching capturing groups---but also requires the regex search + /// routines to do the most work. + /// + /// It is unspecified behavior to use the returned `Captures` value in a + /// search with a `GroupInfo` other than the one that is provided to this + /// constructor. + /// + /// # Example + /// + /// This example shows that all capturing groups---but only ones that + /// participated in a match---are available to query after a match has + /// been found: + /// + /// ``` + /// use regex_automata::{ + /// nfa::thompson::pikevm::PikeVM, + /// util::captures::Captures, + /// Span, Match, + /// }; + /// + /// let re = PikeVM::new( + /// r"^(?:(?P<lower>[a-z]+)|(?P<upper>[A-Z]+))(?P<digits>[0-9]+)$", + /// )?; + /// let mut cache = re.create_cache(); + /// let mut caps = Captures::all(re.get_nfa().group_info().clone()); + /// + /// re.captures(&mut cache, "ABC123", &mut caps); + /// assert!(caps.is_match()); + /// assert_eq!(Some(Match::must(0, 0..6)), caps.get_match()); + /// // The 'lower' group didn't match, so it won't have any offsets. + /// assert_eq!(None, caps.get_group_by_name("lower")); + /// assert_eq!(Some(Span::from(0..3)), caps.get_group_by_name("upper")); + /// assert_eq!(Some(Span::from(3..6)), caps.get_group_by_name("digits")); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn all(group_info: GroupInfo) -> Captures { + let slots = group_info.slot_len(); + Captures { group_info, pid: None, slots: vec![None; slots] } + } + + /// Create new storage for only the full match spans of a pattern. This + /// does not include any capturing group offsets. + /// + /// It is unspecified behavior to use the returned `Captures` value in a + /// search with a `GroupInfo` other than the one that is provided to this + /// constructor. + /// + /// # Example + /// + /// This example shows that only overall match offsets are reported when + /// this constructor is used. Accessing any capturing groups other than + /// the 0th will always return `None`. + /// + /// ``` + /// use regex_automata::{ + /// nfa::thompson::pikevm::PikeVM, + /// util::captures::Captures, + /// Match, + /// }; + /// + /// let re = PikeVM::new( + /// r"^(?:(?P<lower>[a-z]+)|(?P<upper>[A-Z]+))(?P<digits>[0-9]+)$", + /// )?; + /// let mut cache = re.create_cache(); + /// let mut caps = Captures::matches(re.get_nfa().group_info().clone()); + /// + /// re.captures(&mut cache, "ABC123", &mut caps); + /// assert!(caps.is_match()); + /// assert_eq!(Some(Match::must(0, 0..6)), caps.get_match()); + /// // We didn't ask for capturing group offsets, so they aren't available. + /// assert_eq!(None, caps.get_group_by_name("lower")); + /// assert_eq!(None, caps.get_group_by_name("upper")); + /// assert_eq!(None, caps.get_group_by_name("digits")); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn matches(group_info: GroupInfo) -> Captures { + // This is OK because we know there are at least this many slots, + // and GroupInfo construction guarantees that the number of slots fits + // into a usize. + let slots = group_info.pattern_len().checked_mul(2).unwrap(); + Captures { group_info, pid: None, slots: vec![None; slots] } + } + + /// Create new storage for only tracking which pattern matched. No offsets + /// are stored at all. + /// + /// It is unspecified behavior to use the returned `Captures` value in a + /// search with a `GroupInfo` other than the one that is provided to this + /// constructor. + /// + /// # Example + /// + /// This example shows that only the pattern that matched can be accessed + /// from a `Captures` value created via this constructor. + /// + /// ``` + /// use regex_automata::{ + /// nfa::thompson::pikevm::PikeVM, + /// util::captures::Captures, + /// PatternID, + /// }; + /// + /// let re = PikeVM::new_many(&[r"[a-z]+", r"[A-Z]+"])?; + /// let mut cache = re.create_cache(); + /// let mut caps = Captures::empty(re.get_nfa().group_info().clone()); + /// + /// re.captures(&mut cache, "aABCz", &mut caps); + /// assert!(caps.is_match()); + /// assert_eq!(Some(PatternID::must(0)), caps.pattern()); + /// // We didn't ask for any offsets, so they aren't available. + /// assert_eq!(None, caps.get_match()); + /// + /// re.captures(&mut cache, &"aABCz"[1..], &mut caps); + /// assert!(caps.is_match()); + /// assert_eq!(Some(PatternID::must(1)), caps.pattern()); + /// // We didn't ask for any offsets, so they aren't available. + /// assert_eq!(None, caps.get_match()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn empty(group_info: GroupInfo) -> Captures { + Captures { group_info, pid: None, slots: vec![] } + } + + /// Returns true if and only if this capturing group represents a match. + /// + /// This is a convenience routine for `caps.pattern().is_some()`. + /// + /// # Example + /// + /// When using the PikeVM (for example), the lightest weight way of + /// detecting whether a match exists is to create capturing groups that + /// only track the ID of the pattern that match (if any): + /// + /// ``` + /// use regex_automata::{ + /// nfa::thompson::pikevm::PikeVM, + /// util::captures::Captures, + /// }; + /// + /// let re = PikeVM::new(r"[a-z]+")?; + /// let mut cache = re.create_cache(); + /// let mut caps = Captures::empty(re.get_nfa().group_info().clone()); + /// + /// re.captures(&mut cache, "aABCz", &mut caps); + /// assert!(caps.is_match()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn is_match(&self) -> bool { + self.pid.is_some() + } + + /// Returns the identifier of the pattern that matched when this + /// capturing group represents a match. If no match was found, then this + /// always returns `None`. + /// + /// This returns a pattern ID in precisely the cases in which `is_match` + /// returns `true`. Similarly, the pattern ID returned is always the + /// same pattern ID found in the `Match` returned by `get_match`. + /// + /// # Example + /// + /// When using the PikeVM (for example), the lightest weight way of + /// detecting which pattern matched is to create capturing groups that only + /// track the ID of the pattern that match (if any): + /// + /// ``` + /// use regex_automata::{ + /// nfa::thompson::pikevm::PikeVM, + /// util::captures::Captures, + /// PatternID, + /// }; + /// + /// let re = PikeVM::new_many(&[r"[a-z]+", r"[A-Z]+"])?; + /// let mut cache = re.create_cache(); + /// let mut caps = Captures::empty(re.get_nfa().group_info().clone()); + /// + /// re.captures(&mut cache, "ABC", &mut caps); + /// assert_eq!(Some(PatternID::must(1)), caps.pattern()); + /// // Recall that offsets are only available when using a non-empty + /// // Captures value. So even though a match occurred, this returns None! + /// assert_eq!(None, caps.get_match()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn pattern(&self) -> Option<PatternID> { + self.pid + } + + /// Returns the pattern ID and the span of the match, if one occurred. + /// + /// This always returns `None` when `Captures` was created with + /// [`Captures::empty`], even if a match was found. + /// + /// If this routine returns a non-`None` value, then `is_match` is + /// guaranteed to return `true` and `pattern` is also guaranteed to return + /// a non-`None` value. + /// + /// # Example + /// + /// This example shows how to get the full match from a search: + /// + /// ``` + /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; + /// + /// let re = PikeVM::new_many(&[r"[a-z]+", r"[A-Z]+"])?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// + /// re.captures(&mut cache, "ABC", &mut caps); + /// assert_eq!(Some(Match::must(1, 0..3)), caps.get_match()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn get_match(&self) -> Option<Match> { + Some(Match::new(self.pattern()?, self.get_group(0)?)) + } + + /// Returns the span of a capturing group match corresponding to the group + /// index given, only if both the overall pattern matched and the capturing + /// group participated in that match. + /// + /// This returns `None` if `index` is invalid. `index` is valid if and only + /// if it's less than [`Captures::group_len`] for the matching pattern. + /// + /// This always returns `None` when `Captures` was created with + /// [`Captures::empty`], even if a match was found. This also always + /// returns `None` for any `index > 0` when `Captures` was created with + /// [`Captures::matches`]. + /// + /// If this routine returns a non-`None` value, then `is_match` is + /// guaranteed to return `true`, `pattern` is guaranteed to return a + /// non-`None` value and `get_match` is guaranteed to return a non-`None` + /// value. + /// + /// By convention, the 0th capture group will always return the same + /// span as the span returned by `get_match`. This is because the 0th + /// capture group always corresponds to the entirety of the pattern's + /// match. (It is similarly always unnamed because it is implicit.) This + /// isn't necessarily true of all regex engines. For example, one can + /// hand-compile a [`thompson::NFA`](crate::nfa::thompson::NFA) via a + /// [`thompson::Builder`](crate::nfa::thompson::Builder), which isn't + /// technically forced to make the 0th capturing group always correspond to + /// the entire match. + /// + /// # Example + /// + /// This example shows how to get the capturing groups, by index, from a + /// match: + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// # if !cfg!(target_pointer_width = "64") { return Ok(()); } // see #1039 + /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Span, Match}; + /// + /// let re = PikeVM::new(r"^(?P<first>\pL+)\s+(?P<last>\pL+)$")?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// + /// re.captures(&mut cache, "Bruce Springsteen", &mut caps); + /// assert_eq!(Some(Match::must(0, 0..17)), caps.get_match()); + /// assert_eq!(Some(Span::from(0..5)), caps.get_group(1)); + /// assert_eq!(Some(Span::from(6..17)), caps.get_group(2)); + /// // Looking for a non-existent capturing group will return None: + /// assert_eq!(None, caps.get_group(3)); + /// assert_eq!(None, caps.get_group(9944060567225171988)); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn get_group(&self, index: usize) -> Option<Span> { + let pid = self.pattern()?; + // There's a little bit of work needed to map captures to slots in the + // fully general case. But in the overwhelming common case of a single + // pattern, we can just do some simple arithmetic. + let (slot_start, slot_end) = if self.group_info().pattern_len() == 1 { + (index.checked_mul(2)?, index.checked_mul(2)?.checked_add(1)?) + } else { + self.group_info().slots(pid, index)? + }; + let start = self.slots.get(slot_start).copied()??; + let end = self.slots.get(slot_end).copied()??; + Some(Span { start: start.get(), end: end.get() }) + } + + /// Returns the span of a capturing group match corresponding to the group + /// name given, only if both the overall pattern matched and the capturing + /// group participated in that match. + /// + /// This returns `None` if `name` does not correspond to a valid capturing + /// group for the pattern that matched. + /// + /// This always returns `None` when `Captures` was created with + /// [`Captures::empty`], even if a match was found. This also always + /// returns `None` for any `index > 0` when `Captures` was created with + /// [`Captures::matches`]. + /// + /// If this routine returns a non-`None` value, then `is_match` is + /// guaranteed to return `true`, `pattern` is guaranteed to return a + /// non-`None` value and `get_match` is guaranteed to return a non-`None` + /// value. + /// + /// # Example + /// + /// This example shows how to get the capturing groups, by name, from a + /// match: + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Span, Match}; + /// + /// let re = PikeVM::new(r"^(?P<first>\pL+)\s+(?P<last>\pL+)$")?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// + /// re.captures(&mut cache, "Bruce Springsteen", &mut caps); + /// assert_eq!(Some(Match::must(0, 0..17)), caps.get_match()); + /// assert_eq!(Some(Span::from(0..5)), caps.get_group_by_name("first")); + /// assert_eq!(Some(Span::from(6..17)), caps.get_group_by_name("last")); + /// // Looking for a non-existent capturing group will return None: + /// assert_eq!(None, caps.get_group_by_name("middle")); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn get_group_by_name(&self, name: &str) -> Option<Span> { + let index = self.group_info().to_index(self.pattern()?, name)?; + self.get_group(index) + } + + /// Returns an iterator of possible spans for every capturing group in the + /// matching pattern. + /// + /// If this `Captures` value does not correspond to a match, then the + /// iterator returned yields no elements. + /// + /// Note that the iterator returned yields elements of type `Option<Span>`. + /// A span is present if and only if it corresponds to a capturing group + /// that participated in a match. + /// + /// # Example + /// + /// This example shows how to collect all capturing groups: + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Span}; + /// + /// let re = PikeVM::new( + /// // Matches first/last names, with an optional middle name. + /// r"^(?P<first>\pL+)\s+(?:(?P<middle>\pL+)\s+)?(?P<last>\pL+)$", + /// )?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// + /// re.captures(&mut cache, "Harry James Potter", &mut caps); + /// assert!(caps.is_match()); + /// let groups: Vec<Option<Span>> = caps.iter().collect(); + /// assert_eq!(groups, vec![ + /// Some(Span::from(0..18)), + /// Some(Span::from(0..5)), + /// Some(Span::from(6..11)), + /// Some(Span::from(12..18)), + /// ]); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// This example uses the same regex as the previous example, but with a + /// haystack that omits the middle name. This results in a capturing group + /// that is present in the elements yielded by the iterator but without a + /// match: + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Span}; + /// + /// let re = PikeVM::new( + /// // Matches first/last names, with an optional middle name. + /// r"^(?P<first>\pL+)\s+(?:(?P<middle>\pL+)\s+)?(?P<last>\pL+)$", + /// )?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// + /// re.captures(&mut cache, "Harry Potter", &mut caps); + /// assert!(caps.is_match()); + /// let groups: Vec<Option<Span>> = caps.iter().collect(); + /// assert_eq!(groups, vec![ + /// Some(Span::from(0..12)), + /// Some(Span::from(0..5)), + /// None, + /// Some(Span::from(6..12)), + /// ]); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn iter(&self) -> CapturesPatternIter<'_> { + let names = self + .pattern() + .map_or(GroupInfoPatternNames::empty().enumerate(), |pid| { + self.group_info().pattern_names(pid).enumerate() + }); + CapturesPatternIter { caps: self, names } + } + + /// Return the total number of capturing groups for the matching pattern. + /// + /// If this `Captures` value does not correspond to a match, then this + /// always returns `0`. + /// + /// This always returns the same number of elements yielded by + /// [`Captures::iter`]. That is, the number includes capturing groups even + /// if they don't participate in the match. + /// + /// # Example + /// + /// This example shows how to count the total number of capturing groups + /// associated with a pattern. Notice that it includes groups that did not + /// participate in a match (just like `Captures::iter` does). + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::nfa::thompson::pikevm::PikeVM; + /// + /// let re = PikeVM::new( + /// // Matches first/last names, with an optional middle name. + /// r"^(?P<first>\pL+)\s+(?:(?P<middle>\pL+)\s+)?(?P<last>\pL+)$", + /// )?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// + /// re.captures(&mut cache, "Harry Potter", &mut caps); + /// assert_eq!(4, caps.group_len()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn group_len(&self) -> usize { + let pid = match self.pattern() { + None => return 0, + Some(pid) => pid, + }; + self.group_info().group_len(pid) + } + + /// Returns a reference to the underlying group info on which these + /// captures are based. + /// + /// The difference between `GroupInfo` and `Captures` is that the former + /// defines the structure of capturing groups where as the latter is what + /// stores the actual match information. So where as `Captures` only gives + /// you access to the current match, `GroupInfo` lets you query any + /// information about all capturing groups, even ones for patterns that + /// weren't involved in a match. + /// + /// Note that a `GroupInfo` uses reference counting internally, so it may + /// be cloned cheaply. + /// + /// # Example + /// + /// This example shows how to get all capturing group names from the + /// underlying `GroupInfo`. Notice that we don't even need to run a + /// search. + /// + /// ``` + /// use regex_automata::{nfa::thompson::pikevm::PikeVM, PatternID}; + /// + /// let re = PikeVM::new_many(&[ + /// r"(?P<foo>a)", + /// r"(a)(b)", + /// r"ab", + /// r"(?P<bar>a)(?P<quux>a)", + /// r"(?P<foo>z)", + /// ])?; + /// let caps = re.create_captures(); + /// + /// let expected = vec![ + /// (PatternID::must(0), 0, None), + /// (PatternID::must(0), 1, Some("foo")), + /// (PatternID::must(1), 0, None), + /// (PatternID::must(1), 1, None), + /// (PatternID::must(1), 2, None), + /// (PatternID::must(2), 0, None), + /// (PatternID::must(3), 0, None), + /// (PatternID::must(3), 1, Some("bar")), + /// (PatternID::must(3), 2, Some("quux")), + /// (PatternID::must(4), 0, None), + /// (PatternID::must(4), 1, Some("foo")), + /// ]; + /// // We could also just use 're.get_nfa().group_info()'. + /// let got: Vec<(PatternID, usize, Option<&str>)> = + /// caps.group_info().all_names().collect(); + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn group_info(&self) -> &GroupInfo { + &self.group_info + } + + /// Interpolates the capture references in `replacement` with the + /// corresponding substrings in `haystack` matched by each reference. The + /// interpolated string is returned. + /// + /// See the [`interpolate` module](interpolate) for documentation on the + /// format of the replacement string. + /// + /// # Example + /// + /// This example shows how to use interpolation, and also shows how it + /// can work with multi-pattern regexes. + /// + /// ``` + /// use regex_automata::{nfa::thompson::pikevm::PikeVM, PatternID}; + /// + /// let re = PikeVM::new_many(&[ + /// r"(?<day>[0-9]{2})-(?<month>[0-9]{2})-(?<year>[0-9]{4})", + /// r"(?<year>[0-9]{4})-(?<month>[0-9]{2})-(?<day>[0-9]{2})", + /// ])?; + /// let mut cache = re.create_cache(); + /// let mut caps = re.create_captures(); + /// + /// let replacement = "year=$year, month=$month, day=$day"; + /// + /// // This matches the first pattern. + /// let hay = "On 14-03-2010, I became a Tenneessee lamb."; + /// re.captures(&mut cache, hay, &mut caps); + /// let result = caps.interpolate_string(hay, replacement); + /// assert_eq!("year=2010, month=03, day=14", result); + /// + /// // And this matches the second pattern. + /// let hay = "On 2010-03-14, I became a Tenneessee lamb."; + /// re.captures(&mut cache, hay, &mut caps); + /// let result = caps.interpolate_string(hay, replacement); + /// assert_eq!("year=2010, month=03, day=14", result); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn interpolate_string( + &self, + haystack: &str, + replacement: &str, + ) -> String { + let mut dst = String::new(); + self.interpolate_string_into(haystack, replacement, &mut dst); + dst + } + + /// Interpolates the capture references in `replacement` with the + /// corresponding substrings in `haystack` matched by each reference. The + /// interpolated string is written to `dst`. + /// + /// See the [`interpolate` module](interpolate) for documentation on the + /// format of the replacement string. + /// + /// # Example + /// + /// This example shows how to use interpolation, and also shows how it + /// can work with multi-pattern regexes. + /// + /// ``` + /// use regex_automata::{nfa::thompson::pikevm::PikeVM, PatternID}; + /// + /// let re = PikeVM::new_many(&[ + /// r"(?<day>[0-9]{2})-(?<month>[0-9]{2})-(?<year>[0-9]{4})", + /// r"(?<year>[0-9]{4})-(?<month>[0-9]{2})-(?<day>[0-9]{2})", + /// ])?; + /// let mut cache = re.create_cache(); + /// let mut caps = re.create_captures(); + /// + /// let replacement = "year=$year, month=$month, day=$day"; + /// + /// // This matches the first pattern. + /// let hay = "On 14-03-2010, I became a Tenneessee lamb."; + /// re.captures(&mut cache, hay, &mut caps); + /// let mut dst = String::new(); + /// caps.interpolate_string_into(hay, replacement, &mut dst); + /// assert_eq!("year=2010, month=03, day=14", dst); + /// + /// // And this matches the second pattern. + /// let hay = "On 2010-03-14, I became a Tenneessee lamb."; + /// re.captures(&mut cache, hay, &mut caps); + /// let mut dst = String::new(); + /// caps.interpolate_string_into(hay, replacement, &mut dst); + /// assert_eq!("year=2010, month=03, day=14", dst); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn interpolate_string_into( + &self, + haystack: &str, + replacement: &str, + dst: &mut String, + ) { + interpolate::string( + replacement, + |index, dst| { + let span = match self.get_group(index) { + None => return, + Some(span) => span, + }; + dst.push_str(&haystack[span]); + }, + |name| self.group_info().to_index(self.pattern()?, name), + dst, + ); + } + + /// Interpolates the capture references in `replacement` with the + /// corresponding substrings in `haystack` matched by each reference. The + /// interpolated byte string is returned. + /// + /// See the [`interpolate` module](interpolate) for documentation on the + /// format of the replacement string. + /// + /// # Example + /// + /// This example shows how to use interpolation, and also shows how it + /// can work with multi-pattern regexes. + /// + /// ``` + /// use regex_automata::{nfa::thompson::pikevm::PikeVM, PatternID}; + /// + /// let re = PikeVM::new_many(&[ + /// r"(?<day>[0-9]{2})-(?<month>[0-9]{2})-(?<year>[0-9]{4})", + /// r"(?<year>[0-9]{4})-(?<month>[0-9]{2})-(?<day>[0-9]{2})", + /// ])?; + /// let mut cache = re.create_cache(); + /// let mut caps = re.create_captures(); + /// + /// let replacement = b"year=$year, month=$month, day=$day"; + /// + /// // This matches the first pattern. + /// let hay = b"On 14-03-2010, I became a Tenneessee lamb."; + /// re.captures(&mut cache, hay, &mut caps); + /// let result = caps.interpolate_bytes(hay, replacement); + /// assert_eq!(&b"year=2010, month=03, day=14"[..], result); + /// + /// // And this matches the second pattern. + /// let hay = b"On 2010-03-14, I became a Tenneessee lamb."; + /// re.captures(&mut cache, hay, &mut caps); + /// let result = caps.interpolate_bytes(hay, replacement); + /// assert_eq!(&b"year=2010, month=03, day=14"[..], result); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn interpolate_bytes( + &self, + haystack: &[u8], + replacement: &[u8], + ) -> Vec<u8> { + let mut dst = vec![]; + self.interpolate_bytes_into(haystack, replacement, &mut dst); + dst + } + + /// Interpolates the capture references in `replacement` with the + /// corresponding substrings in `haystack` matched by each reference. The + /// interpolated byte string is written to `dst`. + /// + /// See the [`interpolate` module](interpolate) for documentation on the + /// format of the replacement string. + /// + /// # Example + /// + /// This example shows how to use interpolation, and also shows how it + /// can work with multi-pattern regexes. + /// + /// ``` + /// use regex_automata::{nfa::thompson::pikevm::PikeVM, PatternID}; + /// + /// let re = PikeVM::new_many(&[ + /// r"(?<day>[0-9]{2})-(?<month>[0-9]{2})-(?<year>[0-9]{4})", + /// r"(?<year>[0-9]{4})-(?<month>[0-9]{2})-(?<day>[0-9]{2})", + /// ])?; + /// let mut cache = re.create_cache(); + /// let mut caps = re.create_captures(); + /// + /// let replacement = b"year=$year, month=$month, day=$day"; + /// + /// // This matches the first pattern. + /// let hay = b"On 14-03-2010, I became a Tenneessee lamb."; + /// re.captures(&mut cache, hay, &mut caps); + /// let mut dst = vec![]; + /// caps.interpolate_bytes_into(hay, replacement, &mut dst); + /// assert_eq!(&b"year=2010, month=03, day=14"[..], dst); + /// + /// // And this matches the second pattern. + /// let hay = b"On 2010-03-14, I became a Tenneessee lamb."; + /// re.captures(&mut cache, hay, &mut caps); + /// let mut dst = vec![]; + /// caps.interpolate_bytes_into(hay, replacement, &mut dst); + /// assert_eq!(&b"year=2010, month=03, day=14"[..], dst); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn interpolate_bytes_into( + &self, + haystack: &[u8], + replacement: &[u8], + dst: &mut Vec<u8>, + ) { + interpolate::bytes( + replacement, + |index, dst| { + let span = match self.get_group(index) { + None => return, + Some(span) => span, + }; + dst.extend_from_slice(&haystack[span]); + }, + |name| self.group_info().to_index(self.pattern()?, name), + dst, + ); + } + + /// This is a convenience routine for extracting the substrings + /// corresponding to matching capture groups in the given `haystack`. The + /// `haystack` should be the same substring used to find the match spans in + /// this `Captures` value. + /// + /// This is identical to [`Captures::extract_bytes`], except it works with + /// `&str` instead of `&[u8]`. + /// + /// # Panics + /// + /// This panics if the number of explicit matching groups in this + /// `Captures` value is less than `N`. This also panics if this `Captures` + /// value does not correspond to a match. + /// + /// Note that this does *not* panic if the number of explicit matching + /// groups is bigger than `N`. In that case, only the first `N` matching + /// groups are extracted. + /// + /// # Example + /// + /// ``` + /// use regex_automata::nfa::thompson::pikevm::PikeVM; + /// + /// let re = PikeVM::new(r"([0-9]{4})-([0-9]{2})-([0-9]{2})")?; + /// let mut cache = re.create_cache(); + /// let mut caps = re.create_captures(); + /// + /// let hay = "On 2010-03-14, I became a Tenneessee lamb."; + /// re.captures(&mut cache, hay, &mut caps); + /// assert!(caps.is_match()); + /// let (full, [year, month, day]) = caps.extract(hay); + /// assert_eq!("2010-03-14", full); + /// assert_eq!("2010", year); + /// assert_eq!("03", month); + /// assert_eq!("14", day); + /// + /// // We can also ask for fewer than all capture groups. + /// let (full, [year]) = caps.extract(hay); + /// assert_eq!("2010-03-14", full); + /// assert_eq!("2010", year); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn extract<'h, const N: usize>( + &self, + haystack: &'h str, + ) -> (&'h str, [&'h str; N]) { + let mut matched = self.iter().flatten(); + let whole_match = &haystack[matched.next().expect("a match")]; + let group_matches = [0; N].map(|_| { + let sp = matched.next().expect("too few matching groups"); + &haystack[sp] + }); + (whole_match, group_matches) + } + + /// This is a convenience routine for extracting the substrings + /// corresponding to matching capture groups in the given `haystack`. The + /// `haystack` should be the same substring used to find the match spans in + /// this `Captures` value. + /// + /// This is identical to [`Captures::extract`], except it works with + /// `&[u8]` instead of `&str`. + /// + /// # Panics + /// + /// This panics if the number of explicit matching groups in this + /// `Captures` value is less than `N`. This also panics if this `Captures` + /// value does not correspond to a match. + /// + /// Note that this does *not* panic if the number of explicit matching + /// groups is bigger than `N`. In that case, only the first `N` matching + /// groups are extracted. + /// + /// # Example + /// + /// ``` + /// use regex_automata::nfa::thompson::pikevm::PikeVM; + /// + /// let re = PikeVM::new(r"([0-9]{4})-([0-9]{2})-([0-9]{2})")?; + /// let mut cache = re.create_cache(); + /// let mut caps = re.create_captures(); + /// + /// let hay = b"On 2010-03-14, I became a Tenneessee lamb."; + /// re.captures(&mut cache, hay, &mut caps); + /// assert!(caps.is_match()); + /// let (full, [year, month, day]) = caps.extract_bytes(hay); + /// assert_eq!(b"2010-03-14", full); + /// assert_eq!(b"2010", year); + /// assert_eq!(b"03", month); + /// assert_eq!(b"14", day); + /// + /// // We can also ask for fewer than all capture groups. + /// let (full, [year]) = caps.extract_bytes(hay); + /// assert_eq!(b"2010-03-14", full); + /// assert_eq!(b"2010", year); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn extract_bytes<'h, const N: usize>( + &self, + haystack: &'h [u8], + ) -> (&'h [u8], [&'h [u8]; N]) { + let mut matched = self.iter().flatten(); + let whole_match = &haystack[matched.next().expect("a match")]; + let group_matches = [0; N].map(|_| { + let sp = matched.next().expect("too few matching groups"); + &haystack[sp] + }); + (whole_match, group_matches) + } +} + +/// Lower level "slot" oriented APIs. One does not typically need to use these +/// when executing a search. They are instead mostly intended for folks that +/// are writing their own regex engine while reusing this `Captures` type. +impl Captures { + /// Clear this `Captures` value. + /// + /// After clearing, all slots inside this `Captures` value will be set to + /// `None`. Similarly, any pattern ID that it was previously associated + /// with (for a match) is erased. + /// + /// It is not usually necessary to call this routine. Namely, a `Captures` + /// value only provides high level access to the capturing groups of the + /// pattern that matched, and only low level access to individual slots. + /// Thus, even if slots corresponding to groups that aren't associated + /// with the matching pattern are set, then it won't impact the higher + /// level APIs. Namely, higher level APIs like [`Captures::get_group`] will + /// return `None` if no pattern ID is present, even if there are spans set + /// in the underlying slots. + /// + /// Thus, to "clear" a `Captures` value of a match, it is usually only + /// necessary to call [`Captures::set_pattern`] with `None`. + /// + /// # Example + /// + /// This example shows what happens when a `Captures` value is cleared. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::nfa::thompson::pikevm::PikeVM; + /// + /// let re = PikeVM::new(r"^(?P<first>\pL+)\s+(?P<last>\pL+)$")?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// + /// re.captures(&mut cache, "Bruce Springsteen", &mut caps); + /// assert!(caps.is_match()); + /// let slots: Vec<Option<usize>> = + /// caps.slots().iter().map(|s| s.map(|x| x.get())).collect(); + /// // Note that the following ordering is considered an API guarantee. + /// assert_eq!(slots, vec![ + /// Some(0), + /// Some(17), + /// Some(0), + /// Some(5), + /// Some(6), + /// Some(17), + /// ]); + /// + /// // Now clear the slots. Everything is gone and it is no longer a match. + /// caps.clear(); + /// assert!(!caps.is_match()); + /// let slots: Vec<Option<usize>> = + /// caps.slots().iter().map(|s| s.map(|x| x.get())).collect(); + /// assert_eq!(slots, vec![ + /// None, + /// None, + /// None, + /// None, + /// None, + /// None, + /// ]); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn clear(&mut self) { + self.pid = None; + for slot in self.slots.iter_mut() { + *slot = None; + } + } + + /// Set the pattern on this `Captures` value. + /// + /// When the pattern ID is `None`, then this `Captures` value does not + /// correspond to a match (`is_match` will return `false`). Otherwise, it + /// corresponds to a match. + /// + /// This is useful in search implementations where you might want to + /// initially call `set_pattern(None)` in order to avoid the cost of + /// calling `clear()` if it turns out to not be necessary. + /// + /// # Example + /// + /// This example shows that `set_pattern` merely overwrites the pattern ID. + /// It does not actually change the underlying slot values. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::nfa::thompson::pikevm::PikeVM; + /// + /// let re = PikeVM::new(r"^(?P<first>\pL+)\s+(?P<last>\pL+)$")?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// + /// re.captures(&mut cache, "Bruce Springsteen", &mut caps); + /// assert!(caps.is_match()); + /// assert!(caps.pattern().is_some()); + /// let slots: Vec<Option<usize>> = + /// caps.slots().iter().map(|s| s.map(|x| x.get())).collect(); + /// // Note that the following ordering is considered an API guarantee. + /// assert_eq!(slots, vec![ + /// Some(0), + /// Some(17), + /// Some(0), + /// Some(5), + /// Some(6), + /// Some(17), + /// ]); + /// + /// // Now set the pattern to None. Note that the slot values remain. + /// caps.set_pattern(None); + /// assert!(!caps.is_match()); + /// assert!(!caps.pattern().is_some()); + /// let slots: Vec<Option<usize>> = + /// caps.slots().iter().map(|s| s.map(|x| x.get())).collect(); + /// // Note that the following ordering is considered an API guarantee. + /// assert_eq!(slots, vec![ + /// Some(0), + /// Some(17), + /// Some(0), + /// Some(5), + /// Some(6), + /// Some(17), + /// ]); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn set_pattern(&mut self, pid: Option<PatternID>) { + self.pid = pid; + } + + /// Returns the underlying slots, where each slot stores a single offset. + /// + /// Every matching capturing group generally corresponds to two slots: one + /// slot for the starting position and another for the ending position. + /// Typically, either both are present or neither are. (The weasel word + /// "typically" is used here because it really depends on the regex engine + /// implementation. Every sensible regex engine likely adheres to this + /// invariant, and every regex engine in this crate is sensible.) + /// + /// Generally speaking, callers should prefer to use higher level routines + /// like [`Captures::get_match`] or [`Captures::get_group`]. + /// + /// An important note here is that a regex engine may not reset all of the + /// slots to `None` values when no match occurs, or even when a match of + /// a different pattern occurs. But this depends on how the regex engine + /// implementation deals with slots. + /// + /// # Example + /// + /// This example shows how to get the underlying slots from a regex match. + /// + /// ``` + /// use regex_automata::{ + /// nfa::thompson::pikevm::PikeVM, + /// util::primitives::{PatternID, NonMaxUsize}, + /// }; + /// + /// let re = PikeVM::new_many(&[ + /// r"[a-z]+", + /// r"[0-9]+", + /// ])?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// + /// re.captures(&mut cache, "123", &mut caps); + /// assert_eq!(Some(PatternID::must(1)), caps.pattern()); + /// // Note that the only guarantee we have here is that slots 2 and 3 + /// // are set to correct values. The contents of the first two slots are + /// // unspecified since the 0th pattern did not match. + /// let expected = &[ + /// None, + /// None, + /// NonMaxUsize::new(0), + /// NonMaxUsize::new(3), + /// ]; + /// assert_eq!(expected, caps.slots()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn slots(&self) -> &[Option<NonMaxUsize>] { + &self.slots + } + + /// Returns the underlying slots as a mutable slice, where each slot stores + /// a single offset. + /// + /// This tends to be most useful for regex engine implementations for + /// writing offsets for matching capturing groups to slots. + /// + /// See [`Captures::slots`] for more information about slots. + #[inline] + pub fn slots_mut(&mut self) -> &mut [Option<NonMaxUsize>] { + &mut self.slots + } +} + +impl core::fmt::Debug for Captures { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + let mut dstruct = f.debug_struct("Captures"); + dstruct.field("pid", &self.pid); + if let Some(pid) = self.pid { + dstruct.field("spans", &CapturesDebugMap { pid, caps: self }); + } + dstruct.finish() + } +} + +/// A little helper type to provide a nice map-like debug representation for +/// our capturing group spans. +struct CapturesDebugMap<'a> { + pid: PatternID, + caps: &'a Captures, +} + +impl<'a> core::fmt::Debug for CapturesDebugMap<'a> { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + struct Key<'a>(usize, Option<&'a str>); + + impl<'a> core::fmt::Debug for Key<'a> { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + write!(f, "{}", self.0)?; + if let Some(name) = self.1 { + write!(f, "/{:?}", name)?; + } + Ok(()) + } + } + + let mut map = f.debug_map(); + let names = self.caps.group_info().pattern_names(self.pid); + for (group_index, maybe_name) in names.enumerate() { + let key = Key(group_index, maybe_name); + match self.caps.get_group(group_index) { + None => map.entry(&key, &None::<()>), + Some(span) => map.entry(&key, &span), + }; + } + map.finish() + } +} + +/// An iterator over all capturing groups in a `Captures` value. +/// +/// This iterator includes capturing groups that did not participate in a +/// match. See the [`Captures::iter`] method documentation for more details +/// and examples. +/// +/// The lifetime parameter `'a` refers to the lifetime of the underlying +/// `Captures` value. +#[derive(Clone, Debug)] +pub struct CapturesPatternIter<'a> { + caps: &'a Captures, + names: core::iter::Enumerate<GroupInfoPatternNames<'a>>, +} + +impl<'a> Iterator for CapturesPatternIter<'a> { + type Item = Option<Span>; + + fn next(&mut self) -> Option<Option<Span>> { + let (group_index, _) = self.names.next()?; + Some(self.caps.get_group(group_index)) + } + + fn size_hint(&self) -> (usize, Option<usize>) { + self.names.size_hint() + } + + fn count(self) -> usize { + self.names.count() + } +} + +impl<'a> ExactSizeIterator for CapturesPatternIter<'a> {} +impl<'a> core::iter::FusedIterator for CapturesPatternIter<'a> {} + +/// Represents information about capturing groups in a compiled regex. +/// +/// The information encapsulated by this type consists of the following. For +/// each pattern: +/// +/// * A map from every capture group name to its corresponding capture group +/// index. +/// * A map from every capture group index to its corresponding capture group +/// name. +/// * A map from capture group index to its corresponding slot index. A slot +/// refers to one half of a capturing group. That is, a capture slot is either +/// the start or end of a capturing group. A slot is usually the mechanism +/// by which a regex engine records offsets for each capturing group during a +/// search. +/// +/// A `GroupInfo` uses reference counting internally and is thus cheap to +/// clone. +/// +/// # Mapping from capture groups to slots +/// +/// One of the main responsibilities of a `GroupInfo` is to build a mapping +/// from `(PatternID, u32)` (where the `u32` is a capture index) to something +/// called a "slot." As mentioned above, a slot refers to one half of a +/// capturing group. Both combined provide the start and end offsets of +/// a capturing group that participated in a match. +/// +/// **The mapping between group indices and slots is an API guarantee.** That +/// is, the mapping won't change within a semver compatible release. +/// +/// Slots exist primarily because this is a convenient mechanism by which +/// regex engines report group offsets at search time. For example, the +/// [`nfa::thompson::State::Capture`](crate::nfa::thompson::State::Capture) +/// NFA state includes the slot index. When a regex engine transitions through +/// this state, it will likely use the slot index to write the current haystack +/// offset to some region of memory. When a match is found, those slots are +/// then reported to the caller, typically via a convenient abstraction like a +/// [`Captures`] value. +/// +/// Because this crate provides first class support for multi-pattern regexes, +/// and because of some performance related reasons, the mapping between +/// capturing groups and slots is a little complex. However, in the case of a +/// single pattern, the mapping can be described very simply: for all capture +/// group indices `i`, its corresponding slots are at `i * 2` and `i * 2 + 1`. +/// Notice that the pattern ID isn't involved at all here, because it only +/// applies to a single-pattern regex, it is therefore always `0`. +/// +/// In the multi-pattern case, the mapping is a bit more complicated. To talk +/// about it, we must define what we mean by "implicit" vs "explicit" +/// capturing groups: +/// +/// * An **implicit** capturing group refers to the capturing group that is +/// present for every pattern automatically, and corresponds to the overall +/// match of a pattern. Every pattern has precisely one implicit capturing +/// group. It is always unnamed and it always corresponds to the capture group +/// index `0`. +/// * An **explicit** capturing group refers to any capturing group that +/// appears in the concrete syntax of the pattern. (Or, if an NFA was hand +/// built without any concrete syntax, it refers to any capturing group with an +/// index greater than `0`.) +/// +/// Some examples: +/// +/// * `\w+` has one implicit capturing group and zero explicit capturing +/// groups. +/// * `(\w+)` has one implicit group and one explicit group. +/// * `foo(\d+)(?:\pL+)(\d+)` has one implicit group and two explicit groups. +/// +/// Turning back to the slot mapping, we can now state it as follows: +/// +/// * Given a pattern ID `pid`, the slots for its implicit group are always +/// at `pid * 2` and `pid * 2 + 1`. +/// * Given a pattern ID `0`, the slots for its explicit groups start +/// at `group_info.pattern_len() * 2`. +/// * Given a pattern ID `pid > 0`, the slots for its explicit groups start +/// immediately following where the slots for the explicit groups of `pid - 1` +/// end. +/// +/// In particular, while there is a concrete formula one can use to determine +/// where the slots for the implicit group of any pattern are, there is no +/// general formula for determining where the slots for explicit capturing +/// groups are. This is because each pattern can contain a different number +/// of groups. +/// +/// The intended way of getting the slots for a particular capturing group +/// (whether implicit or explicit) is via the [`GroupInfo::slot`] or +/// [`GroupInfo::slots`] method. +/// +/// See below for a concrete example of how capturing groups get mapped to +/// slots. +/// +/// # Example +/// +/// This example shows how to build a new `GroupInfo` and query it for +/// information. +/// +/// ``` +/// use regex_automata::util::{captures::GroupInfo, primitives::PatternID}; +/// +/// let info = GroupInfo::new(vec![ +/// vec![None, Some("foo")], +/// vec![None], +/// vec![None, None, None, Some("bar"), None], +/// vec![None, None, Some("foo")], +/// ])?; +/// // The number of patterns being tracked. +/// assert_eq!(4, info.pattern_len()); +/// // We can query the number of groups for any pattern. +/// assert_eq!(2, info.group_len(PatternID::must(0))); +/// assert_eq!(1, info.group_len(PatternID::must(1))); +/// assert_eq!(5, info.group_len(PatternID::must(2))); +/// assert_eq!(3, info.group_len(PatternID::must(3))); +/// // An invalid pattern always has zero groups. +/// assert_eq!(0, info.group_len(PatternID::must(999))); +/// // 2 slots per group +/// assert_eq!(22, info.slot_len()); +/// +/// // We can map a group index for a particular pattern to its name, if +/// // one exists. +/// assert_eq!(Some("foo"), info.to_name(PatternID::must(3), 2)); +/// assert_eq!(None, info.to_name(PatternID::must(2), 4)); +/// // Or map a name to its group index. +/// assert_eq!(Some(1), info.to_index(PatternID::must(0), "foo")); +/// assert_eq!(Some(2), info.to_index(PatternID::must(3), "foo")); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +/// +/// # Example: mapping from capture groups to slots +/// +/// This example shows the specific mapping from capture group indices for +/// each pattern to their corresponding slots. The slot values shown in this +/// example are considered an API guarantee. +/// +/// ``` +/// use regex_automata::util::{captures::GroupInfo, primitives::PatternID}; +/// +/// let info = GroupInfo::new(vec![ +/// vec![None, Some("foo")], +/// vec![None], +/// vec![None, None, None, Some("bar"), None], +/// vec![None, None, Some("foo")], +/// ])?; +/// +/// // We first show the slots for each pattern's implicit group. +/// assert_eq!(Some((0, 1)), info.slots(PatternID::must(0), 0)); +/// assert_eq!(Some((2, 3)), info.slots(PatternID::must(1), 0)); +/// assert_eq!(Some((4, 5)), info.slots(PatternID::must(2), 0)); +/// assert_eq!(Some((6, 7)), info.slots(PatternID::must(3), 0)); +/// +/// // And now we show the slots for each pattern's explicit group. +/// assert_eq!(Some((8, 9)), info.slots(PatternID::must(0), 1)); +/// assert_eq!(Some((10, 11)), info.slots(PatternID::must(2), 1)); +/// assert_eq!(Some((12, 13)), info.slots(PatternID::must(2), 2)); +/// assert_eq!(Some((14, 15)), info.slots(PatternID::must(2), 3)); +/// assert_eq!(Some((16, 17)), info.slots(PatternID::must(2), 4)); +/// assert_eq!(Some((18, 19)), info.slots(PatternID::must(3), 1)); +/// assert_eq!(Some((20, 21)), info.slots(PatternID::must(3), 2)); +/// +/// // Asking for the slots for an invalid pattern ID or even for an invalid +/// // group index for a specific pattern will return None. So for example, +/// // you're guaranteed to not get the slots for a different pattern than the +/// // one requested. +/// assert_eq!(None, info.slots(PatternID::must(5), 0)); +/// assert_eq!(None, info.slots(PatternID::must(1), 1)); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +#[derive(Clone, Debug, Default)] +pub struct GroupInfo(Arc<GroupInfoInner>); + +impl GroupInfo { + /// Creates a new group info from a sequence of patterns, where each + /// sequence of patterns yields a sequence of possible group names. The + /// index of each pattern in the sequence corresponds to its `PatternID`, + /// and the index of each group in each pattern's sequence corresponds to + /// its corresponding group index. + /// + /// While this constructor is very generic and therefore perhaps hard to + /// chew on, an example of a valid concrete type that can be passed to + /// this constructor is `Vec<Vec<Option<String>>>`. The outer `Vec` + /// corresponds to the patterns, i.e., one `Vec<Option<String>>` per + /// pattern. The inner `Vec` corresponds to the capturing groups for + /// each pattern. The `Option<String>` corresponds to the name of the + /// capturing group, if present. + /// + /// It is legal to pass an empty iterator to this constructor. It will + /// return an empty group info with zero slots. An empty group info is + /// useful for cases where you have no patterns or for cases where slots + /// aren't being used at all (e.g., for most DFAs in this crate). + /// + /// # Errors + /// + /// This constructor returns an error if the given capturing groups are + /// invalid in some way. Those reasons include, but are not necessarily + /// limited to: + /// + /// * Too many patterns (i.e., `PatternID` would overflow). + /// * Too many capturing groups (e.g., `u32` would overflow). + /// * A pattern is given that has no capturing groups. (All patterns must + /// have at least an implicit capturing group at index `0`.) + /// * The capturing group at index `0` has a name. It must be unnamed. + /// * There are duplicate capturing group names within the same pattern. + /// (Multiple capturing groups with the same name may exist, but they + /// must be in different patterns.) + /// + /// An example below shows how to trigger some of the above error + /// conditions. + /// + /// # Example + /// + /// This example shows how to build a new `GroupInfo` and query it for + /// information. + /// + /// ``` + /// use regex_automata::util::captures::GroupInfo; + /// + /// let info = GroupInfo::new(vec![ + /// vec![None, Some("foo")], + /// vec![None], + /// vec![None, None, None, Some("bar"), None], + /// vec![None, None, Some("foo")], + /// ])?; + /// // The number of patterns being tracked. + /// assert_eq!(4, info.pattern_len()); + /// // 2 slots per group + /// assert_eq!(22, info.slot_len()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// # Example: empty `GroupInfo` + /// + /// This example shows how to build a new `GroupInfo` and query it for + /// information. + /// + /// ``` + /// use regex_automata::util::captures::GroupInfo; + /// + /// let info = GroupInfo::empty(); + /// // Everything is zero. + /// assert_eq!(0, info.pattern_len()); + /// assert_eq!(0, info.slot_len()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// # Example: error conditions + /// + /// This example shows how to provoke some of the ways in which building + /// a `GroupInfo` can fail. + /// + /// ``` + /// use regex_automata::util::captures::GroupInfo; + /// + /// // Either the group info is empty, or all patterns must have at least + /// // one capturing group. + /// assert!(GroupInfo::new(vec![ + /// vec![None, Some("a")], // ok + /// vec![None], // ok + /// vec![], // not ok + /// ]).is_err()); + /// // Note that building an empty group info is OK. + /// assert!(GroupInfo::new(Vec::<Vec<Option<String>>>::new()).is_ok()); + /// + /// // The first group in each pattern must correspond to an implicit + /// // anonymous group. i.e., One that is not named. By convention, this + /// // group corresponds to the overall match of a regex. Every other group + /// // in a pattern is explicit and optional. + /// assert!(GroupInfo::new(vec![vec![Some("foo")]]).is_err()); + /// + /// // There must not be duplicate group names within the same pattern. + /// assert!(GroupInfo::new(vec![ + /// vec![None, Some("foo"), Some("foo")], + /// ]).is_err()); + /// // But duplicate names across distinct patterns is OK. + /// assert!(GroupInfo::new(vec![ + /// vec![None, Some("foo")], + /// vec![None, Some("foo")], + /// ]).is_ok()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// There are other ways for building a `GroupInfo` to fail but are + /// difficult to show. For example, if the number of patterns given would + /// overflow `PatternID`. + pub fn new<P, G, N>(pattern_groups: P) -> Result<GroupInfo, GroupInfoError> + where + P: IntoIterator<Item = G>, + G: IntoIterator<Item = Option<N>>, + N: AsRef<str>, + { + let mut group_info = GroupInfoInner { + slot_ranges: vec![], + name_to_index: vec![], + index_to_name: vec![], + memory_extra: 0, + }; + for (pattern_index, groups) in pattern_groups.into_iter().enumerate() { + // If we can't convert the pattern index to an ID, then the caller + // tried to build capture info for too many patterns. + let pid = PatternID::new(pattern_index) + .map_err(GroupInfoError::too_many_patterns)?; + + let mut groups_iter = groups.into_iter().enumerate(); + match groups_iter.next() { + None => return Err(GroupInfoError::missing_groups(pid)), + Some((_, Some(_))) => { + return Err(GroupInfoError::first_must_be_unnamed(pid)) + } + Some((_, None)) => {} + } + group_info.add_first_group(pid); + // Now iterate over the rest, which correspond to all of the + // (conventionally) explicit capture groups in a regex pattern. + for (group_index, maybe_name) in groups_iter { + // Just like for patterns, if the group index can't be + // converted to a "small" index, then the caller has given too + // many groups for a particular pattern. + let group = SmallIndex::new(group_index).map_err(|_| { + GroupInfoError::too_many_groups(pid, group_index) + })?; + group_info.add_explicit_group(pid, group, maybe_name)?; + } + } + group_info.fixup_slot_ranges()?; + Ok(GroupInfo(Arc::new(group_info))) + } + + /// This creates an empty `GroupInfo`. + /// + /// This is a convenience routine for calling `GroupInfo::new` with an + /// iterator that yields no elements. + /// + /// # Example + /// + /// This example shows how to build a new empty `GroupInfo` and query it + /// for information. + /// + /// ``` + /// use regex_automata::util::captures::GroupInfo; + /// + /// let info = GroupInfo::empty(); + /// // Everything is zero. + /// assert_eq!(0, info.pattern_len()); + /// assert_eq!(0, info.all_group_len()); + /// assert_eq!(0, info.slot_len()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn empty() -> GroupInfo { + GroupInfo::new(core::iter::empty::<[Option<&str>; 0]>()) + .expect("empty group info is always valid") + } + + /// Return the capture group index corresponding to the given name in the + /// given pattern. If no such capture group name exists in the given + /// pattern, then this returns `None`. + /// + /// If the given pattern ID is invalid, then this returns `None`. + /// + /// This also returns `None` for all inputs if these captures are empty + /// (e.g., built from an empty [`GroupInfo`]). To check whether captures + /// are are present for a specific pattern, use [`GroupInfo::group_len`]. + /// + /// # Example + /// + /// This example shows how to find the capture index for the given pattern + /// and group name. + /// + /// Remember that capture indices are relative to the pattern, such that + /// the same capture index value may refer to different capturing groups + /// for distinct patterns. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{nfa::thompson::NFA, PatternID}; + /// + /// let (pid0, pid1) = (PatternID::must(0), PatternID::must(1)); + /// + /// let nfa = NFA::new_many(&[ + /// r"a(?P<quux>\w+)z(?P<foo>\s+)", + /// r"a(?P<foo>\d+)z", + /// ])?; + /// let groups = nfa.group_info(); + /// assert_eq!(Some(2), groups.to_index(pid0, "foo")); + /// // Recall that capture index 0 is always unnamed and refers to the + /// // entire pattern. So the first capturing group present in the pattern + /// // itself always starts at index 1. + /// assert_eq!(Some(1), groups.to_index(pid1, "foo")); + /// + /// // And if a name does not exist for a particular pattern, None is + /// // returned. + /// assert!(groups.to_index(pid0, "quux").is_some()); + /// assert!(groups.to_index(pid1, "quux").is_none()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn to_index(&self, pid: PatternID, name: &str) -> Option<usize> { + let indices = self.0.name_to_index.get(pid.as_usize())?; + indices.get(name).cloned().map(|i| i.as_usize()) + } + + /// Return the capture name for the given index and given pattern. If the + /// corresponding group does not have a name, then this returns `None`. + /// + /// If the pattern ID is invalid, then this returns `None`. + /// + /// If the group index is invalid for the given pattern, then this returns + /// `None`. A group `index` is valid for a pattern `pid` in an `nfa` if and + /// only if `index < nfa.pattern_capture_len(pid)`. + /// + /// This also returns `None` for all inputs if these captures are empty + /// (e.g., built from an empty [`GroupInfo`]). To check whether captures + /// are are present for a specific pattern, use [`GroupInfo::group_len`]. + /// + /// # Example + /// + /// This example shows how to find the capture group name for the given + /// pattern and group index. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{nfa::thompson::NFA, PatternID}; + /// + /// let (pid0, pid1) = (PatternID::must(0), PatternID::must(1)); + /// + /// let nfa = NFA::new_many(&[ + /// r"a(?P<foo>\w+)z(\s+)x(\d+)", + /// r"a(\d+)z(?P<foo>\s+)", + /// ])?; + /// let groups = nfa.group_info(); + /// assert_eq!(None, groups.to_name(pid0, 0)); + /// assert_eq!(Some("foo"), groups.to_name(pid0, 1)); + /// assert_eq!(None, groups.to_name(pid0, 2)); + /// assert_eq!(None, groups.to_name(pid0, 3)); + /// + /// assert_eq!(None, groups.to_name(pid1, 0)); + /// assert_eq!(None, groups.to_name(pid1, 1)); + /// assert_eq!(Some("foo"), groups.to_name(pid1, 2)); + /// // '3' is not a valid capture index for the second pattern. + /// assert_eq!(None, groups.to_name(pid1, 3)); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn to_name(&self, pid: PatternID, group_index: usize) -> Option<&str> { + let pattern_names = self.0.index_to_name.get(pid.as_usize())?; + pattern_names.get(group_index)?.as_deref() + } + + /// Return an iterator of all capture groups and their names (if present) + /// for a particular pattern. + /// + /// If the given pattern ID is invalid or if this `GroupInfo` is empty, + /// then the iterator yields no elements. + /// + /// The number of elements yielded by this iterator is always equal to + /// the result of calling [`GroupInfo::group_len`] with the same + /// `PatternID`. + /// + /// # Example + /// + /// This example shows how to get a list of all capture group names for + /// a particular pattern. + /// + /// ``` + /// use regex_automata::{nfa::thompson::NFA, PatternID}; + /// + /// let nfa = NFA::new(r"(a)(?P<foo>b)(c)(d)(?P<bar>e)")?; + /// // The first is the implicit group that is always unnammed. The next + /// // 5 groups are the explicit groups found in the concrete syntax above. + /// let expected = vec![None, None, Some("foo"), None, None, Some("bar")]; + /// let got: Vec<Option<&str>> = + /// nfa.group_info().pattern_names(PatternID::ZERO).collect(); + /// assert_eq!(expected, got); + /// + /// // Using an invalid pattern ID will result in nothing yielded. + /// let got = nfa.group_info().pattern_names(PatternID::must(999)).count(); + /// assert_eq!(0, got); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn pattern_names(&self, pid: PatternID) -> GroupInfoPatternNames<'_> { + GroupInfoPatternNames { + it: self + .0 + .index_to_name + .get(pid.as_usize()) + .map(|indices| indices.iter()) + .unwrap_or([].iter()), + } + } + + /// Return an iterator of all capture groups for all patterns supported by + /// this `GroupInfo`. Each item yielded is a triple of the group's pattern + /// ID, index in the pattern and the group's name, if present. + /// + /// # Example + /// + /// This example shows how to get a list of all capture groups found in + /// one NFA, potentially spanning multiple patterns. + /// + /// ``` + /// use regex_automata::{nfa::thompson::NFA, PatternID}; + /// + /// let nfa = NFA::new_many(&[ + /// r"(?P<foo>a)", + /// r"a", + /// r"(a)", + /// ])?; + /// let expected = vec![ + /// (PatternID::must(0), 0, None), + /// (PatternID::must(0), 1, Some("foo")), + /// (PatternID::must(1), 0, None), + /// (PatternID::must(2), 0, None), + /// (PatternID::must(2), 1, None), + /// ]; + /// let got: Vec<(PatternID, usize, Option<&str>)> = + /// nfa.group_info().all_names().collect(); + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// Unlike other capturing group related routines, this routine doesn't + /// panic even if captures aren't enabled on this NFA: + /// + /// ``` + /// use regex_automata::nfa::thompson::{NFA, WhichCaptures}; + /// + /// let nfa = NFA::compiler() + /// .configure(NFA::config().which_captures(WhichCaptures::None)) + /// .build_many(&[ + /// r"(?P<foo>a)", + /// r"a", + /// r"(a)", + /// ])?; + /// // When captures aren't enabled, there's nothing to return. + /// assert_eq!(0, nfa.group_info().all_names().count()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn all_names(&self) -> GroupInfoAllNames<'_> { + GroupInfoAllNames { + group_info: self, + pids: PatternID::iter(self.pattern_len()), + current_pid: None, + names: None, + } + } + + /// Returns the starting and ending slot corresponding to the given + /// capturing group for the given pattern. The ending slot is always one + /// more than the starting slot returned. + /// + /// Note that this is like [`GroupInfo::slot`], except that it also returns + /// the ending slot value for convenience. + /// + /// If either the pattern ID or the capture index is invalid, then this + /// returns None. + /// + /// # Example + /// + /// This example shows that the starting slots for the first capturing + /// group of each pattern are distinct. + /// + /// ``` + /// use regex_automata::{nfa::thompson::NFA, PatternID}; + /// + /// let nfa = NFA::new_many(&["a", "b"])?; + /// assert_ne!( + /// nfa.group_info().slots(PatternID::must(0), 0), + /// nfa.group_info().slots(PatternID::must(1), 0), + /// ); + /// + /// // Also, the start and end slot values are never equivalent. + /// let (start, end) = nfa.group_info().slots(PatternID::ZERO, 0).unwrap(); + /// assert_ne!(start, end); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn slots( + &self, + pid: PatternID, + group_index: usize, + ) -> Option<(usize, usize)> { + // Since 'slot' only even returns valid starting slots, we know that + // there must also be an end slot and that end slot is always one more + // than the start slot. + self.slot(pid, group_index).map(|start| (start, start + 1)) + } + + /// Returns the starting slot corresponding to the given capturing group + /// for the given pattern. The ending slot is always one more than the + /// value returned. + /// + /// If either the pattern ID or the capture index is invalid, then this + /// returns None. + /// + /// # Example + /// + /// This example shows that the starting slots for the first capturing + /// group of each pattern are distinct. + /// + /// ``` + /// use regex_automata::{nfa::thompson::NFA, PatternID}; + /// + /// let nfa = NFA::new_many(&["a", "b"])?; + /// assert_ne!( + /// nfa.group_info().slot(PatternID::must(0), 0), + /// nfa.group_info().slot(PatternID::must(1), 0), + /// ); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn slot(&self, pid: PatternID, group_index: usize) -> Option<usize> { + if group_index >= self.group_len(pid) { + return None; + } + // At this point, we know that 'pid' refers to a real pattern and that + // 'group_index' refers to a real group. We therefore also know that + // the pattern and group can be combined to return a correct slot. + // That's why we don't need to use checked arithmetic below. + if group_index == 0 { + Some(pid.as_usize() * 2) + } else { + // As above, we don't need to check that our slot is less than the + // end of our range since we already know the group index is a + // valid index for the given pattern. + let (start, _) = self.0.slot_ranges[pid]; + Some(start.as_usize() + ((group_index - 1) * 2)) + } + } + + /// Returns the total number of patterns in this `GroupInfo`. + /// + /// This may return zero if the `GroupInfo` was constructed with no + /// patterns. + /// + /// This is guaranteed to be no bigger than [`PatternID::LIMIT`] because + /// `GroupInfo` construction will fail if too many patterns are added. + /// + /// # Example + /// + /// ``` + /// use regex_automata::nfa::thompson::NFA; + /// + /// let nfa = NFA::new_many(&["[0-9]+", "[a-z]+", "[A-Z]+"])?; + /// assert_eq!(3, nfa.group_info().pattern_len()); + /// + /// let nfa = NFA::never_match(); + /// assert_eq!(0, nfa.group_info().pattern_len()); + /// + /// let nfa = NFA::always_match(); + /// assert_eq!(1, nfa.group_info().pattern_len()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn pattern_len(&self) -> usize { + self.0.pattern_len() + } + + /// Return the number of capture groups in a pattern. + /// + /// If the pattern ID is invalid, then this returns `0`. + /// + /// # Example + /// + /// This example shows how the values returned by this routine may vary + /// for different patterns and NFA configurations. + /// + /// ``` + /// use regex_automata::{nfa::thompson::{NFA, WhichCaptures}, PatternID}; + /// + /// let nfa = NFA::new(r"(a)(b)(c)")?; + /// // There are 3 explicit groups in the pattern's concrete syntax and + /// // 1 unnamed and implicit group spanning the entire pattern. + /// assert_eq!(4, nfa.group_info().group_len(PatternID::ZERO)); + /// + /// let nfa = NFA::new(r"abc")?; + /// // There is just the unnamed implicit group. + /// assert_eq!(1, nfa.group_info().group_len(PatternID::ZERO)); + /// + /// let nfa = NFA::compiler() + /// .configure(NFA::config().which_captures(WhichCaptures::None)) + /// .build(r"abc")?; + /// // We disabled capturing groups, so there are none. + /// assert_eq!(0, nfa.group_info().group_len(PatternID::ZERO)); + /// + /// let nfa = NFA::compiler() + /// .configure(NFA::config().which_captures(WhichCaptures::None)) + /// .build(r"(a)(b)(c)")?; + /// // We disabled capturing groups, so there are none, even if there are + /// // explicit groups in the concrete syntax. + /// assert_eq!(0, nfa.group_info().group_len(PatternID::ZERO)); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn group_len(&self, pid: PatternID) -> usize { + self.0.group_len(pid) + } + + /// Return the total number of capture groups across all patterns. + /// + /// This includes implicit groups that represent the entire match of a + /// pattern. + /// + /// # Example + /// + /// This example shows how the values returned by this routine may vary + /// for different patterns and NFA configurations. + /// + /// ``` + /// use regex_automata::{nfa::thompson::{NFA, WhichCaptures}, PatternID}; + /// + /// let nfa = NFA::new(r"(a)(b)(c)")?; + /// // There are 3 explicit groups in the pattern's concrete syntax and + /// // 1 unnamed and implicit group spanning the entire pattern. + /// assert_eq!(4, nfa.group_info().all_group_len()); + /// + /// let nfa = NFA::new(r"abc")?; + /// // There is just the unnamed implicit group. + /// assert_eq!(1, nfa.group_info().all_group_len()); + /// + /// let nfa = NFA::new_many(&["(a)", "b", "(c)"])?; + /// // Each pattern has one implicit groups, and two + /// // patterns have one explicit group each. + /// assert_eq!(5, nfa.group_info().all_group_len()); + /// + /// let nfa = NFA::compiler() + /// .configure(NFA::config().which_captures(WhichCaptures::None)) + /// .build(r"abc")?; + /// // We disabled capturing groups, so there are none. + /// assert_eq!(0, nfa.group_info().all_group_len()); + /// + /// let nfa = NFA::compiler() + /// .configure(NFA::config().which_captures(WhichCaptures::None)) + /// .build(r"(a)(b)(c)")?; + /// // We disabled capturing groups, so there are none, even if there are + /// // explicit groups in the concrete syntax. + /// assert_eq!(0, nfa.group_info().group_len(PatternID::ZERO)); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn all_group_len(&self) -> usize { + self.slot_len() / 2 + } + + /// Returns the total number of slots in this `GroupInfo` across all + /// patterns. + /// + /// The total number of slots is always twice the total number of capturing + /// groups, including both implicit and explicit groups. + /// + /// # Example + /// + /// This example shows the relationship between the number of capturing + /// groups and slots. + /// + /// ``` + /// use regex_automata::util::captures::GroupInfo; + /// + /// // There are 11 total groups here. + /// let info = GroupInfo::new(vec![ + /// vec![None, Some("foo")], + /// vec![None], + /// vec![None, None, None, Some("bar"), None], + /// vec![None, None, Some("foo")], + /// ])?; + /// // 2 slots per group gives us 11*2=22 slots. + /// assert_eq!(22, info.slot_len()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn slot_len(&self) -> usize { + self.0.small_slot_len().as_usize() + } + + /// Returns the total number of slots for implicit capturing groups. + /// + /// This is like [`GroupInfo::slot_len`], except it doesn't include the + /// explicit slots for each pattern. Since there are always exactly 2 + /// implicit slots for each pattern, the number of implicit slots is always + /// equal to twice the number of patterns. + /// + /// # Example + /// + /// This example shows the relationship between the number of capturing + /// groups, implicit slots and explicit slots. + /// + /// ``` + /// use regex_automata::util::captures::GroupInfo; + /// + /// // There are 11 total groups here. + /// let info = GroupInfo::new(vec![vec![None, Some("foo"), Some("bar")]])?; + /// // 2 slots per group gives us 11*2=22 slots. + /// assert_eq!(6, info.slot_len()); + /// // 2 implicit slots per pattern gives us 2 implicit slots since there + /// // is 1 pattern. + /// assert_eq!(2, info.implicit_slot_len()); + /// // 2 explicit capturing groups gives us 2*2=4 explicit slots. + /// assert_eq!(4, info.explicit_slot_len()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn implicit_slot_len(&self) -> usize { + self.pattern_len() * 2 + } + + /// Returns the total number of slots for explicit capturing groups. + /// + /// This is like [`GroupInfo::slot_len`], except it doesn't include the + /// implicit slots for each pattern. (There are always 2 implicit slots for + /// each pattern.) + /// + /// For a non-empty `GroupInfo`, it is always the case that `slot_len` is + /// strictly greater than `explicit_slot_len`. For an empty `GroupInfo`, + /// both the total number of slots and the number of explicit slots is + /// `0`. + /// + /// # Example + /// + /// This example shows the relationship between the number of capturing + /// groups, implicit slots and explicit slots. + /// + /// ``` + /// use regex_automata::util::captures::GroupInfo; + /// + /// // There are 11 total groups here. + /// let info = GroupInfo::new(vec![vec![None, Some("foo"), Some("bar")]])?; + /// // 2 slots per group gives us 11*2=22 slots. + /// assert_eq!(6, info.slot_len()); + /// // 2 implicit slots per pattern gives us 2 implicit slots since there + /// // is 1 pattern. + /// assert_eq!(2, info.implicit_slot_len()); + /// // 2 explicit capturing groups gives us 2*2=4 explicit slots. + /// assert_eq!(4, info.explicit_slot_len()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn explicit_slot_len(&self) -> usize { + self.slot_len().saturating_sub(self.implicit_slot_len()) + } + + /// Returns the memory usage, in bytes, of this `GroupInfo`. + /// + /// This does **not** include the stack size used up by this `GroupInfo`. + /// To compute that, use `std::mem::size_of::<GroupInfo>()`. + #[inline] + pub fn memory_usage(&self) -> usize { + use core::mem::size_of as s; + + s::<GroupInfoInner>() + + self.0.slot_ranges.len() * s::<(SmallIndex, SmallIndex)>() + + self.0.name_to_index.len() * s::<CaptureNameMap>() + + self.0.index_to_name.len() * s::<Vec<Option<Arc<str>>>>() + + self.0.memory_extra + } +} + +/// A map from capture group name to its corresponding capture group index. +/// +/// This type is actually wrapped inside a Vec indexed by pattern ID on a +/// `GroupInfo`, since multiple patterns may have the same capture group name. +/// That is, each pattern gets its own namespace of capture group names. +/// +/// Perhaps a more memory efficient representation would be +/// HashMap<(PatternID, Arc<str>), usize>, but this makes it difficult to look +/// up a capture index by name without producing a `Arc<str>`, which requires +/// an allocation. To fix this, I think we'd need to define our own unsized +/// type or something? Anyway, I didn't give this much thought since it +/// probably doesn't matter much in the grand scheme of things. But it did +/// stand out to me as mildly wasteful. +#[cfg(feature = "std")] +type CaptureNameMap = std::collections::HashMap<Arc<str>, SmallIndex>; +#[cfg(not(feature = "std"))] +type CaptureNameMap = alloc::collections::BTreeMap<Arc<str>, SmallIndex>; + +/// The inner guts of `GroupInfo`. This type only exists so that it can +/// be wrapped in an `Arc` to make `GroupInfo` reference counted. +#[derive(Debug, Default)] +struct GroupInfoInner { + slot_ranges: Vec<(SmallIndex, SmallIndex)>, + name_to_index: Vec<CaptureNameMap>, + index_to_name: Vec<Vec<Option<Arc<str>>>>, + memory_extra: usize, +} + +impl GroupInfoInner { + /// This adds the first unnamed group for the given pattern ID. The given + /// pattern ID must be zero if this is the first time this method is + /// called, or must be exactly one more than the pattern ID supplied to the + /// previous call to this method. (This method panics if this rule is + /// violated.) + /// + /// This can be thought of as initializing the GroupInfo state for the + /// given pattern and closing off the state for any previous pattern. + fn add_first_group(&mut self, pid: PatternID) { + assert_eq!(pid.as_usize(), self.slot_ranges.len()); + assert_eq!(pid.as_usize(), self.name_to_index.len()); + assert_eq!(pid.as_usize(), self.index_to_name.len()); + // This is the start of our slots for the explicit capturing groups. + // Note that since the slots for the 0th group for every pattern appear + // before any slots for the nth group (where n > 0) in any pattern, we + // will have to fix up the slot ranges once we know how many patterns + // we've added capture groups for. + let slot_start = self.small_slot_len(); + self.slot_ranges.push((slot_start, slot_start)); + self.name_to_index.push(CaptureNameMap::new()); + self.index_to_name.push(vec![None]); + self.memory_extra += core::mem::size_of::<Option<Arc<str>>>(); + } + + /// Add an explicit capturing group for the given pattern with the given + /// index. If the group has a name, then that must be given as well. + /// + /// Note that every capturing group except for the first or zeroth group is + /// explicit. + /// + /// This returns an error if adding this group would result in overflowing + /// slot indices or if a capturing group with the same name for this + /// pattern has already been added. + fn add_explicit_group<N: AsRef<str>>( + &mut self, + pid: PatternID, + group: SmallIndex, + maybe_name: Option<N>, + ) -> Result<(), GroupInfoError> { + // We also need to check that the slot index generated for + // this group is also valid. Although, this is a little weird + // because we offset these indices below, at which point, we'll + // have to recheck them. Gosh this is annoying. Note that + // the '+2' below is OK because 'end' is guaranteed to be less + // than isize::MAX. + let end = &mut self.slot_ranges[pid].1; + *end = SmallIndex::new(end.as_usize() + 2).map_err(|_| { + GroupInfoError::too_many_groups(pid, group.as_usize()) + })?; + if let Some(name) = maybe_name { + let name = Arc::<str>::from(name.as_ref()); + if self.name_to_index[pid].contains_key(&*name) { + return Err(GroupInfoError::duplicate(pid, &name)); + } + let len = name.len(); + self.name_to_index[pid].insert(Arc::clone(&name), group); + self.index_to_name[pid].push(Some(name)); + // Adds the memory used by the Arc<str> in both maps. + self.memory_extra += + 2 * (len + core::mem::size_of::<Option<Arc<str>>>()); + // And also the value entry for the 'name_to_index' map. + // This is probably an underestimate for 'name_to_index' since + // hashmaps/btrees likely have some non-zero overhead, but we + // assume here that they have zero overhead. + self.memory_extra += core::mem::size_of::<SmallIndex>(); + } else { + self.index_to_name[pid].push(None); + self.memory_extra += core::mem::size_of::<Option<Arc<str>>>(); + } + // This is a sanity assert that checks that our group index + // is in line with the number of groups added so far for this + // pattern. + assert_eq!(group.one_more(), self.group_len(pid)); + // And is also in line with the 'index_to_name' map. + assert_eq!(group.one_more(), self.index_to_name[pid].len()); + Ok(()) + } + + /// This corrects the slot ranges to account for the slots corresponding + /// to the zeroth group of each pattern. That is, every slot range is + /// offset by 'pattern_len() * 2', since each pattern uses two slots to + /// represent the zeroth group. + fn fixup_slot_ranges(&mut self) -> Result<(), GroupInfoError> { + use crate::util::primitives::IteratorIndexExt; + // Since we know number of patterns fits in PatternID and + // PatternID::MAX < isize::MAX, it follows that multiplying by 2 will + // never overflow usize. + let offset = self.pattern_len().checked_mul(2).unwrap(); + for (pid, &mut (ref mut start, ref mut end)) in + self.slot_ranges.iter_mut().with_pattern_ids() + { + let group_len = 1 + ((end.as_usize() - start.as_usize()) / 2); + let new_end = match end.as_usize().checked_add(offset) { + Some(new_end) => new_end, + None => { + return Err(GroupInfoError::too_many_groups( + pid, group_len, + )) + } + }; + *end = SmallIndex::new(new_end).map_err(|_| { + GroupInfoError::too_many_groups(pid, group_len) + })?; + // Since start <= end, if end is valid then start must be too. + *start = SmallIndex::new(start.as_usize() + offset).unwrap(); + } + Ok(()) + } + + /// Return the total number of patterns represented by this capture slot + /// info. + fn pattern_len(&self) -> usize { + self.slot_ranges.len() + } + + /// Return the total number of capturing groups for the given pattern. If + /// the given pattern isn't valid for this capture slot info, then 0 is + /// returned. + fn group_len(&self, pid: PatternID) -> usize { + let (start, end) = match self.slot_ranges.get(pid.as_usize()) { + None => return 0, + Some(range) => range, + }; + // The difference between any two SmallIndex values always fits in a + // usize since we know that SmallIndex::MAX <= isize::MAX-1. We also + // know that start<=end by construction and that the number of groups + // never exceeds SmallIndex and thus never overflows usize. + 1 + ((end.as_usize() - start.as_usize()) / 2) + } + + /// Return the total number of slots in this capture slot info as a + /// "small index." + fn small_slot_len(&self) -> SmallIndex { + // Since slots are allocated in order of pattern (starting at 0) and + // then in order of capture group, it follows that the number of slots + // is the end of the range of slots for the last pattern. This is + // true even when the last pattern has no capturing groups, since + // 'slot_ranges' will still represent it explicitly with an empty + // range. + self.slot_ranges.last().map_or(SmallIndex::ZERO, |&(_, end)| end) + } +} + +/// An error that may occur when building a `GroupInfo`. +/// +/// Building a `GroupInfo` does a variety of checks to make sure the +/// capturing groups satisfy a number of invariants. This includes, but is not +/// limited to, ensuring that the first capturing group is unnamed and that +/// there are no duplicate capture groups for a specific pattern. +#[derive(Clone, Debug)] +pub struct GroupInfoError { + kind: GroupInfoErrorKind, +} + +/// The kind of error that occurs when building a `GroupInfo` fails. +/// +/// We keep this un-exported because it's not clear how useful it is to +/// export it. +#[derive(Clone, Debug)] +enum GroupInfoErrorKind { + /// This occurs when too many patterns have been added. i.e., It would + /// otherwise overflow a `PatternID`. + TooManyPatterns { err: PatternIDError }, + /// This occurs when too many capturing groups have been added for a + /// particular pattern. + TooManyGroups { + /// The ID of the pattern that had too many groups. + pattern: PatternID, + /// The minimum number of groups that the caller has tried to add for + /// a pattern. + minimum: usize, + }, + /// An error that occurs when a pattern has no capture groups. Either the + /// group info must be empty, or all patterns must have at least one group + /// (corresponding to the unnamed group for the entire pattern). + MissingGroups { + /// The ID of the pattern that had no capturing groups. + pattern: PatternID, + }, + /// An error that occurs when one tries to provide a name for the capture + /// group at index 0. This capturing group must currently always be + /// unnamed. + FirstMustBeUnnamed { + /// The ID of the pattern that was found to have a named first + /// capturing group. + pattern: PatternID, + }, + /// An error that occurs when duplicate capture group names for the same + /// pattern are added. + /// + /// NOTE: At time of writing, this error can never occur if you're using + /// regex-syntax, since the parser itself will reject patterns with + /// duplicate capture group names. This error can only occur when the + /// builder is used to hand construct NFAs. + Duplicate { + /// The pattern in which the duplicate capture group name was found. + pattern: PatternID, + /// The duplicate name. + name: String, + }, +} + +impl GroupInfoError { + fn too_many_patterns(err: PatternIDError) -> GroupInfoError { + GroupInfoError { kind: GroupInfoErrorKind::TooManyPatterns { err } } + } + + fn too_many_groups(pattern: PatternID, minimum: usize) -> GroupInfoError { + GroupInfoError { + kind: GroupInfoErrorKind::TooManyGroups { pattern, minimum }, + } + } + + fn missing_groups(pattern: PatternID) -> GroupInfoError { + GroupInfoError { kind: GroupInfoErrorKind::MissingGroups { pattern } } + } + + fn first_must_be_unnamed(pattern: PatternID) -> GroupInfoError { + GroupInfoError { + kind: GroupInfoErrorKind::FirstMustBeUnnamed { pattern }, + } + } + + fn duplicate(pattern: PatternID, name: &str) -> GroupInfoError { + GroupInfoError { + kind: GroupInfoErrorKind::Duplicate { + pattern, + name: String::from(name), + }, + } + } +} + +#[cfg(feature = "std")] +impl std::error::Error for GroupInfoError { + fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { + match self.kind { + GroupInfoErrorKind::TooManyPatterns { .. } + | GroupInfoErrorKind::TooManyGroups { .. } + | GroupInfoErrorKind::MissingGroups { .. } + | GroupInfoErrorKind::FirstMustBeUnnamed { .. } + | GroupInfoErrorKind::Duplicate { .. } => None, + } + } +} + +impl core::fmt::Display for GroupInfoError { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + use self::GroupInfoErrorKind::*; + + match self.kind { + TooManyPatterns { ref err } => { + write!(f, "too many patterns to build capture info: {}", err) + } + TooManyGroups { pattern, minimum } => { + write!( + f, + "too many capture groups (at least {}) were \ + found for pattern {}", + minimum, + pattern.as_usize() + ) + } + MissingGroups { pattern } => write!( + f, + "no capturing groups found for pattern {} \ + (either all patterns have zero groups or all patterns have \ + at least one group)", + pattern.as_usize(), + ), + FirstMustBeUnnamed { pattern } => write!( + f, + "first capture group (at index 0) for pattern {} has a name \ + (it must be unnamed)", + pattern.as_usize(), + ), + Duplicate { pattern, ref name } => write!( + f, + "duplicate capture group name '{}' found for pattern {}", + name, + pattern.as_usize(), + ), + } + } +} + +/// An iterator over capturing groups and their names for a specific pattern. +/// +/// This iterator is created by [`GroupInfo::pattern_names`]. +/// +/// The lifetime parameter `'a` refers to the lifetime of the `GroupInfo` +/// from which this iterator was created. +#[derive(Clone, Debug)] +pub struct GroupInfoPatternNames<'a> { + it: core::slice::Iter<'a, Option<Arc<str>>>, +} + +impl GroupInfoPatternNames<'static> { + fn empty() -> GroupInfoPatternNames<'static> { + GroupInfoPatternNames { it: [].iter() } + } +} + +impl<'a> Iterator for GroupInfoPatternNames<'a> { + type Item = Option<&'a str>; + + fn next(&mut self) -> Option<Option<&'a str>> { + self.it.next().map(|x| x.as_deref()) + } + + fn size_hint(&self) -> (usize, Option<usize>) { + self.it.size_hint() + } + + fn count(self) -> usize { + self.it.count() + } +} + +impl<'a> ExactSizeIterator for GroupInfoPatternNames<'a> {} +impl<'a> core::iter::FusedIterator for GroupInfoPatternNames<'a> {} + +/// An iterator over capturing groups and their names for a `GroupInfo`. +/// +/// This iterator is created by [`GroupInfo::all_names`]. +/// +/// The lifetime parameter `'a` refers to the lifetime of the `GroupInfo` +/// from which this iterator was created. +#[derive(Debug)] +pub struct GroupInfoAllNames<'a> { + group_info: &'a GroupInfo, + pids: PatternIDIter, + current_pid: Option<PatternID>, + names: Option<core::iter::Enumerate<GroupInfoPatternNames<'a>>>, +} + +impl<'a> Iterator for GroupInfoAllNames<'a> { + type Item = (PatternID, usize, Option<&'a str>); + + fn next(&mut self) -> Option<(PatternID, usize, Option<&'a str>)> { + // If the group info has no captures, then we never have anything + // to yield. We need to consider this case explicitly (at time of + // writing) because 'pattern_capture_names' will panic if captures + // aren't enabled. + if self.group_info.0.index_to_name.is_empty() { + return None; + } + if self.current_pid.is_none() { + self.current_pid = Some(self.pids.next()?); + } + let pid = self.current_pid.unwrap(); + if self.names.is_none() { + self.names = Some(self.group_info.pattern_names(pid).enumerate()); + } + let (group_index, name) = match self.names.as_mut().unwrap().next() { + Some((group_index, name)) => (group_index, name), + None => { + self.current_pid = None; + self.names = None; + return self.next(); + } + }; + Some((pid, group_index, name)) + } +} diff --git a/third_party/rust/regex-automata/src/util/determinize/mod.rs b/third_party/rust/regex-automata/src/util/determinize/mod.rs new file mode 100644 index 0000000000..30a82afb81 --- /dev/null +++ b/third_party/rust/regex-automata/src/util/determinize/mod.rs @@ -0,0 +1,610 @@ +/*! +This module contains types and routines for implementing determinization. + +In this crate, there are at least two places where we implement +determinization: fully ahead-of-time compiled DFAs in the `dfa` module and +lazily compiled DFAs in the `hybrid` module. The stuff in this module +corresponds to the things that are in common between these implementations. + +There are three broad things that our implementations of determinization have +in common, as defined by this module: + +* The classification of start states. That is, whether we're dealing with +word boundaries, line boundaries, etc., is all the same. This also includes +the look-behind assertions that are satisfied by each starting state +classification. +* The representation of DFA states as sets of NFA states, including +convenience types for building these DFA states that are amenable to reusing +allocations. +* Routines for the "classical" parts of determinization: computing the +epsilon closure, tracking match states (with corresponding pattern IDs, since +we support multi-pattern finite automata) and, of course, computing the +transition function between states for units of input. + +I did consider a couple of alternatives to this particular form of code reuse: + +1. Don't do any code reuse. The problem here is that we *really* want both +forms of determinization to do exactly identical things when it comes to +their handling of NFA states. While our tests generally ensure this, the code +is tricky and large enough where not reusing code is a pretty big bummer. + +2. Implement all of determinization once and make it generic over fully +compiled DFAs and lazily compiled DFAs. While I didn't actually try this +approach, my instinct is that it would be more complex than is needed here. +And the interface required would be pretty hairy. Instead, I think splitting +it into logical sub-components works better. +*/ + +use alloc::vec::Vec; + +pub(crate) use self::state::{ + State, StateBuilderEmpty, StateBuilderMatches, StateBuilderNFA, +}; + +use crate::{ + nfa::thompson, + util::{ + alphabet, + look::{Look, LookSet}, + primitives::StateID, + search::MatchKind, + sparse_set::{SparseSet, SparseSets}, + start::Start, + utf8, + }, +}; + +mod state; + +/// Compute the set of all reachable NFA states, including the full epsilon +/// closure, from a DFA state for a single unit of input. The set of reachable +/// states is returned as a `StateBuilderNFA`. The `StateBuilderNFA` returned +/// also includes any look-behind assertions satisfied by `unit`, in addition +/// to whether it is a match state. For multi-pattern DFAs, the builder will +/// also include the pattern IDs that match (in the order seen). +/// +/// `nfa` must be able to resolve any NFA state in `state` and any NFA state +/// reachable via the epsilon closure of any NFA state in `state`. `sparses` +/// must have capacity equivalent to `nfa.len()`. +/// +/// `match_kind` should correspond to the match semantics implemented by the +/// DFA being built. Generally speaking, for leftmost-first match semantics, +/// states that appear after the first NFA match state will not be included in +/// the `StateBuilderNFA` returned since they are impossible to visit. +/// +/// `sparses` is used as scratch space for NFA traversal. Other than their +/// capacity requirements (detailed above), there are no requirements on what's +/// contained within them (if anything). Similarly, what's inside of them once +/// this routine returns is unspecified. +/// +/// `stack` must have length 0. It is used as scratch space for depth first +/// traversal. After returning, it is guaranteed that `stack` will have length +/// 0. +/// +/// `state` corresponds to the current DFA state on which one wants to compute +/// the transition for the input `unit`. +/// +/// `empty_builder` corresponds to the builder allocation to use to produce a +/// complete `StateBuilderNFA` state. If the state is not needed (or is already +/// cached), then it can be cleared and reused without needing to create a new +/// `State`. The `StateBuilderNFA` state returned is final and ready to be +/// turned into a `State` if necessary. +pub(crate) fn next( + nfa: &thompson::NFA, + match_kind: MatchKind, + sparses: &mut SparseSets, + stack: &mut Vec<StateID>, + state: &State, + unit: alphabet::Unit, + empty_builder: StateBuilderEmpty, +) -> StateBuilderNFA { + sparses.clear(); + + // Whether the NFA is matched in reverse or not. We use this in some + // conditional logic for dealing with the exceptionally annoying CRLF-aware + // line anchors. + let rev = nfa.is_reverse(); + // The look-around matcher that our NFA is configured with. We don't + // actually use it to match look-around assertions, but we do need its + // configuration for constructing states consistent with how it matches. + let lookm = nfa.look_matcher(); + + // Put the NFA state IDs into a sparse set in case we need to + // re-compute their epsilon closure. + // + // Doing this state shuffling is technically not necessary unless some + // kind of look-around is used in the DFA. Some ad hoc experiments + // suggested that avoiding this didn't lead to much of an improvement, + // but perhaps more rigorous experimentation should be done. And in + // particular, avoiding this check requires some light refactoring of + // the code below. + state.iter_nfa_state_ids(|nfa_id| { + sparses.set1.insert(nfa_id); + }); + + // Compute look-ahead assertions originating from the current state. Based + // on the input unit we're transitioning over, some additional set of + // assertions may be true. Thus, we re-compute this state's epsilon closure + // (but only if necessary). Notably, when we build a DFA state initially, + // we don't enable any look-ahead assertions because we don't know whether + // they're true or not at that point. + if !state.look_need().is_empty() { + // Add look-ahead assertions that are now true based on the current + // input unit. + let mut look_have = state.look_have().clone(); + match unit.as_u8() { + Some(b'\r') => { + if !rev || !state.is_half_crlf() { + look_have = look_have.insert(Look::EndCRLF); + } + } + Some(b'\n') => { + if rev || !state.is_half_crlf() { + look_have = look_have.insert(Look::EndCRLF); + } + } + Some(_) => {} + None => { + look_have = look_have.insert(Look::End); + look_have = look_have.insert(Look::EndLF); + look_have = look_have.insert(Look::EndCRLF); + } + } + if unit.is_byte(lookm.get_line_terminator()) { + look_have = look_have.insert(Look::EndLF); + } + if state.is_half_crlf() + && ((rev && !unit.is_byte(b'\r')) + || (!rev && !unit.is_byte(b'\n'))) + { + look_have = look_have.insert(Look::StartCRLF); + } + if state.is_from_word() == unit.is_word_byte() { + look_have = look_have.insert(Look::WordUnicodeNegate); + look_have = look_have.insert(Look::WordAsciiNegate); + } else { + look_have = look_have.insert(Look::WordUnicode); + look_have = look_have.insert(Look::WordAscii); + } + // If we have new assertions satisfied that are among the set of + // assertions that exist in this state (that is, just because we added + // an EndLF assertion above doesn't mean there is an EndLF conditional + // epsilon transition in this state), then we re-compute this state's + // epsilon closure using the updated set of assertions. + // + // Note that since our DFA states omit unconditional epsilon + // transitions, this check is necessary for correctness. If we re-did + // the epsilon closure below needlessly, it could change based on the + // fact that we omitted epsilon states originally. + if !look_have + .subtract(state.look_have()) + .intersect(state.look_need()) + .is_empty() + { + for nfa_id in sparses.set1.iter() { + epsilon_closure( + nfa, + nfa_id, + look_have, + stack, + &mut sparses.set2, + ); + } + sparses.swap(); + sparses.set2.clear(); + } + } + + // Convert our empty builder into one that can record assertions and match + // pattern IDs. + let mut builder = empty_builder.into_matches(); + // Set whether the StartLF look-behind assertion is true for this + // transition or not. The look-behind assertion for ASCII word boundaries + // is handled below. + if nfa.look_set_any().contains_anchor_line() + && unit.is_byte(lookm.get_line_terminator()) + { + // Why only handle StartLF here and not Start? That's because Start + // can only impact the starting state, which is special cased in + // start state handling. + builder.set_look_have(|have| have.insert(Look::StartLF)); + } + // We also need to add StartCRLF to our assertions too, if we can. This + // is unfortunately a bit more complicated, because it depends on the + // direction of the search. In the forward direction, ^ matches after a + // \n, but in the reverse direction, ^ only matches after a \r. (This is + // further complicated by the fact that reverse a regex means changing a ^ + // to a $ and vice versa.) + if nfa.look_set_any().contains_anchor_crlf() + && ((rev && unit.is_byte(b'\r')) || (!rev && unit.is_byte(b'\n'))) + { + builder.set_look_have(|have| have.insert(Look::StartCRLF)); + } + for nfa_id in sparses.set1.iter() { + match *nfa.state(nfa_id) { + thompson::State::Union { .. } + | thompson::State::BinaryUnion { .. } + | thompson::State::Fail + | thompson::State::Look { .. } + | thompson::State::Capture { .. } => {} + thompson::State::Match { pattern_id } => { + // Notice here that we are calling the NEW state a match + // state if the OLD state we are transitioning from + // contains an NFA match state. This is precisely how we + // delay all matches by one byte and also what therefore + // guarantees that starting states cannot be match states. + // + // If we didn't delay matches by one byte, then whether + // a DFA is a matching state or not would be determined + // by whether one of its own constituent NFA states + // was a match state. (And that would be done in + // 'add_nfa_states'.) + // + // Also, 'add_match_pattern_id' requires that callers never + // pass duplicative pattern IDs. We do in fact uphold that + // guarantee here, but it's subtle. In particular, a Thompson + // NFA guarantees that each pattern has exactly one match + // state. Moreover, since we're iterating over the NFA state + // IDs in a set, we are guarateed not to have any duplicative + // match states. Thus, it is impossible to add the same pattern + // ID more than once. + // + // N.B. We delay matches by 1 byte as a way to hack 1-byte + // look-around into DFA searches. This lets us support ^, $ + // and ASCII-only \b. The delay is also why we need a special + // "end-of-input" (EOI) sentinel and why we need to follow the + // EOI sentinel at the end of every search. This final EOI + // transition is necessary to report matches found at the end + // of a haystack. + builder.add_match_pattern_id(pattern_id); + if !match_kind.continue_past_first_match() { + break; + } + } + thompson::State::ByteRange { ref trans } => { + if trans.matches_unit(unit) { + epsilon_closure( + nfa, + trans.next, + builder.look_have(), + stack, + &mut sparses.set2, + ); + } + } + thompson::State::Sparse(ref sparse) => { + if let Some(next) = sparse.matches_unit(unit) { + epsilon_closure( + nfa, + next, + builder.look_have(), + stack, + &mut sparses.set2, + ); + } + } + thompson::State::Dense(ref dense) => { + if let Some(next) = dense.matches_unit(unit) { + epsilon_closure( + nfa, + next, + builder.look_have(), + stack, + &mut sparses.set2, + ); + } + } + } + } + // We only set the word byte if there's a word boundary look-around + // anywhere in this regex. Otherwise, there's no point in bloating the + // number of states if we don't have one. + // + // We also only set it when the state has a non-zero number of NFA states. + // Otherwise, we could wind up with states that *should* be DEAD states + // but are otherwise distinct from DEAD states because of this look-behind + // assertion being set. While this can't technically impact correctness *in + // theory*, it can create pathological DFAs that consume input until EOI or + // a quit byte is seen. Consuming until EOI isn't a correctness problem, + // but a (serious) perf problem. Hitting a quit byte, however, could be a + // correctness problem since it could cause search routines to report an + // error instead of a detected match once the quit state is entered. (The + // search routine could be made to be a bit smarter by reporting a match + // if one was detected once it enters a quit state (and indeed, the search + // routines in this crate do just that), but it seems better to prevent + // these things by construction if possible.) + if !sparses.set2.is_empty() { + if nfa.look_set_any().contains_word() && unit.is_word_byte() { + builder.set_is_from_word(); + } + if nfa.look_set_any().contains_anchor_crlf() + && ((rev && unit.is_byte(b'\n')) || (!rev && unit.is_byte(b'\r'))) + { + builder.set_is_half_crlf(); + } + } + let mut builder_nfa = builder.into_nfa(); + add_nfa_states(nfa, &sparses.set2, &mut builder_nfa); + builder_nfa +} + +/// Compute the epsilon closure for the given NFA state. The epsilon closure +/// consists of all NFA state IDs, including `start_nfa_id`, that can be +/// reached from `start_nfa_id` without consuming any input. These state IDs +/// are written to `set` in the order they are visited, but only if they are +/// not already in `set`. `start_nfa_id` must be a valid state ID for the NFA +/// given. +/// +/// `look_have` consists of the satisfied assertions at the current +/// position. For conditional look-around epsilon transitions, these are +/// only followed if they are satisfied by `look_have`. +/// +/// `stack` must have length 0. It is used as scratch space for depth first +/// traversal. After returning, it is guaranteed that `stack` will have length +/// 0. +pub(crate) fn epsilon_closure( + nfa: &thompson::NFA, + start_nfa_id: StateID, + look_have: LookSet, + stack: &mut Vec<StateID>, + set: &mut SparseSet, +) { + assert!(stack.is_empty()); + // If this isn't an epsilon state, then the epsilon closure is always just + // itself, so there's no need to spin up the machinery below to handle it. + if !nfa.state(start_nfa_id).is_epsilon() { + set.insert(start_nfa_id); + return; + } + + stack.push(start_nfa_id); + while let Some(mut id) = stack.pop() { + // In many cases, we can avoid stack operations when an NFA state only + // adds one new state to visit. In that case, we just set our ID to + // that state and mush on. We only use the stack when an NFA state + // introduces multiple new states to visit. + loop { + // Insert this NFA state, and if it's already in the set and thus + // already visited, then we can move on to the next one. + if !set.insert(id) { + break; + } + match *nfa.state(id) { + thompson::State::ByteRange { .. } + | thompson::State::Sparse { .. } + | thompson::State::Dense { .. } + | thompson::State::Fail + | thompson::State::Match { .. } => break, + thompson::State::Look { look, next } => { + if !look_have.contains(look) { + break; + } + id = next; + } + thompson::State::Union { ref alternates } => { + id = match alternates.get(0) { + None => break, + Some(&id) => id, + }; + // We need to process our alternates in order to preserve + // match preferences, so put the earliest alternates closer + // to the top of the stack. + stack.extend(alternates[1..].iter().rev()); + } + thompson::State::BinaryUnion { alt1, alt2 } => { + id = alt1; + stack.push(alt2); + } + thompson::State::Capture { next, .. } => { + id = next; + } + } + } + } +} + +/// Add the NFA state IDs in the given `set` to the given DFA builder state. +/// The order in which states are added corresponds to the order in which they +/// were added to `set`. +/// +/// The DFA builder state given should already have its complete set of match +/// pattern IDs added (if any) and any look-behind assertions (StartLF, Start +/// and whether this state is being generated for a transition over a word byte +/// when applicable) that are true immediately prior to transitioning into this +/// state (via `builder.look_have()`). The match pattern IDs should correspond +/// to matches that occurred on the previous transition, since all matches are +/// delayed by one byte. The things that should _not_ be set are look-ahead +/// assertions (EndLF, End and whether the next byte is a word byte or not). +/// The builder state should also not have anything in `look_need` set, as this +/// routine will compute that for you. +/// +/// The given NFA should be able to resolve all identifiers in `set` to a +/// particular NFA state. Additionally, `set` must have capacity equivalent +/// to `nfa.len()`. +pub(crate) fn add_nfa_states( + nfa: &thompson::NFA, + set: &SparseSet, + builder: &mut StateBuilderNFA, +) { + for nfa_id in set.iter() { + match *nfa.state(nfa_id) { + thompson::State::ByteRange { .. } => { + builder.add_nfa_state_id(nfa_id); + } + thompson::State::Sparse { .. } => { + builder.add_nfa_state_id(nfa_id); + } + thompson::State::Dense { .. } => { + builder.add_nfa_state_id(nfa_id); + } + thompson::State::Look { look, .. } => { + builder.add_nfa_state_id(nfa_id); + builder.set_look_need(|need| need.insert(look)); + } + thompson::State::Union { .. } + | thompson::State::BinaryUnion { .. } => { + // Pure epsilon transitions don't need to be tracked as part + // of the DFA state. Tracking them is actually superfluous; + // they won't cause any harm other than making determinization + // slower. + // + // Why aren't these needed? Well, in an NFA, epsilon + // transitions are really just jumping points to other states. + // So once you hit an epsilon transition, the same set of + // resulting states always appears. Therefore, putting them in + // a DFA's set of ordered NFA states is strictly redundant. + // + // Look-around states are also epsilon transitions, but + // they are *conditional*. So their presence could be + // discriminatory, and thus, they are tracked above. + // + // But wait... why are epsilon states in our `set` in the first + // place? Why not just leave them out? They're in our `set` + // because it was generated by computing an epsilon closure, + // and we want to keep track of all states we visited to avoid + // re-visiting them. In exchange, we have to do this second + // iteration over our collected states to finalize our DFA + // state. In theory, we could avoid this second iteration if + // we maintained two sets during epsilon closure: the set of + // visited states (to avoid cycles) and the set of states that + // will actually be used to construct the next DFA state. + // + // Note that this optimization requires that we re-compute the + // epsilon closure to account for look-ahead in 'next' *only + // when necessary*. Namely, only when the set of look-around + // assertions changes and only when those changes are within + // the set of assertions that are needed in order to step + // through the closure correctly. Otherwise, if we re-do the + // epsilon closure needlessly, it could change based on the + // fact that we are omitting epsilon states here. + // + // ----- + // + // Welp, scratch the above. It turns out that recording these + // is in fact necessary to seemingly handle one particularly + // annoying case: when a conditional epsilon transition is + // put inside of a repetition operator. One specific case I + // ran into was the regex `(?:\b|%)+` on the haystack `z%`. + // The correct leftmost first matches are: [0, 0] and [1, 1]. + // But the DFA was reporting [0, 0] and [1, 2]. To understand + // why this happens, consider the NFA for the aforementioned + // regex: + // + // >000000: binary-union(4, 1) + // 000001: \x00-\xFF => 0 + // 000002: WordAscii => 5 + // 000003: % => 5 + // ^000004: binary-union(2, 3) + // 000005: binary-union(4, 6) + // 000006: MATCH(0) + // + // The problem here is that one of the DFA start states is + // going to consist of the NFA states [2, 3] by computing the + // epsilon closure of state 4. State 4 isn't included because + // we previously were not keeping track of union states. But + // only a subset of transitions out of this state will be able + // to follow WordAscii, and in those cases, the epsilon closure + // is redone. The only problem is that computing the epsilon + // closure from [2, 3] is different than computing the epsilon + // closure from [4]. In the former case, assuming the WordAscii + // assertion is satisfied, you get: [2, 3, 6]. In the latter + // case, you get: [2, 6, 3]. Notice that '6' is the match state + // and appears AFTER '3' in the former case. This leads to a + // preferential but incorrect match of '%' before returning + // a match. In the latter case, the match is preferred over + // continuing to accept the '%'. + // + // It almost feels like we might be able to fix the NFA states + // to avoid this, or to at least only keep track of union + // states where this actually matters, since in the vast + // majority of cases, this doesn't matter. + // + // Another alternative would be to define a new HIR property + // called "assertion is repeated anywhere" and compute it + // inductively over the entire pattern. If it happens anywhere, + // which is probably pretty rare, then we record union states. + // Otherwise we don't. + builder.add_nfa_state_id(nfa_id); + } + // Capture states we definitely do not need to record, since they + // are unconditional epsilon transitions with no branching. + thompson::State::Capture { .. } => {} + // It's not totally clear whether we need to record fail states or + // not, but we do so out of an abundance of caution. Since they are + // quite rare in practice, there isn't much cost to recording them. + thompson::State::Fail => { + builder.add_nfa_state_id(nfa_id); + } + thompson::State::Match { .. } => { + // Normally, the NFA match state doesn't actually need to + // be inside the DFA state. But since we delay matches by + // one byte, the matching DFA state corresponds to states + // that transition from the one we're building here. And + // the way we detect those cases is by looking for an NFA + // match state. See 'next' for how this is handled. + builder.add_nfa_state_id(nfa_id); + } + } + } + // If we know this state contains no look-around assertions, then + // there's no reason to track which look-around assertions were + // satisfied when this state was created. + if builder.look_need().is_empty() { + builder.set_look_have(|_| LookSet::empty()); + } +} + +/// Sets the appropriate look-behind assertions on the given state based on +/// this starting configuration. +pub(crate) fn set_lookbehind_from_start( + nfa: &thompson::NFA, + start: &Start, + builder: &mut StateBuilderMatches, +) { + let rev = nfa.is_reverse(); + let lineterm = nfa.look_matcher().get_line_terminator(); + match *start { + Start::NonWordByte => {} + Start::WordByte => { + builder.set_is_from_word(); + } + Start::Text => { + builder.set_look_have(|have| { + have.insert(Look::Start) + .insert(Look::StartLF) + .insert(Look::StartCRLF) + }); + } + Start::LineLF => { + if rev { + builder.set_is_half_crlf(); + builder.set_look_have(|have| have.insert(Look::StartLF)); + } else { + builder.set_look_have(|have| have.insert(Look::StartCRLF)); + } + if lineterm == b'\n' { + builder.set_look_have(|have| have.insert(Look::StartLF)); + } + } + Start::LineCR => { + if rev { + builder.set_look_have(|have| have.insert(Look::StartCRLF)); + } else { + builder.set_is_half_crlf(); + } + if lineterm == b'\r' { + builder.set_look_have(|have| have.insert(Look::StartLF)); + } + } + Start::CustomLineTerminator => { + builder.set_look_have(|have| have.insert(Look::StartLF)); + // This is a bit of a tricky case, but if the line terminator was + // set to a word byte, then we also need to behave as if the start + // configuration is Start::WordByte. That is, we need to mark our + // state as having come from a word byte. + if utf8::is_word_byte(lineterm) { + builder.set_is_from_word(); + } + } + } +} diff --git a/third_party/rust/regex-automata/src/util/determinize/state.rs b/third_party/rust/regex-automata/src/util/determinize/state.rs new file mode 100644 index 0000000000..e641235874 --- /dev/null +++ b/third_party/rust/regex-automata/src/util/determinize/state.rs @@ -0,0 +1,906 @@ +/*! +This module defines a DFA state representation and builders for constructing +DFA states. + +This representation is specifically for use in implementations of NFA-to-DFA +conversion via powerset construction. (Also called "determinization" in this +crate.) + +The term "DFA state" is somewhat overloaded in this crate. In some cases, it +refers to the set of transitions over an alphabet for a particular state. In +other cases, it refers to a set of NFA states. The former is really about the +final representation of a state in a DFA's transition table, where as the +latter---what this module is focused on---is closer to an intermediate form +that is used to help eventually build the transition table. + +This module exports four types. All four types represent the same idea: an +ordered set of NFA states. This ordered set represents the epsilon closure of a +particular NFA state, where the "epsilon closure" is the set of NFA states that +can be transitioned to without consuming any input. i.e., Follow all of the NFA +state's epsilon transitions. In addition, this implementation of DFA states +cares about two other things: the ordered set of pattern IDs corresponding +to the patterns that match if the state is a match state, and the set of +look-behind assertions that were true when the state was created. + +The first, `State`, is a frozen representation of a state that cannot be +modified. It may be cheaply cloned without copying the state itself and can be +accessed safely from multiple threads simultaneously. This type is useful for +when one knows that the DFA state being constructed is distinct from any other +previously constructed states. Namely, powerset construction, in practice, +requires one to keep a cache of previously created DFA states. Otherwise, +the number of DFA states created in memory balloons to an impractically +large number. For this reason, equivalent states should endeavor to have an +equivalent byte-level representation. (In general, "equivalency" here means, +"equivalent assertions, pattern IDs and NFA state IDs." We do not require that +full DFA minimization be implemented here. This form of equivalency is only +surface deep and is more-or-less a practical necessity.) + +The other three types represent different phases in the construction of a +DFA state. Internally, these three types (and `State`) all use the same +byte-oriented representation. That means one can use any of the builder types +to check whether the state it represents already exists or not. If it does, +then there is no need to freeze it into a `State` (which requires an alloc and +a copy). Here are the three types described succinctly: + +* `StateBuilderEmpty` represents a state with no pattern IDs, no assertions +and no NFA states. Creating a `StateBuilderEmpty` performs no allocs. A +`StateBuilderEmpty` can only be used to query its underlying memory capacity, +or to convert into a builder for recording pattern IDs and/or assertions. + +* `StateBuilderMatches` represents a state with zero or more pattern IDs, zero +or more satisfied assertions and zero NFA state IDs. A `StateBuilderMatches` +can only be used for adding pattern IDs and recording assertions. + +* `StateBuilderNFA` represents a state with zero or more pattern IDs, zero or +more satisfied assertions and zero or more NFA state IDs. A `StateBuilderNFA` +can only be used for adding NFA state IDs and recording some assertions. + +The expected flow here is to use the above builders to construct a candidate +DFA state to check if it already exists. If it does, then there's no need to +freeze it into a `State`. It it doesn't exist, then `StateBuilderNFA::to_state` +can be called to freeze the builder into an immutable `State`. In either +case, `clear` should be called on the builder to turn it back into a +`StateBuilderEmpty` that reuses the underlying memory. + +The main purpose for splitting the builder into these distinct types is to +make it impossible to do things like adding a pattern ID after adding an NFA +state ID. Namely, this makes it simpler to use a space-and-time efficient +binary representation for the state. (The format is documented on the `Repr` +type below.) If we just used one type for everything, it would be possible for +callers to use an incorrect interleaving of calls and thus result in a corrupt +representation. I chose to use more type machinery to make this impossible to +do because 1) determinization is itself pretty complex and it wouldn't be too +hard to foul this up and 2) there isn't too much machinery involved and it's +well contained. + +As an optimization, sometimes states won't have certain things set. For +example, if the underlying NFA has no word boundary assertions, then there is +no reason to set a state's look-behind assertion as to whether it was generated +from a word byte or not. Similarly, if a state has no NFA states corresponding +to look-around assertions, then there is no reason to set `look_have` to a +non-empty set. Finally, callers usually omit unconditional epsilon transitions +when adding NFA state IDs since they aren't discriminatory. + +Finally, the binary representation used by these states is, thankfully, not +serialized anywhere. So any kind of change can be made with reckless abandon, +as long as everything in this module agrees. +*/ + +use core::{convert::TryFrom, mem}; + +use alloc::{sync::Arc, vec::Vec}; + +use crate::util::{ + int::{I32, U32}, + look::LookSet, + primitives::{PatternID, StateID}, + wire::{self, Endian}, +}; + +/// A DFA state that, at its core, is represented by an ordered set of NFA +/// states. +/// +/// This type is intended to be used only in NFA-to-DFA conversion via powerset +/// construction. +/// +/// It may be cheaply cloned and accessed safely from multiple threads +/// simultaneously. +#[derive(Clone, Eq, Hash, PartialEq, PartialOrd, Ord)] +pub(crate) struct State(Arc<[u8]>); + +/// This Borrow impl permits us to lookup any state in a map by its byte +/// representation. This is particularly convenient when one has a StateBuilder +/// and we want to see if a correspondingly equivalent state already exists. If +/// one does exist, then we can reuse the allocation required by StateBuilder +/// without having to convert it into a State first. +impl core::borrow::Borrow<[u8]> for State { + fn borrow(&self) -> &[u8] { + &*self.0 + } +} + +impl core::fmt::Debug for State { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + f.debug_tuple("State").field(&self.repr()).finish() + } +} + +/// For docs on these routines, see the internal Repr and ReprVec types below. +impl State { + pub(crate) fn dead() -> State { + StateBuilderEmpty::new().into_matches().into_nfa().to_state() + } + + pub(crate) fn is_match(&self) -> bool { + self.repr().is_match() + } + + pub(crate) fn is_from_word(&self) -> bool { + self.repr().is_from_word() + } + + pub(crate) fn is_half_crlf(&self) -> bool { + self.repr().is_half_crlf() + } + + pub(crate) fn look_have(&self) -> LookSet { + self.repr().look_have() + } + + pub(crate) fn look_need(&self) -> LookSet { + self.repr().look_need() + } + + pub(crate) fn match_len(&self) -> usize { + self.repr().match_len() + } + + pub(crate) fn match_pattern(&self, index: usize) -> PatternID { + self.repr().match_pattern(index) + } + + pub(crate) fn match_pattern_ids(&self) -> Option<Vec<PatternID>> { + self.repr().match_pattern_ids() + } + + #[cfg(all(test, not(miri)))] + pub(crate) fn iter_match_pattern_ids<F: FnMut(PatternID)>(&self, f: F) { + self.repr().iter_match_pattern_ids(f) + } + + pub(crate) fn iter_nfa_state_ids<F: FnMut(StateID)>(&self, f: F) { + self.repr().iter_nfa_state_ids(f) + } + + pub(crate) fn memory_usage(&self) -> usize { + self.0.len() + } + + fn repr(&self) -> Repr<'_> { + Repr(&*self.0) + } +} + +/// A state builder that represents an empty state. +/// +/// This is a useful "initial condition" for state construction. It has no +/// NFA state IDs, no assertions set and no pattern IDs. No allocations are +/// made when new() is called. Its main use is for being converted into a +/// builder that can capture assertions and pattern IDs. +#[derive(Clone, Debug)] +pub(crate) struct StateBuilderEmpty(Vec<u8>); + +/// For docs on these routines, see the internal Repr and ReprVec types below. +impl StateBuilderEmpty { + pub(crate) fn new() -> StateBuilderEmpty { + StateBuilderEmpty(alloc::vec![]) + } + + pub(crate) fn into_matches(mut self) -> StateBuilderMatches { + self.0.extend_from_slice(&[0, 0, 0, 0, 0]); + StateBuilderMatches(self.0) + } + + fn clear(&mut self) { + self.0.clear(); + } + + pub(crate) fn capacity(&self) -> usize { + self.0.capacity() + } +} + +/// A state builder that collects assertions and pattern IDs. +/// +/// When collecting pattern IDs is finished, this can be converted into a +/// builder that collects NFA state IDs. +#[derive(Clone)] +pub(crate) struct StateBuilderMatches(Vec<u8>); + +impl core::fmt::Debug for StateBuilderMatches { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + f.debug_tuple("StateBuilderMatches").field(&self.repr()).finish() + } +} + +/// For docs on these routines, see the internal Repr and ReprVec types below. +impl StateBuilderMatches { + pub(crate) fn into_nfa(mut self) -> StateBuilderNFA { + self.repr_vec().close_match_pattern_ids(); + StateBuilderNFA { repr: self.0, prev_nfa_state_id: StateID::ZERO } + } + + pub(crate) fn set_is_from_word(&mut self) { + self.repr_vec().set_is_from_word() + } + + pub(crate) fn set_is_half_crlf(&mut self) { + self.repr_vec().set_is_half_crlf() + } + + pub(crate) fn look_have(&self) -> LookSet { + LookSet::read_repr(&self.0[1..]) + } + + pub(crate) fn set_look_have( + &mut self, + set: impl FnMut(LookSet) -> LookSet, + ) { + self.repr_vec().set_look_have(set) + } + + pub(crate) fn add_match_pattern_id(&mut self, pid: PatternID) { + self.repr_vec().add_match_pattern_id(pid) + } + + fn repr(&self) -> Repr<'_> { + Repr(&self.0) + } + + fn repr_vec(&mut self) -> ReprVec<'_> { + ReprVec(&mut self.0) + } +} + +/// A state builder that collects some assertions and NFA state IDs. +/// +/// When collecting NFA state IDs is finished, this can be used to build a +/// `State` if necessary. +/// +/// When dont with building a state (regardless of whether it got kept or not), +/// it's usually a good idea to call `clear` to get an empty builder back so +/// that it can be reused to build the next state. +#[derive(Clone)] +pub(crate) struct StateBuilderNFA { + repr: Vec<u8>, + prev_nfa_state_id: StateID, +} + +impl core::fmt::Debug for StateBuilderNFA { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + f.debug_tuple("StateBuilderNFA").field(&self.repr()).finish() + } +} + +/// For docs on these routines, see the internal Repr and ReprVec types below. +impl StateBuilderNFA { + pub(crate) fn to_state(&self) -> State { + State(Arc::from(&*self.repr)) + } + + pub(crate) fn clear(self) -> StateBuilderEmpty { + let mut builder = StateBuilderEmpty(self.repr); + builder.clear(); + builder + } + + pub(crate) fn look_need(&self) -> LookSet { + self.repr().look_need() + } + + pub(crate) fn set_look_have( + &mut self, + set: impl FnMut(LookSet) -> LookSet, + ) { + self.repr_vec().set_look_have(set) + } + + pub(crate) fn set_look_need( + &mut self, + set: impl FnMut(LookSet) -> LookSet, + ) { + self.repr_vec().set_look_need(set) + } + + pub(crate) fn add_nfa_state_id(&mut self, sid: StateID) { + ReprVec(&mut self.repr) + .add_nfa_state_id(&mut self.prev_nfa_state_id, sid) + } + + pub(crate) fn as_bytes(&self) -> &[u8] { + &self.repr + } + + fn repr(&self) -> Repr<'_> { + Repr(&self.repr) + } + + fn repr_vec(&mut self) -> ReprVec<'_> { + ReprVec(&mut self.repr) + } +} + +/// Repr is a read-only view into the representation of a DFA state. +/// +/// Primarily, a Repr is how we achieve DRY: we implement decoding the format +/// in one place, and then use a Repr to implement the various methods on the +/// public state types. +/// +/// The format is as follows: +/// +/// The first three bytes correspond to bitsets. +/// +/// Byte 0 is a bitset corresponding to miscellaneous flags associated with the +/// state. Bit 0 is set to 1 if the state is a match state. Bit 1 is set to 1 +/// if the state has pattern IDs explicitly written to it. (This is a flag that +/// is not meant to be set by determinization, but rather, is used as part of +/// an internal space-saving optimization.) Bit 2 is set to 1 if the state was +/// generated by a transition over a "word" byte. (Callers may not always set +/// this. For example, if the NFA has no word boundary assertion, then needing +/// to track whether a state came from a word byte or not is superfluous and +/// wasteful.) +/// +/// Byte 1 corresponds to the look-behind assertions that were satisfied by +/// the transition that created this state. This generally only includes the +/// StartLF and Start assertions. (Look-ahead assertions are not tracked as +/// part of states. Instead, these are applied by re-computing the epsilon +/// closure of a state when computing the transition function. See `next` in +/// the parent module.) +/// +/// Byte 2 corresponds to the set of look-around assertions (including both +/// look-behind and look-ahead) that appear somewhere in this state's set of +/// NFA state IDs. This is used to determine whether this state's epsilon +/// closure should be re-computed when computing the transition function. +/// Namely, look-around assertions are "just" conditional epsilon transitions, +/// so if there are new assertions available when computing the transition +/// function, we should only re-compute the epsilon closure if those new +/// assertions are relevant to this particular state. +/// +/// Bytes 3..7 correspond to a 32-bit native-endian encoded integer +/// corresponding to the number of patterns encoded in this state. If the state +/// is not a match state (byte 0 bit 0 is 0) or if it's only pattern ID is +/// PatternID::ZERO, then no integer is encoded at this position. Instead, byte +/// offset 3 is the position at which the first NFA state ID is encoded. +/// +/// For a match state with at least one non-ZERO pattern ID, the next bytes +/// correspond to a sequence of 32-bit native endian encoded integers that +/// represent each pattern ID, in order, that this match state represents. +/// +/// After the pattern IDs (if any), NFA state IDs are delta encoded as +/// varints.[1] The first NFA state ID is encoded as itself, and each +/// subsequent NFA state ID is encoded as the difference between itself and the +/// previous NFA state ID. +/// +/// [1] - https://developers.google.com/protocol-buffers/docs/encoding#varints +struct Repr<'a>(&'a [u8]); + +impl<'a> Repr<'a> { + /// Returns true if and only if this is a match state. + /// + /// If callers have added pattern IDs to this state, then callers MUST set + /// this state as a match state explicitly. However, as a special case, + /// states that are marked as match states but with no pattern IDs, then + /// the state is treated as if it had a single pattern ID equivalent to + /// PatternID::ZERO. + fn is_match(&self) -> bool { + self.0[0] & (1 << 0) > 0 + } + + /// Returns true if and only if this state has had at least one pattern + /// ID added to it. + /// + /// This is an internal-only flag that permits the representation to save + /// space in the common case of an NFA with one pattern in it. In that + /// case, a match state can only ever have exactly one pattern ID: + /// PatternID::ZERO. So there's no need to represent it. + fn has_pattern_ids(&self) -> bool { + self.0[0] & (1 << 1) > 0 + } + + /// Returns true if and only if this state is marked as having been created + /// from a transition over a word byte. This is useful for checking whether + /// a word boundary assertion is true or not, which requires look-behind + /// (whether the current state came from a word byte or not) and look-ahead + /// (whether the transition byte is a word byte or not). + /// + /// Since states with this set are distinct from states that don't have + /// this set (even if they are otherwise equivalent), callers should not + /// set this assertion unless the underlying NFA has at least one word + /// boundary assertion somewhere. Otherwise, a superfluous number of states + /// may be created. + fn is_from_word(&self) -> bool { + self.0[0] & (1 << 2) > 0 + } + + /// Returns true if and only if this state is marked as being inside of a + /// CRLF terminator. In the forward direction, this means the state was + /// created after seeing a `\r`. In the reverse direction, this means the + /// state was created after seeing a `\n`. + fn is_half_crlf(&self) -> bool { + self.0[0] & (1 << 3) > 0 + } + + /// The set of look-behind assertions that were true in the transition that + /// created this state. + /// + /// Generally, this should be empty if 'look_need' is empty, since there is + /// no reason to track which look-behind assertions are true if the state + /// has no conditional epsilon transitions. + /// + /// Satisfied look-ahead assertions are not tracked in states. Instead, + /// these are re-computed on demand via epsilon closure when computing the + /// transition function. + fn look_have(&self) -> LookSet { + LookSet::read_repr(&self.0[1..]) + } + + /// The set of look-around (both behind and ahead) assertions that appear + /// at least once in this state's set of NFA states. + /// + /// This is used to determine whether the epsilon closure needs to be + /// re-computed when computing the transition function. Namely, if the + /// state has no conditional epsilon transitions, then there is no need + /// to re-compute the epsilon closure. + fn look_need(&self) -> LookSet { + LookSet::read_repr(&self.0[3..]) + } + + /// Returns the total number of match pattern IDs in this state. + /// + /// If this state is not a match state, then this always returns 0. + fn match_len(&self) -> usize { + if !self.is_match() { + return 0; + } else if !self.has_pattern_ids() { + 1 + } else { + self.encoded_pattern_len() + } + } + + /// Returns the pattern ID for this match state at the given index. + /// + /// If the given index is greater than or equal to `match_len()` for this + /// state, then this could panic or return incorrect results. + fn match_pattern(&self, index: usize) -> PatternID { + if !self.has_pattern_ids() { + PatternID::ZERO + } else { + let offset = 9 + index * PatternID::SIZE; + // This is OK since we only ever serialize valid PatternIDs to + // states. + wire::read_pattern_id_unchecked(&self.0[offset..]).0 + } + } + + /// Returns a copy of all match pattern IDs in this state. If this state + /// is not a match state, then this returns None. + fn match_pattern_ids(&self) -> Option<Vec<PatternID>> { + if !self.is_match() { + return None; + } + let mut pids = alloc::vec![]; + self.iter_match_pattern_ids(|pid| pids.push(pid)); + Some(pids) + } + + /// Calls the given function on every pattern ID in this state. + fn iter_match_pattern_ids<F: FnMut(PatternID)>(&self, mut f: F) { + if !self.is_match() { + return; + } + // As an optimization for a very common case, when this is a match + // state for an NFA with only one pattern, we don't actually write the + // pattern ID to the state representation. Instead, we know it must + // be there since it is the only possible choice. + if !self.has_pattern_ids() { + f(PatternID::ZERO); + return; + } + let mut pids = &self.0[9..self.pattern_offset_end()]; + while !pids.is_empty() { + let pid = wire::read_u32(pids); + pids = &pids[PatternID::SIZE..]; + // This is OK since we only ever serialize valid PatternIDs to + // states. And since pattern IDs can never exceed a usize, the + // unwrap is OK. + f(PatternID::new_unchecked(usize::try_from(pid).unwrap())); + } + } + + /// Calls the given function on every NFA state ID in this state. + fn iter_nfa_state_ids<F: FnMut(StateID)>(&self, mut f: F) { + let mut sids = &self.0[self.pattern_offset_end()..]; + let mut prev = 0i32; + while !sids.is_empty() { + let (delta, nr) = read_vari32(sids); + sids = &sids[nr..]; + let sid = prev + delta; + prev = sid; + // This is OK since we only ever serialize valid StateIDs to + // states. And since state IDs can never exceed an isize, they must + // always be able to fit into a usize, and thus cast is OK. + f(StateID::new_unchecked(sid.as_usize())) + } + } + + /// Returns the offset into this state's representation where the pattern + /// IDs end and the NFA state IDs begin. + fn pattern_offset_end(&self) -> usize { + let encoded = self.encoded_pattern_len(); + if encoded == 0 { + return 5; + } + // This arithmetic is OK since we were able to address this many bytes + // when writing to the state, thus, it must fit into a usize. + encoded.checked_mul(4).unwrap().checked_add(9).unwrap() + } + + /// Returns the total number of *encoded* pattern IDs in this state. + /// + /// This may return 0 even when this is a match state, since the pattern + /// ID `PatternID::ZERO` is not encoded when it's the only pattern ID in + /// the match state (the overwhelming common case). + fn encoded_pattern_len(&self) -> usize { + if !self.has_pattern_ids() { + return 0; + } + // This unwrap is OK since the total number of patterns is always + // guaranteed to fit into a usize. + usize::try_from(wire::read_u32(&self.0[5..9])).unwrap() + } +} + +impl<'a> core::fmt::Debug for Repr<'a> { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + let mut nfa_ids = alloc::vec![]; + self.iter_nfa_state_ids(|sid| nfa_ids.push(sid)); + f.debug_struct("Repr") + .field("is_match", &self.is_match()) + .field("is_from_word", &self.is_from_word()) + .field("is_half_crlf", &self.is_half_crlf()) + .field("look_have", &self.look_have()) + .field("look_need", &self.look_need()) + .field("match_pattern_ids", &self.match_pattern_ids()) + .field("nfa_state_ids", &nfa_ids) + .finish() + } +} + +/// ReprVec is a write-only view into the representation of a DFA state. +/// +/// See Repr for more details on the purpose of this type and also the format. +/// +/// Note that not all possible combinations of methods may be called. This is +/// precisely what the various StateBuilder types encapsulate: they only +/// permit valid combinations via Rust's linear typing. +struct ReprVec<'a>(&'a mut Vec<u8>); + +impl<'a> ReprVec<'a> { + /// Set this state as a match state. + /// + /// This should not be exposed explicitly outside of this module. It is + /// set automatically when a pattern ID is added. + fn set_is_match(&mut self) { + self.0[0] |= 1 << 0; + } + + /// Set that this state has pattern IDs explicitly written to it. + /// + /// This should not be exposed explicitly outside of this module. This is + /// used internally as a space saving optimization. Namely, if the state + /// is a match state but does not have any pattern IDs written to it, + /// then it is automatically inferred to have a pattern ID of ZERO. + fn set_has_pattern_ids(&mut self) { + self.0[0] |= 1 << 1; + } + + /// Set this state as being built from a transition over a word byte. + /// + /// Setting this is only necessary when one needs to deal with word + /// boundary assertions. Therefore, if the underlying NFA has no word + /// boundary assertions, callers should not set this. + fn set_is_from_word(&mut self) { + self.0[0] |= 1 << 2; + } + + /// Set this state as having seen half of a CRLF terminator. + /// + /// In the forward direction, this should be set when a `\r` has been seen. + /// In the reverse direction, this should be set when a `\n` has been seen. + fn set_is_half_crlf(&mut self) { + self.0[0] |= 1 << 3; + } + + /// The set of look-behind assertions that were true in the transition that + /// created this state. + fn look_have(&self) -> LookSet { + self.repr().look_have() + } + + /// The set of look-around (both behind and ahead) assertions that appear + /// at least once in this state's set of NFA states. + fn look_need(&self) -> LookSet { + self.repr().look_need() + } + + /// Mutate the set of look-behind assertions that were true in the + /// transition that created this state. + fn set_look_have(&mut self, mut set: impl FnMut(LookSet) -> LookSet) { + set(self.look_have()).write_repr(&mut self.0[1..]); + } + + /// Mutate the set of look-around (both behind and ahead) assertions that + /// appear at least once in this state's set of NFA states. + fn set_look_need(&mut self, mut set: impl FnMut(LookSet) -> LookSet) { + set(self.look_need()).write_repr(&mut self.0[3..]); + } + + /// Add a pattern ID to this state. All match states must have at least + /// one pattern ID associated with it. + /// + /// Callers must never add duplicative pattern IDs. + /// + /// The order in which patterns are added must correspond to the order + /// in which patterns are reported as matches. + fn add_match_pattern_id(&mut self, pid: PatternID) { + // As a (somewhat small) space saving optimization, in the case where + // a matching state has exactly one pattern ID, PatternID::ZERO, we do + // not write either the pattern ID or the number of patterns encoded. + // Instead, all we do is set the 'is_match' bit on this state. Overall, + // this saves 8 bytes per match state for the overwhelming majority of + // match states. + // + // In order to know whether pattern IDs need to be explicitly read or + // not, we use another internal-only bit, 'has_pattern_ids', to + // indicate whether they have been explicitly written or not. + if !self.repr().has_pattern_ids() { + if pid == PatternID::ZERO { + self.set_is_match(); + return; + } + // Make room for 'close_match_pattern_ids' to write the total + // number of pattern IDs written. + self.0.extend(core::iter::repeat(0).take(PatternID::SIZE)); + self.set_has_pattern_ids(); + // If this was already a match state, then the only way that's + // possible when the state doesn't have pattern IDs is if + // PatternID::ZERO was added by the caller previously. In this + // case, we are now adding a non-ZERO pattern ID after it, in + // which case, we want to make sure to represent ZERO explicitly + // now. + if self.repr().is_match() { + write_u32(self.0, 0) + } else { + // Otherwise, just make sure the 'is_match' bit is set. + self.set_is_match(); + } + } + write_u32(self.0, pid.as_u32()); + } + + /// Indicate that no more pattern IDs will be added to this state. + /// + /// Once this is called, callers must not call it or 'add_match_pattern_id' + /// again. + /// + /// This should not be exposed explicitly outside of this module. It + /// should be called only when converting a StateBuilderMatches into a + /// StateBuilderNFA. + fn close_match_pattern_ids(&mut self) { + // If we never wrote any pattern IDs, then there's nothing to do here. + if !self.repr().has_pattern_ids() { + return; + } + let patsize = PatternID::SIZE; + let pattern_bytes = self.0.len() - 9; + // Every pattern ID uses 4 bytes, so number of bytes should be + // divisible by 4. + assert_eq!(pattern_bytes % patsize, 0); + // This unwrap is OK since we are guaranteed that the maximum number + // of possible patterns fits into a u32. + let count32 = u32::try_from(pattern_bytes / patsize).unwrap(); + wire::NE::write_u32(count32, &mut self.0[5..9]); + } + + /// Add an NFA state ID to this state. The order in which NFA states are + /// added matters. It is the caller's responsibility to ensure that + /// duplicate NFA state IDs are not added. + fn add_nfa_state_id(&mut self, prev: &mut StateID, sid: StateID) { + let delta = sid.as_i32() - prev.as_i32(); + write_vari32(self.0, delta); + *prev = sid; + } + + /// Return a read-only view of this state's representation. + fn repr(&self) -> Repr<'_> { + Repr(self.0.as_slice()) + } +} + +/// Write a signed 32-bit integer using zig-zag encoding. +/// +/// https://developers.google.com/protocol-buffers/docs/encoding#varints +fn write_vari32(data: &mut Vec<u8>, n: i32) { + let mut un = n.to_bits() << 1; + if n < 0 { + un = !un; + } + write_varu32(data, un) +} + +/// Read a signed 32-bit integer using zig-zag encoding. Also, return the +/// number of bytes read. +/// +/// https://developers.google.com/protocol-buffers/docs/encoding#varints +fn read_vari32(data: &[u8]) -> (i32, usize) { + let (un, i) = read_varu32(data); + let mut n = i32::from_bits(un >> 1); + if un & 1 != 0 { + n = !n; + } + (n, i) +} + +/// Write an unsigned 32-bit integer as a varint. In essence, `n` is written +/// as a sequence of bytes where all bytes except for the last one have the +/// most significant bit set. The least significant 7 bits correspond to the +/// actual bits of `n`. So in the worst case, a varint uses 5 bytes, but in +/// very common cases, it uses fewer than 4. +/// +/// https://developers.google.com/protocol-buffers/docs/encoding#varints +fn write_varu32(data: &mut Vec<u8>, mut n: u32) { + while n >= 0b1000_0000 { + data.push(n.low_u8() | 0b1000_0000); + n >>= 7; + } + data.push(n.low_u8()); +} + +/// Read an unsigned 32-bit varint. Also, return the number of bytes read. +/// +/// https://developers.google.com/protocol-buffers/docs/encoding#varints +fn read_varu32(data: &[u8]) -> (u32, usize) { + // N.B. We can assume correctness here since we know that all varuints are + // written with write_varu32. Hence, the 'as' uses and unchecked arithmetic + // is all okay. + let mut n: u32 = 0; + let mut shift: u32 = 0; + for (i, &b) in data.iter().enumerate() { + if b < 0b1000_0000 { + return (n | (u32::from(b) << shift), i + 1); + } + n |= (u32::from(b) & 0b0111_1111) << shift; + shift += 7; + } + (0, 0) +} + +/// Push a native-endian encoded `n` on to `dst`. +fn write_u32(dst: &mut Vec<u8>, n: u32) { + use crate::util::wire::NE; + + let start = dst.len(); + dst.extend(core::iter::repeat(0).take(mem::size_of::<u32>())); + NE::write_u32(n, &mut dst[start..]); +} + +#[cfg(test)] +mod tests { + use alloc::vec; + + use quickcheck::quickcheck; + + use super::*; + + #[cfg(not(miri))] + quickcheck! { + fn prop_state_read_write_nfa_state_ids(sids: Vec<StateID>) -> bool { + // Builders states do not permit duplicate IDs. + let sids = dedup_state_ids(sids); + + let mut b = StateBuilderEmpty::new().into_matches().into_nfa(); + for &sid in &sids { + b.add_nfa_state_id(sid); + } + let s = b.to_state(); + let mut got = vec![]; + s.iter_nfa_state_ids(|sid| got.push(sid)); + got == sids + } + + fn prop_state_read_write_pattern_ids(pids: Vec<PatternID>) -> bool { + // Builders states do not permit duplicate IDs. + let pids = dedup_pattern_ids(pids); + + let mut b = StateBuilderEmpty::new().into_matches(); + for &pid in &pids { + b.add_match_pattern_id(pid); + } + let s = b.into_nfa().to_state(); + let mut got = vec![]; + s.iter_match_pattern_ids(|pid| got.push(pid)); + got == pids + } + + fn prop_state_read_write_nfa_state_and_pattern_ids( + sids: Vec<StateID>, + pids: Vec<PatternID> + ) -> bool { + // Builders states do not permit duplicate IDs. + let sids = dedup_state_ids(sids); + let pids = dedup_pattern_ids(pids); + + let mut b = StateBuilderEmpty::new().into_matches(); + for &pid in &pids { + b.add_match_pattern_id(pid); + } + + let mut b = b.into_nfa(); + for &sid in &sids { + b.add_nfa_state_id(sid); + } + + let s = b.to_state(); + let mut got_pids = vec![]; + s.iter_match_pattern_ids(|pid| got_pids.push(pid)); + let mut got_sids = vec![]; + s.iter_nfa_state_ids(|sid| got_sids.push(sid)); + got_pids == pids && got_sids == sids + } + } + + quickcheck! { + fn prop_read_write_varu32(n: u32) -> bool { + let mut buf = vec![]; + write_varu32(&mut buf, n); + let (got, nread) = read_varu32(&buf); + nread == buf.len() && got == n + } + + fn prop_read_write_vari32(n: i32) -> bool { + let mut buf = vec![]; + write_vari32(&mut buf, n); + let (got, nread) = read_vari32(&buf); + nread == buf.len() && got == n + } + } + + #[cfg(not(miri))] + fn dedup_state_ids(sids: Vec<StateID>) -> Vec<StateID> { + let mut set = alloc::collections::BTreeSet::new(); + let mut deduped = vec![]; + for sid in sids { + if set.contains(&sid) { + continue; + } + set.insert(sid); + deduped.push(sid); + } + deduped + } + + #[cfg(not(miri))] + fn dedup_pattern_ids(pids: Vec<PatternID>) -> Vec<PatternID> { + let mut set = alloc::collections::BTreeSet::new(); + let mut deduped = vec![]; + for pid in pids { + if set.contains(&pid) { + continue; + } + set.insert(pid); + deduped.push(pid); + } + deduped + } +} diff --git a/third_party/rust/regex-automata/src/util/empty.rs b/third_party/rust/regex-automata/src/util/empty.rs new file mode 100644 index 0000000000..e16af3b6e5 --- /dev/null +++ b/third_party/rust/regex-automata/src/util/empty.rs @@ -0,0 +1,265 @@ +/*! +This module provides helper routines for dealing with zero-width matches. + +The main problem being solved here is this: + +1. The caller wants to search something that they know is valid UTF-8, such +as a Rust `&str`. +2. The regex used by the caller can match the empty string. For example, `a*`. +3. The caller should never get match offsets returned that occur within the +encoding of a UTF-8 codepoint. It is logically incorrect, and also means that, +e.g., slicing the `&str` at those offsets will lead to a panic. + +So the question here is, how do we prevent the caller from getting match +offsets that split a codepoint? For example, strictly speaking, the regex `a*` +matches `☃` at the positions `[0, 0]`, `[1, 1]`, `[2, 2]` and `[3, 3]` since +the UTF-8 encoding of `☃` is `\xE2\x98\x83`. In particular, the `NFA` that +underlies all of the matching engines in this crate doesn't have anything in +its state graph that prevents matching between UTF-8 code units. Indeed, any +engine derived from the `NFA` will match at those positions by virtue of the +fact that the `NFA` is byte oriented. That is, its transitions are defined over +bytes and the matching engines work by proceeding one byte at a time. + +(An alternative architecture would be to define the transitions in an `NFA` +over codepoints, or `char`. And then make the matching engines proceed by +decoding one codepoint at a time. This is a viable strategy, but it doesn't +work for DFA matching engines because designing a fast and memory efficient +transition table for an alphabet as large as Unicode is quite difficult. More +to the point, the top-level `regex` crate supports matching on arbitrary bytes +when Unicode mode is disabled and one is searching a `&[u8]`. So in that case, +you can't just limit yourself to decoding codepoints and matching those. You +really do need to be able to follow byte oriented transitions on the `NFA`.) + +In an older version of the regex crate, we handled this case not in the regex +engine, but in the iterators over matches. Namely, since this case only arises +when the match is empty, we "just" incremented the next starting position +of the search by `N`, where `N` is the length of the codepoint encoded at +the current position. The alternative or more "natural" solution of just +incrementing by `1` would result in executing a search of `a*` on `☃` like +this: + +* Start search at `0`. +* Found match at `[0, 0]`. +* Next start position is `0`. +* To avoid an infinite loop, since it's an empty match, increment by `1`. +* Start search at `1`. +* Found match at `[1, 1]`. Oops. + +But if we instead incremented by `3` (the length in bytes of `☃`), then we get +the following: + +* Start search at `0`. +* Found match at `[0, 0]`. +* Next start position is `0`. +* To avoid an infinite loop, since it's an empty match, increment by `3`. +* Start search at `3`. +* Found match at `[3, 3]`. + +And we get the correct result. But does this technique work in all cases? +Crucially, it requires that a zero-width match that splits a codepoint never +occurs beyond the starting position of the search. Because if it did, merely +incrementing the start position by the number of bytes in the codepoint at +the current position wouldn't be enough. A zero-width match could just occur +anywhere. It turns out that it is _almost_ true. We can convince ourselves by +looking at all possible patterns that can match the empty string: + +* Patterns like `a*`, `a{0}`, `(?:)`, `a|` and `|a` all unconditionally match +the empty string. That is, assuming there isn't an `a` at the current position, +they will all match the empty string at the start of a search. There is no way +to move past it because any other match would not be "leftmost." +* `^` only matches at the beginning of the haystack, where the start position +is `0`. Since we know we're searching valid UTF-8 (if it isn't valid UTF-8, +then this entire problem goes away because it implies your string type supports +invalid UTF-8 and thus must deal with offsets that not only split a codepoint +but occur in entirely invalid UTF-8 somehow), it follows that `^` never matches +between the code units of a codepoint because the start of a valid UTF-8 string +is never within the encoding of a codepoint. +* `$` basically the same logic as `^`, but for the end of a string. A valid +UTF-8 string can't have an incomplete codepoint at the end of it. +* `(?m:^)` follows similarly to `^`, but it can match immediately following +a `\n`. However, since a `\n` is always a codepoint itself and can never +appear within a codepoint, it follows that the position immediately following +a `\n` in a string that is valid UTF-8 is guaranteed to not be between the +code units of another codepoint. (One caveat here is that the line terminator +for multi-line anchors can now be changed to any arbitrary byte, including +things like `\x98` which might occur within a codepoint. However, this wasn't +supported by the old regex crate. If it was, it pose the same problems as +`(?-u:\B)`, as we'll discuss below.) +* `(?m:$)` a similar argument as for `(?m:^)`. The only difference is that a +`(?m:$)` matches just before a `\n`. But the same argument applies. +* `(?Rm:^)` and `(?Rm:$)` weren't supported by the old regex crate, but the +CRLF aware line anchors follow a similar argument as for `(?m:^)` and `(?m:$)`. +Namely, since they only ever match at a boundary where one side is either a +`\r` or a `\n`, neither of which can occur within a codepoint. +* `\b` only matches at positions where both sides are valid codepoints, so +this cannot split a codepoint. +* `\B`, like `\b`, also only matches at positions where both sides are valid +codepoints. So this cannot split a codepoint either. +* `(?-u:\b)` matches only at positions where at least one side of it is an ASCII +word byte. Since ASCII bytes cannot appear as code units in non-ASCII codepoints +(one of the many amazing qualities of UTF-8), it follows that this too cannot +split a codepoint. +* `(?-u:\B)` finally represents a problem. It can matches between *any* two +bytes that are either both word bytes or non-word bytes. Since code units like +`\xE2` and `\x98` (from the UTF-8 encoding of `☃`) are both non-word bytes, +`(?-u:\B)` will match at the position between them. + +Thus, our approach of incrementing one codepoint at a time after seeing an +empty match is flawed because `(?-u:\B)` can result in an empty match that +splits a codepoint at a position past the starting point of a search. For +example, searching `(?-u:\B)` on `a☃` would produce the following matches: `[2, +2]`, `[3, 3]` and `[4, 4]`. The positions at `0` and `1` don't match because +they correspond to word boundaries since `a` is an ASCII word byte. + +So what did the old regex crate do to avoid this? It banned `(?-u:\B)` from +regexes that could match `&str`. That might sound extreme, but a lot of other +things were banned too. For example, all of `(?-u:.)`, `(?-u:[^a])` and +`(?-u:\W)` can match invalid UTF-8 too, including individual code units with a +codepoint. The key difference is that those expressions could never produce an +empty match. That ban happens when translating an `Ast` to an `Hir`, because +that process that reason about whether an `Hir` can produce *non-empty* matches +at invalid UTF-8 boundaries. Bottom line though is that we side-stepped the +`(?-u:\B)` issue by banning it. + +If banning `(?-u:\B)` were the only issue with the old regex crate's approach, +then I probably would have kept it. `\B` is rarely used, so it's not such a big +deal to have to work-around it. However, the problem with the above approach +is that it doesn't compose. The logic for avoiding splitting a codepoint only +lived in the iterator, which means if anyone wants to implement their own +iterator over regex matches, they have to deal with this extremely subtle edge +case to get full correctness. + +Instead, in this crate, we take the approach of pushing this complexity down +to the lowest layers of each regex engine. The approach is pretty simple: + +* If this corner case doesn't apply, don't do anything. (For example, if UTF-8 +mode isn't enabled or if the regex cannot match the empty string.) +* If an empty match is reported, explicitly check if it splits a codepoint. +* If it doesn't, we're done, return the match. +* If it does, then ignore the match and re-run the search. +* Repeat the above process until the end of the haystack is reached or a match +is found that doesn't split a codepoint or isn't zero width. + +And that's pretty much what this module provides. Every regex engine uses these +methods in their lowest level public APIs, but just above the layer where +their internal engine is used. That way, all regex engines can be arbitrarily +composed without worrying about handling this case, and iterators don't need to +handle it explicitly. + +(It turns out that a new feature I added, support for changing the line +terminator in a regex to any arbitrary byte, also provokes the above problem. +Namely, the byte could be invalid UTF-8 or a UTF-8 continuation byte. So that +support would need to be limited or banned when UTF-8 mode is enabled, just +like we did for `(?-u:\B)`. But thankfully our more robust approach in this +crate handles that case just fine too.) +*/ + +use crate::util::search::{Input, MatchError}; + +#[cold] +#[inline(never)] +pub(crate) fn skip_splits_fwd<T, F>( + input: &Input<'_>, + init_value: T, + match_offset: usize, + find: F, +) -> Result<Option<T>, MatchError> +where + F: FnMut(&Input<'_>) -> Result<Option<(T, usize)>, MatchError>, +{ + skip_splits(true, input, init_value, match_offset, find) +} + +#[cold] +#[inline(never)] +pub(crate) fn skip_splits_rev<T, F>( + input: &Input<'_>, + init_value: T, + match_offset: usize, + find: F, +) -> Result<Option<T>, MatchError> +where + F: FnMut(&Input<'_>) -> Result<Option<(T, usize)>, MatchError>, +{ + skip_splits(false, input, init_value, match_offset, find) +} + +fn skip_splits<T, F>( + forward: bool, + input: &Input<'_>, + init_value: T, + mut match_offset: usize, + mut find: F, +) -> Result<Option<T>, MatchError> +where + F: FnMut(&Input<'_>) -> Result<Option<(T, usize)>, MatchError>, +{ + // If our config says to do an anchored search, then we're definitely + // done. We just need to determine whether we have a valid match or + // not. If we don't, then we're not allowed to continue, so we report + // no match. + // + // This is actually quite a subtle correctness thing. The key here is + // that if we got an empty match that splits a codepoint after doing an + // anchored search in UTF-8 mode, then that implies that we must have + // *started* the search at a location that splits a codepoint. This + // follows from the fact that if a match is reported from an anchored + // search, then the start offset of the match *must* match the start + // offset of the search. + // + // It also follows that no other non-empty match is possible. For + // example, you might write a regex like '(?:)|SOMETHING' and start its + // search in the middle of a codepoint. The first branch is an empty + // regex that will bubble up a match at the first position, and then + // get rejected here and report no match. But what if 'SOMETHING' could + // have matched? We reason that such a thing is impossible, because + // if it does, it must report a match that starts in the middle of a + // codepoint. This in turn implies that a match is reported whose span + // does not correspond to valid UTF-8, and this breaks the promise + // made when UTF-8 mode is enabled. (That promise *can* be broken, for + // example, by enabling UTF-8 mode but building an by hand NFA that + // produces non-empty matches that span invalid UTF-8. This is an unchecked + // but documented precondition violation of UTF-8 mode, and is documented + // to have unspecified behavior.) + // + // I believe this actually means that if an anchored search is run, and + // UTF-8 mode is enabled and the start position splits a codepoint, + // then it is correct to immediately report no match without even + // executing the regex engine. But it doesn't really seem worth writing + // out that case in every regex engine to save a tiny bit of work in an + // extremely pathological case, so we just handle it here. + if input.get_anchored().is_anchored() { + return Ok(if input.is_char_boundary(match_offset) { + Some(init_value) + } else { + None + }); + } + // Otherwise, we have an unanchored search, so just keep looking for + // matches until we have one that does not split a codepoint or we hit + // EOI. + let mut value = init_value; + let mut input = input.clone(); + while !input.is_char_boundary(match_offset) { + if forward { + // The unwrap is OK here because overflowing usize while + // iterating over a slice is impossible, at it would require + // a slice of length greater than isize::MAX, which is itself + // impossible. + input.set_start(input.start().checked_add(1).unwrap()); + } else { + input.set_end(match input.end().checked_sub(1) { + None => return Ok(None), + Some(end) => end, + }); + } + match find(&input)? { + None => return Ok(None), + Some((new_value, new_match_end)) => { + value = new_value; + match_offset = new_match_end; + } + } + } + Ok(Some(value)) +} diff --git a/third_party/rust/regex-automata/src/util/escape.rs b/third_party/rust/regex-automata/src/util/escape.rs new file mode 100644 index 0000000000..7f6aa15f5d --- /dev/null +++ b/third_party/rust/regex-automata/src/util/escape.rs @@ -0,0 +1,84 @@ +/*! +Provides convenience routines for escaping raw bytes. + +Since this crate tends to deal with `&[u8]` everywhere and the default +`Debug` implementation just shows decimal integers, it makes debugging those +representations quite difficult. This module provides types that show `&[u8]` +as if it were a string, with invalid UTF-8 escaped into its byte-by-byte hex +representation. +*/ + +use crate::util::utf8; + +/// Provides a convenient `Debug` implementation for a `u8`. +/// +/// The `Debug` impl treats the byte as an ASCII, and emits a human readable +/// representation of it. If the byte isn't ASCII, then it's emitted as a hex +/// escape sequence. +#[derive(Clone, Copy)] +pub struct DebugByte(pub u8); + +impl core::fmt::Debug for DebugByte { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + // Special case ASCII space. It's too hard to read otherwise, so + // put quotes around it. I sometimes wonder whether just '\x20' would + // be better... + if self.0 == b' ' { + return write!(f, "' '"); + } + // 10 bytes is enough to cover any output from ascii::escape_default. + let mut bytes = [0u8; 10]; + let mut len = 0; + for (i, mut b) in core::ascii::escape_default(self.0).enumerate() { + // capitalize \xab to \xAB + if i >= 2 && b'a' <= b && b <= b'f' { + b -= 32; + } + bytes[len] = b; + len += 1; + } + write!(f, "{}", core::str::from_utf8(&bytes[..len]).unwrap()) + } +} + +/// Provides a convenient `Debug` implementation for `&[u8]`. +/// +/// This generally works best when the bytes are presumed to be mostly UTF-8, +/// but will work for anything. For any bytes that aren't UTF-8, they are +/// emitted as hex escape sequences. +pub struct DebugHaystack<'a>(pub &'a [u8]); + +impl<'a> core::fmt::Debug for DebugHaystack<'a> { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + write!(f, "\"")?; + // This is a sad re-implementation of a similar impl found in bstr. + let mut bytes = self.0; + while let Some(result) = utf8::decode(bytes) { + let ch = match result { + Ok(ch) => ch, + Err(byte) => { + write!(f, r"\x{:02x}", byte)?; + bytes = &bytes[1..]; + continue; + } + }; + bytes = &bytes[ch.len_utf8()..]; + match ch { + '\0' => write!(f, "\\0")?, + // ASCII control characters except \0, \n, \r, \t + '\x01'..='\x08' + | '\x0b' + | '\x0c' + | '\x0e'..='\x19' + | '\x7f' => { + write!(f, "\\x{:02x}", u32::from(ch))?; + } + '\n' | '\r' | '\t' | _ => { + write!(f, "{}", ch.escape_debug())?; + } + } + } + write!(f, "\"")?; + Ok(()) + } +} diff --git a/third_party/rust/regex-automata/src/util/int.rs b/third_party/rust/regex-automata/src/util/int.rs new file mode 100644 index 0000000000..e6b13bff96 --- /dev/null +++ b/third_party/rust/regex-automata/src/util/int.rs @@ -0,0 +1,252 @@ +/*! +This module provides several integer oriented traits for converting between +both fixed size integers and integers whose size varies based on the target +(like `usize`). + +The driving design principle of this module is to attempt to centralize as many +`as` casts as possible here. And in particular, we separate casts into two +buckets: + +* Casts that we use for their truncating behavior. In this case, we use more +descriptive names, like `low_u32` and `high_u32`. +* Casts that we use for converting back-and-forth between `usize`. These +conversions are generally necessary because we often store indices in different +formats to save on memory, which requires converting to and from `usize`. In +this case, we very specifically do not want to overflow, and so the methods +defined here will panic if the `as` cast would be lossy in debug mode. (A +normal `as` cast will never panic!) + +For `as` casts between raw pointers, we use `cast`, so `as` isn't needed there. + +For regex engines, floating point is just never used, so we don't have to worry +about `as` casts for those. + +Otherwise, this module pretty much covers all of our `as` needs except for one +thing: const contexts. There are a select few places in this crate where we +still need to use `as` because const functions on traits aren't stable yet. +If we wind up significantly expanding our const footprint in this crate, it +might be worth defining free functions to handle those cases. But at the time +of writing, that just seemed like too much ceremony. Instead, I comment each +such use of `as` in a const context with a "fixme" notice. + +NOTE: for simplicity, we don't take target pointer width into account here for +`usize` conversions. Since we currently only panic in debug mode, skipping the +check when it can be proven it isn't needed at compile time doesn't really +matter. Now, if we wind up wanting to do as many checks as possible in release +mode, then we would want to skip those when we know the conversions are always +non-lossy. + +NOTE: this module isn't an exhaustive API. For example, we still use things +like `u64::from` where possible, or even `usize::try_from()` for when we do +explicitly want to panic or when we want to return an error for overflow. +*/ + +pub(crate) trait U8 { + fn as_usize(self) -> usize; +} + +impl U8 for u8 { + fn as_usize(self) -> usize { + usize::from(self) + } +} + +pub(crate) trait U16 { + fn as_usize(self) -> usize; + fn low_u8(self) -> u8; + fn high_u8(self) -> u8; +} + +impl U16 for u16 { + fn as_usize(self) -> usize { + usize::from(self) + } + + fn low_u8(self) -> u8 { + self as u8 + } + + fn high_u8(self) -> u8 { + (self >> 8) as u8 + } +} + +pub(crate) trait U32 { + fn as_usize(self) -> usize; + fn low_u8(self) -> u8; + fn low_u16(self) -> u16; + fn high_u16(self) -> u16; +} + +impl U32 for u32 { + fn as_usize(self) -> usize { + #[cfg(debug_assertions)] + { + usize::try_from(self).expect("u32 overflowed usize") + } + #[cfg(not(debug_assertions))] + { + self as usize + } + } + + fn low_u8(self) -> u8 { + self as u8 + } + + fn low_u16(self) -> u16 { + self as u16 + } + + fn high_u16(self) -> u16 { + (self >> 16) as u16 + } +} + +pub(crate) trait U64 { + fn as_usize(self) -> usize; + fn low_u8(self) -> u8; + fn low_u16(self) -> u16; + fn low_u32(self) -> u32; + fn high_u32(self) -> u32; +} + +impl U64 for u64 { + fn as_usize(self) -> usize { + #[cfg(debug_assertions)] + { + usize::try_from(self).expect("u64 overflowed usize") + } + #[cfg(not(debug_assertions))] + { + self as usize + } + } + + fn low_u8(self) -> u8 { + self as u8 + } + + fn low_u16(self) -> u16 { + self as u16 + } + + fn low_u32(self) -> u32 { + self as u32 + } + + fn high_u32(self) -> u32 { + (self >> 32) as u32 + } +} + +pub(crate) trait I32 { + fn as_usize(self) -> usize; + fn to_bits(self) -> u32; + fn from_bits(n: u32) -> i32; +} + +impl I32 for i32 { + fn as_usize(self) -> usize { + #[cfg(debug_assertions)] + { + usize::try_from(self).expect("i32 overflowed usize") + } + #[cfg(not(debug_assertions))] + { + self as usize + } + } + + fn to_bits(self) -> u32 { + self as u32 + } + + fn from_bits(n: u32) -> i32 { + n as i32 + } +} + +pub(crate) trait Usize { + fn as_u8(self) -> u8; + fn as_u16(self) -> u16; + fn as_u32(self) -> u32; + fn as_u64(self) -> u64; +} + +impl Usize for usize { + fn as_u8(self) -> u8 { + #[cfg(debug_assertions)] + { + u8::try_from(self).expect("usize overflowed u8") + } + #[cfg(not(debug_assertions))] + { + self as u8 + } + } + + fn as_u16(self) -> u16 { + #[cfg(debug_assertions)] + { + u16::try_from(self).expect("usize overflowed u16") + } + #[cfg(not(debug_assertions))] + { + self as u16 + } + } + + fn as_u32(self) -> u32 { + #[cfg(debug_assertions)] + { + u32::try_from(self).expect("usize overflowed u32") + } + #[cfg(not(debug_assertions))] + { + self as u32 + } + } + + fn as_u64(self) -> u64 { + #[cfg(debug_assertions)] + { + u64::try_from(self).expect("usize overflowed u64") + } + #[cfg(not(debug_assertions))] + { + self as u64 + } + } +} + +// Pointers aren't integers, but we convert pointers to integers to perform +// offset arithmetic in some places. (And no, we don't convert the integers +// back to pointers.) So add 'as_usize' conversions here too for completeness. +// +// These 'as' casts are actually okay because they're always non-lossy. But the +// idea here is to just try and remove as much 'as' as possible, particularly +// in this crate where we are being really paranoid about offsets and making +// sure we don't panic on inputs that might be untrusted. This way, the 'as' +// casts become easier to audit if they're all in one place, even when some of +// them are actually okay 100% of the time. + +pub(crate) trait Pointer { + fn as_usize(self) -> usize; +} + +impl<T> Pointer for *const T { + fn as_usize(self) -> usize { + self as usize + } +} + +pub(crate) trait PointerMut { + fn as_usize(self) -> usize; +} + +impl<T> PointerMut for *mut T { + fn as_usize(self) -> usize { + self as usize + } +} diff --git a/third_party/rust/regex-automata/src/util/interpolate.rs b/third_party/rust/regex-automata/src/util/interpolate.rs new file mode 100644 index 0000000000..f274629df4 --- /dev/null +++ b/third_party/rust/regex-automata/src/util/interpolate.rs @@ -0,0 +1,579 @@ +/*! +Provides routines for interpolating capture group references. + +That is, if a replacement string contains references like `$foo` or `${foo1}`, +then they are replaced with the corresponding capture values for the groups +named `foo` and `foo1`, respectively. Similarly, syntax like `$1` and `${1}` +is supported as well, with `1` corresponding to a capture group index and not +a name. + +This module provides the free functions [`string`] and [`bytes`], which +interpolate Rust Unicode strings and byte strings, respectively. + +# Format + +These routines support two different kinds of capture references: unbraced and +braced. + +For the unbraced format, the format supported is `$ref` where `name` can be +any character in the class `[0-9A-Za-z_]`. `ref` is always the longest +possible parse. So for example, `$1a` corresponds to the capture group named +`1a` and not the capture group at index `1`. If `ref` matches `^[0-9]+$`, then +it is treated as a capture group index itself and not a name. + +For the braced format, the format supported is `${ref}` where `ref` can be any +sequence of bytes except for `}`. If no closing brace occurs, then it is not +considered a capture reference. As with the unbraced format, if `ref` matches +`^[0-9]+$`, then it is treated as a capture group index and not a name. + +The braced format is useful for exerting precise control over the name of the +capture reference. For example, `${1}a` corresponds to the capture group +reference `1` followed by the letter `a`, where as `$1a` (as mentioned above) +corresponds to the capture group reference `1a`. The braced format is also +useful for expressing capture group names that use characters not supported by +the unbraced format. For example, `${foo[bar].baz}` refers to the capture group +named `foo[bar].baz`. + +If a capture group reference is found and it does not refer to a valid capture +group, then it will be replaced with the empty string. + +To write a literal `$`, use `$$`. + +To be clear, and as exhibited via the type signatures in the routines in this +module, it is impossible for a replacement string to be invalid. A replacement +string may not have the intended semantics, but the interpolation procedure +itself can never fail. +*/ + +use alloc::{string::String, vec::Vec}; + +use crate::util::memchr::memchr; + +/// Accepts a replacement string and interpolates capture references with their +/// corresponding values. +/// +/// `append` should be a function that appends the string value of a capture +/// group at a particular index to the string given. If the capture group +/// index is invalid, then nothing should be appended. +/// +/// `name_to_index` should be a function that maps a capture group name to a +/// capture group index. If the given name doesn't exist, then `None` should +/// be returned. +/// +/// Finally, `dst` is where the final interpolated contents should be written. +/// If `replacement` contains no capture group references, then `dst` will be +/// equivalent to `replacement`. +/// +/// See the [module documentation](self) for details about the format +/// supported. +/// +/// # Example +/// +/// ``` +/// use regex_automata::util::interpolate; +/// +/// let mut dst = String::new(); +/// interpolate::string( +/// "foo $bar baz", +/// |index, dst| { +/// if index == 0 { +/// dst.push_str("BAR"); +/// } +/// }, +/// |name| { +/// if name == "bar" { +/// Some(0) +/// } else { +/// None +/// } +/// }, +/// &mut dst, +/// ); +/// assert_eq!("foo BAR baz", dst); +/// ``` +pub fn string( + mut replacement: &str, + mut append: impl FnMut(usize, &mut String), + mut name_to_index: impl FnMut(&str) -> Option<usize>, + dst: &mut String, +) { + while !replacement.is_empty() { + match memchr(b'$', replacement.as_bytes()) { + None => break, + Some(i) => { + dst.push_str(&replacement[..i]); + replacement = &replacement[i..]; + } + } + // Handle escaping of '$'. + if replacement.as_bytes().get(1).map_or(false, |&b| b == b'$') { + dst.push_str("$"); + replacement = &replacement[2..]; + continue; + } + debug_assert!(!replacement.is_empty()); + let cap_ref = match find_cap_ref(replacement.as_bytes()) { + Some(cap_ref) => cap_ref, + None => { + dst.push_str("$"); + replacement = &replacement[1..]; + continue; + } + }; + replacement = &replacement[cap_ref.end..]; + match cap_ref.cap { + Ref::Number(i) => append(i, dst), + Ref::Named(name) => { + if let Some(i) = name_to_index(name) { + append(i, dst); + } + } + } + } + dst.push_str(replacement); +} + +/// Accepts a replacement byte string and interpolates capture references with +/// their corresponding values. +/// +/// `append` should be a function that appends the byte string value of a +/// capture group at a particular index to the byte string given. If the +/// capture group index is invalid, then nothing should be appended. +/// +/// `name_to_index` should be a function that maps a capture group name to a +/// capture group index. If the given name doesn't exist, then `None` should +/// be returned. +/// +/// Finally, `dst` is where the final interpolated contents should be written. +/// If `replacement` contains no capture group references, then `dst` will be +/// equivalent to `replacement`. +/// +/// See the [module documentation](self) for details about the format +/// supported. +/// +/// # Example +/// +/// ``` +/// use regex_automata::util::interpolate; +/// +/// let mut dst = vec![]; +/// interpolate::bytes( +/// b"foo $bar baz", +/// |index, dst| { +/// if index == 0 { +/// dst.extend_from_slice(b"BAR"); +/// } +/// }, +/// |name| { +/// if name == "bar" { +/// Some(0) +/// } else { +/// None +/// } +/// }, +/// &mut dst, +/// ); +/// assert_eq!(&b"foo BAR baz"[..], dst); +/// ``` +pub fn bytes( + mut replacement: &[u8], + mut append: impl FnMut(usize, &mut Vec<u8>), + mut name_to_index: impl FnMut(&str) -> Option<usize>, + dst: &mut Vec<u8>, +) { + while !replacement.is_empty() { + match memchr(b'$', replacement) { + None => break, + Some(i) => { + dst.extend_from_slice(&replacement[..i]); + replacement = &replacement[i..]; + } + } + // Handle escaping of '$'. + if replacement.get(1).map_or(false, |&b| b == b'$') { + dst.push(b'$'); + replacement = &replacement[2..]; + continue; + } + debug_assert!(!replacement.is_empty()); + let cap_ref = match find_cap_ref(replacement) { + Some(cap_ref) => cap_ref, + None => { + dst.push(b'$'); + replacement = &replacement[1..]; + continue; + } + }; + replacement = &replacement[cap_ref.end..]; + match cap_ref.cap { + Ref::Number(i) => append(i, dst), + Ref::Named(name) => { + if let Some(i) = name_to_index(name) { + append(i, dst); + } + } + } + } + dst.extend_from_slice(replacement); +} + +/// `CaptureRef` represents a reference to a capture group inside some text. +/// The reference is either a capture group name or a number. +/// +/// It is also tagged with the position in the text following the +/// capture reference. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +struct CaptureRef<'a> { + cap: Ref<'a>, + end: usize, +} + +/// A reference to a capture group in some text. +/// +/// e.g., `$2`, `$foo`, `${foo}`. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +enum Ref<'a> { + Named(&'a str), + Number(usize), +} + +impl<'a> From<&'a str> for Ref<'a> { + fn from(x: &'a str) -> Ref<'a> { + Ref::Named(x) + } +} + +impl From<usize> for Ref<'static> { + fn from(x: usize) -> Ref<'static> { + Ref::Number(x) + } +} + +/// Parses a possible reference to a capture group name in the given text, +/// starting at the beginning of `replacement`. +/// +/// If no such valid reference could be found, None is returned. +/// +/// Note that this returns a "possible" reference because this routine doesn't +/// know whether the reference is to a valid group or not. If it winds up not +/// being a valid reference, then it should be replaced with the empty string. +fn find_cap_ref(replacement: &[u8]) -> Option<CaptureRef<'_>> { + let mut i = 0; + let rep: &[u8] = replacement; + if rep.len() <= 1 || rep[0] != b'$' { + return None; + } + i += 1; + if rep[i] == b'{' { + return find_cap_ref_braced(rep, i + 1); + } + let mut cap_end = i; + while rep.get(cap_end).copied().map_or(false, is_valid_cap_letter) { + cap_end += 1; + } + if cap_end == i { + return None; + } + // We just verified that the range 0..cap_end is valid ASCII, so it must + // therefore be valid UTF-8. If we really cared, we could avoid this UTF-8 + // check via an unchecked conversion or by parsing the number straight from + // &[u8]. + let cap = core::str::from_utf8(&rep[i..cap_end]) + .expect("valid UTF-8 capture name"); + Some(CaptureRef { + cap: match cap.parse::<usize>() { + Ok(i) => Ref::Number(i), + Err(_) => Ref::Named(cap), + }, + end: cap_end, + }) +} + +/// Looks for a braced reference, e.g., `${foo1}`. This assumes that an opening +/// brace has been found at `i-1` in `rep`. This then looks for a closing +/// brace and returns the capture reference within the brace. +fn find_cap_ref_braced(rep: &[u8], mut i: usize) -> Option<CaptureRef<'_>> { + assert_eq!(b'{', rep[i.checked_sub(1).unwrap()]); + let start = i; + while rep.get(i).map_or(false, |&b| b != b'}') { + i += 1; + } + if !rep.get(i).map_or(false, |&b| b == b'}') { + return None; + } + // When looking at braced names, we don't put any restrictions on the name, + // so it's possible it could be invalid UTF-8. But a capture group name + // can never be invalid UTF-8, so if we have invalid UTF-8, then we can + // safely return None. + let cap = match core::str::from_utf8(&rep[start..i]) { + Err(_) => return None, + Ok(cap) => cap, + }; + Some(CaptureRef { + cap: match cap.parse::<usize>() { + Ok(i) => Ref::Number(i), + Err(_) => Ref::Named(cap), + }, + end: i + 1, + }) +} + +/// Returns true if and only if the given byte is allowed in a capture name +/// written in non-brace form. +fn is_valid_cap_letter(b: u8) -> bool { + match b { + b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' | b'_' => true, + _ => false, + } +} + +#[cfg(test)] +mod tests { + use alloc::{string::String, vec, vec::Vec}; + + use super::{find_cap_ref, CaptureRef}; + + macro_rules! find { + ($name:ident, $text:expr) => { + #[test] + fn $name() { + assert_eq!(None, find_cap_ref($text.as_bytes())); + } + }; + ($name:ident, $text:expr, $capref:expr) => { + #[test] + fn $name() { + assert_eq!(Some($capref), find_cap_ref($text.as_bytes())); + } + }; + } + + macro_rules! c { + ($name_or_number:expr, $pos:expr) => { + CaptureRef { cap: $name_or_number.into(), end: $pos } + }; + } + + find!(find_cap_ref1, "$foo", c!("foo", 4)); + find!(find_cap_ref2, "${foo}", c!("foo", 6)); + find!(find_cap_ref3, "$0", c!(0, 2)); + find!(find_cap_ref4, "$5", c!(5, 2)); + find!(find_cap_ref5, "$10", c!(10, 3)); + // See https://github.com/rust-lang/regex/pull/585 + // for more on characters following numbers + find!(find_cap_ref6, "$42a", c!("42a", 4)); + find!(find_cap_ref7, "${42}a", c!(42, 5)); + find!(find_cap_ref8, "${42"); + find!(find_cap_ref9, "${42 "); + find!(find_cap_ref10, " $0 "); + find!(find_cap_ref11, "$"); + find!(find_cap_ref12, " "); + find!(find_cap_ref13, ""); + find!(find_cap_ref14, "$1-$2", c!(1, 2)); + find!(find_cap_ref15, "$1_$2", c!("1_", 3)); + find!(find_cap_ref16, "$x-$y", c!("x", 2)); + find!(find_cap_ref17, "$x_$y", c!("x_", 3)); + find!(find_cap_ref18, "${#}", c!("#", 4)); + find!(find_cap_ref19, "${Z[}", c!("Z[", 5)); + find!(find_cap_ref20, "${¾}", c!("¾", 5)); + find!(find_cap_ref21, "${¾a}", c!("¾a", 6)); + find!(find_cap_ref22, "${a¾}", c!("a¾", 6)); + find!(find_cap_ref23, "${☃}", c!("☃", 6)); + find!(find_cap_ref24, "${a☃}", c!("a☃", 7)); + find!(find_cap_ref25, "${☃a}", c!("☃a", 7)); + find!(find_cap_ref26, "${名字}", c!("名字", 9)); + + fn interpolate_string( + mut name_to_index: Vec<(&'static str, usize)>, + caps: Vec<&'static str>, + replacement: &str, + ) -> String { + name_to_index.sort_by_key(|x| x.0); + + let mut dst = String::new(); + super::string( + replacement, + |i, dst| { + if let Some(&s) = caps.get(i) { + dst.push_str(s); + } + }, + |name| -> Option<usize> { + name_to_index + .binary_search_by_key(&name, |x| x.0) + .ok() + .map(|i| name_to_index[i].1) + }, + &mut dst, + ); + dst + } + + fn interpolate_bytes( + mut name_to_index: Vec<(&'static str, usize)>, + caps: Vec<&'static str>, + replacement: &str, + ) -> String { + name_to_index.sort_by_key(|x| x.0); + + let mut dst = vec![]; + super::bytes( + replacement.as_bytes(), + |i, dst| { + if let Some(&s) = caps.get(i) { + dst.extend_from_slice(s.as_bytes()); + } + }, + |name| -> Option<usize> { + name_to_index + .binary_search_by_key(&name, |x| x.0) + .ok() + .map(|i| name_to_index[i].1) + }, + &mut dst, + ); + String::from_utf8(dst).unwrap() + } + + macro_rules! interp { + ($name:ident, $map:expr, $caps:expr, $hay:expr, $expected:expr $(,)*) => { + #[test] + fn $name() { + assert_eq!( + $expected, + interpolate_string($map, $caps, $hay), + "interpolate::string failed", + ); + assert_eq!( + $expected, + interpolate_bytes($map, $caps, $hay), + "interpolate::bytes failed", + ); + } + }; + } + + interp!( + interp1, + vec![("foo", 2)], + vec!["", "", "xxx"], + "test $foo test", + "test xxx test", + ); + + interp!( + interp2, + vec![("foo", 2)], + vec!["", "", "xxx"], + "test$footest", + "test", + ); + + interp!( + interp3, + vec![("foo", 2)], + vec!["", "", "xxx"], + "test${foo}test", + "testxxxtest", + ); + + interp!( + interp4, + vec![("foo", 2)], + vec!["", "", "xxx"], + "test$2test", + "test", + ); + + interp!( + interp5, + vec![("foo", 2)], + vec!["", "", "xxx"], + "test${2}test", + "testxxxtest", + ); + + interp!( + interp6, + vec![("foo", 2)], + vec!["", "", "xxx"], + "test $$foo test", + "test $foo test", + ); + + interp!( + interp7, + vec![("foo", 2)], + vec!["", "", "xxx"], + "test $foo", + "test xxx", + ); + + interp!( + interp8, + vec![("foo", 2)], + vec!["", "", "xxx"], + "$foo test", + "xxx test", + ); + + interp!( + interp9, + vec![("bar", 1), ("foo", 2)], + vec!["", "yyy", "xxx"], + "test $bar$foo", + "test yyyxxx", + ); + + interp!( + interp10, + vec![("bar", 1), ("foo", 2)], + vec!["", "yyy", "xxx"], + "test $ test", + "test $ test", + ); + + interp!( + interp11, + vec![("bar", 1), ("foo", 2)], + vec!["", "yyy", "xxx"], + "test ${} test", + "test test", + ); + + interp!( + interp12, + vec![("bar", 1), ("foo", 2)], + vec!["", "yyy", "xxx"], + "test ${ } test", + "test test", + ); + + interp!( + interp13, + vec![("bar", 1), ("foo", 2)], + vec!["", "yyy", "xxx"], + "test ${a b} test", + "test test", + ); + + interp!( + interp14, + vec![("bar", 1), ("foo", 2)], + vec!["", "yyy", "xxx"], + "test ${a} test", + "test test", + ); + + // This is a funny case where a braced reference is never closed, but + // within the unclosed braced reference, there is an unbraced reference. + // In this case, the braced reference is just treated literally and the + // unbraced reference is found. + interp!( + interp15, + vec![("bar", 1), ("foo", 2)], + vec!["", "yyy", "xxx"], + "test ${wat $bar ok", + "test ${wat yyy ok", + ); +} diff --git a/third_party/rust/regex-automata/src/util/iter.rs b/third_party/rust/regex-automata/src/util/iter.rs new file mode 100644 index 0000000000..a789fa0420 --- /dev/null +++ b/third_party/rust/regex-automata/src/util/iter.rs @@ -0,0 +1,1027 @@ +/*! +Generic helpers for iteration of matches from a regex engine in a haystack. + +The principle type in this module is a [`Searcher`]. A `Searcher` provides +its own lower level iterator-like API in addition to methods for constructing +types that implement `Iterator`. The documentation for `Searcher` explains a +bit more about why these different APIs exist. + +Currently, this module supports iteration over any regex engine that works +with the [`HalfMatch`], [`Match`] or [`Captures`] types. +*/ + +#[cfg(feature = "alloc")] +use crate::util::captures::Captures; +use crate::util::search::{HalfMatch, Input, Match, MatchError}; + +/// A searcher for creating iterators and performing lower level iteration. +/// +/// This searcher encapsulates the logic required for finding all successive +/// non-overlapping matches in a haystack. In theory, iteration would look +/// something like this: +/// +/// 1. Setting the start position to `0`. +/// 2. Execute a regex search. If no match, end iteration. +/// 3. Report the match and set the start position to the end of the match. +/// 4. Go back to (2). +/// +/// And if this were indeed the case, it's likely that `Searcher` wouldn't +/// exist. Unfortunately, because a regex may match the empty string, the above +/// logic won't work for all possible regexes. Namely, if an empty match is +/// found, then step (3) would set the start position of the search to the +/// position it was at. Thus, iteration would never end. +/// +/// Instead, a `Searcher` knows how to detect these cases and forcefully +/// advance iteration in the case of an empty match that overlaps with a +/// previous match. +/// +/// If you know that your regex cannot match any empty string, then the simple +/// algorithm described above will work correctly. +/// +/// When possible, prefer the iterators defined on the regex engine you're +/// using. This tries to abstract over the regex engine and is thus a bit more +/// unwieldy to use. +/// +/// In particular, a `Searcher` is not itself an iterator. Instead, it provides +/// `advance` routines that permit moving the search along explicitly. It also +/// provides various routines, like [`Searcher::into_matches_iter`], that +/// accept a closure (representing how a regex engine executes a search) and +/// returns a conventional iterator. +/// +/// The lifetime parameters come from the [`Input`] type passed to +/// [`Searcher::new`]: +/// +/// * `'h` is the lifetime of the underlying haystack. +/// +/// # Searcher vs Iterator +/// +/// Why does a search type with "advance" APIs exist at all when we also have +/// iterators? Unfortunately, the reasoning behind this split is a complex +/// combination of the following things: +/// +/// 1. While many of the regex engines expose their own iterators, it is also +/// nice to expose this lower level iteration helper because it permits callers +/// to provide their own `Input` configuration. Moreover, a `Searcher` can work +/// with _any_ regex engine instead of only the ones defined in this crate. +/// This way, everyone benefits from a shared iteration implementation. +/// 2. There are many different regex engines that, while they have the same +/// match semantics, they have slightly different APIs. Iteration is just +/// complex enough to want to share code, and so we need a way of abstracting +/// over those different regex engines. While we could define a new trait that +/// describes any regex engine search API, it would wind up looking very close +/// to a closure. While there may still be reasons for the more generic trait +/// to exist, for now and for the purposes of iteration, we use a closure. +/// Closures also provide a lot of easy flexibility at the call site, in that +/// they permit the caller to borrow any kind of state they want for use during +/// each search call. +/// 3. As a result of using closures, and because closures are anonymous types +/// that cannot be named, it is difficult to encapsulate them without both +/// costs to speed and added complexity to the public API. For example, in +/// defining an iterator type like +/// [`dfa::regex::FindMatches`](crate::dfa::regex::FindMatches), +/// if we use a closure internally, it's not possible to name this type in the +/// return type of the iterator constructor. Thus, the only way around it is +/// to erase the type by boxing it and turning it into a `Box<dyn FnMut ...>`. +/// This boxed closure is unlikely to be inlined _and_ it infects the public +/// API in subtle ways. Namely, unless you declare the closure as implementing +/// `Send` and `Sync`, then the resulting iterator type won't implement it +/// either. But there are practical issues with requiring the closure to +/// implement `Send` and `Sync` that result in other API complexities that +/// are beyond the scope of this already long exposition. +/// 4. Some regex engines expose more complex match information than just +/// "which pattern matched" and "at what offsets." For example, the PikeVM +/// exposes match spans for each capturing group that participated in the +/// match. In such cases, it can be quite beneficial to reuse the capturing +/// group allocation on subsequent searches. A proper iterator doesn't permit +/// this API due to its interface, so it's useful to have something a bit lower +/// level that permits callers to amortize allocations while also reusing a +/// shared implementation of iteration. (See the documentation for +/// [`Searcher::advance`] for an example of using the "advance" API with the +/// PikeVM.) +/// +/// What this boils down to is that there are "advance" APIs which require +/// handing a closure to it for every call, and there are also APIs to create +/// iterators from a closure. The former are useful for _implementing_ +/// iterators or when you need more flexibility, while the latter are useful +/// for conveniently writing custom iterators on-the-fly. +/// +/// # Example: iterating with captures +/// +/// Several regex engines in this crate over convenient iterator APIs over +/// [`Captures`] values. To do so, this requires allocating a new `Captures` +/// value for each iteration step. This can perhaps be more costly than you +/// might want. Instead of implementing your own iterator to avoid that +/// cost (which can be a little subtle if you want to handle empty matches +/// correctly), you can use this `Searcher` to do it for you: +/// +/// ``` +/// use regex_automata::{ +/// nfa::thompson::pikevm::PikeVM, +/// util::iter::Searcher, +/// Input, Span, +/// }; +/// +/// let re = PikeVM::new("foo(?P<numbers>[0-9]+)")?; +/// let haystack = "foo1 foo12 foo123"; +/// +/// let mut caps = re.create_captures(); +/// let mut cache = re.create_cache(); +/// let mut matches = vec![]; +/// let mut searcher = Searcher::new(Input::new(haystack)); +/// while let Some(_) = searcher.advance(|input| { +/// re.search(&mut cache, input, &mut caps); +/// Ok(caps.get_match()) +/// }) { +/// // The unwrap is OK since 'numbers' matches if the pattern matches. +/// matches.push(caps.get_group_by_name("numbers").unwrap()); +/// } +/// assert_eq!(matches, vec![ +/// Span::from(3..4), +/// Span::from(8..10), +/// Span::from(14..17), +/// ]); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +#[derive(Clone, Debug)] +pub struct Searcher<'h> { + /// The input parameters to give to each regex engine call. + /// + /// The start position of the search is mutated during iteration. + input: Input<'h>, + /// Records the end offset of the most recent match. This is necessary to + /// handle a corner case for preventing empty matches from overlapping with + /// the ending bounds of a prior match. + last_match_end: Option<usize>, +} + +impl<'h> Searcher<'h> { + /// Create a new fallible non-overlapping matches iterator. + /// + /// The given `input` provides the parameters (including the haystack), + /// while the `finder` represents a closure that calls the underlying regex + /// engine. The closure may borrow any additional state that is needed, + /// such as a prefilter scanner. + pub fn new(input: Input<'h>) -> Searcher<'h> { + Searcher { input, last_match_end: None } + } + + /// Returns the current `Input` used by this searcher. + /// + /// The `Input` returned is generally equivalent to the one given to + /// [`Searcher::new`], but its start position may be different to reflect + /// the start of the next search to be executed. + pub fn input<'s>(&'s self) -> &'s Input<'h> { + &self.input + } + + /// Return the next half match for an infallible search if one exists, and + /// advance to the next position. + /// + /// This is like `try_advance_half`, except errors are converted into + /// panics. + /// + /// # Panics + /// + /// If the given closure returns an error, then this panics. This is useful + /// when you know your underlying regex engine has been configured to not + /// return an error. + /// + /// # Example + /// + /// This example shows how to use a `Searcher` to iterate over all matches + /// when using a DFA, which only provides "half" matches. + /// + /// ``` + /// use regex_automata::{ + /// hybrid::dfa::DFA, + /// util::iter::Searcher, + /// HalfMatch, Input, + /// }; + /// + /// let re = DFA::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}")?; + /// let mut cache = re.create_cache(); + /// + /// let input = Input::new("2010-03-14 2016-10-08 2020-10-22"); + /// let mut it = Searcher::new(input); + /// + /// let expected = Some(HalfMatch::must(0, 10)); + /// let got = it.advance_half(|input| re.try_search_fwd(&mut cache, input)); + /// assert_eq!(expected, got); + /// + /// let expected = Some(HalfMatch::must(0, 21)); + /// let got = it.advance_half(|input| re.try_search_fwd(&mut cache, input)); + /// assert_eq!(expected, got); + /// + /// let expected = Some(HalfMatch::must(0, 32)); + /// let got = it.advance_half(|input| re.try_search_fwd(&mut cache, input)); + /// assert_eq!(expected, got); + /// + /// let expected = None; + /// let got = it.advance_half(|input| re.try_search_fwd(&mut cache, input)); + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// This correctly moves iteration forward even when an empty match occurs: + /// + /// ``` + /// use regex_automata::{ + /// hybrid::dfa::DFA, + /// util::iter::Searcher, + /// HalfMatch, Input, + /// }; + /// + /// let re = DFA::new(r"a|")?; + /// let mut cache = re.create_cache(); + /// + /// let input = Input::new("abba"); + /// let mut it = Searcher::new(input); + /// + /// let expected = Some(HalfMatch::must(0, 1)); + /// let got = it.advance_half(|input| re.try_search_fwd(&mut cache, input)); + /// assert_eq!(expected, got); + /// + /// let expected = Some(HalfMatch::must(0, 2)); + /// let got = it.advance_half(|input| re.try_search_fwd(&mut cache, input)); + /// assert_eq!(expected, got); + /// + /// let expected = Some(HalfMatch::must(0, 4)); + /// let got = it.advance_half(|input| re.try_search_fwd(&mut cache, input)); + /// assert_eq!(expected, got); + /// + /// let expected = None; + /// let got = it.advance_half(|input| re.try_search_fwd(&mut cache, input)); + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn advance_half<F>(&mut self, finder: F) -> Option<HalfMatch> + where + F: FnMut(&Input<'_>) -> Result<Option<HalfMatch>, MatchError>, + { + match self.try_advance_half(finder) { + Ok(m) => m, + Err(err) => panic!( + "unexpected regex half find error: {}\n\ + to handle find errors, use 'try' or 'search' methods", + err, + ), + } + } + + /// Return the next match for an infallible search if one exists, and + /// advance to the next position. + /// + /// The search is advanced even in the presence of empty matches by + /// forbidding empty matches from overlapping with any other match. + /// + /// This is like `try_advance`, except errors are converted into panics. + /// + /// # Panics + /// + /// If the given closure returns an error, then this panics. This is useful + /// when you know your underlying regex engine has been configured to not + /// return an error. + /// + /// # Example + /// + /// This example shows how to use a `Searcher` to iterate over all matches + /// when using a regex based on lazy DFAs: + /// + /// ``` + /// use regex_automata::{ + /// hybrid::regex::Regex, + /// util::iter::Searcher, + /// Match, Input, + /// }; + /// + /// let re = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}")?; + /// let mut cache = re.create_cache(); + /// + /// let input = Input::new("2010-03-14 2016-10-08 2020-10-22"); + /// let mut it = Searcher::new(input); + /// + /// let expected = Some(Match::must(0, 0..10)); + /// let got = it.advance(|input| re.try_search(&mut cache, input)); + /// assert_eq!(expected, got); + /// + /// let expected = Some(Match::must(0, 11..21)); + /// let got = it.advance(|input| re.try_search(&mut cache, input)); + /// assert_eq!(expected, got); + /// + /// let expected = Some(Match::must(0, 22..32)); + /// let got = it.advance(|input| re.try_search(&mut cache, input)); + /// assert_eq!(expected, got); + /// + /// let expected = None; + /// let got = it.advance(|input| re.try_search(&mut cache, input)); + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// This example shows the same as above, but with the PikeVM. This example + /// is useful because it shows how to use this API even when the regex + /// engine doesn't directly return a `Match`. + /// + /// ``` + /// use regex_automata::{ + /// nfa::thompson::pikevm::PikeVM, + /// util::iter::Searcher, + /// Match, Input, + /// }; + /// + /// let re = PikeVM::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}")?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// + /// let input = Input::new("2010-03-14 2016-10-08 2020-10-22"); + /// let mut it = Searcher::new(input); + /// + /// let expected = Some(Match::must(0, 0..10)); + /// let got = it.advance(|input| { + /// re.search(&mut cache, input, &mut caps); + /// Ok(caps.get_match()) + /// }); + /// // Note that if we wanted to extract capturing group spans, we could + /// // do that here with 'caps'. + /// assert_eq!(expected, got); + /// + /// let expected = Some(Match::must(0, 11..21)); + /// let got = it.advance(|input| { + /// re.search(&mut cache, input, &mut caps); + /// Ok(caps.get_match()) + /// }); + /// assert_eq!(expected, got); + /// + /// let expected = Some(Match::must(0, 22..32)); + /// let got = it.advance(|input| { + /// re.search(&mut cache, input, &mut caps); + /// Ok(caps.get_match()) + /// }); + /// assert_eq!(expected, got); + /// + /// let expected = None; + /// let got = it.advance(|input| { + /// re.search(&mut cache, input, &mut caps); + /// Ok(caps.get_match()) + /// }); + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn advance<F>(&mut self, finder: F) -> Option<Match> + where + F: FnMut(&Input<'_>) -> Result<Option<Match>, MatchError>, + { + match self.try_advance(finder) { + Ok(m) => m, + Err(err) => panic!( + "unexpected regex find error: {}\n\ + to handle find errors, use 'try' or 'search' methods", + err, + ), + } + } + + /// Return the next half match for a fallible search if one exists, and + /// advance to the next position. + /// + /// This is like `advance_half`, except it permits callers to handle errors + /// during iteration. + #[inline] + pub fn try_advance_half<F>( + &mut self, + mut finder: F, + ) -> Result<Option<HalfMatch>, MatchError> + where + F: FnMut(&Input<'_>) -> Result<Option<HalfMatch>, MatchError>, + { + let mut m = match finder(&self.input)? { + None => return Ok(None), + Some(m) => m, + }; + if Some(m.offset()) == self.last_match_end { + m = match self.handle_overlapping_empty_half_match(m, finder)? { + None => return Ok(None), + Some(m) => m, + }; + } + self.input.set_start(m.offset()); + self.last_match_end = Some(m.offset()); + Ok(Some(m)) + } + + /// Return the next match for a fallible search if one exists, and advance + /// to the next position. + /// + /// This is like `advance`, except it permits callers to handle errors + /// during iteration. + #[inline] + pub fn try_advance<F>( + &mut self, + mut finder: F, + ) -> Result<Option<Match>, MatchError> + where + F: FnMut(&Input<'_>) -> Result<Option<Match>, MatchError>, + { + let mut m = match finder(&self.input)? { + None => return Ok(None), + Some(m) => m, + }; + if m.is_empty() && Some(m.end()) == self.last_match_end { + m = match self.handle_overlapping_empty_match(m, finder)? { + None => return Ok(None), + Some(m) => m, + }; + } + self.input.set_start(m.end()); + self.last_match_end = Some(m.end()); + Ok(Some(m)) + } + + /// Given a closure that executes a single search, return an iterator over + /// all successive non-overlapping half matches. + /// + /// The iterator returned yields result values. If the underlying regex + /// engine is configured to never return an error, consider calling + /// [`TryHalfMatchesIter::infallible`] to convert errors into panics. + /// + /// # Example + /// + /// This example shows how to use a `Searcher` to create a proper + /// iterator over half matches. + /// + /// ``` + /// use regex_automata::{ + /// hybrid::dfa::DFA, + /// util::iter::Searcher, + /// HalfMatch, Input, + /// }; + /// + /// let re = DFA::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}")?; + /// let mut cache = re.create_cache(); + /// + /// let input = Input::new("2010-03-14 2016-10-08 2020-10-22"); + /// let mut it = Searcher::new(input).into_half_matches_iter(|input| { + /// re.try_search_fwd(&mut cache, input) + /// }); + /// + /// let expected = Some(Ok(HalfMatch::must(0, 10))); + /// assert_eq!(expected, it.next()); + /// + /// let expected = Some(Ok(HalfMatch::must(0, 21))); + /// assert_eq!(expected, it.next()); + /// + /// let expected = Some(Ok(HalfMatch::must(0, 32))); + /// assert_eq!(expected, it.next()); + /// + /// let expected = None; + /// assert_eq!(expected, it.next()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn into_half_matches_iter<F>( + self, + finder: F, + ) -> TryHalfMatchesIter<'h, F> + where + F: FnMut(&Input<'_>) -> Result<Option<HalfMatch>, MatchError>, + { + TryHalfMatchesIter { it: self, finder } + } + + /// Given a closure that executes a single search, return an iterator over + /// all successive non-overlapping matches. + /// + /// The iterator returned yields result values. If the underlying regex + /// engine is configured to never return an error, consider calling + /// [`TryMatchesIter::infallible`] to convert errors into panics. + /// + /// # Example + /// + /// This example shows how to use a `Searcher` to create a proper + /// iterator over matches. + /// + /// ``` + /// use regex_automata::{ + /// hybrid::regex::Regex, + /// util::iter::Searcher, + /// Match, Input, + /// }; + /// + /// let re = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}")?; + /// let mut cache = re.create_cache(); + /// + /// let input = Input::new("2010-03-14 2016-10-08 2020-10-22"); + /// let mut it = Searcher::new(input).into_matches_iter(|input| { + /// re.try_search(&mut cache, input) + /// }); + /// + /// let expected = Some(Ok(Match::must(0, 0..10))); + /// assert_eq!(expected, it.next()); + /// + /// let expected = Some(Ok(Match::must(0, 11..21))); + /// assert_eq!(expected, it.next()); + /// + /// let expected = Some(Ok(Match::must(0, 22..32))); + /// assert_eq!(expected, it.next()); + /// + /// let expected = None; + /// assert_eq!(expected, it.next()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn into_matches_iter<F>(self, finder: F) -> TryMatchesIter<'h, F> + where + F: FnMut(&Input<'_>) -> Result<Option<Match>, MatchError>, + { + TryMatchesIter { it: self, finder } + } + + /// Given a closure that executes a single search, return an iterator over + /// all successive non-overlapping `Captures` values. + /// + /// The iterator returned yields result values. If the underlying regex + /// engine is configured to never return an error, consider calling + /// [`TryCapturesIter::infallible`] to convert errors into panics. + /// + /// Unlike the other iterator constructors, this accepts an initial + /// `Captures` value. This `Captures` value is reused for each search, and + /// the iterator implementation clones it before returning it. The caller + /// must provide this value because the iterator is purposely ignorant + /// of the underlying regex engine and thus doesn't know how to create + /// one itself. More to the point, a `Captures` value itself has a few + /// different constructors, which change which kind of information is + /// available to query in exchange for search performance. + /// + /// # Example + /// + /// This example shows how to use a `Searcher` to create a proper iterator + /// over `Captures` values, which provides access to all capturing group + /// spans for each match. + /// + /// ``` + /// use regex_automata::{ + /// nfa::thompson::pikevm::PikeVM, + /// util::iter::Searcher, + /// Input, + /// }; + /// + /// let re = PikeVM::new( + /// r"(?P<y>[0-9]{4})-(?P<m>[0-9]{2})-(?P<d>[0-9]{2})", + /// )?; + /// let (mut cache, caps) = (re.create_cache(), re.create_captures()); + /// + /// let haystack = "2010-03-14 2016-10-08 2020-10-22"; + /// let input = Input::new(haystack); + /// let mut it = Searcher::new(input) + /// .into_captures_iter(caps, |input, caps| { + /// re.search(&mut cache, input, caps); + /// Ok(()) + /// }); + /// + /// let got = it.next().expect("first date")?; + /// let year = got.get_group_by_name("y").expect("must match"); + /// assert_eq!("2010", &haystack[year]); + /// + /// let got = it.next().expect("second date")?; + /// let month = got.get_group_by_name("m").expect("must match"); + /// assert_eq!("10", &haystack[month]); + /// + /// let got = it.next().expect("third date")?; + /// let day = got.get_group_by_name("d").expect("must match"); + /// assert_eq!("22", &haystack[day]); + /// + /// assert!(it.next().is_none()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[cfg(feature = "alloc")] + #[inline] + pub fn into_captures_iter<F>( + self, + caps: Captures, + finder: F, + ) -> TryCapturesIter<'h, F> + where + F: FnMut(&Input<'_>, &mut Captures) -> Result<(), MatchError>, + { + TryCapturesIter { it: self, caps, finder } + } + + /// Handles the special case of a match that begins where the previous + /// match ended. Without this special handling, it'd be possible to get + /// stuck where an empty match never results in forward progress. This + /// also makes it more consistent with how presiding general purpose regex + /// engines work. + #[cold] + #[inline(never)] + fn handle_overlapping_empty_half_match<F>( + &mut self, + _: HalfMatch, + mut finder: F, + ) -> Result<Option<HalfMatch>, MatchError> + where + F: FnMut(&Input<'_>) -> Result<Option<HalfMatch>, MatchError>, + { + // Since we are only here when 'm.offset()' matches the offset of the + // last match, it follows that this must have been an empty match. + // Since we both need to make progress *and* prevent overlapping + // matches, we discard this match and advance the search by 1. + // + // Note that this may start a search in the middle of a codepoint. The + // regex engines themselves are expected to deal with that and not + // report any matches within a codepoint if they are configured in + // UTF-8 mode. + self.input.set_start(self.input.start().checked_add(1).unwrap()); + finder(&self.input) + } + + /// Handles the special case of an empty match by ensuring that 1) the + /// iterator always advances and 2) empty matches never overlap with other + /// matches. + /// + /// (1) is necessary because we principally make progress by setting the + /// starting location of the next search to the ending location of the last + /// match. But if a match is empty, then this results in a search that does + /// not advance and thus does not terminate. + /// + /// (2) is not strictly necessary, but makes intuitive sense and matches + /// the presiding behavior of most general purpose regex engines. The + /// "intuitive sense" here is that we want to report NON-overlapping + /// matches. So for example, given the regex 'a|(?:)' against the haystack + /// 'a', without the special handling, you'd get the matches [0, 1) and [1, + /// 1), where the latter overlaps with the end bounds of the former. + /// + /// Note that we mark this cold and forcefully prevent inlining because + /// handling empty matches like this is extremely rare and does require + /// quite a bit of code, comparatively. Keeping this code out of the main + /// iterator function keeps it smaller and more amenable to inlining + /// itself. + #[cold] + #[inline(never)] + fn handle_overlapping_empty_match<F>( + &mut self, + m: Match, + mut finder: F, + ) -> Result<Option<Match>, MatchError> + where + F: FnMut(&Input<'_>) -> Result<Option<Match>, MatchError>, + { + assert!(m.is_empty()); + self.input.set_start(self.input.start().checked_add(1).unwrap()); + finder(&self.input) + } +} + +/// An iterator over all non-overlapping half matches for a fallible search. +/// +/// The iterator yields a `Result<HalfMatch, MatchError>` value until no more +/// matches could be found. +/// +/// The type parameters are as follows: +/// +/// * `F` represents the type of a closure that executes the search. +/// +/// The lifetime parameters come from the [`Input`] type: +/// +/// * `'h` is the lifetime of the underlying haystack. +/// +/// When possible, prefer the iterators defined on the regex engine you're +/// using. This tries to abstract over the regex engine and is thus a bit more +/// unwieldy to use. +/// +/// This iterator is created by [`Searcher::into_half_matches_iter`]. +pub struct TryHalfMatchesIter<'h, F> { + it: Searcher<'h>, + finder: F, +} + +impl<'h, F> TryHalfMatchesIter<'h, F> { + /// Return an infallible version of this iterator. + /// + /// Any item yielded that corresponds to an error results in a panic. This + /// is useful if your underlying regex engine is configured in a way that + /// it is guaranteed to never return an error. + pub fn infallible(self) -> HalfMatchesIter<'h, F> { + HalfMatchesIter(self) + } + + /// Returns the current `Input` used by this iterator. + /// + /// The `Input` returned is generally equivalent to the one used to + /// construct this iterator, but its start position may be different to + /// reflect the start of the next search to be executed. + pub fn input<'i>(&'i self) -> &'i Input<'h> { + self.it.input() + } +} + +impl<'h, F> Iterator for TryHalfMatchesIter<'h, F> +where + F: FnMut(&Input<'_>) -> Result<Option<HalfMatch>, MatchError>, +{ + type Item = Result<HalfMatch, MatchError>; + + #[inline] + fn next(&mut self) -> Option<Result<HalfMatch, MatchError>> { + self.it.try_advance_half(&mut self.finder).transpose() + } +} + +impl<'h, F> core::fmt::Debug for TryHalfMatchesIter<'h, F> { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + f.debug_struct("TryHalfMatchesIter") + .field("it", &self.it) + .field("finder", &"<closure>") + .finish() + } +} + +/// An iterator over all non-overlapping half matches for an infallible search. +/// +/// The iterator yields a [`HalfMatch`] value until no more matches could be +/// found. +/// +/// The type parameters are as follows: +/// +/// * `F` represents the type of a closure that executes the search. +/// +/// The lifetime parameters come from the [`Input`] type: +/// +/// * `'h` is the lifetime of the underlying haystack. +/// +/// When possible, prefer the iterators defined on the regex engine you're +/// using. This tries to abstract over the regex engine and is thus a bit more +/// unwieldy to use. +/// +/// This iterator is created by [`Searcher::into_half_matches_iter`] and +/// then calling [`TryHalfMatchesIter::infallible`]. +#[derive(Debug)] +pub struct HalfMatchesIter<'h, F>(TryHalfMatchesIter<'h, F>); + +impl<'h, F> HalfMatchesIter<'h, F> { + /// Returns the current `Input` used by this iterator. + /// + /// The `Input` returned is generally equivalent to the one used to + /// construct this iterator, but its start position may be different to + /// reflect the start of the next search to be executed. + pub fn input<'i>(&'i self) -> &'i Input<'h> { + self.0.it.input() + } +} + +impl<'h, F> Iterator for HalfMatchesIter<'h, F> +where + F: FnMut(&Input<'_>) -> Result<Option<HalfMatch>, MatchError>, +{ + type Item = HalfMatch; + + #[inline] + fn next(&mut self) -> Option<HalfMatch> { + match self.0.next()? { + Ok(m) => Some(m), + Err(err) => panic!( + "unexpected regex half find error: {}\n\ + to handle find errors, use 'try' or 'search' methods", + err, + ), + } + } +} + +/// An iterator over all non-overlapping matches for a fallible search. +/// +/// The iterator yields a `Result<Match, MatchError>` value until no more +/// matches could be found. +/// +/// The type parameters are as follows: +/// +/// * `F` represents the type of a closure that executes the search. +/// +/// The lifetime parameters come from the [`Input`] type: +/// +/// * `'h` is the lifetime of the underlying haystack. +/// +/// When possible, prefer the iterators defined on the regex engine you're +/// using. This tries to abstract over the regex engine and is thus a bit more +/// unwieldy to use. +/// +/// This iterator is created by [`Searcher::into_matches_iter`]. +pub struct TryMatchesIter<'h, F> { + it: Searcher<'h>, + finder: F, +} + +impl<'h, F> TryMatchesIter<'h, F> { + /// Return an infallible version of this iterator. + /// + /// Any item yielded that corresponds to an error results in a panic. This + /// is useful if your underlying regex engine is configured in a way that + /// it is guaranteed to never return an error. + pub fn infallible(self) -> MatchesIter<'h, F> { + MatchesIter(self) + } + + /// Returns the current `Input` used by this iterator. + /// + /// The `Input` returned is generally equivalent to the one used to + /// construct this iterator, but its start position may be different to + /// reflect the start of the next search to be executed. + pub fn input<'i>(&'i self) -> &'i Input<'h> { + self.it.input() + } +} + +impl<'h, F> Iterator for TryMatchesIter<'h, F> +where + F: FnMut(&Input<'_>) -> Result<Option<Match>, MatchError>, +{ + type Item = Result<Match, MatchError>; + + #[inline] + fn next(&mut self) -> Option<Result<Match, MatchError>> { + self.it.try_advance(&mut self.finder).transpose() + } +} + +impl<'h, F> core::fmt::Debug for TryMatchesIter<'h, F> { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + f.debug_struct("TryMatchesIter") + .field("it", &self.it) + .field("finder", &"<closure>") + .finish() + } +} + +/// An iterator over all non-overlapping matches for an infallible search. +/// +/// The iterator yields a [`Match`] value until no more matches could be found. +/// +/// The type parameters are as follows: +/// +/// * `F` represents the type of a closure that executes the search. +/// +/// The lifetime parameters come from the [`Input`] type: +/// +/// * `'h` is the lifetime of the underlying haystack. +/// +/// When possible, prefer the iterators defined on the regex engine you're +/// using. This tries to abstract over the regex engine and is thus a bit more +/// unwieldy to use. +/// +/// This iterator is created by [`Searcher::into_matches_iter`] and +/// then calling [`TryMatchesIter::infallible`]. +#[derive(Debug)] +pub struct MatchesIter<'h, F>(TryMatchesIter<'h, F>); + +impl<'h, F> MatchesIter<'h, F> { + /// Returns the current `Input` used by this iterator. + /// + /// The `Input` returned is generally equivalent to the one used to + /// construct this iterator, but its start position may be different to + /// reflect the start of the next search to be executed. + pub fn input<'i>(&'i self) -> &'i Input<'h> { + self.0.it.input() + } +} + +impl<'h, F> Iterator for MatchesIter<'h, F> +where + F: FnMut(&Input<'_>) -> Result<Option<Match>, MatchError>, +{ + type Item = Match; + + #[inline] + fn next(&mut self) -> Option<Match> { + match self.0.next()? { + Ok(m) => Some(m), + Err(err) => panic!( + "unexpected regex find error: {}\n\ + to handle find errors, use 'try' or 'search' methods", + err, + ), + } + } +} + +/// An iterator over all non-overlapping captures for a fallible search. +/// +/// The iterator yields a `Result<Captures, MatchError>` value until no more +/// matches could be found. +/// +/// The type parameters are as follows: +/// +/// * `F` represents the type of a closure that executes the search. +/// +/// The lifetime parameters come from the [`Input`] type: +/// +/// * `'h` is the lifetime of the underlying haystack. +/// +/// When possible, prefer the iterators defined on the regex engine you're +/// using. This tries to abstract over the regex engine and is thus a bit more +/// unwieldy to use. +/// +/// This iterator is created by [`Searcher::into_captures_iter`]. +#[cfg(feature = "alloc")] +pub struct TryCapturesIter<'h, F> { + it: Searcher<'h>, + caps: Captures, + finder: F, +} + +#[cfg(feature = "alloc")] +impl<'h, F> TryCapturesIter<'h, F> { + /// Return an infallible version of this iterator. + /// + /// Any item yielded that corresponds to an error results in a panic. This + /// is useful if your underlying regex engine is configured in a way that + /// it is guaranteed to never return an error. + pub fn infallible(self) -> CapturesIter<'h, F> { + CapturesIter(self) + } +} + +#[cfg(feature = "alloc")] +impl<'h, F> Iterator for TryCapturesIter<'h, F> +where + F: FnMut(&Input<'_>, &mut Captures) -> Result<(), MatchError>, +{ + type Item = Result<Captures, MatchError>; + + #[inline] + fn next(&mut self) -> Option<Result<Captures, MatchError>> { + let TryCapturesIter { ref mut it, ref mut caps, ref mut finder } = + *self; + let result = it + .try_advance(|input| { + (finder)(input, caps)?; + Ok(caps.get_match()) + }) + .transpose()?; + match result { + Ok(_) => Some(Ok(caps.clone())), + Err(err) => Some(Err(err)), + } + } +} + +#[cfg(feature = "alloc")] +impl<'h, F> core::fmt::Debug for TryCapturesIter<'h, F> { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + f.debug_struct("TryCapturesIter") + .field("it", &self.it) + .field("caps", &self.caps) + .field("finder", &"<closure>") + .finish() + } +} + +/// An iterator over all non-overlapping captures for an infallible search. +/// +/// The iterator yields a [`Captures`] value until no more matches could be +/// found. +/// +/// The type parameters are as follows: +/// +/// * `F` represents the type of a closure that executes the search. +/// +/// The lifetime parameters come from the [`Input`] type: +/// +/// * `'h` is the lifetime of the underlying haystack. +/// +/// When possible, prefer the iterators defined on the regex engine you're +/// using. This tries to abstract over the regex engine and is thus a bit more +/// unwieldy to use. +/// +/// This iterator is created by [`Searcher::into_captures_iter`] and then +/// calling [`TryCapturesIter::infallible`]. +#[cfg(feature = "alloc")] +#[derive(Debug)] +pub struct CapturesIter<'h, F>(TryCapturesIter<'h, F>); + +#[cfg(feature = "alloc")] +impl<'h, F> Iterator for CapturesIter<'h, F> +where + F: FnMut(&Input<'_>, &mut Captures) -> Result<(), MatchError>, +{ + type Item = Captures; + + #[inline] + fn next(&mut self) -> Option<Captures> { + match self.0.next()? { + Ok(m) => Some(m), + Err(err) => panic!( + "unexpected regex captures error: {}\n\ + to handle find errors, use 'try' or 'search' methods", + err, + ), + } + } +} diff --git a/third_party/rust/regex-automata/src/util/lazy.rs b/third_party/rust/regex-automata/src/util/lazy.rs new file mode 100644 index 0000000000..de27a2a6e6 --- /dev/null +++ b/third_party/rust/regex-automata/src/util/lazy.rs @@ -0,0 +1,465 @@ +/*! +A lazily initialized value for safe sharing between threads. + +The principal type in this module is `Lazy`, which makes it easy to construct +values that are shared safely across multiple threads simultaneously. +*/ + +use core::fmt; + +/// A lazily initialized value that implements `Deref` for `T`. +/// +/// A `Lazy` takes an initialization function and permits callers from any +/// thread to access the result of that initialization function in a safe +/// manner. In effect, this permits one-time initialization of global resources +/// in a (possibly) multi-threaded program. +/// +/// This type and its functionality are available even when neither the `alloc` +/// nor the `std` features are enabled. In exchange, a `Lazy` does **not** +/// guarantee that the given `create` function is called at most once. It +/// might be called multiple times. Moreover, a call to `Lazy::get` (either +/// explicitly or implicitly via `Lazy`'s `Deref` impl) may block until a `T` +/// is available. +/// +/// This is very similar to `lazy_static` or `once_cell`, except it doesn't +/// guarantee that the initialization function will be run once and it works +/// in no-alloc no-std environments. With that said, if you need stronger +/// guarantees or a more flexible API, then it is recommended to use either +/// `lazy_static` or `once_cell`. +/// +/// # Warning: may use a spin lock +/// +/// When this crate is compiled _without_ the `alloc` feature, then this type +/// may used a spin lock internally. This can have subtle effects that may +/// be undesirable. See [Spinlocks Considered Harmful][spinharm] for a more +/// thorough treatment of this topic. +/// +/// [spinharm]: https://matklad.github.io/2020/01/02/spinlocks-considered-harmful.html +/// +/// # Example +/// +/// This type is useful for creating regexes once, and then using them from +/// multiple threads simultaneously without worrying about synchronization. +/// +/// ``` +/// use regex_automata::{dfa::regex::Regex, util::lazy::Lazy, Match}; +/// +/// static RE: Lazy<Regex> = Lazy::new(|| Regex::new("foo[0-9]+bar").unwrap()); +/// +/// let expected = Some(Match::must(0, 3..14)); +/// assert_eq!(expected, RE.find(b"zzzfoo12345barzzz")); +/// ``` +pub struct Lazy<T, F = fn() -> T>(lazy::Lazy<T, F>); + +impl<T, F> Lazy<T, F> { + /// Create a new `Lazy` value that is initialized via the given function. + /// + /// The `T` type is automatically inferred from the return type of the + /// `create` function given. + pub const fn new(create: F) -> Lazy<T, F> { + Lazy(lazy::Lazy::new(create)) + } +} + +impl<T, F: Fn() -> T> Lazy<T, F> { + /// Return a reference to the lazily initialized value. + /// + /// This routine may block if another thread is initializing a `T`. + /// + /// Note that given a `x` which has type `Lazy`, this must be called via + /// `Lazy::get(x)` and not `x.get()`. This routine is defined this way + /// because `Lazy` impls `Deref` with a target of `T`. + /// + /// # Panics + /// + /// This panics if the `create` function inside this lazy value panics. + /// If the panic occurred in another thread, then this routine _may_ also + /// panic (but is not guaranteed to do so). + pub fn get(this: &Lazy<T, F>) -> &T { + this.0.get() + } +} + +impl<T, F: Fn() -> T> core::ops::Deref for Lazy<T, F> { + type Target = T; + + fn deref(&self) -> &T { + Lazy::get(self) + } +} + +impl<T: fmt::Debug, F: Fn() -> T> fmt::Debug for Lazy<T, F> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + self.0.fmt(f) + } +} + +#[cfg(feature = "alloc")] +mod lazy { + use core::{ + fmt, + marker::PhantomData, + sync::atomic::{AtomicPtr, Ordering}, + }; + + use alloc::boxed::Box; + + /// A non-std lazy initialized value. + /// + /// This might run the initialization function more than once, but will + /// never block. + /// + /// I wish I could get these semantics into the non-alloc non-std Lazy + /// type below, but I'm not sure how to do it. If you can do an alloc, + /// then the implementation becomes very simple if you don't care about + /// redundant work precisely because a pointer can be atomically swapped. + /// + /// Perhaps making this approach work in the non-alloc non-std case + /// requires asking the caller for a pointer? It would make the API less + /// convenient I think. + pub(super) struct Lazy<T, F> { + data: AtomicPtr<T>, + create: F, + // This indicates to the compiler that this type can drop T. It's not + // totally clear how the absence of this marker could lead to trouble, + // but putting here doesn't have any downsides so we hedge until somone + // can from the Unsafe Working Group can tell us definitively that we + // don't need it. + // + // See: https://github.com/BurntSushi/regex-automata/issues/30 + owned: PhantomData<Box<T>>, + } + + // SAFETY: So long as T and &T (and F and &F) can themselves be safely + // shared among threads, so to can a Lazy<T, _>. Namely, the Lazy API only + // permits accessing a &T and initialization is free of data races. So if T + // is thread safe, then so to is Lazy<T, _>. + // + // We specifically require that T: Send in order for Lazy<T> to be Sync. + // Without that requirement, it's possible to send a T from one thread to + // another via Lazy's destructor. + // + // It's not clear whether we need F: Send+Sync for Lazy to be Sync. But + // we're conservative for now and keep both. + unsafe impl<T: Send + Sync, F: Send + Sync> Sync for Lazy<T, F> {} + + impl<T, F> Lazy<T, F> { + /// Create a new alloc but non-std lazy value that is racily + /// initialized. That is, the 'create' function may be called more than + /// once. + pub(super) const fn new(create: F) -> Lazy<T, F> { + Lazy { + data: AtomicPtr::new(core::ptr::null_mut()), + create, + owned: PhantomData, + } + } + } + + impl<T, F: Fn() -> T> Lazy<T, F> { + /// Get the underlying lazy value. If it hasn't been initialized + /// yet, then always attempt to initialize it (even if some other + /// thread is initializing it) and atomically attach it to this lazy + /// value before returning it. + pub(super) fn get(&self) -> &T { + if let Some(data) = self.poll() { + return data; + } + let data = (self.create)(); + let mut ptr = Box::into_raw(Box::new(data)); + // We attempt to stuff our initialized value into our atomic + // pointer. Upon success, we don't need to do anything. But if + // someone else beat us to the punch, then we need to make sure + // our newly created value is dropped. + let result = self.data.compare_exchange( + core::ptr::null_mut(), + ptr, + Ordering::AcqRel, + Ordering::Acquire, + ); + if let Err(old) = result { + // SAFETY: We created 'ptr' via Box::into_raw above, so turning + // it back into a Box via from_raw is safe. + drop(unsafe { Box::from_raw(ptr) }); + ptr = old; + } + // SAFETY: We just set the pointer above to a non-null value, even + // in the error case, and set it to a fully initialized value + // returned by 'create'. + unsafe { &*ptr } + } + + /// If this lazy value has been initialized successfully, then return + /// that value. Otherwise return None immediately. This never attempts + /// to run initialization itself. + fn poll(&self) -> Option<&T> { + let ptr = self.data.load(Ordering::Acquire); + if ptr.is_null() { + return None; + } + // SAFETY: We just checked that the pointer is not null. Since it's + // not null, it must have been fully initialized by 'get' at some + // point. + Some(unsafe { &*ptr }) + } + } + + impl<T: fmt::Debug, F: Fn() -> T> fmt::Debug for Lazy<T, F> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + f.debug_struct("Lazy").field("data", &self.poll()).finish() + } + } + + impl<T, F> Drop for Lazy<T, F> { + fn drop(&mut self) { + let ptr = *self.data.get_mut(); + if !ptr.is_null() { + // SAFETY: We just checked that 'ptr' is not null. And since + // we have exclusive access, there are no races to worry about. + drop(unsafe { Box::from_raw(ptr) }); + } + } + } +} + +#[cfg(not(feature = "alloc"))] +mod lazy { + use core::{ + cell::Cell, + fmt, + mem::MaybeUninit, + panic::{RefUnwindSafe, UnwindSafe}, + sync::atomic::{AtomicU8, Ordering}, + }; + + /// Our 'Lazy' value can be in one of three states: + /// + /// * INIT is where it starts, and also ends up back here if the + /// 'create' routine panics. + /// * BUSY is where it sits while initialization is running in exactly + /// one thread. + /// * DONE is where it sits after 'create' has completed and 'data' has + /// been fully initialized. + const LAZY_STATE_INIT: u8 = 0; + const LAZY_STATE_BUSY: u8 = 1; + const LAZY_STATE_DONE: u8 = 2; + + /// A non-alloc non-std lazy initialized value. + /// + /// This guarantees initialization only happens once, but uses a spinlock + /// to block in the case of simultaneous access. Blocking occurs so that + /// one thread waits while another thread initializes the value. + /// + /// I would much rather have the semantics of the 'alloc' Lazy type above. + /// Namely, that we might run the initialization function more than once, + /// but we never otherwise block. However, I don't know how to do that in + /// a non-alloc non-std context. + pub(super) struct Lazy<T, F> { + state: AtomicU8, + create: Cell<Option<F>>, + data: Cell<MaybeUninit<T>>, + } + + // SAFETY: So long as T and &T (and F and &F) can themselves be safely + // shared among threads, so to can a Lazy<T, _>. Namely, the Lazy API only + // permits accessing a &T and initialization is free of data races. So if T + // is thread safe, then so to is Lazy<T, _>. + unsafe impl<T: Send + Sync, F: Send + Sync> Sync for Lazy<T, F> {} + // A reference to a Lazy is unwind safe because we specifically take + // precautions to poison all accesses to a Lazy if the caller-provided + // 'create' function panics. + impl<T: UnwindSafe, F: UnwindSafe + RefUnwindSafe> RefUnwindSafe + for Lazy<T, F> + { + } + + impl<T, F> Lazy<T, F> { + /// Create a new non-alloc non-std lazy value that is initialized + /// exactly once on first use using the given function. + pub(super) const fn new(create: F) -> Lazy<T, F> { + Lazy { + state: AtomicU8::new(LAZY_STATE_INIT), + create: Cell::new(Some(create)), + data: Cell::new(MaybeUninit::uninit()), + } + } + } + + impl<T, F: FnOnce() -> T> Lazy<T, F> { + /// Get the underlying lazy value. If it isn't been initialized + /// yet, then either initialize it or block until some other thread + /// initializes it. If the 'create' function given to Lazy::new panics + /// (even in another thread), then this panics too. + pub(super) fn get(&self) -> &T { + // This is effectively a spinlock. We loop until we enter a DONE + // state, and if possible, initialize it ourselves. The only way + // we exit the loop is if 'create' panics, we initialize 'data' or + // some other thread initializes 'data'. + // + // Yes, I have read spinlocks considered harmful[1]. And that + // article is why this spinlock is only active when 'alloc' isn't + // enabled. I did this because I don't think there is really + // another choice without 'alloc', other than not providing this at + // all. But I think that's a big bummer. + // + // [1]: https://matklad.github.io/2020/01/02/spinlocks-considered-harmful.html + while self.state.load(Ordering::Acquire) != LAZY_STATE_DONE { + // Check if we're the first ones to get here. If so, we'll be + // the ones who initialize. + let result = self.state.compare_exchange( + LAZY_STATE_INIT, + LAZY_STATE_BUSY, + Ordering::AcqRel, + Ordering::Acquire, + ); + // This means we saw the INIT state and nobody else can. So we + // must take responsibility for initializing. And by virtue of + // observing INIT, we have also told anyone else trying to + // get here that we are BUSY. If someone else sees BUSY, then + // they will spin until we finish initialization. + if let Ok(_) = result { + // Since we are guaranteed to be the only ones here, we + // know that 'create' is there... Unless someone else got + // here before us and 'create' panicked. In which case, + // 'self.create' is now 'None' and we forward the panic + // to the caller. (i.e., We implement poisoning.) + // + // SAFETY: Our use of 'self.state' guarantees that we are + // the only thread executing this line, and thus there are + // no races. + let create = unsafe { + (*self.create.as_ptr()).take().expect( + "Lazy's create function panicked, \ + preventing initialization, + poisoning current thread", + ) + }; + let guard = Guard { state: &self.state }; + // SAFETY: Our use of 'self.state' guarantees that we are + // the only thread executing this line, and thus there are + // no races. + unsafe { + (*self.data.as_ptr()).as_mut_ptr().write(create()); + } + // All is well. 'self.create' ran successfully, so we + // forget the guard. + core::mem::forget(guard); + // Everything is initialized, so we can declare success. + self.state.store(LAZY_STATE_DONE, Ordering::Release); + break; + } + core::hint::spin_loop(); + } + // We only get here if data is fully initialized, and thus poll + // will always return something. + self.poll().unwrap() + } + + /// If this lazy value has been initialized successfully, then return + /// that value. Otherwise return None immediately. This never blocks. + fn poll(&self) -> Option<&T> { + if self.state.load(Ordering::Acquire) == LAZY_STATE_DONE { + // SAFETY: The DONE state only occurs when data has been fully + // initialized. + Some(unsafe { &*(*self.data.as_ptr()).as_ptr() }) + } else { + None + } + } + } + + impl<T: fmt::Debug, F: FnMut() -> T> fmt::Debug for Lazy<T, F> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + f.debug_struct("Lazy") + .field("state", &self.state.load(Ordering::Acquire)) + .field("create", &"<closure>") + .field("data", &self.poll()) + .finish() + } + } + + impl<T, F> Drop for Lazy<T, F> { + fn drop(&mut self) { + if *self.state.get_mut() == LAZY_STATE_DONE { + // SAFETY: state is DONE if and only if data has been fully + // initialized. At which point, it is safe to drop. + unsafe { + // MSRV(1.60): Use assume_init_drop. The below is how + // assume_init_drop is implemented. + core::ptr::drop_in_place( + (*self.data.as_ptr()).as_mut_ptr(), + ) + } + } + } + } + + /// A guard that will reset a Lazy's state back to INIT when dropped. The + /// idea here is to 'forget' this guard on success. On failure (when a + /// panic occurs), the Drop impl runs and causes all in-progress and future + /// 'get' calls to panic. Without this guard, all in-progress and future + /// 'get' calls would spin forever. Crashing is much better than getting + /// stuck in an infinite loop. + struct Guard<'a> { + state: &'a AtomicU8, + } + + impl<'a> Drop for Guard<'a> { + fn drop(&mut self) { + // We force ourselves back into an INIT state. This will in turn + // cause any future 'get' calls to attempt calling 'self.create' + // again which will in turn panic because 'self.create' will now + // be 'None'. + self.state.store(LAZY_STATE_INIT, Ordering::Release); + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn assert_send<T: Send>() {} + fn assert_sync<T: Sync>() {} + fn assert_unwind<T: core::panic::UnwindSafe>() {} + fn assert_refunwind<T: core::panic::RefUnwindSafe>() {} + + #[test] + fn oibits() { + assert_send::<Lazy<u64>>(); + assert_sync::<Lazy<u64>>(); + assert_unwind::<Lazy<u64>>(); + assert_refunwind::<Lazy<u64>>(); + } + + // This is a regression test because we used to rely on the inferred Sync + // impl for the Lazy type defined above (for 'alloc' mode). In the + // inferred impl, it only requires that T: Sync for Lazy<T>: Sync. But + // if we have that, we can actually make use of the fact that Lazy<T> drops + // T to create a value on one thread and drop it on another. This *should* + // require T: Send, but our missing bounds before let it sneak by. + // + // Basically, this test should not compile, so we... comment it out. We + // don't have a great way of testing compile-fail tests right now. + // + // See: https://github.com/BurntSushi/regex-automata/issues/30 + /* + #[test] + fn sync_not_send() { + #[allow(dead_code)] + fn inner<T: Sync + Default>() { + let lazy = Lazy::new(move || T::default()); + std::thread::scope(|scope| { + scope.spawn(|| { + Lazy::get(&lazy); // We create T in this thread + }); + }); + // And drop in this thread. + drop(lazy); + // So we have send a !Send type over threads. (with some more + // legwork, its possible to even sneak the value out of drop + // through thread local) + } + } + */ +} diff --git a/third_party/rust/regex-automata/src/util/look.rs b/third_party/rust/regex-automata/src/util/look.rs new file mode 100644 index 0000000000..aee31b34e0 --- /dev/null +++ b/third_party/rust/regex-automata/src/util/look.rs @@ -0,0 +1,1748 @@ +/*! +Types and routines for working with look-around assertions. + +This module principally defines two types: + +* [`Look`] enumerates all of the assertions supported by this crate. +* [`LookSet`] provides a way to efficiently store a set of [`Look`] values. +* [`LookMatcher`] provides routines for checking whether a `Look` or a +`LookSet` matches at a particular position in a haystack. +*/ + +// LAMENTATION: Sadly, a lot of the API of `Look` and `LookSet` were basically +// copied verbatim from the regex-syntax crate. I would have no problems using +// the regex-syntax types and defining the matching routines (only found +// in this crate) as free functions, except the `Look` and `LookSet` types +// are used in lots of places. Including in places we expect to work when +// regex-syntax is *not* enabled, such as in the definition of the NFA itself. +// +// Thankfully the code we copy is pretty simple and there isn't much of it. +// Otherwise, the rest of this module deals with *matching* the assertions, +// which is not something that regex-syntax handles. + +use crate::util::{escape::DebugByte, utf8}; + +/// A look-around assertion. +/// +/// An assertion matches at a position between characters in a haystack. +/// Namely, it does not actually "consume" any input as most parts of a regular +/// expression do. Assertions are a way of stating that some property must be +/// true at a particular point during matching. +/// +/// For example, `(?m)^[a-z]+$` is a pattern that: +/// +/// * Scans the haystack for a position at which `(?m:^)` is satisfied. That +/// occurs at either the beginning of the haystack, or immediately following +/// a `\n` character. +/// * Looks for one or more occurrences of `[a-z]`. +/// * Once `[a-z]+` has matched as much as it can, an overall match is only +/// reported when `[a-z]+` stops just before a `\n`. +/// +/// So in this case, `abc` and `\nabc\n` match, but `\nabc1\n` does not. +/// +/// Assertions are also called "look-around," "look-behind" and "look-ahead." +/// Specifically, some assertions are look-behind (like `^`), other assertions +/// are look-ahead (like `$`) and yet other assertions are both look-ahead and +/// look-behind (like `\b`). +/// +/// # Assertions in an NFA +/// +/// An assertion in a [`thompson::NFA`](crate::nfa::thompson::NFA) can be +/// thought of as a conditional epsilon transition. That is, a matching engine +/// like the [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM) only permits +/// moving through conditional epsilon transitions when their condition +/// is satisfied at whatever position the `PikeVM` is currently at in the +/// haystack. +/// +/// How assertions are handled in a `DFA` is trickier, since a DFA does not +/// have epsilon transitions at all. In this case, they are compiled into the +/// automaton itself, at the expense of more states than what would be required +/// without an assertion. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum Look { + /// Match the beginning of text. Specifically, this matches at the starting + /// position of the input. + Start = 1 << 0, + /// Match the end of text. Specifically, this matches at the ending + /// position of the input. + End = 1 << 1, + /// Match the beginning of a line or the beginning of text. Specifically, + /// this matches at the starting position of the input, or at the position + /// immediately following a `\n` character. + StartLF = 1 << 2, + /// Match the end of a line or the end of text. Specifically, this matches + /// at the end position of the input, or at the position immediately + /// preceding a `\n` character. + EndLF = 1 << 3, + /// Match the beginning of a line or the beginning of text. Specifically, + /// this matches at the starting position of the input, or at the position + /// immediately following either a `\r` or `\n` character, but never after + /// a `\r` when a `\n` follows. + StartCRLF = 1 << 4, + /// Match the end of a line or the end of text. Specifically, this matches + /// at the end position of the input, or at the position immediately + /// preceding a `\r` or `\n` character, but never before a `\n` when a `\r` + /// precedes it. + EndCRLF = 1 << 5, + /// Match an ASCII-only word boundary. That is, this matches a position + /// where the left adjacent character and right adjacent character + /// correspond to a word and non-word or a non-word and word character. + WordAscii = 1 << 6, + /// Match an ASCII-only negation of a word boundary. + WordAsciiNegate = 1 << 7, + /// Match a Unicode-aware word boundary. That is, this matches a position + /// where the left adjacent character and right adjacent character + /// correspond to a word and non-word or a non-word and word character. + WordUnicode = 1 << 8, + /// Match a Unicode-aware negation of a word boundary. + WordUnicodeNegate = 1 << 9, +} + +impl Look { + /// Flip the look-around assertion to its equivalent for reverse searches. + /// For example, `StartLF` gets translated to `EndLF`. + /// + /// Some assertions, such as `WordUnicode`, remain the same since they + /// match the same positions regardless of the direction of the search. + #[inline] + pub const fn reversed(self) -> Look { + match self { + Look::Start => Look::End, + Look::End => Look::Start, + Look::StartLF => Look::EndLF, + Look::EndLF => Look::StartLF, + Look::StartCRLF => Look::EndCRLF, + Look::EndCRLF => Look::StartCRLF, + Look::WordAscii => Look::WordAscii, + Look::WordAsciiNegate => Look::WordAsciiNegate, + Look::WordUnicode => Look::WordUnicode, + Look::WordUnicodeNegate => Look::WordUnicodeNegate, + } + } + + /// Return the underlying representation of this look-around enumeration + /// as an integer. Giving the return value to the [`Look::from_repr`] + /// constructor is guaranteed to return the same look-around variant that + /// one started with within a semver compatible release of this crate. + #[inline] + pub const fn as_repr(self) -> u16 { + // AFAIK, 'as' is the only way to zero-cost convert an int enum to an + // actual int. + self as u16 + } + + /// Given the underlying representation of a `Look` value, return the + /// corresponding `Look` value if the representation is valid. Otherwise + /// `None` is returned. + #[inline] + pub const fn from_repr(repr: u16) -> Option<Look> { + match repr { + 0b00_0000_0001 => Some(Look::Start), + 0b00_0000_0010 => Some(Look::End), + 0b00_0000_0100 => Some(Look::StartLF), + 0b00_0000_1000 => Some(Look::EndLF), + 0b00_0001_0000 => Some(Look::StartCRLF), + 0b00_0010_0000 => Some(Look::EndCRLF), + 0b00_0100_0000 => Some(Look::WordAscii), + 0b00_1000_0000 => Some(Look::WordAsciiNegate), + 0b01_0000_0000 => Some(Look::WordUnicode), + 0b10_0000_0000 => Some(Look::WordUnicodeNegate), + _ => None, + } + } + + /// Returns a convenient single codepoint representation of this + /// look-around assertion. Each assertion is guaranteed to be represented + /// by a distinct character. + /// + /// This is useful for succinctly representing a look-around assertion in + /// human friendly but succinct output intended for a programmer working on + /// regex internals. + #[inline] + pub const fn as_char(self) -> char { + match self { + Look::Start => 'A', + Look::End => 'z', + Look::StartLF => '^', + Look::EndLF => '$', + Look::StartCRLF => 'r', + Look::EndCRLF => 'R', + Look::WordAscii => 'b', + Look::WordAsciiNegate => 'B', + Look::WordUnicode => '𝛃', + Look::WordUnicodeNegate => '𝚩', + } + } +} + +/// LookSet is a memory-efficient set of look-around assertions. +/// +/// This is useful for efficiently tracking look-around assertions. For +/// example, a [`thompson::NFA`](crate::nfa::thompson::NFA) provides properties +/// that return `LookSet`s. +#[derive(Clone, Copy, Default, Eq, PartialEq)] +pub struct LookSet { + /// The underlying representation this set is exposed to make it possible + /// to store it somewhere efficiently. The representation is that + /// of a bitset, where each assertion occupies bit `i` where `i = + /// Look::as_repr()`. + /// + /// Note that users of this internal representation must permit the full + /// range of `u16` values to be represented. For example, even if the + /// current implementation only makes use of the 10 least significant bits, + /// it may use more bits in a future semver compatible release. + pub bits: u16, +} + +impl LookSet { + /// Create an empty set of look-around assertions. + #[inline] + pub fn empty() -> LookSet { + LookSet { bits: 0 } + } + + /// Create a full set of look-around assertions. + /// + /// This set contains all possible look-around assertions. + #[inline] + pub fn full() -> LookSet { + LookSet { bits: !0 } + } + + /// Create a look-around set containing the look-around assertion given. + /// + /// This is a convenience routine for creating an empty set and inserting + /// one look-around assertions. + #[inline] + pub fn singleton(look: Look) -> LookSet { + LookSet::empty().insert(look) + } + + /// Returns the total number of look-around assertions in this set. + #[inline] + pub fn len(self) -> usize { + // OK because max value always fits in a u8, which in turn always + // fits in a usize, regardless of target. + usize::try_from(self.bits.count_ones()).unwrap() + } + + /// Returns true if and only if this set is empty. + #[inline] + pub fn is_empty(self) -> bool { + self.len() == 0 + } + + /// Returns true if and only if the given look-around assertion is in this + /// set. + #[inline] + pub fn contains(self, look: Look) -> bool { + self.bits & look.as_repr() != 0 + } + + /// Returns true if and only if this set contains any anchor assertions. + /// This includes both "start/end of haystack" and "start/end of line." + #[inline] + pub fn contains_anchor(&self) -> bool { + self.contains_anchor_haystack() || self.contains_anchor_line() + } + + /// Returns true if and only if this set contains any "start/end of + /// haystack" anchors. This doesn't include "start/end of line" anchors. + #[inline] + pub fn contains_anchor_haystack(&self) -> bool { + self.contains(Look::Start) || self.contains(Look::End) + } + + /// Returns true if and only if this set contains any "start/end of line" + /// anchors. This doesn't include "start/end of haystack" anchors. This + /// includes both `\n` line anchors and CRLF (`\r\n`) aware line anchors. + #[inline] + pub fn contains_anchor_line(&self) -> bool { + self.contains(Look::StartLF) + || self.contains(Look::EndLF) + || self.contains(Look::StartCRLF) + || self.contains(Look::EndCRLF) + } + + /// Returns true if and only if this set contains any "start/end of line" + /// anchors that only treat `\n` as line terminators. This does not include + /// haystack anchors or CRLF aware line anchors. + #[inline] + pub fn contains_anchor_lf(&self) -> bool { + self.contains(Look::StartLF) || self.contains(Look::EndLF) + } + + /// Returns true if and only if this set contains any "start/end of line" + /// anchors that are CRLF-aware. This doesn't include "start/end of + /// haystack" or "start/end of line-feed" anchors. + #[inline] + pub fn contains_anchor_crlf(&self) -> bool { + self.contains(Look::StartCRLF) || self.contains(Look::EndCRLF) + } + + /// Returns true if and only if this set contains any word boundary or + /// negated word boundary assertions. This include both Unicode and ASCII + /// word boundaries. + #[inline] + pub fn contains_word(self) -> bool { + self.contains_word_unicode() || self.contains_word_ascii() + } + + /// Returns true if and only if this set contains any Unicode word boundary + /// or negated Unicode word boundary assertions. + #[inline] + pub fn contains_word_unicode(self) -> bool { + self.contains(Look::WordUnicode) + || self.contains(Look::WordUnicodeNegate) + } + + /// Returns true if and only if this set contains any ASCII word boundary + /// or negated ASCII word boundary assertions. + #[inline] + pub fn contains_word_ascii(self) -> bool { + self.contains(Look::WordAscii) || self.contains(Look::WordAsciiNegate) + } + + /// Returns an iterator over all of the look-around assertions in this set. + #[inline] + pub fn iter(self) -> LookSetIter { + LookSetIter { set: self } + } + + /// Return a new set that is equivalent to the original, but with the given + /// assertion added to it. If the assertion is already in the set, then the + /// returned set is equivalent to the original. + #[inline] + pub fn insert(self, look: Look) -> LookSet { + LookSet { bits: self.bits | look.as_repr() } + } + + /// Updates this set in place with the result of inserting the given + /// assertion into this set. + #[inline] + pub fn set_insert(&mut self, look: Look) { + *self = self.insert(look); + } + + /// Return a new set that is equivalent to the original, but with the given + /// assertion removed from it. If the assertion is not in the set, then the + /// returned set is equivalent to the original. + #[inline] + pub fn remove(self, look: Look) -> LookSet { + LookSet { bits: self.bits & !look.as_repr() } + } + + /// Updates this set in place with the result of removing the given + /// assertion from this set. + #[inline] + pub fn set_remove(&mut self, look: Look) { + *self = self.remove(look); + } + + /// Returns a new set that is the result of subtracting the given set from + /// this set. + #[inline] + pub fn subtract(self, other: LookSet) -> LookSet { + LookSet { bits: self.bits & !other.bits } + } + + /// Updates this set in place with the result of subtracting the given set + /// from this set. + #[inline] + pub fn set_subtract(&mut self, other: LookSet) { + *self = self.subtract(other); + } + + /// Returns a new set that is the union of this and the one given. + #[inline] + pub fn union(self, other: LookSet) -> LookSet { + LookSet { bits: self.bits | other.bits } + } + + /// Updates this set in place with the result of unioning it with the one + /// given. + #[inline] + pub fn set_union(&mut self, other: LookSet) { + *self = self.union(other); + } + + /// Returns a new set that is the intersection of this and the one given. + #[inline] + pub fn intersect(self, other: LookSet) -> LookSet { + LookSet { bits: self.bits & other.bits } + } + + /// Updates this set in place with the result of intersecting it with the + /// one given. + #[inline] + pub fn set_intersect(&mut self, other: LookSet) { + *self = self.intersect(other); + } + + /// Return a `LookSet` from the slice given as a native endian 16-bit + /// integer. + /// + /// # Panics + /// + /// This panics if `slice.len() < 2`. + #[inline] + pub fn read_repr(slice: &[u8]) -> LookSet { + let bits = u16::from_ne_bytes(slice[..2].try_into().unwrap()); + LookSet { bits } + } + + /// Write a `LookSet` as a native endian 16-bit integer to the beginning + /// of the slice given. + /// + /// # Panics + /// + /// This panics if `slice.len() < 2`. + #[inline] + pub fn write_repr(self, slice: &mut [u8]) { + let raw = self.bits.to_ne_bytes(); + slice[0] = raw[0]; + slice[1] = raw[1]; + } + + /// Checks that all assertions in this set can be matched. + /// + /// Some assertions, such as Unicode word boundaries, require optional (but + /// enabled by default) tables that may not be available. If there are + /// assertions in this set that require tables that are not available, then + /// this will return an error. + /// + /// Specifically, this returns an error when the the + /// `unicode-word-boundary` feature is _not_ enabled _and_ this set + /// contains a Unicode word boundary assertion. + /// + /// It can be useful to use this on the result of + /// [`NFA::look_set_any`](crate::nfa::thompson::NFA::look_set_any) + /// when building a matcher engine to ensure methods like + /// [`LookMatcher::matches_set`] do not panic at search time. + pub fn available(self) -> Result<(), UnicodeWordBoundaryError> { + if self.contains_word_unicode() { + UnicodeWordBoundaryError::check()?; + } + Ok(()) + } +} + +impl core::fmt::Debug for LookSet { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + if self.is_empty() { + return write!(f, "∅"); + } + for look in self.iter() { + write!(f, "{}", look.as_char())?; + } + Ok(()) + } +} + +/// An iterator over all look-around assertions in a [`LookSet`]. +/// +/// This iterator is created by [`LookSet::iter`]. +#[derive(Clone, Debug)] +pub struct LookSetIter { + set: LookSet, +} + +impl Iterator for LookSetIter { + type Item = Look; + + #[inline] + fn next(&mut self) -> Option<Look> { + if self.set.is_empty() { + return None; + } + // We'll never have more than u8::MAX distinct look-around assertions, + // so 'repr' will always fit into a u16. + let repr = u16::try_from(self.set.bits.trailing_zeros()).unwrap(); + let look = Look::from_repr(1 << repr)?; + self.set = self.set.remove(look); + Some(look) + } +} + +/// A matcher for look-around assertions. +/// +/// This matcher permits configuring aspects of how look-around assertions are +/// matched. +/// +/// # Example +/// +/// A `LookMatcher` can change the line terminator used for matching multi-line +/// anchors such as `(?m:^)` and `(?m:$)`. +/// +/// ``` +/// use regex_automata::{ +/// nfa::thompson::{self, pikevm::PikeVM}, +/// util::look::LookMatcher, +/// Match, Input, +/// }; +/// +/// let mut lookm = LookMatcher::new(); +/// lookm.set_line_terminator(b'\x00'); +/// +/// let re = PikeVM::builder() +/// .thompson(thompson::Config::new().look_matcher(lookm)) +/// .build(r"(?m)^[a-z]+$")?; +/// let mut cache = re.create_cache(); +/// +/// // Multi-line assertions now use NUL as a terminator. +/// assert_eq!( +/// Some(Match::must(0, 1..4)), +/// re.find(&mut cache, b"\x00abc\x00"), +/// ); +/// // ... and \n is no longer recognized as a terminator. +/// assert_eq!( +/// None, +/// re.find(&mut cache, b"\nabc\n"), +/// ); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +#[derive(Clone, Debug)] +pub struct LookMatcher { + lineterm: DebugByte, +} + +impl LookMatcher { + /// Creates a new default matcher for look-around assertions. + pub fn new() -> LookMatcher { + LookMatcher { lineterm: DebugByte(b'\n') } + } + + /// Sets the line terminator for use with `(?m:^)` and `(?m:$)`. + /// + /// Namely, instead of `^` matching after `\n` and `$` matching immediately + /// before a `\n`, this will cause it to match after and before the byte + /// given. + /// + /// It can occasionally be useful to use this to configure the line + /// terminator to the NUL byte when searching binary data. + /// + /// Note that this does not apply to CRLF-aware line anchors such as + /// `(?Rm:^)` and `(?Rm:$)`. CRLF-aware line anchors are hard-coded to + /// use `\r` and `\n`. + pub fn set_line_terminator(&mut self, byte: u8) -> &mut LookMatcher { + self.lineterm.0 = byte; + self + } + + /// Returns the line terminator that was configured for this matcher. + /// + /// If no line terminator was configured, then this returns `\n`. + /// + /// Note that the line terminator should only be used for matching `(?m:^)` + /// and `(?m:$)` assertions. It specifically should _not_ be used for + /// matching the CRLF aware assertions `(?Rm:^)` and `(?Rm:$)`. + pub fn get_line_terminator(&self) -> u8 { + self.lineterm.0 + } + + /// Returns true when the position `at` in `haystack` satisfies the given + /// look-around assertion. + /// + /// # Panics + /// + /// This panics when testing any Unicode word boundary assertion in this + /// set and when the Unicode word data is not available. Specifically, this + /// only occurs when the `unicode-word-boundary` feature is not enabled. + /// + /// Since it's generally expected that this routine is called inside of + /// a matching engine, callers should check the error condition when + /// building the matching engine. If there is a Unicode word boundary + /// in the matcher and the data isn't available, then the matcher should + /// fail to build. + /// + /// Callers can check the error condition with [`LookSet::available`]. + /// + /// This also may panic when `at > haystack.len()`. Note that `at == + /// haystack.len()` is legal and guaranteed not to panic. + #[inline] + pub fn matches(&self, look: Look, haystack: &[u8], at: usize) -> bool { + self.matches_inline(look, haystack, at) + } + + /// Like `matches`, but forcefully inlined. + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn matches_inline( + &self, + look: Look, + haystack: &[u8], + at: usize, + ) -> bool { + match look { + Look::Start => self.is_start(haystack, at), + Look::End => self.is_end(haystack, at), + Look::StartLF => self.is_start_lf(haystack, at), + Look::EndLF => self.is_end_lf(haystack, at), + Look::StartCRLF => self.is_start_crlf(haystack, at), + Look::EndCRLF => self.is_end_crlf(haystack, at), + Look::WordAscii => self.is_word_ascii(haystack, at), + Look::WordAsciiNegate => self.is_word_ascii_negate(haystack, at), + Look::WordUnicode => self.is_word_unicode(haystack, at).unwrap(), + Look::WordUnicodeNegate => { + self.is_word_unicode_negate(haystack, at).unwrap() + } + } + } + + /// Returns true when _all_ of the assertions in the given set match at the + /// given position in the haystack. + /// + /// # Panics + /// + /// This panics when testing any Unicode word boundary assertion in this + /// set and when the Unicode word data is not available. Specifically, this + /// only occurs when the `unicode-word-boundary` feature is not enabled. + /// + /// Since it's generally expected that this routine is called inside of + /// a matching engine, callers should check the error condition when + /// building the matching engine. If there is a Unicode word boundary + /// in the matcher and the data isn't available, then the matcher should + /// fail to build. + /// + /// Callers can check the error condition with [`LookSet::available`]. + /// + /// This also may panic when `at > haystack.len()`. Note that `at == + /// haystack.len()` is legal and guaranteed not to panic. + #[inline] + pub fn matches_set( + &self, + set: LookSet, + haystack: &[u8], + at: usize, + ) -> bool { + self.matches_set_inline(set, haystack, at) + } + + /// Like `LookSet::matches`, but forcefully inlined for perf. + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn matches_set_inline( + &self, + set: LookSet, + haystack: &[u8], + at: usize, + ) -> bool { + // This used to luse LookSet::iter with Look::matches on each element, + // but that proved to be quite diastrous for perf. The manual "if + // the set has this assertion, check it" turns out to be quite a bit + // faster. + if set.contains(Look::Start) { + if !self.is_start(haystack, at) { + return false; + } + } + if set.contains(Look::End) { + if !self.is_end(haystack, at) { + return false; + } + } + if set.contains(Look::StartLF) { + if !self.is_start_lf(haystack, at) { + return false; + } + } + if set.contains(Look::EndLF) { + if !self.is_end_lf(haystack, at) { + return false; + } + } + if set.contains(Look::StartCRLF) { + if !self.is_start_crlf(haystack, at) { + return false; + } + } + if set.contains(Look::EndCRLF) { + if !self.is_end_crlf(haystack, at) { + return false; + } + } + if set.contains(Look::WordAscii) { + if !self.is_word_ascii(haystack, at) { + return false; + } + } + if set.contains(Look::WordAsciiNegate) { + if !self.is_word_ascii_negate(haystack, at) { + return false; + } + } + if set.contains(Look::WordUnicode) { + if !self.is_word_unicode(haystack, at).unwrap() { + return false; + } + } + if set.contains(Look::WordUnicodeNegate) { + if !self.is_word_unicode_negate(haystack, at).unwrap() { + return false; + } + } + true + } + + /// Split up the given byte classes into equivalence classes in a way that + /// is consistent with this look-around assertion. + #[cfg(feature = "alloc")] + pub(crate) fn add_to_byteset( + &self, + look: Look, + set: &mut crate::util::alphabet::ByteClassSet, + ) { + match look { + Look::Start | Look::End => {} + Look::StartLF | Look::EndLF => { + set.set_range(self.lineterm.0, self.lineterm.0); + } + Look::StartCRLF | Look::EndCRLF => { + set.set_range(b'\r', b'\r'); + set.set_range(b'\n', b'\n'); + } + Look::WordAscii + | Look::WordAsciiNegate + | Look::WordUnicode + | Look::WordUnicodeNegate => { + // We need to mark all ranges of bytes whose pairs result in + // evaluating \b differently. This isn't technically correct + // for Unicode word boundaries, but DFAs can't handle those + // anyway, and thus, the byte classes don't need to either + // since they are themselves only used in DFAs. + // + // FIXME: It seems like the calls to 'set_range' here are + // completely invariant, which means we could just hard-code + // them here without needing to write a loop. And we only need + // to do this dance at most once per regex. + // + // FIXME: Is this correct for \B? + let iswb = utf8::is_word_byte; + // This unwrap is OK because we guard every use of 'asu8' with + // a check that the input is <= 255. + let asu8 = |b: u16| u8::try_from(b).unwrap(); + let mut b1: u16 = 0; + let mut b2: u16; + while b1 <= 255 { + b2 = b1 + 1; + while b2 <= 255 && iswb(asu8(b1)) == iswb(asu8(b2)) { + b2 += 1; + } + // The guards above guarantee that b2 can never get any + // bigger. + assert!(b2 <= 256); + // Subtracting 1 from b2 is always OK because it is always + // at least 1 greater than b1, and the assert above + // guarantees that the asu8 conversion will succeed. + set.set_range(asu8(b1), asu8(b2.checked_sub(1).unwrap())); + b1 = b2; + } + } + } + } + + /// Returns true when [`Look::Start`] is satisfied `at` the given position + /// in `haystack`. + /// + /// # Panics + /// + /// This may panic when `at > haystack.len()`. Note that `at == + /// haystack.len()` is legal and guaranteed not to panic. + #[inline] + pub fn is_start(&self, _haystack: &[u8], at: usize) -> bool { + at == 0 + } + + /// Returns true when [`Look::End`] is satisfied `at` the given position in + /// `haystack`. + /// + /// # Panics + /// + /// This may panic when `at > haystack.len()`. Note that `at == + /// haystack.len()` is legal and guaranteed not to panic. + #[inline] + pub fn is_end(&self, haystack: &[u8], at: usize) -> bool { + at == haystack.len() + } + + /// Returns true when [`Look::StartLF`] is satisfied `at` the given + /// position in `haystack`. + /// + /// # Panics + /// + /// This may panic when `at > haystack.len()`. Note that `at == + /// haystack.len()` is legal and guaranteed not to panic. + #[inline] + pub fn is_start_lf(&self, haystack: &[u8], at: usize) -> bool { + self.is_start(haystack, at) || haystack[at - 1] == self.lineterm.0 + } + + /// Returns true when [`Look::EndLF`] is satisfied `at` the given position + /// in `haystack`. + /// + /// # Panics + /// + /// This may panic when `at > haystack.len()`. Note that `at == + /// haystack.len()` is legal and guaranteed not to panic. + #[inline] + pub fn is_end_lf(&self, haystack: &[u8], at: usize) -> bool { + self.is_end(haystack, at) || haystack[at] == self.lineterm.0 + } + + /// Returns true when [`Look::StartCRLF`] is satisfied `at` the given + /// position in `haystack`. + /// + /// # Panics + /// + /// This may panic when `at > haystack.len()`. Note that `at == + /// haystack.len()` is legal and guaranteed not to panic. + #[inline] + pub fn is_start_crlf(&self, haystack: &[u8], at: usize) -> bool { + self.is_start(haystack, at) + || haystack[at - 1] == b'\n' + || (haystack[at - 1] == b'\r' + && (at >= haystack.len() || haystack[at] != b'\n')) + } + + /// Returns true when [`Look::EndCRLF`] is satisfied `at` the given + /// position in `haystack`. + /// + /// # Panics + /// + /// This may panic when `at > haystack.len()`. Note that `at == + /// haystack.len()` is legal and guaranteed not to panic. + #[inline] + pub fn is_end_crlf(&self, haystack: &[u8], at: usize) -> bool { + self.is_end(haystack, at) + || haystack[at] == b'\r' + || (haystack[at] == b'\n' + && (at == 0 || haystack[at - 1] != b'\r')) + } + + /// Returns true when [`Look::WordAscii`] is satisfied `at` the given + /// position in `haystack`. + /// + /// # Panics + /// + /// This may panic when `at > haystack.len()`. Note that `at == + /// haystack.len()` is legal and guaranteed not to panic. + #[inline] + pub fn is_word_ascii(&self, haystack: &[u8], at: usize) -> bool { + let word_before = at > 0 && utf8::is_word_byte(haystack[at - 1]); + let word_after = + at < haystack.len() && utf8::is_word_byte(haystack[at]); + word_before != word_after + } + + /// Returns true when [`Look::WordAsciiNegate`] is satisfied `at` the given + /// position in `haystack`. + /// + /// # Panics + /// + /// This may panic when `at > haystack.len()`. Note that `at == + /// haystack.len()` is legal and guaranteed not to panic. + #[inline] + pub fn is_word_ascii_negate(&self, haystack: &[u8], at: usize) -> bool { + !self.is_word_ascii(haystack, at) + } + + /// Returns true when [`Look::WordUnicode`] is satisfied `at` the given + /// position in `haystack`. + /// + /// # Panics + /// + /// This may panic when `at > haystack.len()`. Note that `at == + /// haystack.len()` is legal and guaranteed not to panic. + /// + /// # Errors + /// + /// This returns an error when Unicode word boundary tables + /// are not available. Specifically, this only occurs when the + /// `unicode-word-boundary` feature is not enabled. + #[inline] + pub fn is_word_unicode( + &self, + haystack: &[u8], + at: usize, + ) -> Result<bool, UnicodeWordBoundaryError> { + let word_before = is_word_char::rev(haystack, at)?; + let word_after = is_word_char::fwd(haystack, at)?; + Ok(word_before != word_after) + } + + /// Returns true when [`Look::WordUnicodeNegate`] is satisfied `at` the + /// given position in `haystack`. + /// + /// # Panics + /// + /// This may panic when `at > haystack.len()`. Note that `at == + /// haystack.len()` is legal and guaranteed not to panic. + /// + /// # Errors + /// + /// This returns an error when Unicode word boundary tables + /// are not available. Specifically, this only occurs when the + /// `unicode-word-boundary` feature is not enabled. + #[inline] + pub fn is_word_unicode_negate( + &self, + haystack: &[u8], + at: usize, + ) -> Result<bool, UnicodeWordBoundaryError> { + // This is pretty subtle. Why do we need to do UTF-8 decoding here? + // Well... at time of writing, the is_word_char_{fwd,rev} routines will + // only return true if there is a valid UTF-8 encoding of a "word" + // codepoint, and false in every other case (including invalid UTF-8). + // This means that in regions of invalid UTF-8 (which might be a + // subset of valid UTF-8!), it would result in \B matching. While this + // would be questionable in the context of truly invalid UTF-8, it is + // *certainly* wrong to report match boundaries that split the encoding + // of a codepoint. So to work around this, we ensure that we can decode + // a codepoint on either side of `at`. If either direction fails, then + // we don't permit \B to match at all. + // + // Now, this isn't exactly optimal from a perf perspective. We could + // try and detect this in is_word_char::{fwd,rev}, but it's not clear + // if it's worth it. \B is, after all, rarely used. Even worse, + // is_word_char::{fwd,rev} could do its own UTF-8 decoding, and so this + // will wind up doing UTF-8 decoding twice. Owch. We could fix this + // with more code complexity, but it just doesn't feel worth it for \B. + // + // And in particular, we do *not* have to do this with \b, because \b + // *requires* that at least one side of `at` be a "word" codepoint, + // which in turn implies one side of `at` must be valid UTF-8. This in + // turn implies that \b can never split a valid UTF-8 encoding of a + // codepoint. In the case where one side of `at` is truly invalid UTF-8 + // and the other side IS a word codepoint, then we want \b to match + // since it represents a valid UTF-8 boundary. It also makes sense. For + // example, you'd want \b\w+\b to match 'abc' in '\xFFabc\xFF'. + // + // Note also that this is not just '!is_word_unicode(..)' like it is + // for the ASCII case. For example, neither \b nor \B is satisfied + // within invalid UTF-8 sequences. + let word_before = at > 0 + && match utf8::decode_last(&haystack[..at]) { + None | Some(Err(_)) => return Ok(false), + Some(Ok(_)) => is_word_char::rev(haystack, at)?, + }; + let word_after = at < haystack.len() + && match utf8::decode(&haystack[at..]) { + None | Some(Err(_)) => return Ok(false), + Some(Ok(_)) => is_word_char::fwd(haystack, at)?, + }; + Ok(word_before == word_after) + } +} + +impl Default for LookMatcher { + fn default() -> LookMatcher { + LookMatcher::new() + } +} + +/// An error that occurs when the Unicode-aware `\w` class is unavailable. +/// +/// This error can occur when the data tables necessary for the Unicode aware +/// Perl character class `\w` are unavailable. The `\w` class is used to +/// determine whether a codepoint is considered a word character or not when +/// determining whether a Unicode aware `\b` (or `\B`) matches at a particular +/// position. +/// +/// This error can only occur when the `unicode-word-boundary` feature is +/// disabled. +#[derive(Clone, Debug)] +pub struct UnicodeWordBoundaryError(()); + +impl UnicodeWordBoundaryError { + #[cfg(not(feature = "unicode-word-boundary"))] + pub(crate) fn new() -> UnicodeWordBoundaryError { + UnicodeWordBoundaryError(()) + } + + /// Returns an error if and only if Unicode word boundary data is + /// unavailable. + pub fn check() -> Result<(), UnicodeWordBoundaryError> { + is_word_char::check() + } +} + +#[cfg(feature = "std")] +impl std::error::Error for UnicodeWordBoundaryError {} + +impl core::fmt::Display for UnicodeWordBoundaryError { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + write!( + f, + "Unicode-aware \\b and \\B are unavailable because the \ + requisite data tables are missing, please enable the \ + unicode-word-boundary feature" + ) + } +} + +// Below are FOUR different ways for checking whether whether a "word" +// codepoint exists at a particular position in the haystack. The four +// different approaches are, in order of preference: +// +// 1. Parse '\w', convert to an NFA, convert to a fully compiled DFA on the +// first call, and then use that DFA for all subsequent calls. +// 2. Do UTF-8 decoding and use regex_syntax::is_word_character if available. +// 3. Do UTF-8 decoding and use our own 'perl_word' table. +// 4. Return an error. +// +// The reason for all of these approaches is a combination of perf and +// permitting one to build regex-automata without the Unicode data necessary +// for handling Unicode-aware word boundaries. (In which case, '(?-u:\b)' would +// still work.) +// +// The DFA approach is the fastest, but it requires the regex parser, the +// NFA compiler, the DFA builder and the DFA search runtime. That's a lot to +// bring in, but if it's available, it's (probably) the best we can do. +// +// Approaches (2) and (3) are effectively equivalent, but (2) reuses the +// data in regex-syntax and avoids duplicating it in regex-automata. +// +// Finally, (4) unconditionally returns an error since the requisite data isn't +// available anywhere. +// +// There are actually more approaches possible that we didn't implement. For +// example, if the DFA builder is available but the syntax parser is not, we +// could technically hand construct our own NFA from the 'perl_word' data +// table. But to avoid some pretty hairy code duplication, we would in turn +// need to pull the UTF-8 compiler out of the NFA compiler. Yikes. +// +// A possibly more sensible alternative is to use a lazy DFA when the full +// DFA builder isn't available... +// +// Yet another choice would be to build the full DFA and then embed it into the +// source. Then we'd only need to bring in the DFA search runtime, which is +// considerably smaller than the DFA builder code. The problem here is that the +// Debian people have spooked me[1] into avoiding cyclic dependencies. Namely, +// we'd need to build regex-cli, which depends on regex-automata in order to +// build some part of regex-automata. But to be honest, something like this has +// to be allowed somehow? I just don't know what the right process is. +// +// There are perhaps other choices as well. Why did I stop at these 4? Because +// I wanted to preserve my sanity. I suspect I'll wind up adding the lazy DFA +// approach eventually, as the benefits of the DFA approach are somewhat +// compelling. The 'boundary-words-holmes' benchmark tests this: +// +// $ regex-cli bench measure -f boundary-words-holmes -e pikevm > dfa.csv +// +// Then I changed the code below so that the util/unicode_data/perl_word table +// was used and re-ran the benchmark: +// +// $ regex-cli bench measure -f boundary-words-holmes -e pikevm > table.csv +// +// And compared them: +// +// $ regex-cli bench diff dfa.csv table.csv +// benchmark engine dfa table +// --------- ------ --- ----- +// internal/count/boundary-words-holmes regex/automata/pikevm 18.6 MB/s 12.9 MB/s +// +// Which is a nice improvement. +// +// UPDATE: It turns out that it takes approximately 22ms to build the reverse +// DFA for \w. (And about 3ms for the forward DFA.) It's probably not much in +// the grand scheme things, but that is a significant latency cost. So I'm not +// sure that's a good idea. I then tried using a lazy DFA instead, and that +// eliminated the overhead, but since the lazy DFA requires mutable working +// memory, that requires introducing a 'Cache' for every simultaneous call. +// +// I ended up deciding for now to just keep the "UTF-8 decode and check the +// table." The DFA and lazy DFA approaches are still below, but commented out. +// +// [1]: https://github.com/BurntSushi/ucd-generate/issues/11 + +/* +/// A module that looks for word codepoints using lazy DFAs. +#[cfg(all( + feature = "unicode-word-boundary", + feature = "syntax", + feature = "unicode-perl", + feature = "hybrid" +))] +mod is_word_char { + use alloc::vec::Vec; + + use crate::{ + hybrid::dfa::{Cache, DFA}, + nfa::thompson::NFA, + util::{lazy::Lazy, pool::Pool, primitives::StateID}, + Anchored, Input, + }; + + pub(super) fn check() -> Result<(), super::UnicodeWordBoundaryError> { + Ok(()) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(super) fn fwd( + haystack: &[u8], + mut at: usize, + ) -> Result<bool, super::UnicodeWordBoundaryError> { + static WORD: Lazy<DFA> = Lazy::new(|| DFA::new(r"\w").unwrap()); + static CACHE: Lazy<Pool<Cache>> = + Lazy::new(|| Pool::new(|| WORD.create_cache())); + let dfa = Lazy::get(&WORD); + let mut cache = Lazy::get(&CACHE).get(); + let mut sid = dfa + .start_state_forward( + &mut cache, + &Input::new("").anchored(Anchored::Yes), + ) + .unwrap(); + while at < haystack.len() { + let byte = haystack[at]; + sid = dfa.next_state(&mut cache, sid, byte).unwrap(); + at += 1; + if sid.is_tagged() { + if sid.is_match() { + return Ok(true); + } else if sid.is_dead() { + return Ok(false); + } + } + } + Ok(dfa.next_eoi_state(&mut cache, sid).unwrap().is_match()) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(super) fn rev( + haystack: &[u8], + mut at: usize, + ) -> Result<bool, super::UnicodeWordBoundaryError> { + static WORD: Lazy<DFA> = Lazy::new(|| { + DFA::builder() + .thompson(NFA::config().reverse(true)) + .build(r"\w") + .unwrap() + }); + static CACHE: Lazy<Pool<Cache>> = + Lazy::new(|| Pool::new(|| WORD.create_cache())); + let dfa = Lazy::get(&WORD); + let mut cache = Lazy::get(&CACHE).get(); + let mut sid = dfa + .start_state_reverse( + &mut cache, + &Input::new("").anchored(Anchored::Yes), + ) + .unwrap(); + while at > 0 { + at -= 1; + let byte = haystack[at]; + sid = dfa.next_state(&mut cache, sid, byte).unwrap(); + if sid.is_tagged() { + if sid.is_match() { + return Ok(true); + } else if sid.is_dead() { + return Ok(false); + } + } + } + Ok(dfa.next_eoi_state(&mut cache, sid).unwrap().is_match()) + } +} +*/ + +/* +/// A module that looks for word codepoints using fully compiled DFAs. +#[cfg(all( + feature = "unicode-word-boundary", + feature = "syntax", + feature = "unicode-perl", + feature = "dfa-build" +))] +mod is_word_char { + use alloc::vec::Vec; + + use crate::{ + dfa::{dense::DFA, Automaton, StartKind}, + nfa::thompson::NFA, + util::{lazy::Lazy, primitives::StateID}, + Anchored, Input, + }; + + pub(super) fn check() -> Result<(), super::UnicodeWordBoundaryError> { + Ok(()) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(super) fn fwd( + haystack: &[u8], + mut at: usize, + ) -> Result<bool, super::UnicodeWordBoundaryError> { + static WORD: Lazy<(DFA<Vec<u32>>, StateID)> = Lazy::new(|| { + let dfa = DFA::builder() + .configure(DFA::config().start_kind(StartKind::Anchored)) + .build(r"\w") + .unwrap(); + // OK because our regex has no look-around. + let start_id = dfa.universal_start_state(Anchored::Yes).unwrap(); + (dfa, start_id) + }); + let &(ref dfa, mut sid) = Lazy::get(&WORD); + while at < haystack.len() { + let byte = haystack[at]; + sid = dfa.next_state(sid, byte); + at += 1; + if dfa.is_special_state(sid) { + if dfa.is_match_state(sid) { + return Ok(true); + } else if dfa.is_dead_state(sid) { + return Ok(false); + } + } + } + Ok(dfa.is_match_state(dfa.next_eoi_state(sid))) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(super) fn rev( + haystack: &[u8], + mut at: usize, + ) -> Result<bool, super::UnicodeWordBoundaryError> { + static WORD: Lazy<(DFA<Vec<u32>>, StateID)> = Lazy::new(|| { + let dfa = DFA::builder() + .configure(DFA::config().start_kind(StartKind::Anchored)) + // From ad hoc measurements, it looks like setting + // shrink==false is slightly faster than shrink==true. I kind + // of feel like this indicates that shrinking is probably a + // failure, although it can help in some cases. Sigh. + .thompson(NFA::config().reverse(true).shrink(false)) + .build(r"\w") + .unwrap(); + // OK because our regex has no look-around. + let start_id = dfa.universal_start_state(Anchored::Yes).unwrap(); + (dfa, start_id) + }); + let &(ref dfa, mut sid) = Lazy::get(&WORD); + while at > 0 { + at -= 1; + let byte = haystack[at]; + sid = dfa.next_state(sid, byte); + if dfa.is_special_state(sid) { + if dfa.is_match_state(sid) { + return Ok(true); + } else if dfa.is_dead_state(sid) { + return Ok(false); + } + } + } + Ok(dfa.is_match_state(dfa.next_eoi_state(sid))) + } +} +*/ + +/// A module that looks for word codepoints using regex-syntax's data tables. +#[cfg(all( + feature = "unicode-word-boundary", + feature = "syntax", + feature = "unicode-perl", +))] +mod is_word_char { + use regex_syntax::try_is_word_character; + + use crate::util::utf8; + + pub(super) fn check() -> Result<(), super::UnicodeWordBoundaryError> { + Ok(()) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(super) fn fwd( + haystack: &[u8], + at: usize, + ) -> Result<bool, super::UnicodeWordBoundaryError> { + Ok(match utf8::decode(&haystack[at..]) { + None | Some(Err(_)) => false, + Some(Ok(ch)) => try_is_word_character(ch).expect( + "since unicode-word-boundary, syntax and unicode-perl \ + are all enabled, it is expected that \ + try_is_word_character succeeds", + ), + }) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(super) fn rev( + haystack: &[u8], + at: usize, + ) -> Result<bool, super::UnicodeWordBoundaryError> { + Ok(match utf8::decode_last(&haystack[..at]) { + None | Some(Err(_)) => false, + Some(Ok(ch)) => try_is_word_character(ch).expect( + "since unicode-word-boundary, syntax and unicode-perl \ + are all enabled, it is expected that \ + try_is_word_character succeeds", + ), + }) + } +} + +/// A module that looks for word codepoints using regex-automata's data tables +/// (which are only compiled when regex-syntax's tables aren't available). +/// +/// Note that the cfg should match the one in src/util/unicode_data/mod.rs for +/// perl_word. +#[cfg(all( + feature = "unicode-word-boundary", + not(all(feature = "syntax", feature = "unicode-perl")), +))] +mod is_word_char { + use crate::util::utf8; + + pub(super) fn check() -> Result<(), super::UnicodeWordBoundaryError> { + Ok(()) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(super) fn fwd( + haystack: &[u8], + at: usize, + ) -> Result<bool, super::UnicodeWordBoundaryError> { + Ok(match utf8::decode(&haystack[at..]) { + None | Some(Err(_)) => false, + Some(Ok(ch)) => is_word_character(ch), + }) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(super) fn rev( + haystack: &[u8], + at: usize, + ) -> Result<bool, super::UnicodeWordBoundaryError> { + Ok(match utf8::decode_last(&haystack[..at]) { + None | Some(Err(_)) => false, + Some(Ok(ch)) => is_word_character(ch), + }) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn is_word_character(c: char) -> bool { + use crate::util::{unicode_data::perl_word::PERL_WORD, utf8}; + + // MSRV(1.59): Use 'u8::try_from(c)' instead. + if u8::try_from(u32::from(c)).map_or(false, utf8::is_word_byte) { + return true; + } + PERL_WORD + .binary_search_by(|&(start, end)| { + use core::cmp::Ordering; + + if start <= c && c <= end { + Ordering::Equal + } else if start > c { + Ordering::Greater + } else { + Ordering::Less + } + }) + .is_ok() + } +} + +/// A module that always returns an error if Unicode word boundaries are +/// disabled. When this feature is disabled, then regex-automata will not +/// include its own data tables even if regex-syntax is disabled. +#[cfg(not(feature = "unicode-word-boundary"))] +mod is_word_char { + pub(super) fn check() -> Result<(), super::UnicodeWordBoundaryError> { + Err(super::UnicodeWordBoundaryError::new()) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(super) fn fwd( + _bytes: &[u8], + _at: usize, + ) -> Result<bool, super::UnicodeWordBoundaryError> { + Err(super::UnicodeWordBoundaryError::new()) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(super) fn rev( + _bytes: &[u8], + _at: usize, + ) -> Result<bool, super::UnicodeWordBoundaryError> { + Err(super::UnicodeWordBoundaryError::new()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + macro_rules! testlook { + ($look:expr, $haystack:expr, $at:expr) => { + LookMatcher::default().matches($look, $haystack.as_bytes(), $at) + }; + } + + #[test] + fn look_matches_start_line() { + let look = Look::StartLF; + + assert!(testlook!(look, "", 0)); + assert!(testlook!(look, "\n", 0)); + assert!(testlook!(look, "\n", 1)); + assert!(testlook!(look, "a", 0)); + assert!(testlook!(look, "\na", 1)); + + assert!(!testlook!(look, "a", 1)); + assert!(!testlook!(look, "a\na", 1)); + } + + #[test] + fn look_matches_end_line() { + let look = Look::EndLF; + + assert!(testlook!(look, "", 0)); + assert!(testlook!(look, "\n", 1)); + assert!(testlook!(look, "\na", 0)); + assert!(testlook!(look, "\na", 2)); + assert!(testlook!(look, "a\na", 1)); + + assert!(!testlook!(look, "a", 0)); + assert!(!testlook!(look, "\na", 1)); + assert!(!testlook!(look, "a\na", 0)); + assert!(!testlook!(look, "a\na", 2)); + } + + #[test] + fn look_matches_start_text() { + let look = Look::Start; + + assert!(testlook!(look, "", 0)); + assert!(testlook!(look, "\n", 0)); + assert!(testlook!(look, "a", 0)); + + assert!(!testlook!(look, "\n", 1)); + assert!(!testlook!(look, "\na", 1)); + assert!(!testlook!(look, "a", 1)); + assert!(!testlook!(look, "a\na", 1)); + } + + #[test] + fn look_matches_end_text() { + let look = Look::End; + + assert!(testlook!(look, "", 0)); + assert!(testlook!(look, "\n", 1)); + assert!(testlook!(look, "\na", 2)); + + assert!(!testlook!(look, "\na", 0)); + assert!(!testlook!(look, "a\na", 1)); + assert!(!testlook!(look, "a", 0)); + assert!(!testlook!(look, "\na", 1)); + assert!(!testlook!(look, "a\na", 0)); + assert!(!testlook!(look, "a\na", 2)); + } + + #[test] + #[cfg(all(not(miri), feature = "unicode-word-boundary"))] + fn look_matches_word_unicode() { + let look = Look::WordUnicode; + + // \xF0\x9D\x9B\x83 = 𝛃 (in \w) + // \xF0\x90\x86\x80 = 𐆀 (not in \w) + + // Simple ASCII word boundaries. + assert!(testlook!(look, "a", 0)); + assert!(testlook!(look, "a", 1)); + assert!(testlook!(look, "a ", 1)); + assert!(testlook!(look, " a ", 1)); + assert!(testlook!(look, " a ", 2)); + + // Unicode word boundaries with a non-ASCII codepoint. + assert!(testlook!(look, "𝛃", 0)); + assert!(testlook!(look, "𝛃", 4)); + assert!(testlook!(look, "𝛃 ", 4)); + assert!(testlook!(look, " 𝛃 ", 1)); + assert!(testlook!(look, " 𝛃 ", 5)); + + // Unicode word boundaries between non-ASCII codepoints. + assert!(testlook!(look, "𝛃𐆀", 0)); + assert!(testlook!(look, "𝛃𐆀", 4)); + + // Non word boundaries for ASCII. + assert!(!testlook!(look, "", 0)); + assert!(!testlook!(look, "ab", 1)); + assert!(!testlook!(look, "a ", 2)); + assert!(!testlook!(look, " a ", 0)); + assert!(!testlook!(look, " a ", 3)); + + // Non word boundaries with a non-ASCII codepoint. + assert!(!testlook!(look, "𝛃b", 4)); + assert!(!testlook!(look, "𝛃 ", 5)); + assert!(!testlook!(look, " 𝛃 ", 0)); + assert!(!testlook!(look, " 𝛃 ", 6)); + assert!(!testlook!(look, "𝛃", 1)); + assert!(!testlook!(look, "𝛃", 2)); + assert!(!testlook!(look, "𝛃", 3)); + + // Non word boundaries with non-ASCII codepoints. + assert!(!testlook!(look, "𝛃𐆀", 1)); + assert!(!testlook!(look, "𝛃𐆀", 2)); + assert!(!testlook!(look, "𝛃𐆀", 3)); + assert!(!testlook!(look, "𝛃𐆀", 5)); + assert!(!testlook!(look, "𝛃𐆀", 6)); + assert!(!testlook!(look, "𝛃𐆀", 7)); + assert!(!testlook!(look, "𝛃𐆀", 8)); + } + + #[test] + fn look_matches_word_ascii() { + let look = Look::WordAscii; + + // \xF0\x9D\x9B\x83 = 𝛃 (in \w) + // \xF0\x90\x86\x80 = 𐆀 (not in \w) + + // Simple ASCII word boundaries. + assert!(testlook!(look, "a", 0)); + assert!(testlook!(look, "a", 1)); + assert!(testlook!(look, "a ", 1)); + assert!(testlook!(look, " a ", 1)); + assert!(testlook!(look, " a ", 2)); + + // Unicode word boundaries with a non-ASCII codepoint. Since this is + // an ASCII word boundary, none of these match. + assert!(!testlook!(look, "𝛃", 0)); + assert!(!testlook!(look, "𝛃", 4)); + assert!(!testlook!(look, "𝛃 ", 4)); + assert!(!testlook!(look, " 𝛃 ", 1)); + assert!(!testlook!(look, " 𝛃 ", 5)); + + // Unicode word boundaries between non-ASCII codepoints. Again, since + // this is an ASCII word boundary, none of these match. + assert!(!testlook!(look, "𝛃𐆀", 0)); + assert!(!testlook!(look, "𝛃𐆀", 4)); + + // Non word boundaries for ASCII. + assert!(!testlook!(look, "", 0)); + assert!(!testlook!(look, "ab", 1)); + assert!(!testlook!(look, "a ", 2)); + assert!(!testlook!(look, " a ", 0)); + assert!(!testlook!(look, " a ", 3)); + + // Non word boundaries with a non-ASCII codepoint. + assert!(testlook!(look, "𝛃b", 4)); + assert!(!testlook!(look, "𝛃 ", 5)); + assert!(!testlook!(look, " 𝛃 ", 0)); + assert!(!testlook!(look, " 𝛃 ", 6)); + assert!(!testlook!(look, "𝛃", 1)); + assert!(!testlook!(look, "𝛃", 2)); + assert!(!testlook!(look, "𝛃", 3)); + + // Non word boundaries with non-ASCII codepoints. + assert!(!testlook!(look, "𝛃𐆀", 1)); + assert!(!testlook!(look, "𝛃𐆀", 2)); + assert!(!testlook!(look, "𝛃𐆀", 3)); + assert!(!testlook!(look, "𝛃𐆀", 5)); + assert!(!testlook!(look, "𝛃𐆀", 6)); + assert!(!testlook!(look, "𝛃𐆀", 7)); + assert!(!testlook!(look, "𝛃𐆀", 8)); + } + + #[test] + #[cfg(all(not(miri), feature = "unicode-word-boundary"))] + fn look_matches_word_unicode_negate() { + let look = Look::WordUnicodeNegate; + + // \xF0\x9D\x9B\x83 = 𝛃 (in \w) + // \xF0\x90\x86\x80 = 𐆀 (not in \w) + + // Simple ASCII word boundaries. + assert!(!testlook!(look, "a", 0)); + assert!(!testlook!(look, "a", 1)); + assert!(!testlook!(look, "a ", 1)); + assert!(!testlook!(look, " a ", 1)); + assert!(!testlook!(look, " a ", 2)); + + // Unicode word boundaries with a non-ASCII codepoint. + assert!(!testlook!(look, "𝛃", 0)); + assert!(!testlook!(look, "𝛃", 4)); + assert!(!testlook!(look, "𝛃 ", 4)); + assert!(!testlook!(look, " 𝛃 ", 1)); + assert!(!testlook!(look, " 𝛃 ", 5)); + + // Unicode word boundaries between non-ASCII codepoints. + assert!(!testlook!(look, "𝛃𐆀", 0)); + assert!(!testlook!(look, "𝛃𐆀", 4)); + + // Non word boundaries for ASCII. + assert!(testlook!(look, "", 0)); + assert!(testlook!(look, "ab", 1)); + assert!(testlook!(look, "a ", 2)); + assert!(testlook!(look, " a ", 0)); + assert!(testlook!(look, " a ", 3)); + + // Non word boundaries with a non-ASCII codepoint. + assert!(testlook!(look, "𝛃b", 4)); + assert!(testlook!(look, "𝛃 ", 5)); + assert!(testlook!(look, " 𝛃 ", 0)); + assert!(testlook!(look, " 𝛃 ", 6)); + // These don't match because they could otherwise return an offset that + // splits the UTF-8 encoding of a codepoint. + assert!(!testlook!(look, "𝛃", 1)); + assert!(!testlook!(look, "𝛃", 2)); + assert!(!testlook!(look, "𝛃", 3)); + + // Non word boundaries with non-ASCII codepoints. These also don't + // match because they could otherwise return an offset that splits the + // UTF-8 encoding of a codepoint. + assert!(!testlook!(look, "𝛃𐆀", 1)); + assert!(!testlook!(look, "𝛃𐆀", 2)); + assert!(!testlook!(look, "𝛃𐆀", 3)); + assert!(!testlook!(look, "𝛃𐆀", 5)); + assert!(!testlook!(look, "𝛃𐆀", 6)); + assert!(!testlook!(look, "𝛃𐆀", 7)); + // But this one does, since 𐆀 isn't a word codepoint, and 8 is the end + // of the haystack. So the "end" of the haystack isn't a word and 𐆀 + // isn't a word, thus, \B matches. + assert!(testlook!(look, "𝛃𐆀", 8)); + } + + #[test] + fn look_matches_word_ascii_negate() { + let look = Look::WordAsciiNegate; + + // \xF0\x9D\x9B\x83 = 𝛃 (in \w) + // \xF0\x90\x86\x80 = 𐆀 (not in \w) + + // Simple ASCII word boundaries. + assert!(!testlook!(look, "a", 0)); + assert!(!testlook!(look, "a", 1)); + assert!(!testlook!(look, "a ", 1)); + assert!(!testlook!(look, " a ", 1)); + assert!(!testlook!(look, " a ", 2)); + + // Unicode word boundaries with a non-ASCII codepoint. Since this is + // an ASCII word boundary, none of these match. + assert!(testlook!(look, "𝛃", 0)); + assert!(testlook!(look, "𝛃", 4)); + assert!(testlook!(look, "𝛃 ", 4)); + assert!(testlook!(look, " 𝛃 ", 1)); + assert!(testlook!(look, " 𝛃 ", 5)); + + // Unicode word boundaries between non-ASCII codepoints. Again, since + // this is an ASCII word boundary, none of these match. + assert!(testlook!(look, "𝛃𐆀", 0)); + assert!(testlook!(look, "𝛃𐆀", 4)); + + // Non word boundaries for ASCII. + assert!(testlook!(look, "", 0)); + assert!(testlook!(look, "ab", 1)); + assert!(testlook!(look, "a ", 2)); + assert!(testlook!(look, " a ", 0)); + assert!(testlook!(look, " a ", 3)); + + // Non word boundaries with a non-ASCII codepoint. + assert!(!testlook!(look, "𝛃b", 4)); + assert!(testlook!(look, "𝛃 ", 5)); + assert!(testlook!(look, " 𝛃 ", 0)); + assert!(testlook!(look, " 𝛃 ", 6)); + assert!(testlook!(look, "𝛃", 1)); + assert!(testlook!(look, "𝛃", 2)); + assert!(testlook!(look, "𝛃", 3)); + + // Non word boundaries with non-ASCII codepoints. + assert!(testlook!(look, "𝛃𐆀", 1)); + assert!(testlook!(look, "𝛃𐆀", 2)); + assert!(testlook!(look, "𝛃𐆀", 3)); + assert!(testlook!(look, "𝛃𐆀", 5)); + assert!(testlook!(look, "𝛃𐆀", 6)); + assert!(testlook!(look, "𝛃𐆀", 7)); + assert!(testlook!(look, "𝛃𐆀", 8)); + } + + #[test] + fn look_set() { + let mut f = LookSet::default(); + assert!(!f.contains(Look::Start)); + assert!(!f.contains(Look::End)); + assert!(!f.contains(Look::StartLF)); + assert!(!f.contains(Look::EndLF)); + assert!(!f.contains(Look::WordUnicode)); + assert!(!f.contains(Look::WordUnicodeNegate)); + assert!(!f.contains(Look::WordAscii)); + assert!(!f.contains(Look::WordAsciiNegate)); + + f = f.insert(Look::Start); + assert!(f.contains(Look::Start)); + f = f.remove(Look::Start); + assert!(!f.contains(Look::Start)); + + f = f.insert(Look::End); + assert!(f.contains(Look::End)); + f = f.remove(Look::End); + assert!(!f.contains(Look::End)); + + f = f.insert(Look::StartLF); + assert!(f.contains(Look::StartLF)); + f = f.remove(Look::StartLF); + assert!(!f.contains(Look::StartLF)); + + f = f.insert(Look::EndLF); + assert!(f.contains(Look::EndLF)); + f = f.remove(Look::EndLF); + assert!(!f.contains(Look::EndLF)); + + f = f.insert(Look::StartCRLF); + assert!(f.contains(Look::StartCRLF)); + f = f.remove(Look::StartCRLF); + assert!(!f.contains(Look::StartCRLF)); + + f = f.insert(Look::EndCRLF); + assert!(f.contains(Look::EndCRLF)); + f = f.remove(Look::EndCRLF); + assert!(!f.contains(Look::EndCRLF)); + + f = f.insert(Look::WordUnicode); + assert!(f.contains(Look::WordUnicode)); + f = f.remove(Look::WordUnicode); + assert!(!f.contains(Look::WordUnicode)); + + f = f.insert(Look::WordUnicodeNegate); + assert!(f.contains(Look::WordUnicodeNegate)); + f = f.remove(Look::WordUnicodeNegate); + assert!(!f.contains(Look::WordUnicodeNegate)); + + f = f.insert(Look::WordAscii); + assert!(f.contains(Look::WordAscii)); + f = f.remove(Look::WordAscii); + assert!(!f.contains(Look::WordAscii)); + + f = f.insert(Look::WordAsciiNegate); + assert!(f.contains(Look::WordAsciiNegate)); + f = f.remove(Look::WordAsciiNegate); + assert!(!f.contains(Look::WordAsciiNegate)); + } + + #[test] + fn look_set_iter() { + let set = LookSet::empty(); + assert_eq!(0, set.iter().count()); + + let set = LookSet::full(); + assert_eq!(10, set.iter().count()); + + let set = + LookSet::empty().insert(Look::StartLF).insert(Look::WordUnicode); + assert_eq!(2, set.iter().count()); + + let set = LookSet::empty().insert(Look::StartLF); + assert_eq!(1, set.iter().count()); + + let set = LookSet::empty().insert(Look::WordAsciiNegate); + assert_eq!(1, set.iter().count()); + } + + #[test] + #[cfg(feature = "alloc")] + fn look_set_debug() { + let res = alloc::format!("{:?}", LookSet::empty()); + assert_eq!("∅", res); + let res = alloc::format!("{:?}", LookSet::full()); + assert_eq!("Az^$rRbB𝛃𝚩", res); + } +} diff --git a/third_party/rust/regex-automata/src/util/memchr.rs b/third_party/rust/regex-automata/src/util/memchr.rs new file mode 100644 index 0000000000..a2cbb07321 --- /dev/null +++ b/third_party/rust/regex-automata/src/util/memchr.rs @@ -0,0 +1,93 @@ +/*! +This module defines simple wrapper routines for the memchr functions from the +`memchr` crate. Basically, when the `memchr` crate is available, we use it, +otherwise we use a naive implementation which is still pretty fast. +*/ + +pub(crate) use self::inner::*; + +#[cfg(feature = "perf-literal-substring")] +pub(super) mod inner { + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn memchr(n1: u8, haystack: &[u8]) -> Option<usize> { + memchr::memchr(n1, haystack) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn memchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> { + memchr::memchr2(n1, n2, haystack) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn memchr3( + n1: u8, + n2: u8, + n3: u8, + haystack: &[u8], + ) -> Option<usize> { + memchr::memchr3(n1, n2, n3, haystack) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn memrchr(n1: u8, haystack: &[u8]) -> Option<usize> { + memchr::memrchr(n1, haystack) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn memrchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> { + memchr::memrchr2(n1, n2, haystack) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn memrchr3( + n1: u8, + n2: u8, + n3: u8, + haystack: &[u8], + ) -> Option<usize> { + memchr::memrchr3(n1, n2, n3, haystack) + } +} + +#[cfg(not(feature = "perf-literal-substring"))] +pub(super) mod inner { + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn memchr(n1: u8, haystack: &[u8]) -> Option<usize> { + haystack.iter().position(|&b| b == n1) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn memchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> { + haystack.iter().position(|&b| b == n1 || b == n2) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn memchr3( + n1: u8, + n2: u8, + n3: u8, + haystack: &[u8], + ) -> Option<usize> { + haystack.iter().position(|&b| b == n1 || b == n2 || b == n3) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn memrchr(n1: u8, haystack: &[u8]) -> Option<usize> { + haystack.iter().rposition(|&b| b == n1) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn memrchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> { + haystack.iter().rposition(|&b| b == n1 || b == n2) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn memrchr3( + n1: u8, + n2: u8, + n3: u8, + haystack: &[u8], + ) -> Option<usize> { + haystack.iter().rposition(|&b| b == n1 || b == n2 || b == n3) + } +} diff --git a/third_party/rust/regex-automata/src/util/mod.rs b/third_party/rust/regex-automata/src/util/mod.rs new file mode 100644 index 0000000000..bb739df1df --- /dev/null +++ b/third_party/rust/regex-automata/src/util/mod.rs @@ -0,0 +1,57 @@ +/*! +A collection of modules that provide APIs that are useful across many regex +engines. + +While one should explore the sub-modules directly to get a sense of what's +there, here are some highlights that tie the sub-modules to higher level +use cases: + +* `alphabet` contains APIs that are useful if you're doing low level things +with the DFAs in this crate. For example, implementing determinization or +walking its state graph directly. +* `captures` contains APIs for dealing with capture group matches and their +mapping to "slots" used inside an NFA graph. This is also where you can find +iterators over capture group names. +* `escape` contains types for pretty-printing raw byte slices as strings. +* `iter` contains API helpers for writing regex iterators. +* `lazy` contains a no-std and no-alloc variant of `lazy_static!` and +`once_cell`. +* `look` contains APIs for matching and configuring look-around assertions. +* `pool` provides a way to reuse mutable memory allocated in a thread safe +manner. +* `prefilter` provides APIs for building prefilters and using them in searches. +* `primitives` are what you might use if you're doing lower level work on +automata, such as walking an NFA state graph. +* `syntax` provides some higher level convenience functions for interacting +with the `regex-syntax` crate. +* `wire` is useful if you're working with DFA serialization. +*/ + +pub mod alphabet; +#[cfg(feature = "alloc")] +pub mod captures; +pub mod escape; +#[cfg(feature = "alloc")] +pub mod interpolate; +pub mod iter; +pub mod lazy; +pub mod look; +#[cfg(feature = "alloc")] +pub mod pool; +pub mod prefilter; +pub mod primitives; +#[cfg(feature = "syntax")] +pub mod syntax; +pub mod wire; + +#[cfg(any(feature = "dfa-build", feature = "hybrid"))] +pub(crate) mod determinize; +pub(crate) mod empty; +pub(crate) mod int; +pub(crate) mod memchr; +pub(crate) mod search; +#[cfg(feature = "alloc")] +pub(crate) mod sparse_set; +pub(crate) mod start; +pub(crate) mod unicode_data; +pub(crate) mod utf8; diff --git a/third_party/rust/regex-automata/src/util/pool.rs b/third_party/rust/regex-automata/src/util/pool.rs new file mode 100644 index 0000000000..7f4a1c21e2 --- /dev/null +++ b/third_party/rust/regex-automata/src/util/pool.rs @@ -0,0 +1,993 @@ +// This module provides a relatively simple thread-safe pool of reusable +// objects. For the most part, it's implemented by a stack represented by a +// Mutex<Vec<T>>. It has one small trick: because unlocking a mutex is somewhat +// costly, in the case where a pool is accessed by the first thread that tried +// to get a value, we bypass the mutex. Here are some benchmarks showing the +// difference. +// +// 2022-10-15: These benchmarks are from the old regex crate and they aren't +// easy to reproduce because some rely on older implementations of Pool that +// are no longer around. I've left the results here for posterity, but any +// enterprising individual should feel encouraged to re-litigate the way Pool +// works. I am not at all certain it is the best approach. +// +// 1) misc::anchored_literal_long_non_match 21 (18571 MB/s) +// 2) misc::anchored_literal_long_non_match 107 (3644 MB/s) +// 3) misc::anchored_literal_long_non_match 45 (8666 MB/s) +// 4) misc::anchored_literal_long_non_match 19 (20526 MB/s) +// +// (1) represents our baseline: the master branch at the time of writing when +// using the 'thread_local' crate to implement the pool below. +// +// (2) represents a naive pool implemented completely via Mutex<Vec<T>>. There +// is no special trick for bypassing the mutex. +// +// (3) is the same as (2), except it uses Mutex<Vec<Box<T>>>. It is twice as +// fast because a Box<T> is much smaller than the T we use with a Pool in this +// crate. So pushing and popping a Box<T> from a Vec is quite a bit faster +// than for T. +// +// (4) is the same as (3), but with the trick for bypassing the mutex in the +// case of the first-to-get thread. +// +// Why move off of thread_local? Even though (4) is a hair faster than (1) +// above, this was not the main goal. The main goal was to move off of +// thread_local and find a way to *simply* re-capture some of its speed for +// regex's specific case. So again, why move off of it? The *primary* reason is +// because of memory leaks. See https://github.com/rust-lang/regex/issues/362 +// for example. (Why do I want it to be simple? Well, I suppose what I mean is, +// "use as much safe code as possible to minimize risk and be as sure as I can +// be that it is correct.") +// +// My guess is that the thread_local design is probably not appropriate for +// regex since its memory usage scales to the number of active threads that +// have used a regex, where as the pool below scales to the number of threads +// that simultaneously use a regex. While neither case permits contraction, +// since we own the pool data structure below, we can add contraction if a +// clear use case pops up in the wild. More pressingly though, it seems that +// there are at least some use case patterns where one might have many threads +// sitting around that might have used a regex at one point. While thread_local +// does try to reuse space previously used by a thread that has since stopped, +// its maximal memory usage still scales with the total number of active +// threads. In contrast, the pool below scales with the total number of threads +// *simultaneously* using the pool. The hope is that this uses less memory +// overall. And if it doesn't, we can hopefully tune it somehow. +// +// It seems that these sort of conditions happen frequently +// in FFI inside of other more "managed" languages. This was +// mentioned in the issue linked above, and also mentioned here: +// https://github.com/BurntSushi/rure-go/issues/3. And in particular, users +// confirm that disabling the use of thread_local resolves the leak. +// +// There were other weaker reasons for moving off of thread_local as well. +// Namely, at the time, I was looking to reduce dependencies. And for something +// like regex, maintenance can be simpler when we own the full dependency tree. +// +// Note that I am not entirely happy with this pool. It has some subtle +// implementation details and is overall still observable (even with the +// thread owner optimization) in benchmarks. If someone wants to take a crack +// at building something better, please file an issue. Even if it means a +// different API. The API exposed by this pool is not the minimal thing that +// something like a 'Regex' actually needs. It could adapt to, for example, +// an API more like what is found in the 'thread_local' crate. However, we do +// really need to support the no-std alloc-only context, or else the regex +// crate wouldn't be able to support no-std alloc-only. However, I'm generally +// okay with making the alloc-only context slower (as it is here), although I +// do find it unfortunate. + +/*! +A thread safe memory pool. + +The principal type in this module is a [`Pool`]. It main use case is for +holding a thread safe collection of mutable scratch spaces (usually called +`Cache` in this crate) that regex engines need to execute a search. This then +permits sharing the same read-only regex object across multiple threads while +having a quick way of reusing scratch space in a thread safe way. This avoids +needing to re-create the scratch space for every search, which could wind up +being quite expensive. +*/ + +/// A thread safe pool that works in an `alloc`-only context. +/// +/// Getting a value out comes with a guard. When that guard is dropped, the +/// value is automatically put back in the pool. The guard provides both a +/// `Deref` and a `DerefMut` implementation for easy access to an underlying +/// `T`. +/// +/// A `Pool` impls `Sync` when `T` is `Send` (even if `T` is not `Sync`). This +/// is possible because a pool is guaranteed to provide a value to exactly one +/// thread at any time. +/// +/// Currently, a pool never contracts in size. Its size is proportional to the +/// maximum number of simultaneous uses. This may change in the future. +/// +/// A `Pool` is a particularly useful data structure for this crate because +/// many of the regex engines require a mutable "cache" in order to execute +/// a search. Since regexes themselves tend to be global, the problem is then: +/// how do you get a mutable cache to execute a search? You could: +/// +/// 1. Use a `thread_local!`, which requires the standard library and requires +/// that the regex pattern be statically known. +/// 2. Use a `Pool`. +/// 3. Make the cache an explicit dependency in your code and pass it around. +/// 4. Put the cache state in a `Mutex`, but this means only one search can +/// execute at a time. +/// 5. Create a new cache for every search. +/// +/// A `thread_local!` is perhaps the best choice if it works for your use case. +/// Putting the cache in a mutex or creating a new cache for every search are +/// perhaps the worst choices. Of the remaining two choices, whether you use +/// this `Pool` or thread through a cache explicitly in your code is a matter +/// of taste and depends on your code architecture. +/// +/// # Warning: may use a spin lock +/// +/// When this crate is compiled _without_ the `std` feature, then this type +/// may used a spin lock internally. This can have subtle effects that may +/// be undesirable. See [Spinlocks Considered Harmful][spinharm] for a more +/// thorough treatment of this topic. +/// +/// [spinharm]: https://matklad.github.io/2020/01/02/spinlocks-considered-harmful.html +/// +/// # Example +/// +/// This example shows how to share a single hybrid regex among multiple +/// threads, while also safely getting exclusive access to a hybrid's +/// [`Cache`](crate::hybrid::regex::Cache) without preventing other searches +/// from running while your thread uses the `Cache`. +/// +/// ``` +/// use regex_automata::{ +/// hybrid::regex::{Cache, Regex}, +/// util::{lazy::Lazy, pool::Pool}, +/// Match, +/// }; +/// +/// static RE: Lazy<Regex> = +/// Lazy::new(|| Regex::new("foo[0-9]+bar").unwrap()); +/// static CACHE: Lazy<Pool<Cache>> = +/// Lazy::new(|| Pool::new(|| RE.create_cache())); +/// +/// let expected = Some(Match::must(0, 3..14)); +/// assert_eq!(expected, RE.find(&mut CACHE.get(), b"zzzfoo12345barzzz")); +/// ``` +pub struct Pool<T, F = fn() -> T>(alloc::boxed::Box<inner::Pool<T, F>>); + +impl<T, F> Pool<T, F> { + /// Create a new pool. The given closure is used to create values in + /// the pool when necessary. + pub fn new(create: F) -> Pool<T, F> { + Pool(alloc::boxed::Box::new(inner::Pool::new(create))) + } +} + +impl<T: Send, F: Fn() -> T> Pool<T, F> { + /// Get a value from the pool. The caller is guaranteed to have + /// exclusive access to the given value. Namely, it is guaranteed that + /// this will never return a value that was returned by another call to + /// `get` but was not put back into the pool. + /// + /// When the guard goes out of scope and its destructor is called, then + /// it will automatically be put back into the pool. Alternatively, + /// [`PoolGuard::put`] may be used to explicitly put it back in the pool + /// without relying on its destructor. + /// + /// Note that there is no guarantee provided about which value in the + /// pool is returned. That is, calling get, dropping the guard (causing + /// the value to go back into the pool) and then calling get again is + /// *not* guaranteed to return the same value received in the first `get` + /// call. + pub fn get(&self) -> PoolGuard<'_, T, F> { + PoolGuard(self.0.get()) + } +} + +impl<T: core::fmt::Debug, F> core::fmt::Debug for Pool<T, F> { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + f.debug_tuple("Pool").field(&self.0).finish() + } +} + +/// A guard that is returned when a caller requests a value from the pool. +/// +/// The purpose of the guard is to use RAII to automatically put the value +/// back in the pool once it's dropped. +pub struct PoolGuard<'a, T: Send, F: Fn() -> T>(inner::PoolGuard<'a, T, F>); + +impl<'a, T: Send, F: Fn() -> T> PoolGuard<'a, T, F> { + /// Consumes this guard and puts it back into the pool. + /// + /// This circumvents the guard's `Drop` implementation. This can be useful + /// in circumstances where the automatic `Drop` results in poorer codegen, + /// such as calling non-inlined functions. + pub fn put(this: PoolGuard<'_, T, F>) { + inner::PoolGuard::put(this.0); + } +} + +impl<'a, T: Send, F: Fn() -> T> core::ops::Deref for PoolGuard<'a, T, F> { + type Target = T; + + fn deref(&self) -> &T { + self.0.value() + } +} + +impl<'a, T: Send, F: Fn() -> T> core::ops::DerefMut for PoolGuard<'a, T, F> { + fn deref_mut(&mut self) -> &mut T { + self.0.value_mut() + } +} + +impl<'a, T: Send + core::fmt::Debug, F: Fn() -> T> core::fmt::Debug + for PoolGuard<'a, T, F> +{ + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + f.debug_tuple("PoolGuard").field(&self.0).finish() + } +} + +#[cfg(feature = "std")] +mod inner { + use core::{ + cell::UnsafeCell, + panic::{RefUnwindSafe, UnwindSafe}, + sync::atomic::{AtomicUsize, Ordering}, + }; + + use alloc::{boxed::Box, vec, vec::Vec}; + + use std::{sync::Mutex, thread_local}; + + /// An atomic counter used to allocate thread IDs. + /// + /// We specifically start our counter at 3 so that we can use the values + /// less than it as sentinels. + static COUNTER: AtomicUsize = AtomicUsize::new(3); + + /// A thread ID indicating that there is no owner. This is the initial + /// state of a pool. Once a pool has an owner, there is no way to change + /// it. + static THREAD_ID_UNOWNED: usize = 0; + + /// A thread ID indicating that the special owner value is in use and not + /// available. This state is useful for avoiding a case where the owner + /// of a pool calls `get` before putting the result of a previous `get` + /// call back into the pool. + static THREAD_ID_INUSE: usize = 1; + + /// This sentinel is used to indicate that a guard has already been dropped + /// and should not be re-dropped. We use this because our drop code can be + /// called outside of Drop and thus there could be a bug in the internal + /// implementation that results in trying to put the same guard back into + /// the same pool multiple times, and *that* could result in UB if we + /// didn't mark the guard as already having been put back in the pool. + /// + /// So this isn't strictly necessary, but this let's us define some + /// routines as safe (like PoolGuard::put_imp) that we couldn't otherwise + /// do. + static THREAD_ID_DROPPED: usize = 2; + + thread_local!( + /// A thread local used to assign an ID to a thread. + static THREAD_ID: usize = { + let next = COUNTER.fetch_add(1, Ordering::Relaxed); + // SAFETY: We cannot permit the reuse of thread IDs since reusing a + // thread ID might result in more than one thread "owning" a pool, + // and thus, permit accessing a mutable value from multiple threads + // simultaneously without synchronization. The intent of this panic + // is to be a sanity check. It is not expected that the thread ID + // space will actually be exhausted in practice. Even on a 32-bit + // system, it would require spawning 2^32 threads (although they + // wouldn't all need to run simultaneously, so it is in theory + // possible). + // + // This checks that the counter never wraps around, since atomic + // addition wraps around on overflow. + if next == 0 { + panic!("regex: thread ID allocation space exhausted"); + } + next + }; + ); + + /// A thread safe pool utilizing std-only features. + /// + /// The main difference between this and the simplistic alloc-only pool is + /// the use of std::sync::Mutex and an "owner thread" optimization that + /// makes accesses by the owner of a pool faster than all other threads. + /// This makes the common case of running a regex within a single thread + /// faster by avoiding mutex unlocking. + pub(super) struct Pool<T, F> { + /// A stack of T values to hand out. These are used when a Pool is + /// accessed by a thread that didn't create it. + stack: Mutex<Vec<Box<T>>>, + /// A function to create more T values when stack is empty and a caller + /// has requested a T. + create: F, + /// The ID of the thread that owns this pool. The owner is the thread + /// that makes the first call to 'get'. When the owner calls 'get', it + /// gets 'owner_val' directly instead of returning a T from 'stack'. + /// See comments elsewhere for details, but this is intended to be an + /// optimization for the common case that makes getting a T faster. + /// + /// It is initialized to a value of zero (an impossible thread ID) as a + /// sentinel to indicate that it is unowned. + owner: AtomicUsize, + /// A value to return when the caller is in the same thread that + /// first called `Pool::get`. + /// + /// This is set to None when a Pool is first created, and set to Some + /// once the first thread calls Pool::get. + owner_val: UnsafeCell<Option<T>>, + } + + // SAFETY: Since we want to use a Pool from multiple threads simultaneously + // behind an Arc, we need for it to be Sync. In cases where T is sync, + // Pool<T> would be Sync. However, since we use a Pool to store mutable + // scratch space, we wind up using a T that has interior mutability and is + // thus itself not Sync. So what we *really* want is for our Pool<T> to by + // Sync even when T is not Sync (but is at least Send). + // + // The only non-sync aspect of a Pool is its 'owner_val' field, which is + // used to implement faster access to a pool value in the common case of + // a pool being accessed in the same thread in which it was created. The + // 'stack' field is also shared, but a Mutex<T> where T: Send is already + // Sync. So we only need to worry about 'owner_val'. + // + // The key is to guarantee that 'owner_val' can only ever be accessed from + // one thread. In our implementation below, we guarantee this by only + // returning the 'owner_val' when the ID of the current thread matches the + // ID of the thread that first called 'Pool::get'. Since this can only ever + // be one thread, it follows that only one thread can access 'owner_val' at + // any point in time. Thus, it is safe to declare that Pool<T> is Sync when + // T is Send. + // + // If there is a way to achieve our performance goals using safe code, then + // I would very much welcome a patch. As it stands, the implementation + // below tries to balance safety with performance. The case where a Regex + // is used from multiple threads simultaneously will suffer a bit since + // getting a value out of the pool will require unlocking a mutex. + // + // We require `F: Send + Sync` because we call `F` at any point on demand, + // potentially from multiple threads simultaneously. + unsafe impl<T: Send, F: Send + Sync> Sync for Pool<T, F> {} + + // If T is UnwindSafe, then since we provide exclusive access to any + // particular value in the pool, it should therefore also be considered + // RefUnwindSafe. Also, since we use std::sync::Mutex, we get poisoning + // from it if another thread panics while the lock is held. + // + // We require `F: UnwindSafe + RefUnwindSafe` because we call `F` at any + // point on demand, so it needs to be unwind safe on both dimensions for + // the entire Pool to be unwind safe. + impl<T: UnwindSafe, F: UnwindSafe + RefUnwindSafe> RefUnwindSafe + for Pool<T, F> + { + } + + impl<T, F> Pool<T, F> { + /// Create a new pool. The given closure is used to create values in + /// the pool when necessary. + pub(super) fn new(create: F) -> Pool<T, F> { + // MSRV(1.63): Mark this function as 'const'. I've arranged the + // code such that it should "just work." Then mark the public + // 'Pool::new' method as 'const' too. (The alloc-only Pool::new + // is already 'const', so that should "just work" too.) The only + // thing we're waiting for is Mutex::new to be const. + let owner = AtomicUsize::new(THREAD_ID_UNOWNED); + let owner_val = UnsafeCell::new(None); // init'd on first access + Pool { stack: Mutex::new(vec![]), create, owner, owner_val } + } + } + + impl<T: Send, F: Fn() -> T> Pool<T, F> { + /// Get a value from the pool. This may block if another thread is also + /// attempting to retrieve a value from the pool. + pub(super) fn get(&self) -> PoolGuard<'_, T, F> { + // Our fast path checks if the caller is the thread that "owns" + // this pool. Or stated differently, whether it is the first thread + // that tried to extract a value from the pool. If it is, then we + // can return a T to the caller without going through a mutex. + // + // SAFETY: We must guarantee that only one thread gets access + // to this value. Since a thread is uniquely identified by the + // THREAD_ID thread local, it follows that if the caller's thread + // ID is equal to the owner, then only one thread may receive this + // value. This is also why we can get away with what looks like a + // racy load and a store. We know that if 'owner == caller', then + // only one thread can be here, so we don't need to worry about any + // other thread setting the owner to something else. + let caller = THREAD_ID.with(|id| *id); + let owner = self.owner.load(Ordering::Acquire); + if caller == owner { + self.owner.store(THREAD_ID_INUSE, Ordering::Release); + return self.guard_owned(caller); + } + self.get_slow(caller, owner) + } + + /// This is the "slow" version that goes through a mutex to pop an + /// allocated value off a stack to return to the caller. (Or, if the + /// stack is empty, a new value is created.) + /// + /// If the pool has no owner, then this will set the owner. + #[cold] + fn get_slow( + &self, + caller: usize, + owner: usize, + ) -> PoolGuard<'_, T, F> { + if owner == THREAD_ID_UNOWNED { + // This sentinel means this pool is not yet owned. We try to + // atomically set the owner. If we do, then this thread becomes + // the owner and we can return a guard that represents the + // special T for the owner. + // + // Note that we set the owner to a different sentinel that + // indicates that the owned value is in use. The owner ID will + // get updated to the actual ID of this thread once the guard + // returned by this function is put back into the pool. + let res = self.owner.compare_exchange( + THREAD_ID_UNOWNED, + THREAD_ID_INUSE, + Ordering::AcqRel, + Ordering::Acquire, + ); + if res.is_ok() { + // SAFETY: A successful CAS above implies this thread is + // the owner and that this is the only such thread that + // can reach here. Thus, there is no data race. + unsafe { + *self.owner_val.get() = Some((self.create)()); + } + return self.guard_owned(caller); + } + } + let mut stack = self.stack.lock().unwrap(); + let value = match stack.pop() { + None => Box::new((self.create)()), + Some(value) => value, + }; + self.guard_stack(value) + } + + /// Puts a value back into the pool. Callers don't need to call this. + /// Once the guard that's returned by 'get' is dropped, it is put back + /// into the pool automatically. + fn put_value(&self, value: Box<T>) { + let mut stack = self.stack.lock().unwrap(); + stack.push(value); + } + + /// Create a guard that represents the special owned T. + fn guard_owned(&self, caller: usize) -> PoolGuard<'_, T, F> { + PoolGuard { pool: self, value: Err(caller) } + } + + /// Create a guard that contains a value from the pool's stack. + fn guard_stack(&self, value: Box<T>) -> PoolGuard<'_, T, F> { + PoolGuard { pool: self, value: Ok(value) } + } + } + + impl<T: core::fmt::Debug, F> core::fmt::Debug for Pool<T, F> { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + f.debug_struct("Pool") + .field("stack", &self.stack) + .field("owner", &self.owner) + .field("owner_val", &self.owner_val) + .finish() + } + } + + /// A guard that is returned when a caller requests a value from the pool. + pub(super) struct PoolGuard<'a, T: Send, F: Fn() -> T> { + /// The pool that this guard is attached to. + pool: &'a Pool<T, F>, + /// This is Err when the guard represents the special "owned" value. + /// In which case, the value is retrieved from 'pool.owner_val'. And + /// in the special case of `Err(THREAD_ID_DROPPED)`, it means the + /// guard has been put back into the pool and should no longer be used. + value: Result<Box<T>, usize>, + } + + impl<'a, T: Send, F: Fn() -> T> PoolGuard<'a, T, F> { + /// Return the underlying value. + pub(super) fn value(&self) -> &T { + match self.value { + Ok(ref v) => &**v, + // SAFETY: This is safe because the only way a PoolGuard gets + // created for self.value=Err is when the current thread + // corresponds to the owning thread, of which there can only + // be one. Thus, we are guaranteed to be providing exclusive + // access here which makes this safe. + // + // Also, since 'owner_val' is guaranteed to be initialized + // before an owned PoolGuard is created, the unchecked unwrap + // is safe. + Err(id) => unsafe { + // This assert is *not* necessary for safety, since we + // should never be here if the guard had been put back into + // the pool. This is a sanity check to make sure we didn't + // break an internal invariant. + debug_assert_ne!(THREAD_ID_DROPPED, id); + (*self.pool.owner_val.get()).as_ref().unwrap_unchecked() + }, + } + } + + /// Return the underlying value as a mutable borrow. + pub(super) fn value_mut(&mut self) -> &mut T { + match self.value { + Ok(ref mut v) => &mut **v, + // SAFETY: This is safe because the only way a PoolGuard gets + // created for self.value=None is when the current thread + // corresponds to the owning thread, of which there can only + // be one. Thus, we are guaranteed to be providing exclusive + // access here which makes this safe. + // + // Also, since 'owner_val' is guaranteed to be initialized + // before an owned PoolGuard is created, the unwrap_unchecked + // is safe. + Err(id) => unsafe { + // This assert is *not* necessary for safety, since we + // should never be here if the guard had been put back into + // the pool. This is a sanity check to make sure we didn't + // break an internal invariant. + debug_assert_ne!(THREAD_ID_DROPPED, id); + (*self.pool.owner_val.get()).as_mut().unwrap_unchecked() + }, + } + } + + /// Consumes this guard and puts it back into the pool. + pub(super) fn put(this: PoolGuard<'_, T, F>) { + // Since this is effectively consuming the guard and putting the + // value back into the pool, there's no reason to run its Drop + // impl after doing this. I don't believe there is a correctness + // problem with doing so, but there's definitely a perf problem + // by redoing this work. So we avoid it. + let mut this = core::mem::ManuallyDrop::new(this); + this.put_imp(); + } + + /// Puts this guard back into the pool by only borrowing the guard as + /// mutable. This should be called at most once. + #[inline(always)] + fn put_imp(&mut self) { + match core::mem::replace(&mut self.value, Err(THREAD_ID_DROPPED)) { + Ok(value) => self.pool.put_value(value), + // If this guard has a value "owned" by the thread, then + // the Pool guarantees that this is the ONLY such guard. + // Therefore, in order to place it back into the pool and make + // it available, we need to change the owner back to the owning + // thread's ID. But note that we use the ID that was stored in + // the guard, since a guard can be moved to another thread and + // dropped. (A previous iteration of this code read from the + // THREAD_ID thread local, which uses the ID of the current + // thread which may not be the ID of the owning thread! This + // also avoids the TLS access, which is likely a hair faster.) + Err(owner) => { + // If we hit this point, it implies 'put_imp' has been + // called multiple times for the same guard which in turn + // corresponds to a bug in this implementation. + assert_ne!(THREAD_ID_DROPPED, owner); + self.pool.owner.store(owner, Ordering::Release); + } + } + } + } + + impl<'a, T: Send, F: Fn() -> T> Drop for PoolGuard<'a, T, F> { + fn drop(&mut self) { + self.put_imp(); + } + } + + impl<'a, T: Send + core::fmt::Debug, F: Fn() -> T> core::fmt::Debug + for PoolGuard<'a, T, F> + { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + f.debug_struct("PoolGuard") + .field("pool", &self.pool) + .field("value", &self.value) + .finish() + } + } +} + +// FUTURE: We should consider using Mara Bos's nearly-lock-free version of this +// here: https://gist.github.com/m-ou-se/5fdcbdf7dcf4585199ce2de697f367a4. +// +// One reason why I did things with a "mutex" below is that it isolates the +// safety concerns to just the Mutex, where as the safety of Mara's pool is a +// bit more sprawling. I also expect this code to not be used that much, and +// so is unlikely to get as much real world usage with which to test it. That +// means the "obviously correct" lever is an important one. +// +// The specific reason to use Mara's pool is that it is likely faster and also +// less likely to hit problems with spin-locks, although it is not completely +// impervious to them. +// +// The best solution to this problem, probably, is a truly lock free pool. That +// could be done with a lock free linked list. The issue is the ABA problem. It +// is difficult to avoid, and doing so is complex. BUT, the upshot of that is +// that if we had a truly lock free pool, then we could also use it above in +// the 'std' pool instead of a Mutex because it should be completely free the +// problems that come from spin-locks. +#[cfg(not(feature = "std"))] +mod inner { + use core::{ + cell::UnsafeCell, + panic::{RefUnwindSafe, UnwindSafe}, + sync::atomic::{AtomicBool, Ordering}, + }; + + use alloc::{boxed::Box, vec, vec::Vec}; + + /// A thread safe pool utilizing alloc-only features. + /// + /// Unlike the std version, it doesn't seem possible(?) to implement the + /// "thread owner" optimization because alloc-only doesn't have any concept + /// of threads. So the best we can do is just a normal stack. This will + /// increase latency in alloc-only environments. + pub(super) struct Pool<T, F> { + /// A stack of T values to hand out. These are used when a Pool is + /// accessed by a thread that didn't create it. + stack: Mutex<Vec<Box<T>>>, + /// A function to create more T values when stack is empty and a caller + /// has requested a T. + create: F, + } + + // If T is UnwindSafe, then since we provide exclusive access to any + // particular value in the pool, it should therefore also be considered + // RefUnwindSafe. + impl<T: UnwindSafe, F: UnwindSafe> RefUnwindSafe for Pool<T, F> {} + + impl<T, F> Pool<T, F> { + /// Create a new pool. The given closure is used to create values in + /// the pool when necessary. + pub(super) const fn new(create: F) -> Pool<T, F> { + Pool { stack: Mutex::new(vec![]), create } + } + } + + impl<T: Send, F: Fn() -> T> Pool<T, F> { + /// Get a value from the pool. This may block if another thread is also + /// attempting to retrieve a value from the pool. + pub(super) fn get(&self) -> PoolGuard<'_, T, F> { + let mut stack = self.stack.lock(); + let value = match stack.pop() { + None => Box::new((self.create)()), + Some(value) => value, + }; + PoolGuard { pool: self, value: Some(value) } + } + + fn put(&self, guard: PoolGuard<'_, T, F>) { + let mut guard = core::mem::ManuallyDrop::new(guard); + if let Some(value) = guard.value.take() { + self.put_value(value); + } + } + + /// Puts a value back into the pool. Callers don't need to call this. + /// Once the guard that's returned by 'get' is dropped, it is put back + /// into the pool automatically. + fn put_value(&self, value: Box<T>) { + let mut stack = self.stack.lock(); + stack.push(value); + } + } + + impl<T: core::fmt::Debug, F> core::fmt::Debug for Pool<T, F> { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + f.debug_struct("Pool").field("stack", &self.stack).finish() + } + } + + /// A guard that is returned when a caller requests a value from the pool. + pub(super) struct PoolGuard<'a, T: Send, F: Fn() -> T> { + /// The pool that this guard is attached to. + pool: &'a Pool<T, F>, + /// This is None after the guard has been put back into the pool. + value: Option<Box<T>>, + } + + impl<'a, T: Send, F: Fn() -> T> PoolGuard<'a, T, F> { + /// Return the underlying value. + pub(super) fn value(&self) -> &T { + self.value.as_deref().unwrap() + } + + /// Return the underlying value as a mutable borrow. + pub(super) fn value_mut(&mut self) -> &mut T { + self.value.as_deref_mut().unwrap() + } + + /// Consumes this guard and puts it back into the pool. + pub(super) fn put(this: PoolGuard<'_, T, F>) { + // Since this is effectively consuming the guard and putting the + // value back into the pool, there's no reason to run its Drop + // impl after doing this. I don't believe there is a correctness + // problem with doing so, but there's definitely a perf problem + // by redoing this work. So we avoid it. + let mut this = core::mem::ManuallyDrop::new(this); + this.put_imp(); + } + + /// Puts this guard back into the pool by only borrowing the guard as + /// mutable. This should be called at most once. + #[inline(always)] + fn put_imp(&mut self) { + if let Some(value) = self.value.take() { + self.pool.put_value(value); + } + } + } + + impl<'a, T: Send, F: Fn() -> T> Drop for PoolGuard<'a, T, F> { + fn drop(&mut self) { + self.put_imp(); + } + } + + impl<'a, T: Send + core::fmt::Debug, F: Fn() -> T> core::fmt::Debug + for PoolGuard<'a, T, F> + { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + f.debug_struct("PoolGuard") + .field("pool", &self.pool) + .field("value", &self.value) + .finish() + } + } + + /// A spin-lock based mutex. Yes, I have read spinlocks cosnidered + /// harmful[1], and if there's a reasonable alternative choice, I'll + /// happily take it. + /// + /// I suspect the most likely alternative here is a Treiber stack, but + /// implementing one correctly in a way that avoids the ABA problem looks + /// subtle enough that I'm not sure I want to attempt that. But otherwise, + /// we only need a mutex in order to implement our pool, so if there's + /// something simpler we can use that works for our `Pool` use case, then + /// that would be great. + /// + /// Note that this mutex does not do poisoning. + /// + /// [1]: https://matklad.github.io/2020/01/02/spinlocks-considered-harmful.html + #[derive(Debug)] + struct Mutex<T> { + locked: AtomicBool, + data: UnsafeCell<T>, + } + + // SAFETY: Since a Mutex guarantees exclusive access, as long as we can + // send it across threads, it must also be Sync. + unsafe impl<T: Send> Sync for Mutex<T> {} + + impl<T> Mutex<T> { + /// Create a new mutex for protecting access to the given value across + /// multiple threads simultaneously. + const fn new(value: T) -> Mutex<T> { + Mutex { + locked: AtomicBool::new(false), + data: UnsafeCell::new(value), + } + } + + /// Lock this mutex and return a guard providing exclusive access to + /// `T`. This blocks if some other thread has already locked this + /// mutex. + fn lock(&self) -> MutexGuard<'_, T> { + while self + .locked + .compare_exchange( + false, + true, + Ordering::AcqRel, + Ordering::Acquire, + ) + .is_err() + { + core::hint::spin_loop(); + } + // SAFETY: The only way we're here is if we successfully set + // 'locked' to true, which implies we must be the only thread here + // and thus have exclusive access to 'data'. + let data = unsafe { &mut *self.data.get() }; + MutexGuard { locked: &self.locked, data } + } + } + + /// A guard that derefs to &T and &mut T. When it's dropped, the lock is + /// released. + #[derive(Debug)] + struct MutexGuard<'a, T> { + locked: &'a AtomicBool, + data: &'a mut T, + } + + impl<'a, T> core::ops::Deref for MutexGuard<'a, T> { + type Target = T; + + fn deref(&self) -> &T { + self.data + } + } + + impl<'a, T> core::ops::DerefMut for MutexGuard<'a, T> { + fn deref_mut(&mut self) -> &mut T { + self.data + } + } + + impl<'a, T> Drop for MutexGuard<'a, T> { + fn drop(&mut self) { + // Drop means 'data' is no longer accessible, so we can unlock + // the mutex. + self.locked.store(false, Ordering::Release); + } + } +} + +#[cfg(test)] +mod tests { + use core::panic::{RefUnwindSafe, UnwindSafe}; + + use alloc::{boxed::Box, vec, vec::Vec}; + + use super::*; + + #[test] + fn oibits() { + fn assert_oitbits<T: Send + Sync + UnwindSafe + RefUnwindSafe>() {} + assert_oitbits::<Pool<Vec<u32>>>(); + assert_oitbits::<Pool<core::cell::RefCell<Vec<u32>>>>(); + assert_oitbits::< + Pool< + Vec<u32>, + Box< + dyn Fn() -> Vec<u32> + + Send + + Sync + + UnwindSafe + + RefUnwindSafe, + >, + >, + >(); + } + + // Tests that Pool implements the "single owner" optimization. That is, the + // thread that first accesses the pool gets its own copy, while all other + // threads get distinct copies. + #[cfg(feature = "std")] + #[test] + fn thread_owner_optimization() { + use std::{cell::RefCell, sync::Arc, vec}; + + let pool: Arc<Pool<RefCell<Vec<char>>>> = + Arc::new(Pool::new(|| RefCell::new(vec!['a']))); + pool.get().borrow_mut().push('x'); + + let pool1 = pool.clone(); + let t1 = std::thread::spawn(move || { + let guard = pool1.get(); + guard.borrow_mut().push('y'); + }); + + let pool2 = pool.clone(); + let t2 = std::thread::spawn(move || { + let guard = pool2.get(); + guard.borrow_mut().push('z'); + }); + + t1.join().unwrap(); + t2.join().unwrap(); + + // If we didn't implement the single owner optimization, then one of + // the threads above is likely to have mutated the [a, x] vec that + // we stuffed in the pool before spawning the threads. But since + // neither thread was first to access the pool, and because of the + // optimization, we should be guaranteed that neither thread mutates + // the special owned pool value. + // + // (Technically this is an implementation detail and not a contract of + // Pool's API.) + assert_eq!(vec!['a', 'x'], *pool.get().borrow()); + } + + // This tests that if the "owner" of a pool asks for two values, then it + // gets two distinct values and not the same one. This test failed in the + // course of developing the pool, which in turn resulted in UB because it + // permitted getting aliasing &mut borrows to the same place in memory. + #[test] + fn thread_owner_distinct() { + let pool = Pool::new(|| vec!['a']); + + { + let mut g1 = pool.get(); + let v1 = &mut *g1; + let mut g2 = pool.get(); + let v2 = &mut *g2; + v1.push('b'); + v2.push('c'); + assert_eq!(&mut vec!['a', 'b'], v1); + assert_eq!(&mut vec!['a', 'c'], v2); + } + // This isn't technically guaranteed, but we + // expect to now get the "owned" value (the first + // call to 'get()' above) now that it's back in + // the pool. + assert_eq!(&mut vec!['a', 'b'], &mut *pool.get()); + } + + // This tests that we can share a guard with another thread, mutate the + // underlying value and everything works. This failed in the course of + // developing a pool since the pool permitted 'get()' to return the same + // value to the owner thread, even before the previous value was put back + // into the pool. This in turn resulted in this test producing a data race. + #[cfg(feature = "std")] + #[test] + fn thread_owner_sync() { + let pool = Pool::new(|| vec!['a']); + { + let mut g1 = pool.get(); + let mut g2 = pool.get(); + std::thread::scope(|s| { + s.spawn(|| { + g1.push('b'); + }); + s.spawn(|| { + g2.push('c'); + }); + }); + + let v1 = &mut *g1; + let v2 = &mut *g2; + assert_eq!(&mut vec!['a', 'b'], v1); + assert_eq!(&mut vec!['a', 'c'], v2); + } + + // This isn't technically guaranteed, but we + // expect to now get the "owned" value (the first + // call to 'get()' above) now that it's back in + // the pool. + assert_eq!(&mut vec!['a', 'b'], &mut *pool.get()); + } + + // This tests that if we move a PoolGuard that is owned by the current + // thread to another thread and drop it, then the thread owner doesn't + // change. During development of the pool, this test failed because the + // PoolGuard assumed it was dropped in the same thread from which it was + // created, and thus used the current thread's ID as the owner, which could + // be different than the actual owner of the pool. + #[cfg(feature = "std")] + #[test] + fn thread_owner_send_drop() { + let pool = Pool::new(|| vec!['a']); + // Establishes this thread as the owner. + { + pool.get().push('b'); + } + std::thread::scope(|s| { + // Sanity check that we get the same value back. + // (Not technically guaranteed.) + let mut g = pool.get(); + assert_eq!(&vec!['a', 'b'], &*g); + // Now push it to another thread and drop it. + s.spawn(move || { + g.push('c'); + }) + .join() + .unwrap(); + }); + // Now check that we're still the owner. This is not technically + // guaranteed by the API, but is true in practice given the thread + // owner optimization. + assert_eq!(&vec!['a', 'b', 'c'], &*pool.get()); + } +} diff --git a/third_party/rust/regex-automata/src/util/prefilter/aho_corasick.rs b/third_party/rust/regex-automata/src/util/prefilter/aho_corasick.rs new file mode 100644 index 0000000000..50cce827ee --- /dev/null +++ b/third_party/rust/regex-automata/src/util/prefilter/aho_corasick.rs @@ -0,0 +1,149 @@ +use crate::util::{ + prefilter::PrefilterI, + search::{MatchKind, Span}, +}; + +#[derive(Clone, Debug)] +pub(crate) struct AhoCorasick { + #[cfg(not(feature = "perf-literal-multisubstring"))] + _unused: (), + #[cfg(feature = "perf-literal-multisubstring")] + ac: aho_corasick::AhoCorasick, +} + +impl AhoCorasick { + pub(crate) fn new<B: AsRef<[u8]>>( + kind: MatchKind, + needles: &[B], + ) -> Option<AhoCorasick> { + #[cfg(not(feature = "perf-literal-multisubstring"))] + { + None + } + #[cfg(feature = "perf-literal-multisubstring")] + { + // We used to use `aho_corasick::MatchKind::Standard` here when + // `kind` was `MatchKind::All`, but this is not correct. The + // "standard" Aho-Corasick match semantics are to report a match + // immediately as soon as it is seen, but `All` isn't like that. + // In particular, with "standard" semantics, given the needles + // "abc" and "b" and the haystack "abc," it would report a match + // at offset 1 before a match at offset 0. This is never what we + // want in the context of the regex engine, regardless of whether + // we have leftmost-first or 'all' semantics. Namely, we always + // want the leftmost match. + let ac_match_kind = match kind { + MatchKind::LeftmostFirst | MatchKind::All => { + aho_corasick::MatchKind::LeftmostFirst + } + }; + // This is kind of just an arbitrary number, but basically, if we + // have a small enough set of literals, then we try to use the VERY + // memory hungry DFA. Otherwise, we whimp out and use an NFA. The + // upshot is that the NFA is quite lean and decently fast. Faster + // than a naive Aho-Corasick NFA anyway. + let ac_kind = if needles.len() <= 500 { + aho_corasick::AhoCorasickKind::DFA + } else { + aho_corasick::AhoCorasickKind::ContiguousNFA + }; + let result = aho_corasick::AhoCorasick::builder() + .kind(Some(ac_kind)) + .match_kind(ac_match_kind) + .start_kind(aho_corasick::StartKind::Both) + // We try to handle all of the prefilter cases in the super + // module, and only use Aho-Corasick for the actual automaton. + // The aho-corasick crate does have some extra prefilters, + // namely, looking for rare bytes to feed to memchr{,2,3} + // instead of just the first byte. If we end up wanting + // those---and they are somewhat tricky to implement---then + // we could port them to this crate. + // + // The main reason for doing things this way is so we have a + // complete and easy to understand picture of which prefilters + // are available and how they work. Otherwise it seems too + // easy to get into a situation where we have a prefilter + // layered on top of prefilter, and that might have unintended + // consequences. + .prefilter(false) + .build(needles); + let ac = match result { + Ok(ac) => ac, + Err(_err) => { + debug!("aho-corasick prefilter failed to build: {}", _err); + return None; + } + }; + Some(AhoCorasick { ac }) + } + } +} + +impl PrefilterI for AhoCorasick { + fn find(&self, haystack: &[u8], span: Span) -> Option<Span> { + #[cfg(not(feature = "perf-literal-multisubstring"))] + { + unreachable!() + } + #[cfg(feature = "perf-literal-multisubstring")] + { + let input = + aho_corasick::Input::new(haystack).span(span.start..span.end); + self.ac + .find(input) + .map(|m| Span { start: m.start(), end: m.end() }) + } + } + + fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span> { + #[cfg(not(feature = "perf-literal-multisubstring"))] + { + unreachable!() + } + #[cfg(feature = "perf-literal-multisubstring")] + { + let input = aho_corasick::Input::new(haystack) + .anchored(aho_corasick::Anchored::Yes) + .span(span.start..span.end); + self.ac + .find(input) + .map(|m| Span { start: m.start(), end: m.end() }) + } + } + + fn memory_usage(&self) -> usize { + #[cfg(not(feature = "perf-literal-multisubstring"))] + { + unreachable!() + } + #[cfg(feature = "perf-literal-multisubstring")] + { + self.ac.memory_usage() + } + } + + fn is_fast(&self) -> bool { + #[cfg(not(feature = "perf-literal-multisubstring"))] + { + unreachable!() + } + #[cfg(feature = "perf-literal-multisubstring")] + { + // Aho-Corasick is never considered "fast" because it's never + // going to be even close to an order of magnitude faster than the + // regex engine itself (assuming a DFA is used). In fact, it is + // usually slower. The magic of Aho-Corasick is that it can search + // a *large* number of literals with a relatively small amount of + // memory. The regex engines are far more wasteful. + // + // Aho-Corasick may be "fast" when the regex engine corresponds + // to, say, the PikeVM. That happens when the lazy DFA couldn't be + // built or used for some reason. But in these cases, the regex + // itself is likely quite big and we're probably hosed no matter + // what we do. (In this case, the best bet is for the caller to + // increase some of the memory limits on the hybrid cache capacity + // and hope that's enough.) + false + } + } +} diff --git a/third_party/rust/regex-automata/src/util/prefilter/byteset.rs b/third_party/rust/regex-automata/src/util/prefilter/byteset.rs new file mode 100644 index 0000000000..a669d6c9d7 --- /dev/null +++ b/third_party/rust/regex-automata/src/util/prefilter/byteset.rs @@ -0,0 +1,58 @@ +use crate::util::{ + prefilter::PrefilterI, + search::{MatchKind, Span}, +}; + +#[derive(Clone, Debug)] +pub(crate) struct ByteSet([bool; 256]); + +impl ByteSet { + pub(crate) fn new<B: AsRef<[u8]>>( + _kind: MatchKind, + needles: &[B], + ) -> Option<ByteSet> { + #[cfg(not(feature = "perf-literal-multisubstring"))] + { + None + } + #[cfg(feature = "perf-literal-multisubstring")] + { + let mut set = [false; 256]; + for needle in needles.iter() { + let needle = needle.as_ref(); + if needle.len() != 1 { + return None; + } + set[usize::from(needle[0])] = true; + } + Some(ByteSet(set)) + } + } +} + +impl PrefilterI for ByteSet { + fn find(&self, haystack: &[u8], span: Span) -> Option<Span> { + haystack[span].iter().position(|&b| self.0[usize::from(b)]).map(|i| { + let start = span.start + i; + let end = start + 1; + Span { start, end } + }) + } + + fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span> { + let b = *haystack.get(span.start)?; + if self.0[usize::from(b)] { + Some(Span { start: span.start, end: span.start + 1 }) + } else { + None + } + } + + fn memory_usage(&self) -> usize { + 0 + } + + fn is_fast(&self) -> bool { + false + } +} diff --git a/third_party/rust/regex-automata/src/util/prefilter/memchr.rs b/third_party/rust/regex-automata/src/util/prefilter/memchr.rs new file mode 100644 index 0000000000..3d44b83721 --- /dev/null +++ b/third_party/rust/regex-automata/src/util/prefilter/memchr.rs @@ -0,0 +1,186 @@ +use crate::util::{ + prefilter::PrefilterI, + search::{MatchKind, Span}, +}; + +#[derive(Clone, Debug)] +pub(crate) struct Memchr(u8); + +impl Memchr { + pub(crate) fn new<B: AsRef<[u8]>>( + _kind: MatchKind, + needles: &[B], + ) -> Option<Memchr> { + #[cfg(not(feature = "perf-literal-substring"))] + { + None + } + #[cfg(feature = "perf-literal-substring")] + { + if needles.len() != 1 { + return None; + } + if needles[0].as_ref().len() != 1 { + return None; + } + Some(Memchr(needles[0].as_ref()[0])) + } + } +} + +impl PrefilterI for Memchr { + fn find(&self, haystack: &[u8], span: Span) -> Option<Span> { + #[cfg(not(feature = "perf-literal-substring"))] + { + unreachable!() + } + #[cfg(feature = "perf-literal-substring")] + { + memchr::memchr(self.0, &haystack[span]).map(|i| { + let start = span.start + i; + let end = start + 1; + Span { start, end } + }) + } + } + + fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span> { + let b = *haystack.get(span.start)?; + if self.0 == b { + Some(Span { start: span.start, end: span.start + 1 }) + } else { + None + } + } + + fn memory_usage(&self) -> usize { + 0 + } + + fn is_fast(&self) -> bool { + true + } +} + +#[derive(Clone, Debug)] +pub(crate) struct Memchr2(u8, u8); + +impl Memchr2 { + pub(crate) fn new<B: AsRef<[u8]>>( + _kind: MatchKind, + needles: &[B], + ) -> Option<Memchr2> { + #[cfg(not(feature = "perf-literal-substring"))] + { + None + } + #[cfg(feature = "perf-literal-substring")] + { + if needles.len() != 2 { + return None; + } + if !needles.iter().all(|n| n.as_ref().len() == 1) { + return None; + } + let b1 = needles[0].as_ref()[0]; + let b2 = needles[1].as_ref()[0]; + Some(Memchr2(b1, b2)) + } + } +} + +impl PrefilterI for Memchr2 { + fn find(&self, haystack: &[u8], span: Span) -> Option<Span> { + #[cfg(not(feature = "perf-literal-substring"))] + { + unreachable!() + } + #[cfg(feature = "perf-literal-substring")] + { + memchr::memchr2(self.0, self.1, &haystack[span]).map(|i| { + let start = span.start + i; + let end = start + 1; + Span { start, end } + }) + } + } + + fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span> { + let b = *haystack.get(span.start)?; + if self.0 == b || self.1 == b { + Some(Span { start: span.start, end: span.start + 1 }) + } else { + None + } + } + + fn memory_usage(&self) -> usize { + 0 + } + + fn is_fast(&self) -> bool { + true + } +} + +#[derive(Clone, Debug)] +pub(crate) struct Memchr3(u8, u8, u8); + +impl Memchr3 { + pub(crate) fn new<B: AsRef<[u8]>>( + _kind: MatchKind, + needles: &[B], + ) -> Option<Memchr3> { + #[cfg(not(feature = "perf-literal-substring"))] + { + None + } + #[cfg(feature = "perf-literal-substring")] + { + if needles.len() != 3 { + return None; + } + if !needles.iter().all(|n| n.as_ref().len() == 1) { + return None; + } + let b1 = needles[0].as_ref()[0]; + let b2 = needles[1].as_ref()[0]; + let b3 = needles[2].as_ref()[0]; + Some(Memchr3(b1, b2, b3)) + } + } +} + +impl PrefilterI for Memchr3 { + fn find(&self, haystack: &[u8], span: Span) -> Option<Span> { + #[cfg(not(feature = "perf-literal-substring"))] + { + unreachable!() + } + #[cfg(feature = "perf-literal-substring")] + { + memchr::memchr3(self.0, self.1, self.2, &haystack[span]).map(|i| { + let start = span.start + i; + let end = start + 1; + Span { start, end } + }) + } + } + + fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span> { + let b = *haystack.get(span.start)?; + if self.0 == b || self.1 == b || self.2 == b { + Some(Span { start: span.start, end: span.start + 1 }) + } else { + None + } + } + + fn memory_usage(&self) -> usize { + 0 + } + + fn is_fast(&self) -> bool { + true + } +} diff --git a/third_party/rust/regex-automata/src/util/prefilter/memmem.rs b/third_party/rust/regex-automata/src/util/prefilter/memmem.rs new file mode 100644 index 0000000000..deea17bd9d --- /dev/null +++ b/third_party/rust/regex-automata/src/util/prefilter/memmem.rs @@ -0,0 +1,88 @@ +use crate::util::{ + prefilter::PrefilterI, + search::{MatchKind, Span}, +}; + +#[derive(Clone, Debug)] +pub(crate) struct Memmem { + #[cfg(not(all(feature = "std", feature = "perf-literal-substring")))] + _unused: (), + #[cfg(all(feature = "std", feature = "perf-literal-substring"))] + finder: memchr::memmem::Finder<'static>, +} + +impl Memmem { + pub(crate) fn new<B: AsRef<[u8]>>( + _kind: MatchKind, + needles: &[B], + ) -> Option<Memmem> { + #[cfg(not(all(feature = "std", feature = "perf-literal-substring")))] + { + None + } + #[cfg(all(feature = "std", feature = "perf-literal-substring"))] + { + if needles.len() != 1 { + return None; + } + let needle = needles[0].as_ref(); + let finder = memchr::memmem::Finder::new(needle).into_owned(); + Some(Memmem { finder }) + } + } +} + +impl PrefilterI for Memmem { + fn find(&self, haystack: &[u8], span: Span) -> Option<Span> { + #[cfg(not(all(feature = "std", feature = "perf-literal-substring")))] + { + unreachable!() + } + #[cfg(all(feature = "std", feature = "perf-literal-substring"))] + { + self.finder.find(&haystack[span]).map(|i| { + let start = span.start + i; + let end = start + self.finder.needle().len(); + Span { start, end } + }) + } + } + + fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span> { + #[cfg(not(all(feature = "std", feature = "perf-literal-substring")))] + { + unreachable!() + } + #[cfg(all(feature = "std", feature = "perf-literal-substring"))] + { + let needle = self.finder.needle(); + if haystack[span].starts_with(needle) { + Some(Span { end: span.start + needle.len(), ..span }) + } else { + None + } + } + } + + fn memory_usage(&self) -> usize { + #[cfg(not(all(feature = "std", feature = "perf-literal-substring")))] + { + unreachable!() + } + #[cfg(all(feature = "std", feature = "perf-literal-substring"))] + { + self.finder.needle().len() + } + } + + fn is_fast(&self) -> bool { + #[cfg(not(all(feature = "std", feature = "perf-literal-substring")))] + { + unreachable!() + } + #[cfg(all(feature = "std", feature = "perf-literal-substring"))] + { + true + } + } +} diff --git a/third_party/rust/regex-automata/src/util/prefilter/mod.rs b/third_party/rust/regex-automata/src/util/prefilter/mod.rs new file mode 100644 index 0000000000..51fc922337 --- /dev/null +++ b/third_party/rust/regex-automata/src/util/prefilter/mod.rs @@ -0,0 +1,696 @@ +/*! +Defines a prefilter for accelerating regex searches. + +A prefilter can be created by building a [`Prefilter`] value. + +A prefilter represents one of the most important optimizations available for +accelerating regex searches. The idea of a prefilter is to very quickly find +candidate locations in a haystack where a regex _could_ match. Once a candidate +is found, it is then intended for the regex engine to run at that position to +determine whether the candidate is a match or a false positive. + +In the aforementioned description of the prefilter optimization also lay its +demise. Namely, if a prefilter has a high false positive rate and it produces +lots of candidates, then a prefilter can overall make a regex search slower. +It can run more slowly because more time is spent ping-ponging between the +prefilter search and the regex engine attempting to confirm each candidate as +a match. This ping-ponging has overhead that adds up, and is exacerbated by +a high false positive rate. + +Nevertheless, the optimization is still generally worth performing in most +cases. Particularly given just how much throughput can be improved. (It is not +uncommon for prefilter optimizations to improve throughput by one or two orders +of magnitude.) + +Typically a prefilter is used to find occurrences of literal prefixes from a +regex pattern, but this isn't required. A prefilter can be used to look for +suffixes or even inner literals. + +Note that as of now, prefilters throw away information about which pattern +each literal comes from. In other words, when a prefilter finds a match, +there's no way to know which pattern (or patterns) it came from. Therefore, +in order to confirm a match, you'll have to check all of the patterns by +running the full regex engine. +*/ + +mod aho_corasick; +mod byteset; +mod memchr; +mod memmem; +mod teddy; + +use core::{ + borrow::Borrow, + fmt::Debug, + panic::{RefUnwindSafe, UnwindSafe}, +}; + +#[cfg(feature = "alloc")] +use alloc::sync::Arc; + +#[cfg(feature = "syntax")] +use regex_syntax::hir::{literal, Hir}; + +use crate::util::search::{MatchKind, Span}; + +pub(crate) use crate::util::prefilter::{ + aho_corasick::AhoCorasick, + byteset::ByteSet, + memchr::{Memchr, Memchr2, Memchr3}, + memmem::Memmem, + teddy::Teddy, +}; + +/// A prefilter for accelerating regex searches. +/// +/// If you already have your literals that you want to search with, +/// then the vanilla [`Prefilter::new`] constructor is for you. But +/// if you have an [`Hir`] value from the `regex-syntax` crate, then +/// [`Prefilter::from_hir_prefix`] might be more convenient. Namely, it uses +/// the [`regex-syntax::hir::literal`](regex_syntax::hir::literal) module to +/// extract literal prefixes for you, optimize them and then select and build a +/// prefilter matcher. +/// +/// A prefilter must have **zero false negatives**. However, by its very +/// nature, it may produce false positives. That is, a prefilter will never +/// skip over a position in the haystack that corresponds to a match of the +/// original regex pattern, but it *may* produce a match for a position +/// in the haystack that does *not* correspond to a match of the original +/// regex pattern. If you use either the [`Prefilter::from_hir_prefix`] or +/// [`Prefilter::from_hirs_prefix`] constructors, then this guarantee is +/// upheld for you automatically. This guarantee is not preserved if you use +/// [`Prefilter::new`] though, since it is up to the caller to provide correct +/// literal strings with respect to the original regex pattern. +/// +/// # Cloning +/// +/// It is an API guarantee that cloning a prefilter is cheap. That is, cloning +/// it will not duplicate whatever heap memory is used to represent the +/// underlying matcher. +/// +/// # Example +/// +/// This example shows how to attach a `Prefilter` to the +/// [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM) in order to accelerate +/// searches. +/// +/// ``` +/// use regex_automata::{ +/// nfa::thompson::pikevm::PikeVM, +/// util::prefilter::Prefilter, +/// Match, MatchKind, +/// }; +/// +/// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["Bruce "]) +/// .expect("a prefilter"); +/// let re = PikeVM::builder() +/// .configure(PikeVM::config().prefilter(Some(pre))) +/// .build(r"Bruce \w+")?; +/// let mut cache = re.create_cache(); +/// assert_eq!( +/// Some(Match::must(0, 6..23)), +/// re.find(&mut cache, "Hello Bruce Springsteen!"), +/// ); +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +/// +/// But note that if you get your prefilter incorrect, it could lead to an +/// incorrect result! +/// +/// ``` +/// use regex_automata::{ +/// nfa::thompson::pikevm::PikeVM, +/// util::prefilter::Prefilter, +/// Match, MatchKind, +/// }; +/// +/// // This prefilter is wrong! +/// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["Patti "]) +/// .expect("a prefilter"); +/// let re = PikeVM::builder() +/// .configure(PikeVM::config().prefilter(Some(pre))) +/// .build(r"Bruce \w+")?; +/// let mut cache = re.create_cache(); +/// // We find no match even though the regex does match. +/// assert_eq!( +/// None, +/// re.find(&mut cache, "Hello Bruce Springsteen!"), +/// ); +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +#[derive(Clone, Debug)] +pub struct Prefilter { + #[cfg(not(feature = "alloc"))] + _unused: (), + #[cfg(feature = "alloc")] + pre: Arc<dyn PrefilterI>, + #[cfg(feature = "alloc")] + is_fast: bool, +} + +impl Prefilter { + /// Create a new prefilter from a sequence of needles and a corresponding + /// match semantics. + /// + /// This may return `None` for a variety of reasons, for example, if + /// a suitable prefilter could not be constructed. That might occur + /// if they are unavailable (e.g., the `perf-literal-substring` and + /// `perf-literal-multisubstring` features aren't enabled), or it might + /// occur because of heuristics or other artifacts of how the prefilter + /// works. + /// + /// Note that if you have an [`Hir`] expression, it may be more convenient + /// to use [`Prefilter::from_hir_prefix`]. It will automatically handle the + /// task of extracting prefix literals for you. + /// + /// # Example + /// + /// This example shows how match semantics can impact the matching + /// algorithm used by the prefilter. For this reason, it is important to + /// ensure that the match semantics given here are consistent with the + /// match semantics intended for the regular expression that the literals + /// were extracted from. + /// + /// ``` + /// use regex_automata::{ + /// util::{prefilter::Prefilter, syntax}, + /// MatchKind, Span, + /// }; + /// + /// let hay = "Hello samwise"; + /// + /// // With leftmost-first, we find 'samwise' here because it comes + /// // before 'sam' in the sequence we give it.. + /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["samwise", "sam"]) + /// .expect("a prefilter"); + /// assert_eq!( + /// Some(Span::from(6..13)), + /// pre.find(hay.as_bytes(), Span::from(0..hay.len())), + /// ); + /// // Still with leftmost-first but with the literals reverse, now 'sam' + /// // will match instead! + /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["sam", "samwise"]) + /// .expect("a prefilter"); + /// assert_eq!( + /// Some(Span::from(6..9)), + /// pre.find(hay.as_bytes(), Span::from(0..hay.len())), + /// ); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn new<B: AsRef<[u8]>>( + kind: MatchKind, + needles: &[B], + ) -> Option<Prefilter> { + Choice::new(kind, needles).and_then(Prefilter::from_choice) + } + + /// This turns a prefilter selection into a `Prefilter`. That is, in turns + /// the enum given into a trait object. + fn from_choice(choice: Choice) -> Option<Prefilter> { + #[cfg(not(feature = "alloc"))] + { + None + } + #[cfg(feature = "alloc")] + { + let pre: Arc<dyn PrefilterI> = match choice { + Choice::Memchr(p) => Arc::new(p), + Choice::Memchr2(p) => Arc::new(p), + Choice::Memchr3(p) => Arc::new(p), + Choice::Memmem(p) => Arc::new(p), + Choice::Teddy(p) => Arc::new(p), + Choice::ByteSet(p) => Arc::new(p), + Choice::AhoCorasick(p) => Arc::new(p), + }; + let is_fast = pre.is_fast(); + Some(Prefilter { pre, is_fast }) + } + } + + /// This attempts to extract prefixes from the given `Hir` expression for + /// the given match semantics, and if possible, builds a prefilter for + /// them. + /// + /// # Example + /// + /// This example shows how to build a prefilter directly from an [`Hir`] + /// expression, and use to find an occurrence of a prefix from the regex + /// pattern. + /// + /// ``` + /// use regex_automata::{ + /// util::{prefilter::Prefilter, syntax}, + /// MatchKind, Span, + /// }; + /// + /// let hir = syntax::parse(r"(Bruce|Patti) \w+")?; + /// let pre = Prefilter::from_hir_prefix(MatchKind::LeftmostFirst, &hir) + /// .expect("a prefilter"); + /// let hay = "Hello Patti Scialfa!"; + /// assert_eq!( + /// Some(Span::from(6..12)), + /// pre.find(hay.as_bytes(), Span::from(0..hay.len())), + /// ); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[cfg(feature = "syntax")] + pub fn from_hir_prefix(kind: MatchKind, hir: &Hir) -> Option<Prefilter> { + Prefilter::from_hirs_prefix(kind, &[hir]) + } + + /// This attempts to extract prefixes from the given `Hir` expressions for + /// the given match semantics, and if possible, builds a prefilter for + /// them. + /// + /// Note that as of now, prefilters throw away information about which + /// pattern each literal comes from. In other words, when a prefilter finds + /// a match, there's no way to know which pattern (or patterns) it came + /// from. Therefore, in order to confirm a match, you'll have to check all + /// of the patterns by running the full regex engine. + /// + /// # Example + /// + /// This example shows how to build a prefilter directly from multiple + /// `Hir` expressions expression, and use it to find an occurrence of a + /// prefix from the regex patterns. + /// + /// ``` + /// use regex_automata::{ + /// util::{prefilter::Prefilter, syntax}, + /// MatchKind, Span, + /// }; + /// + /// let hirs = syntax::parse_many(&[ + /// r"(Bruce|Patti) \w+", + /// r"Mrs?\. Doubtfire", + /// ])?; + /// let pre = Prefilter::from_hirs_prefix(MatchKind::LeftmostFirst, &hirs) + /// .expect("a prefilter"); + /// let hay = "Hello Mrs. Doubtfire"; + /// assert_eq!( + /// Some(Span::from(6..20)), + /// pre.find(hay.as_bytes(), Span::from(0..hay.len())), + /// ); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[cfg(feature = "syntax")] + pub fn from_hirs_prefix<H: Borrow<Hir>>( + kind: MatchKind, + hirs: &[H], + ) -> Option<Prefilter> { + prefixes(kind, hirs) + .literals() + .and_then(|lits| Prefilter::new(kind, lits)) + } + + /// Run this prefilter on `haystack[span.start..end]` and return a matching + /// span if one exists. + /// + /// The span returned is guaranteed to have a start position greater than + /// or equal to the one given, and an end position less than or equal to + /// the one given. + /// + /// # Example + /// + /// This example shows how to build a prefilter directly from an [`Hir`] + /// expression, and use it to find an occurrence of a prefix from the regex + /// pattern. + /// + /// ``` + /// use regex_automata::{ + /// util::{prefilter::Prefilter, syntax}, + /// MatchKind, Span, + /// }; + /// + /// let hir = syntax::parse(r"Bruce \w+")?; + /// let pre = Prefilter::from_hir_prefix(MatchKind::LeftmostFirst, &hir) + /// .expect("a prefilter"); + /// let hay = "Hello Bruce Springsteen!"; + /// assert_eq!( + /// Some(Span::from(6..12)), + /// pre.find(hay.as_bytes(), Span::from(0..hay.len())), + /// ); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn find(&self, haystack: &[u8], span: Span) -> Option<Span> { + #[cfg(not(feature = "alloc"))] + { + unreachable!() + } + #[cfg(feature = "alloc")] + { + self.pre.find(haystack, span) + } + } + + /// Returns the span of a prefix of `haystack[span.start..span.end]` if + /// the prefilter matches. + /// + /// The span returned is guaranteed to have a start position equivalent to + /// the one given, and an end position less than or equal to the one given. + /// + /// # Example + /// + /// This example shows how to build a prefilter directly from an [`Hir`] + /// expression, and use it to find an occurrence of a prefix from the regex + /// pattern that begins at the start of a haystack only. + /// + /// ``` + /// use regex_automata::{ + /// util::{prefilter::Prefilter, syntax}, + /// MatchKind, Span, + /// }; + /// + /// let hir = syntax::parse(r"Bruce \w+")?; + /// let pre = Prefilter::from_hir_prefix(MatchKind::LeftmostFirst, &hir) + /// .expect("a prefilter"); + /// let hay = "Hello Bruce Springsteen!"; + /// // Nothing is found here because 'Bruce' does + /// // not occur at the beginning of our search. + /// assert_eq!( + /// None, + /// pre.prefix(hay.as_bytes(), Span::from(0..hay.len())), + /// ); + /// // But if we change where we start the search + /// // to begin where 'Bruce ' begins, then a + /// // match will be found. + /// assert_eq!( + /// Some(Span::from(6..12)), + /// pre.prefix(hay.as_bytes(), Span::from(6..hay.len())), + /// ); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span> { + #[cfg(not(feature = "alloc"))] + { + unreachable!() + } + #[cfg(feature = "alloc")] + { + self.pre.prefix(haystack, span) + } + } + + /// Returns the heap memory, in bytes, used by the underlying prefilter. + #[inline] + pub fn memory_usage(&self) -> usize { + #[cfg(not(feature = "alloc"))] + { + unreachable!() + } + #[cfg(feature = "alloc")] + { + self.pre.memory_usage() + } + } + + /// Implementations might return true here if they believe themselves to + /// be "fast." The concept of "fast" is deliberately left vague, but in + /// practice this usually corresponds to whether it's believed that SIMD + /// will be used. + /// + /// Why do we care about this? Well, some prefilter tricks tend to come + /// with their own bits of overhead, and so might only make sense if we + /// know that a scan will be *much* faster than the regex engine itself. + /// Otherwise, the trick may not be worth doing. Whether something is + /// "much" faster than the regex engine generally boils down to whether + /// SIMD is used. (But not always. Even a SIMD matcher with a high false + /// positive rate can become quite slow.) + /// + /// Even if this returns true, it is still possible for the prefilter to + /// be "slow." Remember, prefilters are just heuristics. We can't really + /// *know* a prefilter will be fast without actually trying the prefilter. + /// (Which of course we cannot afford to do.) + #[inline] + pub(crate) fn is_fast(&self) -> bool { + #[cfg(not(feature = "alloc"))] + { + unreachable!() + } + #[cfg(feature = "alloc")] + { + self.is_fast + } + } +} + +/// A trait for abstracting over prefilters. Basically, a prefilter is +/// something that do an unanchored *and* an anchored search in a haystack +/// within a given span. +/// +/// This exists pretty much only so that we can use prefilters as a trait +/// object (which is what `Prefilter` is). If we ever move off of trait objects +/// and to an enum, then it's likely this trait could be removed. +pub(crate) trait PrefilterI: + Debug + Send + Sync + RefUnwindSafe + UnwindSafe + 'static +{ + /// Run this prefilter on `haystack[span.start..end]` and return a matching + /// span if one exists. + /// + /// The span returned is guaranteed to have a start position greater than + /// or equal to the one given, and an end position less than or equal to + /// the one given. + fn find(&self, haystack: &[u8], span: Span) -> Option<Span>; + + /// Returns the span of a prefix of `haystack[span.start..span.end]` if + /// the prefilter matches. + /// + /// The span returned is guaranteed to have a start position equivalent to + /// the one given, and an end position less than or equal to the one given. + fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span>; + + /// Returns the heap memory, in bytes, used by the underlying prefilter. + fn memory_usage(&self) -> usize; + + /// Implementations might return true here if they believe themselves to + /// be "fast." See [`Prefilter::is_fast`] for more details. + fn is_fast(&self) -> bool; +} + +#[cfg(feature = "alloc")] +impl<P: PrefilterI + ?Sized> PrefilterI for Arc<P> { + #[cfg_attr(feature = "perf-inline", inline(always))] + fn find(&self, haystack: &[u8], span: Span) -> Option<Span> { + (&**self).find(haystack, span) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span> { + (&**self).prefix(haystack, span) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn memory_usage(&self) -> usize { + (&**self).memory_usage() + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn is_fast(&self) -> bool { + (&**self).is_fast() + } +} + +/// A type that encapsulates the selection of a prefilter algorithm from a +/// sequence of needles. +/// +/// The existence of this type is a little tricky, because we don't (currently) +/// use it for performing a search. Instead, we really only consume it by +/// converting the underlying prefilter into a trait object, whether that be +/// `dyn PrefilterI` or `dyn Strategy` (for the meta regex engine). In order +/// to avoid re-copying the prefilter selection logic, we isolate it here, and +/// then force anything downstream that wants to convert it to a trait object +/// to do trivial case analysis on it. +/// +/// One wonders whether we *should* use an enum instead of a trait object. +/// At time of writing, I chose trait objects based on instinct because 1) I +/// knew I wasn't going to inline anything and 2) there would potentially be +/// many different choices. However, as of time of writing, I haven't actually +/// compared the trait object approach to the enum approach. That probably +/// should be litigated, but I ran out of steam. +/// +/// Note that if the `alloc` feature is disabled, then values of this type +/// are (and should) never be constructed. Also, in practice, for any of the +/// prefilters to be selected, you'll need at least one of the `perf-literal-*` +/// features enabled. +#[derive(Clone, Debug)] +pub(crate) enum Choice { + Memchr(Memchr), + Memchr2(Memchr2), + Memchr3(Memchr3), + Memmem(Memmem), + Teddy(Teddy), + ByteSet(ByteSet), + AhoCorasick(AhoCorasick), +} + +impl Choice { + /// Select what is believed to be the best prefilter algorithm for the + /// match semantics and sequence of needles given. + /// + /// This selection algorithm uses the needles as given without any + /// modification. For example, if `[bar]` is given, then this doesn't + /// try to select `memchr` for `b`. Instead, it would select `memmem` + /// for `bar`. If callers would want `memchr` selected for `[bar]`, then + /// callers should massages the literals themselves. That is, callers are + /// responsible for heuristics surrounding which sequence of literals is + /// best. + /// + /// What this selection algorithm does is attempt to use the fastest + /// prefilter that works for the literals given. So if `[a, b]`, is given, + /// then `memchr2` is selected. + /// + /// Of course, which prefilter is selected is also subject to what + /// is available. For example, if `alloc` isn't enabled, then + /// that limits which prefilters can be selected. Similarly, if + /// `perf-literal-substring` isn't enabled, then nothing from the `memchr` + /// crate can be returned. + pub(crate) fn new<B: AsRef<[u8]>>( + kind: MatchKind, + needles: &[B], + ) -> Option<Choice> { + // An empty set means the regex matches nothing, so no sense in + // building a prefilter. + if needles.len() == 0 { + debug!("prefilter building failed: found empty set of literals"); + return None; + } + // If the regex can match the empty string, then the prefilter + // will by definition match at every position. This is obviously + // completely ineffective. + if needles.iter().any(|n| n.as_ref().is_empty()) { + debug!("prefilter building failed: literals match empty string"); + return None; + } + // BREADCRUMBS: Perhaps the literal optimizer should special case + // sequences of length two or three if the leading bytes of each are + // "rare"? Or perhaps, if there are two or three total possible leading + // bytes, regardless of the number of literals, and all are rare... + // Then well, perhaps we should use memchr2 or memchr3 in those cases? + if let Some(pre) = Memchr::new(kind, needles) { + debug!("prefilter built: memchr"); + return Some(Choice::Memchr(pre)); + } + if let Some(pre) = Memchr2::new(kind, needles) { + debug!("prefilter built: memchr2"); + return Some(Choice::Memchr2(pre)); + } + if let Some(pre) = Memchr3::new(kind, needles) { + debug!("prefilter built: memchr3"); + return Some(Choice::Memchr3(pre)); + } + if let Some(pre) = Memmem::new(kind, needles) { + debug!("prefilter built: memmem"); + return Some(Choice::Memmem(pre)); + } + if let Some(pre) = Teddy::new(kind, needles) { + debug!("prefilter built: teddy"); + return Some(Choice::Teddy(pre)); + } + if let Some(pre) = ByteSet::new(kind, needles) { + debug!("prefilter built: byteset"); + return Some(Choice::ByteSet(pre)); + } + if let Some(pre) = AhoCorasick::new(kind, needles) { + debug!("prefilter built: aho-corasick"); + return Some(Choice::AhoCorasick(pre)); + } + debug!("prefilter building failed: no strategy could be found"); + None + } +} + +/// Extracts all of the prefix literals from the given HIR expressions into a +/// single `Seq`. The literals in the sequence are ordered with respect to the +/// order of the given HIR expressions and consistent with the match semantics +/// given. +/// +/// The sequence returned is "optimized." That is, they may be shrunk or even +/// truncated according to heuristics with the intent of making them more +/// useful as a prefilter. (Which translates to both using faster algorithms +/// and minimizing the false positive rate.) +/// +/// Note that this erases any connection between the literals and which pattern +/// (or patterns) they came from. +/// +/// The match kind given must correspond to the match semantics of the regex +/// that is represented by the HIRs given. The match semantics may change the +/// literal sequence returned. +#[cfg(feature = "syntax")] +pub(crate) fn prefixes<H>(kind: MatchKind, hirs: &[H]) -> literal::Seq +where + H: core::borrow::Borrow<Hir>, +{ + let mut extractor = literal::Extractor::new(); + extractor.kind(literal::ExtractKind::Prefix); + + let mut prefixes = literal::Seq::empty(); + for hir in hirs { + prefixes.union(&mut extractor.extract(hir.borrow())); + } + debug!( + "prefixes (len={:?}, exact={:?}) extracted before optimization: {:?}", + prefixes.len(), + prefixes.is_exact(), + prefixes + ); + match kind { + MatchKind::All => { + prefixes.sort(); + prefixes.dedup(); + } + MatchKind::LeftmostFirst => { + prefixes.optimize_for_prefix_by_preference(); + } + } + debug!( + "prefixes (len={:?}, exact={:?}) extracted after optimization: {:?}", + prefixes.len(), + prefixes.is_exact(), + prefixes + ); + prefixes +} + +/// Like `prefixes`, but for all suffixes of all matches for the given HIRs. +#[cfg(feature = "syntax")] +pub(crate) fn suffixes<H>(kind: MatchKind, hirs: &[H]) -> literal::Seq +where + H: core::borrow::Borrow<Hir>, +{ + let mut extractor = literal::Extractor::new(); + extractor.kind(literal::ExtractKind::Suffix); + + let mut suffixes = literal::Seq::empty(); + for hir in hirs { + suffixes.union(&mut extractor.extract(hir.borrow())); + } + debug!( + "suffixes (len={:?}, exact={:?}) extracted before optimization: {:?}", + suffixes.len(), + suffixes.is_exact(), + suffixes + ); + match kind { + MatchKind::All => { + suffixes.sort(); + suffixes.dedup(); + } + MatchKind::LeftmostFirst => { + suffixes.optimize_for_suffix_by_preference(); + } + } + debug!( + "suffixes (len={:?}, exact={:?}) extracted after optimization: {:?}", + suffixes.len(), + suffixes.is_exact(), + suffixes + ); + suffixes +} diff --git a/third_party/rust/regex-automata/src/util/prefilter/teddy.rs b/third_party/rust/regex-automata/src/util/prefilter/teddy.rs new file mode 100644 index 0000000000..fc79f2b2f3 --- /dev/null +++ b/third_party/rust/regex-automata/src/util/prefilter/teddy.rs @@ -0,0 +1,160 @@ +use crate::util::{ + prefilter::PrefilterI, + search::{MatchKind, Span}, +}; + +#[derive(Clone, Debug)] +pub(crate) struct Teddy { + #[cfg(not(feature = "perf-literal-multisubstring"))] + _unused: (), + /// The actual Teddy searcher. + /// + /// Technically, it's possible that Teddy doesn't actually get used, since + /// Teddy does require its haystack to at least be of a certain size + /// (usually around the size of whatever vector is being used, so ~16 + /// or ~32 bytes). For haystacks shorter than that, the implementation + /// currently uses Rabin-Karp. + #[cfg(feature = "perf-literal-multisubstring")] + searcher: aho_corasick::packed::Searcher, + /// When running an anchored search, the packed searcher can't handle it so + /// we defer to Aho-Corasick itself. Kind of sad, but changing the packed + /// searchers to support anchored search would be difficult at worst and + /// annoying at best. Since packed searchers only apply to small numbers of + /// literals, we content ourselves that this is not much of an added cost. + /// (That packed searchers only work with a small number of literals is + /// also why we use a DFA here. Otherwise, the memory usage of a DFA would + /// likely be unacceptable.) + #[cfg(feature = "perf-literal-multisubstring")] + anchored_ac: aho_corasick::dfa::DFA, + /// The length of the smallest literal we look for. + /// + /// We use this as a heuristic to figure out whether this will be "fast" or + /// not. Generally, the longer the better, because longer needles are more + /// discriminating and thus reduce false positive rate. + #[cfg(feature = "perf-literal-multisubstring")] + minimum_len: usize, +} + +impl Teddy { + pub(crate) fn new<B: AsRef<[u8]>>( + kind: MatchKind, + needles: &[B], + ) -> Option<Teddy> { + #[cfg(not(feature = "perf-literal-multisubstring"))] + { + None + } + #[cfg(feature = "perf-literal-multisubstring")] + { + // We only really support leftmost-first semantics. In + // theory we could at least support leftmost-longest, as the + // aho-corasick crate does, but regex-automata doesn't know about + // leftmost-longest currently. + // + // And like the aho-corasick prefilter, if we're using `All` + // semantics, then we can still use leftmost semantics for a + // prefilter. (This might be a suspicious choice for the literal + // engine, which uses a prefilter as a regex engine directly, but + // that only happens when using leftmost-first semantics.) + let (packed_match_kind, ac_match_kind) = match kind { + MatchKind::LeftmostFirst | MatchKind::All => ( + aho_corasick::packed::MatchKind::LeftmostFirst, + aho_corasick::MatchKind::LeftmostFirst, + ), + }; + let minimum_len = + needles.iter().map(|n| n.as_ref().len()).min().unwrap_or(0); + let packed = aho_corasick::packed::Config::new() + .match_kind(packed_match_kind) + .builder() + .extend(needles) + .build()?; + let anchored_ac = aho_corasick::dfa::DFA::builder() + .match_kind(ac_match_kind) + .start_kind(aho_corasick::StartKind::Anchored) + .prefilter(false) + .build(needles) + .ok()?; + Some(Teddy { searcher: packed, anchored_ac, minimum_len }) + } + } +} + +impl PrefilterI for Teddy { + fn find(&self, haystack: &[u8], span: Span) -> Option<Span> { + #[cfg(not(feature = "perf-literal-multisubstring"))] + { + unreachable!() + } + #[cfg(feature = "perf-literal-multisubstring")] + { + let ac_span = + aho_corasick::Span { start: span.start, end: span.end }; + self.searcher + .find_in(haystack, ac_span) + .map(|m| Span { start: m.start(), end: m.end() }) + } + } + + fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span> { + #[cfg(not(feature = "perf-literal-multisubstring"))] + { + unreachable!() + } + #[cfg(feature = "perf-literal-multisubstring")] + { + use aho_corasick::automaton::Automaton; + let input = aho_corasick::Input::new(haystack) + .anchored(aho_corasick::Anchored::Yes) + .span(span.start..span.end); + self.anchored_ac + .try_find(&input) + // OK because we build the DFA with anchored support. + .expect("aho-corasick DFA should never fail") + .map(|m| Span { start: m.start(), end: m.end() }) + } + } + + fn memory_usage(&self) -> usize { + #[cfg(not(feature = "perf-literal-multisubstring"))] + { + unreachable!() + } + #[cfg(feature = "perf-literal-multisubstring")] + { + use aho_corasick::automaton::Automaton; + self.searcher.memory_usage() + self.anchored_ac.memory_usage() + } + } + + fn is_fast(&self) -> bool { + #[cfg(not(feature = "perf-literal-multisubstring"))] + { + unreachable!() + } + #[cfg(feature = "perf-literal-multisubstring")] + { + // Teddy is usually quite fast, but I have seen some cases where + // a large number of literals can overwhelm it and make it not so + // fast. We make an educated but conservative guess at a limit, at + // which point, we're not so comfortable thinking Teddy is "fast." + // + // Well... this used to incorporate a "limit" on the *number* + // of literals, but I have since changed it to a minimum on the + // *smallest* literal. Namely, when there is a very small literal + // (1 or 2 bytes), it is far more likely that it leads to a higher + // false positive rate. (Although, of course, not always. For + // example, 'zq' is likely to have a very low false positive rate.) + // But when we have 3 bytes, we have a really good chance of being + // quite discriminatory and thus fast. + // + // We may still want to add some kind of limit on the number of + // literals here, but keep in mind that Teddy already has its own + // somewhat small limit (64 at time of writing). The main issue + // here is that if 'is_fast' is false, it opens the door for the + // reverse inner optimization to kick in. We really only want to + // resort to the reverse inner optimization if we absolutely must. + self.minimum_len >= 3 + } + } +} diff --git a/third_party/rust/regex-automata/src/util/primitives.rs b/third_party/rust/regex-automata/src/util/primitives.rs new file mode 100644 index 0000000000..5c5d187b0e --- /dev/null +++ b/third_party/rust/regex-automata/src/util/primitives.rs @@ -0,0 +1,776 @@ +/*! +Lower level primitive types that are useful in a variety of circumstances. + +# Overview + +This list represents the principle types in this module and briefly describes +when you might want to use them. + +* [`PatternID`] - A type that represents the identifier of a regex pattern. +This is probably the most widely used type in this module (which is why it's +also re-exported in the crate root). +* [`StateID`] - A type the represents the identifier of a finite automaton +state. This is used for both NFAs and DFAs, with the notable exception of +the hybrid NFA/DFA. (The hybrid NFA/DFA uses a special purpose "lazy" state +identifier.) +* [`SmallIndex`] - The internal representation of both a `PatternID` and a +`StateID`. Its purpose is to serve as a type that can index memory without +being as big as a `usize` on 64-bit targets. The main idea behind this type +is that there are many things in regex engines that will, in practice, never +overflow a 32-bit integer. (For example, like the number of patterns in a regex +or the number of states in an NFA.) Thus, a `SmallIndex` can be used to index +memory without peppering `as` casts everywhere. Moreover, it forces callers +to handle errors in the case where, somehow, the value would otherwise overflow +either a 32-bit integer or a `usize` (e.g., on 16-bit targets). +* [`NonMaxUsize`] - Represents a `usize` that cannot be `usize::MAX`. As a +result, `Option<NonMaxUsize>` has the same size in memory as a `usize`. This +useful, for example, when representing the offsets of submatches since it +reduces memory usage by a factor of 2. It is a legal optimization since Rust +guarantees that slices never have a length that exceeds `isize::MAX`. +*/ + +use core::num::NonZeroUsize; + +#[cfg(feature = "alloc")] +use alloc::vec::Vec; + +use crate::util::int::{Usize, U16, U32, U64}; + +/// A `usize` that can never be `usize::MAX`. +/// +/// This is similar to `core::num::NonZeroUsize`, but instead of not permitting +/// a zero value, this does not permit a max value. +/// +/// This is useful in certain contexts where one wants to optimize the memory +/// usage of things that contain match offsets. Namely, since Rust slices +/// are guaranteed to never have a length exceeding `isize::MAX`, we can use +/// `usize::MAX` as a sentinel to indicate that no match was found. Indeed, +/// types like `Option<NonMaxUsize>` have exactly the same size in memory as a +/// `usize`. +/// +/// This type is defined to be `repr(transparent)` for +/// `core::num::NonZeroUsize`, which is in turn defined to be +/// `repr(transparent)` for `usize`. +#[derive(Clone, Copy, Eq, Hash, PartialEq, PartialOrd, Ord)] +#[repr(transparent)] +pub struct NonMaxUsize(NonZeroUsize); + +impl NonMaxUsize { + /// Create a new `NonMaxUsize` from the given value. + /// + /// This returns `None` only when the given value is equal to `usize::MAX`. + #[inline] + pub fn new(value: usize) -> Option<NonMaxUsize> { + NonZeroUsize::new(value.wrapping_add(1)).map(NonMaxUsize) + } + + /// Return the underlying `usize` value. The returned value is guaranteed + /// to not equal `usize::MAX`. + #[inline] + pub fn get(self) -> usize { + self.0.get().wrapping_sub(1) + } +} + +// We provide our own Debug impl because seeing the internal repr can be quite +// surprising if you aren't expecting it. e.g., 'NonMaxUsize(5)' vs just '5'. +impl core::fmt::Debug for NonMaxUsize { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + write!(f, "{:?}", self.get()) + } +} + +/// A type that represents a "small" index. +/// +/// The main idea of this type is to provide something that can index memory, +/// but uses less memory than `usize` on 64-bit systems. Specifically, its +/// representation is always a `u32` and has `repr(transparent)` enabled. (So +/// it is safe to transmute between a `u32` and a `SmallIndex`.) +/// +/// A small index is typically useful in cases where there is no practical way +/// that the index will overflow a 32-bit integer. A good example of this is +/// an NFA state. If you could somehow build an NFA with `2^30` states, its +/// memory usage would be exorbitant and its runtime execution would be so +/// slow as to be completely worthless. Therefore, this crate generally deems +/// it acceptable to return an error if it would otherwise build an NFA that +/// requires a slice longer than what a 32-bit integer can index. In exchange, +/// we can use 32-bit indices instead of 64-bit indices in various places. +/// +/// This type ensures this by providing a constructor that will return an error +/// if its argument cannot fit into the type. This makes it much easier to +/// handle these sorts of boundary cases that are otherwise extremely subtle. +/// +/// On all targets, this type guarantees that its value will fit in a `u32`, +/// `i32`, `usize` and an `isize`. This means that on 16-bit targets, for +/// example, this type's maximum value will never overflow an `isize`, +/// which means it will never overflow a `i16` even though its internal +/// representation is still a `u32`. +/// +/// The purpose for making the type fit into even signed integer types like +/// `isize` is to guarantee that the difference between any two small indices +/// is itself also a small index. This is useful in certain contexts, e.g., +/// for delta encoding. +/// +/// # Other types +/// +/// The following types wrap `SmallIndex` to provide a more focused use case: +/// +/// * [`PatternID`] is for representing the identifiers of patterns. +/// * [`StateID`] is for representing the identifiers of states in finite +/// automata. It is used for both NFAs and DFAs. +/// +/// # Representation +/// +/// This type is always represented internally by a `u32` and is marked as +/// `repr(transparent)`. Thus, this type always has the same representation as +/// a `u32`. It is thus safe to transmute between a `u32` and a `SmallIndex`. +/// +/// # Indexing +/// +/// For convenience, callers may use a `SmallIndex` to index slices. +/// +/// # Safety +/// +/// While a `SmallIndex` is meant to guarantee that its value fits into `usize` +/// without using as much space as a `usize` on all targets, callers must +/// not rely on this property for safety. Callers may choose to rely on this +/// property for correctness however. For example, creating a `SmallIndex` with +/// an invalid value can be done in entirely safe code. This may in turn result +/// in panics or silent logical errors. +#[derive( + Clone, Copy, Debug, Default, Eq, Hash, PartialEq, PartialOrd, Ord, +)] +#[repr(transparent)] +pub struct SmallIndex(u32); + +impl SmallIndex { + /// The maximum index value. + #[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))] + pub const MAX: SmallIndex = + // FIXME: Use as_usize() once const functions in traits are stable. + SmallIndex::new_unchecked(core::i32::MAX as usize - 1); + + /// The maximum index value. + #[cfg(target_pointer_width = "16")] + pub const MAX: SmallIndex = + SmallIndex::new_unchecked(core::isize::MAX - 1); + + /// The total number of values that can be represented as a small index. + pub const LIMIT: usize = SmallIndex::MAX.as_usize() + 1; + + /// The zero index value. + pub const ZERO: SmallIndex = SmallIndex::new_unchecked(0); + + /// The number of bytes that a single small index uses in memory. + pub const SIZE: usize = core::mem::size_of::<SmallIndex>(); + + /// Create a new small index. + /// + /// If the given index exceeds [`SmallIndex::MAX`], then this returns + /// an error. + #[inline] + pub fn new(index: usize) -> Result<SmallIndex, SmallIndexError> { + SmallIndex::try_from(index) + } + + /// Create a new small index without checking whether the given value + /// exceeds [`SmallIndex::MAX`]. + /// + /// Using this routine with an invalid index value will result in + /// unspecified behavior, but *not* undefined behavior. In particular, an + /// invalid index value is likely to cause panics or possibly even silent + /// logical errors. + /// + /// Callers must never rely on a `SmallIndex` to be within a certain range + /// for memory safety. + #[inline] + pub const fn new_unchecked(index: usize) -> SmallIndex { + // FIXME: Use as_u32() once const functions in traits are stable. + SmallIndex(index as u32) + } + + /// Like [`SmallIndex::new`], but panics if the given index is not valid. + #[inline] + pub fn must(index: usize) -> SmallIndex { + SmallIndex::new(index).expect("invalid small index") + } + + /// Return this small index as a `usize`. This is guaranteed to never + /// overflow `usize`. + #[inline] + pub const fn as_usize(&self) -> usize { + // FIXME: Use as_usize() once const functions in traits are stable. + self.0 as usize + } + + /// Return this small index as a `u64`. This is guaranteed to never + /// overflow. + #[inline] + pub const fn as_u64(&self) -> u64 { + // FIXME: Use u64::from() once const functions in traits are stable. + self.0 as u64 + } + + /// Return the internal `u32` of this small index. This is guaranteed to + /// never overflow `u32`. + #[inline] + pub const fn as_u32(&self) -> u32 { + self.0 + } + + /// Return the internal `u32` of this small index represented as an `i32`. + /// This is guaranteed to never overflow an `i32`. + #[inline] + pub const fn as_i32(&self) -> i32 { + // This is OK because we guarantee that our max value is <= i32::MAX. + self.0 as i32 + } + + /// Returns one more than this small index as a usize. + /// + /// Since a small index has constraints on its maximum value, adding `1` to + /// it will always fit in a `usize`, `u32` and a `i32`. + #[inline] + pub fn one_more(&self) -> usize { + self.as_usize() + 1 + } + + /// Decode this small index from the bytes given using the native endian + /// byte order for the current target. + /// + /// If the decoded integer is not representable as a small index for the + /// current target, then this returns an error. + #[inline] + pub fn from_ne_bytes( + bytes: [u8; 4], + ) -> Result<SmallIndex, SmallIndexError> { + let id = u32::from_ne_bytes(bytes); + if id > SmallIndex::MAX.as_u32() { + return Err(SmallIndexError { attempted: u64::from(id) }); + } + Ok(SmallIndex::new_unchecked(id.as_usize())) + } + + /// Decode this small index from the bytes given using the native endian + /// byte order for the current target. + /// + /// This is analogous to [`SmallIndex::new_unchecked`] in that is does not + /// check whether the decoded integer is representable as a small index. + #[inline] + pub fn from_ne_bytes_unchecked(bytes: [u8; 4]) -> SmallIndex { + SmallIndex::new_unchecked(u32::from_ne_bytes(bytes).as_usize()) + } + + /// Return the underlying small index integer as raw bytes in native endian + /// format. + #[inline] + pub fn to_ne_bytes(&self) -> [u8; 4] { + self.0.to_ne_bytes() + } +} + +impl<T> core::ops::Index<SmallIndex> for [T] { + type Output = T; + + #[inline] + fn index(&self, index: SmallIndex) -> &T { + &self[index.as_usize()] + } +} + +impl<T> core::ops::IndexMut<SmallIndex> for [T] { + #[inline] + fn index_mut(&mut self, index: SmallIndex) -> &mut T { + &mut self[index.as_usize()] + } +} + +#[cfg(feature = "alloc")] +impl<T> core::ops::Index<SmallIndex> for Vec<T> { + type Output = T; + + #[inline] + fn index(&self, index: SmallIndex) -> &T { + &self[index.as_usize()] + } +} + +#[cfg(feature = "alloc")] +impl<T> core::ops::IndexMut<SmallIndex> for Vec<T> { + #[inline] + fn index_mut(&mut self, index: SmallIndex) -> &mut T { + &mut self[index.as_usize()] + } +} + +impl From<u8> for SmallIndex { + fn from(index: u8) -> SmallIndex { + SmallIndex::new_unchecked(usize::from(index)) + } +} + +impl TryFrom<u16> for SmallIndex { + type Error = SmallIndexError; + + fn try_from(index: u16) -> Result<SmallIndex, SmallIndexError> { + if u32::from(index) > SmallIndex::MAX.as_u32() { + return Err(SmallIndexError { attempted: u64::from(index) }); + } + Ok(SmallIndex::new_unchecked(index.as_usize())) + } +} + +impl TryFrom<u32> for SmallIndex { + type Error = SmallIndexError; + + fn try_from(index: u32) -> Result<SmallIndex, SmallIndexError> { + if index > SmallIndex::MAX.as_u32() { + return Err(SmallIndexError { attempted: u64::from(index) }); + } + Ok(SmallIndex::new_unchecked(index.as_usize())) + } +} + +impl TryFrom<u64> for SmallIndex { + type Error = SmallIndexError; + + fn try_from(index: u64) -> Result<SmallIndex, SmallIndexError> { + if index > SmallIndex::MAX.as_u64() { + return Err(SmallIndexError { attempted: index }); + } + Ok(SmallIndex::new_unchecked(index.as_usize())) + } +} + +impl TryFrom<usize> for SmallIndex { + type Error = SmallIndexError; + + fn try_from(index: usize) -> Result<SmallIndex, SmallIndexError> { + if index > SmallIndex::MAX.as_usize() { + return Err(SmallIndexError { attempted: index.as_u64() }); + } + Ok(SmallIndex::new_unchecked(index)) + } +} + +#[cfg(test)] +impl quickcheck::Arbitrary for SmallIndex { + fn arbitrary(gen: &mut quickcheck::Gen) -> SmallIndex { + use core::cmp::max; + + let id = max(i32::MIN + 1, i32::arbitrary(gen)).abs(); + if id > SmallIndex::MAX.as_i32() { + SmallIndex::MAX + } else { + SmallIndex::new(usize::try_from(id).unwrap()).unwrap() + } + } +} + +/// This error occurs when a small index could not be constructed. +/// +/// This occurs when given an integer exceeding the maximum small index value. +/// +/// When the `std` feature is enabled, this implements the `Error` trait. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct SmallIndexError { + attempted: u64, +} + +impl SmallIndexError { + /// Returns the value that could not be converted to a small index. + pub fn attempted(&self) -> u64 { + self.attempted + } +} + +#[cfg(feature = "std")] +impl std::error::Error for SmallIndexError {} + +impl core::fmt::Display for SmallIndexError { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + write!( + f, + "failed to create small index from {:?}, which exceeds {:?}", + self.attempted(), + SmallIndex::MAX, + ) + } +} + +#[derive(Clone, Debug)] +pub(crate) struct SmallIndexIter { + rng: core::ops::Range<usize>, +} + +impl Iterator for SmallIndexIter { + type Item = SmallIndex; + + fn next(&mut self) -> Option<SmallIndex> { + if self.rng.start >= self.rng.end { + return None; + } + let next_id = self.rng.start + 1; + let id = core::mem::replace(&mut self.rng.start, next_id); + // new_unchecked is OK since we asserted that the number of + // elements in this iterator will fit in an ID at construction. + Some(SmallIndex::new_unchecked(id)) + } +} + +macro_rules! index_type_impls { + ($name:ident, $err:ident, $iter:ident, $withiter:ident) => { + impl $name { + /// The maximum value. + pub const MAX: $name = $name(SmallIndex::MAX); + + /// The total number of values that can be represented. + pub const LIMIT: usize = SmallIndex::LIMIT; + + /// The zero value. + pub const ZERO: $name = $name(SmallIndex::ZERO); + + /// The number of bytes that a single value uses in memory. + pub const SIZE: usize = SmallIndex::SIZE; + + /// Create a new value that is represented by a "small index." + /// + /// If the given index exceeds the maximum allowed value, then this + /// returns an error. + #[inline] + pub fn new(value: usize) -> Result<$name, $err> { + SmallIndex::new(value).map($name).map_err($err) + } + + /// Create a new value without checking whether the given argument + /// exceeds the maximum. + /// + /// Using this routine with an invalid value will result in + /// unspecified behavior, but *not* undefined behavior. In + /// particular, an invalid ID value is likely to cause panics or + /// possibly even silent logical errors. + /// + /// Callers must never rely on this type to be within a certain + /// range for memory safety. + #[inline] + pub const fn new_unchecked(value: usize) -> $name { + $name(SmallIndex::new_unchecked(value)) + } + + /// Like `new`, but panics if the given value is not valid. + #[inline] + pub fn must(value: usize) -> $name { + $name::new(value).expect(concat!( + "invalid ", + stringify!($name), + " value" + )) + } + + /// Return the internal value as a `usize`. This is guaranteed to + /// never overflow `usize`. + #[inline] + pub const fn as_usize(&self) -> usize { + self.0.as_usize() + } + + /// Return the internal value as a `u64`. This is guaranteed to + /// never overflow. + #[inline] + pub const fn as_u64(&self) -> u64 { + self.0.as_u64() + } + + /// Return the internal value as a `u32`. This is guaranteed to + /// never overflow `u32`. + #[inline] + pub const fn as_u32(&self) -> u32 { + self.0.as_u32() + } + + /// Return the internal value as a i32`. This is guaranteed to + /// never overflow an `i32`. + #[inline] + pub const fn as_i32(&self) -> i32 { + self.0.as_i32() + } + + /// Returns one more than this value as a usize. + /// + /// Since values represented by a "small index" have constraints + /// on their maximum value, adding `1` to it will always fit in a + /// `usize`, `u32` and a `i32`. + #[inline] + pub fn one_more(&self) -> usize { + self.0.one_more() + } + + /// Decode this value from the bytes given using the native endian + /// byte order for the current target. + /// + /// If the decoded integer is not representable as a small index + /// for the current target, then this returns an error. + #[inline] + pub fn from_ne_bytes(bytes: [u8; 4]) -> Result<$name, $err> { + SmallIndex::from_ne_bytes(bytes).map($name).map_err($err) + } + + /// Decode this value from the bytes given using the native endian + /// byte order for the current target. + /// + /// This is analogous to `new_unchecked` in that is does not check + /// whether the decoded integer is representable as a small index. + #[inline] + pub fn from_ne_bytes_unchecked(bytes: [u8; 4]) -> $name { + $name(SmallIndex::from_ne_bytes_unchecked(bytes)) + } + + /// Return the underlying integer as raw bytes in native endian + /// format. + #[inline] + pub fn to_ne_bytes(&self) -> [u8; 4] { + self.0.to_ne_bytes() + } + + /// Returns an iterator over all values from 0 up to and not + /// including the given length. + /// + /// If the given length exceeds this type's limit, then this + /// panics. + pub(crate) fn iter(len: usize) -> $iter { + $iter::new(len) + } + } + + // We write our own Debug impl so that we get things like PatternID(5) + // instead of PatternID(SmallIndex(5)). + impl core::fmt::Debug for $name { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + f.debug_tuple(stringify!($name)).field(&self.as_u32()).finish() + } + } + + impl<T> core::ops::Index<$name> for [T] { + type Output = T; + + #[inline] + fn index(&self, index: $name) -> &T { + &self[index.as_usize()] + } + } + + impl<T> core::ops::IndexMut<$name> for [T] { + #[inline] + fn index_mut(&mut self, index: $name) -> &mut T { + &mut self[index.as_usize()] + } + } + + #[cfg(feature = "alloc")] + impl<T> core::ops::Index<$name> for Vec<T> { + type Output = T; + + #[inline] + fn index(&self, index: $name) -> &T { + &self[index.as_usize()] + } + } + + #[cfg(feature = "alloc")] + impl<T> core::ops::IndexMut<$name> for Vec<T> { + #[inline] + fn index_mut(&mut self, index: $name) -> &mut T { + &mut self[index.as_usize()] + } + } + + impl From<u8> for $name { + fn from(value: u8) -> $name { + $name(SmallIndex::from(value)) + } + } + + impl TryFrom<u16> for $name { + type Error = $err; + + fn try_from(value: u16) -> Result<$name, $err> { + SmallIndex::try_from(value).map($name).map_err($err) + } + } + + impl TryFrom<u32> for $name { + type Error = $err; + + fn try_from(value: u32) -> Result<$name, $err> { + SmallIndex::try_from(value).map($name).map_err($err) + } + } + + impl TryFrom<u64> for $name { + type Error = $err; + + fn try_from(value: u64) -> Result<$name, $err> { + SmallIndex::try_from(value).map($name).map_err($err) + } + } + + impl TryFrom<usize> for $name { + type Error = $err; + + fn try_from(value: usize) -> Result<$name, $err> { + SmallIndex::try_from(value).map($name).map_err($err) + } + } + + #[cfg(test)] + impl quickcheck::Arbitrary for $name { + fn arbitrary(gen: &mut quickcheck::Gen) -> $name { + $name(SmallIndex::arbitrary(gen)) + } + } + + /// This error occurs when a value could not be constructed. + /// + /// This occurs when given an integer exceeding the maximum allowed + /// value. + /// + /// When the `std` feature is enabled, this implements the `Error` + /// trait. + #[derive(Clone, Debug, Eq, PartialEq)] + pub struct $err(SmallIndexError); + + impl $err { + /// Returns the value that could not be converted to an ID. + pub fn attempted(&self) -> u64 { + self.0.attempted() + } + } + + #[cfg(feature = "std")] + impl std::error::Error for $err {} + + impl core::fmt::Display for $err { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + write!( + f, + "failed to create {} from {:?}, which exceeds {:?}", + stringify!($name), + self.attempted(), + $name::MAX, + ) + } + } + + #[derive(Clone, Debug)] + pub(crate) struct $iter(SmallIndexIter); + + impl $iter { + fn new(len: usize) -> $iter { + assert!( + len <= $name::LIMIT, + "cannot create iterator for {} when number of \ + elements exceed {:?}", + stringify!($name), + $name::LIMIT, + ); + $iter(SmallIndexIter { rng: 0..len }) + } + } + + impl Iterator for $iter { + type Item = $name; + + fn next(&mut self) -> Option<$name> { + self.0.next().map($name) + } + } + + /// An iterator adapter that is like std::iter::Enumerate, but attaches + /// small index values instead. It requires `ExactSizeIterator`. At + /// construction, it ensures that the index of each element in the + /// iterator is representable in the corresponding small index type. + #[derive(Clone, Debug)] + pub(crate) struct $withiter<I> { + it: I, + ids: $iter, + } + + impl<I: Iterator + ExactSizeIterator> $withiter<I> { + fn new(it: I) -> $withiter<I> { + let ids = $name::iter(it.len()); + $withiter { it, ids } + } + } + + impl<I: Iterator + ExactSizeIterator> Iterator for $withiter<I> { + type Item = ($name, I::Item); + + fn next(&mut self) -> Option<($name, I::Item)> { + let item = self.it.next()?; + // Number of elements in this iterator must match, according + // to contract of ExactSizeIterator. + let id = self.ids.next().unwrap(); + Some((id, item)) + } + } + }; +} + +/// The identifier of a regex pattern, represented by a [`SmallIndex`]. +/// +/// The identifier for a pattern corresponds to its relative position among +/// other patterns in a single finite state machine. Namely, when building +/// a multi-pattern regex engine, one must supply a sequence of patterns to +/// match. The position (starting at 0) of each pattern in that sequence +/// represents its identifier. This identifier is in turn used to identify and +/// report matches of that pattern in various APIs. +/// +/// See the [`SmallIndex`] type for more information about what it means for +/// a pattern ID to be a "small index." +/// +/// Note that this type is defined in the +/// [`util::primitives`](crate::util::primitives) module, but it is also +/// re-exported at the crate root due to how common it is. +#[derive(Clone, Copy, Default, Eq, Hash, PartialEq, PartialOrd, Ord)] +#[repr(transparent)] +pub struct PatternID(SmallIndex); + +/// The identifier of a finite automaton state, represented by a +/// [`SmallIndex`]. +/// +/// Most regex engines in this crate are built on top of finite automata. Each +/// state in a finite automaton defines transitions from its state to another. +/// Those transitions point to other states via their identifiers, i.e., a +/// `StateID`. Since finite automata tend to contain many transitions, it is +/// much more memory efficient to define state IDs as small indices. +/// +/// See the [`SmallIndex`] type for more information about what it means for +/// a state ID to be a "small index." +#[derive(Clone, Copy, Default, Eq, Hash, PartialEq, PartialOrd, Ord)] +#[repr(transparent)] +pub struct StateID(SmallIndex); + +index_type_impls!(PatternID, PatternIDError, PatternIDIter, WithPatternIDIter); +index_type_impls!(StateID, StateIDError, StateIDIter, WithStateIDIter); + +/// A utility trait that defines a couple of adapters for making it convenient +/// to access indices as "small index" types. We require ExactSizeIterator so +/// that iterator construction can do a single check to make sure the index of +/// each element is representable by its small index type. +pub(crate) trait IteratorIndexExt: Iterator { + fn with_pattern_ids(self) -> WithPatternIDIter<Self> + where + Self: Sized + ExactSizeIterator, + { + WithPatternIDIter::new(self) + } + + fn with_state_ids(self) -> WithStateIDIter<Self> + where + Self: Sized + ExactSizeIterator, + { + WithStateIDIter::new(self) + } +} + +impl<I: Iterator> IteratorIndexExt for I {} diff --git a/third_party/rust/regex-automata/src/util/search.rs b/third_party/rust/regex-automata/src/util/search.rs new file mode 100644 index 0000000000..39aec522be --- /dev/null +++ b/third_party/rust/regex-automata/src/util/search.rs @@ -0,0 +1,1969 @@ +/*! +Types and routines that support the search APIs of most regex engines. + +This sub-module isn't exposed directly, but rather, its contents are exported +at the crate root due to the universality of most of the types and routines in +this module. +*/ + +use core::ops::{Range, RangeBounds}; + +use crate::util::{escape::DebugByte, primitives::PatternID, utf8}; + +/// The parameters for a regex search including the haystack to search. +/// +/// It turns out that regex searches have a few parameters, and in most cases, +/// those parameters have defaults that work in the vast majority of cases. +/// This `Input` type exists to make that common case seamnless while also +/// providing an avenue for changing the parameters of a search. In particular, +/// this type enables doing so without a combinatorial explosion of different +/// methods and/or superfluous parameters in the common cases. +/// +/// An `Input` permits configuring the following things: +/// +/// * Search only a substring of a haystack, while taking the broader context +/// into account for resolving look-around assertions. +/// * Indicating whether to search for all patterns in a regex, or to +/// only search for one pattern in particular. +/// * Whether to perform an anchored on unanchored search. +/// * Whether to report a match as early as possible. +/// +/// All of these parameters, except for the haystack, have sensible default +/// values. This means that the minimal search configuration is simply a call +/// to [`Input::new`] with your haystack. Setting any other parameter is +/// optional. +/// +/// Moreover, for any `H` that implements `AsRef<[u8]>`, there exists a +/// `From<H> for Input` implementation. This is useful because many of the +/// search APIs in this crate accept an `Into<Input>`. This means you can +/// provide string or byte strings to these routines directly, and they'll +/// automatically get converted into an `Input` for you. +/// +/// The lifetime parameter `'h` refers to the lifetime of the haystack. +/// +/// # Organization +/// +/// The API of `Input` is split into a few different parts: +/// +/// * A builder-like API that transforms a `Input` by value. Examples: +/// [`Input::span`] and [`Input::anchored`]. +/// * A setter API that permits mutating parameters in place. Examples: +/// [`Input::set_span`] and [`Input::set_anchored`]. +/// * A getter API that permits retrieving any of the search parameters. +/// Examples: [`Input::get_span`] and [`Input::get_anchored`]. +/// * A few convenience getter routines that don't conform to the above naming +/// pattern due to how common they are. Examples: [`Input::haystack`], +/// [`Input::start`] and [`Input::end`]. +/// * Miscellaneous predicates and other helper routines that are useful +/// in some contexts. Examples: [`Input::is_char_boundary`]. +/// +/// A `Input` exposes so much because it is meant to be used by both callers of +/// regex engines _and_ implementors of regex engines. A constraining factor is +/// that regex engines should accept a `&Input` as its lowest level API, which +/// means that implementors should only use the "getter" APIs of a `Input`. +/// +/// # Valid bounds and search termination +/// +/// An `Input` permits setting the bounds of a search via either +/// [`Input::span`] or [`Input::range`]. The bounds set must be valid, or +/// else a panic will occur. Bounds are valid if and only if: +/// +/// * The bounds represent a valid range into the input's haystack. +/// * **or** the end bound is a valid ending bound for the haystack *and* +/// the start bound is exactly one greater than the start bound. +/// +/// In the latter case, [`Input::is_done`] will return true and indicates any +/// search receiving such an input should immediately return with no match. +/// +/// Note that while `Input` is used for reverse searches in this crate, the +/// `Input::is_done` predicate assumes a forward search. Because unsigned +/// offsets are used internally, there is no way to tell from only the offsets +/// whether a reverse search is done or not. +/// +/// # Regex engine support +/// +/// Any regex engine accepting an `Input` must support at least the following +/// things: +/// +/// * Searching a `&[u8]` for matches. +/// * Searching a substring of `&[u8]` for a match, such that any match +/// reported must appear entirely within that substring. +/// * For a forwards search, a match should never be reported when +/// [`Input::is_done`] returns true. (For reverse searches, termination should +/// be handled outside of `Input`.) +/// +/// Supporting other aspects of an `Input` are optional, but regex engines +/// should handle aspects they don't support gracefully. How this is done is +/// generally up to the regex engine. This crate generally treats unsupported +/// anchored modes as an error to report for example, but for simplicity, in +/// the meta regex engine, trying to search with an invalid pattern ID just +/// results in no match being reported. +#[derive(Clone)] +pub struct Input<'h> { + haystack: &'h [u8], + span: Span, + anchored: Anchored, + earliest: bool, +} + +impl<'h> Input<'h> { + /// Create a new search configuration for the given haystack. + #[inline] + pub fn new<H: ?Sized + AsRef<[u8]>>(haystack: &'h H) -> Input<'h> { + Input { + haystack: haystack.as_ref(), + span: Span { start: 0, end: haystack.as_ref().len() }, + anchored: Anchored::No, + earliest: false, + } + } + + /// Set the span for this search. + /// + /// This routine does not panic if the span given is not a valid range for + /// this search's haystack. If this search is run with an invalid range, + /// then the most likely outcome is that the actual search execution will + /// panic. + /// + /// This routine is generic over how a span is provided. While + /// a [`Span`] may be given directly, one may also provide a + /// `std::ops::Range<usize>`. To provide anything supported by range + /// syntax, use the [`Input::range`] method. + /// + /// The default span is the entire haystack. + /// + /// Note that [`Input::range`] overrides this method and vice versa. + /// + /// # Panics + /// + /// This panics if the given span does not correspond to valid bounds in + /// the haystack or the termination of a search. + /// + /// # Example + /// + /// This example shows how the span of the search can impact whether a + /// match is reported or not. This is particularly relevant for look-around + /// operators, which might take things outside of the span into account + /// when determining whether they match. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{ + /// nfa::thompson::pikevm::PikeVM, + /// Match, Input, + /// }; + /// + /// // Look for 'at', but as a distinct word. + /// let re = PikeVM::new(r"\bat\b")?; + /// let mut cache = re.create_cache(); + /// let mut caps = re.create_captures(); + /// + /// // Our haystack contains 'at', but not as a distinct word. + /// let haystack = "batter"; + /// + /// // A standard search finds nothing, as expected. + /// let input = Input::new(haystack); + /// re.search(&mut cache, &input, &mut caps); + /// assert_eq!(None, caps.get_match()); + /// + /// // But if we wanted to search starting at position '1', we might + /// // slice the haystack. If we do this, it's impossible for the \b + /// // anchors to take the surrounding context into account! And thus, + /// // a match is produced. + /// let input = Input::new(&haystack[1..3]); + /// re.search(&mut cache, &input, &mut caps); + /// assert_eq!(Some(Match::must(0, 0..2)), caps.get_match()); + /// + /// // But if we specify the span of the search instead of slicing the + /// // haystack, then the regex engine can "see" outside of the span + /// // and resolve the anchors correctly. + /// let input = Input::new(haystack).span(1..3); + /// re.search(&mut cache, &input, &mut caps); + /// assert_eq!(None, caps.get_match()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// This may seem a little ham-fisted, but this scenario tends to come up + /// if some other regex engine found the match span and now you need to + /// re-process that span to look for capturing groups. (e.g., Run a faster + /// DFA first, find a match, then run the PikeVM on just the match span to + /// resolve capturing groups.) In order to implement that sort of logic + /// correctly, you need to set the span on the search instead of slicing + /// the haystack directly. + /// + /// The other advantage of using this routine to specify the bounds of the + /// search is that the match offsets are still reported in terms of the + /// original haystack. For example, the second search in the example above + /// reported a match at position `0`, even though `at` starts at offset + /// `1` because we sliced the haystack. + #[inline] + pub fn span<S: Into<Span>>(mut self, span: S) -> Input<'h> { + self.set_span(span); + self + } + + /// Like `Input::span`, but accepts any range instead. + /// + /// This routine does not panic if the range given is not a valid range for + /// this search's haystack. If this search is run with an invalid range, + /// then the most likely outcome is that the actual search execution will + /// panic. + /// + /// The default range is the entire haystack. + /// + /// Note that [`Input::span`] overrides this method and vice versa. + /// + /// # Panics + /// + /// This routine will panic if the given range could not be converted + /// to a valid [`Range`]. For example, this would panic when given + /// `0..=usize::MAX` since it cannot be represented using a half-open + /// interval in terms of `usize`. + /// + /// This also panics if the given range does not correspond to valid bounds + /// in the haystack or the termination of a search. + /// + /// # Example + /// + /// ``` + /// use regex_automata::Input; + /// + /// let input = Input::new("foobar"); + /// assert_eq!(0..6, input.get_range()); + /// + /// let input = Input::new("foobar").range(2..=4); + /// assert_eq!(2..5, input.get_range()); + /// ``` + #[inline] + pub fn range<R: RangeBounds<usize>>(mut self, range: R) -> Input<'h> { + self.set_range(range); + self + } + + /// Sets the anchor mode of a search. + /// + /// When a search is anchored (so that's [`Anchored::Yes`] or + /// [`Anchored::Pattern`]), a match must begin at the start of a search. + /// When a search is not anchored (that's [`Anchored::No`]), regex engines + /// will behave as if the pattern started with a `(?s-u:.)*?`. This prefix + /// permits a match to appear anywhere. + /// + /// By default, the anchored mode is [`Anchored::No`]. + /// + /// **WARNING:** this is subtly different than using a `^` at the start of + /// your regex. A `^` forces a regex to match exclusively at the start of + /// a haystack, regardless of where you begin your search. In contrast, + /// anchoring a search will allow your regex to match anywhere in your + /// haystack, but the match must start at the beginning of a search. + /// + /// For example, consider the haystack `aba` and the following searches: + /// + /// 1. The regex `^a` is compiled with `Anchored::No` and searches `aba` + /// starting at position `2`. Since `^` requires the match to start at + /// the beginning of the haystack and `2 > 0`, no match is found. + /// 2. The regex `a` is compiled with `Anchored::Yes` and searches `aba` + /// starting at position `2`. This reports a match at `[2, 3]` since + /// the match starts where the search started. Since there is no `^`, + /// there is no requirement for the match to start at the beginning of + /// the haystack. + /// 3. The regex `a` is compiled with `Anchored::Yes` and searches `aba` + /// starting at position `1`. Since `b` corresponds to position `1` and + /// since the search is anchored, it finds no match. While the regex + /// matches at other positions, configuring the search to be anchored + /// requires that it only report a match that begins at the same offset + /// as the beginning of the search. + /// 4. The regex `a` is compiled with `Anchored::No` and searches `aba` + /// starting at position `1`. Since the search is not anchored and + /// the regex does not start with `^`, the search executes as if there + /// is a `(?s:.)*?` prefix that permits it to match anywhere. Thus, it + /// reports a match at `[2, 3]`. + /// + /// Note that the [`Anchored::Pattern`] mode is like `Anchored::Yes`, + /// except it only reports matches for a particular pattern. + /// + /// # Example + /// + /// This demonstrates the differences between an anchored search and + /// a pattern that begins with `^` (as described in the above warning + /// message). + /// + /// ``` + /// use regex_automata::{ + /// nfa::thompson::pikevm::PikeVM, + /// Anchored, Match, Input, + /// }; + /// + /// let haystack = "aba"; + /// + /// let re = PikeVM::new(r"^a")?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// let input = Input::new(haystack).span(2..3).anchored(Anchored::No); + /// re.search(&mut cache, &input, &mut caps); + /// // No match is found because 2 is not the beginning of the haystack, + /// // which is what ^ requires. + /// assert_eq!(None, caps.get_match()); + /// + /// let re = PikeVM::new(r"a")?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// let input = Input::new(haystack).span(2..3).anchored(Anchored::Yes); + /// re.search(&mut cache, &input, &mut caps); + /// // An anchored search can still match anywhere in the haystack, it just + /// // must begin at the start of the search which is '2' in this case. + /// assert_eq!(Some(Match::must(0, 2..3)), caps.get_match()); + /// + /// let re = PikeVM::new(r"a")?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// let input = Input::new(haystack).span(1..3).anchored(Anchored::Yes); + /// re.search(&mut cache, &input, &mut caps); + /// // No match is found since we start searching at offset 1 which + /// // corresponds to 'b'. Since there is no '(?s:.)*?' prefix, no match + /// // is found. + /// assert_eq!(None, caps.get_match()); + /// + /// let re = PikeVM::new(r"a")?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// let input = Input::new(haystack).span(1..3).anchored(Anchored::No); + /// re.search(&mut cache, &input, &mut caps); + /// // Since anchored=no, an implicit '(?s:.)*?' prefix was added to the + /// // pattern. Even though the search starts at 'b', the 'match anything' + /// // prefix allows the search to match 'a'. + /// let expected = Some(Match::must(0, 2..3)); + /// assert_eq!(expected, caps.get_match()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn anchored(mut self, mode: Anchored) -> Input<'h> { + self.set_anchored(mode); + self + } + + /// Whether to execute an "earliest" search or not. + /// + /// When running a non-overlapping search, an "earliest" search will return + /// the match location as early as possible. For example, given a pattern + /// of `foo[0-9]+` and a haystack of `foo12345`, a normal leftmost search + /// will return `foo12345` as a match. But an "earliest" search for regex + /// engines that support "earliest" semantics will return `foo1` as a + /// match, since as soon as the first digit following `foo` is seen, it is + /// known to have found a match. + /// + /// Note that "earliest" semantics generally depend on the regex engine. + /// Different regex engines may determine there is a match at different + /// points. So there is no guarantee that "earliest" matches will always + /// return the same offsets for all regex engines. The "earliest" notion + /// is really about when the particular regex engine determines there is + /// a match rather than a consistent semantic unto itself. This is often + /// useful for implementing "did a match occur or not" predicates, but + /// sometimes the offset is useful as well. + /// + /// This is disabled by default. + /// + /// # Example + /// + /// This example shows the difference between "earliest" searching and + /// normal searching. + /// + /// ``` + /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match, Input}; + /// + /// let re = PikeVM::new(r"foo[0-9]+")?; + /// let mut cache = re.create_cache(); + /// let mut caps = re.create_captures(); + /// + /// // A normal search implements greediness like you expect. + /// let input = Input::new("foo12345"); + /// re.search(&mut cache, &input, &mut caps); + /// assert_eq!(Some(Match::must(0, 0..8)), caps.get_match()); + /// + /// // When 'earliest' is enabled and the regex engine supports + /// // it, the search will bail once it knows a match has been + /// // found. + /// let input = Input::new("foo12345").earliest(true); + /// re.search(&mut cache, &input, &mut caps); + /// assert_eq!(Some(Match::must(0, 0..4)), caps.get_match()); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn earliest(mut self, yes: bool) -> Input<'h> { + self.set_earliest(yes); + self + } + + /// Set the span for this search configuration. + /// + /// This is like the [`Input::span`] method, except this mutates the + /// span in place. + /// + /// This routine is generic over how a span is provided. While + /// a [`Span`] may be given directly, one may also provide a + /// `std::ops::Range<usize>`. + /// + /// # Panics + /// + /// This panics if the given span does not correspond to valid bounds in + /// the haystack or the termination of a search. + /// + /// # Example + /// + /// ``` + /// use regex_automata::Input; + /// + /// let mut input = Input::new("foobar"); + /// assert_eq!(0..6, input.get_range()); + /// input.set_span(2..4); + /// assert_eq!(2..4, input.get_range()); + /// ``` + #[inline] + pub fn set_span<S: Into<Span>>(&mut self, span: S) { + let span = span.into(); + assert!( + span.end <= self.haystack.len() + && span.start <= span.end.wrapping_add(1), + "invalid span {:?} for haystack of length {}", + span, + self.haystack.len(), + ); + self.span = span; + } + + /// Set the span for this search configuration given any range. + /// + /// This is like the [`Input::range`] method, except this mutates the + /// span in place. + /// + /// This routine does not panic if the range given is not a valid range for + /// this search's haystack. If this search is run with an invalid range, + /// then the most likely outcome is that the actual search execution will + /// panic. + /// + /// # Panics + /// + /// This routine will panic if the given range could not be converted + /// to a valid [`Range`]. For example, this would panic when given + /// `0..=usize::MAX` since it cannot be represented using a half-open + /// interval in terms of `usize`. + /// + /// This also panics if the given span does not correspond to valid bounds + /// in the haystack or the termination of a search. + /// + /// # Example + /// + /// ``` + /// use regex_automata::Input; + /// + /// let mut input = Input::new("foobar"); + /// assert_eq!(0..6, input.get_range()); + /// input.set_range(2..=4); + /// assert_eq!(2..5, input.get_range()); + /// ``` + #[inline] + pub fn set_range<R: RangeBounds<usize>>(&mut self, range: R) { + use core::ops::Bound; + + // It's a little weird to convert ranges into spans, and then spans + // back into ranges when we actually slice the haystack. Because + // of that process, we always represent everything as a half-open + // internal. Therefore, handling things like m..=n is a little awkward. + let start = match range.start_bound() { + Bound::Included(&i) => i, + // Can this case ever happen? Range syntax doesn't support it... + Bound::Excluded(&i) => i.checked_add(1).unwrap(), + Bound::Unbounded => 0, + }; + let end = match range.end_bound() { + Bound::Included(&i) => i.checked_add(1).unwrap(), + Bound::Excluded(&i) => i, + Bound::Unbounded => self.haystack().len(), + }; + self.set_span(Span { start, end }); + } + + /// Set the starting offset for the span for this search configuration. + /// + /// This is a convenience routine for only mutating the start of a span + /// without having to set the entire span. + /// + /// # Panics + /// + /// This panics if the span resulting from the new start position does not + /// correspond to valid bounds in the haystack or the termination of a + /// search. + /// + /// # Example + /// + /// ``` + /// use regex_automata::Input; + /// + /// let mut input = Input::new("foobar"); + /// assert_eq!(0..6, input.get_range()); + /// input.set_start(5); + /// assert_eq!(5..6, input.get_range()); + /// ``` + #[inline] + pub fn set_start(&mut self, start: usize) { + self.set_span(Span { start, ..self.get_span() }); + } + + /// Set the ending offset for the span for this search configuration. + /// + /// This is a convenience routine for only mutating the end of a span + /// without having to set the entire span. + /// + /// # Panics + /// + /// This panics if the span resulting from the new end position does not + /// correspond to valid bounds in the haystack or the termination of a + /// search. + /// + /// # Example + /// + /// ``` + /// use regex_automata::Input; + /// + /// let mut input = Input::new("foobar"); + /// assert_eq!(0..6, input.get_range()); + /// input.set_end(5); + /// assert_eq!(0..5, input.get_range()); + /// ``` + #[inline] + pub fn set_end(&mut self, end: usize) { + self.set_span(Span { end, ..self.get_span() }); + } + + /// Set the anchor mode of a search. + /// + /// This is like [`Input::anchored`], except it mutates the search + /// configuration in place. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{Anchored, Input, PatternID}; + /// + /// let mut input = Input::new("foobar"); + /// assert_eq!(Anchored::No, input.get_anchored()); + /// + /// let pid = PatternID::must(5); + /// input.set_anchored(Anchored::Pattern(pid)); + /// assert_eq!(Anchored::Pattern(pid), input.get_anchored()); + /// ``` + #[inline] + pub fn set_anchored(&mut self, mode: Anchored) { + self.anchored = mode; + } + + /// Set whether the search should execute in "earliest" mode or not. + /// + /// This is like [`Input::earliest`], except it mutates the search + /// configuration in place. + /// + /// # Example + /// + /// ``` + /// use regex_automata::Input; + /// + /// let mut input = Input::new("foobar"); + /// assert!(!input.get_earliest()); + /// input.set_earliest(true); + /// assert!(input.get_earliest()); + /// ``` + #[inline] + pub fn set_earliest(&mut self, yes: bool) { + self.earliest = yes; + } + + /// Return a borrow of the underlying haystack as a slice of bytes. + /// + /// # Example + /// + /// ``` + /// use regex_automata::Input; + /// + /// let input = Input::new("foobar"); + /// assert_eq!(b"foobar", input.haystack()); + /// ``` + #[inline] + pub fn haystack(&self) -> &[u8] { + self.haystack + } + + /// Return the start position of this search. + /// + /// This is a convenience routine for `search.get_span().start()`. + /// + /// When [`Input::is_done`] is `false`, this is guaranteed to return + /// an offset that is less than or equal to [`Input::end`]. Otherwise, + /// the offset is one greater than [`Input::end`]. + /// + /// # Example + /// + /// ``` + /// use regex_automata::Input; + /// + /// let input = Input::new("foobar"); + /// assert_eq!(0, input.start()); + /// + /// let input = Input::new("foobar").span(2..4); + /// assert_eq!(2, input.start()); + /// ``` + #[inline] + pub fn start(&self) -> usize { + self.get_span().start + } + + /// Return the end position of this search. + /// + /// This is a convenience routine for `search.get_span().end()`. + /// + /// This is guaranteed to return an offset that is a valid exclusive end + /// bound for this input's haystack. + /// + /// # Example + /// + /// ``` + /// use regex_automata::Input; + /// + /// let input = Input::new("foobar"); + /// assert_eq!(6, input.end()); + /// + /// let input = Input::new("foobar").span(2..4); + /// assert_eq!(4, input.end()); + /// ``` + #[inline] + pub fn end(&self) -> usize { + self.get_span().end + } + + /// Return the span for this search configuration. + /// + /// If one was not explicitly set, then the span corresponds to the entire + /// range of the haystack. + /// + /// When [`Input::is_done`] is `false`, the span returned is guaranteed + /// to correspond to valid bounds for this input's haystack. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{Input, Span}; + /// + /// let input = Input::new("foobar"); + /// assert_eq!(Span { start: 0, end: 6 }, input.get_span()); + /// ``` + #[inline] + pub fn get_span(&self) -> Span { + self.span + } + + /// Return the span as a range for this search configuration. + /// + /// If one was not explicitly set, then the span corresponds to the entire + /// range of the haystack. + /// + /// When [`Input::is_done`] is `false`, the range returned is guaranteed + /// to correspond to valid bounds for this input's haystack. + /// + /// # Example + /// + /// ``` + /// use regex_automata::Input; + /// + /// let input = Input::new("foobar"); + /// assert_eq!(0..6, input.get_range()); + /// ``` + #[inline] + pub fn get_range(&self) -> Range<usize> { + self.get_span().range() + } + + /// Return the anchored mode for this search configuration. + /// + /// If no anchored mode was set, then it defaults to [`Anchored::No`]. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{Anchored, Input, PatternID}; + /// + /// let mut input = Input::new("foobar"); + /// assert_eq!(Anchored::No, input.get_anchored()); + /// + /// let pid = PatternID::must(5); + /// input.set_anchored(Anchored::Pattern(pid)); + /// assert_eq!(Anchored::Pattern(pid), input.get_anchored()); + /// ``` + #[inline] + pub fn get_anchored(&self) -> Anchored { + self.anchored + } + + /// Return whether this search should execute in "earliest" mode. + /// + /// # Example + /// + /// ``` + /// use regex_automata::Input; + /// + /// let input = Input::new("foobar"); + /// assert!(!input.get_earliest()); + /// ``` + #[inline] + pub fn get_earliest(&self) -> bool { + self.earliest + } + + /// Return true if and only if this search can never return any other + /// matches. + /// + /// This occurs when the start position of this search is greater than the + /// end position of the search. + /// + /// # Example + /// + /// ``` + /// use regex_automata::Input; + /// + /// let mut input = Input::new("foobar"); + /// assert!(!input.is_done()); + /// input.set_start(6); + /// assert!(!input.is_done()); + /// input.set_start(7); + /// assert!(input.is_done()); + /// ``` + #[inline] + pub fn is_done(&self) -> bool { + self.get_span().start > self.get_span().end + } + + /// Returns true if and only if the given offset in this search's haystack + /// falls on a valid UTF-8 encoded codepoint boundary. + /// + /// If the haystack is not valid UTF-8, then the behavior of this routine + /// is unspecified. + /// + /// # Example + /// + /// This shows where codepoint boundaries do and don't exist in valid + /// UTF-8. + /// + /// ``` + /// use regex_automata::Input; + /// + /// let input = Input::new("☃"); + /// assert!(input.is_char_boundary(0)); + /// assert!(!input.is_char_boundary(1)); + /// assert!(!input.is_char_boundary(2)); + /// assert!(input.is_char_boundary(3)); + /// assert!(!input.is_char_boundary(4)); + /// ``` + #[inline] + pub fn is_char_boundary(&self, offset: usize) -> bool { + utf8::is_boundary(self.haystack(), offset) + } +} + +impl<'h> core::fmt::Debug for Input<'h> { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + use crate::util::escape::DebugHaystack; + + f.debug_struct("Input") + .field("haystack", &DebugHaystack(self.haystack())) + .field("span", &self.span) + .field("anchored", &self.anchored) + .field("earliest", &self.earliest) + .finish() + } +} + +impl<'h, H: ?Sized + AsRef<[u8]>> From<&'h H> for Input<'h> { + fn from(haystack: &'h H) -> Input<'h> { + Input::new(haystack) + } +} + +/// A representation of a span reported by a regex engine. +/// +/// A span corresponds to the starting and ending _byte offsets_ of a +/// contiguous region of bytes. The starting offset is inclusive while the +/// ending offset is exclusive. That is, a span is a half-open interval. +/// +/// A span is used to report the offsets of a match, but it is also used to +/// convey which region of a haystack should be searched via routines like +/// [`Input::span`]. +/// +/// This is basically equivalent to a `std::ops::Range<usize>`, except this +/// type implements `Copy` which makes it more ergonomic to use in the context +/// of this crate. Like a range, this implements `Index` for `[u8]` and `str`, +/// and `IndexMut` for `[u8]`. For convenience, this also impls `From<Range>`, +/// which means things like `Span::from(5..10)` work. +#[derive(Clone, Copy, Eq, Hash, PartialEq)] +pub struct Span { + /// The start offset of the span, inclusive. + pub start: usize, + /// The end offset of the span, exclusive. + pub end: usize, +} + +impl Span { + /// Returns this span as a range. + #[inline] + pub fn range(&self) -> Range<usize> { + Range::from(*self) + } + + /// Returns true when this span is empty. That is, when `start >= end`. + #[inline] + pub fn is_empty(&self) -> bool { + self.start >= self.end + } + + /// Returns the length of this span. + /// + /// This returns `0` in precisely the cases that `is_empty` returns `true`. + #[inline] + pub fn len(&self) -> usize { + self.end.saturating_sub(self.start) + } + + /// Returns true when the given offset is contained within this span. + /// + /// Note that an empty span contains no offsets and will always return + /// false. + #[inline] + pub fn contains(&self, offset: usize) -> bool { + !self.is_empty() && self.start <= offset && offset <= self.end + } + + /// Returns a new span with `offset` added to this span's `start` and `end` + /// values. + #[inline] + pub fn offset(&self, offset: usize) -> Span { + Span { start: self.start + offset, end: self.end + offset } + } +} + +impl core::fmt::Debug for Span { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + write!(f, "{}..{}", self.start, self.end) + } +} + +impl core::ops::Index<Span> for [u8] { + type Output = [u8]; + + #[inline] + fn index(&self, index: Span) -> &[u8] { + &self[index.range()] + } +} + +impl core::ops::IndexMut<Span> for [u8] { + #[inline] + fn index_mut(&mut self, index: Span) -> &mut [u8] { + &mut self[index.range()] + } +} + +impl core::ops::Index<Span> for str { + type Output = str; + + #[inline] + fn index(&self, index: Span) -> &str { + &self[index.range()] + } +} + +impl From<Range<usize>> for Span { + #[inline] + fn from(range: Range<usize>) -> Span { + Span { start: range.start, end: range.end } + } +} + +impl From<Span> for Range<usize> { + #[inline] + fn from(span: Span) -> Range<usize> { + Range { start: span.start, end: span.end } + } +} + +impl PartialEq<Range<usize>> for Span { + #[inline] + fn eq(&self, range: &Range<usize>) -> bool { + self.start == range.start && self.end == range.end + } +} + +impl PartialEq<Span> for Range<usize> { + #[inline] + fn eq(&self, span: &Span) -> bool { + self.start == span.start && self.end == span.end + } +} + +/// A representation of "half" of a match reported by a DFA. +/// +/// This is called a "half" match because it only includes the end location (or +/// start location for a reverse search) of a match. This corresponds to the +/// information that a single DFA scan can report. Getting the other half of +/// the match requires a second scan with a reversed DFA. +/// +/// A half match also includes the pattern that matched. The pattern is +/// identified by an ID, which corresponds to its position (starting from `0`) +/// relative to other patterns used to construct the corresponding DFA. If only +/// a single pattern is provided to the DFA, then all matches are guaranteed to +/// have a pattern ID of `0`. +#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] +pub struct HalfMatch { + /// The pattern ID. + pattern: PatternID, + /// The offset of the match. + /// + /// For forward searches, the offset is exclusive. For reverse searches, + /// the offset is inclusive. + offset: usize, +} + +impl HalfMatch { + /// Create a new half match from a pattern ID and a byte offset. + #[inline] + pub fn new(pattern: PatternID, offset: usize) -> HalfMatch { + HalfMatch { pattern, offset } + } + + /// Create a new half match from a pattern ID and a byte offset. + /// + /// This is like [`HalfMatch::new`], but accepts a `usize` instead of a + /// [`PatternID`]. This panics if the given `usize` is not representable + /// as a `PatternID`. + #[inline] + pub fn must(pattern: usize, offset: usize) -> HalfMatch { + HalfMatch::new(PatternID::new(pattern).unwrap(), offset) + } + + /// Returns the ID of the pattern that matched. + /// + /// The ID of a pattern is derived from the position in which it was + /// originally inserted into the corresponding DFA. The first pattern has + /// identifier `0`, and each subsequent pattern is `1`, `2` and so on. + #[inline] + pub fn pattern(&self) -> PatternID { + self.pattern + } + + /// The position of the match. + /// + /// If this match was produced by a forward search, then the offset is + /// exclusive. If this match was produced by a reverse search, then the + /// offset is inclusive. + #[inline] + pub fn offset(&self) -> usize { + self.offset + } +} + +/// A representation of a match reported by a regex engine. +/// +/// A match has two essential pieces of information: the [`PatternID`] that +/// matches, and the [`Span`] of the match in a haystack. +/// +/// The pattern is identified by an ID, which corresponds to its position +/// (starting from `0`) relative to other patterns used to construct the +/// corresponding regex engine. If only a single pattern is provided, then all +/// matches are guaranteed to have a pattern ID of `0`. +/// +/// Every match reported by a regex engine guarantees that its span has its +/// start offset as less than or equal to its end offset. +#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] +pub struct Match { + /// The pattern ID. + pattern: PatternID, + /// The underlying match span. + span: Span, +} + +impl Match { + /// Create a new match from a pattern ID and a span. + /// + /// This constructor is generic over how a span is provided. While + /// a [`Span`] may be given directly, one may also provide a + /// `std::ops::Range<usize>`. + /// + /// # Panics + /// + /// This panics if `end < start`. + /// + /// # Example + /// + /// This shows how to create a match for the first pattern in a regex + /// object using convenient range syntax. + /// + /// ``` + /// use regex_automata::{Match, PatternID}; + /// + /// let m = Match::new(PatternID::ZERO, 5..10); + /// assert_eq!(0, m.pattern().as_usize()); + /// assert_eq!(5, m.start()); + /// assert_eq!(10, m.end()); + /// ``` + #[inline] + pub fn new<S: Into<Span>>(pattern: PatternID, span: S) -> Match { + let span: Span = span.into(); + assert!(span.start <= span.end, "invalid match span"); + Match { pattern, span } + } + + /// Create a new match from a pattern ID and a byte offset span. + /// + /// This constructor is generic over how a span is provided. While + /// a [`Span`] may be given directly, one may also provide a + /// `std::ops::Range<usize>`. + /// + /// This is like [`Match::new`], but accepts a `usize` instead of a + /// [`PatternID`]. This panics if the given `usize` is not representable + /// as a `PatternID`. + /// + /// # Panics + /// + /// This panics if `end < start` or if `pattern > PatternID::MAX`. + /// + /// # Example + /// + /// This shows how to create a match for the third pattern in a regex + /// object using convenient range syntax. + /// + /// ``` + /// use regex_automata::Match; + /// + /// let m = Match::must(3, 5..10); + /// assert_eq!(3, m.pattern().as_usize()); + /// assert_eq!(5, m.start()); + /// assert_eq!(10, m.end()); + /// ``` + #[inline] + pub fn must<S: Into<Span>>(pattern: usize, span: S) -> Match { + Match::new(PatternID::must(pattern), span) + } + + /// Returns the ID of the pattern that matched. + /// + /// The ID of a pattern is derived from the position in which it was + /// originally inserted into the corresponding regex engine. The first + /// pattern has identifier `0`, and each subsequent pattern is `1`, `2` and + /// so on. + #[inline] + pub fn pattern(&self) -> PatternID { + self.pattern + } + + /// The starting position of the match. + /// + /// This is a convenience routine for `Match::span().start`. + #[inline] + pub fn start(&self) -> usize { + self.span().start + } + + /// The ending position of the match. + /// + /// This is a convenience routine for `Match::span().end`. + #[inline] + pub fn end(&self) -> usize { + self.span().end + } + + /// Returns the match span as a range. + /// + /// This is a convenience routine for `Match::span().range()`. + #[inline] + pub fn range(&self) -> core::ops::Range<usize> { + self.span().range() + } + + /// Returns the span for this match. + #[inline] + pub fn span(&self) -> Span { + self.span + } + + /// Returns true when the span in this match is empty. + /// + /// An empty match can only be returned when the regex itself can match + /// the empty string. + #[inline] + pub fn is_empty(&self) -> bool { + self.span().is_empty() + } + + /// Returns the length of this match. + /// + /// This returns `0` in precisely the cases that `is_empty` returns `true`. + #[inline] + pub fn len(&self) -> usize { + self.span().len() + } +} + +/// A set of `PatternID`s. +/// +/// A set of pattern identifiers is useful for recording which patterns have +/// matched a particular haystack. A pattern set _only_ includes pattern +/// identifiers. It does not include offset information. +/// +/// # Example +/// +/// This shows basic usage of a set. +/// +/// ``` +/// use regex_automata::{PatternID, PatternSet}; +/// +/// let pid1 = PatternID::must(5); +/// let pid2 = PatternID::must(8); +/// // Create a new empty set. +/// let mut set = PatternSet::new(10); +/// // Insert pattern IDs. +/// set.insert(pid1); +/// set.insert(pid2); +/// // Test membership. +/// assert!(set.contains(pid1)); +/// assert!(set.contains(pid2)); +/// // Get all members. +/// assert_eq!( +/// vec![5, 8], +/// set.iter().map(|p| p.as_usize()).collect::<Vec<usize>>(), +/// ); +/// // Clear the set. +/// set.clear(); +/// // Test that it is indeed empty. +/// assert!(set.is_empty()); +/// ``` +#[cfg(feature = "alloc")] +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct PatternSet { + /// The number of patterns set to 'true' in this set. + len: usize, + /// A map from PatternID to boolean of whether a pattern matches or not. + /// + /// This should probably be a bitset, but it's probably unlikely to matter + /// much in practice. + /// + /// The main downside of this representation (and similarly for a bitset) + /// is that iteration scales with the capacity of the set instead of + /// the length of the set. This doesn't seem likely to be a problem in + /// practice. + /// + /// Another alternative is to just use a 'SparseSet' for this. It does use + /// more memory (quite a bit more), but that seems fine I think compared + /// to the memory being used by the regex engine. The real hiccup with + /// it is that it yields pattern IDs in the order they were inserted. + /// Which is actually kind of nice, but at the time of writing, pattern + /// IDs are yielded in ascending order in the regex crate RegexSet API. + /// If we did change to 'SparseSet', we could provide an additional + /// 'iter_match_order' iterator, but keep the ascending order one for + /// compatibility. + which: alloc::boxed::Box<[bool]>, +} + +#[cfg(feature = "alloc")] +impl PatternSet { + /// Create a new set of pattern identifiers with the given capacity. + /// + /// The given capacity typically corresponds to (at least) the number of + /// patterns in a compiled regex object. + /// + /// # Panics + /// + /// This panics if the given capacity exceeds [`PatternID::LIMIT`]. This is + /// impossible if you use the `pattern_len()` method as defined on any of + /// the regex engines in this crate. Namely, a regex will fail to build by + /// returning an error if the number of patterns given to it exceeds the + /// limit. Therefore, the number of patterns in a valid regex is always + /// a correct capacity to provide here. + pub fn new(capacity: usize) -> PatternSet { + assert!( + capacity <= PatternID::LIMIT, + "pattern set capacity exceeds limit of {}", + PatternID::LIMIT, + ); + PatternSet { + len: 0, + which: alloc::vec![false; capacity].into_boxed_slice(), + } + } + + /// Clear this set such that it contains no pattern IDs. + pub fn clear(&mut self) { + self.len = 0; + for matched in self.which.iter_mut() { + *matched = false; + } + } + + /// Return true if and only if the given pattern identifier is in this set. + pub fn contains(&self, pid: PatternID) -> bool { + pid.as_usize() < self.capacity() && self.which[pid] + } + + /// Insert the given pattern identifier into this set and return `true` if + /// the given pattern ID was not previously in this set. + /// + /// If the pattern identifier is already in this set, then this is a no-op. + /// + /// Use [`PatternSet::try_insert`] for a fallible version of this routine. + /// + /// # Panics + /// + /// This panics if this pattern set has insufficient capacity to + /// store the given pattern ID. + pub fn insert(&mut self, pid: PatternID) -> bool { + self.try_insert(pid) + .expect("PatternSet should have sufficient capacity") + } + + /// Insert the given pattern identifier into this set and return `true` if + /// the given pattern ID was not previously in this set. + /// + /// If the pattern identifier is already in this set, then this is a no-op. + /// + /// # Errors + /// + /// This returns an error if this pattern set has insufficient capacity to + /// store the given pattern ID. + pub fn try_insert( + &mut self, + pid: PatternID, + ) -> Result<bool, PatternSetInsertError> { + if pid.as_usize() >= self.capacity() { + return Err(PatternSetInsertError { + attempted: pid, + capacity: self.capacity(), + }); + } + if self.which[pid] { + return Ok(false); + } + self.len += 1; + self.which[pid] = true; + Ok(true) + } + + /* + // This is currently commented out because it is unused and it is unclear + // whether it's useful or not. What's the harm in having it? When, if + // we ever wanted to change our representation to a 'SparseSet', then + // supporting this method would be a bit tricky. So in order to keep some + // API evolution flexibility, we leave it out for now. + + /// Remove the given pattern identifier from this set. + /// + /// If the pattern identifier was not previously in this set, then this + /// does not change the set and returns `false`. + /// + /// # Panics + /// + /// This panics if `pid` exceeds the capacity of this set. + pub fn remove(&mut self, pid: PatternID) -> bool { + if !self.which[pid] { + return false; + } + self.len -= 1; + self.which[pid] = false; + true + } + */ + + /// Return true if and only if this set has no pattern identifiers in it. + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Return true if and only if this set has the maximum number of pattern + /// identifiers in the set. This occurs precisely when `PatternSet::len() + /// == PatternSet::capacity()`. + /// + /// This particular property is useful to test because it may allow one to + /// stop a search earlier than you might otherwise. Namely, if a search is + /// only reporting which patterns match a haystack and if you know all of + /// the patterns match at a given point, then there's no new information + /// that can be learned by continuing the search. (Because a pattern set + /// does not keep track of offset information.) + pub fn is_full(&self) -> bool { + self.len() == self.capacity() + } + + /// Returns the total number of pattern identifiers in this set. + pub fn len(&self) -> usize { + self.len + } + + /// Returns the total number of pattern identifiers that may be stored + /// in this set. + /// + /// This is guaranteed to be less than or equal to [`PatternID::LIMIT`]. + /// + /// Typically, the capacity of a pattern set matches the number of patterns + /// in a regex object with which you are searching. + pub fn capacity(&self) -> usize { + self.which.len() + } + + /// Returns an iterator over all pattern identifiers in this set. + /// + /// The iterator yields pattern identifiers in ascending order, starting + /// at zero. + pub fn iter(&self) -> PatternSetIter<'_> { + PatternSetIter { it: self.which.iter().enumerate() } + } +} + +/// An error that occurs when a `PatternID` failed to insert into a +/// `PatternSet`. +/// +/// An insert fails when the given `PatternID` exceeds the configured capacity +/// of the `PatternSet`. +/// +/// This error is created by the [`PatternSet::try_insert`] routine. +#[cfg(feature = "alloc")] +#[derive(Clone, Debug)] +pub struct PatternSetInsertError { + attempted: PatternID, + capacity: usize, +} + +#[cfg(feature = "std")] +impl std::error::Error for PatternSetInsertError {} + +#[cfg(feature = "alloc")] +impl core::fmt::Display for PatternSetInsertError { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + write!( + f, + "failed to insert pattern ID {} into pattern set \ + with insufficiet capacity of {}", + self.attempted.as_usize(), + self.capacity, + ) + } +} + +/// An iterator over all pattern identifiers in a [`PatternSet`]. +/// +/// The lifetime parameter `'a` refers to the lifetime of the pattern set being +/// iterated over. +/// +/// This iterator is created by the [`PatternSet::iter`] method. +#[cfg(feature = "alloc")] +#[derive(Clone, Debug)] +pub struct PatternSetIter<'a> { + it: core::iter::Enumerate<core::slice::Iter<'a, bool>>, +} + +#[cfg(feature = "alloc")] +impl<'a> Iterator for PatternSetIter<'a> { + type Item = PatternID; + + fn next(&mut self) -> Option<PatternID> { + while let Some((index, &yes)) = self.it.next() { + if yes { + // Only valid 'PatternID' values can be inserted into the set + // and construction of the set panics if the capacity would + // permit storing invalid pattern IDs. Thus, 'yes' is only true + // precisely when 'index' corresponds to a valid 'PatternID'. + return Some(PatternID::new_unchecked(index)); + } + } + None + } + + fn size_hint(&self) -> (usize, Option<usize>) { + self.it.size_hint() + } +} + +#[cfg(feature = "alloc")] +impl<'a> DoubleEndedIterator for PatternSetIter<'a> { + fn next_back(&mut self) -> Option<PatternID> { + while let Some((index, &yes)) = self.it.next_back() { + if yes { + // Only valid 'PatternID' values can be inserted into the set + // and construction of the set panics if the capacity would + // permit storing invalid pattern IDs. Thus, 'yes' is only true + // precisely when 'index' corresponds to a valid 'PatternID'. + return Some(PatternID::new_unchecked(index)); + } + } + None + } +} + +/// The type of anchored search to perform. +/// +/// This is *almost* a boolean option. That is, you can either do an unanchored +/// search for any pattern in a regex, or you can do an anchored search for any +/// pattern in a regex. +/// +/// A third option exists that, assuming the regex engine supports it, permits +/// you to do an anchored search for a specific pattern. +/// +/// Note that there is no way to run an unanchored search for a specific +/// pattern. If you need that, you'll need to build separate regexes for each +/// pattern. +/// +/// # Errors +/// +/// If a regex engine does not support the anchored mode selected, then the +/// regex engine will return an error. While any non-trivial regex engine +/// should support at least one of the available anchored modes, there is no +/// singular mode that is guaranteed to be universally supported. Some regex +/// engines might only support unanchored searches (DFAs compiled without +/// anchored starting states) and some regex engines might only support +/// anchored searches (like the one-pass DFA). +/// +/// The specific error returned is a [`MatchError`] with a +/// [`MatchErrorKind::UnsupportedAnchored`] kind. The kind includes the +/// `Anchored` value given that is unsupported. +/// +/// Note that regex engines should report "no match" if, for example, an +/// `Anchored::Pattern` is provided with an invalid pattern ID _but_ where +/// anchored searches for a specific pattern are supported. This is smooths out +/// behavior such that it's possible to guarantee that an error never occurs +/// based on how the regex engine is configured. All regex engines in this +/// crate report "no match" when searching for an invalid pattern ID, but where +/// searching for a valid pattern ID is otherwise supported. +/// +/// # Example +/// +/// This example shows how to use the various `Anchored` modes to run a +/// search. We use the [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM) +/// because it supports all modes unconditionally. Some regex engines, like +/// the [`onepass::DFA`](crate::dfa::onepass::DFA) cannot support unanchored +/// searches. +/// +/// ``` +/// # if cfg!(miri) { return Ok(()); } // miri takes too long +/// use regex_automata::{ +/// nfa::thompson::pikevm::PikeVM, +/// Anchored, Input, Match, PatternID, +/// }; +/// +/// let re = PikeVM::new_many(&[ +/// r"Mrs. \w+", +/// r"Miss \w+", +/// r"Mr. \w+", +/// r"Ms. \w+", +/// ])?; +/// let mut cache = re.create_cache(); +/// let hay = "Hello Mr. Springsteen!"; +/// +/// // The default is to do an unanchored search. +/// assert_eq!(Some(Match::must(2, 6..21)), re.find(&mut cache, hay)); +/// // Explicitly ask for an unanchored search. Same as above. +/// let input = Input::new(hay).anchored(Anchored::No); +/// assert_eq!(Some(Match::must(2, 6..21)), re.find(&mut cache, hay)); +/// +/// // Now try an anchored search. Since the match doesn't start at the +/// // beginning of the haystack, no match is found! +/// let input = Input::new(hay).anchored(Anchored::Yes); +/// assert_eq!(None, re.find(&mut cache, input)); +/// +/// // We can try an anchored search again, but move the location of where +/// // we start the search. Note that the offsets reported are still in +/// // terms of the overall haystack and not relative to where we started +/// // the search. +/// let input = Input::new(hay).anchored(Anchored::Yes).range(6..); +/// assert_eq!(Some(Match::must(2, 6..21)), re.find(&mut cache, input)); +/// +/// // Now try an anchored search for a specific pattern. We specifically +/// // choose a pattern that we know doesn't match to prove that the search +/// // only looks for the pattern we provide. +/// let input = Input::new(hay) +/// .anchored(Anchored::Pattern(PatternID::must(1))) +/// .range(6..); +/// assert_eq!(None, re.find(&mut cache, input)); +/// +/// // But if we switch it to the pattern that we know matches, then we find +/// // the match. +/// let input = Input::new(hay) +/// .anchored(Anchored::Pattern(PatternID::must(2))) +/// .range(6..); +/// assert_eq!(Some(Match::must(2, 6..21)), re.find(&mut cache, input)); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum Anchored { + /// Run an unanchored search. This means a match may occur anywhere at or + /// after the start position of the search. + /// + /// This search can return a match for any pattern in the regex. + No, + /// Run an anchored search. This means that a match must begin at the + /// start position of the search. + /// + /// This search can return a match for any pattern in the regex. + Yes, + /// Run an anchored search for a specific pattern. This means that a match + /// must be for the given pattern and must begin at the start position of + /// the search. + Pattern(PatternID), +} + +impl Anchored { + /// Returns true if and only if this anchor mode corresponds to any kind of + /// anchored search. + /// + /// # Example + /// + /// This examples shows that both `Anchored::Yes` and `Anchored::Pattern` + /// are considered anchored searches. + /// + /// ``` + /// use regex_automata::{Anchored, PatternID}; + /// + /// assert!(!Anchored::No.is_anchored()); + /// assert!(Anchored::Yes.is_anchored()); + /// assert!(Anchored::Pattern(PatternID::ZERO).is_anchored()); + /// ``` + #[inline] + pub fn is_anchored(&self) -> bool { + matches!(*self, Anchored::Yes | Anchored::Pattern(_)) + } + + /// Returns the pattern ID associated with this configuration if it is an + /// anchored search for a specific pattern. Otherwise `None` is returned. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{Anchored, PatternID}; + /// + /// assert_eq!(None, Anchored::No.pattern()); + /// assert_eq!(None, Anchored::Yes.pattern()); + /// + /// let pid = PatternID::must(5); + /// assert_eq!(Some(pid), Anchored::Pattern(pid).pattern()); + /// ``` + #[inline] + pub fn pattern(&self) -> Option<PatternID> { + match *self { + Anchored::Pattern(pid) => Some(pid), + _ => None, + } + } +} + +/// The kind of match semantics to use for a regex pattern. +/// +/// The default match kind is `LeftmostFirst`, and this corresponds to the +/// match semantics used by most backtracking engines, such as Perl. +/// +/// # Leftmost first or "preference order" match semantics +/// +/// Leftmost-first semantics determine which match to report when there are +/// multiple paths through a regex that match at the same position. The tie is +/// essentially broken by how a backtracker would behave. For example, consider +/// running the regex `foofoofoo|foofoo|foo` on the haystack `foofoo`. In this +/// case, both the `foofoo` and `foo` branches match at position `0`. So should +/// the end of the match be `3` or `6`? +/// +/// A backtracker will conceptually work by trying `foofoofoo` and failing. +/// Then it will try `foofoo`, find the match and stop there. Thus, the +/// leftmost-first match position is `6`. This is called "leftmost-first" or +/// "preference order" because the order of the branches as written in the +/// regex pattern is what determines how to break the tie. +/// +/// (Note that leftmost-longest match semantics, which break ties by always +/// taking the longest matching string, are not currently supported by this +/// crate. These match semantics tend to be found in POSIX regex engines.) +/// +/// This example shows how leftmost-first semantics work, and how it even +/// applies to multi-pattern regexes: +/// +/// ``` +/// use regex_automata::{ +/// nfa::thompson::pikevm::PikeVM, +/// Match, +/// }; +/// +/// let re = PikeVM::new_many(&[ +/// r"foofoofoo", +/// r"foofoo", +/// r"foo", +/// ])?; +/// let mut cache = re.create_cache(); +/// let got: Vec<Match> = re.find_iter(&mut cache, "foofoo").collect(); +/// let expected = vec![Match::must(1, 0..6)]; +/// assert_eq!(expected, got); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +/// +/// # All matches +/// +/// The `All` match semantics report any and all matches, and generally will +/// attempt to match as much as possible. It doesn't respect any sort of match +/// priority at all, so things like non-greedy matching don't work in this +/// mode. +/// +/// The fact that non-greedy matching doesn't work generally makes most forms +/// of unanchored non-overlapping searches have unintuitive behavior. Namely, +/// unanchored searches behave as if there is a `(?s-u:.)*?` prefix at the +/// beginning of the pattern, which is specifically non-greedy. Since it will +/// be treated as greedy in `All` match semantics, this generally means that +/// it will first attempt to consume all of the haystack and is likely to wind +/// up skipping matches. +/// +/// Generally speaking, `All` should only be used in two circumstances: +/// +/// * When running an anchored search and there is a desire to match as much as +/// possible. For example, when building a reverse regex matcher to find the +/// start of a match after finding the end. In this case, the reverse search +/// is anchored to the end of the match found by the forward search. +/// * When running overlapping searches. Since `All` encodes all possible +/// matches, this is generally what you want for an overlapping search. If you +/// try to use leftmost-first in an overlapping search, it is likely to produce +/// counter-intuitive results since leftmost-first specifically excludes some +/// matches from its underlying finite state machine. +/// +/// This example demonstrates the counter-intuitive behavior of `All` semantics +/// when using a standard leftmost unanchored search: +/// +/// ``` +/// use regex_automata::{ +/// nfa::thompson::pikevm::PikeVM, +/// Match, MatchKind, +/// }; +/// +/// let re = PikeVM::builder() +/// .configure(PikeVM::config().match_kind(MatchKind::All)) +/// .build("foo")?; +/// let hay = "first foo second foo wat"; +/// let mut cache = re.create_cache(); +/// let got: Vec<Match> = re.find_iter(&mut cache, hay).collect(); +/// // Notice that it completely skips the first 'foo'! +/// let expected = vec![Match::must(0, 17..20)]; +/// assert_eq!(expected, got); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +/// +/// This second example shows how `All` semantics are useful for an overlapping +/// search. Note that we use lower level lazy DFA APIs here since the NFA +/// engines only currently support a very limited form of overlapping search. +/// +/// ``` +/// use regex_automata::{ +/// hybrid::dfa::{DFA, OverlappingState}, +/// HalfMatch, Input, MatchKind, +/// }; +/// +/// let re = DFA::builder() +/// // If we didn't set 'All' semantics here, then the regex would only +/// // match 'foo' at offset 3 and nothing else. Why? Because the state +/// // machine implements preference order and knows that the 'foofoo' and +/// // 'foofoofoo' branches can never match since 'foo' will always match +/// // when they match and take priority. +/// .configure(DFA::config().match_kind(MatchKind::All)) +/// .build(r"foo|foofoo|foofoofoo")?; +/// let mut cache = re.create_cache(); +/// let mut state = OverlappingState::start(); +/// let input = Input::new("foofoofoo"); +/// let mut got = vec![]; +/// loop { +/// re.try_search_overlapping_fwd(&mut cache, &input, &mut state)?; +/// let m = match state.get_match() { +/// None => break, +/// Some(m) => m, +/// }; +/// got.push(m); +/// } +/// let expected = vec![ +/// HalfMatch::must(0, 3), +/// HalfMatch::must(0, 6), +/// HalfMatch::must(0, 9), +/// ]; +/// assert_eq!(expected, got); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +#[non_exhaustive] +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum MatchKind { + /// Report all possible matches. + All, + /// Report only the leftmost matches. When multiple leftmost matches exist, + /// report the match corresponding to the part of the regex that appears + /// first in the syntax. + LeftmostFirst, + // There is prior art in RE2 that shows that we should be able to add + // LeftmostLongest too. The tricky part of it is supporting ungreedy + // repetitions. Instead of treating all NFA states as having equivalent + // priority (as in 'All') or treating all NFA states as having distinct + // priority based on order (as in 'LeftmostFirst'), we instead group NFA + // states into sets, and treat members of each set as having equivalent + // priority, but having greater priority than all following members + // of different sets. + // + // However, it's not clear whether it's really worth adding this. After + // all, leftmost-longest can be emulated when using literals by using + // leftmost-first and sorting the literals by length in descending order. + // However, this won't work for arbitrary regexes. e.g., `\w|\w\w` will + // always match `a` in `ab` when using leftmost-first, but leftmost-longest + // would match `ab`. +} + +impl MatchKind { + #[cfg(feature = "alloc")] + pub(crate) fn continue_past_first_match(&self) -> bool { + *self == MatchKind::All + } +} + +impl Default for MatchKind { + fn default() -> MatchKind { + MatchKind::LeftmostFirst + } +} + +/// An error indicating that a search stopped before reporting whether a +/// match exists or not. +/// +/// To be very clear, this error type implies that one cannot assume that no +/// matches occur, since the search stopped before completing. That is, if +/// you're looking for information about where a search determined that no +/// match can occur, then this error type does *not* give you that. (Indeed, at +/// the time of writing, if you need such a thing, you have to write your own +/// search routine.) +/// +/// Normally, when one searches for something, the response is either an +/// affirmative "it was found at this location" or a negative "not found at +/// all." However, in some cases, a regex engine can be configured to stop its +/// search before concluding whether a match exists or not. When this happens, +/// it may be important for the caller to know why the regex engine gave up and +/// where in the input it gave up at. This error type exposes the 'why' and the +/// 'where.' +/// +/// For example, the DFAs provided by this library generally cannot correctly +/// implement Unicode word boundaries. Instead, they provide an option to +/// eagerly support them on ASCII text (since Unicode word boundaries are +/// equivalent to ASCII word boundaries when searching ASCII text), but will +/// "give up" if a non-ASCII byte is seen. In such cases, one is usually +/// required to either report the failure to the caller (unergonomic) or +/// otherwise fall back to some other regex engine (ergonomic, but potentially +/// costly). +/// +/// More generally, some regex engines offer the ability for callers to specify +/// certain bytes that will trigger the regex engine to automatically quit if +/// they are seen. +/// +/// Still yet, there may be other reasons for a failed match. For example, +/// the hybrid DFA provided by this crate can be configured to give up if it +/// believes that it is not efficient. This in turn permits callers to choose a +/// different regex engine. +/// +/// (Note that DFAs are configured by default to never quit or give up in this +/// fashion. For example, by default, a DFA will fail to build if the regex +/// pattern contains a Unicode word boundary. One needs to opt into the "quit" +/// behavior via options, like +/// [`hybrid::dfa::Config::unicode_word_boundary`](crate::hybrid::dfa::Config::unicode_word_boundary).) +/// +/// There are a couple other ways a search +/// can fail. For example, when using the +/// [`BoundedBacktracker`](crate::nfa::thompson::backtrack::BoundedBacktracker) +/// with a haystack that is too long, or trying to run an unanchored search +/// with a [one-pass DFA](crate::dfa::onepass). +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct MatchError( + #[cfg(feature = "alloc")] alloc::boxed::Box<MatchErrorKind>, + #[cfg(not(feature = "alloc"))] MatchErrorKind, +); + +impl MatchError { + /// Create a new error value with the given kind. + /// + /// This is a more verbose version of the kind-specific constructors, + /// e.g., `MatchError::quit`. + pub fn new(kind: MatchErrorKind) -> MatchError { + #[cfg(feature = "alloc")] + { + MatchError(alloc::boxed::Box::new(kind)) + } + #[cfg(not(feature = "alloc"))] + { + MatchError(kind) + } + } + + /// Returns a reference to the underlying error kind. + pub fn kind(&self) -> &MatchErrorKind { + &self.0 + } + + /// Create a new "quit" error. The given `byte` corresponds to the value + /// that tripped a search's quit condition, and `offset` corresponds to the + /// location in the haystack at which the search quit. + /// + /// This is the same as calling `MatchError::new` with a + /// [`MatchErrorKind::Quit`] kind. + pub fn quit(byte: u8, offset: usize) -> MatchError { + MatchError::new(MatchErrorKind::Quit { byte, offset }) + } + + /// Create a new "gave up" error. The given `offset` corresponds to the + /// location in the haystack at which the search gave up. + /// + /// This is the same as calling `MatchError::new` with a + /// [`MatchErrorKind::GaveUp`] kind. + pub fn gave_up(offset: usize) -> MatchError { + MatchError::new(MatchErrorKind::GaveUp { offset }) + } + + /// Create a new "haystack too long" error. The given `len` corresponds to + /// the length of the haystack that was problematic. + /// + /// This is the same as calling `MatchError::new` with a + /// [`MatchErrorKind::HaystackTooLong`] kind. + pub fn haystack_too_long(len: usize) -> MatchError { + MatchError::new(MatchErrorKind::HaystackTooLong { len }) + } + + /// Create a new "unsupported anchored" error. This occurs when the caller + /// requests a search with an anchor mode that is not supported by the + /// regex engine. + /// + /// This is the same as calling `MatchError::new` with a + /// [`MatchErrorKind::UnsupportedAnchored`] kind. + pub fn unsupported_anchored(mode: Anchored) -> MatchError { + MatchError::new(MatchErrorKind::UnsupportedAnchored { mode }) + } +} + +/// The underlying kind of a [`MatchError`]. +/// +/// This is a **non-exhaustive** enum. That means new variants may be added in +/// a semver-compatible release. +#[non_exhaustive] +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum MatchErrorKind { + /// The search saw a "quit" byte at which it was instructed to stop + /// searching. + Quit { + /// The "quit" byte that was observed that caused the search to stop. + byte: u8, + /// The offset at which the quit byte was observed. + offset: usize, + }, + /// The search, based on heuristics, determined that it would be better + /// to stop, typically to provide the caller an opportunity to use an + /// alternative regex engine. + /// + /// Currently, the only way for this to occur is via the lazy DFA and + /// only when it is configured to do so (it will not return this error by + /// default). + GaveUp { + /// The offset at which the search stopped. This corresponds to the + /// position immediately following the last byte scanned. + offset: usize, + }, + /// This error occurs if the haystack given to the regex engine was too + /// long to be searched. This occurs, for example, with regex engines + /// like the bounded backtracker that have a configurable fixed amount of + /// capacity that is tied to the length of the haystack. Anything beyond + /// that configured limit will result in an error at search time. + HaystackTooLong { + /// The length of the haystack that exceeded the limit. + len: usize, + }, + /// An error indicating that a particular type of anchored search was + /// requested, but that the regex engine does not support it. + /// + /// Note that this error should not be returned by a regex engine simply + /// because the pattern ID is invalid (i.e., equal to or exceeds the number + /// of patterns in the regex). In that case, the regex engine should report + /// a non-match. + UnsupportedAnchored { + /// The anchored mode given that is unsupported. + mode: Anchored, + }, +} + +#[cfg(feature = "std")] +impl std::error::Error for MatchError {} + +impl core::fmt::Display for MatchError { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + match *self.kind() { + MatchErrorKind::Quit { byte, offset } => write!( + f, + "quit search after observing byte {:?} at offset {}", + DebugByte(byte), + offset, + ), + MatchErrorKind::GaveUp { offset } => { + write!(f, "gave up searching at offset {}", offset) + } + MatchErrorKind::HaystackTooLong { len } => { + write!(f, "haystack of length {} is too long", len) + } + MatchErrorKind::UnsupportedAnchored { mode: Anchored::Yes } => { + write!(f, "anchored searches are not supported or enabled") + } + MatchErrorKind::UnsupportedAnchored { mode: Anchored::No } => { + write!(f, "unanchored searches are not supported or enabled") + } + MatchErrorKind::UnsupportedAnchored { + mode: Anchored::Pattern(pid), + } => { + write!( + f, + "anchored searches for a specific pattern ({}) are \ + not supported or enabled", + pid.as_usize(), + ) + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + // We test that our 'MatchError' type is the size we expect. This isn't an + // API guarantee, but if the size increases, we really want to make sure we + // decide to do that intentionally. So this should be a speed bump. And in + // general, we should not increase the size without a very good reason. + // + // Why? Because low level search APIs return Result<.., MatchError>. When + // MatchError gets bigger, so to does the Result type. + // + // Now, when 'alloc' is enabled, we do box the error, which de-emphasizes + // the importance of keeping a small error type. But without 'alloc', we + // still want things to be small. + #[test] + fn match_error_size() { + let expected_size = if cfg!(feature = "alloc") { + core::mem::size_of::<usize>() + } else { + 2 * core::mem::size_of::<usize>() + }; + assert_eq!(expected_size, core::mem::size_of::<MatchError>()); + } + + // Same as above, but for the underlying match error kind. + #[cfg(target_pointer_width = "64")] + #[test] + fn match_error_kind_size() { + let expected_size = 2 * core::mem::size_of::<usize>(); + assert_eq!(expected_size, core::mem::size_of::<MatchErrorKind>()); + } + + #[cfg(target_pointer_width = "32")] + #[test] + fn match_error_kind_size() { + let expected_size = 3 * core::mem::size_of::<usize>(); + assert_eq!(expected_size, core::mem::size_of::<MatchErrorKind>()); + } +} diff --git a/third_party/rust/regex-automata/src/util/sparse_set.rs b/third_party/rust/regex-automata/src/util/sparse_set.rs new file mode 100644 index 0000000000..cbaa0b6f4f --- /dev/null +++ b/third_party/rust/regex-automata/src/util/sparse_set.rs @@ -0,0 +1,239 @@ +/*! +This module defines a sparse set data structure. Its most interesting +properties are: + +* They preserve insertion order. +* Set membership testing is done in constant time. +* Set insertion is done in constant time. +* Clearing the set is done in constant time. + +The cost for doing this is that the capacity of the set needs to be known up +front, and the elements in the set are limited to state identifiers. + +These sets are principally used when traversing an NFA state graph. This +happens at search time, for example, in the PikeVM. It also happens during DFA +determinization. +*/ + +use alloc::{vec, vec::Vec}; + +use crate::util::primitives::StateID; + +/// A pairse of sparse sets. +/// +/// This is useful when one needs to compute NFA epsilon closures from a +/// previous set of states derived from an epsilon closure. One set can be the +/// starting states where as the other set can be the destination states after +/// following the transitions for a particular byte of input. +/// +/// There is no significance to 'set1' or 'set2'. They are both sparse sets of +/// the same size. +/// +/// The members of this struct are exposed so that callers may borrow 'set1' +/// and 'set2' individually without being force to borrow both at the same +/// time. +#[derive(Clone, Debug)] +pub(crate) struct SparseSets { + pub(crate) set1: SparseSet, + pub(crate) set2: SparseSet, +} + +impl SparseSets { + /// Create a new pair of sparse sets where each set has the given capacity. + /// + /// This panics if the capacity given is bigger than `StateID::LIMIT`. + pub(crate) fn new(capacity: usize) -> SparseSets { + SparseSets { + set1: SparseSet::new(capacity), + set2: SparseSet::new(capacity), + } + } + + /// Resizes these sparse sets to have the new capacity given. + /// + /// The sets are automatically cleared. + /// + /// This panics if the capacity given is bigger than `StateID::LIMIT`. + #[inline] + pub(crate) fn resize(&mut self, new_capacity: usize) { + self.set1.resize(new_capacity); + self.set2.resize(new_capacity); + } + + /// Clear both sparse sets. + pub(crate) fn clear(&mut self) { + self.set1.clear(); + self.set2.clear(); + } + + /// Swap set1 with set2. + pub(crate) fn swap(&mut self) { + core::mem::swap(&mut self.set1, &mut self.set2); + } + + /// Returns the memory usage, in bytes, used by this pair of sparse sets. + pub(crate) fn memory_usage(&self) -> usize { + self.set1.memory_usage() + self.set2.memory_usage() + } +} + +/// A sparse set used for representing ordered NFA states. +/// +/// This supports constant time addition and membership testing. Clearing an +/// entire set can also be done in constant time. Iteration yields elements +/// in the order in which they were inserted. +/// +/// The data structure is based on: https://research.swtch.com/sparse +/// Note though that we don't actually use uninitialized memory. We generally +/// reuse sparse sets, so the initial allocation cost is bareable. However, its +/// other properties listed above are extremely useful. +#[derive(Clone)] +pub(crate) struct SparseSet { + /// The number of elements currently in this set. + len: usize, + /// Dense contains the ids in the order in which they were inserted. + dense: Vec<StateID>, + /// Sparse maps ids to their location in dense. + /// + /// A state ID is in the set if and only if + /// sparse[id] < len && id == dense[sparse[id]]. + /// + /// Note that these are indices into 'dense'. It's a little weird to use + /// StateID here, but we know our length can never exceed the bounds of + /// StateID (enforced by 'resize') and StateID will be at most 4 bytes + /// where as a usize is likely double that in most cases. + sparse: Vec<StateID>, +} + +impl SparseSet { + /// Create a new sparse set with the given capacity. + /// + /// Sparse sets have a fixed size and they cannot grow. Attempting to + /// insert more distinct elements than the total capacity of the set will + /// result in a panic. + /// + /// This panics if the capacity given is bigger than `StateID::LIMIT`. + #[inline] + pub(crate) fn new(capacity: usize) -> SparseSet { + let mut set = SparseSet { len: 0, dense: vec![], sparse: vec![] }; + set.resize(capacity); + set + } + + /// Resizes this sparse set to have the new capacity given. + /// + /// This set is automatically cleared. + /// + /// This panics if the capacity given is bigger than `StateID::LIMIT`. + #[inline] + pub(crate) fn resize(&mut self, new_capacity: usize) { + assert!( + new_capacity <= StateID::LIMIT, + "sparse set capacity cannot excced {:?}", + StateID::LIMIT + ); + self.clear(); + self.dense.resize(new_capacity, StateID::ZERO); + self.sparse.resize(new_capacity, StateID::ZERO); + } + + /// Returns the capacity of this set. + /// + /// The capacity represents a fixed limit on the number of distinct + /// elements that are allowed in this set. The capacity cannot be changed. + #[inline] + pub(crate) fn capacity(&self) -> usize { + self.dense.len() + } + + /// Returns the number of elements in this set. + #[inline] + pub(crate) fn len(&self) -> usize { + self.len + } + + /// Returns true if and only if this set is empty. + #[inline] + pub(crate) fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Insert the state ID value into this set and return true if the given + /// state ID was not previously in this set. + /// + /// This operation is idempotent. If the given value is already in this + /// set, then this is a no-op. + /// + /// If more than `capacity` ids are inserted, then this panics. + /// + /// This is marked as inline(always) since the compiler won't inline it + /// otherwise, and it's a fairly hot piece of code in DFA determinization. + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn insert(&mut self, id: StateID) -> bool { + if self.contains(id) { + return false; + } + + let i = self.len(); + assert!( + i < self.capacity(), + "{:?} exceeds capacity of {:?} when inserting {:?}", + i, + self.capacity(), + id, + ); + // OK since i < self.capacity() and self.capacity() is guaranteed to + // be <= StateID::LIMIT. + let index = StateID::new_unchecked(i); + self.dense[index] = id; + self.sparse[id] = index; + self.len += 1; + true + } + + /// Returns true if and only if this set contains the given value. + #[inline] + pub(crate) fn contains(&self, id: StateID) -> bool { + let index = self.sparse[id]; + index.as_usize() < self.len() && self.dense[index] == id + } + + /// Clear this set such that it has no members. + #[inline] + pub(crate) fn clear(&mut self) { + self.len = 0; + } + + #[inline] + pub(crate) fn iter(&self) -> SparseSetIter<'_> { + SparseSetIter(self.dense[..self.len()].iter()) + } + + /// Returns the heap memory usage, in bytes, used by this sparse set. + #[inline] + pub(crate) fn memory_usage(&self) -> usize { + self.dense.len() * StateID::SIZE + self.sparse.len() * StateID::SIZE + } +} + +impl core::fmt::Debug for SparseSet { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + let elements: Vec<StateID> = self.iter().collect(); + f.debug_tuple("SparseSet").field(&elements).finish() + } +} + +/// An iterator over all elements in a sparse set. +/// +/// The lifetime `'a` refers to the lifetime of the set being iterated over. +#[derive(Debug)] +pub(crate) struct SparseSetIter<'a>(core::slice::Iter<'a, StateID>); + +impl<'a> Iterator for SparseSetIter<'a> { + type Item = StateID; + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn next(&mut self) -> Option<StateID> { + self.0.next().map(|&id| id) + } +} diff --git a/third_party/rust/regex-automata/src/util/start.rs b/third_party/rust/regex-automata/src/util/start.rs new file mode 100644 index 0000000000..4e360d083a --- /dev/null +++ b/third_party/rust/regex-automata/src/util/start.rs @@ -0,0 +1,306 @@ +/*! +Provides some helpers for dealing with start state configurations in DFAs. + +[`Start`] represents the possible starting configurations, while +[`StartByteMap`] represents a way to retrieve the `Start` configuration for a +given position in a haystack. +*/ + +use crate::util::{ + look::LookMatcher, + search::Input, + wire::{self, DeserializeError, SerializeError}, +}; + +/// A map from every possible byte value to its corresponding starting +/// configuration. +/// +/// This map is used in order to lookup the start configuration for a particular +/// position in a haystack. This start configuration is then used in +/// combination with things like the anchored mode and pattern ID to fully +/// determine the start state. +/// +/// Generally speaking, this map is only used for fully compiled DFAs and lazy +/// DFAs. For NFAs (including the one-pass DFA), the start state is generally +/// selected by virtue of traversing the NFA state graph. DFAs do the same +/// thing, but at build time and not search time. (Well, technically the lazy +/// DFA does it at search time, but it does enough work to cache the full +/// result of the epsilon closure that the NFA engines tend to need to do.) +#[derive(Clone)] +pub(crate) struct StartByteMap { + map: [Start; 256], +} + +impl StartByteMap { + /// Create a new map from byte values to their corresponding starting + /// configurations. The map is determined, in part, by how look-around + /// assertions are matched via the matcher given. + pub(crate) fn new(lookm: &LookMatcher) -> StartByteMap { + let mut map = [Start::NonWordByte; 256]; + map[usize::from(b'\n')] = Start::LineLF; + map[usize::from(b'\r')] = Start::LineCR; + map[usize::from(b'_')] = Start::WordByte; + + let mut byte = b'0'; + while byte <= b'9' { + map[usize::from(byte)] = Start::WordByte; + byte += 1; + } + byte = b'A'; + while byte <= b'Z' { + map[usize::from(byte)] = Start::WordByte; + byte += 1; + } + byte = b'a'; + while byte <= b'z' { + map[usize::from(byte)] = Start::WordByte; + byte += 1; + } + + let lineterm = lookm.get_line_terminator(); + // If our line terminator is normal, then it is already handled by + // the LineLF and LineCR configurations. But if it's weird, then we + // overwrite whatever was there before for that terminator with a + // special configuration. The trick here is that if the terminator + // is, say, a word byte like `a`, then callers seeing this start + // configuration need to account for that and build their DFA state as + // if it *also* came from a word byte. + if lineterm != b'\r' && lineterm != b'\n' { + map[usize::from(lineterm)] = Start::CustomLineTerminator; + } + StartByteMap { map } + } + + /// Return the forward starting configuration for the given `input`. + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn fwd(&self, input: &Input) -> Start { + match input + .start() + .checked_sub(1) + .and_then(|i| input.haystack().get(i)) + { + None => Start::Text, + Some(&byte) => self.get(byte), + } + } + + /// Return the reverse starting configuration for the given `input`. + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn rev(&self, input: &Input) -> Start { + match input.haystack().get(input.end()) { + None => Start::Text, + Some(&byte) => self.get(byte), + } + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn get(&self, byte: u8) -> Start { + self.map[usize::from(byte)] + } + + /// Deserializes a byte class map from the given slice. If the slice is of + /// insufficient length or otherwise contains an impossible mapping, then + /// an error is returned. Upon success, the number of bytes read along with + /// the map are returned. The number of bytes read is always a multiple of + /// 8. + pub(crate) fn from_bytes( + slice: &[u8], + ) -> Result<(StartByteMap, usize), DeserializeError> { + wire::check_slice_len(slice, 256, "start byte map")?; + let mut map = [Start::NonWordByte; 256]; + for (i, &repr) in slice[..256].iter().enumerate() { + map[i] = match Start::from_usize(usize::from(repr)) { + Some(start) => start, + None => { + return Err(DeserializeError::generic( + "found invalid starting configuration", + )) + } + }; + } + Ok((StartByteMap { map }, 256)) + } + + /// Writes this map to the given byte buffer. if the given buffer is too + /// small, then an error is returned. Upon success, the total number of + /// bytes written is returned. The number of bytes written is guaranteed to + /// be a multiple of 8. + pub(crate) fn write_to( + &self, + dst: &mut [u8], + ) -> Result<usize, SerializeError> { + let nwrite = self.write_to_len(); + if dst.len() < nwrite { + return Err(SerializeError::buffer_too_small("start byte map")); + } + for (i, &start) in self.map.iter().enumerate() { + dst[i] = start.as_u8(); + } + Ok(nwrite) + } + + /// Returns the total number of bytes written by `write_to`. + pub(crate) fn write_to_len(&self) -> usize { + 256 + } +} + +impl core::fmt::Debug for StartByteMap { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + use crate::util::escape::DebugByte; + + write!(f, "StartByteMap{{")?; + for byte in 0..=255 { + if byte > 0 { + write!(f, ", ")?; + } + let start = self.map[usize::from(byte)]; + write!(f, "{:?} => {:?}", DebugByte(byte), start)?; + } + write!(f, "}}")?; + Ok(()) + } +} + +/// Represents the six possible starting configurations of a DFA search. +/// +/// The starting configuration is determined by inspecting the the beginning +/// of the haystack (up to 1 byte). Ultimately, this along with a pattern ID +/// (if specified) and the type of search (anchored or not) is what selects the +/// start state to use in a DFA. +/// +/// As one example, if a DFA only supports unanchored searches and does not +/// support anchored searches for each pattern, then it will have at most 6 +/// distinct start states. (Some start states may be reused if determinization +/// can determine that they will be equivalent.) If the DFA supports both +/// anchored and unanchored searches, then it will have a maximum of 12 +/// distinct start states. Finally, if the DFA also supports anchored searches +/// for each pattern, then it can have up to `12 + (N * 6)` start states, where +/// `N` is the number of patterns. +/// +/// Handling each of these starting configurations in the context of DFA +/// determinization can be *quite* tricky and subtle. But the code is small +/// and can be found at `crate::util::determinize::set_lookbehind_from_start`. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub(crate) enum Start { + /// This occurs when the starting position is not any of the ones below. + NonWordByte = 0, + /// This occurs when the byte immediately preceding the start of the search + /// is an ASCII word byte. + WordByte = 1, + /// This occurs when the starting position of the search corresponds to the + /// beginning of the haystack. + Text = 2, + /// This occurs when the byte immediately preceding the start of the search + /// is a line terminator. Specifically, `\n`. + LineLF = 3, + /// This occurs when the byte immediately preceding the start of the search + /// is a line terminator. Specifically, `\r`. + LineCR = 4, + /// This occurs when a custom line terminator has been set via a + /// `LookMatcher`, and when that line terminator is neither a `\r` or a + /// `\n`. + /// + /// If the custom line terminator is a word byte, then this start + /// configuration is still selected. DFAs that implement word boundary + /// assertions will likely need to check whether the custom line terminator + /// is a word byte, in which case, it should behave as if the byte + /// satisfies `\b` in addition to multi-line anchors. + CustomLineTerminator = 5, +} + +impl Start { + /// Return the starting state corresponding to the given integer. If no + /// starting state exists for the given integer, then None is returned. + pub(crate) fn from_usize(n: usize) -> Option<Start> { + match n { + 0 => Some(Start::NonWordByte), + 1 => Some(Start::WordByte), + 2 => Some(Start::Text), + 3 => Some(Start::LineLF), + 4 => Some(Start::LineCR), + 5 => Some(Start::CustomLineTerminator), + _ => None, + } + } + + /// Returns the total number of starting state configurations. + pub(crate) fn len() -> usize { + 6 + } + + /// Return this starting configuration as `u8` integer. It is guaranteed to + /// be less than `Start::len()`. + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn as_u8(&self) -> u8 { + // AFAIK, 'as' is the only way to zero-cost convert an int enum to an + // actual int. + *self as u8 + } + + /// Return this starting configuration as a `usize` integer. It is + /// guaranteed to be less than `Start::len()`. + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn as_usize(&self) -> usize { + usize::from(self.as_u8()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn start_fwd_done_range() { + let smap = StartByteMap::new(&LookMatcher::default()); + assert_eq!(Start::Text, smap.fwd(&Input::new("").range(1..0))); + } + + #[test] + fn start_rev_done_range() { + let smap = StartByteMap::new(&LookMatcher::default()); + assert_eq!(Start::Text, smap.rev(&Input::new("").range(1..0))); + } + + #[test] + fn start_fwd() { + let f = |haystack, start, end| { + let smap = StartByteMap::new(&LookMatcher::default()); + let input = &Input::new(haystack).range(start..end); + smap.fwd(input) + }; + + assert_eq!(Start::Text, f("", 0, 0)); + assert_eq!(Start::Text, f("abc", 0, 3)); + assert_eq!(Start::Text, f("\nabc", 0, 3)); + + assert_eq!(Start::LineLF, f("\nabc", 1, 3)); + + assert_eq!(Start::LineCR, f("\rabc", 1, 3)); + + assert_eq!(Start::WordByte, f("abc", 1, 3)); + + assert_eq!(Start::NonWordByte, f(" abc", 1, 3)); + } + + #[test] + fn start_rev() { + let f = |haystack, start, end| { + let smap = StartByteMap::new(&LookMatcher::default()); + let input = &Input::new(haystack).range(start..end); + smap.rev(input) + }; + + assert_eq!(Start::Text, f("", 0, 0)); + assert_eq!(Start::Text, f("abc", 0, 3)); + assert_eq!(Start::Text, f("abc\n", 0, 4)); + + assert_eq!(Start::LineLF, f("abc\nz", 0, 3)); + + assert_eq!(Start::LineCR, f("abc\rz", 0, 3)); + + assert_eq!(Start::WordByte, f("abc", 0, 2)); + + assert_eq!(Start::NonWordByte, f("abc ", 0, 3)); + } +} diff --git a/third_party/rust/regex-automata/src/util/syntax.rs b/third_party/rust/regex-automata/src/util/syntax.rs new file mode 100644 index 0000000000..78e3cf9a11 --- /dev/null +++ b/third_party/rust/regex-automata/src/util/syntax.rs @@ -0,0 +1,482 @@ +/*! +Utilities for dealing with the syntax of a regular expression. + +This module currently only exposes a [`Config`] type that +itself represents a wrapper around the configuration for a +[`regex-syntax::ParserBuilder`](regex_syntax::ParserBuilder). The purpose of +this wrapper is to make configuring syntax options very similar to how other +configuration is done throughout this crate. Namely, instead of duplicating +syntax options across every builder (of which there are many), we instead +create small config objects like this one that can be passed around and +composed. +*/ + +use alloc::{vec, vec::Vec}; + +use regex_syntax::{ + ast, + hir::{self, Hir}, + Error, ParserBuilder, +}; + +/// A convenience routine for parsing a pattern into an HIR value with the +/// default configuration. +/// +/// # Example +/// +/// This shows how to parse a pattern into an HIR value: +/// +/// ``` +/// use regex_automata::util::syntax; +/// +/// let hir = syntax::parse(r"([a-z]+)|([0-9]+)")?; +/// assert_eq!(Some(1), hir.properties().static_explicit_captures_len()); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +pub fn parse(pattern: &str) -> Result<Hir, Error> { + parse_with(pattern, &Config::default()) +} + +/// A convenience routine for parsing many patterns into HIR value with the +/// default configuration. +/// +/// # Example +/// +/// This shows how to parse many patterns into an corresponding HIR values: +/// +/// ``` +/// use { +/// regex_automata::util::syntax, +/// regex_syntax::hir::Properties, +/// }; +/// +/// let hirs = syntax::parse_many(&[ +/// r"([a-z]+)|([0-9]+)", +/// r"foo(A-Z]+)bar", +/// ])?; +/// let props = Properties::union(hirs.iter().map(|h| h.properties())); +/// assert_eq!(Some(1), props.static_explicit_captures_len()); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +pub fn parse_many<P: AsRef<str>>(patterns: &[P]) -> Result<Vec<Hir>, Error> { + parse_many_with(patterns, &Config::default()) +} + +/// A convenience routine for parsing a pattern into an HIR value using a +/// `Config`. +/// +/// # Example +/// +/// This shows how to parse a pattern into an HIR value with a non-default +/// configuration: +/// +/// ``` +/// use regex_automata::util::syntax; +/// +/// let hir = syntax::parse_with( +/// r"^[a-z]+$", +/// &syntax::Config::new().multi_line(true).crlf(true), +/// )?; +/// assert!(hir.properties().look_set().contains_anchor_crlf()); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +pub fn parse_with(pattern: &str, config: &Config) -> Result<Hir, Error> { + let mut builder = ParserBuilder::new(); + config.apply(&mut builder); + builder.build().parse(pattern) +} + +/// A convenience routine for parsing many patterns into HIR values using a +/// `Config`. +/// +/// # Example +/// +/// This shows how to parse many patterns into an corresponding HIR values +/// with a non-default configuration: +/// +/// ``` +/// use { +/// regex_automata::util::syntax, +/// regex_syntax::hir::Properties, +/// }; +/// +/// let patterns = &[ +/// r"([a-z]+)|([0-9]+)", +/// r"\W", +/// r"foo(A-Z]+)bar", +/// ]; +/// let config = syntax::Config::new().unicode(false).utf8(false); +/// let hirs = syntax::parse_many_with(patterns, &config)?; +/// let props = Properties::union(hirs.iter().map(|h| h.properties())); +/// assert!(!props.is_utf8()); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +pub fn parse_many_with<P: AsRef<str>>( + patterns: &[P], + config: &Config, +) -> Result<Vec<Hir>, Error> { + let mut builder = ParserBuilder::new(); + config.apply(&mut builder); + let mut hirs = vec![]; + for p in patterns.iter() { + hirs.push(builder.build().parse(p.as_ref())?); + } + Ok(hirs) +} + +/// A common set of configuration options that apply to the syntax of a regex. +/// +/// This represents a group of configuration options that specifically apply +/// to how the concrete syntax of a regular expression is interpreted. In +/// particular, they are generally forwarded to the +/// [`ParserBuilder`](https://docs.rs/regex-syntax/*/regex_syntax/struct.ParserBuilder.html) +/// in the +/// [`regex-syntax`](https://docs.rs/regex-syntax) +/// crate when building a regex from its concrete syntax directly. +/// +/// These options are defined as a group since they apply to every regex engine +/// in this crate. Instead of re-defining them on every engine's builder, they +/// are instead provided here as one cohesive unit. +#[derive(Clone, Copy, Debug)] +pub struct Config { + case_insensitive: bool, + multi_line: bool, + dot_matches_new_line: bool, + crlf: bool, + line_terminator: u8, + swap_greed: bool, + ignore_whitespace: bool, + unicode: bool, + utf8: bool, + nest_limit: u32, + octal: bool, +} + +impl Config { + /// Return a new default syntax configuration. + pub fn new() -> Config { + // These defaults match the ones used in regex-syntax. + Config { + case_insensitive: false, + multi_line: false, + dot_matches_new_line: false, + crlf: false, + line_terminator: b'\n', + swap_greed: false, + ignore_whitespace: false, + unicode: true, + utf8: true, + nest_limit: 250, + octal: false, + } + } + + /// Enable or disable the case insensitive flag by default. + /// + /// When Unicode mode is enabled, case insensitivity is Unicode-aware. + /// Specifically, it will apply the "simple" case folding rules as + /// specified by Unicode. + /// + /// By default this is disabled. It may alternatively be selectively + /// enabled in the regular expression itself via the `i` flag. + pub fn case_insensitive(mut self, yes: bool) -> Config { + self.case_insensitive = yes; + self + } + + /// Enable or disable the multi-line matching flag by default. + /// + /// When this is enabled, the `^` and `$` look-around assertions will + /// match immediately after and immediately before a new line character, + /// respectively. Note that the `\A` and `\z` look-around assertions are + /// unaffected by this setting and always correspond to matching at the + /// beginning and end of the input. + /// + /// By default this is disabled. It may alternatively be selectively + /// enabled in the regular expression itself via the `m` flag. + pub fn multi_line(mut self, yes: bool) -> Config { + self.multi_line = yes; + self + } + + /// Enable or disable the "dot matches any character" flag by default. + /// + /// When this is enabled, `.` will match any character. When it's disabled, + /// then `.` will match any character except for a new line character. + /// + /// Note that `.` is impacted by whether the "unicode" setting is enabled + /// or not. When Unicode is enabled (the default), `.` will match any UTF-8 + /// encoding of any Unicode scalar value (sans a new line, depending on + /// whether this "dot matches new line" option is enabled). When Unicode + /// mode is disabled, `.` will match any byte instead. Because of this, + /// when Unicode mode is disabled, `.` can only be used when the "allow + /// invalid UTF-8" option is enabled, since `.` could otherwise match + /// invalid UTF-8. + /// + /// By default this is disabled. It may alternatively be selectively + /// enabled in the regular expression itself via the `s` flag. + pub fn dot_matches_new_line(mut self, yes: bool) -> Config { + self.dot_matches_new_line = yes; + self + } + + /// Enable or disable the "CRLF mode" flag by default. + /// + /// By default this is disabled. It may alternatively be selectively + /// enabled in the regular expression itself via the `R` flag. + /// + /// When CRLF mode is enabled, the following happens: + /// + /// * Unless `dot_matches_new_line` is enabled, `.` will match any character + /// except for `\r` and `\n`. + /// * When `multi_line` mode is enabled, `^` and `$` will treat `\r\n`, + /// `\r` and `\n` as line terminators. And in particular, neither will + /// match between a `\r` and a `\n`. + pub fn crlf(mut self, yes: bool) -> Config { + self.crlf = yes; + self + } + + /// Sets the line terminator for use with `(?u-s:.)` and `(?-us:.)`. + /// + /// Namely, instead of `.` (by default) matching everything except for `\n`, + /// this will cause `.` to match everything except for the byte given. + /// + /// If `.` is used in a context where Unicode mode is enabled and this byte + /// isn't ASCII, then an error will be returned. When Unicode mode is + /// disabled, then any byte is permitted, but will return an error if UTF-8 + /// mode is enabled and it is a non-ASCII byte. + /// + /// In short, any ASCII value for a line terminator is always okay. But a + /// non-ASCII byte might result in an error depending on whether Unicode + /// mode or UTF-8 mode are enabled. + /// + /// Note that if `R` mode is enabled then it always takes precedence and + /// the line terminator will be treated as `\r` and `\n` simultaneously. + /// + /// Note also that this *doesn't* impact the look-around assertions + /// `(?m:^)` and `(?m:$)`. That's usually controlled by additional + /// configuration in the regex engine itself. + pub fn line_terminator(mut self, byte: u8) -> Config { + self.line_terminator = byte; + self + } + + /// Enable or disable the "swap greed" flag by default. + /// + /// When this is enabled, `.*` (for example) will become ungreedy and `.*?` + /// will become greedy. + /// + /// By default this is disabled. It may alternatively be selectively + /// enabled in the regular expression itself via the `U` flag. + pub fn swap_greed(mut self, yes: bool) -> Config { + self.swap_greed = yes; + self + } + + /// Enable verbose mode in the regular expression. + /// + /// When enabled, verbose mode permits insigificant whitespace in many + /// places in the regular expression, as well as comments. Comments are + /// started using `#` and continue until the end of the line. + /// + /// By default, this is disabled. It may be selectively enabled in the + /// regular expression by using the `x` flag regardless of this setting. + pub fn ignore_whitespace(mut self, yes: bool) -> Config { + self.ignore_whitespace = yes; + self + } + + /// Enable or disable the Unicode flag (`u`) by default. + /// + /// By default this is **enabled**. It may alternatively be selectively + /// disabled in the regular expression itself via the `u` flag. + /// + /// Note that unless "allow invalid UTF-8" is enabled (it's disabled by + /// default), a regular expression will fail to parse if Unicode mode is + /// disabled and a sub-expression could possibly match invalid UTF-8. + /// + /// **WARNING**: Unicode mode can greatly increase the size of the compiled + /// DFA, which can noticeably impact both memory usage and compilation + /// time. This is especially noticeable if your regex contains character + /// classes like `\w` that are impacted by whether Unicode is enabled or + /// not. If Unicode is not necessary, you are encouraged to disable it. + pub fn unicode(mut self, yes: bool) -> Config { + self.unicode = yes; + self + } + + /// When disabled, the builder will permit the construction of a regular + /// expression that may match invalid UTF-8. + /// + /// For example, when [`Config::unicode`] is disabled, then + /// expressions like `[^a]` may match invalid UTF-8 since they can match + /// any single byte that is not `a`. By default, these sub-expressions + /// are disallowed to avoid returning offsets that split a UTF-8 + /// encoded codepoint. However, in cases where matching at arbitrary + /// locations is desired, this option can be disabled to permit all such + /// sub-expressions. + /// + /// When enabled (the default), the builder is guaranteed to produce a + /// regex that will only ever match valid UTF-8 (otherwise, the builder + /// will return an error). + pub fn utf8(mut self, yes: bool) -> Config { + self.utf8 = yes; + self + } + + /// Set the nesting limit used for the regular expression parser. + /// + /// The nesting limit controls how deep the abstract syntax tree is allowed + /// to be. If the AST exceeds the given limit (e.g., with too many nested + /// groups), then an error is returned by the parser. + /// + /// The purpose of this limit is to act as a heuristic to prevent stack + /// overflow when building a finite automaton from a regular expression's + /// abstract syntax tree. In particular, construction currently uses + /// recursion. In the future, the implementation may stop using recursion + /// and this option will no longer be necessary. + /// + /// This limit is not checked until the entire AST is parsed. Therefore, + /// if callers want to put a limit on the amount of heap space used, then + /// they should impose a limit on the length, in bytes, of the concrete + /// pattern string. In particular, this is viable since the parser will + /// limit itself to heap space proportional to the length of the pattern + /// string. + /// + /// Note that a nest limit of `0` will return a nest limit error for most + /// patterns but not all. For example, a nest limit of `0` permits `a` but + /// not `ab`, since `ab` requires a concatenation AST item, which results + /// in a nest depth of `1`. In general, a nest limit is not something that + /// manifests in an obvious way in the concrete syntax, therefore, it + /// should not be used in a granular way. + pub fn nest_limit(mut self, limit: u32) -> Config { + self.nest_limit = limit; + self + } + + /// Whether to support octal syntax or not. + /// + /// Octal syntax is a little-known way of uttering Unicode codepoints in + /// a regular expression. For example, `a`, `\x61`, `\u0061` and + /// `\141` are all equivalent regular expressions, where the last example + /// shows octal syntax. + /// + /// While supporting octal syntax isn't in and of itself a problem, it does + /// make good error messages harder. That is, in PCRE based regex engines, + /// syntax like `\1` invokes a backreference, which is explicitly + /// unsupported in Rust's regex engine. However, many users expect it to + /// be supported. Therefore, when octal support is disabled, the error + /// message will explicitly mention that backreferences aren't supported. + /// + /// Octal syntax is disabled by default. + pub fn octal(mut self, yes: bool) -> Config { + self.octal = yes; + self + } + + /// Returns whether "unicode" mode is enabled. + pub fn get_unicode(&self) -> bool { + self.unicode + } + + /// Returns whether "case insensitive" mode is enabled. + pub fn get_case_insensitive(&self) -> bool { + self.case_insensitive + } + + /// Returns whether "multi line" mode is enabled. + pub fn get_multi_line(&self) -> bool { + self.multi_line + } + + /// Returns whether "dot matches new line" mode is enabled. + pub fn get_dot_matches_new_line(&self) -> bool { + self.dot_matches_new_line + } + + /// Returns whether "CRLF" mode is enabled. + pub fn get_crlf(&self) -> bool { + self.crlf + } + + /// Returns the line terminator in this syntax configuration. + pub fn get_line_terminator(&self) -> u8 { + self.line_terminator + } + + /// Returns whether "swap greed" mode is enabled. + pub fn get_swap_greed(&self) -> bool { + self.swap_greed + } + + /// Returns whether "ignore whitespace" mode is enabled. + pub fn get_ignore_whitespace(&self) -> bool { + self.ignore_whitespace + } + + /// Returns whether UTF-8 mode is enabled. + pub fn get_utf8(&self) -> bool { + self.utf8 + } + + /// Returns the "nest limit" setting. + pub fn get_nest_limit(&self) -> u32 { + self.nest_limit + } + + /// Returns whether "octal" mode is enabled. + pub fn get_octal(&self) -> bool { + self.octal + } + + /// Applies this configuration to the given parser. + pub(crate) fn apply(&self, builder: &mut ParserBuilder) { + builder + .unicode(self.unicode) + .case_insensitive(self.case_insensitive) + .multi_line(self.multi_line) + .dot_matches_new_line(self.dot_matches_new_line) + .crlf(self.crlf) + .line_terminator(self.line_terminator) + .swap_greed(self.swap_greed) + .ignore_whitespace(self.ignore_whitespace) + .utf8(self.utf8) + .nest_limit(self.nest_limit) + .octal(self.octal); + } + + /// Applies this configuration to the given AST parser. + pub(crate) fn apply_ast(&self, builder: &mut ast::parse::ParserBuilder) { + builder + .ignore_whitespace(self.ignore_whitespace) + .nest_limit(self.nest_limit) + .octal(self.octal); + } + + /// Applies this configuration to the given AST-to-HIR translator. + pub(crate) fn apply_hir( + &self, + builder: &mut hir::translate::TranslatorBuilder, + ) { + builder + .unicode(self.unicode) + .case_insensitive(self.case_insensitive) + .multi_line(self.multi_line) + .crlf(self.crlf) + .dot_matches_new_line(self.dot_matches_new_line) + .line_terminator(self.line_terminator) + .swap_greed(self.swap_greed) + .utf8(self.utf8); + } +} + +impl Default for Config { + fn default() -> Config { + Config::new() + } +} diff --git a/third_party/rust/regex-automata/src/util/unicode_data/mod.rs b/third_party/rust/regex-automata/src/util/unicode_data/mod.rs new file mode 100644 index 0000000000..fc7b1c738a --- /dev/null +++ b/third_party/rust/regex-automata/src/util/unicode_data/mod.rs @@ -0,0 +1,17 @@ +// This cfg should match the one in src/util/look.rs that uses perl_word. +#[cfg(all( + // We have to explicitly want to support Unicode word boundaries. + feature = "unicode-word-boundary", + not(all( + // If we don't have regex-syntax at all, then we definitely need to + // bring our own \w data table. + feature = "syntax", + // If unicode-perl is enabled, then regex-syntax/unicode-perl is + // also enabled, which in turn means we can use regex-syntax's + // is_word_character routine (and thus use its data tables). But if + // unicode-perl is not enabled, even if syntax is, then we need to + // bring our own. + feature = "unicode-perl", + )), +))] +pub(crate) mod perl_word; diff --git a/third_party/rust/regex-automata/src/util/unicode_data/perl_word.rs b/third_party/rust/regex-automata/src/util/unicode_data/perl_word.rs new file mode 100644 index 0000000000..74d62656f3 --- /dev/null +++ b/third_party/rust/regex-automata/src/util/unicode_data/perl_word.rs @@ -0,0 +1,781 @@ +// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: +// +// ucd-generate perl-word tmp/ucd-15.0.0/ --chars +// +// Unicode version: 15.0.0. +// +// ucd-generate 0.2.15 is available on crates.io. + +pub const PERL_WORD: &'static [(char, char)] = &[ + ('0', '9'), + ('A', 'Z'), + ('_', '_'), + ('a', 'z'), + ('ª', 'ª'), + ('µ', 'µ'), + ('º', 'º'), + ('À', 'Ö'), + ('Ø', 'ö'), + ('ø', 'ˁ'), + ('ˆ', 'ˑ'), + ('ˠ', 'ˤ'), + ('ˬ', 'ˬ'), + ('ˮ', 'ˮ'), + ('\u{300}', 'ʹ'), + ('Ͷ', 'ͷ'), + ('ͺ', 'ͽ'), + ('Ϳ', 'Ϳ'), + ('Ά', 'Ά'), + ('Έ', 'Ί'), + ('Ό', 'Ό'), + ('Ύ', 'Ρ'), + ('Σ', 'ϵ'), + ('Ϸ', 'ҁ'), + ('\u{483}', 'ԯ'), + ('Ա', 'Ֆ'), + ('ՙ', 'ՙ'), + ('ՠ', 'ֈ'), + ('\u{591}', '\u{5bd}'), + ('\u{5bf}', '\u{5bf}'), + ('\u{5c1}', '\u{5c2}'), + ('\u{5c4}', '\u{5c5}'), + ('\u{5c7}', '\u{5c7}'), + ('א', 'ת'), + ('ׯ', 'ײ'), + ('\u{610}', '\u{61a}'), + ('ؠ', '٩'), + ('ٮ', 'ۓ'), + ('ە', '\u{6dc}'), + ('\u{6df}', '\u{6e8}'), + ('\u{6ea}', 'ۼ'), + ('ۿ', 'ۿ'), + ('ܐ', '\u{74a}'), + ('ݍ', 'ޱ'), + ('߀', 'ߵ'), + ('ߺ', 'ߺ'), + ('\u{7fd}', '\u{7fd}'), + ('ࠀ', '\u{82d}'), + ('ࡀ', '\u{85b}'), + ('ࡠ', 'ࡪ'), + ('ࡰ', 'ࢇ'), + ('ࢉ', 'ࢎ'), + ('\u{898}', '\u{8e1}'), + ('\u{8e3}', '\u{963}'), + ('०', '९'), + ('ॱ', 'ঃ'), + ('অ', 'ঌ'), + ('এ', 'ঐ'), + ('ও', 'ন'), + ('প', 'র'), + ('ল', 'ল'), + ('শ', 'হ'), + ('\u{9bc}', '\u{9c4}'), + ('ে', 'ৈ'), + ('ো', 'ৎ'), + ('\u{9d7}', '\u{9d7}'), + ('ড়', 'ঢ়'), + ('য়', '\u{9e3}'), + ('০', 'ৱ'), + ('ৼ', 'ৼ'), + ('\u{9fe}', '\u{9fe}'), + ('\u{a01}', 'ਃ'), + ('ਅ', 'ਊ'), + ('ਏ', 'ਐ'), + ('ਓ', 'ਨ'), + ('ਪ', 'ਰ'), + ('ਲ', 'ਲ਼'), + ('ਵ', 'ਸ਼'), + ('ਸ', 'ਹ'), + ('\u{a3c}', '\u{a3c}'), + ('ਾ', '\u{a42}'), + ('\u{a47}', '\u{a48}'), + ('\u{a4b}', '\u{a4d}'), + ('\u{a51}', '\u{a51}'), + ('ਖ਼', 'ੜ'), + ('ਫ਼', 'ਫ਼'), + ('੦', '\u{a75}'), + ('\u{a81}', 'ઃ'), + ('અ', 'ઍ'), + ('એ', 'ઑ'), + ('ઓ', 'ન'), + ('પ', 'ર'), + ('લ', 'ળ'), + ('વ', 'હ'), + ('\u{abc}', '\u{ac5}'), + ('\u{ac7}', 'ૉ'), + ('ો', '\u{acd}'), + ('ૐ', 'ૐ'), + ('ૠ', '\u{ae3}'), + ('૦', '૯'), + ('ૹ', '\u{aff}'), + ('\u{b01}', 'ଃ'), + ('ଅ', 'ଌ'), + ('ଏ', 'ଐ'), + ('ଓ', 'ନ'), + ('ପ', 'ର'), + ('ଲ', 'ଳ'), + ('ଵ', 'ହ'), + ('\u{b3c}', '\u{b44}'), + ('େ', 'ୈ'), + ('ୋ', '\u{b4d}'), + ('\u{b55}', '\u{b57}'), + ('ଡ଼', 'ଢ଼'), + ('ୟ', '\u{b63}'), + ('୦', '୯'), + ('ୱ', 'ୱ'), + ('\u{b82}', 'ஃ'), + ('அ', 'ஊ'), + ('எ', 'ஐ'), + ('ஒ', 'க'), + ('ங', 'ச'), + ('ஜ', 'ஜ'), + ('ஞ', 'ட'), + ('ண', 'த'), + ('ந', 'ப'), + ('ம', 'ஹ'), + ('\u{bbe}', 'ூ'), + ('ெ', 'ை'), + ('ொ', '\u{bcd}'), + ('ௐ', 'ௐ'), + ('\u{bd7}', '\u{bd7}'), + ('௦', '௯'), + ('\u{c00}', 'ఌ'), + ('ఎ', 'ఐ'), + ('ఒ', 'న'), + ('ప', 'హ'), + ('\u{c3c}', 'ౄ'), + ('\u{c46}', '\u{c48}'), + ('\u{c4a}', '\u{c4d}'), + ('\u{c55}', '\u{c56}'), + ('ౘ', 'ౚ'), + ('ౝ', 'ౝ'), + ('ౠ', '\u{c63}'), + ('౦', '౯'), + ('ಀ', 'ಃ'), + ('ಅ', 'ಌ'), + ('ಎ', 'ಐ'), + ('ಒ', 'ನ'), + ('ಪ', 'ಳ'), + ('ವ', 'ಹ'), + ('\u{cbc}', 'ೄ'), + ('\u{cc6}', 'ೈ'), + ('ೊ', '\u{ccd}'), + ('\u{cd5}', '\u{cd6}'), + ('ೝ', 'ೞ'), + ('ೠ', '\u{ce3}'), + ('೦', '೯'), + ('ೱ', 'ೳ'), + ('\u{d00}', 'ഌ'), + ('എ', 'ഐ'), + ('ഒ', '\u{d44}'), + ('െ', 'ൈ'), + ('ൊ', 'ൎ'), + ('ൔ', '\u{d57}'), + ('ൟ', '\u{d63}'), + ('൦', '൯'), + ('ൺ', 'ൿ'), + ('\u{d81}', 'ඃ'), + ('අ', 'ඖ'), + ('ක', 'න'), + ('ඳ', 'ර'), + ('ල', 'ල'), + ('ව', 'ෆ'), + ('\u{dca}', '\u{dca}'), + ('\u{dcf}', '\u{dd4}'), + ('\u{dd6}', '\u{dd6}'), + ('ෘ', '\u{ddf}'), + ('෦', '෯'), + ('ෲ', 'ෳ'), + ('ก', '\u{e3a}'), + ('เ', '\u{e4e}'), + ('๐', '๙'), + ('ກ', 'ຂ'), + ('ຄ', 'ຄ'), + ('ຆ', 'ຊ'), + ('ຌ', 'ຣ'), + ('ລ', 'ລ'), + ('ວ', 'ຽ'), + ('ເ', 'ໄ'), + ('ໆ', 'ໆ'), + ('\u{ec8}', '\u{ece}'), + ('໐', '໙'), + ('ໜ', 'ໟ'), + ('ༀ', 'ༀ'), + ('\u{f18}', '\u{f19}'), + ('༠', '༩'), + ('\u{f35}', '\u{f35}'), + ('\u{f37}', '\u{f37}'), + ('\u{f39}', '\u{f39}'), + ('༾', 'ཇ'), + ('ཉ', 'ཬ'), + ('\u{f71}', '\u{f84}'), + ('\u{f86}', '\u{f97}'), + ('\u{f99}', '\u{fbc}'), + ('\u{fc6}', '\u{fc6}'), + ('က', '၉'), + ('ၐ', '\u{109d}'), + ('Ⴀ', 'Ⴥ'), + ('Ⴧ', 'Ⴧ'), + ('Ⴭ', 'Ⴭ'), + ('ა', 'ჺ'), + ('ჼ', 'ቈ'), + ('ቊ', 'ቍ'), + ('ቐ', 'ቖ'), + ('ቘ', 'ቘ'), + ('ቚ', 'ቝ'), + ('በ', 'ኈ'), + ('ኊ', 'ኍ'), + ('ነ', 'ኰ'), + ('ኲ', 'ኵ'), + ('ኸ', 'ኾ'), + ('ዀ', 'ዀ'), + ('ዂ', 'ዅ'), + ('ወ', 'ዖ'), + ('ዘ', 'ጐ'), + ('ጒ', 'ጕ'), + ('ጘ', 'ፚ'), + ('\u{135d}', '\u{135f}'), + ('ᎀ', 'ᎏ'), + ('Ꭰ', 'Ᏽ'), + ('ᏸ', 'ᏽ'), + ('ᐁ', 'ᙬ'), + ('ᙯ', 'ᙿ'), + ('ᚁ', 'ᚚ'), + ('ᚠ', 'ᛪ'), + ('ᛮ', 'ᛸ'), + ('ᜀ', '᜕'), + ('ᜟ', '᜴'), + ('ᝀ', '\u{1753}'), + ('ᝠ', 'ᝬ'), + ('ᝮ', 'ᝰ'), + ('\u{1772}', '\u{1773}'), + ('ក', '\u{17d3}'), + ('ៗ', 'ៗ'), + ('ៜ', '\u{17dd}'), + ('០', '៩'), + ('\u{180b}', '\u{180d}'), + ('\u{180f}', '᠙'), + ('ᠠ', 'ᡸ'), + ('ᢀ', 'ᢪ'), + ('ᢰ', 'ᣵ'), + ('ᤀ', 'ᤞ'), + ('\u{1920}', 'ᤫ'), + ('ᤰ', '\u{193b}'), + ('᥆', 'ᥭ'), + ('ᥰ', 'ᥴ'), + ('ᦀ', 'ᦫ'), + ('ᦰ', 'ᧉ'), + ('᧐', '᧙'), + ('ᨀ', '\u{1a1b}'), + ('ᨠ', '\u{1a5e}'), + ('\u{1a60}', '\u{1a7c}'), + ('\u{1a7f}', '᪉'), + ('᪐', '᪙'), + ('ᪧ', 'ᪧ'), + ('\u{1ab0}', '\u{1ace}'), + ('\u{1b00}', 'ᭌ'), + ('᭐', '᭙'), + ('\u{1b6b}', '\u{1b73}'), + ('\u{1b80}', '᯳'), + ('ᰀ', '\u{1c37}'), + ('᱀', '᱉'), + ('ᱍ', 'ᱽ'), + ('ᲀ', 'ᲈ'), + ('Ა', 'Ჺ'), + ('Ჽ', 'Ჿ'), + ('\u{1cd0}', '\u{1cd2}'), + ('\u{1cd4}', 'ᳺ'), + ('ᴀ', 'ἕ'), + ('Ἐ', 'Ἕ'), + ('ἠ', 'ὅ'), + ('Ὀ', 'Ὅ'), + ('ὐ', 'ὗ'), + ('Ὑ', 'Ὑ'), + ('Ὓ', 'Ὓ'), + ('Ὕ', 'Ὕ'), + ('Ὗ', 'ώ'), + ('ᾀ', 'ᾴ'), + ('ᾶ', 'ᾼ'), + ('ι', 'ι'), + ('ῂ', 'ῄ'), + ('ῆ', 'ῌ'), + ('ῐ', 'ΐ'), + ('ῖ', 'Ί'), + ('ῠ', 'Ῥ'), + ('ῲ', 'ῴ'), + ('ῶ', 'ῼ'), + ('\u{200c}', '\u{200d}'), + ('‿', '⁀'), + ('⁔', '⁔'), + ('ⁱ', 'ⁱ'), + ('ⁿ', 'ⁿ'), + ('ₐ', 'ₜ'), + ('\u{20d0}', '\u{20f0}'), + ('ℂ', 'ℂ'), + ('ℇ', 'ℇ'), + ('ℊ', 'ℓ'), + ('ℕ', 'ℕ'), + ('ℙ', 'ℝ'), + ('ℤ', 'ℤ'), + ('Ω', 'Ω'), + ('ℨ', 'ℨ'), + ('K', 'ℭ'), + ('ℯ', 'ℹ'), + ('ℼ', 'ℿ'), + ('ⅅ', 'ⅉ'), + ('ⅎ', 'ⅎ'), + ('Ⅰ', 'ↈ'), + ('Ⓐ', 'ⓩ'), + ('Ⰰ', 'ⳤ'), + ('Ⳬ', 'ⳳ'), + ('ⴀ', 'ⴥ'), + ('ⴧ', 'ⴧ'), + ('ⴭ', 'ⴭ'), + ('ⴰ', 'ⵧ'), + ('ⵯ', 'ⵯ'), + ('\u{2d7f}', 'ⶖ'), + ('ⶠ', 'ⶦ'), + ('ⶨ', 'ⶮ'), + ('ⶰ', 'ⶶ'), + ('ⶸ', 'ⶾ'), + ('ⷀ', 'ⷆ'), + ('ⷈ', 'ⷎ'), + ('ⷐ', 'ⷖ'), + ('ⷘ', 'ⷞ'), + ('\u{2de0}', '\u{2dff}'), + ('ⸯ', 'ⸯ'), + ('々', '〇'), + ('〡', '\u{302f}'), + ('〱', '〵'), + ('〸', '〼'), + ('ぁ', 'ゖ'), + ('\u{3099}', '\u{309a}'), + ('ゝ', 'ゟ'), + ('ァ', 'ヺ'), + ('ー', 'ヿ'), + ('ㄅ', 'ㄯ'), + ('ㄱ', 'ㆎ'), + ('ㆠ', 'ㆿ'), + ('ㇰ', 'ㇿ'), + ('㐀', '䶿'), + ('一', 'ꒌ'), + ('ꓐ', 'ꓽ'), + ('ꔀ', 'ꘌ'), + ('ꘐ', 'ꘫ'), + ('Ꙁ', '\u{a672}'), + ('\u{a674}', '\u{a67d}'), + ('ꙿ', '\u{a6f1}'), + ('ꜗ', 'ꜟ'), + ('Ꜣ', 'ꞈ'), + ('Ꞌ', 'ꟊ'), + ('Ꟑ', 'ꟑ'), + ('ꟓ', 'ꟓ'), + ('ꟕ', 'ꟙ'), + ('ꟲ', 'ꠧ'), + ('\u{a82c}', '\u{a82c}'), + ('ꡀ', 'ꡳ'), + ('ꢀ', '\u{a8c5}'), + ('꣐', '꣙'), + ('\u{a8e0}', 'ꣷ'), + ('ꣻ', 'ꣻ'), + ('ꣽ', '\u{a92d}'), + ('ꤰ', '꥓'), + ('ꥠ', 'ꥼ'), + ('\u{a980}', '꧀'), + ('ꧏ', '꧙'), + ('ꧠ', 'ꧾ'), + ('ꨀ', '\u{aa36}'), + ('ꩀ', 'ꩍ'), + ('꩐', '꩙'), + ('ꩠ', 'ꩶ'), + ('ꩺ', 'ꫂ'), + ('ꫛ', 'ꫝ'), + ('ꫠ', 'ꫯ'), + ('ꫲ', '\u{aaf6}'), + ('ꬁ', 'ꬆ'), + ('ꬉ', 'ꬎ'), + ('ꬑ', 'ꬖ'), + ('ꬠ', 'ꬦ'), + ('ꬨ', 'ꬮ'), + ('ꬰ', 'ꭚ'), + ('ꭜ', 'ꭩ'), + ('ꭰ', 'ꯪ'), + ('꯬', '\u{abed}'), + ('꯰', '꯹'), + ('가', '힣'), + ('ힰ', 'ퟆ'), + ('ퟋ', 'ퟻ'), + ('豈', '舘'), + ('並', '龎'), + ('ff', 'st'), + ('ﬓ', 'ﬗ'), + ('יִ', 'ﬨ'), + ('שׁ', 'זּ'), + ('טּ', 'לּ'), + ('מּ', 'מּ'), + ('נּ', 'סּ'), + ('ףּ', 'פּ'), + ('צּ', 'ﮱ'), + ('ﯓ', 'ﴽ'), + ('ﵐ', 'ﶏ'), + ('ﶒ', 'ﷇ'), + ('ﷰ', 'ﷻ'), + ('\u{fe00}', '\u{fe0f}'), + ('\u{fe20}', '\u{fe2f}'), + ('︳', '︴'), + ('﹍', '﹏'), + ('ﹰ', 'ﹴ'), + ('ﹶ', 'ﻼ'), + ('0', '9'), + ('A', 'Z'), + ('_', '_'), + ('a', 'z'), + ('ヲ', 'ᄒ'), + ('ᅡ', 'ᅦ'), + ('ᅧ', 'ᅬ'), + ('ᅭ', 'ᅲ'), + ('ᅳ', 'ᅵ'), + ('𐀀', '𐀋'), + ('𐀍', '𐀦'), + ('𐀨', '𐀺'), + ('𐀼', '𐀽'), + ('𐀿', '𐁍'), + ('𐁐', '𐁝'), + ('𐂀', '𐃺'), + ('𐅀', '𐅴'), + ('\u{101fd}', '\u{101fd}'), + ('𐊀', '𐊜'), + ('𐊠', '𐋐'), + ('\u{102e0}', '\u{102e0}'), + ('𐌀', '𐌟'), + ('𐌭', '𐍊'), + ('𐍐', '\u{1037a}'), + ('𐎀', '𐎝'), + ('𐎠', '𐏃'), + ('𐏈', '𐏏'), + ('𐏑', '𐏕'), + ('𐐀', '𐒝'), + ('𐒠', '𐒩'), + ('𐒰', '𐓓'), + ('𐓘', '𐓻'), + ('𐔀', '𐔧'), + ('𐔰', '𐕣'), + ('𐕰', '𐕺'), + ('𐕼', '𐖊'), + ('𐖌', '𐖒'), + ('𐖔', '𐖕'), + ('𐖗', '𐖡'), + ('𐖣', '𐖱'), + ('𐖳', '𐖹'), + ('𐖻', '𐖼'), + ('𐘀', '𐜶'), + ('𐝀', '𐝕'), + ('𐝠', '𐝧'), + ('𐞀', '𐞅'), + ('𐞇', '𐞰'), + ('𐞲', '𐞺'), + ('𐠀', '𐠅'), + ('𐠈', '𐠈'), + ('𐠊', '𐠵'), + ('𐠷', '𐠸'), + ('𐠼', '𐠼'), + ('𐠿', '𐡕'), + ('𐡠', '𐡶'), + ('𐢀', '𐢞'), + ('𐣠', '𐣲'), + ('𐣴', '𐣵'), + ('𐤀', '𐤕'), + ('𐤠', '𐤹'), + ('𐦀', '𐦷'), + ('𐦾', '𐦿'), + ('𐨀', '\u{10a03}'), + ('\u{10a05}', '\u{10a06}'), + ('\u{10a0c}', '𐨓'), + ('𐨕', '𐨗'), + ('𐨙', '𐨵'), + ('\u{10a38}', '\u{10a3a}'), + ('\u{10a3f}', '\u{10a3f}'), + ('𐩠', '𐩼'), + ('𐪀', '𐪜'), + ('𐫀', '𐫇'), + ('𐫉', '\u{10ae6}'), + ('𐬀', '𐬵'), + ('𐭀', '𐭕'), + ('𐭠', '𐭲'), + ('𐮀', '𐮑'), + ('𐰀', '𐱈'), + ('𐲀', '𐲲'), + ('𐳀', '𐳲'), + ('𐴀', '\u{10d27}'), + ('𐴰', '𐴹'), + ('𐺀', '𐺩'), + ('\u{10eab}', '\u{10eac}'), + ('𐺰', '𐺱'), + ('\u{10efd}', '𐼜'), + ('𐼧', '𐼧'), + ('𐼰', '\u{10f50}'), + ('𐽰', '\u{10f85}'), + ('𐾰', '𐿄'), + ('𐿠', '𐿶'), + ('𑀀', '\u{11046}'), + ('𑁦', '𑁵'), + ('\u{1107f}', '\u{110ba}'), + ('\u{110c2}', '\u{110c2}'), + ('𑃐', '𑃨'), + ('𑃰', '𑃹'), + ('\u{11100}', '\u{11134}'), + ('𑄶', '𑄿'), + ('𑅄', '𑅇'), + ('𑅐', '\u{11173}'), + ('𑅶', '𑅶'), + ('\u{11180}', '𑇄'), + ('\u{111c9}', '\u{111cc}'), + ('𑇎', '𑇚'), + ('𑇜', '𑇜'), + ('𑈀', '𑈑'), + ('𑈓', '\u{11237}'), + ('\u{1123e}', '\u{11241}'), + ('𑊀', '𑊆'), + ('𑊈', '𑊈'), + ('𑊊', '𑊍'), + ('𑊏', '𑊝'), + ('𑊟', '𑊨'), + ('𑊰', '\u{112ea}'), + ('𑋰', '𑋹'), + ('\u{11300}', '𑌃'), + ('𑌅', '𑌌'), + ('𑌏', '𑌐'), + ('𑌓', '𑌨'), + ('𑌪', '𑌰'), + ('𑌲', '𑌳'), + ('𑌵', '𑌹'), + ('\u{1133b}', '𑍄'), + ('𑍇', '𑍈'), + ('𑍋', '𑍍'), + ('𑍐', '𑍐'), + ('\u{11357}', '\u{11357}'), + ('𑍝', '𑍣'), + ('\u{11366}', '\u{1136c}'), + ('\u{11370}', '\u{11374}'), + ('𑐀', '𑑊'), + ('𑑐', '𑑙'), + ('\u{1145e}', '𑑡'), + ('𑒀', '𑓅'), + ('𑓇', '𑓇'), + ('𑓐', '𑓙'), + ('𑖀', '\u{115b5}'), + ('𑖸', '\u{115c0}'), + ('𑗘', '\u{115dd}'), + ('𑘀', '\u{11640}'), + ('𑙄', '𑙄'), + ('𑙐', '𑙙'), + ('𑚀', '𑚸'), + ('𑛀', '𑛉'), + ('𑜀', '𑜚'), + ('\u{1171d}', '\u{1172b}'), + ('𑜰', '𑜹'), + ('𑝀', '𑝆'), + ('𑠀', '\u{1183a}'), + ('𑢠', '𑣩'), + ('𑣿', '𑤆'), + ('𑤉', '𑤉'), + ('𑤌', '𑤓'), + ('𑤕', '𑤖'), + ('𑤘', '𑤵'), + ('𑤷', '𑤸'), + ('\u{1193b}', '\u{11943}'), + ('𑥐', '𑥙'), + ('𑦠', '𑦧'), + ('𑦪', '\u{119d7}'), + ('\u{119da}', '𑧡'), + ('𑧣', '𑧤'), + ('𑨀', '\u{11a3e}'), + ('\u{11a47}', '\u{11a47}'), + ('𑩐', '\u{11a99}'), + ('𑪝', '𑪝'), + ('𑪰', '𑫸'), + ('𑰀', '𑰈'), + ('𑰊', '\u{11c36}'), + ('\u{11c38}', '𑱀'), + ('𑱐', '𑱙'), + ('𑱲', '𑲏'), + ('\u{11c92}', '\u{11ca7}'), + ('𑲩', '\u{11cb6}'), + ('𑴀', '𑴆'), + ('𑴈', '𑴉'), + ('𑴋', '\u{11d36}'), + ('\u{11d3a}', '\u{11d3a}'), + ('\u{11d3c}', '\u{11d3d}'), + ('\u{11d3f}', '\u{11d47}'), + ('𑵐', '𑵙'), + ('𑵠', '𑵥'), + ('𑵧', '𑵨'), + ('𑵪', '𑶎'), + ('\u{11d90}', '\u{11d91}'), + ('𑶓', '𑶘'), + ('𑶠', '𑶩'), + ('𑻠', '𑻶'), + ('\u{11f00}', '𑼐'), + ('𑼒', '\u{11f3a}'), + ('𑼾', '\u{11f42}'), + ('𑽐', '𑽙'), + ('𑾰', '𑾰'), + ('𒀀', '𒎙'), + ('𒐀', '𒑮'), + ('𒒀', '𒕃'), + ('𒾐', '𒿰'), + ('𓀀', '𓐯'), + ('\u{13440}', '\u{13455}'), + ('𔐀', '𔙆'), + ('𖠀', '𖨸'), + ('𖩀', '𖩞'), + ('𖩠', '𖩩'), + ('𖩰', '𖪾'), + ('𖫀', '𖫉'), + ('𖫐', '𖫭'), + ('\u{16af0}', '\u{16af4}'), + ('𖬀', '\u{16b36}'), + ('𖭀', '𖭃'), + ('𖭐', '𖭙'), + ('𖭣', '𖭷'), + ('𖭽', '𖮏'), + ('𖹀', '𖹿'), + ('𖼀', '𖽊'), + ('\u{16f4f}', '𖾇'), + ('\u{16f8f}', '𖾟'), + ('𖿠', '𖿡'), + ('𖿣', '\u{16fe4}'), + ('𖿰', '𖿱'), + ('𗀀', '𘟷'), + ('𘠀', '𘳕'), + ('𘴀', '𘴈'), + ('𚿰', '𚿳'), + ('𚿵', '𚿻'), + ('𚿽', '𚿾'), + ('𛀀', '𛄢'), + ('𛄲', '𛄲'), + ('𛅐', '𛅒'), + ('𛅕', '𛅕'), + ('𛅤', '𛅧'), + ('𛅰', '𛋻'), + ('𛰀', '𛱪'), + ('𛱰', '𛱼'), + ('𛲀', '𛲈'), + ('𛲐', '𛲙'), + ('\u{1bc9d}', '\u{1bc9e}'), + ('\u{1cf00}', '\u{1cf2d}'), + ('\u{1cf30}', '\u{1cf46}'), + ('\u{1d165}', '\u{1d169}'), + ('𝅭', '\u{1d172}'), + ('\u{1d17b}', '\u{1d182}'), + ('\u{1d185}', '\u{1d18b}'), + ('\u{1d1aa}', '\u{1d1ad}'), + ('\u{1d242}', '\u{1d244}'), + ('𝐀', '𝑔'), + ('𝑖', '𝒜'), + ('𝒞', '𝒟'), + ('𝒢', '𝒢'), + ('𝒥', '𝒦'), + ('𝒩', '𝒬'), + ('𝒮', '𝒹'), + ('𝒻', '𝒻'), + ('𝒽', '𝓃'), + ('𝓅', '𝔅'), + ('𝔇', '𝔊'), + ('𝔍', '𝔔'), + ('𝔖', '𝔜'), + ('𝔞', '𝔹'), + ('𝔻', '𝔾'), + ('𝕀', '𝕄'), + ('𝕆', '𝕆'), + ('𝕊', '𝕐'), + ('𝕒', '𝚥'), + ('𝚨', '𝛀'), + ('𝛂', '𝛚'), + ('𝛜', '𝛺'), + ('𝛼', '𝜔'), + ('𝜖', '𝜴'), + ('𝜶', '𝝎'), + ('𝝐', '𝝮'), + ('𝝰', '𝞈'), + ('𝞊', '𝞨'), + ('𝞪', '𝟂'), + ('𝟄', '𝟋'), + ('𝟎', '𝟿'), + ('\u{1da00}', '\u{1da36}'), + ('\u{1da3b}', '\u{1da6c}'), + ('\u{1da75}', '\u{1da75}'), + ('\u{1da84}', '\u{1da84}'), + ('\u{1da9b}', '\u{1da9f}'), + ('\u{1daa1}', '\u{1daaf}'), + ('𝼀', '𝼞'), + ('𝼥', '𝼪'), + ('\u{1e000}', '\u{1e006}'), + ('\u{1e008}', '\u{1e018}'), + ('\u{1e01b}', '\u{1e021}'), + ('\u{1e023}', '\u{1e024}'), + ('\u{1e026}', '\u{1e02a}'), + ('𞀰', '𞁭'), + ('\u{1e08f}', '\u{1e08f}'), + ('𞄀', '𞄬'), + ('\u{1e130}', '𞄽'), + ('𞅀', '𞅉'), + ('𞅎', '𞅎'), + ('𞊐', '\u{1e2ae}'), + ('𞋀', '𞋹'), + ('𞓐', '𞓹'), + ('𞟠', '𞟦'), + ('𞟨', '𞟫'), + ('𞟭', '𞟮'), + ('𞟰', '𞟾'), + ('𞠀', '𞣄'), + ('\u{1e8d0}', '\u{1e8d6}'), + ('𞤀', '𞥋'), + ('𞥐', '𞥙'), + ('𞸀', '𞸃'), + ('𞸅', '𞸟'), + ('𞸡', '𞸢'), + ('𞸤', '𞸤'), + ('𞸧', '𞸧'), + ('𞸩', '𞸲'), + ('𞸴', '𞸷'), + ('𞸹', '𞸹'), + ('𞸻', '𞸻'), + ('𞹂', '𞹂'), + ('𞹇', '𞹇'), + ('𞹉', '𞹉'), + ('𞹋', '𞹋'), + ('𞹍', '𞹏'), + ('𞹑', '𞹒'), + ('𞹔', '𞹔'), + ('𞹗', '𞹗'), + ('𞹙', '𞹙'), + ('𞹛', '𞹛'), + ('𞹝', '𞹝'), + ('𞹟', '𞹟'), + ('𞹡', '𞹢'), + ('𞹤', '𞹤'), + ('𞹧', '𞹪'), + ('𞹬', '𞹲'), + ('𞹴', '𞹷'), + ('𞹹', '𞹼'), + ('𞹾', '𞹾'), + ('𞺀', '𞺉'), + ('𞺋', '𞺛'), + ('𞺡', '𞺣'), + ('𞺥', '𞺩'), + ('𞺫', '𞺻'), + ('🄰', '🅉'), + ('🅐', '🅩'), + ('🅰', '🆉'), + ('🯰', '🯹'), + ('𠀀', '𪛟'), + ('𪜀', '𫜹'), + ('𫝀', '𫠝'), + ('𫠠', '𬺡'), + ('𬺰', '𮯠'), + ('丽', '𪘀'), + ('𰀀', '𱍊'), + ('𱍐', '𲎯'), + ('\u{e0100}', '\u{e01ef}'), +]; diff --git a/third_party/rust/regex-automata/src/util/utf8.rs b/third_party/rust/regex-automata/src/util/utf8.rs new file mode 100644 index 0000000000..91b27efe0f --- /dev/null +++ b/third_party/rust/regex-automata/src/util/utf8.rs @@ -0,0 +1,196 @@ +/*! +Utilities for dealing with UTF-8. + +This module provides some UTF-8 related helper routines, including an +incremental decoder. +*/ + +/// Returns true if and only if the given byte is considered a word character. +/// This only applies to ASCII. +/// +/// This was copied from regex-syntax so that we can use it to determine the +/// starting DFA state while searching without depending on regex-syntax. The +/// definition is never going to change, so there's no maintenance/bit-rot +/// hazard here. +#[cfg_attr(feature = "perf-inline", inline(always))] +pub(crate) fn is_word_byte(b: u8) -> bool { + const fn mkwordset() -> [bool; 256] { + // FIXME: Use as_usize() once const functions in traits are stable. + let mut set = [false; 256]; + set[b'_' as usize] = true; + + let mut byte = b'0'; + while byte <= b'9' { + set[byte as usize] = true; + byte += 1; + } + byte = b'A'; + while byte <= b'Z' { + set[byte as usize] = true; + byte += 1; + } + byte = b'a'; + while byte <= b'z' { + set[byte as usize] = true; + byte += 1; + } + set + } + const WORD: [bool; 256] = mkwordset(); + WORD[b as usize] +} + +/// Decodes the next UTF-8 encoded codepoint from the given byte slice. +/// +/// If no valid encoding of a codepoint exists at the beginning of the given +/// byte slice, then the first byte is returned instead. +/// +/// This returns `None` if and only if `bytes` is empty. +/// +/// This never panics. +/// +/// *WARNING*: This is not designed for performance. If you're looking for a +/// fast UTF-8 decoder, this is not it. If you feel like you need one in this +/// crate, then please file an issue and discuss your use case. +#[cfg_attr(feature = "perf-inline", inline(always))] +pub(crate) fn decode(bytes: &[u8]) -> Option<Result<char, u8>> { + if bytes.is_empty() { + return None; + } + let len = match len(bytes[0]) { + None => return Some(Err(bytes[0])), + Some(len) if len > bytes.len() => return Some(Err(bytes[0])), + Some(1) => return Some(Ok(char::from(bytes[0]))), + Some(len) => len, + }; + match core::str::from_utf8(&bytes[..len]) { + Ok(s) => Some(Ok(s.chars().next().unwrap())), + Err(_) => Some(Err(bytes[0])), + } +} + +/// Decodes the last UTF-8 encoded codepoint from the given byte slice. +/// +/// If no valid encoding of a codepoint exists at the end of the given byte +/// slice, then the last byte is returned instead. +/// +/// This returns `None` if and only if `bytes` is empty. +#[cfg_attr(feature = "perf-inline", inline(always))] +pub(crate) fn decode_last(bytes: &[u8]) -> Option<Result<char, u8>> { + if bytes.is_empty() { + return None; + } + let mut start = bytes.len() - 1; + let limit = bytes.len().saturating_sub(4); + while start > limit && !is_leading_or_invalid_byte(bytes[start]) { + start -= 1; + } + match decode(&bytes[start..]) { + None => None, + Some(Ok(ch)) => Some(Ok(ch)), + Some(Err(_)) => Some(Err(bytes[bytes.len() - 1])), + } +} + +/// Given a UTF-8 leading byte, this returns the total number of code units +/// in the following encoded codepoint. +/// +/// If the given byte is not a valid UTF-8 leading byte, then this returns +/// `None`. +#[cfg_attr(feature = "perf-inline", inline(always))] +fn len(byte: u8) -> Option<usize> { + if byte <= 0x7F { + return Some(1); + } else if byte & 0b1100_0000 == 0b1000_0000 { + return None; + } else if byte <= 0b1101_1111 { + Some(2) + } else if byte <= 0b1110_1111 { + Some(3) + } else if byte <= 0b1111_0111 { + Some(4) + } else { + None + } +} + +/// Returns true if and only if the given offset in the given bytes falls on a +/// valid UTF-8 encoded codepoint boundary. +/// +/// If `bytes` is not valid UTF-8, then the behavior of this routine is +/// unspecified. +#[cfg_attr(feature = "perf-inline", inline(always))] +pub(crate) fn is_boundary(bytes: &[u8], i: usize) -> bool { + match bytes.get(i) { + // The position at the end of the bytes always represents an empty + // string, which is a valid boundary. But anything after that doesn't + // make much sense to call valid a boundary. + None => i == bytes.len(), + // Other than ASCII (where the most significant bit is never set), + // valid starting bytes always have their most significant two bits + // set, where as continuation bytes never have their second most + // significant bit set. Therefore, this only returns true when bytes[i] + // corresponds to a byte that begins a valid UTF-8 encoding of a + // Unicode scalar value. + Some(&b) => b <= 0b0111_1111 || b >= 0b1100_0000, + } +} + +/// Returns true if and only if the given byte is either a valid leading UTF-8 +/// byte, or is otherwise an invalid byte that can never appear anywhere in a +/// valid UTF-8 sequence. +#[cfg_attr(feature = "perf-inline", inline(always))] +fn is_leading_or_invalid_byte(b: u8) -> bool { + // In the ASCII case, the most significant bit is never set. The leading + // byte of a 2/3/4-byte sequence always has the top two most significant + // bits set. For bytes that can never appear anywhere in valid UTF-8, this + // also returns true, since every such byte has its two most significant + // bits set: + // + // \xC0 :: 11000000 + // \xC1 :: 11000001 + // \xF5 :: 11110101 + // \xF6 :: 11110110 + // \xF7 :: 11110111 + // \xF8 :: 11111000 + // \xF9 :: 11111001 + // \xFA :: 11111010 + // \xFB :: 11111011 + // \xFC :: 11111100 + // \xFD :: 11111101 + // \xFE :: 11111110 + // \xFF :: 11111111 + (b & 0b1100_0000) != 0b1000_0000 +} + +/* +/// Returns the smallest possible index of the next valid UTF-8 sequence +/// starting after `i`. +/// +/// For all inputs, including invalid UTF-8 and any value of `i`, the return +/// value is guaranteed to be greater than `i`. (If there is no value greater +/// than `i` that fits in `usize`, then this panics.) +/// +/// Generally speaking, this should only be called on `text` when it is +/// permitted to assume that it is valid UTF-8 and where either `i >= +/// text.len()` or where `text[i]` is a leading byte of a UTF-8 sequence. +/// +/// NOTE: This method was used in a previous conception of iterators where we +/// specifically tried to skip over empty matches that split a codepoint by +/// simply requiring that our next search begin at the beginning of codepoint. +/// But we ended up changing that technique to always advance by 1 byte and +/// then filter out matches that split a codepoint after-the-fact. Thus, we no +/// longer use this method. But I've kept it around in case we want to switch +/// back to this approach. Its guarantees are a little subtle, so I'd prefer +/// not to rebuild it from whole cloth. +pub(crate) fn next(text: &[u8], i: usize) -> usize { + let b = match text.get(i) { + None => return i.checked_add(1).unwrap(), + Some(&b) => b, + }; + // For cases where we see an invalid UTF-8 byte, there isn't much we can do + // other than just start at the next byte. + let inc = len(b).unwrap_or(1); + i.checked_add(inc).unwrap() +} +*/ diff --git a/third_party/rust/regex-automata/src/util/wire.rs b/third_party/rust/regex-automata/src/util/wire.rs new file mode 100644 index 0000000000..ecf4fd8c0a --- /dev/null +++ b/third_party/rust/regex-automata/src/util/wire.rs @@ -0,0 +1,975 @@ +/*! +Types and routines that support the wire format of finite automata. + +Currently, this module just exports a few error types and some small helpers +for deserializing [dense DFAs](crate::dfa::dense::DFA) using correct alignment. +*/ + +/* +A collection of helper functions, types and traits for serializing automata. + +This crate defines its own bespoke serialization mechanism for some structures +provided in the public API, namely, DFAs. A bespoke mechanism was developed +primarily because structures like automata demand a specific binary format. +Attempting to encode their rich structure in an existing serialization +format is just not feasible. Moreover, the format for each structure is +generally designed such that deserialization is cheap. More specifically, that +deserialization can be done in constant time. (The idea being that you can +embed it into your binary or mmap it, and then use it immediately.) + +In order to achieve this, the dense and sparse DFAs in this crate use an +in-memory representation that very closely corresponds to its binary serialized +form. This pervades and complicates everything, and in some cases, requires +dealing with alignment and reasoning about safety. + +This technique does have major advantages. In particular, it permits doing +the potentially costly work of compiling a finite state machine in an offline +manner, and then loading it at runtime not only without having to re-compile +the regex, but even without the code required to do the compilation. This, for +example, permits one to use a pre-compiled DFA not only in environments without +Rust's standard library, but also in environments without a heap. + +In the code below, whenever we insert some kind of padding, it's to enforce a +4-byte alignment, unless otherwise noted. Namely, u32 is the only state ID type +supported. (In a previous version of this library, DFAs were generic over the +state ID representation.) + +Also, serialization generally requires the caller to specify endianness, +where as deserialization always assumes native endianness (otherwise cheap +deserialization would be impossible). This implies that serializing a structure +generally requires serializing both its big-endian and little-endian variants, +and then loading the correct one based on the target's endianness. +*/ + +use core::{ + cmp, + convert::{TryFrom, TryInto}, + mem::size_of, +}; + +#[cfg(feature = "alloc")] +use alloc::{vec, vec::Vec}; + +use crate::util::{ + int::Pointer, + primitives::{PatternID, PatternIDError, StateID, StateIDError}, +}; + +/// A hack to align a smaller type `B` with a bigger type `T`. +/// +/// The usual use of this is with `B = [u8]` and `T = u32`. That is, +/// it permits aligning a sequence of bytes on a 4-byte boundary. This +/// is useful in contexts where one wants to embed a serialized [dense +/// DFA](crate::dfa::dense::DFA) into a Rust a program while guaranteeing the +/// alignment required for the DFA. +/// +/// See [`dense::DFA::from_bytes`](crate::dfa::dense::DFA::from_bytes) for an +/// example of how to use this type. +#[repr(C)] +#[derive(Debug)] +pub struct AlignAs<B: ?Sized, T> { + /// A zero-sized field indicating the alignment we want. + pub _align: [T; 0], + /// A possibly non-sized field containing a sequence of bytes. + pub bytes: B, +} + +/// An error that occurs when serializing an object from this crate. +/// +/// Serialization, as used in this crate, universally refers to the process +/// of transforming a structure (like a DFA) into a custom binary format +/// represented by `&[u8]`. To this end, serialization is generally infallible. +/// However, it can fail when caller provided buffer sizes are too small. When +/// that occurs, a serialization error is reported. +/// +/// A `SerializeError` provides no introspection capabilities. Its only +/// supported operation is conversion to a human readable error message. +/// +/// This error type implements the `std::error::Error` trait only when the +/// `std` feature is enabled. Otherwise, this type is defined in all +/// configurations. +#[derive(Debug)] +pub struct SerializeError { + /// The name of the thing that a buffer is too small for. + /// + /// Currently, the only kind of serialization error is one that is + /// committed by a caller: providing a destination buffer that is too + /// small to fit the serialized object. This makes sense conceptually, + /// since every valid inhabitant of a type should be serializable. + /// + /// This is somewhat exposed in the public API of this crate. For example, + /// the `to_bytes_{big,little}_endian` APIs return a `Vec<u8>` and are + /// guaranteed to never panic or error. This is only possible because the + /// implementation guarantees that it will allocate a `Vec<u8>` that is + /// big enough. + /// + /// In summary, if a new serialization error kind needs to be added, then + /// it will need careful consideration. + what: &'static str, +} + +impl SerializeError { + pub(crate) fn buffer_too_small(what: &'static str) -> SerializeError { + SerializeError { what } + } +} + +impl core::fmt::Display for SerializeError { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + write!(f, "destination buffer is too small to write {}", self.what) + } +} + +#[cfg(feature = "std")] +impl std::error::Error for SerializeError {} + +/// An error that occurs when deserializing an object defined in this crate. +/// +/// Serialization, as used in this crate, universally refers to the process +/// of transforming a structure (like a DFA) into a custom binary format +/// represented by `&[u8]`. Deserialization, then, refers to the process of +/// cheaply converting this binary format back to the object's in-memory +/// representation as defined in this crate. To the extent possible, +/// deserialization will report this error whenever this process fails. +/// +/// A `DeserializeError` provides no introspection capabilities. Its only +/// supported operation is conversion to a human readable error message. +/// +/// This error type implements the `std::error::Error` trait only when the +/// `std` feature is enabled. Otherwise, this type is defined in all +/// configurations. +#[derive(Debug)] +pub struct DeserializeError(DeserializeErrorKind); + +#[derive(Debug)] +enum DeserializeErrorKind { + Generic { msg: &'static str }, + BufferTooSmall { what: &'static str }, + InvalidUsize { what: &'static str }, + VersionMismatch { expected: u32, found: u32 }, + EndianMismatch { expected: u32, found: u32 }, + AlignmentMismatch { alignment: usize, address: usize }, + LabelMismatch { expected: &'static str }, + ArithmeticOverflow { what: &'static str }, + PatternID { err: PatternIDError, what: &'static str }, + StateID { err: StateIDError, what: &'static str }, +} + +impl DeserializeError { + pub(crate) fn generic(msg: &'static str) -> DeserializeError { + DeserializeError(DeserializeErrorKind::Generic { msg }) + } + + pub(crate) fn buffer_too_small(what: &'static str) -> DeserializeError { + DeserializeError(DeserializeErrorKind::BufferTooSmall { what }) + } + + fn invalid_usize(what: &'static str) -> DeserializeError { + DeserializeError(DeserializeErrorKind::InvalidUsize { what }) + } + + fn version_mismatch(expected: u32, found: u32) -> DeserializeError { + DeserializeError(DeserializeErrorKind::VersionMismatch { + expected, + found, + }) + } + + fn endian_mismatch(expected: u32, found: u32) -> DeserializeError { + DeserializeError(DeserializeErrorKind::EndianMismatch { + expected, + found, + }) + } + + fn alignment_mismatch( + alignment: usize, + address: usize, + ) -> DeserializeError { + DeserializeError(DeserializeErrorKind::AlignmentMismatch { + alignment, + address, + }) + } + + fn label_mismatch(expected: &'static str) -> DeserializeError { + DeserializeError(DeserializeErrorKind::LabelMismatch { expected }) + } + + fn arithmetic_overflow(what: &'static str) -> DeserializeError { + DeserializeError(DeserializeErrorKind::ArithmeticOverflow { what }) + } + + fn pattern_id_error( + err: PatternIDError, + what: &'static str, + ) -> DeserializeError { + DeserializeError(DeserializeErrorKind::PatternID { err, what }) + } + + pub(crate) fn state_id_error( + err: StateIDError, + what: &'static str, + ) -> DeserializeError { + DeserializeError(DeserializeErrorKind::StateID { err, what }) + } +} + +#[cfg(feature = "std")] +impl std::error::Error for DeserializeError {} + +impl core::fmt::Display for DeserializeError { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + use self::DeserializeErrorKind::*; + + match self.0 { + Generic { msg } => write!(f, "{}", msg), + BufferTooSmall { what } => { + write!(f, "buffer is too small to read {}", what) + } + InvalidUsize { what } => { + write!(f, "{} is too big to fit in a usize", what) + } + VersionMismatch { expected, found } => write!( + f, + "unsupported version: \ + expected version {} but found version {}", + expected, found, + ), + EndianMismatch { expected, found } => write!( + f, + "endianness mismatch: expected 0x{:X} but got 0x{:X}. \ + (Are you trying to load an object serialized with a \ + different endianness?)", + expected, found, + ), + AlignmentMismatch { alignment, address } => write!( + f, + "alignment mismatch: slice starts at address \ + 0x{:X}, which is not aligned to a {} byte boundary", + address, alignment, + ), + LabelMismatch { expected } => write!( + f, + "label mismatch: start of serialized object should \ + contain a NUL terminated {:?} label, but a different \ + label was found", + expected, + ), + ArithmeticOverflow { what } => { + write!(f, "arithmetic overflow for {}", what) + } + PatternID { ref err, what } => { + write!(f, "failed to read pattern ID for {}: {}", what, err) + } + StateID { ref err, what } => { + write!(f, "failed to read state ID for {}: {}", what, err) + } + } + } +} + +/// Safely converts a `&[u32]` to `&[StateID]` with zero cost. +#[cfg_attr(feature = "perf-inline", inline(always))] +pub(crate) fn u32s_to_state_ids(slice: &[u32]) -> &[StateID] { + // SAFETY: This is safe because StateID is defined to have the same memory + // representation as a u32 (it is repr(transparent)). While not every u32 + // is a "valid" StateID, callers are not permitted to rely on the validity + // of StateIDs for memory safety. It can only lead to logical errors. (This + // is why StateID::new_unchecked is safe.) + unsafe { + core::slice::from_raw_parts( + slice.as_ptr().cast::<StateID>(), + slice.len(), + ) + } +} + +/// Safely converts a `&mut [u32]` to `&mut [StateID]` with zero cost. +pub(crate) fn u32s_to_state_ids_mut(slice: &mut [u32]) -> &mut [StateID] { + // SAFETY: This is safe because StateID is defined to have the same memory + // representation as a u32 (it is repr(transparent)). While not every u32 + // is a "valid" StateID, callers are not permitted to rely on the validity + // of StateIDs for memory safety. It can only lead to logical errors. (This + // is why StateID::new_unchecked is safe.) + unsafe { + core::slice::from_raw_parts_mut( + slice.as_mut_ptr().cast::<StateID>(), + slice.len(), + ) + } +} + +/// Safely converts a `&[u32]` to `&[PatternID]` with zero cost. +#[cfg_attr(feature = "perf-inline", inline(always))] +pub(crate) fn u32s_to_pattern_ids(slice: &[u32]) -> &[PatternID] { + // SAFETY: This is safe because PatternID is defined to have the same + // memory representation as a u32 (it is repr(transparent)). While not + // every u32 is a "valid" PatternID, callers are not permitted to rely + // on the validity of PatternIDs for memory safety. It can only lead to + // logical errors. (This is why PatternID::new_unchecked is safe.) + unsafe { + core::slice::from_raw_parts( + slice.as_ptr().cast::<PatternID>(), + slice.len(), + ) + } +} + +/// Checks that the given slice has an alignment that matches `T`. +/// +/// This is useful for checking that a slice has an appropriate alignment +/// before casting it to a &[T]. Note though that alignment is not itself +/// sufficient to perform the cast for any `T`. +pub(crate) fn check_alignment<T>( + slice: &[u8], +) -> Result<(), DeserializeError> { + let alignment = core::mem::align_of::<T>(); + let address = slice.as_ptr().as_usize(); + if address % alignment == 0 { + return Ok(()); + } + Err(DeserializeError::alignment_mismatch(alignment, address)) +} + +/// Reads a possibly empty amount of padding, up to 7 bytes, from the beginning +/// of the given slice. All padding bytes must be NUL bytes. +/// +/// This is useful because it can be theoretically necessary to pad the +/// beginning of a serialized object with NUL bytes to ensure that it starts +/// at a correctly aligned address. These padding bytes should come immediately +/// before the label. +/// +/// This returns the number of bytes read from the given slice. +pub(crate) fn skip_initial_padding(slice: &[u8]) -> usize { + let mut nread = 0; + while nread < 7 && nread < slice.len() && slice[nread] == 0 { + nread += 1; + } + nread +} + +/// Allocate a byte buffer of the given size, along with some initial padding +/// such that `buf[padding..]` has the same alignment as `T`, where the +/// alignment of `T` must be at most `8`. In particular, callers should treat +/// the first N bytes (second return value) as padding bytes that must not be +/// overwritten. In all cases, the following identity holds: +/// +/// ```ignore +/// let (buf, padding) = alloc_aligned_buffer::<StateID>(SIZE); +/// assert_eq!(SIZE, buf[padding..].len()); +/// ``` +/// +/// In practice, padding is often zero. +/// +/// The requirement for `8` as a maximum here is somewhat arbitrary. In +/// practice, we never need anything bigger in this crate, and so this function +/// does some sanity asserts under the assumption of a max alignment of `8`. +#[cfg(feature = "alloc")] +pub(crate) fn alloc_aligned_buffer<T>(size: usize) -> (Vec<u8>, usize) { + // NOTE: This is a kludge because there's no easy way to allocate a Vec<u8> + // with an alignment guaranteed to be greater than 1. We could create a + // Vec<u32>, but this cannot be safely transmuted to a Vec<u8> without + // concern, since reallocing or dropping the Vec<u8> is UB (different + // alignment than the initial allocation). We could define a wrapper type + // to manage this for us, but it seems like more machinery than it's worth. + let buf = vec![0; size]; + let align = core::mem::align_of::<T>(); + let address = buf.as_ptr().as_usize(); + if address % align == 0 { + return (buf, 0); + } + // Let's try this again. We have to create a totally new alloc with + // the maximum amount of bytes we might need. We can't just extend our + // pre-existing 'buf' because that might create a new alloc with a + // different alignment. + let extra = align - 1; + let mut buf = vec![0; size + extra]; + let address = buf.as_ptr().as_usize(); + // The code below handles the case where 'address' is aligned to T, so if + // we got lucky and 'address' is now aligned to T (when it previously + // wasn't), then we're done. + if address % align == 0 { + buf.truncate(size); + return (buf, 0); + } + let padding = ((address & !(align - 1)).checked_add(align).unwrap()) + .checked_sub(address) + .unwrap(); + assert!(padding <= 7, "padding of {} is bigger than 7", padding); + assert!( + padding <= extra, + "padding of {} is bigger than extra {} bytes", + padding, + extra + ); + buf.truncate(size + padding); + assert_eq!(size + padding, buf.len()); + assert_eq!( + 0, + buf[padding..].as_ptr().as_usize() % align, + "expected end of initial padding to be aligned to {}", + align, + ); + (buf, padding) +} + +/// Reads a NUL terminated label starting at the beginning of the given slice. +/// +/// If a NUL terminated label could not be found, then an error is returned. +/// Similarly, if a label is found but doesn't match the expected label, then +/// an error is returned. +/// +/// Upon success, the total number of bytes read (including padding bytes) is +/// returned. +pub(crate) fn read_label( + slice: &[u8], + expected_label: &'static str, +) -> Result<usize, DeserializeError> { + // Set an upper bound on how many bytes we scan for a NUL. Since no label + // in this crate is longer than 256 bytes, if we can't find one within that + // range, then we have corrupted data. + let first_nul = + slice[..cmp::min(slice.len(), 256)].iter().position(|&b| b == 0); + let first_nul = match first_nul { + Some(first_nul) => first_nul, + None => { + return Err(DeserializeError::generic( + "could not find NUL terminated label \ + at start of serialized object", + )); + } + }; + let len = first_nul + padding_len(first_nul); + if slice.len() < len { + return Err(DeserializeError::generic( + "could not find properly sized label at start of serialized object" + )); + } + if expected_label.as_bytes() != &slice[..first_nul] { + return Err(DeserializeError::label_mismatch(expected_label)); + } + Ok(len) +} + +/// Writes the given label to the buffer as a NUL terminated string. The label +/// given must not contain NUL, otherwise this will panic. Similarly, the label +/// must not be longer than 255 bytes, otherwise this will panic. +/// +/// Additional NUL bytes are written as necessary to ensure that the number of +/// bytes written is always a multiple of 4. +/// +/// Upon success, the total number of bytes written (including padding) is +/// returned. +pub(crate) fn write_label( + label: &str, + dst: &mut [u8], +) -> Result<usize, SerializeError> { + let nwrite = write_label_len(label); + if dst.len() < nwrite { + return Err(SerializeError::buffer_too_small("label")); + } + dst[..label.len()].copy_from_slice(label.as_bytes()); + for i in 0..(nwrite - label.len()) { + dst[label.len() + i] = 0; + } + assert_eq!(nwrite % 4, 0); + Ok(nwrite) +} + +/// Returns the total number of bytes (including padding) that would be written +/// for the given label. This panics if the given label contains a NUL byte or +/// is longer than 255 bytes. (The size restriction exists so that searching +/// for a label during deserialization can be done in small bounded space.) +pub(crate) fn write_label_len(label: &str) -> usize { + if label.len() > 255 { + panic!("label must not be longer than 255 bytes"); + } + if label.as_bytes().iter().position(|&b| b == 0).is_some() { + panic!("label must not contain NUL bytes"); + } + let label_len = label.len() + 1; // +1 for the NUL terminator + label_len + padding_len(label_len) +} + +/// Reads the endianness check from the beginning of the given slice and +/// confirms that the endianness of the serialized object matches the expected +/// endianness. If the slice is too small or if the endianness check fails, +/// this returns an error. +/// +/// Upon success, the total number of bytes read is returned. +pub(crate) fn read_endianness_check( + slice: &[u8], +) -> Result<usize, DeserializeError> { + let (n, nr) = try_read_u32(slice, "endianness check")?; + assert_eq!(nr, write_endianness_check_len()); + if n != 0xFEFF { + return Err(DeserializeError::endian_mismatch(0xFEFF, n)); + } + Ok(nr) +} + +/// Writes 0xFEFF as an integer using the given endianness. +/// +/// This is useful for writing into the header of a serialized object. It can +/// be read during deserialization as a sanity check to ensure the proper +/// endianness is used. +/// +/// Upon success, the total number of bytes written is returned. +pub(crate) fn write_endianness_check<E: Endian>( + dst: &mut [u8], +) -> Result<usize, SerializeError> { + let nwrite = write_endianness_check_len(); + if dst.len() < nwrite { + return Err(SerializeError::buffer_too_small("endianness check")); + } + E::write_u32(0xFEFF, dst); + Ok(nwrite) +} + +/// Returns the number of bytes written by the endianness check. +pub(crate) fn write_endianness_check_len() -> usize { + size_of::<u32>() +} + +/// Reads a version number from the beginning of the given slice and confirms +/// that is matches the expected version number given. If the slice is too +/// small or if the version numbers aren't equivalent, this returns an error. +/// +/// Upon success, the total number of bytes read is returned. +/// +/// N.B. Currently, we require that the version number is exactly equivalent. +/// In the future, if we bump the version number without a semver bump, then +/// we'll need to relax this a bit and support older versions. +pub(crate) fn read_version( + slice: &[u8], + expected_version: u32, +) -> Result<usize, DeserializeError> { + let (n, nr) = try_read_u32(slice, "version")?; + assert_eq!(nr, write_version_len()); + if n != expected_version { + return Err(DeserializeError::version_mismatch(expected_version, n)); + } + Ok(nr) +} + +/// Writes the given version number to the beginning of the given slice. +/// +/// This is useful for writing into the header of a serialized object. It can +/// be read during deserialization as a sanity check to ensure that the library +/// code supports the format of the serialized object. +/// +/// Upon success, the total number of bytes written is returned. +pub(crate) fn write_version<E: Endian>( + version: u32, + dst: &mut [u8], +) -> Result<usize, SerializeError> { + let nwrite = write_version_len(); + if dst.len() < nwrite { + return Err(SerializeError::buffer_too_small("version number")); + } + E::write_u32(version, dst); + Ok(nwrite) +} + +/// Returns the number of bytes written by writing the version number. +pub(crate) fn write_version_len() -> usize { + size_of::<u32>() +} + +/// Reads a pattern ID from the given slice. If the slice has insufficient +/// length, then this panics. If the deserialized integer exceeds the pattern +/// ID limit for the current target, then this returns an error. +/// +/// Upon success, this also returns the number of bytes read. +pub(crate) fn read_pattern_id( + slice: &[u8], + what: &'static str, +) -> Result<(PatternID, usize), DeserializeError> { + let bytes: [u8; PatternID::SIZE] = + slice[..PatternID::SIZE].try_into().unwrap(); + let pid = PatternID::from_ne_bytes(bytes) + .map_err(|err| DeserializeError::pattern_id_error(err, what))?; + Ok((pid, PatternID::SIZE)) +} + +/// Reads a pattern ID from the given slice. If the slice has insufficient +/// length, then this panics. Otherwise, the deserialized integer is assumed +/// to be a valid pattern ID. +/// +/// This also returns the number of bytes read. +pub(crate) fn read_pattern_id_unchecked(slice: &[u8]) -> (PatternID, usize) { + let pid = PatternID::from_ne_bytes_unchecked( + slice[..PatternID::SIZE].try_into().unwrap(), + ); + (pid, PatternID::SIZE) +} + +/// Write the given pattern ID to the beginning of the given slice of bytes +/// using the specified endianness. The given slice must have length at least +/// `PatternID::SIZE`, or else this panics. Upon success, the total number of +/// bytes written is returned. +pub(crate) fn write_pattern_id<E: Endian>( + pid: PatternID, + dst: &mut [u8], +) -> usize { + E::write_u32(pid.as_u32(), dst); + PatternID::SIZE +} + +/// Attempts to read a state ID from the given slice. If the slice has an +/// insufficient number of bytes or if the state ID exceeds the limit for +/// the current target, then this returns an error. +/// +/// Upon success, this also returns the number of bytes read. +pub(crate) fn try_read_state_id( + slice: &[u8], + what: &'static str, +) -> Result<(StateID, usize), DeserializeError> { + if slice.len() < StateID::SIZE { + return Err(DeserializeError::buffer_too_small(what)); + } + read_state_id(slice, what) +} + +/// Reads a state ID from the given slice. If the slice has insufficient +/// length, then this panics. If the deserialized integer exceeds the state ID +/// limit for the current target, then this returns an error. +/// +/// Upon success, this also returns the number of bytes read. +pub(crate) fn read_state_id( + slice: &[u8], + what: &'static str, +) -> Result<(StateID, usize), DeserializeError> { + let bytes: [u8; StateID::SIZE] = + slice[..StateID::SIZE].try_into().unwrap(); + let sid = StateID::from_ne_bytes(bytes) + .map_err(|err| DeserializeError::state_id_error(err, what))?; + Ok((sid, StateID::SIZE)) +} + +/// Reads a state ID from the given slice. If the slice has insufficient +/// length, then this panics. Otherwise, the deserialized integer is assumed +/// to be a valid state ID. +/// +/// This also returns the number of bytes read. +pub(crate) fn read_state_id_unchecked(slice: &[u8]) -> (StateID, usize) { + let sid = StateID::from_ne_bytes_unchecked( + slice[..StateID::SIZE].try_into().unwrap(), + ); + (sid, StateID::SIZE) +} + +/// Write the given state ID to the beginning of the given slice of bytes +/// using the specified endianness. The given slice must have length at least +/// `StateID::SIZE`, or else this panics. Upon success, the total number of +/// bytes written is returned. +pub(crate) fn write_state_id<E: Endian>( + sid: StateID, + dst: &mut [u8], +) -> usize { + E::write_u32(sid.as_u32(), dst); + StateID::SIZE +} + +/// Try to read a u16 as a usize from the beginning of the given slice in +/// native endian format. If the slice has fewer than 2 bytes or if the +/// deserialized number cannot be represented by usize, then this returns an +/// error. The error message will include the `what` description of what is +/// being deserialized, for better error messages. `what` should be a noun in +/// singular form. +/// +/// Upon success, this also returns the number of bytes read. +pub(crate) fn try_read_u16_as_usize( + slice: &[u8], + what: &'static str, +) -> Result<(usize, usize), DeserializeError> { + try_read_u16(slice, what).and_then(|(n, nr)| { + usize::try_from(n) + .map(|n| (n, nr)) + .map_err(|_| DeserializeError::invalid_usize(what)) + }) +} + +/// Try to read a u32 as a usize from the beginning of the given slice in +/// native endian format. If the slice has fewer than 4 bytes or if the +/// deserialized number cannot be represented by usize, then this returns an +/// error. The error message will include the `what` description of what is +/// being deserialized, for better error messages. `what` should be a noun in +/// singular form. +/// +/// Upon success, this also returns the number of bytes read. +pub(crate) fn try_read_u32_as_usize( + slice: &[u8], + what: &'static str, +) -> Result<(usize, usize), DeserializeError> { + try_read_u32(slice, what).and_then(|(n, nr)| { + usize::try_from(n) + .map(|n| (n, nr)) + .map_err(|_| DeserializeError::invalid_usize(what)) + }) +} + +/// Try to read a u16 from the beginning of the given slice in native endian +/// format. If the slice has fewer than 2 bytes, then this returns an error. +/// The error message will include the `what` description of what is being +/// deserialized, for better error messages. `what` should be a noun in +/// singular form. +/// +/// Upon success, this also returns the number of bytes read. +pub(crate) fn try_read_u16( + slice: &[u8], + what: &'static str, +) -> Result<(u16, usize), DeserializeError> { + check_slice_len(slice, size_of::<u16>(), what)?; + Ok((read_u16(slice), size_of::<u16>())) +} + +/// Try to read a u32 from the beginning of the given slice in native endian +/// format. If the slice has fewer than 4 bytes, then this returns an error. +/// The error message will include the `what` description of what is being +/// deserialized, for better error messages. `what` should be a noun in +/// singular form. +/// +/// Upon success, this also returns the number of bytes read. +pub(crate) fn try_read_u32( + slice: &[u8], + what: &'static str, +) -> Result<(u32, usize), DeserializeError> { + check_slice_len(slice, size_of::<u32>(), what)?; + Ok((read_u32(slice), size_of::<u32>())) +} + +/// Try to read a u128 from the beginning of the given slice in native endian +/// format. If the slice has fewer than 16 bytes, then this returns an error. +/// The error message will include the `what` description of what is being +/// deserialized, for better error messages. `what` should be a noun in +/// singular form. +/// +/// Upon success, this also returns the number of bytes read. +pub(crate) fn try_read_u128( + slice: &[u8], + what: &'static str, +) -> Result<(u128, usize), DeserializeError> { + check_slice_len(slice, size_of::<u128>(), what)?; + Ok((read_u128(slice), size_of::<u128>())) +} + +/// Read a u16 from the beginning of the given slice in native endian format. +/// If the slice has fewer than 2 bytes, then this panics. +/// +/// Marked as inline to speed up sparse searching which decodes integers from +/// its automaton at search time. +#[cfg_attr(feature = "perf-inline", inline(always))] +pub(crate) fn read_u16(slice: &[u8]) -> u16 { + let bytes: [u8; 2] = slice[..size_of::<u16>()].try_into().unwrap(); + u16::from_ne_bytes(bytes) +} + +/// Read a u32 from the beginning of the given slice in native endian format. +/// If the slice has fewer than 4 bytes, then this panics. +/// +/// Marked as inline to speed up sparse searching which decodes integers from +/// its automaton at search time. +#[cfg_attr(feature = "perf-inline", inline(always))] +pub(crate) fn read_u32(slice: &[u8]) -> u32 { + let bytes: [u8; 4] = slice[..size_of::<u32>()].try_into().unwrap(); + u32::from_ne_bytes(bytes) +} + +/// Read a u128 from the beginning of the given slice in native endian format. +/// If the slice has fewer than 16 bytes, then this panics. +pub(crate) fn read_u128(slice: &[u8]) -> u128 { + let bytes: [u8; 16] = slice[..size_of::<u128>()].try_into().unwrap(); + u128::from_ne_bytes(bytes) +} + +/// Checks that the given slice has some minimal length. If it's smaller than +/// the bound given, then a "buffer too small" error is returned with `what` +/// describing what the buffer represents. +pub(crate) fn check_slice_len<T>( + slice: &[T], + at_least_len: usize, + what: &'static str, +) -> Result<(), DeserializeError> { + if slice.len() < at_least_len { + return Err(DeserializeError::buffer_too_small(what)); + } + Ok(()) +} + +/// Multiply the given numbers, and on overflow, return an error that includes +/// 'what' in the error message. +/// +/// This is useful when doing arithmetic with untrusted data. +pub(crate) fn mul( + a: usize, + b: usize, + what: &'static str, +) -> Result<usize, DeserializeError> { + match a.checked_mul(b) { + Some(c) => Ok(c), + None => Err(DeserializeError::arithmetic_overflow(what)), + } +} + +/// Add the given numbers, and on overflow, return an error that includes +/// 'what' in the error message. +/// +/// This is useful when doing arithmetic with untrusted data. +pub(crate) fn add( + a: usize, + b: usize, + what: &'static str, +) -> Result<usize, DeserializeError> { + match a.checked_add(b) { + Some(c) => Ok(c), + None => Err(DeserializeError::arithmetic_overflow(what)), + } +} + +/// Shift `a` left by `b`, and on overflow, return an error that includes +/// 'what' in the error message. +/// +/// This is useful when doing arithmetic with untrusted data. +pub(crate) fn shl( + a: usize, + b: usize, + what: &'static str, +) -> Result<usize, DeserializeError> { + let amount = u32::try_from(b) + .map_err(|_| DeserializeError::arithmetic_overflow(what))?; + match a.checked_shl(amount) { + Some(c) => Ok(c), + None => Err(DeserializeError::arithmetic_overflow(what)), + } +} + +/// Returns the number of additional bytes required to add to the given length +/// in order to make the total length a multiple of 4. The return value is +/// always less than 4. +pub(crate) fn padding_len(non_padding_len: usize) -> usize { + (4 - (non_padding_len & 0b11)) & 0b11 +} + +/// A simple trait for writing code generic over endianness. +/// +/// This is similar to what byteorder provides, but we only need a very small +/// subset. +pub(crate) trait Endian { + /// Writes a u16 to the given destination buffer in a particular + /// endianness. If the destination buffer has a length smaller than 2, then + /// this panics. + fn write_u16(n: u16, dst: &mut [u8]); + + /// Writes a u32 to the given destination buffer in a particular + /// endianness. If the destination buffer has a length smaller than 4, then + /// this panics. + fn write_u32(n: u32, dst: &mut [u8]); + + /// Writes a u64 to the given destination buffer in a particular + /// endianness. If the destination buffer has a length smaller than 8, then + /// this panics. + fn write_u64(n: u64, dst: &mut [u8]); + + /// Writes a u128 to the given destination buffer in a particular + /// endianness. If the destination buffer has a length smaller than 16, + /// then this panics. + fn write_u128(n: u128, dst: &mut [u8]); +} + +/// Little endian writing. +pub(crate) enum LE {} +/// Big endian writing. +pub(crate) enum BE {} + +#[cfg(target_endian = "little")] +pub(crate) type NE = LE; +#[cfg(target_endian = "big")] +pub(crate) type NE = BE; + +impl Endian for LE { + fn write_u16(n: u16, dst: &mut [u8]) { + dst[..2].copy_from_slice(&n.to_le_bytes()); + } + + fn write_u32(n: u32, dst: &mut [u8]) { + dst[..4].copy_from_slice(&n.to_le_bytes()); + } + + fn write_u64(n: u64, dst: &mut [u8]) { + dst[..8].copy_from_slice(&n.to_le_bytes()); + } + + fn write_u128(n: u128, dst: &mut [u8]) { + dst[..16].copy_from_slice(&n.to_le_bytes()); + } +} + +impl Endian for BE { + fn write_u16(n: u16, dst: &mut [u8]) { + dst[..2].copy_from_slice(&n.to_be_bytes()); + } + + fn write_u32(n: u32, dst: &mut [u8]) { + dst[..4].copy_from_slice(&n.to_be_bytes()); + } + + fn write_u64(n: u64, dst: &mut [u8]) { + dst[..8].copy_from_slice(&n.to_be_bytes()); + } + + fn write_u128(n: u128, dst: &mut [u8]) { + dst[..16].copy_from_slice(&n.to_be_bytes()); + } +} + +#[cfg(all(test, feature = "alloc"))] +mod tests { + use super::*; + + #[test] + fn labels() { + let mut buf = [0; 1024]; + + let nwrite = write_label("fooba", &mut buf).unwrap(); + assert_eq!(nwrite, 8); + assert_eq!(&buf[..nwrite], b"fooba\x00\x00\x00"); + + let nread = read_label(&buf, "fooba").unwrap(); + assert_eq!(nread, 8); + } + + #[test] + #[should_panic] + fn bad_label_interior_nul() { + // interior NULs are not allowed + write_label("foo\x00bar", &mut [0; 1024]).unwrap(); + } + + #[test] + fn bad_label_almost_too_long() { + // ok + write_label(&"z".repeat(255), &mut [0; 1024]).unwrap(); + } + + #[test] + #[should_panic] + fn bad_label_too_long() { + // labels longer than 255 bytes are banned + write_label(&"z".repeat(256), &mut [0; 1024]).unwrap(); + } + + #[test] + fn padding() { + assert_eq!(0, padding_len(8)); + assert_eq!(3, padding_len(9)); + assert_eq!(2, padding_len(10)); + assert_eq!(1, padding_len(11)); + assert_eq!(0, padding_len(12)); + assert_eq!(3, padding_len(13)); + assert_eq!(2, padding_len(14)); + assert_eq!(1, padding_len(15)); + assert_eq!(0, padding_len(16)); + } +} |