diff options
Diffstat (limited to 'vendor/regex-automata/src/nfa')
-rw-r--r-- | vendor/regex-automata/src/nfa/compiler.rs | 1193 | ||||
-rw-r--r-- | vendor/regex-automata/src/nfa/mod.rs | 253 | ||||
-rw-r--r-- | vendor/regex-automata/src/nfa/thompson/compiler.rs | 1713 | ||||
-rw-r--r-- | vendor/regex-automata/src/nfa/thompson/error.rs | 145 | ||||
-rw-r--r-- | vendor/regex-automata/src/nfa/thompson/map.rs (renamed from vendor/regex-automata/src/nfa/map.rs) | 24 | ||||
-rw-r--r-- | vendor/regex-automata/src/nfa/thompson/mod.rs | 1555 | ||||
-rw-r--r-- | vendor/regex-automata/src/nfa/thompson/pikevm.rs | 554 | ||||
-rw-r--r-- | vendor/regex-automata/src/nfa/thompson/range_trie.rs (renamed from vendor/regex-automata/src/nfa/range_trie.rs) | 49 |
8 files changed, 4010 insertions, 1476 deletions
diff --git a/vendor/regex-automata/src/nfa/compiler.rs b/vendor/regex-automata/src/nfa/compiler.rs deleted file mode 100644 index d9b3945b3..000000000 --- a/vendor/regex-automata/src/nfa/compiler.rs +++ /dev/null @@ -1,1193 +0,0 @@ -// This module provides an NFA compiler using Thompson's construction -// algorithm. The compiler takes a regex-syntax::Hir as input and emits an NFA -// graph as output. The NFA graph is structured in a way that permits it to be -// executed by a virtual machine and also used to efficiently build a DFA. -// -// The compiler deals with a slightly expanded set of NFA states that notably -// includes an empty node that has exactly one epsilon transition to the next -// state. In other words, it's a "goto" instruction if one views Thompson's NFA -// as a set of bytecode instructions. These goto instructions are removed in -// a subsequent phase before returning the NFA to the caller. The purpose of -// these empty nodes is that they make the construction algorithm substantially -// simpler to implement. We remove them before returning to the caller because -// they can represent substantial overhead when traversing the NFA graph -// (either while searching using the NFA directly or while building a DFA). -// -// In the future, it would be nice to provide a Glushkov compiler as well, -// as it would work well as a bit-parallel NFA for smaller regexes. But -// the Thompson construction is one I'm more familiar with and seems more -// straight-forward to deal with when it comes to large Unicode character -// classes. -// -// Internally, the compiler uses interior mutability to improve composition -// in the face of the borrow checker. In particular, we'd really like to be -// able to write things like this: -// -// self.c_concat(exprs.iter().map(|e| self.c(e))) -// -// Which elegantly uses iterators to build up a sequence of compiled regex -// sub-expressions and then hands it off to the concatenating compiler -// routine. 
Without interior mutability, the borrow checker won't let us -// borrow `self` mutably both inside and outside the closure at the same -// time. - -use std::cell::RefCell; -use std::mem; - -use regex_syntax::hir::{self, Hir, HirKind}; -use regex_syntax::utf8::{Utf8Range, Utf8Sequences}; - -use classes::ByteClassSet; -use error::{Error, Result}; -use nfa::map::{Utf8BoundedMap, Utf8SuffixKey, Utf8SuffixMap}; -use nfa::range_trie::RangeTrie; -use nfa::{State, StateID, Transition, NFA}; - -/// Config knobs for the NFA compiler. See the builder's methods for more -/// docs on each one. -#[derive(Clone, Copy, Debug)] -struct Config { - anchored: bool, - allow_invalid_utf8: bool, - reverse: bool, - shrink: bool, -} - -impl Default for Config { - fn default() -> Config { - Config { - anchored: false, - allow_invalid_utf8: false, - reverse: false, - shrink: true, - } - } -} - -/// A builder for compiling an NFA. -#[derive(Clone, Debug)] -pub struct Builder { - config: Config, -} - -impl Builder { - /// Create a new NFA builder with its default configuration. - pub fn new() -> Builder { - Builder { config: Config::default() } - } - - /// Compile the given high level intermediate representation of a regular - /// expression into an NFA. - /// - /// If there was a problem building the NFA, then an error is returned. - /// For example, if the regex uses unsupported features (such as zero-width - /// assertions), then an error is returned. - pub fn build(&self, expr: &Hir) -> Result<NFA> { - let mut nfa = NFA::always_match(); - self.build_with(&mut Compiler::new(), &mut nfa, expr)?; - Ok(nfa) - } - - /// Compile the given high level intermediate representation of a regular - /// expression into the NFA given using the given compiler. Callers may - /// prefer this over `build` if they would like to reuse allocations while - /// compiling many regular expressions. - /// - /// On success, the given NFA is completely overwritten with the NFA - /// produced by the compiler. 
- /// - /// If there was a problem building the NFA, then an error is returned. For - /// example, if the regex uses unsupported features (such as zero-width - /// assertions), then an error is returned. When an error is returned, - /// the contents of `nfa` are unspecified and should not be relied upon. - /// However, it can still be reused in subsequent calls to this method. - pub fn build_with( - &self, - compiler: &mut Compiler, - nfa: &mut NFA, - expr: &Hir, - ) -> Result<()> { - compiler.clear(); - compiler.configure(self.config); - compiler.compile(nfa, expr) - } - - /// Set whether matching must be anchored at the beginning of the input. - /// - /// When enabled, a match must begin at the start of the input. When - /// disabled, the NFA will act as if the pattern started with a `.*?`, - /// which enables a match to appear anywhere. - /// - /// By default this is disabled. - pub fn anchored(&mut self, yes: bool) -> &mut Builder { - self.config.anchored = yes; - self - } - - /// When enabled, the builder will permit the construction of an NFA that - /// may match invalid UTF-8. - /// - /// When disabled (the default), the builder is guaranteed to produce a - /// regex that will only ever match valid UTF-8 (otherwise, the builder - /// will return an error). - pub fn allow_invalid_utf8(&mut self, yes: bool) -> &mut Builder { - self.config.allow_invalid_utf8 = yes; - self - } - - /// Reverse the NFA. - /// - /// A NFA reversal is performed by reversing all of the concatenated - /// sub-expressions in the original pattern, recursively. The resulting - /// NFA can be used to match the pattern starting from the end of a string - /// instead of the beginning of a string. - /// - /// Reversing the NFA is useful for building a reverse DFA, which is most - /// useful for finding the start of a match. 
- pub fn reverse(&mut self, yes: bool) -> &mut Builder { - self.config.reverse = yes; - self - } - - /// Apply best effort heuristics to shrink the NFA at the expense of more - /// time/memory. - /// - /// This is enabled by default. Generally speaking, if one is using an NFA - /// to compile DFA, then the extra time used to shrink the NFA will be - /// more than made up for during DFA construction (potentially by a lot). - /// In other words, enabling this can substantially decrease the overall - /// amount of time it takes to build a DFA. - /// - /// The only reason to disable this if you want to compile an NFA and start - /// using it as quickly as possible without needing to build a DFA. - pub fn shrink(&mut self, yes: bool) -> &mut Builder { - self.config.shrink = yes; - self - } -} - -/// A compiler that converts a regex abstract syntax to an NFA via Thompson's -/// construction. Namely, this compiler permits epsilon transitions between -/// states. -/// -/// Users of this crate cannot use a compiler directly. Instead, all one can -/// do is create one and use it via the -/// [`Builder::build_with`](struct.Builder.html#method.build_with) -/// method. This permits callers to reuse compilers in order to amortize -/// allocations. -#[derive(Clone, Debug)] -pub struct Compiler { - /// The set of compiled NFA states. Once a state is compiled, it is - /// assigned a state ID equivalent to its index in this list. Subsequent - /// compilation can modify previous states by adding new transitions. - states: RefCell<Vec<CState>>, - /// The configuration from the builder. - config: Config, - /// State used for compiling character classes to UTF-8 byte automata. - /// State is not retained between character class compilations. This just - /// serves to amortize allocation to the extent possible. - utf8_state: RefCell<Utf8State>, - /// State used for arranging character classes in reverse into a trie. 
- trie_state: RefCell<RangeTrie>, - /// State used for caching common suffixes when compiling reverse UTF-8 - /// automata (for Unicode character classes). - utf8_suffix: RefCell<Utf8SuffixMap>, - /// A map used to re-map state IDs when translating the compiler's internal - /// NFA state representation to the external NFA representation. - remap: RefCell<Vec<StateID>>, - /// A set of compiler internal state IDs that correspond to states that are - /// exclusively epsilon transitions, i.e., goto instructions, combined with - /// the state that they point to. This is used to record said states while - /// transforming the compiler's internal NFA representation to the external - /// form. - empties: RefCell<Vec<(StateID, StateID)>>, -} - -/// A compiler intermediate state representation for an NFA that is only used -/// during compilation. Once compilation is done, `CState`s are converted to -/// `State`s, which have a much simpler representation. -#[derive(Clone, Debug, Eq, PartialEq)] -enum CState { - /// An empty state whose only purpose is to forward the automaton to - /// another state via en epsilon transition. These are useful during - /// compilation but are otherwise removed at the end. - Empty { next: StateID }, - /// A state that only transitions to `next` if the current input byte is - /// in the range `[start, end]` (inclusive on both ends). - Range { range: Transition }, - /// A state with possibly many transitions, represented in a sparse - /// fashion. Transitions are ordered lexicographically by input range. - /// As such, this may only be used when every transition has equal - /// priority. (In practice, this is only used for encoding large UTF-8 - /// automata.) - Sparse { ranges: Vec<Transition> }, - /// An alternation such that there exists an epsilon transition to all - /// states in `alternates`, where matches found via earlier transitions - /// are preferred over later transitions. 
- Union { alternates: Vec<StateID> }, - /// An alternation such that there exists an epsilon transition to all - /// states in `alternates`, where matches found via later transitions - /// are preferred over earlier transitions. - /// - /// This "reverse" state exists for convenience during compilation that - /// permits easy construction of non-greedy combinations of NFA states. - /// At the end of compilation, Union and UnionReverse states are merged - /// into one Union type of state, where the latter has its epsilon - /// transitions reversed to reflect the priority inversion. - UnionReverse { alternates: Vec<StateID> }, - /// A match state. There is exactly one such occurrence of this state in - /// an NFA. - Match, -} - -/// A value that represents the result of compiling a sub-expression of a -/// regex's HIR. Specifically, this represents a sub-graph of the NFA that -/// has an initial state at `start` and a final state at `end`. -#[derive(Clone, Copy, Debug)] -pub struct ThompsonRef { - start: StateID, - end: StateID, -} - -impl Compiler { - /// Create a new compiler. - pub fn new() -> Compiler { - Compiler { - states: RefCell::new(vec![]), - config: Config::default(), - utf8_state: RefCell::new(Utf8State::new()), - trie_state: RefCell::new(RangeTrie::new()), - utf8_suffix: RefCell::new(Utf8SuffixMap::new(1000)), - remap: RefCell::new(vec![]), - empties: RefCell::new(vec![]), - } - } - - /// Clear any memory used by this compiler such that it is ready to compile - /// a new regex. - /// - /// It is preferrable to reuse a compiler if possible in order to reuse - /// allocations. - fn clear(&self) { - self.states.borrow_mut().clear(); - // We don't need to clear anything else since they are cleared on - // their own and only when they are used. - } - - /// Configure this compiler from the builder's knobs. - /// - /// The compiler is always reconfigured by the builder before using it to - /// build an NFA. 
- fn configure(&mut self, config: Config) { - self.config = config; - } - - /// Convert the current intermediate NFA to its final compiled form. - fn compile(&self, nfa: &mut NFA, expr: &Hir) -> Result<()> { - nfa.anchored = self.config.anchored; - - let mut start = self.add_empty(); - if !nfa.anchored { - let compiled = if self.config.allow_invalid_utf8 { - self.c_unanchored_prefix_invalid_utf8()? - } else { - self.c_unanchored_prefix_valid_utf8()? - }; - self.patch(start, compiled.start); - start = compiled.end; - } - let compiled = self.c(&expr)?; - let match_id = self.add_match(); - self.patch(start, compiled.start); - self.patch(compiled.end, match_id); - self.finish(nfa); - Ok(()) - } - - /// Finishes the compilation process and populates the provide NFA with - /// the final graph. - fn finish(&self, nfa: &mut NFA) { - let mut bstates = self.states.borrow_mut(); - let mut remap = self.remap.borrow_mut(); - remap.resize(bstates.len(), 0); - let mut empties = self.empties.borrow_mut(); - empties.clear(); - - // We don't reuse allocations here becuase this is what we're - // returning. - nfa.states.clear(); - let mut byteset = ByteClassSet::new(); - - // The idea here is to convert our intermediate states to their final - // form. The only real complexity here is the process of converting - // transitions, which are expressed in terms of state IDs. The new - // set of states will be smaller because of partial epsilon removal, - // so the state IDs will not be the same. - for (id, bstate) in bstates.iter_mut().enumerate() { - match *bstate { - CState::Empty { next } => { - // Since we're removing empty states, we need to handle - // them later since we don't yet know which new state this - // empty state will be mapped to. 
- empties.push((id, next)); - } - CState::Range { ref range } => { - remap[id] = nfa.states.len(); - byteset.set_range(range.start, range.end); - nfa.states.push(State::Range { range: range.clone() }); - } - CState::Sparse { ref mut ranges } => { - remap[id] = nfa.states.len(); - - let ranges = mem::replace(ranges, vec![]); - for r in &ranges { - byteset.set_range(r.start, r.end); - } - nfa.states.push(State::Sparse { - ranges: ranges.into_boxed_slice(), - }); - } - CState::Union { ref mut alternates } => { - remap[id] = nfa.states.len(); - - let alternates = mem::replace(alternates, vec![]); - nfa.states.push(State::Union { - alternates: alternates.into_boxed_slice(), - }); - } - CState::UnionReverse { ref mut alternates } => { - remap[id] = nfa.states.len(); - - let mut alternates = mem::replace(alternates, vec![]); - alternates.reverse(); - nfa.states.push(State::Union { - alternates: alternates.into_boxed_slice(), - }); - } - CState::Match => { - remap[id] = nfa.states.len(); - nfa.states.push(State::Match); - } - } - } - for &(empty_id, mut empty_next) in empties.iter() { - // empty states can point to other empty states, forming a chain. - // So we must follow the chain until the end, which must end at - // a non-empty state, and therefore, a state that is correctly - // remapped. We are guaranteed to terminate because our compiler - // never builds a loop among empty states. - while let CState::Empty { next } = bstates[empty_next] { - empty_next = next; - } - remap[empty_id] = remap[empty_next]; - } - for state in &mut nfa.states { - state.remap(&remap); - } - // The compiler always begins the NFA at the first state. 
- nfa.start = remap[0]; - nfa.byte_classes = byteset.byte_classes(); - } - - fn c(&self, expr: &Hir) -> Result<ThompsonRef> { - match *expr.kind() { - HirKind::Empty => { - let id = self.add_empty(); - Ok(ThompsonRef { start: id, end: id }) - } - HirKind::Literal(hir::Literal::Unicode(ch)) => { - let mut buf = [0; 4]; - let it = ch - .encode_utf8(&mut buf) - .as_bytes() - .iter() - .map(|&b| Ok(self.c_range(b, b))); - self.c_concat(it) - } - HirKind::Literal(hir::Literal::Byte(b)) => Ok(self.c_range(b, b)), - HirKind::Class(hir::Class::Bytes(ref cls)) => { - self.c_byte_class(cls) - } - HirKind::Class(hir::Class::Unicode(ref cls)) => { - self.c_unicode_class(cls) - } - HirKind::Repetition(ref rep) => self.c_repetition(rep), - HirKind::Group(ref group) => self.c(&*group.hir), - HirKind::Concat(ref exprs) => { - self.c_concat(exprs.iter().map(|e| self.c(e))) - } - HirKind::Alternation(ref exprs) => { - self.c_alternation(exprs.iter().map(|e| self.c(e))) - } - HirKind::Anchor(_) => Err(Error::unsupported_anchor()), - HirKind::WordBoundary(_) => Err(Error::unsupported_word()), - } - } - - fn c_concat<I>(&self, mut it: I) -> Result<ThompsonRef> - where - I: DoubleEndedIterator<Item = Result<ThompsonRef>>, - { - let first = - if self.config.reverse { it.next_back() } else { it.next() }; - let ThompsonRef { start, mut end } = match first { - Some(result) => result?, - None => return Ok(self.c_empty()), - }; - loop { - let next = - if self.config.reverse { it.next_back() } else { it.next() }; - let compiled = match next { - Some(result) => result?, - None => break, - }; - self.patch(end, compiled.start); - end = compiled.end; - } - Ok(ThompsonRef { start, end }) - } - - fn c_alternation<I>(&self, mut it: I) -> Result<ThompsonRef> - where - I: Iterator<Item = Result<ThompsonRef>>, - { - let first = it.next().expect("alternations must be non-empty")?; - let second = match it.next() { - None => return Ok(first), - Some(result) => result?, - }; - - let union = 
self.add_union(); - let end = self.add_empty(); - self.patch(union, first.start); - self.patch(first.end, end); - self.patch(union, second.start); - self.patch(second.end, end); - for result in it { - let compiled = result?; - self.patch(union, compiled.start); - self.patch(compiled.end, end); - } - Ok(ThompsonRef { start: union, end }) - } - - fn c_repetition(&self, rep: &hir::Repetition) -> Result<ThompsonRef> { - match rep.kind { - hir::RepetitionKind::ZeroOrOne => { - self.c_zero_or_one(&rep.hir, rep.greedy) - } - hir::RepetitionKind::ZeroOrMore => { - self.c_at_least(&rep.hir, rep.greedy, 0) - } - hir::RepetitionKind::OneOrMore => { - self.c_at_least(&rep.hir, rep.greedy, 1) - } - hir::RepetitionKind::Range(ref rng) => match *rng { - hir::RepetitionRange::Exactly(count) => { - self.c_exactly(&rep.hir, count) - } - hir::RepetitionRange::AtLeast(m) => { - self.c_at_least(&rep.hir, rep.greedy, m) - } - hir::RepetitionRange::Bounded(min, max) => { - self.c_bounded(&rep.hir, rep.greedy, min, max) - } - }, - } - } - - fn c_bounded( - &self, - expr: &Hir, - greedy: bool, - min: u32, - max: u32, - ) -> Result<ThompsonRef> { - let prefix = self.c_exactly(expr, min)?; - if min == max { - return Ok(prefix); - } - - // It is tempting here to compile the rest here as a concatenation - // of zero-or-one matches. i.e., for `a{2,5}`, compile it as if it - // were `aaa?a?a?`. The problem here is that it leads to this program: - // - // >000000: 61 => 01 - // 000001: 61 => 02 - // 000002: alt(03, 04) - // 000003: 61 => 04 - // 000004: alt(05, 06) - // 000005: 61 => 06 - // 000006: alt(07, 08) - // 000007: 61 => 08 - // 000008: MATCH - // - // And effectively, once you hit state 2, the epsilon closure will - // include states 3, 5, 5, 6, 7 and 8, which is quite a bit. 
It is - // better to instead compile it like so: - // - // >000000: 61 => 01 - // 000001: 61 => 02 - // 000002: alt(03, 08) - // 000003: 61 => 04 - // 000004: alt(05, 08) - // 000005: 61 => 06 - // 000006: alt(07, 08) - // 000007: 61 => 08 - // 000008: MATCH - // - // So that the epsilon closure of state 2 is now just 3 and 8. - let empty = self.add_empty(); - let mut prev_end = prefix.end; - for _ in min..max { - let union = if greedy { - self.add_union() - } else { - self.add_reverse_union() - }; - let compiled = self.c(expr)?; - self.patch(prev_end, union); - self.patch(union, compiled.start); - self.patch(union, empty); - prev_end = compiled.end; - } - self.patch(prev_end, empty); - Ok(ThompsonRef { start: prefix.start, end: empty }) - } - - fn c_at_least( - &self, - expr: &Hir, - greedy: bool, - n: u32, - ) -> Result<ThompsonRef> { - if n == 0 { - let union = if greedy { - self.add_union() - } else { - self.add_reverse_union() - }; - let compiled = self.c(expr)?; - self.patch(union, compiled.start); - self.patch(compiled.end, union); - Ok(ThompsonRef { start: union, end: union }) - } else if n == 1 { - let compiled = self.c(expr)?; - let union = if greedy { - self.add_union() - } else { - self.add_reverse_union() - }; - self.patch(compiled.end, union); - self.patch(union, compiled.start); - Ok(ThompsonRef { start: compiled.start, end: union }) - } else { - let prefix = self.c_exactly(expr, n - 1)?; - let last = self.c(expr)?; - let union = if greedy { - self.add_union() - } else { - self.add_reverse_union() - }; - self.patch(prefix.end, last.start); - self.patch(last.end, union); - self.patch(union, last.start); - Ok(ThompsonRef { start: prefix.start, end: union }) - } - } - - fn c_zero_or_one(&self, expr: &Hir, greedy: bool) -> Result<ThompsonRef> { - let union = - if greedy { self.add_union() } else { self.add_reverse_union() }; - let compiled = self.c(expr)?; - let empty = self.add_empty(); - self.patch(union, compiled.start); - self.patch(union, empty); - 
self.patch(compiled.end, empty); - Ok(ThompsonRef { start: union, end: empty }) - } - - fn c_exactly(&self, expr: &Hir, n: u32) -> Result<ThompsonRef> { - let it = (0..n).map(|_| self.c(expr)); - self.c_concat(it) - } - - fn c_byte_class(&self, cls: &hir::ClassBytes) -> Result<ThompsonRef> { - let end = self.add_empty(); - let mut trans = Vec::with_capacity(cls.ranges().len()); - for r in cls.iter() { - trans.push(Transition { - start: r.start(), - end: r.end(), - next: end, - }); - } - Ok(ThompsonRef { start: self.add_sparse(trans), end }) - } - - fn c_unicode_class(&self, cls: &hir::ClassUnicode) -> Result<ThompsonRef> { - // If all we have are ASCII ranges wrapped in a Unicode package, then - // there is zero reason to bring out the big guns. We can fit all ASCII - // ranges within a single sparse transition. - if cls.is_all_ascii() { - let end = self.add_empty(); - let mut trans = Vec::with_capacity(cls.ranges().len()); - for r in cls.iter() { - assert!(r.start() <= '\x7F'); - assert!(r.end() <= '\x7F'); - trans.push(Transition { - start: r.start() as u8, - end: r.end() as u8, - next: end, - }); - } - Ok(ThompsonRef { start: self.add_sparse(trans), end }) - } else if self.config.reverse { - if !self.config.shrink { - // When we don't want to spend the extra time shrinking, we - // compile the UTF-8 automaton in reverse using something like - // the "naive" approach, but will attempt to re-use common - // suffixes. - self.c_unicode_class_reverse_with_suffix(cls) - } else { - // When we want to shrink our NFA for reverse UTF-8 automata, - // we cannot feed UTF-8 sequences directly to the UTF-8 - // compiler, since the UTF-8 compiler requires all sequences - // to be lexicographically sorted. Instead, we organize our - // sequences into a range trie, which can then output our - // sequences in the correct order. Unfortunately, building the - // range trie is fairly expensive (but not nearly as expensive - // as building a DFA). 
Hence the reason why the 'shrink' option - // exists, so that this path can be toggled off. - let mut trie = self.trie_state.borrow_mut(); - trie.clear(); - - for rng in cls.iter() { - for mut seq in Utf8Sequences::new(rng.start(), rng.end()) { - seq.reverse(); - trie.insert(seq.as_slice()); - } - } - let mut utf8_state = self.utf8_state.borrow_mut(); - let mut utf8c = Utf8Compiler::new(self, &mut *utf8_state); - trie.iter(|seq| { - utf8c.add(&seq); - }); - Ok(utf8c.finish()) - } - } else { - // In the forward direction, we always shrink our UTF-8 automata - // because we can stream it right into the UTF-8 compiler. There - // is almost no downside (in either memory or time) to using this - // approach. - let mut utf8_state = self.utf8_state.borrow_mut(); - let mut utf8c = Utf8Compiler::new(self, &mut *utf8_state); - for rng in cls.iter() { - for seq in Utf8Sequences::new(rng.start(), rng.end()) { - utf8c.add(seq.as_slice()); - } - } - Ok(utf8c.finish()) - } - - // For reference, the code below is the "naive" version of compiling a - // UTF-8 automaton. It is deliciously simple (and works for both the - // forward and reverse cases), but will unfortunately produce very - // large NFAs. When compiling a forward automaton, the size difference - // can sometimes be an order of magnitude. For example, the '\w' regex - // will generate about ~3000 NFA states using the naive approach below, - // but only 283 states when using the approach above. This is because - // the approach above actually compiles a *minimal* (or near minimal, - // because of the bounded hashmap) UTF-8 automaton. - // - // The code below is kept as a reference point in order to make it - // easier to understand the higher level goal here. 
- /* - let it = cls - .iter() - .flat_map(|rng| Utf8Sequences::new(rng.start(), rng.end())) - .map(|seq| { - let it = seq - .as_slice() - .iter() - .map(|rng| Ok(self.c_range(rng.start, rng.end))); - self.c_concat(it) - }); - self.c_alternation(it); - */ - } - - fn c_unicode_class_reverse_with_suffix( - &self, - cls: &hir::ClassUnicode, - ) -> Result<ThompsonRef> { - // N.B. It would likely be better to cache common *prefixes* in the - // reverse direction, but it's not quite clear how to do that. The - // advantage of caching suffixes is that it does give us a win, and - // has a very small additional overhead. - let mut cache = self.utf8_suffix.borrow_mut(); - cache.clear(); - - let union = self.add_union(); - let alt_end = self.add_empty(); - for urng in cls.iter() { - for seq in Utf8Sequences::new(urng.start(), urng.end()) { - let mut end = alt_end; - for brng in seq.as_slice() { - let key = Utf8SuffixKey { - from: end, - start: brng.start, - end: brng.end, - }; - let hash = cache.hash(&key); - if let Some(id) = cache.get(&key, hash) { - end = id; - continue; - } - - let compiled = self.c_range(brng.start, brng.end); - self.patch(compiled.end, end); - end = compiled.start; - cache.set(key, hash, end); - } - self.patch(union, end); - } - } - Ok(ThompsonRef { start: union, end: alt_end }) - } - - fn c_range(&self, start: u8, end: u8) -> ThompsonRef { - let id = self.add_range(start, end); - ThompsonRef { start: id, end: id } - } - - fn c_empty(&self) -> ThompsonRef { - let id = self.add_empty(); - ThompsonRef { start: id, end: id } - } - - fn c_unanchored_prefix_valid_utf8(&self) -> Result<ThompsonRef> { - self.c(&Hir::repetition(hir::Repetition { - kind: hir::RepetitionKind::ZeroOrMore, - greedy: false, - hir: Box::new(Hir::any(false)), - })) - } - - fn c_unanchored_prefix_invalid_utf8(&self) -> Result<ThompsonRef> { - self.c(&Hir::repetition(hir::Repetition { - kind: hir::RepetitionKind::ZeroOrMore, - greedy: false, - hir: Box::new(Hir::any(true)), - })) - } - 
- fn patch(&self, from: StateID, to: StateID) { - match self.states.borrow_mut()[from] { - CState::Empty { ref mut next } => { - *next = to; - } - CState::Range { ref mut range } => { - range.next = to; - } - CState::Sparse { .. } => { - panic!("cannot patch from a sparse NFA state") - } - CState::Union { ref mut alternates } => { - alternates.push(to); - } - CState::UnionReverse { ref mut alternates } => { - alternates.push(to); - } - CState::Match => {} - } - } - - fn add_empty(&self) -> StateID { - let id = self.states.borrow().len(); - self.states.borrow_mut().push(CState::Empty { next: 0 }); - id - } - - fn add_range(&self, start: u8, end: u8) -> StateID { - let id = self.states.borrow().len(); - let trans = Transition { start, end, next: 0 }; - let state = CState::Range { range: trans }; - self.states.borrow_mut().push(state); - id - } - - fn add_sparse(&self, ranges: Vec<Transition>) -> StateID { - if ranges.len() == 1 { - let id = self.states.borrow().len(); - let state = CState::Range { range: ranges[0] }; - self.states.borrow_mut().push(state); - return id; - } - let id = self.states.borrow().len(); - let state = CState::Sparse { ranges }; - self.states.borrow_mut().push(state); - id - } - - fn add_union(&self) -> StateID { - let id = self.states.borrow().len(); - let state = CState::Union { alternates: vec![] }; - self.states.borrow_mut().push(state); - id - } - - fn add_reverse_union(&self) -> StateID { - let id = self.states.borrow().len(); - let state = CState::UnionReverse { alternates: vec![] }; - self.states.borrow_mut().push(state); - id - } - - fn add_match(&self) -> StateID { - let id = self.states.borrow().len(); - self.states.borrow_mut().push(CState::Match); - id - } -} - -#[derive(Debug)] -struct Utf8Compiler<'a> { - nfac: &'a Compiler, - state: &'a mut Utf8State, - target: StateID, -} - -#[derive(Clone, Debug)] -struct Utf8State { - compiled: Utf8BoundedMap, - uncompiled: Vec<Utf8Node>, -} - -#[derive(Clone, Debug)] -struct Utf8Node { - 
trans: Vec<Transition>, - last: Option<Utf8LastTransition>, -} - -#[derive(Clone, Debug)] -struct Utf8LastTransition { - start: u8, - end: u8, -} - -impl Utf8State { - fn new() -> Utf8State { - Utf8State { compiled: Utf8BoundedMap::new(5000), uncompiled: vec![] } - } - - fn clear(&mut self) { - self.compiled.clear(); - self.uncompiled.clear(); - } -} - -impl<'a> Utf8Compiler<'a> { - fn new(nfac: &'a Compiler, state: &'a mut Utf8State) -> Utf8Compiler<'a> { - let target = nfac.add_empty(); - state.clear(); - let mut utf8c = Utf8Compiler { nfac, state, target }; - utf8c.add_empty(); - utf8c - } - - fn finish(&mut self) -> ThompsonRef { - self.compile_from(0); - let node = self.pop_root(); - let start = self.compile(node); - ThompsonRef { start, end: self.target } - } - - fn add(&mut self, ranges: &[Utf8Range]) { - let prefix_len = ranges - .iter() - .zip(&self.state.uncompiled) - .take_while(|&(range, node)| { - node.last.as_ref().map_or(false, |t| { - (t.start, t.end) == (range.start, range.end) - }) - }) - .count(); - assert!(prefix_len < ranges.len()); - self.compile_from(prefix_len); - self.add_suffix(&ranges[prefix_len..]); - } - - fn compile_from(&mut self, from: usize) { - let mut next = self.target; - while from + 1 < self.state.uncompiled.len() { - let node = self.pop_freeze(next); - next = self.compile(node); - } - self.top_last_freeze(next); - } - - fn compile(&mut self, node: Vec<Transition>) -> StateID { - let hash = self.state.compiled.hash(&node); - if let Some(id) = self.state.compiled.get(&node, hash) { - return id; - } - let id = self.nfac.add_sparse(node.clone()); - self.state.compiled.set(node, hash, id); - id - } - - fn add_suffix(&mut self, ranges: &[Utf8Range]) { - assert!(!ranges.is_empty()); - let last = self - .state - .uncompiled - .len() - .checked_sub(1) - .expect("non-empty nodes"); - assert!(self.state.uncompiled[last].last.is_none()); - self.state.uncompiled[last].last = Some(Utf8LastTransition { - start: ranges[0].start, - end: 
ranges[0].end, - }); - for r in &ranges[1..] { - self.state.uncompiled.push(Utf8Node { - trans: vec![], - last: Some(Utf8LastTransition { start: r.start, end: r.end }), - }); - } - } - - fn add_empty(&mut self) { - self.state.uncompiled.push(Utf8Node { trans: vec![], last: None }); - } - - fn pop_freeze(&mut self, next: StateID) -> Vec<Transition> { - let mut uncompiled = self.state.uncompiled.pop().unwrap(); - uncompiled.set_last_transition(next); - uncompiled.trans - } - - fn pop_root(&mut self) -> Vec<Transition> { - assert_eq!(self.state.uncompiled.len(), 1); - assert!(self.state.uncompiled[0].last.is_none()); - self.state.uncompiled.pop().expect("non-empty nodes").trans - } - - fn top_last_freeze(&mut self, next: StateID) { - let last = self - .state - .uncompiled - .len() - .checked_sub(1) - .expect("non-empty nodes"); - self.state.uncompiled[last].set_last_transition(next); - } -} - -impl Utf8Node { - fn set_last_transition(&mut self, next: StateID) { - if let Some(last) = self.last.take() { - self.trans.push(Transition { - start: last.start, - end: last.end, - next, - }); - } - } -} - -#[cfg(test)] -mod tests { - use regex_syntax::hir::Hir; - use regex_syntax::ParserBuilder; - - use super::{Builder, State, StateID, Transition, NFA}; - - fn parse(pattern: &str) -> Hir { - ParserBuilder::new().build().parse(pattern).unwrap() - } - - fn build(pattern: &str) -> NFA { - Builder::new().anchored(true).build(&parse(pattern)).unwrap() - } - - fn s_byte(byte: u8, next: StateID) -> State { - let trans = Transition { start: byte, end: byte, next }; - State::Range { range: trans } - } - - fn s_range(start: u8, end: u8, next: StateID) -> State { - let trans = Transition { start, end, next }; - State::Range { range: trans } - } - - fn s_sparse(ranges: &[(u8, u8, StateID)]) -> State { - let ranges = ranges - .iter() - .map(|&(start, end, next)| Transition { start, end, next }) - .collect(); - State::Sparse { ranges } - } - - fn s_union(alts: &[StateID]) -> State { - 
State::Union { alternates: alts.to_vec().into_boxed_slice() } - } - - fn s_match() -> State { - State::Match - } - - #[test] - fn errors() { - // unsupported anchors - assert!(Builder::new().build(&parse(r"^")).is_err()); - assert!(Builder::new().build(&parse(r"$")).is_err()); - assert!(Builder::new().build(&parse(r"\A")).is_err()); - assert!(Builder::new().build(&parse(r"\z")).is_err()); - - // unsupported word boundaries - assert!(Builder::new().build(&parse(r"\b")).is_err()); - assert!(Builder::new().build(&parse(r"\B")).is_err()); - assert!(Builder::new().build(&parse(r"(?-u)\b")).is_err()); - } - - // Test that building an unanchored NFA has an appropriate `.*?` prefix. - #[test] - fn compile_unanchored_prefix() { - // When the machine can only match valid UTF-8. - let nfa = Builder::new().anchored(false).build(&parse(r"a")).unwrap(); - // There should be many states since the `.` in `.*?` matches any - // Unicode scalar value. - assert_eq!(11, nfa.len()); - assert_eq!(nfa.states[10], s_match()); - assert_eq!(nfa.states[9], s_byte(b'a', 10)); - - // When the machine can match invalid UTF-8. - let nfa = Builder::new() - .anchored(false) - .allow_invalid_utf8(true) - .build(&parse(r"a")) - .unwrap(); - assert_eq!( - nfa.states, - &[ - s_union(&[2, 1]), - s_range(0, 255, 0), - s_byte(b'a', 3), - s_match(), - ] - ); - } - - #[test] - fn compile_empty() { - assert_eq!(build("").states, &[s_match(),]); - } - - #[test] - fn compile_literal() { - assert_eq!(build("a").states, &[s_byte(b'a', 1), s_match(),]); - assert_eq!( - build("ab").states, - &[s_byte(b'a', 1), s_byte(b'b', 2), s_match(),] - ); - assert_eq!( - build("☃").states, - &[s_byte(0xE2, 1), s_byte(0x98, 2), s_byte(0x83, 3), s_match(),] - ); - - // Check that non-UTF-8 literals work. 
- let hir = ParserBuilder::new() - .allow_invalid_utf8(true) - .build() - .parse(r"(?-u)\xFF") - .unwrap(); - let nfa = Builder::new() - .anchored(true) - .allow_invalid_utf8(true) - .build(&hir) - .unwrap(); - assert_eq!(nfa.states, &[s_byte(b'\xFF', 1), s_match(),]); - } - - #[test] - fn compile_class() { - assert_eq!( - build(r"[a-z]").states, - &[s_range(b'a', b'z', 1), s_match(),] - ); - assert_eq!( - build(r"[x-za-c]").states, - &[s_sparse(&[(b'a', b'c', 1), (b'x', b'z', 1)]), s_match()] - ); - assert_eq!( - build(r"[\u03B1-\u03B4]").states, - &[s_range(0xB1, 0xB4, 2), s_byte(0xCE, 0), s_match()] - ); - assert_eq!( - build(r"[\u03B1-\u03B4\u{1F919}-\u{1F91E}]").states, - &[ - s_range(0xB1, 0xB4, 5), - s_range(0x99, 0x9E, 5), - s_byte(0xA4, 1), - s_byte(0x9F, 2), - s_sparse(&[(0xCE, 0xCE, 0), (0xF0, 0xF0, 3)]), - s_match(), - ] - ); - assert_eq!( - build(r"[a-z☃]").states, - &[ - s_byte(0x83, 3), - s_byte(0x98, 0), - s_sparse(&[(b'a', b'z', 3), (0xE2, 0xE2, 1)]), - s_match(), - ] - ); - } - - #[test] - fn compile_repetition() { - assert_eq!( - build(r"a?").states, - &[s_union(&[1, 2]), s_byte(b'a', 2), s_match(),] - ); - assert_eq!( - build(r"a??").states, - &[s_union(&[2, 1]), s_byte(b'a', 2), s_match(),] - ); - } - - #[test] - fn compile_group() { - assert_eq!( - build(r"ab+").states, - &[s_byte(b'a', 1), s_byte(b'b', 2), s_union(&[1, 3]), s_match(),] - ); - assert_eq!( - build(r"(ab)").states, - &[s_byte(b'a', 1), s_byte(b'b', 2), s_match(),] - ); - assert_eq!( - build(r"(ab)+").states, - &[s_byte(b'a', 1), s_byte(b'b', 2), s_union(&[0, 3]), s_match(),] - ); - } - - #[test] - fn compile_alternation() { - assert_eq!( - build(r"a|b").states, - &[s_byte(b'a', 3), s_byte(b'b', 3), s_union(&[0, 1]), s_match(),] - ); - assert_eq!( - build(r"|b").states, - &[s_byte(b'b', 2), s_union(&[2, 0]), s_match(),] - ); - assert_eq!( - build(r"a|").states, - &[s_byte(b'a', 2), s_union(&[0, 2]), s_match(),] - ); - } -} diff --git a/vendor/regex-automata/src/nfa/mod.rs 
b/vendor/regex-automata/src/nfa/mod.rs index 02d0501de..61ce5ef47 100644 --- a/vendor/regex-automata/src/nfa/mod.rs +++ b/vendor/regex-automata/src/nfa/mod.rs @@ -1,252 +1 @@ -use std::fmt; - -use classes::ByteClasses; -pub use nfa::compiler::Builder; - -mod compiler; -mod map; -mod range_trie; - -/// The representation for an NFA state identifier. -pub type StateID = usize; - -/// A final compiled NFA. -/// -/// The states of the NFA are indexed by state IDs, which are how transitions -/// are expressed. -#[derive(Clone)] -pub struct NFA { - /// Whether this NFA can only match at the beginning of input or not. - /// - /// When true, a match should only be reported if it begins at the 0th - /// index of the haystack. - anchored: bool, - /// The starting state of this NFA. - start: StateID, - /// The state list. This list is guaranteed to be indexable by the starting - /// state ID, and it is also guaranteed to contain exactly one `Match` - /// state. - states: Vec<State>, - /// A mapping from any byte value to its corresponding equivalence class - /// identifier. Two bytes in the same equivalence class cannot discriminate - /// between a match or a non-match. This map can be used to shrink the - /// total size of a DFA's transition table with a small match-time cost. - /// - /// Note that the NFA's transitions are *not* defined in terms of these - /// equivalence classes. The NFA's transitions are defined on the original - /// byte values. For the most part, this is because they wouldn't really - /// help the NFA much since the NFA already uses a sparse representation - /// to represent transitions. Byte classes are most effective in a dense - /// representation. - byte_classes: ByteClasses, -} - -impl NFA { - /// Returns an NFA that always matches at every position. 
- pub fn always_match() -> NFA { - NFA { - anchored: false, - start: 0, - states: vec![State::Match], - byte_classes: ByteClasses::empty(), - } - } - - /// Returns an NFA that never matches at any position. - pub fn never_match() -> NFA { - NFA { - anchored: false, - start: 0, - states: vec![State::Fail], - byte_classes: ByteClasses::empty(), - } - } - - /// Returns true if and only if this NFA is anchored. - pub fn is_anchored(&self) -> bool { - self.anchored - } - - /// Return the number of states in this NFA. - pub fn len(&self) -> usize { - self.states.len() - } - - /// Return the ID of the initial state of this NFA. - pub fn start(&self) -> StateID { - self.start - } - - /// Return the NFA state corresponding to the given ID. - pub fn state(&self, id: StateID) -> &State { - &self.states[id] - } - - /// Return the set of equivalence classes for this NFA. The slice returned - /// always has length 256 and maps each possible byte value to its - /// corresponding equivalence class ID (which is never more than 255). - pub fn byte_classes(&self) -> &ByteClasses { - &self.byte_classes - } -} - -impl fmt::Debug for NFA { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - for (i, state) in self.states.iter().enumerate() { - let status = if i == self.start { '>' } else { ' ' }; - writeln!(f, "{}{:06}: {:?}", status, i, state)?; - } - Ok(()) - } -} - -/// A state in a final compiled NFA. -#[derive(Clone, Eq, PartialEq)] -pub enum State { - /// A state that transitions to `next` if and only if the current input - /// byte is in the range `[start, end]` (inclusive). - /// - /// This is a special case of Sparse in that it encodes only one transition - /// (and therefore avoids the allocation). - Range { range: Transition }, - /// A state with possibly many transitions, represented in a sparse - /// fashion. Transitions are ordered lexicographically by input range. - /// As such, this may only be used when every transition has equal - /// priority. 
(In practice, this is only used for encoding large UTF-8 - /// automata.) - Sparse { ranges: Box<[Transition]> }, - /// An alternation such that there exists an epsilon transition to all - /// states in `alternates`, where matches found via earlier transitions - /// are preferred over later transitions. - Union { alternates: Box<[StateID]> }, - /// A fail state. When encountered, the automaton is guaranteed to never - /// reach a match state. - Fail, - /// A match state. There is exactly one such occurrence of this state in - /// an NFA. - Match, -} - -/// A transition to another state, only if the given byte falls in the -/// inclusive range specified. -#[derive(Clone, Copy, Eq, Hash, PartialEq)] -pub struct Transition { - pub start: u8, - pub end: u8, - pub next: StateID, -} - -impl State { - /// Returns true if and only if this state contains one or more epsilon - /// transitions. - pub fn is_epsilon(&self) -> bool { - match *self { - State::Range { .. } - | State::Sparse { .. } - | State::Fail - | State::Match => false, - State::Union { .. } => true, - } - } - - /// Remap the transitions in this state using the given map. Namely, the - /// given map should be indexed according to the transitions currently - /// in this state. - /// - /// This is used during the final phase of the NFA compiler, which turns - /// its intermediate NFA into the final NFA. 
- fn remap(&mut self, remap: &[StateID]) { - match *self { - State::Range { ref mut range } => range.next = remap[range.next], - State::Sparse { ref mut ranges } => { - for r in ranges.iter_mut() { - r.next = remap[r.next]; - } - } - State::Union { ref mut alternates } => { - for alt in alternates.iter_mut() { - *alt = remap[*alt]; - } - } - State::Fail => {} - State::Match => {} - } - } -} - -impl fmt::Debug for State { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - match *self { - State::Range { ref range } => range.fmt(f), - State::Sparse { ref ranges } => { - let rs = ranges - .iter() - .map(|t| format!("{:?}", t)) - .collect::<Vec<String>>() - .join(", "); - write!(f, "sparse({})", rs) - } - State::Union { ref alternates } => { - let alts = alternates - .iter() - .map(|id| format!("{}", id)) - .collect::<Vec<String>>() - .join(", "); - write!(f, "alt({})", alts) - } - State::Fail => write!(f, "FAIL"), - State::Match => write!(f, "MATCH"), - } - } -} - -impl fmt::Debug for Transition { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - let Transition { start, end, next } = *self; - if self.start == self.end { - write!(f, "{} => {}", escape(start), next) - } else { - write!(f, "{}-{} => {}", escape(start), escape(end), next) - } - } -} - -/// Return the given byte as its escaped string form. 
-fn escape(b: u8) -> String { - use std::ascii; - - String::from_utf8(ascii::escape_default(b).collect::<Vec<_>>()).unwrap() -} - -#[cfg(test)] -mod tests { - use super::*; - use dense; - use dfa::DFA; - - #[test] - fn always_match() { - let nfa = NFA::always_match(); - let dfa = dense::Builder::new().build_from_nfa::<usize>(&nfa).unwrap(); - - assert_eq!(Some(0), dfa.find_at(b"", 0)); - assert_eq!(Some(0), dfa.find_at(b"a", 0)); - assert_eq!(Some(1), dfa.find_at(b"a", 1)); - assert_eq!(Some(0), dfa.find_at(b"ab", 0)); - assert_eq!(Some(1), dfa.find_at(b"ab", 1)); - assert_eq!(Some(2), dfa.find_at(b"ab", 2)); - } - - #[test] - fn never_match() { - let nfa = NFA::never_match(); - let dfa = dense::Builder::new().build_from_nfa::<usize>(&nfa).unwrap(); - - assert_eq!(None, dfa.find_at(b"", 0)); - assert_eq!(None, dfa.find_at(b"a", 0)); - assert_eq!(None, dfa.find_at(b"a", 1)); - assert_eq!(None, dfa.find_at(b"ab", 0)); - assert_eq!(None, dfa.find_at(b"ab", 1)); - assert_eq!(None, dfa.find_at(b"ab", 2)); - } -} +pub mod thompson; diff --git a/vendor/regex-automata/src/nfa/thompson/compiler.rs b/vendor/regex-automata/src/nfa/thompson/compiler.rs new file mode 100644 index 000000000..301194005 --- /dev/null +++ b/vendor/regex-automata/src/nfa/thompson/compiler.rs @@ -0,0 +1,1713 @@ +/* +This module provides an NFA compiler using Thompson's construction +algorithm. The compiler takes a regex-syntax::Hir as input and emits an NFA +graph as output. The NFA graph is structured in a way that permits it to be +executed by a virtual machine and also used to efficiently build a DFA. + +The compiler deals with a slightly expanded set of NFA states that notably +includes an empty node that has exactly one epsilon transition to the next +state. In other words, it's a "goto" instruction if one views Thompson's NFA +as a set of bytecode instructions. These goto instructions are removed in +a subsequent phase before returning the NFA to the caller. 
The purpose of +these empty nodes is that they make the construction algorithm substantially +simpler to implement. We remove them before returning to the caller because +they can represent substantial overhead when traversing the NFA graph +(either while searching using the NFA directly or while building a DFA). + +In the future, it would be nice to provide a Glushkov compiler as well, +as it would work well as a bit-parallel NFA for smaller regexes. But +the Thompson construction is one I'm more familiar with and seems more +straight-forward to deal with when it comes to large Unicode character +classes. + +Internally, the compiler uses interior mutability to improve composition +in the face of the borrow checker. In particular, we'd really like to be +able to write things like this: + + self.c_concat(exprs.iter().map(|e| self.c(e))) + +Which elegantly uses iterators to build up a sequence of compiled regex +sub-expressions and then hands it off to the concatenating compiler +routine. Without interior mutability, the borrow checker won't let us +borrow `self` mutably both inside and outside the closure at the same +time. +*/ + +use core::{ + borrow::Borrow, + cell::{Cell, RefCell}, + mem, +}; + +use alloc::{sync::Arc, vec, vec::Vec}; + +use regex_syntax::{ + hir::{self, Anchor, Class, Hir, HirKind, Literal, WordBoundary}, + utf8::{Utf8Range, Utf8Sequences}, + ParserBuilder, +}; + +use crate::{ + nfa::thompson::{ + error::Error, + map::{Utf8BoundedMap, Utf8SuffixKey, Utf8SuffixMap}, + range_trie::RangeTrie, + Look, SparseTransitions, State, Transition, NFA, + }, + util::{ + alphabet::ByteClassSet, + id::{IteratorIDExt, PatternID, StateID}, + }, +}; + +/// The configuration used for compiling a Thompson NFA from a regex pattern. 
+#[derive(Clone, Copy, Debug, Default)] +pub struct Config { + reverse: Option<bool>, + utf8: Option<bool>, + nfa_size_limit: Option<Option<usize>>, + shrink: Option<bool>, + captures: Option<bool>, + #[cfg(test)] + unanchored_prefix: Option<bool>, +} + +impl Config { + /// Return a new default Thompson NFA compiler configuration. + pub fn new() -> Config { + Config::default() + } + + /// Reverse the NFA. + /// + /// A NFA reversal is performed by reversing all of the concatenated + /// sub-expressions in the original pattern, recursively. The resulting + /// NFA can be used to match the pattern starting from the end of a string + /// instead of the beginning of a string. + /// + /// Reversing the NFA is useful for building a reverse DFA, which is most + /// useful for finding the start of a match after its ending position has + /// been found. + /// + /// This is disabled by default. + pub fn reverse(mut self, yes: bool) -> Config { + self.reverse = Some(yes); + self + } + + /// Whether to enable UTF-8 mode or not. + /// + /// When UTF-8 mode is enabled (which is the default), unanchored searches + /// will only match through valid UTF-8. If invalid UTF-8 is seen, then + /// an unanchored search will stop at that point. This is equivalent to + /// putting a `(?s:.)*?` at the start of the regex. + /// + /// When UTF-8 mode is disabled, then unanchored searches will match + /// through any arbitrary byte. This is equivalent to putting a + /// `(?s-u:.)*?` at the start of the regex. + /// + /// Generally speaking, UTF-8 mode should only be used when you know you + /// are searching valid UTF-8, such as a Rust `&str`. If UTF-8 mode is used + /// on input that is not valid UTF-8, then the regex is not likely to work + /// as expected. + /// + /// This is enabled by default. + pub fn utf8(mut self, yes: bool) -> Config { + self.utf8 = Some(yes); + self + } + + /// Sets an approximate size limit on the total heap used by the NFA being + /// compiled. 
+ /// + /// This permits imposing constraints on the size of a compiled NFA. This + /// may be useful in contexts where the regex pattern is untrusted and one + /// wants to avoid using too much memory. + /// + /// This size limit does not apply to auxiliary heap used during + /// compilation that is not part of the built NFA. + /// + /// Note that this size limit is applied during compilation in order for + /// the limit to prevent too much heap from being used. However, the + /// implementation may use an intermediate NFA representation that is + /// otherwise slightly bigger than the final public form. Since the size + /// limit may be applied to an intermediate representation, there is not + /// necessarily a precise correspondence between the configured size limit + /// and the heap usage of the final NFA. + /// + /// There is no size limit by default. + /// + /// # Example + /// + /// This example demonstrates how Unicode mode can greatly increase the + /// size of the NFA. + /// + /// ``` + /// use regex_automata::nfa::thompson::NFA; + /// + /// // 300KB isn't enough! + /// NFA::builder() + /// .configure(NFA::config().nfa_size_limit(Some(300_000))) + /// .build(r"\w{20}") + /// .unwrap_err(); + /// + /// // ... but 400KB probably is. + /// let nfa = NFA::builder() + /// .configure(NFA::config().nfa_size_limit(Some(400_000))) + /// .build(r"\w{20}")?; + /// + /// assert_eq!(nfa.pattern_len(), 1); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn nfa_size_limit(mut self, bytes: Option<usize>) -> Config { + self.nfa_size_limit = Some(bytes); + self + } + + /// Apply best effort heuristics to shrink the NFA at the expense of more + /// time/memory. + /// + /// This is enabled by default. Generally speaking, if one is using an NFA + /// to compile a DFA, then the extra time used to shrink the NFA will be + /// more than made up for during DFA construction (potentially by a lot). 
+ /// In other words, enabling this can substantially decrease the overall + /// amount of time it takes to build a DFA. + /// + /// The only reason to disable this if you want to compile an NFA and start + /// using it as quickly as possible without needing to build a DFA. e.g., + /// for an NFA simulation or for a lazy DFA. + /// + /// This is enabled by default. + pub fn shrink(mut self, yes: bool) -> Config { + self.shrink = Some(yes); + self + } + + /// Whether to include 'Capture' states in the NFA. + /// + /// This can only be enabled when compiling a forward NFA. This is + /// always disabled---with no way to override it---when the `reverse` + /// configuration is enabled. + /// + /// This is enabled by default. + pub fn captures(mut self, yes: bool) -> Config { + self.captures = Some(yes); + self + } + + /// Whether to compile an unanchored prefix into this NFA. + /// + /// This is enabled by default. It is made available for tests only to make + /// it easier to unit test the output of the compiler. 
+ #[cfg(test)] + fn unanchored_prefix(mut self, yes: bool) -> Config { + self.unanchored_prefix = Some(yes); + self + } + + pub fn get_reverse(&self) -> bool { + self.reverse.unwrap_or(false) + } + + pub fn get_utf8(&self) -> bool { + self.utf8.unwrap_or(true) + } + + pub fn get_nfa_size_limit(&self) -> Option<usize> { + self.nfa_size_limit.unwrap_or(None) + } + + pub fn get_shrink(&self) -> bool { + self.shrink.unwrap_or(true) + } + + pub fn get_captures(&self) -> bool { + !self.get_reverse() && self.captures.unwrap_or(true) + } + + fn get_unanchored_prefix(&self) -> bool { + #[cfg(test)] + { + self.unanchored_prefix.unwrap_or(true) + } + #[cfg(not(test))] + { + true + } + } + + pub(crate) fn overwrite(self, o: Config) -> Config { + Config { + reverse: o.reverse.or(self.reverse), + utf8: o.utf8.or(self.utf8), + nfa_size_limit: o.nfa_size_limit.or(self.nfa_size_limit), + shrink: o.shrink.or(self.shrink), + captures: o.captures.or(self.captures), + #[cfg(test)] + unanchored_prefix: o.unanchored_prefix.or(self.unanchored_prefix), + } + } +} + +/// A builder for compiling an NFA. +#[derive(Clone, Debug)] +pub struct Builder { + config: Config, + parser: ParserBuilder, +} + +impl Builder { + /// Create a new NFA builder with its default configuration. + pub fn new() -> Builder { + Builder { config: Config::default(), parser: ParserBuilder::new() } + } + + /// Compile the given regular expression into an NFA. + /// + /// If there was a problem parsing the regex, then that error is returned. + /// + /// Otherwise, if there was a problem building the NFA, then an error is + /// returned. The only error that can occur is if the compiled regex would + /// exceed the size limits configured on this builder. 
+ pub fn build(&self, pattern: &str) -> Result<NFA, Error> { + self.build_many(&[pattern]) + } + + pub fn build_many<P: AsRef<str>>( + &self, + patterns: &[P], + ) -> Result<NFA, Error> { + let mut hirs = vec![]; + for p in patterns { + hirs.push( + self.parser + .build() + .parse(p.as_ref()) + .map_err(Error::syntax)?, + ); + log!(log::trace!("parsed: {:?}", p.as_ref())); + } + self.build_many_from_hir(&hirs) + } + + /// Compile the given high level intermediate representation of a regular + /// expression into an NFA. + /// + /// If there was a problem building the NFA, then an error is returned. The + /// only error that can occur is if the compiled regex would exceed the + /// size limits configured on this builder. + pub fn build_from_hir(&self, expr: &Hir) -> Result<NFA, Error> { + self.build_from_hir_with(&mut Compiler::new(), expr) + } + + pub fn build_many_from_hir<H: Borrow<Hir>>( + &self, + exprs: &[H], + ) -> Result<NFA, Error> { + self.build_many_from_hir_with(&mut Compiler::new(), exprs) + } + + /// Compile the given high level intermediate representation of a regular + /// expression into the NFA given using the given compiler. Callers may + /// prefer this over `build` if they would like to reuse allocations while + /// compiling many regular expressions. + /// + /// On success, the given NFA is completely overwritten with the NFA + /// produced by the compiler. + /// + /// If there was a problem building the NFA, then an error is returned. + /// The only error that can occur is if the compiled regex would exceed + /// the size limits configured on this builder. When an error is returned, + /// the contents of `nfa` are unspecified and should not be relied upon. + /// However, it can still be reused in subsequent calls to this method. 
+ fn build_from_hir_with( + &self, + compiler: &mut Compiler, + expr: &Hir, + ) -> Result<NFA, Error> { + self.build_many_from_hir_with(compiler, &[expr]) + } + + fn build_many_from_hir_with<H: Borrow<Hir>>( + &self, + compiler: &mut Compiler, + exprs: &[H], + ) -> Result<NFA, Error> { + compiler.configure(self.config); + compiler.compile(exprs) + } + + /// Apply the given NFA configuration options to this builder. + pub fn configure(&mut self, config: Config) -> &mut Builder { + self.config = self.config.overwrite(config); + self + } + + /// Set the syntax configuration for this builder using + /// [`SyntaxConfig`](../../struct.SyntaxConfig.html). + /// + /// This permits setting things like case insensitivity, Unicode and multi + /// line mode. + /// + /// This syntax configuration generally only applies when an NFA is built + /// directly from a pattern string. If an NFA is built from an HIR, then + /// all syntax settings are ignored. + pub fn syntax( + &mut self, + config: crate::util::syntax::SyntaxConfig, + ) -> &mut Builder { + config.apply(&mut self.parser); + self + } +} + +/// A compiler that converts a regex abstract syntax to an NFA via Thompson's +/// construction. Namely, this compiler permits epsilon transitions between +/// states. +#[derive(Clone, Debug)] +pub struct Compiler { + /// The configuration from the builder. + config: Config, + /// The final NFA that is built. + /// + /// Parts of this NFA are constructed during compilation, but the actual + /// states aren't added until a final "finish" step. This is because the + /// states constructed during compilation have unconditional epsilon + /// transitions, which makes the logic of compilation much simpler. The + /// "finish" step removes these unconditional epsilon transitions and must + /// therefore remap all of the transition state IDs. + nfa: RefCell<NFA>, + /// The set of compiled NFA states. Once a state is compiled, it is + /// assigned a state ID equivalent to its index in this list. 
Subsequent + /// compilation can modify previous states by adding new transitions. + states: RefCell<Vec<CState>>, + /// State used for compiling character classes to UTF-8 byte automata. + /// State is not retained between character class compilations. This just + /// serves to amortize allocation to the extent possible. + utf8_state: RefCell<Utf8State>, + /// State used for arranging character classes in reverse into a trie. + trie_state: RefCell<RangeTrie>, + /// State used for caching common suffixes when compiling reverse UTF-8 + /// automata (for Unicode character classes). + utf8_suffix: RefCell<Utf8SuffixMap>, + /// A map used to re-map state IDs when translating the compiler's internal + /// NFA state representation to the external NFA representation. + remap: RefCell<Vec<StateID>>, + /// A set of compiler internal state IDs that correspond to states that are + /// exclusively epsilon transitions, i.e., goto instructions, combined with + /// the state that they point to. This is used to record said states while + /// transforming the compiler's internal NFA representation to the external + /// form. + empties: RefCell<Vec<(StateID, StateID)>>, + /// The total memory used by each of the 'CState's in 'states'. This only + /// includes heap usage by each state, and not the size of the state + /// itself. + memory_cstates: Cell<usize>, +} + +/// A compiler intermediate state representation for an NFA that is only used +/// during compilation. Once compilation is done, `CState`s are converted +/// to `State`s (defined in the parent module), which have a much simpler +/// representation. +#[derive(Clone, Debug, Eq, PartialEq)] +enum CState { + /// An empty state whose only purpose is to forward the automaton to + /// another state via en epsilon transition. These are useful during + /// compilation but are otherwise removed at the end. + Empty { + next: StateID, + }, + /// An empty state that records a capture location. 
+ /// + /// From the perspective of finite automata, this is precisely equivalent + /// to 'Empty', but serves the purpose of instructing NFA simulations to + /// record additional state when the finite state machine passes through + /// this epsilon transition. + /// + /// These transitions are treated as epsilon transitions with no additional + /// effects in DFAs. + /// + /// 'slot' in this context refers to the specific capture group offset that + /// is being recorded. Each capturing group has two slots corresponding to + /// the start and end of the matching portion of that group. + CaptureStart { + next: StateID, + capture_index: u32, + name: Option<Arc<str>>, + }, + CaptureEnd { + next: StateID, + capture_index: u32, + }, + /// A state that only transitions to `next` if the current input byte is + /// in the range `[start, end]` (inclusive on both ends). + Range { + range: Transition, + }, + /// A state with possibly many transitions, represented in a sparse + /// fashion. Transitions are ordered lexicographically by input range. + /// As such, this may only be used when every transition has equal + /// priority. (In practice, this is only used for encoding large UTF-8 + /// automata.) In contrast, a `Union` state has each alternate in order + /// of priority. Priority is used to implement greedy matching and also + /// alternations themselves, e.g., `abc|a` where `abc` has priority over + /// `a`. + /// + /// To clarify, it is possible to remove `Sparse` and represent all things + /// that `Sparse` is used for via `Union`. But this creates a more bloated + /// NFA with more epsilon transitions than is necessary in the special case + /// of character classes. + Sparse { + ranges: Vec<Transition>, + }, + /// A conditional epsilon transition satisfied via some sort of + /// look-around. 
+ Look { + look: Look, + next: StateID, + }, + /// An alternation such that there exists an epsilon transition to all + /// states in `alternates`, where matches found via earlier transitions + /// are preferred over later transitions. + Union { + alternates: Vec<StateID>, + }, + /// An alternation such that there exists an epsilon transition to all + /// states in `alternates`, where matches found via later transitions are + /// preferred over earlier transitions. + /// + /// This "reverse" state exists for convenience during compilation that + /// permits easy construction of non-greedy combinations of NFA states. At + /// the end of compilation, Union and UnionReverse states are merged into + /// one Union type of state, where the latter has its epsilon transitions + /// reversed to reflect the priority inversion. + /// + /// The "convenience" here arises from the fact that as new states are + /// added to the list of `alternates`, we would like that add operation + /// to be amortized constant time. But if we used a `Union`, we'd need to + /// prepend the state, which takes O(n) time. There are other approaches we + /// could use to solve this, but this seems simple enough. + UnionReverse { + alternates: Vec<StateID>, + }, + /// A match state. There is at most one such occurrence of this state in + /// an NFA for each pattern compiled into the NFA. At time of writing, a + /// match state is always produced for every pattern given, but in theory, + /// if a pattern can never lead to a match, then the match state could be + /// omitted. + /// + /// `id` refers to the ID of the pattern itself, which corresponds to the + /// pattern's index (starting at 0). `start_id` refers to the anchored + /// NFA starting state corresponding to this pattern. + Match { + pattern_id: PatternID, + start_id: StateID, + }, +} + +/// A value that represents the result of compiling a sub-expression of a +/// regex's HIR. 
Specifically, this represents a sub-graph of the NFA that +/// has an initial state at `start` and a final state at `end`. +#[derive(Clone, Copy, Debug)] +pub struct ThompsonRef { + start: StateID, + end: StateID, +} + +impl Compiler { + /// Create a new compiler. + pub fn new() -> Compiler { + Compiler { + config: Config::default(), + nfa: RefCell::new(NFA::empty()), + states: RefCell::new(vec![]), + utf8_state: RefCell::new(Utf8State::new()), + trie_state: RefCell::new(RangeTrie::new()), + utf8_suffix: RefCell::new(Utf8SuffixMap::new(1000)), + remap: RefCell::new(vec![]), + empties: RefCell::new(vec![]), + memory_cstates: Cell::new(0), + } + } + + /// Configure and prepare this compiler from the builder's knobs. + /// + /// The compiler is must always reconfigured by the builder before using it + /// to build an NFA. Namely, this will also clear any latent state in the + /// compiler used during previous compilations. + fn configure(&mut self, config: Config) { + self.config = config; + self.nfa.borrow_mut().clear(); + self.states.borrow_mut().clear(); + self.memory_cstates.set(0); + // We don't need to clear anything else since they are cleared on + // their own and only when they are used. + } + + /// Convert the current intermediate NFA to its final compiled form. + fn compile<H: Borrow<Hir>>(&self, exprs: &[H]) -> Result<NFA, Error> { + if exprs.is_empty() { + return Ok(NFA::never_match()); + } + if exprs.len() > PatternID::LIMIT { + return Err(Error::too_many_patterns(exprs.len())); + } + + // We always add an unanchored prefix unless we were specifically told + // not to (for tests only), or if we know that the regex is anchored + // for all matches. When an unanchored prefix is not added, then the + // NFA's anchored and unanchored start states are equivalent. 
+ let all_anchored = + exprs.iter().all(|e| e.borrow().is_anchored_start()); + let anchored = !self.config.get_unanchored_prefix() || all_anchored; + let unanchored_prefix = if anchored { + self.c_empty()? + } else { + if self.config.get_utf8() { + self.c_unanchored_prefix_valid_utf8()? + } else { + self.c_unanchored_prefix_invalid_utf8()? + } + }; + + let compiled = self.c_alternation( + exprs.iter().with_pattern_ids().map(|(pid, e)| { + let group_kind = hir::GroupKind::CaptureIndex(0); + let one = self.c_group(&group_kind, e.borrow())?; + let match_state_id = self.add_match(pid, one.start)?; + self.patch(one.end, match_state_id)?; + Ok(ThompsonRef { start: one.start, end: match_state_id }) + }), + )?; + self.patch(unanchored_prefix.end, compiled.start)?; + self.finish(compiled.start, unanchored_prefix.start)?; + Ok(self.nfa.replace(NFA::empty())) + } + + /// Finishes the compilation process and populates the NFA attached to this + /// compiler with the final graph. + fn finish( + &self, + start_anchored: StateID, + start_unanchored: StateID, + ) -> Result<(), Error> { + trace!( + "intermediate NFA compilation complete, \ + intermediate NFA size: {} states, {} bytes on heap", + self.states.borrow().len(), + self.nfa_memory_usage(), + ); + let mut nfa = self.nfa.borrow_mut(); + let mut bstates = self.states.borrow_mut(); + let mut remap = self.remap.borrow_mut(); + let mut empties = self.empties.borrow_mut(); + remap.resize(bstates.len(), StateID::ZERO); + empties.clear(); + + // The idea here is to convert our intermediate states to their final + // form. The only real complexity here is the process of converting + // transitions, which are expressed in terms of state IDs. The new + // set of states will be smaller because of partial epsilon removal, + // so the state IDs will not be the same. 
+ for (sid, bstate) in bstates.iter_mut().with_state_ids() { + match *bstate { + CState::Empty { next } => { + // Since we're removing empty states, we need to handle + // them later since we don't yet know which new state this + // empty state will be mapped to. + empties.push((sid, next)); + } + CState::CaptureStart { next, capture_index, ref name } => { + // We can't remove this empty state because of the side + // effect of capturing an offset for this capture slot. + remap[sid] = nfa.add_capture_start( + next, + capture_index, + name.clone(), + )?; + } + CState::CaptureEnd { next, capture_index } => { + // We can't remove this empty state because of the side + // effect of capturing an offset for this capture slot. + remap[sid] = nfa.add_capture_end(next, capture_index)?; + } + CState::Range { range } => { + remap[sid] = nfa.add_range(range)?; + } + CState::Sparse { ref mut ranges } => { + let ranges = + mem::replace(ranges, vec![]).into_boxed_slice(); + remap[sid] = + nfa.add_sparse(SparseTransitions { ranges })?; + } + CState::Look { look, next } => { + remap[sid] = nfa.add_look(next, look)?; + } + CState::Union { ref mut alternates } => { + let alternates = + mem::replace(alternates, vec![]).into_boxed_slice(); + remap[sid] = nfa.add_union(alternates)?; + } + CState::UnionReverse { ref mut alternates } => { + let mut alternates = + mem::replace(alternates, vec![]).into_boxed_slice(); + alternates.reverse(); + remap[sid] = nfa.add_union(alternates)?; + } + CState::Match { start_id, .. } => { + remap[sid] = nfa.add_match()?; + nfa.finish_pattern(start_id)?; + } + } + } + for &(empty_id, mut empty_next) in empties.iter() { + // empty states can point to other empty states, forming a chain. + // So we must follow the chain until the end, which must end at + // a non-empty state, and therefore, a state that is correctly + // remapped. We are guaranteed to terminate because our compiler + // never builds a loop among only empty states. 
+ while let CState::Empty { next } = bstates[empty_next] { + empty_next = next; + } + remap[empty_id] = remap[empty_next]; + } + nfa.set_start_anchored(start_anchored); + nfa.set_start_unanchored(start_unanchored); + nfa.remap(&remap); + trace!( + "final NFA (reverse? {:?}) compilation complete, \ + final NFA size: {} states, {} bytes on heap", + self.config.get_reverse(), + nfa.states().len(), + nfa.memory_usage(), + ); + Ok(()) + } + + fn c(&self, expr: &Hir) -> Result<ThompsonRef, Error> { + match *expr.kind() { + HirKind::Empty => self.c_empty(), + HirKind::Literal(Literal::Unicode(ch)) => self.c_char(ch), + HirKind::Literal(Literal::Byte(b)) => self.c_range(b, b), + HirKind::Class(Class::Bytes(ref c)) => self.c_byte_class(c), + HirKind::Class(Class::Unicode(ref c)) => self.c_unicode_class(c), + HirKind::Anchor(ref anchor) => self.c_anchor(anchor), + HirKind::WordBoundary(ref wb) => self.c_word_boundary(wb), + HirKind::Repetition(ref rep) => self.c_repetition(rep), + HirKind::Group(ref group) => self.c_group(&group.kind, &group.hir), + HirKind::Concat(ref es) => { + self.c_concat(es.iter().map(|e| self.c(e))) + } + HirKind::Alternation(ref es) => { + self.c_alternation(es.iter().map(|e| self.c(e))) + } + } + } + + fn c_concat<I>(&self, mut it: I) -> Result<ThompsonRef, Error> + where + I: DoubleEndedIterator<Item = Result<ThompsonRef, Error>>, + { + let first = if self.is_reverse() { it.next_back() } else { it.next() }; + let ThompsonRef { start, mut end } = match first { + Some(result) => result?, + None => return self.c_empty(), + }; + loop { + let next = + if self.is_reverse() { it.next_back() } else { it.next() }; + let compiled = match next { + Some(result) => result?, + None => break, + }; + self.patch(end, compiled.start)?; + end = compiled.end; + } + Ok(ThompsonRef { start, end }) + } + + fn c_alternation<I>(&self, mut it: I) -> Result<ThompsonRef, Error> + where + I: Iterator<Item = Result<ThompsonRef, Error>>, + { + let first = 
it.next().expect("alternations must be non-empty")?; + let second = match it.next() { + None => return Ok(first), + Some(result) => result?, + }; + + let union = self.add_union()?; + let end = self.add_empty()?; + self.patch(union, first.start)?; + self.patch(first.end, end)?; + self.patch(union, second.start)?; + self.patch(second.end, end)?; + for result in it { + let compiled = result?; + self.patch(union, compiled.start)?; + self.patch(compiled.end, end)?; + } + Ok(ThompsonRef { start: union, end }) + } + + fn c_group( + &self, + kind: &hir::GroupKind, + expr: &Hir, + ) -> Result<ThompsonRef, Error> { + if !self.config.get_captures() { + return self.c(expr); + } + let (capi, name) = match *kind { + hir::GroupKind::NonCapturing => return self.c(expr), + hir::GroupKind::CaptureIndex(index) => (index, None), + hir::GroupKind::CaptureName { ref name, index } => { + (index, Some(Arc::from(&**name))) + } + }; + + let start = self.add_capture_start(capi, name)?; + let inner = self.c(expr)?; + let end = self.add_capture_end(capi)?; + + self.patch(start, inner.start)?; + self.patch(inner.end, end)?; + Ok(ThompsonRef { start, end }) + } + + fn c_repetition( + &self, + rep: &hir::Repetition, + ) -> Result<ThompsonRef, Error> { + match rep.kind { + hir::RepetitionKind::ZeroOrOne => { + self.c_zero_or_one(&rep.hir, rep.greedy) + } + hir::RepetitionKind::ZeroOrMore => { + self.c_at_least(&rep.hir, rep.greedy, 0) + } + hir::RepetitionKind::OneOrMore => { + self.c_at_least(&rep.hir, rep.greedy, 1) + } + hir::RepetitionKind::Range(ref rng) => match *rng { + hir::RepetitionRange::Exactly(count) => { + self.c_exactly(&rep.hir, count) + } + hir::RepetitionRange::AtLeast(m) => { + self.c_at_least(&rep.hir, rep.greedy, m) + } + hir::RepetitionRange::Bounded(min, max) => { + self.c_bounded(&rep.hir, rep.greedy, min, max) + } + }, + } + } + + fn c_bounded( + &self, + expr: &Hir, + greedy: bool, + min: u32, + max: u32, + ) -> Result<ThompsonRef, Error> { + let prefix = 
self.c_exactly(expr, min)?; + if min == max { + return Ok(prefix); + } + + // It is tempting here to compile the rest here as a concatenation + // of zero-or-one matches. i.e., for `a{2,5}`, compile it as if it + // were `aaa?a?a?`. The problem here is that it leads to this program: + // + // >000000: 61 => 01 + // 000001: 61 => 02 + // 000002: union(03, 04) + // 000003: 61 => 04 + // 000004: union(05, 06) + // 000005: 61 => 06 + // 000006: union(07, 08) + // 000007: 61 => 08 + // 000008: MATCH + // + // And effectively, once you hit state 2, the epsilon closure will + // include states 3, 5, 6, 7 and 8, which is quite a bit. It is better + // to instead compile it like so: + // + // >000000: 61 => 01 + // 000001: 61 => 02 + // 000002: union(03, 08) + // 000003: 61 => 04 + // 000004: union(05, 08) + // 000005: 61 => 06 + // 000006: union(07, 08) + // 000007: 61 => 08 + // 000008: MATCH + // + // So that the epsilon closure of state 2 is now just 3 and 8. + let empty = self.add_empty()?; + let mut prev_end = prefix.end; + for _ in min..max { + let union = if greedy { + self.add_union() + } else { + self.add_reverse_union() + }?; + let compiled = self.c(expr)?; + self.patch(prev_end, union)?; + self.patch(union, compiled.start)?; + self.patch(union, empty)?; + prev_end = compiled.end; + } + self.patch(prev_end, empty)?; + Ok(ThompsonRef { start: prefix.start, end: empty }) + } + + fn c_at_least( + &self, + expr: &Hir, + greedy: bool, + n: u32, + ) -> Result<ThompsonRef, Error> { + if n == 0 { + // When the expression cannot match the empty string, then we + // can get away with something much simpler: just one 'alt' + // instruction that optionally repeats itself. But if the expr + // can match the empty string... see below. 
+ if !expr.is_match_empty() { + let union = if greedy { + self.add_union() + } else { + self.add_reverse_union() + }?; + let compiled = self.c(expr)?; + self.patch(union, compiled.start)?; + self.patch(compiled.end, union)?; + return Ok(ThompsonRef { start: union, end: union }); + } + + // What's going on here? Shouldn't x* be simpler than this? It + // turns out that when implementing leftmost-first (Perl-like) + // match semantics, x* results in an incorrect preference order + // when computing the transitive closure of states if and only if + // 'x' can match the empty string. So instead, we compile x* as + // (x+)?, which preserves the correct preference order. + // + // See: https://github.com/rust-lang/regex/issues/779 + let compiled = self.c(expr)?; + let plus = if greedy { + self.add_union() + } else { + self.add_reverse_union() + }?; + self.patch(compiled.end, plus)?; + self.patch(plus, compiled.start)?; + + let question = if greedy { + self.add_union() + } else { + self.add_reverse_union() + }?; + let empty = self.add_empty()?; + self.patch(question, compiled.start)?; + self.patch(question, empty)?; + self.patch(plus, empty)?; + Ok(ThompsonRef { start: question, end: empty }) + } else if n == 1 { + let compiled = self.c(expr)?; + let union = if greedy { + self.add_union() + } else { + self.add_reverse_union() + }?; + self.patch(compiled.end, union)?; + self.patch(union, compiled.start)?; + Ok(ThompsonRef { start: compiled.start, end: union }) + } else { + let prefix = self.c_exactly(expr, n - 1)?; + let last = self.c(expr)?; + let union = if greedy { + self.add_union() + } else { + self.add_reverse_union() + }?; + self.patch(prefix.end, last.start)?; + self.patch(last.end, union)?; + self.patch(union, last.start)?; + Ok(ThompsonRef { start: prefix.start, end: union }) + } + } + + fn c_zero_or_one( + &self, + expr: &Hir, + greedy: bool, + ) -> Result<ThompsonRef, Error> { + let union = + if greedy { self.add_union() } else { self.add_reverse_union() }?; + 
let compiled = self.c(expr)?; + let empty = self.add_empty()?; + self.patch(union, compiled.start)?; + self.patch(union, empty)?; + self.patch(compiled.end, empty)?; + Ok(ThompsonRef { start: union, end: empty }) + } + + fn c_exactly(&self, expr: &Hir, n: u32) -> Result<ThompsonRef, Error> { + let it = (0..n).map(|_| self.c(expr)); + self.c_concat(it) + } + + fn c_byte_class( + &self, + cls: &hir::ClassBytes, + ) -> Result<ThompsonRef, Error> { + let end = self.add_empty()?; + let mut trans = Vec::with_capacity(cls.ranges().len()); + for r in cls.iter() { + trans.push(Transition { + start: r.start(), + end: r.end(), + next: end, + }); + } + Ok(ThompsonRef { start: self.add_sparse(trans)?, end }) + } + + fn c_unicode_class( + &self, + cls: &hir::ClassUnicode, + ) -> Result<ThompsonRef, Error> { + // If all we have are ASCII ranges wrapped in a Unicode package, then + // there is zero reason to bring out the big guns. We can fit all ASCII + // ranges within a single sparse state. + if cls.is_all_ascii() { + let end = self.add_empty()?; + let mut trans = Vec::with_capacity(cls.ranges().len()); + for r in cls.iter() { + assert!(r.start() <= '\x7F'); + assert!(r.end() <= '\x7F'); + trans.push(Transition { + start: r.start() as u8, + end: r.end() as u8, + next: end, + }); + } + Ok(ThompsonRef { start: self.add_sparse(trans)?, end }) + } else if self.is_reverse() { + if !self.config.get_shrink() { + // When we don't want to spend the extra time shrinking, we + // compile the UTF-8 automaton in reverse using something like + // the "naive" approach, but will attempt to re-use common + // suffixes. + self.c_unicode_class_reverse_with_suffix(cls) + } else { + // When we want to shrink our NFA for reverse UTF-8 automata, + // we cannot feed UTF-8 sequences directly to the UTF-8 + // compiler, since the UTF-8 compiler requires all sequences + // to be lexicographically sorted. 
Instead, we organize our + // sequences into a range trie, which can then output our + // sequences in the correct order. Unfortunately, building the + // range trie is fairly expensive (but not nearly as expensive + // as building a DFA). Hence the reason why the 'shrink' option + // exists, so that this path can be toggled off. For example, + // we might want to turn this off if we know we won't be + // compiling a DFA. + let mut trie = self.trie_state.borrow_mut(); + trie.clear(); + + for rng in cls.iter() { + for mut seq in Utf8Sequences::new(rng.start(), rng.end()) { + seq.reverse(); + trie.insert(seq.as_slice()); + } + } + let mut utf8_state = self.utf8_state.borrow_mut(); + let mut utf8c = Utf8Compiler::new(self, &mut *utf8_state)?; + trie.iter(|seq| { + utf8c.add(&seq)?; + Ok(()) + })?; + utf8c.finish() + } + } else { + // In the forward direction, we always shrink our UTF-8 automata + // because we can stream it right into the UTF-8 compiler. There + // is almost no downside (in either memory or time) to using this + // approach. + let mut utf8_state = self.utf8_state.borrow_mut(); + let mut utf8c = Utf8Compiler::new(self, &mut *utf8_state)?; + for rng in cls.iter() { + for seq in Utf8Sequences::new(rng.start(), rng.end()) { + utf8c.add(seq.as_slice())?; + } + } + utf8c.finish() + } + + // For reference, the code below is the "naive" version of compiling a + // UTF-8 automaton. It is deliciously simple (and works for both the + // forward and reverse cases), but will unfortunately produce very + // large NFAs. When compiling a forward automaton, the size difference + // can sometimes be an order of magnitude. For example, the '\w' regex + // will generate about ~3000 NFA states using the naive approach below, + // but only 283 states when using the approach above. This is because + // the approach above actually compiles a *minimal* (or near minimal, + // because of the bounded hashmap for reusing equivalent states) UTF-8 + // automaton. 
+ // + // The code below is kept as a reference point in order to make it + // easier to understand the higher level goal here. Although, it will + // almost certainly bit-rot, so keep that in mind. + /* + let it = cls + .iter() + .flat_map(|rng| Utf8Sequences::new(rng.start(), rng.end())) + .map(|seq| { + let it = seq + .as_slice() + .iter() + .map(|rng| self.c_range(rng.start, rng.end)); + self.c_concat(it) + }); + self.c_alternation(it) + */ + } + + fn c_unicode_class_reverse_with_suffix( + &self, + cls: &hir::ClassUnicode, + ) -> Result<ThompsonRef, Error> { + // N.B. It would likely be better to cache common *prefixes* in the + // reverse direction, but it's not quite clear how to do that. The + // advantage of caching suffixes is that it does give us a win, and + // has a very small additional overhead. + let mut cache = self.utf8_suffix.borrow_mut(); + cache.clear(); + + let union = self.add_union()?; + let alt_end = self.add_empty()?; + for urng in cls.iter() { + for seq in Utf8Sequences::new(urng.start(), urng.end()) { + let mut end = alt_end; + for brng in seq.as_slice() { + let key = Utf8SuffixKey { + from: end, + start: brng.start, + end: brng.end, + }; + let hash = cache.hash(&key); + if let Some(id) = cache.get(&key, hash) { + end = id; + continue; + } + + let compiled = self.c_range(brng.start, brng.end)?; + self.patch(compiled.end, end)?; + end = compiled.start; + cache.set(key, hash, end); + } + self.patch(union, end)?; + } + } + Ok(ThompsonRef { start: union, end: alt_end }) + } + + fn c_anchor(&self, anchor: &Anchor) -> Result<ThompsonRef, Error> { + let look = match *anchor { + Anchor::StartLine => Look::StartLine, + Anchor::EndLine => Look::EndLine, + Anchor::StartText => Look::StartText, + Anchor::EndText => Look::EndText, + }; + let id = self.add_look(look)?; + Ok(ThompsonRef { start: id, end: id }) + } + + fn c_word_boundary( + &self, + wb: &WordBoundary, + ) -> Result<ThompsonRef, Error> { + let look = match *wb { + WordBoundary::Unicode => 
Look::WordBoundaryUnicode, + WordBoundary::UnicodeNegate => Look::WordBoundaryUnicodeNegate, + WordBoundary::Ascii => Look::WordBoundaryAscii, + WordBoundary::AsciiNegate => Look::WordBoundaryAsciiNegate, + }; + let id = self.add_look(look)?; + Ok(ThompsonRef { start: id, end: id }) + } + + fn c_char(&self, ch: char) -> Result<ThompsonRef, Error> { + let mut buf = [0; 4]; + let it = ch + .encode_utf8(&mut buf) + .as_bytes() + .iter() + .map(|&b| self.c_range(b, b)); + self.c_concat(it) + } + + fn c_range(&self, start: u8, end: u8) -> Result<ThompsonRef, Error> { + let id = self.add_range(start, end)?; + Ok(ThompsonRef { start: id, end: id }) + } + + fn c_empty(&self) -> Result<ThompsonRef, Error> { + let id = self.add_empty()?; + Ok(ThompsonRef { start: id, end: id }) + } + + fn c_unanchored_prefix_valid_utf8(&self) -> Result<ThompsonRef, Error> { + self.c_at_least(&Hir::any(false), false, 0) + } + + fn c_unanchored_prefix_invalid_utf8(&self) -> Result<ThompsonRef, Error> { + self.c_at_least(&Hir::any(true), false, 0) + } + + fn patch(&self, from: StateID, to: StateID) -> Result<(), Error> { + let old_memory_cstates = self.memory_cstates.get(); + match self.states.borrow_mut()[from] { + CState::Empty { ref mut next } => { + *next = to; + } + CState::Range { ref mut range } => { + range.next = to; + } + CState::Sparse { .. } => { + panic!("cannot patch from a sparse NFA state") + } + CState::Look { ref mut next, .. } => { + *next = to; + } + CState::Union { ref mut alternates } => { + alternates.push(to); + self.memory_cstates + .set(old_memory_cstates + mem::size_of::<StateID>()); + } + CState::UnionReverse { ref mut alternates } => { + alternates.push(to); + self.memory_cstates + .set(old_memory_cstates + mem::size_of::<StateID>()); + } + CState::CaptureStart { ref mut next, .. } => { + *next = to; + } + CState::CaptureEnd { ref mut next, .. } => { + *next = to; + } + CState::Match { .. 
} => {} + } + if old_memory_cstates != self.memory_cstates.get() { + self.check_nfa_size_limit()?; + } + Ok(()) + } + + fn add_empty(&self) -> Result<StateID, Error> { + self.add_state(CState::Empty { next: StateID::ZERO }) + } + + fn add_capture_start( + &self, + capture_index: u32, + name: Option<Arc<str>>, + ) -> Result<StateID, Error> { + self.add_state(CState::CaptureStart { + next: StateID::ZERO, + capture_index, + name, + }) + } + + fn add_capture_end(&self, capture_index: u32) -> Result<StateID, Error> { + self.add_state(CState::CaptureEnd { + next: StateID::ZERO, + capture_index, + }) + } + + fn add_range(&self, start: u8, end: u8) -> Result<StateID, Error> { + let trans = Transition { start, end, next: StateID::ZERO }; + self.add_state(CState::Range { range: trans }) + } + + fn add_sparse(&self, ranges: Vec<Transition>) -> Result<StateID, Error> { + if ranges.len() == 1 { + self.add_state(CState::Range { range: ranges[0] }) + } else { + self.add_state(CState::Sparse { ranges }) + } + } + + fn add_look(&self, mut look: Look) -> Result<StateID, Error> { + if self.is_reverse() { + look = look.reversed(); + } + self.add_state(CState::Look { look, next: StateID::ZERO }) + } + + fn add_union(&self) -> Result<StateID, Error> { + self.add_state(CState::Union { alternates: vec![] }) + } + + fn add_reverse_union(&self) -> Result<StateID, Error> { + self.add_state(CState::UnionReverse { alternates: vec![] }) + } + + fn add_match( + &self, + pattern_id: PatternID, + start_id: StateID, + ) -> Result<StateID, Error> { + self.add_state(CState::Match { pattern_id, start_id }) + } + + fn add_state(&self, state: CState) -> Result<StateID, Error> { + let mut states = self.states.borrow_mut(); + let id = StateID::new(states.len()) + .map_err(|_| Error::too_many_states(states.len()))?; + self.memory_cstates + .set(self.memory_cstates.get() + state.memory_usage()); + states.push(state); + // If we don't explicitly drop this, then 'nfa_memory_usage' will also + // try to borrow 
it when we check the size limit and hit an error. + drop(states); + self.check_nfa_size_limit()?; + Ok(id) + } + + fn is_reverse(&self) -> bool { + self.config.get_reverse() + } + + /// If an NFA size limit was set, this checks that the NFA compiled so far + /// fits within that limit. If so, then nothing is returned. Otherwise, an + /// error is returned. + /// + /// This should be called after increasing the heap usage of the + /// intermediate NFA. + /// + /// Note that this borrows 'self.states', so callers should ensure there is + /// no mutable borrow of it outstanding. + fn check_nfa_size_limit(&self) -> Result<(), Error> { + if let Some(limit) = self.config.get_nfa_size_limit() { + if self.nfa_memory_usage() > limit { + return Err(Error::exceeded_size_limit(limit)); + } + } + Ok(()) + } + + /// Returns the heap memory usage, in bytes, of the NFA compiled so far. + /// + /// Note that this is an approximation of how big the final NFA will be. + /// In practice, the final NFA will likely be a bit smaller since it uses + /// things like `Box<[T]>` instead of `Vec<T>`. + fn nfa_memory_usage(&self) -> usize { + self.states.borrow().len() * mem::size_of::<CState>() + + self.memory_cstates.get() + } +} + +impl CState { + fn memory_usage(&self) -> usize { + match *self { + CState::Empty { .. } + | CState::Range { .. } + | CState::Look { .. } + | CState::CaptureStart { .. } + | CState::CaptureEnd { .. } + | CState::Match { .. 
} => 0, + CState::Sparse { ref ranges } => { + ranges.len() * mem::size_of::<Transition>() + } + CState::Union { ref alternates } => { + alternates.len() * mem::size_of::<StateID>() + } + CState::UnionReverse { ref alternates } => { + alternates.len() * mem::size_of::<StateID>() + } + } + } +} + +#[derive(Debug)] +struct Utf8Compiler<'a> { + nfac: &'a Compiler, + state: &'a mut Utf8State, + target: StateID, +} + +#[derive(Clone, Debug)] +struct Utf8State { + compiled: Utf8BoundedMap, + uncompiled: Vec<Utf8Node>, +} + +#[derive(Clone, Debug)] +struct Utf8Node { + trans: Vec<Transition>, + last: Option<Utf8LastTransition>, +} + +#[derive(Clone, Debug)] +struct Utf8LastTransition { + start: u8, + end: u8, +} + +impl Utf8State { + fn new() -> Utf8State { + Utf8State { compiled: Utf8BoundedMap::new(10_000), uncompiled: vec![] } + } + + fn clear(&mut self) { + self.compiled.clear(); + self.uncompiled.clear(); + } +} + +impl<'a> Utf8Compiler<'a> { + fn new( + nfac: &'a Compiler, + state: &'a mut Utf8State, + ) -> Result<Utf8Compiler<'a>, Error> { + let target = nfac.add_empty()?; + state.clear(); + let mut utf8c = Utf8Compiler { nfac, state, target }; + utf8c.add_empty(); + Ok(utf8c) + } + + fn finish(&mut self) -> Result<ThompsonRef, Error> { + self.compile_from(0)?; + let node = self.pop_root(); + let start = self.compile(node)?; + Ok(ThompsonRef { start, end: self.target }) + } + + fn add(&mut self, ranges: &[Utf8Range]) -> Result<(), Error> { + let prefix_len = ranges + .iter() + .zip(&self.state.uncompiled) + .take_while(|&(range, node)| { + node.last.as_ref().map_or(false, |t| { + (t.start, t.end) == (range.start, range.end) + }) + }) + .count(); + assert!(prefix_len < ranges.len()); + self.compile_from(prefix_len)?; + self.add_suffix(&ranges[prefix_len..]); + Ok(()) + } + + fn compile_from(&mut self, from: usize) -> Result<(), Error> { + let mut next = self.target; + while from + 1 < self.state.uncompiled.len() { + let node = self.pop_freeze(next); + next = 
self.compile(node)?; + } + self.top_last_freeze(next); + Ok(()) + } + + fn compile(&mut self, node: Vec<Transition>) -> Result<StateID, Error> { + let hash = self.state.compiled.hash(&node); + if let Some(id) = self.state.compiled.get(&node, hash) { + return Ok(id); + } + let id = self.nfac.add_sparse(node.clone())?; + self.state.compiled.set(node, hash, id); + Ok(id) + } + + fn add_suffix(&mut self, ranges: &[Utf8Range]) { + assert!(!ranges.is_empty()); + let last = self + .state + .uncompiled + .len() + .checked_sub(1) + .expect("non-empty nodes"); + assert!(self.state.uncompiled[last].last.is_none()); + self.state.uncompiled[last].last = Some(Utf8LastTransition { + start: ranges[0].start, + end: ranges[0].end, + }); + for r in &ranges[1..] { + self.state.uncompiled.push(Utf8Node { + trans: vec![], + last: Some(Utf8LastTransition { start: r.start, end: r.end }), + }); + } + } + + fn add_empty(&mut self) { + self.state.uncompiled.push(Utf8Node { trans: vec![], last: None }); + } + + fn pop_freeze(&mut self, next: StateID) -> Vec<Transition> { + let mut uncompiled = self.state.uncompiled.pop().unwrap(); + uncompiled.set_last_transition(next); + uncompiled.trans + } + + fn pop_root(&mut self) -> Vec<Transition> { + assert_eq!(self.state.uncompiled.len(), 1); + assert!(self.state.uncompiled[0].last.is_none()); + self.state.uncompiled.pop().expect("non-empty nodes").trans + } + + fn top_last_freeze(&mut self, next: StateID) { + let last = self + .state + .uncompiled + .len() + .checked_sub(1) + .expect("non-empty nodes"); + self.state.uncompiled[last].set_last_transition(next); + } +} + +impl Utf8Node { + fn set_last_transition(&mut self, next: StateID) { + if let Some(last) = self.last.take() { + self.trans.push(Transition { + start: last.start, + end: last.end, + next, + }); + } + } +} + +#[cfg(test)] +mod tests { + use alloc::vec::Vec; + + use super::{ + Builder, Config, PatternID, SparseTransitions, State, StateID, + Transition, NFA, + }; + + fn build(pattern: 
&str) -> NFA { + Builder::new() + .configure(Config::new().captures(false).unanchored_prefix(false)) + .build(pattern) + .unwrap() + } + + fn pid(id: usize) -> PatternID { + PatternID::new(id).unwrap() + } + + fn sid(id: usize) -> StateID { + StateID::new(id).unwrap() + } + + fn s_byte(byte: u8, next: usize) -> State { + let next = sid(next); + let trans = Transition { start: byte, end: byte, next }; + State::Range { range: trans } + } + + fn s_range(start: u8, end: u8, next: usize) -> State { + let next = sid(next); + let trans = Transition { start, end, next }; + State::Range { range: trans } + } + + fn s_sparse(ranges: &[(u8, u8, usize)]) -> State { + let ranges = ranges + .iter() + .map(|&(start, end, next)| Transition { + start, + end, + next: sid(next), + }) + .collect(); + State::Sparse(SparseTransitions { ranges }) + } + + fn s_union(alts: &[usize]) -> State { + State::Union { + alternates: alts + .iter() + .map(|&id| sid(id)) + .collect::<Vec<StateID>>() + .into_boxed_slice(), + } + } + + fn s_match(id: usize) -> State { + State::Match { id: pid(id) } + } + + // Test that building an unanchored NFA has an appropriate `(?s:.)*?` + // prefix. + #[test] + fn compile_unanchored_prefix() { + // When the machine can only match valid UTF-8. + let nfa = Builder::new() + .configure(Config::new().captures(false)) + .build(r"a") + .unwrap(); + // There should be many states since the `.` in `(?s:.)*?` matches any + // Unicode scalar value. + assert_eq!(11, nfa.len()); + assert_eq!(nfa.states[10], s_match(0)); + assert_eq!(nfa.states[9], s_byte(b'a', 10)); + + // When the machine can match through invalid UTF-8. 
+ let nfa = Builder::new() + .configure(Config::new().captures(false).utf8(false)) + .build(r"a") + .unwrap(); + assert_eq!( + nfa.states, + &[ + s_union(&[2, 1]), + s_range(0, 255, 0), + s_byte(b'a', 3), + s_match(0), + ] + ); + } + + #[test] + fn compile_empty() { + assert_eq!(build("").states, &[s_match(0),]); + } + + #[test] + fn compile_literal() { + assert_eq!(build("a").states, &[s_byte(b'a', 1), s_match(0),]); + assert_eq!( + build("ab").states, + &[s_byte(b'a', 1), s_byte(b'b', 2), s_match(0),] + ); + assert_eq!( + build("☃").states, + &[s_byte(0xE2, 1), s_byte(0x98, 2), s_byte(0x83, 3), s_match(0)] + ); + + // Check that non-UTF-8 literals work. + let nfa = Builder::new() + .configure( + Config::new() + .captures(false) + .utf8(false) + .unanchored_prefix(false), + ) + .syntax(crate::SyntaxConfig::new().utf8(false)) + .build(r"(?-u)\xFF") + .unwrap(); + assert_eq!(nfa.states, &[s_byte(b'\xFF', 1), s_match(0),]); + } + + #[test] + fn compile_class() { + assert_eq!( + build(r"[a-z]").states, + &[s_range(b'a', b'z', 1), s_match(0),] + ); + assert_eq!( + build(r"[x-za-c]").states, + &[s_sparse(&[(b'a', b'c', 1), (b'x', b'z', 1)]), s_match(0)] + ); + assert_eq!( + build(r"[\u03B1-\u03B4]").states, + &[s_range(0xB1, 0xB4, 2), s_byte(0xCE, 0), s_match(0)] + ); + assert_eq!( + build(r"[\u03B1-\u03B4\u{1F919}-\u{1F91E}]").states, + &[ + s_range(0xB1, 0xB4, 5), + s_range(0x99, 0x9E, 5), + s_byte(0xA4, 1), + s_byte(0x9F, 2), + s_sparse(&[(0xCE, 0xCE, 0), (0xF0, 0xF0, 3)]), + s_match(0), + ] + ); + assert_eq!( + build(r"[a-z☃]").states, + &[ + s_byte(0x83, 3), + s_byte(0x98, 0), + s_sparse(&[(b'a', b'z', 3), (0xE2, 0xE2, 1)]), + s_match(0), + ] + ); + } + + #[test] + fn compile_repetition() { + assert_eq!( + build(r"a?").states, + &[s_union(&[1, 2]), s_byte(b'a', 2), s_match(0),] + ); + assert_eq!( + build(r"a??").states, + &[s_union(&[2, 1]), s_byte(b'a', 2), s_match(0),] + ); + } + + #[test] + fn compile_group() { + assert_eq!( + build(r"ab+").states, + 
&[s_byte(b'a', 1), s_byte(b'b', 2), s_union(&[1, 3]), s_match(0)]
+        );
+        assert_eq!(
+            build(r"(ab)").states,
+            &[s_byte(b'a', 1), s_byte(b'b', 2), s_match(0)]
+        );
+        assert_eq!(
+            build(r"(ab)+").states,
+            &[s_byte(b'a', 1), s_byte(b'b', 2), s_union(&[0, 3]), s_match(0)]
+        );
+    }
+
+    #[test]
+    fn compile_alternation() {
+        assert_eq!(
+            build(r"a|b").states,
+            &[s_byte(b'a', 3), s_byte(b'b', 3), s_union(&[0, 1]), s_match(0)]
+        );
+        assert_eq!(
+            build(r"|b").states,
+            &[s_byte(b'b', 2), s_union(&[2, 0]), s_match(0)]
+        );
+        assert_eq!(
+            build(r"a|").states,
+            &[s_byte(b'a', 2), s_union(&[0, 2]), s_match(0)]
+        );
+    }
+
+    #[test]
+    fn many_start_pattern() {
+        let nfa = Builder::new()
+            .configure(Config::new().captures(false).unanchored_prefix(false))
+            .build_many(&["a", "b"])
+            .unwrap();
+        assert_eq!(
+            nfa.states,
+            &[
+                s_byte(b'a', 1),
+                s_match(0),
+                s_byte(b'b', 3),
+                s_match(1),
+                s_union(&[0, 2]),
+            ]
+        );
+        assert_eq!(nfa.start_anchored().as_usize(), 4);
+        assert_eq!(nfa.start_unanchored().as_usize(), 4);
+        // Test that the start states for each individual pattern are correct.
+        assert_eq!(nfa.start_pattern(pid(0)), sid(0));
+        assert_eq!(nfa.start_pattern(pid(1)), sid(2));
+    }
+}
diff --git a/vendor/regex-automata/src/nfa/thompson/error.rs b/vendor/regex-automata/src/nfa/thompson/error.rs
new file mode 100644
index 000000000..52f02e888
--- /dev/null
+++ b/vendor/regex-automata/src/nfa/thompson/error.rs
@@ -0,0 +1,145 @@
+use crate::util::id::{PatternID, StateID};
+
+/// An error that can occur during the construction of a thompson NFA.
+///
+/// This error does not provide many introspection capabilities. There are
+/// generally only two things you can do with it:
+///
+/// * Obtain a human readable message via its `std::fmt::Display` impl.
+/// * Access an underlying [`regex_syntax::Error`] type from its `source`
+/// method via the `std::error::Error` trait.
This error only occurs when using
+/// convenience routines for building an NFA directly from a pattern string.
+///
+/// Otherwise, errors typically occur when a limit has been breached. For
+/// example, if the total heap usage of the compiled NFA exceeds the limit
+/// set by [`Config::nfa_size_limit`](crate::nfa::thompson::Config), then
+/// building the NFA will fail.
+#[derive(Clone, Debug)]
+pub struct Error {
+    kind: ErrorKind,
+}
+
+/// The kind of error that occurred during the construction of a thompson NFA.
+#[derive(Clone, Debug)]
+enum ErrorKind {
+    /// An error that occurred while parsing a regular expression. Note that
+    /// this error may be printed over multiple lines, and is generally
+    /// intended to be end user readable on its own.
+    Syntax(regex_syntax::Error),
+    /// An error that occurs if too many patterns were given to the NFA
+    /// compiler.
+    TooManyPatterns {
+        /// The number of patterns given, which exceeds the limit.
+        given: usize,
+        /// The limit on the number of patterns.
+        limit: usize,
+    },
+    /// An error that occurs if too many states are produced while building an NFA.
+    TooManyStates {
+        /// The minimum number of states that are desired, which exceeds the
+        /// limit.
+        given: usize,
+        /// The limit on the number of states.
+        limit: usize,
+    },
+    /// An error that occurs when NFA compilation exceeds a configured heap
+    /// limit.
+    ExceededSizeLimit {
+        /// The configured limit, in bytes.
+        limit: usize,
+    },
+    /// An error that occurs when an invalid capture group index is added to
+    /// the NFA. An "invalid" index can be one that is too big (e.g., results
+    /// in an integer overflow) or one that is discontinuous from previous
+    /// capture group indices added.
+    InvalidCaptureIndex {
+        /// The invalid index that was given.
+        index: usize,
+    },
+    /// An error that occurs when an NFA contains a Unicode word boundary, but
+    /// where the crate was compiled without the necessary data for dealing
+    /// with Unicode word boundaries.
+ UnicodeWordUnavailable, +} + +impl Error { + fn kind(&self) -> &ErrorKind { + &self.kind + } + + pub(crate) fn syntax(err: regex_syntax::Error) -> Error { + Error { kind: ErrorKind::Syntax(err) } + } + + pub(crate) fn too_many_patterns(given: usize) -> Error { + let limit = PatternID::LIMIT; + Error { kind: ErrorKind::TooManyPatterns { given, limit } } + } + + pub(crate) fn too_many_states(given: usize) -> Error { + let limit = StateID::LIMIT; + Error { kind: ErrorKind::TooManyStates { given, limit } } + } + + pub(crate) fn exceeded_size_limit(limit: usize) -> Error { + Error { kind: ErrorKind::ExceededSizeLimit { limit } } + } + + pub(crate) fn invalid_capture_index(index: usize) -> Error { + Error { kind: ErrorKind::InvalidCaptureIndex { index } } + } + + pub(crate) fn unicode_word_unavailable() -> Error { + Error { kind: ErrorKind::UnicodeWordUnavailable } + } +} + +#[cfg(feature = "std")] +impl std::error::Error for Error { + fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { + match self.kind() { + ErrorKind::Syntax(ref err) => Some(err), + ErrorKind::TooManyPatterns { .. } => None, + ErrorKind::TooManyStates { .. } => None, + ErrorKind::ExceededSizeLimit { .. } => None, + ErrorKind::InvalidCaptureIndex { .. 
} => None, + ErrorKind::UnicodeWordUnavailable => None, + } + } +} + +impl core::fmt::Display for Error { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + match self.kind() { + ErrorKind::Syntax(_) => write!(f, "error parsing regex"), + ErrorKind::TooManyPatterns { given, limit } => write!( + f, + "attemped to compile {} patterns, \ + which exceeds the limit of {}", + given, limit, + ), + ErrorKind::TooManyStates { given, limit } => write!( + f, + "attemped to compile {} NFA states, \ + which exceeds the limit of {}", + given, limit, + ), + ErrorKind::ExceededSizeLimit { limit } => write!( + f, + "heap usage during NFA compilation exceeded limit of {}", + limit, + ), + ErrorKind::InvalidCaptureIndex { index } => write!( + f, + "capture group index {} is invalid (too big or discontinuous)", + index, + ), + ErrorKind::UnicodeWordUnavailable => write!( + f, + "crate has been compiled without Unicode word boundary \ + support, but the NFA contains Unicode word boundary \ + assertions", + ), + } + } +} diff --git a/vendor/regex-automata/src/nfa/map.rs b/vendor/regex-automata/src/nfa/thompson/map.rs index e636c0dd3..79ff63ca3 100644 --- a/vendor/regex-automata/src/nfa/map.rs +++ b/vendor/regex-automata/src/nfa/thompson/map.rs @@ -7,10 +7,10 @@ // These maps are used in some fairly hot code when generating NFA states for // large Unicode character classes. // -// Instead of exposing a rich hashmap entry API, we just permit the caller -// to produce a hash of the key directly. The hash can then be reused for both -// lookups and insertions at the cost of leaking things a bit. But these are -// for internal use only, so it's fine. +// Instead of exposing a rich hashmap entry API, we just permit the caller to +// produce a hash of the key directly. The hash can then be reused for both +// lookups and insertions at the cost of leaking abstraction a bit. But these +// are for internal use only, so it's fine. 
// // The Utf8BoundedMap is used for Daciuk's algorithm for constructing a // (almost) minimal DFA for large Unicode character classes in linear time. @@ -33,7 +33,9 @@ // could make one generic map, but the machinery didn't seem worth it. They // are simple enough. -use nfa::{StateID, Transition}; +use alloc::{vec, vec::Vec}; + +use crate::{nfa::thompson::Transition, util::id::StateID}; // Basic FNV-1a hash constants as described in: // https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function @@ -57,7 +59,7 @@ const INIT: u64 = 14695981039346656037; /// Specifically, one could observe the difference with std's hashmap via /// something like the following benchmark: /// -/// hyperfine "regex-automata-debug debug -acqr '\w{40} ecurB'" +/// hyperfine "regex-cli debug nfa thompson --quiet --reverse '\w{90} ecurB'" /// /// But to observe that difference, you'd have to modify the code to use /// std's hashmap. @@ -74,6 +76,9 @@ pub struct Utf8BoundedMap { /// The current version of this map. Only entries with matching versions /// are considered during lookups. If an entry is found with a mismatched /// version, then the map behaves as if the entry does not exist. + /// + /// This makes it possible to clear the map by simply incrementing the + /// version number instead of actually deallocating any storage. version: u16, /// The total number of entries this map can store. capacity: usize, @@ -119,6 +124,9 @@ impl Utf8BoundedMap { self.map = vec![Utf8BoundedEntry::default(); self.capacity]; } else { self.version = self.version.wrapping_add(1); + // If we loop back to version 0, then we forcefully clear the + // entire map. Otherwise, it might be possible to incorrectly + // match entries used to generate other NFAs. 
if self.version == 0 { self.map = vec![Utf8BoundedEntry::default(); self.capacity]; } @@ -131,7 +139,7 @@ impl Utf8BoundedMap { for t in key { h = (h ^ (t.start as u64)).wrapping_mul(PRIME); h = (h ^ (t.end as u64)).wrapping_mul(PRIME); - h = (h ^ (t.next as u64)).wrapping_mul(PRIME); + h = (h ^ (t.next.as_usize() as u64)).wrapping_mul(PRIME); } (h as usize) % self.map.len() } @@ -244,7 +252,7 @@ impl Utf8SuffixMap { const INIT: u64 = 14695981039346656037; let mut h = INIT; - h = (h ^ (key.from as u64)).wrapping_mul(PRIME); + h = (h ^ (key.from.as_usize() as u64)).wrapping_mul(PRIME); h = (h ^ (key.start as u64)).wrapping_mul(PRIME); h = (h ^ (key.end as u64)).wrapping_mul(PRIME); (h as usize) % self.map.len() diff --git a/vendor/regex-automata/src/nfa/thompson/mod.rs b/vendor/regex-automata/src/nfa/thompson/mod.rs new file mode 100644 index 000000000..88a438e8e --- /dev/null +++ b/vendor/regex-automata/src/nfa/thompson/mod.rs @@ -0,0 +1,1555 @@ +use core::{convert::TryFrom, fmt, mem, ops::Range}; + +use alloc::{boxed::Box, format, string::String, sync::Arc, vec, vec::Vec}; + +use crate::util::{ + alphabet::{self, ByteClassSet}, + decode_last_utf8, decode_utf8, + id::{IteratorIDExt, PatternID, PatternIDIter, StateID}, + is_word_byte, is_word_char_fwd, is_word_char_rev, +}; + +pub use self::{ + compiler::{Builder, Config}, + error::Error, +}; + +mod compiler; +mod error; +mod map; +pub mod pikevm; +mod range_trie; + +/// A map from capture group name to its corresponding capture index. +/// +/// Since there are always two slots for each capture index, the pair of slots +/// corresponding to the capture index for a pattern ID of 0 are indexed at +/// `map["<name>"] * 2` and `map["<name>"] * 2 + 1`. +/// +/// This type is actually wrapped inside a Vec indexed by pattern ID on the +/// NFA, since multiple patterns may have the same capture group name. 
+///
+/// Note that this is somewhat of a sub-optimal representation, since it
+/// requires a hashmap for each pattern. A better representation would be
+/// HashMap<(PatternID, Arc<str>), usize>, but this makes it difficult to look
+/// up a capture index by name without producing a `Arc<str>`, which requires
+/// an allocation. To fix this, I think we'd need to define our own unsized
+/// type or something?
+#[cfg(feature = "std")]
+type CaptureNameMap = std::collections::HashMap<Arc<str>, usize>;
+#[cfg(not(feature = "std"))]
+type CaptureNameMap = alloc::collections::BTreeMap<Arc<str>, usize>;
+
+// The NFA API below is not something I'm terribly proud of at the moment. In
+// particular, it supports both mutating the NFA and actually using the NFA to
+// perform a search. I think combining these two things muddies the waters a
+// bit too much.
+//
+// I think the issue is that I saw the compiler as the 'builder,' and where
+// the compiler had the ability to manipulate the internal state of the NFA.
+// However, one of my goals was to make it possible for others to build their
+// own NFAs in a way that is *not* coupled to the regex-syntax crate.
+//
+// So I think really, there should be an NFA, a NFABuilder and then the
+// internal compiler which uses the NFABuilder API to build an NFA. Alas, at
+// the time of writing, I kind of ran out of steam.
+
+/// A fully compiled Thompson NFA.
+///
+/// The states of the NFA are indexed by state IDs, which are how transitions
+/// are expressed.
+#[derive(Clone)]
+pub struct NFA {
+    /// The state list. This list is guaranteed to be indexable by all starting
+    /// state IDs, and it is also guaranteed to contain at most one `Match`
+    /// state for each pattern compiled into this NFA. (A pattern may not have
+    /// a corresponding `Match` state if a `Match` state is impossible to
+    /// reach.)
+    states: Vec<State>,
+    /// The anchored starting state of this NFA.
+    start_anchored: StateID,
+    /// The unanchored starting state of this NFA.
+    start_unanchored: StateID,
+    /// The starting states for each individual pattern. Starting at any
+    /// of these states will result in only an anchored search for the
+    /// corresponding pattern. The vec is indexed by pattern ID. When the NFA
+    /// contains a single regex, then `start_pattern[0]` and `start_anchored`
+    /// are always equivalent.
+    start_pattern: Vec<StateID>,
+    /// A map from PatternID to its corresponding range of capture slots. Each
+    /// range is guaranteed to be contiguous with the previous range. The
+    /// end of the last range corresponds to the total number of slots needed
+    /// for this NFA.
+    patterns_to_slots: Vec<Range<usize>>,
+    /// A map from capture name to its corresponding index. So e.g., given
+    /// a single regex like '(\w+) (\w+) (?P<word>\w+)', the capture name
+    /// 'word' for pattern ID=0 would correspond to the index '3'. Its
+    /// corresponding slots would then be '3 * 2 = 6' and '3 * 2 + 1 = 7'.
+    capture_name_to_index: Vec<CaptureNameMap>,
+    /// A map from pattern ID to capture group index to name, if one exists.
+    /// This is effectively the inverse of 'capture_name_to_index'. The outer
+    /// vec is indexed by pattern ID, while the inner vec is indexed by capture
+    /// index offset for the corresponding pattern.
+    ///
+    /// The first capture group for each pattern is always unnamed and is thus
+    /// always None.
+    capture_index_to_name: Vec<Vec<Option<Arc<str>>>>,
+    /// A representation of equivalence classes over the transitions in this
+    /// NFA. Two bytes in the same equivalence class must not discriminate
+    /// between a match or a non-match. This map can be used to shrink the
+    /// total size of a DFA's transition table with a small match-time cost.
+    ///
+    /// Note that the NFA's transitions are *not* defined in terms of these
+    /// equivalence classes. The NFA's transitions are defined on the original
+    /// byte values.
For the most part, this is because they wouldn't really
+    /// help the NFA much since the NFA already uses a sparse representation
+    /// to represent transitions. Byte classes are most effective in a dense
+    /// representation.
+    byte_class_set: ByteClassSet,
+    /// Various facts about this NFA, which can be used to improve failure
+    /// modes (e.g., rejecting DFA construction if an NFA has Unicode word
+    /// boundaries) or for performing optimizations (avoiding an increase in
+    /// states if there are no look-around states).
+    facts: Facts,
+    /// Heap memory used indirectly by NFA states. Since each state might use a
+    /// different amount of heap, we need to keep track of this incrementally.
+    memory_states: usize,
+}
+
+impl NFA {
+    pub fn config() -> Config {
+        Config::new()
+    }
+
+    pub fn builder() -> Builder {
+        Builder::new()
+    }
+
+    /// Returns an NFA with no states. Its match semantics are unspecified.
+    ///
+    /// An empty NFA is useful as a starting point for building one. It is
+    /// itself not intended to be used for matching. For example, its starting
+    /// state identifiers are configured to be `0`, but since it has no states,
+    /// the identifiers are invalid.
+    ///
+    /// If you need an NFA that never matches anything and can be correctly
+    /// used for matching, use [`NFA::never_match`].
+    #[inline]
+    pub fn empty() -> NFA {
+        NFA {
+            states: vec![],
+            start_anchored: StateID::ZERO,
+            start_unanchored: StateID::ZERO,
+            start_pattern: vec![],
+            patterns_to_slots: vec![],
+            capture_name_to_index: vec![],
+            capture_index_to_name: vec![],
+            byte_class_set: ByteClassSet::empty(),
+            facts: Facts::default(),
+            memory_states: 0,
+        }
+    }
+
+    /// Returns an NFA with a single regex that always matches at every
+    /// position.
+    #[inline]
+    pub fn always_match() -> NFA {
+        let mut nfa = NFA::empty();
+        // Since we're only adding one pattern, these are guaranteed to work.
+ let start = nfa.add_match().unwrap(); + assert_eq!(start.as_usize(), 0); + let pid = nfa.finish_pattern(start).unwrap(); + assert_eq!(pid.as_usize(), 0); + nfa + } + + /// Returns an NFA that never matches at any position. It contains no + /// regexes. + #[inline] + pub fn never_match() -> NFA { + let mut nfa = NFA::empty(); + // Since we're only adding one state, this can never fail. + nfa.add_fail().unwrap(); + nfa + } + + /// Return the number of states in this NFA. + /// + /// This is guaranteed to be no bigger than [`StateID::LIMIT`]. + #[inline] + pub fn len(&self) -> usize { + self.states.len() + } + + /// Returns the total number of distinct match states in this NFA. + /// Stated differently, this returns the total number of regex patterns + /// used to build this NFA. + /// + /// This may return zero if the NFA was constructed with no patterns. In + /// this case, and only this case, the NFA can never produce a match for + /// any input. + /// + /// This is guaranteed to be no bigger than [`PatternID::LIMIT`]. + #[inline] + pub fn pattern_len(&self) -> usize { + self.start_pattern.len() + } + + /// Returns the pattern ID of the pattern currently being compiled by this + /// NFA. + fn current_pattern_id(&self) -> PatternID { + // This always works because we never permit more patterns in + // 'start_pattern' than can be addressed by PatternID. Also, we only + // add a new entry to 'start_pattern' once we finish compiling a + // pattern. Thus, the length refers to the ID of the current pattern + // being compiled. + PatternID::new(self.start_pattern.len()).unwrap() + } + + /// Returns the total number of capturing groups in this NFA. + /// + /// This includes the special 0th capture group that is always present and + /// captures the start and end offset of the entire match. + /// + /// This is a convenience routine for `nfa.capture_slot_len() / 2`. 
+ #[inline] + pub fn capture_len(&self) -> usize { + let slots = self.capture_slot_len(); + // This assert is guaranteed to pass since the NFA construction process + // guarantees that it is always true. + assert_eq!(slots % 2, 0, "capture slots must be divisible by 2"); + slots / 2 + } + + /// Returns the total number of capturing slots in this NFA. + /// + /// This value is guaranteed to be a multiple of 2. (Where each capturing + /// group has precisely two capturing slots in the NFA.) + #[inline] + pub fn capture_slot_len(&self) -> usize { + self.patterns_to_slots.last().map_or(0, |r| r.end) + } + + /// Return a range of capture slots for the given pattern. + /// + /// The range returned is guaranteed to be contiguous with ranges for + /// adjacent patterns. + /// + /// This panics if the given pattern ID is greater than or equal to the + /// number of patterns in this NFA. + #[inline] + pub fn pattern_slots(&self, pid: PatternID) -> Range<usize> { + self.patterns_to_slots[pid].clone() + } + + /// Return the capture group index corresponding to the given name in the + /// given pattern. If no such capture group name exists in the given + /// pattern, then this returns `None`. + /// + /// If the given pattern ID is invalid, then this panics. + #[inline] + pub fn capture_name_to_index( + &self, + pid: PatternID, + name: &str, + ) -> Option<usize> { + assert!(pid.as_usize() < self.pattern_len(), "invalid pattern ID"); + self.capture_name_to_index[pid].get(name).cloned() + } + + // TODO: add iterators over capture group names. + // Do we also permit indexing? + + /// Returns an iterator over all pattern IDs in this NFA. + #[inline] + pub fn patterns(&self) -> PatternIter { + PatternIter { + it: PatternID::iter(self.pattern_len()), + _marker: core::marker::PhantomData, + } + } + + /// Return the ID of the initial anchored state of this NFA. 
+ #[inline] + pub fn start_anchored(&self) -> StateID { + self.start_anchored + } + + /// Set the anchored starting state ID for this NFA. + #[inline] + pub fn set_start_anchored(&mut self, id: StateID) { + self.start_anchored = id; + } + + /// Return the ID of the initial unanchored state of this NFA. + #[inline] + pub fn start_unanchored(&self) -> StateID { + self.start_unanchored + } + + /// Set the unanchored starting state ID for this NFA. + #[inline] + pub fn set_start_unanchored(&mut self, id: StateID) { + self.start_unanchored = id; + } + + /// Return the ID of the initial anchored state for the given pattern. + /// + /// If the pattern doesn't exist in this NFA, then this panics. + #[inline] + pub fn start_pattern(&self, pid: PatternID) -> StateID { + self.start_pattern[pid] + } + + /// Get the byte class set for this NFA. + #[inline] + pub fn byte_class_set(&self) -> &ByteClassSet { + &self.byte_class_set + } + + /// Return a reference to the NFA state corresponding to the given ID. + #[inline] + pub fn state(&self, id: StateID) -> &State { + &self.states[id] + } + + /// Returns a slice of all states in this NFA. + /// + /// The slice returned may be indexed by a `StateID` generated by `add`. + #[inline] + pub fn states(&self) -> &[State] { + &self.states + } + + #[inline] + pub fn is_always_start_anchored(&self) -> bool { + self.start_anchored() == self.start_unanchored() + } + + #[inline] + pub fn has_any_look(&self) -> bool { + self.facts.has_any_look() + } + + #[inline] + pub fn has_any_anchor(&self) -> bool { + self.facts.has_any_anchor() + } + + #[inline] + pub fn has_word_boundary(&self) -> bool { + self.has_word_boundary_unicode() || self.has_word_boundary_ascii() + } + + #[inline] + pub fn has_word_boundary_unicode(&self) -> bool { + self.facts.has_word_boundary_unicode() + } + + #[inline] + pub fn has_word_boundary_ascii(&self) -> bool { + self.facts.has_word_boundary_ascii() + } + + /// Returns the memory usage, in bytes, of this NFA. 
+ /// + /// This does **not** include the stack size used up by this NFA. To + /// compute that, use `std::mem::size_of::<NFA>()`. + #[inline] + pub fn memory_usage(&self) -> usize { + self.states.len() * mem::size_of::<State>() + + self.memory_states + + self.start_pattern.len() * mem::size_of::<StateID>() + } + + // Why do we define a bunch of 'add_*' routines below instead of just + // defining a single 'add' routine that accepts a 'State'? Indeed, for most + // of the 'add_*' routines below, such a simple API would be more than + // appropriate. Unfortunately, adding capture states and, to a lesser + // extent, match states, is a bit more complex. Namely, when we add a + // capture state, we *really* want to know the corresponding capture + // group's name and index and what not, so that we can update other state + // inside this NFA. But, e.g., the capture group name is not and should + // not be included in 'State::Capture'. So what are our choices? + // + // 1) Define one 'add' and require some additional optional parameters. + // This feels quite ugly, and adds unnecessary complexity to more common + // and simpler cases. + // + // 2) Do what we do below. The sad thing is that our API is bigger with + // more methods. But each method is very specific and hopefully simple. + // + // 3) Define a new enum, say, 'StateWithInfo', or something that permits + // providing both a State and some extra ancillary info in some cases. This + // doesn't seem too bad to me, but seems slightly worse than (2) because of + // the additional type required. + // + // 4) Abandon the idea that we have to specify things like the capture + // group name when we add the Capture state to the NFA. We would then need + // to add other methods that permit the caller to add this additional state + // "out of band." 
Other than it introducing some additional complexity, I + // decided against this because I wanted the NFA builder API to make it + // as hard as possible to build a bad or invalid NFA. Using the approach + // below, as you'll see, permits us to do a lot of strict checking of our + // inputs and return an error if we see something we don't expect. + + pub fn add_range(&mut self, range: Transition) -> Result<StateID, Error> { + self.byte_class_set.set_range(range.start, range.end); + self.add_state(State::Range { range }) + } + + pub fn add_sparse( + &mut self, + sparse: SparseTransitions, + ) -> Result<StateID, Error> { + for range in sparse.ranges.iter() { + self.byte_class_set.set_range(range.start, range.end); + } + self.add_state(State::Sparse(sparse)) + } + + pub fn add_look( + &mut self, + next: StateID, + look: Look, + ) -> Result<StateID, Error> { + self.facts.set_has_any_look(true); + look.add_to_byteset(&mut self.byte_class_set); + match look { + Look::StartLine + | Look::EndLine + | Look::StartText + | Look::EndText => { + self.facts.set_has_any_anchor(true); + } + Look::WordBoundaryUnicode | Look::WordBoundaryUnicodeNegate => { + self.facts.set_has_word_boundary_unicode(true); + } + Look::WordBoundaryAscii | Look::WordBoundaryAsciiNegate => { + self.facts.set_has_word_boundary_ascii(true); + } + } + self.add_state(State::Look { look, next }) + } + + pub fn add_union( + &mut self, + alternates: Box<[StateID]>, + ) -> Result<StateID, Error> { + self.add_state(State::Union { alternates }) + } + + pub fn add_capture_start( + &mut self, + next_id: StateID, + capture_index: u32, + name: Option<Arc<str>>, + ) -> Result<StateID, Error> { + let pid = self.current_pattern_id(); + let capture_index = match usize::try_from(capture_index) { + Err(_) => { + return Err(Error::invalid_capture_index(core::usize::MAX)) + } + Ok(capture_index) => capture_index, + }; + // Do arithmetic to find our absolute slot index first, to make sure + // the index is at least possibly 
valid (doesn't overflow). + let relative_slot = match capture_index.checked_mul(2) { + Some(relative_slot) => relative_slot, + None => return Err(Error::invalid_capture_index(capture_index)), + }; + let slot = match relative_slot.checked_add(self.capture_slot_len()) { + Some(slot) => slot, + None => return Err(Error::invalid_capture_index(capture_index)), + }; + // Make sure we have space to insert our (pid,index)|-->name mapping. + if pid.as_usize() >= self.capture_index_to_name.len() { + // Note that we require that if you're adding capturing groups, + // then there must be at least one capturing group per pattern. + // Moreover, whenever we expand our space here, it should always + // first be for the first capture group (at index==0). + if pid.as_usize() > self.capture_index_to_name.len() + || capture_index > 0 + { + return Err(Error::invalid_capture_index(capture_index)); + } + self.capture_name_to_index.push(CaptureNameMap::new()); + self.capture_index_to_name.push(vec![]); + } + if capture_index >= self.capture_index_to_name[pid].len() { + // We require that capturing groups are added in correspondence + // to their index. So no discontinuous indices. This is likely + // overly strict, but also makes it simpler to provide guarantees + // about our capturing group data. 
+            if capture_index > self.capture_index_to_name[pid].len() {
+                return Err(Error::invalid_capture_index(capture_index));
+            }
+            self.capture_index_to_name[pid].push(None);
+        }
+        if let Some(ref name) = name {
+            self.capture_name_to_index[pid]
+                .insert(Arc::clone(name), capture_index);
+        }
+        self.capture_index_to_name[pid][capture_index] = name;
+        self.add_state(State::Capture { next: next_id, slot })
+    }
+
+    pub fn add_capture_end(
+        &mut self,
+        next_id: StateID,
+        capture_index: u32,
+    ) -> Result<StateID, Error> {
+        let pid = self.current_pattern_id();
+        let capture_index = match usize::try_from(capture_index) {
+            Err(_) => {
+                return Err(Error::invalid_capture_index(core::usize::MAX))
+            }
+            Ok(capture_index) => capture_index,
+        };
+        // If we haven't already added this capture group via a corresponding
+        // 'add_capture_start' call, then we consider the index given to be
+        // invalid.
+        if pid.as_usize() >= self.capture_index_to_name.len()
+            || capture_index >= self.capture_index_to_name[pid].len()
+        {
+            return Err(Error::invalid_capture_index(capture_index));
+        }
+        // Since we've already confirmed that this capture index is valid
+        // and has a corresponding starting slot, we know the multiplication
+        // has already been done and succeeded.
+        let relative_slot_start = capture_index.checked_mul(2).unwrap();
+        let relative_slot = match relative_slot_start.checked_add(1) {
+            Some(relative_slot) => relative_slot,
+            None => return Err(Error::invalid_capture_index(capture_index)),
+        };
+        let slot = match relative_slot.checked_add(self.capture_slot_len()) {
+            Some(slot) => slot,
+            None => return Err(Error::invalid_capture_index(capture_index)),
+        };
+        self.add_state(State::Capture { next: next_id, slot })
+    }
+
+    pub fn add_fail(&mut self) -> Result<StateID, Error> {
+        self.add_state(State::Fail)
+    }
+
+    /// Add a new match state to this NFA and return its state ID.
+ pub fn add_match(&mut self) -> Result<StateID, Error> { + let pattern_id = self.current_pattern_id(); + let sid = self.add_state(State::Match { id: pattern_id })?; + Ok(sid) + } + + /// Finish compiling the current pattern and return its identifier. The + /// given ID should be the state ID corresponding to the anchored starting + /// state for matching this pattern. + pub fn finish_pattern( + &mut self, + start_id: StateID, + ) -> Result<PatternID, Error> { + // We've gotta make sure that we never permit the user to add more + // patterns than we can identify. So if we're already at the limit, + // then return an error. This is somewhat non-ideal since this won't + // result in an error until trying to complete the compilation of a + // pattern instead of starting it. + if self.start_pattern.len() >= PatternID::LIMIT { + return Err(Error::too_many_patterns( + self.start_pattern.len().saturating_add(1), + )); + } + let pid = self.current_pattern_id(); + self.start_pattern.push(start_id); + // Add the number of new slots created by this pattern. This is always + // equivalent to '2 * caps.len()', where 'caps.len()' is the number of + // new capturing groups introduced by the pattern we're finishing. + let new_cap_groups = self + .capture_index_to_name + .get(pid.as_usize()) + .map_or(0, |caps| caps.len()); + let new_slots = match new_cap_groups.checked_mul(2) { + Some(new_slots) => new_slots, + None => { + // Just return the biggest index that we know exists. 
+ let index = new_cap_groups.saturating_sub(1); + return Err(Error::invalid_capture_index(index)); + } + }; + let slot_start = self.capture_slot_len(); + self.patterns_to_slots.push(slot_start..(slot_start + new_slots)); + Ok(pid) + } + + fn add_state(&mut self, state: State) -> Result<StateID, Error> { + let id = StateID::new(self.states.len()) + .map_err(|_| Error::too_many_states(self.states.len()))?; + self.memory_states += state.memory_usage(); + self.states.push(state); + Ok(id) + } + + /// Remap the transitions in every state of this NFA using the given map. + /// The given map should be indexed according to state ID namespace used by + /// the transitions of the states currently in this NFA. + /// + /// This may be used during the final phases of an NFA compiler, which + /// turns its intermediate NFA into the final NFA. Remapping may be + /// required to bring the state pointers from the intermediate NFA to the + /// final NFA. + pub fn remap(&mut self, old_to_new: &[StateID]) { + for state in &mut self.states { + state.remap(old_to_new); + } + self.start_anchored = old_to_new[self.start_anchored]; + self.start_unanchored = old_to_new[self.start_unanchored]; + for (pid, id) in self.start_pattern.iter_mut().with_pattern_ids() { + *id = old_to_new[*id]; + } + } + + /// Clear this NFA such that it has zero states and is otherwise "empty." + /// + /// An empty NFA is useful as a starting point for building one. It is + /// itself not intended to be used for matching. For example, its starting + /// state identifiers are configured to be `0`, but since it has no states, + /// the identifiers are invalid. 
+ pub fn clear(&mut self) { + self.states.clear(); + self.start_anchored = StateID::ZERO; + self.start_unanchored = StateID::ZERO; + self.start_pattern.clear(); + self.patterns_to_slots.clear(); + self.capture_name_to_index.clear(); + self.capture_index_to_name.clear(); + self.byte_class_set = ByteClassSet::empty(); + self.facts = Facts::default(); + self.memory_states = 0; + } +} + +impl fmt::Debug for NFA { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + writeln!(f, "thompson::NFA(")?; + for (sid, state) in self.states.iter().with_state_ids() { + let status = if sid == self.start_anchored { + '^' + } else if sid == self.start_unanchored { + '>' + } else { + ' ' + }; + writeln!(f, "{}{:06?}: {:?}", status, sid.as_usize(), state)?; + } + if self.pattern_len() > 1 { + writeln!(f, "")?; + for pid in self.patterns() { + let sid = self.start_pattern(pid); + writeln!( + f, + "START({:06?}): {:?}", + pid.as_usize(), + sid.as_usize() + )?; + } + } + writeln!(f, "")?; + writeln!( + f, + "transition equivalence classes: {:?}", + self.byte_class_set().byte_classes() + )?; + writeln!(f, ")")?; + Ok(()) + } +} + +/// A state in a final compiled NFA. +#[derive(Clone, Eq, PartialEq)] +pub enum State { + /// A state that transitions to `next` if and only if the current input + /// byte is in the range `[start, end]` (inclusive). + /// + /// This is a special case of Sparse in that it encodes only one transition + /// (and therefore avoids the allocation). + Range { range: Transition }, + /// A state with possibly many transitions, represented in a sparse + /// fashion. Transitions are ordered lexicographically by input range. As + /// such, this may only be used when every transition has equal priority. + /// (In practice, this is only used for encoding UTF-8 automata.) + Sparse(SparseTransitions), + /// A conditional epsilon transition satisfied via some sort of + /// look-around. 
+ Look { look: Look, next: StateID }, + /// An alternation such that there exists an epsilon transition to all + /// states in `alternates`, where matches found via earlier transitions + /// are preferred over later transitions. + Union { alternates: Box<[StateID]> }, + /// An empty state that records a capture location. + /// + /// From the perspective of finite automata, this is precisely equivalent + /// to an epsilon transition, but serves the purpose of instructing NFA + /// simulations to record additional state when the finite state machine + /// passes through this epsilon transition. + /// + /// These transitions are treated as epsilon transitions with no additional + /// effects in DFAs. + /// + /// 'slot' in this context refers to the specific capture group offset that + /// is being recorded. Each capturing group has two slots corresponding to + /// the start and end of the matching portion of that group. + /// A fail state. When encountered, the automaton is guaranteed to never + /// reach a match state. + Capture { next: StateID, slot: usize }, + /// A state that cannot be transitioned out of. If a search reaches this + /// state, then no match is possible and the search should terminate. + Fail, + /// A match state. There is exactly one such occurrence of this state for + /// each regex compiled into the NFA. + Match { id: PatternID }, +} + +impl State { + /// Returns true if and only if this state contains one or more epsilon + /// transitions. + #[inline] + pub fn is_epsilon(&self) -> bool { + match *self { + State::Range { .. } + | State::Sparse { .. } + | State::Fail + | State::Match { .. } => false, + State::Look { .. } + | State::Union { .. } + | State::Capture { .. } => true, + } + } + + /// Returns the heap memory usage of this NFA state in bytes. + fn memory_usage(&self) -> usize { + match *self { + State::Range { .. } + | State::Look { .. } + | State::Capture { .. } + | State::Match { .. 
} + | State::Fail => 0, + State::Sparse(SparseTransitions { ref ranges }) => { + ranges.len() * mem::size_of::<Transition>() + } + State::Union { ref alternates } => { + alternates.len() * mem::size_of::<StateID>() + } + } + } + + /// Remap the transitions in this state using the given map. Namely, the + /// given map should be indexed according to the transitions currently + /// in this state. + /// + /// This is used during the final phase of the NFA compiler, which turns + /// its intermediate NFA into the final NFA. + fn remap(&mut self, remap: &[StateID]) { + match *self { + State::Range { ref mut range } => range.next = remap[range.next], + State::Sparse(SparseTransitions { ref mut ranges }) => { + for r in ranges.iter_mut() { + r.next = remap[r.next]; + } + } + State::Look { ref mut next, .. } => *next = remap[*next], + State::Union { ref mut alternates } => { + for alt in alternates.iter_mut() { + *alt = remap[*alt]; + } + } + State::Capture { ref mut next, .. } => *next = remap[*next], + State::Fail => {} + State::Match { .. } => {} + } + } +} + +impl fmt::Debug for State { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match *self { + State::Range { ref range } => range.fmt(f), + State::Sparse(SparseTransitions { ref ranges }) => { + let rs = ranges + .iter() + .map(|t| format!("{:?}", t)) + .collect::<Vec<String>>() + .join(", "); + write!(f, "sparse({})", rs) + } + State::Look { ref look, next } => { + write!(f, "{:?} => {:?}", look, next.as_usize()) + } + State::Union { ref alternates } => { + let alts = alternates + .iter() + .map(|id| format!("{:?}", id.as_usize())) + .collect::<Vec<String>>() + .join(", "); + write!(f, "alt({})", alts) + } + State::Capture { next, slot } => { + write!(f, "capture({:?}) => {:?}", slot, next.as_usize()) + } + State::Fail => write!(f, "FAIL"), + State::Match { id } => write!(f, "MATCH({:?})", id.as_usize()), + } + } +} + +/// A collection of facts about an NFA. 
+/// +/// There are no real cohesive principles behind what gets put in here. For +/// the most part, it is implementation driven. +#[derive(Clone, Copy, Debug, Default)] +struct Facts { + /// Various yes/no facts about this NFA. + bools: u16, +} + +impl Facts { + define_bool!(0, has_any_look, set_has_any_look); + define_bool!(1, has_any_anchor, set_has_any_anchor); + define_bool!(2, has_word_boundary_unicode, set_has_word_boundary_unicode); + define_bool!(3, has_word_boundary_ascii, set_has_word_boundary_ascii); +} + +/// A sequence of transitions used to represent a sparse state. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct SparseTransitions { + pub ranges: Box<[Transition]>, +} + +impl SparseTransitions { + pub fn matches(&self, haystack: &[u8], at: usize) -> Option<StateID> { + haystack.get(at).and_then(|&b| self.matches_byte(b)) + } + + pub fn matches_unit(&self, unit: alphabet::Unit) -> Option<StateID> { + unit.as_u8().map_or(None, |byte| self.matches_byte(byte)) + } + + pub fn matches_byte(&self, byte: u8) -> Option<StateID> { + for t in self.ranges.iter() { + if t.start > byte { + break; + } else if t.matches_byte(byte) { + return Some(t.next); + } + } + None + + /* + // This is an alternative implementation that uses binary search. In + // some ad hoc experiments, like + // + // smallishru=OpenSubtitles2018.raw.sample.smallish.ru + // regex-cli find nfa thompson pikevm -b "@$smallishru" '\b\w+\b' + // + // I could not observe any improvement, and in fact, things seemed to + // be a bit slower. + self.ranges + .binary_search_by(|t| { + if t.end < byte { + core::cmp::Ordering::Less + } else if t.start > byte { + core::cmp::Ordering::Greater + } else { + core::cmp::Ordering::Equal + } + }) + .ok() + .map(|i| self.ranges[i].next) + */ + } +} + +/// A transition to another state, only if the given byte falls in the +/// inclusive range specified. 
+#[derive(Clone, Copy, Eq, Hash, PartialEq)] +pub struct Transition { + pub start: u8, + pub end: u8, + pub next: StateID, +} + +impl Transition { + pub fn matches(&self, haystack: &[u8], at: usize) -> bool { + haystack.get(at).map_or(false, |&b| self.matches_byte(b)) + } + + pub fn matches_unit(&self, unit: alphabet::Unit) -> bool { + unit.as_u8().map_or(false, |byte| self.matches_byte(byte)) + } + + pub fn matches_byte(&self, byte: u8) -> bool { + self.start <= byte && byte <= self.end + } +} + +impl fmt::Debug for Transition { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + use crate::util::DebugByte; + + let Transition { start, end, next } = *self; + if self.start == self.end { + write!(f, "{:?} => {:?}", DebugByte(start), next.as_usize()) + } else { + write!( + f, + "{:?}-{:?} => {:?}", + DebugByte(start), + DebugByte(end), + next.as_usize(), + ) + } + } +} + +/// A conditional NFA epsilon transition. +/// +/// A simulation of the NFA can only move through this epsilon transition if +/// the current position satisfies some look-around property. Some assertions +/// are look-behind (StartLine, StartText), some assertions are look-ahead +/// (EndLine, EndText) while other assertions are both look-behind and +/// look-ahead (WordBoundary*). +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum Look { + /// The previous position is either `\n` or the current position is the + /// beginning of the haystack (i.e., at position `0`). + StartLine = 1 << 0, + /// The next position is either `\n` or the current position is the end of + /// the haystack (i.e., at position `haystack.len()`). + EndLine = 1 << 1, + /// The current position is the beginning of the haystack (i.e., at + /// position `0`). + StartText = 1 << 2, + /// The current position is the end of the haystack (i.e., at position + /// `haystack.len()`). 
+ EndText = 1 << 3, + /// When tested at position `i`, where `p=decode_utf8_rev(&haystack[..i])` + /// and `n=decode_utf8(&haystack[i..])`, this assertion passes if and only + /// if `is_word(p) != is_word(n)`. If `i=0`, then `is_word(p)=false` and if + /// `i=haystack.len()`, then `is_word(n)=false`. + WordBoundaryUnicode = 1 << 4, + /// Same as for `WordBoundaryUnicode`, but requires that + /// `is_word(p) == is_word(n)`. + WordBoundaryUnicodeNegate = 1 << 5, + /// When tested at position `i`, where `p=haystack[i-1]` and + /// `n=haystack[i]`, this assertion passes if and only if `is_word(p) + /// != is_word(n)`. If `i=0`, then `is_word(p)=false` and if + /// `i=haystack.len()`, then `is_word(n)=false`. + WordBoundaryAscii = 1 << 6, + /// Same as for `WordBoundaryAscii`, but requires that + /// `is_word(p) == is_word(n)`. + /// + /// Note that it is possible for this assertion to match at positions that + /// split the UTF-8 encoding of a codepoint. For this reason, this may only + /// be used when UTF-8 mode is disable in the regex syntax. + WordBoundaryAsciiNegate = 1 << 7, +} + +impl Look { + #[inline(always)] + pub fn matches(&self, bytes: &[u8], at: usize) -> bool { + match *self { + Look::StartLine => at == 0 || bytes[at - 1] == b'\n', + Look::EndLine => at == bytes.len() || bytes[at] == b'\n', + Look::StartText => at == 0, + Look::EndText => at == bytes.len(), + Look::WordBoundaryUnicode => { + let word_before = is_word_char_rev(bytes, at); + let word_after = is_word_char_fwd(bytes, at); + word_before != word_after + } + Look::WordBoundaryUnicodeNegate => { + // This is pretty subtle. Why do we need to do UTF-8 decoding + // here? Well... at time of writing, the is_word_char_{fwd,rev} + // routines will only return true if there is a valid UTF-8 + // encoding of a "word" codepoint, and false in every other + // case (including invalid UTF-8). 
This means that in regions + // of invalid UTF-8 (which might be a subset of valid UTF-8!), + // it would result in \B matching. While this would be + // questionable in the context of truly invalid UTF-8, it is + // *certainly* wrong to report match boundaries that split the + // encoding of a codepoint. So to work around this, we ensure + // that we can decode a codepoint on either side of `at`. If + // either direction fails, then we don't permit \B to match at + // all. + // + // Now, this isn't exactly optimal from a perf perspective. We + // could try and detect this in is_word_char_{fwd,rev}, but + // it's not clear if it's worth it. \B is, after all, rarely + // used. + // + // And in particular, we do *not* have to do this with \b, + // because \b *requires* that at least one side of `at` be a + // "word" codepoint, which in turn implies one side of `at` + // must be valid UTF-8. This in turn implies that \b can never + // split a valid UTF-8 encoding of a codepoint. In the case + // where one side of `at` is truly invalid UTF-8 and the other + // side IS a word codepoint, then we want \b to match since it + // represents a valid UTF-8 boundary. It also makes sense. For + // example, you'd want \b\w+\b to match 'abc' in '\xFFabc\xFF'. 
+ let word_before = at > 0 + && match decode_last_utf8(&bytes[..at]) { + None | Some(Err(_)) => return false, + Some(Ok(_)) => is_word_char_rev(bytes, at), + }; + let word_after = at < bytes.len() + && match decode_utf8(&bytes[at..]) { + None | Some(Err(_)) => return false, + Some(Ok(_)) => is_word_char_fwd(bytes, at), + }; + word_before == word_after + } + Look::WordBoundaryAscii => { + let word_before = at > 0 && is_word_byte(bytes[at - 1]); + let word_after = at < bytes.len() && is_word_byte(bytes[at]); + word_before != word_after + } + Look::WordBoundaryAsciiNegate => { + let word_before = at > 0 && is_word_byte(bytes[at - 1]); + let word_after = at < bytes.len() && is_word_byte(bytes[at]); + word_before == word_after + } + } + } + + /// Create a look-around assertion from its corresponding integer (as + /// defined in `Look`). If the given integer does not correspond to any + /// assertion, then None is returned. + fn from_int(n: u8) -> Option<Look> { + match n { + 0b0000_0001 => Some(Look::StartLine), + 0b0000_0010 => Some(Look::EndLine), + 0b0000_0100 => Some(Look::StartText), + 0b0000_1000 => Some(Look::EndText), + 0b0001_0000 => Some(Look::WordBoundaryUnicode), + 0b0010_0000 => Some(Look::WordBoundaryUnicodeNegate), + 0b0100_0000 => Some(Look::WordBoundaryAscii), + 0b1000_0000 => Some(Look::WordBoundaryAsciiNegate), + _ => None, + } + } + + /// Flip the look-around assertion to its equivalent for reverse searches. 
+ fn reversed(&self) -> Look { + match *self { + Look::StartLine => Look::EndLine, + Look::EndLine => Look::StartLine, + Look::StartText => Look::EndText, + Look::EndText => Look::StartText, + Look::WordBoundaryUnicode => Look::WordBoundaryUnicode, + Look::WordBoundaryUnicodeNegate => Look::WordBoundaryUnicodeNegate, + Look::WordBoundaryAscii => Look::WordBoundaryAscii, + Look::WordBoundaryAsciiNegate => Look::WordBoundaryAsciiNegate, + } + } + + /// Split up the given byte classes into equivalence classes in a way that + /// is consistent with this look-around assertion. + fn add_to_byteset(&self, set: &mut ByteClassSet) { + match *self { + Look::StartText | Look::EndText => {} + Look::StartLine | Look::EndLine => { + set.set_range(b'\n', b'\n'); + } + Look::WordBoundaryUnicode + | Look::WordBoundaryUnicodeNegate + | Look::WordBoundaryAscii + | Look::WordBoundaryAsciiNegate => { + // We need to mark all ranges of bytes whose pairs result in + // evaluating \b differently. This isn't technically correct + // for Unicode word boundaries, but DFAs can't handle those + // anyway, and thus, the byte classes don't need to either + // since they are themselves only used in DFAs. + let iswb = regex_syntax::is_word_byte; + let mut b1: u16 = 0; + let mut b2: u16; + while b1 <= 255 { + b2 = b1 + 1; + while b2 <= 255 && iswb(b1 as u8) == iswb(b2 as u8) { + b2 += 1; + } + set.set_range(b1 as u8, (b2 - 1) as u8); + b1 = b2; + } + } + } + } +} + +/// LookSet is a memory-efficient set of look-around assertions. Callers may +/// idempotently insert or remove any look-around assertion from a set. +#[repr(transparent)] +#[derive(Clone, Copy, Default, Eq, Hash, PartialEq, PartialOrd, Ord)] +pub(crate) struct LookSet { + set: u8, +} + +impl LookSet { + /// Return a LookSet from its representation. + pub(crate) fn from_repr(repr: u8) -> LookSet { + LookSet { set: repr } + } + + /// Return a mutable LookSet from a mutable pointer to its representation. 
+ pub(crate) fn from_repr_mut(repr: &mut u8) -> &mut LookSet { + // SAFETY: This is safe since a LookSet is repr(transparent) where its + // repr is a u8. + unsafe { core::mem::transmute::<&mut u8, &mut LookSet>(repr) } + } + + /// Return true if and only if this set is empty. + pub(crate) fn is_empty(&self) -> bool { + self.set == 0 + } + + /// Clears this set such that it has no assertions in it. + pub(crate) fn clear(&mut self) { + self.set = 0; + } + + /// Insert the given look-around assertion into this set. If the assertion + /// already exists, then this is a no-op. + pub(crate) fn insert(&mut self, look: Look) { + self.set |= look as u8; + } + + /// Remove the given look-around assertion from this set. If the assertion + /// is not in this set, then this is a no-op. + #[cfg(test)] + pub(crate) fn remove(&mut self, look: Look) { + self.set &= !(look as u8); + } + + /// Return true if and only if the given assertion is in this set. + pub(crate) fn contains(&self, look: Look) -> bool { + (look as u8) & self.set != 0 + } + + /// Subtract the given `other` set from the `self` set and return a new + /// set. + pub(crate) fn subtract(&self, other: LookSet) -> LookSet { + LookSet { set: self.set & !other.set } + } + + /// Return the intersection of the given `other` set with the `self` set + /// and return the resulting set. + pub(crate) fn intersect(&self, other: LookSet) -> LookSet { + LookSet { set: self.set & other.set } + } +} + +impl core::fmt::Debug for LookSet { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + let mut members = vec![]; + for i in 0..8 { + let look = match Look::from_int(1 << i) { + None => continue, + Some(look) => look, + }; + if self.contains(look) { + members.push(look); + } + } + f.debug_tuple("LookSet").field(&members).finish() + } +} + +/// An iterator over all pattern IDs in an NFA. 
+pub struct PatternIter<'a> { + it: PatternIDIter, + /// We explicitly associate a lifetime with this iterator even though we + /// don't actually borrow anything from the NFA. We do this for backward + /// compatibility purposes. If we ever do need to borrow something from + /// the NFA, then we can and just get rid of this marker without breaking + /// the public API. + _marker: core::marker::PhantomData<&'a ()>, +} + +impl<'a> Iterator for PatternIter<'a> { + type Item = PatternID; + + fn next(&mut self) -> Option<PatternID> { + self.it.next() + } +} + +#[cfg(test)] +mod tests { + use super::*; + // TODO: Replace tests using DFA with NFA matching engine once implemented. + use crate::dfa::{dense, Automaton}; + + #[test] + fn always_match() { + let nfa = NFA::always_match(); + let dfa = dense::Builder::new().build_from_nfa(&nfa).unwrap(); + let find = |input, start, end| { + dfa.find_leftmost_fwd_at(None, None, input, start, end) + .unwrap() + .map(|m| m.offset()) + }; + + assert_eq!(Some(0), find(b"", 0, 0)); + assert_eq!(Some(0), find(b"a", 0, 1)); + assert_eq!(Some(1), find(b"a", 1, 1)); + assert_eq!(Some(0), find(b"ab", 0, 2)); + assert_eq!(Some(1), find(b"ab", 1, 2)); + assert_eq!(Some(2), find(b"ab", 2, 2)); + } + + #[test] + fn never_match() { + let nfa = NFA::never_match(); + let dfa = dense::Builder::new().build_from_nfa(&nfa).unwrap(); + let find = |input, start, end| { + dfa.find_leftmost_fwd_at(None, None, input, start, end) + .unwrap() + .map(|m| m.offset()) + }; + + assert_eq!(None, find(b"", 0, 0)); + assert_eq!(None, find(b"a", 0, 1)); + assert_eq!(None, find(b"a", 1, 1)); + assert_eq!(None, find(b"ab", 0, 2)); + assert_eq!(None, find(b"ab", 1, 2)); + assert_eq!(None, find(b"ab", 2, 2)); + } + + #[test] + fn look_set() { + let mut f = LookSet::default(); + assert!(!f.contains(Look::StartText)); + assert!(!f.contains(Look::EndText)); + assert!(!f.contains(Look::StartLine)); + assert!(!f.contains(Look::EndLine)); + 
assert!(!f.contains(Look::WordBoundaryUnicode)); + assert!(!f.contains(Look::WordBoundaryUnicodeNegate)); + assert!(!f.contains(Look::WordBoundaryAscii)); + assert!(!f.contains(Look::WordBoundaryAsciiNegate)); + + f.insert(Look::StartText); + assert!(f.contains(Look::StartText)); + f.remove(Look::StartText); + assert!(!f.contains(Look::StartText)); + + f.insert(Look::EndText); + assert!(f.contains(Look::EndText)); + f.remove(Look::EndText); + assert!(!f.contains(Look::EndText)); + + f.insert(Look::StartLine); + assert!(f.contains(Look::StartLine)); + f.remove(Look::StartLine); + assert!(!f.contains(Look::StartLine)); + + f.insert(Look::EndLine); + assert!(f.contains(Look::EndLine)); + f.remove(Look::EndLine); + assert!(!f.contains(Look::EndLine)); + + f.insert(Look::WordBoundaryUnicode); + assert!(f.contains(Look::WordBoundaryUnicode)); + f.remove(Look::WordBoundaryUnicode); + assert!(!f.contains(Look::WordBoundaryUnicode)); + + f.insert(Look::WordBoundaryUnicodeNegate); + assert!(f.contains(Look::WordBoundaryUnicodeNegate)); + f.remove(Look::WordBoundaryUnicodeNegate); + assert!(!f.contains(Look::WordBoundaryUnicodeNegate)); + + f.insert(Look::WordBoundaryAscii); + assert!(f.contains(Look::WordBoundaryAscii)); + f.remove(Look::WordBoundaryAscii); + assert!(!f.contains(Look::WordBoundaryAscii)); + + f.insert(Look::WordBoundaryAsciiNegate); + assert!(f.contains(Look::WordBoundaryAsciiNegate)); + f.remove(Look::WordBoundaryAsciiNegate); + assert!(!f.contains(Look::WordBoundaryAsciiNegate)); + } + + #[test] + fn look_matches_start_line() { + let look = Look::StartLine; + + assert!(look.matches(B(""), 0)); + assert!(look.matches(B("\n"), 0)); + assert!(look.matches(B("\n"), 1)); + assert!(look.matches(B("a"), 0)); + assert!(look.matches(B("\na"), 1)); + + assert!(!look.matches(B("a"), 1)); + assert!(!look.matches(B("a\na"), 1)); + } + + #[test] + fn look_matches_end_line() { + let look = Look::EndLine; + + assert!(look.matches(B(""), 0)); + 
assert!(look.matches(B("\n"), 1)); + assert!(look.matches(B("\na"), 0)); + assert!(look.matches(B("\na"), 2)); + assert!(look.matches(B("a\na"), 1)); + + assert!(!look.matches(B("a"), 0)); + assert!(!look.matches(B("\na"), 1)); + assert!(!look.matches(B("a\na"), 0)); + assert!(!look.matches(B("a\na"), 2)); + } + + #[test] + fn look_matches_start_text() { + let look = Look::StartText; + + assert!(look.matches(B(""), 0)); + assert!(look.matches(B("\n"), 0)); + assert!(look.matches(B("a"), 0)); + + assert!(!look.matches(B("\n"), 1)); + assert!(!look.matches(B("\na"), 1)); + assert!(!look.matches(B("a"), 1)); + assert!(!look.matches(B("a\na"), 1)); + } + + #[test] + fn look_matches_end_text() { + let look = Look::EndText; + + assert!(look.matches(B(""), 0)); + assert!(look.matches(B("\n"), 1)); + assert!(look.matches(B("\na"), 2)); + + assert!(!look.matches(B("\na"), 0)); + assert!(!look.matches(B("a\na"), 1)); + assert!(!look.matches(B("a"), 0)); + assert!(!look.matches(B("\na"), 1)); + assert!(!look.matches(B("a\na"), 0)); + assert!(!look.matches(B("a\na"), 2)); + } + + #[test] + fn look_matches_word_unicode() { + let look = Look::WordBoundaryUnicode; + + // \xF0\x9D\x9B\x83 = 𝛃 (in \w) + // \xF0\x90\x86\x80 = 𐆀 (not in \w) + + // Simple ASCII word boundaries. + assert!(look.matches(B("a"), 0)); + assert!(look.matches(B("a"), 1)); + assert!(look.matches(B("a "), 1)); + assert!(look.matches(B(" a "), 1)); + assert!(look.matches(B(" a "), 2)); + + // Unicode word boundaries with a non-ASCII codepoint. + assert!(look.matches(B("𝛃"), 0)); + assert!(look.matches(B("𝛃"), 4)); + assert!(look.matches(B("𝛃 "), 4)); + assert!(look.matches(B(" 𝛃 "), 1)); + assert!(look.matches(B(" 𝛃 "), 5)); + + // Unicode word boundaries between non-ASCII codepoints. + assert!(look.matches(B("𝛃𐆀"), 0)); + assert!(look.matches(B("𝛃𐆀"), 4)); + + // Non word boundaries for ASCII. 
+ assert!(!look.matches(B(""), 0)); + assert!(!look.matches(B("ab"), 1)); + assert!(!look.matches(B("a "), 2)); + assert!(!look.matches(B(" a "), 0)); + assert!(!look.matches(B(" a "), 3)); + + // Non word boundaries with a non-ASCII codepoint. + assert!(!look.matches(B("𝛃b"), 4)); + assert!(!look.matches(B("𝛃 "), 5)); + assert!(!look.matches(B(" 𝛃 "), 0)); + assert!(!look.matches(B(" 𝛃 "), 6)); + assert!(!look.matches(B("𝛃"), 1)); + assert!(!look.matches(B("𝛃"), 2)); + assert!(!look.matches(B("𝛃"), 3)); + + // Non word boundaries with non-ASCII codepoints. + assert!(!look.matches(B("𝛃𐆀"), 1)); + assert!(!look.matches(B("𝛃𐆀"), 2)); + assert!(!look.matches(B("𝛃𐆀"), 3)); + assert!(!look.matches(B("𝛃𐆀"), 5)); + assert!(!look.matches(B("𝛃𐆀"), 6)); + assert!(!look.matches(B("𝛃𐆀"), 7)); + assert!(!look.matches(B("𝛃𐆀"), 8)); + } + + #[test] + fn look_matches_word_ascii() { + let look = Look::WordBoundaryAscii; + + // \xF0\x9D\x9B\x83 = 𝛃 (in \w) + // \xF0\x90\x86\x80 = 𐆀 (not in \w) + + // Simple ASCII word boundaries. + assert!(look.matches(B("a"), 0)); + assert!(look.matches(B("a"), 1)); + assert!(look.matches(B("a "), 1)); + assert!(look.matches(B(" a "), 1)); + assert!(look.matches(B(" a "), 2)); + + // Unicode word boundaries with a non-ASCII codepoint. Since this is + // an ASCII word boundary, none of these match. + assert!(!look.matches(B("𝛃"), 0)); + assert!(!look.matches(B("𝛃"), 4)); + assert!(!look.matches(B("𝛃 "), 4)); + assert!(!look.matches(B(" 𝛃 "), 1)); + assert!(!look.matches(B(" 𝛃 "), 5)); + + // Unicode word boundaries between non-ASCII codepoints. Again, since + // this is an ASCII word boundary, none of these match. + assert!(!look.matches(B("𝛃𐆀"), 0)); + assert!(!look.matches(B("𝛃𐆀"), 4)); + + // Non word boundaries for ASCII. 
+ assert!(!look.matches(B(""), 0)); + assert!(!look.matches(B("ab"), 1)); + assert!(!look.matches(B("a "), 2)); + assert!(!look.matches(B(" a "), 0)); + assert!(!look.matches(B(" a "), 3)); + + // Non word boundaries with a non-ASCII codepoint. + assert!(look.matches(B("𝛃b"), 4)); + assert!(!look.matches(B("𝛃 "), 5)); + assert!(!look.matches(B(" 𝛃 "), 0)); + assert!(!look.matches(B(" 𝛃 "), 6)); + assert!(!look.matches(B("𝛃"), 1)); + assert!(!look.matches(B("𝛃"), 2)); + assert!(!look.matches(B("𝛃"), 3)); + + // Non word boundaries with non-ASCII codepoints. + assert!(!look.matches(B("𝛃𐆀"), 1)); + assert!(!look.matches(B("𝛃𐆀"), 2)); + assert!(!look.matches(B("𝛃𐆀"), 3)); + assert!(!look.matches(B("𝛃𐆀"), 5)); + assert!(!look.matches(B("𝛃𐆀"), 6)); + assert!(!look.matches(B("𝛃𐆀"), 7)); + assert!(!look.matches(B("𝛃𐆀"), 8)); + } + + #[test] + fn look_matches_word_unicode_negate() { + let look = Look::WordBoundaryUnicodeNegate; + + // \xF0\x9D\x9B\x83 = 𝛃 (in \w) + // \xF0\x90\x86\x80 = 𐆀 (not in \w) + + // Simple ASCII word boundaries. + assert!(!look.matches(B("a"), 0)); + assert!(!look.matches(B("a"), 1)); + assert!(!look.matches(B("a "), 1)); + assert!(!look.matches(B(" a "), 1)); + assert!(!look.matches(B(" a "), 2)); + + // Unicode word boundaries with a non-ASCII codepoint. + assert!(!look.matches(B("𝛃"), 0)); + assert!(!look.matches(B("𝛃"), 4)); + assert!(!look.matches(B("𝛃 "), 4)); + assert!(!look.matches(B(" 𝛃 "), 1)); + assert!(!look.matches(B(" 𝛃 "), 5)); + + // Unicode word boundaries between non-ASCII codepoints. + assert!(!look.matches(B("𝛃𐆀"), 0)); + assert!(!look.matches(B("𝛃𐆀"), 4)); + + // Non word boundaries for ASCII. + assert!(look.matches(B(""), 0)); + assert!(look.matches(B("ab"), 1)); + assert!(look.matches(B("a "), 2)); + assert!(look.matches(B(" a "), 0)); + assert!(look.matches(B(" a "), 3)); + + // Non word boundaries with a non-ASCII codepoint. 
+ assert!(look.matches(B("𝛃b"), 4)); + assert!(look.matches(B("𝛃 "), 5)); + assert!(look.matches(B(" 𝛃 "), 0)); + assert!(look.matches(B(" 𝛃 "), 6)); + // These don't match because they could otherwise return an offset that + // splits the UTF-8 encoding of a codepoint. + assert!(!look.matches(B("𝛃"), 1)); + assert!(!look.matches(B("𝛃"), 2)); + assert!(!look.matches(B("𝛃"), 3)); + + // Non word boundaries with non-ASCII codepoints. These also don't + // match because they could otherwise return an offset that splits the + // UTF-8 encoding of a codepoint. + assert!(!look.matches(B("𝛃𐆀"), 1)); + assert!(!look.matches(B("𝛃𐆀"), 2)); + assert!(!look.matches(B("𝛃𐆀"), 3)); + assert!(!look.matches(B("𝛃𐆀"), 5)); + assert!(!look.matches(B("𝛃𐆀"), 6)); + assert!(!look.matches(B("𝛃𐆀"), 7)); + // But this one does, since 𐆀 isn't a word codepoint, and 8 is the end + // of the haystack. So the "end" of the haystack isn't a word and 𐆀 + // isn't a word, thus, \B matches. + assert!(look.matches(B("𝛃𐆀"), 8)); + } + + #[test] + fn look_matches_word_ascii_negate() { + let look = Look::WordBoundaryAsciiNegate; + + // \xF0\x9D\x9B\x83 = 𝛃 (in \w) + // \xF0\x90\x86\x80 = 𐆀 (not in \w) + + // Simple ASCII word boundaries. + assert!(!look.matches(B("a"), 0)); + assert!(!look.matches(B("a"), 1)); + assert!(!look.matches(B("a "), 1)); + assert!(!look.matches(B(" a "), 1)); + assert!(!look.matches(B(" a "), 2)); + + // Unicode word boundaries with a non-ASCII codepoint. Since this is + // an ASCII word boundary, none of these match. + assert!(look.matches(B("𝛃"), 0)); + assert!(look.matches(B("𝛃"), 4)); + assert!(look.matches(B("𝛃 "), 4)); + assert!(look.matches(B(" 𝛃 "), 1)); + assert!(look.matches(B(" 𝛃 "), 5)); + + // Unicode word boundaries between non-ASCII codepoints. Again, since + // this is an ASCII word boundary, none of these match. + assert!(look.matches(B("𝛃𐆀"), 0)); + assert!(look.matches(B("𝛃𐆀"), 4)); + + // Non word boundaries for ASCII. 
+ assert!(look.matches(B(""), 0)); + assert!(look.matches(B("ab"), 1)); + assert!(look.matches(B("a "), 2)); + assert!(look.matches(B(" a "), 0)); + assert!(look.matches(B(" a "), 3)); + + // Non word boundaries with a non-ASCII codepoint. + assert!(!look.matches(B("𝛃b"), 4)); + assert!(look.matches(B("𝛃 "), 5)); + assert!(look.matches(B(" 𝛃 "), 0)); + assert!(look.matches(B(" 𝛃 "), 6)); + assert!(look.matches(B("𝛃"), 1)); + assert!(look.matches(B("𝛃"), 2)); + assert!(look.matches(B("𝛃"), 3)); + + // Non word boundaries with non-ASCII codepoints. + assert!(look.matches(B("𝛃𐆀"), 1)); + assert!(look.matches(B("𝛃𐆀"), 2)); + assert!(look.matches(B("𝛃𐆀"), 3)); + assert!(look.matches(B("𝛃𐆀"), 5)); + assert!(look.matches(B("𝛃𐆀"), 6)); + assert!(look.matches(B("𝛃𐆀"), 7)); + assert!(look.matches(B("𝛃𐆀"), 8)); + } + + fn B<'a, T: 'a + ?Sized + AsRef<[u8]>>(string: &'a T) -> &'a [u8] { + string.as_ref() + } +} diff --git a/vendor/regex-automata/src/nfa/thompson/pikevm.rs b/vendor/regex-automata/src/nfa/thompson/pikevm.rs new file mode 100644 index 000000000..7572f9f10 --- /dev/null +++ b/vendor/regex-automata/src/nfa/thompson/pikevm.rs @@ -0,0 +1,554 @@ +use alloc::{sync::Arc, vec, vec::Vec}; + +use crate::{ + nfa::thompson::{self, Error, State, NFA}, + util::{ + id::{PatternID, StateID}, + matchtypes::MultiMatch, + sparse_set::SparseSet, + }, +}; + +#[derive(Clone, Copy, Debug, Default)] +pub struct Config { + anchored: Option<bool>, + utf8: Option<bool>, +} + +impl Config { + /// Return a new default PikeVM configuration. 
+ pub fn new() -> Config { + Config::default() + } + + pub fn anchored(mut self, yes: bool) -> Config { + self.anchored = Some(yes); + self + } + + pub fn utf8(mut self, yes: bool) -> Config { + self.utf8 = Some(yes); + self + } + + pub fn get_anchored(&self) -> bool { + self.anchored.unwrap_or(false) + } + + pub fn get_utf8(&self) -> bool { + self.utf8.unwrap_or(true) + } + + pub(crate) fn overwrite(self, o: Config) -> Config { + Config { + anchored: o.anchored.or(self.anchored), + utf8: o.utf8.or(self.utf8), + } + } +} + +/// A builder for a PikeVM. +#[derive(Clone, Debug)] +pub struct Builder { + config: Config, + thompson: thompson::Builder, +} + +impl Builder { + /// Create a new PikeVM builder with its default configuration. + pub fn new() -> Builder { + Builder { + config: Config::default(), + thompson: thompson::Builder::new(), + } + } + + pub fn build(&self, pattern: &str) -> Result<PikeVM, Error> { + self.build_many(&[pattern]) + } + + pub fn build_many<P: AsRef<str>>( + &self, + patterns: &[P], + ) -> Result<PikeVM, Error> { + let nfa = self.thompson.build_many(patterns)?; + self.build_from_nfa(Arc::new(nfa)) + } + + pub fn build_from_nfa(&self, nfa: Arc<NFA>) -> Result<PikeVM, Error> { + // TODO: Check that this is correct. + // if !cfg!(all( + // feature = "dfa", + // feature = "syntax", + // feature = "unicode-perl" + // )) { + if !cfg!(feature = "syntax") { + if nfa.has_word_boundary_unicode() { + return Err(Error::unicode_word_unavailable()); + } + } + Ok(PikeVM { config: self.config, nfa }) + } + + pub fn configure(&mut self, config: Config) -> &mut Builder { + self.config = self.config.overwrite(config); + self + } + + /// Set the syntax configuration for this builder using + /// [`SyntaxConfig`](crate::SyntaxConfig). + /// + /// This permits setting things like case insensitivity, Unicode and multi + /// line mode. + /// + /// These settings only apply when constructing a PikeVM directly from a + /// pattern. 
+ pub fn syntax( + &mut self, + config: crate::util::syntax::SyntaxConfig, + ) -> &mut Builder { + self.thompson.syntax(config); + self + } + + /// Set the Thompson NFA configuration for this builder using + /// [`nfa::thompson::Config`](crate::nfa::thompson::Config). + /// + /// This permits setting things like if additional time should be spent + /// shrinking the size of the NFA. + /// + /// These settings only apply when constructing a PikeVM directly from a + /// pattern. + pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder { + self.thompson.configure(config); + self + } +} + +#[derive(Clone, Debug)] +pub struct PikeVM { + config: Config, + nfa: Arc<NFA>, +} + +impl PikeVM { + pub fn new(pattern: &str) -> Result<PikeVM, Error> { + PikeVM::builder().build(pattern) + } + + pub fn new_many<P: AsRef<str>>(patterns: &[P]) -> Result<PikeVM, Error> { + PikeVM::builder().build_many(patterns) + } + + pub fn config() -> Config { + Config::new() + } + + pub fn builder() -> Builder { + Builder::new() + } + + pub fn create_cache(&self) -> Cache { + Cache::new(self.nfa()) + } + + pub fn create_captures(&self) -> Captures { + Captures::new(self.nfa()) + } + + pub fn nfa(&self) -> &Arc<NFA> { + &self.nfa + } + + pub fn find_leftmost_iter<'r, 'c, 't>( + &'r self, + cache: &'c mut Cache, + haystack: &'t [u8], + ) -> FindLeftmostMatches<'r, 'c, 't> { + FindLeftmostMatches::new(self, cache, haystack) + } + + // BREADCRUMBS: + // + // 1) Don't forget about prefilters. + // + // 2) Consider the case of using a PikeVM with an NFA that has Capture + // states, but where we don't want to track capturing groups (other than + // group 0). This potentially saves a lot of copying around and what not. I + // believe the current regex crate does this, for example. The interesting + // bit here is how to handle the case of multiple patterns... + // + // 3) Permit the caller to specify a pattern ID to run an anchored-only + // search on. + // + // 4) How to do overlapping? 
The way multi-regex support works in the regex + // crate currently is to run the PikeVM until either we reach the end of + // the haystack or when we know all regexes have matched. The latter case + // is probably quite rare, so the common case is likely that we're always + // searching the entire input. The question is: can we emulate that with + // our typical 'overlapping' APIs on DFAs? I believe we can. If so, then + // all we need to do is provide an overlapping API on the PikeVM that + // roughly matches the ones we provide on DFAs. For those APIs, the only + // thing they need over non-overlapping APIs is "caller state." For DFAs, + // the caller state is simple: it contains the last state visited and the + // last match reported. For the PikeVM (and NFAs in general), the "last + // state" is actually a *set* of NFA states. So I think what happens here + // is that we can just force the `Cache` to subsume this role. We'll still + // need some additional state to track the last match reported though. + // Because when two or more patterns match at the same location, we need a + // way to know to iterate over them. Although maybe it's not match index we + // need, but the state index of the last NFA state processed in the cache. + // Then we just pick up where we left off. There might be another match + // state, in which case, we report it. 
+ + pub fn find_leftmost_at( + &self, + cache: &mut Cache, + haystack: &[u8], + start: usize, + end: usize, + caps: &mut Captures, + ) -> Option<MultiMatch> { + let anchored = + self.config.get_anchored() || self.nfa.is_always_start_anchored(); + let mut at = start; + let mut matched_pid = None; + cache.clear(); + 'LOOP: loop { + if cache.clist.set.is_empty() { + if matched_pid.is_some() || (anchored && at > start) { + break 'LOOP; + } + // TODO: prefilter + } + if (!anchored && matched_pid.is_none()) + || cache.clist.set.is_empty() + { + self.epsilon_closure( + &mut cache.clist, + &mut caps.slots, + &mut cache.stack, + self.nfa.start_anchored(), + haystack, + at, + ); + } + for i in 0..cache.clist.set.len() { + let sid = cache.clist.set.get(i); + let pid = match self.step( + &mut cache.nlist, + &mut caps.slots, + cache.clist.caps(sid), + &mut cache.stack, + sid, + haystack, + at, + ) { + None => continue, + Some(pid) => pid, + }; + matched_pid = Some(pid); + break; + } + if at >= end { + break; + } + at += 1; + cache.swap(); + cache.nlist.set.clear(); + } + matched_pid.map(|pid| { + let slots = self.nfa.pattern_slots(pid); + let (start, end) = (slots.start, slots.start + 1); + MultiMatch::new( + pid, + caps.slots[start].unwrap(), + caps.slots[end].unwrap(), + ) + }) + } + + #[inline(always)] + fn step( + &self, + nlist: &mut Threads, + slots: &mut [Slot], + thread_caps: &mut [Slot], + stack: &mut Vec<FollowEpsilon>, + sid: StateID, + haystack: &[u8], + at: usize, + ) -> Option<PatternID> { + match *self.nfa.state(sid) { + State::Fail + | State::Look { .. } + | State::Union { .. } + | State::Capture { .. 
} => None, + State::Range { ref range } => { + if range.matches(haystack, at) { + self.epsilon_closure( + nlist, + thread_caps, + stack, + range.next, + haystack, + at + 1, + ); + } + None + } + State::Sparse(ref sparse) => { + if let Some(next) = sparse.matches(haystack, at) { + self.epsilon_closure( + nlist, + thread_caps, + stack, + next, + haystack, + at + 1, + ); + } + None + } + State::Match { id } => { + slots.copy_from_slice(thread_caps); + Some(id) + } + } + } + + #[inline(always)] + fn epsilon_closure( + &self, + nlist: &mut Threads, + thread_caps: &mut [Slot], + stack: &mut Vec<FollowEpsilon>, + sid: StateID, + haystack: &[u8], + at: usize, + ) { + stack.push(FollowEpsilon::StateID(sid)); + while let Some(frame) = stack.pop() { + match frame { + FollowEpsilon::StateID(sid) => { + self.epsilon_closure_step( + nlist, + thread_caps, + stack, + sid, + haystack, + at, + ); + } + FollowEpsilon::Capture { slot, pos } => { + thread_caps[slot] = pos; + } + } + } + } + + #[inline(always)] + fn epsilon_closure_step( + &self, + nlist: &mut Threads, + thread_caps: &mut [Slot], + stack: &mut Vec<FollowEpsilon>, + mut sid: StateID, + haystack: &[u8], + at: usize, + ) { + loop { + if !nlist.set.insert(sid) { + return; + } + match *self.nfa.state(sid) { + State::Fail + | State::Range { .. } + | State::Sparse { .. } + | State::Match { .. } => { + let t = &mut nlist.caps(sid); + t.copy_from_slice(thread_caps); + return; + } + State::Look { look, next } => { + if !look.matches(haystack, at) { + return; + } + sid = next; + } + State::Union { ref alternates } => { + sid = match alternates.get(0) { + None => return, + Some(&sid) => sid, + }; + stack.extend( + alternates[1..] 
+ .iter() + .copied() + .rev() + .map(FollowEpsilon::StateID), + ); + } + State::Capture { next, slot } => { + if slot < thread_caps.len() { + stack.push(FollowEpsilon::Capture { + slot, + pos: thread_caps[slot], + }); + thread_caps[slot] = Some(at); + } + sid = next; + } + } + } + } +} + +/// An iterator over all non-overlapping leftmost matches for a particular +/// infallible search. +/// +/// The iterator yields a [`MultiMatch`] value until no more matches could be +/// found. If the underlying search returns an error, then this panics. +/// +/// The lifetime variables are as follows: +/// +/// * `'r` is the lifetime of the regular expression itself. +/// * `'c` is the lifetime of the mutable cache used during search. +/// * `'t` is the lifetime of the text being searched. +#[derive(Debug)] +pub struct FindLeftmostMatches<'r, 'c, 't> { + vm: &'r PikeVM, + cache: &'c mut Cache, + // scanner: Option<prefilter::Scanner<'r>>, + text: &'t [u8], + last_end: usize, + last_match: Option<usize>, +} + +impl<'r, 'c, 't> FindLeftmostMatches<'r, 'c, 't> { + fn new( + vm: &'r PikeVM, + cache: &'c mut Cache, + text: &'t [u8], + ) -> FindLeftmostMatches<'r, 'c, 't> { + FindLeftmostMatches { vm, cache, text, last_end: 0, last_match: None } + } +} + +impl<'r, 'c, 't> Iterator for FindLeftmostMatches<'r, 'c, 't> { + // type Item = Captures; + type Item = MultiMatch; + + // fn next(&mut self) -> Option<Captures> { + fn next(&mut self) -> Option<MultiMatch> { + if self.last_end > self.text.len() { + return None; + } + let mut caps = self.vm.create_captures(); + let m = self.vm.find_leftmost_at( + self.cache, + self.text, + self.last_end, + self.text.len(), + &mut caps, + )?; + if m.is_empty() { + // This is an empty match. To ensure we make progress, start + // the next search at the smallest possible starting position + // of the next match following this one. 
+ self.last_end = if self.vm.config.get_utf8() { + crate::util::next_utf8(self.text, m.end()) + } else { + m.end() + 1 + }; + // Don't accept empty matches immediately following a match. + // Just move on to the next match. + if Some(m.end()) == self.last_match { + return self.next(); + } + } else { + self.last_end = m.end(); + } + self.last_match = Some(m.end()); + Some(m) + } +} + +#[derive(Clone, Debug)] +pub struct Captures { + slots: Vec<Slot>, +} + +impl Captures { + pub fn new(nfa: &NFA) -> Captures { + Captures { slots: vec![None; nfa.capture_slot_len()] } + } +} + +#[derive(Clone, Debug)] +pub struct Cache { + stack: Vec<FollowEpsilon>, + clist: Threads, + nlist: Threads, +} + +type Slot = Option<usize>; + +#[derive(Clone, Debug)] +struct Threads { + set: SparseSet, + caps: Vec<Slot>, + slots_per_thread: usize, +} + +#[derive(Clone, Debug)] +enum FollowEpsilon { + StateID(StateID), + Capture { slot: usize, pos: Slot }, +} + +impl Cache { + pub fn new(nfa: &NFA) -> Cache { + Cache { + stack: vec![], + clist: Threads::new(nfa), + nlist: Threads::new(nfa), + } + } + + fn clear(&mut self) { + self.stack.clear(); + self.clist.set.clear(); + self.nlist.set.clear(); + } + + fn swap(&mut self) { + core::mem::swap(&mut self.clist, &mut self.nlist); + } +} + +impl Threads { + fn new(nfa: &NFA) -> Threads { + let mut threads = Threads { + set: SparseSet::new(0), + caps: vec![], + slots_per_thread: 0, + }; + threads.resize(nfa); + threads + } + + fn resize(&mut self, nfa: &NFA) { + if nfa.states().len() == self.set.capacity() { + return; + } + self.slots_per_thread = nfa.capture_slot_len(); + self.set.resize(nfa.states().len()); + self.caps.resize(self.slots_per_thread * nfa.states().len(), None); + } + + fn caps(&mut self, sid: StateID) -> &mut [Slot] { + let i = sid.as_usize() * self.slots_per_thread; + &mut self.caps[i..i + self.slots_per_thread] + } +} diff --git a/vendor/regex-automata/src/nfa/range_trie.rs b/vendor/regex-automata/src/nfa/thompson/range_trie.rs 
index 50767c7c6..92f36ce3a 100644 --- a/vendor/regex-automata/src/nfa/range_trie.rs +++ b/vendor/regex-automata/src/nfa/thompson/range_trie.rs @@ -60,7 +60,7 @@ // Another approach, however, is to reuse an algorithm for constructing a // *minimal* DFA from a sorted sequence of inputs. I don't want to go into // the full details here, but I explain it in more depth in my blog post on -// FSTs[1]. Note that the algorithm not invented by me, but was published +// FSTs[1]. Note that the algorithm was not invented by me, but was published // in paper by Daciuk et al. in 2000 called "Incremental Construction of // MinimalAcyclic Finite-State Automata." Like the suffix cache approach above, // it is also possible to control the amount of extra memory one uses, although @@ -76,10 +76,11 @@ // [BC-BF][80-BF] // [BC-BF][90-BF] // -// Then Daciuk's algorithm also would not work, since there is nothing to -// handle the fact that the ranges overlap. They would need to be split apart. -// Thankfully, Thompson's algorithm for producing byte ranges for Unicode -// codepoint ranges meets both of our requirements. +// Then Daciuk's algorithm would not work, since there is nothing to handle the +// fact that the ranges overlap. They would need to be split apart. Thankfully, +// Thompson's algorithm for producing byte ranges for Unicode codepoint ranges +// meets both of our requirements. (A proof for this eludes me, but it appears +// true.) // // ... however, we would also like to be able to compile UTF-8 automata in // reverse. 
We want this because in order to find the starting location of a @@ -139,11 +140,9 @@ // [1] - https://blog.burntsushi.net/transducers/ // [2] - https://www.mitpressjournals.org/doi/pdfplus/10.1162/089120100561601 -use std::cell::RefCell; -use std::fmt; -use std::mem; -use std::ops::RangeInclusive; -use std::u32; +use core::{cell::RefCell, fmt, mem, ops::RangeInclusive, u32}; + +use alloc::{format, string::String, vec, vec::Vec}; use regex_syntax::utf8::Utf8Range; @@ -188,8 +187,8 @@ pub struct RangeTrie { /// particular order. states: Vec<State>, /// A free-list of states. When a range trie is cleared, all of its states - /// are added to list. Creating a new state reuses states from this list - /// before allocating a new one. + /// are added to this list. Creating a new state reuses states from this + /// list before allocating a new one. free: Vec<State>, /// A stack for traversing this trie to yield sequences of byte ranges in /// lexicographic order. @@ -197,7 +196,7 @@ pub struct RangeTrie { /// A bufer that stores the current sequence during iteration. iter_ranges: RefCell<Vec<Utf8Range>>, /// A stack used for traversing the trie in order to (deeply) duplicate - /// a state. + /// a state. States are recursively duplicated when ranges are split. dupe_stack: Vec<NextDupe>, /// A stack used for traversing the trie during insertion of a new /// sequence of byte ranges. @@ -249,7 +248,10 @@ impl RangeTrie { /// Iterate over all of the sequences of byte ranges in this trie, and /// call the provided function for each sequence. Iteration occurs in /// lexicographic order. - pub fn iter<F: FnMut(&[Utf8Range])>(&self, mut f: F) { + pub fn iter<E, F: FnMut(&[Utf8Range]) -> Result<(), E>>( + &self, + mut f: F, + ) -> Result<(), E> { let mut stack = self.iter_stack.borrow_mut(); stack.clear(); let mut ranges = self.iter_ranges.borrow_mut(); @@ -264,7 +266,7 @@ impl RangeTrie { // here, but at the cost of more stack pushes. 
loop { let state = self.state(state_id); - // If we're visited all transitions in this state, then pop + // If we've visited all transitions in this state, then pop // back to the parent state. if tidx >= state.transitions.len() { ranges.pop(); @@ -274,7 +276,7 @@ impl RangeTrie { let t = &state.transitions[tidx]; ranges.push(t.range); if t.next_id == FINAL { - f(&ranges); + f(&ranges)?; ranges.pop(); tidx += 1; } else { @@ -288,6 +290,7 @@ impl RangeTrie { } } } + Ok(()) } /// Inserts a new sequence of ranges into this trie. @@ -455,8 +458,8 @@ impl RangeTrie { /// the given state ID and the returned state ID share nothing. /// /// This is useful during range trie insertion when a new range overlaps - /// with an existing range that is bigger than the new one. The part of - /// the existing range that does *not* overlap with the new one is that + /// with an existing range that is bigger than the new one. The part + /// of the existing range that does *not* overlap with the new one is /// duplicated so that adding the new range to the overlap doesn't disturb /// the non-overlapping portion. /// @@ -594,7 +597,7 @@ impl State { // Benchmarks suggest that binary search is just a bit faster than // straight linear search. 
Specifically when using the debug tool: // - // hyperfine "regex-automata-debug debug -acqr '\w{40} ecurB'" + // hyperfine "regex-cli debug nfa thompson --quiet --reverse '\w{90} ecurB'" binary_search(&self.transitions, |t| range.start <= t.range.end) } @@ -865,7 +868,7 @@ impl Split { } impl fmt::Debug for RangeTrie { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { writeln!(f, "")?; for (i, state) in self.states.iter().enumerate() { let status = if i == FINAL as usize { '*' } else { ' ' }; @@ -876,7 +879,7 @@ impl fmt::Debug for RangeTrie { } impl fmt::Debug for State { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { let rs = self .transitions .iter() @@ -888,7 +891,7 @@ impl fmt::Debug for State { } impl fmt::Debug for Transition { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { if self.range.start == self.range.end { write!(f, "{:02X} => {:02X}", self.range.start, self.next_id) } else { @@ -908,7 +911,7 @@ fn intersects(r1: Utf8Range, r2: Utf8Range) -> bool { #[cfg(test)] mod tests { - use std::ops::RangeInclusive; + use core::ops::RangeInclusive; use regex_syntax::utf8::Utf8Range; |