diff options
Diffstat (limited to 'vendor/regex-automata/src/nfa/compiler.rs')
-rw-r--r-- | vendor/regex-automata/src/nfa/compiler.rs | 1193 |
1 files changed, 0 insertions, 1193 deletions
diff --git a/vendor/regex-automata/src/nfa/compiler.rs b/vendor/regex-automata/src/nfa/compiler.rs deleted file mode 100644 index d9b3945b3..000000000 --- a/vendor/regex-automata/src/nfa/compiler.rs +++ /dev/null @@ -1,1193 +0,0 @@ -// This module provides an NFA compiler using Thompson's construction -// algorithm. The compiler takes a regex-syntax::Hir as input and emits an NFA -// graph as output. The NFA graph is structured in a way that permits it to be -// executed by a virtual machine and also used to efficiently build a DFA. -// -// The compiler deals with a slightly expanded set of NFA states that notably -// includes an empty node that has exactly one epsilon transition to the next -// state. In other words, it's a "goto" instruction if one views Thompson's NFA -// as a set of bytecode instructions. These goto instructions are removed in -// a subsequent phase before returning the NFA to the caller. The purpose of -// these empty nodes is that they make the construction algorithm substantially -// simpler to implement. We remove them before returning to the caller because -// they can represent substantial overhead when traversing the NFA graph -// (either while searching using the NFA directly or while building a DFA). -// -// In the future, it would be nice to provide a Glushkov compiler as well, -// as it would work well as a bit-parallel NFA for smaller regexes. But -// the Thompson construction is one I'm more familiar with and seems more -// straight-forward to deal with when it comes to large Unicode character -// classes. -// -// Internally, the compiler uses interior mutability to improve composition -// in the face of the borrow checker. In particular, we'd really like to be -// able to write things like this: -// -// self.c_concat(exprs.iter().map(|e| self.c(e))) -// -// Which elegantly uses iterators to build up a sequence of compiled regex -// sub-expressions and then hands it off to the concatenating compiler -// routine. Without interior mutability, the borrow checker won't let us -// borrow `self` mutably both inside and outside the closure at the same -// time. - -use std::cell::RefCell; -use std::mem; - -use regex_syntax::hir::{self, Hir, HirKind}; -use regex_syntax::utf8::{Utf8Range, Utf8Sequences}; - -use classes::ByteClassSet; -use error::{Error, Result}; -use nfa::map::{Utf8BoundedMap, Utf8SuffixKey, Utf8SuffixMap}; -use nfa::range_trie::RangeTrie; -use nfa::{State, StateID, Transition, NFA}; - -/// Config knobs for the NFA compiler. See the builder's methods for more -/// docs on each one. -#[derive(Clone, Copy, Debug)] -struct Config { - anchored: bool, - allow_invalid_utf8: bool, - reverse: bool, - shrink: bool, -} - -impl Default for Config { - fn default() -> Config { - Config { - anchored: false, - allow_invalid_utf8: false, - reverse: false, - shrink: true, - } - } -} - -/// A builder for compiling an NFA. -#[derive(Clone, Debug)] -pub struct Builder { - config: Config, -} - -impl Builder { - /// Create a new NFA builder with its default configuration. - pub fn new() -> Builder { - Builder { config: Config::default() } - } - - /// Compile the given high level intermediate representation of a regular - /// expression into an NFA. - /// - /// If there was a problem building the NFA, then an error is returned. - /// For example, if the regex uses unsupported features (such as zero-width - /// assertions), then an error is returned. - pub fn build(&self, expr: &Hir) -> Result<NFA> { - let mut nfa = NFA::always_match(); - self.build_with(&mut Compiler::new(), &mut nfa, expr)?; - Ok(nfa) - } - - /// Compile the given high level intermediate representation of a regular - /// expression into the NFA given using the given compiler. Callers may - /// prefer this over `build` if they would like to reuse allocations while - /// compiling many regular expressions. - /// - /// On success, the given NFA is completely overwritten with the NFA - /// produced by the compiler. - /// - /// If there was a problem building the NFA, then an error is returned. For - /// example, if the regex uses unsupported features (such as zero-width - /// assertions), then an error is returned. When an error is returned, - /// the contents of `nfa` are unspecified and should not be relied upon. - /// However, it can still be reused in subsequent calls to this method. - pub fn build_with( - &self, - compiler: &mut Compiler, - nfa: &mut NFA, - expr: &Hir, - ) -> Result<()> { - compiler.clear(); - compiler.configure(self.config); - compiler.compile(nfa, expr) - } - - /// Set whether matching must be anchored at the beginning of the input. - /// - /// When enabled, a match must begin at the start of the input. When - /// disabled, the NFA will act as if the pattern started with a `.*?`, - /// which enables a match to appear anywhere. - /// - /// By default this is disabled. - pub fn anchored(&mut self, yes: bool) -> &mut Builder { - self.config.anchored = yes; - self - } - - /// When enabled, the builder will permit the construction of an NFA that - /// may match invalid UTF-8. - /// - /// When disabled (the default), the builder is guaranteed to produce a - /// regex that will only ever match valid UTF-8 (otherwise, the builder - /// will return an error). - pub fn allow_invalid_utf8(&mut self, yes: bool) -> &mut Builder { - self.config.allow_invalid_utf8 = yes; - self - } - - /// Reverse the NFA. - /// - /// A NFA reversal is performed by reversing all of the concatenated - /// sub-expressions in the original pattern, recursively. The resulting - /// NFA can be used to match the pattern starting from the end of a string - /// instead of the beginning of a string. - /// - /// Reversing the NFA is useful for building a reverse DFA, which is most - /// useful for finding the start of a match. - pub fn reverse(&mut self, yes: bool) -> &mut Builder { - self.config.reverse = yes; - self - } - - /// Apply best effort heuristics to shrink the NFA at the expense of more - /// time/memory. - /// - /// This is enabled by default. Generally speaking, if one is using an NFA - /// to compile DFA, then the extra time used to shrink the NFA will be - /// more than made up for during DFA construction (potentially by a lot). - /// In other words, enabling this can substantially decrease the overall - /// amount of time it takes to build a DFA. - /// - /// The only reason to disable this if you want to compile an NFA and start - /// using it as quickly as possible without needing to build a DFA. - pub fn shrink(&mut self, yes: bool) -> &mut Builder { - self.config.shrink = yes; - self - } -} - -/// A compiler that converts a regex abstract syntax to an NFA via Thompson's -/// construction. Namely, this compiler permits epsilon transitions between -/// states. -/// -/// Users of this crate cannot use a compiler directly. Instead, all one can -/// do is create one and use it via the -/// [`Builder::build_with`](struct.Builder.html#method.build_with) -/// method. This permits callers to reuse compilers in order to amortize -/// allocations. -#[derive(Clone, Debug)] -pub struct Compiler { - /// The set of compiled NFA states. Once a state is compiled, it is - /// assigned a state ID equivalent to its index in this list. Subsequent - /// compilation can modify previous states by adding new transitions. - states: RefCell<Vec<CState>>, - /// The configuration from the builder. - config: Config, - /// State used for compiling character classes to UTF-8 byte automata. - /// State is not retained between character class compilations. This just - /// serves to amortize allocation to the extent possible. - utf8_state: RefCell<Utf8State>, - /// State used for arranging character classes in reverse into a trie. - trie_state: RefCell<RangeTrie>, - /// State used for caching common suffixes when compiling reverse UTF-8 - /// automata (for Unicode character classes). - utf8_suffix: RefCell<Utf8SuffixMap>, - /// A map used to re-map state IDs when translating the compiler's internal - /// NFA state representation to the external NFA representation. - remap: RefCell<Vec<StateID>>, - /// A set of compiler internal state IDs that correspond to states that are - /// exclusively epsilon transitions, i.e., goto instructions, combined with - /// the state that they point to. This is used to record said states while - /// transforming the compiler's internal NFA representation to the external - /// form. - empties: RefCell<Vec<(StateID, StateID)>>, -} - -/// A compiler intermediate state representation for an NFA that is only used -/// during compilation. Once compilation is done, `CState`s are converted to -/// `State`s, which have a much simpler representation. -#[derive(Clone, Debug, Eq, PartialEq)] -enum CState { - /// An empty state whose only purpose is to forward the automaton to - /// another state via en epsilon transition. These are useful during - /// compilation but are otherwise removed at the end. - Empty { next: StateID }, - /// A state that only transitions to `next` if the current input byte is - /// in the range `[start, end]` (inclusive on both ends). - Range { range: Transition }, - /// A state with possibly many transitions, represented in a sparse - /// fashion. Transitions are ordered lexicographically by input range. - /// As such, this may only be used when every transition has equal - /// priority. (In practice, this is only used for encoding large UTF-8 - /// automata.) - Sparse { ranges: Vec<Transition> }, - /// An alternation such that there exists an epsilon transition to all - /// states in `alternates`, where matches found via earlier transitions - /// are preferred over later transitions. - Union { alternates: Vec<StateID> }, - /// An alternation such that there exists an epsilon transition to all - /// states in `alternates`, where matches found via later transitions - /// are preferred over earlier transitions. - /// - /// This "reverse" state exists for convenience during compilation that - /// permits easy construction of non-greedy combinations of NFA states. - /// At the end of compilation, Union and UnionReverse states are merged - /// into one Union type of state, where the latter has its epsilon - /// transitions reversed to reflect the priority inversion. - UnionReverse { alternates: Vec<StateID> }, - /// A match state. There is exactly one such occurrence of this state in - /// an NFA. - Match, -} - -/// A value that represents the result of compiling a sub-expression of a -/// regex's HIR. Specifically, this represents a sub-graph of the NFA that -/// has an initial state at `start` and a final state at `end`. -#[derive(Clone, Copy, Debug)] -pub struct ThompsonRef { - start: StateID, - end: StateID, -} - -impl Compiler { - /// Create a new compiler. - pub fn new() -> Compiler { - Compiler { - states: RefCell::new(vec![]), - config: Config::default(), - utf8_state: RefCell::new(Utf8State::new()), - trie_state: RefCell::new(RangeTrie::new()), - utf8_suffix: RefCell::new(Utf8SuffixMap::new(1000)), - remap: RefCell::new(vec![]), - empties: RefCell::new(vec![]), - } - } - - /// Clear any memory used by this compiler such that it is ready to compile - /// a new regex. - /// - /// It is preferrable to reuse a compiler if possible in order to reuse - /// allocations. - fn clear(&self) { - self.states.borrow_mut().clear(); - // We don't need to clear anything else since they are cleared on - // their own and only when they are used. - } - - /// Configure this compiler from the builder's knobs. - /// - /// The compiler is always reconfigured by the builder before using it to - /// build an NFA. - fn configure(&mut self, config: Config) { - self.config = config; - } - - /// Convert the current intermediate NFA to its final compiled form. - fn compile(&self, nfa: &mut NFA, expr: &Hir) -> Result<()> { - nfa.anchored = self.config.anchored; - - let mut start = self.add_empty(); - if !nfa.anchored { - let compiled = if self.config.allow_invalid_utf8 { - self.c_unanchored_prefix_invalid_utf8()? - } else { - self.c_unanchored_prefix_valid_utf8()? - }; - self.patch(start, compiled.start); - start = compiled.end; - } - let compiled = self.c(&expr)?; - let match_id = self.add_match(); - self.patch(start, compiled.start); - self.patch(compiled.end, match_id); - self.finish(nfa); - Ok(()) - } - - /// Finishes the compilation process and populates the provide NFA with - /// the final graph. - fn finish(&self, nfa: &mut NFA) { - let mut bstates = self.states.borrow_mut(); - let mut remap = self.remap.borrow_mut(); - remap.resize(bstates.len(), 0); - let mut empties = self.empties.borrow_mut(); - empties.clear(); - - // We don't reuse allocations here becuase this is what we're - // returning. - nfa.states.clear(); - let mut byteset = ByteClassSet::new(); - - // The idea here is to convert our intermediate states to their final - // form. The only real complexity here is the process of converting - // transitions, which are expressed in terms of state IDs. The new - // set of states will be smaller because of partial epsilon removal, - // so the state IDs will not be the same. - for (id, bstate) in bstates.iter_mut().enumerate() { - match *bstate { - CState::Empty { next } => { - // Since we're removing empty states, we need to handle - // them later since we don't yet know which new state this - // empty state will be mapped to. - empties.push((id, next)); - } - CState::Range { ref range } => { - remap[id] = nfa.states.len(); - byteset.set_range(range.start, range.end); - nfa.states.push(State::Range { range: range.clone() }); - } - CState::Sparse { ref mut ranges } => { - remap[id] = nfa.states.len(); - - let ranges = mem::replace(ranges, vec![]); - for r in &ranges { - byteset.set_range(r.start, r.end); - } - nfa.states.push(State::Sparse { - ranges: ranges.into_boxed_slice(), - }); - } - CState::Union { ref mut alternates } => { - remap[id] = nfa.states.len(); - - let alternates = mem::replace(alternates, vec![]); - nfa.states.push(State::Union { - alternates: alternates.into_boxed_slice(), - }); - } - CState::UnionReverse { ref mut alternates } => { - remap[id] = nfa.states.len(); - - let mut alternates = mem::replace(alternates, vec![]); - alternates.reverse(); - nfa.states.push(State::Union { - alternates: alternates.into_boxed_slice(), - }); - } - CState::Match => { - remap[id] = nfa.states.len(); - nfa.states.push(State::Match); - } - } - } - for &(empty_id, mut empty_next) in empties.iter() { - // empty states can point to other empty states, forming a chain. - // So we must follow the chain until the end, which must end at - // a non-empty state, and therefore, a state that is correctly - // remapped. We are guaranteed to terminate because our compiler - // never builds a loop among empty states. - while let CState::Empty { next } = bstates[empty_next] { - empty_next = next; - } - remap[empty_id] = remap[empty_next]; - } - for state in &mut nfa.states { - state.remap(&remap); - } - // The compiler always begins the NFA at the first state. - nfa.start = remap[0]; - nfa.byte_classes = byteset.byte_classes(); - } - - fn c(&self, expr: &Hir) -> Result<ThompsonRef> { - match *expr.kind() { - HirKind::Empty => { - let id = self.add_empty(); - Ok(ThompsonRef { start: id, end: id }) - } - HirKind::Literal(hir::Literal::Unicode(ch)) => { - let mut buf = [0; 4]; - let it = ch - .encode_utf8(&mut buf) - .as_bytes() - .iter() - .map(|&b| Ok(self.c_range(b, b))); - self.c_concat(it) - } - HirKind::Literal(hir::Literal::Byte(b)) => Ok(self.c_range(b, b)), - HirKind::Class(hir::Class::Bytes(ref cls)) => { - self.c_byte_class(cls) - } - HirKind::Class(hir::Class::Unicode(ref cls)) => { - self.c_unicode_class(cls) - } - HirKind::Repetition(ref rep) => self.c_repetition(rep), - HirKind::Group(ref group) => self.c(&*group.hir), - HirKind::Concat(ref exprs) => { - self.c_concat(exprs.iter().map(|e| self.c(e))) - } - HirKind::Alternation(ref exprs) => { - self.c_alternation(exprs.iter().map(|e| self.c(e))) - } - HirKind::Anchor(_) => Err(Error::unsupported_anchor()), - HirKind::WordBoundary(_) => Err(Error::unsupported_word()), - } - } - - fn c_concat<I>(&self, mut it: I) -> Result<ThompsonRef> - where - I: DoubleEndedIterator<Item = Result<ThompsonRef>>, - { - let first = - if self.config.reverse { it.next_back() } else { it.next() }; - let ThompsonRef { start, mut end } = match first { - Some(result) => result?, - None => return Ok(self.c_empty()), - }; - loop { - let next = - if self.config.reverse { it.next_back() } else { it.next() }; - let compiled = match next { - Some(result) => result?, - None => break, - }; - self.patch(end, compiled.start); - end = compiled.end; - } - Ok(ThompsonRef { start, end }) - } - - fn c_alternation<I>(&self, mut it: I) -> Result<ThompsonRef> - where - I: Iterator<Item = Result<ThompsonRef>>, - { - let first = it.next().expect("alternations must be non-empty")?; - let second = match it.next() { - None => return Ok(first), - Some(result) => result?, - }; - - let union = self.add_union(); - let end = self.add_empty(); - self.patch(union, first.start); - self.patch(first.end, end); - self.patch(union, second.start); - self.patch(second.end, end); - for result in it { - let compiled = result?; - self.patch(union, compiled.start); - self.patch(compiled.end, end); - } - Ok(ThompsonRef { start: union, end }) - } - - fn c_repetition(&self, rep: &hir::Repetition) -> Result<ThompsonRef> { - match rep.kind { - hir::RepetitionKind::ZeroOrOne => { - self.c_zero_or_one(&rep.hir, rep.greedy) - } - hir::RepetitionKind::ZeroOrMore => { - self.c_at_least(&rep.hir, rep.greedy, 0) - } - hir::RepetitionKind::OneOrMore => { - self.c_at_least(&rep.hir, rep.greedy, 1) - } - hir::RepetitionKind::Range(ref rng) => match *rng { - hir::RepetitionRange::Exactly(count) => { - self.c_exactly(&rep.hir, count) - } - hir::RepetitionRange::AtLeast(m) => { - self.c_at_least(&rep.hir, rep.greedy, m) - } - hir::RepetitionRange::Bounded(min, max) => { - self.c_bounded(&rep.hir, rep.greedy, min, max) - } - }, - } - } - - fn c_bounded( - &self, - expr: &Hir, - greedy: bool, - min: u32, - max: u32, - ) -> Result<ThompsonRef> { - let prefix = self.c_exactly(expr, min)?; - if min == max { - return Ok(prefix); - } - - // It is tempting here to compile the rest here as a concatenation - // of zero-or-one matches. i.e., for `a{2,5}`, compile it as if it - // were `aaa?a?a?`. The problem here is that it leads to this program: - // - // >000000: 61 => 01 - // 000001: 61 => 02 - // 000002: alt(03, 04) - // 000003: 61 => 04 - // 000004: alt(05, 06) - // 000005: 61 => 06 - // 000006: alt(07, 08) - // 000007: 61 => 08 - // 000008: MATCH - // - // And effectively, once you hit state 2, the epsilon closure will - // include states 3, 5, 5, 6, 7 and 8, which is quite a bit. It is - // better to instead compile it like so: - // - // >000000: 61 => 01 - // 000001: 61 => 02 - // 000002: alt(03, 08) - // 000003: 61 => 04 - // 000004: alt(05, 08) - // 000005: 61 => 06 - // 000006: alt(07, 08) - // 000007: 61 => 08 - // 000008: MATCH - // - // So that the epsilon closure of state 2 is now just 3 and 8. - let empty = self.add_empty(); - let mut prev_end = prefix.end; - for _ in min..max { - let union = if greedy { - self.add_union() - } else { - self.add_reverse_union() - }; - let compiled = self.c(expr)?; - self.patch(prev_end, union); - self.patch(union, compiled.start); - self.patch(union, empty); - prev_end = compiled.end; - } - self.patch(prev_end, empty); - Ok(ThompsonRef { start: prefix.start, end: empty }) - } - - fn c_at_least( - &self, - expr: &Hir, - greedy: bool, - n: u32, - ) -> Result<ThompsonRef> { - if n == 0 { - let union = if greedy { - self.add_union() - } else { - self.add_reverse_union() - }; - let compiled = self.c(expr)?; - self.patch(union, compiled.start); - self.patch(compiled.end, union); - Ok(ThompsonRef { start: union, end: union }) - } else if n == 1 { - let compiled = self.c(expr)?; - let union = if greedy { - self.add_union() - } else { - self.add_reverse_union() - }; - self.patch(compiled.end, union); - self.patch(union, compiled.start); - Ok(ThompsonRef { start: compiled.start, end: union }) - } else { - let prefix = self.c_exactly(expr, n - 1)?; - let last = self.c(expr)?; - let union = if greedy { - self.add_union() - } else { - self.add_reverse_union() - }; - self.patch(prefix.end, last.start); - self.patch(last.end, union); - self.patch(union, last.start); - Ok(ThompsonRef { start: prefix.start, end: union }) - } - } - - fn c_zero_or_one(&self, expr: &Hir, greedy: bool) -> Result<ThompsonRef> { - let union = - if greedy { self.add_union() } else { self.add_reverse_union() }; - let compiled = self.c(expr)?; - let empty = self.add_empty(); - self.patch(union, compiled.start); - self.patch(union, empty); - self.patch(compiled.end, empty); - Ok(ThompsonRef { start: union, end: empty }) - } - - fn c_exactly(&self, expr: &Hir, n: u32) -> Result<ThompsonRef> { - let it = (0..n).map(|_| self.c(expr)); - self.c_concat(it) - } - - fn c_byte_class(&self, cls: &hir::ClassBytes) -> Result<ThompsonRef> { - let end = self.add_empty(); - let mut trans = Vec::with_capacity(cls.ranges().len()); - for r in cls.iter() { - trans.push(Transition { - start: r.start(), - end: r.end(), - next: end, - }); - } - Ok(ThompsonRef { start: self.add_sparse(trans), end }) - } - - fn c_unicode_class(&self, cls: &hir::ClassUnicode) -> Result<ThompsonRef> { - // If all we have are ASCII ranges wrapped in a Unicode package, then - // there is zero reason to bring out the big guns. We can fit all ASCII - // ranges within a single sparse transition. - if cls.is_all_ascii() { - let end = self.add_empty(); - let mut trans = Vec::with_capacity(cls.ranges().len()); - for r in cls.iter() { - assert!(r.start() <= '\x7F'); - assert!(r.end() <= '\x7F'); - trans.push(Transition { - start: r.start() as u8, - end: r.end() as u8, - next: end, - }); - } - Ok(ThompsonRef { start: self.add_sparse(trans), end }) - } else if self.config.reverse { - if !self.config.shrink { - // When we don't want to spend the extra time shrinking, we - // compile the UTF-8 automaton in reverse using something like - // the "naive" approach, but will attempt to re-use common - // suffixes. - self.c_unicode_class_reverse_with_suffix(cls) - } else { - // When we want to shrink our NFA for reverse UTF-8 automata, - // we cannot feed UTF-8 sequences directly to the UTF-8 - // compiler, since the UTF-8 compiler requires all sequences - // to be lexicographically sorted. Instead, we organize our - // sequences into a range trie, which can then output our - // sequences in the correct order. Unfortunately, building the - // range trie is fairly expensive (but not nearly as expensive - // as building a DFA). Hence the reason why the 'shrink' option - // exists, so that this path can be toggled off. - let mut trie = self.trie_state.borrow_mut(); - trie.clear(); - - for rng in cls.iter() { - for mut seq in Utf8Sequences::new(rng.start(), rng.end()) { - seq.reverse(); - trie.insert(seq.as_slice()); - } - } - let mut utf8_state = self.utf8_state.borrow_mut(); - let mut utf8c = Utf8Compiler::new(self, &mut *utf8_state); - trie.iter(|seq| { - utf8c.add(&seq); - }); - Ok(utf8c.finish()) - } - } else { - // In the forward direction, we always shrink our UTF-8 automata - // because we can stream it right into the UTF-8 compiler. There - // is almost no downside (in either memory or time) to using this - // approach. - let mut utf8_state = self.utf8_state.borrow_mut(); - let mut utf8c = Utf8Compiler::new(self, &mut *utf8_state); - for rng in cls.iter() { - for seq in Utf8Sequences::new(rng.start(), rng.end()) { - utf8c.add(seq.as_slice()); - } - } - Ok(utf8c.finish()) - } - - // For reference, the code below is the "naive" version of compiling a - // UTF-8 automaton. It is deliciously simple (and works for both the - // forward and reverse cases), but will unfortunately produce very - // large NFAs. When compiling a forward automaton, the size difference - // can sometimes be an order of magnitude. For example, the '\w' regex - // will generate about ~3000 NFA states using the naive approach below, - // but only 283 states when using the approach above. This is because - // the approach above actually compiles a *minimal* (or near minimal, - // because of the bounded hashmap) UTF-8 automaton. - // - // The code below is kept as a reference point in order to make it - // easier to understand the higher level goal here. - /* - let it = cls - .iter() - .flat_map(|rng| Utf8Sequences::new(rng.start(), rng.end())) - .map(|seq| { - let it = seq - .as_slice() - .iter() - .map(|rng| Ok(self.c_range(rng.start, rng.end))); - self.c_concat(it) - }); - self.c_alternation(it); - */ - } - - fn c_unicode_class_reverse_with_suffix( - &self, - cls: &hir::ClassUnicode, - ) -> Result<ThompsonRef> { - // N.B. It would likely be better to cache common *prefixes* in the - // reverse direction, but it's not quite clear how to do that. The - // advantage of caching suffixes is that it does give us a win, and - // has a very small additional overhead. - let mut cache = self.utf8_suffix.borrow_mut(); - cache.clear(); - - let union = self.add_union(); - let alt_end = self.add_empty(); - for urng in cls.iter() { - for seq in Utf8Sequences::new(urng.start(), urng.end()) { - let mut end = alt_end; - for brng in seq.as_slice() { - let key = Utf8SuffixKey { - from: end, - start: brng.start, - end: brng.end, - }; - let hash = cache.hash(&key); - if let Some(id) = cache.get(&key, hash) { - end = id; - continue; - } - - let compiled = self.c_range(brng.start, brng.end); - self.patch(compiled.end, end); - end = compiled.start; - cache.set(key, hash, end); - } - self.patch(union, end); - } - } - Ok(ThompsonRef { start: union, end: alt_end }) - } - - fn c_range(&self, start: u8, end: u8) -> ThompsonRef { - let id = self.add_range(start, end); - ThompsonRef { start: id, end: id } - } - - fn c_empty(&self) -> ThompsonRef { - let id = self.add_empty(); - ThompsonRef { start: id, end: id } - } - - fn c_unanchored_prefix_valid_utf8(&self) -> Result<ThompsonRef> { - self.c(&Hir::repetition(hir::Repetition { - kind: hir::RepetitionKind::ZeroOrMore, - greedy: false, - hir: Box::new(Hir::any(false)), - })) - } - - fn c_unanchored_prefix_invalid_utf8(&self) -> Result<ThompsonRef> { - self.c(&Hir::repetition(hir::Repetition { - kind: hir::RepetitionKind::ZeroOrMore, - greedy: false, - hir: Box::new(Hir::any(true)), - })) - } - - fn patch(&self, from: StateID, to: StateID) { - match self.states.borrow_mut()[from] { - CState::Empty { ref mut next } => { - *next = to; - } - CState::Range { ref mut range } => { - range.next = to; - } - CState::Sparse { .. } => { - panic!("cannot patch from a sparse NFA state") - } - CState::Union { ref mut alternates } => { - alternates.push(to); - } - CState::UnionReverse { ref mut alternates } => { - alternates.push(to); - } - CState::Match => {} - } - } - - fn add_empty(&self) -> StateID { - let id = self.states.borrow().len(); - self.states.borrow_mut().push(CState::Empty { next: 0 }); - id - } - - fn add_range(&self, start: u8, end: u8) -> StateID { - let id = self.states.borrow().len(); - let trans = Transition { start, end, next: 0 }; - let state = CState::Range { range: trans }; - self.states.borrow_mut().push(state); - id - } - - fn add_sparse(&self, ranges: Vec<Transition>) -> StateID { - if ranges.len() == 1 { - let id = self.states.borrow().len(); - let state = CState::Range { range: ranges[0] }; - self.states.borrow_mut().push(state); - return id; - } - let id = self.states.borrow().len(); - let state = CState::Sparse { ranges }; - self.states.borrow_mut().push(state); - id - } - - fn add_union(&self) -> StateID { - let id = self.states.borrow().len(); - let state = CState::Union { alternates: vec![] }; - self.states.borrow_mut().push(state); - id - } - - fn add_reverse_union(&self) -> StateID { - let id = self.states.borrow().len(); - let state = CState::UnionReverse { alternates: vec![] }; - self.states.borrow_mut().push(state); - id - } - - fn add_match(&self) -> StateID { - let id = self.states.borrow().len(); - self.states.borrow_mut().push(CState::Match); - id - } -} - -#[derive(Debug)] -struct Utf8Compiler<'a> { - nfac: &'a Compiler, - state: &'a mut Utf8State, - target: StateID, -} - -#[derive(Clone, Debug)] -struct Utf8State { - compiled: Utf8BoundedMap, - uncompiled: Vec<Utf8Node>, -} - -#[derive(Clone, Debug)] -struct Utf8Node { - trans: Vec<Transition>, - last: Option<Utf8LastTransition>, -} - -#[derive(Clone, Debug)] -struct Utf8LastTransition { - start: u8, - end: u8, -} - -impl Utf8State { - fn new() -> Utf8State { - Utf8State { compiled: Utf8BoundedMap::new(5000), uncompiled: vec![] } - } - - fn clear(&mut self) { - self.compiled.clear(); - self.uncompiled.clear(); - } -} - -impl<'a> Utf8Compiler<'a> { - fn new(nfac: &'a Compiler, state: &'a mut Utf8State) -> Utf8Compiler<'a> { - let target = nfac.add_empty(); - state.clear(); - let mut utf8c = Utf8Compiler { nfac, state, target }; - utf8c.add_empty(); - utf8c - } - - fn finish(&mut self) -> ThompsonRef { - self.compile_from(0); - let node = self.pop_root(); - let start = self.compile(node); - ThompsonRef { start, end: self.target } - } - - fn add(&mut self, ranges: &[Utf8Range]) { - let prefix_len = ranges - .iter() - .zip(&self.state.uncompiled) - .take_while(|&(range, node)| { - node.last.as_ref().map_or(false, |t| { - (t.start, t.end) == (range.start, range.end) - }) - }) - .count(); - assert!(prefix_len < ranges.len()); - self.compile_from(prefix_len); - self.add_suffix(&ranges[prefix_len..]); - } - - fn compile_from(&mut self, from: usize) { - let mut next = self.target; - while from + 1 < self.state.uncompiled.len() { - let node = self.pop_freeze(next); - next = self.compile(node); - } - self.top_last_freeze(next); - } - - fn compile(&mut self, node: Vec<Transition>) -> StateID { - let hash = self.state.compiled.hash(&node); - if let Some(id) = self.state.compiled.get(&node, hash) { - return id; - } - let id = self.nfac.add_sparse(node.clone()); - self.state.compiled.set(node, hash, id); - id - } - - fn add_suffix(&mut self, ranges: &[Utf8Range]) { - assert!(!ranges.is_empty()); - let last = self - .state - .uncompiled - .len() - .checked_sub(1) - .expect("non-empty nodes"); - assert!(self.state.uncompiled[last].last.is_none()); - self.state.uncompiled[last].last = Some(Utf8LastTransition { - start: ranges[0].start, - end: ranges[0].end, - }); - for r in &ranges[1..] { - self.state.uncompiled.push(Utf8Node { - trans: vec![], - last: Some(Utf8LastTransition { start: r.start, end: r.end }), - }); - } - } - - fn add_empty(&mut self) { - self.state.uncompiled.push(Utf8Node { trans: vec![], last: None }); - } - - fn pop_freeze(&mut self, next: StateID) -> Vec<Transition> { - let mut uncompiled = self.state.uncompiled.pop().unwrap(); - uncompiled.set_last_transition(next); - uncompiled.trans - } - - fn pop_root(&mut self) -> Vec<Transition> { - assert_eq!(self.state.uncompiled.len(), 1); - assert!(self.state.uncompiled[0].last.is_none()); - self.state.uncompiled.pop().expect("non-empty nodes").trans - } - - fn top_last_freeze(&mut self, next: StateID) { - let last = self - .state - .uncompiled - .len() - .checked_sub(1) - .expect("non-empty nodes"); - self.state.uncompiled[last].set_last_transition(next); - } -} - -impl Utf8Node { - fn set_last_transition(&mut self, next: StateID) { - if let Some(last) = self.last.take() { - self.trans.push(Transition { - start: last.start, - end: last.end, - next, - }); - } - } -} - -#[cfg(test)] -mod tests { - use regex_syntax::hir::Hir; - use regex_syntax::ParserBuilder; - - use super::{Builder, State, StateID, Transition, NFA}; - - fn parse(pattern: &str) -> Hir { - ParserBuilder::new().build().parse(pattern).unwrap() - } - - fn build(pattern: &str) -> NFA { - Builder::new().anchored(true).build(&parse(pattern)).unwrap() - } - - fn s_byte(byte: u8, next: StateID) -> State { - let trans = Transition { start: byte, end: byte, next }; - State::Range { range: trans } - } - - fn s_range(start: u8, end: u8, next: StateID) -> State { - let trans = Transition { start, end, next }; - State::Range { range: trans } - } - - fn s_sparse(ranges: &[(u8, u8, StateID)]) -> State { - let ranges = ranges - .iter() - .map(|&(start, end, next)| Transition { start, end, next }) - .collect(); - State::Sparse { ranges } - } - - fn s_union(alts: &[StateID]) -> State { - State::Union { alternates: alts.to_vec().into_boxed_slice() } - } - - fn s_match() -> State { - State::Match - } - - #[test] - fn errors() { - // unsupported anchors - assert!(Builder::new().build(&parse(r"^")).is_err()); - assert!(Builder::new().build(&parse(r"$")).is_err()); - assert!(Builder::new().build(&parse(r"\A")).is_err()); - assert!(Builder::new().build(&parse(r"\z")).is_err()); - - // unsupported word boundaries - assert!(Builder::new().build(&parse(r"\b")).is_err()); - assert!(Builder::new().build(&parse(r"\B")).is_err()); - assert!(Builder::new().build(&parse(r"(?-u)\b")).is_err()); - } - - // Test that building an unanchored NFA has an appropriate `.*?` prefix. - #[test] - fn compile_unanchored_prefix() { - // When the machine can only match valid UTF-8. - let nfa = Builder::new().anchored(false).build(&parse(r"a")).unwrap(); - // There should be many states since the `.` in `.*?` matches any - // Unicode scalar value. - assert_eq!(11, nfa.len()); - assert_eq!(nfa.states[10], s_match()); - assert_eq!(nfa.states[9], s_byte(b'a', 10)); - - // When the machine can match invalid UTF-8. - let nfa = Builder::new() - .anchored(false) - .allow_invalid_utf8(true) - .build(&parse(r"a")) - .unwrap(); - assert_eq!( - nfa.states, - &[ - s_union(&[2, 1]), - s_range(0, 255, 0), - s_byte(b'a', 3), - s_match(), - ] - ); - } - - #[test] - fn compile_empty() { - assert_eq!(build("").states, &[s_match(),]); - } - - #[test] - fn compile_literal() { - assert_eq!(build("a").states, &[s_byte(b'a', 1), s_match(),]); - assert_eq!( - build("ab").states, - &[s_byte(b'a', 1), s_byte(b'b', 2), s_match(),] - ); - assert_eq!( - build("ā").states, - &[s_byte(0xE2, 1), s_byte(0x98, 2), s_byte(0x83, 3), s_match(),] - ); - - // Check that non-UTF-8 literals work. - let hir = ParserBuilder::new() - .allow_invalid_utf8(true) - .build() - .parse(r"(?-u)\xFF") - .unwrap(); - let nfa = Builder::new() - .anchored(true) - .allow_invalid_utf8(true) - .build(&hir) - .unwrap(); - assert_eq!(nfa.states, &[s_byte(b'\xFF', 1), s_match(),]); - } - - #[test] - fn compile_class() { - assert_eq!( - build(r"[a-z]").states, - &[s_range(b'a', b'z', 1), s_match(),] - ); - assert_eq!( - build(r"[x-za-c]").states, - &[s_sparse(&[(b'a', b'c', 1), (b'x', b'z', 1)]), s_match()] - ); - assert_eq!( - build(r"[\u03B1-\u03B4]").states, - &[s_range(0xB1, 0xB4, 2), s_byte(0xCE, 0), s_match()] - ); - assert_eq!( - build(r"[\u03B1-\u03B4\u{1F919}-\u{1F91E}]").states, - &[ - s_range(0xB1, 0xB4, 5), - s_range(0x99, 0x9E, 5), - s_byte(0xA4, 1), - s_byte(0x9F, 2), - s_sparse(&[(0xCE, 0xCE, 0), (0xF0, 0xF0, 3)]), - s_match(), - ] - ); - assert_eq!( - build(r"[a-zā]").states, - &[ - s_byte(0x83, 3), - s_byte(0x98, 0), - s_sparse(&[(b'a', b'z', 3), (0xE2, 0xE2, 1)]), - s_match(), - ] - ); - } - - #[test] - fn compile_repetition() { - assert_eq!( - build(r"a?").states, - &[s_union(&[1, 2]), s_byte(b'a', 2), s_match(),] - ); - assert_eq!( - build(r"a??").states, - &[s_union(&[2, 1]), s_byte(b'a', 2), s_match(),] - ); - } - - #[test] - fn compile_group() { - assert_eq!( - build(r"ab+").states, - &[s_byte(b'a', 1), s_byte(b'b', 2), s_union(&[1, 3]), s_match(),] - ); - assert_eq!( - build(r"(ab)").states, - &[s_byte(b'a', 1), s_byte(b'b', 2), s_match(),] - ); - assert_eq!( - build(r"(ab)+").states, - &[s_byte(b'a', 1), s_byte(b'b', 2), s_union(&[0, 3]), s_match(),] - ); - } - - #[test] - fn compile_alternation() { - assert_eq!( - build(r"a|b").states, - &[s_byte(b'a', 3), s_byte(b'b', 3), s_union(&[0, 1]), s_match(),] - ); - assert_eq!( - build(r"|b").states, - &[s_byte(b'b', 2), s_union(&[2, 0]), s_match(),] - ); - assert_eq!( - build(r"a|").states, - &[s_byte(b'a', 2), s_union(&[0, 2]), s_match(),] - ); - } -} |