diff options
Diffstat (limited to 'vendor/regex-automata/src/nfa')
-rw-r--r-- | vendor/regex-automata/src/nfa/compiler.rs | 1193 | ||||
-rw-r--r-- | vendor/regex-automata/src/nfa/map.rs | 282 | ||||
-rw-r--r-- | vendor/regex-automata/src/nfa/mod.rs | 252 | ||||
-rw-r--r-- | vendor/regex-automata/src/nfa/range_trie.rs | 1048 |
4 files changed, 2775 insertions, 0 deletions
diff --git a/vendor/regex-automata/src/nfa/compiler.rs b/vendor/regex-automata/src/nfa/compiler.rs new file mode 100644 index 000000000..d9b3945b3 --- /dev/null +++ b/vendor/regex-automata/src/nfa/compiler.rs @@ -0,0 +1,1193 @@ +// This module provides an NFA compiler using Thompson's construction +// algorithm. The compiler takes a regex-syntax::Hir as input and emits an NFA +// graph as output. The NFA graph is structured in a way that permits it to be +// executed by a virtual machine and also used to efficiently build a DFA. +// +// The compiler deals with a slightly expanded set of NFA states that notably +// includes an empty node that has exactly one epsilon transition to the next +// state. In other words, it's a "goto" instruction if one views Thompson's NFA +// as a set of bytecode instructions. These goto instructions are removed in +// a subsequent phase before returning the NFA to the caller. The purpose of +// these empty nodes is that they make the construction algorithm substantially +// simpler to implement. We remove them before returning to the caller because +// they can represent substantial overhead when traversing the NFA graph +// (either while searching using the NFA directly or while building a DFA). +// +// In the future, it would be nice to provide a Glushkov compiler as well, +// as it would work well as a bit-parallel NFA for smaller regexes. But +// the Thompson construction is one I'm more familiar with and seems more +// straight-forward to deal with when it comes to large Unicode character +// classes. +// +// Internally, the compiler uses interior mutability to improve composition +// in the face of the borrow checker. In particular, we'd really like to be +// able to write things like this: +// +// self.c_concat(exprs.iter().map(|e| self.c(e))) +// +// Which elegantly uses iterators to build up a sequence of compiled regex +// sub-expressions and then hands it off to the concatenating compiler +// routine. 
Without interior mutability, the borrow checker won't let us +// borrow `self` mutably both inside and outside the closure at the same +// time. + +use std::cell::RefCell; +use std::mem; + +use regex_syntax::hir::{self, Hir, HirKind}; +use regex_syntax::utf8::{Utf8Range, Utf8Sequences}; + +use classes::ByteClassSet; +use error::{Error, Result}; +use nfa::map::{Utf8BoundedMap, Utf8SuffixKey, Utf8SuffixMap}; +use nfa::range_trie::RangeTrie; +use nfa::{State, StateID, Transition, NFA}; + +/// Config knobs for the NFA compiler. See the builder's methods for more +/// docs on each one. +#[derive(Clone, Copy, Debug)] +struct Config { + anchored: bool, + allow_invalid_utf8: bool, + reverse: bool, + shrink: bool, +} + +impl Default for Config { + fn default() -> Config { + Config { + anchored: false, + allow_invalid_utf8: false, + reverse: false, + shrink: true, + } + } +} + +/// A builder for compiling an NFA. +#[derive(Clone, Debug)] +pub struct Builder { + config: Config, +} + +impl Builder { + /// Create a new NFA builder with its default configuration. + pub fn new() -> Builder { + Builder { config: Config::default() } + } + + /// Compile the given high level intermediate representation of a regular + /// expression into an NFA. + /// + /// If there was a problem building the NFA, then an error is returned. + /// For example, if the regex uses unsupported features (such as zero-width + /// assertions), then an error is returned. + pub fn build(&self, expr: &Hir) -> Result<NFA> { + let mut nfa = NFA::always_match(); + self.build_with(&mut Compiler::new(), &mut nfa, expr)?; + Ok(nfa) + } + + /// Compile the given high level intermediate representation of a regular + /// expression into the NFA given using the given compiler. Callers may + /// prefer this over `build` if they would like to reuse allocations while + /// compiling many regular expressions. + /// + /// On success, the given NFA is completely overwritten with the NFA + /// produced by the compiler. 
+ /// + /// If there was a problem building the NFA, then an error is returned. For + /// example, if the regex uses unsupported features (such as zero-width + /// assertions), then an error is returned. When an error is returned, + /// the contents of `nfa` are unspecified and should not be relied upon. + /// However, it can still be reused in subsequent calls to this method. + pub fn build_with( + &self, + compiler: &mut Compiler, + nfa: &mut NFA, + expr: &Hir, + ) -> Result<()> { + compiler.clear(); + compiler.configure(self.config); + compiler.compile(nfa, expr) + } + + /// Set whether matching must be anchored at the beginning of the input. + /// + /// When enabled, a match must begin at the start of the input. When + /// disabled, the NFA will act as if the pattern started with a `.*?`, + /// which enables a match to appear anywhere. + /// + /// By default this is disabled. + pub fn anchored(&mut self, yes: bool) -> &mut Builder { + self.config.anchored = yes; + self + } + + /// When enabled, the builder will permit the construction of an NFA that + /// may match invalid UTF-8. + /// + /// When disabled (the default), the builder is guaranteed to produce a + /// regex that will only ever match valid UTF-8 (otherwise, the builder + /// will return an error). + pub fn allow_invalid_utf8(&mut self, yes: bool) -> &mut Builder { + self.config.allow_invalid_utf8 = yes; + self + } + + /// Reverse the NFA. + /// + /// A NFA reversal is performed by reversing all of the concatenated + /// sub-expressions in the original pattern, recursively. The resulting + /// NFA can be used to match the pattern starting from the end of a string + /// instead of the beginning of a string. + /// + /// Reversing the NFA is useful for building a reverse DFA, which is most + /// useful for finding the start of a match. 
+ pub fn reverse(&mut self, yes: bool) -> &mut Builder { + self.config.reverse = yes; + self + } + + /// Apply best effort heuristics to shrink the NFA at the expense of more + /// time/memory. + /// + /// This is enabled by default. Generally speaking, if one is using an NFA + /// to compile DFA, then the extra time used to shrink the NFA will be + /// more than made up for during DFA construction (potentially by a lot). + /// In other words, enabling this can substantially decrease the overall + /// amount of time it takes to build a DFA. + /// + /// The only reason to disable this if you want to compile an NFA and start + /// using it as quickly as possible without needing to build a DFA. + pub fn shrink(&mut self, yes: bool) -> &mut Builder { + self.config.shrink = yes; + self + } +} + +/// A compiler that converts a regex abstract syntax to an NFA via Thompson's +/// construction. Namely, this compiler permits epsilon transitions between +/// states. +/// +/// Users of this crate cannot use a compiler directly. Instead, all one can +/// do is create one and use it via the +/// [`Builder::build_with`](struct.Builder.html#method.build_with) +/// method. This permits callers to reuse compilers in order to amortize +/// allocations. +#[derive(Clone, Debug)] +pub struct Compiler { + /// The set of compiled NFA states. Once a state is compiled, it is + /// assigned a state ID equivalent to its index in this list. Subsequent + /// compilation can modify previous states by adding new transitions. + states: RefCell<Vec<CState>>, + /// The configuration from the builder. + config: Config, + /// State used for compiling character classes to UTF-8 byte automata. + /// State is not retained between character class compilations. This just + /// serves to amortize allocation to the extent possible. + utf8_state: RefCell<Utf8State>, + /// State used for arranging character classes in reverse into a trie. 
+ trie_state: RefCell<RangeTrie>, + /// State used for caching common suffixes when compiling reverse UTF-8 + /// automata (for Unicode character classes). + utf8_suffix: RefCell<Utf8SuffixMap>, + /// A map used to re-map state IDs when translating the compiler's internal + /// NFA state representation to the external NFA representation. + remap: RefCell<Vec<StateID>>, + /// A set of compiler internal state IDs that correspond to states that are + /// exclusively epsilon transitions, i.e., goto instructions, combined with + /// the state that they point to. This is used to record said states while + /// transforming the compiler's internal NFA representation to the external + /// form. + empties: RefCell<Vec<(StateID, StateID)>>, +} + +/// A compiler intermediate state representation for an NFA that is only used +/// during compilation. Once compilation is done, `CState`s are converted to +/// `State`s, which have a much simpler representation. +#[derive(Clone, Debug, Eq, PartialEq)] +enum CState { + /// An empty state whose only purpose is to forward the automaton to + /// another state via en epsilon transition. These are useful during + /// compilation but are otherwise removed at the end. + Empty { next: StateID }, + /// A state that only transitions to `next` if the current input byte is + /// in the range `[start, end]` (inclusive on both ends). + Range { range: Transition }, + /// A state with possibly many transitions, represented in a sparse + /// fashion. Transitions are ordered lexicographically by input range. + /// As such, this may only be used when every transition has equal + /// priority. (In practice, this is only used for encoding large UTF-8 + /// automata.) + Sparse { ranges: Vec<Transition> }, + /// An alternation such that there exists an epsilon transition to all + /// states in `alternates`, where matches found via earlier transitions + /// are preferred over later transitions. 
+ Union { alternates: Vec<StateID> }, + /// An alternation such that there exists an epsilon transition to all + /// states in `alternates`, where matches found via later transitions + /// are preferred over earlier transitions. + /// + /// This "reverse" state exists for convenience during compilation that + /// permits easy construction of non-greedy combinations of NFA states. + /// At the end of compilation, Union and UnionReverse states are merged + /// into one Union type of state, where the latter has its epsilon + /// transitions reversed to reflect the priority inversion. + UnionReverse { alternates: Vec<StateID> }, + /// A match state. There is exactly one such occurrence of this state in + /// an NFA. + Match, +} + +/// A value that represents the result of compiling a sub-expression of a +/// regex's HIR. Specifically, this represents a sub-graph of the NFA that +/// has an initial state at `start` and a final state at `end`. +#[derive(Clone, Copy, Debug)] +pub struct ThompsonRef { + start: StateID, + end: StateID, +} + +impl Compiler { + /// Create a new compiler. + pub fn new() -> Compiler { + Compiler { + states: RefCell::new(vec![]), + config: Config::default(), + utf8_state: RefCell::new(Utf8State::new()), + trie_state: RefCell::new(RangeTrie::new()), + utf8_suffix: RefCell::new(Utf8SuffixMap::new(1000)), + remap: RefCell::new(vec![]), + empties: RefCell::new(vec![]), + } + } + + /// Clear any memory used by this compiler such that it is ready to compile + /// a new regex. + /// + /// It is preferrable to reuse a compiler if possible in order to reuse + /// allocations. + fn clear(&self) { + self.states.borrow_mut().clear(); + // We don't need to clear anything else since they are cleared on + // their own and only when they are used. + } + + /// Configure this compiler from the builder's knobs. + /// + /// The compiler is always reconfigured by the builder before using it to + /// build an NFA. 
+ fn configure(&mut self, config: Config) { + self.config = config; + } + + /// Convert the current intermediate NFA to its final compiled form. + fn compile(&self, nfa: &mut NFA, expr: &Hir) -> Result<()> { + nfa.anchored = self.config.anchored; + + let mut start = self.add_empty(); + if !nfa.anchored { + let compiled = if self.config.allow_invalid_utf8 { + self.c_unanchored_prefix_invalid_utf8()? + } else { + self.c_unanchored_prefix_valid_utf8()? + }; + self.patch(start, compiled.start); + start = compiled.end; + } + let compiled = self.c(&expr)?; + let match_id = self.add_match(); + self.patch(start, compiled.start); + self.patch(compiled.end, match_id); + self.finish(nfa); + Ok(()) + } + + /// Finishes the compilation process and populates the provide NFA with + /// the final graph. + fn finish(&self, nfa: &mut NFA) { + let mut bstates = self.states.borrow_mut(); + let mut remap = self.remap.borrow_mut(); + remap.resize(bstates.len(), 0); + let mut empties = self.empties.borrow_mut(); + empties.clear(); + + // We don't reuse allocations here becuase this is what we're + // returning. + nfa.states.clear(); + let mut byteset = ByteClassSet::new(); + + // The idea here is to convert our intermediate states to their final + // form. The only real complexity here is the process of converting + // transitions, which are expressed in terms of state IDs. The new + // set of states will be smaller because of partial epsilon removal, + // so the state IDs will not be the same. + for (id, bstate) in bstates.iter_mut().enumerate() { + match *bstate { + CState::Empty { next } => { + // Since we're removing empty states, we need to handle + // them later since we don't yet know which new state this + // empty state will be mapped to. 
+ empties.push((id, next)); + } + CState::Range { ref range } => { + remap[id] = nfa.states.len(); + byteset.set_range(range.start, range.end); + nfa.states.push(State::Range { range: range.clone() }); + } + CState::Sparse { ref mut ranges } => { + remap[id] = nfa.states.len(); + + let ranges = mem::replace(ranges, vec![]); + for r in &ranges { + byteset.set_range(r.start, r.end); + } + nfa.states.push(State::Sparse { + ranges: ranges.into_boxed_slice(), + }); + } + CState::Union { ref mut alternates } => { + remap[id] = nfa.states.len(); + + let alternates = mem::replace(alternates, vec![]); + nfa.states.push(State::Union { + alternates: alternates.into_boxed_slice(), + }); + } + CState::UnionReverse { ref mut alternates } => { + remap[id] = nfa.states.len(); + + let mut alternates = mem::replace(alternates, vec![]); + alternates.reverse(); + nfa.states.push(State::Union { + alternates: alternates.into_boxed_slice(), + }); + } + CState::Match => { + remap[id] = nfa.states.len(); + nfa.states.push(State::Match); + } + } + } + for &(empty_id, mut empty_next) in empties.iter() { + // empty states can point to other empty states, forming a chain. + // So we must follow the chain until the end, which must end at + // a non-empty state, and therefore, a state that is correctly + // remapped. We are guaranteed to terminate because our compiler + // never builds a loop among empty states. + while let CState::Empty { next } = bstates[empty_next] { + empty_next = next; + } + remap[empty_id] = remap[empty_next]; + } + for state in &mut nfa.states { + state.remap(&remap); + } + // The compiler always begins the NFA at the first state. 
+ nfa.start = remap[0]; + nfa.byte_classes = byteset.byte_classes(); + } + + fn c(&self, expr: &Hir) -> Result<ThompsonRef> { + match *expr.kind() { + HirKind::Empty => { + let id = self.add_empty(); + Ok(ThompsonRef { start: id, end: id }) + } + HirKind::Literal(hir::Literal::Unicode(ch)) => { + let mut buf = [0; 4]; + let it = ch + .encode_utf8(&mut buf) + .as_bytes() + .iter() + .map(|&b| Ok(self.c_range(b, b))); + self.c_concat(it) + } + HirKind::Literal(hir::Literal::Byte(b)) => Ok(self.c_range(b, b)), + HirKind::Class(hir::Class::Bytes(ref cls)) => { + self.c_byte_class(cls) + } + HirKind::Class(hir::Class::Unicode(ref cls)) => { + self.c_unicode_class(cls) + } + HirKind::Repetition(ref rep) => self.c_repetition(rep), + HirKind::Group(ref group) => self.c(&*group.hir), + HirKind::Concat(ref exprs) => { + self.c_concat(exprs.iter().map(|e| self.c(e))) + } + HirKind::Alternation(ref exprs) => { + self.c_alternation(exprs.iter().map(|e| self.c(e))) + } + HirKind::Anchor(_) => Err(Error::unsupported_anchor()), + HirKind::WordBoundary(_) => Err(Error::unsupported_word()), + } + } + + fn c_concat<I>(&self, mut it: I) -> Result<ThompsonRef> + where + I: DoubleEndedIterator<Item = Result<ThompsonRef>>, + { + let first = + if self.config.reverse { it.next_back() } else { it.next() }; + let ThompsonRef { start, mut end } = match first { + Some(result) => result?, + None => return Ok(self.c_empty()), + }; + loop { + let next = + if self.config.reverse { it.next_back() } else { it.next() }; + let compiled = match next { + Some(result) => result?, + None => break, + }; + self.patch(end, compiled.start); + end = compiled.end; + } + Ok(ThompsonRef { start, end }) + } + + fn c_alternation<I>(&self, mut it: I) -> Result<ThompsonRef> + where + I: Iterator<Item = Result<ThompsonRef>>, + { + let first = it.next().expect("alternations must be non-empty")?; + let second = match it.next() { + None => return Ok(first), + Some(result) => result?, + }; + + let union = 
self.add_union(); + let end = self.add_empty(); + self.patch(union, first.start); + self.patch(first.end, end); + self.patch(union, second.start); + self.patch(second.end, end); + for result in it { + let compiled = result?; + self.patch(union, compiled.start); + self.patch(compiled.end, end); + } + Ok(ThompsonRef { start: union, end }) + } + + fn c_repetition(&self, rep: &hir::Repetition) -> Result<ThompsonRef> { + match rep.kind { + hir::RepetitionKind::ZeroOrOne => { + self.c_zero_or_one(&rep.hir, rep.greedy) + } + hir::RepetitionKind::ZeroOrMore => { + self.c_at_least(&rep.hir, rep.greedy, 0) + } + hir::RepetitionKind::OneOrMore => { + self.c_at_least(&rep.hir, rep.greedy, 1) + } + hir::RepetitionKind::Range(ref rng) => match *rng { + hir::RepetitionRange::Exactly(count) => { + self.c_exactly(&rep.hir, count) + } + hir::RepetitionRange::AtLeast(m) => { + self.c_at_least(&rep.hir, rep.greedy, m) + } + hir::RepetitionRange::Bounded(min, max) => { + self.c_bounded(&rep.hir, rep.greedy, min, max) + } + }, + } + } + + fn c_bounded( + &self, + expr: &Hir, + greedy: bool, + min: u32, + max: u32, + ) -> Result<ThompsonRef> { + let prefix = self.c_exactly(expr, min)?; + if min == max { + return Ok(prefix); + } + + // It is tempting here to compile the rest here as a concatenation + // of zero-or-one matches. i.e., for `a{2,5}`, compile it as if it + // were `aaa?a?a?`. The problem here is that it leads to this program: + // + // >000000: 61 => 01 + // 000001: 61 => 02 + // 000002: alt(03, 04) + // 000003: 61 => 04 + // 000004: alt(05, 06) + // 000005: 61 => 06 + // 000006: alt(07, 08) + // 000007: 61 => 08 + // 000008: MATCH + // + // And effectively, once you hit state 2, the epsilon closure will + // include states 3, 5, 5, 6, 7 and 8, which is quite a bit. 
It is + // better to instead compile it like so: + // + // >000000: 61 => 01 + // 000001: 61 => 02 + // 000002: alt(03, 08) + // 000003: 61 => 04 + // 000004: alt(05, 08) + // 000005: 61 => 06 + // 000006: alt(07, 08) + // 000007: 61 => 08 + // 000008: MATCH + // + // So that the epsilon closure of state 2 is now just 3 and 8. + let empty = self.add_empty(); + let mut prev_end = prefix.end; + for _ in min..max { + let union = if greedy { + self.add_union() + } else { + self.add_reverse_union() + }; + let compiled = self.c(expr)?; + self.patch(prev_end, union); + self.patch(union, compiled.start); + self.patch(union, empty); + prev_end = compiled.end; + } + self.patch(prev_end, empty); + Ok(ThompsonRef { start: prefix.start, end: empty }) + } + + fn c_at_least( + &self, + expr: &Hir, + greedy: bool, + n: u32, + ) -> Result<ThompsonRef> { + if n == 0 { + let union = if greedy { + self.add_union() + } else { + self.add_reverse_union() + }; + let compiled = self.c(expr)?; + self.patch(union, compiled.start); + self.patch(compiled.end, union); + Ok(ThompsonRef { start: union, end: union }) + } else if n == 1 { + let compiled = self.c(expr)?; + let union = if greedy { + self.add_union() + } else { + self.add_reverse_union() + }; + self.patch(compiled.end, union); + self.patch(union, compiled.start); + Ok(ThompsonRef { start: compiled.start, end: union }) + } else { + let prefix = self.c_exactly(expr, n - 1)?; + let last = self.c(expr)?; + let union = if greedy { + self.add_union() + } else { + self.add_reverse_union() + }; + self.patch(prefix.end, last.start); + self.patch(last.end, union); + self.patch(union, last.start); + Ok(ThompsonRef { start: prefix.start, end: union }) + } + } + + fn c_zero_or_one(&self, expr: &Hir, greedy: bool) -> Result<ThompsonRef> { + let union = + if greedy { self.add_union() } else { self.add_reverse_union() }; + let compiled = self.c(expr)?; + let empty = self.add_empty(); + self.patch(union, compiled.start); + self.patch(union, empty); + 
self.patch(compiled.end, empty); + Ok(ThompsonRef { start: union, end: empty }) + } + + fn c_exactly(&self, expr: &Hir, n: u32) -> Result<ThompsonRef> { + let it = (0..n).map(|_| self.c(expr)); + self.c_concat(it) + } + + fn c_byte_class(&self, cls: &hir::ClassBytes) -> Result<ThompsonRef> { + let end = self.add_empty(); + let mut trans = Vec::with_capacity(cls.ranges().len()); + for r in cls.iter() { + trans.push(Transition { + start: r.start(), + end: r.end(), + next: end, + }); + } + Ok(ThompsonRef { start: self.add_sparse(trans), end }) + } + + fn c_unicode_class(&self, cls: &hir::ClassUnicode) -> Result<ThompsonRef> { + // If all we have are ASCII ranges wrapped in a Unicode package, then + // there is zero reason to bring out the big guns. We can fit all ASCII + // ranges within a single sparse transition. + if cls.is_all_ascii() { + let end = self.add_empty(); + let mut trans = Vec::with_capacity(cls.ranges().len()); + for r in cls.iter() { + assert!(r.start() <= '\x7F'); + assert!(r.end() <= '\x7F'); + trans.push(Transition { + start: r.start() as u8, + end: r.end() as u8, + next: end, + }); + } + Ok(ThompsonRef { start: self.add_sparse(trans), end }) + } else if self.config.reverse { + if !self.config.shrink { + // When we don't want to spend the extra time shrinking, we + // compile the UTF-8 automaton in reverse using something like + // the "naive" approach, but will attempt to re-use common + // suffixes. + self.c_unicode_class_reverse_with_suffix(cls) + } else { + // When we want to shrink our NFA for reverse UTF-8 automata, + // we cannot feed UTF-8 sequences directly to the UTF-8 + // compiler, since the UTF-8 compiler requires all sequences + // to be lexicographically sorted. Instead, we organize our + // sequences into a range trie, which can then output our + // sequences in the correct order. Unfortunately, building the + // range trie is fairly expensive (but not nearly as expensive + // as building a DFA). 
Hence the reason why the 'shrink' option + // exists, so that this path can be toggled off. + let mut trie = self.trie_state.borrow_mut(); + trie.clear(); + + for rng in cls.iter() { + for mut seq in Utf8Sequences::new(rng.start(), rng.end()) { + seq.reverse(); + trie.insert(seq.as_slice()); + } + } + let mut utf8_state = self.utf8_state.borrow_mut(); + let mut utf8c = Utf8Compiler::new(self, &mut *utf8_state); + trie.iter(|seq| { + utf8c.add(&seq); + }); + Ok(utf8c.finish()) + } + } else { + // In the forward direction, we always shrink our UTF-8 automata + // because we can stream it right into the UTF-8 compiler. There + // is almost no downside (in either memory or time) to using this + // approach. + let mut utf8_state = self.utf8_state.borrow_mut(); + let mut utf8c = Utf8Compiler::new(self, &mut *utf8_state); + for rng in cls.iter() { + for seq in Utf8Sequences::new(rng.start(), rng.end()) { + utf8c.add(seq.as_slice()); + } + } + Ok(utf8c.finish()) + } + + // For reference, the code below is the "naive" version of compiling a + // UTF-8 automaton. It is deliciously simple (and works for both the + // forward and reverse cases), but will unfortunately produce very + // large NFAs. When compiling a forward automaton, the size difference + // can sometimes be an order of magnitude. For example, the '\w' regex + // will generate about ~3000 NFA states using the naive approach below, + // but only 283 states when using the approach above. This is because + // the approach above actually compiles a *minimal* (or near minimal, + // because of the bounded hashmap) UTF-8 automaton. + // + // The code below is kept as a reference point in order to make it + // easier to understand the higher level goal here. 
+ /* + let it = cls + .iter() + .flat_map(|rng| Utf8Sequences::new(rng.start(), rng.end())) + .map(|seq| { + let it = seq + .as_slice() + .iter() + .map(|rng| Ok(self.c_range(rng.start, rng.end))); + self.c_concat(it) + }); + self.c_alternation(it); + */ + } + + fn c_unicode_class_reverse_with_suffix( + &self, + cls: &hir::ClassUnicode, + ) -> Result<ThompsonRef> { + // N.B. It would likely be better to cache common *prefixes* in the + // reverse direction, but it's not quite clear how to do that. The + // advantage of caching suffixes is that it does give us a win, and + // has a very small additional overhead. + let mut cache = self.utf8_suffix.borrow_mut(); + cache.clear(); + + let union = self.add_union(); + let alt_end = self.add_empty(); + for urng in cls.iter() { + for seq in Utf8Sequences::new(urng.start(), urng.end()) { + let mut end = alt_end; + for brng in seq.as_slice() { + let key = Utf8SuffixKey { + from: end, + start: brng.start, + end: brng.end, + }; + let hash = cache.hash(&key); + if let Some(id) = cache.get(&key, hash) { + end = id; + continue; + } + + let compiled = self.c_range(brng.start, brng.end); + self.patch(compiled.end, end); + end = compiled.start; + cache.set(key, hash, end); + } + self.patch(union, end); + } + } + Ok(ThompsonRef { start: union, end: alt_end }) + } + + fn c_range(&self, start: u8, end: u8) -> ThompsonRef { + let id = self.add_range(start, end); + ThompsonRef { start: id, end: id } + } + + fn c_empty(&self) -> ThompsonRef { + let id = self.add_empty(); + ThompsonRef { start: id, end: id } + } + + fn c_unanchored_prefix_valid_utf8(&self) -> Result<ThompsonRef> { + self.c(&Hir::repetition(hir::Repetition { + kind: hir::RepetitionKind::ZeroOrMore, + greedy: false, + hir: Box::new(Hir::any(false)), + })) + } + + fn c_unanchored_prefix_invalid_utf8(&self) -> Result<ThompsonRef> { + self.c(&Hir::repetition(hir::Repetition { + kind: hir::RepetitionKind::ZeroOrMore, + greedy: false, + hir: Box::new(Hir::any(true)), + })) + } + 
+ fn patch(&self, from: StateID, to: StateID) { + match self.states.borrow_mut()[from] { + CState::Empty { ref mut next } => { + *next = to; + } + CState::Range { ref mut range } => { + range.next = to; + } + CState::Sparse { .. } => { + panic!("cannot patch from a sparse NFA state") + } + CState::Union { ref mut alternates } => { + alternates.push(to); + } + CState::UnionReverse { ref mut alternates } => { + alternates.push(to); + } + CState::Match => {} + } + } + + fn add_empty(&self) -> StateID { + let id = self.states.borrow().len(); + self.states.borrow_mut().push(CState::Empty { next: 0 }); + id + } + + fn add_range(&self, start: u8, end: u8) -> StateID { + let id = self.states.borrow().len(); + let trans = Transition { start, end, next: 0 }; + let state = CState::Range { range: trans }; + self.states.borrow_mut().push(state); + id + } + + fn add_sparse(&self, ranges: Vec<Transition>) -> StateID { + if ranges.len() == 1 { + let id = self.states.borrow().len(); + let state = CState::Range { range: ranges[0] }; + self.states.borrow_mut().push(state); + return id; + } + let id = self.states.borrow().len(); + let state = CState::Sparse { ranges }; + self.states.borrow_mut().push(state); + id + } + + fn add_union(&self) -> StateID { + let id = self.states.borrow().len(); + let state = CState::Union { alternates: vec![] }; + self.states.borrow_mut().push(state); + id + } + + fn add_reverse_union(&self) -> StateID { + let id = self.states.borrow().len(); + let state = CState::UnionReverse { alternates: vec![] }; + self.states.borrow_mut().push(state); + id + } + + fn add_match(&self) -> StateID { + let id = self.states.borrow().len(); + self.states.borrow_mut().push(CState::Match); + id + } +} + +#[derive(Debug)] +struct Utf8Compiler<'a> { + nfac: &'a Compiler, + state: &'a mut Utf8State, + target: StateID, +} + +#[derive(Clone, Debug)] +struct Utf8State { + compiled: Utf8BoundedMap, + uncompiled: Vec<Utf8Node>, +} + +#[derive(Clone, Debug)] +struct Utf8Node { + 
trans: Vec<Transition>, + last: Option<Utf8LastTransition>, +} + +#[derive(Clone, Debug)] +struct Utf8LastTransition { + start: u8, + end: u8, +} + +impl Utf8State { + fn new() -> Utf8State { + Utf8State { compiled: Utf8BoundedMap::new(5000), uncompiled: vec![] } + } + + fn clear(&mut self) { + self.compiled.clear(); + self.uncompiled.clear(); + } +} + +impl<'a> Utf8Compiler<'a> { + fn new(nfac: &'a Compiler, state: &'a mut Utf8State) -> Utf8Compiler<'a> { + let target = nfac.add_empty(); + state.clear(); + let mut utf8c = Utf8Compiler { nfac, state, target }; + utf8c.add_empty(); + utf8c + } + + fn finish(&mut self) -> ThompsonRef { + self.compile_from(0); + let node = self.pop_root(); + let start = self.compile(node); + ThompsonRef { start, end: self.target } + } + + fn add(&mut self, ranges: &[Utf8Range]) { + let prefix_len = ranges + .iter() + .zip(&self.state.uncompiled) + .take_while(|&(range, node)| { + node.last.as_ref().map_or(false, |t| { + (t.start, t.end) == (range.start, range.end) + }) + }) + .count(); + assert!(prefix_len < ranges.len()); + self.compile_from(prefix_len); + self.add_suffix(&ranges[prefix_len..]); + } + + fn compile_from(&mut self, from: usize) { + let mut next = self.target; + while from + 1 < self.state.uncompiled.len() { + let node = self.pop_freeze(next); + next = self.compile(node); + } + self.top_last_freeze(next); + } + + fn compile(&mut self, node: Vec<Transition>) -> StateID { + let hash = self.state.compiled.hash(&node); + if let Some(id) = self.state.compiled.get(&node, hash) { + return id; + } + let id = self.nfac.add_sparse(node.clone()); + self.state.compiled.set(node, hash, id); + id + } + + fn add_suffix(&mut self, ranges: &[Utf8Range]) { + assert!(!ranges.is_empty()); + let last = self + .state + .uncompiled + .len() + .checked_sub(1) + .expect("non-empty nodes"); + assert!(self.state.uncompiled[last].last.is_none()); + self.state.uncompiled[last].last = Some(Utf8LastTransition { + start: ranges[0].start, + end: 
ranges[0].end, + }); + for r in &ranges[1..] { + self.state.uncompiled.push(Utf8Node { + trans: vec![], + last: Some(Utf8LastTransition { start: r.start, end: r.end }), + }); + } + } + + fn add_empty(&mut self) { + self.state.uncompiled.push(Utf8Node { trans: vec![], last: None }); + } + + fn pop_freeze(&mut self, next: StateID) -> Vec<Transition> { + let mut uncompiled = self.state.uncompiled.pop().unwrap(); + uncompiled.set_last_transition(next); + uncompiled.trans + } + + fn pop_root(&mut self) -> Vec<Transition> { + assert_eq!(self.state.uncompiled.len(), 1); + assert!(self.state.uncompiled[0].last.is_none()); + self.state.uncompiled.pop().expect("non-empty nodes").trans + } + + fn top_last_freeze(&mut self, next: StateID) { + let last = self + .state + .uncompiled + .len() + .checked_sub(1) + .expect("non-empty nodes"); + self.state.uncompiled[last].set_last_transition(next); + } +} + +impl Utf8Node { + fn set_last_transition(&mut self, next: StateID) { + if let Some(last) = self.last.take() { + self.trans.push(Transition { + start: last.start, + end: last.end, + next, + }); + } + } +} + +#[cfg(test)] +mod tests { + use regex_syntax::hir::Hir; + use regex_syntax::ParserBuilder; + + use super::{Builder, State, StateID, Transition, NFA}; + + fn parse(pattern: &str) -> Hir { + ParserBuilder::new().build().parse(pattern).unwrap() + } + + fn build(pattern: &str) -> NFA { + Builder::new().anchored(true).build(&parse(pattern)).unwrap() + } + + fn s_byte(byte: u8, next: StateID) -> State { + let trans = Transition { start: byte, end: byte, next }; + State::Range { range: trans } + } + + fn s_range(start: u8, end: u8, next: StateID) -> State { + let trans = Transition { start, end, next }; + State::Range { range: trans } + } + + fn s_sparse(ranges: &[(u8, u8, StateID)]) -> State { + let ranges = ranges + .iter() + .map(|&(start, end, next)| Transition { start, end, next }) + .collect(); + State::Sparse { ranges } + } + + fn s_union(alts: &[StateID]) -> State { + 
State::Union { alternates: alts.to_vec().into_boxed_slice() } + } + + fn s_match() -> State { + State::Match + } + + #[test] + fn errors() { + // unsupported anchors + assert!(Builder::new().build(&parse(r"^")).is_err()); + assert!(Builder::new().build(&parse(r"$")).is_err()); + assert!(Builder::new().build(&parse(r"\A")).is_err()); + assert!(Builder::new().build(&parse(r"\z")).is_err()); + + // unsupported word boundaries + assert!(Builder::new().build(&parse(r"\b")).is_err()); + assert!(Builder::new().build(&parse(r"\B")).is_err()); + assert!(Builder::new().build(&parse(r"(?-u)\b")).is_err()); + } + + // Test that building an unanchored NFA has an appropriate `.*?` prefix. + #[test] + fn compile_unanchored_prefix() { + // When the machine can only match valid UTF-8. + let nfa = Builder::new().anchored(false).build(&parse(r"a")).unwrap(); + // There should be many states since the `.` in `.*?` matches any + // Unicode scalar value. + assert_eq!(11, nfa.len()); + assert_eq!(nfa.states[10], s_match()); + assert_eq!(nfa.states[9], s_byte(b'a', 10)); + + // When the machine can match invalid UTF-8. + let nfa = Builder::new() + .anchored(false) + .allow_invalid_utf8(true) + .build(&parse(r"a")) + .unwrap(); + assert_eq!( + nfa.states, + &[ + s_union(&[2, 1]), + s_range(0, 255, 0), + s_byte(b'a', 3), + s_match(), + ] + ); + } + + #[test] + fn compile_empty() { + assert_eq!(build("").states, &[s_match(),]); + } + + #[test] + fn compile_literal() { + assert_eq!(build("a").states, &[s_byte(b'a', 1), s_match(),]); + assert_eq!( + build("ab").states, + &[s_byte(b'a', 1), s_byte(b'b', 2), s_match(),] + ); + assert_eq!( + build("☃").states, + &[s_byte(0xE2, 1), s_byte(0x98, 2), s_byte(0x83, 3), s_match(),] + ); + + // Check that non-UTF-8 literals work. 
+ let hir = ParserBuilder::new() + .allow_invalid_utf8(true) + .build() + .parse(r"(?-u)\xFF") + .unwrap(); + let nfa = Builder::new() + .anchored(true) + .allow_invalid_utf8(true) + .build(&hir) + .unwrap(); + assert_eq!(nfa.states, &[s_byte(b'\xFF', 1), s_match(),]); + } + + #[test] + fn compile_class() { + assert_eq!( + build(r"[a-z]").states, + &[s_range(b'a', b'z', 1), s_match(),] + ); + assert_eq!( + build(r"[x-za-c]").states, + &[s_sparse(&[(b'a', b'c', 1), (b'x', b'z', 1)]), s_match()] + ); + assert_eq!( + build(r"[\u03B1-\u03B4]").states, + &[s_range(0xB1, 0xB4, 2), s_byte(0xCE, 0), s_match()] + ); + assert_eq!( + build(r"[\u03B1-\u03B4\u{1F919}-\u{1F91E}]").states, + &[ + s_range(0xB1, 0xB4, 5), + s_range(0x99, 0x9E, 5), + s_byte(0xA4, 1), + s_byte(0x9F, 2), + s_sparse(&[(0xCE, 0xCE, 0), (0xF0, 0xF0, 3)]), + s_match(), + ] + ); + assert_eq!( + build(r"[a-z☃]").states, + &[ + s_byte(0x83, 3), + s_byte(0x98, 0), + s_sparse(&[(b'a', b'z', 3), (0xE2, 0xE2, 1)]), + s_match(), + ] + ); + } + + #[test] + fn compile_repetition() { + assert_eq!( + build(r"a?").states, + &[s_union(&[1, 2]), s_byte(b'a', 2), s_match(),] + ); + assert_eq!( + build(r"a??").states, + &[s_union(&[2, 1]), s_byte(b'a', 2), s_match(),] + ); + } + + #[test] + fn compile_group() { + assert_eq!( + build(r"ab+").states, + &[s_byte(b'a', 1), s_byte(b'b', 2), s_union(&[1, 3]), s_match(),] + ); + assert_eq!( + build(r"(ab)").states, + &[s_byte(b'a', 1), s_byte(b'b', 2), s_match(),] + ); + assert_eq!( + build(r"(ab)+").states, + &[s_byte(b'a', 1), s_byte(b'b', 2), s_union(&[0, 3]), s_match(),] + ); + } + + #[test] + fn compile_alternation() { + assert_eq!( + build(r"a|b").states, + &[s_byte(b'a', 3), s_byte(b'b', 3), s_union(&[0, 1]), s_match(),] + ); + assert_eq!( + build(r"|b").states, + &[s_byte(b'b', 2), s_union(&[2, 0]), s_match(),] + ); + assert_eq!( + build(r"a|").states, + &[s_byte(b'a', 2), s_union(&[0, 2]), s_match(),] + ); + } +} diff --git a/vendor/regex-automata/src/nfa/map.rs 
b/vendor/regex-automata/src/nfa/map.rs new file mode 100644 index 000000000..e636c0dd3 --- /dev/null +++ b/vendor/regex-automata/src/nfa/map.rs @@ -0,0 +1,282 @@ +// This module contains a couple simple and purpose built hash maps. The key +// trade off they make is that they serve as caches rather than true maps. That +// is, inserting a new entry may cause eviction of another entry. This gives +// us two things. First, there's less overhead associated with inserts and +// lookups. Secondly, it lets us control our memory usage. +// +// These maps are used in some fairly hot code when generating NFA states for +// large Unicode character classes. +// +// Instead of exposing a rich hashmap entry API, we just permit the caller +// to produce a hash of the key directly. The hash can then be reused for both +// lookups and insertions at the cost of leaking things a bit. But these are +// for internal use only, so it's fine. +// +// The Utf8BoundedMap is used for Daciuk's algorithm for constructing a +// (almost) minimal DFA for large Unicode character classes in linear time. +// (Daciuk's algorithm is always used when compiling forward NFAs. For reverse +// NFAs, it's only used when the compiler is configured to 'shrink' the NFA, +// since there's a bit more expense in the reverse direction.) +// +// The Utf8SuffixMap is used when compiling large Unicode character classes for +// reverse NFAs when 'shrink' is disabled. Specifically, it augments the naive +// construction of UTF-8 automata by caching common suffixes. This doesn't +// get the same space savings as Daciuk's algorithm, but it's basically as +// fast as the naive approach and typically winds up using less memory (since +// it generates smaller NFAs) despite the presence of the cache. +// +// These maps effectively represent caching mechanisms for CState::Sparse and +// CState::Range, respectively. 
The former represents a single NFA state with +// many transitions of equivalent priority while the latter represents a single +// NFA state with a single transition. (Neither state ever has or is an +// epsilon transition.) Thus, they have different key types. It's likely we +// could make one generic map, but the machinery didn't seem worth it. They +// are simple enough. + +use nfa::{StateID, Transition}; + +// Basic FNV-1a hash constants as described in: +// https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function +const PRIME: u64 = 1099511628211; +const INIT: u64 = 14695981039346656037; + +/// A bounded hash map where the key is a sequence of NFA transitions and the +/// value is a pre-existing NFA state ID. +/// +/// std's hashmap can be used for this, however, this map has two important +/// advantages. Firstly, it has lower overhead. Secondly, it permits us to +/// control our memory usage by limiting the number of slots. In general, the +/// cost here is that this map acts as a cache. That is, inserting a new entry +/// may remove an old entry. We are okay with this, since it does not impact +/// correctness in the cases where it is used. The only effect that dropping +/// states from the cache has is that the resulting NFA generated may be bigger +/// than it otherwise would be. +/// +/// This improves benchmarks that compile large Unicode character classes, +/// since it makes the generation of (almost) minimal UTF-8 automaton faster. +/// Specifically, one could observe the difference with std's hashmap via +/// something like the following benchmark: +/// +/// hyperfine "regex-automata-debug debug -acqr '\w{40} ecurB'" +/// +/// But to observe that difference, you'd have to modify the code to use +/// std's hashmap. +/// +/// It is quite possible that there is a better way to approach this problem.
+/// For example, if there happens to be a very common state that collides with +/// a lot of less frequent states, then we could wind up with very poor caching +/// behavior. Alas, the effectiveness of this cache has not been measured. +/// Instead, ad hoc experiments suggest that it is "good enough." Additional +/// smarts (such as an LRU eviction policy) have to be weighed against the +/// amount of extra time they cost. +#[derive(Clone, Debug)] +pub struct Utf8BoundedMap { + /// The current version of this map. Only entries with matching versions + /// are considered during lookups. If an entry is found with a mismatched + /// version, then the map behaves as if the entry does not exist. + version: u16, + /// The total number of entries this map can store. + capacity: usize, + /// The actual entries, keyed by hash. Collisions between different states + /// result in the old state being dropped. + map: Vec<Utf8BoundedEntry>, +} + +/// An entry in this map. +#[derive(Clone, Debug, Default)] +struct Utf8BoundedEntry { + /// The version of the map used to produce this entry. If this entry's + /// version does not match the current version of the map, then the map + /// should behave as if this entry does not exist. + version: u16, + /// The key, which is a sorted sequence of non-overlapping NFA transitions. + key: Vec<Transition>, + /// The state ID corresponding to the state containing the transitions in + /// this entry. + val: StateID, +} + +impl Utf8BoundedMap { + /// Create a new bounded map with the given capacity. The map will never + /// grow beyond the given size. + /// + /// Note that this does not allocate. Instead, callers must call `clear` + /// before using this map. `clear` will allocate space if necessary. + /// + /// This avoids the need to pay for the allocation of this map when + /// compiling regexes that lack large Unicode character classes. 
+ pub fn new(capacity: usize) -> Utf8BoundedMap { + assert!(capacity > 0); + Utf8BoundedMap { version: 0, capacity, map: vec![] } + } + + /// Clear this map of all entries, but permit the reuse of allocation + /// if possible. + /// + /// This must be called before the map can be used. + pub fn clear(&mut self) { + if self.map.is_empty() { + self.map = vec![Utf8BoundedEntry::default(); self.capacity]; + } else { + self.version = self.version.wrapping_add(1); + if self.version == 0 { + self.map = vec![Utf8BoundedEntry::default(); self.capacity]; + } + } + } + + /// Return a hash of the given transitions. + pub fn hash(&self, key: &[Transition]) -> usize { + let mut h = INIT; + for t in key { + h = (h ^ (t.start as u64)).wrapping_mul(PRIME); + h = (h ^ (t.end as u64)).wrapping_mul(PRIME); + h = (h ^ (t.next as u64)).wrapping_mul(PRIME); + } + (h as usize) % self.map.len() + } + + /// Retrieve the cached state ID corresponding to the given key. The hash + /// given must have been computed with `hash` using the same key value. + /// + /// If there is no cached state with the given transitions, then None is + /// returned. + pub fn get(&mut self, key: &[Transition], hash: usize) -> Option<StateID> { + let entry = &self.map[hash]; + if entry.version != self.version { + return None; + } + // There may be a hash collision, so we need to confirm real equality. + if entry.key != key { + return None; + } + Some(entry.val) + } + + /// Add a cached state to this map with the given key. Callers should + /// ensure that `state_id` points to a state that contains precisely the + /// NFA transitions given. + /// + /// `hash` must have been computed using the `hash` method with the same + /// key. 
+ pub fn set( + &mut self, + key: Vec<Transition>, + hash: usize, + state_id: StateID, + ) { + self.map[hash] = + Utf8BoundedEntry { version: self.version, key, val: state_id }; + } +} + +/// A cache of suffixes used to modestly compress UTF-8 automata for large +/// Unicode character classes. +#[derive(Clone, Debug)] +pub struct Utf8SuffixMap { + /// The current version of this map. Only entries with matching versions + /// are considered during lookups. If an entry is found with a mismatched + /// version, then the map behaves as if the entry does not exist. + version: u16, + /// The total number of entries this map can store. + capacity: usize, + /// The actual entries, keyed by hash. Collisions between different states + /// result in the old state being dropped. + map: Vec<Utf8SuffixEntry>, +} + +/// A key that uniquely identifies an NFA state. It is a triple that represents +/// a transition from one state for a particular byte range. +#[derive(Clone, Debug, Default, Eq, PartialEq)] +pub struct Utf8SuffixKey { + pub from: StateID, + pub start: u8, + pub end: u8, +} + +/// An entry in this map. +#[derive(Clone, Debug, Default)] +struct Utf8SuffixEntry { + /// The version of the map used to produce this entry. If this entry's + /// version does not match the current version of the map, then the map + /// should behave as if this entry does not exist. + version: u16, + /// The key, which consists of a transition in a particular state. + key: Utf8SuffixKey, + /// The identifier that the transition in the key maps to. + val: StateID, +} + +impl Utf8SuffixMap { + /// Create a new bounded map with the given capacity. The map will never + /// grow beyond the given size. + /// + /// Note that this does not allocate. Instead, callers must call `clear` + /// before using this map. `clear` will allocate space if necessary. + /// + /// This avoids the need to pay for the allocation of this map when + /// compiling regexes that lack large Unicode character classes. 
+ pub fn new(capacity: usize) -> Utf8SuffixMap { + assert!(capacity > 0); + Utf8SuffixMap { version: 0, capacity, map: vec![] } + } + + /// Clear this map of all entries, but permit the reuse of allocation + /// if possible. + /// + /// This must be called before the map can be used. + pub fn clear(&mut self) { + if self.map.is_empty() { + self.map = vec![Utf8SuffixEntry::default(); self.capacity]; + } else { + self.version = self.version.wrapping_add(1); + if self.version == 0 { + self.map = vec![Utf8SuffixEntry::default(); self.capacity]; + } + } + } + + /// Return a hash of the given transition. + pub fn hash(&self, key: &Utf8SuffixKey) -> usize { + // Basic FNV-1a hash as described: + // https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function + const PRIME: u64 = 1099511628211; + const INIT: u64 = 14695981039346656037; + + let mut h = INIT; + h = (h ^ (key.from as u64)).wrapping_mul(PRIME); + h = (h ^ (key.start as u64)).wrapping_mul(PRIME); + h = (h ^ (key.end as u64)).wrapping_mul(PRIME); + (h as usize) % self.map.len() + } + + /// Retrieve the cached state ID corresponding to the given key. The hash + /// given must have been computed with `hash` using the same key value. + /// + /// If there is no cached state with the given key, then None is returned. + pub fn get( + &mut self, + key: &Utf8SuffixKey, + hash: usize, + ) -> Option<StateID> { + let entry = &self.map[hash]; + if entry.version != self.version { + return None; + } + if key != &entry.key { + return None; + } + Some(entry.val) + } + + /// Add a cached state to this map with the given key. Callers should + /// ensure that `state_id` points to a state that contains precisely the + /// NFA transition given. + /// + /// `hash` must have been computed using the `hash` method with the same + /// key. 
+ pub fn set(&mut self, key: Utf8SuffixKey, hash: usize, state_id: StateID) { + self.map[hash] = + Utf8SuffixEntry { version: self.version, key, val: state_id }; + } +} diff --git a/vendor/regex-automata/src/nfa/mod.rs b/vendor/regex-automata/src/nfa/mod.rs new file mode 100644 index 000000000..02d0501de --- /dev/null +++ b/vendor/regex-automata/src/nfa/mod.rs @@ -0,0 +1,252 @@ +use std::fmt; + +use classes::ByteClasses; +pub use nfa::compiler::Builder; + +mod compiler; +mod map; +mod range_trie; + +/// The representation for an NFA state identifier. +pub type StateID = usize; + +/// A final compiled NFA. +/// +/// The states of the NFA are indexed by state IDs, which are how transitions +/// are expressed. +#[derive(Clone)] +pub struct NFA { + /// Whether this NFA can only match at the beginning of input or not. + /// + /// When true, a match should only be reported if it begins at the 0th + /// index of the haystack. + anchored: bool, + /// The starting state of this NFA. + start: StateID, + /// The state list. This list is guaranteed to be indexable by the starting + /// state ID, and it is also guaranteed to contain exactly one `Match` + /// state. + states: Vec<State>, + /// A mapping from any byte value to its corresponding equivalence class + /// identifier. Two bytes in the same equivalence class cannot discriminate + /// between a match or a non-match. This map can be used to shrink the + /// total size of a DFA's transition table with a small match-time cost. + /// + /// Note that the NFA's transitions are *not* defined in terms of these + /// equivalence classes. The NFA's transitions are defined on the original + /// byte values. For the most part, this is because they wouldn't really + /// help the NFA much since the NFA already uses a sparse representation + /// to represent transitions. Byte classes are most effective in a dense + /// representation. 
+ byte_classes: ByteClasses, +} + +impl NFA { + /// Returns an NFA that always matches at every position. + pub fn always_match() -> NFA { + NFA { + anchored: false, + start: 0, + states: vec![State::Match], + byte_classes: ByteClasses::empty(), + } + } + + /// Returns an NFA that never matches at any position. + pub fn never_match() -> NFA { + NFA { + anchored: false, + start: 0, + states: vec![State::Fail], + byte_classes: ByteClasses::empty(), + } + } + + /// Returns true if and only if this NFA is anchored. + pub fn is_anchored(&self) -> bool { + self.anchored + } + + /// Return the number of states in this NFA. + pub fn len(&self) -> usize { + self.states.len() + } + + /// Return the ID of the initial state of this NFA. + pub fn start(&self) -> StateID { + self.start + } + + /// Return the NFA state corresponding to the given ID. + pub fn state(&self, id: StateID) -> &State { + &self.states[id] + } + + /// Return the set of equivalence classes for this NFA. The slice returned + /// always has length 256 and maps each possible byte value to its + /// corresponding equivalence class ID (which is never more than 255). + pub fn byte_classes(&self) -> &ByteClasses { + &self.byte_classes + } +} + +impl fmt::Debug for NFA { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + for (i, state) in self.states.iter().enumerate() { + let status = if i == self.start { '>' } else { ' ' }; + writeln!(f, "{}{:06}: {:?}", status, i, state)?; + } + Ok(()) + } +} + +/// A state in a final compiled NFA. +#[derive(Clone, Eq, PartialEq)] +pub enum State { + /// A state that transitions to `next` if and only if the current input + /// byte is in the range `[start, end]` (inclusive). + /// + /// This is a special case of Sparse in that it encodes only one transition + /// (and therefore avoids the allocation). + Range { range: Transition }, + /// A state with possibly many transitions, represented in a sparse + /// fashion. 
Transitions are ordered lexicographically by input range. + /// As such, this may only be used when every transition has equal + /// priority. (In practice, this is only used for encoding large UTF-8 + /// automata.) + Sparse { ranges: Box<[Transition]> }, + /// An alternation such that there exists an epsilon transition to all + /// states in `alternates`, where matches found via earlier transitions + /// are preferred over later transitions. + Union { alternates: Box<[StateID]> }, + /// A fail state. When encountered, the automaton is guaranteed to never + /// reach a match state. + Fail, + /// A match state. There is exactly one such occurrence of this state in + /// an NFA. + Match, +} + +/// A transition to another state, only if the given byte falls in the +/// inclusive range specified. +#[derive(Clone, Copy, Eq, Hash, PartialEq)] +pub struct Transition { + pub start: u8, + pub end: u8, + pub next: StateID, +} + +impl State { + /// Returns true if and only if this state contains one or more epsilon + /// transitions. + pub fn is_epsilon(&self) -> bool { + match *self { + State::Range { .. } + | State::Sparse { .. } + | State::Fail + | State::Match => false, + State::Union { .. } => true, + } + } + + /// Remap the transitions in this state using the given map. Namely, the + /// given map should be indexed according to the transitions currently + /// in this state. + /// + /// This is used during the final phase of the NFA compiler, which turns + /// its intermediate NFA into the final NFA. 
+ fn remap(&mut self, remap: &[StateID]) { + match *self { + State::Range { ref mut range } => range.next = remap[range.next], + State::Sparse { ref mut ranges } => { + for r in ranges.iter_mut() { + r.next = remap[r.next]; + } + } + State::Union { ref mut alternates } => { + for alt in alternates.iter_mut() { + *alt = remap[*alt]; + } + } + State::Fail => {} + State::Match => {} + } + } +} + +impl fmt::Debug for State { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match *self { + State::Range { ref range } => range.fmt(f), + State::Sparse { ref ranges } => { + let rs = ranges + .iter() + .map(|t| format!("{:?}", t)) + .collect::<Vec<String>>() + .join(", "); + write!(f, "sparse({})", rs) + } + State::Union { ref alternates } => { + let alts = alternates + .iter() + .map(|id| format!("{}", id)) + .collect::<Vec<String>>() + .join(", "); + write!(f, "alt({})", alts) + } + State::Fail => write!(f, "FAIL"), + State::Match => write!(f, "MATCH"), + } + } +} + +impl fmt::Debug for Transition { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + let Transition { start, end, next } = *self; + if self.start == self.end { + write!(f, "{} => {}", escape(start), next) + } else { + write!(f, "{}-{} => {}", escape(start), escape(end), next) + } + } +} + +/// Return the given byte as its escaped string form. 
+fn escape(b: u8) -> String { + use std::ascii; + + String::from_utf8(ascii::escape_default(b).collect::<Vec<_>>()).unwrap() +} + +#[cfg(test)] +mod tests { + use super::*; + use dense; + use dfa::DFA; + + #[test] + fn always_match() { + let nfa = NFA::always_match(); + let dfa = dense::Builder::new().build_from_nfa::<usize>(&nfa).unwrap(); + + assert_eq!(Some(0), dfa.find_at(b"", 0)); + assert_eq!(Some(0), dfa.find_at(b"a", 0)); + assert_eq!(Some(1), dfa.find_at(b"a", 1)); + assert_eq!(Some(0), dfa.find_at(b"ab", 0)); + assert_eq!(Some(1), dfa.find_at(b"ab", 1)); + assert_eq!(Some(2), dfa.find_at(b"ab", 2)); + } + + #[test] + fn never_match() { + let nfa = NFA::never_match(); + let dfa = dense::Builder::new().build_from_nfa::<usize>(&nfa).unwrap(); + + assert_eq!(None, dfa.find_at(b"", 0)); + assert_eq!(None, dfa.find_at(b"a", 0)); + assert_eq!(None, dfa.find_at(b"a", 1)); + assert_eq!(None, dfa.find_at(b"ab", 0)); + assert_eq!(None, dfa.find_at(b"ab", 1)); + assert_eq!(None, dfa.find_at(b"ab", 2)); + } +} diff --git a/vendor/regex-automata/src/nfa/range_trie.rs b/vendor/regex-automata/src/nfa/range_trie.rs new file mode 100644 index 000000000..50767c7c6 --- /dev/null +++ b/vendor/regex-automata/src/nfa/range_trie.rs @@ -0,0 +1,1048 @@ +// I've called the primary data structure in this module a "range trie." As far +// as I can tell, there is no prior art on a data structure like this, however, +// it's likely someone somewhere has built something like it. Searching for +// "range trie" turns up the paper "Range Tries for Scalable Address Lookup," +// but it does not appear relevant. +// +// The range trie is just like a trie in that it is a special case of a +// deterministic finite state machine. It has states and each state has a set +// of transitions to other states. It is acyclic, and, like a normal trie, +// it makes no attempt to reuse common suffixes among its elements. 
The key +// difference between a normal trie and a range trie below is that a range trie +// operates on *contiguous sequences* of bytes instead of singleton bytes. +// One could say that our alphabet is ranges of bytes instead of bytes +// themselves, except a key part of range trie construction is splitting ranges +// apart to ensure there is at most one transition that can be taken for any +// byte in a given state. +// +// I've tried to explain the details of how the range trie works below, so +// for now, we are left with trying to understand what problem we're trying to +// solve. Which is itself fairly involved! +// +// At the highest level, here's what we want to do. We want to convert a +// sequence of Unicode codepoints into a finite state machine whose transitions +// are over *bytes* and *not* Unicode codepoints. We want this because it makes +// said finite state machines much smaller and much faster to execute. As a +// simple example, consider a byte oriented automaton for all Unicode scalar +// values (0x00 through 0x10FFFF, not including surrogate codepoints): +// +// [00-7F] +// [C2-DF][80-BF] +// [E0-E0][A0-BF][80-BF] +// [E1-EC][80-BF][80-BF] +// [ED-ED][80-9F][80-BF] +// [EE-EF][80-BF][80-BF] +// [F0-F0][90-BF][80-BF][80-BF] +// [F1-F3][80-BF][80-BF][80-BF] +// [F4-F4][80-8F][80-BF][80-BF] +// +// (These byte ranges are generated via the regex-syntax::utf8 module, which +// was based on Russ Cox's code in RE2, which was in turn based on Ken +// Thompson's implementation of the same idea in his Plan9 implementation of +// grep.) +// +// It should be fairly straight-forward to see how one could compile this into +// a DFA. The sequences are sorted and non-overlapping. Essentially, you could +// build a trie from this fairly easily. The problem comes when your initial +// range (in this case, 0x00-0x10FFFF) isn't so nice.
For example, the class +// represented by '\w' contains only a tenth of the codepoints that +// 0x00-0x10FFFF contains, but if we were to write out the byte based ranges +// as we did above, the list would stretch to 892 entries! This turns into +// quite a large NFA with a few thousand states. Turning this beast into a DFA +// takes quite a bit of time. We are thus left with trying to trim down the +// number of states we produce as early as possible. +// +// One approach (used by RE2 and still by the regex crate, at time of writing) +// is to try to find common suffixes while building NFA states for the above +// and reuse them. This is very cheap to do and one can control precisely how +// much extra memory you want to use for the cache. +// +// Another approach, however, is to reuse an algorithm for constructing a +// *minimal* DFA from a sorted sequence of inputs. I don't want to go into +// the full details here, but I explain it in more depth in my blog post on +// FSTs[1]. Note that the algorithm was not invented by me, but was published +// in a paper by Daciuk et al. in 2000 called "Incremental Construction of +// Minimal Acyclic Finite-State Automata." Like the suffix cache approach above, +// it is also possible to control the amount of extra memory one uses, although +// this usually comes with the cost of sacrificing true minimality. (But it's +// typically close enough with a reasonably sized cache of states.) +// +// The catch is that Daciuk's algorithm only works if you add your keys in +// lexicographic ascending order. In our case, since we're dealing with ranges, +// we also need the additional requirement that ranges are either equivalent +// or do not overlap at all. For example, if one were given the following byte +// ranges: +// +// [BC-BF][80-BF] +// [BC-BF][90-BF] +// +// Then Daciuk's algorithm also would not work, since there is nothing to +// handle the fact that the ranges overlap. They would need to be split apart.
+// Thankfully, Thompson's algorithm for producing byte ranges for Unicode +// codepoint ranges meets both of our requirements. +// +// ... however, we would also like to be able to compile UTF-8 automata in +// reverse. We want this because in order to find the starting location of a +// match using a DFA, we need to run a second DFA---a reversed version of the +// forward DFA---backwards to discover the match location. Unfortunately, if +// we reverse our byte sequences for 0x00-0x10FFFF, we get sequences that +// can overlap, even if they are sorted: +// +// [00-7F] +// [80-BF][80-9F][ED-ED] +// [80-BF][80-BF][80-8F][F4-F4] +// [80-BF][80-BF][80-BF][F1-F3] +// [80-BF][80-BF][90-BF][F0-F0] +// [80-BF][80-BF][E1-EC] +// [80-BF][80-BF][EE-EF] +// [80-BF][A0-BF][E0-E0] +// [80-BF][C2-DF] +// +// For example, '[80-BF][80-BF][EE-EF]' and '[80-BF][A0-BF][E0-E0]' have +// overlapping ranges between '[80-BF]' and '[A0-BF]'. Thus, there is no +// simple way to apply Daciuk's algorithm. +// +// And thus, the range trie was born. The range trie's only purpose is to take +// sequences of byte ranges like the ones above, collect them into a trie and +// then spit them out in a sorted fashion with no overlapping ranges. For example, +// 0x00-0x10FFFF gets translated to: +// +// [0-7F] +// [80-BF][80-9F][80-8F][F1-F3] +// [80-BF][80-9F][80-8F][F4] +// [80-BF][80-9F][90-BF][F0] +// [80-BF][80-9F][90-BF][F1-F3] +// [80-BF][80-9F][E1-EC] +// [80-BF][80-9F][ED] +// [80-BF][80-9F][EE-EF] +// [80-BF][A0-BF][80-8F][F1-F3] +// [80-BF][A0-BF][80-8F][F4] +// [80-BF][A0-BF][90-BF][F0] +// [80-BF][A0-BF][90-BF][F1-F3] +// [80-BF][A0-BF][E0] +// [80-BF][A0-BF][E1-EC] +// [80-BF][A0-BF][EE-EF] +// [80-BF][C2-DF] +// +// We've thus satisfied our requirements for running Daciuk's algorithm. All +// sequences of ranges are sorted, and any corresponding ranges are either +// exactly equivalent or non-overlapping.
+// +// In effect, a range trie is building a DFA from a sequence of arbitrary +// byte ranges. But it uses an algorithm custom tailored to its input, so it +// is not as costly as traditional DFA construction. While it is still quite +// a bit more costly than the forward's case (which only needs Daciuk's +// algorithm), it winds up saving a substantial amount of time if one is doing +// a full DFA powerset construction later by virtue of producing a much much +// smaller NFA. +// +// [1] - https://blog.burntsushi.net/transducers/ +// [2] - https://www.mitpressjournals.org/doi/pdfplus/10.1162/089120100561601 + +use std::cell::RefCell; +use std::fmt; +use std::mem; +use std::ops::RangeInclusive; +use std::u32; + +use regex_syntax::utf8::Utf8Range; + +/// A smaller state ID means more effective use of the CPU cache and less +/// time spent copying. The implementation below will panic if the state ID +/// space is exhausted, but in order for that to happen, the range trie itself +/// would use well over 100GB of memory. Moreover, it's likely impossible +/// for the state ID space to get that big. In fact, it's likely that even a +/// u16 would be good enough here. But it's not quite clear how to prove this. +type StateID = u32; + +/// There is only one final state in this trie. Every sequence of byte ranges +/// added shares the same final state. +const FINAL: StateID = 0; + +/// The root state of the trie. +const ROOT: StateID = 1; + +/// A range trie represents an ordered set of sequences of bytes. +/// +/// A range trie accepts as input a sequence of byte ranges and merges +/// them into the existing set such that the trie can produce a sorted +/// non-overlapping sequence of byte ranges. The sequence emitted corresponds +/// precisely to the sequence of bytes matched by the given keys, although the +/// byte ranges themselves may be split at different boundaries. +/// +/// The order complexity of this data structure seems difficult to analyze.
+/// If the size of a byte is held as a constant, then insertion is clearly +/// O(n) where n is the number of byte ranges in the input key. However, if +/// k=256 is our alphabet size, then insertion could be O(k^2 * n). In +/// particular it seems possible for pathological inputs to cause insertion +/// to do a lot of work. However, for what we use this data structure for, +/// there should be no pathological inputs since the ultimate source is always +/// a sorted set of Unicode scalar value ranges. +/// +/// Internally, this trie is set up like a finite state machine. Note though +/// that it is acyclic. +#[derive(Clone)] +pub struct RangeTrie { + /// The states in this trie. The first is always the shared final state. + /// The second is always the root state. Otherwise, there is no + /// particular order. + states: Vec<State>, + /// A free-list of states. When a range trie is cleared, all of its states + /// are added to this list. Creating a new state reuses states from this list + /// before allocating a new one. + free: Vec<State>, + /// A stack for traversing this trie to yield sequences of byte ranges in + /// lexicographic order. + iter_stack: RefCell<Vec<NextIter>>, + /// A buffer that stores the current sequence during iteration. + iter_ranges: RefCell<Vec<Utf8Range>>, + /// A stack used for traversing the trie in order to (deeply) duplicate + /// a state. + dupe_stack: Vec<NextDupe>, + /// A stack used for traversing the trie during insertion of a new + /// sequence of byte ranges. + insert_stack: Vec<NextInsert>, +} + +/// A single state in this trie. +#[derive(Clone)] +struct State { + /// A sorted sequence of non-overlapping transitions to other states. Each + /// transition corresponds to a single range of bytes. + transitions: Vec<Transition>, +} + +/// A transition is a single range of bytes. If a particular byte is in this +/// range, then the corresponding machine may transition to the state pointed +/// to by `next_id`.
+#[derive(Clone)]
+struct Transition {
+    /// The byte range.
+    range: Utf8Range,
+    /// The next state to transition to.
+    next_id: StateID,
+}
+
+impl RangeTrie {
+    /// Create a new empty range trie.
+    pub fn new() -> RangeTrie {
+        let mut trie = RangeTrie {
+            states: vec![],
+            free: vec![],
+            iter_stack: RefCell::new(vec![]),
+            iter_ranges: RefCell::new(vec![]),
+            dupe_stack: vec![],
+            insert_stack: vec![],
+        };
+        trie.clear();
+        trie
+    }
+
+    /// Clear this range trie such that it is empty. Clearing a range trie
+    /// and reusing it can be beneficial because this may reuse allocations.
+    pub fn clear(&mut self) {
+        self.free.extend(self.states.drain(..));
+        self.add_empty(); // final
+        self.add_empty(); // root
+    }
+
+    /// Iterate over all of the sequences of byte ranges in this trie, and
+    /// call the provided function for each sequence. Iteration occurs in
+    /// lexicographic order.
+    pub fn iter<F: FnMut(&[Utf8Range])>(&self, mut f: F) {
+        let mut stack = self.iter_stack.borrow_mut();
+        stack.clear();
+        let mut ranges = self.iter_ranges.borrow_mut();
+        ranges.clear();
+
+        // We do iteration in a way that permits us to use a single buffer
+        // for our keys. We iterate in a depth first fashion, while being
+        // careful to expand our frontier as we move deeper in the trie.
+        stack.push(NextIter { state_id: ROOT, tidx: 0 });
+        while let Some(NextIter { mut state_id, mut tidx }) = stack.pop() {
+            // This could be implemented more simply without an inner loop
+            // here, but at the cost of more stack pushes.
+            loop {
+                let state = self.state(state_id);
+                // If we've visited all transitions in this state, then pop
+                // back to the parent state.
+                if tidx >= state.transitions.len() {
+                    ranges.pop();
+                    break;
+                }
+
+                let t = &state.transitions[tidx];
+                ranges.push(t.range);
+                if t.next_id == FINAL {
+                    f(&ranges);
+                    ranges.pop();
+                    tidx += 1;
+                } else {
+                    // Expand our frontier. Once we come back to this state
+                    // via the stack, start in on the next transition.
+                    stack.push(NextIter { state_id, tidx: tidx + 1 });
+                    // Otherwise, move to the first transition of the next
+                    // state.
+                    state_id = t.next_id;
+                    tidx = 0;
+                }
+            }
+        }
+    }
+
+    /// Inserts a new sequence of ranges into this trie.
+    ///
+    /// The sequence given must be non-empty and must not have a length
+    /// exceeding 4.
+    pub fn insert(&mut self, ranges: &[Utf8Range]) {
+        assert!(!ranges.is_empty());
+        assert!(ranges.len() <= 4);
+
+        // Swap the reusable stack out of `self` so we can borrow it mutably
+        // alongside the rest of the trie; it is restored before returning.
+        let mut stack = mem::replace(&mut self.insert_stack, vec![]);
+        stack.clear();
+
+        stack.push(NextInsert::new(ROOT, ranges));
+        while let Some(next) = stack.pop() {
+            let (state_id, ranges) = (next.state_id(), next.ranges());
+            assert!(!ranges.is_empty());
+
+            let (mut new, rest) = (ranges[0], &ranges[1..]);
+
+            // i corresponds to the position of the existing transition on
+            // which we are operating. Typically, the result is to remove the
+            // transition and replace it with two or more new transitions
+            // corresponding to the partitions generated by splitting the
+            // 'new' with the ith transition's range.
+            let mut i = self.state(state_id).find(new);
+
+            // In this case, there is no overlap *and* the new range is greater
+            // than all existing ranges. So we can just add it to the end.
+            if i == self.state(state_id).transitions.len() {
+                let next_id = NextInsert::push(self, &mut stack, rest);
+                self.add_transition(state_id, new, next_id);
+                continue;
+            }
+
+            // The need for this loop is a bit subtle, but basically, after
+            // we've handled the partitions from our initial split, it's
+            // possible that there will be a partition leftover that overlaps
+            // with a subsequent transition. If so, then we have to repeat
+            // the split process again with the leftovers and that subsequent
+            // transition.
+            'OUTER: loop {
+                let old = self.state(state_id).transitions[i].clone();
+                let split = match Split::new(old.range, new) {
+                    Some(split) => split,
+                    None => {
+                        // No overlap with the ith transition: the new range
+                        // slots in just before it without disturbing anything.
+                        let next_id = NextInsert::push(self, &mut stack, rest);
+                        self.add_transition_at(i, state_id, new, next_id);
+                        continue;
+                    }
+                };
+                let splits = split.as_slice();
+                // If we only have one partition, then the ranges must be
+                // equivalent. There's nothing to do here for this state, so
+                // just move on to the next one.
+                if splits.len() == 1 {
+                    // ... but only if we have anything left to do.
+                    if !rest.is_empty() {
+                        stack.push(NextInsert::new(old.next_id, rest));
+                    }
+                    break;
+                }
+                // At this point, we know that 'split' is non-empty and there
+                // must be some overlap AND that the two ranges are not
+                // equivalent. Therefore, the existing range MUST be removed
+                // and split up somehow. Instead of actually doing the removal
+                // and then a subsequent insertion---with all the memory
+                // shuffling that entails---we simply overwrite the transition
+                // at position `i` for the first new transition we want to
+                // insert. After that, we're forced to do expensive inserts.
+                let mut first = true;
+                let mut add_trans =
+                    |trie: &mut RangeTrie, pos, from, range, to| {
+                        if first {
+                            trie.set_transition_at(pos, from, range, to);
+                            first = false;
+                        } else {
+                            trie.add_transition_at(pos, from, range, to);
+                        }
+                    };
+                for (j, &srange) in splits.iter().enumerate() {
+                    match srange {
+                        SplitRange::Old(r) => {
+                            // Deep clone the state pointed to by the ith
+                            // transition. This is always necessary since 'old'
+                            // is always coupled with at least a 'both'
+                            // partition. We don't want any new changes made
+                            // via the 'both' partition to impact the part of
+                            // the transition that doesn't overlap with the
+                            // new range.
+                            let dup_id = self.duplicate(old.next_id);
+                            add_trans(self, i, state_id, r, dup_id);
+                        }
+                        SplitRange::New(r) => {
+                            // This is a bit subtle, but if this happens to be
+                            // the last partition in our split, it is possible
+                            // that this overlaps with a subsequent transition.
+                            // If it does, then we must repeat the whole
+                            // splitting process over again with `r` and the
+                            // subsequent transition.
+                            {
+                                let trans = &self.state(state_id).transitions;
+                                if j + 1 == splits.len()
+                                    && i < trans.len()
+                                    && intersects(r, trans[i].range)
+                                {
+                                    new = r;
+                                    continue 'OUTER;
+                                }
+                            }
+
+                            // ... otherwise, set up exploration for a new
+                            // empty state and add a brand new transition for
+                            // this new range.
+                            let next_id =
+                                NextInsert::push(self, &mut stack, rest);
+                            add_trans(self, i, state_id, r, next_id);
+                        }
+                        SplitRange::Both(r) => {
+                            // Continue adding the remaining ranges on this
+                            // path and update the transition with the new
+                            // range.
+                            if !rest.is_empty() {
+                                stack.push(NextInsert::new(old.next_id, rest));
+                            }
+                            add_trans(self, i, state_id, r, old.next_id);
+                        }
+                    }
+                    i += 1;
+                }
+                // If we've reached this point, then we know that there are
+                // no subsequent transitions with any overlap. Therefore, we
+                // can stop processing this range and move on to the next one.
+                break;
+            }
+        }
+        self.insert_stack = stack;
+    }
+
+    /// Create a new empty state (with no transitions) and return its ID.
+    /// Reuses a state from the free-list when one is available.
+    pub fn add_empty(&mut self) -> StateID {
+        if self.states.len() as u64 > u32::MAX as u64 {
+            // This generally should not happen since a range trie is only
+            // ever used to compile a single sequence of Unicode scalar values.
+            // If we ever got to this point, we would, at *minimum*, be using
+            // 96GB in just the range trie alone.
+            panic!("too many sequences added to range trie");
+        }
+        let id = self.states.len() as StateID;
+        // If we have some free states available, then use them to avoid
+        // more allocations.
+        if let Some(mut state) = self.free.pop() {
+            state.clear();
+            self.states.push(state);
+        } else {
+            self.states.push(State { transitions: vec![] });
+        }
+        id
+    }
+
+    /// Performs a deep clone of the given state and returns the duplicate's
+    /// state ID.
+    ///
+    /// A "deep clone" in this context means that the state given along with
+    /// recursively all states that it points to are copied. Once complete,
+    /// the given state ID and the returned state ID share nothing.
+    ///
+    /// This is useful during range trie insertion when a new range overlaps
+    /// with an existing range that is bigger than the new one. The part of
+    /// the existing range that does *not* overlap with the new one is
+    /// duplicated so that adding the new range to the overlap doesn't disturb
+    /// the non-overlapping portion.
+    ///
+    /// There's one exception: if old_id is the final state, then it is not
+    /// duplicated and the same final state is returned. This is because all
+    /// final states in this trie are equivalent.
+    fn duplicate(&mut self, old_id: StateID) -> StateID {
+        if old_id == FINAL {
+            return FINAL;
+        }
+
+        // Swap the reusable stack out of `self` so it can be borrowed
+        // mutably alongside the trie; it is restored before returning.
+        let mut stack = mem::replace(&mut self.dupe_stack, vec![]);
+        stack.clear();
+
+        let new_id = self.add_empty();
+        // old_id is the state we're cloning and new_id is the ID of the
+        // duplicated state for old_id.
+        stack.push(NextDupe { old_id, new_id });
+        while let Some(NextDupe { old_id, new_id }) = stack.pop() {
+            for i in 0..self.state(old_id).transitions.len() {
+                let t = self.state(old_id).transitions[i].clone();
+                if t.next_id == FINAL {
+                    // All final states are the same, so there's no need to
+                    // duplicate it.
+                    self.add_transition(new_id, t.range, FINAL);
+                    continue;
+                }
+
+                let new_child_id = self.add_empty();
+                self.add_transition(new_id, t.range, new_child_id);
+                stack.push(NextDupe {
+                    old_id: t.next_id,
+                    new_id: new_child_id,
+                });
+            }
+        }
+        self.dupe_stack = stack;
+        new_id
+    }
+
+    /// Adds the given transition to the given state.
+    ///
+    /// Callers must ensure that all previous transitions in this state
+    /// are lexicographically smaller than the given range.
+    fn add_transition(
+        &mut self,
+        from_id: StateID,
+        range: Utf8Range,
+        next_id: StateID,
+    ) {
+        self.state_mut(from_id)
+            .transitions
+            .push(Transition { range, next_id });
+    }
+
+    /// Like `add_transition`, except this inserts the transition just before
+    /// the ith transition.
+    fn add_transition_at(
+        &mut self,
+        i: usize,
+        from_id: StateID,
+        range: Utf8Range,
+        next_id: StateID,
+    ) {
+        self.state_mut(from_id)
+            .transitions
+            .insert(i, Transition { range, next_id });
+    }
+
+    /// Overwrites the transition at position i with the given transition.
+    fn set_transition_at(
+        &mut self,
+        i: usize,
+        from_id: StateID,
+        range: Utf8Range,
+        next_id: StateID,
+    ) {
+        self.state_mut(from_id).transitions[i] = Transition { range, next_id };
+    }
+
+    /// Return an immutable borrow for the state with the given ID.
+    fn state(&self, id: StateID) -> &State {
+        &self.states[id as usize]
+    }
+
+    /// Return a mutable borrow for the state with the given ID.
+    fn state_mut(&mut self, id: StateID) -> &mut State {
+        &mut self.states[id as usize]
+    }
+}
+
+impl State {
+    /// Find the position at which the given range should be inserted in this
+    /// state.
+    ///
+    /// The position returned is always in the inclusive range
+    /// [0, transitions.len()]. If 'transitions.len()' is returned, then the
+    /// given range overlaps with no other range in this state *and* is greater
+    /// than all of them.
+    ///
+    /// For all other possible positions, the given range either overlaps
+    /// with the transition at that position or is otherwise less than it
+    /// with no overlap (and is greater than the previous transition). In the
+    /// former case, careful attention must be paid to inserting this range
+    /// as a new transition. In the latter case, the range can be inserted as
+    /// a new transition at the given position without disrupting any other
+    /// transitions.
+    fn find(&self, range: Utf8Range) -> usize {
+        /// Returns the position `i` at which `pred(xs[i])` first returns true
+        /// such that for all `j >= i`, `pred(xs[j]) == true`. If `pred` never
+        /// returns true, then `xs.len()` is returned.
+        ///
+        /// We roll our own binary search because it doesn't seem like the
+        /// standard library's binary search can be used here. Namely, if
+        /// there is an overlapping range, then we want to find the first such
+        /// occurrence, but there may be many. Or at least, it's not quite
+        /// clear to me how to do it.
+        fn binary_search<T, F>(xs: &[T], mut pred: F) -> usize
+        where
+            F: FnMut(&T) -> bool,
+        {
+            let (mut left, mut right) = (0, xs.len());
+            while left < right {
+                // Overflow is impossible because xs.len() <= 256.
+                let mid = (left + right) / 2;
+                if pred(&xs[mid]) {
+                    right = mid;
+                } else {
+                    left = mid + 1;
+                }
+            }
+            left
+        }
+
+        // Benchmarks suggest that binary search is just a bit faster than
+        // straight linear search. Specifically when using the debug tool:
+        //
+        //   hyperfine "regex-automata-debug debug -acqr '\w{40} ecurB'"
+        binary_search(&self.transitions, |t| range.start <= t.range.end)
+    }
+
+    /// Clear this state such that it has zero transitions.
+    fn clear(&mut self) {
+        self.transitions.clear();
+    }
+}
+
+/// The next state to process during duplication.
+#[derive(Clone, Debug)]
+struct NextDupe {
+    /// The state we want to duplicate.
+    old_id: StateID,
+    /// The ID of the new state that is a duplicate of old_id.
+    new_id: StateID,
+}
+
+/// The next state (and its corresponding transition) that we want to visit
+/// during iteration in lexicographic order.
+#[derive(Clone, Debug)]
+struct NextIter {
+    /// The state whose transitions are being visited.
+    state_id: StateID,
+    /// The index of the next transition of `state_id` to visit.
+    tidx: usize,
+}
+
+/// The next state to process during insertion and any remaining ranges that we
+/// want to add for a particular sequence of ranges. The first such instance
+/// is always the root state along with all ranges given.
+#[derive(Clone, Debug)]
+struct NextInsert {
+    /// The next state to begin inserting ranges. This state should be the
+    /// state at which `ranges[0]` should be inserted.
+    state_id: StateID,
+    /// The ranges to insert. We use a fixed-size array here to avoid an
+    /// allocation.
+    ranges: [Utf8Range; 4],
+    /// The number of valid ranges in the above array.
+    len: u8,
+}
+
+impl NextInsert {
+    /// Create the next item to visit. The given state ID should correspond
+    /// to the state at which the first range in the given slice should be
+    /// inserted. The slice given must not be empty and it must be no longer
+    /// than 4.
+    fn new(state_id: StateID, ranges: &[Utf8Range]) -> NextInsert {
+        let len = ranges.len();
+        assert!(len > 0);
+        assert!(len <= 4);
+
+        let mut tmp = [Utf8Range { start: 0, end: 0 }; 4];
+        tmp[..len].copy_from_slice(ranges);
+        NextInsert { state_id, ranges: tmp, len: len as u8 }
+    }
+
+    /// Push a new empty state to visit along with any remaining ranges that
+    /// still need to be inserted. The ID of the new empty state is returned.
+    ///
+    /// If ranges is empty, then no new state is created and FINAL is returned.
+    fn push(
+        trie: &mut RangeTrie,
+        stack: &mut Vec<NextInsert>,
+        ranges: &[Utf8Range],
+    ) -> StateID {
+        if ranges.is_empty() {
+            FINAL
+        } else {
+            let next_id = trie.add_empty();
+            stack.push(NextInsert::new(next_id, ranges));
+            next_id
+        }
+    }
+
+    /// Return the ID of the state to visit.
+    fn state_id(&self) -> StateID {
+        self.state_id
+    }
+
+    /// Return the remaining ranges to insert.
+    fn ranges(&self) -> &[Utf8Range] {
+        &self.ranges[..self.len as usize]
+    }
+}
+
+/// Split represents a partitioning of two ranges into one or more ranges. This
+/// is the secret sauce that makes a range trie work, as it's what tells us
+/// how to deal with two overlapping but unequal ranges during insertion.
+///
+/// Essentially, either two ranges overlap or they don't. If they don't, then
+/// handling insertion is easy: just insert the new range into its
+/// lexicographically correct position. Since it does not overlap with anything
+/// else, no other transitions are impacted by the new range.
+///
+/// If they do overlap though, there are generally three possible cases to
+/// handle:
+///
+/// 1. The part where the two ranges actually overlap. i.e., The intersection.
+/// 2. The part of the existing range that is not in the new range.
+/// 3. The part of the new range that is not in the old range.
+///
+/// (1) is guaranteed to always occur since all overlapping ranges have a
+/// non-empty intersection. If the two ranges are not equivalent, then at
+/// least one of (2) or (3) is guaranteed to occur as well. In some cases,
+/// e.g., `[0-4]` and `[4-9]`, all three cases will occur.
+///
+/// This `Split` type is responsible for providing (1), (2) and (3) for any
+/// possible pair of byte ranges.
+///
+/// As for insertion, for the overlap in (1), the remaining ranges to insert
+/// should be added by following the corresponding transition. However, this
+/// should only be done for the overlapping parts of the range. If there was
+/// a part of the existing range that was not in the new range, then that
+/// existing part must be split off from the transition and duplicated. The
+/// remaining parts of the overlap can then be added to using the new ranges
+/// without disturbing the existing range.
+///
+/// Handling the case for the part of a new range that is not in an existing
+/// range is seemingly easy. Just treat it as if it were a non-overlapping
+/// range. The problem here is that if this new non-overlapping range occurs
+/// after both (1) and (2), then it's possible that it can overlap with the
+/// next transition in the current state. If it does, then the whole process
+/// must be repeated!
+///
+/// # Details of the 3 cases
+///
+/// The following details the various cases that are implemented in code
+/// below. It's plausible that the number of cases is not actually minimal,
+/// but it's important for this code to remain at least somewhat readable.
+///
+/// Given [a,b] and [x,y], where a <= b, x <= y, b < 256 and y < 256, we define
+/// the following distinct relationships where at least one must apply. The
+/// order of these matters, since multiple can match. The first to match
+/// applies.
+///
+/// 1. b < x <=> [a,b] < [x,y]
+/// 2. y < a <=> [x,y] < [a,b]
+///
+/// In the case of (1) and (2), these are the only cases where there is no
+/// overlap. Or otherwise, the intersection of [a,b] and [x,y] is empty. In
+/// order to compute the intersection, one can do [max(a,x), min(b,y)]. The
+/// intersection in all of the following cases is non-empty.
+///
+/// 3. a = x && b = y <=> [a,b] == [x,y]
+/// 4. a = x && b < y <=> [x,y] right-extends [a,b]
+/// 5. b = y && a > x <=> [x,y] left-extends [a,b]
+/// 6. x = a && y < b <=> [a,b] right-extends [x,y]
+/// 7. y = b && x > a <=> [a,b] left-extends [x,y]
+/// 8. a > x && b < y <=> [x,y] covers [a,b]
+/// 9. x > a && y < b <=> [a,b] covers [x,y]
+/// 10. b = x && a < y <=> [a,b] is left-adjacent to [x,y]
+/// 11. y = a && x < b <=> [x,y] is left-adjacent to [a,b]
+/// 12. b > x && b < y <=> [a,b] left-overlaps [x,y]
+/// 13. y > a && y < b <=> [x,y] left-overlaps [a,b]
+///
+/// In cases 3-13, we can form rules that partition the ranges into a
+/// non-overlapping ordered sequence of ranges:
+///
+/// 3. [a,b]
+/// 4. [a,b], [b+1,y]
+/// 5. [x,a-1], [a,b]
+/// 6. [x,y], [y+1,b]
+/// 7. [a,x-1], [x,y]
+/// 8. [x,a-1], [a,b], [b+1,y]
+/// 9. [a,x-1], [x,y], [y+1,b]
+/// 10. [a,b-1], [b,b], [b+1,y]
+/// 11. [x,y-1], [y,y], [y+1,b]
+/// 12. [a,x-1], [x,b], [b+1,y]
+/// 13. [x,a-1], [a,y], [y+1,b]
+///
+/// In the code below, we go a step further and identify each of the above
+/// outputs as belonging either to the overlap of the two ranges or to one
+/// of [a,b] or [x,y] exclusively.
+#[derive(Clone, Debug, Eq, PartialEq)]
+struct Split {
+    /// Up to three partitions; only the first `len` entries are meaningful.
+    partitions: [SplitRange; 3],
+    /// The number of valid partitions in the above array (1, 2 or 3).
+    len: usize,
+}
+
+/// A tagged range indicating how it was derived from a pair of ranges.
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+enum SplitRange {
+    Old(Utf8Range),
+    New(Utf8Range),
+    Both(Utf8Range),
+}
+
+impl Split {
+    /// Create a partitioning of the given ranges.
+    ///
+    /// If the given ranges have an empty intersection, then None is returned.
+    fn new(o: Utf8Range, n: Utf8Range) -> Option<Split> {
+        let range = |r: RangeInclusive<u8>| Utf8Range {
+            start: *r.start(),
+            end: *r.end(),
+        };
+        let old = |r| SplitRange::Old(range(r));
+        let new = |r| SplitRange::New(range(r));
+        let both = |r| SplitRange::Both(range(r));
+
+        // Use same names as the comment above to make it easier to compare.
+        let (a, b, x, y) = (o.start, o.end, n.start, n.end);
+
+        // The order of these cases matters: several conditions can match the
+        // same input, and the case analysis above assumes first-match-wins.
+        if b < x || y < a {
+            // case 1, case 2
+            None
+        } else if a == x && b == y {
+            // case 3
+            Some(Split::parts1(both(a..=b)))
+        } else if a == x && b < y {
+            // case 4
+            Some(Split::parts2(both(a..=b), new(b + 1..=y)))
+        } else if b == y && a > x {
+            // case 5
+            Some(Split::parts2(new(x..=a - 1), both(a..=b)))
+        } else if x == a && y < b {
+            // case 6
+            Some(Split::parts2(both(x..=y), old(y + 1..=b)))
+        } else if y == b && x > a {
+            // case 7
+            Some(Split::parts2(old(a..=x - 1), both(x..=y)))
+        } else if a > x && b < y {
+            // case 8
+            Some(Split::parts3(new(x..=a - 1), both(a..=b), new(b + 1..=y)))
+        } else if x > a && y < b {
+            // case 9
+            Some(Split::parts3(old(a..=x - 1), both(x..=y), old(y + 1..=b)))
+        } else if b == x && a < y {
+            // case 10
+            Some(Split::parts3(old(a..=b - 1), both(b..=b), new(b + 1..=y)))
+        } else if y == a && x < b {
+            // case 11
+            Some(Split::parts3(new(x..=y - 1), both(y..=y), old(y + 1..=b)))
+        } else if b > x && b < y {
+            // case 12
+            Some(Split::parts3(old(a..=x - 1), both(x..=b), new(b + 1..=y)))
+        } else if y > a && y < b {
+            // case 13
+            Some(Split::parts3(new(x..=a - 1), both(a..=y), old(y + 1..=b)))
+        } else {
+            unreachable!()
+        }
+    }
+
+    /// Create a new split with a single partition. This only occurs when two
+    /// ranges are equivalent.
+    fn parts1(r1: SplitRange) -> Split {
+        // This value doesn't matter since it is never accessed.
+        let nada = SplitRange::Old(Utf8Range { start: 0, end: 0 });
+        Split { partitions: [r1, nada, nada], len: 1 }
+    }
+
+    /// Create a new split with two partitions.
+    fn parts2(r1: SplitRange, r2: SplitRange) -> Split {
+        // This value doesn't matter since it is never accessed.
+        let nada = SplitRange::Old(Utf8Range { start: 0, end: 0 });
+        Split { partitions: [r1, r2, nada], len: 2 }
+    }
+
+    /// Create a new split with three partitions.
+    fn parts3(r1: SplitRange, r2: SplitRange, r3: SplitRange) -> Split {
+        Split { partitions: [r1, r2, r3], len: 3 }
+    }
+
+    /// Return the partitions in this split as a slice.
+    fn as_slice(&self) -> &[SplitRange] {
+        &self.partitions[..self.len]
+    }
+}
+
+impl fmt::Debug for RangeTrie {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        writeln!(f, "")?;
+        for (i, state) in self.states.iter().enumerate() {
+            // The shared final state is marked with a '*'.
+            let status = if i == FINAL as usize { '*' } else { ' ' };
+            writeln!(f, "{}{:06}: {:?}", status, i, state)?;
+        }
+        Ok(())
+    }
+}
+
+impl fmt::Debug for State {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        let rs = self
+            .transitions
+            .iter()
+            .map(|t| format!("{:?}", t))
+            .collect::<Vec<String>>()
+            .join(", ");
+        write!(f, "{}", rs)
+    }
+}
+
+impl fmt::Debug for Transition {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        // NOTE(review): the target state ID is printed with the same {:02X}
+        // hex format as the byte values — confirm that is intentional.
+        if self.range.start == self.range.end {
+            write!(f, "{:02X} => {:02X}", self.range.start, self.next_id)
+        } else {
+            write!(
+                f,
+                "{:02X}-{:02X} => {:02X}",
+                self.range.start, self.range.end, self.next_id
+            )
+        }
+    }
+}
+
+/// Returns true if and only if the given ranges intersect.
+fn intersects(r1: Utf8Range, r2: Utf8Range) -> bool {
+    // Two inclusive ranges overlap unless one ends before the other starts.
+    !(r1.end < r2.start || r2.end < r1.start)
+}
+
+#[cfg(test)]
+mod tests {
+    use std::ops::RangeInclusive;
+
+    use regex_syntax::utf8::Utf8Range;
+
+    use super::*;
+
+    /// Convenience for building a `Utf8Range` from an inclusive range.
+    fn r(range: RangeInclusive<u8>) -> Utf8Range {
+        Utf8Range { start: *range.start(), end: *range.end() }
+    }
+
+    /// Split `old` with `new`, returning `None` when they don't intersect.
+    fn split_maybe(
+        old: RangeInclusive<u8>,
+        new: RangeInclusive<u8>,
+    ) -> Option<Split> {
+        Split::new(r(old), r(new))
+    }
+
+    /// Split `old` with `new`, panicking if they don't intersect.
+    fn split(
+        old: RangeInclusive<u8>,
+        new: RangeInclusive<u8>,
+    ) -> Vec<SplitRange> {
+        split_maybe(old, new).unwrap().as_slice().to_vec()
+    }
+
+    // Cases below refer to the numbered case analysis in the `Split` docs.
+    #[test]
+    fn no_splits() {
+        // case 1
+        assert_eq!(None, split_maybe(0..=1, 2..=3));
+        // case 2
+        assert_eq!(None, split_maybe(2..=3, 0..=1));
+    }
+
+    #[test]
+    fn splits() {
+        let range = |r: RangeInclusive<u8>| Utf8Range {
+            start: *r.start(),
+            end: *r.end(),
+        };
+        let old = |r| SplitRange::Old(range(r));
+        let new = |r| SplitRange::New(range(r));
+        let both = |r| SplitRange::Both(range(r));
+
+        // case 3
+        assert_eq!(split(0..=0, 0..=0), vec![both(0..=0)]);
+        assert_eq!(split(9..=9, 9..=9), vec![both(9..=9)]);
+
+        // case 4
+        assert_eq!(split(0..=5, 0..=6), vec![both(0..=5), new(6..=6)]);
+        assert_eq!(split(0..=5, 0..=8), vec![both(0..=5), new(6..=8)]);
+        assert_eq!(split(5..=5, 5..=8), vec![both(5..=5), new(6..=8)]);
+
+        // case 5
+        assert_eq!(split(1..=5, 0..=5), vec![new(0..=0), both(1..=5)]);
+        assert_eq!(split(3..=5, 0..=5), vec![new(0..=2), both(3..=5)]);
+        assert_eq!(split(5..=5, 0..=5), vec![new(0..=4), both(5..=5)]);
+
+        // case 6
+        assert_eq!(split(0..=6, 0..=5), vec![both(0..=5), old(6..=6)]);
+        assert_eq!(split(0..=8, 0..=5), vec![both(0..=5), old(6..=8)]);
+        assert_eq!(split(5..=8, 5..=5), vec![both(5..=5), old(6..=8)]);
+
+        // case 7
+        assert_eq!(split(0..=5, 1..=5), vec![old(0..=0), both(1..=5)]);
+        assert_eq!(split(0..=5, 3..=5), vec![old(0..=2), both(3..=5)]);
+        assert_eq!(split(0..=5, 5..=5), vec![old(0..=4), both(5..=5)]);
+
+        // case 8
+        assert_eq!(
+            split(3..=6, 2..=7),
+            vec![new(2..=2), both(3..=6), new(7..=7)],
+        );
+        assert_eq!(
+            split(3..=6, 1..=8),
+            vec![new(1..=2), both(3..=6), new(7..=8)],
+        );
+
+        // case 9
+        assert_eq!(
+            split(2..=7, 3..=6),
+            vec![old(2..=2), both(3..=6), old(7..=7)],
+        );
+        assert_eq!(
+            split(1..=8, 3..=6),
+            vec![old(1..=2), both(3..=6), old(7..=8)],
+        );
+
+        // case 10
+        assert_eq!(
+            split(3..=6, 6..=7),
+            vec![old(3..=5), both(6..=6), new(7..=7)],
+        );
+        assert_eq!(
+            split(3..=6, 6..=8),
+            vec![old(3..=5), both(6..=6), new(7..=8)],
+        );
+        assert_eq!(
+            split(5..=6, 6..=7),
+            vec![old(5..=5), both(6..=6), new(7..=7)],
+        );
+
+        // case 11
+        assert_eq!(
+            split(6..=7, 3..=6),
+            vec![new(3..=5), both(6..=6), old(7..=7)],
+        );
+        assert_eq!(
+            split(6..=8, 3..=6),
+            vec![new(3..=5), both(6..=6), old(7..=8)],
+        );
+        assert_eq!(
+            split(6..=7, 5..=6),
+            vec![new(5..=5), both(6..=6), old(7..=7)],
+        );
+
+        // case 12
+        assert_eq!(
+            split(3..=7, 5..=9),
+            vec![old(3..=4), both(5..=7), new(8..=9)],
+        );
+        assert_eq!(
+            split(3..=5, 4..=6),
+            vec![old(3..=3), both(4..=5), new(6..=6)],
+        );
+
+        // case 13
+        assert_eq!(
+            split(5..=9, 3..=7),
+            vec![new(3..=4), both(5..=7), old(8..=9)],
+        );
+        assert_eq!(
+            split(4..=6, 3..=5),
+            vec![new(3..=3), both(4..=5), old(6..=6)],
+        );
+    }
+
+    // Arguably there should be more tests here, but in practice, this data
+    // structure is well covered by the huge number of regex tests.
+}