Diffstat (limited to 'vendor/regex-automata/src/nfa')
-rw-r--r--  vendor/regex-automata/src/nfa/compiler.rs   1193
-rw-r--r--  vendor/regex-automata/src/nfa/map.rs          282
-rw-r--r--  vendor/regex-automata/src/nfa/mod.rs          252
-rw-r--r--  vendor/regex-automata/src/nfa/range_trie.rs  1048
4 files changed, 2775 insertions, 0 deletions
diff --git a/vendor/regex-automata/src/nfa/compiler.rs b/vendor/regex-automata/src/nfa/compiler.rs
new file mode 100644
index 000000000..d9b3945b3
--- /dev/null
+++ b/vendor/regex-automata/src/nfa/compiler.rs
@@ -0,0 +1,1193 @@
+// This module provides an NFA compiler using Thompson's construction
+// algorithm. The compiler takes a regex-syntax::Hir as input and emits an NFA
+// graph as output. The NFA graph is structured in a way that permits it to be
+// executed by a virtual machine and also used to efficiently build a DFA.
+//
+// The compiler deals with a slightly expanded set of NFA states that notably
+// includes an empty node that has exactly one epsilon transition to the next
+// state. In other words, it's a "goto" instruction if one views Thompson's NFA
+// as a set of bytecode instructions. These goto instructions are removed in
+// a subsequent phase before returning the NFA to the caller. The purpose of
+// these empty nodes is that they make the construction algorithm substantially
+// simpler to implement. We remove them before returning to the caller because
+// they can represent substantial overhead when traversing the NFA graph
+// (either while searching using the NFA directly or while building a DFA).
+//
+// In the future, it would be nice to provide a Glushkov compiler as well,
+// as it would work well as a bit-parallel NFA for smaller regexes. But
+// the Thompson construction is one I'm more familiar with and seems more
+// straight-forward to deal with when it comes to large Unicode character
+// classes.
+//
+// Internally, the compiler uses interior mutability to improve composition
+// in the face of the borrow checker. In particular, we'd really like to be
+// able to write things like this:
+//
+// self.c_concat(exprs.iter().map(|e| self.c(e)))
+//
+// Which elegantly uses iterators to build up a sequence of compiled regex
+// sub-expressions and then hands it off to the concatenating compiler
+// routine. Without interior mutability, the borrow checker won't let us
+// borrow `self` mutably both inside and outside the closure at the same
+// time.
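+//
+// As a rough sketch of the pattern used below (mirroring the `states` field
+// and the `add_empty` method defined later in this module), interior
+// mutability means state construction only needs `&self`:
+//
+//     struct Compiler {
+//         states: RefCell<Vec<CState>>,
+//         // ...
+//     }
+//
+//     impl Compiler {
+//         fn add_empty(&self) -> StateID {
+//             let id = self.states.borrow().len();
+//             self.states.borrow_mut().push(CState::Empty { next: 0 });
+//             id
+//         }
+//     }
+//
+// Since `add_empty` takes `&self` rather than `&mut self`, it can be called
+// from inside closures that also capture `self`, as in the `c_concat` example
+// above.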
+
+use std::cell::RefCell;
+use std::mem;
+
+use regex_syntax::hir::{self, Hir, HirKind};
+use regex_syntax::utf8::{Utf8Range, Utf8Sequences};
+
+use classes::ByteClassSet;
+use error::{Error, Result};
+use nfa::map::{Utf8BoundedMap, Utf8SuffixKey, Utf8SuffixMap};
+use nfa::range_trie::RangeTrie;
+use nfa::{State, StateID, Transition, NFA};
+
+/// Config knobs for the NFA compiler. See the builder's methods for more
+/// docs on each one.
+#[derive(Clone, Copy, Debug)]
+struct Config {
+ anchored: bool,
+ allow_invalid_utf8: bool,
+ reverse: bool,
+ shrink: bool,
+}
+
+impl Default for Config {
+ fn default() -> Config {
+ Config {
+ anchored: false,
+ allow_invalid_utf8: false,
+ reverse: false,
+ shrink: true,
+ }
+ }
+}
+
+/// A builder for compiling an NFA.
+#[derive(Clone, Debug)]
+pub struct Builder {
+ config: Config,
+}
+
+impl Builder {
+ /// Create a new NFA builder with its default configuration.
+ pub fn new() -> Builder {
+ Builder { config: Config::default() }
+ }
+
+ /// Compile the given high level intermediate representation of a regular
+ /// expression into an NFA.
+ ///
+ /// If there was a problem building the NFA, then an error is returned.
+ /// For example, if the regex uses unsupported features (such as zero-width
+ /// assertions), then an error is returned.
+ pub fn build(&self, expr: &Hir) -> Result<NFA> {
+ let mut nfa = NFA::always_match();
+ self.build_with(&mut Compiler::new(), &mut nfa, expr)?;
+ Ok(nfa)
+ }
+
+ /// Compile the given high level intermediate representation of a regular
+ /// expression into the given NFA, using the given compiler. Callers may
+ /// prefer this over `build` if they would like to reuse allocations while
+ /// compiling many regular expressions.
+ ///
+ /// On success, the given NFA is completely overwritten with the NFA
+ /// produced by the compiler.
+ ///
+ /// If there was a problem building the NFA, then an error is returned. For
+ /// example, if the regex uses unsupported features (such as zero-width
+ /// assertions), then an error is returned. When an error is returned,
+ /// the contents of `nfa` are unspecified and should not be relied upon.
+ /// However, it can still be reused in subsequent calls to this method.
+ pub fn build_with(
+ &self,
+ compiler: &mut Compiler,
+ nfa: &mut NFA,
+ expr: &Hir,
+ ) -> Result<()> {
+ compiler.clear();
+ compiler.configure(self.config);
+ compiler.compile(nfa, expr)
+ }
+
+ /// Set whether matching must be anchored at the beginning of the input.
+ ///
+ /// When enabled, a match must begin at the start of the input. When
+ /// disabled, the NFA will act as if the pattern started with a `.*?`,
+ /// which enables a match to appear anywhere.
+ ///
+ /// By default this is disabled.
+ pub fn anchored(&mut self, yes: bool) -> &mut Builder {
+ self.config.anchored = yes;
+ self
+ }
+
+ /// When enabled, the builder will permit the construction of an NFA that
+ /// may match invalid UTF-8.
+ ///
+ /// When disabled (the default), the builder is guaranteed to produce a
+ /// regex that will only ever match valid UTF-8 (otherwise, the builder
+ /// will return an error).
+ pub fn allow_invalid_utf8(&mut self, yes: bool) -> &mut Builder {
+ self.config.allow_invalid_utf8 = yes;
+ self
+ }
+
+ /// Reverse the NFA.
+ ///
+ /// An NFA reversal is performed by reversing all of the concatenated
+ /// sub-expressions in the original pattern, recursively. The resulting
+ /// NFA can be used to match the pattern starting from the end of a string
+ /// instead of the beginning of a string.
+ ///
+ /// Reversing the NFA is useful for building a reverse DFA, which is most
+ /// useful for finding the start of a match.
+ pub fn reverse(&mut self, yes: bool) -> &mut Builder {
+ self.config.reverse = yes;
+ self
+ }
+
+ /// Apply best effort heuristics to shrink the NFA at the expense of more
+ /// time/memory.
+ ///
+ /// This is enabled by default. Generally speaking, if one is using an NFA
+ /// to compile a DFA, then the extra time used to shrink the NFA will be
+ /// more than made up for during DFA construction (potentially by a lot).
+ /// In other words, enabling this can substantially decrease the overall
+ /// amount of time it takes to build a DFA.
+ ///
+ /// The only reason to disable this is if you want to compile an NFA and start
+ /// using it as quickly as possible without needing to build a DFA.
+ pub fn shrink(&mut self, yes: bool) -> &mut Builder {
+ self.config.shrink = yes;
+ self
+ }
+}
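+
+// A minimal usage sketch (illustration only) of reusing a single `Compiler`
+// across several patterns via `build_with`, assuming patterns are parsed to
+// an `Hir` with `regex_syntax::ParserBuilder`, as in the tests below:
+//
+//     let builder = Builder::new();
+//     let mut compiler = Compiler::new();
+//     let mut nfa = NFA::always_match();
+//     for pattern in &["abc", "[a-z]+"] {
+//         let hir = ParserBuilder::new().build().parse(pattern).unwrap();
+//         builder.build_with(&mut compiler, &mut nfa, &hir).unwrap();
+//         // `nfa` now holds the compiled NFA for `pattern`; it is
+//         // overwritten on the next iteration.
+//     }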
+
+/// A compiler that converts a regex abstract syntax to an NFA via Thompson's
+/// construction. Namely, this compiler permits epsilon transitions between
+/// states.
+///
+/// Users of this crate cannot use a compiler directly. Instead, all one can
+/// do is create one and use it via the
+/// [`Builder::build_with`](struct.Builder.html#method.build_with)
+/// method. This permits callers to reuse compilers in order to amortize
+/// allocations.
+#[derive(Clone, Debug)]
+pub struct Compiler {
+ /// The set of compiled NFA states. Once a state is compiled, it is
+ /// assigned a state ID equivalent to its index in this list. Subsequent
+ /// compilation can modify previous states by adding new transitions.
+ states: RefCell<Vec<CState>>,
+ /// The configuration from the builder.
+ config: Config,
+ /// State used for compiling character classes to UTF-8 byte automata.
+ /// State is not retained between character class compilations. This just
+ /// serves to amortize allocation to the extent possible.
+ utf8_state: RefCell<Utf8State>,
+ /// State used for arranging character classes in reverse into a trie.
+ trie_state: RefCell<RangeTrie>,
+ /// State used for caching common suffixes when compiling reverse UTF-8
+ /// automata (for Unicode character classes).
+ utf8_suffix: RefCell<Utf8SuffixMap>,
+ /// A map used to re-map state IDs when translating the compiler's internal
+ /// NFA state representation to the external NFA representation.
+ remap: RefCell<Vec<StateID>>,
+ /// A set of compiler internal state IDs that correspond to states that are
+ /// exclusively epsilon transitions, i.e., goto instructions, combined with
+ /// the state that they point to. This is used to record said states while
+ /// transforming the compiler's internal NFA representation to the external
+ /// form.
+ empties: RefCell<Vec<(StateID, StateID)>>,
+}
+
+/// A compiler intermediate state representation for an NFA that is only used
+/// during compilation. Once compilation is done, `CState`s are converted to
+/// `State`s, which have a much simpler representation.
+#[derive(Clone, Debug, Eq, PartialEq)]
+enum CState {
+ /// An empty state whose only purpose is to forward the automaton to
+ /// another state via an epsilon transition. These are useful during
+ /// compilation but are otherwise removed at the end.
+ Empty { next: StateID },
+ /// A state that only transitions to `next` if the current input byte is
+ /// in the range `[start, end]` (inclusive on both ends).
+ Range { range: Transition },
+ /// A state with possibly many transitions, represented in a sparse
+ /// fashion. Transitions are ordered lexicographically by input range.
+ /// As such, this may only be used when every transition has equal
+ /// priority. (In practice, this is only used for encoding large UTF-8
+ /// automata.)
+ Sparse { ranges: Vec<Transition> },
+ /// An alternation such that there exists an epsilon transition to all
+ /// states in `alternates`, where matches found via earlier transitions
+ /// are preferred over later transitions.
+ Union { alternates: Vec<StateID> },
+ /// An alternation such that there exists an epsilon transition to all
+ /// states in `alternates`, where matches found via later transitions
+ /// are preferred over earlier transitions.
+ ///
+ /// This "reverse" state exists as a convenience during compilation: it
+ /// permits easy construction of non-greedy combinations of NFA states.
+ /// At the end of compilation, Union and UnionReverse states are merged
+ /// into one Union type of state, where the latter has its epsilon
+ /// transitions reversed to reflect the priority inversion.
+ UnionReverse { alternates: Vec<StateID> },
+ /// A match state. There is exactly one such occurrence of this state in
+ /// an NFA.
+ Match,
+}
+
+/// A value that represents the result of compiling a sub-expression of a
+/// regex's HIR. Specifically, this represents a sub-graph of the NFA that
+/// has an initial state at `start` and a final state at `end`.
+#[derive(Clone, Copy, Debug)]
+pub struct ThompsonRef {
+ start: StateID,
+ end: StateID,
+}
+
+impl Compiler {
+ /// Create a new compiler.
+ pub fn new() -> Compiler {
+ Compiler {
+ states: RefCell::new(vec![]),
+ config: Config::default(),
+ utf8_state: RefCell::new(Utf8State::new()),
+ trie_state: RefCell::new(RangeTrie::new()),
+ utf8_suffix: RefCell::new(Utf8SuffixMap::new(1000)),
+ remap: RefCell::new(vec![]),
+ empties: RefCell::new(vec![]),
+ }
+ }
+
+ /// Clear any memory used by this compiler such that it is ready to compile
+ /// a new regex.
+ ///
+ /// It is preferable to reuse a compiler if possible in order to reuse
+ /// allocations.
+ fn clear(&self) {
+ self.states.borrow_mut().clear();
+ // We don't need to clear anything else since they are cleared on
+ // their own and only when they are used.
+ }
+
+ /// Configure this compiler from the builder's knobs.
+ ///
+ /// The compiler is always reconfigured by the builder before using it to
+ /// build an NFA.
+ fn configure(&mut self, config: Config) {
+ self.config = config;
+ }
+
+ /// Convert the current intermediate NFA to its final compiled form.
+ fn compile(&self, nfa: &mut NFA, expr: &Hir) -> Result<()> {
+ nfa.anchored = self.config.anchored;
+
+ let mut start = self.add_empty();
+ if !nfa.anchored {
+ let compiled = if self.config.allow_invalid_utf8 {
+ self.c_unanchored_prefix_invalid_utf8()?
+ } else {
+ self.c_unanchored_prefix_valid_utf8()?
+ };
+ self.patch(start, compiled.start);
+ start = compiled.end;
+ }
+ let compiled = self.c(&expr)?;
+ let match_id = self.add_match();
+ self.patch(start, compiled.start);
+ self.patch(compiled.end, match_id);
+ self.finish(nfa);
+ Ok(())
+ }
+
+ /// Finishes the compilation process and populates the provided NFA with
+ /// the final graph.
+ fn finish(&self, nfa: &mut NFA) {
+ let mut bstates = self.states.borrow_mut();
+ let mut remap = self.remap.borrow_mut();
+ remap.resize(bstates.len(), 0);
+ let mut empties = self.empties.borrow_mut();
+ empties.clear();
+
+ // We don't reuse allocations here because this is what we're
+ // returning.
+ nfa.states.clear();
+ let mut byteset = ByteClassSet::new();
+
+ // The idea here is to convert our intermediate states to their final
+ // form. The only real complexity here is the process of converting
+ // transitions, which are expressed in terms of state IDs. The new
+ // set of states will be smaller because of partial epsilon removal,
+ // so the state IDs will not be the same.
+ for (id, bstate) in bstates.iter_mut().enumerate() {
+ match *bstate {
+ CState::Empty { next } => {
+ // Since we're removing empty states, we need to handle
+ // them later since we don't yet know which new state this
+ // empty state will be mapped to.
+ empties.push((id, next));
+ }
+ CState::Range { ref range } => {
+ remap[id] = nfa.states.len();
+ byteset.set_range(range.start, range.end);
+ nfa.states.push(State::Range { range: range.clone() });
+ }
+ CState::Sparse { ref mut ranges } => {
+ remap[id] = nfa.states.len();
+
+ let ranges = mem::replace(ranges, vec![]);
+ for r in &ranges {
+ byteset.set_range(r.start, r.end);
+ }
+ nfa.states.push(State::Sparse {
+ ranges: ranges.into_boxed_slice(),
+ });
+ }
+ CState::Union { ref mut alternates } => {
+ remap[id] = nfa.states.len();
+
+ let alternates = mem::replace(alternates, vec![]);
+ nfa.states.push(State::Union {
+ alternates: alternates.into_boxed_slice(),
+ });
+ }
+ CState::UnionReverse { ref mut alternates } => {
+ remap[id] = nfa.states.len();
+
+ let mut alternates = mem::replace(alternates, vec![]);
+ alternates.reverse();
+ nfa.states.push(State::Union {
+ alternates: alternates.into_boxed_slice(),
+ });
+ }
+ CState::Match => {
+ remap[id] = nfa.states.len();
+ nfa.states.push(State::Match);
+ }
+ }
+ }
+ for &(empty_id, mut empty_next) in empties.iter() {
+ // empty states can point to other empty states, forming a chain.
+ // So we must follow the chain until the end, which must end at
+ // a non-empty state, and therefore, a state that is correctly
+ // remapped. We are guaranteed to terminate because our compiler
+ // never builds a loop among empty states.
+ while let CState::Empty { next } = bstates[empty_next] {
+ empty_next = next;
+ }
+ remap[empty_id] = remap[empty_next];
+ }
+ for state in &mut nfa.states {
+ state.remap(&remap);
+ }
+ // The compiler always begins the NFA at the first state.
+ nfa.start = remap[0];
+ nfa.byte_classes = byteset.byte_classes();
+ }
+
+ fn c(&self, expr: &Hir) -> Result<ThompsonRef> {
+ match *expr.kind() {
+ HirKind::Empty => {
+ let id = self.add_empty();
+ Ok(ThompsonRef { start: id, end: id })
+ }
+ HirKind::Literal(hir::Literal::Unicode(ch)) => {
+ let mut buf = [0; 4];
+ let it = ch
+ .encode_utf8(&mut buf)
+ .as_bytes()
+ .iter()
+ .map(|&b| Ok(self.c_range(b, b)));
+ self.c_concat(it)
+ }
+ HirKind::Literal(hir::Literal::Byte(b)) => Ok(self.c_range(b, b)),
+ HirKind::Class(hir::Class::Bytes(ref cls)) => {
+ self.c_byte_class(cls)
+ }
+ HirKind::Class(hir::Class::Unicode(ref cls)) => {
+ self.c_unicode_class(cls)
+ }
+ HirKind::Repetition(ref rep) => self.c_repetition(rep),
+ HirKind::Group(ref group) => self.c(&*group.hir),
+ HirKind::Concat(ref exprs) => {
+ self.c_concat(exprs.iter().map(|e| self.c(e)))
+ }
+ HirKind::Alternation(ref exprs) => {
+ self.c_alternation(exprs.iter().map(|e| self.c(e)))
+ }
+ HirKind::Anchor(_) => Err(Error::unsupported_anchor()),
+ HirKind::WordBoundary(_) => Err(Error::unsupported_word()),
+ }
+ }
+
+ fn c_concat<I>(&self, mut it: I) -> Result<ThompsonRef>
+ where
+ I: DoubleEndedIterator<Item = Result<ThompsonRef>>,
+ {
+ let first =
+ if self.config.reverse { it.next_back() } else { it.next() };
+ let ThompsonRef { start, mut end } = match first {
+ Some(result) => result?,
+ None => return Ok(self.c_empty()),
+ };
+ loop {
+ let next =
+ if self.config.reverse { it.next_back() } else { it.next() };
+ let compiled = match next {
+ Some(result) => result?,
+ None => break,
+ };
+ self.patch(end, compiled.start);
+ end = compiled.end;
+ }
+ Ok(ThompsonRef { start, end })
+ }
+
+ fn c_alternation<I>(&self, mut it: I) -> Result<ThompsonRef>
+ where
+ I: Iterator<Item = Result<ThompsonRef>>,
+ {
+ let first = it.next().expect("alternations must be non-empty")?;
+ let second = match it.next() {
+ None => return Ok(first),
+ Some(result) => result?,
+ };
+
+ let union = self.add_union();
+ let end = self.add_empty();
+ self.patch(union, first.start);
+ self.patch(first.end, end);
+ self.patch(union, second.start);
+ self.patch(second.end, end);
+ for result in it {
+ let compiled = result?;
+ self.patch(union, compiled.start);
+ self.patch(compiled.end, end);
+ }
+ Ok(ThompsonRef { start: union, end })
+ }
+
+ fn c_repetition(&self, rep: &hir::Repetition) -> Result<ThompsonRef> {
+ match rep.kind {
+ hir::RepetitionKind::ZeroOrOne => {
+ self.c_zero_or_one(&rep.hir, rep.greedy)
+ }
+ hir::RepetitionKind::ZeroOrMore => {
+ self.c_at_least(&rep.hir, rep.greedy, 0)
+ }
+ hir::RepetitionKind::OneOrMore => {
+ self.c_at_least(&rep.hir, rep.greedy, 1)
+ }
+ hir::RepetitionKind::Range(ref rng) => match *rng {
+ hir::RepetitionRange::Exactly(count) => {
+ self.c_exactly(&rep.hir, count)
+ }
+ hir::RepetitionRange::AtLeast(m) => {
+ self.c_at_least(&rep.hir, rep.greedy, m)
+ }
+ hir::RepetitionRange::Bounded(min, max) => {
+ self.c_bounded(&rep.hir, rep.greedy, min, max)
+ }
+ },
+ }
+ }
+
+ fn c_bounded(
+ &self,
+ expr: &Hir,
+ greedy: bool,
+ min: u32,
+ max: u32,
+ ) -> Result<ThompsonRef> {
+ let prefix = self.c_exactly(expr, min)?;
+ if min == max {
+ return Ok(prefix);
+ }
+
+ // It is tempting to compile the rest here as a concatenation
+ // of zero-or-one matches. i.e., for `a{2,5}`, compile it as if it
+ // were `aaa?a?a?`. The problem here is that it leads to this program:
+ //
+ // >000000: 61 => 01
+ // 000001: 61 => 02
+ // 000002: alt(03, 04)
+ // 000003: 61 => 04
+ // 000004: alt(05, 06)
+ // 000005: 61 => 06
+ // 000006: alt(07, 08)
+ // 000007: 61 => 08
+ // 000008: MATCH
+ //
+ // And effectively, once you hit state 2, the epsilon closure will
+ // include states 3, 4, 5, 6, 7 and 8, which is quite a bit. It is
+ // better to instead compile it like so:
+ //
+ // >000000: 61 => 01
+ // 000001: 61 => 02
+ // 000002: alt(03, 08)
+ // 000003: 61 => 04
+ // 000004: alt(05, 08)
+ // 000005: 61 => 06
+ // 000006: alt(07, 08)
+ // 000007: 61 => 08
+ // 000008: MATCH
+ //
+ // So that the epsilon closure of state 2 is now just 3 and 8.
+ let empty = self.add_empty();
+ let mut prev_end = prefix.end;
+ for _ in min..max {
+ let union = if greedy {
+ self.add_union()
+ } else {
+ self.add_reverse_union()
+ };
+ let compiled = self.c(expr)?;
+ self.patch(prev_end, union);
+ self.patch(union, compiled.start);
+ self.patch(union, empty);
+ prev_end = compiled.end;
+ }
+ self.patch(prev_end, empty);
+ Ok(ThompsonRef { start: prefix.start, end: empty })
+ }
+
+ fn c_at_least(
+ &self,
+ expr: &Hir,
+ greedy: bool,
+ n: u32,
+ ) -> Result<ThompsonRef> {
+ if n == 0 {
+ let union = if greedy {
+ self.add_union()
+ } else {
+ self.add_reverse_union()
+ };
+ let compiled = self.c(expr)?;
+ self.patch(union, compiled.start);
+ self.patch(compiled.end, union);
+ Ok(ThompsonRef { start: union, end: union })
+ } else if n == 1 {
+ let compiled = self.c(expr)?;
+ let union = if greedy {
+ self.add_union()
+ } else {
+ self.add_reverse_union()
+ };
+ self.patch(compiled.end, union);
+ self.patch(union, compiled.start);
+ Ok(ThompsonRef { start: compiled.start, end: union })
+ } else {
+ let prefix = self.c_exactly(expr, n - 1)?;
+ let last = self.c(expr)?;
+ let union = if greedy {
+ self.add_union()
+ } else {
+ self.add_reverse_union()
+ };
+ self.patch(prefix.end, last.start);
+ self.patch(last.end, union);
+ self.patch(union, last.start);
+ Ok(ThompsonRef { start: prefix.start, end: union })
+ }
+ }
+
+ fn c_zero_or_one(&self, expr: &Hir, greedy: bool) -> Result<ThompsonRef> {
+ let union =
+ if greedy { self.add_union() } else { self.add_reverse_union() };
+ let compiled = self.c(expr)?;
+ let empty = self.add_empty();
+ self.patch(union, compiled.start);
+ self.patch(union, empty);
+ self.patch(compiled.end, empty);
+ Ok(ThompsonRef { start: union, end: empty })
+ }
+
+ fn c_exactly(&self, expr: &Hir, n: u32) -> Result<ThompsonRef> {
+ let it = (0..n).map(|_| self.c(expr));
+ self.c_concat(it)
+ }
+
+ fn c_byte_class(&self, cls: &hir::ClassBytes) -> Result<ThompsonRef> {
+ let end = self.add_empty();
+ let mut trans = Vec::with_capacity(cls.ranges().len());
+ for r in cls.iter() {
+ trans.push(Transition {
+ start: r.start(),
+ end: r.end(),
+ next: end,
+ });
+ }
+ Ok(ThompsonRef { start: self.add_sparse(trans), end })
+ }
+
+ fn c_unicode_class(&self, cls: &hir::ClassUnicode) -> Result<ThompsonRef> {
+ // If all we have are ASCII ranges wrapped in a Unicode package, then
+ // there is zero reason to bring out the big guns. We can fit all ASCII
+ // ranges within a single sparse state.
+ if cls.is_all_ascii() {
+ let end = self.add_empty();
+ let mut trans = Vec::with_capacity(cls.ranges().len());
+ for r in cls.iter() {
+ assert!(r.start() <= '\x7F');
+ assert!(r.end() <= '\x7F');
+ trans.push(Transition {
+ start: r.start() as u8,
+ end: r.end() as u8,
+ next: end,
+ });
+ }
+ Ok(ThompsonRef { start: self.add_sparse(trans), end })
+ } else if self.config.reverse {
+ if !self.config.shrink {
+ // When we don't want to spend the extra time shrinking, we
+ // compile the UTF-8 automaton in reverse using something like
+ // the "naive" approach, but will attempt to re-use common
+ // suffixes.
+ self.c_unicode_class_reverse_with_suffix(cls)
+ } else {
+ // When we want to shrink our NFA for reverse UTF-8 automata,
+ // we cannot feed UTF-8 sequences directly to the UTF-8
+ // compiler, since the UTF-8 compiler requires all sequences
+ // to be lexicographically sorted. Instead, we organize our
+ // sequences into a range trie, which can then output our
+ // sequences in the correct order. Unfortunately, building the
+ // range trie is fairly expensive (but not nearly as expensive
+ // as building a DFA). Hence the reason why the 'shrink' option
+ // exists, so that this path can be toggled off.
+ let mut trie = self.trie_state.borrow_mut();
+ trie.clear();
+
+ for rng in cls.iter() {
+ for mut seq in Utf8Sequences::new(rng.start(), rng.end()) {
+ seq.reverse();
+ trie.insert(seq.as_slice());
+ }
+ }
+ let mut utf8_state = self.utf8_state.borrow_mut();
+ let mut utf8c = Utf8Compiler::new(self, &mut *utf8_state);
+ trie.iter(|seq| {
+ utf8c.add(&seq);
+ });
+ Ok(utf8c.finish())
+ }
+ } else {
+ // In the forward direction, we always shrink our UTF-8 automata
+ // because we can stream it right into the UTF-8 compiler. There
+ // is almost no downside (in either memory or time) to using this
+ // approach.
+ let mut utf8_state = self.utf8_state.borrow_mut();
+ let mut utf8c = Utf8Compiler::new(self, &mut *utf8_state);
+ for rng in cls.iter() {
+ for seq in Utf8Sequences::new(rng.start(), rng.end()) {
+ utf8c.add(seq.as_slice());
+ }
+ }
+ Ok(utf8c.finish())
+ }
+
+ // For reference, the code below is the "naive" version of compiling a
+ // UTF-8 automaton. It is deliciously simple (and works for both the
+ // forward and reverse cases), but will unfortunately produce very
+ // large NFAs. When compiling a forward automaton, the size difference
+ // can sometimes be an order of magnitude. For example, the '\w' regex
+ // will generate about ~3000 NFA states using the naive approach below,
+ // but only 283 states when using the approach above. This is because
+ // the approach above actually compiles a *minimal* (or near minimal,
+ // because of the bounded hashmap) UTF-8 automaton.
+ //
+ // The code below is kept as a reference point in order to make it
+ // easier to understand the higher level goal here.
+ /*
+ let it = cls
+ .iter()
+ .flat_map(|rng| Utf8Sequences::new(rng.start(), rng.end()))
+ .map(|seq| {
+ let it = seq
+ .as_slice()
+ .iter()
+ .map(|rng| Ok(self.c_range(rng.start, rng.end)));
+ self.c_concat(it)
+ });
+ self.c_alternation(it);
+ */
+ }
+
+ fn c_unicode_class_reverse_with_suffix(
+ &self,
+ cls: &hir::ClassUnicode,
+ ) -> Result<ThompsonRef> {
+ // N.B. It would likely be better to cache common *prefixes* in the
+ // reverse direction, but it's not quite clear how to do that. The
+ // advantage of caching suffixes is that it does give us a win, and
+ // has a very small additional overhead.
+ let mut cache = self.utf8_suffix.borrow_mut();
+ cache.clear();
+
+ let union = self.add_union();
+ let alt_end = self.add_empty();
+ for urng in cls.iter() {
+ for seq in Utf8Sequences::new(urng.start(), urng.end()) {
+ let mut end = alt_end;
+ for brng in seq.as_slice() {
+ let key = Utf8SuffixKey {
+ from: end,
+ start: brng.start,
+ end: brng.end,
+ };
+ let hash = cache.hash(&key);
+ if let Some(id) = cache.get(&key, hash) {
+ end = id;
+ continue;
+ }
+
+ let compiled = self.c_range(brng.start, brng.end);
+ self.patch(compiled.end, end);
+ end = compiled.start;
+ cache.set(key, hash, end);
+ }
+ self.patch(union, end);
+ }
+ }
+ Ok(ThompsonRef { start: union, end: alt_end })
+ }
+
+ fn c_range(&self, start: u8, end: u8) -> ThompsonRef {
+ let id = self.add_range(start, end);
+ ThompsonRef { start: id, end: id }
+ }
+
+ fn c_empty(&self) -> ThompsonRef {
+ let id = self.add_empty();
+ ThompsonRef { start: id, end: id }
+ }
+
+ fn c_unanchored_prefix_valid_utf8(&self) -> Result<ThompsonRef> {
+ self.c(&Hir::repetition(hir::Repetition {
+ kind: hir::RepetitionKind::ZeroOrMore,
+ greedy: false,
+ hir: Box::new(Hir::any(false)),
+ }))
+ }
+
+ fn c_unanchored_prefix_invalid_utf8(&self) -> Result<ThompsonRef> {
+ self.c(&Hir::repetition(hir::Repetition {
+ kind: hir::RepetitionKind::ZeroOrMore,
+ greedy: false,
+ hir: Box::new(Hir::any(true)),
+ }))
+ }
+
+ fn patch(&self, from: StateID, to: StateID) {
+ match self.states.borrow_mut()[from] {
+ CState::Empty { ref mut next } => {
+ *next = to;
+ }
+ CState::Range { ref mut range } => {
+ range.next = to;
+ }
+ CState::Sparse { .. } => {
+ panic!("cannot patch from a sparse NFA state")
+ }
+ CState::Union { ref mut alternates } => {
+ alternates.push(to);
+ }
+ CState::UnionReverse { ref mut alternates } => {
+ alternates.push(to);
+ }
+ CState::Match => {}
+ }
+ }
+
+ fn add_empty(&self) -> StateID {
+ let id = self.states.borrow().len();
+ self.states.borrow_mut().push(CState::Empty { next: 0 });
+ id
+ }
+
+ fn add_range(&self, start: u8, end: u8) -> StateID {
+ let id = self.states.borrow().len();
+ let trans = Transition { start, end, next: 0 };
+ let state = CState::Range { range: trans };
+ self.states.borrow_mut().push(state);
+ id
+ }
+
+ fn add_sparse(&self, ranges: Vec<Transition>) -> StateID {
+ if ranges.len() == 1 {
+ let id = self.states.borrow().len();
+ let state = CState::Range { range: ranges[0] };
+ self.states.borrow_mut().push(state);
+ return id;
+ }
+ let id = self.states.borrow().len();
+ let state = CState::Sparse { ranges };
+ self.states.borrow_mut().push(state);
+ id
+ }
+
+ fn add_union(&self) -> StateID {
+ let id = self.states.borrow().len();
+ let state = CState::Union { alternates: vec![] };
+ self.states.borrow_mut().push(state);
+ id
+ }
+
+ fn add_reverse_union(&self) -> StateID {
+ let id = self.states.borrow().len();
+ let state = CState::UnionReverse { alternates: vec![] };
+ self.states.borrow_mut().push(state);
+ id
+ }
+
+ fn add_match(&self) -> StateID {
+ let id = self.states.borrow().len();
+ self.states.borrow_mut().push(CState::Match);
+ id
+ }
+}
+
+#[derive(Debug)]
+struct Utf8Compiler<'a> {
+ nfac: &'a Compiler,
+ state: &'a mut Utf8State,
+ target: StateID,
+}
+
+#[derive(Clone, Debug)]
+struct Utf8State {
+ compiled: Utf8BoundedMap,
+ uncompiled: Vec<Utf8Node>,
+}
+
+#[derive(Clone, Debug)]
+struct Utf8Node {
+ trans: Vec<Transition>,
+ last: Option<Utf8LastTransition>,
+}
+
+#[derive(Clone, Debug)]
+struct Utf8LastTransition {
+ start: u8,
+ end: u8,
+}
+
+impl Utf8State {
+ fn new() -> Utf8State {
+ Utf8State { compiled: Utf8BoundedMap::new(5000), uncompiled: vec![] }
+ }
+
+ fn clear(&mut self) {
+ self.compiled.clear();
+ self.uncompiled.clear();
+ }
+}
+
+impl<'a> Utf8Compiler<'a> {
+ fn new(nfac: &'a Compiler, state: &'a mut Utf8State) -> Utf8Compiler<'a> {
+ let target = nfac.add_empty();
+ state.clear();
+ let mut utf8c = Utf8Compiler { nfac, state, target };
+ utf8c.add_empty();
+ utf8c
+ }
+
+ fn finish(&mut self) -> ThompsonRef {
+ self.compile_from(0);
+ let node = self.pop_root();
+ let start = self.compile(node);
+ ThompsonRef { start, end: self.target }
+ }
+
+ fn add(&mut self, ranges: &[Utf8Range]) {
+ let prefix_len = ranges
+ .iter()
+ .zip(&self.state.uncompiled)
+ .take_while(|&(range, node)| {
+ node.last.as_ref().map_or(false, |t| {
+ (t.start, t.end) == (range.start, range.end)
+ })
+ })
+ .count();
+ assert!(prefix_len < ranges.len());
+ self.compile_from(prefix_len);
+ self.add_suffix(&ranges[prefix_len..]);
+ }
+
+ fn compile_from(&mut self, from: usize) {
+ let mut next = self.target;
+ while from + 1 < self.state.uncompiled.len() {
+ let node = self.pop_freeze(next);
+ next = self.compile(node);
+ }
+ self.top_last_freeze(next);
+ }
+
+ fn compile(&mut self, node: Vec<Transition>) -> StateID {
+ let hash = self.state.compiled.hash(&node);
+ if let Some(id) = self.state.compiled.get(&node, hash) {
+ return id;
+ }
+ let id = self.nfac.add_sparse(node.clone());
+ self.state.compiled.set(node, hash, id);
+ id
+ }
+
+ fn add_suffix(&mut self, ranges: &[Utf8Range]) {
+ assert!(!ranges.is_empty());
+ let last = self
+ .state
+ .uncompiled
+ .len()
+ .checked_sub(1)
+ .expect("non-empty nodes");
+ assert!(self.state.uncompiled[last].last.is_none());
+ self.state.uncompiled[last].last = Some(Utf8LastTransition {
+ start: ranges[0].start,
+ end: ranges[0].end,
+ });
+ for r in &ranges[1..] {
+ self.state.uncompiled.push(Utf8Node {
+ trans: vec![],
+ last: Some(Utf8LastTransition { start: r.start, end: r.end }),
+ });
+ }
+ }
+
+ fn add_empty(&mut self) {
+ self.state.uncompiled.push(Utf8Node { trans: vec![], last: None });
+ }
+
+ fn pop_freeze(&mut self, next: StateID) -> Vec<Transition> {
+ let mut uncompiled = self.state.uncompiled.pop().unwrap();
+ uncompiled.set_last_transition(next);
+ uncompiled.trans
+ }
+
+ fn pop_root(&mut self) -> Vec<Transition> {
+ assert_eq!(self.state.uncompiled.len(), 1);
+ assert!(self.state.uncompiled[0].last.is_none());
+ self.state.uncompiled.pop().expect("non-empty nodes").trans
+ }
+
+ fn top_last_freeze(&mut self, next: StateID) {
+ let last = self
+ .state
+ .uncompiled
+ .len()
+ .checked_sub(1)
+ .expect("non-empty nodes");
+ self.state.uncompiled[last].set_last_transition(next);
+ }
+}
+
+impl Utf8Node {
+ fn set_last_transition(&mut self, next: StateID) {
+ if let Some(last) = self.last.take() {
+ self.trans.push(Transition {
+ start: last.start,
+ end: last.end,
+ next,
+ });
+ }
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use regex_syntax::hir::Hir;
+ use regex_syntax::ParserBuilder;
+
+ use super::{Builder, State, StateID, Transition, NFA};
+
+ fn parse(pattern: &str) -> Hir {
+ ParserBuilder::new().build().parse(pattern).unwrap()
+ }
+
+ fn build(pattern: &str) -> NFA {
+ Builder::new().anchored(true).build(&parse(pattern)).unwrap()
+ }
+
+ fn s_byte(byte: u8, next: StateID) -> State {
+ let trans = Transition { start: byte, end: byte, next };
+ State::Range { range: trans }
+ }
+
+ fn s_range(start: u8, end: u8, next: StateID) -> State {
+ let trans = Transition { start, end, next };
+ State::Range { range: trans }
+ }
+
+ fn s_sparse(ranges: &[(u8, u8, StateID)]) -> State {
+ let ranges = ranges
+ .iter()
+ .map(|&(start, end, next)| Transition { start, end, next })
+ .collect();
+ State::Sparse { ranges }
+ }
+
+ fn s_union(alts: &[StateID]) -> State {
+ State::Union { alternates: alts.to_vec().into_boxed_slice() }
+ }
+
+ fn s_match() -> State {
+ State::Match
+ }
+
+ #[test]
+ fn errors() {
+ // unsupported anchors
+ assert!(Builder::new().build(&parse(r"^")).is_err());
+ assert!(Builder::new().build(&parse(r"$")).is_err());
+ assert!(Builder::new().build(&parse(r"\A")).is_err());
+ assert!(Builder::new().build(&parse(r"\z")).is_err());
+
+ // unsupported word boundaries
+ assert!(Builder::new().build(&parse(r"\b")).is_err());
+ assert!(Builder::new().build(&parse(r"\B")).is_err());
+ assert!(Builder::new().build(&parse(r"(?-u)\b")).is_err());
+ }
+
+ // Test that building an unanchored NFA has an appropriate `.*?` prefix.
+ #[test]
+ fn compile_unanchored_prefix() {
+ // When the machine can only match valid UTF-8.
+ let nfa = Builder::new().anchored(false).build(&parse(r"a")).unwrap();
+ // There should be many states since the `.` in `.*?` matches any
+ // Unicode scalar value.
+ assert_eq!(11, nfa.len());
+ assert_eq!(nfa.states[10], s_match());
+ assert_eq!(nfa.states[9], s_byte(b'a', 10));
+
+ // When the machine can match invalid UTF-8.
+ let nfa = Builder::new()
+ .anchored(false)
+ .allow_invalid_utf8(true)
+ .build(&parse(r"a"))
+ .unwrap();
+ assert_eq!(
+ nfa.states,
+ &[
+ s_union(&[2, 1]),
+ s_range(0, 255, 0),
+ s_byte(b'a', 3),
+ s_match(),
+ ]
+ );
+ }
+
+ #[test]
+ fn compile_empty() {
+ assert_eq!(build("").states, &[s_match(),]);
+ }
+
+ #[test]
+ fn compile_literal() {
+ assert_eq!(build("a").states, &[s_byte(b'a', 1), s_match(),]);
+ assert_eq!(
+ build("ab").states,
+ &[s_byte(b'a', 1), s_byte(b'b', 2), s_match(),]
+ );
+ assert_eq!(
+ build("☃").states,
+ &[s_byte(0xE2, 1), s_byte(0x98, 2), s_byte(0x83, 3), s_match(),]
+ );
+
+ // Check that non-UTF-8 literals work.
+ let hir = ParserBuilder::new()
+ .allow_invalid_utf8(true)
+ .build()
+ .parse(r"(?-u)\xFF")
+ .unwrap();
+ let nfa = Builder::new()
+ .anchored(true)
+ .allow_invalid_utf8(true)
+ .build(&hir)
+ .unwrap();
+ assert_eq!(nfa.states, &[s_byte(b'\xFF', 1), s_match(),]);
+ }
+
+ #[test]
+ fn compile_class() {
+ assert_eq!(
+ build(r"[a-z]").states,
+ &[s_range(b'a', b'z', 1), s_match(),]
+ );
+ assert_eq!(
+ build(r"[x-za-c]").states,
+ &[s_sparse(&[(b'a', b'c', 1), (b'x', b'z', 1)]), s_match()]
+ );
+ assert_eq!(
+ build(r"[\u03B1-\u03B4]").states,
+ &[s_range(0xB1, 0xB4, 2), s_byte(0xCE, 0), s_match()]
+ );
+ assert_eq!(
+ build(r"[\u03B1-\u03B4\u{1F919}-\u{1F91E}]").states,
+ &[
+ s_range(0xB1, 0xB4, 5),
+ s_range(0x99, 0x9E, 5),
+ s_byte(0xA4, 1),
+ s_byte(0x9F, 2),
+ s_sparse(&[(0xCE, 0xCE, 0), (0xF0, 0xF0, 3)]),
+ s_match(),
+ ]
+ );
+ assert_eq!(
+ build(r"[a-z☃]").states,
+ &[
+ s_byte(0x83, 3),
+ s_byte(0x98, 0),
+ s_sparse(&[(b'a', b'z', 3), (0xE2, 0xE2, 1)]),
+ s_match(),
+ ]
+ );
+ }
+
+ #[test]
+ fn compile_repetition() {
+ assert_eq!(
+ build(r"a?").states,
+ &[s_union(&[1, 2]), s_byte(b'a', 2), s_match(),]
+ );
+ assert_eq!(
+ build(r"a??").states,
+ &[s_union(&[2, 1]), s_byte(b'a', 2), s_match(),]
+ );
+ }
+
+ #[test]
+ fn compile_group() {
+ assert_eq!(
+ build(r"ab+").states,
+ &[s_byte(b'a', 1), s_byte(b'b', 2), s_union(&[1, 3]), s_match(),]
+ );
+ assert_eq!(
+ build(r"(ab)").states,
+ &[s_byte(b'a', 1), s_byte(b'b', 2), s_match(),]
+ );
+ assert_eq!(
+ build(r"(ab)+").states,
+ &[s_byte(b'a', 1), s_byte(b'b', 2), s_union(&[0, 3]), s_match(),]
+ );
+ }
+
+ #[test]
+ fn compile_alternation() {
+ assert_eq!(
+ build(r"a|b").states,
+ &[s_byte(b'a', 3), s_byte(b'b', 3), s_union(&[0, 1]), s_match(),]
+ );
+ assert_eq!(
+ build(r"|b").states,
+ &[s_byte(b'b', 2), s_union(&[2, 0]), s_match(),]
+ );
+ assert_eq!(
+ build(r"a|").states,
+ &[s_byte(b'a', 2), s_union(&[0, 2]), s_match(),]
+ );
+ }
+}
diff --git a/vendor/regex-automata/src/nfa/map.rs b/vendor/regex-automata/src/nfa/map.rs
new file mode 100644
index 000000000..e636c0dd3
--- /dev/null
+++ b/vendor/regex-automata/src/nfa/map.rs
@@ -0,0 +1,282 @@
+// This module contains a couple of simple, purpose-built hash maps. The key
+// trade off they make is that they serve as caches rather than true maps. That
+// is, inserting a new entry may cause eviction of another entry. This gives
+// us two things. First, there's less overhead associated with inserts and
+// lookups. Secondly, it lets us control our memory usage.
+//
+// These maps are used in some fairly hot code when generating NFA states for
+// large Unicode character classes.
+//
+// Instead of exposing a rich hashmap entry API, we just permit the caller
+// to produce a hash of the key directly. The hash can then be reused for both
+// lookups and insertions at the cost of leaking things a bit. But these are
+// for internal use only, so it's fine.
+//
+// The Utf8BoundedMap is used for Daciuk's algorithm for constructing an
+// (almost) minimal DFA for large Unicode character classes in linear time.
+// (Daciuk's algorithm is always used when compiling forward NFAs. For reverse
+// NFAs, it's only used when the compiler is configured to 'shrink' the NFA,
+// since there's a bit more expense in the reverse direction.)
+//
+// The Utf8SuffixMap is used when compiling large Unicode character classes for
+// reverse NFAs when 'shrink' is disabled. Specifically, it augments the naive
+// construction of UTF-8 automata by caching common suffixes. This doesn't
+// get the same space savings as Daciuk's algorithm, but it's basically as
+// fast as the naive approach and typically winds up using less memory (since
+// it generates smaller NFAs) despite the presence of the cache.
+//
+// These maps effectively represent caching mechanisms for CState::Sparse and
+// CState::Range, respectively. The former represents a single NFA state with
+// many transitions of equivalent priority while the latter represents a single
+// NFA state with a single transition. (Neither state ever has or is an
+// epsilon transition.) Thus, they have different key types. It's likely we
+// could make one generic map, but the machinery didn't seem worth it. They
+// are simple enough.
+
+use nfa::{StateID, Transition};
+
+// Basic FNV-1a hash constants as described in:
+// https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
+const PRIME: u64 = 1099511628211;
+const INIT: u64 = 14695981039346656037;
+
+/// A bounded hash map where the key is a sequence of NFA transitions and the
+/// value is a pre-existing NFA state ID.
+///
+/// std's hashmap can be used for this, however, this map has two important
+/// advantages. Firstly, it has lower overhead. Secondly, it permits us to
+/// control our memory usage by limiting the number of slots. In general, the
+/// cost here is that this map acts as a cache. That is, inserting a new entry
+/// may remove an old entry. We are okay with this, since it does not impact
+/// correctness in the cases where it is used. The only effect that dropping
+/// states from the cache has is that the resulting NFA generated may be bigger
+/// than it otherwise would be.
+///
+/// This improves benchmarks that compile large Unicode character classes,
+/// since it makes the generation of an (almost) minimal UTF-8 automaton faster.
+/// Specifically, one could observe the difference with std's hashmap via
+/// something like the following benchmark:
+///
+/// hyperfine "regex-automata-debug debug -acqr '\w{40} ecurB'"
+///
+/// But to observe that difference, you'd have to modify the code to use
+/// std's hashmap.
+///
+/// It is quite possible that there is a better way to approach this problem.
+/// For example, if there happens to be a very common state that collides with
+/// a lot of less frequent states, then we could wind up with very poor caching
+/// behavior. Alas, the effectiveness of this cache has not been measured.
+/// Instead, ad hoc experiments suggest that it is "good enough." Additional
+/// smarts (such as an LRU eviction policy) have to be weighed against the
+/// amount of extra time they cost.
+#[derive(Clone, Debug)]
+pub struct Utf8BoundedMap {
+ /// The current version of this map. Only entries with matching versions
+ /// are considered during lookups. If an entry is found with a mismatched
+ /// version, then the map behaves as if the entry does not exist.
+ version: u16,
+ /// The total number of entries this map can store.
+ capacity: usize,
+ /// The actual entries, keyed by hash. Collisions between different states
+ /// result in the old state being dropped.
+ map: Vec<Utf8BoundedEntry>,
+}
+
+/// An entry in this map.
+#[derive(Clone, Debug, Default)]
+struct Utf8BoundedEntry {
+ /// The version of the map used to produce this entry. If this entry's
+ /// version does not match the current version of the map, then the map
+ /// should behave as if this entry does not exist.
+ version: u16,
+ /// The key, which is a sorted sequence of non-overlapping NFA transitions.
+ key: Vec<Transition>,
+ /// The state ID corresponding to the state containing the transitions in
+ /// this entry.
+ val: StateID,
+}
+
+impl Utf8BoundedMap {
+ /// Create a new bounded map with the given capacity. The map will never
+ /// grow beyond the given size.
+ ///
+ /// Note that this does not allocate. Instead, callers must call `clear`
+ /// before using this map. `clear` will allocate space if necessary.
+ ///
+ /// This avoids the need to pay for the allocation of this map when
+ /// compiling regexes that lack large Unicode character classes.
+ pub fn new(capacity: usize) -> Utf8BoundedMap {
+ assert!(capacity > 0);
+ Utf8BoundedMap { version: 0, capacity, map: vec![] }
+ }
+
+ /// Clear this map of all entries, but permit the reuse of allocation
+ /// if possible.
+ ///
+ /// This must be called before the map can be used.
+ pub fn clear(&mut self) {
+ if self.map.is_empty() {
+ self.map = vec![Utf8BoundedEntry::default(); self.capacity];
+ } else {
+ self.version = self.version.wrapping_add(1);
+ if self.version == 0 {
+ self.map = vec![Utf8BoundedEntry::default(); self.capacity];
+ }
+ }
+ }
+
+ /// Return a hash of the given transitions.
+ pub fn hash(&self, key: &[Transition]) -> usize {
+ let mut h = INIT;
+ for t in key {
+ h = (h ^ (t.start as u64)).wrapping_mul(PRIME);
+ h = (h ^ (t.end as u64)).wrapping_mul(PRIME);
+ h = (h ^ (t.next as u64)).wrapping_mul(PRIME);
+ }
+ (h as usize) % self.map.len()
+ }
+
+ /// Retrieve the cached state ID corresponding to the given key. The hash
+ /// given must have been computed with `hash` using the same key value.
+ ///
+ /// If there is no cached state with the given transitions, then None is
+ /// returned.
+ pub fn get(&mut self, key: &[Transition], hash: usize) -> Option<StateID> {
+ let entry = &self.map[hash];
+ if entry.version != self.version {
+ return None;
+ }
+ // There may be a hash collision, so we need to confirm real equality.
+ if entry.key != key {
+ return None;
+ }
+ Some(entry.val)
+ }
+
+ /// Add a cached state to this map with the given key. Callers should
+ /// ensure that `state_id` points to a state that contains precisely the
+ /// NFA transitions given.
+ ///
+ /// `hash` must have been computed using the `hash` method with the same
+ /// key.
+ pub fn set(
+ &mut self,
+ key: Vec<Transition>,
+ hash: usize,
+ state_id: StateID,
+ ) {
+ self.map[hash] =
+ Utf8BoundedEntry { version: self.version, key, val: state_id };
+ }
+}
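+
+// A sketch of the intended lookup/insert pattern (this mirrors how the
+// compiler's `Utf8Compiler::compile` uses this map): compute the hash once
+// and reuse it for both the lookup and the insertion. `add_sparse_state` is
+// a hypothetical constructor standing in for whatever creates the NFA state.
+//
+//     let hash = map.hash(&ranges);
+//     if let Some(id) = map.get(&ranges, hash) {
+//         return id;
+//     }
+//     let id = add_sparse_state(ranges.clone()); // hypothetical constructor
+//     map.set(ranges, hash, id);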
+
+/// A cache of suffixes used to modestly compress UTF-8 automata for large
+/// Unicode character classes.
+#[derive(Clone, Debug)]
+pub struct Utf8SuffixMap {
+ /// The current version of this map. Only entries with matching versions
+ /// are considered during lookups. If an entry is found with a mismatched
+ /// version, then the map behaves as if the entry does not exist.
+ version: u16,
+ /// The total number of entries this map can store.
+ capacity: usize,
+ /// The actual entries, keyed by hash. Collisions between different states
+ /// result in the old state being dropped.
+ map: Vec<Utf8SuffixEntry>,
+}
+
+/// A key that uniquely identifies an NFA state. It is a triple that represents
+/// a transition from one state for a particular byte range.
+#[derive(Clone, Debug, Default, Eq, PartialEq)]
+pub struct Utf8SuffixKey {
+ pub from: StateID,
+ pub start: u8,
+ pub end: u8,
+}
+
+/// An entry in this map.
+#[derive(Clone, Debug, Default)]
+struct Utf8SuffixEntry {
+ /// The version of the map used to produce this entry. If this entry's
+ /// version does not match the current version of the map, then the map
+ /// should behave as if this entry does not exist.
+ version: u16,
+ /// The key, which consists of a transition in a particular state.
+ key: Utf8SuffixKey,
+ /// The identifier that the transition in the key maps to.
+ val: StateID,
+}
+
+impl Utf8SuffixMap {
+ /// Create a new bounded map with the given capacity. The map will never
+ /// grow beyond the given size.
+ ///
+ /// Note that this does not allocate. Instead, callers must call `clear`
+ /// before using this map. `clear` will allocate space if necessary.
+ ///
+ /// This avoids the need to pay for the allocation of this map when
+ /// compiling regexes that lack large Unicode character classes.
+ pub fn new(capacity: usize) -> Utf8SuffixMap {
+ assert!(capacity > 0);
+ Utf8SuffixMap { version: 0, capacity, map: vec![] }
+ }
+
+ /// Clear this map of all entries, but permit the reuse of allocation
+ /// if possible.
+ ///
+ /// This must be called before the map can be used.
+ pub fn clear(&mut self) {
+ if self.map.is_empty() {
+ self.map = vec![Utf8SuffixEntry::default(); self.capacity];
+ } else {
+ self.version = self.version.wrapping_add(1);
+ if self.version == 0 {
+ self.map = vec![Utf8SuffixEntry::default(); self.capacity];
+ }
+ }
+ }
+
+ /// Return a hash of the given transition.
+ pub fn hash(&self, key: &Utf8SuffixKey) -> usize {
+ // Basic FNV-1a hash as described in:
+ // https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
+ const PRIME: u64 = 1099511628211;
+ const INIT: u64 = 14695981039346656037;
+
+ let mut h = INIT;
+ h = (h ^ (key.from as u64)).wrapping_mul(PRIME);
+ h = (h ^ (key.start as u64)).wrapping_mul(PRIME);
+ h = (h ^ (key.end as u64)).wrapping_mul(PRIME);
+ (h as usize) % self.map.len()
+ }
+
+ /// Retrieve the cached state ID corresponding to the given key. The hash
+ /// given must have been computed with `hash` using the same key value.
+ ///
+ /// If there is no cached state with the given key, then None is returned.
+ pub fn get(
+ &mut self,
+ key: &Utf8SuffixKey,
+ hash: usize,
+ ) -> Option<StateID> {
+ let entry = &self.map[hash];
+ if entry.version != self.version {
+ return None;
+ }
+ if key != &entry.key {
+ return None;
+ }
+ Some(entry.val)
+ }
+
+ /// Add a cached state to this map with the given key. Callers should
+ /// ensure that `state_id` points to a state that contains precisely the
+ /// NFA transition given.
+ ///
+ /// `hash` must have been computed using the `hash` method with the same
+ /// key.
+ pub fn set(&mut self, key: Utf8SuffixKey, hash: usize, state_id: StateID) {
+ self.map[hash] =
+ Utf8SuffixEntry { version: self.version, key, val: state_id };
+ }
+}
diff --git a/vendor/regex-automata/src/nfa/mod.rs b/vendor/regex-automata/src/nfa/mod.rs
new file mode 100644
index 000000000..02d0501de
--- /dev/null
+++ b/vendor/regex-automata/src/nfa/mod.rs
@@ -0,0 +1,252 @@
+use std::fmt;
+
+use classes::ByteClasses;
+pub use nfa::compiler::Builder;
+
+mod compiler;
+mod map;
+mod range_trie;
+
+/// The representation for an NFA state identifier.
+pub type StateID = usize;
+
+/// A final compiled NFA.
+///
+/// The states of the NFA are indexed by state IDs, which are how transitions
+/// are expressed.
+#[derive(Clone)]
+pub struct NFA {
+ /// Whether this NFA can only match at the beginning of input or not.
+ ///
+ /// When true, a match should only be reported if it begins at the 0th
+ /// index of the haystack.
+ anchored: bool,
+ /// The starting state of this NFA.
+ start: StateID,
+ /// The state list. This list is guaranteed to be indexable by the starting
+ /// state ID, and it is also guaranteed to contain exactly one `Match`
+ /// state.
+ states: Vec<State>,
+ /// A mapping from any byte value to its corresponding equivalence class
+ /// identifier. Two bytes in the same equivalence class cannot discriminate
+ /// between a match or a non-match. This map can be used to shrink the
+ /// total size of a DFA's transition table with a small match-time cost.
+ ///
+ /// Note that the NFA's transitions are *not* defined in terms of these
+ /// equivalence classes. The NFA's transitions are defined on the original
+ /// byte values. For the most part, this is because they wouldn't really
+ /// help the NFA much since the NFA already uses a sparse representation
+ /// to represent transitions. Byte classes are most effective in a dense
+ /// representation.
+ byte_classes: ByteClasses,
+}
+
+impl NFA {
+ /// Returns an NFA that always matches at every position.
+ pub fn always_match() -> NFA {
+ NFA {
+ anchored: false,
+ start: 0,
+ states: vec![State::Match],
+ byte_classes: ByteClasses::empty(),
+ }
+ }
+
+ /// Returns an NFA that never matches at any position.
+ pub fn never_match() -> NFA {
+ NFA {
+ anchored: false,
+ start: 0,
+ states: vec![State::Fail],
+ byte_classes: ByteClasses::empty(),
+ }
+ }
+
+ /// Returns true if and only if this NFA is anchored.
+ pub fn is_anchored(&self) -> bool {
+ self.anchored
+ }
+
+ /// Return the number of states in this NFA.
+ pub fn len(&self) -> usize {
+ self.states.len()
+ }
+
+ /// Return the ID of the initial state of this NFA.
+ pub fn start(&self) -> StateID {
+ self.start
+ }
+
+ /// Return the NFA state corresponding to the given ID.
+ pub fn state(&self, id: StateID) -> &State {
+ &self.states[id]
+ }
+
+ /// Return the set of equivalence classes for this NFA. The slice returned
+ /// always has length 256 and maps each possible byte value to its
+ /// corresponding equivalence class ID (which is never more than 255).
+ pub fn byte_classes(&self) -> &ByteClasses {
+ &self.byte_classes
+ }
+}
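+
+// A small illustration (not part of the API) of walking an NFA with the
+// accessors above: count the states that have epsilon transitions.
+//
+//     fn count_epsilon_states(nfa: &NFA) -> usize {
+//         (0..nfa.len()).filter(|&id| nfa.state(id).is_epsilon()).count()
+//     }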
+
+impl fmt::Debug for NFA {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ for (i, state) in self.states.iter().enumerate() {
+ let status = if i == self.start { '>' } else { ' ' };
+ writeln!(f, "{}{:06}: {:?}", status, i, state)?;
+ }
+ Ok(())
+ }
+}
+
+/// A state in a final compiled NFA.
+#[derive(Clone, Eq, PartialEq)]
+pub enum State {
+ /// A state that transitions to `next` if and only if the current input
+ /// byte is in the range `[start, end]` (inclusive).
+ ///
+ /// This is a special case of Sparse in that it encodes only one transition
+ /// (and therefore avoids the allocation).
+ Range { range: Transition },
+ /// A state with possibly many transitions, represented in a sparse
+ /// fashion. Transitions are ordered lexicographically by input range.
+ /// As such, this may only be used when every transition has equal
+ /// priority. (In practice, this is only used for encoding large UTF-8
+ /// automata.)
+ Sparse { ranges: Box<[Transition]> },
+ /// An alternation such that there exists an epsilon transition to all
+ /// states in `alternates`, where matches found via earlier transitions
+ /// are preferred over later transitions.
+ Union { alternates: Box<[StateID]> },
+ /// A fail state. When encountered, the automaton is guaranteed to never
+ /// reach a match state.
+ Fail,
+ /// A match state. There is exactly one such occurrence of this state in
+ /// an NFA.
+ Match,
+}
+
+/// A transition to another state, only if the given byte falls in the
+/// inclusive range specified.
+#[derive(Clone, Copy, Eq, Hash, PartialEq)]
+pub struct Transition {
+ pub start: u8,
+ pub end: u8,
+ pub next: StateID,
+}
+
+impl State {
+ /// Returns true if and only if this state contains one or more epsilon
+ /// transitions.
+ pub fn is_epsilon(&self) -> bool {
+ match *self {
+ State::Range { .. }
+ | State::Sparse { .. }
+ | State::Fail
+ | State::Match => false,
+ State::Union { .. } => true,
+ }
+ }
+
+ /// Remap the transitions in this state using the given map. Namely, the
+ /// given map should be indexed according to the transitions currently
+ /// in this state.
+ ///
+ /// This is used during the final phase of the NFA compiler, which turns
+ /// its intermediate NFA into the final NFA.
+ fn remap(&mut self, remap: &[StateID]) {
+ match *self {
+ State::Range { ref mut range } => range.next = remap[range.next],
+ State::Sparse { ref mut ranges } => {
+ for r in ranges.iter_mut() {
+ r.next = remap[r.next];
+ }
+ }
+ State::Union { ref mut alternates } => {
+ for alt in alternates.iter_mut() {
+ *alt = remap[*alt];
+ }
+ }
+ State::Fail => {}
+ State::Match => {}
+ }
+ }
+}
+
+impl fmt::Debug for State {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ match *self {
+ State::Range { ref range } => range.fmt(f),
+ State::Sparse { ref ranges } => {
+ let rs = ranges
+ .iter()
+ .map(|t| format!("{:?}", t))
+ .collect::<Vec<String>>()
+ .join(", ");
+ write!(f, "sparse({})", rs)
+ }
+ State::Union { ref alternates } => {
+ let alts = alternates
+ .iter()
+ .map(|id| format!("{}", id))
+ .collect::<Vec<String>>()
+ .join(", ");
+ write!(f, "alt({})", alts)
+ }
+ State::Fail => write!(f, "FAIL"),
+ State::Match => write!(f, "MATCH"),
+ }
+ }
+}
+
+impl fmt::Debug for Transition {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ let Transition { start, end, next } = *self;
+ if self.start == self.end {
+ write!(f, "{} => {}", escape(start), next)
+ } else {
+ write!(f, "{}-{} => {}", escape(start), escape(end), next)
+ }
+ }
+}
+
+/// Return the given byte as its escaped string form.
+fn escape(b: u8) -> String {
+ use std::ascii;
+
+ String::from_utf8(ascii::escape_default(b).collect::<Vec<_>>()).unwrap()
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+ use dense;
+ use dfa::DFA;
+
+ #[test]
+ fn always_match() {
+ let nfa = NFA::always_match();
+ let dfa = dense::Builder::new().build_from_nfa::<usize>(&nfa).unwrap();
+
+ assert_eq!(Some(0), dfa.find_at(b"", 0));
+ assert_eq!(Some(0), dfa.find_at(b"a", 0));
+ assert_eq!(Some(1), dfa.find_at(b"a", 1));
+ assert_eq!(Some(0), dfa.find_at(b"ab", 0));
+ assert_eq!(Some(1), dfa.find_at(b"ab", 1));
+ assert_eq!(Some(2), dfa.find_at(b"ab", 2));
+ }
+
+ #[test]
+ fn never_match() {
+ let nfa = NFA::never_match();
+ let dfa = dense::Builder::new().build_from_nfa::<usize>(&nfa).unwrap();
+
+ assert_eq!(None, dfa.find_at(b"", 0));
+ assert_eq!(None, dfa.find_at(b"a", 0));
+ assert_eq!(None, dfa.find_at(b"a", 1));
+ assert_eq!(None, dfa.find_at(b"ab", 0));
+ assert_eq!(None, dfa.find_at(b"ab", 1));
+ assert_eq!(None, dfa.find_at(b"ab", 2));
+ }
+}
diff --git a/vendor/regex-automata/src/nfa/range_trie.rs b/vendor/regex-automata/src/nfa/range_trie.rs
new file mode 100644
index 000000000..50767c7c6
--- /dev/null
+++ b/vendor/regex-automata/src/nfa/range_trie.rs
@@ -0,0 +1,1048 @@
+// I've called the primary data structure in this module a "range trie." As far
+// as I can tell, there is no prior art on a data structure like this, however,
+// it's likely someone somewhere has built something like it. Searching for
+// "range trie" turns up the paper "Range Tries for Scalable Address Lookup,"
+// but it does not appear relevant.
+//
+// The range trie is just like a trie in that it is a special case of a
+// deterministic finite state machine. It has states and each state has a set
+// of transitions to other states. It is acyclic, and, like a normal trie,
+// it makes no attempt to reuse common suffixes among its elements. The key
+// difference between a normal trie and a range trie below is that a range trie
+// operates on *contiguous sequences* of bytes instead of singleton bytes.
+// One could say that our alphabet is ranges of bytes instead of bytes
+// themselves, except a key part of range trie construction is splitting ranges
+// apart to ensure there is at most one transition that can be taken for any
+// byte in a given state.
+//
+// I've tried to explain the details of how the range trie works below, so
+// for now, we are left with trying to understand the problem we're trying to
+// solve, which is itself fairly involved!
+//
+// At the highest level, here's what we want to do. We want to convert a
+// sequence of Unicode codepoints into a finite state machine whose transitions
+// are over *bytes* and *not* Unicode codepoints. We want this because it makes
+// said finite state machines much smaller and much faster to execute. As a
+// simple example, consider a byte oriented automaton for all Unicode scalar
+// values (0x00 through 0x10FFFF, not including surrogate codepoints):
+//
+// [00-7F]
+// [C2-DF][80-BF]
+// [E0-E0][A0-BF][80-BF]
+// [E1-EC][80-BF][80-BF]
+// [ED-ED][80-9F][80-BF]
+// [EE-EF][80-BF][80-BF]
+// [F0-F0][90-BF][80-BF][80-BF]
+// [F1-F3][80-BF][80-BF][80-BF]
+// [F4-F4][80-8F][80-BF][80-BF]
+//
+// (These byte ranges are generated via the regex-syntax::utf8 module, which
+// was based on Russ Cox's code in RE2, which was in turn based on Ken
+// Thompson's implementation of the same idea in his Plan9 implementation of
+// grep.)
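+//
+// (As a sketch, the listing above can be regenerated with the regex-syntax
+// crate's `Utf8Sequences` iterator; the exact `Debug` output format is an
+// assumption.)
+//
+//     use regex_syntax::utf8::Utf8Sequences;
+//
+//     // Enumerate the byte range sequences covering all scalar values.
+//     for seq in Utf8Sequences::new('\u{0}', '\u{10FFFF}') {
+//         println!("{:?}", seq);
+//     }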
+//
+// It should be fairly straight-forward to see how one could compile this into
+// a DFA. The sequences are sorted and non-overlapping. Essentially, you could
+// build a trie from this fairly easily. The problem comes when your initial
+// range (in this case, 0x00-0x10FFFF) isn't so nice. For example, the class
+// represented by '\w' contains only a tenth of the codepoints that
+// 0x00-0x10FFFF contains, but if we were to write out the byte based ranges
+// as we did above, the list would stretch to 892 entries! This turns into
+// quite a large NFA with a few thousand states. Turning this beast into a DFA
+// takes quite a bit of time. We are thus left with trying to trim down the
+// number of states we produce as early as possible.
+//
+// One approach (used by RE2 and still by the regex crate, at time of writing)
+// is to try to find common suffixes while building NFA states for the above
+// and reuse them. This is very cheap to do, and one can control precisely
+// how much extra memory to use for the cache.
+//
+// Another approach, however, is to reuse an algorithm for constructing a
+// *minimal* DFA from a sorted sequence of inputs. I don't want to go into
+// the full details here, but I explain it in more depth in my blog post on
+// FSTs[1]. Note that the algorithm was not invented by me, but was published
+// in a paper by Daciuk et al. in 2000 called "Incremental Construction of
+// Minimal Acyclic Finite-State Automata."[2] Like the suffix cache approach
+// above, it is also possible to control the amount of extra memory one
+// uses, although
+// this usually comes with the cost of sacrificing true minimality. (But it's
+// typically close enough with a reasonably sized cache of states.)
+//
+// The catch is that Daciuk's algorithm only works if you add your keys in
+// lexicographic ascending order. In our case, since we're dealing with ranges,
+// we also need the additional requirement that ranges are either equivalent
+// or do not overlap at all. For example, if one were given the following byte
+// ranges:
+//
+// [BC-BF][80-BF]
+// [BC-BF][90-BF]
+//
+// Then Daciuk's algorithm also would not work, since there is nothing to
+// handle the fact that the ranges overlap. They would need to be split apart.
+// Thankfully, Thompson's algorithm for producing byte ranges for Unicode
+// codepoint ranges meets both of our requirements.
+//
+// ... however, we would also like to be able to compile UTF-8 automata in
+// reverse. We want this because in order to find the starting location of a
+// match using a DFA, we need to run a second DFA---a reversed version of the
+// forward DFA---backwards to discover the match location. Unfortunately, if
+// we reverse our byte sequences for 0x00-0x10FFFF, we get sequences that
+// can overlap, even if they are sorted:
+//
+// [00-7F]
+// [80-BF][80-9F][ED-ED]
+// [80-BF][80-BF][80-8F][F4-F4]
+// [80-BF][80-BF][80-BF][F1-F3]
+// [80-BF][80-BF][90-BF][F0-F0]
+// [80-BF][80-BF][E1-EC]
+// [80-BF][80-BF][EE-EF]
+// [80-BF][A0-BF][E0-E0]
+// [80-BF][C2-DF]
+//
+// For example, '[80-BF][80-BF][EE-EF]' and '[80-BF][A0-BF][E0-E0]' have
+// overlapping ranges between '[80-BF]' and '[A0-BF]'. Thus, there is no
+// simple way to apply Daciuk's algorithm.
+//
+// And thus, the range trie was born. The range trie's only purpose is to take
+// sequences of byte ranges like the ones above, collect them into a trie and
+// then spit them out in a sorted fashion with no overlapping ranges. For
+// example, 0x00-0x10FFFF gets translated to:
+//
+// [00-7F]
+// [80-BF][80-9F][80-8F][F1-F3]
+// [80-BF][80-9F][80-8F][F4]
+// [80-BF][80-9F][90-BF][F0]
+// [80-BF][80-9F][90-BF][F1-F3]
+// [80-BF][80-9F][E1-EC]
+// [80-BF][80-9F][ED]
+// [80-BF][80-9F][EE-EF]
+// [80-BF][A0-BF][80-8F][F1-F3]
+// [80-BF][A0-BF][80-8F][F4]
+// [80-BF][A0-BF][90-BF][F0]
+// [80-BF][A0-BF][90-BF][F1-F3]
+// [80-BF][A0-BF][E0]
+// [80-BF][A0-BF][E1-EC]
+// [80-BF][A0-BF][EE-EF]
+// [80-BF][C2-DF]
+//
+// We've thus satisfied our requirements for running Daciuk's algorithm. All
+// sequences of ranges are sorted, and any corresponding ranges are either
+// exactly equivalent or non-overlapping.
+//
+// In effect, a range trie is building a DFA from a sequence of arbitrary
+// byte ranges. But it uses an algorithm custom tailored to its input, so it
+// is not as costly as traditional DFA construction. While it is still quite
+// a bit more costly than the forward case (which only needs Daciuk's
+// algorithm), it winds up saving a substantial amount of time if one is doing
+// a full DFA powerset construction later by virtue of producing a much much
+// smaller NFA.
+//
+// [1] - https://blog.burntsushi.net/transducers/
+// [2] - https://www.mitpressjournals.org/doi/pdfplus/10.1162/089120100561601
+
+use std::cell::RefCell;
+use std::fmt;
+use std::mem;
+use std::ops::RangeInclusive;
+use std::u32;
+
+use regex_syntax::utf8::Utf8Range;
+
+/// The identifier for a state in a range trie.
+///
+/// A smaller state ID means more effective use of the CPU cache and less
+/// time spent copying. The implementation below will panic if the state ID
+/// space is exhausted, but in order for that to happen, the range trie itself
+/// would use well over 100GB of memory. Moreover, it's likely impossible
+/// for the state ID space to get that big. In fact, it's likely that even a
+/// u16 would be good enough here. But it's not quite clear how to prove this.
+type StateID = u32;
+
+/// There is only one final state in this trie. Every sequence of byte ranges
+/// added shares the same final state.
+const FINAL: StateID = 0;
+
+/// The root state of the trie.
+const ROOT: StateID = 1;
+
+/// A range trie represents an ordered set of sequences of bytes.
+///
+/// A range trie accepts as input a sequence of byte ranges and merges
+/// them into the existing set such that the trie can produce a sorted
+/// non-overlapping sequence of byte ranges. The sequence emitted corresponds
+/// precisely to the sequence of bytes matched by the given keys, although the
+/// byte ranges themselves may be split at different boundaries.
+///
+/// The time complexity of this data structure seems difficult to analyze.
+/// If the alphabet size is held constant, then insertion is clearly
+/// O(n), where n is the number of byte ranges in the input key. However, if
+/// k=256 is our alphabet size, then insertion could be O(k^2 * n). In
+/// particular it seems possible for pathological inputs to cause insertion
+/// to do a lot of work. However, for what we use this data structure for,
+/// there should be no pathological inputs since the ultimate source is always
+/// a sorted set of Unicode scalar value ranges.
+///
+/// Internally, this trie is set up like a finite state machine. Note though
+/// that it is acyclic.
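+//
+// A minimal usage sketch (the ranges are illustrative):
+//
+//     let mut trie = RangeTrie::new();
+//     trie.insert(&[Utf8Range { start: 0x80, end: 0xBF }]);
+//     trie.insert(&[Utf8Range { start: 0xA0, end: 0xBF }]);
+//     // Iterating yields the split, non-overlapping sequences
+//     // [80-9F] and [A0-BF], in lexicographic order.
+//     trie.iter(|ranges| println!("{:?}", ranges));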
+#[derive(Clone)]
+pub struct RangeTrie {
+ /// The states in this trie. The first is always the shared final state.
+ /// The second is always the root state. Otherwise, there is no
+ /// particular order.
+ states: Vec<State>,
+    /// A free-list of states. When a range trie is cleared, all of its
+    /// states are added to this list. Creating a new state reuses states
+    /// from this list before allocating a new one.
+ free: Vec<State>,
+ /// A stack for traversing this trie to yield sequences of byte ranges in
+ /// lexicographic order.
+ iter_stack: RefCell<Vec<NextIter>>,
+    /// A buffer that stores the current sequence during iteration.
+ iter_ranges: RefCell<Vec<Utf8Range>>,
+ /// A stack used for traversing the trie in order to (deeply) duplicate
+ /// a state.
+ dupe_stack: Vec<NextDupe>,
+ /// A stack used for traversing the trie during insertion of a new
+ /// sequence of byte ranges.
+ insert_stack: Vec<NextInsert>,
+}
+
+/// A single state in this trie.
+#[derive(Clone)]
+struct State {
+ /// A sorted sequence of non-overlapping transitions to other states. Each
+ /// transition corresponds to a single range of bytes.
+ transitions: Vec<Transition>,
+}
+
+/// A transition is a single range of bytes. If a particular byte is in this
+/// range, then the corresponding machine may transition to the state pointed
+/// to by `next_id`.
+#[derive(Clone)]
+struct Transition {
+ /// The byte range.
+ range: Utf8Range,
+ /// The next state to transition to.
+ next_id: StateID,
+}
+
+impl RangeTrie {
+ /// Create a new empty range trie.
+ pub fn new() -> RangeTrie {
+ let mut trie = RangeTrie {
+ states: vec![],
+ free: vec![],
+ iter_stack: RefCell::new(vec![]),
+ iter_ranges: RefCell::new(vec![]),
+ dupe_stack: vec![],
+ insert_stack: vec![],
+ };
+ trie.clear();
+ trie
+ }
+
+    /// Clear this range trie such that it is empty. Clearing a range trie
+    /// and reusing it can be beneficial because this may reuse allocations.
+ pub fn clear(&mut self) {
+ self.free.extend(self.states.drain(..));
+ self.add_empty(); // final
+ self.add_empty(); // root
+ }
+
+ /// Iterate over all of the sequences of byte ranges in this trie, and
+ /// call the provided function for each sequence. Iteration occurs in
+ /// lexicographic order.
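+    //
+    // For example, a sketch of collecting every sequence (the closure
+    // receives a borrowed slice, so each sequence is copied out):
+    //
+    //     let mut seqs: Vec<Vec<Utf8Range>> = vec![];
+    //     trie.iter(|ranges| seqs.push(ranges.to_vec()));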
+ pub fn iter<F: FnMut(&[Utf8Range])>(&self, mut f: F) {
+ let mut stack = self.iter_stack.borrow_mut();
+ stack.clear();
+ let mut ranges = self.iter_ranges.borrow_mut();
+ ranges.clear();
+
+ // We do iteration in a way that permits us to use a single buffer
+ // for our keys. We iterate in a depth first fashion, while being
+ // careful to expand our frontier as we move deeper in the trie.
+ stack.push(NextIter { state_id: ROOT, tidx: 0 });
+ while let Some(NextIter { mut state_id, mut tidx }) = stack.pop() {
+ // This could be implemented more simply without an inner loop
+ // here, but at the cost of more stack pushes.
+ loop {
+ let state = self.state(state_id);
+                // If we've visited all transitions in this state, then pop
+                // back to the parent state.
+ if tidx >= state.transitions.len() {
+ ranges.pop();
+ break;
+ }
+
+ let t = &state.transitions[tidx];
+ ranges.push(t.range);
+ if t.next_id == FINAL {
+ f(&ranges);
+ ranges.pop();
+ tidx += 1;
+ } else {
+ // Expand our frontier. Once we come back to this state
+ // via the stack, start in on the next transition.
+ stack.push(NextIter { state_id, tidx: tidx + 1 });
+ // Otherwise, move to the first transition of the next
+ // state.
+ state_id = t.next_id;
+ tidx = 0;
+ }
+ }
+ }
+ }
+
+ /// Inserts a new sequence of ranges into this trie.
+ ///
+ /// The sequence given must be non-empty and must not have a length
+ /// exceeding 4.
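+    //
+    // For example, a sketch inserting the (reversed) byte ranges of a
+    // 2-byte UTF-8 sequence:
+    //
+    //     trie.insert(&[
+    //         Utf8Range { start: 0x80, end: 0xBF },
+    //         Utf8Range { start: 0xC2, end: 0xDF },
+    //     ]);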
+ pub fn insert(&mut self, ranges: &[Utf8Range]) {
+ assert!(!ranges.is_empty());
+ assert!(ranges.len() <= 4);
+
+ let mut stack = mem::replace(&mut self.insert_stack, vec![]);
+ stack.clear();
+
+ stack.push(NextInsert::new(ROOT, ranges));
+ while let Some(next) = stack.pop() {
+ let (state_id, ranges) = (next.state_id(), next.ranges());
+ assert!(!ranges.is_empty());
+
+ let (mut new, rest) = (ranges[0], &ranges[1..]);
+
+ // i corresponds to the position of the existing transition on
+ // which we are operating. Typically, the result is to remove the
+ // transition and replace it with two or more new transitions
+ // corresponding to the partitions generated by splitting the
+ // 'new' with the ith transition's range.
+ let mut i = self.state(state_id).find(new);
+
+ // In this case, there is no overlap *and* the new range is greater
+ // than all existing ranges. So we can just add it to the end.
+ if i == self.state(state_id).transitions.len() {
+ let next_id = NextInsert::push(self, &mut stack, rest);
+ self.add_transition(state_id, new, next_id);
+ continue;
+ }
+
+            // The need for this loop is a bit subtle, but basically, after
+ // we've handled the partitions from our initial split, it's
+ // possible that there will be a partition leftover that overlaps
+ // with a subsequent transition. If so, then we have to repeat
+ // the split process again with the leftovers and that subsequent
+ // transition.
+ 'OUTER: loop {
+ let old = self.state(state_id).transitions[i].clone();
+ let split = match Split::new(old.range, new) {
+ Some(split) => split,
+ None => {
+ let next_id = NextInsert::push(self, &mut stack, rest);
+ self.add_transition_at(i, state_id, new, next_id);
+ continue;
+ }
+ };
+ let splits = split.as_slice();
+ // If we only have one partition, then the ranges must be
+ // equivalent. There's nothing to do here for this state, so
+ // just move on to the next one.
+ if splits.len() == 1 {
+ // ... but only if we have anything left to do.
+ if !rest.is_empty() {
+ stack.push(NextInsert::new(old.next_id, rest));
+ }
+ break;
+ }
+ // At this point, we know that 'split' is non-empty and there
+ // must be some overlap AND that the two ranges are not
+ // equivalent. Therefore, the existing range MUST be removed
+ // and split up somehow. Instead of actually doing the removal
+ // and then a subsequent insertion---with all the memory
+ // shuffling that entails---we simply overwrite the transition
+ // at position `i` for the first new transition we want to
+ // insert. After that, we're forced to do expensive inserts.
+ let mut first = true;
+ let mut add_trans =
+ |trie: &mut RangeTrie, pos, from, range, to| {
+ if first {
+ trie.set_transition_at(pos, from, range, to);
+ first = false;
+ } else {
+ trie.add_transition_at(pos, from, range, to);
+ }
+ };
+ for (j, &srange) in splits.iter().enumerate() {
+ match srange {
+ SplitRange::Old(r) => {
+ // Deep clone the state pointed to by the ith
+ // transition. This is always necessary since 'old'
+ // is always coupled with at least a 'both'
+ // partition. We don't want any new changes made
+ // via the 'both' partition to impact the part of
+ // the transition that doesn't overlap with the
+ // new range.
+ let dup_id = self.duplicate(old.next_id);
+ add_trans(self, i, state_id, r, dup_id);
+ }
+ SplitRange::New(r) => {
+ // This is a bit subtle, but if this happens to be
+ // the last partition in our split, it is possible
+ // that this overlaps with a subsequent transition.
+ // If it does, then we must repeat the whole
+ // splitting process over again with `r` and the
+ // subsequent transition.
+ {
+ let trans = &self.state(state_id).transitions;
+ if j + 1 == splits.len()
+ && i < trans.len()
+ && intersects(r, trans[i].range)
+ {
+ new = r;
+ continue 'OUTER;
+ }
+ }
+
+ // ... otherwise, setup exploration for a new
+ // empty state and add a brand new transition for
+ // this new range.
+ let next_id =
+ NextInsert::push(self, &mut stack, rest);
+ add_trans(self, i, state_id, r, next_id);
+ }
+ SplitRange::Both(r) => {
+ // Continue adding the remaining ranges on this
+ // path and update the transition with the new
+ // range.
+ if !rest.is_empty() {
+ stack.push(NextInsert::new(old.next_id, rest));
+ }
+ add_trans(self, i, state_id, r, old.next_id);
+ }
+ }
+ i += 1;
+ }
+ // If we've reached this point, then we know that there are
+ // no subsequent transitions with any overlap. Therefore, we
+ // can stop processing this range and move on to the next one.
+ break;
+ }
+ }
+ self.insert_stack = stack;
+ }
+
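+    /// Add a new empty state to this trie and return its ID.
+    ///
+    /// This panics if the state ID space (u32) is exhausted.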
+ pub fn add_empty(&mut self) -> StateID {
+ if self.states.len() as u64 > u32::MAX as u64 {
+ // This generally should not happen since a range trie is only
+ // ever used to compile a single sequence of Unicode scalar values.
+ // If we ever got to this point, we would, at *minimum*, be using
+ // 96GB in just the range trie alone.
+ panic!("too many sequences added to range trie");
+ }
+ let id = self.states.len() as StateID;
+ // If we have some free states available, then use them to avoid
+ // more allocations.
+ if let Some(mut state) = self.free.pop() {
+ state.clear();
+ self.states.push(state);
+ } else {
+ self.states.push(State { transitions: vec![] });
+ }
+ id
+ }
+
+ /// Performs a deep clone of the given state and returns the duplicate's
+ /// state ID.
+ ///
+ /// A "deep clone" in this context means that the state given along with
+ /// recursively all states that it points to are copied. Once complete,
+ /// the given state ID and the returned state ID share nothing.
+ ///
+ /// This is useful during range trie insertion when a new range overlaps
+ /// with an existing range that is bigger than the new one. The part of
+ /// the existing range that does *not* overlap with the new one is that
+ /// duplicated so that adding the new range to the overlap doesn't disturb
+ /// the non-overlapping portion.
+ ///
+ /// There's one exception: if old_id is the final state, then it is not
+ /// duplicated and the same final state is returned. This is because all
+ /// final states in this trie are equivalent.
+ fn duplicate(&mut self, old_id: StateID) -> StateID {
+ if old_id == FINAL {
+ return FINAL;
+ }
+
+ let mut stack = mem::replace(&mut self.dupe_stack, vec![]);
+ stack.clear();
+
+ let new_id = self.add_empty();
+ // old_id is the state we're cloning and new_id is the ID of the
+ // duplicated state for old_id.
+ stack.push(NextDupe { old_id, new_id });
+ while let Some(NextDupe { old_id, new_id }) = stack.pop() {
+ for i in 0..self.state(old_id).transitions.len() {
+ let t = self.state(old_id).transitions[i].clone();
+ if t.next_id == FINAL {
+ // All final states are the same, so there's no need to
+ // duplicate it.
+ self.add_transition(new_id, t.range, FINAL);
+ continue;
+ }
+
+ let new_child_id = self.add_empty();
+ self.add_transition(new_id, t.range, new_child_id);
+ stack.push(NextDupe {
+ old_id: t.next_id,
+ new_id: new_child_id,
+ });
+ }
+ }
+ self.dupe_stack = stack;
+ new_id
+ }
+
+ /// Adds the given transition to the given state.
+ ///
+ /// Callers must ensure that all previous transitions in this state
+ /// are lexicographically smaller than the given range.
+ fn add_transition(
+ &mut self,
+ from_id: StateID,
+ range: Utf8Range,
+ next_id: StateID,
+ ) {
+ self.state_mut(from_id)
+ .transitions
+ .push(Transition { range, next_id });
+ }
+
+ /// Like `add_transition`, except this inserts the transition just before
+ /// the ith transition.
+ fn add_transition_at(
+ &mut self,
+ i: usize,
+ from_id: StateID,
+ range: Utf8Range,
+ next_id: StateID,
+ ) {
+ self.state_mut(from_id)
+ .transitions
+ .insert(i, Transition { range, next_id });
+ }
+
+ /// Overwrites the transition at position i with the given transition.
+ fn set_transition_at(
+ &mut self,
+ i: usize,
+ from_id: StateID,
+ range: Utf8Range,
+ next_id: StateID,
+ ) {
+ self.state_mut(from_id).transitions[i] = Transition { range, next_id };
+ }
+
+ /// Return an immutable borrow for the state with the given ID.
+ fn state(&self, id: StateID) -> &State {
+ &self.states[id as usize]
+ }
+
+ /// Return a mutable borrow for the state with the given ID.
+ fn state_mut(&mut self, id: StateID) -> &mut State {
+ &mut self.states[id as usize]
+ }
+}
+
+impl State {
+ /// Find the position at which the given range should be inserted in this
+ /// state.
+ ///
+ /// The position returned is always in the inclusive range
+ /// [0, transitions.len()]. If 'transitions.len()' is returned, then the
+ /// given range overlaps with no other range in this state *and* is greater
+ /// than all of them.
+ ///
+ /// For all other possible positions, the given range either overlaps
+ /// with the transition at that position or is otherwise less than it
+ /// with no overlap (and is greater than the previous transition). In the
+ /// former case, careful attention must be paid to inserting this range
+ /// as a new transition. In the latter case, the range can be inserted as
+ /// a new transition at the given position without disrupting any other
+ /// transitions.
+ fn find(&self, range: Utf8Range) -> usize {
+ /// Returns the position `i` at which `pred(xs[i])` first returns true
+ /// such that for all `j >= i`, `pred(xs[j]) == true`. If `pred` never
+ /// returns true, then `xs.len()` is returned.
+ ///
+ /// We roll our own binary search because it doesn't seem like the
+ /// standard library's binary search can be used here. Namely, if
+ /// there is an overlapping range, then we want to find the first such
+ /// occurrence, but there may be many. Or at least, it's not quite
+ /// clear to me how to do it.
+ fn binary_search<T, F>(xs: &[T], mut pred: F) -> usize
+ where
+ F: FnMut(&T) -> bool,
+ {
+ let (mut left, mut right) = (0, xs.len());
+ while left < right {
+ // Overflow is impossible because xs.len() <= 256.
+ let mid = (left + right) / 2;
+ if pred(&xs[mid]) {
+ right = mid;
+ } else {
+ left = mid + 1;
+ }
+ }
+ left
+ }
+
+ // Benchmarks suggest that binary search is just a bit faster than
+ // straight linear search. Specifically when using the debug tool:
+ //
+ // hyperfine "regex-automata-debug debug -acqr '\w{40} ecurB'"
+ binary_search(&self.transitions, |t| range.start <= t.range.end)
+ }
+
+ /// Clear this state such that it has zero transitions.
+ fn clear(&mut self) {
+ self.transitions.clear();
+ }
+}
+
+/// The next state to process during duplication.
+#[derive(Clone, Debug)]
+struct NextDupe {
+ /// The state we want to duplicate.
+ old_id: StateID,
+ /// The ID of the new state that is a duplicate of old_id.
+ new_id: StateID,
+}
+
+/// The next state (and its corresponding transition) that we want to visit
+/// during iteration in lexicographic order.
+#[derive(Clone, Debug)]
+struct NextIter {
+ state_id: StateID,
+ tidx: usize,
+}
+
+/// The next state to process during insertion and any remaining ranges that
+/// we want to add for a particular sequence of ranges. The first such
+/// instance is always the root state along with all ranges given.
+#[derive(Clone, Debug)]
+struct NextInsert {
+ /// The next state to begin inserting ranges. This state should be the
+ /// state at which `ranges[0]` should be inserted.
+ state_id: StateID,
+    /// The ranges to insert. We use a fixed-size array here to avoid an
+    /// allocation.
+ ranges: [Utf8Range; 4],
+ /// The number of valid ranges in the above array.
+ len: u8,
+}
+
+impl NextInsert {
+ /// Create the next item to visit. The given state ID should correspond
+ /// to the state at which the first range in the given slice should be
+ /// inserted. The slice given must not be empty and it must be no longer
+ /// than 4.
+ fn new(state_id: StateID, ranges: &[Utf8Range]) -> NextInsert {
+ let len = ranges.len();
+ assert!(len > 0);
+ assert!(len <= 4);
+
+ let mut tmp = [Utf8Range { start: 0, end: 0 }; 4];
+ tmp[..len].copy_from_slice(ranges);
+ NextInsert { state_id, ranges: tmp, len: len as u8 }
+ }
+
+ /// Push a new empty state to visit along with any remaining ranges that
+ /// still need to be inserted. The ID of the new empty state is returned.
+ ///
+ /// If ranges is empty, then no new state is created and FINAL is returned.
+ fn push(
+ trie: &mut RangeTrie,
+ stack: &mut Vec<NextInsert>,
+ ranges: &[Utf8Range],
+ ) -> StateID {
+ if ranges.is_empty() {
+ FINAL
+ } else {
+ let next_id = trie.add_empty();
+ stack.push(NextInsert::new(next_id, ranges));
+ next_id
+ }
+ }
+
+ /// Return the ID of the state to visit.
+ fn state_id(&self) -> StateID {
+ self.state_id
+ }
+
+ /// Return the remaining ranges to insert.
+ fn ranges(&self) -> &[Utf8Range] {
+ &self.ranges[..self.len as usize]
+ }
+}
+
+/// Split represents a partitioning of two ranges into one or more ranges. This
+/// is the secret sauce that makes a range trie work, as it's what tells us
+/// how to deal with two overlapping but unequal ranges during insertion.
+///
+/// Essentially, either two ranges overlap or they don't. If they don't, then
+/// handling insertion is easy: just insert the new range into its
+/// lexicographically correct position. Since it does not overlap with anything
+/// else, no other transitions are impacted by the new range.
+///
+/// If they do overlap though, there are generally three possible cases to
+/// handle:
+///
+/// 1. The part where the two ranges actually overlap. i.e., The intersection.
+/// 2. The part of the existing range that is not in the new range.
+/// 3. The part of the new range that is not in the old range.
+///
+/// (1) is guaranteed to always occur since all overlapping ranges have a
+/// non-empty intersection. If the two ranges are not equivalent, then at
+/// least one of (2) or (3) is guaranteed to occur as well. In some cases,
+/// e.g., `[0-4]` and `[4-9]`, all three cases will occur.
+///
+/// This `Split` type is responsible for providing (1), (2) and (3) for any
+/// possible pair of byte ranges.
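+///
+/// As a concrete example of the case analysis below: splitting the old range
+/// [0-4] with the new range [4-9] (case 10 below) yields the three partitions
+/// Old([0-3]), Both([4-4]) and New([5-9]).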
+///
+/// As for insertion, for the overlap in (1), the remaining ranges to insert
+/// should be added by following the corresponding transition. However, this
+/// should only be done for the overlapping parts of the range. If there was
+/// a part of the existing range that was not in the new range, then that
+/// existing part must be split off from the transition and duplicated. The
+/// overlapping part of the transition can then be extended with the new
+/// ranges without disturbing the non-overlapping part of the existing range.
+///
+/// Handling the case for the part of a new range that is not in an existing
+/// range is seemingly easy. Just treat it as if it were a non-overlapping
+/// range. The problem here is that if this new non-overlapping range occurs
+/// after both (1) and (2), then it's possible that it can overlap with the
+/// next transition in the current state. If it does, then the whole process
+/// must be repeated!
+///
+/// # Details of the 3 cases
+///
+/// The following details the various cases that are implemented in code
+/// below. It's plausible that the number of cases is not actually minimal,
+/// but it's important for this code to remain at least somewhat readable.
+///
+/// Given [a,b] and [x,y], where a <= b, x <= y, b < 256 and y < 256, we define
+/// the following distinct relationships, at least one of which must apply.
+/// The order of these matters, since multiple can match. The first to match
+/// applies.
+///
+/// 1. b < x <=> [a,b] < [x,y]
+/// 2. y < a <=> [x,y] < [a,b]
+///
+/// Cases (1) and (2) are the only cases where there is no overlap, i.e.,
+/// where the intersection of [a,b] and [x,y] is empty. The intersection can
+/// be computed as [max(a,x), min(b,y)]. The intersection in all of the
+/// following cases is non-empty.
+///
+/// 3. a = x && b = y <=> [a,b] == [x,y]
+/// 4. a = x && b < y <=> [x,y] right-extends [a,b]
+/// 5. b = y && a > x <=> [x,y] left-extends [a,b]
+/// 6. x = a && y < b <=> [a,b] right-extends [x,y]
+/// 7. y = b && x > a <=> [a,b] left-extends [x,y]
+/// 8. a > x && b < y <=> [x,y] covers [a,b]
+/// 9. x > a && y < b <=> [a,b] covers [x,y]
+/// 10. b = x && a < y <=> [a,b] is left-adjacent to [x,y]
+/// 11. y = a && x < b <=> [x,y] is left-adjacent to [a,b]
+/// 12. b > x && b < y <=> [a,b] left-overlaps [x,y]
+/// 13. y > a && y < b <=> [x,y] left-overlaps [a,b]
+///
+/// In cases 3-13, we can form rules that partition the ranges into a
+/// non-overlapping ordered sequence of ranges:
+///
+/// 3. [a,b]
+/// 4. [a,b], [b+1,y]
+/// 5. [x,a-1], [a,b]
+/// 6. [x,y], [y+1,b]
+/// 7. [a,x-1], [x,y]
+/// 8. [x,a-1], [a,b], [b+1,y]
+/// 9. [a,x-1], [x,y], [y+1,b]
+/// 10. [a,b-1], [b,b], [b+1,y]
+/// 11. [x,y-1], [y,y], [y+1,b]
+/// 12. [a,x-1], [x,b], [b+1,y]
+/// 13. [x,a-1], [a,y], [y+1,b]
+///
+/// In the code below, we go a step further and identify each of the above
+/// outputs as belonging either to the overlap of the two ranges or to one
+/// of [a,b] or [x,y] exclusively.
+#[derive(Clone, Debug, Eq, PartialEq)]
+struct Split {
+ partitions: [SplitRange; 3],
+ len: usize,
+}
+
+/// A tagged range indicating how it was derived from a pair of ranges.
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+enum SplitRange {
+ Old(Utf8Range),
+ New(Utf8Range),
+ Both(Utf8Range),
+}
+
+impl Split {
+ /// Create a partitioning of the given ranges.
+ ///
+ /// If the given ranges have an empty intersection, then None is returned.
+ fn new(o: Utf8Range, n: Utf8Range) -> Option<Split> {
+ let range = |r: RangeInclusive<u8>| Utf8Range {
+ start: *r.start(),
+ end: *r.end(),
+ };
+ let old = |r| SplitRange::Old(range(r));
+ let new = |r| SplitRange::New(range(r));
+ let both = |r| SplitRange::Both(range(r));
+
+ // Use same names as the comment above to make it easier to compare.
+ let (a, b, x, y) = (o.start, o.end, n.start, n.end);
+
+ if b < x || y < a {
+ // case 1, case 2
+ None
+ } else if a == x && b == y {
+ // case 3
+ Some(Split::parts1(both(a..=b)))
+ } else if a == x && b < y {
+ // case 4
+ Some(Split::parts2(both(a..=b), new(b + 1..=y)))
+ } else if b == y && a > x {
+ // case 5
+ Some(Split::parts2(new(x..=a - 1), both(a..=b)))
+ } else if x == a && y < b {
+ // case 6
+ Some(Split::parts2(both(x..=y), old(y + 1..=b)))
+ } else if y == b && x > a {
+ // case 7
+ Some(Split::parts2(old(a..=x - 1), both(x..=y)))
+ } else if a > x && b < y {
+ // case 8
+ Some(Split::parts3(new(x..=a - 1), both(a..=b), new(b + 1..=y)))
+ } else if x > a && y < b {
+ // case 9
+ Some(Split::parts3(old(a..=x - 1), both(x..=y), old(y + 1..=b)))
+ } else if b == x && a < y {
+ // case 10
+ Some(Split::parts3(old(a..=b - 1), both(b..=b), new(b + 1..=y)))
+ } else if y == a && x < b {
+ // case 11
+ Some(Split::parts3(new(x..=y - 1), both(y..=y), old(y + 1..=b)))
+ } else if b > x && b < y {
+ // case 12
+ Some(Split::parts3(old(a..=x - 1), both(x..=b), new(b + 1..=y)))
+ } else if y > a && y < b {
+ // case 13
+ Some(Split::parts3(new(x..=a - 1), both(a..=y), old(y + 1..=b)))
+ } else {
+ unreachable!()
+ }
+ }
+
+ /// Create a new split with a single partition. This only occurs when two
+ /// ranges are equivalent.
+ fn parts1(r1: SplitRange) -> Split {
+ // This value doesn't matter since it is never accessed.
+ let nada = SplitRange::Old(Utf8Range { start: 0, end: 0 });
+ Split { partitions: [r1, nada, nada], len: 1 }
+ }
+
+ /// Create a new split with two partitions.
+ fn parts2(r1: SplitRange, r2: SplitRange) -> Split {
+ // This value doesn't matter since it is never accessed.
+ let nada = SplitRange::Old(Utf8Range { start: 0, end: 0 });
+ Split { partitions: [r1, r2, nada], len: 2 }
+ }
+
+ /// Create a new split with three partitions.
+ fn parts3(r1: SplitRange, r2: SplitRange, r3: SplitRange) -> Split {
+ Split { partitions: [r1, r2, r3], len: 3 }
+ }
+
+ /// Return the partitions in this split as a slice.
+ fn as_slice(&self) -> &[SplitRange] {
+ &self.partitions[..self.len]
+ }
+}
+
+impl fmt::Debug for RangeTrie {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ writeln!(f, "")?;
+ for (i, state) in self.states.iter().enumerate() {
+ let status = if i == FINAL as usize { '*' } else { ' ' };
+ writeln!(f, "{}{:06}: {:?}", status, i, state)?;
+ }
+ Ok(())
+ }
+}
+
+impl fmt::Debug for State {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ let rs = self
+ .transitions
+ .iter()
+ .map(|t| format!("{:?}", t))
+ .collect::<Vec<String>>()
+ .join(", ");
+ write!(f, "{}", rs)
+ }
+}
+
+impl fmt::Debug for Transition {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ if self.range.start == self.range.end {
+ write!(f, "{:02X} => {:02X}", self.range.start, self.next_id)
+ } else {
+ write!(
+ f,
+ "{:02X}-{:02X} => {:02X}",
+ self.range.start, self.range.end, self.next_id
+ )
+ }
+ }
+}
+
+/// Returns true if and only if the given ranges intersect.
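+///
+/// For example, [0-4] and [4-9] intersect (they share the byte 4), while
+/// [0-3] and [4-9] do not.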
+fn intersects(r1: Utf8Range, r2: Utf8Range) -> bool {
+ !(r1.end < r2.start || r2.end < r1.start)
+}
+
+#[cfg(test)]
+mod tests {
+ use std::ops::RangeInclusive;
+
+ use regex_syntax::utf8::Utf8Range;
+
+ use super::*;
+
+ fn r(range: RangeInclusive<u8>) -> Utf8Range {
+ Utf8Range { start: *range.start(), end: *range.end() }
+ }
+
+ fn split_maybe(
+ old: RangeInclusive<u8>,
+ new: RangeInclusive<u8>,
+ ) -> Option<Split> {
+ Split::new(r(old), r(new))
+ }
+
+ fn split(
+ old: RangeInclusive<u8>,
+ new: RangeInclusive<u8>,
+ ) -> Vec<SplitRange> {
+ split_maybe(old, new).unwrap().as_slice().to_vec()
+ }
+
+ #[test]
+ fn no_splits() {
+ // case 1
+ assert_eq!(None, split_maybe(0..=1, 2..=3));
+ // case 2
+ assert_eq!(None, split_maybe(2..=3, 0..=1));
+ }
+
+ #[test]
+ fn splits() {
+ let range = |r: RangeInclusive<u8>| Utf8Range {
+ start: *r.start(),
+ end: *r.end(),
+ };
+ let old = |r| SplitRange::Old(range(r));
+ let new = |r| SplitRange::New(range(r));
+ let both = |r| SplitRange::Both(range(r));
+
+ // case 3
+ assert_eq!(split(0..=0, 0..=0), vec![both(0..=0)]);
+ assert_eq!(split(9..=9, 9..=9), vec![both(9..=9)]);
+
+ // case 4
+ assert_eq!(split(0..=5, 0..=6), vec![both(0..=5), new(6..=6)]);
+ assert_eq!(split(0..=5, 0..=8), vec![both(0..=5), new(6..=8)]);
+ assert_eq!(split(5..=5, 5..=8), vec![both(5..=5), new(6..=8)]);
+
+ // case 5
+ assert_eq!(split(1..=5, 0..=5), vec![new(0..=0), both(1..=5)]);
+ assert_eq!(split(3..=5, 0..=5), vec![new(0..=2), both(3..=5)]);
+ assert_eq!(split(5..=5, 0..=5), vec![new(0..=4), both(5..=5)]);
+
+ // case 6
+ assert_eq!(split(0..=6, 0..=5), vec![both(0..=5), old(6..=6)]);
+ assert_eq!(split(0..=8, 0..=5), vec![both(0..=5), old(6..=8)]);
+ assert_eq!(split(5..=8, 5..=5), vec![both(5..=5), old(6..=8)]);
+
+ // case 7
+ assert_eq!(split(0..=5, 1..=5), vec![old(0..=0), both(1..=5)]);
+ assert_eq!(split(0..=5, 3..=5), vec![old(0..=2), both(3..=5)]);
+ assert_eq!(split(0..=5, 5..=5), vec![old(0..=4), both(5..=5)]);
+
+ // case 8
+ assert_eq!(
+ split(3..=6, 2..=7),
+ vec![new(2..=2), both(3..=6), new(7..=7)],
+ );
+ assert_eq!(
+ split(3..=6, 1..=8),
+ vec![new(1..=2), both(3..=6), new(7..=8)],
+ );
+
+ // case 9
+ assert_eq!(
+ split(2..=7, 3..=6),
+ vec![old(2..=2), both(3..=6), old(7..=7)],
+ );
+ assert_eq!(
+ split(1..=8, 3..=6),
+ vec![old(1..=2), both(3..=6), old(7..=8)],
+ );
+
+ // case 10
+ assert_eq!(
+ split(3..=6, 6..=7),
+ vec![old(3..=5), both(6..=6), new(7..=7)],
+ );
+ assert_eq!(
+ split(3..=6, 6..=8),
+ vec![old(3..=5), both(6..=6), new(7..=8)],
+ );
+ assert_eq!(
+ split(5..=6, 6..=7),
+ vec![old(5..=5), both(6..=6), new(7..=7)],
+ );
+
+ // case 11
+ assert_eq!(
+ split(6..=7, 3..=6),
+ vec![new(3..=5), both(6..=6), old(7..=7)],
+ );
+ assert_eq!(
+ split(6..=8, 3..=6),
+ vec![new(3..=5), both(6..=6), old(7..=8)],
+ );
+ assert_eq!(
+ split(6..=7, 5..=6),
+ vec![new(5..=5), both(6..=6), old(7..=7)],
+ );
+
+ // case 12
+ assert_eq!(
+ split(3..=7, 5..=9),
+ vec![old(3..=4), both(5..=7), new(8..=9)],
+ );
+ assert_eq!(
+ split(3..=5, 4..=6),
+ vec![old(3..=3), both(4..=5), new(6..=6)],
+ );
+
+ // case 13
+ assert_eq!(
+ split(5..=9, 3..=7),
+ vec![new(3..=4), both(5..=7), old(8..=9)],
+ );
+ assert_eq!(
+ split(4..=6, 3..=5),
+ vec![new(3..=3), both(4..=5), old(6..=6)],
+ );
+ }
+
+ // Arguably there should be more tests here, but in practice, this data
+ // structure is well covered by the huge number of regex tests.
+}