summaryrefslogtreecommitdiffstats
path: root/vendor/regex-automata/src/nfa/mod.rs
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/regex-automata/src/nfa/mod.rs')
-rw-r--r--vendor/regex-automata/src/nfa/mod.rs252
1 files changed, 252 insertions, 0 deletions
diff --git a/vendor/regex-automata/src/nfa/mod.rs b/vendor/regex-automata/src/nfa/mod.rs
new file mode 100644
index 000000000..02d0501de
--- /dev/null
+++ b/vendor/regex-automata/src/nfa/mod.rs
@@ -0,0 +1,252 @@
+use std::fmt;
+
+use classes::ByteClasses;
+pub use nfa::compiler::Builder;
+
+mod compiler;
+mod map;
+mod range_trie;
+
+/// The representation for an NFA state identifier.
+pub type StateID = usize;
+
+/// A final compiled NFA.
+///
+/// The states of the NFA are indexed by state IDs, which are how transitions
+/// are expressed.
+#[derive(Clone)]
+pub struct NFA {
+ /// Whether this NFA can only match at the beginning of input or not.
+ ///
+ /// When true, a match should only be reported if it begins at the 0th
+ /// index of the haystack.
+ anchored: bool,
+ /// The starting state of this NFA.
+ start: StateID,
+ /// The state list. This list is guaranteed to be indexable by the starting
+ /// state ID, and it is also guaranteed to contain exactly one `Match`
+ /// state.
+ states: Vec<State>,
+ /// A mapping from any byte value to its corresponding equivalence class
+ /// identifier. Two bytes in the same equivalence class cannot discriminate
+ /// between a match or a non-match. This map can be used to shrink the
+ /// total size of a DFA's transition table with a small match-time cost.
+ ///
+ /// Note that the NFA's transitions are *not* defined in terms of these
+ /// equivalence classes. The NFA's transitions are defined on the original
+ /// byte values. For the most part, this is because they wouldn't really
+ /// help the NFA much since the NFA already uses a sparse representation
+ /// to represent transitions. Byte classes are most effective in a dense
+ /// representation.
+ byte_classes: ByteClasses,
+}
+
+impl NFA {
+ /// Returns an NFA that always matches at every position.
+ pub fn always_match() -> NFA {
+ NFA {
+ anchored: false,
+ start: 0,
+ states: vec![State::Match],
+ byte_classes: ByteClasses::empty(),
+ }
+ }
+
+ /// Returns an NFA that never matches at any position.
+ pub fn never_match() -> NFA {
+ NFA {
+ anchored: false,
+ start: 0,
+ states: vec![State::Fail],
+ byte_classes: ByteClasses::empty(),
+ }
+ }
+
+ /// Returns true if and only if this NFA is anchored.
+ pub fn is_anchored(&self) -> bool {
+ self.anchored
+ }
+
+ /// Return the number of states in this NFA.
+ pub fn len(&self) -> usize {
+ self.states.len()
+ }
+
+ /// Return the ID of the initial state of this NFA.
+ pub fn start(&self) -> StateID {
+ self.start
+ }
+
+ /// Return the NFA state corresponding to the given ID.
+ pub fn state(&self, id: StateID) -> &State {
+ &self.states[id]
+ }
+
+ /// Return the set of equivalence classes for this NFA. The slice returned
+ /// always has length 256 and maps each possible byte value to its
+ /// corresponding equivalence class ID (which is never more than 255).
+ pub fn byte_classes(&self) -> &ByteClasses {
+ &self.byte_classes
+ }
+}
+
+impl fmt::Debug for NFA {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ for (i, state) in self.states.iter().enumerate() {
+ let status = if i == self.start { '>' } else { ' ' };
+ writeln!(f, "{}{:06}: {:?}", status, i, state)?;
+ }
+ Ok(())
+ }
+}
+
+/// A state in a final compiled NFA.
+#[derive(Clone, Eq, PartialEq)]
+pub enum State {
+ /// A state that transitions to `next` if and only if the current input
+ /// byte is in the range `[start, end]` (inclusive).
+ ///
+ /// This is a special case of Sparse in that it encodes only one transition
+ /// (and therefore avoids the allocation).
+ Range { range: Transition },
+ /// A state with possibly many transitions, represented in a sparse
+ /// fashion. Transitions are ordered lexicographically by input range.
+ /// As such, this may only be used when every transition has equal
+ /// priority. (In practice, this is only used for encoding large UTF-8
+ /// automata.)
+ Sparse { ranges: Box<[Transition]> },
+ /// An alternation such that there exists an epsilon transition to all
+ /// states in `alternates`, where matches found via earlier transitions
+ /// are preferred over later transitions.
+ Union { alternates: Box<[StateID]> },
+ /// A fail state. When encountered, the automaton is guaranteed to never
+ /// reach a match state.
+ Fail,
+ /// A match state. There is exactly one such occurrence of this state in
+ /// an NFA.
+ Match,
+}
+
+/// A transition to another state, only if the given byte falls in the
+/// inclusive range specified.
+#[derive(Clone, Copy, Eq, Hash, PartialEq)]
+pub struct Transition {
+ pub start: u8,
+ pub end: u8,
+ pub next: StateID,
+}
+
+impl State {
+ /// Returns true if and only if this state contains one or more epsilon
+ /// transitions.
+ pub fn is_epsilon(&self) -> bool {
+ match *self {
+ State::Range { .. }
+ | State::Sparse { .. }
+ | State::Fail
+ | State::Match => false,
+ State::Union { .. } => true,
+ }
+ }
+
+ /// Remap the transitions in this state using the given map. Namely, the
+ /// given map should be indexed according to the transitions currently
+ /// in this state.
+ ///
+ /// This is used during the final phase of the NFA compiler, which turns
+ /// its intermediate NFA into the final NFA.
+ fn remap(&mut self, remap: &[StateID]) {
+ match *self {
+ State::Range { ref mut range } => range.next = remap[range.next],
+ State::Sparse { ref mut ranges } => {
+ for r in ranges.iter_mut() {
+ r.next = remap[r.next];
+ }
+ }
+ State::Union { ref mut alternates } => {
+ for alt in alternates.iter_mut() {
+ *alt = remap[*alt];
+ }
+ }
+ State::Fail => {}
+ State::Match => {}
+ }
+ }
+}
+
+impl fmt::Debug for State {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ match *self {
+ State::Range { ref range } => range.fmt(f),
+ State::Sparse { ref ranges } => {
+ let rs = ranges
+ .iter()
+ .map(|t| format!("{:?}", t))
+ .collect::<Vec<String>>()
+ .join(", ");
+ write!(f, "sparse({})", rs)
+ }
+ State::Union { ref alternates } => {
+ let alts = alternates
+ .iter()
+ .map(|id| format!("{}", id))
+ .collect::<Vec<String>>()
+ .join(", ");
+ write!(f, "alt({})", alts)
+ }
+ State::Fail => write!(f, "FAIL"),
+ State::Match => write!(f, "MATCH"),
+ }
+ }
+}
+
+impl fmt::Debug for Transition {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ let Transition { start, end, next } = *self;
+ if self.start == self.end {
+ write!(f, "{} => {}", escape(start), next)
+ } else {
+ write!(f, "{}-{} => {}", escape(start), escape(end), next)
+ }
+ }
+}
+
+/// Return the given byte as its escaped string form.
+fn escape(b: u8) -> String {
+ use std::ascii;
+
+ String::from_utf8(ascii::escape_default(b).collect::<Vec<_>>()).unwrap()
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+ use dense;
+ use dfa::DFA;
+
+ #[test]
+ fn always_match() {
+ let nfa = NFA::always_match();
+ let dfa = dense::Builder::new().build_from_nfa::<usize>(&nfa).unwrap();
+
+ assert_eq!(Some(0), dfa.find_at(b"", 0));
+ assert_eq!(Some(0), dfa.find_at(b"a", 0));
+ assert_eq!(Some(1), dfa.find_at(b"a", 1));
+ assert_eq!(Some(0), dfa.find_at(b"ab", 0));
+ assert_eq!(Some(1), dfa.find_at(b"ab", 1));
+ assert_eq!(Some(2), dfa.find_at(b"ab", 2));
+ }
+
+ #[test]
+ fn never_match() {
+ let nfa = NFA::never_match();
+ let dfa = dense::Builder::new().build_from_nfa::<usize>(&nfa).unwrap();
+
+ assert_eq!(None, dfa.find_at(b"", 0));
+ assert_eq!(None, dfa.find_at(b"a", 0));
+ assert_eq!(None, dfa.find_at(b"a", 1));
+ assert_eq!(None, dfa.find_at(b"ab", 0));
+ assert_eq!(None, dfa.find_at(b"ab", 1));
+ assert_eq!(None, dfa.find_at(b"ab", 2));
+ }
+}