use state_id::StateID; /// A trait describing the interface of a deterministic finite automaton (DFA). /// /// Every DFA has exactly one start state and at least one dead state (which /// may be the same, as in the case of an empty DFA). In all cases, a state /// identifier of `0` must be a dead state such that `DFA::is_dead_state(0)` /// always returns `true`. /// /// Every DFA also has zero or more match states, such that /// `DFA::is_match_state(id)` returns `true` if and only if `id` corresponds to /// a match state. /// /// In general, users of this trait likely will only need to use the search /// routines such as `is_match`, `shortest_match`, `find` or `rfind`. The other /// methods are lower level and are used for walking the transitions of a DFA /// manually. In particular, the aforementioned search routines are implemented /// generically in terms of the lower level transition walking routines. pub trait DFA { /// The representation used for state identifiers in this DFA. /// /// Typically, this is one of `u8`, `u16`, `u32`, `u64` or `usize`. type ID: StateID; /// Return the identifier of this DFA's start state. fn start_state(&self) -> Self::ID; /// Returns true if and only if the given identifier corresponds to a match /// state. fn is_match_state(&self, id: Self::ID) -> bool; /// Returns true if and only if the given identifier corresponds to a dead /// state. When a DFA enters a dead state, it is impossible to leave and /// thus can never lead to a match. fn is_dead_state(&self, id: Self::ID) -> bool; /// Returns true if and only if the given identifier corresponds to either /// a dead state or a match state, such that one of `is_match_state(id)` /// or `is_dead_state(id)` must return true. /// /// Depending on the implementation of the DFA, this routine can be used /// to save a branch in the core matching loop. Nevertheless, /// `is_match_state(id) || is_dead_state(id)` is always a valid /// implementation. fn is_match_or_dead_state(&self, id: Self::ID) -> bool; /// Returns true if and only if this DFA is anchored. /// /// When a DFA is anchored, it is only allowed to report matches that /// start at index `0`. fn is_anchored(&self) -> bool; /// Given the current state that this DFA is in and the next input byte, /// this method returns the identifier of the next state. The identifier /// returned is always valid, but it may correspond to a dead state. fn next_state(&self, current: Self::ID, input: u8) -> Self::ID; /// Like `next_state`, but its implementation may look up the next state /// without memory safety checks such as bounds checks. As such, callers /// must ensure that the given identifier corresponds to a valid DFA /// state. Implementors must, in turn, ensure that this routine is safe /// for all valid state identifiers and for all possible `u8` values. unsafe fn next_state_unchecked( &self, current: Self::ID, input: u8, ) -> Self::ID; /// Returns true if and only if the given bytes match this DFA. /// /// This routine may short circuit if it knows that scanning future input /// will never lead to a different result. In particular, if a DFA enters /// a match state or a dead state, then this routine will return `true` or /// `false`, respectively, without inspecting any future input. /// /// # Example /// /// This example shows how to use this method with a /// [`DenseDFA`](enum.DenseDFA.html). /// /// ``` /// use regex_automata::{DFA, DenseDFA}; /// /// # fn example() -> Result<(), regex_automata::Error> { /// let dfa = DenseDFA::new("foo[0-9]+bar")?; /// assert_eq!(true, dfa.is_match(b"foo12345bar")); /// assert_eq!(false, dfa.is_match(b"foobar")); /// # Ok(()) }; example().unwrap() /// ``` #[inline] fn is_match(&self, bytes: &[u8]) -> bool { self.is_match_at(bytes, 0) } /// Returns the first position at which a match is found. /// /// This routine stops scanning input in precisely the same circumstances /// as `is_match`. The key difference is that this routine returns the /// position at which it stopped scanning input if and only if a match /// was found. If no match is found, then `None` is returned. /// /// # Example /// /// This example shows how to use this method with a /// [`DenseDFA`](enum.DenseDFA.html). /// /// ``` /// use regex_automata::{DFA, DenseDFA}; /// /// # fn example() -> Result<(), regex_automata::Error> { /// let dfa = DenseDFA::new("foo[0-9]+")?; /// assert_eq!(Some(4), dfa.shortest_match(b"foo12345")); /// /// // Normally, the end of the leftmost first match here would be 3, /// // but the shortest match semantics detect a match earlier. /// let dfa = DenseDFA::new("abc|a")?; /// assert_eq!(Some(1), dfa.shortest_match(b"abc")); /// # Ok(()) }; example().unwrap() /// ``` #[inline] fn shortest_match(&self, bytes: &[u8]) -> Option { self.shortest_match_at(bytes, 0) } /// Returns the end offset of the longest match. If no match exists, /// then `None` is returned. /// /// Implementors of this trait are not required to implement any particular /// match semantics (such as leftmost-first), which are instead manifest in /// the DFA's topology itself. /// /// In particular, this method must continue searching even after it /// enters a match state. The search should only terminate once it has /// reached the end of the input or when it has entered a dead state. Upon /// termination, the position of the last byte seen while still in a match /// state is returned. /// /// # Example /// /// This example shows how to use this method with a /// [`DenseDFA`](enum.DenseDFA.html). By default, a dense DFA uses /// "leftmost first" match semantics. /// /// Leftmost first match semantics corresponds to the match with the /// smallest starting offset, but where the end offset is determined by /// preferring earlier branches in the original regular expression. For /// example, `Sam|Samwise` will match `Sam` in `Samwise`, but `Samwise|Sam` /// will match `Samwise` in `Samwise`. /// /// Generally speaking, the "leftmost first" match is how most backtracking /// regular expressions tend to work. This is in contrast to POSIX-style /// regular expressions that yield "leftmost longest" matches. Namely, /// both `Sam|Samwise` and `Samwise|Sam` match `Samwise` when using /// leftmost longest semantics. /// /// ``` /// use regex_automata::{DFA, DenseDFA}; /// /// # fn example() -> Result<(), regex_automata::Error> { /// let dfa = DenseDFA::new("foo[0-9]+")?; /// assert_eq!(Some(8), dfa.find(b"foo12345")); /// /// // Even though a match is found after reading the first byte (`a`), /// // the leftmost first match semantics demand that we find the earliest /// // match that prefers earlier parts of the pattern over latter parts. /// let dfa = DenseDFA::new("abc|a")?; /// assert_eq!(Some(3), dfa.find(b"abc")); /// # Ok(()) }; example().unwrap() /// ``` #[inline] fn find(&self, bytes: &[u8]) -> Option { self.find_at(bytes, 0) } /// Returns the start offset of the longest match in reverse, by searching /// from the end of the input towards the start of the input. If no match /// exists, then `None` is returned. In other words, this has the same /// match semantics as `find`, but in reverse. /// /// # Example /// /// This example shows how to use this method with a /// [`DenseDFA`](enum.DenseDFA.html). In particular, this routine /// is principally useful when used in conjunction with the /// [`dense::Builder::reverse`](dense/struct.Builder.html#method.reverse) /// configuration knob. In general, it's unlikely to be correct to use both /// `find` and `rfind` with the same DFA since any particular DFA will only /// support searching in one direction. /// /// ``` /// use regex_automata::{dense, DFA}; /// /// # fn example() -> Result<(), regex_automata::Error> { /// let dfa = dense::Builder::new().reverse(true).build("foo[0-9]+")?; /// assert_eq!(Some(0), dfa.rfind(b"foo12345")); /// # Ok(()) }; example().unwrap() /// ``` #[inline] fn rfind(&self, bytes: &[u8]) -> Option { self.rfind_at(bytes, bytes.len()) } /// Returns the same as `is_match`, but starts the search at the given /// offset. /// /// The significance of the starting point is that it takes the surrounding /// context into consideration. For example, if the DFA is anchored, then /// a match can only occur when `start == 0`. #[inline] fn is_match_at(&self, bytes: &[u8], start: usize) -> bool { if self.is_anchored() && start > 0 { return false; } let mut state = self.start_state(); if self.is_match_or_dead_state(state) { return self.is_match_state(state); } for &b in bytes[start..].iter() { state = unsafe { self.next_state_unchecked(state, b) }; if self.is_match_or_dead_state(state) { return self.is_match_state(state); } } false } /// Returns the same as `shortest_match`, but starts the search at the /// given offset. /// /// The significance of the starting point is that it takes the surrounding /// context into consideration. For example, if the DFA is anchored, then /// a match can only occur when `start == 0`. #[inline] fn shortest_match_at(&self, bytes: &[u8], start: usize) -> Option { if self.is_anchored() && start > 0 { return None; } let mut state = self.start_state(); if self.is_match_or_dead_state(state) { return if self.is_dead_state(state) { None } else { Some(start) }; } for (i, &b) in bytes[start..].iter().enumerate() { state = unsafe { self.next_state_unchecked(state, b) }; if self.is_match_or_dead_state(state) { return if self.is_dead_state(state) { None } else { Some(start + i + 1) }; } } None } /// Returns the same as `find`, but starts the search at the given /// offset. /// /// The significance of the starting point is that it takes the surrounding /// context into consideration. For example, if the DFA is anchored, then /// a match can only occur when `start == 0`. #[inline] fn find_at(&self, bytes: &[u8], start: usize) -> Option { if self.is_anchored() && start > 0 { return None; } let mut state = self.start_state(); let mut last_match = if self.is_dead_state(state) { return None; } else if self.is_match_state(state) { Some(start) } else { None }; for (i, &b) in bytes[start..].iter().enumerate() { state = unsafe { self.next_state_unchecked(state, b) }; if self.is_match_or_dead_state(state) { if self.is_dead_state(state) { return last_match; } last_match = Some(start + i + 1); } } last_match } /// Returns the same as `rfind`, but starts the search at the given /// offset. /// /// The significance of the starting point is that it takes the surrounding /// context into consideration. For example, if the DFA is anchored, then /// a match can only occur when `start == bytes.len()`. #[inline(never)] fn rfind_at(&self, bytes: &[u8], start: usize) -> Option { if self.is_anchored() && start < bytes.len() { return None; } let mut state = self.start_state(); let mut last_match = if self.is_dead_state(state) { return None; } else if self.is_match_state(state) { Some(start) } else { None }; for (i, &b) in bytes[..start].iter().enumerate().rev() { state = unsafe { self.next_state_unchecked(state, b) }; if self.is_match_or_dead_state(state) { if self.is_dead_state(state) { return last_match; } last_match = Some(i); } } last_match } } impl<'a, T: DFA> DFA for &'a T { type ID = T::ID; #[inline] fn start_state(&self) -> Self::ID { (**self).start_state() } #[inline] fn is_match_state(&self, id: Self::ID) -> bool { (**self).is_match_state(id) } #[inline] fn is_match_or_dead_state(&self, id: Self::ID) -> bool { (**self).is_match_or_dead_state(id) } #[inline] fn is_dead_state(&self, id: Self::ID) -> bool { (**self).is_dead_state(id) } #[inline] fn is_anchored(&self) -> bool { (**self).is_anchored() } #[inline] fn next_state(&self, current: Self::ID, input: u8) -> Self::ID { (**self).next_state(current, input) } #[inline] unsafe fn next_state_unchecked( &self, current: Self::ID, input: u8, ) -> Self::ID { (**self).next_state_unchecked(current, input) } }