diff options
Diffstat (limited to 'vendor/regex-automata/src/nfa/thompson/mod.rs')
-rw-r--r-- | vendor/regex-automata/src/nfa/thompson/mod.rs | 1624 |
1 files changed, 75 insertions, 1549 deletions
diff --git a/vendor/regex-automata/src/nfa/thompson/mod.rs b/vendor/regex-automata/src/nfa/thompson/mod.rs index 88a438e8e..cf426736d 100644 --- a/vendor/regex-automata/src/nfa/thompson/mod.rs +++ b/vendor/regex-automata/src/nfa/thompson/mod.rs @@ -1,1555 +1,81 @@ -use core::{convert::TryFrom, fmt, mem, ops::Range}; - -use alloc::{boxed::Box, format, string::String, sync::Arc, vec, vec::Vec}; - -use crate::util::{ - alphabet::{self, ByteClassSet}, - decode_last_utf8, decode_utf8, - id::{IteratorIDExt, PatternID, PatternIDIter, StateID}, - is_word_byte, is_word_char_fwd, is_word_char_rev, -}; - -pub use self::{ - compiler::{Builder, Config}, - error::Error, -}; - +/*! +Defines a Thompson NFA and provides the [`PikeVM`](pikevm::PikeVM) and +[`BoundedBacktracker`](backtrack::BoundedBacktracker) regex engines. + +A Thompson NFA (non-deterministic finite automaton) is arguably _the_ central +data type in this library. It is the result of what is commonly referred to as +"regex compilation." That is, turning a regex pattern from its concrete syntax +string into something that can run a search looks roughly like this: + +* A `&str` is parsed into a [`regex-syntax::ast::Ast`](regex_syntax::ast::Ast). +* An `Ast` is translated into a [`regex-syntax::hir::Hir`](regex_syntax::hir::Hir). +* An `Hir` is compiled into a [`NFA`]. +* The `NFA` is then used to build one of a few different regex engines: + * An `NFA` is used directly in the `PikeVM` and `BoundedBacktracker` engines. + * An `NFA` is used by a [hybrid NFA/DFA](crate::hybrid) to build out a DFA's + transition table at search time. + * An `NFA`, assuming it is one-pass, is used to build a full + [one-pass DFA](crate::dfa::onepass) ahead of time. + * An `NFA` is used to build a [full DFA](crate::dfa) ahead of time. + +The [`meta`](crate::meta) regex engine makes all of these choices for you based +on various criteria. However, if you have a lower level use case, _you_ can +build any of the above regex engines and use them directly. But you must start +here by building an `NFA`. + +# Details + +It is perhaps worth expanding a bit more on what it means to go through the +`&str`->`Ast`->`Hir`->`NFA` process. + +* Parsing a string into an `Ast` gives it a structured representation. +Crucially, the size and amount of work done in this step is proportional to the +size of the original string. No optimization or Unicode handling is done at +this point. This means that parsing into an `Ast` has very predictable costs. +Moreover, an `Ast` can be roundtripped back to its original pattern string as +written. +* Translating an `Ast` into an `Hir` is a process by which the structured +representation is simplified down to its most fundamental components. +Translation deals with flags such as case insensitivity by converting things +like `(?i:a)` to `[Aa]`. Translation is also where Unicode tables are consulted +to resolve things like `\p{Emoji}` and `\p{Greek}`. It also flattens each +character class, regardless of how deeply nested it is, into a single sequence +of non-overlapping ranges. All the various literal forms are thrown out in +favor of one common representation. Overall, the `Hir` is small enough to fit +into your head and makes analysis and other tasks much simpler. +* Compiling an `Hir` into an `NFA` formulates the regex into a finite state +machine whose transitions are defined over bytes. For example, an `Hir` might +have a Unicode character class corresponding to a sequence of ranges defined +in terms of `char`. Compilation is then responsible for turning those ranges +into a UTF-8 automaton. That is, an automaton that matches the UTF-8 encoding +of just the codepoints specified by those ranges. Otherwise, the main job of +an `NFA` is to serve as a byte-code of sorts for a virtual machine. It can be +seen as a sequence of instructions for how to match a regex. +*/ + +#[cfg(feature = "nfa-backtrack")] +pub mod backtrack; +mod builder; +#[cfg(feature = "syntax")] mod compiler; mod error; +#[cfg(feature = "syntax")] +mod literal_trie; +#[cfg(feature = "syntax")] mod map; +mod nfa; +#[cfg(feature = "nfa-pikevm")] pub mod pikevm; +#[cfg(feature = "syntax")] mod range_trie; -/// A map from capture group name to its corresponding capture index. -/// -/// Since there are always two slots for each capture index, the pair of slots -/// corresponding to the capture index for a pattern ID of 0 are indexed at -/// `map["<name>"] * 2` and `map["<name>"] * 2 + 1`. -/// -/// This type is actually wrapped inside a Vec indexed by pattern ID on the -/// NFA, since multiple patterns may have the same capture group name. -/// -/// Note that this is somewhat of a sub-optimal representation, since it -/// requires a hashmap for each pattern. A better representation would be -/// HashMap<(PatternID, Arc<str>), usize>, but this makes it difficult to look -/// up a capture index by name without producing a `Arc<str>`, which requires -/// an allocation. To fix this, I think we'd need to define our own unsized -/// type or something? -#[cfg(feature = "std")] -type CaptureNameMap = std::collections::HashMap<Arc<str>, usize>; -#[cfg(not(feature = "std"))] -type CaptureNameMap = alloc::collections::BTreeMap<Arc<str>, usize>; - -// The NFA API below is not something I'm terribly proud of at the moment. In -// particular, it supports both mutating the NFA and actually using the NFA to -// perform a search. I think combining these two things muddies the waters a -// bit too much. -// -// I think the issue is that I saw the compiler as the 'builder,' and where -// the compiler had the ability to manipulate the internal state of the NFA. -// However, one of my goals was to make it possible for others to build their -// own NFAs in a way that is *not* couple to the regex-syntax crate. -// -// So I think really, there should be an NFA, a NFABuilder and then the -// internal compiler which uses the NFABuilder API to build an NFA. Alas, at -// the time of writing, I kind of ran out of steam. - -/// A fully compiled Thompson NFA. -/// -/// The states of the NFA are indexed by state IDs, which are how transitions -/// are expressed. -#[derive(Clone)] -pub struct NFA { - /// The state list. This list is guaranteed to be indexable by all starting - /// state IDs, and it is also guaranteed to contain at most one `Match` - /// state for each pattern compiled into this NFA. (A pattern may not have - /// a corresponding `Match` state if a `Match` state is impossible to - /// reach.) - states: Vec<State>, - /// The anchored starting state of this NFA. - start_anchored: StateID, - /// The unanchored starting state of this NFA. - start_unanchored: StateID, - /// The starting states for each individual pattern. Starting at any - /// of these states will result in only an anchored search for the - /// corresponding pattern. The vec is indexed by pattern ID. When the NFA - /// contains a single regex, then `start_pattern[0]` and `start_anchored` - /// are always equivalent. - start_pattern: Vec<StateID>, - /// A map from PatternID to its corresponding range of capture slots. Each - /// range is guaranteed to be contiguous with the previous range. The - /// end of the last range corresponds to the total number of slots needed - /// for this NFA. - patterns_to_slots: Vec<Range<usize>>, - /// A map from capture name to its corresponding index. So e.g., given - /// a single regex like '(\w+) (\w+) (?P<word>\w+)', the capture name - /// 'word' for pattern ID=0 would corresponding to the index '3'. Its - /// corresponding slots would then be '3 * 2 = 6' and '3 * 2 + 1 = 7'. - capture_name_to_index: Vec<CaptureNameMap>, - /// A map from pattern ID to capture group index to name, if one exists. - /// This is effectively the inverse of 'capture_name_to_index'. The outer - /// vec is indexed by pattern ID, while the inner vec is index by capture - /// index offset for the corresponding pattern. - /// - /// The first capture group for each pattern is always unnamed and is thus - /// always None. - capture_index_to_name: Vec<Vec<Option<Arc<str>>>>, - /// A representation of equivalence classes over the transitions in this - /// NFA. Two bytes in the same equivalence class must not discriminate - /// between a match or a non-match. This map can be used to shrink the - /// total size of a DFA's transition table with a small match-time cost. - /// - /// Note that the NFA's transitions are *not* defined in terms of these - /// equivalence classes. The NFA's transitions are defined on the original - /// byte values. For the most part, this is because they wouldn't really - /// help the NFA much since the NFA already uses a sparse representation - /// to represent transitions. Byte classes are most effective in a dense - /// representation. - byte_class_set: ByteClassSet, - /// Various facts about this NFA, which can be used to improve failure - /// modes (e.g., rejecting DFA construction if an NFA has Unicode word - /// boundaries) or for performing optimizations (avoiding an increase in - /// states if there are no look-around states). - facts: Facts, - /// Heap memory used indirectly by NFA states. Since each state might use a - /// different amount of heap, we need to keep track of this incrementally. - memory_states: usize, -} - -impl NFA { - pub fn config() -> Config { - Config::new() - } - - pub fn builder() -> Builder { - Builder::new() - } - - /// Returns an NFA with no states. Its match semantics are unspecified. - /// - /// An empty NFA is useful as a starting point for building one. It is - /// itself not intended to be used for matching. For example, its starting - /// state identifiers are configured to be `0`, but since it has no states, - /// the identifiers are invalid. - /// - /// If you need an NFA that never matches is anything and can be correctly - /// used for matching, use [`NFA::never_match`]. - #[inline] - pub fn empty() -> NFA { - NFA { - states: vec![], - start_anchored: StateID::ZERO, - start_unanchored: StateID::ZERO, - start_pattern: vec![], - patterns_to_slots: vec![], - capture_name_to_index: vec![], - capture_index_to_name: vec![], - byte_class_set: ByteClassSet::empty(), - facts: Facts::default(), - memory_states: 0, - } - } - - /// Returns an NFA with a single regex that always matches at every - /// position. - #[inline] - pub fn always_match() -> NFA { - let mut nfa = NFA::empty(); - // Since we're only adding one pattern, these are guaranteed to work. - let start = nfa.add_match().unwrap(); - assert_eq!(start.as_usize(), 0); - let pid = nfa.finish_pattern(start).unwrap(); - assert_eq!(pid.as_usize(), 0); - nfa - } - - /// Returns an NFA that never matches at any position. It contains no - /// regexes. - #[inline] - pub fn never_match() -> NFA { - let mut nfa = NFA::empty(); - // Since we're only adding one state, this can never fail. - nfa.add_fail().unwrap(); - nfa - } - - /// Return the number of states in this NFA. - /// - /// This is guaranteed to be no bigger than [`StateID::LIMIT`]. - #[inline] - pub fn len(&self) -> usize { - self.states.len() - } - - /// Returns the total number of distinct match states in this NFA. - /// Stated differently, this returns the total number of regex patterns - /// used to build this NFA. - /// - /// This may return zero if the NFA was constructed with no patterns. In - /// this case, and only this case, the NFA can never produce a match for - /// any input. - /// - /// This is guaranteed to be no bigger than [`PatternID::LIMIT`]. - #[inline] - pub fn pattern_len(&self) -> usize { - self.start_pattern.len() - } - - /// Returns the pattern ID of the pattern currently being compiled by this - /// NFA. - fn current_pattern_id(&self) -> PatternID { - // This always works because we never permit more patterns in - // 'start_pattern' than can be addressed by PatternID. Also, we only - // add a new entry to 'start_pattern' once we finish compiling a - // pattern. Thus, the length refers to the ID of the current pattern - // being compiled. - PatternID::new(self.start_pattern.len()).unwrap() - } - - /// Returns the total number of capturing groups in this NFA. - /// - /// This includes the special 0th capture group that is always present and - /// captures the start and end offset of the entire match. - /// - /// This is a convenience routine for `nfa.capture_slot_len() / 2`. - #[inline] - pub fn capture_len(&self) -> usize { - let slots = self.capture_slot_len(); - // This assert is guaranteed to pass since the NFA construction process - // guarantees that it is always true. - assert_eq!(slots % 2, 0, "capture slots must be divisible by 2"); - slots / 2 - } - - /// Returns the total number of capturing slots in this NFA. - /// - /// This value is guaranteed to be a multiple of 2. (Where each capturing - /// group has precisely two capturing slots in the NFA.) - #[inline] - pub fn capture_slot_len(&self) -> usize { - self.patterns_to_slots.last().map_or(0, |r| r.end) - } - - /// Return a range of capture slots for the given pattern. - /// - /// The range returned is guaranteed to be contiguous with ranges for - /// adjacent patterns. - /// - /// This panics if the given pattern ID is greater than or equal to the - /// number of patterns in this NFA. - #[inline] - pub fn pattern_slots(&self, pid: PatternID) -> Range<usize> { - self.patterns_to_slots[pid].clone() - } - - /// Return the capture group index corresponding to the given name in the - /// given pattern. If no such capture group name exists in the given - /// pattern, then this returns `None`. - /// - /// If the given pattern ID is invalid, then this panics. - #[inline] - pub fn capture_name_to_index( - &self, - pid: PatternID, - name: &str, - ) -> Option<usize> { - assert!(pid.as_usize() < self.pattern_len(), "invalid pattern ID"); - self.capture_name_to_index[pid].get(name).cloned() - } - - // TODO: add iterators over capture group names. - // Do we also permit indexing? - - /// Returns an iterator over all pattern IDs in this NFA. - #[inline] - pub fn patterns(&self) -> PatternIter { - PatternIter { - it: PatternID::iter(self.pattern_len()), - _marker: core::marker::PhantomData, - } - } - - /// Return the ID of the initial anchored state of this NFA. - #[inline] - pub fn start_anchored(&self) -> StateID { - self.start_anchored - } - - /// Set the anchored starting state ID for this NFA. - #[inline] - pub fn set_start_anchored(&mut self, id: StateID) { - self.start_anchored = id; - } - - /// Return the ID of the initial unanchored state of this NFA. - #[inline] - pub fn start_unanchored(&self) -> StateID { - self.start_unanchored - } - - /// Set the unanchored starting state ID for this NFA. - #[inline] - pub fn set_start_unanchored(&mut self, id: StateID) { - self.start_unanchored = id; - } - - /// Return the ID of the initial anchored state for the given pattern. - /// - /// If the pattern doesn't exist in this NFA, then this panics. - #[inline] - pub fn start_pattern(&self, pid: PatternID) -> StateID { - self.start_pattern[pid] - } - - /// Get the byte class set for this NFA. - #[inline] - pub fn byte_class_set(&self) -> &ByteClassSet { - &self.byte_class_set - } - - /// Return a reference to the NFA state corresponding to the given ID. - #[inline] - pub fn state(&self, id: StateID) -> &State { - &self.states[id] - } - - /// Returns a slice of all states in this NFA. - /// - /// The slice returned may be indexed by a `StateID` generated by `add`. - #[inline] - pub fn states(&self) -> &[State] { - &self.states - } - - #[inline] - pub fn is_always_start_anchored(&self) -> bool { - self.start_anchored() == self.start_unanchored() - } - - #[inline] - pub fn has_any_look(&self) -> bool { - self.facts.has_any_look() - } - - #[inline] - pub fn has_any_anchor(&self) -> bool { - self.facts.has_any_anchor() - } - - #[inline] - pub fn has_word_boundary(&self) -> bool { - self.has_word_boundary_unicode() || self.has_word_boundary_ascii() - } - - #[inline] - pub fn has_word_boundary_unicode(&self) -> bool { - self.facts.has_word_boundary_unicode() - } - - #[inline] - pub fn has_word_boundary_ascii(&self) -> bool { - self.facts.has_word_boundary_ascii() - } - - /// Returns the memory usage, in bytes, of this NFA. - /// - /// This does **not** include the stack size used up by this NFA. To - /// compute that, use `std::mem::size_of::<NFA>()`. - #[inline] - pub fn memory_usage(&self) -> usize { - self.states.len() * mem::size_of::<State>() - + self.memory_states - + self.start_pattern.len() * mem::size_of::<StateID>() - } - - // Why do we define a bunch of 'add_*' routines below instead of just - // defining a single 'add' routine that accepts a 'State'? Indeed, for most - // of the 'add_*' routines below, such a simple API would be more than - // appropriate. Unfortunately, adding capture states and, to a lesser - // extent, match states, is a bit more complex. Namely, when we add a - // capture state, we *really* want to know the corresponding capture - // group's name and index and what not, so that we can update other state - // inside this NFA. But, e.g., the capture group name is not and should - // not be included in 'State::Capture'. So what are our choices? - // - // 1) Define one 'add' and require some additional optional parameters. - // This feels quite ugly, and adds unnecessary complexity to more common - // and simpler cases. - // - // 2) Do what we do below. The sad thing is that our API is bigger with - // more methods. But each method is very specific and hopefully simple. - // - // 3) Define a new enum, say, 'StateWithInfo', or something that permits - // providing both a State and some extra ancillary info in some cases. This - // doesn't seem too bad to me, but seems slightly worse than (2) because of - // the additional type required. - // - // 4) Abandon the idea that we have to specify things like the capture - // group name when we add the Capture state to the NFA. We would then need - // to add other methods that permit the caller to add this additional state - // "out of band." Other than it introducing some additional complexity, I - // decided against this because I wanted the NFA builder API to make it - // as hard as possible to build a bad or invalid NFA. Using the approach - // below, as you'll see, permits us to do a lot of strict checking of our - // inputs and return an error if we see something we don't expect. - - pub fn add_range(&mut self, range: Transition) -> Result<StateID, Error> { - self.byte_class_set.set_range(range.start, range.end); - self.add_state(State::Range { range }) - } - - pub fn add_sparse( - &mut self, - sparse: SparseTransitions, - ) -> Result<StateID, Error> { - for range in sparse.ranges.iter() { - self.byte_class_set.set_range(range.start, range.end); - } - self.add_state(State::Sparse(sparse)) - } - - pub fn add_look( - &mut self, - next: StateID, - look: Look, - ) -> Result<StateID, Error> { - self.facts.set_has_any_look(true); - look.add_to_byteset(&mut self.byte_class_set); - match look { - Look::StartLine - | Look::EndLine - | Look::StartText - | Look::EndText => { - self.facts.set_has_any_anchor(true); - } - Look::WordBoundaryUnicode | Look::WordBoundaryUnicodeNegate => { - self.facts.set_has_word_boundary_unicode(true); - } - Look::WordBoundaryAscii | Look::WordBoundaryAsciiNegate => { - self.facts.set_has_word_boundary_ascii(true); - } - } - self.add_state(State::Look { look, next }) - } - - pub fn add_union( - &mut self, - alternates: Box<[StateID]>, - ) -> Result<StateID, Error> { - self.add_state(State::Union { alternates }) - } - - pub fn add_capture_start( - &mut self, - next_id: StateID, - capture_index: u32, - name: Option<Arc<str>>, - ) -> Result<StateID, Error> { - let pid = self.current_pattern_id(); - let capture_index = match usize::try_from(capture_index) { - Err(_) => { - return Err(Error::invalid_capture_index(core::usize::MAX)) - } - Ok(capture_index) => capture_index, - }; - // Do arithmetic to find our absolute slot index first, to make sure - // the index is at least possibly valid (doesn't overflow). - let relative_slot = match capture_index.checked_mul(2) { - Some(relative_slot) => relative_slot, - None => return Err(Error::invalid_capture_index(capture_index)), - }; - let slot = match relative_slot.checked_add(self.capture_slot_len()) { - Some(slot) => slot, - None => return Err(Error::invalid_capture_index(capture_index)), - }; - // Make sure we have space to insert our (pid,index)|-->name mapping. - if pid.as_usize() >= self.capture_index_to_name.len() { - // Note that we require that if you're adding capturing groups, - // then there must be at least one capturing group per pattern. - // Moreover, whenever we expand our space here, it should always - // first be for the first capture group (at index==0). - if pid.as_usize() > self.capture_index_to_name.len() - || capture_index > 0 - { - return Err(Error::invalid_capture_index(capture_index)); - } - self.capture_name_to_index.push(CaptureNameMap::new()); - self.capture_index_to_name.push(vec![]); - } - if capture_index >= self.capture_index_to_name[pid].len() { - // We require that capturing groups are added in correspondence - // to their index. So no discontinuous indices. This is likely - // overly strict, but also makes it simpler to provide guarantees - // about our capturing group data. - if capture_index > self.capture_index_to_name[pid].len() { - return Err(Error::invalid_capture_index(capture_index)); - } - self.capture_index_to_name[pid].push(None); - } - if let Some(ref name) = name { - self.capture_name_to_index[pid] - .insert(Arc::clone(name), capture_index); - } - self.capture_index_to_name[pid][capture_index] = name; - self.add_state(State::Capture { next: next_id, slot }) - } - - pub fn add_capture_end( - &mut self, - next_id: StateID, - capture_index: u32, - ) -> Result<StateID, Error> { - let pid = self.current_pattern_id(); - let capture_index = match usize::try_from(capture_index) { - Err(_) => { - return Err(Error::invalid_capture_index(core::usize::MAX)) - } - Ok(capture_index) => capture_index, - }; - // If we haven't already added this capture group via a corresponding - // 'add_capture_start' call, then we consider the index given to be - // invalid. - if pid.as_usize() >= self.capture_index_to_name.len() - || capture_index >= self.capture_index_to_name[pid].len() - { - return Err(Error::invalid_capture_index(capture_index)); - } - // Since we've already confirmed that this capture index is invalid - // and has a corresponding starting slot, we know the multiplcation - // has already been done and succeeded. - let relative_slot_start = capture_index.checked_mul(2).unwrap(); - let relative_slot = match relative_slot_start.checked_add(1) { - Some(relative_slot) => relative_slot, - None => return Err(Error::invalid_capture_index(capture_index)), - }; - let slot = match relative_slot.checked_add(self.capture_slot_len()) { - Some(slot) => slot, - None => return Err(Error::invalid_capture_index(capture_index)), - }; - self.add_state(State::Capture { next: next_id, slot }) - } - - pub fn add_fail(&mut self) -> Result<StateID, Error> { - self.add_state(State::Fail) - } - - /// Add a new match state to this NFA and return its state ID. - pub fn add_match(&mut self) -> Result<StateID, Error> { - let pattern_id = self.current_pattern_id(); - let sid = self.add_state(State::Match { id: pattern_id })?; - Ok(sid) - } - - /// Finish compiling the current pattern and return its identifier. The - /// given ID should be the state ID corresponding to the anchored starting - /// state for matching this pattern. - pub fn finish_pattern( - &mut self, - start_id: StateID, - ) -> Result<PatternID, Error> { - // We've gotta make sure that we never permit the user to add more - // patterns than we can identify. So if we're already at the limit, - // then return an error. This is somewhat non-ideal since this won't - // result in an error until trying to complete the compilation of a - // pattern instead of starting it. - if self.start_pattern.len() >= PatternID::LIMIT { - return Err(Error::too_many_patterns( - self.start_pattern.len().saturating_add(1), - )); - } - let pid = self.current_pattern_id(); - self.start_pattern.push(start_id); - // Add the number of new slots created by this pattern. This is always - // equivalent to '2 * caps.len()', where 'caps.len()' is the number of - // new capturing groups introduced by the pattern we're finishing. - let new_cap_groups = self - .capture_index_to_name - .get(pid.as_usize()) - .map_or(0, |caps| caps.len()); - let new_slots = match new_cap_groups.checked_mul(2) { - Some(new_slots) => new_slots, - None => { - // Just return the biggest index that we know exists. - let index = new_cap_groups.saturating_sub(1); - return Err(Error::invalid_capture_index(index)); - } - }; - let slot_start = self.capture_slot_len(); - self.patterns_to_slots.push(slot_start..(slot_start + new_slots)); - Ok(pid) - } - - fn add_state(&mut self, state: State) -> Result<StateID, Error> { - let id = StateID::new(self.states.len()) - .map_err(|_| Error::too_many_states(self.states.len()))?; - self.memory_states += state.memory_usage(); - self.states.push(state); - Ok(id) - } - - /// Remap the transitions in every state of this NFA using the given map. - /// The given map should be indexed according to state ID namespace used by - /// the transitions of the states currently in this NFA. - /// - /// This may be used during the final phases of an NFA compiler, which - /// turns its intermediate NFA into the final NFA. Remapping may be - /// required to bring the state pointers from the intermediate NFA to the - /// final NFA. - pub fn remap(&mut self, old_to_new: &[StateID]) { - for state in &mut self.states { - state.remap(old_to_new); - } - self.start_anchored = old_to_new[self.start_anchored]; - self.start_unanchored = old_to_new[self.start_unanchored]; - for (pid, id) in self.start_pattern.iter_mut().with_pattern_ids() { - *id = old_to_new[*id]; - } - } - - /// Clear this NFA such that it has zero states and is otherwise "empty." - /// - /// An empty NFA is useful as a starting point for building one. It is - /// itself not intended to be used for matching. For example, its starting - /// state identifiers are configured to be `0`, but since it has no states, - /// the identifiers are invalid. - pub fn clear(&mut self) { - self.states.clear(); - self.start_anchored = StateID::ZERO; - self.start_unanchored = StateID::ZERO; - self.start_pattern.clear(); - self.patterns_to_slots.clear(); - self.capture_name_to_index.clear(); - self.capture_index_to_name.clear(); - self.byte_class_set = ByteClassSet::empty(); - self.facts = Facts::default(); - self.memory_states = 0; - } -} - -impl fmt::Debug for NFA { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - writeln!(f, "thompson::NFA(")?; - for (sid, state) in self.states.iter().with_state_ids() { - let status = if sid == self.start_anchored { - '^' - } else if sid == self.start_unanchored { - '>' - } else { - ' ' - }; - writeln!(f, "{}{:06?}: {:?}", status, sid.as_usize(), state)?; - } - if self.pattern_len() > 1 { - writeln!(f, "")?; - for pid in self.patterns() { - let sid = self.start_pattern(pid); - writeln!( - f, - "START({:06?}): {:?}", - pid.as_usize(), - sid.as_usize() - )?; - } - } - writeln!(f, "")?; - writeln!( - f, - "transition equivalence classes: {:?}", - self.byte_class_set().byte_classes() - )?; - writeln!(f, ")")?; - Ok(()) - } -} - -/// A state in a final compiled NFA. -#[derive(Clone, Eq, PartialEq)] -pub enum State { - /// A state that transitions to `next` if and only if the current input - /// byte is in the range `[start, end]` (inclusive). - /// - /// This is a special case of Sparse in that it encodes only one transition - /// (and therefore avoids the allocation). - Range { range: Transition }, - /// A state with possibly many transitions, represented in a sparse - /// fashion. Transitions are ordered lexicographically by input range. As - /// such, this may only be used when every transition has equal priority. - /// (In practice, this is only used for encoding UTF-8 automata.) - Sparse(SparseTransitions), - /// A conditional epsilon transition satisfied via some sort of - /// look-around. - Look { look: Look, next: StateID }, - /// An alternation such that there exists an epsilon transition to all - /// states in `alternates`, where matches found via earlier transitions - /// are preferred over later transitions. - Union { alternates: Box<[StateID]> }, - /// An empty state that records a capture location. - /// - /// From the perspective of finite automata, this is precisely equivalent - /// to an epsilon transition, but serves the purpose of instructing NFA - /// simulations to record additional state when the finite state machine - /// passes through this epsilon transition. - /// - /// These transitions are treated as epsilon transitions with no additional - /// effects in DFAs. - /// - /// 'slot' in this context refers to the specific capture group offset that - /// is being recorded. Each capturing group has two slots corresponding to - /// the start and end of the matching portion of that group. - /// A fail state. When encountered, the automaton is guaranteed to never - /// reach a match state. - Capture { next: StateID, slot: usize }, - /// A state that cannot be transitioned out of. If a search reaches this - /// state, then no match is possible and the search should terminate. - Fail, - /// A match state. There is exactly one such occurrence of this state for - /// each regex compiled into the NFA. - Match { id: PatternID }, -} - -impl State { - /// Returns true if and only if this state contains one or more epsilon - /// transitions. - #[inline] - pub fn is_epsilon(&self) -> bool { - match *self { - State::Range { .. } - | State::Sparse { .. } - | State::Fail - | State::Match { .. } => false, - State::Look { .. } - | State::Union { .. } - | State::Capture { .. } => true, - } - } - - /// Returns the heap memory usage of this NFA state in bytes. - fn memory_usage(&self) -> usize { - match *self { - State::Range { .. } - | State::Look { .. } - | State::Capture { .. } - | State::Match { .. } - | State::Fail => 0, - State::Sparse(SparseTransitions { ref ranges }) => { - ranges.len() * mem::size_of::<Transition>() - } - State::Union { ref alternates } => { - alternates.len() * mem::size_of::<StateID>() - } - } - } - - /// Remap the transitions in this state using the given map. Namely, the - /// given map should be indexed according to the transitions currently - /// in this state. - /// - /// This is used during the final phase of the NFA compiler, which turns - /// its intermediate NFA into the final NFA. - fn remap(&mut self, remap: &[StateID]) { - match *self { - State::Range { ref mut range } => range.next = remap[range.next], - State::Sparse(SparseTransitions { ref mut ranges }) => { - for r in ranges.iter_mut() { - r.next = remap[r.next]; - } - } - State::Look { ref mut next, .. } => *next = remap[*next], - State::Union { ref mut alternates } => { - for alt in alternates.iter_mut() { - *alt = remap[*alt]; - } - } - State::Capture { ref mut next, .. } => *next = remap[*next], - State::Fail => {} - State::Match { .. } => {} - } - } -} - -impl fmt::Debug for State { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - match *self { - State::Range { ref range } => range.fmt(f), - State::Sparse(SparseTransitions { ref ranges }) => { - let rs = ranges - .iter() - .map(|t| format!("{:?}", t)) - .collect::<Vec<String>>() - .join(", "); - write!(f, "sparse({})", rs) - } - State::Look { ref look, next } => { - write!(f, "{:?} => {:?}", look, next.as_usize()) - } - State::Union { ref alternates } => { - let alts = alternates - .iter() - .map(|id| format!("{:?}", id.as_usize())) - .collect::<Vec<String>>() - .join(", "); - write!(f, "alt({})", alts) - } - State::Capture { next, slot } => { - write!(f, "capture({:?}) => {:?}", slot, next.as_usize()) - } - State::Fail => write!(f, "FAIL"), - State::Match { id } => write!(f, "MATCH({:?})", id.as_usize()), - } - } -} - -/// A collection of facts about an NFA. -/// -/// There are no real cohesive principles behind what gets put in here. For -/// the most part, it is implementation driven. -#[derive(Clone, Copy, Debug, Default)] -struct Facts { - /// Various yes/no facts about this NFA. - bools: u16, -} - -impl Facts { - define_bool!(0, has_any_look, set_has_any_look); - define_bool!(1, has_any_anchor, set_has_any_anchor); - define_bool!(2, has_word_boundary_unicode, set_has_word_boundary_unicode); - define_bool!(3, has_word_boundary_ascii, set_has_word_boundary_ascii); -} - -/// A sequence of transitions used to represent a sparse state. -#[derive(Clone, Debug, Eq, PartialEq)] -pub struct SparseTransitions { - pub ranges: Box<[Transition]>, -} - -impl SparseTransitions { - pub fn matches(&self, haystack: &[u8], at: usize) -> Option<StateID> { - haystack.get(at).and_then(|&b| self.matches_byte(b)) - } - - pub fn matches_unit(&self, unit: alphabet::Unit) -> Option<StateID> { - unit.as_u8().map_or(None, |byte| self.matches_byte(byte)) - } - - pub fn matches_byte(&self, byte: u8) -> Option<StateID> { - for t in self.ranges.iter() { - if t.start > byte { - break; - } else if t.matches_byte(byte) { - return Some(t.next); - } - } - None - - /* - // This is an alternative implementation that uses binary search. In - // some ad hoc experiments, like - // - // smallishru=OpenSubtitles2018.raw.sample.smallish.ru - // regex-cli find nfa thompson pikevm -b "@$smallishru" '\b\w+\b' - // - // I could not observe any improvement, and in fact, things seemed to - // be a bit slower. - self.ranges - .binary_search_by(|t| { - if t.end < byte { - core::cmp::Ordering::Less - } else if t.start > byte { - core::cmp::Ordering::Greater - } else { - core::cmp::Ordering::Equal - } - }) - .ok() - .map(|i| self.ranges[i].next) - */ - } -} - -/// A transition to another state, only if the given byte falls in the -/// inclusive range specified. -#[derive(Clone, Copy, Eq, Hash, PartialEq)] -pub struct Transition { - pub start: u8, - pub end: u8, - pub next: StateID, -} - -impl Transition { - pub fn matches(&self, haystack: &[u8], at: usize) -> bool { - haystack.get(at).map_or(false, |&b| self.matches_byte(b)) - } - - pub fn matches_unit(&self, unit: alphabet::Unit) -> bool { - unit.as_u8().map_or(false, |byte| self.matches_byte(byte)) - } - - pub fn matches_byte(&self, byte: u8) -> bool { - self.start <= byte && byte <= self.end - } -} - -impl fmt::Debug for Transition { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - use crate::util::DebugByte; - - let Transition { start, end, next } = *self; - if self.start == self.end { - write!(f, "{:?} => {:?}", DebugByte(start), next.as_usize()) - } else { - write!( - f, - "{:?}-{:?} => {:?}", - DebugByte(start), - DebugByte(end), - next.as_usize(), - ) - } - } -} - -/// A conditional NFA epsilon transition. -/// -/// A simulation of the NFA can only move through this epsilon transition if -/// the current position satisfies some look-around property. Some assertions -/// are look-behind (StartLine, StartText), some assertions are look-ahead -/// (EndLine, EndText) while other assertions are both look-behind and -/// look-ahead (WordBoundary*). -#[derive(Clone, Copy, Debug, Eq, PartialEq)] -pub enum Look { - /// The previous position is either `\n` or the current position is the - /// beginning of the haystack (i.e., at position `0`). - StartLine = 1 << 0, - /// The next position is either `\n` or the current position is the end of - /// the haystack (i.e., at position `haystack.len()`). - EndLine = 1 << 1, - /// The current position is the beginning of the haystack (i.e., at - /// position `0`). - StartText = 1 << 2, - /// The current position is the end of the haystack (i.e., at position - /// `haystack.len()`). - EndText = 1 << 3, - /// When tested at position `i`, where `p=decode_utf8_rev(&haystack[..i])` - /// and `n=decode_utf8(&haystack[i..])`, this assertion passes if and only - /// if `is_word(p) != is_word(n)`. If `i=0`, then `is_word(p)=false` and if - /// `i=haystack.len()`, then `is_word(n)=false`. - WordBoundaryUnicode = 1 << 4, - /// Same as for `WordBoundaryUnicode`, but requires that - /// `is_word(p) == is_word(n)`. - WordBoundaryUnicodeNegate = 1 << 5, - /// When tested at position `i`, where `p=haystack[i-1]` and - /// `n=haystack[i]`, this assertion passes if and only if `is_word(p) - /// != is_word(n)`. If `i=0`, then `is_word(p)=false` and if - /// `i=haystack.len()`, then `is_word(n)=false`. - WordBoundaryAscii = 1 << 6, - /// Same as for `WordBoundaryAscii`, but requires that - /// `is_word(p) == is_word(n)`. - /// - /// Note that it is possible for this assertion to match at positions that - /// split the UTF-8 encoding of a codepoint. For this reason, this may only - /// be used when UTF-8 mode is disable in the regex syntax. - WordBoundaryAsciiNegate = 1 << 7, -} - -impl Look { - #[inline(always)] - pub fn matches(&self, bytes: &[u8], at: usize) -> bool { - match *self { - Look::StartLine => at == 0 || bytes[at - 1] == b'\n', - Look::EndLine => at == bytes.len() || bytes[at] == b'\n', - Look::StartText => at == 0, - Look::EndText => at == bytes.len(), - Look::WordBoundaryUnicode => { - let word_before = is_word_char_rev(bytes, at); - let word_after = is_word_char_fwd(bytes, at); - word_before != word_after - } - Look::WordBoundaryUnicodeNegate => { - // This is pretty subtle. Why do we need to do UTF-8 decoding - // here? Well... at time of writing, the is_word_char_{fwd,rev} - // routines will only return true if there is a valid UTF-8 - // encoding of a "word" codepoint, and false in every other - // case (including invalid UTF-8). This means that in regions - // of invalid UTF-8 (which might be a subset of valid UTF-8!), - // it would result in \B matching. While this would be - // questionable in the context of truly invalid UTF-8, it is - // *certainly* wrong to report match boundaries that split the - // encoding of a codepoint. So to work around this, we ensure - // that we can decode a codepoint on either side of `at`. If - // either direction fails, then we don't permit \B to match at - // all. - // - // Now, this isn't exactly optimal from a perf perspective. We - // could try and detect this in is_word_char_{fwd,rev}, but - // it's not clear if it's worth it. \B is, after all, rarely - // used. - // - // And in particular, we do *not* have to do this with \b, - // because \b *requires* that at least one side of `at` be a - // "word" codepoint, which in turn implies one side of `at` - // must be valid UTF-8. This in turn implies that \b can never - // split a valid UTF-8 encoding of a codepoint. In the case - // where one side of `at` is truly invalid UTF-8 and the other - // side IS a word codepoint, then we want \b to match since it - // represents a valid UTF-8 boundary. It also makes sense. For - // example, you'd want \b\w+\b to match 'abc' in '\xFFabc\xFF'. - let word_before = at > 0 - && match decode_last_utf8(&bytes[..at]) { - None | Some(Err(_)) => return false, - Some(Ok(_)) => is_word_char_rev(bytes, at), - }; - let word_after = at < bytes.len() - && match decode_utf8(&bytes[at..]) { - None | Some(Err(_)) => return false, - Some(Ok(_)) => is_word_char_fwd(bytes, at), - }; - word_before == word_after - } - Look::WordBoundaryAscii => { - let word_before = at > 0 && is_word_byte(bytes[at - 1]); - let word_after = at < bytes.len() && is_word_byte(bytes[at]); - word_before != word_after - } - Look::WordBoundaryAsciiNegate => { - let word_before = at > 0 && is_word_byte(bytes[at - 1]); - let word_after = at < bytes.len() && is_word_byte(bytes[at]); - word_before == word_after - } - } - } - - /// Create a look-around assertion from its corresponding integer (as - /// defined in `Look`). If the given integer does not correspond to any - /// assertion, then None is returned. - fn from_int(n: u8) -> Option<Look> { - match n { - 0b0000_0001 => Some(Look::StartLine), - 0b0000_0010 => Some(Look::EndLine), - 0b0000_0100 => Some(Look::StartText), - 0b0000_1000 => Some(Look::EndText), - 0b0001_0000 => Some(Look::WordBoundaryUnicode), - 0b0010_0000 => Some(Look::WordBoundaryUnicodeNegate), - 0b0100_0000 => Some(Look::WordBoundaryAscii), - 0b1000_0000 => Some(Look::WordBoundaryAsciiNegate), - _ => None, - } - } - - /// Flip the look-around assertion to its equivalent for reverse searches. - fn reversed(&self) -> Look { - match *self { - Look::StartLine => Look::EndLine, - Look::EndLine => Look::StartLine, - Look::StartText => Look::EndText, - Look::EndText => Look::StartText, - Look::WordBoundaryUnicode => Look::WordBoundaryUnicode, - Look::WordBoundaryUnicodeNegate => Look::WordBoundaryUnicodeNegate, - Look::WordBoundaryAscii => Look::WordBoundaryAscii, - Look::WordBoundaryAsciiNegate => Look::WordBoundaryAsciiNegate, - } - } - - /// Split up the given byte classes into equivalence classes in a way that - /// is consistent with this look-around assertion. - fn add_to_byteset(&self, set: &mut ByteClassSet) { - match *self { - Look::StartText | Look::EndText => {} - Look::StartLine | Look::EndLine => { - set.set_range(b'\n', b'\n'); - } - Look::WordBoundaryUnicode - | Look::WordBoundaryUnicodeNegate - | Look::WordBoundaryAscii - | Look::WordBoundaryAsciiNegate => { - // We need to mark all ranges of bytes whose pairs result in - // evaluating \b differently. This isn't technically correct - // for Unicode word boundaries, but DFAs can't handle those - // anyway, and thus, the byte classes don't need to either - // since they are themselves only used in DFAs. - let iswb = regex_syntax::is_word_byte; - let mut b1: u16 = 0; - let mut b2: u16; - while b1 <= 255 { - b2 = b1 + 1; - while b2 <= 255 && iswb(b1 as u8) == iswb(b2 as u8) { - b2 += 1; - } - set.set_range(b1 as u8, (b2 - 1) as u8); - b1 = b2; - } - } - } - } -} - -/// LookSet is a memory-efficient set of look-around assertions. Callers may -/// idempotently insert or remove any look-around assertion from a set. -#[repr(transparent)] -#[derive(Clone, Copy, Default, Eq, Hash, PartialEq, PartialOrd, Ord)] -pub(crate) struct LookSet { - set: u8, -} - -impl LookSet { - /// Return a LookSet from its representation. - pub(crate) fn from_repr(repr: u8) -> LookSet { - LookSet { set: repr } - } - - /// Return a mutable LookSet from a mutable pointer to its representation. - pub(crate) fn from_repr_mut(repr: &mut u8) -> &mut LookSet { - // SAFETY: This is safe since a LookSet is repr(transparent) where its - // repr is a u8. - unsafe { core::mem::transmute::<&mut u8, &mut LookSet>(repr) } - } - - /// Return true if and only if this set is empty. - pub(crate) fn is_empty(&self) -> bool { - self.set == 0 - } - - /// Clears this set such that it has no assertions in it. - pub(crate) fn clear(&mut self) { - self.set = 0; - } - - /// Insert the given look-around assertion into this set. If the assertion - /// already exists, then this is a no-op. - pub(crate) fn insert(&mut self, look: Look) { - self.set |= look as u8; - } - - /// Remove the given look-around assertion from this set. If the assertion - /// is not in this set, then this is a no-op. - #[cfg(test)] - pub(crate) fn remove(&mut self, look: Look) { - self.set &= !(look as u8); - } - - /// Return true if and only if the given assertion is in this set. - pub(crate) fn contains(&self, look: Look) -> bool { - (look as u8) & self.set != 0 - } - - /// Subtract the given `other` set from the `self` set and return a new - /// set. - pub(crate) fn subtract(&self, other: LookSet) -> LookSet { - LookSet { set: self.set & !other.set } - } - - /// Return the intersection of the given `other` set with the `self` set - /// and return the resulting set. - pub(crate) fn intersect(&self, other: LookSet) -> LookSet { - LookSet { set: self.set & other.set } - } -} - -impl core::fmt::Debug for LookSet { - fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { - let mut members = vec![]; - for i in 0..8 { - let look = match Look::from_int(1 << i) { - None => continue, - Some(look) => look, - }; - if self.contains(look) { - members.push(look); - } - } - f.debug_tuple("LookSet").field(&members).finish() - } -} - -/// An iterator over all pattern IDs in an NFA. -pub struct PatternIter<'a> { - it: PatternIDIter, - /// We explicitly associate a lifetime with this iterator even though we - /// don't actually borrow anything from the NFA. We do this for backward - /// compatibility purposes. If we ever do need to borrow something from - /// the NFA, then we can and just get rid of this marker without breaking - /// the public API. - _marker: core::marker::PhantomData<&'a ()>, -} - -impl<'a> Iterator for PatternIter<'a> { - type Item = PatternID; - - fn next(&mut self) -> Option<PatternID> { - self.it.next() - } -} - -#[cfg(test)] -mod tests { - use super::*; - // TODO: Replace tests using DFA with NFA matching engine once implemented. - use crate::dfa::{dense, Automaton}; - - #[test] - fn always_match() { - let nfa = NFA::always_match(); - let dfa = dense::Builder::new().build_from_nfa(&nfa).unwrap(); - let find = |input, start, end| { - dfa.find_leftmost_fwd_at(None, None, input, start, end) - .unwrap() - .map(|m| m.offset()) - }; - - assert_eq!(Some(0), find(b"", 0, 0)); - assert_eq!(Some(0), find(b"a", 0, 1)); - assert_eq!(Some(1), find(b"a", 1, 1)); - assert_eq!(Some(0), find(b"ab", 0, 2)); - assert_eq!(Some(1), find(b"ab", 1, 2)); - assert_eq!(Some(2), find(b"ab", 2, 2)); - } - - #[test] - fn never_match() { - let nfa = NFA::never_match(); - let dfa = dense::Builder::new().build_from_nfa(&nfa).unwrap(); - let find = |input, start, end| { - dfa.find_leftmost_fwd_at(None, None, input, start, end) - .unwrap() - .map(|m| m.offset()) - }; - - assert_eq!(None, find(b"", 0, 0)); - assert_eq!(None, find(b"a", 0, 1)); - assert_eq!(None, find(b"a", 1, 1)); - assert_eq!(None, find(b"ab", 0, 2)); - assert_eq!(None, find(b"ab", 1, 2)); - assert_eq!(None, find(b"ab", 2, 2)); - } - - #[test] - fn look_set() { - let mut f = LookSet::default(); - assert!(!f.contains(Look::StartText)); - assert!(!f.contains(Look::EndText)); - assert!(!f.contains(Look::StartLine)); - assert!(!f.contains(Look::EndLine)); - assert!(!f.contains(Look::WordBoundaryUnicode)); - assert!(!f.contains(Look::WordBoundaryUnicodeNegate)); - assert!(!f.contains(Look::WordBoundaryAscii)); - assert!(!f.contains(Look::WordBoundaryAsciiNegate)); - - f.insert(Look::StartText); - assert!(f.contains(Look::StartText)); - f.remove(Look::StartText); - assert!(!f.contains(Look::StartText)); - - f.insert(Look::EndText); - assert!(f.contains(Look::EndText)); - f.remove(Look::EndText); - assert!(!f.contains(Look::EndText)); - - f.insert(Look::StartLine); - assert!(f.contains(Look::StartLine)); - f.remove(Look::StartLine); - assert!(!f.contains(Look::StartLine)); - - f.insert(Look::EndLine); - assert!(f.contains(Look::EndLine)); - f.remove(Look::EndLine); - assert!(!f.contains(Look::EndLine)); - - f.insert(Look::WordBoundaryUnicode); - assert!(f.contains(Look::WordBoundaryUnicode)); - f.remove(Look::WordBoundaryUnicode); - assert!(!f.contains(Look::WordBoundaryUnicode)); - - f.insert(Look::WordBoundaryUnicodeNegate); - assert!(f.contains(Look::WordBoundaryUnicodeNegate)); - f.remove(Look::WordBoundaryUnicodeNegate); - assert!(!f.contains(Look::WordBoundaryUnicodeNegate)); - - f.insert(Look::WordBoundaryAscii); - assert!(f.contains(Look::WordBoundaryAscii)); - f.remove(Look::WordBoundaryAscii); - assert!(!f.contains(Look::WordBoundaryAscii)); - - f.insert(Look::WordBoundaryAsciiNegate); - assert!(f.contains(Look::WordBoundaryAsciiNegate)); - f.remove(Look::WordBoundaryAsciiNegate); - assert!(!f.contains(Look::WordBoundaryAsciiNegate)); - } - - #[test] - fn look_matches_start_line() { - let look = Look::StartLine; - - assert!(look.matches(B(""), 0)); - assert!(look.matches(B("\n"), 0)); - assert!(look.matches(B("\n"), 1)); - assert!(look.matches(B("a"), 0)); - assert!(look.matches(B("\na"), 1)); - - assert!(!look.matches(B("a"), 1)); - assert!(!look.matches(B("a\na"), 1)); - } - - #[test] - fn look_matches_end_line() { - let look = Look::EndLine; - - assert!(look.matches(B(""), 0)); - assert!(look.matches(B("\n"), 1)); - assert!(look.matches(B("\na"), 0)); - assert!(look.matches(B("\na"), 2)); - assert!(look.matches(B("a\na"), 1)); - - assert!(!look.matches(B("a"), 0)); - assert!(!look.matches(B("\na"), 1)); - assert!(!look.matches(B("a\na"), 0)); - assert!(!look.matches(B("a\na"), 2)); - } - - #[test] - fn look_matches_start_text() { - let look = Look::StartText; - - assert!(look.matches(B(""), 0)); - assert!(look.matches(B("\n"), 0)); - assert!(look.matches(B("a"), 0)); - - assert!(!look.matches(B("\n"), 1)); - assert!(!look.matches(B("\na"), 1)); - assert!(!look.matches(B("a"), 1)); - assert!(!look.matches(B("a\na"), 1)); - } - - #[test] - fn look_matches_end_text() { - let look = Look::EndText; - - assert!(look.matches(B(""), 0)); - assert!(look.matches(B("\n"), 1)); - assert!(look.matches(B("\na"), 2)); - - assert!(!look.matches(B("\na"), 0)); - assert!(!look.matches(B("a\na"), 1)); - assert!(!look.matches(B("a"), 0)); - assert!(!look.matches(B("\na"), 1)); - assert!(!look.matches(B("a\na"), 0)); - assert!(!look.matches(B("a\na"), 2)); - } - - #[test] - fn look_matches_word_unicode() { - let look = Look::WordBoundaryUnicode; - - // \xF0\x9D\x9B\x83 = 𝛃 (in \w) - // \xF0\x90\x86\x80 = 𐆀 (not in \w) - - // Simple ASCII word boundaries. - assert!(look.matches(B("a"), 0)); - assert!(look.matches(B("a"), 1)); - assert!(look.matches(B("a "), 1)); - assert!(look.matches(B(" a "), 1)); - assert!(look.matches(B(" a "), 2)); - - // Unicode word boundaries with a non-ASCII codepoint. - assert!(look.matches(B("𝛃"), 0)); - assert!(look.matches(B("𝛃"), 4)); - assert!(look.matches(B("𝛃 "), 4)); - assert!(look.matches(B(" 𝛃 "), 1)); - assert!(look.matches(B(" 𝛃 "), 5)); - - // Unicode word boundaries between non-ASCII codepoints. - assert!(look.matches(B("𝛃𐆀"), 0)); - assert!(look.matches(B("𝛃𐆀"), 4)); - - // Non word boundaries for ASCII. - assert!(!look.matches(B(""), 0)); - assert!(!look.matches(B("ab"), 1)); - assert!(!look.matches(B("a "), 2)); - assert!(!look.matches(B(" a "), 0)); - assert!(!look.matches(B(" a "), 3)); - - // Non word boundaries with a non-ASCII codepoint. - assert!(!look.matches(B("𝛃b"), 4)); - assert!(!look.matches(B("𝛃 "), 5)); - assert!(!look.matches(B(" 𝛃 "), 0)); - assert!(!look.matches(B(" 𝛃 "), 6)); - assert!(!look.matches(B("𝛃"), 1)); - assert!(!look.matches(B("𝛃"), 2)); - assert!(!look.matches(B("𝛃"), 3)); - - // Non word boundaries with non-ASCII codepoints. - assert!(!look.matches(B("𝛃𐆀"), 1)); - assert!(!look.matches(B("𝛃𐆀"), 2)); - assert!(!look.matches(B("𝛃𐆀"), 3)); - assert!(!look.matches(B("𝛃𐆀"), 5)); - assert!(!look.matches(B("𝛃𐆀"), 6)); - assert!(!look.matches(B("𝛃𐆀"), 7)); - assert!(!look.matches(B("𝛃𐆀"), 8)); - } - - #[test] - fn look_matches_word_ascii() { - let look = Look::WordBoundaryAscii; - - // \xF0\x9D\x9B\x83 = 𝛃 (in \w) - // \xF0\x90\x86\x80 = 𐆀 (not in \w) - - // Simple ASCII word boundaries. - assert!(look.matches(B("a"), 0)); - assert!(look.matches(B("a"), 1)); - assert!(look.matches(B("a "), 1)); - assert!(look.matches(B(" a "), 1)); - assert!(look.matches(B(" a "), 2)); - - // Unicode word boundaries with a non-ASCII codepoint. Since this is - // an ASCII word boundary, none of these match. - assert!(!look.matches(B("𝛃"), 0)); - assert!(!look.matches(B("𝛃"), 4)); - assert!(!look.matches(B("𝛃 "), 4)); - assert!(!look.matches(B(" 𝛃 "), 1)); - assert!(!look.matches(B(" 𝛃 "), 5)); - - // Unicode word boundaries between non-ASCII codepoints. Again, since - // this is an ASCII word boundary, none of these match. - assert!(!look.matches(B("𝛃𐆀"), 0)); - assert!(!look.matches(B("𝛃𐆀"), 4)); - - // Non word boundaries for ASCII. - assert!(!look.matches(B(""), 0)); - assert!(!look.matches(B("ab"), 1)); - assert!(!look.matches(B("a "), 2)); - assert!(!look.matches(B(" a "), 0)); - assert!(!look.matches(B(" a "), 3)); - - // Non word boundaries with a non-ASCII codepoint. - assert!(look.matches(B("𝛃b"), 4)); - assert!(!look.matches(B("𝛃 "), 5)); - assert!(!look.matches(B(" 𝛃 "), 0)); - assert!(!look.matches(B(" 𝛃 "), 6)); - assert!(!look.matches(B("𝛃"), 1)); - assert!(!look.matches(B("𝛃"), 2)); - assert!(!look.matches(B("𝛃"), 3)); - - // Non word boundaries with non-ASCII codepoints. - assert!(!look.matches(B("𝛃𐆀"), 1)); - assert!(!look.matches(B("𝛃𐆀"), 2)); - assert!(!look.matches(B("𝛃𐆀"), 3)); - assert!(!look.matches(B("𝛃𐆀"), 5)); - assert!(!look.matches(B("𝛃𐆀"), 6)); - assert!(!look.matches(B("𝛃𐆀"), 7)); - assert!(!look.matches(B("𝛃𐆀"), 8)); - } - - #[test] - fn look_matches_word_unicode_negate() { - let look = Look::WordBoundaryUnicodeNegate; - - // \xF0\x9D\x9B\x83 = 𝛃 (in \w) - // \xF0\x90\x86\x80 = 𐆀 (not in \w) - - // Simple ASCII word boundaries. - assert!(!look.matches(B("a"), 0)); - assert!(!look.matches(B("a"), 1)); - assert!(!look.matches(B("a "), 1)); - assert!(!look.matches(B(" a "), 1)); - assert!(!look.matches(B(" a "), 2)); - - // Unicode word boundaries with a non-ASCII codepoint. - assert!(!look.matches(B("𝛃"), 0)); - assert!(!look.matches(B("𝛃"), 4)); - assert!(!look.matches(B("𝛃 "), 4)); - assert!(!look.matches(B(" 𝛃 "), 1)); - assert!(!look.matches(B(" 𝛃 "), 5)); - - // Unicode word boundaries between non-ASCII codepoints. - assert!(!look.matches(B("𝛃𐆀"), 0)); - assert!(!look.matches(B("𝛃𐆀"), 4)); - - // Non word boundaries for ASCII. - assert!(look.matches(B(""), 0)); - assert!(look.matches(B("ab"), 1)); - assert!(look.matches(B("a "), 2)); - assert!(look.matches(B(" a "), 0)); - assert!(look.matches(B(" a "), 3)); - - // Non word boundaries with a non-ASCII codepoint. - assert!(look.matches(B("𝛃b"), 4)); - assert!(look.matches(B("𝛃 "), 5)); - assert!(look.matches(B(" 𝛃 "), 0)); - assert!(look.matches(B(" 𝛃 "), 6)); - // These don't match because they could otherwise return an offset that - // splits the UTF-8 encoding of a codepoint. - assert!(!look.matches(B("𝛃"), 1)); - assert!(!look.matches(B("𝛃"), 2)); - assert!(!look.matches(B("𝛃"), 3)); - - // Non word boundaries with non-ASCII codepoints. These also don't - // match because they could otherwise return an offset that splits the - // UTF-8 encoding of a codepoint. - assert!(!look.matches(B("𝛃𐆀"), 1)); - assert!(!look.matches(B("𝛃𐆀"), 2)); - assert!(!look.matches(B("𝛃𐆀"), 3)); - assert!(!look.matches(B("𝛃𐆀"), 5)); - assert!(!look.matches(B("𝛃𐆀"), 6)); - assert!(!look.matches(B("𝛃𐆀"), 7)); - // But this one does, since 𐆀 isn't a word codepoint, and 8 is the end - // of the haystack. So the "end" of the haystack isn't a word and 𐆀 - // isn't a word, thus, \B matches. - assert!(look.matches(B("𝛃𐆀"), 8)); - } - - #[test] - fn look_matches_word_ascii_negate() { - let look = Look::WordBoundaryAsciiNegate; - - // \xF0\x9D\x9B\x83 = 𝛃 (in \w) - // \xF0\x90\x86\x80 = 𐆀 (not in \w) - - // Simple ASCII word boundaries. - assert!(!look.matches(B("a"), 0)); - assert!(!look.matches(B("a"), 1)); - assert!(!look.matches(B("a "), 1)); - assert!(!look.matches(B(" a "), 1)); - assert!(!look.matches(B(" a "), 2)); - - // Unicode word boundaries with a non-ASCII codepoint. Since this is - // an ASCII word boundary, none of these match. - assert!(look.matches(B("𝛃"), 0)); - assert!(look.matches(B("𝛃"), 4)); - assert!(look.matches(B("𝛃 "), 4)); - assert!(look.matches(B(" 𝛃 "), 1)); - assert!(look.matches(B(" 𝛃 "), 5)); - - // Unicode word boundaries between non-ASCII codepoints. Again, since - // this is an ASCII word boundary, none of these match. - assert!(look.matches(B("𝛃𐆀"), 0)); - assert!(look.matches(B("𝛃𐆀"), 4)); - - // Non word boundaries for ASCII. - assert!(look.matches(B(""), 0)); - assert!(look.matches(B("ab"), 1)); - assert!(look.matches(B("a "), 2)); - assert!(look.matches(B(" a "), 0)); - assert!(look.matches(B(" a "), 3)); - - // Non word boundaries with a non-ASCII codepoint. - assert!(!look.matches(B("𝛃b"), 4)); - assert!(look.matches(B("𝛃 "), 5)); - assert!(look.matches(B(" 𝛃 "), 0)); - assert!(look.matches(B(" 𝛃 "), 6)); - assert!(look.matches(B("𝛃"), 1)); - assert!(look.matches(B("𝛃"), 2)); - assert!(look.matches(B("𝛃"), 3)); - - // Non word boundaries with non-ASCII codepoints. - assert!(look.matches(B("𝛃𐆀"), 1)); - assert!(look.matches(B("𝛃𐆀"), 2)); - assert!(look.matches(B("𝛃𐆀"), 3)); - assert!(look.matches(B("𝛃𐆀"), 5)); - assert!(look.matches(B("𝛃𐆀"), 6)); - assert!(look.matches(B("𝛃𐆀"), 7)); - assert!(look.matches(B("𝛃𐆀"), 8)); - } - - fn B<'a, T: 'a + ?Sized + AsRef<[u8]>>(string: &'a T) -> &'a [u8] { - string.as_ref() - } -} +pub use self::{ + builder::Builder, + error::BuildError, + nfa::{ + DenseTransitions, PatternIter, SparseTransitions, State, Transition, + NFA, + }, +}; +#[cfg(feature = "syntax")] +pub use compiler::{Compiler, Config, WhichCaptures}; |