diff options
Diffstat (limited to 'vendor/regex-automata-0.2.0/src/nfa/thompson/mod.rs')
-rw-r--r-- | vendor/regex-automata-0.2.0/src/nfa/thompson/mod.rs | 1555 |
1 file changed, 1555 insertions, 0 deletions
diff --git a/vendor/regex-automata-0.2.0/src/nfa/thompson/mod.rs b/vendor/regex-automata-0.2.0/src/nfa/thompson/mod.rs new file mode 100644 index 000000000..88a438e8e --- /dev/null +++ b/vendor/regex-automata-0.2.0/src/nfa/thompson/mod.rs @@ -0,0 +1,1555 @@ +use core::{convert::TryFrom, fmt, mem, ops::Range}; + +use alloc::{boxed::Box, format, string::String, sync::Arc, vec, vec::Vec}; + +use crate::util::{ + alphabet::{self, ByteClassSet}, + decode_last_utf8, decode_utf8, + id::{IteratorIDExt, PatternID, PatternIDIter, StateID}, + is_word_byte, is_word_char_fwd, is_word_char_rev, +}; + +pub use self::{ + compiler::{Builder, Config}, + error::Error, +}; + +mod compiler; +mod error; +mod map; +pub mod pikevm; +mod range_trie; + +/// A map from capture group name to its corresponding capture index. +/// +/// Since there are always two slots for each capture index, the pair of slots +/// corresponding to the capture index for a pattern ID of 0 are indexed at +/// `map["<name>"] * 2` and `map["<name>"] * 2 + 1`. +/// +/// This type is actually wrapped inside a Vec indexed by pattern ID on the +/// NFA, since multiple patterns may have the same capture group name. +/// +/// Note that this is somewhat of a sub-optimal representation, since it +/// requires a hashmap for each pattern. A better representation would be +/// HashMap<(PatternID, Arc<str>), usize>, but this makes it difficult to look +/// up a capture index by name without producing a `Arc<str>`, which requires +/// an allocation. To fix this, I think we'd need to define our own unsized +/// type or something? +#[cfg(feature = "std")] +type CaptureNameMap = std::collections::HashMap<Arc<str>, usize>; +#[cfg(not(feature = "std"))] +type CaptureNameMap = alloc::collections::BTreeMap<Arc<str>, usize>; + +// The NFA API below is not something I'm terribly proud of at the moment. In +// particular, it supports both mutating the NFA and actually using the NFA to +// perform a search. 
I think combining these two things muddies the waters a +// bit too much. +// +// I think the issue is that I saw the compiler as the 'builder,' and where +// the compiler had the ability to manipulate the internal state of the NFA. +// However, one of my goals was to make it possible for others to build their +// own NFAs in a way that is *not* couple to the regex-syntax crate. +// +// So I think really, there should be an NFA, a NFABuilder and then the +// internal compiler which uses the NFABuilder API to build an NFA. Alas, at +// the time of writing, I kind of ran out of steam. + +/// A fully compiled Thompson NFA. +/// +/// The states of the NFA are indexed by state IDs, which are how transitions +/// are expressed. +#[derive(Clone)] +pub struct NFA { + /// The state list. This list is guaranteed to be indexable by all starting + /// state IDs, and it is also guaranteed to contain at most one `Match` + /// state for each pattern compiled into this NFA. (A pattern may not have + /// a corresponding `Match` state if a `Match` state is impossible to + /// reach.) + states: Vec<State>, + /// The anchored starting state of this NFA. + start_anchored: StateID, + /// The unanchored starting state of this NFA. + start_unanchored: StateID, + /// The starting states for each individual pattern. Starting at any + /// of these states will result in only an anchored search for the + /// corresponding pattern. The vec is indexed by pattern ID. When the NFA + /// contains a single regex, then `start_pattern[0]` and `start_anchored` + /// are always equivalent. + start_pattern: Vec<StateID>, + /// A map from PatternID to its corresponding range of capture slots. Each + /// range is guaranteed to be contiguous with the previous range. The + /// end of the last range corresponds to the total number of slots needed + /// for this NFA. + patterns_to_slots: Vec<Range<usize>>, + /// A map from capture name to its corresponding index. 
So e.g., given + /// a single regex like '(\w+) (\w+) (?P<word>\w+)', the capture name + /// 'word' for pattern ID=0 would corresponding to the index '3'. Its + /// corresponding slots would then be '3 * 2 = 6' and '3 * 2 + 1 = 7'. + capture_name_to_index: Vec<CaptureNameMap>, + /// A map from pattern ID to capture group index to name, if one exists. + /// This is effectively the inverse of 'capture_name_to_index'. The outer + /// vec is indexed by pattern ID, while the inner vec is index by capture + /// index offset for the corresponding pattern. + /// + /// The first capture group for each pattern is always unnamed and is thus + /// always None. + capture_index_to_name: Vec<Vec<Option<Arc<str>>>>, + /// A representation of equivalence classes over the transitions in this + /// NFA. Two bytes in the same equivalence class must not discriminate + /// between a match or a non-match. This map can be used to shrink the + /// total size of a DFA's transition table with a small match-time cost. + /// + /// Note that the NFA's transitions are *not* defined in terms of these + /// equivalence classes. The NFA's transitions are defined on the original + /// byte values. For the most part, this is because they wouldn't really + /// help the NFA much since the NFA already uses a sparse representation + /// to represent transitions. Byte classes are most effective in a dense + /// representation. + byte_class_set: ByteClassSet, + /// Various facts about this NFA, which can be used to improve failure + /// modes (e.g., rejecting DFA construction if an NFA has Unicode word + /// boundaries) or for performing optimizations (avoiding an increase in + /// states if there are no look-around states). + facts: Facts, + /// Heap memory used indirectly by NFA states. Since each state might use a + /// different amount of heap, we need to keep track of this incrementally. 
+ memory_states: usize, +} + +impl NFA { + pub fn config() -> Config { + Config::new() + } + + pub fn builder() -> Builder { + Builder::new() + } + + /// Returns an NFA with no states. Its match semantics are unspecified. + /// + /// An empty NFA is useful as a starting point for building one. It is + /// itself not intended to be used for matching. For example, its starting + /// state identifiers are configured to be `0`, but since it has no states, + /// the identifiers are invalid. + /// + /// If you need an NFA that never matches is anything and can be correctly + /// used for matching, use [`NFA::never_match`]. + #[inline] + pub fn empty() -> NFA { + NFA { + states: vec![], + start_anchored: StateID::ZERO, + start_unanchored: StateID::ZERO, + start_pattern: vec![], + patterns_to_slots: vec![], + capture_name_to_index: vec![], + capture_index_to_name: vec![], + byte_class_set: ByteClassSet::empty(), + facts: Facts::default(), + memory_states: 0, + } + } + + /// Returns an NFA with a single regex that always matches at every + /// position. + #[inline] + pub fn always_match() -> NFA { + let mut nfa = NFA::empty(); + // Since we're only adding one pattern, these are guaranteed to work. + let start = nfa.add_match().unwrap(); + assert_eq!(start.as_usize(), 0); + let pid = nfa.finish_pattern(start).unwrap(); + assert_eq!(pid.as_usize(), 0); + nfa + } + + /// Returns an NFA that never matches at any position. It contains no + /// regexes. + #[inline] + pub fn never_match() -> NFA { + let mut nfa = NFA::empty(); + // Since we're only adding one state, this can never fail. + nfa.add_fail().unwrap(); + nfa + } + + /// Return the number of states in this NFA. + /// + /// This is guaranteed to be no bigger than [`StateID::LIMIT`]. + #[inline] + pub fn len(&self) -> usize { + self.states.len() + } + + /// Returns the total number of distinct match states in this NFA. + /// Stated differently, this returns the total number of regex patterns + /// used to build this NFA. 
+ /// + /// This may return zero if the NFA was constructed with no patterns. In + /// this case, and only this case, the NFA can never produce a match for + /// any input. + /// + /// This is guaranteed to be no bigger than [`PatternID::LIMIT`]. + #[inline] + pub fn pattern_len(&self) -> usize { + self.start_pattern.len() + } + + /// Returns the pattern ID of the pattern currently being compiled by this + /// NFA. + fn current_pattern_id(&self) -> PatternID { + // This always works because we never permit more patterns in + // 'start_pattern' than can be addressed by PatternID. Also, we only + // add a new entry to 'start_pattern' once we finish compiling a + // pattern. Thus, the length refers to the ID of the current pattern + // being compiled. + PatternID::new(self.start_pattern.len()).unwrap() + } + + /// Returns the total number of capturing groups in this NFA. + /// + /// This includes the special 0th capture group that is always present and + /// captures the start and end offset of the entire match. + /// + /// This is a convenience routine for `nfa.capture_slot_len() / 2`. + #[inline] + pub fn capture_len(&self) -> usize { + let slots = self.capture_slot_len(); + // This assert is guaranteed to pass since the NFA construction process + // guarantees that it is always true. + assert_eq!(slots % 2, 0, "capture slots must be divisible by 2"); + slots / 2 + } + + /// Returns the total number of capturing slots in this NFA. + /// + /// This value is guaranteed to be a multiple of 2. (Where each capturing + /// group has precisely two capturing slots in the NFA.) + #[inline] + pub fn capture_slot_len(&self) -> usize { + self.patterns_to_slots.last().map_or(0, |r| r.end) + } + + /// Return a range of capture slots for the given pattern. + /// + /// The range returned is guaranteed to be contiguous with ranges for + /// adjacent patterns. + /// + /// This panics if the given pattern ID is greater than or equal to the + /// number of patterns in this NFA. 
+ #[inline] + pub fn pattern_slots(&self, pid: PatternID) -> Range<usize> { + self.patterns_to_slots[pid].clone() + } + + /// Return the capture group index corresponding to the given name in the + /// given pattern. If no such capture group name exists in the given + /// pattern, then this returns `None`. + /// + /// If the given pattern ID is invalid, then this panics. + #[inline] + pub fn capture_name_to_index( + &self, + pid: PatternID, + name: &str, + ) -> Option<usize> { + assert!(pid.as_usize() < self.pattern_len(), "invalid pattern ID"); + self.capture_name_to_index[pid].get(name).cloned() + } + + // TODO: add iterators over capture group names. + // Do we also permit indexing? + + /// Returns an iterator over all pattern IDs in this NFA. + #[inline] + pub fn patterns(&self) -> PatternIter { + PatternIter { + it: PatternID::iter(self.pattern_len()), + _marker: core::marker::PhantomData, + } + } + + /// Return the ID of the initial anchored state of this NFA. + #[inline] + pub fn start_anchored(&self) -> StateID { + self.start_anchored + } + + /// Set the anchored starting state ID for this NFA. + #[inline] + pub fn set_start_anchored(&mut self, id: StateID) { + self.start_anchored = id; + } + + /// Return the ID of the initial unanchored state of this NFA. + #[inline] + pub fn start_unanchored(&self) -> StateID { + self.start_unanchored + } + + /// Set the unanchored starting state ID for this NFA. + #[inline] + pub fn set_start_unanchored(&mut self, id: StateID) { + self.start_unanchored = id; + } + + /// Return the ID of the initial anchored state for the given pattern. + /// + /// If the pattern doesn't exist in this NFA, then this panics. + #[inline] + pub fn start_pattern(&self, pid: PatternID) -> StateID { + self.start_pattern[pid] + } + + /// Get the byte class set for this NFA. + #[inline] + pub fn byte_class_set(&self) -> &ByteClassSet { + &self.byte_class_set + } + + /// Return a reference to the NFA state corresponding to the given ID. 
+ #[inline] + pub fn state(&self, id: StateID) -> &State { + &self.states[id] + } + + /// Returns a slice of all states in this NFA. + /// + /// The slice returned may be indexed by a `StateID` generated by `add`. + #[inline] + pub fn states(&self) -> &[State] { + &self.states + } + + #[inline] + pub fn is_always_start_anchored(&self) -> bool { + self.start_anchored() == self.start_unanchored() + } + + #[inline] + pub fn has_any_look(&self) -> bool { + self.facts.has_any_look() + } + + #[inline] + pub fn has_any_anchor(&self) -> bool { + self.facts.has_any_anchor() + } + + #[inline] + pub fn has_word_boundary(&self) -> bool { + self.has_word_boundary_unicode() || self.has_word_boundary_ascii() + } + + #[inline] + pub fn has_word_boundary_unicode(&self) -> bool { + self.facts.has_word_boundary_unicode() + } + + #[inline] + pub fn has_word_boundary_ascii(&self) -> bool { + self.facts.has_word_boundary_ascii() + } + + /// Returns the memory usage, in bytes, of this NFA. + /// + /// This does **not** include the stack size used up by this NFA. To + /// compute that, use `std::mem::size_of::<NFA>()`. + #[inline] + pub fn memory_usage(&self) -> usize { + self.states.len() * mem::size_of::<State>() + + self.memory_states + + self.start_pattern.len() * mem::size_of::<StateID>() + } + + // Why do we define a bunch of 'add_*' routines below instead of just + // defining a single 'add' routine that accepts a 'State'? Indeed, for most + // of the 'add_*' routines below, such a simple API would be more than + // appropriate. Unfortunately, adding capture states and, to a lesser + // extent, match states, is a bit more complex. Namely, when we add a + // capture state, we *really* want to know the corresponding capture + // group's name and index and what not, so that we can update other state + // inside this NFA. But, e.g., the capture group name is not and should + // not be included in 'State::Capture'. So what are our choices? 
+ // + // 1) Define one 'add' and require some additional optional parameters. + // This feels quite ugly, and adds unnecessary complexity to more common + // and simpler cases. + // + // 2) Do what we do below. The sad thing is that our API is bigger with + // more methods. But each method is very specific and hopefully simple. + // + // 3) Define a new enum, say, 'StateWithInfo', or something that permits + // providing both a State and some extra ancillary info in some cases. This + // doesn't seem too bad to me, but seems slightly worse than (2) because of + // the additional type required. + // + // 4) Abandon the idea that we have to specify things like the capture + // group name when we add the Capture state to the NFA. We would then need + // to add other methods that permit the caller to add this additional state + // "out of band." Other than it introducing some additional complexity, I + // decided against this because I wanted the NFA builder API to make it + // as hard as possible to build a bad or invalid NFA. Using the approach + // below, as you'll see, permits us to do a lot of strict checking of our + // inputs and return an error if we see something we don't expect. 
+ + pub fn add_range(&mut self, range: Transition) -> Result<StateID, Error> { + self.byte_class_set.set_range(range.start, range.end); + self.add_state(State::Range { range }) + } + + pub fn add_sparse( + &mut self, + sparse: SparseTransitions, + ) -> Result<StateID, Error> { + for range in sparse.ranges.iter() { + self.byte_class_set.set_range(range.start, range.end); + } + self.add_state(State::Sparse(sparse)) + } + + pub fn add_look( + &mut self, + next: StateID, + look: Look, + ) -> Result<StateID, Error> { + self.facts.set_has_any_look(true); + look.add_to_byteset(&mut self.byte_class_set); + match look { + Look::StartLine + | Look::EndLine + | Look::StartText + | Look::EndText => { + self.facts.set_has_any_anchor(true); + } + Look::WordBoundaryUnicode | Look::WordBoundaryUnicodeNegate => { + self.facts.set_has_word_boundary_unicode(true); + } + Look::WordBoundaryAscii | Look::WordBoundaryAsciiNegate => { + self.facts.set_has_word_boundary_ascii(true); + } + } + self.add_state(State::Look { look, next }) + } + + pub fn add_union( + &mut self, + alternates: Box<[StateID]>, + ) -> Result<StateID, Error> { + self.add_state(State::Union { alternates }) + } + + pub fn add_capture_start( + &mut self, + next_id: StateID, + capture_index: u32, + name: Option<Arc<str>>, + ) -> Result<StateID, Error> { + let pid = self.current_pattern_id(); + let capture_index = match usize::try_from(capture_index) { + Err(_) => { + return Err(Error::invalid_capture_index(core::usize::MAX)) + } + Ok(capture_index) => capture_index, + }; + // Do arithmetic to find our absolute slot index first, to make sure + // the index is at least possibly valid (doesn't overflow). 
+ let relative_slot = match capture_index.checked_mul(2) { + Some(relative_slot) => relative_slot, + None => return Err(Error::invalid_capture_index(capture_index)), + }; + let slot = match relative_slot.checked_add(self.capture_slot_len()) { + Some(slot) => slot, + None => return Err(Error::invalid_capture_index(capture_index)), + }; + // Make sure we have space to insert our (pid,index)|-->name mapping. + if pid.as_usize() >= self.capture_index_to_name.len() { + // Note that we require that if you're adding capturing groups, + // then there must be at least one capturing group per pattern. + // Moreover, whenever we expand our space here, it should always + // first be for the first capture group (at index==0). + if pid.as_usize() > self.capture_index_to_name.len() + || capture_index > 0 + { + return Err(Error::invalid_capture_index(capture_index)); + } + self.capture_name_to_index.push(CaptureNameMap::new()); + self.capture_index_to_name.push(vec![]); + } + if capture_index >= self.capture_index_to_name[pid].len() { + // We require that capturing groups are added in correspondence + // to their index. So no discontinuous indices. This is likely + // overly strict, but also makes it simpler to provide guarantees + // about our capturing group data. 
+ if capture_index > self.capture_index_to_name[pid].len() { + return Err(Error::invalid_capture_index(capture_index)); + } + self.capture_index_to_name[pid].push(None); + } + if let Some(ref name) = name { + self.capture_name_to_index[pid] + .insert(Arc::clone(name), capture_index); + } + self.capture_index_to_name[pid][capture_index] = name; + self.add_state(State::Capture { next: next_id, slot }) + } + + pub fn add_capture_end( + &mut self, + next_id: StateID, + capture_index: u32, + ) -> Result<StateID, Error> { + let pid = self.current_pattern_id(); + let capture_index = match usize::try_from(capture_index) { + Err(_) => { + return Err(Error::invalid_capture_index(core::usize::MAX)) + } + Ok(capture_index) => capture_index, + }; + // If we haven't already added this capture group via a corresponding + // 'add_capture_start' call, then we consider the index given to be + // invalid. + if pid.as_usize() >= self.capture_index_to_name.len() + || capture_index >= self.capture_index_to_name[pid].len() + { + return Err(Error::invalid_capture_index(capture_index)); + } + // Since we've already confirmed that this capture index is invalid + // and has a corresponding starting slot, we know the multiplcation + // has already been done and succeeded. + let relative_slot_start = capture_index.checked_mul(2).unwrap(); + let relative_slot = match relative_slot_start.checked_add(1) { + Some(relative_slot) => relative_slot, + None => return Err(Error::invalid_capture_index(capture_index)), + }; + let slot = match relative_slot.checked_add(self.capture_slot_len()) { + Some(slot) => slot, + None => return Err(Error::invalid_capture_index(capture_index)), + }; + self.add_state(State::Capture { next: next_id, slot }) + } + + pub fn add_fail(&mut self) -> Result<StateID, Error> { + self.add_state(State::Fail) + } + + /// Add a new match state to this NFA and return its state ID. 
+ pub fn add_match(&mut self) -> Result<StateID, Error> { + let pattern_id = self.current_pattern_id(); + let sid = self.add_state(State::Match { id: pattern_id })?; + Ok(sid) + } + + /// Finish compiling the current pattern and return its identifier. The + /// given ID should be the state ID corresponding to the anchored starting + /// state for matching this pattern. + pub fn finish_pattern( + &mut self, + start_id: StateID, + ) -> Result<PatternID, Error> { + // We've gotta make sure that we never permit the user to add more + // patterns than we can identify. So if we're already at the limit, + // then return an error. This is somewhat non-ideal since this won't + // result in an error until trying to complete the compilation of a + // pattern instead of starting it. + if self.start_pattern.len() >= PatternID::LIMIT { + return Err(Error::too_many_patterns( + self.start_pattern.len().saturating_add(1), + )); + } + let pid = self.current_pattern_id(); + self.start_pattern.push(start_id); + // Add the number of new slots created by this pattern. This is always + // equivalent to '2 * caps.len()', where 'caps.len()' is the number of + // new capturing groups introduced by the pattern we're finishing. + let new_cap_groups = self + .capture_index_to_name + .get(pid.as_usize()) + .map_or(0, |caps| caps.len()); + let new_slots = match new_cap_groups.checked_mul(2) { + Some(new_slots) => new_slots, + None => { + // Just return the biggest index that we know exists. 
+ let index = new_cap_groups.saturating_sub(1); + return Err(Error::invalid_capture_index(index)); + } + }; + let slot_start = self.capture_slot_len(); + self.patterns_to_slots.push(slot_start..(slot_start + new_slots)); + Ok(pid) + } + + fn add_state(&mut self, state: State) -> Result<StateID, Error> { + let id = StateID::new(self.states.len()) + .map_err(|_| Error::too_many_states(self.states.len()))?; + self.memory_states += state.memory_usage(); + self.states.push(state); + Ok(id) + } + + /// Remap the transitions in every state of this NFA using the given map. + /// The given map should be indexed according to state ID namespace used by + /// the transitions of the states currently in this NFA. + /// + /// This may be used during the final phases of an NFA compiler, which + /// turns its intermediate NFA into the final NFA. Remapping may be + /// required to bring the state pointers from the intermediate NFA to the + /// final NFA. + pub fn remap(&mut self, old_to_new: &[StateID]) { + for state in &mut self.states { + state.remap(old_to_new); + } + self.start_anchored = old_to_new[self.start_anchored]; + self.start_unanchored = old_to_new[self.start_unanchored]; + for (pid, id) in self.start_pattern.iter_mut().with_pattern_ids() { + *id = old_to_new[*id]; + } + } + + /// Clear this NFA such that it has zero states and is otherwise "empty." + /// + /// An empty NFA is useful as a starting point for building one. It is + /// itself not intended to be used for matching. For example, its starting + /// state identifiers are configured to be `0`, but since it has no states, + /// the identifiers are invalid. 
+ pub fn clear(&mut self) { + self.states.clear(); + self.start_anchored = StateID::ZERO; + self.start_unanchored = StateID::ZERO; + self.start_pattern.clear(); + self.patterns_to_slots.clear(); + self.capture_name_to_index.clear(); + self.capture_index_to_name.clear(); + self.byte_class_set = ByteClassSet::empty(); + self.facts = Facts::default(); + self.memory_states = 0; + } +} + +impl fmt::Debug for NFA { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + writeln!(f, "thompson::NFA(")?; + for (sid, state) in self.states.iter().with_state_ids() { + let status = if sid == self.start_anchored { + '^' + } else if sid == self.start_unanchored { + '>' + } else { + ' ' + }; + writeln!(f, "{}{:06?}: {:?}", status, sid.as_usize(), state)?; + } + if self.pattern_len() > 1 { + writeln!(f, "")?; + for pid in self.patterns() { + let sid = self.start_pattern(pid); + writeln!( + f, + "START({:06?}): {:?}", + pid.as_usize(), + sid.as_usize() + )?; + } + } + writeln!(f, "")?; + writeln!( + f, + "transition equivalence classes: {:?}", + self.byte_class_set().byte_classes() + )?; + writeln!(f, ")")?; + Ok(()) + } +} + +/// A state in a final compiled NFA. +#[derive(Clone, Eq, PartialEq)] +pub enum State { + /// A state that transitions to `next` if and only if the current input + /// byte is in the range `[start, end]` (inclusive). + /// + /// This is a special case of Sparse in that it encodes only one transition + /// (and therefore avoids the allocation). + Range { range: Transition }, + /// A state with possibly many transitions, represented in a sparse + /// fashion. Transitions are ordered lexicographically by input range. As + /// such, this may only be used when every transition has equal priority. + /// (In practice, this is only used for encoding UTF-8 automata.) + Sparse(SparseTransitions), + /// A conditional epsilon transition satisfied via some sort of + /// look-around. 
+ Look { look: Look, next: StateID }, + /// An alternation such that there exists an epsilon transition to all + /// states in `alternates`, where matches found via earlier transitions + /// are preferred over later transitions. + Union { alternates: Box<[StateID]> }, + /// An empty state that records a capture location. + /// + /// From the perspective of finite automata, this is precisely equivalent + /// to an epsilon transition, but serves the purpose of instructing NFA + /// simulations to record additional state when the finite state machine + /// passes through this epsilon transition. + /// + /// These transitions are treated as epsilon transitions with no additional + /// effects in DFAs. + /// + /// 'slot' in this context refers to the specific capture group offset that + /// is being recorded. Each capturing group has two slots corresponding to + /// the start and end of the matching portion of that group. + /// A fail state. When encountered, the automaton is guaranteed to never + /// reach a match state. + Capture { next: StateID, slot: usize }, + /// A state that cannot be transitioned out of. If a search reaches this + /// state, then no match is possible and the search should terminate. + Fail, + /// A match state. There is exactly one such occurrence of this state for + /// each regex compiled into the NFA. + Match { id: PatternID }, +} + +impl State { + /// Returns true if and only if this state contains one or more epsilon + /// transitions. + #[inline] + pub fn is_epsilon(&self) -> bool { + match *self { + State::Range { .. } + | State::Sparse { .. } + | State::Fail + | State::Match { .. } => false, + State::Look { .. } + | State::Union { .. } + | State::Capture { .. } => true, + } + } + + /// Returns the heap memory usage of this NFA state in bytes. + fn memory_usage(&self) -> usize { + match *self { + State::Range { .. } + | State::Look { .. } + | State::Capture { .. } + | State::Match { .. 
} + | State::Fail => 0, + State::Sparse(SparseTransitions { ref ranges }) => { + ranges.len() * mem::size_of::<Transition>() + } + State::Union { ref alternates } => { + alternates.len() * mem::size_of::<StateID>() + } + } + } + + /// Remap the transitions in this state using the given map. Namely, the + /// given map should be indexed according to the transitions currently + /// in this state. + /// + /// This is used during the final phase of the NFA compiler, which turns + /// its intermediate NFA into the final NFA. + fn remap(&mut self, remap: &[StateID]) { + match *self { + State::Range { ref mut range } => range.next = remap[range.next], + State::Sparse(SparseTransitions { ref mut ranges }) => { + for r in ranges.iter_mut() { + r.next = remap[r.next]; + } + } + State::Look { ref mut next, .. } => *next = remap[*next], + State::Union { ref mut alternates } => { + for alt in alternates.iter_mut() { + *alt = remap[*alt]; + } + } + State::Capture { ref mut next, .. } => *next = remap[*next], + State::Fail => {} + State::Match { .. } => {} + } + } +} + +impl fmt::Debug for State { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match *self { + State::Range { ref range } => range.fmt(f), + State::Sparse(SparseTransitions { ref ranges }) => { + let rs = ranges + .iter() + .map(|t| format!("{:?}", t)) + .collect::<Vec<String>>() + .join(", "); + write!(f, "sparse({})", rs) + } + State::Look { ref look, next } => { + write!(f, "{:?} => {:?}", look, next.as_usize()) + } + State::Union { ref alternates } => { + let alts = alternates + .iter() + .map(|id| format!("{:?}", id.as_usize())) + .collect::<Vec<String>>() + .join(", "); + write!(f, "alt({})", alts) + } + State::Capture { next, slot } => { + write!(f, "capture({:?}) => {:?}", slot, next.as_usize()) + } + State::Fail => write!(f, "FAIL"), + State::Match { id } => write!(f, "MATCH({:?})", id.as_usize()), + } + } +} + +/// A collection of facts about an NFA. 
+/// +/// There are no real cohesive principles behind what gets put in here. For +/// the most part, it is implementation driven. +#[derive(Clone, Copy, Debug, Default)] +struct Facts { + /// Various yes/no facts about this NFA. + bools: u16, +} + +impl Facts { + define_bool!(0, has_any_look, set_has_any_look); + define_bool!(1, has_any_anchor, set_has_any_anchor); + define_bool!(2, has_word_boundary_unicode, set_has_word_boundary_unicode); + define_bool!(3, has_word_boundary_ascii, set_has_word_boundary_ascii); +} + +/// A sequence of transitions used to represent a sparse state. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct SparseTransitions { + pub ranges: Box<[Transition]>, +} + +impl SparseTransitions { + pub fn matches(&self, haystack: &[u8], at: usize) -> Option<StateID> { + haystack.get(at).and_then(|&b| self.matches_byte(b)) + } + + pub fn matches_unit(&self, unit: alphabet::Unit) -> Option<StateID> { + unit.as_u8().map_or(None, |byte| self.matches_byte(byte)) + } + + pub fn matches_byte(&self, byte: u8) -> Option<StateID> { + for t in self.ranges.iter() { + if t.start > byte { + break; + } else if t.matches_byte(byte) { + return Some(t.next); + } + } + None + + /* + // This is an alternative implementation that uses binary search. In + // some ad hoc experiments, like + // + // smallishru=OpenSubtitles2018.raw.sample.smallish.ru + // regex-cli find nfa thompson pikevm -b "@$smallishru" '\b\w+\b' + // + // I could not observe any improvement, and in fact, things seemed to + // be a bit slower. + self.ranges + .binary_search_by(|t| { + if t.end < byte { + core::cmp::Ordering::Less + } else if t.start > byte { + core::cmp::Ordering::Greater + } else { + core::cmp::Ordering::Equal + } + }) + .ok() + .map(|i| self.ranges[i].next) + */ + } +} + +/// A transition to another state, only if the given byte falls in the +/// inclusive range specified. 
+#[derive(Clone, Copy, Eq, Hash, PartialEq)] +pub struct Transition { + pub start: u8, + pub end: u8, + pub next: StateID, +} + +impl Transition { + pub fn matches(&self, haystack: &[u8], at: usize) -> bool { + haystack.get(at).map_or(false, |&b| self.matches_byte(b)) + } + + pub fn matches_unit(&self, unit: alphabet::Unit) -> bool { + unit.as_u8().map_or(false, |byte| self.matches_byte(byte)) + } + + pub fn matches_byte(&self, byte: u8) -> bool { + self.start <= byte && byte <= self.end + } +} + +impl fmt::Debug for Transition { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + use crate::util::DebugByte; + + let Transition { start, end, next } = *self; + if self.start == self.end { + write!(f, "{:?} => {:?}", DebugByte(start), next.as_usize()) + } else { + write!( + f, + "{:?}-{:?} => {:?}", + DebugByte(start), + DebugByte(end), + next.as_usize(), + ) + } + } +} + +/// A conditional NFA epsilon transition. +/// +/// A simulation of the NFA can only move through this epsilon transition if +/// the current position satisfies some look-around property. Some assertions +/// are look-behind (StartLine, StartText), some assertions are look-ahead +/// (EndLine, EndText) while other assertions are both look-behind and +/// look-ahead (WordBoundary*). +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum Look { + /// The previous position is either `\n` or the current position is the + /// beginning of the haystack (i.e., at position `0`). + StartLine = 1 << 0, + /// The next position is either `\n` or the current position is the end of + /// the haystack (i.e., at position `haystack.len()`). + EndLine = 1 << 1, + /// The current position is the beginning of the haystack (i.e., at + /// position `0`). + StartText = 1 << 2, + /// The current position is the end of the haystack (i.e., at position + /// `haystack.len()`). 
+ EndText = 1 << 3, + /// When tested at position `i`, where `p=decode_utf8_rev(&haystack[..i])` + /// and `n=decode_utf8(&haystack[i..])`, this assertion passes if and only + /// if `is_word(p) != is_word(n)`. If `i=0`, then `is_word(p)=false` and if + /// `i=haystack.len()`, then `is_word(n)=false`. + WordBoundaryUnicode = 1 << 4, + /// Same as for `WordBoundaryUnicode`, but requires that + /// `is_word(p) == is_word(n)`. + WordBoundaryUnicodeNegate = 1 << 5, + /// When tested at position `i`, where `p=haystack[i-1]` and + /// `n=haystack[i]`, this assertion passes if and only if `is_word(p) + /// != is_word(n)`. If `i=0`, then `is_word(p)=false` and if + /// `i=haystack.len()`, then `is_word(n)=false`. + WordBoundaryAscii = 1 << 6, + /// Same as for `WordBoundaryAscii`, but requires that + /// `is_word(p) == is_word(n)`. + /// + /// Note that it is possible for this assertion to match at positions that + /// split the UTF-8 encoding of a codepoint. For this reason, this may only + /// be used when UTF-8 mode is disable in the regex syntax. + WordBoundaryAsciiNegate = 1 << 7, +} + +impl Look { + #[inline(always)] + pub fn matches(&self, bytes: &[u8], at: usize) -> bool { + match *self { + Look::StartLine => at == 0 || bytes[at - 1] == b'\n', + Look::EndLine => at == bytes.len() || bytes[at] == b'\n', + Look::StartText => at == 0, + Look::EndText => at == bytes.len(), + Look::WordBoundaryUnicode => { + let word_before = is_word_char_rev(bytes, at); + let word_after = is_word_char_fwd(bytes, at); + word_before != word_after + } + Look::WordBoundaryUnicodeNegate => { + // This is pretty subtle. Why do we need to do UTF-8 decoding + // here? Well... at time of writing, the is_word_char_{fwd,rev} + // routines will only return true if there is a valid UTF-8 + // encoding of a "word" codepoint, and false in every other + // case (including invalid UTF-8). 
This means that in regions + // of invalid UTF-8 (which might be a subset of valid UTF-8!), + // it would result in \B matching. While this would be + // questionable in the context of truly invalid UTF-8, it is + // *certainly* wrong to report match boundaries that split the + // encoding of a codepoint. So to work around this, we ensure + // that we can decode a codepoint on either side of `at`. If + // either direction fails, then we don't permit \B to match at + // all. + // + // Now, this isn't exactly optimal from a perf perspective. We + // could try and detect this in is_word_char_{fwd,rev}, but + // it's not clear if it's worth it. \B is, after all, rarely + // used. + // + // And in particular, we do *not* have to do this with \b, + // because \b *requires* that at least one side of `at` be a + // "word" codepoint, which in turn implies one side of `at` + // must be valid UTF-8. This in turn implies that \b can never + // split a valid UTF-8 encoding of a codepoint. In the case + // where one side of `at` is truly invalid UTF-8 and the other + // side IS a word codepoint, then we want \b to match since it + // represents a valid UTF-8 boundary. It also makes sense. For + // example, you'd want \b\w+\b to match 'abc' in '\xFFabc\xFF'. 
+ let word_before = at > 0 + && match decode_last_utf8(&bytes[..at]) { + None | Some(Err(_)) => return false, + Some(Ok(_)) => is_word_char_rev(bytes, at), + }; + let word_after = at < bytes.len() + && match decode_utf8(&bytes[at..]) { + None | Some(Err(_)) => return false, + Some(Ok(_)) => is_word_char_fwd(bytes, at), + }; + word_before == word_after + } + Look::WordBoundaryAscii => { + let word_before = at > 0 && is_word_byte(bytes[at - 1]); + let word_after = at < bytes.len() && is_word_byte(bytes[at]); + word_before != word_after + } + Look::WordBoundaryAsciiNegate => { + let word_before = at > 0 && is_word_byte(bytes[at - 1]); + let word_after = at < bytes.len() && is_word_byte(bytes[at]); + word_before == word_after + } + } + } + + /// Create a look-around assertion from its corresponding integer (as + /// defined in `Look`). If the given integer does not correspond to any + /// assertion, then None is returned. + fn from_int(n: u8) -> Option<Look> { + match n { + 0b0000_0001 => Some(Look::StartLine), + 0b0000_0010 => Some(Look::EndLine), + 0b0000_0100 => Some(Look::StartText), + 0b0000_1000 => Some(Look::EndText), + 0b0001_0000 => Some(Look::WordBoundaryUnicode), + 0b0010_0000 => Some(Look::WordBoundaryUnicodeNegate), + 0b0100_0000 => Some(Look::WordBoundaryAscii), + 0b1000_0000 => Some(Look::WordBoundaryAsciiNegate), + _ => None, + } + } + + /// Flip the look-around assertion to its equivalent for reverse searches. 
+ fn reversed(&self) -> Look { + match *self { + Look::StartLine => Look::EndLine, + Look::EndLine => Look::StartLine, + Look::StartText => Look::EndText, + Look::EndText => Look::StartText, + Look::WordBoundaryUnicode => Look::WordBoundaryUnicode, + Look::WordBoundaryUnicodeNegate => Look::WordBoundaryUnicodeNegate, + Look::WordBoundaryAscii => Look::WordBoundaryAscii, + Look::WordBoundaryAsciiNegate => Look::WordBoundaryAsciiNegate, + } + } + + /// Split up the given byte classes into equivalence classes in a way that + /// is consistent with this look-around assertion. + fn add_to_byteset(&self, set: &mut ByteClassSet) { + match *self { + Look::StartText | Look::EndText => {} + Look::StartLine | Look::EndLine => { + set.set_range(b'\n', b'\n'); + } + Look::WordBoundaryUnicode + | Look::WordBoundaryUnicodeNegate + | Look::WordBoundaryAscii + | Look::WordBoundaryAsciiNegate => { + // We need to mark all ranges of bytes whose pairs result in + // evaluating \b differently. This isn't technically correct + // for Unicode word boundaries, but DFAs can't handle those + // anyway, and thus, the byte classes don't need to either + // since they are themselves only used in DFAs. + let iswb = regex_syntax::is_word_byte; + let mut b1: u16 = 0; + let mut b2: u16; + while b1 <= 255 { + b2 = b1 + 1; + while b2 <= 255 && iswb(b1 as u8) == iswb(b2 as u8) { + b2 += 1; + } + set.set_range(b1 as u8, (b2 - 1) as u8); + b1 = b2; + } + } + } + } +} + +/// LookSet is a memory-efficient set of look-around assertions. Callers may +/// idempotently insert or remove any look-around assertion from a set. +#[repr(transparent)] +#[derive(Clone, Copy, Default, Eq, Hash, PartialEq, PartialOrd, Ord)] +pub(crate) struct LookSet { + set: u8, +} + +impl LookSet { + /// Return a LookSet from its representation. + pub(crate) fn from_repr(repr: u8) -> LookSet { + LookSet { set: repr } + } + + /// Return a mutable LookSet from a mutable pointer to its representation. 
+ pub(crate) fn from_repr_mut(repr: &mut u8) -> &mut LookSet { + // SAFETY: This is safe since a LookSet is repr(transparent) where its + // repr is a u8. + unsafe { core::mem::transmute::<&mut u8, &mut LookSet>(repr) } + } + + /// Return true if and only if this set is empty. + pub(crate) fn is_empty(&self) -> bool { + self.set == 0 + } + + /// Clears this set such that it has no assertions in it. + pub(crate) fn clear(&mut self) { + self.set = 0; + } + + /// Insert the given look-around assertion into this set. If the assertion + /// already exists, then this is a no-op. + pub(crate) fn insert(&mut self, look: Look) { + self.set |= look as u8; + } + + /// Remove the given look-around assertion from this set. If the assertion + /// is not in this set, then this is a no-op. + #[cfg(test)] + pub(crate) fn remove(&mut self, look: Look) { + self.set &= !(look as u8); + } + + /// Return true if and only if the given assertion is in this set. + pub(crate) fn contains(&self, look: Look) -> bool { + (look as u8) & self.set != 0 + } + + /// Subtract the given `other` set from the `self` set and return a new + /// set. + pub(crate) fn subtract(&self, other: LookSet) -> LookSet { + LookSet { set: self.set & !other.set } + } + + /// Return the intersection of the given `other` set with the `self` set + /// and return the resulting set. + pub(crate) fn intersect(&self, other: LookSet) -> LookSet { + LookSet { set: self.set & other.set } + } +} + +impl core::fmt::Debug for LookSet { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + let mut members = vec![]; + for i in 0..8 { + let look = match Look::from_int(1 << i) { + None => continue, + Some(look) => look, + }; + if self.contains(look) { + members.push(look); + } + } + f.debug_tuple("LookSet").field(&members).finish() + } +} + +/// An iterator over all pattern IDs in an NFA. 
+pub struct PatternIter<'a> { + it: PatternIDIter, + /// We explicitly associate a lifetime with this iterator even though we + /// don't actually borrow anything from the NFA. We do this for backward + /// compatibility purposes. If we ever do need to borrow something from + /// the NFA, then we can and just get rid of this marker without breaking + /// the public API. + _marker: core::marker::PhantomData<&'a ()>, +} + +impl<'a> Iterator for PatternIter<'a> { + type Item = PatternID; + + fn next(&mut self) -> Option<PatternID> { + self.it.next() + } +} + +#[cfg(test)] +mod tests { + use super::*; + // TODO: Replace tests using DFA with NFA matching engine once implemented. + use crate::dfa::{dense, Automaton}; + + #[test] + fn always_match() { + let nfa = NFA::always_match(); + let dfa = dense::Builder::new().build_from_nfa(&nfa).unwrap(); + let find = |input, start, end| { + dfa.find_leftmost_fwd_at(None, None, input, start, end) + .unwrap() + .map(|m| m.offset()) + }; + + assert_eq!(Some(0), find(b"", 0, 0)); + assert_eq!(Some(0), find(b"a", 0, 1)); + assert_eq!(Some(1), find(b"a", 1, 1)); + assert_eq!(Some(0), find(b"ab", 0, 2)); + assert_eq!(Some(1), find(b"ab", 1, 2)); + assert_eq!(Some(2), find(b"ab", 2, 2)); + } + + #[test] + fn never_match() { + let nfa = NFA::never_match(); + let dfa = dense::Builder::new().build_from_nfa(&nfa).unwrap(); + let find = |input, start, end| { + dfa.find_leftmost_fwd_at(None, None, input, start, end) + .unwrap() + .map(|m| m.offset()) + }; + + assert_eq!(None, find(b"", 0, 0)); + assert_eq!(None, find(b"a", 0, 1)); + assert_eq!(None, find(b"a", 1, 1)); + assert_eq!(None, find(b"ab", 0, 2)); + assert_eq!(None, find(b"ab", 1, 2)); + assert_eq!(None, find(b"ab", 2, 2)); + } + + #[test] + fn look_set() { + let mut f = LookSet::default(); + assert!(!f.contains(Look::StartText)); + assert!(!f.contains(Look::EndText)); + assert!(!f.contains(Look::StartLine)); + assert!(!f.contains(Look::EndLine)); + 
assert!(!f.contains(Look::WordBoundaryUnicode)); + assert!(!f.contains(Look::WordBoundaryUnicodeNegate)); + assert!(!f.contains(Look::WordBoundaryAscii)); + assert!(!f.contains(Look::WordBoundaryAsciiNegate)); + + f.insert(Look::StartText); + assert!(f.contains(Look::StartText)); + f.remove(Look::StartText); + assert!(!f.contains(Look::StartText)); + + f.insert(Look::EndText); + assert!(f.contains(Look::EndText)); + f.remove(Look::EndText); + assert!(!f.contains(Look::EndText)); + + f.insert(Look::StartLine); + assert!(f.contains(Look::StartLine)); + f.remove(Look::StartLine); + assert!(!f.contains(Look::StartLine)); + + f.insert(Look::EndLine); + assert!(f.contains(Look::EndLine)); + f.remove(Look::EndLine); + assert!(!f.contains(Look::EndLine)); + + f.insert(Look::WordBoundaryUnicode); + assert!(f.contains(Look::WordBoundaryUnicode)); + f.remove(Look::WordBoundaryUnicode); + assert!(!f.contains(Look::WordBoundaryUnicode)); + + f.insert(Look::WordBoundaryUnicodeNegate); + assert!(f.contains(Look::WordBoundaryUnicodeNegate)); + f.remove(Look::WordBoundaryUnicodeNegate); + assert!(!f.contains(Look::WordBoundaryUnicodeNegate)); + + f.insert(Look::WordBoundaryAscii); + assert!(f.contains(Look::WordBoundaryAscii)); + f.remove(Look::WordBoundaryAscii); + assert!(!f.contains(Look::WordBoundaryAscii)); + + f.insert(Look::WordBoundaryAsciiNegate); + assert!(f.contains(Look::WordBoundaryAsciiNegate)); + f.remove(Look::WordBoundaryAsciiNegate); + assert!(!f.contains(Look::WordBoundaryAsciiNegate)); + } + + #[test] + fn look_matches_start_line() { + let look = Look::StartLine; + + assert!(look.matches(B(""), 0)); + assert!(look.matches(B("\n"), 0)); + assert!(look.matches(B("\n"), 1)); + assert!(look.matches(B("a"), 0)); + assert!(look.matches(B("\na"), 1)); + + assert!(!look.matches(B("a"), 1)); + assert!(!look.matches(B("a\na"), 1)); + } + + #[test] + fn look_matches_end_line() { + let look = Look::EndLine; + + assert!(look.matches(B(""), 0)); + 
assert!(look.matches(B("\n"), 1)); + assert!(look.matches(B("\na"), 0)); + assert!(look.matches(B("\na"), 2)); + assert!(look.matches(B("a\na"), 1)); + + assert!(!look.matches(B("a"), 0)); + assert!(!look.matches(B("\na"), 1)); + assert!(!look.matches(B("a\na"), 0)); + assert!(!look.matches(B("a\na"), 2)); + } + + #[test] + fn look_matches_start_text() { + let look = Look::StartText; + + assert!(look.matches(B(""), 0)); + assert!(look.matches(B("\n"), 0)); + assert!(look.matches(B("a"), 0)); + + assert!(!look.matches(B("\n"), 1)); + assert!(!look.matches(B("\na"), 1)); + assert!(!look.matches(B("a"), 1)); + assert!(!look.matches(B("a\na"), 1)); + } + + #[test] + fn look_matches_end_text() { + let look = Look::EndText; + + assert!(look.matches(B(""), 0)); + assert!(look.matches(B("\n"), 1)); + assert!(look.matches(B("\na"), 2)); + + assert!(!look.matches(B("\na"), 0)); + assert!(!look.matches(B("a\na"), 1)); + assert!(!look.matches(B("a"), 0)); + assert!(!look.matches(B("\na"), 1)); + assert!(!look.matches(B("a\na"), 0)); + assert!(!look.matches(B("a\na"), 2)); + } + + #[test] + fn look_matches_word_unicode() { + let look = Look::WordBoundaryUnicode; + + // \xF0\x9D\x9B\x83 = 𝛃 (in \w) + // \xF0\x90\x86\x80 = 𐆀 (not in \w) + + // Simple ASCII word boundaries. + assert!(look.matches(B("a"), 0)); + assert!(look.matches(B("a"), 1)); + assert!(look.matches(B("a "), 1)); + assert!(look.matches(B(" a "), 1)); + assert!(look.matches(B(" a "), 2)); + + // Unicode word boundaries with a non-ASCII codepoint. + assert!(look.matches(B("𝛃"), 0)); + assert!(look.matches(B("𝛃"), 4)); + assert!(look.matches(B("𝛃 "), 4)); + assert!(look.matches(B(" 𝛃 "), 1)); + assert!(look.matches(B(" 𝛃 "), 5)); + + // Unicode word boundaries between non-ASCII codepoints. + assert!(look.matches(B("𝛃𐆀"), 0)); + assert!(look.matches(B("𝛃𐆀"), 4)); + + // Non word boundaries for ASCII. 
+ assert!(!look.matches(B(""), 0)); + assert!(!look.matches(B("ab"), 1)); + assert!(!look.matches(B("a "), 2)); + assert!(!look.matches(B(" a "), 0)); + assert!(!look.matches(B(" a "), 3)); + + // Non word boundaries with a non-ASCII codepoint. + assert!(!look.matches(B("𝛃b"), 4)); + assert!(!look.matches(B("𝛃 "), 5)); + assert!(!look.matches(B(" 𝛃 "), 0)); + assert!(!look.matches(B(" 𝛃 "), 6)); + assert!(!look.matches(B("𝛃"), 1)); + assert!(!look.matches(B("𝛃"), 2)); + assert!(!look.matches(B("𝛃"), 3)); + + // Non word boundaries with non-ASCII codepoints. + assert!(!look.matches(B("𝛃𐆀"), 1)); + assert!(!look.matches(B("𝛃𐆀"), 2)); + assert!(!look.matches(B("𝛃𐆀"), 3)); + assert!(!look.matches(B("𝛃𐆀"), 5)); + assert!(!look.matches(B("𝛃𐆀"), 6)); + assert!(!look.matches(B("𝛃𐆀"), 7)); + assert!(!look.matches(B("𝛃𐆀"), 8)); + } + + #[test] + fn look_matches_word_ascii() { + let look = Look::WordBoundaryAscii; + + // \xF0\x9D\x9B\x83 = 𝛃 (in \w) + // \xF0\x90\x86\x80 = 𐆀 (not in \w) + + // Simple ASCII word boundaries. + assert!(look.matches(B("a"), 0)); + assert!(look.matches(B("a"), 1)); + assert!(look.matches(B("a "), 1)); + assert!(look.matches(B(" a "), 1)); + assert!(look.matches(B(" a "), 2)); + + // Unicode word boundaries with a non-ASCII codepoint. Since this is + // an ASCII word boundary, none of these match. + assert!(!look.matches(B("𝛃"), 0)); + assert!(!look.matches(B("𝛃"), 4)); + assert!(!look.matches(B("𝛃 "), 4)); + assert!(!look.matches(B(" 𝛃 "), 1)); + assert!(!look.matches(B(" 𝛃 "), 5)); + + // Unicode word boundaries between non-ASCII codepoints. Again, since + // this is an ASCII word boundary, none of these match. + assert!(!look.matches(B("𝛃𐆀"), 0)); + assert!(!look.matches(B("𝛃𐆀"), 4)); + + // Non word boundaries for ASCII. 
+ assert!(!look.matches(B(""), 0)); + assert!(!look.matches(B("ab"), 1)); + assert!(!look.matches(B("a "), 2)); + assert!(!look.matches(B(" a "), 0)); + assert!(!look.matches(B(" a "), 3)); + + // Non word boundaries with a non-ASCII codepoint. + assert!(look.matches(B("𝛃b"), 4)); + assert!(!look.matches(B("𝛃 "), 5)); + assert!(!look.matches(B(" 𝛃 "), 0)); + assert!(!look.matches(B(" 𝛃 "), 6)); + assert!(!look.matches(B("𝛃"), 1)); + assert!(!look.matches(B("𝛃"), 2)); + assert!(!look.matches(B("𝛃"), 3)); + + // Non word boundaries with non-ASCII codepoints. + assert!(!look.matches(B("𝛃𐆀"), 1)); + assert!(!look.matches(B("𝛃𐆀"), 2)); + assert!(!look.matches(B("𝛃𐆀"), 3)); + assert!(!look.matches(B("𝛃𐆀"), 5)); + assert!(!look.matches(B("𝛃𐆀"), 6)); + assert!(!look.matches(B("𝛃𐆀"), 7)); + assert!(!look.matches(B("𝛃𐆀"), 8)); + } + + #[test] + fn look_matches_word_unicode_negate() { + let look = Look::WordBoundaryUnicodeNegate; + + // \xF0\x9D\x9B\x83 = 𝛃 (in \w) + // \xF0\x90\x86\x80 = 𐆀 (not in \w) + + // Simple ASCII word boundaries. + assert!(!look.matches(B("a"), 0)); + assert!(!look.matches(B("a"), 1)); + assert!(!look.matches(B("a "), 1)); + assert!(!look.matches(B(" a "), 1)); + assert!(!look.matches(B(" a "), 2)); + + // Unicode word boundaries with a non-ASCII codepoint. + assert!(!look.matches(B("𝛃"), 0)); + assert!(!look.matches(B("𝛃"), 4)); + assert!(!look.matches(B("𝛃 "), 4)); + assert!(!look.matches(B(" 𝛃 "), 1)); + assert!(!look.matches(B(" 𝛃 "), 5)); + + // Unicode word boundaries between non-ASCII codepoints. + assert!(!look.matches(B("𝛃𐆀"), 0)); + assert!(!look.matches(B("𝛃𐆀"), 4)); + + // Non word boundaries for ASCII. + assert!(look.matches(B(""), 0)); + assert!(look.matches(B("ab"), 1)); + assert!(look.matches(B("a "), 2)); + assert!(look.matches(B(" a "), 0)); + assert!(look.matches(B(" a "), 3)); + + // Non word boundaries with a non-ASCII codepoint. 
+ assert!(look.matches(B("𝛃b"), 4)); + assert!(look.matches(B("𝛃 "), 5)); + assert!(look.matches(B(" 𝛃 "), 0)); + assert!(look.matches(B(" 𝛃 "), 6)); + // These don't match because they could otherwise return an offset that + // splits the UTF-8 encoding of a codepoint. + assert!(!look.matches(B("𝛃"), 1)); + assert!(!look.matches(B("𝛃"), 2)); + assert!(!look.matches(B("𝛃"), 3)); + + // Non word boundaries with non-ASCII codepoints. These also don't + // match because they could otherwise return an offset that splits the + // UTF-8 encoding of a codepoint. + assert!(!look.matches(B("𝛃𐆀"), 1)); + assert!(!look.matches(B("𝛃𐆀"), 2)); + assert!(!look.matches(B("𝛃𐆀"), 3)); + assert!(!look.matches(B("𝛃𐆀"), 5)); + assert!(!look.matches(B("𝛃𐆀"), 6)); + assert!(!look.matches(B("𝛃𐆀"), 7)); + // But this one does, since 𐆀 isn't a word codepoint, and 8 is the end + // of the haystack. So the "end" of the haystack isn't a word and 𐆀 + // isn't a word, thus, \B matches. + assert!(look.matches(B("𝛃𐆀"), 8)); + } + + #[test] + fn look_matches_word_ascii_negate() { + let look = Look::WordBoundaryAsciiNegate; + + // \xF0\x9D\x9B\x83 = 𝛃 (in \w) + // \xF0\x90\x86\x80 = 𐆀 (not in \w) + + // Simple ASCII word boundaries. + assert!(!look.matches(B("a"), 0)); + assert!(!look.matches(B("a"), 1)); + assert!(!look.matches(B("a "), 1)); + assert!(!look.matches(B(" a "), 1)); + assert!(!look.matches(B(" a "), 2)); + + // Unicode word boundaries with a non-ASCII codepoint. Since this is + // an ASCII word boundary, none of these match. + assert!(look.matches(B("𝛃"), 0)); + assert!(look.matches(B("𝛃"), 4)); + assert!(look.matches(B("𝛃 "), 4)); + assert!(look.matches(B(" 𝛃 "), 1)); + assert!(look.matches(B(" 𝛃 "), 5)); + + // Unicode word boundaries between non-ASCII codepoints. Again, since + // this is an ASCII word boundary, none of these match. + assert!(look.matches(B("𝛃𐆀"), 0)); + assert!(look.matches(B("𝛃𐆀"), 4)); + + // Non word boundaries for ASCII. 
+ assert!(look.matches(B(""), 0)); + assert!(look.matches(B("ab"), 1)); + assert!(look.matches(B("a "), 2)); + assert!(look.matches(B(" a "), 0)); + assert!(look.matches(B(" a "), 3)); + + // Non word boundaries with a non-ASCII codepoint. + assert!(!look.matches(B("𝛃b"), 4)); + assert!(look.matches(B("𝛃 "), 5)); + assert!(look.matches(B(" 𝛃 "), 0)); + assert!(look.matches(B(" 𝛃 "), 6)); + assert!(look.matches(B("𝛃"), 1)); + assert!(look.matches(B("𝛃"), 2)); + assert!(look.matches(B("𝛃"), 3)); + + // Non word boundaries with non-ASCII codepoints. + assert!(look.matches(B("𝛃𐆀"), 1)); + assert!(look.matches(B("𝛃𐆀"), 2)); + assert!(look.matches(B("𝛃𐆀"), 3)); + assert!(look.matches(B("𝛃𐆀"), 5)); + assert!(look.matches(B("𝛃𐆀"), 6)); + assert!(look.matches(B("𝛃𐆀"), 7)); + assert!(look.matches(B("𝛃𐆀"), 8)); + } + + fn B<'a, T: 'a + ?Sized + AsRef<[u8]>>(string: &'a T) -> &'a [u8] { + string.as_ref() + } +} |