diff options
Diffstat (limited to 'vendor/regex-automata/src/util/determinize/state.rs')
-rw-r--r-- | vendor/regex-automata/src/util/determinize/state.rs | 191 |
1 files changed, 112 insertions, 79 deletions
diff --git a/vendor/regex-automata/src/util/determinize/state.rs b/vendor/regex-automata/src/util/determinize/state.rs index 567e600d6..e64123587 100644 --- a/vendor/regex-automata/src/util/determinize/state.rs +++ b/vendor/regex-automata/src/util/determinize/state.rs @@ -10,13 +10,13 @@ The term "DFA state" is somewhat overloaded in this crate. In some cases, it refers to the set of transitions over an alphabet for a particular state. In other cases, it refers to a set of NFA states. The former is really about the final representation of a state in a DFA's transition table, where as the -latter---what this module is focusedon---is closer to an intermediate form that -is used to help eventually build the transition table. +latter---what this module is focused on---is closer to an intermediate form +that is used to help eventually build the transition table. This module exports four types. All four types represent the same idea: an ordered set of NFA states. This ordered set represents the epsilon closure of a particular NFA state, where the "epsilon closure" is the set of NFA states that -can be transitioned to without consuming any input. i.e., Follow all of theNFA +can be transitioned to without consuming any input. i.e., Follow all of the NFA state's epsilon transitions. In addition, this implementation of DFA states cares about two other things: the ordered set of pattern IDs corresponding to the patterns that match if the state is a match state, and the set of @@ -46,9 +46,11 @@ a copy). Here are the three types described succinctly: and no NFA states. Creating a `StateBuilderEmpty` performs no allocs. A `StateBuilderEmpty` can only be used to query its underlying memory capacity, or to convert into a builder for recording pattern IDs and/or assertions. + * `StateBuilderMatches` represents a state with zero or more pattern IDs, zero or more satisfied assertions and zero NFA state IDs. A `StateBuilderMatches` can only be used for adding pattern IDs and recording assertions. + * `StateBuilderNFA` represents a state with zero or more pattern IDs, zero or more satisfied assertions and zero or more NFA state IDs. A `StateBuilderNFA` can only be used for adding NFA state IDs and recording some assertions. @@ -58,7 +60,7 @@ DFA state to check if it already exists. If it does, then there's no need to freeze it into a `State`. It it doesn't exist, then `StateBuilderNFA::to_state` can be called to freeze the builder into an immutable `State`. In either case, `clear` should be called on the builder to turn it back into a -`StateBuilderEmpty` that reuses the underyling memory. +`StateBuilderEmpty` that reuses the underlying memory. The main purpose for splitting the builder into these distinct types is to make it impossible to do things like adding a pattern ID after adding an NFA @@ -68,7 +70,7 @@ type below.) If we just used one type for everything, it would be possible for callers to use an incorrect interleaving of calls and thus result in a corrupt representation. I chose to use more type machinery to make this impossible to do because 1) determinization is itself pretty complex and it wouldn't be too -hard to foul this up and 2) there isn't too much machinery involve and it's +hard to foul this up and 2) there isn't too much machinery involved and it's well contained. As an optimization, sometimes states won't have certain things set. For @@ -88,12 +90,11 @@ use core::{convert::TryFrom, mem}; use alloc::{sync::Arc, vec::Vec}; -use crate::{ - nfa::thompson::LookSet, - util::{ - bytes::{self, Endian}, - id::{PatternID, StateID}, - }, +use crate::util::{ + int::{I32, U32}, + look::LookSet, + primitives::{PatternID, StateID}, + wire::{self, Endian}, }; /// A DFA state that, at its core, is represented by an ordered set of NFA @@ -102,7 +103,7 @@ use crate::{ /// This type is intended to be used only in NFA-to-DFA conversion via powerset /// construction. /// -/// It may be cheaply cloned and accessed safely from mulitple threads +/// It may be cheaply cloned and accessed safely from multiple threads /// simultaneously. #[derive(Clone, Eq, Hash, PartialEq, PartialOrd, Ord)] pub(crate) struct State(Arc<[u8]>); @@ -138,6 +139,10 @@ impl State { self.repr().is_from_word() } + pub(crate) fn is_half_crlf(&self) -> bool { + self.repr().is_half_crlf() + } + pub(crate) fn look_have(&self) -> LookSet { self.repr().look_have() } @@ -146,8 +151,8 @@ impl State { self.repr().look_need() } - pub(crate) fn match_count(&self) -> usize { - self.repr().match_count() + pub(crate) fn match_len(&self) -> usize { + self.repr().match_len() } pub(crate) fn match_pattern(&self, index: usize) -> PatternID { @@ -158,6 +163,7 @@ impl State { self.repr().match_pattern_ids() } + #[cfg(all(test, not(miri)))] pub(crate) fn iter_match_pattern_ids<F: FnMut(PatternID)>(&self, f: F) { self.repr().iter_match_pattern_ids(f) } @@ -191,7 +197,7 @@ impl StateBuilderEmpty { } pub(crate) fn into_matches(mut self) -> StateBuilderMatches { - self.0.extend_from_slice(&[0, 0, 0]); + self.0.extend_from_slice(&[0, 0, 0, 0, 0]); StateBuilderMatches(self.0) } @@ -224,30 +230,23 @@ impl StateBuilderMatches { StateBuilderNFA { repr: self.0, prev_nfa_state_id: StateID::ZERO } } - pub(crate) fn clear(self) -> StateBuilderEmpty { - let mut builder = StateBuilderEmpty(self.0); - builder.clear(); - builder - } - - pub(crate) fn is_match(&self) -> bool { - self.repr().is_match() - } - - pub(crate) fn is_from_word(&self) -> bool { - self.repr().is_from_word() - } - pub(crate) fn set_is_from_word(&mut self) { self.repr_vec().set_is_from_word() } - pub(crate) fn look_have(&mut self) -> &mut LookSet { - LookSet::from_repr_mut(&mut self.0[1]) + pub(crate) fn set_is_half_crlf(&mut self) { + self.repr_vec().set_is_half_crlf() + } + + pub(crate) fn look_have(&self) -> LookSet { + LookSet::read_repr(&self.0[1..]) } - pub(crate) fn look_need(&mut self) -> &mut LookSet { - LookSet::from_repr_mut(&mut self.0[2]) + pub(crate) fn set_look_have( + &mut self, + set: impl FnMut(LookSet) -> LookSet, + ) { + self.repr_vec().set_look_have(set) } pub(crate) fn add_match_pattern_id(&mut self, pid: PatternID) { @@ -295,20 +294,22 @@ impl StateBuilderNFA { builder } - pub(crate) fn is_match(&self) -> bool { - self.repr().is_match() - } - - pub(crate) fn is_from_word(&self) -> bool { - self.repr().is_from_word() + pub(crate) fn look_need(&self) -> LookSet { + self.repr().look_need() } - pub(crate) fn look_have(&mut self) -> &mut LookSet { - LookSet::from_repr_mut(&mut self.repr[1]) + pub(crate) fn set_look_have( + &mut self, + set: impl FnMut(LookSet) -> LookSet, + ) { + self.repr_vec().set_look_have(set) } - pub(crate) fn look_need(&mut self) -> &mut LookSet { - LookSet::from_repr_mut(&mut self.repr[2]) + pub(crate) fn set_look_need( + &mut self, + set: impl FnMut(LookSet) -> LookSet, + ) { + self.repr_vec().set_look_need(set) } pub(crate) fn add_nfa_state_id(&mut self, sid: StateID) { @@ -316,10 +317,6 @@ impl StateBuilderNFA { .add_nfa_state_id(&mut self.prev_nfa_state_id, sid) } - pub(crate) fn memory_usage(&self) -> usize { - self.repr.len() - } - pub(crate) fn as_bytes(&self) -> &[u8] { &self.repr } @@ -355,8 +352,8 @@ impl StateBuilderNFA { /// /// Byte 1 corresponds to the look-behind assertions that were satisfied by /// the transition that created this state. This generally only includes the -/// StartLine and StartText assertions. (Look-ahead assertions are not tracked -/// as part of states. Instead, these are applied by re-computing the epsilon +/// StartLF and Start assertions. (Look-ahead assertions are not tracked as +/// part of states. Instead, these are applied by re-computing the epsilon /// closure of a state when computing the transition function. See `next` in /// the parent module.) /// @@ -425,6 +422,14 @@ impl<'a> Repr<'a> { self.0[0] & (1 << 2) > 0 } + /// Returns true if and only if this state is marked as being inside of a + /// CRLF terminator. In the forward direction, this means the state was + /// created after seeing a `\r`. In the reverse direction, this means the + /// state was created after seeing a `\n`. + fn is_half_crlf(&self) -> bool { + self.0[0] & (1 << 3) > 0 + } + /// The set of look-behind assertions that were true in the transition that /// created this state. /// @@ -436,7 +441,7 @@ impl<'a> Repr<'a> { /// these are re-computed on demand via epsilon closure when computing the /// transition function. fn look_have(&self) -> LookSet { - LookSet::from_repr(self.0[1]) + LookSet::read_repr(&self.0[1..]) } /// The set of look-around (both behind and ahead) assertions that appear @@ -447,34 +452,34 @@ impl<'a> Repr<'a> { /// state has no conditional epsilon transitions, then there is no need /// to re-compute the epsilon closure. fn look_need(&self) -> LookSet { - LookSet::from_repr(self.0[2]) + LookSet::read_repr(&self.0[3..]) } /// Returns the total number of match pattern IDs in this state. /// /// If this state is not a match state, then this always returns 0. - fn match_count(&self) -> usize { + fn match_len(&self) -> usize { if !self.is_match() { return 0; } else if !self.has_pattern_ids() { 1 } else { - self.encoded_pattern_count() + self.encoded_pattern_len() } } /// Returns the pattern ID for this match state at the given index. /// - /// If the given index is greater than or equal to `match_count()` for this + /// If the given index is greater than or equal to `match_len()` for this /// state, then this could panic or return incorrect results. fn match_pattern(&self, index: usize) -> PatternID { if !self.has_pattern_ids() { PatternID::ZERO } else { - let offset = 7 + index * PatternID::SIZE; + let offset = 9 + index * PatternID::SIZE; // This is OK since we only ever serialize valid PatternIDs to // states. - bytes::read_pattern_id_unchecked(&self.0[offset..]).0 + wire::read_pattern_id_unchecked(&self.0[offset..]).0 } } @@ -502,9 +507,9 @@ impl<'a> Repr<'a> { f(PatternID::ZERO); return; } - let mut pids = &self.0[7..self.pattern_offset_end()]; + let mut pids = &self.0[9..self.pattern_offset_end()]; while !pids.is_empty() { - let pid = bytes::read_u32(pids); + let pid = wire::read_u32(pids); pids = &pids[PatternID::SIZE..]; // This is OK since we only ever serialize valid PatternIDs to // states. And since pattern IDs can never exceed a usize, the @@ -525,20 +530,20 @@ impl<'a> Repr<'a> { // This is OK since we only ever serialize valid StateIDs to // states. And since state IDs can never exceed an isize, they must // always be able to fit into a usize, and thus cast is OK. - f(StateID::new_unchecked(sid as usize)) + f(StateID::new_unchecked(sid.as_usize())) } } /// Returns the offset into this state's representation where the pattern /// IDs end and the NFA state IDs begin. fn pattern_offset_end(&self) -> usize { - let encoded = self.encoded_pattern_count(); + let encoded = self.encoded_pattern_len(); if encoded == 0 { - return 3; + return 5; } // This arithmetic is OK since we were able to address this many bytes // when writing to the state, thus, it must fit into a usize. - encoded.checked_mul(4).unwrap().checked_add(7).unwrap() + encoded.checked_mul(4).unwrap().checked_add(9).unwrap() } /// Returns the total number of *encoded* pattern IDs in this state. @@ -546,13 +551,13 @@ impl<'a> Repr<'a> { /// This may return 0 even when this is a match state, since the pattern /// ID `PatternID::ZERO` is not encoded when it's the only pattern ID in /// the match state (the overwhelming common case). - fn encoded_pattern_count(&self) -> usize { + fn encoded_pattern_len(&self) -> usize { if !self.has_pattern_ids() { return 0; } // This unwrap is OK since the total number of patterns is always // guaranteed to fit into a usize. - usize::try_from(bytes::read_u32(&self.0[3..7])).unwrap() + usize::try_from(wire::read_u32(&self.0[5..9])).unwrap() } } @@ -563,6 +568,7 @@ impl<'a> core::fmt::Debug for Repr<'a> { f.debug_struct("Repr") .field("is_match", &self.is_match()) .field("is_from_word", &self.is_from_word()) + .field("is_half_crlf", &self.is_half_crlf()) .field("look_have", &self.look_have()) .field("look_need", &self.look_need()) .field("match_pattern_ids", &self.match_pattern_ids()) @@ -608,14 +614,36 @@ impl<'a> ReprVec<'a> { self.0[0] |= 1 << 2; } - /// Return a mutable reference to the 'look_have' assertion set. - fn look_have_mut(&mut self) -> &mut LookSet { - LookSet::from_repr_mut(&mut self.0[1]) + /// Set this state as having seen half of a CRLF terminator. + /// + /// In the forward direction, this should be set when a `\r` has been seen. + /// In the reverse direction, this should be set when a `\n` has been seen. + fn set_is_half_crlf(&mut self) { + self.0[0] |= 1 << 3; } - /// Return a mutable reference to the 'look_need' assertion set. - fn look_need_mut(&mut self) -> &mut LookSet { - LookSet::from_repr_mut(&mut self.0[2]) + /// The set of look-behind assertions that were true in the transition that + /// created this state. + fn look_have(&self) -> LookSet { + self.repr().look_have() + } + + /// The set of look-around (both behind and ahead) assertions that appear + /// at least once in this state's set of NFA states. + fn look_need(&self) -> LookSet { + self.repr().look_need() + } + + /// Mutate the set of look-behind assertions that were true in the + /// transition that created this state. + fn set_look_have(&mut self, mut set: impl FnMut(LookSet) -> LookSet) { + set(self.look_have()).write_repr(&mut self.0[1..]); + } + + /// Mutate the set of look-around (both behind and ahead) assertions that + /// appear at least once in this state's set of NFA states. + fn set_look_need(&mut self, mut set: impl FnMut(LookSet) -> LookSet) { + set(self.look_need()).write_repr(&mut self.0[3..]); } /// Add a pattern ID to this state. All match states must have at least @@ -675,14 +703,14 @@ impl<'a> ReprVec<'a> { return; } let patsize = PatternID::SIZE; - let pattern_bytes = self.0.len() - 7; + let pattern_bytes = self.0.len() - 9; // Every pattern ID uses 4 bytes, so number of bytes should be // divisible by 4. assert_eq!(pattern_bytes % patsize, 0); // This unwrap is OK since we are guaranteed that the maximum number // of possible patterns fits into a u32. let count32 = u32::try_from(pattern_bytes / patsize).unwrap(); - bytes::NE::write_u32(count32, &mut self.0[3..7]); + wire::NE::write_u32(count32, &mut self.0[5..9]); } /// Add an NFA state ID to this state. The order in which NFA states are @@ -704,7 +732,7 @@ impl<'a> ReprVec<'a> { /// /// https://developers.google.com/protocol-buffers/docs/encoding#varints fn write_vari32(data: &mut Vec<u8>, n: i32) { - let mut un = (n as u32) << 1; + let mut un = n.to_bits() << 1; if n < 0 { un = !un; } @@ -717,7 +745,7 @@ fn write_vari32(data: &mut Vec<u8>, n: i32) { /// https://developers.google.com/protocol-buffers/docs/encoding#varints fn read_vari32(data: &[u8]) -> (i32, usize) { let (un, i) = read_varu32(data); - let mut n = (un >> 1) as i32; + let mut n = i32::from_bits(un >> 1); if un & 1 != 0 { n = !n; } @@ -733,10 +761,10 @@ fn read_vari32(data: &[u8]) -> (i32, usize) { /// https://developers.google.com/protocol-buffers/docs/encoding#varints fn write_varu32(data: &mut Vec<u8>, mut n: u32) { while n >= 0b1000_0000 { - data.push((n as u8) | 0b1000_0000); + data.push(n.low_u8() | 0b1000_0000); n >>= 7; } - data.push(n as u8); + data.push(n.low_u8()); } /// Read an unsigned 32-bit varint. Also, return the number of bytes read. @@ -750,9 +778,9 @@ fn read_varu32(data: &[u8]) -> (u32, usize) { let mut shift: u32 = 0; for (i, &b) in data.iter().enumerate() { if b < 0b1000_0000 { - return (n | ((b as u32) << shift), i + 1); + return (n | (u32::from(b) << shift), i + 1); } - n |= ((b as u32) & 0b0111_1111) << shift; + n |= (u32::from(b) & 0b0111_1111) << shift; shift += 7; } (0, 0) @@ -760,7 +788,7 @@ fn read_varu32(data: &[u8]) -> (u32, usize) { /// Push a native-endian encoded `n` on to `dst`. fn write_u32(dst: &mut Vec<u8>, n: u32) { - use crate::util::bytes::{Endian, NE}; + use crate::util::wire::NE; let start = dst.len(); dst.extend(core::iter::repeat(0).take(mem::size_of::<u32>())); @@ -775,6 +803,7 @@ mod tests { use super::*; + #[cfg(not(miri))] quickcheck! { fn prop_state_read_write_nfa_state_ids(sids: Vec<StateID>) -> bool { // Builders states do not permit duplicate IDs. @@ -829,7 +858,9 @@ mod tests { s.iter_nfa_state_ids(|sid| got_sids.push(sid)); got_pids == pids && got_sids == sids } + } + quickcheck! { fn prop_read_write_varu32(n: u32) -> bool { let mut buf = vec![]; write_varu32(&mut buf, n); @@ -845,6 +876,7 @@ mod tests { } } + #[cfg(not(miri))] fn dedup_state_ids(sids: Vec<StateID>) -> Vec<StateID> { let mut set = alloc::collections::BTreeSet::new(); let mut deduped = vec![]; @@ -858,6 +890,7 @@ mod tests { deduped } + #[cfg(not(miri))] fn dedup_pattern_ids(pids: Vec<PatternID>) -> Vec<PatternID> { let mut set = alloc::collections::BTreeSet::new(); let mut deduped = vec![]; |