summaryrefslogtreecommitdiffstats
path: root/vendor/regex-automata/src/util/determinize/state.rs
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/regex-automata/src/util/determinize/state.rs')
-rw-r--r--vendor/regex-automata/src/util/determinize/state.rs191
1 files changed, 112 insertions, 79 deletions
diff --git a/vendor/regex-automata/src/util/determinize/state.rs b/vendor/regex-automata/src/util/determinize/state.rs
index 567e600d6..e64123587 100644
--- a/vendor/regex-automata/src/util/determinize/state.rs
+++ b/vendor/regex-automata/src/util/determinize/state.rs
@@ -10,13 +10,13 @@ The term "DFA state" is somewhat overloaded in this crate. In some cases, it
refers to the set of transitions over an alphabet for a particular state. In
other cases, it refers to a set of NFA states. The former is really about the
final representation of a state in a DFA's transition table, where as the
-latter---what this module is focusedon---is closer to an intermediate form that
-is used to help eventually build the transition table.
+latter---what this module is focused on---is closer to an intermediate form
+that is used to help eventually build the transition table.
This module exports four types. All four types represent the same idea: an
ordered set of NFA states. This ordered set represents the epsilon closure of a
particular NFA state, where the "epsilon closure" is the set of NFA states that
-can be transitioned to without consuming any input. i.e., Follow all of theNFA
+can be transitioned to without consuming any input. i.e., Follow all of the NFA
state's epsilon transitions. In addition, this implementation of DFA states
cares about two other things: the ordered set of pattern IDs corresponding
to the patterns that match if the state is a match state, and the set of
@@ -46,9 +46,11 @@ a copy). Here are the three types described succinctly:
and no NFA states. Creating a `StateBuilderEmpty` performs no allocs. A
`StateBuilderEmpty` can only be used to query its underlying memory capacity,
or to convert into a builder for recording pattern IDs and/or assertions.
+
* `StateBuilderMatches` represents a state with zero or more pattern IDs, zero
or more satisfied assertions and zero NFA state IDs. A `StateBuilderMatches`
can only be used for adding pattern IDs and recording assertions.
+
* `StateBuilderNFA` represents a state with zero or more pattern IDs, zero or
more satisfied assertions and zero or more NFA state IDs. A `StateBuilderNFA`
can only be used for adding NFA state IDs and recording some assertions.
@@ -58,7 +60,7 @@ DFA state to check if it already exists. If it does, then there's no need to
freeze it into a `State`. It it doesn't exist, then `StateBuilderNFA::to_state`
can be called to freeze the builder into an immutable `State`. In either
case, `clear` should be called on the builder to turn it back into a
-`StateBuilderEmpty` that reuses the underyling memory.
+`StateBuilderEmpty` that reuses the underlying memory.
The main purpose for splitting the builder into these distinct types is to
make it impossible to do things like adding a pattern ID after adding an NFA
@@ -68,7 +70,7 @@ type below.) If we just used one type for everything, it would be possible for
callers to use an incorrect interleaving of calls and thus result in a corrupt
representation. I chose to use more type machinery to make this impossible to
do because 1) determinization is itself pretty complex and it wouldn't be too
-hard to foul this up and 2) there isn't too much machinery involve and it's
+hard to foul this up and 2) there isn't too much machinery involved and it's
well contained.
As an optimization, sometimes states won't have certain things set. For
@@ -88,12 +90,11 @@ use core::{convert::TryFrom, mem};
use alloc::{sync::Arc, vec::Vec};
-use crate::{
- nfa::thompson::LookSet,
- util::{
- bytes::{self, Endian},
- id::{PatternID, StateID},
- },
+use crate::util::{
+ int::{I32, U32},
+ look::LookSet,
+ primitives::{PatternID, StateID},
+ wire::{self, Endian},
};
/// A DFA state that, at its core, is represented by an ordered set of NFA
@@ -102,7 +103,7 @@ use crate::{
/// This type is intended to be used only in NFA-to-DFA conversion via powerset
/// construction.
///
-/// It may be cheaply cloned and accessed safely from mulitple threads
+/// It may be cheaply cloned and accessed safely from multiple threads
/// simultaneously.
#[derive(Clone, Eq, Hash, PartialEq, PartialOrd, Ord)]
pub(crate) struct State(Arc<[u8]>);
@@ -138,6 +139,10 @@ impl State {
self.repr().is_from_word()
}
+ pub(crate) fn is_half_crlf(&self) -> bool {
+ self.repr().is_half_crlf()
+ }
+
pub(crate) fn look_have(&self) -> LookSet {
self.repr().look_have()
}
@@ -146,8 +151,8 @@ impl State {
self.repr().look_need()
}
- pub(crate) fn match_count(&self) -> usize {
- self.repr().match_count()
+ pub(crate) fn match_len(&self) -> usize {
+ self.repr().match_len()
}
pub(crate) fn match_pattern(&self, index: usize) -> PatternID {
@@ -158,6 +163,7 @@ impl State {
self.repr().match_pattern_ids()
}
+ #[cfg(all(test, not(miri)))]
pub(crate) fn iter_match_pattern_ids<F: FnMut(PatternID)>(&self, f: F) {
self.repr().iter_match_pattern_ids(f)
}
@@ -191,7 +197,7 @@ impl StateBuilderEmpty {
}
pub(crate) fn into_matches(mut self) -> StateBuilderMatches {
- self.0.extend_from_slice(&[0, 0, 0]);
+ self.0.extend_from_slice(&[0, 0, 0, 0, 0]);
StateBuilderMatches(self.0)
}
@@ -224,30 +230,23 @@ impl StateBuilderMatches {
StateBuilderNFA { repr: self.0, prev_nfa_state_id: StateID::ZERO }
}
- pub(crate) fn clear(self) -> StateBuilderEmpty {
- let mut builder = StateBuilderEmpty(self.0);
- builder.clear();
- builder
- }
-
- pub(crate) fn is_match(&self) -> bool {
- self.repr().is_match()
- }
-
- pub(crate) fn is_from_word(&self) -> bool {
- self.repr().is_from_word()
- }
-
pub(crate) fn set_is_from_word(&mut self) {
self.repr_vec().set_is_from_word()
}
- pub(crate) fn look_have(&mut self) -> &mut LookSet {
- LookSet::from_repr_mut(&mut self.0[1])
+ pub(crate) fn set_is_half_crlf(&mut self) {
+ self.repr_vec().set_is_half_crlf()
+ }
+
+ pub(crate) fn look_have(&self) -> LookSet {
+ LookSet::read_repr(&self.0[1..])
}
- pub(crate) fn look_need(&mut self) -> &mut LookSet {
- LookSet::from_repr_mut(&mut self.0[2])
+ pub(crate) fn set_look_have(
+ &mut self,
+ set: impl FnMut(LookSet) -> LookSet,
+ ) {
+ self.repr_vec().set_look_have(set)
}
pub(crate) fn add_match_pattern_id(&mut self, pid: PatternID) {
@@ -295,20 +294,22 @@ impl StateBuilderNFA {
builder
}
- pub(crate) fn is_match(&self) -> bool {
- self.repr().is_match()
- }
-
- pub(crate) fn is_from_word(&self) -> bool {
- self.repr().is_from_word()
+ pub(crate) fn look_need(&self) -> LookSet {
+ self.repr().look_need()
}
- pub(crate) fn look_have(&mut self) -> &mut LookSet {
- LookSet::from_repr_mut(&mut self.repr[1])
+ pub(crate) fn set_look_have(
+ &mut self,
+ set: impl FnMut(LookSet) -> LookSet,
+ ) {
+ self.repr_vec().set_look_have(set)
}
- pub(crate) fn look_need(&mut self) -> &mut LookSet {
- LookSet::from_repr_mut(&mut self.repr[2])
+ pub(crate) fn set_look_need(
+ &mut self,
+ set: impl FnMut(LookSet) -> LookSet,
+ ) {
+ self.repr_vec().set_look_need(set)
}
pub(crate) fn add_nfa_state_id(&mut self, sid: StateID) {
@@ -316,10 +317,6 @@ impl StateBuilderNFA {
.add_nfa_state_id(&mut self.prev_nfa_state_id, sid)
}
- pub(crate) fn memory_usage(&self) -> usize {
- self.repr.len()
- }
-
pub(crate) fn as_bytes(&self) -> &[u8] {
&self.repr
}
@@ -355,8 +352,8 @@ impl StateBuilderNFA {
///
/// Byte 1 corresponds to the look-behind assertions that were satisfied by
/// the transition that created this state. This generally only includes the
-/// StartLine and StartText assertions. (Look-ahead assertions are not tracked
-/// as part of states. Instead, these are applied by re-computing the epsilon
+/// StartLF and Start assertions. (Look-ahead assertions are not tracked as
+/// part of states. Instead, these are applied by re-computing the epsilon
/// closure of a state when computing the transition function. See `next` in
/// the parent module.)
///
@@ -425,6 +422,14 @@ impl<'a> Repr<'a> {
self.0[0] & (1 << 2) > 0
}
+ /// Returns true if and only if this state is marked as being inside of a
+ /// CRLF terminator. In the forward direction, this means the state was
+ /// created after seeing a `\r`. In the reverse direction, this means the
+ /// state was created after seeing a `\n`.
+ fn is_half_crlf(&self) -> bool {
+ self.0[0] & (1 << 3) > 0
+ }
+
/// The set of look-behind assertions that were true in the transition that
/// created this state.
///
@@ -436,7 +441,7 @@ impl<'a> Repr<'a> {
/// these are re-computed on demand via epsilon closure when computing the
/// transition function.
fn look_have(&self) -> LookSet {
- LookSet::from_repr(self.0[1])
+ LookSet::read_repr(&self.0[1..])
}
/// The set of look-around (both behind and ahead) assertions that appear
@@ -447,34 +452,34 @@ impl<'a> Repr<'a> {
/// state has no conditional epsilon transitions, then there is no need
/// to re-compute the epsilon closure.
fn look_need(&self) -> LookSet {
- LookSet::from_repr(self.0[2])
+ LookSet::read_repr(&self.0[3..])
}
/// Returns the total number of match pattern IDs in this state.
///
/// If this state is not a match state, then this always returns 0.
- fn match_count(&self) -> usize {
+ fn match_len(&self) -> usize {
if !self.is_match() {
return 0;
} else if !self.has_pattern_ids() {
1
} else {
- self.encoded_pattern_count()
+ self.encoded_pattern_len()
}
}
/// Returns the pattern ID for this match state at the given index.
///
- /// If the given index is greater than or equal to `match_count()` for this
+ /// If the given index is greater than or equal to `match_len()` for this
/// state, then this could panic or return incorrect results.
fn match_pattern(&self, index: usize) -> PatternID {
if !self.has_pattern_ids() {
PatternID::ZERO
} else {
- let offset = 7 + index * PatternID::SIZE;
+ let offset = 9 + index * PatternID::SIZE;
// This is OK since we only ever serialize valid PatternIDs to
// states.
- bytes::read_pattern_id_unchecked(&self.0[offset..]).0
+ wire::read_pattern_id_unchecked(&self.0[offset..]).0
}
}
@@ -502,9 +507,9 @@ impl<'a> Repr<'a> {
f(PatternID::ZERO);
return;
}
- let mut pids = &self.0[7..self.pattern_offset_end()];
+ let mut pids = &self.0[9..self.pattern_offset_end()];
while !pids.is_empty() {
- let pid = bytes::read_u32(pids);
+ let pid = wire::read_u32(pids);
pids = &pids[PatternID::SIZE..];
// This is OK since we only ever serialize valid PatternIDs to
// states. And since pattern IDs can never exceed a usize, the
@@ -525,20 +530,20 @@ impl<'a> Repr<'a> {
// This is OK since we only ever serialize valid StateIDs to
// states. And since state IDs can never exceed an isize, they must
// always be able to fit into a usize, and thus cast is OK.
- f(StateID::new_unchecked(sid as usize))
+ f(StateID::new_unchecked(sid.as_usize()))
}
}
/// Returns the offset into this state's representation where the pattern
/// IDs end and the NFA state IDs begin.
fn pattern_offset_end(&self) -> usize {
- let encoded = self.encoded_pattern_count();
+ let encoded = self.encoded_pattern_len();
if encoded == 0 {
- return 3;
+ return 5;
}
// This arithmetic is OK since we were able to address this many bytes
// when writing to the state, thus, it must fit into a usize.
- encoded.checked_mul(4).unwrap().checked_add(7).unwrap()
+ encoded.checked_mul(4).unwrap().checked_add(9).unwrap()
}
/// Returns the total number of *encoded* pattern IDs in this state.
@@ -546,13 +551,13 @@ impl<'a> Repr<'a> {
/// This may return 0 even when this is a match state, since the pattern
/// ID `PatternID::ZERO` is not encoded when it's the only pattern ID in
/// the match state (the overwhelming common case).
- fn encoded_pattern_count(&self) -> usize {
+ fn encoded_pattern_len(&self) -> usize {
if !self.has_pattern_ids() {
return 0;
}
// This unwrap is OK since the total number of patterns is always
// guaranteed to fit into a usize.
- usize::try_from(bytes::read_u32(&self.0[3..7])).unwrap()
+ usize::try_from(wire::read_u32(&self.0[5..9])).unwrap()
}
}
@@ -563,6 +568,7 @@ impl<'a> core::fmt::Debug for Repr<'a> {
f.debug_struct("Repr")
.field("is_match", &self.is_match())
.field("is_from_word", &self.is_from_word())
+ .field("is_half_crlf", &self.is_half_crlf())
.field("look_have", &self.look_have())
.field("look_need", &self.look_need())
.field("match_pattern_ids", &self.match_pattern_ids())
@@ -608,14 +614,36 @@ impl<'a> ReprVec<'a> {
self.0[0] |= 1 << 2;
}
- /// Return a mutable reference to the 'look_have' assertion set.
- fn look_have_mut(&mut self) -> &mut LookSet {
- LookSet::from_repr_mut(&mut self.0[1])
+ /// Set this state as having seen half of a CRLF terminator.
+ ///
+ /// In the forward direction, this should be set when a `\r` has been seen.
+ /// In the reverse direction, this should be set when a `\n` has been seen.
+ fn set_is_half_crlf(&mut self) {
+ self.0[0] |= 1 << 3;
}
- /// Return a mutable reference to the 'look_need' assertion set.
- fn look_need_mut(&mut self) -> &mut LookSet {
- LookSet::from_repr_mut(&mut self.0[2])
+ /// The set of look-behind assertions that were true in the transition that
+ /// created this state.
+ fn look_have(&self) -> LookSet {
+ self.repr().look_have()
+ }
+
+ /// The set of look-around (both behind and ahead) assertions that appear
+ /// at least once in this state's set of NFA states.
+ fn look_need(&self) -> LookSet {
+ self.repr().look_need()
+ }
+
+ /// Mutate the set of look-behind assertions that were true in the
+ /// transition that created this state.
+ fn set_look_have(&mut self, mut set: impl FnMut(LookSet) -> LookSet) {
+ set(self.look_have()).write_repr(&mut self.0[1..]);
+ }
+
+ /// Mutate the set of look-around (both behind and ahead) assertions that
+ /// appear at least once in this state's set of NFA states.
+ fn set_look_need(&mut self, mut set: impl FnMut(LookSet) -> LookSet) {
+ set(self.look_need()).write_repr(&mut self.0[3..]);
}
/// Add a pattern ID to this state. All match states must have at least
@@ -675,14 +703,14 @@ impl<'a> ReprVec<'a> {
return;
}
let patsize = PatternID::SIZE;
- let pattern_bytes = self.0.len() - 7;
+ let pattern_bytes = self.0.len() - 9;
// Every pattern ID uses 4 bytes, so number of bytes should be
// divisible by 4.
assert_eq!(pattern_bytes % patsize, 0);
// This unwrap is OK since we are guaranteed that the maximum number
// of possible patterns fits into a u32.
let count32 = u32::try_from(pattern_bytes / patsize).unwrap();
- bytes::NE::write_u32(count32, &mut self.0[3..7]);
+ wire::NE::write_u32(count32, &mut self.0[5..9]);
}
/// Add an NFA state ID to this state. The order in which NFA states are
@@ -704,7 +732,7 @@ impl<'a> ReprVec<'a> {
///
/// https://developers.google.com/protocol-buffers/docs/encoding#varints
fn write_vari32(data: &mut Vec<u8>, n: i32) {
- let mut un = (n as u32) << 1;
+ let mut un = n.to_bits() << 1;
if n < 0 {
un = !un;
}
@@ -717,7 +745,7 @@ fn write_vari32(data: &mut Vec<u8>, n: i32) {
/// https://developers.google.com/protocol-buffers/docs/encoding#varints
fn read_vari32(data: &[u8]) -> (i32, usize) {
let (un, i) = read_varu32(data);
- let mut n = (un >> 1) as i32;
+ let mut n = i32::from_bits(un >> 1);
if un & 1 != 0 {
n = !n;
}
@@ -733,10 +761,10 @@ fn read_vari32(data: &[u8]) -> (i32, usize) {
/// https://developers.google.com/protocol-buffers/docs/encoding#varints
fn write_varu32(data: &mut Vec<u8>, mut n: u32) {
while n >= 0b1000_0000 {
- data.push((n as u8) | 0b1000_0000);
+ data.push(n.low_u8() | 0b1000_0000);
n >>= 7;
}
- data.push(n as u8);
+ data.push(n.low_u8());
}
/// Read an unsigned 32-bit varint. Also, return the number of bytes read.
@@ -750,9 +778,9 @@ fn read_varu32(data: &[u8]) -> (u32, usize) {
let mut shift: u32 = 0;
for (i, &b) in data.iter().enumerate() {
if b < 0b1000_0000 {
- return (n | ((b as u32) << shift), i + 1);
+ return (n | (u32::from(b) << shift), i + 1);
}
- n |= ((b as u32) & 0b0111_1111) << shift;
+ n |= (u32::from(b) & 0b0111_1111) << shift;
shift += 7;
}
(0, 0)
@@ -760,7 +788,7 @@ fn read_varu32(data: &[u8]) -> (u32, usize) {
/// Push a native-endian encoded `n` on to `dst`.
fn write_u32(dst: &mut Vec<u8>, n: u32) {
- use crate::util::bytes::{Endian, NE};
+ use crate::util::wire::NE;
let start = dst.len();
dst.extend(core::iter::repeat(0).take(mem::size_of::<u32>()));
@@ -775,6 +803,7 @@ mod tests {
use super::*;
+ #[cfg(not(miri))]
quickcheck! {
fn prop_state_read_write_nfa_state_ids(sids: Vec<StateID>) -> bool {
// Builders states do not permit duplicate IDs.
@@ -829,7 +858,9 @@ mod tests {
s.iter_nfa_state_ids(|sid| got_sids.push(sid));
got_pids == pids && got_sids == sids
}
+ }
+ quickcheck! {
fn prop_read_write_varu32(n: u32) -> bool {
let mut buf = vec![];
write_varu32(&mut buf, n);
@@ -845,6 +876,7 @@ mod tests {
}
}
+ #[cfg(not(miri))]
fn dedup_state_ids(sids: Vec<StateID>) -> Vec<StateID> {
let mut set = alloc::collections::BTreeSet::new();
let mut deduped = vec![];
@@ -858,6 +890,7 @@ mod tests {
deduped
}
+ #[cfg(not(miri))]
fn dedup_pattern_ids(pids: Vec<PatternID>) -> Vec<PatternID> {
let mut set = alloc::collections::BTreeSet::new();
let mut deduped = vec![];