diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-17 12:02:58 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-17 12:02:58 +0000 |
commit | 698f8c2f01ea549d77d7dc3338a12e04c11057b9 (patch) | |
tree | 173a775858bd501c378080a10dca74132f05bc50 /vendor/regex-automata/src/sparse.rs | |
parent | Initial commit. (diff) | |
download | rustc-698f8c2f01ea549d77d7dc3338a12e04c11057b9.tar.xz rustc-698f8c2f01ea549d77d7dc3338a12e04c11057b9.zip |
Adding upstream version 1.64.0+dfsg1.upstream/1.64.0+dfsg1
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'vendor/regex-automata/src/sparse.rs')
-rw-r--r-- | vendor/regex-automata/src/sparse.rs | 1256 |
1 files changed, 1256 insertions, 0 deletions
diff --git a/vendor/regex-automata/src/sparse.rs b/vendor/regex-automata/src/sparse.rs new file mode 100644 index 000000000..d18024b34 --- /dev/null +++ b/vendor/regex-automata/src/sparse.rs @@ -0,0 +1,1256 @@ +#[cfg(feature = "std")] +use core::fmt; +#[cfg(feature = "std")] +use core::iter; +use core::marker::PhantomData; +use core::mem::size_of; +#[cfg(feature = "std")] +use std::collections::HashMap; + +#[cfg(feature = "std")] +use byteorder::{BigEndian, LittleEndian}; +use byteorder::{ByteOrder, NativeEndian}; + +use classes::ByteClasses; +use dense; +use dfa::DFA; +#[cfg(feature = "std")] +use error::{Error, Result}; +#[cfg(feature = "std")] +use state_id::{dead_id, usize_to_state_id, write_state_id_bytes, StateID}; +#[cfg(not(feature = "std"))] +use state_id::{dead_id, StateID}; + +/// A sparse table-based deterministic finite automaton (DFA). +/// +/// In contrast to a [dense DFA](enum.DenseDFA.html), a sparse DFA uses a +/// more space efficient representation for its transition table. Consequently, +/// sparse DFAs can use much less memory than dense DFAs, but this comes at a +/// price. In particular, reading the more space efficient transitions takes +/// more work, and consequently, searching using a sparse DFA is typically +/// slower than a dense DFA. +/// +/// A sparse DFA can be built using the default configuration via the +/// [`SparseDFA::new`](enum.SparseDFA.html#method.new) constructor. Otherwise, +/// one can configure various aspects of a dense DFA via +/// [`dense::Builder`](dense/struct.Builder.html), and then convert a dense +/// DFA to a sparse DFA using +/// [`DenseDFA::to_sparse`](enum.DenseDFA.html#method.to_sparse). +/// +/// In general, a sparse DFA supports all the same operations as a dense DFA. +/// +/// Making the choice between a dense and sparse DFA depends on your specific +/// work load. If you can sacrifice a bit of search time performance, then a +/// sparse DFA might be the best choice. 
In particular, while sparse DFAs are +/// probably always slower than dense DFAs, you may find that they are easily +/// fast enough for your purposes! +/// +/// # State size +/// +/// A `SparseDFA` has two type parameters, `T` and `S`. `T` corresponds to +/// the type of the DFA's transition table while `S` corresponds to the +/// representation used for the DFA's state identifiers as described by the +/// [`StateID`](trait.StateID.html) trait. This type parameter is typically +/// `usize`, but other valid choices provided by this crate include `u8`, +/// `u16`, `u32` and `u64`. The primary reason for choosing a different state +/// identifier representation than the default is to reduce the amount of +/// memory used by a DFA. Note though, that if the chosen representation cannot +/// accommodate the size of your DFA, then building the DFA will fail and +/// return an error. +/// +/// While the reduction in heap memory used by a DFA is one reason for choosing +/// a smaller state identifier representation, another possible reason is for +/// decreasing the serialization size of a DFA, as returned by +/// [`to_bytes_little_endian`](enum.SparseDFA.html#method.to_bytes_little_endian), +/// [`to_bytes_big_endian`](enum.SparseDFA.html#method.to_bytes_big_endian) +/// or +/// [`to_bytes_native_endian`](enum.DenseDFA.html#method.to_bytes_native_endian). +/// +/// The type of the transition table is typically either `Vec<u8>` or `&[u8]`, +/// depending on where the transition table is stored. Note that this is +/// different than a dense DFA, whose transition table is typically +/// `Vec<S>` or `&[S]`. The reason for this is that a sparse DFA always reads +/// its transition table from raw bytes because the table is compactly packed. +/// +/// # Variants +/// +/// This DFA is defined as a non-exhaustive enumeration of different types of +/// dense DFAs. 
All of the variants use the same internal representation +/// for the transition table, but they vary in how the transition table is +/// read. A DFA's specific variant depends on the configuration options set via +/// [`dense::Builder`](dense/struct.Builder.html). The default variant is +/// `ByteClass`. +/// +/// # The `DFA` trait +/// +/// This type implements the [`DFA`](trait.DFA.html) trait, which means it +/// can be used for searching. For example: +/// +/// ``` +/// use regex_automata::{DFA, SparseDFA}; +/// +/// # fn example() -> Result<(), regex_automata::Error> { +/// let dfa = SparseDFA::new("foo[0-9]+")?; +/// assert_eq!(Some(8), dfa.find(b"foo12345")); +/// # Ok(()) }; example().unwrap() +/// ``` +/// +/// The `DFA` trait also provides an assortment of other lower level methods +/// for DFAs, such as `start_state` and `next_state`. While these are correctly +/// implemented, it is an anti-pattern to use them in performance sensitive +/// code on the `SparseDFA` type directly. Namely, each implementation requires +/// a branch to determine which type of sparse DFA is being used. Instead, +/// this branch should be pushed up a layer in the code since walking the +/// transitions of a DFA is usually a hot path. If you do need to use these +/// lower level methods in performance critical code, then you should match on +/// the variants of this DFA and use each variant's implementation of the `DFA` +/// trait directly. +#[derive(Clone, Debug)] +pub enum SparseDFA<T: AsRef<[u8]>, S: StateID = usize> { + /// A standard DFA that does not use byte classes. + Standard(Standard<T, S>), + /// A DFA that shrinks its alphabet to a set of equivalence classes instead + /// of using all possible byte values. Any two bytes belong to the same + /// equivalence class if and only if they can be used interchangeably + /// anywhere in the DFA while never discriminating between a match and a + /// non-match. 
+ /// + /// Unlike dense DFAs, sparse DFAs do not tend to benefit nearly as much + /// from using byte classes. In some cases, using byte classes can even + /// marginally increase the size of a sparse DFA's transition table. The + /// reason for this is that a sparse DFA already compacts each state's + /// transitions separate from whether byte classes are used. + ByteClass(ByteClass<T, S>), + /// Hints that destructuring should not be exhaustive. + /// + /// This enum may grow additional variants, so this makes sure clients + /// don't count on exhaustive matching. (Otherwise, adding a new variant + /// could break existing code.) + #[doc(hidden)] + __Nonexhaustive, +} + +#[cfg(feature = "std")] +impl SparseDFA<Vec<u8>, usize> { + /// Parse the given regular expression using a default configuration and + /// return the corresponding sparse DFA. + /// + /// The default configuration uses `usize` for state IDs and reduces the + /// alphabet size by splitting bytes into equivalence classes. The + /// resulting DFA is *not* minimized. + /// + /// If you want a non-default configuration, then use the + /// [`dense::Builder`](dense/struct.Builder.html) + /// to set your own configuration, and then call + /// [`DenseDFA::to_sparse`](enum.DenseDFA.html#method.to_sparse) + /// to create a sparse DFA. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{DFA, SparseDFA}; + /// + /// # fn example() -> Result<(), regex_automata::Error> { + /// let dfa = SparseDFA::new("foo[0-9]+bar")?; + /// assert_eq!(Some(11), dfa.find(b"foo12345bar")); + /// # Ok(()) }; example().unwrap() + /// ``` + pub fn new(pattern: &str) -> Result<SparseDFA<Vec<u8>, usize>> { + dense::Builder::new() + .build(pattern) + .and_then(|dense| dense.to_sparse()) + } +} + +#[cfg(feature = "std")] +impl<S: StateID> SparseDFA<Vec<u8>, S> { + /// Create a new empty sparse DFA that never matches any input. 
+ /// + /// # Example + /// + /// In order to build an empty DFA, callers must provide a type hint + /// indicating their choice of state identifier representation. + /// + /// ``` + /// use regex_automata::{DFA, SparseDFA}; + /// + /// # fn example() -> Result<(), regex_automata::Error> { + /// let dfa: SparseDFA<Vec<u8>, usize> = SparseDFA::empty(); + /// assert_eq!(None, dfa.find(b"")); + /// assert_eq!(None, dfa.find(b"foo")); + /// # Ok(()) }; example().unwrap() + /// ``` + pub fn empty() -> SparseDFA<Vec<u8>, S> { + dense::DenseDFA::empty().to_sparse().unwrap() + } + + pub(crate) fn from_dense_sized<T: AsRef<[S]>, A: StateID>( + dfa: &dense::Repr<T, S>, + ) -> Result<SparseDFA<Vec<u8>, A>> { + Repr::from_dense_sized(dfa).map(|r| r.into_sparse_dfa()) + } +} + +impl<T: AsRef<[u8]>, S: StateID> SparseDFA<T, S> { + /// Cheaply return a borrowed version of this sparse DFA. Specifically, the + /// DFA returned always uses `&[u8]` for its transition table while keeping + /// the same state identifier representation. + pub fn as_ref<'a>(&'a self) -> SparseDFA<&'a [u8], S> { + match *self { + SparseDFA::Standard(Standard(ref r)) => { + SparseDFA::Standard(Standard(r.as_ref())) + } + SparseDFA::ByteClass(ByteClass(ref r)) => { + SparseDFA::ByteClass(ByteClass(r.as_ref())) + } + SparseDFA::__Nonexhaustive => unreachable!(), + } + } + + /// Return an owned version of this sparse DFA. Specifically, the DFA + /// returned always uses `Vec<u8>` for its transition table while keeping + /// the same state identifier representation. + /// + /// Effectively, this returns a sparse DFA whose transition table lives + /// on the heap. 
+ #[cfg(feature = "std")] + pub fn to_owned(&self) -> SparseDFA<Vec<u8>, S> { + match *self { + SparseDFA::Standard(Standard(ref r)) => { + SparseDFA::Standard(Standard(r.to_owned())) + } + SparseDFA::ByteClass(ByteClass(ref r)) => { + SparseDFA::ByteClass(ByteClass(r.to_owned())) + } + SparseDFA::__Nonexhaustive => unreachable!(), + } + } + + /// Returns the memory usage, in bytes, of this DFA. + /// + /// The memory usage is computed based on the number of bytes used to + /// represent this DFA's transition table. This typically corresponds to + /// heap memory usage. + /// + /// This does **not** include the stack size used up by this DFA. To + /// compute that, used `std::mem::size_of::<SparseDFA>()`. + pub fn memory_usage(&self) -> usize { + self.repr().memory_usage() + } + + fn repr(&self) -> &Repr<T, S> { + match *self { + SparseDFA::Standard(ref r) => &r.0, + SparseDFA::ByteClass(ref r) => &r.0, + SparseDFA::__Nonexhaustive => unreachable!(), + } + } +} + +/// Routines for converting a sparse DFA to other representations, such as +/// smaller state identifiers or raw bytes suitable for persistent storage. +#[cfg(feature = "std")] +impl<T: AsRef<[u8]>, S: StateID> SparseDFA<T, S> { + /// Create a new sparse DFA whose match semantics are equivalent to + /// this DFA, but attempt to use `u8` for the representation of state + /// identifiers. If `u8` is insufficient to represent all state identifiers + /// in this DFA, then this returns an error. + /// + /// This is a convenience routine for `to_sized::<u8>()`. + pub fn to_u8(&self) -> Result<SparseDFA<Vec<u8>, u8>> { + self.to_sized() + } + + /// Create a new sparse DFA whose match semantics are equivalent to + /// this DFA, but attempt to use `u16` for the representation of state + /// identifiers. If `u16` is insufficient to represent all state + /// identifiers in this DFA, then this returns an error. + /// + /// This is a convenience routine for `to_sized::<u16>()`. 
+ pub fn to_u16(&self) -> Result<SparseDFA<Vec<u8>, u16>> { + self.to_sized() + } + + /// Create a new sparse DFA whose match semantics are equivalent to + /// this DFA, but attempt to use `u32` for the representation of state + /// identifiers. If `u32` is insufficient to represent all state + /// identifiers in this DFA, then this returns an error. + /// + /// This is a convenience routine for `to_sized::<u32>()`. + #[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))] + pub fn to_u32(&self) -> Result<SparseDFA<Vec<u8>, u32>> { + self.to_sized() + } + + /// Create a new sparse DFA whose match semantics are equivalent to + /// this DFA, but attempt to use `u64` for the representation of state + /// identifiers. If `u64` is insufficient to represent all state + /// identifiers in this DFA, then this returns an error. + /// + /// This is a convenience routine for `to_sized::<u64>()`. + #[cfg(target_pointer_width = "64")] + pub fn to_u64(&self) -> Result<SparseDFA<Vec<u8>, u64>> { + self.to_sized() + } + + /// Create a new sparse DFA whose match semantics are equivalent to + /// this DFA, but attempt to use `A` for the representation of state + /// identifiers. If `A` is insufficient to represent all state identifiers + /// in this DFA, then this returns an error. + /// + /// An alternative way to construct such a DFA is to use + /// [`DenseDFA::to_sparse_sized`](enum.DenseDFA.html#method.to_sparse_sized). + /// In general, picking the appropriate size upon initial construction of + /// a sparse DFA is preferred, since it will do the conversion in one + /// step instead of two. + pub fn to_sized<A: StateID>(&self) -> Result<SparseDFA<Vec<u8>, A>> { + self.repr().to_sized().map(|r| r.into_sparse_dfa()) + } + + /// Serialize a sparse DFA to raw bytes in little endian format. + /// + /// If the state identifier representation of this DFA has a size different + /// than 1, 2, 4 or 8 bytes, then this returns an error. 
All + /// implementations of `StateID` provided by this crate satisfy this + /// requirement. + pub fn to_bytes_little_endian(&self) -> Result<Vec<u8>> { + self.repr().to_bytes::<LittleEndian>() + } + + /// Serialize a sparse DFA to raw bytes in big endian format. + /// + /// If the state identifier representation of this DFA has a size different + /// than 1, 2, 4 or 8 bytes, then this returns an error. All + /// implementations of `StateID` provided by this crate satisfy this + /// requirement. + pub fn to_bytes_big_endian(&self) -> Result<Vec<u8>> { + self.repr().to_bytes::<BigEndian>() + } + + /// Serialize a sparse DFA to raw bytes in native endian format. + /// Generally, it is better to pick an explicit endianness using either + /// `to_bytes_little_endian` or `to_bytes_big_endian`. This routine is + /// useful in tests where the DFA is serialized and deserialized on the + /// same platform. + /// + /// If the state identifier representation of this DFA has a size different + /// than 1, 2, 4 or 8 bytes, then this returns an error. All + /// implementations of `StateID` provided by this crate satisfy this + /// requirement. + pub fn to_bytes_native_endian(&self) -> Result<Vec<u8>> { + self.repr().to_bytes::<NativeEndian>() + } +} + +impl<'a, S: StateID> SparseDFA<&'a [u8], S> { + /// Deserialize a sparse DFA with a specific state identifier + /// representation. + /// + /// Deserializing a DFA using this routine will never allocate heap memory. + /// This is also guaranteed to be a constant time operation that does not + /// vary with the size of the DFA. + /// + /// The bytes given should be generated by the serialization of a DFA with + /// either the + /// [`to_bytes_little_endian`](enum.DenseDFA.html#method.to_bytes_little_endian) + /// method or the + /// [`to_bytes_big_endian`](enum.DenseDFA.html#method.to_bytes_big_endian) + /// endian, depending on the endianness of the machine you are + /// deserializing this DFA from. 
+ /// + /// If the state identifier representation is `usize`, then deserialization + /// is dependent on the pointer size. For this reason, it is best to + /// serialize DFAs using a fixed size representation for your state + /// identifiers, such as `u8`, `u16`, `u32` or `u64`. + /// + /// # Panics + /// + /// The bytes given should be *trusted*. In particular, if the bytes + /// are not a valid serialization of a DFA, or if the endianness of the + /// serialized bytes is different than the endianness of the machine that + /// is deserializing the DFA, then this routine will panic. Moreover, it + /// is possible for this deserialization routine to succeed even if the + /// given bytes do not represent a valid serialized sparse DFA. + /// + /// # Safety + /// + /// This routine is unsafe because it permits callers to provide an + /// arbitrary transition table with possibly incorrect transitions. While + /// the various serialization routines will never return an incorrect + /// transition table, there is no guarantee that the bytes provided here + /// are correct. While deserialization does many checks (as documented + /// above in the panic conditions), this routine does not check that the + /// transition table is correct. Given an incorrect transition table, it is + /// possible for the search routines to access out-of-bounds memory because + /// of explicit bounds check elision. + /// + /// # Example + /// + /// This example shows how to serialize a DFA to raw bytes, deserialize it + /// and then use it for searching. Note that we first convert the DFA to + /// using `u16` for its state identifier representation before serializing + /// it. While this isn't strictly necessary, it's good practice in order to + /// decrease the size of the DFA and to avoid platform specific pitfalls + /// such as differing pointer sizes. 
+ /// + /// ``` + /// use regex_automata::{DFA, DenseDFA, SparseDFA}; + /// + /// # fn example() -> Result<(), regex_automata::Error> { + /// let sparse = SparseDFA::new("foo[0-9]+")?; + /// let bytes = sparse.to_u16()?.to_bytes_native_endian()?; + /// + /// let dfa: SparseDFA<&[u8], u16> = unsafe { + /// SparseDFA::from_bytes(&bytes) + /// }; + /// + /// assert_eq!(Some(8), dfa.find(b"foo12345")); + /// # Ok(()) }; example().unwrap() + /// ``` + pub unsafe fn from_bytes(buf: &'a [u8]) -> SparseDFA<&'a [u8], S> { + Repr::from_bytes(buf).into_sparse_dfa() + } +} + +impl<T: AsRef<[u8]>, S: StateID> DFA for SparseDFA<T, S> { + type ID = S; + + #[inline] + fn start_state(&self) -> S { + self.repr().start_state() + } + + #[inline] + fn is_match_state(&self, id: S) -> bool { + self.repr().is_match_state(id) + } + + #[inline] + fn is_dead_state(&self, id: S) -> bool { + self.repr().is_dead_state(id) + } + + #[inline] + fn is_match_or_dead_state(&self, id: S) -> bool { + self.repr().is_match_or_dead_state(id) + } + + #[inline] + fn is_anchored(&self) -> bool { + self.repr().is_anchored() + } + + #[inline] + fn next_state(&self, current: S, input: u8) -> S { + match *self { + SparseDFA::Standard(ref r) => r.next_state(current, input), + SparseDFA::ByteClass(ref r) => r.next_state(current, input), + SparseDFA::__Nonexhaustive => unreachable!(), + } + } + + #[inline] + unsafe fn next_state_unchecked(&self, current: S, input: u8) -> S { + self.next_state(current, input) + } + + // We specialize the following methods because it lets us lift the + // case analysis between the different types of sparse DFAs. Instead of + // doing the case analysis for every transition, we do it once before + // searching. For sparse DFAs, this doesn't seem to benefit performance as + // much as it does for the dense DFAs, but it's easy to do so we might as + // well do it. 
+ + #[inline] + fn is_match_at(&self, bytes: &[u8], start: usize) -> bool { + match *self { + SparseDFA::Standard(ref r) => r.is_match_at(bytes, start), + SparseDFA::ByteClass(ref r) => r.is_match_at(bytes, start), + SparseDFA::__Nonexhaustive => unreachable!(), + } + } + + #[inline] + fn shortest_match_at(&self, bytes: &[u8], start: usize) -> Option<usize> { + match *self { + SparseDFA::Standard(ref r) => r.shortest_match_at(bytes, start), + SparseDFA::ByteClass(ref r) => r.shortest_match_at(bytes, start), + SparseDFA::__Nonexhaustive => unreachable!(), + } + } + + #[inline] + fn find_at(&self, bytes: &[u8], start: usize) -> Option<usize> { + match *self { + SparseDFA::Standard(ref r) => r.find_at(bytes, start), + SparseDFA::ByteClass(ref r) => r.find_at(bytes, start), + SparseDFA::__Nonexhaustive => unreachable!(), + } + } + + #[inline] + fn rfind_at(&self, bytes: &[u8], start: usize) -> Option<usize> { + match *self { + SparseDFA::Standard(ref r) => r.rfind_at(bytes, start), + SparseDFA::ByteClass(ref r) => r.rfind_at(bytes, start), + SparseDFA::__Nonexhaustive => unreachable!(), + } + } +} + +/// A standard sparse DFA that does not use premultiplication or byte classes. +/// +/// Generally, it isn't necessary to use this type directly, since a +/// `SparseDFA` can be used for searching directly. One possible reason why +/// one might want to use this type directly is if you are implementing your +/// own search routines by walking a DFA's transitions directly. In that case, +/// you'll want to use this type (or any of the other DFA variant types) +/// directly, since they implement `next_state` more efficiently. 
+#[derive(Clone, Debug)] +pub struct Standard<T: AsRef<[u8]>, S: StateID = usize>(Repr<T, S>); + +impl<T: AsRef<[u8]>, S: StateID> DFA for Standard<T, S> { + type ID = S; + + #[inline] + fn start_state(&self) -> S { + self.0.start_state() + } + + #[inline] + fn is_match_state(&self, id: S) -> bool { + self.0.is_match_state(id) + } + + #[inline] + fn is_dead_state(&self, id: S) -> bool { + self.0.is_dead_state(id) + } + + #[inline] + fn is_match_or_dead_state(&self, id: S) -> bool { + self.0.is_match_or_dead_state(id) + } + + #[inline] + fn is_anchored(&self) -> bool { + self.0.is_anchored() + } + + #[inline] + fn next_state(&self, current: S, input: u8) -> S { + self.0.state(current).next(input) + } + + #[inline] + unsafe fn next_state_unchecked(&self, current: S, input: u8) -> S { + self.next_state(current, input) + } +} + +/// A sparse DFA that shrinks its alphabet. +/// +/// Alphabet shrinking is achieved by using a set of equivalence classes +/// instead of using all possible byte values. Any two bytes belong to the same +/// equivalence class if and only if they can be used interchangeably anywhere +/// in the DFA while never discriminating between a match and a non-match. +/// +/// Unlike dense DFAs, sparse DFAs do not tend to benefit nearly as much from +/// using byte classes. In some cases, using byte classes can even marginally +/// increase the size of a sparse DFA's transition table. The reason for this +/// is that a sparse DFA already compacts each state's transitions separate +/// from whether byte classes are used. +/// +/// Generally, it isn't necessary to use this type directly, since a +/// `SparseDFA` can be used for searching directly. One possible reason why +/// one might want to use this type directly is if you are implementing your +/// own search routines by walking a DFA's transitions directly. 
In that case, +/// you'll want to use this type (or any of the other DFA variant types) +/// directly, since they implement `next_state` more efficiently. +#[derive(Clone, Debug)] +pub struct ByteClass<T: AsRef<[u8]>, S: StateID = usize>(Repr<T, S>); + +impl<T: AsRef<[u8]>, S: StateID> DFA for ByteClass<T, S> { + type ID = S; + + #[inline] + fn start_state(&self) -> S { + self.0.start_state() + } + + #[inline] + fn is_match_state(&self, id: S) -> bool { + self.0.is_match_state(id) + } + + #[inline] + fn is_dead_state(&self, id: S) -> bool { + self.0.is_dead_state(id) + } + + #[inline] + fn is_match_or_dead_state(&self, id: S) -> bool { + self.0.is_match_or_dead_state(id) + } + + #[inline] + fn is_anchored(&self) -> bool { + self.0.is_anchored() + } + + #[inline] + fn next_state(&self, current: S, input: u8) -> S { + let input = self.0.byte_classes.get(input); + self.0.state(current).next(input) + } + + #[inline] + unsafe fn next_state_unchecked(&self, current: S, input: u8) -> S { + self.next_state(current, input) + } +} + +/// The underlying representation of a sparse DFA. This is shared by all of +/// the different variants of a sparse DFA. 
+#[derive(Clone)] +#[cfg_attr(not(feature = "std"), derive(Debug))] +struct Repr<T: AsRef<[u8]>, S: StateID = usize> { + anchored: bool, + start: S, + state_count: usize, + max_match: S, + byte_classes: ByteClasses, + trans: T, +} + +impl<T: AsRef<[u8]>, S: StateID> Repr<T, S> { + fn into_sparse_dfa(self) -> SparseDFA<T, S> { + if self.byte_classes.is_singleton() { + SparseDFA::Standard(Standard(self)) + } else { + SparseDFA::ByteClass(ByteClass(self)) + } + } + + fn as_ref<'a>(&'a self) -> Repr<&'a [u8], S> { + Repr { + anchored: self.anchored, + start: self.start, + state_count: self.state_count, + max_match: self.max_match, + byte_classes: self.byte_classes.clone(), + trans: self.trans(), + } + } + + #[cfg(feature = "std")] + fn to_owned(&self) -> Repr<Vec<u8>, S> { + Repr { + anchored: self.anchored, + start: self.start, + state_count: self.state_count, + max_match: self.max_match, + byte_classes: self.byte_classes.clone(), + trans: self.trans().to_vec(), + } + } + + /// Return a convenient representation of the given state. + /// + /// This is marked as inline because it doesn't seem to get inlined + /// otherwise, which leads to a fairly significant performance loss (~25%). + #[inline] + fn state<'a>(&'a self, id: S) -> State<'a, S> { + let mut pos = id.to_usize(); + let ntrans = NativeEndian::read_u16(&self.trans()[pos..]) as usize; + pos += 2; + let input_ranges = &self.trans()[pos..pos + (ntrans * 2)]; + pos += 2 * ntrans; + let next = &self.trans()[pos..pos + (ntrans * size_of::<S>())]; + State { _state_id_repr: PhantomData, ntrans, input_ranges, next } + } + + /// Return an iterator over all of the states in this DFA. + /// + /// The iterator returned yields tuples, where the first element is the + /// state ID and the second element is the state itself. 
+ #[cfg(feature = "std")] + fn states<'a>(&'a self) -> StateIter<'a, T, S> { + StateIter { dfa: self, id: dead_id() } + } + + fn memory_usage(&self) -> usize { + self.trans().len() + } + + fn start_state(&self) -> S { + self.start + } + + fn is_match_state(&self, id: S) -> bool { + self.is_match_or_dead_state(id) && !self.is_dead_state(id) + } + + fn is_dead_state(&self, id: S) -> bool { + id == dead_id() + } + + fn is_match_or_dead_state(&self, id: S) -> bool { + id <= self.max_match + } + + fn is_anchored(&self) -> bool { + self.anchored + } + + fn trans(&self) -> &[u8] { + self.trans.as_ref() + } + + /// Create a new sparse DFA whose match semantics are equivalent to this + /// DFA, but attempt to use `A` for the representation of state + /// identifiers. If `A` is insufficient to represent all state identifiers + /// in this DFA, then this returns an error. + #[cfg(feature = "std")] + fn to_sized<A: StateID>(&self) -> Result<Repr<Vec<u8>, A>> { + // To build the new DFA, we proceed much like the initial construction + // of the sparse DFA. Namely, since the state ID size is changing, + // we don't actually know all of our state IDs until we've allocated + // all necessary space. So we do one pass that allocates all of the + // storage we need, and then another pass to fill in the transitions. 
+ + let mut trans = Vec::with_capacity(size_of::<A>() * self.state_count); + let mut map: HashMap<S, A> = HashMap::with_capacity(self.state_count); + for (old_id, state) in self.states() { + let pos = trans.len(); + map.insert(old_id, usize_to_state_id(pos)?); + + let n = state.ntrans; + let zeros = 2 + (n * 2) + (n * size_of::<A>()); + trans.extend(iter::repeat(0).take(zeros)); + + NativeEndian::write_u16(&mut trans[pos..], n as u16); + let (s, e) = (pos + 2, pos + 2 + (n * 2)); + trans[s..e].copy_from_slice(state.input_ranges); + } + + let mut new = Repr { + anchored: self.anchored, + start: map[&self.start], + state_count: self.state_count, + max_match: map[&self.max_match], + byte_classes: self.byte_classes.clone(), + trans, + }; + for (&old_id, &new_id) in map.iter() { + let old_state = self.state(old_id); + let mut new_state = new.state_mut(new_id); + for i in 0..new_state.ntrans { + let next = map[&old_state.next_at(i)]; + new_state.set_next_at(i, usize_to_state_id(next.to_usize())?); + } + } + new.start = map[&self.start]; + new.max_match = map[&self.max_match]; + Ok(new) + } + + /// Serialize a sparse DFA to raw bytes using the provided endianness. + /// + /// If the state identifier representation of this DFA has a size different + /// than 1, 2, 4 or 8 bytes, then this returns an error. All + /// implementations of `StateID` provided by this crate satisfy this + /// requirement. + /// + /// Unlike dense DFAs, the result is not necessarily aligned since a + /// sparse DFA's transition table is always read as a sequence of bytes. + #[cfg(feature = "std")] + fn to_bytes<A: ByteOrder>(&self) -> Result<Vec<u8>> { + let label = b"rust-regex-automata-sparse-dfa\x00"; + let size = + // For human readable label. + label.len() + // endiannes check, must be equal to 0xFEFF for native endian + + 2 + // For version number. + + 2 + // Size of state ID representation, in bytes. + // Must be 1, 2, 4 or 8. + + 2 + // For DFA misc options. (Currently unused.) 
+ + 2 + // For start state. + + 8 + // For state count. + + 8 + // For max match state. + + 8 + // For byte class map. + + 256 + // For transition table. + + self.trans().len(); + + let mut i = 0; + let mut buf = vec![0; size]; + + // write label + for &b in label { + buf[i] = b; + i += 1; + } + // endianness check + A::write_u16(&mut buf[i..], 0xFEFF); + i += 2; + // version number + A::write_u16(&mut buf[i..], 1); + i += 2; + // size of state ID + let state_size = size_of::<S>(); + if ![1, 2, 4, 8].contains(&state_size) { + return Err(Error::serialize(&format!( + "state size of {} not supported, must be 1, 2, 4 or 8", + state_size + ))); + } + A::write_u16(&mut buf[i..], state_size as u16); + i += 2; + // DFA misc options + let mut options = 0u16; + if self.anchored { + options |= dense::MASK_ANCHORED; + } + A::write_u16(&mut buf[i..], options); + i += 2; + // start state + A::write_u64(&mut buf[i..], self.start.to_usize() as u64); + i += 8; + // state count + A::write_u64(&mut buf[i..], self.state_count as u64); + i += 8; + // max match state + A::write_u64(&mut buf[i..], self.max_match.to_usize() as u64); + i += 8; + // byte class map + for b in (0..256).map(|b| b as u8) { + buf[i] = self.byte_classes.get(b); + i += 1; + } + // transition table + for (_, state) in self.states() { + A::write_u16(&mut buf[i..], state.ntrans as u16); + i += 2; + buf[i..i + (state.ntrans * 2)].copy_from_slice(state.input_ranges); + i += state.ntrans * 2; + for j in 0..state.ntrans { + write_state_id_bytes::<A, _>(&mut buf[i..], state.next_at(j)); + i += size_of::<S>(); + } + } + + assert_eq!(size, i, "expected to consume entire buffer"); + + Ok(buf) + } +} + +impl<'a, S: StateID> Repr<&'a [u8], S> { + /// The implementation for deserializing a sparse DFA from raw bytes. 
+ unsafe fn from_bytes(mut buf: &'a [u8]) -> Repr<&'a [u8], S> { + // skip over label + match buf.iter().position(|&b| b == b'\x00') { + None => panic!("could not find label"), + Some(i) => buf = &buf[i + 1..], + } + + // check that current endianness is same as endianness of DFA + let endian_check = NativeEndian::read_u16(buf); + buf = &buf[2..]; + if endian_check != 0xFEFF { + panic!( + "endianness mismatch, expected 0xFEFF but got 0x{:X}. \ + are you trying to load a SparseDFA serialized with a \ + different endianness?", + endian_check, + ); + } + + // check that the version number is supported + let version = NativeEndian::read_u16(buf); + buf = &buf[2..]; + if version != 1 { + panic!( + "expected version 1, but found unsupported version {}", + version, + ); + } + + // read size of state + let state_size = NativeEndian::read_u16(buf) as usize; + if state_size != size_of::<S>() { + panic!( + "state size of SparseDFA ({}) does not match \ + requested state size ({})", + state_size, + size_of::<S>(), + ); + } + buf = &buf[2..]; + + // read miscellaneous options + let opts = NativeEndian::read_u16(buf); + buf = &buf[2..]; + + // read start state + let start = S::from_usize(NativeEndian::read_u64(buf) as usize); + buf = &buf[8..]; + + // read state count + let state_count = NativeEndian::read_u64(buf) as usize; + buf = &buf[8..]; + + // read max match state + let max_match = S::from_usize(NativeEndian::read_u64(buf) as usize); + buf = &buf[8..]; + + // read byte classes + let byte_classes = ByteClasses::from_slice(&buf[..256]); + buf = &buf[256..]; + + Repr { + anchored: opts & dense::MASK_ANCHORED > 0, + start, + state_count, + max_match, + byte_classes, + trans: buf, + } + } +} + +#[cfg(feature = "std")] +impl<S: StateID> Repr<Vec<u8>, S> { + /// The implementation for constructing a sparse DFA from a dense DFA. 
    fn from_dense_sized<T: AsRef<[S]>, A: StateID>(
        dfa: &dense::Repr<T, S>,
    ) -> Result<Repr<Vec<u8>, A>> {
        // In order to build the transition table, we need to be able to write
        // state identifiers for each of the "next" transitions in each state.
        // Our state identifiers correspond to the byte offset in the
        // transition table at which the state is encoded. Therefore, we do not
        // actually know what the state identifiers are until we've allocated
        // exactly as much space as we need for each state. Thus, construction
        // of the transition table happens in two passes.
        //
        // In the first pass, we fill out the shell of each state, which
        // includes the transition count, the input byte ranges and zero-filled
        // space for the transitions. In this first pass, we also build up a
        // map from the state identifier index of the dense DFA to the state
        // identifier in this sparse DFA.
        //
        // In the second pass, we fill in the transitions based on the map
        // built in the first pass.

        let mut trans = Vec::with_capacity(size_of::<A>() * dfa.state_count());
        // remap[dense state index] -> sparse state ID (a byte offset into
        // `trans`); entries start as the dead state until assigned below.
        let mut remap: Vec<A> = vec![dead_id(); dfa.state_count()];
        for (old_id, state) in dfa.states() {
            let pos = trans.len();

            // Fails if this byte offset does not fit in the requested state
            // ID representation `A`.
            remap[dfa.state_id_to_index(old_id)] = usize_to_state_id(pos)?;
            // zero-filled space for the transition count
            trans.push(0);
            trans.push(0);

            let mut trans_count = 0;
            for (b1, b2, _) in state.sparse_transitions() {
                trans_count += 1;
                trans.push(b1);
                trans.push(b2);
            }
            // fill in the transition count
            NativeEndian::write_u16(&mut trans[pos..], trans_count);

            // zero-fill the actual transitions
            let zeros = trans_count as usize * size_of::<A>();
            trans.extend(iter::repeat(0).take(zeros));
        }

        let mut new = Repr {
            anchored: dfa.is_anchored(),
            start: remap[dfa.state_id_to_index(dfa.start_state())],
            state_count: dfa.state_count(),
            max_match: remap[dfa.state_id_to_index(dfa.max_match_state())],
            byte_classes: dfa.byte_classes().clone(),
            trans,
        };
        // Second pass: every state's final byte offset is now known, so
        // overwrite the zero-filled transition slots with remapped targets.
        for (old_id, old_state) in dfa.states() {
            let new_id = remap[dfa.state_id_to_index(old_id)];
            let mut new_state = new.state_mut(new_id);
            let sparse = old_state.sparse_transitions();
            for (i, (_, _, next)) in sparse.enumerate() {
                let next = remap[dfa.state_id_to_index(next)];
                new_state.set_next_at(i, next);
            }
        }
        Ok(new)
    }

    /// Return a convenient mutable representation of the given state.
    ///
    /// A state is encoded at byte offset `id` as: a u16 transition count
    /// `ntrans`, then `ntrans` (start, end) input byte pairs, then `ntrans`
    /// native-endian state IDs of `size_of::<S>()` bytes each.
    fn state_mut<'a>(&'a mut self, id: S) -> StateMut<'a, S> {
        let mut pos = id.to_usize();
        let ntrans = NativeEndian::read_u16(&self.trans[pos..]) as usize;
        pos += 2;

        let size = (ntrans * 2) + (ntrans * size_of::<S>());
        let ranges_and_next = &mut self.trans[pos..pos + size];
        let (input_ranges, next) = ranges_and_next.split_at_mut(ntrans * 2);
        StateMut { _state_id_repr: PhantomData, ntrans, input_ranges, next }
    }
}

#[cfg(feature = "std")]
impl<T: AsRef<[u8]>, S: StateID> fmt::Debug for Repr<T, S> {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        // Two-character prefix for each state line: 'D' = dead state,
        // '>' = start state, '*' = match state, space otherwise.
        fn state_status<T: AsRef<[u8]>, S: StateID>(
            dfa: &Repr<T, S>,
            id: S,
        ) -> &'static str {
            if id == dead_id() {
                if dfa.is_match_state(id) {
                    "D*"
                } else {
                    "D "
                }
            } else if id == dfa.start_state() {
                if dfa.is_match_state(id) {
                    ">*"
                } else {
                    "> "
                }
            } else {
                if dfa.is_match_state(id) {
                    " *"
                } else {
                    "  "
                }
            }
        }

        writeln!(f, "SparseDFA(")?;
        for (id, state) in self.states() {
            let status = state_status(self, id);
            writeln!(f, "{}{:06}: {:?}", status, id.to_usize(), state)?;
        }
        writeln!(f, ")")?;
        Ok(())
    }
}

/// An iterator over all states in a sparse DFA.
///
/// This iterator yields tuples, where the first element is the state ID and
/// the second element is the state itself.
#[cfg(feature = "std")]
#[derive(Debug)]
struct StateIter<'a, T: AsRef<[u8]> + 'a, S: StateID + 'a = usize> {
    dfa: &'a Repr<T, S>,
    // Byte offset of the next state to yield; starts at the first state and
    // is advanced past each state's encoding as the iterator progresses.
    id: S,
}

#[cfg(feature = "std")]
impl<'a, T: AsRef<[u8]>, S: StateID> Iterator for StateIter<'a, T, S> {
    type Item = (S, State<'a, S>);

    fn next(&mut self) -> Option<(S, State<'a, S>)> {
        // States are packed back-to-back, so iteration is done once the
        // offset runs off the end of the transition table.
        if self.id.to_usize() >= self.dfa.trans().len() {
            return None;
        }
        let id = self.id;
        let state = self.dfa.state(id);
        // The next state begins exactly `state.bytes()` past this one.
        self.id = S::from_usize(self.id.to_usize() + state.bytes());
        Some((id, state))
    }
}

/// A representation of a sparse DFA state that can be cheaply materialized
/// from a state identifier.
#[derive(Clone)]
struct State<'a, S: StateID = usize> {
    /// The state identifier representation used by the DFA from which this
    /// state was extracted. Since our transition table is compacted in a
    /// &[u8], we don't actually use the state ID type parameter explicitly
    /// anywhere, so we fake it. This prevents callers from using an incorrect
    /// state ID representation to read from this state.
    _state_id_repr: PhantomData<S>,
    /// The number of transitions in this state.
    ntrans: usize,
    /// Pairs of input ranges, where there is one pair for each transition.
    /// Each pair specifies an inclusive start and end byte range for the
    /// corresponding transition.
    input_ranges: &'a [u8],
    /// Transitions to the next state. This slice contains native endian
    /// encoded state identifiers, with `S` as the representation. Thus, there
    /// are `ntrans * size_of::<S>()` bytes in this slice.
    next: &'a [u8],
}

impl<'a, S: StateID> State<'a, S> {
    /// Searches for the next transition given an input byte. If no such
    /// transition could be found, then a dead state is returned.
    fn next(&self, input: u8) -> S {
        // This straight linear search was observed to be much better than
        // binary search on ASCII haystacks, likely because a binary search
        // visits the ASCII case last but a linear search sees it first. A
        // binary search does do a little better on non-ASCII haystacks, but
        // not by much. There might be a better trade off lurking here.
        for i in 0..self.ntrans {
            let (start, end) = self.range(i);
            if start <= input && input <= end {
                return self.next_at(i);
            }
            // We could bail early with an extra branch: if input < b1, then
            // we know we'll never find a matching transition. Interestingly,
            // this extra branch seems to not help performance, or will even
            // hurt it. It's likely very dependent on the DFA itself and what
            // is being searched.
        }
        dead_id()
    }

    /// Returns the inclusive input byte range for the ith transition in this
    /// state.
    fn range(&self, i: usize) -> (u8, u8) {
        (self.input_ranges[i * 2], self.input_ranges[i * 2 + 1])
    }

    /// Returns the next state for the ith transition in this state.
    fn next_at(&self, i: usize) -> S {
        // Decodes a native-endian state ID of size_of::<S>() bytes starting
        // at the ith slot of `next`.
        S::read_bytes(&self.next[i * size_of::<S>()..])
    }

    /// Return the total number of bytes that this state consumes in its
    /// encoded form.
    ///
    /// (2 bytes for the transition count, then the range pairs, then the
    /// next-state IDs.)
    #[cfg(feature = "std")]
    fn bytes(&self) -> usize {
        2 + (self.ntrans * 2) + (self.ntrans * size_of::<S>())
    }
}

#[cfg(feature = "std")]
impl<'a, S: StateID> fmt::Debug for State<'a, S> {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let mut transitions = vec![];
        for i in 0..self.ntrans {
            let next = self.next_at(i);
            // Transitions to the dead state are elided from the output.
            if next == dead_id() {
                continue;
            }

            let (start, end) = self.range(i);
            // Single-byte ranges print as `b => id`, wider ranges as
            // `b1-b2 => id`, with bytes escaped for readability.
            if start == end {
                transitions.push(format!(
                    "{} => {}",
                    escape(start),
                    next.to_usize()
                ));
            } else {
                transitions.push(format!(
                    "{}-{} => {}",
                    escape(start),
                    escape(end),
                    next.to_usize(),
                ));
            }
        }
        write!(f, "{}", transitions.join(", "))
    }
}

/// A representation of a mutable sparse DFA state that can be cheaply
/// materialized from a state identifier.
#[cfg(feature = "std")]
struct StateMut<'a, S: StateID = usize> {
    /// The state identifier representation used by the DFA from which this
    /// state was extracted. Since our transition table is compacted in a
    /// &[u8], we don't actually use the state ID type parameter explicitly
    /// anywhere, so we fake it. This prevents callers from using an incorrect
    /// state ID representation to read from this state.
    _state_id_repr: PhantomData<S>,
    /// The number of transitions in this state.
    ntrans: usize,
    /// Pairs of input ranges, where there is one pair for each transition.
    /// Each pair specifies an inclusive start and end byte range for the
    /// corresponding transition.
    input_ranges: &'a mut [u8],
    /// Transitions to the next state. This slice contains native endian
    /// encoded state identifiers, with `S` as the representation. Thus, there
    /// are `ntrans * size_of::<S>()` bytes in this slice.
    next: &'a mut [u8],
}

#[cfg(feature = "std")]
impl<'a, S: StateID> StateMut<'a, S> {
    /// Sets the ith transition to the given state.
    fn set_next_at(&mut self, i: usize, next: S) {
        // Encodes `next` in place as size_of::<S>() bytes at slot i.
        next.write_bytes(&mut self.next[i * size_of::<S>()..]);
    }
}

#[cfg(feature = "std")]
impl<'a, S: StateID> fmt::Debug for StateMut<'a, S> {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        // Delegate to the immutable State's Debug impl by reborrowing the
        // mutable slices as shared slices.
        let state = State {
            _state_id_repr: self._state_id_repr,
            ntrans: self.ntrans,
            input_ranges: self.input_ranges,
            next: self.next,
        };
        fmt::Debug::fmt(&state, f)
    }
}

/// Return the given byte as its escaped string form.
#[cfg(feature = "std")]
fn escape(b: u8) -> String {
    use std::ascii;

    String::from_utf8(ascii::escape_default(b).collect::<Vec<_>>()).unwrap()
}

/// A binary search routine specialized specifically to a sparse DFA state's
/// transitions. Specifically, the transitions are defined as a set of pairs
/// of input bytes that delineate an inclusive range of bytes. If the input
/// byte is in the range, then the corresponding transition is a match.
///
/// This binary search accepts a slice of these pairs and returns the position
/// of the matching pair (the ith transition), or None if no matching pair
/// could be found.
///
/// Note that this routine is not currently used since it was observed to
/// either decrease performance when searching ASCII, or did not provide enough
/// of a boost on non-ASCII haystacks to be worth it. However, we leave it here
/// for posterity in case we can find a way to use it.
///
/// In principle we could reuse the standard library's binary search by
/// viewing the `&[u8]` as a `&[(u8, u8)]`, but that cast is not currently
/// guaranteed to be sound (the in-memory layout of `(u8, u8)` has not been
/// nailed down), so we roll our own.
#[inline(always)]
#[allow(dead_code)]
fn binary_search_ranges(ranges: &[u8], needle: u8) -> Option<usize> {
    debug_assert!(ranges.len() % 2 == 0, "ranges must have even length");
    debug_assert!(ranges.len() <= 512, "ranges should be short");

    // Transition i occupies bytes 2*i (inclusive start) and 2*i+1 (inclusive
    // end), so the search runs over `ranges.len() / 2` pairs.
    let (mut lo, mut hi) = (0, ranges.len() / 2);
    while lo < hi {
        let mid = lo + (hi - lo) / 2;
        let start = ranges[mid * 2];
        let end = ranges[mid * 2 + 1];
        if needle > end {
            lo = mid + 1;
        } else if needle < start {
            hi = mid;
        } else {
            // start <= needle <= end, so transition `mid` matches.
            return Some(mid);
        }
    }
    None
}