diff options
Diffstat (limited to 'vendor/regex-automata/src/nfa/map.rs')
-rw-r--r-- | vendor/regex-automata/src/nfa/map.rs | 282 |
1 files changed, 282 insertions, 0 deletions
diff --git a/vendor/regex-automata/src/nfa/map.rs b/vendor/regex-automata/src/nfa/map.rs new file mode 100644 index 000000000..e636c0dd3 --- /dev/null +++ b/vendor/regex-automata/src/nfa/map.rs @@ -0,0 +1,282 @@ +// This module contains a couple simple and purpose built hash maps. The key +// trade off they make is that they serve as caches rather than true maps. That +// is, inserting a new entry may cause eviction of another entry. This gives +// us two things. First, there's less overhead associated with inserts and +// lookups. Secondly, it lets us control our memory usage. +// +// These maps are used in some fairly hot code when generating NFA states for +// large Unicode character classes. +// +// Instead of exposing a rich hashmap entry API, we just permit the caller +// to produce a hash of the key directly. The hash can then be reused for both +// lookups and insertions at the cost of leaking things a bit. But these are +// for internal use only, so it's fine. +// +// The Utf8BoundedMap is used for Daciuk's algorithm for constructing a +// (almost) minimal DFA for large Unicode character classes in linear time. +// (Daciuk's algorithm is always used when compiling forward NFAs. For reverse +// NFAs, it's only used when the compiler is configured to 'shrink' the NFA, +// since there's a bit more expense in the reverse direction.) +// +// The Utf8SuffixMap is used when compiling large Unicode character classes for +// reverse NFAs when 'shrink' is disabled. Specifically, it augments the naive +// construction of UTF-8 automata by caching common suffixes. This doesn't +// get the same space savings as Daciuk's algorithm, but it's basically as +// fast as the naive approach and typically winds up using less memory (since +// it generates smaller NFAs) despite the presence of the cache. +// +// These maps effectively represent caching mechanisms for CState::Sparse and +// CState::Range, respectively. The former represents a single NFA state with +// many transitions of equivalent priority while the latter represents a single +// NFA state with a single transition. (Neither state ever has or is an +// epsilon transition.) Thus, they have different key types. It's likely we +// could make one generic map, but the machinery didn't seem worth it. They +// are simple enough. + +use nfa::{StateID, Transition}; + +// Basic FNV-1a hash constants as described in: +// https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function +const PRIME: u64 = 1099511628211; +const INIT: u64 = 14695981039346656037; + +/// A bounded hash map where the key is a sequence of NFA transitions and the +/// value is a pre-existing NFA state ID. +/// +/// std's hashmap can be used for this, however, this map has two important +/// advantages. Firstly, it has lower overhead. Secondly, it permits us to +/// control our memory usage by limited the number of slots. In general, the +/// cost here is that this map acts as a cache. That is, inserting a new entry +/// may remove an old entry. We are okay with this, since it does not impact +/// correctness in the cases where it is used. The only effect that dropping +/// states from the cache has is that the resulting NFA generated may be bigger +/// than it otherwise would be. +/// +/// This improves benchmarks that compile large Unicode character classes, +/// since it makes the generation of (almost) minimal UTF-8 automaton faster. +/// Specifically, one could observe the difference with std's hashmap via +/// something like the following benchmark: +/// +/// hyperfine "regex-automata-debug debug -acqr '\w{40} ecurB'" +/// +/// But to observe that difference, you'd have to modify the code to use +/// std's hashmap. +/// +/// It is quite possible that there is a better way to approach this problem. +/// For example, if there happens to be a very common state that collides with +/// a lot of less frequent states, then we could wind up with very poor caching +/// behavior. Alas, the effectiveness of this cache has not been measured. +/// Instead, ad hoc experiments suggest that it is "good enough." Additional +/// smarts (such as an LRU eviction policy) have to be weighed against the +/// amount of extra time they cost. +#[derive(Clone, Debug)] +pub struct Utf8BoundedMap { + /// The current version of this map. Only entries with matching versions + /// are considered during lookups. If an entry is found with a mismatched + /// version, then the map behaves as if the entry does not exist. + version: u16, + /// The total number of entries this map can store. + capacity: usize, + /// The actual entries, keyed by hash. Collisions between different states + /// result in the old state being dropped. + map: Vec<Utf8BoundedEntry>, +} + +/// An entry in this map. +#[derive(Clone, Debug, Default)] +struct Utf8BoundedEntry { + /// The version of the map used to produce this entry. If this entry's + /// version does not match the current version of the map, then the map + /// should behave as if this entry does not exist. + version: u16, + /// The key, which is a sorted sequence of non-overlapping NFA transitions. + key: Vec<Transition>, + /// The state ID corresponding to the state containing the transitions in + /// this entry. + val: StateID, +} + +impl Utf8BoundedMap { + /// Create a new bounded map with the given capacity. The map will never + /// grow beyond the given size. + /// + /// Note that this does not allocate. Instead, callers must call `clear` + /// before using this map. `clear` will allocate space if necessary. + /// + /// This avoids the need to pay for the allocation of this map when + /// compiling regexes that lack large Unicode character classes. + pub fn new(capacity: usize) -> Utf8BoundedMap { + assert!(capacity > 0); + Utf8BoundedMap { version: 0, capacity, map: vec![] } + } + + /// Clear this map of all entries, but permit the reuse of allocation + /// if possible. + /// + /// This must be called before the map can be used. + pub fn clear(&mut self) { + if self.map.is_empty() { + self.map = vec![Utf8BoundedEntry::default(); self.capacity]; + } else { + self.version = self.version.wrapping_add(1); + if self.version == 0 { + self.map = vec![Utf8BoundedEntry::default(); self.capacity]; + } + } + } + + /// Return a hash of the given transitions. + pub fn hash(&self, key: &[Transition]) -> usize { + let mut h = INIT; + for t in key { + h = (h ^ (t.start as u64)).wrapping_mul(PRIME); + h = (h ^ (t.end as u64)).wrapping_mul(PRIME); + h = (h ^ (t.next as u64)).wrapping_mul(PRIME); + } + (h as usize) % self.map.len() + } + + /// Retrieve the cached state ID corresponding to the given key. The hash + /// given must have been computed with `hash` using the same key value. + /// + /// If there is no cached state with the given transitions, then None is + /// returned. + pub fn get(&mut self, key: &[Transition], hash: usize) -> Option<StateID> { + let entry = &self.map[hash]; + if entry.version != self.version { + return None; + } + // There may be a hash collision, so we need to confirm real equality. + if entry.key != key { + return None; + } + Some(entry.val) + } + + /// Add a cached state to this map with the given key. Callers should + /// ensure that `state_id` points to a state that contains precisely the + /// NFA transitions given. + /// + /// `hash` must have been computed using the `hash` method with the same + /// key. + pub fn set( + &mut self, + key: Vec<Transition>, + hash: usize, + state_id: StateID, + ) { + self.map[hash] = + Utf8BoundedEntry { version: self.version, key, val: state_id }; + } +} + +/// A cache of suffixes used to modestly compress UTF-8 automata for large +/// Unicode character classes. +#[derive(Clone, Debug)] +pub struct Utf8SuffixMap { + /// The current version of this map. Only entries with matching versions + /// are considered during lookups. If an entry is found with a mismatched + /// version, then the map behaves as if the entry does not exist. + version: u16, + /// The total number of entries this map can store. + capacity: usize, + /// The actual entries, keyed by hash. Collisions between different states + /// result in the old state being dropped. + map: Vec<Utf8SuffixEntry>, +} + +/// A key that uniquely identifies an NFA state. It is a triple that represents +/// a transition from one state for a particular byte range. +#[derive(Clone, Debug, Default, Eq, PartialEq)] +pub struct Utf8SuffixKey { + pub from: StateID, + pub start: u8, + pub end: u8, +} + +/// An entry in this map. +#[derive(Clone, Debug, Default)] +struct Utf8SuffixEntry { + /// The version of the map used to produce this entry. If this entry's + /// version does not match the current version of the map, then the map + /// should behave as if this entry does not exist. + version: u16, + /// The key, which consists of a transition in a particular state. + key: Utf8SuffixKey, + /// The identifier that the transition in the key maps to. + val: StateID, +} + +impl Utf8SuffixMap { + /// Create a new bounded map with the given capacity. The map will never + /// grow beyond the given size. + /// + /// Note that this does not allocate. Instead, callers must call `clear` + /// before using this map. `clear` will allocate space if necessary. + /// + /// This avoids the need to pay for the allocation of this map when + /// compiling regexes that lack large Unicode character classes. + pub fn new(capacity: usize) -> Utf8SuffixMap { + assert!(capacity > 0); + Utf8SuffixMap { version: 0, capacity, map: vec![] } + } + + /// Clear this map of all entries, but permit the reuse of allocation + /// if possible. + /// + /// This must be called before the map can be used. + pub fn clear(&mut self) { + if self.map.is_empty() { + self.map = vec![Utf8SuffixEntry::default(); self.capacity]; + } else { + self.version = self.version.wrapping_add(1); + if self.version == 0 { + self.map = vec![Utf8SuffixEntry::default(); self.capacity]; + } + } + } + + /// Return a hash of the given transition. + pub fn hash(&self, key: &Utf8SuffixKey) -> usize { + // Basic FNV-1a hash as described: + // https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function + const PRIME: u64 = 1099511628211; + const INIT: u64 = 14695981039346656037; + + let mut h = INIT; + h = (h ^ (key.from as u64)).wrapping_mul(PRIME); + h = (h ^ (key.start as u64)).wrapping_mul(PRIME); + h = (h ^ (key.end as u64)).wrapping_mul(PRIME); + (h as usize) % self.map.len() + } + + /// Retrieve the cached state ID corresponding to the given key. The hash + /// given must have been computed with `hash` using the same key value. + /// + /// If there is no cached state with the given key, then None is returned. + pub fn get( + &mut self, + key: &Utf8SuffixKey, + hash: usize, + ) -> Option<StateID> { + let entry = &self.map[hash]; + if entry.version != self.version { + return None; + } + if key != &entry.key { + return None; + } + Some(entry.val) + } + + /// Add a cached state to this map with the given key. Callers should + /// ensure that `state_id` points to a state that contains precisely the + /// NFA transition given. + /// + /// `hash` must have been computed using the `hash` method with the same + /// key. + pub fn set(&mut self, key: Utf8SuffixKey, hash: usize, state_id: StateID) { + self.map[hash] = + Utf8SuffixEntry { version: self.version, key, val: state_id }; + } +} |