diff options
Diffstat (limited to 'third_party/rust/regex/src/prog.rs')
-rw-r--r-- | third_party/rust/regex/src/prog.rs | 447 |
1 files changed, 447 insertions, 0 deletions
diff --git a/third_party/rust/regex/src/prog.rs b/third_party/rust/regex/src/prog.rs new file mode 100644 index 0000000000..c211f71d8a --- /dev/null +++ b/third_party/rust/regex/src/prog.rs @@ -0,0 +1,447 @@ +use std::cmp::Ordering; +use std::collections::HashMap; +use std::fmt; +use std::mem; +use std::ops::Deref; +use std::slice; +use std::sync::Arc; + +use crate::input::Char; +use crate::literal::LiteralSearcher; + +/// `InstPtr` represents the index of an instruction in a regex program. +pub type InstPtr = usize; + +/// Program is a sequence of instructions and various facts about thos +/// instructions. +#[derive(Clone)] +pub struct Program { + /// A sequence of instructions that represents an NFA. + pub insts: Vec<Inst>, + /// Pointers to each Match instruction in the sequence. + /// + /// This is always length 1 unless this program represents a regex set. + pub matches: Vec<InstPtr>, + /// The ordered sequence of all capture groups extracted from the AST. + /// Unnamed groups are `None`. + pub captures: Vec<Option<String>>, + /// Pointers to all named capture groups into `captures`. + pub capture_name_idx: Arc<HashMap<String, usize>>, + /// A pointer to the start instruction. This can vary depending on how + /// the program was compiled. For example, programs for use with the DFA + /// engine have a `.*?` inserted at the beginning of unanchored regular + /// expressions. The actual starting point of the program is after the + /// `.*?`. + pub start: InstPtr, + /// A set of equivalence classes for discriminating bytes in the compiled + /// program. + pub byte_classes: Vec<u8>, + /// When true, this program can only match valid UTF-8. + pub only_utf8: bool, + /// When true, this program uses byte range instructions instead of Unicode + /// range instructions. + pub is_bytes: bool, + /// When true, the program is compiled for DFA matching. For example, this + /// implies `is_bytes` and also inserts a preceding `.*?` for unanchored + /// regexes. + pub is_dfa: bool, + /// When true, the program matches text in reverse (for use only in the + /// DFA). + pub is_reverse: bool, + /// Whether the regex must match from the start of the input. + pub is_anchored_start: bool, + /// Whether the regex must match at the end of the input. + pub is_anchored_end: bool, + /// Whether this program contains a Unicode word boundary instruction. + pub has_unicode_word_boundary: bool, + /// A possibly empty machine for very quickly matching prefix literals. + pub prefixes: LiteralSearcher, + /// A limit on the size of the cache that the DFA is allowed to use while + /// matching. + /// + /// The cache limit specifies approximately how much space we're willing to + /// give to the state cache. Once the state cache exceeds the size, it is + /// wiped and all states must be re-computed. + /// + /// Note that this value does not impact correctness. It can be set to 0 + /// and the DFA will run just fine. (It will only ever store exactly one + /// state in the cache, and will likely run very slowly, but it will work.) + /// + /// Also note that this limit is *per thread of execution*. That is, + /// if the same regex is used to search text across multiple threads + /// simultaneously, then the DFA cache is not shared. Instead, copies are + /// made. + pub dfa_size_limit: usize, +} + +impl Program { + /// Creates an empty instruction sequence. Fields are given default + /// values. + pub fn new() -> Self { + Program { + insts: vec![], + matches: vec![], + captures: vec![], + capture_name_idx: Arc::new(HashMap::new()), + start: 0, + byte_classes: vec![0; 256], + only_utf8: true, + is_bytes: false, + is_dfa: false, + is_reverse: false, + is_anchored_start: false, + is_anchored_end: false, + has_unicode_word_boundary: false, + prefixes: LiteralSearcher::empty(), + dfa_size_limit: 2 * (1 << 20), + } + } + + /// If pc is an index to a no-op instruction (like Save), then return the + /// next pc that is not a no-op instruction. + pub fn skip(&self, mut pc: usize) -> usize { + loop { + match self[pc] { + Inst::Save(ref i) => pc = i.goto, + _ => return pc, + } + } + } + + /// Return true if and only if an execution engine at instruction `pc` will + /// always lead to a match. + pub fn leads_to_match(&self, pc: usize) -> bool { + if self.matches.len() > 1 { + // If we have a regex set, then we have more than one ending + // state, so leading to one of those states is generally + // meaningless. + return false; + } + match self[self.skip(pc)] { + Inst::Match(_) => true, + _ => false, + } + } + + /// Returns true if the current configuration demands that an implicit + /// `.*?` be prepended to the instruction sequence. + pub fn needs_dotstar(&self) -> bool { + self.is_dfa && !self.is_reverse && !self.is_anchored_start + } + + /// Returns true if this program uses Byte instructions instead of + /// Char/Range instructions. + pub fn uses_bytes(&self) -> bool { + self.is_bytes || self.is_dfa + } + + /// Returns true if this program exclusively matches valid UTF-8 bytes. + /// + /// That is, if an invalid UTF-8 byte is seen, then no match is possible. + pub fn only_utf8(&self) -> bool { + self.only_utf8 + } + + /// Return the approximate heap usage of this instruction sequence in + /// bytes. + pub fn approximate_size(&self) -> usize { + // The only instruction that uses heap space is Ranges (for + // Unicode codepoint programs) to store non-overlapping codepoint + // ranges. To keep this operation constant time, we ignore them. + (self.len() * mem::size_of::<Inst>()) + + (self.matches.len() * mem::size_of::<InstPtr>()) + + (self.captures.len() * mem::size_of::<Option<String>>()) + + (self.capture_name_idx.len() + * (mem::size_of::<String>() + mem::size_of::<usize>())) + + (self.byte_classes.len() * mem::size_of::<u8>()) + + self.prefixes.approximate_size() + } +} + +impl Deref for Program { + type Target = [Inst]; + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn deref(&self) -> &Self::Target { + &*self.insts + } +} + +impl fmt::Debug for Program { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + use self::Inst::*; + + fn with_goto(cur: usize, goto: usize, fmtd: String) -> String { + if goto == cur + 1 { + fmtd + } else { + format!("{} (goto: {})", fmtd, goto) + } + } + + fn visible_byte(b: u8) -> String { + use std::ascii::escape_default; + let escaped = escape_default(b).collect::<Vec<u8>>(); + String::from_utf8_lossy(&escaped).into_owned() + } + + for (pc, inst) in self.iter().enumerate() { + match *inst { + Match(slot) => write!(f, "{:04} Match({:?})", pc, slot)?, + Save(ref inst) => { + let s = format!("{:04} Save({})", pc, inst.slot); + write!(f, "{}", with_goto(pc, inst.goto, s))?; + } + Split(ref inst) => { + write!( + f, + "{:04} Split({}, {})", + pc, inst.goto1, inst.goto2 + )?; + } + EmptyLook(ref inst) => { + let s = format!("{:?}", inst.look); + write!(f, "{:04} {}", pc, with_goto(pc, inst.goto, s))?; + } + Char(ref inst) => { + let s = format!("{:?}", inst.c); + write!(f, "{:04} {}", pc, with_goto(pc, inst.goto, s))?; + } + Ranges(ref inst) => { + let ranges = inst + .ranges + .iter() + .map(|r| format!("{:?}-{:?}", r.0, r.1)) + .collect::<Vec<String>>() + .join(", "); + write!( + f, + "{:04} {}", + pc, + with_goto(pc, inst.goto, ranges) + )?; + } + Bytes(ref inst) => { + let s = format!( + "Bytes({}, {})", + visible_byte(inst.start), + visible_byte(inst.end) + ); + write!(f, "{:04} {}", pc, with_goto(pc, inst.goto, s))?; + } + } + if pc == self.start { + write!(f, " (start)")?; + } + writeln!(f)?; + } + Ok(()) + } +} + +impl<'a> IntoIterator for &'a Program { + type Item = &'a Inst; + type IntoIter = slice::Iter<'a, Inst>; + fn into_iter(self) -> Self::IntoIter { + self.iter() + } +} + +/// Inst is an instruction code in a Regex program. +/// +/// Regrettably, a regex program either contains Unicode codepoint +/// instructions (Char and Ranges) or it contains byte instructions (Bytes). +/// A regex program can never contain both. +/// +/// It would be worth investigating splitting this into two distinct types and +/// then figuring out how to make the matching engines polymorphic over those +/// types without sacrificing performance. +/// +/// Other than the benefit of moving invariants into the type system, another +/// benefit is the decreased size. If we remove the `Char` and `Ranges` +/// instructions from the `Inst` enum, then its size shrinks from 32 bytes to +/// 24 bytes. (This is because of the removal of a `Box<[]>` in the `Ranges` +/// variant.) Given that byte based machines are typically much bigger than +/// their Unicode analogues (because they can decode UTF-8 directly), this ends +/// up being a pretty significant savings. +#[derive(Clone, Debug)] +pub enum Inst { + /// Match indicates that the program has reached a match state. + /// + /// The number in the match corresponds to the Nth logical regular + /// expression in this program. This index is always 0 for normal regex + /// programs. Values greater than 0 appear when compiling regex sets, and + /// each match instruction gets its own unique value. The value corresponds + /// to the Nth regex in the set. + Match(usize), + /// Save causes the program to save the current location of the input in + /// the slot indicated by InstSave. + Save(InstSave), + /// Split causes the program to diverge to one of two paths in the + /// program, preferring goto1 in InstSplit. + Split(InstSplit), + /// EmptyLook represents a zero-width assertion in a regex program. A + /// zero-width assertion does not consume any of the input text. + EmptyLook(InstEmptyLook), + /// Char requires the regex program to match the character in InstChar at + /// the current position in the input. + Char(InstChar), + /// Ranges requires the regex program to match the character at the current + /// position in the input with one of the ranges specified in InstRanges. + Ranges(InstRanges), + /// Bytes is like Ranges, except it expresses a single byte range. It is + /// used in conjunction with Split instructions to implement multi-byte + /// character classes. + Bytes(InstBytes), +} + +impl Inst { + /// Returns true if and only if this is a match instruction. + pub fn is_match(&self) -> bool { + match *self { + Inst::Match(_) => true, + _ => false, + } + } +} + +/// Representation of the Save instruction. +#[derive(Clone, Debug)] +pub struct InstSave { + /// The next location to execute in the program. + pub goto: InstPtr, + /// The capture slot (there are two slots for every capture in a regex, + /// including the zeroth capture for the entire match). + pub slot: usize, +} + +/// Representation of the Split instruction. +#[derive(Clone, Debug)] +pub struct InstSplit { + /// The first instruction to try. A match resulting from following goto1 + /// has precedence over a match resulting from following goto2. + pub goto1: InstPtr, + /// The second instruction to try. A match resulting from following goto1 + /// has precedence over a match resulting from following goto2. + pub goto2: InstPtr, +} + +/// Representation of the `EmptyLook` instruction. +#[derive(Clone, Debug)] +pub struct InstEmptyLook { + /// The next location to execute in the program if this instruction + /// succeeds. + pub goto: InstPtr, + /// The type of zero-width assertion to check. + pub look: EmptyLook, +} + +/// The set of zero-width match instructions. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum EmptyLook { + /// Start of line or input. + StartLine, + /// End of line or input. + EndLine, + /// Start of input. + StartText, + /// End of input. + EndText, + /// Word character on one side and non-word character on other. + WordBoundary, + /// Word character on both sides or non-word character on both sides. + NotWordBoundary, + /// ASCII word boundary. + WordBoundaryAscii, + /// Not ASCII word boundary. + NotWordBoundaryAscii, +} + +/// Representation of the Char instruction. +#[derive(Clone, Debug)] +pub struct InstChar { + /// The next location to execute in the program if this instruction + /// succeeds. + pub goto: InstPtr, + /// The character to test. + pub c: char, +} + +/// Representation of the Ranges instruction. +#[derive(Clone, Debug)] +pub struct InstRanges { + /// The next location to execute in the program if this instruction + /// succeeds. + pub goto: InstPtr, + /// The set of Unicode scalar value ranges to test. + pub ranges: Box<[(char, char)]>, +} + +impl InstRanges { + /// Tests whether the given input character matches this instruction. + pub fn matches(&self, c: Char) -> bool { + // This speeds up the `match_class_unicode` benchmark by checking + // some common cases quickly without binary search. e.g., Matching + // a Unicode class on predominantly ASCII text. + for r in self.ranges.iter().take(4) { + if c < r.0 { + return false; + } + if c <= r.1 { + return true; + } + } + self.ranges + .binary_search_by(|r| { + if r.1 < c { + Ordering::Less + } else if r.0 > c { + Ordering::Greater + } else { + Ordering::Equal + } + }) + .is_ok() + } + + /// Return the number of distinct characters represented by all of the + /// ranges. + pub fn num_chars(&self) -> usize { + self.ranges + .iter() + .map(|&(s, e)| 1 + (e as u32) - (s as u32)) + .sum::<u32>() as usize + } +} + +/// Representation of the Bytes instruction. +#[derive(Clone, Debug)] +pub struct InstBytes { + /// The next location to execute in the program if this instruction + /// succeeds. + pub goto: InstPtr, + /// The start (inclusive) of this byte range. + pub start: u8, + /// The end (inclusive) of this byte range. + pub end: u8, +} + +impl InstBytes { + /// Returns true if and only if the given byte is in this range. + pub fn matches(&self, byte: u8) -> bool { + self.start <= byte && byte <= self.end + } +} + +#[cfg(test)] +mod test { + #[test] + #[cfg(target_pointer_width = "64")] + fn test_size_of_inst() { + use std::mem::size_of; + + use super::Inst; + + assert_eq!(32, size_of::<Inst>()); + } +} |