summaryrefslogtreecommitdiffstats
path: root/third_party/rust/regex/src/prog.rs
diff options
context:
space:
mode:
Diffstat (limited to 'third_party/rust/regex/src/prog.rs')
-rw-r--r--third_party/rust/regex/src/prog.rs447
1 files changed, 447 insertions, 0 deletions
diff --git a/third_party/rust/regex/src/prog.rs b/third_party/rust/regex/src/prog.rs
new file mode 100644
index 0000000000..c211f71d8a
--- /dev/null
+++ b/third_party/rust/regex/src/prog.rs
@@ -0,0 +1,447 @@
+use std::cmp::Ordering;
+use std::collections::HashMap;
+use std::fmt;
+use std::mem;
+use std::ops::Deref;
+use std::slice;
+use std::sync::Arc;
+
+use crate::input::Char;
+use crate::literal::LiteralSearcher;
+
+/// `InstPtr` represents the index of an instruction in a regex program.
+pub type InstPtr = usize;
+
+/// Program is a sequence of instructions and various facts about thos
+/// instructions.
+#[derive(Clone)]
+pub struct Program {
+ /// A sequence of instructions that represents an NFA.
+ pub insts: Vec<Inst>,
+ /// Pointers to each Match instruction in the sequence.
+ ///
+ /// This is always length 1 unless this program represents a regex set.
+ pub matches: Vec<InstPtr>,
+ /// The ordered sequence of all capture groups extracted from the AST.
+ /// Unnamed groups are `None`.
+ pub captures: Vec<Option<String>>,
+ /// Pointers to all named capture groups into `captures`.
+ pub capture_name_idx: Arc<HashMap<String, usize>>,
+ /// A pointer to the start instruction. This can vary depending on how
+ /// the program was compiled. For example, programs for use with the DFA
+ /// engine have a `.*?` inserted at the beginning of unanchored regular
+ /// expressions. The actual starting point of the program is after the
+ /// `.*?`.
+ pub start: InstPtr,
+ /// A set of equivalence classes for discriminating bytes in the compiled
+ /// program.
+ pub byte_classes: Vec<u8>,
+ /// When true, this program can only match valid UTF-8.
+ pub only_utf8: bool,
+ /// When true, this program uses byte range instructions instead of Unicode
+ /// range instructions.
+ pub is_bytes: bool,
+ /// When true, the program is compiled for DFA matching. For example, this
+ /// implies `is_bytes` and also inserts a preceding `.*?` for unanchored
+ /// regexes.
+ pub is_dfa: bool,
+ /// When true, the program matches text in reverse (for use only in the
+ /// DFA).
+ pub is_reverse: bool,
+ /// Whether the regex must match from the start of the input.
+ pub is_anchored_start: bool,
+ /// Whether the regex must match at the end of the input.
+ pub is_anchored_end: bool,
+ /// Whether this program contains a Unicode word boundary instruction.
+ pub has_unicode_word_boundary: bool,
+ /// A possibly empty machine for very quickly matching prefix literals.
+ pub prefixes: LiteralSearcher,
+ /// A limit on the size of the cache that the DFA is allowed to use while
+ /// matching.
+ ///
+ /// The cache limit specifies approximately how much space we're willing to
+ /// give to the state cache. Once the state cache exceeds the size, it is
+ /// wiped and all states must be re-computed.
+ ///
+ /// Note that this value does not impact correctness. It can be set to 0
+ /// and the DFA will run just fine. (It will only ever store exactly one
+ /// state in the cache, and will likely run very slowly, but it will work.)
+ ///
+ /// Also note that this limit is *per thread of execution*. That is,
+ /// if the same regex is used to search text across multiple threads
+ /// simultaneously, then the DFA cache is not shared. Instead, copies are
+ /// made.
+ pub dfa_size_limit: usize,
+}
+
+impl Program {
+ /// Creates an empty instruction sequence. Fields are given default
+ /// values.
+ pub fn new() -> Self {
+ Program {
+ insts: vec![],
+ matches: vec![],
+ captures: vec![],
+ capture_name_idx: Arc::new(HashMap::new()),
+ start: 0,
+ byte_classes: vec![0; 256],
+ only_utf8: true,
+ is_bytes: false,
+ is_dfa: false,
+ is_reverse: false,
+ is_anchored_start: false,
+ is_anchored_end: false,
+ has_unicode_word_boundary: false,
+ prefixes: LiteralSearcher::empty(),
+ dfa_size_limit: 2 * (1 << 20),
+ }
+ }
+
+ /// If pc is an index to a no-op instruction (like Save), then return the
+ /// next pc that is not a no-op instruction.
+ pub fn skip(&self, mut pc: usize) -> usize {
+ loop {
+ match self[pc] {
+ Inst::Save(ref i) => pc = i.goto,
+ _ => return pc,
+ }
+ }
+ }
+
+ /// Return true if and only if an execution engine at instruction `pc` will
+ /// always lead to a match.
+ pub fn leads_to_match(&self, pc: usize) -> bool {
+ if self.matches.len() > 1 {
+ // If we have a regex set, then we have more than one ending
+ // state, so leading to one of those states is generally
+ // meaningless.
+ return false;
+ }
+ match self[self.skip(pc)] {
+ Inst::Match(_) => true,
+ _ => false,
+ }
+ }
+
+ /// Returns true if the current configuration demands that an implicit
+ /// `.*?` be prepended to the instruction sequence.
+ pub fn needs_dotstar(&self) -> bool {
+ self.is_dfa && !self.is_reverse && !self.is_anchored_start
+ }
+
+ /// Returns true if this program uses Byte instructions instead of
+ /// Char/Range instructions.
+ pub fn uses_bytes(&self) -> bool {
+ self.is_bytes || self.is_dfa
+ }
+
+ /// Returns true if this program exclusively matches valid UTF-8 bytes.
+ ///
+ /// That is, if an invalid UTF-8 byte is seen, then no match is possible.
+ pub fn only_utf8(&self) -> bool {
+ self.only_utf8
+ }
+
+ /// Return the approximate heap usage of this instruction sequence in
+ /// bytes.
+ pub fn approximate_size(&self) -> usize {
+ // The only instruction that uses heap space is Ranges (for
+ // Unicode codepoint programs) to store non-overlapping codepoint
+ // ranges. To keep this operation constant time, we ignore them.
+ (self.len() * mem::size_of::<Inst>())
+ + (self.matches.len() * mem::size_of::<InstPtr>())
+ + (self.captures.len() * mem::size_of::<Option<String>>())
+ + (self.capture_name_idx.len()
+ * (mem::size_of::<String>() + mem::size_of::<usize>()))
+ + (self.byte_classes.len() * mem::size_of::<u8>())
+ + self.prefixes.approximate_size()
+ }
+}
+
+impl Deref for Program {
+ type Target = [Inst];
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn deref(&self) -> &Self::Target {
+ &*self.insts
+ }
+}
+
+impl fmt::Debug for Program {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ use self::Inst::*;
+
+ fn with_goto(cur: usize, goto: usize, fmtd: String) -> String {
+ if goto == cur + 1 {
+ fmtd
+ } else {
+ format!("{} (goto: {})", fmtd, goto)
+ }
+ }
+
+ fn visible_byte(b: u8) -> String {
+ use std::ascii::escape_default;
+ let escaped = escape_default(b).collect::<Vec<u8>>();
+ String::from_utf8_lossy(&escaped).into_owned()
+ }
+
+ for (pc, inst) in self.iter().enumerate() {
+ match *inst {
+ Match(slot) => write!(f, "{:04} Match({:?})", pc, slot)?,
+ Save(ref inst) => {
+ let s = format!("{:04} Save({})", pc, inst.slot);
+ write!(f, "{}", with_goto(pc, inst.goto, s))?;
+ }
+ Split(ref inst) => {
+ write!(
+ f,
+ "{:04} Split({}, {})",
+ pc, inst.goto1, inst.goto2
+ )?;
+ }
+ EmptyLook(ref inst) => {
+ let s = format!("{:?}", inst.look);
+ write!(f, "{:04} {}", pc, with_goto(pc, inst.goto, s))?;
+ }
+ Char(ref inst) => {
+ let s = format!("{:?}", inst.c);
+ write!(f, "{:04} {}", pc, with_goto(pc, inst.goto, s))?;
+ }
+ Ranges(ref inst) => {
+ let ranges = inst
+ .ranges
+ .iter()
+ .map(|r| format!("{:?}-{:?}", r.0, r.1))
+ .collect::<Vec<String>>()
+ .join(", ");
+ write!(
+ f,
+ "{:04} {}",
+ pc,
+ with_goto(pc, inst.goto, ranges)
+ )?;
+ }
+ Bytes(ref inst) => {
+ let s = format!(
+ "Bytes({}, {})",
+ visible_byte(inst.start),
+ visible_byte(inst.end)
+ );
+ write!(f, "{:04} {}", pc, with_goto(pc, inst.goto, s))?;
+ }
+ }
+ if pc == self.start {
+ write!(f, " (start)")?;
+ }
+ writeln!(f)?;
+ }
+ Ok(())
+ }
+}
+
+impl<'a> IntoIterator for &'a Program {
+ type Item = &'a Inst;
+ type IntoIter = slice::Iter<'a, Inst>;
+ fn into_iter(self) -> Self::IntoIter {
+ self.iter()
+ }
+}
+
+/// Inst is an instruction code in a Regex program.
+///
+/// Regrettably, a regex program either contains Unicode codepoint
+/// instructions (Char and Ranges) or it contains byte instructions (Bytes).
+/// A regex program can never contain both.
+///
+/// It would be worth investigating splitting this into two distinct types and
+/// then figuring out how to make the matching engines polymorphic over those
+/// types without sacrificing performance.
+///
+/// Other than the benefit of moving invariants into the type system, another
+/// benefit is the decreased size. If we remove the `Char` and `Ranges`
+/// instructions from the `Inst` enum, then its size shrinks from 32 bytes to
+/// 24 bytes. (This is because of the removal of a `Box<[]>` in the `Ranges`
+/// variant.) Given that byte based machines are typically much bigger than
+/// their Unicode analogues (because they can decode UTF-8 directly), this ends
+/// up being a pretty significant savings.
+#[derive(Clone, Debug)]
+pub enum Inst {
+ /// Match indicates that the program has reached a match state.
+ ///
+ /// The number in the match corresponds to the Nth logical regular
+ /// expression in this program. This index is always 0 for normal regex
+ /// programs. Values greater than 0 appear when compiling regex sets, and
+ /// each match instruction gets its own unique value. The value corresponds
+ /// to the Nth regex in the set.
+ Match(usize),
+ /// Save causes the program to save the current location of the input in
+ /// the slot indicated by InstSave.
+ Save(InstSave),
+ /// Split causes the program to diverge to one of two paths in the
+ /// program, preferring goto1 in InstSplit.
+ Split(InstSplit),
+ /// EmptyLook represents a zero-width assertion in a regex program. A
+ /// zero-width assertion does not consume any of the input text.
+ EmptyLook(InstEmptyLook),
+ /// Char requires the regex program to match the character in InstChar at
+ /// the current position in the input.
+ Char(InstChar),
+ /// Ranges requires the regex program to match the character at the current
+ /// position in the input with one of the ranges specified in InstRanges.
+ Ranges(InstRanges),
+ /// Bytes is like Ranges, except it expresses a single byte range. It is
+ /// used in conjunction with Split instructions to implement multi-byte
+ /// character classes.
+ Bytes(InstBytes),
+}
+
+impl Inst {
+ /// Returns true if and only if this is a match instruction.
+ pub fn is_match(&self) -> bool {
+ match *self {
+ Inst::Match(_) => true,
+ _ => false,
+ }
+ }
+}
+
+/// Representation of the Save instruction.
+#[derive(Clone, Debug)]
+pub struct InstSave {
+ /// The next location to execute in the program.
+ pub goto: InstPtr,
+ /// The capture slot (there are two slots for every capture in a regex,
+ /// including the zeroth capture for the entire match).
+ pub slot: usize,
+}
+
+/// Representation of the Split instruction.
+#[derive(Clone, Debug)]
+pub struct InstSplit {
+ /// The first instruction to try. A match resulting from following goto1
+ /// has precedence over a match resulting from following goto2.
+ pub goto1: InstPtr,
+ /// The second instruction to try. A match resulting from following goto1
+ /// has precedence over a match resulting from following goto2.
+ pub goto2: InstPtr,
+}
+
+/// Representation of the `EmptyLook` instruction.
+#[derive(Clone, Debug)]
+pub struct InstEmptyLook {
+ /// The next location to execute in the program if this instruction
+ /// succeeds.
+ pub goto: InstPtr,
+ /// The type of zero-width assertion to check.
+ pub look: EmptyLook,
+}
+
+/// The set of zero-width match instructions.
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub enum EmptyLook {
+ /// Start of line or input.
+ StartLine,
+ /// End of line or input.
+ EndLine,
+ /// Start of input.
+ StartText,
+ /// End of input.
+ EndText,
+ /// Word character on one side and non-word character on other.
+ WordBoundary,
+ /// Word character on both sides or non-word character on both sides.
+ NotWordBoundary,
+ /// ASCII word boundary.
+ WordBoundaryAscii,
+ /// Not ASCII word boundary.
+ NotWordBoundaryAscii,
+}
+
+/// Representation of the Char instruction.
+#[derive(Clone, Debug)]
+pub struct InstChar {
+ /// The next location to execute in the program if this instruction
+ /// succeeds.
+ pub goto: InstPtr,
+ /// The character to test.
+ pub c: char,
+}
+
+/// Representation of the Ranges instruction.
+#[derive(Clone, Debug)]
+pub struct InstRanges {
+ /// The next location to execute in the program if this instruction
+ /// succeeds.
+ pub goto: InstPtr,
+ /// The set of Unicode scalar value ranges to test.
+ pub ranges: Box<[(char, char)]>,
+}
+
+impl InstRanges {
+ /// Tests whether the given input character matches this instruction.
+ pub fn matches(&self, c: Char) -> bool {
+ // This speeds up the `match_class_unicode` benchmark by checking
+ // some common cases quickly without binary search. e.g., Matching
+ // a Unicode class on predominantly ASCII text.
+ for r in self.ranges.iter().take(4) {
+ if c < r.0 {
+ return false;
+ }
+ if c <= r.1 {
+ return true;
+ }
+ }
+ self.ranges
+ .binary_search_by(|r| {
+ if r.1 < c {
+ Ordering::Less
+ } else if r.0 > c {
+ Ordering::Greater
+ } else {
+ Ordering::Equal
+ }
+ })
+ .is_ok()
+ }
+
+ /// Return the number of distinct characters represented by all of the
+ /// ranges.
+ pub fn num_chars(&self) -> usize {
+ self.ranges
+ .iter()
+ .map(|&(s, e)| 1 + (e as u32) - (s as u32))
+ .sum::<u32>() as usize
+ }
+}
+
+/// Representation of the Bytes instruction.
+#[derive(Clone, Debug)]
+pub struct InstBytes {
+ /// The next location to execute in the program if this instruction
+ /// succeeds.
+ pub goto: InstPtr,
+ /// The start (inclusive) of this byte range.
+ pub start: u8,
+ /// The end (inclusive) of this byte range.
+ pub end: u8,
+}
+
+impl InstBytes {
+ /// Returns true if and only if the given byte is in this range.
+ pub fn matches(&self, byte: u8) -> bool {
+ self.start <= byte && byte <= self.end
+ }
+}
+
+#[cfg(test)]
+mod test {
+ #[test]
+ #[cfg(target_pointer_width = "64")]
+ fn test_size_of_inst() {
+ use std::mem::size_of;
+
+ use super::Inst;
+
+ assert_eq!(32, size_of::<Inst>());
+ }
+}