diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-06-19 09:25:56 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-06-19 09:25:56 +0000 |
commit | 018c4950b9406055dec02ef0fb52f132e2bb1e2c (patch) | |
tree | a835ebdf2088ef88fa681f8fad45f09922c1ae9a /vendor/regex/src/compile.rs | |
parent | Adding debian version 1.75.0+dfsg1-5. (diff) | |
download | rustc-018c4950b9406055dec02ef0fb52f132e2bb1e2c.tar.xz rustc-018c4950b9406055dec02ef0fb52f132e2bb1e2c.zip |
Merging upstream version 1.76.0+dfsg1.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'vendor/regex/src/compile.rs')
-rw-r--r-- | vendor/regex/src/compile.rs | 1333 |
1 files changed, 0 insertions, 1333 deletions
diff --git a/vendor/regex/src/compile.rs b/vendor/regex/src/compile.rs deleted file mode 100644 index 23e63ec89..000000000 --- a/vendor/regex/src/compile.rs +++ /dev/null @@ -1,1333 +0,0 @@ -use std::collections::HashMap; -use std::fmt; -use std::iter; -use std::result; -use std::sync::Arc; - -use regex_syntax::hir::{self, Hir, Look}; -use regex_syntax::is_word_byte; -use regex_syntax::utf8::{Utf8Range, Utf8Sequence, Utf8Sequences}; - -use crate::prog::{ - EmptyLook, Inst, InstBytes, InstChar, InstEmptyLook, InstPtr, InstRanges, - InstSave, InstSplit, Program, -}; - -use crate::Error; - -type Result = result::Result<Patch, Error>; -type ResultOrEmpty = result::Result<Option<Patch>, Error>; - -#[derive(Debug)] -struct Patch { - hole: Hole, - entry: InstPtr, -} - -/// A compiler translates a regular expression AST to a sequence of -/// instructions. The sequence of instructions represents an NFA. -// `Compiler` is only public via the `internal` module, so avoid deriving -// `Debug`. -#[allow(missing_debug_implementations)] -pub struct Compiler { - insts: Vec<MaybeInst>, - compiled: Program, - capture_name_idx: HashMap<String, usize>, - num_exprs: usize, - size_limit: usize, - suffix_cache: SuffixCache, - utf8_seqs: Option<Utf8Sequences>, - byte_classes: ByteClassSet, - // This keeps track of extra bytes allocated while compiling the regex - // program. Currently, this corresponds to two things. First is the heap - // memory allocated by Unicode character classes ('InstRanges'). Second is - // a "fake" amount of memory used by empty sub-expressions, so that enough - // empty sub-expressions will ultimately trigger the compiler to bail - // because of a size limit restriction. (That empty sub-expressions don't - // add to heap memory usage is more-or-less an implementation detail.) In - // the second case, if we don't bail, then an excessively large repetition - // on an empty sub-expression can result in the compiler using a very large - // amount of CPU time. - extra_inst_bytes: usize, -} - -impl Compiler { - /// Create a new regular expression compiler. - /// - /// Various options can be set before calling `compile` on an expression. - pub fn new() -> Self { - Compiler { - insts: vec![], - compiled: Program::new(), - capture_name_idx: HashMap::new(), - num_exprs: 0, - size_limit: 10 * (1 << 20), - suffix_cache: SuffixCache::new(1000), - utf8_seqs: Some(Utf8Sequences::new('\x00', '\x00')), - byte_classes: ByteClassSet::new(), - extra_inst_bytes: 0, - } - } - - /// The size of the resulting program is limited by size_limit. If - /// the program approximately exceeds the given size (in bytes), then - /// compilation will stop and return an error. - pub fn size_limit(mut self, size_limit: usize) -> Self { - self.size_limit = size_limit; - self - } - - /// If bytes is true, then the program is compiled as a byte based - /// automaton, which incorporates UTF-8 decoding into the machine. If it's - /// false, then the automaton is Unicode scalar value based, e.g., an - /// engine utilizing such an automaton is responsible for UTF-8 decoding. - /// - /// The specific invariant is that when returning a byte based machine, - /// the neither the `Char` nor `Ranges` instructions are produced. - /// Conversely, when producing a Unicode scalar value machine, the `Bytes` - /// instruction is never produced. - /// - /// Note that `dfa(true)` implies `bytes(true)`. - pub fn bytes(mut self, yes: bool) -> Self { - self.compiled.is_bytes = yes; - self - } - - /// When disabled, the program compiled may match arbitrary bytes. - /// - /// When enabled (the default), all compiled programs exclusively match - /// valid UTF-8 bytes. - pub fn only_utf8(mut self, yes: bool) -> Self { - self.compiled.only_utf8 = yes; - self - } - - /// When set, the machine returned is suitable for use in the DFA matching - /// engine. - /// - /// In particular, this ensures that if the regex is not anchored in the - /// beginning, then a preceding `.*?` is included in the program. (The NFA - /// based engines handle the preceding `.*?` explicitly, which is difficult - /// or impossible in the DFA engine.) - pub fn dfa(mut self, yes: bool) -> Self { - self.compiled.is_dfa = yes; - self - } - - /// When set, the machine returned is suitable for matching text in - /// reverse. In particular, all concatenations are flipped. - pub fn reverse(mut self, yes: bool) -> Self { - self.compiled.is_reverse = yes; - self - } - - /// Compile a regular expression given its AST. - /// - /// The compiler is guaranteed to succeed unless the program exceeds the - /// specified size limit. If the size limit is exceeded, then compilation - /// stops and returns an error. - pub fn compile(mut self, exprs: &[Hir]) -> result::Result<Program, Error> { - debug_assert!(!exprs.is_empty()); - self.num_exprs = exprs.len(); - if exprs.len() == 1 { - self.compile_one(&exprs[0]) - } else { - self.compile_many(exprs) - } - } - - fn compile_one(mut self, expr: &Hir) -> result::Result<Program, Error> { - if self.compiled.only_utf8 - && expr.properties().look_set().contains(Look::WordAsciiNegate) - { - return Err(Error::Syntax( - "ASCII-only \\B is not allowed in Unicode regexes \ - because it may result in invalid UTF-8 matches" - .to_string(), - )); - } - // If we're compiling a forward DFA and we aren't anchored, then - // add a `.*?` before the first capture group. - // Other matching engines handle this by baking the logic into the - // matching engine itself. - let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 }; - self.compiled.is_anchored_start = - expr.properties().look_set_prefix().contains(Look::Start); - self.compiled.is_anchored_end = - expr.properties().look_set_suffix().contains(Look::End); - if self.compiled.needs_dotstar() { - dotstar_patch = self.c_dotstar()?; - self.compiled.start = dotstar_patch.entry; - } - self.compiled.captures = vec![None]; - let patch = - self.c_capture(0, expr)?.unwrap_or_else(|| self.next_inst()); - if self.compiled.needs_dotstar() { - self.fill(dotstar_patch.hole, patch.entry); - } else { - self.compiled.start = patch.entry; - } - self.fill_to_next(patch.hole); - self.compiled.matches = vec![self.insts.len()]; - self.push_compiled(Inst::Match(0)); - self.compiled.static_captures_len = - expr.properties().static_explicit_captures_len(); - self.compile_finish() - } - - fn compile_many( - mut self, - exprs: &[Hir], - ) -> result::Result<Program, Error> { - debug_assert!(exprs.len() > 1); - - self.compiled.is_anchored_start = exprs - .iter() - .all(|e| e.properties().look_set_prefix().contains(Look::Start)); - self.compiled.is_anchored_end = exprs - .iter() - .all(|e| e.properties().look_set_suffix().contains(Look::End)); - let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 }; - if self.compiled.needs_dotstar() { - dotstar_patch = self.c_dotstar()?; - self.compiled.start = dotstar_patch.entry; - } else { - self.compiled.start = 0; // first instruction is always split - } - self.fill_to_next(dotstar_patch.hole); - - let mut prev_hole = Hole::None; - for (i, expr) in exprs[0..exprs.len() - 1].iter().enumerate() { - self.fill_to_next(prev_hole); - let split = self.push_split_hole(); - let Patch { hole, entry } = - self.c_capture(0, expr)?.unwrap_or_else(|| self.next_inst()); - self.fill_to_next(hole); - self.compiled.matches.push(self.insts.len()); - self.push_compiled(Inst::Match(i)); - prev_hole = self.fill_split(split, Some(entry), None); - } - let i = exprs.len() - 1; - let Patch { hole, entry } = - self.c_capture(0, &exprs[i])?.unwrap_or_else(|| self.next_inst()); - self.fill(prev_hole, entry); - self.fill_to_next(hole); - self.compiled.matches.push(self.insts.len()); - self.push_compiled(Inst::Match(i)); - self.compile_finish() - } - - fn compile_finish(mut self) -> result::Result<Program, Error> { - self.compiled.insts = - self.insts.into_iter().map(|inst| inst.unwrap()).collect(); - self.compiled.byte_classes = self.byte_classes.byte_classes(); - self.compiled.capture_name_idx = Arc::new(self.capture_name_idx); - Ok(self.compiled) - } - - /// Compile expr into self.insts, returning a patch on success, - /// or an error if we run out of memory. - /// - /// All of the c_* methods of the compiler share the contract outlined - /// here. - /// - /// The main thing that a c_* method does is mutate `self.insts` - /// to add a list of mostly compiled instructions required to execute - /// the given expression. `self.insts` contains MaybeInsts rather than - /// Insts because there is some backpatching required. - /// - /// The `Patch` value returned by each c_* method provides metadata - /// about the compiled instructions emitted to `self.insts`. The - /// `entry` member of the patch refers to the first instruction - /// (the entry point), while the `hole` member contains zero or - /// more offsets to partial instructions that need to be backpatched. - /// The c_* routine can't know where its list of instructions are going to - /// jump to after execution, so it is up to the caller to patch - /// these jumps to point to the right place. So compiling some - /// expression, e, we would end up with a situation that looked like: - /// - /// ```text - /// self.insts = [ ..., i1, i2, ..., iexit1, ..., iexitn, ...] - /// ^ ^ ^ - /// | \ / - /// entry \ / - /// hole - /// ``` - /// - /// To compile two expressions, e1 and e2, concatenated together we - /// would do: - /// - /// ```ignore - /// let patch1 = self.c(e1); - /// let patch2 = self.c(e2); - /// ``` - /// - /// while leaves us with a situation that looks like - /// - /// ```text - /// self.insts = [ ..., i1, ..., iexit1, ..., i2, ..., iexit2 ] - /// ^ ^ ^ ^ - /// | | | | - /// entry1 hole1 entry2 hole2 - /// ``` - /// - /// Then to merge the two patches together into one we would backpatch - /// hole1 with entry2 and return a new patch that enters at entry1 - /// and has hole2 for a hole. In fact, if you look at the c_concat - /// method you will see that it does exactly this, though it handles - /// a list of expressions rather than just the two that we use for - /// an example. - /// - /// Ok(None) is returned when an expression is compiled to no - /// instruction, and so no patch.entry value makes sense. - fn c(&mut self, expr: &Hir) -> ResultOrEmpty { - use crate::prog; - use regex_syntax::hir::HirKind::*; - - self.check_size()?; - match *expr.kind() { - Empty => self.c_empty(), - Literal(hir::Literal(ref bytes)) => { - if self.compiled.is_reverse { - let mut bytes = bytes.to_vec(); - bytes.reverse(); - self.c_literal(&bytes) - } else { - self.c_literal(bytes) - } - } - Class(hir::Class::Unicode(ref cls)) => self.c_class(cls.ranges()), - Class(hir::Class::Bytes(ref cls)) => { - if self.compiled.uses_bytes() { - self.c_class_bytes(cls.ranges()) - } else { - assert!(cls.is_ascii()); - let mut char_ranges = vec![]; - for r in cls.iter() { - let (s, e) = (r.start() as char, r.end() as char); - char_ranges.push(hir::ClassUnicodeRange::new(s, e)); - } - self.c_class(&char_ranges) - } - } - Look(ref look) => match *look { - hir::Look::Start if self.compiled.is_reverse => { - self.c_empty_look(prog::EmptyLook::EndText) - } - hir::Look::Start => { - self.c_empty_look(prog::EmptyLook::StartText) - } - hir::Look::End if self.compiled.is_reverse => { - self.c_empty_look(prog::EmptyLook::StartText) - } - hir::Look::End => self.c_empty_look(prog::EmptyLook::EndText), - hir::Look::StartLF if self.compiled.is_reverse => { - self.byte_classes.set_range(b'\n', b'\n'); - self.c_empty_look(prog::EmptyLook::EndLine) - } - hir::Look::StartLF => { - self.byte_classes.set_range(b'\n', b'\n'); - self.c_empty_look(prog::EmptyLook::StartLine) - } - hir::Look::EndLF if self.compiled.is_reverse => { - self.byte_classes.set_range(b'\n', b'\n'); - self.c_empty_look(prog::EmptyLook::StartLine) - } - hir::Look::EndLF => { - self.byte_classes.set_range(b'\n', b'\n'); - self.c_empty_look(prog::EmptyLook::EndLine) - } - hir::Look::StartCRLF | hir::Look::EndCRLF => { - return Err(Error::Syntax( - "CRLF-aware line anchors are not supported yet" - .to_string(), - )); - } - hir::Look::WordAscii => { - self.byte_classes.set_word_boundary(); - self.c_empty_look(prog::EmptyLook::WordBoundaryAscii) - } - hir::Look::WordAsciiNegate => { - self.byte_classes.set_word_boundary(); - self.c_empty_look(prog::EmptyLook::NotWordBoundaryAscii) - } - hir::Look::WordUnicode => { - if !cfg!(feature = "unicode-perl") { - return Err(Error::Syntax( - "Unicode word boundaries are unavailable when \ - the unicode-perl feature is disabled" - .to_string(), - )); - } - self.compiled.has_unicode_word_boundary = true; - self.byte_classes.set_word_boundary(); - // We also make sure that all ASCII bytes are in a different - // class from non-ASCII bytes. Otherwise, it's possible for - // ASCII bytes to get lumped into the same class as non-ASCII - // bytes. This in turn may cause the lazy DFA to falsely start - // when it sees an ASCII byte that maps to a byte class with - // non-ASCII bytes. This ensures that never happens. - self.byte_classes.set_range(0, 0x7F); - self.c_empty_look(prog::EmptyLook::WordBoundary) - } - hir::Look::WordUnicodeNegate => { - if !cfg!(feature = "unicode-perl") { - return Err(Error::Syntax( - "Unicode word boundaries are unavailable when \ - the unicode-perl feature is disabled" - .to_string(), - )); - } - self.compiled.has_unicode_word_boundary = true; - self.byte_classes.set_word_boundary(); - // See comments above for why we set the ASCII range here. - self.byte_classes.set_range(0, 0x7F); - self.c_empty_look(prog::EmptyLook::NotWordBoundary) - } - }, - Capture(hir::Capture { index, ref name, ref sub }) => { - if index as usize >= self.compiled.captures.len() { - let name = match *name { - None => None, - Some(ref boxed_str) => Some(boxed_str.to_string()), - }; - self.compiled.captures.push(name.clone()); - if let Some(name) = name { - self.capture_name_idx.insert(name, index as usize); - } - } - self.c_capture(2 * index as usize, sub) - } - Concat(ref es) => { - if self.compiled.is_reverse { - self.c_concat(es.iter().rev()) - } else { - self.c_concat(es) - } - } - Alternation(ref es) => self.c_alternate(&**es), - Repetition(ref rep) => self.c_repeat(rep), - } - } - - fn c_empty(&mut self) -> ResultOrEmpty { - // See: https://github.com/rust-lang/regex/security/advisories/GHSA-m5pq-gvj9-9vr8 - // See: CVE-2022-24713 - // - // Since 'empty' sub-expressions don't increase the size of - // the actual compiled object, we "fake" an increase in its - // size so that our 'check_size_limit' routine will eventually - // stop compilation if there are too many empty sub-expressions - // (e.g., via a large repetition). - self.extra_inst_bytes += std::mem::size_of::<Inst>(); - Ok(None) - } - - fn c_capture(&mut self, first_slot: usize, expr: &Hir) -> ResultOrEmpty { - if self.num_exprs > 1 || self.compiled.is_dfa { - // Don't ever compile Save instructions for regex sets because - // they are never used. They are also never used in DFA programs - // because DFAs can't handle captures. - self.c(expr) - } else { - let entry = self.insts.len(); - let hole = self.push_hole(InstHole::Save { slot: first_slot }); - let patch = self.c(expr)?.unwrap_or_else(|| self.next_inst()); - self.fill(hole, patch.entry); - self.fill_to_next(patch.hole); - let hole = self.push_hole(InstHole::Save { slot: first_slot + 1 }); - Ok(Some(Patch { hole, entry })) - } - } - - fn c_dotstar(&mut self) -> Result { - let hir = if self.compiled.only_utf8() { - Hir::dot(hir::Dot::AnyChar) - } else { - Hir::dot(hir::Dot::AnyByte) - }; - Ok(self - .c(&Hir::repetition(hir::Repetition { - min: 0, - max: None, - greedy: false, - sub: Box::new(hir), - }))? - .unwrap()) - } - - fn c_char(&mut self, c: char) -> ResultOrEmpty { - if self.compiled.uses_bytes() { - if c.is_ascii() { - let b = c as u8; - let hole = - self.push_hole(InstHole::Bytes { start: b, end: b }); - self.byte_classes.set_range(b, b); - Ok(Some(Patch { hole, entry: self.insts.len() - 1 })) - } else { - self.c_class(&[hir::ClassUnicodeRange::new(c, c)]) - } - } else { - let hole = self.push_hole(InstHole::Char { c }); - Ok(Some(Patch { hole, entry: self.insts.len() - 1 })) - } - } - - fn c_class(&mut self, ranges: &[hir::ClassUnicodeRange]) -> ResultOrEmpty { - use std::mem::size_of; - - if ranges.is_empty() { - return Err(Error::Syntax( - "empty character classes are not allowed".to_string(), - )); - } - if self.compiled.uses_bytes() { - Ok(Some(CompileClass { c: self, ranges }.compile()?)) - } else { - let ranges: Vec<(char, char)> = - ranges.iter().map(|r| (r.start(), r.end())).collect(); - let hole = if ranges.len() == 1 && ranges[0].0 == ranges[0].1 { - self.push_hole(InstHole::Char { c: ranges[0].0 }) - } else { - self.extra_inst_bytes += - ranges.len() * (size_of::<char>() * 2); - self.push_hole(InstHole::Ranges { ranges }) - }; - Ok(Some(Patch { hole, entry: self.insts.len() - 1 })) - } - } - - fn c_byte(&mut self, b: u8) -> ResultOrEmpty { - self.c_class_bytes(&[hir::ClassBytesRange::new(b, b)]) - } - - fn c_class_bytes( - &mut self, - ranges: &[hir::ClassBytesRange], - ) -> ResultOrEmpty { - if ranges.is_empty() { - return Err(Error::Syntax( - "empty character classes are not allowed".to_string(), - )); - } - - let first_split_entry = self.insts.len(); - let mut holes = vec![]; - let mut prev_hole = Hole::None; - for r in &ranges[0..ranges.len() - 1] { - self.fill_to_next(prev_hole); - let split = self.push_split_hole(); - let next = self.insts.len(); - self.byte_classes.set_range(r.start(), r.end()); - holes.push(self.push_hole(InstHole::Bytes { - start: r.start(), - end: r.end(), - })); - prev_hole = self.fill_split(split, Some(next), None); - } - let next = self.insts.len(); - let r = &ranges[ranges.len() - 1]; - self.byte_classes.set_range(r.start(), r.end()); - holes.push( - self.push_hole(InstHole::Bytes { start: r.start(), end: r.end() }), - ); - self.fill(prev_hole, next); - Ok(Some(Patch { hole: Hole::Many(holes), entry: first_split_entry })) - } - - fn c_empty_look(&mut self, look: EmptyLook) -> ResultOrEmpty { - let hole = self.push_hole(InstHole::EmptyLook { look }); - Ok(Some(Patch { hole, entry: self.insts.len() - 1 })) - } - - fn c_literal(&mut self, bytes: &[u8]) -> ResultOrEmpty { - match core::str::from_utf8(bytes) { - Ok(string) => { - let mut it = string.chars(); - let Patch { mut hole, entry } = loop { - match it.next() { - None => return self.c_empty(), - Some(ch) => { - if let Some(p) = self.c_char(ch)? { - break p; - } - } - } - }; - for ch in it { - if let Some(p) = self.c_char(ch)? { - self.fill(hole, p.entry); - hole = p.hole; - } - } - Ok(Some(Patch { hole, entry })) - } - Err(_) => { - assert!(self.compiled.uses_bytes()); - let mut it = bytes.iter().copied(); - let Patch { mut hole, entry } = loop { - match it.next() { - None => return self.c_empty(), - Some(byte) => { - if let Some(p) = self.c_byte(byte)? { - break p; - } - } - } - }; - for byte in it { - if let Some(p) = self.c_byte(byte)? { - self.fill(hole, p.entry); - hole = p.hole; - } - } - Ok(Some(Patch { hole, entry })) - } - } - } - - fn c_concat<'a, I>(&mut self, exprs: I) -> ResultOrEmpty - where - I: IntoIterator<Item = &'a Hir>, - { - let mut exprs = exprs.into_iter(); - let Patch { mut hole, entry } = loop { - match exprs.next() { - None => return self.c_empty(), - Some(e) => { - if let Some(p) = self.c(e)? { - break p; - } - } - } - }; - for e in exprs { - if let Some(p) = self.c(e)? { - self.fill(hole, p.entry); - hole = p.hole; - } - } - Ok(Some(Patch { hole, entry })) - } - - fn c_alternate(&mut self, exprs: &[Hir]) -> ResultOrEmpty { - debug_assert!( - exprs.len() >= 2, - "alternates must have at least 2 exprs" - ); - - // Initial entry point is always the first split. - let first_split_entry = self.insts.len(); - - // Save up all of the holes from each alternate. They will all get - // patched to point to the same location. - let mut holes = vec![]; - - // true indicates that the hole is a split where we want to fill - // the second branch. - let mut prev_hole = (Hole::None, false); - for e in &exprs[0..exprs.len() - 1] { - if prev_hole.1 { - let next = self.insts.len(); - self.fill_split(prev_hole.0, None, Some(next)); - } else { - self.fill_to_next(prev_hole.0); - } - let split = self.push_split_hole(); - if let Some(Patch { hole, entry }) = self.c(e)? { - holes.push(hole); - prev_hole = (self.fill_split(split, Some(entry), None), false); - } else { - let (split1, split2) = split.dup_one(); - holes.push(split1); - prev_hole = (split2, true); - } - } - if let Some(Patch { hole, entry }) = self.c(&exprs[exprs.len() - 1])? { - holes.push(hole); - if prev_hole.1 { - self.fill_split(prev_hole.0, None, Some(entry)); - } else { - self.fill(prev_hole.0, entry); - } - } else { - // We ignore prev_hole.1. When it's true, it means we have two - // empty branches both pushing prev_hole.0 into holes, so both - // branches will go to the same place anyway. - holes.push(prev_hole.0); - } - Ok(Some(Patch { hole: Hole::Many(holes), entry: first_split_entry })) - } - - fn c_repeat(&mut self, rep: &hir::Repetition) -> ResultOrEmpty { - match (rep.min, rep.max) { - (0, Some(1)) => self.c_repeat_zero_or_one(&rep.sub, rep.greedy), - (0, None) => self.c_repeat_zero_or_more(&rep.sub, rep.greedy), - (1, None) => self.c_repeat_one_or_more(&rep.sub, rep.greedy), - (min, None) => { - self.c_repeat_range_min_or_more(&rep.sub, rep.greedy, min) - } - (min, Some(max)) => { - self.c_repeat_range(&rep.sub, rep.greedy, min, max) - } - } - } - - fn c_repeat_zero_or_one( - &mut self, - expr: &Hir, - greedy: bool, - ) -> ResultOrEmpty { - let split_entry = self.insts.len(); - let split = self.push_split_hole(); - let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? { - Some(p) => p, - None => return self.pop_split_hole(), - }; - let split_hole = if greedy { - self.fill_split(split, Some(entry_rep), None) - } else { - self.fill_split(split, None, Some(entry_rep)) - }; - let holes = vec![hole_rep, split_hole]; - Ok(Some(Patch { hole: Hole::Many(holes), entry: split_entry })) - } - - fn c_repeat_zero_or_more( - &mut self, - expr: &Hir, - greedy: bool, - ) -> ResultOrEmpty { - let split_entry = self.insts.len(); - let split = self.push_split_hole(); - let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? { - Some(p) => p, - None => return self.pop_split_hole(), - }; - - self.fill(hole_rep, split_entry); - let split_hole = if greedy { - self.fill_split(split, Some(entry_rep), None) - } else { - self.fill_split(split, None, Some(entry_rep)) - }; - Ok(Some(Patch { hole: split_hole, entry: split_entry })) - } - - fn c_repeat_one_or_more( - &mut self, - expr: &Hir, - greedy: bool, - ) -> ResultOrEmpty { - let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? { - Some(p) => p, - None => return Ok(None), - }; - self.fill_to_next(hole_rep); - let split = self.push_split_hole(); - - let split_hole = if greedy { - self.fill_split(split, Some(entry_rep), None) - } else { - self.fill_split(split, None, Some(entry_rep)) - }; - Ok(Some(Patch { hole: split_hole, entry: entry_rep })) - } - - fn c_repeat_range_min_or_more( - &mut self, - expr: &Hir, - greedy: bool, - min: u32, - ) -> ResultOrEmpty { - let min = u32_to_usize(min); - // Using next_inst() is ok, because we can't return it (concat would - // have to return Some(_) while c_repeat_range_min_or_more returns - // None). - let patch_concat = self - .c_concat(iter::repeat(expr).take(min))? - .unwrap_or_else(|| self.next_inst()); - if let Some(patch_rep) = self.c_repeat_zero_or_more(expr, greedy)? { - self.fill(patch_concat.hole, patch_rep.entry); - Ok(Some(Patch { hole: patch_rep.hole, entry: patch_concat.entry })) - } else { - Ok(None) - } - } - - fn c_repeat_range( - &mut self, - expr: &Hir, - greedy: bool, - min: u32, - max: u32, - ) -> ResultOrEmpty { - let (min, max) = (u32_to_usize(min), u32_to_usize(max)); - debug_assert!(min <= max); - let patch_concat = self.c_concat(iter::repeat(expr).take(min))?; - if min == max { - return Ok(patch_concat); - } - // Same reasoning as in c_repeat_range_min_or_more (we know that min < - // max at this point). - let patch_concat = patch_concat.unwrap_or_else(|| self.next_inst()); - let initial_entry = patch_concat.entry; - // It is much simpler to compile, e.g., `a{2,5}` as: - // - // aaa?a?a? - // - // But you end up with a sequence of instructions like this: - // - // 0: 'a' - // 1: 'a', - // 2: split(3, 4) - // 3: 'a' - // 4: split(5, 6) - // 5: 'a' - // 6: split(7, 8) - // 7: 'a' - // 8: MATCH - // - // This is *incredibly* inefficient because the splits end - // up forming a chain, which has to be resolved everything a - // transition is followed. - let mut holes = vec![]; - let mut prev_hole = patch_concat.hole; - for _ in min..max { - self.fill_to_next(prev_hole); - let split = self.push_split_hole(); - let Patch { hole, entry } = match self.c(expr)? { - Some(p) => p, - None => return self.pop_split_hole(), - }; - prev_hole = hole; - if greedy { - holes.push(self.fill_split(split, Some(entry), None)); - } else { - holes.push(self.fill_split(split, None, Some(entry))); - } - } - holes.push(prev_hole); - Ok(Some(Patch { hole: Hole::Many(holes), entry: initial_entry })) - } - - /// Can be used as a default value for the c_* functions when the call to - /// c_function is followed by inserting at least one instruction that is - /// always executed after the ones written by the c* function. - fn next_inst(&self) -> Patch { - Patch { hole: Hole::None, entry: self.insts.len() } - } - - fn fill(&mut self, hole: Hole, goto: InstPtr) { - match hole { - Hole::None => {} - Hole::One(pc) => { - self.insts[pc].fill(goto); - } - Hole::Many(holes) => { - for hole in holes { - self.fill(hole, goto); - } - } - } - } - - fn fill_to_next(&mut self, hole: Hole) { - let next = self.insts.len(); - self.fill(hole, next); - } - - fn fill_split( - &mut self, - hole: Hole, - goto1: Option<InstPtr>, - goto2: Option<InstPtr>, - ) -> Hole { - match hole { - Hole::None => Hole::None, - Hole::One(pc) => match (goto1, goto2) { - (Some(goto1), Some(goto2)) => { - self.insts[pc].fill_split(goto1, goto2); - Hole::None - } - (Some(goto1), None) => { - self.insts[pc].half_fill_split_goto1(goto1); - Hole::One(pc) - } - (None, Some(goto2)) => { - self.insts[pc].half_fill_split_goto2(goto2); - Hole::One(pc) - } - (None, None) => unreachable!( - "at least one of the split \ - holes must be filled" - ), - }, - Hole::Many(holes) => { - let mut new_holes = vec![]; - for hole in holes { - new_holes.push(self.fill_split(hole, goto1, goto2)); - } - if new_holes.is_empty() { - Hole::None - } else if new_holes.len() == 1 { - new_holes.pop().unwrap() - } else { - Hole::Many(new_holes) - } - } - } - } - - fn push_compiled(&mut self, inst: Inst) { - self.insts.push(MaybeInst::Compiled(inst)); - } - - fn push_hole(&mut self, inst: InstHole) -> Hole { - let hole = self.insts.len(); - self.insts.push(MaybeInst::Uncompiled(inst)); - Hole::One(hole) - } - - fn push_split_hole(&mut self) -> Hole { - let hole = self.insts.len(); - self.insts.push(MaybeInst::Split); - Hole::One(hole) - } - - fn pop_split_hole(&mut self) -> ResultOrEmpty { - self.insts.pop(); - Ok(None) - } - - fn check_size(&self) -> result::Result<(), Error> { - use std::mem::size_of; - - let size = - self.extra_inst_bytes + (self.insts.len() * size_of::<Inst>()); - if size > self.size_limit { - Err(Error::CompiledTooBig(self.size_limit)) - } else { - Ok(()) - } - } -} - -#[derive(Debug)] -enum Hole { - None, - One(InstPtr), - Many(Vec<Hole>), -} - -impl Hole { - fn dup_one(self) -> (Self, Self) { - match self { - Hole::One(pc) => (Hole::One(pc), Hole::One(pc)), - Hole::None | Hole::Many(_) => { - unreachable!("must be called on single hole") - } - } - } -} - -#[derive(Clone, Debug)] -enum MaybeInst { - Compiled(Inst), - Uncompiled(InstHole), - Split, - Split1(InstPtr), - Split2(InstPtr), -} - -impl MaybeInst { - fn fill(&mut self, goto: InstPtr) { - let maybeinst = match *self { - MaybeInst::Split => MaybeInst::Split1(goto), - MaybeInst::Uncompiled(ref inst) => { - MaybeInst::Compiled(inst.fill(goto)) - } - MaybeInst::Split1(goto1) => { - MaybeInst::Compiled(Inst::Split(InstSplit { - goto1, - goto2: goto, - })) - } - MaybeInst::Split2(goto2) => { - MaybeInst::Compiled(Inst::Split(InstSplit { - goto1: goto, - goto2, - })) - } - _ => unreachable!( - "not all instructions were compiled! \ - found uncompiled instruction: {:?}", - self - ), - }; - *self = maybeinst; - } - - fn fill_split(&mut self, goto1: InstPtr, goto2: InstPtr) { - let filled = match *self { - MaybeInst::Split => Inst::Split(InstSplit { goto1, goto2 }), - _ => unreachable!( - "must be called on Split instruction, \ - instead it was called on: {:?}", - self - ), - }; - *self = MaybeInst::Compiled(filled); - } - - fn half_fill_split_goto1(&mut self, goto1: InstPtr) { - let half_filled = match *self { - MaybeInst::Split => goto1, - _ => unreachable!( - "must be called on Split instruction, \ - instead it was called on: {:?}", - self - ), - }; - *self = MaybeInst::Split1(half_filled); - } - - fn half_fill_split_goto2(&mut self, goto2: InstPtr) { - let half_filled = match *self { - MaybeInst::Split => goto2, - _ => unreachable!( - "must be called on Split instruction, \ - instead it was called on: {:?}", - self - ), - }; - *self = MaybeInst::Split2(half_filled); - } - - fn unwrap(self) -> Inst { - match self { - MaybeInst::Compiled(inst) => inst, - _ => unreachable!( - "must be called on a compiled instruction, \ - instead it was called on: {:?}", - self - ), - } - } -} - -#[derive(Clone, Debug)] -enum InstHole { - Save { slot: usize }, - EmptyLook { look: EmptyLook }, - Char { c: char }, - Ranges { ranges: Vec<(char, char)> }, - Bytes { start: u8, end: u8 }, -} - -impl InstHole { - fn fill(&self, goto: InstPtr) -> Inst { - match *self { - InstHole::Save { slot } => Inst::Save(InstSave { goto, slot }), - InstHole::EmptyLook { look } => { - Inst::EmptyLook(InstEmptyLook { goto, look }) - } - InstHole::Char { c } => Inst::Char(InstChar { goto, c }), - InstHole::Ranges { ref ranges } => Inst::Ranges(InstRanges { - goto, - ranges: ranges.clone().into_boxed_slice(), - }), - InstHole::Bytes { start, end } => { - Inst::Bytes(InstBytes { goto, start, end }) - } - } - } -} - -struct CompileClass<'a, 'b> { - c: &'a mut Compiler, - ranges: &'b [hir::ClassUnicodeRange], -} - -impl<'a, 'b> CompileClass<'a, 'b> { - fn compile(mut self) -> Result { - let mut holes = vec![]; - let mut initial_entry = None; - let mut last_split = Hole::None; - let mut utf8_seqs = self.c.utf8_seqs.take().unwrap(); - self.c.suffix_cache.clear(); - - for (i, range) in self.ranges.iter().enumerate() { - let is_last_range = i + 1 == self.ranges.len(); - utf8_seqs.reset(range.start(), range.end()); - let mut it = (&mut utf8_seqs).peekable(); - loop { - let utf8_seq = match it.next() { - None => break, - Some(utf8_seq) => utf8_seq, - }; - if is_last_range && it.peek().is_none() { - let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?; - holes.push(hole); - self.c.fill(last_split, entry); - last_split = Hole::None; - if initial_entry.is_none() { - initial_entry = Some(entry); - } - } else { - if initial_entry.is_none() { - initial_entry = Some(self.c.insts.len()); - } - self.c.fill_to_next(last_split); - last_split = self.c.push_split_hole(); - let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?; - holes.push(hole); - last_split = - self.c.fill_split(last_split, Some(entry), None); - } - } - } - self.c.utf8_seqs = Some(utf8_seqs); - Ok(Patch { hole: Hole::Many(holes), entry: initial_entry.unwrap() }) - } - - fn c_utf8_seq(&mut self, seq: &Utf8Sequence) -> Result { - if self.c.compiled.is_reverse { - self.c_utf8_seq_(seq) - } else { - self.c_utf8_seq_(seq.into_iter().rev()) - } - } - - fn c_utf8_seq_<'r, I>(&mut self, seq: I) -> Result - where - I: IntoIterator<Item = &'r Utf8Range>, - { - // The initial instruction for each UTF-8 sequence should be the same. - let mut from_inst = ::std::usize::MAX; - let mut last_hole = Hole::None; - for byte_range in seq { - let key = SuffixCacheKey { - from_inst, - start: byte_range.start, - end: byte_range.end, - }; - { - let pc = self.c.insts.len(); - if let Some(cached_pc) = self.c.suffix_cache.get(key, pc) { - from_inst = cached_pc; - continue; - } - } - self.c.byte_classes.set_range(byte_range.start, byte_range.end); - if from_inst == ::std::usize::MAX { - last_hole = self.c.push_hole(InstHole::Bytes { - start: byte_range.start, - end: byte_range.end, - }); - } else { - self.c.push_compiled(Inst::Bytes(InstBytes { - goto: from_inst, - start: byte_range.start, - end: byte_range.end, - })); - } - from_inst = self.c.insts.len().checked_sub(1).unwrap(); - debug_assert!(from_inst < ::std::usize::MAX); - } - debug_assert!(from_inst < ::std::usize::MAX); - Ok(Patch { hole: last_hole, entry: from_inst }) - } -} - -/// `SuffixCache` is a simple bounded hash map for caching suffix entries in -/// UTF-8 automata. For example, consider the Unicode range \u{0}-\u{FFFF}. -/// The set of byte ranges looks like this: -/// -/// [0-7F] -/// [C2-DF][80-BF] -/// [E0][A0-BF][80-BF] -/// [E1-EC][80-BF][80-BF] -/// [ED][80-9F][80-BF] -/// [EE-EF][80-BF][80-BF] -/// -/// Each line above translates to one alternate in the compiled regex program. -/// However, all but one of the alternates end in the same suffix, which is -/// a waste of an instruction. The suffix cache facilitates reusing them across -/// alternates. -/// -/// Note that a HashMap could be trivially used for this, but we don't need its -/// overhead. Some small bounded space (LRU style) is more than enough. -/// -/// This uses similar idea to [`SparseSet`](../sparse/struct.SparseSet.html), -/// except it uses hashes as original indices and then compares full keys for -/// validation against `dense` array. -#[derive(Debug)] -struct SuffixCache { - sparse: Box<[usize]>, - dense: Vec<SuffixCacheEntry>, -} - -#[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)] -struct SuffixCacheEntry { - key: SuffixCacheKey, - pc: InstPtr, -} - -#[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)] -struct SuffixCacheKey { - from_inst: InstPtr, - start: u8, - end: u8, -} - -impl SuffixCache { - fn new(size: usize) -> Self { - SuffixCache { - sparse: vec![0usize; size].into(), - dense: Vec::with_capacity(size), - } - } - - fn get(&mut self, key: SuffixCacheKey, pc: InstPtr) -> Option<InstPtr> { - let hash = self.hash(&key); - let pos = &mut self.sparse[hash]; - if let Some(entry) = self.dense.get(*pos) { - if entry.key == key { - return Some(entry.pc); - } - } - *pos = self.dense.len(); - self.dense.push(SuffixCacheEntry { key, pc }); - None - } - - fn clear(&mut self) { - self.dense.clear(); - } - - fn hash(&self, suffix: &SuffixCacheKey) -> usize { - // Basic FNV-1a hash as described: - // https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function - const FNV_PRIME: u64 = 1_099_511_628_211; - let mut h = 14_695_981_039_346_656_037; - h = (h ^ (suffix.from_inst as u64)).wrapping_mul(FNV_PRIME); - h = (h ^ (suffix.start as u64)).wrapping_mul(FNV_PRIME); - h = (h ^ (suffix.end as u64)).wrapping_mul(FNV_PRIME); - (h as usize) % self.sparse.len() - } -} - -struct ByteClassSet([bool; 256]); - -impl ByteClassSet { - fn new() -> Self { - ByteClassSet([false; 256]) - } - - fn set_range(&mut self, start: u8, end: u8) { - debug_assert!(start <= end); - if start > 0 { - self.0[start as usize - 1] = true; - } - self.0[end as usize] = true; - } - - fn set_word_boundary(&mut self) { - // We need to mark all ranges of bytes whose pairs result in - // evaluating \b differently. - let iswb = is_word_byte; - let mut b1: u16 = 0; - let mut b2: u16; - while b1 <= 255 { - b2 = b1 + 1; - while b2 <= 255 && iswb(b1 as u8) == iswb(b2 as u8) { - b2 += 1; - } - self.set_range(b1 as u8, (b2 - 1) as u8); - b1 = b2; - } - } - - fn byte_classes(&self) -> Vec<u8> { - // N.B. If you're debugging the DFA, it's useful to simply return - // `(0..256).collect()`, which effectively removes the byte classes - // and makes the transitions easier to read. - // (0usize..256).map(|x| x as u8).collect() - let mut byte_classes = vec![0; 256]; - let mut class = 0u8; - let mut i = 0; - loop { - byte_classes[i] = class as u8; - if i >= 255 { - break; - } - if self.0[i] { - class = class.checked_add(1).unwrap(); - } - i += 1; - } - byte_classes - } -} - -impl fmt::Debug for ByteClassSet { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_tuple("ByteClassSet").field(&&self.0[..]).finish() - } -} - -fn u32_to_usize(n: u32) -> usize { - // In case usize is less than 32 bits, we need to guard against overflow. - // On most platforms this compiles to nothing. - // TODO Use `std::convert::TryFrom` once it's stable. - if (n as u64) > (::std::usize::MAX as u64) { - panic!("BUG: {} is too big to be pointer sized", n) - } - n as usize -} - -#[cfg(test)] -mod tests { - use super::ByteClassSet; - - #[test] - fn byte_classes() { - let mut set = ByteClassSet::new(); - set.set_range(b'a', b'z'); - let classes = set.byte_classes(); - assert_eq!(classes[0], 0); - assert_eq!(classes[1], 0); - assert_eq!(classes[2], 0); - assert_eq!(classes[b'a' as usize - 1], 0); - assert_eq!(classes[b'a' as usize], 1); - assert_eq!(classes[b'm' as usize], 1); - assert_eq!(classes[b'z' as usize], 1); - assert_eq!(classes[b'z' as usize + 1], 2); - assert_eq!(classes[254], 2); - assert_eq!(classes[255], 2); - - let mut set = ByteClassSet::new(); - set.set_range(0, 2); - set.set_range(4, 6); - let classes = set.byte_classes(); - assert_eq!(classes[0], 0); - assert_eq!(classes[1], 0); - assert_eq!(classes[2], 0); - assert_eq!(classes[3], 1); - assert_eq!(classes[4], 2); - assert_eq!(classes[5], 2); - assert_eq!(classes[6], 2); - assert_eq!(classes[7], 3); - assert_eq!(classes[255], 3); - } - - #[test] - fn full_byte_classes() { - let mut set = ByteClassSet::new(); - for i in 0..256u16 { - set.set_range(i as u8, i as u8); - } - assert_eq!(set.byte_classes().len(), 256); - } -} |