use std::collections::HashMap;
use std::fmt;
use std::iter;
use std::result;
use std::sync::Arc;

use regex_syntax::hir::{self, Hir};
use regex_syntax::is_word_byte;
use regex_syntax::utf8::{Utf8Range, Utf8Sequence, Utf8Sequences};

use crate::prog::{
    EmptyLook, Inst, InstBytes, InstChar, InstEmptyLook, InstPtr, InstRanges,
    InstSave, InstSplit, Program,
};

use crate::Error;

type Result = result::Result<Patch, Error>;
type ResultOrEmpty = result::Result<Option<Patch>, Error>;

#[derive(Debug)]
struct Patch {
    hole: Hole,
    entry: InstPtr,
}

/// A compiler translates a regular expression AST to a sequence of
/// instructions. The sequence of instructions represents an NFA.
// `Compiler` is only public via the `internal` module, so avoid deriving
// `Debug`.
#[allow(missing_debug_implementations)]
pub struct Compiler {
    insts: Vec<MaybeInst>,
    compiled: Program,
    capture_name_idx: HashMap<String, usize>,
    num_exprs: usize,
    size_limit: usize,
    suffix_cache: SuffixCache,
    utf8_seqs: Option<Utf8Sequences>,
    byte_classes: ByteClassSet,
    // This keeps track of extra bytes allocated while compiling the regex
    // program. Currently, this corresponds to two things. First is the heap
    // memory allocated by Unicode character classes ('InstRanges'). Second is
    // a "fake" amount of memory used by empty sub-expressions, so that enough
    // empty sub-expressions will ultimately trigger the compiler to bail
    // because of a size limit restriction. (That empty sub-expressions don't
    // add to heap memory usage is more-or-less an implementation detail.) In
    // the second case, if we don't bail, then an excessively large repetition
    // on an empty sub-expression can result in the compiler using a very
    // large amount of CPU time.
    extra_inst_bytes: usize,
}

impl Compiler {
    /// Create a new regular expression compiler.
    ///
    /// Various options can be set before calling `compile` on an expression.
    pub fn new() -> Self {
        Compiler {
            insts: vec![],
            compiled: Program::new(),
            capture_name_idx: HashMap::new(),
            num_exprs: 0,
            size_limit: 10 * (1 << 20),
            suffix_cache: SuffixCache::new(1000),
            utf8_seqs: Some(Utf8Sequences::new('\x00', '\x00')),
            byte_classes: ByteClassSet::new(),
            extra_inst_bytes: 0,
        }
    }

    /// The size of the resulting program is limited by size_limit. If
    /// the program approximately exceeds the given size (in bytes), then
    /// compilation will stop and return an error.
    pub fn size_limit(mut self, size_limit: usize) -> Self {
        self.size_limit = size_limit;
        self
    }

    /// If bytes is true, then the program is compiled as a byte based
    /// automaton, which incorporates UTF-8 decoding into the machine. If it's
    /// false, then the automaton is Unicode scalar value based, i.e., an
    /// engine utilizing such an automaton is responsible for UTF-8 decoding.
    ///
    /// The specific invariant is that when returning a byte based machine,
    /// neither the `Char` nor `Ranges` instructions are produced.
    /// Conversely, when producing a Unicode scalar value machine, the `Bytes`
    /// instruction is never produced.
    ///
    /// Note that `dfa(true)` implies `bytes(true)`.
    pub fn bytes(mut self, yes: bool) -> Self {
        self.compiled.is_bytes = yes;
        self
    }

    /// When disabled, the program compiled may match arbitrary bytes.
    ///
    /// When enabled (the default), all compiled programs exclusively match
    /// valid UTF-8 bytes.
    pub fn only_utf8(mut self, yes: bool) -> Self {
        self.compiled.only_utf8 = yes;
        self
    }

    /// When set, the machine returned is suitable for use in the DFA matching
    /// engine.
    ///
    /// In particular, this ensures that if the regex is not anchored in the
    /// beginning, then a preceding `.*?` is included in the program.
    /// (The NFA based engines handle the preceding `.*?` explicitly, which
    /// is difficult or impossible in the DFA engine.)
    pub fn dfa(mut self, yes: bool) -> Self {
        self.compiled.is_dfa = yes;
        self
    }

    /// When set, the machine returned is suitable for matching text in
    /// reverse. In particular, all concatenations are flipped.
    pub fn reverse(mut self, yes: bool) -> Self {
        self.compiled.is_reverse = yes;
        self
    }

    /// Compile a regular expression given its AST.
    ///
    /// The compiler is guaranteed to succeed unless the program exceeds the
    /// specified size limit. If the size limit is exceeded, then compilation
    /// stops and returns an error.
    pub fn compile(mut self, exprs: &[Hir]) -> result::Result<Program, Error> {
        debug_assert!(!exprs.is_empty());
        self.num_exprs = exprs.len();
        if exprs.len() == 1 {
            self.compile_one(&exprs[0])
        } else {
            self.compile_many(exprs)
        }
    }

    fn compile_one(mut self, expr: &Hir) -> result::Result<Program, Error> {
        // If we're compiling a forward DFA and we aren't anchored, then
        // add a `.*?` before the first capture group.
        // Other matching engines handle this by baking the logic into the
        // matching engine itself.
        let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 };
        self.compiled.is_anchored_start = expr.is_anchored_start();
        self.compiled.is_anchored_end = expr.is_anchored_end();
        if self.compiled.needs_dotstar() {
            dotstar_patch = self.c_dotstar()?;
            self.compiled.start = dotstar_patch.entry;
        }
        self.compiled.captures = vec![None];
        let patch =
            self.c_capture(0, expr)?.unwrap_or_else(|| self.next_inst());
        if self.compiled.needs_dotstar() {
            self.fill(dotstar_patch.hole, patch.entry);
        } else {
            self.compiled.start = patch.entry;
        }
        self.fill_to_next(patch.hole);
        self.compiled.matches = vec![self.insts.len()];
        self.push_compiled(Inst::Match(0));
        self.compile_finish()
    }

    fn compile_many(
        mut self,
        exprs: &[Hir],
    ) -> result::Result<Program, Error> {
        debug_assert!(exprs.len() > 1);

        self.compiled.is_anchored_start =
            exprs.iter().all(|e| e.is_anchored_start());
        self.compiled.is_anchored_end =
            exprs.iter().all(|e| e.is_anchored_end());
        let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 };
        if self.compiled.needs_dotstar() {
            dotstar_patch = self.c_dotstar()?;
            self.compiled.start = dotstar_patch.entry;
        } else {
            self.compiled.start = 0; // first instruction is always split
        }
        self.fill_to_next(dotstar_patch.hole);

        let mut prev_hole = Hole::None;
        for (i, expr) in exprs[0..exprs.len() - 1].iter().enumerate() {
            self.fill_to_next(prev_hole);
            let split = self.push_split_hole();
            let Patch { hole, entry } =
                self.c_capture(0, expr)?.unwrap_or_else(|| self.next_inst());
            self.fill_to_next(hole);
            self.compiled.matches.push(self.insts.len());
            self.push_compiled(Inst::Match(i));
            prev_hole = self.fill_split(split, Some(entry), None);
        }
        let i = exprs.len() - 1;
        let Patch { hole, entry } =
            self.c_capture(0, &exprs[i])?.unwrap_or_else(|| self.next_inst());
        self.fill(prev_hole, entry);
        self.fill_to_next(hole);
        self.compiled.matches.push(self.insts.len());
        self.push_compiled(Inst::Match(i));
        self.compile_finish()
    }

    fn compile_finish(mut self) -> result::Result<Program, Error> {
        self.compiled.insts =
            self.insts.into_iter().map(|inst| inst.unwrap()).collect();
        self.compiled.byte_classes = self.byte_classes.byte_classes();
        self.compiled.capture_name_idx = Arc::new(self.capture_name_idx);
        Ok(self.compiled)
    }

    /// Compile expr into self.insts, returning a patch on success,
    /// or an error if we run out of memory.
    ///
    /// All of the c_* methods of the compiler share the contract outlined
    /// here.
    ///
    /// The main thing that a c_* method does is mutate `self.insts`
    /// to add a list of mostly compiled instructions required to execute
    /// the given expression. `self.insts` contains MaybeInsts rather than
    /// Insts because there is some backpatching required.
    ///
    /// The `Patch` value returned by each c_* method provides metadata
    /// about the compiled instructions emitted to `self.insts`. The
    /// `entry` member of the patch refers to the first instruction
    /// (the entry point), while the `hole` member contains zero or
    /// more offsets to partial instructions that need to be backpatched.
    /// The c_* routine can't know where its list of instructions is going to
    /// jump to after execution, so it is up to the caller to patch
    /// these jumps to point to the right place. So compiling some
    /// expression, e, we would end up with a situation that looked like:
    ///
    /// ```text
    /// self.insts = [ ..., i1, i2, ..., iexit1, ..., iexitn, ...]
    ///                     ^              ^             ^
    ///                     |              \             /
    ///                   entry             \           /
    ///                                      hole
    /// ```
    ///
    /// To compile two expressions, e1 and e2, concatenated together we
    /// would do:
    ///
    /// ```ignore
    /// let patch1 = self.c(e1);
    /// let patch2 = self.c(e2);
    /// ```
    ///
    /// which leaves us with a situation that looks like
    ///
    /// ```text
    /// self.insts = [ ..., i1, ..., iexit1, ..., i2, ..., iexit2 ]
    ///                     ^        ^            ^        ^
    ///                     |        |            |        |
    ///                entry1        hole1   entry2        hole2
    /// ```
    ///
    /// Then to merge the two patches together into one we would backpatch
    /// hole1 with entry2 and return a new patch that enters at entry1
    /// and has hole2 for a hole. In fact, if you look at the c_concat
    /// method you will see that it does exactly this, though it handles
    /// a list of expressions rather than just the two that we use for
    /// an example.
    ///
    /// Ok(None) is returned when an expression is compiled to no
    /// instruction, and so no patch.entry value makes sense.
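    ///
    /// As a concrete (hypothetical) illustration of the same contract,
    /// compiling `ab` in a Unicode program emits two `Char` instructions.
    /// The first one's `goto` is backpatched to point at the second, while
    /// the second one's `goto` is the hole handed back to the caller:
    ///
    /// ```text
    /// 0: Char('a', goto: 1)   <- entry
    /// 1: Char('b', goto: ???) <- hole
    /// ```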
    fn c(&mut self, expr: &Hir) -> ResultOrEmpty {
        use crate::prog;
        use regex_syntax::hir::HirKind::*;

        self.check_size()?;
        match *expr.kind() {
            Empty => self.c_empty(),
            Literal(hir::Literal::Unicode(c)) => self.c_char(c),
            Literal(hir::Literal::Byte(b)) => {
                assert!(self.compiled.uses_bytes());
                self.c_byte(b)
            }
            Class(hir::Class::Unicode(ref cls)) => self.c_class(cls.ranges()),
            Class(hir::Class::Bytes(ref cls)) => {
                if self.compiled.uses_bytes() {
                    self.c_class_bytes(cls.ranges())
                } else {
                    assert!(cls.is_all_ascii());
                    let mut char_ranges = vec![];
                    for r in cls.iter() {
                        let (s, e) = (r.start() as char, r.end() as char);
                        char_ranges.push(hir::ClassUnicodeRange::new(s, e));
                    }
                    self.c_class(&char_ranges)
                }
            }
            Anchor(hir::Anchor::StartLine) if self.compiled.is_reverse => {
                self.byte_classes.set_range(b'\n', b'\n');
                self.c_empty_look(prog::EmptyLook::EndLine)
            }
            Anchor(hir::Anchor::StartLine) => {
                self.byte_classes.set_range(b'\n', b'\n');
                self.c_empty_look(prog::EmptyLook::StartLine)
            }
            Anchor(hir::Anchor::EndLine) if self.compiled.is_reverse => {
                self.byte_classes.set_range(b'\n', b'\n');
                self.c_empty_look(prog::EmptyLook::StartLine)
            }
            Anchor(hir::Anchor::EndLine) => {
                self.byte_classes.set_range(b'\n', b'\n');
                self.c_empty_look(prog::EmptyLook::EndLine)
            }
            Anchor(hir::Anchor::StartText) if self.compiled.is_reverse => {
                self.c_empty_look(prog::EmptyLook::EndText)
            }
            Anchor(hir::Anchor::StartText) => {
                self.c_empty_look(prog::EmptyLook::StartText)
            }
            Anchor(hir::Anchor::EndText) if self.compiled.is_reverse => {
                self.c_empty_look(prog::EmptyLook::StartText)
            }
            Anchor(hir::Anchor::EndText) => {
                self.c_empty_look(prog::EmptyLook::EndText)
            }
            WordBoundary(hir::WordBoundary::Unicode) => {
                if !cfg!(feature = "unicode-perl") {
                    return Err(Error::Syntax(
                        "Unicode word boundaries are unavailable when \
                         the unicode-perl feature is disabled"
                            .to_string(),
                    ));
                }
                self.compiled.has_unicode_word_boundary = true;
                self.byte_classes.set_word_boundary();
                // We also make sure that all ASCII bytes are in a different
                // class from non-ASCII bytes. Otherwise, it's possible for
                // ASCII bytes to get lumped into the same class as non-ASCII
                // bytes. This in turn may cause the lazy DFA to falsely start
                // when it sees an ASCII byte that maps to a byte class with
                // non-ASCII bytes. This ensures that never happens.
                self.byte_classes.set_range(0, 0x7F);
                self.c_empty_look(prog::EmptyLook::WordBoundary)
            }
            WordBoundary(hir::WordBoundary::UnicodeNegate) => {
                if !cfg!(feature = "unicode-perl") {
                    return Err(Error::Syntax(
                        "Unicode word boundaries are unavailable when \
                         the unicode-perl feature is disabled"
                            .to_string(),
                    ));
                }
                self.compiled.has_unicode_word_boundary = true;
                self.byte_classes.set_word_boundary();
                // See comments above for why we set the ASCII range here.
                self.byte_classes.set_range(0, 0x7F);
                self.c_empty_look(prog::EmptyLook::NotWordBoundary)
            }
            WordBoundary(hir::WordBoundary::Ascii) => {
                self.byte_classes.set_word_boundary();
                self.c_empty_look(prog::EmptyLook::WordBoundaryAscii)
            }
            WordBoundary(hir::WordBoundary::AsciiNegate) => {
                self.byte_classes.set_word_boundary();
                self.c_empty_look(prog::EmptyLook::NotWordBoundaryAscii)
            }
            Group(ref g) => match g.kind {
                hir::GroupKind::NonCapturing => self.c(&g.hir),
                hir::GroupKind::CaptureIndex(index) => {
                    if index as usize >= self.compiled.captures.len() {
                        self.compiled.captures.push(None);
                    }
                    self.c_capture(2 * index as usize, &g.hir)
                }
                hir::GroupKind::CaptureName { index, ref name } => {
                    if index as usize >= self.compiled.captures.len() {
                        let n = name.to_string();
                        self.compiled.captures.push(Some(n.clone()));
                        self.capture_name_idx.insert(n, index as usize);
                    }
                    self.c_capture(2 * index as usize, &g.hir)
                }
            },
            Concat(ref es) => {
                if self.compiled.is_reverse {
                    self.c_concat(es.iter().rev())
                } else {
                    self.c_concat(es)
                }
            }
            Alternation(ref es) => self.c_alternate(&**es),
            Repetition(ref rep) => self.c_repeat(rep),
        }
    }

    fn c_empty(&mut self) -> ResultOrEmpty {
        // See: https://github.com/rust-lang/regex/security/advisories/GHSA-m5pq-gvj9-9vr8
        // See: CVE-2022-24713
        //
        // Since 'empty' sub-expressions don't increase the size of
        // the actual compiled object, we "fake" an increase in its
        // size so that our 'check_size_limit' routine will eventually
        // stop compilation if there are too many empty sub-expressions
        // (e.g., via a large repetition).
        self.extra_inst_bytes += std::mem::size_of::<Inst>();
        Ok(None)
    }

    fn c_capture(&mut self, first_slot: usize, expr: &Hir) -> ResultOrEmpty {
        if self.num_exprs > 1 || self.compiled.is_dfa {
            // Don't ever compile Save instructions for regex sets because
            // they are never used. They are also never used in DFA programs
            // because DFAs can't handle captures.
            self.c(expr)
        } else {
            let entry = self.insts.len();
            let hole = self.push_hole(InstHole::Save { slot: first_slot });
            let patch = self.c(expr)?.unwrap_or_else(|| self.next_inst());
            self.fill(hole, patch.entry);
            self.fill_to_next(patch.hole);
            let hole =
                self.push_hole(InstHole::Save { slot: first_slot + 1 });
            Ok(Some(Patch { hole, entry }))
        }
    }

    fn c_dotstar(&mut self) -> Result {
        Ok(if !self.compiled.only_utf8() {
            self.c(&Hir::repetition(hir::Repetition {
                kind: hir::RepetitionKind::ZeroOrMore,
                greedy: false,
                hir: Box::new(Hir::any(true)),
            }))?
            .unwrap()
        } else {
            self.c(&Hir::repetition(hir::Repetition {
                kind: hir::RepetitionKind::ZeroOrMore,
                greedy: false,
                hir: Box::new(Hir::any(false)),
            }))?
            .unwrap()
        })
    }

    fn c_char(&mut self, c: char) -> ResultOrEmpty {
        if self.compiled.uses_bytes() {
            if c.is_ascii() {
                let b = c as u8;
                let hole =
                    self.push_hole(InstHole::Bytes { start: b, end: b });
                self.byte_classes.set_range(b, b);
                Ok(Some(Patch { hole, entry: self.insts.len() - 1 }))
            } else {
                self.c_class(&[hir::ClassUnicodeRange::new(c, c)])
            }
        } else {
            let hole = self.push_hole(InstHole::Char { c });
            Ok(Some(Patch { hole, entry: self.insts.len() - 1 }))
        }
    }

    fn c_class(&mut self, ranges: &[hir::ClassUnicodeRange]) -> ResultOrEmpty {
        use std::mem::size_of;

        assert!(!ranges.is_empty());
        if self.compiled.uses_bytes() {
            Ok(Some(CompileClass { c: self, ranges }.compile()?))
        } else {
            let ranges: Vec<(char, char)> =
                ranges.iter().map(|r| (r.start(), r.end())).collect();
            let hole = if ranges.len() == 1 && ranges[0].0 == ranges[0].1 {
                self.push_hole(InstHole::Char { c: ranges[0].0 })
            } else {
                self.extra_inst_bytes +=
                    ranges.len() * (size_of::<char>() * 2);
                self.push_hole(InstHole::Ranges { ranges })
            };
            Ok(Some(Patch { hole, entry: self.insts.len() - 1 }))
        }
    }

    fn c_byte(&mut self, b: u8) -> ResultOrEmpty {
        self.c_class_bytes(&[hir::ClassBytesRange::new(b, b)])
    }

    fn c_class_bytes(
        &mut self,
        ranges: &[hir::ClassBytesRange],
    ) -> ResultOrEmpty {
        debug_assert!(!ranges.is_empty());

        let first_split_entry = self.insts.len();
        let mut holes = vec![];
        let mut prev_hole = Hole::None;
        for r in &ranges[0..ranges.len() - 1] {
            self.fill_to_next(prev_hole);
            let split = self.push_split_hole();
            let next = self.insts.len();
            self.byte_classes.set_range(r.start(), r.end());
            holes.push(self.push_hole(InstHole::Bytes {
                start: r.start(),
                end: r.end(),
            }));
            prev_hole = self.fill_split(split, Some(next), None);
        }
        let next = self.insts.len();
        let r = &ranges[ranges.len() - 1];
        self.byte_classes.set_range(r.start(), r.end());
        holes.push(
            self.push_hole(InstHole::Bytes { start: r.start(), end: r.end() }),
        );
        self.fill(prev_hole, next);
        Ok(Some(Patch { hole: Hole::Many(holes), entry: first_split_entry }))
    }

    fn c_empty_look(&mut self, look: EmptyLook) -> ResultOrEmpty {
        let hole = self.push_hole(InstHole::EmptyLook { look });
        Ok(Some(Patch { hole, entry: self.insts.len() - 1 }))
    }

    fn c_concat<'a, I>(&mut self, exprs: I) -> ResultOrEmpty
    where
        I: IntoIterator<Item = &'a Hir>,
    {
        let mut exprs = exprs.into_iter();
        let Patch { mut hole, entry } = loop {
            match exprs.next() {
                None => return self.c_empty(),
                Some(e) => {
                    if let Some(p) = self.c(e)? {
                        break p;
                    }
                }
            }
        };
        for e in exprs {
            if let Some(p) = self.c(e)? {
                self.fill(hole, p.entry);
                hole = p.hole;
            }
        }
        Ok(Some(Patch { hole, entry }))
    }

    fn c_alternate(&mut self, exprs: &[Hir]) -> ResultOrEmpty {
        debug_assert!(
            exprs.len() >= 2,
            "alternates must have at least 2 exprs"
        );

        // Initial entry point is always the first split.
        let first_split_entry = self.insts.len();

        // Save up all of the holes from each alternate. They will all get
        // patched to point to the same location.
        let mut holes = vec![];

        // true indicates that the hole is a split where we want to fill
        // the second branch.
        let mut prev_hole = (Hole::None, false);
        for e in &exprs[0..exprs.len() - 1] {
            if prev_hole.1 {
                let next = self.insts.len();
                self.fill_split(prev_hole.0, None, Some(next));
            } else {
                self.fill_to_next(prev_hole.0);
            }
            let split = self.push_split_hole();
            if let Some(Patch { hole, entry }) = self.c(e)? {
                holes.push(hole);
                prev_hole = (self.fill_split(split, Some(entry), None), false);
            } else {
                let (split1, split2) = split.dup_one();
                holes.push(split1);
                prev_hole = (split2, true);
            }
        }
        if let Some(Patch { hole, entry }) = self.c(&exprs[exprs.len() - 1])? {
            holes.push(hole);
            if prev_hole.1 {
                self.fill_split(prev_hole.0, None, Some(entry));
            } else {
                self.fill(prev_hole.0, entry);
            }
        } else {
            // We ignore prev_hole.1. When it's true, it means we have two
            // empty branches both pushing prev_hole.0 into holes, so both
            // branches will go to the same place anyway.
            holes.push(prev_hole.0);
        }
        Ok(Some(Patch { hole: Hole::Many(holes), entry: first_split_entry }))
    }

    fn c_repeat(&mut self, rep: &hir::Repetition) -> ResultOrEmpty {
        use regex_syntax::hir::RepetitionKind::*;
        match rep.kind {
            ZeroOrOne => self.c_repeat_zero_or_one(&rep.hir, rep.greedy),
            ZeroOrMore => self.c_repeat_zero_or_more(&rep.hir, rep.greedy),
            OneOrMore => self.c_repeat_one_or_more(&rep.hir, rep.greedy),
            Range(hir::RepetitionRange::Exactly(min_max)) => {
                self.c_repeat_range(&rep.hir, rep.greedy, min_max, min_max)
            }
            Range(hir::RepetitionRange::AtLeast(min)) => {
                self.c_repeat_range_min_or_more(&rep.hir, rep.greedy, min)
            }
            Range(hir::RepetitionRange::Bounded(min, max)) => {
                self.c_repeat_range(&rep.hir, rep.greedy, min, max)
            }
        }
    }

    fn c_repeat_zero_or_one(
        &mut self,
        expr: &Hir,
        greedy: bool,
    ) -> ResultOrEmpty {
        let split_entry = self.insts.len();
        let split = self.push_split_hole();
        let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? {
            Some(p) => p,
            None => return self.pop_split_hole(),
        };
        let split_hole = if greedy {
            self.fill_split(split, Some(entry_rep), None)
        } else {
            self.fill_split(split, None, Some(entry_rep))
        };
        let holes = vec![hole_rep, split_hole];
        Ok(Some(Patch { hole: Hole::Many(holes), entry: split_entry }))
    }

    fn c_repeat_zero_or_more(
        &mut self,
        expr: &Hir,
        greedy: bool,
    ) -> ResultOrEmpty {
        let split_entry = self.insts.len();
        let split = self.push_split_hole();
        let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? {
            Some(p) => p,
            None => return self.pop_split_hole(),
        };

        self.fill(hole_rep, split_entry);
        let split_hole = if greedy {
            self.fill_split(split, Some(entry_rep), None)
        } else {
            self.fill_split(split, None, Some(entry_rep))
        };
        Ok(Some(Patch { hole: split_hole, entry: split_entry }))
    }

    fn c_repeat_one_or_more(
        &mut self,
        expr: &Hir,
        greedy: bool,
    ) -> ResultOrEmpty {
        let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? {
            Some(p) => p,
            None => return Ok(None),
        };
        self.fill_to_next(hole_rep);
        let split = self.push_split_hole();

        let split_hole = if greedy {
            self.fill_split(split, Some(entry_rep), None)
        } else {
            self.fill_split(split, None, Some(entry_rep))
        };
        Ok(Some(Patch { hole: split_hole, entry: entry_rep }))
    }

    fn c_repeat_range_min_or_more(
        &mut self,
        expr: &Hir,
        greedy: bool,
        min: u32,
    ) -> ResultOrEmpty {
        let min = u32_to_usize(min);
        // Using next_inst() is ok, because we can't return it (concat would
        // have to return Some(_) while c_repeat_range_min_or_more returns
        // None).
        let patch_concat = self
            .c_concat(iter::repeat(expr).take(min))?
            .unwrap_or_else(|| self.next_inst());
        if let Some(patch_rep) = self.c_repeat_zero_or_more(expr, greedy)? {
            self.fill(patch_concat.hole, patch_rep.entry);
            Ok(Some(Patch { hole: patch_rep.hole, entry: patch_concat.entry }))
        } else {
            Ok(None)
        }
    }

    fn c_repeat_range(
        &mut self,
        expr: &Hir,
        greedy: bool,
        min: u32,
        max: u32,
    ) -> ResultOrEmpty {
        let (min, max) = (u32_to_usize(min), u32_to_usize(max));
        debug_assert!(min <= max);
        let patch_concat = self.c_concat(iter::repeat(expr).take(min))?;
        if min == max {
            return Ok(patch_concat);
        }
        // Same reasoning as in c_repeat_range_min_or_more (we know that min <
        // max at this point).
        let patch_concat = patch_concat.unwrap_or_else(|| self.next_inst());
        let initial_entry = patch_concat.entry;
        // It is much simpler to compile, e.g., `a{2,5}` as:
        //
        //     aaa?a?a?
        //
        // But you end up with a sequence of instructions like this:
        //
        //     0: 'a'
        //     1: 'a'
        //     2: split(3, 4)
        //     3: 'a'
        //     4: split(5, 6)
        //     5: 'a'
        //     6: split(7, 8)
        //     7: 'a'
        //     8: MATCH
        //
        // This is *incredibly* inefficient because the splits end
        // up forming a chain, which has to be resolved every time a
        // transition is followed.
        let mut holes = vec![];
        let mut prev_hole = patch_concat.hole;
        for _ in min..max {
            self.fill_to_next(prev_hole);
            let split = self.push_split_hole();
            let Patch { hole, entry } = match self.c(expr)? {
                Some(p) => p,
                None => return self.pop_split_hole(),
            };
            prev_hole = hole;
            if greedy {
                holes.push(self.fill_split(split, Some(entry), None));
            } else {
                holes.push(self.fill_split(split, None, Some(entry)));
            }
        }
        holes.push(prev_hole);
        Ok(Some(Patch { hole: Hole::Many(holes), entry: initial_entry }))
    }

    /// Can be used as a default value for the c_* functions when the call to
    /// a c_* function is followed by inserting at least one instruction that
    /// is always executed after the ones written by that c_* function.
    fn next_inst(&self) -> Patch {
        Patch { hole: Hole::None, entry: self.insts.len() }
    }

    fn fill(&mut self, hole: Hole, goto: InstPtr) {
        match hole {
            Hole::None => {}
            Hole::One(pc) => {
                self.insts[pc].fill(goto);
            }
            Hole::Many(holes) => {
                for hole in holes {
                    self.fill(hole, goto);
                }
            }
        }
    }

    fn fill_to_next(&mut self, hole: Hole) {
        let next = self.insts.len();
        self.fill(hole, next);
    }

    fn fill_split(
        &mut self,
        hole: Hole,
        goto1: Option<InstPtr>,
        goto2: Option<InstPtr>,
    ) -> Hole {
        match hole {
            Hole::None => Hole::None,
            Hole::One(pc) => match (goto1, goto2) {
                (Some(goto1), Some(goto2)) => {
                    self.insts[pc].fill_split(goto1, goto2);
                    Hole::None
                }
                (Some(goto1), None) => {
                    self.insts[pc].half_fill_split_goto1(goto1);
                    Hole::One(pc)
                }
                (None, Some(goto2)) => {
                    self.insts[pc].half_fill_split_goto2(goto2);
                    Hole::One(pc)
                }
                (None, None) => unreachable!(
                    "at least one of the split \
                     holes must be filled"
                ),
            },
            Hole::Many(holes) => {
                let mut new_holes = vec![];
                for hole in holes {
                    new_holes.push(self.fill_split(hole, goto1, goto2));
                }
                if new_holes.is_empty() {
                    Hole::None
                } else if new_holes.len() == 1 {
                    new_holes.pop().unwrap()
                } else {
                    Hole::Many(new_holes)
                }
            }
        }
    }

    fn push_compiled(&mut self, inst: Inst) {
        self.insts.push(MaybeInst::Compiled(inst));
    }

    fn push_hole(&mut self, inst: InstHole) -> Hole {
        let hole = self.insts.len();
        self.insts.push(MaybeInst::Uncompiled(inst));
        Hole::One(hole)
    }

    fn push_split_hole(&mut self) -> Hole {
        let hole = self.insts.len();
        self.insts.push(MaybeInst::Split);
        Hole::One(hole)
    }

    fn pop_split_hole(&mut self) -> ResultOrEmpty {
        self.insts.pop();
        Ok(None)
    }

    fn check_size(&self) -> result::Result<(), Error> {
        use std::mem::size_of;

        let size = self.extra_inst_bytes
            + (self.insts.len() * size_of::<Inst>());
        if size > self.size_limit {
            Err(Error::CompiledTooBig(self.size_limit))
        } else {
            Ok(())
        }
    }
}
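// A minimal sketch (not part of the original source) of how this compiler is
// driven from elsewhere in the crate. It assumes `regex_syntax::Parser`,
// which this crate already depends on. Note that `compile` consumes the
// builder, so options such as `bytes(..)` or `reverse(..)` must be set
// beforehand:
//
//     let hir = regex_syntax::Parser::new().parse("a{2,5}")?;
//     let prog = Compiler::new().size_limit(1 << 20).compile(&[hir])?;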
#[derive(Debug)]
enum Hole {
    None,
    One(InstPtr),
    Many(Vec<Hole>),
}

impl Hole {
    fn dup_one(self) -> (Self, Self) {
        match self {
            Hole::One(pc) => (Hole::One(pc), Hole::One(pc)),
            Hole::None | Hole::Many(_) => {
                unreachable!("must be called on single hole")
            }
        }
    }
}

#[derive(Clone, Debug)]
enum MaybeInst {
    Compiled(Inst),
    Uncompiled(InstHole),
    Split,
    Split1(InstPtr),
    Split2(InstPtr),
}

impl MaybeInst {
    fn fill(&mut self, goto: InstPtr) {
        let maybeinst = match *self {
            MaybeInst::Split => MaybeInst::Split1(goto),
            MaybeInst::Uncompiled(ref inst) => {
                MaybeInst::Compiled(inst.fill(goto))
            }
            MaybeInst::Split1(goto1) => {
                MaybeInst::Compiled(Inst::Split(InstSplit {
                    goto1,
                    goto2: goto,
                }))
            }
            MaybeInst::Split2(goto2) => {
                MaybeInst::Compiled(Inst::Split(InstSplit {
                    goto1: goto,
                    goto2,
                }))
            }
            _ => unreachable!(
                "not all instructions were compiled! \
                 found uncompiled instruction: {:?}",
                self
            ),
        };
        *self = maybeinst;
    }

    fn fill_split(&mut self, goto1: InstPtr, goto2: InstPtr) {
        let filled = match *self {
            MaybeInst::Split => Inst::Split(InstSplit { goto1, goto2 }),
            _ => unreachable!(
                "must be called on Split instruction, \
                 instead it was called on: {:?}",
                self
            ),
        };
        *self = MaybeInst::Compiled(filled);
    }

    fn half_fill_split_goto1(&mut self, goto1: InstPtr) {
        let half_filled = match *self {
            MaybeInst::Split => goto1,
            _ => unreachable!(
                "must be called on Split instruction, \
                 instead it was called on: {:?}",
                self
            ),
        };
        *self = MaybeInst::Split1(half_filled);
    }

    fn half_fill_split_goto2(&mut self, goto2: InstPtr) {
        let half_filled = match *self {
            MaybeInst::Split => goto2,
            _ => unreachable!(
                "must be called on Split instruction, \
                 instead it was called on: {:?}",
                self
            ),
        };
        *self = MaybeInst::Split2(half_filled);
    }

    fn unwrap(self) -> Inst {
        match self {
            MaybeInst::Compiled(inst) => inst,
            _ => unreachable!(
                "must be called on a compiled instruction, \
                 instead it was called on: {:?}",
                self
            ),
        }
    }
}

#[derive(Clone, Debug)]
enum InstHole {
    Save { slot: usize },
    EmptyLook { look: EmptyLook },
    Char { c: char },
    Ranges { ranges: Vec<(char, char)> },
    Bytes { start: u8, end: u8 },
}

impl InstHole {
    fn fill(&self, goto: InstPtr) -> Inst {
        match *self {
            InstHole::Save { slot } => Inst::Save(InstSave { goto, slot }),
            InstHole::EmptyLook { look } => {
                Inst::EmptyLook(InstEmptyLook { goto, look })
            }
            InstHole::Char { c } => Inst::Char(InstChar { goto, c }),
            InstHole::Ranges { ref ranges } => Inst::Ranges(InstRanges {
                goto,
                ranges: ranges.clone().into_boxed_slice(),
            }),
            InstHole::Bytes { start, end } => {
                Inst::Bytes(InstBytes { goto, start, end })
            }
        }
    }
}

struct CompileClass<'a, 'b> {
    c: &'a mut Compiler,
    ranges: &'b [hir::ClassUnicodeRange],
}

impl<'a, 'b> CompileClass<'a, 'b> {
    fn compile(mut self) -> Result {
        let mut holes = vec![];
        let mut initial_entry = None;
        let mut last_split = Hole::None;
        let mut utf8_seqs = self.c.utf8_seqs.take().unwrap();
        self.c.suffix_cache.clear();

        for (i, range) in self.ranges.iter().enumerate() {
            let is_last_range = i + 1 == self.ranges.len();
            utf8_seqs.reset(range.start(), range.end());
            let mut it = (&mut utf8_seqs).peekable();
            loop {
                let utf8_seq = match it.next() {
                    None => break,
                    Some(utf8_seq) => utf8_seq,
                };
                if is_last_range && it.peek().is_none() {
                    let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?;
                    holes.push(hole);
                    self.c.fill(last_split, entry);
                    last_split = Hole::None;
                    if initial_entry.is_none() {
                        initial_entry = Some(entry);
                    }
                } else {
                    if initial_entry.is_none() {
                        initial_entry = Some(self.c.insts.len());
                    }
                    self.c.fill_to_next(last_split);
                    last_split = self.c.push_split_hole();
                    let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?;
                    holes.push(hole);
                    last_split =
                        self.c.fill_split(last_split, Some(entry), None);
                }
            }
        }
        self.c.utf8_seqs = Some(utf8_seqs);
        Ok(Patch { hole: Hole::Many(holes), entry: initial_entry.unwrap() })
    }

    fn c_utf8_seq(&mut self, seq: &Utf8Sequence) -> Result {
        if self.c.compiled.is_reverse {
            self.c_utf8_seq_(seq)
        } else {
            self.c_utf8_seq_(seq.into_iter().rev())
        }
    }

    fn c_utf8_seq_<'r, I>(&mut self, seq: I) -> Result
    where
        I: IntoIterator<Item = &'r Utf8Range>,
    {
        // The initial instruction for each UTF-8 sequence should be the same.
        let mut from_inst = ::std::usize::MAX;
        let mut last_hole = Hole::None;
        for byte_range in seq {
            let key = SuffixCacheKey {
                from_inst,
                start: byte_range.start,
                end: byte_range.end,
            };
            {
                let pc = self.c.insts.len();
                if let Some(cached_pc) = self.c.suffix_cache.get(key, pc) {
                    from_inst = cached_pc;
                    continue;
                }
            }
            self.c.byte_classes.set_range(byte_range.start, byte_range.end);
            if from_inst == ::std::usize::MAX {
                last_hole = self.c.push_hole(InstHole::Bytes {
                    start: byte_range.start,
                    end: byte_range.end,
                });
            } else {
                self.c.push_compiled(Inst::Bytes(InstBytes {
                    goto: from_inst,
                    start: byte_range.start,
                    end: byte_range.end,
                }));
            }
            from_inst = self.c.insts.len().checked_sub(1).unwrap();
            debug_assert!(from_inst < ::std::usize::MAX);
        }
        debug_assert!(from_inst < ::std::usize::MAX);
        Ok(Patch { hole: last_hole, entry: from_inst })
    }
}

/// `SuffixCache` is a simple bounded hash map for caching suffix entries in
/// UTF-8 automata. For example, consider the Unicode range \u{0}-\u{FFFF}.
/// The set of byte ranges looks like this:
///
/// [0-7F]
/// [C2-DF][80-BF]
/// [E0][A0-BF][80-BF]
/// [E1-EC][80-BF][80-BF]
/// [ED][80-9F][80-BF]
/// [EE-EF][80-BF][80-BF]
///
/// Each line above translates to one alternate in the compiled regex program.
/// However, all but one of the alternates end in the same suffix, which is
/// a waste of an instruction. The suffix cache facilitates reusing them
/// across alternates.
///
/// Note that a HashMap could be trivially used for this, but we don't need
/// its overhead. Some small bounded space (LRU style) is more than enough.
///
/// This uses a similar idea to [`SparseSet`](../sparse/struct.SparseSet.html),
/// except it uses hashes as original indices and then compares full keys for
/// validation against the `dense` array.
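///
/// As a brief, hypothetical sketch of the intended use (mirroring
/// `c_utf8_seq_` above; the values are made up for illustration):
///
/// ```ignore
/// let mut cache = SuffixCache::new(1000);
/// let key = SuffixCacheKey { from_inst: 3, start: 0x80, end: 0xBF };
/// assert_eq!(cache.get(key, 7), None);    // miss: records pc 7 for key
/// assert_eq!(cache.get(key, 9), Some(7)); // hit: reuse instruction 7
/// ```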
#[derive(Debug)]
struct SuffixCache {
    sparse: Box<[usize]>,
    dense: Vec<SuffixCacheEntry>,
}

#[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
struct SuffixCacheEntry {
    key: SuffixCacheKey,
    pc: InstPtr,
}

#[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
struct SuffixCacheKey {
    from_inst: InstPtr,
    start: u8,
    end: u8,
}

impl SuffixCache {
    fn new(size: usize) -> Self {
        SuffixCache {
            sparse: vec![0usize; size].into(),
            dense: Vec::with_capacity(size),
        }
    }

    fn get(&mut self, key: SuffixCacheKey, pc: InstPtr) -> Option<InstPtr> {
        let hash = self.hash(&key);
        let pos = &mut self.sparse[hash];
        if let Some(entry) = self.dense.get(*pos) {
            if entry.key == key {
                return Some(entry.pc);
            }
        }
        *pos = self.dense.len();
        self.dense.push(SuffixCacheEntry { key, pc });
        None
    }

    fn clear(&mut self) {
        self.dense.clear();
    }

    fn hash(&self, suffix: &SuffixCacheKey) -> usize {
        // Basic FNV-1a hash as described:
        // https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
        const FNV_PRIME: u64 = 1_099_511_628_211;
        let mut h = 14_695_981_039_346_656_037;
        h = (h ^ (suffix.from_inst as u64)).wrapping_mul(FNV_PRIME);
        h = (h ^ (suffix.start as u64)).wrapping_mul(FNV_PRIME);
        h = (h ^ (suffix.end as u64)).wrapping_mul(FNV_PRIME);
        (h as usize) % self.sparse.len()
    }
}

struct ByteClassSet([bool; 256]);

impl ByteClassSet {
    fn new() -> Self {
        ByteClassSet([false; 256])
    }

    fn set_range(&mut self, start: u8, end: u8) {
        debug_assert!(start <= end);
        if start > 0 {
            self.0[start as usize - 1] = true;
        }
        self.0[end as usize] = true;
    }

    fn set_word_boundary(&mut self) {
        // We need to mark all ranges of bytes whose pairs result in
        // evaluating \b differently.
        let iswb = is_word_byte;
        let mut b1: u16 = 0;
        let mut b2: u16;
        while b1 <= 255 {
            b2 = b1 + 1;
            while b2 <= 255 && iswb(b1 as u8) == iswb(b2 as u8) {
                b2 += 1;
            }
            self.set_range(b1 as u8, (b2 - 1) as u8);
            b1 = b2;
        }
    }

    fn byte_classes(&self) -> Vec<u8> {
        // N.B. If you're debugging the DFA, it's useful to simply return
        // `(0..256).collect()`, which effectively removes the byte classes
        // and makes the transitions easier to read.
        // (0usize..256).map(|x| x as u8).collect()
        let mut byte_classes = vec![0; 256];
        let mut class = 0u8;
        let mut i = 0;
        loop {
            byte_classes[i] = class as u8;
            if i >= 255 {
                break;
            }
            if self.0[i] {
                class = class.checked_add(1).unwrap();
            }
            i += 1;
        }
        byte_classes
    }
}

impl fmt::Debug for ByteClassSet {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.debug_tuple("ByteClassSet").field(&&self.0[..]).finish()
    }
}

fn u32_to_usize(n: u32) -> usize {
    // In case usize is less than 32 bits, we need to guard against overflow.
    // On most platforms this compiles to nothing.
    // TODO Use `std::convert::TryFrom` once it's stable.
    if (n as u64) > (::std::usize::MAX as u64) {
        panic!("BUG: {} is too big to be pointer sized", n)
    }
    n as usize
}

#[cfg(test)]
mod tests {
    use super::ByteClassSet;

    #[test]
    fn byte_classes() {
        let mut set = ByteClassSet::new();
        set.set_range(b'a', b'z');
        let classes = set.byte_classes();
        assert_eq!(classes[0], 0);
        assert_eq!(classes[1], 0);
        assert_eq!(classes[2], 0);
        assert_eq!(classes[b'a' as usize - 1], 0);
        assert_eq!(classes[b'a' as usize], 1);
        assert_eq!(classes[b'm' as usize], 1);
        assert_eq!(classes[b'z' as usize], 1);
        assert_eq!(classes[b'z' as usize + 1], 2);
        assert_eq!(classes[254], 2);
        assert_eq!(classes[255], 2);

        let mut set = ByteClassSet::new();
        set.set_range(0, 2);
        set.set_range(4, 6);
        let classes = set.byte_classes();
        assert_eq!(classes[0], 0);
        assert_eq!(classes[1], 0);
        assert_eq!(classes[2], 0);
        assert_eq!(classes[3], 1);
        assert_eq!(classes[4], 2);
        assert_eq!(classes[5], 2);
        assert_eq!(classes[6], 2);
        assert_eq!(classes[7], 3);
        assert_eq!(classes[255], 3);
    }

    #[test]
    fn full_byte_classes() {
        let mut set = ByteClassSet::new();
        for i in 0..256u16 {
            set.set_range(i as u8, i as u8);
        }
        assert_eq!(set.byte_classes().len(), 256);
    }
}
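// The tests below are an added, hedged illustration of the private helpers
// above; they are not part of the original test suite.
#[cfg(test)]
mod added_examples {
    use super::{ByteClassSet, SuffixCache, SuffixCacheKey};

    // `SuffixCache` reuse semantics, as described in its doc comment: a miss
    // records the given pc, and a subsequent lookup with the same key
    // returns the cached pc.
    #[test]
    fn suffix_cache_reuse() {
        let mut cache = SuffixCache::new(10);
        let key = SuffixCacheKey { from_inst: 5, start: 0x80, end: 0xBF };
        assert_eq!(cache.get(key, 42), None);
        assert_eq!(cache.get(key, 99), Some(42));
        cache.clear();
        assert_eq!(cache.get(key, 7), None);
    }

    // `set_word_boundary` must put word bytes and adjacent non-word bytes
    // into distinct classes so that the DFA can evaluate \b transitions.
    #[test]
    fn word_boundary_byte_classes() {
        let mut set = ByteClassSet::new();
        set.set_word_boundary();
        let classes = set.byte_classes();
        // All of [a-z] is one contiguous run of word bytes: one class.
        assert_eq!(classes[b'a' as usize], classes[b'z' as usize]);
        // The next byte, '{', is not a word byte: a different class.
        assert_ne!(classes[b'z' as usize], classes[b'{' as usize]);
    }
}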