diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 09:22:09 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 09:22:09 +0000 |
commit | 43a97878ce14b72f0981164f87f2e35e14151312 (patch) | |
tree | 620249daf56c0258faa40cbdcf9cfba06de2a846 /third_party/rust/regex/src | |
parent | Initial commit. (diff) | |
download | firefox-upstream.tar.xz firefox-upstream.zip |
Adding upstream version 110.0.1.upstream/110.0.1upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/rust/regex/src')
28 files changed, 13236 insertions, 0 deletions
diff --git a/third_party/rust/regex/src/backtrack.rs b/third_party/rust/regex/src/backtrack.rs new file mode 100644 index 0000000000..4d83856ca0 --- /dev/null +++ b/third_party/rust/regex/src/backtrack.rs @@ -0,0 +1,282 @@ +// This is the backtracking matching engine. It has the same exact capability +// as the full NFA simulation, except it is artificially restricted to small +// regexes on small inputs because of its memory requirements. +// +// In particular, this is a *bounded* backtracking engine. It retains worst +// case linear time by keeping track of the states that it has visited (using a +// bitmap). Namely, once a state is visited, it is never visited again. Since a +// state is keyed by `(instruction index, input index)`, we have that its time +// complexity is `O(mn)` (i.e., linear in the size of the search text). +// +// The backtracking engine can beat out the NFA simulation on small +// regexes/inputs because it doesn't have to keep track of multiple copies of +// the capture groups. In benchmarks, the backtracking engine is roughly twice +// as fast as the full NFA simulation. Note though that its performance doesn't +// scale, even if you're willing to live with the memory requirements. Namely, +// the bitset has to be zeroed on each execution, which becomes quite expensive +// on large bitsets. + +use crate::exec::ProgramCache; +use crate::input::{Input, InputAt}; +use crate::prog::{InstPtr, Program}; +use crate::re_trait::Slot; + +type Bits = u32; + +const BIT_SIZE: usize = 32; +const MAX_SIZE_BYTES: usize = 256 * (1 << 10); // 256 KB + +/// Returns true iff the given regex and input should be executed by this +/// engine with reasonable memory usage. +pub fn should_exec(num_insts: usize, text_len: usize) -> bool { + // Total memory usage in bytes is determined by: + // + // ((len(insts) * (len(input) + 1) + bits - 1) / bits) * (size_of(u32)) + // + // The actual limit picked is pretty much a heuristic. + // See: https://github.com/rust-lang/regex/issues/215 + let size = ((num_insts * (text_len + 1) + BIT_SIZE - 1) / BIT_SIZE) * 4; + size <= MAX_SIZE_BYTES +} + +/// A backtracking matching engine. +#[derive(Debug)] +pub struct Bounded<'a, 'm, 'r, 's, I> { + prog: &'r Program, + input: I, + matches: &'m mut [bool], + slots: &'s mut [Slot], + m: &'a mut Cache, +} + +/// Shared cached state between multiple invocations of a backtracking engine +/// in the same thread. +#[derive(Clone, Debug)] +pub struct Cache { + jobs: Vec<Job>, + visited: Vec<Bits>, +} + +impl Cache { + /// Create new empty cache for the backtracking engine. + pub fn new(_prog: &Program) -> Self { + Cache { jobs: vec![], visited: vec![] } + } +} + +/// A job is an explicit unit of stack space in the backtracking engine. +/// +/// The "normal" representation is a single state transition, which corresponds +/// to an NFA state and a character in the input. However, the backtracking +/// engine must keep track of old capture group values. We use the explicit +/// stack to do it. +#[derive(Clone, Copy, Debug)] +enum Job { + Inst { ip: InstPtr, at: InputAt }, + SaveRestore { slot: usize, old_pos: Option<usize> }, +} + +impl<'a, 'm, 'r, 's, I: Input> Bounded<'a, 'm, 'r, 's, I> { + /// Execute the backtracking matching engine. + /// + /// If there's a match, `exec` returns `true` and populates the given + /// captures accordingly. + pub fn exec( + prog: &'r Program, + cache: &ProgramCache, + matches: &'m mut [bool], + slots: &'s mut [Slot], + input: I, + start: usize, + end: usize, + ) -> bool { + let mut cache = cache.borrow_mut(); + let cache = &mut cache.backtrack; + let start = input.at(start); + let mut b = Bounded { prog, input, matches, slots, m: cache }; + b.exec_(start, end) + } + + /// Clears the cache such that the backtracking engine can be executed + /// on some input of fixed length. + fn clear(&mut self) { + // Reset the job memory so that we start fresh. + self.m.jobs.clear(); + + // Now we need to clear the bit state set. + // We do this by figuring out how much space we need to keep track + // of the states we've visited. + // Then we reset all existing allocated space to 0. + // Finally, we request more space if we need it. + // + // This is all a little circuitous, but doing this using unchecked + // operations doesn't seem to have a measurable impact on performance. + // (Probably because backtracking is limited to such small + // inputs/regexes in the first place.) + let visited_len = + (self.prog.len() * (self.input.len() + 1) + BIT_SIZE - 1) + / BIT_SIZE; + self.m.visited.truncate(visited_len); + for v in &mut self.m.visited { + *v = 0; + } + if visited_len > self.m.visited.len() { + let len = self.m.visited.len(); + self.m.visited.reserve_exact(visited_len - len); + for _ in 0..(visited_len - len) { + self.m.visited.push(0); + } + } + } + + /// Start backtracking at the given position in the input, but also look + /// for literal prefixes. + fn exec_(&mut self, mut at: InputAt, end: usize) -> bool { + self.clear(); + // If this is an anchored regex at the beginning of the input, then + // we're either already done or we only need to try backtracking once. + if self.prog.is_anchored_start { + return if !at.is_start() { false } else { self.backtrack(at) }; + } + let mut matched = false; + loop { + if !self.prog.prefixes.is_empty() { + at = match self.input.prefix_at(&self.prog.prefixes, at) { + None => break, + Some(at) => at, + }; + } + matched = self.backtrack(at) || matched; + if matched && self.prog.matches.len() == 1 { + return true; + } + if at.pos() >= end { + break; + } + at = self.input.at(at.next_pos()); + } + matched + } + + /// The main backtracking loop starting at the given input position. + fn backtrack(&mut self, start: InputAt) -> bool { + // N.B. We use an explicit stack to avoid recursion. + // To avoid excessive pushing and popping, most transitions are handled + // in the `step` helper function, which only pushes to the stack when + // there's a capture or a branch. + let mut matched = false; + self.m.jobs.push(Job::Inst { ip: 0, at: start }); + while let Some(job) = self.m.jobs.pop() { + match job { + Job::Inst { ip, at } => { + if self.step(ip, at) { + // Only quit if we're matching one regex. + // If we're matching a regex set, then mush on and + // try to find other matches (if we want them). + if self.prog.matches.len() == 1 { + return true; + } + matched = true; + } + } + Job::SaveRestore { slot, old_pos } => { + if slot < self.slots.len() { + self.slots[slot] = old_pos; + } + } + } + } + matched + } + + fn step(&mut self, mut ip: InstPtr, mut at: InputAt) -> bool { + use crate::prog::Inst::*; + loop { + // This loop is an optimization to avoid constantly pushing/popping + // from the stack. Namely, if we're pushing a job only to run it + // next, avoid the push and just mutate `ip` (and possibly `at`) + // in place. + if self.has_visited(ip, at) { + return false; + } + match self.prog[ip] { + Match(slot) => { + if slot < self.matches.len() { + self.matches[slot] = true; + } + return true; + } + Save(ref inst) => { + if let Some(&old_pos) = self.slots.get(inst.slot) { + // If this path doesn't work out, then we save the old + // capture index (if one exists) in an alternate + // job. If the next path fails, then the alternate + // job is popped and the old capture index is restored. + self.m.jobs.push(Job::SaveRestore { + slot: inst.slot, + old_pos, + }); + self.slots[inst.slot] = Some(at.pos()); + } + ip = inst.goto; + } + Split(ref inst) => { + self.m.jobs.push(Job::Inst { ip: inst.goto2, at }); + ip = inst.goto1; + } + EmptyLook(ref inst) => { + if self.input.is_empty_match(at, inst) { + ip = inst.goto; + } else { + return false; + } + } + Char(ref inst) => { + if inst.c == at.char() { + ip = inst.goto; + at = self.input.at(at.next_pos()); + } else { + return false; + } + } + Ranges(ref inst) => { + if inst.matches(at.char()) { + ip = inst.goto; + at = self.input.at(at.next_pos()); + } else { + return false; + } + } + Bytes(ref inst) => { + if let Some(b) = at.byte() { + if inst.matches(b) { + ip = inst.goto; + at = self.input.at(at.next_pos()); + continue; + } + } + return false; + } + } + } + } + + fn has_visited(&mut self, ip: InstPtr, at: InputAt) -> bool { + let k = ip * (self.input.len() + 1) + at.pos(); + let k1 = k / BIT_SIZE; + let k2 = usize_to_u32(1 << (k & (BIT_SIZE - 1))); + if self.m.visited[k1] & k2 == 0 { + self.m.visited[k1] |= k2; + false + } else { + true + } + } +} + +fn usize_to_u32(n: usize) -> u32 { + if (n as u64) > (::std::u32::MAX as u64) { + panic!("BUG: {} is too big to fit into u32", n) + } + n as u32 +} diff --git a/third_party/rust/regex/src/compile.rs b/third_party/rust/regex/src/compile.rs new file mode 100644 index 0000000000..90ca25015f --- /dev/null +++ b/third_party/rust/regex/src/compile.rs @@ -0,0 +1,1264 @@ +use std::collections::HashMap; +use std::fmt; +use std::iter; +use std::result; +use std::sync::Arc; + +use regex_syntax::hir::{self, Hir}; +use regex_syntax::is_word_byte; +use regex_syntax::utf8::{Utf8Range, Utf8Sequence, Utf8Sequences}; + +use crate::prog::{ + EmptyLook, Inst, InstBytes, InstChar, InstEmptyLook, InstPtr, InstRanges, + InstSave, InstSplit, Program, +}; + +use crate::Error; + +type Result = result::Result<Patch, Error>; +type ResultOrEmpty = result::Result<Option<Patch>, Error>; + +#[derive(Debug)] +struct Patch { + hole: Hole, + entry: InstPtr, +} + +/// A compiler translates a regular expression AST to a sequence of +/// instructions. The sequence of instructions represents an NFA. +// `Compiler` is only public via the `internal` module, so avoid deriving +// `Debug`. +#[allow(missing_debug_implementations)] +pub struct Compiler { + insts: Vec<MaybeInst>, + compiled: Program, + capture_name_idx: HashMap<String, usize>, + num_exprs: usize, + size_limit: usize, + suffix_cache: SuffixCache, + utf8_seqs: Option<Utf8Sequences>, + byte_classes: ByteClassSet, + // This keeps track of extra bytes allocated while compiling the regex + // program. Currently, this corresponds to two things. First is the heap + // memory allocated by Unicode character classes ('InstRanges'). Second is + // a "fake" amount of memory used by empty sub-expressions, so that enough + // empty sub-expressions will ultimately trigger the compiler to bail + // because of a size limit restriction. (That empty sub-expressions don't + // add to heap memory usage is more-or-less an implementation detail.) In + // the second case, if we don't bail, then an excessively large repetition + // on an empty sub-expression can result in the compiler using a very large + // amount of CPU time. + extra_inst_bytes: usize, +} + +impl Compiler { + /// Create a new regular expression compiler. + /// + /// Various options can be set before calling `compile` on an expression. + pub fn new() -> Self { + Compiler { + insts: vec![], + compiled: Program::new(), + capture_name_idx: HashMap::new(), + num_exprs: 0, + size_limit: 10 * (1 << 20), + suffix_cache: SuffixCache::new(1000), + utf8_seqs: Some(Utf8Sequences::new('\x00', '\x00')), + byte_classes: ByteClassSet::new(), + extra_inst_bytes: 0, + } + } + + /// The size of the resulting program is limited by size_limit. If + /// the program approximately exceeds the given size (in bytes), then + /// compilation will stop and return an error. + pub fn size_limit(mut self, size_limit: usize) -> Self { + self.size_limit = size_limit; + self + } + + /// If bytes is true, then the program is compiled as a byte based + /// automaton, which incorporates UTF-8 decoding into the machine. If it's + /// false, then the automaton is Unicode scalar value based, e.g., an + /// engine utilizing such an automaton is responsible for UTF-8 decoding. + /// + /// The specific invariant is that when returning a byte based machine, + /// the neither the `Char` nor `Ranges` instructions are produced. + /// Conversely, when producing a Unicode scalar value machine, the `Bytes` + /// instruction is never produced. + /// + /// Note that `dfa(true)` implies `bytes(true)`. + pub fn bytes(mut self, yes: bool) -> Self { + self.compiled.is_bytes = yes; + self + } + + /// When disabled, the program compiled may match arbitrary bytes. + /// + /// When enabled (the default), all compiled programs exclusively match + /// valid UTF-8 bytes. + pub fn only_utf8(mut self, yes: bool) -> Self { + self.compiled.only_utf8 = yes; + self + } + + /// When set, the machine returned is suitable for use in the DFA matching + /// engine. + /// + /// In particular, this ensures that if the regex is not anchored in the + /// beginning, then a preceding `.*?` is included in the program. (The NFA + /// based engines handle the preceding `.*?` explicitly, which is difficult + /// or impossible in the DFA engine.) + pub fn dfa(mut self, yes: bool) -> Self { + self.compiled.is_dfa = yes; + self + } + + /// When set, the machine returned is suitable for matching text in + /// reverse. In particular, all concatenations are flipped. + pub fn reverse(mut self, yes: bool) -> Self { + self.compiled.is_reverse = yes; + self + } + + /// Compile a regular expression given its AST. + /// + /// The compiler is guaranteed to succeed unless the program exceeds the + /// specified size limit. If the size limit is exceeded, then compilation + /// stops and returns an error. + pub fn compile(mut self, exprs: &[Hir]) -> result::Result<Program, Error> { + debug_assert!(!exprs.is_empty()); + self.num_exprs = exprs.len(); + if exprs.len() == 1 { + self.compile_one(&exprs[0]) + } else { + self.compile_many(exprs) + } + } + + fn compile_one(mut self, expr: &Hir) -> result::Result<Program, Error> { + // If we're compiling a forward DFA and we aren't anchored, then + // add a `.*?` before the first capture group. + // Other matching engines handle this by baking the logic into the + // matching engine itself. + let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 }; + self.compiled.is_anchored_start = expr.is_anchored_start(); + self.compiled.is_anchored_end = expr.is_anchored_end(); + if self.compiled.needs_dotstar() { + dotstar_patch = self.c_dotstar()?; + self.compiled.start = dotstar_patch.entry; + } + self.compiled.captures = vec![None]; + let patch = + self.c_capture(0, expr)?.unwrap_or_else(|| self.next_inst()); + if self.compiled.needs_dotstar() { + self.fill(dotstar_patch.hole, patch.entry); + } else { + self.compiled.start = patch.entry; + } + self.fill_to_next(patch.hole); + self.compiled.matches = vec![self.insts.len()]; + self.push_compiled(Inst::Match(0)); + self.compile_finish() + } + + fn compile_many( + mut self, + exprs: &[Hir], + ) -> result::Result<Program, Error> { + debug_assert!(exprs.len() > 1); + + self.compiled.is_anchored_start = + exprs.iter().all(|e| e.is_anchored_start()); + self.compiled.is_anchored_end = + exprs.iter().all(|e| e.is_anchored_end()); + let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 }; + if self.compiled.needs_dotstar() { + dotstar_patch = self.c_dotstar()?; + self.compiled.start = dotstar_patch.entry; + } else { + self.compiled.start = 0; // first instruction is always split + } + self.fill_to_next(dotstar_patch.hole); + + let mut prev_hole = Hole::None; + for (i, expr) in exprs[0..exprs.len() - 1].iter().enumerate() { + self.fill_to_next(prev_hole); + let split = self.push_split_hole(); + let Patch { hole, entry } = + self.c_capture(0, expr)?.unwrap_or_else(|| self.next_inst()); + self.fill_to_next(hole); + self.compiled.matches.push(self.insts.len()); + self.push_compiled(Inst::Match(i)); + prev_hole = self.fill_split(split, Some(entry), None); + } + let i = exprs.len() - 1; + let Patch { hole, entry } = + self.c_capture(0, &exprs[i])?.unwrap_or_else(|| self.next_inst()); + self.fill(prev_hole, entry); + self.fill_to_next(hole); + self.compiled.matches.push(self.insts.len()); + self.push_compiled(Inst::Match(i)); + self.compile_finish() + } + + fn compile_finish(mut self) -> result::Result<Program, Error> { + self.compiled.insts = + self.insts.into_iter().map(|inst| inst.unwrap()).collect(); + self.compiled.byte_classes = self.byte_classes.byte_classes(); + self.compiled.capture_name_idx = Arc::new(self.capture_name_idx); + Ok(self.compiled) + } + + /// Compile expr into self.insts, returning a patch on success, + /// or an error if we run out of memory. + /// + /// All of the c_* methods of the compiler share the contract outlined + /// here. + /// + /// The main thing that a c_* method does is mutate `self.insts` + /// to add a list of mostly compiled instructions required to execute + /// the given expression. `self.insts` contains MaybeInsts rather than + /// Insts because there is some backpatching required. + /// + /// The `Patch` value returned by each c_* method provides metadata + /// about the compiled instructions emitted to `self.insts`. The + /// `entry` member of the patch refers to the first instruction + /// (the entry point), while the `hole` member contains zero or + /// more offsets to partial instructions that need to be backpatched. + /// The c_* routine can't know where its list of instructions are going to + /// jump to after execution, so it is up to the caller to patch + /// these jumps to point to the right place. So compiling some + /// expression, e, we would end up with a situation that looked like: + /// + /// ```text + /// self.insts = [ ..., i1, i2, ..., iexit1, ..., iexitn, ...] + /// ^ ^ ^ + /// | \ / + /// entry \ / + /// hole + /// ``` + /// + /// To compile two expressions, e1 and e2, concatenated together we + /// would do: + /// + /// ```ignore + /// let patch1 = self.c(e1); + /// let patch2 = self.c(e2); + /// ``` + /// + /// while leaves us with a situation that looks like + /// + /// ```text + /// self.insts = [ ..., i1, ..., iexit1, ..., i2, ..., iexit2 ] + /// ^ ^ ^ ^ + /// | | | | + /// entry1 hole1 entry2 hole2 + /// ``` + /// + /// Then to merge the two patches together into one we would backpatch + /// hole1 with entry2 and return a new patch that enters at entry1 + /// and has hole2 for a hole. In fact, if you look at the c_concat + /// method you will see that it does exactly this, though it handles + /// a list of expressions rather than just the two that we use for + /// an example. + /// + /// Ok(None) is returned when an expression is compiled to no + /// instruction, and so no patch.entry value makes sense. + fn c(&mut self, expr: &Hir) -> ResultOrEmpty { + use crate::prog; + use regex_syntax::hir::HirKind::*; + + self.check_size()?; + match *expr.kind() { + Empty => self.c_empty(), + Literal(hir::Literal::Unicode(c)) => self.c_char(c), + Literal(hir::Literal::Byte(b)) => { + assert!(self.compiled.uses_bytes()); + self.c_byte(b) + } + Class(hir::Class::Unicode(ref cls)) => self.c_class(cls.ranges()), + Class(hir::Class::Bytes(ref cls)) => { + if self.compiled.uses_bytes() { + self.c_class_bytes(cls.ranges()) + } else { + assert!(cls.is_all_ascii()); + let mut char_ranges = vec![]; + for r in cls.iter() { + let (s, e) = (r.start() as char, r.end() as char); + char_ranges.push(hir::ClassUnicodeRange::new(s, e)); + } + self.c_class(&char_ranges) + } + } + Anchor(hir::Anchor::StartLine) if self.compiled.is_reverse => { + self.byte_classes.set_range(b'\n', b'\n'); + self.c_empty_look(prog::EmptyLook::EndLine) + } + Anchor(hir::Anchor::StartLine) => { + self.byte_classes.set_range(b'\n', b'\n'); + self.c_empty_look(prog::EmptyLook::StartLine) + } + Anchor(hir::Anchor::EndLine) if self.compiled.is_reverse => { + self.byte_classes.set_range(b'\n', b'\n'); + self.c_empty_look(prog::EmptyLook::StartLine) + } + Anchor(hir::Anchor::EndLine) => { + self.byte_classes.set_range(b'\n', b'\n'); + self.c_empty_look(prog::EmptyLook::EndLine) + } + Anchor(hir::Anchor::StartText) if self.compiled.is_reverse => { + self.c_empty_look(prog::EmptyLook::EndText) + } + Anchor(hir::Anchor::StartText) => { + self.c_empty_look(prog::EmptyLook::StartText) + } + Anchor(hir::Anchor::EndText) if self.compiled.is_reverse => { + self.c_empty_look(prog::EmptyLook::StartText) + } + Anchor(hir::Anchor::EndText) => { + self.c_empty_look(prog::EmptyLook::EndText) + } + WordBoundary(hir::WordBoundary::Unicode) => { + if !cfg!(feature = "unicode-perl") { + return Err(Error::Syntax( + "Unicode word boundaries are unavailable when \ + the unicode-perl feature is disabled" + .to_string(), + )); + } + self.compiled.has_unicode_word_boundary = true; + self.byte_classes.set_word_boundary(); + // We also make sure that all ASCII bytes are in a different + // class from non-ASCII bytes. Otherwise, it's possible for + // ASCII bytes to get lumped into the same class as non-ASCII + // bytes. This in turn may cause the lazy DFA to falsely start + // when it sees an ASCII byte that maps to a byte class with + // non-ASCII bytes. This ensures that never happens. + self.byte_classes.set_range(0, 0x7F); + self.c_empty_look(prog::EmptyLook::WordBoundary) + } + WordBoundary(hir::WordBoundary::UnicodeNegate) => { + if !cfg!(feature = "unicode-perl") { + return Err(Error::Syntax( + "Unicode word boundaries are unavailable when \ + the unicode-perl feature is disabled" + .to_string(), + )); + } + self.compiled.has_unicode_word_boundary = true; + self.byte_classes.set_word_boundary(); + // See comments above for why we set the ASCII range here. + self.byte_classes.set_range(0, 0x7F); + self.c_empty_look(prog::EmptyLook::NotWordBoundary) + } + WordBoundary(hir::WordBoundary::Ascii) => { + self.byte_classes.set_word_boundary(); + self.c_empty_look(prog::EmptyLook::WordBoundaryAscii) + } + WordBoundary(hir::WordBoundary::AsciiNegate) => { + self.byte_classes.set_word_boundary(); + self.c_empty_look(prog::EmptyLook::NotWordBoundaryAscii) + } + Group(ref g) => match g.kind { + hir::GroupKind::NonCapturing => self.c(&g.hir), + hir::GroupKind::CaptureIndex(index) => { + if index as usize >= self.compiled.captures.len() { + self.compiled.captures.push(None); + } + self.c_capture(2 * index as usize, &g.hir) + } + hir::GroupKind::CaptureName { index, ref name } => { + if index as usize >= self.compiled.captures.len() { + let n = name.to_string(); + self.compiled.captures.push(Some(n.clone())); + self.capture_name_idx.insert(n, index as usize); + } + self.c_capture(2 * index as usize, &g.hir) + } + }, + Concat(ref es) => { + if self.compiled.is_reverse { + self.c_concat(es.iter().rev()) + } else { + self.c_concat(es) + } + } + Alternation(ref es) => self.c_alternate(&**es), + Repetition(ref rep) => self.c_repeat(rep), + } + } + + fn c_empty(&mut self) -> ResultOrEmpty { + // See: https://github.com/rust-lang/regex/security/advisories/GHSA-m5pq-gvj9-9vr8 + // See: CVE-2022-24713 + // + // Since 'empty' sub-expressions don't increase the size of + // the actual compiled object, we "fake" an increase in its + // size so that our 'check_size_limit' routine will eventually + // stop compilation if there are too many empty sub-expressions + // (e.g., via a large repetition). + self.extra_inst_bytes += std::mem::size_of::<Inst>(); + Ok(None) + } + + fn c_capture(&mut self, first_slot: usize, expr: &Hir) -> ResultOrEmpty { + if self.num_exprs > 1 || self.compiled.is_dfa { + // Don't ever compile Save instructions for regex sets because + // they are never used. They are also never used in DFA programs + // because DFAs can't handle captures. + self.c(expr) + } else { + let entry = self.insts.len(); + let hole = self.push_hole(InstHole::Save { slot: first_slot }); + let patch = self.c(expr)?.unwrap_or_else(|| self.next_inst()); + self.fill(hole, patch.entry); + self.fill_to_next(patch.hole); + let hole = self.push_hole(InstHole::Save { slot: first_slot + 1 }); + Ok(Some(Patch { hole, entry })) + } + } + + fn c_dotstar(&mut self) -> Result { + Ok(if !self.compiled.only_utf8() { + self.c(&Hir::repetition(hir::Repetition { + kind: hir::RepetitionKind::ZeroOrMore, + greedy: false, + hir: Box::new(Hir::any(true)), + }))? + .unwrap() + } else { + self.c(&Hir::repetition(hir::Repetition { + kind: hir::RepetitionKind::ZeroOrMore, + greedy: false, + hir: Box::new(Hir::any(false)), + }))? + .unwrap() + }) + } + + fn c_char(&mut self, c: char) -> ResultOrEmpty { + if self.compiled.uses_bytes() { + if c.is_ascii() { + let b = c as u8; + let hole = + self.push_hole(InstHole::Bytes { start: b, end: b }); + self.byte_classes.set_range(b, b); + Ok(Some(Patch { hole, entry: self.insts.len() - 1 })) + } else { + self.c_class(&[hir::ClassUnicodeRange::new(c, c)]) + } + } else { + let hole = self.push_hole(InstHole::Char { c }); + Ok(Some(Patch { hole, entry: self.insts.len() - 1 })) + } + } + + fn c_class(&mut self, ranges: &[hir::ClassUnicodeRange]) -> ResultOrEmpty { + use std::mem::size_of; + + assert!(!ranges.is_empty()); + if self.compiled.uses_bytes() { + Ok(Some(CompileClass { c: self, ranges }.compile()?)) + } else { + let ranges: Vec<(char, char)> = + ranges.iter().map(|r| (r.start(), r.end())).collect(); + let hole = if ranges.len() == 1 && ranges[0].0 == ranges[0].1 { + self.push_hole(InstHole::Char { c: ranges[0].0 }) + } else { + self.extra_inst_bytes += + ranges.len() * (size_of::<char>() * 2); + self.push_hole(InstHole::Ranges { ranges }) + }; + Ok(Some(Patch { hole, entry: self.insts.len() - 1 })) + } + } + + fn c_byte(&mut self, b: u8) -> ResultOrEmpty { + self.c_class_bytes(&[hir::ClassBytesRange::new(b, b)]) + } + + fn c_class_bytes( + &mut self, + ranges: &[hir::ClassBytesRange], + ) -> ResultOrEmpty { + debug_assert!(!ranges.is_empty()); + + let first_split_entry = self.insts.len(); + let mut holes = vec![]; + let mut prev_hole = Hole::None; + for r in &ranges[0..ranges.len() - 1] { + self.fill_to_next(prev_hole); + let split = self.push_split_hole(); + let next = self.insts.len(); + self.byte_classes.set_range(r.start(), r.end()); + holes.push(self.push_hole(InstHole::Bytes { + start: r.start(), + end: r.end(), + })); + prev_hole = self.fill_split(split, Some(next), None); + } + let next = self.insts.len(); + let r = &ranges[ranges.len() - 1]; + self.byte_classes.set_range(r.start(), r.end()); + holes.push( + self.push_hole(InstHole::Bytes { start: r.start(), end: r.end() }), + ); + self.fill(prev_hole, next); + Ok(Some(Patch { hole: Hole::Many(holes), entry: first_split_entry })) + } + + fn c_empty_look(&mut self, look: EmptyLook) -> ResultOrEmpty { + let hole = self.push_hole(InstHole::EmptyLook { look }); + Ok(Some(Patch { hole, entry: self.insts.len() - 1 })) + } + + fn c_concat<'a, I>(&mut self, exprs: I) -> ResultOrEmpty + where + I: IntoIterator<Item = &'a Hir>, + { + let mut exprs = exprs.into_iter(); + let Patch { mut hole, entry } = loop { + match exprs.next() { + None => return self.c_empty(), + Some(e) => { + if let Some(p) = self.c(e)? { + break p; + } + } + } + }; + for e in exprs { + if let Some(p) = self.c(e)? { + self.fill(hole, p.entry); + hole = p.hole; + } + } + Ok(Some(Patch { hole, entry })) + } + + fn c_alternate(&mut self, exprs: &[Hir]) -> ResultOrEmpty { + debug_assert!( + exprs.len() >= 2, + "alternates must have at least 2 exprs" + ); + + // Initial entry point is always the first split. + let first_split_entry = self.insts.len(); + + // Save up all of the holes from each alternate. They will all get + // patched to point to the same location. + let mut holes = vec![]; + + // true indicates that the hole is a split where we want to fill + // the second branch. + let mut prev_hole = (Hole::None, false); + for e in &exprs[0..exprs.len() - 1] { + if prev_hole.1 { + let next = self.insts.len(); + self.fill_split(prev_hole.0, None, Some(next)); + } else { + self.fill_to_next(prev_hole.0); + } + let split = self.push_split_hole(); + if let Some(Patch { hole, entry }) = self.c(e)? { + holes.push(hole); + prev_hole = (self.fill_split(split, Some(entry), None), false); + } else { + let (split1, split2) = split.dup_one(); + holes.push(split1); + prev_hole = (split2, true); + } + } + if let Some(Patch { hole, entry }) = self.c(&exprs[exprs.len() - 1])? { + holes.push(hole); + if prev_hole.1 { + self.fill_split(prev_hole.0, None, Some(entry)); + } else { + self.fill(prev_hole.0, entry); + } + } else { + // We ignore prev_hole.1. When it's true, it means we have two + // empty branches both pushing prev_hole.0 into holes, so both + // branches will go to the same place anyway. + holes.push(prev_hole.0); + } + Ok(Some(Patch { hole: Hole::Many(holes), entry: first_split_entry })) + } + + fn c_repeat(&mut self, rep: &hir::Repetition) -> ResultOrEmpty { + use regex_syntax::hir::RepetitionKind::*; + match rep.kind { + ZeroOrOne => self.c_repeat_zero_or_one(&rep.hir, rep.greedy), + ZeroOrMore => self.c_repeat_zero_or_more(&rep.hir, rep.greedy), + OneOrMore => self.c_repeat_one_or_more(&rep.hir, rep.greedy), + Range(hir::RepetitionRange::Exactly(min_max)) => { + self.c_repeat_range(&rep.hir, rep.greedy, min_max, min_max) + } + Range(hir::RepetitionRange::AtLeast(min)) => { + self.c_repeat_range_min_or_more(&rep.hir, rep.greedy, min) + } + Range(hir::RepetitionRange::Bounded(min, max)) => { + self.c_repeat_range(&rep.hir, rep.greedy, min, max) + } + } + } + + fn c_repeat_zero_or_one( + &mut self, + expr: &Hir, + greedy: bool, + ) -> ResultOrEmpty { + let split_entry = self.insts.len(); + let split = self.push_split_hole(); + let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? { + Some(p) => p, + None => return self.pop_split_hole(), + }; + let split_hole = if greedy { + self.fill_split(split, Some(entry_rep), None) + } else { + self.fill_split(split, None, Some(entry_rep)) + }; + let holes = vec![hole_rep, split_hole]; + Ok(Some(Patch { hole: Hole::Many(holes), entry: split_entry })) + } + + fn c_repeat_zero_or_more( + &mut self, + expr: &Hir, + greedy: bool, + ) -> ResultOrEmpty { + let split_entry = self.insts.len(); + let split = self.push_split_hole(); + let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? { + Some(p) => p, + None => return self.pop_split_hole(), + }; + + self.fill(hole_rep, split_entry); + let split_hole = if greedy { + self.fill_split(split, Some(entry_rep), None) + } else { + self.fill_split(split, None, Some(entry_rep)) + }; + Ok(Some(Patch { hole: split_hole, entry: split_entry })) + } + + fn c_repeat_one_or_more( + &mut self, + expr: &Hir, + greedy: bool, + ) -> ResultOrEmpty { + let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? { + Some(p) => p, + None => return Ok(None), + }; + self.fill_to_next(hole_rep); + let split = self.push_split_hole(); + + let split_hole = if greedy { + self.fill_split(split, Some(entry_rep), None) + } else { + self.fill_split(split, None, Some(entry_rep)) + }; + Ok(Some(Patch { hole: split_hole, entry: entry_rep })) + } + + fn c_repeat_range_min_or_more( + &mut self, + expr: &Hir, + greedy: bool, + min: u32, + ) -> ResultOrEmpty { + let min = u32_to_usize(min); + // Using next_inst() is ok, because we can't return it (concat would + // have to return Some(_) while c_repeat_range_min_or_more returns + // None). + let patch_concat = self + .c_concat(iter::repeat(expr).take(min))? + .unwrap_or_else(|| self.next_inst()); + if let Some(patch_rep) = self.c_repeat_zero_or_more(expr, greedy)? { + self.fill(patch_concat.hole, patch_rep.entry); + Ok(Some(Patch { hole: patch_rep.hole, entry: patch_concat.entry })) + } else { + Ok(None) + } + } + + fn c_repeat_range( + &mut self, + expr: &Hir, + greedy: bool, + min: u32, + max: u32, + ) -> ResultOrEmpty { + let (min, max) = (u32_to_usize(min), u32_to_usize(max)); + debug_assert!(min <= max); + let patch_concat = self.c_concat(iter::repeat(expr).take(min))?; + if min == max { + return Ok(patch_concat); + } + // Same reasoning as in c_repeat_range_min_or_more (we know that min < + // max at this point). + let patch_concat = patch_concat.unwrap_or_else(|| self.next_inst()); + let initial_entry = patch_concat.entry; + // It is much simpler to compile, e.g., `a{2,5}` as: + // + // aaa?a?a? + // + // But you end up with a sequence of instructions like this: + // + // 0: 'a' + // 1: 'a', + // 2: split(3, 4) + // 3: 'a' + // 4: split(5, 6) + // 5: 'a' + // 6: split(7, 8) + // 7: 'a' + // 8: MATCH + // + // This is *incredibly* inefficient because the splits end + // up forming a chain, which has to be resolved everything a + // transition is followed. + let mut holes = vec![]; + let mut prev_hole = patch_concat.hole; + for _ in min..max { + self.fill_to_next(prev_hole); + let split = self.push_split_hole(); + let Patch { hole, entry } = match self.c(expr)? { + Some(p) => p, + None => return self.pop_split_hole(), + }; + prev_hole = hole; + if greedy { + holes.push(self.fill_split(split, Some(entry), None)); + } else { + holes.push(self.fill_split(split, None, Some(entry))); + } + } + holes.push(prev_hole); + Ok(Some(Patch { hole: Hole::Many(holes), entry: initial_entry })) + } + + /// Can be used as a default value for the c_* functions when the call to + /// c_function is followed by inserting at least one instruction that is + /// always executed after the ones written by the c* function. + fn next_inst(&self) -> Patch { + Patch { hole: Hole::None, entry: self.insts.len() } + } + + fn fill(&mut self, hole: Hole, goto: InstPtr) { + match hole { + Hole::None => {} + Hole::One(pc) => { + self.insts[pc].fill(goto); + } + Hole::Many(holes) => { + for hole in holes { + self.fill(hole, goto); + } + } + } + } + + fn fill_to_next(&mut self, hole: Hole) { + let next = self.insts.len(); + self.fill(hole, next); + } + + fn fill_split( + &mut self, + hole: Hole, + goto1: Option<InstPtr>, + goto2: Option<InstPtr>, + ) -> Hole { + match hole { + Hole::None => Hole::None, + Hole::One(pc) => match (goto1, goto2) { + (Some(goto1), Some(goto2)) => { + self.insts[pc].fill_split(goto1, goto2); + Hole::None + } + (Some(goto1), None) => { + self.insts[pc].half_fill_split_goto1(goto1); + Hole::One(pc) + } + (None, Some(goto2)) => { + self.insts[pc].half_fill_split_goto2(goto2); + Hole::One(pc) + } + (None, None) => unreachable!( + "at least one of the split \ + holes must be filled" + ), + }, + Hole::Many(holes) => { + let mut new_holes = vec![]; + for hole in holes { + new_holes.push(self.fill_split(hole, goto1, goto2)); + } + if new_holes.is_empty() { + Hole::None + } else if new_holes.len() == 1 { + new_holes.pop().unwrap() + } else { + Hole::Many(new_holes) + } + } + } + } + + fn push_compiled(&mut self, inst: Inst) { + self.insts.push(MaybeInst::Compiled(inst)); + } + + fn push_hole(&mut self, inst: InstHole) -> Hole { + let hole = self.insts.len(); + self.insts.push(MaybeInst::Uncompiled(inst)); + Hole::One(hole) + } + + fn push_split_hole(&mut self) -> Hole { + let hole = self.insts.len(); + self.insts.push(MaybeInst::Split); + Hole::One(hole) + } + + fn pop_split_hole(&mut self) -> ResultOrEmpty { + self.insts.pop(); + Ok(None) + } + + fn check_size(&self) -> result::Result<(), Error> { + use std::mem::size_of; + + let size = + self.extra_inst_bytes + (self.insts.len() * size_of::<Inst>()); + if size > self.size_limit { + Err(Error::CompiledTooBig(self.size_limit)) + } else { + Ok(()) + } + } +} + +#[derive(Debug)] +enum Hole { + None, + One(InstPtr), + Many(Vec<Hole>), +} + +impl Hole { + fn dup_one(self) -> (Self, Self) { + match self { + Hole::One(pc) => (Hole::One(pc), Hole::One(pc)), + Hole::None | Hole::Many(_) => { + unreachable!("must be called on single hole") + } + } + } +} + +#[derive(Clone, Debug)] +enum MaybeInst { + Compiled(Inst), + Uncompiled(InstHole), + Split, + Split1(InstPtr), + Split2(InstPtr), +} + +impl MaybeInst { + fn fill(&mut self, goto: InstPtr) { + let maybeinst = match *self { + MaybeInst::Split => MaybeInst::Split1(goto), + MaybeInst::Uncompiled(ref inst) => { + MaybeInst::Compiled(inst.fill(goto)) + } + MaybeInst::Split1(goto1) => { + MaybeInst::Compiled(Inst::Split(InstSplit { + goto1, + goto2: goto, + })) + } + MaybeInst::Split2(goto2) => { + MaybeInst::Compiled(Inst::Split(InstSplit { + goto1: goto, + goto2, + })) + } + _ => unreachable!( + "not all instructions were compiled! \ + found uncompiled instruction: {:?}", + self + ), + }; + *self = maybeinst; + } + + fn fill_split(&mut self, goto1: InstPtr, goto2: InstPtr) { + let filled = match *self { + MaybeInst::Split => Inst::Split(InstSplit { goto1, goto2 }), + _ => unreachable!( + "must be called on Split instruction, \ + instead it was called on: {:?}", + self + ), + }; + *self = MaybeInst::Compiled(filled); + } + + fn half_fill_split_goto1(&mut self, goto1: InstPtr) { + let half_filled = match *self { + MaybeInst::Split => goto1, + _ => unreachable!( + "must be called on Split instruction, \ + instead it was called on: {:?}", + self + ), + }; + *self = MaybeInst::Split1(half_filled); + } + + fn half_fill_split_goto2(&mut self, goto2: InstPtr) { + let half_filled = match *self { + MaybeInst::Split => goto2, + _ => unreachable!( + "must be called on Split instruction, \ + instead it was called on: {:?}", + self + ), + }; + *self = MaybeInst::Split2(half_filled); + } + + fn unwrap(self) -> Inst { + match self { + MaybeInst::Compiled(inst) => inst, + _ => unreachable!( + "must be called on a compiled instruction, \ + instead it was called on: {:?}", + self + ), + } + } +} + +#[derive(Clone, Debug)] +enum InstHole { + Save { slot: usize }, + EmptyLook { look: EmptyLook }, + Char { c: char }, + Ranges { ranges: Vec<(char, char)> }, + Bytes { start: u8, end: u8 }, +} + +impl InstHole { + fn fill(&self, goto: InstPtr) -> Inst { + match *self { + InstHole::Save { slot } => Inst::Save(InstSave { goto, slot }), + InstHole::EmptyLook { look } => { + Inst::EmptyLook(InstEmptyLook { goto, look }) + } + InstHole::Char { c } => Inst::Char(InstChar { goto, c }), + InstHole::Ranges { ref ranges } => Inst::Ranges(InstRanges { + goto, + ranges: ranges.clone().into_boxed_slice(), + }), + InstHole::Bytes { start, end } => { + Inst::Bytes(InstBytes { goto, start, end }) + } + } + } +} + +struct CompileClass<'a, 'b> { + c: &'a mut Compiler, + ranges: &'b [hir::ClassUnicodeRange], +} + +impl<'a, 'b> CompileClass<'a, 'b> { + fn compile(mut self) -> Result { + let mut holes = vec![]; + let mut initial_entry = None; + let mut last_split = Hole::None; + let mut utf8_seqs = self.c.utf8_seqs.take().unwrap(); + self.c.suffix_cache.clear(); + + for (i, range) in self.ranges.iter().enumerate() { + let is_last_range = i + 1 == self.ranges.len(); + utf8_seqs.reset(range.start(), range.end()); + let mut it = (&mut utf8_seqs).peekable(); + loop { + let utf8_seq = match it.next() { + None => break, + Some(utf8_seq) => utf8_seq, + }; + if is_last_range && it.peek().is_none() { + let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?; + holes.push(hole); + self.c.fill(last_split, entry); + last_split = Hole::None; + if initial_entry.is_none() { + initial_entry = Some(entry); + } + } else { + if initial_entry.is_none() { + initial_entry = Some(self.c.insts.len()); + } + self.c.fill_to_next(last_split); + last_split = self.c.push_split_hole(); + let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?; + holes.push(hole); + last_split = + self.c.fill_split(last_split, Some(entry), None); + } + } + } + self.c.utf8_seqs = Some(utf8_seqs); + Ok(Patch { hole: Hole::Many(holes), entry: initial_entry.unwrap() }) + } + + fn c_utf8_seq(&mut self, seq: &Utf8Sequence) -> Result { + if self.c.compiled.is_reverse { + self.c_utf8_seq_(seq) + } else { + self.c_utf8_seq_(seq.into_iter().rev()) + } + } + + fn c_utf8_seq_<'r, I>(&mut self, seq: I) -> Result + where + I: IntoIterator<Item = &'r Utf8Range>, + { + // The initial instruction for each UTF-8 sequence should be the same. + let mut from_inst = ::std::usize::MAX; + let mut last_hole = Hole::None; + for byte_range in seq { + let key = SuffixCacheKey { + from_inst, + start: byte_range.start, + end: byte_range.end, + }; + { + let pc = self.c.insts.len(); + if let Some(cached_pc) = self.c.suffix_cache.get(key, pc) { + from_inst = cached_pc; + continue; + } + } + self.c.byte_classes.set_range(byte_range.start, byte_range.end); + if from_inst == ::std::usize::MAX { + last_hole = self.c.push_hole(InstHole::Bytes { + start: byte_range.start, + end: byte_range.end, + }); + } else { + self.c.push_compiled(Inst::Bytes(InstBytes { + goto: from_inst, + start: byte_range.start, + end: byte_range.end, + })); + } + from_inst = self.c.insts.len().checked_sub(1).unwrap(); + debug_assert!(from_inst < ::std::usize::MAX); + } + debug_assert!(from_inst < ::std::usize::MAX); + Ok(Patch { hole: last_hole, entry: from_inst }) + } +} + +/// `SuffixCache` is a simple bounded hash map for caching suffix entries in +/// UTF-8 automata. For example, consider the Unicode range \u{0}-\u{FFFF}. +/// The set of byte ranges looks like this: +/// +/// [0-7F] +/// [C2-DF][80-BF] +/// [E0][A0-BF][80-BF] +/// [E1-EC][80-BF][80-BF] +/// [ED][80-9F][80-BF] +/// [EE-EF][80-BF][80-BF] +/// +/// Each line above translates to one alternate in the compiled regex program. +/// However, all but one of the alternates end in the same suffix, which is +/// a waste of an instruction. The suffix cache facilitates reusing them across +/// alternates. +/// +/// Note that a HashMap could be trivially used for this, but we don't need its +/// overhead. Some small bounded space (LRU style) is more than enough. +/// +/// This uses similar idea to [`SparseSet`](../sparse/struct.SparseSet.html), +/// except it uses hashes as original indices and then compares full keys for +/// validation against `dense` array. +#[derive(Debug)] +struct SuffixCache { + sparse: Box<[usize]>, + dense: Vec<SuffixCacheEntry>, +} + +#[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)] +struct SuffixCacheEntry { + key: SuffixCacheKey, + pc: InstPtr, +} + +#[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)] +struct SuffixCacheKey { + from_inst: InstPtr, + start: u8, + end: u8, +} + +impl SuffixCache { + fn new(size: usize) -> Self { + SuffixCache { + sparse: vec![0usize; size].into(), + dense: Vec::with_capacity(size), + } + } + + fn get(&mut self, key: SuffixCacheKey, pc: InstPtr) -> Option<InstPtr> { + let hash = self.hash(&key); + let pos = &mut self.sparse[hash]; + if let Some(entry) = self.dense.get(*pos) { + if entry.key == key { + return Some(entry.pc); + } + } + *pos = self.dense.len(); + self.dense.push(SuffixCacheEntry { key, pc }); + None + } + + fn clear(&mut self) { + self.dense.clear(); + } + + fn hash(&self, suffix: &SuffixCacheKey) -> usize { + // Basic FNV-1a hash as described: + // https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function + const FNV_PRIME: u64 = 1_099_511_628_211; + let mut h = 14_695_981_039_346_656_037; + h = (h ^ (suffix.from_inst as u64)).wrapping_mul(FNV_PRIME); + h = (h ^ (suffix.start as u64)).wrapping_mul(FNV_PRIME); + h = (h ^ (suffix.end as u64)).wrapping_mul(FNV_PRIME); + (h as usize) % self.sparse.len() + } +} + +struct ByteClassSet([bool; 256]); + +impl ByteClassSet { + fn new() -> Self { + ByteClassSet([false; 256]) + } + + fn set_range(&mut self, start: u8, end: u8) { + debug_assert!(start <= end); + if start > 0 { + self.0[start as usize - 1] = true; + } + self.0[end as usize] = true; + } + + fn set_word_boundary(&mut self) { + // We need to mark all ranges of bytes whose pairs result in + // evaluating \b differently. + let iswb = is_word_byte; + let mut b1: u16 = 0; + let mut b2: u16; + while b1 <= 255 { + b2 = b1 + 1; + while b2 <= 255 && iswb(b1 as u8) == iswb(b2 as u8) { + b2 += 1; + } + self.set_range(b1 as u8, (b2 - 1) as u8); + b1 = b2; + } + } + + fn byte_classes(&self) -> Vec<u8> { + // N.B. If you're debugging the DFA, it's useful to simply return + // `(0..256).collect()`, which effectively removes the byte classes + // and makes the transitions easier to read. + // (0usize..256).map(|x| x as u8).collect() + let mut byte_classes = vec![0; 256]; + let mut class = 0u8; + let mut i = 0; + loop { + byte_classes[i] = class as u8; + if i >= 255 { + break; + } + if self.0[i] { + class = class.checked_add(1).unwrap(); + } + i += 1; + } + byte_classes + } +} + +impl fmt::Debug for ByteClassSet { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_tuple("ByteClassSet").field(&&self.0[..]).finish() + } +} + +fn u32_to_usize(n: u32) -> usize { + // In case usize is less than 32 bits, we need to guard against overflow. + // On most platforms this compiles to nothing. + // TODO Use `std::convert::TryFrom` once it's stable. + if (n as u64) > (::std::usize::MAX as u64) { + panic!("BUG: {} is too big to be pointer sized", n) + } + n as usize +} + +#[cfg(test)] +mod tests { + use super::ByteClassSet; + + #[test] + fn byte_classes() { + let mut set = ByteClassSet::new(); + set.set_range(b'a', b'z'); + let classes = set.byte_classes(); + assert_eq!(classes[0], 0); + assert_eq!(classes[1], 0); + assert_eq!(classes[2], 0); + assert_eq!(classes[b'a' as usize - 1], 0); + assert_eq!(classes[b'a' as usize], 1); + assert_eq!(classes[b'm' as usize], 1); + assert_eq!(classes[b'z' as usize], 1); + assert_eq!(classes[b'z' as usize + 1], 2); + assert_eq!(classes[254], 2); + assert_eq!(classes[255], 2); + + let mut set = ByteClassSet::new(); + set.set_range(0, 2); + set.set_range(4, 6); + let classes = set.byte_classes(); + assert_eq!(classes[0], 0); + assert_eq!(classes[1], 0); + assert_eq!(classes[2], 0); + assert_eq!(classes[3], 1); + assert_eq!(classes[4], 2); + assert_eq!(classes[5], 2); + assert_eq!(classes[6], 2); + assert_eq!(classes[7], 3); + assert_eq!(classes[255], 3); + } + + #[test] + fn full_byte_classes() { + let mut set = ByteClassSet::new(); + for i in 0..256u16 { + set.set_range(i as u8, i as u8); + } + assert_eq!(set.byte_classes().len(), 256); + } +} diff --git a/third_party/rust/regex/src/dfa.rs b/third_party/rust/regex/src/dfa.rs new file mode 100644 index 0000000000..dc9952120e --- /dev/null +++ b/third_party/rust/regex/src/dfa.rs @@ -0,0 +1,1945 @@ +/*! +The DFA matching engine. + +A DFA provides faster matching because the engine is in exactly one state at +any point in time. In the NFA, there may be multiple active states, and +considerable CPU cycles are spent shuffling them around. In finite automata +speak, the DFA follows epsilon transitions in the regex far less than the NFA. + +A DFA is a classic trade off between time and space. The NFA is slower, but +its memory requirements are typically small and predictable. The DFA is faster, +but given the right regex and the right input, the number of states in the +DFA can grow exponentially. To mitigate this space problem, we do two things: + +1. We implement an *online* DFA. That is, the DFA is constructed from the NFA + during a search. When a new state is computed, it is stored in a cache so + that it may be reused. An important consequence of this implementation + is that states that are never reached for a particular input are never + computed. (This is impossible in an "offline" DFA which needs to compute + all possible states up front.) +2. If the cache gets too big, we wipe it and continue matching. + +In pathological cases, a new state can be created for every byte of input. +(e.g., The regex `(a|b)*a(a|b){20}` on a long sequence of a's and b's.) +In this case, performance regresses to slightly slower than the full NFA +simulation, in large part because the cache becomes useless. If the cache +is wiped too frequently, the DFA quits and control falls back to one of the +NFA simulations. + +Because of the "lazy" nature of this DFA, the inner matching loop is +considerably more complex than one might expect out of a DFA. A number of +tricks are employed to make it fast. Tread carefully. + +N.B. While this implementation is heavily commented, Russ Cox's series of +articles on regexes is strongly recommended: <https://swtch.com/~rsc/regexp/> +(As is the DFA implementation in RE2, which heavily influenced this +implementation.) +*/ + +use std::collections::HashMap; +use std::fmt; +use std::iter::repeat; +use std::mem; +use std::sync::Arc; + +use crate::exec::ProgramCache; +use crate::prog::{Inst, Program}; +use crate::sparse::SparseSet; + +/// Return true if and only if the given program can be executed by a DFA. +/// +/// Generally, a DFA is always possible. A pathological case where it is not +/// possible is if the number of NFA states exceeds `u32::MAX`, in which case, +/// this function will return false. +/// +/// This function will also return false if the given program has any Unicode +/// instructions (Char or Ranges) since the DFA operates on bytes only. +pub fn can_exec(insts: &Program) -> bool { + use crate::prog::Inst::*; + // If for some reason we manage to allocate a regex program with more + // than i32::MAX instructions, then we can't execute the DFA because we + // use 32 bit instruction pointer deltas for memory savings. + // If i32::MAX is the largest positive delta, + // then -i32::MAX == i32::MIN + 1 is the largest negative delta, + // and we are OK to use 32 bits. + if insts.dfa_size_limit == 0 || insts.len() > ::std::i32::MAX as usize { + return false; + } + for inst in insts { + match *inst { + Char(_) | Ranges(_) => return false, + EmptyLook(_) | Match(_) | Save(_) | Split(_) | Bytes(_) => {} + } + } + true +} + +/// A reusable cache of DFA states. +/// +/// This cache is reused between multiple invocations of the same regex +/// program. (It is not shared simultaneously between threads. If there is +/// contention, then new caches are created.) +#[derive(Debug)] +pub struct Cache { + /// Group persistent DFA related cache state together. The sparse sets + /// listed below are used as scratch space while computing uncached states. + inner: CacheInner, + /// qcur and qnext are ordered sets with constant time + /// addition/membership/clearing-whole-set and linear time iteration. They + /// are used to manage the sets of NFA states in DFA states when computing + /// cached DFA states. In particular, the order of the NFA states matters + /// for leftmost-first style matching. Namely, when computing a cached + /// state, the set of NFA states stops growing as soon as the first Match + /// instruction is observed. + qcur: SparseSet, + qnext: SparseSet, +} + +/// `CacheInner` is logically just a part of Cache, but groups together fields +/// that aren't passed as function parameters throughout search. (This split +/// is mostly an artifact of the borrow checker. It is happily paid.) +#[derive(Debug)] +struct CacheInner { + /// A cache of pre-compiled DFA states, keyed by the set of NFA states + /// and the set of empty-width flags set at the byte in the input when the + /// state was observed. + /// + /// A StatePtr is effectively a `*State`, but to avoid various inconvenient + /// things, we just pass indexes around manually. The performance impact of + /// this is probably an instruction or two in the inner loop. However, on + /// 64 bit, each StatePtr is half the size of a *State. + compiled: StateMap, + /// The transition table. + /// + /// The transition table is laid out in row-major order, where states are + /// rows and the transitions for each state are columns. At a high level, + /// given state `s` and byte `b`, the next state can be found at index + /// `s * 256 + b`. + /// + /// This is, of course, a lie. A StatePtr is actually a pointer to the + /// *start* of a row in this table. When indexing in the DFA's inner loop, + /// this removes the need to multiply the StatePtr by the stride. Yes, it + /// matters. This reduces the number of states we can store, but: the + /// stride is rarely 256 since we define transitions in terms of + /// *equivalence classes* of bytes. Each class corresponds to a set of + /// bytes that never discriminate a distinct path through the DFA from each + /// other. + trans: Transitions, + /// A set of cached start states, which are limited to the number of + /// permutations of flags set just before the initial byte of input. (The + /// index into this vec is a `EmptyFlags`.) + /// + /// N.B. A start state can be "dead" (i.e., no possible match), so we + /// represent it with a StatePtr. + start_states: Vec<StatePtr>, + /// Stack scratch space used to follow epsilon transitions in the NFA. + /// (This permits us to avoid recursion.) + /// + /// The maximum stack size is the number of NFA states. + stack: Vec<InstPtr>, + /// The total number of times this cache has been flushed by the DFA + /// because of space constraints. + flush_count: u64, + /// The total heap size of the DFA's cache. We use this to determine when + /// we should flush the cache. + size: usize, + /// Scratch space used when building instruction pointer lists for new + /// states. This helps amortize allocation. + insts_scratch_space: Vec<u8>, +} + +/// The transition table. +/// +/// It is laid out in row-major order, with states as rows and byte class +/// transitions as columns. +/// +/// The transition table is responsible for producing valid `StatePtrs`. A +/// `StatePtr` points to the start of a particular row in this table. When +/// indexing to find the next state this allows us to avoid a multiplication +/// when computing an index into the table. +#[derive(Clone)] +struct Transitions { + /// The table. + table: Vec<StatePtr>, + /// The stride. + num_byte_classes: usize, +} + +/// Fsm encapsulates the actual execution of the DFA. +#[derive(Debug)] +pub struct Fsm<'a> { + /// prog contains the NFA instruction opcodes. DFA execution uses either + /// the `dfa` instructions or the `dfa_reverse` instructions from + /// `exec::ExecReadOnly`. (It never uses `ExecReadOnly.nfa`, which may have + /// Unicode opcodes that cannot be executed by the DFA.) + prog: &'a Program, + /// The start state. We record it here because the pointer may change + /// when the cache is wiped. + start: StatePtr, + /// The current position in the input. + at: usize, + /// Should we quit after seeing the first match? e.g., When the caller + /// uses `is_match` or `shortest_match`. + quit_after_match: bool, + /// The last state that matched. + /// + /// When no match has occurred, this is set to STATE_UNKNOWN. + /// + /// This is only useful when matching regex sets. The last match state + /// is useful because it contains all of the match instructions seen, + /// thereby allowing us to enumerate which regexes in the set matched. + last_match_si: StatePtr, + /// The input position of the last cache flush. We use this to determine + /// if we're thrashing in the cache too often. If so, the DFA quits so + /// that we can fall back to the NFA algorithm. + last_cache_flush: usize, + /// All cached DFA information that is persisted between searches. + cache: &'a mut CacheInner, +} + +/// The result of running the DFA. +/// +/// Generally, the result is either a match or not a match, but sometimes the +/// DFA runs too slowly because the cache size is too small. In that case, it +/// gives up with the intent of falling back to the NFA algorithm. +/// +/// The DFA can also give up if it runs out of room to create new states, or if +/// it sees non-ASCII bytes in the presence of a Unicode word boundary. +#[derive(Clone, Debug)] +pub enum Result<T> { + Match(T), + NoMatch(usize), + Quit, +} + +impl<T> Result<T> { + /// Returns true if this result corresponds to a match. + pub fn is_match(&self) -> bool { + match *self { + Result::Match(_) => true, + Result::NoMatch(_) | Result::Quit => false, + } + } + + /// Maps the given function onto T and returns the result. + /// + /// If this isn't a match, then this is a no-op. + #[cfg(feature = "perf-literal")] + pub fn map<U, F: FnMut(T) -> U>(self, mut f: F) -> Result<U> { + match self { + Result::Match(t) => Result::Match(f(t)), + Result::NoMatch(x) => Result::NoMatch(x), + Result::Quit => Result::Quit, + } + } + + /// Sets the non-match position. + /// + /// If this isn't a non-match, then this is a no-op. + fn set_non_match(self, at: usize) -> Result<T> { + match self { + Result::NoMatch(_) => Result::NoMatch(at), + r => r, + } + } +} + +/// `State` is a DFA state. It contains an ordered set of NFA states (not +/// necessarily complete) and a smattering of flags. +/// +/// The flags are packed into the first byte of data. +/// +/// States don't carry their transitions. Instead, transitions are stored in +/// a single row-major table. +/// +/// Delta encoding is used to store the instruction pointers. +/// The first instruction pointer is stored directly starting +/// at data[1], and each following pointer is stored as an offset +/// to the previous one. If a delta is in the range -127..127, +/// it is packed into a single byte; Otherwise the byte 128 (-128 as an i8) +/// is coded as a flag, followed by 4 bytes encoding the delta. +#[derive(Clone, Eq, Hash, PartialEq)] +struct State { + data: Arc<[u8]>, +} + +/// `InstPtr` is a 32 bit pointer into a sequence of opcodes (i.e., it indexes +/// an NFA state). +/// +/// Throughout this library, this is usually set to `usize`, but we force a +/// `u32` here for the DFA to save on space. +type InstPtr = u32; + +/// Adds ip to data using delta encoding with respect to prev. +/// +/// After completion, `data` will contain `ip` and `prev` will be set to `ip`. +fn push_inst_ptr(data: &mut Vec<u8>, prev: &mut InstPtr, ip: InstPtr) { + let delta = (ip as i32) - (*prev as i32); + write_vari32(data, delta); + *prev = ip; +} + +struct InstPtrs<'a> { + base: usize, + data: &'a [u8], +} + +impl<'a> Iterator for InstPtrs<'a> { + type Item = usize; + + fn next(&mut self) -> Option<usize> { + if self.data.is_empty() { + return None; + } + let (delta, nread) = read_vari32(self.data); + let base = self.base as i32 + delta; + debug_assert!(base >= 0); + debug_assert!(nread > 0); + self.data = &self.data[nread..]; + self.base = base as usize; + Some(self.base) + } +} + +impl State { + fn flags(&self) -> StateFlags { + StateFlags(self.data[0]) + } + + fn inst_ptrs(&self) -> InstPtrs<'_> { + InstPtrs { base: 0, data: &self.data[1..] } + } +} + +/// `StatePtr` is a 32 bit pointer to the start of a row in the transition +/// table. +/// +/// It has many special values. There are two types of special values: +/// sentinels and flags. +/// +/// Sentinels corresponds to special states that carry some kind of +/// significance. There are three such states: unknown, dead and quit states. +/// +/// Unknown states are states that haven't been computed yet. They indicate +/// that a transition should be filled in that points to either an existing +/// cached state or a new state altogether. In general, an unknown state means +/// "follow the NFA's epsilon transitions." +/// +/// Dead states are states that can never lead to a match, no matter what +/// subsequent input is observed. This means that the DFA should quit +/// immediately and return the longest match it has found thus far. +/// +/// Quit states are states that imply the DFA is not capable of matching the +/// regex correctly. Currently, this is only used when a Unicode word boundary +/// exists in the regex *and* a non-ASCII byte is observed. +/// +/// The other type of state pointer is a state pointer with special flag bits. +/// There are two flags: a start flag and a match flag. The lower bits of both +/// kinds always contain a "valid" `StatePtr` (indicated by the `STATE_MAX` +/// mask). +/// +/// The start flag means that the state is a start state, and therefore may be +/// subject to special prefix scanning optimizations. +/// +/// The match flag means that the state is a match state, and therefore the +/// current position in the input (while searching) should be recorded. +/// +/// The above exists mostly in the service of making the inner loop fast. +/// In particular, the inner *inner* loop looks something like this: +/// +/// ```ignore +/// while state <= STATE_MAX and i < len(text): +/// state = state.next[i] +/// ``` +/// +/// This is nice because it lets us execute a lazy DFA as if it were an +/// entirely offline DFA (i.e., with very few instructions). The loop will +/// quit only when we need to examine a case that needs special attention. +type StatePtr = u32; + +/// An unknown state means that the state has not been computed yet, and that +/// the only way to progress is to compute it. +const STATE_UNKNOWN: StatePtr = 1 << 31; + +/// A dead state means that the state has been computed and it is known that +/// once it is entered, no future match can ever occur. +const STATE_DEAD: StatePtr = STATE_UNKNOWN + 1; + +/// A quit state means that the DFA came across some input that it doesn't +/// know how to process correctly. The DFA should quit and another matching +/// engine should be run in its place. +const STATE_QUIT: StatePtr = STATE_DEAD + 1; + +/// A start state is a state that the DFA can start in. +/// +/// Note that start states have their lower bits set to a state pointer. +const STATE_START: StatePtr = 1 << 30; + +/// A match state means that the regex has successfully matched. +/// +/// Note that match states have their lower bits set to a state pointer. +const STATE_MATCH: StatePtr = 1 << 29; + +/// The maximum state pointer. This is useful to mask out the "valid" state +/// pointer from a state with the "start" or "match" bits set. +/// +/// It doesn't make sense to use this with unknown, dead or quit state +/// pointers, since those pointers are sentinels and never have their lower +/// bits set to anything meaningful. +const STATE_MAX: StatePtr = STATE_MATCH - 1; + +/// Byte is a u8 in spirit, but a u16 in practice so that we can represent the +/// special EOF sentinel value. +#[derive(Copy, Clone, Debug)] +struct Byte(u16); + +/// A set of flags for zero-width assertions. +#[derive(Clone, Copy, Eq, Debug, Default, Hash, PartialEq)] +struct EmptyFlags { + start: bool, + end: bool, + start_line: bool, + end_line: bool, + word_boundary: bool, + not_word_boundary: bool, +} + +/// A set of flags describing various configurations of a DFA state. This is +/// represented by a `u8` so that it is compact. +#[derive(Clone, Copy, Eq, Default, Hash, PartialEq)] +struct StateFlags(u8); + +impl Cache { + /// Create new empty cache for the DFA engine. + pub fn new(prog: &Program) -> Self { + // We add 1 to account for the special EOF byte. + let num_byte_classes = (prog.byte_classes[255] as usize + 1) + 1; + let starts = vec![STATE_UNKNOWN; 256]; + let mut cache = Cache { + inner: CacheInner { + compiled: StateMap::new(num_byte_classes), + trans: Transitions::new(num_byte_classes), + start_states: starts, + stack: vec![], + flush_count: 0, + size: 0, + insts_scratch_space: vec![], + }, + qcur: SparseSet::new(prog.insts.len()), + qnext: SparseSet::new(prog.insts.len()), + }; + cache.inner.reset_size(); + cache + } +} + +impl CacheInner { + /// Resets the cache size to account for fixed costs, such as the program + /// and stack sizes. + fn reset_size(&mut self) { + self.size = (self.start_states.len() * mem::size_of::<StatePtr>()) + + (self.stack.len() * mem::size_of::<InstPtr>()); + } +} + +impl<'a> Fsm<'a> { + #[cfg_attr(feature = "perf-inline", inline(always))] + pub fn forward( + prog: &'a Program, + cache: &ProgramCache, + quit_after_match: bool, + text: &[u8], + at: usize, + ) -> Result<usize> { + let mut cache = cache.borrow_mut(); + let cache = &mut cache.dfa; + let mut dfa = Fsm { + prog, + start: 0, // filled in below + at, + quit_after_match, + last_match_si: STATE_UNKNOWN, + last_cache_flush: at, + cache: &mut cache.inner, + }; + let (empty_flags, state_flags) = dfa.start_flags(text, at); + dfa.start = + match dfa.start_state(&mut cache.qcur, empty_flags, state_flags) { + None => return Result::Quit, + Some(STATE_DEAD) => return Result::NoMatch(at), + Some(si) => si, + }; + debug_assert!(dfa.start != STATE_UNKNOWN); + dfa.exec_at(&mut cache.qcur, &mut cache.qnext, text) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub fn reverse( + prog: &'a Program, + cache: &ProgramCache, + quit_after_match: bool, + text: &[u8], + at: usize, + ) -> Result<usize> { + let mut cache = cache.borrow_mut(); + let cache = &mut cache.dfa_reverse; + let mut dfa = Fsm { + prog, + start: 0, // filled in below + at, + quit_after_match, + last_match_si: STATE_UNKNOWN, + last_cache_flush: at, + cache: &mut cache.inner, + }; + let (empty_flags, state_flags) = dfa.start_flags_reverse(text, at); + dfa.start = + match dfa.start_state(&mut cache.qcur, empty_flags, state_flags) { + None => return Result::Quit, + Some(STATE_DEAD) => return Result::NoMatch(at), + Some(si) => si, + }; + debug_assert!(dfa.start != STATE_UNKNOWN); + dfa.exec_at_reverse(&mut cache.qcur, &mut cache.qnext, text) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub fn forward_many( + prog: &'a Program, + cache: &ProgramCache, + matches: &mut [bool], + text: &[u8], + at: usize, + ) -> Result<usize> { + debug_assert!(matches.len() == prog.matches.len()); + let mut cache = cache.borrow_mut(); + let cache = &mut cache.dfa; + let mut dfa = Fsm { + prog, + start: 0, // filled in below + at, + quit_after_match: false, + last_match_si: STATE_UNKNOWN, + last_cache_flush: at, + cache: &mut cache.inner, + }; + let (empty_flags, state_flags) = dfa.start_flags(text, at); + dfa.start = + match dfa.start_state(&mut cache.qcur, empty_flags, state_flags) { + None => return Result::Quit, + Some(STATE_DEAD) => return Result::NoMatch(at), + Some(si) => si, + }; + debug_assert!(dfa.start != STATE_UNKNOWN); + let result = dfa.exec_at(&mut cache.qcur, &mut cache.qnext, text); + if result.is_match() { + if matches.len() == 1 { + matches[0] = true; + } else { + debug_assert!(dfa.last_match_si != STATE_UNKNOWN); + debug_assert!(dfa.last_match_si != STATE_DEAD); + for ip in dfa.state(dfa.last_match_si).inst_ptrs() { + if let Inst::Match(slot) = dfa.prog[ip] { + matches[slot] = true; + } + } + } + } + result + } + + /// Executes the DFA on a forward NFA. + /// + /// {qcur,qnext} are scratch ordered sets which may be non-empty. + #[cfg_attr(feature = "perf-inline", inline(always))] + fn exec_at( + &mut self, + qcur: &mut SparseSet, + qnext: &mut SparseSet, + text: &[u8], + ) -> Result<usize> { + // For the most part, the DFA is basically: + // + // last_match = null + // while current_byte != EOF: + // si = current_state.next[current_byte] + // if si is match + // last_match = si + // return last_match + // + // However, we need to deal with a few things: + // + // 1. This is an *online* DFA, so the current state's next list + // may not point to anywhere yet, so we must go out and compute + // them. (They are then cached into the current state's next list + // to avoid re-computation.) + // 2. If we come across a state that is known to be dead (i.e., never + // leads to a match), then we can quit early. + // 3. If the caller just wants to know if a match occurs, then we + // can quit as soon as we know we have a match. (Full leftmost + // first semantics require continuing on.) + // 4. If we're in the start state, then we can use a pre-computed set + // of prefix literals to skip quickly along the input. + // 5. After the input is exhausted, we run the DFA on one symbol + // that stands for EOF. This is useful for handling empty width + // assertions. + // 6. We can't actually do state.next[byte]. Instead, we have to do + // state.next[byte_classes[byte]], which permits us to keep the + // 'next' list very small. + // + // Since there's a bunch of extra stuff we need to consider, we do some + // pretty hairy tricks to get the inner loop to run as fast as + // possible. + debug_assert!(!self.prog.is_reverse); + + // The last match is the currently known ending match position. It is + // reported as an index to the most recent byte that resulted in a + // transition to a match state and is always stored in capture slot `1` + // when searching forwards. Its maximum value is `text.len()`. + let mut result = Result::NoMatch(self.at); + let (mut prev_si, mut next_si) = (self.start, self.start); + let mut at = self.at; + while at < text.len() { + // This is the real inner loop. We take advantage of special bits + // set in the state pointer to determine whether a state is in the + // "common" case or not. Specifically, the common case is a + // non-match non-start non-dead state that has already been + // computed. So long as we remain in the common case, this inner + // loop will chew through the input. + // + // We also unroll the loop 4 times to amortize the cost of checking + // whether we've consumed the entire input. We are also careful + // to make sure that `prev_si` always represents the previous state + // and `next_si` always represents the next state after the loop + // exits, even if it isn't always true inside the loop. + while next_si <= STATE_MAX && at < text.len() { + // Argument for safety is in the definition of next_si. + prev_si = unsafe { self.next_si(next_si, text, at) }; + at += 1; + if prev_si > STATE_MAX || at + 2 >= text.len() { + mem::swap(&mut prev_si, &mut next_si); + break; + } + next_si = unsafe { self.next_si(prev_si, text, at) }; + at += 1; + if next_si > STATE_MAX { + break; + } + prev_si = unsafe { self.next_si(next_si, text, at) }; + at += 1; + if prev_si > STATE_MAX { + mem::swap(&mut prev_si, &mut next_si); + break; + } + next_si = unsafe { self.next_si(prev_si, text, at) }; + at += 1; + } + if next_si & STATE_MATCH > 0 { + // A match state is outside of the common case because it needs + // special case analysis. In particular, we need to record the + // last position as having matched and possibly quit the DFA if + // we don't need to keep matching. + next_si &= !STATE_MATCH; + result = Result::Match(at - 1); + if self.quit_after_match { + return result; + } + self.last_match_si = next_si; + prev_si = next_si; + + // This permits short-circuiting when matching a regex set. + // In particular, if this DFA state contains only match states, + // then it's impossible to extend the set of matches since + // match states are final. Therefore, we can quit. + if self.prog.matches.len() > 1 { + let state = self.state(next_si); + let just_matches = + state.inst_ptrs().all(|ip| self.prog[ip].is_match()); + if just_matches { + return result; + } + } + + // Another inner loop! If the DFA stays in this particular + // match state, then we can rip through all of the input + // very quickly, and only recording the match location once + // we've left this particular state. + let cur = at; + while (next_si & !STATE_MATCH) == prev_si + && at + 2 < text.len() + { + // Argument for safety is in the definition of next_si. + next_si = unsafe { + self.next_si(next_si & !STATE_MATCH, text, at) + }; + at += 1; + } + if at > cur { + result = Result::Match(at - 2); + } + } else if next_si & STATE_START > 0 { + // A start state isn't in the common case because we may + // want to do quick prefix scanning. If the program doesn't + // have a detected prefix, then start states are actually + // considered common and this case is never reached. + debug_assert!(self.has_prefix()); + next_si &= !STATE_START; + prev_si = next_si; + at = match self.prefix_at(text, at) { + None => return Result::NoMatch(text.len()), + Some(i) => i, + }; + } else if next_si >= STATE_UNKNOWN { + if next_si == STATE_QUIT { + return Result::Quit; + } + // Finally, this corresponds to the case where the transition + // entered a state that can never lead to a match or a state + // that hasn't been computed yet. The latter being the "slow" + // path. + let byte = Byte::byte(text[at - 1]); + // We no longer care about the special bits in the state + // pointer. + prev_si &= STATE_MAX; + // Record where we are. This is used to track progress for + // determining whether we should quit if we've flushed the + // cache too much. + self.at = at; + next_si = match self.next_state(qcur, qnext, prev_si, byte) { + None => return Result::Quit, + Some(STATE_DEAD) => return result.set_non_match(at), + Some(si) => si, + }; + debug_assert!(next_si != STATE_UNKNOWN); + if next_si & STATE_MATCH > 0 { + next_si &= !STATE_MATCH; + result = Result::Match(at - 1); + if self.quit_after_match { + return result; + } + self.last_match_si = next_si; + } + prev_si = next_si; + } else { + prev_si = next_si; + } + } + + // Run the DFA once more on the special EOF sentinel value. + // We don't care about the special bits in the state pointer any more, + // so get rid of them. + prev_si &= STATE_MAX; + prev_si = match self.next_state(qcur, qnext, prev_si, Byte::eof()) { + None => return Result::Quit, + Some(STATE_DEAD) => return result.set_non_match(text.len()), + Some(si) => si & !STATE_START, + }; + debug_assert!(prev_si != STATE_UNKNOWN); + if prev_si & STATE_MATCH > 0 { + prev_si &= !STATE_MATCH; + self.last_match_si = prev_si; + result = Result::Match(text.len()); + } + result + } + + /// Executes the DFA on a reverse NFA. + #[cfg_attr(feature = "perf-inline", inline(always))] + fn exec_at_reverse( + &mut self, + qcur: &mut SparseSet, + qnext: &mut SparseSet, + text: &[u8], + ) -> Result<usize> { + // The comments in `exec_at` above mostly apply here too. The main + // difference is that we move backwards over the input and we look for + // the longest possible match instead of the leftmost-first match. + // + // N.B. The code duplication here is regrettable. Efforts to improve + // it without sacrificing performance are welcome. ---AG + debug_assert!(self.prog.is_reverse); + let mut result = Result::NoMatch(self.at); + let (mut prev_si, mut next_si) = (self.start, self.start); + let mut at = self.at; + while at > 0 { + while next_si <= STATE_MAX && at > 0 { + // Argument for safety is in the definition of next_si. + at -= 1; + prev_si = unsafe { self.next_si(next_si, text, at) }; + if prev_si > STATE_MAX || at <= 4 { + mem::swap(&mut prev_si, &mut next_si); + break; + } + at -= 1; + next_si = unsafe { self.next_si(prev_si, text, at) }; + if next_si > STATE_MAX { + break; + } + at -= 1; + prev_si = unsafe { self.next_si(next_si, text, at) }; + if prev_si > STATE_MAX { + mem::swap(&mut prev_si, &mut next_si); + break; + } + at -= 1; + next_si = unsafe { self.next_si(prev_si, text, at) }; + } + if next_si & STATE_MATCH > 0 { + next_si &= !STATE_MATCH; + result = Result::Match(at + 1); + if self.quit_after_match { + return result; + } + self.last_match_si = next_si; + prev_si = next_si; + let cur = at; + while (next_si & !STATE_MATCH) == prev_si && at >= 2 { + // Argument for safety is in the definition of next_si. + at -= 1; + next_si = unsafe { + self.next_si(next_si & !STATE_MATCH, text, at) + }; + } + if at < cur { + result = Result::Match(at + 2); + } + } else if next_si >= STATE_UNKNOWN { + if next_si == STATE_QUIT { + return Result::Quit; + } + let byte = Byte::byte(text[at]); + prev_si &= STATE_MAX; + self.at = at; + next_si = match self.next_state(qcur, qnext, prev_si, byte) { + None => return Result::Quit, + Some(STATE_DEAD) => return result.set_non_match(at), + Some(si) => si, + }; + debug_assert!(next_si != STATE_UNKNOWN); + if next_si & STATE_MATCH > 0 { + next_si &= !STATE_MATCH; + result = Result::Match(at + 1); + if self.quit_after_match { + return result; + } + self.last_match_si = next_si; + } + prev_si = next_si; + } else { + prev_si = next_si; + } + } + + // Run the DFA once more on the special EOF sentinel value. + prev_si = match self.next_state(qcur, qnext, prev_si, Byte::eof()) { + None => return Result::Quit, + Some(STATE_DEAD) => return result.set_non_match(0), + Some(si) => si, + }; + debug_assert!(prev_si != STATE_UNKNOWN); + if prev_si & STATE_MATCH > 0 { + prev_si &= !STATE_MATCH; + self.last_match_si = prev_si; + result = Result::Match(0); + } + result + } + + /// next_si transitions to the next state, where the transition input + /// corresponds to text[i]. + /// + /// This elides bounds checks, and is therefore not safe. + #[cfg_attr(feature = "perf-inline", inline(always))] + unsafe fn next_si(&self, si: StatePtr, text: &[u8], i: usize) -> StatePtr { + // What is the argument for safety here? + // We have three unchecked accesses that could possibly violate safety: + // + // 1. The given byte of input (`text[i]`). + // 2. The class of the byte of input (`classes[text[i]]`). + // 3. The transition for the class (`trans[si + cls]`). + // + // (1) is only safe when calling next_si is guarded by + // `i < text.len()`. + // + // (2) is the easiest case to guarantee since `text[i]` is always a + // `u8` and `self.prog.byte_classes` always has length `u8::MAX`. + // (See `ByteClassSet.byte_classes` in `compile.rs`.) + // + // (3) is only safe if (1)+(2) are safe. Namely, the transitions + // of every state are defined to have length equal to the number of + // byte classes in the program. Therefore, a valid class leads to a + // valid transition. (All possible transitions are valid lookups, even + // if it points to a state that hasn't been computed yet.) (3) also + // relies on `si` being correct, but StatePtrs should only ever be + // retrieved from the transition table, which ensures they are correct. + debug_assert!(i < text.len()); + let b = *text.get_unchecked(i); + debug_assert!((b as usize) < self.prog.byte_classes.len()); + let cls = *self.prog.byte_classes.get_unchecked(b as usize); + self.cache.trans.next_unchecked(si, cls as usize) + } + + /// Computes the next state given the current state and the current input + /// byte (which may be EOF). + /// + /// If STATE_DEAD is returned, then there is no valid state transition. + /// This implies that no permutation of future input can lead to a match + /// state. + /// + /// STATE_UNKNOWN can never be returned. + fn exec_byte( + &mut self, + qcur: &mut SparseSet, + qnext: &mut SparseSet, + mut si: StatePtr, + b: Byte, + ) -> Option<StatePtr> { + use crate::prog::Inst::*; + + // Initialize a queue with the current DFA state's NFA states. + qcur.clear(); + for ip in self.state(si).inst_ptrs() { + qcur.insert(ip); + } + + // Before inspecting the current byte, we may need to also inspect + // whether the position immediately preceding the current byte + // satisfies the empty assertions found in the current state. + // + // We only need to do this step if there are any empty assertions in + // the current state. + let is_word_last = self.state(si).flags().is_word(); + let is_word = b.is_ascii_word(); + if self.state(si).flags().has_empty() { + // Compute the flags immediately preceding the current byte. + // This means we only care about the "end" or "end line" flags. + // (The "start" flags are computed immediately following the + // current byte and are handled below.) + let mut flags = EmptyFlags::default(); + if b.is_eof() { + flags.end = true; + flags.end_line = true; + } else if b.as_byte().map_or(false, |b| b == b'\n') { + flags.end_line = true; + } + if is_word_last == is_word { + flags.not_word_boundary = true; + } else { + flags.word_boundary = true; + } + // Now follow epsilon transitions from every NFA state, but make + // sure we only follow transitions that satisfy our flags. + qnext.clear(); + for &ip in &*qcur { + self.follow_epsilons(usize_to_u32(ip), qnext, flags); + } + mem::swap(qcur, qnext); + } + + // Now we set flags for immediately after the current byte. Since start + // states are processed separately, and are the only states that can + // have the StartText flag set, we therefore only need to worry about + // the StartLine flag here. + // + // We do also keep track of whether this DFA state contains a NFA state + // that is a matching state. This is precisely how we delay the DFA + // matching by one byte in order to process the special EOF sentinel + // byte. Namely, if this DFA state containing a matching NFA state, + // then it is the *next* DFA state that is marked as a match. + let mut empty_flags = EmptyFlags::default(); + let mut state_flags = StateFlags::default(); + empty_flags.start_line = b.as_byte().map_or(false, |b| b == b'\n'); + if b.is_ascii_word() { + state_flags.set_word(); + } + // Now follow all epsilon transitions again, but only after consuming + // the current byte. + qnext.clear(); + for &ip in &*qcur { + match self.prog[ip as usize] { + // These states never happen in a byte-based program. + Char(_) | Ranges(_) => unreachable!(), + // These states are handled when following epsilon transitions. + Save(_) | Split(_) | EmptyLook(_) => {} + Match(_) => { + state_flags.set_match(); + if !self.continue_past_first_match() { + break; + } else if self.prog.matches.len() > 1 + && !qnext.contains(ip as usize) + { + // If we are continuing on to find other matches, + // then keep a record of the match states we've seen. + qnext.insert(ip); + } + } + Bytes(ref inst) => { + if b.as_byte().map_or(false, |b| inst.matches(b)) { + self.follow_epsilons( + inst.goto as InstPtr, + qnext, + empty_flags, + ); + } + } + } + } + + let cache = if b.is_eof() && self.prog.matches.len() > 1 { + // If we're processing the last byte of the input and we're + // matching a regex set, then make the next state contain the + // previous states transitions. We do this so that the main + // matching loop can extract all of the match instructions. + mem::swap(qcur, qnext); + // And don't cache this state because it's totally bunk. + false + } else { + true + }; + + // We've now built up the set of NFA states that ought to comprise the + // next DFA state, so try to find it in the cache, and if it doesn't + // exist, cache it. + // + // N.B. We pass `&mut si` here because the cache may clear itself if + // it has gotten too full. When that happens, the location of the + // current state may change. + let mut next = + match self.cached_state(qnext, state_flags, Some(&mut si)) { + None => return None, + Some(next) => next, + }; + if (self.start & !STATE_START) == next { + // Start states can never be match states since all matches are + // delayed by one byte. + debug_assert!(!self.state(next).flags().is_match()); + next = self.start_ptr(next); + } + if next <= STATE_MAX && self.state(next).flags().is_match() { + next |= STATE_MATCH; + } + debug_assert!(next != STATE_UNKNOWN); + // And now store our state in the current state's next list. + if cache { + let cls = self.byte_class(b); + self.cache.trans.set_next(si, cls, next); + } + Some(next) + } + + /// Follows the epsilon transitions starting at (and including) `ip`. The + /// resulting states are inserted into the ordered set `q`. + /// + /// Conditional epsilon transitions (i.e., empty width assertions) are only + /// followed if they are satisfied by the given flags, which should + /// represent the flags set at the current location in the input. + /// + /// If the current location corresponds to the empty string, then only the + /// end line and/or end text flags may be set. If the current location + /// corresponds to a real byte in the input, then only the start line + /// and/or start text flags may be set. + /// + /// As an exception to the above, when finding the initial state, any of + /// the above flags may be set: + /// + /// If matching starts at the beginning of the input, then start text and + /// start line should be set. If the input is empty, then end text and end + /// line should also be set. + /// + /// If matching starts after the beginning of the input, then only start + /// line should be set if the preceding byte is `\n`. End line should never + /// be set in this case. (Even if the following byte is a `\n`, it will + /// be handled in a subsequent DFA state.) + fn follow_epsilons( + &mut self, + ip: InstPtr, + q: &mut SparseSet, + flags: EmptyFlags, + ) { + use crate::prog::EmptyLook::*; + use crate::prog::Inst::*; + + // We need to traverse the NFA to follow epsilon transitions, so avoid + // recursion with an explicit stack. + self.cache.stack.push(ip); + while let Some(mut ip) = self.cache.stack.pop() { + // Try to munch through as many states as possible without + // pushes/pops to the stack. + loop { + // Don't visit states we've already added. + if q.contains(ip as usize) { + break; + } + q.insert(ip as usize); + match self.prog[ip as usize] { + Char(_) | Ranges(_) => unreachable!(), + Match(_) | Bytes(_) => { + break; + } + EmptyLook(ref inst) => { + // Only follow empty assertion states if our flags + // satisfy the assertion. + match inst.look { + StartLine if flags.start_line => { + ip = inst.goto as InstPtr; + } + EndLine if flags.end_line => { + ip = inst.goto as InstPtr; + } + StartText if flags.start => { + ip = inst.goto as InstPtr; + } + EndText if flags.end => { + ip = inst.goto as InstPtr; + } + WordBoundaryAscii if flags.word_boundary => { + ip = inst.goto as InstPtr; + } + NotWordBoundaryAscii + if flags.not_word_boundary => + { + ip = inst.goto as InstPtr; + } + WordBoundary if flags.word_boundary => { + ip = inst.goto as InstPtr; + } + NotWordBoundary if flags.not_word_boundary => { + ip = inst.goto as InstPtr; + } + StartLine | EndLine | StartText | EndText + | WordBoundaryAscii | NotWordBoundaryAscii + | WordBoundary | NotWordBoundary => { + break; + } + } + } + Save(ref inst) => { + ip = inst.goto as InstPtr; + } + Split(ref inst) => { + self.cache.stack.push(inst.goto2 as InstPtr); + ip = inst.goto1 as InstPtr; + } + } + } + } + } + + /// Find a previously computed state matching the given set of instructions + /// and is_match bool. + /// + /// The given set of instructions should represent a single state in the + /// NFA along with all states reachable without consuming any input. + /// + /// The is_match bool should be true if and only if the preceding DFA state + /// contains an NFA matching state. The cached state produced here will + /// then signify a match. (This enables us to delay a match by one byte, + /// in order to account for the EOF sentinel byte.) + /// + /// If the cache is full, then it is wiped before caching a new state. + /// + /// The current state should be specified if it exists, since it will need + /// to be preserved if the cache clears itself. (Start states are + /// always saved, so they should not be passed here.) It takes a mutable + /// pointer to the index because if the cache is cleared, the state's + /// location may change. + fn cached_state( + &mut self, + q: &SparseSet, + mut state_flags: StateFlags, + current_state: Option<&mut StatePtr>, + ) -> Option<StatePtr> { + // If we couldn't come up with a non-empty key to represent this state, + // then it is dead and can never lead to a match. + // + // Note that inst_flags represent the set of empty width assertions + // in q. We use this as an optimization in exec_byte to determine when + // we should follow epsilon transitions at the empty string preceding + // the current byte. + let key = match self.cached_state_key(q, &mut state_flags) { + None => return Some(STATE_DEAD), + Some(v) => v, + }; + // In the cache? Cool. Done. + if let Some(si) = self.cache.compiled.get_ptr(&key) { + return Some(si); + } + // If the cache has gotten too big, wipe it. + if self.approximate_size() > self.prog.dfa_size_limit + && !self.clear_cache_and_save(current_state) + { + // Ooops. DFA is giving up. + return None; + } + // Allocate room for our state and add it. + self.add_state(key) + } + + /// Produces a key suitable for describing a state in the DFA cache. + /// + /// The key invariant here is that equivalent keys are produced for any two + /// sets of ordered NFA states (and toggling of whether the previous NFA + /// states contain a match state) that do not discriminate a match for any + /// input. + /// + /// Specifically, q should be an ordered set of NFA states and is_match + /// should be true if and only if the previous NFA states contained a match + /// state. + fn cached_state_key( + &mut self, + q: &SparseSet, + state_flags: &mut StateFlags, + ) -> Option<State> { + use crate::prog::Inst::*; + + // We need to build up enough information to recognize pre-built states + // in the DFA. Generally speaking, this includes every instruction + // except for those which are purely epsilon transitions, e.g., the + // Save and Split instructions. + // + // Empty width assertions are also epsilon transitions, but since they + // are conditional, we need to make them part of a state's key in the + // cache. + + let mut insts = + mem::replace(&mut self.cache.insts_scratch_space, vec![]); + insts.clear(); + // Reserve 1 byte for flags. + insts.push(0); + + let mut prev = 0; + for &ip in q { + let ip = usize_to_u32(ip); + match self.prog[ip as usize] { + Char(_) | Ranges(_) => unreachable!(), + Save(_) | Split(_) => {} + Bytes(_) => push_inst_ptr(&mut insts, &mut prev, ip), + EmptyLook(_) => { + state_flags.set_empty(); + push_inst_ptr(&mut insts, &mut prev, ip) + } + Match(_) => { + push_inst_ptr(&mut insts, &mut prev, ip); + if !self.continue_past_first_match() { + break; + } + } + } + } + // If we couldn't transition to any other instructions and we didn't + // see a match when expanding NFA states previously, then this is a + // dead state and no amount of additional input can transition out + // of this state. + let opt_state = if insts.len() == 1 && !state_flags.is_match() { + None + } else { + let StateFlags(f) = *state_flags; + insts[0] = f; + Some(State { data: Arc::from(&*insts) }) + }; + self.cache.insts_scratch_space = insts; + opt_state + } + + /// Clears the cache, but saves and restores current_state if it is not + /// none. + /// + /// The current state must be provided here in case its location in the + /// cache changes. + /// + /// This returns false if the cache is not cleared and the DFA should + /// give up. + fn clear_cache_and_save( + &mut self, + current_state: Option<&mut StatePtr>, + ) -> bool { + if self.cache.compiled.is_empty() { + // Nothing to clear... + return true; + } + match current_state { + None => self.clear_cache(), + Some(si) => { + let cur = self.state(*si).clone(); + if !self.clear_cache() { + return false; + } + // The unwrap is OK because we just cleared the cache and + // therefore know that the next state pointer won't exceed + // STATE_MAX. + *si = self.restore_state(cur).unwrap(); + true + } + } + } + + /// Wipes the state cache, but saves and restores the current start state. + /// + /// This returns false if the cache is not cleared and the DFA should + /// give up. + fn clear_cache(&mut self) -> bool { + // Bail out of the DFA if we're moving too "slowly." + // A heuristic from RE2: assume the DFA is too slow if it is processing + // 10 or fewer bytes per state. + // Additionally, we permit the cache to be flushed a few times before + // caling it quits. + let nstates = self.cache.compiled.len(); + if self.cache.flush_count >= 3 + && self.at >= self.last_cache_flush + && (self.at - self.last_cache_flush) <= 10 * nstates + { + return false; + } + // Update statistics tracking cache flushes. + self.last_cache_flush = self.at; + self.cache.flush_count += 1; + + // OK, actually flush the cache. + let start = self.state(self.start & !STATE_START).clone(); + let last_match = if self.last_match_si <= STATE_MAX { + Some(self.state(self.last_match_si).clone()) + } else { + None + }; + self.cache.reset_size(); + self.cache.trans.clear(); + self.cache.compiled.clear(); + for s in &mut self.cache.start_states { + *s = STATE_UNKNOWN; + } + // The unwraps are OK because we just cleared the cache and therefore + // know that the next state pointer won't exceed STATE_MAX. + let start_ptr = self.restore_state(start).unwrap(); + self.start = self.start_ptr(start_ptr); + if let Some(last_match) = last_match { + self.last_match_si = self.restore_state(last_match).unwrap(); + } + true + } + + /// Restores the given state back into the cache, and returns a pointer + /// to it. + fn restore_state(&mut self, state: State) -> Option<StatePtr> { + // If we've already stored this state, just return a pointer to it. + // None will be the wiser. + if let Some(si) = self.cache.compiled.get_ptr(&state) { + return Some(si); + } + self.add_state(state) + } + + /// Returns the next state given the current state si and current byte + /// b. {qcur,qnext} are used as scratch space for storing ordered NFA + /// states. + /// + /// This tries to fetch the next state from the cache, but if that fails, + /// it computes the next state, caches it and returns a pointer to it. + /// + /// The pointer can be to a real state, or it can be STATE_DEAD. + /// STATE_UNKNOWN cannot be returned. + /// + /// None is returned if a new state could not be allocated (i.e., the DFA + /// ran out of space and thinks it's running too slowly). + fn next_state( + &mut self, + qcur: &mut SparseSet, + qnext: &mut SparseSet, + si: StatePtr, + b: Byte, + ) -> Option<StatePtr> { + if si == STATE_DEAD { + return Some(STATE_DEAD); + } + match self.cache.trans.next(si, self.byte_class(b)) { + STATE_UNKNOWN => self.exec_byte(qcur, qnext, si, b), + STATE_QUIT => None, + nsi => Some(nsi), + } + } + + /// Computes and returns the start state, where searching begins at + /// position `at` in `text`. If the state has already been computed, + /// then it is pulled from the cache. If the state hasn't been cached, + /// then it is computed, cached and a pointer to it is returned. + /// + /// This may return STATE_DEAD but never STATE_UNKNOWN. + #[cfg_attr(feature = "perf-inline", inline(always))] + fn start_state( + &mut self, + q: &mut SparseSet, + empty_flags: EmptyFlags, + state_flags: StateFlags, + ) -> Option<StatePtr> { + // Compute an index into our cache of start states based on the set + // of empty/state flags set at the current position in the input. We + // don't use every flag since not all flags matter. For example, since + // matches are delayed by one byte, start states can never be match + // states. + let flagi = { + (((empty_flags.start as u8) << 0) + | ((empty_flags.end as u8) << 1) + | ((empty_flags.start_line as u8) << 2) + | ((empty_flags.end_line as u8) << 3) + | ((empty_flags.word_boundary as u8) << 4) + | ((empty_flags.not_word_boundary as u8) << 5) + | ((state_flags.is_word() as u8) << 6)) as usize + }; + match self.cache.start_states[flagi] { + STATE_UNKNOWN => {} + si => return Some(si), + } + q.clear(); + let start = usize_to_u32(self.prog.start); + self.follow_epsilons(start, q, empty_flags); + // Start states can never be match states because we delay every match + // by one byte. Given an empty string and an empty match, the match + // won't actually occur until the DFA processes the special EOF + // sentinel byte. + let sp = match self.cached_state(q, state_flags, None) { + None => return None, + Some(sp) => self.start_ptr(sp), + }; + self.cache.start_states[flagi] = sp; + Some(sp) + } + + /// Computes the set of starting flags for the given position in text. + /// + /// This should only be used when executing the DFA forwards over the + /// input. + fn start_flags(&self, text: &[u8], at: usize) -> (EmptyFlags, StateFlags) { + let mut empty_flags = EmptyFlags::default(); + let mut state_flags = StateFlags::default(); + empty_flags.start = at == 0; + empty_flags.end = text.is_empty(); + empty_flags.start_line = at == 0 || text[at - 1] == b'\n'; + empty_flags.end_line = text.is_empty(); + + let is_word_last = at > 0 && Byte::byte(text[at - 1]).is_ascii_word(); + let is_word = at < text.len() && Byte::byte(text[at]).is_ascii_word(); + if is_word_last { + state_flags.set_word(); + } + if is_word == is_word_last { + empty_flags.not_word_boundary = true; + } else { + empty_flags.word_boundary = true; + } + (empty_flags, state_flags) + } + + /// Computes the set of starting flags for the given position in text. + /// + /// This should only be used when executing the DFA in reverse over the + /// input. + fn start_flags_reverse( + &self, + text: &[u8], + at: usize, + ) -> (EmptyFlags, StateFlags) { + let mut empty_flags = EmptyFlags::default(); + let mut state_flags = StateFlags::default(); + empty_flags.start = at == text.len(); + empty_flags.end = text.is_empty(); + empty_flags.start_line = at == text.len() || text[at] == b'\n'; + empty_flags.end_line = text.is_empty(); + + let is_word_last = + at < text.len() && Byte::byte(text[at]).is_ascii_word(); + let is_word = at > 0 && Byte::byte(text[at - 1]).is_ascii_word(); + if is_word_last { + state_flags.set_word(); + } + if is_word == is_word_last { + empty_flags.not_word_boundary = true; + } else { + empty_flags.word_boundary = true; + } + (empty_flags, state_flags) + } + + /// Returns a reference to a State given a pointer to it. + fn state(&self, si: StatePtr) -> &State { + self.cache.compiled.get_state(si).unwrap() + } + + /// Adds the given state to the DFA. + /// + /// This allocates room for transitions out of this state in + /// self.cache.trans. The transitions can be set with the returned + /// StatePtr. + /// + /// If None is returned, then the state limit was reached and the DFA + /// should quit. + fn add_state(&mut self, state: State) -> Option<StatePtr> { + // This will fail if the next state pointer exceeds STATE_PTR. In + // practice, the cache limit will prevent us from ever getting here, + // but maybe callers will set the cache size to something ridiculous... + let si = match self.cache.trans.add() { + None => return None, + Some(si) => si, + }; + // If the program has a Unicode word boundary, then set any transitions + // for non-ASCII bytes to STATE_QUIT. If the DFA stumbles over such a + // transition, then it will quit and an alternative matching engine + // will take over. + if self.prog.has_unicode_word_boundary { + for b in 128..256 { + let cls = self.byte_class(Byte::byte(b as u8)); + self.cache.trans.set_next(si, cls, STATE_QUIT); + } + } + // Finally, put our actual state on to our heap of states and index it + // so we can find it later. + self.cache.size += self.cache.trans.state_heap_size() + + state.data.len() + + (2 * mem::size_of::<State>()) + + mem::size_of::<StatePtr>(); + self.cache.compiled.insert(state, si); + // Transition table and set of states and map should all be in sync. + debug_assert!( + self.cache.compiled.len() == self.cache.trans.num_states() + ); + Some(si) + } + + /// Quickly finds the next occurrence of any literal prefixes in the regex. + /// If there are no literal prefixes, then the current position is + /// returned. If there are literal prefixes and one could not be found, + /// then None is returned. + /// + /// This should only be called when the DFA is in a start state. + fn prefix_at(&self, text: &[u8], at: usize) -> Option<usize> { + self.prog.prefixes.find(&text[at..]).map(|(s, _)| at + s) + } + + /// Returns the number of byte classes required to discriminate transitions + /// in each state. + /// + /// invariant: num_byte_classes() == len(State.next) + fn num_byte_classes(&self) -> usize { + // We add 1 to account for the special EOF byte. + (self.prog.byte_classes[255] as usize + 1) + 1 + } + + /// Given an input byte or the special EOF sentinel, return its + /// corresponding byte class. + #[cfg_attr(feature = "perf-inline", inline(always))] + fn byte_class(&self, b: Byte) -> usize { + match b.as_byte() { + None => self.num_byte_classes() - 1, + Some(b) => self.u8_class(b), + } + } + + /// Like byte_class, but explicitly for u8s. + #[cfg_attr(feature = "perf-inline", inline(always))] + fn u8_class(&self, b: u8) -> usize { + self.prog.byte_classes[b as usize] as usize + } + + /// Returns true if the DFA should continue searching past the first match. + /// + /// Leftmost first semantics in the DFA are preserved by not following NFA + /// transitions after the first match is seen. + /// + /// On occasion, we want to avoid leftmost first semantics to find either + /// the longest match (for reverse search) or all possible matches (for + /// regex sets). + fn continue_past_first_match(&self) -> bool { + self.prog.is_reverse || self.prog.matches.len() > 1 + } + + /// Returns true if there is a prefix we can quickly search for. + fn has_prefix(&self) -> bool { + !self.prog.is_reverse + && !self.prog.prefixes.is_empty() + && !self.prog.is_anchored_start + } + + /// Sets the STATE_START bit in the given state pointer if and only if + /// we have a prefix to scan for. + /// + /// If there's no prefix, then it's a waste to treat the start state + /// specially. + fn start_ptr(&self, si: StatePtr) -> StatePtr { + if self.has_prefix() { + si | STATE_START + } else { + si + } + } + + /// Approximate size returns the approximate heap space currently used by + /// the DFA. It is used to determine whether the DFA's state cache needs to + /// be wiped. Namely, it is possible that for certain regexes on certain + /// inputs, a new state could be created for every byte of input. (This is + /// bad for memory use, so we bound it with a cache.) + fn approximate_size(&self) -> usize { + self.cache.size + self.prog.approximate_size() + } +} + +/// An abstraction for representing a map of states. The map supports two +/// different ways of state lookup. One is fast constant time access via a +/// state pointer. The other is a hashmap lookup based on the DFA's +/// constituent NFA states. +/// +/// A DFA state internally uses an Arc such that we only need to store the +/// set of NFA states on the heap once, even though we support looking up +/// states by two different means. A more natural way to express this might +/// use raw pointers, but an Arc is safe and effectively achieves the same +/// thing. +#[derive(Debug)] +struct StateMap { + /// The keys are not actually static but rely on always pointing to a + /// buffer in `states` which will never be moved except when clearing + /// the map or on drop, in which case the keys of this map will be + /// removed before + map: HashMap<State, StatePtr>, + /// Our set of states. Note that `StatePtr / num_byte_classes` indexes + /// this Vec rather than just a `StatePtr`. + states: Vec<State>, + /// The number of byte classes in the DFA. Used to index `states`. + num_byte_classes: usize, +} + +impl StateMap { + fn new(num_byte_classes: usize) -> StateMap { + StateMap { map: HashMap::new(), states: vec![], num_byte_classes } + } + + fn len(&self) -> usize { + self.states.len() + } + + fn is_empty(&self) -> bool { + self.states.is_empty() + } + + fn get_ptr(&self, state: &State) -> Option<StatePtr> { + self.map.get(state).cloned() + } + + fn get_state(&self, si: StatePtr) -> Option<&State> { + self.states.get(si as usize / self.num_byte_classes) + } + + fn insert(&mut self, state: State, si: StatePtr) { + self.map.insert(state.clone(), si); + self.states.push(state); + } + + fn clear(&mut self) { + self.map.clear(); + self.states.clear(); + } +} + +impl Transitions { + /// Create a new transition table. + /// + /// The number of byte classes corresponds to the stride. Every state will + /// have `num_byte_classes` slots for transitions. + fn new(num_byte_classes: usize) -> Transitions { + Transitions { table: vec![], num_byte_classes } + } + + /// Returns the total number of states currently in this table. + fn num_states(&self) -> usize { + self.table.len() / self.num_byte_classes + } + + /// Allocates room for one additional state and returns a pointer to it. + /// + /// If there's no more room, None is returned. + fn add(&mut self) -> Option<StatePtr> { + let si = self.table.len(); + if si > STATE_MAX as usize { + return None; + } + self.table.extend(repeat(STATE_UNKNOWN).take(self.num_byte_classes)); + Some(usize_to_u32(si)) + } + + /// Clears the table of all states. + fn clear(&mut self) { + self.table.clear(); + } + + /// Sets the transition from (si, cls) to next. + fn set_next(&mut self, si: StatePtr, cls: usize, next: StatePtr) { + self.table[si as usize + cls] = next; + } + + /// Returns the transition corresponding to (si, cls). + fn next(&self, si: StatePtr, cls: usize) -> StatePtr { + self.table[si as usize + cls] + } + + /// The heap size, in bytes, of a single state in the transition table. + fn state_heap_size(&self) -> usize { + self.num_byte_classes * mem::size_of::<StatePtr>() + } + + /// Like `next`, but uses unchecked access and is therefore not safe. + unsafe fn next_unchecked(&self, si: StatePtr, cls: usize) -> StatePtr { + debug_assert!((si as usize) < self.table.len()); + debug_assert!(cls < self.num_byte_classes); + *self.table.get_unchecked(si as usize + cls) + } +} + +impl StateFlags { + fn is_match(&self) -> bool { + self.0 & 0b0000_0001 > 0 + } + + fn set_match(&mut self) { + self.0 |= 0b0000_0001; + } + + fn is_word(&self) -> bool { + self.0 & 0b0000_0010 > 0 + } + + fn set_word(&mut self) { + self.0 |= 0b0000_0010; + } + + fn has_empty(&self) -> bool { + self.0 & 0b0000_0100 > 0 + } + + fn set_empty(&mut self) { + self.0 |= 0b0000_0100; + } +} + +impl Byte { + fn byte(b: u8) -> Self { + Byte(b as u16) + } + fn eof() -> Self { + Byte(256) + } + fn is_eof(&self) -> bool { + self.0 == 256 + } + + fn is_ascii_word(&self) -> bool { + let b = match self.as_byte() { + None => return false, + Some(b) => b, + }; + match b { + b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'_' => true, + _ => false, + } + } + + fn as_byte(&self) -> Option<u8> { + if self.is_eof() { + None + } else { + Some(self.0 as u8) + } + } +} + +impl fmt::Debug for State { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let ips: Vec<usize> = self.inst_ptrs().collect(); + f.debug_struct("State") + .field("flags", &self.flags()) + .field("insts", &ips) + .finish() + } +} + +impl fmt::Debug for Transitions { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let mut fmtd = f.debug_map(); + for si in 0..self.num_states() { + let s = si * self.num_byte_classes; + let e = s + self.num_byte_classes; + fmtd.entry(&si.to_string(), &TransitionsRow(&self.table[s..e])); + } + fmtd.finish() + } +} + +struct TransitionsRow<'a>(&'a [StatePtr]); + +impl<'a> fmt::Debug for TransitionsRow<'a> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let mut fmtd = f.debug_map(); + for (b, si) in self.0.iter().enumerate() { + match *si { + STATE_UNKNOWN => {} + STATE_DEAD => { + fmtd.entry(&vb(b as usize), &"DEAD"); + } + si => { + fmtd.entry(&vb(b as usize), &si.to_string()); + } + } + } + fmtd.finish() + } +} + +impl fmt::Debug for StateFlags { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("StateFlags") + .field("is_match", &self.is_match()) + .field("is_word", &self.is_word()) + .field("has_empty", &self.has_empty()) + .finish() + } +} + +/// Helper function for formatting a byte as a nice-to-read escaped string. +fn vb(b: usize) -> String { + use std::ascii::escape_default; + + if b > ::std::u8::MAX as usize { + "EOF".to_owned() + } else { + let escaped = escape_default(b as u8).collect::<Vec<u8>>(); + String::from_utf8_lossy(&escaped).into_owned() + } +} + +fn usize_to_u32(n: usize) -> u32 { + if (n as u64) > (::std::u32::MAX as u64) { + panic!("BUG: {} is too big to fit into u32", n) + } + n as u32 +} + +#[allow(dead_code)] // useful for debugging +fn show_state_ptr(si: StatePtr) -> String { + let mut s = format!("{:?}", si & STATE_MAX); + if si == STATE_UNKNOWN { + s = format!("{} (unknown)", s); + } + if si == STATE_DEAD { + s = format!("{} (dead)", s); + } + if si == STATE_QUIT { + s = format!("{} (quit)", s); + } + if si & STATE_START > 0 { + s = format!("{} (start)", s); + } + if si & STATE_MATCH > 0 { + s = format!("{} (match)", s); + } + s +} + +/// https://developers.google.com/protocol-buffers/docs/encoding#varints +fn write_vari32(data: &mut Vec<u8>, n: i32) { + let mut un = (n as u32) << 1; + if n < 0 { + un = !un; + } + write_varu32(data, un) +} + +/// https://developers.google.com/protocol-buffers/docs/encoding#varints +fn read_vari32(data: &[u8]) -> (i32, usize) { + let (un, i) = read_varu32(data); + let mut n = (un >> 1) as i32; + if un & 1 != 0 { + n = !n; + } + (n, i) +} + +/// https://developers.google.com/protocol-buffers/docs/encoding#varints +fn write_varu32(data: &mut Vec<u8>, mut n: u32) { + while n >= 0b1000_0000 { + data.push((n as u8) | 0b1000_0000); + n >>= 7; + } + data.push(n as u8); +} + +/// https://developers.google.com/protocol-buffers/docs/encoding#varints +fn read_varu32(data: &[u8]) -> (u32, usize) { + let mut n: u32 = 0; + let mut shift: u32 = 0; + for (i, &b) in data.iter().enumerate() { + if b < 0b1000_0000 { + return (n | ((b as u32) << shift), i + 1); + } + n |= ((b as u32) & 0b0111_1111) << shift; + shift += 7; + } + (0, 0) +} + +#[cfg(test)] +mod tests { + + use super::{ + push_inst_ptr, read_vari32, read_varu32, write_vari32, write_varu32, + State, StateFlags, + }; + use quickcheck::{quickcheck, Gen, QuickCheck}; + use std::sync::Arc; + + #[test] + fn prop_state_encode_decode() { + fn p(mut ips: Vec<u32>, flags: u8) -> bool { + // It looks like our encoding scheme can't handle instruction + // pointers at or above 2**31. We should fix that, but it seems + // unlikely to occur in real code due to the amount of memory + // required for such a state machine. So for now, we just clamp + // our test data. + for ip in &mut ips { + if *ip >= 1 << 31 { + *ip = (1 << 31) - 1; + } + } + let mut data = vec![flags]; + let mut prev = 0; + for &ip in ips.iter() { + push_inst_ptr(&mut data, &mut prev, ip); + } + let state = State { data: Arc::from(&data[..]) }; + + let expected: Vec<usize> = + ips.into_iter().map(|ip| ip as usize).collect(); + let got: Vec<usize> = state.inst_ptrs().collect(); + expected == got && state.flags() == StateFlags(flags) + } + QuickCheck::new() + .gen(Gen::new(10_000)) + .quickcheck(p as fn(Vec<u32>, u8) -> bool); + } + + #[test] + fn prop_read_write_u32() { + fn p(n: u32) -> bool { + let mut buf = vec![]; + write_varu32(&mut buf, n); + let (got, nread) = read_varu32(&buf); + nread == buf.len() && got == n + } + quickcheck(p as fn(u32) -> bool); + } + + #[test] + fn prop_read_write_i32() { + fn p(n: i32) -> bool { + let mut buf = vec![]; + write_vari32(&mut buf, n); + let (got, nread) = read_vari32(&buf); + nread == buf.len() && got == n + } + quickcheck(p as fn(i32) -> bool); + } +} diff --git a/third_party/rust/regex/src/error.rs b/third_party/rust/regex/src/error.rs new file mode 100644 index 0000000000..3e0ec75210 --- /dev/null +++ b/third_party/rust/regex/src/error.rs @@ -0,0 +1,71 @@ +use std::fmt; +use std::iter::repeat; + +/// An error that occurred during parsing or compiling a regular expression. +#[derive(Clone, PartialEq)] +pub enum Error { + /// A syntax error. + Syntax(String), + /// The compiled program exceeded the set size limit. + /// The argument is the size limit imposed. + CompiledTooBig(usize), + /// Hints that destructuring should not be exhaustive. + /// + /// This enum may grow additional variants, so this makes sure clients + /// don't count on exhaustive matching. (Otherwise, adding a new variant + /// could break existing code.) + #[doc(hidden)] + __Nonexhaustive, +} + +impl ::std::error::Error for Error { + // TODO: Remove this method entirely on the next breaking semver release. + #[allow(deprecated)] + fn description(&self) -> &str { + match *self { + Error::Syntax(ref err) => err, + Error::CompiledTooBig(_) => "compiled program too big", + Error::__Nonexhaustive => unreachable!(), + } + } +} + +impl fmt::Display for Error { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match *self { + Error::Syntax(ref err) => err.fmt(f), + Error::CompiledTooBig(limit) => write!( + f, + "Compiled regex exceeds size limit of {} bytes.", + limit + ), + Error::__Nonexhaustive => unreachable!(), + } + } +} + +// We implement our own Debug implementation so that we show nicer syntax +// errors when people use `Regex::new(...).unwrap()`. It's a little weird, +// but the `Syntax` variant is already storing a `String` anyway, so we might +// as well format it nicely. +impl fmt::Debug for Error { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match *self { + Error::Syntax(ref err) => { + let hr: String = repeat('~').take(79).collect(); + writeln!(f, "Syntax(")?; + writeln!(f, "{}", hr)?; + writeln!(f, "{}", err)?; + writeln!(f, "{}", hr)?; + write!(f, ")")?; + Ok(()) + } + Error::CompiledTooBig(limit) => { + f.debug_tuple("CompiledTooBig").field(&limit).finish() + } + Error::__Nonexhaustive => { + f.debug_tuple("__Nonexhaustive").finish() + } + } + } +} diff --git a/third_party/rust/regex/src/exec.rs b/third_party/rust/regex/src/exec.rs new file mode 100644 index 0000000000..e75ca083a0 --- /dev/null +++ b/third_party/rust/regex/src/exec.rs @@ -0,0 +1,1655 @@ +use std::cell::RefCell; +use std::collections::HashMap; +use std::panic::AssertUnwindSafe; +use std::sync::Arc; + +#[cfg(feature = "perf-literal")] +use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind}; +use regex_syntax::hir::literal::Literals; +use regex_syntax::hir::Hir; +use regex_syntax::ParserBuilder; + +use crate::backtrack; +use crate::compile::Compiler; +#[cfg(feature = "perf-dfa")] +use crate::dfa; +use crate::error::Error; +use crate::input::{ByteInput, CharInput}; +use crate::literal::LiteralSearcher; +use crate::pikevm; +use crate::pool::{Pool, PoolGuard}; +use crate::prog::Program; +use crate::re_builder::RegexOptions; +use crate::re_bytes; +use crate::re_set; +use crate::re_trait::{Locations, RegularExpression, Slot}; +use crate::re_unicode; +use crate::utf8::next_utf8; + +/// `Exec` manages the execution of a regular expression. +/// +/// In particular, this manages the various compiled forms of a single regular +/// expression and the choice of which matching engine to use to execute a +/// regular expression. +#[derive(Debug)] +pub struct Exec { + /// All read only state. + ro: Arc<ExecReadOnly>, + /// A pool of reusable values for the various matching engines. + /// + /// Note that boxing this value is not strictly necessary, but it is an + /// easy way to ensure that T does not bloat the stack sized used by a pool + /// in the case where T is big. And this turns out to be the case at the + /// time of writing for regex's use of this pool. At the time of writing, + /// the size of a Regex on the stack is 856 bytes. Boxing this value + /// reduces that size to 16 bytes. + pool: Box<Pool<ProgramCache>>, +} + +/// `ExecNoSync` is like `Exec`, except it embeds a reference to a cache. This +/// means it is no longer Sync, but we can now avoid the overhead of +/// synchronization to fetch the cache. +#[derive(Debug)] +pub struct ExecNoSync<'c> { + /// All read only state. + ro: &'c Arc<ExecReadOnly>, + /// Caches for the various matching engines. + cache: PoolGuard<'c, ProgramCache>, +} + +/// `ExecNoSyncStr` is like `ExecNoSync`, but matches on &str instead of &[u8]. +#[derive(Debug)] +pub struct ExecNoSyncStr<'c>(ExecNoSync<'c>); + +/// `ExecReadOnly` comprises all read only state for a regex. Namely, all such +/// state is determined at compile time and never changes during search. +#[derive(Debug)] +struct ExecReadOnly { + /// The original regular expressions given by the caller to compile. + res: Vec<String>, + /// A compiled program that is used in the NFA simulation and backtracking. + /// It can be byte-based or Unicode codepoint based. + /// + /// N.B. It is not possibly to make this byte-based from the public API. + /// It is only used for testing byte based programs in the NFA simulations. + nfa: Program, + /// A compiled byte based program for DFA execution. This is only used + /// if a DFA can be executed. (Currently, only word boundary assertions are + /// not supported.) Note that this program contains an embedded `.*?` + /// preceding the first capture group, unless the regex is anchored at the + /// beginning. + dfa: Program, + /// The same as above, except the program is reversed (and there is no + /// preceding `.*?`). This is used by the DFA to find the starting location + /// of matches. + dfa_reverse: Program, + /// A set of suffix literals extracted from the regex. + /// + /// Prefix literals are stored on the `Program`, since they are used inside + /// the matching engines. + suffixes: LiteralSearcher, + /// An Aho-Corasick automaton with leftmost-first match semantics. + /// + /// This is only set when the entire regex is a simple unanchored + /// alternation of literals. We could probably use it more circumstances, + /// but this is already hacky enough in this architecture. + /// + /// N.B. We use u32 as a state ID representation under the assumption that + /// if we were to exhaust the ID space, we probably would have long + /// surpassed the compilation size limit. + #[cfg(feature = "perf-literal")] + ac: Option<AhoCorasick<u32>>, + /// match_type encodes as much upfront knowledge about how we're going to + /// execute a search as possible. + match_type: MatchType, +} + +/// Facilitates the construction of an executor by exposing various knobs +/// to control how a regex is executed and what kinds of resources it's +/// permitted to use. +// `ExecBuilder` is only public via the `internal` module, so avoid deriving +// `Debug`. +#[allow(missing_debug_implementations)] +pub struct ExecBuilder { + options: RegexOptions, + match_type: Option<MatchType>, + bytes: bool, + only_utf8: bool, +} + +/// Parsed represents a set of parsed regular expressions and their detected +/// literals. +struct Parsed { + exprs: Vec<Hir>, + prefixes: Literals, + suffixes: Literals, + bytes: bool, +} + +impl ExecBuilder { + /// Create a regex execution builder. + /// + /// This uses default settings for everything except the regex itself, + /// which must be provided. Further knobs can be set by calling methods, + /// and then finally, `build` to actually create the executor. + pub fn new(re: &str) -> Self { + Self::new_many(&[re]) + } + + /// Like new, but compiles the union of the given regular expressions. + /// + /// Note that when compiling 2 or more regular expressions, capture groups + /// are completely unsupported. (This means both `find` and `captures` + /// won't work.) + pub fn new_many<I, S>(res: I) -> Self + where + S: AsRef<str>, + I: IntoIterator<Item = S>, + { + let mut opts = RegexOptions::default(); + opts.pats = res.into_iter().map(|s| s.as_ref().to_owned()).collect(); + Self::new_options(opts) + } + + /// Create a regex execution builder. + pub fn new_options(opts: RegexOptions) -> Self { + ExecBuilder { + options: opts, + match_type: None, + bytes: false, + only_utf8: true, + } + } + + /// Set the matching engine to be automatically determined. + /// + /// This is the default state and will apply whatever optimizations are + /// possible, such as running a DFA. + /// + /// This overrides whatever was previously set via the `nfa` or + /// `bounded_backtracking` methods. + pub fn automatic(mut self) -> Self { + self.match_type = None; + self + } + + /// Sets the matching engine to use the NFA algorithm no matter what + /// optimizations are possible. + /// + /// This overrides whatever was previously set via the `automatic` or + /// `bounded_backtracking` methods. + pub fn nfa(mut self) -> Self { + self.match_type = Some(MatchType::Nfa(MatchNfaType::PikeVM)); + self + } + + /// Sets the matching engine to use a bounded backtracking engine no + /// matter what optimizations are possible. + /// + /// One must use this with care, since the bounded backtracking engine + /// uses memory proportion to `len(regex) * len(text)`. + /// + /// This overrides whatever was previously set via the `automatic` or + /// `nfa` methods. + pub fn bounded_backtracking(mut self) -> Self { + self.match_type = Some(MatchType::Nfa(MatchNfaType::Backtrack)); + self + } + + /// Compiles byte based programs for use with the NFA matching engines. + /// + /// By default, the NFA engines match on Unicode scalar values. They can + /// be made to use byte based programs instead. In general, the byte based + /// programs are slower because of a less efficient encoding of character + /// classes. + /// + /// Note that this does not impact DFA matching engines, which always + /// execute on bytes. + pub fn bytes(mut self, yes: bool) -> Self { + self.bytes = yes; + self + } + + /// When disabled, the program compiled may match arbitrary bytes. + /// + /// When enabled (the default), all compiled programs exclusively match + /// valid UTF-8 bytes. + pub fn only_utf8(mut self, yes: bool) -> Self { + self.only_utf8 = yes; + self + } + + /// Set the Unicode flag. + pub fn unicode(mut self, yes: bool) -> Self { + self.options.unicode = yes; + self + } + + /// Parse the current set of patterns into their AST and extract literals. + fn parse(&self) -> Result<Parsed, Error> { + let mut exprs = Vec::with_capacity(self.options.pats.len()); + let mut prefixes = Some(Literals::empty()); + let mut suffixes = Some(Literals::empty()); + let mut bytes = false; + let is_set = self.options.pats.len() > 1; + // If we're compiling a regex set and that set has any anchored + // expressions, then disable all literal optimizations. + for pat in &self.options.pats { + let mut parser = ParserBuilder::new() + .octal(self.options.octal) + .case_insensitive(self.options.case_insensitive) + .multi_line(self.options.multi_line) + .dot_matches_new_line(self.options.dot_matches_new_line) + .swap_greed(self.options.swap_greed) + .ignore_whitespace(self.options.ignore_whitespace) + .unicode(self.options.unicode) + .allow_invalid_utf8(!self.only_utf8) + .nest_limit(self.options.nest_limit) + .build(); + let expr = + parser.parse(pat).map_err(|e| Error::Syntax(e.to_string()))?; + bytes = bytes || !expr.is_always_utf8(); + + if cfg!(feature = "perf-literal") { + if !expr.is_anchored_start() && expr.is_any_anchored_start() { + // Partial anchors unfortunately make it hard to use + // prefixes, so disable them. + prefixes = None; + } else if is_set && expr.is_anchored_start() { + // Regex sets with anchors do not go well with literal + // optimizations. + prefixes = None; + } + prefixes = prefixes.and_then(|mut prefixes| { + if !prefixes.union_prefixes(&expr) { + None + } else { + Some(prefixes) + } + }); + + if !expr.is_anchored_end() && expr.is_any_anchored_end() { + // Partial anchors unfortunately make it hard to use + // suffixes, so disable them. + suffixes = None; + } else if is_set && expr.is_anchored_end() { + // Regex sets with anchors do not go well with literal + // optimizations. + suffixes = None; + } + suffixes = suffixes.and_then(|mut suffixes| { + if !suffixes.union_suffixes(&expr) { + None + } else { + Some(suffixes) + } + }); + } + exprs.push(expr); + } + Ok(Parsed { + exprs, + prefixes: prefixes.unwrap_or_else(Literals::empty), + suffixes: suffixes.unwrap_or_else(Literals::empty), + bytes, + }) + } + + /// Build an executor that can run a regular expression. + pub fn build(self) -> Result<Exec, Error> { + // Special case when we have no patterns to compile. + // This can happen when compiling a regex set. + if self.options.pats.is_empty() { + let ro = Arc::new(ExecReadOnly { + res: vec![], + nfa: Program::new(), + dfa: Program::new(), + dfa_reverse: Program::new(), + suffixes: LiteralSearcher::empty(), + #[cfg(feature = "perf-literal")] + ac: None, + match_type: MatchType::Nothing, + }); + let pool = ExecReadOnly::new_pool(&ro); + return Ok(Exec { ro, pool }); + } + let parsed = self.parse()?; + let mut nfa = Compiler::new() + .size_limit(self.options.size_limit) + .bytes(self.bytes || parsed.bytes) + .only_utf8(self.only_utf8) + .compile(&parsed.exprs)?; + let mut dfa = Compiler::new() + .size_limit(self.options.size_limit) + .dfa(true) + .only_utf8(self.only_utf8) + .compile(&parsed.exprs)?; + let mut dfa_reverse = Compiler::new() + .size_limit(self.options.size_limit) + .dfa(true) + .only_utf8(self.only_utf8) + .reverse(true) + .compile(&parsed.exprs)?; + + #[cfg(feature = "perf-literal")] + let ac = self.build_aho_corasick(&parsed); + nfa.prefixes = LiteralSearcher::prefixes(parsed.prefixes); + dfa.prefixes = nfa.prefixes.clone(); + dfa.dfa_size_limit = self.options.dfa_size_limit; + dfa_reverse.dfa_size_limit = self.options.dfa_size_limit; + + let mut ro = ExecReadOnly { + res: self.options.pats, + nfa, + dfa, + dfa_reverse, + suffixes: LiteralSearcher::suffixes(parsed.suffixes), + #[cfg(feature = "perf-literal")] + ac, + match_type: MatchType::Nothing, + }; + ro.match_type = ro.choose_match_type(self.match_type); + + let ro = Arc::new(ro); + let pool = ExecReadOnly::new_pool(&ro); + Ok(Exec { ro, pool }) + } + + #[cfg(feature = "perf-literal")] + fn build_aho_corasick(&self, parsed: &Parsed) -> Option<AhoCorasick<u32>> { + if parsed.exprs.len() != 1 { + return None; + } + let lits = match alternation_literals(&parsed.exprs[0]) { + None => return None, + Some(lits) => lits, + }; + // If we have a small number of literals, then let Teddy handle + // things (see literal/mod.rs). + if lits.len() <= 32 { + return None; + } + Some( + AhoCorasickBuilder::new() + .match_kind(MatchKind::LeftmostFirst) + .auto_configure(&lits) + .build_with_size::<u32, _, _>(&lits) + // This should never happen because we'd long exceed the + // compilation limit for regexes first. + .expect("AC automaton too big"), + ) + } +} + +impl<'c> RegularExpression for ExecNoSyncStr<'c> { + type Text = str; + + fn slots_len(&self) -> usize { + self.0.slots_len() + } + + fn next_after_empty(&self, text: &str, i: usize) -> usize { + next_utf8(text.as_bytes(), i) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn shortest_match_at(&self, text: &str, start: usize) -> Option<usize> { + self.0.shortest_match_at(text.as_bytes(), start) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn is_match_at(&self, text: &str, start: usize) -> bool { + self.0.is_match_at(text.as_bytes(), start) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn find_at(&self, text: &str, start: usize) -> Option<(usize, usize)> { + self.0.find_at(text.as_bytes(), start) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn captures_read_at( + &self, + locs: &mut Locations, + text: &str, + start: usize, + ) -> Option<(usize, usize)> { + self.0.captures_read_at(locs, text.as_bytes(), start) + } +} + +impl<'c> RegularExpression for ExecNoSync<'c> { + type Text = [u8]; + + /// Returns the number of capture slots in the regular expression. (There + /// are two slots for every capture group, corresponding to possibly empty + /// start and end locations of the capture.) + fn slots_len(&self) -> usize { + self.ro.nfa.captures.len() * 2 + } + + fn next_after_empty(&self, _text: &[u8], i: usize) -> usize { + i + 1 + } + + /// Returns the end of a match location, possibly occurring before the + /// end location of the correct leftmost-first match. + #[cfg_attr(feature = "perf-inline", inline(always))] + fn shortest_match_at(&self, text: &[u8], start: usize) -> Option<usize> { + if !self.is_anchor_end_match(text) { + return None; + } + match self.ro.match_type { + #[cfg(feature = "perf-literal")] + MatchType::Literal(ty) => { + self.find_literals(ty, text, start).map(|(_, e)| e) + } + #[cfg(feature = "perf-dfa")] + MatchType::Dfa | MatchType::DfaMany => { + match self.shortest_dfa(text, start) { + dfa::Result::Match(end) => Some(end), + dfa::Result::NoMatch(_) => None, + dfa::Result::Quit => self.shortest_nfa(text, start), + } + } + #[cfg(feature = "perf-dfa")] + MatchType::DfaAnchoredReverse => { + match dfa::Fsm::reverse( + &self.ro.dfa_reverse, + self.cache.value(), + true, + &text[start..], + text.len(), + ) { + dfa::Result::Match(_) => Some(text.len()), + dfa::Result::NoMatch(_) => None, + dfa::Result::Quit => self.shortest_nfa(text, start), + } + } + #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] + MatchType::DfaSuffix => { + match self.shortest_dfa_reverse_suffix(text, start) { + dfa::Result::Match(e) => Some(e), + dfa::Result::NoMatch(_) => None, + dfa::Result::Quit => self.shortest_nfa(text, start), + } + } + MatchType::Nfa(ty) => self.shortest_nfa_type(ty, text, start), + MatchType::Nothing => None, + } + } + + /// Returns true if and only if the regex matches text. + /// + /// For single regular expressions, this is equivalent to calling + /// shortest_match(...).is_some(). + #[cfg_attr(feature = "perf-inline", inline(always))] + fn is_match_at(&self, text: &[u8], start: usize) -> bool { + if !self.is_anchor_end_match(text) { + return false; + } + // We need to do this dance because shortest_match relies on the NFA + // filling in captures[1], but a RegexSet has no captures. In other + // words, a RegexSet can't (currently) use shortest_match. ---AG + match self.ro.match_type { + #[cfg(feature = "perf-literal")] + MatchType::Literal(ty) => { + self.find_literals(ty, text, start).is_some() + } + #[cfg(feature = "perf-dfa")] + MatchType::Dfa | MatchType::DfaMany => { + match self.shortest_dfa(text, start) { + dfa::Result::Match(_) => true, + dfa::Result::NoMatch(_) => false, + dfa::Result::Quit => self.match_nfa(text, start), + } + } + #[cfg(feature = "perf-dfa")] + MatchType::DfaAnchoredReverse => { + match dfa::Fsm::reverse( + &self.ro.dfa_reverse, + self.cache.value(), + true, + &text[start..], + text.len(), + ) { + dfa::Result::Match(_) => true, + dfa::Result::NoMatch(_) => false, + dfa::Result::Quit => self.match_nfa(text, start), + } + } + #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] + MatchType::DfaSuffix => { + match self.shortest_dfa_reverse_suffix(text, start) { + dfa::Result::Match(_) => true, + dfa::Result::NoMatch(_) => false, + dfa::Result::Quit => self.match_nfa(text, start), + } + } + MatchType::Nfa(ty) => self.match_nfa_type(ty, text, start), + MatchType::Nothing => false, + } + } + + /// Finds the start and end location of the leftmost-first match, starting + /// at the given location. + #[cfg_attr(feature = "perf-inline", inline(always))] + fn find_at(&self, text: &[u8], start: usize) -> Option<(usize, usize)> { + if !self.is_anchor_end_match(text) { + return None; + } + match self.ro.match_type { + #[cfg(feature = "perf-literal")] + MatchType::Literal(ty) => self.find_literals(ty, text, start), + #[cfg(feature = "perf-dfa")] + MatchType::Dfa => match self.find_dfa_forward(text, start) { + dfa::Result::Match((s, e)) => Some((s, e)), + dfa::Result::NoMatch(_) => None, + dfa::Result::Quit => { + self.find_nfa(MatchNfaType::Auto, text, start) + } + }, + #[cfg(feature = "perf-dfa")] + MatchType::DfaAnchoredReverse => { + match self.find_dfa_anchored_reverse(text, start) { + dfa::Result::Match((s, e)) => Some((s, e)), + dfa::Result::NoMatch(_) => None, + dfa::Result::Quit => { + self.find_nfa(MatchNfaType::Auto, text, start) + } + } + } + #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] + MatchType::DfaSuffix => { + match self.find_dfa_reverse_suffix(text, start) { + dfa::Result::Match((s, e)) => Some((s, e)), + dfa::Result::NoMatch(_) => None, + dfa::Result::Quit => { + self.find_nfa(MatchNfaType::Auto, text, start) + } + } + } + MatchType::Nfa(ty) => self.find_nfa(ty, text, start), + MatchType::Nothing => None, + #[cfg(feature = "perf-dfa")] + MatchType::DfaMany => { + unreachable!("BUG: RegexSet cannot be used with find") + } + } + } + + /// Finds the start and end location of the leftmost-first match and also + /// fills in all matching capture groups. + /// + /// The number of capture slots given should be equal to the total number + /// of capture slots in the compiled program. + /// + /// Note that the first two slots always correspond to the start and end + /// locations of the overall match. + fn captures_read_at( + &self, + locs: &mut Locations, + text: &[u8], + start: usize, + ) -> Option<(usize, usize)> { + let slots = locs.as_slots(); + for slot in slots.iter_mut() { + *slot = None; + } + // If the caller unnecessarily uses this, then we try to save them + // from themselves. + match slots.len() { + 0 => return self.find_at(text, start), + 2 => { + return self.find_at(text, start).map(|(s, e)| { + slots[0] = Some(s); + slots[1] = Some(e); + (s, e) + }); + } + _ => {} // fallthrough + } + if !self.is_anchor_end_match(text) { + return None; + } + match self.ro.match_type { + #[cfg(feature = "perf-literal")] + MatchType::Literal(ty) => { + self.find_literals(ty, text, start).and_then(|(s, e)| { + self.captures_nfa_type( + MatchNfaType::Auto, + slots, + text, + s, + e, + ) + }) + } + #[cfg(feature = "perf-dfa")] + MatchType::Dfa => { + if self.ro.nfa.is_anchored_start { + self.captures_nfa(slots, text, start) + } else { + match self.find_dfa_forward(text, start) { + dfa::Result::Match((s, e)) => self.captures_nfa_type( + MatchNfaType::Auto, + slots, + text, + s, + e, + ), + dfa::Result::NoMatch(_) => None, + dfa::Result::Quit => { + self.captures_nfa(slots, text, start) + } + } + } + } + #[cfg(feature = "perf-dfa")] + MatchType::DfaAnchoredReverse => { + match self.find_dfa_anchored_reverse(text, start) { + dfa::Result::Match((s, e)) => self.captures_nfa_type( + MatchNfaType::Auto, + slots, + text, + s, + e, + ), + dfa::Result::NoMatch(_) => None, + dfa::Result::Quit => self.captures_nfa(slots, text, start), + } + } + #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] + MatchType::DfaSuffix => { + match self.find_dfa_reverse_suffix(text, start) { + dfa::Result::Match((s, e)) => self.captures_nfa_type( + MatchNfaType::Auto, + slots, + text, + s, + e, + ), + dfa::Result::NoMatch(_) => None, + dfa::Result::Quit => self.captures_nfa(slots, text, start), + } + } + MatchType::Nfa(ty) => { + self.captures_nfa_type(ty, slots, text, start, text.len()) + } + MatchType::Nothing => None, + #[cfg(feature = "perf-dfa")] + MatchType::DfaMany => { + unreachable!("BUG: RegexSet cannot be used with captures") + } + } + } +} + +impl<'c> ExecNoSync<'c> { + /// Finds the leftmost-first match using only literal search. + #[cfg(feature = "perf-literal")] + #[cfg_attr(feature = "perf-inline", inline(always))] + fn find_literals( + &self, + ty: MatchLiteralType, + text: &[u8], + start: usize, + ) -> Option<(usize, usize)> { + use self::MatchLiteralType::*; + match ty { + Unanchored => { + let lits = &self.ro.nfa.prefixes; + lits.find(&text[start..]).map(|(s, e)| (start + s, start + e)) + } + AnchoredStart => { + let lits = &self.ro.nfa.prefixes; + if start == 0 || !self.ro.nfa.is_anchored_start { + lits.find_start(&text[start..]) + .map(|(s, e)| (start + s, start + e)) + } else { + None + } + } + AnchoredEnd => { + let lits = &self.ro.suffixes; + lits.find_end(&text[start..]) + .map(|(s, e)| (start + s, start + e)) + } + AhoCorasick => self + .ro + .ac + .as_ref() + .unwrap() + .find(&text[start..]) + .map(|m| (start + m.start(), start + m.end())), + } + } + + /// Finds the leftmost-first match (start and end) using only the DFA. + /// + /// If the result returned indicates that the DFA quit, then another + /// matching engine should be used. + #[cfg(feature = "perf-dfa")] + #[cfg_attr(feature = "perf-inline", inline(always))] + fn find_dfa_forward( + &self, + text: &[u8], + start: usize, + ) -> dfa::Result<(usize, usize)> { + use crate::dfa::Result::*; + let end = match dfa::Fsm::forward( + &self.ro.dfa, + self.cache.value(), + false, + text, + start, + ) { + NoMatch(i) => return NoMatch(i), + Quit => return Quit, + Match(end) if start == end => return Match((start, start)), + Match(end) => end, + }; + // Now run the DFA in reverse to find the start of the match. + match dfa::Fsm::reverse( + &self.ro.dfa_reverse, + self.cache.value(), + false, + &text[start..], + end - start, + ) { + Match(s) => Match((start + s, end)), + NoMatch(i) => NoMatch(i), + Quit => Quit, + } + } + + /// Finds the leftmost-first match (start and end) using only the DFA, + /// but assumes the regex is anchored at the end and therefore starts at + /// the end of the regex and matches in reverse. + /// + /// If the result returned indicates that the DFA quit, then another + /// matching engine should be used. + #[cfg(feature = "perf-dfa")] + #[cfg_attr(feature = "perf-inline", inline(always))] + fn find_dfa_anchored_reverse( + &self, + text: &[u8], + start: usize, + ) -> dfa::Result<(usize, usize)> { + use crate::dfa::Result::*; + match dfa::Fsm::reverse( + &self.ro.dfa_reverse, + self.cache.value(), + false, + &text[start..], + text.len() - start, + ) { + Match(s) => Match((start + s, text.len())), + NoMatch(i) => NoMatch(i), + Quit => Quit, + } + } + + /// Finds the end of the shortest match using only the DFA. + #[cfg(feature = "perf-dfa")] + #[cfg_attr(feature = "perf-inline", inline(always))] + fn shortest_dfa(&self, text: &[u8], start: usize) -> dfa::Result<usize> { + dfa::Fsm::forward(&self.ro.dfa, self.cache.value(), true, text, start) + } + + /// Finds the end of the shortest match using only the DFA by scanning for + /// suffix literals. + #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] + #[cfg_attr(feature = "perf-inline", inline(always))] + fn shortest_dfa_reverse_suffix( + &self, + text: &[u8], + start: usize, + ) -> dfa::Result<usize> { + match self.exec_dfa_reverse_suffix(text, start) { + None => self.shortest_dfa(text, start), + Some(r) => r.map(|(_, end)| end), + } + } + + /// Finds the end of the shortest match using only the DFA by scanning for + /// suffix literals. It also reports the start of the match. + /// + /// Note that if None is returned, then the optimization gave up to avoid + /// worst case quadratic behavior. A forward scanning DFA should be tried + /// next. + /// + /// If a match is returned and the full leftmost-first match is desired, + /// then a forward scan starting from the beginning of the match must be + /// done. + /// + /// If the result returned indicates that the DFA quit, then another + /// matching engine should be used. + #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] + #[cfg_attr(feature = "perf-inline", inline(always))] + fn exec_dfa_reverse_suffix( + &self, + text: &[u8], + original_start: usize, + ) -> Option<dfa::Result<(usize, usize)>> { + use crate::dfa::Result::*; + + let lcs = self.ro.suffixes.lcs(); + debug_assert!(lcs.len() >= 1); + let mut start = original_start; + let mut end = start; + let mut last_literal = start; + while end <= text.len() { + last_literal += match lcs.find(&text[last_literal..]) { + None => return Some(NoMatch(text.len())), + Some(i) => i, + }; + end = last_literal + lcs.len(); + match dfa::Fsm::reverse( + &self.ro.dfa_reverse, + self.cache.value(), + false, + &text[start..end], + end - start, + ) { + Match(0) | NoMatch(0) => return None, + Match(i) => return Some(Match((start + i, end))), + NoMatch(i) => { + start += i; + last_literal += 1; + continue; + } + Quit => return Some(Quit), + }; + } + Some(NoMatch(text.len())) + } + + /// Finds the leftmost-first match (start and end) using only the DFA + /// by scanning for suffix literals. + /// + /// If the result returned indicates that the DFA quit, then another + /// matching engine should be used. + #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] + #[cfg_attr(feature = "perf-inline", inline(always))] + fn find_dfa_reverse_suffix( + &self, + text: &[u8], + start: usize, + ) -> dfa::Result<(usize, usize)> { + use crate::dfa::Result::*; + + let match_start = match self.exec_dfa_reverse_suffix(text, start) { + None => return self.find_dfa_forward(text, start), + Some(Match((start, _))) => start, + Some(r) => return r, + }; + // At this point, we've found a match. The only way to quit now + // without a match is if the DFA gives up (seems unlikely). + // + // Now run the DFA forwards to find the proper end of the match. + // (The suffix literal match can only indicate the earliest + // possible end location, which may appear before the end of the + // leftmost-first match.) + match dfa::Fsm::forward( + &self.ro.dfa, + self.cache.value(), + false, + text, + match_start, + ) { + NoMatch(_) => panic!("BUG: reverse match implies forward match"), + Quit => Quit, + Match(e) => Match((match_start, e)), + } + } + + /// Executes the NFA engine to return whether there is a match or not. + /// + /// Ideally, we could use shortest_nfa(...).is_some() and get the same + /// performance characteristics, but regex sets don't have captures, which + /// shortest_nfa depends on. + #[cfg(feature = "perf-dfa")] + fn match_nfa(&self, text: &[u8], start: usize) -> bool { + self.match_nfa_type(MatchNfaType::Auto, text, start) + } + + /// Like match_nfa, but allows specification of the type of NFA engine. + fn match_nfa_type( + &self, + ty: MatchNfaType, + text: &[u8], + start: usize, + ) -> bool { + self.exec_nfa( + ty, + &mut [false], + &mut [], + true, + false, + text, + start, + text.len(), + ) + } + + /// Finds the shortest match using an NFA. + #[cfg(feature = "perf-dfa")] + fn shortest_nfa(&self, text: &[u8], start: usize) -> Option<usize> { + self.shortest_nfa_type(MatchNfaType::Auto, text, start) + } + + /// Like shortest_nfa, but allows specification of the type of NFA engine. + fn shortest_nfa_type( + &self, + ty: MatchNfaType, + text: &[u8], + start: usize, + ) -> Option<usize> { + let mut slots = [None, None]; + if self.exec_nfa( + ty, + &mut [false], + &mut slots, + true, + true, + text, + start, + text.len(), + ) { + slots[1] + } else { + None + } + } + + /// Like find, but executes an NFA engine. + fn find_nfa( + &self, + ty: MatchNfaType, + text: &[u8], + start: usize, + ) -> Option<(usize, usize)> { + let mut slots = [None, None]; + if self.exec_nfa( + ty, + &mut [false], + &mut slots, + false, + false, + text, + start, + text.len(), + ) { + match (slots[0], slots[1]) { + (Some(s), Some(e)) => Some((s, e)), + _ => None, + } + } else { + None + } + } + + /// Like find_nfa, but fills in captures. + /// + /// `slots` should have length equal to `2 * nfa.captures.len()`. + #[cfg(feature = "perf-dfa")] + fn captures_nfa( + &self, + slots: &mut [Slot], + text: &[u8], + start: usize, + ) -> Option<(usize, usize)> { + self.captures_nfa_type( + MatchNfaType::Auto, + slots, + text, + start, + text.len(), + ) + } + + /// Like captures_nfa, but allows specification of type of NFA engine. + fn captures_nfa_type( + &self, + ty: MatchNfaType, + slots: &mut [Slot], + text: &[u8], + start: usize, + end: usize, + ) -> Option<(usize, usize)> { + if self.exec_nfa( + ty, + &mut [false], + slots, + false, + false, + text, + start, + end, + ) { + match (slots[0], slots[1]) { + (Some(s), Some(e)) => Some((s, e)), + _ => None, + } + } else { + None + } + } + + fn exec_nfa( + &self, + mut ty: MatchNfaType, + matches: &mut [bool], + slots: &mut [Slot], + quit_after_match: bool, + quit_after_match_with_pos: bool, + text: &[u8], + start: usize, + end: usize, + ) -> bool { + use self::MatchNfaType::*; + if let Auto = ty { + if backtrack::should_exec(self.ro.nfa.len(), text.len()) { + ty = Backtrack; + } else { + ty = PikeVM; + } + } + // The backtracker can't return the shortest match position as it is + // implemented today. So if someone calls `shortest_match` and we need + // to run an NFA, then use the PikeVM. + if quit_after_match_with_pos || ty == PikeVM { + self.exec_pikevm( + matches, + slots, + quit_after_match, + text, + start, + end, + ) + } else { + self.exec_backtrack(matches, slots, text, start, end) + } + } + + /// Always run the NFA algorithm. + fn exec_pikevm( + &self, + matches: &mut [bool], + slots: &mut [Slot], + quit_after_match: bool, + text: &[u8], + start: usize, + end: usize, + ) -> bool { + if self.ro.nfa.uses_bytes() { + pikevm::Fsm::exec( + &self.ro.nfa, + self.cache.value(), + matches, + slots, + quit_after_match, + ByteInput::new(text, self.ro.nfa.only_utf8), + start, + end, + ) + } else { + pikevm::Fsm::exec( + &self.ro.nfa, + self.cache.value(), + matches, + slots, + quit_after_match, + CharInput::new(text), + start, + end, + ) + } + } + + /// Always runs the NFA using bounded backtracking. + fn exec_backtrack( + &self, + matches: &mut [bool], + slots: &mut [Slot], + text: &[u8], + start: usize, + end: usize, + ) -> bool { + if self.ro.nfa.uses_bytes() { + backtrack::Bounded::exec( + &self.ro.nfa, + self.cache.value(), + matches, + slots, + ByteInput::new(text, self.ro.nfa.only_utf8), + start, + end, + ) + } else { + backtrack::Bounded::exec( + &self.ro.nfa, + self.cache.value(), + matches, + slots, + CharInput::new(text), + start, + end, + ) + } + } + + /// Finds which regular expressions match the given text. + /// + /// `matches` should have length equal to the number of regexes being + /// searched. + /// + /// This is only useful when one wants to know which regexes in a set + /// match some text. + pub fn many_matches_at( + &self, + matches: &mut [bool], + text: &[u8], + start: usize, + ) -> bool { + use self::MatchType::*; + if !self.is_anchor_end_match(text) { + return false; + } + match self.ro.match_type { + #[cfg(feature = "perf-literal")] + Literal(ty) => { + debug_assert_eq!(matches.len(), 1); + matches[0] = self.find_literals(ty, text, start).is_some(); + matches[0] + } + #[cfg(feature = "perf-dfa")] + Dfa | DfaAnchoredReverse | DfaMany => { + match dfa::Fsm::forward_many( + &self.ro.dfa, + self.cache.value(), + matches, + text, + start, + ) { + dfa::Result::Match(_) => true, + dfa::Result::NoMatch(_) => false, + dfa::Result::Quit => self.exec_nfa( + MatchNfaType::Auto, + matches, + &mut [], + false, + false, + text, + start, + text.len(), + ), + } + } + #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] + DfaSuffix => { + match dfa::Fsm::forward_many( + &self.ro.dfa, + self.cache.value(), + matches, + text, + start, + ) { + dfa::Result::Match(_) => true, + dfa::Result::NoMatch(_) => false, + dfa::Result::Quit => self.exec_nfa( + MatchNfaType::Auto, + matches, + &mut [], + false, + false, + text, + start, + text.len(), + ), + } + } + Nfa(ty) => self.exec_nfa( + ty, + matches, + &mut [], + false, + false, + text, + start, + text.len(), + ), + Nothing => false, + } + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn is_anchor_end_match(&self, text: &[u8]) -> bool { + #[cfg(not(feature = "perf-literal"))] + fn imp(_: &ExecReadOnly, _: &[u8]) -> bool { + true + } + + #[cfg(feature = "perf-literal")] + fn imp(ro: &ExecReadOnly, text: &[u8]) -> bool { + // Only do this check if the haystack is big (>1MB). + if text.len() > (1 << 20) && ro.nfa.is_anchored_end { + let lcs = ro.suffixes.lcs(); + if lcs.len() >= 1 && !lcs.is_suffix(text) { + return false; + } + } + true + } + + imp(&self.ro, text) + } + + pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> { + &self.ro.nfa.capture_name_idx + } +} + +impl<'c> ExecNoSyncStr<'c> { + pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> { + self.0.capture_name_idx() + } +} + +impl Exec { + /// Get a searcher that isn't Sync. + #[cfg_attr(feature = "perf-inline", inline(always))] + pub fn searcher(&self) -> ExecNoSync<'_> { + ExecNoSync { + ro: &self.ro, // a clone is too expensive here! (and not needed) + cache: self.pool.get(), + } + } + + /// Get a searcher that isn't Sync and can match on &str. + #[cfg_attr(feature = "perf-inline", inline(always))] + pub fn searcher_str(&self) -> ExecNoSyncStr<'_> { + ExecNoSyncStr(self.searcher()) + } + + /// Build a Regex from this executor. + pub fn into_regex(self) -> re_unicode::Regex { + re_unicode::Regex::from(self) + } + + /// Build a RegexSet from this executor. + pub fn into_regex_set(self) -> re_set::unicode::RegexSet { + re_set::unicode::RegexSet::from(self) + } + + /// Build a Regex from this executor that can match arbitrary bytes. + pub fn into_byte_regex(self) -> re_bytes::Regex { + re_bytes::Regex::from(self) + } + + /// Build a RegexSet from this executor that can match arbitrary bytes. + pub fn into_byte_regex_set(self) -> re_set::bytes::RegexSet { + re_set::bytes::RegexSet::from(self) + } + + /// The original regular expressions given by the caller that were + /// compiled. + pub fn regex_strings(&self) -> &[String] { + &self.ro.res + } + + /// Return a slice of capture names. + /// + /// Any capture that isn't named is None. + pub fn capture_names(&self) -> &[Option<String>] { + &self.ro.nfa.captures + } + + /// Return a reference to named groups mapping (from group name to + /// group position). + pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> { + &self.ro.nfa.capture_name_idx + } +} + +impl Clone for Exec { + fn clone(&self) -> Exec { + let pool = ExecReadOnly::new_pool(&self.ro); + Exec { ro: self.ro.clone(), pool } + } +} + +impl ExecReadOnly { + fn choose_match_type(&self, hint: Option<MatchType>) -> MatchType { + if let Some(MatchType::Nfa(_)) = hint { + return hint.unwrap(); + } + // If the NFA is empty, then we'll never match anything. + if self.nfa.insts.is_empty() { + return MatchType::Nothing; + } + if let Some(literalty) = self.choose_literal_match_type() { + return literalty; + } + if let Some(dfaty) = self.choose_dfa_match_type() { + return dfaty; + } + // We're so totally hosed. + MatchType::Nfa(MatchNfaType::Auto) + } + + /// If a plain literal scan can be used, then a corresponding literal + /// search type is returned. + fn choose_literal_match_type(&self) -> Option<MatchType> { + #[cfg(not(feature = "perf-literal"))] + fn imp(_: &ExecReadOnly) -> Option<MatchType> { + None + } + + #[cfg(feature = "perf-literal")] + fn imp(ro: &ExecReadOnly) -> Option<MatchType> { + // If our set of prefixes is complete, then we can use it to find + // a match in lieu of a regex engine. This doesn't quite work well + // in the presence of multiple regexes, so only do it when there's + // one. + // + // TODO(burntsushi): Also, don't try to match literals if the regex + // is partially anchored. We could technically do it, but we'd need + // to create two sets of literals: all of them and then the subset + // that aren't anchored. We would then only search for all of them + // when at the beginning of the input and use the subset in all + // other cases. + if ro.res.len() != 1 { + return None; + } + if ro.ac.is_some() { + return Some(MatchType::Literal( + MatchLiteralType::AhoCorasick, + )); + } + if ro.nfa.prefixes.complete() { + return if ro.nfa.is_anchored_start { + Some(MatchType::Literal(MatchLiteralType::AnchoredStart)) + } else { + Some(MatchType::Literal(MatchLiteralType::Unanchored)) + }; + } + if ro.suffixes.complete() { + return if ro.nfa.is_anchored_end { + Some(MatchType::Literal(MatchLiteralType::AnchoredEnd)) + } else { + // This case shouldn't happen. When the regex isn't + // anchored, then complete prefixes should imply complete + // suffixes. + Some(MatchType::Literal(MatchLiteralType::Unanchored)) + }; + } + None + } + + imp(self) + } + + /// If a DFA scan can be used, then choose the appropriate DFA strategy. + fn choose_dfa_match_type(&self) -> Option<MatchType> { + #[cfg(not(feature = "perf-dfa"))] + fn imp(_: &ExecReadOnly) -> Option<MatchType> { + None + } + + #[cfg(feature = "perf-dfa")] + fn imp(ro: &ExecReadOnly) -> Option<MatchType> { + if !dfa::can_exec(&ro.dfa) { + return None; + } + // Regex sets require a slightly specialized path. + if ro.res.len() >= 2 { + return Some(MatchType::DfaMany); + } + // If the regex is anchored at the end but not the start, then + // just match in reverse from the end of the haystack. + if !ro.nfa.is_anchored_start && ro.nfa.is_anchored_end { + return Some(MatchType::DfaAnchoredReverse); + } + #[cfg(feature = "perf-literal")] + { + // If there's a longish suffix literal, then it might be faster + // to look for that first. + if ro.should_suffix_scan() { + return Some(MatchType::DfaSuffix); + } + } + // Fall back to your garden variety forward searching lazy DFA. + Some(MatchType::Dfa) + } + + imp(self) + } + + /// Returns true if the program is amenable to suffix scanning. + /// + /// When this is true, as a heuristic, we assume it is OK to quickly scan + /// for suffix literals and then do a *reverse* DFA match from any matches + /// produced by the literal scan. (And then followed by a forward DFA + /// search, since the previously found suffix literal maybe not actually be + /// the end of a match.) + /// + /// This is a bit of a specialized optimization, but can result in pretty + /// big performance wins if 1) there are no prefix literals and 2) the + /// suffix literals are pretty rare in the text. (1) is obviously easy to + /// account for but (2) is harder. As a proxy, we assume that longer + /// strings are generally rarer, so we only enable this optimization when + /// we have a meaty suffix. + #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] + fn should_suffix_scan(&self) -> bool { + if self.suffixes.is_empty() { + return false; + } + let lcs_len = self.suffixes.lcs().char_len(); + lcs_len >= 3 && lcs_len > self.dfa.prefixes.lcp().char_len() + } + + fn new_pool(ro: &Arc<ExecReadOnly>) -> Box<Pool<ProgramCache>> { + let ro = ro.clone(); + Box::new(Pool::new(Box::new(move || { + AssertUnwindSafe(RefCell::new(ProgramCacheInner::new(&ro))) + }))) + } +} + +#[derive(Clone, Copy, Debug)] +enum MatchType { + /// A single or multiple literal search. This is only used when the regex + /// can be decomposed into a literal search. + #[cfg(feature = "perf-literal")] + Literal(MatchLiteralType), + /// A normal DFA search. + #[cfg(feature = "perf-dfa")] + Dfa, + /// A reverse DFA search starting from the end of a haystack. + #[cfg(feature = "perf-dfa")] + DfaAnchoredReverse, + /// A reverse DFA search with suffix literal scanning. + #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] + DfaSuffix, + /// Use the DFA on two or more regular expressions. + #[cfg(feature = "perf-dfa")] + DfaMany, + /// An NFA variant. + Nfa(MatchNfaType), + /// No match is ever possible, so don't ever try to search. + Nothing, +} + +#[derive(Clone, Copy, Debug)] +#[cfg(feature = "perf-literal")] +enum MatchLiteralType { + /// Match literals anywhere in text. + Unanchored, + /// Match literals only at the start of text. + AnchoredStart, + /// Match literals only at the end of text. + AnchoredEnd, + /// Use an Aho-Corasick automaton. This requires `ac` to be Some on + /// ExecReadOnly. + AhoCorasick, +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +enum MatchNfaType { + /// Choose between Backtrack and PikeVM. + Auto, + /// NFA bounded backtracking. + /// + /// (This is only set by tests, since it never makes sense to always want + /// backtracking.) + Backtrack, + /// The Pike VM. + /// + /// (This is only set by tests, since it never makes sense to always want + /// the Pike VM.) + PikeVM, +} + +/// `ProgramCache` maintains reusable allocations for each matching engine +/// available to a particular program. +/// +/// We declare this as unwind safe since it's a cache that's only used for +/// performance purposes. If a panic occurs, it is (or should be) always safe +/// to continue using the same regex object. +pub type ProgramCache = AssertUnwindSafe<RefCell<ProgramCacheInner>>; + +#[derive(Debug)] +pub struct ProgramCacheInner { + pub pikevm: pikevm::Cache, + pub backtrack: backtrack::Cache, + #[cfg(feature = "perf-dfa")] + pub dfa: dfa::Cache, + #[cfg(feature = "perf-dfa")] + pub dfa_reverse: dfa::Cache, +} + +impl ProgramCacheInner { + fn new(ro: &ExecReadOnly) -> Self { + ProgramCacheInner { + pikevm: pikevm::Cache::new(&ro.nfa), + backtrack: backtrack::Cache::new(&ro.nfa), + #[cfg(feature = "perf-dfa")] + dfa: dfa::Cache::new(&ro.dfa), + #[cfg(feature = "perf-dfa")] + dfa_reverse: dfa::Cache::new(&ro.dfa_reverse), + } + } +} + +/// Alternation literals checks if the given HIR is a simple alternation of +/// literals, and if so, returns them. Otherwise, this returns None. +#[cfg(feature = "perf-literal")] +fn alternation_literals(expr: &Hir) -> Option<Vec<Vec<u8>>> { + use regex_syntax::hir::{HirKind, Literal}; + + // This is pretty hacky, but basically, if `is_alternation_literal` is + // true, then we can make several assumptions about the structure of our + // HIR. This is what justifies the `unreachable!` statements below. + // + // This code should be refactored once we overhaul this crate's + // optimization pipeline, because this is a terribly inflexible way to go + // about things. + + if !expr.is_alternation_literal() { + return None; + } + let alts = match *expr.kind() { + HirKind::Alternation(ref alts) => alts, + _ => return None, // one literal isn't worth it + }; + + let extendlit = |lit: &Literal, dst: &mut Vec<u8>| match *lit { + Literal::Unicode(c) => { + let mut buf = [0; 4]; + dst.extend_from_slice(c.encode_utf8(&mut buf).as_bytes()); + } + Literal::Byte(b) => { + dst.push(b); + } + }; + + let mut lits = vec![]; + for alt in alts { + let mut lit = vec![]; + match *alt.kind() { + HirKind::Literal(ref x) => extendlit(x, &mut lit), + HirKind::Concat(ref exprs) => { + for e in exprs { + match *e.kind() { + HirKind::Literal(ref x) => extendlit(x, &mut lit), + _ => unreachable!("expected literal, got {:?}", e), + } + } + } + _ => unreachable!("expected literal or concat, got {:?}", alt), + } + lits.push(lit); + } + Some(lits) +} + +#[cfg(test)] +mod test { + #[test] + fn uppercut_s_backtracking_bytes_default_bytes_mismatch() { + use crate::internal::ExecBuilder; + + let backtrack_bytes_re = ExecBuilder::new("^S") + .bounded_backtracking() + .only_utf8(false) + .build() + .map(|exec| exec.into_byte_regex()) + .map_err(|err| format!("{}", err)) + .unwrap(); + + let default_bytes_re = ExecBuilder::new("^S") + .only_utf8(false) + .build() + .map(|exec| exec.into_byte_regex()) + .map_err(|err| format!("{}", err)) + .unwrap(); + + let input = vec![83, 83]; + + let s1 = backtrack_bytes_re.split(&input); + let s2 = default_bytes_re.split(&input); + for (chunk1, chunk2) in s1.zip(s2) { + assert_eq!(chunk1, chunk2); + } + } + + #[test] + fn unicode_lit_star_backtracking_utf8bytes_default_utf8bytes_mismatch() { + use crate::internal::ExecBuilder; + + let backtrack_bytes_re = ExecBuilder::new(r"^(?u:\*)") + .bounded_backtracking() + .bytes(true) + .build() + .map(|exec| exec.into_regex()) + .map_err(|err| format!("{}", err)) + .unwrap(); + + let default_bytes_re = ExecBuilder::new(r"^(?u:\*)") + .bytes(true) + .build() + .map(|exec| exec.into_regex()) + .map_err(|err| format!("{}", err)) + .unwrap(); + + let input = "**"; + + let s1 = backtrack_bytes_re.split(input); + let s2 = default_bytes_re.split(input); + for (chunk1, chunk2) in s1.zip(s2) { + assert_eq!(chunk1, chunk2); + } + } +} diff --git a/third_party/rust/regex/src/expand.rs b/third_party/rust/regex/src/expand.rs new file mode 100644 index 0000000000..67b514926a --- /dev/null +++ b/third_party/rust/regex/src/expand.rs @@ -0,0 +1,239 @@ +use std::str; + +use crate::find_byte::find_byte; + +use crate::re_bytes; +use crate::re_unicode; + +pub fn expand_str( + caps: &re_unicode::Captures<'_>, + mut replacement: &str, + dst: &mut String, +) { + while !replacement.is_empty() { + match find_byte(b'$', replacement.as_bytes()) { + None => break, + Some(i) => { + dst.push_str(&replacement[..i]); + replacement = &replacement[i..]; + } + } + if replacement.as_bytes().get(1).map_or(false, |&b| b == b'$') { + dst.push_str("$"); + replacement = &replacement[2..]; + continue; + } + debug_assert!(!replacement.is_empty()); + let cap_ref = match find_cap_ref(replacement.as_bytes()) { + Some(cap_ref) => cap_ref, + None => { + dst.push_str("$"); + replacement = &replacement[1..]; + continue; + } + }; + replacement = &replacement[cap_ref.end..]; + match cap_ref.cap { + Ref::Number(i) => { + dst.push_str(caps.get(i).map(|m| m.as_str()).unwrap_or("")); + } + Ref::Named(name) => { + dst.push_str( + caps.name(name).map(|m| m.as_str()).unwrap_or(""), + ); + } + } + } + dst.push_str(replacement); +} + +pub fn expand_bytes( + caps: &re_bytes::Captures<'_>, + mut replacement: &[u8], + dst: &mut Vec<u8>, +) { + while !replacement.is_empty() { + match find_byte(b'$', replacement) { + None => break, + Some(i) => { + dst.extend(&replacement[..i]); + replacement = &replacement[i..]; + } + } + if replacement.get(1).map_or(false, |&b| b == b'$') { + dst.push(b'$'); + replacement = &replacement[2..]; + continue; + } + debug_assert!(!replacement.is_empty()); + let cap_ref = match find_cap_ref(replacement) { + Some(cap_ref) => cap_ref, + None => { + dst.push(b'$'); + replacement = &replacement[1..]; + continue; + } + }; + replacement = &replacement[cap_ref.end..]; + match cap_ref.cap { + Ref::Number(i) => { + dst.extend(caps.get(i).map(|m| m.as_bytes()).unwrap_or(b"")); + } + Ref::Named(name) => { + dst.extend( + caps.name(name).map(|m| m.as_bytes()).unwrap_or(b""), + ); + } + } + } + dst.extend(replacement); +} + +/// `CaptureRef` represents a reference to a capture group inside some text. +/// The reference is either a capture group name or a number. +/// +/// It is also tagged with the position in the text following the +/// capture reference. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +struct CaptureRef<'a> { + cap: Ref<'a>, + end: usize, +} + +/// A reference to a capture group in some text. +/// +/// e.g., `$2`, `$foo`, `${foo}`. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +enum Ref<'a> { + Named(&'a str), + Number(usize), +} + +impl<'a> From<&'a str> for Ref<'a> { + fn from(x: &'a str) -> Ref<'a> { + Ref::Named(x) + } +} + +impl From<usize> for Ref<'static> { + fn from(x: usize) -> Ref<'static> { + Ref::Number(x) + } +} + +/// Parses a possible reference to a capture group name in the given text, +/// starting at the beginning of `replacement`. +/// +/// If no such valid reference could be found, None is returned. +fn find_cap_ref(replacement: &[u8]) -> Option<CaptureRef<'_>> { + let mut i = 0; + let rep: &[u8] = replacement; + if rep.len() <= 1 || rep[0] != b'$' { + return None; + } + i += 1; + if rep[i] == b'{' { + return find_cap_ref_braced(rep, i + 1); + } + let mut cap_end = i; + while rep.get(cap_end).copied().map_or(false, is_valid_cap_letter) { + cap_end += 1; + } + if cap_end == i { + return None; + } + // We just verified that the range 0..cap_end is valid ASCII, so it must + // therefore be valid UTF-8. If we really cared, we could avoid this UTF-8 + // check via an unchecked conversion or by parsing the number straight from + // &[u8]. + let cap = + str::from_utf8(&rep[i..cap_end]).expect("valid UTF-8 capture name"); + Some(CaptureRef { + cap: match cap.parse::<u32>() { + Ok(i) => Ref::Number(i as usize), + Err(_) => Ref::Named(cap), + }, + end: cap_end, + }) +} + +fn find_cap_ref_braced(rep: &[u8], mut i: usize) -> Option<CaptureRef<'_>> { + let start = i; + while rep.get(i).map_or(false, |&b| b != b'}') { + i += 1; + } + if !rep.get(i).map_or(false, |&b| b == b'}') { + return None; + } + // When looking at braced names, we don't put any restrictions on the name, + // so it's possible it could be invalid UTF-8. But a capture group name + // can never be invalid UTF-8, so if we have invalid UTF-8, then we can + // safely return None. + let cap = match str::from_utf8(&rep[start..i]) { + Err(_) => return None, + Ok(cap) => cap, + }; + Some(CaptureRef { + cap: match cap.parse::<u32>() { + Ok(i) => Ref::Number(i as usize), + Err(_) => Ref::Named(cap), + }, + end: i + 1, + }) +} + +/// Returns true if and only if the given byte is allowed in a capture name. +fn is_valid_cap_letter(b: u8) -> bool { + match b { + b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' | b'_' => true, + _ => false, + } +} + +#[cfg(test)] +mod tests { + use super::{find_cap_ref, CaptureRef}; + + macro_rules! find { + ($name:ident, $text:expr) => { + #[test] + fn $name() { + assert_eq!(None, find_cap_ref($text.as_bytes())); + } + }; + ($name:ident, $text:expr, $capref:expr) => { + #[test] + fn $name() { + assert_eq!(Some($capref), find_cap_ref($text.as_bytes())); + } + }; + } + + macro_rules! c { + ($name_or_number:expr, $pos:expr) => { + CaptureRef { cap: $name_or_number.into(), end: $pos } + }; + } + + find!(find_cap_ref1, "$foo", c!("foo", 4)); + find!(find_cap_ref2, "${foo}", c!("foo", 6)); + find!(find_cap_ref3, "$0", c!(0, 2)); + find!(find_cap_ref4, "$5", c!(5, 2)); + find!(find_cap_ref5, "$10", c!(10, 3)); + // See https://github.com/rust-lang/regex/pull/585 + // for more on characters following numbers + find!(find_cap_ref6, "$42a", c!("42a", 4)); + find!(find_cap_ref7, "${42}a", c!(42, 5)); + find!(find_cap_ref8, "${42"); + find!(find_cap_ref9, "${42 "); + find!(find_cap_ref10, " $0 "); + find!(find_cap_ref11, "$"); + find!(find_cap_ref12, " "); + find!(find_cap_ref13, ""); + find!(find_cap_ref14, "$1-$2", c!(1, 2)); + find!(find_cap_ref15, "$1_$2", c!("1_", 3)); + find!(find_cap_ref16, "$x-$y", c!("x", 2)); + find!(find_cap_ref17, "$x_$y", c!("x_", 3)); + find!(find_cap_ref18, "${#}", c!("#", 4)); + find!(find_cap_ref19, "${Z[}", c!("Z[", 5)); +} diff --git a/third_party/rust/regex/src/find_byte.rs b/third_party/rust/regex/src/find_byte.rs new file mode 100644 index 0000000000..e95f72afb9 --- /dev/null +++ b/third_party/rust/regex/src/find_byte.rs @@ -0,0 +1,18 @@ +/// Searches for the given needle in the given haystack. +/// +/// If the perf-literal feature is enabled, then this uses the super optimized +/// memchr crate. Otherwise, it uses the naive byte-at-a-time implementation. +pub fn find_byte(needle: u8, haystack: &[u8]) -> Option<usize> { + #[cfg(not(feature = "perf-literal"))] + fn imp(needle: u8, haystack: &[u8]) -> Option<usize> { + haystack.iter().position(|&b| b == needle) + } + + #[cfg(feature = "perf-literal")] + fn imp(needle: u8, haystack: &[u8]) -> Option<usize> { + use memchr::memchr; + memchr(needle, haystack) + } + + imp(needle, haystack) +} diff --git a/third_party/rust/regex/src/freqs.rs b/third_party/rust/regex/src/freqs.rs new file mode 100644 index 0000000000..fcffa95fb5 --- /dev/null +++ b/third_party/rust/regex/src/freqs.rs @@ -0,0 +1,261 @@ +// NOTE: The following code was generated by "scripts/frequencies.py", do not +// edit directly + +pub const BYTE_FREQUENCIES: [u8; 256] = [ + 55, // '\x00' + 52, // '\x01' + 51, // '\x02' + 50, // '\x03' + 49, // '\x04' + 48, // '\x05' + 47, // '\x06' + 46, // '\x07' + 45, // '\x08' + 103, // '\t' + 242, // '\n' + 66, // '\x0b' + 67, // '\x0c' + 229, // '\r' + 44, // '\x0e' + 43, // '\x0f' + 42, // '\x10' + 41, // '\x11' + 40, // '\x12' + 39, // '\x13' + 38, // '\x14' + 37, // '\x15' + 36, // '\x16' + 35, // '\x17' + 34, // '\x18' + 33, // '\x19' + 56, // '\x1a' + 32, // '\x1b' + 31, // '\x1c' + 30, // '\x1d' + 29, // '\x1e' + 28, // '\x1f' + 255, // ' ' + 148, // '!' + 164, // '"' + 149, // '#' + 136, // '$' + 160, // '%' + 155, // '&' + 173, // "'" + 221, // '(' + 222, // ')' + 134, // '*' + 122, // '+' + 232, // ',' + 202, // '-' + 215, // '.' + 224, // '/' + 208, // '0' + 220, // '1' + 204, // '2' + 187, // '3' + 183, // '4' + 179, // '5' + 177, // '6' + 168, // '7' + 178, // '8' + 200, // '9' + 226, // ':' + 195, // ';' + 154, // '<' + 184, // '=' + 174, // '>' + 126, // '?' + 120, // '@' + 191, // 'A' + 157, // 'B' + 194, // 'C' + 170, // 'D' + 189, // 'E' + 162, // 'F' + 161, // 'G' + 150, // 'H' + 193, // 'I' + 142, // 'J' + 137, // 'K' + 171, // 'L' + 176, // 'M' + 185, // 'N' + 167, // 'O' + 186, // 'P' + 112, // 'Q' + 175, // 'R' + 192, // 'S' + 188, // 'T' + 156, // 'U' + 140, // 'V' + 143, // 'W' + 123, // 'X' + 133, // 'Y' + 128, // 'Z' + 147, // '[' + 138, // '\\' + 146, // ']' + 114, // '^' + 223, // '_' + 151, // '`' + 249, // 'a' + 216, // 'b' + 238, // 'c' + 236, // 'd' + 253, // 'e' + 227, // 'f' + 218, // 'g' + 230, // 'h' + 247, // 'i' + 135, // 'j' + 180, // 'k' + 241, // 'l' + 233, // 'm' + 246, // 'n' + 244, // 'o' + 231, // 'p' + 139, // 'q' + 245, // 'r' + 243, // 's' + 251, // 't' + 235, // 'u' + 201, // 'v' + 196, // 'w' + 240, // 'x' + 214, // 'y' + 152, // 'z' + 182, // '{' + 205, // '|' + 181, // '}' + 127, // '~' + 27, // '\x7f' + 212, // '\x80' + 211, // '\x81' + 210, // '\x82' + 213, // '\x83' + 228, // '\x84' + 197, // '\x85' + 169, // '\x86' + 159, // '\x87' + 131, // '\x88' + 172, // '\x89' + 105, // '\x8a' + 80, // '\x8b' + 98, // '\x8c' + 96, // '\x8d' + 97, // '\x8e' + 81, // '\x8f' + 207, // '\x90' + 145, // '\x91' + 116, // '\x92' + 115, // '\x93' + 144, // '\x94' + 130, // '\x95' + 153, // '\x96' + 121, // '\x97' + 107, // '\x98' + 132, // '\x99' + 109, // '\x9a' + 110, // '\x9b' + 124, // '\x9c' + 111, // '\x9d' + 82, // '\x9e' + 108, // '\x9f' + 118, // '\xa0' + 141, // '¡' + 113, // '¢' + 129, // '£' + 119, // '¤' + 125, // '¥' + 165, // '¦' + 117, // '§' + 92, // '¨' + 106, // '©' + 83, // 'ª' + 72, // '«' + 99, // '¬' + 93, // '\xad' + 65, // '®' + 79, // '¯' + 166, // '°' + 237, // '±' + 163, // '²' + 199, // '³' + 190, // '´' + 225, // 'µ' + 209, // '¶' + 203, // '·' + 198, // '¸' + 217, // '¹' + 219, // 'º' + 206, // '»' + 234, // '¼' + 248, // '½' + 158, // '¾' + 239, // '¿' + 255, // 'À' + 255, // 'Á' + 255, // 'Â' + 255, // 'Ã' + 255, // 'Ä' + 255, // 'Å' + 255, // 'Æ' + 255, // 'Ç' + 255, // 'È' + 255, // 'É' + 255, // 'Ê' + 255, // 'Ë' + 255, // 'Ì' + 255, // 'Í' + 255, // 'Î' + 255, // 'Ï' + 255, // 'Ð' + 255, // 'Ñ' + 255, // 'Ò' + 255, // 'Ó' + 255, // 'Ô' + 255, // 'Õ' + 255, // 'Ö' + 255, // '×' + 255, // 'Ø' + 255, // 'Ù' + 255, // 'Ú' + 255, // 'Û' + 255, // 'Ü' + 255, // 'Ý' + 255, // 'Þ' + 255, // 'ß' + 255, // 'à' + 255, // 'á' + 255, // 'â' + 255, // 'ã' + 255, // 'ä' + 255, // 'å' + 255, // 'æ' + 255, // 'ç' + 255, // 'è' + 255, // 'é' + 255, // 'ê' + 255, // 'ë' + 255, // 'ì' + 255, // 'í' + 255, // 'î' + 255, // 'ï' + 255, // 'ð' + 255, // 'ñ' + 255, // 'ò' + 255, // 'ó' + 255, // 'ô' + 255, // 'õ' + 255, // 'ö' + 255, // '÷' + 255, // 'ø' + 255, // 'ù' + 255, // 'ú' + 255, // 'û' + 255, // 'ü' + 255, // 'ý' + 255, // 'þ' + 255, // 'ÿ' +]; diff --git a/third_party/rust/regex/src/input.rs b/third_party/rust/regex/src/input.rs new file mode 100644 index 0000000000..df6c3e0c91 --- /dev/null +++ b/third_party/rust/regex/src/input.rs @@ -0,0 +1,432 @@ +use std::char; +use std::cmp::Ordering; +use std::fmt; +use std::ops; +use std::u32; + +use crate::literal::LiteralSearcher; +use crate::prog::InstEmptyLook; +use crate::utf8::{decode_last_utf8, decode_utf8}; + +/// Represents a location in the input. +#[derive(Clone, Copy, Debug)] +pub struct InputAt { + pos: usize, + c: Char, + byte: Option<u8>, + len: usize, +} + +impl InputAt { + /// Returns true iff this position is at the beginning of the input. + pub fn is_start(&self) -> bool { + self.pos == 0 + } + + /// Returns true iff this position is past the end of the input. + pub fn is_end(&self) -> bool { + self.c.is_none() && self.byte.is_none() + } + + /// Returns the character at this position. + /// + /// If this position is just before or after the input, then an absent + /// character is returned. + pub fn char(&self) -> Char { + self.c + } + + /// Returns the byte at this position. + pub fn byte(&self) -> Option<u8> { + self.byte + } + + /// Returns the UTF-8 width of the character at this position. + pub fn len(&self) -> usize { + self.len + } + + /// Returns whether the UTF-8 width of the character at this position + /// is zero. + pub fn is_empty(&self) -> bool { + self.len == 0 + } + + /// Returns the byte offset of this position. + pub fn pos(&self) -> usize { + self.pos + } + + /// Returns the byte offset of the next position in the input. + pub fn next_pos(&self) -> usize { + self.pos + self.len + } +} + +/// An abstraction over input used in the matching engines. +pub trait Input: fmt::Debug { + /// Return an encoding of the position at byte offset `i`. + fn at(&self, i: usize) -> InputAt; + + /// Return the Unicode character occurring next to `at`. + /// + /// If no such character could be decoded, then `Char` is absent. + fn next_char(&self, at: InputAt) -> Char; + + /// Return the Unicode character occurring previous to `at`. + /// + /// If no such character could be decoded, then `Char` is absent. + fn previous_char(&self, at: InputAt) -> Char; + + /// Return true if the given empty width instruction matches at the + /// input position given. + fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool; + + /// Scan the input for a matching prefix. + fn prefix_at( + &self, + prefixes: &LiteralSearcher, + at: InputAt, + ) -> Option<InputAt>; + + /// The number of bytes in the input. + fn len(&self) -> usize; + + /// Whether the input is empty. + fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Return the given input as a sequence of bytes. + fn as_bytes(&self) -> &[u8]; +} + +impl<'a, T: Input> Input for &'a T { + fn at(&self, i: usize) -> InputAt { + (**self).at(i) + } + + fn next_char(&self, at: InputAt) -> Char { + (**self).next_char(at) + } + + fn previous_char(&self, at: InputAt) -> Char { + (**self).previous_char(at) + } + + fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool { + (**self).is_empty_match(at, empty) + } + + fn prefix_at( + &self, + prefixes: &LiteralSearcher, + at: InputAt, + ) -> Option<InputAt> { + (**self).prefix_at(prefixes, at) + } + + fn len(&self) -> usize { + (**self).len() + } + + fn as_bytes(&self) -> &[u8] { + (**self).as_bytes() + } +} + +/// An input reader over characters. +#[derive(Clone, Copy, Debug)] +pub struct CharInput<'t>(&'t [u8]); + +impl<'t> CharInput<'t> { + /// Return a new character input reader for the given string. + pub fn new(s: &'t [u8]) -> CharInput<'t> { + CharInput(s) + } +} + +impl<'t> ops::Deref for CharInput<'t> { + type Target = [u8]; + + fn deref(&self) -> &[u8] { + self.0 + } +} + +impl<'t> Input for CharInput<'t> { + fn at(&self, i: usize) -> InputAt { + if i >= self.len() { + InputAt { pos: self.len(), c: None.into(), byte: None, len: 0 } + } else { + let c = decode_utf8(&self[i..]).map(|(c, _)| c).into(); + InputAt { pos: i, c, byte: None, len: c.len_utf8() } + } + } + + fn next_char(&self, at: InputAt) -> Char { + at.char() + } + + fn previous_char(&self, at: InputAt) -> Char { + decode_last_utf8(&self[..at.pos()]).map(|(c, _)| c).into() + } + + fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool { + use crate::prog::EmptyLook::*; + match empty.look { + StartLine => { + let c = self.previous_char(at); + at.pos() == 0 || c == '\n' + } + EndLine => { + let c = self.next_char(at); + at.pos() == self.len() || c == '\n' + } + StartText => at.pos() == 0, + EndText => at.pos() == self.len(), + WordBoundary => { + let (c1, c2) = (self.previous_char(at), self.next_char(at)); + c1.is_word_char() != c2.is_word_char() + } + NotWordBoundary => { + let (c1, c2) = (self.previous_char(at), self.next_char(at)); + c1.is_word_char() == c2.is_word_char() + } + WordBoundaryAscii => { + let (c1, c2) = (self.previous_char(at), self.next_char(at)); + c1.is_word_byte() != c2.is_word_byte() + } + NotWordBoundaryAscii => { + let (c1, c2) = (self.previous_char(at), self.next_char(at)); + c1.is_word_byte() == c2.is_word_byte() + } + } + } + + fn prefix_at( + &self, + prefixes: &LiteralSearcher, + at: InputAt, + ) -> Option<InputAt> { + prefixes.find(&self[at.pos()..]).map(|(s, _)| self.at(at.pos() + s)) + } + + fn len(&self) -> usize { + self.0.len() + } + + fn as_bytes(&self) -> &[u8] { + self.0 + } +} + +/// An input reader over bytes. +#[derive(Clone, Copy, Debug)] +pub struct ByteInput<'t> { + text: &'t [u8], + only_utf8: bool, +} + +impl<'t> ByteInput<'t> { + /// Return a new byte-based input reader for the given string. + pub fn new(text: &'t [u8], only_utf8: bool) -> ByteInput<'t> { + ByteInput { text, only_utf8 } + } +} + +impl<'t> ops::Deref for ByteInput<'t> { + type Target = [u8]; + + fn deref(&self) -> &[u8] { + self.text + } +} + +impl<'t> Input for ByteInput<'t> { + fn at(&self, i: usize) -> InputAt { + if i >= self.len() { + InputAt { pos: self.len(), c: None.into(), byte: None, len: 0 } + } else { + InputAt { + pos: i, + c: None.into(), + byte: self.get(i).cloned(), + len: 1, + } + } + } + + fn next_char(&self, at: InputAt) -> Char { + decode_utf8(&self[at.pos()..]).map(|(c, _)| c).into() + } + + fn previous_char(&self, at: InputAt) -> Char { + decode_last_utf8(&self[..at.pos()]).map(|(c, _)| c).into() + } + + fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool { + use crate::prog::EmptyLook::*; + match empty.look { + StartLine => { + let c = self.previous_char(at); + at.pos() == 0 || c == '\n' + } + EndLine => { + let c = self.next_char(at); + at.pos() == self.len() || c == '\n' + } + StartText => at.pos() == 0, + EndText => at.pos() == self.len(), + WordBoundary => { + let (c1, c2) = (self.previous_char(at), self.next_char(at)); + c1.is_word_char() != c2.is_word_char() + } + NotWordBoundary => { + let (c1, c2) = (self.previous_char(at), self.next_char(at)); + c1.is_word_char() == c2.is_word_char() + } + WordBoundaryAscii => { + let (c1, c2) = (self.previous_char(at), self.next_char(at)); + if self.only_utf8 { + // If we must match UTF-8, then we can't match word + // boundaries at invalid UTF-8. + if c1.is_none() && !at.is_start() { + return false; + } + if c2.is_none() && !at.is_end() { + return false; + } + } + c1.is_word_byte() != c2.is_word_byte() + } + NotWordBoundaryAscii => { + let (c1, c2) = (self.previous_char(at), self.next_char(at)); + if self.only_utf8 { + // If we must match UTF-8, then we can't match word + // boundaries at invalid UTF-8. + if c1.is_none() && !at.is_start() { + return false; + } + if c2.is_none() && !at.is_end() { + return false; + } + } + c1.is_word_byte() == c2.is_word_byte() + } + } + } + + fn prefix_at( + &self, + prefixes: &LiteralSearcher, + at: InputAt, + ) -> Option<InputAt> { + prefixes.find(&self[at.pos()..]).map(|(s, _)| self.at(at.pos() + s)) + } + + fn len(&self) -> usize { + self.text.len() + } + + fn as_bytes(&self) -> &[u8] { + self.text + } +} + +/// An inline representation of `Option<char>`. +/// +/// This eliminates the need to do case analysis on `Option<char>` to determine +/// ordinality with other characters. +/// +/// (The `Option<char>` is not related to encoding. Instead, it is used in the +/// matching engines to represent the beginning and ending boundaries of the +/// search text.) +#[derive(Clone, Copy, Hash, PartialEq, Eq, PartialOrd, Ord)] +pub struct Char(u32); + +impl fmt::Debug for Char { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match char::from_u32(self.0) { + None => write!(f, "Empty"), + Some(c) => write!(f, "{:?}", c), + } + } +} + +impl Char { + /// Returns true iff the character is absent. + #[inline] + pub fn is_none(self) -> bool { + self.0 == u32::MAX + } + + /// Returns the length of the character's UTF-8 encoding. + /// + /// If the character is absent, then `1` is returned. + #[inline] + pub fn len_utf8(self) -> usize { + char::from_u32(self.0).map_or(1, |c| c.len_utf8()) + } + + /// Returns true iff the character is a word character. + /// + /// If the character is absent, then false is returned. + pub fn is_word_char(self) -> bool { + // is_word_character can panic if the Unicode data for \w isn't + // available. However, our compiler ensures that if a Unicode word + // boundary is used, then the data must also be available. If it isn't, + // then the compiler returns an error. + char::from_u32(self.0).map_or(false, regex_syntax::is_word_character) + } + + /// Returns true iff the byte is a word byte. + /// + /// If the byte is absent, then false is returned. + pub fn is_word_byte(self) -> bool { + match char::from_u32(self.0) { + Some(c) if c <= '\u{7F}' => regex_syntax::is_word_byte(c as u8), + None | Some(_) => false, + } + } +} + +impl From<char> for Char { + fn from(c: char) -> Char { + Char(c as u32) + } +} + +impl From<Option<char>> for Char { + fn from(c: Option<char>) -> Char { + c.map_or(Char(u32::MAX), |c| c.into()) + } +} + +impl PartialEq<char> for Char { + #[inline] + fn eq(&self, other: &char) -> bool { + self.0 == *other as u32 + } +} + +impl PartialEq<Char> for char { + #[inline] + fn eq(&self, other: &Char) -> bool { + *self as u32 == other.0 + } +} + +impl PartialOrd<char> for Char { + #[inline] + fn partial_cmp(&self, other: &char) -> Option<Ordering> { + self.0.partial_cmp(&(*other as u32)) + } +} + +impl PartialOrd<Char> for char { + #[inline] + fn partial_cmp(&self, other: &Char) -> Option<Ordering> { + (*self as u32).partial_cmp(&other.0) + } +} diff --git a/third_party/rust/regex/src/lib.rs b/third_party/rust/regex/src/lib.rs new file mode 100644 index 0000000000..6b95739c5c --- /dev/null +++ b/third_party/rust/regex/src/lib.rs @@ -0,0 +1,769 @@ +/*! +This crate provides a library for parsing, compiling, and executing regular +expressions. Its syntax is similar to Perl-style regular expressions, but lacks +a few features like look around and backreferences. In exchange, all searches +execute in linear time with respect to the size of the regular expression and +search text. + +This crate's documentation provides some simple examples, describes +[Unicode support](#unicode) and exhaustively lists the +[supported syntax](#syntax). + +For more specific details on the API for regular expressions, please see the +documentation for the [`Regex`](struct.Regex.html) type. + +# Usage + +This crate is [on crates.io](https://crates.io/crates/regex) and can be +used by adding `regex` to your dependencies in your project's `Cargo.toml`. + +```toml +[dependencies] +regex = "1" +``` + +# Example: find a date + +General use of regular expressions in this package involves compiling an +expression and then using it to search, split or replace text. For example, +to confirm that some text resembles a date: + +```rust +use regex::Regex; +let re = Regex::new(r"^\d{4}-\d{2}-\d{2}$").unwrap(); +assert!(re.is_match("2014-01-01")); +``` + +Notice the use of the `^` and `$` anchors. In this crate, every expression +is executed with an implicit `.*?` at the beginning and end, which allows +it to match anywhere in the text. Anchors can be used to ensure that the +full text matches an expression. + +This example also demonstrates the utility of +[raw strings](https://doc.rust-lang.org/stable/reference/tokens.html#raw-string-literals) +in Rust, which +are just like regular strings except they are prefixed with an `r` and do +not process any escape sequences. For example, `"\\d"` is the same +expression as `r"\d"`. + +# Example: Avoid compiling the same regex in a loop + +It is an anti-pattern to compile the same regular expression in a loop +since compilation is typically expensive. (It takes anywhere from a few +microseconds to a few **milliseconds** depending on the size of the +regex.) Not only is compilation itself expensive, but this also prevents +optimizations that reuse allocations internally to the matching engines. + +In Rust, it can sometimes be a pain to pass regular expressions around if +they're used from inside a helper function. Instead, we recommend using the +[`lazy_static`](https://crates.io/crates/lazy_static) crate to ensure that +regular expressions are compiled exactly once. + +For example: + +```rust +use lazy_static::lazy_static; +use regex::Regex; + +fn some_helper_function(text: &str) -> bool { + lazy_static! { + static ref RE: Regex = Regex::new("...").unwrap(); + } + RE.is_match(text) +} + +fn main() {} +``` + +Specifically, in this example, the regex will be compiled when it is used for +the first time. On subsequent uses, it will reuse the previous compilation. + +# Example: iterating over capture groups + +This crate provides convenient iterators for matching an expression +repeatedly against a search string to find successive non-overlapping +matches. For example, to find all dates in a string and be able to access +them by their component pieces: + +```rust +# use regex::Regex; +# fn main() { +let re = Regex::new(r"(\d{4})-(\d{2})-(\d{2})").unwrap(); +let text = "2012-03-14, 2013-01-01 and 2014-07-05"; +for cap in re.captures_iter(text) { + println!("Month: {} Day: {} Year: {}", &cap[2], &cap[3], &cap[1]); +} +// Output: +// Month: 03 Day: 14 Year: 2012 +// Month: 01 Day: 01 Year: 2013 +// Month: 07 Day: 05 Year: 2014 +# } +``` + +Notice that the year is in the capture group indexed at `1`. This is +because the *entire match* is stored in the capture group at index `0`. + +# Example: replacement with named capture groups + +Building on the previous example, perhaps we'd like to rearrange the date +formats. This can be done with text replacement. But to make the code +clearer, we can *name* our capture groups and use those names as variables +in our replacement text: + +```rust +# use regex::Regex; +# fn main() { +let re = Regex::new(r"(?P<y>\d{4})-(?P<m>\d{2})-(?P<d>\d{2})").unwrap(); +let before = "2012-03-14, 2013-01-01 and 2014-07-05"; +let after = re.replace_all(before, "$m/$d/$y"); +assert_eq!(after, "03/14/2012, 01/01/2013 and 07/05/2014"); +# } +``` + +The `replace` methods are actually polymorphic in the replacement, which +provides more flexibility than is seen here. (See the documentation for +`Regex::replace` for more details.) + +Note that if your regex gets complicated, you can use the `x` flag to +enable insignificant whitespace mode, which also lets you write comments: + +```rust +# use regex::Regex; +# fn main() { +let re = Regex::new(r"(?x) + (?P<y>\d{4}) # the year + - + (?P<m>\d{2}) # the month + - + (?P<d>\d{2}) # the day +").unwrap(); +let before = "2012-03-14, 2013-01-01 and 2014-07-05"; +let after = re.replace_all(before, "$m/$d/$y"); +assert_eq!(after, "03/14/2012, 01/01/2013 and 07/05/2014"); +# } +``` + +If you wish to match against whitespace in this mode, you can still use `\s`, +`\n`, `\t`, etc. For escaping a single space character, you can escape it +directly with `\ `, use its hex character code `\x20` or temporarily disable +the `x` flag, e.g., `(?-x: )`. + +# Example: match multiple regular expressions simultaneously + +This demonstrates how to use a `RegexSet` to match multiple (possibly +overlapping) regular expressions in a single scan of the search text: + +```rust +use regex::RegexSet; + +let set = RegexSet::new(&[ + r"\w+", + r"\d+", + r"\pL+", + r"foo", + r"bar", + r"barfoo", + r"foobar", +]).unwrap(); + +// Iterate over and collect all of the matches. +let matches: Vec<_> = set.matches("foobar").into_iter().collect(); +assert_eq!(matches, vec![0, 2, 3, 4, 6]); + +// You can also test whether a particular regex matched: +let matches = set.matches("foobar"); +assert!(!matches.matched(5)); +assert!(matches.matched(6)); +``` + +# Pay for what you use + +With respect to searching text with a regular expression, there are three +questions that can be asked: + +1. Does the text match this expression? +2. If so, where does it match? +3. Where did the capturing groups match? + +Generally speaking, this crate could provide a function to answer only #3, +which would subsume #1 and #2 automatically. However, it can be significantly +more expensive to compute the location of capturing group matches, so it's best +not to do it if you don't need to. + +Therefore, only use what you need. For example, don't use `find` if you +only need to test if an expression matches a string. (Use `is_match` +instead.) + +# Unicode + +This implementation executes regular expressions **only** on valid UTF-8 +while exposing match locations as byte indices into the search string. (To +relax this restriction, use the [`bytes`](bytes/index.html) sub-module.) + +Only simple case folding is supported. Namely, when matching +case-insensitively, the characters are first mapped using the "simple" case +folding rules defined by Unicode. + +Regular expressions themselves are **only** interpreted as a sequence of +Unicode scalar values. This means you can use Unicode characters directly +in your expression: + +```rust +# use regex::Regex; +# fn main() { +let re = Regex::new(r"(?i)Δ+").unwrap(); +let mat = re.find("ΔδΔ").unwrap(); +assert_eq!((mat.start(), mat.end()), (0, 6)); +# } +``` + +Most features of the regular expressions in this crate are Unicode aware. Here +are some examples: + +* `.` will match any valid UTF-8 encoded Unicode scalar value except for `\n`. + (To also match `\n`, enable the `s` flag, e.g., `(?s:.)`.) +* `\w`, `\d` and `\s` are Unicode aware. For example, `\s` will match all forms + of whitespace categorized by Unicode. +* `\b` matches a Unicode word boundary. +* Negated character classes like `[^a]` match all Unicode scalar values except + for `a`. +* `^` and `$` are **not** Unicode aware in multi-line mode. Namely, they only + recognize `\n` and not any of the other forms of line terminators defined + by Unicode. + +Unicode general categories, scripts, script extensions, ages and a smattering +of boolean properties are available as character classes. For example, you can +match a sequence of numerals, Greek or Cherokee letters: + +```rust +# use regex::Regex; +# fn main() { +let re = Regex::new(r"[\pN\p{Greek}\p{Cherokee}]+").unwrap(); +let mat = re.find("abcΔᎠβⅠᏴγδⅡxyz").unwrap(); +assert_eq!((mat.start(), mat.end()), (3, 23)); +# } +``` + +For a more detailed breakdown of Unicode support with respect to +[UTS#18](https://unicode.org/reports/tr18/), +please see the +[UNICODE](https://github.com/rust-lang/regex/blob/master/UNICODE.md) +document in the root of the regex repository. + +# Opt out of Unicode support + +The `bytes` sub-module provides a `Regex` type that can be used to match +on `&[u8]`. By default, text is interpreted as UTF-8 just like it is with +the main `Regex` type. However, this behavior can be disabled by turning +off the `u` flag, even if doing so could result in matching invalid UTF-8. +For example, when the `u` flag is disabled, `.` will match any byte instead +of any Unicode scalar value. + +Disabling the `u` flag is also possible with the standard `&str`-based `Regex` +type, but it is only allowed where the UTF-8 invariant is maintained. For +example, `(?-u:\w)` is an ASCII-only `\w` character class and is legal in an +`&str`-based `Regex`, but `(?-u:\xFF)` will attempt to match the raw byte +`\xFF`, which is invalid UTF-8 and therefore is illegal in `&str`-based +regexes. + +Finally, since Unicode support requires bundling large Unicode data +tables, this crate exposes knobs to disable the compilation of those +data tables, which can be useful for shrinking binary size and reducing +compilation times. For details on how to do that, see the section on [crate +features](#crate-features). + +# Syntax + +The syntax supported in this crate is documented below. + +Note that the regular expression parser and abstract syntax are exposed in +a separate crate, [`regex-syntax`](https://docs.rs/regex-syntax). + +## Matching one character + +<pre class="rust"> +. any character except new line (includes new line with s flag) +\d digit (\p{Nd}) +\D not digit +\pN One-letter name Unicode character class +\p{Greek} Unicode character class (general category or script) +\PN Negated one-letter name Unicode character class +\P{Greek} negated Unicode character class (general category or script) +</pre> + +### Character classes + +<pre class="rust"> +[xyz] A character class matching either x, y or z (union). +[^xyz] A character class matching any character except x, y and z. +[a-z] A character class matching any character in range a-z. +[[:alpha:]] ASCII character class ([A-Za-z]) +[[:^alpha:]] Negated ASCII character class ([^A-Za-z]) +[x[^xyz]] Nested/grouping character class (matching any character except y and z) +[a-y&&xyz] Intersection (matching x or y) +[0-9&&[^4]] Subtraction using intersection and negation (matching 0-9 except 4) +[0-9--4] Direct subtraction (matching 0-9 except 4) +[a-g~~b-h] Symmetric difference (matching `a` and `h` only) +[\[\]] Escaping in character classes (matching [ or ]) +</pre> + +Any named character class may appear inside a bracketed `[...]` character +class. For example, `[\p{Greek}[:digit:]]` matches any Greek or ASCII +digit. `[\p{Greek}&&\pL]` matches Greek letters. + +Precedence in character classes, from most binding to least: + +1. Ranges: `a-cd` == `[a-c]d` +2. Union: `ab&&bc` == `[ab]&&[bc]` +3. Intersection: `^a-z&&b` == `^[a-z&&b]` +4. Negation + +## Composites + +<pre class="rust"> +xy concatenation (x followed by y) +x|y alternation (x or y, prefer x) +</pre> + +## Repetitions + +<pre class="rust"> +x* zero or more of x (greedy) +x+ one or more of x (greedy) +x? zero or one of x (greedy) +x*? zero or more of x (ungreedy/lazy) +x+? one or more of x (ungreedy/lazy) +x?? zero or one of x (ungreedy/lazy) +x{n,m} at least n x and at most m x (greedy) +x{n,} at least n x (greedy) +x{n} exactly n x +x{n,m}? at least n x and at most m x (ungreedy/lazy) +x{n,}? at least n x (ungreedy/lazy) +x{n}? exactly n x +</pre> + +## Empty matches + +<pre class="rust"> +^ the beginning of text (or start-of-line with multi-line mode) +$ the end of text (or end-of-line with multi-line mode) +\A only the beginning of text (even with multi-line mode enabled) +\z only the end of text (even with multi-line mode enabled) +\b a Unicode word boundary (\w on one side and \W, \A, or \z on other) +\B not a Unicode word boundary +</pre> + +The empty regex is valid and matches the empty string. For example, the empty +regex matches `abc` at positions `0`, `1`, `2` and `3`. + +## Grouping and flags + +<pre class="rust"> +(exp) numbered capture group (indexed by opening parenthesis) +(?P<name>exp) named (also numbered) capture group (allowed chars: [_0-9a-zA-Z.\[\]]) +(?:exp) non-capturing group +(?flags) set flags within current group +(?flags:exp) set flags for exp (non-capturing) +</pre> + +Flags are each a single character. For example, `(?x)` sets the flag `x` +and `(?-x)` clears the flag `x`. Multiple flags can be set or cleared at +the same time: `(?xy)` sets both the `x` and `y` flags and `(?x-y)` sets +the `x` flag and clears the `y` flag. + +All flags are by default disabled unless stated otherwise. They are: + +<pre class="rust"> +i case-insensitive: letters match both upper and lower case +m multi-line mode: ^ and $ match begin/end of line +s allow . to match \n +U swap the meaning of x* and x*? +u Unicode support (enabled by default) +x ignore whitespace and allow line comments (starting with `#`) +</pre> + +Flags can be toggled within a pattern. Here's an example that matches +case-insensitively for the first part but case-sensitively for the second part: + +```rust +# use regex::Regex; +# fn main() { +let re = Regex::new(r"(?i)a+(?-i)b+").unwrap(); +let cap = re.captures("AaAaAbbBBBb").unwrap(); +assert_eq!(&cap[0], "AaAaAbb"); +# } +``` + +Notice that the `a+` matches either `a` or `A`, but the `b+` only matches +`b`. + +Multi-line mode means `^` and `$` no longer match just at the beginning/end of +the input, but at the beginning/end of lines: + +``` +# use regex::Regex; +let re = Regex::new(r"(?m)^line \d+").unwrap(); +let m = re.find("line one\nline 2\n").unwrap(); +assert_eq!(m.as_str(), "line 2"); +``` + +Note that `^` matches after new lines, even at the end of input: + +``` +# use regex::Regex; +let re = Regex::new(r"(?m)^").unwrap(); +let m = re.find_iter("test\n").last().unwrap(); +assert_eq!((m.start(), m.end()), (5, 5)); +``` + +Here is an example that uses an ASCII word boundary instead of a Unicode +word boundary: + +```rust +# use regex::Regex; +# fn main() { +let re = Regex::new(r"(?-u:\b).+(?-u:\b)").unwrap(); +let cap = re.captures("$$abc$$").unwrap(); +assert_eq!(&cap[0], "abc"); +# } +``` + +## Escape sequences + +<pre class="rust"> +\* literal *, works for any punctuation character: \.+*?()|[]{}^$ +\a bell (\x07) +\f form feed (\x0C) +\t horizontal tab +\n new line +\r carriage return +\v vertical tab (\x0B) +\123 octal character code (up to three digits) (when enabled) +\x7F hex character code (exactly two digits) +\x{10FFFF} any hex character code corresponding to a Unicode code point +\u007F hex character code (exactly four digits) +\u{7F} any hex character code corresponding to a Unicode code point +\U0000007F hex character code (exactly eight digits) +\U{7F} any hex character code corresponding to a Unicode code point +</pre> + +## Perl character classes (Unicode friendly) + +These classes are based on the definitions provided in +[UTS#18](https://www.unicode.org/reports/tr18/#Compatibility_Properties): + +<pre class="rust"> +\d digit (\p{Nd}) +\D not digit +\s whitespace (\p{White_Space}) +\S not whitespace +\w word character (\p{Alphabetic} + \p{M} + \d + \p{Pc} + \p{Join_Control}) +\W not word character +</pre> + +## ASCII character classes + +<pre class="rust"> +[[:alnum:]] alphanumeric ([0-9A-Za-z]) +[[:alpha:]] alphabetic ([A-Za-z]) +[[:ascii:]] ASCII ([\x00-\x7F]) +[[:blank:]] blank ([\t ]) +[[:cntrl:]] control ([\x00-\x1F\x7F]) +[[:digit:]] digits ([0-9]) +[[:graph:]] graphical ([!-~]) +[[:lower:]] lower case ([a-z]) +[[:print:]] printable ([ -~]) +[[:punct:]] punctuation ([!-/:-@\[-`{-~]) +[[:space:]] whitespace ([\t\n\v\f\r ]) +[[:upper:]] upper case ([A-Z]) +[[:word:]] word characters ([0-9A-Za-z_]) +[[:xdigit:]] hex digit ([0-9A-Fa-f]) +</pre> + +# Crate features + +By default, this crate tries pretty hard to make regex matching both as fast +as possible and as correct as it can be, within reason. This means that there +is a lot of code dedicated to performance, the handling of Unicode data and the +Unicode data itself. Overall, this leads to more dependencies, larger binaries +and longer compile times. This trade off may not be appropriate in all cases, +and indeed, even when all Unicode and performance features are disabled, one +is still left with a perfectly serviceable regex engine that will work well +in many cases. + +This crate exposes a number of features for controlling that trade off. Some +of these features are strictly performance oriented, such that disabling them +won't result in a loss of functionality, but may result in worse performance. +Other features, such as the ones controlling the presence or absence of Unicode +data, can result in a loss of functionality. For example, if one disables the +`unicode-case` feature (described below), then compiling the regex `(?i)a` +will fail since Unicode case insensitivity is enabled by default. Instead, +callers must use `(?i-u)a` instead to disable Unicode case folding. Stated +differently, enabling or disabling any of the features below can only add or +subtract from the total set of valid regular expressions. Enabling or disabling +a feature will never modify the match semantics of a regular expression. + +All features below are enabled by default. + +### Ecosystem features + +* **std** - + When enabled, this will cause `regex` to use the standard library. Currently, + disabling this feature will always result in a compilation error. It is + intended to add `alloc`-only support to regex in the future. + +### Performance features + +* **perf** - + Enables all performance related features. This feature is enabled by default + and will always cover all features that improve performance, even if more + are added in the future. +* **perf-dfa** - + Enables the use of a lazy DFA for matching. The lazy DFA is used to compile + portions of a regex to a very fast DFA on an as-needed basis. This can + result in substantial speedups, usually by an order of magnitude on large + haystacks. The lazy DFA does not bring in any new dependencies, but it can + make compile times longer. +* **perf-inline** - + Enables the use of aggressive inlining inside match routines. This reduces + the overhead of each match. The aggressive inlining, however, increases + compile times and binary size. +* **perf-literal** - + Enables the use of literal optimizations for speeding up matches. In some + cases, literal optimizations can result in speedups of _several_ orders of + magnitude. Disabling this drops the `aho-corasick` and `memchr` dependencies. +* **perf-cache** - + This feature used to enable a faster internal cache at the cost of using + additional dependencies, but this is no longer an option. A fast internal + cache is now used unconditionally with no additional dependencies. This may + change in the future. + +### Unicode features + +* **unicode** - + Enables all Unicode features. This feature is enabled by default, and will + always cover all Unicode features, even if more are added in the future. +* **unicode-age** - + Provide the data for the + [Unicode `Age` property](https://www.unicode.org/reports/tr44/tr44-24.html#Character_Age). + This makes it possible to use classes like `\p{Age:6.0}` to refer to all + codepoints first introduced in Unicode 6.0 +* **unicode-bool** - + Provide the data for numerous Unicode boolean properties. The full list + is not included here, but contains properties like `Alphabetic`, `Emoji`, + `Lowercase`, `Math`, `Uppercase` and `White_Space`. +* **unicode-case** - + Provide the data for case insensitive matching using + [Unicode's "simple loose matches" specification](https://www.unicode.org/reports/tr18/#Simple_Loose_Matches). +* **unicode-gencat** - + Provide the data for + [Unicode general categories](https://www.unicode.org/reports/tr44/tr44-24.html#General_Category_Values). + This includes, but is not limited to, `Decimal_Number`, `Letter`, + `Math_Symbol`, `Number` and `Punctuation`. +* **unicode-perl** - + Provide the data for supporting the Unicode-aware Perl character classes, + corresponding to `\w`, `\s` and `\d`. This is also necessary for using + Unicode-aware word boundary assertions. Note that if this feature is + disabled, the `\s` and `\d` character classes are still available if the + `unicode-bool` and `unicode-gencat` features are enabled, respectively. +* **unicode-script** - + Provide the data for + [Unicode scripts and script extensions](https://www.unicode.org/reports/tr24/). + This includes, but is not limited to, `Arabic`, `Cyrillic`, `Hebrew`, + `Latin` and `Thai`. +* **unicode-segment** - + Provide the data necessary to provide the properties used to implement the + [Unicode text segmentation algorithms](https://www.unicode.org/reports/tr29/). + This enables using classes like `\p{gcb=Extend}`, `\p{wb=Katakana}` and + `\p{sb=ATerm}`. + + +# Untrusted input + +This crate can handle both untrusted regular expressions and untrusted +search text. + +Untrusted regular expressions are handled by capping the size of a compiled +regular expression. +(See [`RegexBuilder::size_limit`](struct.RegexBuilder.html#method.size_limit).) +Without this, it would be trivial for an attacker to exhaust your system's +memory with expressions like `a{100}{100}{100}`. + +Untrusted search text is allowed because the matching engine(s) in this +crate have time complexity `O(mn)` (with `m ~ regex` and `n ~ search +text`), which means there's no way to cause exponential blow-up like with +some other regular expression engines. (We pay for this by disallowing +features like arbitrary look-ahead and backreferences.) + +When a DFA is used, pathological cases with exponential state blow-up are +avoided by constructing the DFA lazily or in an "online" manner. Therefore, +at most one new state can be created for each byte of input. This satisfies +our time complexity guarantees, but can lead to memory growth +proportional to the size of the input. As a stopgap, the DFA is only +allowed to store a fixed number of states. When the limit is reached, its +states are wiped and continues on, possibly duplicating previous work. If +the limit is reached too frequently, it gives up and hands control off to +another matching engine with fixed memory requirements. +(The DFA size limit can also be tweaked. See +[`RegexBuilder::dfa_size_limit`](struct.RegexBuilder.html#method.dfa_size_limit).) +*/ + +#![deny(missing_docs)] +#![cfg_attr(feature = "pattern", feature(pattern))] +#![warn(missing_debug_implementations)] + +#[cfg(not(feature = "std"))] +compile_error!("`std` feature is currently required to build this crate"); + +// To check README's example +// TODO: Re-enable this once the MSRV is 1.43 or greater. +// See: https://github.com/rust-lang/regex/issues/684 +// See: https://github.com/rust-lang/regex/issues/685 +// #[cfg(doctest)] +// doc_comment::doctest!("../README.md"); + +#[cfg(feature = "std")] +pub use crate::error::Error; +#[cfg(feature = "std")] +pub use crate::re_builder::set_unicode::*; +#[cfg(feature = "std")] +pub use crate::re_builder::unicode::*; +#[cfg(feature = "std")] +pub use crate::re_set::unicode::*; +#[cfg(feature = "std")] +pub use crate::re_unicode::{ + escape, CaptureLocations, CaptureMatches, CaptureNames, Captures, + Locations, Match, Matches, NoExpand, Regex, Replacer, ReplacerRef, Split, + SplitN, SubCaptureMatches, +}; + +/** +Match regular expressions on arbitrary bytes. + +This module provides a nearly identical API to the one found in the +top-level of this crate. There are two important differences: + +1. Matching is done on `&[u8]` instead of `&str`. Additionally, `Vec<u8>` +is used where `String` would have been used. +2. Unicode support can be disabled even when disabling it would result in +matching invalid UTF-8 bytes. + +# Example: match null terminated string + +This shows how to find all null-terminated strings in a slice of bytes: + +```rust +# use regex::bytes::Regex; +let re = Regex::new(r"(?-u)(?P<cstr>[^\x00]+)\x00").unwrap(); +let text = b"foo\x00bar\x00baz\x00"; + +// Extract all of the strings without the null terminator from each match. +// The unwrap is OK here since a match requires the `cstr` capture to match. +let cstrs: Vec<&[u8]> = + re.captures_iter(text) + .map(|c| c.name("cstr").unwrap().as_bytes()) + .collect(); +assert_eq!(vec![&b"foo"[..], &b"bar"[..], &b"baz"[..]], cstrs); +``` + +# Example: selectively enable Unicode support + +This shows how to match an arbitrary byte pattern followed by a UTF-8 encoded +string (e.g., to extract a title from a Matroska file): + +```rust +# use std::str; +# use regex::bytes::Regex; +let re = Regex::new( + r"(?-u)\x7b\xa9(?:[\x80-\xfe]|[\x40-\xff].)(?u:(.*))" +).unwrap(); +let text = b"\x12\xd0\x3b\x5f\x7b\xa9\x85\xe2\x98\x83\x80\x98\x54\x76\x68\x65"; +let caps = re.captures(text).unwrap(); + +// Notice that despite the `.*` at the end, it will only match valid UTF-8 +// because Unicode mode was enabled with the `u` flag. Without the `u` flag, +// the `.*` would match the rest of the bytes. +let mat = caps.get(1).unwrap(); +assert_eq!((7, 10), (mat.start(), mat.end())); + +// If there was a match, Unicode mode guarantees that `title` is valid UTF-8. +let title = str::from_utf8(&caps[1]).unwrap(); +assert_eq!("☃", title); +``` + +In general, if the Unicode flag is enabled in a capture group and that capture +is part of the overall match, then the capture is *guaranteed* to be valid +UTF-8. + +# Syntax + +The supported syntax is pretty much the same as the syntax for Unicode +regular expressions with a few changes that make sense for matching arbitrary +bytes: + +1. The `u` flag can be disabled even when disabling it might cause the regex to +match invalid UTF-8. When the `u` flag is disabled, the regex is said to be in +"ASCII compatible" mode. +2. In ASCII compatible mode, neither Unicode scalar values nor Unicode +character classes are allowed. +3. In ASCII compatible mode, Perl character classes (`\w`, `\d` and `\s`) +revert to their typical ASCII definition. `\w` maps to `[[:word:]]`, `\d` maps +to `[[:digit:]]` and `\s` maps to `[[:space:]]`. +4. In ASCII compatible mode, word boundaries use the ASCII compatible `\w` to +determine whether a byte is a word byte or not. +5. Hexadecimal notation can be used to specify arbitrary bytes instead of +Unicode codepoints. For example, in ASCII compatible mode, `\xFF` matches the +literal byte `\xFF`, while in Unicode mode, `\xFF` is a Unicode codepoint that +matches its UTF-8 encoding of `\xC3\xBF`. Similarly for octal notation when +enabled. +6. In ASCII compatible mode, `.` matches any *byte* except for `\n`. When the +`s` flag is additionally enabled, `.` matches any byte. + +# Performance + +In general, one should expect performance on `&[u8]` to be roughly similar to +performance on `&str`. +*/ +#[cfg(feature = "std")] +pub mod bytes { + pub use crate::re_builder::bytes::*; + pub use crate::re_builder::set_bytes::*; + pub use crate::re_bytes::*; + pub use crate::re_set::bytes::*; +} + +mod backtrack; +mod compile; +#[cfg(feature = "perf-dfa")] +mod dfa; +mod error; +mod exec; +mod expand; +mod find_byte; +mod input; +mod literal; +#[cfg(feature = "pattern")] +mod pattern; +mod pikevm; +mod pool; +mod prog; +mod re_builder; +mod re_bytes; +mod re_set; +mod re_trait; +mod re_unicode; +mod sparse; +mod utf8; + +/// The `internal` module exists to support suspicious activity, such as +/// testing different matching engines and supporting the `regex-debug` CLI +/// utility. +#[doc(hidden)] +#[cfg(feature = "std")] +pub mod internal { + pub use crate::compile::Compiler; + pub use crate::exec::{Exec, ExecBuilder}; + pub use crate::input::{Char, CharInput, Input, InputAt}; + pub use crate::literal::LiteralSearcher; + pub use crate::prog::{EmptyLook, Inst, InstRanges, Program}; +} diff --git a/third_party/rust/regex/src/literal/imp.rs b/third_party/rust/regex/src/literal/imp.rs new file mode 100644 index 0000000000..90b2f11606 --- /dev/null +++ b/third_party/rust/regex/src/literal/imp.rs @@ -0,0 +1,402 @@ +use std::mem; + +use aho_corasick::{self, packed, AhoCorasick, AhoCorasickBuilder}; +use memchr::{memchr, memchr2, memchr3, memmem}; +use regex_syntax::hir::literal::{Literal, Literals}; + +/// A prefix extracted from a compiled regular expression. +/// +/// A regex prefix is a set of literal strings that *must* be matched at the +/// beginning of a regex in order for the entire regex to match. Similarly +/// for a regex suffix. +#[derive(Clone, Debug)] +pub struct LiteralSearcher { + complete: bool, + lcp: Memmem, + lcs: Memmem, + matcher: Matcher, +} + +#[derive(Clone, Debug)] +enum Matcher { + /// No literals. (Never advances through the input.) + Empty, + /// A set of four or more single byte literals. + Bytes(SingleByteSet), + /// A single substring, using vector accelerated routines when available. + Memmem(Memmem), + /// An Aho-Corasick automaton. + AC { ac: AhoCorasick<u32>, lits: Vec<Literal> }, + /// A packed multiple substring searcher, using SIMD. + /// + /// Note that Aho-Corasick will actually use this packed searcher + /// internally automatically, however, there is some overhead associated + /// with going through the Aho-Corasick machinery. So using the packed + /// searcher directly results in some gains. + Packed { s: packed::Searcher, lits: Vec<Literal> }, +} + +impl LiteralSearcher { + /// Returns a matcher that never matches and never advances the input. + pub fn empty() -> Self { + Self::new(Literals::empty(), Matcher::Empty) + } + + /// Returns a matcher for literal prefixes from the given set. + pub fn prefixes(lits: Literals) -> Self { + let matcher = Matcher::prefixes(&lits); + Self::new(lits, matcher) + } + + /// Returns a matcher for literal suffixes from the given set. + pub fn suffixes(lits: Literals) -> Self { + let matcher = Matcher::suffixes(&lits); + Self::new(lits, matcher) + } + + fn new(lits: Literals, matcher: Matcher) -> Self { + let complete = lits.all_complete(); + LiteralSearcher { + complete, + lcp: Memmem::new(lits.longest_common_prefix()), + lcs: Memmem::new(lits.longest_common_suffix()), + matcher, + } + } + + /// Returns true if all matches comprise the entire regular expression. + /// + /// This does not necessarily mean that a literal match implies a match + /// of the regular expression. For example, the regular expression `^a` + /// is comprised of a single complete literal `a`, but the regular + /// expression demands that it only match at the beginning of a string. + pub fn complete(&self) -> bool { + self.complete && !self.is_empty() + } + + /// Find the position of a literal in `haystack` if it exists. + #[cfg_attr(feature = "perf-inline", inline(always))] + pub fn find(&self, haystack: &[u8]) -> Option<(usize, usize)> { + use self::Matcher::*; + match self.matcher { + Empty => Some((0, 0)), + Bytes(ref sset) => sset.find(haystack).map(|i| (i, i + 1)), + Memmem(ref s) => s.find(haystack).map(|i| (i, i + s.len())), + AC { ref ac, .. } => { + ac.find(haystack).map(|m| (m.start(), m.end())) + } + Packed { ref s, .. } => { + s.find(haystack).map(|m| (m.start(), m.end())) + } + } + } + + /// Like find, except matches must start at index `0`. + pub fn find_start(&self, haystack: &[u8]) -> Option<(usize, usize)> { + for lit in self.iter() { + if lit.len() > haystack.len() { + continue; + } + if lit == &haystack[0..lit.len()] { + return Some((0, lit.len())); + } + } + None + } + + /// Like find, except matches must end at index `haystack.len()`. + pub fn find_end(&self, haystack: &[u8]) -> Option<(usize, usize)> { + for lit in self.iter() { + if lit.len() > haystack.len() { + continue; + } + if lit == &haystack[haystack.len() - lit.len()..] { + return Some((haystack.len() - lit.len(), haystack.len())); + } + } + None + } + + /// Returns an iterator over all literals to be matched. + pub fn iter(&self) -> LiteralIter<'_> { + match self.matcher { + Matcher::Empty => LiteralIter::Empty, + Matcher::Bytes(ref sset) => LiteralIter::Bytes(&sset.dense), + Matcher::Memmem(ref s) => LiteralIter::Single(&s.finder.needle()), + Matcher::AC { ref lits, .. } => LiteralIter::AC(lits), + Matcher::Packed { ref lits, .. } => LiteralIter::Packed(lits), + } + } + + /// Returns a matcher for the longest common prefix of this matcher. + pub fn lcp(&self) -> &Memmem { + &self.lcp + } + + /// Returns a matcher for the longest common suffix of this matcher. + pub fn lcs(&self) -> &Memmem { + &self.lcs + } + + /// Returns true iff this prefix is empty. + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Returns the number of prefixes in this machine. + pub fn len(&self) -> usize { + use self::Matcher::*; + match self.matcher { + Empty => 0, + Bytes(ref sset) => sset.dense.len(), + Memmem(_) => 1, + AC { ref ac, .. } => ac.pattern_count(), + Packed { ref lits, .. } => lits.len(), + } + } + + /// Return the approximate heap usage of literals in bytes. + pub fn approximate_size(&self) -> usize { + use self::Matcher::*; + match self.matcher { + Empty => 0, + Bytes(ref sset) => sset.approximate_size(), + Memmem(ref single) => single.approximate_size(), + AC { ref ac, .. } => ac.heap_bytes(), + Packed { ref s, .. } => s.heap_bytes(), + } + } +} + +impl Matcher { + fn prefixes(lits: &Literals) -> Self { + let sset = SingleByteSet::prefixes(lits); + Matcher::new(lits, sset) + } + + fn suffixes(lits: &Literals) -> Self { + let sset = SingleByteSet::suffixes(lits); + Matcher::new(lits, sset) + } + + fn new(lits: &Literals, sset: SingleByteSet) -> Self { + if lits.literals().is_empty() { + return Matcher::Empty; + } + if sset.dense.len() >= 26 { + // Avoid trying to match a large number of single bytes. + // This is *very* sensitive to a frequency analysis comparison + // between the bytes in sset and the composition of the haystack. + // No matter the size of sset, if its members all are rare in the + // haystack, then it'd be worth using it. How to tune this... IDK. + // ---AG + return Matcher::Empty; + } + if sset.complete { + return Matcher::Bytes(sset); + } + if lits.literals().len() == 1 { + return Matcher::Memmem(Memmem::new(&lits.literals()[0])); + } + + let pats = lits.literals().to_owned(); + let is_aho_corasick_fast = sset.dense.len() <= 1 && sset.all_ascii; + if lits.literals().len() <= 100 && !is_aho_corasick_fast { + let mut builder = packed::Config::new() + .match_kind(packed::MatchKind::LeftmostFirst) + .builder(); + if let Some(s) = builder.extend(&pats).build() { + return Matcher::Packed { s, lits: pats }; + } + } + let ac = AhoCorasickBuilder::new() + .match_kind(aho_corasick::MatchKind::LeftmostFirst) + .dfa(true) + .build_with_size::<u32, _, _>(&pats) + .unwrap(); + Matcher::AC { ac, lits: pats } + } +} + +#[derive(Debug)] +pub enum LiteralIter<'a> { + Empty, + Bytes(&'a [u8]), + Single(&'a [u8]), + AC(&'a [Literal]), + Packed(&'a [Literal]), +} + +impl<'a> Iterator for LiteralIter<'a> { + type Item = &'a [u8]; + + fn next(&mut self) -> Option<Self::Item> { + match *self { + LiteralIter::Empty => None, + LiteralIter::Bytes(ref mut many) => { + if many.is_empty() { + None + } else { + let next = &many[0..1]; + *many = &many[1..]; + Some(next) + } + } + LiteralIter::Single(ref mut one) => { + if one.is_empty() { + None + } else { + let next = &one[..]; + *one = &[]; + Some(next) + } + } + LiteralIter::AC(ref mut lits) => { + if lits.is_empty() { + None + } else { + let next = &lits[0]; + *lits = &lits[1..]; + Some(&**next) + } + } + LiteralIter::Packed(ref mut lits) => { + if lits.is_empty() { + None + } else { + let next = &lits[0]; + *lits = &lits[1..]; + Some(&**next) + } + } + } + } +} + +#[derive(Clone, Debug)] +struct SingleByteSet { + sparse: Vec<bool>, + dense: Vec<u8>, + complete: bool, + all_ascii: bool, +} + +impl SingleByteSet { + fn new() -> SingleByteSet { + SingleByteSet { + sparse: vec![false; 256], + dense: vec![], + complete: true, + all_ascii: true, + } + } + + fn prefixes(lits: &Literals) -> SingleByteSet { + let mut sset = SingleByteSet::new(); + for lit in lits.literals() { + sset.complete = sset.complete && lit.len() == 1; + if let Some(&b) = lit.get(0) { + if !sset.sparse[b as usize] { + if b > 0x7F { + sset.all_ascii = false; + } + sset.dense.push(b); + sset.sparse[b as usize] = true; + } + } + } + sset + } + + fn suffixes(lits: &Literals) -> SingleByteSet { + let mut sset = SingleByteSet::new(); + for lit in lits.literals() { + sset.complete = sset.complete && lit.len() == 1; + if let Some(&b) = lit.get(lit.len().checked_sub(1).unwrap()) { + if !sset.sparse[b as usize] { + if b > 0x7F { + sset.all_ascii = false; + } + sset.dense.push(b); + sset.sparse[b as usize] = true; + } + } + } + sset + } + + /// Faster find that special cases certain sizes to use memchr. + #[cfg_attr(feature = "perf-inline", inline(always))] + fn find(&self, text: &[u8]) -> Option<usize> { + match self.dense.len() { + 0 => None, + 1 => memchr(self.dense[0], text), + 2 => memchr2(self.dense[0], self.dense[1], text), + 3 => memchr3(self.dense[0], self.dense[1], self.dense[2], text), + _ => self._find(text), + } + } + + /// Generic find that works on any sized set. + fn _find(&self, haystack: &[u8]) -> Option<usize> { + for (i, &b) in haystack.iter().enumerate() { + if self.sparse[b as usize] { + return Some(i); + } + } + None + } + + fn approximate_size(&self) -> usize { + (self.dense.len() * mem::size_of::<u8>()) + + (self.sparse.len() * mem::size_of::<bool>()) + } +} + +/// A simple wrapper around the memchr crate's memmem implementation. +/// +/// The API this exposes mirrors the API of previous substring searchers that +/// this supplanted. +#[derive(Clone, Debug)] +pub struct Memmem { + finder: memmem::Finder<'static>, + char_len: usize, +} + +impl Memmem { + fn new(pat: &[u8]) -> Memmem { + Memmem { + finder: memmem::Finder::new(pat).into_owned(), + char_len: char_len_lossy(pat), + } + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub fn find(&self, haystack: &[u8]) -> Option<usize> { + self.finder.find(haystack) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub fn is_suffix(&self, text: &[u8]) -> bool { + if text.len() < self.len() { + return false; + } + &text[text.len() - self.len()..] == self.finder.needle() + } + + pub fn len(&self) -> usize { + self.finder.needle().len() + } + + pub fn char_len(&self) -> usize { + self.char_len + } + + fn approximate_size(&self) -> usize { + self.finder.needle().len() * mem::size_of::<u8>() + } +} + +fn char_len_lossy(bytes: &[u8]) -> usize { + String::from_utf8_lossy(bytes).chars().count() +} diff --git a/third_party/rust/regex/src/literal/mod.rs b/third_party/rust/regex/src/literal/mod.rs new file mode 100644 index 0000000000..980f523309 --- /dev/null +++ b/third_party/rust/regex/src/literal/mod.rs @@ -0,0 +1,55 @@ +pub use self::imp::*; + +#[cfg(feature = "perf-literal")] +mod imp; + +#[allow(missing_docs)] +#[cfg(not(feature = "perf-literal"))] +mod imp { + use regex_syntax::hir::literal::Literals; + + #[derive(Clone, Debug)] + pub struct LiteralSearcher(()); + + impl LiteralSearcher { + pub fn empty() -> Self { + LiteralSearcher(()) + } + + pub fn prefixes(_: Literals) -> Self { + LiteralSearcher(()) + } + + pub fn suffixes(_: Literals) -> Self { + LiteralSearcher(()) + } + + pub fn complete(&self) -> bool { + false + } + + pub fn find(&self, _: &[u8]) -> Option<(usize, usize)> { + unreachable!() + } + + pub fn find_start(&self, _: &[u8]) -> Option<(usize, usize)> { + unreachable!() + } + + pub fn find_end(&self, _: &[u8]) -> Option<(usize, usize)> { + unreachable!() + } + + pub fn is_empty(&self) -> bool { + true + } + + pub fn len(&self) -> usize { + 0 + } + + pub fn approximate_size(&self) -> usize { + 0 + } + } +} diff --git a/third_party/rust/regex/src/pattern.rs b/third_party/rust/regex/src/pattern.rs new file mode 100644 index 0000000000..00549e5106 --- /dev/null +++ b/third_party/rust/regex/src/pattern.rs @@ -0,0 +1,63 @@ +use std::str::pattern::{Pattern, SearchStep, Searcher}; + +use crate::re_unicode::{Matches, Regex}; + +#[derive(Debug)] +pub struct RegexSearcher<'r, 't> { + haystack: &'t str, + it: Matches<'r, 't>, + last_step_end: usize, + next_match: Option<(usize, usize)>, +} + +impl<'r, 't> Pattern<'t> for &'r Regex { + type Searcher = RegexSearcher<'r, 't>; + + fn into_searcher(self, haystack: &'t str) -> RegexSearcher<'r, 't> { + RegexSearcher { + haystack, + it: self.find_iter(haystack), + last_step_end: 0, + next_match: None, + } + } +} + +unsafe impl<'r, 't> Searcher<'t> for RegexSearcher<'r, 't> { + #[inline] + fn haystack(&self) -> &'t str { + self.haystack + } + + #[inline] + fn next(&mut self) -> SearchStep { + if let Some((s, e)) = self.next_match { + self.next_match = None; + self.last_step_end = e; + return SearchStep::Match(s, e); + } + match self.it.next() { + None => { + if self.last_step_end < self.haystack().len() { + let last = self.last_step_end; + self.last_step_end = self.haystack().len(); + SearchStep::Reject(last, self.haystack().len()) + } else { + SearchStep::Done + } + } + Some(m) => { + let (s, e) = (m.start(), m.end()); + if s == self.last_step_end { + self.last_step_end = e; + SearchStep::Match(s, e) + } else { + self.next_match = Some((s, e)); + let last = self.last_step_end; + self.last_step_end = s; + SearchStep::Reject(last, s) + } + } + } + } +} diff --git a/third_party/rust/regex/src/pikevm.rs b/third_party/rust/regex/src/pikevm.rs new file mode 100644 index 0000000000..8c9eac2d39 --- /dev/null +++ b/third_party/rust/regex/src/pikevm.rs @@ -0,0 +1,360 @@ +// This module implements the Pike VM. That is, it guarantees linear time +// search of a regex on any text with memory use proportional to the size of +// the regex. +// +// It is equal in power to the backtracking engine in this crate, except the +// backtracking engine is typically faster on small regexes/texts at the +// expense of a bigger memory footprint. +// +// It can do more than the DFA can (specifically, record capture locations +// and execute Unicode word boundary assertions), but at a slower speed. +// Specifically, the Pike VM executes a DFA implicitly by repeatedly expanding +// epsilon transitions. That is, the Pike VM engine can be in multiple states +// at once where as the DFA is only ever in one state at a time. +// +// Therefore, the Pike VM is generally treated as the fallback when the other +// matching engines either aren't feasible to run or are insufficient. + +use std::mem; + +use crate::exec::ProgramCache; +use crate::input::{Input, InputAt}; +use crate::prog::{InstPtr, Program}; +use crate::re_trait::Slot; +use crate::sparse::SparseSet; + +/// An NFA simulation matching engine. +#[derive(Debug)] +pub struct Fsm<'r, I> { + /// The sequence of opcodes (among other things) that is actually executed. + /// + /// The program may be byte oriented or Unicode codepoint oriented. + prog: &'r Program, + /// An explicit stack used for following epsilon transitions. (This is + /// borrowed from the cache.) + stack: &'r mut Vec<FollowEpsilon>, + /// The input to search. + input: I, +} + +/// A cached allocation that can be reused on each execution. +#[derive(Clone, Debug)] +pub struct Cache { + /// A pair of ordered sets for tracking NFA states. + clist: Threads, + nlist: Threads, + /// An explicit stack used for following epsilon transitions. + stack: Vec<FollowEpsilon>, +} + +/// An ordered set of NFA states and their captures. +#[derive(Clone, Debug)] +struct Threads { + /// An ordered set of opcodes (each opcode is an NFA state). + set: SparseSet, + /// Captures for every NFA state. + /// + /// It is stored in row-major order, where the columns are the capture + /// slots and the rows are the states. + caps: Vec<Slot>, + /// The number of capture slots stored per thread. (Every capture has + /// two slots.) + slots_per_thread: usize, +} + +/// A representation of an explicit stack frame when following epsilon +/// transitions. This is used to avoid recursion. +#[derive(Clone, Debug)] +enum FollowEpsilon { + /// Follow transitions at the given instruction pointer. + IP(InstPtr), + /// Restore the capture slot with the given position in the input. + Capture { slot: usize, pos: Slot }, +} + +impl Cache { + /// Create a new allocation used by the NFA machine to record execution + /// and captures. + pub fn new(_prog: &Program) -> Self { + Cache { clist: Threads::new(), nlist: Threads::new(), stack: vec![] } + } +} + +impl<'r, I: Input> Fsm<'r, I> { + /// Execute the NFA matching engine. + /// + /// If there's a match, `exec` returns `true` and populates the given + /// captures accordingly. + pub fn exec( + prog: &'r Program, + cache: &ProgramCache, + matches: &mut [bool], + slots: &mut [Slot], + quit_after_match: bool, + input: I, + start: usize, + end: usize, + ) -> bool { + let mut cache = cache.borrow_mut(); + let cache = &mut cache.pikevm; + cache.clist.resize(prog.len(), prog.captures.len()); + cache.nlist.resize(prog.len(), prog.captures.len()); + let at = input.at(start); + Fsm { prog, stack: &mut cache.stack, input }.exec_( + &mut cache.clist, + &mut cache.nlist, + matches, + slots, + quit_after_match, + at, + end, + ) + } + + fn exec_( + &mut self, + mut clist: &mut Threads, + mut nlist: &mut Threads, + matches: &mut [bool], + slots: &mut [Slot], + quit_after_match: bool, + mut at: InputAt, + end: usize, + ) -> bool { + let mut matched = false; + let mut all_matched = false; + clist.set.clear(); + nlist.set.clear(); + 'LOOP: loop { + if clist.set.is_empty() { + // Three ways to bail out when our current set of threads is + // empty. + // + // 1. We have a match---so we're done exploring any possible + // alternatives. Time to quit. (We can't do this if we're + // looking for matches for multiple regexes, unless we know + // they all matched.) + // + // 2. If the expression starts with a '^' we can terminate as + // soon as the last thread dies. + if (matched && matches.len() <= 1) + || all_matched + || (!at.is_start() && self.prog.is_anchored_start) + { + break; + } + + // 3. If there's a literal prefix for the program, try to + // jump ahead quickly. If it can't be found, then we can + // bail out early. + if !self.prog.prefixes.is_empty() { + at = match self.input.prefix_at(&self.prog.prefixes, at) { + None => break, + Some(at) => at, + }; + } + } + + // This simulates a preceding '.*?' for every regex by adding + // a state starting at the current position in the input for the + // beginning of the program only if we don't already have a match. + if clist.set.is_empty() + || (!self.prog.is_anchored_start && !all_matched) + { + self.add(&mut clist, slots, 0, at); + } + // The previous call to "add" actually inspects the position just + // before the current character. For stepping through the machine, + // we can to look at the current character, so we advance the + // input. + let at_next = self.input.at(at.next_pos()); + for i in 0..clist.set.len() { + let ip = clist.set[i]; + if self.step( + &mut nlist, + matches, + slots, + clist.caps(ip), + ip, + at, + at_next, + ) { + matched = true; + all_matched = all_matched || matches.iter().all(|&b| b); + if quit_after_match { + // If we only care if a match occurs (not its + // position), then we can quit right now. + break 'LOOP; + } + if self.prog.matches.len() == 1 { + // We don't need to check the rest of the threads + // in this set because we've matched something + // ("leftmost-first"). However, we still need to check + // threads in the next set to support things like + // greedy matching. + // + // This is only true on normal regexes. For regex sets, + // we need to mush on to observe other matches. + break; + } + } + } + if at.pos() >= end { + break; + } + at = at_next; + mem::swap(clist, nlist); + nlist.set.clear(); + } + matched + } + + /// Step through the input, one token (byte or codepoint) at a time. + /// + /// nlist is the set of states that will be processed on the next token + /// in the input. + /// + /// caps is the set of captures passed by the caller of the NFA. They are + /// written to only when a match state is visited. + /// + /// thread_caps is the set of captures set for the current NFA state, ip. + /// + /// at and at_next are the current and next positions in the input. at or + /// at_next may be EOF. + fn step( + &mut self, + nlist: &mut Threads, + matches: &mut [bool], + slots: &mut [Slot], + thread_caps: &mut [Option<usize>], + ip: usize, + at: InputAt, + at_next: InputAt, + ) -> bool { + use crate::prog::Inst::*; + match self.prog[ip] { + Match(match_slot) => { + if match_slot < matches.len() { + matches[match_slot] = true; + } + for (slot, val) in slots.iter_mut().zip(thread_caps.iter()) { + *slot = *val; + } + true + } + Char(ref inst) => { + if inst.c == at.char() { + self.add(nlist, thread_caps, inst.goto, at_next); + } + false + } + Ranges(ref inst) => { + if inst.matches(at.char()) { + self.add(nlist, thread_caps, inst.goto, at_next); + } + false + } + Bytes(ref inst) => { + if let Some(b) = at.byte() { + if inst.matches(b) { + self.add(nlist, thread_caps, inst.goto, at_next); + } + } + false + } + EmptyLook(_) | Save(_) | Split(_) => false, + } + } + + /// Follows epsilon transitions and adds them for processing to nlist, + /// starting at and including ip. + fn add( + &mut self, + nlist: &mut Threads, + thread_caps: &mut [Option<usize>], + ip: usize, + at: InputAt, + ) { + self.stack.push(FollowEpsilon::IP(ip)); + while let Some(frame) = self.stack.pop() { + match frame { + FollowEpsilon::IP(ip) => { + self.add_step(nlist, thread_caps, ip, at); + } + FollowEpsilon::Capture { slot, pos } => { + thread_caps[slot] = pos; + } + } + } + } + + /// A helper function for add that avoids excessive pushing to the stack. + fn add_step( + &mut self, + nlist: &mut Threads, + thread_caps: &mut [Option<usize>], + mut ip: usize, + at: InputAt, + ) { + // Instead of pushing and popping to the stack, we mutate ip as we + // traverse the set of states. We only push to the stack when we + // absolutely need recursion (restoring captures or following a + // branch). + use crate::prog::Inst::*; + loop { + // Don't visit states we've already added. + if nlist.set.contains(ip) { + return; + } + nlist.set.insert(ip); + match self.prog[ip] { + EmptyLook(ref inst) => { + if self.input.is_empty_match(at, inst) { + ip = inst.goto; + } + } + Save(ref inst) => { + if inst.slot < thread_caps.len() { + self.stack.push(FollowEpsilon::Capture { + slot: inst.slot, + pos: thread_caps[inst.slot], + }); + thread_caps[inst.slot] = Some(at.pos()); + } + ip = inst.goto; + } + Split(ref inst) => { + self.stack.push(FollowEpsilon::IP(inst.goto2)); + ip = inst.goto1; + } + Match(_) | Char(_) | Ranges(_) | Bytes(_) => { + let t = &mut nlist.caps(ip); + for (slot, val) in t.iter_mut().zip(thread_caps.iter()) { + *slot = *val; + } + return; + } + } + } + } +} + +impl Threads { + fn new() -> Self { + Threads { set: SparseSet::new(0), caps: vec![], slots_per_thread: 0 } + } + + fn resize(&mut self, num_insts: usize, ncaps: usize) { + if num_insts == self.set.capacity() { + return; + } + self.slots_per_thread = ncaps * 2; + self.set = SparseSet::new(num_insts); + self.caps = vec![None; self.slots_per_thread * num_insts]; + } + + fn caps(&mut self, pc: usize) -> &mut [Option<usize>] { + let i = pc * self.slots_per_thread; + &mut self.caps[i..i + self.slots_per_thread] + } +} diff --git a/third_party/rust/regex/src/pool.rs b/third_party/rust/regex/src/pool.rs new file mode 100644 index 0000000000..6a6f15b194 --- /dev/null +++ b/third_party/rust/regex/src/pool.rs @@ -0,0 +1,333 @@ +// This module provides a relatively simple thread-safe pool of reusable +// objects. For the most part, it's implemented by a stack represented by a +// Mutex<Vec<T>>. It has one small trick: because unlocking a mutex is somewhat +// costly, in the case where a pool is accessed by the first thread that tried +// to get a value, we bypass the mutex. Here are some benchmarks showing the +// difference. +// +// 1) misc::anchored_literal_long_non_match 21 (18571 MB/s) +// 2) misc::anchored_literal_long_non_match 107 (3644 MB/s) +// 3) misc::anchored_literal_long_non_match 45 (8666 MB/s) +// 4) misc::anchored_literal_long_non_match 19 (20526 MB/s) +// +// (1) represents our baseline: the master branch at the time of writing when +// using the 'thread_local' crate to implement the pool below. +// +// (2) represents a naive pool implemented completely via Mutex<Vec<T>>. There +// is no special trick for bypassing the mutex. +// +// (3) is the same as (2), except it uses Mutex<Vec<Box<T>>>. It is twice as +// fast because a Box<T> is much smaller than the T we use with a Pool in this +// crate. So pushing and popping a Box<T> from a Vec is quite a bit faster +// than for T. +// +// (4) is the same as (3), but with the trick for bypassing the mutex in the +// case of the first-to-get thread. +// +// Why move off of thread_local? Even though (4) is a hair faster than (1) +// above, this was not the main goal. The main goal was to move off of +// thread_local and find a way to *simply* re-capture some of its speed for +// regex's specific case. So again, why move off of it? The *primary* reason is +// because of memory leaks. See https://github.com/rust-lang/regex/issues/362 +// for example. (Why do I want it to be simple? Well, I suppose what I mean is, +// "use as much safe code as possible to minimize risk and be as sure as I can +// be that it is correct.") +// +// My guess is that the thread_local design is probably not appropriate for +// regex since its memory usage scales to the number of active threads that +// have used a regex, where as the pool below scales to the number of threads +// that simultaneously use a regex. While neither case permits contraction, +// since we own the pool data structure below, we can add contraction if a +// clear use case pops up in the wild. More pressingly though, it seems that +// there are at least some use case patterns where one might have many threads +// sitting around that might have used a regex at one point. While thread_local +// does try to reuse space previously used by a thread that has since stopped, +// its maximal memory usage still scales with the total number of active +// threads. In contrast, the pool below scales with the total number of threads +// *simultaneously* using the pool. The hope is that this uses less memory +// overall. And if it doesn't, we can hopefully tune it somehow. +// +// It seems that these sort of conditions happen frequently +// in FFI inside of other more "managed" languages. This was +// mentioned in the issue linked above, and also mentioned here: +// https://github.com/BurntSushi/rure-go/issues/3. And in particular, users +// confirm that disabling the use of thread_local resolves the leak. +// +// There were other weaker reasons for moving off of thread_local as well. +// Namely, at the time, I was looking to reduce dependencies. And for something +// like regex, maintenance can be simpler when we own the full dependency tree. + +use std::panic::{RefUnwindSafe, UnwindSafe}; +use std::sync::atomic::{AtomicUsize, Ordering}; +use std::sync::Mutex; + +/// An atomic counter used to allocate thread IDs. +static COUNTER: AtomicUsize = AtomicUsize::new(1); + +thread_local!( + /// A thread local used to assign an ID to a thread. + static THREAD_ID: usize = { + let next = COUNTER.fetch_add(1, Ordering::Relaxed); + // SAFETY: We cannot permit the reuse of thread IDs since reusing a + // thread ID might result in more than one thread "owning" a pool, + // and thus, permit accessing a mutable value from multiple threads + // simultaneously without synchronization. The intent of this panic is + // to be a sanity check. It is not expected that the thread ID space + // will actually be exhausted in practice. + // + // This checks that the counter never wraps around, since atomic + // addition wraps around on overflow. + if next == 0 { + panic!("regex: thread ID allocation space exhausted"); + } + next + }; +); + +/// The type of the function used to create values in a pool when the pool is +/// empty and the caller requests one. +type CreateFn<T> = + Box<dyn Fn() -> T + Send + Sync + UnwindSafe + RefUnwindSafe + 'static>; + +/// A simple thread safe pool for reusing values. +/// +/// Getting a value out comes with a guard. When that guard is dropped, the +/// value is automatically put back in the pool. +/// +/// A Pool<T> impls Sync when T is Send (even if it's not Sync). This means +/// that T can use interior mutability. This is possible because a pool is +/// guaranteed to provide a value to exactly one thread at any time. +/// +/// Currently, a pool never contracts in size. Its size is proportional to the +/// number of simultaneous uses. +pub struct Pool<T> { + /// A stack of T values to hand out. These are used when a Pool is + /// accessed by a thread that didn't create it. + stack: Mutex<Vec<Box<T>>>, + /// A function to create more T values when stack is empty and a caller + /// has requested a T. + create: CreateFn<T>, + /// The ID of the thread that owns this pool. The owner is the thread + /// that makes the first call to 'get'. When the owner calls 'get', it + /// gets 'owner_val' directly instead of returning a T from 'stack'. + /// See comments elsewhere for details, but this is intended to be an + /// optimization for the common case that makes getting a T faster. + /// + /// It is initialized to a value of zero (an impossible thread ID) as a + /// sentinel to indicate that it is unowned. + owner: AtomicUsize, + /// A value to return when the caller is in the same thread that created + /// the Pool. + owner_val: T, +} + +// SAFETY: Since we want to use a Pool from multiple threads simultaneously +// behind an Arc, we need for it to be Sync. In cases where T is sync, Pool<T> +// would be Sync. However, since we use a Pool to store mutable scratch space, +// we wind up using a T that has interior mutability and is thus itself not +// Sync. So what we *really* want is for our Pool<T> to by Sync even when T is +// not Sync (but is at least Send). +// +// The only non-sync aspect of a Pool is its 'owner_val' field, which is used +// to implement faster access to a pool value in the common case of a pool +// being accessed in the same thread in which it was created. The 'stack' field +// is also shared, but a Mutex<T> where T: Send is already Sync. So we only +// need to worry about 'owner_val'. +// +// The key is to guarantee that 'owner_val' can only ever be accessed from one +// thread. In our implementation below, we guarantee this by only returning the +// 'owner_val' when the ID of the current thread matches the ID of the thread +// that created the Pool. Since this can only ever be one thread, it follows +// that only one thread can access 'owner_val' at any point in time. Thus, it +// is safe to declare that Pool<T> is Sync when T is Send. +// +// NOTE: It would also be possible to make the owning thread be the *first* +// thread that tries to get a value out of a Pool. However, the current +// implementation is a little simpler and it's not clear if making the first +// thread (rather than the creating thread) is meaningfully better. +// +// If there is a way to achieve our performance goals using safe code, then +// I would very much welcome a patch. As it stands, the implementation below +// tries to balance safety with performance. The case where a Regex is used +// from multiple threads simultaneously will suffer a bit since getting a cache +// will require unlocking a mutex. +unsafe impl<T: Send> Sync for Pool<T> {} + +impl<T: ::std::fmt::Debug> ::std::fmt::Debug for Pool<T> { + fn fmt(&self, f: &mut ::std::fmt::Formatter<'_>) -> ::std::fmt::Result { + f.debug_struct("Pool") + .field("stack", &self.stack) + .field("owner", &self.owner) + .field("owner_val", &self.owner_val) + .finish() + } +} + +/// A guard that is returned when a caller requests a value from the pool. +/// +/// The purpose of the guard is to use RAII to automatically put the value back +/// in the pool once it's dropped. +#[derive(Debug)] +pub struct PoolGuard<'a, T: Send> { + /// The pool that this guard is attached to. + pool: &'a Pool<T>, + /// This is None when the guard represents the special "owned" value. In + /// which case, the value is retrieved from 'pool.owner_val'. + value: Option<Box<T>>, +} + +impl<T: Send> Pool<T> { + /// Create a new pool. The given closure is used to create values in the + /// pool when necessary. + pub fn new(create: CreateFn<T>) -> Pool<T> { + let owner = AtomicUsize::new(0); + let owner_val = create(); + Pool { stack: Mutex::new(vec![]), create, owner, owner_val } + } + + /// Get a value from the pool. The caller is guaranteed to have exclusive + /// access to the given value. + /// + /// Note that there is no guarantee provided about which value in the + /// pool is returned. That is, calling get, dropping the guard (causing + /// the value to go back into the pool) and then calling get again is NOT + /// guaranteed to return the same value received in the first get call. + #[cfg_attr(feature = "perf-inline", inline(always))] + pub fn get(&self) -> PoolGuard<'_, T> { + // Our fast path checks if the caller is the thread that "owns" this + // pool. Or stated differently, whether it is the first thread that + // tried to extract a value from the pool. If it is, then we can return + // a T to the caller without going through a mutex. + // + // SAFETY: We must guarantee that only one thread gets access to this + // value. Since a thread is uniquely identified by the THREAD_ID thread + // local, it follows that is the caller's thread ID is equal to the + // owner, then only one thread may receive this value. + let caller = THREAD_ID.with(|id| *id); + let owner = self.owner.load(Ordering::Relaxed); + if caller == owner { + return self.guard_owned(); + } + self.get_slow(caller, owner) + } + + /// This is the "slow" version that goes through a mutex to pop an + /// allocated value off a stack to return to the caller. (Or, if the stack + /// is empty, a new value is created.) + /// + /// If the pool has no owner, then this will set the owner. + #[cold] + fn get_slow(&self, caller: usize, owner: usize) -> PoolGuard<'_, T> { + use std::sync::atomic::Ordering::Relaxed; + + if owner == 0 { + // The sentinel 0 value means this pool is not yet owned. We + // try to atomically set the owner. If we do, then this thread + // becomes the owner and we can return a guard that represents + // the special T for the owner. + let res = self.owner.compare_exchange(0, caller, Relaxed, Relaxed); + if res.is_ok() { + return self.guard_owned(); + } + } + let mut stack = self.stack.lock().unwrap(); + let value = match stack.pop() { + None => Box::new((self.create)()), + Some(value) => value, + }; + self.guard_stack(value) + } + + /// Puts a value back into the pool. Callers don't need to call this. Once + /// the guard that's returned by 'get' is dropped, it is put back into the + /// pool automatically. + fn put(&self, value: Box<T>) { + let mut stack = self.stack.lock().unwrap(); + stack.push(value); + } + + /// Create a guard that represents the special owned T. + fn guard_owned(&self) -> PoolGuard<'_, T> { + PoolGuard { pool: self, value: None } + } + + /// Create a guard that contains a value from the pool's stack. + fn guard_stack(&self, value: Box<T>) -> PoolGuard<'_, T> { + PoolGuard { pool: self, value: Some(value) } + } +} + +impl<'a, T: Send> PoolGuard<'a, T> { + /// Return the underlying value. + pub fn value(&self) -> &T { + match self.value { + None => &self.pool.owner_val, + Some(ref v) => &**v, + } + } +} + +impl<'a, T: Send> Drop for PoolGuard<'a, T> { + #[cfg_attr(feature = "perf-inline", inline(always))] + fn drop(&mut self) { + if let Some(value) = self.value.take() { + self.pool.put(value); + } + } +} + +#[cfg(test)] +mod tests { + use std::panic::{RefUnwindSafe, UnwindSafe}; + + use super::*; + + #[test] + fn oibits() { + use crate::exec::ProgramCache; + + fn has_oibits<T: Send + Sync + UnwindSafe + RefUnwindSafe>() {} + has_oibits::<Pool<ProgramCache>>(); + } + + // Tests that Pool implements the "single owner" optimization. That is, the + // thread that first accesses the pool gets its own copy, while all other + // threads get distinct copies. + #[test] + fn thread_owner_optimization() { + use std::cell::RefCell; + use std::sync::Arc; + + let pool: Arc<Pool<RefCell<Vec<char>>>> = + Arc::new(Pool::new(Box::new(|| RefCell::new(vec!['a'])))); + pool.get().value().borrow_mut().push('x'); + + let pool1 = pool.clone(); + let t1 = std::thread::spawn(move || { + let guard = pool1.get(); + let v = guard.value(); + v.borrow_mut().push('y'); + }); + + let pool2 = pool.clone(); + let t2 = std::thread::spawn(move || { + let guard = pool2.get(); + let v = guard.value(); + v.borrow_mut().push('z'); + }); + + t1.join().unwrap(); + t2.join().unwrap(); + + // If we didn't implement the single owner optimization, then one of + // the threads above is likely to have mutated the [a, x] vec that + // we stuffed in the pool before spawning the threads. But since + // neither thread was first to access the pool, and because of the + // optimization, we should be guaranteed that neither thread mutates + // the special owned pool value. + // + // (Technically this is an implementation detail and not a contract of + // Pool's API.) + assert_eq!(vec!['a', 'x'], *pool.get().value().borrow()); + } +} diff --git a/third_party/rust/regex/src/prog.rs b/third_party/rust/regex/src/prog.rs new file mode 100644 index 0000000000..c211f71d8a --- /dev/null +++ b/third_party/rust/regex/src/prog.rs @@ -0,0 +1,447 @@ +use std::cmp::Ordering; +use std::collections::HashMap; +use std::fmt; +use std::mem; +use std::ops::Deref; +use std::slice; +use std::sync::Arc; + +use crate::input::Char; +use crate::literal::LiteralSearcher; + +/// `InstPtr` represents the index of an instruction in a regex program. +pub type InstPtr = usize; + +/// Program is a sequence of instructions and various facts about thos +/// instructions. +#[derive(Clone)] +pub struct Program { + /// A sequence of instructions that represents an NFA. + pub insts: Vec<Inst>, + /// Pointers to each Match instruction in the sequence. + /// + /// This is always length 1 unless this program represents a regex set. + pub matches: Vec<InstPtr>, + /// The ordered sequence of all capture groups extracted from the AST. + /// Unnamed groups are `None`. + pub captures: Vec<Option<String>>, + /// Pointers to all named capture groups into `captures`. + pub capture_name_idx: Arc<HashMap<String, usize>>, + /// A pointer to the start instruction. This can vary depending on how + /// the program was compiled. For example, programs for use with the DFA + /// engine have a `.*?` inserted at the beginning of unanchored regular + /// expressions. The actual starting point of the program is after the + /// `.*?`. + pub start: InstPtr, + /// A set of equivalence classes for discriminating bytes in the compiled + /// program. + pub byte_classes: Vec<u8>, + /// When true, this program can only match valid UTF-8. + pub only_utf8: bool, + /// When true, this program uses byte range instructions instead of Unicode + /// range instructions. + pub is_bytes: bool, + /// When true, the program is compiled for DFA matching. For example, this + /// implies `is_bytes` and also inserts a preceding `.*?` for unanchored + /// regexes. + pub is_dfa: bool, + /// When true, the program matches text in reverse (for use only in the + /// DFA). + pub is_reverse: bool, + /// Whether the regex must match from the start of the input. + pub is_anchored_start: bool, + /// Whether the regex must match at the end of the input. + pub is_anchored_end: bool, + /// Whether this program contains a Unicode word boundary instruction. + pub has_unicode_word_boundary: bool, + /// A possibly empty machine for very quickly matching prefix literals. + pub prefixes: LiteralSearcher, + /// A limit on the size of the cache that the DFA is allowed to use while + /// matching. + /// + /// The cache limit specifies approximately how much space we're willing to + /// give to the state cache. Once the state cache exceeds the size, it is + /// wiped and all states must be re-computed. + /// + /// Note that this value does not impact correctness. It can be set to 0 + /// and the DFA will run just fine. (It will only ever store exactly one + /// state in the cache, and will likely run very slowly, but it will work.) + /// + /// Also note that this limit is *per thread of execution*. That is, + /// if the same regex is used to search text across multiple threads + /// simultaneously, then the DFA cache is not shared. Instead, copies are + /// made. + pub dfa_size_limit: usize, +} + +impl Program { + /// Creates an empty instruction sequence. Fields are given default + /// values. + pub fn new() -> Self { + Program { + insts: vec![], + matches: vec![], + captures: vec![], + capture_name_idx: Arc::new(HashMap::new()), + start: 0, + byte_classes: vec![0; 256], + only_utf8: true, + is_bytes: false, + is_dfa: false, + is_reverse: false, + is_anchored_start: false, + is_anchored_end: false, + has_unicode_word_boundary: false, + prefixes: LiteralSearcher::empty(), + dfa_size_limit: 2 * (1 << 20), + } + } + + /// If pc is an index to a no-op instruction (like Save), then return the + /// next pc that is not a no-op instruction. + pub fn skip(&self, mut pc: usize) -> usize { + loop { + match self[pc] { + Inst::Save(ref i) => pc = i.goto, + _ => return pc, + } + } + } + + /// Return true if and only if an execution engine at instruction `pc` will + /// always lead to a match. + pub fn leads_to_match(&self, pc: usize) -> bool { + if self.matches.len() > 1 { + // If we have a regex set, then we have more than one ending + // state, so leading to one of those states is generally + // meaningless. + return false; + } + match self[self.skip(pc)] { + Inst::Match(_) => true, + _ => false, + } + } + + /// Returns true if the current configuration demands that an implicit + /// `.*?` be prepended to the instruction sequence. + pub fn needs_dotstar(&self) -> bool { + self.is_dfa && !self.is_reverse && !self.is_anchored_start + } + + /// Returns true if this program uses Byte instructions instead of + /// Char/Range instructions. + pub fn uses_bytes(&self) -> bool { + self.is_bytes || self.is_dfa + } + + /// Returns true if this program exclusively matches valid UTF-8 bytes. + /// + /// That is, if an invalid UTF-8 byte is seen, then no match is possible. + pub fn only_utf8(&self) -> bool { + self.only_utf8 + } + + /// Return the approximate heap usage of this instruction sequence in + /// bytes. + pub fn approximate_size(&self) -> usize { + // The only instruction that uses heap space is Ranges (for + // Unicode codepoint programs) to store non-overlapping codepoint + // ranges. To keep this operation constant time, we ignore them. + (self.len() * mem::size_of::<Inst>()) + + (self.matches.len() * mem::size_of::<InstPtr>()) + + (self.captures.len() * mem::size_of::<Option<String>>()) + + (self.capture_name_idx.len() + * (mem::size_of::<String>() + mem::size_of::<usize>())) + + (self.byte_classes.len() * mem::size_of::<u8>()) + + self.prefixes.approximate_size() + } +} + +impl Deref for Program { + type Target = [Inst]; + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn deref(&self) -> &Self::Target { + &*self.insts + } +} + +impl fmt::Debug for Program { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + use self::Inst::*; + + fn with_goto(cur: usize, goto: usize, fmtd: String) -> String { + if goto == cur + 1 { + fmtd + } else { + format!("{} (goto: {})", fmtd, goto) + } + } + + fn visible_byte(b: u8) -> String { + use std::ascii::escape_default; + let escaped = escape_default(b).collect::<Vec<u8>>(); + String::from_utf8_lossy(&escaped).into_owned() + } + + for (pc, inst) in self.iter().enumerate() { + match *inst { + Match(slot) => write!(f, "{:04} Match({:?})", pc, slot)?, + Save(ref inst) => { + let s = format!("{:04} Save({})", pc, inst.slot); + write!(f, "{}", with_goto(pc, inst.goto, s))?; + } + Split(ref inst) => { + write!( + f, + "{:04} Split({}, {})", + pc, inst.goto1, inst.goto2 + )?; + } + EmptyLook(ref inst) => { + let s = format!("{:?}", inst.look); + write!(f, "{:04} {}", pc, with_goto(pc, inst.goto, s))?; + } + Char(ref inst) => { + let s = format!("{:?}", inst.c); + write!(f, "{:04} {}", pc, with_goto(pc, inst.goto, s))?; + } + Ranges(ref inst) => { + let ranges = inst + .ranges + .iter() + .map(|r| format!("{:?}-{:?}", r.0, r.1)) + .collect::<Vec<String>>() + .join(", "); + write!( + f, + "{:04} {}", + pc, + with_goto(pc, inst.goto, ranges) + )?; + } + Bytes(ref inst) => { + let s = format!( + "Bytes({}, {})", + visible_byte(inst.start), + visible_byte(inst.end) + ); + write!(f, "{:04} {}", pc, with_goto(pc, inst.goto, s))?; + } + } + if pc == self.start { + write!(f, " (start)")?; + } + writeln!(f)?; + } + Ok(()) + } +} + +impl<'a> IntoIterator for &'a Program { + type Item = &'a Inst; + type IntoIter = slice::Iter<'a, Inst>; + fn into_iter(self) -> Self::IntoIter { + self.iter() + } +} + +/// Inst is an instruction code in a Regex program. +/// +/// Regrettably, a regex program either contains Unicode codepoint +/// instructions (Char and Ranges) or it contains byte instructions (Bytes). +/// A regex program can never contain both. +/// +/// It would be worth investigating splitting this into two distinct types and +/// then figuring out how to make the matching engines polymorphic over those +/// types without sacrificing performance. +/// +/// Other than the benefit of moving invariants into the type system, another +/// benefit is the decreased size. If we remove the `Char` and `Ranges` +/// instructions from the `Inst` enum, then its size shrinks from 32 bytes to +/// 24 bytes. (This is because of the removal of a `Box<[]>` in the `Ranges` +/// variant.) Given that byte based machines are typically much bigger than +/// their Unicode analogues (because they can decode UTF-8 directly), this ends +/// up being a pretty significant savings. +#[derive(Clone, Debug)] +pub enum Inst { + /// Match indicates that the program has reached a match state. + /// + /// The number in the match corresponds to the Nth logical regular + /// expression in this program. This index is always 0 for normal regex + /// programs. Values greater than 0 appear when compiling regex sets, and + /// each match instruction gets its own unique value. The value corresponds + /// to the Nth regex in the set. + Match(usize), + /// Save causes the program to save the current location of the input in + /// the slot indicated by InstSave. + Save(InstSave), + /// Split causes the program to diverge to one of two paths in the + /// program, preferring goto1 in InstSplit. + Split(InstSplit), + /// EmptyLook represents a zero-width assertion in a regex program. A + /// zero-width assertion does not consume any of the input text. + EmptyLook(InstEmptyLook), + /// Char requires the regex program to match the character in InstChar at + /// the current position in the input. + Char(InstChar), + /// Ranges requires the regex program to match the character at the current + /// position in the input with one of the ranges specified in InstRanges. + Ranges(InstRanges), + /// Bytes is like Ranges, except it expresses a single byte range. It is + /// used in conjunction with Split instructions to implement multi-byte + /// character classes. + Bytes(InstBytes), +} + +impl Inst { + /// Returns true if and only if this is a match instruction. + pub fn is_match(&self) -> bool { + match *self { + Inst::Match(_) => true, + _ => false, + } + } +} + +/// Representation of the Save instruction. +#[derive(Clone, Debug)] +pub struct InstSave { + /// The next location to execute in the program. + pub goto: InstPtr, + /// The capture slot (there are two slots for every capture in a regex, + /// including the zeroth capture for the entire match). + pub slot: usize, +} + +/// Representation of the Split instruction. +#[derive(Clone, Debug)] +pub struct InstSplit { + /// The first instruction to try. A match resulting from following goto1 + /// has precedence over a match resulting from following goto2. + pub goto1: InstPtr, + /// The second instruction to try. A match resulting from following goto1 + /// has precedence over a match resulting from following goto2. + pub goto2: InstPtr, +} + +/// Representation of the `EmptyLook` instruction. +#[derive(Clone, Debug)] +pub struct InstEmptyLook { + /// The next location to execute in the program if this instruction + /// succeeds. + pub goto: InstPtr, + /// The type of zero-width assertion to check. + pub look: EmptyLook, +} + +/// The set of zero-width match instructions. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum EmptyLook { + /// Start of line or input. + StartLine, + /// End of line or input. + EndLine, + /// Start of input. + StartText, + /// End of input. + EndText, + /// Word character on one side and non-word character on other. + WordBoundary, + /// Word character on both sides or non-word character on both sides. + NotWordBoundary, + /// ASCII word boundary. + WordBoundaryAscii, + /// Not ASCII word boundary. + NotWordBoundaryAscii, +} + +/// Representation of the Char instruction. +#[derive(Clone, Debug)] +pub struct InstChar { + /// The next location to execute in the program if this instruction + /// succeeds. + pub goto: InstPtr, + /// The character to test. + pub c: char, +} + +/// Representation of the Ranges instruction. +#[derive(Clone, Debug)] +pub struct InstRanges { + /// The next location to execute in the program if this instruction + /// succeeds. + pub goto: InstPtr, + /// The set of Unicode scalar value ranges to test. + pub ranges: Box<[(char, char)]>, +} + +impl InstRanges { + /// Tests whether the given input character matches this instruction. + pub fn matches(&self, c: Char) -> bool { + // This speeds up the `match_class_unicode` benchmark by checking + // some common cases quickly without binary search. e.g., Matching + // a Unicode class on predominantly ASCII text. + for r in self.ranges.iter().take(4) { + if c < r.0 { + return false; + } + if c <= r.1 { + return true; + } + } + self.ranges + .binary_search_by(|r| { + if r.1 < c { + Ordering::Less + } else if r.0 > c { + Ordering::Greater + } else { + Ordering::Equal + } + }) + .is_ok() + } + + /// Return the number of distinct characters represented by all of the + /// ranges. + pub fn num_chars(&self) -> usize { + self.ranges + .iter() + .map(|&(s, e)| 1 + (e as u32) - (s as u32)) + .sum::<u32>() as usize + } +} + +/// Representation of the Bytes instruction. +#[derive(Clone, Debug)] +pub struct InstBytes { + /// The next location to execute in the program if this instruction + /// succeeds. + pub goto: InstPtr, + /// The start (inclusive) of this byte range. + pub start: u8, + /// The end (inclusive) of this byte range. + pub end: u8, +} + +impl InstBytes { + /// Returns true if and only if the given byte is in this range. + pub fn matches(&self, byte: u8) -> bool { + self.start <= byte && byte <= self.end + } +} + +#[cfg(test)] +mod test { + #[test] + #[cfg(target_pointer_width = "64")] + fn test_size_of_inst() { + use std::mem::size_of; + + use super::Inst; + + assert_eq!(32, size_of::<Inst>()); + } +} diff --git a/third_party/rust/regex/src/re_builder.rs b/third_party/rust/regex/src/re_builder.rs new file mode 100644 index 0000000000..ee6383690d --- /dev/null +++ b/third_party/rust/regex/src/re_builder.rs @@ -0,0 +1,421 @@ +/// The set of user configurable options for compiling zero or more regexes. +#[derive(Clone, Debug)] +#[allow(missing_docs)] +pub struct RegexOptions { + pub pats: Vec<String>, + pub size_limit: usize, + pub dfa_size_limit: usize, + pub nest_limit: u32, + pub case_insensitive: bool, + pub multi_line: bool, + pub dot_matches_new_line: bool, + pub swap_greed: bool, + pub ignore_whitespace: bool, + pub unicode: bool, + pub octal: bool, +} + +impl Default for RegexOptions { + fn default() -> Self { + RegexOptions { + pats: vec![], + size_limit: 10 * (1 << 20), + dfa_size_limit: 2 * (1 << 20), + nest_limit: 250, + case_insensitive: false, + multi_line: false, + dot_matches_new_line: false, + swap_greed: false, + ignore_whitespace: false, + unicode: true, + octal: false, + } + } +} + +macro_rules! define_builder { + ($name:ident, $regex_mod:ident, $only_utf8:expr) => { + pub mod $name { + use super::RegexOptions; + use crate::error::Error; + use crate::exec::ExecBuilder; + + use crate::$regex_mod::Regex; + + /// A configurable builder for a regular expression. + /// + /// A builder can be used to configure how the regex is built, for example, by + /// setting the default flags (which can be overridden in the expression + /// itself) or setting various limits. + #[derive(Debug)] + pub struct RegexBuilder(RegexOptions); + + impl RegexBuilder { + /// Create a new regular expression builder with the given pattern. + /// + /// If the pattern is invalid, then an error will be returned when + /// `build` is called. + pub fn new(pattern: &str) -> RegexBuilder { + let mut builder = RegexBuilder(RegexOptions::default()); + builder.0.pats.push(pattern.to_owned()); + builder + } + + /// Consume the builder and compile the regular expression. + /// + /// Note that calling `as_str` on the resulting `Regex` will produce the + /// pattern given to `new` verbatim. Notably, it will not incorporate any + /// of the flags set on this builder. + pub fn build(&self) -> Result<Regex, Error> { + ExecBuilder::new_options(self.0.clone()) + .only_utf8($only_utf8) + .build() + .map(Regex::from) + } + + /// Set the value for the case insensitive (`i`) flag. + /// + /// When enabled, letters in the pattern will match both upper case and + /// lower case variants. + pub fn case_insensitive( + &mut self, + yes: bool, + ) -> &mut RegexBuilder { + self.0.case_insensitive = yes; + self + } + + /// Set the value for the multi-line matching (`m`) flag. + /// + /// When enabled, `^` matches the beginning of lines and `$` matches the + /// end of lines. + /// + /// By default, they match beginning/end of the input. + pub fn multi_line(&mut self, yes: bool) -> &mut RegexBuilder { + self.0.multi_line = yes; + self + } + + /// Set the value for the any character (`s`) flag, where in `.` matches + /// anything when `s` is set and matches anything except for new line when + /// it is not set (the default). + /// + /// N.B. "matches anything" means "any byte" when Unicode is disabled and + /// means "any valid UTF-8 encoding of any Unicode scalar value" when + /// Unicode is enabled. + pub fn dot_matches_new_line( + &mut self, + yes: bool, + ) -> &mut RegexBuilder { + self.0.dot_matches_new_line = yes; + self + } + + /// Set the value for the greedy swap (`U`) flag. + /// + /// When enabled, a pattern like `a*` is lazy (tries to find shortest + /// match) and `a*?` is greedy (tries to find longest match). + /// + /// By default, `a*` is greedy and `a*?` is lazy. + pub fn swap_greed(&mut self, yes: bool) -> &mut RegexBuilder { + self.0.swap_greed = yes; + self + } + + /// Set the value for the ignore whitespace (`x`) flag. + /// + /// When enabled, whitespace such as new lines and spaces will be ignored + /// between expressions of the pattern, and `#` can be used to start a + /// comment until the next new line. + pub fn ignore_whitespace( + &mut self, + yes: bool, + ) -> &mut RegexBuilder { + self.0.ignore_whitespace = yes; + self + } + + /// Set the value for the Unicode (`u`) flag. + /// + /// Enabled by default. When disabled, character classes such as `\w` only + /// match ASCII word characters instead of all Unicode word characters. + pub fn unicode(&mut self, yes: bool) -> &mut RegexBuilder { + self.0.unicode = yes; + self + } + + /// Whether to support octal syntax or not. + /// + /// Octal syntax is a little-known way of uttering Unicode codepoints in + /// a regular expression. For example, `a`, `\x61`, `\u0061` and + /// `\141` are all equivalent regular expressions, where the last example + /// shows octal syntax. + /// + /// While supporting octal syntax isn't in and of itself a problem, it does + /// make good error messages harder. That is, in PCRE based regex engines, + /// syntax like `\0` invokes a backreference, which is explicitly + /// unsupported in Rust's regex engine. However, many users expect it to + /// be supported. Therefore, when octal support is disabled, the error + /// message will explicitly mention that backreferences aren't supported. + /// + /// Octal syntax is disabled by default. + pub fn octal(&mut self, yes: bool) -> &mut RegexBuilder { + self.0.octal = yes; + self + } + + /// Set the approximate size limit of the compiled regular expression. + /// + /// This roughly corresponds to the number of bytes occupied by a single + /// compiled program. If the program exceeds this number, then a + /// compilation error is returned. + pub fn size_limit( + &mut self, + limit: usize, + ) -> &mut RegexBuilder { + self.0.size_limit = limit; + self + } + + /// Set the approximate size of the cache used by the DFA. + /// + /// This roughly corresponds to the number of bytes that the DFA will + /// use while searching. + /// + /// Note that this is a *per thread* limit. There is no way to set a global + /// limit. In particular, if a regex is used from multiple threads + /// simultaneously, then each thread may use up to the number of bytes + /// specified here. + pub fn dfa_size_limit( + &mut self, + limit: usize, + ) -> &mut RegexBuilder { + self.0.dfa_size_limit = limit; + self + } + + /// Set the nesting limit for this parser. + /// + /// The nesting limit controls how deep the abstract syntax tree is allowed + /// to be. If the AST exceeds the given limit (e.g., with too many nested + /// groups), then an error is returned by the parser. + /// + /// The purpose of this limit is to act as a heuristic to prevent stack + /// overflow for consumers that do structural induction on an `Ast` using + /// explicit recursion. While this crate never does this (instead using + /// constant stack space and moving the call stack to the heap), other + /// crates may. + /// + /// This limit is not checked until the entire Ast is parsed. Therefore, + /// if callers want to put a limit on the amount of heap space used, then + /// they should impose a limit on the length, in bytes, of the concrete + /// pattern string. In particular, this is viable since this parser + /// implementation will limit itself to heap space proportional to the + /// length of the pattern string. + /// + /// Note that a nest limit of `0` will return a nest limit error for most + /// patterns but not all. For example, a nest limit of `0` permits `a` but + /// not `ab`, since `ab` requires a concatenation, which results in a nest + /// depth of `1`. In general, a nest limit is not something that manifests + /// in an obvious way in the concrete syntax, therefore, it should not be + /// used in a granular way. + pub fn nest_limit(&mut self, limit: u32) -> &mut RegexBuilder { + self.0.nest_limit = limit; + self + } + } + } + }; +} + +define_builder!(bytes, re_bytes, false); +define_builder!(unicode, re_unicode, true); + +macro_rules! define_set_builder { + ($name:ident, $regex_mod:ident, $only_utf8:expr) => { + pub mod $name { + use super::RegexOptions; + use crate::error::Error; + use crate::exec::ExecBuilder; + + use crate::re_set::$regex_mod::RegexSet; + + /// A configurable builder for a set of regular expressions. + /// + /// A builder can be used to configure how the regexes are built, for example, + /// by setting the default flags (which can be overridden in the expression + /// itself) or setting various limits. + #[derive(Debug)] + pub struct RegexSetBuilder(RegexOptions); + + impl RegexSetBuilder { + /// Create a new regular expression builder with the given pattern. + /// + /// If the pattern is invalid, then an error will be returned when + /// `build` is called. + pub fn new<I, S>(patterns: I) -> RegexSetBuilder + where + S: AsRef<str>, + I: IntoIterator<Item = S>, + { + let mut builder = RegexSetBuilder(RegexOptions::default()); + for pat in patterns { + builder.0.pats.push(pat.as_ref().to_owned()); + } + builder + } + + /// Consume the builder and compile the regular expressions into a set. + pub fn build(&self) -> Result<RegexSet, Error> { + ExecBuilder::new_options(self.0.clone()) + .only_utf8($only_utf8) + .build() + .map(RegexSet::from) + } + + /// Set the value for the case insensitive (`i`) flag. + pub fn case_insensitive( + &mut self, + yes: bool, + ) -> &mut RegexSetBuilder { + self.0.case_insensitive = yes; + self + } + + /// Set the value for the multi-line matching (`m`) flag. + pub fn multi_line( + &mut self, + yes: bool, + ) -> &mut RegexSetBuilder { + self.0.multi_line = yes; + self + } + + /// Set the value for the any character (`s`) flag, where in `.` matches + /// anything when `s` is set and matches anything except for new line when + /// it is not set (the default). + /// + /// N.B. "matches anything" means "any byte" for `regex::bytes::RegexSet` + /// expressions and means "any Unicode scalar value" for `regex::RegexSet` + /// expressions. + pub fn dot_matches_new_line( + &mut self, + yes: bool, + ) -> &mut RegexSetBuilder { + self.0.dot_matches_new_line = yes; + self + } + + /// Set the value for the greedy swap (`U`) flag. + pub fn swap_greed( + &mut self, + yes: bool, + ) -> &mut RegexSetBuilder { + self.0.swap_greed = yes; + self + } + + /// Set the value for the ignore whitespace (`x`) flag. + pub fn ignore_whitespace( + &mut self, + yes: bool, + ) -> &mut RegexSetBuilder { + self.0.ignore_whitespace = yes; + self + } + + /// Set the value for the Unicode (`u`) flag. + pub fn unicode(&mut self, yes: bool) -> &mut RegexSetBuilder { + self.0.unicode = yes; + self + } + + /// Whether to support octal syntax or not. + /// + /// Octal syntax is a little-known way of uttering Unicode codepoints in + /// a regular expression. For example, `a`, `\x61`, `\u0061` and + /// `\141` are all equivalent regular expressions, where the last example + /// shows octal syntax. + /// + /// While supporting octal syntax isn't in and of itself a problem, it does + /// make good error messages harder. That is, in PCRE based regex engines, + /// syntax like `\0` invokes a backreference, which is explicitly + /// unsupported in Rust's regex engine. However, many users expect it to + /// be supported. Therefore, when octal support is disabled, the error + /// message will explicitly mention that backreferences aren't supported. + /// + /// Octal syntax is disabled by default. + pub fn octal(&mut self, yes: bool) -> &mut RegexSetBuilder { + self.0.octal = yes; + self + } + + /// Set the approximate size limit of the compiled regular expression. + /// + /// This roughly corresponds to the number of bytes occupied by a single + /// compiled program. If the program exceeds this number, then a + /// compilation error is returned. + pub fn size_limit( + &mut self, + limit: usize, + ) -> &mut RegexSetBuilder { + self.0.size_limit = limit; + self + } + + /// Set the approximate size of the cache used by the DFA. + /// + /// This roughly corresponds to the number of bytes that the DFA will + /// use while searching. + /// + /// Note that this is a *per thread* limit. There is no way to set a global + /// limit. In particular, if a regex is used from multiple threads + /// simultaneously, then each thread may use up to the number of bytes + /// specified here. + pub fn dfa_size_limit( + &mut self, + limit: usize, + ) -> &mut RegexSetBuilder { + self.0.dfa_size_limit = limit; + self + } + + /// Set the nesting limit for this parser. + /// + /// The nesting limit controls how deep the abstract syntax tree is allowed + /// to be. If the AST exceeds the given limit (e.g., with too many nested + /// groups), then an error is returned by the parser. + /// + /// The purpose of this limit is to act as a heuristic to prevent stack + /// overflow for consumers that do structural induction on an `Ast` using + /// explicit recursion. While this crate never does this (instead using + /// constant stack space and moving the call stack to the heap), other + /// crates may. + /// + /// This limit is not checked until the entire Ast is parsed. Therefore, + /// if callers want to put a limit on the amount of heap space used, then + /// they should impose a limit on the length, in bytes, of the concrete + /// pattern string. In particular, this is viable since this parser + /// implementation will limit itself to heap space proportional to the + /// length of the pattern string. + /// + /// Note that a nest limit of `0` will return a nest limit error for most + /// patterns but not all. For example, a nest limit of `0` permits `a` but + /// not `ab`, since `ab` requires a concatenation, which results in a nest + /// depth of `1`. In general, a nest limit is not something that manifests + /// in an obvious way in the concrete syntax, therefore, it should not be + /// used in a granular way. + pub fn nest_limit( + &mut self, + limit: u32, + ) -> &mut RegexSetBuilder { + self.0.nest_limit = limit; + self + } + } + } + }; +} + +define_set_builder!(set_bytes, bytes, false); +define_set_builder!(set_unicode, unicode, true); diff --git a/third_party/rust/regex/src/re_bytes.rs b/third_party/rust/regex/src/re_bytes.rs new file mode 100644 index 0000000000..d71969257b --- /dev/null +++ b/third_party/rust/regex/src/re_bytes.rs @@ -0,0 +1,1260 @@ +use std::borrow::Cow; +use std::collections::HashMap; +use std::fmt; +use std::iter::FusedIterator; +use std::ops::{Index, Range}; +use std::str::FromStr; +use std::sync::Arc; + +use crate::find_byte::find_byte; + +use crate::error::Error; +use crate::exec::{Exec, ExecNoSync}; +use crate::expand::expand_bytes; +use crate::re_builder::bytes::RegexBuilder; +use crate::re_trait::{self, RegularExpression, SubCapturesPosIter}; + +/// Match represents a single match of a regex in a haystack. +/// +/// The lifetime parameter `'t` refers to the lifetime of the matched text. +#[derive(Copy, Clone, Debug, Eq, PartialEq)] +pub struct Match<'t> { + text: &'t [u8], + start: usize, + end: usize, +} + +impl<'t> Match<'t> { + /// Returns the starting byte offset of the match in the haystack. + #[inline] + pub fn start(&self) -> usize { + self.start + } + + /// Returns the ending byte offset of the match in the haystack. + #[inline] + pub fn end(&self) -> usize { + self.end + } + + /// Returns the range over the starting and ending byte offsets of the + /// match in the haystack. + #[inline] + pub fn range(&self) -> Range<usize> { + self.start..self.end + } + + /// Returns the matched text. + #[inline] + pub fn as_bytes(&self) -> &'t [u8] { + &self.text[self.range()] + } + + /// Creates a new match from the given haystack and byte offsets. + #[inline] + fn new(haystack: &'t [u8], start: usize, end: usize) -> Match<'t> { + Match { text: haystack, start, end } + } +} + +impl<'t> From<Match<'t>> for Range<usize> { + fn from(m: Match<'t>) -> Range<usize> { + m.range() + } +} + +/// A compiled regular expression for matching arbitrary bytes. +/// +/// It can be used to search, split or replace text. All searching is done with +/// an implicit `.*?` at the beginning and end of an expression. To force an +/// expression to match the whole string (or a prefix or a suffix), you must +/// use an anchor like `^` or `$` (or `\A` and `\z`). +/// +/// Like the `Regex` type in the parent module, matches with this regex return +/// byte offsets into the search text. **Unlike** the parent `Regex` type, +/// these byte offsets may not correspond to UTF-8 sequence boundaries since +/// the regexes in this module can match arbitrary bytes. +#[derive(Clone)] +pub struct Regex(Exec); + +impl fmt::Display for Regex { + /// Shows the original regular expression. + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}", self.as_str()) + } +} + +impl fmt::Debug for Regex { + /// Shows the original regular expression. + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt::Display::fmt(self, f) + } +} + +/// A constructor for Regex from an Exec. +/// +/// This is hidden because Exec isn't actually part of the public API. +#[doc(hidden)] +impl From<Exec> for Regex { + fn from(exec: Exec) -> Regex { + Regex(exec) + } +} + +impl FromStr for Regex { + type Err = Error; + + /// Attempts to parse a string into a regular expression + fn from_str(s: &str) -> Result<Regex, Error> { + Regex::new(s) + } +} + +/// Core regular expression methods. +impl Regex { + /// Compiles a regular expression. Once compiled, it can be used repeatedly + /// to search, split or replace text in a string. + /// + /// If an invalid expression is given, then an error is returned. + pub fn new(re: &str) -> Result<Regex, Error> { + RegexBuilder::new(re).build() + } + + /// Returns true if and only if there is a match for the regex in the + /// string given. + /// + /// It is recommended to use this method if all you need to do is test + /// a match, since the underlying matching engine may be able to do less + /// work. + /// + /// # Example + /// + /// Test if some text contains at least one word with exactly 13 ASCII word + /// bytes: + /// + /// ```rust + /// # use regex::bytes::Regex; + /// # fn main() { + /// let text = b"I categorically deny having triskaidekaphobia."; + /// assert!(Regex::new(r"\b\w{13}\b").unwrap().is_match(text)); + /// # } + /// ``` + pub fn is_match(&self, text: &[u8]) -> bool { + self.is_match_at(text, 0) + } + + /// Returns the start and end byte range of the leftmost-first match in + /// `text`. If no match exists, then `None` is returned. + /// + /// Note that this should only be used if you want to discover the position + /// of the match. Testing the existence of a match is faster if you use + /// `is_match`. + /// + /// # Example + /// + /// Find the start and end location of the first word with exactly 13 + /// ASCII word bytes: + /// + /// ```rust + /// # use regex::bytes::Regex; + /// # fn main() { + /// let text = b"I categorically deny having triskaidekaphobia."; + /// let mat = Regex::new(r"\b\w{13}\b").unwrap().find(text).unwrap(); + /// assert_eq!((mat.start(), mat.end()), (2, 15)); + /// # } + /// ``` + pub fn find<'t>(&self, text: &'t [u8]) -> Option<Match<'t>> { + self.find_at(text, 0) + } + + /// Returns an iterator for each successive non-overlapping match in + /// `text`, returning the start and end byte indices with respect to + /// `text`. + /// + /// # Example + /// + /// Find the start and end location of every word with exactly 13 ASCII + /// word bytes: + /// + /// ```rust + /// # use regex::bytes::Regex; + /// # fn main() { + /// let text = b"Retroactively relinquishing remunerations is reprehensible."; + /// for mat in Regex::new(r"\b\w{13}\b").unwrap().find_iter(text) { + /// println!("{:?}", mat); + /// } + /// # } + /// ``` + pub fn find_iter<'r, 't>(&'r self, text: &'t [u8]) -> Matches<'r, 't> { + Matches(self.0.searcher().find_iter(text)) + } + + /// Returns the capture groups corresponding to the leftmost-first + /// match in `text`. Capture group `0` always corresponds to the entire + /// match. If no match is found, then `None` is returned. + /// + /// You should only use `captures` if you need access to the location of + /// capturing group matches. Otherwise, `find` is faster for discovering + /// the location of the overall match. + /// + /// # Examples + /// + /// Say you have some text with movie names and their release years, + /// like "'Citizen Kane' (1941)". It'd be nice if we could search for text + /// looking like that, while also extracting the movie name and its release + /// year separately. + /// + /// ```rust + /// # use regex::bytes::Regex; + /// # fn main() { + /// let re = Regex::new(r"'([^']+)'\s+\((\d{4})\)").unwrap(); + /// let text = b"Not my favorite movie: 'Citizen Kane' (1941)."; + /// let caps = re.captures(text).unwrap(); + /// assert_eq!(caps.get(1).unwrap().as_bytes(), &b"Citizen Kane"[..]); + /// assert_eq!(caps.get(2).unwrap().as_bytes(), &b"1941"[..]); + /// assert_eq!(caps.get(0).unwrap().as_bytes(), &b"'Citizen Kane' (1941)"[..]); + /// // You can also access the groups by index using the Index notation. + /// // Note that this will panic on an invalid index. + /// assert_eq!(&caps[1], b"Citizen Kane"); + /// assert_eq!(&caps[2], b"1941"); + /// assert_eq!(&caps[0], b"'Citizen Kane' (1941)"); + /// # } + /// ``` + /// + /// Note that the full match is at capture group `0`. Each subsequent + /// capture group is indexed by the order of its opening `(`. + /// + /// We can make this example a bit clearer by using *named* capture groups: + /// + /// ```rust + /// # use regex::bytes::Regex; + /// # fn main() { + /// let re = Regex::new(r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)") + /// .unwrap(); + /// let text = b"Not my favorite movie: 'Citizen Kane' (1941)."; + /// let caps = re.captures(text).unwrap(); + /// assert_eq!(caps.name("title").unwrap().as_bytes(), b"Citizen Kane"); + /// assert_eq!(caps.name("year").unwrap().as_bytes(), b"1941"); + /// assert_eq!(caps.get(0).unwrap().as_bytes(), &b"'Citizen Kane' (1941)"[..]); + /// // You can also access the groups by name using the Index notation. + /// // Note that this will panic on an invalid group name. + /// assert_eq!(&caps["title"], b"Citizen Kane"); + /// assert_eq!(&caps["year"], b"1941"); + /// assert_eq!(&caps[0], b"'Citizen Kane' (1941)"); + /// + /// # } + /// ``` + /// + /// Here we name the capture groups, which we can access with the `name` + /// method or the `Index` notation with a `&str`. Note that the named + /// capture groups are still accessible with `get` or the `Index` notation + /// with a `usize`. + /// + /// The `0`th capture group is always unnamed, so it must always be + /// accessed with `get(0)` or `[0]`. + pub fn captures<'t>(&self, text: &'t [u8]) -> Option<Captures<'t>> { + let mut locs = self.capture_locations(); + self.captures_read_at(&mut locs, text, 0).map(move |_| Captures { + text, + locs: locs.0, + named_groups: self.0.capture_name_idx().clone(), + }) + } + + /// Returns an iterator over all the non-overlapping capture groups matched + /// in `text`. This is operationally the same as `find_iter`, except it + /// yields information about capturing group matches. + /// + /// # Example + /// + /// We can use this to find all movie titles and their release years in + /// some text, where the movie is formatted like "'Title' (xxxx)": + /// + /// ```rust + /// # use std::str; use regex::bytes::Regex; + /// # fn main() { + /// let re = Regex::new(r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)") + /// .unwrap(); + /// let text = b"'Citizen Kane' (1941), 'The Wizard of Oz' (1939), 'M' (1931)."; + /// for caps in re.captures_iter(text) { + /// let title = str::from_utf8(&caps["title"]).unwrap(); + /// let year = str::from_utf8(&caps["year"]).unwrap(); + /// println!("Movie: {:?}, Released: {:?}", title, year); + /// } + /// // Output: + /// // Movie: Citizen Kane, Released: 1941 + /// // Movie: The Wizard of Oz, Released: 1939 + /// // Movie: M, Released: 1931 + /// # } + /// ``` + pub fn captures_iter<'r, 't>( + &'r self, + text: &'t [u8], + ) -> CaptureMatches<'r, 't> { + CaptureMatches(self.0.searcher().captures_iter(text)) + } + + /// Returns an iterator of substrings of `text` delimited by a match of the + /// regular expression. Namely, each element of the iterator corresponds to + /// text that *isn't* matched by the regular expression. + /// + /// This method will *not* copy the text given. + /// + /// # Example + /// + /// To split a string delimited by arbitrary amounts of spaces or tabs: + /// + /// ```rust + /// # use regex::bytes::Regex; + /// # fn main() { + /// let re = Regex::new(r"[ \t]+").unwrap(); + /// let fields: Vec<&[u8]> = re.split(b"a b \t c\td e").collect(); + /// assert_eq!(fields, vec![ + /// &b"a"[..], &b"b"[..], &b"c"[..], &b"d"[..], &b"e"[..], + /// ]); + /// # } + /// ``` + pub fn split<'r, 't>(&'r self, text: &'t [u8]) -> Split<'r, 't> { + Split { finder: self.find_iter(text), last: 0 } + } + + /// Returns an iterator of at most `limit` substrings of `text` delimited + /// by a match of the regular expression. (A `limit` of `0` will return no + /// substrings.) Namely, each element of the iterator corresponds to text + /// that *isn't* matched by the regular expression. The remainder of the + /// string that is not split will be the last element in the iterator. + /// + /// This method will *not* copy the text given. + /// + /// # Example + /// + /// Get the first two words in some text: + /// + /// ```rust + /// # use regex::bytes::Regex; + /// # fn main() { + /// let re = Regex::new(r"\W+").unwrap(); + /// let fields: Vec<&[u8]> = re.splitn(b"Hey! How are you?", 3).collect(); + /// assert_eq!(fields, vec![&b"Hey"[..], &b"How"[..], &b"are you?"[..]]); + /// # } + /// ``` + pub fn splitn<'r, 't>( + &'r self, + text: &'t [u8], + limit: usize, + ) -> SplitN<'r, 't> { + SplitN { splits: self.split(text), n: limit } + } + + /// Replaces the leftmost-first match with the replacement provided. The + /// replacement can be a regular byte string (where `$N` and `$name` are + /// expanded to match capture groups) or a function that takes the matches' + /// `Captures` and returns the replaced byte string. + /// + /// If no match is found, then a copy of the byte string is returned + /// unchanged. + /// + /// # Replacement string syntax + /// + /// All instances of `$name` in the replacement text is replaced with the + /// corresponding capture group `name`. + /// + /// `name` may be an integer corresponding to the index of the + /// capture group (counted by order of opening parenthesis where `0` is the + /// entire match) or it can be a name (consisting of letters, digits or + /// underscores) corresponding to a named capture group. + /// + /// If `name` isn't a valid capture group (whether the name doesn't exist + /// or isn't a valid index), then it is replaced with the empty string. + /// + /// The longest possible name is used. e.g., `$1a` looks up the capture + /// group named `1a` and not the capture group at index `1`. To exert more + /// precise control over the name, use braces, e.g., `${1}a`. + /// + /// To write a literal `$` use `$$`. + /// + /// # Examples + /// + /// Note that this function is polymorphic with respect to the replacement. + /// In typical usage, this can just be a normal byte string: + /// + /// ```rust + /// # use regex::bytes::Regex; + /// # fn main() { + /// let re = Regex::new("[^01]+").unwrap(); + /// assert_eq!(re.replace(b"1078910", &b""[..]), &b"1010"[..]); + /// # } + /// ``` + /// + /// But anything satisfying the `Replacer` trait will work. For example, a + /// closure of type `|&Captures| -> Vec<u8>` provides direct access to the + /// captures corresponding to a match. This allows one to access capturing + /// group matches easily: + /// + /// ```rust + /// # use regex::bytes::Regex; + /// # use regex::bytes::Captures; fn main() { + /// let re = Regex::new(r"([^,\s]+),\s+(\S+)").unwrap(); + /// let result = re.replace(b"Springsteen, Bruce", |caps: &Captures| { + /// let mut replacement = caps[2].to_owned(); + /// replacement.push(b' '); + /// replacement.extend(&caps[1]); + /// replacement + /// }); + /// assert_eq!(result, &b"Bruce Springsteen"[..]); + /// # } + /// ``` + /// + /// But this is a bit cumbersome to use all the time. Instead, a simple + /// syntax is supported that expands `$name` into the corresponding capture + /// group. Here's the last example, but using this expansion technique + /// with named capture groups: + /// + /// ```rust + /// # use regex::bytes::Regex; + /// # fn main() { + /// let re = Regex::new(r"(?P<last>[^,\s]+),\s+(?P<first>\S+)").unwrap(); + /// let result = re.replace(b"Springsteen, Bruce", &b"$first $last"[..]); + /// assert_eq!(result, &b"Bruce Springsteen"[..]); + /// # } + /// ``` + /// + /// Note that using `$2` instead of `$first` or `$1` instead of `$last` + /// would produce the same result. To write a literal `$` use `$$`. + /// + /// Sometimes the replacement string requires use of curly braces to + /// delineate a capture group replacement and surrounding literal text. + /// For example, if we wanted to join two words together with an + /// underscore: + /// + /// ```rust + /// # use regex::bytes::Regex; + /// # fn main() { + /// let re = Regex::new(r"(?P<first>\w+)\s+(?P<second>\w+)").unwrap(); + /// let result = re.replace(b"deep fried", &b"${first}_$second"[..]); + /// assert_eq!(result, &b"deep_fried"[..]); + /// # } + /// ``` + /// + /// Without the curly braces, the capture group name `first_` would be + /// used, and since it doesn't exist, it would be replaced with the empty + /// string. + /// + /// Finally, sometimes you just want to replace a literal string with no + /// regard for capturing group expansion. This can be done by wrapping a + /// byte string with `NoExpand`: + /// + /// ```rust + /// # use regex::bytes::Regex; + /// # fn main() { + /// use regex::bytes::NoExpand; + /// + /// let re = Regex::new(r"(?P<last>[^,\s]+),\s+(\S+)").unwrap(); + /// let result = re.replace(b"Springsteen, Bruce", NoExpand(b"$2 $last")); + /// assert_eq!(result, &b"$2 $last"[..]); + /// # } + /// ``` + pub fn replace<'t, R: Replacer>( + &self, + text: &'t [u8], + rep: R, + ) -> Cow<'t, [u8]> { + self.replacen(text, 1, rep) + } + + /// Replaces all non-overlapping matches in `text` with the replacement + /// provided. This is the same as calling `replacen` with `limit` set to + /// `0`. + /// + /// See the documentation for `replace` for details on how to access + /// capturing group matches in the replacement text. + pub fn replace_all<'t, R: Replacer>( + &self, + text: &'t [u8], + rep: R, + ) -> Cow<'t, [u8]> { + self.replacen(text, 0, rep) + } + + /// Replaces at most `limit` non-overlapping matches in `text` with the + /// replacement provided. If `limit` is 0, then all non-overlapping matches + /// are replaced. + /// + /// See the documentation for `replace` for details on how to access + /// capturing group matches in the replacement text. + pub fn replacen<'t, R: Replacer>( + &self, + text: &'t [u8], + limit: usize, + mut rep: R, + ) -> Cow<'t, [u8]> { + if let Some(rep) = rep.no_expansion() { + let mut it = self.find_iter(text).enumerate().peekable(); + if it.peek().is_none() { + return Cow::Borrowed(text); + } + let mut new = Vec::with_capacity(text.len()); + let mut last_match = 0; + for (i, m) in it { + if limit > 0 && i >= limit { + break; + } + new.extend_from_slice(&text[last_match..m.start()]); + new.extend_from_slice(&rep); + last_match = m.end(); + } + new.extend_from_slice(&text[last_match..]); + return Cow::Owned(new); + } + + // The slower path, which we use if the replacement needs access to + // capture groups. + let mut it = self.captures_iter(text).enumerate().peekable(); + if it.peek().is_none() { + return Cow::Borrowed(text); + } + let mut new = Vec::with_capacity(text.len()); + let mut last_match = 0; + for (i, cap) in it { + if limit > 0 && i >= limit { + break; + } + // unwrap on 0 is OK because captures only reports matches + let m = cap.get(0).unwrap(); + new.extend_from_slice(&text[last_match..m.start()]); + rep.replace_append(&cap, &mut new); + last_match = m.end(); + } + new.extend_from_slice(&text[last_match..]); + Cow::Owned(new) + } +} + +/// Advanced or "lower level" search methods. +impl Regex { + /// Returns the end location of a match in the text given. + /// + /// This method may have the same performance characteristics as + /// `is_match`, except it provides an end location for a match. In + /// particular, the location returned *may be shorter* than the proper end + /// of the leftmost-first match. + /// + /// # Example + /// + /// Typically, `a+` would match the entire first sequence of `a` in some + /// text, but `shortest_match` can give up as soon as it sees the first + /// `a`. + /// + /// ```rust + /// # use regex::bytes::Regex; + /// # fn main() { + /// let text = b"aaaaa"; + /// let pos = Regex::new(r"a+").unwrap().shortest_match(text); + /// assert_eq!(pos, Some(1)); + /// # } + /// ``` + pub fn shortest_match(&self, text: &[u8]) -> Option<usize> { + self.shortest_match_at(text, 0) + } + + /// Returns the same as shortest_match, but starts the search at the given + /// offset. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, the `\A` anchor can only + /// match when `start == 0`. + pub fn shortest_match_at( + &self, + text: &[u8], + start: usize, + ) -> Option<usize> { + self.0.searcher().shortest_match_at(text, start) + } + + /// Returns the same as is_match, but starts the search at the given + /// offset. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, the `\A` anchor can only + /// match when `start == 0`. + pub fn is_match_at(&self, text: &[u8], start: usize) -> bool { + self.0.searcher().is_match_at(text, start) + } + + /// Returns the same as find, but starts the search at the given + /// offset. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, the `\A` anchor can only + /// match when `start == 0`. + pub fn find_at<'t>( + &self, + text: &'t [u8], + start: usize, + ) -> Option<Match<'t>> { + self.0 + .searcher() + .find_at(text, start) + .map(|(s, e)| Match::new(text, s, e)) + } + + /// This is like `captures`, but uses + /// [`CaptureLocations`](struct.CaptureLocations.html) + /// instead of + /// [`Captures`](struct.Captures.html) in order to amortize allocations. + /// + /// To create a `CaptureLocations` value, use the + /// `Regex::capture_locations` method. + /// + /// This returns the overall match if this was successful, which is always + /// equivalence to the `0`th capture group. + pub fn captures_read<'t>( + &self, + locs: &mut CaptureLocations, + text: &'t [u8], + ) -> Option<Match<'t>> { + self.captures_read_at(locs, text, 0) + } + + /// Returns the same as `captures_read`, but starts the search at the given + /// offset and populates the capture locations given. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, the `\A` anchor can only + /// match when `start == 0`. + pub fn captures_read_at<'t>( + &self, + locs: &mut CaptureLocations, + text: &'t [u8], + start: usize, + ) -> Option<Match<'t>> { + self.0 + .searcher() + .captures_read_at(&mut locs.0, text, start) + .map(|(s, e)| Match::new(text, s, e)) + } + + /// An undocumented alias for `captures_read_at`. + /// + /// The `regex-capi` crate previously used this routine, so to avoid + /// breaking that crate, we continue to provide the name as an undocumented + /// alias. + #[doc(hidden)] + pub fn read_captures_at<'t>( + &self, + locs: &mut CaptureLocations, + text: &'t [u8], + start: usize, + ) -> Option<Match<'t>> { + self.captures_read_at(locs, text, start) + } +} + +/// Auxiliary methods. +impl Regex { + /// Returns the original string of this regex. + pub fn as_str(&self) -> &str { + &self.0.regex_strings()[0] + } + + /// Returns an iterator over the capture names. + pub fn capture_names(&self) -> CaptureNames<'_> { + CaptureNames(self.0.capture_names().iter()) + } + + /// Returns the number of captures. + pub fn captures_len(&self) -> usize { + self.0.capture_names().len() + } + + /// Returns an empty set of capture locations that can be reused in + /// multiple calls to `captures_read` or `captures_read_at`. + pub fn capture_locations(&self) -> CaptureLocations { + CaptureLocations(self.0.searcher().locations()) + } + + /// An alias for `capture_locations` to preserve backward compatibility. + /// + /// The `regex-capi` crate uses this method, so to avoid breaking that + /// crate, we continue to export it as an undocumented API. + #[doc(hidden)] + pub fn locations(&self) -> CaptureLocations { + CaptureLocations(self.0.searcher().locations()) + } +} + +/// An iterator over all non-overlapping matches for a particular string. +/// +/// The iterator yields a tuple of integers corresponding to the start and end +/// of the match. The indices are byte offsets. The iterator stops when no more +/// matches can be found. +/// +/// `'r` is the lifetime of the compiled regular expression and `'t` is the +/// lifetime of the matched byte string. +#[derive(Debug)] +pub struct Matches<'r, 't>(re_trait::Matches<'t, ExecNoSync<'r>>); + +impl<'r, 't> Iterator for Matches<'r, 't> { + type Item = Match<'t>; + + fn next(&mut self) -> Option<Match<'t>> { + let text = self.0.text(); + self.0.next().map(|(s, e)| Match::new(text, s, e)) + } +} + +impl<'r, 't> FusedIterator for Matches<'r, 't> {} + +/// An iterator that yields all non-overlapping capture groups matching a +/// particular regular expression. +/// +/// The iterator stops when no more matches can be found. +/// +/// `'r` is the lifetime of the compiled regular expression and `'t` is the +/// lifetime of the matched byte string. +#[derive(Debug)] +pub struct CaptureMatches<'r, 't>( + re_trait::CaptureMatches<'t, ExecNoSync<'r>>, +); + +impl<'r, 't> Iterator for CaptureMatches<'r, 't> { + type Item = Captures<'t>; + + fn next(&mut self) -> Option<Captures<'t>> { + self.0.next().map(|locs| Captures { + text: self.0.text(), + locs, + named_groups: self.0.regex().capture_name_idx().clone(), + }) + } +} + +impl<'r, 't> FusedIterator for CaptureMatches<'r, 't> {} + +/// Yields all substrings delimited by a regular expression match. +/// +/// `'r` is the lifetime of the compiled regular expression and `'t` is the +/// lifetime of the byte string being split. +#[derive(Debug)] +pub struct Split<'r, 't> { + finder: Matches<'r, 't>, + last: usize, +} + +impl<'r, 't> Iterator for Split<'r, 't> { + type Item = &'t [u8]; + + fn next(&mut self) -> Option<&'t [u8]> { + let text = self.finder.0.text(); + match self.finder.next() { + None => { + if self.last > text.len() { + None + } else { + let s = &text[self.last..]; + self.last = text.len() + 1; // Next call will return None + Some(s) + } + } + Some(m) => { + let matched = &text[self.last..m.start()]; + self.last = m.end(); + Some(matched) + } + } + } +} + +impl<'r, 't> FusedIterator for Split<'r, 't> {} + +/// Yields at most `N` substrings delimited by a regular expression match. +/// +/// The last substring will be whatever remains after splitting. +/// +/// `'r` is the lifetime of the compiled regular expression and `'t` is the +/// lifetime of the byte string being split. +#[derive(Debug)] +pub struct SplitN<'r, 't> { + splits: Split<'r, 't>, + n: usize, +} + +impl<'r, 't> Iterator for SplitN<'r, 't> { + type Item = &'t [u8]; + + fn next(&mut self) -> Option<&'t [u8]> { + if self.n == 0 { + return None; + } + + self.n -= 1; + if self.n > 0 { + return self.splits.next(); + } + + let text = self.splits.finder.0.text(); + if self.splits.last > text.len() { + // We've already returned all substrings. + None + } else { + // self.n == 0, so future calls will return None immediately + Some(&text[self.splits.last..]) + } + } + + fn size_hint(&self) -> (usize, Option<usize>) { + (0, Some(self.n)) + } +} + +impl<'r, 't> FusedIterator for SplitN<'r, 't> {} + +/// An iterator over the names of all possible captures. +/// +/// `None` indicates an unnamed capture; the first element (capture 0, the +/// whole matched region) is always unnamed. +/// +/// `'r` is the lifetime of the compiled regular expression. +#[derive(Clone, Debug)] +pub struct CaptureNames<'r>(::std::slice::Iter<'r, Option<String>>); + +impl<'r> Iterator for CaptureNames<'r> { + type Item = Option<&'r str>; + + fn next(&mut self) -> Option<Option<&'r str>> { + self.0 + .next() + .as_ref() + .map(|slot| slot.as_ref().map(|name| name.as_ref())) + } + + fn size_hint(&self) -> (usize, Option<usize>) { + self.0.size_hint() + } + + fn count(self) -> usize { + self.0.count() + } +} + +impl<'r> ExactSizeIterator for CaptureNames<'r> {} + +impl<'r> FusedIterator for CaptureNames<'r> {} + +/// CaptureLocations is a low level representation of the raw offsets of each +/// submatch. +/// +/// You can think of this as a lower level +/// [`Captures`](struct.Captures.html), where this type does not support +/// named capturing groups directly and it does not borrow the text that these +/// offsets were matched on. +/// +/// Primarily, this type is useful when using the lower level `Regex` APIs +/// such as `read_captures`, which permits amortizing the allocation in which +/// capture match locations are stored. +/// +/// In order to build a value of this type, you'll need to call the +/// `capture_locations` method on the `Regex` being used to execute the search. +/// The value returned can then be reused in subsequent searches. +#[derive(Clone, Debug)] +pub struct CaptureLocations(re_trait::Locations); + +/// A type alias for `CaptureLocations` for backwards compatibility. +/// +/// Previously, we exported `CaptureLocations` as `Locations` in an +/// undocumented API. To prevent breaking that code (e.g., in `regex-capi`), +/// we continue re-exporting the same undocumented API. +#[doc(hidden)] +pub type Locations = CaptureLocations; + +impl CaptureLocations { + /// Returns the start and end positions of the Nth capture group. Returns + /// `None` if `i` is not a valid capture group or if the capture group did + /// not match anything. The positions returned are *always* byte indices + /// with respect to the original string matched. + #[inline] + pub fn get(&self, i: usize) -> Option<(usize, usize)> { + self.0.pos(i) + } + + /// Returns the total number of capture groups (even if they didn't match). + /// + /// This is always at least `1` since every regex has at least `1` + /// capturing group that corresponds to the entire match. + #[inline] + pub fn len(&self) -> usize { + self.0.len() + } + + /// An alias for the `get` method for backwards compatibility. + /// + /// Previously, we exported `get` as `pos` in an undocumented API. To + /// prevent breaking that code (e.g., in `regex-capi`), we continue + /// re-exporting the same undocumented API. + #[doc(hidden)] + #[inline] + pub fn pos(&self, i: usize) -> Option<(usize, usize)> { + self.get(i) + } +} + +/// Captures represents a group of captured byte strings for a single match. +/// +/// The 0th capture always corresponds to the entire match. Each subsequent +/// index corresponds to the next capture group in the regex. If a capture +/// group is named, then the matched byte string is *also* available via the +/// `name` method. (Note that the 0th capture is always unnamed and so must be +/// accessed with the `get` method.) +/// +/// Positions returned from a capture group are always byte indices. +/// +/// `'t` is the lifetime of the matched text. +pub struct Captures<'t> { + text: &'t [u8], + locs: re_trait::Locations, + named_groups: Arc<HashMap<String, usize>>, +} + +impl<'t> Captures<'t> { + /// Returns the match associated with the capture group at index `i`. If + /// `i` does not correspond to a capture group, or if the capture group + /// did not participate in the match, then `None` is returned. + /// + /// # Examples + /// + /// Get the text of the match with a default of an empty string if this + /// group didn't participate in the match: + /// + /// ```rust + /// # use regex::bytes::Regex; + /// let re = Regex::new(r"[a-z]+(?:([0-9]+)|([A-Z]+))").unwrap(); + /// let caps = re.captures(b"abc123").unwrap(); + /// + /// let text1 = caps.get(1).map_or(&b""[..], |m| m.as_bytes()); + /// let text2 = caps.get(2).map_or(&b""[..], |m| m.as_bytes()); + /// assert_eq!(text1, &b"123"[..]); + /// assert_eq!(text2, &b""[..]); + /// ``` + pub fn get(&self, i: usize) -> Option<Match<'t>> { + self.locs.pos(i).map(|(s, e)| Match::new(self.text, s, e)) + } + + /// Returns the match for the capture group named `name`. If `name` isn't a + /// valid capture group or didn't match anything, then `None` is returned. + pub fn name(&self, name: &str) -> Option<Match<'t>> { + self.named_groups.get(name).and_then(|&i| self.get(i)) + } + + /// An iterator that yields all capturing matches in the order in which + /// they appear in the regex. If a particular capture group didn't + /// participate in the match, then `None` is yielded for that capture. + /// + /// The first match always corresponds to the overall match of the regex. + pub fn iter<'c>(&'c self) -> SubCaptureMatches<'c, 't> { + SubCaptureMatches { caps: self, it: self.locs.iter() } + } + + /// Expands all instances of `$name` in `replacement` to the corresponding + /// capture group `name`, and writes them to the `dst` buffer given. + /// + /// `name` may be an integer corresponding to the index of the capture + /// group (counted by order of opening parenthesis where `0` is the + /// entire match) or it can be a name (consisting of letters, digits or + /// underscores) corresponding to a named capture group. + /// + /// If `name` isn't a valid capture group (whether the name doesn't exist + /// or isn't a valid index), then it is replaced with the empty string. + /// + /// The longest possible name consisting of the characters `[_0-9A-Za-z]` + /// is used. e.g., `$1a` looks up the capture group named `1a` and not the + /// capture group at index `1`. To exert more precise control over the + /// name, or to refer to a capture group name that uses characters outside + /// of `[_0-9A-Za-z]`, use braces, e.g., `${1}a` or `${foo[bar].baz}`. When + /// using braces, any sequence of valid UTF-8 bytes is permitted. If the + /// sequence does not refer to a capture group name in the corresponding + /// regex, then it is replaced with an empty string. + /// + /// To write a literal `$` use `$$`. + pub fn expand(&self, replacement: &[u8], dst: &mut Vec<u8>) { + expand_bytes(self, replacement, dst) + } + + /// Returns the total number of capture groups (even if they didn't match). + /// + /// This is always at least `1`, since every regex has at least one capture + /// group that corresponds to the full match. + #[inline] + pub fn len(&self) -> usize { + self.locs.len() + } +} + +impl<'t> fmt::Debug for Captures<'t> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_tuple("Captures").field(&CapturesDebug(self)).finish() + } +} + +struct CapturesDebug<'c, 't>(&'c Captures<'t>); + +impl<'c, 't> fmt::Debug for CapturesDebug<'c, 't> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fn escape_bytes(bytes: &[u8]) -> String { + let mut s = String::new(); + for &b in bytes { + s.push_str(&escape_byte(b)); + } + s + } + + fn escape_byte(byte: u8) -> String { + use std::ascii::escape_default; + + let escaped: Vec<u8> = escape_default(byte).collect(); + String::from_utf8_lossy(&escaped).into_owned() + } + + // We'd like to show something nice here, even if it means an + // allocation to build a reverse index. + let slot_to_name: HashMap<&usize, &String> = + self.0.named_groups.iter().map(|(a, b)| (b, a)).collect(); + let mut map = f.debug_map(); + for (slot, m) in self.0.locs.iter().enumerate() { + let m = m.map(|(s, e)| escape_bytes(&self.0.text[s..e])); + if let Some(name) = slot_to_name.get(&slot) { + map.entry(&name, &m); + } else { + map.entry(&slot, &m); + } + } + map.finish() + } +} + +/// Get a group by index. +/// +/// `'t` is the lifetime of the matched text. +/// +/// The text can't outlive the `Captures` object if this method is +/// used, because of how `Index` is defined (normally `a[i]` is part +/// of `a` and can't outlive it); to do that, use `get()` instead. +/// +/// # Panics +/// +/// If there is no group at the given index. +impl<'t> Index<usize> for Captures<'t> { + type Output = [u8]; + + fn index(&self, i: usize) -> &[u8] { + self.get(i) + .map(|m| m.as_bytes()) + .unwrap_or_else(|| panic!("no group at index '{}'", i)) + } +} + +/// Get a group by name. +/// +/// `'t` is the lifetime of the matched text and `'i` is the lifetime +/// of the group name (the index). +/// +/// The text can't outlive the `Captures` object if this method is +/// used, because of how `Index` is defined (normally `a[i]` is part +/// of `a` and can't outlive it); to do that, use `name` instead. +/// +/// # Panics +/// +/// If there is no group named by the given value. +impl<'t, 'i> Index<&'i str> for Captures<'t> { + type Output = [u8]; + + fn index<'a>(&'a self, name: &'i str) -> &'a [u8] { + self.name(name) + .map(|m| m.as_bytes()) + .unwrap_or_else(|| panic!("no group named '{}'", name)) + } +} + +/// An iterator that yields all capturing matches in the order in which they +/// appear in the regex. +/// +/// If a particular capture group didn't participate in the match, then `None` +/// is yielded for that capture. The first match always corresponds to the +/// overall match of the regex. +/// +/// The lifetime `'c` corresponds to the lifetime of the `Captures` value, and +/// the lifetime `'t` corresponds to the originally matched text. +#[derive(Clone, Debug)] +pub struct SubCaptureMatches<'c, 't> { + caps: &'c Captures<'t>, + it: SubCapturesPosIter<'c>, +} + +impl<'c, 't> Iterator for SubCaptureMatches<'c, 't> { + type Item = Option<Match<'t>>; + + fn next(&mut self) -> Option<Option<Match<'t>>> { + self.it + .next() + .map(|cap| cap.map(|(s, e)| Match::new(self.caps.text, s, e))) + } +} + +impl<'c, 't> FusedIterator for SubCaptureMatches<'c, 't> {} + +/// Replacer describes types that can be used to replace matches in a byte +/// string. +/// +/// In general, users of this crate shouldn't need to implement this trait, +/// since implementations are already provided for `&[u8]` along with other +/// variants of bytes types and `FnMut(&Captures) -> Vec<u8>` (or any +/// `FnMut(&Captures) -> T` where `T: AsRef<[u8]>`), which covers most use cases. +pub trait Replacer { + /// Appends text to `dst` to replace the current match. + /// + /// The current match is represented by `caps`, which is guaranteed to + /// have a match at capture group `0`. + /// + /// For example, a no-op replacement would be + /// `dst.extend(&caps[0])`. + fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>); + + /// Return a fixed unchanging replacement byte string. + /// + /// When doing replacements, if access to `Captures` is not needed (e.g., + /// the replacement byte string does not need `$` expansion), then it can + /// be beneficial to avoid finding sub-captures. + /// + /// In general, this is called once for every call to `replacen`. + fn no_expansion<'r>(&'r mut self) -> Option<Cow<'r, [u8]>> { + None + } + + /// Return a `Replacer` that borrows and wraps this `Replacer`. + /// + /// This is useful when you want to take a generic `Replacer` (which might + /// not be cloneable) and use it without consuming it, so it can be used + /// more than once. + /// + /// # Example + /// + /// ``` + /// use regex::bytes::{Regex, Replacer}; + /// + /// fn replace_all_twice<R: Replacer>( + /// re: Regex, + /// src: &[u8], + /// mut rep: R, + /// ) -> Vec<u8> { + /// let dst = re.replace_all(src, rep.by_ref()); + /// let dst = re.replace_all(&dst, rep.by_ref()); + /// dst.into_owned() + /// } + /// ``` + fn by_ref<'r>(&'r mut self) -> ReplacerRef<'r, Self> { + ReplacerRef(self) + } +} + +/// By-reference adaptor for a `Replacer` +/// +/// Returned by [`Replacer::by_ref`](trait.Replacer.html#method.by_ref). +#[derive(Debug)] +pub struct ReplacerRef<'a, R: ?Sized>(&'a mut R); + +impl<'a, R: Replacer + ?Sized + 'a> Replacer for ReplacerRef<'a, R> { + fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) { + self.0.replace_append(caps, dst) + } + fn no_expansion<'r>(&'r mut self) -> Option<Cow<'r, [u8]>> { + self.0.no_expansion() + } +} + +impl<'a> Replacer for &'a [u8] { + fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) { + caps.expand(*self, dst); + } + + fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> { + no_expansion(self) + } +} + +impl<'a> Replacer for &'a Vec<u8> { + fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) { + caps.expand(*self, dst); + } + + fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> { + no_expansion(self) + } +} + +impl Replacer for Vec<u8> { + fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) { + caps.expand(self, dst); + } + + fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> { + no_expansion(self) + } +} + +impl<'a> Replacer for Cow<'a, [u8]> { + fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) { + caps.expand(self.as_ref(), dst); + } + + fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> { + no_expansion(self) + } +} + +impl<'a> Replacer for &'a Cow<'a, [u8]> { + fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) { + caps.expand(self.as_ref(), dst); + } + + fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> { + no_expansion(self) + } +} + +fn no_expansion<T: AsRef<[u8]>>(t: &T) -> Option<Cow<'_, [u8]>> { + let s = t.as_ref(); + match find_byte(b'$', s) { + Some(_) => None, + None => Some(Cow::Borrowed(s)), + } +} + +impl<F, T> Replacer for F +where + F: FnMut(&Captures<'_>) -> T, + T: AsRef<[u8]>, +{ + fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) { + dst.extend_from_slice((*self)(caps).as_ref()); + } +} + +/// `NoExpand` indicates literal byte string replacement. +/// +/// It can be used with `replace` and `replace_all` to do a literal byte string +/// replacement without expanding `$name` to their corresponding capture +/// groups. This can be both convenient (to avoid escaping `$`, for example) +/// and performant (since capture groups don't need to be found). +/// +/// `'t` is the lifetime of the literal text. +#[derive(Clone, Debug)] +pub struct NoExpand<'t>(pub &'t [u8]); + +impl<'t> Replacer for NoExpand<'t> { + fn replace_append(&mut self, _: &Captures<'_>, dst: &mut Vec<u8>) { + dst.extend_from_slice(self.0); + } + + fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> { + Some(Cow::Borrowed(self.0)) + } +} diff --git a/third_party/rust/regex/src/re_set.rs b/third_party/rust/regex/src/re_set.rs new file mode 100644 index 0000000000..a6d886d761 --- /dev/null +++ b/third_party/rust/regex/src/re_set.rs @@ -0,0 +1,507 @@ +macro_rules! define_set { + ($name:ident, $builder_mod:ident, $text_ty:ty, $as_bytes:expr, + $(#[$doc_regexset_example:meta])* ) => { + pub mod $name { + use std::fmt; + use std::iter; + use std::slice; + use std::vec; + + use crate::error::Error; + use crate::exec::Exec; + use crate::re_builder::$builder_mod::RegexSetBuilder; + use crate::re_trait::RegularExpression; + +/// Match multiple (possibly overlapping) regular expressions in a single scan. +/// +/// A regex set corresponds to the union of two or more regular expressions. +/// That is, a regex set will match text where at least one of its +/// constituent regular expressions matches. A regex set as its formulated here +/// provides a touch more power: it will also report *which* regular +/// expressions in the set match. Indeed, this is the key difference between +/// regex sets and a single `Regex` with many alternates, since only one +/// alternate can match at a time. +/// +/// For example, consider regular expressions to match email addresses and +/// domains: `[a-z]+@[a-z]+\.(com|org|net)` and `[a-z]+\.(com|org|net)`. If a +/// regex set is constructed from those regexes, then searching the text +/// `foo@example.com` will report both regexes as matching. Of course, one +/// could accomplish this by compiling each regex on its own and doing two +/// searches over the text. The key advantage of using a regex set is that it +/// will report the matching regexes using a *single pass through the text*. +/// If one has hundreds or thousands of regexes to match repeatedly (like a URL +/// router for a complex web application or a user agent matcher), then a regex +/// set can realize huge performance gains. +/// +/// # Example +/// +/// This shows how the above two regexes (for matching email addresses and +/// domains) might work: +/// +$(#[$doc_regexset_example])* +/// +/// Note that it would be possible to adapt the above example to using `Regex` +/// with an expression like: +/// +/// ```text +/// (?P<email>[a-z]+@(?P<email_domain>[a-z]+[.](com|org|net)))|(?P<domain>[a-z]+[.](com|org|net)) +/// ``` +/// +/// After a match, one could then inspect the capture groups to figure out +/// which alternates matched. The problem is that it is hard to make this +/// approach scale when there are many regexes since the overlap between each +/// alternate isn't always obvious to reason about. +/// +/// # Limitations +/// +/// Regex sets are limited to answering the following two questions: +/// +/// 1. Does any regex in the set match? +/// 2. If so, which regexes in the set match? +/// +/// As with the main [`Regex`][crate::Regex] type, it is cheaper to ask (1) +/// instead of (2) since the matching engines can stop after the first match +/// is found. +/// +/// You cannot directly extract [`Match`][crate::Match] or +/// [`Captures`][crate::Captures] objects from a regex set. If you need these +/// operations, the recommended approach is to compile each pattern in the set +/// independently and scan the exact same input a second time with those +/// independently compiled patterns: +/// +/// ```rust +/// use regex::{Regex, RegexSet}; +/// +/// let patterns = ["foo", "bar"]; +/// // Both patterns will match different ranges of this string. +/// let text = "barfoo"; +/// +/// // Compile a set matching any of our patterns. +/// let set = RegexSet::new(&patterns).unwrap(); +/// // Compile each pattern independently. +/// let regexes: Vec<_> = set.patterns().iter() +/// .map(|pat| Regex::new(pat).unwrap()) +/// .collect(); +/// +/// // Match against the whole set first and identify the individual +/// // matching patterns. +/// let matches: Vec<&str> = set.matches(text).into_iter() +/// // Dereference the match index to get the corresponding +/// // compiled pattern. +/// .map(|match_idx| ®exes[match_idx]) +/// // To get match locations or any other info, we then have to search +/// // the exact same text again, using our separately-compiled pattern. +/// .map(|pat| pat.find(text).unwrap().as_str()) +/// .collect(); +/// +/// // Matches arrive in the order the constituent patterns were declared, +/// // not the order they appear in the input. +/// assert_eq!(vec!["foo", "bar"], matches); +/// ``` +/// +/// # Performance +/// +/// A `RegexSet` has the same performance characteristics as `Regex`. Namely, +/// search takes `O(mn)` time, where `m` is proportional to the size of the +/// regex set and `n` is proportional to the length of the search text. +#[derive(Clone)] +pub struct RegexSet(Exec); + +impl RegexSet { + /// Create a new regex set with the given regular expressions. + /// + /// This takes an iterator of `S`, where `S` is something that can produce + /// a `&str`. If any of the strings in the iterator are not valid regular + /// expressions, then an error is returned. + /// + /// # Example + /// + /// Create a new regex set from an iterator of strings: + /// + /// ```rust + /// # use regex::RegexSet; + /// let set = RegexSet::new(&[r"\w+", r"\d+"]).unwrap(); + /// assert!(set.is_match("foo")); + /// ``` + pub fn new<I, S>(exprs: I) -> Result<RegexSet, Error> + where S: AsRef<str>, I: IntoIterator<Item=S> { + RegexSetBuilder::new(exprs).build() + } + + /// Create a new empty regex set. + /// + /// # Example + /// + /// ```rust + /// # use regex::RegexSet; + /// let set = RegexSet::empty(); + /// assert!(set.is_empty()); + /// ``` + pub fn empty() -> RegexSet { + RegexSetBuilder::new(&[""; 0]).build().unwrap() + } + + /// Returns true if and only if one of the regexes in this set matches + /// the text given. + /// + /// This method should be preferred if you only need to test whether any + /// of the regexes in the set should match, but don't care about *which* + /// regexes matched. This is because the underlying matching engine will + /// quit immediately after seeing the first match instead of continuing to + /// find all matches. + /// + /// Note that as with searches using `Regex`, the expression is unanchored + /// by default. That is, if the regex does not start with `^` or `\A`, or + /// end with `$` or `\z`, then it is permitted to match anywhere in the + /// text. + /// + /// # Example + /// + /// Tests whether a set matches some text: + /// + /// ```rust + /// # use regex::RegexSet; + /// let set = RegexSet::new(&[r"\w+", r"\d+"]).unwrap(); + /// assert!(set.is_match("foo")); + /// assert!(!set.is_match("☃")); + /// ``` + pub fn is_match(&self, text: $text_ty) -> bool { + self.is_match_at(text, 0) + } + + /// Returns the same as is_match, but starts the search at the given + /// offset. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, the `\A` anchor can only + /// match when `start == 0`. + #[doc(hidden)] + pub fn is_match_at(&self, text: $text_ty, start: usize) -> bool { + self.0.searcher().is_match_at($as_bytes(text), start) + } + + /// Returns the set of regular expressions that match in the given text. + /// + /// The set returned contains the index of each regular expression that + /// matches in the given text. The index is in correspondence with the + /// order of regular expressions given to `RegexSet`'s constructor. + /// + /// The set can also be used to iterate over the matched indices. + /// + /// Note that as with searches using `Regex`, the expression is unanchored + /// by default. That is, if the regex does not start with `^` or `\A`, or + /// end with `$` or `\z`, then it is permitted to match anywhere in the + /// text. + /// + /// # Example + /// + /// Tests which regular expressions match the given text: + /// + /// ```rust + /// # use regex::RegexSet; + /// let set = RegexSet::new(&[ + /// r"\w+", + /// r"\d+", + /// r"\pL+", + /// r"foo", + /// r"bar", + /// r"barfoo", + /// r"foobar", + /// ]).unwrap(); + /// let matches: Vec<_> = set.matches("foobar").into_iter().collect(); + /// assert_eq!(matches, vec![0, 2, 3, 4, 6]); + /// + /// // You can also test whether a particular regex matched: + /// let matches = set.matches("foobar"); + /// assert!(!matches.matched(5)); + /// assert!(matches.matched(6)); + /// ``` + pub fn matches(&self, text: $text_ty) -> SetMatches { + let mut matches = vec![false; self.0.regex_strings().len()]; + let any = self.read_matches_at(&mut matches, text, 0); + SetMatches { + matched_any: any, + matches: matches, + } + } + + /// Returns the same as matches, but starts the search at the given + /// offset and stores the matches into the slice given. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, the `\A` anchor can only + /// match when `start == 0`. + /// + /// `matches` must have a length that is at least the number of regexes + /// in this set. + /// + /// This method returns true if and only if at least one member of + /// `matches` is true after executing the set against `text`. + #[doc(hidden)] + pub fn read_matches_at( + &self, + matches: &mut [bool], + text: $text_ty, + start: usize, + ) -> bool { + self.0.searcher().many_matches_at(matches, $as_bytes(text), start) + } + + /// Returns the total number of regular expressions in this set. + pub fn len(&self) -> usize { + self.0.regex_strings().len() + } + + /// Returns `true` if this set contains no regular expressions. + pub fn is_empty(&self) -> bool { + self.0.regex_strings().is_empty() + } + + /// Returns the patterns that this set will match on. + /// + /// This function can be used to determine the pattern for a match. The + /// slice returned has exactly as many patterns givens to this regex set, + /// and the order of the slice is the same as the order of the patterns + /// provided to the set. + /// + /// # Example + /// + /// ```rust + /// # use regex::RegexSet; + /// let set = RegexSet::new(&[ + /// r"\w+", + /// r"\d+", + /// r"\pL+", + /// r"foo", + /// r"bar", + /// r"barfoo", + /// r"foobar", + /// ]).unwrap(); + /// let matches: Vec<_> = set + /// .matches("foobar") + /// .into_iter() + /// .map(|match_idx| &set.patterns()[match_idx]) + /// .collect(); + /// assert_eq!(matches, vec![r"\w+", r"\pL+", r"foo", r"bar", r"foobar"]); + /// ``` + pub fn patterns(&self) -> &[String] { + self.0.regex_strings() + } +} + +/// A set of matches returned by a regex set. +#[derive(Clone, Debug)] +pub struct SetMatches { + matched_any: bool, + matches: Vec<bool>, +} + +impl SetMatches { + /// Whether this set contains any matches. + pub fn matched_any(&self) -> bool { + self.matched_any + } + + /// Whether the regex at the given index matched. + /// + /// The index for a regex is determined by its insertion order upon the + /// initial construction of a `RegexSet`, starting at `0`. + /// + /// # Panics + /// + /// If `regex_index` is greater than or equal to `self.len()`. + pub fn matched(&self, regex_index: usize) -> bool { + self.matches[regex_index] + } + + /// The total number of regexes in the set that created these matches. + pub fn len(&self) -> usize { + self.matches.len() + } + + /// Returns an iterator over indexes in the regex that matched. + /// + /// This will always produces matches in ascending order of index, where + /// the index corresponds to the index of the regex that matched with + /// respect to its position when initially building the set. + pub fn iter(&self) -> SetMatchesIter<'_> { + SetMatchesIter((&*self.matches).into_iter().enumerate()) + } +} + +impl IntoIterator for SetMatches { + type IntoIter = SetMatchesIntoIter; + type Item = usize; + + fn into_iter(self) -> Self::IntoIter { + SetMatchesIntoIter(self.matches.into_iter().enumerate()) + } +} + +impl<'a> IntoIterator for &'a SetMatches { + type IntoIter = SetMatchesIter<'a>; + type Item = usize; + + fn into_iter(self) -> Self::IntoIter { + self.iter() + } +} + +/// An owned iterator over the set of matches from a regex set. +/// +/// This will always produces matches in ascending order of index, where the +/// index corresponds to the index of the regex that matched with respect to +/// its position when initially building the set. +#[derive(Debug)] +pub struct SetMatchesIntoIter(iter::Enumerate<vec::IntoIter<bool>>); + +impl Iterator for SetMatchesIntoIter { + type Item = usize; + + fn next(&mut self) -> Option<usize> { + loop { + match self.0.next() { + None => return None, + Some((_, false)) => {} + Some((i, true)) => return Some(i), + } + } + } + + fn size_hint(&self) -> (usize, Option<usize>) { + self.0.size_hint() + } +} + +impl DoubleEndedIterator for SetMatchesIntoIter { + fn next_back(&mut self) -> Option<usize> { + loop { + match self.0.next_back() { + None => return None, + Some((_, false)) => {} + Some((i, true)) => return Some(i), + } + } + } +} + +impl iter::FusedIterator for SetMatchesIntoIter {} + +/// A borrowed iterator over the set of matches from a regex set. +/// +/// The lifetime `'a` refers to the lifetime of a `SetMatches` value. +/// +/// This will always produces matches in ascending order of index, where the +/// index corresponds to the index of the regex that matched with respect to +/// its position when initially building the set. +#[derive(Clone, Debug)] +pub struct SetMatchesIter<'a>(iter::Enumerate<slice::Iter<'a, bool>>); + +impl<'a> Iterator for SetMatchesIter<'a> { + type Item = usize; + + fn next(&mut self) -> Option<usize> { + loop { + match self.0.next() { + None => return None, + Some((_, &false)) => {} + Some((i, &true)) => return Some(i), + } + } + } + + fn size_hint(&self) -> (usize, Option<usize>) { + self.0.size_hint() + } +} + +impl<'a> DoubleEndedIterator for SetMatchesIter<'a> { + fn next_back(&mut self) -> Option<usize> { + loop { + match self.0.next_back() { + None => return None, + Some((_, &false)) => {} + Some((i, &true)) => return Some(i), + } + } + } +} + +impl<'a> iter::FusedIterator for SetMatchesIter<'a> {} + +#[doc(hidden)] +impl From<Exec> for RegexSet { + fn from(exec: Exec) -> Self { + RegexSet(exec) + } +} + +impl fmt::Debug for RegexSet { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "RegexSet({:?})", self.0.regex_strings()) + } +} + +#[allow(dead_code)] fn as_bytes_str(text: &str) -> &[u8] { text.as_bytes() } +#[allow(dead_code)] fn as_bytes_bytes(text: &[u8]) -> &[u8] { text } + } + } +} + +define_set! { + unicode, + set_unicode, + &str, + as_bytes_str, +/// ```rust +/// # use regex::RegexSet; +/// let set = RegexSet::new(&[ +/// r"[a-z]+@[a-z]+\.(com|org|net)", +/// r"[a-z]+\.(com|org|net)", +/// ]).unwrap(); +/// +/// // Ask whether any regexes in the set match. +/// assert!(set.is_match("foo@example.com")); +/// +/// // Identify which regexes in the set match. +/// let matches: Vec<_> = set.matches("foo@example.com").into_iter().collect(); +/// assert_eq!(vec![0, 1], matches); +/// +/// // Try again, but with text that only matches one of the regexes. +/// let matches: Vec<_> = set.matches("example.com").into_iter().collect(); +/// assert_eq!(vec![1], matches); +/// +/// // Try again, but with text that doesn't match any regex in the set. +/// let matches: Vec<_> = set.matches("example").into_iter().collect(); +/// assert!(matches.is_empty()); +/// ``` +} + +define_set! { + bytes, + set_bytes, + &[u8], + as_bytes_bytes, +/// ```rust +/// # use regex::bytes::RegexSet; +/// let set = RegexSet::new(&[ +/// r"[a-z]+@[a-z]+\.(com|org|net)", +/// r"[a-z]+\.(com|org|net)", +/// ]).unwrap(); +/// +/// // Ask whether any regexes in the set match. +/// assert!(set.is_match(b"foo@example.com")); +/// +/// // Identify which regexes in the set match. +/// let matches: Vec<_> = set.matches(b"foo@example.com").into_iter().collect(); +/// assert_eq!(vec![0, 1], matches); +/// +/// // Try again, but with text that only matches one of the regexes. +/// let matches: Vec<_> = set.matches(b"example.com").into_iter().collect(); +/// assert_eq!(vec![1], matches); +/// +/// // Try again, but with text that doesn't match any regex in the set. +/// let matches: Vec<_> = set.matches(b"example").into_iter().collect(); +/// assert!(matches.is_empty()); +/// ``` +} diff --git a/third_party/rust/regex/src/re_trait.rs b/third_party/rust/regex/src/re_trait.rs new file mode 100644 index 0000000000..d0c717df5a --- /dev/null +++ b/third_party/rust/regex/src/re_trait.rs @@ -0,0 +1,294 @@ +use std::fmt; +use std::iter::FusedIterator; + +/// Slot is a single saved capture location. Note that there are two slots for +/// every capture in a regular expression (one slot each for the start and end +/// of the capture). +pub type Slot = Option<usize>; + +/// Locations represents the offsets of each capturing group in a regex for +/// a single match. +/// +/// Unlike `Captures`, a `Locations` value only stores offsets. +#[doc(hidden)] +#[derive(Clone, Debug)] +pub struct Locations(Vec<Slot>); + +impl Locations { + /// Returns the start and end positions of the Nth capture group. Returns + /// `None` if `i` is not a valid capture group or if the capture group did + /// not match anything. The positions returned are *always* byte indices + /// with respect to the original string matched. + pub fn pos(&self, i: usize) -> Option<(usize, usize)> { + let (s, e) = (i * 2, i * 2 + 1); + match (self.0.get(s), self.0.get(e)) { + (Some(&Some(s)), Some(&Some(e))) => Some((s, e)), + _ => None, + } + } + + /// Creates an iterator of all the capture group positions in order of + /// appearance in the regular expression. Positions are byte indices + /// in terms of the original string matched. + pub fn iter(&self) -> SubCapturesPosIter<'_> { + SubCapturesPosIter { idx: 0, locs: self } + } + + /// Returns the total number of capturing groups. + /// + /// This is always at least `1` since every regex has at least `1` + /// capturing group that corresponds to the entire match. + pub fn len(&self) -> usize { + self.0.len() / 2 + } + + /// Return the individual slots as a slice. + pub(crate) fn as_slots(&mut self) -> &mut [Slot] { + &mut self.0 + } +} + +/// An iterator over capture group positions for a particular match of a +/// regular expression. +/// +/// Positions are byte indices in terms of the original string matched. +/// +/// `'c` is the lifetime of the captures. +#[derive(Clone, Debug)] +pub struct SubCapturesPosIter<'c> { + idx: usize, + locs: &'c Locations, +} + +impl<'c> Iterator for SubCapturesPosIter<'c> { + type Item = Option<(usize, usize)>; + + fn next(&mut self) -> Option<Option<(usize, usize)>> { + if self.idx >= self.locs.len() { + return None; + } + let x = match self.locs.pos(self.idx) { + None => Some(None), + Some((s, e)) => Some(Some((s, e))), + }; + self.idx += 1; + x + } + + fn size_hint(&self) -> (usize, Option<usize>) { + let len = self.locs.len() - self.idx; + (len, Some(len)) + } + + fn count(self) -> usize { + self.len() + } +} + +impl<'c> ExactSizeIterator for SubCapturesPosIter<'c> {} + +impl<'c> FusedIterator for SubCapturesPosIter<'c> {} + +/// `RegularExpression` describes types that can implement regex searching. +/// +/// This trait is my attempt at reducing code duplication and to standardize +/// the internal API. Specific duplication that is avoided are the `find` +/// and `capture` iterators, which are slightly tricky. +/// +/// It's not clear whether this trait is worth it, and it also isn't +/// clear whether it's useful as a public trait or not. Methods like +/// `next_after_empty` reak of bad design, but the rest of the methods seem +/// somewhat reasonable. One particular thing this trait would expose would be +/// the ability to start the search of a regex anywhere in a haystack, which +/// isn't possible in the current public API. +pub trait RegularExpression: Sized + fmt::Debug { + /// The type of the haystack. + type Text: ?Sized + fmt::Debug; + + /// The number of capture slots in the compiled regular expression. This is + /// always two times the number of capture groups (two slots per group). + fn slots_len(&self) -> usize; + + /// Allocates fresh space for all capturing groups in this regex. + fn locations(&self) -> Locations { + Locations(vec![None; self.slots_len()]) + } + + /// Returns the position of the next character after `i`. + /// + /// For example, a haystack with type `&[u8]` probably returns `i+1`, + /// whereas a haystack with type `&str` probably returns `i` plus the + /// length of the next UTF-8 sequence. + fn next_after_empty(&self, text: &Self::Text, i: usize) -> usize; + + /// Returns the location of the shortest match. + fn shortest_match_at( + &self, + text: &Self::Text, + start: usize, + ) -> Option<usize>; + + /// Returns whether the regex matches the text given. + fn is_match_at(&self, text: &Self::Text, start: usize) -> bool; + + /// Returns the leftmost-first match location if one exists. + fn find_at( + &self, + text: &Self::Text, + start: usize, + ) -> Option<(usize, usize)>; + + /// Returns the leftmost-first match location if one exists, and also + /// fills in any matching capture slot locations. + fn captures_read_at( + &self, + locs: &mut Locations, + text: &Self::Text, + start: usize, + ) -> Option<(usize, usize)>; + + /// Returns an iterator over all non-overlapping successive leftmost-first + /// matches. + fn find_iter(self, text: &Self::Text) -> Matches<'_, Self> { + Matches { re: self, text, last_end: 0, last_match: None } + } + + /// Returns an iterator over all non-overlapping successive leftmost-first + /// matches with captures. + fn captures_iter(self, text: &Self::Text) -> CaptureMatches<'_, Self> { + CaptureMatches(self.find_iter(text)) + } +} + +/// An iterator over all non-overlapping successive leftmost-first matches. +#[derive(Debug)] +pub struct Matches<'t, R> +where + R: RegularExpression, + R::Text: 't, +{ + re: R, + text: &'t R::Text, + last_end: usize, + last_match: Option<usize>, +} + +impl<'t, R> Matches<'t, R> +where + R: RegularExpression, + R::Text: 't, +{ + /// Return the text being searched. + pub fn text(&self) -> &'t R::Text { + self.text + } + + /// Return the underlying regex. + pub fn regex(&self) -> &R { + &self.re + } +} + +impl<'t, R> Iterator for Matches<'t, R> +where + R: RegularExpression, + R::Text: 't + AsRef<[u8]>, +{ + type Item = (usize, usize); + + fn next(&mut self) -> Option<(usize, usize)> { + if self.last_end > self.text.as_ref().len() { + return None; + } + let (s, e) = match self.re.find_at(self.text, self.last_end) { + None => return None, + Some((s, e)) => (s, e), + }; + if s == e { + // This is an empty match. To ensure we make progress, start + // the next search at the smallest possible starting position + // of the next match following this one. + self.last_end = self.re.next_after_empty(self.text, e); + // Don't accept empty matches immediately following a match. + // Just move on to the next match. + if Some(e) == self.last_match { + return self.next(); + } + } else { + self.last_end = e; + } + self.last_match = Some(e); + Some((s, e)) + } +} + +impl<'t, R> FusedIterator for Matches<'t, R> +where + R: RegularExpression, + R::Text: 't + AsRef<[u8]>, +{ +} + +/// An iterator over all non-overlapping successive leftmost-first matches with +/// captures. +#[derive(Debug)] +pub struct CaptureMatches<'t, R>(Matches<'t, R>) +where + R: RegularExpression, + R::Text: 't; + +impl<'t, R> CaptureMatches<'t, R> +where + R: RegularExpression, + R::Text: 't, +{ + /// Return the text being searched. + pub fn text(&self) -> &'t R::Text { + self.0.text() + } + + /// Return the underlying regex. + pub fn regex(&self) -> &R { + self.0.regex() + } +} + +impl<'t, R> Iterator for CaptureMatches<'t, R> +where + R: RegularExpression, + R::Text: 't + AsRef<[u8]>, +{ + type Item = Locations; + + fn next(&mut self) -> Option<Locations> { + if self.0.last_end > self.0.text.as_ref().len() { + return None; + } + let mut locs = self.0.re.locations(); + let (s, e) = match self.0.re.captures_read_at( + &mut locs, + self.0.text, + self.0.last_end, + ) { + None => return None, + Some((s, e)) => (s, e), + }; + if s == e { + self.0.last_end = self.0.re.next_after_empty(self.0.text, e); + if Some(e) == self.0.last_match { + return self.next(); + } + } else { + self.0.last_end = e; + } + self.0.last_match = Some(e); + Some(locs) + } +} + +impl<'t, R> FusedIterator for CaptureMatches<'t, R> +where + R: RegularExpression, + R::Text: 't + AsRef<[u8]>, +{ +} diff --git a/third_party/rust/regex/src/re_unicode.rs b/third_party/rust/regex/src/re_unicode.rs new file mode 100644 index 0000000000..60d81a7d95 --- /dev/null +++ b/third_party/rust/regex/src/re_unicode.rs @@ -0,0 +1,1311 @@ +use std::borrow::Cow; +use std::collections::HashMap; +use std::fmt; +use std::iter::FusedIterator; +use std::ops::{Index, Range}; +use std::str::FromStr; +use std::sync::Arc; + +use crate::find_byte::find_byte; + +use crate::error::Error; +use crate::exec::{Exec, ExecNoSyncStr}; +use crate::expand::expand_str; +use crate::re_builder::unicode::RegexBuilder; +use crate::re_trait::{self, RegularExpression, SubCapturesPosIter}; + +/// Escapes all regular expression meta characters in `text`. +/// +/// The string returned may be safely used as a literal in a regular +/// expression. +pub fn escape(text: &str) -> String { + regex_syntax::escape(text) +} + +/// Match represents a single match of a regex in a haystack. +/// +/// The lifetime parameter `'t` refers to the lifetime of the matched text. +#[derive(Copy, Clone, Debug, Eq, PartialEq)] +pub struct Match<'t> { + text: &'t str, + start: usize, + end: usize, +} + +impl<'t> Match<'t> { + /// Returns the starting byte offset of the match in the haystack. + #[inline] + pub fn start(&self) -> usize { + self.start + } + + /// Returns the ending byte offset of the match in the haystack. + #[inline] + pub fn end(&self) -> usize { + self.end + } + + /// Returns the range over the starting and ending byte offsets of the + /// match in the haystack. + #[inline] + pub fn range(&self) -> Range<usize> { + self.start..self.end + } + + /// Returns the matched text. + #[inline] + pub fn as_str(&self) -> &'t str { + &self.text[self.range()] + } + + /// Creates a new match from the given haystack and byte offsets. + #[inline] + fn new(haystack: &'t str, start: usize, end: usize) -> Match<'t> { + Match { text: haystack, start, end } + } +} + +impl<'t> From<Match<'t>> for &'t str { + fn from(m: Match<'t>) -> &'t str { + m.as_str() + } +} + +impl<'t> From<Match<'t>> for Range<usize> { + fn from(m: Match<'t>) -> Range<usize> { + m.range() + } +} + +/// A compiled regular expression for matching Unicode strings. +/// +/// It is represented as either a sequence of bytecode instructions (dynamic) +/// or as a specialized Rust function (native). It can be used to search, split +/// or replace text. All searching is done with an implicit `.*?` at the +/// beginning and end of an expression. To force an expression to match the +/// whole string (or a prefix or a suffix), you must use an anchor like `^` or +/// `$` (or `\A` and `\z`). +/// +/// While this crate will handle Unicode strings (whether in the regular +/// expression or in the search text), all positions returned are **byte +/// indices**. Every byte index is guaranteed to be at a Unicode code point +/// boundary. +/// +/// The lifetimes `'r` and `'t` in this crate correspond to the lifetime of a +/// compiled regular expression and text to search, respectively. +/// +/// The only methods that allocate new strings are the string replacement +/// methods. All other methods (searching and splitting) return borrowed +/// pointers into the string given. +/// +/// # Examples +/// +/// Find the location of a US phone number: +/// +/// ```rust +/// # use regex::Regex; +/// let re = Regex::new("[0-9]{3}-[0-9]{3}-[0-9]{4}").unwrap(); +/// let mat = re.find("phone: 111-222-3333").unwrap(); +/// assert_eq!((mat.start(), mat.end()), (7, 19)); +/// ``` +/// +/// # Using the `std::str::pattern` methods with `Regex` +/// +/// > **Note**: This section requires that this crate is compiled with the +/// > `pattern` Cargo feature enabled, which **requires nightly Rust**. +/// +/// Since `Regex` implements `Pattern`, you can use regexes with methods +/// defined on `&str`. For example, `is_match`, `find`, `find_iter` +/// and `split` can be replaced with `str::contains`, `str::find`, +/// `str::match_indices` and `str::split`. +/// +/// Here are some examples: +/// +/// ```rust,ignore +/// # use regex::Regex; +/// let re = Regex::new(r"\d+").unwrap(); +/// let haystack = "a111b222c"; +/// +/// assert!(haystack.contains(&re)); +/// assert_eq!(haystack.find(&re), Some(1)); +/// assert_eq!(haystack.match_indices(&re).collect::<Vec<_>>(), +/// vec![(1, "111"), (5, "222")]); +/// assert_eq!(haystack.split(&re).collect::<Vec<_>>(), vec!["a", "b", "c"]); +/// ``` +#[derive(Clone)] +pub struct Regex(Exec); + +impl fmt::Display for Regex { + /// Shows the original regular expression. + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}", self.as_str()) + } +} + +impl fmt::Debug for Regex { + /// Shows the original regular expression. + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt::Display::fmt(self, f) + } +} + +#[doc(hidden)] +impl From<Exec> for Regex { + fn from(exec: Exec) -> Regex { + Regex(exec) + } +} + +impl FromStr for Regex { + type Err = Error; + + /// Attempts to parse a string into a regular expression + fn from_str(s: &str) -> Result<Regex, Error> { + Regex::new(s) + } +} + +/// Core regular expression methods. +impl Regex { + /// Compiles a regular expression. Once compiled, it can be used repeatedly + /// to search, split or replace text in a string. + /// + /// If an invalid expression is given, then an error is returned. + pub fn new(re: &str) -> Result<Regex, Error> { + RegexBuilder::new(re).build() + } + + /// Returns true if and only if there is a match for the regex in the + /// string given. + /// + /// It is recommended to use this method if all you need to do is test + /// a match, since the underlying matching engine may be able to do less + /// work. + /// + /// # Example + /// + /// Test if some text contains at least one word with exactly 13 + /// Unicode word characters: + /// + /// ```rust + /// # use regex::Regex; + /// # fn main() { + /// let text = "I categorically deny having triskaidekaphobia."; + /// assert!(Regex::new(r"\b\w{13}\b").unwrap().is_match(text)); + /// # } + /// ``` + pub fn is_match(&self, text: &str) -> bool { + self.is_match_at(text, 0) + } + + /// Returns the start and end byte range of the leftmost-first match in + /// `text`. If no match exists, then `None` is returned. + /// + /// Note that this should only be used if you want to discover the position + /// of the match. Testing the existence of a match is faster if you use + /// `is_match`. + /// + /// # Example + /// + /// Find the start and end location of the first word with exactly 13 + /// Unicode word characters: + /// + /// ```rust + /// # use regex::Regex; + /// # fn main() { + /// let text = "I categorically deny having triskaidekaphobia."; + /// let mat = Regex::new(r"\b\w{13}\b").unwrap().find(text).unwrap(); + /// assert_eq!(mat.start(), 2); + /// assert_eq!(mat.end(), 15); + /// # } + /// ``` + pub fn find<'t>(&self, text: &'t str) -> Option<Match<'t>> { + self.find_at(text, 0) + } + + /// Returns an iterator for each successive non-overlapping match in + /// `text`, returning the start and end byte indices with respect to + /// `text`. + /// + /// # Example + /// + /// Find the start and end location of every word with exactly 13 Unicode + /// word characters: + /// + /// ```rust + /// # use regex::Regex; + /// # fn main() { + /// let text = "Retroactively relinquishing remunerations is reprehensible."; + /// for mat in Regex::new(r"\b\w{13}\b").unwrap().find_iter(text) { + /// println!("{:?}", mat); + /// } + /// # } + /// ``` + pub fn find_iter<'r, 't>(&'r self, text: &'t str) -> Matches<'r, 't> { + Matches(self.0.searcher_str().find_iter(text)) + } + + /// Returns the capture groups corresponding to the leftmost-first + /// match in `text`. Capture group `0` always corresponds to the entire + /// match. If no match is found, then `None` is returned. + /// + /// You should only use `captures` if you need access to the location of + /// capturing group matches. Otherwise, `find` is faster for discovering + /// the location of the overall match. + /// + /// # Examples + /// + /// Say you have some text with movie names and their release years, + /// like "'Citizen Kane' (1941)". It'd be nice if we could search for text + /// looking like that, while also extracting the movie name and its release + /// year separately. + /// + /// ```rust + /// # use regex::Regex; + /// # fn main() { + /// let re = Regex::new(r"'([^']+)'\s+\((\d{4})\)").unwrap(); + /// let text = "Not my favorite movie: 'Citizen Kane' (1941)."; + /// let caps = re.captures(text).unwrap(); + /// assert_eq!(caps.get(1).unwrap().as_str(), "Citizen Kane"); + /// assert_eq!(caps.get(2).unwrap().as_str(), "1941"); + /// assert_eq!(caps.get(0).unwrap().as_str(), "'Citizen Kane' (1941)"); + /// // You can also access the groups by index using the Index notation. + /// // Note that this will panic on an invalid index. + /// assert_eq!(&caps[1], "Citizen Kane"); + /// assert_eq!(&caps[2], "1941"); + /// assert_eq!(&caps[0], "'Citizen Kane' (1941)"); + /// # } + /// ``` + /// + /// Note that the full match is at capture group `0`. Each subsequent + /// capture group is indexed by the order of its opening `(`. + /// + /// We can make this example a bit clearer by using *named* capture groups: + /// + /// ```rust + /// # use regex::Regex; + /// # fn main() { + /// let re = Regex::new(r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)") + /// .unwrap(); + /// let text = "Not my favorite movie: 'Citizen Kane' (1941)."; + /// let caps = re.captures(text).unwrap(); + /// assert_eq!(caps.name("title").unwrap().as_str(), "Citizen Kane"); + /// assert_eq!(caps.name("year").unwrap().as_str(), "1941"); + /// assert_eq!(caps.get(0).unwrap().as_str(), "'Citizen Kane' (1941)"); + /// // You can also access the groups by name using the Index notation. + /// // Note that this will panic on an invalid group name. + /// assert_eq!(&caps["title"], "Citizen Kane"); + /// assert_eq!(&caps["year"], "1941"); + /// assert_eq!(&caps[0], "'Citizen Kane' (1941)"); + /// + /// # } + /// ``` + /// + /// Here we name the capture groups, which we can access with the `name` + /// method or the `Index` notation with a `&str`. Note that the named + /// capture groups are still accessible with `get` or the `Index` notation + /// with a `usize`. + /// + /// The `0`th capture group is always unnamed, so it must always be + /// accessed with `get(0)` or `[0]`. + pub fn captures<'t>(&self, text: &'t str) -> Option<Captures<'t>> { + let mut locs = self.capture_locations(); + self.captures_read_at(&mut locs, text, 0).map(move |_| Captures { + text, + locs: locs.0, + named_groups: self.0.capture_name_idx().clone(), + }) + } + + /// Returns an iterator over all the non-overlapping capture groups matched + /// in `text`. This is operationally the same as `find_iter`, except it + /// yields information about capturing group matches. + /// + /// # Example + /// + /// We can use this to find all movie titles and their release years in + /// some text, where the movie is formatted like "'Title' (xxxx)": + /// + /// ```rust + /// # use regex::Regex; + /// # fn main() { + /// let re = Regex::new(r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)") + /// .unwrap(); + /// let text = "'Citizen Kane' (1941), 'The Wizard of Oz' (1939), 'M' (1931)."; + /// for caps in re.captures_iter(text) { + /// println!("Movie: {:?}, Released: {:?}", + /// &caps["title"], &caps["year"]); + /// } + /// // Output: + /// // Movie: Citizen Kane, Released: 1941 + /// // Movie: The Wizard of Oz, Released: 1939 + /// // Movie: M, Released: 1931 + /// # } + /// ``` + pub fn captures_iter<'r, 't>( + &'r self, + text: &'t str, + ) -> CaptureMatches<'r, 't> { + CaptureMatches(self.0.searcher_str().captures_iter(text)) + } + + /// Returns an iterator of substrings of `text` delimited by a match of the + /// regular expression. Namely, each element of the iterator corresponds to + /// text that *isn't* matched by the regular expression. + /// + /// This method will *not* copy the text given. + /// + /// # Example + /// + /// To split a string delimited by arbitrary amounts of spaces or tabs: + /// + /// ```rust + /// # use regex::Regex; + /// # fn main() { + /// let re = Regex::new(r"[ \t]+").unwrap(); + /// let fields: Vec<&str> = re.split("a b \t c\td e").collect(); + /// assert_eq!(fields, vec!["a", "b", "c", "d", "e"]); + /// # } + /// ``` + pub fn split<'r, 't>(&'r self, text: &'t str) -> Split<'r, 't> { + Split { finder: self.find_iter(text), last: 0 } + } + + /// Returns an iterator of at most `limit` substrings of `text` delimited + /// by a match of the regular expression. (A `limit` of `0` will return no + /// substrings.) Namely, each element of the iterator corresponds to text + /// that *isn't* matched by the regular expression. The remainder of the + /// string that is not split will be the last element in the iterator. + /// + /// This method will *not* copy the text given. + /// + /// # Example + /// + /// Get the first two words in some text: + /// + /// ```rust + /// # use regex::Regex; + /// # fn main() { + /// let re = Regex::new(r"\W+").unwrap(); + /// let fields: Vec<&str> = re.splitn("Hey! How are you?", 3).collect(); + /// assert_eq!(fields, vec!("Hey", "How", "are you?")); + /// # } + /// ``` + pub fn splitn<'r, 't>( + &'r self, + text: &'t str, + limit: usize, + ) -> SplitN<'r, 't> { + SplitN { splits: self.split(text), n: limit } + } + + /// Replaces the leftmost-first match with the replacement provided. + /// The replacement can be a regular string (where `$N` and `$name` are + /// expanded to match capture groups) or a function that takes the matches' + /// `Captures` and returns the replaced string. + /// + /// If no match is found, then a copy of the string is returned unchanged. + /// + /// # Replacement string syntax + /// + /// All instances of `$name` in the replacement text is replaced with the + /// corresponding capture group `name`. + /// + /// `name` may be an integer corresponding to the index of the + /// capture group (counted by order of opening parenthesis where `0` is the + /// entire match) or it can be a name (consisting of letters, digits or + /// underscores) corresponding to a named capture group. + /// + /// If `name` isn't a valid capture group (whether the name doesn't exist + /// or isn't a valid index), then it is replaced with the empty string. + /// + /// The longest possible name is used. e.g., `$1a` looks up the capture + /// group named `1a` and not the capture group at index `1`. To exert more + /// precise control over the name, use braces, e.g., `${1}a`. + /// + /// To write a literal `$` use `$$`. + /// + /// # Examples + /// + /// Note that this function is polymorphic with respect to the replacement. + /// In typical usage, this can just be a normal string: + /// + /// ```rust + /// # use regex::Regex; + /// # fn main() { + /// let re = Regex::new("[^01]+").unwrap(); + /// assert_eq!(re.replace("1078910", ""), "1010"); + /// # } + /// ``` + /// + /// But anything satisfying the `Replacer` trait will work. For example, + /// a closure of type `|&Captures| -> String` provides direct access to the + /// captures corresponding to a match. This allows one to access + /// capturing group matches easily: + /// + /// ```rust + /// # use regex::Regex; + /// # use regex::Captures; fn main() { + /// let re = Regex::new(r"([^,\s]+),\s+(\S+)").unwrap(); + /// let result = re.replace("Springsteen, Bruce", |caps: &Captures| { + /// format!("{} {}", &caps[2], &caps[1]) + /// }); + /// assert_eq!(result, "Bruce Springsteen"); + /// # } + /// ``` + /// + /// But this is a bit cumbersome to use all the time. Instead, a simple + /// syntax is supported that expands `$name` into the corresponding capture + /// group. Here's the last example, but using this expansion technique + /// with named capture groups: + /// + /// ```rust + /// # use regex::Regex; + /// # fn main() { + /// let re = Regex::new(r"(?P<last>[^,\s]+),\s+(?P<first>\S+)").unwrap(); + /// let result = re.replace("Springsteen, Bruce", "$first $last"); + /// assert_eq!(result, "Bruce Springsteen"); + /// # } + /// ``` + /// + /// Note that using `$2` instead of `$first` or `$1` instead of `$last` + /// would produce the same result. To write a literal `$` use `$$`. + /// + /// Sometimes the replacement string requires use of curly braces to + /// delineate a capture group replacement and surrounding literal text. + /// For example, if we wanted to join two words together with an + /// underscore: + /// + /// ```rust + /// # use regex::Regex; + /// # fn main() { + /// let re = Regex::new(r"(?P<first>\w+)\s+(?P<second>\w+)").unwrap(); + /// let result = re.replace("deep fried", "${first}_$second"); + /// assert_eq!(result, "deep_fried"); + /// # } + /// ``` + /// + /// Without the curly braces, the capture group name `first_` would be + /// used, and since it doesn't exist, it would be replaced with the empty + /// string. + /// + /// Finally, sometimes you just want to replace a literal string with no + /// regard for capturing group expansion. This can be done by wrapping a + /// byte string with `NoExpand`: + /// + /// ```rust + /// # use regex::Regex; + /// # fn main() { + /// use regex::NoExpand; + /// + /// let re = Regex::new(r"(?P<last>[^,\s]+),\s+(\S+)").unwrap(); + /// let result = re.replace("Springsteen, Bruce", NoExpand("$2 $last")); + /// assert_eq!(result, "$2 $last"); + /// # } + /// ``` + pub fn replace<'t, R: Replacer>( + &self, + text: &'t str, + rep: R, + ) -> Cow<'t, str> { + self.replacen(text, 1, rep) + } + + /// Replaces all non-overlapping matches in `text` with the replacement + /// provided. This is the same as calling `replacen` with `limit` set to + /// `0`. + /// + /// See the documentation for `replace` for details on how to access + /// capturing group matches in the replacement string. + pub fn replace_all<'t, R: Replacer>( + &self, + text: &'t str, + rep: R, + ) -> Cow<'t, str> { + self.replacen(text, 0, rep) + } + + /// Replaces at most `limit` non-overlapping matches in `text` with the + /// replacement provided. If `limit` is 0, then all non-overlapping matches + /// are replaced. + /// + /// See the documentation for `replace` for details on how to access + /// capturing group matches in the replacement string. + pub fn replacen<'t, R: Replacer>( + &self, + text: &'t str, + limit: usize, + mut rep: R, + ) -> Cow<'t, str> { + // If we know that the replacement doesn't have any capture expansions, + // then we can use the fast path. The fast path can make a tremendous + // difference: + // + // 1) We use `find_iter` instead of `captures_iter`. Not asking for + // captures generally makes the regex engines faster. + // 2) We don't need to look up all of the capture groups and do + // replacements inside the replacement string. We just push it + // at each match and be done with it. + if let Some(rep) = rep.no_expansion() { + let mut it = self.find_iter(text).enumerate().peekable(); + if it.peek().is_none() { + return Cow::Borrowed(text); + } + let mut new = String::with_capacity(text.len()); + let mut last_match = 0; + for (i, m) in it { + if limit > 0 && i >= limit { + break; + } + new.push_str(&text[last_match..m.start()]); + new.push_str(&rep); + last_match = m.end(); + } + new.push_str(&text[last_match..]); + return Cow::Owned(new); + } + + // The slower path, which we use if the replacement needs access to + // capture groups. + let mut it = self.captures_iter(text).enumerate().peekable(); + if it.peek().is_none() { + return Cow::Borrowed(text); + } + let mut new = String::with_capacity(text.len()); + let mut last_match = 0; + for (i, cap) in it { + if limit > 0 && i >= limit { + break; + } + // unwrap on 0 is OK because captures only reports matches + let m = cap.get(0).unwrap(); + new.push_str(&text[last_match..m.start()]); + rep.replace_append(&cap, &mut new); + last_match = m.end(); + } + new.push_str(&text[last_match..]); + Cow::Owned(new) + } +} + +/// Advanced or "lower level" search methods. +impl Regex { + /// Returns the end location of a match in the text given. + /// + /// This method may have the same performance characteristics as + /// `is_match`, except it provides an end location for a match. In + /// particular, the location returned *may be shorter* than the proper end + /// of the leftmost-first match. + /// + /// # Example + /// + /// Typically, `a+` would match the entire first sequence of `a` in some + /// text, but `shortest_match` can give up as soon as it sees the first + /// `a`. + /// + /// ```rust + /// # use regex::Regex; + /// # fn main() { + /// let text = "aaaaa"; + /// let pos = Regex::new(r"a+").unwrap().shortest_match(text); + /// assert_eq!(pos, Some(1)); + /// # } + /// ``` + pub fn shortest_match(&self, text: &str) -> Option<usize> { + self.shortest_match_at(text, 0) + } + + /// Returns the same as shortest_match, but starts the search at the given + /// offset. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, the `\A` anchor can only + /// match when `start == 0`. + pub fn shortest_match_at( + &self, + text: &str, + start: usize, + ) -> Option<usize> { + self.0.searcher_str().shortest_match_at(text, start) + } + + /// Returns the same as is_match, but starts the search at the given + /// offset. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, the `\A` anchor can only + /// match when `start == 0`. + pub fn is_match_at(&self, text: &str, start: usize) -> bool { + self.0.searcher_str().is_match_at(text, start) + } + + /// Returns the same as find, but starts the search at the given + /// offset. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, the `\A` anchor can only + /// match when `start == 0`. + pub fn find_at<'t>( + &self, + text: &'t str, + start: usize, + ) -> Option<Match<'t>> { + self.0 + .searcher_str() + .find_at(text, start) + .map(|(s, e)| Match::new(text, s, e)) + } + + /// This is like `captures`, but uses + /// [`CaptureLocations`](struct.CaptureLocations.html) + /// instead of + /// [`Captures`](struct.Captures.html) in order to amortize allocations. + /// + /// To create a `CaptureLocations` value, use the + /// `Regex::capture_locations` method. + /// + /// This returns the overall match if this was successful, which is always + /// equivalence to the `0`th capture group. + pub fn captures_read<'t>( + &self, + locs: &mut CaptureLocations, + text: &'t str, + ) -> Option<Match<'t>> { + self.captures_read_at(locs, text, 0) + } + + /// Returns the same as captures, but starts the search at the given + /// offset and populates the capture locations given. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, the `\A` anchor can only + /// match when `start == 0`. + pub fn captures_read_at<'t>( + &self, + locs: &mut CaptureLocations, + text: &'t str, + start: usize, + ) -> Option<Match<'t>> { + self.0 + .searcher_str() + .captures_read_at(&mut locs.0, text, start) + .map(|(s, e)| Match::new(text, s, e)) + } + + /// An undocumented alias for `captures_read_at`. + /// + /// The `regex-capi` crate previously used this routine, so to avoid + /// breaking that crate, we continue to provide the name as an undocumented + /// alias. + #[doc(hidden)] + pub fn read_captures_at<'t>( + &self, + locs: &mut CaptureLocations, + text: &'t str, + start: usize, + ) -> Option<Match<'t>> { + self.captures_read_at(locs, text, start) + } +} + +/// Auxiliary methods. +impl Regex { + /// Returns the original string of this regex. + pub fn as_str(&self) -> &str { + &self.0.regex_strings()[0] + } + + /// Returns an iterator over the capture names. + pub fn capture_names(&self) -> CaptureNames<'_> { + CaptureNames(self.0.capture_names().iter()) + } + + /// Returns the number of captures. + pub fn captures_len(&self) -> usize { + self.0.capture_names().len() + } + + /// Returns an empty set of capture locations that can be reused in + /// multiple calls to `captures_read` or `captures_read_at`. + pub fn capture_locations(&self) -> CaptureLocations { + CaptureLocations(self.0.searcher_str().locations()) + } + + /// An alias for `capture_locations` to preserve backward compatibility. + /// + /// The `regex-capi` crate uses this method, so to avoid breaking that + /// crate, we continue to export it as an undocumented API. + #[doc(hidden)] + pub fn locations(&self) -> CaptureLocations { + CaptureLocations(self.0.searcher_str().locations()) + } +} + +/// An iterator over the names of all possible captures. +/// +/// `None` indicates an unnamed capture; the first element (capture 0, the +/// whole matched region) is always unnamed. +/// +/// `'r` is the lifetime of the compiled regular expression. +#[derive(Clone, Debug)] +pub struct CaptureNames<'r>(::std::slice::Iter<'r, Option<String>>); + +impl<'r> Iterator for CaptureNames<'r> { + type Item = Option<&'r str>; + + fn next(&mut self) -> Option<Option<&'r str>> { + self.0 + .next() + .as_ref() + .map(|slot| slot.as_ref().map(|name| name.as_ref())) + } + + fn size_hint(&self) -> (usize, Option<usize>) { + self.0.size_hint() + } + + fn count(self) -> usize { + self.0.count() + } +} + +impl<'r> ExactSizeIterator for CaptureNames<'r> {} + +impl<'r> FusedIterator for CaptureNames<'r> {} + +/// Yields all substrings delimited by a regular expression match. +/// +/// `'r` is the lifetime of the compiled regular expression and `'t` is the +/// lifetime of the string being split. +#[derive(Debug)] +pub struct Split<'r, 't> { + finder: Matches<'r, 't>, + last: usize, +} + +impl<'r, 't> Iterator for Split<'r, 't> { + type Item = &'t str; + + fn next(&mut self) -> Option<&'t str> { + let text = self.finder.0.text(); + match self.finder.next() { + None => { + if self.last > text.len() { + None + } else { + let s = &text[self.last..]; + self.last = text.len() + 1; // Next call will return None + Some(s) + } + } + Some(m) => { + let matched = &text[self.last..m.start()]; + self.last = m.end(); + Some(matched) + } + } + } +} + +impl<'r, 't> FusedIterator for Split<'r, 't> {} + +/// Yields at most `N` substrings delimited by a regular expression match. +/// +/// The last substring will be whatever remains after splitting. +/// +/// `'r` is the lifetime of the compiled regular expression and `'t` is the +/// lifetime of the string being split. +#[derive(Debug)] +pub struct SplitN<'r, 't> { + splits: Split<'r, 't>, + n: usize, +} + +impl<'r, 't> Iterator for SplitN<'r, 't> { + type Item = &'t str; + + fn next(&mut self) -> Option<&'t str> { + if self.n == 0 { + return None; + } + + self.n -= 1; + if self.n > 0 { + return self.splits.next(); + } + + let text = self.splits.finder.0.text(); + if self.splits.last > text.len() { + // We've already returned all substrings. + None + } else { + // self.n == 0, so future calls will return None immediately + Some(&text[self.splits.last..]) + } + } + + fn size_hint(&self) -> (usize, Option<usize>) { + (0, Some(self.n)) + } +} + +impl<'r, 't> FusedIterator for SplitN<'r, 't> {} + +/// CaptureLocations is a low level representation of the raw offsets of each +/// submatch. +/// +/// You can think of this as a lower level +/// [`Captures`](struct.Captures.html), where this type does not support +/// named capturing groups directly and it does not borrow the text that these +/// offsets were matched on. +/// +/// Primarily, this type is useful when using the lower level `Regex` APIs +/// such as `read_captures`, which permits amortizing the allocation in which +/// capture match locations are stored. +/// +/// In order to build a value of this type, you'll need to call the +/// `capture_locations` method on the `Regex` being used to execute the search. +/// The value returned can then be reused in subsequent searches. +#[derive(Clone, Debug)] +pub struct CaptureLocations(re_trait::Locations); + +/// A type alias for `CaptureLocations` for backwards compatibility. +/// +/// Previously, we exported `CaptureLocations` as `Locations` in an +/// undocumented API. To prevent breaking that code (e.g., in `regex-capi`), +/// we continue re-exporting the same undocumented API. +#[doc(hidden)] +pub type Locations = CaptureLocations; + +impl CaptureLocations { + /// Returns the start and end positions of the Nth capture group. Returns + /// `None` if `i` is not a valid capture group or if the capture group did + /// not match anything. The positions returned are *always* byte indices + /// with respect to the original string matched. + #[inline] + pub fn get(&self, i: usize) -> Option<(usize, usize)> { + self.0.pos(i) + } + + /// Returns the total number of capture groups (even if they didn't match). + /// + /// This is always at least `1` since every regex has at least `1` + /// capturing group that corresponds to the entire match. + #[inline] + pub fn len(&self) -> usize { + self.0.len() + } + + /// An alias for the `get` method for backwards compatibility. + /// + /// Previously, we exported `get` as `pos` in an undocumented API. To + /// prevent breaking that code (e.g., in `regex-capi`), we continue + /// re-exporting the same undocumented API. + #[doc(hidden)] + #[inline] + pub fn pos(&self, i: usize) -> Option<(usize, usize)> { + self.get(i) + } +} + +/// Captures represents a group of captured strings for a single match. +/// +/// The 0th capture always corresponds to the entire match. Each subsequent +/// index corresponds to the next capture group in the regex. If a capture +/// group is named, then the matched string is *also* available via the `name` +/// method. (Note that the 0th capture is always unnamed and so must be +/// accessed with the `get` method.) +/// +/// Positions returned from a capture group are always byte indices. +/// +/// `'t` is the lifetime of the matched text. +pub struct Captures<'t> { + text: &'t str, + locs: re_trait::Locations, + named_groups: Arc<HashMap<String, usize>>, +} + +impl<'t> Captures<'t> { + /// Returns the match associated with the capture group at index `i`. If + /// `i` does not correspond to a capture group, or if the capture group + /// did not participate in the match, then `None` is returned. + /// + /// # Examples + /// + /// Get the text of the match with a default of an empty string if this + /// group didn't participate in the match: + /// + /// ```rust + /// # use regex::Regex; + /// let re = Regex::new(r"[a-z]+(?:([0-9]+)|([A-Z]+))").unwrap(); + /// let caps = re.captures("abc123").unwrap(); + /// + /// let text1 = caps.get(1).map_or("", |m| m.as_str()); + /// let text2 = caps.get(2).map_or("", |m| m.as_str()); + /// assert_eq!(text1, "123"); + /// assert_eq!(text2, ""); + /// ``` + pub fn get(&self, i: usize) -> Option<Match<'t>> { + self.locs.pos(i).map(|(s, e)| Match::new(self.text, s, e)) + } + + /// Returns the match for the capture group named `name`. If `name` isn't a + /// valid capture group or didn't match anything, then `None` is returned. + pub fn name(&self, name: &str) -> Option<Match<'t>> { + self.named_groups.get(name).and_then(|&i| self.get(i)) + } + + /// An iterator that yields all capturing matches in the order in which + /// they appear in the regex. If a particular capture group didn't + /// participate in the match, then `None` is yielded for that capture. + /// + /// The first match always corresponds to the overall match of the regex. + pub fn iter<'c>(&'c self) -> SubCaptureMatches<'c, 't> { + SubCaptureMatches { caps: self, it: self.locs.iter() } + } + + /// Expands all instances of `$name` in `replacement` to the corresponding + /// capture group `name`, and writes them to the `dst` buffer given. + /// + /// `name` may be an integer corresponding to the index of the capture + /// group (counted by order of opening parenthesis where `0` is the + /// entire match) or it can be a name (consisting of letters, digits or + /// underscores) corresponding to a named capture group. + /// + /// If `name` isn't a valid capture group (whether the name doesn't exist + /// or isn't a valid index), then it is replaced with the empty string. + /// + /// The longest possible name consisting of the characters `[_0-9A-Za-z]` + /// is used. e.g., `$1a` looks up the capture group named `1a` and not the + /// capture group at index `1`. To exert more precise control over the + /// name, or to refer to a capture group name that uses characters outside + /// of `[_0-9A-Za-z]`, use braces, e.g., `${1}a` or `${foo[bar].baz}`. When + /// using braces, any sequence of characters is permitted. If the sequence + /// does not refer to a capture group name in the corresponding regex, then + /// it is replaced with an empty string. + /// + /// To write a literal `$` use `$$`. + pub fn expand(&self, replacement: &str, dst: &mut String) { + expand_str(self, replacement, dst) + } + + /// Returns the total number of capture groups (even if they didn't match). + /// + /// This is always at least `1`, since every regex has at least one capture + /// group that corresponds to the full match. + #[inline] + pub fn len(&self) -> usize { + self.locs.len() + } +} + +impl<'t> fmt::Debug for Captures<'t> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_tuple("Captures").field(&CapturesDebug(self)).finish() + } +} + +struct CapturesDebug<'c, 't>(&'c Captures<'t>); + +impl<'c, 't> fmt::Debug for CapturesDebug<'c, 't> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + // We'd like to show something nice here, even if it means an + // allocation to build a reverse index. + let slot_to_name: HashMap<&usize, &String> = + self.0.named_groups.iter().map(|(a, b)| (b, a)).collect(); + let mut map = f.debug_map(); + for (slot, m) in self.0.locs.iter().enumerate() { + let m = m.map(|(s, e)| &self.0.text[s..e]); + if let Some(name) = slot_to_name.get(&slot) { + map.entry(&name, &m); + } else { + map.entry(&slot, &m); + } + } + map.finish() + } +} + +/// Get a group by index. +/// +/// `'t` is the lifetime of the matched text. +/// +/// The text can't outlive the `Captures` object if this method is +/// used, because of how `Index` is defined (normally `a[i]` is part +/// of `a` and can't outlive it); to do that, use `get()` instead. +/// +/// # Panics +/// +/// If there is no group at the given index. +impl<'t> Index<usize> for Captures<'t> { + type Output = str; + + fn index(&self, i: usize) -> &str { + self.get(i) + .map(|m| m.as_str()) + .unwrap_or_else(|| panic!("no group at index '{}'", i)) + } +} + +/// Get a group by name. +/// +/// `'t` is the lifetime of the matched text and `'i` is the lifetime +/// of the group name (the index). +/// +/// The text can't outlive the `Captures` object if this method is +/// used, because of how `Index` is defined (normally `a[i]` is part +/// of `a` and can't outlive it); to do that, use `name` instead. +/// +/// # Panics +/// +/// If there is no group named by the given value. +impl<'t, 'i> Index<&'i str> for Captures<'t> { + type Output = str; + + fn index<'a>(&'a self, name: &'i str) -> &'a str { + self.name(name) + .map(|m| m.as_str()) + .unwrap_or_else(|| panic!("no group named '{}'", name)) + } +} + +/// An iterator that yields all capturing matches in the order in which they +/// appear in the regex. +/// +/// If a particular capture group didn't participate in the match, then `None` +/// is yielded for that capture. The first match always corresponds to the +/// overall match of the regex. +/// +/// The lifetime `'c` corresponds to the lifetime of the `Captures` value, and +/// the lifetime `'t` corresponds to the originally matched text. +#[derive(Clone, Debug)] +pub struct SubCaptureMatches<'c, 't> { + caps: &'c Captures<'t>, + it: SubCapturesPosIter<'c>, +} + +impl<'c, 't> Iterator for SubCaptureMatches<'c, 't> { + type Item = Option<Match<'t>>; + + fn next(&mut self) -> Option<Option<Match<'t>>> { + self.it + .next() + .map(|cap| cap.map(|(s, e)| Match::new(self.caps.text, s, e))) + } + + fn size_hint(&self) -> (usize, Option<usize>) { + self.it.size_hint() + } + + fn count(self) -> usize { + self.it.count() + } +} + +impl<'c, 't> ExactSizeIterator for SubCaptureMatches<'c, 't> {} + +impl<'c, 't> FusedIterator for SubCaptureMatches<'c, 't> {} + +/// An iterator that yields all non-overlapping capture groups matching a +/// particular regular expression. +/// +/// The iterator stops when no more matches can be found. +/// +/// `'r` is the lifetime of the compiled regular expression and `'t` is the +/// lifetime of the matched string. +#[derive(Debug)] +pub struct CaptureMatches<'r, 't>( + re_trait::CaptureMatches<'t, ExecNoSyncStr<'r>>, +); + +impl<'r, 't> Iterator for CaptureMatches<'r, 't> { + type Item = Captures<'t>; + + fn next(&mut self) -> Option<Captures<'t>> { + self.0.next().map(|locs| Captures { + text: self.0.text(), + locs, + named_groups: self.0.regex().capture_name_idx().clone(), + }) + } +} + +impl<'r, 't> FusedIterator for CaptureMatches<'r, 't> {} + +/// An iterator over all non-overlapping matches for a particular string. +/// +/// The iterator yields a `Match` value. The iterator stops when no more +/// matches can be found. +/// +/// `'r` is the lifetime of the compiled regular expression and `'t` is the +/// lifetime of the matched string. +#[derive(Debug)] +pub struct Matches<'r, 't>(re_trait::Matches<'t, ExecNoSyncStr<'r>>); + +impl<'r, 't> Iterator for Matches<'r, 't> { + type Item = Match<'t>; + + fn next(&mut self) -> Option<Match<'t>> { + let text = self.0.text(); + self.0.next().map(|(s, e)| Match::new(text, s, e)) + } +} + +impl<'r, 't> FusedIterator for Matches<'r, 't> {} + +/// Replacer describes types that can be used to replace matches in a string. +/// +/// In general, users of this crate shouldn't need to implement this trait, +/// since implementations are already provided for `&str` along with other +/// variants of string types and `FnMut(&Captures) -> String` (or any +/// `FnMut(&Captures) -> T` where `T: AsRef<str>`), which covers most use cases. +pub trait Replacer { + /// Appends text to `dst` to replace the current match. + /// + /// The current match is represented by `caps`, which is guaranteed to + /// have a match at capture group `0`. + /// + /// For example, a no-op replacement would be + /// `dst.push_str(caps.get(0).unwrap().as_str())`. + fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String); + + /// Return a fixed unchanging replacement string. + /// + /// When doing replacements, if access to `Captures` is not needed (e.g., + /// the replacement byte string does not need `$` expansion), then it can + /// be beneficial to avoid finding sub-captures. + /// + /// In general, this is called once for every call to `replacen`. + fn no_expansion<'r>(&'r mut self) -> Option<Cow<'r, str>> { + None + } + + /// Return a `Replacer` that borrows and wraps this `Replacer`. + /// + /// This is useful when you want to take a generic `Replacer` (which might + /// not be cloneable) and use it without consuming it, so it can be used + /// more than once. + /// + /// # Example + /// + /// ``` + /// use regex::{Regex, Replacer}; + /// + /// fn replace_all_twice<R: Replacer>( + /// re: Regex, + /// src: &str, + /// mut rep: R, + /// ) -> String { + /// let dst = re.replace_all(src, rep.by_ref()); + /// let dst = re.replace_all(&dst, rep.by_ref()); + /// dst.into_owned() + /// } + /// ``` + fn by_ref<'r>(&'r mut self) -> ReplacerRef<'r, Self> { + ReplacerRef(self) + } +} + +/// By-reference adaptor for a `Replacer` +/// +/// Returned by [`Replacer::by_ref`](trait.Replacer.html#method.by_ref). +#[derive(Debug)] +pub struct ReplacerRef<'a, R: ?Sized>(&'a mut R); + +impl<'a, R: Replacer + ?Sized + 'a> Replacer for ReplacerRef<'a, R> { + fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) { + self.0.replace_append(caps, dst) + } + fn no_expansion(&mut self) -> Option<Cow<'_, str>> { + self.0.no_expansion() + } +} + +impl<'a> Replacer for &'a str { + fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) { + caps.expand(*self, dst); + } + + fn no_expansion(&mut self) -> Option<Cow<'_, str>> { + no_expansion(self) + } +} + +impl<'a> Replacer for &'a String { + fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) { + self.as_str().replace_append(caps, dst) + } + + fn no_expansion(&mut self) -> Option<Cow<'_, str>> { + no_expansion(self) + } +} + +impl Replacer for String { + fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) { + self.as_str().replace_append(caps, dst) + } + + fn no_expansion(&mut self) -> Option<Cow<'_, str>> { + no_expansion(self) + } +} + +impl<'a> Replacer for Cow<'a, str> { + fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) { + self.as_ref().replace_append(caps, dst) + } + + fn no_expansion(&mut self) -> Option<Cow<'_, str>> { + no_expansion(self) + } +} + +impl<'a> Replacer for &'a Cow<'a, str> { + fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) { + self.as_ref().replace_append(caps, dst) + } + + fn no_expansion(&mut self) -> Option<Cow<'_, str>> { + no_expansion(self) + } +} + +fn no_expansion<T: AsRef<str>>(t: &T) -> Option<Cow<'_, str>> { + let s = t.as_ref(); + match find_byte(b'$', s.as_bytes()) { + Some(_) => None, + None => Some(Cow::Borrowed(s)), + } +} + +impl<F, T> Replacer for F +where + F: FnMut(&Captures<'_>) -> T, + T: AsRef<str>, +{ + fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) { + dst.push_str((*self)(caps).as_ref()); + } +} + +/// `NoExpand` indicates literal string replacement. +/// +/// It can be used with `replace` and `replace_all` to do a literal string +/// replacement without expanding `$name` to their corresponding capture +/// groups. This can be both convenient (to avoid escaping `$`, for example) +/// and performant (since capture groups don't need to be found). +/// +/// `'t` is the lifetime of the literal text. +#[derive(Clone, Debug)] +pub struct NoExpand<'t>(pub &'t str); + +impl<'t> Replacer for NoExpand<'t> { + fn replace_append(&mut self, _: &Captures<'_>, dst: &mut String) { + dst.push_str(self.0); + } + + fn no_expansion(&mut self) -> Option<Cow<'_, str>> { + Some(Cow::Borrowed(self.0)) + } +} diff --git a/third_party/rust/regex/src/sparse.rs b/third_party/rust/regex/src/sparse.rs new file mode 100644 index 0000000000..98b726613d --- /dev/null +++ b/third_party/rust/regex/src/sparse.rs @@ -0,0 +1,84 @@ +use std::fmt; +use std::ops::Deref; +use std::slice; + +/// A sparse set used for representing ordered NFA states. +/// +/// This supports constant time addition and membership testing. Clearing an +/// entire set can also be done in constant time. Iteration yields elements +/// in the order in which they were inserted. +/// +/// The data structure is based on: https://research.swtch.com/sparse +/// Note though that we don't actually use uninitialized memory. We generally +/// reuse allocations, so the initial allocation cost is bareable. However, +/// its other properties listed above are extremely useful. +#[derive(Clone)] +pub struct SparseSet { + /// Dense contains the instruction pointers in the order in which they + /// were inserted. + dense: Vec<usize>, + /// Sparse maps instruction pointers to their location in dense. + /// + /// An instruction pointer is in the set if and only if + /// sparse[ip] < dense.len() && ip == dense[sparse[ip]]. + sparse: Box<[usize]>, +} + +impl SparseSet { + pub fn new(size: usize) -> SparseSet { + SparseSet { + dense: Vec::with_capacity(size), + sparse: vec![0; size].into_boxed_slice(), + } + } + + pub fn len(&self) -> usize { + self.dense.len() + } + + pub fn is_empty(&self) -> bool { + self.dense.is_empty() + } + + pub fn capacity(&self) -> usize { + self.dense.capacity() + } + + pub fn insert(&mut self, value: usize) { + let i = self.len(); + assert!(i < self.capacity()); + self.dense.push(value); + self.sparse[value] = i; + } + + pub fn contains(&self, value: usize) -> bool { + let i = self.sparse[value]; + self.dense.get(i) == Some(&value) + } + + pub fn clear(&mut self) { + self.dense.clear(); + } +} + +impl fmt::Debug for SparseSet { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "SparseSet({:?})", self.dense) + } +} + +impl Deref for SparseSet { + type Target = [usize]; + + fn deref(&self) -> &Self::Target { + &self.dense + } +} + +impl<'a> IntoIterator for &'a SparseSet { + type Item = &'a usize; + type IntoIter = slice::Iter<'a, usize>; + fn into_iter(self) -> Self::IntoIter { + self.iter() + } +} diff --git a/third_party/rust/regex/src/testdata/LICENSE b/third_party/rust/regex/src/testdata/LICENSE new file mode 100644 index 0000000000..f47dbf4c44 --- /dev/null +++ b/third_party/rust/regex/src/testdata/LICENSE @@ -0,0 +1,19 @@ +The following license covers testregex.c and all associated test data. + +Permission is hereby granted, free of charge, to any person obtaining a +copy of THIS SOFTWARE FILE (the "Software"), to deal in the Software +without restriction, including without limitation the rights to use, +copy, modify, merge, publish, distribute, and/or sell copies of the +Software, and to permit persons to whom the Software is furnished to do +so, subject to the following disclaimer: + +THIS SOFTWARE IS PROVIDED BY AT&T ``AS IS'' AND ANY EXPRESS OR IMPLIED +WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. +IN NO EVENT SHALL AT&T BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/third_party/rust/regex/src/testdata/README b/third_party/rust/regex/src/testdata/README new file mode 100644 index 0000000000..6efc2dad33 --- /dev/null +++ b/third_party/rust/regex/src/testdata/README @@ -0,0 +1,17 @@ +Test data was taken from the Go distribution, which was in turn taken from the +testregex test suite: + + http://www2.research.att.com/~astopen/testregex/testregex.html + +The LICENSE in this directory corresponds to the LICENSE that the data was +released under. + +The tests themselves were modified for RE2/Go. A couple were modified further +by me (Andrew Gallant) (only in repetition.dat) so that RE2/Go would pass them. +(Yes, it seems like RE2/Go includes failing test cases.) This may or may not +have been a bad idea, but I think being consistent with an established Regex +library is worth something. + +Note that these files are read by 'scripts/regex-match-tests.py' and turned +into Rust tests found in 'regex_macros/tests/matches.rs'. + diff --git a/third_party/rust/regex/src/testdata/basic.dat b/third_party/rust/regex/src/testdata/basic.dat new file mode 100644 index 0000000000..632e1bb416 --- /dev/null +++ b/third_party/rust/regex/src/testdata/basic.dat @@ -0,0 +1,221 @@ +NOTE all standard compliant implementations should pass these : 2002-05-31 + +BE abracadabra$ abracadabracadabra (7,18) +BE a...b abababbb (2,7) +BE XXXXXX ..XXXXXX (2,8) +E \) () (1,2) +BE a] a]a (0,2) +B } } (0,1) +E \} } (0,1) +BE \] ] (0,1) +B ] ] (0,1) +E ] ] (0,1) +B { { (0,1) +B } } (0,1) +BE ^a ax (0,1) +BE \^a a^a (1,3) +BE a\^ a^ (0,2) +BE a$ aa (1,2) +BE a\$ a$ (0,2) +BE ^$ NULL (0,0) +E $^ NULL (0,0) +E a($) aa (1,2)(2,2) +E a*(^a) aa (0,1)(0,1) +E (..)*(...)* a (0,0) +E (..)*(...)* abcd (0,4)(2,4) +E (ab|a)(bc|c) abc (0,3)(0,2)(2,3) +E (ab)c|abc abc (0,3)(0,2) +E a{0}b ab (1,2) +E (a*)(b?)(b+)b{3} aaabbbbbbb (0,10)(0,3)(3,4)(4,7) +E (a*)(b{0,1})(b{1,})b{3} aaabbbbbbb (0,10)(0,3)(3,4)(4,7) +E a{9876543210} NULL BADBR +E ((a|a)|a) a (0,1)(0,1)(0,1) +E (a*)(a|aa) aaaa (0,4)(0,3)(3,4) +E a*(a.|aa) aaaa (0,4)(2,4) +E a(b)|c(d)|a(e)f aef (0,3)(?,?)(?,?)(1,2) +E (a|b)?.* b (0,1)(0,1) +E (a|b)c|a(b|c) ac (0,2)(0,1) +E (a|b)c|a(b|c) ab (0,2)(?,?)(1,2) +E (a|b)*c|(a|ab)*c abc (0,3)(1,2) +E (a|b)*c|(a|ab)*c xc (1,2) +E (.a|.b).*|.*(.a|.b) xa (0,2)(0,2) +E a?(ab|ba)ab abab (0,4)(0,2) +E a?(ac{0}b|ba)ab abab (0,4)(0,2) +E ab|abab abbabab (0,2) +E aba|bab|bba baaabbbaba (5,8) +E aba|bab baaabbbaba (6,9) +E (aa|aaa)*|(a|aaaaa) aa (0,2)(0,2) +E (a.|.a.)*|(a|.a...) aa (0,2)(0,2) +E ab|a xabc (1,3) +E ab|a xxabc (2,4) +Ei (?-u)(Ab|cD)* aBcD (0,4)(2,4) +BE [^-] --a (2,3) +BE [a-]* --a (0,3) +BE [a-m-]* --amoma-- (0,4) +E :::1:::0:|:::1:1:0: :::0:::1:::1:::0: (8,17) +E :::1:::0:|:::1:1:1: :::0:::1:::1:::0: (8,17) +{E [[:upper:]] A (0,1) [[<element>]] not supported +E [[:lower:]]+ `az{ (1,3) +E [[:upper:]]+ @AZ[ (1,3) +# No collation in Go +#BE [[-]] [[-]] (2,4) +#BE [[.NIL.]] NULL ECOLLATE +#BE [[=aleph=]] NULL ECOLLATE +} +BE$ \n \n (0,1) +BEn$ \n \n (0,1) +BE$ [^a] \n (0,1) +BE$ \na \na (0,2) +E (a)(b)(c) abc (0,3)(0,1)(1,2)(2,3) +BE xxx xxx (0,3) +E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) feb 6, (0,6) +E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) 2/7 (0,3) +E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) feb 1,Feb 6 (5,11) +E3 ((((((((((((((((((((((((((((((x)))))))))))))))))))))))))))))) x (0,1)(0,1)(0,1) +E3 ((((((((((((((((((((((((((((((x))))))))))))))))))))))))))))))* xx (0,2)(1,2)(1,2) +E a?(ab|ba)* ababababababababababababababababababababababababababababababababababababababababa (0,81)(79,81) +E abaa|abbaa|abbbaa|abbbbaa ababbabbbabbbabbbbabbbbaa (18,25) +E abaa|abbaa|abbbaa|abbbbaa ababbabbbabbbabbbbabaa (18,22) +E aaac|aabc|abac|abbc|baac|babc|bbac|bbbc baaabbbabac (7,11) +BE$ .* \x01\x7f (0,2) +E aaaa|bbbb|cccc|ddddd|eeeeee|fffffff|gggg|hhhh|iiiii|jjjjj|kkkkk|llll XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa (53,57) +L aaaa\nbbbb\ncccc\nddddd\neeeeee\nfffffff\ngggg\nhhhh\niiiii\njjjjj\nkkkkk\nllll XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa NOMATCH +E a*a*a*a*a*b aaaaaaaaab (0,10) +BE ^ NULL (0,0) +BE $ NULL (0,0) +BE ^$ NULL (0,0) +BE ^a$ a (0,1) +BE abc abc (0,3) +BE abc xabcy (1,4) +BE abc ababc (2,5) +BE ab*c abc (0,3) +BE ab*bc abc (0,3) +BE ab*bc abbc (0,4) +BE ab*bc abbbbc (0,6) +E ab+bc abbc (0,4) +E ab+bc abbbbc (0,6) +E ab?bc abbc (0,4) +E ab?bc abc (0,3) +E ab?c abc (0,3) +BE ^abc$ abc (0,3) +BE ^abc abcc (0,3) +BE abc$ aabc (1,4) +BE ^ abc (0,0) +BE $ abc (3,3) +BE a.c abc (0,3) +BE a.c axc (0,3) +BE a.*c axyzc (0,5) +BE a[bc]d abd (0,3) +BE a[b-d]e ace (0,3) +BE a[b-d] aac (1,3) +BE a[-b] a- (0,2) +BE a[b-] a- (0,2) +BE a] a] (0,2) +BE a[]]b a]b (0,3) +BE a[^bc]d aed (0,3) +BE a[^-b]c adc (0,3) +BE a[^]b]c adc (0,3) +E ab|cd abc (0,2) +E ab|cd abcd (0,2) +E a\(b a(b (0,3) +E a\(*b ab (0,2) +E a\(*b a((b (0,4) +E ((a)) abc (0,1)(0,1)(0,1) +E (a)b(c) abc (0,3)(0,1)(2,3) +E a+b+c aabbabc (4,7) +E a* aaa (0,3) +#E (a*)* - (0,0)(0,0) +E (a*)* - (0,0)(?,?) RE2/Go +E (a*)+ - (0,0)(0,0) +#E (a*|b)* - (0,0)(0,0) +E (a*|b)* - (0,0)(?,?) RE2/Go +E (a+|b)* ab (0,2)(1,2) +E (a+|b)+ ab (0,2)(1,2) +E (a+|b)? ab (0,1)(0,1) +BE [^ab]* cde (0,3) +#E (^)* - (0,0)(0,0) +E (^)* - (0,0)(?,?) RE2/Go +BE a* NULL (0,0) +E ([abc])*d abbbcd (0,6)(4,5) +E ([abc])*bcd abcd (0,4)(0,1) +E a|b|c|d|e e (0,1) +E (a|b|c|d|e)f ef (0,2)(0,1) +#E ((a*|b))* - (0,0)(0,0)(0,0) +E ((a*|b))* - (0,0)(?,?)(?,?) RE2/Go +BE abcd*efg abcdefg (0,7) +BE ab* xabyabbbz (1,3) +BE ab* xayabbbz (1,2) +E (ab|cd)e abcde (2,5)(2,4) +BE [abhgefdc]ij hij (0,3) +E (a|b)c*d abcd (1,4)(1,2) +E (ab|ab*)bc abc (0,3)(0,1) +E a([bc]*)c* abc (0,3)(1,3) +E a([bc]*)(c*d) abcd (0,4)(1,3)(3,4) +E a([bc]+)(c*d) abcd (0,4)(1,3)(3,4) +E a([bc]*)(c+d) abcd (0,4)(1,2)(2,4) +E a[bcd]*dcdcde adcdcde (0,7) +E (ab|a)b*c abc (0,3)(0,2) +E ((a)(b)c)(d) abcd (0,4)(0,3)(0,1)(1,2)(3,4) +BE [A-Za-z_][A-Za-z0-9_]* alpha (0,5) +E ^a(bc+|b[eh])g|.h$ abh (1,3) +E (bc+d$|ef*g.|h?i(j|k)) effgz (0,5)(0,5) +E (bc+d$|ef*g.|h?i(j|k)) ij (0,2)(0,2)(1,2) +E (bc+d$|ef*g.|h?i(j|k)) reffgz (1,6)(1,6) +E (((((((((a))))))))) a (0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1) +BE multiple words multiple words yeah (0,14) +E (.*)c(.*) abcde (0,5)(0,2)(3,5) +BE abcd abcd (0,4) +E a(bc)d abcd (0,4)(1,3) +E a[-]?c ac (0,3) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Qaddafi (0,15)(?,?)(10,12) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mo'ammar Gadhafi (0,16)(?,?)(11,13) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Kaddafi (0,15)(?,?)(10,12) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Qadhafi (0,15)(?,?)(10,12) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Gadafi (0,14)(?,?)(10,11) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mu'ammar Qadafi (0,15)(?,?)(11,12) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moamar Gaddafi (0,14)(?,?)(9,11) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mu'ammar Qadhdhafi (0,18)(?,?)(13,15) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Khaddafi (0,16)(?,?)(11,13) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghaddafy (0,16)(?,?)(11,13) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghadafi (0,15)(?,?)(11,12) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghaddafi (0,16)(?,?)(11,13) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muamar Kaddafi (0,14)(?,?)(9,11) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Quathafi (0,16)(?,?)(11,13) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Gheddafi (0,16)(?,?)(11,13) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moammar Khadafy (0,15)(?,?)(11,12) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moammar Qudhafi (0,15)(?,?)(10,12) +E a+(b|c)*d+ aabcdd (0,6)(3,4) +E ^.+$ vivi (0,4) +E ^(.+)$ vivi (0,4)(0,4) +E ^([^!.]+).att.com!(.+)$ gryphon.att.com!eby (0,19)(0,7)(16,19) +E ^([^!]+!)?([^!]+)$ bas (0,3)(?,?)(0,3) +E ^([^!]+!)?([^!]+)$ bar!bas (0,7)(0,4)(4,7) +E ^([^!]+!)?([^!]+)$ foo!bas (0,7)(0,4)(4,7) +E ^.+!([^!]+!)([^!]+)$ foo!bar!bas (0,11)(4,8)(8,11) +E ((foo)|(bar))!bas bar!bas (0,7)(0,3)(?,?)(0,3) +E ((foo)|(bar))!bas foo!bar!bas (4,11)(4,7)(?,?)(4,7) +E ((foo)|(bar))!bas foo!bas (0,7)(0,3)(0,3) +E ((foo)|bar)!bas bar!bas (0,7)(0,3) +E ((foo)|bar)!bas foo!bar!bas (4,11)(4,7) +E ((foo)|bar)!bas foo!bas (0,7)(0,3)(0,3) +E (foo|(bar))!bas bar!bas (0,7)(0,3)(0,3) +E (foo|(bar))!bas foo!bar!bas (4,11)(4,7)(4,7) +E (foo|(bar))!bas foo!bas (0,7)(0,3) +E (foo|bar)!bas bar!bas (0,7)(0,3) +E (foo|bar)!bas foo!bar!bas (4,11)(4,7) +E (foo|bar)!bas foo!bas (0,7)(0,3) +E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bar!bas (0,11)(0,11)(?,?)(?,?)(4,8)(8,11) +E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ bas (0,3)(?,?)(0,3) +E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ bar!bas (0,7)(0,4)(4,7) +E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ foo!bar!bas (0,11)(?,?)(?,?)(4,8)(8,11) +E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ foo!bas (0,7)(0,4)(4,7) +E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ bas (0,3)(0,3)(?,?)(0,3) +E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ bar!bas (0,7)(0,7)(0,4)(4,7) +E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bar!bas (0,11)(0,11)(?,?)(?,?)(4,8)(8,11) +E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bas (0,7)(0,7)(0,4)(4,7) +E .*(/XXX).* /XXX (0,4)(0,4) +E .*(\\XXX).* \XXX (0,4)(0,4) +E \\XXX \XXX (0,4) +E .*(/000).* /000 (0,4)(0,4) +E .*(\\000).* \000 (0,4)(0,4) +E \\000 \000 (0,4) diff --git a/third_party/rust/regex/src/testdata/nullsubexpr.dat b/third_party/rust/regex/src/testdata/nullsubexpr.dat new file mode 100644 index 0000000000..2e18fbb917 --- /dev/null +++ b/third_party/rust/regex/src/testdata/nullsubexpr.dat @@ -0,0 +1,79 @@ +NOTE null subexpression matches : 2002-06-06 + +E (a*)* a (0,1)(0,1) +#E SAME x (0,0)(0,0) +E SAME x (0,0)(?,?) RE2/Go +E SAME aaaaaa (0,6)(0,6) +E SAME aaaaaax (0,6)(0,6) +E (a*)+ a (0,1)(0,1) +E SAME x (0,0)(0,0) +E SAME aaaaaa (0,6)(0,6) +E SAME aaaaaax (0,6)(0,6) +E (a+)* a (0,1)(0,1) +E SAME x (0,0) +E SAME aaaaaa (0,6)(0,6) +E SAME aaaaaax (0,6)(0,6) +E (a+)+ a (0,1)(0,1) +E SAME x NOMATCH +E SAME aaaaaa (0,6)(0,6) +E SAME aaaaaax (0,6)(0,6) + +E ([a]*)* a (0,1)(0,1) +#E SAME x (0,0)(0,0) +E SAME x (0,0)(?,?) RE2/Go +E SAME aaaaaa (0,6)(0,6) +E SAME aaaaaax (0,6)(0,6) +E ([a]*)+ a (0,1)(0,1) +E SAME x (0,0)(0,0) +E SAME aaaaaa (0,6)(0,6) +E SAME aaaaaax (0,6)(0,6) +E ([^b]*)* a (0,1)(0,1) +#E SAME b (0,0)(0,0) +E SAME b (0,0)(?,?) RE2/Go +E SAME aaaaaa (0,6)(0,6) +E SAME aaaaaab (0,6)(0,6) +E ([ab]*)* a (0,1)(0,1) +E SAME aaaaaa (0,6)(0,6) +E SAME ababab (0,6)(0,6) +E SAME bababa (0,6)(0,6) +E SAME b (0,1)(0,1) +E SAME bbbbbb (0,6)(0,6) +E SAME aaaabcde (0,5)(0,5) +E ([^a]*)* b (0,1)(0,1) +E SAME bbbbbb (0,6)(0,6) +#E SAME aaaaaa (0,0)(0,0) +E SAME aaaaaa (0,0)(?,?) RE2/Go +E ([^ab]*)* ccccxx (0,6)(0,6) +#E SAME ababab (0,0)(0,0) +E SAME ababab (0,0)(?,?) RE2/Go + +E ((z)+|a)* zabcde (0,2)(1,2) + +#{E a+? aaaaaa (0,1) no *? +? mimimal match ops +#E (a) aaa (0,1)(0,1) +#E (a*?) aaa (0,0)(0,0) +#E (a)*? aaa (0,0) +#E (a*?)*? aaa (0,0) +#} + +B \(a*\)*\(x\) x (0,1)(0,0)(0,1) +B \(a*\)*\(x\) ax (0,2)(0,1)(1,2) +B \(a*\)*\(x\) axa (0,2)(0,1)(1,2) +B \(a*\)*\(x\)\(\1\) x (0,1)(0,0)(0,1)(1,1) +B \(a*\)*\(x\)\(\1\) ax (0,2)(1,1)(1,2)(2,2) +B \(a*\)*\(x\)\(\1\) axa (0,3)(0,1)(1,2)(2,3) +B \(a*\)*\(x\)\(\1\)\(x\) axax (0,4)(0,1)(1,2)(2,3)(3,4) +B \(a*\)*\(x\)\(\1\)\(x\) axxa (0,3)(1,1)(1,2)(2,2)(2,3) + +#E (a*)*(x) x (0,1)(0,0)(0,1) +E (a*)*(x) x (0,1)(?,?)(0,1) RE2/Go +E (a*)*(x) ax (0,2)(0,1)(1,2) +E (a*)*(x) axa (0,2)(0,1)(1,2) + +E (a*)+(x) x (0,1)(0,0)(0,1) +E (a*)+(x) ax (0,2)(0,1)(1,2) +E (a*)+(x) axa (0,2)(0,1)(1,2) + +E (a*){2}(x) x (0,1)(0,0)(0,1) +E (a*){2}(x) ax (0,2)(1,1)(1,2) +E (a*){2}(x) axa (0,2)(1,1)(1,2) diff --git a/third_party/rust/regex/src/testdata/repetition.dat b/third_party/rust/regex/src/testdata/repetition.dat new file mode 100644 index 0000000000..3bb2121180 --- /dev/null +++ b/third_party/rust/regex/src/testdata/repetition.dat @@ -0,0 +1,163 @@ +NOTE implicit vs. explicit repetitions : 2009-02-02 + +# Glenn Fowler <gsf@research.att.com> +# conforming matches (column 4) must match one of the following BREs +# NOMATCH +# (0,.)\((\(.\),\(.\))(?,?)(\2,\3)\)* +# (0,.)\((\(.\),\(.\))(\2,\3)(?,?)\)* +# i.e., each 3-tuple has two identical elements and one (?,?) + +E ((..)|(.)) NULL NOMATCH +E ((..)|(.))((..)|(.)) NULL NOMATCH +E ((..)|(.))((..)|(.))((..)|(.)) NULL NOMATCH + +E ((..)|(.)){1} NULL NOMATCH +E ((..)|(.)){2} NULL NOMATCH +E ((..)|(.)){3} NULL NOMATCH + +E ((..)|(.))* NULL (0,0) + +E ((..)|(.)) a (0,1)(0,1)(?,?)(0,1) +E ((..)|(.))((..)|(.)) a NOMATCH +E ((..)|(.))((..)|(.))((..)|(.)) a NOMATCH + +E ((..)|(.)){1} a (0,1)(0,1)(?,?)(0,1) +E ((..)|(.)){2} a NOMATCH +E ((..)|(.)){3} a NOMATCH + +E ((..)|(.))* a (0,1)(0,1)(?,?)(0,1) + +E ((..)|(.)) aa (0,2)(0,2)(0,2)(?,?) +E ((..)|(.))((..)|(.)) aa (0,2)(0,1)(?,?)(0,1)(1,2)(?,?)(1,2) +E ((..)|(.))((..)|(.))((..)|(.)) aa NOMATCH + +E ((..)|(.)){1} aa (0,2)(0,2)(0,2)(?,?) +E ((..)|(.)){2} aa (0,2)(1,2)(?,?)(1,2) +E ((..)|(.)){3} aa NOMATCH + +E ((..)|(.))* aa (0,2)(0,2)(0,2)(?,?) + +E ((..)|(.)) aaa (0,2)(0,2)(0,2)(?,?) +E ((..)|(.))((..)|(.)) aaa (0,3)(0,2)(0,2)(?,?)(2,3)(?,?)(2,3) +E ((..)|(.))((..)|(.))((..)|(.)) aaa (0,3)(0,1)(?,?)(0,1)(1,2)(?,?)(1,2)(2,3)(?,?)(2,3) + +E ((..)|(.)){1} aaa (0,2)(0,2)(0,2)(?,?) +#E ((..)|(.)){2} aaa (0,3)(2,3)(?,?)(2,3) +E ((..)|(.)){2} aaa (0,3)(2,3)(0,2)(2,3) RE2/Go +E ((..)|(.)){3} aaa (0,3)(2,3)(?,?)(2,3) + +#E ((..)|(.))* aaa (0,3)(2,3)(?,?)(2,3) +E ((..)|(.))* aaa (0,3)(2,3)(0,2)(2,3) RE2/Go + +E ((..)|(.)) aaaa (0,2)(0,2)(0,2)(?,?) +E ((..)|(.))((..)|(.)) aaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?) +E ((..)|(.))((..)|(.))((..)|(.)) aaaa (0,4)(0,2)(0,2)(?,?)(2,3)(?,?)(2,3)(3,4)(?,?)(3,4) + +E ((..)|(.)){1} aaaa (0,2)(0,2)(0,2)(?,?) +E ((..)|(.)){2} aaaa (0,4)(2,4)(2,4)(?,?) +#E ((..)|(.)){3} aaaa (0,4)(3,4)(?,?)(3,4) +E ((..)|(.)){3} aaaa (0,4)(3,4)(0,2)(3,4) RE2/Go + +E ((..)|(.))* aaaa (0,4)(2,4)(2,4)(?,?) + +E ((..)|(.)) aaaaa (0,2)(0,2)(0,2)(?,?) +E ((..)|(.))((..)|(.)) aaaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?) +E ((..)|(.))((..)|(.))((..)|(.)) aaaaa (0,5)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)(4,5)(?,?)(4,5) + +E ((..)|(.)){1} aaaaa (0,2)(0,2)(0,2)(?,?) +E ((..)|(.)){2} aaaaa (0,4)(2,4)(2,4)(?,?) +#E ((..)|(.)){3} aaaaa (0,5)(4,5)(?,?)(4,5) +E ((..)|(.)){3} aaaaa (0,5)(4,5)(2,4)(4,5) RE2/Go + +#E ((..)|(.))* aaaaa (0,5)(4,5)(?,?)(4,5) +E ((..)|(.))* aaaaa (0,5)(4,5)(2,4)(4,5) RE2/Go + +E ((..)|(.)) aaaaaa (0,2)(0,2)(0,2)(?,?) +E ((..)|(.))((..)|(.)) aaaaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?) +E ((..)|(.))((..)|(.))((..)|(.)) aaaaaa (0,6)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)(4,6)(4,6)(?,?) + +E ((..)|(.)){1} aaaaaa (0,2)(0,2)(0,2)(?,?) +E ((..)|(.)){2} aaaaaa (0,4)(2,4)(2,4)(?,?) +E ((..)|(.)){3} aaaaaa (0,6)(4,6)(4,6)(?,?) + +E ((..)|(.))* aaaaaa (0,6)(4,6)(4,6)(?,?) + +NOTE additional repetition tests graciously provided by Chris Kuklewicz www.haskell.org 2009-02-02 + +# These test a bug in OS X / FreeBSD / NetBSD, and libtree. +# Linux/GLIBC gets the {8,} and {8,8} wrong. + +:HA#100:E X(.?){0,}Y X1234567Y (0,9)(7,8) +:HA#101:E X(.?){1,}Y X1234567Y (0,9)(7,8) +:HA#102:E X(.?){2,}Y X1234567Y (0,9)(7,8) +:HA#103:E X(.?){3,}Y X1234567Y (0,9)(7,8) +:HA#104:E X(.?){4,}Y X1234567Y (0,9)(7,8) +:HA#105:E X(.?){5,}Y X1234567Y (0,9)(7,8) +:HA#106:E X(.?){6,}Y X1234567Y (0,9)(7,8) +:HA#107:E X(.?){7,}Y X1234567Y (0,9)(7,8) +:HA#108:E X(.?){8,}Y X1234567Y (0,9)(8,8) +#:HA#110:E X(.?){0,8}Y X1234567Y (0,9)(7,8) +:HA#110:E X(.?){0,8}Y X1234567Y (0,9)(8,8) RE2/Go +#:HA#111:E X(.?){1,8}Y X1234567Y (0,9)(7,8) +:HA#111:E X(.?){1,8}Y X1234567Y (0,9)(8,8) RE2/Go +#:HA#112:E X(.?){2,8}Y X1234567Y (0,9)(7,8) +:HA#112:E X(.?){2,8}Y X1234567Y (0,9)(8,8) RE2/Go +#:HA#113:E X(.?){3,8}Y X1234567Y (0,9)(7,8) +:HA#113:E X(.?){3,8}Y X1234567Y (0,9)(8,8) RE2/Go +#:HA#114:E X(.?){4,8}Y X1234567Y (0,9)(7,8) +:HA#114:E X(.?){4,8}Y X1234567Y (0,9)(8,8) RE2/Go +#:HA#115:E X(.?){5,8}Y X1234567Y (0,9)(7,8) +:HA#115:E X(.?){5,8}Y X1234567Y (0,9)(8,8) RE2/Go +#:HA#116:E X(.?){6,8}Y X1234567Y (0,9)(7,8) +:HA#116:E X(.?){6,8}Y X1234567Y (0,9)(8,8) RE2/Go +#:HA#117:E X(.?){7,8}Y X1234567Y (0,9)(7,8) +:HA#117:E X(.?){7,8}Y X1234567Y (0,9)(8,8) RE2/Go +:HA#118:E X(.?){8,8}Y X1234567Y (0,9)(8,8) + +# These test a fixed bug in my regex-tdfa that did not keep the expanded +# form properly grouped, so right association did the wrong thing with +# these ambiguous patterns (crafted just to test my code when I became +# suspicious of my implementation). The first subexpression should use +# "ab" then "a" then "bcd". + +# OS X / FreeBSD / NetBSD badly fail many of these, with impossible +# results like (0,6)(4,5)(6,6). + +:HA#260:E (a|ab|c|bcd){0,}(d*) ababcd (0,1)(0,1)(1,1) +:HA#261:E (a|ab|c|bcd){1,}(d*) ababcd (0,1)(0,1)(1,1) +:HA#262:E (a|ab|c|bcd){2,}(d*) ababcd (0,6)(3,6)(6,6) +:HA#263:E (a|ab|c|bcd){3,}(d*) ababcd (0,6)(3,6)(6,6) +:HA#264:E (a|ab|c|bcd){4,}(d*) ababcd NOMATCH +:HA#265:E (a|ab|c|bcd){0,10}(d*) ababcd (0,1)(0,1)(1,1) +:HA#266:E (a|ab|c|bcd){1,10}(d*) ababcd (0,1)(0,1)(1,1) +:HA#267:E (a|ab|c|bcd){2,10}(d*) ababcd (0,6)(3,6)(6,6) +:HA#268:E (a|ab|c|bcd){3,10}(d*) ababcd (0,6)(3,6)(6,6) +:HA#269:E (a|ab|c|bcd){4,10}(d*) ababcd NOMATCH +:HA#270:E (a|ab|c|bcd)*(d*) ababcd (0,1)(0,1)(1,1) +:HA#271:E (a|ab|c|bcd)+(d*) ababcd (0,1)(0,1)(1,1) + +# The above worked on Linux/GLIBC but the following often fail. +# They also trip up OS X / FreeBSD / NetBSD: + +#:HA#280:E (ab|a|c|bcd){0,}(d*) ababcd (0,6)(3,6)(6,6) +:HA#280:E (ab|a|c|bcd){0,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go +#:HA#281:E (ab|a|c|bcd){1,}(d*) ababcd (0,6)(3,6)(6,6) +:HA#281:E (ab|a|c|bcd){1,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go +#:HA#282:E (ab|a|c|bcd){2,}(d*) ababcd (0,6)(3,6)(6,6) +:HA#282:E (ab|a|c|bcd){2,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go +#:HA#283:E (ab|a|c|bcd){3,}(d*) ababcd (0,6)(3,6)(6,6) +:HA#283:E (ab|a|c|bcd){3,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go +:HA#284:E (ab|a|c|bcd){4,}(d*) ababcd NOMATCH +#:HA#285:E (ab|a|c|bcd){0,10}(d*) ababcd (0,6)(3,6)(6,6) +:HA#285:E (ab|a|c|bcd){0,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go +#:HA#286:E (ab|a|c|bcd){1,10}(d*) ababcd (0,6)(3,6)(6,6) +:HA#286:E (ab|a|c|bcd){1,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go +#:HA#287:E (ab|a|c|bcd){2,10}(d*) ababcd (0,6)(3,6)(6,6) +:HA#287:E (ab|a|c|bcd){2,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go +#:HA#288:E (ab|a|c|bcd){3,10}(d*) ababcd (0,6)(3,6)(6,6) +:HA#288:E (ab|a|c|bcd){3,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go +:HA#289:E (ab|a|c|bcd){4,10}(d*) ababcd NOMATCH +#:HA#290:E (ab|a|c|bcd)*(d*) ababcd (0,6)(3,6)(6,6) +:HA#290:E (ab|a|c|bcd)*(d*) ababcd (0,6)(4,5)(5,6) RE2/Go +#:HA#291:E (ab|a|c|bcd)+(d*) ababcd (0,6)(3,6)(6,6) +:HA#291:E (ab|a|c|bcd)+(d*) ababcd (0,6)(4,5)(5,6) RE2/Go diff --git a/third_party/rust/regex/src/utf8.rs b/third_party/rust/regex/src/utf8.rs new file mode 100644 index 0000000000..2dfd2c0d1d --- /dev/null +++ b/third_party/rust/regex/src/utf8.rs @@ -0,0 +1,264 @@ +/// A few elementary UTF-8 encoding and decoding functions used by the matching +/// engines. +/// +/// In an ideal world, the matching engines operate on `&str` and we can just +/// lean on the standard library for all our UTF-8 needs. However, to support +/// byte based regexes (that can match on arbitrary bytes which may contain +/// UTF-8), we need to be capable of searching and decoding UTF-8 on a `&[u8]`. +/// The standard library doesn't really recognize this use case, so we have +/// to build it out ourselves. +/// +/// Should this be factored out into a separate crate? It seems independently +/// useful. There are other crates that already exist (e.g., `utf-8`) that have +/// overlapping use cases. Not sure what to do. +use std::char; + +const TAG_CONT: u8 = 0b1000_0000; +const TAG_TWO: u8 = 0b1100_0000; +const TAG_THREE: u8 = 0b1110_0000; +const TAG_FOUR: u8 = 0b1111_0000; + +/// Returns the smallest possible index of the next valid UTF-8 sequence +/// starting after `i`. +pub fn next_utf8(text: &[u8], i: usize) -> usize { + let b = match text.get(i) { + None => return i + 1, + Some(&b) => b, + }; + let inc = if b <= 0x7F { + 1 + } else if b <= 0b110_11111 { + 2 + } else if b <= 0b1110_1111 { + 3 + } else { + 4 + }; + i + inc +} + +/// Decode a single UTF-8 sequence into a single Unicode codepoint from `src`. +/// +/// If no valid UTF-8 sequence could be found, then `None` is returned. +/// Otherwise, the decoded codepoint and the number of bytes read is returned. +/// The number of bytes read (for a valid UTF-8 sequence) is guaranteed to be +/// 1, 2, 3 or 4. +/// +/// Note that a UTF-8 sequence is invalid if it is incorrect UTF-8, encodes a +/// codepoint that is out of range (surrogate codepoints are out of range) or +/// is not the shortest possible UTF-8 sequence for that codepoint. +#[inline] +pub fn decode_utf8(src: &[u8]) -> Option<(char, usize)> { + let b0 = match src.get(0) { + None => return None, + Some(&b) if b <= 0x7F => return Some((b as char, 1)), + Some(&b) => b, + }; + match b0 { + 0b110_00000..=0b110_11111 => { + if src.len() < 2 { + return None; + } + let b1 = src[1]; + if 0b11_000000 & b1 != TAG_CONT { + return None; + } + let cp = ((b0 & !TAG_TWO) as u32) << 6 | ((b1 & !TAG_CONT) as u32); + match cp { + 0x80..=0x7FF => char::from_u32(cp).map(|cp| (cp, 2)), + _ => None, + } + } + 0b1110_0000..=0b1110_1111 => { + if src.len() < 3 { + return None; + } + let (b1, b2) = (src[1], src[2]); + if 0b11_000000 & b1 != TAG_CONT { + return None; + } + if 0b11_000000 & b2 != TAG_CONT { + return None; + } + let cp = ((b0 & !TAG_THREE) as u32) << 12 + | ((b1 & !TAG_CONT) as u32) << 6 + | ((b2 & !TAG_CONT) as u32); + match cp { + // char::from_u32 will disallow surrogate codepoints. + 0x800..=0xFFFF => char::from_u32(cp).map(|cp| (cp, 3)), + _ => None, + } + } + 0b11110_000..=0b11110_111 => { + if src.len() < 4 { + return None; + } + let (b1, b2, b3) = (src[1], src[2], src[3]); + if 0b11_000000 & b1 != TAG_CONT { + return None; + } + if 0b11_000000 & b2 != TAG_CONT { + return None; + } + if 0b11_000000 & b3 != TAG_CONT { + return None; + } + let cp = ((b0 & !TAG_FOUR) as u32) << 18 + | ((b1 & !TAG_CONT) as u32) << 12 + | ((b2 & !TAG_CONT) as u32) << 6 + | ((b3 & !TAG_CONT) as u32); + match cp { + 0x10000..=0x0010_FFFF => char::from_u32(cp).map(|cp| (cp, 4)), + _ => None, + } + } + _ => None, + } +} + +/// Like `decode_utf8`, but decodes the last UTF-8 sequence in `src` instead +/// of the first. +pub fn decode_last_utf8(src: &[u8]) -> Option<(char, usize)> { + if src.is_empty() { + return None; + } + let mut start = src.len() - 1; + if src[start] <= 0x7F { + return Some((src[start] as char, 1)); + } + while start > src.len().saturating_sub(4) { + start -= 1; + if is_start_byte(src[start]) { + break; + } + } + match decode_utf8(&src[start..]) { + None => None, + Some((_, n)) if n < src.len() - start => None, + Some((cp, n)) => Some((cp, n)), + } +} + +fn is_start_byte(b: u8) -> bool { + b & 0b11_000000 != 0b1_0000000 +} + +#[cfg(test)] +mod tests { + use std::str; + + use quickcheck::quickcheck; + + use super::{ + decode_last_utf8, decode_utf8, TAG_CONT, TAG_FOUR, TAG_THREE, TAG_TWO, + }; + + #[test] + fn prop_roundtrip() { + fn p(given_cp: char) -> bool { + let mut tmp = [0; 4]; + let encoded_len = given_cp.encode_utf8(&mut tmp).len(); + let (got_cp, got_len) = decode_utf8(&tmp[..encoded_len]).unwrap(); + encoded_len == got_len && given_cp == got_cp + } + quickcheck(p as fn(char) -> bool) + } + + #[test] + fn prop_roundtrip_last() { + fn p(given_cp: char) -> bool { + let mut tmp = [0; 4]; + let encoded_len = given_cp.encode_utf8(&mut tmp).len(); + let (got_cp, got_len) = + decode_last_utf8(&tmp[..encoded_len]).unwrap(); + encoded_len == got_len && given_cp == got_cp + } + quickcheck(p as fn(char) -> bool) + } + + #[test] + fn prop_encode_matches_std() { + fn p(cp: char) -> bool { + let mut got = [0; 4]; + let n = cp.encode_utf8(&mut got).len(); + let expected = cp.to_string(); + &got[..n] == expected.as_bytes() + } + quickcheck(p as fn(char) -> bool) + } + + #[test] + fn prop_decode_matches_std() { + fn p(given_cp: char) -> bool { + let mut tmp = [0; 4]; + let n = given_cp.encode_utf8(&mut tmp).len(); + let (got_cp, _) = decode_utf8(&tmp[..n]).unwrap(); + let expected_cp = + str::from_utf8(&tmp[..n]).unwrap().chars().next().unwrap(); + got_cp == expected_cp + } + quickcheck(p as fn(char) -> bool) + } + + #[test] + fn prop_decode_last_matches_std() { + fn p(given_cp: char) -> bool { + let mut tmp = [0; 4]; + let n = given_cp.encode_utf8(&mut tmp).len(); + let (got_cp, _) = decode_last_utf8(&tmp[..n]).unwrap(); + let expected_cp = str::from_utf8(&tmp[..n]) + .unwrap() + .chars() + .rev() + .next() + .unwrap(); + got_cp == expected_cp + } + quickcheck(p as fn(char) -> bool) + } + + #[test] + fn reject_invalid() { + // Invalid start byte + assert_eq!(decode_utf8(&[0xFF]), None); + // Surrogate pair + assert_eq!(decode_utf8(&[0xED, 0xA0, 0x81]), None); + // Invalid continuation byte. + assert_eq!(decode_utf8(&[0xD4, 0xC2]), None); + // Bad lengths + assert_eq!(decode_utf8(&[0xC3]), None); // 2 bytes + assert_eq!(decode_utf8(&[0xEF, 0xBF]), None); // 3 bytes + assert_eq!(decode_utf8(&[0xF4, 0x8F, 0xBF]), None); // 4 bytes + // Not a minimal UTF-8 sequence + assert_eq!(decode_utf8(&[TAG_TWO, TAG_CONT | b'a']), None); + assert_eq!(decode_utf8(&[TAG_THREE, TAG_CONT, TAG_CONT | b'a']), None); + assert_eq!( + decode_utf8(&[TAG_FOUR, TAG_CONT, TAG_CONT, TAG_CONT | b'a',]), + None + ); + } + + #[test] + fn reject_invalid_last() { + // Invalid start byte + assert_eq!(decode_last_utf8(&[0xFF]), None); + // Surrogate pair + assert_eq!(decode_last_utf8(&[0xED, 0xA0, 0x81]), None); + // Bad lengths + assert_eq!(decode_last_utf8(&[0xC3]), None); // 2 bytes + assert_eq!(decode_last_utf8(&[0xEF, 0xBF]), None); // 3 bytes + assert_eq!(decode_last_utf8(&[0xF4, 0x8F, 0xBF]), None); // 4 bytes + // Not a minimal UTF-8 sequence + assert_eq!(decode_last_utf8(&[TAG_TWO, TAG_CONT | b'a']), None); + assert_eq!( + decode_last_utf8(&[TAG_THREE, TAG_CONT, TAG_CONT | b'a',]), + None + ); + assert_eq!( + decode_last_utf8( + &[TAG_FOUR, TAG_CONT, TAG_CONT, TAG_CONT | b'a',] + ), + None + ); + } +} |