summary refs log tree commit diff stats
path: root/third_party/rust/regex/src
diff options
context:
space:
mode:
Diffstat (limited to 'third_party/rust/regex/src')
-rw-r--r-- third_party/rust/regex/src/backtrack.rs 282
-rw-r--r-- third_party/rust/regex/src/compile.rs 1264
-rw-r--r-- third_party/rust/regex/src/dfa.rs 1945
-rw-r--r-- third_party/rust/regex/src/error.rs 71
-rw-r--r-- third_party/rust/regex/src/exec.rs 1655
-rw-r--r-- third_party/rust/regex/src/expand.rs 239
-rw-r--r-- third_party/rust/regex/src/find_byte.rs 18
-rw-r--r-- third_party/rust/regex/src/freqs.rs 261
-rw-r--r-- third_party/rust/regex/src/input.rs 432
-rw-r--r-- third_party/rust/regex/src/lib.rs 769
-rw-r--r-- third_party/rust/regex/src/literal/imp.rs 402
-rw-r--r-- third_party/rust/regex/src/literal/mod.rs 55
-rw-r--r-- third_party/rust/regex/src/pattern.rs 63
-rw-r--r-- third_party/rust/regex/src/pikevm.rs 360
-rw-r--r-- third_party/rust/regex/src/pool.rs 333
-rw-r--r-- third_party/rust/regex/src/prog.rs 447
-rw-r--r-- third_party/rust/regex/src/re_builder.rs 421
-rw-r--r-- third_party/rust/regex/src/re_bytes.rs 1260
-rw-r--r-- third_party/rust/regex/src/re_set.rs 507
-rw-r--r-- third_party/rust/regex/src/re_trait.rs 294
-rw-r--r-- third_party/rust/regex/src/re_unicode.rs 1311
-rw-r--r-- third_party/rust/regex/src/sparse.rs 84
-rw-r--r-- third_party/rust/regex/src/testdata/LICENSE 19
-rw-r--r-- third_party/rust/regex/src/testdata/README 17
-rw-r--r-- third_party/rust/regex/src/testdata/basic.dat 221
-rw-r--r-- third_party/rust/regex/src/testdata/nullsubexpr.dat 79
-rw-r--r-- third_party/rust/regex/src/testdata/repetition.dat 163
-rw-r--r-- third_party/rust/regex/src/utf8.rs 264
28 files changed, 13236 insertions, 0 deletions
diff --git a/third_party/rust/regex/src/backtrack.rs b/third_party/rust/regex/src/backtrack.rs
new file mode 100644
index 0000000000..4d83856ca0
--- /dev/null
+++ b/third_party/rust/regex/src/backtrack.rs
@@ -0,0 +1,282 @@
+// This is the backtracking matching engine. It has the same exact capability
+// as the full NFA simulation, except it is artificially restricted to small
+// regexes on small inputs because of its memory requirements.
+//
+// In particular, this is a *bounded* backtracking engine. It retains worst
+// case linear time by keeping track of the states that it has visited (using a
+// bitmap). Namely, once a state is visited, it is never visited again. Since a
+// state is keyed by `(instruction index, input index)`, we have that its time
+// complexity is `O(mn)` (i.e., linear in the size of the search text).
+//
+// The backtracking engine can beat out the NFA simulation on small
+// regexes/inputs because it doesn't have to keep track of multiple copies of
+// the capture groups. In benchmarks, the backtracking engine is roughly twice
+// as fast as the full NFA simulation. Note though that its performance doesn't
+// scale, even if you're willing to live with the memory requirements. Namely,
+// the bitset has to be zeroed on each execution, which becomes quite expensive
+// on large bitsets.
+
+use crate::exec::ProgramCache;
+use crate::input::{Input, InputAt};
+use crate::prog::{InstPtr, Program};
+use crate::re_trait::Slot;
+
+/// Storage word of the visited-state bitmap.
+type Bits = u32;
+
+/// Number of bits held by one `Bits` word.
+const BIT_SIZE: usize = 32;
+/// Largest visited bitmap (in bytes) for which this engine is selected.
+const MAX_SIZE_BYTES: usize = 256 * (1 << 10); // 256 KB
+
+/// Returns true iff the given regex and input should be executed by this
+/// engine with reasonable memory usage.
+///
+/// `num_insts` is the number of instructions in the program and `text_len`
+/// is the length of the input. Combinations whose bitmap size cannot even
+/// be represented in a `usize` are rejected rather than wrapping around.
+pub fn should_exec(num_insts: usize, text_len: usize) -> bool {
+    // Total memory usage in bytes is determined by:
+    //
+    //   ((len(insts) * (len(input) + 1) + bits - 1) / bits) * (size_of(u32))
+    //
+    // The actual limit picked is pretty much a heuristic.
+    // See: https://github.com/rust-lang/regex/issues/215
+    //
+    // The product is computed with checked arithmetic: a wrapped product
+    // would look tiny and wrongly select this engine, after which the
+    // visited bitmap would be too small for the states actually indexed.
+    let bits = match text_len
+        .checked_add(1)
+        .and_then(|positions| num_insts.checked_mul(positions))
+    {
+        Some(bits) => bits,
+        None => return false,
+    };
+    // Round up to whole words without computing `bits + BIT_SIZE - 1`,
+    // which could itself overflow.
+    let words = bits / BIT_SIZE + (bits % BIT_SIZE != 0) as usize;
+    words
+        .checked_mul(::std::mem::size_of::<Bits>())
+        .map_or(false, |size| size <= MAX_SIZE_BYTES)
+}
+
+/// A backtracking matching engine.
+///
+/// Bundles the borrowed program, input, output buffers and scratch cache
+/// for a single run.
+#[derive(Debug)]
+pub struct Bounded<'a, 'm, 'r, 's, I> {
+ // The compiled program whose instructions are executed.
+ prog: &'r Program,
+ // The input being searched.
+ input: I,
+ // Per-match flags; an entry is set to true when its `Match` instruction
+ // is reached.
+ matches: &'m mut [bool],
+ // Capture slots written by `Save` instructions.
+ slots: &'s mut [Slot],
+ // Scratch state (job stack and visited bitmap) reused between runs.
+ m: &'a mut Cache,
+}
+
+/// Shared cached state between multiple invocations of a backtracking engine
+/// in the same thread.
+#[derive(Clone, Debug)]
+pub struct Cache {
+ // Explicit stack of pending work, used instead of recursion.
+ jobs: Vec<Job>,
+ // Bitmap with one bit per `(instruction index, input position)` pair,
+ // recording which states have already been explored.
+ visited: Vec<Bits>,
+}
+
+impl Cache {
+    /// Build a cache with no pending jobs and an empty visited bitmap.
+    ///
+    /// The program argument is accepted but not consulted.
+    pub fn new(_prog: &Program) -> Self {
+        let jobs = Vec::new();
+        let visited = Vec::new();
+        Cache { jobs, visited }
+    }
+}
+
+/// A job is an explicit unit of stack space in the backtracking engine.
+///
+/// The "normal" representation is a single state transition, which corresponds
+/// to an NFA state and a character in the input. However, the backtracking
+/// engine must keep track of old capture group values. We use the explicit
+/// stack to do it.
+#[derive(Clone, Copy, Debug)]
+enum Job {
+ // Resume execution at instruction `ip` with the input positioned at `at`.
+ Inst { ip: InstPtr, at: InputAt },
+ // Put `old_pos` back into capture slot `slot` when this job is popped.
+ SaveRestore { slot: usize, old_pos: Option<usize> },
+}
+
+impl<'a, 'm, 'r, 's, I: Input> Bounded<'a, 'm, 'r, 's, I> {
+    /// Run the backtracking engine over `input`, beginning at byte offset
+    /// `start`.
+    ///
+    /// Returns `true` when a match is found, in which case `matches` and
+    /// `slots` have been filled in accordingly.
+    pub fn exec(
+        prog: &'r Program,
+        cache: &ProgramCache,
+        matches: &'m mut [bool],
+        slots: &'s mut [Slot],
+        input: I,
+        start: usize,
+        end: usize,
+    ) -> bool {
+        let mut guard = cache.borrow_mut();
+        let first = input.at(start);
+        let mut engine = Bounded {
+            prog,
+            input,
+            matches,
+            slots,
+            m: &mut guard.backtrack,
+        };
+        engine.exec_(first, end)
+    }
+
+    /// Reset the cached scratch space so a fresh search over the current
+    /// input can begin.
+    fn clear(&mut self) {
+        // Drop any jobs left over from a previous run.
+        self.m.jobs.clear();
+
+        // One bit is needed per (instruction, input position) pair; work
+        // out how many words that takes, zero whatever part of the existing
+        // allocation is kept, and grow with zeroed words if it is too short.
+        let needed_words =
+            (self.prog.len() * (self.input.len() + 1) + BIT_SIZE - 1)
+                / BIT_SIZE;
+        let visited = &mut self.m.visited;
+        visited.truncate(needed_words);
+        visited.iter_mut().for_each(|word| *word = 0);
+        let shortfall = needed_words - visited.len();
+        if shortfall > 0 {
+            visited.reserve_exact(shortfall);
+            visited.resize(needed_words, 0);
+        }
+    }
+
+    /// Try backtracking from `at` and, for unanchored programs, from each
+    /// later candidate position, using literal prefixes to skip ahead when
+    /// the program has any.
+    fn exec_(&mut self, mut at: InputAt, end: usize) -> bool {
+        self.clear();
+        // A program anchored at the start can only match at the start of
+        // the input, so a single attempt settles it.
+        if self.prog.is_anchored_start {
+            return at.is_start() && self.backtrack(at);
+        }
+        let mut found = false;
+        loop {
+            if !self.prog.prefixes.is_empty() {
+                match self.input.prefix_at(&self.prog.prefixes, at) {
+                    Some(candidate) => at = candidate,
+                    None => break,
+                }
+            }
+            if self.backtrack(at) {
+                found = true;
+            }
+            if found && self.prog.matches.len() == 1 {
+                return true;
+            }
+            if at.pos() >= end {
+                break;
+            }
+            at = self.input.at(at.next_pos());
+        }
+        found
+    }
+
+    /// Drain the job stack, starting from instruction 0 at `start`.
+    fn backtrack(&mut self, start: InputAt) -> bool {
+        // An explicit stack replaces recursion. Most transitions are
+        // followed inside `step`, which only pushes here for captures and
+        // branches.
+        let mut any_match = false;
+        self.m.jobs.push(Job::Inst { ip: 0, at: start });
+        while let Some(job) = self.m.jobs.pop() {
+            match job {
+                Job::SaveRestore { slot, old_pos } => {
+                    if let Some(entry) = self.slots.get_mut(slot) {
+                        *entry = old_pos;
+                    }
+                }
+                Job::Inst { ip, at } => {
+                    if !self.step(ip, at) {
+                        continue;
+                    }
+                    // With a single regex the first match ends the search;
+                    // for a regex set keep going to find the other matches.
+                    if self.prog.matches.len() == 1 {
+                        return true;
+                    }
+                    any_match = true;
+                }
+            }
+        }
+        any_match
+    }
+
+    /// Follow transitions from `(ip, at)` until a match is reached (`true`)
+    /// or the path dies (`false`).
+    fn step(&mut self, mut ip: InstPtr, mut at: InputAt) -> bool {
+        use crate::prog::Inst::*;
+        loop {
+            // Rather than pushing a job only to pop it straight away, the
+            // next state is written into `ip`/`at` and the loop goes round
+            // again.
+            if self.has_visited(ip, at) {
+                return false;
+            }
+            match self.prog[ip] {
+                Match(slot) => {
+                    if let Some(flag) = self.matches.get_mut(slot) {
+                        *flag = true;
+                    }
+                    return true;
+                }
+                Save(ref inst) => {
+                    if inst.slot < self.slots.len() {
+                        // Remember the previous capture position on the
+                        // stack so it is restored if this path fails and
+                        // the job is popped.
+                        let old_pos = self.slots[inst.slot];
+                        self.m.jobs.push(Job::SaveRestore {
+                            slot: inst.slot,
+                            old_pos,
+                        });
+                        self.slots[inst.slot] = Some(at.pos());
+                    }
+                    ip = inst.goto;
+                }
+                Split(ref inst) => {
+                    // Queue the second branch, continue with the first.
+                    self.m.jobs.push(Job::Inst { ip: inst.goto2, at });
+                    ip = inst.goto1;
+                }
+                EmptyLook(ref inst) => {
+                    let holds = self.input.is_empty_match(at, inst);
+                    if !holds {
+                        return false;
+                    }
+                    ip = inst.goto;
+                }
+                Char(ref inst) => {
+                    let same = inst.c == at.char();
+                    if !same {
+                        return false;
+                    }
+                    ip = inst.goto;
+                    at = self.input.at(at.next_pos());
+                }
+                Ranges(ref inst) => {
+                    let in_class = inst.matches(at.char());
+                    if !in_class {
+                        return false;
+                    }
+                    ip = inst.goto;
+                    at = self.input.at(at.next_pos());
+                }
+                Bytes(ref inst) => match at.byte() {
+                    Some(b) if inst.matches(b) => {
+                        ip = inst.goto;
+                        at = self.input.at(at.next_pos());
+                    }
+                    _ => return false,
+                },
+            }
+        }
+    }
+
+    /// Report whether `(ip, at)` was already explored, marking it as
+    /// explored if it was not.
+    fn has_visited(&mut self, ip: InstPtr, at: InputAt) -> bool {
+        let bit_index = ip * (self.input.len() + 1) + at.pos();
+        let word = bit_index / BIT_SIZE;
+        let mask = usize_to_u32(1 << (bit_index & (BIT_SIZE - 1)));
+        let seen = self.m.visited[word] & mask != 0;
+        if !seen {
+            self.m.visited[word] |= mask;
+        }
+        seen
+    }
+}
+
+/// Convert `n` to `u32`, panicking if the value does not fit.
+fn usize_to_u32(n: usize) -> u32 {
+    let fits = (n as u64) <= (::std::u32::MAX as u64);
+    assert!(fits, "BUG: {} is too big to fit into u32", n);
+    n as u32
+}
diff --git a/third_party/rust/regex/src/compile.rs b/third_party/rust/regex/src/compile.rs
new file mode 100644
index 0000000000..90ca25015f
--- /dev/null
+++ b/third_party/rust/regex/src/compile.rs
@@ -0,0 +1,1264 @@
+use std::collections::HashMap;
+use std::fmt;
+use std::iter;
+use std::result;
+use std::sync::Arc;
+
+use regex_syntax::hir::{self, Hir};
+use regex_syntax::is_word_byte;
+use regex_syntax::utf8::{Utf8Range, Utf8Sequence, Utf8Sequences};
+
+use crate::prog::{
+ EmptyLook, Inst, InstBytes, InstChar, InstEmptyLook, InstPtr, InstRanges,
+ InstSave, InstSplit, Program,
+};
+
+use crate::Error;
+
+// Outcome of a compile step that always emits at least one instruction.
+type Result = result::Result<Patch, Error>;
+// Outcome of a compile step that may emit no instructions at all
+// (`Ok(None)`).
+type ResultOrEmpty = result::Result<Option<Patch>, Error>;
+
+/// Metadata about a run of instructions emitted by one of the `c_*` methods.
+#[derive(Debug)]
+struct Patch {
+ // Jump targets that still have to be filled in by the caller.
+ hole: Hole,
+ // Index of the first instruction of the run.
+ entry: InstPtr,
+}
+
+/// A compiler translates a regular expression AST to a sequence of
+/// instructions. The sequence of instructions represents an NFA.
+// `Compiler` is only public via the `internal` module, so avoid deriving
+// `Debug`.
+#[allow(missing_debug_implementations)]
+pub struct Compiler {
+ // Instructions emitted so far; some may still need backpatching.
+ insts: Vec<MaybeInst>,
+ // The program being populated; returned once compilation finishes.
+ compiled: Program,
+ // Maps each capture group name to its capture index.
+ capture_name_idx: HashMap<String, usize>,
+ // Number of expressions handed to `compile`.
+ num_exprs: usize,
+ // Approximate limit, in bytes, on the size of the compiled program.
+ size_limit: usize,
+ // NOTE(review): not used in the visible part of this file; presumably
+ // caches shared UTF-8 suffix instructions -- confirm against its use.
+ suffix_cache: SuffixCache,
+ // NOTE(review): not used in the visible part of this file; presumably a
+ // reusable UTF-8 sequence iterator -- confirm against its use.
+ utf8_seqs: Option<Utf8Sequences>,
+ // Byte ranges and word-boundary markers recorded during compilation;
+ // converted into the program's byte classes in `compile_finish`.
+ byte_classes: ByteClassSet,
+ // This keeps track of extra bytes allocated while compiling the regex
+ // program. Currently, this corresponds to two things. First is the heap
+ // memory allocated by Unicode character classes ('InstRanges'). Second is
+ // a "fake" amount of memory used by empty sub-expressions, so that enough
+ // empty sub-expressions will ultimately trigger the compiler to bail
+ // because of a size limit restriction. (That empty sub-expressions don't
+ // add to heap memory usage is more-or-less an implementation detail.) In
+ // the second case, if we don't bail, then an excessively large repetition
+ // on an empty sub-expression can result in the compiler using a very large
+ // amount of CPU time.
+ extra_inst_bytes: usize,
+}
+
+impl Compiler {
+ /// Build a compiler with the default configuration.
+ ///
+ /// Options can be adjusted through the builder-style methods before an
+ /// expression is handed to `compile`.
+ pub fn new() -> Self {
+     // Default program size limit: 10 MB.
+     let default_size_limit: usize = 10 * (1 << 20);
+     Compiler {
+         insts: Vec::new(),
+         compiled: Program::new(),
+         capture_name_idx: HashMap::new(),
+         num_exprs: 0,
+         size_limit: default_size_limit,
+         suffix_cache: SuffixCache::new(1000),
+         utf8_seqs: Some(Utf8Sequences::new('\x00', '\x00')),
+         byte_classes: ByteClassSet::new(),
+         extra_inst_bytes: 0,
+     }
+ }
+
+ /// Set the approximate maximum size, in bytes, of the compiled program.
+ ///
+ /// When the program grows past this limit, compilation stops and an
+ /// error is returned.
+ pub fn size_limit(self, size_limit: usize) -> Self {
+     Compiler { size_limit, ..self }
+ }
+
+ /// Choose between a byte based and a Unicode scalar value based
+ /// automaton.
+ ///
+ /// With `yes == true` the program is byte based and UTF-8 decoding is
+ /// built into the machine. Otherwise the automaton works on Unicode
+ /// scalar values and the engine running it is responsible for UTF-8
+ /// decoding.
+ ///
+ /// The specific invariant is that a byte based machine contains neither
+ /// `Char` nor `Ranges` instructions, while a Unicode scalar value
+ /// machine never contains a `Bytes` instruction.
+ ///
+ /// Note that `dfa(true)` implies `bytes(true)`.
+ pub fn bytes(self, yes: bool) -> Self {
+     let mut this = self;
+     this.compiled.is_bytes = yes;
+     this
+ }
+
+ /// Control whether compiled programs are restricted to valid UTF-8.
+ ///
+ /// When enabled (the default), every compiled program matches valid
+ /// UTF-8 bytes exclusively. When disabled, the program may match
+ /// arbitrary bytes.
+ pub fn only_utf8(self, yes: bool) -> Self {
+     let mut this = self;
+     this.compiled.only_utf8 = yes;
+     this
+ }
+
+ /// Mark the program as intended for the DFA matching engine.
+ ///
+ /// For a regex that is not anchored at the beginning, this makes the
+ /// compiler emit a leading `.*?` in the program itself. (The NFA based
+ /// engines handle that prefix explicitly, which is difficult or
+ /// impossible in the DFA engine.)
+ pub fn dfa(self, yes: bool) -> Self {
+     let mut this = self;
+     this.compiled.is_dfa = yes;
+     this
+ }
+
+ /// Mark the program as intended for matching text in reverse; all
+ /// concatenations are then flipped.
+ pub fn reverse(self, yes: bool) -> Self {
+     let mut this = self;
+     this.compiled.is_reverse = yes;
+     this
+ }
+
+ /// Compile one or more regular expressions, given as HIR, into a program.
+ ///
+ /// Compilation only fails when the program exceeds the configured size
+ /// limit, in which case it stops and returns an error.
+ pub fn compile(mut self, exprs: &[Hir]) -> result::Result<Program, Error> {
+     debug_assert!(!exprs.is_empty());
+     let count = exprs.len();
+     self.num_exprs = count;
+     match count {
+         1 => self.compile_one(&exprs[0]),
+         _ => self.compile_many(exprs),
+     }
+ }
+
+ /// Compile a single expression into a complete program ending in
+ /// `Match(0)`.
+ fn compile_one(mut self, expr: &Hir) -> result::Result<Program, Error> {
+ // If we're compiling a forward DFA and we aren't anchored, then
+ // add a `.*?` before the first capture group.
+ // Other matching engines handle this by baking the logic into the
+ // matching engine itself.
+ let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 };
+ self.compiled.is_anchored_start = expr.is_anchored_start();
+ self.compiled.is_anchored_end = expr.is_anchored_end();
+ if self.compiled.needs_dotstar() {
+ dotstar_patch = self.c_dotstar()?;
+ self.compiled.start = dotstar_patch.entry;
+ }
+ // Capture group 0 is the unnamed whole-match group.
+ self.compiled.captures = vec![None];
+ let patch =
+ self.c_capture(0, expr)?.unwrap_or_else(|| self.next_inst());
+ // With a `.*?` prefix, execution starts at the prefix and falls through
+ // into the expression; otherwise it starts at the expression itself.
+ if self.compiled.needs_dotstar() {
+ self.fill(dotstar_patch.hole, patch.entry);
+ } else {
+ self.compiled.start = patch.entry;
+ }
+ // The expression's exits lead to the `Match` instruction pushed next.
+ self.fill_to_next(patch.hole);
+ self.compiled.matches = vec![self.insts.len()];
+ self.push_compiled(Inst::Match(0));
+ self.compile_finish()
+ }
+
+ /// Compile several expressions (a regex set) into one program in which
+ /// expression `i` ends in its own `Match(i)` instruction.
+ fn compile_many(
+ mut self,
+ exprs: &[Hir],
+ ) -> result::Result<Program, Error> {
+ debug_assert!(exprs.len() > 1);
+
+ // The set is only anchored if every member expression is anchored.
+ self.compiled.is_anchored_start =
+ exprs.iter().all(|e| e.is_anchored_start());
+ self.compiled.is_anchored_end =
+ exprs.iter().all(|e| e.is_anchored_end());
+ let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 };
+ if self.compiled.needs_dotstar() {
+ dotstar_patch = self.c_dotstar()?;
+ self.compiled.start = dotstar_patch.entry;
+ } else {
+ self.compiled.start = 0; // first instruction is always split
+ }
+ self.fill_to_next(dotstar_patch.hole);
+
+ // Every expression but the last is preceded by a split: one branch
+ // enters the expression, the other (left open in `prev_hole`) continues
+ // to the next expression.
+ let mut prev_hole = Hole::None;
+ for (i, expr) in exprs[0..exprs.len() - 1].iter().enumerate() {
+ self.fill_to_next(prev_hole);
+ let split = self.push_split_hole();
+ let Patch { hole, entry } =
+ self.c_capture(0, expr)?.unwrap_or_else(|| self.next_inst());
+ self.fill_to_next(hole);
+ self.compiled.matches.push(self.insts.len());
+ self.push_compiled(Inst::Match(i));
+ prev_hole = self.fill_split(split, Some(entry), None);
+ }
+ // The last expression needs no split; the previous split's open branch
+ // jumps straight to it.
+ let i = exprs.len() - 1;
+ let Patch { hole, entry } =
+ self.c_capture(0, &exprs[i])?.unwrap_or_else(|| self.next_inst());
+ self.fill(prev_hole, entry);
+ self.fill_to_next(hole);
+ self.compiled.matches.push(self.insts.len());
+ self.push_compiled(Inst::Match(i));
+ self.compile_finish()
+ }
+
+ /// Finalize compilation: move the emitted instructions, the byte classes
+ /// and the capture name index into the program and return it.
+ fn compile_finish(self) -> result::Result<Program, Error> {
+     let Compiler {
+         insts,
+         mut compiled,
+         capture_name_idx,
+         byte_classes,
+         ..
+     } = self;
+     compiled.insts = insts.into_iter().map(|inst| inst.unwrap()).collect();
+     compiled.byte_classes = byte_classes.byte_classes();
+     compiled.capture_name_idx = Arc::new(capture_name_idx);
+     Ok(compiled)
+ }
+
+ /// Compile expr into self.insts, returning a patch on success,
+ /// or an error if we run out of memory.
+ ///
+ /// All of the c_* methods of the compiler share the contract outlined
+ /// here.
+ ///
+ /// The main thing that a c_* method does is mutate `self.insts`
+ /// to add a list of mostly compiled instructions required to execute
+ /// the given expression. `self.insts` contains MaybeInsts rather than
+ /// Insts because there is some backpatching required.
+ ///
+ /// The `Patch` value returned by each c_* method provides metadata
+ /// about the compiled instructions emitted to `self.insts`. The
+ /// `entry` member of the patch refers to the first instruction
+ /// (the entry point), while the `hole` member contains zero or
+ /// more offsets to partial instructions that need to be backpatched.
+ /// The c_* routine can't know where its list of instructions are going to
+ /// jump to after execution, so it is up to the caller to patch
+ /// these jumps to point to the right place. So compiling some
+ /// expression, e, we would end up with a situation that looked like:
+ ///
+ /// ```text
+ /// self.insts = [ ..., i1, i2, ..., iexit1, ..., iexitn, ...]
+ /// ^ ^ ^
+ /// | \ /
+ /// entry \ /
+ /// hole
+ /// ```
+ ///
+ /// To compile two expressions, e1 and e2, concatenated together we
+ /// would do:
+ ///
+ /// ```ignore
+ /// let patch1 = self.c(e1);
+ /// let patch2 = self.c(e2);
+ /// ```
+ ///
+ /// which leaves us with a situation that looks like
+ ///
+ /// ```text
+ /// self.insts = [ ..., i1, ..., iexit1, ..., i2, ..., iexit2 ]
+ /// ^ ^ ^ ^
+ /// | | | |
+ /// entry1 hole1 entry2 hole2
+ /// ```
+ ///
+ /// Then to merge the two patches together into one we would backpatch
+ /// hole1 with entry2 and return a new patch that enters at entry1
+ /// and has hole2 for a hole. In fact, if you look at the c_concat
+ /// method you will see that it does exactly this, though it handles
+ /// a list of expressions rather than just the two that we use for
+ /// an example.
+ ///
+ /// Ok(None) is returned when an expression is compiled to no
+ /// instruction, and so no patch.entry value makes sense.
+ fn c(&mut self, expr: &Hir) -> ResultOrEmpty {
+ use crate::prog;
+ use regex_syntax::hir::HirKind::*;
+
+ // NOTE(review): `check_size` is defined outside this view; presumably it
+ // returns an error once the program exceeds `size_limit` -- confirm.
+ self.check_size()?;
+ match *expr.kind() {
+ Empty => self.c_empty(),
+ Literal(hir::Literal::Unicode(c)) => self.c_char(c),
+ Literal(hir::Literal::Byte(b)) => {
+ assert!(self.compiled.uses_bytes());
+ self.c_byte(b)
+ }
+ Class(hir::Class::Unicode(ref cls)) => self.c_class(cls.ranges()),
+ Class(hir::Class::Bytes(ref cls)) => {
+ if self.compiled.uses_bytes() {
+ self.c_class_bytes(cls.ranges())
+ } else {
+ // A byte class in a non-byte program must be all ASCII, so
+ // each byte range can be widened to a range of `char`s.
+ assert!(cls.is_all_ascii());
+ let mut char_ranges = vec![];
+ for r in cls.iter() {
+ let (s, e) = (r.start() as char, r.end() as char);
+ char_ranges.push(hir::ClassUnicodeRange::new(s, e));
+ }
+ self.c_class(&char_ranges)
+ }
+ }
+ // For the anchors below, a reverse program swaps each start anchor
+ // with the corresponding end anchor. The line anchors also put `\n`
+ // into a byte class of its own.
+ Anchor(hir::Anchor::StartLine) if self.compiled.is_reverse => {
+ self.byte_classes.set_range(b'\n', b'\n');
+ self.c_empty_look(prog::EmptyLook::EndLine)
+ }
+ Anchor(hir::Anchor::StartLine) => {
+ self.byte_classes.set_range(b'\n', b'\n');
+ self.c_empty_look(prog::EmptyLook::StartLine)
+ }
+ Anchor(hir::Anchor::EndLine) if self.compiled.is_reverse => {
+ self.byte_classes.set_range(b'\n', b'\n');
+ self.c_empty_look(prog::EmptyLook::StartLine)
+ }
+ Anchor(hir::Anchor::EndLine) => {
+ self.byte_classes.set_range(b'\n', b'\n');
+ self.c_empty_look(prog::EmptyLook::EndLine)
+ }
+ Anchor(hir::Anchor::StartText) if self.compiled.is_reverse => {
+ self.c_empty_look(prog::EmptyLook::EndText)
+ }
+ Anchor(hir::Anchor::StartText) => {
+ self.c_empty_look(prog::EmptyLook::StartText)
+ }
+ Anchor(hir::Anchor::EndText) if self.compiled.is_reverse => {
+ self.c_empty_look(prog::EmptyLook::StartText)
+ }
+ Anchor(hir::Anchor::EndText) => {
+ self.c_empty_look(prog::EmptyLook::EndText)
+ }
+ WordBoundary(hir::WordBoundary::Unicode) => {
+ if !cfg!(feature = "unicode-perl") {
+ return Err(Error::Syntax(
+ "Unicode word boundaries are unavailable when \
+ the unicode-perl feature is disabled"
+ .to_string(),
+ ));
+ }
+ self.compiled.has_unicode_word_boundary = true;
+ self.byte_classes.set_word_boundary();
+ // We also make sure that all ASCII bytes are in a different
+ // class from non-ASCII bytes. Otherwise, it's possible for
+ // ASCII bytes to get lumped into the same class as non-ASCII
+ // bytes. This in turn may cause the lazy DFA to falsely start
+ // when it sees an ASCII byte that maps to a byte class with
+ // non-ASCII bytes. This ensures that never happens.
+ self.byte_classes.set_range(0, 0x7F);
+ self.c_empty_look(prog::EmptyLook::WordBoundary)
+ }
+ WordBoundary(hir::WordBoundary::UnicodeNegate) => {
+ if !cfg!(feature = "unicode-perl") {
+ return Err(Error::Syntax(
+ "Unicode word boundaries are unavailable when \
+ the unicode-perl feature is disabled"
+ .to_string(),
+ ));
+ }
+ self.compiled.has_unicode_word_boundary = true;
+ self.byte_classes.set_word_boundary();
+ // See comments above for why we set the ASCII range here.
+ self.byte_classes.set_range(0, 0x7F);
+ self.c_empty_look(prog::EmptyLook::NotWordBoundary)
+ }
+ WordBoundary(hir::WordBoundary::Ascii) => {
+ self.byte_classes.set_word_boundary();
+ self.c_empty_look(prog::EmptyLook::WordBoundaryAscii)
+ }
+ WordBoundary(hir::WordBoundary::AsciiNegate) => {
+ self.byte_classes.set_word_boundary();
+ self.c_empty_look(prog::EmptyLook::NotWordBoundaryAscii)
+ }
+ Group(ref g) => match g.kind {
+ hir::GroupKind::NonCapturing => self.c(&g.hir),
+ hir::GroupKind::CaptureIndex(index) => {
+ // Register the group the first time its index is seen; unnamed
+ // groups are recorded as `None`.
+ if index as usize >= self.compiled.captures.len() {
+ self.compiled.captures.push(None);
+ }
+ // Each capture group owns two consecutive slots, hence the
+ // doubled index.
+ self.c_capture(2 * index as usize, &g.hir)
+ }
+ hir::GroupKind::CaptureName { index, ref name } => {
+ if index as usize >= self.compiled.captures.len() {
+ let n = name.to_string();
+ self.compiled.captures.push(Some(n.clone()));
+ self.capture_name_idx.insert(n, index as usize);
+ }
+ self.c_capture(2 * index as usize, &g.hir)
+ }
+ },
+ Concat(ref es) => {
+ // Reverse programs compile concatenations back to front.
+ if self.compiled.is_reverse {
+ self.c_concat(es.iter().rev())
+ } else {
+ self.c_concat(es)
+ }
+ }
+ Alternation(ref es) => self.c_alternate(&**es),
+ Repetition(ref rep) => self.c_repeat(rep),
+ }
+ }
+
+ /// Compile an empty sub-expression, which emits no instructions.
+ fn c_empty(&mut self) -> ResultOrEmpty {
+     use std::mem::size_of;
+
+     // See: https://github.com/rust-lang/regex/security/advisories/GHSA-m5pq-gvj9-9vr8
+     // See: CVE-2022-24713
+     //
+     // An empty sub-expression adds nothing to the compiled object, so
+     // its cost is "faked" here as one instruction's worth of bytes. That
+     // way the size check (`check_size`) eventually stops compilation
+     // when there are too many empty sub-expressions (e.g., via a large
+     // repetition).
+     let fake_cost = size_of::<Inst>();
+     self.extra_inst_bytes += fake_cost;
+     Ok(None)
+ }
+
+ /// Compile `expr` wrapped in a pair of `Save` instructions using slots
+ /// `first_slot` and `first_slot + 1`.
+ ///
+ /// For regex sets and DFA programs the `Save` instructions are left out
+ /// and only `expr` itself is compiled.
+ fn c_capture(&mut self, first_slot: usize, expr: &Hir) -> ResultOrEmpty {
+     // Regex sets never use captures and DFAs can't handle them, so no
+     // Save instructions are emitted in either case.
+     if self.num_exprs > 1 || self.compiled.is_dfa {
+         return self.c(expr);
+     }
+     let entry = self.insts.len();
+     let open = self.push_hole(InstHole::Save { slot: first_slot });
+     let inner = self.c(expr)?.unwrap_or_else(|| self.next_inst());
+     self.fill(open, inner.entry);
+     self.fill_to_next(inner.hole);
+     let close = self.push_hole(InstHole::Save { slot: first_slot + 1 });
+     Ok(Some(Patch { hole: close, entry }))
+ }
+
+ /// Compile a non-greedy "match anything, zero or more times" prefix.
+ ///
+ /// The argument to `Hir::any` is `true` exactly when the program is not
+ /// restricted to valid UTF-8.
+ fn c_dotstar(&mut self) -> Result {
+     let any_bytes = !self.compiled.only_utf8();
+     let dotstar = Hir::repetition(hir::Repetition {
+         kind: hir::RepetitionKind::ZeroOrMore,
+         greedy: false,
+         hir: Box::new(Hir::any(any_bytes)),
+     });
+     let patch = self.c(&dotstar)?.unwrap();
+     Ok(patch)
+ }
+
+ /// Compile a single Unicode scalar value.
+ ///
+ /// Char based programs get a `Char` instruction. Byte based programs get
+ /// a one-byte `Bytes` instruction for ASCII and go through `c_class` for
+ /// everything else.
+ fn c_char(&mut self, c: char) -> ResultOrEmpty {
+     if !self.compiled.uses_bytes() {
+         let hole = self.push_hole(InstHole::Char { c });
+         let entry = self.insts.len() - 1;
+         return Ok(Some(Patch { hole, entry }));
+     }
+     if !c.is_ascii() {
+         let single = [hir::ClassUnicodeRange::new(c, c)];
+         return self.c_class(&single);
+     }
+     let b = c as u8;
+     let hole = self.push_hole(InstHole::Bytes { start: b, end: b });
+     self.byte_classes.set_range(b, b);
+     let entry = self.insts.len() - 1;
+     Ok(Some(Patch { hole, entry }))
+ }
+
+ /// Compile a non-empty Unicode character class.
+ ///
+ /// Byte based programs delegate to `CompileClass`. Char based programs
+ /// emit a `Char` instruction when the class is a single scalar value and
+ /// a `Ranges` instruction otherwise.
+ fn c_class(&mut self, ranges: &[hir::ClassUnicodeRange]) -> ResultOrEmpty {
+     assert!(!ranges.is_empty());
+     if self.compiled.uses_bytes() {
+         let patch = CompileClass { c: self, ranges }.compile()?;
+         return Ok(Some(patch));
+     }
+
+     let pairs: Vec<(char, char)> =
+         ranges.iter().map(|r| (r.start(), r.end())).collect();
+     let is_single_char = pairs.len() == 1 && pairs[0].0 == pairs[0].1;
+     let hole = if is_single_char {
+         self.push_hole(InstHole::Char { c: pairs[0].0 })
+     } else {
+         // Account for the heap memory the range table occupies.
+         let pair_bytes = std::mem::size_of::<char>() * 2;
+         self.extra_inst_bytes += pairs.len() * pair_bytes;
+         self.push_hole(InstHole::Ranges { ranges: pairs })
+     };
+     let entry = self.insts.len() - 1;
+     Ok(Some(Patch { hole, entry }))
+ }
+
+ /// Compile a single literal byte as a one-element byte class.
+ fn c_byte(&mut self, b: u8) -> ResultOrEmpty {
+     let single = [hir::ClassBytesRange::new(b, b)];
+     self.c_class_bytes(&single)
+ }
+
+ /// Compile a non-empty byte class as a chain of splits, one `Bytes`
+ /// instruction per range.
+ ///
+ /// Every range except the last is preceded by a split whose first branch
+ /// tests the range and whose second branch moves on to the next range.
+ /// The exits of all the `Bytes` instructions are returned together.
+ fn c_class_bytes(
+     &mut self,
+     ranges: &[hir::ClassBytesRange],
+ ) -> ResultOrEmpty {
+     debug_assert!(!ranges.is_empty());
+
+     let class_entry = self.insts.len();
+     let last_idx = ranges.len() - 1;
+     let mut exits = vec![];
+     let mut pending = Hole::None;
+     for range in &ranges[..last_idx] {
+         self.fill_to_next(pending);
+         let split = self.push_split_hole();
+         let byte_inst = self.insts.len();
+         let (start, end) = (range.start(), range.end());
+         self.byte_classes.set_range(start, end);
+         exits.push(self.push_hole(InstHole::Bytes { start, end }));
+         pending = self.fill_split(split, Some(byte_inst), None);
+     }
+     // The final range needs no split in front of it.
+     let byte_inst = self.insts.len();
+     let last = &ranges[last_idx];
+     let (start, end) = (last.start(), last.end());
+     self.byte_classes.set_range(start, end);
+     exits.push(self.push_hole(InstHole::Bytes { start, end }));
+     self.fill(pending, byte_inst);
+     Ok(Some(Patch { hole: Hole::Many(exits), entry: class_entry }))
+ }
+
+ /// Emit a single zero-width assertion instruction for `look`.
+ fn c_empty_look(&mut self, look: EmptyLook) -> ResultOrEmpty {
+     let hole = self.push_hole(InstHole::EmptyLook { look });
+     let entry = self.insts.len() - 1;
+     Ok(Some(Patch { hole, entry }))
+ }
+
+ /// Compile a sequence of expressions so that each one runs into the
+ /// next.
+ ///
+ /// Expressions that compile to no instructions are skipped. If none of
+ /// them emits anything, the result is that of `c_empty`.
+ fn c_concat<'a, I>(&mut self, exprs: I) -> ResultOrEmpty
+ where
+     I: IntoIterator<Item = &'a Hir>,
+ {
+     let mut pending = exprs.into_iter();
+     // Find the first expression that actually emits instructions; it
+     // supplies the entry point of the whole concatenation.
+     let head = loop {
+         let e = match pending.next() {
+             Some(e) => e,
+             None => return self.c_empty(),
+         };
+         if let Some(patch) = self.c(e)? {
+             break patch;
+         }
+     };
+     let entry = head.entry;
+     let mut tail_hole = head.hole;
+     // Chain the rest: the previous exit jumps to the next entry.
+     for e in pending {
+         if let Some(patch) = self.c(e)? {
+             self.fill(tail_hole, patch.entry);
+             tail_hole = patch.hole;
+         }
+     }
+     Ok(Some(Patch { hole: tail_hole, entry }))
+ }
+
+ /// Compile an alternation of at least two expressions as a chain of
+ /// splits; the exits of all branches are returned together as one hole.
+ fn c_alternate(&mut self, exprs: &[Hir]) -> ResultOrEmpty {
+ debug_assert!(
+ exprs.len() >= 2,
+ "alternates must have at least 2 exprs"
+ );
+
+ // Initial entry point is always the first split.
+ let first_split_entry = self.insts.len();
+
+ // Save up all of the holes from each alternate. They will all get
+ // patched to point to the same location.
+ let mut holes = vec![];
+
+ // true indicates that the hole is a split where we want to fill
+ // the second branch.
+ let mut prev_hole = (Hole::None, false);
+ for e in &exprs[0..exprs.len() - 1] {
+ // Point the previous split's still-open branch at the split that is
+ // about to be pushed for this alternate.
+ if prev_hole.1 {
+ let next = self.insts.len();
+ self.fill_split(prev_hole.0, None, Some(next));
+ } else {
+ self.fill_to_next(prev_hole.0);
+ }
+ let split = self.push_split_hole();
+ if let Some(Patch { hole, entry }) = self.c(e)? {
+ holes.push(hole);
+ prev_hole = (self.fill_split(split, Some(entry), None), false);
+ } else {
+ // The alternate emitted nothing: one copy of the split hole acts
+ // as this branch's exit, the other is kept so its second branch
+ // can be filled later.
+ let (split1, split2) = split.dup_one();
+ holes.push(split1);
+ prev_hole = (split2, true);
+ }
+ }
+ // The last alternate gets no split of its own.
+ if let Some(Patch { hole, entry }) = self.c(&exprs[exprs.len() - 1])? {
+ holes.push(hole);
+ if prev_hole.1 {
+ self.fill_split(prev_hole.0, None, Some(entry));
+ } else {
+ self.fill(prev_hole.0, entry);
+ }
+ } else {
+ // We ignore prev_hole.1. When it's true, it means we have two
+ // empty branches both pushing prev_hole.0 into holes, so both
+ // branches will go to the same place anyway.
+ holes.push(prev_hole.0);
+ }
+ Ok(Some(Patch { hole: Hole::Many(holes), entry: first_split_entry }))
+ }
+
+ /// Dispatch a repetition to the routine matching its kind.
+ fn c_repeat(&mut self, rep: &hir::Repetition) -> ResultOrEmpty {
+     let expr: &Hir = &rep.hir;
+     let greedy = rep.greedy;
+     match rep.kind {
+         hir::RepetitionKind::ZeroOrOne => {
+             self.c_repeat_zero_or_one(expr, greedy)
+         }
+         hir::RepetitionKind::ZeroOrMore => {
+             self.c_repeat_zero_or_more(expr, greedy)
+         }
+         hir::RepetitionKind::OneOrMore => {
+             self.c_repeat_one_or_more(expr, greedy)
+         }
+         hir::RepetitionKind::Range(ref range) => match *range {
+             hir::RepetitionRange::Exactly(count) => {
+                 self.c_repeat_range(expr, greedy, count, count)
+             }
+             hir::RepetitionRange::AtLeast(min) => {
+                 self.c_repeat_range_min_or_more(expr, greedy, min)
+             }
+             hir::RepetitionRange::Bounded(min, max) => {
+                 self.c_repeat_range(expr, greedy, min, max)
+             }
+         },
+     }
+ }
+
+ /// Compile `expr?`: a split that either enters `expr` or skips it.
+ ///
+ /// `greedy` decides which branch of the split is tried first.
+ fn c_repeat_zero_or_one(
+     &mut self,
+     expr: &Hir,
+     greedy: bool,
+ ) -> ResultOrEmpty {
+     let split_entry = self.insts.len();
+     let split = self.push_split_hole();
+     let body = match self.c(expr)? {
+         Some(patch) => patch,
+         // Nothing was emitted for `expr`, so the split is taken back.
+         None => return self.pop_split_hole(),
+     };
+     let (first, second) = if greedy {
+         (Some(body.entry), None)
+     } else {
+         (None, Some(body.entry))
+     };
+     let skip_hole = self.fill_split(split, first, second);
+     let exits = vec![body.hole, skip_hole];
+     Ok(Some(Patch { hole: Hole::Many(exits), entry: split_entry }))
+ }
+
+ /// Compile `expr*`: a split that either enters `expr`, which loops back
+ /// to the split, or leaves the loop.
+ ///
+ /// `greedy` decides which branch of the split is tried first.
+ fn c_repeat_zero_or_more(
+     &mut self,
+     expr: &Hir,
+     greedy: bool,
+ ) -> ResultOrEmpty {
+     let split_entry = self.insts.len();
+     let split = self.push_split_hole();
+     let body = match self.c(expr)? {
+         Some(patch) => patch,
+         // Nothing was emitted for `expr`, so the split is taken back.
+         None => return self.pop_split_hole(),
+     };
+
+     // After one iteration of the body, go back to the split.
+     self.fill(body.hole, split_entry);
+     let (first, second) = if greedy {
+         (Some(body.entry), None)
+     } else {
+         (None, Some(body.entry))
+     };
+     let exit = self.fill_split(split, first, second);
+     Ok(Some(Patch { hole: exit, entry: split_entry }))
+ }
+
+ /// Compile `expr+`: `expr` followed by a split that either jumps back to
+ /// `expr` or leaves the loop.
+ ///
+ /// `greedy` decides which branch of the split is tried first.
+ fn c_repeat_one_or_more(
+     &mut self,
+     expr: &Hir,
+     greedy: bool,
+ ) -> ResultOrEmpty {
+     let body = match self.c(expr)? {
+         Some(patch) => patch,
+         None => return Ok(None),
+     };
+     let loop_entry = body.entry;
+     self.fill_to_next(body.hole);
+     let split = self.push_split_hole();
+
+     let (first, second) = if greedy {
+         (Some(loop_entry), None)
+     } else {
+         (None, Some(loop_entry))
+     };
+     let exit = self.fill_split(split, first, second);
+     Ok(Some(Patch { hole: exit, entry: loop_entry }))
+ }
+
+    /// Compiles at least `min` occurrences of `expr`.
+    ///
+    /// This is `min` concatenated copies of the sub-expression followed by a
+    /// zero-or-more repetition of it. Returns `Ok(None)` when the trailing
+    /// repetition compiles to nothing.
+    fn c_repeat_range_min_or_more(
+        &mut self,
+        expr: &Hir,
+        greedy: bool,
+        min: u32,
+    ) -> ResultOrEmpty {
+        let required = u32_to_usize(min);
+        // Substituting next_inst() for an empty prefix is safe: the
+        // placeholder only ends up in the returned patch when the trailing
+        // repetition produced instructions, and those start exactly at the
+        // placeholder's entry.
+        let prefix = self
+            .c_concat(iter::repeat(expr).take(required))?
+            .unwrap_or_else(|| self.next_inst());
+        let tail = match self.c_repeat_zero_or_more(expr, greedy)? {
+            None => return Ok(None),
+            Some(patch) => patch,
+        };
+        self.fill(prefix.hole, tail.entry);
+        Ok(Some(Patch { hole: tail.hole, entry: prefix.entry }))
+    }
+
+    /// Compiles between `min` and `max` (inclusive) occurrences of `expr`.
+    ///
+    /// The first `min` copies are mandatory and compiled as a plain
+    /// concatenation. Each of the remaining `max - min` copies is guarded by
+    /// its own split. When `min == max` the concatenation alone is returned.
+    fn c_repeat_range(
+        &mut self,
+        expr: &Hir,
+        greedy: bool,
+        min: u32,
+        max: u32,
+    ) -> ResultOrEmpty {
+        let (min, max) = (u32_to_usize(min), u32_to_usize(max));
+        debug_assert!(min <= max);
+        let required = self.c_concat(iter::repeat(expr).take(min))?;
+        if min == max {
+            return Ok(required);
+        }
+        // From here on min < max, so at least one split is emitted below and
+        // substituting next_inst() for an empty prefix is safe (same
+        // reasoning as in c_repeat_range_min_or_more).
+        let required = required.unwrap_or_else(|| self.next_inst());
+        let entry = required.entry;
+        // The simplest way to compile, e.g., `a{2,5}` would be
+        //
+        //     aaa?a?a?
+        //
+        // which yields instructions like these:
+        //
+        //     0: 'a'
+        //     1: 'a',
+        //     2: split(3, 4)
+        //     3: 'a'
+        //     4: split(5, 6)
+        //     5: 'a'
+        //     6: split(7, 8)
+        //     7: 'a'
+        //     8: MATCH
+        //
+        // That is *incredibly* inefficient: the splits form a chain that has
+        // to be resolved every time a transition is followed.
+        //
+        // Instead, every split's skip branch is left open and collected in
+        // `holes`, so all of them are patched to whatever follows the
+        // repetition.
+        let mut holes = vec![];
+        let mut pending = required.hole;
+        for _ in min..max {
+            self.fill_to_next(pending);
+            let split = self.push_split_hole();
+            let body = match self.c(expr)? {
+                None => return self.pop_split_hole(),
+                Some(patch) => patch,
+            };
+            pending = body.hole;
+            let (goto1, goto2) = if greedy {
+                (Some(body.entry), None)
+            } else {
+                (None, Some(body.entry))
+            };
+            holes.push(self.fill_split(split, goto1, goto2));
+        }
+        holes.push(pending);
+        Ok(Some(Patch { hole: Hole::Many(holes), entry }))
+    }
+
+    /// Returns a placeholder patch that has no hole and whose entry is the
+    /// index the next pushed instruction will receive.
+    ///
+    /// It can stand in for the result of a `c_*` function that produced
+    /// nothing, provided the call is followed by pushing at least one
+    /// instruction that always runs after whatever that function wrote.
+    fn next_inst(&self) -> Patch {
+        let entry = self.insts.len();
+        Patch { hole: Hole::None, entry }
+    }
+
+    /// Patches every instruction referenced by `hole` to continue at `goto`.
+    ///
+    /// `Hole::None` is a no-op; `Hole::Many` is patched recursively.
+    fn fill(&mut self, hole: Hole, goto: InstPtr) {
+        match hole {
+            Hole::Many(holes) => {
+                holes.into_iter().for_each(|inner| self.fill(inner, goto));
+            }
+            Hole::One(pc) => self.insts[pc].fill(goto),
+            Hole::None => {}
+        }
+    }
+
+    /// Patches `hole` to continue at the instruction that will be pushed
+    /// next.
+    fn fill_to_next(&mut self, hole: Hole) {
+        self.fill(hole, self.insts.len());
+    }
+
+    /// Fills one or both branches of the split instruction(s) referenced by
+    /// `hole` and returns whatever still needs patching afterwards.
+    ///
+    /// * Both targets given: the split is complete and `Hole::None` is
+    ///   returned.
+    /// * Exactly one target given: that branch is recorded and the same
+    ///   instruction is returned as a hole for the other branch.
+    /// * Neither given: a bug; this panics.
+    ///
+    /// For `Hole::Many`, every member is processed with the same targets.
+    /// The results are collapsed to `Hole::None` (no members) or to the
+    /// single result (one member) where possible.
+    fn fill_split(
+        &mut self,
+        hole: Hole,
+        goto1: Option<InstPtr>,
+        goto2: Option<InstPtr>,
+    ) -> Hole {
+        match hole {
+            Hole::None => Hole::None,
+            Hole::One(pc) => match (goto1, goto2) {
+                (None, None) => unreachable!(
+                    "at least one of the split holes must be filled"
+                ),
+                (Some(first), None) => {
+                    self.insts[pc].half_fill_split_goto1(first);
+                    Hole::One(pc)
+                }
+                (None, Some(second)) => {
+                    self.insts[pc].half_fill_split_goto2(second);
+                    Hole::One(pc)
+                }
+                (Some(first), Some(second)) => {
+                    self.insts[pc].fill_split(first, second);
+                    Hole::None
+                }
+            },
+            Hole::Many(holes) => {
+                let mut remaining: Vec<Hole> = holes
+                    .into_iter()
+                    .map(|inner| self.fill_split(inner, goto1, goto2))
+                    .collect();
+                match remaining.len() {
+                    0 => Hole::None,
+                    1 => remaining.pop().unwrap(),
+                    _ => Hole::Many(remaining),
+                }
+            }
+        }
+    }
+
+    /// Appends an instruction that needs no further patching.
+    fn push_compiled(&mut self, inst: Inst) {
+        let slot = MaybeInst::Compiled(inst);
+        self.insts.push(slot);
+    }
+
+    /// Appends an instruction whose jump target is still unknown and returns
+    /// a hole referring to it.
+    fn push_hole(&mut self, inst: InstHole) -> Hole {
+        self.insts.push(MaybeInst::Uncompiled(inst));
+        Hole::One(self.insts.len() - 1)
+    }
+
+    /// Appends a split with both branches unset and returns a hole referring
+    /// to it.
+    fn push_split_hole(&mut self) -> Hole {
+        self.insts.push(MaybeInst::Split);
+        Hole::One(self.insts.len() - 1)
+    }
+
+    /// Removes the most recently pushed instruction and reports that nothing
+    /// was compiled.
+    ///
+    /// Callers in this file use it to undo a `push_split_hole` when the
+    /// sub-expression behind the split turned out to be empty.
+    fn pop_split_hole(&mut self) -> ResultOrEmpty {
+        let _discarded = self.insts.pop();
+        Ok(None)
+    }
+
+    /// Checks the program built so far against the configured size limit.
+    ///
+    /// The size is `extra_inst_bytes` plus `size_of::<Inst>()` for every
+    /// instruction slot.
+    ///
+    /// # Errors
+    ///
+    /// Returns `Error::CompiledTooBig` carrying the limit when the size
+    /// exceeds `size_limit`.
+    fn check_size(&self) -> result::Result<(), Error> {
+        let inst_bytes = self.insts.len() * ::std::mem::size_of::<Inst>();
+        let total = self.extra_inst_bytes + inst_bytes;
+        if total <= self.size_limit {
+            Ok(())
+        } else {
+            Err(Error::CompiledTooBig(self.size_limit))
+        }
+    }
+}
+
+/// A reference to zero, one or several instructions that still need a jump
+/// target.
+#[derive(Debug)]
+enum Hole {
+    /// Nothing to patch.
+    None,
+    /// The instruction at this index needs patching.
+    One(InstPtr),
+    /// Several holes that are patched together.
+    Many(Vec<Hole>),
+}
+
+impl Hole {
+    /// Splits a `Hole::One` into two holes referring to the same
+    /// instruction.
+    ///
+    /// # Panics
+    ///
+    /// Panics when called on `Hole::None` or `Hole::Many`.
+    fn dup_one(self) -> (Self, Self) {
+        if let Hole::One(pc) = self {
+            return (Hole::One(pc), Hole::One(pc));
+        }
+        unreachable!("must be called on single hole")
+    }
+}
+
+/// An instruction slot of the program under construction.
+///
+/// Every variant other than `Compiled` still lacks at least one jump target;
+/// the `fill*` methods below supply them.
+#[derive(Clone, Debug)]
+enum MaybeInst {
+    /// A finished instruction.
+    Compiled(Inst),
+    /// An instruction that still lacks its jump target.
+    Uncompiled(InstHole),
+    /// A split with neither branch set.
+    Split,
+    /// A split whose first branch (`goto1`) is set.
+    Split1(InstPtr),
+    /// A split whose second branch (`goto2`) is set.
+    Split2(InstPtr),
+}
+
+impl MaybeInst {
+    /// Supplies the next missing jump target.
+    ///
+    /// * `Uncompiled` becomes `Compiled` with `goto` as its target.
+    /// * `Split` becomes `Split1(goto)`, i.e. the first branch is set.
+    /// * `Split1` / `Split2` receive `goto` as the branch they still lack
+    ///   and become `Compiled`.
+    ///
+    /// # Panics
+    ///
+    /// Panics when called on `Compiled`, which has nothing left to fill.
+    fn fill(&mut self, goto: InstPtr) {
+        let filled = match *self {
+            MaybeInst::Split => MaybeInst::Split1(goto),
+            MaybeInst::Uncompiled(ref inst) => {
+                MaybeInst::Compiled(inst.fill(goto))
+            }
+            MaybeInst::Split1(goto1) => {
+                MaybeInst::Compiled(Inst::Split(InstSplit {
+                    goto1,
+                    goto2: goto,
+                }))
+            }
+            MaybeInst::Split2(goto2) => {
+                MaybeInst::Compiled(Inst::Split(InstSplit {
+                    goto1: goto,
+                    goto2,
+                }))
+            }
+            // The only remaining variant. The previous message claimed an
+            // *uncompiled* instruction had been found, which is the opposite
+            // of what this arm matches.
+            MaybeInst::Compiled(_) => unreachable!(
+                "must be called on an instruction with an unfilled goto, \
+                 instead it was called on: {:?}",
+                self
+            ),
+        };
+        *self = filled;
+    }
+
+    /// Panics unless this slot is a `Split` with both branches unset.
+    fn expect_unfilled_split(&self) {
+        match *self {
+            MaybeInst::Split => {}
+            _ => unreachable!(
+                "must be called on Split instruction, \
+                 instead it was called on: {:?}",
+                self
+            ),
+        }
+    }
+
+    /// Sets both branches of an unset `Split`, making it `Compiled`.
+    ///
+    /// # Panics
+    ///
+    /// Panics when this slot is not `Split`.
+    fn fill_split(&mut self, goto1: InstPtr, goto2: InstPtr) {
+        self.expect_unfilled_split();
+        *self = MaybeInst::Compiled(Inst::Split(InstSplit { goto1, goto2 }));
+    }
+
+    /// Sets only the first branch of an unset `Split`, making it `Split1`.
+    ///
+    /// # Panics
+    ///
+    /// Panics when this slot is not `Split`.
+    fn half_fill_split_goto1(&mut self, goto1: InstPtr) {
+        self.expect_unfilled_split();
+        *self = MaybeInst::Split1(goto1);
+    }
+
+    /// Sets only the second branch of an unset `Split`, making it `Split2`.
+    ///
+    /// # Panics
+    ///
+    /// Panics when this slot is not `Split`.
+    fn half_fill_split_goto2(&mut self, goto2: InstPtr) {
+        self.expect_unfilled_split();
+        *self = MaybeInst::Split2(goto2);
+    }
+
+    /// Extracts the finished instruction.
+    ///
+    /// # Panics
+    ///
+    /// Panics when this slot is not `Compiled`.
+    fn unwrap(self) -> Inst {
+        match self {
+            MaybeInst::Compiled(inst) => inst,
+            _ => unreachable!(
+                "must be called on a compiled instruction, \
+                 instead it was called on: {:?}",
+                self
+            ),
+        }
+    }
+}
+
+/// The payload of an instruction whose jump target is not known yet.
+///
+/// Each variant carries everything its `Inst` counterpart needs apart from
+/// `goto`.
+#[derive(Clone, Debug)]
+enum InstHole {
+    Save { slot: usize },
+    EmptyLook { look: EmptyLook },
+    Char { c: char },
+    Ranges { ranges: Vec<(char, char)> },
+    Bytes { start: u8, end: u8 },
+}
+
+impl InstHole {
+    /// Builds the finished instruction by combining this payload with
+    /// `goto`.
+    ///
+    /// `self` is left untouched; the range list of `Ranges` is cloned into a
+    /// boxed slice.
+    fn fill(&self, goto: InstPtr) -> Inst {
+        match self {
+            InstHole::Bytes { start, end } => {
+                Inst::Bytes(InstBytes { goto, start: *start, end: *end })
+            }
+            InstHole::Ranges { ranges } => {
+                let ranges = ranges.clone().into_boxed_slice();
+                Inst::Ranges(InstRanges { goto, ranges })
+            }
+            InstHole::Char { c } => Inst::Char(InstChar { goto, c: *c }),
+            InstHole::EmptyLook { look } => {
+                Inst::EmptyLook(InstEmptyLook { goto, look: *look })
+            }
+            InstHole::Save { slot } => {
+                Inst::Save(InstSave { goto, slot: *slot })
+            }
+        }
+    }
+}
+
+/// Compiles a list of Unicode scalar value ranges into byte-oriented
+/// instructions, one alternative per UTF-8 byte sequence.
+struct CompileClass<'a, 'b> {
+ // The compiler whose instruction list, suffix cache and byte class set are
+ // written to.
+ c: &'a mut Compiler,
+ // The ranges to compile.
+ ranges: &'b [hir::ClassUnicodeRange],
+}
+
+impl<'a, 'b> CompileClass<'a, 'b> {
+ /// Emits the alternation over all UTF-8 sequences of all ranges.
+ ///
+ /// Every sequence except the very last one is preceded by a split whose
+ /// first branch enters the sequence and whose second branch leads to the
+ /// next alternative. The returned patch collects the exit hole of every
+ /// sequence; its entry is the first instruction emitted here.
+ ///
+ /// NOTE(review): `initial_entry.unwrap()` panics if no sequence is
+ /// produced at all (e.g. `ranges` is empty); presumably callers never
+ /// pass an empty class - confirm.
+ fn compile(mut self) -> Result {
+ let mut holes = vec![];
+ let mut initial_entry = None;
+ let mut last_split = Hole::None;
+ // Borrow the sequence generator from the compiler for the duration of
+ // this call; it is handed back just before the successful return.
+ let mut utf8_seqs = self.c.utf8_seqs.take().unwrap();
+ self.c.suffix_cache.clear();
+
+ for (i, range) in self.ranges.iter().enumerate() {
+ let is_last_range = i + 1 == self.ranges.len();
+ utf8_seqs.reset(range.start(), range.end());
+ // Peekable so that the final sequence of the final range can be
+ // recognized before it is compiled.
+ let mut it = (&mut utf8_seqs).peekable();
+ loop {
+ let utf8_seq = match it.next() {
+ None => break,
+ Some(utf8_seq) => utf8_seq,
+ };
+ if is_last_range && it.peek().is_none() {
+ // Last alternative: no split in front of it. The open branch
+ // of the previous split (if any) jumps straight to it.
+ let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?;
+ holes.push(hole);
+ self.c.fill(last_split, entry);
+ last_split = Hole::None;
+ if initial_entry.is_none() {
+ initial_entry = Some(entry);
+ }
+ } else {
+ if initial_entry.is_none() {
+ initial_entry = Some(self.c.insts.len());
+ }
+ // Chain the previous split's open branch to the split pushed
+ // next, then point the new split's first branch at this
+ // sequence. Its second branch stays open in `last_split`.
+ self.c.fill_to_next(last_split);
+ last_split = self.c.push_split_hole();
+ let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?;
+ holes.push(hole);
+ last_split =
+ self.c.fill_split(last_split, Some(entry), None);
+ }
+ }
+ }
+ self.c.utf8_seqs = Some(utf8_seqs);
+ Ok(Patch { hole: Hole::Many(holes), entry: initial_entry.unwrap() })
+ }
+
+ /// Compiles one UTF-8 sequence, choosing the order in which its byte
+ /// ranges are handed to `c_utf8_seq_`.
+ ///
+ /// `c_utf8_seq_` makes the first range it receives the *last* one matched
+ /// (each later instruction jumps to the one emitted before it). A forward
+ /// program therefore passes the ranges reversed, while a reverse program
+ /// passes them as they are.
+ fn c_utf8_seq(&mut self, seq: &Utf8Sequence) -> Result {
+ if self.c.compiled.is_reverse {
+ self.c_utf8_seq_(seq)
+ } else {
+ self.c_utf8_seq_(seq.into_iter().rev())
+ }
+ }
+
+ /// Emits (or reuses from the suffix cache) one `Bytes` instruction per
+ /// byte range in `seq`.
+ ///
+ /// The returned patch's entry is the instruction for the last range
+ /// processed. Its hole is the instruction for the first range processed,
+ /// or `Hole::None` if that one was served from the cache.
+ fn c_utf8_seq_<'r, I>(&mut self, seq: I) -> Result
+ where
+ I: IntoIterator<Item = &'r Utf8Range>,
+ {
+ // The initial instruction for each UTF-8 sequence should be the same.
+ // `usize::MAX` marks "no instruction yet" for this sequence.
+ let mut from_inst = ::std::usize::MAX;
+ let mut last_hole = Hole::None;
+ for byte_range in seq {
+ let key = SuffixCacheKey {
+ from_inst,
+ start: byte_range.start,
+ end: byte_range.end,
+ };
+ {
+ // On a miss the cache records `pc`, the index the instruction
+ // pushed below is about to receive.
+ let pc = self.c.insts.len();
+ if let Some(cached_pc) = self.c.suffix_cache.get(key, pc) {
+ from_inst = cached_pc;
+ continue;
+ }
+ }
+ self.c.byte_classes.set_range(byte_range.start, byte_range.end);
+ if from_inst == ::std::usize::MAX {
+ // First instruction of the sequence: its target is not known yet.
+ last_hole = self.c.push_hole(InstHole::Bytes {
+ start: byte_range.start,
+ end: byte_range.end,
+ });
+ } else {
+ // Later instructions jump to the one handled just before.
+ self.c.push_compiled(Inst::Bytes(InstBytes {
+ goto: from_inst,
+ start: byte_range.start,
+ end: byte_range.end,
+ }));
+ }
+ from_inst = self.c.insts.len().checked_sub(1).unwrap();
+ debug_assert!(from_inst < ::std::usize::MAX);
+ }
+ debug_assert!(from_inst < ::std::usize::MAX);
+ Ok(Patch { hole: last_hole, entry: from_inst })
+ }
+}
+
+/// `SuffixCache` remembers which instruction was emitted for a given suffix
+/// of a UTF-8 automaton so that later alternates can jump to it instead of
+/// emitting a copy. For example, the Unicode range \u{0}-\u{FFFF} expands to
+/// these byte ranges:
+///
+/// [0-7F]
+/// [C2-DF][80-BF]
+/// [E0][A0-BF][80-BF]
+/// [E1-EC][80-BF][80-BF]
+/// [ED][80-9F][80-BF]
+/// [EE-EF][80-BF][80-BF]
+///
+/// Every line becomes one alternate in the compiled program, yet all but one
+/// of them end in the same suffix. Emitting that suffix once per alternate
+/// wastes instructions; the cache lets the alternates share it.
+///
+/// A `HashMap` would do the job as well, but its overhead is not needed
+/// here.
+///
+/// The layout follows the idea of
+/// [`SparseSet`](../sparse/struct.SparseSet.html): `sparse` is indexed by
+/// the hash of a key and holds a position in `dense`, and a lookup only
+/// counts as a hit if the entry found there carries the same full key.
+#[derive(Debug)]
+struct SuffixCache {
+    sparse: Box<[usize]>,
+    dense: Vec<SuffixCacheEntry>,
+}
+
+/// One cached suffix: the key and the instruction emitted for it.
+#[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
+struct SuffixCacheEntry {
+    key: SuffixCacheKey,
+    pc: InstPtr,
+}
+
+/// Identifies a suffix by its byte range and the instruction it continues
+/// to.
+#[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
+struct SuffixCacheKey {
+    from_inst: InstPtr,
+    start: u8,
+    end: u8,
+}
+
+impl SuffixCache {
+    /// Creates a cache with `size` hash buckets.
+    fn new(size: usize) -> Self {
+        let sparse = vec![0usize; size].into_boxed_slice();
+        let dense = Vec::with_capacity(size);
+        SuffixCache { sparse, dense }
+    }
+
+    /// Looks up `key`.
+    ///
+    /// On a hit, the cached instruction pointer is returned and `pc` is
+    /// ignored. On a miss, `key` is recorded as mapping to `pc`, the
+    /// bucket is redirected to the new entry, and `None` is returned.
+    fn get(&mut self, key: SuffixCacheKey, pc: InstPtr) -> Option<InstPtr> {
+        let bucket = self.hash(&key);
+        let cached = match self.dense.get(self.sparse[bucket]) {
+            Some(entry) if entry.key == key => Some(entry.pc),
+            _ => None,
+        };
+        if cached.is_none() {
+            self.sparse[bucket] = self.dense.len();
+            self.dense.push(SuffixCacheEntry { key, pc });
+        }
+        cached
+    }
+
+    /// Forgets all entries.
+    ///
+    /// Only `dense` is emptied. Stale positions left in `sparse` are
+    /// harmless because `get` always compares the full key.
+    fn clear(&mut self) {
+        self.dense.clear();
+    }
+
+    /// Maps a key to a bucket index in `sparse`.
+    fn hash(&self, suffix: &SuffixCacheKey) -> usize {
+        // FNV-1a style mixing with the parameters described at:
+        // https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
+        const FNV_OFFSET_BASIS: u64 = 14_695_981_039_346_656_037;
+        const FNV_PRIME: u64 = 1_099_511_628_211;
+        let fields =
+            [suffix.from_inst as u64, suffix.start as u64, suffix.end as u64];
+        let h = fields.iter().fold(FNV_OFFSET_BASIS, |acc, &field| {
+            (acc ^ field).wrapping_mul(FNV_PRIME)
+        });
+        (h as usize) % self.sparse.len()
+    }
+}
+
+/// Records, for every byte value, whether a byte class ends at that byte.
+///
+/// `byte_classes` turns these boundary marks into a byte -> class table.
+struct ByteClassSet([bool; 256]);
+
+impl ByteClassSet {
+    /// Creates a set with no boundaries, i.e. a single class for all bytes.
+    fn new() -> Self {
+        ByteClassSet([false; 256])
+    }
+
+    /// Marks `start..=end` as a range that must not share a class with the
+    /// bytes on either side of it.
+    ///
+    /// A boundary is placed after `end` and, unless `start` is 0, after
+    /// `start - 1`.
+    fn set_range(&mut self, start: u8, end: u8) {
+        debug_assert!(start <= end);
+        if let Some(before) = (start as usize).checked_sub(1) {
+            self.0[before] = true;
+        }
+        self.0[end as usize] = true;
+    }
+
+    /// Places boundaries so that word bytes and non-word bytes never share a
+    /// class: every maximal run of bytes on which `is_word_byte` agrees is
+    /// registered as a range.
+    fn set_word_boundary(&mut self) {
+        // u16 counters so the loop can step past 255 and terminate.
+        let mut run_start: u16 = 0;
+        while run_start <= 255 {
+            let is_word = is_word_byte(run_start as u8);
+            let mut run_end = run_start + 1;
+            while run_end <= 255 && is_word_byte(run_end as u8) == is_word {
+                run_end += 1;
+            }
+            self.set_range(run_start as u8, (run_end - 1) as u8);
+            run_start = run_end;
+        }
+    }
+
+    /// Builds the table mapping each byte to its class number.
+    ///
+    /// Classes are numbered from 0 and the number increases after every
+    /// byte that carries a boundary mark. The mark on byte 255 is never
+    /// consulted since no byte follows it.
+    fn byte_classes(&self) -> Vec<u8> {
+        // N.B. When debugging the DFA it can help to return the identity
+        // mapping instead, which effectively disables byte classes and makes
+        // the transitions easier to read:
+        // (0usize..256).map(|x| x as u8).collect()
+        let mut table = vec![0u8; 256];
+        let mut class = 0u8;
+        for byte in 0..255 {
+            table[byte] = class;
+            if self.0[byte] {
+                class = class.checked_add(1).unwrap();
+            }
+        }
+        table[255] = class;
+        table
+    }
+}
+
+impl fmt::Debug for ByteClassSet {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let marks: &[bool] = &self.0[..];
+        f.debug_tuple("ByteClassSet").field(&marks).finish()
+    }
+}
+
+/// Converts a `u32` to `usize`, panicking if the value does not fit.
+///
+/// The check only matters where `usize` is narrower than 32 bits; on most
+/// platforms it compiles to nothing.
+fn u32_to_usize(n: u32) -> usize {
+    // TODO Use `std::convert::TryFrom` once it's stable.
+    let fits = (n as u64) <= (::std::usize::MAX as u64);
+    if !fits {
+        panic!("BUG: {} is too big to be pointer sized", n)
+    }
+    n as usize
+}
+
+#[cfg(test)]
+mod tests {
+    use super::ByteClassSet;
+
+    /// Asserts that every `(byte, class)` pair holds in `classes`.
+    fn assert_classes(classes: &[u8], expected: &[(usize, u8)]) {
+        for &(byte, class) in expected {
+            assert_eq!(classes[byte], class);
+        }
+    }
+
+    #[test]
+    fn byte_classes() {
+        let (a, m, z) = (b'a' as usize, b'm' as usize, b'z' as usize);
+
+        // A single range in the middle yields three classes.
+        let mut set = ByteClassSet::new();
+        set.set_range(b'a', b'z');
+        assert_classes(
+            &set.byte_classes(),
+            &[
+                (0, 0),
+                (1, 0),
+                (2, 0),
+                (a - 1, 0),
+                (a, 1),
+                (m, 1),
+                (z, 1),
+                (z + 1, 2),
+                (254, 2),
+                (255, 2),
+            ],
+        );
+
+        // Two ranges with a one byte gap between them yield four classes.
+        let mut set = ByteClassSet::new();
+        set.set_range(0, 2);
+        set.set_range(4, 6);
+        assert_classes(
+            &set.byte_classes(),
+            &[
+                (0, 0),
+                (1, 0),
+                (2, 0),
+                (3, 1),
+                (4, 2),
+                (5, 2),
+                (6, 2),
+                (7, 3),
+                (255, 3),
+            ],
+        );
+    }
+
+    #[test]
+    fn full_byte_classes() {
+        // Every byte in its own class must not overflow the class counter.
+        let mut set = ByteClassSet::new();
+        for byte in (0..256u16).map(|i| i as u8) {
+            set.set_range(byte, byte);
+        }
+        assert_eq!(set.byte_classes().len(), 256);
+    }
+}
diff --git a/third_party/rust/regex/src/dfa.rs b/third_party/rust/regex/src/dfa.rs
new file mode 100644
index 0000000000..dc9952120e
--- /dev/null
+++ b/third_party/rust/regex/src/dfa.rs
@@ -0,0 +1,1945 @@
+/*!
+The DFA matching engine.
+
+A DFA provides faster matching because the engine is in exactly one state at
+any point in time. In the NFA, there may be multiple active states, and
+considerable CPU cycles are spent shuffling them around. In finite automata
+speak, the DFA follows epsilon transitions in the regex far less than the NFA.
+
+A DFA is a classic trade off between time and space. The NFA is slower, but
+its memory requirements are typically small and predictable. The DFA is faster,
+but given the right regex and the right input, the number of states in the
+DFA can grow exponentially. To mitigate this space problem, we do two things:
+
+1. We implement an *online* DFA. That is, the DFA is constructed from the NFA
+ during a search. When a new state is computed, it is stored in a cache so
+ that it may be reused. An important consequence of this implementation
+ is that states that are never reached for a particular input are never
+ computed. (This is impossible in an "offline" DFA which needs to compute
+ all possible states up front.)
+2. If the cache gets too big, we wipe it and continue matching.
+
+In pathological cases, a new state can be created for every byte of input.
+(e.g., The regex `(a|b)*a(a|b){20}` on a long sequence of a's and b's.)
+In this case, performance regresses to slightly slower than the full NFA
+simulation, in large part because the cache becomes useless. If the cache
+is wiped too frequently, the DFA quits and control falls back to one of the
+NFA simulations.
+
+Because of the "lazy" nature of this DFA, the inner matching loop is
+considerably more complex than one might expect out of a DFA. A number of
+tricks are employed to make it fast. Tread carefully.
+
+N.B. While this implementation is heavily commented, Russ Cox's series of
+articles on regexes is strongly recommended: <https://swtch.com/~rsc/regexp/>
+(As is the DFA implementation in RE2, which heavily influenced this
+implementation.)
+*/
+
+use std::collections::HashMap;
+use std::fmt;
+use std::iter::repeat;
+use std::mem;
+use std::sync::Arc;
+
+use crate::exec::ProgramCache;
+use crate::prog::{Inst, Program};
+use crate::sparse::SparseSet;
+
+/// Return true if and only if the given program can be executed by a DFA.
+///
+/// Generally, a DFA is always possible. This function returns false when:
+///
+/// * the program's `dfa_size_limit` is zero;
+/// * the program has more than `i32::MAX` instructions, since the DFA stores
+///   instruction pointers as 32 bit signed deltas;
+/// * the program contains any Unicode instruction (`Char` or `Ranges`),
+///   since the DFA operates on bytes only.
+pub fn can_exec(insts: &Program) -> bool {
+    use crate::prog::Inst::*;
+    // If for some reason we manage to allocate a regex program with more
+    // than i32::MAX instructions, then we can't execute the DFA because we
+    // use 32 bit instruction pointer deltas for memory savings.
+    // If i32::MAX is the largest positive delta,
+    // then -i32::MAX == i32::MIN + 1 is the largest negative delta,
+    // and we are OK to use 32 bits.
+    if insts.dfa_size_limit == 0 || insts.len() > ::std::i32::MAX as usize {
+        return false;
+    }
+    // The match stays exhaustive on purpose: a new instruction kind must be
+    // classified here explicitly.
+    insts.into_iter().all(|inst| match *inst {
+        Char(_) | Ranges(_) => false,
+        EmptyLook(_) | Match(_) | Save(_) | Split(_) | Bytes(_) => true,
+    })
+}
+
+/// A reusable cache of DFA states.
+///
+/// This cache is reused between multiple invocations of the same regex
+/// program. (It is not shared simultaneously between threads. If there is
+/// contention, then new caches are created.)
+#[derive(Debug)]
+pub struct Cache {
+ /// Group persistent DFA related cache state together. The sparse sets
+ /// listed below are used as scratch space while computing uncached states.
+ inner: CacheInner,
+ /// qcur and qnext are ordered sets with constant time
+ /// addition/membership/clearing-whole-set and linear time iteration. They
+ /// are used to manage the sets of NFA states in DFA states when computing
+ /// cached DFA states. In particular, the order of the NFA states matters
+ /// for leftmost-first style matching. Namely, when computing a cached
+ /// state, the set of NFA states stops growing as soon as the first Match
+ /// instruction is observed.
+ qcur: SparseSet,
+ /// The second scratch set; see the description on `qcur`, which covers
+ /// both.
+ qnext: SparseSet,
+}
+
+/// `CacheInner` is logically just a part of Cache, but groups together fields
+/// that aren't passed as function parameters throughout search. (This split
+/// is mostly an artifact of the borrow checker. It is happily paid.)
+#[derive(Debug)]
+struct CacheInner {
+ /// A cache of pre-compiled DFA states, keyed by the set of NFA states
+ /// and the set of empty-width flags set at the byte in the input when the
+ /// state was observed.
+ ///
+ /// A StatePtr is effectively a `*State`, but to avoid various inconvenient
+ /// things, we just pass indexes around manually. The performance impact of
+ /// this is probably an instruction or two in the inner loop. However, on
+ /// 64 bit, each StatePtr is half the size of a *State.
+ compiled: StateMap,
+ /// The transition table.
+ ///
+ /// The transition table is laid out in row-major order, where states are
+ /// rows and the transitions for each state are columns. At a high level,
+ /// given state `s` and byte `b`, the next state can be found at index
+ /// `s * 256 + b`.
+ ///
+ /// This is, of course, a lie. A StatePtr is actually a pointer to the
+ /// *start* of a row in this table. When indexing in the DFA's inner loop,
+ /// this removes the need to multiply the StatePtr by the stride. Yes, it
+ /// matters. This reduces the number of states we can store, but: the
+ /// stride is rarely 256 since we define transitions in terms of
+ /// *equivalence classes* of bytes. Each class corresponds to a set of
+ /// bytes that never discriminate a distinct path through the DFA from each
+ /// other.
+ trans: Transitions,
+ /// A set of cached start states, which are limited to the number of
+ /// permutations of flags set just before the initial byte of input. (The
+ /// index into this vec is an `EmptyFlags`.)
+ ///
+ /// `Cache::new` allocates 256 entries, all initially `STATE_UNKNOWN`.
+ ///
+ /// N.B. A start state can be "dead" (i.e., no possible match), so we
+ /// represent it with a StatePtr.
+ start_states: Vec<StatePtr>,
+ /// Stack scratch space used to follow epsilon transitions in the NFA.
+ /// (This permits us to avoid recursion.)
+ ///
+ /// The maximum stack size is the number of NFA states.
+ stack: Vec<InstPtr>,
+ /// The total number of times this cache has been flushed by the DFA
+ /// because of space constraints.
+ flush_count: u64,
+ /// The total heap size of the DFA's cache. We use this to determine when
+ /// we should flush the cache.
+ size: usize,
+ /// Scratch space used when building instruction pointer lists for new
+ /// states. This helps amortize allocation.
+ insts_scratch_space: Vec<u8>,
+}
+
+/// The transition table.
+///
+/// It is laid out in row-major order, with states as rows and byte class
+/// transitions as columns.
+///
+/// The transition table is responsible for producing valid `StatePtrs`. A
+/// `StatePtr` points to the start of a particular row in this table. When
+/// indexing to find the next state this allows us to avoid a multiplication
+/// when computing an index into the table.
+#[derive(Clone)]
+struct Transitions {
+ /// The table: all rows stored back to back, each entry being the
+ /// `StatePtr` of the next state.
+ table: Vec<StatePtr>,
+ /// The stride, i.e., the number of columns (byte classes) in each row.
+ num_byte_classes: usize,
+}
+
+/// Fsm encapsulates the actual execution of the DFA.
+///
+/// The lifetime `'a` covers both borrows held by this value: the program and
+/// the mutable cache.
+#[derive(Debug)]
+pub struct Fsm<'a> {
+ /// prog contains the NFA instruction opcodes. DFA execution uses either
+ /// the `dfa` instructions or the `dfa_reverse` instructions from
+ /// `exec::ExecReadOnly`. (It never uses `ExecReadOnly.nfa`, which may have
+ /// Unicode opcodes that cannot be executed by the DFA.)
+ prog: &'a Program,
+ /// The start state. We record it here because the pointer may change
+ /// when the cache is wiped.
+ start: StatePtr,
+ /// The current position in the input.
+ at: usize,
+ /// Should we quit after seeing the first match? e.g., When the caller
+ /// uses `is_match` or `shortest_match`.
+ quit_after_match: bool,
+ /// The last state that matched.
+ ///
+ /// When no match has occurred, this is set to STATE_UNKNOWN.
+ ///
+ /// This is only useful when matching regex sets. The last match state
+ /// is useful because it contains all of the match instructions seen,
+ /// thereby allowing us to enumerate which regexes in the set matched.
+ last_match_si: StatePtr,
+ /// The input position of the last cache flush. We use this to determine
+ /// if we're thrashing in the cache too often. If so, the DFA quits so
+ /// that we can fall back to the NFA algorithm.
+ last_cache_flush: usize,
+ /// All cached DFA information that is persisted between searches.
+ cache: &'a mut CacheInner,
+}
+
+/// The outcome of a DFA search.
+///
+/// Usually the outcome is a match or the absence of one. Sometimes, however,
+/// the DFA runs too slowly because its cache is too small, and it gives up
+/// so that the caller can fall back to the NFA algorithm.
+///
+/// The DFA can also give up if it runs out of room to create new states, or
+/// if it sees non-ASCII bytes in the presence of a Unicode word boundary.
+#[derive(Clone, Debug)]
+pub enum Result<T> {
+    /// A match was found; the payload describes it.
+    Match(T),
+    /// No match was found; the payload is the non-match position.
+    NoMatch(usize),
+    /// The DFA gave up.
+    Quit,
+}
+
+impl<T> Result<T> {
+    /// Returns true if this result corresponds to a match.
+    pub fn is_match(&self) -> bool {
+        if let Result::Match(_) = *self {
+            true
+        } else {
+            false
+        }
+    }
+
+    /// Applies `f` to the payload of a match and returns the result.
+    ///
+    /// `NoMatch` and `Quit` are passed through unchanged.
+    #[cfg(feature = "perf-literal")]
+    pub fn map<U, F: FnMut(T) -> U>(self, mut f: F) -> Result<U> {
+        match self {
+            Result::Quit => Result::Quit,
+            Result::NoMatch(at) => Result::NoMatch(at),
+            Result::Match(value) => Result::Match(f(value)),
+        }
+    }
+
+    /// Replaces the position stored in a `NoMatch` with `at`.
+    ///
+    /// `Match` and `Quit` are returned unchanged.
+    fn set_non_match(self, at: usize) -> Result<T> {
+        if let Result::NoMatch(_) = self {
+            Result::NoMatch(at)
+        } else {
+            self
+        }
+    }
+}
+
+/// `State` is a DFA state. It contains an ordered set of NFA states (not
+/// necessarily complete) and a smattering of flags.
+///
+/// The flags are packed into the first byte of data.
+///
+/// States don't carry their transitions. Instead, transitions are stored in
+/// a single row-major table.
+///
+/// Delta encoding is used to store the instruction pointers.
+/// The first instruction pointer is stored directly starting
+/// at data[1], and each following pointer is stored as an offset
+/// to the previous one. If a delta is in the range -127..127,
+/// it is packed into a single byte; Otherwise the byte 128 (-128 as an i8)
+/// is coded as a flag, followed by 4 bytes encoding the delta.
+#[derive(Clone, Eq, Hash, PartialEq)]
+struct State {
+ /// Byte 0 is read back as the `StateFlags` (see `flags`); the bytes from
+ /// index 1 on are the encoded instruction pointers (see `inst_ptrs`).
+ data: Arc<[u8]>,
+}
+
+/// `InstPtr` is a 32 bit pointer into a sequence of opcodes (i.e., it indexes
+/// an NFA state).
+///
+/// Throughout this library, this is usually set to `usize`, but we force a
+/// `u32` here for the DFA to save on space.
+///
+/// In this file it is the element type of the epsilon-closure `stack` and the
+/// value that `push_inst_ptr` delta-encodes.
+type InstPtr = u32;
+
+/// Appends `ip` to `data`, encoded as its signed difference from `prev`.
+///
+/// On return `prev` equals `ip`, ready for the next call.
+fn push_inst_ptr(data: &mut Vec<u8>, prev: &mut InstPtr, ip: InstPtr) {
+    let (from, to) = (*prev as i32, ip as i32);
+    write_vari32(data, to - from);
+    *prev = ip;
+}
+
+/// Iterator that decodes a delta-encoded list of instruction pointers.
+struct InstPtrs<'a> {
+    /// The most recently decoded pointer; deltas are applied to it.
+    base: usize,
+    /// The encoded bytes not yet consumed.
+    data: &'a [u8],
+}
+
+impl<'a> Iterator for InstPtrs<'a> {
+    type Item = usize;
+
+    /// Decodes the next delta, adds it to the running base and yields the
+    /// result. Returns `None` once all bytes are consumed.
+    fn next(&mut self) -> Option<usize> {
+        if self.data.is_empty() {
+            return None;
+        }
+        let (delta, consumed) = read_vari32(self.data);
+        let decoded = self.base as i32 + delta;
+        debug_assert!(decoded >= 0);
+        debug_assert!(consumed > 0);
+        self.data = &self.data[consumed..];
+        self.base = decoded as usize;
+        Some(self.base)
+    }
+}
+
+impl State {
+    /// Returns the flags stored in the first byte of the state's data.
+    fn flags(&self) -> StateFlags {
+        let first = self.data[0];
+        StateFlags(first)
+    }
+
+    /// Returns an iterator over the instruction pointers encoded after the
+    /// flag byte.
+    fn inst_ptrs(&self) -> InstPtrs<'_> {
+        let encoded = &self.data[1..];
+        InstPtrs { base: 0, data: encoded }
+    }
+}
+
+/// `StatePtr` is a 32 bit pointer to the start of a row in the transition
+/// table.
+///
+/// It has many special values. There are two types of special values:
+/// sentinels and flags.
+///
+/// Sentinels correspond to special states that carry some kind of
+/// significance. There are three such states: unknown, dead and quit states.
+///
+/// Unknown states are states that haven't been computed yet. They indicate
+/// that a transition should be filled in that points to either an existing
+/// cached state or a new state altogether. In general, an unknown state means
+/// "follow the NFA's epsilon transitions."
+///
+/// Dead states are states that can never lead to a match, no matter what
+/// subsequent input is observed. This means that the DFA should quit
+/// immediately and return the longest match it has found thus far.
+///
+/// Quit states are states that imply the DFA is not capable of matching the
+/// regex correctly. Currently, this is only used when a Unicode word boundary
+/// exists in the regex *and* a non-ASCII byte is observed.
+///
+/// The other type of state pointer is a state pointer with special flag bits.
+/// There are two flags: a start flag and a match flag. The lower bits of both
+/// kinds always contain a "valid" `StatePtr` (indicated by the `STATE_MAX`
+/// mask).
+///
+/// The start flag means that the state is a start state, and therefore may be
+/// subject to special prefix scanning optimizations.
+///
+/// The match flag means that the state is a match state, and therefore the
+/// current position in the input (while searching) should be recorded.
+///
+/// The above exists mostly in the service of making the inner loop fast.
+/// In particular, the inner *inner* loop looks something like this:
+///
+/// ```ignore
+/// while state <= STATE_MAX and i < len(text):
+/// state = state.next[i]
+/// ```
+///
+/// This is nice because it lets us execute a lazy DFA as if it were an
+/// entirely offline DFA (i.e., with very few instructions). The loop will
+/// quit only when we need to examine a case that needs special attention.
+type StatePtr = u32;
+
+/// An unknown state means that the state has not been computed yet, and that
+/// the only way to progress is to compute it.
+///
+/// This is the highest bit of a `StatePtr`, so it and the two sentinels
+/// derived from it (`STATE_DEAD`, `STATE_QUIT`) all compare greater than
+/// `STATE_MAX`.
+const STATE_UNKNOWN: StatePtr = 1 << 31;
+
+/// A dead state means that the state has been computed and it is known that
+/// once it is entered, no future match can ever occur.
+const STATE_DEAD: StatePtr = STATE_UNKNOWN + 1;
+
+/// A quit state means that the DFA came across some input that it doesn't
+/// know how to process correctly. The DFA should quit and another matching
+/// engine should be run in its place.
+const STATE_QUIT: StatePtr = STATE_DEAD + 1;
+
+/// A start state is a state that the DFA can start in.
+///
+/// Note that start states have their lower bits set to a state pointer.
+const STATE_START: StatePtr = 1 << 30;
+
+/// A match state means that the regex has successfully matched.
+///
+/// Note that match states have their lower bits set to a state pointer.
+const STATE_MATCH: StatePtr = 1 << 29;
+
+/// The maximum state pointer. This is useful to mask out the "valid" state
+/// pointer from a state with the "start" or "match" bits set.
+///
+/// It doesn't make sense to use this with unknown, dead or quit state
+/// pointers, since those pointers are sentinels and never have their lower
+/// bits set to anything meaningful.
+///
+/// Its value is every bit below the match flag set (`STATE_MATCH - 1`).
+const STATE_MAX: StatePtr = STATE_MATCH - 1;
+
+/// Byte is a u8 in spirit, but a u16 in practice so that we can represent the
+/// special EOF sentinel value.
+///
+/// `Cache::new` reserves one extra byte class for that EOF value.
+#[derive(Copy, Clone, Debug)]
+struct Byte(u16);
+
+/// A set of flags for zero-width assertions.
+///
+/// `CacheInner::start_states` is documented as being indexed by a value of
+/// this type.
+#[derive(Clone, Copy, Eq, Debug, Default, Hash, PartialEq)]
+struct EmptyFlags {
+ start: bool,
+ end: bool,
+ start_line: bool,
+ end_line: bool,
+ word_boundary: bool,
+ not_word_boundary: bool,
+}
+
+/// A set of flags describing various configurations of a DFA state. This is
+/// represented by a `u8` so that it is compact.
+///
+/// `State::flags` builds it from the first byte of a state's data.
+#[derive(Clone, Copy, Eq, Default, Hash, PartialEq)]
+struct StateFlags(u8);
+
+impl Cache {
+    /// Creates a new, empty cache for running the DFA over `prog`.
+    ///
+    /// The transition table stride is the program's number of byte classes
+    /// plus one for the special EOF byte. All 256 start state slots begin as
+    /// `STATE_UNKNOWN`, and both scratch sets are sized to the program's
+    /// instruction count.
+    pub fn new(prog: &Program) -> Self {
+        // `byte_classes[255]` is the highest class number, hence the first
+        // `+ 1`; the second `+ 1` accounts for the special EOF byte.
+        let num_byte_classes = (prog.byte_classes[255] as usize + 1) + 1;
+        let inst_count = prog.insts.len();
+
+        let inner = CacheInner {
+            compiled: StateMap::new(num_byte_classes),
+            trans: Transitions::new(num_byte_classes),
+            start_states: vec![STATE_UNKNOWN; 256],
+            stack: vec![],
+            flush_count: 0,
+            size: 0,
+            insts_scratch_space: vec![],
+        };
+        let mut cache = Cache {
+            inner,
+            qcur: SparseSet::new(inst_count),
+            qnext: SparseSet::new(inst_count),
+        };
+        cache.inner.reset_size();
+        cache
+    }
+}
+
+impl CacheInner {
+ /// Resets the cache size to account for fixed costs: the start state table
+ /// and the stack, each counted as length times element size.
+ fn reset_size(&mut self) {
+ self.size = (self.start_states.len() * mem::size_of::<StatePtr>())
+ + (self.stack.len() * mem::size_of::<InstPtr>());
+ }
+}
+
+impl<'a> Fsm<'a> {
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub fn forward( // runs the forward DFA over `text`, starting at `at`
+ prog: &'a Program,
+ cache: &ProgramCache,
+ quit_after_match: bool, // return as soon as any match state is reached
+ text: &[u8],
+ at: usize,
+ ) -> Result<usize> {
+ let mut cache = cache.borrow_mut();
+ let cache = &mut cache.dfa; // the forward DFA has its own cache
+ let mut dfa = Fsm {
+ prog,
+ start: 0, // filled in below
+ at,
+ quit_after_match,
+ last_match_si: STATE_UNKNOWN,
+ last_cache_flush: at,
+ cache: &mut cache.inner,
+ };
+ let (empty_flags, state_flags) = dfa.start_flags(text, at);
+ dfa.start =
+ match dfa.start_state(&mut cache.qcur, empty_flags, state_flags) {
+ None => return Result::Quit, // the DFA gave up building the start state
+ Some(STATE_DEAD) => return Result::NoMatch(at),
+ Some(si) => si,
+ };
+ debug_assert!(dfa.start != STATE_UNKNOWN);
+ dfa.exec_at(&mut cache.qcur, &mut cache.qnext, text)
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub fn reverse( // runs the reverse DFA over `text`, moving backwards from `at`
+ prog: &'a Program,
+ cache: &ProgramCache,
+ quit_after_match: bool, // return as soon as any match state is reached
+ text: &[u8],
+ at: usize,
+ ) -> Result<usize> {
+ let mut cache = cache.borrow_mut();
+ let cache = &mut cache.dfa_reverse; // the reverse DFA has its own cache
+ let mut dfa = Fsm {
+ prog,
+ start: 0, // filled in below
+ at,
+ quit_after_match,
+ last_match_si: STATE_UNKNOWN,
+ last_cache_flush: at,
+ cache: &mut cache.inner,
+ };
+ let (empty_flags, state_flags) = dfa.start_flags_reverse(text, at);
+ dfa.start =
+ match dfa.start_state(&mut cache.qcur, empty_flags, state_flags) {
+ None => return Result::Quit, // the DFA gave up building the start state
+ Some(STATE_DEAD) => return Result::NoMatch(at),
+ Some(si) => si,
+ };
+ debug_assert!(dfa.start != STATE_UNKNOWN);
+ dfa.exec_at_reverse(&mut cache.qcur, &mut cache.qnext, text)
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub fn forward_many( // forward search that also records which patterns matched
+ prog: &'a Program,
+ cache: &ProgramCache,
+ matches: &mut [bool], // one entry per `prog.matches`; matched entries are set to true
+ text: &[u8],
+ at: usize,
+ ) -> Result<usize> {
+ debug_assert!(matches.len() == prog.matches.len());
+ let mut cache = cache.borrow_mut();
+ let cache = &mut cache.dfa;
+ let mut dfa = Fsm {
+ prog,
+ start: 0, // filled in below
+ at,
+ quit_after_match: false, // never stop at the first match here
+ last_match_si: STATE_UNKNOWN,
+ last_cache_flush: at,
+ cache: &mut cache.inner,
+ };
+ let (empty_flags, state_flags) = dfa.start_flags(text, at);
+ dfa.start =
+ match dfa.start_state(&mut cache.qcur, empty_flags, state_flags) {
+ None => return Result::Quit,
+ Some(STATE_DEAD) => return Result::NoMatch(at),
+ Some(si) => si,
+ };
+ debug_assert!(dfa.start != STATE_UNKNOWN);
+ let result = dfa.exec_at(&mut cache.qcur, &mut cache.qnext, text);
+ if result.is_match() {
+ if matches.len() == 1 {
+ matches[0] = true; // a single pattern: it must be the one that matched
+ } else {
+ debug_assert!(dfa.last_match_si != STATE_UNKNOWN);
+ debug_assert!(dfa.last_match_si != STATE_DEAD);
+ for ip in dfa.state(dfa.last_match_si).inst_ptrs() { // scan the last match state
+ if let Inst::Match(slot) = dfa.prog[ip] {
+ matches[slot] = true;
+ }
+ }
+ }
+ }
+ result
+ }
+
+ /// Executes the DFA on a forward NFA, starting at `self.at` in state `self.start`.
+ ///
+ /// {qcur,qnext} are scratch ordered sets which may be non-empty.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn exec_at(
+ &mut self,
+ qcur: &mut SparseSet,
+ qnext: &mut SparseSet,
+ text: &[u8],
+ ) -> Result<usize> {
+ // For the most part, the DFA is basically:
+ //
+ // last_match = null
+ // while current_byte != EOF:
+ // si = current_state.next[current_byte]
+ // if si is match
+ // last_match = si
+ // return last_match
+ //
+ // However, we need to deal with a few things:
+ //
+ // 1. This is an *online* DFA, so the current state's next list
+ // may not point to anywhere yet, so we must go out and compute
+ // them. (They are then cached into the current state's next list
+ // to avoid re-computation.)
+ // 2. If we come across a state that is known to be dead (i.e., never
+ // leads to a match), then we can quit early.
+ // 3. If the caller just wants to know if a match occurs, then we
+ // can quit as soon as we know we have a match. (Full leftmost
+ // first semantics require continuing on.)
+ // 4. If we're in the start state, then we can use a pre-computed set
+ // of prefix literals to skip quickly along the input.
+ // 5. After the input is exhausted, we run the DFA on one symbol
+ // that stands for EOF. This is useful for handling empty width
+ // assertions.
+ // 6. We can't actually do state.next[byte]. Instead, we have to do
+ // state.next[byte_classes[byte]], which permits us to keep the
+ // 'next' list very small.
+ //
+ // Since there's a bunch of extra stuff we need to consider, we do some
+ // pretty hairy tricks to get the inner loop to run as fast as
+ // possible.
+ debug_assert!(!self.prog.is_reverse);
+
+ // The last match is the currently known ending match position. It is
+ // reported as an index to the most recent byte that resulted in a
+ // transition to a match state and is always stored in capture slot `1`
+ // when searching forwards. Its maximum value is `text.len()`.
+ let mut result = Result::NoMatch(self.at);
+ let (mut prev_si, mut next_si) = (self.start, self.start);
+ let mut at = self.at;
+ while at < text.len() {
+ // This is the real inner loop. We take advantage of special bits
+ // set in the state pointer to determine whether a state is in the
+ // "common" case or not. Specifically, the common case is a
+ // non-match non-start non-dead state that has already been
+ // computed. So long as we remain in the common case, this inner
+ // loop will chew through the input.
+ //
+ // We also unroll the loop 4 times to amortize the cost of checking
+ // whether we've consumed the entire input. We are also careful
+ // to make sure that `prev_si` always represents the previous state
+ // and `next_si` always represents the next state after the loop
+ // exits, even if it isn't always true inside the loop.
+ while next_si <= STATE_MAX && at < text.len() {
+ // SAFETY: `at < text.len()` is checked by the loop condition above.
+ prev_si = unsafe { self.next_si(next_si, text, at) };
+ at += 1;
+ if prev_si > STATE_MAX || at + 2 >= text.len() { // bail unless 3 more bytes are readable
+ mem::swap(&mut prev_si, &mut next_si);
+ break;
+ }
+ next_si = unsafe { self.next_si(prev_si, text, at) };
+ at += 1;
+ if next_si > STATE_MAX {
+ break;
+ }
+ prev_si = unsafe { self.next_si(next_si, text, at) };
+ at += 1;
+ if prev_si > STATE_MAX {
+ mem::swap(&mut prev_si, &mut next_si);
+ break;
+ }
+ next_si = unsafe { self.next_si(prev_si, text, at) };
+ at += 1;
+ }
+ if next_si & STATE_MATCH > 0 {
+ // A match state is outside of the common case because it needs
+ // special case analysis. In particular, we need to record the
+ // last position as having matched and possibly quit the DFA if
+ // we don't need to keep matching.
+ next_si &= !STATE_MATCH;
+ result = Result::Match(at - 1);
+ if self.quit_after_match {
+ return result;
+ }
+ self.last_match_si = next_si;
+ prev_si = next_si;
+
+ // This permits short-circuiting when matching a regex set.
+ // In particular, if this DFA state contains only match states,
+ // then it's impossible to extend the set of matches since
+ // match states are final. Therefore, we can quit.
+ if self.prog.matches.len() > 1 {
+ let state = self.state(next_si);
+ let just_matches =
+ state.inst_ptrs().all(|ip| self.prog[ip].is_match());
+ if just_matches {
+ return result;
+ }
+ }
+
+ // Another inner loop! If the DFA stays in this particular
+ // match state, then we can rip through all of the input
+ // very quickly, and only recording the match location once
+ // we've left this particular state.
+ let cur = at;
+ while (next_si & !STATE_MATCH) == prev_si
+ && at + 2 < text.len()
+ {
+ // SAFETY: `at + 2 < text.len()` above implies `at` is in bounds.
+ next_si = unsafe {
+ self.next_si(next_si & !STATE_MATCH, text, at)
+ };
+ at += 1;
+ }
+ if at > cur {
+ result = Result::Match(at - 2);
+ }
+ } else if next_si & STATE_START > 0 {
+ // A start state isn't in the common case because we may
+ // want to do quick prefix scanning. If the program doesn't
+ // have a detected prefix, then start states are actually
+ // considered common and this case is never reached.
+ debug_assert!(self.has_prefix());
+ next_si &= !STATE_START;
+ prev_si = next_si;
+ at = match self.prefix_at(text, at) {
+ None => return Result::NoMatch(text.len()),
+ Some(i) => i,
+ };
+ } else if next_si >= STATE_UNKNOWN {
+ if next_si == STATE_QUIT {
+ return Result::Quit;
+ }
+ // Finally, this corresponds to the case where the transition
+ // entered a state that can never lead to a match or a state
+ // that hasn't been computed yet. The latter being the "slow"
+ // path.
+ let byte = Byte::byte(text[at - 1]);
+ // We no longer care about the special bits in the state
+ // pointer.
+ prev_si &= STATE_MAX;
+ // Record where we are. This is used to track progress for
+ // determining whether we should quit if we've flushed the
+ // cache too much.
+ self.at = at;
+ next_si = match self.next_state(qcur, qnext, prev_si, byte) {
+ None => return Result::Quit,
+ Some(STATE_DEAD) => return result.set_non_match(at),
+ Some(si) => si,
+ };
+ debug_assert!(next_si != STATE_UNKNOWN);
+ if next_si & STATE_MATCH > 0 {
+ next_si &= !STATE_MATCH;
+ result = Result::Match(at - 1);
+ if self.quit_after_match {
+ return result;
+ }
+ self.last_match_si = next_si;
+ }
+ prev_si = next_si;
+ } else {
+ prev_si = next_si;
+ }
+ }
+
+ // Run the DFA once more on the special EOF sentinel value.
+ // We don't care about the special bits in the state pointer any more,
+ // so get rid of them.
+ prev_si &= STATE_MAX;
+ prev_si = match self.next_state(qcur, qnext, prev_si, Byte::eof()) {
+ None => return Result::Quit,
+ Some(STATE_DEAD) => return result.set_non_match(text.len()),
+ Some(si) => si & !STATE_START,
+ };
+ debug_assert!(prev_si != STATE_UNKNOWN);
+ if prev_si & STATE_MATCH > 0 {
+ prev_si &= !STATE_MATCH;
+ self.last_match_si = prev_si;
+ result = Result::Match(text.len());
+ }
+ result
+ }
+
+ /// Executes the DFA on a reverse NFA, moving backwards from `self.at` towards index 0.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn exec_at_reverse(
+ &mut self,
+ qcur: &mut SparseSet,
+ qnext: &mut SparseSet,
+ text: &[u8],
+ ) -> Result<usize> {
+ // The comments in `exec_at` above mostly apply here too. The main
+ // difference is that we move backwards over the input and we look for
+ // the longest possible match instead of the leftmost-first match.
+ //
+ // N.B. The code duplication here is regrettable. Efforts to improve
+ // it without sacrificing performance are welcome. ---AG
+ debug_assert!(self.prog.is_reverse);
+ let mut result = Result::NoMatch(self.at);
+ let (mut prev_si, mut next_si) = (self.start, self.start);
+ let mut at = self.at;
+ while at > 0 {
+ while next_si <= STATE_MAX && at > 0 {
+ // SAFETY: relies on the starting `at` being <= text.len() (not checked here).
+ at -= 1;
+ prev_si = unsafe { self.next_si(next_si, text, at) };
+ if prev_si > STATE_MAX || at <= 4 { // too close to index 0 to keep unrolling
+ mem::swap(&mut prev_si, &mut next_si);
+ break;
+ }
+ at -= 1;
+ next_si = unsafe { self.next_si(prev_si, text, at) };
+ if next_si > STATE_MAX {
+ break;
+ }
+ at -= 1;
+ prev_si = unsafe { self.next_si(next_si, text, at) };
+ if prev_si > STATE_MAX {
+ mem::swap(&mut prev_si, &mut next_si);
+ break;
+ }
+ at -= 1;
+ next_si = unsafe { self.next_si(prev_si, text, at) };
+ }
+ if next_si & STATE_MATCH > 0 {
+ next_si &= !STATE_MATCH;
+ result = Result::Match(at + 1);
+ if self.quit_after_match {
+ return result;
+ }
+ self.last_match_si = next_si;
+ prev_si = next_si;
+ let cur = at;
+ while (next_si & !STATE_MATCH) == prev_si && at >= 2 {
+ // SAFETY: `at >= 2` above keeps the decremented index non-negative.
+ at -= 1;
+ next_si = unsafe {
+ self.next_si(next_si & !STATE_MATCH, text, at)
+ };
+ }
+ if at < cur {
+ result = Result::Match(at + 2);
+ }
+ } else if next_si >= STATE_UNKNOWN {
+ if next_si == STATE_QUIT {
+ return Result::Quit;
+ }
+ let byte = Byte::byte(text[at]); // the byte whose transition was not usable
+ prev_si &= STATE_MAX;
+ self.at = at; // record progress for the cache-flush heuristic in `clear_cache`
+ next_si = match self.next_state(qcur, qnext, prev_si, byte) {
+ None => return Result::Quit,
+ Some(STATE_DEAD) => return result.set_non_match(at),
+ Some(si) => si,
+ };
+ debug_assert!(next_si != STATE_UNKNOWN);
+ if next_si & STATE_MATCH > 0 {
+ next_si &= !STATE_MATCH;
+ result = Result::Match(at + 1);
+ if self.quit_after_match {
+ return result;
+ }
+ self.last_match_si = next_si;
+ }
+ prev_si = next_si;
+ } else {
+ prev_si = next_si;
+ }
+ }
+
+ // Run the DFA once more on the special EOF sentinel value.
+ prev_si = match self.next_state(qcur, qnext, prev_si, Byte::eof()) {
+ None => return Result::Quit,
+ Some(STATE_DEAD) => return result.set_non_match(0),
+ Some(si) => si,
+ };
+ debug_assert!(prev_si != STATE_UNKNOWN);
+ if prev_si & STATE_MATCH > 0 {
+ prev_si &= !STATE_MATCH;
+ self.last_match_si = prev_si;
+ result = Result::Match(0);
+ }
+ result
+ }
+
+ /// next_si transitions to the next state, where the transition input
+ /// corresponds to text[i].
+ ///
+ /// This elides bounds checks, and is therefore not safe: callers must guarantee `i < text.len()`.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ unsafe fn next_si(&self, si: StatePtr, text: &[u8], i: usize) -> StatePtr {
+ // What is the argument for safety here?
+ // We have three unchecked accesses that could possibly violate safety:
+ //
+ // 1. The given byte of input (`text[i]`).
+ // 2. The class of the byte of input (`classes[text[i]]`).
+ // 3. The transition for the class (`trans[si + cls]`).
+ //
+ // (1) is only safe when calling next_si is guarded by
+ // `i < text.len()`.
+ //
+ // (2) is the easiest case to guarantee since `text[i]` is always a
+ // `u8` and `self.prog.byte_classes` must have one entry per possible `u8` (256).
+ // (See `ByteClassSet.byte_classes` in `compile.rs`.)
+ //
+ // (3) is only safe if (1)+(2) are safe. Namely, the transitions
+ // of every state are defined to have length equal to the number of
+ // byte classes in the program. Therefore, a valid class leads to a
+ // valid transition. (All possible transitions are valid lookups, even
+ // if it points to a state that hasn't been computed yet.) (3) also
+ // relies on `si` being correct, but StatePtrs should only ever be
+ // retrieved from the transition table, which ensures they are correct.
+ debug_assert!(i < text.len());
+ let b = *text.get_unchecked(i);
+ debug_assert!((b as usize) < self.prog.byte_classes.len());
+ let cls = *self.prog.byte_classes.get_unchecked(b as usize);
+ self.cache.trans.next_unchecked(si, cls as usize)
+ }
+
+ /// Computes the next state given the current state and the current input
+ /// byte (which may be EOF).
+ ///
+ /// If STATE_DEAD is returned, then there is no valid state transition.
+ /// This implies that no permutation of future input can lead to a match
+ /// state.
+ ///
+ /// STATE_UNKNOWN can never be returned. `None` is returned if `cached_state` gives up.
+ fn exec_byte(
+ &mut self,
+ qcur: &mut SparseSet,
+ qnext: &mut SparseSet,
+ mut si: StatePtr,
+ b: Byte,
+ ) -> Option<StatePtr> {
+ use crate::prog::Inst::*;
+
+ // Initialize a queue with the current DFA state's NFA states.
+ qcur.clear();
+ for ip in self.state(si).inst_ptrs() {
+ qcur.insert(ip);
+ }
+
+ // Before inspecting the current byte, we may need to also inspect
+ // whether the position immediately preceding the current byte
+ // satisfies the empty assertions found in the current state.
+ //
+ // We only need to do this step if there are any empty assertions in
+ // the current state.
+ let is_word_last = self.state(si).flags().is_word();
+ let is_word = b.is_ascii_word();
+ if self.state(si).flags().has_empty() {
+ // Compute the flags immediately preceding the current byte.
+ // This means we only care about the "end" or "end line" flags.
+ // (The "start" flags are computed immediately following the
+ // current byte and are handled below.)
+ let mut flags = EmptyFlags::default();
+ if b.is_eof() {
+ flags.end = true;
+ flags.end_line = true;
+ } else if b.as_byte().map_or(false, |b| b == b'\n') {
+ flags.end_line = true;
+ }
+ if is_word_last == is_word { // same word-ness on both sides: not a boundary
+ flags.not_word_boundary = true;
+ } else {
+ flags.word_boundary = true;
+ }
+ // Now follow epsilon transitions from every NFA state, but make
+ // sure we only follow transitions that satisfy our flags.
+ qnext.clear();
+ for &ip in &*qcur {
+ self.follow_epsilons(usize_to_u32(ip), qnext, flags);
+ }
+ mem::swap(qcur, qnext);
+ }
+
+ // Now we set flags for immediately after the current byte. Since start
+ // states are processed separately, and are the only states that can
+ // have the StartText flag set, we therefore only need to worry about
+ // the StartLine flag here.
+ //
+ // We do also keep track of whether this DFA state contains a NFA state
+ // that is a matching state. This is precisely how we delay the DFA
+ // matching by one byte in order to process the special EOF sentinel
+ // byte. Namely, if this DFA state contains a matching NFA state,
+ // then it is the *next* DFA state that is marked as a match.
+ let mut empty_flags = EmptyFlags::default();
+ let mut state_flags = StateFlags::default();
+ empty_flags.start_line = b.as_byte().map_or(false, |b| b == b'\n');
+ if b.is_ascii_word() {
+ state_flags.set_word();
+ }
+ // Now follow all epsilon transitions again, but only after consuming
+ // the current byte.
+ qnext.clear();
+ for &ip in &*qcur {
+ match self.prog[ip as usize] {
+ // These states never happen in a byte-based program.
+ Char(_) | Ranges(_) => unreachable!(),
+ // These states are handled when following epsilon transitions.
+ Save(_) | Split(_) | EmptyLook(_) => {}
+ Match(_) => {
+ state_flags.set_match();
+ if !self.continue_past_first_match() {
+ break;
+ } else if self.prog.matches.len() > 1
+ && !qnext.contains(ip as usize)
+ {
+ // If we are continuing on to find other matches,
+ // then keep a record of the match states we've seen.
+ qnext.insert(ip);
+ }
+ }
+ Bytes(ref inst) => {
+ if b.as_byte().map_or(false, |b| inst.matches(b)) {
+ self.follow_epsilons(
+ inst.goto as InstPtr,
+ qnext,
+ empty_flags,
+ );
+ }
+ }
+ }
+ }
+
+ let cache = if b.is_eof() && self.prog.matches.len() > 1 {
+ // If we're processing the last byte of the input and we're
+ // matching a regex set, then make the next state contain the
+ // previous state's transitions. We do this so that the main
+ // matching loop can extract all of the match instructions.
+ mem::swap(qcur, qnext);
+ // And don't cache this state because it's totally bunk.
+ false
+ } else {
+ true
+ };
+
+ // We've now built up the set of NFA states that ought to comprise the
+ // next DFA state, so try to find it in the cache, and if it doesn't
+ // exist, cache it.
+ //
+ // N.B. We pass `&mut si` here because the cache may clear itself if
+ // it has gotten too full. When that happens, the location of the
+ // current state may change.
+ let mut next =
+ match self.cached_state(qnext, state_flags, Some(&mut si)) {
+ None => return None,
+ Some(next) => next,
+ };
+ if (self.start & !STATE_START) == next {
+ // Start states can never be match states since all matches are
+ // delayed by one byte.
+ debug_assert!(!self.state(next).flags().is_match());
+ next = self.start_ptr(next);
+ }
+ if next <= STATE_MAX && self.state(next).flags().is_match() {
+ next |= STATE_MATCH;
+ }
+ debug_assert!(next != STATE_UNKNOWN);
+ // And now store our state in the current state's next list.
+ if cache {
+ let cls = self.byte_class(b);
+ self.cache.trans.set_next(si, cls, next);
+ }
+ Some(next)
+ }
+
+ /// Follows the epsilon transitions starting at (and including) `ip`. The
+ /// resulting states are inserted into the ordered set `q`.
+ ///
+ /// Conditional epsilon transitions (i.e., empty width assertions) are only
+ /// followed if they are satisfied by the given flags, which should
+ /// represent the flags set at the current location in the input.
+ ///
+ /// If the current location corresponds to the empty string, then only the
+ /// end line and/or end text flags may be set. If the current location
+ /// corresponds to a real byte in the input, then only the start line
+ /// and/or start text flags may be set.
+ ///
+ /// As an exception to the above, when finding the initial state, any of
+ /// the above flags may be set:
+ ///
+ /// If matching starts at the beginning of the input, then start text and
+ /// start line should be set. If the input is empty, then end text and end
+ /// line should also be set.
+ ///
+ /// If matching starts after the beginning of the input, then only start
+ /// line should be set if the preceding byte is `\n`. End line should never
+ /// be set in this case. (Even if the following byte is a `\n`, it will
+ /// be handled in a subsequent DFA state.)
+ fn follow_epsilons(
+ &mut self,
+ ip: InstPtr,
+ q: &mut SparseSet,
+ flags: EmptyFlags,
+ ) {
+ use crate::prog::EmptyLook::*;
+ use crate::prog::Inst::*;
+
+ // We need to traverse the NFA to follow epsilon transitions, so avoid
+ // recursion with an explicit stack.
+ self.cache.stack.push(ip);
+ while let Some(mut ip) = self.cache.stack.pop() {
+ // Try to munch through as many states as possible without
+ // pushes/pops to the stack.
+ loop {
+ // Don't visit states we've already added.
+ if q.contains(ip as usize) {
+ break;
+ }
+ q.insert(ip as usize);
+ match self.prog[ip as usize] {
+ Char(_) | Ranges(_) => unreachable!(),
+ Match(_) | Bytes(_) => {
+ break;
+ }
+ EmptyLook(ref inst) => {
+ // Only follow empty assertion states if our flags
+ // satisfy the assertion; otherwise stop at this state.
+ match inst.look {
+ StartLine if flags.start_line => {
+ ip = inst.goto as InstPtr;
+ }
+ EndLine if flags.end_line => {
+ ip = inst.goto as InstPtr;
+ }
+ StartText if flags.start => {
+ ip = inst.goto as InstPtr;
+ }
+ EndText if flags.end => {
+ ip = inst.goto as InstPtr;
+ }
+ WordBoundaryAscii if flags.word_boundary => {
+ ip = inst.goto as InstPtr;
+ }
+ NotWordBoundaryAscii
+ if flags.not_word_boundary =>
+ {
+ ip = inst.goto as InstPtr;
+ }
+ WordBoundary if flags.word_boundary => { // same flag as the ASCII variant
+ ip = inst.goto as InstPtr;
+ }
+ NotWordBoundary if flags.not_word_boundary => { // same flag as the ASCII variant
+ ip = inst.goto as InstPtr;
+ }
+ StartLine | EndLine | StartText | EndText
+ | WordBoundaryAscii | NotWordBoundaryAscii
+ | WordBoundary | NotWordBoundary => {
+ break;
+ }
+ }
+ }
+ Save(ref inst) => {
+ ip = inst.goto as InstPtr;
+ }
+ Split(ref inst) => {
+ self.cache.stack.push(inst.goto2 as InstPtr); // explore the second branch later
+ ip = inst.goto1 as InstPtr;
+ }
+ }
+ }
+ }
+ }
+
+ /// Find a previously computed state matching the given set of instructions
+ /// and state flags.
+ ///
+ /// The given set of instructions should represent a single state in the
+ /// NFA along with all states reachable without consuming any input.
+ ///
+ /// The match flag in `state_flags` should be set if and only if the preceding DFA state
+ /// contains an NFA matching state. The cached state produced here will
+ /// then signify a match. (This enables us to delay a match by one byte,
+ /// in order to account for the EOF sentinel byte.)
+ ///
+ /// If the cache is full, then it is wiped before caching a new state; `None` is returned if the wipe is refused.
+ ///
+ /// The current state should be specified if it exists, since it will need
+ /// to be preserved if the cache clears itself. (Start states are
+ /// always saved, so they should not be passed here.) It takes a mutable
+ /// pointer to the index because if the cache is cleared, the state's
+ /// location may change.
+ fn cached_state(
+ &mut self,
+ q: &SparseSet,
+ mut state_flags: StateFlags,
+ current_state: Option<&mut StatePtr>,
+ ) -> Option<StatePtr> {
+ // If we couldn't come up with a non-empty key to represent this state,
+ // then it is dead and can never lead to a match.
+ //
+ // Note that `cached_state_key` also sets the "empty" bit of state_flags when q
+ // contains empty width assertions. exec_byte uses that bit to decide when
+ // we should follow epsilon transitions at the empty string preceding
+ // the current byte.
+ let key = match self.cached_state_key(q, &mut state_flags) {
+ None => return Some(STATE_DEAD),
+ Some(v) => v,
+ };
+ // In the cache? Cool. Done.
+ if let Some(si) = self.cache.compiled.get_ptr(&key) {
+ return Some(si);
+ }
+ // If the cache has gotten too big, wipe it.
+ if self.approximate_size() > self.prog.dfa_size_limit
+ && !self.clear_cache_and_save(current_state)
+ {
+ // Ooops. DFA is giving up.
+ return None;
+ }
+ // Allocate room for our state and add it.
+ self.add_state(key)
+ }
+
+ /// Produces a key suitable for describing a state in the DFA cache.
+ ///
+ /// The key invariant here is that equivalent keys are produced for any two
+ /// sets of ordered NFA states (and toggling of whether the previous NFA
+ /// states contain a match state) that do not discriminate a match for any
+ /// input.
+ ///
+ /// Specifically, q should be an ordered set of NFA states and the match flag
+ /// of `state_flags` should be set if and only if the previous NFA states contained a match
+ /// state. Returns `None` when the state is dead (no instructions kept and no match flag).
+ fn cached_state_key(
+ &mut self,
+ q: &SparseSet,
+ state_flags: &mut StateFlags,
+ ) -> Option<State> {
+ use crate::prog::Inst::*;
+
+ // We need to build up enough information to recognize pre-built states
+ // in the DFA. Generally speaking, this includes every instruction
+ // except for those which are purely epsilon transitions, e.g., the
+ // Save and Split instructions.
+ //
+ // Empty width assertions are also epsilon transitions, but since they
+ // are conditional, we need to make them part of a state's key in the
+ // cache.
+
+ let mut insts =
+ mem::replace(&mut self.cache.insts_scratch_space, vec![]); // borrow the reusable buffer
+ insts.clear();
+ // Reserve 1 byte for flags.
+ insts.push(0);
+
+ let mut prev = 0;
+ for &ip in q {
+ let ip = usize_to_u32(ip);
+ match self.prog[ip as usize] {
+ Char(_) | Ranges(_) => unreachable!(),
+ Save(_) | Split(_) => {}
+ Bytes(_) => push_inst_ptr(&mut insts, &mut prev, ip),
+ EmptyLook(_) => {
+ state_flags.set_empty();
+ push_inst_ptr(&mut insts, &mut prev, ip)
+ }
+ Match(_) => {
+ push_inst_ptr(&mut insts, &mut prev, ip);
+ if !self.continue_past_first_match() {
+ break;
+ }
+ }
+ }
+ }
+ // If we couldn't transition to any other instructions and we didn't
+ // see a match when expanding NFA states previously, then this is a
+ // dead state and no amount of additional input can transition out
+ // of this state.
+ let opt_state = if insts.len() == 1 && !state_flags.is_match() {
+ None
+ } else {
+ let StateFlags(f) = *state_flags;
+ insts[0] = f; // fill in the flag byte reserved above
+ Some(State { data: Arc::from(&*insts) })
+ };
+ self.cache.insts_scratch_space = insts; // hand the buffer back for reuse
+ opt_state
+ }
+
+ /// Clears the cache, but saves and restores current_state if it is not
+ /// none.
+ ///
+ /// The current state must be provided here in case its location in the
+ /// cache changes; on success `*current_state` is updated to the new location.
+ ///
+ /// This returns false if the cache is not cleared and the DFA should
+ /// give up. An already empty cache counts as success.
+ fn clear_cache_and_save(
+ &mut self,
+ current_state: Option<&mut StatePtr>,
+ ) -> bool {
+ if self.cache.compiled.is_empty() {
+ // Nothing to clear...
+ return true;
+ }
+ match current_state {
+ None => self.clear_cache(),
+ Some(si) => {
+ let cur = self.state(*si).clone(); // copy the state out before the wipe
+ if !self.clear_cache() {
+ return false;
+ }
+ // The unwrap is OK because we just cleared the cache and
+ // therefore know that the next state pointer won't exceed
+ // STATE_MAX.
+ *si = self.restore_state(cur).unwrap();
+ true
+ }
+ }
+ }
+
+ /// Wipes the state cache, but saves and restores the current start state
+ /// (and the last match state, if there is one).
+ /// This returns false if the cache is not cleared and the DFA should
+ /// give up.
+ fn clear_cache(&mut self) -> bool {
+ // Bail out of the DFA if we're moving too "slowly."
+ // A heuristic from RE2: assume the DFA is too slow if it is processing
+ // 10 or fewer bytes per state.
+ // Additionally, we permit the cache to be flushed a few times before
+ // calling it quits.
+ let nstates = self.cache.compiled.len();
+ if self.cache.flush_count >= 3
+ && self.at >= self.last_cache_flush
+ && (self.at - self.last_cache_flush) <= 10 * nstates
+ {
+ return false;
+ }
+ // Update statistics tracking cache flushes.
+ self.last_cache_flush = self.at;
+ self.cache.flush_count += 1;
+
+ // OK, actually flush the cache.
+ let start = self.state(self.start & !STATE_START).clone();
+ let last_match = if self.last_match_si <= STATE_MAX {
+ Some(self.state(self.last_match_si).clone())
+ } else {
+ None
+ };
+ self.cache.reset_size();
+ self.cache.trans.clear();
+ self.cache.compiled.clear();
+ for s in &mut self.cache.start_states {
+ *s = STATE_UNKNOWN;
+ }
+ // The unwraps are OK because we just cleared the cache and therefore
+ // know that the next state pointer won't exceed STATE_MAX.
+ let start_ptr = self.restore_state(start).unwrap();
+ self.start = self.start_ptr(start_ptr);
+ if let Some(last_match) = last_match {
+ self.last_match_si = self.restore_state(last_match).unwrap();
+ }
+ true
+ }
+
+ /// Restores the given state back into the cache, and returns a pointer
+ /// to it. `None` is returned only if `add_state` fails.
+ fn restore_state(&mut self, state: State) -> Option<StatePtr> {
+ // If we've already stored this state, just return a pointer to it.
+ // No one will be the wiser.
+ if let Some(si) = self.cache.compiled.get_ptr(&state) {
+ return Some(si);
+ }
+ self.add_state(state)
+ }
+
+ /// Returns the next state given the current state si and current byte
+ /// b. {qcur,qnext} are used as scratch space for storing ordered NFA
+ /// states.
+ ///
+ /// This tries to fetch the next state from the cache, but if that fails,
+ /// it computes the next state, caches it and returns a pointer to it.
+ ///
+ /// The pointer can be to a real state, or it can be STATE_DEAD.
+ /// STATE_UNKNOWN cannot be returned.
+ ///
+ /// None is returned if a new state could not be allocated (i.e., the DFA
+ /// ran out of space and thinks it's running too slowly), or if the cached transition is STATE_QUIT.
+ fn next_state(
+ &mut self,
+ qcur: &mut SparseSet,
+ qnext: &mut SparseSet,
+ si: StatePtr,
+ b: Byte,
+ ) -> Option<StatePtr> {
+ if si == STATE_DEAD {
+ return Some(STATE_DEAD); // a dead state stays dead whatever the input
+ }
+ match self.cache.trans.next(si, self.byte_class(b)) {
+ STATE_UNKNOWN => self.exec_byte(qcur, qnext, si, b), // not computed yet: take the slow path
+ STATE_QUIT => None,
+ nsi => Some(nsi),
+ }
+ }
+
+ /// Computes and returns the start state for a search whose starting
+ /// position is described by the given flags. If the state has already been computed,
+ /// then it is pulled from the cache. If the state hasn't been cached,
+ /// then it is computed, cached and a pointer to it is returned.
+ ///
+ /// This may return STATE_DEAD but never STATE_UNKNOWN. `None` means the DFA gave up.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn start_state(
+ &mut self,
+ q: &mut SparseSet,
+ empty_flags: EmptyFlags,
+ state_flags: StateFlags,
+ ) -> Option<StatePtr> {
+ // Compute an index into our cache of start states based on the set
+ // of empty/state flags set at the current position in the input. We
+ // don't use every flag since not all flags matter. For example, since
+ // matches are delayed by one byte, start states can never be match
+ // states.
+ let flagi = {
+ (((empty_flags.start as u8) << 0)
+ | ((empty_flags.end as u8) << 1)
+ | ((empty_flags.start_line as u8) << 2)
+ | ((empty_flags.end_line as u8) << 3)
+ | ((empty_flags.word_boundary as u8) << 4)
+ | ((empty_flags.not_word_boundary as u8) << 5)
+ | ((state_flags.is_word() as u8) << 6)) as usize
+ };
+ match self.cache.start_states[flagi] {
+ STATE_UNKNOWN => {} // not computed yet; fall through and build it
+ si => return Some(si),
+ }
+ q.clear();
+ let start = usize_to_u32(self.prog.start);
+ self.follow_epsilons(start, q, empty_flags);
+ // Start states can never be match states because we delay every match
+ // by one byte. Given an empty string and an empty match, the match
+ // won't actually occur until the DFA processes the special EOF
+ // sentinel byte.
+ let sp = match self.cached_state(q, state_flags, None) {
+ None => return None,
+ Some(sp) => self.start_ptr(sp),
+ };
+ self.cache.start_states[flagi] = sp;
+ Some(sp)
+ }
+
+ /// Derives the empty-width and state flags that hold at offset `at` of
+ /// `text` for a forward search.
+ ///
+ /// This should only be used when executing the DFA forwards over the
+ /// input.
+ fn start_flags(&self, text: &[u8], at: usize) -> (EmptyFlags, StateFlags) {
+     let mut empty_flags = EmptyFlags::default();
+     let mut state_flags = StateFlags::default();
+     let at_start = at == 0;
+     let text_empty = text.is_empty();
+
+     empty_flags.start = at_start;
+     empty_flags.end = text_empty;
+     empty_flags.start_line = at_start || text[at - 1] == b'\n';
+     empty_flags.end_line = text_empty;
+
+     // Word-ness of the byte just before `at` and of the byte at `at`.
+     let prev_word = !at_start && Byte::byte(text[at - 1]).is_ascii_word();
+     let next_word = at < text.len() && Byte::byte(text[at]).is_ascii_word();
+     if prev_word {
+         state_flags.set_word();
+     }
+     if prev_word != next_word {
+         empty_flags.word_boundary = true;
+     } else {
+         empty_flags.not_word_boundary = true;
+     }
+     (empty_flags, state_flags)
+ }
+
+ /// Derives the empty-width and state flags that hold at offset `at` of
+ /// `text` for a reverse search.
+ ///
+ /// This should only be used when executing the DFA in reverse over the
+ /// input.
+ fn start_flags_reverse(
+     &self,
+     text: &[u8],
+     at: usize,
+ ) -> (EmptyFlags, StateFlags) {
+     let mut empty_flags = EmptyFlags::default();
+     let mut state_flags = StateFlags::default();
+     let at_end = at == text.len();
+     let text_empty = text.is_empty();
+
+     empty_flags.start = at_end;
+     empty_flags.end = text_empty;
+     empty_flags.start_line = at_end || text[at] == b'\n';
+     empty_flags.end_line = text_empty;
+
+     // Scanning backwards, the byte at `at` was "seen last" and the byte
+     // at `at - 1` is the one about to be read.
+     let last_word = at < text.len() && Byte::byte(text[at]).is_ascii_word();
+     let next_word = at > 0 && Byte::byte(text[at - 1]).is_ascii_word();
+     if last_word {
+         state_flags.set_word();
+     }
+     if last_word != next_word {
+         empty_flags.word_boundary = true;
+     } else {
+         empty_flags.not_word_boundary = true;
+     }
+     (empty_flags, state_flags)
+ }
+
+ /// Looks up the compiled state that `si` points to.
+ ///
+ /// Panics if the state map has no entry for `si`.
+ fn state(&self, si: StatePtr) -> &State {
+     let found = self.cache.compiled.get_state(si);
+     found.unwrap()
+ }
+
+ /// Registers `state` with the DFA and returns its pointer.
+ ///
+ /// Room for the state's outgoing transitions is reserved in
+ /// self.cache.trans; they can be filled in through the returned
+ /// StatePtr.
+ ///
+ /// If None is returned, then the state limit was reached and the DFA
+ /// should quit.
+ fn add_state(&mut self, state: State) -> Option<StatePtr> {
+     // Allocation fails once the next state pointer would exceed
+     // STATE_MAX. In practice the cache limit keeps us from getting
+     // there, unless the caller configured a ridiculous cache size.
+     let si = self.cache.trans.add()?;
+     // With a Unicode word boundary in the program, every transition on a
+     // non-ASCII byte is set to STATE_QUIT: if the DFA stumbles over one
+     // it quits and an alternative matching engine takes over.
+     if self.prog.has_unicode_word_boundary {
+         for b in 0x80u8..=0xFF {
+             let cls = self.byte_class(Byte::byte(b));
+             self.cache.trans.set_next(si, cls, STATE_QUIT);
+         }
+     }
+     // Account for the memory of the new state, then store it and index
+     // it so it can be found again later.
+     let added = self.cache.trans.state_heap_size()
+         + state.data.len()
+         + (2 * mem::size_of::<State>())
+         + mem::size_of::<StatePtr>();
+     self.cache.size += added;
+     self.cache.compiled.insert(state, si);
+     // The transition table and the state map must stay in sync.
+     debug_assert!(
+         self.cache.compiled.len() == self.cache.trans.num_states()
+     );
+     Some(si)
+ }
+
+ /// Scans `text` from offset `at` for the next occurrence of one of the
+ /// regex's literal prefixes and returns its absolute start offset.
+ ///
+ /// Returns None when the prefix searcher finds nothing in `text[at..]`.
+ ///
+ /// This should only be called when the DFA is in a start state.
+ fn prefix_at(&self, text: &[u8], at: usize) -> Option<usize> {
+     let haystack = &text[at..];
+     let (s, _) = self.prog.prefixes.find(haystack)?;
+     // `s` is relative to the slice; translate it back to `text`.
+     Some(at + s)
+ }
+
+ /// Number of byte classes needed to tell apart the transitions of each
+ /// state, including one class for the EOF sentinel.
+ ///
+ /// invariant: num_byte_classes() == len(State.next)
+ fn num_byte_classes(&self) -> usize {
+     let highest_class = self.prog.byte_classes[255] as usize;
+     // One to turn the highest class index into a count, and one more for
+     // the special EOF byte.
+     highest_class + 1 + 1
+ }
+
+ /// Maps an input byte, or the special EOF sentinel, to its byte class.
+ ///
+ /// The EOF sentinel always gets the last class.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn byte_class(&self, b: Byte) -> usize {
+     if let Some(byte) = b.as_byte() {
+         self.u8_class(byte)
+     } else {
+         self.num_byte_classes() - 1
+     }
+ }
+
+ /// Same as byte_class, but for a plain u8 (so never the EOF sentinel).
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn u8_class(&self, b: u8) -> usize {
+     let idx = b as usize;
+     self.prog.byte_classes[idx] as usize
+ }
+
+ /// Reports whether the search must go on after the first match.
+ ///
+ /// The DFA preserves leftmost-first semantics by not following NFA
+ /// transitions once the first match has been seen. That is not wanted
+ /// when looking for the longest match (reverse search) or for every
+ /// possible match (regex sets).
+ fn continue_past_first_match(&self) -> bool {
+     if self.prog.is_reverse {
+         return true;
+     }
+     self.prog.matches.len() > 1
+ }
+
+ /// Reports whether there is a literal prefix worth scanning for: the
+ /// program must run forwards, have prefixes and not be anchored at the
+ /// start.
+ fn has_prefix(&self) -> bool {
+     let unusable = self.prog.is_reverse
+         || self.prog.prefixes.is_empty()
+         || self.prog.is_anchored_start;
+     !unusable
+ }
+
+ /// Tags `si` with the STATE_START bit if and only if there is a prefix
+ /// to scan for.
+ ///
+ /// Without a prefix, treating the start state specially would be a
+ /// waste, so the pointer is returned unchanged.
+ fn start_ptr(&self, si: StatePtr) -> StatePtr {
+     if !self.has_prefix() {
+         return si;
+     }
+     si | STATE_START
+ }
+
+ /// Approximate heap space currently used by the DFA: the running cache
+ /// size plus the approximate size of the program.
+ ///
+ /// It is used to decide whether the DFA's state cache needs to be wiped.
+ /// For certain regexes on certain inputs a new state could be created
+ /// for every byte of input, which is bad for memory use, hence the
+ /// bound.
+ fn approximate_size(&self) -> usize {
+     let cache_bytes = self.cache.size;
+     let prog_bytes = self.prog.approximate_size();
+     cache_bytes + prog_bytes
+ }
+}
+
+/// An abstraction for representing a map of states. The map supports two
+/// different ways of state lookup. One is fast constant time access via a
+/// state pointer. The other is a hashmap lookup based on the DFA's
+/// constituent NFA states.
+///
+/// A DFA state internally uses an Arc such that we only need to store the
+/// set of NFA states on the heap once, even though we support looking up
+/// states by two different means. A more natural way to express this might
+/// use raw pointers, but an Arc is safe and effectively achieves the same
+/// thing.
+#[derive(Debug)]
+struct StateMap {
+ /// Lookup from a state to its pointer. Each key is a clone of the
+ /// corresponding entry in `states` (see `insert`); both collections are
+ /// always filled and cleared together.
+ map: HashMap<State, StatePtr>,
+ /// Our set of states. Note that `StatePtr / num_byte_classes` indexes
+ /// this Vec rather than just a `StatePtr`.
+ states: Vec<State>,
+ /// The number of byte classes in the DFA. Used to index `states`.
+ num_byte_classes: usize,
+}
+
+impl StateMap {
+ /// Creates an empty map for a DFA with `num_byte_classes` byte classes.
+ fn new(num_byte_classes: usize) -> StateMap {
+     let map = HashMap::new();
+     let states = Vec::new();
+     StateMap { map, states, num_byte_classes }
+ }
+
+ /// Number of states stored.
+ fn len(&self) -> usize {
+     self.states.len()
+ }
+
+ /// True when no state is stored.
+ fn is_empty(&self) -> bool {
+     self.states.is_empty()
+ }
+
+ /// Finds the pointer under which `state` was inserted, if any.
+ fn get_ptr(&self, state: &State) -> Option<StatePtr> {
+     self.map.get(state).copied()
+ }
+
+ /// Finds the state for pointer `si`, if any. The pointer is divided by
+ /// the number of byte classes to obtain the index into `states`.
+ fn get_state(&self, si: StatePtr) -> Option<&State> {
+     let idx = si as usize / self.num_byte_classes;
+     self.states.get(idx)
+ }
+
+ /// Stores `state` under pointer `si`, making it reachable through both
+ /// lookups.
+ fn insert(&mut self, state: State, si: StatePtr) {
+     let key = state.clone();
+     self.states.push(state);
+     self.map.insert(key, si);
+ }
+
+ /// Removes every state.
+ fn clear(&mut self) {
+     self.states.clear();
+     self.map.clear();
+ }
+}
+
+impl Transitions {
+ /// Creates an empty transition table.
+ ///
+ /// `num_byte_classes` is the stride of the table: every state owns that
+ /// many consecutive transition slots.
+ fn new(num_byte_classes: usize) -> Transitions {
+     let table = Vec::new();
+     Transitions { table, num_byte_classes }
+ }
+
+ /// Number of states currently held by the table.
+ fn num_states(&self) -> usize {
+     self.table.len() / self.num_byte_classes
+ }
+
+ /// Reserves a row for one more state and returns a pointer to it.
+ ///
+ /// Every slot of the new row starts out as STATE_UNKNOWN. None is
+ /// returned when there is no room for another state.
+ fn add(&mut self) -> Option<StatePtr> {
+     let si = self.table.len();
+     if si > STATE_MAX as usize {
+         return None;
+     }
+     let row = repeat(STATE_UNKNOWN).take(self.num_byte_classes);
+     self.table.extend(row);
+     Some(usize_to_u32(si))
+ }
+
+ /// Drops every state from the table.
+ fn clear(&mut self) {
+     self.table.clear();
+ }
+
+ /// Records `next` as the transition out of `si` on byte class `cls`.
+ fn set_next(&mut self, si: StatePtr, cls: usize, next: StatePtr) {
+     let slot = si as usize + cls;
+     self.table[slot] = next;
+ }
+
+ /// Reads the transition out of `si` on byte class `cls`.
+ fn next(&self, si: StatePtr, cls: usize) -> StatePtr {
+     let slot = si as usize + cls;
+     self.table[slot]
+ }
+
+ /// Heap size, in bytes, of one state's row in the transition table.
+ fn state_heap_size(&self) -> usize {
+     mem::size_of::<StatePtr>() * self.num_byte_classes
+ }
+
+ /// Like `next`, but without bounds checking.
+ ///
+ /// # Safety
+ ///
+ /// `si as usize + cls` must be a valid index into the table; this is
+ /// only verified by debug assertions.
+ unsafe fn next_unchecked(&self, si: StatePtr, cls: usize) -> StatePtr {
+     let base = si as usize;
+     debug_assert!(base < self.table.len());
+     debug_assert!(cls < self.num_byte_classes);
+     *self.table.get_unchecked(base + cls)
+ }
+}
+
+impl StateFlags {
+ /// Whether the match flag (bit 0) is set.
+ fn is_match(&self) -> bool {
+     (self.0 & (1 << 0)) != 0
+ }
+
+ /// Sets the match flag (bit 0).
+ fn set_match(&mut self) {
+     self.0 |= 1 << 0;
+ }
+
+ /// Whether the word flag (bit 1) is set.
+ fn is_word(&self) -> bool {
+     (self.0 & (1 << 1)) != 0
+ }
+
+ /// Sets the word flag (bit 1).
+ fn set_word(&mut self) {
+     self.0 |= 1 << 1;
+ }
+
+ /// Whether the empty flag (bit 2) is set.
+ fn has_empty(&self) -> bool {
+     (self.0 & (1 << 2)) != 0
+ }
+
+ /// Sets the empty flag (bit 2).
+ fn set_empty(&mut self) {
+     self.0 |= 1 << 2;
+ }
+}
+
+impl Byte {
+ /// Wraps an ordinary input byte.
+ fn byte(b: u8) -> Self {
+     Byte(u16::from(b))
+ }
+
+ /// The end-of-input sentinel, encoded as 256 (one past any u8).
+ fn eof() -> Self {
+     Byte(256)
+ }
+
+ /// Whether this is the end-of-input sentinel.
+ fn is_eof(&self) -> bool {
+     self.0 == 256
+ }
+
+ /// Whether this is an ASCII word byte (letter, digit or underscore).
+ /// The sentinel is never a word byte.
+ fn is_ascii_word(&self) -> bool {
+     match self.as_byte() {
+         Some(b) => b == b'_' || b.is_ascii_alphanumeric(),
+         None => false,
+     }
+ }
+
+ /// The wrapped byte, or None for the end-of-input sentinel.
+ fn as_byte(&self) -> Option<u8> {
+     if !self.is_eof() {
+         Some(self.0 as u8)
+     } else {
+         None
+     }
+ }
+}
+
+/// Shows a state as its flags plus the decoded list of instruction
+/// pointers.
+impl fmt::Debug for State {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+     let insts = self.inst_ptrs().collect::<Vec<usize>>();
+     let flags = self.flags();
+     let mut out = f.debug_struct("State");
+     out.field("flags", &flags);
+     out.field("insts", &insts);
+     out.finish()
+ }
+}
+
+/// Shows the table as a map from state index to that state's row.
+impl fmt::Debug for Transitions {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+     let mut fmtd = f.debug_map();
+     // Each state owns exactly `num_byte_classes` consecutive slots.
+     let rows = self.table.chunks_exact(self.num_byte_classes);
+     for (si, row) in rows.enumerate() {
+         fmtd.entry(&si.to_string(), &TransitionsRow(row));
+     }
+     fmtd.finish()
+ }
+}
+
+/// Debug-only view of one state's row of the transition table.
+struct TransitionsRow<'a>(&'a [StatePtr]);
+
+/// Shows the row as a map from byte class to target state. Transitions
+/// that are still unknown are left out.
+impl<'a> fmt::Debug for TransitionsRow<'a> {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+     let mut fmtd = f.debug_map();
+     for (cls, &target) in self.0.iter().enumerate() {
+         if target == STATE_UNKNOWN {
+             continue;
+         }
+         if target == STATE_DEAD {
+             fmtd.entry(&vb(cls), &"DEAD");
+         } else {
+             fmtd.entry(&vb(cls), &target.to_string());
+         }
+     }
+     fmtd.finish()
+ }
+}
+
+/// Shows the three flags by name instead of the raw bits.
+impl fmt::Debug for StateFlags {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+     let mut out = f.debug_struct("StateFlags");
+     out.field("is_match", &self.is_match());
+     out.field("is_word", &self.is_word());
+     out.field("has_empty", &self.has_empty());
+     out.finish()
+ }
+}
+
+/// Renders a byte value as a readable escaped string. Anything above
+/// `u8::MAX` is rendered as "EOF".
+fn vb(b: usize) -> String {
+ if b <= ::std::u8::MAX as usize {
+     let escaped: Vec<u8> = ::std::ascii::escape_default(b as u8).collect();
+     String::from_utf8_lossy(&escaped).into_owned()
+ } else {
+     "EOF".to_owned()
+ }
+}
+
+/// Converts `n` to a u32.
+///
+/// Panics if `n` does not fit; that is treated as a bug.
+fn usize_to_u32(n: usize) -> u32 {
+ let limit = ::std::u32::MAX as u64;
+ if n as u64 > limit {
+     panic!("BUG: {} is too big to fit into u32", n)
+ }
+ n as u32
+}
+
+/// Formats a state pointer as its STATE_MAX-masked value followed by a
+/// tag for every special value or flag bit that applies.
+#[allow(dead_code)] // useful for debugging
+fn show_state_ptr(si: StatePtr) -> String {
+ let mut s = format!("{:?}", si & STATE_MAX);
+ // Tags are appended in this fixed order.
+ let tags = [
+     (si == STATE_UNKNOWN, " (unknown)"),
+     (si == STATE_DEAD, " (dead)"),
+     (si == STATE_QUIT, " (quit)"),
+     (si & STATE_START > 0, " (start)"),
+     (si & STATE_MATCH > 0, " (match)"),
+ ];
+ for &(applies, tag) in tags.iter() {
+     if applies {
+         s.push_str(tag);
+     }
+ }
+ s
+}
+
+/// Appends `n` to `data` as a zigzag-encoded varint.
+///
+/// https://developers.google.com/protocol-buffers/docs/encoding#varints
+fn write_vari32(data: &mut Vec<u8>, n: i32) {
+ let shifted = (n as u32) << 1;
+ // Zigzag: negative numbers get all bits of the shifted value flipped.
+ let zigzag = if n < 0 { !shifted } else { shifted };
+ write_varu32(data, zigzag)
+}
+
+/// Decodes a zigzag-encoded varint from the front of `data`, returning
+/// the value and the number of bytes read.
+///
+/// https://developers.google.com/protocol-buffers/docs/encoding#varints
+fn read_vari32(data: &[u8]) -> (i32, usize) {
+ let (un, nread) = read_varu32(data);
+ let magnitude = (un >> 1) as i32;
+ // The lowest bit carries the sign.
+ let n = if un & 1 != 0 { !magnitude } else { magnitude };
+ (n, nread)
+}
+
+/// Appends `n` to `data` as a varint: seven bits per byte, lowest bits
+/// first, with the high bit set on every byte except the last.
+///
+/// https://developers.google.com/protocol-buffers/docs/encoding#varints
+fn write_varu32(data: &mut Vec<u8>, mut n: u32) {
+ loop {
+     if n < 0b1000_0000 {
+         data.push(n as u8);
+         return;
+     }
+     data.push((n as u8) | 0b1000_0000);
+     n >>= 7;
+ }
+}
+
+/// Decodes a varint from the front of `data`, returning the value and
+/// the number of bytes read.
+///
+/// Returns `(0, 0)` if `data` ends before a terminating byte (one with
+/// the high bit clear) is found.
+///
+/// https://developers.google.com/protocol-buffers/docs/encoding#varints
+fn read_varu32(data: &[u8]) -> (u32, usize) {
+ let mut n: u32 = 0;
+ let mut shift: u32 = 0;
+ for (i, &b) in data.iter().enumerate() {
+     let payload = (b as u32) & 0b0111_1111;
+     n |= payload << shift;
+     if b & 0b1000_0000 == 0 {
+         return (n, i + 1);
+     }
+     shift += 7;
+ }
+ (0, 0)
+}
+
+#[cfg(test)]
+mod tests {
+
+ use super::{
+     push_inst_ptr, read_vari32, read_varu32, write_vari32, write_varu32,
+     State, StateFlags,
+ };
+ use quickcheck::{quickcheck, Gen, QuickCheck};
+ use std::sync::Arc;
+
+ /// Encoding instruction pointers into a state and decoding them again
+ /// yields the original pointers and flags.
+ #[test]
+ fn prop_state_encode_decode() {
+     fn p(mut ips: Vec<u32>, flags: u8) -> bool {
+         // It looks like our encoding scheme can't handle instruction
+         // pointers at or above 2**31. We should fix that, but it seems
+         // unlikely to occur in real code due to the amount of memory
+         // required for such a state machine. So for now, we just clamp
+         // our test data.
+         const MAX_IP: u32 = (1 << 31) - 1;
+         for ip in ips.iter_mut() {
+             *ip = ::std::cmp::min(*ip, MAX_IP);
+         }
+
+         let mut data = vec![flags];
+         let mut prev = 0;
+         for &ip in &ips {
+             push_inst_ptr(&mut data, &mut prev, ip);
+         }
+         let state = State { data: Arc::from(&data[..]) };
+
+         let got: Vec<usize> = state.inst_ptrs().collect();
+         let expected: Vec<usize> =
+             ips.iter().map(|&ip| ip as usize).collect();
+         state.flags() == StateFlags(flags) && expected == got
+     }
+     QuickCheck::new()
+         .gen(Gen::new(10_000))
+         .quickcheck(p as fn(Vec<u32>, u8) -> bool);
+ }
+
+ /// Unsigned varints round-trip and consume the whole buffer.
+ #[test]
+ fn prop_read_write_u32() {
+     fn p(n: u32) -> bool {
+         let mut buf = Vec::new();
+         write_varu32(&mut buf, n);
+         let (got, nread) = read_varu32(&buf);
+         got == n && nread == buf.len()
+     }
+     quickcheck(p as fn(u32) -> bool);
+ }
+
+ /// Signed varints round-trip and consume the whole buffer.
+ #[test]
+ fn prop_read_write_i32() {
+     fn p(n: i32) -> bool {
+         let mut buf = Vec::new();
+         write_vari32(&mut buf, n);
+         let (got, nread) = read_vari32(&buf);
+         got == n && nread == buf.len()
+     }
+     quickcheck(p as fn(i32) -> bool);
+ }
+}
diff --git a/third_party/rust/regex/src/error.rs b/third_party/rust/regex/src/error.rs
new file mode 100644
index 0000000000..3e0ec75210
--- /dev/null
+++ b/third_party/rust/regex/src/error.rs
@@ -0,0 +1,71 @@
+use std::fmt;
+use std::iter::repeat;
+
+/// An error that occurred during parsing or compiling a regular expression.
+#[derive(Clone, PartialEq)]
+pub enum Error {
+ /// A syntax error. The string is the error message.
+ Syntax(String),
+ /// The compiled program exceeded the set size limit.
+ /// The argument is the size limit imposed, in bytes.
+ CompiledTooBig(usize),
+ /// Hints that destructuring should not be exhaustive.
+ ///
+ /// This enum may grow additional variants, so this makes sure clients
+ /// don't count on exhaustive matching. (Otherwise, adding a new variant
+ /// could break existing code.)
+ #[doc(hidden)]
+ __Nonexhaustive,
+}
+
+impl ::std::error::Error for Error {
+ // TODO: Remove this method entirely on the next breaking semver release.
+ #[allow(deprecated)]
+ fn description(&self) -> &str {
+     match self {
+         Error::CompiledTooBig(_) => "compiled program too big",
+         Error::Syntax(err) => err,
+         // This variant only exists to discourage exhaustive matching.
+         Error::__Nonexhaustive => unreachable!(),
+     }
+ }
+}
+
+impl fmt::Display for Error {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ match *self {
+ Error::Syntax(ref err) => err.fmt(f),
+ Error::CompiledTooBig(limit) => write!(
+ f,
+ "Compiled regex exceeds size limit of {} bytes.",
+ limit
+ ),
+ Error::__Nonexhaustive => unreachable!(),
+ }
+ }
+}
+
+// Debug is implemented by hand so that `Regex::new(...).unwrap()` shows
+// syntax errors nicely. It's a little weird, but the `Syntax` variant
+// already stores a `String` anyway, so it might as well be formatted
+// readably: the message goes on its own lines between two rulers.
+impl fmt::Debug for Error {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+     match self {
+         Error::Syntax(err) => {
+             let hr: String = repeat('~').take(79).collect();
+             write!(f, "Syntax(\n{}\n{}\n{}\n)", hr, err, hr)
+         }
+         Error::CompiledTooBig(limit) => {
+             f.debug_tuple("CompiledTooBig").field(limit).finish()
+         }
+         Error::__Nonexhaustive => {
+             f.debug_tuple("__Nonexhaustive").finish()
+         }
+     }
+ }
+}
diff --git a/third_party/rust/regex/src/exec.rs b/third_party/rust/regex/src/exec.rs
new file mode 100644
index 0000000000..e75ca083a0
--- /dev/null
+++ b/third_party/rust/regex/src/exec.rs
@@ -0,0 +1,1655 @@
+use std::cell::RefCell;
+use std::collections::HashMap;
+use std::panic::AssertUnwindSafe;
+use std::sync::Arc;
+
+#[cfg(feature = "perf-literal")]
+use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind};
+use regex_syntax::hir::literal::Literals;
+use regex_syntax::hir::Hir;
+use regex_syntax::ParserBuilder;
+
+use crate::backtrack;
+use crate::compile::Compiler;
+#[cfg(feature = "perf-dfa")]
+use crate::dfa;
+use crate::error::Error;
+use crate::input::{ByteInput, CharInput};
+use crate::literal::LiteralSearcher;
+use crate::pikevm;
+use crate::pool::{Pool, PoolGuard};
+use crate::prog::Program;
+use crate::re_builder::RegexOptions;
+use crate::re_bytes;
+use crate::re_set;
+use crate::re_trait::{Locations, RegularExpression, Slot};
+use crate::re_unicode;
+use crate::utf8::next_utf8;
+
+/// `Exec` manages the execution of a regular expression.
+///
+/// In particular, this manages the various compiled forms of a single regular
+/// expression and the choice of which matching engine to use to execute a
+/// regular expression.
+#[derive(Debug)]
+pub struct Exec {
+ /// All read only state.
+ ro: Arc<ExecReadOnly>,
+ /// A pool of reusable values for the various matching engines.
+ ///
+ /// Note that boxing this value is not strictly necessary, but it is an
+ /// easy way to ensure that T does not bloat the stack size used by a pool
+ /// in the case where T is big. And this turns out to be the case at the
+ /// time of writing for regex's use of this pool. At the time of writing,
+ /// the size of a Regex on the stack is 856 bytes. Boxing this value
+ /// reduces that size to 16 bytes.
+ pool: Box<Pool<ProgramCache>>,
+}
+
+/// `ExecNoSync` is like `Exec`, except it embeds a reference to a cache. This
+/// means it is no longer Sync, but we can now avoid the overhead of
+/// synchronization to fetch the cache.
+#[derive(Debug)]
+pub struct ExecNoSync<'c> {
+ /// All read only state, borrowed for the lifetime `'c`.
+ ro: &'c Arc<ExecReadOnly>,
+ /// Caches for the various matching engines, held through a pool guard.
+ cache: PoolGuard<'c, ProgramCache>,
+}
+
+/// `ExecNoSyncStr` is like `ExecNoSync`, but matches on &str instead of &[u8].
+/// It wraps an `ExecNoSync` and hands it the bytes of the string.
+#[derive(Debug)]
+pub struct ExecNoSyncStr<'c>(ExecNoSync<'c>);
+
+/// `ExecReadOnly` comprises all read only state for a regex. Namely, all such
+/// state is determined at compile time and never changes during search.
+#[derive(Debug)]
+struct ExecReadOnly {
+ /// The original regular expressions given by the caller to compile.
+ res: Vec<String>,
+ /// A compiled program that is used in the NFA simulation and backtracking.
+ /// It can be byte-based or Unicode codepoint based.
+ ///
+ /// N.B. It is not possible to make this byte-based from the public API.
+ /// It is only used for testing byte based programs in the NFA simulations.
+ nfa: Program,
+ /// A compiled byte based program for DFA execution. This is only used
+ /// if a DFA can be executed. (Currently, only word boundary assertions are
+ /// not supported.) Note that this program contains an embedded `.*?`
+ /// preceding the first capture group, unless the regex is anchored at the
+ /// beginning.
+ dfa: Program,
+ /// The same as above, except the program is reversed (and there is no
+ /// preceding `.*?`). This is used by the DFA to find the starting location
+ /// of matches.
+ dfa_reverse: Program,
+ /// A set of suffix literals extracted from the regex.
+ ///
+ /// Prefix literals are stored on the `Program`, since they are used inside
+ /// the matching engines.
+ suffixes: LiteralSearcher,
+ /// An Aho-Corasick automaton with leftmost-first match semantics.
+ ///
+ /// This is only set when the entire regex is a simple unanchored
+ /// alternation of literals. We could probably use it in more
+ /// circumstances, but this is already hacky enough in this architecture.
+ ///
+ /// N.B. We use u32 as a state ID representation under the assumption that
+ /// if we were to exhaust the ID space, we probably would have long
+ /// surpassed the compilation size limit.
+ #[cfg(feature = "perf-literal")]
+ ac: Option<AhoCorasick<u32>>,
+ /// match_type encodes as much upfront knowledge about how we're going to
+ /// execute a search as possible.
+ match_type: MatchType,
+}
+
+/// Facilitates the construction of an executor by exposing various knobs
+/// to control how a regex is executed and what kinds of resources it's
+/// permitted to use.
+// `ExecBuilder` is only public via the `internal` module, so avoid deriving
+// `Debug`.
+#[allow(missing_debug_implementations)]
+pub struct ExecBuilder {
+ // The patterns to compile plus the parser and size-limit settings.
+ options: RegexOptions,
+ // Engine forced by the caller; `None` means choose automatically.
+ match_type: Option<MatchType>,
+ // Whether the NFA engines should get a byte based program.
+ bytes: bool,
+ // Whether compiled programs may only match valid UTF-8.
+ only_utf8: bool,
+}
+
+/// Parsed represents a set of parsed regular expressions and their detected
+/// literals.
+struct Parsed {
+ // One parsed expression per pattern, in the order the patterns were given.
+ exprs: Vec<Hir>,
+ // Prefix literals collected over all expressions; empty when prefix
+ // optimizations were disabled during parsing.
+ prefixes: Literals,
+ // Suffix literals collected over all expressions; empty when suffix
+ // optimizations were disabled during parsing.
+ suffixes: Literals,
+ // True when at least one expression is not always valid UTF-8.
+ bytes: bool,
+}
+
+impl ExecBuilder {
+ /// Starts building an executor for the single pattern `re`.
+ ///
+ /// Everything except the pattern gets its default setting. Call the
+ /// other methods to turn further knobs and finish with `build` to
+ /// actually create the executor.
+ pub fn new(re: &str) -> Self {
+     Self::new_many(::std::iter::once(re))
+ }
+
+ /// Like new, but compiles the union of the given regular expressions.
+ ///
+ /// Note that capture groups are completely unsupported once two or more
+ /// regular expressions are compiled together. (This means both `find`
+ /// and `captures` won't work.)
+ pub fn new_many<I, S>(res: I) -> Self
+ where
+     S: AsRef<str>,
+     I: IntoIterator<Item = S>,
+ {
+     let mut opts = RegexOptions::default();
+     opts.pats.clear();
+     opts.pats.extend(res.into_iter().map(|s| s.as_ref().to_owned()));
+     Self::new_options(opts)
+ }
+
+ /// Starts building an executor from a complete set of options.
+ ///
+ /// The engine is chosen automatically, the NFA engines are not forced
+ /// to byte based programs and only valid UTF-8 may be matched.
+ pub fn new_options(opts: RegexOptions) -> Self {
+     Self {
+         bytes: false,
+         only_utf8: true,
+         match_type: None,
+         options: opts,
+     }
+ }
+
+ /// Lets the matching engine be determined automatically.
+ ///
+ /// This is the default state and will apply whatever optimizations are
+ /// possible, such as running a DFA.
+ ///
+ /// This overrides whatever was previously set via the `nfa` or
+ /// `bounded_backtracking` methods.
+ pub fn automatic(self) -> Self {
+     ExecBuilder { match_type: None, ..self }
+ }
+
+ /// Forces the NFA algorithm (the Pike VM), no matter what optimizations
+ /// are possible.
+ ///
+ /// This overrides whatever was previously set via the `automatic` or
+ /// `bounded_backtracking` methods.
+ pub fn nfa(self) -> Self {
+     let forced = MatchType::Nfa(MatchNfaType::PikeVM);
+     ExecBuilder { match_type: Some(forced), ..self }
+ }
+
+ /// Forces the bounded backtracking engine, no matter what optimizations
+ /// are possible.
+ ///
+ /// One must use this with care, since the bounded backtracking engine
+ /// uses memory proportional to `len(regex) * len(text)`.
+ ///
+ /// This overrides whatever was previously set via the `automatic` or
+ /// `nfa` methods.
+ pub fn bounded_backtracking(self) -> Self {
+     let forced = MatchType::Nfa(MatchNfaType::Backtrack);
+     ExecBuilder { match_type: Some(forced), ..self }
+ }
+
+ /// Chooses whether the NFA matching engines get byte based programs.
+ ///
+ /// By default, the NFA engines match on Unicode scalar values. They can
+ /// be made to use byte based programs instead. In general, the byte
+ /// based programs are slower because of a less efficient encoding of
+ /// character classes.
+ ///
+ /// Note that this does not impact DFA matching engines, which always
+ /// execute on bytes.
+ pub fn bytes(self, yes: bool) -> Self {
+     ExecBuilder { bytes: yes, ..self }
+ }
+
+ /// Chooses whether compiled programs are restricted to valid UTF-8.
+ ///
+ /// When enabled (the default), all compiled programs exclusively match
+ /// valid UTF-8 bytes. When disabled, the program compiled may match
+ /// arbitrary bytes.
+ pub fn only_utf8(self, yes: bool) -> Self {
+     ExecBuilder { only_utf8: yes, ..self }
+ }
+
+ /// Sets the Unicode flag of the underlying regex options.
+ pub fn unicode(self, yes: bool) -> Self {
+     let mut builder = self;
+     builder.options.unicode = yes;
+     builder
+ }
+
+ /// Parses every configured pattern and collects the prefix and suffix
+ /// literals of the resulting expressions.
+ ///
+ /// Literals are only collected when the `perf-literal` feature is
+ /// enabled. Fails with `Error::Syntax` on the first pattern that does
+ /// not parse.
+ fn parse(&self) -> Result<Parsed, Error> {
+     let opts = &self.options;
+     let is_set = opts.pats.len() > 1;
+     let mut exprs = Vec::with_capacity(opts.pats.len());
+     // `None` means literal optimizations were disabled for that side.
+     let mut prefixes = Some(Literals::empty());
+     let mut suffixes = Some(Literals::empty());
+     let mut bytes = false;
+
+     for pat in &opts.pats {
+         let mut parser = ParserBuilder::new()
+             .octal(opts.octal)
+             .case_insensitive(opts.case_insensitive)
+             .multi_line(opts.multi_line)
+             .dot_matches_new_line(opts.dot_matches_new_line)
+             .swap_greed(opts.swap_greed)
+             .ignore_whitespace(opts.ignore_whitespace)
+             .unicode(opts.unicode)
+             .allow_invalid_utf8(!self.only_utf8)
+             .nest_limit(opts.nest_limit)
+             .build();
+         let expr = match parser.parse(pat) {
+             Ok(expr) => expr,
+             Err(err) => return Err(Error::Syntax(err.to_string())),
+         };
+         if !expr.is_always_utf8() {
+             bytes = true;
+         }
+
+         if cfg!(feature = "perf-literal") {
+             // Partial anchors make it hard to use prefixes, and regex
+             // sets with anchors do not go well with literal
+             // optimizations; disable prefixes in both cases.
+             let partial_start =
+                 !expr.is_anchored_start() && expr.is_any_anchored_start();
+             if partial_start || (is_set && expr.is_anchored_start()) {
+                 prefixes = None;
+             }
+             prefixes = match prefixes {
+                 Some(mut lits) => {
+                     if lits.union_prefixes(&expr) {
+                         Some(lits)
+                     } else {
+                         None
+                     }
+                 }
+                 None => None,
+             };
+
+             // The same reasoning applies to suffixes and end anchors.
+             let partial_end =
+                 !expr.is_anchored_end() && expr.is_any_anchored_end();
+             if partial_end || (is_set && expr.is_anchored_end()) {
+                 suffixes = None;
+             }
+             suffixes = match suffixes {
+                 Some(mut lits) => {
+                     if lits.union_suffixes(&expr) {
+                         Some(lits)
+                     } else {
+                         None
+                     }
+                 }
+                 None => None,
+             };
+         }
+         exprs.push(expr);
+     }
+
+     // Disabled literal sets are reported as empty ones.
+     let prefixes = prefixes.unwrap_or_else(Literals::empty);
+     let suffixes = suffixes.unwrap_or_else(Literals::empty);
+     Ok(Parsed { exprs, prefixes, suffixes, bytes })
+ }
+
+ /// Builds an executor that can run the configured regular expression(s).
+ ///
+ /// Three programs are compiled from the parsed patterns: one for the NFA
+ /// engines, a forward one for the DFA and a reversed one for the DFA.
+ /// Fails if a pattern does not parse or if compilation fails.
+ pub fn build(self) -> Result<Exec, Error> {
+     // Special case when we have no patterns to compile.
+     // This can happen when compiling a regex set.
+     if self.options.pats.is_empty() {
+         let empty = ExecReadOnly {
+             res: vec![],
+             nfa: Program::new(),
+             dfa: Program::new(),
+             dfa_reverse: Program::new(),
+             suffixes: LiteralSearcher::empty(),
+             #[cfg(feature = "perf-literal")]
+             ac: None,
+             match_type: MatchType::Nothing,
+         };
+         let ro = Arc::new(empty);
+         let pool = ExecReadOnly::new_pool(&ro);
+         return Ok(Exec { ro, pool });
+     }
+
+     let parsed = self.parse()?;
+     let size_limit = self.options.size_limit;
+     let dfa_size_limit = self.options.dfa_size_limit;
+     let only_utf8 = self.only_utf8;
+
+     let mut nfa = Compiler::new()
+         .size_limit(size_limit)
+         .bytes(self.bytes || parsed.bytes)
+         .only_utf8(only_utf8)
+         .compile(&parsed.exprs)?;
+     let mut dfa = Compiler::new()
+         .size_limit(size_limit)
+         .dfa(true)
+         .only_utf8(only_utf8)
+         .compile(&parsed.exprs)?;
+     let mut dfa_reverse = Compiler::new()
+         .size_limit(size_limit)
+         .dfa(true)
+         .only_utf8(only_utf8)
+         .reverse(true)
+         .compile(&parsed.exprs)?;
+
+     // This borrows all of `parsed`, so it has to happen before the
+     // literals are moved out of it below.
+     #[cfg(feature = "perf-literal")]
+     let ac = self.build_aho_corasick(&parsed);
+
+     // The NFA and the forward DFA share the same prefix searcher.
+     let prefixes = LiteralSearcher::prefixes(parsed.prefixes);
+     dfa.prefixes = prefixes.clone();
+     nfa.prefixes = prefixes;
+     dfa.dfa_size_limit = dfa_size_limit;
+     dfa_reverse.dfa_size_limit = dfa_size_limit;
+
+     let mut ro = ExecReadOnly {
+         res: self.options.pats,
+         nfa,
+         dfa,
+         dfa_reverse,
+         suffixes: LiteralSearcher::suffixes(parsed.suffixes),
+         #[cfg(feature = "perf-literal")]
+         ac,
+         // Placeholder: the real choice needs the assembled state.
+         match_type: MatchType::Nothing,
+     };
+     ro.match_type = ro.choose_match_type(self.match_type);
+
+     let ro = Arc::new(ro);
+     let pool = ExecReadOnly::new_pool(&ro);
+     Ok(Exec { ro, pool })
+ }
+
+ /// Builds an Aho-Corasick automaton when the regex is a single pattern
+ /// for which `alternation_literals` yields more than 32 literals.
+ /// Returns `None` in every other case.
+ #[cfg(feature = "perf-literal")]
+ fn build_aho_corasick(&self, parsed: &Parsed) -> Option<AhoCorasick<u32>> {
+     if parsed.exprs.len() != 1 {
+         return None;
+     }
+     let lits = alternation_literals(&parsed.exprs[0])?;
+     // If we have a small number of literals, then let Teddy handle
+     // things (see literal/mod.rs).
+     if lits.len() <= 32 {
+         return None;
+     }
+     let automaton = AhoCorasickBuilder::new()
+         .match_kind(MatchKind::LeftmostFirst)
+         .auto_configure(&lits)
+         .build_with_size::<u32, _, _>(&lits)
+         // This should never happen because we'd long exceed the
+         // compilation limit for regexes first.
+         .expect("AC automaton too big");
+     Some(automaton)
+ }
+}
+
+/// String flavored searching: every search is delegated to the wrapped
+/// byte based executor, run over the bytes of the string.
+impl<'c> RegularExpression for ExecNoSyncStr<'c> {
+ type Text = str;
+
+ fn slots_len(&self) -> usize {
+     let inner = &self.0;
+     inner.slots_len()
+ }
+
+ /// Computes the position following `i` with `next_utf8`, rather than
+ /// simply adding one as the byte based executor does.
+ fn next_after_empty(&self, text: &str, i: usize) -> usize {
+     let bytes = text.as_bytes();
+     next_utf8(bytes, i)
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn shortest_match_at(&self, text: &str, start: usize) -> Option<usize> {
+     let bytes = text.as_bytes();
+     self.0.shortest_match_at(bytes, start)
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn is_match_at(&self, text: &str, start: usize) -> bool {
+     let bytes = text.as_bytes();
+     self.0.is_match_at(bytes, start)
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn find_at(&self, text: &str, start: usize) -> Option<(usize, usize)> {
+     let bytes = text.as_bytes();
+     self.0.find_at(bytes, start)
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn captures_read_at(
+     &self,
+     locs: &mut Locations,
+     text: &str,
+     start: usize,
+ ) -> Option<(usize, usize)> {
+     let bytes = text.as_bytes();
+     self.0.captures_read_at(locs, bytes, start)
+ }
+}
+
+impl<'c> RegularExpression for ExecNoSync<'c> {
+ type Text = [u8];
+
+ /// Number of capture slots in the regular expression: two per capture
+ /// group, for the possibly empty start and end locations of the
+ /// capture.
+ fn slots_len(&self) -> usize {
+     let groups = self.ro.nfa.captures.len();
+     2 * groups
+ }
+
+ /// Returns the position following `i`. The text is a byte slice here,
+ /// so this is always one byte further; the text itself is not looked at.
+ fn next_after_empty(&self, _text: &[u8], i: usize) -> usize {
+ i + 1
+ }
+
+ /// Returns the end of a match location, possibly occurring before the
+ /// end location of the correct leftmost-first match.
+ ///
+ /// The search starts at offset `start` of `text`. `None` is returned
+ /// when there is no match. Whenever a DFA based engine quits, the search
+ /// is redone with an NFA engine.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn shortest_match_at(&self, text: &[u8], start: usize) -> Option<usize> {
+     if !self.is_anchor_end_match(text) {
+         return None;
+     }
+     match self.ro.match_type {
+         #[cfg(feature = "perf-literal")]
+         MatchType::Literal(ty) => {
+             self.find_literals(ty, text, start).map(|(_, e)| e)
+         }
+         #[cfg(feature = "perf-dfa")]
+         MatchType::Dfa | MatchType::DfaMany => {
+             match self.shortest_dfa(text, start) {
+                 dfa::Result::Match(end) => Some(end),
+                 dfa::Result::NoMatch(_) => None,
+                 dfa::Result::Quit => self.shortest_nfa(text, start),
+             }
+         }
+         #[cfg(feature = "perf-dfa")]
+         MatchType::DfaAnchoredReverse => {
+             // The reverse DFA only sees `text[start..]`, so the offset
+             // it starts from has to be relative to that slice: its end
+             // is at `text.len() - start`. Passing `text.len()` would
+             // point past the end of the slice whenever `start > 0`.
+             match dfa::Fsm::reverse(
+                 &self.ro.dfa_reverse,
+                 self.cache.value(),
+                 true,
+                 &text[start..],
+                 text.len() - start,
+             ) {
+                 dfa::Result::Match(_) => Some(text.len()),
+                 dfa::Result::NoMatch(_) => None,
+                 dfa::Result::Quit => self.shortest_nfa(text, start),
+             }
+         }
+         #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
+         MatchType::DfaSuffix => {
+             match self.shortest_dfa_reverse_suffix(text, start) {
+                 dfa::Result::Match(e) => Some(e),
+                 dfa::Result::NoMatch(_) => None,
+                 dfa::Result::Quit => self.shortest_nfa(text, start),
+             }
+         }
+         MatchType::Nfa(ty) => self.shortest_nfa_type(ty, text, start),
+         MatchType::Nothing => None,
+     }
+ }
+
+ /// Returns true if and only if the regex matches text.
+ ///
+ /// For single regular expressions, this is equivalent to calling
+ /// shortest_match(...).is_some().
+ ///
+ /// The engine is selected by the precomputed `match_type`. Whenever a
+ /// DFA-based strategy reports `Quit`, the search is retried with an NFA
+ /// engine via `match_nfa`.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn is_match_at(&self, text: &[u8], start: usize) -> bool {
+ // Cheap rejection before running any engine.
+ if !self.is_anchor_end_match(text) {
+ return false;
+ }
+ // We need to do this dance because shortest_match relies on the NFA
+ // filling in captures[1], but a RegexSet has no captures. In other
+ // words, a RegexSet can't (currently) use shortest_match. ---AG
+ match self.ro.match_type {
+ #[cfg(feature = "perf-literal")]
+ MatchType::Literal(ty) => {
+ self.find_literals(ty, text, start).is_some()
+ }
+ #[cfg(feature = "perf-dfa")]
+ MatchType::Dfa | MatchType::DfaMany => {
+ match self.shortest_dfa(text, start) {
+ dfa::Result::Match(_) => true,
+ dfa::Result::NoMatch(_) => false,
+ dfa::Result::Quit => self.match_nfa(text, start),
+ }
+ }
+ #[cfg(feature = "perf-dfa")]
+ MatchType::DfaAnchoredReverse => {
+ // Scan backwards over `text[start..]`, beginning at `text.len()`.
+ match dfa::Fsm::reverse(
+ &self.ro.dfa_reverse,
+ self.cache.value(),
+ true,
+ &text[start..],
+ text.len(),
+ ) {
+ dfa::Result::Match(_) => true,
+ dfa::Result::NoMatch(_) => false,
+ dfa::Result::Quit => self.match_nfa(text, start),
+ }
+ }
+ #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
+ MatchType::DfaSuffix => {
+ match self.shortest_dfa_reverse_suffix(text, start) {
+ dfa::Result::Match(_) => true,
+ dfa::Result::NoMatch(_) => false,
+ dfa::Result::Quit => self.match_nfa(text, start),
+ }
+ }
+ MatchType::Nfa(ty) => self.match_nfa_type(ty, text, start),
+ MatchType::Nothing => false,
+ }
+ }
+
+ /// Locates the leftmost-first match, searching from `start`, and returns
+ /// its start and end offsets.
+ ///
+ /// Any DFA strategy that reports `Quit` is retried with an automatically
+ /// chosen NFA engine.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn find_at(&self, text: &[u8], start: usize) -> Option<(usize, usize)> {
+     if !self.is_anchor_end_match(text) {
+         return None;
+     }
+     match self.ro.match_type {
+         MatchType::Nothing => None,
+         MatchType::Nfa(ty) => self.find_nfa(ty, text, start),
+         #[cfg(feature = "perf-literal")]
+         MatchType::Literal(ty) => self.find_literals(ty, text, start),
+         #[cfg(feature = "perf-dfa")]
+         MatchType::DfaMany => {
+             unreachable!("BUG: RegexSet cannot be used with find")
+         }
+         #[cfg(feature = "perf-dfa")]
+         MatchType::Dfa => match self.find_dfa_forward(text, start) {
+             dfa::Result::Quit => {
+                 self.find_nfa(MatchNfaType::Auto, text, start)
+             }
+             dfa::Result::NoMatch(_) => None,
+             dfa::Result::Match(span) => Some(span),
+         },
+         #[cfg(feature = "perf-dfa")]
+         MatchType::DfaAnchoredReverse => {
+             match self.find_dfa_anchored_reverse(text, start) {
+                 dfa::Result::Quit => {
+                     self.find_nfa(MatchNfaType::Auto, text, start)
+                 }
+                 dfa::Result::NoMatch(_) => None,
+                 dfa::Result::Match(span) => Some(span),
+             }
+         }
+         #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
+         MatchType::DfaSuffix => {
+             match self.find_dfa_reverse_suffix(text, start) {
+                 dfa::Result::Quit => {
+                     self.find_nfa(MatchNfaType::Auto, text, start)
+                 }
+                 dfa::Result::NoMatch(_) => None,
+                 dfa::Result::Match(span) => Some(span),
+             }
+         }
+     }
+ }
+
+ /// Locates the leftmost-first match and records the offsets of every
+ /// participating capture group in `locs`.
+ ///
+ /// The number of capture slots given should be equal to the total number
+ /// of capture slots in the compiled program.
+ ///
+ /// The first two slots always hold the start and end of the overall
+ /// match. All slots are reset to `None` before searching.
+ fn captures_read_at(
+     &self,
+     locs: &mut Locations,
+     text: &[u8],
+     start: usize,
+ ) -> Option<(usize, usize)> {
+     let slots = locs.as_slots();
+     slots.iter_mut().for_each(|slot| *slot = None);
+     // Callers that ask for zero or two slots don't need capture
+     // resolution at all, so serve them with the cheaper `find_at`.
+     if slots.is_empty() {
+         return self.find_at(text, start);
+     }
+     if slots.len() == 2 {
+         let (s, e) = self.find_at(text, start)?;
+         slots[0] = Some(s);
+         slots[1] = Some(e);
+         return Some((s, e));
+     }
+     if !self.is_anchor_end_match(text) {
+         return None;
+     }
+     match self.ro.match_type {
+         MatchType::Nothing => None,
+         MatchType::Nfa(ty) => {
+             self.captures_nfa_type(ty, slots, text, start, text.len())
+         }
+         #[cfg(feature = "perf-dfa")]
+         MatchType::DfaMany => {
+             unreachable!("BUG: RegexSet cannot be used with captures")
+         }
+         #[cfg(feature = "perf-literal")]
+         MatchType::Literal(ty) => {
+             match self.find_literals(ty, text, start) {
+                 None => None,
+                 // Re-run an NFA over just the matched span to resolve
+                 // the capture groups.
+                 Some((s, e)) => self.captures_nfa_type(
+                     MatchNfaType::Auto,
+                     slots,
+                     text,
+                     s,
+                     e,
+                 ),
+             }
+         }
+         #[cfg(feature = "perf-dfa")]
+         MatchType::Dfa if self.ro.nfa.is_anchored_start => {
+             self.captures_nfa(slots, text, start)
+         }
+         #[cfg(feature = "perf-dfa")]
+         MatchType::Dfa => match self.find_dfa_forward(text, start) {
+             dfa::Result::Quit => self.captures_nfa(slots, text, start),
+             dfa::Result::NoMatch(_) => None,
+             dfa::Result::Match((s, e)) => self.captures_nfa_type(
+                 MatchNfaType::Auto,
+                 slots,
+                 text,
+                 s,
+                 e,
+             ),
+         },
+         #[cfg(feature = "perf-dfa")]
+         MatchType::DfaAnchoredReverse => {
+             match self.find_dfa_anchored_reverse(text, start) {
+                 dfa::Result::Quit => self.captures_nfa(slots, text, start),
+                 dfa::Result::NoMatch(_) => None,
+                 dfa::Result::Match((s, e)) => self.captures_nfa_type(
+                     MatchNfaType::Auto,
+                     slots,
+                     text,
+                     s,
+                     e,
+                 ),
+             }
+         }
+         #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
+         MatchType::DfaSuffix => {
+             match self.find_dfa_reverse_suffix(text, start) {
+                 dfa::Result::Quit => self.captures_nfa(slots, text, start),
+                 dfa::Result::NoMatch(_) => None,
+                 dfa::Result::Match((s, e)) => self.captures_nfa_type(
+                     MatchNfaType::Auto,
+                     slots,
+                     text,
+                     s,
+                     e,
+                 ),
+             }
+         }
+     }
+ }
+}
+
+impl<'c> ExecNoSync<'c> {
+ /// Locates the leftmost-first match by literal search alone.
+ ///
+ /// The literal searchers run on `text[start..]`, so the offsets they
+ /// report are shifted back by `start` before being returned.
+ #[cfg(feature = "perf-literal")]
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn find_literals(
+     &self,
+     ty: MatchLiteralType,
+     text: &[u8],
+     start: usize,
+ ) -> Option<(usize, usize)> {
+     use self::MatchLiteralType::*;
+     // Translates offsets relative to `text[start..]` into offsets
+     // relative to `text`.
+     let shift = |(s, e): (usize, usize)| (start + s, start + e);
+     match ty {
+         Unanchored => {
+             self.ro.nfa.prefixes.find(&text[start..]).map(shift)
+         }
+         // A start-anchored program can only match at offset zero.
+         AnchoredStart if start != 0 && self.ro.nfa.is_anchored_start => {
+             None
+         }
+         AnchoredStart => {
+             self.ro.nfa.prefixes.find_start(&text[start..]).map(shift)
+         }
+         AnchoredEnd => {
+             self.ro.suffixes.find_end(&text[start..]).map(shift)
+         }
+         AhoCorasick => {
+             let automaton = self.ro.ac.as_ref().unwrap();
+             automaton
+                 .find(&text[start..])
+                 .map(|m| (start + m.start(), start + m.end()))
+         }
+     }
+ }
+
+ /// Locates the leftmost-first match (start and end) with the DFA only.
+ ///
+ /// A forward scan finds the end of the match; a reverse scan from that
+ /// end then finds where the match starts. A `Quit` result means the DFA
+ /// gave up and another matching engine should be used.
+ #[cfg(feature = "perf-dfa")]
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn find_dfa_forward(
+     &self,
+     text: &[u8],
+     start: usize,
+ ) -> dfa::Result<(usize, usize)> {
+     use crate::dfa::Result::*;
+     let forward = dfa::Fsm::forward(
+         &self.ro.dfa,
+         self.cache.value(),
+         false,
+         text,
+         start,
+     );
+     let end = match forward {
+         // An empty match at `start` needs no reverse scan.
+         Match(end) if start == end => return Match((start, start)),
+         Match(end) => end,
+         NoMatch(i) => return NoMatch(i),
+         Quit => return Quit,
+     };
+     // Walk backwards from the match end to discover the match start.
+     let backward = dfa::Fsm::reverse(
+         &self.ro.dfa_reverse,
+         self.cache.value(),
+         false,
+         &text[start..],
+         end - start,
+     );
+     match backward {
+         Quit => Quit,
+         NoMatch(i) => NoMatch(i),
+         Match(s) => Match((start + s, end)),
+     }
+ }
+
+ /// Locates the leftmost-first match (start and end) with the DFA only,
+ /// for a regex assumed to be anchored at the end: the scan begins at the
+ /// end of the haystack and runs in reverse.
+ ///
+ /// A `Quit` result means the DFA gave up and another matching engine
+ /// should be used.
+ #[cfg(feature = "perf-dfa")]
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn find_dfa_anchored_reverse(
+     &self,
+     text: &[u8],
+     start: usize,
+ ) -> dfa::Result<(usize, usize)> {
+     use crate::dfa::Result::*;
+     let backward = dfa::Fsm::reverse(
+         &self.ro.dfa_reverse,
+         self.cache.value(),
+         false,
+         &text[start..],
+         text.len() - start,
+     );
+     match backward {
+         Quit => Quit,
+         NoMatch(i) => NoMatch(i),
+         // The match always ends at the end of the haystack.
+         Match(s) => Match((start + s, text.len())),
+     }
+ }
+
+ /// Finds the end of the shortest match using only the DFA.
+ ///
+ /// Runs the forward DFA over `text` from `start`, passing `true` for the
+ /// flag that the other forward call sites in this impl pass as `false`.
+ // NOTE(review): that flag presumably means "stop at the first match" --
+ // confirm against `dfa::Fsm::forward`.
+ #[cfg(feature = "perf-dfa")]
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn shortest_dfa(&self, text: &[u8], start: usize) -> dfa::Result<usize> {
+ dfa::Fsm::forward(&self.ro.dfa, self.cache.value(), true, text, start)
+ }
+
+ /// Finds the end of the shortest match with the DFA only, driven by a
+ /// scan for suffix literals.
+ ///
+ /// When the suffix optimization gives up, a plain forward DFA scan is
+ /// used instead.
+ #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn shortest_dfa_reverse_suffix(
+     &self,
+     text: &[u8],
+     start: usize,
+ ) -> dfa::Result<usize> {
+     if let Some(result) = self.exec_dfa_reverse_suffix(text, start) {
+         // Only the end offset is of interest here.
+         return result.map(|(_, end)| end);
+     }
+     self.shortest_dfa(text, start)
+ }
+
+ /// Finds the end of the shortest match using only the DFA by scanning for
+ /// suffix literals. It also reports the start of the match.
+ ///
+ /// Note that if None is returned, then the optimization gave up to avoid
+ /// worst case quadratic behavior. A forward scanning DFA should be tried
+ /// next.
+ ///
+ /// If a match is returned and the full leftmost-first match is desired,
+ /// then a forward scan starting from the beginning of the match must be
+ /// done.
+ ///
+ /// If the result returned indicates that the DFA quit, then another
+ /// matching engine should be used.
+ #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn exec_dfa_reverse_suffix(
+ &self,
+ text: &[u8],
+ original_start: usize,
+ ) -> Option<dfa::Result<(usize, usize)>> {
+ use crate::dfa::Result::*;
+
+ // The literal searched for; the loop below relies on it being non-empty.
+ let lcs = self.ro.suffixes.lcs();
+ debug_assert!(lcs.len() >= 1);
+ // `start` is the left bound of the reverse scan, `last_literal` is where
+ // the next literal search begins, and `end` is the position just past
+ // the most recently found literal.
+ let mut start = original_start;
+ let mut end = start;
+ let mut last_literal = start;
+ while end <= text.len() {
+ // Jump to the next occurrence of the literal; if there is none, no
+ // match is reported.
+ last_literal += match lcs.find(&text[last_literal..]) {
+ None => return Some(NoMatch(text.len())),
+ Some(i) => i,
+ };
+ end = last_literal + lcs.len();
+ // Scan backwards from the end of the literal towards `start`.
+ match dfa::Fsm::reverse(
+ &self.ro.dfa_reverse,
+ self.cache.value(),
+ false,
+ &text[start..end],
+ end - start,
+ ) {
+ // A result at offset 0 of the scanned slice makes this routine
+ // give up (see the doc comment above on the `None` return).
+ Match(0) | NoMatch(0) => return None,
+ Match(i) => return Some(Match((start + i, end))),
+ NoMatch(i) => {
+ // Advance the left bound by the reported offset and resume
+ // the literal search one byte further along.
+ start += i;
+ last_literal += 1;
+ continue;
+ }
+ Quit => return Some(Quit),
+ };
+ }
+ Some(NoMatch(text.len()))
+ }
+
+ /// Locates the leftmost-first match (start and end) with the DFA only,
+ /// driven by a scan for suffix literals.
+ ///
+ /// A `Quit` result means the DFA gave up and another matching engine
+ /// should be used.
+ #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn find_dfa_reverse_suffix(
+     &self,
+     text: &[u8],
+     start: usize,
+ ) -> dfa::Result<(usize, usize)> {
+     use crate::dfa::Result::*;
+
+     let match_start = match self.exec_dfa_reverse_suffix(text, start) {
+         Some(Match((begin, _))) => begin,
+         Some(other) => return other,
+         // The suffix optimization gave up; use the plain forward DFA.
+         None => return self.find_dfa_forward(text, start),
+     };
+     // A match exists at this point, so the only way to leave without
+     // one is for the DFA to give up (seems unlikely).
+     //
+     // The suffix literal only pins down the earliest possible end, which
+     // may lie before the end of the leftmost-first match, so scan
+     // forwards from the match start to find the proper end.
+     let forward = dfa::Fsm::forward(
+         &self.ro.dfa,
+         self.cache.value(),
+         false,
+         text,
+         match_start,
+     );
+     match forward {
+         Match(e) => Match((match_start, e)),
+         Quit => Quit,
+         NoMatch(_) => panic!("BUG: reverse match implies forward match"),
+     }
+ }
+
+ /// Executes the NFA engine to return whether there is a match or not.
+ ///
+ /// Ideally, we could use shortest_nfa(...).is_some() and get the same
+ /// performance characteristics, but regex sets don't have captures, which
+ /// shortest_nfa depends on.
+ ///
+ /// The concrete NFA engine is chosen automatically (`MatchNfaType::Auto`).
+ #[cfg(feature = "perf-dfa")]
+ fn match_nfa(&self, text: &[u8], start: usize) -> bool {
+ self.match_nfa_type(MatchNfaType::Auto, text, start)
+ }
+
+ /// Same as `match_nfa`, except the caller picks the NFA engine.
+ ///
+ /// No capture slots are supplied, and the engine is told it may stop as
+ /// soon as any match is found.
+ fn match_nfa_type(
+     &self,
+     ty: MatchNfaType,
+     text: &[u8],
+     start: usize,
+ ) -> bool {
+     let quit_after_match = true;
+     let quit_after_match_with_pos = false;
+     let end = text.len();
+     self.exec_nfa(
+         ty,
+         &mut [false],
+         &mut [],
+         quit_after_match,
+         quit_after_match_with_pos,
+         text,
+         start,
+         end,
+     )
+ }
+
+ /// Finds the shortest match using an NFA.
+ ///
+ /// The concrete NFA engine is chosen automatically (`MatchNfaType::Auto`).
+ #[cfg(feature = "perf-dfa")]
+ fn shortest_nfa(&self, text: &[u8], start: usize) -> Option<usize> {
+ self.shortest_nfa_type(MatchNfaType::Auto, text, start)
+ }
+
+ /// Same as `shortest_nfa`, except the caller picks the NFA engine.
+ ///
+ /// Returns the content of the end-of-match slot when the engine reports
+ /// a match, and `None` otherwise.
+ fn shortest_nfa_type(
+     &self,
+     ty: MatchNfaType,
+     text: &[u8],
+     start: usize,
+ ) -> Option<usize> {
+     let mut slots = [None, None];
+     let matched = self.exec_nfa(
+         ty,
+         &mut [false],
+         &mut slots,
+         true,
+         true,
+         text,
+         start,
+         text.len(),
+     );
+     if !matched {
+         return None;
+     }
+     slots[1]
+ }
+
+ /// Same as `find_at`, except the search is always performed by an NFA
+ /// engine.
+ ///
+ /// This is a capture search restricted to the two slots of the overall
+ /// match, run over the whole haystack.
+ fn find_nfa(
+     &self,
+     ty: MatchNfaType,
+     text: &[u8],
+     start: usize,
+ ) -> Option<(usize, usize)> {
+     let mut overall = [None, None];
+     self.captures_nfa_type(ty, &mut overall, text, start, text.len())
+ }
+
+ /// Same as `find_nfa`, except capture slots are filled in as well.
+ ///
+ /// `slots` should have length equal to `2 * nfa.captures.len()`. The NFA
+ /// engine is chosen automatically and the search runs to the end of
+ /// `text`.
+ #[cfg(feature = "perf-dfa")]
+ fn captures_nfa(
+     &self,
+     slots: &mut [Slot],
+     text: &[u8],
+     start: usize,
+ ) -> Option<(usize, usize)> {
+     let end = text.len();
+     self.captures_nfa_type(MatchNfaType::Auto, slots, text, start, end)
+ }
+
+ /// Same as `captures_nfa`, except the caller picks the NFA engine and the
+ /// `end` bound handed to it.
+ ///
+ /// Returns the overall match span read back from the first two slots,
+ /// or `None` when there is no match or either slot is unset.
+ fn captures_nfa_type(
+     &self,
+     ty: MatchNfaType,
+     slots: &mut [Slot],
+     text: &[u8],
+     start: usize,
+     end: usize,
+ ) -> Option<(usize, usize)> {
+     let matched = self.exec_nfa(
+         ty,
+         &mut [false],
+         slots,
+         false,
+         false,
+         text,
+         start,
+         end,
+     );
+     if !matched {
+         return None;
+     }
+     if let (Some(s), Some(e)) = (slots[0], slots[1]) {
+         Some((s, e))
+     } else {
+         None
+     }
+ }
+
+ /// Runs one of the two NFA engines and reports whether a match was found.
+ ///
+ /// `Auto` is resolved to bounded backtracking when
+ /// `backtrack::should_exec` approves of the program and haystack sizes,
+ /// and to the Pike VM otherwise.
+ fn exec_nfa(
+     &self,
+     mut ty: MatchNfaType,
+     matches: &mut [bool],
+     slots: &mut [Slot],
+     quit_after_match: bool,
+     quit_after_match_with_pos: bool,
+     text: &[u8],
+     start: usize,
+     end: usize,
+ ) -> bool {
+     use self::MatchNfaType::*;
+     ty = match ty {
+         Auto if backtrack::should_exec(self.ro.nfa.len(), text.len()) => {
+             Backtrack
+         }
+         Auto => PikeVM,
+         explicit => explicit,
+     };
+     // The backtracker, as implemented today, cannot report the position
+     // of the shortest match. So when `shortest_match` ends up needing an
+     // NFA, the Pike VM is used regardless of the engine chosen above.
+     let use_pikevm = quit_after_match_with_pos || ty == PikeVM;
+     if !use_pikevm {
+         return self.exec_backtrack(matches, slots, text, start, end);
+     }
+     self.exec_pikevm(matches, slots, quit_after_match, text, start, end)
+ }
+
+ /// Unconditionally runs the Pike VM.
+ ///
+ /// The haystack is wrapped in a `ByteInput` when the program uses bytes
+ /// and in a `CharInput` otherwise.
+ fn exec_pikevm(
+     &self,
+     matches: &mut [bool],
+     slots: &mut [Slot],
+     quit_after_match: bool,
+     text: &[u8],
+     start: usize,
+     end: usize,
+ ) -> bool {
+     let prog = &self.ro.nfa;
+     if !prog.uses_bytes() {
+         return pikevm::Fsm::exec(
+             prog,
+             self.cache.value(),
+             matches,
+             slots,
+             quit_after_match,
+             CharInput::new(text),
+             start,
+             end,
+         );
+     }
+     pikevm::Fsm::exec(
+         prog,
+         self.cache.value(),
+         matches,
+         slots,
+         quit_after_match,
+         ByteInput::new(text, prog.only_utf8),
+         start,
+         end,
+     )
+ }
+
+ /// Unconditionally runs the bounded backtracking engine.
+ ///
+ /// The haystack is wrapped in a `ByteInput` when the program uses bytes
+ /// and in a `CharInput` otherwise.
+ fn exec_backtrack(
+     &self,
+     matches: &mut [bool],
+     slots: &mut [Slot],
+     text: &[u8],
+     start: usize,
+     end: usize,
+ ) -> bool {
+     let prog = &self.ro.nfa;
+     if !prog.uses_bytes() {
+         return backtrack::Bounded::exec(
+             prog,
+             self.cache.value(),
+             matches,
+             slots,
+             CharInput::new(text),
+             start,
+             end,
+         );
+     }
+     backtrack::Bounded::exec(
+         prog,
+         self.cache.value(),
+         matches,
+         slots,
+         ByteInput::new(text, prog.only_utf8),
+         start,
+         end,
+     )
+ }
+
+ /// Finds which regular expressions match the given text.
+ ///
+ /// `matches` should have length equal to the number of regexes being
+ /// searched.
+ ///
+ /// This is only useful when one wants to know which regexes in a set
+ /// match some text.
+ ///
+ /// Returns true when the selected engine reports a match. When the DFA
+ /// reports `Quit`, the search is retried with an automatically chosen
+ /// NFA engine.
+ pub fn many_matches_at(
+ &self,
+ matches: &mut [bool],
+ text: &[u8],
+ start: usize,
+ ) -> bool {
+ use self::MatchType::*;
+ // Cheap rejection before running any engine.
+ if !self.is_anchor_end_match(text) {
+ return false;
+ }
+ match self.ro.match_type {
+ #[cfg(feature = "perf-literal")]
+ Literal(ty) => {
+ // The literal strategy is only expected with a single regex.
+ debug_assert_eq!(matches.len(), 1);
+ matches[0] = self.find_literals(ty, text, start).is_some();
+ matches[0]
+ }
+ #[cfg(feature = "perf-dfa")]
+ Dfa | DfaAnchoredReverse | DfaMany => {
+ match dfa::Fsm::forward_many(
+ &self.ro.dfa,
+ self.cache.value(),
+ matches,
+ text,
+ start,
+ ) {
+ dfa::Result::Match(_) => true,
+ dfa::Result::NoMatch(_) => false,
+ dfa::Result::Quit => self.exec_nfa(
+ MatchNfaType::Auto,
+ matches,
+ &mut [],
+ false,
+ false,
+ text,
+ start,
+ text.len(),
+ ),
+ }
+ }
+ // Same body as the arm above; it is a separate arm because this
+ // variant only exists when both features are enabled.
+ #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
+ DfaSuffix => {
+ match dfa::Fsm::forward_many(
+ &self.ro.dfa,
+ self.cache.value(),
+ matches,
+ text,
+ start,
+ ) {
+ dfa::Result::Match(_) => true,
+ dfa::Result::NoMatch(_) => false,
+ dfa::Result::Quit => self.exec_nfa(
+ MatchNfaType::Auto,
+ matches,
+ &mut [],
+ false,
+ false,
+ text,
+ start,
+ text.len(),
+ ),
+ }
+ }
+ Nfa(ty) => self.exec_nfa(
+ ty,
+ matches,
+ &mut [],
+ false,
+ false,
+ text,
+ start,
+ text.len(),
+ ),
+ Nothing => false,
+ }
+ }
+
+ /// Quick pre-check run before any engine: returns false only when the
+ /// haystack is large, the program is anchored at the end, and the
+ /// haystack does not end with the non-empty `lcs()` of the suffix
+ /// literals. Without the `perf-literal` feature it always returns true.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn is_anchor_end_match(&self, text: &[u8]) -> bool {
+     #[cfg(feature = "perf-literal")]
+     fn imp(ro: &ExecReadOnly, text: &[u8]) -> bool {
+         // Only do this check if the haystack is big (>1MB).
+         let is_big = text.len() > (1 << 20);
+         if !is_big || !ro.nfa.is_anchored_end {
+             return true;
+         }
+         let lcs = ro.suffixes.lcs();
+         lcs.len() < 1 || lcs.is_suffix(text)
+     }
+
+     #[cfg(not(feature = "perf-literal"))]
+     fn imp(_: &ExecReadOnly, _: &[u8]) -> bool {
+         true
+     }
+
+     imp(&self.ro, text)
+ }
+
+ /// Returns a reference to the program's shared map from capture group
+ /// name to a `usize`.
+ // NOTE(review): the value is presumably the group's index -- confirm
+ // where `capture_name_idx` is built.
+ pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
+ &self.ro.nfa.capture_name_idx
+ }
+}
+
+impl<'c> ExecNoSyncStr<'c> {
+ /// Returns the capture-name map of the wrapped byte searcher.
+ pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
+ self.0.capture_name_idx()
+ }
+}
+
+impl Exec {
+ /// Get a searcher that isn't Sync.
+ ///
+ /// The searcher borrows the read-only program state and takes a cache
+ /// from this executor's pool.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub fn searcher(&self) -> ExecNoSync<'_> {
+ ExecNoSync {
+ ro: &self.ro, // a clone is too expensive here! (and not needed)
+ cache: self.pool.get(),
+ }
+ }
+
+ /// Get a searcher that isn't Sync and can match on &str.
+ ///
+ /// This wraps the searcher returned by `searcher`.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub fn searcher_str(&self) -> ExecNoSyncStr<'_> {
+ ExecNoSyncStr(self.searcher())
+ }
+
+ /// Build a Regex from this executor.
+ pub fn into_regex(self) -> re_unicode::Regex {
+ re_unicode::Regex::from(self)
+ }
+
+ /// Build a RegexSet from this executor.
+ pub fn into_regex_set(self) -> re_set::unicode::RegexSet {
+ re_set::unicode::RegexSet::from(self)
+ }
+
+ /// Build a Regex from this executor that can match arbitrary bytes.
+ pub fn into_byte_regex(self) -> re_bytes::Regex {
+ re_bytes::Regex::from(self)
+ }
+
+ /// Build a RegexSet from this executor that can match arbitrary bytes.
+ pub fn into_byte_regex_set(self) -> re_set::bytes::RegexSet {
+ re_set::bytes::RegexSet::from(self)
+ }
+
+ /// The original regular expressions given by the caller that were
+ /// compiled.
+ pub fn regex_strings(&self) -> &[String] {
+ &self.ro.res
+ }
+
+ /// Return a slice of capture names.
+ ///
+ /// Any capture that isn't named is None.
+ pub fn capture_names(&self) -> &[Option<String>] {
+ &self.ro.nfa.captures
+ }
+
+ /// Return a reference to named groups mapping (from group name to
+ /// group position).
+ pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
+ &self.ro.nfa.capture_name_idx
+ }
+}
+
+impl Clone for Exec {
+    /// Produces an executor that shares the read-only program state with
+    /// `self` but owns a freshly created cache pool.
+    fn clone(&self) -> Exec {
+        let fresh_pool = ExecReadOnly::new_pool(&self.ro);
+        let shared_ro = self.ro.clone();
+        Exec { ro: shared_ro, pool: fresh_pool }
+    }
+}
+
+impl ExecReadOnly {
+ /// Picks the matching strategy for this program.
+ ///
+ /// An NFA `hint` is honoured unconditionally. Otherwise the choice is, in
+ /// order: `Nothing` for an empty program, a literal strategy, a DFA
+ /// strategy, and finally an automatically chosen NFA engine.
+ fn choose_match_type(&self, hint: Option<MatchType>) -> MatchType {
+     if let Some(forced @ MatchType::Nfa(_)) = hint {
+         return forced;
+     }
+     // If the NFA is empty, then we'll never match anything.
+     if self.nfa.insts.is_empty() {
+         return MatchType::Nothing;
+     }
+     self.choose_literal_match_type()
+         .or_else(|| self.choose_dfa_match_type())
+         // We're so totally hosed.
+         .unwrap_or(MatchType::Nfa(MatchNfaType::Auto))
+ }
+
+ /// Returns the literal search strategy to use when a plain literal scan
+ /// can stand in for a regex engine, and `None` otherwise (always `None`
+ /// without the `perf-literal` feature).
+ fn choose_literal_match_type(&self) -> Option<MatchType> {
+     #[cfg(feature = "perf-literal")]
+     fn imp(ro: &ExecReadOnly) -> Option<MatchType> {
+         // If our set of prefixes is complete, then we can use it to find
+         // a match in lieu of a regex engine. This doesn't quite work well
+         // in the presence of multiple regexes, so only do it when there's
+         // one.
+         //
+         // TODO(burntsushi): Also, don't try to match literals if the regex
+         // is partially anchored. We could technically do it, but we'd need
+         // to create two sets of literals: all of them and then the subset
+         // that aren't anchored. We would then only search for all of them
+         // when at the beginning of the input and use the subset in all
+         // other cases.
+         if ro.res.len() != 1 {
+             return None;
+         }
+         let kind = if ro.ac.is_some() {
+             MatchLiteralType::AhoCorasick
+         } else if ro.nfa.prefixes.complete() {
+             if ro.nfa.is_anchored_start {
+                 MatchLiteralType::AnchoredStart
+             } else {
+                 MatchLiteralType::Unanchored
+             }
+         } else if ro.suffixes.complete() {
+             if ro.nfa.is_anchored_end {
+                 MatchLiteralType::AnchoredEnd
+             } else {
+                 // This case shouldn't happen. When the regex isn't
+                 // anchored, then complete prefixes should imply complete
+                 // suffixes.
+                 MatchLiteralType::Unanchored
+             }
+         } else {
+             return None;
+         };
+         Some(MatchType::Literal(kind))
+     }
+
+     #[cfg(not(feature = "perf-literal"))]
+     fn imp(_: &ExecReadOnly) -> Option<MatchType> {
+         None
+     }
+
+     imp(self)
+ }
+
+ /// If a DFA scan can be used, then choose the appropriate DFA strategy.
+ ///
+ /// Returns `None` when the `perf-dfa` feature is disabled or when
+ /// `dfa::can_exec` rejects the forward DFA program.
+ fn choose_dfa_match_type(&self) -> Option<MatchType> {
+ #[cfg(not(feature = "perf-dfa"))]
+ fn imp(_: &ExecReadOnly) -> Option<MatchType> {
+ None
+ }
+
+ #[cfg(feature = "perf-dfa")]
+ fn imp(ro: &ExecReadOnly) -> Option<MatchType> {
+ if !dfa::can_exec(&ro.dfa) {
+ return None;
+ }
+ // Regex sets require a slightly specialized path.
+ if ro.res.len() >= 2 {
+ return Some(MatchType::DfaMany);
+ }
+ // If the regex is anchored at the end but not the start, then
+ // just match in reverse from the end of the haystack.
+ if !ro.nfa.is_anchored_start && ro.nfa.is_anchored_end {
+ return Some(MatchType::DfaAnchoredReverse);
+ }
+ // The suffix-scan strategy only exists when literal optimizations
+ // are compiled in.
+ #[cfg(feature = "perf-literal")]
+ {
+ // If there's a longish suffix literal, then it might be faster
+ // to look for that first.
+ if ro.should_suffix_scan() {
+ return Some(MatchType::DfaSuffix);
+ }
+ }
+ // Fall back to your garden variety forward searching lazy DFA.
+ Some(MatchType::Dfa)
+ }
+
+ imp(self)
+ }
+
+ /// Returns true if the program is amenable to suffix scanning.
+ ///
+ /// When this is true, as a heuristic, we assume it is OK to quickly scan
+ /// for suffix literals and then do a *reverse* DFA match from any matches
+ /// produced by the literal scan. (That is then followed by a forward DFA
+ /// search, since the previously found suffix literal may not actually be
+ /// the end of a match.)
+ ///
+ /// This is a bit of a specialized optimization, but can result in pretty
+ /// big performance wins if 1) there are no prefix literals and 2) the
+ /// suffix literals are pretty rare in the text. (1) is obviously easy to
+ /// account for but (2) is harder. As a proxy, we assume that longer
+ /// strings are generally rarer, so we only enable this optimization when
+ /// we have a meaty suffix.
+ #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
+ fn should_suffix_scan(&self) -> bool {
+     if self.suffixes.is_empty() {
+         return false;
+     }
+     let suffix_len = self.suffixes.lcs().char_len();
+     if suffix_len < 3 {
+         return false;
+     }
+     // The suffix must also be longer than the prefix literal.
+     suffix_len > self.dfa.prefixes.lcp().char_len()
+ }
+
+ /// Builds a boxed pool of program caches for `ro`.
+ ///
+ /// The pool's factory closure owns its own clone of the `Arc`, and each
+ /// call to it constructs a new `ProgramCacheInner` from that program.
+ fn new_pool(ro: &Arc<ExecReadOnly>) -> Box<Pool<ProgramCache>> {
+ let ro = ro.clone();
+ Box::new(Pool::new(Box::new(move || {
+ AssertUnwindSafe(RefCell::new(ProgramCacheInner::new(&ro)))
+ })))
+ }
+}
+
+// The matching strategy selected for a compiled program. Several variants
+// only exist when the corresponding performance features are enabled.
+#[derive(Clone, Copy, Debug)]
+enum MatchType {
+ /// A single or multiple literal search. This is only used when the regex
+ /// can be decomposed into a literal search.
+ #[cfg(feature = "perf-literal")]
+ Literal(MatchLiteralType),
+ /// A normal DFA search.
+ #[cfg(feature = "perf-dfa")]
+ Dfa,
+ /// A reverse DFA search starting from the end of a haystack.
+ #[cfg(feature = "perf-dfa")]
+ DfaAnchoredReverse,
+ /// A reverse DFA search with suffix literal scanning.
+ #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
+ DfaSuffix,
+ /// Use the DFA on two or more regular expressions.
+ #[cfg(feature = "perf-dfa")]
+ DfaMany,
+ /// An NFA variant.
+ Nfa(MatchNfaType),
+ /// No match is ever possible, so don't ever try to search.
+ Nothing,
+}
+
+// The kind of literal search performed for `MatchType::Literal`.
+#[derive(Clone, Copy, Debug)]
+#[cfg(feature = "perf-literal")]
+enum MatchLiteralType {
+ /// Match literals anywhere in text.
+ Unanchored,
+ /// Match literals only at the start of text.
+ AnchoredStart,
+ /// Match literals only at the end of text.
+ AnchoredEnd,
+ /// Use an Aho-Corasick automaton. This requires `ac` to be Some on
+ /// ExecReadOnly.
+ AhoCorasick,
+}
+
+// Which NFA engine to run. `Auto` is resolved to one of the two concrete
+// engines at search time.
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+enum MatchNfaType {
+ /// Choose between Backtrack and PikeVM.
+ Auto,
+ /// NFA bounded backtracking.
+ ///
+ /// (This is only set by tests, since it never makes sense to always want
+ /// backtracking.)
+ Backtrack,
+ /// The Pike VM.
+ ///
+ /// (This is only set by tests, since it never makes sense to always want
+ /// the Pike VM.)
+ PikeVM,
+}
+
+/// `ProgramCache` maintains reusable allocations for each matching engine
+/// available to a particular program.
+///
+/// We declare this as unwind safe since it's a cache that's only used for
+/// performance purposes. If a panic occurs, it is (or should be) always safe
+/// to continue using the same regex object.
+pub type ProgramCache = AssertUnwindSafe<RefCell<ProgramCacheInner>>;
+
+// One cache per matching engine; the DFA caches only exist when the
+// `perf-dfa` feature is enabled.
+#[derive(Debug)]
+pub struct ProgramCacheInner {
+ // Cache for the Pike VM engine.
+ pub pikevm: pikevm::Cache,
+ // Cache for the bounded backtracking engine.
+ pub backtrack: backtrack::Cache,
+ // Cache for the forward DFA.
+ #[cfg(feature = "perf-dfa")]
+ pub dfa: dfa::Cache,
+ // Cache for the reverse DFA.
+ #[cfg(feature = "perf-dfa")]
+ pub dfa_reverse: dfa::Cache,
+}
+
+impl ProgramCacheInner {
+ /// Creates one cache per engine: the Pike VM and backtracking caches are
+ /// built from `ro.nfa`, and (with `perf-dfa`) the DFA caches from
+ /// `ro.dfa` and `ro.dfa_reverse`.
+ fn new(ro: &ExecReadOnly) -> Self {
+ ProgramCacheInner {
+ pikevm: pikevm::Cache::new(&ro.nfa),
+ backtrack: backtrack::Cache::new(&ro.nfa),
+ #[cfg(feature = "perf-dfa")]
+ dfa: dfa::Cache::new(&ro.dfa),
+ #[cfg(feature = "perf-dfa")]
+ dfa_reverse: dfa::Cache::new(&ro.dfa_reverse),
+ }
+ }
+}
+
+/// Checks whether the given HIR is a simple alternation of literals and, if
+/// so, returns each alternative as its byte string. Otherwise (including
+/// when the expression is a single literal) this returns None.
+#[cfg(feature = "perf-literal")]
+fn alternation_literals(expr: &Hir) -> Option<Vec<Vec<u8>>> {
+    use regex_syntax::hir::{HirKind, Literal};
+
+    // This is pretty hacky, but basically, if `is_alternation_literal` is
+    // true, then we can make several assumptions about the structure of our
+    // HIR. This is what justifies the `unreachable!` statements below.
+    //
+    // This code should be refactored once we overhaul this crate's
+    // optimization pipeline, because this is a terribly inflexible way to go
+    // about things.
+
+    if !expr.is_alternation_literal() {
+        return None;
+    }
+    let branches = match *expr.kind() {
+        HirKind::Alternation(ref branches) => branches,
+        _ => return None, // one literal isn't worth it
+    };
+
+    // Appends the byte encoding of one literal to `out`.
+    let append = |lit: &Literal, out: &mut Vec<u8>| match *lit {
+        Literal::Byte(b) => out.push(b),
+        Literal::Unicode(c) => {
+            let mut utf8 = [0; 4];
+            out.extend_from_slice(c.encode_utf8(&mut utf8).as_bytes());
+        }
+    };
+
+    let lits = branches
+        .iter()
+        .map(|alt| {
+            let mut bytes = vec![];
+            match *alt.kind() {
+                HirKind::Literal(ref x) => append(x, &mut bytes),
+                HirKind::Concat(ref exprs) => {
+                    for e in exprs {
+                        match *e.kind() {
+                            HirKind::Literal(ref x) => append(x, &mut bytes),
+                            _ => unreachable!("expected literal, got {:?}", e),
+                        }
+                    }
+                }
+                _ => unreachable!("expected literal or concat, got {:?}", alt),
+            }
+            bytes
+        })
+        .collect();
+    Some(lits)
+}
+
+// Regression tests checking that the bounded backtracking engine and the
+// default engine split a haystack identically.
+#[cfg(test)]
+mod test {
+    #[test]
+    fn uppercut_s_backtracking_bytes_default_bytes_mismatch() {
+        use crate::internal::ExecBuilder;
+
+        let backtrack_bytes_re = ExecBuilder::new("^S")
+            .bounded_backtracking()
+            .only_utf8(false)
+            .build()
+            .map(|exec| exec.into_byte_regex())
+            .map_err(|err| format!("{}", err))
+            .unwrap();
+
+        let default_bytes_re = ExecBuilder::new("^S")
+            .only_utf8(false)
+            .build()
+            .map(|exec| exec.into_byte_regex())
+            .map_err(|err| format!("{}", err))
+            .unwrap();
+
+        let input = vec![83, 83];
+
+        // Compare the complete chunk sequences. Zipping the two iterators
+        // would stop at the shorter one and hide a difference in the number
+        // of chunks produced.
+        let s1: Vec<&[u8]> = backtrack_bytes_re.split(&input).collect();
+        let s2: Vec<&[u8]> = default_bytes_re.split(&input).collect();
+        assert_eq!(s1, s2);
+    }
+
+    #[test]
+    fn unicode_lit_star_backtracking_utf8bytes_default_utf8bytes_mismatch() {
+        use crate::internal::ExecBuilder;
+
+        let backtrack_bytes_re = ExecBuilder::new(r"^(?u:\*)")
+            .bounded_backtracking()
+            .bytes(true)
+            .build()
+            .map(|exec| exec.into_regex())
+            .map_err(|err| format!("{}", err))
+            .unwrap();
+
+        let default_bytes_re = ExecBuilder::new(r"^(?u:\*)")
+            .bytes(true)
+            .build()
+            .map(|exec| exec.into_regex())
+            .map_err(|err| format!("{}", err))
+            .unwrap();
+
+        let input = "**";
+
+        // As above: compare whole sequences so a length mismatch fails too.
+        let s1: Vec<&str> = backtrack_bytes_re.split(input).collect();
+        let s2: Vec<&str> = default_bytes_re.split(input).collect();
+        assert_eq!(s1, s2);
+    }
+}
diff --git a/third_party/rust/regex/src/expand.rs b/third_party/rust/regex/src/expand.rs
new file mode 100644
index 0000000000..67b514926a
--- /dev/null
+++ b/third_party/rust/regex/src/expand.rs
@@ -0,0 +1,239 @@
+use std::str;
+
+use crate::find_byte::find_byte;
+
+use crate::re_bytes;
+use crate::re_unicode;
+
+/// Expands the capture references in `replacement` using `caps` and appends
+/// the result to `dst`.
+///
+/// `$$` produces a literal `$`. A `$` that does not start a valid reference
+/// is copied through unchanged, and a reference that yields no match
+/// contributes the empty string.
+pub fn expand_str(
+    caps: &re_unicode::Captures<'_>,
+    mut replacement: &str,
+    dst: &mut String,
+) {
+    // Each iteration consumes one `$`; the text before it is copied as is.
+    while let Some(dollar) = find_byte(b'$', replacement.as_bytes()) {
+        let (literal, rest) = replacement.split_at(dollar);
+        dst.push_str(literal);
+        replacement = rest;
+
+        // `$$` is the escape for a single dollar sign.
+        if replacement.as_bytes().get(1) == Some(&b'$') {
+            dst.push('$');
+            replacement = &replacement[2..];
+            continue;
+        }
+        debug_assert!(!replacement.is_empty());
+        let cap_ref = match find_cap_ref(replacement.as_bytes()) {
+            Some(found) => found,
+            None => {
+                // Not a reference after all: keep the `$` as literal text.
+                dst.push('$');
+                replacement = &replacement[1..];
+                continue;
+            }
+        };
+        replacement = &replacement[cap_ref.end..];
+        let matched = match cap_ref.cap {
+            Ref::Number(index) => caps.get(index),
+            Ref::Named(name) => caps.name(name),
+        };
+        dst.push_str(matched.map_or("", |m| m.as_str()));
+    }
+    // No `$` left: the remainder is plain text.
+    dst.push_str(replacement);
+}
+
+/// Expands the capture references in `replacement` using `caps` and appends
+/// the resulting bytes to `dst`.
+///
+/// `$$` produces a literal `$`. A `$` that does not start a valid reference
+/// is copied through unchanged, and a reference that yields no match
+/// contributes nothing.
+pub fn expand_bytes(
+    caps: &re_bytes::Captures<'_>,
+    mut replacement: &[u8],
+    dst: &mut Vec<u8>,
+) {
+    let nothing: &[u8] = &[];
+    // Each iteration consumes one `$`; the bytes before it are copied as is.
+    while let Some(dollar) = find_byte(b'$', replacement) {
+        let (literal, rest) = replacement.split_at(dollar);
+        dst.extend_from_slice(literal);
+        replacement = rest;
+
+        // `$$` is the escape for a single dollar sign.
+        if replacement.get(1) == Some(&b'$') {
+            dst.push(b'$');
+            replacement = &replacement[2..];
+            continue;
+        }
+        debug_assert!(!replacement.is_empty());
+        let cap_ref = match find_cap_ref(replacement) {
+            Some(found) => found,
+            None => {
+                // Not a reference after all: keep the `$` as a literal byte.
+                dst.push(b'$');
+                replacement = &replacement[1..];
+                continue;
+            }
+        };
+        replacement = &replacement[cap_ref.end..];
+        let matched = match cap_ref.cap {
+            Ref::Number(index) => caps.get(index),
+            Ref::Named(name) => caps.name(name),
+        };
+        dst.extend_from_slice(matched.map_or(nothing, |m| m.as_bytes()));
+    }
+    // No `$` left: the remainder is plain bytes.
+    dst.extend_from_slice(replacement);
+}
+
+/// `CaptureRef` represents a reference to a capture group inside some text.
+/// The reference is either a capture group name or a number.
+///
+/// It is also tagged with the position in the text following the
+/// capture reference.
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+struct CaptureRef<'a> {
+ /// The referenced group, either by name or by index.
+ cap: Ref<'a>,
+ /// Byte offset just past the reference (e.g. `4` for `$foo`); the
+ /// expanders resume scanning the replacement text from here.
+ end: usize,
+}
+
+/// A reference to a capture group in some text.
+///
+/// e.g., `$2`, `$foo`, `${foo}`.
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+enum Ref<'a> {
+ /// A reference by name; used whenever the reference text does not parse
+ /// as a `u32`.
+ Named(&'a str),
+ /// A reference by capture group index.
+ Number(usize),
+}
+
+impl<'a> From<&'a str> for Ref<'a> {
+ /// Wraps a capture group name as a named reference.
+ fn from(x: &'a str) -> Ref<'a> {
+ Ref::Named(x)
+ }
+}
+
+impl From<usize> for Ref<'static> {
+ /// Wraps a capture group index as a numbered reference.
+ fn from(x: usize) -> Ref<'static> {
+ Ref::Number(x)
+ }
+}
+
+/// Tries to parse a capture group reference located at the very start of
+/// `replacement`.
+///
+/// The text must begin with `$`, followed either by a braced reference
+/// (`${...}`) or by the longest run of bytes accepted by
+/// `is_valid_cap_letter`. Returns `None` when no valid reference is found.
+fn find_cap_ref(replacement: &[u8]) -> Option<CaptureRef<'_>> {
+    // A reference needs the `$` plus at least one more byte.
+    if replacement.len() < 2 || replacement[0] != b'$' {
+        return None;
+    }
+    if replacement[1] == b'{' {
+        return find_cap_ref_braced(replacement, 2);
+    }
+    let name_len = replacement[1..]
+        .iter()
+        .take_while(|&&b| is_valid_cap_letter(b))
+        .count();
+    if name_len == 0 {
+        return None;
+    }
+    let end = 1 + name_len;
+    // Every byte in `1..end` passed `is_valid_cap_letter`, so the slice is
+    // ASCII and the UTF-8 conversion below cannot fail.
+    let name = str::from_utf8(&replacement[1..end])
+        .expect("valid UTF-8 capture name");
+    // Text that parses as a `u32` is an index; anything else is a name.
+    let cap = match name.parse::<u32>() {
+        Ok(index) => Ref::Number(index as usize),
+        Err(_) => Ref::Named(name),
+    };
+    Some(CaptureRef { cap, end })
+}
+
+/// Parses the inside of a braced reference (`${...}`), where `i` is the
+/// offset of the first byte after the opening `{`.
+///
+/// Returns `None` when there is no closing `}` or when the bytes between
+/// the braces are not valid UTF-8.
+fn find_cap_ref_braced(rep: &[u8], mut i: usize) -> Option<CaptureRef<'_>> {
+    let start = i;
+    // Move `i` onto the first `}` at or after `start`, if there is one.
+    i += rep.get(start..)?.iter().position(|&b| b == b'}')?;
+    // Braced names are otherwise unrestricted, so they may hold invalid
+    // UTF-8. A capture group name is always valid UTF-8, hence such text
+    // cannot refer to any group.
+    let name = str::from_utf8(&rep[start..i]).ok()?;
+    // Text that parses as a `u32` is an index; anything else is a name.
+    let cap = match name.parse::<u32>() {
+        Ok(index) => Ref::Number(index as usize),
+        Err(_) => Ref::Named(name),
+    };
+    Some(CaptureRef { cap, end: i + 1 })
+}
+
+/// Reports whether `b` may appear in an unbraced capture name, i.e. whether
+/// it is an ASCII letter, an ASCII digit or an underscore.
+fn is_valid_cap_letter(b: u8) -> bool {
+    b == b'_' || b.is_ascii_alphanumeric()
+}
+
+#[cfg(test)]
+mod tests {
+ use super::{find_cap_ref, CaptureRef};
+
+ // Declares one `#[test]` per invocation. The two-argument form asserts
+ // that `find_cap_ref` finds no reference in the text; the three-argument
+ // form asserts that it returns exactly the given `CaptureRef`.
+ macro_rules! find {
+ ($name:ident, $text:expr) => {
+ #[test]
+ fn $name() {
+ assert_eq!(None, find_cap_ref($text.as_bytes()));
+ }
+ };
+ ($name:ident, $text:expr, $capref:expr) => {
+ #[test]
+ fn $name() {
+ assert_eq!(Some($capref), find_cap_ref($text.as_bytes()));
+ }
+ };
+ }
+
+ // Builds the expected `CaptureRef`: the first argument is converted with
+ // `.into()` (a string becomes a name, an integer an index), the second is
+ // the expected `end` offset.
+ macro_rules! c {
+ ($name_or_number:expr, $pos:expr) => {
+ CaptureRef { cap: $name_or_number.into(), end: $pos }
+ };
+ }
+
+ find!(find_cap_ref1, "$foo", c!("foo", 4));
+ find!(find_cap_ref2, "${foo}", c!("foo", 6));
+ find!(find_cap_ref3, "$0", c!(0, 2));
+ find!(find_cap_ref4, "$5", c!(5, 2));
+ find!(find_cap_ref5, "$10", c!(10, 3));
+ // See https://github.com/rust-lang/regex/pull/585
+ // for more on characters following numbers
+ find!(find_cap_ref6, "$42a", c!("42a", 4));
+ find!(find_cap_ref7, "${42}a", c!(42, 5));
+ find!(find_cap_ref8, "${42");
+ find!(find_cap_ref9, "${42 ");
+ find!(find_cap_ref10, " $0 ");
+ find!(find_cap_ref11, "$");
+ find!(find_cap_ref12, " ");
+ find!(find_cap_ref13, "");
+ find!(find_cap_ref14, "$1-$2", c!(1, 2));
+ find!(find_cap_ref15, "$1_$2", c!("1_", 3));
+ find!(find_cap_ref16, "$x-$y", c!("x", 2));
+ find!(find_cap_ref17, "$x_$y", c!("x_", 3));
+ find!(find_cap_ref18, "${#}", c!("#", 4));
+ find!(find_cap_ref19, "${Z[}", c!("Z[", 5));
+}
diff --git a/third_party/rust/regex/src/find_byte.rs b/third_party/rust/regex/src/find_byte.rs
new file mode 100644
index 0000000000..e95f72afb9
--- /dev/null
+++ b/third_party/rust/regex/src/find_byte.rs
@@ -0,0 +1,18 @@
+/// Returns the offset of the first occurrence of `needle` in `haystack`,
+/// or `None` if the byte does not occur.
+///
+/// With the `perf-literal` feature enabled the search is delegated to the
+/// memchr crate; without it a plain byte-at-a-time scan is used.
+pub fn find_byte(needle: u8, haystack: &[u8]) -> Option<usize> {
+    #[cfg(feature = "perf-literal")]
+    fn search(needle: u8, haystack: &[u8]) -> Option<usize> {
+        memchr::memchr(needle, haystack)
+    }
+
+    #[cfg(not(feature = "perf-literal"))]
+    fn search(needle: u8, haystack: &[u8]) -> Option<usize> {
+        haystack.iter().position(|candidate| *candidate == needle)
+    }
+
+    search(needle, haystack)
+}
diff --git a/third_party/rust/regex/src/freqs.rs b/third_party/rust/regex/src/freqs.rs
new file mode 100644
index 0000000000..fcffa95fb5
--- /dev/null
+++ b/third_party/rust/regex/src/freqs.rs
@@ -0,0 +1,261 @@
+// NOTE: The following code was generated by "scripts/frequencies.py", do not
+// edit directly
+
+pub const BYTE_FREQUENCIES: [u8; 256] = [
+ 55, // '\x00'
+ 52, // '\x01'
+ 51, // '\x02'
+ 50, // '\x03'
+ 49, // '\x04'
+ 48, // '\x05'
+ 47, // '\x06'
+ 46, // '\x07'
+ 45, // '\x08'
+ 103, // '\t'
+ 242, // '\n'
+ 66, // '\x0b'
+ 67, // '\x0c'
+ 229, // '\r'
+ 44, // '\x0e'
+ 43, // '\x0f'
+ 42, // '\x10'
+ 41, // '\x11'
+ 40, // '\x12'
+ 39, // '\x13'
+ 38, // '\x14'
+ 37, // '\x15'
+ 36, // '\x16'
+ 35, // '\x17'
+ 34, // '\x18'
+ 33, // '\x19'
+ 56, // '\x1a'
+ 32, // '\x1b'
+ 31, // '\x1c'
+ 30, // '\x1d'
+ 29, // '\x1e'
+ 28, // '\x1f'
+ 255, // ' '
+ 148, // '!'
+ 164, // '"'
+ 149, // '#'
+ 136, // '$'
+ 160, // '%'
+ 155, // '&'
+ 173, // "'"
+ 221, // '('
+ 222, // ')'
+ 134, // '*'
+ 122, // '+'
+ 232, // ','
+ 202, // '-'
+ 215, // '.'
+ 224, // '/'
+ 208, // '0'
+ 220, // '1'
+ 204, // '2'
+ 187, // '3'
+ 183, // '4'
+ 179, // '5'
+ 177, // '6'
+ 168, // '7'
+ 178, // '8'
+ 200, // '9'
+ 226, // ':'
+ 195, // ';'
+ 154, // '<'
+ 184, // '='
+ 174, // '>'
+ 126, // '?'
+ 120, // '@'
+ 191, // 'A'
+ 157, // 'B'
+ 194, // 'C'
+ 170, // 'D'
+ 189, // 'E'
+ 162, // 'F'
+ 161, // 'G'
+ 150, // 'H'
+ 193, // 'I'
+ 142, // 'J'
+ 137, // 'K'
+ 171, // 'L'
+ 176, // 'M'
+ 185, // 'N'
+ 167, // 'O'
+ 186, // 'P'
+ 112, // 'Q'
+ 175, // 'R'
+ 192, // 'S'
+ 188, // 'T'
+ 156, // 'U'
+ 140, // 'V'
+ 143, // 'W'
+ 123, // 'X'
+ 133, // 'Y'
+ 128, // 'Z'
+ 147, // '['
+ 138, // '\\'
+ 146, // ']'
+ 114, // '^'
+ 223, // '_'
+ 151, // '`'
+ 249, // 'a'
+ 216, // 'b'
+ 238, // 'c'
+ 236, // 'd'
+ 253, // 'e'
+ 227, // 'f'
+ 218, // 'g'
+ 230, // 'h'
+ 247, // 'i'
+ 135, // 'j'
+ 180, // 'k'
+ 241, // 'l'
+ 233, // 'm'
+ 246, // 'n'
+ 244, // 'o'
+ 231, // 'p'
+ 139, // 'q'
+ 245, // 'r'
+ 243, // 's'
+ 251, // 't'
+ 235, // 'u'
+ 201, // 'v'
+ 196, // 'w'
+ 240, // 'x'
+ 214, // 'y'
+ 152, // 'z'
+ 182, // '{'
+ 205, // '|'
+ 181, // '}'
+ 127, // '~'
+ 27, // '\x7f'
+ 212, // '\x80'
+ 211, // '\x81'
+ 210, // '\x82'
+ 213, // '\x83'
+ 228, // '\x84'
+ 197, // '\x85'
+ 169, // '\x86'
+ 159, // '\x87'
+ 131, // '\x88'
+ 172, // '\x89'
+ 105, // '\x8a'
+ 80, // '\x8b'
+ 98, // '\x8c'
+ 96, // '\x8d'
+ 97, // '\x8e'
+ 81, // '\x8f'
+ 207, // '\x90'
+ 145, // '\x91'
+ 116, // '\x92'
+ 115, // '\x93'
+ 144, // '\x94'
+ 130, // '\x95'
+ 153, // '\x96'
+ 121, // '\x97'
+ 107, // '\x98'
+ 132, // '\x99'
+ 109, // '\x9a'
+ 110, // '\x9b'
+ 124, // '\x9c'
+ 111, // '\x9d'
+ 82, // '\x9e'
+ 108, // '\x9f'
+ 118, // '\xa0'
+ 141, // '¡'
+ 113, // '¢'
+ 129, // '£'
+ 119, // '¤'
+ 125, // '¥'
+ 165, // '¦'
+ 117, // '§'
+ 92, // '¨'
+ 106, // '©'
+ 83, // 'ª'
+ 72, // '«'
+ 99, // '¬'
+ 93, // '\xad'
+ 65, // '®'
+ 79, // '¯'
+ 166, // '°'
+ 237, // '±'
+ 163, // '²'
+ 199, // '³'
+ 190, // '´'
+ 225, // 'µ'
+ 209, // '¶'
+ 203, // '·'
+ 198, // '¸'
+ 217, // '¹'
+ 219, // 'º'
+ 206, // '»'
+ 234, // '¼'
+ 248, // '½'
+ 158, // '¾'
+ 239, // '¿'
+ 255, // 'À'
+ 255, // 'Á'
+ 255, // 'Â'
+ 255, // 'Ã'
+ 255, // 'Ä'
+ 255, // 'Å'
+ 255, // 'Æ'
+ 255, // 'Ç'
+ 255, // 'È'
+ 255, // 'É'
+ 255, // 'Ê'
+ 255, // 'Ë'
+ 255, // 'Ì'
+ 255, // 'Í'
+ 255, // 'Î'
+ 255, // 'Ï'
+ 255, // 'Ð'
+ 255, // 'Ñ'
+ 255, // 'Ò'
+ 255, // 'Ó'
+ 255, // 'Ô'
+ 255, // 'Õ'
+ 255, // 'Ö'
+ 255, // '×'
+ 255, // 'Ø'
+ 255, // 'Ù'
+ 255, // 'Ú'
+ 255, // 'Û'
+ 255, // 'Ü'
+ 255, // 'Ý'
+ 255, // 'Þ'
+ 255, // 'ß'
+ 255, // 'à'
+ 255, // 'á'
+ 255, // 'â'
+ 255, // 'ã'
+ 255, // 'ä'
+ 255, // 'å'
+ 255, // 'æ'
+ 255, // 'ç'
+ 255, // 'è'
+ 255, // 'é'
+ 255, // 'ê'
+ 255, // 'ë'
+ 255, // 'ì'
+ 255, // 'í'
+ 255, // 'î'
+ 255, // 'ï'
+ 255, // 'ð'
+ 255, // 'ñ'
+ 255, // 'ò'
+ 255, // 'ó'
+ 255, // 'ô'
+ 255, // 'õ'
+ 255, // 'ö'
+ 255, // '÷'
+ 255, // 'ø'
+ 255, // 'ù'
+ 255, // 'ú'
+ 255, // 'û'
+ 255, // 'ü'
+ 255, // 'ý'
+ 255, // 'þ'
+ 255, // 'ÿ'
+];
diff --git a/third_party/rust/regex/src/input.rs b/third_party/rust/regex/src/input.rs
new file mode 100644
index 0000000000..df6c3e0c91
--- /dev/null
+++ b/third_party/rust/regex/src/input.rs
@@ -0,0 +1,432 @@
+use std::char;
+use std::cmp::Ordering;
+use std::fmt;
+use std::ops;
+use std::u32;
+
+use crate::literal::LiteralSearcher;
+use crate::prog::InstEmptyLook;
+use crate::utf8::{decode_last_utf8, decode_utf8};
+
+/// Represents a location in the input.
+#[derive(Clone, Copy, Debug)]
+pub struct InputAt {
+ /// Byte offset of this location.
+ pos: usize,
+ /// Character at this location; absent for byte-based input and at the
+ /// end of the input.
+ c: Char,
+ /// Byte at this location; only set by byte-based input.
+ byte: Option<u8>,
+ /// Number of bytes covered by this location (`0` at the end).
+ len: usize,
+}
+
+impl InputAt {
+ /// Returns true iff this position is at the beginning of the input.
+ pub fn is_start(&self) -> bool {
+ self.pos == 0
+ }
+
+ /// Returns true iff this position is past the end of the input.
+ ///
+ /// This is decided by both the character and the byte being absent.
+ pub fn is_end(&self) -> bool {
+ self.c.is_none() && self.byte.is_none()
+ }
+
+ /// Returns the character at this position.
+ ///
+ /// If this position is just before or after the input, then an absent
+ /// character is returned.
+ pub fn char(&self) -> Char {
+ self.c
+ }
+
+ /// Returns the byte at this position.
+ pub fn byte(&self) -> Option<u8> {
+ self.byte
+ }
+
+ /// Returns the number of bytes covered by this position: the UTF-8 width
+ /// of the character for `CharInput`, `1` for `ByteInput`, and `0` at the
+ /// end of the input.
+ pub fn len(&self) -> usize {
+ self.len
+ }
+
+ /// Returns whether the UTF-8 width of the character at this position
+ /// is zero.
+ pub fn is_empty(&self) -> bool {
+ self.len == 0
+ }
+
+ /// Returns the byte offset of this position.
+ pub fn pos(&self) -> usize {
+ self.pos
+ }
+
+ /// Returns the byte offset of the next position in the input.
+ pub fn next_pos(&self) -> usize {
+ self.pos + self.len
+ }
+}
+
+/// An abstraction over input used in the matching engines.
+pub trait Input: fmt::Debug {
+ /// Return an encoding of the position at byte offset `i`.
+ ///
+ /// Both implementations in this file map any `i` at or past the end of
+ /// the input to the end-of-input position.
+ fn at(&self, i: usize) -> InputAt;
+
+ /// Return the Unicode character occurring next to `at`.
+ ///
+ /// If no such character could be decoded, then `Char` is absent.
+ fn next_char(&self, at: InputAt) -> Char;
+
+ /// Return the Unicode character occurring previous to `at`.
+ ///
+ /// If no such character could be decoded, then `Char` is absent.
+ fn previous_char(&self, at: InputAt) -> Char;
+
+ /// Return true if the given empty width instruction matches at the
+ /// input position given.
+ fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool;
+
+ /// Scan the input for a matching prefix.
+ ///
+ /// The search starts at `at`; on success the position where the prefix
+ /// match begins is returned.
+ fn prefix_at(
+ &self,
+ prefixes: &LiteralSearcher,
+ at: InputAt,
+ ) -> Option<InputAt>;
+
+ /// The number of bytes in the input.
+ fn len(&self) -> usize;
+
+ /// Whether the input is empty.
+ fn is_empty(&self) -> bool {
+ self.len() == 0
+ }
+
+ /// Return the given input as a sequence of bytes.
+ fn as_bytes(&self) -> &[u8];
+}
+
+/// A shared reference to an input is itself an input; every method simply
+/// forwards to the referenced value.
+impl<'a, T: Input> Input for &'a T {
+    fn at(&self, i: usize) -> InputAt {
+        T::at(*self, i)
+    }
+
+    fn next_char(&self, at: InputAt) -> Char {
+        T::next_char(*self, at)
+    }
+
+    fn previous_char(&self, at: InputAt) -> Char {
+        T::previous_char(*self, at)
+    }
+
+    fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool {
+        T::is_empty_match(*self, at, empty)
+    }
+
+    fn prefix_at(
+        &self,
+        prefixes: &LiteralSearcher,
+        at: InputAt,
+    ) -> Option<InputAt> {
+        T::prefix_at(*self, prefixes, at)
+    }
+
+    fn len(&self) -> usize {
+        T::len(*self)
+    }
+
+    fn as_bytes(&self) -> &[u8] {
+        T::as_bytes(*self)
+    }
+}
+
+/// An input reader over characters.
+///
+/// The wrapped bytes are decoded as UTF-8 one character at a time.
+#[derive(Clone, Copy, Debug)]
+pub struct CharInput<'t>(&'t [u8]);
+
+impl<'t> CharInput<'t> {
+ /// Return a new character input reader for the given string.
+ ///
+ /// The bytes are stored as given; nothing is validated here.
+ pub fn new(s: &'t [u8]) -> CharInput<'t> {
+ CharInput(s)
+ }
+}
+
+// Lets a `CharInput` be sliced and indexed like the byte slice it wraps.
+impl<'t> ops::Deref for CharInput<'t> {
+ type Target = [u8];
+
+ fn deref(&self) -> &[u8] {
+ self.0
+ }
+}
+
+impl<'t> Input for CharInput<'t> {
+    fn at(&self, i: usize) -> InputAt {
+        // Offsets at or beyond the end all map to the end-of-input position.
+        if i >= self.len() {
+            return InputAt {
+                pos: self.len(),
+                c: None.into(),
+                byte: None,
+                len: 0,
+            };
+        }
+        let c: Char = decode_utf8(&self[i..]).map(|(ch, _)| ch).into();
+        InputAt { pos: i, c, byte: None, len: c.len_utf8() }
+    }
+
+    fn next_char(&self, at: InputAt) -> Char {
+        // The position already carries its decoded character.
+        at.char()
+    }
+
+    fn previous_char(&self, at: InputAt) -> Char {
+        let before = &self[..at.pos()];
+        decode_last_utf8(before).map(|(ch, _)| ch).into()
+    }
+
+    fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool {
+        use crate::prog::EmptyLook::*;
+        // Neighboring characters, decoded only when an assertion needs them.
+        let prev = || self.previous_char(at);
+        let next = || self.next_char(at);
+        match empty.look {
+            StartText => at.pos() == 0,
+            EndText => at.pos() == self.len(),
+            StartLine => at.pos() == 0 || prev() == '\n',
+            EndLine => at.pos() == self.len() || next() == '\n',
+            WordBoundary => prev().is_word_char() != next().is_word_char(),
+            NotWordBoundary => prev().is_word_char() == next().is_word_char(),
+            WordBoundaryAscii => {
+                prev().is_word_byte() != next().is_word_byte()
+            }
+            NotWordBoundaryAscii => {
+                prev().is_word_byte() == next().is_word_byte()
+            }
+        }
+    }
+
+    fn prefix_at(
+        &self,
+        prefixes: &LiteralSearcher,
+        at: InputAt,
+    ) -> Option<InputAt> {
+        let base = at.pos();
+        // `find` reports offsets relative to the slice it was given.
+        let (start, _) = prefixes.find(&self[base..])?;
+        Some(self.at(base + start))
+    }
+
+    fn len(&self) -> usize {
+        self.0.len()
+    }
+
+    fn as_bytes(&self) -> &[u8] {
+        self.0
+    }
+}
+
+/// An input reader over bytes.
+#[derive(Clone, Copy, Debug)]
+pub struct ByteInput<'t> {
+ /// The bytes being searched.
+ text: &'t [u8],
+ /// When set, ASCII word-boundary assertions do not match next to bytes
+ /// that cannot be decoded as UTF-8.
+ only_utf8: bool,
+}
+
+impl<'t> ByteInput<'t> {
+ /// Return a new byte-based input reader for the given string.
+ ///
+ /// `only_utf8` is stored unchanged and consulted by the ASCII
+ /// word-boundary assertions.
+ pub fn new(text: &'t [u8], only_utf8: bool) -> ByteInput<'t> {
+ ByteInput { text, only_utf8 }
+ }
+}
+
+// Lets a `ByteInput` be sliced and indexed like the byte slice it wraps.
+impl<'t> ops::Deref for ByteInput<'t> {
+ type Target = [u8];
+
+ fn deref(&self) -> &[u8] {
+ self.text
+ }
+}
+
+impl<'t> Input for ByteInput<'t> {
+    fn at(&self, i: usize) -> InputAt {
+        // Offsets at or beyond the end all map to the end-of-input position.
+        if i >= self.len() {
+            return InputAt {
+                pos: self.len(),
+                c: None.into(),
+                byte: None,
+                len: 0,
+            };
+        }
+        // Byte-based positions never carry a character and are one byte wide.
+        InputAt { pos: i, c: None.into(), byte: self.get(i).cloned(), len: 1 }
+    }
+
+    fn next_char(&self, at: InputAt) -> Char {
+        let after = &self[at.pos()..];
+        decode_utf8(after).map(|(ch, _)| ch).into()
+    }
+
+    fn previous_char(&self, at: InputAt) -> Char {
+        let before = &self[..at.pos()];
+        decode_last_utf8(before).map(|(ch, _)| ch).into()
+    }
+
+    fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool {
+        use crate::prog::EmptyLook::*;
+        // Neighboring characters, decoded only when an assertion needs them.
+        let prev = || self.previous_char(at);
+        let next = || self.next_char(at);
+        match empty.look {
+            StartText => at.pos() == 0,
+            EndText => at.pos() == self.len(),
+            StartLine => at.pos() == 0 || prev() == '\n',
+            EndLine => at.pos() == self.len() || next() == '\n',
+            WordBoundary => prev().is_word_char() != next().is_word_char(),
+            NotWordBoundary => prev().is_word_char() == next().is_word_char(),
+            WordBoundaryAscii | NotWordBoundaryAscii => {
+                let (c1, c2) = (prev(), next());
+                if self.only_utf8 {
+                    // If we must match UTF-8, then we can't match word
+                    // boundaries at invalid UTF-8.
+                    let bad_prev = c1.is_none() && !at.is_start();
+                    let bad_next = c2.is_none() && !at.is_end();
+                    if bad_prev || bad_next {
+                        return false;
+                    }
+                }
+                let boundary = c1.is_word_byte() != c2.is_word_byte();
+                match empty.look {
+                    WordBoundaryAscii => boundary,
+                    _ => !boundary,
+                }
+            }
+        }
+    }
+
+    fn prefix_at(
+        &self,
+        prefixes: &LiteralSearcher,
+        at: InputAt,
+    ) -> Option<InputAt> {
+        let base = at.pos();
+        // `find` reports offsets relative to the slice it was given.
+        let (start, _) = prefixes.find(&self[base..])?;
+        Some(self.at(base + start))
+    }
+
+    fn len(&self) -> usize {
+        self.text.len()
+    }
+
+    fn as_bytes(&self) -> &[u8] {
+        self.text
+    }
+}
+
+/// An inline representation of `Option<char>`.
+///
+/// This eliminates the need to do case analysis on `Option<char>` to determine
+/// ordinality with other characters.
+///
+/// (The `Option<char>` is not related to encoding. Instead, it is used in the
+/// matching engines to represent the beginning and ending boundaries of the
+/// search text.)
+///
+/// A present character is stored as its scalar value; the absent case is
+/// stored as `u32::MAX`.
+#[derive(Clone, Copy, Hash, PartialEq, Eq, PartialOrd, Ord)]
+pub struct Char(u32);
+
+impl fmt::Debug for Char {
+    /// Formats a present character like `char`'s own `Debug` output and
+    /// anything that is not a valid `char` as `Empty`.
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        if let Some(c) = char::from_u32(self.0) {
+            write!(f, "{:?}", c)
+        } else {
+            write!(f, "Empty")
+        }
+    }
+}
+
+impl Char {
+    /// Returns true iff the character is absent.
+    #[inline]
+    pub fn is_none(self) -> bool {
+        u32::MAX == self.0
+    }
+
+    /// Returns the number of bytes in the character's UTF-8 encoding.
+    ///
+    /// An absent character reports a length of `1`.
+    #[inline]
+    pub fn len_utf8(self) -> usize {
+        match char::from_u32(self.0) {
+            Some(c) => c.len_utf8(),
+            None => 1,
+        }
+    }
+
+    /// Returns true iff the character is a word character.
+    ///
+    /// An absent character is never a word character.
+    pub fn is_word_char(self) -> bool {
+        // is_word_character can panic if the Unicode data for \w isn't
+        // available. However, our compiler ensures that if a Unicode word
+        // boundary is used, then the data must also be available. If it isn't,
+        // then the compiler returns an error.
+        match char::from_u32(self.0) {
+            Some(c) => regex_syntax::is_word_character(c),
+            None => false,
+        }
+    }
+
+    /// Returns true iff the character is an ASCII word byte.
+    ///
+    /// Absent and non-ASCII characters are never word bytes.
+    pub fn is_word_byte(self) -> bool {
+        char::from_u32(self.0).map_or(false, |c| {
+            c <= '\u{7F}' && regex_syntax::is_word_byte(c as u8)
+        })
+    }
+}
+
+impl From<char> for Char {
+    /// Stores the character as its Unicode scalar value.
+    fn from(c: char) -> Char {
+        Char(u32::from(c))
+    }
+}
+
+impl From<Option<char>> for Char {
+    /// Converts a possibly missing character; `None` becomes the absent
+    /// value `u32::MAX`.
+    fn from(c: Option<char>) -> Char {
+        match c {
+            Some(present) => Char::from(present),
+            None => Char(u32::MAX),
+        }
+    }
+}
+
+// Comparisons between `Char` and `char` work on the raw `u32` values.
+impl PartialEq<char> for Char {
+    #[inline]
+    fn eq(&self, other: &char) -> bool {
+        u32::from(*other) == self.0
+    }
+}
+
+impl PartialEq<Char> for char {
+    #[inline]
+    fn eq(&self, other: &Char) -> bool {
+        other.0 == u32::from(*self)
+    }
+}
+
+impl PartialOrd<char> for Char {
+    #[inline]
+    fn partial_cmp(&self, other: &char) -> Option<Ordering> {
+        Some(self.0.cmp(&u32::from(*other)))
+    }
+}
+
+impl PartialOrd<Char> for char {
+    #[inline]
+    fn partial_cmp(&self, other: &Char) -> Option<Ordering> {
+        Some(u32::from(*self).cmp(&other.0))
+    }
+}
diff --git a/third_party/rust/regex/src/lib.rs b/third_party/rust/regex/src/lib.rs
new file mode 100644
index 0000000000..6b95739c5c
--- /dev/null
+++ b/third_party/rust/regex/src/lib.rs
@@ -0,0 +1,769 @@
+/*!
+This crate provides a library for parsing, compiling, and executing regular
+expressions. Its syntax is similar to Perl-style regular expressions, but lacks
+a few features like look around and backreferences. In exchange, all searches
+execute in linear time with respect to the size of the regular expression and
+search text.
+
+This crate's documentation provides some simple examples, describes
+[Unicode support](#unicode) and exhaustively lists the
+[supported syntax](#syntax).
+
+For more specific details on the API for regular expressions, please see the
+documentation for the [`Regex`](struct.Regex.html) type.
+
+# Usage
+
+This crate is [on crates.io](https://crates.io/crates/regex) and can be
+used by adding `regex` to your dependencies in your project's `Cargo.toml`.
+
+```toml
+[dependencies]
+regex = "1"
+```
+
+# Example: find a date
+
+General use of regular expressions in this package involves compiling an
+expression and then using it to search, split or replace text. For example,
+to confirm that some text resembles a date:
+
+```rust
+use regex::Regex;
+let re = Regex::new(r"^\d{4}-\d{2}-\d{2}$").unwrap();
+assert!(re.is_match("2014-01-01"));
+```
+
+Notice the use of the `^` and `$` anchors. In this crate, every expression
+is executed with an implicit `.*?` at the beginning and end, which allows
+it to match anywhere in the text. Anchors can be used to ensure that the
+full text matches an expression.
+
+This example also demonstrates the utility of
+[raw strings](https://doc.rust-lang.org/stable/reference/tokens.html#raw-string-literals)
+in Rust, which
+are just like regular strings except they are prefixed with an `r` and do
+not process any escape sequences. For example, `"\\d"` is the same
+expression as `r"\d"`.
+
+# Example: Avoid compiling the same regex in a loop
+
+It is an anti-pattern to compile the same regular expression in a loop
+since compilation is typically expensive. (It takes anywhere from a few
+microseconds to a few **milliseconds** depending on the size of the
+regex.) Not only is compilation itself expensive, but this also prevents
+optimizations that reuse allocations internally to the matching engines.
+
+In Rust, it can sometimes be a pain to pass regular expressions around if
+they're used from inside a helper function. Instead, we recommend using the
+[`lazy_static`](https://crates.io/crates/lazy_static) crate to ensure that
+regular expressions are compiled exactly once.
+
+For example:
+
+```rust
+use lazy_static::lazy_static;
+use regex::Regex;
+
+fn some_helper_function(text: &str) -> bool {
+ lazy_static! {
+ static ref RE: Regex = Regex::new("...").unwrap();
+ }
+ RE.is_match(text)
+}
+
+fn main() {}
+```
+
+Specifically, in this example, the regex will be compiled when it is used for
+the first time. On subsequent uses, it will reuse the previous compilation.
+
+# Example: iterating over capture groups
+
+This crate provides convenient iterators for matching an expression
+repeatedly against a search string to find successive non-overlapping
+matches. For example, to find all dates in a string and be able to access
+them by their component pieces:
+
+```rust
+# use regex::Regex;
+# fn main() {
+let re = Regex::new(r"(\d{4})-(\d{2})-(\d{2})").unwrap();
+let text = "2012-03-14, 2013-01-01 and 2014-07-05";
+for cap in re.captures_iter(text) {
+ println!("Month: {} Day: {} Year: {}", &cap[2], &cap[3], &cap[1]);
+}
+// Output:
+// Month: 03 Day: 14 Year: 2012
+// Month: 01 Day: 01 Year: 2013
+// Month: 07 Day: 05 Year: 2014
+# }
+```
+
+Notice that the year is in the capture group indexed at `1`. This is
+because the *entire match* is stored in the capture group at index `0`.
+
+# Example: replacement with named capture groups
+
+Building on the previous example, perhaps we'd like to rearrange the date
+formats. This can be done with text replacement. But to make the code
+clearer, we can *name* our capture groups and use those names as variables
+in our replacement text:
+
+```rust
+# use regex::Regex;
+# fn main() {
+let re = Regex::new(r"(?P<y>\d{4})-(?P<m>\d{2})-(?P<d>\d{2})").unwrap();
+let before = "2012-03-14, 2013-01-01 and 2014-07-05";
+let after = re.replace_all(before, "$m/$d/$y");
+assert_eq!(after, "03/14/2012, 01/01/2013 and 07/05/2014");
+# }
+```
+
+The `replace` methods are actually polymorphic in the replacement, which
+provides more flexibility than is seen here. (See the documentation for
+`Regex::replace` for more details.)
+
+Note that if your regex gets complicated, you can use the `x` flag to
+enable insignificant whitespace mode, which also lets you write comments:
+
+```rust
+# use regex::Regex;
+# fn main() {
+let re = Regex::new(r"(?x)
+ (?P<y>\d{4}) # the year
+ -
+ (?P<m>\d{2}) # the month
+ -
+ (?P<d>\d{2}) # the day
+").unwrap();
+let before = "2012-03-14, 2013-01-01 and 2014-07-05";
+let after = re.replace_all(before, "$m/$d/$y");
+assert_eq!(after, "03/14/2012, 01/01/2013 and 07/05/2014");
+# }
+```
+
+If you wish to match against whitespace in this mode, you can still use `\s`,
+`\n`, `\t`, etc. For escaping a single space character, you can escape it
+directly with `\ `, use its hex character code `\x20` or temporarily disable
+the `x` flag, e.g., `(?-x: )`.
+
+# Example: match multiple regular expressions simultaneously
+
+This demonstrates how to use a `RegexSet` to match multiple (possibly
+overlapping) regular expressions in a single scan of the search text:
+
+```rust
+use regex::RegexSet;
+
+let set = RegexSet::new(&[
+ r"\w+",
+ r"\d+",
+ r"\pL+",
+ r"foo",
+ r"bar",
+ r"barfoo",
+ r"foobar",
+]).unwrap();
+
+// Iterate over and collect all of the matches.
+let matches: Vec<_> = set.matches("foobar").into_iter().collect();
+assert_eq!(matches, vec![0, 2, 3, 4, 6]);
+
+// You can also test whether a particular regex matched:
+let matches = set.matches("foobar");
+assert!(!matches.matched(5));
+assert!(matches.matched(6));
+```
+
+# Pay for what you use
+
+With respect to searching text with a regular expression, there are three
+questions that can be asked:
+
+1. Does the text match this expression?
+2. If so, where does it match?
+3. Where did the capturing groups match?
+
+Generally speaking, this crate could provide a function to answer only #3,
+which would subsume #1 and #2 automatically. However, it can be significantly
+more expensive to compute the location of capturing group matches, so it's best
+not to do it if you don't need to.
+
+Therefore, only use what you need. For example, don't use `find` if you
+only need to test if an expression matches a string. (Use `is_match`
+instead.)
+
+# Unicode
+
+This implementation executes regular expressions **only** on valid UTF-8
+while exposing match locations as byte indices into the search string. (To
+relax this restriction, use the [`bytes`](bytes/index.html) sub-module.)
+
+Only simple case folding is supported. Namely, when matching
+case-insensitively, the characters are first mapped using the "simple" case
+folding rules defined by Unicode.
+
+Regular expressions themselves are **only** interpreted as a sequence of
+Unicode scalar values. This means you can use Unicode characters directly
+in your expression:
+
+```rust
+# use regex::Regex;
+# fn main() {
+let re = Regex::new(r"(?i)Δ+").unwrap();
+let mat = re.find("ΔδΔ").unwrap();
+assert_eq!((mat.start(), mat.end()), (0, 6));
+# }
+```
+
+Most features of the regular expressions in this crate are Unicode aware. Here
+are some examples:
+
+* `.` will match any valid UTF-8 encoded Unicode scalar value except for `\n`.
+ (To also match `\n`, enable the `s` flag, e.g., `(?s:.)`.)
+* `\w`, `\d` and `\s` are Unicode aware. For example, `\s` will match all forms
+ of whitespace categorized by Unicode.
+* `\b` matches a Unicode word boundary.
+* Negated character classes like `[^a]` match all Unicode scalar values except
+ for `a`.
+* `^` and `$` are **not** Unicode aware in multi-line mode. Namely, they only
+ recognize `\n` and not any of the other forms of line terminators defined
+ by Unicode.
+
+Unicode general categories, scripts, script extensions, ages and a smattering
+of boolean properties are available as character classes. For example, you can
+match a sequence of numerals, Greek or Cherokee letters:
+
+```rust
+# use regex::Regex;
+# fn main() {
+let re = Regex::new(r"[\pN\p{Greek}\p{Cherokee}]+").unwrap();
+let mat = re.find("abcΔᎠβⅠᏴγδⅡxyz").unwrap();
+assert_eq!((mat.start(), mat.end()), (3, 23));
+# }
+```
+
+For a more detailed breakdown of Unicode support with respect to
+[UTS#18](https://unicode.org/reports/tr18/),
+please see the
+[UNICODE](https://github.com/rust-lang/regex/blob/master/UNICODE.md)
+document in the root of the regex repository.
+
+# Opt out of Unicode support
+
+The `bytes` sub-module provides a `Regex` type that can be used to match
+on `&[u8]`. By default, text is interpreted as UTF-8 just like it is with
+the main `Regex` type. However, this behavior can be disabled by turning
+off the `u` flag, even if doing so could result in matching invalid UTF-8.
+For example, when the `u` flag is disabled, `.` will match any byte instead
+of any Unicode scalar value.
+
+Disabling the `u` flag is also possible with the standard `&str`-based `Regex`
+type, but it is only allowed where the UTF-8 invariant is maintained. For
+example, `(?-u:\w)` is an ASCII-only `\w` character class and is legal in an
+`&str`-based `Regex`, but `(?-u:\xFF)` will attempt to match the raw byte
+`\xFF`, which is invalid UTF-8 and therefore is illegal in `&str`-based
+regexes.
+
+Finally, since Unicode support requires bundling large Unicode data
+tables, this crate exposes knobs to disable the compilation of those
+data tables, which can be useful for shrinking binary size and reducing
+compilation times. For details on how to do that, see the section on [crate
+features](#crate-features).
+
+# Syntax
+
+The syntax supported in this crate is documented below.
+
+Note that the regular expression parser and abstract syntax are exposed in
+a separate crate, [`regex-syntax`](https://docs.rs/regex-syntax).
+
+## Matching one character
+
+<pre class="rust">
+. any character except new line (includes new line with s flag)
+\d digit (\p{Nd})
+\D not digit
+\pN One-letter name Unicode character class
+\p{Greek} Unicode character class (general category or script)
+\PN Negated one-letter name Unicode character class
+\P{Greek} negated Unicode character class (general category or script)
+</pre>
+
+### Character classes
+
+<pre class="rust">
+[xyz] A character class matching either x, y or z (union).
+[^xyz] A character class matching any character except x, y and z.
+[a-z] A character class matching any character in range a-z.
+[[:alpha:]] ASCII character class ([A-Za-z])
+[[:^alpha:]] Negated ASCII character class ([^A-Za-z])
+[x[^xyz]] Nested/grouping character class (matching any character except y and z)
+[a-y&&xyz] Intersection (matching x or y)
+[0-9&&[^4]] Subtraction using intersection and negation (matching 0-9 except 4)
+[0-9--4] Direct subtraction (matching 0-9 except 4)
+[a-g~~b-h] Symmetric difference (matching `a` and `h` only)
+[\[\]] Escaping in character classes (matching [ or ])
+</pre>
+
+Any named character class may appear inside a bracketed `[...]` character
+class. For example, `[\p{Greek}[:digit:]]` matches any Greek or ASCII
+digit. `[\p{Greek}&&\pL]` matches Greek letters.
+
+Precedence in character classes, from most binding to least:
+
+1. Ranges: `a-cd` == `[a-c]d`
+2. Union: `ab&&bc` == `[ab]&&[bc]`
+3. Intersection: `^a-z&&b` == `^[a-z&&b]`
+4. Negation
+
+## Composites
+
+<pre class="rust">
+xy concatenation (x followed by y)
+x|y alternation (x or y, prefer x)
+</pre>
+
+## Repetitions
+
+<pre class="rust">
+x* zero or more of x (greedy)
+x+ one or more of x (greedy)
+x? zero or one of x (greedy)
+x*? zero or more of x (ungreedy/lazy)
+x+? one or more of x (ungreedy/lazy)
+x?? zero or one of x (ungreedy/lazy)
+x{n,m} at least n x and at most m x (greedy)
+x{n,} at least n x (greedy)
+x{n} exactly n x
+x{n,m}? at least n x and at most m x (ungreedy/lazy)
+x{n,}? at least n x (ungreedy/lazy)
+x{n}? exactly n x
+</pre>
+
+## Empty matches
+
+<pre class="rust">
+^ the beginning of text (or start-of-line with multi-line mode)
+$ the end of text (or end-of-line with multi-line mode)
+\A only the beginning of text (even with multi-line mode enabled)
+\z only the end of text (even with multi-line mode enabled)
+\b a Unicode word boundary (\w on one side and \W, \A, or \z on other)
+\B not a Unicode word boundary
+</pre>
+
+The empty regex is valid and matches the empty string. For example, the empty
+regex matches `abc` at positions `0`, `1`, `2` and `3`.
+
+## Grouping and flags
+
+<pre class="rust">
+(exp) numbered capture group (indexed by opening parenthesis)
+(?P&lt;name&gt;exp) named (also numbered) capture group (allowed chars: [_0-9a-zA-Z.\[\]])
+(?:exp) non-capturing group
+(?flags) set flags within current group
+(?flags:exp) set flags for exp (non-capturing)
+</pre>
+
+Flags are each a single character. For example, `(?x)` sets the flag `x`
+and `(?-x)` clears the flag `x`. Multiple flags can be set or cleared at
+the same time: `(?xy)` sets both the `x` and `y` flags and `(?x-y)` sets
+the `x` flag and clears the `y` flag.
+
+All flags are by default disabled unless stated otherwise. They are:
+
+<pre class="rust">
+i case-insensitive: letters match both upper and lower case
+m multi-line mode: ^ and $ match begin/end of line
+s allow . to match \n
+U swap the meaning of x* and x*?
+u Unicode support (enabled by default)
+x ignore whitespace and allow line comments (starting with `#`)
+</pre>
+
+Flags can be toggled within a pattern. Here's an example that matches
+case-insensitively for the first part but case-sensitively for the second part:
+
+```rust
+# use regex::Regex;
+# fn main() {
+let re = Regex::new(r"(?i)a+(?-i)b+").unwrap();
+let cap = re.captures("AaAaAbbBBBb").unwrap();
+assert_eq!(&cap[0], "AaAaAbb");
+# }
+```
+
+Notice that the `a+` matches either `a` or `A`, but the `b+` only matches
+`b`.
+
+Multi-line mode means `^` and `$` no longer match just at the beginning/end of
+the input, but at the beginning/end of lines:
+
+```
+# use regex::Regex;
+let re = Regex::new(r"(?m)^line \d+").unwrap();
+let m = re.find("line one\nline 2\n").unwrap();
+assert_eq!(m.as_str(), "line 2");
+```
+
+Note that `^` matches after new lines, even at the end of input:
+
+```
+# use regex::Regex;
+let re = Regex::new(r"(?m)^").unwrap();
+let m = re.find_iter("test\n").last().unwrap();
+assert_eq!((m.start(), m.end()), (5, 5));
+```
+
+Here is an example that uses an ASCII word boundary instead of a Unicode
+word boundary:
+
+```rust
+# use regex::Regex;
+# fn main() {
+let re = Regex::new(r"(?-u:\b).+(?-u:\b)").unwrap();
+let cap = re.captures("$$abc$$").unwrap();
+assert_eq!(&cap[0], "abc");
+# }
+```
+
+## Escape sequences
+
+<pre class="rust">
+\* literal *, works for any punctuation character: \.+*?()|[]{}^$
+\a bell (\x07)
+\f form feed (\x0C)
+\t horizontal tab
+\n new line
+\r carriage return
+\v vertical tab (\x0B)
+\123 octal character code (up to three digits) (when enabled)
+\x7F hex character code (exactly two digits)
+\x{10FFFF} any hex character code corresponding to a Unicode code point
+\u007F hex character code (exactly four digits)
+\u{7F} any hex character code corresponding to a Unicode code point
+\U0000007F hex character code (exactly eight digits)
+\U{7F} any hex character code corresponding to a Unicode code point
+</pre>
+
+## Perl character classes (Unicode friendly)
+
+These classes are based on the definitions provided in
+[UTS#18](https://www.unicode.org/reports/tr18/#Compatibility_Properties):
+
+<pre class="rust">
+\d digit (\p{Nd})
+\D not digit
+\s whitespace (\p{White_Space})
+\S not whitespace
+\w word character (\p{Alphabetic} + \p{M} + \d + \p{Pc} + \p{Join_Control})
+\W not word character
+</pre>
+
+## ASCII character classes
+
+<pre class="rust">
+[[:alnum:]] alphanumeric ([0-9A-Za-z])
+[[:alpha:]] alphabetic ([A-Za-z])
+[[:ascii:]] ASCII ([\x00-\x7F])
+[[:blank:]] blank ([\t ])
+[[:cntrl:]] control ([\x00-\x1F\x7F])
+[[:digit:]] digits ([0-9])
+[[:graph:]] graphical ([!-~])
+[[:lower:]] lower case ([a-z])
+[[:print:]] printable ([ -~])
+[[:punct:]] punctuation ([!-/:-@\[-`{-~])
+[[:space:]] whitespace ([\t\n\v\f\r ])
+[[:upper:]] upper case ([A-Z])
+[[:word:]] word characters ([0-9A-Za-z_])
+[[:xdigit:]] hex digit ([0-9A-Fa-f])
+</pre>
+
+# Crate features
+
+By default, this crate tries pretty hard to make regex matching both as fast
+as possible and as correct as it can be, within reason. This means that there
+is a lot of code dedicated to performance, the handling of Unicode data and the
+Unicode data itself. Overall, this leads to more dependencies, larger binaries
+and longer compile times. This trade off may not be appropriate in all cases,
+and indeed, even when all Unicode and performance features are disabled, one
+is still left with a perfectly serviceable regex engine that will work well
+in many cases.
+
+This crate exposes a number of features for controlling that trade off. Some
+of these features are strictly performance oriented, such that disabling them
+won't result in a loss of functionality, but may result in worse performance.
+Other features, such as the ones controlling the presence or absence of Unicode
+data, can result in a loss of functionality. For example, if one disables the
+`unicode-case` feature (described below), then compiling the regex `(?i)a`
+will fail since Unicode case insensitivity is enabled by default. Instead,
+callers must use `(?i-u)a` to disable Unicode case folding. Stated
+differently, enabling or disabling any of the features below can only add or
+subtract from the total set of valid regular expressions. Enabling or disabling
+a feature will never modify the match semantics of a regular expression.
+
+All features below are enabled by default.
+
+### Ecosystem features
+
+* **std** -
+ When enabled, this will cause `regex` to use the standard library. Currently,
+ disabling this feature will always result in a compilation error. It is
+ intended to add `alloc`-only support to regex in the future.
+
+### Performance features
+
+* **perf** -
+ Enables all performance related features. This feature is enabled by default
+ and will always cover all features that improve performance, even if more
+ are added in the future.
+* **perf-dfa** -
+ Enables the use of a lazy DFA for matching. The lazy DFA is used to compile
+ portions of a regex to a very fast DFA on an as-needed basis. This can
+ result in substantial speedups, usually by an order of magnitude on large
+ haystacks. The lazy DFA does not bring in any new dependencies, but it can
+ make compile times longer.
+* **perf-inline** -
+ Enables the use of aggressive inlining inside match routines. This reduces
+ the overhead of each match. The aggressive inlining, however, increases
+ compile times and binary size.
+* **perf-literal** -
+ Enables the use of literal optimizations for speeding up matches. In some
+ cases, literal optimizations can result in speedups of _several_ orders of
+ magnitude. Disabling this drops the `aho-corasick` and `memchr` dependencies.
+* **perf-cache** -
+ This feature used to enable a faster internal cache at the cost of using
+ additional dependencies, but this is no longer an option. A fast internal
+ cache is now used unconditionally with no additional dependencies. This may
+ change in the future.
+
+### Unicode features
+
+* **unicode** -
+ Enables all Unicode features. This feature is enabled by default, and will
+ always cover all Unicode features, even if more are added in the future.
+* **unicode-age** -
+ Provide the data for the
+ [Unicode `Age` property](https://www.unicode.org/reports/tr44/tr44-24.html#Character_Age).
+ This makes it possible to use classes like `\p{Age:6.0}` to refer to all
+ codepoints first introduced in Unicode 6.0.
+* **unicode-bool** -
+ Provide the data for numerous Unicode boolean properties. The full list
+ is not included here, but contains properties like `Alphabetic`, `Emoji`,
+ `Lowercase`, `Math`, `Uppercase` and `White_Space`.
+* **unicode-case** -
+ Provide the data for case insensitive matching using
+ [Unicode's "simple loose matches" specification](https://www.unicode.org/reports/tr18/#Simple_Loose_Matches).
+* **unicode-gencat** -
+ Provide the data for
+ [Unicode general categories](https://www.unicode.org/reports/tr44/tr44-24.html#General_Category_Values).
+ This includes, but is not limited to, `Decimal_Number`, `Letter`,
+ `Math_Symbol`, `Number` and `Punctuation`.
+* **unicode-perl** -
+ Provide the data for supporting the Unicode-aware Perl character classes,
+ corresponding to `\w`, `\s` and `\d`. This is also necessary for using
+ Unicode-aware word boundary assertions. Note that if this feature is
+ disabled, the `\s` and `\d` character classes are still available if the
+ `unicode-bool` and `unicode-gencat` features are enabled, respectively.
+* **unicode-script** -
+ Provide the data for
+ [Unicode scripts and script extensions](https://www.unicode.org/reports/tr24/).
+ This includes, but is not limited to, `Arabic`, `Cyrillic`, `Hebrew`,
+ `Latin` and `Thai`.
+* **unicode-segment** -
+ Provide the data necessary to support the properties used to implement the
+ [Unicode text segmentation algorithms](https://www.unicode.org/reports/tr29/).
+ This enables using classes like `\p{gcb=Extend}`, `\p{wb=Katakana}` and
+ `\p{sb=ATerm}`.
+
+
+# Untrusted input
+
+This crate can handle both untrusted regular expressions and untrusted
+search text.
+
+Untrusted regular expressions are handled by capping the size of a compiled
+regular expression.
+(See [`RegexBuilder::size_limit`](struct.RegexBuilder.html#method.size_limit).)
+Without this, it would be trivial for an attacker to exhaust your system's
+memory with expressions like `a{100}{100}{100}`.
+
+Untrusted search text is allowed because the matching engine(s) in this
+crate have time complexity `O(mn)` (with `m ~ regex` and `n ~ search
+text`), which means there's no way to cause exponential blow-up like with
+some other regular expression engines. (We pay for this by disallowing
+features like arbitrary look-ahead and backreferences.)
+
+When a DFA is used, pathological cases with exponential state blow-up are
+avoided by constructing the DFA lazily or in an "online" manner. Therefore,
+at most one new state can be created for each byte of input. This satisfies
+our time complexity guarantees, but can lead to memory growth
+proportional to the size of the input. As a stopgap, the DFA is only
+allowed to store a fixed number of states. When the limit is reached, its
+states are wiped and it continues on, possibly duplicating previous work. If
+the limit is reached too frequently, it gives up and hands control off to
+another matching engine with fixed memory requirements.
+(The DFA size limit can also be tweaked. See
+[`RegexBuilder::dfa_size_limit`](struct.RegexBuilder.html#method.dfa_size_limit).)
+*/
+
+#![deny(missing_docs)]
+#![cfg_attr(feature = "pattern", feature(pattern))]
+#![warn(missing_debug_implementations)]
+
+#[cfg(not(feature = "std"))]
+compile_error!("`std` feature is currently required to build this crate");
+
+// To check README's example
+// TODO: Re-enable this once the MSRV is 1.43 or greater.
+// See: https://github.com/rust-lang/regex/issues/684
+// See: https://github.com/rust-lang/regex/issues/685
+// #[cfg(doctest)]
+// doc_comment::doctest!("../README.md");
+
+#[cfg(feature = "std")]
+pub use crate::error::Error;
+#[cfg(feature = "std")]
+pub use crate::re_builder::set_unicode::*;
+#[cfg(feature = "std")]
+pub use crate::re_builder::unicode::*;
+#[cfg(feature = "std")]
+pub use crate::re_set::unicode::*;
+#[cfg(feature = "std")]
+pub use crate::re_unicode::{
+ escape, CaptureLocations, CaptureMatches, CaptureNames, Captures,
+ Locations, Match, Matches, NoExpand, Regex, Replacer, ReplacerRef, Split,
+ SplitN, SubCaptureMatches,
+};
+
+/**
+Match regular expressions on arbitrary bytes.
+
+This module provides a nearly identical API to the one found in the
+top-level of this crate. There are two important differences:
+
+1. Matching is done on `&[u8]` instead of `&str`. Additionally, `Vec<u8>`
+is used where `String` would have been used.
+2. Unicode support can be disabled even when disabling it would result in
+matching invalid UTF-8 bytes.
+
+# Example: match null terminated string
+
+This shows how to find all null-terminated strings in a slice of bytes:
+
+```rust
+# use regex::bytes::Regex;
+let re = Regex::new(r"(?-u)(?P<cstr>[^\x00]+)\x00").unwrap();
+let text = b"foo\x00bar\x00baz\x00";
+
+// Extract all of the strings without the null terminator from each match.
+// The unwrap is OK here since a match requires the `cstr` capture to match.
+let cstrs: Vec<&[u8]> =
+ re.captures_iter(text)
+ .map(|c| c.name("cstr").unwrap().as_bytes())
+ .collect();
+assert_eq!(vec![&b"foo"[..], &b"bar"[..], &b"baz"[..]], cstrs);
+```
+
+# Example: selectively enable Unicode support
+
+This shows how to match an arbitrary byte pattern followed by a UTF-8 encoded
+string (e.g., to extract a title from a Matroska file):
+
+```rust
+# use std::str;
+# use regex::bytes::Regex;
+let re = Regex::new(
+ r"(?-u)\x7b\xa9(?:[\x80-\xfe]|[\x40-\xff].)(?u:(.*))"
+).unwrap();
+let text = b"\x12\xd0\x3b\x5f\x7b\xa9\x85\xe2\x98\x83\x80\x98\x54\x76\x68\x65";
+let caps = re.captures(text).unwrap();
+
+// Notice that despite the `.*` at the end, it will only match valid UTF-8
+// because Unicode mode was enabled with the `u` flag. Without the `u` flag,
+// the `.*` would match the rest of the bytes.
+let mat = caps.get(1).unwrap();
+assert_eq!((7, 10), (mat.start(), mat.end()));
+
+// If there was a match, Unicode mode guarantees that `title` is valid UTF-8.
+let title = str::from_utf8(&caps[1]).unwrap();
+assert_eq!("☃", title);
+```
+
+In general, if the Unicode flag is enabled in a capture group and that capture
+is part of the overall match, then the capture is *guaranteed* to be valid
+UTF-8.
+
+# Syntax
+
+The supported syntax is pretty much the same as the syntax for Unicode
+regular expressions with a few changes that make sense for matching arbitrary
+bytes:
+
+1. The `u` flag can be disabled even when disabling it might cause the regex to
+match invalid UTF-8. When the `u` flag is disabled, the regex is said to be in
+"ASCII compatible" mode.
+2. In ASCII compatible mode, neither Unicode scalar values nor Unicode
+character classes are allowed.
+3. In ASCII compatible mode, Perl character classes (`\w`, `\d` and `\s`)
+revert to their typical ASCII definition. `\w` maps to `[[:word:]]`, `\d` maps
+to `[[:digit:]]` and `\s` maps to `[[:space:]]`.
+4. In ASCII compatible mode, word boundaries use the ASCII compatible `\w` to
+determine whether a byte is a word byte or not.
+5. Hexadecimal notation can be used to specify arbitrary bytes instead of
+Unicode codepoints. For example, in ASCII compatible mode, `\xFF` matches the
+literal byte `\xFF`, while in Unicode mode, `\xFF` is a Unicode codepoint that
+matches its UTF-8 encoding of `\xC3\xBF`. Similarly for octal notation when
+enabled.
+6. In ASCII compatible mode, `.` matches any *byte* except for `\n`. When the
+`s` flag is additionally enabled, `.` matches any byte.
+
+# Performance
+
+In general, one should expect performance on `&[u8]` to be roughly similar to
+performance on `&str`.
+*/
+#[cfg(feature = "std")]
+pub mod bytes {
+ pub use crate::re_builder::bytes::*;
+ pub use crate::re_builder::set_bytes::*;
+ pub use crate::re_bytes::*;
+ pub use crate::re_set::bytes::*;
+}
+
+mod backtrack;
+mod compile;
+#[cfg(feature = "perf-dfa")]
+mod dfa;
+mod error;
+mod exec;
+mod expand;
+mod find_byte;
+mod input;
+mod literal;
+#[cfg(feature = "pattern")]
+mod pattern;
+mod pikevm;
+mod pool;
+mod prog;
+mod re_builder;
+mod re_bytes;
+mod re_set;
+mod re_trait;
+mod re_unicode;
+mod sparse;
+mod utf8;
+
+/// The `internal` module exists to support suspicious activity, such as
+/// testing different matching engines and supporting the `regex-debug` CLI
+/// utility.
+#[doc(hidden)]
+#[cfg(feature = "std")]
+pub mod internal {
+ pub use crate::compile::Compiler;
+ pub use crate::exec::{Exec, ExecBuilder};
+ pub use crate::input::{Char, CharInput, Input, InputAt};
+ pub use crate::literal::LiteralSearcher;
+ pub use crate::prog::{EmptyLook, Inst, InstRanges, Program};
+}
diff --git a/third_party/rust/regex/src/literal/imp.rs b/third_party/rust/regex/src/literal/imp.rs
new file mode 100644
index 0000000000..90b2f11606
--- /dev/null
+++ b/third_party/rust/regex/src/literal/imp.rs
@@ -0,0 +1,402 @@
+use std::mem;
+
+use aho_corasick::{self, packed, AhoCorasick, AhoCorasickBuilder};
+use memchr::{memchr, memchr2, memchr3, memmem};
+use regex_syntax::hir::literal::{Literal, Literals};
+
+/// A prefix extracted from a compiled regular expression.
+///
+/// A regex prefix is a set of literal strings that *must* be matched at the
+/// beginning of a regex in order for the entire regex to match. Similarly
+/// for a regex suffix.
+#[derive(Clone, Debug)]
+pub struct LiteralSearcher {
+ complete: bool,
+ lcp: Memmem,
+ lcs: Memmem,
+ matcher: Matcher,
+}
+
+#[derive(Clone, Debug)]
+enum Matcher {
+ /// No literals. (Never advances through the input.)
+ Empty,
+ /// A set of single byte literals (any size below the cutoff in `new`).
+ Bytes(SingleByteSet),
+ /// A single substring, using vector accelerated routines when available.
+ Memmem(Memmem),
+ /// An Aho-Corasick automaton.
+ AC { ac: AhoCorasick<u32>, lits: Vec<Literal> },
+ /// A packed multiple substring searcher, using SIMD.
+ ///
+ /// Note that Aho-Corasick will actually use this packed searcher
+ /// internally automatically, however, there is some overhead associated
+ /// with going through the Aho-Corasick machinery. So using the packed
+ /// searcher directly results in some gains.
+ Packed { s: packed::Searcher, lits: Vec<Literal> },
+}
+
+impl LiteralSearcher {
+ /// Returns a matcher that never matches and never advances the input.
+ pub fn empty() -> Self {
+ Self::new(Literals::empty(), Matcher::Empty)
+ }
+
+ /// Returns a matcher for literal prefixes from the given set.
+ pub fn prefixes(lits: Literals) -> Self {
+ let matcher = Matcher::prefixes(&lits);
+ Self::new(lits, matcher)
+ }
+
+ /// Returns a matcher for literal suffixes from the given set.
+ pub fn suffixes(lits: Literals) -> Self {
+ let matcher = Matcher::suffixes(&lits);
+ Self::new(lits, matcher)
+ }
+
+ fn new(lits: Literals, matcher: Matcher) -> Self {
+ let complete = lits.all_complete();
+ LiteralSearcher {
+ complete,
+ lcp: Memmem::new(lits.longest_common_prefix()),
+ lcs: Memmem::new(lits.longest_common_suffix()),
+ matcher,
+ }
+ }
+
+ /// Returns true if all matches comprise the entire regular expression.
+ ///
+ /// This does not necessarily mean that a literal match implies a match
+ /// of the regular expression. For example, the regular expression `^a`
+ /// is comprised of a single complete literal `a`, but the regular
+ /// expression demands that it only match at the beginning of a string.
+ pub fn complete(&self) -> bool {
+ self.complete && !self.is_empty()
+ }
+
+ /// Find the position of a literal in `haystack` if it exists.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub fn find(&self, haystack: &[u8]) -> Option<(usize, usize)> {
+ use self::Matcher::*;
+ match self.matcher {
+ Empty => Some((0, 0)),
+ Bytes(ref sset) => sset.find(haystack).map(|i| (i, i + 1)),
+ Memmem(ref s) => s.find(haystack).map(|i| (i, i + s.len())),
+ AC { ref ac, .. } => {
+ ac.find(haystack).map(|m| (m.start(), m.end()))
+ }
+ Packed { ref s, .. } => {
+ s.find(haystack).map(|m| (m.start(), m.end()))
+ }
+ }
+ }
+
+ /// Like find, except matches must start at index `0`.
+ pub fn find_start(&self, haystack: &[u8]) -> Option<(usize, usize)> {
+ for lit in self.iter() {
+ if lit.len() > haystack.len() {
+ continue;
+ }
+ if lit == &haystack[0..lit.len()] {
+ return Some((0, lit.len()));
+ }
+ }
+ None
+ }
+
+ /// Like find, except matches must end at index `haystack.len()`.
+ pub fn find_end(&self, haystack: &[u8]) -> Option<(usize, usize)> {
+ for lit in self.iter() {
+ if lit.len() > haystack.len() {
+ continue;
+ }
+ if lit == &haystack[haystack.len() - lit.len()..] {
+ return Some((haystack.len() - lit.len(), haystack.len()));
+ }
+ }
+ None
+ }
+
+ /// Returns an iterator over all literals to be matched.
+ pub fn iter(&self) -> LiteralIter<'_> {
+ match self.matcher {
+ Matcher::Empty => LiteralIter::Empty,
+ Matcher::Bytes(ref sset) => LiteralIter::Bytes(&sset.dense),
+ Matcher::Memmem(ref s) => LiteralIter::Single(&s.finder.needle()),
+ Matcher::AC { ref lits, .. } => LiteralIter::AC(lits),
+ Matcher::Packed { ref lits, .. } => LiteralIter::Packed(lits),
+ }
+ }
+
+ /// Returns a matcher for the longest common prefix of this matcher.
+ pub fn lcp(&self) -> &Memmem {
+ &self.lcp
+ }
+
+ /// Returns a matcher for the longest common suffix of this matcher.
+ pub fn lcs(&self) -> &Memmem {
+ &self.lcs
+ }
+
+ /// Returns true iff this prefix is empty.
+ pub fn is_empty(&self) -> bool {
+ self.len() == 0
+ }
+
+ /// Returns the number of prefixes in this machine.
+ pub fn len(&self) -> usize {
+ use self::Matcher::*;
+ match self.matcher {
+ Empty => 0,
+ Bytes(ref sset) => sset.dense.len(),
+ Memmem(_) => 1,
+ AC { ref ac, .. } => ac.pattern_count(),
+ Packed { ref lits, .. } => lits.len(),
+ }
+ }
+
+ /// Return the approximate heap usage of literals in bytes.
+ pub fn approximate_size(&self) -> usize {
+ use self::Matcher::*;
+ match self.matcher {
+ Empty => 0,
+ Bytes(ref sset) => sset.approximate_size(),
+ Memmem(ref single) => single.approximate_size(),
+ AC { ref ac, .. } => ac.heap_bytes(),
+ Packed { ref s, .. } => s.heap_bytes(),
+ }
+ }
+}
+
+impl Matcher {
+ fn prefixes(lits: &Literals) -> Self {
+ let sset = SingleByteSet::prefixes(lits);
+ Matcher::new(lits, sset)
+ }
+
+ fn suffixes(lits: &Literals) -> Self {
+ let sset = SingleByteSet::suffixes(lits);
+ Matcher::new(lits, sset)
+ }
+
+ fn new(lits: &Literals, sset: SingleByteSet) -> Self {
+ if lits.literals().is_empty() {
+ return Matcher::Empty;
+ }
+ if sset.dense.len() >= 26 {
+ // Avoid trying to match a large number of single bytes.
+ // This is *very* sensitive to a frequency analysis comparison
+ // between the bytes in sset and the composition of the haystack.
+ // No matter the size of sset, if its members all are rare in the
+ // haystack, then it'd be worth using it. How to tune this... IDK.
+ // ---AG
+ return Matcher::Empty;
+ }
+ if sset.complete {
+ return Matcher::Bytes(sset);
+ }
+ if lits.literals().len() == 1 {
+ return Matcher::Memmem(Memmem::new(&lits.literals()[0]));
+ }
+
+ let pats = lits.literals().to_owned();
+ let is_aho_corasick_fast = sset.dense.len() <= 1 && sset.all_ascii;
+ if lits.literals().len() <= 100 && !is_aho_corasick_fast {
+ let mut builder = packed::Config::new()
+ .match_kind(packed::MatchKind::LeftmostFirst)
+ .builder();
+ if let Some(s) = builder.extend(&pats).build() {
+ return Matcher::Packed { s, lits: pats };
+ }
+ }
+ let ac = AhoCorasickBuilder::new()
+ .match_kind(aho_corasick::MatchKind::LeftmostFirst)
+ .dfa(true)
+ .build_with_size::<u32, _, _>(&pats)
+ .unwrap();
+ Matcher::AC { ac, lits: pats }
+ }
+}
+
+#[derive(Debug)]
+pub enum LiteralIter<'a> {
+ Empty,
+ Bytes(&'a [u8]),
+ Single(&'a [u8]),
+ AC(&'a [Literal]),
+ Packed(&'a [Literal]),
+}
+
+impl<'a> Iterator for LiteralIter<'a> {
+ type Item = &'a [u8];
+
+ fn next(&mut self) -> Option<Self::Item> {
+ match *self {
+ LiteralIter::Empty => None,
+ LiteralIter::Bytes(ref mut many) => {
+ if many.is_empty() {
+ None
+ } else {
+ let next = &many[0..1];
+ *many = &many[1..];
+ Some(next)
+ }
+ }
+ LiteralIter::Single(ref mut one) => {
+ if one.is_empty() {
+ None
+ } else {
+ let next = &one[..];
+ *one = &[];
+ Some(next)
+ }
+ }
+ LiteralIter::AC(ref mut lits) => {
+ if lits.is_empty() {
+ None
+ } else {
+ let next = &lits[0];
+ *lits = &lits[1..];
+ Some(&**next)
+ }
+ }
+ LiteralIter::Packed(ref mut lits) => {
+ if lits.is_empty() {
+ None
+ } else {
+ let next = &lits[0];
+ *lits = &lits[1..];
+ Some(&**next)
+ }
+ }
+ }
+ }
+}
+
+#[derive(Clone, Debug)]
+struct SingleByteSet {
+ sparse: Vec<bool>,
+ dense: Vec<u8>,
+ complete: bool,
+ all_ascii: bool,
+}
+
+impl SingleByteSet {
+ fn new() -> SingleByteSet {
+ SingleByteSet {
+ sparse: vec![false; 256],
+ dense: vec![],
+ complete: true,
+ all_ascii: true,
+ }
+ }
+
+ fn prefixes(lits: &Literals) -> SingleByteSet {
+ let mut sset = SingleByteSet::new();
+ for lit in lits.literals() {
+ sset.complete = sset.complete && lit.len() == 1;
+ if let Some(&b) = lit.get(0) {
+ if !sset.sparse[b as usize] {
+ if b > 0x7F {
+ sset.all_ascii = false;
+ }
+ sset.dense.push(b);
+ sset.sparse[b as usize] = true;
+ }
+ }
+ }
+ sset
+ }
+
+ fn suffixes(lits: &Literals) -> SingleByteSet {
+ let mut sset = SingleByteSet::new();
+ for lit in lits.literals() {
+ sset.complete = sset.complete && lit.len() == 1;
+ if let Some(&b) = lit.get(lit.len().checked_sub(1).unwrap()) {
+ if !sset.sparse[b as usize] {
+ if b > 0x7F {
+ sset.all_ascii = false;
+ }
+ sset.dense.push(b);
+ sset.sparse[b as usize] = true;
+ }
+ }
+ }
+ sset
+ }
+
+ /// Faster find that special cases certain sizes to use memchr.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn find(&self, text: &[u8]) -> Option<usize> {
+ match self.dense.len() {
+ 0 => None,
+ 1 => memchr(self.dense[0], text),
+ 2 => memchr2(self.dense[0], self.dense[1], text),
+ 3 => memchr3(self.dense[0], self.dense[1], self.dense[2], text),
+ _ => self._find(text),
+ }
+ }
+
+ /// Generic find that works on any sized set.
+ fn _find(&self, haystack: &[u8]) -> Option<usize> {
+ for (i, &b) in haystack.iter().enumerate() {
+ if self.sparse[b as usize] {
+ return Some(i);
+ }
+ }
+ None
+ }
+
+ fn approximate_size(&self) -> usize {
+ (self.dense.len() * mem::size_of::<u8>())
+ + (self.sparse.len() * mem::size_of::<bool>())
+ }
+}
+
+/// A simple wrapper around the memchr crate's memmem implementation.
+///
+/// The API this exposes mirrors the API of previous substring searchers that
+/// this supplanted.
+#[derive(Clone, Debug)]
+pub struct Memmem {
+ finder: memmem::Finder<'static>,
+ char_len: usize,
+}
+
+impl Memmem {
+ fn new(pat: &[u8]) -> Memmem {
+ Memmem {
+ finder: memmem::Finder::new(pat).into_owned(),
+ char_len: char_len_lossy(pat),
+ }
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub fn find(&self, haystack: &[u8]) -> Option<usize> {
+ self.finder.find(haystack)
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub fn is_suffix(&self, text: &[u8]) -> bool {
+ if text.len() < self.len() {
+ return false;
+ }
+ &text[text.len() - self.len()..] == self.finder.needle()
+ }
+
+ pub fn len(&self) -> usize {
+ self.finder.needle().len()
+ }
+
+ pub fn char_len(&self) -> usize {
+ self.char_len
+ }
+
+ fn approximate_size(&self) -> usize {
+ self.finder.needle().len() * mem::size_of::<u8>()
+ }
+}
+
+fn char_len_lossy(bytes: &[u8]) -> usize {
+ String::from_utf8_lossy(bytes).chars().count()
+}
diff --git a/third_party/rust/regex/src/literal/mod.rs b/third_party/rust/regex/src/literal/mod.rs
new file mode 100644
index 0000000000..980f523309
--- /dev/null
+++ b/third_party/rust/regex/src/literal/mod.rs
@@ -0,0 +1,55 @@
+pub use self::imp::*;
+
+#[cfg(feature = "perf-literal")]
+mod imp;
+
+#[allow(missing_docs)]
+#[cfg(not(feature = "perf-literal"))]
+mod imp {
+ use regex_syntax::hir::literal::Literals;
+
+ #[derive(Clone, Debug)]
+ pub struct LiteralSearcher(());
+
+ impl LiteralSearcher {
+ pub fn empty() -> Self {
+ LiteralSearcher(())
+ }
+
+ pub fn prefixes(_: Literals) -> Self {
+ LiteralSearcher(())
+ }
+
+ pub fn suffixes(_: Literals) -> Self {
+ LiteralSearcher(())
+ }
+
+ pub fn complete(&self) -> bool {
+ false
+ }
+
+ pub fn find(&self, _: &[u8]) -> Option<(usize, usize)> {
+ unreachable!()
+ }
+
+ pub fn find_start(&self, _: &[u8]) -> Option<(usize, usize)> {
+ unreachable!()
+ }
+
+ pub fn find_end(&self, _: &[u8]) -> Option<(usize, usize)> {
+ unreachable!()
+ }
+
+ pub fn is_empty(&self) -> bool {
+ true
+ }
+
+ pub fn len(&self) -> usize {
+ 0
+ }
+
+ pub fn approximate_size(&self) -> usize {
+ 0
+ }
+ }
+}
diff --git a/third_party/rust/regex/src/pattern.rs b/third_party/rust/regex/src/pattern.rs
new file mode 100644
index 0000000000..00549e5106
--- /dev/null
+++ b/third_party/rust/regex/src/pattern.rs
@@ -0,0 +1,63 @@
+use std::str::pattern::{Pattern, SearchStep, Searcher};
+
+use crate::re_unicode::{Matches, Regex};
+
+#[derive(Debug)]
+pub struct RegexSearcher<'r, 't> {
+ haystack: &'t str,
+ it: Matches<'r, 't>,
+ last_step_end: usize,
+ next_match: Option<(usize, usize)>,
+}
+
+impl<'r, 't> Pattern<'t> for &'r Regex {
+ type Searcher = RegexSearcher<'r, 't>;
+
+ fn into_searcher(self, haystack: &'t str) -> RegexSearcher<'r, 't> {
+ RegexSearcher {
+ haystack,
+ it: self.find_iter(haystack),
+ last_step_end: 0,
+ next_match: None,
+ }
+ }
+}
+
+unsafe impl<'r, 't> Searcher<'t> for RegexSearcher<'r, 't> {
+ #[inline]
+ fn haystack(&self) -> &'t str {
+ self.haystack
+ }
+
+ #[inline]
+ fn next(&mut self) -> SearchStep {
+ if let Some((s, e)) = self.next_match {
+ self.next_match = None;
+ self.last_step_end = e;
+ return SearchStep::Match(s, e);
+ }
+ match self.it.next() {
+ None => {
+ if self.last_step_end < self.haystack().len() {
+ let last = self.last_step_end;
+ self.last_step_end = self.haystack().len();
+ SearchStep::Reject(last, self.haystack().len())
+ } else {
+ SearchStep::Done
+ }
+ }
+ Some(m) => {
+ let (s, e) = (m.start(), m.end());
+ if s == self.last_step_end {
+ self.last_step_end = e;
+ SearchStep::Match(s, e)
+ } else {
+ self.next_match = Some((s, e));
+ let last = self.last_step_end;
+ self.last_step_end = s;
+ SearchStep::Reject(last, s)
+ }
+ }
+ }
+ }
+}
diff --git a/third_party/rust/regex/src/pikevm.rs b/third_party/rust/regex/src/pikevm.rs
new file mode 100644
index 0000000000..8c9eac2d39
--- /dev/null
+++ b/third_party/rust/regex/src/pikevm.rs
@@ -0,0 +1,360 @@
+// This module implements the Pike VM. That is, it guarantees linear time
+// search of a regex on any text with memory use proportional to the size of
+// the regex.
+//
+// It is equal in power to the backtracking engine in this crate, except the
+// backtracking engine is typically faster on small regexes/texts at the
+// expense of a bigger memory footprint.
+//
+// It can do more than the DFA can (specifically, record capture locations
+// and execute Unicode word boundary assertions), but at a slower speed.
+// Specifically, the Pike VM executes a DFA implicitly by repeatedly expanding
+// epsilon transitions. That is, the Pike VM engine can be in multiple states
+// at once, whereas the DFA is only ever in one state at a time.
+//
+// Therefore, the Pike VM is generally treated as the fallback when the other
+// matching engines either aren't feasible to run or are insufficient.
+
+use std::mem;
+
+use crate::exec::ProgramCache;
+use crate::input::{Input, InputAt};
+use crate::prog::{InstPtr, Program};
+use crate::re_trait::Slot;
+use crate::sparse::SparseSet;
+
+/// An NFA simulation matching engine.
+#[derive(Debug)]
+pub struct Fsm<'r, I> {
+ /// The sequence of opcodes (among other things) that is actually executed.
+ ///
+ /// The program may be byte oriented or Unicode codepoint oriented.
+ prog: &'r Program,
+ /// An explicit stack used for following epsilon transitions. (This is
+ /// borrowed from the cache.)
+ stack: &'r mut Vec<FollowEpsilon>,
+ /// The input to search.
+ input: I,
+}
+
+/// A cached allocation that can be reused on each execution.
+#[derive(Clone, Debug)]
+pub struct Cache {
+ /// A pair of ordered sets for tracking NFA states.
+ clist: Threads,
+ nlist: Threads,
+ /// An explicit stack used for following epsilon transitions.
+ stack: Vec<FollowEpsilon>,
+}
+
+/// An ordered set of NFA states and their captures.
+#[derive(Clone, Debug)]
+struct Threads {
+ /// An ordered set of opcodes (each opcode is an NFA state).
+ set: SparseSet,
+ /// Captures for every NFA state.
+ ///
+ /// It is stored in row-major order, where the columns are the capture
+ /// slots and the rows are the states.
+ caps: Vec<Slot>,
+ /// The number of capture slots stored per thread. (Every capture has
+ /// two slots.)
+ slots_per_thread: usize,
+}
+
+/// A representation of an explicit stack frame when following epsilon
+/// transitions. This is used to avoid recursion.
+#[derive(Clone, Debug)]
+enum FollowEpsilon {
+ /// Follow transitions at the given instruction pointer.
+ IP(InstPtr),
+ /// Restore the capture slot with the given position in the input.
+ Capture { slot: usize, pos: Slot },
+}
+
+impl Cache {
+ /// Create a new allocation used by the NFA machine to record execution
+ /// and captures.
+ pub fn new(_prog: &Program) -> Self {
+ Cache { clist: Threads::new(), nlist: Threads::new(), stack: vec![] }
+ }
+}
+
+impl<'r, I: Input> Fsm<'r, I> {
+ /// Execute the NFA matching engine.
+ ///
+ /// If there's a match, `exec` returns `true` and populates the given
+ /// captures accordingly.
+ pub fn exec(
+ prog: &'r Program,
+ cache: &ProgramCache,
+ matches: &mut [bool],
+ slots: &mut [Slot],
+ quit_after_match: bool,
+ input: I,
+ start: usize,
+ end: usize,
+ ) -> bool {
+ let mut cache = cache.borrow_mut();
+ let cache = &mut cache.pikevm;
+ cache.clist.resize(prog.len(), prog.captures.len());
+ cache.nlist.resize(prog.len(), prog.captures.len());
+ let at = input.at(start);
+ Fsm { prog, stack: &mut cache.stack, input }.exec_(
+ &mut cache.clist,
+ &mut cache.nlist,
+ matches,
+ slots,
+ quit_after_match,
+ at,
+ end,
+ )
+ }
+
+ fn exec_(
+ &mut self,
+ mut clist: &mut Threads,
+ mut nlist: &mut Threads,
+ matches: &mut [bool],
+ slots: &mut [Slot],
+ quit_after_match: bool,
+ mut at: InputAt,
+ end: usize,
+ ) -> bool {
+ let mut matched = false;
+ let mut all_matched = false;
+ clist.set.clear();
+ nlist.set.clear();
+ 'LOOP: loop {
+ if clist.set.is_empty() {
+ // Three ways to bail out when our current set of threads is
+ // empty.
+ //
+ // 1. We have a match---so we're done exploring any possible
+ // alternatives. Time to quit. (We can't do this if we're
+ // looking for matches for multiple regexes, unless we know
+ // they all matched.)
+ //
+ // 2. If the expression starts with a '^' we can terminate as
+ // soon as the last thread dies.
+ if (matched && matches.len() <= 1)
+ || all_matched
+ || (!at.is_start() && self.prog.is_anchored_start)
+ {
+ break;
+ }
+
+ // 3. If there's a literal prefix for the program, try to
+ // jump ahead quickly. If it can't be found, then we can
+ // bail out early.
+ if !self.prog.prefixes.is_empty() {
+ at = match self.input.prefix_at(&self.prog.prefixes, at) {
+ None => break,
+ Some(at) => at,
+ };
+ }
+ }
+
+ // This simulates a preceding '.*?' for every regex by adding
+ // a state starting at the current position in the input for the
+ // beginning of the program only if we don't already have a match.
+ if clist.set.is_empty()
+ || (!self.prog.is_anchored_start && !all_matched)
+ {
+ self.add(&mut clist, slots, 0, at);
+ }
+ // The previous call to "add" actually inspects the position just
+ // before the current character. For stepping through the machine,
+ // we want to look at the current character, so we advance the
+ // input.
+ let at_next = self.input.at(at.next_pos());
+ for i in 0..clist.set.len() {
+ let ip = clist.set[i];
+ if self.step(
+ &mut nlist,
+ matches,
+ slots,
+ clist.caps(ip),
+ ip,
+ at,
+ at_next,
+ ) {
+ matched = true;
+ all_matched = all_matched || matches.iter().all(|&b| b);
+ if quit_after_match {
+ // If we only care if a match occurs (not its
+ // position), then we can quit right now.
+ break 'LOOP;
+ }
+ if self.prog.matches.len() == 1 {
+ // We don't need to check the rest of the threads
+ // in this set because we've matched something
+ // ("leftmost-first"). However, we still need to check
+ // threads in the next set to support things like
+ // greedy matching.
+ //
+ // This is only true on normal regexes. For regex sets,
+ // we need to mush on to observe other matches.
+ break;
+ }
+ }
+ }
+ if at.pos() >= end {
+ break;
+ }
+ at = at_next;
+ mem::swap(clist, nlist);
+ nlist.set.clear();
+ }
+ matched
+ }
+
+ /// Step through the input, one token (byte or codepoint) at a time.
+ ///
+ /// nlist is the set of states that will be processed on the next token
+ /// in the input.
+ ///
+ /// slots is the set of captures passed by the caller of the NFA. They are
+ /// written to only when a match state is visited.
+ ///
+ /// thread_caps is the set of captures set for the current NFA state, ip.
+ ///
+ /// at and at_next are the current and next positions in the input. at or
+ /// at_next may be EOF.
+ fn step(
+ &mut self,
+ nlist: &mut Threads,
+ matches: &mut [bool],
+ slots: &mut [Slot],
+ thread_caps: &mut [Option<usize>],
+ ip: usize,
+ at: InputAt,
+ at_next: InputAt,
+ ) -> bool {
+ use crate::prog::Inst::*;
+ match self.prog[ip] {
+ Match(match_slot) => {
+ if match_slot < matches.len() {
+ matches[match_slot] = true;
+ }
+ for (slot, val) in slots.iter_mut().zip(thread_caps.iter()) {
+ *slot = *val;
+ }
+ true
+ }
+ Char(ref inst) => {
+ if inst.c == at.char() {
+ self.add(nlist, thread_caps, inst.goto, at_next);
+ }
+ false
+ }
+ Ranges(ref inst) => {
+ if inst.matches(at.char()) {
+ self.add(nlist, thread_caps, inst.goto, at_next);
+ }
+ false
+ }
+ Bytes(ref inst) => {
+ if let Some(b) = at.byte() {
+ if inst.matches(b) {
+ self.add(nlist, thread_caps, inst.goto, at_next);
+ }
+ }
+ false
+ }
+ EmptyLook(_) | Save(_) | Split(_) => false,
+ }
+ }
+
+ /// Follows epsilon transitions and adds them for processing to nlist,
+ /// starting at and including ip.
+ fn add(
+ &mut self,
+ nlist: &mut Threads,
+ thread_caps: &mut [Option<usize>],
+ ip: usize,
+ at: InputAt,
+ ) {
+ self.stack.push(FollowEpsilon::IP(ip));
+ while let Some(frame) = self.stack.pop() {
+ match frame {
+ FollowEpsilon::IP(ip) => {
+ self.add_step(nlist, thread_caps, ip, at);
+ }
+ FollowEpsilon::Capture { slot, pos } => {
+ thread_caps[slot] = pos;
+ }
+ }
+ }
+ }
+
+ /// A helper function for add that avoids excessive pushing to the stack.
+ fn add_step(
+ &mut self,
+ nlist: &mut Threads,
+ thread_caps: &mut [Option<usize>],
+ mut ip: usize,
+ at: InputAt,
+ ) {
+ // Instead of pushing and popping to the stack, we mutate ip as we
+ // traverse the set of states. We only push to the stack when we
+ // absolutely need recursion (restoring captures or following a
+ // branch).
+ use crate::prog::Inst::*;
+ loop {
+ // Don't visit states we've already added.
+ if nlist.set.contains(ip) {
+ return;
+ }
+ nlist.set.insert(ip);
+ match self.prog[ip] {
+ EmptyLook(ref inst) => {
+ if self.input.is_empty_match(at, inst) {
+ ip = inst.goto;
+ }
+ }
+ Save(ref inst) => {
+ if inst.slot < thread_caps.len() {
+ self.stack.push(FollowEpsilon::Capture {
+ slot: inst.slot,
+ pos: thread_caps[inst.slot],
+ });
+ thread_caps[inst.slot] = Some(at.pos());
+ }
+ ip = inst.goto;
+ }
+ Split(ref inst) => {
+ self.stack.push(FollowEpsilon::IP(inst.goto2));
+ ip = inst.goto1;
+ }
+ Match(_) | Char(_) | Ranges(_) | Bytes(_) => {
+ let t = &mut nlist.caps(ip);
+ for (slot, val) in t.iter_mut().zip(thread_caps.iter()) {
+ *slot = *val;
+ }
+ return;
+ }
+ }
+ }
+ }
+}
+
+impl Threads {
+ fn new() -> Self {
+ Threads { set: SparseSet::new(0), caps: vec![], slots_per_thread: 0 }
+ }
+
+ fn resize(&mut self, num_insts: usize, ncaps: usize) {
+ if num_insts == self.set.capacity() {
+ return;
+ }
+ self.slots_per_thread = ncaps * 2;
+ self.set = SparseSet::new(num_insts);
+ self.caps = vec![None; self.slots_per_thread * num_insts];
+ }
+
+ fn caps(&mut self, pc: usize) -> &mut [Option<usize>] {
+ let i = pc * self.slots_per_thread;
+ &mut self.caps[i..i + self.slots_per_thread]
+ }
+}
diff --git a/third_party/rust/regex/src/pool.rs b/third_party/rust/regex/src/pool.rs
new file mode 100644
index 0000000000..6a6f15b194
--- /dev/null
+++ b/third_party/rust/regex/src/pool.rs
@@ -0,0 +1,333 @@
+// This module provides a relatively simple thread-safe pool of reusable
+// objects. For the most part, it's implemented by a stack represented by a
+// Mutex<Vec<T>>. It has one small trick: because unlocking a mutex is somewhat
+// costly, in the case where a pool is accessed by the first thread that tried
+// to get a value, we bypass the mutex. Here are some benchmarks showing the
+// difference.
+//
+// 1) misc::anchored_literal_long_non_match 21 (18571 MB/s)
+// 2) misc::anchored_literal_long_non_match 107 (3644 MB/s)
+// 3) misc::anchored_literal_long_non_match 45 (8666 MB/s)
+// 4) misc::anchored_literal_long_non_match 19 (20526 MB/s)
+//
+// (1) represents our baseline: the master branch at the time of writing when
+// using the 'thread_local' crate to implement the pool below.
+//
+// (2) represents a naive pool implemented completely via Mutex<Vec<T>>. There
+// is no special trick for bypassing the mutex.
+//
+// (3) is the same as (2), except it uses Mutex<Vec<Box<T>>>. It is twice as
+// fast because a Box<T> is much smaller than the T we use with a Pool in this
+// crate. So pushing and popping a Box<T> from a Vec is quite a bit faster
+// than for T.
+//
+// (4) is the same as (3), but with the trick for bypassing the mutex in the
+// case of the first-to-get thread.
+//
+// Why move off of thread_local? Even though (4) is a hair faster than (1)
+// above, this was not the main goal. The main goal was to move off of
+// thread_local and find a way to *simply* re-capture some of its speed for
+// regex's specific case. So again, why move off of it? The *primary* reason is
+// because of memory leaks. See https://github.com/rust-lang/regex/issues/362
+// for example. (Why do I want it to be simple? Well, I suppose what I mean is,
+// "use as much safe code as possible to minimize risk and be as sure as I can
+// be that it is correct.")
+//
+// My guess is that the thread_local design is probably not appropriate for
+// regex since its memory usage scales to the number of active threads that
+// have used a regex, whereas the pool below scales to the number of threads
+// that simultaneously use a regex. While neither case permits contraction,
+// since we own the pool data structure below, we can add contraction if a
+// clear use case pops up in the wild. More pressingly though, it seems that
+// there are at least some use case patterns where one might have many threads
+// sitting around that might have used a regex at one point. While thread_local
+// does try to reuse space previously used by a thread that has since stopped,
+// its maximal memory usage still scales with the total number of active
+// threads. In contrast, the pool below scales with the total number of threads
+// *simultaneously* using the pool. The hope is that this uses less memory
+// overall. And if it doesn't, we can hopefully tune it somehow.
+//
+// It seems that these sort of conditions happen frequently
+// in FFI inside of other more "managed" languages. This was
+// mentioned in the issue linked above, and also mentioned here:
+// https://github.com/BurntSushi/rure-go/issues/3. And in particular, users
+// confirm that disabling the use of thread_local resolves the leak.
+//
+// There were other weaker reasons for moving off of thread_local as well.
+// Namely, at the time, I was looking to reduce dependencies. And for something
+// like regex, maintenance can be simpler when we own the full dependency tree.
+
+use std::panic::{RefUnwindSafe, UnwindSafe};
+use std::sync::atomic::{AtomicUsize, Ordering};
+use std::sync::Mutex;
+
+/// An atomic counter used to allocate thread IDs.
+static COUNTER: AtomicUsize = AtomicUsize::new(1);
+
+thread_local!(
+ /// A thread local used to assign an ID to a thread.
+ static THREAD_ID: usize = {
+ let next = COUNTER.fetch_add(1, Ordering::Relaxed);
+ // SAFETY: We cannot permit the reuse of thread IDs since reusing a
+ // thread ID might result in more than one thread "owning" a pool,
+ // and thus, permit accessing a mutable value from multiple threads
+ // simultaneously without synchronization. The intent of this panic is
+ // to be a sanity check. It is not expected that the thread ID space
+ // will actually be exhausted in practice.
+ //
+ // This checks that the counter never wraps around, since atomic
+ // addition wraps around on overflow.
+ if next == 0 {
+ panic!("regex: thread ID allocation space exhausted");
+ }
+ next
+ };
+);
+
+/// The type of the function used to create values in a pool when the pool is
+/// empty and the caller requests one.
+type CreateFn<T> =
+ Box<dyn Fn() -> T + Send + Sync + UnwindSafe + RefUnwindSafe + 'static>;
+
+/// A simple thread safe pool for reusing values.
+///
+/// Getting a value out comes with a guard. When that guard is dropped, the
+/// value is automatically put back in the pool.
+///
+/// A Pool<T> impls Sync when T is Send (even if it's not Sync). This means
+/// that T can use interior mutability. This is possible because a pool is
+/// guaranteed to provide a value to exactly one thread at any time.
+///
+/// Currently, a pool never contracts in size. Its size is proportional to the
+/// number of simultaneous uses.
+pub struct Pool<T> {
+ /// A stack of T values to hand out. These are used when a Pool is
+ /// accessed by a thread that does not own it.
+ stack: Mutex<Vec<Box<T>>>,
+ /// A function to create more T values when stack is empty and a caller
+ /// has requested a T.
+ create: CreateFn<T>,
+ /// The ID of the thread that owns this pool. The owner is the thread
+ /// that makes the first call to 'get'. When the owner calls 'get', it
+ /// gets 'owner_val' directly instead of returning a T from 'stack'.
+ /// See comments elsewhere for details, but this is intended to be an
+ /// optimization for the common case that makes getting a T faster.
+ ///
+ /// It is initialized to a value of zero (an impossible thread ID) as a
+ /// sentinel to indicate that it is unowned.
+ owner: AtomicUsize,
+ /// A value to return when the caller is the thread that owns the Pool,
+ /// i.e., the first thread that called 'get'.
+ owner_val: T,
+}
+
+// SAFETY: Since we want to use a Pool from multiple threads simultaneously
+// behind an Arc, we need for it to be Sync. In cases where T is sync, Pool<T>
+// would be Sync. However, since we use a Pool to store mutable scratch space,
+// we wind up using a T that has interior mutability and is thus itself not
+// Sync. So what we *really* want is for our Pool<T> to be Sync even when T is
+// not Sync (but is at least Send).
+//
+// The only non-sync aspect of a Pool is its 'owner_val' field, which is used
+// to implement faster access to a pool value in the common case of a pool
+// being accessed by the thread that owns it (the first thread to call 'get').
+// The 'stack' field is also shared, but a Mutex<T> where T: Send is already
+// Sync. So we only need to worry about 'owner_val'.
+//
+// The key is to guarantee that 'owner_val' can only ever be accessed from one
+// thread. In our implementation below, we guarantee this by only returning the
+// 'owner_val' when the ID of the current thread matches the ID of the thread
+// that first called 'Pool::get'. Since this can only ever be one thread, it
+// follows that only one thread can access 'owner_val' at any point in time.
+// Thus, it is safe to declare that Pool<T> is Sync when T is Send.
+//
+// NOTE: The owning thread is the *first* thread that tries to get a value out
+// of a Pool, which is not necessarily the thread that created it. Ownership
+// is claimed by atomically swapping the zero sentinel in 'owner' for the
+// caller's thread ID, so at most one thread can ever become the owner.
+//
+// If there is a way to achieve our performance goals using safe code, then
+// I would very much welcome a patch. As it stands, the implementation below
+// tries to balance safety with performance. The case where a Regex is used
+// from multiple threads simultaneously will suffer a bit since getting a cache
+// will require unlocking a mutex.
+unsafe impl<T: Send> Sync for Pool<T> {}
+
+impl<T: ::std::fmt::Debug> ::std::fmt::Debug for Pool<T> {
+ fn fmt(&self, f: &mut ::std::fmt::Formatter<'_>) -> ::std::fmt::Result {
+ f.debug_struct("Pool")
+ .field("stack", &self.stack)
+ .field("owner", &self.owner)
+ .field("owner_val", &self.owner_val)
+ .finish()
+ }
+}
+
+/// A guard that is returned when a caller requests a value from the pool.
+///
+/// The purpose of the guard is to use RAII to automatically put the value back
+/// in the pool once it's dropped.
+#[derive(Debug)]
+pub struct PoolGuard<'a, T: Send> {
+ /// The pool that this guard is attached to.
+ pool: &'a Pool<T>,
+ /// This is None when the guard represents the special "owned" value. In
+ /// which case, the value is retrieved from 'pool.owner_val'.
+ value: Option<Box<T>>,
+}
+
+impl<T: Send> Pool<T> {
+ /// Create a new pool. The given closure is used to create values in the
+ /// pool when necessary.
+ pub fn new(create: CreateFn<T>) -> Pool<T> {
+ let owner = AtomicUsize::new(0);
+ let owner_val = create();
+ Pool { stack: Mutex::new(vec![]), create, owner, owner_val }
+ }
+
+ /// Get a value from the pool. The caller is guaranteed to have exclusive
+ /// access to the given value.
+ ///
+ /// Note that there is no guarantee provided about which value in the
+ /// pool is returned. That is, calling get, dropping the guard (causing
+ /// the value to go back into the pool) and then calling get again is NOT
+ /// guaranteed to return the same value received in the first get call.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub fn get(&self) -> PoolGuard<'_, T> {
+ // Our fast path checks if the caller is the thread that "owns" this
+ // pool. Or stated differently, whether it is the first thread that
+ // tried to extract a value from the pool. If it is, then we can return
+ // a T to the caller without going through a mutex.
+ //
+ // SAFETY: We must guarantee that only one thread gets access to this
+ // value. Since a thread is uniquely identified by the THREAD_ID thread
+ // local, it follows that if the caller's thread ID is equal to the
+ // owner, then only one thread may receive this value.
+ let caller = THREAD_ID.with(|id| *id);
+ let owner = self.owner.load(Ordering::Relaxed);
+ if caller == owner {
+ return self.guard_owned();
+ }
+ self.get_slow(caller, owner)
+ }
+
+ /// This is the "slow" version that goes through a mutex to pop an
+ /// allocated value off a stack to return to the caller. (Or, if the stack
+ /// is empty, a new value is created.)
+ ///
+ /// If the pool has no owner, then this will set the owner.
+ #[cold]
+ fn get_slow(&self, caller: usize, owner: usize) -> PoolGuard<'_, T> {
+ use std::sync::atomic::Ordering::Relaxed;
+
+ if owner == 0 {
+ // The sentinel 0 value means this pool is not yet owned. We
+ // try to atomically set the owner. If we do, then this thread
+ // becomes the owner and we can return a guard that represents
+ // the special T for the owner.
+ let res = self.owner.compare_exchange(0, caller, Relaxed, Relaxed);
+ if res.is_ok() {
+ return self.guard_owned();
+ }
+ }
+ let mut stack = self.stack.lock().unwrap();
+ let value = match stack.pop() {
+ None => Box::new((self.create)()),
+ Some(value) => value,
+ };
+ self.guard_stack(value)
+ }
+
+ /// Puts a value back into the pool. Callers don't need to call this. Once
+ /// the guard that's returned by 'get' is dropped, it is put back into the
+ /// pool automatically.
+ fn put(&self, value: Box<T>) {
+ let mut stack = self.stack.lock().unwrap();
+ stack.push(value);
+ }
+
+ /// Create a guard that represents the special owned T.
+ fn guard_owned(&self) -> PoolGuard<'_, T> {
+ PoolGuard { pool: self, value: None }
+ }
+
+ /// Create a guard that contains a value from the pool's stack.
+ fn guard_stack(&self, value: Box<T>) -> PoolGuard<'_, T> {
+ PoolGuard { pool: self, value: Some(value) }
+ }
+}
+
+impl<'a, T: Send> PoolGuard<'a, T> {
+ /// Return the underlying value.
+ pub fn value(&self) -> &T {
+ match self.value {
+ None => &self.pool.owner_val,
+ Some(ref v) => &**v,
+ }
+ }
+}
+
+impl<'a, T: Send> Drop for PoolGuard<'a, T> {
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn drop(&mut self) {
+ if let Some(value) = self.value.take() {
+ self.pool.put(value);
+ }
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use std::panic::{RefUnwindSafe, UnwindSafe};
+
+ use super::*;
+
+ #[test]
+ fn oibits() {
+ use crate::exec::ProgramCache;
+
+ fn has_oibits<T: Send + Sync + UnwindSafe + RefUnwindSafe>() {}
+ has_oibits::<Pool<ProgramCache>>();
+ }
+
+ // Tests that Pool implements the "single owner" optimization. That is, the
+ // thread that first accesses the pool gets its own copy, while all other
+ // threads get distinct copies.
+ #[test]
+ fn thread_owner_optimization() {
+ use std::cell::RefCell;
+ use std::sync::Arc;
+
+ let pool: Arc<Pool<RefCell<Vec<char>>>> =
+ Arc::new(Pool::new(Box::new(|| RefCell::new(vec!['a']))));
+ pool.get().value().borrow_mut().push('x');
+
+ let pool1 = pool.clone();
+ let t1 = std::thread::spawn(move || {
+ let guard = pool1.get();
+ let v = guard.value();
+ v.borrow_mut().push('y');
+ });
+
+ let pool2 = pool.clone();
+ let t2 = std::thread::spawn(move || {
+ let guard = pool2.get();
+ let v = guard.value();
+ v.borrow_mut().push('z');
+ });
+
+ t1.join().unwrap();
+ t2.join().unwrap();
+
+ // If we didn't implement the single owner optimization, then one of
+ // the threads above is likely to have mutated the [a, x] vec that
+ // we stuffed in the pool before spawning the threads. But since
+ // neither thread was first to access the pool, and because of the
+ // optimization, we should be guaranteed that neither thread mutates
+ // the special owned pool value.
+ //
+ // (Technically this is an implementation detail and not a contract of
+ // Pool's API.)
+ assert_eq!(vec!['a', 'x'], *pool.get().value().borrow());
+ }
+}
diff --git a/third_party/rust/regex/src/prog.rs b/third_party/rust/regex/src/prog.rs
new file mode 100644
index 0000000000..c211f71d8a
--- /dev/null
+++ b/third_party/rust/regex/src/prog.rs
@@ -0,0 +1,447 @@
+use std::cmp::Ordering;
+use std::collections::HashMap;
+use std::fmt;
+use std::mem;
+use std::ops::Deref;
+use std::slice;
+use std::sync::Arc;
+
+use crate::input::Char;
+use crate::literal::LiteralSearcher;
+
+/// `InstPtr` represents the index of an instruction in a regex program.
+pub type InstPtr = usize;
+
+/// Program is a sequence of instructions and various facts about those
+/// instructions.
+#[derive(Clone)]
+pub struct Program {
+ /// A sequence of instructions that represents an NFA.
+ pub insts: Vec<Inst>,
+ /// Pointers to each Match instruction in the sequence.
+ ///
+ /// This is always length 1 unless this program represents a regex set.
+ pub matches: Vec<InstPtr>,
+ /// The ordered sequence of all capture groups extracted from the AST.
+ /// Unnamed groups are `None`.
+ pub captures: Vec<Option<String>>,
+ /// Pointers to all named capture groups into `captures`.
+ pub capture_name_idx: Arc<HashMap<String, usize>>,
+ /// A pointer to the start instruction. This can vary depending on how
+ /// the program was compiled. For example, programs for use with the DFA
+ /// engine have a `.*?` inserted at the beginning of unanchored regular
+ /// expressions. The actual starting point of the program is after the
+ /// `.*?`.
+ pub start: InstPtr,
+ /// A set of equivalence classes for discriminating bytes in the compiled
+ /// program.
+ pub byte_classes: Vec<u8>,
+ /// When true, this program can only match valid UTF-8.
+ pub only_utf8: bool,
+ /// When true, this program uses byte range instructions instead of Unicode
+ /// range instructions.
+ pub is_bytes: bool,
+ /// When true, the program is compiled for DFA matching. For example, this
+ /// implies `is_bytes` and also inserts a preceding `.*?` for unanchored
+ /// regexes.
+ pub is_dfa: bool,
+ /// When true, the program matches text in reverse (for use only in the
+ /// DFA).
+ pub is_reverse: bool,
+ /// Whether the regex must match from the start of the input.
+ pub is_anchored_start: bool,
+ /// Whether the regex must match at the end of the input.
+ pub is_anchored_end: bool,
+ /// Whether this program contains a Unicode word boundary instruction.
+ pub has_unicode_word_boundary: bool,
+ /// A possibly empty machine for very quickly matching prefix literals.
+ pub prefixes: LiteralSearcher,
+ /// A limit on the size of the cache that the DFA is allowed to use while
+ /// matching.
+ ///
+ /// The cache limit specifies approximately how much space we're willing to
+ /// give to the state cache. Once the state cache exceeds the size, it is
+ /// wiped and all states must be re-computed.
+ ///
+ /// Note that this value does not impact correctness. It can be set to 0
+ /// and the DFA will run just fine. (It will only ever store exactly one
+ /// state in the cache, and will likely run very slowly, but it will work.)
+ ///
+ /// Also note that this limit is *per thread of execution*. That is,
+ /// if the same regex is used to search text across multiple threads
+ /// simultaneously, then the DFA cache is not shared. Instead, copies are
+ /// made.
+ pub dfa_size_limit: usize,
+}
+
+impl Program {
+ /// Creates an empty instruction sequence. Fields are given default
+ /// values.
+ pub fn new() -> Self {
+ Program {
+ insts: vec![],
+ matches: vec![],
+ captures: vec![],
+ capture_name_idx: Arc::new(HashMap::new()),
+ start: 0,
+ byte_classes: vec![0; 256],
+ only_utf8: true,
+ is_bytes: false,
+ is_dfa: false,
+ is_reverse: false,
+ is_anchored_start: false,
+ is_anchored_end: false,
+ has_unicode_word_boundary: false,
+ prefixes: LiteralSearcher::empty(),
+ dfa_size_limit: 2 * (1 << 20),
+ }
+ }
+
+ /// If pc is an index to a no-op instruction (like Save), then return the
+ /// next pc that is not a no-op instruction.
+ pub fn skip(&self, mut pc: usize) -> usize {
+ loop {
+ match self[pc] {
+ Inst::Save(ref i) => pc = i.goto,
+ _ => return pc,
+ }
+ }
+ }
+
+ /// Return true if and only if an execution engine at instruction `pc` will
+ /// always lead to a match.
+ pub fn leads_to_match(&self, pc: usize) -> bool {
+ if self.matches.len() > 1 {
+ // If we have a regex set, then we have more than one ending
+ // state, so leading to one of those states is generally
+ // meaningless.
+ return false;
+ }
+ match self[self.skip(pc)] {
+ Inst::Match(_) => true,
+ _ => false,
+ }
+ }
+
+ /// Returns true if the current configuration demands that an implicit
+ /// `.*?` be prepended to the instruction sequence.
+ pub fn needs_dotstar(&self) -> bool {
+ self.is_dfa && !self.is_reverse && !self.is_anchored_start
+ }
+
+ /// Returns true if this program uses Byte instructions instead of
+ /// Char/Range instructions.
+ pub fn uses_bytes(&self) -> bool {
+ self.is_bytes || self.is_dfa
+ }
+
+ /// Returns true if this program exclusively matches valid UTF-8 bytes.
+ ///
+ /// That is, if an invalid UTF-8 byte is seen, then no match is possible.
+ pub fn only_utf8(&self) -> bool {
+ self.only_utf8
+ }
+
+ /// Return the approximate heap usage of this instruction sequence in
+ /// bytes.
+ pub fn approximate_size(&self) -> usize {
+ // The only instruction that uses heap space is Ranges (for
+ // Unicode codepoint programs) to store non-overlapping codepoint
+ // ranges. To keep this operation constant time, we ignore them.
+ (self.len() * mem::size_of::<Inst>())
+ + (self.matches.len() * mem::size_of::<InstPtr>())
+ + (self.captures.len() * mem::size_of::<Option<String>>())
+ + (self.capture_name_idx.len()
+ * (mem::size_of::<String>() + mem::size_of::<usize>()))
+ + (self.byte_classes.len() * mem::size_of::<u8>())
+ + self.prefixes.approximate_size()
+ }
+}
+
+impl Deref for Program {
+ type Target = [Inst];
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn deref(&self) -> &Self::Target {
+ &*self.insts
+ }
+}
+
+impl fmt::Debug for Program {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ use self::Inst::*;
+
+ fn with_goto(cur: usize, goto: usize, fmtd: String) -> String {
+ if goto == cur + 1 {
+ fmtd
+ } else {
+ format!("{} (goto: {})", fmtd, goto)
+ }
+ }
+
+ fn visible_byte(b: u8) -> String {
+ use std::ascii::escape_default;
+ let escaped = escape_default(b).collect::<Vec<u8>>();
+ String::from_utf8_lossy(&escaped).into_owned()
+ }
+
+ for (pc, inst) in self.iter().enumerate() {
+ match *inst {
+ Match(slot) => write!(f, "{:04} Match({:?})", pc, slot)?,
+ Save(ref inst) => {
+ let s = format!("{:04} Save({})", pc, inst.slot);
+ write!(f, "{}", with_goto(pc, inst.goto, s))?;
+ }
+ Split(ref inst) => {
+ write!(
+ f,
+ "{:04} Split({}, {})",
+ pc, inst.goto1, inst.goto2
+ )?;
+ }
+ EmptyLook(ref inst) => {
+ let s = format!("{:?}", inst.look);
+ write!(f, "{:04} {}", pc, with_goto(pc, inst.goto, s))?;
+ }
+ Char(ref inst) => {
+ let s = format!("{:?}", inst.c);
+ write!(f, "{:04} {}", pc, with_goto(pc, inst.goto, s))?;
+ }
+ Ranges(ref inst) => {
+ let ranges = inst
+ .ranges
+ .iter()
+ .map(|r| format!("{:?}-{:?}", r.0, r.1))
+ .collect::<Vec<String>>()
+ .join(", ");
+ write!(
+ f,
+ "{:04} {}",
+ pc,
+ with_goto(pc, inst.goto, ranges)
+ )?;
+ }
+ Bytes(ref inst) => {
+ let s = format!(
+ "Bytes({}, {})",
+ visible_byte(inst.start),
+ visible_byte(inst.end)
+ );
+ write!(f, "{:04} {}", pc, with_goto(pc, inst.goto, s))?;
+ }
+ }
+ if pc == self.start {
+ write!(f, " (start)")?;
+ }
+ writeln!(f)?;
+ }
+ Ok(())
+ }
+}
+
+impl<'a> IntoIterator for &'a Program {
+ type Item = &'a Inst;
+ type IntoIter = slice::Iter<'a, Inst>;
+ fn into_iter(self) -> Self::IntoIter {
+ self.iter()
+ }
+}
+
+/// `Inst` is an instruction code in a Regex program.
+///
+/// Regrettably, a regex program either contains Unicode codepoint
+/// instructions (`Char` and `Ranges`) or it contains byte instructions
+/// (`Bytes`). A regex program can never contain both.
+///
+/// It would be worth investigating splitting this into two distinct types and
+/// then figuring out how to make the matching engines polymorphic over those
+/// types without sacrificing performance.
+///
+/// Other than the benefit of moving invariants into the type system, another
+/// benefit is the decreased size. If we remove the `Char` and `Ranges`
+/// instructions from the `Inst` enum, then its size shrinks from 32 bytes to
+/// 24 bytes. (This is because of the removal of a `Box<[]>` in the `Ranges`
+/// variant.) Given that byte based machines are typically much bigger than
+/// their Unicode analogues (because they can decode UTF-8 directly), this ends
+/// up being a pretty significant savings.
+#[derive(Clone, Debug)]
+pub enum Inst {
+ /// `Match` indicates that the program has reached a match state.
+ ///
+ /// The number in the match corresponds to the Nth logical regular
+ /// expression in this program. This index is always 0 for normal regex
+ /// programs. Values greater than 0 appear when compiling regex sets, and
+ /// each match instruction gets its own unique value. The value corresponds
+ /// to the Nth regex in the set.
+ Match(usize),
+ /// `Save` causes the program to save the current location of the input in
+ /// the slot indicated by `InstSave`.
+ Save(InstSave),
+ /// `Split` causes the program to diverge to one of two paths in the
+ /// program, preferring `goto1` in `InstSplit`.
+ Split(InstSplit),
+ /// `EmptyLook` represents a zero-width assertion in a regex program. A
+ /// zero-width assertion does not consume any of the input text.
+ EmptyLook(InstEmptyLook),
+ /// `Char` requires the regex program to match the character in `InstChar`
+ /// at the current position in the input.
+ Char(InstChar),
+ /// `Ranges` requires the regex program to match the character at the current
+ /// position in the input with one of the ranges specified in `InstRanges`.
+ Ranges(InstRanges),
+ /// `Bytes` is like `Ranges`, except it expresses a single byte range. It is
+ /// used in conjunction with `Split` instructions to implement multi-byte
+ /// character classes.
+ Bytes(InstBytes),
+}
+
+impl Inst {
+    /// Reports whether this instruction is a `Match` instruction.
+    pub fn is_match(&self) -> bool {
+        // Spelled out exhaustively so that adding a variant forces a
+        // decision here.
+        match *self {
+            Inst::Match(_) => true,
+            Inst::Save(_)
+            | Inst::Split(_)
+            | Inst::EmptyLook(_)
+            | Inst::Char(_)
+            | Inst::Ranges(_)
+            | Inst::Bytes(_) => false,
+        }
+    }
+}
+
+/// Representation of the `Save` instruction.
+#[derive(Clone, Debug)]
+pub struct InstSave {
+ /// The next location to execute in the program.
+ pub goto: InstPtr,
+ /// The capture slot (there are two slots for every capture in a regex,
+ /// including the zeroth capture for the entire match).
+ pub slot: usize,
+}
+
+/// Representation of the `Split` instruction.
+#[derive(Clone, Debug)]
+pub struct InstSplit {
+ /// The first instruction to try. A match resulting from following `goto1`
+ /// has precedence over a match resulting from following `goto2`.
+ pub goto1: InstPtr,
+ /// The second instruction to try. A match resulting from following `goto2`
+ /// has lower precedence than one resulting from following `goto1`.
+ pub goto2: InstPtr,
+}
+
+/// Representation of the `EmptyLook` instruction.
+#[derive(Clone, Debug)]
+pub struct InstEmptyLook {
+ /// The next location to execute in the program if this instruction
+ /// succeeds.
+ pub goto: InstPtr,
+ /// The kind of zero-width assertion to check (see `EmptyLook`).
+ pub look: EmptyLook,
+}
+
+/// The set of zero-width assertions that an `EmptyLook` instruction can test.
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub enum EmptyLook {
+ /// Start of line or input.
+ StartLine,
+ /// End of line or input.
+ EndLine,
+ /// Start of input.
+ StartText,
+ /// End of input.
+ EndText,
+ /// Word character on one side and non-word character on the other.
+ WordBoundary,
+ /// Word character on both sides or non-word character on both sides.
+ NotWordBoundary,
+ /// ASCII word boundary.
+ WordBoundaryAscii,
+ /// Not ASCII word boundary.
+ NotWordBoundaryAscii,
+}
+
+/// Representation of the `Char` instruction.
+#[derive(Clone, Debug)]
+pub struct InstChar {
+ /// The next location to execute in the program if this instruction
+ /// succeeds.
+ pub goto: InstPtr,
+ /// The character to test.
+ pub c: char,
+}
+
+/// Representation of the `Ranges` instruction.
+#[derive(Clone, Debug)]
+pub struct InstRanges {
+    /// Where execution continues in the program when this instruction
+    /// succeeds.
+    pub goto: InstPtr,
+    /// The Unicode scalar value ranges to test against.
+    pub ranges: Box<[(char, char)]>,
+}
+
+impl InstRanges {
+    /// Reports whether the given input character falls inside one of this
+    /// instruction's ranges.
+    pub fn matches(&self, c: Char) -> bool {
+        // Fast path: scan the first few ranges linearly before resorting to
+        // binary search. This speeds up the `match_class_unicode` benchmark,
+        // e.g., when matching a Unicode class on predominantly ASCII text.
+        for &(lo, hi) in self.ranges.iter().take(4) {
+            if c < lo {
+                return false;
+            } else if c <= hi {
+                return true;
+            }
+        }
+        let found = self.ranges.binary_search_by(|range| {
+            if range.1 < c {
+                return Ordering::Less;
+            }
+            if range.0 > c {
+                return Ordering::Greater;
+            }
+            Ordering::Equal
+        });
+        found.is_ok()
+    }
+
+    /// Counts the distinct characters covered by all of the ranges taken
+    /// together.
+    pub fn num_chars(&self) -> usize {
+        let mut total: u32 = 0;
+        for &(lo, hi) in self.ranges.iter() {
+            // Both bounds are inclusive, hence the `1 +`.
+            total += 1 + (hi as u32) - (lo as u32);
+        }
+        total as usize
+    }
+}
+
+/// Representation of the `Bytes` instruction.
+#[derive(Clone, Debug)]
+pub struct InstBytes {
+    /// Where execution continues in the program when this instruction
+    /// succeeds.
+    pub goto: InstPtr,
+    /// Inclusive lower bound of this byte range.
+    pub start: u8,
+    /// Inclusive upper bound of this byte range.
+    pub end: u8,
+}
+
+impl InstBytes {
+    /// Reports whether the given byte lies within `start..=end`.
+    pub fn matches(&self, byte: u8) -> bool {
+        (self.start..=self.end).contains(&byte)
+    }
+}
+
+#[cfg(test)]
+mod test {
+    // Pins the size of `Inst` on 64-bit targets so that accidental growth
+    // of the enum is noticed.
+    #[test]
+    #[cfg(target_pointer_width = "64")]
+    fn test_size_of_inst() {
+        assert_eq!(32, std::mem::size_of::<super::Inst>());
+    }
+}
diff --git a/third_party/rust/regex/src/re_builder.rs b/third_party/rust/regex/src/re_builder.rs
new file mode 100644
index 0000000000..ee6383690d
--- /dev/null
+++ b/third_party/rust/regex/src/re_builder.rs
@@ -0,0 +1,421 @@
+/// The set of user configurable options for compiling zero or more regexes.
+#[derive(Clone, Debug)]
+#[allow(missing_docs)]
+pub struct RegexOptions {
+    pub pats: Vec<String>,
+    pub size_limit: usize,
+    pub dfa_size_limit: usize,
+    pub nest_limit: u32,
+    pub case_insensitive: bool,
+    pub multi_line: bool,
+    pub dot_matches_new_line: bool,
+    pub swap_greed: bool,
+    pub ignore_whitespace: bool,
+    pub unicode: bool,
+    pub octal: bool,
+}
+
+impl Default for RegexOptions {
+    /// Builds the starting option set: no patterns, every syntax flag off
+    /// except `unicode`, and the stock size and nesting limits.
+    fn default() -> Self {
+        // One mebibyte; both size limits are expressed in this unit.
+        const MIB: usize = 1 << 20;
+
+        Self {
+            pats: Vec::new(),
+            // Limits.
+            size_limit: 10 * MIB,
+            dfa_size_limit: 2 * MIB,
+            nest_limit: 250,
+            // Syntax flags.
+            unicode: true,
+            case_insensitive: false,
+            multi_line: false,
+            dot_matches_new_line: false,
+            swap_greed: false,
+            ignore_whitespace: false,
+            octal: false,
+        }
+    }
+}
+
+macro_rules! define_builder {
+ ($name:ident, $regex_mod:ident, $only_utf8:expr) => {
+ pub mod $name {
+ use super::RegexOptions;
+ use crate::error::Error;
+ use crate::exec::ExecBuilder;
+
+ use crate::$regex_mod::Regex;
+
+ /// A configurable builder for a regular expression.
+ ///
+ /// A builder can be used to configure how the regex is built, for example, by
+ /// setting the default flags (which can be overridden in the expression
+ /// itself) or setting various limits.
+ #[derive(Debug)]
+ pub struct RegexBuilder(RegexOptions);
+
+ impl RegexBuilder {
+ /// Create a new regular expression builder with the given pattern.
+ ///
+ /// If the pattern is invalid, then an error will be returned when
+ /// `build` is called.
+ pub fn new(pattern: &str) -> RegexBuilder {
+ let mut builder = RegexBuilder(RegexOptions::default());
+ builder.0.pats.push(pattern.to_owned());
+ builder
+ }
+
+ /// Compile the regular expression using the current configuration.
+ ///
+ /// The builder is only borrowed (its options are cloned), so it may be
+ /// reused. Calling `as_str` on the resulting `Regex` yields the pattern
+ /// given to `new` verbatim; it does not incorporate any builder flags.
+ pub fn build(&self) -> Result<Regex, Error> {
+ ExecBuilder::new_options(self.0.clone())
+ .only_utf8($only_utf8)
+ .build()
+ .map(Regex::from)
+ }
+
+ /// Set the value for the case insensitive (`i`) flag.
+ ///
+ /// When enabled, letters in the pattern will match both upper case and
+ /// lower case variants.
+ pub fn case_insensitive(
+ &mut self,
+ yes: bool,
+ ) -> &mut RegexBuilder {
+ self.0.case_insensitive = yes;
+ self
+ }
+
+ /// Set the value for the multi-line matching (`m`) flag.
+ ///
+ /// When enabled, `^` matches the beginning of lines and `$` matches the
+ /// end of lines.
+ ///
+ /// By default, they match beginning/end of the input.
+ pub fn multi_line(&mut self, yes: bool) -> &mut RegexBuilder {
+ self.0.multi_line = yes;
+ self
+ }
+
+ /// Set the value for the any character (`s`) flag, wherein `.` matches
+ /// anything when `s` is set and matches anything except for new line when
+ /// it is not set (the default).
+ ///
+ /// N.B. "matches anything" means "any byte" when Unicode is disabled and
+ /// means "any valid UTF-8 encoding of any Unicode scalar value" when
+ /// Unicode is enabled.
+ pub fn dot_matches_new_line(
+ &mut self,
+ yes: bool,
+ ) -> &mut RegexBuilder {
+ self.0.dot_matches_new_line = yes;
+ self
+ }
+
+ /// Set the value for the greedy swap (`U`) flag.
+ ///
+ /// When enabled, a pattern like `a*` is lazy (tries to find shortest
+ /// match) and `a*?` is greedy (tries to find longest match).
+ ///
+ /// By default, `a*` is greedy and `a*?` is lazy.
+ pub fn swap_greed(&mut self, yes: bool) -> &mut RegexBuilder {
+ self.0.swap_greed = yes;
+ self
+ }
+
+ /// Set the value for the ignore whitespace (`x`) flag.
+ ///
+ /// When enabled, whitespace such as new lines and spaces will be ignored
+ /// between expressions of the pattern, and `#` can be used to start a
+ /// comment until the next new line.
+ pub fn ignore_whitespace(
+ &mut self,
+ yes: bool,
+ ) -> &mut RegexBuilder {
+ self.0.ignore_whitespace = yes;
+ self
+ }
+
+ /// Set the value for the Unicode (`u`) flag.
+ ///
+ /// Enabled by default. When disabled, character classes such as `\w` only
+ /// match ASCII word characters instead of all Unicode word characters.
+ pub fn unicode(&mut self, yes: bool) -> &mut RegexBuilder {
+ self.0.unicode = yes;
+ self
+ }
+
+ /// Whether to support octal syntax or not.
+ ///
+ /// Octal syntax is a little-known way of uttering Unicode codepoints in
+ /// a regular expression. For example, `a`, `\x61`, `\u0061` and
+ /// `\141` are all equivalent regular expressions, where the last example
+ /// shows octal syntax.
+ ///
+ /// While supporting octal syntax isn't in and of itself a problem, it does
+ /// make good error messages harder. That is, in PCRE based regex engines,
+ /// syntax like `\1` invokes a backreference, which is explicitly
+ /// unsupported in Rust's regex engine. However, many users expect it to
+ /// be supported. Therefore, when octal support is disabled, the error
+ /// message will explicitly mention that backreferences aren't supported.
+ ///
+ /// Octal syntax is disabled by default.
+ pub fn octal(&mut self, yes: bool) -> &mut RegexBuilder {
+ self.0.octal = yes;
+ self
+ }
+
+ /// Set the approximate size limit of the compiled regular expression.
+ ///
+ /// This roughly corresponds to the number of bytes occupied by a single
+ /// compiled program. If the program exceeds this number, then a
+ /// compilation error is returned.
+ pub fn size_limit(
+ &mut self,
+ limit: usize,
+ ) -> &mut RegexBuilder {
+ self.0.size_limit = limit;
+ self
+ }
+
+ /// Set the approximate size of the cache used by the DFA.
+ ///
+ /// This roughly corresponds to the number of bytes that the DFA will
+ /// use while searching.
+ ///
+ /// Note that this is a *per thread* limit. There is no way to set a global
+ /// limit. In particular, if a regex is used from multiple threads
+ /// simultaneously, then each thread may use up to the number of bytes
+ /// specified here.
+ pub fn dfa_size_limit(
+ &mut self,
+ limit: usize,
+ ) -> &mut RegexBuilder {
+ self.0.dfa_size_limit = limit;
+ self
+ }
+
+ /// Set the nesting limit for this parser.
+ ///
+ /// The nesting limit controls how deep the abstract syntax tree is allowed
+ /// to be. If the AST exceeds the given limit (e.g., with too many nested
+ /// groups), then an error is returned by the parser.
+ ///
+ /// The purpose of this limit is to act as a heuristic to prevent stack
+ /// overflow for consumers that do structural induction on an `Ast` using
+ /// explicit recursion. While this crate never does this (instead using
+ /// constant stack space and moving the call stack to the heap), other
+ /// crates may.
+ ///
+ /// This limit is not checked until the entire Ast is parsed. Therefore,
+ /// if callers want to put a limit on the amount of heap space used, then
+ /// they should impose a limit on the length, in bytes, of the concrete
+ /// pattern string. In particular, this is viable since this parser
+ /// implementation will limit itself to heap space proportional to the
+ /// length of the pattern string.
+ ///
+ /// Note that a nest limit of `0` will return a nest limit error for most
+ /// patterns but not all. For example, a nest limit of `0` permits `a` but
+ /// not `ab`, since `ab` requires a concatenation, which results in a nest
+ /// depth of `1`. In general, a nest limit is not something that manifests
+ /// in an obvious way in the concrete syntax, therefore, it should not be
+ /// used in a granular way.
+ pub fn nest_limit(&mut self, limit: u32) -> &mut RegexBuilder {
+ self.0.nest_limit = limit;
+ self
+ }
+ }
+ }
+ };
+}
+
+define_builder!(bytes, re_bytes, false);
+define_builder!(unicode, re_unicode, true);
+
+macro_rules! define_set_builder {
+ ($name:ident, $regex_mod:ident, $only_utf8:expr) => {
+ pub mod $name {
+ use super::RegexOptions;
+ use crate::error::Error;
+ use crate::exec::ExecBuilder;
+
+ use crate::re_set::$regex_mod::RegexSet;
+
+ /// A configurable builder for a set of regular expressions.
+ ///
+ /// A builder can be used to configure how the regexes are built, for example,
+ /// by setting the default flags (which can be overridden in the expression
+ /// itself) or setting various limits.
+ #[derive(Debug)]
+ pub struct RegexSetBuilder(RegexOptions);
+
+ impl RegexSetBuilder {
+ /// Create a new regular expression set builder with the given patterns.
+ ///
+ /// If any pattern is invalid, then an error will be returned when
+ /// `build` is called.
+ pub fn new<I, S>(patterns: I) -> RegexSetBuilder
+ where
+ S: AsRef<str>,
+ I: IntoIterator<Item = S>,
+ {
+ let mut builder = RegexSetBuilder(RegexOptions::default());
+ for pat in patterns {
+ builder.0.pats.push(pat.as_ref().to_owned());
+ }
+ builder
+ }
+
+ /// Compile the configured regular expressions into a set (the builder is only borrowed).
+ pub fn build(&self) -> Result<RegexSet, Error> {
+ ExecBuilder::new_options(self.0.clone())
+ .only_utf8($only_utf8)
+ .build()
+ .map(RegexSet::from)
+ }
+
+ /// Set the value for the case insensitive (`i`) flag.
+ pub fn case_insensitive(
+ &mut self,
+ yes: bool,
+ ) -> &mut RegexSetBuilder {
+ self.0.case_insensitive = yes;
+ self
+ }
+
+ /// Set the value for the multi-line matching (`m`) flag.
+ pub fn multi_line(
+ &mut self,
+ yes: bool,
+ ) -> &mut RegexSetBuilder {
+ self.0.multi_line = yes;
+ self
+ }
+
+ /// Set the value for the any character (`s`) flag, wherein `.` matches
+ /// anything when `s` is set and matches anything except for new line when
+ /// it is not set (the default).
+ ///
+ /// N.B. "matches anything" means "any byte" for `regex::bytes::RegexSet`
+ /// expressions and means "any Unicode scalar value" for `regex::RegexSet`
+ /// expressions.
+ pub fn dot_matches_new_line(
+ &mut self,
+ yes: bool,
+ ) -> &mut RegexSetBuilder {
+ self.0.dot_matches_new_line = yes;
+ self
+ }
+
+ /// Set the value for the greedy swap (`U`) flag.
+ pub fn swap_greed(
+ &mut self,
+ yes: bool,
+ ) -> &mut RegexSetBuilder {
+ self.0.swap_greed = yes;
+ self
+ }
+
+ /// Set the value for the ignore whitespace (`x`) flag.
+ pub fn ignore_whitespace(
+ &mut self,
+ yes: bool,
+ ) -> &mut RegexSetBuilder {
+ self.0.ignore_whitespace = yes;
+ self
+ }
+
+ /// Set the value for the Unicode (`u`) flag.
+ pub fn unicode(&mut self, yes: bool) -> &mut RegexSetBuilder {
+ self.0.unicode = yes;
+ self
+ }
+
+ /// Whether to support octal syntax or not.
+ ///
+ /// Octal syntax is a little-known way of uttering Unicode codepoints in
+ /// a regular expression. For example, `a`, `\x61`, `\u0061` and
+ /// `\141` are all equivalent regular expressions, where the last example
+ /// shows octal syntax.
+ ///
+ /// While supporting octal syntax isn't in and of itself a problem, it does
+ /// make good error messages harder. That is, in PCRE based regex engines,
+ /// syntax like `\1` invokes a backreference, which is explicitly
+ /// unsupported in Rust's regex engine. However, many users expect it to
+ /// be supported. Therefore, when octal support is disabled, the error
+ /// message will explicitly mention that backreferences aren't supported.
+ ///
+ /// Octal syntax is disabled by default.
+ pub fn octal(&mut self, yes: bool) -> &mut RegexSetBuilder {
+ self.0.octal = yes;
+ self
+ }
+
+ /// Set the approximate size limit of the compiled regular expression.
+ ///
+ /// This roughly corresponds to the number of bytes occupied by a single
+ /// compiled program. If the program exceeds this number, then a
+ /// compilation error is returned.
+ pub fn size_limit(
+ &mut self,
+ limit: usize,
+ ) -> &mut RegexSetBuilder {
+ self.0.size_limit = limit;
+ self
+ }
+
+ /// Set the approximate size of the cache used by the DFA.
+ ///
+ /// This roughly corresponds to the number of bytes that the DFA will
+ /// use while searching.
+ ///
+ /// Note that this is a *per thread* limit. There is no way to set a global
+ /// limit. In particular, if a regex is used from multiple threads
+ /// simultaneously, then each thread may use up to the number of bytes
+ /// specified here.
+ pub fn dfa_size_limit(
+ &mut self,
+ limit: usize,
+ ) -> &mut RegexSetBuilder {
+ self.0.dfa_size_limit = limit;
+ self
+ }
+
+ /// Set the nesting limit for this parser.
+ ///
+ /// The nesting limit controls how deep the abstract syntax tree is allowed
+ /// to be. If the AST exceeds the given limit (e.g., with too many nested
+ /// groups), then an error is returned by the parser.
+ ///
+ /// The purpose of this limit is to act as a heuristic to prevent stack
+ /// overflow for consumers that do structural induction on an `Ast` using
+ /// explicit recursion. While this crate never does this (instead using
+ /// constant stack space and moving the call stack to the heap), other
+ /// crates may.
+ ///
+ /// This limit is not checked until the entire Ast is parsed. Therefore,
+ /// if callers want to put a limit on the amount of heap space used, then
+ /// they should impose a limit on the length, in bytes, of the concrete
+ /// pattern string. In particular, this is viable since this parser
+ /// implementation will limit itself to heap space proportional to the
+ /// length of the pattern string.
+ ///
+ /// Note that a nest limit of `0` will return a nest limit error for most
+ /// patterns but not all. For example, a nest limit of `0` permits `a` but
+ /// not `ab`, since `ab` requires a concatenation, which results in a nest
+ /// depth of `1`. In general, a nest limit is not something that manifests
+ /// in an obvious way in the concrete syntax, therefore, it should not be
+ /// used in a granular way.
+ pub fn nest_limit(
+ &mut self,
+ limit: u32,
+ ) -> &mut RegexSetBuilder {
+ self.0.nest_limit = limit;
+ self
+ }
+ }
+ }
+ };
+}
+
+define_set_builder!(set_bytes, bytes, false);
+define_set_builder!(set_unicode, unicode, true);
diff --git a/third_party/rust/regex/src/re_bytes.rs b/third_party/rust/regex/src/re_bytes.rs
new file mode 100644
index 0000000000..d71969257b
--- /dev/null
+++ b/third_party/rust/regex/src/re_bytes.rs
@@ -0,0 +1,1260 @@
+use std::borrow::Cow;
+use std::collections::HashMap;
+use std::fmt;
+use std::iter::FusedIterator;
+use std::ops::{Index, Range};
+use std::str::FromStr;
+use std::sync::Arc;
+
+use crate::find_byte::find_byte;
+
+use crate::error::Error;
+use crate::exec::{Exec, ExecNoSync};
+use crate::expand::expand_bytes;
+use crate::re_builder::bytes::RegexBuilder;
+use crate::re_trait::{self, RegularExpression, SubCapturesPosIter};
+
+/// A single match of a regex in a haystack.
+///
+/// The lifetime parameter `'t` is the lifetime of the matched text.
+#[derive(Copy, Clone, Debug, Eq, PartialEq)]
+pub struct Match<'t> {
+    text: &'t [u8],
+    start: usize,
+    end: usize,
+}
+
+impl<'t> Match<'t> {
+    /// The byte offset in the haystack at which the match begins.
+    #[inline]
+    pub fn start(&self) -> usize {
+        self.start
+    }
+
+    /// The byte offset in the haystack at which the match ends.
+    #[inline]
+    pub fn end(&self) -> usize {
+        self.end
+    }
+
+    /// The `start..end` byte range of the match in the haystack.
+    #[inline]
+    pub fn range(&self) -> Range<usize> {
+        Range { start: self.start, end: self.end }
+    }
+
+    /// The slice of the haystack covered by the match.
+    #[inline]
+    pub fn as_bytes(&self) -> &'t [u8] {
+        &self.text[self.start..self.end]
+    }
+
+    /// Builds a match from a haystack and the byte offsets delimiting it.
+    #[inline]
+    fn new(haystack: &'t [u8], start: usize, end: usize) -> Match<'t> {
+        Match { text: haystack, start: start, end: end }
+    }
+}
+
+impl<'t> From<Match<'t>> for Range<usize> {
+    fn from(m: Match<'t>) -> Range<usize> {
+        m.start..m.end
+    }
+}
+
+/// A compiled regular expression for matching arbitrary bytes.
+///
+/// It can be used to search, split or replace text. All searching is done with
+/// an implicit `.*?` at the beginning and end of an expression. To force an
+/// expression to match the whole string (or a prefix or a suffix), you must
+/// use an anchor like `^` or `$` (or `\A` and `\z`).
+///
+/// Like the `Regex` type in the parent module, matches with this regex return
+/// byte offsets into the search text. **Unlike** the parent `Regex` type,
+/// these byte offsets may not correspond to UTF-8 sequence boundaries since
+/// the regexes in this module can match arbitrary bytes.
+#[derive(Clone)]
+// Newtype over `Exec`, which is not part of the public API.
+pub struct Regex(Exec);
+
+impl fmt::Display for Regex {
+    /// Writes out the original regular expression.
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.write_str(self.as_str())
+    }
+}
+
+impl fmt::Debug for Regex {
+    /// Writes out the original regular expression, exactly as `Display`
+    /// does.
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        <Self as fmt::Display>::fmt(self, f)
+    }
+}
+
+/// Wraps an `Exec` in a `Regex`.
+///
+/// Hidden from the docs because `Exec` isn't actually part of the public API.
+#[doc(hidden)]
+impl From<Exec> for Regex {
+    fn from(exec: Exec) -> Self {
+        Regex(exec)
+    }
+}
+
+impl FromStr for Regex {
+    type Err = Error;
+
+    /// Attempts to compile the given string as a regular expression.
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        Self::new(s)
+    }
+}
+
+/// Core regular expression methods.
+impl Regex {
+ /// Compiles a regular expression. Once compiled, it can be used repeatedly
+ /// to search, split or replace text in a string.
+ ///
+ /// If an invalid expression is given, then an error is returned.
+ pub fn new(re: &str) -> Result<Regex, Error> {
+     // Compile with the builder's default options.
+     let builder = RegexBuilder::new(re);
+     builder.build()
+ }
+
+ /// Returns true if and only if there is a match for the regex in the
+ /// string given.
+ ///
+ /// It is recommended to use this method if all you need to do is test
+ /// a match, since the underlying matching engine may be able to do less
+ /// work.
+ ///
+ /// # Example
+ ///
+ /// Test if some text contains at least one word with exactly 13 ASCII word
+ /// bytes:
+ ///
+ /// ```rust
+ /// # use regex::bytes::Regex;
+ /// # fn main() {
+ /// let text = b"I categorically deny having triskaidekaphobia.";
+ /// assert!(Regex::new(r"\b\w{13}\b").unwrap().is_match(text));
+ /// # }
+ /// ```
+ pub fn is_match(&self, text: &[u8]) -> bool {
+     // Search from the very beginning of the haystack.
+     let start = 0;
+     self.is_match_at(text, start)
+ }
+
+ /// Returns the start and end byte range of the leftmost-first match in
+ /// `text`. If no match exists, then `None` is returned.
+ ///
+ /// Note that this should only be used if you want to discover the position
+ /// of the match. Testing the existence of a match is faster if you use
+ /// `is_match`.
+ ///
+ /// # Example
+ ///
+ /// Find the start and end location of the first word with exactly 13
+ /// ASCII word bytes:
+ ///
+ /// ```rust
+ /// # use regex::bytes::Regex;
+ /// # fn main() {
+ /// let text = b"I categorically deny having triskaidekaphobia.";
+ /// let mat = Regex::new(r"\b\w{13}\b").unwrap().find(text).unwrap();
+ /// assert_eq!((mat.start(), mat.end()), (2, 15));
+ /// # }
+ /// ```
+ pub fn find<'t>(&self, text: &'t [u8]) -> Option<Match<'t>> {
+     // Search from the very beginning of the haystack.
+     let start = 0;
+     self.find_at(text, start)
+ }
+
+ /// Returns an iterator for each successive non-overlapping match in
+ /// `text`, returning the start and end byte indices with respect to
+ /// `text`.
+ ///
+ /// # Example
+ ///
+ /// Find the start and end location of every word with exactly 13 ASCII
+ /// word bytes:
+ ///
+ /// ```rust
+ /// # use regex::bytes::Regex;
+ /// # fn main() {
+ /// let text = b"Retroactively relinquishing remunerations is reprehensible.";
+ /// for mat in Regex::new(r"\b\w{13}\b").unwrap().find_iter(text) {
+ /// println!("{:?}", mat);
+ /// }
+ /// # }
+ /// ```
+ pub fn find_iter<'r, 't>(&'r self, text: &'t [u8]) -> Matches<'r, 't> {
+     let inner = self.0.searcher().find_iter(text);
+     Matches(inner)
+ }
+
+ /// Returns the capture groups corresponding to the leftmost-first
+ /// match in `text`. Capture group `0` always corresponds to the entire
+ /// match. If no match is found, then `None` is returned.
+ ///
+ /// You should only use `captures` if you need access to the location of
+ /// capturing group matches. Otherwise, `find` is faster for discovering
+ /// the location of the overall match.
+ ///
+ /// # Examples
+ ///
+ /// Say you have some text with movie names and their release years,
+ /// like "'Citizen Kane' (1941)". It'd be nice if we could search for text
+ /// looking like that, while also extracting the movie name and its release
+ /// year separately.
+ ///
+ /// ```rust
+ /// # use regex::bytes::Regex;
+ /// # fn main() {
+ /// let re = Regex::new(r"'([^']+)'\s+\((\d{4})\)").unwrap();
+ /// let text = b"Not my favorite movie: 'Citizen Kane' (1941).";
+ /// let caps = re.captures(text).unwrap();
+ /// assert_eq!(caps.get(1).unwrap().as_bytes(), &b"Citizen Kane"[..]);
+ /// assert_eq!(caps.get(2).unwrap().as_bytes(), &b"1941"[..]);
+ /// assert_eq!(caps.get(0).unwrap().as_bytes(), &b"'Citizen Kane' (1941)"[..]);
+ /// // You can also access the groups by index using the Index notation.
+ /// // Note that this will panic on an invalid index.
+ /// assert_eq!(&caps[1], b"Citizen Kane");
+ /// assert_eq!(&caps[2], b"1941");
+ /// assert_eq!(&caps[0], b"'Citizen Kane' (1941)");
+ /// # }
+ /// ```
+ ///
+ /// Note that the full match is at capture group `0`. Each subsequent
+ /// capture group is indexed by the order of its opening `(`.
+ ///
+ /// We can make this example a bit clearer by using *named* capture groups:
+ ///
+ /// ```rust
+ /// # use regex::bytes::Regex;
+ /// # fn main() {
+ /// let re = Regex::new(r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)")
+ /// .unwrap();
+ /// let text = b"Not my favorite movie: 'Citizen Kane' (1941).";
+ /// let caps = re.captures(text).unwrap();
+ /// assert_eq!(caps.name("title").unwrap().as_bytes(), b"Citizen Kane");
+ /// assert_eq!(caps.name("year").unwrap().as_bytes(), b"1941");
+ /// assert_eq!(caps.get(0).unwrap().as_bytes(), &b"'Citizen Kane' (1941)"[..]);
+ /// // You can also access the groups by name using the Index notation.
+ /// // Note that this will panic on an invalid group name.
+ /// assert_eq!(&caps["title"], b"Citizen Kane");
+ /// assert_eq!(&caps["year"], b"1941");
+ /// assert_eq!(&caps[0], b"'Citizen Kane' (1941)");
+ ///
+ /// # }
+ /// ```
+ ///
+ /// Here we name the capture groups, which we can access with the `name`
+ /// method or the `Index` notation with a `&str`. Note that the named
+ /// capture groups are still accessible with `get` or the `Index` notation
+ /// with a `usize`.
+ ///
+ /// The `0`th capture group is always unnamed, so it must always be
+ /// accessed with `get(0)` or `[0]`.
+ pub fn captures<'t>(&self, text: &'t [u8]) -> Option<Captures<'t>> {
+     let mut locs = self.capture_locations();
+     // Bail out with `None` when there is no match at all.
+     self.captures_read_at(&mut locs, text, 0)?;
+     let named_groups = self.0.capture_name_idx().clone();
+     Some(Captures { text, locs: locs.0, named_groups })
+ }
+
+ /// Returns an iterator over all the non-overlapping capture groups matched
+ /// in `text`. This is operationally the same as `find_iter`, except it
+ /// yields information about capturing group matches.
+ ///
+ /// # Example
+ ///
+ /// We can use this to find all movie titles and their release years in
+ /// some text, where the movie is formatted like "'Title' (xxxx)":
+ ///
+ /// ```rust
+ /// # use std::str; use regex::bytes::Regex;
+ /// # fn main() {
+ /// let re = Regex::new(r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)")
+ /// .unwrap();
+ /// let text = b"'Citizen Kane' (1941), 'The Wizard of Oz' (1939), 'M' (1931).";
+ /// for caps in re.captures_iter(text) {
+ /// let title = str::from_utf8(&caps["title"]).unwrap();
+ /// let year = str::from_utf8(&caps["year"]).unwrap();
+ /// println!("Movie: {:?}, Released: {:?}", title, year);
+ /// }
+ /// // Output:
+ /// // Movie: Citizen Kane, Released: 1941
+ /// // Movie: The Wizard of Oz, Released: 1939
+ /// // Movie: M, Released: 1931
+ /// # }
+ /// ```
+ pub fn captures_iter<'r, 't>(
+     &'r self,
+     text: &'t [u8],
+ ) -> CaptureMatches<'r, 't> {
+     let inner = self.0.searcher().captures_iter(text);
+     CaptureMatches(inner)
+ }
+
+ /// Returns an iterator of substrings of `text` delimited by a match of the
+ /// regular expression. Namely, each element of the iterator corresponds to
+ /// text that *isn't* matched by the regular expression.
+ ///
+ /// This method will *not* copy the text given.
+ ///
+ /// # Example
+ ///
+ /// To split a string delimited by arbitrary amounts of spaces or tabs:
+ ///
+ /// ```rust
+ /// # use regex::bytes::Regex;
+ /// # fn main() {
+ /// let re = Regex::new(r"[ \t]+").unwrap();
+ /// let fields: Vec<&[u8]> = re.split(b"a b \t c\td e").collect();
+ /// assert_eq!(fields, vec![
+ /// &b"a"[..], &b"b"[..], &b"c"[..], &b"d"[..], &b"e"[..],
+ /// ]);
+ /// # }
+ /// ```
+ pub fn split<'r, 't>(&'r self, text: &'t [u8]) -> Split<'r, 't> {
+     // `last` starts at offset zero of the haystack.
+     let finder = self.find_iter(text);
+     Split { finder, last: 0 }
+ }
+
+ /// Returns an iterator of at most `limit` substrings of `text` delimited
+ /// by a match of the regular expression. (A `limit` of `0` will return no
+ /// substrings.) Namely, each element of the iterator corresponds to text
+ /// that *isn't* matched by the regular expression. The remainder of the
+ /// string that is not split will be the last element in the iterator.
+ ///
+ /// This method will *not* copy the text given.
+ ///
+ /// # Example
+ ///
+ /// Get the first two words in some text:
+ ///
+ /// ```rust
+ /// # use regex::bytes::Regex;
+ /// # fn main() {
+ /// let re = Regex::new(r"\W+").unwrap();
+ /// let fields: Vec<&[u8]> = re.splitn(b"Hey! How are you?", 3).collect();
+ /// assert_eq!(fields, vec![&b"Hey"[..], &b"How"[..], &b"are you?"[..]]);
+ /// # }
+ /// ```
+ pub fn splitn<'r, 't>(
+     &'r self,
+     text: &'t [u8],
+     limit: usize,
+ ) -> SplitN<'r, 't> {
+     // Wrap the unbounded splitter together with the caller's limit.
+     let splits = self.split(text);
+     SplitN { splits, n: limit }
+ }
+
+ /// Replaces the leftmost-first match with the replacement provided. The
+ /// replacement can be a regular byte string (where `$N` and `$name` are
+ /// expanded to match capture groups) or a function that takes the matches'
+ /// `Captures` and returns the replaced byte string.
+ ///
+ /// If no match is found, then a copy of the byte string is returned
+ /// unchanged.
+ ///
+ /// # Replacement string syntax
+ ///
+ /// All instances of `$name` in the replacement text are replaced with the
+ /// corresponding capture group `name`.
+ ///
+ /// `name` may be an integer corresponding to the index of the
+ /// capture group (counted by order of opening parenthesis where `0` is the
+ /// entire match) or it can be a name (consisting of letters, digits or
+ /// underscores) corresponding to a named capture group.
+ ///
+ /// If `name` isn't a valid capture group (whether the name doesn't exist
+ /// or isn't a valid index), then it is replaced with the empty string.
+ ///
+ /// The longest possible name is used. e.g., `$1a` looks up the capture
+ /// group named `1a` and not the capture group at index `1`. To exert more
+ /// precise control over the name, use braces, e.g., `${1}a`.
+ ///
+ /// To write a literal `$` use `$$`.
+ ///
+ /// # Examples
+ ///
+ /// Note that this function is polymorphic with respect to the replacement.
+ /// In typical usage, this can just be a normal byte string:
+ ///
+ /// ```rust
+ /// # use regex::bytes::Regex;
+ /// # fn main() {
+ /// let re = Regex::new("[^01]+").unwrap();
+ /// assert_eq!(re.replace(b"1078910", &b""[..]), &b"1010"[..]);
+ /// # }
+ /// ```
+ ///
+ /// But anything satisfying the `Replacer` trait will work. For example, a
+ /// closure of type `|&Captures| -> Vec<u8>` provides direct access to the
+ /// captures corresponding to a match. This allows one to access capturing
+ /// group matches easily:
+ ///
+ /// ```rust
+ /// # use regex::bytes::Regex;
+ /// # use regex::bytes::Captures; fn main() {
+ /// let re = Regex::new(r"([^,\s]+),\s+(\S+)").unwrap();
+ /// let result = re.replace(b"Springsteen, Bruce", |caps: &Captures| {
+ /// let mut replacement = caps[2].to_owned();
+ /// replacement.push(b' ');
+ /// replacement.extend(&caps[1]);
+ /// replacement
+ /// });
+ /// assert_eq!(result, &b"Bruce Springsteen"[..]);
+ /// # }
+ /// ```
+ ///
+ /// But this is a bit cumbersome to use all the time. Instead, a simple
+ /// syntax is supported that expands `$name` into the corresponding capture
+ /// group. Here's the last example, but using this expansion technique
+ /// with named capture groups:
+ ///
+ /// ```rust
+ /// # use regex::bytes::Regex;
+ /// # fn main() {
+ /// let re = Regex::new(r"(?P<last>[^,\s]+),\s+(?P<first>\S+)").unwrap();
+ /// let result = re.replace(b"Springsteen, Bruce", &b"$first $last"[..]);
+ /// assert_eq!(result, &b"Bruce Springsteen"[..]);
+ /// # }
+ /// ```
+ ///
+ /// Note that using `$2` instead of `$first` or `$1` instead of `$last`
+ /// would produce the same result. To write a literal `$` use `$$`.
+ ///
+ /// Sometimes the replacement string requires use of curly braces to
+ /// delineate a capture group replacement and surrounding literal text.
+ /// For example, if we wanted to join two words together with an
+ /// underscore:
+ ///
+ /// ```rust
+ /// # use regex::bytes::Regex;
+ /// # fn main() {
+ /// let re = Regex::new(r"(?P<first>\w+)\s+(?P<second>\w+)").unwrap();
+ /// let result = re.replace(b"deep fried", &b"${first}_$second"[..]);
+ /// assert_eq!(result, &b"deep_fried"[..]);
+ /// # }
+ /// ```
+ ///
+ /// Without the curly braces, the capture group name `first_` would be
+ /// used, and since it doesn't exist, it would be replaced with the empty
+ /// string.
+ ///
+ /// Finally, sometimes you just want to replace a literal string with no
+ /// regard for capturing group expansion. This can be done by wrapping a
+ /// byte string with `NoExpand`:
+ ///
+ /// ```rust
+ /// # use regex::bytes::Regex;
+ /// # fn main() {
+ /// use regex::bytes::NoExpand;
+ ///
+ /// let re = Regex::new(r"(?P<last>[^,\s]+),\s+(\S+)").unwrap();
+ /// let result = re.replace(b"Springsteen, Bruce", NoExpand(b"$2 $last"));
+ /// assert_eq!(result, &b"$2 $last"[..]);
+ /// # }
+ /// ```
+ pub fn replace<'t, R: Replacer>(
+     &self,
+     text: &'t [u8],
+     rep: R,
+ ) -> Cow<'t, [u8]> {
+     // Rewrite at most one match: the first one found.
+     let limit = 1;
+     self.replacen(text, limit, rep)
+ }
+
+ /// Replaces all non-overlapping matches in `text` with the replacement
+ /// provided. This is the same as calling `replacen` with `limit` set to
+ /// `0`.
+ ///
+ /// See the documentation for `replace` for details on how to access
+ /// capturing group matches in the replacement text.
+ pub fn replace_all<'t, R: Replacer>(
+     &self,
+     text: &'t [u8],
+     rep: R,
+ ) -> Cow<'t, [u8]> {
+     // `replacen` treats a limit of 0 as "replace every match".
+     let limit = 0;
+     self.replacen(text, limit, rep)
+ }
+
+ /// Replaces at most `limit` non-overlapping matches in `text` with the
+ /// replacement provided. If `limit` is 0, then all non-overlapping matches
+ /// are replaced.
+ ///
+ /// See the documentation for `replace` for details on how to access
+ /// capturing group matches in the replacement text.
+ pub fn replacen<'t, R: Replacer>(
+ &self,
+ text: &'t [u8],
+ limit: usize,
+ mut rep: R,
+ ) -> Cow<'t, [u8]> {
+ // Fast path: the replacer reports a fixed replacement, so only match
+ // spans are needed and the captures iterator is not used.
+ if let Some(rep) = rep.no_expansion() {
+ let mut it = self.find_iter(text).enumerate().peekable();
+ // No match at all: hand the input back without allocating.
+ if it.peek().is_none() {
+ return Cow::Borrowed(text);
+ }
+ let mut new = Vec::with_capacity(text.len());
+ // End offset of the previous match; text before it is already copied.
+ let mut last_match = 0;
+ for (i, m) in it {
+ // A limit of 0 means "no limit"; otherwise stop after `limit` matches.
+ if limit > 0 && i >= limit {
+ break;
+ }
+ // Copy the unmatched gap, then the fixed replacement.
+ new.extend_from_slice(&text[last_match..m.start()]);
+ new.extend_from_slice(&rep);
+ last_match = m.end();
+ }
+ // Append whatever follows the last replaced match.
+ new.extend_from_slice(&text[last_match..]);
+ return Cow::Owned(new);
+ }
+
+ // The slower path, which we use if the replacement needs access to
+ // capture groups.
+ let mut it = self.captures_iter(text).enumerate().peekable();
+ // As above, an input without matches is returned borrowed.
+ if it.peek().is_none() {
+ return Cow::Borrowed(text);
+ }
+ let mut new = Vec::with_capacity(text.len());
+ let mut last_match = 0;
+ for (i, cap) in it {
+ // Same limit rule as the fast path: 0 disables the cap.
+ if limit > 0 && i >= limit {
+ break;
+ }
+ // unwrap on 0 is OK because captures only reports matches
+ let m = cap.get(0).unwrap();
+ new.extend_from_slice(&text[last_match..m.start()]);
+ // Let the replacer write its output for this match into `new`.
+ rep.replace_append(&cap, &mut new);
+ last_match = m.end();
+ }
+ new.extend_from_slice(&text[last_match..]);
+ Cow::Owned(new)
+ }
+}
+
+ /// Lower-level search routines, most of which accept an explicit start offset.
+ impl Regex {
+     /// Reports the end offset of a match in `text`, if there is one.
+     ///
+     /// This search may perform like `is_match` while additionally yielding
+     /// an end position. Be aware that the position reported *may be shorter*
+     /// than the true end of the leftmost-first match.
+     ///
+     /// # Example
+     ///
+     /// Ordinarily `a+` consumes the whole first run of `a` in the text, but
+     /// `shortest_match` is free to stop right after the first `a`.
+     ///
+     /// ```rust
+     /// # use regex::bytes::Regex;
+     /// # fn main() {
+     /// let text = b"aaaaa";
+     /// let pos = Regex::new(r"a+").unwrap().shortest_match(text);
+     /// assert_eq!(pos, Some(1));
+     /// # }
+     /// ```
+     pub fn shortest_match(&self, text: &[u8]) -> Option<usize> {
+         let start = 0;
+         self.shortest_match_at(text, start)
+     }
+
+     /// Like `shortest_match`, but the search begins at offset `start`.
+     ///
+     /// The start offset matters because the text surrounding it is still
+     /// taken into account. For example, the `\A` anchor can only match when
+     /// `start == 0`.
+     pub fn shortest_match_at(
+         &self,
+         text: &[u8],
+         start: usize,
+     ) -> Option<usize> {
+         let searcher = self.0.searcher();
+         searcher.shortest_match_at(text, start)
+     }
+
+     /// Like `is_match`, but the search begins at offset `start`.
+     ///
+     /// The start offset matters because the text surrounding it is still
+     /// taken into account. For example, the `\A` anchor can only match when
+     /// `start == 0`.
+     pub fn is_match_at(&self, text: &[u8], start: usize) -> bool {
+         let searcher = self.0.searcher();
+         searcher.is_match_at(text, start)
+     }
+
+     /// Like `find`, but the search begins at offset `start`.
+     ///
+     /// The start offset matters because the text surrounding it is still
+     /// taken into account. For example, the `\A` anchor can only match when
+     /// `start == 0`.
+     pub fn find_at<'t>(
+         &self,
+         text: &'t [u8],
+         start: usize,
+     ) -> Option<Match<'t>> {
+         let (s, e) = self.0.searcher().find_at(text, start)?;
+         Some(Match::new(text, s, e))
+     }
+
+     /// A variant of `captures` that writes into a caller-provided
+     /// [`CaptureLocations`](struct.CaptureLocations.html) rather than
+     /// building a [`Captures`](struct.Captures.html), so that allocations
+     /// can be amortized across searches.
+     ///
+     /// Use the `Regex::capture_locations` method to create a
+     /// `CaptureLocations` value.
+     ///
+     /// On success the overall match is returned, which is always
+     /// equivalent to the `0`th capture group.
+     pub fn captures_read<'t>(
+         &self,
+         locs: &mut CaptureLocations,
+         text: &'t [u8],
+     ) -> Option<Match<'t>> {
+         let start = 0;
+         self.captures_read_at(locs, text, start)
+     }
+
+     /// Like `captures_read`, but the search begins at offset `start`; the
+     /// capture locations given are populated by the search.
+     ///
+     /// The start offset matters because the text surrounding it is still
+     /// taken into account. For example, the `\A` anchor can only match when
+     /// `start == 0`.
+     pub fn captures_read_at<'t>(
+         &self,
+         locs: &mut CaptureLocations,
+         text: &'t [u8],
+         start: usize,
+     ) -> Option<Match<'t>> {
+         let searcher = self.0.searcher();
+         let (s, e) = searcher.captures_read_at(&mut locs.0, text, start)?;
+         Some(Match::new(text, s, e))
+     }
+
+     /// An undocumented alias for `captures_read_at`.
+     ///
+     /// This name was previously used by the `regex-capi` crate, so it is
+     /// kept as an undocumented alias to avoid breaking that crate.
+     #[doc(hidden)]
+     pub fn read_captures_at<'t>(
+         &self,
+         locs: &mut CaptureLocations,
+         text: &'t [u8],
+         start: usize,
+     ) -> Option<Match<'t>> {
+         self.captures_read_at(locs, text, start)
+     }
+ }
+
+ /// Accessors for metadata about the compiled regex.
+ impl Regex {
+     /// Returns the pattern string this regex was built from.
+     pub fn as_str(&self) -> &str {
+         let patterns = self.0.regex_strings();
+         &patterns[0]
+     }
+
+     /// Returns an iterator over the names of the capture groups.
+     pub fn capture_names(&self) -> CaptureNames<'_> {
+         let names = self.0.capture_names();
+         CaptureNames(names.iter())
+     }
+
+     /// Returns how many capture groups this regex has.
+     pub fn captures_len(&self) -> usize {
+         let names = self.0.capture_names();
+         names.len()
+     }
+
+     /// Creates an empty set of capture locations that may be reused across
+     /// several calls to `captures_read` or `captures_read_at`.
+     pub fn capture_locations(&self) -> CaptureLocations {
+         let locs = self.0.searcher().locations();
+         CaptureLocations(locs)
+     }
+
+     /// An alias for `capture_locations`, kept for backward compatibility.
+     ///
+     /// The `regex-capi` crate calls this method, so it remains exported as
+     /// an undocumented API to avoid breaking that crate.
+     #[doc(hidden)]
+     pub fn locations(&self) -> CaptureLocations {
+         self.capture_locations()
+     }
+ }
+
+ /// An iterator over every non-overlapping match in a particular byte string.
+ ///
+ /// Each item is a `Match` built from the start and end byte offsets of the
+ /// match within the searched text. Iteration ends once no further match can
+ /// be found.
+ ///
+ /// `'r` is the lifetime of the compiled regular expression and `'t` is the
+ /// lifetime of the matched byte string.
+ #[derive(Debug)]
+ pub struct Matches<'r, 't>(re_trait::Matches<'t, ExecNoSync<'r>>);
+
+ impl<'r, 't> Iterator for Matches<'r, 't> {
+     type Item = Match<'t>;
+
+     fn next(&mut self) -> Option<Match<'t>> {
+         // Grab the haystack first; advancing the inner iterator needs `&mut`.
+         let haystack = self.0.text();
+         let (start, end) = self.0.next()?;
+         Some(Match::new(haystack, start, end))
+     }
+ }
+
+ impl<'r, 't> FusedIterator for Matches<'r, 't> {}
+
+ /// An iterator producing the capture groups of each non-overlapping match of
+ /// a particular regular expression.
+ ///
+ /// Iteration ends once no further match can be found.
+ ///
+ /// `'r` is the lifetime of the compiled regular expression and `'t` is the
+ /// lifetime of the matched byte string.
+ #[derive(Debug)]
+ pub struct CaptureMatches<'r, 't>(
+     re_trait::CaptureMatches<'t, ExecNoSync<'r>>,
+ );
+
+ impl<'r, 't> Iterator for CaptureMatches<'r, 't> {
+     type Item = Captures<'t>;
+
+     fn next(&mut self) -> Option<Captures<'t>> {
+         let locs = self.0.next()?;
+         // Pair the raw locations with the text and the name-to-index table.
+         let text = self.0.text();
+         let named_groups = self.0.regex().capture_name_idx().clone();
+         Some(Captures { text, locs, named_groups })
+     }
+ }
+
+ impl<'r, 't> FusedIterator for CaptureMatches<'r, 't> {}
+
+ /// Iterator over the pieces of a byte string that lie between regex matches.
+ ///
+ /// `'r` is the lifetime of the compiled regular expression and `'t` is the
+ /// lifetime of the byte string being split.
+ #[derive(Debug)]
+ pub struct Split<'r, 't> {
+     // Source of the delimiter matches.
+     finder: Matches<'r, 't>,
+     // End offset of the previous delimiter. Set to `text.len() + 1` once the
+     // trailing piece has been yielded, which marks the iterator as finished.
+     last: usize,
+ }
+
+ impl<'r, 't> Iterator for Split<'r, 't> {
+     type Item = &'t [u8];
+
+     fn next(&mut self) -> Option<&'t [u8]> {
+         let text = self.finder.0.text();
+         if let Some(m) = self.finder.next() {
+             // The piece runs from the previous delimiter up to this one.
+             let piece = &text[self.last..m.start()];
+             self.last = m.end();
+             return Some(piece);
+         }
+         // No delimiters remain: yield the tail exactly once.
+         if self.last > text.len() {
+             return None;
+         }
+         let tail = &text[self.last..];
+         self.last = text.len() + 1; // Next call will return None
+         Some(tail)
+     }
+ }
+
+ impl<'r, 't> FusedIterator for Split<'r, 't> {}
+
+ /// Iterator over at most `N` pieces of a byte string delimited by regex
+ /// matches.
+ ///
+ /// Whatever is left after splitting becomes the final piece.
+ ///
+ /// `'r` is the lifetime of the compiled regular expression and `'t` is the
+ /// lifetime of the byte string being split.
+ #[derive(Debug)]
+ pub struct SplitN<'r, 't> {
+     // Underlying unlimited splitter.
+     splits: Split<'r, 't>,
+     // Number of pieces that may still be yielded.
+     n: usize,
+ }
+
+ impl<'r, 't> Iterator for SplitN<'r, 't> {
+     type Item = &'t [u8];
+
+     fn next(&mut self) -> Option<&'t [u8]> {
+         match self.n {
+             // Budget exhausted.
+             0 => None,
+             // Last permitted piece: return everything not yet split.
+             1 => {
+                 self.n = 0;
+                 let text = self.splits.finder.0.text();
+                 // `get` is `None` exactly when `last` lies past the end of
+                 // `text`, which is how `Split` marks that its trailing piece
+                 // was already returned.
+                 text.get(self.splits.last..)
+             }
+             _ => {
+                 self.n -= 1;
+                 self.splits.next()
+             }
+         }
+     }
+
+     fn size_hint(&self) -> (usize, Option<usize>) {
+         let upper = Some(self.n);
+         (0, upper)
+     }
+ }
+
+ impl<'r, 't> FusedIterator for SplitN<'r, 't> {}
+
+ /// An iterator over the name of every capture group in a regex.
+ ///
+ /// An unnamed capture is reported as `None`; the first element (capture 0,
+ /// the whole matched region) is always unnamed.
+ ///
+ /// `'r` is the lifetime of the compiled regular expression.
+ #[derive(Clone, Debug)]
+ pub struct CaptureNames<'r>(::std::slice::Iter<'r, Option<String>>);
+
+ impl<'r> Iterator for CaptureNames<'r> {
+     type Item = Option<&'r str>;
+
+     fn next(&mut self) -> Option<Option<&'r str>> {
+         let slot = self.0.next()?;
+         // Borrow the stored name, if any, as a `&str`.
+         Some(slot.as_ref().map(|name| name.as_str()))
+     }
+
+     fn size_hint(&self) -> (usize, Option<usize>) {
+         self.0.size_hint()
+     }
+
+     fn count(self) -> usize {
+         let CaptureNames(inner) = self;
+         inner.count()
+     }
+ }
+
+ impl<'r> ExactSizeIterator for CaptureNames<'r> {}
+
+ impl<'r> FusedIterator for CaptureNames<'r> {}
+
+ /// CaptureLocations holds the raw offsets of each submatch at a low level.
+ ///
+ /// It can be thought of as a lower level
+ /// [`Captures`](struct.Captures.html): it has no direct support for named
+ /// capturing groups and it does not borrow the text on which the offsets
+ /// were matched.
+ ///
+ /// This type is mainly useful together with the lower level `Regex` APIs
+ /// such as `captures_read`, which make it possible to amortize the
+ /// allocation that stores capture match locations.
+ ///
+ /// To obtain a value of this type, call the `capture_locations` method on
+ /// the `Regex` that will run the search. The returned value can then be
+ /// reused for later searches.
+ #[derive(Clone, Debug)]
+ pub struct CaptureLocations(re_trait::Locations);
+
+ /// A backwards-compatible type alias for `CaptureLocations`.
+ ///
+ /// `CaptureLocations` used to be exported as `Locations` in an undocumented
+ /// API. So that such code (e.g., in `regex-capi`) keeps working, the same
+ /// undocumented API is still re-exported.
+ #[doc(hidden)]
+ pub type Locations = CaptureLocations;
+
+ impl CaptureLocations {
+     /// Returns the start and end positions of the Nth capture group, or
+     /// `None` when `i` is not a valid capture group or the group did not
+     /// match anything. The positions are *always* byte indices into the
+     /// original string matched.
+     #[inline]
+     pub fn get(&self, i: usize) -> Option<(usize, usize)> {
+         let CaptureLocations(locs) = self;
+         locs.pos(i)
+     }
+
+     /// Returns the total number of capture groups, including ones that did
+     /// not match.
+     ///
+     /// The result is always at least `1`, because every regex has at least
+     /// `1` capturing group corresponding to the entire match.
+     #[inline]
+     pub fn len(&self) -> usize {
+         let CaptureLocations(locs) = self;
+         locs.len()
+     }
+
+     /// A backwards-compatible alias for the `get` method.
+     ///
+     /// `get` used to be exported as `pos` in an undocumented API. So that
+     /// such code (e.g., in `regex-capi`) keeps working, the same
+     /// undocumented API is still re-exported.
+     #[doc(hidden)]
+     #[inline]
+     pub fn pos(&self, i: usize) -> Option<(usize, usize)> {
+         self.0.pos(i)
+     }
+ }
+
+ /// Captures holds the byte strings captured by the groups of a single match.
+ ///
+ /// Capture 0 is always the entire match, and each following index is the
+ /// next capture group in the regex. A named capture group can *also* be
+ /// looked up with the `name` method. (The 0th capture is always unnamed, so
+ /// it has to be fetched with the `get` method.)
+ ///
+ /// Positions returned from a capture group are always byte indices.
+ ///
+ /// `'t` is the lifetime of the matched text.
+ pub struct Captures<'t> {
+     // The text that was searched.
+     text: &'t [u8],
+     // Offsets of every capture group for this match.
+     locs: re_trait::Locations,
+     // Maps a group name to its capture index.
+     named_groups: Arc<HashMap<String, usize>>,
+ }
+
+ impl<'t> Captures<'t> {
+     /// Returns the match for the capture group at index `i`. `None` is
+     /// returned when `i` is not a capture group of the regex, or when that
+     /// group did not participate in the match.
+     ///
+     /// # Examples
+     ///
+     /// Fetch the text of a group, falling back to an empty string when the
+     /// group didn't participate in the match:
+     ///
+     /// ```rust
+     /// # use regex::bytes::Regex;
+     /// let re = Regex::new(r"[a-z]+(?:([0-9]+)|([A-Z]+))").unwrap();
+     /// let caps = re.captures(b"abc123").unwrap();
+     ///
+     /// let text1 = caps.get(1).map_or(&b""[..], |m| m.as_bytes());
+     /// let text2 = caps.get(2).map_or(&b""[..], |m| m.as_bytes());
+     /// assert_eq!(text1, &b"123"[..]);
+     /// assert_eq!(text2, &b""[..]);
+     /// ```
+     pub fn get(&self, i: usize) -> Option<Match<'t>> {
+         let (s, e) = self.locs.pos(i)?;
+         Some(Match::new(self.text, s, e))
+     }
+
+     /// Returns the match for the capture group called `name`, or `None` if
+     /// `name` isn't a valid capture group or didn't match anything.
+     pub fn name(&self, name: &str) -> Option<Match<'t>> {
+         let &idx = self.named_groups.get(name)?;
+         self.get(idx)
+     }
+
+     /// Returns an iterator over all capturing matches, in the order the
+     /// groups appear in the regex. A group that didn't participate in the
+     /// match is yielded as `None`.
+     ///
+     /// The first item is always the overall match of the regex.
+     pub fn iter<'c>(&'c self) -> SubCaptureMatches<'c, 't> {
+         let it = self.locs.iter();
+         SubCaptureMatches { caps: self, it }
+     }
+
+     /// Expands every `$name` in `replacement` to the corresponding capture
+     /// group `name` and writes the result to the `dst` buffer given.
+     ///
+     /// `name` may be an integer giving the index of the capture group
+     /// (counted by order of opening parenthesis, where `0` is the entire
+     /// match) or a name (made of letters, digits or underscores) of a named
+     /// capture group.
+     ///
+     /// If `name` isn't a valid capture group (the name doesn't exist or the
+     /// index isn't valid), it is replaced with the empty string.
+     ///
+     /// The longest possible name made of the characters `[_0-9A-Za-z]` is
+     /// used. e.g., `$1a` looks up the capture group named `1a` and not the
+     /// capture group at index `1`. For more precise control over the name,
+     /// or to refer to a capture group name containing characters outside of
+     /// `[_0-9A-Za-z]`, use braces, e.g., `${1}a` or `${foo[bar].baz}`.
+     /// Inside braces, any sequence of valid UTF-8 bytes is permitted. If the
+     /// sequence does not name a capture group in the corresponding regex,
+     /// it is replaced with an empty string.
+     ///
+     /// To write a literal `$` use `$$`.
+     pub fn expand(&self, replacement: &[u8], dst: &mut Vec<u8>) {
+         expand_bytes(self, replacement, dst)
+     }
+
+     /// Returns the total number of capture groups, including ones that did
+     /// not match.
+     ///
+     /// The result is always at least `1`, since every regex has at least
+     /// one capture group corresponding to the full match.
+     #[inline]
+     pub fn len(&self) -> usize {
+         let locs = &self.locs;
+         locs.len()
+     }
+ }
+
+ impl<'t> fmt::Debug for Captures<'t> {
+     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+         let inner = CapturesDebug(self);
+         f.debug_tuple("Captures").field(&inner).finish()
+     }
+ }
+
+ // Helper that renders a `Captures` as a map from group name (or index, for
+ // unnamed groups) to the escaped matched bytes.
+ struct CapturesDebug<'c, 't>(&'c Captures<'t>);
+
+ impl<'c, 't> fmt::Debug for CapturesDebug<'c, 't> {
+     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+         // Escapes one byte using `std::ascii::escape_default`.
+         fn escape_byte(byte: u8) -> String {
+             use std::ascii::escape_default;
+
+             let escaped: Vec<u8> = escape_default(byte).collect();
+             String::from_utf8_lossy(&escaped).into_owned()
+         }
+
+         // Escapes each byte in turn and concatenates the results.
+         fn escape_bytes(bytes: &[u8]) -> String {
+             bytes.iter().map(|&b| escape_byte(b)).collect()
+         }
+
+         let caps = self.0;
+         // We'd like to show something nice here, even if it means an
+         // allocation to build a reverse index.
+         let slot_to_name: HashMap<&usize, &String> = caps
+             .named_groups
+             .iter()
+             .map(|(name, slot)| (slot, name))
+             .collect();
+         let mut map = f.debug_map();
+         for (slot, span) in caps.locs.iter().enumerate() {
+             let shown = span.map(|(s, e)| escape_bytes(&caps.text[s..e]));
+             // Key the entry by name when the group has one, else by index.
+             match slot_to_name.get(&slot) {
+                 Some(name) => map.entry(&name, &shown),
+                 None => map.entry(&slot, &shown),
+             };
+         }
+         map.finish()
+     }
+ }
+
+ /// Look up a group by its index.
+ ///
+ /// `'t` is the lifetime of the matched text.
+ ///
+ /// When this method is used the text can't outlive the `Captures` object,
+ /// because of how `Index` is defined (normally `a[i]` is part of `a` and
+ /// can't outlive it); use `get()` instead if the text must outlive it.
+ ///
+ /// # Panics
+ ///
+ /// If there is no group at the given index.
+ impl<'t> Index<usize> for Captures<'t> {
+     type Output = [u8];
+
+     fn index(&self, i: usize) -> &[u8] {
+         match self.get(i) {
+             Some(m) => m.as_bytes(),
+             None => panic!("no group at index '{}'", i),
+         }
+     }
+ }
+
+ /// Look up a group by its name.
+ ///
+ /// `'t` is the lifetime of the matched text and `'i` is the lifetime
+ /// of the group name (the index).
+ ///
+ /// When this method is used the text can't outlive the `Captures` object,
+ /// because of how `Index` is defined (normally `a[i]` is part of `a` and
+ /// can't outlive it); use `name` instead if the text must outlive it.
+ ///
+ /// # Panics
+ ///
+ /// If there is no group named by the given value.
+ impl<'t, 'i> Index<&'i str> for Captures<'t> {
+     type Output = [u8];
+
+     fn index<'a>(&'a self, name: &'i str) -> &'a [u8] {
+         match self.name(name) {
+             Some(m) => m.as_bytes(),
+             None => panic!("no group named '{}'", name),
+         }
+     }
+ }
+
+ /// An iterator over every capture group of a match, in the order the groups
+ /// appear in the regex.
+ ///
+ /// A capture group that didn't participate in the match is yielded as
+ /// `None`. The first item always corresponds to the overall match of the
+ /// regex.
+ ///
+ /// The lifetime `'c` corresponds to the lifetime of the `Captures` value, and
+ /// the lifetime `'t` corresponds to the originally matched text.
+ #[derive(Clone, Debug)]
+ pub struct SubCaptureMatches<'c, 't> {
+     // The captures whose text the yielded matches refer to.
+     caps: &'c Captures<'t>,
+     // Iterator over the raw (start, end) offsets of each group.
+     it: SubCapturesPosIter<'c>,
+ }
+
+ impl<'c, 't> Iterator for SubCaptureMatches<'c, 't> {
+     type Item = Option<Match<'t>>;
+
+     fn next(&mut self) -> Option<Option<Match<'t>>> {
+         let span = self.it.next()?;
+         let text = self.caps.text;
+         Some(span.map(|(s, e)| Match::new(text, s, e)))
+     }
+ }
+
+ impl<'c, 't> FusedIterator for SubCaptureMatches<'c, 't> {}
+
+ /// Replacer describes types that can be used to replace matches in a byte
+ /// string.
+ ///
+ /// In general, users of this crate shouldn't need to implement this trait,
+ /// since implementations are already provided for `&[u8]` along with other
+ /// variants of bytes types and `FnMut(&Captures) -> Vec<u8>` (or any
+ /// `FnMut(&Captures) -> T` where `T: AsRef<[u8]>`), which covers most use cases.
+ pub trait Replacer {
+ /// Appends text to `dst` to replace the current match.
+ ///
+ /// The current match is represented by `caps`, which is guaranteed to
+ /// have a match at capture group `0`.
+ ///
+ /// For example, a no-op replacement would be
+ /// `dst.extend(&caps[0])`.
+ fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>);
+
+ /// Return a fixed unchanging replacement byte string.
+ ///
+ /// When doing replacements, if access to `Captures` is not needed (e.g.,
+ /// the replacement byte string does not need `$` expansion), then it can
+ /// be beneficial to avoid finding sub-captures.
+ ///
+ /// In general, this is called once for every call to `replacen`.
+ ///
+ /// The default implementation returns `None`, meaning no fixed
+ /// replacement is available.
+ fn no_expansion<'r>(&'r mut self) -> Option<Cow<'r, [u8]>> {
+ None
+ }
+
+ /// Return a `Replacer` that borrows and wraps this `Replacer`.
+ ///
+ /// This is useful when you want to take a generic `Replacer` (which might
+ /// not be cloneable) and use it without consuming it, so it can be used
+ /// more than once.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::bytes::{Regex, Replacer};
+ ///
+ /// fn replace_all_twice<R: Replacer>(
+ /// re: Regex,
+ /// src: &[u8],
+ /// mut rep: R,
+ /// ) -> Vec<u8> {
+ /// let dst = re.replace_all(src, rep.by_ref());
+ /// let dst = re.replace_all(&dst, rep.by_ref());
+ /// dst.into_owned()
+ /// }
+ /// ```
+ fn by_ref<'r>(&'r mut self) -> ReplacerRef<'r, Self> {
+ // Wrap the mutable borrow; the wrapper is itself a `Replacer`.
+ ReplacerRef(self)
+ }
+ }
+
+ /// Adaptor that lets a `Replacer` be used through a mutable reference.
+ ///
+ /// Returned by [`Replacer::by_ref`](trait.Replacer.html#method.by_ref).
+ #[derive(Debug)]
+ pub struct ReplacerRef<'a, R: ?Sized>(&'a mut R);
+
+ impl<'a, R: Replacer + ?Sized + 'a> Replacer for ReplacerRef<'a, R> {
+     // Both methods forward to the wrapped replacer.
+     fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) {
+         let ReplacerRef(inner) = self;
+         inner.replace_append(caps, dst)
+     }
+     fn no_expansion<'r>(&'r mut self) -> Option<Cow<'r, [u8]>> {
+         let ReplacerRef(inner) = self;
+         inner.no_expansion()
+     }
+ }
+
+ // A byte slice acts as a replacement template with `$name` expansion.
+ impl<'a> Replacer for &'a [u8] {
+     fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) {
+         let template: &[u8] = *self;
+         caps.expand(template, dst);
+     }
+
+     fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> {
+         // Calls the free function `no_expansion`, not this method.
+         no_expansion(self)
+     }
+ }
+
+ // A borrowed `Vec<u8>` acts as a replacement template with `$name` expansion.
+ impl<'a> Replacer for &'a Vec<u8> {
+     fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) {
+         let template: &[u8] = self.as_slice();
+         caps.expand(template, dst);
+     }
+
+     fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> {
+         // Calls the free function `no_expansion`, not this method.
+         no_expansion(self)
+     }
+ }
+
+ // An owned `Vec<u8>` acts as a replacement template with `$name` expansion.
+ impl Replacer for Vec<u8> {
+     fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) {
+         let template: &[u8] = self.as_slice();
+         caps.expand(template, dst);
+     }
+
+     fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> {
+         // Calls the free function `no_expansion`, not this method.
+         no_expansion(self)
+     }
+ }
+
+ // A `Cow` of bytes acts as a replacement template with `$name` expansion.
+ impl<'a> Replacer for Cow<'a, [u8]> {
+     fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) {
+         let template: &[u8] = self.as_ref();
+         caps.expand(template, dst);
+     }
+
+     fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> {
+         // Calls the free function `no_expansion`, not this method.
+         no_expansion(self)
+     }
+ }
+
+ // A borrowed `Cow` of bytes acts as a replacement template with `$name`
+ // expansion.
+ impl<'a> Replacer for &'a Cow<'a, [u8]> {
+     fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) {
+         let template: &[u8] = self.as_ref();
+         caps.expand(template, dst);
+     }
+
+     fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> {
+         // Calls the free function `no_expansion`, not this method.
+         no_expansion(self)
+     }
+ }
+
+ // Returns the replacement borrowed as-is when it contains no `$` byte, and
+ // `None` when a `$` is present.
+ fn no_expansion<T: AsRef<[u8]>>(t: &T) -> Option<Cow<'_, [u8]>> {
+     let bytes = t.as_ref();
+     if find_byte(b'$', bytes).is_some() {
+         None
+     } else {
+         Some(Cow::Borrowed(bytes))
+     }
+ }
+
+ // Any closure from `&Captures` to something byte-like is a replacer: its
+ // return value is appended verbatim.
+ impl<F, T> Replacer for F
+ where
+     F: FnMut(&Captures<'_>) -> T,
+     T: AsRef<[u8]>,
+ {
+     fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) {
+         let produced = (*self)(caps);
+         dst.extend_from_slice(produced.as_ref());
+     }
+ }
+
+ /// `NoExpand` marks a replacement as a literal byte string.
+ ///
+ /// Use it with `replace` and `replace_all` to substitute a literal byte
+ /// string without expanding `$name` to the corresponding capture groups.
+ /// This can be both convenient (to avoid escaping `$`, for example) and
+ /// performant (since capture groups don't need to be found).
+ ///
+ /// `'t` is the lifetime of the literal text.
+ #[derive(Clone, Debug)]
+ pub struct NoExpand<'t>(pub &'t [u8]);
+
+ impl<'t> Replacer for NoExpand<'t> {
+     fn replace_append(&mut self, _: &Captures<'_>, dst: &mut Vec<u8>) {
+         // The captures are ignored; the literal is copied as-is.
+         let literal: &[u8] = self.0;
+         dst.extend_from_slice(literal);
+     }
+
+     fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> {
+         let literal: &[u8] = self.0;
+         Some(Cow::Borrowed(literal))
+     }
+ }
diff --git a/third_party/rust/regex/src/re_set.rs b/third_party/rust/regex/src/re_set.rs
new file mode 100644
index 0000000000..a6d886d761
--- /dev/null
+++ b/third_party/rust/regex/src/re_set.rs
@@ -0,0 +1,507 @@
+macro_rules! define_set {
+ ($name:ident, $builder_mod:ident, $text_ty:ty, $as_bytes:expr,
+ $(#[$doc_regexset_example:meta])* ) => {
+ pub mod $name {
+ use std::fmt;
+ use std::iter;
+ use std::slice;
+ use std::vec;
+
+ use crate::error::Error;
+ use crate::exec::Exec;
+ use crate::re_builder::$builder_mod::RegexSetBuilder;
+ use crate::re_trait::RegularExpression;
+
+/// Match multiple (possibly overlapping) regular expressions in a single scan.
+///
+/// A regex set corresponds to the union of two or more regular expressions.
+/// That is, a regex set will match text where at least one of its
+/// constituent regular expressions matches. A regex set as it's formulated here
+/// provides a touch more power: it will also report *which* regular
+/// expressions in the set match. Indeed, this is the key difference between
+/// regex sets and a single `Regex` with many alternates, since only one
+/// alternate can match at a time.
+///
+/// For example, consider regular expressions to match email addresses and
+/// domains: `[a-z]+@[a-z]+\.(com|org|net)` and `[a-z]+\.(com|org|net)`. If a
+/// regex set is constructed from those regexes, then searching the text
+/// `foo@example.com` will report both regexes as matching. Of course, one
+/// could accomplish this by compiling each regex on its own and doing two
+/// searches over the text. The key advantage of using a regex set is that it
+/// will report the matching regexes using a *single pass through the text*.
+/// If one has hundreds or thousands of regexes to match repeatedly (like a URL
+/// router for a complex web application or a user agent matcher), then a regex
+/// set can realize huge performance gains.
+///
+/// # Example
+///
+/// This shows how the above two regexes (for matching email addresses and
+/// domains) might work:
+///
+$(#[$doc_regexset_example])*
+///
+/// Note that it would be possible to adapt the above example to using `Regex`
+/// with an expression like:
+///
+/// ```text
+/// (?P<email>[a-z]+@(?P<email_domain>[a-z]+[.](com|org|net)))|(?P<domain>[a-z]+[.](com|org|net))
+/// ```
+///
+/// After a match, one could then inspect the capture groups to figure out
+/// which alternates matched. The problem is that it is hard to make this
+/// approach scale when there are many regexes since the overlap between each
+/// alternate isn't always obvious to reason about.
+///
+/// # Limitations
+///
+/// Regex sets are limited to answering the following two questions:
+///
+/// 1. Does any regex in the set match?
+/// 2. If so, which regexes in the set match?
+///
+/// As with the main [`Regex`][crate::Regex] type, it is cheaper to ask (1)
+/// instead of (2) since the matching engines can stop after the first match
+/// is found.
+///
+/// You cannot directly extract [`Match`][crate::Match] or
+/// [`Captures`][crate::Captures] objects from a regex set. If you need these
+/// operations, the recommended approach is to compile each pattern in the set
+/// independently and scan the exact same input a second time with those
+/// independently compiled patterns:
+///
+/// ```rust
+/// use regex::{Regex, RegexSet};
+///
+/// let patterns = ["foo", "bar"];
+/// // Both patterns will match different ranges of this string.
+/// let text = "barfoo";
+///
+/// // Compile a set matching any of our patterns.
+/// let set = RegexSet::new(&patterns).unwrap();
+/// // Compile each pattern independently.
+/// let regexes: Vec<_> = set.patterns().iter()
+/// .map(|pat| Regex::new(pat).unwrap())
+/// .collect();
+///
+/// // Match against the whole set first and identify the individual
+/// // matching patterns.
+/// let matches: Vec<&str> = set.matches(text).into_iter()
+/// // Dereference the match index to get the corresponding
+/// // compiled pattern.
+/// .map(|match_idx| &regexes[match_idx])
+/// // To get match locations or any other info, we then have to search
+/// // the exact same text again, using our separately-compiled pattern.
+/// .map(|pat| pat.find(text).unwrap().as_str())
+/// .collect();
+///
+/// // Matches arrive in the order the constituent patterns were declared,
+/// // not the order they appear in the input.
+/// assert_eq!(vec!["foo", "bar"], matches);
+/// ```
+///
+/// # Performance
+///
+/// A `RegexSet` has the same performance characteristics as `Regex`. Namely,
+/// search takes `O(mn)` time, where `m` is proportional to the size of the
+/// regex set and `n` is proportional to the length of the search text.
+#[derive(Clone)]
+pub struct RegexSet(Exec);
+
+impl RegexSet {
+ /// Create a new regex set with the given regular expressions.
+ ///
+ /// This takes an iterator of `S`, where `S` is something that can produce
+ /// a `&str`. If any of the strings in the iterator are not valid regular
+ /// expressions, then an error is returned.
+ ///
+ /// # Example
+ ///
+ /// Create a new regex set from an iterator of strings:
+ ///
+ /// ```rust
+ /// # use regex::RegexSet;
+ /// let set = RegexSet::new(&[r"\w+", r"\d+"]).unwrap();
+ /// assert!(set.is_match("foo"));
+ /// ```
+ pub fn new<I, S>(exprs: I) -> Result<RegexSet, Error>
+ where S: AsRef<str>, I: IntoIterator<Item=S> {
+ RegexSetBuilder::new(exprs).build()
+ }
+
+ /// Create a new empty regex set.
+ ///
+ /// # Example
+ ///
+ /// ```rust
+ /// # use regex::RegexSet;
+ /// let set = RegexSet::empty();
+ /// assert!(set.is_empty());
+ /// ```
+ pub fn empty() -> RegexSet {
+ RegexSetBuilder::new(&[""; 0]).build().unwrap()
+ }
+
+ /// Returns true if and only if one of the regexes in this set matches
+ /// the text given.
+ ///
+ /// This method should be preferred if you only need to test whether any
+ /// of the regexes in the set should match, but don't care about *which*
+ /// regexes matched. This is because the underlying matching engine will
+ /// quit immediately after seeing the first match instead of continuing to
+ /// find all matches.
+ ///
+ /// Note that as with searches using `Regex`, the expression is unanchored
+ /// by default. That is, if the regex does not start with `^` or `\A`, or
+ /// end with `$` or `\z`, then it is permitted to match anywhere in the
+ /// text.
+ ///
+ /// # Example
+ ///
+ /// Tests whether a set matches some text:
+ ///
+ /// ```rust
+ /// # use regex::RegexSet;
+ /// let set = RegexSet::new(&[r"\w+", r"\d+"]).unwrap();
+ /// assert!(set.is_match("foo"));
+ /// assert!(!set.is_match("☃"));
+ /// ```
+ pub fn is_match(&self, text: $text_ty) -> bool {
+ self.is_match_at(text, 0)
+ }
+
+ /// Returns the same as is_match, but starts the search at the given
+ /// offset.
+ ///
+ /// The significance of the starting point is that it takes the surrounding
+ /// context into consideration. For example, the `\A` anchor can only
+ /// match when `start == 0`.
+ #[doc(hidden)]
+ pub fn is_match_at(&self, text: $text_ty, start: usize) -> bool {
+ self.0.searcher().is_match_at($as_bytes(text), start)
+ }
+
+ /// Returns the set of regular expressions that match in the given text.
+ ///
+ /// The set returned contains the index of each regular expression that
+ /// matches in the given text. The index is in correspondence with the
+ /// order of regular expressions given to `RegexSet`'s constructor.
+ ///
+ /// The set can also be used to iterate over the matched indices.
+ ///
+ /// Note that as with searches using `Regex`, the expression is unanchored
+ /// by default. That is, if the regex does not start with `^` or `\A`, or
+ /// end with `$` or `\z`, then it is permitted to match anywhere in the
+ /// text.
+ ///
+ /// # Example
+ ///
+ /// Tests which regular expressions match the given text:
+ ///
+ /// ```rust
+ /// # use regex::RegexSet;
+ /// let set = RegexSet::new(&[
+ /// r"\w+",
+ /// r"\d+",
+ /// r"\pL+",
+ /// r"foo",
+ /// r"bar",
+ /// r"barfoo",
+ /// r"foobar",
+ /// ]).unwrap();
+ /// let matches: Vec<_> = set.matches("foobar").into_iter().collect();
+ /// assert_eq!(matches, vec![0, 2, 3, 4, 6]);
+ ///
+ /// // You can also test whether a particular regex matched:
+ /// let matches = set.matches("foobar");
+ /// assert!(!matches.matched(5));
+ /// assert!(matches.matched(6));
+ /// ```
+    pub fn matches(&self, text: $text_ty) -> SetMatches {
+        // One flag per regex in the set, indexed in construction order.
+        let mut flags = vec![false; self.0.regex_strings().len()];
+        // Search from offset 0; the returned bool reports whether any flag
+        // ended up set, so it is stored alongside the flags themselves.
+        let matched_any = self.read_matches_at(&mut flags, text, 0);
+        SetMatches { matched_any, matches: flags }
+    }
+
+ /// Returns the same as matches, but starts the search at the given
+ /// offset and stores the matches into the slice given.
+ ///
+ /// The significance of the starting point is that it takes the surrounding
+ /// context into consideration. For example, the `\A` anchor can only
+ /// match when `start == 0`.
+ ///
+ /// `matches` must have a length that is at least the number of regexes
+ /// in this set.
+ ///
+ /// This method returns true if and only if at least one member of
+ /// `matches` is true after executing the set against `text`.
+ #[doc(hidden)]
+ pub fn read_matches_at(
+ &self,
+ matches: &mut [bool],
+ text: $text_ty,
+ start: usize,
+ ) -> bool {
+ self.0.searcher().many_matches_at(matches, $as_bytes(text), start)
+ }
+
+ /// Returns the total number of regular expressions in this set.
+ pub fn len(&self) -> usize {
+ self.0.regex_strings().len()
+ }
+
+ /// Returns `true` if this set contains no regular expressions.
+ pub fn is_empty(&self) -> bool {
+ self.0.regex_strings().is_empty()
+ }
+
+ /// Returns the patterns that this set will match on.
+ ///
+ /// This function can be used to determine the pattern for a match. The
+ /// slice returned has exactly as many patterns as were given to this set,
+ /// and the order of the slice is the same as the order of the patterns
+ /// provided to the set.
+ ///
+ /// # Example
+ ///
+ /// ```rust
+ /// # use regex::RegexSet;
+ /// let set = RegexSet::new(&[
+ /// r"\w+",
+ /// r"\d+",
+ /// r"\pL+",
+ /// r"foo",
+ /// r"bar",
+ /// r"barfoo",
+ /// r"foobar",
+ /// ]).unwrap();
+ /// let matches: Vec<_> = set
+ /// .matches("foobar")
+ /// .into_iter()
+ /// .map(|match_idx| &set.patterns()[match_idx])
+ /// .collect();
+ /// assert_eq!(matches, vec![r"\w+", r"\pL+", r"foo", r"bar", r"foobar"]);
+ /// ```
+ pub fn patterns(&self) -> &[String] {
+ self.0.regex_strings()
+ }
+}
+
+/// A set of matches returned by a regex set.
+#[derive(Clone, Debug)]
+pub struct SetMatches {
+ matched_any: bool,
+ matches: Vec<bool>,
+}
+
+impl SetMatches {
+ /// Whether this set contains any matches.
+ pub fn matched_any(&self) -> bool {
+ self.matched_any
+ }
+
+ /// Whether the regex at the given index matched.
+ ///
+ /// The index for a regex is determined by its insertion order upon the
+ /// initial construction of a `RegexSet`, starting at `0`.
+ ///
+ /// # Panics
+ ///
+ /// If `regex_index` is greater than or equal to `self.len()`.
+ pub fn matched(&self, regex_index: usize) -> bool {
+ self.matches[regex_index]
+ }
+
+ /// The total number of regexes in the set that created these matches.
+ pub fn len(&self) -> usize {
+ self.matches.len()
+ }
+
+    /// Returns an iterator over the indexes of the regexes that matched.
+    ///
+    /// Matches are always produced in ascending order of index, where the
+    /// index corresponds to the position the matching regex had when the
+    /// set was initially built.
+    pub fn iter(&self) -> SetMatchesIter<'_> {
+        SetMatchesIter(self.matches.iter().enumerate())
+    }
+}
+
+impl IntoIterator for SetMatches {
+ type IntoIter = SetMatchesIntoIter;
+ type Item = usize;
+
+ fn into_iter(self) -> Self::IntoIter {
+ SetMatchesIntoIter(self.matches.into_iter().enumerate())
+ }
+}
+
+impl<'a> IntoIterator for &'a SetMatches {
+ type IntoIter = SetMatchesIter<'a>;
+ type Item = usize;
+
+ fn into_iter(self) -> Self::IntoIter {
+ self.iter()
+ }
+}
+
+/// An owned iterator over the set of matches from a regex set.
+///
+/// This will always produce matches in ascending order of index, where the
+/// index corresponds to the index of the regex that matched with respect to
+/// its position when initially building the set.
+#[derive(Debug)]
+pub struct SetMatchesIntoIter(iter::Enumerate<vec::IntoIter<bool>>);
+
+impl Iterator for SetMatchesIntoIter {
+    type Item = usize;
+
+    /// Returns the index of the next regex that matched, or `None` once the
+    /// remaining entries hold no further match.
+    fn next(&mut self) -> Option<usize> {
+        // Each entry pairs a regex index with whether that regex matched.
+        self.0.by_ref().find(|&(_, matched)| matched).map(|(index, _)| index)
+    }
+
+    /// Returns `(0, upper)`: entries for regexes that did not match are
+    /// skipped by `next`, so the number of entries left is only an upper
+    /// bound on the number of indices still to be yielded.
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        (0, self.0.size_hint().1)
+    }
+}
+
+impl DoubleEndedIterator for SetMatchesIntoIter {
+    /// Scans from the back for the closest unvisited regex that matched.
+    fn next_back(&mut self) -> Option<usize> {
+        while let Some((index, matched)) = self.0.next_back() {
+            if matched {
+                return Some(index);
+            }
+        }
+        None
+    }
+}
+
+impl iter::FusedIterator for SetMatchesIntoIter {}
+
+/// A borrowed iterator over the set of matches from a regex set.
+///
+/// The lifetime `'a` refers to the lifetime of a `SetMatches` value.
+///
+/// This will always produce matches in ascending order of index, where the
+/// index corresponds to the index of the regex that matched with respect to
+/// its position when initially building the set.
+#[derive(Clone, Debug)]
+pub struct SetMatchesIter<'a>(iter::Enumerate<slice::Iter<'a, bool>>);
+
+impl<'a> Iterator for SetMatchesIter<'a> {
+    type Item = usize;
+
+    /// Returns the index of the next regex that matched, or `None` once the
+    /// remaining entries hold no further match.
+    fn next(&mut self) -> Option<usize> {
+        // Each entry pairs a regex index with whether that regex matched.
+        self.0.by_ref().find(|&(_, &matched)| matched).map(|(index, _)| index)
+    }
+
+    /// Returns `(0, upper)`: entries for regexes that did not match are
+    /// skipped by `next`, so the number of entries left is only an upper
+    /// bound on the number of indices still to be yielded.
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        (0, self.0.size_hint().1)
+    }
+}
+
+impl<'a> DoubleEndedIterator for SetMatchesIter<'a> {
+    /// Scans from the back for the closest unvisited regex that matched.
+    fn next_back(&mut self) -> Option<usize> {
+        while let Some((index, &matched)) = self.0.next_back() {
+            if matched {
+                return Some(index);
+            }
+        }
+        None
+    }
+}
+
+impl<'a> iter::FusedIterator for SetMatchesIter<'a> {}
+
+#[doc(hidden)]
+impl From<Exec> for RegexSet {
+ fn from(exec: Exec) -> Self {
+ RegexSet(exec)
+ }
+}
+
+impl fmt::Debug for RegexSet {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ write!(f, "RegexSet({:?})", self.0.regex_strings())
+ }
+}
+
+#[allow(dead_code)] fn as_bytes_str(text: &str) -> &[u8] { text.as_bytes() }
+#[allow(dead_code)] fn as_bytes_bytes(text: &[u8]) -> &[u8] { text }
+ }
+ }
+}
+
+define_set! {
+ unicode,
+ set_unicode,
+ &str,
+ as_bytes_str,
+/// ```rust
+/// # use regex::RegexSet;
+/// let set = RegexSet::new(&[
+/// r"[a-z]+@[a-z]+\.(com|org|net)",
+/// r"[a-z]+\.(com|org|net)",
+/// ]).unwrap();
+///
+/// // Ask whether any regexes in the set match.
+/// assert!(set.is_match("foo@example.com"));
+///
+/// // Identify which regexes in the set match.
+/// let matches: Vec<_> = set.matches("foo@example.com").into_iter().collect();
+/// assert_eq!(vec![0, 1], matches);
+///
+/// // Try again, but with text that only matches one of the regexes.
+/// let matches: Vec<_> = set.matches("example.com").into_iter().collect();
+/// assert_eq!(vec![1], matches);
+///
+/// // Try again, but with text that doesn't match any regex in the set.
+/// let matches: Vec<_> = set.matches("example").into_iter().collect();
+/// assert!(matches.is_empty());
+/// ```
+}
+
+define_set! {
+ bytes,
+ set_bytes,
+ &[u8],
+ as_bytes_bytes,
+/// ```rust
+/// # use regex::bytes::RegexSet;
+/// let set = RegexSet::new(&[
+/// r"[a-z]+@[a-z]+\.(com|org|net)",
+/// r"[a-z]+\.(com|org|net)",
+/// ]).unwrap();
+///
+/// // Ask whether any regexes in the set match.
+/// assert!(set.is_match(b"foo@example.com"));
+///
+/// // Identify which regexes in the set match.
+/// let matches: Vec<_> = set.matches(b"foo@example.com").into_iter().collect();
+/// assert_eq!(vec![0, 1], matches);
+///
+/// // Try again, but with text that only matches one of the regexes.
+/// let matches: Vec<_> = set.matches(b"example.com").into_iter().collect();
+/// assert_eq!(vec![1], matches);
+///
+/// // Try again, but with text that doesn't match any regex in the set.
+/// let matches: Vec<_> = set.matches(b"example").into_iter().collect();
+/// assert!(matches.is_empty());
+/// ```
+}
diff --git a/third_party/rust/regex/src/re_trait.rs b/third_party/rust/regex/src/re_trait.rs
new file mode 100644
index 0000000000..d0c717df5a
--- /dev/null
+++ b/third_party/rust/regex/src/re_trait.rs
@@ -0,0 +1,294 @@
+use std::fmt;
+use std::iter::FusedIterator;
+
+/// Slot is a single saved capture location. Note that there are two slots for
+/// every capture in a regular expression (one slot each for the start and end
+/// of the capture).
+pub type Slot = Option<usize>;
+
+/// Locations represents the offsets of each capturing group in a regex for
+/// a single match.
+///
+/// Unlike `Captures`, a `Locations` value only stores offsets.
+#[doc(hidden)]
+#[derive(Clone, Debug)]
+pub struct Locations(Vec<Slot>);
+
+impl Locations {
+    /// Returns the start and end positions of the Nth capture group. Returns
+    /// `None` if `i` is not a valid capture group or if the capture group did
+    /// not match anything. The positions returned are *always* byte indices
+    /// with respect to the original string matched.
+    pub fn pos(&self, i: usize) -> Option<(usize, usize)> {
+        // Checked arithmetic: a huge `i` must yield `None`, not wrap around.
+        let lo = i.checked_mul(2)?;
+        let hi = lo.checked_add(1)?;
+        let (start, end) = (*self.0.get(lo)?, *self.0.get(hi)?);
+        Some((start?, end?))
+    }
+
+ /// Creates an iterator of all the capture group positions in order of
+ /// appearance in the regular expression. Positions are byte indices
+ /// in terms of the original string matched.
+ pub fn iter(&self) -> SubCapturesPosIter<'_> {
+ SubCapturesPosIter { idx: 0, locs: self }
+ }
+
+ /// Returns the total number of capturing groups.
+ ///
+ /// This is always at least `1` since every regex has at least `1`
+ /// capturing group that corresponds to the entire match.
+ pub fn len(&self) -> usize {
+ self.0.len() / 2
+ }
+
+ /// Return the individual slots as a mutable slice.
+ pub(crate) fn as_slots(&mut self) -> &mut [Slot] {
+ &mut self.0
+ }
+}
+
+/// An iterator over capture group positions for a particular match of a
+/// regular expression.
+///
+/// Positions are byte indices in terms of the original string matched.
+///
+/// `'c` is the lifetime of the captures.
+#[derive(Clone, Debug)]
+pub struct SubCapturesPosIter<'c> {
+ idx: usize,
+ locs: &'c Locations,
+}
+
+impl<'c> Iterator for SubCapturesPosIter<'c> {
+    type Item = Option<(usize, usize)>;
+
+    /// Yields the position of the next capture group; a group that did not
+    /// match yields `Some(None)`. Returns `None` once all groups are visited.
+    fn next(&mut self) -> Option<Option<(usize, usize)>> {
+        if self.idx >= self.locs.len() {
+            return None;
+        }
+        let pos = self.locs.pos(self.idx);
+        self.idx += 1;
+        Some(pos)
+    }
+
+    /// Reports the exact number of groups still to be visited.
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        let remaining = self.locs.len() - self.idx;
+        (remaining, Some(remaining))
+    }
+
+    fn count(self) -> usize {
+        self.len()
+    }
+}
+
+impl<'c> ExactSizeIterator for SubCapturesPosIter<'c> {}
+
+impl<'c> FusedIterator for SubCapturesPosIter<'c> {}
+
+/// `RegularExpression` describes types that can implement regex searching.
+///
+/// This trait is my attempt at reducing code duplication and to standardize
+/// the internal API. Specific duplication that is avoided are the `find`
+/// and `capture` iterators, which are slightly tricky.
+///
+/// It's not clear whether this trait is worth it, and it also isn't
+/// clear whether it's useful as a public trait or not. Methods like
+/// `next_after_empty` reek of bad design, but the rest of the methods seem
+/// somewhat reasonable. One particular thing this trait would expose would be
+/// the ability to start the search of a regex anywhere in a haystack, which
+/// isn't possible in the current public API.
+pub trait RegularExpression: Sized + fmt::Debug {
+ /// The type of the haystack.
+ type Text: ?Sized + fmt::Debug;
+
+ /// The number of capture slots in the compiled regular expression. This is
+ /// always two times the number of capture groups (two slots per group).
+ fn slots_len(&self) -> usize;
+
+ /// Allocates fresh space for all capturing groups in this regex.
+ fn locations(&self) -> Locations {
+ Locations(vec![None; self.slots_len()])
+ }
+
+ /// Returns the position of the next character after `i`.
+ ///
+ /// For example, a haystack with type `&[u8]` probably returns `i+1`,
+ /// whereas a haystack with type `&str` probably returns `i` plus the
+ /// length of the next UTF-8 sequence.
+ fn next_after_empty(&self, text: &Self::Text, i: usize) -> usize;
+
+ /// Returns the location of the shortest match.
+ fn shortest_match_at(
+ &self,
+ text: &Self::Text,
+ start: usize,
+ ) -> Option<usize>;
+
+ /// Returns whether the regex matches the text given.
+ fn is_match_at(&self, text: &Self::Text, start: usize) -> bool;
+
+ /// Returns the leftmost-first match location if one exists.
+ fn find_at(
+ &self,
+ text: &Self::Text,
+ start: usize,
+ ) -> Option<(usize, usize)>;
+
+ /// Returns the leftmost-first match location if one exists, and also
+ /// fills in any matching capture slot locations.
+ fn captures_read_at(
+ &self,
+ locs: &mut Locations,
+ text: &Self::Text,
+ start: usize,
+ ) -> Option<(usize, usize)>;
+
+ /// Returns an iterator over all non-overlapping successive leftmost-first
+ /// matches.
+ fn find_iter(self, text: &Self::Text) -> Matches<'_, Self> {
+ Matches { re: self, text, last_end: 0, last_match: None }
+ }
+
+ /// Returns an iterator over all non-overlapping successive leftmost-first
+ /// matches with captures.
+ fn captures_iter(self, text: &Self::Text) -> CaptureMatches<'_, Self> {
+ CaptureMatches(self.find_iter(text))
+ }
+}
+
+/// An iterator over all non-overlapping successive leftmost-first matches.
+#[derive(Debug)]
+pub struct Matches<'t, R>
+where
+ R: RegularExpression,
+ R::Text: 't,
+{
+ re: R,
+ text: &'t R::Text,
+ last_end: usize,
+ last_match: Option<usize>,
+}
+
+impl<'t, R> Matches<'t, R>
+where
+ R: RegularExpression,
+ R::Text: 't,
+{
+ /// Return the text being searched.
+ pub fn text(&self) -> &'t R::Text {
+ self.text
+ }
+
+ /// Return the underlying regex.
+ pub fn regex(&self) -> &R {
+ &self.re
+ }
+}
+
+impl<'t, R> Iterator for Matches<'t, R>
+where
+    R: RegularExpression,
+    R::Text: 't + AsRef<[u8]>,
+{
+    type Item = (usize, usize);
+
+    /// Returns the `(start, end)` byte offsets of the next match, searching
+    /// from where the previous call left off. Returns `None` when no further
+    /// match exists.
+    fn next(&mut self) -> Option<(usize, usize)> {
+        // Nothing is left to search once the resume point is past the text.
+        if self.last_end > self.text.as_ref().len() {
+            return None;
+        }
+        let (start, end) = self.re.find_at(self.text, self.last_end)?;
+        if start != end {
+            self.last_end = end;
+        } else {
+            // An empty match: resume from the smallest position that can
+            // start a later match, so that the iterator makes progress.
+            self.last_end = self.re.next_after_empty(self.text, end);
+            // An empty match directly after the previous match is not
+            // reported; move straight on to the one after it.
+            if self.last_match == Some(end) {
+                return self.next();
+            }
+        }
+        self.last_match = Some(end);
+        Some((start, end))
+    }
+}
+
+impl<'t, R> FusedIterator for Matches<'t, R>
+where
+ R: RegularExpression,
+ R::Text: 't + AsRef<[u8]>,
+{
+}
+
+/// An iterator over all non-overlapping successive leftmost-first matches with
+/// captures.
+#[derive(Debug)]
+pub struct CaptureMatches<'t, R>(Matches<'t, R>)
+where
+ R: RegularExpression,
+ R::Text: 't;
+
+impl<'t, R> CaptureMatches<'t, R>
+where
+ R: RegularExpression,
+ R::Text: 't,
+{
+ /// Return the text being searched.
+ pub fn text(&self) -> &'t R::Text {
+ self.0.text()
+ }
+
+ /// Return the underlying regex.
+ pub fn regex(&self) -> &R {
+ self.0.regex()
+ }
+}
+
+impl<'t, R> Iterator for CaptureMatches<'t, R>
+where
+    R: RegularExpression,
+    R::Text: 't + AsRef<[u8]>,
+{
+    type Item = Locations;
+
+    /// Returns the capture group offsets of the next match, searching from
+    /// where the previous call left off.
+    fn next(&mut self) -> Option<Locations> {
+        let resume_at = self.0.last_end;
+        if resume_at > self.0.text.as_ref().len() {
+            return None;
+        }
+        // Fresh slots for every match, since each yielded value is owned.
+        let mut locs = self.0.re.locations();
+        let (start, end) =
+            self.0.re.captures_read_at(&mut locs, self.0.text, resume_at)?;
+        if start != end {
+            self.0.last_end = end;
+        } else {
+            // An empty match: resume past it so the iterator makes progress,
+            // and skip it entirely when it directly follows the last match.
+            self.0.last_end = self.0.re.next_after_empty(self.0.text, end);
+            if self.0.last_match == Some(end) {
+                return self.next();
+            }
+        }
+        self.0.last_match = Some(end);
+        Some(locs)
+    }
+}
+
+impl<'t, R> FusedIterator for CaptureMatches<'t, R>
+where
+ R: RegularExpression,
+ R::Text: 't + AsRef<[u8]>,
+{
+}
diff --git a/third_party/rust/regex/src/re_unicode.rs b/third_party/rust/regex/src/re_unicode.rs
new file mode 100644
index 0000000000..60d81a7d95
--- /dev/null
+++ b/third_party/rust/regex/src/re_unicode.rs
@@ -0,0 +1,1311 @@
+use std::borrow::Cow;
+use std::collections::HashMap;
+use std::fmt;
+use std::iter::FusedIterator;
+use std::ops::{Index, Range};
+use std::str::FromStr;
+use std::sync::Arc;
+
+use crate::find_byte::find_byte;
+
+use crate::error::Error;
+use crate::exec::{Exec, ExecNoSyncStr};
+use crate::expand::expand_str;
+use crate::re_builder::unicode::RegexBuilder;
+use crate::re_trait::{self, RegularExpression, SubCapturesPosIter};
+
+/// Escapes all regular expression meta characters in `text`.
+///
+/// The string returned may be safely used as a literal in a regular
+/// expression.
+pub fn escape(text: &str) -> String {
+ regex_syntax::escape(text)
+}
+
+/// Match represents a single match of a regex in a haystack.
+///
+/// The lifetime parameter `'t` refers to the lifetime of the matched text.
+#[derive(Copy, Clone, Debug, Eq, PartialEq)]
+pub struct Match<'t> {
+ text: &'t str,
+ start: usize,
+ end: usize,
+}
+
+impl<'t> Match<'t> {
+ /// Returns the starting byte offset of the match in the haystack.
+ #[inline]
+ pub fn start(&self) -> usize {
+ self.start
+ }
+
+ /// Returns the ending byte offset of the match in the haystack.
+ #[inline]
+ pub fn end(&self) -> usize {
+ self.end
+ }
+
+ /// Returns the range over the starting and ending byte offsets of the
+ /// match in the haystack.
+ #[inline]
+ pub fn range(&self) -> Range<usize> {
+ self.start..self.end
+ }
+
+ /// Returns the matched text.
+ #[inline]
+ pub fn as_str(&self) -> &'t str {
+ &self.text[self.range()]
+ }
+
+ /// Creates a new match from the given haystack and byte offsets.
+ #[inline]
+ fn new(haystack: &'t str, start: usize, end: usize) -> Match<'t> {
+ Match { text: haystack, start, end }
+ }
+}
+
+impl<'t> From<Match<'t>> for &'t str {
+ fn from(m: Match<'t>) -> &'t str {
+ m.as_str()
+ }
+}
+
+impl<'t> From<Match<'t>> for Range<usize> {
+ fn from(m: Match<'t>) -> Range<usize> {
+ m.range()
+ }
+}
+
+/// A compiled regular expression for matching Unicode strings.
+///
+/// It is represented as either a sequence of bytecode instructions (dynamic)
+/// or as a specialized Rust function (native). It can be used to search, split
+/// or replace text. All searching is done with an implicit `.*?` at the
+/// beginning and end of an expression. To force an expression to match the
+/// whole string (or a prefix or a suffix), you must use an anchor like `^` or
+/// `$` (or `\A` and `\z`).
+///
+/// While this crate will handle Unicode strings (whether in the regular
+/// expression or in the search text), all positions returned are **byte
+/// indices**. Every byte index is guaranteed to be at a Unicode code point
+/// boundary.
+///
+/// The lifetimes `'r` and `'t` in this crate correspond to the lifetime of a
+/// compiled regular expression and text to search, respectively.
+///
+/// The only methods that allocate new strings are the string replacement
+/// methods. All other methods (searching and splitting) return borrowed
+/// pointers into the string given.
+///
+/// # Examples
+///
+/// Find the location of a US phone number:
+///
+/// ```rust
+/// # use regex::Regex;
+/// let re = Regex::new("[0-9]{3}-[0-9]{3}-[0-9]{4}").unwrap();
+/// let mat = re.find("phone: 111-222-3333").unwrap();
+/// assert_eq!((mat.start(), mat.end()), (7, 19));
+/// ```
+///
+/// # Using the `std::str::pattern` methods with `Regex`
+///
+/// > **Note**: This section requires that this crate is compiled with the
+/// > `pattern` Cargo feature enabled, which **requires nightly Rust**.
+///
+/// Since `Regex` implements `Pattern`, you can use regexes with methods
+/// defined on `&str`. For example, `is_match`, `find`, `find_iter`
+/// and `split` can be replaced with `str::contains`, `str::find`,
+/// `str::match_indices` and `str::split`.
+///
+/// Here are some examples:
+///
+/// ```rust,ignore
+/// # use regex::Regex;
+/// let re = Regex::new(r"\d+").unwrap();
+/// let haystack = "a111b222c";
+///
+/// assert!(haystack.contains(&re));
+/// assert_eq!(haystack.find(&re), Some(1));
+/// assert_eq!(haystack.match_indices(&re).collect::<Vec<_>>(),
+/// vec![(1, "111"), (5, "222")]);
+/// assert_eq!(haystack.split(&re).collect::<Vec<_>>(), vec!["a", "b", "c"]);
+/// ```
+#[derive(Clone)]
+pub struct Regex(Exec);
+
+impl fmt::Display for Regex {
+ /// Shows the original regular expression.
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ write!(f, "{}", self.as_str())
+ }
+}
+
+impl fmt::Debug for Regex {
+ /// Shows the original regular expression.
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ fmt::Display::fmt(self, f)
+ }
+}
+
+#[doc(hidden)]
+impl From<Exec> for Regex {
+ fn from(exec: Exec) -> Regex {
+ Regex(exec)
+ }
+}
+
+impl FromStr for Regex {
+ type Err = Error;
+
+ /// Attempts to parse a string into a regular expression.
+ fn from_str(s: &str) -> Result<Regex, Error> {
+ Regex::new(s)
+ }
+}
+
+/// Core regular expression methods.
+impl Regex {
+ /// Compiles a regular expression. Once compiled, it can be used repeatedly
+ /// to search, split or replace text in a string.
+ ///
+ /// If an invalid expression is given, then an error is returned.
+ pub fn new(re: &str) -> Result<Regex, Error> {
+ RegexBuilder::new(re).build()
+ }
+
+ /// Returns true if and only if there is a match for the regex in the
+ /// string given.
+ ///
+ /// It is recommended to use this method if all you need to do is test
+ /// a match, since the underlying matching engine may be able to do less
+ /// work.
+ ///
+ /// # Example
+ ///
+ /// Test if some text contains at least one word with exactly 13
+ /// Unicode word characters:
+ ///
+ /// ```rust
+ /// # use regex::Regex;
+ /// # fn main() {
+ /// let text = "I categorically deny having triskaidekaphobia.";
+ /// assert!(Regex::new(r"\b\w{13}\b").unwrap().is_match(text));
+ /// # }
+ /// ```
+ pub fn is_match(&self, text: &str) -> bool {
+ self.is_match_at(text, 0)
+ }
+
+ /// Returns the start and end byte range of the leftmost-first match in
+ /// `text`. If no match exists, then `None` is returned.
+ ///
+ /// Note that this should only be used if you want to discover the position
+ /// of the match. Testing the existence of a match is faster if you use
+ /// `is_match`.
+ ///
+ /// # Example
+ ///
+ /// Find the start and end location of the first word with exactly 13
+ /// Unicode word characters:
+ ///
+ /// ```rust
+ /// # use regex::Regex;
+ /// # fn main() {
+ /// let text = "I categorically deny having triskaidekaphobia.";
+ /// let mat = Regex::new(r"\b\w{13}\b").unwrap().find(text).unwrap();
+ /// assert_eq!(mat.start(), 2);
+ /// assert_eq!(mat.end(), 15);
+ /// # }
+ /// ```
+ pub fn find<'t>(&self, text: &'t str) -> Option<Match<'t>> {
+ self.find_at(text, 0)
+ }
+
+ /// Returns an iterator for each successive non-overlapping match in
+ /// `text`, returning the start and end byte indices with respect to
+ /// `text`.
+ ///
+ /// # Example
+ ///
+ /// Find the start and end location of every word with exactly 13 Unicode
+ /// word characters:
+ ///
+ /// ```rust
+ /// # use regex::Regex;
+ /// # fn main() {
+ /// let text = "Retroactively relinquishing remunerations is reprehensible.";
+ /// for mat in Regex::new(r"\b\w{13}\b").unwrap().find_iter(text) {
+ /// println!("{:?}", mat);
+ /// }
+ /// # }
+ /// ```
+ pub fn find_iter<'r, 't>(&'r self, text: &'t str) -> Matches<'r, 't> {
+ Matches(self.0.searcher_str().find_iter(text))
+ }
+
+ /// Returns the capture groups corresponding to the leftmost-first
+ /// match in `text`. Capture group `0` always corresponds to the entire
+ /// match. If no match is found, then `None` is returned.
+ ///
+ /// You should only use `captures` if you need access to the location of
+ /// capturing group matches. Otherwise, `find` is faster for discovering
+ /// the location of the overall match.
+ ///
+ /// # Examples
+ ///
+ /// Say you have some text with movie names and their release years,
+ /// like "'Citizen Kane' (1941)". It'd be nice if we could search for text
+ /// looking like that, while also extracting the movie name and its release
+ /// year separately.
+ ///
+ /// ```rust
+ /// # use regex::Regex;
+ /// # fn main() {
+ /// let re = Regex::new(r"'([^']+)'\s+\((\d{4})\)").unwrap();
+ /// let text = "Not my favorite movie: 'Citizen Kane' (1941).";
+ /// let caps = re.captures(text).unwrap();
+ /// assert_eq!(caps.get(1).unwrap().as_str(), "Citizen Kane");
+ /// assert_eq!(caps.get(2).unwrap().as_str(), "1941");
+ /// assert_eq!(caps.get(0).unwrap().as_str(), "'Citizen Kane' (1941)");
+ /// // You can also access the groups by index using the Index notation.
+ /// // Note that this will panic on an invalid index.
+ /// assert_eq!(&caps[1], "Citizen Kane");
+ /// assert_eq!(&caps[2], "1941");
+ /// assert_eq!(&caps[0], "'Citizen Kane' (1941)");
+ /// # }
+ /// ```
+ ///
+ /// Note that the full match is at capture group `0`. Each subsequent
+ /// capture group is indexed by the order of its opening `(`.
+ ///
+ /// We can make this example a bit clearer by using *named* capture groups:
+ ///
+ /// ```rust
+ /// # use regex::Regex;
+ /// # fn main() {
+ /// let re = Regex::new(r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)")
+ /// .unwrap();
+ /// let text = "Not my favorite movie: 'Citizen Kane' (1941).";
+ /// let caps = re.captures(text).unwrap();
+ /// assert_eq!(caps.name("title").unwrap().as_str(), "Citizen Kane");
+ /// assert_eq!(caps.name("year").unwrap().as_str(), "1941");
+ /// assert_eq!(caps.get(0).unwrap().as_str(), "'Citizen Kane' (1941)");
+ /// // You can also access the groups by name using the Index notation.
+ /// // Note that this will panic on an invalid group name.
+ /// assert_eq!(&caps["title"], "Citizen Kane");
+ /// assert_eq!(&caps["year"], "1941");
+ /// assert_eq!(&caps[0], "'Citizen Kane' (1941)");
+ ///
+ /// # }
+ /// ```
+ ///
+ /// Here we name the capture groups, which we can access with the `name`
+ /// method or the `Index` notation with a `&str`. Note that the named
+ /// capture groups are still accessible with `get` or the `Index` notation
+ /// with a `usize`.
+ ///
+ /// The `0`th capture group is always unnamed, so it must always be
+ /// accessed with `get(0)` or `[0]`.
+ pub fn captures<'t>(&self, text: &'t str) -> Option<Captures<'t>> {
+     let mut slots = self.capture_locations();
+     // Bail out with `None` when nothing matches. The overall match that is
+     // returned on success is not needed here; only the filled-in capture
+     // locations are.
+     self.captures_read_at(&mut slots, text, 0)?;
+     Some(Captures {
+         text,
+         locs: slots.0,
+         named_groups: self.0.capture_name_idx().clone(),
+     })
+ }
+
+ /// Returns an iterator over all the non-overlapping capture groups matched
+ /// in `text`. This is operationally the same as `find_iter`, except it
+ /// yields information about capturing group matches.
+ ///
+ /// # Example
+ ///
+ /// We can use this to find all movie titles and their release years in
+ /// some text, where the movie is formatted like "'Title' (xxxx)":
+ ///
+ /// ```rust
+ /// # use regex::Regex;
+ /// # fn main() {
+ /// let re = Regex::new(r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)")
+ /// .unwrap();
+ /// let text = "'Citizen Kane' (1941), 'The Wizard of Oz' (1939), 'M' (1931).";
+ /// for caps in re.captures_iter(text) {
+ /// println!("Movie: {:?}, Released: {:?}",
+ /// &caps["title"], &caps["year"]);
+ /// }
+ /// // Output:
+ /// // Movie: Citizen Kane, Released: 1941
+ /// // Movie: The Wizard of Oz, Released: 1939
+ /// // Movie: M, Released: 1931
+ /// # }
+ /// ```
+ pub fn captures_iter<'r, 't>(
+     &'r self,
+     text: &'t str,
+ ) -> CaptureMatches<'r, 't> {
+     // Wrap the searcher's iterator in the public `CaptureMatches` type.
+     let inner = self.0.searcher_str().captures_iter(text);
+     CaptureMatches(inner)
+ }
+
+ /// Returns an iterator of substrings of `text` delimited by a match of the
+ /// regular expression. Namely, each element of the iterator corresponds to
+ /// text that *isn't* matched by the regular expression.
+ ///
+ /// This method will *not* copy the text given.
+ ///
+ /// # Example
+ ///
+ /// To split a string delimited by arbitrary amounts of spaces or tabs:
+ ///
+ /// ```rust
+ /// # use regex::Regex;
+ /// # fn main() {
+ /// let re = Regex::new(r"[ \t]+").unwrap();
+ /// let fields: Vec<&str> = re.split("a b \t c\td e").collect();
+ /// assert_eq!(fields, vec!["a", "b", "c", "d", "e"]);
+ /// # }
+ /// ```
+ pub fn split<'r, 't>(&'r self, text: &'t str) -> Split<'r, 't> {
+     // `last` is the offset just past the previous delimiter; nothing has
+     // been consumed yet, so it starts at the beginning of `text`.
+     let finder = self.find_iter(text);
+     Split { finder, last: 0 }
+ }
+
+ /// Returns an iterator of at most `limit` substrings of `text` delimited
+ /// by a match of the regular expression. (A `limit` of `0` will return no
+ /// substrings.) Namely, each element of the iterator corresponds to text
+ /// that *isn't* matched by the regular expression. The remainder of the
+ /// string that is not split will be the last element in the iterator.
+ ///
+ /// This method will *not* copy the text given.
+ ///
+ /// # Example
+ ///
+ /// Get the first two words in some text:
+ ///
+ /// ```rust
+ /// # use regex::Regex;
+ /// # fn main() {
+ /// let re = Regex::new(r"\W+").unwrap();
+ /// let fields: Vec<&str> = re.splitn("Hey! How are you?", 3).collect();
+ /// assert_eq!(fields, vec!("Hey", "How", "are you?"));
+ /// # }
+ /// ```
+ pub fn splitn<'r, 't>(
+     &'r self,
+     text: &'t str,
+     limit: usize,
+ ) -> SplitN<'r, 't> {
+     // `n` counts how many substrings the iterator may still yield.
+     let splits = self.split(text);
+     SplitN { splits, n: limit }
+ }
+
+ /// Replaces the leftmost-first match with the replacement provided.
+ /// The replacement can be a regular string (where `$N` and `$name` are
+ /// expanded to match capture groups) or a function that takes the matches'
+ /// `Captures` and returns the replaced string.
+ ///
+ /// If no match is found, then a copy of the string is returned unchanged.
+ ///
+ /// # Replacement string syntax
+ ///
+ /// All instances of `$name` in the replacement text are replaced with the
+ /// corresponding capture group `name`.
+ ///
+ /// `name` may be an integer corresponding to the index of the
+ /// capture group (counted by order of opening parenthesis where `0` is the
+ /// entire match) or it can be a name (consisting of letters, digits or
+ /// underscores) corresponding to a named capture group.
+ ///
+ /// If `name` isn't a valid capture group (whether the name doesn't exist
+ /// or isn't a valid index), then it is replaced with the empty string.
+ ///
+ /// The longest possible name is used. e.g., `$1a` looks up the capture
+ /// group named `1a` and not the capture group at index `1`. To exert more
+ /// precise control over the name, use braces, e.g., `${1}a`.
+ ///
+ /// To write a literal `$` use `$$`.
+ ///
+ /// # Examples
+ ///
+ /// Note that this function is polymorphic with respect to the replacement.
+ /// In typical usage, this can just be a normal string:
+ ///
+ /// ```rust
+ /// # use regex::Regex;
+ /// # fn main() {
+ /// let re = Regex::new("[^01]+").unwrap();
+ /// assert_eq!(re.replace("1078910", ""), "1010");
+ /// # }
+ /// ```
+ ///
+ /// But anything satisfying the `Replacer` trait will work. For example,
+ /// a closure of type `|&Captures| -> String` provides direct access to the
+ /// captures corresponding to a match. This allows one to access
+ /// capturing group matches easily:
+ ///
+ /// ```rust
+ /// # use regex::Regex;
+ /// # use regex::Captures; fn main() {
+ /// let re = Regex::new(r"([^,\s]+),\s+(\S+)").unwrap();
+ /// let result = re.replace("Springsteen, Bruce", |caps: &Captures| {
+ /// format!("{} {}", &caps[2], &caps[1])
+ /// });
+ /// assert_eq!(result, "Bruce Springsteen");
+ /// # }
+ /// ```
+ ///
+ /// But this is a bit cumbersome to use all the time. Instead, a simple
+ /// syntax is supported that expands `$name` into the corresponding capture
+ /// group. Here's the last example, but using this expansion technique
+ /// with named capture groups:
+ ///
+ /// ```rust
+ /// # use regex::Regex;
+ /// # fn main() {
+ /// let re = Regex::new(r"(?P<last>[^,\s]+),\s+(?P<first>\S+)").unwrap();
+ /// let result = re.replace("Springsteen, Bruce", "$first $last");
+ /// assert_eq!(result, "Bruce Springsteen");
+ /// # }
+ /// ```
+ ///
+ /// Note that using `$2` instead of `$first` or `$1` instead of `$last`
+ /// would produce the same result. To write a literal `$` use `$$`.
+ ///
+ /// Sometimes the replacement string requires use of curly braces to
+ /// delineate a capture group replacement and surrounding literal text.
+ /// For example, if we wanted to join two words together with an
+ /// underscore:
+ ///
+ /// ```rust
+ /// # use regex::Regex;
+ /// # fn main() {
+ /// let re = Regex::new(r"(?P<first>\w+)\s+(?P<second>\w+)").unwrap();
+ /// let result = re.replace("deep fried", "${first}_$second");
+ /// assert_eq!(result, "deep_fried");
+ /// # }
+ /// ```
+ ///
+ /// Without the curly braces, the capture group name `first_` would be
+ /// used, and since it doesn't exist, it would be replaced with the empty
+ /// string.
+ ///
+ /// Finally, sometimes you just want to replace a literal string with no
+ /// regard for capturing group expansion. This can be done by wrapping a
+ /// string with `NoExpand`:
+ ///
+ /// ```rust
+ /// # use regex::Regex;
+ /// # fn main() {
+ /// use regex::NoExpand;
+ ///
+ /// let re = Regex::new(r"(?P<last>[^,\s]+),\s+(\S+)").unwrap();
+ /// let result = re.replace("Springsteen, Bruce", NoExpand("$2 $last"));
+ /// assert_eq!(result, "$2 $last");
+ /// # }
+ /// ```
+ pub fn replace<'t, R: Replacer>(
+     &self,
+     text: &'t str,
+     rep: R,
+ ) -> Cow<'t, str> {
+     // Replacing only the first match is `replacen` with a limit of one.
+     let limit = 1;
+     self.replacen(text, limit, rep)
+ }
+
+ /// Replaces all non-overlapping matches in `text` with the replacement
+ /// provided. This is the same as calling `replacen` with `limit` set to
+ /// `0`.
+ ///
+ /// See the documentation for `replace` for details on how to access
+ /// capturing group matches in the replacement string.
+ pub fn replace_all<'t, R: Replacer>(
+     &self,
+     text: &'t str,
+     rep: R,
+ ) -> Cow<'t, str> {
+     // A limit of zero tells `replacen` to replace every match.
+     let limit = 0;
+     self.replacen(text, limit, rep)
+ }
+
+ /// Replaces at most `limit` non-overlapping matches in `text` with the
+ /// replacement provided. If `limit` is 0, then all non-overlapping matches
+ /// are replaced.
+ ///
+ /// See the documentation for `replace` for details on how to access
+ /// capturing group matches in the replacement string.
+ pub fn replacen<'t, R: Replacer>(
+     &self,
+     text: &'t str,
+     limit: usize,
+     mut rep: R,
+ ) -> Cow<'t, str> {
+     // Fast path: the replacer reports a fixed replacement string, i.e. no
+     // capture expansion is required. This can make a tremendous
+     // difference because:
+     //
+     // 1) `find_iter` is used instead of `captures_iter`, and not asking
+     //    for captures generally makes the regex engines faster.
+     // 2) No capture group has to be looked up or substituted; the same
+     //    replacement text is simply pushed at every match.
+     if let Some(fixed) = rep.no_expansion() {
+         let mut matches = self.find_iter(text).peekable();
+         if matches.peek().is_none() {
+             // Nothing to replace, so hand the input back without copying.
+             return Cow::Borrowed(text);
+         }
+         let mut out = String::with_capacity(text.len());
+         let mut cursor = 0;
+         let mut seen = 0;
+         for m in matches {
+             // A limit of zero means "replace everything".
+             if limit != 0 && seen >= limit {
+                 break;
+             }
+             seen += 1;
+             out.push_str(&text[cursor..m.start()]);
+             out.push_str(&fixed);
+             cursor = m.end();
+         }
+         // Copy whatever follows the last replaced match.
+         out.push_str(&text[cursor..]);
+         return Cow::Owned(out);
+     }
+
+     // Slow path: the replacement needs access to the capture groups of
+     // each match.
+     let mut all_caps = self.captures_iter(text).peekable();
+     if all_caps.peek().is_none() {
+         return Cow::Borrowed(text);
+     }
+     let mut out = String::with_capacity(text.len());
+     let mut cursor = 0;
+     let mut seen = 0;
+     for caps in all_caps {
+         if limit != 0 && seen >= limit {
+             break;
+         }
+         seen += 1;
+         // unwrap on 0 is OK because captures only reports matches
+         let whole = caps.get(0).unwrap();
+         out.push_str(&text[cursor..whole.start()]);
+         rep.replace_append(&caps, &mut out);
+         cursor = whole.end();
+     }
+     out.push_str(&text[cursor..]);
+     Cow::Owned(out)
+ }
+}
+
+/// Advanced or "lower level" search methods.
+impl Regex {
+    /// Returns the end location of a match in the text given.
+    ///
+    /// This method may have the same performance characteristics as
+    /// `is_match`, except that it also reports an end location for a match.
+    /// In particular, the location returned *may be shorter* than the proper
+    /// end of the leftmost-first match.
+    ///
+    /// # Example
+    ///
+    /// Typically, `a+` would match the entire first sequence of `a` in some
+    /// text, but `shortest_match` can give up as soon as it sees the first
+    /// `a`.
+    ///
+    /// ```rust
+    /// # use regex::Regex;
+    /// # fn main() {
+    /// let text = "aaaaa";
+    /// let pos = Regex::new(r"a+").unwrap().shortest_match(text);
+    /// assert_eq!(pos, Some(1));
+    /// # }
+    /// ```
+    pub fn shortest_match(&self, text: &str) -> Option<usize> {
+        // Same search as `shortest_match_at`, started at offset 0.
+        let start = 0;
+        self.shortest_match_at(text, start)
+    }
+
+    /// Like `shortest_match`, but begins the search at the byte offset
+    /// `start`.
+    ///
+    /// The significance of the starting point is that it takes the
+    /// surrounding context into consideration. For example, the `\A` anchor
+    /// can only match when `start == 0`.
+    pub fn shortest_match_at(
+        &self,
+        text: &str,
+        start: usize,
+    ) -> Option<usize> {
+        let searcher = self.0.searcher_str();
+        searcher.shortest_match_at(text, start)
+    }
+
+    /// Like `is_match`, but begins the search at the byte offset `start`.
+    ///
+    /// The significance of the starting point is that it takes the
+    /// surrounding context into consideration. For example, the `\A` anchor
+    /// can only match when `start == 0`.
+    pub fn is_match_at(&self, text: &str, start: usize) -> bool {
+        let searcher = self.0.searcher_str();
+        searcher.is_match_at(text, start)
+    }
+
+    /// Like `find`, but begins the search at the byte offset `start`.
+    ///
+    /// The significance of the starting point is that it takes the
+    /// surrounding context into consideration. For example, the `\A` anchor
+    /// can only match when `start == 0`.
+    pub fn find_at<'t>(
+        &self,
+        text: &'t str,
+        start: usize,
+    ) -> Option<Match<'t>> {
+        let span = self.0.searcher_str().find_at(text, start);
+        // Turn the raw `(start, end)` offsets into a `Match` over `text`.
+        span.map(|(begin, end)| Match::new(text, begin, end))
+    }
+
+    /// This is like `captures`, but uses
+    /// [`CaptureLocations`](struct.CaptureLocations.html)
+    /// instead of
+    /// [`Captures`](struct.Captures.html) in order to amortize allocations.
+    ///
+    /// To create a `CaptureLocations` value, use the
+    /// `Regex::capture_locations` method.
+    ///
+    /// On success this returns the overall match, which is always
+    /// equivalent to the `0`th capture group.
+    pub fn captures_read<'t>(
+        &self,
+        locs: &mut CaptureLocations,
+        text: &'t str,
+    ) -> Option<Match<'t>> {
+        let start = 0;
+        self.captures_read_at(locs, text, start)
+    }
+
+    /// Like `captures`, but begins the search at the byte offset `start`
+    /// and populates the capture locations given.
+    ///
+    /// The significance of the starting point is that it takes the
+    /// surrounding context into consideration. For example, the `\A` anchor
+    /// can only match when `start == 0`.
+    pub fn captures_read_at<'t>(
+        &self,
+        locs: &mut CaptureLocations,
+        text: &'t str,
+        start: usize,
+    ) -> Option<Match<'t>> {
+        let span =
+            self.0.searcher_str().captures_read_at(&mut locs.0, text, start);
+        // Turn the raw `(start, end)` offsets into a `Match` over `text`.
+        span.map(|(begin, end)| Match::new(text, begin, end))
+    }
+
+    /// An undocumented alias for `captures_read_at`.
+    ///
+    /// The `regex-capi` crate previously used this routine, so to avoid
+    /// breaking that crate, we continue to provide the name as an
+    /// undocumented alias.
+    #[doc(hidden)]
+    pub fn read_captures_at<'t>(
+        &self,
+        locs: &mut CaptureLocations,
+        text: &'t str,
+        start: usize,
+    ) -> Option<Match<'t>> {
+        self.captures_read_at(locs, text, start)
+    }
+}
+
+/// Auxiliary methods.
+impl Regex {
+    /// Returns the original string of this regex.
+    pub fn as_str(&self) -> &str {
+        let patterns = self.0.regex_strings();
+        &patterns[0]
+    }
+
+    /// Returns an iterator over the capture names.
+    pub fn capture_names(&self) -> CaptureNames<'_> {
+        let names = self.0.capture_names().iter();
+        CaptureNames(names)
+    }
+
+    /// Returns the number of captures.
+    pub fn captures_len(&self) -> usize {
+        let names = self.0.capture_names();
+        names.len()
+    }
+
+    /// Returns an empty set of capture locations that can be reused in
+    /// multiple calls to `captures_read` or `captures_read_at`.
+    pub fn capture_locations(&self) -> CaptureLocations {
+        let slots = self.0.searcher_str().locations();
+        CaptureLocations(slots)
+    }
+
+    /// An alias for `capture_locations` to preserve backward compatibility.
+    ///
+    /// The `regex-capi` crate uses this method, so to avoid breaking that
+    /// crate, we continue to export it as an undocumented API.
+    #[doc(hidden)]
+    pub fn locations(&self) -> CaptureLocations {
+        self.capture_locations()
+    }
+}
+
+/// An iterator over the names of all possible captures.
+///
+/// `None` indicates an unnamed capture; the first element (capture 0, the
+/// whole matched region) is always unnamed.
+///
+/// `'r` is the lifetime of the compiled regular expression.
+#[derive(Clone, Debug)]
+pub struct CaptureNames<'r>(::std::slice::Iter<'r, Option<String>>);
+
+impl<'r> Iterator for CaptureNames<'r> {
+    type Item = Option<&'r str>;
+
+    fn next(&mut self) -> Option<Option<&'r str>> {
+        // The outer `Option` signals exhaustion; the inner one is `None`
+        // for a capture group without a name.
+        let slot = self.0.next()?;
+        Some(slot.as_ref().map(|name| name.as_str()))
+    }
+
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        // Defer to the wrapped slice iterator.
+        self.0.size_hint()
+    }
+
+    fn count(self) -> usize {
+        let CaptureNames(names) = self;
+        names.count()
+    }
+}
+
+impl<'r> ExactSizeIterator for CaptureNames<'r> {}
+
+impl<'r> FusedIterator for CaptureNames<'r> {}
+
+/// Yields all substrings delimited by a regular expression match.
+///
+/// `'r` is the lifetime of the compiled regular expression and `'t` is the
+/// lifetime of the string being split.
+#[derive(Debug)]
+pub struct Split<'r, 't> {
+    finder: Matches<'r, 't>,
+    last: usize,
+}
+
+impl<'r, 't> Iterator for Split<'r, 't> {
+    type Item = &'t str;
+
+    fn next(&mut self) -> Option<&'t str> {
+        let text = self.finder.0.text();
+        if let Some(m) = self.finder.next() {
+            // Yield the text between the previous delimiter and this one.
+            let piece = &text[self.last..m.start()];
+            self.last = m.end();
+            return Some(piece);
+        }
+        // No delimiters remain. `last` is pushed past the end of `text`
+        // once the trailing piece has been handed out.
+        if self.last > text.len() {
+            return None;
+        }
+        let tail = &text[self.last..];
+        self.last = text.len() + 1; // Next call will return None
+        Some(tail)
+    }
+}
+
+impl<'r, 't> FusedIterator for Split<'r, 't> {}
+
+/// Yields at most `N` substrings delimited by a regular expression match.
+///
+/// The last substring will be whatever remains after splitting.
+///
+/// `'r` is the lifetime of the compiled regular expression and `'t` is the
+/// lifetime of the string being split.
+#[derive(Debug)]
+pub struct SplitN<'r, 't> {
+    splits: Split<'r, 't>,
+    n: usize,
+}
+
+impl<'r, 't> Iterator for SplitN<'r, 't> {
+    type Item = &'t str;
+
+    fn next(&mut self) -> Option<&'t str> {
+        match self.n {
+            // The budget of substrings is used up.
+            0 => None,
+            // Last permitted substring: yield everything that remains
+            // instead of splitting any further.
+            1 => {
+                self.n = 0;
+                let text = self.splits.finder.0.text();
+                if self.splits.last > text.len() {
+                    // We've already returned all substrings.
+                    None
+                } else {
+                    // self.n == 0, so future calls return None immediately
+                    Some(&text[self.splits.last..])
+                }
+            }
+            // Budget left over afterwards: behave like a plain `Split`.
+            _ => {
+                self.n -= 1;
+                self.splits.next()
+            }
+        }
+    }
+
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        // At most `n` more substrings can be produced.
+        let upper = Some(self.n);
+        (0, upper)
+    }
+}
+
+impl<'r, 't> FusedIterator for SplitN<'r, 't> {}
+
+/// CaptureLocations is a low level representation of the raw offsets of each
+/// submatch.
+///
+/// You can think of this as a lower level
+/// [`Captures`](struct.Captures.html), where this type does not support
+/// named capturing groups directly and it does not borrow the text that these
+/// offsets were matched on.
+///
+/// Primarily, this type is useful when using the lower level `Regex` APIs
+/// such as `captures_read`, which permits amortizing the allocation in which
+/// capture match locations are stored.
+///
+/// In order to build a value of this type, you'll need to call the
+/// `capture_locations` method on the `Regex` being used to execute the search.
+/// The value returned can then be reused in subsequent searches.
+#[derive(Clone, Debug)]
+pub struct CaptureLocations(re_trait::Locations);
+
+/// A type alias for `CaptureLocations` for backwards compatibility.
+///
+/// `CaptureLocations` used to be exported as `Locations` in an undocumented
+/// API. To avoid breaking code that relies on it (e.g., `regex-capi`), the
+/// same undocumented name keeps being re-exported.
+#[doc(hidden)]
+pub type Locations = CaptureLocations;
+
+impl CaptureLocations {
+    /// Returns the start and end positions of the Nth capture group.
+    ///
+    /// `None` is returned if `i` is not a valid capture group or if the
+    /// capture group did not match anything. The positions returned are
+    /// *always* byte indices with respect to the original string matched.
+    #[inline]
+    pub fn get(&self, i: usize) -> Option<(usize, usize)> {
+        let slots = &self.0;
+        slots.pos(i)
+    }
+
+    /// Returns the total number of capture groups (even if they didn't
+    /// match).
+    ///
+    /// This is always at least `1` since every regex has at least `1`
+    /// capturing group that corresponds to the entire match.
+    #[inline]
+    pub fn len(&self) -> usize {
+        let slots = &self.0;
+        slots.len()
+    }
+
+    /// An alias for the `get` method for backwards compatibility.
+    ///
+    /// `get` used to be exported as `pos` in an undocumented API. To avoid
+    /// breaking code that relies on it (e.g., `regex-capi`), the same
+    /// undocumented name keeps being re-exported.
+    #[doc(hidden)]
+    #[inline]
+    pub fn pos(&self, i: usize) -> Option<(usize, usize)> {
+        self.0.pos(i)
+    }
+}
+
+/// Captures represents a group of captured strings for a single match.
+///
+/// The 0th capture always corresponds to the entire match. Each subsequent
+/// index corresponds to the next capture group in the regex. If a capture
+/// group is named, then the matched string is *also* available via the `name`
+/// method. (Note that the 0th capture is always unnamed and so must be
+/// accessed with the `get` method.)
+///
+/// Positions returned from a capture group are always byte indices.
+///
+/// `'t` is the lifetime of the matched text.
+pub struct Captures<'t> {
+    text: &'t str,
+    locs: re_trait::Locations,
+    named_groups: Arc<HashMap<String, usize>>,
+}
+
+impl<'t> Captures<'t> {
+    /// Returns the match associated with the capture group at index `i`.
+    ///
+    /// `None` is returned if `i` does not correspond to a capture group, or
+    /// if the capture group did not participate in the match.
+    ///
+    /// # Examples
+    ///
+    /// Get the text of the match with a default of an empty string if this
+    /// group didn't participate in the match:
+    ///
+    /// ```rust
+    /// # use regex::Regex;
+    /// let re = Regex::new(r"[a-z]+(?:([0-9]+)|([A-Z]+))").unwrap();
+    /// let caps = re.captures("abc123").unwrap();
+    ///
+    /// let text1 = caps.get(1).map_or("", |m| m.as_str());
+    /// let text2 = caps.get(2).map_or("", |m| m.as_str());
+    /// assert_eq!(text1, "123");
+    /// assert_eq!(text2, "");
+    /// ```
+    pub fn get(&self, i: usize) -> Option<Match<'t>> {
+        let (begin, end) = self.locs.pos(i)?;
+        Some(Match::new(self.text, begin, end))
+    }
+
+    /// Returns the match for the capture group named `name`.
+    ///
+    /// `None` is returned if `name` isn't a valid capture group or if the
+    /// group didn't match anything.
+    pub fn name(&self, name: &str) -> Option<Match<'t>> {
+        // Translate the name into its group index, then look that up.
+        let idx = *self.named_groups.get(name)?;
+        self.get(idx)
+    }
+
+    /// An iterator that yields all capturing matches in the order in which
+    /// they appear in the regex.
+    ///
+    /// If a particular capture group didn't participate in the match, then
+    /// `None` is yielded for that capture. The first match always
+    /// corresponds to the overall match of the regex.
+    pub fn iter<'c>(&'c self) -> SubCaptureMatches<'c, 't> {
+        let it = self.locs.iter();
+        SubCaptureMatches { caps: self, it }
+    }
+
+    /// Expands all instances of `$name` in `replacement` to the
+    /// corresponding capture group `name`, and writes them to the `dst`
+    /// buffer given.
+    ///
+    /// `name` may be an integer corresponding to the index of the capture
+    /// group (counted by order of opening parenthesis where `0` is the
+    /// entire match) or it can be a name (consisting of letters, digits or
+    /// underscores) corresponding to a named capture group.
+    ///
+    /// If `name` isn't a valid capture group (whether the name doesn't exist
+    /// or isn't a valid index), then it is replaced with the empty string.
+    ///
+    /// The longest possible name consisting of the characters `[_0-9A-Za-z]`
+    /// is used. e.g., `$1a` looks up the capture group named `1a` and not
+    /// the capture group at index `1`. To exert more precise control over
+    /// the name, or to refer to a capture group name that uses characters
+    /// outside of `[_0-9A-Za-z]`, use braces, e.g., `${1}a` or
+    /// `${foo[bar].baz}`. When using braces, any sequence of characters is
+    /// permitted. If the sequence does not refer to a capture group name in
+    /// the corresponding regex, then it is replaced with an empty string.
+    ///
+    /// To write a literal `$` use `$$`.
+    pub fn expand(&self, replacement: &str, dst: &mut String) {
+        expand_str(self, replacement, dst)
+    }
+
+    /// Returns the total number of capture groups (even if they didn't
+    /// match).
+    ///
+    /// This is always at least `1`, since every regex has at least one
+    /// capture group that corresponds to the full match.
+    #[inline]
+    pub fn len(&self) -> usize {
+        let slots = &self.locs;
+        slots.len()
+    }
+}
+
+impl<'t> fmt::Debug for Captures<'t> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let mut tuple = f.debug_tuple("Captures");
+        tuple.field(&CapturesDebug(self));
+        tuple.finish()
+    }
+}
+
+struct CapturesDebug<'c, 't>(&'c Captures<'t>);
+
+impl<'c, 't> fmt::Debug for CapturesDebug<'c, 't> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let caps = self.0;
+        // Build a reverse index (group index -> group name). This costs an
+        // allocation, but it lets named groups be shown under their name.
+        let mut slot_to_name: HashMap<&usize, &String> = HashMap::new();
+        for (name, slot) in caps.named_groups.iter() {
+            slot_to_name.insert(slot, name);
+        }
+        let mut map = f.debug_map();
+        for (slot, span) in caps.locs.iter().enumerate() {
+            let matched = span.map(|(s, e)| &caps.text[s..e]);
+            match slot_to_name.get(&slot) {
+                Some(name) => {
+                    map.entry(&name, &matched);
+                }
+                // Unnamed groups are keyed by their index instead.
+                None => {
+                    map.entry(&slot, &matched);
+                }
+            }
+        }
+        map.finish()
+    }
+}
+
+/// Get a group by index.
+///
+/// `'t` is the lifetime of the matched text.
+///
+/// The text can't outlive the `Captures` object if this method is
+/// used, because of how `Index` is defined (normally `a[i]` is part
+/// of `a` and can't outlive it); to do that, use `get()` instead.
+///
+/// # Panics
+///
+/// If there is no group at the given index.
+impl<'t> Index<usize> for Captures<'t> {
+    type Output = str;
+
+    fn index(&self, i: usize) -> &str {
+        match self.get(i) {
+            Some(m) => m.as_str(),
+            // Indexing has no way to report absence, so it panics.
+            None => panic!("no group at index '{}'", i),
+        }
+    }
+}
+
+/// Get a group by name.
+///
+/// `'t` is the lifetime of the matched text and `'i` is the lifetime
+/// of the group name (the index).
+///
+/// The text can't outlive the `Captures` object if this method is
+/// used, because of how `Index` is defined (normally `a[i]` is part
+/// of `a` and can't outlive it); to do that, use `name` instead.
+///
+/// # Panics
+///
+/// If there is no group named by the given value.
+impl<'t, 'i> Index<&'i str> for Captures<'t> {
+    type Output = str;
+
+    fn index<'a>(&'a self, name: &'i str) -> &'a str {
+        match self.name(name) {
+            Some(m) => m.as_str(),
+            // Indexing has no way to report absence, so it panics.
+            None => panic!("no group named '{}'", name),
+        }
+    }
+}
+
+/// An iterator that yields all capturing matches in the order in which they
+/// appear in the regex.
+///
+/// If a particular capture group didn't participate in the match, then `None`
+/// is yielded for that capture. The first match always corresponds to the
+/// overall match of the regex.
+///
+/// The lifetime `'c` corresponds to the lifetime of the `Captures` value, and
+/// the lifetime `'t` corresponds to the originally matched text.
+#[derive(Clone, Debug)]
+pub struct SubCaptureMatches<'c, 't> {
+    caps: &'c Captures<'t>,
+    it: SubCapturesPosIter<'c>,
+}
+
+impl<'c, 't> Iterator for SubCaptureMatches<'c, 't> {
+    type Item = Option<Match<'t>>;
+
+    fn next(&mut self) -> Option<Option<Match<'t>>> {
+        // The outer `Option` signals exhaustion; the inner one is `None`
+        // for a group that did not participate in the match.
+        let slot = self.it.next()?;
+        let text = self.caps.text;
+        Some(slot.map(|(begin, end)| Match::new(text, begin, end)))
+    }
+
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        // Defer to the underlying position iterator.
+        self.it.size_hint()
+    }
+
+    fn count(self) -> usize {
+        let positions = self.it;
+        positions.count()
+    }
+}
+
+impl<'c, 't> ExactSizeIterator for SubCaptureMatches<'c, 't> {}
+
+impl<'c, 't> FusedIterator for SubCaptureMatches<'c, 't> {}
+
+/// An iterator that yields all non-overlapping capture groups matching a
+/// particular regular expression.
+///
+/// The iterator stops when no more matches can be found.
+///
+/// `'r` is the lifetime of the compiled regular expression and `'t` is the
+/// lifetime of the matched string.
+#[derive(Debug)]
+pub struct CaptureMatches<'r, 't>(
+    re_trait::CaptureMatches<'t, ExecNoSyncStr<'r>>,
+);
+
+impl<'r, 't> Iterator for CaptureMatches<'r, 't> {
+    type Item = Captures<'t>;
+
+    fn next(&mut self) -> Option<Captures<'t>> {
+        // Stop as soon as the underlying iterator has no more matches.
+        let locs = self.0.next()?;
+        let text = self.0.text();
+        let named_groups = self.0.regex().capture_name_idx().clone();
+        Some(Captures { text, locs, named_groups })
+    }
+}
+
+impl<'r, 't> FusedIterator for CaptureMatches<'r, 't> {}
+
+/// An iterator over all non-overlapping matches for a particular string.
+///
+/// The iterator yields a `Match` value. The iterator stops when no more
+/// matches can be found.
+///
+/// `'r` is the lifetime of the compiled regular expression and `'t` is the
+/// lifetime of the matched string.
+#[derive(Debug)]
+pub struct Matches<'r, 't>(re_trait::Matches<'t, ExecNoSyncStr<'r>>);
+
+impl<'r, 't> Iterator for Matches<'r, 't> {
+    type Item = Match<'t>;
+
+    fn next(&mut self) -> Option<Match<'t>> {
+        let text = self.0.text();
+        // Convert the raw `(start, end)` offsets into a `Match` over `text`.
+        let (begin, end) = self.0.next()?;
+        Some(Match::new(text, begin, end))
+    }
+}
+
+impl<'r, 't> FusedIterator for Matches<'r, 't> {}
+
+/// Replacer describes types that can be used to replace matches in a string.
+///
+/// In general, users of this crate shouldn't need to implement this trait,
+/// since implementations are already provided for `&str` along with other
+/// variants of string types and `FnMut(&Captures) -> String` (or any
+/// `FnMut(&Captures) -> T` where `T: AsRef<str>`), which covers most use cases.
+pub trait Replacer {
+ /// Appends text to `dst` to replace the current match.
+ ///
+ /// The current match is represented by `caps`, which is guaranteed to
+ /// have a match at capture group `0`.
+ ///
+ /// For example, a no-op replacement would be
+ /// `dst.push_str(caps.get(0).unwrap().as_str())`.
+ fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String);
+
+ /// Return a fixed unchanging replacement string.
+ ///
+ /// When doing replacements, if access to `Captures` is not needed (e.g.,
+ /// the replacement string does not need `$` expansion), then it can
+ /// be beneficial to avoid finding sub-captures.
+ ///
+ /// In general, this is called once for every call to `replacen`.
+ fn no_expansion<'r>(&'r mut self) -> Option<Cow<'r, str>> {
+ None
+ }
+
+ /// Return a `Replacer` that borrows and wraps this `Replacer`.
+ ///
+ /// This is useful when you want to take a generic `Replacer` (which might
+ /// not be cloneable) and use it without consuming it, so it can be used
+ /// more than once.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::{Regex, Replacer};
+ ///
+ /// fn replace_all_twice<R: Replacer>(
+ /// re: Regex,
+ /// src: &str,
+ /// mut rep: R,
+ /// ) -> String {
+ /// let dst = re.replace_all(src, rep.by_ref());
+ /// let dst = re.replace_all(&dst, rep.by_ref());
+ /// dst.into_owned()
+ /// }
+ /// ```
+ fn by_ref<'r>(&'r mut self) -> ReplacerRef<'r, Self> {
+ ReplacerRef(self)
+ }
+}
+
+/// By-reference adaptor for a `Replacer`
+///
+/// Returned by [`Replacer::by_ref`](trait.Replacer.html#method.by_ref).
+#[derive(Debug)]
+pub struct ReplacerRef<'a, R: ?Sized>(&'a mut R);
+
+impl<'a, R: Replacer + ?Sized + 'a> Replacer for ReplacerRef<'a, R> {
+    fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) {
+        // Forward to the borrowed replacer.
+        let inner: &mut R = &mut *self.0;
+        inner.replace_append(caps, dst)
+    }
+    fn no_expansion(&mut self) -> Option<Cow<'_, str>> {
+        let inner: &mut R = &mut *self.0;
+        inner.no_expansion()
+    }
+}
+
+impl<'a> Replacer for &'a str {
+    fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) {
+        // The string is a replacement template; let `caps` expand it.
+        let template: &str = *self;
+        caps.expand(template, dst);
+    }
+
+    fn no_expansion(&mut self) -> Option<Cow<'_, str>> {
+        no_expansion(&*self)
+    }
+}
+
+impl<'a> Replacer for &'a String {
+    fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) {
+        // Treat the string as a replacement template, exactly like `&str`.
+        let template: &str = self.as_str();
+        caps.expand(template, dst);
+    }
+
+    fn no_expansion(&mut self) -> Option<Cow<'_, str>> {
+        no_expansion(&*self)
+    }
+}
+
+impl Replacer for String {
+    fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) {
+        // Treat the string as a replacement template, exactly like `&str`.
+        let template: &str = self.as_str();
+        caps.expand(template, dst);
+    }
+
+    fn no_expansion(&mut self) -> Option<Cow<'_, str>> {
+        no_expansion(&*self)
+    }
+}
+
+impl<'a> Replacer for Cow<'a, str> {
+    fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) {
+        // Treat the string as a replacement template, exactly like `&str`.
+        let template: &str = self.as_ref();
+        caps.expand(template, dst);
+    }
+
+    fn no_expansion(&mut self) -> Option<Cow<'_, str>> {
+        no_expansion(&*self)
+    }
+}
+
+impl<'a> Replacer for &'a Cow<'a, str> {
+    fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) {
+        // Treat the string as a replacement template, exactly like `&str`.
+        let template: &str = self.as_ref();
+        caps.expand(template, dst);
+    }
+
+    fn no_expansion(&mut self) -> Option<Cow<'_, str>> {
+        no_expansion(&*self)
+    }
+}
+
+fn no_expansion<T: AsRef<str>>(t: &T) -> Option<Cow<'_, str>> {
+    let template = t.as_ref();
+    // Any `$` may introduce a capture group reference, so only a template
+    // without one can be used verbatim.
+    if find_byte(b'$', template.as_bytes()).is_some() {
+        None
+    } else {
+        Some(Cow::Borrowed(template))
+    }
+}
+
+impl<F, T> Replacer for F
+where
+    F: FnMut(&Captures<'_>) -> T,
+    T: AsRef<str>,
+{
+    fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) {
+        // Call the closure and append whatever text it produced.
+        let produced = (*self)(caps);
+        dst.push_str(produced.as_ref());
+    }
+}
+
+/// `NoExpand` indicates literal string replacement.
+///
+/// It can be used with `replace` and `replace_all` to do a literal string
+/// replacement without expanding `$name` to their corresponding capture
+/// groups. This can be both convenient (to avoid escaping `$`, for example)
+/// and performant (since capture groups don't need to be found).
+///
+/// `'t` is the lifetime of the literal text.
+#[derive(Clone, Debug)]
+pub struct NoExpand<'t>(pub &'t str);
+
+impl<'t> Replacer for NoExpand<'t> {
+    fn replace_append(&mut self, _: &Captures<'_>, dst: &mut String) {
+        // The wrapped text is appended verbatim; captures are ignored.
+        let literal: &str = self.0;
+        dst.push_str(literal);
+    }
+
+    fn no_expansion(&mut self) -> Option<Cow<'_, str>> {
+        let literal: &str = self.0;
+        Some(Cow::Borrowed(literal))
+    }
+}
diff --git a/third_party/rust/regex/src/sparse.rs b/third_party/rust/regex/src/sparse.rs
new file mode 100644
index 0000000000..98b726613d
--- /dev/null
+++ b/third_party/rust/regex/src/sparse.rs
@@ -0,0 +1,84 @@
+use std::fmt;
+use std::ops::Deref;
+use std::slice;
+
+/// A sparse set used for representing ordered NFA states.
+///
+/// This supports constant time addition and membership testing. Clearing an
+/// entire set can also be done in constant time. Iteration yields elements
+/// in the order in which they were inserted.
+///
+/// The data structure is based on: <https://research.swtch.com/sparse>
+/// Note though that we don't actually use uninitialized memory. We generally
+/// reuse allocations, so the initial allocation cost is bearable. However,
+/// its other properties listed above are extremely useful.
+#[derive(Clone)]
+pub struct SparseSet {
+ /// Dense contains the instruction pointers in the order in which they
+ /// were inserted.
+ dense: Vec<usize>,
+ /// Sparse maps instruction pointers to their location in dense.
+ ///
+ /// An instruction pointer is in the set if and only if
+ /// `sparse[ip] < dense.len() && ip == dense[sparse[ip]]`.
+ sparse: Box<[usize]>,
+}
+
+impl SparseSet {
+ pub fn new(size: usize) -> SparseSet {
+ SparseSet {
+ dense: Vec::with_capacity(size),
+ sparse: vec![0; size].into_boxed_slice(), // zero-filled, never uninitialized
+ }
+ }
+
+ pub fn len(&self) -> usize {
+ self.dense.len()
+ }
+
+ pub fn is_empty(&self) -> bool {
+ self.dense.is_empty()
+ }
+
+ pub fn capacity(&self) -> usize {
+ self.dense.capacity()
+ }
+
+ pub fn insert(&mut self, value: usize) { // no duplicate check is performed here
+ let i = self.len();
+ assert!(i < self.capacity()); // guarantees the push below cannot reallocate
+ self.dense.push(value);
+ self.sparse[value] = i; // indexing panics if value >= sparse.len()
+ }
+
+ pub fn contains(&self, value: usize) -> bool {
+ let i = self.sparse[value]; // may be stale; validated against dense below
+ self.dense.get(i) == Some(&value)
+ }
+
+ pub fn clear(&mut self) {
+ self.dense.clear(); // sparse keeps old entries; contains() re-validates them
+ }
+}
+
+impl fmt::Debug for SparseSet {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ write!(f, "SparseSet({:?})", self.dense)
+ }
+}
+
+impl Deref for SparseSet {
+ type Target = [usize];
+
+ fn deref(&self) -> &Self::Target {
+ &self.dense
+ }
+}
+
+impl<'a> IntoIterator for &'a SparseSet {
+ type Item = &'a usize;
+ type IntoIter = slice::Iter<'a, usize>;
+ fn into_iter(self) -> Self::IntoIter {
+ self.iter()
+ }
+}
diff --git a/third_party/rust/regex/src/testdata/LICENSE b/third_party/rust/regex/src/testdata/LICENSE
new file mode 100644
index 0000000000..f47dbf4c44
--- /dev/null
+++ b/third_party/rust/regex/src/testdata/LICENSE
@@ -0,0 +1,19 @@
+The following license covers testregex.c and all associated test data.
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of THIS SOFTWARE FILE (the "Software"), to deal in the Software
+without restriction, including without limitation the rights to use,
+copy, modify, merge, publish, distribute, and/or sell copies of the
+Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following disclaimer:
+
+THIS SOFTWARE IS PROVIDED BY AT&T ``AS IS'' AND ANY EXPRESS OR IMPLIED
+WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+IN NO EVENT SHALL AT&T BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/third_party/rust/regex/src/testdata/README b/third_party/rust/regex/src/testdata/README
new file mode 100644
index 0000000000..6efc2dad33
--- /dev/null
+++ b/third_party/rust/regex/src/testdata/README
@@ -0,0 +1,17 @@
+Test data was taken from the Go distribution, which was in turn taken from the
+testregex test suite:
+
+ http://www2.research.att.com/~astopen/testregex/testregex.html
+
+The LICENSE in this directory corresponds to the LICENSE that the data was
+released under.
+
+The tests themselves were modified for RE2/Go. A couple were modified further
+by me (Andrew Gallant) (only in repetition.dat) so that RE2/Go would pass them.
+(Yes, it seems like RE2/Go includes failing test cases.) This may or may not
+have been a bad idea, but I think being consistent with an established Regex
+library is worth something.
+
+Note that these files are read by 'scripts/regex-match-tests.py' and turned
+into Rust tests found in 'regex_macros/tests/matches.rs'.
+
diff --git a/third_party/rust/regex/src/testdata/basic.dat b/third_party/rust/regex/src/testdata/basic.dat
new file mode 100644
index 0000000000..632e1bb416
--- /dev/null
+++ b/third_party/rust/regex/src/testdata/basic.dat
@@ -0,0 +1,221 @@
+NOTE all standard compliant implementations should pass these : 2002-05-31
+
+BE abracadabra$ abracadabracadabra (7,18)
+BE a...b abababbb (2,7)
+BE XXXXXX ..XXXXXX (2,8)
+E \) () (1,2)
+BE a] a]a (0,2)
+B } } (0,1)
+E \} } (0,1)
+BE \] ] (0,1)
+B ] ] (0,1)
+E ] ] (0,1)
+B { { (0,1)
+B } } (0,1)
+BE ^a ax (0,1)
+BE \^a a^a (1,3)
+BE a\^ a^ (0,2)
+BE a$ aa (1,2)
+BE a\$ a$ (0,2)
+BE ^$ NULL (0,0)
+E $^ NULL (0,0)
+E a($) aa (1,2)(2,2)
+E a*(^a) aa (0,1)(0,1)
+E (..)*(...)* a (0,0)
+E (..)*(...)* abcd (0,4)(2,4)
+E (ab|a)(bc|c) abc (0,3)(0,2)(2,3)
+E (ab)c|abc abc (0,3)(0,2)
+E a{0}b ab (1,2)
+E (a*)(b?)(b+)b{3} aaabbbbbbb (0,10)(0,3)(3,4)(4,7)
+E (a*)(b{0,1})(b{1,})b{3} aaabbbbbbb (0,10)(0,3)(3,4)(4,7)
+E a{9876543210} NULL BADBR
+E ((a|a)|a) a (0,1)(0,1)(0,1)
+E (a*)(a|aa) aaaa (0,4)(0,3)(3,4)
+E a*(a.|aa) aaaa (0,4)(2,4)
+E a(b)|c(d)|a(e)f aef (0,3)(?,?)(?,?)(1,2)
+E (a|b)?.* b (0,1)(0,1)
+E (a|b)c|a(b|c) ac (0,2)(0,1)
+E (a|b)c|a(b|c) ab (0,2)(?,?)(1,2)
+E (a|b)*c|(a|ab)*c abc (0,3)(1,2)
+E (a|b)*c|(a|ab)*c xc (1,2)
+E (.a|.b).*|.*(.a|.b) xa (0,2)(0,2)
+E a?(ab|ba)ab abab (0,4)(0,2)
+E a?(ac{0}b|ba)ab abab (0,4)(0,2)
+E ab|abab abbabab (0,2)
+E aba|bab|bba baaabbbaba (5,8)
+E aba|bab baaabbbaba (6,9)
+E (aa|aaa)*|(a|aaaaa) aa (0,2)(0,2)
+E (a.|.a.)*|(a|.a...) aa (0,2)(0,2)
+E ab|a xabc (1,3)
+E ab|a xxabc (2,4)
+Ei (?-u)(Ab|cD)* aBcD (0,4)(2,4)
+BE [^-] --a (2,3)
+BE [a-]* --a (0,3)
+BE [a-m-]* --amoma-- (0,4)
+E :::1:::0:|:::1:1:0: :::0:::1:::1:::0: (8,17)
+E :::1:::0:|:::1:1:1: :::0:::1:::1:::0: (8,17)
+{E [[:upper:]] A (0,1) [[<element>]] not supported
+E [[:lower:]]+ `az{ (1,3)
+E [[:upper:]]+ @AZ[ (1,3)
+# No collation in Go
+#BE [[-]] [[-]] (2,4)
+#BE [[.NIL.]] NULL ECOLLATE
+#BE [[=aleph=]] NULL ECOLLATE
+}
+BE$ \n \n (0,1)
+BEn$ \n \n (0,1)
+BE$ [^a] \n (0,1)
+BE$ \na \na (0,2)
+E (a)(b)(c) abc (0,3)(0,1)(1,2)(2,3)
+BE xxx xxx (0,3)
+E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) feb 6, (0,6)
+E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) 2/7 (0,3)
+E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) feb 1,Feb 6 (5,11)
+E3 ((((((((((((((((((((((((((((((x)))))))))))))))))))))))))))))) x (0,1)(0,1)(0,1)
+E3 ((((((((((((((((((((((((((((((x))))))))))))))))))))))))))))))* xx (0,2)(1,2)(1,2)
+E a?(ab|ba)* ababababababababababababababababababababababababababababababababababababababababa (0,81)(79,81)
+E abaa|abbaa|abbbaa|abbbbaa ababbabbbabbbabbbbabbbbaa (18,25)
+E abaa|abbaa|abbbaa|abbbbaa ababbabbbabbbabbbbabaa (18,22)
+E aaac|aabc|abac|abbc|baac|babc|bbac|bbbc baaabbbabac (7,11)
+BE$ .* \x01\x7f (0,2)
+E aaaa|bbbb|cccc|ddddd|eeeeee|fffffff|gggg|hhhh|iiiii|jjjjj|kkkkk|llll XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa (53,57)
+L aaaa\nbbbb\ncccc\nddddd\neeeeee\nfffffff\ngggg\nhhhh\niiiii\njjjjj\nkkkkk\nllll XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa NOMATCH
+E a*a*a*a*a*b aaaaaaaaab (0,10)
+BE ^ NULL (0,0)
+BE $ NULL (0,0)
+BE ^$ NULL (0,0)
+BE ^a$ a (0,1)
+BE abc abc (0,3)
+BE abc xabcy (1,4)
+BE abc ababc (2,5)
+BE ab*c abc (0,3)
+BE ab*bc abc (0,3)
+BE ab*bc abbc (0,4)
+BE ab*bc abbbbc (0,6)
+E ab+bc abbc (0,4)
+E ab+bc abbbbc (0,6)
+E ab?bc abbc (0,4)
+E ab?bc abc (0,3)
+E ab?c abc (0,3)
+BE ^abc$ abc (0,3)
+BE ^abc abcc (0,3)
+BE abc$ aabc (1,4)
+BE ^ abc (0,0)
+BE $ abc (3,3)
+BE a.c abc (0,3)
+BE a.c axc (0,3)
+BE a.*c axyzc (0,5)
+BE a[bc]d abd (0,3)
+BE a[b-d]e ace (0,3)
+BE a[b-d] aac (1,3)
+BE a[-b] a- (0,2)
+BE a[b-] a- (0,2)
+BE a] a] (0,2)
+BE a[]]b a]b (0,3)
+BE a[^bc]d aed (0,3)
+BE a[^-b]c adc (0,3)
+BE a[^]b]c adc (0,3)
+E ab|cd abc (0,2)
+E ab|cd abcd (0,2)
+E a\(b a(b (0,3)
+E a\(*b ab (0,2)
+E a\(*b a((b (0,4)
+E ((a)) abc (0,1)(0,1)(0,1)
+E (a)b(c) abc (0,3)(0,1)(2,3)
+E a+b+c aabbabc (4,7)
+E a* aaa (0,3)
+#E (a*)* - (0,0)(0,0)
+E (a*)* - (0,0)(?,?) RE2/Go
+E (a*)+ - (0,0)(0,0)
+#E (a*|b)* - (0,0)(0,0)
+E (a*|b)* - (0,0)(?,?) RE2/Go
+E (a+|b)* ab (0,2)(1,2)
+E (a+|b)+ ab (0,2)(1,2)
+E (a+|b)? ab (0,1)(0,1)
+BE [^ab]* cde (0,3)
+#E (^)* - (0,0)(0,0)
+E (^)* - (0,0)(?,?) RE2/Go
+BE a* NULL (0,0)
+E ([abc])*d abbbcd (0,6)(4,5)
+E ([abc])*bcd abcd (0,4)(0,1)
+E a|b|c|d|e e (0,1)
+E (a|b|c|d|e)f ef (0,2)(0,1)
+#E ((a*|b))* - (0,0)(0,0)(0,0)
+E ((a*|b))* - (0,0)(?,?)(?,?) RE2/Go
+BE abcd*efg abcdefg (0,7)
+BE ab* xabyabbbz (1,3)
+BE ab* xayabbbz (1,2)
+E (ab|cd)e abcde (2,5)(2,4)
+BE [abhgefdc]ij hij (0,3)
+E (a|b)c*d abcd (1,4)(1,2)
+E (ab|ab*)bc abc (0,3)(0,1)
+E a([bc]*)c* abc (0,3)(1,3)
+E a([bc]*)(c*d) abcd (0,4)(1,3)(3,4)
+E a([bc]+)(c*d) abcd (0,4)(1,3)(3,4)
+E a([bc]*)(c+d) abcd (0,4)(1,2)(2,4)
+E a[bcd]*dcdcde adcdcde (0,7)
+E (ab|a)b*c abc (0,3)(0,2)
+E ((a)(b)c)(d) abcd (0,4)(0,3)(0,1)(1,2)(3,4)
+BE [A-Za-z_][A-Za-z0-9_]* alpha (0,5)
+E ^a(bc+|b[eh])g|.h$ abh (1,3)
+E (bc+d$|ef*g.|h?i(j|k)) effgz (0,5)(0,5)
+E (bc+d$|ef*g.|h?i(j|k)) ij (0,2)(0,2)(1,2)
+E (bc+d$|ef*g.|h?i(j|k)) reffgz (1,6)(1,6)
+E (((((((((a))))))))) a (0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)
+BE multiple words multiple words yeah (0,14)
+E (.*)c(.*) abcde (0,5)(0,2)(3,5)
+BE abcd abcd (0,4)
+E a(bc)d abcd (0,4)(1,3)
+E a[-]?c ac (0,3)
+E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Qaddafi (0,15)(?,?)(10,12)
+E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mo'ammar Gadhafi (0,16)(?,?)(11,13)
+E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Kaddafi (0,15)(?,?)(10,12)
+E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Qadhafi (0,15)(?,?)(10,12)
+E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Gadafi (0,14)(?,?)(10,11)
+E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mu'ammar Qadafi (0,15)(?,?)(11,12)
+E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moamar Gaddafi (0,14)(?,?)(9,11)
+E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mu'ammar Qadhdhafi (0,18)(?,?)(13,15)
+E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Khaddafi (0,16)(?,?)(11,13)
+E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghaddafy (0,16)(?,?)(11,13)
+E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghadafi (0,15)(?,?)(11,12)
+E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghaddafi (0,16)(?,?)(11,13)
+E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muamar Kaddafi (0,14)(?,?)(9,11)
+E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Quathafi (0,16)(?,?)(11,13)
+E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Gheddafi (0,16)(?,?)(11,13)
+E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moammar Khadafy (0,15)(?,?)(11,12)
+E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moammar Qudhafi (0,15)(?,?)(10,12)
+E a+(b|c)*d+ aabcdd (0,6)(3,4)
+E ^.+$ vivi (0,4)
+E ^(.+)$ vivi (0,4)(0,4)
+E ^([^!.]+).att.com!(.+)$ gryphon.att.com!eby (0,19)(0,7)(16,19)
+E ^([^!]+!)?([^!]+)$ bas (0,3)(?,?)(0,3)
+E ^([^!]+!)?([^!]+)$ bar!bas (0,7)(0,4)(4,7)
+E ^([^!]+!)?([^!]+)$ foo!bas (0,7)(0,4)(4,7)
+E ^.+!([^!]+!)([^!]+)$ foo!bar!bas (0,11)(4,8)(8,11)
+E ((foo)|(bar))!bas bar!bas (0,7)(0,3)(?,?)(0,3)
+E ((foo)|(bar))!bas foo!bar!bas (4,11)(4,7)(?,?)(4,7)
+E ((foo)|(bar))!bas foo!bas (0,7)(0,3)(0,3)
+E ((foo)|bar)!bas bar!bas (0,7)(0,3)
+E ((foo)|bar)!bas foo!bar!bas (4,11)(4,7)
+E ((foo)|bar)!bas foo!bas (0,7)(0,3)(0,3)
+E (foo|(bar))!bas bar!bas (0,7)(0,3)(0,3)
+E (foo|(bar))!bas foo!bar!bas (4,11)(4,7)(4,7)
+E (foo|(bar))!bas foo!bas (0,7)(0,3)
+E (foo|bar)!bas bar!bas (0,7)(0,3)
+E (foo|bar)!bas foo!bar!bas (4,11)(4,7)
+E (foo|bar)!bas foo!bas (0,7)(0,3)
+E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bar!bas (0,11)(0,11)(?,?)(?,?)(4,8)(8,11)
+E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ bas (0,3)(?,?)(0,3)
+E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ bar!bas (0,7)(0,4)(4,7)
+E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ foo!bar!bas (0,11)(?,?)(?,?)(4,8)(8,11)
+E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ foo!bas (0,7)(0,4)(4,7)
+E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ bas (0,3)(0,3)(?,?)(0,3)
+E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ bar!bas (0,7)(0,7)(0,4)(4,7)
+E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bar!bas (0,11)(0,11)(?,?)(?,?)(4,8)(8,11)
+E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bas (0,7)(0,7)(0,4)(4,7)
+E .*(/XXX).* /XXX (0,4)(0,4)
+E .*(\\XXX).* \XXX (0,4)(0,4)
+E \\XXX \XXX (0,4)
+E .*(/000).* /000 (0,4)(0,4)
+E .*(\\000).* \000 (0,4)(0,4)
+E \\000 \000 (0,4)
diff --git a/third_party/rust/regex/src/testdata/nullsubexpr.dat b/third_party/rust/regex/src/testdata/nullsubexpr.dat
new file mode 100644
index 0000000000..2e18fbb917
--- /dev/null
+++ b/third_party/rust/regex/src/testdata/nullsubexpr.dat
@@ -0,0 +1,79 @@
+NOTE null subexpression matches : 2002-06-06
+
+E (a*)* a (0,1)(0,1)
+#E SAME x (0,0)(0,0)
+E SAME x (0,0)(?,?) RE2/Go
+E SAME aaaaaa (0,6)(0,6)
+E SAME aaaaaax (0,6)(0,6)
+E (a*)+ a (0,1)(0,1)
+E SAME x (0,0)(0,0)
+E SAME aaaaaa (0,6)(0,6)
+E SAME aaaaaax (0,6)(0,6)
+E (a+)* a (0,1)(0,1)
+E SAME x (0,0)
+E SAME aaaaaa (0,6)(0,6)
+E SAME aaaaaax (0,6)(0,6)
+E (a+)+ a (0,1)(0,1)
+E SAME x NOMATCH
+E SAME aaaaaa (0,6)(0,6)
+E SAME aaaaaax (0,6)(0,6)
+
+E ([a]*)* a (0,1)(0,1)
+#E SAME x (0,0)(0,0)
+E SAME x (0,0)(?,?) RE2/Go
+E SAME aaaaaa (0,6)(0,6)
+E SAME aaaaaax (0,6)(0,6)
+E ([a]*)+ a (0,1)(0,1)
+E SAME x (0,0)(0,0)
+E SAME aaaaaa (0,6)(0,6)
+E SAME aaaaaax (0,6)(0,6)
+E ([^b]*)* a (0,1)(0,1)
+#E SAME b (0,0)(0,0)
+E SAME b (0,0)(?,?) RE2/Go
+E SAME aaaaaa (0,6)(0,6)
+E SAME aaaaaab (0,6)(0,6)
+E ([ab]*)* a (0,1)(0,1)
+E SAME aaaaaa (0,6)(0,6)
+E SAME ababab (0,6)(0,6)
+E SAME bababa (0,6)(0,6)
+E SAME b (0,1)(0,1)
+E SAME bbbbbb (0,6)(0,6)
+E SAME aaaabcde (0,5)(0,5)
+E ([^a]*)* b (0,1)(0,1)
+E SAME bbbbbb (0,6)(0,6)
+#E SAME aaaaaa (0,0)(0,0)
+E SAME aaaaaa (0,0)(?,?) RE2/Go
+E ([^ab]*)* ccccxx (0,6)(0,6)
+#E SAME ababab (0,0)(0,0)
+E SAME ababab (0,0)(?,?) RE2/Go
+
+E ((z)+|a)* zabcde (0,2)(1,2)
+
+#{E a+? aaaaaa (0,1) no *? +? mimimal match ops
+#E (a) aaa (0,1)(0,1)
+#E (a*?) aaa (0,0)(0,0)
+#E (a)*? aaa (0,0)
+#E (a*?)*? aaa (0,0)
+#}
+
+B \(a*\)*\(x\) x (0,1)(0,0)(0,1)
+B \(a*\)*\(x\) ax (0,2)(0,1)(1,2)
+B \(a*\)*\(x\) axa (0,2)(0,1)(1,2)
+B \(a*\)*\(x\)\(\1\) x (0,1)(0,0)(0,1)(1,1)
+B \(a*\)*\(x\)\(\1\) ax (0,2)(1,1)(1,2)(2,2)
+B \(a*\)*\(x\)\(\1\) axa (0,3)(0,1)(1,2)(2,3)
+B \(a*\)*\(x\)\(\1\)\(x\) axax (0,4)(0,1)(1,2)(2,3)(3,4)
+B \(a*\)*\(x\)\(\1\)\(x\) axxa (0,3)(1,1)(1,2)(2,2)(2,3)
+
+#E (a*)*(x) x (0,1)(0,0)(0,1)
+E (a*)*(x) x (0,1)(?,?)(0,1) RE2/Go
+E (a*)*(x) ax (0,2)(0,1)(1,2)
+E (a*)*(x) axa (0,2)(0,1)(1,2)
+
+E (a*)+(x) x (0,1)(0,0)(0,1)
+E (a*)+(x) ax (0,2)(0,1)(1,2)
+E (a*)+(x) axa (0,2)(0,1)(1,2)
+
+E (a*){2}(x) x (0,1)(0,0)(0,1)
+E (a*){2}(x) ax (0,2)(1,1)(1,2)
+E (a*){2}(x) axa (0,2)(1,1)(1,2)
diff --git a/third_party/rust/regex/src/testdata/repetition.dat b/third_party/rust/regex/src/testdata/repetition.dat
new file mode 100644
index 0000000000..3bb2121180
--- /dev/null
+++ b/third_party/rust/regex/src/testdata/repetition.dat
@@ -0,0 +1,163 @@
+NOTE implicit vs. explicit repetitions : 2009-02-02
+
+# Glenn Fowler <gsf@research.att.com>
+# conforming matches (column 4) must match one of the following BREs
+# NOMATCH
+# (0,.)\((\(.\),\(.\))(?,?)(\2,\3)\)*
+# (0,.)\((\(.\),\(.\))(\2,\3)(?,?)\)*
+# i.e., each 3-tuple has two identical elements and one (?,?)
+
+E ((..)|(.)) NULL NOMATCH
+E ((..)|(.))((..)|(.)) NULL NOMATCH
+E ((..)|(.))((..)|(.))((..)|(.)) NULL NOMATCH
+
+E ((..)|(.)){1} NULL NOMATCH
+E ((..)|(.)){2} NULL NOMATCH
+E ((..)|(.)){3} NULL NOMATCH
+
+E ((..)|(.))* NULL (0,0)
+
+E ((..)|(.)) a (0,1)(0,1)(?,?)(0,1)
+E ((..)|(.))((..)|(.)) a NOMATCH
+E ((..)|(.))((..)|(.))((..)|(.)) a NOMATCH
+
+E ((..)|(.)){1} a (0,1)(0,1)(?,?)(0,1)
+E ((..)|(.)){2} a NOMATCH
+E ((..)|(.)){3} a NOMATCH
+
+E ((..)|(.))* a (0,1)(0,1)(?,?)(0,1)
+
+E ((..)|(.)) aa (0,2)(0,2)(0,2)(?,?)
+E ((..)|(.))((..)|(.)) aa (0,2)(0,1)(?,?)(0,1)(1,2)(?,?)(1,2)
+E ((..)|(.))((..)|(.))((..)|(.)) aa NOMATCH
+
+E ((..)|(.)){1} aa (0,2)(0,2)(0,2)(?,?)
+E ((..)|(.)){2} aa (0,2)(1,2)(?,?)(1,2)
+E ((..)|(.)){3} aa NOMATCH
+
+E ((..)|(.))* aa (0,2)(0,2)(0,2)(?,?)
+
+E ((..)|(.)) aaa (0,2)(0,2)(0,2)(?,?)
+E ((..)|(.))((..)|(.)) aaa (0,3)(0,2)(0,2)(?,?)(2,3)(?,?)(2,3)
+E ((..)|(.))((..)|(.))((..)|(.)) aaa (0,3)(0,1)(?,?)(0,1)(1,2)(?,?)(1,2)(2,3)(?,?)(2,3)
+
+E ((..)|(.)){1} aaa (0,2)(0,2)(0,2)(?,?)
+#E ((..)|(.)){2} aaa (0,3)(2,3)(?,?)(2,3)
+E ((..)|(.)){2} aaa (0,3)(2,3)(0,2)(2,3) RE2/Go
+E ((..)|(.)){3} aaa (0,3)(2,3)(?,?)(2,3)
+
+#E ((..)|(.))* aaa (0,3)(2,3)(?,?)(2,3)
+E ((..)|(.))* aaa (0,3)(2,3)(0,2)(2,3) RE2/Go
+
+E ((..)|(.)) aaaa (0,2)(0,2)(0,2)(?,?)
+E ((..)|(.))((..)|(.)) aaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)
+E ((..)|(.))((..)|(.))((..)|(.)) aaaa (0,4)(0,2)(0,2)(?,?)(2,3)(?,?)(2,3)(3,4)(?,?)(3,4)
+
+E ((..)|(.)){1} aaaa (0,2)(0,2)(0,2)(?,?)
+E ((..)|(.)){2} aaaa (0,4)(2,4)(2,4)(?,?)
+#E ((..)|(.)){3} aaaa (0,4)(3,4)(?,?)(3,4)
+E ((..)|(.)){3} aaaa (0,4)(3,4)(0,2)(3,4) RE2/Go
+
+E ((..)|(.))* aaaa (0,4)(2,4)(2,4)(?,?)
+
+E ((..)|(.)) aaaaa (0,2)(0,2)(0,2)(?,?)
+E ((..)|(.))((..)|(.)) aaaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)
+E ((..)|(.))((..)|(.))((..)|(.)) aaaaa (0,5)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)(4,5)(?,?)(4,5)
+
+E ((..)|(.)){1} aaaaa (0,2)(0,2)(0,2)(?,?)
+E ((..)|(.)){2} aaaaa (0,4)(2,4)(2,4)(?,?)
+#E ((..)|(.)){3} aaaaa (0,5)(4,5)(?,?)(4,5)
+E ((..)|(.)){3} aaaaa (0,5)(4,5)(2,4)(4,5) RE2/Go
+
+#E ((..)|(.))* aaaaa (0,5)(4,5)(?,?)(4,5)
+E ((..)|(.))* aaaaa (0,5)(4,5)(2,4)(4,5) RE2/Go
+
+E ((..)|(.)) aaaaaa (0,2)(0,2)(0,2)(?,?)
+E ((..)|(.))((..)|(.)) aaaaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)
+E ((..)|(.))((..)|(.))((..)|(.)) aaaaaa (0,6)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)(4,6)(4,6)(?,?)
+
+E ((..)|(.)){1} aaaaaa (0,2)(0,2)(0,2)(?,?)
+E ((..)|(.)){2} aaaaaa (0,4)(2,4)(2,4)(?,?)
+E ((..)|(.)){3} aaaaaa (0,6)(4,6)(4,6)(?,?)
+
+E ((..)|(.))* aaaaaa (0,6)(4,6)(4,6)(?,?)
+
+NOTE additional repetition tests graciously provided by Chris Kuklewicz www.haskell.org 2009-02-02
+
+# These test a bug in OS X / FreeBSD / NetBSD, and libtree.
+# Linux/GLIBC gets the {8,} and {8,8} wrong.
+
+:HA#100:E X(.?){0,}Y X1234567Y (0,9)(7,8)
+:HA#101:E X(.?){1,}Y X1234567Y (0,9)(7,8)
+:HA#102:E X(.?){2,}Y X1234567Y (0,9)(7,8)
+:HA#103:E X(.?){3,}Y X1234567Y (0,9)(7,8)
+:HA#104:E X(.?){4,}Y X1234567Y (0,9)(7,8)
+:HA#105:E X(.?){5,}Y X1234567Y (0,9)(7,8)
+:HA#106:E X(.?){6,}Y X1234567Y (0,9)(7,8)
+:HA#107:E X(.?){7,}Y X1234567Y (0,9)(7,8)
+:HA#108:E X(.?){8,}Y X1234567Y (0,9)(8,8)
+#:HA#110:E X(.?){0,8}Y X1234567Y (0,9)(7,8)
+:HA#110:E X(.?){0,8}Y X1234567Y (0,9)(8,8) RE2/Go
+#:HA#111:E X(.?){1,8}Y X1234567Y (0,9)(7,8)
+:HA#111:E X(.?){1,8}Y X1234567Y (0,9)(8,8) RE2/Go
+#:HA#112:E X(.?){2,8}Y X1234567Y (0,9)(7,8)
+:HA#112:E X(.?){2,8}Y X1234567Y (0,9)(8,8) RE2/Go
+#:HA#113:E X(.?){3,8}Y X1234567Y (0,9)(7,8)
+:HA#113:E X(.?){3,8}Y X1234567Y (0,9)(8,8) RE2/Go
+#:HA#114:E X(.?){4,8}Y X1234567Y (0,9)(7,8)
+:HA#114:E X(.?){4,8}Y X1234567Y (0,9)(8,8) RE2/Go
+#:HA#115:E X(.?){5,8}Y X1234567Y (0,9)(7,8)
+:HA#115:E X(.?){5,8}Y X1234567Y (0,9)(8,8) RE2/Go
+#:HA#116:E X(.?){6,8}Y X1234567Y (0,9)(7,8)
+:HA#116:E X(.?){6,8}Y X1234567Y (0,9)(8,8) RE2/Go
+#:HA#117:E X(.?){7,8}Y X1234567Y (0,9)(7,8)
+:HA#117:E X(.?){7,8}Y X1234567Y (0,9)(8,8) RE2/Go
+:HA#118:E X(.?){8,8}Y X1234567Y (0,9)(8,8)
+
+# These test a fixed bug in my regex-tdfa that did not keep the expanded
+# form properly grouped, so right association did the wrong thing with
+# these ambiguous patterns (crafted just to test my code when I became
+# suspicious of my implementation). The first subexpression should use
+# "ab" then "a" then "bcd".
+
+# OS X / FreeBSD / NetBSD badly fail many of these, with impossible
+# results like (0,6)(4,5)(6,6).
+
+:HA#260:E (a|ab|c|bcd){0,}(d*) ababcd (0,1)(0,1)(1,1)
+:HA#261:E (a|ab|c|bcd){1,}(d*) ababcd (0,1)(0,1)(1,1)
+:HA#262:E (a|ab|c|bcd){2,}(d*) ababcd (0,6)(3,6)(6,6)
+:HA#263:E (a|ab|c|bcd){3,}(d*) ababcd (0,6)(3,6)(6,6)
+:HA#264:E (a|ab|c|bcd){4,}(d*) ababcd NOMATCH
+:HA#265:E (a|ab|c|bcd){0,10}(d*) ababcd (0,1)(0,1)(1,1)
+:HA#266:E (a|ab|c|bcd){1,10}(d*) ababcd (0,1)(0,1)(1,1)
+:HA#267:E (a|ab|c|bcd){2,10}(d*) ababcd (0,6)(3,6)(6,6)
+:HA#268:E (a|ab|c|bcd){3,10}(d*) ababcd (0,6)(3,6)(6,6)
+:HA#269:E (a|ab|c|bcd){4,10}(d*) ababcd NOMATCH
+:HA#270:E (a|ab|c|bcd)*(d*) ababcd (0,1)(0,1)(1,1)
+:HA#271:E (a|ab|c|bcd)+(d*) ababcd (0,1)(0,1)(1,1)
+
+# The above worked on Linux/GLIBC but the following often fail.
+# They also trip up OS X / FreeBSD / NetBSD:
+
+#:HA#280:E (ab|a|c|bcd){0,}(d*) ababcd (0,6)(3,6)(6,6)
+:HA#280:E (ab|a|c|bcd){0,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
+#:HA#281:E (ab|a|c|bcd){1,}(d*) ababcd (0,6)(3,6)(6,6)
+:HA#281:E (ab|a|c|bcd){1,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
+#:HA#282:E (ab|a|c|bcd){2,}(d*) ababcd (0,6)(3,6)(6,6)
+:HA#282:E (ab|a|c|bcd){2,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
+#:HA#283:E (ab|a|c|bcd){3,}(d*) ababcd (0,6)(3,6)(6,6)
+:HA#283:E (ab|a|c|bcd){3,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
+:HA#284:E (ab|a|c|bcd){4,}(d*) ababcd NOMATCH
+#:HA#285:E (ab|a|c|bcd){0,10}(d*) ababcd (0,6)(3,6)(6,6)
+:HA#285:E (ab|a|c|bcd){0,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
+#:HA#286:E (ab|a|c|bcd){1,10}(d*) ababcd (0,6)(3,6)(6,6)
+:HA#286:E (ab|a|c|bcd){1,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
+#:HA#287:E (ab|a|c|bcd){2,10}(d*) ababcd (0,6)(3,6)(6,6)
+:HA#287:E (ab|a|c|bcd){2,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
+#:HA#288:E (ab|a|c|bcd){3,10}(d*) ababcd (0,6)(3,6)(6,6)
+:HA#288:E (ab|a|c|bcd){3,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
+:HA#289:E (ab|a|c|bcd){4,10}(d*) ababcd NOMATCH
+#:HA#290:E (ab|a|c|bcd)*(d*) ababcd (0,6)(3,6)(6,6)
+:HA#290:E (ab|a|c|bcd)*(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
+#:HA#291:E (ab|a|c|bcd)+(d*) ababcd (0,6)(3,6)(6,6)
+:HA#291:E (ab|a|c|bcd)+(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
diff --git a/third_party/rust/regex/src/utf8.rs b/third_party/rust/regex/src/utf8.rs
new file mode 100644
index 0000000000..2dfd2c0d1d
--- /dev/null
+++ b/third_party/rust/regex/src/utf8.rs
@@ -0,0 +1,264 @@
+//! A few elementary UTF-8 encoding and decoding functions used by the matching
+//! engines.
+//!
+//! In an ideal world, the matching engines operate on `&str` and we can just
+//! lean on the standard library for all our UTF-8 needs. However, to support
+//! byte based regexes (that can match on arbitrary bytes which may contain
+//! UTF-8), we need to be capable of searching and decoding UTF-8 on a `&[u8]`.
+//! The standard library doesn't really recognize this use case, so we have
+//! to build it out ourselves.
+//!
+//! Should this be factored out into a separate crate? It seems independently
+//! useful. There are other crates that already exist (e.g., `utf-8`) that have
+//! overlapping use cases. Not sure what to do.
+use std::char;
+
+const TAG_CONT: u8 = 0b1000_0000;
+const TAG_TWO: u8 = 0b1100_0000;
+const TAG_THREE: u8 = 0b1110_0000;
+const TAG_FOUR: u8 = 0b1111_0000;
+
+/// Returns the smallest possible index of the next valid UTF-8 sequence
+/// starting after `i`. Only `text[i]` is inspected; nothing is validated.
+pub fn next_utf8(text: &[u8], i: usize) -> usize {
+ let b = match text.get(i) {
+ None => return i + 1, // `i` is out of bounds
+ Some(&b) => b,
+ };
+ let inc = if b <= 0x7F {
+ 1
+ } else if b <= 0b110_11111 { // 0x80..=0xDF, which also covers continuation bytes
+ 2
+ } else if b <= 0b1110_1111 { // 0xE0..=0xEF
+ 3
+ } else { // 0xF0..=0xFF
+ 4
+ };
+ i + inc
+}
+
+/// Decode a single UTF-8 sequence into a single Unicode codepoint from `src`.
+///
+/// If no valid UTF-8 sequence could be found, then `None` is returned.
+/// Otherwise, the decoded codepoint and the number of bytes read is returned.
+/// The number of bytes read (for a valid UTF-8 sequence) is guaranteed to be
+/// 1, 2, 3 or 4.
+///
+/// Note that a UTF-8 sequence is invalid if it is incorrect UTF-8, encodes a
+/// codepoint that is out of range (surrogate codepoints are out of range) or
+/// is not the shortest possible UTF-8 sequence for that codepoint.
+#[inline]
+pub fn decode_utf8(src: &[u8]) -> Option<(char, usize)> {
+ let b0 = match src.get(0) {
+ None => return None,
+ Some(&b) if b <= 0x7F => return Some((b as char, 1)),
+ Some(&b) => b,
+ };
+ match b0 {
+ 0b110_00000..=0b110_11111 => {
+ if src.len() < 2 {
+ return None;
+ }
+ let b1 = src[1];
+ if 0b11_000000 & b1 != TAG_CONT {
+ return None;
+ }
+ let cp = ((b0 & !TAG_TWO) as u32) << 6 | ((b1 & !TAG_CONT) as u32);
+ match cp {
+ 0x80..=0x7FF => char::from_u32(cp).map(|cp| (cp, 2)),
+ _ => None,
+ }
+ }
+ 0b1110_0000..=0b1110_1111 => {
+ if src.len() < 3 {
+ return None;
+ }
+ let (b1, b2) = (src[1], src[2]);
+ if 0b11_000000 & b1 != TAG_CONT {
+ return None;
+ }
+ if 0b11_000000 & b2 != TAG_CONT {
+ return None;
+ }
+ let cp = ((b0 & !TAG_THREE) as u32) << 12
+ | ((b1 & !TAG_CONT) as u32) << 6
+ | ((b2 & !TAG_CONT) as u32);
+ match cp {
+ // char::from_u32 will disallow surrogate codepoints.
+ 0x800..=0xFFFF => char::from_u32(cp).map(|cp| (cp, 3)),
+ _ => None,
+ }
+ }
+ 0b11110_000..=0b11110_111 => {
+ if src.len() < 4 {
+ return None;
+ }
+ let (b1, b2, b3) = (src[1], src[2], src[3]);
+ if 0b11_000000 & b1 != TAG_CONT {
+ return None;
+ }
+ if 0b11_000000 & b2 != TAG_CONT {
+ return None;
+ }
+ if 0b11_000000 & b3 != TAG_CONT {
+ return None;
+ }
+ let cp = ((b0 & !TAG_FOUR) as u32) << 18
+ | ((b1 & !TAG_CONT) as u32) << 12
+ | ((b2 & !TAG_CONT) as u32) << 6
+ | ((b3 & !TAG_CONT) as u32);
+ match cp {
+ 0x10000..=0x0010_FFFF => char::from_u32(cp).map(|cp| (cp, 4)),
+ _ => None,
+ }
+ }
+ _ => None,
+ }
+}
+
+/// Like `decode_utf8`, but decodes the last UTF-8 sequence in `src` instead
+/// of the first.
+pub fn decode_last_utf8(src: &[u8]) -> Option<(char, usize)> {
+ if src.is_empty() {
+ return None;
+ }
+ let mut start = src.len() - 1;
+ if src[start] <= 0x7F { // ASCII fast path: the last byte is a whole codepoint
+ return Some((src[start] as char, 1));
+ }
+ while start > src.len().saturating_sub(4) { // step back at most 3 bytes
+ start -= 1;
+ if is_start_byte(src[start]) {
+ break;
+ }
+ }
+ match decode_utf8(&src[start..]) {
+ None => None,
+ Some((_, n)) if n < src.len() - start => None, // sequence stops short of the end
+ Some((cp, n)) => Some((cp, n)),
+ }
+}
+
+fn is_start_byte(b: u8) -> bool {
+ b & 0b11_000000 != 0b1_0000000 // true unless `b` has the continuation pattern 10xxxxxx
+}
+
+#[cfg(test)]
+mod tests {
+ use std::str;
+
+ use quickcheck::quickcheck;
+
+ use super::{
+ decode_last_utf8, decode_utf8, TAG_CONT, TAG_FOUR, TAG_THREE, TAG_TWO,
+ };
+
+ #[test]
+ fn prop_roundtrip() {
+ fn p(given_cp: char) -> bool {
+ let mut tmp = [0; 4];
+ let encoded_len = given_cp.encode_utf8(&mut tmp).len();
+ let (got_cp, got_len) = decode_utf8(&tmp[..encoded_len]).unwrap();
+ encoded_len == got_len && given_cp == got_cp
+ }
+ quickcheck(p as fn(char) -> bool)
+ }
+
+ #[test]
+ fn prop_roundtrip_last() {
+ fn p(given_cp: char) -> bool {
+ let mut tmp = [0; 4];
+ let encoded_len = given_cp.encode_utf8(&mut tmp).len();
+ let (got_cp, got_len) =
+ decode_last_utf8(&tmp[..encoded_len]).unwrap();
+ encoded_len == got_len && given_cp == got_cp
+ }
+ quickcheck(p as fn(char) -> bool)
+ }
+
+ #[test]
+ fn prop_encode_matches_std() {
+ fn p(cp: char) -> bool {
+ let mut got = [0; 4];
+ let n = cp.encode_utf8(&mut got).len();
+ let expected = cp.to_string();
+ &got[..n] == expected.as_bytes()
+ }
+ quickcheck(p as fn(char) -> bool)
+ }
+
+ #[test]
+ fn prop_decode_matches_std() {
+ fn p(given_cp: char) -> bool {
+ let mut tmp = [0; 4];
+ let n = given_cp.encode_utf8(&mut tmp).len();
+ let (got_cp, _) = decode_utf8(&tmp[..n]).unwrap();
+ let expected_cp =
+ str::from_utf8(&tmp[..n]).unwrap().chars().next().unwrap();
+ got_cp == expected_cp
+ }
+ quickcheck(p as fn(char) -> bool)
+ }
+
+ #[test]
+ fn prop_decode_last_matches_std() {
+ fn p(given_cp: char) -> bool {
+ let mut tmp = [0; 4];
+ let n = given_cp.encode_utf8(&mut tmp).len();
+ let (got_cp, _) = decode_last_utf8(&tmp[..n]).unwrap();
+ let expected_cp = str::from_utf8(&tmp[..n])
+ .unwrap()
+ .chars()
+ .rev()
+ .next()
+ .unwrap();
+ got_cp == expected_cp
+ }
+ quickcheck(p as fn(char) -> bool)
+ }
+
+ #[test]
+ fn reject_invalid() {
+ // Invalid start byte
+ assert_eq!(decode_utf8(&[0xFF]), None);
+ // Lone surrogate codepoint (these bytes encode U+D801), not a pair
+ assert_eq!(decode_utf8(&[0xED, 0xA0, 0x81]), None);
+ // Invalid continuation byte.
+ assert_eq!(decode_utf8(&[0xD4, 0xC2]), None);
+ // Truncated sequences
+ assert_eq!(decode_utf8(&[0xC3]), None); // lead byte needs 2 bytes
+ assert_eq!(decode_utf8(&[0xEF, 0xBF]), None); // lead byte needs 3 bytes
+ assert_eq!(decode_utf8(&[0xF4, 0x8F, 0xBF]), None); // lead byte needs 4 bytes
+ // Not a minimal UTF-8 sequence. NOTE(review): TAG_CONT | b'a' is 0xE1, not 10xxxxxx.
+ assert_eq!(decode_utf8(&[TAG_TWO, TAG_CONT | b'a']), None);
+ assert_eq!(decode_utf8(&[TAG_THREE, TAG_CONT, TAG_CONT | b'a']), None);
+ assert_eq!(
+ decode_utf8(&[TAG_FOUR, TAG_CONT, TAG_CONT, TAG_CONT | b'a',]),
+ None
+ );
+ }
+
+ #[test]
+ fn reject_invalid_last() {
+ // Invalid start byte
+ assert_eq!(decode_last_utf8(&[0xFF]), None);
+ // Lone surrogate codepoint (these bytes encode U+D801), not a pair
+ assert_eq!(decode_last_utf8(&[0xED, 0xA0, 0x81]), None);
+ // Truncated sequences
+ assert_eq!(decode_last_utf8(&[0xC3]), None); // lead byte needs 2 bytes
+ assert_eq!(decode_last_utf8(&[0xEF, 0xBF]), None); // lead byte needs 3 bytes
+ assert_eq!(decode_last_utf8(&[0xF4, 0x8F, 0xBF]), None); // lead byte needs 4 bytes
+ // Not a minimal UTF-8 sequence. NOTE(review): TAG_CONT | b'a' is 0xE1, not 10xxxxxx.
+ assert_eq!(decode_last_utf8(&[TAG_TWO, TAG_CONT | b'a']), None);
+ assert_eq!(
+ decode_last_utf8(&[TAG_THREE, TAG_CONT, TAG_CONT | b'a',]),
+ None
+ );
+ assert_eq!(
+ decode_last_utf8(
+ &[TAG_FOUR, TAG_CONT, TAG_CONT, TAG_CONT | b'a',]
+ ),
+ None
+ );
+ }
+}