summaryrefslogtreecommitdiffstats
path: root/vendor/regex/src
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/regex/src')
-rw-r--r--vendor/regex/src/backtrack.rs282
-rw-r--r--vendor/regex/src/builders.rs2539
-rw-r--r--vendor/regex/src/bytes.rs91
-rw-r--r--vendor/regex/src/compile.rs1333
-rw-r--r--vendor/regex/src/dfa.rs1945
-rw-r--r--vendor/regex/src/error.rs53
-rw-r--r--vendor/regex/src/exec.rs1759
-rw-r--r--vendor/regex/src/expand.rs247
-rw-r--r--vendor/regex/src/find_byte.rs5
-rw-r--r--vendor/regex/src/freqs.rs261
-rw-r--r--vendor/regex/src/input.rs432
-rw-r--r--vendor/regex/src/lib.rs1347
-rw-r--r--vendor/regex/src/literal/imp.rs413
-rw-r--r--vendor/regex/src/literal/mod.rs55
-rw-r--r--vendor/regex/src/pattern.rs4
-rw-r--r--vendor/regex/src/pikevm.rs360
-rw-r--r--vendor/regex/src/pool.rs333
-rw-r--r--vendor/regex/src/prog.rs451
-rw-r--r--vendor/regex/src/re_builder.rs421
-rw-r--r--vendor/regex/src/re_bytes.rs1372
-rw-r--r--vendor/regex/src/re_set.rs518
-rw-r--r--vendor/regex/src/re_trait.rs294
-rw-r--r--vendor/regex/src/re_unicode.rs1415
-rw-r--r--vendor/regex/src/regex/bytes.rs2600
-rw-r--r--vendor/regex/src/regex/mod.rs2
-rw-r--r--vendor/regex/src/regex/string.rs2582
-rw-r--r--vendor/regex/src/regexset/bytes.rs710
-rw-r--r--vendor/regex/src/regexset/mod.rs2
-rw-r--r--vendor/regex/src/regexset/string.rs706
-rw-r--r--vendor/regex/src/sparse.rs84
-rw-r--r--vendor/regex/src/testdata/LICENSE19
-rw-r--r--vendor/regex/src/testdata/README17
-rw-r--r--vendor/regex/src/testdata/basic.dat221
-rw-r--r--vendor/regex/src/testdata/nullsubexpr.dat79
-rw-r--r--vendor/regex/src/testdata/repetition.dat163
-rw-r--r--vendor/regex/src/utf8.rs264
36 files changed, 10215 insertions, 13164 deletions
diff --git a/vendor/regex/src/backtrack.rs b/vendor/regex/src/backtrack.rs
deleted file mode 100644
index 4d83856ca..000000000
--- a/vendor/regex/src/backtrack.rs
+++ /dev/null
@@ -1,282 +0,0 @@
-// This is the backtracking matching engine. It has the same exact capability
-// as the full NFA simulation, except it is artificially restricted to small
-// regexes on small inputs because of its memory requirements.
-//
-// In particular, this is a *bounded* backtracking engine. It retains worst
-// case linear time by keeping track of the states that it has visited (using a
-// bitmap). Namely, once a state is visited, it is never visited again. Since a
-// state is keyed by `(instruction index, input index)`, we have that its time
-// complexity is `O(mn)` (i.e., linear in the size of the search text).
-//
-// The backtracking engine can beat out the NFA simulation on small
-// regexes/inputs because it doesn't have to keep track of multiple copies of
-// the capture groups. In benchmarks, the backtracking engine is roughly twice
-// as fast as the full NFA simulation. Note though that its performance doesn't
-// scale, even if you're willing to live with the memory requirements. Namely,
-// the bitset has to be zeroed on each execution, which becomes quite expensive
-// on large bitsets.
-
-use crate::exec::ProgramCache;
-use crate::input::{Input, InputAt};
-use crate::prog::{InstPtr, Program};
-use crate::re_trait::Slot;
-
-type Bits = u32;
-
-const BIT_SIZE: usize = 32;
-const MAX_SIZE_BYTES: usize = 256 * (1 << 10); // 256 KB
-
-/// Returns true iff the given regex and input should be executed by this
-/// engine with reasonable memory usage.
-pub fn should_exec(num_insts: usize, text_len: usize) -> bool {
- // Total memory usage in bytes is determined by:
- //
- // ((len(insts) * (len(input) + 1) + bits - 1) / bits) * (size_of(u32))
- //
- // The actual limit picked is pretty much a heuristic.
- // See: https://github.com/rust-lang/regex/issues/215
- let size = ((num_insts * (text_len + 1) + BIT_SIZE - 1) / BIT_SIZE) * 4;
- size <= MAX_SIZE_BYTES
-}
-
-/// A backtracking matching engine.
-#[derive(Debug)]
-pub struct Bounded<'a, 'm, 'r, 's, I> {
- prog: &'r Program,
- input: I,
- matches: &'m mut [bool],
- slots: &'s mut [Slot],
- m: &'a mut Cache,
-}
-
-/// Shared cached state between multiple invocations of a backtracking engine
-/// in the same thread.
-#[derive(Clone, Debug)]
-pub struct Cache {
- jobs: Vec<Job>,
- visited: Vec<Bits>,
-}
-
-impl Cache {
- /// Create new empty cache for the backtracking engine.
- pub fn new(_prog: &Program) -> Self {
- Cache { jobs: vec![], visited: vec![] }
- }
-}
-
-/// A job is an explicit unit of stack space in the backtracking engine.
-///
-/// The "normal" representation is a single state transition, which corresponds
-/// to an NFA state and a character in the input. However, the backtracking
-/// engine must keep track of old capture group values. We use the explicit
-/// stack to do it.
-#[derive(Clone, Copy, Debug)]
-enum Job {
- Inst { ip: InstPtr, at: InputAt },
- SaveRestore { slot: usize, old_pos: Option<usize> },
-}
-
-impl<'a, 'm, 'r, 's, I: Input> Bounded<'a, 'm, 'r, 's, I> {
- /// Execute the backtracking matching engine.
- ///
- /// If there's a match, `exec` returns `true` and populates the given
- /// captures accordingly.
- pub fn exec(
- prog: &'r Program,
- cache: &ProgramCache,
- matches: &'m mut [bool],
- slots: &'s mut [Slot],
- input: I,
- start: usize,
- end: usize,
- ) -> bool {
- let mut cache = cache.borrow_mut();
- let cache = &mut cache.backtrack;
- let start = input.at(start);
- let mut b = Bounded { prog, input, matches, slots, m: cache };
- b.exec_(start, end)
- }
-
- /// Clears the cache such that the backtracking engine can be executed
- /// on some input of fixed length.
- fn clear(&mut self) {
- // Reset the job memory so that we start fresh.
- self.m.jobs.clear();
-
- // Now we need to clear the bit state set.
- // We do this by figuring out how much space we need to keep track
- // of the states we've visited.
- // Then we reset all existing allocated space to 0.
- // Finally, we request more space if we need it.
- //
- // This is all a little circuitous, but doing this using unchecked
- // operations doesn't seem to have a measurable impact on performance.
- // (Probably because backtracking is limited to such small
- // inputs/regexes in the first place.)
- let visited_len =
- (self.prog.len() * (self.input.len() + 1) + BIT_SIZE - 1)
- / BIT_SIZE;
- self.m.visited.truncate(visited_len);
- for v in &mut self.m.visited {
- *v = 0;
- }
- if visited_len > self.m.visited.len() {
- let len = self.m.visited.len();
- self.m.visited.reserve_exact(visited_len - len);
- for _ in 0..(visited_len - len) {
- self.m.visited.push(0);
- }
- }
- }
-
- /// Start backtracking at the given position in the input, but also look
- /// for literal prefixes.
- fn exec_(&mut self, mut at: InputAt, end: usize) -> bool {
- self.clear();
- // If this is an anchored regex at the beginning of the input, then
- // we're either already done or we only need to try backtracking once.
- if self.prog.is_anchored_start {
- return if !at.is_start() { false } else { self.backtrack(at) };
- }
- let mut matched = false;
- loop {
- if !self.prog.prefixes.is_empty() {
- at = match self.input.prefix_at(&self.prog.prefixes, at) {
- None => break,
- Some(at) => at,
- };
- }
- matched = self.backtrack(at) || matched;
- if matched && self.prog.matches.len() == 1 {
- return true;
- }
- if at.pos() >= end {
- break;
- }
- at = self.input.at(at.next_pos());
- }
- matched
- }
-
- /// The main backtracking loop starting at the given input position.
- fn backtrack(&mut self, start: InputAt) -> bool {
- // N.B. We use an explicit stack to avoid recursion.
- // To avoid excessive pushing and popping, most transitions are handled
- // in the `step` helper function, which only pushes to the stack when
- // there's a capture or a branch.
- let mut matched = false;
- self.m.jobs.push(Job::Inst { ip: 0, at: start });
- while let Some(job) = self.m.jobs.pop() {
- match job {
- Job::Inst { ip, at } => {
- if self.step(ip, at) {
- // Only quit if we're matching one regex.
- // If we're matching a regex set, then mush on and
- // try to find other matches (if we want them).
- if self.prog.matches.len() == 1 {
- return true;
- }
- matched = true;
- }
- }
- Job::SaveRestore { slot, old_pos } => {
- if slot < self.slots.len() {
- self.slots[slot] = old_pos;
- }
- }
- }
- }
- matched
- }
-
- fn step(&mut self, mut ip: InstPtr, mut at: InputAt) -> bool {
- use crate::prog::Inst::*;
- loop {
- // This loop is an optimization to avoid constantly pushing/popping
- // from the stack. Namely, if we're pushing a job only to run it
- // next, avoid the push and just mutate `ip` (and possibly `at`)
- // in place.
- if self.has_visited(ip, at) {
- return false;
- }
- match self.prog[ip] {
- Match(slot) => {
- if slot < self.matches.len() {
- self.matches[slot] = true;
- }
- return true;
- }
- Save(ref inst) => {
- if let Some(&old_pos) = self.slots.get(inst.slot) {
- // If this path doesn't work out, then we save the old
- // capture index (if one exists) in an alternate
- // job. If the next path fails, then the alternate
- // job is popped and the old capture index is restored.
- self.m.jobs.push(Job::SaveRestore {
- slot: inst.slot,
- old_pos,
- });
- self.slots[inst.slot] = Some(at.pos());
- }
- ip = inst.goto;
- }
- Split(ref inst) => {
- self.m.jobs.push(Job::Inst { ip: inst.goto2, at });
- ip = inst.goto1;
- }
- EmptyLook(ref inst) => {
- if self.input.is_empty_match(at, inst) {
- ip = inst.goto;
- } else {
- return false;
- }
- }
- Char(ref inst) => {
- if inst.c == at.char() {
- ip = inst.goto;
- at = self.input.at(at.next_pos());
- } else {
- return false;
- }
- }
- Ranges(ref inst) => {
- if inst.matches(at.char()) {
- ip = inst.goto;
- at = self.input.at(at.next_pos());
- } else {
- return false;
- }
- }
- Bytes(ref inst) => {
- if let Some(b) = at.byte() {
- if inst.matches(b) {
- ip = inst.goto;
- at = self.input.at(at.next_pos());
- continue;
- }
- }
- return false;
- }
- }
- }
- }
-
- fn has_visited(&mut self, ip: InstPtr, at: InputAt) -> bool {
- let k = ip * (self.input.len() + 1) + at.pos();
- let k1 = k / BIT_SIZE;
- let k2 = usize_to_u32(1 << (k & (BIT_SIZE - 1)));
- if self.m.visited[k1] & k2 == 0 {
- self.m.visited[k1] |= k2;
- false
- } else {
- true
- }
- }
-}
-
-fn usize_to_u32(n: usize) -> u32 {
- if (n as u64) > (::std::u32::MAX as u64) {
- panic!("BUG: {} is too big to fit into u32", n)
- }
- n as u32
-}
diff --git a/vendor/regex/src/builders.rs b/vendor/regex/src/builders.rs
new file mode 100644
index 000000000..c111a96c0
--- /dev/null
+++ b/vendor/regex/src/builders.rs
@@ -0,0 +1,2539 @@
+#![allow(warnings)]
+
+// This module defines an internal builder that encapsulates all interaction
+// with meta::Regex construction, and then 4 public API builders that wrap
+// around it. The docs are essentially repeated on each of the 4 public
+// builders, with tweaks to the examples as needed.
+//
+// The reason why there are so many builders is partially because of a misstep
+// in the initial API design: the builder constructor takes in the pattern
+// strings instead of using the `build` method to accept the pattern strings.
+// This means `new` has a different signature for each builder. It probably
+// would have been nicer to use one builder with `fn new()`, and then add
+// `build(pat)` and `build_many(pats)` constructors.
+//
+// The other reason is because I think the `bytes` module should probably
+// have its own builder type. That way, it is completely isolated from the
+// top-level API.
+//
+// If I could do it again, I'd probably have a `regex::Builder` and a
+// `regex::bytes::Builder`. Each would have `build` and `build_set` (or
+// `build_many`) methods for constructing a single pattern `Regex` and a
+// multi-pattern `RegexSet`, respectively.
+
+use alloc::{
+ string::{String, ToString},
+ sync::Arc,
+ vec,
+ vec::Vec,
+};
+
+use regex_automata::{
+ meta, nfa::thompson::WhichCaptures, util::syntax, MatchKind,
+};
+
+use crate::error::Error;
+
+/// A builder for constructing a `Regex`, `bytes::Regex`, `RegexSet` or a
+/// `bytes::RegexSet`.
+///
+/// This is essentially the implementation of the four different builder types
+/// in the public API: `RegexBuilder`, `bytes::RegexBuilder`, `RegexSetBuilder`
+/// and `bytes::RegexSetBuilder`.
+#[derive(Clone, Debug)]
+struct Builder {
+ pats: Vec<String>,
+ metac: meta::Config,
+ syntaxc: syntax::Config,
+}
+
+impl Default for Builder {
+ fn default() -> Builder {
+ let metac = meta::Config::new()
+ .nfa_size_limit(Some(10 * (1 << 20)))
+ .hybrid_cache_capacity(2 * (1 << 20));
+ Builder { pats: vec![], metac, syntaxc: syntax::Config::default() }
+ }
+}
+
+impl Builder {
+ fn new<I, S>(patterns: I) -> Builder
+ where
+ S: AsRef<str>,
+ I: IntoIterator<Item = S>,
+ {
+ let mut b = Builder::default();
+ b.pats.extend(patterns.into_iter().map(|p| p.as_ref().to_string()));
+ b
+ }
+
+ fn build_one_string(&self) -> Result<crate::Regex, Error> {
+ assert_eq!(1, self.pats.len());
+ let metac = self
+ .metac
+ .clone()
+ .match_kind(MatchKind::LeftmostFirst)
+ .utf8_empty(true);
+ let syntaxc = self.syntaxc.clone().utf8(true);
+ let pattern = Arc::from(self.pats[0].as_str());
+ meta::Builder::new()
+ .configure(metac)
+ .syntax(syntaxc)
+ .build(&pattern)
+ .map(|meta| crate::Regex { meta, pattern })
+ .map_err(Error::from_meta_build_error)
+ }
+
+ fn build_one_bytes(&self) -> Result<crate::bytes::Regex, Error> {
+ assert_eq!(1, self.pats.len());
+ let metac = self
+ .metac
+ .clone()
+ .match_kind(MatchKind::LeftmostFirst)
+ .utf8_empty(false);
+ let syntaxc = self.syntaxc.clone().utf8(false);
+ let pattern = Arc::from(self.pats[0].as_str());
+ meta::Builder::new()
+ .configure(metac)
+ .syntax(syntaxc)
+ .build(&pattern)
+ .map(|meta| crate::bytes::Regex { meta, pattern })
+ .map_err(Error::from_meta_build_error)
+ }
+
+ fn build_many_string(&self) -> Result<crate::RegexSet, Error> {
+ let metac = self
+ .metac
+ .clone()
+ .match_kind(MatchKind::All)
+ .utf8_empty(true)
+ .which_captures(WhichCaptures::None);
+ let syntaxc = self.syntaxc.clone().utf8(true);
+ let patterns = Arc::from(self.pats.as_slice());
+ meta::Builder::new()
+ .configure(metac)
+ .syntax(syntaxc)
+ .build_many(&patterns)
+ .map(|meta| crate::RegexSet { meta, patterns })
+ .map_err(Error::from_meta_build_error)
+ }
+
+ fn build_many_bytes(&self) -> Result<crate::bytes::RegexSet, Error> {
+ let metac = self
+ .metac
+ .clone()
+ .match_kind(MatchKind::All)
+ .utf8_empty(false)
+ .which_captures(WhichCaptures::None);
+ let syntaxc = self.syntaxc.clone().utf8(false);
+ let patterns = Arc::from(self.pats.as_slice());
+ meta::Builder::new()
+ .configure(metac)
+ .syntax(syntaxc)
+ .build_many(&patterns)
+ .map(|meta| crate::bytes::RegexSet { meta, patterns })
+ .map_err(Error::from_meta_build_error)
+ }
+
+ fn case_insensitive(&mut self, yes: bool) -> &mut Builder {
+ self.syntaxc = self.syntaxc.case_insensitive(yes);
+ self
+ }
+
+ fn multi_line(&mut self, yes: bool) -> &mut Builder {
+ self.syntaxc = self.syntaxc.multi_line(yes);
+ self
+ }
+
+ fn dot_matches_new_line(&mut self, yes: bool) -> &mut Builder {
+ self.syntaxc = self.syntaxc.dot_matches_new_line(yes);
+ self
+ }
+
+ fn crlf(&mut self, yes: bool) -> &mut Builder {
+ self.syntaxc = self.syntaxc.crlf(yes);
+ self
+ }
+
+ fn line_terminator(&mut self, byte: u8) -> &mut Builder {
+ self.metac = self.metac.clone().line_terminator(byte);
+ self.syntaxc = self.syntaxc.line_terminator(byte);
+ self
+ }
+
+ fn swap_greed(&mut self, yes: bool) -> &mut Builder {
+ self.syntaxc = self.syntaxc.swap_greed(yes);
+ self
+ }
+
+ fn ignore_whitespace(&mut self, yes: bool) -> &mut Builder {
+ self.syntaxc = self.syntaxc.ignore_whitespace(yes);
+ self
+ }
+
+ fn unicode(&mut self, yes: bool) -> &mut Builder {
+ self.syntaxc = self.syntaxc.unicode(yes);
+ self
+ }
+
+ fn octal(&mut self, yes: bool) -> &mut Builder {
+ self.syntaxc = self.syntaxc.octal(yes);
+ self
+ }
+
+ fn size_limit(&mut self, limit: usize) -> &mut Builder {
+ self.metac = self.metac.clone().nfa_size_limit(Some(limit));
+ self
+ }
+
+ fn dfa_size_limit(&mut self, limit: usize) -> &mut Builder {
+ self.metac = self.metac.clone().hybrid_cache_capacity(limit);
+ self
+ }
+
+ fn nest_limit(&mut self, limit: u32) -> &mut Builder {
+ self.syntaxc = self.syntaxc.nest_limit(limit);
+ self
+ }
+}
+
+pub(crate) mod string {
+ use crate::{error::Error, Regex, RegexSet};
+
+ use super::Builder;
+
+ /// A configurable builder for a [`Regex`].
+ ///
+ /// This builder can be used to programmatically set flags such as `i`
+ /// (case insensitive) and `x` (for verbose mode). This builder can also be
+ /// used to configure things like the line terminator and a size limit on
+ /// the compiled regular expression.
+ #[derive(Clone, Debug)]
+ pub struct RegexBuilder {
+ builder: Builder,
+ }
+
+ impl RegexBuilder {
+ /// Create a new builder with a default configuration for the given
+ /// pattern.
+ ///
+ /// If the pattern is invalid or exceeds the configured size limits,
+ /// then an error will be returned when [`RegexBuilder::build`] is
+ /// called.
+ pub fn new(pattern: &str) -> RegexBuilder {
+ RegexBuilder { builder: Builder::new([pattern]) }
+ }
+
+ /// Compiles the pattern given to `RegexBuilder::new` with the
+ /// configuration set on this builder.
+ ///
+ /// If the pattern isn't a valid regex or if a configured size limit
+ /// was exceeded, then an error is returned.
+ pub fn build(&self) -> Result<Regex, Error> {
+ self.builder.build_one_string()
+ }
+
+ /// This configures Unicode mode for the entire pattern.
+ ///
+ /// Enabling Unicode mode does a number of things:
+ ///
+ /// * Most fundamentally, it causes the fundamental atom of matching
+ /// to be a single codepoint. When Unicode mode is disabled, it's a
+ /// single byte. For example, when Unicode mode is enabled, `.` will
+ /// match `💩` once, whereas it will match 4 times when Unicode mode
+ /// is disabled. (Since the UTF-8 encoding of `💩` is 4 bytes long.)
+ /// * Case insensitive matching uses Unicode simple case folding rules.
+ /// * Unicode character classes like `\p{Letter}` and `\p{Greek}` are
+ /// available.
+ /// * Perl character classes are Unicode aware. That is, `\w`, `\s` and
+ /// `\d`.
+ /// * The word boundary assertions, `\b` and `\B`, use the Unicode
+ /// definition of a word character.
+ ///
+ /// Note that if Unicode mode is disabled, then the regex will fail to
+ /// compile if it could match invalid UTF-8. For example, when Unicode
+ /// mode is disabled, then since `.` matches any byte (except for
+ /// `\n`), then it can match invalid UTF-8 and thus building a regex
+ /// from it will fail. Another example is `\w` and `\W`. Since `\w` can
+ /// only match ASCII bytes when Unicode mode is disabled, it's allowed.
+ /// But `\W` can match more than ASCII bytes, including invalid UTF-8,
+ /// and so it is not allowed. This restriction can be lifted only by
+ /// using a [`bytes::Regex`](crate::bytes::Regex).
+ ///
+ /// For more details on the Unicode support in this crate, see the
+ /// [Unicode section](crate#unicode) in this crate's top-level
+ /// documentation.
+ ///
+ /// The default for this is `true`.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::RegexBuilder;
+ ///
+ /// let re = RegexBuilder::new(r"\w")
+ /// .unicode(false)
+ /// .build()
+ /// .unwrap();
+ /// // Normally greek letters would be included in \w, but since
+ /// // Unicode mode is disabled, it only matches ASCII letters.
+ /// assert!(!re.is_match("δ"));
+ ///
+ /// let re = RegexBuilder::new(r"s")
+ /// .case_insensitive(true)
+ /// .unicode(false)
+ /// .build()
+ /// .unwrap();
+ /// // Normally 'ſ' is included when searching for 's' case
+ /// // insensitively due to Unicode's simple case folding rules. But
+ /// // when Unicode mode is disabled, only ASCII case insensitive rules
+ /// // are used.
+ /// assert!(!re.is_match("ſ"));
+ /// ```
+ pub fn unicode(&mut self, yes: bool) -> &mut RegexBuilder {
+ self.builder.unicode(yes);
+ self
+ }
+
+ /// This configures whether to enable case insensitive matching for the
+ /// entire pattern.
+ ///
+ /// This setting can also be configured using the inline flag `i`
+ /// in the pattern. For example, `(?i:foo)` matches `foo` case
+ /// insensitively while `(?-i:foo)` matches `foo` case sensitively.
+ ///
+ /// The default for this is `false`.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::RegexBuilder;
+ ///
+ /// let re = RegexBuilder::new(r"foo(?-i:bar)quux")
+ /// .case_insensitive(true)
+ /// .build()
+ /// .unwrap();
+ /// assert!(re.is_match("FoObarQuUx"));
+ /// // Even though case insensitive matching is enabled in the builder,
+ /// // it can be locally disabled within the pattern. In this case,
+ /// // `bar` is matched case sensitively.
+ /// assert!(!re.is_match("fooBARquux"));
+ /// ```
+ pub fn case_insensitive(&mut self, yes: bool) -> &mut RegexBuilder {
+ self.builder.case_insensitive(yes);
+ self
+ }
+
+ /// This configures multi-line mode for the entire pattern.
+ ///
+ /// Enabling multi-line mode changes the behavior of the `^` and `$`
+ /// anchor assertions. Instead of only matching at the beginning and
+ /// end of a haystack, respectively, multi-line mode causes them to
+ /// match at the beginning and end of a line *in addition* to the
+ /// beginning and end of a haystack. More precisely, `^` will match at
+ /// the position immediately following a `\n` and `$` will match at the
+ /// position immediately preceding a `\n`.
+ ///
+ /// The behavior of this option can be impacted by other settings too:
+ ///
+ /// * The [`RegexBuilder::line_terminator`] option changes `\n` above
+ /// to any ASCII byte.
+ /// * The [`RegexBuilder::crlf`] option changes the line terminator to
+ /// be either `\r` or `\n`, but never at the position between a `\r`
+ /// and `\n`.
+ ///
+ /// This setting can also be configured using the inline flag `m` in
+ /// the pattern.
+ ///
+ /// The default for this is `false`.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::RegexBuilder;
+ ///
+ /// let re = RegexBuilder::new(r"^foo$")
+ /// .multi_line(true)
+ /// .build()
+ /// .unwrap();
+ /// assert_eq!(Some(1..4), re.find("\nfoo\n").map(|m| m.range()));
+ /// ```
+ pub fn multi_line(&mut self, yes: bool) -> &mut RegexBuilder {
+ self.builder.multi_line(yes);
+ self
+ }
+
+ /// This configures dot-matches-new-line mode for the entire pattern.
+ ///
+ /// Perhaps surprisingly, the default behavior for `.` is not to match
+ /// any character, but rather, to match any character except for the
+ /// line terminator (which is `\n` by default). When this mode is
+ /// enabled, the behavior changes such that `.` truly matches any
+ /// character.
+ ///
+ /// This setting can also be configured using the inline flag `s` in
+ /// the pattern. For example, `(?s:.)` and `\p{any}` are equivalent
+ /// regexes.
+ ///
+ /// The default for this is `false`.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::RegexBuilder;
+ ///
+ /// let re = RegexBuilder::new(r"foo.bar")
+ /// .dot_matches_new_line(true)
+ /// .build()
+ /// .unwrap();
+ /// let hay = "foo\nbar";
+ /// assert_eq!(Some("foo\nbar"), re.find(hay).map(|m| m.as_str()));
+ /// ```
+ pub fn dot_matches_new_line(
+ &mut self,
+ yes: bool,
+ ) -> &mut RegexBuilder {
+ self.builder.dot_matches_new_line(yes);
+ self
+ }
+
+ /// This configures CRLF mode for the entire pattern.
+ ///
+ /// When CRLF mode is enabled, both `\r` ("carriage return" or CR for
+ /// short) and `\n` ("line feed" or LF for short) are treated as line
+ /// terminators. This results in the following:
+ ///
+ /// * Unless dot-matches-new-line mode is enabled, `.` will now match
+ /// any character except for `\n` and `\r`.
+ /// * When multi-line mode is enabled, `^` will match immediately
+ /// following a `\n` or a `\r`. Similarly, `$` will match immediately
+ /// preceding a `\n` or a `\r`. Neither `^` nor `$` will ever match
+ /// between `\r` and `\n`.
+ ///
+ /// This setting can also be configured using the inline flag `R` in
+ /// the pattern.
+ ///
+ /// The default for this is `false`.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::RegexBuilder;
+ ///
+ /// let re = RegexBuilder::new(r"^foo$")
+ /// .multi_line(true)
+ /// .crlf(true)
+ /// .build()
+ /// .unwrap();
+ /// let hay = "\r\nfoo\r\n";
+ /// // If CRLF mode weren't enabled here, then '$' wouldn't match
+ /// // immediately after 'foo', and thus no match would be found.
+ /// assert_eq!(Some("foo"), re.find(hay).map(|m| m.as_str()));
+ /// ```
+ ///
+ /// This example demonstrates that `^` will never match at a position
+ /// between `\r` and `\n`. (`$` will similarly not match between a `\r`
+ /// and a `\n`.)
+ ///
+ /// ```
+ /// use regex::RegexBuilder;
+ ///
+ /// let re = RegexBuilder::new(r"^")
+ /// .multi_line(true)
+ /// .crlf(true)
+ /// .build()
+ /// .unwrap();
+ /// let hay = "\r\n\r\n";
+ /// let ranges: Vec<_> = re.find_iter(hay).map(|m| m.range()).collect();
+ /// assert_eq!(ranges, vec![0..0, 2..2, 4..4]);
+ /// ```
+ pub fn crlf(&mut self, yes: bool) -> &mut RegexBuilder {
+ self.builder.crlf(yes);
+ self
+ }
+
+ /// Configures the line terminator to be used by the regex.
+ ///
+ /// The line terminator is relevant in two ways for a particular regex:
+ ///
+ /// * When dot-matches-new-line mode is *not* enabled (the default),
+ /// then `.` will match any character except for the configured line
+ /// terminator.
+ /// * When multi-line mode is enabled (not the default), then `^` and
+ /// `$` will match immediately after and before, respectively, a line
+ /// terminator.
+ ///
+ /// In both cases, if CRLF mode is enabled in a particular context,
+ /// then it takes precedence over any configured line terminator.
+ ///
+ /// This option cannot be configured from within the pattern.
+ ///
+ /// The default line terminator is `\n`.
+ ///
+ /// # Example
+ ///
+ /// This shows how to treat the NUL byte as a line terminator. This can
+ /// be a useful heuristic when searching binary data.
+ ///
+ /// ```
+ /// use regex::RegexBuilder;
+ ///
+ /// let re = RegexBuilder::new(r"^foo$")
+ /// .multi_line(true)
+ /// .line_terminator(b'\x00')
+ /// .build()
+ /// .unwrap();
+ /// let hay = "\x00foo\x00";
+ /// assert_eq!(Some(1..4), re.find(hay).map(|m| m.range()));
+ /// ```
+ ///
+ /// This example shows that the behavior of `.` is impacted by this
+ /// setting as well:
+ ///
+ /// ```
+ /// use regex::RegexBuilder;
+ ///
+ /// let re = RegexBuilder::new(r".")
+ /// .line_terminator(b'\x00')
+ /// .build()
+ /// .unwrap();
+ /// assert!(re.is_match("\n"));
+ /// assert!(!re.is_match("\x00"));
+ /// ```
+ ///
+ /// This shows that building a regex will fail if the byte given
+ /// is not ASCII and the pattern could result in matching invalid
+ /// UTF-8. This is because any singular non-ASCII byte is not valid
+ /// UTF-8, and it is not permitted for a [`Regex`] to match invalid
+ /// UTF-8. (It is permissible to use a non-ASCII byte when building a
+ /// [`bytes::Regex`](crate::bytes::Regex).)
+ ///
+ /// ```
+ /// use regex::RegexBuilder;
+ ///
+ /// assert!(RegexBuilder::new(r".").line_terminator(0x80).build().is_err());
+ /// // Note that using a non-ASCII byte isn't enough on its own to
+ /// // cause regex compilation to fail. You actually have to make use
+ /// // of it in the regex in a way that leads to matching invalid
+ /// // UTF-8. If you don't, then regex compilation will succeed!
+ /// assert!(RegexBuilder::new(r"a").line_terminator(0x80).build().is_ok());
+ /// ```
+ pub fn line_terminator(&mut self, byte: u8) -> &mut RegexBuilder {
+ self.builder.line_terminator(byte);
+ self
+ }
+
+ /// This configures swap-greed mode for the entire pattern.
+ ///
+ /// When swap-greed mode is enabled, patterns like `a+` will become
+ /// non-greedy and patterns like `a+?` will become greedy. In other
+ /// words, the meanings of `a+` and `a+?` are switched.
+ ///
+ /// This setting can also be configured using the inline flag `U` in
+ /// the pattern.
+ ///
+ /// The default for this is `false`.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::RegexBuilder;
+ ///
+ /// let re = RegexBuilder::new(r"a+")
+ /// .swap_greed(true)
+ /// .build()
+ /// .unwrap();
+ /// assert_eq!(Some("a"), re.find("aaa").map(|m| m.as_str()));
+ /// ```
+ pub fn swap_greed(&mut self, yes: bool) -> &mut RegexBuilder {
+ self.builder.swap_greed(yes);
+ self
+ }
+
+ /// This configures verbose mode for the entire pattern.
+ ///
+ /// When enabled, whitespace will be treated as insignificant in the
+ /// pattern and `#` can be used to start a comment until the next new
+ /// line.
+ ///
+ /// Normally, in most places in a pattern, whitespace is treated
+ /// literally. For example ` +` will match one or more ASCII whitespace
+ /// characters.
+ ///
+ /// When verbose mode is enabled, `\#` can be used to match a literal
+ /// `#` and `\ ` can be used to match a literal ASCII whitespace
+ /// character.
+ ///
+ /// Verbose mode is useful for permitting regexes to be formatted and
+ /// broken up more nicely. This may make them more easily readable.
+ ///
+ /// This setting can also be configured using the inline flag `x` in
+ /// the pattern.
+ ///
+ /// The default for this is `false`.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::RegexBuilder;
+ ///
+ /// let pat = r"
+ /// \b
+ /// (?<first>\p{Uppercase}\w*) # always start with uppercase letter
+ /// [\s--\n]+ # whitespace should separate names
+ /// (?: # middle name can be an initial!
+ /// (?:(?<initial>\p{Uppercase})\.|(?<middle>\p{Uppercase}\w*))
+ /// [\s--\n]+
+ /// )?
+ /// (?<last>\p{Uppercase}\w*)
+ /// \b
+ /// ";
+ /// let re = RegexBuilder::new(pat)
+ /// .ignore_whitespace(true)
+ /// .build()
+ /// .unwrap();
+ ///
+ /// let caps = re.captures("Harry Potter").unwrap();
+ /// assert_eq!("Harry", &caps["first"]);
+ /// assert_eq!("Potter", &caps["last"]);
+ ///
+ /// let caps = re.captures("Harry J. Potter").unwrap();
+ /// assert_eq!("Harry", &caps["first"]);
+ /// // Since a middle name/initial isn't required for an overall match,
+ /// // we can't assume that 'initial' or 'middle' will be populated!
+ /// assert_eq!(Some("J"), caps.name("initial").map(|m| m.as_str()));
+ /// assert_eq!(None, caps.name("middle").map(|m| m.as_str()));
+ /// assert_eq!("Potter", &caps["last"]);
+ ///
+ /// let caps = re.captures("Harry James Potter").unwrap();
+ /// assert_eq!("Harry", &caps["first"]);
+ /// // Since a middle name/initial isn't required for an overall match,
+ /// // we can't assume that 'initial' or 'middle' will be populated!
+ /// assert_eq!(None, caps.name("initial").map(|m| m.as_str()));
+ /// assert_eq!(Some("James"), caps.name("middle").map(|m| m.as_str()));
+ /// assert_eq!("Potter", &caps["last"]);
+ /// ```
+ pub fn ignore_whitespace(&mut self, yes: bool) -> &mut RegexBuilder {
+ self.builder.ignore_whitespace(yes);
+ self
+ }
+
+ /// This configures octal mode for the entire pattern.
+ ///
+ /// Octal syntax is a little-known way of uttering Unicode codepoints
+ /// in a pattern. For example, `a`, `\x61`, `\u0061` and `\141` are all
+ /// equivalent patterns, where the last example shows octal syntax.
+ ///
+ /// While supporting octal syntax isn't in and of itself a problem,
+ /// it does make good error messages harder. That is, in PCRE based
+ /// regex engines, syntax like `\1` invokes a backreference, which is
+ /// explicitly unsupported by this library. However, many users expect
+ /// backreferences to be supported. Therefore, when octal support
+ /// is disabled, the error message will explicitly mention that
+ /// backreferences aren't supported.
+ ///
+ /// The default for this is `false`.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::RegexBuilder;
+ ///
+ /// // Normally this pattern would not compile, with an error message
+ /// // about backreferences not being supported. But with octal mode
+ /// // enabled, octal escape sequences work.
+ /// let re = RegexBuilder::new(r"\141")
+ /// .octal(true)
+ /// .build()
+ /// .unwrap();
+ /// assert!(re.is_match("a"));
+ /// ```
+ pub fn octal(&mut self, yes: bool) -> &mut RegexBuilder {
+ self.builder.octal(yes);
+ self
+ }
+
+ /// Sets the approximate size limit, in bytes, of the compiled regex.
+ ///
+ /// This roughly corresponds to the amount of heap memory, in
+ /// bytes, occupied by a single regex. If the regex would otherwise
+ /// approximately exceed this limit, then compiling that regex will
+ /// fail.
+ ///
+ /// The main utility of a method like this is to avoid compiling
+ /// regexes that use an unexpected amount of resources, such as
+ /// time and memory. Even if the memory usage of a large regex is
+ /// acceptable, its search time may not be. Namely, worst case time
+ /// complexity for search is `O(m * n)`, where `m ~ len(pattern)` and
+ /// `n ~ len(haystack)`. That is, search time depends, in part, on the
+ /// size of the compiled regex. This means that putting a limit on the
+ /// size of the regex limits how much a regex can impact search time.
+ ///
+ /// For more information about regex size limits, see the section on
+ /// [untrusted inputs](crate#untrusted-input) in the top-level crate
+ /// documentation.
+ ///
+ /// The default for this is some reasonable number that permits most
+ /// patterns to compile successfully.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// # if !cfg!(target_pointer_width = "64") { return; } // see #1041
+ /// use regex::RegexBuilder;
+ ///
+ /// // It may surprise you how big some seemingly small patterns can
+ /// // be! Since \w is Unicode aware, this generates a regex that can
+ /// // match approximately 140,000 distinct codepoints.
+ /// assert!(RegexBuilder::new(r"\w").size_limit(45_000).build().is_err());
+ /// ```
+ pub fn size_limit(&mut self, bytes: usize) -> &mut RegexBuilder {
+ self.builder.size_limit(bytes);
+ self
+ }
+
+ /// Set the approximate capacity, in bytes, of the cache of transitions
+ /// used by the lazy DFA.
+ ///
+ /// While the lazy DFA isn't always used, it tends to be the most
+ /// commonly used regex engine in default configurations. It tends to
+ /// adopt the performance profile of a fully built DFA, but without the
+ /// downside of taking worst case exponential time to build.
+ ///
+ /// The downside is that it needs to keep a cache of transitions and
+ /// states that are built while running a search, and this cache
+ /// can fill up. When it fills up, the cache will reset itself. Any
+ /// previously generated states and transitions will then need to be
+ /// re-generated. If this happens too many times, then this library
+ /// will bail out of using the lazy DFA and switch to a different regex
+ /// engine.
+ ///
+ /// If your regex provokes this particular downside of the lazy DFA,
+ /// then it may be beneficial to increase its cache capacity. This will
+ /// potentially reduce the frequency of cache resetting (ideally to
+ /// `0`). While it won't fix all potential performance problems with
+ /// the lazy DFA, increasing the cache capacity does fix some.
+ ///
+ /// There is no easy way to determine, a priori, whether increasing
+ /// this cache capacity will help. In general, the larger your regex,
+ /// the more cache it's likely to use. But that isn't an ironclad rule.
+ /// For example, a regex like `[01]*1[01]{N}` would normally produce a
+ /// fully built DFA that is exponential in size with respect to `N`.
+ /// The lazy DFA will prevent exponential space blow-up, but its cache
+ /// is likely to fill up, even when it's large and even for smallish
+ /// values of `N`.
+ ///
+ /// If you aren't sure whether this helps or not, it is sensible to
+ /// set this to some arbitrarily large number in testing, such as
+ /// `usize::MAX`. Namely, this represents the amount of capacity that
+ /// *may* be used. It's probably not a good idea to use `usize::MAX` in
+ /// production though, since it implies there are no controls on heap
+ /// memory used by this library during a search. In effect, set it to
+ /// whatever you're willing to allocate for a single regex search.
+ pub fn dfa_size_limit(&mut self, bytes: usize) -> &mut RegexBuilder {
+ self.builder.dfa_size_limit(bytes);
+ self
+ }
+
+ /// Set the nesting limit for this parser.
+ ///
+ /// The nesting limit controls how deep the abstract syntax tree is
+ /// allowed to be. If the AST exceeds the given limit (e.g., with too
+ /// many nested groups), then an error is returned by the parser.
+ ///
+ /// The purpose of this limit is to act as a heuristic to prevent stack
+ /// overflow for consumers that do structural induction on an AST using
+ /// explicit recursion. While this crate never does this (instead using
+ /// constant stack space and moving the call stack to the heap), other
+ /// crates may.
+ ///
+ /// This limit is not checked until the entire AST is parsed.
+ /// Therefore, if callers want to put a limit on the amount of heap
+ /// space used, then they should impose a limit on the length, in
+ /// bytes, of the concrete pattern string. In particular, this is
+ /// viable since this parser implementation will limit itself to heap
+ /// space proportional to the length of the pattern string. See also
+ /// the [untrusted inputs](crate#untrusted-input) section in the
+ /// top-level crate documentation for more information about this.
+ ///
+ /// Note that a nest limit of `0` will return a nest limit error for
+ /// most patterns but not all. For example, a nest limit of `0` permits
+ /// `a` but not `ab`, since `ab` requires an explicit concatenation,
+ /// which results in a nest depth of `1`. In general, a nest limit is
+ /// not something that manifests in an obvious way in the concrete
+ /// syntax, therefore, it should not be used in a granular way.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::RegexBuilder;
+ ///
+ /// assert!(RegexBuilder::new(r"a").nest_limit(0).build().is_ok());
+ /// assert!(RegexBuilder::new(r"ab").nest_limit(0).build().is_err());
+ /// ```
+ pub fn nest_limit(&mut self, limit: u32) -> &mut RegexBuilder {
+ self.builder.nest_limit(limit); // forward the limit to the inner `Builder`
+ self // hand back this builder so setter calls can be chained
+ }
+ }
+
+ /// A configurable builder for a [`RegexSet`].
+ ///
+ /// This builder can be used to programmatically set flags such as
+ /// `i` (case insensitive) and `x` (for verbose mode). This builder
+ /// can also be used to configure things like the line terminator
+ /// and a size limit on the compiled regular expression.
+ #[derive(Clone, Debug)]
+ pub struct RegexSetBuilder {
+ builder: Builder, // every setter on this type forwards to this inner builder
+ }
+
+ impl RegexSetBuilder {
+ /// Create a new builder with a default configuration for the given
+ /// patterns.
+ ///
+ /// If the patterns are invalid or exceed the configured size limits,
+ /// then an error will be returned when [`RegexSetBuilder::build`] is
+ /// called.
+ pub fn new<I, S>(patterns: I) -> RegexSetBuilder
+ where
+ I: IntoIterator<Item = S>,
+ S: AsRef<str>,
+ {
+ RegexSetBuilder { builder: Builder::new(patterns) } // patterns are handed to the inner `Builder` as given
+ }
+
+ /// Compiles the patterns given to `RegexSetBuilder::new` with the
+ /// configuration set on this builder.
+ ///
+ /// If the patterns aren't valid regexes or if a configured size limit
+ /// was exceeded, then an error is returned.
+ pub fn build(&self) -> Result<RegexSet, Error> {
+ self.builder.build_many_string() // compilation is delegated to the inner `Builder`
+ }
+
+ /// This configures Unicode mode for all of the patterns.
+ ///
+ /// Enabling Unicode mode does a number of things:
+ ///
+ /// * Most fundamentally, it causes the fundamental atom of matching
+ /// to be a single codepoint. When Unicode mode is disabled, it's a
+ /// single byte. For example, when Unicode mode is enabled, `.` will
+ /// match `💩` once, whereas it will match 4 times when Unicode mode
+ /// is disabled. (Since the UTF-8 encoding of `💩` is 4 bytes long.)
+ /// * Case insensitive matching uses Unicode simple case folding rules.
+ /// * Unicode character classes like `\p{Letter}` and `\p{Greek}` are
+ /// available.
+ /// * Perl character classes are Unicode aware. That is, `\w`, `\s` and
+ /// `\d`.
+ /// * The word boundary assertions, `\b` and `\B`, use the Unicode
+ /// definition of a word character.
+ ///
+ /// Note that if Unicode mode is disabled, then the regex will fail to
+ /// compile if it could match invalid UTF-8. For example, when Unicode
+ /// mode is disabled, then since `.` matches any byte (except for
+ /// `\n`), then it can match invalid UTF-8 and thus building a regex
+ /// from it will fail. Another example is `\w` and `\W`. Since `\w` can
+ /// only match ASCII bytes when Unicode mode is disabled, it's allowed.
+ /// But `\W` can match more than ASCII bytes, including invalid UTF-8,
+ /// and so it is not allowed. This restriction can be lifted only by
+ /// using a [`bytes::RegexSet`](crate::bytes::RegexSet).
+ ///
+ /// For more details on the Unicode support in this crate, see the
+ /// [Unicode section](crate#unicode) in this crate's top-level
+ /// documentation.
+ ///
+ /// The default for this is `true`.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::RegexSetBuilder;
+ ///
+ /// let re = RegexSetBuilder::new([r"\w"])
+ /// .unicode(false)
+ /// .build()
+ /// .unwrap();
+ /// // Normally greek letters would be included in \w, but since
+ /// // Unicode mode is disabled, it only matches ASCII letters.
+ /// assert!(!re.is_match("δ"));
+ ///
+ /// let re = RegexSetBuilder::new([r"s"])
+ /// .case_insensitive(true)
+ /// .unicode(false)
+ /// .build()
+ /// .unwrap();
+ /// // Normally 'ſ' is included when searching for 's' case
+ /// // insensitively due to Unicode's simple case folding rules. But
+ /// // when Unicode mode is disabled, only ASCII case insensitive rules
+ /// // are used.
+ /// assert!(!re.is_match("ſ"));
+ /// ```
+ pub fn unicode(&mut self, yes: bool) -> &mut RegexSetBuilder {
+ self.builder.unicode(yes);
+ self
+ }
+
+ /// This configures whether to enable case insensitive matching for all
+ /// of the patterns.
+ ///
+ /// This setting can also be configured using the inline flag `i`
+ /// in the pattern. For example, `(?i:foo)` matches `foo` case
+ /// insensitively while `(?-i:foo)` matches `foo` case sensitively.
+ ///
+ /// The default for this is `false`.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::RegexSetBuilder;
+ ///
+ /// let re = RegexSetBuilder::new([r"foo(?-i:bar)quux"])
+ /// .case_insensitive(true)
+ /// .build()
+ /// .unwrap();
+ /// assert!(re.is_match("FoObarQuUx"));
+ /// // Even though case insensitive matching is enabled in the builder,
+ /// // it can be locally disabled within the pattern. In this case,
+ /// // `bar` is matched case sensitively.
+ /// assert!(!re.is_match("fooBARquux"));
+ /// ```
+ pub fn case_insensitive(&mut self, yes: bool) -> &mut RegexSetBuilder {
+ self.builder.case_insensitive(yes); // forward the flag to the inner `Builder`
+ self // hand back this builder so setter calls can be chained
+ }
+
+ /// This configures multi-line mode for all of the patterns.
+ ///
+ /// Enabling multi-line mode changes the behavior of the `^` and `$`
+ /// anchor assertions. Instead of only matching at the beginning and
+ /// end of a haystack, respectively, multi-line mode causes them to
+ /// match at the beginning and end of a line *in addition* to the
+ /// beginning and end of a haystack. More precisely, `^` will match at
+ /// the position immediately following a `\n` and `$` will match at the
+ /// position immediately preceding a `\n`.
+ ///
+ /// The behavior of this option can be impacted by other settings too:
+ ///
+ /// * The [`RegexSetBuilder::line_terminator`] option changes `\n`
+ /// above to any ASCII byte.
+ /// * The [`RegexSetBuilder::crlf`] option changes the line terminator
+ /// to be either `\r` or `\n`, but never at the position between a `\r`
+ /// and `\n`.
+ ///
+ /// This setting can also be configured using the inline flag `m` in
+ /// the pattern.
+ ///
+ /// The default for this is `false`.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::RegexSetBuilder;
+ ///
+ /// let re = RegexSetBuilder::new([r"^foo$"])
+ /// .multi_line(true)
+ /// .build()
+ /// .unwrap();
+ /// assert!(re.is_match("\nfoo\n"));
+ /// ```
+ pub fn multi_line(&mut self, yes: bool) -> &mut RegexSetBuilder {
+ self.builder.multi_line(yes); // forward the flag to the inner `Builder`
+ self // hand back this builder so setter calls can be chained
+ }
+
+ /// This configures dot-matches-new-line mode for all of the patterns.
+ ///
+ /// Perhaps surprisingly, the default behavior for `.` is not to match
+ /// any character, but rather, to match any character except for the
+ /// line terminator (which is `\n` by default). When this mode is
+ /// enabled, the behavior changes such that `.` truly matches any
+ /// character.
+ ///
+ /// This setting can also be configured using the inline flag `s` in
+ /// the pattern. For example, `(?s:.)` and `\p{any}` are equivalent
+ /// regexes.
+ ///
+ /// The default for this is `false`.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::RegexSetBuilder;
+ ///
+ /// let re = RegexSetBuilder::new([r"foo.bar"])
+ /// .dot_matches_new_line(true)
+ /// .build()
+ /// .unwrap();
+ /// let hay = "foo\nbar";
+ /// assert!(re.is_match(hay));
+ /// ```
+ pub fn dot_matches_new_line(
+ &mut self,
+ yes: bool,
+ ) -> &mut RegexSetBuilder {
+ self.builder.dot_matches_new_line(yes);
+ self
+ }
+
+ /// This configures CRLF mode for all of the patterns.
+ ///
+ /// When CRLF mode is enabled, both `\r` ("carriage return" or CR for
+ /// short) and `\n` ("line feed" or LF for short) are treated as line
+ /// terminators. This results in the following:
+ ///
+ /// * Unless dot-matches-new-line mode is enabled, `.` will now match
+ /// any character except for `\n` and `\r`.
+ /// * When multi-line mode is enabled, `^` will match immediately
+ /// following a `\n` or a `\r`. Similarly, `$` will match immediately
+ /// preceding a `\n` or a `\r`. Neither `^` nor `$` will ever match
+ /// between `\r` and `\n`.
+ ///
+ /// This setting can also be configured using the inline flag `R` in
+ /// the pattern.
+ ///
+ /// The default for this is `false`.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::RegexSetBuilder;
+ ///
+ /// let re = RegexSetBuilder::new([r"^foo$"])
+ /// .multi_line(true)
+ /// .crlf(true)
+ /// .build()
+ /// .unwrap();
+ /// let hay = "\r\nfoo\r\n";
+ /// // If CRLF mode weren't enabled here, then '$' wouldn't match
+ /// // immediately after 'foo', and thus no match would be found.
+ /// assert!(re.is_match(hay));
+ /// ```
+ ///
+ /// This example demonstrates that `^` will never match at a position
+ /// between `\r` and `\n`. (`$` will similarly not match between a `\r`
+ /// and a `\n`.)
+ ///
+ /// ```
+ /// use regex::RegexSetBuilder;
+ ///
+ /// let re = RegexSetBuilder::new([r"^\n"])
+ /// .multi_line(true)
+ /// .crlf(true)
+ /// .build()
+ /// .unwrap();
+ /// assert!(!re.is_match("\r\n"));
+ /// ```
+ pub fn crlf(&mut self, yes: bool) -> &mut RegexSetBuilder {
+ self.builder.crlf(yes); // forward the flag to the inner `Builder`
+ self // hand back this builder so setter calls can be chained
+ }
+
+ /// Configures the line terminator to be used by the regex.
+ ///
+ /// The line terminator is relevant in two ways for a particular regex:
+ ///
+ /// * When dot-matches-new-line mode is *not* enabled (the default),
+ /// then `.` will match any character except for the configured line
+ /// terminator.
+ /// * When multi-line mode is enabled (not the default), then `^` and
+ /// `$` will match immediately after and before, respectively, a line
+ /// terminator.
+ ///
+ /// In both cases, if CRLF mode is enabled in a particular context,
+ /// then it takes precedence over any configured line terminator.
+ ///
+ /// This option cannot be configured from within the pattern.
+ ///
+ /// The default line terminator is `\n`.
+ ///
+ /// # Example
+ ///
+ /// This shows how to treat the NUL byte as a line terminator. This can
+ /// be a useful heuristic when searching binary data.
+ ///
+ /// ```
+ /// use regex::RegexSetBuilder;
+ ///
+ /// let re = RegexSetBuilder::new([r"^foo$"])
+ /// .multi_line(true)
+ /// .line_terminator(b'\x00')
+ /// .build()
+ /// .unwrap();
+ /// let hay = "\x00foo\x00";
+ /// assert!(re.is_match(hay));
+ /// ```
+ ///
+ /// This example shows that the behavior of `.` is impacted by this
+ /// setting as well:
+ ///
+ /// ```
+ /// use regex::RegexSetBuilder;
+ ///
+ /// let re = RegexSetBuilder::new([r"."])
+ /// .line_terminator(b'\x00')
+ /// .build()
+ /// .unwrap();
+ /// assert!(re.is_match("\n"));
+ /// assert!(!re.is_match("\x00"));
+ /// ```
+ ///
+ /// This shows that building a regex will fail if the byte given
+ /// is not ASCII and the pattern could result in matching invalid
+ /// UTF-8. This is because any singular non-ASCII byte is not valid
+ /// UTF-8, and it is not permitted for a [`RegexSet`] to match invalid
+ /// UTF-8. (It is permissible to use a non-ASCII byte when building a
+ /// [`bytes::RegexSet`](crate::bytes::RegexSet).)
+ ///
+ /// ```
+ /// use regex::RegexSetBuilder;
+ ///
+ /// assert!(
+ /// RegexSetBuilder::new([r"."])
+ /// .line_terminator(0x80)
+ /// .build()
+ /// .is_err()
+ /// );
+ /// // Note that using a non-ASCII byte isn't enough on its own to
+ /// // cause regex compilation to fail. You actually have to make use
+ /// // of it in the regex in a way that leads to matching invalid
+ /// // UTF-8. If you don't, then regex compilation will succeed!
+ /// assert!(
+ /// RegexSetBuilder::new([r"a"])
+ /// .line_terminator(0x80)
+ /// .build()
+ /// .is_ok()
+ /// );
+ /// ```
+ pub fn line_terminator(&mut self, byte: u8) -> &mut RegexSetBuilder {
+ self.builder.line_terminator(byte); // forward the byte to the inner `Builder`
+ self // hand back this builder so setter calls can be chained
+ }
+
+ /// This configures swap-greed mode for all of the patterns.
+ ///
+ /// When swap-greed mode is enabled, patterns like `a+` will become
+ /// non-greedy and patterns like `a+?` will become greedy. In other
+ /// words, the meanings of `a+` and `a+?` are switched.
+ ///
+ /// This setting can also be configured using the inline flag `U` in
+ /// the pattern.
+ ///
+ /// Note that this is generally not useful for a `RegexSet` since a
+ /// `RegexSet` can only report whether a pattern matches or not. Since
+ /// greediness never impacts whether a match is found or not (only the
+ /// offsets of the match), it follows that whether parts of a pattern
+ /// are greedy or not doesn't matter for a `RegexSet`.
+ ///
+ /// The default for this is `false`.
+ pub fn swap_greed(&mut self, yes: bool) -> &mut RegexSetBuilder {
+ self.builder.swap_greed(yes); // forward the flag to the inner `Builder`
+ self // hand back this builder so setter calls can be chained
+ }
+
+ /// This configures verbose mode for all of the patterns.
+ ///
+ /// When enabled, whitespace will be treated as insignificant in the
+ /// pattern and `#` can be used to start a comment until the next new
+ /// line.
+ ///
+ /// Normally, in most places in a pattern, whitespace is treated
+ /// literally. For example ` +` will match one or more ASCII whitespace
+ /// characters.
+ ///
+ /// When verbose mode is enabled, `\#` can be used to match a literal
+ /// `#` and `\ ` can be used to match a literal ASCII whitespace
+ /// character.
+ ///
+ /// Verbose mode is useful for permitting regexes to be formatted and
+ /// broken up more nicely. This may make them more easily readable.
+ ///
+ /// This setting can also be configured using the inline flag `x` in
+ /// the pattern.
+ ///
+ /// The default for this is `false`.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::RegexSetBuilder;
+ ///
+ /// let pat = r"
+ /// \b
+ /// (?<first>\p{Uppercase}\w*) # always start with uppercase letter
+ /// [\s--\n]+ # whitespace should separate names
+ /// (?: # middle name can be an initial!
+ /// (?:(?<initial>\p{Uppercase})\.|(?<middle>\p{Uppercase}\w*))
+ /// [\s--\n]+
+ /// )?
+ /// (?<last>\p{Uppercase}\w*)
+ /// \b
+ /// ";
+ /// let re = RegexSetBuilder::new([pat])
+ /// .ignore_whitespace(true)
+ /// .build()
+ /// .unwrap();
+ /// assert!(re.is_match("Harry Potter"));
+ /// assert!(re.is_match("Harry J. Potter"));
+ /// assert!(re.is_match("Harry James Potter"));
+ /// assert!(!re.is_match("harry J. Potter"));
+ /// ```
+ pub fn ignore_whitespace(
+ &mut self,
+ yes: bool,
+ ) -> &mut RegexSetBuilder {
+ self.builder.ignore_whitespace(yes);
+ self
+ }
+
+ /// This configures octal mode for all of the patterns.
+ ///
+ /// Octal syntax is a little-known way of uttering Unicode codepoints
+ /// in a pattern. For example, `a`, `\x61`, `\u0061` and `\141` are all
+ /// equivalent patterns, where the last example shows octal syntax.
+ ///
+ /// While supporting octal syntax isn't in and of itself a problem,
+ /// it does make good error messages harder. That is, in PCRE based
+ /// regex engines, syntax like `\1` invokes a backreference, which is
+ /// explicitly unsupported by this library. However, many users expect
+ /// backreferences to be supported. Therefore, when octal support
+ /// is disabled, the error message will explicitly mention that
+ /// backreferences aren't supported.
+ ///
+ /// The default for this is `false`.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::RegexSetBuilder;
+ ///
+ /// // Normally this pattern would not compile, with an error message
+ /// // about backreferences not being supported. But with octal mode
+ /// // enabled, octal escape sequences work.
+ /// let re = RegexSetBuilder::new([r"\141"])
+ /// .octal(true)
+ /// .build()
+ /// .unwrap();
+ /// assert!(re.is_match("a"));
+ /// ```
+ pub fn octal(&mut self, yes: bool) -> &mut RegexSetBuilder {
+ self.builder.octal(yes);
+ self
+ }
+
+ /// Sets the approximate size limit, in bytes, of the compiled regex.
+ ///
+ /// This roughly corresponds to the amount of heap memory, in
+ /// bytes, occupied by a single regex. If the regex would otherwise
+ /// approximately exceed this limit, then compiling that regex will
+ /// fail.
+ ///
+ /// The main utility of a method like this is to avoid compiling
+ /// regexes that use an unexpected amount of resources, such as
+ /// time and memory. Even if the memory usage of a large regex is
+ /// acceptable, its search time may not be. Namely, worst case time
+ /// complexity for search is `O(m * n)`, where `m ~ len(pattern)` and
+ /// `n ~ len(haystack)`. That is, search time depends, in part, on the
+ /// size of the compiled regex. This means that putting a limit on the
+ /// size of the regex limits how much a regex can impact search time.
+ ///
+ /// For more information about regex size limits, see the section on
+ /// [untrusted inputs](crate#untrusted-input) in the top-level crate
+ /// documentation.
+ ///
+ /// The default for this is some reasonable number that permits most
+ /// patterns to compile successfully.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// # if !cfg!(target_pointer_width = "64") { return; } // see #1041
+ /// use regex::RegexSetBuilder;
+ ///
+ /// // It may surprise you how big some seemingly small patterns can
+ /// // be! Since \w is Unicode aware, this generates a regex that can
+ /// // match approximately 140,000 distinct codepoints.
+ /// assert!(
+ /// RegexSetBuilder::new([r"\w"])
+ /// .size_limit(45_000)
+ /// .build()
+ /// .is_err()
+ /// );
+ /// ```
+ pub fn size_limit(&mut self, bytes: usize) -> &mut RegexSetBuilder {
+ self.builder.size_limit(bytes);
+ self
+ }
+
+ /// Set the approximate capacity, in bytes, of the cache of transitions
+ /// used by the lazy DFA.
+ ///
+ /// While the lazy DFA isn't always used, it tends to be the most
+ /// commonly used regex engine in default configurations. It tends to
+ /// adopt the performance profile of a fully built DFA, but without the
+ /// downside of taking worst case exponential time to build.
+ ///
+ /// The downside is that it needs to keep a cache of transitions and
+ /// states that are built while running a search, and this cache
+ /// can fill up. When it fills up, the cache will reset itself. Any
+ /// previously generated states and transitions will then need to be
+ /// re-generated. If this happens too many times, then this library
+ /// will bail out of using the lazy DFA and switch to a different regex
+ /// engine.
+ ///
+ /// If your regex provokes this particular downside of the lazy DFA,
+ /// then it may be beneficial to increase its cache capacity. This will
+ /// potentially reduce the frequency of cache resetting (ideally to
+ /// `0`). While it won't fix all potential performance problems with
+ /// the lazy DFA, increasing the cache capacity does fix some.
+ ///
+ /// There is no easy way to determine, a priori, whether increasing
+ /// this cache capacity will help. In general, the larger your regex,
+ /// the more cache it's likely to use. But that isn't an ironclad rule.
+ /// For example, a regex like `[01]*1[01]{N}` would normally produce a
+ /// fully built DFA that is exponential in size with respect to `N`.
+ /// The lazy DFA will prevent exponential space blow-up, but its cache
+ /// is likely to fill up, even when it's large and even for smallish
+ /// values of `N`.
+ ///
+ /// If you aren't sure whether this helps or not, it is sensible to
+ /// set this to some arbitrarily large number in testing, such as
+ /// `usize::MAX`. Namely, this represents the amount of capacity that
+ /// *may* be used. It's probably not a good idea to use `usize::MAX` in
+ /// production though, since it implies there are no controls on heap
+ /// memory used by this library during a search. In effect, set it to
+ /// whatever you're willing to allocate for a single regex search.
+ pub fn dfa_size_limit(
+ &mut self,
+ bytes: usize,
+ ) -> &mut RegexSetBuilder {
+ self.builder.dfa_size_limit(bytes);
+ self
+ }
+
+ /// Set the nesting limit for this parser.
+ ///
+ /// The nesting limit controls how deep the abstract syntax tree is
+ /// allowed to be. If the AST exceeds the given limit (e.g., with too
+ /// many nested groups), then an error is returned by the parser.
+ ///
+ /// The purpose of this limit is to act as a heuristic to prevent stack
+ /// overflow for consumers that do structural induction on an AST using
+ /// explicit recursion. While this crate never does this (instead using
+ /// constant stack space and moving the call stack to the heap), other
+ /// crates may.
+ ///
+ /// This limit is not checked until the entire AST is parsed.
+ /// Therefore, if callers want to put a limit on the amount of heap
+ /// space used, then they should impose a limit on the length, in
+ /// bytes, of the concrete pattern string. In particular, this is
+ /// viable since this parser implementation will limit itself to heap
+ /// space proportional to the length of the pattern string. See also
+ /// the [untrusted inputs](crate#untrusted-input) section in the
+ /// top-level crate documentation for more information about this.
+ ///
+ /// Note that a nest limit of `0` will return a nest limit error for
+ /// most patterns but not all. For example, a nest limit of `0` permits
+ /// `a` but not `ab`, since `ab` requires an explicit concatenation,
+ /// which results in a nest depth of `1`. In general, a nest limit is
+ /// not something that manifests in an obvious way in the concrete
+ /// syntax, therefore, it should not be used in a granular way.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::RegexSetBuilder;
+ ///
+ /// assert!(RegexSetBuilder::new([r"a"]).nest_limit(0).build().is_ok());
+ /// assert!(RegexSetBuilder::new([r"ab"]).nest_limit(0).build().is_err());
+ /// ```
+ pub fn nest_limit(&mut self, limit: u32) -> &mut RegexSetBuilder {
+ self.builder.nest_limit(limit); // forward the limit to the inner `Builder`
+ self // hand back this builder so setter calls can be chained
+ }
+ }
+}
+
+pub(crate) mod bytes {
+ use crate::{
+ bytes::{Regex, RegexSet},
+ error::Error,
+ };
+
+ use super::Builder;
+
+ /// A configurable builder for a [`Regex`].
+ ///
+ /// This builder can be used to programmatically set flags such as `i`
+ /// (case insensitive) and `x` (for verbose mode). This builder can also be
+ /// used to configure things like the line terminator and a size limit on
+ /// the compiled regular expression.
+ #[derive(Clone, Debug)]
+ pub struct RegexBuilder {
+ builder: Builder, // `super::Builder`; the methods on this type forward to it
+ }
+
+ impl RegexBuilder {
+ /// Create a new builder with a default configuration for the given
+ /// pattern.
+ ///
+ /// If the pattern is invalid or exceeds the configured size limits,
+ /// then an error will be returned when [`RegexBuilder::build`] is
+ /// called.
+ pub fn new(pattern: &str) -> RegexBuilder {
+ RegexBuilder { builder: Builder::new([pattern]) } // the single pattern is passed as a one-element array
+ }
+
+ /// Compiles the pattern given to `RegexBuilder::new` with the
+ /// configuration set on this builder.
+ ///
+ /// If the pattern isn't a valid regex or if a configured size limit
+ /// was exceeded, then an error is returned.
+ pub fn build(&self) -> Result<Regex, Error> {
+ self.builder.build_one_bytes() // compilation is delegated to the inner `Builder`
+ }
+
+ /// This configures Unicode mode for the entire pattern.
+ ///
+ /// Enabling Unicode mode does a number of things:
+ ///
+ /// * Most fundamentally, it causes the fundamental atom of matching
+ /// to be a single codepoint. When Unicode mode is disabled, it's a
+ /// single byte. For example, when Unicode mode is enabled, `.` will
+ /// match `💩` once, whereas it will match 4 times when Unicode mode
+ /// is disabled. (Since the UTF-8 encoding of `💩` is 4 bytes long.)
+ /// * Case insensitive matching uses Unicode simple case folding rules.
+ /// * Unicode character classes like `\p{Letter}` and `\p{Greek}` are
+ /// available.
+ /// * Perl character classes are Unicode aware. That is, `\w`, `\s` and
+ /// `\d`.
+ /// * The word boundary assertions, `\b` and `\B`, use the Unicode
+ /// definition of a word character.
+ ///
+ /// Note that unlike the top-level `Regex` for searching `&str`, it
+ /// is permitted to disable Unicode mode even if the resulting pattern
+ /// could match invalid UTF-8. For example, `(?-u:.)` is not a valid
+ /// pattern for a top-level `Regex`, but is valid for a `bytes::Regex`.
+ ///
+ /// For more details on the Unicode support in this crate, see the
+ /// [Unicode section](crate#unicode) in this crate's top-level
+ /// documentation.
+ ///
+ /// The default for this is `true`.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::bytes::RegexBuilder;
+ ///
+ /// let re = RegexBuilder::new(r"\w")
+ /// .unicode(false)
+ /// .build()
+ /// .unwrap();
+ /// // Normally greek letters would be included in \w, but since
+ /// // Unicode mode is disabled, it only matches ASCII letters.
+ /// assert!(!re.is_match("δ".as_bytes()));
+ ///
+ /// let re = RegexBuilder::new(r"s")
+ /// .case_insensitive(true)
+ /// .unicode(false)
+ /// .build()
+ /// .unwrap();
+ /// // Normally 'ſ' is included when searching for 's' case
+ /// // insensitively due to Unicode's simple case folding rules. But
+ /// // when Unicode mode is disabled, only ASCII case insensitive rules
+ /// // are used.
+ /// assert!(!re.is_match("ſ".as_bytes()));
+ /// ```
+ ///
+ /// Since this builder is for constructing a [`bytes::Regex`](Regex),
+ /// one can disable Unicode mode even if it would match invalid UTF-8:
+ ///
+ /// ```
+ /// use regex::bytes::RegexBuilder;
+ ///
+ /// let re = RegexBuilder::new(r".")
+ /// .unicode(false)
+ /// .build()
+ /// .unwrap();
+ /// // With Unicode mode disabled, `.` matches a single byte, so it
+ /// // matches `\xFF` even though that byte is not valid UTF-8.
+ /// assert!(re.is_match(b"\xFF"));
+ /// ```
+ pub fn unicode(&mut self, yes: bool) -> &mut RegexBuilder {
+ self.builder.unicode(yes);
+ self
+ }
+
+ /// This configures whether to enable case insensitive matching for the
+ /// entire pattern.
+ ///
+ /// This setting can also be configured using the inline flag `i`
+ /// in the pattern. For example, `(?i:foo)` matches `foo` case
+ /// insensitively while `(?-i:foo)` matches `foo` case sensitively.
+ ///
+ /// The default for this is `false`.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::bytes::RegexBuilder;
+ ///
+ /// let re = RegexBuilder::new(r"foo(?-i:bar)quux")
+ /// .case_insensitive(true)
+ /// .build()
+ /// .unwrap();
+ /// assert!(re.is_match(b"FoObarQuUx"));
+ /// // Even though case insensitive matching is enabled in the builder,
+ /// // it can be locally disabled within the pattern. In this case,
+ /// // `bar` is matched case sensitively.
+ /// assert!(!re.is_match(b"fooBARquux"));
+ /// ```
+ pub fn case_insensitive(&mut self, yes: bool) -> &mut RegexBuilder {
+ self.builder.case_insensitive(yes);
+ self
+ }
+
+ /// This configures multi-line mode for the entire pattern.
+ ///
+ /// Enabling multi-line mode changes the behavior of the `^` and `$`
+ /// anchor assertions. Instead of only matching at the beginning and
+ /// end of a haystack, respectively, multi-line mode causes them to
+ /// match at the beginning and end of a line *in addition* to the
+ /// beginning and end of a haystack. More precisely, `^` will match at
+ /// the position immediately following a `\n` and `$` will match at the
+ /// position immediately preceding a `\n`.
+ ///
+ /// The behavior of this option can be impacted by other settings too:
+ ///
+ /// * The [`RegexBuilder::line_terminator`] option changes `\n` above
+ /// to any ASCII byte.
+ /// * The [`RegexBuilder::crlf`] option changes the line terminator to
+ /// be either `\r` or `\n`, but never at the position between a `\r`
+ /// and `\n`.
+ ///
+ /// This setting can also be configured using the inline flag `m` in
+ /// the pattern.
+ ///
+ /// The default for this is `false`.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::bytes::RegexBuilder;
+ ///
+ /// let re = RegexBuilder::new(r"^foo$")
+ /// .multi_line(true)
+ /// .build()
+ /// .unwrap();
+ /// assert_eq!(Some(1..4), re.find(b"\nfoo\n").map(|m| m.range()));
+ /// ```
+ pub fn multi_line(&mut self, yes: bool) -> &mut RegexBuilder {
+ self.builder.multi_line(yes);
+ self
+ }
+
+ /// This configures dot-matches-new-line mode for the entire pattern.
+ ///
+ /// Perhaps surprisingly, the default behavior for `.` is not to match
+ /// any character, but rather, to match any character except for the
+ /// line terminator (which is `\n` by default). When this mode is
+ /// enabled, the behavior changes such that `.` truly matches any
+ /// character.
+ ///
+ /// This setting can also be configured using the inline flag `s` in
+ /// the pattern. For example, `(?s:.)` and `\p{any}` are equivalent
+ /// regexes.
+ ///
+ /// The default for this is `false`.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::bytes::RegexBuilder;
+ ///
+ /// let re = RegexBuilder::new(r"foo.bar")
+ /// .dot_matches_new_line(true)
+ /// .build()
+ /// .unwrap();
+ /// let hay = b"foo\nbar";
+ /// assert_eq!(Some(&b"foo\nbar"[..]), re.find(hay).map(|m| m.as_bytes()));
+ /// ```
+ pub fn dot_matches_new_line(
+ &mut self,
+ yes: bool,
+ ) -> &mut RegexBuilder {
+ self.builder.dot_matches_new_line(yes);
+ self
+ }
+
+ /// This configures CRLF mode for the entire pattern.
+ ///
+ /// When CRLF mode is enabled, both `\r` ("carriage return" or CR for
+ /// short) and `\n` ("line feed" or LF for short) are treated as line
+ /// terminators. This results in the following:
+ ///
+ /// * Unless dot-matches-new-line mode is enabled, `.` will now match
+ /// any character except for `\n` and `\r`.
+ /// * When multi-line mode is enabled, `^` will match immediately
+ /// following a `\n` or a `\r`. Similarly, `$` will match immediately
+ /// preceding a `\n` or a `\r`. Neither `^` nor `$` will ever match
+ /// between `\r` and `\n`.
+ ///
+ /// This setting can also be configured using the inline flag `R` in
+ /// the pattern.
+ ///
+ /// The default for this is `false`.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::bytes::RegexBuilder;
+ ///
+ /// let re = RegexBuilder::new(r"^foo$")
+ /// .multi_line(true)
+ /// .crlf(true)
+ /// .build()
+ /// .unwrap();
+ /// let hay = b"\r\nfoo\r\n";
+ /// // If CRLF mode weren't enabled here, then '$' wouldn't match
+ /// // immediately after 'foo', and thus no match would be found.
+ /// assert_eq!(Some(&b"foo"[..]), re.find(hay).map(|m| m.as_bytes()));
+ /// ```
+ ///
+ /// This example demonstrates that `^` will never match at a position
+ /// between `\r` and `\n`. (`$` will similarly not match between a `\r`
+ /// and a `\n`.)
+ ///
+ /// ```
+ /// use regex::bytes::RegexBuilder;
+ ///
+ /// let re = RegexBuilder::new(r"^")
+ /// .multi_line(true)
+ /// .crlf(true)
+ /// .build()
+ /// .unwrap();
+ /// let hay = b"\r\n\r\n";
+ /// let ranges: Vec<_> = re.find_iter(hay).map(|m| m.range()).collect();
+ /// assert_eq!(ranges, vec![0..0, 2..2, 4..4]);
+ /// ```
+ pub fn crlf(&mut self, yes: bool) -> &mut RegexBuilder {
+ self.builder.crlf(yes);
+ self
+ }
+
+ /// Configures the line terminator to be used by the regex.
+ ///
+ /// The line terminator is relevant in two ways for a particular regex:
+ ///
+ /// * When dot-matches-new-line mode is *not* enabled (the default),
+ /// then `.` will match any character except for the configured line
+ /// terminator.
+ /// * When multi-line mode is enabled (not the default), then `^` and
+ /// `$` will match immediately after and before, respectively, a line
+ /// terminator.
+ ///
+ /// In both cases, if CRLF mode is enabled in a particular context,
+ /// then it takes precedence over any configured line terminator.
+ ///
+ /// This option cannot be configured from within the pattern.
+ ///
+ /// The default line terminator is `\n`.
+ ///
+ /// # Example
+ ///
+ /// This shows how to treat the NUL byte as a line terminator. This can
+ /// be a useful heuristic when searching binary data.
+ ///
+ /// ```
+ /// use regex::bytes::RegexBuilder;
+ ///
+ /// let re = RegexBuilder::new(r"^foo$")
+ /// .multi_line(true)
+ /// .line_terminator(b'\x00')
+ /// .build()
+ /// .unwrap();
+ /// let hay = b"\x00foo\x00";
+ /// assert_eq!(Some(1..4), re.find(hay).map(|m| m.range()));
+ /// ```
+ ///
+ /// This example shows that the behavior of `.` is impacted by this
+ /// setting as well:
+ ///
+ /// ```
+ /// use regex::bytes::RegexBuilder;
+ ///
+ /// let re = RegexBuilder::new(r".")
+ /// .line_terminator(b'\x00')
+ /// .build()
+ /// .unwrap();
+ /// assert!(re.is_match(b"\n"));
+ /// assert!(!re.is_match(b"\x00"));
+ /// ```
+ ///
+ /// This shows that building a regex will work even when the byte
+ /// given is not ASCII. This is unlike the top-level `Regex` API where
+ /// matching invalid UTF-8 is not allowed.
+ ///
+ /// Note though that you must disable Unicode mode. This is required
+ /// because Unicode mode requires matching one codepoint at a time,
+ /// and there is no way to match a non-ASCII byte as if it were a
+ /// codepoint.
+ ///
+ /// ```
+ /// use regex::bytes::RegexBuilder;
+ ///
+ /// assert!(
+ /// RegexBuilder::new(r".")
+ /// .unicode(false)
+ /// .line_terminator(0x80)
+ /// .build()
+ /// .is_ok(),
+ /// );
+ /// ```
+ pub fn line_terminator(&mut self, byte: u8) -> &mut RegexBuilder {
+ self.builder.line_terminator(byte);
+ self
+ }
+
+ /// This configures swap-greed mode for the entire pattern.
+ ///
+ /// When swap-greed mode is enabled, patterns like `a+` will become
+ /// non-greedy and patterns like `a+?` will become greedy. In other
+ /// words, the meanings of `a+` and `a+?` are switched.
+ ///
+ /// This setting can also be configured using the inline flag `U` in
+ /// the pattern.
+ ///
+ /// The default for this is `false`.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::bytes::RegexBuilder;
+ ///
+ /// let re = RegexBuilder::new(r"a+")
+ /// .swap_greed(true)
+ /// .build()
+ /// .unwrap();
+ /// assert_eq!(Some(&b"a"[..]), re.find(b"aaa").map(|m| m.as_bytes()));
+ /// ```
+ pub fn swap_greed(&mut self, yes: bool) -> &mut RegexBuilder {
+ self.builder.swap_greed(yes);
+ self
+ }
+
+ /// This configures verbose mode for the entire pattern.
+ ///
+ /// When enabled, whitespace will be treated as insignificant in the
+ /// pattern and `#` can be used to start a comment until the next new
+ /// line.
+ ///
+ /// Normally, in most places in a pattern, whitespace is treated
+ /// literally. For example ` +` will match one or more ASCII whitespace
+ /// characters.
+ ///
+ /// When verbose mode is enabled, `\#` can be used to match a literal
+ /// `#` and `\ ` can be used to match a literal ASCII whitespace
+ /// character.
+ ///
+ /// Verbose mode is useful for permitting regexes to be formatted and
+ /// broken up more nicely. This may make them more easily readable.
+ ///
+ /// This setting can also be configured using the inline flag `x` in
+ /// the pattern.
+ ///
+ /// The default for this is `false`.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::bytes::RegexBuilder;
+ ///
+ /// let pat = r"
+ /// \b
+ /// (?<first>\p{Uppercase}\w*) # always start with uppercase letter
+ /// [\s--\n]+ # whitespace should separate names
+ /// (?: # middle name can be an initial!
+ /// (?:(?<initial>\p{Uppercase})\.|(?<middle>\p{Uppercase}\w*))
+ /// [\s--\n]+
+ /// )?
+ /// (?<last>\p{Uppercase}\w*)
+ /// \b
+ /// ";
+ /// let re = RegexBuilder::new(pat)
+ /// .ignore_whitespace(true)
+ /// .build()
+ /// .unwrap();
+ ///
+ /// let caps = re.captures(b"Harry Potter").unwrap();
+ /// assert_eq!(&b"Harry"[..], &caps["first"]);
+ /// assert_eq!(&b"Potter"[..], &caps["last"]);
+ ///
+ /// let caps = re.captures(b"Harry J. Potter").unwrap();
+ /// assert_eq!(&b"Harry"[..], &caps["first"]);
+ /// // Since a middle name/initial isn't required for an overall match,
+ /// // we can't assume that 'initial' or 'middle' will be populated!
+ /// assert_eq!(
+ /// Some(&b"J"[..]),
+ /// caps.name("initial").map(|m| m.as_bytes()),
+ /// );
+ /// assert_eq!(None, caps.name("middle").map(|m| m.as_bytes()));
+ /// assert_eq!(&b"Potter"[..], &caps["last"]);
+ ///
+ /// let caps = re.captures(b"Harry James Potter").unwrap();
+ /// assert_eq!(&b"Harry"[..], &caps["first"]);
+ /// // Since a middle name/initial isn't required for an overall match,
+ /// // we can't assume that 'initial' or 'middle' will be populated!
+ /// assert_eq!(None, caps.name("initial").map(|m| m.as_bytes()));
+ /// assert_eq!(
+ /// Some(&b"James"[..]),
+ /// caps.name("middle").map(|m| m.as_bytes()),
+ /// );
+ /// assert_eq!(&b"Potter"[..], &caps["last"]);
+ /// ```
+ pub fn ignore_whitespace(&mut self, yes: bool) -> &mut RegexBuilder {
+ self.builder.ignore_whitespace(yes);
+ self
+ }
+
+ /// This configures octal mode for the entire pattern.
+ ///
+ /// Octal syntax is a little-known way of uttering Unicode codepoints
+ /// in a pattern. For example, `a`, `\x61`, `\u0061` and `\141` are all
+ /// equivalent patterns, where the last example shows octal syntax.
+ ///
+ /// While supporting octal syntax isn't in and of itself a problem,
+ /// it does make good error messages harder. That is, in PCRE based
+ /// regex engines, syntax like `\1` invokes a backreference, which is
+ /// explicitly unsupported in this library. However, many users expect
+ /// backreferences to be supported. Therefore, when octal support
+ /// is disabled, the error message will explicitly mention that
+ /// backreferences aren't supported.
+ ///
+ /// The default for this is `false`.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::bytes::RegexBuilder;
+ ///
+ /// // Normally this pattern would not compile, with an error message
+ /// // about backreferences not being supported. But with octal mode
+ /// // enabled, octal escape sequences work.
+ /// let re = RegexBuilder::new(r"\141")
+ /// .octal(true)
+ /// .build()
+ /// .unwrap();
+ /// assert!(re.is_match(b"a"));
+ /// ```
+ pub fn octal(&mut self, yes: bool) -> &mut RegexBuilder {
+ self.builder.octal(yes);
+ self
+ }
+
+ /// Sets the approximate size limit, in bytes, of the compiled regex.
+ ///
+ /// This roughly corresponds to the amount of heap memory, in
+ /// bytes, occupied by a single regex. If the regex would otherwise
+ /// approximately exceed this limit, then compiling that regex will
+ /// fail.
+ ///
+ /// The main utility of a method like this is to avoid compiling
+ /// regexes that use an unexpected amount of resources, such as
+ /// time and memory. Even if the memory usage of a large regex is
+ /// acceptable, its search time may not be. Namely, worst case time
+ /// complexity for search is `O(m * n)`, where `m ~ len(pattern)` and
+ /// `n ~ len(haystack)`. That is, search time depends, in part, on the
+ /// size of the compiled regex. This means that putting a limit on the
+ /// size of the regex limits how much a regex can impact search time.
+ ///
+ /// For more information about regex size limits, see the section on
+ /// [untrusted inputs](crate#untrusted-input) in the top-level crate
+ /// documentation.
+ ///
+ /// The default for this is some reasonable number that permits most
+ /// patterns to compile successfully.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// # if !cfg!(target_pointer_width = "64") { return; } // see #1041
+ /// use regex::bytes::RegexBuilder;
+ ///
+ /// // It may surprise you how big some seemingly small patterns can
+ /// // be! Since \w is Unicode aware, this generates a regex that can
+ /// // match approximately 140,000 distinct codepoints.
+ /// assert!(RegexBuilder::new(r"\w").size_limit(45_000).build().is_err());
+ /// ```
+ pub fn size_limit(&mut self, bytes: usize) -> &mut RegexBuilder {
+ self.builder.size_limit(bytes);
+ self
+ }
+
+ /// Set the approximate capacity, in bytes, of the cache of transitions
+ /// used by the lazy DFA.
+ ///
+ /// While the lazy DFA isn't always used, it tends to be the most
+ /// commonly used regex engine in default configurations. It tends to
+ /// adopt the performance profile of a fully built DFA, but without the
+ /// downside of taking worst case exponential time to build.
+ ///
+ /// The downside is that it needs to keep a cache of transitions and
+ /// states that are built while running a search, and this cache
+ /// can fill up. When it fills up, the cache will reset itself. Any
+ /// previously generated states and transitions will then need to be
+ /// re-generated. If this happens too many times, then this library
+ /// will bail out of using the lazy DFA and switch to a different regex
+ /// engine.
+ ///
+ /// If your regex provokes this particular downside of the lazy DFA,
+ /// then it may be beneficial to increase its cache capacity. This will
+ /// potentially reduce the frequency of cache resetting (ideally to
+ /// `0`). While it won't fix all potential performance problems with
+ /// the lazy DFA, increasing the cache capacity does fix some.
+ ///
+ /// There is no easy way to determine, a priori, whether increasing
+ /// this cache capacity will help. In general, the larger your regex,
+ /// the more cache it's likely to use. But that isn't an ironclad rule.
+ /// For example, a regex like `[01]*1[01]{N}` would normally produce a
+ /// fully built DFA that is exponential in size with respect to `N`.
+ /// The lazy DFA will prevent exponential space blow-up, but its cache
+ /// is likely to fill up, even when it's large and even for smallish
+ /// values of `N`.
+ ///
+ /// If you aren't sure whether this helps or not, it is sensible to
+ /// set this to some arbitrarily large number in testing, such as
+ /// `usize::MAX`. Namely, this represents the amount of capacity that
+ /// *may* be used. It's probably not a good idea to use `usize::MAX` in
+ /// production though, since it implies there are no controls on heap
+ /// memory used by this library during a search. In effect, set it to
+ /// whatever you're willing to allocate for a single regex search.
+ pub fn dfa_size_limit(&mut self, bytes: usize) -> &mut RegexBuilder {
+ self.builder.dfa_size_limit(bytes);
+ self
+ }
+
+ /// Set the nesting limit for this parser.
+ ///
+ /// The nesting limit controls how deep the abstract syntax tree is
+ /// allowed to be. If the AST exceeds the given limit (e.g., with too
+ /// many nested groups), then an error is returned by the parser.
+ ///
+ /// The purpose of this limit is to act as a heuristic to prevent stack
+ /// overflow for consumers that do structural induction on an AST using
+ /// explicit recursion. While this crate never does this (instead using
+ /// constant stack space and moving the call stack to the heap), other
+ /// crates may.
+ ///
+ /// This limit is not checked until the entire AST is parsed.
+ /// Therefore, if callers want to put a limit on the amount of heap
+ /// space used, then they should impose a limit on the length, in
+ /// bytes, of the concrete pattern string. In particular, this is
+ /// viable since this parser implementation will limit itself to heap
+ /// space proportional to the length of the pattern string. See also
+ /// the [untrusted inputs](crate#untrusted-input) section in the
+ /// top-level crate documentation for more information about this.
+ ///
+ /// Note that a nest limit of `0` will return a nest limit error for
+ /// most patterns but not all. For example, a nest limit of `0` permits
+ /// `a` but not `ab`, since `ab` requires an explicit concatenation,
+ /// which results in a nest depth of `1`. In general, a nest limit is
+ /// not something that manifests in an obvious way in the concrete
+ /// syntax, therefore, it should not be used in a granular way.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::bytes::RegexBuilder;
+ ///
+ /// assert!(RegexBuilder::new(r"a").nest_limit(0).build().is_ok());
+ /// assert!(RegexBuilder::new(r"ab").nest_limit(0).build().is_err());
+ /// ```
+ pub fn nest_limit(&mut self, limit: u32) -> &mut RegexBuilder {
+ self.builder.nest_limit(limit);
+ self
+ }
+ }
+
+ /// A configurable builder for a [`RegexSet`].
+ ///
+ /// This builder can be used to programmatically set flags such as `i`
+ /// (case insensitive) and `x` (for verbose mode). This builder can also be
+ /// used to configure things like the line terminator and a size limit on
+ /// the compiled regular expression.
+ #[derive(Clone, Debug)]
+ pub struct RegexSetBuilder {
+ builder: Builder,
+ }
+
+ impl RegexSetBuilder {
+ /// Create a new builder with a default configuration for the given
+ /// patterns.
+ ///
+ /// If the patterns are invalid or exceed the configured size limits,
+ /// then an error will be returned when [`RegexSetBuilder::build`] is
+ /// called.
+ pub fn new<I, S>(patterns: I) -> RegexSetBuilder
+ where
+ I: IntoIterator<Item = S>,
+ S: AsRef<str>,
+ {
+ RegexSetBuilder { builder: Builder::new(patterns) }
+ }
+
+ /// Compiles the patterns given to `RegexSetBuilder::new` with the
+ /// configuration set on this builder.
+ ///
+ /// If the patterns aren't valid regexes or if a configured size limit
+ /// was exceeded, then an error is returned.
+ pub fn build(&self) -> Result<RegexSet, Error> {
+ self.builder.build_many_bytes()
+ }
+
+ /// This configures Unicode mode for all of the patterns.
+ ///
+ /// Enabling Unicode mode does a number of things:
+ ///
+ /// * Most fundamentally, it causes the fundamental atom of matching
+ /// to be a single codepoint. When Unicode mode is disabled, it's a
+ /// single byte. For example, when Unicode mode is enabled, `.` will
+ /// match `💩` once, whereas it will match 4 times when Unicode mode
+ /// is disabled. (Since the UTF-8 encoding of `💩` is 4 bytes long.)
+ /// * Case insensitive matching uses Unicode simple case folding rules.
+ /// * Unicode character classes like `\p{Letter}` and `\p{Greek}` are
+ /// available.
+ /// * Perl character classes are Unicode aware. That is, `\w`, `\s` and
+ /// `\d`.
+ /// * The word boundary assertions, `\b` and `\B`, use the Unicode
+ /// definition of a word character.
+ ///
+ /// Note that unlike the top-level `RegexSet` for searching `&str`,
+ /// it is permitted to disable Unicode mode even if the resulting
+ /// pattern could match invalid UTF-8. For example, `(?-u:.)` is not
+ /// a valid pattern for a top-level `RegexSet`, but is valid for a
+ /// `bytes::RegexSet`.
+ ///
+ /// For more details on the Unicode support in this crate, see the
+ /// [Unicode section](crate#unicode) in this crate's top-level
+ /// documentation.
+ ///
+ /// The default for this is `true`.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::bytes::RegexSetBuilder;
+ ///
+ /// let re = RegexSetBuilder::new([r"\w"])
+ /// .unicode(false)
+ /// .build()
+ /// .unwrap();
+ /// // Normally greek letters would be included in \w, but since
+ /// // Unicode mode is disabled, it only matches ASCII letters.
+ /// assert!(!re.is_match("δ".as_bytes()));
+ ///
+ /// let re = RegexSetBuilder::new([r"s"])
+ /// .case_insensitive(true)
+ /// .unicode(false)
+ /// .build()
+ /// .unwrap();
+ /// // Normally 'ſ' is included when searching for 's' case
+ /// // insensitively due to Unicode's simple case folding rules. But
+ /// // when Unicode mode is disabled, only ASCII case insensitive rules
+ /// // are used.
+ /// assert!(!re.is_match("ſ".as_bytes()));
+ /// ```
+ ///
+ /// Since this builder is for constructing a
+ /// [`bytes::RegexSet`](RegexSet), one can disable Unicode mode even if
+ /// it would match invalid UTF-8:
+ ///
+ /// ```
+ /// use regex::bytes::RegexSetBuilder;
+ ///
+ /// let re = RegexSetBuilder::new([r"."])
+ /// .unicode(false)
+ /// .build()
+ /// .unwrap();
+ /// // Since Unicode mode is disabled, `.` matches any single byte other
+ /// // than `\n`, including bytes that can never appear in valid UTF-8.
+ /// assert!(re.is_match(b"\xFF"));
+ /// ```
+ pub fn unicode(&mut self, yes: bool) -> &mut RegexSetBuilder {
+ self.builder.unicode(yes);
+ self
+ }
+
+ /// This configures whether to enable case insensitive matching for all
+ /// of the patterns.
+ ///
+ /// This setting can also be configured using the inline flag `i`
+ /// in the pattern. For example, `(?i:foo)` matches `foo` case
+ /// insensitively while `(?-i:foo)` matches `foo` case sensitively.
+ ///
+ /// The default for this is `false`.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::bytes::RegexSetBuilder;
+ ///
+ /// let re = RegexSetBuilder::new([r"foo(?-i:bar)quux"])
+ /// .case_insensitive(true)
+ /// .build()
+ /// .unwrap();
+ /// assert!(re.is_match(b"FoObarQuUx"));
+ /// // Even though case insensitive matching is enabled in the builder,
+ /// // it can be locally disabled within the pattern. In this case,
+ /// // `bar` is matched case sensitively.
+ /// assert!(!re.is_match(b"fooBARquux"));
+ /// ```
+ pub fn case_insensitive(&mut self, yes: bool) -> &mut RegexSetBuilder {
+ self.builder.case_insensitive(yes);
+ self
+ }
+
+ /// This configures multi-line mode for all of the patterns.
+ ///
+ /// Enabling multi-line mode changes the behavior of the `^` and `$`
+ /// anchor assertions. Instead of only matching at the beginning and
+ /// end of a haystack, respectively, multi-line mode causes them to
+ /// match at the beginning and end of a line *in addition* to the
+ /// beginning and end of a haystack. More precisely, `^` will match at
+ /// the position immediately following a `\n` and `$` will match at the
+ /// position immediately preceding a `\n`.
+ ///
+ /// The behavior of this option can be impacted by other settings too:
+ ///
+ /// * The [`RegexSetBuilder::line_terminator`] option changes `\n`
+ /// above to any ASCII byte.
+ /// * The [`RegexSetBuilder::crlf`] option changes the line terminator
+ /// to be either `\r` or `\n`, but never at the position between a `\r`
+ /// and `\n`.
+ ///
+ /// This setting can also be configured using the inline flag `m` in
+ /// the pattern.
+ ///
+ /// The default for this is `false`.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::bytes::RegexSetBuilder;
+ ///
+ /// let re = RegexSetBuilder::new([r"^foo$"])
+ /// .multi_line(true)
+ /// .build()
+ /// .unwrap();
+ /// assert!(re.is_match(b"\nfoo\n"));
+ /// ```
+ pub fn multi_line(&mut self, yes: bool) -> &mut RegexSetBuilder {
+ self.builder.multi_line(yes);
+ self
+ }
+
+ /// This configures dot-matches-new-line mode for all of the patterns.
+ ///
+ /// Perhaps surprisingly, the default behavior for `.` is not to match
+ /// any character, but rather, to match any character except for the
+ /// line terminator (which is `\n` by default). When this mode is
+ /// enabled, the behavior changes such that `.` truly matches any
+ /// character.
+ ///
+ /// This setting can also be configured using the inline flag `s` in
+ /// the pattern. For example, `(?s:.)` and `\p{any}` are equivalent
+ /// regexes.
+ ///
+ /// The default for this is `false`.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::bytes::RegexSetBuilder;
+ ///
+ /// let re = RegexSetBuilder::new([r"foo.bar"])
+ /// .dot_matches_new_line(true)
+ /// .build()
+ /// .unwrap();
+ /// let hay = b"foo\nbar";
+ /// assert!(re.is_match(hay));
+ /// ```
+ pub fn dot_matches_new_line(
+ &mut self,
+ yes: bool,
+ ) -> &mut RegexSetBuilder {
+ self.builder.dot_matches_new_line(yes);
+ self
+ }
+
+ /// This configures CRLF mode for all of the patterns.
+ ///
+ /// When CRLF mode is enabled, both `\r` ("carriage return" or CR for
+ /// short) and `\n` ("line feed" or LF for short) are treated as line
+ /// terminators. This results in the following:
+ ///
+ /// * Unless dot-matches-new-line mode is enabled, `.` will now match
+ /// any character except for `\n` and `\r`.
+ /// * When multi-line mode is enabled, `^` will match immediately
+ /// following a `\n` or a `\r`. Similarly, `$` will match immediately
+ /// preceding a `\n` or a `\r`. Neither `^` nor `$` will ever match
+ /// between `\r` and `\n`.
+ ///
+ /// This setting can also be configured using the inline flag `R` in
+ /// the pattern.
+ ///
+ /// The default for this is `false`.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::bytes::RegexSetBuilder;
+ ///
+ /// let re = RegexSetBuilder::new([r"^foo$"])
+ /// .multi_line(true)
+ /// .crlf(true)
+ /// .build()
+ /// .unwrap();
+ /// let hay = b"\r\nfoo\r\n";
+ /// // If CRLF mode weren't enabled here, then '$' wouldn't match
+ /// // immediately after 'foo', and thus no match would be found.
+ /// assert!(re.is_match(hay));
+ /// ```
+ ///
+ /// This example demonstrates that `^` will never match at a position
+ /// between `\r` and `\n`. (`$` will similarly not match between a `\r`
+ /// and a `\n`.)
+ ///
+ /// ```
+ /// use regex::bytes::RegexSetBuilder;
+ ///
+ /// let re = RegexSetBuilder::new([r"^\n"])
+ /// .multi_line(true)
+ /// .crlf(true)
+ /// .build()
+ /// .unwrap();
+ /// assert!(!re.is_match(b"\r\n"));
+ /// ```
+ pub fn crlf(&mut self, yes: bool) -> &mut RegexSetBuilder {
+ self.builder.crlf(yes);
+ self
+ }
+
+ /// Configures the line terminator to be used by the regex.
+ ///
+ /// The line terminator is relevant in two ways for a particular regex:
+ ///
+ /// * When dot-matches-new-line mode is *not* enabled (the default),
+ /// then `.` will match any character except for the configured line
+ /// terminator.
+ /// * When multi-line mode is enabled (not the default), then `^` and
+ /// `$` will match immediately after and before, respectively, a line
+ /// terminator.
+ ///
+ /// In both cases, if CRLF mode is enabled in a particular context,
+ /// then it takes precedence over any configured line terminator.
+ ///
+ /// This option cannot be configured from within the pattern.
+ ///
+ /// The default line terminator is `\n`.
+ ///
+ /// # Example
+ ///
+ /// This shows how to treat the NUL byte as a line terminator. This can
+ /// be a useful heuristic when searching binary data.
+ ///
+ /// ```
+ /// use regex::bytes::RegexSetBuilder;
+ ///
+ /// let re = RegexSetBuilder::new([r"^foo$"])
+ /// .multi_line(true)
+ /// .line_terminator(b'\x00')
+ /// .build()
+ /// .unwrap();
+ /// let hay = b"\x00foo\x00";
+ /// assert!(re.is_match(hay));
+ /// ```
+ ///
+ /// This example shows that the behavior of `.` is impacted by this
+ /// setting as well:
+ ///
+ /// ```
+ /// use regex::bytes::RegexSetBuilder;
+ ///
+ /// let re = RegexSetBuilder::new([r"."])
+ /// .line_terminator(b'\x00')
+ /// .build()
+ /// .unwrap();
+ /// assert!(re.is_match(b"\n"));
+ /// assert!(!re.is_match(b"\x00"));
+ /// ```
+ ///
+ /// This shows that building a regex will work even when the byte given
+ /// is not ASCII. This is unlike the top-level `RegexSet` API where
+ /// matching invalid UTF-8 is not allowed.
+ ///
+ /// Note though that you must disable Unicode mode. This is required
+ /// because Unicode mode requires matching one codepoint at a time,
+ /// and there is no way to match a non-ASCII byte as if it were a
+ /// codepoint.
+ ///
+ /// ```
+ /// use regex::bytes::RegexSetBuilder;
+ ///
+ /// assert!(
+ /// RegexSetBuilder::new([r"."])
+ /// .unicode(false)
+ /// .line_terminator(0x80)
+ /// .build()
+ /// .is_ok(),
+ /// );
+ /// ```
+ pub fn line_terminator(&mut self, byte: u8) -> &mut RegexSetBuilder {
+ self.builder.line_terminator(byte);
+ self
+ }
+
+ /// This configures swap-greed mode for all of the patterns.
+ ///
+ /// When swap-greed mode is enabled, patterns like `a+` will become
+ /// non-greedy and patterns like `a+?` will become greedy. In other
+ /// words, the meanings of `a+` and `a+?` are switched.
+ ///
+ /// This setting can also be configured using the inline flag `U` in
+ /// the pattern.
+ ///
+ /// Note that this is generally not useful for a `RegexSet` since a
+ /// `RegexSet` can only report whether a pattern matches or not. Since
+ /// greediness never impacts whether a match is found or not (only the
+ /// offsets of the match), it follows that whether parts of a pattern
+ /// are greedy or not doesn't matter for a `RegexSet`.
+ ///
+ /// The default for this is `false`.
+ pub fn swap_greed(&mut self, yes: bool) -> &mut RegexSetBuilder {
+ self.builder.swap_greed(yes);
+ self
+ }
+
+ /// This configures verbose mode for all of the patterns.
+ ///
+ /// When enabled, whitespace will be treated as insignificant in the
+ /// pattern and `#` can be used to start a comment until the next new
+ /// line.
+ ///
+ /// Normally, in most places in a pattern, whitespace is treated
+ /// literally. For example ` +` will match one or more ASCII whitespace
+ /// characters.
+ ///
+ /// When verbose mode is enabled, `\#` can be used to match a literal
+ /// `#` and `\ ` can be used to match a literal ASCII whitespace
+ /// character.
+ ///
+ /// Verbose mode is useful for permitting regexes to be formatted and
+ /// broken up more nicely. This may make them more easily readable.
+ ///
+ /// This setting can also be configured using the inline flag `x` in
+ /// the pattern.
+ ///
+ /// The default for this is `false`.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::bytes::RegexSetBuilder;
+ ///
+ /// let pat = r"
+ /// \b
+ /// (?<first>\p{Uppercase}\w*) # always start with uppercase letter
+ /// [\s--\n]+ # whitespace should separate names
+ /// (?: # middle name can be an initial!
+ /// (?:(?<initial>\p{Uppercase})\.|(?<middle>\p{Uppercase}\w*))
+ /// [\s--\n]+
+ /// )?
+ /// (?<last>\p{Uppercase}\w*)
+ /// \b
+ /// ";
+ /// let re = RegexSetBuilder::new([pat])
+ /// .ignore_whitespace(true)
+ /// .build()
+ /// .unwrap();
+ /// assert!(re.is_match(b"Harry Potter"));
+ /// assert!(re.is_match(b"Harry J. Potter"));
+ /// assert!(re.is_match(b"Harry James Potter"));
+ /// assert!(!re.is_match(b"harry J. Potter"));
+ /// ```
+ pub fn ignore_whitespace(
+ &mut self,
+ yes: bool,
+ ) -> &mut RegexSetBuilder {
+ self.builder.ignore_whitespace(yes);
+ self
+ }
+
+ /// This configures octal mode for all of the patterns.
+ ///
+ /// Octal syntax is a little-known way of uttering Unicode codepoints
+ /// in a pattern. For example, `a`, `\x61`, `\u0061` and `\141` are all
+ /// equivalent patterns, where the last example shows octal syntax.
+ ///
+ /// While supporting octal syntax isn't in and of itself a problem,
+ /// it does make good error messages harder. That is, in PCRE based
+ /// regex engines, syntax like `\1` invokes a backreference, which is
+ /// explicitly unsupported in this library. However, many users expect
+ /// backreferences to be supported. Therefore, when octal support
+ /// is disabled, the error message will explicitly mention that
+ /// backreferences aren't supported.
+ ///
+ /// The default for this is `false`.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::bytes::RegexSetBuilder;
+ ///
+ /// // Normally this pattern would not compile, with an error message
+ /// // about backreferences not being supported. But with octal mode
+ /// // enabled, octal escape sequences work.
+ /// let re = RegexSetBuilder::new([r"\141"])
+ /// .octal(true)
+ /// .build()
+ /// .unwrap();
+ /// assert!(re.is_match(b"a"));
+ /// ```
+ pub fn octal(&mut self, yes: bool) -> &mut RegexSetBuilder {
+ self.builder.octal(yes);
+ self
+ }
+
+ /// Sets the approximate size limit, in bytes, of the compiled regex.
+ ///
+ /// This roughly corresponds to the amount of heap memory, in
+ /// bytes, occupied by a single regex. If the regex would otherwise
+ /// approximately exceed this limit, then compiling that regex will
+ /// fail.
+ ///
+ /// The main utility of a method like this is to avoid compiling
+ /// regexes that use an unexpected amount of resources, such as
+ /// time and memory. Even if the memory usage of a large regex is
+ /// acceptable, its search time may not be. Namely, worst case time
+ /// complexity for search is `O(m * n)`, where `m ~ len(pattern)` and
+ /// `n ~ len(haystack)`. That is, search time depends, in part, on the
+ /// size of the compiled regex. This means that putting a limit on the
+ /// size of the regex limits how much a regex can impact search time.
+ ///
+ /// For more information about regex size limits, see the section on
+ /// [untrusted inputs](crate#untrusted-input) in the top-level crate
+ /// documentation.
+ ///
+ /// The default for this is some reasonable number that permits most
+ /// patterns to compile successfully.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// # if !cfg!(target_pointer_width = "64") { return; } // see #1041
+ /// use regex::bytes::RegexSetBuilder;
+ ///
+ /// // It may surprise you how big some seemingly small patterns can
+ /// // be! Since \w is Unicode aware, this generates a regex that can
+ /// // match approximately 140,000 distinct codepoints.
+ /// assert!(
+ /// RegexSetBuilder::new([r"\w"])
+ /// .size_limit(45_000)
+ /// .build()
+ /// .is_err()
+ /// );
+ /// ```
+ pub fn size_limit(&mut self, bytes: usize) -> &mut RegexSetBuilder {
+ self.builder.size_limit(bytes);
+ self
+ }
+
+ /// Set the approximate capacity, in bytes, of the cache of transitions
+ /// used by the lazy DFA.
+ ///
+ /// While the lazy DFA isn't always used, it tends to be the most
+ /// commonly used regex engine in default configurations. It tends to
+ /// adopt the performance profile of a fully built DFA, but without the
+ /// downside of taking worst case exponential time to build.
+ ///
+ /// The downside is that it needs to keep a cache of transitions and
+ /// states that are built while running a search, and this cache
+ /// can fill up. When it fills up, the cache will reset itself. Any
+ /// previously generated states and transitions will then need to be
+ /// re-generated. If this happens too many times, then this library
+ /// will bail out of using the lazy DFA and switch to a different regex
+ /// engine.
+ ///
+ /// If your regex provokes this particular downside of the lazy DFA,
+ /// then it may be beneficial to increase its cache capacity. This will
+ /// potentially reduce the frequency of cache resetting (ideally to
+ /// `0`). While it won't fix all potential performance problems with
+ /// the lazy DFA, increasing the cache capacity does fix some.
+ ///
+ /// There is no easy way to determine, a priori, whether increasing
+ /// this cache capacity will help. In general, the larger your regex,
+ /// the more cache it's likely to use. But that isn't an ironclad rule.
+ /// For example, a regex like `[01]*1[01]{N}` would normally produce a
+ /// fully built DFA that is exponential in size with respect to `N`.
+ /// The lazy DFA will prevent exponential space blow-up, but its cache
+ /// is likely to fill up, even when it's large and even for smallish
+ /// values of `N`.
+ ///
+ /// If you aren't sure whether this helps or not, it is sensible to
+ /// set this to some arbitrarily large number in testing, such as
+ /// `usize::MAX`. Namely, this represents the amount of capacity that
+ /// *may* be used. It's probably not a good idea to use `usize::MAX` in
+ /// production though, since it implies there are no controls on heap
+ /// memory used by this library during a search. In effect, set it to
+ /// whatever you're willing to allocate for a single regex search.
+ pub fn dfa_size_limit(
+ &mut self,
+ bytes: usize,
+ ) -> &mut RegexSetBuilder {
+ self.builder.dfa_size_limit(bytes);
+ self
+ }
+
+ /// Set the nesting limit for this parser.
+ ///
+ /// The nesting limit controls how deep the abstract syntax tree is
+ /// allowed to be. If the AST exceeds the given limit (e.g., with too
+ /// many nested groups), then an error is returned by the parser.
+ ///
+ /// The purpose of this limit is to act as a heuristic to prevent stack
+ /// overflow for consumers that do structural induction on an AST using
+ /// explicit recursion. While this crate never does this (instead using
+ /// constant stack space and moving the call stack to the heap), other
+ /// crates may.
+ ///
+ /// This limit is not checked until the entire AST is parsed.
+ /// Therefore, if callers want to put a limit on the amount of heap
+ /// space used, then they should impose a limit on the length, in
+ /// bytes, of the concrete pattern string. In particular, this is
+ /// viable since this parser implementation will limit itself to heap
+ /// space proportional to the length of the pattern string. See also
+ /// the [untrusted inputs](crate#untrusted-input) section in the
+ /// top-level crate documentation for more information about this.
+ ///
+ /// Note that a nest limit of `0` will return a nest limit error for
+ /// most patterns but not all. For example, a nest limit of `0` permits
+ /// `a` but not `ab`, since `ab` requires an explicit concatenation,
+ /// which results in a nest depth of `1`. In general, a nest limit is
+ /// not something that manifests in an obvious way in the concrete
+ /// syntax, therefore, it should not be used in a granular way.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::bytes::RegexSetBuilder;
+ ///
+ /// assert!(RegexSetBuilder::new([r"a"]).nest_limit(0).build().is_ok());
+ /// assert!(RegexSetBuilder::new([r"ab"]).nest_limit(0).build().is_err());
+ /// ```
+ pub fn nest_limit(&mut self, limit: u32) -> &mut RegexSetBuilder {
+ self.builder.nest_limit(limit);
+ self
+ }
+ }
+}
diff --git a/vendor/regex/src/bytes.rs b/vendor/regex/src/bytes.rs
new file mode 100644
index 000000000..383ac4a5b
--- /dev/null
+++ b/vendor/regex/src/bytes.rs
@@ -0,0 +1,91 @@
+/*!
+Search for regex matches in `&[u8]` haystacks.
+
+This module provides a nearly identical API via [`Regex`] to the one found in
+the top-level of this crate. There are two important differences:
+
+1. Matching is done on `&[u8]` instead of `&str`. Additionally, `Vec<u8>`
+is used where `String` would have been used in the top-level API.
+2. Unicode support can be disabled even when disabling it would result in
+matching invalid UTF-8 bytes.
+
+# Example: match null terminated string
+
+This shows how to find all null-terminated strings in a slice of bytes. This
+works even if a C string contains invalid UTF-8.
+
+```rust
+use regex::bytes::Regex;
+
+let re = Regex::new(r"(?-u)(?<cstr>[^\x00]+)\x00").unwrap();
+let hay = b"foo\x00qu\xFFux\x00baz\x00";
+
+// Extract all of the strings without the NUL terminator from each match.
+// The unwrap is OK here since a match requires the `cstr` capture to match.
+let cstrs: Vec<&[u8]> =
+ re.captures_iter(hay)
+ .map(|c| c.name("cstr").unwrap().as_bytes())
+ .collect();
+assert_eq!(cstrs, vec![&b"foo"[..], &b"qu\xFFux"[..], &b"baz"[..]]);
+```
+
+# Example: selectively enable Unicode support
+
+This shows how to match an arbitrary byte pattern followed by a UTF-8 encoded
+string (e.g., to extract a title from a Matroska file):
+
+```rust
+use regex::bytes::Regex;
+
+let re = Regex::new(
+ r"(?-u)\x7b\xa9(?:[\x80-\xfe]|[\x40-\xff].)(?u:(.*))"
+).unwrap();
+let hay = b"\x12\xd0\x3b\x5f\x7b\xa9\x85\xe2\x98\x83\x80\x98\x54\x76\x68\x65";
+
+// Notice that despite the `.*` at the end, it will only match valid UTF-8
+// because Unicode mode was enabled with the `u` flag. Without the `u` flag,
+// the `.*` would match the rest of the bytes regardless of whether they were
+// valid UTF-8.
+let (_, [title]) = re.captures(hay).unwrap().extract();
+assert_eq!(title, b"\xE2\x98\x83");
+// We can UTF-8 decode the title now. And the unwrap here
+// is correct because the existence of a match guarantees
+// that `title` is valid UTF-8.
+let title = std::str::from_utf8(title).unwrap();
+assert_eq!(title, "☃");
+```
+
+In general, if the Unicode flag is enabled in a capture group and that capture
+is part of the overall match, then the capture is *guaranteed* to be valid
+UTF-8.
+
+# Syntax
+
+The supported syntax is pretty much the same as the syntax for Unicode
+regular expressions with a few changes that make sense for matching arbitrary
+bytes:
+
+1. The `u` flag can be disabled even when disabling it might cause the regex to
+match invalid UTF-8. When the `u` flag is disabled, the regex is said to be in
+"ASCII compatible" mode.
+2. In ASCII compatible mode, Unicode character classes are not allowed. Literal
+Unicode scalar values outside of character classes are allowed.
+3. In ASCII compatible mode, Perl character classes (`\w`, `\d` and `\s`)
+revert to their typical ASCII definition. `\w` maps to `[[:word:]]`, `\d` maps
+to `[[:digit:]]` and `\s` maps to `[[:space:]]`.
+4. In ASCII compatible mode, word boundaries use the ASCII compatible `\w` to
+determine whether a byte is a word byte or not.
+5. Hexadecimal notation can be used to specify arbitrary bytes instead of
+Unicode codepoints. For example, in ASCII compatible mode, `\xFF` matches the
+literal byte `\xFF`, while in Unicode mode, `\xFF` is the Unicode codepoint
+`U+00FF` that matches its UTF-8 encoding of `\xC3\xBF`. Similarly for octal
+notation when enabled.
+6. In ASCII compatible mode, `.` matches any *byte* except for `\n`. When the
+`s` flag is additionally enabled, `.` matches any byte.
+
+# Performance
+
+In general, one should expect performance on `&[u8]` to be roughly similar to
+performance on `&str`.
+*/
+pub use crate::{builders::bytes::*, regex::bytes::*, regexset::bytes::*};
diff --git a/vendor/regex/src/compile.rs b/vendor/regex/src/compile.rs
deleted file mode 100644
index 23e63ec89..000000000
--- a/vendor/regex/src/compile.rs
+++ /dev/null
@@ -1,1333 +0,0 @@
-use std::collections::HashMap;
-use std::fmt;
-use std::iter;
-use std::result;
-use std::sync::Arc;
-
-use regex_syntax::hir::{self, Hir, Look};
-use regex_syntax::is_word_byte;
-use regex_syntax::utf8::{Utf8Range, Utf8Sequence, Utf8Sequences};
-
-use crate::prog::{
- EmptyLook, Inst, InstBytes, InstChar, InstEmptyLook, InstPtr, InstRanges,
- InstSave, InstSplit, Program,
-};
-
-use crate::Error;
-
-type Result = result::Result<Patch, Error>;
-type ResultOrEmpty = result::Result<Option<Patch>, Error>;
-
-#[derive(Debug)]
-struct Patch {
- hole: Hole,
- entry: InstPtr,
-}
-
-/// A compiler translates a regular expression AST to a sequence of
-/// instructions. The sequence of instructions represents an NFA.
-// `Compiler` is only public via the `internal` module, so avoid deriving
-// `Debug`.
-#[allow(missing_debug_implementations)]
-pub struct Compiler {
- insts: Vec<MaybeInst>,
- compiled: Program,
- capture_name_idx: HashMap<String, usize>,
- num_exprs: usize,
- size_limit: usize,
- suffix_cache: SuffixCache,
- utf8_seqs: Option<Utf8Sequences>,
- byte_classes: ByteClassSet,
- // This keeps track of extra bytes allocated while compiling the regex
- // program. Currently, this corresponds to two things. First is the heap
- // memory allocated by Unicode character classes ('InstRanges'). Second is
- // a "fake" amount of memory used by empty sub-expressions, so that enough
- // empty sub-expressions will ultimately trigger the compiler to bail
- // because of a size limit restriction. (That empty sub-expressions don't
- // add to heap memory usage is more-or-less an implementation detail.) In
- // the second case, if we don't bail, then an excessively large repetition
- // on an empty sub-expression can result in the compiler using a very large
- // amount of CPU time.
- extra_inst_bytes: usize,
-}
-
-impl Compiler {
- /// Create a new regular expression compiler.
- ///
- /// Various options can be set before calling `compile` on an expression.
- pub fn new() -> Self {
- Compiler {
- insts: vec![],
- compiled: Program::new(),
- capture_name_idx: HashMap::new(),
- num_exprs: 0,
- size_limit: 10 * (1 << 20),
- suffix_cache: SuffixCache::new(1000),
- utf8_seqs: Some(Utf8Sequences::new('\x00', '\x00')),
- byte_classes: ByteClassSet::new(),
- extra_inst_bytes: 0,
- }
- }
-
- /// The size of the resulting program is limited by size_limit. If
- /// the program approximately exceeds the given size (in bytes), then
- /// compilation will stop and return an error.
- pub fn size_limit(mut self, size_limit: usize) -> Self {
- self.size_limit = size_limit;
- self
- }
-
- /// If bytes is true, then the program is compiled as a byte based
- /// automaton, which incorporates UTF-8 decoding into the machine. If it's
- /// false, then the automaton is Unicode scalar value based, e.g., an
- /// engine utilizing such an automaton is responsible for UTF-8 decoding.
- ///
- /// The specific invariant is that when returning a byte based machine,
- /// the neither the `Char` nor `Ranges` instructions are produced.
- /// Conversely, when producing a Unicode scalar value machine, the `Bytes`
- /// instruction is never produced.
- ///
- /// Note that `dfa(true)` implies `bytes(true)`.
- pub fn bytes(mut self, yes: bool) -> Self {
- self.compiled.is_bytes = yes;
- self
- }
-
- /// When disabled, the program compiled may match arbitrary bytes.
- ///
- /// When enabled (the default), all compiled programs exclusively match
- /// valid UTF-8 bytes.
- pub fn only_utf8(mut self, yes: bool) -> Self {
- self.compiled.only_utf8 = yes;
- self
- }
-
- /// When set, the machine returned is suitable for use in the DFA matching
- /// engine.
- ///
- /// In particular, this ensures that if the regex is not anchored in the
- /// beginning, then a preceding `.*?` is included in the program. (The NFA
- /// based engines handle the preceding `.*?` explicitly, which is difficult
- /// or impossible in the DFA engine.)
- pub fn dfa(mut self, yes: bool) -> Self {
- self.compiled.is_dfa = yes;
- self
- }
-
- /// When set, the machine returned is suitable for matching text in
- /// reverse. In particular, all concatenations are flipped.
- pub fn reverse(mut self, yes: bool) -> Self {
- self.compiled.is_reverse = yes;
- self
- }
-
- /// Compile a regular expression given its AST.
- ///
- /// The compiler is guaranteed to succeed unless the program exceeds the
- /// specified size limit. If the size limit is exceeded, then compilation
- /// stops and returns an error.
- pub fn compile(mut self, exprs: &[Hir]) -> result::Result<Program, Error> {
- debug_assert!(!exprs.is_empty());
- self.num_exprs = exprs.len();
- if exprs.len() == 1 {
- self.compile_one(&exprs[0])
- } else {
- self.compile_many(exprs)
- }
- }
-
- fn compile_one(mut self, expr: &Hir) -> result::Result<Program, Error> {
- if self.compiled.only_utf8
- && expr.properties().look_set().contains(Look::WordAsciiNegate)
- {
- return Err(Error::Syntax(
- "ASCII-only \\B is not allowed in Unicode regexes \
- because it may result in invalid UTF-8 matches"
- .to_string(),
- ));
- }
- // If we're compiling a forward DFA and we aren't anchored, then
- // add a `.*?` before the first capture group.
- // Other matching engines handle this by baking the logic into the
- // matching engine itself.
- let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 };
- self.compiled.is_anchored_start =
- expr.properties().look_set_prefix().contains(Look::Start);
- self.compiled.is_anchored_end =
- expr.properties().look_set_suffix().contains(Look::End);
- if self.compiled.needs_dotstar() {
- dotstar_patch = self.c_dotstar()?;
- self.compiled.start = dotstar_patch.entry;
- }
- self.compiled.captures = vec![None];
- let patch =
- self.c_capture(0, expr)?.unwrap_or_else(|| self.next_inst());
- if self.compiled.needs_dotstar() {
- self.fill(dotstar_patch.hole, patch.entry);
- } else {
- self.compiled.start = patch.entry;
- }
- self.fill_to_next(patch.hole);
- self.compiled.matches = vec![self.insts.len()];
- self.push_compiled(Inst::Match(0));
- self.compiled.static_captures_len =
- expr.properties().static_explicit_captures_len();
- self.compile_finish()
- }
-
- fn compile_many(
- mut self,
- exprs: &[Hir],
- ) -> result::Result<Program, Error> {
- debug_assert!(exprs.len() > 1);
-
- self.compiled.is_anchored_start = exprs
- .iter()
- .all(|e| e.properties().look_set_prefix().contains(Look::Start));
- self.compiled.is_anchored_end = exprs
- .iter()
- .all(|e| e.properties().look_set_suffix().contains(Look::End));
- let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 };
- if self.compiled.needs_dotstar() {
- dotstar_patch = self.c_dotstar()?;
- self.compiled.start = dotstar_patch.entry;
- } else {
- self.compiled.start = 0; // first instruction is always split
- }
- self.fill_to_next(dotstar_patch.hole);
-
- let mut prev_hole = Hole::None;
- for (i, expr) in exprs[0..exprs.len() - 1].iter().enumerate() {
- self.fill_to_next(prev_hole);
- let split = self.push_split_hole();
- let Patch { hole, entry } =
- self.c_capture(0, expr)?.unwrap_or_else(|| self.next_inst());
- self.fill_to_next(hole);
- self.compiled.matches.push(self.insts.len());
- self.push_compiled(Inst::Match(i));
- prev_hole = self.fill_split(split, Some(entry), None);
- }
- let i = exprs.len() - 1;
- let Patch { hole, entry } =
- self.c_capture(0, &exprs[i])?.unwrap_or_else(|| self.next_inst());
- self.fill(prev_hole, entry);
- self.fill_to_next(hole);
- self.compiled.matches.push(self.insts.len());
- self.push_compiled(Inst::Match(i));
- self.compile_finish()
- }
-
- fn compile_finish(mut self) -> result::Result<Program, Error> {
- self.compiled.insts =
- self.insts.into_iter().map(|inst| inst.unwrap()).collect();
- self.compiled.byte_classes = self.byte_classes.byte_classes();
- self.compiled.capture_name_idx = Arc::new(self.capture_name_idx);
- Ok(self.compiled)
- }
-
- /// Compile expr into self.insts, returning a patch on success,
- /// or an error if we run out of memory.
- ///
- /// All of the c_* methods of the compiler share the contract outlined
- /// here.
- ///
- /// The main thing that a c_* method does is mutate `self.insts`
- /// to add a list of mostly compiled instructions required to execute
- /// the given expression. `self.insts` contains MaybeInsts rather than
- /// Insts because there is some backpatching required.
- ///
- /// The `Patch` value returned by each c_* method provides metadata
- /// about the compiled instructions emitted to `self.insts`. The
- /// `entry` member of the patch refers to the first instruction
- /// (the entry point), while the `hole` member contains zero or
- /// more offsets to partial instructions that need to be backpatched.
- /// The c_* routine can't know where its list of instructions are going to
- /// jump to after execution, so it is up to the caller to patch
- /// these jumps to point to the right place. So compiling some
- /// expression, e, we would end up with a situation that looked like:
- ///
- /// ```text
- /// self.insts = [ ..., i1, i2, ..., iexit1, ..., iexitn, ...]
- /// ^ ^ ^
- /// | \ /
- /// entry \ /
- /// hole
- /// ```
- ///
- /// To compile two expressions, e1 and e2, concatenated together we
- /// would do:
- ///
- /// ```ignore
- /// let patch1 = self.c(e1);
- /// let patch2 = self.c(e2);
- /// ```
- ///
- /// while leaves us with a situation that looks like
- ///
- /// ```text
- /// self.insts = [ ..., i1, ..., iexit1, ..., i2, ..., iexit2 ]
- /// ^ ^ ^ ^
- /// | | | |
- /// entry1 hole1 entry2 hole2
- /// ```
- ///
- /// Then to merge the two patches together into one we would backpatch
- /// hole1 with entry2 and return a new patch that enters at entry1
- /// and has hole2 for a hole. In fact, if you look at the c_concat
- /// method you will see that it does exactly this, though it handles
- /// a list of expressions rather than just the two that we use for
- /// an example.
- ///
- /// Ok(None) is returned when an expression is compiled to no
- /// instruction, and so no patch.entry value makes sense.
- fn c(&mut self, expr: &Hir) -> ResultOrEmpty {
- use crate::prog;
- use regex_syntax::hir::HirKind::*;
-
- self.check_size()?;
- match *expr.kind() {
- Empty => self.c_empty(),
- Literal(hir::Literal(ref bytes)) => {
- if self.compiled.is_reverse {
- let mut bytes = bytes.to_vec();
- bytes.reverse();
- self.c_literal(&bytes)
- } else {
- self.c_literal(bytes)
- }
- }
- Class(hir::Class::Unicode(ref cls)) => self.c_class(cls.ranges()),
- Class(hir::Class::Bytes(ref cls)) => {
- if self.compiled.uses_bytes() {
- self.c_class_bytes(cls.ranges())
- } else {
- assert!(cls.is_ascii());
- let mut char_ranges = vec![];
- for r in cls.iter() {
- let (s, e) = (r.start() as char, r.end() as char);
- char_ranges.push(hir::ClassUnicodeRange::new(s, e));
- }
- self.c_class(&char_ranges)
- }
- }
- Look(ref look) => match *look {
- hir::Look::Start if self.compiled.is_reverse => {
- self.c_empty_look(prog::EmptyLook::EndText)
- }
- hir::Look::Start => {
- self.c_empty_look(prog::EmptyLook::StartText)
- }
- hir::Look::End if self.compiled.is_reverse => {
- self.c_empty_look(prog::EmptyLook::StartText)
- }
- hir::Look::End => self.c_empty_look(prog::EmptyLook::EndText),
- hir::Look::StartLF if self.compiled.is_reverse => {
- self.byte_classes.set_range(b'\n', b'\n');
- self.c_empty_look(prog::EmptyLook::EndLine)
- }
- hir::Look::StartLF => {
- self.byte_classes.set_range(b'\n', b'\n');
- self.c_empty_look(prog::EmptyLook::StartLine)
- }
- hir::Look::EndLF if self.compiled.is_reverse => {
- self.byte_classes.set_range(b'\n', b'\n');
- self.c_empty_look(prog::EmptyLook::StartLine)
- }
- hir::Look::EndLF => {
- self.byte_classes.set_range(b'\n', b'\n');
- self.c_empty_look(prog::EmptyLook::EndLine)
- }
- hir::Look::StartCRLF | hir::Look::EndCRLF => {
- return Err(Error::Syntax(
- "CRLF-aware line anchors are not supported yet"
- .to_string(),
- ));
- }
- hir::Look::WordAscii => {
- self.byte_classes.set_word_boundary();
- self.c_empty_look(prog::EmptyLook::WordBoundaryAscii)
- }
- hir::Look::WordAsciiNegate => {
- self.byte_classes.set_word_boundary();
- self.c_empty_look(prog::EmptyLook::NotWordBoundaryAscii)
- }
- hir::Look::WordUnicode => {
- if !cfg!(feature = "unicode-perl") {
- return Err(Error::Syntax(
- "Unicode word boundaries are unavailable when \
- the unicode-perl feature is disabled"
- .to_string(),
- ));
- }
- self.compiled.has_unicode_word_boundary = true;
- self.byte_classes.set_word_boundary();
- // We also make sure that all ASCII bytes are in a different
- // class from non-ASCII bytes. Otherwise, it's possible for
- // ASCII bytes to get lumped into the same class as non-ASCII
- // bytes. This in turn may cause the lazy DFA to falsely start
- // when it sees an ASCII byte that maps to a byte class with
- // non-ASCII bytes. This ensures that never happens.
- self.byte_classes.set_range(0, 0x7F);
- self.c_empty_look(prog::EmptyLook::WordBoundary)
- }
- hir::Look::WordUnicodeNegate => {
- if !cfg!(feature = "unicode-perl") {
- return Err(Error::Syntax(
- "Unicode word boundaries are unavailable when \
- the unicode-perl feature is disabled"
- .to_string(),
- ));
- }
- self.compiled.has_unicode_word_boundary = true;
- self.byte_classes.set_word_boundary();
- // See comments above for why we set the ASCII range here.
- self.byte_classes.set_range(0, 0x7F);
- self.c_empty_look(prog::EmptyLook::NotWordBoundary)
- }
- },
- Capture(hir::Capture { index, ref name, ref sub }) => {
- if index as usize >= self.compiled.captures.len() {
- let name = match *name {
- None => None,
- Some(ref boxed_str) => Some(boxed_str.to_string()),
- };
- self.compiled.captures.push(name.clone());
- if let Some(name) = name {
- self.capture_name_idx.insert(name, index as usize);
- }
- }
- self.c_capture(2 * index as usize, sub)
- }
- Concat(ref es) => {
- if self.compiled.is_reverse {
- self.c_concat(es.iter().rev())
- } else {
- self.c_concat(es)
- }
- }
- Alternation(ref es) => self.c_alternate(&**es),
- Repetition(ref rep) => self.c_repeat(rep),
- }
- }
-
- fn c_empty(&mut self) -> ResultOrEmpty {
- // See: https://github.com/rust-lang/regex/security/advisories/GHSA-m5pq-gvj9-9vr8
- // See: CVE-2022-24713
- //
- // Since 'empty' sub-expressions don't increase the size of
- // the actual compiled object, we "fake" an increase in its
- // size so that our 'check_size_limit' routine will eventually
- // stop compilation if there are too many empty sub-expressions
- // (e.g., via a large repetition).
- self.extra_inst_bytes += std::mem::size_of::<Inst>();
- Ok(None)
- }
-
- fn c_capture(&mut self, first_slot: usize, expr: &Hir) -> ResultOrEmpty {
- if self.num_exprs > 1 || self.compiled.is_dfa {
- // Don't ever compile Save instructions for regex sets because
- // they are never used. They are also never used in DFA programs
- // because DFAs can't handle captures.
- self.c(expr)
- } else {
- let entry = self.insts.len();
- let hole = self.push_hole(InstHole::Save { slot: first_slot });
- let patch = self.c(expr)?.unwrap_or_else(|| self.next_inst());
- self.fill(hole, patch.entry);
- self.fill_to_next(patch.hole);
- let hole = self.push_hole(InstHole::Save { slot: first_slot + 1 });
- Ok(Some(Patch { hole, entry }))
- }
- }
-
- fn c_dotstar(&mut self) -> Result {
- let hir = if self.compiled.only_utf8() {
- Hir::dot(hir::Dot::AnyChar)
- } else {
- Hir::dot(hir::Dot::AnyByte)
- };
- Ok(self
- .c(&Hir::repetition(hir::Repetition {
- min: 0,
- max: None,
- greedy: false,
- sub: Box::new(hir),
- }))?
- .unwrap())
- }
-
- fn c_char(&mut self, c: char) -> ResultOrEmpty {
- if self.compiled.uses_bytes() {
- if c.is_ascii() {
- let b = c as u8;
- let hole =
- self.push_hole(InstHole::Bytes { start: b, end: b });
- self.byte_classes.set_range(b, b);
- Ok(Some(Patch { hole, entry: self.insts.len() - 1 }))
- } else {
- self.c_class(&[hir::ClassUnicodeRange::new(c, c)])
- }
- } else {
- let hole = self.push_hole(InstHole::Char { c });
- Ok(Some(Patch { hole, entry: self.insts.len() - 1 }))
- }
- }
-
- fn c_class(&mut self, ranges: &[hir::ClassUnicodeRange]) -> ResultOrEmpty {
- use std::mem::size_of;
-
- if ranges.is_empty() {
- return Err(Error::Syntax(
- "empty character classes are not allowed".to_string(),
- ));
- }
- if self.compiled.uses_bytes() {
- Ok(Some(CompileClass { c: self, ranges }.compile()?))
- } else {
- let ranges: Vec<(char, char)> =
- ranges.iter().map(|r| (r.start(), r.end())).collect();
- let hole = if ranges.len() == 1 && ranges[0].0 == ranges[0].1 {
- self.push_hole(InstHole::Char { c: ranges[0].0 })
- } else {
- self.extra_inst_bytes +=
- ranges.len() * (size_of::<char>() * 2);
- self.push_hole(InstHole::Ranges { ranges })
- };
- Ok(Some(Patch { hole, entry: self.insts.len() - 1 }))
- }
- }
-
- fn c_byte(&mut self, b: u8) -> ResultOrEmpty {
- self.c_class_bytes(&[hir::ClassBytesRange::new(b, b)])
- }
-
- fn c_class_bytes(
- &mut self,
- ranges: &[hir::ClassBytesRange],
- ) -> ResultOrEmpty {
- if ranges.is_empty() {
- return Err(Error::Syntax(
- "empty character classes are not allowed".to_string(),
- ));
- }
-
- let first_split_entry = self.insts.len();
- let mut holes = vec![];
- let mut prev_hole = Hole::None;
- for r in &ranges[0..ranges.len() - 1] {
- self.fill_to_next(prev_hole);
- let split = self.push_split_hole();
- let next = self.insts.len();
- self.byte_classes.set_range(r.start(), r.end());
- holes.push(self.push_hole(InstHole::Bytes {
- start: r.start(),
- end: r.end(),
- }));
- prev_hole = self.fill_split(split, Some(next), None);
- }
- let next = self.insts.len();
- let r = &ranges[ranges.len() - 1];
- self.byte_classes.set_range(r.start(), r.end());
- holes.push(
- self.push_hole(InstHole::Bytes { start: r.start(), end: r.end() }),
- );
- self.fill(prev_hole, next);
- Ok(Some(Patch { hole: Hole::Many(holes), entry: first_split_entry }))
- }
-
- fn c_empty_look(&mut self, look: EmptyLook) -> ResultOrEmpty {
- let hole = self.push_hole(InstHole::EmptyLook { look });
- Ok(Some(Patch { hole, entry: self.insts.len() - 1 }))
- }
-
- fn c_literal(&mut self, bytes: &[u8]) -> ResultOrEmpty {
- match core::str::from_utf8(bytes) {
- Ok(string) => {
- let mut it = string.chars();
- let Patch { mut hole, entry } = loop {
- match it.next() {
- None => return self.c_empty(),
- Some(ch) => {
- if let Some(p) = self.c_char(ch)? {
- break p;
- }
- }
- }
- };
- for ch in it {
- if let Some(p) = self.c_char(ch)? {
- self.fill(hole, p.entry);
- hole = p.hole;
- }
- }
- Ok(Some(Patch { hole, entry }))
- }
- Err(_) => {
- assert!(self.compiled.uses_bytes());
- let mut it = bytes.iter().copied();
- let Patch { mut hole, entry } = loop {
- match it.next() {
- None => return self.c_empty(),
- Some(byte) => {
- if let Some(p) = self.c_byte(byte)? {
- break p;
- }
- }
- }
- };
- for byte in it {
- if let Some(p) = self.c_byte(byte)? {
- self.fill(hole, p.entry);
- hole = p.hole;
- }
- }
- Ok(Some(Patch { hole, entry }))
- }
- }
- }
-
- fn c_concat<'a, I>(&mut self, exprs: I) -> ResultOrEmpty
- where
- I: IntoIterator<Item = &'a Hir>,
- {
- let mut exprs = exprs.into_iter();
- let Patch { mut hole, entry } = loop {
- match exprs.next() {
- None => return self.c_empty(),
- Some(e) => {
- if let Some(p) = self.c(e)? {
- break p;
- }
- }
- }
- };
- for e in exprs {
- if let Some(p) = self.c(e)? {
- self.fill(hole, p.entry);
- hole = p.hole;
- }
- }
- Ok(Some(Patch { hole, entry }))
- }
-
- fn c_alternate(&mut self, exprs: &[Hir]) -> ResultOrEmpty {
- debug_assert!(
- exprs.len() >= 2,
- "alternates must have at least 2 exprs"
- );
-
- // Initial entry point is always the first split.
- let first_split_entry = self.insts.len();
-
- // Save up all of the holes from each alternate. They will all get
- // patched to point to the same location.
- let mut holes = vec![];
-
- // true indicates that the hole is a split where we want to fill
- // the second branch.
- let mut prev_hole = (Hole::None, false);
- for e in &exprs[0..exprs.len() - 1] {
- if prev_hole.1 {
- let next = self.insts.len();
- self.fill_split(prev_hole.0, None, Some(next));
- } else {
- self.fill_to_next(prev_hole.0);
- }
- let split = self.push_split_hole();
- if let Some(Patch { hole, entry }) = self.c(e)? {
- holes.push(hole);
- prev_hole = (self.fill_split(split, Some(entry), None), false);
- } else {
- let (split1, split2) = split.dup_one();
- holes.push(split1);
- prev_hole = (split2, true);
- }
- }
- if let Some(Patch { hole, entry }) = self.c(&exprs[exprs.len() - 1])? {
- holes.push(hole);
- if prev_hole.1 {
- self.fill_split(prev_hole.0, None, Some(entry));
- } else {
- self.fill(prev_hole.0, entry);
- }
- } else {
- // We ignore prev_hole.1. When it's true, it means we have two
- // empty branches both pushing prev_hole.0 into holes, so both
- // branches will go to the same place anyway.
- holes.push(prev_hole.0);
- }
- Ok(Some(Patch { hole: Hole::Many(holes), entry: first_split_entry }))
- }
-
- fn c_repeat(&mut self, rep: &hir::Repetition) -> ResultOrEmpty {
- match (rep.min, rep.max) {
- (0, Some(1)) => self.c_repeat_zero_or_one(&rep.sub, rep.greedy),
- (0, None) => self.c_repeat_zero_or_more(&rep.sub, rep.greedy),
- (1, None) => self.c_repeat_one_or_more(&rep.sub, rep.greedy),
- (min, None) => {
- self.c_repeat_range_min_or_more(&rep.sub, rep.greedy, min)
- }
- (min, Some(max)) => {
- self.c_repeat_range(&rep.sub, rep.greedy, min, max)
- }
- }
- }
-
- fn c_repeat_zero_or_one(
- &mut self,
- expr: &Hir,
- greedy: bool,
- ) -> ResultOrEmpty {
- let split_entry = self.insts.len();
- let split = self.push_split_hole();
- let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? {
- Some(p) => p,
- None => return self.pop_split_hole(),
- };
- let split_hole = if greedy {
- self.fill_split(split, Some(entry_rep), None)
- } else {
- self.fill_split(split, None, Some(entry_rep))
- };
- let holes = vec![hole_rep, split_hole];
- Ok(Some(Patch { hole: Hole::Many(holes), entry: split_entry }))
- }
-
- fn c_repeat_zero_or_more(
- &mut self,
- expr: &Hir,
- greedy: bool,
- ) -> ResultOrEmpty {
- let split_entry = self.insts.len();
- let split = self.push_split_hole();
- let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? {
- Some(p) => p,
- None => return self.pop_split_hole(),
- };
-
- self.fill(hole_rep, split_entry);
- let split_hole = if greedy {
- self.fill_split(split, Some(entry_rep), None)
- } else {
- self.fill_split(split, None, Some(entry_rep))
- };
- Ok(Some(Patch { hole: split_hole, entry: split_entry }))
- }
-
- fn c_repeat_one_or_more(
- &mut self,
- expr: &Hir,
- greedy: bool,
- ) -> ResultOrEmpty {
- let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? {
- Some(p) => p,
- None => return Ok(None),
- };
- self.fill_to_next(hole_rep);
- let split = self.push_split_hole();
-
- let split_hole = if greedy {
- self.fill_split(split, Some(entry_rep), None)
- } else {
- self.fill_split(split, None, Some(entry_rep))
- };
- Ok(Some(Patch { hole: split_hole, entry: entry_rep }))
- }
-
- fn c_repeat_range_min_or_more(
- &mut self,
- expr: &Hir,
- greedy: bool,
- min: u32,
- ) -> ResultOrEmpty {
- let min = u32_to_usize(min);
- // Using next_inst() is ok, because we can't return it (concat would
- // have to return Some(_) while c_repeat_range_min_or_more returns
- // None).
- let patch_concat = self
- .c_concat(iter::repeat(expr).take(min))?
- .unwrap_or_else(|| self.next_inst());
- if let Some(patch_rep) = self.c_repeat_zero_or_more(expr, greedy)? {
- self.fill(patch_concat.hole, patch_rep.entry);
- Ok(Some(Patch { hole: patch_rep.hole, entry: patch_concat.entry }))
- } else {
- Ok(None)
- }
- }
-
- fn c_repeat_range(
- &mut self,
- expr: &Hir,
- greedy: bool,
- min: u32,
- max: u32,
- ) -> ResultOrEmpty {
- let (min, max) = (u32_to_usize(min), u32_to_usize(max));
- debug_assert!(min <= max);
- let patch_concat = self.c_concat(iter::repeat(expr).take(min))?;
- if min == max {
- return Ok(patch_concat);
- }
- // Same reasoning as in c_repeat_range_min_or_more (we know that min <
- // max at this point).
- let patch_concat = patch_concat.unwrap_or_else(|| self.next_inst());
- let initial_entry = patch_concat.entry;
- // It is much simpler to compile, e.g., `a{2,5}` as:
- //
- // aaa?a?a?
- //
- // But you end up with a sequence of instructions like this:
- //
- // 0: 'a'
- // 1: 'a',
- // 2: split(3, 4)
- // 3: 'a'
- // 4: split(5, 6)
- // 5: 'a'
- // 6: split(7, 8)
- // 7: 'a'
- // 8: MATCH
- //
- // This is *incredibly* inefficient because the splits end
- // up forming a chain, which has to be resolved everything a
- // transition is followed.
- let mut holes = vec![];
- let mut prev_hole = patch_concat.hole;
- for _ in min..max {
- self.fill_to_next(prev_hole);
- let split = self.push_split_hole();
- let Patch { hole, entry } = match self.c(expr)? {
- Some(p) => p,
- None => return self.pop_split_hole(),
- };
- prev_hole = hole;
- if greedy {
- holes.push(self.fill_split(split, Some(entry), None));
- } else {
- holes.push(self.fill_split(split, None, Some(entry)));
- }
- }
- holes.push(prev_hole);
- Ok(Some(Patch { hole: Hole::Many(holes), entry: initial_entry }))
- }
-
- /// Can be used as a default value for the c_* functions when the call to
- /// c_function is followed by inserting at least one instruction that is
- /// always executed after the ones written by the c* function.
- fn next_inst(&self) -> Patch {
- Patch { hole: Hole::None, entry: self.insts.len() }
- }
-
- fn fill(&mut self, hole: Hole, goto: InstPtr) {
- match hole {
- Hole::None => {}
- Hole::One(pc) => {
- self.insts[pc].fill(goto);
- }
- Hole::Many(holes) => {
- for hole in holes {
- self.fill(hole, goto);
- }
- }
- }
- }
-
- fn fill_to_next(&mut self, hole: Hole) {
- let next = self.insts.len();
- self.fill(hole, next);
- }
-
- fn fill_split(
- &mut self,
- hole: Hole,
- goto1: Option<InstPtr>,
- goto2: Option<InstPtr>,
- ) -> Hole {
- match hole {
- Hole::None => Hole::None,
- Hole::One(pc) => match (goto1, goto2) {
- (Some(goto1), Some(goto2)) => {
- self.insts[pc].fill_split(goto1, goto2);
- Hole::None
- }
- (Some(goto1), None) => {
- self.insts[pc].half_fill_split_goto1(goto1);
- Hole::One(pc)
- }
- (None, Some(goto2)) => {
- self.insts[pc].half_fill_split_goto2(goto2);
- Hole::One(pc)
- }
- (None, None) => unreachable!(
- "at least one of the split \
- holes must be filled"
- ),
- },
- Hole::Many(holes) => {
- let mut new_holes = vec![];
- for hole in holes {
- new_holes.push(self.fill_split(hole, goto1, goto2));
- }
- if new_holes.is_empty() {
- Hole::None
- } else if new_holes.len() == 1 {
- new_holes.pop().unwrap()
- } else {
- Hole::Many(new_holes)
- }
- }
- }
- }
-
- fn push_compiled(&mut self, inst: Inst) {
- self.insts.push(MaybeInst::Compiled(inst));
- }
-
- fn push_hole(&mut self, inst: InstHole) -> Hole {
- let hole = self.insts.len();
- self.insts.push(MaybeInst::Uncompiled(inst));
- Hole::One(hole)
- }
-
- fn push_split_hole(&mut self) -> Hole {
- let hole = self.insts.len();
- self.insts.push(MaybeInst::Split);
- Hole::One(hole)
- }
-
- fn pop_split_hole(&mut self) -> ResultOrEmpty {
- self.insts.pop();
- Ok(None)
- }
-
- fn check_size(&self) -> result::Result<(), Error> {
- use std::mem::size_of;
-
- let size =
- self.extra_inst_bytes + (self.insts.len() * size_of::<Inst>());
- if size > self.size_limit {
- Err(Error::CompiledTooBig(self.size_limit))
- } else {
- Ok(())
- }
- }
-}
-
-#[derive(Debug)]
-enum Hole {
- None,
- One(InstPtr),
- Many(Vec<Hole>),
-}
-
-impl Hole {
- fn dup_one(self) -> (Self, Self) {
- match self {
- Hole::One(pc) => (Hole::One(pc), Hole::One(pc)),
- Hole::None | Hole::Many(_) => {
- unreachable!("must be called on single hole")
- }
- }
- }
-}
-
-#[derive(Clone, Debug)]
-enum MaybeInst {
- Compiled(Inst),
- Uncompiled(InstHole),
- Split,
- Split1(InstPtr),
- Split2(InstPtr),
-}
-
-impl MaybeInst {
- fn fill(&mut self, goto: InstPtr) {
- let maybeinst = match *self {
- MaybeInst::Split => MaybeInst::Split1(goto),
- MaybeInst::Uncompiled(ref inst) => {
- MaybeInst::Compiled(inst.fill(goto))
- }
- MaybeInst::Split1(goto1) => {
- MaybeInst::Compiled(Inst::Split(InstSplit {
- goto1,
- goto2: goto,
- }))
- }
- MaybeInst::Split2(goto2) => {
- MaybeInst::Compiled(Inst::Split(InstSplit {
- goto1: goto,
- goto2,
- }))
- }
- _ => unreachable!(
- "not all instructions were compiled! \
- found uncompiled instruction: {:?}",
- self
- ),
- };
- *self = maybeinst;
- }
-
- fn fill_split(&mut self, goto1: InstPtr, goto2: InstPtr) {
- let filled = match *self {
- MaybeInst::Split => Inst::Split(InstSplit { goto1, goto2 }),
- _ => unreachable!(
- "must be called on Split instruction, \
- instead it was called on: {:?}",
- self
- ),
- };
- *self = MaybeInst::Compiled(filled);
- }
-
- fn half_fill_split_goto1(&mut self, goto1: InstPtr) {
- let half_filled = match *self {
- MaybeInst::Split => goto1,
- _ => unreachable!(
- "must be called on Split instruction, \
- instead it was called on: {:?}",
- self
- ),
- };
- *self = MaybeInst::Split1(half_filled);
- }
-
- fn half_fill_split_goto2(&mut self, goto2: InstPtr) {
- let half_filled = match *self {
- MaybeInst::Split => goto2,
- _ => unreachable!(
- "must be called on Split instruction, \
- instead it was called on: {:?}",
- self
- ),
- };
- *self = MaybeInst::Split2(half_filled);
- }
-
- fn unwrap(self) -> Inst {
- match self {
- MaybeInst::Compiled(inst) => inst,
- _ => unreachable!(
- "must be called on a compiled instruction, \
- instead it was called on: {:?}",
- self
- ),
- }
- }
-}
-
-#[derive(Clone, Debug)]
-enum InstHole {
- Save { slot: usize },
- EmptyLook { look: EmptyLook },
- Char { c: char },
- Ranges { ranges: Vec<(char, char)> },
- Bytes { start: u8, end: u8 },
-}
-
-impl InstHole {
- fn fill(&self, goto: InstPtr) -> Inst {
- match *self {
- InstHole::Save { slot } => Inst::Save(InstSave { goto, slot }),
- InstHole::EmptyLook { look } => {
- Inst::EmptyLook(InstEmptyLook { goto, look })
- }
- InstHole::Char { c } => Inst::Char(InstChar { goto, c }),
- InstHole::Ranges { ref ranges } => Inst::Ranges(InstRanges {
- goto,
- ranges: ranges.clone().into_boxed_slice(),
- }),
- InstHole::Bytes { start, end } => {
- Inst::Bytes(InstBytes { goto, start, end })
- }
- }
- }
-}
-
-struct CompileClass<'a, 'b> {
- c: &'a mut Compiler,
- ranges: &'b [hir::ClassUnicodeRange],
-}
-
-impl<'a, 'b> CompileClass<'a, 'b> {
- fn compile(mut self) -> Result {
- let mut holes = vec![];
- let mut initial_entry = None;
- let mut last_split = Hole::None;
- let mut utf8_seqs = self.c.utf8_seqs.take().unwrap();
- self.c.suffix_cache.clear();
-
- for (i, range) in self.ranges.iter().enumerate() {
- let is_last_range = i + 1 == self.ranges.len();
- utf8_seqs.reset(range.start(), range.end());
- let mut it = (&mut utf8_seqs).peekable();
- loop {
- let utf8_seq = match it.next() {
- None => break,
- Some(utf8_seq) => utf8_seq,
- };
- if is_last_range && it.peek().is_none() {
- let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?;
- holes.push(hole);
- self.c.fill(last_split, entry);
- last_split = Hole::None;
- if initial_entry.is_none() {
- initial_entry = Some(entry);
- }
- } else {
- if initial_entry.is_none() {
- initial_entry = Some(self.c.insts.len());
- }
- self.c.fill_to_next(last_split);
- last_split = self.c.push_split_hole();
- let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?;
- holes.push(hole);
- last_split =
- self.c.fill_split(last_split, Some(entry), None);
- }
- }
- }
- self.c.utf8_seqs = Some(utf8_seqs);
- Ok(Patch { hole: Hole::Many(holes), entry: initial_entry.unwrap() })
- }
-
- fn c_utf8_seq(&mut self, seq: &Utf8Sequence) -> Result {
- if self.c.compiled.is_reverse {
- self.c_utf8_seq_(seq)
- } else {
- self.c_utf8_seq_(seq.into_iter().rev())
- }
- }
-
- fn c_utf8_seq_<'r, I>(&mut self, seq: I) -> Result
- where
- I: IntoIterator<Item = &'r Utf8Range>,
- {
- // The initial instruction for each UTF-8 sequence should be the same.
- let mut from_inst = ::std::usize::MAX;
- let mut last_hole = Hole::None;
- for byte_range in seq {
- let key = SuffixCacheKey {
- from_inst,
- start: byte_range.start,
- end: byte_range.end,
- };
- {
- let pc = self.c.insts.len();
- if let Some(cached_pc) = self.c.suffix_cache.get(key, pc) {
- from_inst = cached_pc;
- continue;
- }
- }
- self.c.byte_classes.set_range(byte_range.start, byte_range.end);
- if from_inst == ::std::usize::MAX {
- last_hole = self.c.push_hole(InstHole::Bytes {
- start: byte_range.start,
- end: byte_range.end,
- });
- } else {
- self.c.push_compiled(Inst::Bytes(InstBytes {
- goto: from_inst,
- start: byte_range.start,
- end: byte_range.end,
- }));
- }
- from_inst = self.c.insts.len().checked_sub(1).unwrap();
- debug_assert!(from_inst < ::std::usize::MAX);
- }
- debug_assert!(from_inst < ::std::usize::MAX);
- Ok(Patch { hole: last_hole, entry: from_inst })
- }
-}
-
-/// `SuffixCache` is a simple bounded hash map for caching suffix entries in
-/// UTF-8 automata. For example, consider the Unicode range \u{0}-\u{FFFF}.
-/// The set of byte ranges looks like this:
-///
-/// [0-7F]
-/// [C2-DF][80-BF]
-/// [E0][A0-BF][80-BF]
-/// [E1-EC][80-BF][80-BF]
-/// [ED][80-9F][80-BF]
-/// [EE-EF][80-BF][80-BF]
-///
-/// Each line above translates to one alternate in the compiled regex program.
-/// However, all but one of the alternates end in the same suffix, which is
-/// a waste of an instruction. The suffix cache facilitates reusing them across
-/// alternates.
-///
-/// Note that a HashMap could be trivially used for this, but we don't need its
-/// overhead. Some small bounded space (LRU style) is more than enough.
-///
-/// This uses similar idea to [`SparseSet`](../sparse/struct.SparseSet.html),
-/// except it uses hashes as original indices and then compares full keys for
-/// validation against `dense` array.
-#[derive(Debug)]
-struct SuffixCache {
- sparse: Box<[usize]>,
- dense: Vec<SuffixCacheEntry>,
-}
-
-#[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
-struct SuffixCacheEntry {
- key: SuffixCacheKey,
- pc: InstPtr,
-}
-
-#[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
-struct SuffixCacheKey {
- from_inst: InstPtr,
- start: u8,
- end: u8,
-}
-
-impl SuffixCache {
- fn new(size: usize) -> Self {
- SuffixCache {
- sparse: vec![0usize; size].into(),
- dense: Vec::with_capacity(size),
- }
- }
-
- fn get(&mut self, key: SuffixCacheKey, pc: InstPtr) -> Option<InstPtr> {
- let hash = self.hash(&key);
- let pos = &mut self.sparse[hash];
- if let Some(entry) = self.dense.get(*pos) {
- if entry.key == key {
- return Some(entry.pc);
- }
- }
- *pos = self.dense.len();
- self.dense.push(SuffixCacheEntry { key, pc });
- None
- }
-
- fn clear(&mut self) {
- self.dense.clear();
- }
-
- fn hash(&self, suffix: &SuffixCacheKey) -> usize {
- // Basic FNV-1a hash as described:
- // https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
- const FNV_PRIME: u64 = 1_099_511_628_211;
- let mut h = 14_695_981_039_346_656_037;
- h = (h ^ (suffix.from_inst as u64)).wrapping_mul(FNV_PRIME);
- h = (h ^ (suffix.start as u64)).wrapping_mul(FNV_PRIME);
- h = (h ^ (suffix.end as u64)).wrapping_mul(FNV_PRIME);
- (h as usize) % self.sparse.len()
- }
-}
-
-struct ByteClassSet([bool; 256]);
-
-impl ByteClassSet {
- fn new() -> Self {
- ByteClassSet([false; 256])
- }
-
- fn set_range(&mut self, start: u8, end: u8) {
- debug_assert!(start <= end);
- if start > 0 {
- self.0[start as usize - 1] = true;
- }
- self.0[end as usize] = true;
- }
-
- fn set_word_boundary(&mut self) {
- // We need to mark all ranges of bytes whose pairs result in
- // evaluating \b differently.
- let iswb = is_word_byte;
- let mut b1: u16 = 0;
- let mut b2: u16;
- while b1 <= 255 {
- b2 = b1 + 1;
- while b2 <= 255 && iswb(b1 as u8) == iswb(b2 as u8) {
- b2 += 1;
- }
- self.set_range(b1 as u8, (b2 - 1) as u8);
- b1 = b2;
- }
- }
-
- fn byte_classes(&self) -> Vec<u8> {
- // N.B. If you're debugging the DFA, it's useful to simply return
- // `(0..256).collect()`, which effectively removes the byte classes
- // and makes the transitions easier to read.
- // (0usize..256).map(|x| x as u8).collect()
- let mut byte_classes = vec![0; 256];
- let mut class = 0u8;
- let mut i = 0;
- loop {
- byte_classes[i] = class as u8;
- if i >= 255 {
- break;
- }
- if self.0[i] {
- class = class.checked_add(1).unwrap();
- }
- i += 1;
- }
- byte_classes
- }
-}
-
-impl fmt::Debug for ByteClassSet {
- fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
- f.debug_tuple("ByteClassSet").field(&&self.0[..]).finish()
- }
-}
-
-fn u32_to_usize(n: u32) -> usize {
- // In case usize is less than 32 bits, we need to guard against overflow.
- // On most platforms this compiles to nothing.
- // TODO Use `std::convert::TryFrom` once it's stable.
- if (n as u64) > (::std::usize::MAX as u64) {
- panic!("BUG: {} is too big to be pointer sized", n)
- }
- n as usize
-}
-
-#[cfg(test)]
-mod tests {
- use super::ByteClassSet;
-
- #[test]
- fn byte_classes() {
- let mut set = ByteClassSet::new();
- set.set_range(b'a', b'z');
- let classes = set.byte_classes();
- assert_eq!(classes[0], 0);
- assert_eq!(classes[1], 0);
- assert_eq!(classes[2], 0);
- assert_eq!(classes[b'a' as usize - 1], 0);
- assert_eq!(classes[b'a' as usize], 1);
- assert_eq!(classes[b'm' as usize], 1);
- assert_eq!(classes[b'z' as usize], 1);
- assert_eq!(classes[b'z' as usize + 1], 2);
- assert_eq!(classes[254], 2);
- assert_eq!(classes[255], 2);
-
- let mut set = ByteClassSet::new();
- set.set_range(0, 2);
- set.set_range(4, 6);
- let classes = set.byte_classes();
- assert_eq!(classes[0], 0);
- assert_eq!(classes[1], 0);
- assert_eq!(classes[2], 0);
- assert_eq!(classes[3], 1);
- assert_eq!(classes[4], 2);
- assert_eq!(classes[5], 2);
- assert_eq!(classes[6], 2);
- assert_eq!(classes[7], 3);
- assert_eq!(classes[255], 3);
- }
-
- #[test]
- fn full_byte_classes() {
- let mut set = ByteClassSet::new();
- for i in 0..256u16 {
- set.set_range(i as u8, i as u8);
- }
- assert_eq!(set.byte_classes().len(), 256);
- }
-}
diff --git a/vendor/regex/src/dfa.rs b/vendor/regex/src/dfa.rs
deleted file mode 100644
index 78ed71021..000000000
--- a/vendor/regex/src/dfa.rs
+++ /dev/null
@@ -1,1945 +0,0 @@
-/*!
-The DFA matching engine.
-
-A DFA provides faster matching because the engine is in exactly one state at
-any point in time. In the NFA, there may be multiple active states, and
-considerable CPU cycles are spent shuffling them around. In finite automata
-speak, the DFA follows epsilon transitions in the regex far less than the NFA.
-
-A DFA is a classic trade off between time and space. The NFA is slower, but
-its memory requirements are typically small and predictable. The DFA is faster,
-but given the right regex and the right input, the number of states in the
-DFA can grow exponentially. To mitigate this space problem, we do two things:
-
-1. We implement an *online* DFA. That is, the DFA is constructed from the NFA
- during a search. When a new state is computed, it is stored in a cache so
- that it may be reused. An important consequence of this implementation
- is that states that are never reached for a particular input are never
- computed. (This is impossible in an "offline" DFA which needs to compute
- all possible states up front.)
-2. If the cache gets too big, we wipe it and continue matching.
-
-In pathological cases, a new state can be created for every byte of input.
-(e.g., The regex `(a|b)*a(a|b){20}` on a long sequence of a's and b's.)
-In this case, performance regresses to slightly slower than the full NFA
-simulation, in large part because the cache becomes useless. If the cache
-is wiped too frequently, the DFA quits and control falls back to one of the
-NFA simulations.
-
-Because of the "lazy" nature of this DFA, the inner matching loop is
-considerably more complex than one might expect out of a DFA. A number of
-tricks are employed to make it fast. Tread carefully.
-
-N.B. While this implementation is heavily commented, Russ Cox's series of
-articles on regexes is strongly recommended: <https://swtch.com/~rsc/regexp/>
-(As is the DFA implementation in RE2, which heavily influenced this
-implementation.)
-*/
-
-use std::collections::HashMap;
-use std::fmt;
-use std::iter::repeat;
-use std::mem;
-use std::sync::Arc;
-
-use crate::exec::ProgramCache;
-use crate::prog::{Inst, Program};
-use crate::sparse::SparseSet;
-
-/// Return true if and only if the given program can be executed by a DFA.
-///
-/// Generally, a DFA is always possible. A pathological case where it is not
-/// possible is if the number of NFA states exceeds `u32::MAX`, in which case,
-/// this function will return false.
-///
-/// This function will also return false if the given program has any Unicode
-/// instructions (Char or Ranges) since the DFA operates on bytes only.
-pub fn can_exec(insts: &Program) -> bool {
- use crate::prog::Inst::*;
- // If for some reason we manage to allocate a regex program with more
- // than i32::MAX instructions, then we can't execute the DFA because we
- // use 32 bit instruction pointer deltas for memory savings.
- // If i32::MAX is the largest positive delta,
- // then -i32::MAX == i32::MIN + 1 is the largest negative delta,
- // and we are OK to use 32 bits.
- if insts.dfa_size_limit == 0 || insts.len() > ::std::i32::MAX as usize {
- return false;
- }
- for inst in insts {
- match *inst {
- Char(_) | Ranges(_) => return false,
- EmptyLook(_) | Match(_) | Save(_) | Split(_) | Bytes(_) => {}
- }
- }
- true
-}
-
-/// A reusable cache of DFA states.
-///
-/// This cache is reused between multiple invocations of the same regex
-/// program. (It is not shared simultaneously between threads. If there is
-/// contention, then new caches are created.)
-#[derive(Debug)]
-pub struct Cache {
- /// Group persistent DFA related cache state together. The sparse sets
- /// listed below are used as scratch space while computing uncached states.
- inner: CacheInner,
- /// qcur and qnext are ordered sets with constant time
- /// addition/membership/clearing-whole-set and linear time iteration. They
- /// are used to manage the sets of NFA states in DFA states when computing
- /// cached DFA states. In particular, the order of the NFA states matters
- /// for leftmost-first style matching. Namely, when computing a cached
- /// state, the set of NFA states stops growing as soon as the first Match
- /// instruction is observed.
- qcur: SparseSet,
- qnext: SparseSet,
-}
-
-/// `CacheInner` is logically just a part of Cache, but groups together fields
-/// that aren't passed as function parameters throughout search. (This split
-/// is mostly an artifact of the borrow checker. It is happily paid.)
-#[derive(Debug)]
-struct CacheInner {
- /// A cache of pre-compiled DFA states, keyed by the set of NFA states
- /// and the set of empty-width flags set at the byte in the input when the
- /// state was observed.
- ///
- /// A StatePtr is effectively a `*State`, but to avoid various inconvenient
- /// things, we just pass indexes around manually. The performance impact of
- /// this is probably an instruction or two in the inner loop. However, on
- /// 64 bit, each StatePtr is half the size of a *State.
- compiled: StateMap,
- /// The transition table.
- ///
- /// The transition table is laid out in row-major order, where states are
- /// rows and the transitions for each state are columns. At a high level,
- /// given state `s` and byte `b`, the next state can be found at index
- /// `s * 256 + b`.
- ///
- /// This is, of course, a lie. A StatePtr is actually a pointer to the
- /// *start* of a row in this table. When indexing in the DFA's inner loop,
- /// this removes the need to multiply the StatePtr by the stride. Yes, it
- /// matters. This reduces the number of states we can store, but: the
- /// stride is rarely 256 since we define transitions in terms of
- /// *equivalence classes* of bytes. Each class corresponds to a set of
- /// bytes that never discriminate a distinct path through the DFA from each
- /// other.
- trans: Transitions,
- /// A set of cached start states, which are limited to the number of
- /// permutations of flags set just before the initial byte of input. (The
- /// index into this vec is a `EmptyFlags`.)
- ///
- /// N.B. A start state can be "dead" (i.e., no possible match), so we
- /// represent it with a StatePtr.
- start_states: Vec<StatePtr>,
- /// Stack scratch space used to follow epsilon transitions in the NFA.
- /// (This permits us to avoid recursion.)
- ///
- /// The maximum stack size is the number of NFA states.
- stack: Vec<InstPtr>,
- /// The total number of times this cache has been flushed by the DFA
- /// because of space constraints.
- flush_count: u64,
- /// The total heap size of the DFA's cache. We use this to determine when
- /// we should flush the cache.
- size: usize,
- /// Scratch space used when building instruction pointer lists for new
- /// states. This helps amortize allocation.
- insts_scratch_space: Vec<u8>,
-}
-
-/// The transition table.
-///
-/// It is laid out in row-major order, with states as rows and byte class
-/// transitions as columns.
-///
-/// The transition table is responsible for producing valid `StatePtrs`. A
-/// `StatePtr` points to the start of a particular row in this table. When
-/// indexing to find the next state this allows us to avoid a multiplication
-/// when computing an index into the table.
-#[derive(Clone)]
-struct Transitions {
- /// The table.
- table: Vec<StatePtr>,
- /// The stride.
- num_byte_classes: usize,
-}
-
-/// Fsm encapsulates the actual execution of the DFA.
-#[derive(Debug)]
-pub struct Fsm<'a> {
- /// prog contains the NFA instruction opcodes. DFA execution uses either
- /// the `dfa` instructions or the `dfa_reverse` instructions from
- /// `exec::ExecReadOnly`. (It never uses `ExecReadOnly.nfa`, which may have
- /// Unicode opcodes that cannot be executed by the DFA.)
- prog: &'a Program,
- /// The start state. We record it here because the pointer may change
- /// when the cache is wiped.
- start: StatePtr,
- /// The current position in the input.
- at: usize,
- /// Should we quit after seeing the first match? e.g., When the caller
- /// uses `is_match` or `shortest_match`.
- quit_after_match: bool,
- /// The last state that matched.
- ///
- /// When no match has occurred, this is set to STATE_UNKNOWN.
- ///
- /// This is only useful when matching regex sets. The last match state
- /// is useful because it contains all of the match instructions seen,
- /// thereby allowing us to enumerate which regexes in the set matched.
- last_match_si: StatePtr,
- /// The input position of the last cache flush. We use this to determine
- /// if we're thrashing in the cache too often. If so, the DFA quits so
- /// that we can fall back to the NFA algorithm.
- last_cache_flush: usize,
- /// All cached DFA information that is persisted between searches.
- cache: &'a mut CacheInner,
-}
-
-/// The result of running the DFA.
-///
-/// Generally, the result is either a match or not a match, but sometimes the
-/// DFA runs too slowly because the cache size is too small. In that case, it
-/// gives up with the intent of falling back to the NFA algorithm.
-///
-/// The DFA can also give up if it runs out of room to create new states, or if
-/// it sees non-ASCII bytes in the presence of a Unicode word boundary.
-#[derive(Clone, Debug)]
-pub enum Result<T> {
- Match(T),
- NoMatch(usize),
- Quit,
-}
-
-impl<T> Result<T> {
- /// Returns true if this result corresponds to a match.
- pub fn is_match(&self) -> bool {
- match *self {
- Result::Match(_) => true,
- Result::NoMatch(_) | Result::Quit => false,
- }
- }
-
- /// Maps the given function onto T and returns the result.
- ///
- /// If this isn't a match, then this is a no-op.
- #[cfg(feature = "perf-literal")]
- pub fn map<U, F: FnMut(T) -> U>(self, mut f: F) -> Result<U> {
- match self {
- Result::Match(t) => Result::Match(f(t)),
- Result::NoMatch(x) => Result::NoMatch(x),
- Result::Quit => Result::Quit,
- }
- }
-
- /// Sets the non-match position.
- ///
- /// If this isn't a non-match, then this is a no-op.
- fn set_non_match(self, at: usize) -> Result<T> {
- match self {
- Result::NoMatch(_) => Result::NoMatch(at),
- r => r,
- }
- }
-}
-
-/// `State` is a DFA state. It contains an ordered set of NFA states (not
-/// necessarily complete) and a smattering of flags.
-///
-/// The flags are packed into the first byte of data.
-///
-/// States don't carry their transitions. Instead, transitions are stored in
-/// a single row-major table.
-///
-/// Delta encoding is used to store the instruction pointers.
-/// The first instruction pointer is stored directly starting
-/// at data[1], and each following pointer is stored as an offset
-/// to the previous one. If a delta is in the range -127..127,
-/// it is packed into a single byte; Otherwise the byte 128 (-128 as an i8)
-/// is coded as a flag, followed by 4 bytes encoding the delta.
-#[derive(Clone, Eq, Hash, PartialEq)]
-struct State {
- data: Arc<[u8]>,
-}
-
-/// `InstPtr` is a 32 bit pointer into a sequence of opcodes (i.e., it indexes
-/// an NFA state).
-///
-/// Throughout this library, this is usually set to `usize`, but we force a
-/// `u32` here for the DFA to save on space.
-type InstPtr = u32;
-
-/// Adds ip to data using delta encoding with respect to prev.
-///
-/// After completion, `data` will contain `ip` and `prev` will be set to `ip`.
-fn push_inst_ptr(data: &mut Vec<u8>, prev: &mut InstPtr, ip: InstPtr) {
- let delta = (ip as i32) - (*prev as i32);
- write_vari32(data, delta);
- *prev = ip;
-}
-
-struct InstPtrs<'a> {
- base: usize,
- data: &'a [u8],
-}
-
-impl<'a> Iterator for InstPtrs<'a> {
- type Item = usize;
-
- fn next(&mut self) -> Option<usize> {
- if self.data.is_empty() {
- return None;
- }
- let (delta, nread) = read_vari32(self.data);
- let base = self.base as i32 + delta;
- debug_assert!(base >= 0);
- debug_assert!(nread > 0);
- self.data = &self.data[nread..];
- self.base = base as usize;
- Some(self.base)
- }
-}
-
-impl State {
- fn flags(&self) -> StateFlags {
- StateFlags(self.data[0])
- }
-
- fn inst_ptrs(&self) -> InstPtrs<'_> {
- InstPtrs { base: 0, data: &self.data[1..] }
- }
-}
-
-/// `StatePtr` is a 32 bit pointer to the start of a row in the transition
-/// table.
-///
-/// It has many special values. There are two types of special values:
-/// sentinels and flags.
-///
-/// Sentinels corresponds to special states that carry some kind of
-/// significance. There are three such states: unknown, dead and quit states.
-///
-/// Unknown states are states that haven't been computed yet. They indicate
-/// that a transition should be filled in that points to either an existing
-/// cached state or a new state altogether. In general, an unknown state means
-/// "follow the NFA's epsilon transitions."
-///
-/// Dead states are states that can never lead to a match, no matter what
-/// subsequent input is observed. This means that the DFA should quit
-/// immediately and return the longest match it has found thus far.
-///
-/// Quit states are states that imply the DFA is not capable of matching the
-/// regex correctly. Currently, this is only used when a Unicode word boundary
-/// exists in the regex *and* a non-ASCII byte is observed.
-///
-/// The other type of state pointer is a state pointer with special flag bits.
-/// There are two flags: a start flag and a match flag. The lower bits of both
-/// kinds always contain a "valid" `StatePtr` (indicated by the `STATE_MAX`
-/// mask).
-///
-/// The start flag means that the state is a start state, and therefore may be
-/// subject to special prefix scanning optimizations.
-///
-/// The match flag means that the state is a match state, and therefore the
-/// current position in the input (while searching) should be recorded.
-///
-/// The above exists mostly in the service of making the inner loop fast.
-/// In particular, the inner *inner* loop looks something like this:
-///
-/// ```ignore
-/// while state <= STATE_MAX and i < len(text):
-/// state = state.next[i]
-/// ```
-///
-/// This is nice because it lets us execute a lazy DFA as if it were an
-/// entirely offline DFA (i.e., with very few instructions). The loop will
-/// quit only when we need to examine a case that needs special attention.
-type StatePtr = u32;
-
-/// An unknown state means that the state has not been computed yet, and that
-/// the only way to progress is to compute it.
-const STATE_UNKNOWN: StatePtr = 1 << 31;
-
-/// A dead state means that the state has been computed and it is known that
-/// once it is entered, no future match can ever occur.
-const STATE_DEAD: StatePtr = STATE_UNKNOWN + 1;
-
-/// A quit state means that the DFA came across some input that it doesn't
-/// know how to process correctly. The DFA should quit and another matching
-/// engine should be run in its place.
-const STATE_QUIT: StatePtr = STATE_DEAD + 1;
-
-/// A start state is a state that the DFA can start in.
-///
-/// Note that start states have their lower bits set to a state pointer.
-const STATE_START: StatePtr = 1 << 30;
-
-/// A match state means that the regex has successfully matched.
-///
-/// Note that match states have their lower bits set to a state pointer.
-const STATE_MATCH: StatePtr = 1 << 29;
-
-/// The maximum state pointer. This is useful to mask out the "valid" state
-/// pointer from a state with the "start" or "match" bits set.
-///
-/// It doesn't make sense to use this with unknown, dead or quit state
-/// pointers, since those pointers are sentinels and never have their lower
-/// bits set to anything meaningful.
-const STATE_MAX: StatePtr = STATE_MATCH - 1;
-
-/// Byte is a u8 in spirit, but a u16 in practice so that we can represent the
-/// special EOF sentinel value.
-#[derive(Copy, Clone, Debug)]
-struct Byte(u16);
-
-/// A set of flags for zero-width assertions.
-#[derive(Clone, Copy, Eq, Debug, Default, Hash, PartialEq)]
-struct EmptyFlags {
- start: bool,
- end: bool,
- start_line: bool,
- end_line: bool,
- word_boundary: bool,
- not_word_boundary: bool,
-}
-
-/// A set of flags describing various configurations of a DFA state. This is
-/// represented by a `u8` so that it is compact.
-#[derive(Clone, Copy, Eq, Default, Hash, PartialEq)]
-struct StateFlags(u8);
-
-impl Cache {
- /// Create new empty cache for the DFA engine.
- pub fn new(prog: &Program) -> Self {
- // We add 1 to account for the special EOF byte.
- let num_byte_classes = (prog.byte_classes[255] as usize + 1) + 1;
- let starts = vec![STATE_UNKNOWN; 256];
- let mut cache = Cache {
- inner: CacheInner {
- compiled: StateMap::new(num_byte_classes),
- trans: Transitions::new(num_byte_classes),
- start_states: starts,
- stack: vec![],
- flush_count: 0,
- size: 0,
- insts_scratch_space: vec![],
- },
- qcur: SparseSet::new(prog.insts.len()),
- qnext: SparseSet::new(prog.insts.len()),
- };
- cache.inner.reset_size();
- cache
- }
-}
-
-impl CacheInner {
- /// Resets the cache size to account for fixed costs, such as the program
- /// and stack sizes.
- fn reset_size(&mut self) {
- self.size = (self.start_states.len() * mem::size_of::<StatePtr>())
- + (self.stack.len() * mem::size_of::<InstPtr>());
- }
-}
-
-impl<'a> Fsm<'a> {
- #[cfg_attr(feature = "perf-inline", inline(always))]
- pub fn forward(
- prog: &'a Program,
- cache: &ProgramCache,
- quit_after_match: bool,
- text: &[u8],
- at: usize,
- ) -> Result<usize> {
- let mut cache = cache.borrow_mut();
- let cache = &mut cache.dfa;
- let mut dfa = Fsm {
- prog,
- start: 0, // filled in below
- at,
- quit_after_match,
- last_match_si: STATE_UNKNOWN,
- last_cache_flush: at,
- cache: &mut cache.inner,
- };
- let (empty_flags, state_flags) = dfa.start_flags(text, at);
- dfa.start =
- match dfa.start_state(&mut cache.qcur, empty_flags, state_flags) {
- None => return Result::Quit,
- Some(STATE_DEAD) => return Result::NoMatch(at),
- Some(si) => si,
- };
- debug_assert!(dfa.start != STATE_UNKNOWN);
- dfa.exec_at(&mut cache.qcur, &mut cache.qnext, text)
- }
-
- #[cfg_attr(feature = "perf-inline", inline(always))]
- pub fn reverse(
- prog: &'a Program,
- cache: &ProgramCache,
- quit_after_match: bool,
- text: &[u8],
- at: usize,
- ) -> Result<usize> {
- let mut cache = cache.borrow_mut();
- let cache = &mut cache.dfa_reverse;
- let mut dfa = Fsm {
- prog,
- start: 0, // filled in below
- at,
- quit_after_match,
- last_match_si: STATE_UNKNOWN,
- last_cache_flush: at,
- cache: &mut cache.inner,
- };
- let (empty_flags, state_flags) = dfa.start_flags_reverse(text, at);
- dfa.start =
- match dfa.start_state(&mut cache.qcur, empty_flags, state_flags) {
- None => return Result::Quit,
- Some(STATE_DEAD) => return Result::NoMatch(at),
- Some(si) => si,
- };
- debug_assert!(dfa.start != STATE_UNKNOWN);
- dfa.exec_at_reverse(&mut cache.qcur, &mut cache.qnext, text)
- }
-
- #[cfg_attr(feature = "perf-inline", inline(always))]
- pub fn forward_many(
- prog: &'a Program,
- cache: &ProgramCache,
- matches: &mut [bool],
- text: &[u8],
- at: usize,
- ) -> Result<usize> {
- debug_assert!(matches.len() == prog.matches.len());
- let mut cache = cache.borrow_mut();
- let cache = &mut cache.dfa;
- let mut dfa = Fsm {
- prog,
- start: 0, // filled in below
- at,
- quit_after_match: false,
- last_match_si: STATE_UNKNOWN,
- last_cache_flush: at,
- cache: &mut cache.inner,
- };
- let (empty_flags, state_flags) = dfa.start_flags(text, at);
- dfa.start =
- match dfa.start_state(&mut cache.qcur, empty_flags, state_flags) {
- None => return Result::Quit,
- Some(STATE_DEAD) => return Result::NoMatch(at),
- Some(si) => si,
- };
- debug_assert!(dfa.start != STATE_UNKNOWN);
- let result = dfa.exec_at(&mut cache.qcur, &mut cache.qnext, text);
- if result.is_match() {
- if matches.len() == 1 {
- matches[0] = true;
- } else {
- debug_assert!(dfa.last_match_si != STATE_UNKNOWN);
- debug_assert!(dfa.last_match_si != STATE_DEAD);
- for ip in dfa.state(dfa.last_match_si).inst_ptrs() {
- if let Inst::Match(slot) = dfa.prog[ip] {
- matches[slot] = true;
- }
- }
- }
- }
- result
- }
-
- /// Executes the DFA on a forward NFA.
- ///
- /// {qcur,qnext} are scratch ordered sets which may be non-empty.
- #[cfg_attr(feature = "perf-inline", inline(always))]
- fn exec_at(
- &mut self,
- qcur: &mut SparseSet,
- qnext: &mut SparseSet,
- text: &[u8],
- ) -> Result<usize> {
- // For the most part, the DFA is basically:
- //
- // last_match = null
- // while current_byte != EOF:
- // si = current_state.next[current_byte]
- // if si is match
- // last_match = si
- // return last_match
- //
- // However, we need to deal with a few things:
- //
- // 1. This is an *online* DFA, so the current state's next list
- // may not point to anywhere yet, so we must go out and compute
- // them. (They are then cached into the current state's next list
- // to avoid re-computation.)
- // 2. If we come across a state that is known to be dead (i.e., never
- // leads to a match), then we can quit early.
- // 3. If the caller just wants to know if a match occurs, then we
- // can quit as soon as we know we have a match. (Full leftmost
- // first semantics require continuing on.)
- // 4. If we're in the start state, then we can use a pre-computed set
- // of prefix literals to skip quickly along the input.
- // 5. After the input is exhausted, we run the DFA on one symbol
- // that stands for EOF. This is useful for handling empty width
- // assertions.
- // 6. We can't actually do state.next[byte]. Instead, we have to do
- // state.next[byte_classes[byte]], which permits us to keep the
- // 'next' list very small.
- //
- // Since there's a bunch of extra stuff we need to consider, we do some
- // pretty hairy tricks to get the inner loop to run as fast as
- // possible.
- debug_assert!(!self.prog.is_reverse);
-
- // The last match is the currently known ending match position. It is
- // reported as an index to the most recent byte that resulted in a
- // transition to a match state and is always stored in capture slot `1`
- // when searching forwards. Its maximum value is `text.len()`.
- let mut result = Result::NoMatch(self.at);
- let (mut prev_si, mut next_si) = (self.start, self.start);
- let mut at = self.at;
- while at < text.len() {
- // This is the real inner loop. We take advantage of special bits
- // set in the state pointer to determine whether a state is in the
- // "common" case or not. Specifically, the common case is a
- // non-match non-start non-dead state that has already been
- // computed. So long as we remain in the common case, this inner
- // loop will chew through the input.
- //
- // We also unroll the loop 4 times to amortize the cost of checking
- // whether we've consumed the entire input. We are also careful
- // to make sure that `prev_si` always represents the previous state
- // and `next_si` always represents the next state after the loop
- // exits, even if it isn't always true inside the loop.
- while next_si <= STATE_MAX && at < text.len() {
- // Argument for safety is in the definition of next_si.
- prev_si = unsafe { self.next_si(next_si, text, at) };
- at += 1;
- if prev_si > STATE_MAX || at + 2 >= text.len() {
- mem::swap(&mut prev_si, &mut next_si);
- break;
- }
- next_si = unsafe { self.next_si(prev_si, text, at) };
- at += 1;
- if next_si > STATE_MAX {
- break;
- }
- prev_si = unsafe { self.next_si(next_si, text, at) };
- at += 1;
- if prev_si > STATE_MAX {
- mem::swap(&mut prev_si, &mut next_si);
- break;
- }
- next_si = unsafe { self.next_si(prev_si, text, at) };
- at += 1;
- }
- if next_si & STATE_MATCH > 0 {
- // A match state is outside of the common case because it needs
- // special case analysis. In particular, we need to record the
- // last position as having matched and possibly quit the DFA if
- // we don't need to keep matching.
- next_si &= !STATE_MATCH;
- result = Result::Match(at - 1);
- if self.quit_after_match {
- return result;
- }
- self.last_match_si = next_si;
- prev_si = next_si;
-
- // This permits short-circuiting when matching a regex set.
- // In particular, if this DFA state contains only match states,
- // then it's impossible to extend the set of matches since
- // match states are final. Therefore, we can quit.
- if self.prog.matches.len() > 1 {
- let state = self.state(next_si);
- let just_matches =
- state.inst_ptrs().all(|ip| self.prog[ip].is_match());
- if just_matches {
- return result;
- }
- }
-
- // Another inner loop! If the DFA stays in this particular
- // match state, then we can rip through all of the input
- // very quickly, and only recording the match location once
- // we've left this particular state.
- let cur = at;
- while (next_si & !STATE_MATCH) == prev_si
- && at + 2 < text.len()
- {
- // Argument for safety is in the definition of next_si.
- next_si = unsafe {
- self.next_si(next_si & !STATE_MATCH, text, at)
- };
- at += 1;
- }
- if at > cur {
- result = Result::Match(at - 2);
- }
- } else if next_si & STATE_START > 0 {
- // A start state isn't in the common case because we may
- // want to do quick prefix scanning. If the program doesn't
- // have a detected prefix, then start states are actually
- // considered common and this case is never reached.
- debug_assert!(self.has_prefix());
- next_si &= !STATE_START;
- prev_si = next_si;
- at = match self.prefix_at(text, at) {
- None => return Result::NoMatch(text.len()),
- Some(i) => i,
- };
- } else if next_si >= STATE_UNKNOWN {
- if next_si == STATE_QUIT {
- return Result::Quit;
- }
- // Finally, this corresponds to the case where the transition
- // entered a state that can never lead to a match or a state
- // that hasn't been computed yet. The latter being the "slow"
- // path.
- let byte = Byte::byte(text[at - 1]);
- // We no longer care about the special bits in the state
- // pointer.
- prev_si &= STATE_MAX;
- // Record where we are. This is used to track progress for
- // determining whether we should quit if we've flushed the
- // cache too much.
- self.at = at;
- next_si = match self.next_state(qcur, qnext, prev_si, byte) {
- None => return Result::Quit,
- Some(STATE_DEAD) => return result.set_non_match(at),
- Some(si) => si,
- };
- debug_assert!(next_si != STATE_UNKNOWN);
- if next_si & STATE_MATCH > 0 {
- next_si &= !STATE_MATCH;
- result = Result::Match(at - 1);
- if self.quit_after_match {
- return result;
- }
- self.last_match_si = next_si;
- }
- prev_si = next_si;
- } else {
- prev_si = next_si;
- }
- }
-
- // Run the DFA once more on the special EOF sentinel value.
- // We don't care about the special bits in the state pointer any more,
- // so get rid of them.
- prev_si &= STATE_MAX;
- prev_si = match self.next_state(qcur, qnext, prev_si, Byte::eof()) {
- None => return Result::Quit,
- Some(STATE_DEAD) => return result.set_non_match(text.len()),
- Some(si) => si & !STATE_START,
- };
- debug_assert!(prev_si != STATE_UNKNOWN);
- if prev_si & STATE_MATCH > 0 {
- prev_si &= !STATE_MATCH;
- self.last_match_si = prev_si;
- result = Result::Match(text.len());
- }
- result
- }
-
- /// Executes the DFA on a reverse NFA.
- #[cfg_attr(feature = "perf-inline", inline(always))]
- fn exec_at_reverse(
- &mut self,
- qcur: &mut SparseSet,
- qnext: &mut SparseSet,
- text: &[u8],
- ) -> Result<usize> {
- // The comments in `exec_at` above mostly apply here too. The main
- // difference is that we move backwards over the input and we look for
- // the longest possible match instead of the leftmost-first match.
- //
- // N.B. The code duplication here is regrettable. Efforts to improve
- // it without sacrificing performance are welcome. ---AG
- debug_assert!(self.prog.is_reverse);
- let mut result = Result::NoMatch(self.at);
- let (mut prev_si, mut next_si) = (self.start, self.start);
- let mut at = self.at;
- while at > 0 {
- while next_si <= STATE_MAX && at > 0 {
- // Argument for safety is in the definition of next_si.
- at -= 1;
- prev_si = unsafe { self.next_si(next_si, text, at) };
- if prev_si > STATE_MAX || at <= 4 {
- mem::swap(&mut prev_si, &mut next_si);
- break;
- }
- at -= 1;
- next_si = unsafe { self.next_si(prev_si, text, at) };
- if next_si > STATE_MAX {
- break;
- }
- at -= 1;
- prev_si = unsafe { self.next_si(next_si, text, at) };
- if prev_si > STATE_MAX {
- mem::swap(&mut prev_si, &mut next_si);
- break;
- }
- at -= 1;
- next_si = unsafe { self.next_si(prev_si, text, at) };
- }
- if next_si & STATE_MATCH > 0 {
- next_si &= !STATE_MATCH;
- result = Result::Match(at + 1);
- if self.quit_after_match {
- return result;
- }
- self.last_match_si = next_si;
- prev_si = next_si;
- let cur = at;
- while (next_si & !STATE_MATCH) == prev_si && at >= 2 {
- // Argument for safety is in the definition of next_si.
- at -= 1;
- next_si = unsafe {
- self.next_si(next_si & !STATE_MATCH, text, at)
- };
- }
- if at < cur {
- result = Result::Match(at + 2);
- }
- } else if next_si >= STATE_UNKNOWN {
- if next_si == STATE_QUIT {
- return Result::Quit;
- }
- let byte = Byte::byte(text[at]);
- prev_si &= STATE_MAX;
- self.at = at;
- next_si = match self.next_state(qcur, qnext, prev_si, byte) {
- None => return Result::Quit,
- Some(STATE_DEAD) => return result.set_non_match(at),
- Some(si) => si,
- };
- debug_assert!(next_si != STATE_UNKNOWN);
- if next_si & STATE_MATCH > 0 {
- next_si &= !STATE_MATCH;
- result = Result::Match(at + 1);
- if self.quit_after_match {
- return result;
- }
- self.last_match_si = next_si;
- }
- prev_si = next_si;
- } else {
- prev_si = next_si;
- }
- }
-
- // Run the DFA once more on the special EOF sentinel value.
- prev_si = match self.next_state(qcur, qnext, prev_si, Byte::eof()) {
- None => return Result::Quit,
- Some(STATE_DEAD) => return result.set_non_match(0),
- Some(si) => si,
- };
- debug_assert!(prev_si != STATE_UNKNOWN);
- if prev_si & STATE_MATCH > 0 {
- prev_si &= !STATE_MATCH;
- self.last_match_si = prev_si;
- result = Result::Match(0);
- }
- result
- }
-
- /// next_si transitions to the next state, where the transition input
- /// corresponds to text[i].
- ///
- /// This elides bounds checks, and is therefore not safe.
- #[cfg_attr(feature = "perf-inline", inline(always))]
- unsafe fn next_si(&self, si: StatePtr, text: &[u8], i: usize) -> StatePtr {
- // What is the argument for safety here?
- // We have three unchecked accesses that could possibly violate safety:
- //
- // 1. The given byte of input (`text[i]`).
- // 2. The class of the byte of input (`classes[text[i]]`).
- // 3. The transition for the class (`trans[si + cls]`).
- //
- // (1) is only safe when calling next_si is guarded by
- // `i < text.len()`.
- //
- // (2) is the easiest case to guarantee since `text[i]` is always a
- // `u8` and `self.prog.byte_classes` always has length `u8::MAX`.
- // (See `ByteClassSet.byte_classes` in `compile.rs`.)
- //
- // (3) is only safe if (1)+(2) are safe. Namely, the transitions
- // of every state are defined to have length equal to the number of
- // byte classes in the program. Therefore, a valid class leads to a
- // valid transition. (All possible transitions are valid lookups, even
- // if it points to a state that hasn't been computed yet.) (3) also
- // relies on `si` being correct, but StatePtrs should only ever be
- // retrieved from the transition table, which ensures they are correct.
- debug_assert!(i < text.len());
- let b = *text.get_unchecked(i);
- debug_assert!((b as usize) < self.prog.byte_classes.len());
- let cls = *self.prog.byte_classes.get_unchecked(b as usize);
- self.cache.trans.next_unchecked(si, cls as usize)
- }
-
- /// Computes the next state given the current state and the current input
- /// byte (which may be EOF).
- ///
- /// If STATE_DEAD is returned, then there is no valid state transition.
- /// This implies that no permutation of future input can lead to a match
- /// state.
- ///
- /// STATE_UNKNOWN can never be returned.
- fn exec_byte(
- &mut self,
- qcur: &mut SparseSet,
- qnext: &mut SparseSet,
- mut si: StatePtr,
- b: Byte,
- ) -> Option<StatePtr> {
- use crate::prog::Inst::*;
-
- // Initialize a queue with the current DFA state's NFA states.
- qcur.clear();
- for ip in self.state(si).inst_ptrs() {
- qcur.insert(ip);
- }
-
- // Before inspecting the current byte, we may need to also inspect
- // whether the position immediately preceding the current byte
- // satisfies the empty assertions found in the current state.
- //
- // We only need to do this step if there are any empty assertions in
- // the current state.
- let is_word_last = self.state(si).flags().is_word();
- let is_word = b.is_ascii_word();
- if self.state(si).flags().has_empty() {
- // Compute the flags immediately preceding the current byte.
- // This means we only care about the "end" or "end line" flags.
- // (The "start" flags are computed immediately following the
- // current byte and are handled below.)
- let mut flags = EmptyFlags::default();
- if b.is_eof() {
- flags.end = true;
- flags.end_line = true;
- } else if b.as_byte().map_or(false, |b| b == b'\n') {
- flags.end_line = true;
- }
- if is_word_last == is_word {
- flags.not_word_boundary = true;
- } else {
- flags.word_boundary = true;
- }
- // Now follow epsilon transitions from every NFA state, but make
- // sure we only follow transitions that satisfy our flags.
- qnext.clear();
- for &ip in &*qcur {
- self.follow_epsilons(usize_to_u32(ip), qnext, flags);
- }
- mem::swap(qcur, qnext);
- }
-
- // Now we set flags for immediately after the current byte. Since start
- // states are processed separately, and are the only states that can
- // have the StartText flag set, we therefore only need to worry about
- // the StartLine flag here.
- //
- // We do also keep track of whether this DFA state contains a NFA state
- // that is a matching state. This is precisely how we delay the DFA
- // matching by one byte in order to process the special EOF sentinel
- // byte. Namely, if this DFA state containing a matching NFA state,
- // then it is the *next* DFA state that is marked as a match.
- let mut empty_flags = EmptyFlags::default();
- let mut state_flags = StateFlags::default();
- empty_flags.start_line = b.as_byte().map_or(false, |b| b == b'\n');
- if b.is_ascii_word() {
- state_flags.set_word();
- }
- // Now follow all epsilon transitions again, but only after consuming
- // the current byte.
- qnext.clear();
- for &ip in &*qcur {
- match self.prog[ip as usize] {
- // These states never happen in a byte-based program.
- Char(_) | Ranges(_) => unreachable!(),
- // These states are handled when following epsilon transitions.
- Save(_) | Split(_) | EmptyLook(_) => {}
- Match(_) => {
- state_flags.set_match();
- if !self.continue_past_first_match() {
- break;
- } else if self.prog.matches.len() > 1
- && !qnext.contains(ip as usize)
- {
- // If we are continuing on to find other matches,
- // then keep a record of the match states we've seen.
- qnext.insert(ip);
- }
- }
- Bytes(ref inst) => {
- if b.as_byte().map_or(false, |b| inst.matches(b)) {
- self.follow_epsilons(
- inst.goto as InstPtr,
- qnext,
- empty_flags,
- );
- }
- }
- }
- }
-
- let cache = if b.is_eof() && self.prog.matches.len() > 1 {
- // If we're processing the last byte of the input and we're
- // matching a regex set, then make the next state contain the
- // previous states transitions. We do this so that the main
- // matching loop can extract all of the match instructions.
- mem::swap(qcur, qnext);
- // And don't cache this state because it's totally bunk.
- false
- } else {
- true
- };
-
- // We've now built up the set of NFA states that ought to comprise the
- // next DFA state, so try to find it in the cache, and if it doesn't
- // exist, cache it.
- //
- // N.B. We pass `&mut si` here because the cache may clear itself if
- // it has gotten too full. When that happens, the location of the
- // current state may change.
- let mut next =
- match self.cached_state(qnext, state_flags, Some(&mut si)) {
- None => return None,
- Some(next) => next,
- };
- if (self.start & !STATE_START) == next {
- // Start states can never be match states since all matches are
- // delayed by one byte.
- debug_assert!(!self.state(next).flags().is_match());
- next = self.start_ptr(next);
- }
- if next <= STATE_MAX && self.state(next).flags().is_match() {
- next |= STATE_MATCH;
- }
- debug_assert!(next != STATE_UNKNOWN);
- // And now store our state in the current state's next list.
- if cache {
- let cls = self.byte_class(b);
- self.cache.trans.set_next(si, cls, next);
- }
- Some(next)
- }
-
- /// Follows the epsilon transitions starting at (and including) `ip`. The
- /// resulting states are inserted into the ordered set `q`.
- ///
- /// Conditional epsilon transitions (i.e., empty width assertions) are only
- /// followed if they are satisfied by the given flags, which should
- /// represent the flags set at the current location in the input.
- ///
- /// If the current location corresponds to the empty string, then only the
- /// end line and/or end text flags may be set. If the current location
- /// corresponds to a real byte in the input, then only the start line
- /// and/or start text flags may be set.
- ///
- /// As an exception to the above, when finding the initial state, any of
- /// the above flags may be set:
- ///
- /// If matching starts at the beginning of the input, then start text and
- /// start line should be set. If the input is empty, then end text and end
- /// line should also be set.
- ///
- /// If matching starts after the beginning of the input, then only start
- /// line should be set if the preceding byte is `\n`. End line should never
- /// be set in this case. (Even if the following byte is a `\n`, it will
- /// be handled in a subsequent DFA state.)
- fn follow_epsilons(
- &mut self,
- ip: InstPtr,
- q: &mut SparseSet,
- flags: EmptyFlags,
- ) {
- use crate::prog::EmptyLook::*;
- use crate::prog::Inst::*;
-
- // We need to traverse the NFA to follow epsilon transitions, so avoid
- // recursion with an explicit stack.
- self.cache.stack.push(ip);
- while let Some(mut ip) = self.cache.stack.pop() {
- // Try to munch through as many states as possible without
- // pushes/pops to the stack.
- loop {
- // Don't visit states we've already added.
- if q.contains(ip as usize) {
- break;
- }
- q.insert(ip as usize);
- match self.prog[ip as usize] {
- Char(_) | Ranges(_) => unreachable!(),
- Match(_) | Bytes(_) => {
- break;
- }
- EmptyLook(ref inst) => {
- // Only follow empty assertion states if our flags
- // satisfy the assertion.
- match inst.look {
- StartLine if flags.start_line => {
- ip = inst.goto as InstPtr;
- }
- EndLine if flags.end_line => {
- ip = inst.goto as InstPtr;
- }
- StartText if flags.start => {
- ip = inst.goto as InstPtr;
- }
- EndText if flags.end => {
- ip = inst.goto as InstPtr;
- }
- WordBoundaryAscii if flags.word_boundary => {
- ip = inst.goto as InstPtr;
- }
- NotWordBoundaryAscii
- if flags.not_word_boundary =>
- {
- ip = inst.goto as InstPtr;
- }
- WordBoundary if flags.word_boundary => {
- ip = inst.goto as InstPtr;
- }
- NotWordBoundary if flags.not_word_boundary => {
- ip = inst.goto as InstPtr;
- }
- StartLine | EndLine | StartText | EndText
- | WordBoundaryAscii | NotWordBoundaryAscii
- | WordBoundary | NotWordBoundary => {
- break;
- }
- }
- }
- Save(ref inst) => {
- ip = inst.goto as InstPtr;
- }
- Split(ref inst) => {
- self.cache.stack.push(inst.goto2 as InstPtr);
- ip = inst.goto1 as InstPtr;
- }
- }
- }
- }
- }
-
- /// Find a previously computed state matching the given set of instructions
- /// and is_match bool.
- ///
- /// The given set of instructions should represent a single state in the
- /// NFA along with all states reachable without consuming any input.
- ///
- /// The is_match bool should be true if and only if the preceding DFA state
- /// contains an NFA matching state. The cached state produced here will
- /// then signify a match. (This enables us to delay a match by one byte,
- /// in order to account for the EOF sentinel byte.)
- ///
- /// If the cache is full, then it is wiped before caching a new state.
- ///
- /// The current state should be specified if it exists, since it will need
- /// to be preserved if the cache clears itself. (Start states are
- /// always saved, so they should not be passed here.) It takes a mutable
- /// pointer to the index because if the cache is cleared, the state's
- /// location may change.
- fn cached_state(
- &mut self,
- q: &SparseSet,
- mut state_flags: StateFlags,
- current_state: Option<&mut StatePtr>,
- ) -> Option<StatePtr> {
- // If we couldn't come up with a non-empty key to represent this state,
- // then it is dead and can never lead to a match.
- //
- // Note that inst_flags represent the set of empty width assertions
- // in q. We use this as an optimization in exec_byte to determine when
- // we should follow epsilon transitions at the empty string preceding
- // the current byte.
- let key = match self.cached_state_key(q, &mut state_flags) {
- None => return Some(STATE_DEAD),
- Some(v) => v,
- };
- // In the cache? Cool. Done.
- if let Some(si) = self.cache.compiled.get_ptr(&key) {
- return Some(si);
- }
- // If the cache has gotten too big, wipe it.
- if self.approximate_size() > self.prog.dfa_size_limit
- && !self.clear_cache_and_save(current_state)
- {
- // Ooops. DFA is giving up.
- return None;
- }
- // Allocate room for our state and add it.
- self.add_state(key)
- }
-
- /// Produces a key suitable for describing a state in the DFA cache.
- ///
- /// The key invariant here is that equivalent keys are produced for any two
- /// sets of ordered NFA states (and toggling of whether the previous NFA
- /// states contain a match state) that do not discriminate a match for any
- /// input.
- ///
- /// Specifically, q should be an ordered set of NFA states and is_match
- /// should be true if and only if the previous NFA states contained a match
- /// state.
- fn cached_state_key(
- &mut self,
- q: &SparseSet,
- state_flags: &mut StateFlags,
- ) -> Option<State> {
- use crate::prog::Inst::*;
-
- // We need to build up enough information to recognize pre-built states
- // in the DFA. Generally speaking, this includes every instruction
- // except for those which are purely epsilon transitions, e.g., the
- // Save and Split instructions.
- //
- // Empty width assertions are also epsilon transitions, but since they
- // are conditional, we need to make them part of a state's key in the
- // cache.
-
- let mut insts =
- mem::replace(&mut self.cache.insts_scratch_space, vec![]);
- insts.clear();
- // Reserve 1 byte for flags.
- insts.push(0);
-
- let mut prev = 0;
- for &ip in q {
- let ip = usize_to_u32(ip);
- match self.prog[ip as usize] {
- Char(_) | Ranges(_) => unreachable!(),
- Save(_) | Split(_) => {}
- Bytes(_) => push_inst_ptr(&mut insts, &mut prev, ip),
- EmptyLook(_) => {
- state_flags.set_empty();
- push_inst_ptr(&mut insts, &mut prev, ip)
- }
- Match(_) => {
- push_inst_ptr(&mut insts, &mut prev, ip);
- if !self.continue_past_first_match() {
- break;
- }
- }
- }
- }
- // If we couldn't transition to any other instructions and we didn't
- // see a match when expanding NFA states previously, then this is a
- // dead state and no amount of additional input can transition out
- // of this state.
- let opt_state = if insts.len() == 1 && !state_flags.is_match() {
- None
- } else {
- let StateFlags(f) = *state_flags;
- insts[0] = f;
- Some(State { data: Arc::from(&*insts) })
- };
- self.cache.insts_scratch_space = insts;
- opt_state
- }
-
- /// Clears the cache, but saves and restores current_state if it is not
- /// none.
- ///
- /// The current state must be provided here in case its location in the
- /// cache changes.
- ///
- /// This returns false if the cache is not cleared and the DFA should
- /// give up.
- fn clear_cache_and_save(
- &mut self,
- current_state: Option<&mut StatePtr>,
- ) -> bool {
- if self.cache.compiled.is_empty() {
- // Nothing to clear...
- return true;
- }
- match current_state {
- None => self.clear_cache(),
- Some(si) => {
- let cur = self.state(*si).clone();
- if !self.clear_cache() {
- return false;
- }
- // The unwrap is OK because we just cleared the cache and
- // therefore know that the next state pointer won't exceed
- // STATE_MAX.
- *si = self.restore_state(cur).unwrap();
- true
- }
- }
- }
-
- /// Wipes the state cache, but saves and restores the current start state.
- ///
- /// This returns false if the cache is not cleared and the DFA should
- /// give up.
- fn clear_cache(&mut self) -> bool {
- // Bail out of the DFA if we're moving too "slowly."
- // A heuristic from RE2: assume the DFA is too slow if it is processing
- // 10 or fewer bytes per state.
- // Additionally, we permit the cache to be flushed a few times before
- // caling it quits.
- let nstates = self.cache.compiled.len();
- if self.cache.flush_count >= 3
- && self.at >= self.last_cache_flush
- && (self.at - self.last_cache_flush) <= 10 * nstates
- {
- return false;
- }
- // Update statistics tracking cache flushes.
- self.last_cache_flush = self.at;
- self.cache.flush_count += 1;
-
- // OK, actually flush the cache.
- let start = self.state(self.start & !STATE_START).clone();
- let last_match = if self.last_match_si <= STATE_MAX {
- Some(self.state(self.last_match_si).clone())
- } else {
- None
- };
- self.cache.reset_size();
- self.cache.trans.clear();
- self.cache.compiled.clear();
- for s in &mut self.cache.start_states {
- *s = STATE_UNKNOWN;
- }
- // The unwraps are OK because we just cleared the cache and therefore
- // know that the next state pointer won't exceed STATE_MAX.
- let start_ptr = self.restore_state(start).unwrap();
- self.start = self.start_ptr(start_ptr);
- if let Some(last_match) = last_match {
- self.last_match_si = self.restore_state(last_match).unwrap();
- }
- true
- }
-
- /// Restores the given state back into the cache, and returns a pointer
- /// to it.
- fn restore_state(&mut self, state: State) -> Option<StatePtr> {
- // If we've already stored this state, just return a pointer to it.
- // None will be the wiser.
- if let Some(si) = self.cache.compiled.get_ptr(&state) {
- return Some(si);
- }
- self.add_state(state)
- }
-
- /// Returns the next state given the current state si and current byte
- /// b. {qcur,qnext} are used as scratch space for storing ordered NFA
- /// states.
- ///
- /// This tries to fetch the next state from the cache, but if that fails,
- /// it computes the next state, caches it and returns a pointer to it.
- ///
- /// The pointer can be to a real state, or it can be STATE_DEAD.
- /// STATE_UNKNOWN cannot be returned.
- ///
- /// None is returned if a new state could not be allocated (i.e., the DFA
- /// ran out of space and thinks it's running too slowly).
- fn next_state(
- &mut self,
- qcur: &mut SparseSet,
- qnext: &mut SparseSet,
- si: StatePtr,
- b: Byte,
- ) -> Option<StatePtr> {
- if si == STATE_DEAD {
- return Some(STATE_DEAD);
- }
- match self.cache.trans.next(si, self.byte_class(b)) {
- STATE_UNKNOWN => self.exec_byte(qcur, qnext, si, b),
- STATE_QUIT => None,
- nsi => Some(nsi),
- }
- }
-
- /// Computes and returns the start state, where searching begins at
- /// position `at` in `text`. If the state has already been computed,
- /// then it is pulled from the cache. If the state hasn't been cached,
- /// then it is computed, cached and a pointer to it is returned.
- ///
- /// This may return STATE_DEAD but never STATE_UNKNOWN.
- #[cfg_attr(feature = "perf-inline", inline(always))]
- fn start_state(
- &mut self,
- q: &mut SparseSet,
- empty_flags: EmptyFlags,
- state_flags: StateFlags,
- ) -> Option<StatePtr> {
- // Compute an index into our cache of start states based on the set
- // of empty/state flags set at the current position in the input. We
- // don't use every flag since not all flags matter. For example, since
- // matches are delayed by one byte, start states can never be match
- // states.
- let flagi = {
- (((empty_flags.start as u8) << 0)
- | ((empty_flags.end as u8) << 1)
- | ((empty_flags.start_line as u8) << 2)
- | ((empty_flags.end_line as u8) << 3)
- | ((empty_flags.word_boundary as u8) << 4)
- | ((empty_flags.not_word_boundary as u8) << 5)
- | ((state_flags.is_word() as u8) << 6)) as usize
- };
- match self.cache.start_states[flagi] {
- STATE_UNKNOWN => {}
- si => return Some(si),
- }
- q.clear();
- let start = usize_to_u32(self.prog.start);
- self.follow_epsilons(start, q, empty_flags);
- // Start states can never be match states because we delay every match
- // by one byte. Given an empty string and an empty match, the match
- // won't actually occur until the DFA processes the special EOF
- // sentinel byte.
- let sp = match self.cached_state(q, state_flags, None) {
- None => return None,
- Some(sp) => self.start_ptr(sp),
- };
- self.cache.start_states[flagi] = sp;
- Some(sp)
- }
-
- /// Computes the set of starting flags for the given position in text.
- ///
- /// This should only be used when executing the DFA forwards over the
- /// input.
- fn start_flags(&self, text: &[u8], at: usize) -> (EmptyFlags, StateFlags) {
- let mut empty_flags = EmptyFlags::default();
- let mut state_flags = StateFlags::default();
- empty_flags.start = at == 0;
- empty_flags.end = text.is_empty();
- empty_flags.start_line = at == 0 || text[at - 1] == b'\n';
- empty_flags.end_line = text.is_empty();
-
- let is_word_last = at > 0 && Byte::byte(text[at - 1]).is_ascii_word();
- let is_word = at < text.len() && Byte::byte(text[at]).is_ascii_word();
- if is_word_last {
- state_flags.set_word();
- }
- if is_word == is_word_last {
- empty_flags.not_word_boundary = true;
- } else {
- empty_flags.word_boundary = true;
- }
- (empty_flags, state_flags)
- }
-
- /// Computes the set of starting flags for the given position in text.
- ///
- /// This should only be used when executing the DFA in reverse over the
- /// input.
- fn start_flags_reverse(
- &self,
- text: &[u8],
- at: usize,
- ) -> (EmptyFlags, StateFlags) {
- let mut empty_flags = EmptyFlags::default();
- let mut state_flags = StateFlags::default();
- empty_flags.start = at == text.len();
- empty_flags.end = text.is_empty();
- empty_flags.start_line = at == text.len() || text[at] == b'\n';
- empty_flags.end_line = text.is_empty();
-
- let is_word_last =
- at < text.len() && Byte::byte(text[at]).is_ascii_word();
- let is_word = at > 0 && Byte::byte(text[at - 1]).is_ascii_word();
- if is_word_last {
- state_flags.set_word();
- }
- if is_word == is_word_last {
- empty_flags.not_word_boundary = true;
- } else {
- empty_flags.word_boundary = true;
- }
- (empty_flags, state_flags)
- }
-
- /// Returns a reference to a State given a pointer to it.
- fn state(&self, si: StatePtr) -> &State {
- self.cache.compiled.get_state(si).unwrap()
- }
-
- /// Adds the given state to the DFA.
- ///
- /// This allocates room for transitions out of this state in
- /// self.cache.trans. The transitions can be set with the returned
- /// StatePtr.
- ///
- /// If None is returned, then the state limit was reached and the DFA
- /// should quit.
- fn add_state(&mut self, state: State) -> Option<StatePtr> {
- // This will fail if the next state pointer exceeds STATE_PTR. In
- // practice, the cache limit will prevent us from ever getting here,
- // but maybe callers will set the cache size to something ridiculous...
- let si = match self.cache.trans.add() {
- None => return None,
- Some(si) => si,
- };
- // If the program has a Unicode word boundary, then set any transitions
- // for non-ASCII bytes to STATE_QUIT. If the DFA stumbles over such a
- // transition, then it will quit and an alternative matching engine
- // will take over.
- if self.prog.has_unicode_word_boundary {
- for b in 128..256 {
- let cls = self.byte_class(Byte::byte(b as u8));
- self.cache.trans.set_next(si, cls, STATE_QUIT);
- }
- }
- // Finally, put our actual state on to our heap of states and index it
- // so we can find it later.
- self.cache.size += self.cache.trans.state_heap_size()
- + state.data.len()
- + (2 * mem::size_of::<State>())
- + mem::size_of::<StatePtr>();
- self.cache.compiled.insert(state, si);
- // Transition table and set of states and map should all be in sync.
- debug_assert!(
- self.cache.compiled.len() == self.cache.trans.num_states()
- );
- Some(si)
- }
-
- /// Quickly finds the next occurrence of any literal prefixes in the regex.
- /// If there are no literal prefixes, then the current position is
- /// returned. If there are literal prefixes and one could not be found,
- /// then None is returned.
- ///
- /// This should only be called when the DFA is in a start state.
- fn prefix_at(&self, text: &[u8], at: usize) -> Option<usize> {
- self.prog.prefixes.find(&text[at..]).map(|(s, _)| at + s)
- }
-
- /// Returns the number of byte classes required to discriminate transitions
- /// in each state.
- ///
- /// invariant: num_byte_classes() == len(State.next)
- fn num_byte_classes(&self) -> usize {
- // We add 1 to account for the special EOF byte.
- (self.prog.byte_classes[255] as usize + 1) + 1
- }
-
- /// Given an input byte or the special EOF sentinel, return its
- /// corresponding byte class.
- #[cfg_attr(feature = "perf-inline", inline(always))]
- fn byte_class(&self, b: Byte) -> usize {
- match b.as_byte() {
- None => self.num_byte_classes() - 1,
- Some(b) => self.u8_class(b),
- }
- }
-
- /// Like byte_class, but explicitly for u8s.
- #[cfg_attr(feature = "perf-inline", inline(always))]
- fn u8_class(&self, b: u8) -> usize {
- self.prog.byte_classes[b as usize] as usize
- }
-
- /// Returns true if the DFA should continue searching past the first match.
- ///
- /// Leftmost first semantics in the DFA are preserved by not following NFA
- /// transitions after the first match is seen.
- ///
- /// On occasion, we want to avoid leftmost first semantics to find either
- /// the longest match (for reverse search) or all possible matches (for
- /// regex sets).
- fn continue_past_first_match(&self) -> bool {
- self.prog.is_reverse || self.prog.matches.len() > 1
- }
-
- /// Returns true if there is a prefix we can quickly search for.
- fn has_prefix(&self) -> bool {
- !self.prog.is_reverse
- && !self.prog.prefixes.is_empty()
- && !self.prog.is_anchored_start
- }
-
- /// Sets the STATE_START bit in the given state pointer if and only if
- /// we have a prefix to scan for.
- ///
- /// If there's no prefix, then it's a waste to treat the start state
- /// specially.
- fn start_ptr(&self, si: StatePtr) -> StatePtr {
- if self.has_prefix() {
- si | STATE_START
- } else {
- si
- }
- }
-
- /// Approximate size returns the approximate heap space currently used by
- /// the DFA. It is used to determine whether the DFA's state cache needs to
- /// be wiped. Namely, it is possible that for certain regexes on certain
- /// inputs, a new state could be created for every byte of input. (This is
- /// bad for memory use, so we bound it with a cache.)
- fn approximate_size(&self) -> usize {
- self.cache.size
- }
-}
-
-/// An abstraction for representing a map of states. The map supports two
-/// different ways of state lookup. One is fast constant time access via a
-/// state pointer. The other is a hashmap lookup based on the DFA's
-/// constituent NFA states.
-///
-/// A DFA state internally uses an Arc such that we only need to store the
-/// set of NFA states on the heap once, even though we support looking up
-/// states by two different means. A more natural way to express this might
-/// use raw pointers, but an Arc is safe and effectively achieves the same
-/// thing.
-#[derive(Debug)]
-struct StateMap {
- /// The keys are not actually static but rely on always pointing to a
- /// buffer in `states` which will never be moved except when clearing
- /// the map or on drop, in which case the keys of this map will be
- /// removed before
- map: HashMap<State, StatePtr>,
- /// Our set of states. Note that `StatePtr / num_byte_classes` indexes
- /// this Vec rather than just a `StatePtr`.
- states: Vec<State>,
- /// The number of byte classes in the DFA. Used to index `states`.
- num_byte_classes: usize,
-}
-
-impl StateMap {
- fn new(num_byte_classes: usize) -> StateMap {
- StateMap { map: HashMap::new(), states: vec![], num_byte_classes }
- }
-
- fn len(&self) -> usize {
- self.states.len()
- }
-
- fn is_empty(&self) -> bool {
- self.states.is_empty()
- }
-
- fn get_ptr(&self, state: &State) -> Option<StatePtr> {
- self.map.get(state).cloned()
- }
-
- fn get_state(&self, si: StatePtr) -> Option<&State> {
- self.states.get(si as usize / self.num_byte_classes)
- }
-
- fn insert(&mut self, state: State, si: StatePtr) {
- self.map.insert(state.clone(), si);
- self.states.push(state);
- }
-
- fn clear(&mut self) {
- self.map.clear();
- self.states.clear();
- }
-}
-
-impl Transitions {
- /// Create a new transition table.
- ///
- /// The number of byte classes corresponds to the stride. Every state will
- /// have `num_byte_classes` slots for transitions.
- fn new(num_byte_classes: usize) -> Transitions {
- Transitions { table: vec![], num_byte_classes }
- }
-
- /// Returns the total number of states currently in this table.
- fn num_states(&self) -> usize {
- self.table.len() / self.num_byte_classes
- }
-
- /// Allocates room for one additional state and returns a pointer to it.
- ///
- /// If there's no more room, None is returned.
- fn add(&mut self) -> Option<StatePtr> {
- let si = self.table.len();
- if si > STATE_MAX as usize {
- return None;
- }
- self.table.extend(repeat(STATE_UNKNOWN).take(self.num_byte_classes));
- Some(usize_to_u32(si))
- }
-
- /// Clears the table of all states.
- fn clear(&mut self) {
- self.table.clear();
- }
-
- /// Sets the transition from (si, cls) to next.
- fn set_next(&mut self, si: StatePtr, cls: usize, next: StatePtr) {
- self.table[si as usize + cls] = next;
- }
-
- /// Returns the transition corresponding to (si, cls).
- fn next(&self, si: StatePtr, cls: usize) -> StatePtr {
- self.table[si as usize + cls]
- }
-
- /// The heap size, in bytes, of a single state in the transition table.
- fn state_heap_size(&self) -> usize {
- self.num_byte_classes * mem::size_of::<StatePtr>()
- }
-
- /// Like `next`, but uses unchecked access and is therefore not safe.
- unsafe fn next_unchecked(&self, si: StatePtr, cls: usize) -> StatePtr {
- debug_assert!((si as usize) < self.table.len());
- debug_assert!(cls < self.num_byte_classes);
- *self.table.get_unchecked(si as usize + cls)
- }
-}
-
-impl StateFlags {
- fn is_match(&self) -> bool {
- self.0 & 0b0000_0001 > 0
- }
-
- fn set_match(&mut self) {
- self.0 |= 0b0000_0001;
- }
-
- fn is_word(&self) -> bool {
- self.0 & 0b0000_0010 > 0
- }
-
- fn set_word(&mut self) {
- self.0 |= 0b0000_0010;
- }
-
- fn has_empty(&self) -> bool {
- self.0 & 0b0000_0100 > 0
- }
-
- fn set_empty(&mut self) {
- self.0 |= 0b0000_0100;
- }
-}
-
-impl Byte {
- fn byte(b: u8) -> Self {
- Byte(b as u16)
- }
- fn eof() -> Self {
- Byte(256)
- }
- fn is_eof(&self) -> bool {
- self.0 == 256
- }
-
- fn is_ascii_word(&self) -> bool {
- let b = match self.as_byte() {
- None => return false,
- Some(b) => b,
- };
- match b {
- b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'_' => true,
- _ => false,
- }
- }
-
- fn as_byte(&self) -> Option<u8> {
- if self.is_eof() {
- None
- } else {
- Some(self.0 as u8)
- }
- }
-}
-
-impl fmt::Debug for State {
- fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
- let ips: Vec<usize> = self.inst_ptrs().collect();
- f.debug_struct("State")
- .field("flags", &self.flags())
- .field("insts", &ips)
- .finish()
- }
-}
-
-impl fmt::Debug for Transitions {
- fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
- let mut fmtd = f.debug_map();
- for si in 0..self.num_states() {
- let s = si * self.num_byte_classes;
- let e = s + self.num_byte_classes;
- fmtd.entry(&si.to_string(), &TransitionsRow(&self.table[s..e]));
- }
- fmtd.finish()
- }
-}
-
-struct TransitionsRow<'a>(&'a [StatePtr]);
-
-impl<'a> fmt::Debug for TransitionsRow<'a> {
- fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
- let mut fmtd = f.debug_map();
- for (b, si) in self.0.iter().enumerate() {
- match *si {
- STATE_UNKNOWN => {}
- STATE_DEAD => {
- fmtd.entry(&vb(b as usize), &"DEAD");
- }
- si => {
- fmtd.entry(&vb(b as usize), &si.to_string());
- }
- }
- }
- fmtd.finish()
- }
-}
-
-impl fmt::Debug for StateFlags {
- fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
- f.debug_struct("StateFlags")
- .field("is_match", &self.is_match())
- .field("is_word", &self.is_word())
- .field("has_empty", &self.has_empty())
- .finish()
- }
-}
-
-/// Helper function for formatting a byte as a nice-to-read escaped string.
-fn vb(b: usize) -> String {
- use std::ascii::escape_default;
-
- if b > ::std::u8::MAX as usize {
- "EOF".to_owned()
- } else {
- let escaped = escape_default(b as u8).collect::<Vec<u8>>();
- String::from_utf8_lossy(&escaped).into_owned()
- }
-}
-
-fn usize_to_u32(n: usize) -> u32 {
- if (n as u64) > (::std::u32::MAX as u64) {
- panic!("BUG: {} is too big to fit into u32", n)
- }
- n as u32
-}
-
-#[allow(dead_code)] // useful for debugging
-fn show_state_ptr(si: StatePtr) -> String {
- let mut s = format!("{:?}", si & STATE_MAX);
- if si == STATE_UNKNOWN {
- s = format!("{} (unknown)", s);
- }
- if si == STATE_DEAD {
- s = format!("{} (dead)", s);
- }
- if si == STATE_QUIT {
- s = format!("{} (quit)", s);
- }
- if si & STATE_START > 0 {
- s = format!("{} (start)", s);
- }
- if si & STATE_MATCH > 0 {
- s = format!("{} (match)", s);
- }
- s
-}
-
-/// https://developers.google.com/protocol-buffers/docs/encoding#varints
-fn write_vari32(data: &mut Vec<u8>, n: i32) {
- let mut un = (n as u32) << 1;
- if n < 0 {
- un = !un;
- }
- write_varu32(data, un)
-}
-
-/// https://developers.google.com/protocol-buffers/docs/encoding#varints
-fn read_vari32(data: &[u8]) -> (i32, usize) {
- let (un, i) = read_varu32(data);
- let mut n = (un >> 1) as i32;
- if un & 1 != 0 {
- n = !n;
- }
- (n, i)
-}
-
-/// https://developers.google.com/protocol-buffers/docs/encoding#varints
-fn write_varu32(data: &mut Vec<u8>, mut n: u32) {
- while n >= 0b1000_0000 {
- data.push((n as u8) | 0b1000_0000);
- n >>= 7;
- }
- data.push(n as u8);
-}
-
-/// https://developers.google.com/protocol-buffers/docs/encoding#varints
-fn read_varu32(data: &[u8]) -> (u32, usize) {
- let mut n: u32 = 0;
- let mut shift: u32 = 0;
- for (i, &b) in data.iter().enumerate() {
- if b < 0b1000_0000 {
- return (n | ((b as u32) << shift), i + 1);
- }
- n |= ((b as u32) & 0b0111_1111) << shift;
- shift += 7;
- }
- (0, 0)
-}
-
-#[cfg(test)]
-mod tests {
-
- use super::{
- push_inst_ptr, read_vari32, read_varu32, write_vari32, write_varu32,
- State, StateFlags,
- };
- use quickcheck::{quickcheck, Gen, QuickCheck};
- use std::sync::Arc;
-
- #[test]
- fn prop_state_encode_decode() {
- fn p(mut ips: Vec<u32>, flags: u8) -> bool {
- // It looks like our encoding scheme can't handle instruction
- // pointers at or above 2**31. We should fix that, but it seems
- // unlikely to occur in real code due to the amount of memory
- // required for such a state machine. So for now, we just clamp
- // our test data.
- for ip in &mut ips {
- if *ip >= 1 << 31 {
- *ip = (1 << 31) - 1;
- }
- }
- let mut data = vec![flags];
- let mut prev = 0;
- for &ip in ips.iter() {
- push_inst_ptr(&mut data, &mut prev, ip);
- }
- let state = State { data: Arc::from(&data[..]) };
-
- let expected: Vec<usize> =
- ips.into_iter().map(|ip| ip as usize).collect();
- let got: Vec<usize> = state.inst_ptrs().collect();
- expected == got && state.flags() == StateFlags(flags)
- }
- QuickCheck::new()
- .gen(Gen::new(10_000))
- .quickcheck(p as fn(Vec<u32>, u8) -> bool);
- }
-
- #[test]
- fn prop_read_write_u32() {
- fn p(n: u32) -> bool {
- let mut buf = vec![];
- write_varu32(&mut buf, n);
- let (got, nread) = read_varu32(&buf);
- nread == buf.len() && got == n
- }
- quickcheck(p as fn(u32) -> bool);
- }
-
- #[test]
- fn prop_read_write_i32() {
- fn p(n: i32) -> bool {
- let mut buf = vec![];
- write_vari32(&mut buf, n);
- let (got, nread) = read_vari32(&buf);
- nread == buf.len() && got == n
- }
- quickcheck(p as fn(i32) -> bool);
- }
-}
diff --git a/vendor/regex/src/error.rs b/vendor/regex/src/error.rs
index 6c341f604..6026b3849 100644
--- a/vendor/regex/src/error.rs
+++ b/vendor/regex/src/error.rs
@@ -1,7 +1,9 @@
-use std::fmt;
-use std::iter::repeat;
+use alloc::string::{String, ToString};
+
+use regex_automata::meta;
/// An error that occurred during parsing or compiling a regular expression.
+#[non_exhaustive]
#[derive(Clone, PartialEq)]
pub enum Error {
/// A syntax error.
@@ -27,29 +29,44 @@ pub enum Error {
/// approaches may be appropriate. Instead, you'll have to determine just
/// how big of a regex you want to allow.
CompiledTooBig(usize),
- /// Hints that destructuring should not be exhaustive.
- ///
- /// This enum may grow additional variants, so this makes sure clients
- /// don't count on exhaustive matching. (Otherwise, adding a new variant
- /// could break existing code.)
- #[doc(hidden)]
- __Nonexhaustive,
}
-impl ::std::error::Error for Error {
+impl Error {
+ pub(crate) fn from_meta_build_error(err: meta::BuildError) -> Error {
+ if let Some(size_limit) = err.size_limit() {
+ Error::CompiledTooBig(size_limit)
+ } else if let Some(ref err) = err.syntax_error() {
+ Error::Syntax(err.to_string())
+ } else {
+ // This is a little suspect. Technically there are more ways for
+ // a meta regex to fail to build other than "exceeded size limit"
+ // and "syntax error." For example, if there are too many states
+ // or even too many patterns. But in practice this is probably
+ // good enough. The worst thing that happens is that Error::Syntax
+ // represents an error that isn't technically a syntax error, but
+ // the actual message will still be shown. So... it's not too bad.
+ //
+ // We really should have made the Error type in the regex crate
+ // completely opaque. Rookie mistake.
+ Error::Syntax(err.to_string())
+ }
+ }
+}
+
+#[cfg(feature = "std")]
+impl std::error::Error for Error {
// TODO: Remove this method entirely on the next breaking semver release.
#[allow(deprecated)]
fn description(&self) -> &str {
match *self {
Error::Syntax(ref err) => err,
Error::CompiledTooBig(_) => "compiled program too big",
- Error::__Nonexhaustive => unreachable!(),
}
}
}
-impl fmt::Display for Error {
- fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+impl core::fmt::Display for Error {
+ fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
match *self {
Error::Syntax(ref err) => err.fmt(f),
Error::CompiledTooBig(limit) => write!(
@@ -57,7 +74,6 @@ impl fmt::Display for Error {
"Compiled regex exceeds size limit of {} bytes.",
limit
),
- Error::__Nonexhaustive => unreachable!(),
}
}
}
@@ -66,11 +82,11 @@ impl fmt::Display for Error {
// errors when people use `Regex::new(...).unwrap()`. It's a little weird,
// but the `Syntax` variant is already storing a `String` anyway, so we might
// as well format it nicely.
-impl fmt::Debug for Error {
- fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+impl core::fmt::Debug for Error {
+ fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
match *self {
Error::Syntax(ref err) => {
- let hr: String = repeat('~').take(79).collect();
+ let hr: String = core::iter::repeat('~').take(79).collect();
writeln!(f, "Syntax(")?;
writeln!(f, "{}", hr)?;
writeln!(f, "{}", err)?;
@@ -81,9 +97,6 @@ impl fmt::Debug for Error {
Error::CompiledTooBig(limit) => {
f.debug_tuple("CompiledTooBig").field(&limit).finish()
}
- Error::__Nonexhaustive => {
- f.debug_tuple("__Nonexhaustive").finish()
- }
}
}
}
diff --git a/vendor/regex/src/exec.rs b/vendor/regex/src/exec.rs
deleted file mode 100644
index ee8b589d2..000000000
--- a/vendor/regex/src/exec.rs
+++ /dev/null
@@ -1,1759 +0,0 @@
-use std::cell::RefCell;
-use std::collections::HashMap;
-use std::panic::AssertUnwindSafe;
-use std::sync::Arc;
-
-#[cfg(feature = "perf-literal")]
-use aho_corasick::{AhoCorasick, MatchKind};
-use regex_syntax::hir::literal;
-use regex_syntax::hir::{Hir, Look};
-use regex_syntax::ParserBuilder;
-
-use crate::backtrack;
-use crate::compile::Compiler;
-#[cfg(feature = "perf-dfa")]
-use crate::dfa;
-use crate::error::Error;
-use crate::input::{ByteInput, CharInput};
-use crate::literal::LiteralSearcher;
-use crate::pikevm;
-use crate::pool::{Pool, PoolGuard};
-use crate::prog::Program;
-use crate::re_builder::RegexOptions;
-use crate::re_bytes;
-use crate::re_set;
-use crate::re_trait::{Locations, RegularExpression, Slot};
-use crate::re_unicode;
-use crate::utf8::next_utf8;
-
-/// `Exec` manages the execution of a regular expression.
-///
-/// In particular, this manages the various compiled forms of a single regular
-/// expression and the choice of which matching engine to use to execute a
-/// regular expression.
-#[derive(Debug)]
-pub struct Exec {
- /// All read only state.
- ro: Arc<ExecReadOnly>,
- /// A pool of reusable values for the various matching engines.
- ///
- /// Note that boxing this value is not strictly necessary, but it is an
- /// easy way to ensure that T does not bloat the stack sized used by a pool
- /// in the case where T is big. And this turns out to be the case at the
- /// time of writing for regex's use of this pool. At the time of writing,
- /// the size of a Regex on the stack is 856 bytes. Boxing this value
- /// reduces that size to 16 bytes.
- pool: Box<Pool<ProgramCache>>,
-}
-
-/// `ExecNoSync` is like `Exec`, except it embeds a reference to a cache. This
-/// means it is no longer Sync, but we can now avoid the overhead of
-/// synchronization to fetch the cache.
-#[derive(Debug)]
-pub struct ExecNoSync<'c> {
- /// All read only state.
- ro: &'c Arc<ExecReadOnly>,
- /// Caches for the various matching engines.
- cache: PoolGuard<'c, ProgramCache>,
-}
-
-/// `ExecNoSyncStr` is like `ExecNoSync`, but matches on &str instead of &[u8].
-#[derive(Debug)]
-pub struct ExecNoSyncStr<'c>(ExecNoSync<'c>);
-
-/// `ExecReadOnly` comprises all read only state for a regex. Namely, all such
-/// state is determined at compile time and never changes during search.
-#[derive(Debug)]
-struct ExecReadOnly {
- /// The original regular expressions given by the caller to compile.
- res: Vec<String>,
- /// A compiled program that is used in the NFA simulation and backtracking.
- /// It can be byte-based or Unicode codepoint based.
- ///
- /// N.B. It is not possibly to make this byte-based from the public API.
- /// It is only used for testing byte based programs in the NFA simulations.
- nfa: Program,
- /// A compiled byte based program for DFA execution. This is only used
- /// if a DFA can be executed. (Currently, only word boundary assertions are
- /// not supported.) Note that this program contains an embedded `.*?`
- /// preceding the first capture group, unless the regex is anchored at the
- /// beginning.
- #[allow(dead_code)]
- dfa: Program,
- /// The same as above, except the program is reversed (and there is no
- /// preceding `.*?`). This is used by the DFA to find the starting location
- /// of matches.
- #[allow(dead_code)]
- dfa_reverse: Program,
- /// A set of suffix literals extracted from the regex.
- ///
- /// Prefix literals are stored on the `Program`, since they are used inside
- /// the matching engines.
- #[allow(dead_code)]
- suffixes: LiteralSearcher,
- /// An Aho-Corasick automaton with leftmost-first match semantics.
- ///
- /// This is only set when the entire regex is a simple unanchored
- /// alternation of literals. We could probably use it more circumstances,
- /// but this is already hacky enough in this architecture.
- ///
- /// N.B. We use u32 as a state ID representation under the assumption that
- /// if we were to exhaust the ID space, we probably would have long
- /// surpassed the compilation size limit.
- #[cfg(feature = "perf-literal")]
- ac: Option<AhoCorasick>,
- /// match_type encodes as much upfront knowledge about how we're going to
- /// execute a search as possible.
- match_type: MatchType,
-}
-
-/// Facilitates the construction of an executor by exposing various knobs
-/// to control how a regex is executed and what kinds of resources it's
-/// permitted to use.
-// `ExecBuilder` is only public via the `internal` module, so avoid deriving
-// `Debug`.
-#[allow(missing_debug_implementations)]
-pub struct ExecBuilder {
- options: RegexOptions,
- match_type: Option<MatchType>,
- bytes: bool,
- only_utf8: bool,
-}
-
-/// Parsed represents a set of parsed regular expressions and their detected
-/// literals.
-struct Parsed {
- exprs: Vec<Hir>,
- prefixes: literal::Seq,
- suffixes: literal::Seq,
- bytes: bool,
-}
-
-impl ExecBuilder {
- /// Create a regex execution builder.
- ///
- /// This uses default settings for everything except the regex itself,
- /// which must be provided. Further knobs can be set by calling methods,
- /// and then finally, `build` to actually create the executor.
- pub fn new(re: &str) -> Self {
- Self::new_many(&[re])
- }
-
- /// Like new, but compiles the union of the given regular expressions.
- ///
- /// Note that when compiling 2 or more regular expressions, capture groups
- /// are completely unsupported. (This means both `find` and `captures`
- /// won't work.)
- pub fn new_many<I, S>(res: I) -> Self
- where
- S: AsRef<str>,
- I: IntoIterator<Item = S>,
- {
- let mut opts = RegexOptions::default();
- opts.pats = res.into_iter().map(|s| s.as_ref().to_owned()).collect();
- Self::new_options(opts)
- }
-
- /// Create a regex execution builder.
- pub fn new_options(opts: RegexOptions) -> Self {
- ExecBuilder {
- options: opts,
- match_type: None,
- bytes: false,
- only_utf8: true,
- }
- }
-
- /// Set the matching engine to be automatically determined.
- ///
- /// This is the default state and will apply whatever optimizations are
- /// possible, such as running a DFA.
- ///
- /// This overrides whatever was previously set via the `nfa` or
- /// `bounded_backtracking` methods.
- pub fn automatic(mut self) -> Self {
- self.match_type = None;
- self
- }
-
- /// Sets the matching engine to use the NFA algorithm no matter what
- /// optimizations are possible.
- ///
- /// This overrides whatever was previously set via the `automatic` or
- /// `bounded_backtracking` methods.
- pub fn nfa(mut self) -> Self {
- self.match_type = Some(MatchType::Nfa(MatchNfaType::PikeVM));
- self
- }
-
- /// Sets the matching engine to use a bounded backtracking engine no
- /// matter what optimizations are possible.
- ///
- /// One must use this with care, since the bounded backtracking engine
- /// uses memory proportion to `len(regex) * len(text)`.
- ///
- /// This overrides whatever was previously set via the `automatic` or
- /// `nfa` methods.
- pub fn bounded_backtracking(mut self) -> Self {
- self.match_type = Some(MatchType::Nfa(MatchNfaType::Backtrack));
- self
- }
-
- /// Compiles byte based programs for use with the NFA matching engines.
- ///
- /// By default, the NFA engines match on Unicode scalar values. They can
- /// be made to use byte based programs instead. In general, the byte based
- /// programs are slower because of a less efficient encoding of character
- /// classes.
- ///
- /// Note that this does not impact DFA matching engines, which always
- /// execute on bytes.
- pub fn bytes(mut self, yes: bool) -> Self {
- self.bytes = yes;
- self
- }
-
- /// When disabled, the program compiled may match arbitrary bytes.
- ///
- /// When enabled (the default), all compiled programs exclusively match
- /// valid UTF-8 bytes.
- pub fn only_utf8(mut self, yes: bool) -> Self {
- self.only_utf8 = yes;
- self
- }
-
- /// Set the Unicode flag.
- pub fn unicode(mut self, yes: bool) -> Self {
- self.options.unicode = yes;
- self
- }
-
- /// Parse the current set of patterns into their AST and extract literals.
- fn parse(&self) -> Result<Parsed, Error> {
- let mut exprs = Vec::with_capacity(self.options.pats.len());
- let mut prefixes = Some(literal::Seq::empty());
- let mut suffixes = Some(literal::Seq::empty());
- let mut bytes = false;
- let is_set = self.options.pats.len() > 1;
- // If we're compiling a regex set and that set has any anchored
- // expressions, then disable all literal optimizations.
- for pat in &self.options.pats {
- let mut parser = ParserBuilder::new()
- .octal(self.options.octal)
- .case_insensitive(self.options.case_insensitive)
- .multi_line(self.options.multi_line)
- .dot_matches_new_line(self.options.dot_matches_new_line)
- .swap_greed(self.options.swap_greed)
- .ignore_whitespace(self.options.ignore_whitespace)
- .unicode(self.options.unicode)
- .utf8(self.only_utf8)
- .nest_limit(self.options.nest_limit)
- .build();
- let expr =
- parser.parse(pat).map_err(|e| Error::Syntax(e.to_string()))?;
- let props = expr.properties();
- // This used to just check whether the HIR matched valid UTF-8
- // or not, but in regex-syntax 0.7, we changed our definition of
- // "matches valid UTF-8" to exclude zero-width matches. And in
- // particular, previously, we considered WordAsciiNegate (that
- // is '(?-u:\B)') to be capable of matching invalid UTF-8. Our
- // matcher engines were built under this assumption and fixing
- // them is not worth it with the imminent plan to switch over to
- // regex-automata. So for now, we retain the previous behavior by
- // just explicitly treating the presence of a negated ASCII word
- // boundary as forcing use to use a byte oriented automaton.
- bytes = bytes
- || !props.is_utf8()
- || props.look_set().contains(Look::WordAsciiNegate);
-
- if cfg!(feature = "perf-literal") {
- if !props.look_set_prefix().contains(Look::Start)
- && props.look_set().contains(Look::Start)
- {
- // Partial anchors unfortunately make it hard to use
- // prefixes, so disable them.
- prefixes = None;
- } else if is_set
- && props.look_set_prefix_any().contains(Look::Start)
- {
- // Regex sets with anchors do not go well with literal
- // optimizations.
- prefixes = None;
- } else if props.look_set_prefix_any().contains_word() {
- // The new literal extractor ignores look-around while
- // the old one refused to extract prefixes from regexes
- // that began with a \b. These old creaky regex internals
- // can't deal with it, so we drop it.
- prefixes = None;
- } else if props.look_set_prefix_any().contains(Look::StartLF) {
- // Similar to the reasoning for word boundaries, this old
- // regex engine can't handle literal prefixes with '(?m:^)'
- // at the beginning of a regex.
- prefixes = None;
- }
-
- if !props.look_set_suffix().contains(Look::End)
- && props.look_set().contains(Look::End)
- {
- // Partial anchors unfortunately make it hard to use
- // suffixes, so disable them.
- suffixes = None;
- } else if is_set
- && props.look_set_suffix_any().contains(Look::End)
- {
- // Regex sets with anchors do not go well with literal
- // optimizations.
- suffixes = None;
- } else if props.look_set_suffix_any().contains_word() {
- // See the prefix case for reasoning here.
- suffixes = None;
- } else if props.look_set_suffix_any().contains(Look::EndLF) {
- // See the prefix case for reasoning here.
- suffixes = None;
- }
-
- let (mut pres, mut suffs) =
- if prefixes.is_none() && suffixes.is_none() {
- (literal::Seq::infinite(), literal::Seq::infinite())
- } else {
- literal_analysis(&expr)
- };
- // These old creaky regex internals can't handle cases where
- // the literal sequences are exact but there are look-around
- // assertions. So we make sure the sequences are inexact if
- // there are look-around assertions anywhere. This forces the
- // regex engines to run instead of assuming that a literal
- // match implies an overall match.
- if !props.look_set().is_empty() {
- pres.make_inexact();
- suffs.make_inexact();
- }
- prefixes = prefixes.and_then(|mut prefixes| {
- prefixes.union(&mut pres);
- Some(prefixes)
- });
- suffixes = suffixes.and_then(|mut suffixes| {
- suffixes.union(&mut suffs);
- Some(suffixes)
- });
- }
- exprs.push(expr);
- }
- Ok(Parsed {
- exprs,
- prefixes: prefixes.unwrap_or_else(literal::Seq::empty),
- suffixes: suffixes.unwrap_or_else(literal::Seq::empty),
- bytes,
- })
- }
-
- /// Build an executor that can run a regular expression.
- pub fn build(self) -> Result<Exec, Error> {
- // Special case when we have no patterns to compile.
- // This can happen when compiling a regex set.
- if self.options.pats.is_empty() {
- let ro = Arc::new(ExecReadOnly {
- res: vec![],
- nfa: Program::new(),
- dfa: Program::new(),
- dfa_reverse: Program::new(),
- suffixes: LiteralSearcher::empty(),
- #[cfg(feature = "perf-literal")]
- ac: None,
- match_type: MatchType::Nothing,
- });
- let pool = ExecReadOnly::new_pool(&ro);
- return Ok(Exec { ro, pool });
- }
- let parsed = self.parse()?;
- let mut nfa = Compiler::new()
- .size_limit(self.options.size_limit)
- .bytes(self.bytes || parsed.bytes)
- .only_utf8(self.only_utf8)
- .compile(&parsed.exprs)?;
- let mut dfa = Compiler::new()
- .size_limit(self.options.size_limit)
- .dfa(true)
- .only_utf8(self.only_utf8)
- .compile(&parsed.exprs)?;
- let mut dfa_reverse = Compiler::new()
- .size_limit(self.options.size_limit)
- .dfa(true)
- .only_utf8(self.only_utf8)
- .reverse(true)
- .compile(&parsed.exprs)?;
-
- #[cfg(feature = "perf-literal")]
- let ac = self.build_aho_corasick(&parsed);
- nfa.prefixes = LiteralSearcher::prefixes(parsed.prefixes);
- dfa.prefixes = nfa.prefixes.clone();
- dfa.dfa_size_limit = self.options.dfa_size_limit;
- dfa_reverse.dfa_size_limit = self.options.dfa_size_limit;
-
- let mut ro = ExecReadOnly {
- res: self.options.pats,
- nfa,
- dfa,
- dfa_reverse,
- suffixes: LiteralSearcher::suffixes(parsed.suffixes),
- #[cfg(feature = "perf-literal")]
- ac,
- match_type: MatchType::Nothing,
- };
- ro.match_type = ro.choose_match_type(self.match_type);
-
- let ro = Arc::new(ro);
- let pool = ExecReadOnly::new_pool(&ro);
- Ok(Exec { ro, pool })
- }
-
- #[cfg(feature = "perf-literal")]
- fn build_aho_corasick(&self, parsed: &Parsed) -> Option<AhoCorasick> {
- if parsed.exprs.len() != 1 {
- return None;
- }
- let lits = match alternation_literals(&parsed.exprs[0]) {
- None => return None,
- Some(lits) => lits,
- };
- // If we have a small number of literals, then let Teddy handle
- // things (see literal/mod.rs).
- if lits.len() <= 32 {
- return None;
- }
- Some(
- AhoCorasick::builder()
- .match_kind(MatchKind::LeftmostFirst)
- .build(&lits)
- // This should never happen because we'd long exceed the
- // compilation limit for regexes first.
- .expect("AC automaton too big"),
- )
- }
-}
-
-impl<'c> RegularExpression for ExecNoSyncStr<'c> {
- type Text = str;
-
- fn slots_len(&self) -> usize {
- self.0.slots_len()
- }
-
- fn next_after_empty(&self, text: &str, i: usize) -> usize {
- next_utf8(text.as_bytes(), i)
- }
-
- #[cfg_attr(feature = "perf-inline", inline(always))]
- fn shortest_match_at(&self, text: &str, start: usize) -> Option<usize> {
- self.0.shortest_match_at(text.as_bytes(), start)
- }
-
- #[cfg_attr(feature = "perf-inline", inline(always))]
- fn is_match_at(&self, text: &str, start: usize) -> bool {
- self.0.is_match_at(text.as_bytes(), start)
- }
-
- #[cfg_attr(feature = "perf-inline", inline(always))]
- fn find_at(&self, text: &str, start: usize) -> Option<(usize, usize)> {
- self.0.find_at(text.as_bytes(), start)
- }
-
- #[cfg_attr(feature = "perf-inline", inline(always))]
- fn captures_read_at(
- &self,
- locs: &mut Locations,
- text: &str,
- start: usize,
- ) -> Option<(usize, usize)> {
- self.0.captures_read_at(locs, text.as_bytes(), start)
- }
-}
-
-impl<'c> RegularExpression for ExecNoSync<'c> {
- type Text = [u8];
-
- /// Returns the number of capture slots in the regular expression. (There
- /// are two slots for every capture group, corresponding to possibly empty
- /// start and end locations of the capture.)
- fn slots_len(&self) -> usize {
- self.ro.nfa.captures.len() * 2
- }
-
- fn next_after_empty(&self, _text: &[u8], i: usize) -> usize {
- i + 1
- }
-
- /// Returns the end of a match location, possibly occurring before the
- /// end location of the correct leftmost-first match.
- #[cfg_attr(feature = "perf-inline", inline(always))]
- fn shortest_match_at(&self, text: &[u8], start: usize) -> Option<usize> {
- if !self.is_anchor_end_match(text) {
- return None;
- }
- match self.ro.match_type {
- #[cfg(feature = "perf-literal")]
- MatchType::Literal(ty) => {
- self.find_literals(ty, text, start).map(|(_, e)| e)
- }
- #[cfg(feature = "perf-dfa")]
- MatchType::Dfa | MatchType::DfaMany => {
- match self.shortest_dfa(text, start) {
- dfa::Result::Match(end) => Some(end),
- dfa::Result::NoMatch(_) => None,
- dfa::Result::Quit => self.shortest_nfa(text, start),
- }
- }
- #[cfg(feature = "perf-dfa")]
- MatchType::DfaAnchoredReverse => {
- match dfa::Fsm::reverse(
- &self.ro.dfa_reverse,
- self.cache.value(),
- true,
- &text[start..],
- text.len() - start,
- ) {
- dfa::Result::Match(_) => Some(text.len()),
- dfa::Result::NoMatch(_) => None,
- dfa::Result::Quit => self.shortest_nfa(text, start),
- }
- }
- #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
- MatchType::DfaSuffix => {
- match self.shortest_dfa_reverse_suffix(text, start) {
- dfa::Result::Match(e) => Some(e),
- dfa::Result::NoMatch(_) => None,
- dfa::Result::Quit => self.shortest_nfa(text, start),
- }
- }
- MatchType::Nfa(ty) => self.shortest_nfa_type(ty, text, start),
- MatchType::Nothing => None,
- }
- }
-
- /// Returns true if and only if the regex matches text.
- ///
- /// For single regular expressions, this is equivalent to calling
- /// shortest_match(...).is_some().
- #[cfg_attr(feature = "perf-inline", inline(always))]
- fn is_match_at(&self, text: &[u8], start: usize) -> bool {
- if !self.is_anchor_end_match(text) {
- return false;
- }
- // We need to do this dance because shortest_match relies on the NFA
- // filling in captures[1], but a RegexSet has no captures. In other
- // words, a RegexSet can't (currently) use shortest_match. ---AG
- match self.ro.match_type {
- #[cfg(feature = "perf-literal")]
- MatchType::Literal(ty) => {
- self.find_literals(ty, text, start).is_some()
- }
- #[cfg(feature = "perf-dfa")]
- MatchType::Dfa | MatchType::DfaMany => {
- match self.shortest_dfa(text, start) {
- dfa::Result::Match(_) => true,
- dfa::Result::NoMatch(_) => false,
- dfa::Result::Quit => self.match_nfa(text, start),
- }
- }
- #[cfg(feature = "perf-dfa")]
- MatchType::DfaAnchoredReverse => {
- match dfa::Fsm::reverse(
- &self.ro.dfa_reverse,
- self.cache.value(),
- true,
- &text[start..],
- text.len() - start,
- ) {
- dfa::Result::Match(_) => true,
- dfa::Result::NoMatch(_) => false,
- dfa::Result::Quit => self.match_nfa(text, start),
- }
- }
- #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
- MatchType::DfaSuffix => {
- match self.shortest_dfa_reverse_suffix(text, start) {
- dfa::Result::Match(_) => true,
- dfa::Result::NoMatch(_) => false,
- dfa::Result::Quit => self.match_nfa(text, start),
- }
- }
- MatchType::Nfa(ty) => self.match_nfa_type(ty, text, start),
- MatchType::Nothing => false,
- }
- }
-
- /// Finds the start and end location of the leftmost-first match, starting
- /// at the given location.
- #[cfg_attr(feature = "perf-inline", inline(always))]
- fn find_at(&self, text: &[u8], start: usize) -> Option<(usize, usize)> {
- if !self.is_anchor_end_match(text) {
- return None;
- }
- match self.ro.match_type {
- #[cfg(feature = "perf-literal")]
- MatchType::Literal(ty) => self.find_literals(ty, text, start),
- #[cfg(feature = "perf-dfa")]
- MatchType::Dfa => match self.find_dfa_forward(text, start) {
- dfa::Result::Match((s, e)) => Some((s, e)),
- dfa::Result::NoMatch(_) => None,
- dfa::Result::Quit => {
- self.find_nfa(MatchNfaType::Auto, text, start)
- }
- },
- #[cfg(feature = "perf-dfa")]
- MatchType::DfaAnchoredReverse => {
- match self.find_dfa_anchored_reverse(text, start) {
- dfa::Result::Match((s, e)) => Some((s, e)),
- dfa::Result::NoMatch(_) => None,
- dfa::Result::Quit => {
- self.find_nfa(MatchNfaType::Auto, text, start)
- }
- }
- }
- #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
- MatchType::DfaSuffix => {
- match self.find_dfa_reverse_suffix(text, start) {
- dfa::Result::Match((s, e)) => Some((s, e)),
- dfa::Result::NoMatch(_) => None,
- dfa::Result::Quit => {
- self.find_nfa(MatchNfaType::Auto, text, start)
- }
- }
- }
- MatchType::Nfa(ty) => self.find_nfa(ty, text, start),
- MatchType::Nothing => None,
- #[cfg(feature = "perf-dfa")]
- MatchType::DfaMany => {
- unreachable!("BUG: RegexSet cannot be used with find")
- }
- }
- }
-
- /// Finds the start and end location of the leftmost-first match and also
- /// fills in all matching capture groups.
- ///
- /// The number of capture slots given should be equal to the total number
- /// of capture slots in the compiled program.
- ///
- /// Note that the first two slots always correspond to the start and end
- /// locations of the overall match.
- fn captures_read_at(
- &self,
- locs: &mut Locations,
- text: &[u8],
- start: usize,
- ) -> Option<(usize, usize)> {
- let slots = locs.as_slots();
- for slot in slots.iter_mut() {
- *slot = None;
- }
- // If the caller unnecessarily uses this, then we try to save them
- // from themselves.
- match slots.len() {
- 0 => return self.find_at(text, start),
- 2 => {
- return self.find_at(text, start).map(|(s, e)| {
- slots[0] = Some(s);
- slots[1] = Some(e);
- (s, e)
- });
- }
- _ => {} // fallthrough
- }
- if !self.is_anchor_end_match(text) {
- return None;
- }
- match self.ro.match_type {
- #[cfg(feature = "perf-literal")]
- MatchType::Literal(ty) => {
- self.find_literals(ty, text, start).and_then(|(s, e)| {
- self.captures_nfa_type(
- MatchNfaType::Auto,
- slots,
- text,
- s,
- e,
- )
- })
- }
- #[cfg(feature = "perf-dfa")]
- MatchType::Dfa => {
- if self.ro.nfa.is_anchored_start {
- self.captures_nfa(slots, text, start)
- } else {
- match self.find_dfa_forward(text, start) {
- dfa::Result::Match((s, e)) => self.captures_nfa_type(
- MatchNfaType::Auto,
- slots,
- text,
- s,
- e,
- ),
- dfa::Result::NoMatch(_) => None,
- dfa::Result::Quit => {
- self.captures_nfa(slots, text, start)
- }
- }
- }
- }
- #[cfg(feature = "perf-dfa")]
- MatchType::DfaAnchoredReverse => {
- match self.find_dfa_anchored_reverse(text, start) {
- dfa::Result::Match((s, e)) => self.captures_nfa_type(
- MatchNfaType::Auto,
- slots,
- text,
- s,
- e,
- ),
- dfa::Result::NoMatch(_) => None,
- dfa::Result::Quit => self.captures_nfa(slots, text, start),
- }
- }
- #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
- MatchType::DfaSuffix => {
- match self.find_dfa_reverse_suffix(text, start) {
- dfa::Result::Match((s, e)) => self.captures_nfa_type(
- MatchNfaType::Auto,
- slots,
- text,
- s,
- e,
- ),
- dfa::Result::NoMatch(_) => None,
- dfa::Result::Quit => self.captures_nfa(slots, text, start),
- }
- }
- MatchType::Nfa(ty) => {
- self.captures_nfa_type(ty, slots, text, start, text.len())
- }
- MatchType::Nothing => None,
- #[cfg(feature = "perf-dfa")]
- MatchType::DfaMany => {
- unreachable!("BUG: RegexSet cannot be used with captures")
- }
- }
- }
-}
-
-impl<'c> ExecNoSync<'c> {
- /// Finds the leftmost-first match using only literal search.
- #[cfg(feature = "perf-literal")]
- #[cfg_attr(feature = "perf-inline", inline(always))]
- fn find_literals(
- &self,
- ty: MatchLiteralType,
- text: &[u8],
- start: usize,
- ) -> Option<(usize, usize)> {
- use self::MatchLiteralType::*;
- match ty {
- Unanchored => {
- let lits = &self.ro.nfa.prefixes;
- lits.find(&text[start..]).map(|(s, e)| (start + s, start + e))
- }
- AnchoredStart => {
- let lits = &self.ro.nfa.prefixes;
- if start == 0 || !self.ro.nfa.is_anchored_start {
- lits.find_start(&text[start..])
- .map(|(s, e)| (start + s, start + e))
- } else {
- None
- }
- }
- AnchoredEnd => {
- let lits = &self.ro.suffixes;
- lits.find_end(&text[start..])
- .map(|(s, e)| (start + s, start + e))
- }
- AhoCorasick => self
- .ro
- .ac
- .as_ref()
- .unwrap()
- .find(&text[start..])
- .map(|m| (start + m.start(), start + m.end())),
- }
- }
-
- /// Finds the leftmost-first match (start and end) using only the DFA.
- ///
- /// If the result returned indicates that the DFA quit, then another
- /// matching engine should be used.
- #[cfg(feature = "perf-dfa")]
- #[cfg_attr(feature = "perf-inline", inline(always))]
- fn find_dfa_forward(
- &self,
- text: &[u8],
- start: usize,
- ) -> dfa::Result<(usize, usize)> {
- use crate::dfa::Result::*;
- let end = match dfa::Fsm::forward(
- &self.ro.dfa,
- self.cache.value(),
- false,
- text,
- start,
- ) {
- NoMatch(i) => return NoMatch(i),
- Quit => return Quit,
- Match(end) if start == end => return Match((start, start)),
- Match(end) => end,
- };
- // Now run the DFA in reverse to find the start of the match.
- match dfa::Fsm::reverse(
- &self.ro.dfa_reverse,
- self.cache.value(),
- false,
- &text[start..],
- end - start,
- ) {
- Match(s) => Match((start + s, end)),
- NoMatch(i) => NoMatch(i),
- Quit => Quit,
- }
- }
-
- /// Finds the leftmost-first match (start and end) using only the DFA,
- /// but assumes the regex is anchored at the end and therefore starts at
- /// the end of the regex and matches in reverse.
- ///
- /// If the result returned indicates that the DFA quit, then another
- /// matching engine should be used.
- #[cfg(feature = "perf-dfa")]
- #[cfg_attr(feature = "perf-inline", inline(always))]
- fn find_dfa_anchored_reverse(
- &self,
- text: &[u8],
- start: usize,
- ) -> dfa::Result<(usize, usize)> {
- use crate::dfa::Result::*;
- match dfa::Fsm::reverse(
- &self.ro.dfa_reverse,
- self.cache.value(),
- false,
- &text[start..],
- text.len() - start,
- ) {
- Match(s) => Match((start + s, text.len())),
- NoMatch(i) => NoMatch(i),
- Quit => Quit,
- }
- }
-
- /// Finds the end of the shortest match using only the DFA.
- #[cfg(feature = "perf-dfa")]
- #[cfg_attr(feature = "perf-inline", inline(always))]
- fn shortest_dfa(&self, text: &[u8], start: usize) -> dfa::Result<usize> {
- dfa::Fsm::forward(&self.ro.dfa, self.cache.value(), true, text, start)
- }
-
- /// Finds the end of the shortest match using only the DFA by scanning for
- /// suffix literals.
- #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
- #[cfg_attr(feature = "perf-inline", inline(always))]
- fn shortest_dfa_reverse_suffix(
- &self,
- text: &[u8],
- start: usize,
- ) -> dfa::Result<usize> {
- match self.exec_dfa_reverse_suffix(text, start) {
- None => self.shortest_dfa(text, start),
- Some(r) => r.map(|(_, end)| end),
- }
- }
-
- /// Finds the end of the shortest match using only the DFA by scanning for
- /// suffix literals. It also reports the start of the match.
- ///
- /// Note that if None is returned, then the optimization gave up to avoid
- /// worst case quadratic behavior. A forward scanning DFA should be tried
- /// next.
- ///
- /// If a match is returned and the full leftmost-first match is desired,
- /// then a forward scan starting from the beginning of the match must be
- /// done.
- ///
- /// If the result returned indicates that the DFA quit, then another
- /// matching engine should be used.
- #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
- #[cfg_attr(feature = "perf-inline", inline(always))]
- fn exec_dfa_reverse_suffix(
- &self,
- text: &[u8],
- original_start: usize,
- ) -> Option<dfa::Result<(usize, usize)>> {
- use crate::dfa::Result::*;
-
- let lcs = self.ro.suffixes.lcs();
- debug_assert!(lcs.len() >= 1);
- let mut start = original_start;
- let mut end = start;
- let mut last_literal = start;
- while end <= text.len() {
- last_literal += match lcs.find(&text[last_literal..]) {
- None => return Some(NoMatch(text.len())),
- Some(i) => i,
- };
- end = last_literal + lcs.len();
- match dfa::Fsm::reverse(
- &self.ro.dfa_reverse,
- self.cache.value(),
- false,
- &text[start..end],
- end - start,
- ) {
- Match(0) | NoMatch(0) => return None,
- Match(i) => return Some(Match((start + i, end))),
- NoMatch(i) => {
- start += i;
- last_literal += 1;
- continue;
- }
- Quit => return Some(Quit),
- };
- }
- Some(NoMatch(text.len()))
- }
-
- /// Finds the leftmost-first match (start and end) using only the DFA
- /// by scanning for suffix literals.
- ///
- /// If the result returned indicates that the DFA quit, then another
- /// matching engine should be used.
- #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
- #[cfg_attr(feature = "perf-inline", inline(always))]
- fn find_dfa_reverse_suffix(
- &self,
- text: &[u8],
- start: usize,
- ) -> dfa::Result<(usize, usize)> {
- use crate::dfa::Result::*;
-
- let match_start = match self.exec_dfa_reverse_suffix(text, start) {
- None => return self.find_dfa_forward(text, start),
- Some(Match((start, _))) => start,
- Some(r) => return r,
- };
- // At this point, we've found a match. The only way to quit now
- // without a match is if the DFA gives up (seems unlikely).
- //
- // Now run the DFA forwards to find the proper end of the match.
- // (The suffix literal match can only indicate the earliest
- // possible end location, which may appear before the end of the
- // leftmost-first match.)
- match dfa::Fsm::forward(
- &self.ro.dfa,
- self.cache.value(),
- false,
- text,
- match_start,
- ) {
- NoMatch(_) => panic!("BUG: reverse match implies forward match"),
- Quit => Quit,
- Match(e) => Match((match_start, e)),
- }
- }
-
- /// Executes the NFA engine to return whether there is a match or not.
- ///
- /// Ideally, we could use shortest_nfa(...).is_some() and get the same
- /// performance characteristics, but regex sets don't have captures, which
- /// shortest_nfa depends on.
- #[cfg(feature = "perf-dfa")]
- fn match_nfa(&self, text: &[u8], start: usize) -> bool {
- self.match_nfa_type(MatchNfaType::Auto, text, start)
- }
-
- /// Like match_nfa, but allows specification of the type of NFA engine.
- fn match_nfa_type(
- &self,
- ty: MatchNfaType,
- text: &[u8],
- start: usize,
- ) -> bool {
- self.exec_nfa(
- ty,
- &mut [false],
- &mut [],
- true,
- false,
- text,
- start,
- text.len(),
- )
- }
-
- /// Finds the shortest match using an NFA.
- #[cfg(feature = "perf-dfa")]
- fn shortest_nfa(&self, text: &[u8], start: usize) -> Option<usize> {
- self.shortest_nfa_type(MatchNfaType::Auto, text, start)
- }
-
- /// Like shortest_nfa, but allows specification of the type of NFA engine.
- fn shortest_nfa_type(
- &self,
- ty: MatchNfaType,
- text: &[u8],
- start: usize,
- ) -> Option<usize> {
- let mut slots = [None, None];
- if self.exec_nfa(
- ty,
- &mut [false],
- &mut slots,
- true,
- true,
- text,
- start,
- text.len(),
- ) {
- slots[1]
- } else {
- None
- }
- }
-
- /// Like find, but executes an NFA engine.
- fn find_nfa(
- &self,
- ty: MatchNfaType,
- text: &[u8],
- start: usize,
- ) -> Option<(usize, usize)> {
- let mut slots = [None, None];
- if self.exec_nfa(
- ty,
- &mut [false],
- &mut slots,
- false,
- false,
- text,
- start,
- text.len(),
- ) {
- match (slots[0], slots[1]) {
- (Some(s), Some(e)) => Some((s, e)),
- _ => None,
- }
- } else {
- None
- }
- }
-
- /// Like find_nfa, but fills in captures.
- ///
- /// `slots` should have length equal to `2 * nfa.captures.len()`.
- #[cfg(feature = "perf-dfa")]
- fn captures_nfa(
- &self,
- slots: &mut [Slot],
- text: &[u8],
- start: usize,
- ) -> Option<(usize, usize)> {
- self.captures_nfa_type(
- MatchNfaType::Auto,
- slots,
- text,
- start,
- text.len(),
- )
- }
-
- /// Like captures_nfa, but allows specification of type of NFA engine.
- fn captures_nfa_type(
- &self,
- ty: MatchNfaType,
- slots: &mut [Slot],
- text: &[u8],
- start: usize,
- end: usize,
- ) -> Option<(usize, usize)> {
- if self.exec_nfa(
- ty,
- &mut [false],
- slots,
- false,
- false,
- text,
- start,
- end,
- ) {
- match (slots[0], slots[1]) {
- (Some(s), Some(e)) => Some((s, e)),
- _ => None,
- }
- } else {
- None
- }
- }
-
- fn exec_nfa(
- &self,
- mut ty: MatchNfaType,
- matches: &mut [bool],
- slots: &mut [Slot],
- quit_after_match: bool,
- quit_after_match_with_pos: bool,
- text: &[u8],
- start: usize,
- end: usize,
- ) -> bool {
- use self::MatchNfaType::*;
- if let Auto = ty {
- if backtrack::should_exec(self.ro.nfa.len(), text.len()) {
- ty = Backtrack;
- } else {
- ty = PikeVM;
- }
- }
- // The backtracker can't return the shortest match position as it is
- // implemented today. So if someone calls `shortest_match` and we need
- // to run an NFA, then use the PikeVM.
- if quit_after_match_with_pos || ty == PikeVM {
- self.exec_pikevm(
- matches,
- slots,
- quit_after_match,
- text,
- start,
- end,
- )
- } else {
- self.exec_backtrack(matches, slots, text, start, end)
- }
- }
-
- /// Always run the NFA algorithm.
- fn exec_pikevm(
- &self,
- matches: &mut [bool],
- slots: &mut [Slot],
- quit_after_match: bool,
- text: &[u8],
- start: usize,
- end: usize,
- ) -> bool {
- if self.ro.nfa.uses_bytes() {
- pikevm::Fsm::exec(
- &self.ro.nfa,
- self.cache.value(),
- matches,
- slots,
- quit_after_match,
- ByteInput::new(text, self.ro.nfa.only_utf8),
- start,
- end,
- )
- } else {
- pikevm::Fsm::exec(
- &self.ro.nfa,
- self.cache.value(),
- matches,
- slots,
- quit_after_match,
- CharInput::new(text),
- start,
- end,
- )
- }
- }
-
- /// Always runs the NFA using bounded backtracking.
- fn exec_backtrack(
- &self,
- matches: &mut [bool],
- slots: &mut [Slot],
- text: &[u8],
- start: usize,
- end: usize,
- ) -> bool {
- if self.ro.nfa.uses_bytes() {
- backtrack::Bounded::exec(
- &self.ro.nfa,
- self.cache.value(),
- matches,
- slots,
- ByteInput::new(text, self.ro.nfa.only_utf8),
- start,
- end,
- )
- } else {
- backtrack::Bounded::exec(
- &self.ro.nfa,
- self.cache.value(),
- matches,
- slots,
- CharInput::new(text),
- start,
- end,
- )
- }
- }
-
- /// Finds which regular expressions match the given text.
- ///
- /// `matches` should have length equal to the number of regexes being
- /// searched.
- ///
- /// This is only useful when one wants to know which regexes in a set
- /// match some text.
- pub fn many_matches_at(
- &self,
- matches: &mut [bool],
- text: &[u8],
- start: usize,
- ) -> bool {
- use self::MatchType::*;
- if !self.is_anchor_end_match(text) {
- return false;
- }
- match self.ro.match_type {
- #[cfg(feature = "perf-literal")]
- Literal(ty) => {
- debug_assert_eq!(matches.len(), 1);
- matches[0] = self.find_literals(ty, text, start).is_some();
- matches[0]
- }
- #[cfg(feature = "perf-dfa")]
- Dfa | DfaAnchoredReverse | DfaMany => {
- match dfa::Fsm::forward_many(
- &self.ro.dfa,
- self.cache.value(),
- matches,
- text,
- start,
- ) {
- dfa::Result::Match(_) => true,
- dfa::Result::NoMatch(_) => false,
- dfa::Result::Quit => self.exec_nfa(
- MatchNfaType::Auto,
- matches,
- &mut [],
- false,
- false,
- text,
- start,
- text.len(),
- ),
- }
- }
- #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
- DfaSuffix => {
- match dfa::Fsm::forward_many(
- &self.ro.dfa,
- self.cache.value(),
- matches,
- text,
- start,
- ) {
- dfa::Result::Match(_) => true,
- dfa::Result::NoMatch(_) => false,
- dfa::Result::Quit => self.exec_nfa(
- MatchNfaType::Auto,
- matches,
- &mut [],
- false,
- false,
- text,
- start,
- text.len(),
- ),
- }
- }
- Nfa(ty) => self.exec_nfa(
- ty,
- matches,
- &mut [],
- false,
- false,
- text,
- start,
- text.len(),
- ),
- Nothing => false,
- }
- }
-
- #[cfg_attr(feature = "perf-inline", inline(always))]
- fn is_anchor_end_match(&self, text: &[u8]) -> bool {
- #[cfg(not(feature = "perf-literal"))]
- fn imp(_: &ExecReadOnly, _: &[u8]) -> bool {
- true
- }
-
- #[cfg(feature = "perf-literal")]
- fn imp(ro: &ExecReadOnly, text: &[u8]) -> bool {
- // Only do this check if the haystack is big (>1MB).
- if text.len() > (1 << 20) && ro.nfa.is_anchored_end {
- let lcs = ro.suffixes.lcs();
- if lcs.len() >= 1 && !lcs.is_suffix(text) {
- return false;
- }
- }
- true
- }
-
- imp(&self.ro, text)
- }
-
- pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
- &self.ro.nfa.capture_name_idx
- }
-}
-
-impl<'c> ExecNoSyncStr<'c> {
- pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
- self.0.capture_name_idx()
- }
-}
-
-impl Exec {
- /// Get a searcher that isn't Sync.
- #[cfg_attr(feature = "perf-inline", inline(always))]
- pub fn searcher(&self) -> ExecNoSync<'_> {
- ExecNoSync {
- ro: &self.ro, // a clone is too expensive here! (and not needed)
- cache: self.pool.get(),
- }
- }
-
- /// Get a searcher that isn't Sync and can match on &str.
- #[cfg_attr(feature = "perf-inline", inline(always))]
- pub fn searcher_str(&self) -> ExecNoSyncStr<'_> {
- ExecNoSyncStr(self.searcher())
- }
-
- /// Build a Regex from this executor.
- pub fn into_regex(self) -> re_unicode::Regex {
- re_unicode::Regex::from(self)
- }
-
- /// Build a RegexSet from this executor.
- pub fn into_regex_set(self) -> re_set::unicode::RegexSet {
- re_set::unicode::RegexSet::from(self)
- }
-
- /// Build a Regex from this executor that can match arbitrary bytes.
- pub fn into_byte_regex(self) -> re_bytes::Regex {
- re_bytes::Regex::from(self)
- }
-
- /// Build a RegexSet from this executor that can match arbitrary bytes.
- pub fn into_byte_regex_set(self) -> re_set::bytes::RegexSet {
- re_set::bytes::RegexSet::from(self)
- }
-
- /// The original regular expressions given by the caller that were
- /// compiled.
- pub fn regex_strings(&self) -> &[String] {
- &self.ro.res
- }
-
- /// Return a slice of capture names.
- ///
- /// Any capture that isn't named is None.
- pub fn capture_names(&self) -> &[Option<String>] {
- &self.ro.nfa.captures
- }
-
- /// Return a reference to named groups mapping (from group name to
- /// group position).
- pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
- &self.ro.nfa.capture_name_idx
- }
-
- /// If the number of capture groups in every match is always the same, then
- /// return that number. Otherwise return `None`.
- pub fn static_captures_len(&self) -> Option<usize> {
- self.ro.nfa.static_captures_len
- }
-}
-
-impl Clone for Exec {
- fn clone(&self) -> Exec {
- let pool = ExecReadOnly::new_pool(&self.ro);
- Exec { ro: self.ro.clone(), pool }
- }
-}
-
-impl ExecReadOnly {
- fn choose_match_type(&self, hint: Option<MatchType>) -> MatchType {
- if let Some(MatchType::Nfa(_)) = hint {
- return hint.unwrap();
- }
- // If the NFA is empty, then we'll never match anything.
- if self.nfa.insts.is_empty() {
- return MatchType::Nothing;
- }
- if let Some(literalty) = self.choose_literal_match_type() {
- return literalty;
- }
- if let Some(dfaty) = self.choose_dfa_match_type() {
- return dfaty;
- }
- // We're so totally hosed.
- MatchType::Nfa(MatchNfaType::Auto)
- }
-
- /// If a plain literal scan can be used, then a corresponding literal
- /// search type is returned.
- fn choose_literal_match_type(&self) -> Option<MatchType> {
- #[cfg(not(feature = "perf-literal"))]
- fn imp(_: &ExecReadOnly) -> Option<MatchType> {
- None
- }
-
- #[cfg(feature = "perf-literal")]
- fn imp(ro: &ExecReadOnly) -> Option<MatchType> {
- // If our set of prefixes is complete, then we can use it to find
- // a match in lieu of a regex engine. This doesn't quite work well
- // in the presence of multiple regexes, so only do it when there's
- // one.
- //
- // TODO(burntsushi): Also, don't try to match literals if the regex
- // is partially anchored. We could technically do it, but we'd need
- // to create two sets of literals: all of them and then the subset
- // that aren't anchored. We would then only search for all of them
- // when at the beginning of the input and use the subset in all
- // other cases.
- if ro.res.len() != 1 {
- return None;
- }
- if ro.ac.is_some() {
- return Some(MatchType::Literal(
- MatchLiteralType::AhoCorasick,
- ));
- }
- if ro.nfa.prefixes.complete() {
- return if ro.nfa.is_anchored_start {
- Some(MatchType::Literal(MatchLiteralType::AnchoredStart))
- } else {
- Some(MatchType::Literal(MatchLiteralType::Unanchored))
- };
- }
- if ro.suffixes.complete() {
- return if ro.nfa.is_anchored_end {
- Some(MatchType::Literal(MatchLiteralType::AnchoredEnd))
- } else {
- // This case shouldn't happen. When the regex isn't
- // anchored, then complete prefixes should imply complete
- // suffixes.
- //
- // The above is wrong! This case can happen. While
- // complete prefixes should imply complete suffixes
- // here, that doesn't necessarily mean we have a useful
- // prefix matcher! It could be the case that the literal
- // searcher decided the prefixes---even though they are
- // "complete"---weren't good enough and thus created an
- // empty matcher. If that happens and we return Unanchored
- // here, then we'll end up using that matcher, which is
- // very bad because it matches at every position. So...
- // return None.
- None
- };
- }
- None
- }
-
- imp(self)
- }
-
- /// If a DFA scan can be used, then choose the appropriate DFA strategy.
- fn choose_dfa_match_type(&self) -> Option<MatchType> {
- #[cfg(not(feature = "perf-dfa"))]
- fn imp(_: &ExecReadOnly) -> Option<MatchType> {
- None
- }
-
- #[cfg(feature = "perf-dfa")]
- fn imp(ro: &ExecReadOnly) -> Option<MatchType> {
- if !dfa::can_exec(&ro.dfa) {
- return None;
- }
- // Regex sets require a slightly specialized path.
- if ro.res.len() >= 2 {
- return Some(MatchType::DfaMany);
- }
- // If the regex is anchored at the end but not the start, then
- // just match in reverse from the end of the haystack.
- if !ro.nfa.is_anchored_start && ro.nfa.is_anchored_end {
- return Some(MatchType::DfaAnchoredReverse);
- }
- #[cfg(feature = "perf-literal")]
- {
- // If there's a longish suffix literal, then it might be faster
- // to look for that first.
- if ro.should_suffix_scan() {
- return Some(MatchType::DfaSuffix);
- }
- }
- // Fall back to your garden variety forward searching lazy DFA.
- Some(MatchType::Dfa)
- }
-
- imp(self)
- }
-
- /// Returns true if the program is amenable to suffix scanning.
- ///
- /// When this is true, as a heuristic, we assume it is OK to quickly scan
- /// for suffix literals and then do a *reverse* DFA match from any matches
- /// produced by the literal scan. (And then followed by a forward DFA
- /// search, since the previously found suffix literal maybe not actually be
- /// the end of a match.)
- ///
- /// This is a bit of a specialized optimization, but can result in pretty
- /// big performance wins if 1) there are no prefix literals and 2) the
- /// suffix literals are pretty rare in the text. (1) is obviously easy to
- /// account for but (2) is harder. As a proxy, we assume that longer
- /// strings are generally rarer, so we only enable this optimization when
- /// we have a meaty suffix.
- #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
- fn should_suffix_scan(&self) -> bool {
- if self.suffixes.is_empty() {
- return false;
- }
- let lcs_len = self.suffixes.lcs().char_len();
- lcs_len >= 3 && lcs_len > self.dfa.prefixes.lcp().char_len()
- }
-
- fn new_pool(ro: &Arc<ExecReadOnly>) -> Box<Pool<ProgramCache>> {
- let ro = ro.clone();
- Box::new(Pool::new(Box::new(move || {
- AssertUnwindSafe(RefCell::new(ProgramCacheInner::new(&ro)))
- })))
- }
-}
-
-#[derive(Clone, Copy, Debug)]
-enum MatchType {
- /// A single or multiple literal search. This is only used when the regex
- /// can be decomposed into a literal search.
- #[cfg(feature = "perf-literal")]
- Literal(MatchLiteralType),
- /// A normal DFA search.
- #[cfg(feature = "perf-dfa")]
- Dfa,
- /// A reverse DFA search starting from the end of a haystack.
- #[cfg(feature = "perf-dfa")]
- DfaAnchoredReverse,
- /// A reverse DFA search with suffix literal scanning.
- #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
- DfaSuffix,
- /// Use the DFA on two or more regular expressions.
- #[cfg(feature = "perf-dfa")]
- DfaMany,
- /// An NFA variant.
- Nfa(MatchNfaType),
- /// No match is ever possible, so don't ever try to search.
- Nothing,
-}
-
-#[derive(Clone, Copy, Debug)]
-#[cfg(feature = "perf-literal")]
-enum MatchLiteralType {
- /// Match literals anywhere in text.
- Unanchored,
- /// Match literals only at the start of text.
- AnchoredStart,
- /// Match literals only at the end of text.
- AnchoredEnd,
- /// Use an Aho-Corasick automaton. This requires `ac` to be Some on
- /// ExecReadOnly.
- AhoCorasick,
-}
-
-#[derive(Clone, Copy, Debug, Eq, PartialEq)]
-enum MatchNfaType {
- /// Choose between Backtrack and PikeVM.
- Auto,
- /// NFA bounded backtracking.
- ///
- /// (This is only set by tests, since it never makes sense to always want
- /// backtracking.)
- Backtrack,
- /// The Pike VM.
- ///
- /// (This is only set by tests, since it never makes sense to always want
- /// the Pike VM.)
- PikeVM,
-}
-
-/// `ProgramCache` maintains reusable allocations for each matching engine
-/// available to a particular program.
-///
-/// We declare this as unwind safe since it's a cache that's only used for
-/// performance purposes. If a panic occurs, it is (or should be) always safe
-/// to continue using the same regex object.
-pub type ProgramCache = AssertUnwindSafe<RefCell<ProgramCacheInner>>;
-
-#[derive(Debug)]
-pub struct ProgramCacheInner {
- pub pikevm: pikevm::Cache,
- pub backtrack: backtrack::Cache,
- #[cfg(feature = "perf-dfa")]
- pub dfa: dfa::Cache,
- #[cfg(feature = "perf-dfa")]
- pub dfa_reverse: dfa::Cache,
-}
-
-impl ProgramCacheInner {
- fn new(ro: &ExecReadOnly) -> Self {
- ProgramCacheInner {
- pikevm: pikevm::Cache::new(&ro.nfa),
- backtrack: backtrack::Cache::new(&ro.nfa),
- #[cfg(feature = "perf-dfa")]
- dfa: dfa::Cache::new(&ro.dfa),
- #[cfg(feature = "perf-dfa")]
- dfa_reverse: dfa::Cache::new(&ro.dfa_reverse),
- }
- }
-}
-
-/// Alternation literals checks if the given HIR is a simple alternation of
-/// literals, and if so, returns them. Otherwise, this returns None.
-#[cfg(feature = "perf-literal")]
-fn alternation_literals(expr: &Hir) -> Option<Vec<Vec<u8>>> {
- use regex_syntax::hir::{HirKind, Literal};
-
- // This is pretty hacky, but basically, if `is_alternation_literal` is
- // true, then we can make several assumptions about the structure of our
- // HIR. This is what justifies the `unreachable!` statements below.
- //
- // This code should be refactored once we overhaul this crate's
- // optimization pipeline, because this is a terribly inflexible way to go
- // about things.
-
- if !expr.properties().is_alternation_literal() {
- return None;
- }
- let alts = match *expr.kind() {
- HirKind::Alternation(ref alts) => alts,
- _ => return None, // one literal isn't worth it
- };
-
- let mut lits = vec![];
- for alt in alts {
- let mut lit = vec![];
- match *alt.kind() {
- HirKind::Literal(Literal(ref bytes)) => {
- lit.extend_from_slice(bytes)
- }
- HirKind::Concat(ref exprs) => {
- for e in exprs {
- match *e.kind() {
- HirKind::Literal(Literal(ref bytes)) => {
- lit.extend_from_slice(bytes);
- }
- _ => unreachable!("expected literal, got {:?}", e),
- }
- }
- }
- _ => unreachable!("expected literal or concat, got {:?}", alt),
- }
- lits.push(lit);
- }
- Some(lits)
-}
-
-#[cfg(not(feature = "perf-literal"))]
-fn literal_analysis(_: &Hir) -> (literal::Seq, literal::Seq) {
- (literal::Seq::infinite(), literal::Seq::infinite())
-}
-
-#[cfg(feature = "perf-literal")]
-fn literal_analysis(expr: &Hir) -> (literal::Seq, literal::Seq) {
- const ATTEMPTS: [(usize, usize); 3] = [(5, 50), (4, 30), (3, 20)];
-
- let mut prefixes = literal::Extractor::new()
- .kind(literal::ExtractKind::Prefix)
- .extract(expr);
- for (keep, limit) in ATTEMPTS {
- let len = match prefixes.len() {
- None => break,
- Some(len) => len,
- };
- if len <= limit {
- break;
- }
- prefixes.keep_first_bytes(keep);
- prefixes.minimize_by_preference();
- }
-
- let mut suffixes = literal::Extractor::new()
- .kind(literal::ExtractKind::Suffix)
- .extract(expr);
- for (keep, limit) in ATTEMPTS {
- let len = match suffixes.len() {
- None => break,
- Some(len) => len,
- };
- if len <= limit {
- break;
- }
- suffixes.keep_last_bytes(keep);
- suffixes.minimize_by_preference();
- }
-
- (prefixes, suffixes)
-}
-
-#[cfg(test)]
-mod test {
- #[test]
- fn uppercut_s_backtracking_bytes_default_bytes_mismatch() {
- use crate::internal::ExecBuilder;
-
- let backtrack_bytes_re = ExecBuilder::new("^S")
- .bounded_backtracking()
- .only_utf8(false)
- .build()
- .map(|exec| exec.into_byte_regex())
- .map_err(|err| format!("{}", err))
- .unwrap();
-
- let default_bytes_re = ExecBuilder::new("^S")
- .only_utf8(false)
- .build()
- .map(|exec| exec.into_byte_regex())
- .map_err(|err| format!("{}", err))
- .unwrap();
-
- let input = vec![83, 83];
-
- let s1 = backtrack_bytes_re.split(&input);
- let s2 = default_bytes_re.split(&input);
- for (chunk1, chunk2) in s1.zip(s2) {
- assert_eq!(chunk1, chunk2);
- }
- }
-
- #[test]
- fn unicode_lit_star_backtracking_utf8bytes_default_utf8bytes_mismatch() {
- use crate::internal::ExecBuilder;
-
- let backtrack_bytes_re = ExecBuilder::new(r"^(?u:\*)")
- .bounded_backtracking()
- .bytes(true)
- .build()
- .map(|exec| exec.into_regex())
- .map_err(|err| format!("{}", err))
- .unwrap();
-
- let default_bytes_re = ExecBuilder::new(r"^(?u:\*)")
- .bytes(true)
- .build()
- .map(|exec| exec.into_regex())
- .map_err(|err| format!("{}", err))
- .unwrap();
-
- let input = "**";
-
- let s1 = backtrack_bytes_re.split(input);
- let s2 = default_bytes_re.split(input);
- for (chunk1, chunk2) in s1.zip(s2) {
- assert_eq!(chunk1, chunk2);
- }
- }
-}
diff --git a/vendor/regex/src/expand.rs b/vendor/regex/src/expand.rs
deleted file mode 100644
index 98fafc949..000000000
--- a/vendor/regex/src/expand.rs
+++ /dev/null
@@ -1,247 +0,0 @@
-use std::str;
-
-use crate::find_byte::find_byte;
-
-use crate::re_bytes;
-use crate::re_unicode;
-
-pub fn expand_str(
- caps: &re_unicode::Captures<'_>,
- mut replacement: &str,
- dst: &mut String,
-) {
- while !replacement.is_empty() {
- match find_byte(b'$', replacement.as_bytes()) {
- None => break,
- Some(i) => {
- dst.push_str(&replacement[..i]);
- replacement = &replacement[i..];
- }
- }
- if replacement.as_bytes().get(1).map_or(false, |&b| b == b'$') {
- dst.push_str("$");
- replacement = &replacement[2..];
- continue;
- }
- debug_assert!(!replacement.is_empty());
- let cap_ref = match find_cap_ref(replacement.as_bytes()) {
- Some(cap_ref) => cap_ref,
- None => {
- dst.push_str("$");
- replacement = &replacement[1..];
- continue;
- }
- };
- replacement = &replacement[cap_ref.end..];
- match cap_ref.cap {
- Ref::Number(i) => {
- dst.push_str(caps.get(i).map(|m| m.as_str()).unwrap_or(""));
- }
- Ref::Named(name) => {
- dst.push_str(
- caps.name(name).map(|m| m.as_str()).unwrap_or(""),
- );
- }
- }
- }
- dst.push_str(replacement);
-}
-
-pub fn expand_bytes(
- caps: &re_bytes::Captures<'_>,
- mut replacement: &[u8],
- dst: &mut Vec<u8>,
-) {
- while !replacement.is_empty() {
- match find_byte(b'$', replacement) {
- None => break,
- Some(i) => {
- dst.extend(&replacement[..i]);
- replacement = &replacement[i..];
- }
- }
- if replacement.get(1).map_or(false, |&b| b == b'$') {
- dst.push(b'$');
- replacement = &replacement[2..];
- continue;
- }
- debug_assert!(!replacement.is_empty());
- let cap_ref = match find_cap_ref(replacement) {
- Some(cap_ref) => cap_ref,
- None => {
- dst.push(b'$');
- replacement = &replacement[1..];
- continue;
- }
- };
- replacement = &replacement[cap_ref.end..];
- match cap_ref.cap {
- Ref::Number(i) => {
- dst.extend(caps.get(i).map(|m| m.as_bytes()).unwrap_or(b""));
- }
- Ref::Named(name) => {
- dst.extend(
- caps.name(name).map(|m| m.as_bytes()).unwrap_or(b""),
- );
- }
- }
- }
- dst.extend(replacement);
-}
-
-/// `CaptureRef` represents a reference to a capture group inside some text.
-/// The reference is either a capture group name or a number.
-///
-/// It is also tagged with the position in the text following the
-/// capture reference.
-#[derive(Clone, Copy, Debug, Eq, PartialEq)]
-struct CaptureRef<'a> {
- cap: Ref<'a>,
- end: usize,
-}
-
-/// A reference to a capture group in some text.
-///
-/// e.g., `$2`, `$foo`, `${foo}`.
-#[derive(Clone, Copy, Debug, Eq, PartialEq)]
-enum Ref<'a> {
- Named(&'a str),
- Number(usize),
-}
-
-impl<'a> From<&'a str> for Ref<'a> {
- fn from(x: &'a str) -> Ref<'a> {
- Ref::Named(x)
- }
-}
-
-impl From<usize> for Ref<'static> {
- fn from(x: usize) -> Ref<'static> {
- Ref::Number(x)
- }
-}
-
-/// Parses a possible reference to a capture group name in the given text,
-/// starting at the beginning of `replacement`.
-///
-/// If no such valid reference could be found, None is returned.
-fn find_cap_ref(replacement: &[u8]) -> Option<CaptureRef<'_>> {
- let mut i = 0;
- let rep: &[u8] = replacement;
- if rep.len() <= 1 || rep[0] != b'$' {
- return None;
- }
- i += 1;
- if rep[i] == b'{' {
- return find_cap_ref_braced(rep, i + 1);
- }
- let mut cap_end = i;
- while rep.get(cap_end).copied().map_or(false, is_valid_cap_letter) {
- cap_end += 1;
- }
- if cap_end == i {
- return None;
- }
- // We just verified that the range 0..cap_end is valid ASCII, so it must
- // therefore be valid UTF-8. If we really cared, we could avoid this UTF-8
- // check via an unchecked conversion or by parsing the number straight from
- // &[u8].
- let cap =
- str::from_utf8(&rep[i..cap_end]).expect("valid UTF-8 capture name");
- Some(CaptureRef {
- cap: match cap.parse::<u32>() {
- Ok(i) => Ref::Number(i as usize),
- Err(_) => Ref::Named(cap),
- },
- end: cap_end,
- })
-}
-
-fn find_cap_ref_braced(rep: &[u8], mut i: usize) -> Option<CaptureRef<'_>> {
- let start = i;
- while rep.get(i).map_or(false, |&b| b != b'}') {
- i += 1;
- }
- if !rep.get(i).map_or(false, |&b| b == b'}') {
- return None;
- }
- // When looking at braced names, we don't put any restrictions on the name,
- // so it's possible it could be invalid UTF-8. But a capture group name
- // can never be invalid UTF-8, so if we have invalid UTF-8, then we can
- // safely return None.
- let cap = match str::from_utf8(&rep[start..i]) {
- Err(_) => return None,
- Ok(cap) => cap,
- };
- Some(CaptureRef {
- cap: match cap.parse::<u32>() {
- Ok(i) => Ref::Number(i as usize),
- Err(_) => Ref::Named(cap),
- },
- end: i + 1,
- })
-}
-
-/// Returns true if and only if the given byte is allowed in a capture name
-/// written in non-brace form.
-fn is_valid_cap_letter(b: u8) -> bool {
- match b {
- b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' | b'_' => true,
- _ => false,
- }
-}
-
-#[cfg(test)]
-mod tests {
- use super::{find_cap_ref, CaptureRef};
-
- macro_rules! find {
- ($name:ident, $text:expr) => {
- #[test]
- fn $name() {
- assert_eq!(None, find_cap_ref($text.as_bytes()));
- }
- };
- ($name:ident, $text:expr, $capref:expr) => {
- #[test]
- fn $name() {
- assert_eq!(Some($capref), find_cap_ref($text.as_bytes()));
- }
- };
- }
-
- macro_rules! c {
- ($name_or_number:expr, $pos:expr) => {
- CaptureRef { cap: $name_or_number.into(), end: $pos }
- };
- }
-
- find!(find_cap_ref1, "$foo", c!("foo", 4));
- find!(find_cap_ref2, "${foo}", c!("foo", 6));
- find!(find_cap_ref3, "$0", c!(0, 2));
- find!(find_cap_ref4, "$5", c!(5, 2));
- find!(find_cap_ref5, "$10", c!(10, 3));
- // See https://github.com/rust-lang/regex/pull/585
- // for more on characters following numbers
- find!(find_cap_ref6, "$42a", c!("42a", 4));
- find!(find_cap_ref7, "${42}a", c!(42, 5));
- find!(find_cap_ref8, "${42");
- find!(find_cap_ref9, "${42 ");
- find!(find_cap_ref10, " $0 ");
- find!(find_cap_ref11, "$");
- find!(find_cap_ref12, " ");
- find!(find_cap_ref13, "");
- find!(find_cap_ref14, "$1-$2", c!(1, 2));
- find!(find_cap_ref15, "$1_$2", c!("1_", 3));
- find!(find_cap_ref16, "$x-$y", c!("x", 2));
- find!(find_cap_ref17, "$x_$y", c!("x_", 3));
- find!(find_cap_ref18, "${#}", c!("#", 4));
- find!(find_cap_ref19, "${Z[}", c!("Z[", 5));
- find!(find_cap_ref20, "${¾}", c!("¾", 5));
- find!(find_cap_ref21, "${¾a}", c!("¾a", 6));
- find!(find_cap_ref22, "${a¾}", c!("a¾", 6));
- find!(find_cap_ref23, "${☃}", c!("☃", 6));
- find!(find_cap_ref24, "${a☃}", c!("a☃", 7));
- find!(find_cap_ref25, "${☃a}", c!("☃a", 7));
- find!(find_cap_ref26, "${名字}", c!("名字", 9));
-}
diff --git a/vendor/regex/src/find_byte.rs b/vendor/regex/src/find_byte.rs
index e95f72afb..9c6915db4 100644
--- a/vendor/regex/src/find_byte.rs
+++ b/vendor/regex/src/find_byte.rs
@@ -2,7 +2,7 @@
///
/// If the perf-literal feature is enabled, then this uses the super optimized
/// memchr crate. Otherwise, it uses the naive byte-at-a-time implementation.
-pub fn find_byte(needle: u8, haystack: &[u8]) -> Option<usize> {
+pub(crate) fn find_byte(needle: u8, haystack: &[u8]) -> Option<usize> {
#[cfg(not(feature = "perf-literal"))]
fn imp(needle: u8, haystack: &[u8]) -> Option<usize> {
haystack.iter().position(|&b| b == needle)
@@ -10,8 +10,7 @@ pub fn find_byte(needle: u8, haystack: &[u8]) -> Option<usize> {
#[cfg(feature = "perf-literal")]
fn imp(needle: u8, haystack: &[u8]) -> Option<usize> {
- use memchr::memchr;
- memchr(needle, haystack)
+ memchr::memchr(needle, haystack)
}
imp(needle, haystack)
diff --git a/vendor/regex/src/freqs.rs b/vendor/regex/src/freqs.rs
deleted file mode 100644
index fcffa95fb..000000000
--- a/vendor/regex/src/freqs.rs
+++ /dev/null
@@ -1,261 +0,0 @@
-// NOTE: The following code was generated by "scripts/frequencies.py", do not
-// edit directly
-
-pub const BYTE_FREQUENCIES: [u8; 256] = [
- 55, // '\x00'
- 52, // '\x01'
- 51, // '\x02'
- 50, // '\x03'
- 49, // '\x04'
- 48, // '\x05'
- 47, // '\x06'
- 46, // '\x07'
- 45, // '\x08'
- 103, // '\t'
- 242, // '\n'
- 66, // '\x0b'
- 67, // '\x0c'
- 229, // '\r'
- 44, // '\x0e'
- 43, // '\x0f'
- 42, // '\x10'
- 41, // '\x11'
- 40, // '\x12'
- 39, // '\x13'
- 38, // '\x14'
- 37, // '\x15'
- 36, // '\x16'
- 35, // '\x17'
- 34, // '\x18'
- 33, // '\x19'
- 56, // '\x1a'
- 32, // '\x1b'
- 31, // '\x1c'
- 30, // '\x1d'
- 29, // '\x1e'
- 28, // '\x1f'
- 255, // ' '
- 148, // '!'
- 164, // '"'
- 149, // '#'
- 136, // '$'
- 160, // '%'
- 155, // '&'
- 173, // "'"
- 221, // '('
- 222, // ')'
- 134, // '*'
- 122, // '+'
- 232, // ','
- 202, // '-'
- 215, // '.'
- 224, // '/'
- 208, // '0'
- 220, // '1'
- 204, // '2'
- 187, // '3'
- 183, // '4'
- 179, // '5'
- 177, // '6'
- 168, // '7'
- 178, // '8'
- 200, // '9'
- 226, // ':'
- 195, // ';'
- 154, // '<'
- 184, // '='
- 174, // '>'
- 126, // '?'
- 120, // '@'
- 191, // 'A'
- 157, // 'B'
- 194, // 'C'
- 170, // 'D'
- 189, // 'E'
- 162, // 'F'
- 161, // 'G'
- 150, // 'H'
- 193, // 'I'
- 142, // 'J'
- 137, // 'K'
- 171, // 'L'
- 176, // 'M'
- 185, // 'N'
- 167, // 'O'
- 186, // 'P'
- 112, // 'Q'
- 175, // 'R'
- 192, // 'S'
- 188, // 'T'
- 156, // 'U'
- 140, // 'V'
- 143, // 'W'
- 123, // 'X'
- 133, // 'Y'
- 128, // 'Z'
- 147, // '['
- 138, // '\\'
- 146, // ']'
- 114, // '^'
- 223, // '_'
- 151, // '`'
- 249, // 'a'
- 216, // 'b'
- 238, // 'c'
- 236, // 'd'
- 253, // 'e'
- 227, // 'f'
- 218, // 'g'
- 230, // 'h'
- 247, // 'i'
- 135, // 'j'
- 180, // 'k'
- 241, // 'l'
- 233, // 'm'
- 246, // 'n'
- 244, // 'o'
- 231, // 'p'
- 139, // 'q'
- 245, // 'r'
- 243, // 's'
- 251, // 't'
- 235, // 'u'
- 201, // 'v'
- 196, // 'w'
- 240, // 'x'
- 214, // 'y'
- 152, // 'z'
- 182, // '{'
- 205, // '|'
- 181, // '}'
- 127, // '~'
- 27, // '\x7f'
- 212, // '\x80'
- 211, // '\x81'
- 210, // '\x82'
- 213, // '\x83'
- 228, // '\x84'
- 197, // '\x85'
- 169, // '\x86'
- 159, // '\x87'
- 131, // '\x88'
- 172, // '\x89'
- 105, // '\x8a'
- 80, // '\x8b'
- 98, // '\x8c'
- 96, // '\x8d'
- 97, // '\x8e'
- 81, // '\x8f'
- 207, // '\x90'
- 145, // '\x91'
- 116, // '\x92'
- 115, // '\x93'
- 144, // '\x94'
- 130, // '\x95'
- 153, // '\x96'
- 121, // '\x97'
- 107, // '\x98'
- 132, // '\x99'
- 109, // '\x9a'
- 110, // '\x9b'
- 124, // '\x9c'
- 111, // '\x9d'
- 82, // '\x9e'
- 108, // '\x9f'
- 118, // '\xa0'
- 141, // '¡'
- 113, // '¢'
- 129, // '£'
- 119, // '¤'
- 125, // '¥'
- 165, // '¦'
- 117, // '§'
- 92, // '¨'
- 106, // '©'
- 83, // 'ª'
- 72, // '«'
- 99, // '¬'
- 93, // '\xad'
- 65, // '®'
- 79, // '¯'
- 166, // '°'
- 237, // '±'
- 163, // '²'
- 199, // '³'
- 190, // '´'
- 225, // 'µ'
- 209, // '¶'
- 203, // '·'
- 198, // '¸'
- 217, // '¹'
- 219, // 'º'
- 206, // '»'
- 234, // '¼'
- 248, // '½'
- 158, // '¾'
- 239, // '¿'
- 255, // 'À'
- 255, // 'Á'
- 255, // 'Â'
- 255, // 'Ã'
- 255, // 'Ä'
- 255, // 'Å'
- 255, // 'Æ'
- 255, // 'Ç'
- 255, // 'È'
- 255, // 'É'
- 255, // 'Ê'
- 255, // 'Ë'
- 255, // 'Ì'
- 255, // 'Í'
- 255, // 'Î'
- 255, // 'Ï'
- 255, // 'Ð'
- 255, // 'Ñ'
- 255, // 'Ò'
- 255, // 'Ó'
- 255, // 'Ô'
- 255, // 'Õ'
- 255, // 'Ö'
- 255, // '×'
- 255, // 'Ø'
- 255, // 'Ù'
- 255, // 'Ú'
- 255, // 'Û'
- 255, // 'Ü'
- 255, // 'Ý'
- 255, // 'Þ'
- 255, // 'ß'
- 255, // 'à'
- 255, // 'á'
- 255, // 'â'
- 255, // 'ã'
- 255, // 'ä'
- 255, // 'å'
- 255, // 'æ'
- 255, // 'ç'
- 255, // 'è'
- 255, // 'é'
- 255, // 'ê'
- 255, // 'ë'
- 255, // 'ì'
- 255, // 'í'
- 255, // 'î'
- 255, // 'ï'
- 255, // 'ð'
- 255, // 'ñ'
- 255, // 'ò'
- 255, // 'ó'
- 255, // 'ô'
- 255, // 'õ'
- 255, // 'ö'
- 255, // '÷'
- 255, // 'ø'
- 255, // 'ù'
- 255, // 'ú'
- 255, // 'û'
- 255, // 'ü'
- 255, // 'ý'
- 255, // 'þ'
- 255, // 'ÿ'
-];
diff --git a/vendor/regex/src/input.rs b/vendor/regex/src/input.rs
deleted file mode 100644
index df6c3e0c9..000000000
--- a/vendor/regex/src/input.rs
+++ /dev/null
@@ -1,432 +0,0 @@
-use std::char;
-use std::cmp::Ordering;
-use std::fmt;
-use std::ops;
-use std::u32;
-
-use crate::literal::LiteralSearcher;
-use crate::prog::InstEmptyLook;
-use crate::utf8::{decode_last_utf8, decode_utf8};
-
-/// Represents a location in the input.
-#[derive(Clone, Copy, Debug)]
-pub struct InputAt {
- pos: usize,
- c: Char,
- byte: Option<u8>,
- len: usize,
-}
-
-impl InputAt {
- /// Returns true iff this position is at the beginning of the input.
- pub fn is_start(&self) -> bool {
- self.pos == 0
- }
-
- /// Returns true iff this position is past the end of the input.
- pub fn is_end(&self) -> bool {
- self.c.is_none() && self.byte.is_none()
- }
-
- /// Returns the character at this position.
- ///
- /// If this position is just before or after the input, then an absent
- /// character is returned.
- pub fn char(&self) -> Char {
- self.c
- }
-
- /// Returns the byte at this position.
- pub fn byte(&self) -> Option<u8> {
- self.byte
- }
-
- /// Returns the UTF-8 width of the character at this position.
- pub fn len(&self) -> usize {
- self.len
- }
-
- /// Returns whether the UTF-8 width of the character at this position
- /// is zero.
- pub fn is_empty(&self) -> bool {
- self.len == 0
- }
-
- /// Returns the byte offset of this position.
- pub fn pos(&self) -> usize {
- self.pos
- }
-
- /// Returns the byte offset of the next position in the input.
- pub fn next_pos(&self) -> usize {
- self.pos + self.len
- }
-}
-
-/// An abstraction over input used in the matching engines.
-pub trait Input: fmt::Debug {
- /// Return an encoding of the position at byte offset `i`.
- fn at(&self, i: usize) -> InputAt;
-
- /// Return the Unicode character occurring next to `at`.
- ///
- /// If no such character could be decoded, then `Char` is absent.
- fn next_char(&self, at: InputAt) -> Char;
-
- /// Return the Unicode character occurring previous to `at`.
- ///
- /// If no such character could be decoded, then `Char` is absent.
- fn previous_char(&self, at: InputAt) -> Char;
-
- /// Return true if the given empty width instruction matches at the
- /// input position given.
- fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool;
-
- /// Scan the input for a matching prefix.
- fn prefix_at(
- &self,
- prefixes: &LiteralSearcher,
- at: InputAt,
- ) -> Option<InputAt>;
-
- /// The number of bytes in the input.
- fn len(&self) -> usize;
-
- /// Whether the input is empty.
- fn is_empty(&self) -> bool {
- self.len() == 0
- }
-
- /// Return the given input as a sequence of bytes.
- fn as_bytes(&self) -> &[u8];
-}
-
-impl<'a, T: Input> Input for &'a T {
- fn at(&self, i: usize) -> InputAt {
- (**self).at(i)
- }
-
- fn next_char(&self, at: InputAt) -> Char {
- (**self).next_char(at)
- }
-
- fn previous_char(&self, at: InputAt) -> Char {
- (**self).previous_char(at)
- }
-
- fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool {
- (**self).is_empty_match(at, empty)
- }
-
- fn prefix_at(
- &self,
- prefixes: &LiteralSearcher,
- at: InputAt,
- ) -> Option<InputAt> {
- (**self).prefix_at(prefixes, at)
- }
-
- fn len(&self) -> usize {
- (**self).len()
- }
-
- fn as_bytes(&self) -> &[u8] {
- (**self).as_bytes()
- }
-}
-
-/// An input reader over characters.
-#[derive(Clone, Copy, Debug)]
-pub struct CharInput<'t>(&'t [u8]);
-
-impl<'t> CharInput<'t> {
- /// Return a new character input reader for the given string.
- pub fn new(s: &'t [u8]) -> CharInput<'t> {
- CharInput(s)
- }
-}
-
-impl<'t> ops::Deref for CharInput<'t> {
- type Target = [u8];
-
- fn deref(&self) -> &[u8] {
- self.0
- }
-}
-
-impl<'t> Input for CharInput<'t> {
- fn at(&self, i: usize) -> InputAt {
- if i >= self.len() {
- InputAt { pos: self.len(), c: None.into(), byte: None, len: 0 }
- } else {
- let c = decode_utf8(&self[i..]).map(|(c, _)| c).into();
- InputAt { pos: i, c, byte: None, len: c.len_utf8() }
- }
- }
-
- fn next_char(&self, at: InputAt) -> Char {
- at.char()
- }
-
- fn previous_char(&self, at: InputAt) -> Char {
- decode_last_utf8(&self[..at.pos()]).map(|(c, _)| c).into()
- }
-
- fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool {
- use crate::prog::EmptyLook::*;
- match empty.look {
- StartLine => {
- let c = self.previous_char(at);
- at.pos() == 0 || c == '\n'
- }
- EndLine => {
- let c = self.next_char(at);
- at.pos() == self.len() || c == '\n'
- }
- StartText => at.pos() == 0,
- EndText => at.pos() == self.len(),
- WordBoundary => {
- let (c1, c2) = (self.previous_char(at), self.next_char(at));
- c1.is_word_char() != c2.is_word_char()
- }
- NotWordBoundary => {
- let (c1, c2) = (self.previous_char(at), self.next_char(at));
- c1.is_word_char() == c2.is_word_char()
- }
- WordBoundaryAscii => {
- let (c1, c2) = (self.previous_char(at), self.next_char(at));
- c1.is_word_byte() != c2.is_word_byte()
- }
- NotWordBoundaryAscii => {
- let (c1, c2) = (self.previous_char(at), self.next_char(at));
- c1.is_word_byte() == c2.is_word_byte()
- }
- }
- }
-
- fn prefix_at(
- &self,
- prefixes: &LiteralSearcher,
- at: InputAt,
- ) -> Option<InputAt> {
- prefixes.find(&self[at.pos()..]).map(|(s, _)| self.at(at.pos() + s))
- }
-
- fn len(&self) -> usize {
- self.0.len()
- }
-
- fn as_bytes(&self) -> &[u8] {
- self.0
- }
-}
-
-/// An input reader over bytes.
-#[derive(Clone, Copy, Debug)]
-pub struct ByteInput<'t> {
- text: &'t [u8],
- only_utf8: bool,
-}
-
-impl<'t> ByteInput<'t> {
- /// Return a new byte-based input reader for the given string.
- pub fn new(text: &'t [u8], only_utf8: bool) -> ByteInput<'t> {
- ByteInput { text, only_utf8 }
- }
-}
-
-impl<'t> ops::Deref for ByteInput<'t> {
- type Target = [u8];
-
- fn deref(&self) -> &[u8] {
- self.text
- }
-}
-
-impl<'t> Input for ByteInput<'t> {
- fn at(&self, i: usize) -> InputAt {
- if i >= self.len() {
- InputAt { pos: self.len(), c: None.into(), byte: None, len: 0 }
- } else {
- InputAt {
- pos: i,
- c: None.into(),
- byte: self.get(i).cloned(),
- len: 1,
- }
- }
- }
-
- fn next_char(&self, at: InputAt) -> Char {
- decode_utf8(&self[at.pos()..]).map(|(c, _)| c).into()
- }
-
- fn previous_char(&self, at: InputAt) -> Char {
- decode_last_utf8(&self[..at.pos()]).map(|(c, _)| c).into()
- }
-
- fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool {
- use crate::prog::EmptyLook::*;
- match empty.look {
- StartLine => {
- let c = self.previous_char(at);
- at.pos() == 0 || c == '\n'
- }
- EndLine => {
- let c = self.next_char(at);
- at.pos() == self.len() || c == '\n'
- }
- StartText => at.pos() == 0,
- EndText => at.pos() == self.len(),
- WordBoundary => {
- let (c1, c2) = (self.previous_char(at), self.next_char(at));
- c1.is_word_char() != c2.is_word_char()
- }
- NotWordBoundary => {
- let (c1, c2) = (self.previous_char(at), self.next_char(at));
- c1.is_word_char() == c2.is_word_char()
- }
- WordBoundaryAscii => {
- let (c1, c2) = (self.previous_char(at), self.next_char(at));
- if self.only_utf8 {
- // If we must match UTF-8, then we can't match word
- // boundaries at invalid UTF-8.
- if c1.is_none() && !at.is_start() {
- return false;
- }
- if c2.is_none() && !at.is_end() {
- return false;
- }
- }
- c1.is_word_byte() != c2.is_word_byte()
- }
- NotWordBoundaryAscii => {
- let (c1, c2) = (self.previous_char(at), self.next_char(at));
- if self.only_utf8 {
- // If we must match UTF-8, then we can't match word
- // boundaries at invalid UTF-8.
- if c1.is_none() && !at.is_start() {
- return false;
- }
- if c2.is_none() && !at.is_end() {
- return false;
- }
- }
- c1.is_word_byte() == c2.is_word_byte()
- }
- }
- }
-
- fn prefix_at(
- &self,
- prefixes: &LiteralSearcher,
- at: InputAt,
- ) -> Option<InputAt> {
- prefixes.find(&self[at.pos()..]).map(|(s, _)| self.at(at.pos() + s))
- }
-
- fn len(&self) -> usize {
- self.text.len()
- }
-
- fn as_bytes(&self) -> &[u8] {
- self.text
- }
-}
-
-/// An inline representation of `Option<char>`.
-///
-/// This eliminates the need to do case analysis on `Option<char>` to determine
-/// ordinality with other characters.
-///
-/// (The `Option<char>` is not related to encoding. Instead, it is used in the
-/// matching engines to represent the beginning and ending boundaries of the
-/// search text.)
-#[derive(Clone, Copy, Hash, PartialEq, Eq, PartialOrd, Ord)]
-pub struct Char(u32);
-
-impl fmt::Debug for Char {
- fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
- match char::from_u32(self.0) {
- None => write!(f, "Empty"),
- Some(c) => write!(f, "{:?}", c),
- }
- }
-}
-
-impl Char {
- /// Returns true iff the character is absent.
- #[inline]
- pub fn is_none(self) -> bool {
- self.0 == u32::MAX
- }
-
- /// Returns the length of the character's UTF-8 encoding.
- ///
- /// If the character is absent, then `1` is returned.
- #[inline]
- pub fn len_utf8(self) -> usize {
- char::from_u32(self.0).map_or(1, |c| c.len_utf8())
- }
-
- /// Returns true iff the character is a word character.
- ///
- /// If the character is absent, then false is returned.
- pub fn is_word_char(self) -> bool {
- // is_word_character can panic if the Unicode data for \w isn't
- // available. However, our compiler ensures that if a Unicode word
- // boundary is used, then the data must also be available. If it isn't,
- // then the compiler returns an error.
- char::from_u32(self.0).map_or(false, regex_syntax::is_word_character)
- }
-
- /// Returns true iff the byte is a word byte.
- ///
- /// If the byte is absent, then false is returned.
- pub fn is_word_byte(self) -> bool {
- match char::from_u32(self.0) {
- Some(c) if c <= '\u{7F}' => regex_syntax::is_word_byte(c as u8),
- None | Some(_) => false,
- }
- }
-}
-
-impl From<char> for Char {
- fn from(c: char) -> Char {
- Char(c as u32)
- }
-}
-
-impl From<Option<char>> for Char {
- fn from(c: Option<char>) -> Char {
- c.map_or(Char(u32::MAX), |c| c.into())
- }
-}
-
-impl PartialEq<char> for Char {
- #[inline]
- fn eq(&self, other: &char) -> bool {
- self.0 == *other as u32
- }
-}
-
-impl PartialEq<Char> for char {
- #[inline]
- fn eq(&self, other: &Char) -> bool {
- *self as u32 == other.0
- }
-}
-
-impl PartialOrd<char> for Char {
- #[inline]
- fn partial_cmp(&self, other: &char) -> Option<Ordering> {
- self.0.partial_cmp(&(*other as u32))
- }
-}
-
-impl PartialOrd<Char> for char {
- #[inline]
- fn partial_cmp(&self, other: &Char) -> Option<Ordering> {
- (*self as u32).partial_cmp(&other.0)
- }
-}
diff --git a/vendor/regex/src/lib.rs b/vendor/regex/src/lib.rs
index 82c1b77ad..6dbd3c202 100644
--- a/vendor/regex/src/lib.rs
+++ b/vendor/regex/src/lib.rs
@@ -1,146 +1,371 @@
/*!
-This crate provides a library for parsing, compiling, and executing regular
-expressions. Its syntax is similar to Perl-style regular expressions, but lacks
-a few features like look around and backreferences. In exchange, all searches
-execute in linear time with respect to the size of the regular expression and
-search text.
+This crate provides routines for searching strings for matches of a [regular
+expression] (aka "regex"). The regex syntax supported by this crate is similar
+to other regex engines, but it lacks several features that are not known how to
+implement efficiently. This includes, but is not limited to, look-around and
+backreferences. In exchange, all regex searches in this crate have worst case
+`O(m * n)` time complexity, where `m` is proportional to the size of the regex
+and `n` is proportional to the size of the string being searched.
-This crate's documentation provides some simple examples, describes
-[Unicode support](#unicode) and exhaustively lists the
-[supported syntax](#syntax).
+[regular expression]: https://en.wikipedia.org/wiki/Regular_expression
-For more specific details on the API for regular expressions, please see the
-documentation for the [`Regex`](struct.Regex.html) type.
+If you just want API documentation, then skip to the [`Regex`] type. Otherwise,
+here's a quick example showing one way of parsing the output of a grep-like
+program:
+
+```rust
+use regex::Regex;
+
+let re = Regex::new(r"(?m)^([^:]+):([0-9]+):(.+)$").unwrap();
+let hay = "\
+path/to/foo:54:Blue Harvest
+path/to/bar:90:Something, Something, Something, Dark Side
+path/to/baz:3:It's a Trap!
+";
+
+let mut results = vec![];
+for (_, [path, lineno, line]) in re.captures_iter(hay).map(|c| c.extract()) {
+ results.push((path, lineno.parse::<u64>()?, line));
+}
+assert_eq!(results, vec![
+ ("path/to/foo", 54, "Blue Harvest"),
+ ("path/to/bar", 90, "Something, Something, Something, Dark Side"),
+ ("path/to/baz", 3, "It's a Trap!"),
+]);
+# Ok::<(), Box<dyn std::error::Error>>(())
+```
+
+# Overview
+
+The primary type in this crate is a [`Regex`]. Its most important methods are
+as follows:
+
+* [`Regex::new`] compiles a regex using the default configuration. A
+[`RegexBuilder`] permits setting a non-default configuration. (For example,
+case insensitive matching, verbose mode and others.)
+* [`Regex::is_match`] reports whether a match exists in a particular haystack.
+* [`Regex::find`] reports the byte offsets of a match in a haystack, if one
+exists. [`Regex::find_iter`] returns an iterator over all such matches.
+* [`Regex::captures`] returns a [`Captures`], which reports both the byte
+offsets of a match in a haystack and the byte offsets of each matching capture
+group from the regex in the haystack.
+[`Regex::captures_iter`] returns an iterator over all such matches.
+
+There is also a [`RegexSet`], which permits searching for multiple regex
+patterns simultaneously in a single search. However, it currently only reports
+which patterns match and *not* the byte offsets of a match.
+
+Otherwise, this top-level crate documentation is organized as follows:
+
+* [Usage](#usage) shows how to add the `regex` crate to your Rust project.
+* [Examples](#examples) provides a limited selection of regex search examples.
+* [Performance](#performance) provides a brief summary of how to optimize regex
+searching speed.
+* [Unicode](#unicode) discusses support for non-ASCII patterns.
+* [Syntax](#syntax) enumerates the specific regex syntax supported by this
+crate.
+* [Untrusted input](#untrusted-input) discusses how this crate deals with regex
+patterns or haystacks that are untrusted.
+* [Crate features](#crate-features) documents the Cargo features that can be
+enabled or disabled for this crate.
+* [Other crates](#other-crates) links to other crates in the `regex` family.
# Usage
-This crate is [on crates.io](https://crates.io/crates/regex) and can be
+The `regex` crate is [on crates.io](https://crates.io/crates/regex) and can be
used by adding `regex` to your dependencies in your project's `Cargo.toml`.
+Or more simply, just run `cargo add regex`.
+
+Here is a complete example that creates a new Rust project, adds a dependency
+on `regex`, creates the source code for a regex search and then runs the
+program.
+
+First, create the project in a new directory:
-```toml
-[dependencies]
-regex = "1"
+```text
+$ mkdir regex-example
+$ cd regex-example
+$ cargo init
```
-# Example: find a date
+Second, add a dependency on `regex`:
-General use of regular expressions in this package involves compiling an
-expression and then using it to search, split or replace text. For example,
-to confirm that some text resembles a date:
+```text
+$ cargo add regex
+```
+
+Third, edit `src/main.rs`. Delete what's there and replace it with this:
+
+```
+use regex::Regex;
+
+fn main() {
+ let re = Regex::new(r"Hello (?<name>\w+)!").unwrap();
+ let Some(caps) = re.captures("Hello Murphy!") else {
+ println!("no match!");
+ return;
+ };
+ println!("The name is: {}", &caps["name"]);
+}
+```
+
+Fourth, run it with `cargo run`:
+
+```text
+$ cargo run
+ Compiling memchr v2.5.0
+ Compiling regex-syntax v0.7.1
+ Compiling aho-corasick v1.0.1
+ Compiling regex v1.8.1
+ Compiling regex-example v0.1.0 (/tmp/regex-example)
+ Finished dev [unoptimized + debuginfo] target(s) in 4.22s
+ Running `target/debug/regex-example`
+The name is: Murphy
+```
+
+The first time you run the program will show more output like above. But
+subsequent runs shouldn't have to re-compile the dependencies.
+
+# Examples
+
+This section provides a few examples, in tutorial style, showing how to
+search a haystack with a regex. There are more examples throughout the API
+documentation.
+
+Before starting though, it's worth defining a few terms:
+
+* A **regex** is a Rust value whose type is `Regex`. We use `re` as a
+variable name for a regex.
+* A **pattern** is the string that is used to build a regex. We use `pat` as
+a variable name for a pattern.
+* A **haystack** is the string that is searched by a regex. We use `hay` as a
+variable name for a haystack.
+
+Sometimes the words "regex" and "pattern" are used interchangeably.
+
+General use of regular expressions in this crate proceeds by compiling a
+**pattern** into a **regex**, and then using that regex to search, split or
+replace parts of a **haystack**.
+
+### Example: find a middle initial
+
+We'll start off with a very simple example: a regex that looks for a specific
+name but uses a wildcard to match a middle initial. Our pattern serves as
+something like a template that will match a particular name with *any* middle
+initial.
```rust
use regex::Regex;
-let re = Regex::new(r"^\d{4}-\d{2}-\d{2}$").unwrap();
-assert!(re.is_match("2014-01-01"));
+
+// We use 'unwrap()' here because it would be a bug in our program if the
+// pattern failed to compile to a regex. Panicking in the presence of a bug
+// is okay.
+let re = Regex::new(r"Homer (.)\. Simpson").unwrap();
+let hay = "Homer J. Simpson";
+let Some(caps) = re.captures(hay) else { return };
+assert_eq!("J", &caps[1]);
```
-Notice the use of the `^` and `$` anchors. In this crate, every expression
-is executed with an implicit `.*?` at the beginning and end, which allows
-it to match anywhere in the text. Anchors can be used to ensure that the
-full text matches an expression.
+There are a few things worth noticing here in our first example:
-This example also demonstrates the utility of
-[raw strings](https://doc.rust-lang.org/stable/reference/tokens.html#raw-string-literals)
-in Rust, which
-are just like regular strings except they are prefixed with an `r` and do
-not process any escape sequences. For example, `"\\d"` is the same
-expression as `r"\d"`.
+* The `.` is a special pattern meta character that means "match any single
+character except for new lines." (More precisely, in this crate, it means
+"match any UTF-8 encoding of any Unicode scalar value other than `\n`.")
+* We can match an actual `.` literally by escaping it, i.e., `\.`.
+* We use Rust's [raw strings] to avoid needing to deal with escape sequences in
+both the regex pattern syntax and in Rust's string literal syntax. If we didn't
+use raw strings here, we would have had to use `\\.` to match a literal `.`
+character. That is, `r"\."` and `"\\."` are equivalent patterns.
+* We put our wildcard `.` instruction in parentheses. These parentheses have a
+special meaning that says, "make whatever part of the haystack matches within
+these parentheses available as a capturing group." After finding a match, we
+access this capture group with `&caps[1]`.
-# Example: Avoid compiling the same regex in a loop
+[raw strings]: https://doc.rust-lang.org/stable/reference/tokens.html#raw-string-literals
-It is an anti-pattern to compile the same regular expression in a loop
-since compilation is typically expensive. (It takes anywhere from a few
-microseconds to a few **milliseconds** depending on the size of the
-regex.) Not only is compilation itself expensive, but this also prevents
-optimizations that reuse allocations internally to the matching engines.
+Otherwise, we execute a search using `re.captures(hay)` and return from our
+function if no match occurred. We then reference the middle initial by asking
+for the part of the haystack that matched the capture group indexed at `1`.
+(The capture group at index 0 is implicit and always corresponds to the entire
+match. In this case, that's `Homer J. Simpson`.)
-In Rust, it can sometimes be a pain to pass regular expressions around if
-they're used from inside a helper function. Instead, we recommend using the
-[`lazy_static`](https://crates.io/crates/lazy_static) crate to ensure that
-regular expressions are compiled exactly once.
+### Example: named capture groups
-For example:
+Continuing from our middle initial example above, we can tweak the pattern
+slightly to give a name to the group that matches the middle initial:
```rust
-use lazy_static::lazy_static;
use regex::Regex;
-fn some_helper_function(text: &str) -> bool {
- lazy_static! {
- static ref RE: Regex = Regex::new("...").unwrap();
- }
- RE.is_match(text)
-}
+// Note that (?P<middle>.) is a different way to spell the same thing.
+let re = Regex::new(r"Homer (?<middle>.)\. Simpson").unwrap();
+let hay = "Homer J. Simpson";
+let Some(caps) = re.captures(hay) else { return };
+assert_eq!("J", &caps["middle"]);
+```
+
+Giving a name to a group can be useful when there are multiple groups in
+a pattern. It makes the code referring to those groups a bit easier to
+understand.
+
+### Example: validating a particular date format
+
+This example shows how to confirm whether a haystack, in its entirety, matches
+a particular date format:
+
+```rust
+use regex::Regex;
-fn main() {}
+let re = Regex::new(r"^\d{4}-\d{2}-\d{2}$").unwrap();
+assert!(re.is_match("2010-03-14"));
```
-Specifically, in this example, the regex will be compiled when it is used for
-the first time. On subsequent uses, it will reuse the previous compilation.
+Notice the use of the `^` and `$` anchors. In this crate, every regex search is
+run with an implicit `(?s:.)*?` at the beginning of its pattern, which allows
+the regex to match anywhere in a haystack. Anchors, as above, can be used to
+ensure that the full haystack matches a pattern.
+
+This crate is also Unicode aware by default, which means that `\d` might match
+more than you might expect it to. For example:
+
+```rust
+use regex::Regex;
+
+let re = Regex::new(r"^\d{4}-\d{2}-\d{2}$").unwrap();
+assert!(re.is_match("𝟚𝟘𝟙𝟘-𝟘𝟛-𝟙𝟜"));
+```
+
+To only match an ASCII decimal digit, all of the following are equivalent:
-# Example: iterating over capture groups
+* `[0-9]`
+* `(?-u:\d)`
+* `[[:digit:]]`
+* `[\d&&\p{ascii}]`
-This crate provides convenient iterators for matching an expression
-repeatedly against a search string to find successive non-overlapping
-matches. For example, to find all dates in a string and be able to access
-them by their component pieces:
+### Example: finding dates in a haystack
+
+In the previous example, we showed how one might validate that a haystack,
+in its entirety, corresponded to a particular date format. But what if we wanted
+to extract all things that look like dates in a specific format from a haystack?
+To do this, we can use an iterator API to find all matches (notice that we've
+removed the anchors and switched to looking for ASCII-only digits):
```rust
-# use regex::Regex;
-# fn main() {
-let re = Regex::new(r"(\d{4})-(\d{2})-(\d{2})").unwrap();
-let text = "2012-03-14, 2013-01-01 and 2014-07-05";
-for cap in re.captures_iter(text) {
- println!("Month: {} Day: {} Year: {}", &cap[2], &cap[3], &cap[1]);
-}
-// Output:
-// Month: 03 Day: 14 Year: 2012
-// Month: 01 Day: 01 Year: 2013
-// Month: 07 Day: 05 Year: 2014
-# }
+use regex::Regex;
+
+let re = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap();
+let hay = "What do 1865-04-14, 1881-07-02, 1901-09-06 and 1963-11-22 have in common?";
+// 'm' is a 'Match', and 'as_str()' returns the matching part of the haystack.
+let dates: Vec<&str> = re.find_iter(hay).map(|m| m.as_str()).collect();
+assert_eq!(dates, vec![
+ "1865-04-14",
+ "1881-07-02",
+ "1901-09-06",
+ "1963-11-22",
+]);
+```
+
+We can also iterate over [`Captures`] values instead of [`Match`] values, and
+that in turn permits accessing each component of the date via capturing groups:
+
+```rust
+use regex::Regex;
+
+let re = Regex::new(r"(?<y>[0-9]{4})-(?<m>[0-9]{2})-(?<d>[0-9]{2})").unwrap();
+let hay = "What do 1865-04-14, 1881-07-02, 1901-09-06 and 1963-11-22 have in common?";
+// 'caps' is a 'Captures', and 'name()' looks up a capture group by its name.
+let dates: Vec<(&str, &str, &str)> = re.captures_iter(hay).map(|caps| {
+ // The unwraps are okay because every capture group must match if the whole
+ // regex matches, and in this context, we know we have a match.
+ //
+ // Note that we use `caps.name("y").unwrap().as_str()` instead of
+ // `&caps["y"]` because the lifetime of the former is the same as the
+ // lifetime of `hay` above, but the lifetime of the latter is tied to the
+ // lifetime of `caps` due to how the `Index` trait is defined.
+ let year = caps.name("y").unwrap().as_str();
+ let month = caps.name("m").unwrap().as_str();
+ let day = caps.name("d").unwrap().as_str();
+ (year, month, day)
+}).collect();
+assert_eq!(dates, vec![
+ ("1865", "04", "14"),
+ ("1881", "07", "02"),
+ ("1901", "09", "06"),
+ ("1963", "11", "22"),
+]);
```
-Notice that the year is in the capture group indexed at `1`. This is
-because the *entire match* is stored in the capture group at index `0`.
+### Example: simpler capture group extraction
-# Example: replacement with named capture groups
+One can use [`Captures::extract`] to make the code from the previous example a
+bit simpler in this case:
+
+```rust
+use regex::Regex;
+
+let re = Regex::new(r"([0-9]{4})-([0-9]{2})-([0-9]{2})").unwrap();
+let hay = "What do 1865-04-14, 1881-07-02, 1901-09-06 and 1963-11-22 have in common?";
+let dates: Vec<(&str, &str, &str)> = re.captures_iter(hay).map(|caps| {
+ let (_, [year, month, day]) = caps.extract();
+ (year, month, day)
+}).collect();
+assert_eq!(dates, vec![
+ ("1865", "04", "14"),
+ ("1881", "07", "02"),
+ ("1901", "09", "06"),
+ ("1963", "11", "22"),
+]);
+```
+
+`Captures::extract` works by ensuring that the number of matching groups matches
+the number of groups requested via the `[year, month, day]` syntax. If they do,
+then the substrings for each corresponding capture group are automatically
+returned in an appropriately sized array. Rust's syntax for pattern matching
+arrays does the rest.
+
+### Example: replacement with named capture groups
Building on the previous example, perhaps we'd like to rearrange the date
-formats. This can be done with text replacement. But to make the code
-clearer, we can *name* our capture groups and use those names as variables
-in our replacement text:
+formats. This can be done by finding each match and replacing it with
+something different. The [`Regex::replace_all`] routine provides a convenient
+way to do this, including by supporting references to named groups in the
+replacement string:
```rust
-# use regex::Regex;
-# fn main() {
-let re = Regex::new(r"(?P<y>\d{4})-(?P<m>\d{2})-(?P<d>\d{2})").unwrap();
-let before = "2012-03-14, 2013-01-01 and 2014-07-05";
+use regex::Regex;
+
+let re = Regex::new(r"(?<y>\d{4})-(?<m>\d{2})-(?<d>\d{2})").unwrap();
+let before = "1973-01-05, 1975-08-25 and 1980-10-18";
let after = re.replace_all(before, "$m/$d/$y");
-assert_eq!(after, "03/14/2012, 01/01/2013 and 07/05/2014");
-# }
+assert_eq!(after, "01/05/1973, 08/25/1975 and 10/18/1980");
```
-The `replace` methods are actually polymorphic in the replacement, which
+The replace methods are actually polymorphic in the replacement, which
provides more flexibility than is seen here. (See the documentation for
-`Regex::replace` for more details.)
+[`Regex::replace`] for more details.)
+
+### Example: verbose mode
-Note that if your regex gets complicated, you can use the `x` flag to
-enable insignificant whitespace mode, which also lets you write comments:
+When your regex gets complicated, you might consider using something other
+than regex. But if you stick with regex, you can use the `x` flag to enable
+insignificant whitespace mode or "verbose mode." In this mode, whitespace
+is treated as insignificant and one may write comments. This may make your
+patterns easier to comprehend.
```rust
-# use regex::Regex;
-# fn main() {
+use regex::Regex;
+
let re = Regex::new(r"(?x)
- (?P<y>\d{4}) # the year
+ (?P<y>\d{4}) # the year, including all Unicode digits
-
- (?P<m>\d{2}) # the month
+ (?P<m>\d{2}) # the month, including all Unicode digits
-
- (?P<d>\d{2}) # the day
+ (?P<d>\d{2}) # the day, including all Unicode digits
").unwrap();
-let before = "2012-03-14, 2013-01-01 and 2014-07-05";
+
+let before = "1973-01-05, 1975-08-25 and 1980-10-18";
let after = re.replace_all(before, "$m/$d/$y");
-assert_eq!(after, "03/14/2012, 01/01/2013 and 07/05/2014");
-# }
+assert_eq!(after, "01/05/1973, 08/25/1975 and 10/18/1980");
```
If you wish to match against whitespace in this mode, you can still use `\s`,
@@ -148,10 +373,10 @@ If you wish to match against whitespace in this mode, you can still use `\s`,
directly with `\ `, use its hex character code `\x20` or temporarily disable
the `x` flag, e.g., `(?-x: )`.
-# Example: match multiple regular expressions simultaneously
+### Example: match multiple regular expressions simultaneously
-This demonstrates how to use a `RegexSet` to match multiple (possibly
-overlapping) regular expressions in a single scan of the search text:
+This demonstrates how to use a [`RegexSet`] to match multiple (possibly
+overlapping) regexes in a single scan of a haystack:
```rust
use regex::RegexSet;
@@ -166,7 +391,8 @@ let set = RegexSet::new(&[
r"foobar",
]).unwrap();
-// Iterate over and collect all of the matches.
+// Iterate over and collect all of the matches. Each match corresponds to the
+// ID of the matching pattern.
let matches: Vec<_> = set.matches("foobar").into_iter().collect();
assert_eq!(matches, vec![0, 2, 3, 4, 6]);
@@ -176,96 +402,225 @@ assert!(!matches.matched(5));
assert!(matches.matched(6));
```
-# Pay for what you use
+# Performance
+
+This section briefly discusses a few concerns regarding the speed and resource
+usage of regexes.
-With respect to searching text with a regular expression, there are three
-questions that can be asked:
+### Only ask for what you need
-1. Does the text match this expression?
-2. If so, where does it match?
-3. Where did the capturing groups match?
+When running a search with a regex, there are generally three different types
+of information one can ask for:
+
+1. Does a regex match in a haystack?
+2. Where does a regex match in a haystack?
+3. Where do each of the capturing groups match in a haystack?
Generally speaking, this crate could provide a function to answer only #3,
which would subsume #1 and #2 automatically. However, it can be significantly
more expensive to compute the location of capturing group matches, so it's best
not to do it if you don't need to.
-Therefore, only use what you need. For example, don't use `find` if you
-only need to test if an expression matches a string. (Use `is_match`
-instead.)
+Therefore, only ask for what you need. For example, don't use [`Regex::find`]
+if you only need to test if a regex matches a haystack. Use [`Regex::is_match`]
+instead.
+
+### Unicode can impact memory usage and search speed
+
+This crate has first class support for Unicode and it is **enabled by default**.
+In many cases, the extra memory required to support it will be negligible and
+it typically won't impact search speed. But it can in some cases.
+
+With respect to memory usage, the impact of Unicode principally manifests
+through the use of Unicode character classes. Unicode character classes
+tend to be quite large. For example, `\w` by default matches around 140,000
+distinct codepoints. This requires additional memory, and tends to slow down
+regex compilation. While a `\w` here and there is unlikely to be noticed,
+writing `\w{100}` will for example result in quite a large regex by default.
+Indeed, `\w` is considerably larger than its ASCII-only version, so if your
+requirements are satisfied by ASCII, it's probably a good idea to stick to
+ASCII classes. The ASCII-only version of `\w` can be spelled in a number of
+ways. All of the following are equivalent:
+
+* `[0-9A-Za-z_]`
+* `(?-u:\w)`
+* `[[:word:]]`
+* `[\w&&\p{ascii}]`
+
+With respect to search speed, Unicode tends to be handled pretty well, even when
+using large Unicode character classes. However, some of the faster internal
+regex engines cannot handle a Unicode aware word boundary assertion. So if you
+don't need Unicode-aware word boundary assertions, you might consider using
+`(?-u:\b)` instead of `\b`, where the former uses an ASCII-only definition of
+a word character.
+
+### Literals might accelerate searches
+
+This crate tends to be quite good at recognizing literals in a regex pattern
+and using them to accelerate a search. If it is at all possible to include
+some kind of literal in your pattern, then it might make search substantially
+faster. For example, in the regex `\w+@\w+`, the engine will look for
+occurrences of `@` and then try a reverse match for `\w+` to find the start
+position.
+
+### Avoid re-compiling regexes, especially in a loop
+
+It is an anti-pattern to compile the same pattern in a loop since regex
+compilation is typically expensive. (It takes anywhere from a few microseconds
+to a few **milliseconds** depending on the size of the pattern.) Not only is
+compilation itself expensive, but this also prevents optimizations that reuse
+allocations internally to the regex engine.
+
+In Rust, it can sometimes be a pain to pass regexes around if they're used from
+inside a helper function. Instead, we recommend using crates like [`once_cell`]
+and [`lazy_static`] to ensure that patterns are compiled exactly once.
+
+[`once_cell`]: https://crates.io/crates/once_cell
+[`lazy_static`]: https://crates.io/crates/lazy_static
+
+This example shows how to use `once_cell`:
-# Unicode
+```rust
+use {
+ once_cell::sync::Lazy,
+ regex::Regex,
+};
-This implementation executes regular expressions **only** on valid UTF-8
-while exposing match locations as byte indices into the search string. (To
-relax this restriction, use the [`bytes`](bytes/index.html) sub-module.)
-Conceptually, the regex engine works by matching a haystack as if it were a
-sequence of Unicode scalar values.
+fn some_helper_function(haystack: &str) -> bool {
+ static RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"...").unwrap());
+ RE.is_match(haystack)
+}
+
+fn main() {
+ assert!(some_helper_function("abc"));
+ assert!(!some_helper_function("ac"));
+}
+```
+
+Specifically, in this example, the regex will be compiled when it is used for
+the first time. On subsequent uses, it will reuse the previously built `Regex`.
+Notice how one can define the `Regex` locally to a specific function.
+
+### Sharing a regex across threads can result in contention
+
+While a single `Regex` can be freely used from multiple threads simultaneously,
+there is a small synchronization cost that must be paid. Generally speaking,
+one shouldn't expect to observe this unless the principal task in each thread
+is searching with the regex *and* most searches are on short haystacks. In this
+case, internal contention on shared resources can spike and increase latency,
+which in turn may slow down each individual search.
+
+One can work around this by cloning each `Regex` before sending it to another
+thread. The cloned regexes will still share the same internal read-only portion
+of its compiled state (it's reference counted), but each thread will get
+optimized access to the mutable space that is used to run a search. In general,
+there is no additional cost in memory to doing this. The only cost is the added
+code complexity required to explicitly clone the regex. (If you share the same
+`Regex` across multiple threads, each thread still gets its own mutable space,
+but accessing that space is slower.)
-Only simple case folding is supported. Namely, when matching
-case-insensitively, the characters are first mapped using the "simple" case
-folding rules defined by Unicode.
+# Unicode
-Regular expressions themselves are **only** interpreted as a sequence of
-Unicode scalar values. This means you can use Unicode characters directly
-in your expression:
+This section discusses what kind of Unicode support this regex library has.
+Before showing some examples, we'll summarize the relevant points:
+
+* This crate almost fully implements "Basic Unicode Support" (Level 1) as
+specified by the [Unicode Technical Standard #18][UTS18]. The full details
+of what is supported are documented in [UNICODE.md] in the root of the regex
+crate repository. There is virtually no support for "Extended Unicode Support"
+(Level 2) from UTS#18.
+* The top-level [`Regex`] runs searches *as if* iterating over each of the
+codepoints in the haystack. That is, the fundamental atom of matching is a
+single codepoint.
+* [`bytes::Regex`], in contrast, permits disabling Unicode mode for part or all
+of your pattern in all cases. When Unicode mode is disabled, then a search is
+run *as if* iterating over each byte in the haystack. That is, the fundamental
+atom of matching is a single byte. (A top-level `Regex` also permits disabling
+Unicode and thus matching *as if* it were one byte at a time, but only when
+doing so wouldn't permit matching invalid UTF-8.)
+* When Unicode mode is enabled (the default), `.` will match an entire Unicode
+scalar value, even when it is encoded using multiple bytes. When Unicode mode
+is disabled (e.g., `(?-u:.)`), then `.` will match a single byte in all cases.
+* The character classes `\w`, `\d` and `\s` are all Unicode-aware by default.
+Use `(?-u:\w)`, `(?-u:\d)` and `(?-u:\s)` to get their ASCII-only definitions.
+* Similarly, `\b` and `\B` use a Unicode definition of a "word" character.
+To get ASCII-only word boundaries, use `(?-u:\b)` and `(?-u:\B)`. This also
+applies to the special word boundary assertions. (That is, `\b{start}`,
+`\b{end}`, `\b{start-half}`, `\b{end-half}`.)
+* `^` and `$` are **not** Unicode-aware in multi-line mode. Namely, they only
+recognize `\n` (assuming CRLF mode is not enabled) and not any of the other
+forms of line terminators defined by Unicode.
+* Case insensitive searching is Unicode-aware and uses simple case folding.
+* Unicode general categories, scripts and many boolean properties are available
+by default via the `\p{property name}` syntax.
+* In all cases, matches are reported using byte offsets. Or more precisely,
+UTF-8 code unit offsets. This permits constant time indexing and slicing of the
+haystack.
+
+[UTS18]: https://unicode.org/reports/tr18/
+[UNICODE.md]: https://github.com/rust-lang/regex/blob/master/UNICODE.md
+
+Patterns themselves are **only** interpreted as a sequence of Unicode scalar
+values. This means you can use Unicode characters directly in your pattern:
```rust
-# use regex::Regex;
-# fn main() {
+use regex::Regex;
+
let re = Regex::new(r"(?i)Δ+").unwrap();
-let mat = re.find("ΔδΔ").unwrap();
-assert_eq!((mat.start(), mat.end()), (0, 6));
-# }
+let m = re.find("ΔδΔ").unwrap();
+assert_eq!((0, 6), (m.start(), m.end()));
+// alternatively:
+assert_eq!(0..6, m.range());
```
-Most features of the regular expressions in this crate are Unicode aware. Here
-are some examples:
-
-* `.` will match any valid UTF-8 encoded Unicode scalar value except for `\n`.
- (To also match `\n`, enable the `s` flag, e.g., `(?s:.)`.)
-* `\w`, `\d` and `\s` are Unicode aware. For example, `\s` will match all forms
- of whitespace categorized by Unicode.
-* `\b` matches a Unicode word boundary.
-* Negated character classes like `[^a]` match all Unicode scalar values except
- for `a`.
-* `^` and `$` are **not** Unicode aware in multi-line mode. Namely, they only
- recognize `\n` and not any of the other forms of line terminators defined
- by Unicode.
-
-Unicode general categories, scripts, script extensions, ages and a smattering
-of boolean properties are available as character classes. For example, you can
-match a sequence of numerals, Greek or Cherokee letters:
+As noted above, Unicode general categories, scripts, script extensions, ages
+and a smattering of boolean properties are available as character classes. For
+example, you can match a sequence of numerals, Greek or Cherokee letters:
```rust
-# use regex::Regex;
-# fn main() {
+use regex::Regex;
+
let re = Regex::new(r"[\pN\p{Greek}\p{Cherokee}]+").unwrap();
-let mat = re.find("abcΔᎠβⅠᏴγδⅡxyz").unwrap();
-assert_eq!((mat.start(), mat.end()), (3, 23));
-# }
+let m = re.find("abcΔᎠβⅠᏴγδⅡxyz").unwrap();
+assert_eq!(3..23, m.range());
```
-For a more detailed breakdown of Unicode support with respect to
-[UTS#18](https://unicode.org/reports/tr18/),
-please see the
-[UNICODE](https://github.com/rust-lang/regex/blob/master/UNICODE.md)
-document in the root of the regex repository.
+While not specific to Unicode, this library also supports character class set
+operations. Namely, one can nest character classes arbitrarily and perform set
+operations on them. Those set operations are union (the default), intersection,
+difference and symmetric difference. These set operations tend to be most
+useful with Unicode character classes. For example, to match any codepoint
+that is both in the `Greek` script and in the `Letter` general category:
+
+```rust
+use regex::Regex;
+
+let re = Regex::new(r"[\p{Greek}&&\pL]+").unwrap();
+let subs: Vec<&str> = re.find_iter("ΔδΔ𐅌ΔδΔ").map(|m| m.as_str()).collect();
+assert_eq!(subs, vec!["ΔδΔ", "ΔδΔ"]);
-# Opt out of Unicode support
+// If we just match on Greek, then all codepoints would match!
+let re = Regex::new(r"\p{Greek}+").unwrap();
+let subs: Vec<&str> = re.find_iter("ΔδΔ𐅌ΔδΔ").map(|m| m.as_str()).collect();
+assert_eq!(subs, vec!["ΔδΔ𐅌ΔδΔ"]);
+```
-The `bytes` sub-module provides a `Regex` type that can be used to match
-on `&[u8]`. By default, text is interpreted as UTF-8 just like it is with
-the main `Regex` type. However, this behavior can be disabled by turning
-off the `u` flag, even if doing so could result in matching invalid UTF-8.
-For example, when the `u` flag is disabled, `.` will match any byte instead
-of any Unicode scalar value.
+### Opt out of Unicode support
+
+The [`bytes::Regex`] type can be used to search `&[u8]` haystacks. By
+default, haystacks are conventionally treated as UTF-8 just like they are with the
+main `Regex` type. However, this behavior can be disabled by turning off the
+`u` flag, even if doing so could result in matching invalid UTF-8. For example,
+when the `u` flag is disabled, `.` will match any byte instead of any Unicode
+scalar value.
Disabling the `u` flag is also possible with the standard `&str`-based `Regex`
type, but it is only allowed where the UTF-8 invariant is maintained. For
example, `(?-u:\w)` is an ASCII-only `\w` character class and is legal in an
-`&str`-based `Regex`, but `(?-u:\xFF)` will attempt to match the raw byte
-`\xFF`, which is invalid UTF-8 and therefore is illegal in `&str`-based
+`&str`-based `Regex`, but `(?-u:\W)` will attempt to match *any byte* that
+isn't in `(?-u:\w)`, which in turn includes bytes that are invalid UTF-8.
+Similarly, `(?-u:\xFF)` will attempt to match the raw byte `\xFF` (instead of
+`U+00FF`), which is invalid UTF-8 and therefore is illegal in `&str`-based
regexes.
Finally, since Unicode support requires bundling large Unicode data
@@ -281,10 +636,11 @@ The syntax supported in this crate is documented below.
Note that the regular expression parser and abstract syntax are exposed in
a separate crate, [`regex-syntax`](https://docs.rs/regex-syntax).
-## Matching one character
+### Matching one character
<pre class="rust">
. any character except new line (includes new line with s flag)
+[0-9] any ASCII digit
\d digit (\p{Nd})
\D not digit
\pX Unicode character class identified by a one-letter name
@@ -307,20 +663,23 @@ a separate crate, [`regex-syntax`](https://docs.rs/regex-syntax).
[0-9--4] Direct subtraction (matching 0-9 except 4)
[a-g~~b-h] Symmetric difference (matching `a` and `h` only)
[\[\]] Escaping in character classes (matching [ or ])
+[a&&b] An empty character class matching nothing
</pre>
Any named character class may appear inside a bracketed `[...]` character
-class. For example, `[\p{Greek}[:digit:]]` matches any Greek or ASCII
-digit. `[\p{Greek}&&\pL]` matches Greek letters.
+class. For example, `[\p{Greek}[:digit:]]` matches any ASCII digit or any
+codepoint in the `Greek` script. `[\p{Greek}&&\pL]` matches Greek letters.
Precedence in character classes, from most binding to least:
-1. Ranges: `a-cd` == `[a-c]d`
-2. Union: `ab&&bc` == `[ab]&&[bc]`
-3. Intersection: `^a-z&&b` == `^[a-z&&b]`
-4. Negation
+1. Ranges: `[a-cd]` == `[[a-c]d]`
+2. Union: `[ab&&bc]` == `[[ab]&&[bc]]`
+3. Intersection, difference, symmetric difference. All three have equivalent
+precedence, and are evaluated in left-to-right order. For example,
+`[\pL--\p{Greek}&&\p{Uppercase}]` == `[[\pL--\p{Greek}]&&\p{Uppercase}]`.
+4. Negation: `[^a-z&&b]` == `[^[a-z&&b]]`.
-## Composites
+### Composites
<pre class="rust">
xy concatenation (x followed by y)
@@ -346,7 +705,7 @@ let re = Regex::new(r"sam|samwise").unwrap();
assert_eq!("sam", re.find(haystack).unwrap().as_str());
```
-## Repetitions
+### Repetitions
<pre class="rust">
x* zero or more of x (greedy)
@@ -363,21 +722,44 @@ x{n,}? at least n x (ungreedy/lazy)
x{n}? exactly n x
</pre>
-## Empty matches
+### Empty matches
<pre class="rust">
-^ the beginning of text (or start-of-line with multi-line mode)
-$ the end of text (or end-of-line with multi-line mode)
-\A only the beginning of text (even with multi-line mode enabled)
-\z only the end of text (even with multi-line mode enabled)
-\b a Unicode word boundary (\w on one side and \W, \A, or \z on other)
-\B not a Unicode word boundary
+^ the beginning of a haystack (or start-of-line with multi-line mode)
+$ the end of a haystack (or end-of-line with multi-line mode)
+\A only the beginning of a haystack (even with multi-line mode enabled)
+\z only the end of a haystack (even with multi-line mode enabled)
+\b a Unicode word boundary (\w on one side and \W, \A, or \z on other)
+\B not a Unicode word boundary
+\b{start}, \< a Unicode start-of-word boundary (\W|\A on the left, \w on the right)
+\b{end}, \> a Unicode end-of-word boundary (\w on the left, \W|\z on the right)
+\b{start-half} half of a Unicode start-of-word boundary (\W|\A on the left)
+\b{end-half} half of a Unicode end-of-word boundary (\W|\z on the right)
</pre>
-The empty regex is valid and matches the empty string. For example, the empty
-regex matches `abc` at positions `0`, `1`, `2` and `3`.
+The empty regex is valid and matches the empty string. For example, the
+empty regex matches `abc` at positions `0`, `1`, `2` and `3`. When using the
+top-level [`Regex`] on `&str` haystacks, an empty match that splits a codepoint
+is guaranteed to never be returned. However, such matches are permitted when
+using a [`bytes::Regex`]. For example:
-## Grouping and flags
+```rust
+let re = regex::Regex::new(r"").unwrap();
+let ranges: Vec<_> = re.find_iter("💩").map(|m| m.range()).collect();
+assert_eq!(ranges, vec![0..0, 4..4]);
+
+let re = regex::bytes::Regex::new(r"").unwrap();
+let ranges: Vec<_> = re.find_iter("💩".as_bytes()).map(|m| m.range()).collect();
+assert_eq!(ranges, vec![0..0, 1..1, 2..2, 3..3, 4..4]);
+```
+
+Note that an empty regex is distinct from a regex that can never match.
+For example, the regex `[a&&b]` is a character class that represents the
+intersection of `a` and `b`. That intersection is empty, which means the
+character class is empty. Since nothing is in the empty set, `[a&&b]` matches
+nothing, not even the empty string.
+
+### Grouping and flags
<pre class="rust">
(exp) numbered capture group (indexed by opening parenthesis)
@@ -405,6 +787,7 @@ All flags are by default disabled unless stated otherwise. They are:
i case-insensitive: letters match both upper and lower case
m multi-line mode: ^ and $ match begin/end of line
s allow . to match \n
+R enables CRLF mode: when multi-line mode is enabled, \r\n is used
U swap the meaning of x* and x*?
u Unicode support (enabled by default)
x verbose mode, ignores whitespace and allow line comments (starting with `#`)
@@ -418,22 +801,22 @@ Flags can be toggled within a pattern. Here's an example that matches
case-insensitively for the first part but case-sensitively for the second part:
```rust
-# use regex::Regex;
-# fn main() {
+use regex::Regex;
+
let re = Regex::new(r"(?i)a+(?-i)b+").unwrap();
-let cap = re.captures("AaAaAbbBBBb").unwrap();
-assert_eq!(&cap[0], "AaAaAbb");
-# }
+let m = re.find("AaAaAbbBBBb").unwrap();
+assert_eq!(m.as_str(), "AaAaAbb");
```
Notice that the `a+` matches either `a` or `A`, but the `b+` only matches
`b`.
Multi-line mode means `^` and `$` no longer match just at the beginning/end of
-the input, but at the beginning/end of lines:
+the input, but also at the beginning/end of lines:
```
-# use regex::Regex;
+use regex::Regex;
+
let re = Regex::new(r"(?m)^line \d+").unwrap();
let m = re.find("line one\nline 2\n").unwrap();
assert_eq!(m.as_str(), "line 2");
@@ -442,44 +825,72 @@ assert_eq!(m.as_str(), "line 2");
Note that `^` matches after new lines, even at the end of input:
```
-# use regex::Regex;
+use regex::Regex;
+
let re = Regex::new(r"(?m)^").unwrap();
let m = re.find_iter("test\n").last().unwrap();
assert_eq!((m.start(), m.end()), (5, 5));
```
-Here is an example that uses an ASCII word boundary instead of a Unicode
-word boundary:
+When both CRLF mode and multi-line mode are enabled, then `^` and `$` will
+match at either `\r` or `\n`, but never in the middle of a `\r\n`:
+
+```
+use regex::Regex;
+
+let re = Regex::new(r"(?mR)^foo$").unwrap();
+let m = re.find("\r\nfoo\r\n").unwrap();
+assert_eq!(m.as_str(), "foo");
+```
+
+Unicode mode can also be selectively disabled, although only when the result
+*would not* match invalid UTF-8. One good example of this is using an ASCII
+word boundary instead of a Unicode word boundary, which might make some regex
+searches run faster:
```rust
-# use regex::Regex;
-# fn main() {
+use regex::Regex;
+
let re = Regex::new(r"(?-u:\b).+(?-u:\b)").unwrap();
-let cap = re.captures("$$abc$$").unwrap();
-assert_eq!(&cap[0], "abc");
-# }
+let m = re.find("$$abc$$").unwrap();
+assert_eq!(m.as_str(), "abc");
```
-## Escape sequences
+### Escape sequences
+
+Note that this includes all possible escape sequences, even ones that are
+documented elsewhere.
<pre class="rust">
-\* literal *, works for any punctuation character: \.+*?()|[]{}^$
-\a bell (\x07)
-\f form feed (\x0C)
-\t horizontal tab
-\n new line
-\r carriage return
-\v vertical tab (\x0B)
-\123 octal character code (up to three digits) (when enabled)
-\x7F hex character code (exactly two digits)
-\x{10FFFF} any hex character code corresponding to a Unicode code point
-\u007F hex character code (exactly four digits)
-\u{7F} any hex character code corresponding to a Unicode code point
-\U0000007F hex character code (exactly eight digits)
-\U{7F} any hex character code corresponding to a Unicode code point
+\* literal *, applies to all ASCII except [0-9A-Za-z<>]
+\a bell (\x07)
+\f form feed (\x0C)
+\t horizontal tab
+\n new line
+\r carriage return
+\v vertical tab (\x0B)
+\A matches at the beginning of a haystack
+\z matches at the end of a haystack
+\b word boundary assertion
+\B negated word boundary assertion
+\b{start}, \< start-of-word boundary assertion
+\b{end}, \> end-of-word boundary assertion
+\b{start-half} half of a start-of-word boundary assertion
+\b{end-half} half of an end-of-word boundary assertion
+\123 octal character code, up to three digits (when enabled)
+\x7F hex character code (exactly two digits)
+\x{10FFFF} any hex character code corresponding to a Unicode code point
+\u007F hex character code (exactly four digits)
+\u{7F} any hex character code corresponding to a Unicode code point
+\U0000007F hex character code (exactly eight digits)
+\U{7F} any hex character code corresponding to a Unicode code point
+\p{Letter} Unicode character class
+\P{Letter} negated Unicode character class
+\d, \s, \w Perl character class
+\D, \S, \W negated Perl character class
</pre>
-## Perl character classes (Unicode friendly)
+### Perl character classes (Unicode friendly)
These classes are based on the definitions provided in
[UTS#18](https://www.unicode.org/reports/tr18/#Compatibility_Properties):
@@ -493,7 +904,10 @@ These classes are based on the definitions provided in
\W not word character
</pre>
-## ASCII character classes
+### ASCII character classes
+
+These classes are based on the definitions provided in
+[UTS#18](https://www.unicode.org/reports/tr18/#Compatibility_Properties):
<pre class="rust">
[[:alnum:]] alphanumeric ([0-9A-Za-z])
@@ -512,16 +926,228 @@ These classes are based on the definitions provided in
[[:xdigit:]] hex digit ([0-9A-Fa-f])
</pre>
+# Untrusted input
+
+This crate is meant to be able to run regex searches on untrusted haystacks
+without fear of [ReDoS]. This crate also, to a certain extent, supports
+untrusted patterns.
+
+[ReDoS]: https://en.wikipedia.org/wiki/ReDoS
+
+This crate differs from most (but not all) other regex engines in that it
+doesn't use unbounded backtracking to run a regex search. In those cases,
+one generally cannot use untrusted patterns *or* untrusted haystacks because
+it can be very difficult to know whether a particular pattern will result in
+catastrophic backtracking or not.
+
+We'll first discuss how this crate deals with untrusted inputs and then wrap
+it up with a realistic discussion about what practice really looks like.
+
+### Panics
+
+Outside of clearly documented cases, most APIs in this crate are intended to
+never panic regardless of the inputs given to them. For example, `Regex::new`,
+`Regex::is_match`, `Regex::find` and `Regex::captures` should never panic. That
+is, it is an API promise that those APIs will never panic no matter what inputs
+are given to them. With that said, regex engines are complicated beasts, and
+providing a rock solid guarantee that these APIs literally never panic is
+essentially equivalent to saying, "there are no bugs in this library." That is
+a bold claim, and not really one that can be feasibly made with a straight
+face.
+
+Don't get the wrong impression here. This crate is extensively tested, not just
+with unit and integration tests, but also via fuzz testing. For example, this
+crate is part of the [OSS-fuzz project]. Panics should be incredibly rare, but
+it is possible for bugs to exist, and thus possible for a panic to occur. If
+you need a rock solid guarantee against panics, then you should wrap calls into
+this library with [`std::panic::catch_unwind`].
+
+It's also worth pointing out that this library will *generally* panic when
+other regex engines would commit undefined behavior. When undefined behavior
+occurs, your program might continue as if nothing bad has happened, but it also
+might mean your program is open to the worst kinds of exploits. In contrast,
+the worst thing a panic can do is a denial of service.
+
+[OSS-fuzz project]: https://android.googlesource.com/platform/external/oss-fuzz/+/refs/tags/android-t-preview-1/projects/rust-regex/
+[`std::panic::catch_unwind`]: https://doc.rust-lang.org/std/panic/fn.catch_unwind.html
+
+### Untrusted patterns
+
+The principal way this crate deals with them is by limiting their size by
+default. The size limit can be configured via [`RegexBuilder::size_limit`]. The
+idea of a size limit is that compiling a pattern into a `Regex` will fail if it
+becomes "too big." Namely, while *most* resources consumed by compiling a regex
+are approximately proportional (albeit with some high constant factors in some
+cases, such as with Unicode character classes) to the length of the pattern
+itself, there is one particular exception to this: counted repetitions. Namely,
+this pattern:
+
+```text
+a{5}{5}{5}{5}{5}{5}
+```
+
+Is equivalent to this pattern:
+
+```text
+a{15625}
+```
+
+In both of these cases, the actual pattern string is quite small, but the
+resulting `Regex` value is quite large. Indeed, as the first pattern shows,
+it isn't enough to locally limit the size of each repetition because they can
+be stacked in a way that results in exponential growth.
+
+To provide a bit more context, a simplified view of regex compilation looks
+like this:
+
+* The pattern string is parsed into a structured representation called an AST.
+Counted repetitions are not expanded and Unicode character classes are not
+looked up in this stage. That is, the size of the AST is proportional to the
+size of the pattern with "reasonable" constant factors. In other words, one
+can reasonably limit the memory used by an AST by limiting the length of the
+pattern string.
+* The AST is translated into an HIR. Counted repetitions are still *not*
+expanded at this stage, but Unicode character classes are embedded into the
+HIR. The memory usage of a HIR is still proportional to the length of the
+original pattern string, but the constant factors---mostly as a result of
+Unicode character classes---can be quite high. Still though, the memory used by
+an HIR can be reasonably limited by limiting the length of the pattern string.
+* The HIR is compiled into a [Thompson NFA]. This is the stage at which
+something like `\w{5}` is rewritten to `\w\w\w\w\w`. Thus, this is the stage
+at which [`RegexBuilder::size_limit`] is enforced. If the NFA exceeds the
+configured size, then this stage will fail.
+
+[Thompson NFA]: https://en.wikipedia.org/wiki/Thompson%27s_construction
+
+The size limit helps avoid two different kinds of exorbitant resource usage:
+
+* It avoids permitting exponential memory usage based on the size of the
+pattern string.
+* It avoids long search times. This will be discussed in more detail in the
+next section, but worst case search time *is* dependent on the size of the
+regex. So keeping regexes limited to a reasonable size is also a way of keeping
+search times reasonable.
+
+Finally, it's worth pointing out that regex compilation is guaranteed to take
+worst case `O(m)` time, where `m` is proportional to the size of the regex. The
+size of the regex here is *after* the counted repetitions have been expanded.
+
+**Advice for those using untrusted regexes**: limit the pattern length to
+something small and expand it as needed. Configure [`RegexBuilder::size_limit`]
+to something small and then expand it as needed.
+
+### Untrusted haystacks
+
+The main way this crate guards against searches taking a long time is by
+using algorithms that guarantee a `O(m * n)` worst case time and space bound.
+Namely:
+
+* `m` is proportional to the size of the regex, where the size of the regex
+includes the expansion of all counted repetitions. (See the previous section on
+untrusted patterns.)
+* `n` is proportional to the length, in bytes, of the haystack.
+
+In other words, if you consider `m` to be a constant (for example, the regex
+pattern is a literal in the source code), then the search can be said to run
+in "linear time." Or equivalently, "linear time with respect to the size of the
+haystack."
+
+But the `m` factor here is important not to ignore. If a regex is
+particularly big, the search times can get quite slow. This is why, in part,
+[`RegexBuilder::size_limit`] exists.
+
+**Advice for those searching untrusted haystacks**: As long as your regexes
+are not enormous, you should expect to be able to search untrusted haystacks
+without fear. If you aren't sure, you should benchmark it. Unlike backtracking
+engines, if your regex is so big that it's likely to result in slow searches,
+this is probably something you'll be able to observe regardless of what the
+haystack is made up of.
+
+### Iterating over matches
+
+One thing that is perhaps easy to miss is that the worst case time
+complexity bound of `O(m * n)` applies to methods like [`Regex::is_match`],
+[`Regex::find`] and [`Regex::captures`]. It does **not** apply to
+[`Regex::find_iter`] or [`Regex::captures_iter`]. Namely, since iterating over
+all matches can execute many searches, and each search can scan the entire
+haystack, the worst case time complexity for iterators is `O(m * n^2)`.
+
+One example of where this occurs is when a pattern consists of an alternation,
+where an earlier branch of the alternation requires scanning the entire
+haystack only to discover that there is no match. It also requires a later
+branch of the alternation to have matched at the beginning of the search. For
+example, consider the pattern `.*[^A-Z]|[A-Z]` and the haystack `AAAAA`. The
+first search will scan to the end looking for matches of `.*[^A-Z]` even though
+a finite automata engine (as in this crate) knows that `[A-Z]` has already
+matched the first character of the haystack. This is due to the greedy nature
+of regex searching. That first search will report a match at the first `A` only
+after scanning to the end to discover that no other match exists. The next
+search then begins at the second `A` and the behavior repeats.
+
+There is no way to avoid this. This means that if both patterns and haystacks
+are untrusted and you're iterating over all matches, you're susceptible to
+worst case quadratic time complexity. One possible way to mitigate this
+is to drop down to the lower level `regex-automata` crate and use its
+`meta::Regex` iterator APIs. There, you can configure the search to operate
+in "earliest" mode by passing an `Input::new(haystack).earliest(true)` to
+`meta::Regex::find_iter` (for example). By enabling this mode, you give up
+the normal greedy match semantics of regex searches and instead ask the regex
+engine to immediately stop as soon as a match has been found. Enabling this
+mode will thus restore the worst case `O(m * n)` time complexity bound, but at
+the cost of different semantics.
+
+### Untrusted inputs in practice
+
+While providing a `O(m * n)` worst case time bound on all searches goes a long
+way toward preventing [ReDoS], that doesn't mean every search you can possibly
+run will complete without burning CPU time. In general, there are a few ways
+for the `m * n` time bound to still bite you:
+
+* You are searching an exceptionally long haystack. No matter how you slice
+it, a longer haystack will take more time to search. This crate may often make
+very quick work of even long haystacks because of its literal optimizations,
+but those aren't available for all regexes.
+* Unicode character classes can cause searches to be quite slow in some cases.
+This is especially true when they are combined with counted repetitions. While
+the regex size limit above will protect you from the most egregious cases,
+the default size limit still permits pretty big regexes that can execute more
+slowly than one might expect.
+* While routines like [`Regex::find`] and [`Regex::captures`] guarantee
+worst case `O(m * n)` search time, routines like [`Regex::find_iter`] and
+[`Regex::captures_iter`] actually have worst case `O(m * n^2)` search time.
+This is because `find_iter` runs many searches, and each search takes worst
+case `O(m * n)` time. Thus, iteration of all matches in a haystack has
+worst case `O(m * n^2)`. A good example of a pattern that exhibits this is
+`(?:A+){1000}|` or even `.*[^A-Z]|[A-Z]`.
+
+In general, untrusted haystacks are easier to stomach than untrusted patterns.
+Untrusted patterns give a lot more control to the caller to impact the
+performance of a search. In many cases, a regex search will actually execute in
+average case `O(n)` time (i.e., not dependent on the size of the regex), but
+this can't be guaranteed in general. Therefore, permitting untrusted patterns
+means that your only line of defense is to put a limit on how big `m` (and
+perhaps also `n`) can be in `O(m * n)`. `n` is limited by simply inspecting
+the length of the haystack while `m` is limited by *both* applying a limit to
+the length of the pattern *and* a limit on the compiled size of the regex via
+[`RegexBuilder::size_limit`].
+
+It bears repeating: if you're accepting untrusted patterns, it would be a good
+idea to start with conservative limits on `m` and `n`, and then carefully
+increase them as needed.
+
# Crate features
By default, this crate tries pretty hard to make regex matching both as fast
-as possible and as correct as it can be, within reason. This means that there
-is a lot of code dedicated to performance, the handling of Unicode data and the
-Unicode data itself. Overall, this leads to more dependencies, larger binaries
-and longer compile times. This trade off may not be appropriate in all cases,
-and indeed, even when all Unicode and performance features are disabled, one
-is still left with a perfectly serviceable regex engine that will work well
-in many cases.
+as possible and as correct as it can be. This means that there is a lot of
+code dedicated to performance, the handling of Unicode data and the Unicode
+data itself. Overall, this leads to more dependencies, larger binaries and
+longer compile times. This trade off may not be appropriate in all cases, and
+indeed, even when all Unicode and performance features are disabled, one is
+still left with a perfectly serviceable regex engine that will work well in
+many cases. (Note that code is not arbitrarily reducible, and for this reason,
+the [`regex-lite`](https://docs.rs/regex-lite) crate exists to provide an even
+more minimal experience by cutting out Unicode and performance, but still
+maintaining the linear search time bound.)
This crate exposes a number of features for controlling that trade off. Some
of these features are strictly performance oriented, such that disabling them
@@ -530,32 +1156,61 @@ Other features, such as the ones controlling the presence or absence of Unicode
data, can result in a loss of functionality. For example, if one disables the
`unicode-case` feature (described below), then compiling the regex `(?i)a`
will fail since Unicode case insensitivity is enabled by default. Instead,
-callers must use `(?i-u)a` instead to disable Unicode case folding. Stated
-differently, enabling or disabling any of the features below can only add or
-subtract from the total set of valid regular expressions. Enabling or disabling
-a feature will never modify the match semantics of a regular expression.
+callers must use `(?i-u)a` to disable Unicode case folding. Stated differently,
+enabling or disabling any of the features below can only add or subtract from
+the total set of valid regular expressions. Enabling or disabling a feature
+will never modify the match semantics of a regular expression.
-All features below are enabled by default.
+Most features below are enabled by default. Features that aren't enabled by
+default are noted.
### Ecosystem features
* **std** -
- When enabled, this will cause `regex` to use the standard library. Currently,
- disabling this feature will always result in a compilation error. It is
- intended to add `alloc`-only support to regex in the future.
+ When enabled, this will cause `regex` to use the standard library. In terms
+ of APIs, `std` causes error types to implement the `std::error::Error`
+ trait. Enabling `std` will also result in performance optimizations,
+ including SIMD and faster synchronization primitives. Notably, **disabling
+ the `std` feature will result in the use of spin locks**. To use a regex
+ engine without `std` and without spin locks, you'll need to drop down to
+ the [`regex-automata`](https://docs.rs/regex-automata) crate.
+* **logging** -
+ When enabled, the `log` crate is used to emit messages about regex
+ compilation and search strategies. This is **disabled by default**. This is
+ typically only useful to someone working on this crate's internals, but might
+ be useful if you're doing some rabbit hole performance hacking. Or if you're
+ just interested in the kinds of decisions being made by the regex engine.
### Performance features
* **perf** -
- Enables all performance related features. This feature is enabled by default
- and will always cover all features that improve performance, even if more
- are added in the future.
+ Enables all performance related features except for `perf-dfa-full`. This
+ feature is enabled by default and is intended to cover all reasonable features
+ that improve performance, even if more are added in the future.
* **perf-dfa** -
Enables the use of a lazy DFA for matching. The lazy DFA is used to compile
portions of a regex to a very fast DFA on an as-needed basis. This can
result in substantial speedups, usually by an order of magnitude on large
haystacks. The lazy DFA does not bring in any new dependencies, but it can
make compile times longer.
+* **perf-dfa-full** -
+ Enables the use of a full DFA for matching. Full DFAs are problematic because
+ they have worst case `O(2^n)` construction time. For this reason, when this
+ feature is enabled, full DFAs are only used for very small regexes and a
+ very small space bound is used during determinization to avoid the DFA
+ from blowing up. This feature is not enabled by default, even as part of
+ `perf`, because it results in fairly sizeable increases in binary size and
+ compilation time. It can result in faster search times, but they tend to be
+ more modest and limited to non-Unicode regexes.
+* **perf-onepass** -
+ Enables the use of a one-pass DFA for extracting the positions of capture
+ groups. This optimization applies to a subset of certain types of NFAs and
+ represents the fastest engine in this crate for dealing with capture groups.
+* **perf-backtrack** -
+ Enables the use of a bounded backtracking algorithm for extracting the
+ positions of capture groups. This usually sits between the slowest engine
+ (the PikeVM) and the fastest engine (one-pass DFA) for extracting capture
+ groups. It's used whenever the regex is not one-pass and is small enough.
* **perf-inline** -
Enables the use of aggressive inlining inside match routines. This reduces
the overhead of each match. The aggressive inlining, however, increases
@@ -609,193 +1264,83 @@ All features below are enabled by default.
This enables using classes like `\p{gcb=Extend}`, `\p{wb=Katakana}` and
`\p{sb=ATerm}`.
-
-# Untrusted input
-
-This crate can handle both untrusted regular expressions and untrusted
-search text.
-
-Untrusted regular expressions are handled by capping the size of a compiled
-regular expression.
-(See [`RegexBuilder::size_limit`](struct.RegexBuilder.html#method.size_limit).)
-Without this, it would be trivial for an attacker to exhaust your system's
-memory with expressions like `a{100}{100}{100}`.
-
-Untrusted search text is allowed because the matching engine(s) in this
-crate have time complexity `O(mn)` (with `m ~ regex` and `n ~ search
-text`), which means there's no way to cause exponential blow-up like with
-some other regular expression engines. (We pay for this by disallowing
-features like arbitrary look-ahead and backreferences.)
-
-When a DFA is used, pathological cases with exponential state blow-up are
-avoided by constructing the DFA lazily or in an "online" manner. Therefore,
-at most one new state can be created for each byte of input. This satisfies
-our time complexity guarantees, but can lead to memory growth
-proportional to the size of the input. As a stopgap, the DFA is only
-allowed to store a fixed number of states. When the limit is reached, its
-states are wiped and continues on, possibly duplicating previous work. If
-the limit is reached too frequently, it gives up and hands control off to
-another matching engine with fixed memory requirements.
-(The DFA size limit can also be tweaked. See
-[`RegexBuilder::dfa_size_limit`](struct.RegexBuilder.html#method.dfa_size_limit).)
+# Other crates
+
+This crate has two required dependencies and several optional dependencies.
+This section briefly describes them with the goal of raising awareness of how
+different components of this crate may be used independently.
+
+It is somewhat unusual for a regex engine to have dependencies, as most regex
+libraries are self contained units with no dependencies other than a particular
+environment's standard library. Indeed, for other similarly optimized regex
+engines, most or all of the code in the dependencies of this crate would
+normally just be inseparable or coupled parts of the crate itself. But since
+Rust and its tooling ecosystem make the use of dependencies so easy, it made
+sense to spend some effort de-coupling parts of this crate and making them
+independently useful.
+
+We only briefly describe each crate here.
+
+* [`regex-lite`](https://docs.rs/regex-lite) is not a dependency of `regex`,
+but rather, a standalone zero-dependency simpler version of `regex` that
+prioritizes compile times and binary size. In exchange, it eschews Unicode
+support and performance. Its match semantics are as identical as possible to
+the `regex` crate, and for the things it supports, its APIs are identical to
+the APIs in this crate. In other words, for a lot of use cases, it is a drop-in
+replacement.
+* [`regex-syntax`](https://docs.rs/regex-syntax) provides a regular expression
+parser via `Ast` and `Hir` types. It also provides routines for extracting
+literals from a pattern. Folks can use this crate to do analysis, or even to
+build their own regex engine without having to worry about writing a parser.
+* [`regex-automata`](https://docs.rs/regex-automata) provides the regex engines
+themselves. One of the downsides of finite automata based regex engines is that
+they often need multiple internal engines in order to have similar or better
+performance than an unbounded backtracking engine in practice. `regex-automata`
+in particular provides public APIs for a PikeVM, a bounded backtracker, a
+one-pass DFA, a lazy DFA, a fully compiled DFA and a meta regex engine that
+combines them all together. It also has native multi-pattern support and
+provides a way to compile and serialize full DFAs such that they can be loaded
+and searched in a no-std no-alloc environment. `regex-automata` itself doesn't
+even have a required dependency on `regex-syntax`!
+* [`memchr`](https://docs.rs/memchr) provides low level SIMD vectorized
+routines for quickly finding the location of single bytes or even substrings
+in a haystack. In other words, it provides fast `memchr` and `memmem` routines.
+These are used by this crate in literal optimizations.
+* [`aho-corasick`](https://docs.rs/aho-corasick) provides multi-substring
+search. It also provides SIMD vectorized routines in the case where the number
+of substrings to search for is relatively small. The `regex` crate also uses
+this for literal optimizations.
*/
+#![no_std]
#![deny(missing_docs)]
#![cfg_attr(feature = "pattern", feature(pattern))]
#![warn(missing_debug_implementations)]
-#[cfg(not(feature = "std"))]
-compile_error!("`std` feature is currently required to build this crate");
+#[cfg(doctest)]
+doc_comment::doctest!("../README.md");
-// To check README's example
-// TODO: Re-enable this once the MSRV is 1.43 or greater.
-// See: https://github.com/rust-lang/regex/issues/684
-// See: https://github.com/rust-lang/regex/issues/685
-// #[cfg(doctest)]
-// doc_comment::doctest!("../README.md");
+extern crate alloc;
+#[cfg(any(test, feature = "std"))]
+extern crate std;
-#[cfg(feature = "std")]
pub use crate::error::Error;
-#[cfg(feature = "std")]
-pub use crate::re_builder::set_unicode::*;
-#[cfg(feature = "std")]
-pub use crate::re_builder::unicode::*;
-#[cfg(feature = "std")]
-pub use crate::re_set::unicode::*;
-#[cfg(feature = "std")]
-pub use crate::re_unicode::{
- escape, CaptureLocations, CaptureMatches, CaptureNames, Captures,
- Locations, Match, Matches, NoExpand, Regex, Replacer, ReplacerRef, Split,
- SplitN, SubCaptureMatches,
-};
-
-/**
-Match regular expressions on arbitrary bytes.
-
-This module provides a nearly identical API to the one found in the
-top-level of this crate. There are two important differences:
-
-1. Matching is done on `&[u8]` instead of `&str`. Additionally, `Vec<u8>`
-is used where `String` would have been used.
-2. Unicode support can be disabled even when disabling it would result in
-matching invalid UTF-8 bytes.
-
-# Example: match null terminated string
-
-This shows how to find all null-terminated strings in a slice of bytes:
-```rust
-# use regex::bytes::Regex;
-let re = Regex::new(r"(?-u)(?P<cstr>[^\x00]+)\x00").unwrap();
-let text = b"foo\x00bar\x00baz\x00";
-
-// Extract all of the strings without the null terminator from each match.
-// The unwrap is OK here since a match requires the `cstr` capture to match.
-let cstrs: Vec<&[u8]> =
- re.captures_iter(text)
- .map(|c| c.name("cstr").unwrap().as_bytes())
- .collect();
-assert_eq!(vec![&b"foo"[..], &b"bar"[..], &b"baz"[..]], cstrs);
-```
-
-# Example: selectively enable Unicode support
-
-This shows how to match an arbitrary byte pattern followed by a UTF-8 encoded
-string (e.g., to extract a title from a Matroska file):
-
-```rust
-# use std::str;
-# use regex::bytes::Regex;
-let re = Regex::new(
- r"(?-u)\x7b\xa9(?:[\x80-\xfe]|[\x40-\xff].)(?u:(.*))"
-).unwrap();
-let text = b"\x12\xd0\x3b\x5f\x7b\xa9\x85\xe2\x98\x83\x80\x98\x54\x76\x68\x65";
-let caps = re.captures(text).unwrap();
-
-// Notice that despite the `.*` at the end, it will only match valid UTF-8
-// because Unicode mode was enabled with the `u` flag. Without the `u` flag,
-// the `.*` would match the rest of the bytes.
-let mat = caps.get(1).unwrap();
-assert_eq!((7, 10), (mat.start(), mat.end()));
-
-// If there was a match, Unicode mode guarantees that `title` is valid UTF-8.
-let title = str::from_utf8(&caps[1]).unwrap();
-assert_eq!("☃", title);
-```
-
-In general, if the Unicode flag is enabled in a capture group and that capture
-is part of the overall match, then the capture is *guaranteed* to be valid
-UTF-8.
-
-# Syntax
-
-The supported syntax is pretty much the same as the syntax for Unicode
-regular expressions with a few changes that make sense for matching arbitrary
-bytes:
-
-1. The `u` flag can be disabled even when disabling it might cause the regex to
-match invalid UTF-8. When the `u` flag is disabled, the regex is said to be in
-"ASCII compatible" mode.
-2. In ASCII compatible mode, neither Unicode scalar values nor Unicode
-character classes are allowed.
-3. In ASCII compatible mode, Perl character classes (`\w`, `\d` and `\s`)
-revert to their typical ASCII definition. `\w` maps to `[[:word:]]`, `\d` maps
-to `[[:digit:]]` and `\s` maps to `[[:space:]]`.
-4. In ASCII compatible mode, word boundaries use the ASCII compatible `\w` to
-determine whether a byte is a word byte or not.
-5. Hexadecimal notation can be used to specify arbitrary bytes instead of
-Unicode codepoints. For example, in ASCII compatible mode, `\xFF` matches the
-literal byte `\xFF`, while in Unicode mode, `\xFF` is a Unicode codepoint that
-matches its UTF-8 encoding of `\xC3\xBF`. Similarly for octal notation when
-enabled.
-6. In ASCII compatible mode, `.` matches any *byte* except for `\n`. When the
-`s` flag is additionally enabled, `.` matches any byte.
-
-# Performance
-
-In general, one should expect performance on `&[u8]` to be roughly similar to
-performance on `&str`.
-*/
-#[cfg(feature = "std")]
-pub mod bytes {
- pub use crate::re_builder::bytes::*;
- pub use crate::re_builder::set_bytes::*;
- pub use crate::re_bytes::*;
- pub use crate::re_set::bytes::*;
-}
+pub use crate::{builders::string::*, regex::string::*, regexset::string::*};
-mod backtrack;
-mod compile;
-#[cfg(feature = "perf-dfa")]
-mod dfa;
+mod builders;
+pub mod bytes;
mod error;
-mod exec;
-mod expand;
mod find_byte;
-mod input;
-mod literal;
#[cfg(feature = "pattern")]
mod pattern;
-mod pikevm;
-mod pool;
-mod prog;
-mod re_builder;
-mod re_bytes;
-mod re_set;
-mod re_trait;
-mod re_unicode;
-mod sparse;
-mod utf8;
-
-/// The `internal` module exists to support suspicious activity, such as
-/// testing different matching engines and supporting the `regex-debug` CLI
-/// utility.
-#[doc(hidden)]
-#[cfg(feature = "std")]
-pub mod internal {
- pub use crate::compile::Compiler;
- pub use crate::exec::{Exec, ExecBuilder};
- pub use crate::input::{Char, CharInput, Input, InputAt};
- pub use crate::literal::LiteralSearcher;
- pub use crate::prog::{EmptyLook, Inst, InstRanges, Program};
+mod regex;
+mod regexset;
+
+/// Escapes all regular expression meta characters in `pattern`.
+///
+/// The string returned may be safely used as a literal in a regular
+/// expression.
+pub fn escape(pattern: &str) -> alloc::string::String {
+ regex_syntax::escape(pattern)
}
diff --git a/vendor/regex/src/literal/imp.rs b/vendor/regex/src/literal/imp.rs
deleted file mode 100644
index 75fa6e37b..000000000
--- a/vendor/regex/src/literal/imp.rs
+++ /dev/null
@@ -1,413 +0,0 @@
-use std::mem;
-
-use aho_corasick::{self, packed, AhoCorasick};
-use memchr::{memchr, memchr2, memchr3, memmem};
-use regex_syntax::hir::literal::{Literal, Seq};
-
-/// A prefix extracted from a compiled regular expression.
-///
-/// A regex prefix is a set of literal strings that *must* be matched at the
-/// beginning of a regex in order for the entire regex to match. Similarly
-/// for a regex suffix.
-#[derive(Clone, Debug)]
-pub struct LiteralSearcher {
- complete: bool,
- lcp: Memmem,
- lcs: Memmem,
- matcher: Matcher,
-}
-
-#[derive(Clone, Debug)]
-enum Matcher {
- /// No literals. (Never advances through the input.)
- Empty,
- /// A set of four or more single byte literals.
- Bytes(SingleByteSet),
- /// A single substring, using vector accelerated routines when available.
- Memmem(Memmem),
- /// An Aho-Corasick automaton.
- AC { ac: AhoCorasick, lits: Vec<Literal> },
- /// A packed multiple substring searcher, using SIMD.
- ///
- /// Note that Aho-Corasick will actually use this packed searcher
- /// internally automatically, however, there is some overhead associated
- /// with going through the Aho-Corasick machinery. So using the packed
- /// searcher directly results in some gains.
- Packed { s: packed::Searcher, lits: Vec<Literal> },
-}
-
-impl LiteralSearcher {
- /// Returns a matcher that never matches and never advances the input.
- pub fn empty() -> Self {
- Self::new(Seq::infinite(), Matcher::Empty)
- }
-
- /// Returns a matcher for literal prefixes from the given set.
- pub fn prefixes(lits: Seq) -> Self {
- let matcher = Matcher::prefixes(&lits);
- Self::new(lits, matcher)
- }
-
- /// Returns a matcher for literal suffixes from the given set.
- pub fn suffixes(lits: Seq) -> Self {
- let matcher = Matcher::suffixes(&lits);
- Self::new(lits, matcher)
- }
-
- fn new(lits: Seq, matcher: Matcher) -> Self {
- LiteralSearcher {
- complete: lits.is_exact(),
- lcp: Memmem::new(lits.longest_common_prefix().unwrap_or(b"")),
- lcs: Memmem::new(lits.longest_common_suffix().unwrap_or(b"")),
- matcher,
- }
- }
-
- /// Returns true if all matches comprise the entire regular expression.
- ///
- /// This does not necessarily mean that a literal match implies a match
- /// of the regular expression. For example, the regular expression `^a`
- /// is comprised of a single complete literal `a`, but the regular
- /// expression demands that it only match at the beginning of a string.
- pub fn complete(&self) -> bool {
- self.complete && !self.is_empty()
- }
-
- /// Find the position of a literal in `haystack` if it exists.
- #[cfg_attr(feature = "perf-inline", inline(always))]
- pub fn find(&self, haystack: &[u8]) -> Option<(usize, usize)> {
- use self::Matcher::*;
- match self.matcher {
- Empty => Some((0, 0)),
- Bytes(ref sset) => sset.find(haystack).map(|i| (i, i + 1)),
- Memmem(ref s) => s.find(haystack).map(|i| (i, i + s.len())),
- AC { ref ac, .. } => {
- ac.find(haystack).map(|m| (m.start(), m.end()))
- }
- Packed { ref s, .. } => {
- s.find(haystack).map(|m| (m.start(), m.end()))
- }
- }
- }
-
- /// Like find, except matches must start at index `0`.
- pub fn find_start(&self, haystack: &[u8]) -> Option<(usize, usize)> {
- for lit in self.iter() {
- if lit.len() > haystack.len() {
- continue;
- }
- if lit == &haystack[0..lit.len()] {
- return Some((0, lit.len()));
- }
- }
- None
- }
-
- /// Like find, except matches must end at index `haystack.len()`.
- pub fn find_end(&self, haystack: &[u8]) -> Option<(usize, usize)> {
- for lit in self.iter() {
- if lit.len() > haystack.len() {
- continue;
- }
- if lit == &haystack[haystack.len() - lit.len()..] {
- return Some((haystack.len() - lit.len(), haystack.len()));
- }
- }
- None
- }
-
- /// Returns an iterator over all literals to be matched.
- pub fn iter(&self) -> LiteralIter<'_> {
- match self.matcher {
- Matcher::Empty => LiteralIter::Empty,
- Matcher::Bytes(ref sset) => LiteralIter::Bytes(&sset.dense),
- Matcher::Memmem(ref s) => LiteralIter::Single(&s.finder.needle()),
- Matcher::AC { ref lits, .. } => LiteralIter::AC(lits),
- Matcher::Packed { ref lits, .. } => LiteralIter::Packed(lits),
- }
- }
-
- /// Returns a matcher for the longest common prefix of this matcher.
- pub fn lcp(&self) -> &Memmem {
- &self.lcp
- }
-
- /// Returns a matcher for the longest common suffix of this matcher.
- pub fn lcs(&self) -> &Memmem {
- &self.lcs
- }
-
- /// Returns true iff this prefix is empty.
- pub fn is_empty(&self) -> bool {
- self.len() == 0
- }
-
- /// Returns the number of prefixes in this machine.
- pub fn len(&self) -> usize {
- use self::Matcher::*;
- match self.matcher {
- Empty => 0,
- Bytes(ref sset) => sset.dense.len(),
- Memmem(_) => 1,
- AC { ref ac, .. } => ac.patterns_len(),
- Packed { ref lits, .. } => lits.len(),
- }
- }
-
- /// Return the approximate heap usage of literals in bytes.
- pub fn approximate_size(&self) -> usize {
- use self::Matcher::*;
- match self.matcher {
- Empty => 0,
- Bytes(ref sset) => sset.approximate_size(),
- Memmem(ref single) => single.approximate_size(),
- AC { ref ac, .. } => ac.memory_usage(),
- Packed { ref s, .. } => s.memory_usage(),
- }
- }
-}
-
-impl Matcher {
- fn prefixes(lits: &Seq) -> Self {
- let sset = SingleByteSet::prefixes(lits);
- Matcher::new(lits, sset)
- }
-
- fn suffixes(lits: &Seq) -> Self {
- let sset = SingleByteSet::suffixes(lits);
- Matcher::new(lits, sset)
- }
-
- fn new(lits: &Seq, sset: SingleByteSet) -> Self {
- if lits.is_empty() || lits.min_literal_len() == Some(0) {
- return Matcher::Empty;
- }
- let lits = match lits.literals() {
- None => return Matcher::Empty,
- Some(members) => members,
- };
- if sset.dense.len() >= 26 {
- // Avoid trying to match a large number of single bytes.
- // This is *very* sensitive to a frequency analysis comparison
- // between the bytes in sset and the composition of the haystack.
- // No matter the size of sset, if its members all are rare in the
- // haystack, then it'd be worth using it. How to tune this... IDK.
- // ---AG
- return Matcher::Empty;
- }
- if sset.complete {
- return Matcher::Bytes(sset);
- }
- if lits.len() == 1 {
- return Matcher::Memmem(Memmem::new(lits[0].as_bytes()));
- }
-
- let pats: Vec<&[u8]> = lits.iter().map(|lit| lit.as_bytes()).collect();
- let is_aho_corasick_fast = sset.dense.len() <= 1 && sset.all_ascii;
- if lits.len() <= 100 && !is_aho_corasick_fast {
- let mut builder = packed::Config::new()
- .match_kind(packed::MatchKind::LeftmostFirst)
- .builder();
- if let Some(s) = builder.extend(&pats).build() {
- return Matcher::Packed { s, lits: lits.to_owned() };
- }
- }
- let ac = AhoCorasick::builder()
- .match_kind(aho_corasick::MatchKind::LeftmostFirst)
- .kind(Some(aho_corasick::AhoCorasickKind::DFA))
- .build(&pats)
- .unwrap();
- Matcher::AC { ac, lits: lits.to_owned() }
- }
-}
-
-#[derive(Debug)]
-pub enum LiteralIter<'a> {
- Empty,
- Bytes(&'a [u8]),
- Single(&'a [u8]),
- AC(&'a [Literal]),
- Packed(&'a [Literal]),
-}
-
-impl<'a> Iterator for LiteralIter<'a> {
- type Item = &'a [u8];
-
- fn next(&mut self) -> Option<Self::Item> {
- match *self {
- LiteralIter::Empty => None,
- LiteralIter::Bytes(ref mut many) => {
- if many.is_empty() {
- None
- } else {
- let next = &many[0..1];
- *many = &many[1..];
- Some(next)
- }
- }
- LiteralIter::Single(ref mut one) => {
- if one.is_empty() {
- None
- } else {
- let next = &one[..];
- *one = &[];
- Some(next)
- }
- }
- LiteralIter::AC(ref mut lits) => {
- if lits.is_empty() {
- None
- } else {
- let next = &lits[0];
- *lits = &lits[1..];
- Some(next.as_bytes())
- }
- }
- LiteralIter::Packed(ref mut lits) => {
- if lits.is_empty() {
- None
- } else {
- let next = &lits[0];
- *lits = &lits[1..];
- Some(next.as_bytes())
- }
- }
- }
- }
-}
-
-#[derive(Clone, Debug)]
-struct SingleByteSet {
- sparse: Vec<bool>,
- dense: Vec<u8>,
- complete: bool,
- all_ascii: bool,
-}
-
-impl SingleByteSet {
- fn new() -> SingleByteSet {
- SingleByteSet {
- sparse: vec![false; 256],
- dense: vec![],
- complete: true,
- all_ascii: true,
- }
- }
-
- fn prefixes(lits: &Seq) -> SingleByteSet {
- let mut sset = SingleByteSet::new();
- let lits = match lits.literals() {
- None => return sset,
- Some(lits) => lits,
- };
- for lit in lits.iter() {
- sset.complete = sset.complete && lit.len() == 1;
- if let Some(&b) = lit.as_bytes().get(0) {
- if !sset.sparse[b as usize] {
- if b > 0x7F {
- sset.all_ascii = false;
- }
- sset.dense.push(b);
- sset.sparse[b as usize] = true;
- }
- }
- }
- sset
- }
-
- fn suffixes(lits: &Seq) -> SingleByteSet {
- let mut sset = SingleByteSet::new();
- let lits = match lits.literals() {
- None => return sset,
- Some(lits) => lits,
- };
- for lit in lits.iter() {
- sset.complete = sset.complete && lit.len() == 1;
- if let Some(&b) = lit.as_bytes().last() {
- if !sset.sparse[b as usize] {
- if b > 0x7F {
- sset.all_ascii = false;
- }
- sset.dense.push(b);
- sset.sparse[b as usize] = true;
- }
- }
- }
- sset
- }
-
- /// Faster find that special cases certain sizes to use memchr.
- #[cfg_attr(feature = "perf-inline", inline(always))]
- fn find(&self, text: &[u8]) -> Option<usize> {
- match self.dense.len() {
- 0 => None,
- 1 => memchr(self.dense[0], text),
- 2 => memchr2(self.dense[0], self.dense[1], text),
- 3 => memchr3(self.dense[0], self.dense[1], self.dense[2], text),
- _ => self._find(text),
- }
- }
-
- /// Generic find that works on any sized set.
- fn _find(&self, haystack: &[u8]) -> Option<usize> {
- for (i, &b) in haystack.iter().enumerate() {
- if self.sparse[b as usize] {
- return Some(i);
- }
- }
- None
- }
-
- fn approximate_size(&self) -> usize {
- (self.dense.len() * mem::size_of::<u8>())
- + (self.sparse.len() * mem::size_of::<bool>())
- }
-}
-
-/// A simple wrapper around the memchr crate's memmem implementation.
-///
-/// The API this exposes mirrors the API of previous substring searchers that
-/// this supplanted.
-#[derive(Clone, Debug)]
-pub struct Memmem {
- finder: memmem::Finder<'static>,
- char_len: usize,
-}
-
-impl Memmem {
- fn new(pat: &[u8]) -> Memmem {
- Memmem {
- finder: memmem::Finder::new(pat).into_owned(),
- char_len: char_len_lossy(pat),
- }
- }
-
- #[cfg_attr(feature = "perf-inline", inline(always))]
- pub fn find(&self, haystack: &[u8]) -> Option<usize> {
- self.finder.find(haystack)
- }
-
- #[cfg_attr(feature = "perf-inline", inline(always))]
- pub fn is_suffix(&self, text: &[u8]) -> bool {
- if text.len() < self.len() {
- return false;
- }
- &text[text.len() - self.len()..] == self.finder.needle()
- }
-
- pub fn len(&self) -> usize {
- self.finder.needle().len()
- }
-
- pub fn char_len(&self) -> usize {
- self.char_len
- }
-
- fn approximate_size(&self) -> usize {
- self.finder.needle().len() * mem::size_of::<u8>()
- }
-}
-
-fn char_len_lossy(bytes: &[u8]) -> usize {
- String::from_utf8_lossy(bytes).chars().count()
-}
diff --git a/vendor/regex/src/literal/mod.rs b/vendor/regex/src/literal/mod.rs
deleted file mode 100644
index b9fb77aed..000000000
--- a/vendor/regex/src/literal/mod.rs
+++ /dev/null
@@ -1,55 +0,0 @@
-pub use self::imp::*;
-
-#[cfg(feature = "perf-literal")]
-mod imp;
-
-#[allow(missing_docs)]
-#[cfg(not(feature = "perf-literal"))]
-mod imp {
- use regex_syntax::hir::literal::Seq;
-
- #[derive(Clone, Debug)]
- pub struct LiteralSearcher(());
-
- impl LiteralSearcher {
- pub fn empty() -> Self {
- LiteralSearcher(())
- }
-
- pub fn prefixes(_: Seq) -> Self {
- LiteralSearcher(())
- }
-
- pub fn suffixes(_: Seq) -> Self {
- LiteralSearcher(())
- }
-
- pub fn complete(&self) -> bool {
- false
- }
-
- pub fn find(&self, _: &[u8]) -> Option<(usize, usize)> {
- unreachable!()
- }
-
- pub fn find_start(&self, _: &[u8]) -> Option<(usize, usize)> {
- unreachable!()
- }
-
- pub fn find_end(&self, _: &[u8]) -> Option<(usize, usize)> {
- unreachable!()
- }
-
- pub fn is_empty(&self) -> bool {
- true
- }
-
- pub fn len(&self) -> usize {
- 0
- }
-
- pub fn approximate_size(&self) -> usize {
- 0
- }
- }
-}
diff --git a/vendor/regex/src/pattern.rs b/vendor/regex/src/pattern.rs
index 00549e510..2db04d8b3 100644
--- a/vendor/regex/src/pattern.rs
+++ b/vendor/regex/src/pattern.rs
@@ -1,6 +1,6 @@
-use std::str::pattern::{Pattern, SearchStep, Searcher};
+use core::str::pattern::{Pattern, SearchStep, Searcher};
-use crate::re_unicode::{Matches, Regex};
+use crate::{Matches, Regex};
#[derive(Debug)]
pub struct RegexSearcher<'r, 't> {
diff --git a/vendor/regex/src/pikevm.rs b/vendor/regex/src/pikevm.rs
deleted file mode 100644
index 8c9eac2d3..000000000
--- a/vendor/regex/src/pikevm.rs
+++ /dev/null
@@ -1,360 +0,0 @@
-// This module implements the Pike VM. That is, it guarantees linear time
-// search of a regex on any text with memory use proportional to the size of
-// the regex.
-//
-// It is equal in power to the backtracking engine in this crate, except the
-// backtracking engine is typically faster on small regexes/texts at the
-// expense of a bigger memory footprint.
-//
-// It can do more than the DFA can (specifically, record capture locations
-// and execute Unicode word boundary assertions), but at a slower speed.
-// Specifically, the Pike VM executes a DFA implicitly by repeatedly expanding
-// epsilon transitions. That is, the Pike VM engine can be in multiple states
-// at once where as the DFA is only ever in one state at a time.
-//
-// Therefore, the Pike VM is generally treated as the fallback when the other
-// matching engines either aren't feasible to run or are insufficient.
-
-use std::mem;
-
-use crate::exec::ProgramCache;
-use crate::input::{Input, InputAt};
-use crate::prog::{InstPtr, Program};
-use crate::re_trait::Slot;
-use crate::sparse::SparseSet;
-
-/// An NFA simulation matching engine.
-#[derive(Debug)]
-pub struct Fsm<'r, I> {
- /// The sequence of opcodes (among other things) that is actually executed.
- ///
- /// The program may be byte oriented or Unicode codepoint oriented.
- prog: &'r Program,
- /// An explicit stack used for following epsilon transitions. (This is
- /// borrowed from the cache.)
- stack: &'r mut Vec<FollowEpsilon>,
- /// The input to search.
- input: I,
-}
-
-/// A cached allocation that can be reused on each execution.
-#[derive(Clone, Debug)]
-pub struct Cache {
- /// A pair of ordered sets for tracking NFA states.
- clist: Threads,
- nlist: Threads,
- /// An explicit stack used for following epsilon transitions.
- stack: Vec<FollowEpsilon>,
-}
-
-/// An ordered set of NFA states and their captures.
-#[derive(Clone, Debug)]
-struct Threads {
- /// An ordered set of opcodes (each opcode is an NFA state).
- set: SparseSet,
- /// Captures for every NFA state.
- ///
- /// It is stored in row-major order, where the columns are the capture
- /// slots and the rows are the states.
- caps: Vec<Slot>,
- /// The number of capture slots stored per thread. (Every capture has
- /// two slots.)
- slots_per_thread: usize,
-}
-
-/// A representation of an explicit stack frame when following epsilon
-/// transitions. This is used to avoid recursion.
-#[derive(Clone, Debug)]
-enum FollowEpsilon {
- /// Follow transitions at the given instruction pointer.
- IP(InstPtr),
- /// Restore the capture slot with the given position in the input.
- Capture { slot: usize, pos: Slot },
-}
-
-impl Cache {
- /// Create a new allocation used by the NFA machine to record execution
- /// and captures.
- pub fn new(_prog: &Program) -> Self {
- Cache { clist: Threads::new(), nlist: Threads::new(), stack: vec![] }
- }
-}
-
-impl<'r, I: Input> Fsm<'r, I> {
- /// Execute the NFA matching engine.
- ///
- /// If there's a match, `exec` returns `true` and populates the given
- /// captures accordingly.
- pub fn exec(
- prog: &'r Program,
- cache: &ProgramCache,
- matches: &mut [bool],
- slots: &mut [Slot],
- quit_after_match: bool,
- input: I,
- start: usize,
- end: usize,
- ) -> bool {
- let mut cache = cache.borrow_mut();
- let cache = &mut cache.pikevm;
- cache.clist.resize(prog.len(), prog.captures.len());
- cache.nlist.resize(prog.len(), prog.captures.len());
- let at = input.at(start);
- Fsm { prog, stack: &mut cache.stack, input }.exec_(
- &mut cache.clist,
- &mut cache.nlist,
- matches,
- slots,
- quit_after_match,
- at,
- end,
- )
- }
-
- fn exec_(
- &mut self,
- mut clist: &mut Threads,
- mut nlist: &mut Threads,
- matches: &mut [bool],
- slots: &mut [Slot],
- quit_after_match: bool,
- mut at: InputAt,
- end: usize,
- ) -> bool {
- let mut matched = false;
- let mut all_matched = false;
- clist.set.clear();
- nlist.set.clear();
- 'LOOP: loop {
- if clist.set.is_empty() {
- // Three ways to bail out when our current set of threads is
- // empty.
- //
- // 1. We have a match---so we're done exploring any possible
- // alternatives. Time to quit. (We can't do this if we're
- // looking for matches for multiple regexes, unless we know
- // they all matched.)
- //
- // 2. If the expression starts with a '^' we can terminate as
- // soon as the last thread dies.
- if (matched && matches.len() <= 1)
- || all_matched
- || (!at.is_start() && self.prog.is_anchored_start)
- {
- break;
- }
-
- // 3. If there's a literal prefix for the program, try to
- // jump ahead quickly. If it can't be found, then we can
- // bail out early.
- if !self.prog.prefixes.is_empty() {
- at = match self.input.prefix_at(&self.prog.prefixes, at) {
- None => break,
- Some(at) => at,
- };
- }
- }
-
- // This simulates a preceding '.*?' for every regex by adding
- // a state starting at the current position in the input for the
- // beginning of the program only if we don't already have a match.
- if clist.set.is_empty()
- || (!self.prog.is_anchored_start && !all_matched)
- {
- self.add(&mut clist, slots, 0, at);
- }
- // The previous call to "add" actually inspects the position just
- // before the current character. For stepping through the machine,
- // we can to look at the current character, so we advance the
- // input.
- let at_next = self.input.at(at.next_pos());
- for i in 0..clist.set.len() {
- let ip = clist.set[i];
- if self.step(
- &mut nlist,
- matches,
- slots,
- clist.caps(ip),
- ip,
- at,
- at_next,
- ) {
- matched = true;
- all_matched = all_matched || matches.iter().all(|&b| b);
- if quit_after_match {
- // If we only care if a match occurs (not its
- // position), then we can quit right now.
- break 'LOOP;
- }
- if self.prog.matches.len() == 1 {
- // We don't need to check the rest of the threads
- // in this set because we've matched something
- // ("leftmost-first"). However, we still need to check
- // threads in the next set to support things like
- // greedy matching.
- //
- // This is only true on normal regexes. For regex sets,
- // we need to mush on to observe other matches.
- break;
- }
- }
- }
- if at.pos() >= end {
- break;
- }
- at = at_next;
- mem::swap(clist, nlist);
- nlist.set.clear();
- }
- matched
- }
-
- /// Step through the input, one token (byte or codepoint) at a time.
- ///
- /// nlist is the set of states that will be processed on the next token
- /// in the input.
- ///
- /// caps is the set of captures passed by the caller of the NFA. They are
- /// written to only when a match state is visited.
- ///
- /// thread_caps is the set of captures set for the current NFA state, ip.
- ///
- /// at and at_next are the current and next positions in the input. at or
- /// at_next may be EOF.
- fn step(
- &mut self,
- nlist: &mut Threads,
- matches: &mut [bool],
- slots: &mut [Slot],
- thread_caps: &mut [Option<usize>],
- ip: usize,
- at: InputAt,
- at_next: InputAt,
- ) -> bool {
- use crate::prog::Inst::*;
- match self.prog[ip] {
- Match(match_slot) => {
- if match_slot < matches.len() {
- matches[match_slot] = true;
- }
- for (slot, val) in slots.iter_mut().zip(thread_caps.iter()) {
- *slot = *val;
- }
- true
- }
- Char(ref inst) => {
- if inst.c == at.char() {
- self.add(nlist, thread_caps, inst.goto, at_next);
- }
- false
- }
- Ranges(ref inst) => {
- if inst.matches(at.char()) {
- self.add(nlist, thread_caps, inst.goto, at_next);
- }
- false
- }
- Bytes(ref inst) => {
- if let Some(b) = at.byte() {
- if inst.matches(b) {
- self.add(nlist, thread_caps, inst.goto, at_next);
- }
- }
- false
- }
- EmptyLook(_) | Save(_) | Split(_) => false,
- }
- }
-
- /// Follows epsilon transitions and adds them for processing to nlist,
- /// starting at and including ip.
- fn add(
- &mut self,
- nlist: &mut Threads,
- thread_caps: &mut [Option<usize>],
- ip: usize,
- at: InputAt,
- ) {
- self.stack.push(FollowEpsilon::IP(ip));
- while let Some(frame) = self.stack.pop() {
- match frame {
- FollowEpsilon::IP(ip) => {
- self.add_step(nlist, thread_caps, ip, at);
- }
- FollowEpsilon::Capture { slot, pos } => {
- thread_caps[slot] = pos;
- }
- }
- }
- }
-
- /// A helper function for add that avoids excessive pushing to the stack.
- fn add_step(
- &mut self,
- nlist: &mut Threads,
- thread_caps: &mut [Option<usize>],
- mut ip: usize,
- at: InputAt,
- ) {
- // Instead of pushing and popping to the stack, we mutate ip as we
- // traverse the set of states. We only push to the stack when we
- // absolutely need recursion (restoring captures or following a
- // branch).
- use crate::prog::Inst::*;
- loop {
- // Don't visit states we've already added.
- if nlist.set.contains(ip) {
- return;
- }
- nlist.set.insert(ip);
- match self.prog[ip] {
- EmptyLook(ref inst) => {
- if self.input.is_empty_match(at, inst) {
- ip = inst.goto;
- }
- }
- Save(ref inst) => {
- if inst.slot < thread_caps.len() {
- self.stack.push(FollowEpsilon::Capture {
- slot: inst.slot,
- pos: thread_caps[inst.slot],
- });
- thread_caps[inst.slot] = Some(at.pos());
- }
- ip = inst.goto;
- }
- Split(ref inst) => {
- self.stack.push(FollowEpsilon::IP(inst.goto2));
- ip = inst.goto1;
- }
- Match(_) | Char(_) | Ranges(_) | Bytes(_) => {
- let t = &mut nlist.caps(ip);
- for (slot, val) in t.iter_mut().zip(thread_caps.iter()) {
- *slot = *val;
- }
- return;
- }
- }
- }
- }
-}
-
-impl Threads {
- fn new() -> Self {
- Threads { set: SparseSet::new(0), caps: vec![], slots_per_thread: 0 }
- }
-
- fn resize(&mut self, num_insts: usize, ncaps: usize) {
- if num_insts == self.set.capacity() {
- return;
- }
- self.slots_per_thread = ncaps * 2;
- self.set = SparseSet::new(num_insts);
- self.caps = vec![None; self.slots_per_thread * num_insts];
- }
-
- fn caps(&mut self, pc: usize) -> &mut [Option<usize>] {
- let i = pc * self.slots_per_thread;
- &mut self.caps[i..i + self.slots_per_thread]
- }
-}
diff --git a/vendor/regex/src/pool.rs b/vendor/regex/src/pool.rs
deleted file mode 100644
index 6a6f15b19..000000000
--- a/vendor/regex/src/pool.rs
+++ /dev/null
@@ -1,333 +0,0 @@
-// This module provides a relatively simple thread-safe pool of reusable
-// objects. For the most part, it's implemented by a stack represented by a
-// Mutex<Vec<T>>. It has one small trick: because unlocking a mutex is somewhat
-// costly, in the case where a pool is accessed by the first thread that tried
-// to get a value, we bypass the mutex. Here are some benchmarks showing the
-// difference.
-//
-// 1) misc::anchored_literal_long_non_match 21 (18571 MB/s)
-// 2) misc::anchored_literal_long_non_match 107 (3644 MB/s)
-// 3) misc::anchored_literal_long_non_match 45 (8666 MB/s)
-// 4) misc::anchored_literal_long_non_match 19 (20526 MB/s)
-//
-// (1) represents our baseline: the master branch at the time of writing when
-// using the 'thread_local' crate to implement the pool below.
-//
-// (2) represents a naive pool implemented completely via Mutex<Vec<T>>. There
-// is no special trick for bypassing the mutex.
-//
-// (3) is the same as (2), except it uses Mutex<Vec<Box<T>>>. It is twice as
-// fast because a Box<T> is much smaller than the T we use with a Pool in this
-// crate. So pushing and popping a Box<T> from a Vec is quite a bit faster
-// than for T.
-//
-// (4) is the same as (3), but with the trick for bypassing the mutex in the
-// case of the first-to-get thread.
-//
-// Why move off of thread_local? Even though (4) is a hair faster than (1)
-// above, this was not the main goal. The main goal was to move off of
-// thread_local and find a way to *simply* re-capture some of its speed for
-// regex's specific case. So again, why move off of it? The *primary* reason is
-// because of memory leaks. See https://github.com/rust-lang/regex/issues/362
-// for example. (Why do I want it to be simple? Well, I suppose what I mean is,
-// "use as much safe code as possible to minimize risk and be as sure as I can
-// be that it is correct.")
-//
-// My guess is that the thread_local design is probably not appropriate for
-// regex since its memory usage scales to the number of active threads that
-// have used a regex, where as the pool below scales to the number of threads
-// that simultaneously use a regex. While neither case permits contraction,
-// since we own the pool data structure below, we can add contraction if a
-// clear use case pops up in the wild. More pressingly though, it seems that
-// there are at least some use case patterns where one might have many threads
-// sitting around that might have used a regex at one point. While thread_local
-// does try to reuse space previously used by a thread that has since stopped,
-// its maximal memory usage still scales with the total number of active
-// threads. In contrast, the pool below scales with the total number of threads
-// *simultaneously* using the pool. The hope is that this uses less memory
-// overall. And if it doesn't, we can hopefully tune it somehow.
-//
-// It seems that these sort of conditions happen frequently
-// in FFI inside of other more "managed" languages. This was
-// mentioned in the issue linked above, and also mentioned here:
-// https://github.com/BurntSushi/rure-go/issues/3. And in particular, users
-// confirm that disabling the use of thread_local resolves the leak.
-//
-// There were other weaker reasons for moving off of thread_local as well.
-// Namely, at the time, I was looking to reduce dependencies. And for something
-// like regex, maintenance can be simpler when we own the full dependency tree.
-
-use std::panic::{RefUnwindSafe, UnwindSafe};
-use std::sync::atomic::{AtomicUsize, Ordering};
-use std::sync::Mutex;
-
-/// An atomic counter used to allocate thread IDs.
-static COUNTER: AtomicUsize = AtomicUsize::new(1);
-
-thread_local!(
- /// A thread local used to assign an ID to a thread.
- static THREAD_ID: usize = {
- let next = COUNTER.fetch_add(1, Ordering::Relaxed);
- // SAFETY: We cannot permit the reuse of thread IDs since reusing a
- // thread ID might result in more than one thread "owning" a pool,
- // and thus, permit accessing a mutable value from multiple threads
- // simultaneously without synchronization. The intent of this panic is
- // to be a sanity check. It is not expected that the thread ID space
- // will actually be exhausted in practice.
- //
- // This checks that the counter never wraps around, since atomic
- // addition wraps around on overflow.
- if next == 0 {
- panic!("regex: thread ID allocation space exhausted");
- }
- next
- };
-);
-
-/// The type of the function used to create values in a pool when the pool is
-/// empty and the caller requests one.
-type CreateFn<T> =
- Box<dyn Fn() -> T + Send + Sync + UnwindSafe + RefUnwindSafe + 'static>;
-
-/// A simple thread safe pool for reusing values.
-///
-/// Getting a value out comes with a guard. When that guard is dropped, the
-/// value is automatically put back in the pool.
-///
-/// A Pool<T> impls Sync when T is Send (even if it's not Sync). This means
-/// that T can use interior mutability. This is possible because a pool is
-/// guaranteed to provide a value to exactly one thread at any time.
-///
-/// Currently, a pool never contracts in size. Its size is proportional to the
-/// number of simultaneous uses.
-pub struct Pool<T> {
- /// A stack of T values to hand out. These are used when a Pool is
- /// accessed by a thread that didn't create it.
- stack: Mutex<Vec<Box<T>>>,
- /// A function to create more T values when stack is empty and a caller
- /// has requested a T.
- create: CreateFn<T>,
- /// The ID of the thread that owns this pool. The owner is the thread
- /// that makes the first call to 'get'. When the owner calls 'get', it
- /// gets 'owner_val' directly instead of returning a T from 'stack'.
- /// See comments elsewhere for details, but this is intended to be an
- /// optimization for the common case that makes getting a T faster.
- ///
- /// It is initialized to a value of zero (an impossible thread ID) as a
- /// sentinel to indicate that it is unowned.
- owner: AtomicUsize,
- /// A value to return when the caller is in the same thread that created
- /// the Pool.
- owner_val: T,
-}
-
-// SAFETY: Since we want to use a Pool from multiple threads simultaneously
-// behind an Arc, we need for it to be Sync. In cases where T is sync, Pool<T>
-// would be Sync. However, since we use a Pool to store mutable scratch space,
-// we wind up using a T that has interior mutability and is thus itself not
-// Sync. So what we *really* want is for our Pool<T> to by Sync even when T is
-// not Sync (but is at least Send).
-//
-// The only non-sync aspect of a Pool is its 'owner_val' field, which is used
-// to implement faster access to a pool value in the common case of a pool
-// being accessed in the same thread in which it was created. The 'stack' field
-// is also shared, but a Mutex<T> where T: Send is already Sync. So we only
-// need to worry about 'owner_val'.
-//
-// The key is to guarantee that 'owner_val' can only ever be accessed from one
-// thread. In our implementation below, we guarantee this by only returning the
-// 'owner_val' when the ID of the current thread matches the ID of the thread
-// that created the Pool. Since this can only ever be one thread, it follows
-// that only one thread can access 'owner_val' at any point in time. Thus, it
-// is safe to declare that Pool<T> is Sync when T is Send.
-//
-// NOTE: It would also be possible to make the owning thread be the *first*
-// thread that tries to get a value out of a Pool. However, the current
-// implementation is a little simpler and it's not clear if making the first
-// thread (rather than the creating thread) is meaningfully better.
-//
-// If there is a way to achieve our performance goals using safe code, then
-// I would very much welcome a patch. As it stands, the implementation below
-// tries to balance safety with performance. The case where a Regex is used
-// from multiple threads simultaneously will suffer a bit since getting a cache
-// will require unlocking a mutex.
-unsafe impl<T: Send> Sync for Pool<T> {}
-
-impl<T: ::std::fmt::Debug> ::std::fmt::Debug for Pool<T> {
- fn fmt(&self, f: &mut ::std::fmt::Formatter<'_>) -> ::std::fmt::Result {
- f.debug_struct("Pool")
- .field("stack", &self.stack)
- .field("owner", &self.owner)
- .field("owner_val", &self.owner_val)
- .finish()
- }
-}
-
-/// A guard that is returned when a caller requests a value from the pool.
-///
-/// The purpose of the guard is to use RAII to automatically put the value back
-/// in the pool once it's dropped.
-#[derive(Debug)]
-pub struct PoolGuard<'a, T: Send> {
- /// The pool that this guard is attached to.
- pool: &'a Pool<T>,
- /// This is None when the guard represents the special "owned" value. In
- /// which case, the value is retrieved from 'pool.owner_val'.
- value: Option<Box<T>>,
-}
-
-impl<T: Send> Pool<T> {
- /// Create a new pool. The given closure is used to create values in the
- /// pool when necessary.
- pub fn new(create: CreateFn<T>) -> Pool<T> {
- let owner = AtomicUsize::new(0);
- let owner_val = create();
- Pool { stack: Mutex::new(vec![]), create, owner, owner_val }
- }
-
- /// Get a value from the pool. The caller is guaranteed to have exclusive
- /// access to the given value.
- ///
- /// Note that there is no guarantee provided about which value in the
- /// pool is returned. That is, calling get, dropping the guard (causing
- /// the value to go back into the pool) and then calling get again is NOT
- /// guaranteed to return the same value received in the first get call.
- #[cfg_attr(feature = "perf-inline", inline(always))]
- pub fn get(&self) -> PoolGuard<'_, T> {
- // Our fast path checks if the caller is the thread that "owns" this
- // pool. Or stated differently, whether it is the first thread that
- // tried to extract a value from the pool. If it is, then we can return
- // a T to the caller without going through a mutex.
- //
- // SAFETY: We must guarantee that only one thread gets access to this
- // value. Since a thread is uniquely identified by the THREAD_ID thread
- // local, it follows that is the caller's thread ID is equal to the
- // owner, then only one thread may receive this value.
- let caller = THREAD_ID.with(|id| *id);
- let owner = self.owner.load(Ordering::Relaxed);
- if caller == owner {
- return self.guard_owned();
- }
- self.get_slow(caller, owner)
- }
-
- /// This is the "slow" version that goes through a mutex to pop an
- /// allocated value off a stack to return to the caller. (Or, if the stack
- /// is empty, a new value is created.)
- ///
- /// If the pool has no owner, then this will set the owner.
- #[cold]
- fn get_slow(&self, caller: usize, owner: usize) -> PoolGuard<'_, T> {
- use std::sync::atomic::Ordering::Relaxed;
-
- if owner == 0 {
- // The sentinel 0 value means this pool is not yet owned. We
- // try to atomically set the owner. If we do, then this thread
- // becomes the owner and we can return a guard that represents
- // the special T for the owner.
- let res = self.owner.compare_exchange(0, caller, Relaxed, Relaxed);
- if res.is_ok() {
- return self.guard_owned();
- }
- }
- let mut stack = self.stack.lock().unwrap();
- let value = match stack.pop() {
- None => Box::new((self.create)()),
- Some(value) => value,
- };
- self.guard_stack(value)
- }
-
- /// Puts a value back into the pool. Callers don't need to call this. Once
- /// the guard that's returned by 'get' is dropped, it is put back into the
- /// pool automatically.
- fn put(&self, value: Box<T>) {
- let mut stack = self.stack.lock().unwrap();
- stack.push(value);
- }
-
- /// Create a guard that represents the special owned T.
- fn guard_owned(&self) -> PoolGuard<'_, T> {
- PoolGuard { pool: self, value: None }
- }
-
- /// Create a guard that contains a value from the pool's stack.
- fn guard_stack(&self, value: Box<T>) -> PoolGuard<'_, T> {
- PoolGuard { pool: self, value: Some(value) }
- }
-}
-
-impl<'a, T: Send> PoolGuard<'a, T> {
- /// Return the underlying value.
- pub fn value(&self) -> &T {
- match self.value {
- None => &self.pool.owner_val,
- Some(ref v) => &**v,
- }
- }
-}
-
-impl<'a, T: Send> Drop for PoolGuard<'a, T> {
- #[cfg_attr(feature = "perf-inline", inline(always))]
- fn drop(&mut self) {
- if let Some(value) = self.value.take() {
- self.pool.put(value);
- }
- }
-}
-
-#[cfg(test)]
-mod tests {
- use std::panic::{RefUnwindSafe, UnwindSafe};
-
- use super::*;
-
- #[test]
- fn oibits() {
- use crate::exec::ProgramCache;
-
- fn has_oibits<T: Send + Sync + UnwindSafe + RefUnwindSafe>() {}
- has_oibits::<Pool<ProgramCache>>();
- }
-
- // Tests that Pool implements the "single owner" optimization. That is, the
- // thread that first accesses the pool gets its own copy, while all other
- // threads get distinct copies.
- #[test]
- fn thread_owner_optimization() {
- use std::cell::RefCell;
- use std::sync::Arc;
-
- let pool: Arc<Pool<RefCell<Vec<char>>>> =
- Arc::new(Pool::new(Box::new(|| RefCell::new(vec!['a']))));
- pool.get().value().borrow_mut().push('x');
-
- let pool1 = pool.clone();
- let t1 = std::thread::spawn(move || {
- let guard = pool1.get();
- let v = guard.value();
- v.borrow_mut().push('y');
- });
-
- let pool2 = pool.clone();
- let t2 = std::thread::spawn(move || {
- let guard = pool2.get();
- let v = guard.value();
- v.borrow_mut().push('z');
- });
-
- t1.join().unwrap();
- t2.join().unwrap();
-
- // If we didn't implement the single owner optimization, then one of
- // the threads above is likely to have mutated the [a, x] vec that
- // we stuffed in the pool before spawning the threads. But since
- // neither thread was first to access the pool, and because of the
- // optimization, we should be guaranteed that neither thread mutates
- // the special owned pool value.
- //
- // (Technically this is an implementation detail and not a contract of
- // Pool's API.)
- assert_eq!(vec!['a', 'x'], *pool.get().value().borrow());
- }
-}
diff --git a/vendor/regex/src/prog.rs b/vendor/regex/src/prog.rs
deleted file mode 100644
index 100862cf1..000000000
--- a/vendor/regex/src/prog.rs
+++ /dev/null
@@ -1,451 +0,0 @@
-use std::cmp::Ordering;
-use std::collections::HashMap;
-use std::fmt;
-use std::mem;
-use std::ops::Deref;
-use std::slice;
-use std::sync::Arc;
-
-use crate::input::Char;
-use crate::literal::LiteralSearcher;
-
-/// `InstPtr` represents the index of an instruction in a regex program.
-pub type InstPtr = usize;
-
-/// Program is a sequence of instructions and various facts about thos
-/// instructions.
-#[derive(Clone)]
-pub struct Program {
- /// A sequence of instructions that represents an NFA.
- pub insts: Vec<Inst>,
- /// Pointers to each Match instruction in the sequence.
- ///
- /// This is always length 1 unless this program represents a regex set.
- pub matches: Vec<InstPtr>,
- /// The ordered sequence of all capture groups extracted from the AST.
- /// Unnamed groups are `None`.
- pub captures: Vec<Option<String>>,
- /// Pointers to all named capture groups into `captures`.
- pub capture_name_idx: Arc<HashMap<String, usize>>,
- /// If the number of capture groups is the same for all possible matches,
- /// then this is that number.
- pub static_captures_len: Option<usize>,
- /// A pointer to the start instruction. This can vary depending on how
- /// the program was compiled. For example, programs for use with the DFA
- /// engine have a `.*?` inserted at the beginning of unanchored regular
- /// expressions. The actual starting point of the program is after the
- /// `.*?`.
- pub start: InstPtr,
- /// A set of equivalence classes for discriminating bytes in the compiled
- /// program.
- pub byte_classes: Vec<u8>,
- /// When true, this program can only match valid UTF-8.
- pub only_utf8: bool,
- /// When true, this program uses byte range instructions instead of Unicode
- /// range instructions.
- pub is_bytes: bool,
- /// When true, the program is compiled for DFA matching. For example, this
- /// implies `is_bytes` and also inserts a preceding `.*?` for unanchored
- /// regexes.
- pub is_dfa: bool,
- /// When true, the program matches text in reverse (for use only in the
- /// DFA).
- pub is_reverse: bool,
- /// Whether the regex must match from the start of the input.
- pub is_anchored_start: bool,
- /// Whether the regex must match at the end of the input.
- pub is_anchored_end: bool,
- /// Whether this program contains a Unicode word boundary instruction.
- pub has_unicode_word_boundary: bool,
- /// A possibly empty machine for very quickly matching prefix literals.
- pub prefixes: LiteralSearcher,
- /// A limit on the size of the cache that the DFA is allowed to use while
- /// matching.
- ///
- /// The cache limit specifies approximately how much space we're willing to
- /// give to the state cache. Once the state cache exceeds the size, it is
- /// wiped and all states must be re-computed.
- ///
- /// Note that this value does not impact correctness. It can be set to 0
- /// and the DFA will run just fine. (It will only ever store exactly one
- /// state in the cache, and will likely run very slowly, but it will work.)
- ///
- /// Also note that this limit is *per thread of execution*. That is,
- /// if the same regex is used to search text across multiple threads
- /// simultaneously, then the DFA cache is not shared. Instead, copies are
- /// made.
- pub dfa_size_limit: usize,
-}
-
-impl Program {
- /// Creates an empty instruction sequence. Fields are given default
- /// values.
- pub fn new() -> Self {
- Program {
- insts: vec![],
- matches: vec![],
- captures: vec![],
- capture_name_idx: Arc::new(HashMap::new()),
- static_captures_len: None,
- start: 0,
- byte_classes: vec![0; 256],
- only_utf8: true,
- is_bytes: false,
- is_dfa: false,
- is_reverse: false,
- is_anchored_start: false,
- is_anchored_end: false,
- has_unicode_word_boundary: false,
- prefixes: LiteralSearcher::empty(),
- dfa_size_limit: 2 * (1 << 20),
- }
- }
-
- /// If pc is an index to a no-op instruction (like Save), then return the
- /// next pc that is not a no-op instruction.
- pub fn skip(&self, mut pc: usize) -> usize {
- loop {
- match self[pc] {
- Inst::Save(ref i) => pc = i.goto,
- _ => return pc,
- }
- }
- }
-
- /// Return true if and only if an execution engine at instruction `pc` will
- /// always lead to a match.
- pub fn leads_to_match(&self, pc: usize) -> bool {
- if self.matches.len() > 1 {
- // If we have a regex set, then we have more than one ending
- // state, so leading to one of those states is generally
- // meaningless.
- return false;
- }
- match self[self.skip(pc)] {
- Inst::Match(_) => true,
- _ => false,
- }
- }
-
- /// Returns true if the current configuration demands that an implicit
- /// `.*?` be prepended to the instruction sequence.
- pub fn needs_dotstar(&self) -> bool {
- self.is_dfa && !self.is_reverse && !self.is_anchored_start
- }
-
- /// Returns true if this program uses Byte instructions instead of
- /// Char/Range instructions.
- pub fn uses_bytes(&self) -> bool {
- self.is_bytes || self.is_dfa
- }
-
- /// Returns true if this program exclusively matches valid UTF-8 bytes.
- ///
- /// That is, if an invalid UTF-8 byte is seen, then no match is possible.
- pub fn only_utf8(&self) -> bool {
- self.only_utf8
- }
-
- /// Return the approximate heap usage of this instruction sequence in
- /// bytes.
- pub fn approximate_size(&self) -> usize {
- // The only instruction that uses heap space is Ranges (for
- // Unicode codepoint programs) to store non-overlapping codepoint
- // ranges. To keep this operation constant time, we ignore them.
- (self.len() * mem::size_of::<Inst>())
- + (self.matches.len() * mem::size_of::<InstPtr>())
- + (self.captures.len() * mem::size_of::<Option<String>>())
- + (self.capture_name_idx.len()
- * (mem::size_of::<String>() + mem::size_of::<usize>()))
- + (self.byte_classes.len() * mem::size_of::<u8>())
- + self.prefixes.approximate_size()
- }
-}
-
-impl Deref for Program {
- type Target = [Inst];
-
- #[cfg_attr(feature = "perf-inline", inline(always))]
- fn deref(&self) -> &Self::Target {
- &*self.insts
- }
-}
-
-impl fmt::Debug for Program {
- fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
- use self::Inst::*;
-
- fn with_goto(cur: usize, goto: usize, fmtd: String) -> String {
- if goto == cur + 1 {
- fmtd
- } else {
- format!("{} (goto: {})", fmtd, goto)
- }
- }
-
- fn visible_byte(b: u8) -> String {
- use std::ascii::escape_default;
- let escaped = escape_default(b).collect::<Vec<u8>>();
- String::from_utf8_lossy(&escaped).into_owned()
- }
-
- for (pc, inst) in self.iter().enumerate() {
- match *inst {
- Match(slot) => write!(f, "{:04} Match({:?})", pc, slot)?,
- Save(ref inst) => {
- let s = format!("{:04} Save({})", pc, inst.slot);
- write!(f, "{}", with_goto(pc, inst.goto, s))?;
- }
- Split(ref inst) => {
- write!(
- f,
- "{:04} Split({}, {})",
- pc, inst.goto1, inst.goto2
- )?;
- }
- EmptyLook(ref inst) => {
- let s = format!("{:?}", inst.look);
- write!(f, "{:04} {}", pc, with_goto(pc, inst.goto, s))?;
- }
- Char(ref inst) => {
- let s = format!("{:?}", inst.c);
- write!(f, "{:04} {}", pc, with_goto(pc, inst.goto, s))?;
- }
- Ranges(ref inst) => {
- let ranges = inst
- .ranges
- .iter()
- .map(|r| format!("{:?}-{:?}", r.0, r.1))
- .collect::<Vec<String>>()
- .join(", ");
- write!(
- f,
- "{:04} {}",
- pc,
- with_goto(pc, inst.goto, ranges)
- )?;
- }
- Bytes(ref inst) => {
- let s = format!(
- "Bytes({}, {})",
- visible_byte(inst.start),
- visible_byte(inst.end)
- );
- write!(f, "{:04} {}", pc, with_goto(pc, inst.goto, s))?;
- }
- }
- if pc == self.start {
- write!(f, " (start)")?;
- }
- writeln!(f)?;
- }
- Ok(())
- }
-}
-
-impl<'a> IntoIterator for &'a Program {
- type Item = &'a Inst;
- type IntoIter = slice::Iter<'a, Inst>;
- fn into_iter(self) -> Self::IntoIter {
- self.iter()
- }
-}
-
-/// Inst is an instruction code in a Regex program.
-///
-/// Regrettably, a regex program either contains Unicode codepoint
-/// instructions (Char and Ranges) or it contains byte instructions (Bytes).
-/// A regex program can never contain both.
-///
-/// It would be worth investigating splitting this into two distinct types and
-/// then figuring out how to make the matching engines polymorphic over those
-/// types without sacrificing performance.
-///
-/// Other than the benefit of moving invariants into the type system, another
-/// benefit is the decreased size. If we remove the `Char` and `Ranges`
-/// instructions from the `Inst` enum, then its size shrinks from 32 bytes to
-/// 24 bytes. (This is because of the removal of a `Box<[]>` in the `Ranges`
-/// variant.) Given that byte based machines are typically much bigger than
-/// their Unicode analogues (because they can decode UTF-8 directly), this ends
-/// up being a pretty significant savings.
-#[derive(Clone, Debug)]
-pub enum Inst {
- /// Match indicates that the program has reached a match state.
- ///
- /// The number in the match corresponds to the Nth logical regular
- /// expression in this program. This index is always 0 for normal regex
- /// programs. Values greater than 0 appear when compiling regex sets, and
- /// each match instruction gets its own unique value. The value corresponds
- /// to the Nth regex in the set.
- Match(usize),
- /// Save causes the program to save the current location of the input in
- /// the slot indicated by InstSave.
- Save(InstSave),
- /// Split causes the program to diverge to one of two paths in the
- /// program, preferring goto1 in InstSplit.
- Split(InstSplit),
- /// EmptyLook represents a zero-width assertion in a regex program. A
- /// zero-width assertion does not consume any of the input text.
- EmptyLook(InstEmptyLook),
- /// Char requires the regex program to match the character in InstChar at
- /// the current position in the input.
- Char(InstChar),
- /// Ranges requires the regex program to match the character at the current
- /// position in the input with one of the ranges specified in InstRanges.
- Ranges(InstRanges),
- /// Bytes is like Ranges, except it expresses a single byte range. It is
- /// used in conjunction with Split instructions to implement multi-byte
- /// character classes.
- Bytes(InstBytes),
-}
-
-impl Inst {
- /// Returns true if and only if this is a match instruction.
- pub fn is_match(&self) -> bool {
- match *self {
- Inst::Match(_) => true,
- _ => false,
- }
- }
-}
-
-/// Representation of the Save instruction.
-#[derive(Clone, Debug)]
-pub struct InstSave {
- /// The next location to execute in the program.
- pub goto: InstPtr,
- /// The capture slot (there are two slots for every capture in a regex,
- /// including the zeroth capture for the entire match).
- pub slot: usize,
-}
-
-/// Representation of the Split instruction.
-#[derive(Clone, Debug)]
-pub struct InstSplit {
- /// The first instruction to try. A match resulting from following goto1
- /// has precedence over a match resulting from following goto2.
- pub goto1: InstPtr,
- /// The second instruction to try. A match resulting from following goto1
- /// has precedence over a match resulting from following goto2.
- pub goto2: InstPtr,
-}
-
-/// Representation of the `EmptyLook` instruction.
-#[derive(Clone, Debug)]
-pub struct InstEmptyLook {
- /// The next location to execute in the program if this instruction
- /// succeeds.
- pub goto: InstPtr,
- /// The type of zero-width assertion to check.
- pub look: EmptyLook,
-}
-
-/// The set of zero-width match instructions.
-#[derive(Clone, Copy, Debug, PartialEq, Eq)]
-pub enum EmptyLook {
- /// Start of line or input.
- StartLine,
- /// End of line or input.
- EndLine,
- /// Start of input.
- StartText,
- /// End of input.
- EndText,
- /// Word character on one side and non-word character on other.
- WordBoundary,
- /// Word character on both sides or non-word character on both sides.
- NotWordBoundary,
- /// ASCII word boundary.
- WordBoundaryAscii,
- /// Not ASCII word boundary.
- NotWordBoundaryAscii,
-}
-
-/// Representation of the Char instruction.
-#[derive(Clone, Debug)]
-pub struct InstChar {
- /// The next location to execute in the program if this instruction
- /// succeeds.
- pub goto: InstPtr,
- /// The character to test.
- pub c: char,
-}
-
-/// Representation of the Ranges instruction.
-#[derive(Clone, Debug)]
-pub struct InstRanges {
- /// The next location to execute in the program if this instruction
- /// succeeds.
- pub goto: InstPtr,
- /// The set of Unicode scalar value ranges to test.
- pub ranges: Box<[(char, char)]>,
-}
-
-impl InstRanges {
- /// Tests whether the given input character matches this instruction.
- pub fn matches(&self, c: Char) -> bool {
- // This speeds up the `match_class_unicode` benchmark by checking
- // some common cases quickly without binary search. e.g., Matching
- // a Unicode class on predominantly ASCII text.
- for r in self.ranges.iter().take(4) {
- if c < r.0 {
- return false;
- }
- if c <= r.1 {
- return true;
- }
- }
- self.ranges
- .binary_search_by(|r| {
- if r.1 < c {
- Ordering::Less
- } else if r.0 > c {
- Ordering::Greater
- } else {
- Ordering::Equal
- }
- })
- .is_ok()
- }
-
- /// Return the number of distinct characters represented by all of the
- /// ranges.
- pub fn num_chars(&self) -> usize {
- self.ranges
- .iter()
- .map(|&(s, e)| 1 + (e as u32) - (s as u32))
- .sum::<u32>() as usize
- }
-}
-
-/// Representation of the Bytes instruction.
-#[derive(Clone, Debug)]
-pub struct InstBytes {
- /// The next location to execute in the program if this instruction
- /// succeeds.
- pub goto: InstPtr,
- /// The start (inclusive) of this byte range.
- pub start: u8,
- /// The end (inclusive) of this byte range.
- pub end: u8,
-}
-
-impl InstBytes {
- /// Returns true if and only if the given byte is in this range.
- pub fn matches(&self, byte: u8) -> bool {
- self.start <= byte && byte <= self.end
- }
-}
-
-#[cfg(test)]
-mod test {
- #[test]
- #[cfg(target_pointer_width = "64")]
- fn test_size_of_inst() {
- use std::mem::size_of;
-
- use super::Inst;
-
- assert_eq!(32, size_of::<Inst>());
- }
-}
diff --git a/vendor/regex/src/re_builder.rs b/vendor/regex/src/re_builder.rs
deleted file mode 100644
index ee6383690..000000000
--- a/vendor/regex/src/re_builder.rs
+++ /dev/null
@@ -1,421 +0,0 @@
-/// The set of user configurable options for compiling zero or more regexes.
-#[derive(Clone, Debug)]
-#[allow(missing_docs)]
-pub struct RegexOptions {
- pub pats: Vec<String>,
- pub size_limit: usize,
- pub dfa_size_limit: usize,
- pub nest_limit: u32,
- pub case_insensitive: bool,
- pub multi_line: bool,
- pub dot_matches_new_line: bool,
- pub swap_greed: bool,
- pub ignore_whitespace: bool,
- pub unicode: bool,
- pub octal: bool,
-}
-
-impl Default for RegexOptions {
- fn default() -> Self {
- RegexOptions {
- pats: vec![],
- size_limit: 10 * (1 << 20),
- dfa_size_limit: 2 * (1 << 20),
- nest_limit: 250,
- case_insensitive: false,
- multi_line: false,
- dot_matches_new_line: false,
- swap_greed: false,
- ignore_whitespace: false,
- unicode: true,
- octal: false,
- }
- }
-}
-
-macro_rules! define_builder {
- ($name:ident, $regex_mod:ident, $only_utf8:expr) => {
- pub mod $name {
- use super::RegexOptions;
- use crate::error::Error;
- use crate::exec::ExecBuilder;
-
- use crate::$regex_mod::Regex;
-
- /// A configurable builder for a regular expression.
- ///
- /// A builder can be used to configure how the regex is built, for example, by
- /// setting the default flags (which can be overridden in the expression
- /// itself) or setting various limits.
- #[derive(Debug)]
- pub struct RegexBuilder(RegexOptions);
-
- impl RegexBuilder {
- /// Create a new regular expression builder with the given pattern.
- ///
- /// If the pattern is invalid, then an error will be returned when
- /// `build` is called.
- pub fn new(pattern: &str) -> RegexBuilder {
- let mut builder = RegexBuilder(RegexOptions::default());
- builder.0.pats.push(pattern.to_owned());
- builder
- }
-
- /// Consume the builder and compile the regular expression.
- ///
- /// Note that calling `as_str` on the resulting `Regex` will produce the
- /// pattern given to `new` verbatim. Notably, it will not incorporate any
- /// of the flags set on this builder.
- pub fn build(&self) -> Result<Regex, Error> {
- ExecBuilder::new_options(self.0.clone())
- .only_utf8($only_utf8)
- .build()
- .map(Regex::from)
- }
-
- /// Set the value for the case insensitive (`i`) flag.
- ///
- /// When enabled, letters in the pattern will match both upper case and
- /// lower case variants.
- pub fn case_insensitive(
- &mut self,
- yes: bool,
- ) -> &mut RegexBuilder {
- self.0.case_insensitive = yes;
- self
- }
-
- /// Set the value for the multi-line matching (`m`) flag.
- ///
- /// When enabled, `^` matches the beginning of lines and `$` matches the
- /// end of lines.
- ///
- /// By default, they match beginning/end of the input.
- pub fn multi_line(&mut self, yes: bool) -> &mut RegexBuilder {
- self.0.multi_line = yes;
- self
- }
-
- /// Set the value for the any character (`s`) flag, where in `.` matches
- /// anything when `s` is set and matches anything except for new line when
- /// it is not set (the default).
- ///
- /// N.B. "matches anything" means "any byte" when Unicode is disabled and
- /// means "any valid UTF-8 encoding of any Unicode scalar value" when
- /// Unicode is enabled.
- pub fn dot_matches_new_line(
- &mut self,
- yes: bool,
- ) -> &mut RegexBuilder {
- self.0.dot_matches_new_line = yes;
- self
- }
-
- /// Set the value for the greedy swap (`U`) flag.
- ///
- /// When enabled, a pattern like `a*` is lazy (tries to find shortest
- /// match) and `a*?` is greedy (tries to find longest match).
- ///
- /// By default, `a*` is greedy and `a*?` is lazy.
- pub fn swap_greed(&mut self, yes: bool) -> &mut RegexBuilder {
- self.0.swap_greed = yes;
- self
- }
-
- /// Set the value for the ignore whitespace (`x`) flag.
- ///
- /// When enabled, whitespace such as new lines and spaces will be ignored
- /// between expressions of the pattern, and `#` can be used to start a
- /// comment until the next new line.
- pub fn ignore_whitespace(
- &mut self,
- yes: bool,
- ) -> &mut RegexBuilder {
- self.0.ignore_whitespace = yes;
- self
- }
-
- /// Set the value for the Unicode (`u`) flag.
- ///
- /// Enabled by default. When disabled, character classes such as `\w` only
- /// match ASCII word characters instead of all Unicode word characters.
- pub fn unicode(&mut self, yes: bool) -> &mut RegexBuilder {
- self.0.unicode = yes;
- self
- }
-
- /// Whether to support octal syntax or not.
- ///
- /// Octal syntax is a little-known way of uttering Unicode codepoints in
- /// a regular expression. For example, `a`, `\x61`, `\u0061` and
- /// `\141` are all equivalent regular expressions, where the last example
- /// shows octal syntax.
- ///
- /// While supporting octal syntax isn't in and of itself a problem, it does
- /// make good error messages harder. That is, in PCRE based regex engines,
- /// syntax like `\0` invokes a backreference, which is explicitly
- /// unsupported in Rust's regex engine. However, many users expect it to
- /// be supported. Therefore, when octal support is disabled, the error
- /// message will explicitly mention that backreferences aren't supported.
- ///
- /// Octal syntax is disabled by default.
- pub fn octal(&mut self, yes: bool) -> &mut RegexBuilder {
- self.0.octal = yes;
- self
- }
-
- /// Set the approximate size limit of the compiled regular expression.
- ///
- /// This roughly corresponds to the number of bytes occupied by a single
- /// compiled program. If the program exceeds this number, then a
- /// compilation error is returned.
- pub fn size_limit(
- &mut self,
- limit: usize,
- ) -> &mut RegexBuilder {
- self.0.size_limit = limit;
- self
- }
-
- /// Set the approximate size of the cache used by the DFA.
- ///
- /// This roughly corresponds to the number of bytes that the DFA will
- /// use while searching.
- ///
- /// Note that this is a *per thread* limit. There is no way to set a global
- /// limit. In particular, if a regex is used from multiple threads
- /// simultaneously, then each thread may use up to the number of bytes
- /// specified here.
- pub fn dfa_size_limit(
- &mut self,
- limit: usize,
- ) -> &mut RegexBuilder {
- self.0.dfa_size_limit = limit;
- self
- }
-
- /// Set the nesting limit for this parser.
- ///
- /// The nesting limit controls how deep the abstract syntax tree is allowed
- /// to be. If the AST exceeds the given limit (e.g., with too many nested
- /// groups), then an error is returned by the parser.
- ///
- /// The purpose of this limit is to act as a heuristic to prevent stack
- /// overflow for consumers that do structural induction on an `Ast` using
- /// explicit recursion. While this crate never does this (instead using
- /// constant stack space and moving the call stack to the heap), other
- /// crates may.
- ///
- /// This limit is not checked until the entire Ast is parsed. Therefore,
- /// if callers want to put a limit on the amount of heap space used, then
- /// they should impose a limit on the length, in bytes, of the concrete
- /// pattern string. In particular, this is viable since this parser
- /// implementation will limit itself to heap space proportional to the
- /// length of the pattern string.
- ///
- /// Note that a nest limit of `0` will return a nest limit error for most
- /// patterns but not all. For example, a nest limit of `0` permits `a` but
- /// not `ab`, since `ab` requires a concatenation, which results in a nest
- /// depth of `1`. In general, a nest limit is not something that manifests
- /// in an obvious way in the concrete syntax, therefore, it should not be
- /// used in a granular way.
- pub fn nest_limit(&mut self, limit: u32) -> &mut RegexBuilder {
- self.0.nest_limit = limit;
- self
- }
- }
- }
- };
-}
-
-define_builder!(bytes, re_bytes, false);
-define_builder!(unicode, re_unicode, true);
-
-macro_rules! define_set_builder {
- ($name:ident, $regex_mod:ident, $only_utf8:expr) => {
- pub mod $name {
- use super::RegexOptions;
- use crate::error::Error;
- use crate::exec::ExecBuilder;
-
- use crate::re_set::$regex_mod::RegexSet;
-
- /// A configurable builder for a set of regular expressions.
- ///
- /// A builder can be used to configure how the regexes are built, for example,
- /// by setting the default flags (which can be overridden in the expression
- /// itself) or setting various limits.
- #[derive(Debug)]
- pub struct RegexSetBuilder(RegexOptions);
-
- impl RegexSetBuilder {
- /// Create a new regular expression builder with the given pattern.
- ///
- /// If the pattern is invalid, then an error will be returned when
- /// `build` is called.
- pub fn new<I, S>(patterns: I) -> RegexSetBuilder
- where
- S: AsRef<str>,
- I: IntoIterator<Item = S>,
- {
- let mut builder = RegexSetBuilder(RegexOptions::default());
- for pat in patterns {
- builder.0.pats.push(pat.as_ref().to_owned());
- }
- builder
- }
-
- /// Consume the builder and compile the regular expressions into a set.
- pub fn build(&self) -> Result<RegexSet, Error> {
- ExecBuilder::new_options(self.0.clone())
- .only_utf8($only_utf8)
- .build()
- .map(RegexSet::from)
- }
-
- /// Set the value for the case insensitive (`i`) flag.
- pub fn case_insensitive(
- &mut self,
- yes: bool,
- ) -> &mut RegexSetBuilder {
- self.0.case_insensitive = yes;
- self
- }
-
- /// Set the value for the multi-line matching (`m`) flag.
- pub fn multi_line(
- &mut self,
- yes: bool,
- ) -> &mut RegexSetBuilder {
- self.0.multi_line = yes;
- self
- }
-
- /// Set the value for the any character (`s`) flag, where in `.` matches
- /// anything when `s` is set and matches anything except for new line when
- /// it is not set (the default).
- ///
- /// N.B. "matches anything" means "any byte" for `regex::bytes::RegexSet`
- /// expressions and means "any Unicode scalar value" for `regex::RegexSet`
- /// expressions.
- pub fn dot_matches_new_line(
- &mut self,
- yes: bool,
- ) -> &mut RegexSetBuilder {
- self.0.dot_matches_new_line = yes;
- self
- }
-
- /// Set the value for the greedy swap (`U`) flag.
- pub fn swap_greed(
- &mut self,
- yes: bool,
- ) -> &mut RegexSetBuilder {
- self.0.swap_greed = yes;
- self
- }
-
- /// Set the value for the ignore whitespace (`x`) flag.
- pub fn ignore_whitespace(
- &mut self,
- yes: bool,
- ) -> &mut RegexSetBuilder {
- self.0.ignore_whitespace = yes;
- self
- }
-
- /// Set the value for the Unicode (`u`) flag.
- pub fn unicode(&mut self, yes: bool) -> &mut RegexSetBuilder {
- self.0.unicode = yes;
- self
- }
-
- /// Whether to support octal syntax or not.
- ///
- /// Octal syntax is a little-known way of uttering Unicode codepoints in
- /// a regular expression. For example, `a`, `\x61`, `\u0061` and
- /// `\141` are all equivalent regular expressions, where the last example
- /// shows octal syntax.
- ///
- /// While supporting octal syntax isn't in and of itself a problem, it does
- /// make good error messages harder. That is, in PCRE based regex engines,
- /// syntax like `\0` invokes a backreference, which is explicitly
- /// unsupported in Rust's regex engine. However, many users expect it to
- /// be supported. Therefore, when octal support is disabled, the error
- /// message will explicitly mention that backreferences aren't supported.
- ///
- /// Octal syntax is disabled by default.
- pub fn octal(&mut self, yes: bool) -> &mut RegexSetBuilder {
- self.0.octal = yes;
- self
- }
-
- /// Set the approximate size limit of the compiled regular expression.
- ///
- /// This roughly corresponds to the number of bytes occupied by a single
- /// compiled program. If the program exceeds this number, then a
- /// compilation error is returned.
- pub fn size_limit(
- &mut self,
- limit: usize,
- ) -> &mut RegexSetBuilder {
- self.0.size_limit = limit;
- self
- }
-
- /// Set the approximate size of the cache used by the DFA.
- ///
- /// This roughly corresponds to the number of bytes that the DFA will
- /// use while searching.
- ///
- /// Note that this is a *per thread* limit. There is no way to set a global
- /// limit. In particular, if a regex is used from multiple threads
- /// simultaneously, then each thread may use up to the number of bytes
- /// specified here.
- pub fn dfa_size_limit(
- &mut self,
- limit: usize,
- ) -> &mut RegexSetBuilder {
- self.0.dfa_size_limit = limit;
- self
- }
-
- /// Set the nesting limit for this parser.
- ///
- /// The nesting limit controls how deep the abstract syntax tree is allowed
- /// to be. If the AST exceeds the given limit (e.g., with too many nested
- /// groups), then an error is returned by the parser.
- ///
- /// The purpose of this limit is to act as a heuristic to prevent stack
- /// overflow for consumers that do structural induction on an `Ast` using
- /// explicit recursion. While this crate never does this (instead using
- /// constant stack space and moving the call stack to the heap), other
- /// crates may.
- ///
- /// This limit is not checked until the entire Ast is parsed. Therefore,
- /// if callers want to put a limit on the amount of heap space used, then
- /// they should impose a limit on the length, in bytes, of the concrete
- /// pattern string. In particular, this is viable since this parser
- /// implementation will limit itself to heap space proportional to the
- /// length of the pattern string.
- ///
- /// Note that a nest limit of `0` will return a nest limit error for most
- /// patterns but not all. For example, a nest limit of `0` permits `a` but
- /// not `ab`, since `ab` requires a concatenation, which results in a nest
- /// depth of `1`. In general, a nest limit is not something that manifests
- /// in an obvious way in the concrete syntax, therefore, it should not be
- /// used in a granular way.
- pub fn nest_limit(
- &mut self,
- limit: u32,
- ) -> &mut RegexSetBuilder {
- self.0.nest_limit = limit;
- self
- }
- }
- }
- };
-}
-
-define_set_builder!(set_bytes, bytes, false);
-define_set_builder!(set_unicode, unicode, true);
diff --git a/vendor/regex/src/re_bytes.rs b/vendor/regex/src/re_bytes.rs
deleted file mode 100644
index e3a3b019b..000000000
--- a/vendor/regex/src/re_bytes.rs
+++ /dev/null
@@ -1,1372 +0,0 @@
-use std::borrow::Cow;
-use std::collections::HashMap;
-use std::fmt;
-use std::iter::FusedIterator;
-use std::ops::{Index, Range};
-use std::str::FromStr;
-use std::sync::Arc;
-
-use crate::find_byte::find_byte;
-
-use crate::error::Error;
-use crate::exec::{Exec, ExecNoSync};
-use crate::expand::expand_bytes;
-use crate::re_builder::bytes::RegexBuilder;
-use crate::re_trait::{self, RegularExpression, SubCapturesPosIter};
-
-/// Match represents a single match of a regex in a haystack.
-///
-/// The lifetime parameter `'t` refers to the lifetime of the matched text.
-#[derive(Copy, Clone, Eq, PartialEq)]
-pub struct Match<'t> {
- text: &'t [u8],
- start: usize,
- end: usize,
-}
-
-impl<'t> Match<'t> {
- /// Returns the starting byte offset of the match in the haystack.
- #[inline]
- pub fn start(&self) -> usize {
- self.start
- }
-
- /// Returns the ending byte offset of the match in the haystack.
- #[inline]
- pub fn end(&self) -> usize {
- self.end
- }
-
- /// Returns true if and only if this match has a length of zero.
- #[inline]
- pub fn is_empty(&self) -> bool {
- self.start == self.end
- }
-
- /// Returns the length, in bytes, of this match.
- #[inline]
- pub fn len(&self) -> usize {
- self.end - self.start
- }
-
- /// Returns the range over the starting and ending byte offsets of the
- /// match in the haystack.
- #[inline]
- pub fn range(&self) -> Range<usize> {
- self.start..self.end
- }
-
- /// Returns the matched text.
- #[inline]
- pub fn as_bytes(&self) -> &'t [u8] {
- &self.text[self.range()]
- }
-
- /// Creates a new match from the given haystack and byte offsets.
- #[inline]
- fn new(haystack: &'t [u8], start: usize, end: usize) -> Match<'t> {
- Match { text: haystack, start, end }
- }
-}
-
-impl<'t> std::fmt::Debug for Match<'t> {
- fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
- let mut fmt = f.debug_struct("Match");
- fmt.field("start", &self.start).field("end", &self.end);
- if let Ok(s) = std::str::from_utf8(self.as_bytes()) {
- fmt.field("bytes", &s);
- } else {
- // FIXME: It would be nice if this could be printed as a string
- // with invalid UTF-8 replaced with hex escapes. A alloc would
- // probably okay if that makes it easier, but regex-automata does
- // (at time of writing) have internal routines that do this. So
- // maybe we should expose them.
- fmt.field("bytes", &self.as_bytes());
- }
- fmt.finish()
- }
-}
-
-impl<'t> From<Match<'t>> for Range<usize> {
- fn from(m: Match<'t>) -> Range<usize> {
- m.range()
- }
-}
-
-/// A compiled regular expression for matching arbitrary bytes.
-///
-/// It can be used to search, split or replace text. All searching is done with
-/// an implicit `.*?` at the beginning and end of an expression. To force an
-/// expression to match the whole string (or a prefix or a suffix), you must
-/// use an anchor like `^` or `$` (or `\A` and `\z`).
-///
-/// Like the `Regex` type in the parent module, matches with this regex return
-/// byte offsets into the search text. **Unlike** the parent `Regex` type,
-/// these byte offsets may not correspond to UTF-8 sequence boundaries since
-/// the regexes in this module can match arbitrary bytes.
-#[derive(Clone)]
-pub struct Regex(Exec);
-
-impl fmt::Display for Regex {
- /// Shows the original regular expression.
- fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
- write!(f, "{}", self.as_str())
- }
-}
-
-impl fmt::Debug for Regex {
- /// Shows the original regular expression.
- fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
- fmt::Display::fmt(self, f)
- }
-}
-
-/// A constructor for Regex from an Exec.
-///
-/// This is hidden because Exec isn't actually part of the public API.
-#[doc(hidden)]
-impl From<Exec> for Regex {
- fn from(exec: Exec) -> Regex {
- Regex(exec)
- }
-}
-
-impl FromStr for Regex {
- type Err = Error;
-
- /// Attempts to parse a string into a regular expression
- fn from_str(s: &str) -> Result<Regex, Error> {
- Regex::new(s)
- }
-}
-
-/// Core regular expression methods.
-impl Regex {
- /// Compiles a regular expression. Once compiled, it can be used repeatedly
- /// to search, split or replace text in a string.
- ///
- /// If an invalid expression is given, then an error is returned.
- pub fn new(re: &str) -> Result<Regex, Error> {
- RegexBuilder::new(re).build()
- }
-
- /// Returns true if and only if there is a match for the regex in the
- /// string given.
- ///
- /// It is recommended to use this method if all you need to do is test
- /// a match, since the underlying matching engine may be able to do less
- /// work.
- ///
- /// # Example
- ///
- /// Test if some text contains at least one word with exactly 13 ASCII word
- /// bytes:
- ///
- /// ```rust
- /// # use regex::bytes::Regex;
- /// # fn main() {
- /// let text = b"I categorically deny having triskaidekaphobia.";
- /// assert!(Regex::new(r"\b\w{13}\b").unwrap().is_match(text));
- /// # }
- /// ```
- pub fn is_match(&self, text: &[u8]) -> bool {
- self.is_match_at(text, 0)
- }
-
- /// Returns the start and end byte range of the leftmost-first match in
- /// `text`. If no match exists, then `None` is returned.
- ///
- /// Note that this should only be used if you want to discover the position
- /// of the match. Testing the existence of a match is faster if you use
- /// `is_match`.
- ///
- /// # Example
- ///
- /// Find the start and end location of the first word with exactly 13
- /// ASCII word bytes:
- ///
- /// ```rust
- /// # use regex::bytes::Regex;
- /// # fn main() {
- /// let text = b"I categorically deny having triskaidekaphobia.";
- /// let mat = Regex::new(r"\b\w{13}\b").unwrap().find(text).unwrap();
- /// assert_eq!((mat.start(), mat.end()), (2, 15));
- /// # }
- /// ```
- pub fn find<'t>(&self, text: &'t [u8]) -> Option<Match<'t>> {
- self.find_at(text, 0)
- }
-
- /// Returns an iterator for each successive non-overlapping match in
- /// `text`, returning the start and end byte indices with respect to
- /// `text`.
- ///
- /// # Example
- ///
- /// Find the start and end location of every word with exactly 13 ASCII
- /// word bytes:
- ///
- /// ```rust
- /// # use regex::bytes::Regex;
- /// # fn main() {
- /// let text = b"Retroactively relinquishing remunerations is reprehensible.";
- /// for mat in Regex::new(r"\b\w{13}\b").unwrap().find_iter(text) {
- /// println!("{:?}", mat);
- /// }
- /// # }
- /// ```
- pub fn find_iter<'r, 't>(&'r self, text: &'t [u8]) -> Matches<'r, 't> {
- Matches(self.0.searcher().find_iter(text))
- }
-
- /// Returns the capture groups corresponding to the leftmost-first
- /// match in `text`. Capture group `0` always corresponds to the entire
- /// match. If no match is found, then `None` is returned.
- ///
- /// You should only use `captures` if you need access to the location of
- /// capturing group matches. Otherwise, `find` is faster for discovering
- /// the location of the overall match.
- ///
- /// # Examples
- ///
- /// Say you have some text with movie names and their release years,
- /// like "'Citizen Kane' (1941)". It'd be nice if we could search for text
- /// looking like that, while also extracting the movie name and its release
- /// year separately.
- ///
- /// ```rust
- /// # use regex::bytes::Regex;
- /// # fn main() {
- /// let re = Regex::new(r"'([^']+)'\s+\((\d{4})\)").unwrap();
- /// let text = b"Not my favorite movie: 'Citizen Kane' (1941).";
- /// let caps = re.captures(text).unwrap();
- /// assert_eq!(caps.get(1).unwrap().as_bytes(), &b"Citizen Kane"[..]);
- /// assert_eq!(caps.get(2).unwrap().as_bytes(), &b"1941"[..]);
- /// assert_eq!(caps.get(0).unwrap().as_bytes(), &b"'Citizen Kane' (1941)"[..]);
- /// // You can also access the groups by index using the Index notation.
- /// // Note that this will panic on an invalid index.
- /// assert_eq!(&caps[1], b"Citizen Kane");
- /// assert_eq!(&caps[2], b"1941");
- /// assert_eq!(&caps[0], b"'Citizen Kane' (1941)");
- /// # }
- /// ```
- ///
- /// Note that the full match is at capture group `0`. Each subsequent
- /// capture group is indexed by the order of its opening `(`.
- ///
- /// We can make this example a bit clearer by using *named* capture groups:
- ///
- /// ```rust
- /// # use regex::bytes::Regex;
- /// # fn main() {
- /// let re = Regex::new(r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)")
- /// .unwrap();
- /// let text = b"Not my favorite movie: 'Citizen Kane' (1941).";
- /// let caps = re.captures(text).unwrap();
- /// assert_eq!(caps.name("title").unwrap().as_bytes(), b"Citizen Kane");
- /// assert_eq!(caps.name("year").unwrap().as_bytes(), b"1941");
- /// assert_eq!(caps.get(0).unwrap().as_bytes(), &b"'Citizen Kane' (1941)"[..]);
- /// // You can also access the groups by name using the Index notation.
- /// // Note that this will panic on an invalid group name.
- /// assert_eq!(&caps["title"], b"Citizen Kane");
- /// assert_eq!(&caps["year"], b"1941");
- /// assert_eq!(&caps[0], b"'Citizen Kane' (1941)");
- ///
- /// # }
- /// ```
- ///
- /// Here we name the capture groups, which we can access with the `name`
- /// method or the `Index` notation with a `&str`. Note that the named
- /// capture groups are still accessible with `get` or the `Index` notation
- /// with a `usize`.
- ///
- /// The `0`th capture group is always unnamed, so it must always be
- /// accessed with `get(0)` or `[0]`.
- pub fn captures<'t>(&self, text: &'t [u8]) -> Option<Captures<'t>> {
- self.captures_at(text, 0)
- }
-
- /// Returns an iterator over all the non-overlapping capture groups matched
- /// in `text`. This is operationally the same as `find_iter`, except it
- /// yields information about capturing group matches.
- ///
- /// # Example
- ///
- /// We can use this to find all movie titles and their release years in
- /// some text, where the movie is formatted like "'Title' (xxxx)":
- ///
- /// ```rust
- /// # use std::str; use regex::bytes::Regex;
- /// # fn main() {
- /// let re = Regex::new(r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)")
- /// .unwrap();
- /// let text = b"'Citizen Kane' (1941), 'The Wizard of Oz' (1939), 'M' (1931).";
- /// for caps in re.captures_iter(text) {
- /// let title = str::from_utf8(&caps["title"]).unwrap();
- /// let year = str::from_utf8(&caps["year"]).unwrap();
- /// println!("Movie: {:?}, Released: {:?}", title, year);
- /// }
- /// // Output:
- /// // Movie: Citizen Kane, Released: 1941
- /// // Movie: The Wizard of Oz, Released: 1939
- /// // Movie: M, Released: 1931
- /// # }
- /// ```
- pub fn captures_iter<'r, 't>(
- &'r self,
- text: &'t [u8],
- ) -> CaptureMatches<'r, 't> {
- CaptureMatches(self.0.searcher().captures_iter(text))
- }
-
- /// Returns an iterator of substrings of `text` delimited by a match of the
- /// regular expression. Namely, each element of the iterator corresponds to
- /// text that *isn't* matched by the regular expression.
- ///
- /// This method will *not* copy the text given.
- ///
- /// # Example
- ///
- /// To split a string delimited by arbitrary amounts of spaces or tabs:
- ///
- /// ```rust
- /// # use regex::bytes::Regex;
- /// # fn main() {
- /// let re = Regex::new(r"[ \t]+").unwrap();
- /// let fields: Vec<&[u8]> = re.split(b"a b \t c\td e").collect();
- /// assert_eq!(fields, vec![
- /// &b"a"[..], &b"b"[..], &b"c"[..], &b"d"[..], &b"e"[..],
- /// ]);
- /// # }
- /// ```
- pub fn split<'r, 't>(&'r self, text: &'t [u8]) -> Split<'r, 't> {
- Split { finder: self.find_iter(text), last: 0 }
- }
-
- /// Returns an iterator of at most `limit` substrings of `text` delimited
- /// by a match of the regular expression. (A `limit` of `0` will return no
- /// substrings.) Namely, each element of the iterator corresponds to text
- /// that *isn't* matched by the regular expression. The remainder of the
- /// string that is not split will be the last element in the iterator.
- ///
- /// This method will *not* copy the text given.
- ///
- /// # Example
- ///
- /// Get the first two words in some text:
- ///
- /// ```rust
- /// # use regex::bytes::Regex;
- /// # fn main() {
- /// let re = Regex::new(r"\W+").unwrap();
- /// let fields: Vec<&[u8]> = re.splitn(b"Hey! How are you?", 3).collect();
- /// assert_eq!(fields, vec![&b"Hey"[..], &b"How"[..], &b"are you?"[..]]);
- /// # }
- /// ```
- pub fn splitn<'r, 't>(
- &'r self,
- text: &'t [u8],
- limit: usize,
- ) -> SplitN<'r, 't> {
- SplitN { splits: self.split(text), n: limit }
- }
-
- /// Replaces the leftmost-first match with the replacement provided. The
- /// replacement can be a regular byte string (where `$N` and `$name` are
- /// expanded to match capture groups) or a function that takes the matches'
- /// `Captures` and returns the replaced byte string.
- ///
- /// If no match is found, then a copy of the byte string is returned
- /// unchanged.
- ///
- /// # Replacement string syntax
- ///
- /// All instances of `$name` in the replacement text is replaced with the
- /// corresponding capture group `name`.
- ///
- /// `name` may be an integer corresponding to the index of the
- /// capture group (counted by order of opening parenthesis where `0` is the
- /// entire match) or it can be a name (consisting of letters, digits or
- /// underscores) corresponding to a named capture group.
- ///
- /// If `name` isn't a valid capture group (whether the name doesn't exist
- /// or isn't a valid index), then it is replaced with the empty string.
- ///
- /// The longest possible name is used. e.g., `$1a` looks up the capture
- /// group named `1a` and not the capture group at index `1`. To exert more
- /// precise control over the name, use braces, e.g., `${1}a`.
- ///
- /// To write a literal `$` use `$$`.
- ///
- /// # Examples
- ///
- /// Note that this function is polymorphic with respect to the replacement.
- /// In typical usage, this can just be a normal byte string:
- ///
- /// ```rust
- /// # use regex::bytes::Regex;
- /// # fn main() {
- /// let re = Regex::new("[^01]+").unwrap();
- /// assert_eq!(re.replace(b"1078910", &b""[..]), &b"1010"[..]);
- /// # }
- /// ```
- ///
- /// But anything satisfying the `Replacer` trait will work. For example, a
- /// closure of type `|&Captures| -> Vec<u8>` provides direct access to the
- /// captures corresponding to a match. This allows one to access capturing
- /// group matches easily:
- ///
- /// ```rust
- /// # use regex::bytes::Regex;
- /// # use regex::bytes::Captures; fn main() {
- /// let re = Regex::new(r"([^,\s]+),\s+(\S+)").unwrap();
- /// let result = re.replace(b"Springsteen, Bruce", |caps: &Captures| {
- /// let mut replacement = caps[2].to_owned();
- /// replacement.push(b' ');
- /// replacement.extend(&caps[1]);
- /// replacement
- /// });
- /// assert_eq!(result, &b"Bruce Springsteen"[..]);
- /// # }
- /// ```
- ///
- /// But this is a bit cumbersome to use all the time. Instead, a simple
- /// syntax is supported that expands `$name` into the corresponding capture
- /// group. Here's the last example, but using this expansion technique
- /// with named capture groups:
- ///
- /// ```rust
- /// # use regex::bytes::Regex;
- /// # fn main() {
- /// let re = Regex::new(r"(?P<last>[^,\s]+),\s+(?P<first>\S+)").unwrap();
- /// let result = re.replace(b"Springsteen, Bruce", &b"$first $last"[..]);
- /// assert_eq!(result, &b"Bruce Springsteen"[..]);
- /// # }
- /// ```
- ///
- /// Note that using `$2` instead of `$first` or `$1` instead of `$last`
- /// would produce the same result. To write a literal `$` use `$$`.
- ///
- /// Sometimes the replacement string requires use of curly braces to
- /// delineate a capture group replacement and surrounding literal text.
- /// For example, if we wanted to join two words together with an
- /// underscore:
- ///
- /// ```rust
- /// # use regex::bytes::Regex;
- /// # fn main() {
- /// let re = Regex::new(r"(?P<first>\w+)\s+(?P<second>\w+)").unwrap();
- /// let result = re.replace(b"deep fried", &b"${first}_$second"[..]);
- /// assert_eq!(result, &b"deep_fried"[..]);
- /// # }
- /// ```
- ///
- /// Without the curly braces, the capture group name `first_` would be
- /// used, and since it doesn't exist, it would be replaced with the empty
- /// string.
- ///
- /// Finally, sometimes you just want to replace a literal string with no
- /// regard for capturing group expansion. This can be done by wrapping a
- /// byte string with `NoExpand`:
- ///
- /// ```rust
- /// # use regex::bytes::Regex;
- /// # fn main() {
- /// use regex::bytes::NoExpand;
- ///
- /// let re = Regex::new(r"(?P<last>[^,\s]+),\s+(\S+)").unwrap();
- /// let result = re.replace(b"Springsteen, Bruce", NoExpand(b"$2 $last"));
- /// assert_eq!(result, &b"$2 $last"[..]);
- /// # }
- /// ```
- pub fn replace<'t, R: Replacer>(
- &self,
- text: &'t [u8],
- rep: R,
- ) -> Cow<'t, [u8]> {
- self.replacen(text, 1, rep)
- }
-
- /// Replaces all non-overlapping matches in `text` with the replacement
- /// provided. This is the same as calling `replacen` with `limit` set to
- /// `0`.
- ///
- /// See the documentation for `replace` for details on how to access
- /// capturing group matches in the replacement text.
- pub fn replace_all<'t, R: Replacer>(
- &self,
- text: &'t [u8],
- rep: R,
- ) -> Cow<'t, [u8]> {
- self.replacen(text, 0, rep)
- }
-
- /// Replaces at most `limit` non-overlapping matches in `text` with the
- /// replacement provided. If `limit` is 0, then all non-overlapping matches
- /// are replaced.
- ///
- /// See the documentation for `replace` for details on how to access
- /// capturing group matches in the replacement text.
- pub fn replacen<'t, R: Replacer>(
- &self,
- text: &'t [u8],
- limit: usize,
- mut rep: R,
- ) -> Cow<'t, [u8]> {
- if let Some(rep) = rep.no_expansion() {
- let mut it = self.find_iter(text).enumerate().peekable();
- if it.peek().is_none() {
- return Cow::Borrowed(text);
- }
- let mut new = Vec::with_capacity(text.len());
- let mut last_match = 0;
- for (i, m) in it {
- new.extend_from_slice(&text[last_match..m.start()]);
- new.extend_from_slice(&rep);
- last_match = m.end();
- if limit > 0 && i >= limit - 1 {
- break;
- }
- }
- new.extend_from_slice(&text[last_match..]);
- return Cow::Owned(new);
- }
-
- // The slower path, which we use if the replacement needs access to
- // capture groups.
- let mut it = self.captures_iter(text).enumerate().peekable();
- if it.peek().is_none() {
- return Cow::Borrowed(text);
- }
- let mut new = Vec::with_capacity(text.len());
- let mut last_match = 0;
- for (i, cap) in it {
- // unwrap on 0 is OK because captures only reports matches
- let m = cap.get(0).unwrap();
- new.extend_from_slice(&text[last_match..m.start()]);
- rep.replace_append(&cap, &mut new);
- last_match = m.end();
- if limit > 0 && i >= limit - 1 {
- break;
- }
- }
- new.extend_from_slice(&text[last_match..]);
- Cow::Owned(new)
- }
-}
-
-/// Advanced or "lower level" search methods.
-impl Regex {
- /// Returns the end location of a match in the text given.
- ///
- /// This method may have the same performance characteristics as
- /// `is_match`, except it provides an end location for a match. In
- /// particular, the location returned *may be shorter* than the proper end
- /// of the leftmost-first match that you would find via `Regex::find`.
- ///
- /// Note that it is not guaranteed that this routine finds the shortest or
- /// "earliest" possible match. Instead, the main idea of this API is that
- /// it returns the offset at the point at which the internal regex engine
- /// has determined that a match has occurred. This may vary depending on
- /// which internal regex engine is used, and thus, the offset itself may
- /// change.
- ///
- /// # Example
- ///
- /// Typically, `a+` would match the entire first sequence of `a` in some
- /// text, but `shortest_match` can give up as soon as it sees the first
- /// `a`.
- ///
- /// ```rust
- /// # use regex::bytes::Regex;
- /// # fn main() {
- /// let text = b"aaaaa";
- /// let pos = Regex::new(r"a+").unwrap().shortest_match(text);
- /// assert_eq!(pos, Some(1));
- /// # }
- /// ```
- pub fn shortest_match(&self, text: &[u8]) -> Option<usize> {
- self.shortest_match_at(text, 0)
- }
-
- /// Returns the same as shortest_match, but starts the search at the given
- /// offset.
- ///
- /// The significance of the starting point is that it takes the surrounding
- /// context into consideration. For example, the `\A` anchor can only
- /// match when `start == 0`.
- pub fn shortest_match_at(
- &self,
- text: &[u8],
- start: usize,
- ) -> Option<usize> {
- self.0.searcher().shortest_match_at(text, start)
- }
-
- /// Returns the same as is_match, but starts the search at the given
- /// offset.
- ///
- /// The significance of the starting point is that it takes the surrounding
- /// context into consideration. For example, the `\A` anchor can only
- /// match when `start == 0`.
- pub fn is_match_at(&self, text: &[u8], start: usize) -> bool {
- self.0.searcher().is_match_at(text, start)
- }
-
- /// Returns the same as find, but starts the search at the given
- /// offset.
- ///
- /// The significance of the starting point is that it takes the surrounding
- /// context into consideration. For example, the `\A` anchor can only
- /// match when `start == 0`.
- pub fn find_at<'t>(
- &self,
- text: &'t [u8],
- start: usize,
- ) -> Option<Match<'t>> {
- self.0
- .searcher()
- .find_at(text, start)
- .map(|(s, e)| Match::new(text, s, e))
- }
-
- /// Returns the same as [`Regex::captures`], but starts the search at the
- /// given offset.
- ///
- /// The significance of the starting point is that it takes the surrounding
- /// context into consideration. For example, the `\A` anchor can only
- /// match when `start == 0`.
- pub fn captures_at<'t>(
- &self,
- text: &'t [u8],
- start: usize,
- ) -> Option<Captures<'t>> {
- let mut locs = self.capture_locations();
- self.captures_read_at(&mut locs, text, start).map(move |_| Captures {
- text,
- locs: locs.0,
- named_groups: self.0.capture_name_idx().clone(),
- })
- }
-
- /// This is like `captures`, but uses
- /// [`CaptureLocations`](struct.CaptureLocations.html)
- /// instead of
- /// [`Captures`](struct.Captures.html) in order to amortize allocations.
- ///
- /// To create a `CaptureLocations` value, use the
- /// `Regex::capture_locations` method.
- ///
- /// This returns the overall match if this was successful, which is always
- /// equivalence to the `0`th capture group.
- pub fn captures_read<'t>(
- &self,
- locs: &mut CaptureLocations,
- text: &'t [u8],
- ) -> Option<Match<'t>> {
- self.captures_read_at(locs, text, 0)
- }
-
- /// Returns the same as `captures_read`, but starts the search at the given
- /// offset and populates the capture locations given.
- ///
- /// The significance of the starting point is that it takes the surrounding
- /// context into consideration. For example, the `\A` anchor can only
- /// match when `start == 0`.
- pub fn captures_read_at<'t>(
- &self,
- locs: &mut CaptureLocations,
- text: &'t [u8],
- start: usize,
- ) -> Option<Match<'t>> {
- self.0
- .searcher()
- .captures_read_at(&mut locs.0, text, start)
- .map(|(s, e)| Match::new(text, s, e))
- }
-
- /// An undocumented alias for `captures_read_at`.
- ///
- /// The `regex-capi` crate previously used this routine, so to avoid
- /// breaking that crate, we continue to provide the name as an undocumented
- /// alias.
- #[doc(hidden)]
- pub fn read_captures_at<'t>(
- &self,
- locs: &mut CaptureLocations,
- text: &'t [u8],
- start: usize,
- ) -> Option<Match<'t>> {
- self.captures_read_at(locs, text, start)
- }
-}
-
-/// Auxiliary methods.
-impl Regex {
- /// Returns the original string of this regex.
- pub fn as_str(&self) -> &str {
- &self.0.regex_strings()[0]
- }
-
- /// Returns an iterator over the capture names.
- pub fn capture_names(&self) -> CaptureNames<'_> {
- CaptureNames(self.0.capture_names().iter())
- }
-
- /// Returns the number of captures.
- pub fn captures_len(&self) -> usize {
- self.0.capture_names().len()
- }
-
- /// Returns the total number of capturing groups that appear in every
- /// possible match.
- ///
- /// If the number of capture groups can vary depending on the match, then
- /// this returns `None`. That is, a value is only returned when the number
- /// of matching groups is invariant or "static."
- ///
- /// Note that like [`Regex::captures_len`], this **does** include the
- /// implicit capturing group corresponding to the entire match. Therefore,
- /// when a non-None value is returned, it is guaranteed to be at least `1`.
- /// Stated differently, a return value of `Some(0)` is impossible.
- ///
- /// # Example
- ///
- /// This shows a few cases where a static number of capture groups is
- /// available and a few cases where it is not.
- ///
- /// ```
- /// use regex::bytes::Regex;
- ///
- /// let len = |pattern| {
- /// Regex::new(pattern).map(|re| re.static_captures_len())
- /// };
- ///
- /// assert_eq!(Some(1), len("a")?);
- /// assert_eq!(Some(2), len("(a)")?);
- /// assert_eq!(Some(2), len("(a)|(b)")?);
- /// assert_eq!(Some(3), len("(a)(b)|(c)(d)")?);
- /// assert_eq!(None, len("(a)|b")?);
- /// assert_eq!(None, len("a|(b)")?);
- /// assert_eq!(None, len("(b)*")?);
- /// assert_eq!(Some(2), len("(b)+")?);
- ///
- /// # Ok::<(), Box<dyn std::error::Error>>(())
- /// ```
- #[inline]
- pub fn static_captures_len(&self) -> Option<usize> {
- self.0.static_captures_len().map(|len| len.saturating_add(1))
- }
-
- /// Returns an empty set of capture locations that can be reused in
- /// multiple calls to `captures_read` or `captures_read_at`.
- pub fn capture_locations(&self) -> CaptureLocations {
- CaptureLocations(self.0.searcher().locations())
- }
-
- /// An alias for `capture_locations` to preserve backward compatibility.
- ///
- /// The `regex-capi` crate uses this method, so to avoid breaking that
- /// crate, we continue to export it as an undocumented API.
- #[doc(hidden)]
- pub fn locations(&self) -> CaptureLocations {
- CaptureLocations(self.0.searcher().locations())
- }
-}
-
-/// An iterator over all non-overlapping matches for a particular string.
-///
-/// The iterator yields a tuple of integers corresponding to the start and end
-/// of the match. The indices are byte offsets. The iterator stops when no more
-/// matches can be found.
-///
-/// `'r` is the lifetime of the compiled regular expression and `'t` is the
-/// lifetime of the matched byte string.
-#[derive(Debug)]
-pub struct Matches<'r, 't>(re_trait::Matches<'t, ExecNoSync<'r>>);
-
-impl<'r, 't> Iterator for Matches<'r, 't> {
- type Item = Match<'t>;
-
- fn next(&mut self) -> Option<Match<'t>> {
- let text = self.0.text();
- self.0.next().map(|(s, e)| Match::new(text, s, e))
- }
-}
-
-impl<'r, 't> FusedIterator for Matches<'r, 't> {}
-
-/// An iterator that yields all non-overlapping capture groups matching a
-/// particular regular expression.
-///
-/// The iterator stops when no more matches can be found.
-///
-/// `'r` is the lifetime of the compiled regular expression and `'t` is the
-/// lifetime of the matched byte string.
-#[derive(Debug)]
-pub struct CaptureMatches<'r, 't>(
- re_trait::CaptureMatches<'t, ExecNoSync<'r>>,
-);
-
-impl<'r, 't> Iterator for CaptureMatches<'r, 't> {
- type Item = Captures<'t>;
-
- fn next(&mut self) -> Option<Captures<'t>> {
- self.0.next().map(|locs| Captures {
- text: self.0.text(),
- locs,
- named_groups: self.0.regex().capture_name_idx().clone(),
- })
- }
-}
-
-impl<'r, 't> FusedIterator for CaptureMatches<'r, 't> {}
-
-/// Yields all substrings delimited by a regular expression match.
-///
-/// `'r` is the lifetime of the compiled regular expression and `'t` is the
-/// lifetime of the byte string being split.
-#[derive(Debug)]
-pub struct Split<'r, 't> {
- finder: Matches<'r, 't>,
- last: usize,
-}
-
-impl<'r, 't> Iterator for Split<'r, 't> {
- type Item = &'t [u8];
-
- fn next(&mut self) -> Option<&'t [u8]> {
- let text = self.finder.0.text();
- match self.finder.next() {
- None => {
- if self.last > text.len() {
- None
- } else {
- let s = &text[self.last..];
- self.last = text.len() + 1; // Next call will return None
- Some(s)
- }
- }
- Some(m) => {
- let matched = &text[self.last..m.start()];
- self.last = m.end();
- Some(matched)
- }
- }
- }
-}
-
-impl<'r, 't> FusedIterator for Split<'r, 't> {}
-
-/// Yields at most `N` substrings delimited by a regular expression match.
-///
-/// The last substring will be whatever remains after splitting.
-///
-/// `'r` is the lifetime of the compiled regular expression and `'t` is the
-/// lifetime of the byte string being split.
-#[derive(Debug)]
-pub struct SplitN<'r, 't> {
- splits: Split<'r, 't>,
- n: usize,
-}
-
-impl<'r, 't> Iterator for SplitN<'r, 't> {
- type Item = &'t [u8];
-
- fn next(&mut self) -> Option<&'t [u8]> {
- if self.n == 0 {
- return None;
- }
-
- self.n -= 1;
- if self.n > 0 {
- return self.splits.next();
- }
-
- let text = self.splits.finder.0.text();
- if self.splits.last > text.len() {
- // We've already returned all substrings.
- None
- } else {
- // self.n == 0, so future calls will return None immediately
- Some(&text[self.splits.last..])
- }
- }
-
- fn size_hint(&self) -> (usize, Option<usize>) {
- (0, Some(self.n))
- }
-}
-
-impl<'r, 't> FusedIterator for SplitN<'r, 't> {}
-
-/// An iterator over the names of all possible captures.
-///
-/// `None` indicates an unnamed capture; the first element (capture 0, the
-/// whole matched region) is always unnamed.
-///
-/// `'r` is the lifetime of the compiled regular expression.
-#[derive(Clone, Debug)]
-pub struct CaptureNames<'r>(::std::slice::Iter<'r, Option<String>>);
-
-impl<'r> Iterator for CaptureNames<'r> {
- type Item = Option<&'r str>;
-
- fn next(&mut self) -> Option<Option<&'r str>> {
- self.0
- .next()
- .as_ref()
- .map(|slot| slot.as_ref().map(|name| name.as_ref()))
- }
-
- fn size_hint(&self) -> (usize, Option<usize>) {
- self.0.size_hint()
- }
-
- fn count(self) -> usize {
- self.0.count()
- }
-}
-
-impl<'r> ExactSizeIterator for CaptureNames<'r> {}
-
-impl<'r> FusedIterator for CaptureNames<'r> {}
-
-/// CaptureLocations is a low level representation of the raw offsets of each
-/// submatch.
-///
-/// You can think of this as a lower level
-/// [`Captures`](struct.Captures.html), where this type does not support
-/// named capturing groups directly and it does not borrow the text that these
-/// offsets were matched on.
-///
-/// Primarily, this type is useful when using the lower level `Regex` APIs
-/// such as `read_captures`, which permits amortizing the allocation in which
-/// capture match locations are stored.
-///
-/// In order to build a value of this type, you'll need to call the
-/// `capture_locations` method on the `Regex` being used to execute the search.
-/// The value returned can then be reused in subsequent searches.
-///
-/// # Example
-///
-/// This example shows how to create and use `CaptureLocations` in a search.
-///
-/// ```
-/// use regex::bytes::Regex;
-///
-/// let re = Regex::new(r"(?<first>\w+)\s+(?<last>\w+)").unwrap();
-/// let mut locs = re.capture_locations();
-/// let m = re.captures_read(&mut locs, b"Bruce Springsteen").unwrap();
-/// assert_eq!(0..17, m.range());
-/// assert_eq!(Some((0, 17)), locs.get(0));
-/// assert_eq!(Some((0, 5)), locs.get(1));
-/// assert_eq!(Some((6, 17)), locs.get(2));
-///
-/// // Asking for an invalid capture group always returns None.
-/// assert_eq!(None, locs.get(3));
-/// assert_eq!(None, locs.get(34973498648));
-/// assert_eq!(None, locs.get(9944060567225171988));
-/// ```
-#[derive(Clone, Debug)]
-pub struct CaptureLocations(re_trait::Locations);
-
-/// A type alias for `CaptureLocations` for backwards compatibility.
-///
-/// Previously, we exported `CaptureLocations` as `Locations` in an
-/// undocumented API. To prevent breaking that code (e.g., in `regex-capi`),
-/// we continue re-exporting the same undocumented API.
-#[doc(hidden)]
-pub type Locations = CaptureLocations;
-
-impl CaptureLocations {
- /// Returns the start and end positions of the Nth capture group. Returns
- /// `None` if `i` is not a valid capture group or if the capture group did
- /// not match anything. The positions returned are *always* byte indices
- /// with respect to the original string matched.
- #[inline]
- pub fn get(&self, i: usize) -> Option<(usize, usize)> {
- self.0.pos(i)
- }
-
- /// Returns the total number of capture groups (even if they didn't match).
- ///
- /// This is always at least `1` since every regex has at least `1`
- /// capturing group that corresponds to the entire match.
- #[inline]
- pub fn len(&self) -> usize {
- self.0.len()
- }
-
- /// An alias for the `get` method for backwards compatibility.
- ///
- /// Previously, we exported `get` as `pos` in an undocumented API. To
- /// prevent breaking that code (e.g., in `regex-capi`), we continue
- /// re-exporting the same undocumented API.
- #[doc(hidden)]
- #[inline]
- pub fn pos(&self, i: usize) -> Option<(usize, usize)> {
- self.get(i)
- }
-}
-
-/// Captures represents a group of captured byte strings for a single match.
-///
-/// The 0th capture always corresponds to the entire match. Each subsequent
-/// index corresponds to the next capture group in the regex. If a capture
-/// group is named, then the matched byte string is *also* available via the
-/// `name` method. (Note that the 0th capture is always unnamed and so must be
-/// accessed with the `get` method.)
-///
-/// Positions returned from a capture group are always byte indices.
-///
-/// `'t` is the lifetime of the matched text.
-pub struct Captures<'t> {
- text: &'t [u8],
- locs: re_trait::Locations,
- named_groups: Arc<HashMap<String, usize>>,
-}
-
-impl<'t> Captures<'t> {
- /// Returns the match associated with the capture group at index `i`. If
- /// `i` does not correspond to a capture group, or if the capture group
- /// did not participate in the match, then `None` is returned.
- ///
- /// # Examples
- ///
- /// Get the text of the match with a default of an empty string if this
- /// group didn't participate in the match:
- ///
- /// ```rust
- /// # use regex::bytes::Regex;
- /// let re = Regex::new(r"[a-z]+(?:([0-9]+)|([A-Z]+))").unwrap();
- /// let caps = re.captures(b"abc123").unwrap();
- ///
- /// let text1 = caps.get(1).map_or(&b""[..], |m| m.as_bytes());
- /// let text2 = caps.get(2).map_or(&b""[..], |m| m.as_bytes());
- /// assert_eq!(text1, &b"123"[..]);
- /// assert_eq!(text2, &b""[..]);
- /// ```
- pub fn get(&self, i: usize) -> Option<Match<'t>> {
- self.locs.pos(i).map(|(s, e)| Match::new(self.text, s, e))
- }
-
- /// Returns the match for the capture group named `name`. If `name` isn't a
- /// valid capture group or didn't match anything, then `None` is returned.
- pub fn name(&self, name: &str) -> Option<Match<'t>> {
- self.named_groups.get(name).and_then(|&i| self.get(i))
- }
-
- /// An iterator that yields all capturing matches in the order in which
- /// they appear in the regex. If a particular capture group didn't
- /// participate in the match, then `None` is yielded for that capture.
- ///
- /// The first match always corresponds to the overall match of the regex.
- pub fn iter<'c>(&'c self) -> SubCaptureMatches<'c, 't> {
- SubCaptureMatches { caps: self, it: self.locs.iter() }
- }
-
- /// Expands all instances of `$name` in `replacement` to the corresponding
- /// capture group `name`, and writes them to the `dst` buffer given.
- ///
- /// `name` may be an integer corresponding to the index of the capture
- /// group (counted by order of opening parenthesis where `0` is the
- /// entire match) or it can be a name (consisting of letters, digits or
- /// underscores) corresponding to a named capture group.
- ///
- /// If `name` isn't a valid capture group (whether the name doesn't exist
- /// or isn't a valid index), then it is replaced with the empty string.
- ///
- /// The longest possible name consisting of the characters `[_0-9A-Za-z]`
- /// is used. e.g., `$1a` looks up the capture group named `1a` and not the
- /// capture group at index `1`. To exert more precise control over the
- /// name, or to refer to a capture group name that uses characters outside
- /// of `[_0-9A-Za-z]`, use braces, e.g., `${1}a` or `${foo[bar].baz}`. When
- /// using braces, any sequence of valid UTF-8 bytes is permitted. If the
- /// sequence does not refer to a capture group name in the corresponding
- /// regex, then it is replaced with an empty string.
- ///
- /// To write a literal `$` use `$$`.
- pub fn expand(&self, replacement: &[u8], dst: &mut Vec<u8>) {
- expand_bytes(self, replacement, dst)
- }
-
- /// Returns the total number of capture groups (even if they didn't match).
- ///
- /// This is always at least `1`, since every regex has at least one capture
- /// group that corresponds to the full match.
- #[inline]
- pub fn len(&self) -> usize {
- self.locs.len()
- }
-}
-
-impl<'t> fmt::Debug for Captures<'t> {
- fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
- f.debug_tuple("Captures").field(&CapturesDebug(self)).finish()
- }
-}
-
-struct CapturesDebug<'c, 't>(&'c Captures<'t>);
-
-impl<'c, 't> fmt::Debug for CapturesDebug<'c, 't> {
- fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
- fn escape_bytes(bytes: &[u8]) -> String {
- let mut s = String::new();
- for &b in bytes {
- s.push_str(&escape_byte(b));
- }
- s
- }
-
- fn escape_byte(byte: u8) -> String {
- use std::ascii::escape_default;
-
- let escaped: Vec<u8> = escape_default(byte).collect();
- String::from_utf8_lossy(&escaped).into_owned()
- }
-
- // We'd like to show something nice here, even if it means an
- // allocation to build a reverse index.
- let slot_to_name: HashMap<&usize, &String> =
- self.0.named_groups.iter().map(|(a, b)| (b, a)).collect();
- let mut map = f.debug_map();
- for (slot, m) in self.0.locs.iter().enumerate() {
- let m = m.map(|(s, e)| escape_bytes(&self.0.text[s..e]));
- if let Some(name) = slot_to_name.get(&slot) {
- map.entry(&name, &m);
- } else {
- map.entry(&slot, &m);
- }
- }
- map.finish()
- }
-}
-
-/// Get a group by index.
-///
-/// `'t` is the lifetime of the matched text.
-///
-/// The text can't outlive the `Captures` object if this method is
-/// used, because of how `Index` is defined (normally `a[i]` is part
-/// of `a` and can't outlive it); to do that, use `get()` instead.
-///
-/// # Panics
-///
-/// If there is no group at the given index.
-impl<'t> Index<usize> for Captures<'t> {
- type Output = [u8];
-
- fn index(&self, i: usize) -> &[u8] {
- self.get(i)
- .map(|m| m.as_bytes())
- .unwrap_or_else(|| panic!("no group at index '{}'", i))
- }
-}
-
-/// Get a group by name.
-///
-/// `'t` is the lifetime of the matched text and `'i` is the lifetime
-/// of the group name (the index).
-///
-/// The text can't outlive the `Captures` object if this method is
-/// used, because of how `Index` is defined (normally `a[i]` is part
-/// of `a` and can't outlive it); to do that, use `name` instead.
-///
-/// # Panics
-///
-/// If there is no group named by the given value.
-impl<'t, 'i> Index<&'i str> for Captures<'t> {
- type Output = [u8];
-
- fn index<'a>(&'a self, name: &'i str) -> &'a [u8] {
- self.name(name)
- .map(|m| m.as_bytes())
- .unwrap_or_else(|| panic!("no group named '{}'", name))
- }
-}
-
-/// An iterator that yields all capturing matches in the order in which they
-/// appear in the regex.
-///
-/// If a particular capture group didn't participate in the match, then `None`
-/// is yielded for that capture. The first match always corresponds to the
-/// overall match of the regex.
-///
-/// The lifetime `'c` corresponds to the lifetime of the `Captures` value, and
-/// the lifetime `'t` corresponds to the originally matched text.
-#[derive(Clone, Debug)]
-pub struct SubCaptureMatches<'c, 't> {
- caps: &'c Captures<'t>,
- it: SubCapturesPosIter<'c>,
-}
-
-impl<'c, 't> Iterator for SubCaptureMatches<'c, 't> {
- type Item = Option<Match<'t>>;
-
- fn next(&mut self) -> Option<Option<Match<'t>>> {
- self.it
- .next()
- .map(|cap| cap.map(|(s, e)| Match::new(self.caps.text, s, e)))
- }
-}
-
-impl<'c, 't> FusedIterator for SubCaptureMatches<'c, 't> {}
-
-/// Replacer describes types that can be used to replace matches in a byte
-/// string.
-///
-/// In general, users of this crate shouldn't need to implement this trait,
-/// since implementations are already provided for `&[u8]` along with other
-/// variants of bytes types and `FnMut(&Captures) -> Vec<u8>` (or any
-/// `FnMut(&Captures) -> T` where `T: AsRef<[u8]>`), which covers most use cases.
-pub trait Replacer {
- /// Appends text to `dst` to replace the current match.
- ///
- /// The current match is represented by `caps`, which is guaranteed to
- /// have a match at capture group `0`.
- ///
- /// For example, a no-op replacement would be
- /// `dst.extend(&caps[0])`.
- fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>);
-
- /// Return a fixed unchanging replacement byte string.
- ///
- /// When doing replacements, if access to `Captures` is not needed (e.g.,
- /// the replacement byte string does not need `$` expansion), then it can
- /// be beneficial to avoid finding sub-captures.
- ///
- /// In general, this is called once for every call to `replacen`.
- fn no_expansion<'r>(&'r mut self) -> Option<Cow<'r, [u8]>> {
- None
- }
-
- /// Return a `Replacer` that borrows and wraps this `Replacer`.
- ///
- /// This is useful when you want to take a generic `Replacer` (which might
- /// not be cloneable) and use it without consuming it, so it can be used
- /// more than once.
- ///
- /// # Example
- ///
- /// ```
- /// use regex::bytes::{Regex, Replacer};
- ///
- /// fn replace_all_twice<R: Replacer>(
- /// re: Regex,
- /// src: &[u8],
- /// mut rep: R,
- /// ) -> Vec<u8> {
- /// let dst = re.replace_all(src, rep.by_ref());
- /// let dst = re.replace_all(&dst, rep.by_ref());
- /// dst.into_owned()
- /// }
- /// ```
- fn by_ref<'r>(&'r mut self) -> ReplacerRef<'r, Self> {
- ReplacerRef(self)
- }
-}
-
-/// By-reference adaptor for a `Replacer`
-///
-/// Returned by [`Replacer::by_ref`](trait.Replacer.html#method.by_ref).
-#[derive(Debug)]
-pub struct ReplacerRef<'a, R: ?Sized>(&'a mut R);
-
-impl<'a, R: Replacer + ?Sized + 'a> Replacer for ReplacerRef<'a, R> {
- fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) {
- self.0.replace_append(caps, dst)
- }
- fn no_expansion<'r>(&'r mut self) -> Option<Cow<'r, [u8]>> {
- self.0.no_expansion()
- }
-}
-
-impl<'a> Replacer for &'a [u8] {
- fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) {
- caps.expand(*self, dst);
- }
-
- fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> {
- no_expansion(self)
- }
-}
-
-impl<'a> Replacer for &'a Vec<u8> {
- fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) {
- caps.expand(*self, dst);
- }
-
- fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> {
- no_expansion(self)
- }
-}
-
-impl Replacer for Vec<u8> {
- fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) {
- caps.expand(self, dst);
- }
-
- fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> {
- no_expansion(self)
- }
-}
-
-impl<'a> Replacer for Cow<'a, [u8]> {
- fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) {
- caps.expand(self.as_ref(), dst);
- }
-
- fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> {
- no_expansion(self)
- }
-}
-
-impl<'a> Replacer for &'a Cow<'a, [u8]> {
- fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) {
- caps.expand(self.as_ref(), dst);
- }
-
- fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> {
- no_expansion(self)
- }
-}
-
-fn no_expansion<T: AsRef<[u8]>>(t: &T) -> Option<Cow<'_, [u8]>> {
- let s = t.as_ref();
- match find_byte(b'$', s) {
- Some(_) => None,
- None => Some(Cow::Borrowed(s)),
- }
-}
-
-impl<F, T> Replacer for F
-where
- F: FnMut(&Captures<'_>) -> T,
- T: AsRef<[u8]>,
-{
- fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) {
- dst.extend_from_slice((*self)(caps).as_ref());
- }
-}
-
-/// `NoExpand` indicates literal byte string replacement.
-///
-/// It can be used with `replace` and `replace_all` to do a literal byte string
-/// replacement without expanding `$name` to their corresponding capture
-/// groups. This can be both convenient (to avoid escaping `$`, for example)
-/// and performant (since capture groups don't need to be found).
-///
-/// `'t` is the lifetime of the literal text.
-#[derive(Clone, Debug)]
-pub struct NoExpand<'t>(pub &'t [u8]);
-
-impl<'t> Replacer for NoExpand<'t> {
- fn replace_append(&mut self, _: &Captures<'_>, dst: &mut Vec<u8>) {
- dst.extend_from_slice(self.0);
- }
-
- fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> {
- Some(Cow::Borrowed(self.0))
- }
-}
diff --git a/vendor/regex/src/re_set.rs b/vendor/regex/src/re_set.rs
deleted file mode 100644
index 7c8253f0c..000000000
--- a/vendor/regex/src/re_set.rs
+++ /dev/null
@@ -1,518 +0,0 @@
-macro_rules! define_set {
- ($name:ident, $builder_mod:ident, $text_ty:ty, $as_bytes:expr,
- $(#[$doc_regexset_example:meta])* ) => {
- pub mod $name {
- use std::fmt;
- use std::iter;
- use std::slice;
- use std::vec;
-
- use crate::error::Error;
- use crate::exec::Exec;
- use crate::re_builder::$builder_mod::RegexSetBuilder;
- use crate::re_trait::RegularExpression;
-
-/// Match multiple (possibly overlapping) regular expressions in a single scan.
-///
-/// A regex set corresponds to the union of two or more regular expressions.
-/// That is, a regex set will match text where at least one of its
-/// constituent regular expressions matches. A regex set as its formulated here
-/// provides a touch more power: it will also report *which* regular
-/// expressions in the set match. Indeed, this is the key difference between
-/// regex sets and a single `Regex` with many alternates, since only one
-/// alternate can match at a time.
-///
-/// For example, consider regular expressions to match email addresses and
-/// domains: `[a-z]+@[a-z]+\.(com|org|net)` and `[a-z]+\.(com|org|net)`. If a
-/// regex set is constructed from those regexes, then searching the text
-/// `foo@example.com` will report both regexes as matching. Of course, one
-/// could accomplish this by compiling each regex on its own and doing two
-/// searches over the text. The key advantage of using a regex set is that it
-/// will report the matching regexes using a *single pass through the text*.
-/// If one has hundreds or thousands of regexes to match repeatedly (like a URL
-/// router for a complex web application or a user agent matcher), then a regex
-/// set can realize huge performance gains.
-///
-/// # Example
-///
-/// This shows how the above two regexes (for matching email addresses and
-/// domains) might work:
-///
-$(#[$doc_regexset_example])*
-///
-/// Note that it would be possible to adapt the above example to using `Regex`
-/// with an expression like:
-///
-/// ```text
-/// (?P<email>[a-z]+@(?P<email_domain>[a-z]+[.](com|org|net)))|(?P<domain>[a-z]+[.](com|org|net))
-/// ```
-///
-/// After a match, one could then inspect the capture groups to figure out
-/// which alternates matched. The problem is that it is hard to make this
-/// approach scale when there are many regexes since the overlap between each
-/// alternate isn't always obvious to reason about.
-///
-/// # Limitations
-///
-/// Regex sets are limited to answering the following two questions:
-///
-/// 1. Does any regex in the set match?
-/// 2. If so, which regexes in the set match?
-///
-/// As with the main [`Regex`][crate::Regex] type, it is cheaper to ask (1)
-/// instead of (2) since the matching engines can stop after the first match
-/// is found.
-///
-/// You cannot directly extract [`Match`][crate::Match] or
-/// [`Captures`][crate::Captures] objects from a regex set. If you need these
-/// operations, the recommended approach is to compile each pattern in the set
-/// independently and scan the exact same input a second time with those
-/// independently compiled patterns:
-///
-/// ```rust
-/// use regex::{Regex, RegexSet};
-///
-/// let patterns = ["foo", "bar"];
-/// // Both patterns will match different ranges of this string.
-/// let text = "barfoo";
-///
-/// // Compile a set matching any of our patterns.
-/// let set = RegexSet::new(&patterns).unwrap();
-/// // Compile each pattern independently.
-/// let regexes: Vec<_> = set.patterns().iter()
-/// .map(|pat| Regex::new(pat).unwrap())
-/// .collect();
-///
-/// // Match against the whole set first and identify the individual
-/// // matching patterns.
-/// let matches: Vec<&str> = set.matches(text).into_iter()
-/// // Dereference the match index to get the corresponding
-/// // compiled pattern.
-/// .map(|match_idx| &regexes[match_idx])
-/// // To get match locations or any other info, we then have to search
-/// // the exact same text again, using our separately-compiled pattern.
-/// .map(|pat| pat.find(text).unwrap().as_str())
-/// .collect();
-///
-/// // Matches arrive in the order the constituent patterns were declared,
-/// // not the order they appear in the input.
-/// assert_eq!(vec!["foo", "bar"], matches);
-/// ```
-///
-/// # Performance
-///
-/// A `RegexSet` has the same performance characteristics as `Regex`. Namely,
-/// search takes `O(mn)` time, where `m` is proportional to the size of the
-/// regex set and `n` is proportional to the length of the search text.
-#[derive(Clone)]
-pub struct RegexSet(Exec);
-
-impl RegexSet {
- /// Create a new regex set with the given regular expressions.
- ///
- /// This takes an iterator of `S`, where `S` is something that can produce
- /// a `&str`. If any of the strings in the iterator are not valid regular
- /// expressions, then an error is returned.
- ///
- /// # Example
- ///
- /// Create a new regex set from an iterator of strings:
- ///
- /// ```rust
- /// # use regex::RegexSet;
- /// let set = RegexSet::new(&[r"\w+", r"\d+"]).unwrap();
- /// assert!(set.is_match("foo"));
- /// ```
- pub fn new<I, S>(exprs: I) -> Result<RegexSet, Error>
- where S: AsRef<str>, I: IntoIterator<Item=S> {
- RegexSetBuilder::new(exprs).build()
- }
-
- /// Create a new empty regex set.
- ///
- /// # Example
- ///
- /// ```rust
- /// # use regex::RegexSet;
- /// let set = RegexSet::empty();
- /// assert!(set.is_empty());
- /// ```
- pub fn empty() -> RegexSet {
- RegexSetBuilder::new(&[""; 0]).build().unwrap()
- }
-
- /// Returns true if and only if one of the regexes in this set matches
- /// the text given.
- ///
- /// This method should be preferred if you only need to test whether any
- /// of the regexes in the set should match, but don't care about *which*
- /// regexes matched. This is because the underlying matching engine will
- /// quit immediately after seeing the first match instead of continuing to
- /// find all matches.
- ///
- /// Note that as with searches using `Regex`, the expression is unanchored
- /// by default. That is, if the regex does not start with `^` or `\A`, or
- /// end with `$` or `\z`, then it is permitted to match anywhere in the
- /// text.
- ///
- /// # Example
- ///
- /// Tests whether a set matches some text:
- ///
- /// ```rust
- /// # use regex::RegexSet;
- /// let set = RegexSet::new(&[r"\w+", r"\d+"]).unwrap();
- /// assert!(set.is_match("foo"));
- /// assert!(!set.is_match("☃"));
- /// ```
- pub fn is_match(&self, text: $text_ty) -> bool {
- self.is_match_at(text, 0)
- }
-
- /// Returns the same as is_match, but starts the search at the given
- /// offset.
- ///
- /// The significance of the starting point is that it takes the surrounding
- /// context into consideration. For example, the `\A` anchor can only
- /// match when `start == 0`.
- #[doc(hidden)]
- pub fn is_match_at(&self, text: $text_ty, start: usize) -> bool {
- self.0.searcher().is_match_at($as_bytes(text), start)
- }
-
- /// Returns the set of regular expressions that match in the given text.
- ///
- /// The set returned contains the index of each regular expression that
- /// matches in the given text. The index is in correspondence with the
- /// order of regular expressions given to `RegexSet`'s constructor.
- ///
- /// The set can also be used to iterate over the matched indices.
- ///
- /// Note that as with searches using `Regex`, the expression is unanchored
- /// by default. That is, if the regex does not start with `^` or `\A`, or
- /// end with `$` or `\z`, then it is permitted to match anywhere in the
- /// text.
- ///
- /// # Example
- ///
- /// Tests which regular expressions match the given text:
- ///
- /// ```rust
- /// # use regex::RegexSet;
- /// let set = RegexSet::new(&[
- /// r"\w+",
- /// r"\d+",
- /// r"\pL+",
- /// r"foo",
- /// r"bar",
- /// r"barfoo",
- /// r"foobar",
- /// ]).unwrap();
- /// let matches: Vec<_> = set.matches("foobar").into_iter().collect();
- /// assert_eq!(matches, vec![0, 2, 3, 4, 6]);
- ///
- /// // You can also test whether a particular regex matched:
- /// let matches = set.matches("foobar");
- /// assert!(!matches.matched(5));
- /// assert!(matches.matched(6));
- /// ```
- pub fn matches(&self, text: $text_ty) -> SetMatches {
- let mut matches = vec![false; self.0.regex_strings().len()];
- let any = self.read_matches_at(&mut matches, text, 0);
- SetMatches {
- matched_any: any,
- matches: matches,
- }
- }
-
- /// Returns the same as matches, but starts the search at the given
- /// offset and stores the matches into the slice given.
- ///
- /// The significance of the starting point is that it takes the surrounding
- /// context into consideration. For example, the `\A` anchor can only
- /// match when `start == 0`.
- ///
- /// `matches` must have a length that is at least the number of regexes
- /// in this set.
- ///
- /// This method returns true if and only if at least one member of
- /// `matches` is true after executing the set against `text`.
- #[doc(hidden)]
- pub fn read_matches_at(
- &self,
- matches: &mut [bool],
- text: $text_ty,
- start: usize,
- ) -> bool {
- self.0.searcher().many_matches_at(matches, $as_bytes(text), start)
- }
-
- /// Returns the total number of regular expressions in this set.
- pub fn len(&self) -> usize {
- self.0.regex_strings().len()
- }
-
- /// Returns `true` if this set contains no regular expressions.
- pub fn is_empty(&self) -> bool {
- self.0.regex_strings().is_empty()
- }
-
- /// Returns the patterns that this set will match on.
- ///
- /// This function can be used to determine the pattern for a match. The
- /// slice returned has exactly as many patterns givens to this regex set,
- /// and the order of the slice is the same as the order of the patterns
- /// provided to the set.
- ///
- /// # Example
- ///
- /// ```rust
- /// # use regex::RegexSet;
- /// let set = RegexSet::new(&[
- /// r"\w+",
- /// r"\d+",
- /// r"\pL+",
- /// r"foo",
- /// r"bar",
- /// r"barfoo",
- /// r"foobar",
- /// ]).unwrap();
- /// let matches: Vec<_> = set
- /// .matches("foobar")
- /// .into_iter()
- /// .map(|match_idx| &set.patterns()[match_idx])
- /// .collect();
- /// assert_eq!(matches, vec![r"\w+", r"\pL+", r"foo", r"bar", r"foobar"]);
- /// ```
- pub fn patterns(&self) -> &[String] {
- self.0.regex_strings()
- }
-}
-
-impl Default for RegexSet {
- fn default() -> Self {
- RegexSet::empty()
- }
-}
-
-/// A set of matches returned by a regex set.
-#[derive(Clone, Debug)]
-pub struct SetMatches {
- matched_any: bool,
- matches: Vec<bool>,
-}
-
-impl SetMatches {
- /// Whether this set contains any matches.
- pub fn matched_any(&self) -> bool {
- self.matched_any
- }
-
- /// Whether the regex at the given index matched.
- ///
- /// The index for a regex is determined by its insertion order upon the
- /// initial construction of a `RegexSet`, starting at `0`.
- ///
- /// # Panics
- ///
- /// If `regex_index` is greater than or equal to `self.len()`.
- pub fn matched(&self, regex_index: usize) -> bool {
- self.matches[regex_index]
- }
-
- /// The total number of regexes in the set that created these matches.
- ///
- /// **WARNING:** This always returns the same value as [`RegexSet::len`].
- /// In particular, it does *not* return the number of elements yielded by
- /// [`SetMatches::iter`]. The only way to determine the total number of
- /// matched regexes is to iterate over them.
- pub fn len(&self) -> usize {
- self.matches.len()
- }
-
- /// Returns an iterator over indexes in the regex that matched.
- ///
- /// This will always produces matches in ascending order of index, where
- /// the index corresponds to the index of the regex that matched with
- /// respect to its position when initially building the set.
- pub fn iter(&self) -> SetMatchesIter<'_> {
- SetMatchesIter((&*self.matches).into_iter().enumerate())
- }
-}
-
-impl IntoIterator for SetMatches {
- type IntoIter = SetMatchesIntoIter;
- type Item = usize;
-
- fn into_iter(self) -> Self::IntoIter {
- SetMatchesIntoIter(self.matches.into_iter().enumerate())
- }
-}
-
-impl<'a> IntoIterator for &'a SetMatches {
- type IntoIter = SetMatchesIter<'a>;
- type Item = usize;
-
- fn into_iter(self) -> Self::IntoIter {
- self.iter()
- }
-}
-
-/// An owned iterator over the set of matches from a regex set.
-///
-/// This will always produces matches in ascending order of index, where the
-/// index corresponds to the index of the regex that matched with respect to
-/// its position when initially building the set.
-#[derive(Debug)]
-pub struct SetMatchesIntoIter(iter::Enumerate<vec::IntoIter<bool>>);
-
-impl Iterator for SetMatchesIntoIter {
- type Item = usize;
-
- fn next(&mut self) -> Option<usize> {
- loop {
- match self.0.next() {
- None => return None,
- Some((_, false)) => {}
- Some((i, true)) => return Some(i),
- }
- }
- }
-
- fn size_hint(&self) -> (usize, Option<usize>) {
- self.0.size_hint()
- }
-}
-
-impl DoubleEndedIterator for SetMatchesIntoIter {
- fn next_back(&mut self) -> Option<usize> {
- loop {
- match self.0.next_back() {
- None => return None,
- Some((_, false)) => {}
- Some((i, true)) => return Some(i),
- }
- }
- }
-}
-
-impl iter::FusedIterator for SetMatchesIntoIter {}
-
-/// A borrowed iterator over the set of matches from a regex set.
-///
-/// The lifetime `'a` refers to the lifetime of a `SetMatches` value.
-///
-/// This will always produces matches in ascending order of index, where the
-/// index corresponds to the index of the regex that matched with respect to
-/// its position when initially building the set.
-#[derive(Clone, Debug)]
-pub struct SetMatchesIter<'a>(iter::Enumerate<slice::Iter<'a, bool>>);
-
-impl<'a> Iterator for SetMatchesIter<'a> {
- type Item = usize;
-
- fn next(&mut self) -> Option<usize> {
- loop {
- match self.0.next() {
- None => return None,
- Some((_, &false)) => {}
- Some((i, &true)) => return Some(i),
- }
- }
- }
-
- fn size_hint(&self) -> (usize, Option<usize>) {
- self.0.size_hint()
- }
-}
-
-impl<'a> DoubleEndedIterator for SetMatchesIter<'a> {
- fn next_back(&mut self) -> Option<usize> {
- loop {
- match self.0.next_back() {
- None => return None,
- Some((_, &false)) => {}
- Some((i, &true)) => return Some(i),
- }
- }
- }
-}
-
-impl<'a> iter::FusedIterator for SetMatchesIter<'a> {}
-
-#[doc(hidden)]
-impl From<Exec> for RegexSet {
- fn from(exec: Exec) -> Self {
- RegexSet(exec)
- }
-}
-
-impl fmt::Debug for RegexSet {
- fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
- write!(f, "RegexSet({:?})", self.0.regex_strings())
- }
-}
-
-#[allow(dead_code)] fn as_bytes_str(text: &str) -> &[u8] { text.as_bytes() }
-#[allow(dead_code)] fn as_bytes_bytes(text: &[u8]) -> &[u8] { text }
- }
- }
-}
-
-define_set! {
- unicode,
- set_unicode,
- &str,
- as_bytes_str,
-/// ```rust
-/// # use regex::RegexSet;
-/// let set = RegexSet::new(&[
-/// r"[a-z]+@[a-z]+\.(com|org|net)",
-/// r"[a-z]+\.(com|org|net)",
-/// ]).unwrap();
-///
-/// // Ask whether any regexes in the set match.
-/// assert!(set.is_match("foo@example.com"));
-///
-/// // Identify which regexes in the set match.
-/// let matches: Vec<_> = set.matches("foo@example.com").into_iter().collect();
-/// assert_eq!(vec![0, 1], matches);
-///
-/// // Try again, but with text that only matches one of the regexes.
-/// let matches: Vec<_> = set.matches("example.com").into_iter().collect();
-/// assert_eq!(vec![1], matches);
-///
-/// // Try again, but with text that doesn't match any regex in the set.
-/// let matches: Vec<_> = set.matches("example").into_iter().collect();
-/// assert!(matches.is_empty());
-/// ```
-}
-
-define_set! {
- bytes,
- set_bytes,
- &[u8],
- as_bytes_bytes,
-/// ```rust
-/// # use regex::bytes::RegexSet;
-/// let set = RegexSet::new(&[
-/// r"[a-z]+@[a-z]+\.(com|org|net)",
-/// r"[a-z]+\.(com|org|net)",
-/// ]).unwrap();
-///
-/// // Ask whether any regexes in the set match.
-/// assert!(set.is_match(b"foo@example.com"));
-///
-/// // Identify which regexes in the set match.
-/// let matches: Vec<_> = set.matches(b"foo@example.com").into_iter().collect();
-/// assert_eq!(vec![0, 1], matches);
-///
-/// // Try again, but with text that only matches one of the regexes.
-/// let matches: Vec<_> = set.matches(b"example.com").into_iter().collect();
-/// assert_eq!(vec![1], matches);
-///
-/// // Try again, but with text that doesn't match any regex in the set.
-/// let matches: Vec<_> = set.matches(b"example").into_iter().collect();
-/// assert!(matches.is_empty());
-/// ```
-}
diff --git a/vendor/regex/src/re_trait.rs b/vendor/regex/src/re_trait.rs
deleted file mode 100644
index 505810c84..000000000
--- a/vendor/regex/src/re_trait.rs
+++ /dev/null
@@ -1,294 +0,0 @@
-use std::fmt;
-use std::iter::FusedIterator;
-
-/// Slot is a single saved capture location. Note that there are two slots for
-/// every capture in a regular expression (one slot each for the start and end
-/// of the capture).
-pub type Slot = Option<usize>;
-
-/// Locations represents the offsets of each capturing group in a regex for
-/// a single match.
-///
-/// Unlike `Captures`, a `Locations` value only stores offsets.
-#[doc(hidden)]
-#[derive(Clone, Debug)]
-pub struct Locations(Vec<Slot>);
-
-impl Locations {
- /// Returns the start and end positions of the Nth capture group. Returns
- /// `None` if `i` is not a valid capture group or if the capture group did
- /// not match anything. The positions returned are *always* byte indices
- /// with respect to the original string matched.
- pub fn pos(&self, i: usize) -> Option<(usize, usize)> {
- let (s, e) = (i.checked_mul(2)?, i.checked_mul(2)?.checked_add(1)?);
- match (self.0.get(s), self.0.get(e)) {
- (Some(&Some(s)), Some(&Some(e))) => Some((s, e)),
- _ => None,
- }
- }
-
- /// Creates an iterator of all the capture group positions in order of
- /// appearance in the regular expression. Positions are byte indices
- /// in terms of the original string matched.
- pub fn iter(&self) -> SubCapturesPosIter<'_> {
- SubCapturesPosIter { idx: 0, locs: self }
- }
-
- /// Returns the total number of capturing groups.
- ///
- /// This is always at least `1` since every regex has at least `1`
- /// capturing group that corresponds to the entire match.
- pub fn len(&self) -> usize {
- self.0.len() / 2
- }
-
- /// Return the individual slots as a slice.
- pub(crate) fn as_slots(&mut self) -> &mut [Slot] {
- &mut self.0
- }
-}
-
-/// An iterator over capture group positions for a particular match of a
-/// regular expression.
-///
-/// Positions are byte indices in terms of the original string matched.
-///
-/// `'c` is the lifetime of the captures.
-#[derive(Clone, Debug)]
-pub struct SubCapturesPosIter<'c> {
- idx: usize,
- locs: &'c Locations,
-}
-
-impl<'c> Iterator for SubCapturesPosIter<'c> {
- type Item = Option<(usize, usize)>;
-
- fn next(&mut self) -> Option<Option<(usize, usize)>> {
- if self.idx >= self.locs.len() {
- return None;
- }
- let x = match self.locs.pos(self.idx) {
- None => Some(None),
- Some((s, e)) => Some(Some((s, e))),
- };
- self.idx += 1;
- x
- }
-
- fn size_hint(&self) -> (usize, Option<usize>) {
- let len = self.locs.len() - self.idx;
- (len, Some(len))
- }
-
- fn count(self) -> usize {
- self.len()
- }
-}
-
-impl<'c> ExactSizeIterator for SubCapturesPosIter<'c> {}
-
-impl<'c> FusedIterator for SubCapturesPosIter<'c> {}
-
-/// `RegularExpression` describes types that can implement regex searching.
-///
-/// This trait is my attempt at reducing code duplication and to standardize
-/// the internal API. Specific duplication that is avoided are the `find`
-/// and `capture` iterators, which are slightly tricky.
-///
-/// It's not clear whether this trait is worth it, and it also isn't
-/// clear whether it's useful as a public trait or not. Methods like
-/// `next_after_empty` reak of bad design, but the rest of the methods seem
-/// somewhat reasonable. One particular thing this trait would expose would be
-/// the ability to start the search of a regex anywhere in a haystack, which
-/// isn't possible in the current public API.
-pub trait RegularExpression: Sized + fmt::Debug {
- /// The type of the haystack.
- type Text: ?Sized + fmt::Debug;
-
- /// The number of capture slots in the compiled regular expression. This is
- /// always two times the number of capture groups (two slots per group).
- fn slots_len(&self) -> usize;
-
- /// Allocates fresh space for all capturing groups in this regex.
- fn locations(&self) -> Locations {
- Locations(vec![None; self.slots_len()])
- }
-
- /// Returns the position of the next character after `i`.
- ///
- /// For example, a haystack with type `&[u8]` probably returns `i+1`,
- /// whereas a haystack with type `&str` probably returns `i` plus the
- /// length of the next UTF-8 sequence.
- fn next_after_empty(&self, text: &Self::Text, i: usize) -> usize;
-
- /// Returns the location of the shortest match.
- fn shortest_match_at(
- &self,
- text: &Self::Text,
- start: usize,
- ) -> Option<usize>;
-
- /// Returns whether the regex matches the text given.
- fn is_match_at(&self, text: &Self::Text, start: usize) -> bool;
-
- /// Returns the leftmost-first match location if one exists.
- fn find_at(
- &self,
- text: &Self::Text,
- start: usize,
- ) -> Option<(usize, usize)>;
-
- /// Returns the leftmost-first match location if one exists, and also
- /// fills in any matching capture slot locations.
- fn captures_read_at(
- &self,
- locs: &mut Locations,
- text: &Self::Text,
- start: usize,
- ) -> Option<(usize, usize)>;
-
- /// Returns an iterator over all non-overlapping successive leftmost-first
- /// matches.
- fn find_iter(self, text: &Self::Text) -> Matches<'_, Self> {
- Matches { re: self, text, last_end: 0, last_match: None }
- }
-
- /// Returns an iterator over all non-overlapping successive leftmost-first
- /// matches with captures.
- fn captures_iter(self, text: &Self::Text) -> CaptureMatches<'_, Self> {
- CaptureMatches(self.find_iter(text))
- }
-}
-
-/// An iterator over all non-overlapping successive leftmost-first matches.
-#[derive(Debug)]
-pub struct Matches<'t, R>
-where
- R: RegularExpression,
- R::Text: 't,
-{
- re: R,
- text: &'t R::Text,
- last_end: usize,
- last_match: Option<usize>,
-}
-
-impl<'t, R> Matches<'t, R>
-where
- R: RegularExpression,
- R::Text: 't,
-{
- /// Return the text being searched.
- pub fn text(&self) -> &'t R::Text {
- self.text
- }
-
- /// Return the underlying regex.
- pub fn regex(&self) -> &R {
- &self.re
- }
-}
-
-impl<'t, R> Iterator for Matches<'t, R>
-where
- R: RegularExpression,
- R::Text: 't + AsRef<[u8]>,
-{
- type Item = (usize, usize);
-
- fn next(&mut self) -> Option<(usize, usize)> {
- if self.last_end > self.text.as_ref().len() {
- return None;
- }
- let (s, e) = match self.re.find_at(self.text, self.last_end) {
- None => return None,
- Some((s, e)) => (s, e),
- };
- if s == e {
- // This is an empty match. To ensure we make progress, start
- // the next search at the smallest possible starting position
- // of the next match following this one.
- self.last_end = self.re.next_after_empty(self.text, e);
- // Don't accept empty matches immediately following a match.
- // Just move on to the next match.
- if Some(e) == self.last_match {
- return self.next();
- }
- } else {
- self.last_end = e;
- }
- self.last_match = Some(e);
- Some((s, e))
- }
-}
-
-impl<'t, R> FusedIterator for Matches<'t, R>
-where
- R: RegularExpression,
- R::Text: 't + AsRef<[u8]>,
-{
-}
-
-/// An iterator over all non-overlapping successive leftmost-first matches with
-/// captures.
-#[derive(Debug)]
-pub struct CaptureMatches<'t, R>(Matches<'t, R>)
-where
- R: RegularExpression,
- R::Text: 't;
-
-impl<'t, R> CaptureMatches<'t, R>
-where
- R: RegularExpression,
- R::Text: 't,
-{
- /// Return the text being searched.
- pub fn text(&self) -> &'t R::Text {
- self.0.text()
- }
-
- /// Return the underlying regex.
- pub fn regex(&self) -> &R {
- self.0.regex()
- }
-}
-
-impl<'t, R> Iterator for CaptureMatches<'t, R>
-where
- R: RegularExpression,
- R::Text: 't + AsRef<[u8]>,
-{
- type Item = Locations;
-
- fn next(&mut self) -> Option<Locations> {
- if self.0.last_end > self.0.text.as_ref().len() {
- return None;
- }
- let mut locs = self.0.re.locations();
- let (s, e) = match self.0.re.captures_read_at(
- &mut locs,
- self.0.text,
- self.0.last_end,
- ) {
- None => return None,
- Some((s, e)) => (s, e),
- };
- if s == e {
- self.0.last_end = self.0.re.next_after_empty(self.0.text, e);
- if Some(e) == self.0.last_match {
- return self.next();
- }
- } else {
- self.0.last_end = e;
- }
- self.0.last_match = Some(e);
- Some(locs)
- }
-}
-
-impl<'t, R> FusedIterator for CaptureMatches<'t, R>
-where
- R: RegularExpression,
- R::Text: 't + AsRef<[u8]>,
-{
-}
diff --git a/vendor/regex/src/re_unicode.rs b/vendor/regex/src/re_unicode.rs
deleted file mode 100644
index 57689086d..000000000
--- a/vendor/regex/src/re_unicode.rs
+++ /dev/null
@@ -1,1415 +0,0 @@
-use std::borrow::Cow;
-use std::collections::HashMap;
-use std::fmt;
-use std::iter::FusedIterator;
-use std::ops::{Index, Range};
-use std::str::FromStr;
-use std::sync::Arc;
-
-use crate::find_byte::find_byte;
-
-use crate::error::Error;
-use crate::exec::{Exec, ExecNoSyncStr};
-use crate::expand::expand_str;
-use crate::re_builder::unicode::RegexBuilder;
-use crate::re_trait::{self, RegularExpression, SubCapturesPosIter};
-
-/// Escapes all regular expression meta characters in `text`.
-///
-/// The string returned may be safely used as a literal in a regular
-/// expression.
-pub fn escape(text: &str) -> String {
- regex_syntax::escape(text)
-}
-
-/// Match represents a single match of a regex in a haystack.
-///
-/// The lifetime parameter `'t` refers to the lifetime of the matched text.
-#[derive(Copy, Clone, Eq, PartialEq)]
-pub struct Match<'t> {
- text: &'t str,
- start: usize,
- end: usize,
-}
-
-impl<'t> Match<'t> {
- /// Returns the starting byte offset of the match in the haystack.
- #[inline]
- pub fn start(&self) -> usize {
- self.start
- }
-
- /// Returns the ending byte offset of the match in the haystack.
- #[inline]
- pub fn end(&self) -> usize {
- self.end
- }
-
- /// Returns true if and only if this match has a length of zero.
- #[inline]
- pub fn is_empty(&self) -> bool {
- self.start == self.end
- }
-
- /// Returns the length, in bytes, of this match.
- #[inline]
- pub fn len(&self) -> usize {
- self.end - self.start
- }
-
- /// Returns the range over the starting and ending byte offsets of the
- /// match in the haystack.
- #[inline]
- pub fn range(&self) -> Range<usize> {
- self.start..self.end
- }
-
- /// Returns the matched text.
- #[inline]
- pub fn as_str(&self) -> &'t str {
- &self.text[self.range()]
- }
-
- /// Creates a new match from the given haystack and byte offsets.
- #[inline]
- fn new(haystack: &'t str, start: usize, end: usize) -> Match<'t> {
- Match { text: haystack, start, end }
- }
-}
-
-impl<'t> std::fmt::Debug for Match<'t> {
- fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
- f.debug_struct("Match")
- .field("start", &self.start)
- .field("end", &self.end)
- .field("string", &self.as_str())
- .finish()
- }
-}
-
-impl<'t> From<Match<'t>> for &'t str {
- fn from(m: Match<'t>) -> &'t str {
- m.as_str()
- }
-}
-
-impl<'t> From<Match<'t>> for Range<usize> {
- fn from(m: Match<'t>) -> Range<usize> {
- m.range()
- }
-}
-
-/// A compiled regular expression for matching Unicode strings.
-///
-/// It is represented as either a sequence of bytecode instructions (dynamic)
-/// or as a specialized Rust function (native). It can be used to search, split
-/// or replace text. All searching is done with an implicit `.*?` at the
-/// beginning and end of an expression. To force an expression to match the
-/// whole string (or a prefix or a suffix), you must use an anchor like `^` or
-/// `$` (or `\A` and `\z`).
-///
-/// While this crate will handle Unicode strings (whether in the regular
-/// expression or in the search text), all positions returned are **byte
-/// indices**. Every byte index is guaranteed to be at a Unicode code point
-/// boundary.
-///
-/// The lifetimes `'r` and `'t` in this crate correspond to the lifetime of a
-/// compiled regular expression and text to search, respectively.
-///
-/// The only methods that allocate new strings are the string replacement
-/// methods. All other methods (searching and splitting) return borrowed
-/// pointers into the string given.
-///
-/// # Examples
-///
-/// Find the location of a US phone number:
-///
-/// ```rust
-/// # use regex::Regex;
-/// let re = Regex::new("[0-9]{3}-[0-9]{3}-[0-9]{4}").unwrap();
-/// let mat = re.find("phone: 111-222-3333").unwrap();
-/// assert_eq!((mat.start(), mat.end()), (7, 19));
-/// ```
-///
-/// # Using the `std::str::pattern` methods with `Regex`
-///
-/// > **Note**: This section requires that this crate is compiled with the
-/// > `pattern` Cargo feature enabled, which **requires nightly Rust**.
-///
-/// Since `Regex` implements `Pattern`, you can use regexes with methods
-/// defined on `&str`. For example, `is_match`, `find`, `find_iter`
-/// and `split` can be replaced with `str::contains`, `str::find`,
-/// `str::match_indices` and `str::split`.
-///
-/// Here are some examples:
-///
-/// ```rust,ignore
-/// # use regex::Regex;
-/// let re = Regex::new(r"\d+").unwrap();
-/// let haystack = "a111b222c";
-///
-/// assert!(haystack.contains(&re));
-/// assert_eq!(haystack.find(&re), Some(1));
-/// assert_eq!(haystack.match_indices(&re).collect::<Vec<_>>(),
-/// vec![(1, "111"), (5, "222")]);
-/// assert_eq!(haystack.split(&re).collect::<Vec<_>>(), vec!["a", "b", "c"]);
-/// ```
-#[derive(Clone)]
-pub struct Regex(Exec);
-
-impl fmt::Display for Regex {
- /// Shows the original regular expression.
- fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
- write!(f, "{}", self.as_str())
- }
-}
-
-impl fmt::Debug for Regex {
- /// Shows the original regular expression.
- fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
- fmt::Display::fmt(self, f)
- }
-}
-
-#[doc(hidden)]
-impl From<Exec> for Regex {
- fn from(exec: Exec) -> Regex {
- Regex(exec)
- }
-}
-
-impl FromStr for Regex {
- type Err = Error;
-
- /// Attempts to parse a string into a regular expression
- fn from_str(s: &str) -> Result<Regex, Error> {
- Regex::new(s)
- }
-}
-
-/// Core regular expression methods.
-impl Regex {
- /// Compiles a regular expression. Once compiled, it can be used repeatedly
- /// to search, split or replace text in a string.
- ///
- /// If an invalid expression is given, then an error is returned.
- pub fn new(re: &str) -> Result<Regex, Error> {
- RegexBuilder::new(re).build()
- }
-
- /// Returns true if and only if there is a match for the regex in the
- /// string given.
- ///
- /// It is recommended to use this method if all you need to do is test
- /// a match, since the underlying matching engine may be able to do less
- /// work.
- ///
- /// # Example
- ///
- /// Test if some text contains at least one word with exactly 13
- /// Unicode word characters:
- ///
- /// ```rust
- /// # use regex::Regex;
- /// # fn main() {
- /// let text = "I categorically deny having triskaidekaphobia.";
- /// assert!(Regex::new(r"\b\w{13}\b").unwrap().is_match(text));
- /// # }
- /// ```
- pub fn is_match(&self, text: &str) -> bool {
- self.is_match_at(text, 0)
- }
-
- /// Returns the start and end byte range of the leftmost-first match in
- /// `text`. If no match exists, then `None` is returned.
- ///
- /// Note that this should only be used if you want to discover the position
- /// of the match. Testing the existence of a match is faster if you use
- /// `is_match`.
- ///
- /// # Example
- ///
- /// Find the start and end location of the first word with exactly 13
- /// Unicode word characters:
- ///
- /// ```rust
- /// # use regex::Regex;
- /// # fn main() {
- /// let text = "I categorically deny having triskaidekaphobia.";
- /// let mat = Regex::new(r"\b\w{13}\b").unwrap().find(text).unwrap();
- /// assert_eq!(mat.start(), 2);
- /// assert_eq!(mat.end(), 15);
- /// # }
- /// ```
- pub fn find<'t>(&self, text: &'t str) -> Option<Match<'t>> {
- self.find_at(text, 0)
- }
-
- /// Returns an iterator for each successive non-overlapping match in
- /// `text`, returning the start and end byte indices with respect to
- /// `text`.
- ///
- /// # Example
- ///
- /// Find the start and end location of every word with exactly 13 Unicode
- /// word characters:
- ///
- /// ```rust
- /// # use regex::Regex;
- /// # fn main() {
- /// let text = "Retroactively relinquishing remunerations is reprehensible.";
- /// for mat in Regex::new(r"\b\w{13}\b").unwrap().find_iter(text) {
- /// println!("{:?}", mat);
- /// }
- /// # }
- /// ```
- pub fn find_iter<'r, 't>(&'r self, text: &'t str) -> Matches<'r, 't> {
- Matches(self.0.searcher_str().find_iter(text))
- }
-
- /// Returns the capture groups corresponding to the leftmost-first
- /// match in `text`. Capture group `0` always corresponds to the entire
- /// match. If no match is found, then `None` is returned.
- ///
- /// You should only use `captures` if you need access to the location of
- /// capturing group matches. Otherwise, `find` is faster for discovering
- /// the location of the overall match.
- ///
- /// # Examples
- ///
- /// Say you have some text with movie names and their release years,
- /// like "'Citizen Kane' (1941)". It'd be nice if we could search for text
- /// looking like that, while also extracting the movie name and its release
- /// year separately.
- ///
- /// ```rust
- /// # use regex::Regex;
- /// # fn main() {
- /// let re = Regex::new(r"'([^']+)'\s+\((\d{4})\)").unwrap();
- /// let text = "Not my favorite movie: 'Citizen Kane' (1941).";
- /// let caps = re.captures(text).unwrap();
- /// assert_eq!(caps.get(1).unwrap().as_str(), "Citizen Kane");
- /// assert_eq!(caps.get(2).unwrap().as_str(), "1941");
- /// assert_eq!(caps.get(0).unwrap().as_str(), "'Citizen Kane' (1941)");
- /// // You can also access the groups by index using the Index notation.
- /// // Note that this will panic on an invalid index.
- /// assert_eq!(&caps[1], "Citizen Kane");
- /// assert_eq!(&caps[2], "1941");
- /// assert_eq!(&caps[0], "'Citizen Kane' (1941)");
- /// # }
- /// ```
- ///
- /// Note that the full match is at capture group `0`. Each subsequent
- /// capture group is indexed by the order of its opening `(`.
- ///
- /// We can make this example a bit clearer by using *named* capture groups:
- ///
- /// ```rust
- /// # use regex::Regex;
- /// # fn main() {
- /// let re = Regex::new(r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)")
- /// .unwrap();
- /// let text = "Not my favorite movie: 'Citizen Kane' (1941).";
- /// let caps = re.captures(text).unwrap();
- /// assert_eq!(caps.name("title").unwrap().as_str(), "Citizen Kane");
- /// assert_eq!(caps.name("year").unwrap().as_str(), "1941");
- /// assert_eq!(caps.get(0).unwrap().as_str(), "'Citizen Kane' (1941)");
- /// // You can also access the groups by name using the Index notation.
- /// // Note that this will panic on an invalid group name.
- /// assert_eq!(&caps["title"], "Citizen Kane");
- /// assert_eq!(&caps["year"], "1941");
- /// assert_eq!(&caps[0], "'Citizen Kane' (1941)");
- ///
- /// # }
- /// ```
- ///
- /// Here we name the capture groups, which we can access with the `name`
- /// method or the `Index` notation with a `&str`. Note that the named
- /// capture groups are still accessible with `get` or the `Index` notation
- /// with a `usize`.
- ///
- /// The `0`th capture group is always unnamed, so it must always be
- /// accessed with `get(0)` or `[0]`.
- pub fn captures<'t>(&self, text: &'t str) -> Option<Captures<'t>> {
- self.captures_at(text, 0)
- }
-
- /// Returns an iterator over all the non-overlapping capture groups matched
- /// in `text`. This is operationally the same as `find_iter`, except it
- /// yields information about capturing group matches.
- ///
- /// # Example
- ///
- /// We can use this to find all movie titles and their release years in
- /// some text, where the movie is formatted like "'Title' (xxxx)":
- ///
- /// ```rust
- /// # use regex::Regex;
- /// # fn main() {
- /// let re = Regex::new(r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)")
- /// .unwrap();
- /// let text = "'Citizen Kane' (1941), 'The Wizard of Oz' (1939), 'M' (1931).";
- /// for caps in re.captures_iter(text) {
- /// println!("Movie: {:?}, Released: {:?}",
- /// &caps["title"], &caps["year"]);
- /// }
- /// // Output:
- /// // Movie: Citizen Kane, Released: 1941
- /// // Movie: The Wizard of Oz, Released: 1939
- /// // Movie: M, Released: 1931
- /// # }
- /// ```
- pub fn captures_iter<'r, 't>(
- &'r self,
- text: &'t str,
- ) -> CaptureMatches<'r, 't> {
- CaptureMatches(self.0.searcher_str().captures_iter(text))
- }
-
- /// Returns an iterator of substrings of `text` delimited by a match of the
- /// regular expression. Namely, each element of the iterator corresponds to
- /// text that *isn't* matched by the regular expression.
- ///
- /// This method will *not* copy the text given.
- ///
- /// # Example
- ///
- /// To split a string delimited by arbitrary amounts of spaces or tabs:
- ///
- /// ```rust
- /// # use regex::Regex;
- /// # fn main() {
- /// let re = Regex::new(r"[ \t]+").unwrap();
- /// let fields: Vec<&str> = re.split("a b \t c\td e").collect();
- /// assert_eq!(fields, vec!["a", "b", "c", "d", "e"]);
- /// # }
- /// ```
- pub fn split<'r, 't>(&'r self, text: &'t str) -> Split<'r, 't> {
- Split { finder: self.find_iter(text), last: 0 }
- }
-
- /// Returns an iterator of at most `limit` substrings of `text` delimited
- /// by a match of the regular expression. (A `limit` of `0` will return no
- /// substrings.) Namely, each element of the iterator corresponds to text
- /// that *isn't* matched by the regular expression. The remainder of the
- /// string that is not split will be the last element in the iterator.
- ///
- /// This method will *not* copy the text given.
- ///
- /// # Example
- ///
- /// Get the first two words in some text:
- ///
- /// ```rust
- /// # use regex::Regex;
- /// # fn main() {
- /// let re = Regex::new(r"\W+").unwrap();
- /// let fields: Vec<&str> = re.splitn("Hey! How are you?", 3).collect();
- /// assert_eq!(fields, vec!("Hey", "How", "are you?"));
- /// # }
- /// ```
- pub fn splitn<'r, 't>(
- &'r self,
- text: &'t str,
- limit: usize,
- ) -> SplitN<'r, 't> {
- SplitN { splits: self.split(text), n: limit }
- }
-
- /// Replaces the leftmost-first match with the replacement provided.
- /// The replacement can be a regular string (where `$N` and `$name` are
- /// expanded to match capture groups) or a function that takes the matches'
- /// `Captures` and returns the replaced string.
- ///
- /// If no match is found, then a copy of the string is returned unchanged.
- ///
- /// # Replacement string syntax
- ///
- /// All instances of `$name` in the replacement text is replaced with the
- /// corresponding capture group `name`.
- ///
- /// `name` may be an integer corresponding to the index of the
- /// capture group (counted by order of opening parenthesis where `0` is the
- /// entire match) or it can be a name (consisting of letters, digits or
- /// underscores) corresponding to a named capture group.
- ///
- /// If `name` isn't a valid capture group (whether the name doesn't exist
- /// or isn't a valid index), then it is replaced with the empty string.
- ///
- /// The longest possible name is used. e.g., `$1a` looks up the capture
- /// group named `1a` and not the capture group at index `1`. To exert more
- /// precise control over the name, use braces, e.g., `${1}a`.
- ///
- /// To write a literal `$` use `$$`.
- ///
- /// # Examples
- ///
- /// Note that this function is polymorphic with respect to the replacement.
- /// In typical usage, this can just be a normal string:
- ///
- /// ```rust
- /// # use regex::Regex;
- /// # fn main() {
- /// let re = Regex::new("[^01]+").unwrap();
- /// assert_eq!(re.replace("1078910", ""), "1010");
- /// # }
- /// ```
- ///
- /// But anything satisfying the `Replacer` trait will work. For example,
- /// a closure of type `|&Captures| -> String` provides direct access to the
- /// captures corresponding to a match. This allows one to access
- /// capturing group matches easily:
- ///
- /// ```rust
- /// # use regex::Regex;
- /// # use regex::Captures; fn main() {
- /// let re = Regex::new(r"([^,\s]+),\s+(\S+)").unwrap();
- /// let result = re.replace("Springsteen, Bruce", |caps: &Captures| {
- /// format!("{} {}", &caps[2], &caps[1])
- /// });
- /// assert_eq!(result, "Bruce Springsteen");
- /// # }
- /// ```
- ///
- /// But this is a bit cumbersome to use all the time. Instead, a simple
- /// syntax is supported that expands `$name` into the corresponding capture
- /// group. Here's the last example, but using this expansion technique
- /// with named capture groups:
- ///
- /// ```rust
- /// # use regex::Regex;
- /// # fn main() {
- /// let re = Regex::new(r"(?P<last>[^,\s]+),\s+(?P<first>\S+)").unwrap();
- /// let result = re.replace("Springsteen, Bruce", "$first $last");
- /// assert_eq!(result, "Bruce Springsteen");
- /// # }
- /// ```
- ///
- /// Note that using `$2` instead of `$first` or `$1` instead of `$last`
- /// would produce the same result. To write a literal `$` use `$$`.
- ///
- /// Sometimes the replacement string requires use of curly braces to
- /// delineate a capture group replacement and surrounding literal text.
- /// For example, if we wanted to join two words together with an
- /// underscore:
- ///
- /// ```rust
- /// # use regex::Regex;
- /// # fn main() {
- /// let re = Regex::new(r"(?P<first>\w+)\s+(?P<second>\w+)").unwrap();
- /// let result = re.replace("deep fried", "${first}_$second");
- /// assert_eq!(result, "deep_fried");
- /// # }
- /// ```
- ///
- /// Without the curly braces, the capture group name `first_` would be
- /// used, and since it doesn't exist, it would be replaced with the empty
- /// string.
- ///
- /// Finally, sometimes you just want to replace a literal string with no
- /// regard for capturing group expansion. This can be done by wrapping a
- /// byte string with `NoExpand`:
- ///
- /// ```rust
- /// # use regex::Regex;
- /// # fn main() {
- /// use regex::NoExpand;
- ///
- /// let re = Regex::new(r"(?P<last>[^,\s]+),\s+(\S+)").unwrap();
- /// let result = re.replace("Springsteen, Bruce", NoExpand("$2 $last"));
- /// assert_eq!(result, "$2 $last");
- /// # }
- /// ```
- pub fn replace<'t, R: Replacer>(
- &self,
- text: &'t str,
- rep: R,
- ) -> Cow<'t, str> {
- self.replacen(text, 1, rep)
- }
-
- /// Replaces all non-overlapping matches in `text` with the replacement
- /// provided. This is the same as calling `replacen` with `limit` set to
- /// `0`.
- ///
- /// See the documentation for `replace` for details on how to access
- /// capturing group matches in the replacement string.
- pub fn replace_all<'t, R: Replacer>(
- &self,
- text: &'t str,
- rep: R,
- ) -> Cow<'t, str> {
- self.replacen(text, 0, rep)
- }
-
- /// Replaces at most `limit` non-overlapping matches in `text` with the
- /// replacement provided. If `limit` is 0, then all non-overlapping matches
- /// are replaced.
- ///
- /// See the documentation for `replace` for details on how to access
- /// capturing group matches in the replacement string.
- pub fn replacen<'t, R: Replacer>(
- &self,
- text: &'t str,
- limit: usize,
- mut rep: R,
- ) -> Cow<'t, str> {
- // If we know that the replacement doesn't have any capture expansions,
- // then we can use the fast path. The fast path can make a tremendous
- // difference:
- //
- // 1) We use `find_iter` instead of `captures_iter`. Not asking for
- // captures generally makes the regex engines faster.
- // 2) We don't need to look up all of the capture groups and do
- // replacements inside the replacement string. We just push it
- // at each match and be done with it.
- if let Some(rep) = rep.no_expansion() {
- let mut it = self.find_iter(text).enumerate().peekable();
- if it.peek().is_none() {
- return Cow::Borrowed(text);
- }
- let mut new = String::with_capacity(text.len());
- let mut last_match = 0;
- for (i, m) in it {
- new.push_str(&text[last_match..m.start()]);
- new.push_str(&rep);
- last_match = m.end();
- if limit > 0 && i >= limit - 1 {
- break;
- }
- }
- new.push_str(&text[last_match..]);
- return Cow::Owned(new);
- }
-
- // The slower path, which we use if the replacement needs access to
- // capture groups.
- let mut it = self.captures_iter(text).enumerate().peekable();
- if it.peek().is_none() {
- return Cow::Borrowed(text);
- }
- let mut new = String::with_capacity(text.len());
- let mut last_match = 0;
- for (i, cap) in it {
- // unwrap on 0 is OK because captures only reports matches
- let m = cap.get(0).unwrap();
- new.push_str(&text[last_match..m.start()]);
- rep.replace_append(&cap, &mut new);
- last_match = m.end();
- if limit > 0 && i >= limit - 1 {
- break;
- }
- }
- new.push_str(&text[last_match..]);
- Cow::Owned(new)
- }
-}
-
-/// Advanced or "lower level" search methods.
-impl Regex {
- /// Returns the end location of a match in the text given.
- ///
- /// This method may have the same performance characteristics as
- /// `is_match`, except it provides an end location for a match. In
- /// particular, the location returned *may be shorter* than the proper end
- /// of the leftmost-first match that you would find via `Regex::find`.
- ///
- /// Note that it is not guaranteed that this routine finds the shortest or
- /// "earliest" possible match. Instead, the main idea of this API is that
- /// it returns the offset at the point at which the internal regex engine
- /// has determined that a match has occurred. This may vary depending on
- /// which internal regex engine is used, and thus, the offset itself may
- /// change.
- ///
- /// # Example
- ///
- /// Typically, `a+` would match the entire first sequence of `a` in some
- /// text, but `shortest_match` can give up as soon as it sees the first
- /// `a`.
- ///
- /// ```rust
- /// # use regex::Regex;
- /// # fn main() {
- /// let text = "aaaaa";
- /// let pos = Regex::new(r"a+").unwrap().shortest_match(text);
- /// assert_eq!(pos, Some(1));
- /// # }
- /// ```
- pub fn shortest_match(&self, text: &str) -> Option<usize> {
- self.shortest_match_at(text, 0)
- }
-
- /// Returns the same as `shortest_match`, but starts the search at the
- /// given offset.
- ///
- /// The significance of the starting point is that it takes the surrounding
- /// context into consideration. For example, the `\A` anchor can only match
- /// when `start == 0`.
- pub fn shortest_match_at(
- &self,
- text: &str,
- start: usize,
- ) -> Option<usize> {
- self.0.searcher_str().shortest_match_at(text, start)
- }
-
- /// Returns the same as is_match, but starts the search at the given
- /// offset.
- ///
- /// The significance of the starting point is that it takes the surrounding
- /// context into consideration. For example, the `\A` anchor can only
- /// match when `start == 0`.
- pub fn is_match_at(&self, text: &str, start: usize) -> bool {
- self.0.searcher_str().is_match_at(text, start)
- }
-
- /// Returns the same as find, but starts the search at the given
- /// offset.
- ///
- /// The significance of the starting point is that it takes the surrounding
- /// context into consideration. For example, the `\A` anchor can only
- /// match when `start == 0`.
- pub fn find_at<'t>(
- &self,
- text: &'t str,
- start: usize,
- ) -> Option<Match<'t>> {
- self.0
- .searcher_str()
- .find_at(text, start)
- .map(|(s, e)| Match::new(text, s, e))
- }
-
- /// Returns the same as [`Regex::captures`], but starts the search at the
- /// given offset.
- ///
- /// The significance of the starting point is that it takes the surrounding
- /// context into consideration. For example, the `\A` anchor can only
- /// match when `start == 0`.
- pub fn captures_at<'t>(
- &self,
- text: &'t str,
- start: usize,
- ) -> Option<Captures<'t>> {
- let mut locs = self.capture_locations();
- self.captures_read_at(&mut locs, text, start).map(move |_| Captures {
- text,
- locs: locs.0,
- named_groups: self.0.capture_name_idx().clone(),
- })
- }
-
- /// This is like `captures`, but uses
- /// [`CaptureLocations`](struct.CaptureLocations.html)
- /// instead of
- /// [`Captures`](struct.Captures.html) in order to amortize allocations.
- ///
- /// To create a `CaptureLocations` value, use the
- /// `Regex::capture_locations` method.
- ///
- /// This returns the overall match if this was successful, which is always
- /// equivalence to the `0`th capture group.
- pub fn captures_read<'t>(
- &self,
- locs: &mut CaptureLocations,
- text: &'t str,
- ) -> Option<Match<'t>> {
- self.captures_read_at(locs, text, 0)
- }
-
- /// Returns the same as captures, but starts the search at the given
- /// offset and populates the capture locations given.
- ///
- /// The significance of the starting point is that it takes the surrounding
- /// context into consideration. For example, the `\A` anchor can only
- /// match when `start == 0`.
- pub fn captures_read_at<'t>(
- &self,
- locs: &mut CaptureLocations,
- text: &'t str,
- start: usize,
- ) -> Option<Match<'t>> {
- self.0
- .searcher_str()
- .captures_read_at(&mut locs.0, text, start)
- .map(|(s, e)| Match::new(text, s, e))
- }
-
- /// An undocumented alias for `captures_read_at`.
- ///
- /// The `regex-capi` crate previously used this routine, so to avoid
- /// breaking that crate, we continue to provide the name as an undocumented
- /// alias.
- #[doc(hidden)]
- pub fn read_captures_at<'t>(
- &self,
- locs: &mut CaptureLocations,
- text: &'t str,
- start: usize,
- ) -> Option<Match<'t>> {
- self.captures_read_at(locs, text, start)
- }
-}
-
-/// Auxiliary methods.
-impl Regex {
- /// Returns the original string of this regex.
- pub fn as_str(&self) -> &str {
- &self.0.regex_strings()[0]
- }
-
- /// Returns an iterator over the capture names.
- pub fn capture_names(&self) -> CaptureNames<'_> {
- CaptureNames(self.0.capture_names().iter())
- }
-
- /// Returns the number of captures.
- pub fn captures_len(&self) -> usize {
- self.0.capture_names().len()
- }
-
- /// Returns the total number of capturing groups that appear in every
- /// possible match.
- ///
- /// If the number of capture groups can vary depending on the match, then
- /// this returns `None`. That is, a value is only returned when the number
- /// of matching groups is invariant or "static."
- ///
- /// Note that like [`Regex::captures_len`], this **does** include the
- /// implicit capturing group corresponding to the entire match. Therefore,
- /// when a non-None value is returned, it is guaranteed to be at least `1`.
- /// Stated differently, a return value of `Some(0)` is impossible.
- ///
- /// # Example
- ///
- /// This shows a few cases where a static number of capture groups is
- /// available and a few cases where it is not.
- ///
- /// ```
- /// use regex::Regex;
- ///
- /// let len = |pattern| {
- /// Regex::new(pattern).map(|re| re.static_captures_len())
- /// };
- ///
- /// assert_eq!(Some(1), len("a")?);
- /// assert_eq!(Some(2), len("(a)")?);
- /// assert_eq!(Some(2), len("(a)|(b)")?);
- /// assert_eq!(Some(3), len("(a)(b)|(c)(d)")?);
- /// assert_eq!(None, len("(a)|b")?);
- /// assert_eq!(None, len("a|(b)")?);
- /// assert_eq!(None, len("(b)*")?);
- /// assert_eq!(Some(2), len("(b)+")?);
- ///
- /// # Ok::<(), Box<dyn std::error::Error>>(())
- /// ```
- #[inline]
- pub fn static_captures_len(&self) -> Option<usize> {
- self.0.static_captures_len().map(|len| len.saturating_add(1))
- }
-
- /// Returns an empty set of capture locations that can be reused in
- /// multiple calls to `captures_read` or `captures_read_at`.
- pub fn capture_locations(&self) -> CaptureLocations {
- CaptureLocations(self.0.searcher_str().locations())
- }
-
- /// An alias for `capture_locations` to preserve backward compatibility.
- ///
- /// The `regex-capi` crate uses this method, so to avoid breaking that
- /// crate, we continue to export it as an undocumented API.
- #[doc(hidden)]
- pub fn locations(&self) -> CaptureLocations {
- CaptureLocations(self.0.searcher_str().locations())
- }
-}
-
-/// An iterator over the names of all possible captures.
-///
-/// `None` indicates an unnamed capture; the first element (capture 0, the
-/// whole matched region) is always unnamed.
-///
-/// `'r` is the lifetime of the compiled regular expression.
-#[derive(Clone, Debug)]
-pub struct CaptureNames<'r>(::std::slice::Iter<'r, Option<String>>);
-
-impl<'r> Iterator for CaptureNames<'r> {
- type Item = Option<&'r str>;
-
- fn next(&mut self) -> Option<Option<&'r str>> {
- self.0
- .next()
- .as_ref()
- .map(|slot| slot.as_ref().map(|name| name.as_ref()))
- }
-
- fn size_hint(&self) -> (usize, Option<usize>) {
- self.0.size_hint()
- }
-
- fn count(self) -> usize {
- self.0.count()
- }
-}
-
-impl<'r> ExactSizeIterator for CaptureNames<'r> {}
-
-impl<'r> FusedIterator for CaptureNames<'r> {}
-
-/// Yields all substrings delimited by a regular expression match.
-///
-/// `'r` is the lifetime of the compiled regular expression and `'t` is the
-/// lifetime of the string being split.
-#[derive(Debug)]
-pub struct Split<'r, 't> {
- finder: Matches<'r, 't>,
- last: usize,
-}
-
-impl<'r, 't> Iterator for Split<'r, 't> {
- type Item = &'t str;
-
- fn next(&mut self) -> Option<&'t str> {
- let text = self.finder.0.text();
- match self.finder.next() {
- None => {
- if self.last > text.len() {
- None
- } else {
- let s = &text[self.last..];
- self.last = text.len() + 1; // Next call will return None
- Some(s)
- }
- }
- Some(m) => {
- let matched = &text[self.last..m.start()];
- self.last = m.end();
- Some(matched)
- }
- }
- }
-}
-
-impl<'r, 't> FusedIterator for Split<'r, 't> {}
-
-/// Yields at most `N` substrings delimited by a regular expression match.
-///
-/// The last substring will be whatever remains after splitting.
-///
-/// `'r` is the lifetime of the compiled regular expression and `'t` is the
-/// lifetime of the string being split.
-#[derive(Debug)]
-pub struct SplitN<'r, 't> {
- splits: Split<'r, 't>,
- n: usize,
-}
-
-impl<'r, 't> Iterator for SplitN<'r, 't> {
- type Item = &'t str;
-
- fn next(&mut self) -> Option<&'t str> {
- if self.n == 0 {
- return None;
- }
-
- self.n -= 1;
- if self.n > 0 {
- return self.splits.next();
- }
-
- let text = self.splits.finder.0.text();
- if self.splits.last > text.len() {
- // We've already returned all substrings.
- None
- } else {
- // self.n == 0, so future calls will return None immediately
- Some(&text[self.splits.last..])
- }
- }
-
- fn size_hint(&self) -> (usize, Option<usize>) {
- (0, Some(self.n))
- }
-}
-
-impl<'r, 't> FusedIterator for SplitN<'r, 't> {}
-
-/// CaptureLocations is a low level representation of the raw offsets of each
-/// submatch.
-///
-/// You can think of this as a lower level
-/// [`Captures`](struct.Captures.html), where this type does not support
-/// named capturing groups directly and it does not borrow the text that these
-/// offsets were matched on.
-///
-/// Primarily, this type is useful when using the lower level `Regex` APIs
-/// such as `read_captures`, which permits amortizing the allocation in which
-/// capture match locations are stored.
-///
-/// In order to build a value of this type, you'll need to call the
-/// `capture_locations` method on the `Regex` being used to execute the search.
-/// The value returned can then be reused in subsequent searches.
-///
-/// # Example
-///
-/// This example shows how to create and use `CaptureLocations` in a search.
-///
-/// ```
-/// use regex::Regex;
-///
-/// let re = Regex::new(r"(?<first>\w+)\s+(?<last>\w+)").unwrap();
-/// let mut locs = re.capture_locations();
-/// let m = re.captures_read(&mut locs, "Bruce Springsteen").unwrap();
-/// assert_eq!(0..17, m.range());
-/// assert_eq!(Some((0, 17)), locs.get(0));
-/// assert_eq!(Some((0, 5)), locs.get(1));
-/// assert_eq!(Some((6, 17)), locs.get(2));
-///
-/// // Asking for an invalid capture group always returns None.
-/// assert_eq!(None, locs.get(3));
-/// assert_eq!(None, locs.get(34973498648));
-/// assert_eq!(None, locs.get(9944060567225171988));
-/// ```
-#[derive(Clone, Debug)]
-pub struct CaptureLocations(re_trait::Locations);
-
-/// A type alias for `CaptureLocations` for backwards compatibility.
-///
-/// Previously, we exported `CaptureLocations` as `Locations` in an
-/// undocumented API. To prevent breaking that code (e.g., in `regex-capi`),
-/// we continue re-exporting the same undocumented API.
-#[doc(hidden)]
-pub type Locations = CaptureLocations;
-
-impl CaptureLocations {
- /// Returns the start and end positions of the Nth capture group. Returns
- /// `None` if `i` is not a valid capture group or if the capture group did
- /// not match anything. The positions returned are *always* byte indices
- /// with respect to the original string matched.
- #[inline]
- pub fn get(&self, i: usize) -> Option<(usize, usize)> {
- self.0.pos(i)
- }
-
- /// Returns the total number of capture groups (even if they didn't match).
- ///
- /// This is always at least `1` since every regex has at least `1`
- /// capturing group that corresponds to the entire match.
- #[inline]
- pub fn len(&self) -> usize {
- self.0.len()
- }
-
- /// An alias for the `get` method for backwards compatibility.
- ///
- /// Previously, we exported `get` as `pos` in an undocumented API. To
- /// prevent breaking that code (e.g., in `regex-capi`), we continue
- /// re-exporting the same undocumented API.
- #[doc(hidden)]
- #[inline]
- pub fn pos(&self, i: usize) -> Option<(usize, usize)> {
- self.get(i)
- }
-}
-
-/// Captures represents a group of captured strings for a single match.
-///
-/// The 0th capture always corresponds to the entire match. Each subsequent
-/// index corresponds to the next capture group in the regex. If a capture
-/// group is named, then the matched string is *also* available via the `name`
-/// method. (Note that the 0th capture is always unnamed and so must be
-/// accessed with the `get` method.)
-///
-/// Positions returned from a capture group are always byte indices.
-///
-/// `'t` is the lifetime of the matched text.
-pub struct Captures<'t> {
- text: &'t str,
- locs: re_trait::Locations,
- named_groups: Arc<HashMap<String, usize>>,
-}
-
-impl<'t> Captures<'t> {
- /// Returns the match associated with the capture group at index `i`. If
- /// `i` does not correspond to a capture group, or if the capture group
- /// did not participate in the match, then `None` is returned.
- ///
- /// # Examples
- ///
- /// Get the text of the match with a default of an empty string if this
- /// group didn't participate in the match:
- ///
- /// ```rust
- /// # use regex::Regex;
- /// let re = Regex::new(r"[a-z]+(?:([0-9]+)|([A-Z]+))").unwrap();
- /// let caps = re.captures("abc123").unwrap();
- ///
- /// let text1 = caps.get(1).map_or("", |m| m.as_str());
- /// let text2 = caps.get(2).map_or("", |m| m.as_str());
- /// assert_eq!(text1, "123");
- /// assert_eq!(text2, "");
- /// ```
- pub fn get(&self, i: usize) -> Option<Match<'t>> {
- self.locs.pos(i).map(|(s, e)| Match::new(self.text, s, e))
- }
-
- /// Returns the match for the capture group named `name`. If `name` isn't a
- /// valid capture group or didn't match anything, then `None` is returned.
- pub fn name(&self, name: &str) -> Option<Match<'t>> {
- self.named_groups.get(name).and_then(|&i| self.get(i))
- }
-
- /// An iterator that yields all capturing matches in the order in which
- /// they appear in the regex. If a particular capture group didn't
- /// participate in the match, then `None` is yielded for that capture.
- ///
- /// The first match always corresponds to the overall match of the regex.
- pub fn iter<'c>(&'c self) -> SubCaptureMatches<'c, 't> {
- SubCaptureMatches { caps: self, it: self.locs.iter() }
- }
-
- /// Expands all instances of `$name` in `replacement` to the corresponding
- /// capture group `name`, and writes them to the `dst` buffer given.
- ///
- /// `name` may be an integer corresponding to the index of the capture
- /// group (counted by order of opening parenthesis where `0` is the
- /// entire match) or it can be a name (consisting of letters, digits or
- /// underscores) corresponding to a named capture group.
- ///
- /// If `name` isn't a valid capture group (whether the name doesn't exist
- /// or isn't a valid index), then it is replaced with the empty string.
- ///
- /// The longest possible name consisting of the characters `[_0-9A-Za-z]`
- /// is used. e.g., `$1a` looks up the capture group named `1a` and not the
- /// capture group at index `1`. To exert more precise control over the
- /// name, or to refer to a capture group name that uses characters outside
- /// of `[_0-9A-Za-z]`, use braces, e.g., `${1}a` or `${foo[bar].baz}`. When
- /// using braces, any sequence of characters is permitted. If the sequence
- /// does not refer to a capture group name in the corresponding regex, then
- /// it is replaced with an empty string.
- ///
- /// To write a literal `$` use `$$`.
- pub fn expand(&self, replacement: &str, dst: &mut String) {
- expand_str(self, replacement, dst)
- }
-
- /// Returns the total number of capture groups (even if they didn't match).
- ///
- /// This is always at least `1`, since every regex has at least one capture
- /// group that corresponds to the full match.
- #[inline]
- pub fn len(&self) -> usize {
- self.locs.len()
- }
-}
-
-impl<'t> fmt::Debug for Captures<'t> {
- fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
- f.debug_tuple("Captures").field(&CapturesDebug(self)).finish()
- }
-}
-
-struct CapturesDebug<'c, 't>(&'c Captures<'t>);
-
-impl<'c, 't> fmt::Debug for CapturesDebug<'c, 't> {
- fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
- // We'd like to show something nice here, even if it means an
- // allocation to build a reverse index.
- let slot_to_name: HashMap<&usize, &String> =
- self.0.named_groups.iter().map(|(a, b)| (b, a)).collect();
- let mut map = f.debug_map();
- for (slot, m) in self.0.locs.iter().enumerate() {
- let m = m.map(|(s, e)| &self.0.text[s..e]);
- if let Some(name) = slot_to_name.get(&slot) {
- map.entry(&name, &m);
- } else {
- map.entry(&slot, &m);
- }
- }
- map.finish()
- }
-}
-
-/// Get a group by index.
-///
-/// `'t` is the lifetime of the matched text.
-///
-/// The text can't outlive the `Captures` object if this method is
-/// used, because of how `Index` is defined (normally `a[i]` is part
-/// of `a` and can't outlive it); to do that, use `get()` instead.
-///
-/// # Panics
-///
-/// If there is no group at the given index.
-impl<'t> Index<usize> for Captures<'t> {
- type Output = str;
-
- fn index(&self, i: usize) -> &str {
- self.get(i)
- .map(|m| m.as_str())
- .unwrap_or_else(|| panic!("no group at index '{}'", i))
- }
-}
-
-/// Get a group by name.
-///
-/// `'t` is the lifetime of the matched text and `'i` is the lifetime
-/// of the group name (the index).
-///
-/// The text can't outlive the `Captures` object if this method is
-/// used, because of how `Index` is defined (normally `a[i]` is part
-/// of `a` and can't outlive it); to do that, use `name` instead.
-///
-/// # Panics
-///
-/// If there is no group named by the given value.
-impl<'t, 'i> Index<&'i str> for Captures<'t> {
- type Output = str;
-
- fn index<'a>(&'a self, name: &'i str) -> &'a str {
- self.name(name)
- .map(|m| m.as_str())
- .unwrap_or_else(|| panic!("no group named '{}'", name))
- }
-}
-
-/// An iterator that yields all capturing matches in the order in which they
-/// appear in the regex.
-///
-/// If a particular capture group didn't participate in the match, then `None`
-/// is yielded for that capture. The first match always corresponds to the
-/// overall match of the regex.
-///
-/// The lifetime `'c` corresponds to the lifetime of the `Captures` value, and
-/// the lifetime `'t` corresponds to the originally matched text.
-#[derive(Clone, Debug)]
-pub struct SubCaptureMatches<'c, 't> {
- caps: &'c Captures<'t>,
- it: SubCapturesPosIter<'c>,
-}
-
-impl<'c, 't> Iterator for SubCaptureMatches<'c, 't> {
- type Item = Option<Match<'t>>;
-
- fn next(&mut self) -> Option<Option<Match<'t>>> {
- self.it
- .next()
- .map(|cap| cap.map(|(s, e)| Match::new(self.caps.text, s, e)))
- }
-
- fn size_hint(&self) -> (usize, Option<usize>) {
- self.it.size_hint()
- }
-
- fn count(self) -> usize {
- self.it.count()
- }
-}
-
-impl<'c, 't> ExactSizeIterator for SubCaptureMatches<'c, 't> {}
-
-impl<'c, 't> FusedIterator for SubCaptureMatches<'c, 't> {}
-
-/// An iterator that yields all non-overlapping capture groups matching a
-/// particular regular expression.
-///
-/// The iterator stops when no more matches can be found.
-///
-/// `'r` is the lifetime of the compiled regular expression and `'t` is the
-/// lifetime of the matched string.
-#[derive(Debug)]
-pub struct CaptureMatches<'r, 't>(
- re_trait::CaptureMatches<'t, ExecNoSyncStr<'r>>,
-);
-
-impl<'r, 't> Iterator for CaptureMatches<'r, 't> {
- type Item = Captures<'t>;
-
- fn next(&mut self) -> Option<Captures<'t>> {
- self.0.next().map(|locs| Captures {
- text: self.0.text(),
- locs,
- named_groups: self.0.regex().capture_name_idx().clone(),
- })
- }
-}
-
-impl<'r, 't> FusedIterator for CaptureMatches<'r, 't> {}
-
-/// An iterator over all non-overlapping matches for a particular string.
-///
-/// The iterator yields a `Match` value. The iterator stops when no more
-/// matches can be found.
-///
-/// `'r` is the lifetime of the compiled regular expression and `'t` is the
-/// lifetime of the matched string.
-#[derive(Debug)]
-pub struct Matches<'r, 't>(re_trait::Matches<'t, ExecNoSyncStr<'r>>);
-
-impl<'r, 't> Iterator for Matches<'r, 't> {
- type Item = Match<'t>;
-
- fn next(&mut self) -> Option<Match<'t>> {
- let text = self.0.text();
- self.0.next().map(|(s, e)| Match::new(text, s, e))
- }
-}
-
-impl<'r, 't> FusedIterator for Matches<'r, 't> {}
-
-/// Replacer describes types that can be used to replace matches in a string.
-///
-/// In general, users of this crate shouldn't need to implement this trait,
-/// since implementations are already provided for `&str` along with other
-/// variants of string types and `FnMut(&Captures) -> String` (or any
-/// `FnMut(&Captures) -> T` where `T: AsRef<str>`), which covers most use cases.
-pub trait Replacer {
- /// Appends text to `dst` to replace the current match.
- ///
- /// The current match is represented by `caps`, which is guaranteed to
- /// have a match at capture group `0`.
- ///
- /// For example, a no-op replacement would be
- /// `dst.push_str(caps.get(0).unwrap().as_str())`.
- fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String);
-
- /// Return a fixed unchanging replacement string.
- ///
- /// When doing replacements, if access to `Captures` is not needed (e.g.,
- /// the replacement byte string does not need `$` expansion), then it can
- /// be beneficial to avoid finding sub-captures.
- ///
- /// In general, this is called once for every call to `replacen`.
- fn no_expansion<'r>(&'r mut self) -> Option<Cow<'r, str>> {
- None
- }
-
- /// Return a `Replacer` that borrows and wraps this `Replacer`.
- ///
- /// This is useful when you want to take a generic `Replacer` (which might
- /// not be cloneable) and use it without consuming it, so it can be used
- /// more than once.
- ///
- /// # Example
- ///
- /// ```
- /// use regex::{Regex, Replacer};
- ///
- /// fn replace_all_twice<R: Replacer>(
- /// re: Regex,
- /// src: &str,
- /// mut rep: R,
- /// ) -> String {
- /// let dst = re.replace_all(src, rep.by_ref());
- /// let dst = re.replace_all(&dst, rep.by_ref());
- /// dst.into_owned()
- /// }
- /// ```
- fn by_ref<'r>(&'r mut self) -> ReplacerRef<'r, Self> {
- ReplacerRef(self)
- }
-}
-
-/// By-reference adaptor for a `Replacer`
-///
-/// Returned by [`Replacer::by_ref`](trait.Replacer.html#method.by_ref).
-#[derive(Debug)]
-pub struct ReplacerRef<'a, R: ?Sized>(&'a mut R);
-
-impl<'a, R: Replacer + ?Sized + 'a> Replacer for ReplacerRef<'a, R> {
- fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) {
- self.0.replace_append(caps, dst)
- }
- fn no_expansion(&mut self) -> Option<Cow<'_, str>> {
- self.0.no_expansion()
- }
-}
-
-impl<'a> Replacer for &'a str {
- fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) {
- caps.expand(*self, dst);
- }
-
- fn no_expansion(&mut self) -> Option<Cow<'_, str>> {
- no_expansion(self)
- }
-}
-
-impl<'a> Replacer for &'a String {
- fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) {
- self.as_str().replace_append(caps, dst)
- }
-
- fn no_expansion(&mut self) -> Option<Cow<'_, str>> {
- no_expansion(self)
- }
-}
-
-impl Replacer for String {
- fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) {
- self.as_str().replace_append(caps, dst)
- }
-
- fn no_expansion(&mut self) -> Option<Cow<'_, str>> {
- no_expansion(self)
- }
-}
-
-impl<'a> Replacer for Cow<'a, str> {
- fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) {
- self.as_ref().replace_append(caps, dst)
- }
-
- fn no_expansion(&mut self) -> Option<Cow<'_, str>> {
- no_expansion(self)
- }
-}
-
-impl<'a> Replacer for &'a Cow<'a, str> {
- fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) {
- self.as_ref().replace_append(caps, dst)
- }
-
- fn no_expansion(&mut self) -> Option<Cow<'_, str>> {
- no_expansion(self)
- }
-}
-
-fn no_expansion<T: AsRef<str>>(t: &T) -> Option<Cow<'_, str>> {
- let s = t.as_ref();
- match find_byte(b'$', s.as_bytes()) {
- Some(_) => None,
- None => Some(Cow::Borrowed(s)),
- }
-}
-
-impl<F, T> Replacer for F
-where
- F: FnMut(&Captures<'_>) -> T,
- T: AsRef<str>,
-{
- fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) {
- dst.push_str((*self)(caps).as_ref());
- }
-}
-
-/// `NoExpand` indicates literal string replacement.
-///
-/// It can be used with `replace` and `replace_all` to do a literal string
-/// replacement without expanding `$name` to their corresponding capture
-/// groups. This can be both convenient (to avoid escaping `$`, for example)
-/// and performant (since capture groups don't need to be found).
-///
-/// `'t` is the lifetime of the literal text.
-#[derive(Clone, Debug)]
-pub struct NoExpand<'t>(pub &'t str);
-
-impl<'t> Replacer for NoExpand<'t> {
- fn replace_append(&mut self, _: &Captures<'_>, dst: &mut String) {
- dst.push_str(self.0);
- }
-
- fn no_expansion(&mut self) -> Option<Cow<'_, str>> {
- Some(Cow::Borrowed(self.0))
- }
-}
diff --git a/vendor/regex/src/regex/bytes.rs b/vendor/regex/src/regex/bytes.rs
new file mode 100644
index 000000000..19f5701af
--- /dev/null
+++ b/vendor/regex/src/regex/bytes.rs
@@ -0,0 +1,2600 @@
+use alloc::{borrow::Cow, string::String, sync::Arc, vec::Vec};
+
+use regex_automata::{meta, util::captures, Input, PatternID};
+
+use crate::{bytes::RegexBuilder, error::Error};
+
+/// A compiled regular expression for searching byte (`&[u8]`) haystacks.
+///
+/// A `Regex` can be used to search haystacks, split haystacks into substrings
+/// or replace substrings in a haystack with a different substring. All
+/// searching is done with an implicit `(?s:.)*?` at the beginning and end of
+/// a pattern. To force an expression to match the whole string (or a prefix
+/// or a suffix), you must use an anchor like `^` or `$` (or `\A` and `\z`).
+///
+/// Like the `Regex` type in the parent module, matches with this regex return
+/// byte offsets into the haystack. **Unlike** the parent `Regex` type, these
+/// byte offsets may not correspond to UTF-8 sequence boundaries since the
+/// regexes in this module can match arbitrary bytes.
+///
+/// The only methods that allocate new byte strings are the string replacement
+/// methods. All other methods (searching and splitting) return borrowed
+/// references into the haystack given.
+///
+/// # Example
+///
+/// Find the offsets of a US phone number:
+///
+/// ```
+/// use regex::bytes::Regex;
+///
+/// let re = Regex::new("[0-9]{3}-[0-9]{3}-[0-9]{4}").unwrap();
+/// let m = re.find(b"phone: 111-222-3333").unwrap();
+/// assert_eq!(7..19, m.range());
+/// ```
+///
+/// # Example: extracting capture groups
+///
+/// A common way to use regexes is with capture groups. That is, instead of
+/// just looking for matches of an entire regex, parentheses are used to create
+/// groups that represent part of the match.
+///
+/// For example, consider a haystack with multiple lines, and each line has
+/// three whitespace delimited fields where the second field is expected to be
+/// a number and the third field a boolean. To make this convenient, we use
+/// the [`Captures::extract`] API to put the strings that match each group
+/// into a fixed size array:
+///
+/// ```
+/// use regex::bytes::Regex;
+///
+/// let hay = b"
+/// rabbit 54 true
+/// groundhog 2 true
+/// does not match
+/// fox 109 false
+/// ";
+/// let re = Regex::new(r"(?m)^\s*(\S+)\s+([0-9]+)\s+(true|false)\s*$").unwrap();
+/// let mut fields: Vec<(&[u8], i64, bool)> = vec![];
+/// for (_, [f1, f2, f3]) in re.captures_iter(hay).map(|caps| caps.extract()) {
+/// // These unwraps are OK because our pattern is written in a way where
+/// // all matches for f2 and f3 will be valid UTF-8.
+/// let f2 = std::str::from_utf8(f2).unwrap();
+/// let f3 = std::str::from_utf8(f3).unwrap();
+/// fields.push((f1, f2.parse()?, f3.parse()?));
+/// }
+/// assert_eq!(fields, vec![
+/// (&b"rabbit"[..], 54, true),
+/// (&b"groundhog"[..], 2, true),
+/// (&b"fox"[..], 109, false),
+/// ]);
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+///
+/// # Example: matching invalid UTF-8
+///
+/// One of the reasons for searching `&[u8]` haystacks is that the `&[u8]`
+/// might not be valid UTF-8. Indeed, with a `bytes::Regex`, patterns that
+/// match invalid UTF-8 are explicitly allowed. Here's one example that looks
+/// for valid UTF-8 fields that might be separated by invalid UTF-8. In this
+/// case, we use `(?s-u:.)`, which matches any byte. Attempting to use it in a
+/// top-level `Regex` will result in the regex failing to compile. Notice also
+/// that we use `.` with Unicode mode enabled, in which case, only valid UTF-8
+/// is matched. In this way, we can build one pattern where some parts only
+/// match valid UTF-8 while other parts are more permissive.
+///
+/// ```
+/// use regex::bytes::Regex;
+///
+/// // F0 9F 92 A9 is the UTF-8 encoding for a Pile of Poo.
+/// let hay = b"\xFF\xFFfoo\xFF\xFF\xFF\xF0\x9F\x92\xA9\xFF";
+/// // An equivalent to '(?s-u:.)' is '(?-u:[\x00-\xFF])'.
+/// let re = Regex::new(r"(?s)(?-u:.)*?(?<f1>.+)(?-u:.)*?(?<f2>.+)").unwrap();
+/// let caps = re.captures(hay).unwrap();
+/// assert_eq!(&caps["f1"], &b"foo"[..]);
+/// assert_eq!(&caps["f2"], "💩".as_bytes());
+/// ```
+#[derive(Clone)]
+pub struct Regex {
+ pub(crate) meta: meta::Regex,
+ pub(crate) pattern: Arc<str>,
+}
+
+impl core::fmt::Display for Regex {
+ /// Shows the original regular expression.
+ fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+ write!(f, "{}", self.as_str())
+ }
+}
+
+impl core::fmt::Debug for Regex {
+ /// Shows the original regular expression.
+ fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+ f.debug_tuple("Regex").field(&self.as_str()).finish()
+ }
+}
+
+impl core::str::FromStr for Regex {
+ type Err = Error;
+
+ /// Attempts to parse a string into a regular expression
+ fn from_str(s: &str) -> Result<Regex, Error> {
+ Regex::new(s)
+ }
+}
+
+impl TryFrom<&str> for Regex {
+ type Error = Error;
+
+ /// Attempts to parse a string into a regular expression
+ fn try_from(s: &str) -> Result<Regex, Error> {
+ Regex::new(s)
+ }
+}
+
+impl TryFrom<String> for Regex {
+ type Error = Error;
+
+ /// Attempts to parse a string into a regular expression
+ fn try_from(s: String) -> Result<Regex, Error> {
+ Regex::new(&s)
+ }
+}
+
+/// Core regular expression methods.
+impl Regex {
+ /// Compiles a regular expression. Once compiled, it can be used repeatedly
+ /// to search, split or replace substrings in a haystack.
+ ///
+ /// Note that regex compilation tends to be a somewhat expensive process,
+ /// and unlike higher level environments, compilation is not automatically
+ /// cached for you. One should endeavor to compile a regex once and then
+ /// reuse it. For example, it's a bad idea to compile the same regex
+ /// repeatedly in a loop.
+ ///
+ /// # Errors
+ ///
+ /// If an invalid pattern is given, then an error is returned.
+ /// An error is also returned if the pattern is valid, but would
+ /// produce a regex that is bigger than the configured size limit via
+ /// [`RegexBuilder::size_limit`]. (A reasonable size limit is enabled by
+ /// default.)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::bytes::Regex;
+ ///
+ /// // An invalid pattern because of an unclosed parenthesis
+ /// assert!(Regex::new(r"foo(bar").is_err());
+ /// // An invalid pattern because the regex would be too big
+ /// // because Unicode tends to inflate things.
+ /// assert!(Regex::new(r"\w{1000}").is_err());
+ /// // Disabling Unicode can make the regex much smaller,
+ /// // potentially by up to or more than an order of magnitude.
+ /// assert!(Regex::new(r"(?-u:\w){1000}").is_ok());
+ /// ```
+ pub fn new(re: &str) -> Result<Regex, Error> {
+ RegexBuilder::new(re).build()
+ }
+
+ /// Returns true if and only if there is a match for the regex anywhere
+ /// in the haystack given.
+ ///
+ /// It is recommended to use this method if all you need to do is test
+ /// whether a match exists, since the underlying matching engine may be
+ /// able to do less work.
+ ///
+ /// # Example
+ ///
+ /// Test if some haystack contains at least one word with exactly 13
+ /// Unicode word characters:
+ ///
+ /// ```
+ /// use regex::bytes::Regex;
+ ///
+ /// let re = Regex::new(r"\b\w{13}\b").unwrap();
+ /// let hay = b"I categorically deny having triskaidekaphobia.";
+ /// assert!(re.is_match(hay));
+ /// ```
+ #[inline]
+ pub fn is_match(&self, haystack: &[u8]) -> bool {
+ self.is_match_at(haystack, 0)
+ }
+
+ /// This routine searches for the first match of this regex in the
+ /// haystack given, and if found, returns a [`Match`]. The `Match`
+ /// provides access to both the byte offsets of the match and the actual
+ /// substring that matched.
+ ///
+ /// Note that this should only be used if you want to find the entire
+ /// match. If instead you just want to test the existence of a match,
+ /// it's potentially faster to use `Regex::is_match(hay)` instead of
+ /// `Regex::find(hay).is_some()`.
+ ///
+ /// # Example
+ ///
+ /// Find the first word with exactly 13 Unicode word characters:
+ ///
+ /// ```
+ /// use regex::bytes::Regex;
+ ///
+ /// let re = Regex::new(r"\b\w{13}\b").unwrap();
+ /// let hay = b"I categorically deny having triskaidekaphobia.";
+ /// let mat = re.find(hay).unwrap();
+ /// assert_eq!(2..15, mat.range());
+ /// assert_eq!(b"categorically", mat.as_bytes());
+ /// ```
+ #[inline]
+ pub fn find<'h>(&self, haystack: &'h [u8]) -> Option<Match<'h>> {
+ self.find_at(haystack, 0)
+ }
+
+ /// Returns an iterator that yields successive non-overlapping matches in
+ /// the given haystack. The iterator yields values of type [`Match`].
+ ///
+ /// # Time complexity
+ ///
+ /// Note that since `find_iter` runs potentially many searches on the
+ /// haystack and since each search has worst case `O(m * n)` time
+ /// complexity, the overall worst case time complexity for iteration is
+ /// `O(m * n^2)`.
+ ///
+ /// # Example
+ ///
+ /// Find every word with exactly 13 Unicode word characters:
+ ///
+ /// ```
+ /// use regex::bytes::Regex;
+ ///
+ /// let re = Regex::new(r"\b\w{13}\b").unwrap();
+ /// let hay = b"Retroactively relinquishing remunerations is reprehensible.";
+ /// let matches: Vec<_> = re.find_iter(hay).map(|m| m.as_bytes()).collect();
+ /// assert_eq!(matches, vec![
+ /// &b"Retroactively"[..],
+ /// &b"relinquishing"[..],
+ /// &b"remunerations"[..],
+ /// &b"reprehensible"[..],
+ /// ]);
+ /// ```
+ #[inline]
+ pub fn find_iter<'r, 'h>(&'r self, haystack: &'h [u8]) -> Matches<'r, 'h> {
+ Matches { haystack, it: self.meta.find_iter(haystack) }
+ }
+
+ /// This routine searches for the first match of this regex in the haystack
+ /// given, and if found, returns not only the overall match but also the
+ /// matches of each capture group in the regex. If no match is found, then
+ /// `None` is returned.
+ ///
+ /// Capture group `0` always corresponds to an implicit unnamed group that
+ /// includes the entire match. If a match is found, this group is always
+ /// present. Subsequent groups may be named and are numbered, starting
+ /// at 1, by the order in which the opening parenthesis appears in the
+ /// pattern. For example, in the pattern `(?<a>.(?<b>.))(?<c>.)`, `a`,
+ /// `b` and `c` correspond to capture group indices `1`, `2` and `3`,
+ /// respectively.
+ ///
+ /// You should only use `captures` if you need access to the capture group
+ /// matches. Otherwise, [`Regex::find`] is generally faster for discovering
+ /// just the overall match.
+ ///
+ /// # Example
+ ///
+ /// Say you have some haystack with movie names and their release years,
+ /// like "'Citizen Kane' (1941)". It'd be nice if we could search for
+ /// strings looking like that, while also extracting the movie name and its
+ /// release year separately. The example below shows how to do that.
+ ///
+ /// ```
+ /// use regex::bytes::Regex;
+ ///
+ /// let re = Regex::new(r"'([^']+)'\s+\((\d{4})\)").unwrap();
+ /// let hay = b"Not my favorite movie: 'Citizen Kane' (1941).";
+ /// let caps = re.captures(hay).unwrap();
+ /// assert_eq!(caps.get(0).unwrap().as_bytes(), b"'Citizen Kane' (1941)");
+ /// assert_eq!(caps.get(1).unwrap().as_bytes(), b"Citizen Kane");
+ /// assert_eq!(caps.get(2).unwrap().as_bytes(), b"1941");
+ /// // You can also access the groups by index using the Index notation.
+ /// // Note that this will panic on an invalid index. In this case, these
+ /// // accesses are always correct because the overall regex will only
+ /// // match when these capture groups match.
+ /// assert_eq!(&caps[0], b"'Citizen Kane' (1941)");
+ /// assert_eq!(&caps[1], b"Citizen Kane");
+ /// assert_eq!(&caps[2], b"1941");
+ /// ```
+ ///
+ /// Note that the full match is at capture group `0`. Each subsequent
+ /// capture group is indexed by the order of its opening `(`.
+ ///
+ /// We can make this example a bit clearer by using *named* capture groups:
+ ///
+ /// ```
+ /// use regex::bytes::Regex;
+ ///
+ /// let re = Regex::new(r"'(?<title>[^']+)'\s+\((?<year>\d{4})\)").unwrap();
+ /// let hay = b"Not my favorite movie: 'Citizen Kane' (1941).";
+ /// let caps = re.captures(hay).unwrap();
+ /// assert_eq!(caps.get(0).unwrap().as_bytes(), b"'Citizen Kane' (1941)");
+ /// assert_eq!(caps.name("title").unwrap().as_bytes(), b"Citizen Kane");
+ /// assert_eq!(caps.name("year").unwrap().as_bytes(), b"1941");
+ /// // You can also access the groups by name using the Index notation.
+ /// // Note that this will panic on an invalid group name. In this case,
+ /// // these accesses are always correct because the overall regex will
+ /// // only match when these capture groups match.
+ /// assert_eq!(&caps[0], b"'Citizen Kane' (1941)");
+ /// assert_eq!(&caps["title"], b"Citizen Kane");
+ /// assert_eq!(&caps["year"], b"1941");
+ /// ```
+ ///
+ /// Here we name the capture groups, which we can access with the `name`
+ /// method or the `Index` notation with a `&str`. Note that the named
+ /// capture groups are still accessible with `get` or the `Index` notation
+ /// with a `usize`.
+ ///
+ /// The `0`th capture group is always unnamed, so it must always be
+ /// accessed with `get(0)` or `[0]`.
+ ///
+ /// Finally, one other way to get the matched substrings is with the
+ /// [`Captures::extract`] API:
+ ///
+ /// ```
+ /// use regex::bytes::Regex;
+ ///
+ /// let re = Regex::new(r"'([^']+)'\s+\((\d{4})\)").unwrap();
+ /// let hay = b"Not my favorite movie: 'Citizen Kane' (1941).";
+ /// let (full, [title, year]) = re.captures(hay).unwrap().extract();
+ /// assert_eq!(full, b"'Citizen Kane' (1941)");
+ /// assert_eq!(title, b"Citizen Kane");
+ /// assert_eq!(year, b"1941");
+ /// ```
+ #[inline]
+ pub fn captures<'h>(&self, haystack: &'h [u8]) -> Option<Captures<'h>> {
+ // Same search as `captures_at`, started at offset 0 of the haystack.
+ self.captures_at(haystack, 0)
+ }
+
+ /// Returns an iterator that yields successive non-overlapping matches in
+ /// the given haystack. The iterator yields values of type [`Captures`].
+ ///
+ /// This is the same as [`Regex::find_iter`], but instead of only providing
+ /// access to the overall match, each value yielded includes access to the
+ /// matches of all capture groups in the regex. Reporting this extra match
+ /// data is potentially costly, so callers should only use `captures_iter`
+ /// over `find_iter` when they actually need access to the capture group
+ /// matches.
+ ///
+ /// # Time complexity
+ ///
+ /// Note that since `captures_iter` runs potentially many searches on the
+ /// haystack and since each search has worst case `O(m * n)` time
+ /// complexity, the overall worst case time complexity for iteration is
+ /// `O(m * n^2)`.
+ ///
+ /// # Example
+ ///
+ /// We can use this to find all movie titles and their release years in
+ /// some haystack, where the movie is formatted like "'Title' (xxxx)":
+ ///
+ /// ```
+ /// use regex::bytes::Regex;
+ ///
+ /// let re = Regex::new(r"'([^']+)'\s+\(([0-9]{4})\)").unwrap();
+ /// let hay = b"'Citizen Kane' (1941), 'The Wizard of Oz' (1939), 'M' (1931).";
+ /// let mut movies = vec![];
+ /// for (_, [title, year]) in re.captures_iter(hay).map(|c| c.extract()) {
+ /// // OK because [0-9]{4} can only match valid UTF-8.
+ /// let year = std::str::from_utf8(year).unwrap();
+ /// movies.push((title, year.parse::<i64>()?));
+ /// }
+ /// assert_eq!(movies, vec![
+ /// (&b"Citizen Kane"[..], 1941),
+ /// (&b"The Wizard of Oz"[..], 1939),
+ /// (&b"M"[..], 1931),
+ /// ]);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// Or with named groups:
+ ///
+ /// ```
+ /// use regex::bytes::Regex;
+ ///
+ /// let re = Regex::new(r"'(?<title>[^']+)'\s+\((?<year>[0-9]{4})\)").unwrap();
+ /// let hay = b"'Citizen Kane' (1941), 'The Wizard of Oz' (1939), 'M' (1931).";
+ /// let mut it = re.captures_iter(hay);
+ ///
+ /// let caps = it.next().unwrap();
+ /// assert_eq!(&caps["title"], b"Citizen Kane");
+ /// assert_eq!(&caps["year"], b"1941");
+ ///
+ /// let caps = it.next().unwrap();
+ /// assert_eq!(&caps["title"], b"The Wizard of Oz");
+ /// assert_eq!(&caps["year"], b"1939");
+ ///
+ /// let caps = it.next().unwrap();
+ /// assert_eq!(&caps["title"], b"M");
+ /// assert_eq!(&caps["year"], b"1931");
+ /// ```
+ #[inline]
+ pub fn captures_iter<'r, 'h>(
+ &'r self,
+ haystack: &'h [u8],
+ ) -> CaptureMatches<'r, 'h> {
+ // Pair the haystack with the captures iterator that `self.meta` builds
+ // over that same haystack.
+ CaptureMatches { haystack, it: self.meta.captures_iter(haystack) }
+ }
+
+ /// Returns an iterator of substrings of the haystack given, delimited by a
+ /// match of the regex. Namely, each element of the iterator corresponds to
+ /// a part of the haystack that *isn't* matched by the regular expression.
+ ///
+ /// # Time complexity
+ ///
+ /// Since iterating over all matches requires running potentially many
+ /// searches on the haystack, and since each search has worst case
+ /// `O(m * n)` time complexity, the overall worst case time complexity for
+ /// this routine is `O(m * n^2)`.
+ ///
+ /// # Example
+ ///
+ /// To split a string delimited by arbitrary amounts of spaces or tabs:
+ ///
+ /// ```
+ /// use regex::bytes::Regex;
+ ///
+ /// let re = Regex::new(r"[ \t]+").unwrap();
+ /// let hay = b"a b \t c\td e";
+ /// let fields: Vec<&[u8]> = re.split(hay).collect();
+ /// assert_eq!(fields, vec![
+ /// &b"a"[..], &b"b"[..], &b"c"[..], &b"d"[..], &b"e"[..],
+ /// ]);
+ /// ```
+ ///
+ /// # Example: more cases
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use regex::bytes::Regex;
+ ///
+ /// let re = Regex::new(r" ").unwrap();
+ /// let hay = b"Mary had a little lamb";
+ /// let got: Vec<&[u8]> = re.split(hay).collect();
+ /// assert_eq!(got, vec![
+ /// &b"Mary"[..], &b"had"[..], &b"a"[..], &b"little"[..], &b"lamb"[..],
+ /// ]);
+ ///
+ /// let re = Regex::new(r"X").unwrap();
+ /// let hay = b"";
+ /// let got: Vec<&[u8]> = re.split(hay).collect();
+ /// assert_eq!(got, vec![&b""[..]]);
+ ///
+ /// let re = Regex::new(r"X").unwrap();
+ /// let hay = b"lionXXtigerXleopard";
+ /// let got: Vec<&[u8]> = re.split(hay).collect();
+ /// assert_eq!(got, vec![
+ /// &b"lion"[..], &b""[..], &b"tiger"[..], &b"leopard"[..],
+ /// ]);
+ ///
+ /// let re = Regex::new(r"::").unwrap();
+ /// let hay = b"lion::tiger::leopard";
+ /// let got: Vec<&[u8]> = re.split(hay).collect();
+ /// assert_eq!(got, vec![&b"lion"[..], &b"tiger"[..], &b"leopard"[..]]);
+ /// ```
+ ///
+ /// If a haystack contains multiple contiguous matches, you will end up
+ /// with empty spans yielded by the iterator:
+ ///
+ /// ```
+ /// use regex::bytes::Regex;
+ ///
+ /// let re = Regex::new(r"X").unwrap();
+ /// let hay = b"XXXXaXXbXc";
+ /// let got: Vec<&[u8]> = re.split(hay).collect();
+ /// assert_eq!(got, vec![
+ /// &b""[..], &b""[..], &b""[..], &b""[..],
+ /// &b"a"[..], &b""[..], &b"b"[..], &b"c"[..],
+ /// ]);
+ ///
+ /// let re = Regex::new(r"/").unwrap();
+ /// let hay = b"(///)";
+ /// let got: Vec<&[u8]> = re.split(hay).collect();
+ /// assert_eq!(got, vec![&b"("[..], &b""[..], &b""[..], &b")"[..]]);
+ /// ```
+ ///
+ /// Separators at the start or end of a haystack are neighbored by empty
+ /// substrings.
+ ///
+ /// ```
+ /// use regex::bytes::Regex;
+ ///
+ /// let re = Regex::new(r"0").unwrap();
+ /// let hay = b"010";
+ /// let got: Vec<&[u8]> = re.split(hay).collect();
+ /// assert_eq!(got, vec![&b""[..], &b"1"[..], &b""[..]]);
+ /// ```
+ ///
+ /// When the regex can match the empty string, it splits at every byte
+ /// position in the haystack. This includes between all UTF-8 code units.
+ /// (The top-level [`Regex::split`](crate::Regex::split) will only split
+ /// at valid UTF-8 boundaries.)
+ ///
+ /// ```
+ /// use regex::bytes::Regex;
+ ///
+ /// let re = Regex::new(r"").unwrap();
+ /// let hay = "☃".as_bytes();
+ /// let got: Vec<&[u8]> = re.split(hay).collect();
+ /// assert_eq!(got, vec![
+ /// &[][..], &[b'\xE2'][..], &[b'\x98'][..], &[b'\x83'][..], &[][..],
+ /// ]);
+ /// ```
+ ///
+ /// Contiguous separators (which commonly show up with whitespace) can lead to
+ /// possibly surprising behavior. For example, this code is correct:
+ ///
+ /// ```
+ /// use regex::bytes::Regex;
+ ///
+ /// let re = Regex::new(r" ").unwrap();
+ /// let hay = b" a b c";
+ /// let got: Vec<&[u8]> = re.split(hay).collect();
+ /// assert_eq!(got, vec![
+ /// &b""[..], &b""[..], &b""[..], &b""[..],
+ /// &b"a"[..], &b""[..], &b"b"[..], &b"c"[..],
+ /// ]);
+ /// ```
+ ///
+ /// It does *not* give you `["a", "b", "c"]`. For that behavior, you'd want
+ /// to match contiguous space characters:
+ ///
+ /// ```
+ /// use regex::bytes::Regex;
+ ///
+ /// let re = Regex::new(r" +").unwrap();
+ /// let hay = b" a b c";
+ /// let got: Vec<&[u8]> = re.split(hay).collect();
+ /// // N.B. This does still include a leading empty span because ' +'
+ /// // matches at the beginning of the haystack.
+ /// assert_eq!(got, vec![&b""[..], &b"a"[..], &b"b"[..], &b"c"[..]]);
+ /// ```
+ #[inline]
+ pub fn split<'r, 'h>(&'r self, haystack: &'h [u8]) -> Split<'r, 'h> {
+ // Pair the haystack with the split iterator that `self.meta` builds
+ // over that same haystack.
+ Split { haystack, it: self.meta.split(haystack) }
+ }
+
+ /// Returns an iterator of at most `limit` substrings of the haystack
+ /// given, delimited by a match of the regex. (A `limit` of `0` will return
+ /// no substrings.) Namely, each element of the iterator corresponds to a
+ /// part of the haystack that *isn't* matched by the regular expression.
+ /// The remainder of the haystack that is not split will be the last
+ /// element in the iterator.
+ ///
+ /// # Time complexity
+ ///
+ /// Since iterating over all matches requires running potentially many
+ /// searches on the haystack, and since each search has worst case
+ /// `O(m * n)` time complexity, the overall worst case time complexity for
+ /// this routine is `O(m * n^2)`.
+ ///
+ /// Although note that the worst case time here has an upper bound given
+ /// by the `limit` parameter.
+ ///
+ /// # Example
+ ///
+ /// Get the first two words in some haystack:
+ ///
+ /// ```
+ /// use regex::bytes::Regex;
+ ///
+ /// let re = Regex::new(r"\W+").unwrap();
+ /// let hay = b"Hey! How are you?";
+ /// let fields: Vec<&[u8]> = re.splitn(hay, 3).collect();
+ /// assert_eq!(fields, vec![&b"Hey"[..], &b"How"[..], &b"are you?"[..]]);
+ /// ```
+ ///
+ /// # Examples: more cases
+ ///
+ /// ```
+ /// use regex::bytes::Regex;
+ ///
+ /// let re = Regex::new(r" ").unwrap();
+ /// let hay = b"Mary had a little lamb";
+ /// let got: Vec<&[u8]> = re.splitn(hay, 3).collect();
+ /// assert_eq!(got, vec![&b"Mary"[..], &b"had"[..], &b"a little lamb"[..]]);
+ ///
+ /// let re = Regex::new(r"X").unwrap();
+ /// let hay = b"";
+ /// let got: Vec<&[u8]> = re.splitn(hay, 3).collect();
+ /// assert_eq!(got, vec![&b""[..]]);
+ ///
+ /// let re = Regex::new(r"X").unwrap();
+ /// let hay = b"lionXXtigerXleopard";
+ /// let got: Vec<&[u8]> = re.splitn(hay, 3).collect();
+ /// assert_eq!(got, vec![&b"lion"[..], &b""[..], &b"tigerXleopard"[..]]);
+ ///
+ /// let re = Regex::new(r"::").unwrap();
+ /// let hay = b"lion::tiger::leopard";
+ /// let got: Vec<&[u8]> = re.splitn(hay, 2).collect();
+ /// assert_eq!(got, vec![&b"lion"[..], &b"tiger::leopard"[..]]);
+ ///
+ /// let re = Regex::new(r"X").unwrap();
+ /// let hay = b"abcXdef";
+ /// let got: Vec<&[u8]> = re.splitn(hay, 1).collect();
+ /// assert_eq!(got, vec![&b"abcXdef"[..]]);
+ ///
+ /// let re = Regex::new(r"X").unwrap();
+ /// let hay = b"abcdef";
+ /// let got: Vec<&[u8]> = re.splitn(hay, 2).collect();
+ /// assert_eq!(got, vec![&b"abcdef"[..]]);
+ ///
+ /// let re = Regex::new(r"X").unwrap();
+ /// let hay = b"abcXdef";
+ /// let got: Vec<&[u8]> = re.splitn(hay, 0).collect();
+ /// assert!(got.is_empty());
+ /// ```
+ #[inline]
+ pub fn splitn<'r, 'h>(
+ &'r self,
+ haystack: &'h [u8],
+ limit: usize,
+ ) -> SplitN<'r, 'h> {
+ // Pair the haystack with the limited split iterator that `self.meta`
+ // builds over that same haystack; `limit` is forwarded unchanged.
+ SplitN { haystack, it: self.meta.splitn(haystack, limit) }
+ }
+
+ /// Replaces the leftmost-first match in the given haystack with the
+ /// replacement provided. The replacement can be a regular string (where
+ /// `$N` and `$name` are expanded to match capture groups) or a function
+ /// that takes a [`Captures`] and returns the replaced string.
+ ///
+ /// If no match is found, then the haystack is returned unchanged. In that
+ /// case, this implementation will likely return a `Cow::Borrowed` value
+ /// such that no allocation is performed.
+ ///
+ /// # Replacement string syntax
+ ///
+ /// All instances of `$ref` in the replacement string are replaced with
+ /// the substring corresponding to the capture group identified by `ref`.
+ ///
+ /// `ref` may be an integer corresponding to the index of the capture group
+ /// (counted by order of opening parenthesis where `0` is the entire match)
+ /// or it can be a name (consisting of letters, digits or underscores)
+ /// corresponding to a named capture group.
+ ///
+ /// If `ref` isn't a valid capture group (whether the name doesn't exist or
+ /// isn't a valid index), then it is replaced with the empty string.
+ ///
+ /// The longest possible name is used. For example, `$1a` looks up the
+ /// capture group named `1a` and not the capture group at index `1`. To
+ /// exert more precise control over the name, use braces, e.g., `${1}a`.
+ ///
+ /// To write a literal `$` use `$$`.
+ ///
+ /// # Example
+ ///
+ /// Note that this function is polymorphic with respect to the replacement.
+ /// In typical usage, this can just be a normal string:
+ ///
+ /// ```
+ /// use regex::bytes::Regex;
+ ///
+ /// let re = Regex::new(r"[^01]+").unwrap();
+ /// assert_eq!(re.replace(b"1078910", b""), &b"1010"[..]);
+ /// ```
+ ///
+ /// But anything satisfying the [`Replacer`] trait will work. For example,
+ /// a closure of type `|&Captures| -> String` provides direct access to the
+ /// captures corresponding to a match. This allows one to access capturing
+ /// group matches easily:
+ ///
+ /// ```
+ /// use regex::bytes::{Captures, Regex};
+ ///
+ /// let re = Regex::new(r"([^,\s]+),\s+(\S+)").unwrap();
+ /// let result = re.replace(b"Springsteen, Bruce", |caps: &Captures| {
+ /// let mut buf = vec![];
+ /// buf.extend_from_slice(&caps[2]);
+ /// buf.push(b' ');
+ /// buf.extend_from_slice(&caps[1]);
+ /// buf
+ /// });
+ /// assert_eq!(result, &b"Bruce Springsteen"[..]);
+ /// ```
+ ///
+ /// But this is a bit cumbersome to use all the time. Instead, a simple
+ /// syntax is supported (as described above) that expands `$name` into the
+ /// corresponding capture group. Here's the last example, but using this
+ /// expansion technique with named capture groups:
+ ///
+ /// ```
+ /// use regex::bytes::Regex;
+ ///
+ /// let re = Regex::new(r"(?<last>[^,\s]+),\s+(?<first>\S+)").unwrap();
+ /// let result = re.replace(b"Springsteen, Bruce", b"$first $last");
+ /// assert_eq!(result, &b"Bruce Springsteen"[..]);
+ /// ```
+ ///
+ /// Note that using `$2` instead of `$first` or `$1` instead of `$last`
+ /// would produce the same result. To write a literal `$` use `$$`.
+ ///
+ /// Sometimes the replacement string requires use of curly braces to
+ /// delineate a capture group replacement when it is adjacent to some other
+ /// literal text. For example, if we wanted to join two words together with
+ /// an underscore:
+ ///
+ /// ```
+ /// use regex::bytes::Regex;
+ ///
+ /// let re = Regex::new(r"(?<first>\w+)\s+(?<second>\w+)").unwrap();
+ /// let result = re.replace(b"deep fried", b"${first}_$second");
+ /// assert_eq!(result, &b"deep_fried"[..]);
+ /// ```
+ ///
+ /// Without the curly braces, the capture group name `first_` would be
+ /// used, and since it doesn't exist, it would be replaced with the empty
+ /// string.
+ ///
+ /// Finally, sometimes you just want to replace a literal string with no
+ /// regard for capturing group expansion. This can be done by wrapping a
+ /// string with [`NoExpand`]:
+ ///
+ /// ```
+ /// use regex::bytes::{NoExpand, Regex};
+ ///
+ /// let re = Regex::new(r"(?<last>[^,\s]+),\s+(\S+)").unwrap();
+ /// let result = re.replace(b"Springsteen, Bruce", NoExpand(b"$2 $last"));
+ /// assert_eq!(result, &b"$2 $last"[..]);
+ /// ```
+ ///
+ /// Using `NoExpand` may also be faster, since the replacement string won't
+ /// need to be parsed for the `$` syntax.
+ #[inline]
+ pub fn replace<'h, R: Replacer>(
+ &self,
+ haystack: &'h [u8],
+ rep: R,
+ ) -> Cow<'h, [u8]> {
+ // Replacing only the first match is `replacen` with a limit of 1.
+ self.replacen(haystack, 1, rep)
+ }
+
+ /// Replaces all non-overlapping matches in the haystack with the
+ /// replacement provided. This is the same as calling `replacen` with
+ /// `limit` set to `0`.
+ ///
+ /// The documentation for [`Regex::replace`] goes into more detail about
+ /// what kinds of replacement strings are supported.
+ ///
+ /// # Time complexity
+ ///
+ /// Since iterating over all matches requires running potentially many
+ /// searches on the haystack, and since each search has worst case
+ /// `O(m * n)` time complexity, the overall worst case time complexity for
+ /// this routine is `O(m * n^2)`.
+ ///
+ /// # Fallibility
+ ///
+ /// If you need to write a replacement routine where any individual
+ /// replacement might "fail," doing so with this API isn't really feasible
+ /// because there's no way to stop the search process if a replacement
+ /// fails. Instead, if you need this functionality, you should consider
+ /// implementing your own replacement routine:
+ ///
+ /// ```
+ /// use regex::bytes::{Captures, Regex};
+ ///
+ /// fn replace_all<E>(
+ /// re: &Regex,
+ /// haystack: &[u8],
+ /// replacement: impl Fn(&Captures) -> Result<Vec<u8>, E>,
+ /// ) -> Result<Vec<u8>, E> {
+ /// let mut new = Vec::with_capacity(haystack.len());
+ /// let mut last_match = 0;
+ /// for caps in re.captures_iter(haystack) {
+ /// let m = caps.get(0).unwrap();
+ /// new.extend_from_slice(&haystack[last_match..m.start()]);
+ /// new.extend_from_slice(&replacement(&caps)?);
+ /// last_match = m.end();
+ /// }
+ /// new.extend_from_slice(&haystack[last_match..]);
+ /// Ok(new)
+ /// }
+ ///
+ /// // Let's replace each word with the number of bytes in that word.
+ /// // But if we see a word that is "too long," we'll give up.
+ /// let re = Regex::new(r"\w+").unwrap();
+ /// let replacement = |caps: &Captures| -> Result<Vec<u8>, &'static str> {
+ /// if caps[0].len() >= 5 {
+ /// return Err("word too long");
+ /// }
+ /// Ok(caps[0].len().to_string().into_bytes())
+ /// };
+ /// assert_eq!(
+ /// Ok(b"2 3 3 3?".to_vec()),
+ /// replace_all(&re, b"hi how are you?", &replacement),
+ /// );
+ /// assert!(replace_all(&re, b"hi there", &replacement).is_err());
+ /// ```
+ ///
+ /// # Example
+ ///
+ /// This example shows how to flip the order of whitespace (excluding line
+ /// terminators) delimited fields, and normalizes the whitespace that
+ /// delimits the fields:
+ ///
+ /// ```
+ /// use regex::bytes::Regex;
+ ///
+ /// let re = Regex::new(r"(?m)^(\S+)[\s--\r\n]+(\S+)$").unwrap();
+ /// let hay = b"
+ /// Greetings 1973
+ /// Wild\t1973
+ /// BornToRun\t\t\t\t1975
+ /// Darkness 1978
+ /// TheRiver 1980
+ /// ";
+ /// let new = re.replace_all(hay, b"$2 $1");
+ /// assert_eq!(new, &b"
+ /// 1973 Greetings
+ /// 1973 Wild
+ /// 1975 BornToRun
+ /// 1978 Darkness
+ /// 1980 TheRiver
+ /// "[..]);
+ /// ```
+ #[inline]
+ pub fn replace_all<'h, R: Replacer>(
+ &self,
+ haystack: &'h [u8],
+ rep: R,
+ ) -> Cow<'h, [u8]> {
+ // A limit of 0 is the "no limit" value for `replacen`, so every
+ // non-overlapping match gets replaced.
+ self.replacen(haystack, 0, rep)
+ }
+
+ /// Replaces at most `limit` non-overlapping matches in the haystack with
+ /// the replacement provided. If `limit` is `0`, then all non-overlapping
+ /// matches are replaced. That is, `Regex::replace_all(hay, rep)` is
+ /// equivalent to `Regex::replacen(hay, 0, rep)`.
+ ///
+ /// The documentation for [`Regex::replace`] goes into more detail about
+ /// what kinds of replacement strings are supported.
+ ///
+ /// # Time complexity
+ ///
+ /// Since iterating over all matches requires running potentially many
+ /// searches on the haystack, and since each search has worst case
+ /// `O(m * n)` time complexity, the overall worst case time complexity for
+ /// this routine is `O(m * n^2)`.
+ ///
+ /// Although note that the worst case time here has an upper bound given
+ /// by the `limit` parameter.
+ ///
+ /// # Fallibility
+ ///
+ /// See the corresponding section in the docs for [`Regex::replace_all`]
+ /// for tips on how to deal with a replacement routine that can fail.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to flip the order of whitespace (excluding line
+ /// terminators) delimited fields, and normalizes the whitespace that
+ /// delimits the fields. But we only do it for the first two matches.
+ ///
+ /// ```
+ /// use regex::bytes::Regex;
+ ///
+ /// let re = Regex::new(r"(?m)^(\S+)[\s--\r\n]+(\S+)$").unwrap();
+ /// let hay = b"
+ /// Greetings 1973
+ /// Wild\t1973
+ /// BornToRun\t\t\t\t1975
+ /// Darkness 1978
+ /// TheRiver 1980
+ /// ";
+ /// let new = re.replacen(hay, 2, b"$2 $1");
+ /// assert_eq!(new, &b"
+ /// 1973 Greetings
+ /// 1973 Wild
+ /// BornToRun\t\t\t\t1975
+ /// Darkness 1978
+ /// TheRiver 1980
+ /// "[..]);
+ /// ```
+ #[inline]
+ pub fn replacen<'h, R: Replacer>(
+     &self,
+     haystack: &'h [u8],
+     limit: usize,
+     mut rep: R,
+ ) -> Cow<'h, [u8]> {
+     // A `limit` of 0 means "replace every match", so translate it into a
+     // cap on the number of replacements that can never be reached.
+     let max_replacements = if limit == 0 { usize::MAX } else { limit };
+
+     // Fast path, taken when the replacement contains no capture group
+     // references:
+     //
+     // 1) `find_iter` is used instead of `captures_iter`; not asking for
+     //    capture groups generally makes the regex engines faster.
+     // 2) The replacement never has to be expanded; the very same bytes
+     //    are pushed at every match.
+     if let Some(literal) = rep.no_expansion() {
+         let mut matches = self.find_iter(haystack).peekable();
+         if matches.peek().is_none() {
+             // Nothing matched, so hand back the haystack unallocated.
+             return Cow::Borrowed(haystack);
+         }
+         let mut out = Vec::with_capacity(haystack.len());
+         let mut copied_to = 0;
+         for m in matches.take(max_replacements) {
+             // Copy the unmatched gap, then the replacement itself.
+             out.extend_from_slice(&haystack[copied_to..m.start()]);
+             out.extend_from_slice(&literal);
+             copied_to = m.end();
+         }
+         // Everything after the last replaced match is kept verbatim.
+         out.extend_from_slice(&haystack[copied_to..]);
+         return Cow::Owned(out);
+     }
+
+     // Slow path, taken when the replacement needs to see the capture
+     // groups of each match.
+     let mut all_caps = self.captures_iter(haystack).peekable();
+     if all_caps.peek().is_none() {
+         return Cow::Borrowed(haystack);
+     }
+     let mut out = Vec::with_capacity(haystack.len());
+     let mut copied_to = 0;
+     for caps in all_caps.take(max_replacements) {
+         // unwrap on 0 is OK because captures only reports matches
+         let whole = caps.get(0).unwrap();
+         out.extend_from_slice(&haystack[copied_to..whole.start()]);
+         rep.replace_append(&caps, &mut out);
+         copied_to = whole.end();
+     }
+     out.extend_from_slice(&haystack[copied_to..]);
+     Cow::Owned(out)
+ }
+}
+
+/// A group of advanced or "lower level" search methods. Some methods permit
+/// starting the search at a position greater than `0` in the haystack. Other
+/// methods permit reusing allocations, for example, when extracting the
+/// matches for capture groups.
+impl Regex {
+ /// Returns the end byte offset of the first match in the haystack given.
+ ///
+ /// This method may have the same performance characteristics as
+ /// `is_match`. Behaviorally, it doesn't just report whether a match
+ /// occurs, but also the end offset for a match. In particular, the offset
+ /// returned *may be shorter* than the proper end of the leftmost-first
+ /// match that you would find via [`Regex::find`].
+ ///
+ /// Note that it is not guaranteed that this routine finds the shortest or
+ /// "earliest" possible match. Instead, the main idea of this API is that
+ /// it returns the offset at the point at which the internal regex engine
+ /// has determined that a match has occurred. This may vary depending on
+ /// which internal regex engine is used, and thus, the offset itself may
+ /// change based on internal heuristics.
+ ///
+ /// # Example
+ ///
+ /// Typically, `a+` would match the entire first sequence of `a` in some
+ /// haystack, but `shortest_match` *may* give up as soon as it sees the
+ /// first `a`.
+ ///
+ /// ```
+ /// use regex::bytes::Regex;
+ ///
+ /// let re = Regex::new(r"a+").unwrap();
+ /// let offset = re.shortest_match(b"aaaaa").unwrap();
+ /// assert_eq!(offset, 1);
+ /// ```
+ #[inline]
+ pub fn shortest_match(&self, haystack: &[u8]) -> Option<usize> {
+ // Same search as `shortest_match_at`, started at offset 0 of the
+ // haystack.
+ self.shortest_match_at(haystack, 0)
+ }
+
+ /// Returns the same as `shortest_match`, but starts the search at the
+ /// given offset.
+ ///
+ /// The significance of the starting point is that it takes the surrounding
+ /// context into consideration. For example, the `\A` anchor can only match
+ /// when `start == 0`.
+ ///
+ /// If a match is found, the offset returned is relative to the beginning
+ /// of the haystack, not the beginning of the search.
+ ///
+ /// # Panics
+ ///
+ /// This panics when `start >= haystack.len() + 1`.
+ ///
+ /// # Example
+ ///
+ /// This example shows the significance of `start` by demonstrating how it
+ /// can be used to permit look-around assertions in a regex to take the
+ /// surrounding context into account.
+ ///
+ /// ```
+ /// use regex::bytes::Regex;
+ ///
+ /// let re = Regex::new(r"\bchew\b").unwrap();
+ /// let hay = b"eschew";
+ /// // We get a match here, but it's probably not intended.
+ /// assert_eq!(re.shortest_match(&hay[2..]), Some(4));
+ /// // No match because the assertions take the context into account.
+ /// assert_eq!(re.shortest_match_at(hay, 2), None);
+ /// ```
+ #[inline]
+ pub fn shortest_match_at(
+     &self,
+     haystack: &[u8],
+     start: usize,
+ ) -> Option<usize> {
+     // Request the earliest match and restrict the search to `start..`,
+     // while still building the input from the complete haystack.
+     let span = start..haystack.len();
+     let input = Input::new(haystack).earliest(true).span(span);
+     let half = self.meta.search_half(&input)?;
+     Some(half.offset())
+ }
+
+ /// Returns the same as [`Regex::is_match`], but starts the search at the
+ /// given offset.
+ ///
+ /// The significance of the starting point is that it takes the surrounding
+ /// context into consideration. For example, the `\A` anchor can only
+ /// match when `start == 0`.
+ ///
+ /// # Panics
+ ///
+ /// This panics when `start >= haystack.len() + 1`.
+ ///
+ /// # Example
+ ///
+ /// This example shows the significance of `start` by demonstrating how it
+ /// can be used to permit look-around assertions in a regex to take the
+ /// surrounding context into account.
+ ///
+ /// ```
+ /// use regex::bytes::Regex;
+ ///
+ /// let re = Regex::new(r"\bchew\b").unwrap();
+ /// let hay = b"eschew";
+ /// // We get a match here, but it's probably not intended.
+ /// assert!(re.is_match(&hay[2..]));
+ /// // No match because the assertions take the context into account.
+ /// assert!(!re.is_match_at(hay, 2));
+ /// ```
+ #[inline]
+ pub fn is_match_at(&self, haystack: &[u8], start: usize) -> bool {
+     // The input is built from the complete haystack; only the searched
+     // span is narrowed to begin at `start`.
+     let input = Input::new(haystack).span(start..haystack.len());
+     self.meta.is_match(input)
+ }
+
+ /// Returns the same as [`Regex::find`], but starts the search at the given
+ /// offset.
+ ///
+ /// The significance of the starting point is that it takes the surrounding
+ /// context into consideration. For example, the `\A` anchor can only
+ /// match when `start == 0`.
+ ///
+ /// # Panics
+ ///
+ /// This panics when `start >= haystack.len() + 1`.
+ ///
+ /// # Example
+ ///
+ /// This example shows the significance of `start` by demonstrating how it
+ /// can be used to permit look-around assertions in a regex to take the
+ /// surrounding context into account.
+ ///
+ /// ```
+ /// use regex::bytes::Regex;
+ ///
+ /// let re = Regex::new(r"\bchew\b").unwrap();
+ /// let hay = b"eschew";
+ /// // We get a match here, but it's probably not intended.
+ /// assert_eq!(re.find(&hay[2..]).map(|m| m.range()), Some(0..4));
+ /// // No match because the assertions take the context into account.
+ /// assert_eq!(re.find_at(hay, 2), None);
+ /// ```
+ #[inline]
+ pub fn find_at<'h>(
+     &self,
+     haystack: &'h [u8],
+     start: usize,
+ ) -> Option<Match<'h>> {
+     // The input is built from the complete haystack; only the searched
+     // span is narrowed to begin at `start`.
+     let span = start..haystack.len();
+     let found = self.meta.find(Input::new(haystack).span(span))?;
+     // Re-wrap the reported offsets together with the haystack.
+     Some(Match::new(haystack, found.start(), found.end()))
+ }
+
+ /// Returns the same as [`Regex::captures`], but starts the search at the
+ /// given offset.
+ ///
+ /// The significance of the starting point is that it takes the surrounding
+ /// context into consideration. For example, the `\A` anchor can only
+ /// match when `start == 0`.
+ ///
+ /// # Panics
+ ///
+ /// This panics when `start >= haystack.len() + 1`.
+ ///
+ /// # Example
+ ///
+ /// This example shows the significance of `start` by demonstrating how it
+ /// can be used to permit look-around assertions in a regex to take the
+ /// surrounding context into account.
+ ///
+ /// ```
+ /// use regex::bytes::Regex;
+ ///
+ /// let re = Regex::new(r"\bchew\b").unwrap();
+ /// let hay = b"eschew";
+ /// // We get a match here, but it's probably not intended.
+ /// assert_eq!(&re.captures(&hay[2..]).unwrap()[0], b"chew");
+ /// // No match because the assertions take the context into account.
+ /// assert!(re.captures_at(hay, 2).is_none());
+ /// ```
+ #[inline]
+ pub fn captures_at<'h>(
+     &self,
+     haystack: &'h [u8],
+     start: usize,
+ ) -> Option<Captures<'h>> {
+     // The input is built from the complete haystack; only the searched
+     // span is narrowed to begin at `start`.
+     let input = Input::new(haystack).span(start..haystack.len());
+     let mut caps = self.meta.create_captures();
+     self.meta.captures(input, &mut caps);
+     // Bail out early when the search did not record a match.
+     if !caps.is_match() {
+         return None;
+     }
+     let static_captures_len = self.static_captures_len();
+     Some(Captures { haystack, caps, static_captures_len })
+ }
+
+ /// This is like [`Regex::captures`], but writes the byte offsets of each
+ /// capture group match into the locations given.
+ ///
+ /// A [`CaptureLocations`] stores the same byte offsets as a [`Captures`],
+ /// but does *not* store a reference to the haystack. This makes its API
+ /// a bit lower level and less convenient. But in exchange, callers
+ /// may allocate their own `CaptureLocations` and reuse it for multiple
+ /// searches. This may be helpful if allocating a `Captures` shows up in a
+ /// profile as too costly.
+ ///
+ /// To create a `CaptureLocations` value, use the
+ /// [`Regex::capture_locations`] method.
+ ///
+ /// This also returns the overall match if one was found. When a match is
+ /// found, its offsets are also always stored in `locs` at index `0`.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::bytes::Regex;
+ ///
+ /// let re = Regex::new(r"^([a-z]+)=(\S*)$").unwrap();
+ /// let mut locs = re.capture_locations();
+ /// assert!(re.captures_read(&mut locs, b"id=foo123").is_some());
+ /// assert_eq!(Some((0, 9)), locs.get(0));
+ /// assert_eq!(Some((0, 2)), locs.get(1));
+ /// assert_eq!(Some((3, 9)), locs.get(2));
+ /// ```
+ #[inline]
+ pub fn captures_read<'h>(
+ &self,
+ locs: &mut CaptureLocations,
+ haystack: &'h [u8],
+ ) -> Option<Match<'h>> {
+ self.captures_read_at(locs, haystack, 0)
+ }
+
+ /// Returns the same as [`Regex::captures_read`], but starts the search at
+ /// the given offset.
+ ///
+ /// The significance of the starting point is that it takes the surrounding
+ /// context into consideration. For example, the `\A` anchor can only
+ /// match when `start == 0`.
+ ///
+ /// # Panics
+ ///
+ /// This panics when `start >= haystack.len() + 1`.
+ ///
+ /// # Example
+ ///
+ /// This example shows the significance of `start` by demonstrating how it
+ /// can be used to permit look-around assertions in a regex to take the
+ /// surrounding context into account.
+ ///
+ /// ```
+ /// use regex::bytes::Regex;
+ ///
+ /// let re = Regex::new(r"\bchew\b").unwrap();
+ /// let hay = b"eschew";
+ /// let mut locs = re.capture_locations();
+ /// // We get a match here, but it's probably not intended.
+ /// assert!(re.captures_read(&mut locs, &hay[2..]).is_some());
+ /// // No match because the assertions take the context into account.
+ /// assert!(re.captures_read_at(&mut locs, hay, 2).is_none());
+ /// ```
+ #[inline]
+ pub fn captures_read_at<'h>(
+ &self,
+ locs: &mut CaptureLocations,
+ haystack: &'h [u8],
+ start: usize,
+ ) -> Option<Match<'h>> {
+ let input = Input::new(haystack).span(start..haystack.len());
+ self.meta.search_captures(&input, &mut locs.0);
+ locs.0.get_match().map(|m| Match::new(haystack, m.start(), m.end()))
+ }
+
+ /// An undocumented alias for `captures_read_at`.
+ ///
+ /// The `regex-capi` crate previously used this routine, so to avoid
+ /// breaking that crate, we continue to provide the name as an undocumented
+ /// alias.
+ #[doc(hidden)]
+ #[inline]
+ pub fn read_captures_at<'h>(
+ &self,
+ locs: &mut CaptureLocations,
+ haystack: &'h [u8],
+ start: usize,
+ ) -> Option<Match<'h>> {
+ self.captures_read_at(locs, haystack, start)
+ }
+}
+
+/// Auxiliary methods.
+impl Regex {
+ /// Returns the original string of this regex.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::bytes::Regex;
+ ///
+ /// let re = Regex::new(r"foo\w+bar").unwrap();
+ /// assert_eq!(re.as_str(), r"foo\w+bar");
+ /// ```
+ #[inline]
+ pub fn as_str(&self) -> &str {
+ &self.pattern
+ }
+
+ /// Returns an iterator over the capture names in this regex.
+ ///
+ /// The iterator returned yields elements of type `Option<&str>`. That is,
+ /// the iterator yields values for all capture groups, even ones that are
+ /// unnamed. The order of the groups corresponds to the order of the group's
+ /// corresponding opening parenthesis.
+ ///
+ /// The first element of the iterator always yields the group corresponding
+ /// to the overall match, and this group is always unnamed. Therefore, the
+ /// iterator always yields at least one group.
+ ///
+ /// # Example
+ ///
+ /// This shows basic usage with a mix of named and unnamed capture groups:
+ ///
+ /// ```
+ /// use regex::bytes::Regex;
+ ///
+ /// let re = Regex::new(r"(?<a>.(?<b>.))(.)(?:.)(?<c>.)").unwrap();
+ /// let mut names = re.capture_names();
+ /// assert_eq!(names.next(), Some(None));
+ /// assert_eq!(names.next(), Some(Some("a")));
+ /// assert_eq!(names.next(), Some(Some("b")));
+ /// assert_eq!(names.next(), Some(None));
+ /// // the '(?:.)' group is non-capturing and so doesn't appear here!
+ /// assert_eq!(names.next(), Some(Some("c")));
+ /// assert_eq!(names.next(), None);
+ /// ```
+ ///
+ /// The iterator always yields at least one element, even for regexes with
+ /// no capture groups and even for regexes that can never match:
+ ///
+ /// ```
+ /// use regex::bytes::Regex;
+ ///
+ /// let re = Regex::new(r"").unwrap();
+ /// let mut names = re.capture_names();
+ /// assert_eq!(names.next(), Some(None));
+ /// assert_eq!(names.next(), None);
+ ///
+ /// let re = Regex::new(r"[a&&b]").unwrap();
+ /// let mut names = re.capture_names();
+ /// assert_eq!(names.next(), Some(None));
+ /// assert_eq!(names.next(), None);
+ /// ```
+ #[inline]
+ pub fn capture_names(&self) -> CaptureNames<'_> {
+ CaptureNames(self.meta.group_info().pattern_names(PatternID::ZERO))
+ }
+
+ /// Returns the number of capture groups in this regex.
+ ///
+ /// This includes all named and unnamed groups, including the implicit
+ /// unnamed group that is always present and corresponds to the entire
+ /// match.
+ ///
+ /// Since the implicit unnamed group is always included in this length, the
+ /// length returned is guaranteed to be greater than zero.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::bytes::Regex;
+ ///
+ /// let re = Regex::new(r"foo").unwrap();
+ /// assert_eq!(1, re.captures_len());
+ ///
+ /// let re = Regex::new(r"(foo)").unwrap();
+ /// assert_eq!(2, re.captures_len());
+ ///
+ /// let re = Regex::new(r"(?<a>.(?<b>.))(.)(?:.)(?<c>.)").unwrap();
+ /// assert_eq!(5, re.captures_len());
+ ///
+ /// let re = Regex::new(r"[a&&b]").unwrap();
+ /// assert_eq!(1, re.captures_len());
+ /// ```
+ #[inline]
+ pub fn captures_len(&self) -> usize {
+ self.meta.group_info().group_len(PatternID::ZERO)
+ }
+
+ /// Returns the total number of capturing groups that appear in every
+ /// possible match.
+ ///
+ /// If the number of capture groups can vary depending on the match, then
+ /// this returns `None`. That is, a value is only returned when the number
+ /// of matching groups is invariant or "static."
+ ///
+ /// Note that like [`Regex::captures_len`], this **does** include the
+ /// implicit capturing group corresponding to the entire match. Therefore,
+ /// when a non-None value is returned, it is guaranteed to be at least `1`.
+ /// Stated differently, a return value of `Some(0)` is impossible.
+ ///
+ /// # Example
+ ///
+ /// This shows a few cases where a static number of capture groups is
+ /// available and a few cases where it is not.
+ ///
+ /// ```
+ /// use regex::bytes::Regex;
+ ///
+ /// let len = |pattern| {
+ /// Regex::new(pattern).map(|re| re.static_captures_len())
+ /// };
+ ///
+ /// assert_eq!(Some(1), len("a")?);
+ /// assert_eq!(Some(2), len("(a)")?);
+ /// assert_eq!(Some(2), len("(a)|(b)")?);
+ /// assert_eq!(Some(3), len("(a)(b)|(c)(d)")?);
+ /// assert_eq!(None, len("(a)|b")?);
+ /// assert_eq!(None, len("a|(b)")?);
+ /// assert_eq!(None, len("(b)*")?);
+ /// assert_eq!(Some(2), len("(b)+")?);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn static_captures_len(&self) -> Option<usize> {
+ self.meta.static_captures_len()
+ }
+
+ /// Returns a freshly allocated set of capture locations that can
+ /// be reused in multiple calls to [`Regex::captures_read`] or
+ /// [`Regex::captures_read_at`].
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::bytes::Regex;
+ ///
+ /// let re = Regex::new(r"(.)(.)(\w+)").unwrap();
+ /// let mut locs = re.capture_locations();
+ /// assert!(re.captures_read(&mut locs, b"Padron").is_some());
+ /// assert_eq!(locs.get(0), Some((0, 6)));
+ /// assert_eq!(locs.get(1), Some((0, 1)));
+ /// assert_eq!(locs.get(2), Some((1, 2)));
+ /// assert_eq!(locs.get(3), Some((2, 6)));
+ /// ```
+ #[inline]
+ pub fn capture_locations(&self) -> CaptureLocations {
+ CaptureLocations(self.meta.create_captures())
+ }
+
+ /// An alias for `capture_locations` to preserve backward compatibility.
+ ///
+ /// The `regex-capi` crate uses this method, so to avoid breaking that
+ /// crate, we continue to export it as an undocumented API.
+ #[doc(hidden)]
+ #[inline]
+ pub fn locations(&self) -> CaptureLocations {
+ self.capture_locations()
+ }
+}
+
+/// Represents a single match of a regex in a haystack.
+///
+/// A `Match` contains both the start and end byte offsets of the match and the
+/// actual substring corresponding to the range of those byte offsets. It is
+/// guaranteed that `start <= end`. When `start == end`, the match is empty.
+///
+/// Unlike the top-level `Match` type, this `Match` type is produced by APIs
+/// that search `&[u8]` haystacks. This means that the offsets in a `Match` can
+/// point to anywhere in the haystack, including in a place that splits the
+/// UTF-8 encoding of a Unicode scalar value.
+///
+/// The lifetime parameter `'h` refers to the lifetime of the haystack that
+/// this match was produced from.
+///
+/// # Numbering
+///
+/// The byte offsets in a `Match` form a half-open interval. That is, the
+/// start of the range is inclusive and the end of the range is exclusive.
+/// For example, given a haystack `abcFOOxyz` and a match of `FOO`, its byte
+/// offset range starts at `3` and ends at `6`. `3` corresponds to `F` and
+/// `6` corresponds to `x`, which is one past the end of the match. This
+/// corresponds to the same kind of slicing that Rust uses.
+///
+/// For more on why this was chosen over other schemes (aside from being
+/// consistent with how Rust the language works), see [this discussion] and
+/// [Dijkstra's note on a related topic][note].
+///
+/// [this discussion]: https://github.com/rust-lang/regex/discussions/866
+/// [note]: https://www.cs.utexas.edu/users/EWD/transcriptions/EWD08xx/EWD831.html
+///
+/// # Example
+///
+/// This example shows the value of each of the methods on `Match` for a
+/// particular search.
+///
+/// ```
+/// use regex::bytes::Regex;
+///
+/// let re = Regex::new(r"\p{Greek}+").unwrap();
+/// let hay = "Greek: αβγδ".as_bytes();
+/// let m = re.find(hay).unwrap();
+/// assert_eq!(7, m.start());
+/// assert_eq!(15, m.end());
+/// assert!(!m.is_empty());
+/// assert_eq!(8, m.len());
+/// assert_eq!(7..15, m.range());
+/// assert_eq!("αβγδ".as_bytes(), m.as_bytes());
+/// ```
+#[derive(Copy, Clone, Eq, PartialEq)]
+pub struct Match<'h> {
+ haystack: &'h [u8],
+ start: usize,
+ end: usize,
+}
+
+impl<'h> Match<'h> {
+ /// Returns the byte offset of the start of the match in the haystack. The
+ /// start of the match corresponds to the position where the match begins
+ /// and includes the first byte in the match.
+ ///
+ /// It is guaranteed that `Match::start() <= Match::end()`.
+ ///
+ /// Unlike the top-level `Match` type, the start offset may appear anywhere
+ /// in the haystack. This includes between the code units of a UTF-8
+ /// encoded Unicode scalar value.
+ #[inline]
+ pub fn start(&self) -> usize {
+ self.start
+ }
+
+ /// Returns the byte offset of the end of the match in the haystack. The
+ /// end of the match corresponds to the byte immediately following the last
+ /// byte in the match. This means that `&slice[start..end]` works as one
+ /// would expect.
+ ///
+ /// It is guaranteed that `Match::start() <= Match::end()`.
+ ///
+ /// Unlike the top-level `Match` type, the end offset may appear anywhere
+ /// in the haystack. This includes between the code units of a UTF-8
+ /// encoded Unicode scalar value.
+ #[inline]
+ pub fn end(&self) -> usize {
+ self.end
+ }
+
+ /// Returns true if and only if this match has a length of zero.
+ ///
+ /// Note that an empty match can only occur when the regex itself can
+ /// match the empty string. Here are some examples of regexes that can
+ /// all match the empty string: `^`, `^$`, `\b`, `a?`, `a*`, `a{0}`,
+ /// `(foo|\d+|quux)?`.
+ #[inline]
+ pub fn is_empty(&self) -> bool {
+ self.start == self.end
+ }
+
+ /// Returns the length, in bytes, of this match.
+ #[inline]
+ pub fn len(&self) -> usize {
+ self.end - self.start
+ }
+
+ /// Returns the range over the starting and ending byte offsets of the
+ /// match in the haystack.
+ #[inline]
+ pub fn range(&self) -> core::ops::Range<usize> {
+ self.start..self.end
+ }
+
+ /// Returns the substring of the haystack that matched.
+ #[inline]
+ pub fn as_bytes(&self) -> &'h [u8] {
+ &self.haystack[self.range()]
+ }
+
+ /// Creates a new match from the given haystack and byte offsets.
+ #[inline]
+ fn new(haystack: &'h [u8], start: usize, end: usize) -> Match<'h> {
+ Match { haystack, start, end }
+ }
+}
+
+impl<'h> core::fmt::Debug for Match<'h> {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ let mut fmt = f.debug_struct("Match");
+ fmt.field("start", &self.start).field("end", &self.end);
+ if let Ok(s) = core::str::from_utf8(self.as_bytes()) {
+ fmt.field("bytes", &s);
+ } else {
+ // FIXME: It would be nice if this could be printed as a string
+ // with invalid UTF-8 replaced with hex escapes. An alloc would
+ // probably be okay if that makes it easier, but regex-automata does
+ // (at time of writing) have internal routines that do this. So
+ // maybe we should expose them.
+ fmt.field("bytes", &self.as_bytes());
+ }
+ fmt.finish()
+ }
+}
+
+impl<'h> From<Match<'h>> for &'h [u8] {
+ fn from(m: Match<'h>) -> &'h [u8] {
+ m.as_bytes()
+ }
+}
+
+impl<'h> From<Match<'h>> for core::ops::Range<usize> {
+ fn from(m: Match<'h>) -> core::ops::Range<usize> {
+ m.range()
+ }
+}
+
+/// Represents the capture groups for a single match.
+///
+/// Capture groups refer to parts of a regex enclosed in parentheses. They can
+/// be optionally named. The purpose of capture groups is to be able to
+/// reference different parts of a match based on the original pattern. For
+/// example, say you want to match the individual letters in a 5-letter word:
+///
+/// ```text
+/// (?<first>\w)(\w)(?:\w)\w(?<last>\w)
+/// ```
+///
+/// This regex has 4 capture groups:
+///
+/// * The group at index `0` corresponds to the overall match. It is always
+/// present in every match and never has a name.
+/// * The group at index `1` with name `first` corresponding to the first
+/// letter.
+/// * The group at index `2` with no name corresponding to the second letter.
+/// * The group at index `3` with name `last` corresponding to the fifth and
+/// last letter.
+///
+/// Notice that `(?:\w)` was not listed above as a capture group despite it
+/// being enclosed in parentheses. That's because `(?:pattern)` is a special
+/// syntax that permits grouping but *without* capturing. The reason for not
+/// treating it as a capture is that tracking and reporting capture groups
+/// requires additional state that may lead to slower searches. So using as few
+/// capture groups as possible can help performance. (Although the difference
+/// in performance of a couple of capture groups is likely immaterial.)
+///
+/// Values with this type are created by [`Regex::captures`] or
+/// [`Regex::captures_iter`].
+///
+/// `'h` is the lifetime of the haystack that these captures were matched from.
+///
+/// # Example
+///
+/// ```
+/// use regex::bytes::Regex;
+///
+/// let re = Regex::new(r"(?<first>\w)(\w)(?:\w)\w(?<last>\w)").unwrap();
+/// let caps = re.captures(b"toady").unwrap();
+/// assert_eq!(b"toady", &caps[0]);
+/// assert_eq!(b"t", &caps["first"]);
+/// assert_eq!(b"o", &caps[2]);
+/// assert_eq!(b"y", &caps["last"]);
+/// ```
+pub struct Captures<'h> {
+ haystack: &'h [u8],
+ caps: captures::Captures,
+ static_captures_len: Option<usize>,
+}
+
+impl<'h> Captures<'h> {
+ /// Returns the `Match` associated with the capture group at index `i`. If
+ /// `i` does not correspond to a capture group, or if the capture group did
+ /// not participate in the match, then `None` is returned.
+ ///
+ /// When `i == 0`, this is guaranteed to return a non-`None` value.
+ ///
+ /// # Examples
+ ///
+ /// Get the substring that matched with a default of an empty string if the
+ /// group didn't participate in the match:
+ ///
+ /// ```
+ /// use regex::bytes::Regex;
+ ///
+ /// let re = Regex::new(r"[a-z]+(?:([0-9]+)|([A-Z]+))").unwrap();
+ /// let caps = re.captures(b"abc123").unwrap();
+ ///
+ /// let substr1 = caps.get(1).map_or(&b""[..], |m| m.as_bytes());
+ /// let substr2 = caps.get(2).map_or(&b""[..], |m| m.as_bytes());
+ /// assert_eq!(substr1, b"123");
+ /// assert_eq!(substr2, b"");
+ /// ```
+ #[inline]
+ pub fn get(&self, i: usize) -> Option<Match<'h>> {
+ self.caps
+ .get_group(i)
+ .map(|sp| Match::new(self.haystack, sp.start, sp.end))
+ }
+
+ /// Returns the `Match` associated with the capture group named `name`. If
+ /// `name` isn't a valid capture group or it refers to a group that didn't
+ /// match, then `None` is returned.
+ ///
+ /// Note that unlike `caps["name"]`, this returns a `Match` whose lifetime
+ /// matches the lifetime of the haystack in this `Captures` value.
+ /// Conversely, the substring returned by `caps["name"]` has a lifetime
+ /// of the `Captures` value, which is likely shorter than the lifetime of
+ /// the haystack. In some cases, it may be necessary to use this method to
+ /// access the matching substring instead of the `caps["name"]` notation.
+ ///
+ /// # Examples
+ ///
+ /// Get the substring that matched with a default of an empty string if the
+ /// group didn't participate in the match:
+ ///
+ /// ```
+ /// use regex::bytes::Regex;
+ ///
+ /// let re = Regex::new(
+ /// r"[a-z]+(?:(?<numbers>[0-9]+)|(?<letters>[A-Z]+))",
+ /// ).unwrap();
+ /// let caps = re.captures(b"abc123").unwrap();
+ ///
+ /// let numbers = caps.name("numbers").map_or(&b""[..], |m| m.as_bytes());
+ /// let letters = caps.name("letters").map_or(&b""[..], |m| m.as_bytes());
+ /// assert_eq!(numbers, b"123");
+ /// assert_eq!(letters, b"");
+ /// ```
+ #[inline]
+ pub fn name(&self, name: &str) -> Option<Match<'h>> {
+ self.caps
+ .get_group_by_name(name)
+ .map(|sp| Match::new(self.haystack, sp.start, sp.end))
+ }
+
+ /// This is a convenience routine for extracting the substrings
+ /// corresponding to matching capture groups.
+ ///
+ /// This returns a tuple where the first element corresponds to the full
+ /// substring of the haystack that matched the regex. The second element is
+ /// an array of substrings, with each corresponding to the substring
+ /// that matched for a particular capture group.
+ ///
+ /// # Panics
+ ///
+ /// This panics if the number of possible matching groups in this
+ /// `Captures` value is not fixed to `N` in all circumstances.
+ /// More precisely, this routine only works when `N` is equivalent to
+ /// [`Regex::static_captures_len`].
+ ///
+ /// Stated more plainly, if the number of matching capture groups in a
+ /// regex can vary from match to match, then this function always panics.
+ ///
+ /// For example, `(a)(b)|(c)` could produce two matching capture groups
+ /// or one matching capture group for any given match. Therefore, one
+ /// cannot use `extract` with such a pattern.
+ ///
+ /// But a pattern like `(a)(b)|(c)(d)` can be used with `extract` because
+ /// the number of capture groups in every match is always equivalent,
+ /// even if the capture _indices_ in each match are not.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::bytes::Regex;
+ ///
+ /// let re = Regex::new(r"([0-9]{4})-([0-9]{2})-([0-9]{2})").unwrap();
+ /// let hay = b"On 2010-03-14, I became a Tenneessee lamb.";
+ /// let Some((full, [year, month, day])) =
+ /// re.captures(hay).map(|caps| caps.extract()) else { return };
+ /// assert_eq!(b"2010-03-14", full);
+ /// assert_eq!(b"2010", year);
+ /// assert_eq!(b"03", month);
+ /// assert_eq!(b"14", day);
+ /// ```
+ ///
+ /// # Example: iteration
+ ///
+ /// This example shows how to use this method when iterating over all
+ /// `Captures` matches in a haystack.
+ ///
+ /// ```
+ /// use regex::bytes::Regex;
+ ///
+ /// let re = Regex::new(r"([0-9]{4})-([0-9]{2})-([0-9]{2})").unwrap();
+ /// let hay = b"1973-01-05, 1975-08-25 and 1980-10-18";
+ ///
+ /// let mut dates: Vec<(&[u8], &[u8], &[u8])> = vec![];
+ /// for (_, [y, m, d]) in re.captures_iter(hay).map(|c| c.extract()) {
+ /// dates.push((y, m, d));
+ /// }
+ /// assert_eq!(dates, vec![
+ /// (&b"1973"[..], &b"01"[..], &b"05"[..]),
+ /// (&b"1975"[..], &b"08"[..], &b"25"[..]),
+ /// (&b"1980"[..], &b"10"[..], &b"18"[..]),
+ /// ]);
+ /// ```
+ ///
+ /// # Example: parsing different formats
+ ///
+ /// This API is particularly useful when you need to extract a particular
+ /// value that might occur in a different format. Consider, for example,
+ /// an identifier that might be in double quotes or single quotes:
+ ///
+ /// ```
+ /// use regex::bytes::Regex;
+ ///
+ /// let re = Regex::new(r#"id:(?:"([^"]+)"|'([^']+)')"#).unwrap();
+ /// let hay = br#"The first is id:"foo" and the second is id:'bar'."#;
+ /// let mut ids = vec![];
+ /// for (_, [id]) in re.captures_iter(hay).map(|c| c.extract()) {
+ /// ids.push(id);
+ /// }
+ /// assert_eq!(ids, vec![b"foo", b"bar"]);
+ /// ```
+ pub fn extract<const N: usize>(&self) -> (&'h [u8], [&'h [u8]; N]) {
+ let len = self
+ .static_captures_len
+ .expect("number of capture groups can vary in a match")
+ .checked_sub(1)
+ .expect("number of groups is always greater than zero");
+ assert_eq!(N, len, "asked for {} groups, but must ask for {}", N, len);
+ // The regex-automata variant of extract is a bit more permissive.
+ // It doesn't require the number of matching capturing groups to be
+ // static, and you can even request fewer groups than what's there. So
+ // this is guaranteed to never panic because we've asserted above that
+ // the user has requested precisely the number of groups that must be
+ // present in any match for this regex.
+ self.caps.extract_bytes(self.haystack)
+ }
+
+ /// Expands all instances of `$ref` in `replacement` to the corresponding
+ /// capture group, and writes them to the `dst` buffer given. A `ref` can
+ /// be a capture group index or a name. If `ref` doesn't refer to a capture
+ /// group that participated in the match, then it is replaced with the
+ /// empty string.
+ ///
+ /// # Format
+ ///
+ /// The format of the replacement string supports two different kinds of
+ /// capture references: unbraced and braced.
+ ///
+ /// For the unbraced format, the format supported is `$ref` where `ref`
+ /// consists of characters in the class `[0-9A-Za-z_]`. `ref` is always
+ /// the longest possible parse. So for example, `$1a` corresponds to the
+ /// capture group named `1a` and not the capture group at index `1`. If
+ /// `ref` matches `^[0-9]+$`, then it is treated as a capture group index
+ /// itself and not a name.
+ ///
+ /// For the braced format, the format supported is `${ref}` where `ref` can
+ /// be any sequence of bytes except for `}`. If no closing brace occurs,
+ /// then it is not considered a capture reference. As with the unbraced
+ /// format, if `ref` matches `^[0-9]+$`, then it is treated as a capture
+ /// group index and not a name.
+ ///
+ /// The braced format is useful for exerting precise control over the name
+ /// of the capture reference. For example, `${1}a` corresponds to the
+ /// capture group reference `1` followed by the letter `a`, whereas `$1a`
+ /// (as mentioned above) corresponds to the capture group reference `1a`.
+ /// The braced format is also useful for expressing capture group names
+ /// that use characters not supported by the unbraced format. For example,
+ /// `${foo[bar].baz}` refers to the capture group named `foo[bar].baz`.
+ ///
+ /// If a capture group reference is found and it does not refer to a valid
+ /// capture group, then it will be replaced with the empty string.
+ ///
+ /// To write a literal `$`, use `$$`.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::bytes::Regex;
+ ///
+ /// let re = Regex::new(
+ /// r"(?<day>[0-9]{2})-(?<month>[0-9]{2})-(?<year>[0-9]{4})",
+ /// ).unwrap();
+ /// let hay = b"On 14-03-2010, I became a Tenneessee lamb.";
+ /// let caps = re.captures(hay).unwrap();
+ ///
+ /// let mut dst = vec![];
+ /// caps.expand(b"year=$year, month=$month, day=$day", &mut dst);
+ /// assert_eq!(dst, b"year=2010, month=03, day=14");
+ /// ```
+ #[inline]
+ pub fn expand(&self, replacement: &[u8], dst: &mut Vec<u8>) {
+ self.caps.interpolate_bytes_into(self.haystack, replacement, dst);
+ }
+
+ /// Returns an iterator over all capture groups. This includes both
+ /// matching and non-matching groups.
+ ///
+ /// The iterator always yields at least one matching group: the first group
+ /// (at index `0`) with no name. Subsequent groups are returned in the order
+ /// of their opening parenthesis in the regex.
+ ///
+ /// The elements yielded have type `Option<Match<'h>>`, where a non-`None`
+ /// value is present if the capture group matches.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::bytes::Regex;
+ ///
+ /// let re = Regex::new(r"(\w)(\d)?(\w)").unwrap();
+ /// let caps = re.captures(b"AZ").unwrap();
+ ///
+ /// let mut it = caps.iter();
+ /// assert_eq!(it.next().unwrap().map(|m| m.as_bytes()), Some(&b"AZ"[..]));
+ /// assert_eq!(it.next().unwrap().map(|m| m.as_bytes()), Some(&b"A"[..]));
+ /// assert_eq!(it.next().unwrap().map(|m| m.as_bytes()), None);
+ /// assert_eq!(it.next().unwrap().map(|m| m.as_bytes()), Some(&b"Z"[..]));
+ /// assert_eq!(it.next(), None);
+ /// ```
+ #[inline]
+ pub fn iter<'c>(&'c self) -> SubCaptureMatches<'c, 'h> {
+ SubCaptureMatches { haystack: self.haystack, it: self.caps.iter() }
+ }
+
+ /// Returns the total number of capture groups. This includes both
+ /// matching and non-matching groups.
+ ///
+ /// The length returned is always equivalent to the number of elements
+ /// yielded by [`Captures::iter`]. Consequently, the length is always
+ /// greater than zero since every `Captures` value always includes the
+ /// match for the entire regex.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::bytes::Regex;
+ ///
+ /// let re = Regex::new(r"(\w)(\d)?(\w)").unwrap();
+ /// let caps = re.captures(b"AZ").unwrap();
+ /// assert_eq!(caps.len(), 4);
+ /// ```
+ #[inline]
+ pub fn len(&self) -> usize {
+ self.caps.group_len()
+ }
+}
+
+impl<'h> core::fmt::Debug for Captures<'h> {
+ fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+ /// A little helper type to provide a nice map-like debug
+ /// representation for our capturing group spans.
+ ///
+ /// regex-automata has something similar, but it includes the pattern
+ /// ID in its debug output, which is confusing. It also doesn't include
+ /// the strings that match because a regex-automata `Captures` doesn't
+ /// borrow the haystack.
+ struct CapturesDebugMap<'a> {
+ caps: &'a Captures<'a>,
+ }
+
+ impl<'a> core::fmt::Debug for CapturesDebugMap<'a> {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ let mut map = f.debug_map();
+ let names =
+ self.caps.caps.group_info().pattern_names(PatternID::ZERO);
+ for (group_index, maybe_name) in names.enumerate() {
+ let key = Key(group_index, maybe_name);
+ match self.caps.get(group_index) {
+ None => map.entry(&key, &None::<()>),
+ Some(mat) => map.entry(&key, &Value(mat)),
+ };
+ }
+ map.finish()
+ }
+ }
+
+ struct Key<'a>(usize, Option<&'a str>);
+
+ impl<'a> core::fmt::Debug for Key<'a> {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ write!(f, "{}", self.0)?;
+ if let Some(name) = self.1 {
+ write!(f, "/{:?}", name)?;
+ }
+ Ok(())
+ }
+ }
+
+ struct Value<'a>(Match<'a>);
+
+ impl<'a> core::fmt::Debug for Value<'a> {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ use regex_automata::util::escape::DebugHaystack;
+
+ write!(
+ f,
+ "{}..{}/{:?}",
+ self.0.start(),
+ self.0.end(),
+ DebugHaystack(self.0.as_bytes())
+ )
+ }
+ }
+
+ f.debug_tuple("Captures")
+ .field(&CapturesDebugMap { caps: self })
+ .finish()
+ }
+}
+
+/// Get a matching capture group's haystack substring by index.
+///
+/// The haystack substring returned can't outlive the `Captures` object if this
+/// method is used, because of how `Index` is defined (normally `a[i]` is part
+/// of `a` and can't outlive it). To work around this limitation, use
+/// [`Captures::get`] instead.
+///
+/// `'h` is the lifetime of the matched haystack, but the lifetime of the
+/// `&[u8]` returned by this implementation is the lifetime of the `Captures`
+/// value itself.
+///
+/// # Panics
+///
+/// If there is no matching group at the given index.
+impl<'h> core::ops::Index<usize> for Captures<'h> {
+ type Output = [u8];
+
+ // The lifetime is written out to make it clear that the &[u8] returned
+ // does NOT have a lifetime equivalent to 'h.
+ fn index<'a>(&'a self, i: usize) -> &'a [u8] {
+ self.get(i)
+ .map(|m| m.as_bytes())
+ .unwrap_or_else(|| panic!("no group at index '{}'", i))
+ }
+}
+
+/// Get a matching capture group's haystack substring by name.
+///
+/// The haystack substring returned can't outlive the `Captures` object if this
+/// method is used, because of how `Index` is defined (normally `a[i]` is part
+/// of `a` and can't outlive it). To work around this limitation, use
+/// [`Captures::name`] instead.
+///
+/// `'h` is the lifetime of the matched haystack, but the lifetime of the
+/// `&[u8]` returned by this implementation is the lifetime of the `Captures`
+/// value itself.
+///
+/// `'n` is the lifetime of the group name used to index the `Captures` value.
+///
+/// # Panics
+///
+/// If there is no matching group at the given name.
+impl<'h, 'n> core::ops::Index<&'n str> for Captures<'h> {
+ type Output = [u8];
+
+ fn index<'a>(&'a self, name: &'n str) -> &'a [u8] {
+ self.name(name)
+ .map(|m| m.as_bytes())
+ .unwrap_or_else(|| panic!("no group named '{}'", name))
+ }
+}
+
+/// A low level representation of the byte offsets of each capture group.
+///
+/// You can think of this as a lower level [`Captures`], where this type does
+/// not support named capturing groups directly and it does not borrow the
+/// haystack that these offsets were matched on.
+///
+/// Primarily, this type is useful when using the lower level `Regex` APIs such
+/// as [`Regex::captures_read`], which permits amortizing the allocation in
+/// which capture match offsets are stored.
+///
+/// In order to build a value of this type, you'll need to call the
+/// [`Regex::capture_locations`] method. The value returned can then be reused
+/// in subsequent searches for that regex. Using it for other regexes may
+/// result in a panic or otherwise incorrect results.
+///
+/// # Example
+///
+/// This example shows how to create and use `CaptureLocations` in a search.
+///
+/// ```
+/// use regex::bytes::Regex;
+///
+/// let re = Regex::new(r"(?<first>\w+)\s+(?<last>\w+)").unwrap();
+/// let mut locs = re.capture_locations();
+/// let m = re.captures_read(&mut locs, b"Bruce Springsteen").unwrap();
+/// assert_eq!(0..17, m.range());
+/// assert_eq!(Some((0, 17)), locs.get(0));
+/// assert_eq!(Some((0, 5)), locs.get(1));
+/// assert_eq!(Some((6, 17)), locs.get(2));
+///
+/// // Asking for an invalid capture group always returns None.
+/// assert_eq!(None, locs.get(3));
+/// # // literals are too big for 32-bit usize: #1041
+/// # #[cfg(target_pointer_width = "64")]
+/// assert_eq!(None, locs.get(34973498648));
+/// # #[cfg(target_pointer_width = "64")]
+/// assert_eq!(None, locs.get(9944060567225171988));
+/// ```
+#[derive(Clone, Debug)]
+pub struct CaptureLocations(captures::Captures);
+
+/// A type alias for `CaptureLocations` for backwards compatibility.
+///
+/// Previously, we exported `CaptureLocations` as `Locations` in an
+/// undocumented API. To prevent breaking that code (e.g., in `regex-capi`),
+/// we continue re-exporting the same undocumented API.
+#[doc(hidden)]
+pub type Locations = CaptureLocations;
+
+impl CaptureLocations {
+ /// Returns the start and end byte offsets of the capture group at index
+ /// `i`. This returns `None` if `i` is not a valid capture group or if the
+ /// capture group did not match.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::bytes::Regex;
+ ///
+ /// let re = Regex::new(r"(?<first>\w+)\s+(?<last>\w+)").unwrap();
+ /// let mut locs = re.capture_locations();
+ /// re.captures_read(&mut locs, b"Bruce Springsteen").unwrap();
+ /// assert_eq!(Some((0, 17)), locs.get(0));
+ /// assert_eq!(Some((0, 5)), locs.get(1));
+ /// assert_eq!(Some((6, 17)), locs.get(2));
+ /// ```
+ #[inline]
+ pub fn get(&self, i: usize) -> Option<(usize, usize)> {
+ self.0.get_group(i).map(|sp| (sp.start, sp.end))
+ }
+
+ /// Returns the total number of capture groups (even if they didn't match).
+ /// That is, the length returned is unaffected by the result of a search.
+ ///
+ /// This is always at least `1` since every regex has at least `1`
+ /// capturing group that corresponds to the entire match.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::bytes::Regex;
+ ///
+ /// let re = Regex::new(r"(?<first>\w+)\s+(?<last>\w+)").unwrap();
+ /// let mut locs = re.capture_locations();
+ /// assert_eq!(3, locs.len());
+ /// re.captures_read(&mut locs, b"Bruce Springsteen").unwrap();
+ /// assert_eq!(3, locs.len());
+ /// ```
+ ///
+ /// Notice that the length is always at least `1`, regardless of the regex:
+ ///
+ /// ```
+ /// use regex::bytes::Regex;
+ ///
+ /// let re = Regex::new(r"").unwrap();
+ /// let locs = re.capture_locations();
+ /// assert_eq!(1, locs.len());
+ ///
+ /// // [a&&b] is a regex that never matches anything.
+ /// let re = Regex::new(r"[a&&b]").unwrap();
+ /// let locs = re.capture_locations();
+ /// assert_eq!(1, locs.len());
+ /// ```
+ #[inline]
+ pub fn len(&self) -> usize {
+ // self.0.group_len() returns 0 if the underlying captures doesn't
+ // represent a match, but the behavior guaranteed for this method is
+ // that the length doesn't change based on a match or not.
+ self.0.group_info().group_len(PatternID::ZERO)
+ }
+
+ /// An alias for the `get` method for backwards compatibility.
+ ///
+ /// Previously, we exported `get` as `pos` in an undocumented API. To
+ /// prevent breaking that code (e.g., in `regex-capi`), we continue
+ /// re-exporting the same undocumented API.
+ #[doc(hidden)]
+ #[inline]
+ pub fn pos(&self, i: usize) -> Option<(usize, usize)> {
+ self.get(i)
+ }
+}
+
+/// An iterator over all non-overlapping matches in a haystack.
+///
+/// This iterator yields [`Match`] values. The iterator stops when no more
+/// matches can be found.
+///
+/// `'r` is the lifetime of the compiled regular expression and `'h` is the
+/// lifetime of the haystack.
+///
+/// This iterator is created by [`Regex::find_iter`].
+///
+/// # Time complexity
+///
+/// Note that since an iterator runs potentially many searches on the haystack
+/// and since each search has worst case `O(m * n)` time complexity, the
+/// overall worst case time complexity for iteration is `O(m * n^2)`.
+#[derive(Debug)]
+pub struct Matches<'r, 'h> {
+ haystack: &'h [u8],
+ it: meta::FindMatches<'r, 'h>,
+}
+
+impl<'r, 'h> Iterator for Matches<'r, 'h> {
+ type Item = Match<'h>;
+
+ #[inline]
+ fn next(&mut self) -> Option<Match<'h>> {
+ self.it
+ .next()
+ .map(|sp| Match::new(self.haystack, sp.start(), sp.end()))
+ }
+
+ #[inline]
+ fn count(self) -> usize {
+ // This can actually be up to 2x faster than calling `next()` until
+ // completion, because counting matches when using a DFA only requires
+ // finding the end of each match. But returning a `Match` via `next()`
+ // requires the start of each match which, with a DFA, requires a
+ // reverse scan to find it.
+ self.it.count()
+ }
+}
+
+impl<'r, 'h> core::iter::FusedIterator for Matches<'r, 'h> {}
+
+/// An iterator over all non-overlapping capture matches in a haystack.
+///
+/// This iterator yields [`Captures`] values. The iterator stops when no more
+/// matches can be found.
+///
+/// `'r` is the lifetime of the compiled regular expression and `'h` is the
+/// lifetime of the haystack.
+///
+/// This iterator is created by [`Regex::captures_iter`].
+///
+/// # Time complexity
+///
+/// Note that since an iterator runs potentially many searches on the haystack
+/// and since each search has worst case `O(m * n)` time complexity, the
+/// overall worst case time complexity for iteration is `O(m * n^2)`.
+#[derive(Debug)]
+pub struct CaptureMatches<'r, 'h> {
+ haystack: &'h [u8],
+ it: meta::CapturesMatches<'r, 'h>,
+}
+
+impl<'r, 'h> Iterator for CaptureMatches<'r, 'h> {
+ type Item = Captures<'h>;
+
+ #[inline]
+ fn next(&mut self) -> Option<Captures<'h>> {
+ let static_captures_len = self.it.regex().static_captures_len();
+ self.it.next().map(|caps| Captures {
+ haystack: self.haystack,
+ caps,
+ static_captures_len,
+ })
+ }
+
+ #[inline]
+ fn count(self) -> usize {
+ // This can actually be up to 2x faster than calling `next()` until
+ // completion, because counting matches when using a DFA only requires
+ // finding the end of each match. But returning a `Captures` via `next()`
+ // requires the start of each match which, with a DFA, requires a
+ // reverse scan to find it.
+ self.it.count()
+ }
+}
+
+impl<'r, 'h> core::iter::FusedIterator for CaptureMatches<'r, 'h> {}
+
+/// An iterator over all substrings delimited by a regex match.
+///
+/// `'r` is the lifetime of the compiled regular expression and `'h` is the
+/// lifetime of the byte string being split.
+///
+/// This iterator is created by [`Regex::split`].
+///
+/// # Time complexity
+///
+/// Note that since an iterator runs potentially many searches on the haystack
+/// and since each search has worst case `O(m * n)` time complexity, the
+/// overall worst case time complexity for iteration is `O(m * n^2)`.
+#[derive(Debug)]
+pub struct Split<'r, 'h> {
+ haystack: &'h [u8],
+ it: meta::Split<'r, 'h>,
+}
+
+impl<'r, 'h> Iterator for Split<'r, 'h> {
+ type Item = &'h [u8];
+
+ #[inline]
+ fn next(&mut self) -> Option<&'h [u8]> {
+ self.it.next().map(|span| &self.haystack[span])
+ }
+}
+
+impl<'r, 'h> core::iter::FusedIterator for Split<'r, 'h> {}
+
+/// An iterator over at most `N` substrings delimited by a regex match.
+///
+/// The last substring yielded by this iterator will be whatever remains after
+/// `N-1` splits.
+///
+/// `'r` is the lifetime of the compiled regular expression and `'h` is the
+/// lifetime of the byte string being split.
+///
+/// This iterator is created by [`Regex::splitn`].
+///
+/// # Time complexity
+///
+/// Note that since an iterator runs potentially many searches on the haystack
+/// and since each search has worst case `O(m * n)` time complexity, the
+/// overall worst case time complexity for iteration is `O(m * n^2)`.
+///
+/// Although note that the worst case time here has an upper bound given
+/// by the `limit` parameter to [`Regex::splitn`].
+#[derive(Debug)]
+pub struct SplitN<'r, 'h> {
+ haystack: &'h [u8],
+ it: meta::SplitN<'r, 'h>,
+}
+
+impl<'r, 'h> Iterator for SplitN<'r, 'h> {
+ type Item = &'h [u8];
+
+ #[inline]
+ fn next(&mut self) -> Option<&'h [u8]> {
+ self.it.next().map(|span| &self.haystack[span])
+ }
+
+ #[inline]
+ fn size_hint(&self) -> (usize, Option<usize>) {
+ self.it.size_hint()
+ }
+}
+
+impl<'r, 'h> core::iter::FusedIterator for SplitN<'r, 'h> {}
+
+/// An iterator over the names of all capture groups in a regex.
+///
+/// This iterator yields values of type `Option<&str>` in order of the opening
+/// capture group parenthesis in the regex pattern. `None` is yielded for
+/// groups with no name. The first element always corresponds to the implicit
+/// and unnamed group for the overall match.
+///
+/// `'r` is the lifetime of the compiled regular expression.
+///
+/// This iterator is created by [`Regex::capture_names`].
+#[derive(Clone, Debug)]
+pub struct CaptureNames<'r>(captures::GroupInfoPatternNames<'r>);
+
+impl<'r> Iterator for CaptureNames<'r> {
+ type Item = Option<&'r str>;
+
+ #[inline]
+ fn next(&mut self) -> Option<Option<&'r str>> {
+ self.0.next()
+ }
+
+ #[inline]
+ fn size_hint(&self) -> (usize, Option<usize>) {
+ self.0.size_hint()
+ }
+
+ #[inline]
+ fn count(self) -> usize {
+ self.0.count()
+ }
+}
+
+impl<'r> ExactSizeIterator for CaptureNames<'r> {}
+
+impl<'r> core::iter::FusedIterator for CaptureNames<'r> {}
+
+/// An iterator over all group matches in a [`Captures`] value.
+///
+/// This iterator yields values of type `Option<Match<'h>>`, where `'h` is the
+/// lifetime of the haystack that the matches are for. The order of elements
+/// yielded corresponds to the order of the opening parenthesis for the group
+/// in the regex pattern. `None` is yielded for groups that did not participate
+/// in the match.
+///
+/// The first element always corresponds to the implicit group for the overall
+/// match. Since this iterator is created by a [`Captures`] value, and a
+/// `Captures` value is only created when a match occurs, it follows that the
+/// first element yielded by this iterator is guaranteed to be non-`None`.
+///
+/// The lifetime `'c` corresponds to the lifetime of the `Captures` value that
+/// created this iterator, and the lifetime `'h` corresponds to the originally
+/// matched haystack.
+#[derive(Clone, Debug)]
+pub struct SubCaptureMatches<'c, 'h> {
+ haystack: &'h [u8],
+ it: captures::CapturesPatternIter<'c>,
+}
+
+impl<'c, 'h> Iterator for SubCaptureMatches<'c, 'h> {
+ type Item = Option<Match<'h>>;
+
+ #[inline]
+ fn next(&mut self) -> Option<Option<Match<'h>>> {
+ self.it.next().map(|group| {
+ group.map(|sp| Match::new(self.haystack, sp.start, sp.end))
+ })
+ }
+
+ #[inline]
+ fn size_hint(&self) -> (usize, Option<usize>) {
+ self.it.size_hint()
+ }
+
+ #[inline]
+ fn count(self) -> usize {
+ self.it.count()
+ }
+}
+
+impl<'c, 'h> ExactSizeIterator for SubCaptureMatches<'c, 'h> {}
+
+impl<'c, 'h> core::iter::FusedIterator for SubCaptureMatches<'c, 'h> {}
+
+/// A trait for types that can be used to replace matches in a haystack.
+///
+/// In general, users of this crate shouldn't need to implement this trait,
+/// since implementations are already provided for `&[u8]` along with other
+/// variants of byte string types, as well as `FnMut(&Captures) -> Vec<u8>` (or
+/// any `FnMut(&Captures) -> T` where `T: AsRef<[u8]>`). Those cover most use
+/// cases, but callers can implement this trait directly if necessary.
+///
+/// # Example
+///
+/// This example shows a basic implementation of the `Replacer` trait. This can
+/// be done much more simply using the replacement byte string interpolation
+/// support (e.g., `$first $last`), but this approach avoids needing to parse
+/// the replacement byte string at all.
+///
+/// ```
+/// use regex::bytes::{Captures, Regex, Replacer};
+///
+/// struct NameSwapper;
+///
+/// impl Replacer for NameSwapper {
+/// fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) {
+/// dst.extend_from_slice(&caps["first"]);
+/// dst.extend_from_slice(b" ");
+/// dst.extend_from_slice(&caps["last"]);
+/// }
+/// }
+///
+/// let re = Regex::new(r"(?<last>[^,\s]+),\s+(?<first>\S+)").unwrap();
+/// let result = re.replace(b"Springsteen, Bruce", NameSwapper);
+/// assert_eq!(result, &b"Bruce Springsteen"[..]);
+/// ```
+pub trait Replacer {
+ /// Appends possibly empty data to `dst` to replace the current match.
+ ///
+ /// The current match is represented by `caps`, which is guaranteed to have
+ /// a match at capture group `0`.
+ ///
+ /// For example, a no-op replacement would be
+ /// `dst.extend_from_slice(&caps[0])`.
+ fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>);
+
+ /// Return a fixed unchanging replacement byte string.
+ ///
+ /// When doing replacements, if access to [`Captures`] is not needed (e.g.,
+ /// the replacement byte string does not need `$` expansion), then it can
+ /// be beneficial to avoid finding sub-captures.
+ ///
+ /// In general, this is called once for every call to a replacement routine
+ /// such as [`Regex::replace_all`].
+ fn no_expansion<'r>(&'r mut self) -> Option<Cow<'r, [u8]>> {
+ None
+ }
+
+ /// Returns a type that implements `Replacer`, but that borrows and wraps
+ /// this `Replacer`.
+ ///
+ /// This is useful when you want to take a generic `Replacer` (which might
+ /// not be cloneable) and use it without consuming it, so it can be used
+ /// more than once.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::bytes::{Regex, Replacer};
+ ///
+ /// fn replace_all_twice<R: Replacer>(
+ /// re: Regex,
+ /// src: &[u8],
+ /// mut rep: R,
+ /// ) -> Vec<u8> {
+ /// let dst = re.replace_all(src, rep.by_ref());
+ /// let dst = re.replace_all(&dst, rep.by_ref());
+ /// dst.into_owned()
+ /// }
+ /// ```
+ fn by_ref<'r>(&'r mut self) -> ReplacerRef<'r, Self> {
+ ReplacerRef(self)
+ }
+}
+
+impl<'a, const N: usize> Replacer for &'a [u8; N] {
+ fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) {
+ caps.expand(&**self, dst);
+ }
+
+ fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> {
+ no_expansion(self)
+ }
+}
+
+impl<const N: usize> Replacer for [u8; N] {
+ fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) {
+ caps.expand(&*self, dst);
+ }
+
+ fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> {
+ no_expansion(self)
+ }
+}
+
+impl<'a> Replacer for &'a [u8] {
+ fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) {
+ caps.expand(*self, dst);
+ }
+
+ fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> {
+ no_expansion(self)
+ }
+}
+
+impl<'a> Replacer for &'a Vec<u8> {
+ fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) {
+ caps.expand(*self, dst);
+ }
+
+ fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> {
+ no_expansion(self)
+ }
+}
+
+impl Replacer for Vec<u8> {
+ fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) {
+ caps.expand(self, dst);
+ }
+
+ fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> {
+ no_expansion(self)
+ }
+}
+
+impl<'a> Replacer for Cow<'a, [u8]> {
+ fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) {
+ caps.expand(self.as_ref(), dst);
+ }
+
+ fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> {
+ no_expansion(self)
+ }
+}
+
+impl<'a> Replacer for &'a Cow<'a, [u8]> {
+ fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) {
+ caps.expand(self.as_ref(), dst);
+ }
+
+ fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> {
+ no_expansion(self)
+ }
+}
+
+impl<F, T> Replacer for F
+where
+ F: FnMut(&Captures<'_>) -> T,
+ T: AsRef<[u8]>,
+{
+ fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) {
+ dst.extend_from_slice((*self)(caps).as_ref());
+ }
+}
+
+/// A by-reference adaptor for a [`Replacer`].
+///
+/// This permits reusing the same `Replacer` value in multiple calls to a
+/// replacement routine like [`Regex::replace_all`].
+///
+/// This type is created by [`Replacer::by_ref`].
+#[derive(Debug)]
+pub struct ReplacerRef<'a, R: ?Sized>(&'a mut R);
+
+impl<'a, R: Replacer + ?Sized + 'a> Replacer for ReplacerRef<'a, R> {
+ fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) {
+ self.0.replace_append(caps, dst)
+ }
+
+ fn no_expansion<'r>(&'r mut self) -> Option<Cow<'r, [u8]>> {
+ self.0.no_expansion()
+ }
+}
+
+/// A helper type for forcing literal string replacement.
+///
+/// It can be used with routines like [`Regex::replace`] and
+/// [`Regex::replace_all`] to do a literal string replacement without expanding
+/// `$name` to their corresponding capture groups. This can be both convenient
+/// (to avoid escaping `$`, for example) and faster (since capture groups
+/// don't need to be found).
+///
+/// `'s` is the lifetime of the literal string to use.
+///
+/// # Example
+///
+/// ```
+/// use regex::bytes::{NoExpand, Regex};
+///
+/// let re = Regex::new(r"(?<last>[^,\s]+),\s+(\S+)").unwrap();
+/// let result = re.replace(b"Springsteen, Bruce", NoExpand(b"$2 $last"));
+/// assert_eq!(result, &b"$2 $last"[..]);
+/// ```
+#[derive(Clone, Debug)]
+pub struct NoExpand<'s>(pub &'s [u8]);
+
+impl<'s> Replacer for NoExpand<'s> {
+ fn replace_append(&mut self, _: &Captures<'_>, dst: &mut Vec<u8>) {
+ dst.extend_from_slice(self.0);
+ }
+
+ fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> {
+ Some(Cow::Borrowed(self.0))
+ }
+}
+
+/// Quickly checks the given replacement string for whether interpolation
+/// should be done on it. It returns `None` if a `$` was found anywhere in the
+/// given string, which suggests interpolation needs to be done. But if there's
+/// no `$` anywhere, then interpolation definitely does not need to be done. In
+/// that case, the given string is returned as a borrowed `Cow`.
+///
+/// This is meant to be used to implement the `Replacer::no_expansion` method
+/// in its various trait impls.
+fn no_expansion<T: AsRef<[u8]>>(replacement: &T) -> Option<Cow<'_, [u8]>> {
+ let replacement = replacement.as_ref();
+ match crate::find_byte::find_byte(b'$', replacement) {
+ Some(_) => None,
+ None => Some(Cow::Borrowed(replacement)),
+ }
+}
diff --git a/vendor/regex/src/regex/mod.rs b/vendor/regex/src/regex/mod.rs
new file mode 100644
index 000000000..93fadec8b
--- /dev/null
+++ b/vendor/regex/src/regex/mod.rs
@@ -0,0 +1,2 @@
+pub(crate) mod bytes;
+pub(crate) mod string;
diff --git a/vendor/regex/src/regex/string.rs b/vendor/regex/src/regex/string.rs
new file mode 100644
index 000000000..880d6082a
--- /dev/null
+++ b/vendor/regex/src/regex/string.rs
@@ -0,0 +1,2582 @@
+use alloc::{borrow::Cow, string::String, sync::Arc};
+
+use regex_automata::{meta, util::captures, Input, PatternID};
+
+use crate::{error::Error, RegexBuilder};
+
+/// A compiled regular expression for searching Unicode haystacks.
+///
+/// A `Regex` can be used to search haystacks, split haystacks into substrings
+/// or replace substrings in a haystack with a different substring. All
+/// searching is done with an implicit `(?s:.)*?` at the beginning and end of
+/// a pattern. To force an expression to match the whole string (or a prefix
+/// or a suffix), you must use an anchor like `^` or `$` (or `\A` and `\z`).
+///
+/// While this crate will handle Unicode strings (whether in the regular
+/// expression or in the haystack), all positions returned are **byte
+/// offsets**. Every byte offset is guaranteed to be at a Unicode code point
+/// boundary. That is, all offsets returned by the `Regex` API are guaranteed
+/// to be ranges that can slice a `&str` without panicking. If you want to
+/// relax this requirement, then you must search `&[u8]` haystacks with a
+/// [`bytes::Regex`](crate::bytes::Regex).
+///
+/// The only methods that allocate new strings are the string replacement
+/// methods. All other methods (searching and splitting) return borrowed
+/// references into the haystack given.
+///
+/// # Example
+///
+/// Find the offsets of a US phone number:
+///
+/// ```
+/// use regex::Regex;
+///
+/// let re = Regex::new("[0-9]{3}-[0-9]{3}-[0-9]{4}").unwrap();
+/// let m = re.find("phone: 111-222-3333").unwrap();
+/// assert_eq!(7..19, m.range());
+/// ```
+///
+/// # Example: extracting capture groups
+///
+/// A common way to use regexes is with capture groups. That is, instead of
+/// just looking for matches of an entire regex, parentheses are used to create
+/// groups that represent part of the match.
+///
+/// For example, consider a haystack with multiple lines, and each line has
+/// three whitespace delimited fields where the second field is expected to be
+/// a number and the third field a boolean. To make this convenient, we use
+/// the [`Captures::extract`] API to put the strings that match each group
+/// into a fixed size array:
+///
+/// ```
+/// use regex::Regex;
+///
+/// let hay = "
+/// rabbit 54 true
+/// groundhog 2 true
+/// does not match
+/// fox 109 false
+/// ";
+/// let re = Regex::new(r"(?m)^\s*(\S+)\s+([0-9]+)\s+(true|false)\s*$").unwrap();
+/// let mut fields: Vec<(&str, i64, bool)> = vec![];
+/// for (_, [f1, f2, f3]) in re.captures_iter(hay).map(|caps| caps.extract()) {
+/// fields.push((f1, f2.parse()?, f3.parse()?));
+/// }
+/// assert_eq!(fields, vec![
+/// ("rabbit", 54, true),
+/// ("groundhog", 2, true),
+/// ("fox", 109, false),
+/// ]);
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+///
+/// # Example: searching with the `Pattern` trait
+///
+/// **Note**: This section requires that this crate is compiled with the
+/// `pattern` Cargo feature enabled, which **requires nightly Rust**.
+///
+/// Since `Regex` implements `Pattern` from the standard library, one can
+/// use regexes with methods defined on `&str`. For example, `is_match`,
+/// `find`, `find_iter` and `split` can, in some cases, be replaced with
+/// `str::contains`, `str::find`, `str::match_indices` and `str::split`.
+///
+/// Here are some examples:
+///
+/// ```ignore
+/// use regex::Regex;
+///
+/// let re = Regex::new(r"\d+").unwrap();
+/// let hay = "a111b222c";
+///
+/// assert!(hay.contains(&re));
+/// assert_eq!(hay.find(&re), Some(1));
+/// assert_eq!(hay.match_indices(&re).collect::<Vec<_>>(), vec![
+/// (1, "111"),
+/// (5, "222"),
+/// ]);
+/// assert_eq!(hay.split(&re).collect::<Vec<_>>(), vec!["a", "b", "c"]);
+/// ```
+#[derive(Clone)]
+pub struct Regex {
+ pub(crate) meta: meta::Regex,
+ pub(crate) pattern: Arc<str>,
+}
+
+impl core::fmt::Display for Regex {
+ /// Shows the original regular expression.
+ fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+ write!(f, "{}", self.as_str())
+ }
+}
+
+impl core::fmt::Debug for Regex {
+ /// Shows the original regular expression.
+ fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+ f.debug_tuple("Regex").field(&self.as_str()).finish()
+ }
+}
+
+impl core::str::FromStr for Regex {
+ type Err = Error;
+
+ /// Attempts to parse a string into a regular expression
+ fn from_str(s: &str) -> Result<Regex, Error> {
+ Regex::new(s)
+ }
+}
+
+impl TryFrom<&str> for Regex {
+ type Error = Error;
+
+ /// Attempts to parse a string into a regular expression
+ fn try_from(s: &str) -> Result<Regex, Error> {
+ Regex::new(s)
+ }
+}
+
+impl TryFrom<String> for Regex {
+ type Error = Error;
+
+ /// Attempts to parse a string into a regular expression
+ fn try_from(s: String) -> Result<Regex, Error> {
+ Regex::new(&s)
+ }
+}
+
+/// Core regular expression methods.
+impl Regex {
+ /// Compiles a regular expression. Once compiled, it can be used repeatedly
+ /// to search, split or replace substrings in a haystack.
+ ///
+ /// Note that regex compilation tends to be a somewhat expensive process,
+ /// and unlike higher level environments, compilation is not automatically
+ /// cached for you. One should endeavor to compile a regex once and then
+ /// reuse it. For example, it's a bad idea to compile the same regex
+ /// repeatedly in a loop.
+ ///
+ /// # Errors
+ ///
+ /// If an invalid pattern is given, then an error is returned.
+ /// An error is also returned if the pattern is valid, but would
+ /// produce a regex that is bigger than the configured size limit via
+ /// [`RegexBuilder::size_limit`]. (A reasonable size limit is enabled by
+ /// default.)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::Regex;
+ ///
+ /// // An invalid pattern because of an unclosed parenthesis
+ /// assert!(Regex::new(r"foo(bar").is_err());
+ /// // An invalid pattern because the regex would be too big
+ /// // because Unicode tends to inflate things.
+ /// assert!(Regex::new(r"\w{1000}").is_err());
+ /// // Disabling Unicode can make the regex much smaller,
+ /// // potentially by up to or more than an order of magnitude.
+ /// assert!(Regex::new(r"(?-u:\w){1000}").is_ok());
+ /// ```
+ pub fn new(re: &str) -> Result<Regex, Error> {
+ RegexBuilder::new(re).build()
+ }
+
+ /// Returns true if and only if there is a match for the regex anywhere
+ /// in the haystack given.
+ ///
+ /// It is recommended to use this method if all you need to do is test
+ /// whether a match exists, since the underlying matching engine may be
+ /// able to do less work.
+ ///
+ /// # Example
+ ///
+ /// Test if some haystack contains at least one word with exactly 13
+ /// Unicode word characters:
+ ///
+ /// ```
+ /// use regex::Regex;
+ ///
+ /// let re = Regex::new(r"\b\w{13}\b").unwrap();
+ /// let hay = "I categorically deny having triskaidekaphobia.";
+ /// assert!(re.is_match(hay));
+ /// ```
+ #[inline]
+ pub fn is_match(&self, haystack: &str) -> bool {
+ self.is_match_at(haystack, 0)
+ }
+
+ /// This routine searches for the first match of this regex in the
+ /// haystack given, and if found, returns a [`Match`]. The `Match`
+ /// provides access to both the byte offsets of the match and the actual
+ /// substring that matched.
+ ///
+ /// Note that this should only be used if you want to find the entire
+ /// match. If instead you just want to test the existence of a match,
+ /// it's potentially faster to use `Regex::is_match(hay)` instead of
+ /// `Regex::find(hay).is_some()`.
+ ///
+ /// # Example
+ ///
+ /// Find the first word with exactly 13 Unicode word characters:
+ ///
+ /// ```
+ /// use regex::Regex;
+ ///
+ /// let re = Regex::new(r"\b\w{13}\b").unwrap();
+ /// let hay = "I categorically deny having triskaidekaphobia.";
+ /// let mat = re.find(hay).unwrap();
+ /// assert_eq!(2..15, mat.range());
+ /// assert_eq!("categorically", mat.as_str());
+ /// ```
+ #[inline]
+ pub fn find<'h>(&self, haystack: &'h str) -> Option<Match<'h>> {
+ self.find_at(haystack, 0)
+ }
+
+ /// Returns an iterator that yields successive non-overlapping matches in
+ /// the given haystack. The iterator yields values of type [`Match`].
+ ///
+ /// # Time complexity
+ ///
+ /// Note that since `find_iter` runs potentially many searches on the
+ /// haystack and since each search has worst case `O(m * n)` time
+ /// complexity, the overall worst case time complexity for iteration is
+ /// `O(m * n^2)`.
+ ///
+ /// # Example
+ ///
+ /// Find every word with exactly 13 Unicode word characters:
+ ///
+ /// ```
+ /// use regex::Regex;
+ ///
+ /// let re = Regex::new(r"\b\w{13}\b").unwrap();
+ /// let hay = "Retroactively relinquishing remunerations is reprehensible.";
+ /// let matches: Vec<_> = re.find_iter(hay).map(|m| m.as_str()).collect();
+ /// assert_eq!(matches, vec![
+ /// "Retroactively",
+ /// "relinquishing",
+ /// "remunerations",
+ /// "reprehensible",
+ /// ]);
+ /// ```
+ #[inline]
+ pub fn find_iter<'r, 'h>(&'r self, haystack: &'h str) -> Matches<'r, 'h> {
+ Matches { haystack, it: self.meta.find_iter(haystack) }
+ }
+
+ /// This routine searches for the first match of this regex in the haystack
+ /// given, and if found, returns not only the overall match but also the
+ /// matches of each capture group in the regex. If no match is found, then
+ /// `None` is returned.
+ ///
+ /// Capture group `0` always corresponds to an implicit unnamed group that
+ /// includes the entire match. If a match is found, this group is always
+ /// present. Subsequent groups may be named and are numbered, starting
+ /// at 1, by the order in which the opening parenthesis appears in the
+ /// pattern. For example, in the pattern `(?<a>.(?<b>.))(?<c>.)`, `a`,
+ /// `b` and `c` correspond to capture group indices `1`, `2` and `3`,
+ /// respectively.
+ ///
+ /// You should only use `captures` if you need access to the capture group
+ /// matches. Otherwise, [`Regex::find`] is generally faster for discovering
+ /// just the overall match.
+ ///
+ /// # Example
+ ///
+ /// Say you have some haystack with movie names and their release years,
+ /// like "'Citizen Kane' (1941)". It'd be nice if we could search for
+ /// substrings looking like that, while also extracting the movie name and
+ /// its release year separately. The example below shows how to do that.
+ ///
+ /// ```
+ /// use regex::Regex;
+ ///
+ /// let re = Regex::new(r"'([^']+)'\s+\((\d{4})\)").unwrap();
+ /// let hay = "Not my favorite movie: 'Citizen Kane' (1941).";
+ /// let caps = re.captures(hay).unwrap();
+ /// assert_eq!(caps.get(0).unwrap().as_str(), "'Citizen Kane' (1941)");
+ /// assert_eq!(caps.get(1).unwrap().as_str(), "Citizen Kane");
+ /// assert_eq!(caps.get(2).unwrap().as_str(), "1941");
+ /// // You can also access the groups by index using the Index notation.
+ /// // Note that this will panic on an invalid index. In this case, these
+ /// // accesses are always correct because the overall regex will only
+ /// // match when these capture groups match.
+ /// assert_eq!(&caps[0], "'Citizen Kane' (1941)");
+ /// assert_eq!(&caps[1], "Citizen Kane");
+ /// assert_eq!(&caps[2], "1941");
+ /// ```
+ ///
+ /// Note that the full match is at capture group `0`. Each subsequent
+ /// capture group is indexed by the order of its opening `(`.
+ ///
+ /// We can make this example a bit clearer by using *named* capture groups:
+ ///
+ /// ```
+ /// use regex::Regex;
+ ///
+ /// let re = Regex::new(r"'(?<title>[^']+)'\s+\((?<year>\d{4})\)").unwrap();
+ /// let hay = "Not my favorite movie: 'Citizen Kane' (1941).";
+ /// let caps = re.captures(hay).unwrap();
+ /// assert_eq!(caps.get(0).unwrap().as_str(), "'Citizen Kane' (1941)");
+ /// assert_eq!(caps.name("title").unwrap().as_str(), "Citizen Kane");
+ /// assert_eq!(caps.name("year").unwrap().as_str(), "1941");
+ /// // You can also access the groups by name using the Index notation.
+ /// // Note that this will panic on an invalid group name. In this case,
+ /// // these accesses are always correct because the overall regex will
+ /// // only match when these capture groups match.
+ /// assert_eq!(&caps[0], "'Citizen Kane' (1941)");
+ /// assert_eq!(&caps["title"], "Citizen Kane");
+ /// assert_eq!(&caps["year"], "1941");
+ /// ```
+ ///
+ /// Here we name the capture groups, which we can access with the `name`
+ /// method or the `Index` notation with a `&str`. Note that the named
+ /// capture groups are still accessible with `get` or the `Index` notation
+ /// with a `usize`.
+ ///
+ /// The `0`th capture group is always unnamed, so it must always be
+ /// accessed with `get(0)` or `[0]`.
+ ///
+ /// Finally, one other way to get the matched substrings is with the
+ /// [`Captures::extract`] API:
+ ///
+ /// ```
+ /// use regex::Regex;
+ ///
+ /// let re = Regex::new(r"'([^']+)'\s+\((\d{4})\)").unwrap();
+ /// let hay = "Not my favorite movie: 'Citizen Kane' (1941).";
+ /// let (full, [title, year]) = re.captures(hay).unwrap().extract();
+ /// assert_eq!(full, "'Citizen Kane' (1941)");
+ /// assert_eq!(title, "Citizen Kane");
+ /// assert_eq!(year, "1941");
+ /// ```
+ #[inline]
+ pub fn captures<'h>(&self, haystack: &'h str) -> Option<Captures<'h>> {
+ self.captures_at(haystack, 0)
+ }
+
+ /// Returns an iterator that yields successive non-overlapping matches in
+ /// the given haystack. The iterator yields values of type [`Captures`].
+ ///
+ /// This is the same as [`Regex::find_iter`], but instead of only providing
+ /// access to the overall match, each value yielded includes access to the
+ /// matches of all capture groups in the regex. Reporting this extra match
+ /// data is potentially costly, so callers should only use `captures_iter`
+ /// over `find_iter` when they actually need access to the capture group
+ /// matches.
+ ///
+ /// # Time complexity
+ ///
+ /// Note that since `captures_iter` runs potentially many searches on the
+ /// haystack and since each search has worst case `O(m * n)` time
+ /// complexity, the overall worst case time complexity for iteration is
+ /// `O(m * n^2)`.
+ ///
+ /// # Example
+ ///
+ /// We can use this to find all movie titles and their release years in
+ /// some haystack, where the movie is formatted like "'Title' (xxxx)":
+ ///
+ /// ```
+ /// use regex::Regex;
+ ///
+ /// let re = Regex::new(r"'([^']+)'\s+\(([0-9]{4})\)").unwrap();
+ /// let hay = "'Citizen Kane' (1941), 'The Wizard of Oz' (1939), 'M' (1931).";
+ /// let mut movies = vec![];
+ /// for (_, [title, year]) in re.captures_iter(hay).map(|c| c.extract()) {
+ /// movies.push((title, year.parse::<i64>()?));
+ /// }
+ /// assert_eq!(movies, vec![
+ /// ("Citizen Kane", 1941),
+ /// ("The Wizard of Oz", 1939),
+ /// ("M", 1931),
+ /// ]);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// Or with named groups:
+ ///
+ /// ```
+ /// use regex::Regex;
+ ///
+ /// let re = Regex::new(r"'(?<title>[^']+)'\s+\((?<year>[0-9]{4})\)").unwrap();
+ /// let hay = "'Citizen Kane' (1941), 'The Wizard of Oz' (1939), 'M' (1931).";
+ /// let mut it = re.captures_iter(hay);
+ ///
+ /// let caps = it.next().unwrap();
+ /// assert_eq!(&caps["title"], "Citizen Kane");
+ /// assert_eq!(&caps["year"], "1941");
+ ///
+ /// let caps = it.next().unwrap();
+ /// assert_eq!(&caps["title"], "The Wizard of Oz");
+ /// assert_eq!(&caps["year"], "1939");
+ ///
+ /// let caps = it.next().unwrap();
+ /// assert_eq!(&caps["title"], "M");
+ /// assert_eq!(&caps["year"], "1931");
+ /// ```
+ #[inline]
+ pub fn captures_iter<'r, 'h>(
+     &'r self,
+     haystack: &'h str,
+ ) -> CaptureMatches<'r, 'h> {
+     // Build the underlying iterator first, then pair it with the haystack
+     // in the public wrapper type.
+     let it = self.meta.captures_iter(haystack);
+     CaptureMatches { haystack, it }
+ }
+
+ /// Returns an iterator of substrings of the haystack given, delimited by a
+ /// match of the regex. Namely, each element of the iterator corresponds to
+ /// a part of the haystack that *isn't* matched by the regular expression.
+ ///
+ /// # Time complexity
+ ///
+ /// Since iterating over all matches requires running potentially many
+ /// searches on the haystack, and since each search has worst case
+ /// `O(m * n)` time complexity, the overall worst case time complexity for
+ /// this routine is `O(m * n^2)`.
+ ///
+ /// # Example
+ ///
+ /// To split a string delimited by arbitrary amounts of spaces or tabs:
+ ///
+ /// ```
+ /// use regex::Regex;
+ ///
+ /// let re = Regex::new(r"[ \t]+").unwrap();
+ /// let hay = "a b \t c\td e";
+ /// let fields: Vec<&str> = re.split(hay).collect();
+ /// assert_eq!(fields, vec!["a", "b", "c", "d", "e"]);
+ /// ```
+ ///
+ /// # Example: more cases
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use regex::Regex;
+ ///
+ /// let re = Regex::new(r" ").unwrap();
+ /// let hay = "Mary had a little lamb";
+ /// let got: Vec<&str> = re.split(hay).collect();
+ /// assert_eq!(got, vec!["Mary", "had", "a", "little", "lamb"]);
+ ///
+ /// let re = Regex::new(r"X").unwrap();
+ /// let hay = "";
+ /// let got: Vec<&str> = re.split(hay).collect();
+ /// assert_eq!(got, vec![""]);
+ ///
+ /// let re = Regex::new(r"X").unwrap();
+ /// let hay = "lionXXtigerXleopard";
+ /// let got: Vec<&str> = re.split(hay).collect();
+ /// assert_eq!(got, vec!["lion", "", "tiger", "leopard"]);
+ ///
+ /// let re = Regex::new(r"::").unwrap();
+ /// let hay = "lion::tiger::leopard";
+ /// let got: Vec<&str> = re.split(hay).collect();
+ /// assert_eq!(got, vec!["lion", "tiger", "leopard"]);
+ /// ```
+ ///
+ /// If a haystack contains multiple contiguous matches, you will end up
+ /// with empty spans yielded by the iterator:
+ ///
+ /// ```
+ /// use regex::Regex;
+ ///
+ /// let re = Regex::new(r"X").unwrap();
+ /// let hay = "XXXXaXXbXc";
+ /// let got: Vec<&str> = re.split(hay).collect();
+ /// assert_eq!(got, vec!["", "", "", "", "a", "", "b", "c"]);
+ ///
+ /// let re = Regex::new(r"/").unwrap();
+ /// let hay = "(///)";
+ /// let got: Vec<&str> = re.split(hay).collect();
+ /// assert_eq!(got, vec!["(", "", "", ")"]);
+ /// ```
+ ///
+ /// Separators at the start or end of a haystack are neighbored by an empty
+ /// substring.
+ ///
+ /// ```
+ /// use regex::Regex;
+ ///
+ /// let re = Regex::new(r"0").unwrap();
+ /// let hay = "010";
+ /// let got: Vec<&str> = re.split(hay).collect();
+ /// assert_eq!(got, vec!["", "1", ""]);
+ /// ```
+ ///
+ /// When the empty string is used as a regex, it splits at every valid
+ /// UTF-8 boundary by default (which includes the beginning and end of the
+ /// haystack):
+ ///
+ /// ```
+ /// use regex::Regex;
+ ///
+ /// let re = Regex::new(r"").unwrap();
+ /// let hay = "rust";
+ /// let got: Vec<&str> = re.split(hay).collect();
+ /// assert_eq!(got, vec!["", "r", "u", "s", "t", ""]);
+ ///
+ /// // Splitting by an empty string is UTF-8 aware by default!
+ /// let re = Regex::new(r"").unwrap();
+ /// let hay = "☃";
+ /// let got: Vec<&str> = re.split(hay).collect();
+ /// assert_eq!(got, vec!["", "☃", ""]);
+ /// ```
+ ///
+ /// Contiguous separators (which commonly show up with whitespace) can lead to
+ /// possibly surprising behavior. For example, this code is correct:
+ ///
+ /// ```
+ /// use regex::Regex;
+ ///
+ /// let re = Regex::new(r" ").unwrap();
+ /// let hay = " a b c";
+ /// let got: Vec<&str> = re.split(hay).collect();
+ /// assert_eq!(got, vec!["", "", "", "", "a", "", "b", "c"]);
+ /// ```
+ ///
+ /// It does *not* give you `["a", "b", "c"]`. For that behavior, you'd want
+ /// to match contiguous space characters:
+ ///
+ /// ```
+ /// use regex::Regex;
+ ///
+ /// let re = Regex::new(r" +").unwrap();
+ /// let hay = " a b c";
+ /// let got: Vec<&str> = re.split(hay).collect();
+ /// // N.B. This does still include a leading empty span because ' +'
+ /// // matches at the beginning of the haystack.
+ /// assert_eq!(got, vec!["", "a", "b", "c"]);
+ /// ```
+ #[inline]
+ pub fn split<'r, 'h>(&'r self, haystack: &'h str) -> Split<'r, 'h> {
+     // The wrapper carries the haystack next to the underlying split
+     // iterator.
+     let it = self.meta.split(haystack);
+     Split { haystack, it }
+ }
+
+ /// Returns an iterator of at most `limit` substrings of the haystack
+ /// given, delimited by a match of the regex. (A `limit` of `0` will return
+ /// no substrings.) Namely, each element of the iterator corresponds to a
+ /// part of the haystack that *isn't* matched by the regular expression.
+ /// The remainder of the haystack that is not split will be the last
+ /// element in the iterator.
+ ///
+ /// # Time complexity
+ ///
+ /// Since iterating over all matches requires running potentially many
+ /// searches on the haystack, and since each search has worst case
+ /// `O(m * n)` time complexity, the overall worst case time complexity for
+ /// this routine is `O(m * n^2)`.
+ ///
+ /// Although note that the worst case time here has an upper bound given
+ /// by the `limit` parameter.
+ ///
+ /// # Example
+ ///
+ /// Get the first two words in some haystack:
+ ///
+ /// ```
+ /// use regex::Regex;
+ ///
+ /// let re = Regex::new(r"\W+").unwrap();
+ /// let hay = "Hey! How are you?";
+ /// let fields: Vec<&str> = re.splitn(hay, 3).collect();
+ /// assert_eq!(fields, vec!["Hey", "How", "are you?"]);
+ /// ```
+ ///
+ /// # Examples: more cases
+ ///
+ /// ```
+ /// use regex::Regex;
+ ///
+ /// let re = Regex::new(r" ").unwrap();
+ /// let hay = "Mary had a little lamb";
+ /// let got: Vec<&str> = re.splitn(hay, 3).collect();
+ /// assert_eq!(got, vec!["Mary", "had", "a little lamb"]);
+ ///
+ /// let re = Regex::new(r"X").unwrap();
+ /// let hay = "";
+ /// let got: Vec<&str> = re.splitn(hay, 3).collect();
+ /// assert_eq!(got, vec![""]);
+ ///
+ /// let re = Regex::new(r"X").unwrap();
+ /// let hay = "lionXXtigerXleopard";
+ /// let got: Vec<&str> = re.splitn(hay, 3).collect();
+ /// assert_eq!(got, vec!["lion", "", "tigerXleopard"]);
+ ///
+ /// let re = Regex::new(r"::").unwrap();
+ /// let hay = "lion::tiger::leopard";
+ /// let got: Vec<&str> = re.splitn(hay, 2).collect();
+ /// assert_eq!(got, vec!["lion", "tiger::leopard"]);
+ ///
+ /// let re = Regex::new(r"X").unwrap();
+ /// let hay = "abcXdef";
+ /// let got: Vec<&str> = re.splitn(hay, 1).collect();
+ /// assert_eq!(got, vec!["abcXdef"]);
+ ///
+ /// let re = Regex::new(r"X").unwrap();
+ /// let hay = "abcdef";
+ /// let got: Vec<&str> = re.splitn(hay, 2).collect();
+ /// assert_eq!(got, vec!["abcdef"]);
+ ///
+ /// let re = Regex::new(r"X").unwrap();
+ /// let hay = "abcXdef";
+ /// let got: Vec<&str> = re.splitn(hay, 0).collect();
+ /// assert!(got.is_empty());
+ /// ```
+ #[inline]
+ pub fn splitn<'r, 'h>(
+     &'r self,
+     haystack: &'h str,
+     limit: usize,
+ ) -> SplitN<'r, 'h> {
+     // `limit` is handed to the underlying iterator as-is; this method only
+     // wraps the result together with the haystack.
+     let it = self.meta.splitn(haystack, limit);
+     SplitN { haystack, it }
+ }
+
+ /// Replaces the leftmost-first match in the given haystack with the
+ /// replacement provided. The replacement can be a regular string (where
+ /// `$N` and `$name` are expanded to match capture groups) or a function
+ /// that takes a [`Captures`] and returns the replaced string.
+ ///
+ /// If no match is found, then the haystack is returned unchanged. In that
+ /// case, this implementation will likely return a `Cow::Borrowed` value
+ /// such that no allocation is performed.
+ ///
+ /// # Replacement string syntax
+ ///
+ /// All instances of `$ref` in the replacement string are replaced with
+ /// the substring corresponding to the capture group identified by `ref`.
+ ///
+ /// `ref` may be an integer corresponding to the index of the capture group
+ /// (counted by order of opening parenthesis where `0` is the entire match)
+ /// or it can be a name (consisting of letters, digits or underscores)
+ /// corresponding to a named capture group.
+ ///
+ /// If `ref` isn't a valid capture group (whether the name doesn't exist or
+ /// isn't a valid index), then it is replaced with the empty string.
+ ///
+ /// The longest possible name is used. For example, `$1a` looks up the
+ /// capture group named `1a` and not the capture group at index `1`. To
+ /// exert more precise control over the name, use braces, e.g., `${1}a`.
+ ///
+ /// To write a literal `$` use `$$`.
+ ///
+ /// # Example
+ ///
+ /// Note that this function is polymorphic with respect to the replacement.
+ /// In typical usage, this can just be a normal string:
+ ///
+ /// ```
+ /// use regex::Regex;
+ ///
+ /// let re = Regex::new(r"[^01]+").unwrap();
+ /// assert_eq!(re.replace("1078910", ""), "1010");
+ /// ```
+ ///
+ /// But anything satisfying the [`Replacer`] trait will work. For example,
+ /// a closure of type `|&Captures| -> String` provides direct access to the
+ /// captures corresponding to a match. This allows one to access capturing
+ /// group matches easily:
+ ///
+ /// ```
+ /// use regex::{Captures, Regex};
+ ///
+ /// let re = Regex::new(r"([^,\s]+),\s+(\S+)").unwrap();
+ /// let result = re.replace("Springsteen, Bruce", |caps: &Captures| {
+ /// format!("{} {}", &caps[2], &caps[1])
+ /// });
+ /// assert_eq!(result, "Bruce Springsteen");
+ /// ```
+ ///
+ /// But this is a bit cumbersome to use all the time. Instead, a simple
+ /// syntax is supported (as described above) that expands `$name` into the
+ /// corresponding capture group. Here's the last example, but using this
+ /// expansion technique with named capture groups:
+ ///
+ /// ```
+ /// use regex::Regex;
+ ///
+ /// let re = Regex::new(r"(?<last>[^,\s]+),\s+(?<first>\S+)").unwrap();
+ /// let result = re.replace("Springsteen, Bruce", "$first $last");
+ /// assert_eq!(result, "Bruce Springsteen");
+ /// ```
+ ///
+ /// Note that using `$2` instead of `$first` or `$1` instead of `$last`
+ /// would produce the same result. To write a literal `$` use `$$`.
+ ///
+ /// Sometimes the replacement string requires use of curly braces to
+ /// delineate a capture group replacement when it is adjacent to some other
+ /// literal text. For example, if we wanted to join two words together with
+ /// an underscore:
+ ///
+ /// ```
+ /// use regex::Regex;
+ ///
+ /// let re = Regex::new(r"(?<first>\w+)\s+(?<second>\w+)").unwrap();
+ /// let result = re.replace("deep fried", "${first}_$second");
+ /// assert_eq!(result, "deep_fried");
+ /// ```
+ ///
+ /// Without the curly braces, the capture group name `first_` would be
+ /// used, and since it doesn't exist, it would be replaced with the empty
+ /// string.
+ ///
+ /// Finally, sometimes you just want to replace a literal string with no
+ /// regard for capturing group expansion. This can be done by wrapping a
+ /// string with [`NoExpand`]:
+ ///
+ /// ```
+ /// use regex::{NoExpand, Regex};
+ ///
+ /// let re = Regex::new(r"(?<last>[^,\s]+),\s+(\S+)").unwrap();
+ /// let result = re.replace("Springsteen, Bruce", NoExpand("$2 $last"));
+ /// assert_eq!(result, "$2 $last");
+ /// ```
+ ///
+ /// Using `NoExpand` may also be faster, since the replacement string won't
+ /// need to be parsed for the `$` syntax.
+ #[inline]
+ pub fn replace<'h, R: Replacer>(
+     &self,
+     haystack: &'h str,
+     rep: R,
+ ) -> Cow<'h, str> {
+     // Replacing only the first match is `replacen` with a limit of one.
+     let limit = 1;
+     self.replacen(haystack, limit, rep)
+ }
+
+ /// Replaces all non-overlapping matches in the haystack with the
+ /// replacement provided. This is the same as calling `replacen` with
+ /// `limit` set to `0`.
+ ///
+ /// The documentation for [`Regex::replace`] goes into more detail about
+ /// what kinds of replacement strings are supported.
+ ///
+ /// # Time complexity
+ ///
+ /// Since iterating over all matches requires running potentially many
+ /// searches on the haystack, and since each search has worst case
+ /// `O(m * n)` time complexity, the overall worst case time complexity for
+ /// this routine is `O(m * n^2)`.
+ ///
+ /// # Fallibility
+ ///
+ /// If you need to write a replacement routine where any individual
+ /// replacement might "fail," doing so with this API isn't really feasible
+ /// because there's no way to stop the search process if a replacement
+ /// fails. Instead, if you need this functionality, you should consider
+ /// implementing your own replacement routine:
+ ///
+ /// ```
+ /// use regex::{Captures, Regex};
+ ///
+ /// fn replace_all<E>(
+ /// re: &Regex,
+ /// haystack: &str,
+ /// replacement: impl Fn(&Captures) -> Result<String, E>,
+ /// ) -> Result<String, E> {
+ /// let mut new = String::with_capacity(haystack.len());
+ /// let mut last_match = 0;
+ /// for caps in re.captures_iter(haystack) {
+ /// let m = caps.get(0).unwrap();
+ /// new.push_str(&haystack[last_match..m.start()]);
+ /// new.push_str(&replacement(&caps)?);
+ /// last_match = m.end();
+ /// }
+ /// new.push_str(&haystack[last_match..]);
+ /// Ok(new)
+ /// }
+ ///
+ /// // Let's replace each word with the number of bytes in that word.
+ /// // But if we see a word that is "too long," we'll give up.
+ /// let re = Regex::new(r"\w+").unwrap();
+ /// let replacement = |caps: &Captures| -> Result<String, &'static str> {
+ /// if caps[0].len() >= 5 {
+ /// return Err("word too long");
+ /// }
+ /// Ok(caps[0].len().to_string())
+ /// };
+ /// assert_eq!(
+ /// Ok("2 3 3 3?".to_string()),
+ /// replace_all(&re, "hi how are you?", &replacement),
+ /// );
+ /// assert!(replace_all(&re, "hi there", &replacement).is_err());
+ /// ```
+ ///
+ /// # Example
+ ///
+ /// This example shows how to flip the order of whitespace (excluding line
+ /// terminators) delimited fields, and normalizes the whitespace that
+ /// delimits the fields:
+ ///
+ /// ```
+ /// use regex::Regex;
+ ///
+ /// let re = Regex::new(r"(?m)^(\S+)[\s--\r\n]+(\S+)$").unwrap();
+ /// let hay = "
+ /// Greetings 1973
+ /// Wild\t1973
+ /// BornToRun\t\t\t\t1975
+ /// Darkness 1978
+ /// TheRiver 1980
+ /// ";
+ /// let new = re.replace_all(hay, "$2 $1");
+ /// assert_eq!(new, "
+ /// 1973 Greetings
+ /// 1973 Wild
+ /// 1975 BornToRun
+ /// 1978 Darkness
+ /// 1980 TheRiver
+ /// ");
+ /// ```
+ #[inline]
+ pub fn replace_all<'h, R: Replacer>(
+     &self,
+     haystack: &'h str,
+     rep: R,
+ ) -> Cow<'h, str> {
+     // `replacen` treats a limit of zero as "replace every match".
+     let no_limit = 0;
+     self.replacen(haystack, no_limit, rep)
+ }
+
+ /// Replaces at most `limit` non-overlapping matches in the haystack with
+ /// the replacement provided. If `limit` is `0`, then all non-overlapping
+ /// matches are replaced. That is, `Regex::replace_all(hay, rep)` is
+ /// equivalent to `Regex::replacen(hay, 0, rep)`.
+ ///
+ /// The documentation for [`Regex::replace`] goes into more detail about
+ /// what kinds of replacement strings are supported.
+ ///
+ /// # Time complexity
+ ///
+ /// Since iterating over all matches requires running potentially many
+ /// searches on the haystack, and since each search has worst case
+ /// `O(m * n)` time complexity, the overall worst case time complexity for
+ /// this routine is `O(m * n^2)`.
+ ///
+ /// Although note that the worst case time here has an upper bound given
+ /// by the `limit` parameter.
+ ///
+ /// # Fallibility
+ ///
+ /// See the corresponding section in the docs for [`Regex::replace_all`]
+ /// for tips on how to deal with a replacement routine that can fail.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to flip the order of whitespace (excluding line
+ /// terminators) delimited fields, and normalizes the whitespace that
+ /// delimits the fields. But we only do it for the first two matches.
+ ///
+ /// ```
+ /// use regex::Regex;
+ ///
+ /// let re = Regex::new(r"(?m)^(\S+)[\s--\r\n]+(\S+)$").unwrap();
+ /// let hay = "
+ /// Greetings 1973
+ /// Wild\t1973
+ /// BornToRun\t\t\t\t1975
+ /// Darkness 1978
+ /// TheRiver 1980
+ /// ";
+ /// let new = re.replacen(hay, 2, "$2 $1");
+ /// assert_eq!(new, "
+ /// 1973 Greetings
+ /// 1973 Wild
+ /// BornToRun\t\t\t\t1975
+ /// Darkness 1978
+ /// TheRiver 1980
+ /// ");
+ /// ```
+ #[inline]
+ pub fn replacen<'h, R: Replacer>(
+     &self,
+     haystack: &'h str,
+     limit: usize,
+     mut rep: R,
+ ) -> Cow<'h, str> {
+     // A `limit` of zero means "no limit". Otherwise we are done as soon as
+     // the match with zero-based index `limit - 1` has been written out.
+     let hit_limit = |idx: usize| limit != 0 && idx >= limit - 1;
+
+     // Fast path: the replacer says it never expands capture references.
+     // That helps in two ways:
+     //
+     // 1) `find_iter` can be used instead of `captures_iter`, and not
+     //    asking for captures generally makes the regex engines faster.
+     // 2) The replacement text is constant, so it is simply appended at
+     //    each match without any per-match group lookups.
+     if let Some(literal) = rep.no_expansion() {
+         let mut matches = self.find_iter(haystack).enumerate().peekable();
+         if matches.peek().is_none() {
+             // Nothing matched, so the haystack is returned without copying.
+             return Cow::Borrowed(haystack);
+         }
+         let mut out = String::with_capacity(haystack.len());
+         let mut copied_to = 0;
+         for (idx, m) in matches {
+             out.push_str(&haystack[copied_to..m.start()]);
+             out.push_str(&literal);
+             copied_to = m.end();
+             if hit_limit(idx) {
+                 break;
+             }
+         }
+         // Whatever follows the last replaced match is copied verbatim.
+         out.push_str(&haystack[copied_to..]);
+         return Cow::Owned(out);
+     }
+
+     // General path: the replacer may consult capture groups, so each match
+     // is reported with its full set of captures.
+     let mut matches = self.captures_iter(haystack).enumerate().peekable();
+     if matches.peek().is_none() {
+         return Cow::Borrowed(haystack);
+     }
+     let mut out = String::with_capacity(haystack.len());
+     let mut copied_to = 0;
+     for (idx, caps) in matches {
+         // Unwrapping group 0 is fine: captures are only reported for
+         // matches, and group 0 is the overall match.
+         let whole = caps.get(0).unwrap();
+         out.push_str(&haystack[copied_to..whole.start()]);
+         rep.replace_append(&caps, &mut out);
+         copied_to = whole.end();
+         if hit_limit(idx) {
+             break;
+         }
+     }
+     out.push_str(&haystack[copied_to..]);
+     Cow::Owned(out)
+ }
+}
+
+/// A group of advanced or "lower level" search methods. Some methods permit
+/// starting the search at a position greater than `0` in the haystack. Other
+/// methods permit reusing allocations, for example, when extracting the
+/// matches for capture groups.
+impl Regex {
+ /// Returns the end byte offset of the first match in the haystack given.
+ ///
+ /// This method may have the same performance characteristics as
+ /// `is_match`. Behaviorally, it doesn't just report whether a match
+ /// occurs, but also the end offset for a match. In particular, the offset
+ /// returned *may be shorter* than the proper end of the leftmost-first
+ /// match that you would find via [`Regex::find`].
+ ///
+ /// Note that it is not guaranteed that this routine finds the shortest or
+ /// "earliest" possible match. Instead, the main idea of this API is that
+ /// it returns the offset at the point at which the internal regex engine
+ /// has determined that a match has occurred. This may vary depending on
+ /// which internal regex engine is used, and thus, the offset itself may
+ /// change based on internal heuristics.
+ ///
+ /// # Example
+ ///
+ /// Typically, `a+` would match the entire first sequence of `a` in some
+ /// haystack, but `shortest_match` *may* give up as soon as it sees the
+ /// first `a`.
+ ///
+ /// ```
+ /// use regex::Regex;
+ ///
+ /// let re = Regex::new(r"a+").unwrap();
+ /// let offset = re.shortest_match("aaaaa").unwrap();
+ /// assert_eq!(offset, 1);
+ /// ```
+ #[inline]
+ pub fn shortest_match(&self, haystack: &str) -> Option<usize> {
+     // Same search as `shortest_match_at`, beginning at the very start of
+     // the haystack.
+     let start = 0;
+     self.shortest_match_at(haystack, start)
+ }
+
+ /// Returns the same as [`Regex::shortest_match`], but starts the search at
+ /// the given offset.
+ ///
+ /// The significance of the starting point is that it takes the surrounding
+ /// context into consideration. For example, the `\A` anchor can only match
+ /// when `start == 0`.
+ ///
+ /// If a match is found, the offset returned is relative to the beginning
+ /// of the haystack, not the beginning of the search.
+ ///
+ /// # Panics
+ ///
+ /// This panics when `start >= haystack.len() + 1`.
+ ///
+ /// # Example
+ ///
+ /// This example shows the significance of `start` by demonstrating how it
+ /// can be used to permit look-around assertions in a regex to take the
+ /// surrounding context into account.
+ ///
+ /// ```
+ /// use regex::Regex;
+ ///
+ /// let re = Regex::new(r"\bchew\b").unwrap();
+ /// let hay = "eschew";
+ /// // We get a match here, but it's probably not intended.
+ /// assert_eq!(re.shortest_match(&hay[2..]), Some(4));
+ /// // No match because the assertions take the context into account.
+ /// assert_eq!(re.shortest_match_at(hay, 2), None);
+ /// ```
+ #[inline]
+ pub fn shortest_match_at(
+     &self,
+     haystack: &str,
+     start: usize,
+ ) -> Option<usize> {
+     // The search is configured with `earliest(true)` and restricted to
+     // `start..haystack.len()`, while the input still refers to the whole
+     // haystack.
+     let span = start..haystack.len();
+     let input = Input::new(haystack).earliest(true).span(span);
+     let half = self.meta.search_half(&input)?;
+     Some(half.offset())
+ }
+
+ /// Returns the same as [`Regex::is_match`], but starts the search at the
+ /// given offset.
+ ///
+ /// The significance of the starting point is that it takes the surrounding
+ /// context into consideration. For example, the `\A` anchor can only
+ /// match when `start == 0`.
+ ///
+ /// # Panics
+ ///
+ /// This panics when `start >= haystack.len() + 1`.
+ ///
+ /// # Example
+ ///
+ /// This example shows the significance of `start` by demonstrating how it
+ /// can be used to permit look-around assertions in a regex to take the
+ /// surrounding context into account.
+ ///
+ /// ```
+ /// use regex::Regex;
+ ///
+ /// let re = Regex::new(r"\bchew\b").unwrap();
+ /// let hay = "eschew";
+ /// // We get a match here, but it's probably not intended.
+ /// assert!(re.is_match(&hay[2..]));
+ /// // No match because the assertions take the context into account.
+ /// assert!(!re.is_match_at(hay, 2));
+ /// ```
+ #[inline]
+ pub fn is_match_at(&self, haystack: &str, start: usize) -> bool {
+     // Only existence of a match is reported, so the half-match search
+     // result is reduced to a boolean.
+     let span = start..haystack.len();
+     let input = Input::new(haystack).earliest(true).span(span);
+     let found = self.meta.search_half(&input);
+     found.is_some()
+ }
+
+ /// Returns the same as [`Regex::find`], but starts the search at the given
+ /// offset.
+ ///
+ /// The significance of the starting point is that it takes the surrounding
+ /// context into consideration. For example, the `\A` anchor can only
+ /// match when `start == 0`.
+ ///
+ /// # Panics
+ ///
+ /// This panics when `start >= haystack.len() + 1`.
+ ///
+ /// # Example
+ ///
+ /// This example shows the significance of `start` by demonstrating how it
+ /// can be used to permit look-around assertions in a regex to take the
+ /// surrounding context into account.
+ ///
+ /// ```
+ /// use regex::Regex;
+ ///
+ /// let re = Regex::new(r"\bchew\b").unwrap();
+ /// let hay = "eschew";
+ /// // We get a match here, but it's probably not intended.
+ /// assert_eq!(re.find(&hay[2..]).map(|m| m.range()), Some(0..4));
+ /// // No match because the assertions take the context into account.
+ /// assert_eq!(re.find_at(hay, 2), None);
+ /// ```
+ #[inline]
+ pub fn find_at<'h>(
+     &self,
+     haystack: &'h str,
+     start: usize,
+ ) -> Option<Match<'h>> {
+     let span = start..haystack.len();
+     let input = Input::new(haystack).span(span);
+     // Bail out with `None` when the search finds nothing; otherwise
+     // re-wrap the reported offsets together with the haystack.
+     let found = self.meta.search(&input)?;
+     Some(Match::new(haystack, found.start(), found.end()))
+ }
+
+ /// Returns the same as [`Regex::captures`], but starts the search at the
+ /// given offset.
+ ///
+ /// The significance of the starting point is that it takes the surrounding
+ /// context into consideration. For example, the `\A` anchor can only
+ /// match when `start == 0`.
+ ///
+ /// # Panics
+ ///
+ /// This panics when `start >= haystack.len() + 1`.
+ ///
+ /// # Example
+ ///
+ /// This example shows the significance of `start` by demonstrating how it
+ /// can be used to permit look-around assertions in a regex to take the
+ /// surrounding context into account.
+ ///
+ /// ```
+ /// use regex::Regex;
+ ///
+ /// let re = Regex::new(r"\bchew\b").unwrap();
+ /// let hay = "eschew";
+ /// // We get a match here, but it's probably not intended.
+ /// assert_eq!(&re.captures(&hay[2..]).unwrap()[0], "chew");
+ /// // No match because the assertions take the context into account.
+ /// assert!(re.captures_at(hay, 2).is_none());
+ /// ```
+ #[inline]
+ pub fn captures_at<'h>(
+     &self,
+     haystack: &'h str,
+     start: usize,
+ ) -> Option<Captures<'h>> {
+     let span = start..haystack.len();
+     let input = Input::new(haystack).span(span);
+     // Create a fresh captures value and let the search fill it in.
+     let mut caps = self.meta.create_captures();
+     self.meta.search_captures(&input, &mut caps);
+     if !caps.is_match() {
+         return None;
+     }
+     let static_captures_len = self.static_captures_len();
+     Some(Captures { haystack, caps, static_captures_len })
+ }
+
+ /// This is like [`Regex::captures`], but writes the byte offsets of each
+ /// capture group match into the locations given.
+ ///
+ /// A [`CaptureLocations`] stores the same byte offsets as a [`Captures`],
+ /// but does *not* store a reference to the haystack. This makes its API
+ /// a bit lower level and less convenient. But in exchange, callers
+ /// may allocate their own `CaptureLocations` and reuse it for multiple
+ /// searches. This may be helpful if allocating a `Captures` shows up in a
+ /// profile as too costly.
+ ///
+ /// To create a `CaptureLocations` value, use the
+ /// [`Regex::capture_locations`] method.
+ ///
+ /// This also returns the overall match if one was found. When a match is
+ /// found, its offsets are also always stored in `locs` at index `0`.
+ ///
+ /// # Panics
+ ///
+ /// This routine may panic if the given `CaptureLocations` was not created
+ /// by this regex.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::Regex;
+ ///
+ /// let re = Regex::new(r"^([a-z]+)=(\S*)$").unwrap();
+ /// let mut locs = re.capture_locations();
+ /// assert!(re.captures_read(&mut locs, "id=foo123").is_some());
+ /// assert_eq!(Some((0, 9)), locs.get(0));
+ /// assert_eq!(Some((0, 2)), locs.get(1));
+ /// assert_eq!(Some((3, 9)), locs.get(2));
+ /// ```
+ #[inline]
+ pub fn captures_read<'h>(
+     &self,
+     locs: &mut CaptureLocations,
+     haystack: &'h str,
+ ) -> Option<Match<'h>> {
+     // Equivalent to the positional variant with the search beginning at
+     // byte offset zero.
+     let start = 0;
+     self.captures_read_at(locs, haystack, start)
+ }
+
+ /// Returns the same as [`Regex::captures_read`], but starts the search at
+ /// the given offset.
+ ///
+ /// The significance of the starting point is that it takes the surrounding
+ /// context into consideration. For example, the `\A` anchor can only
+ /// match when `start == 0`.
+ ///
+ /// # Panics
+ ///
+ /// This panics when `start >= haystack.len() + 1`.
+ ///
+ /// This routine may also panic if the given `CaptureLocations` was not
+ /// created by this regex.
+ ///
+ /// # Example
+ ///
+ /// This example shows the significance of `start` by demonstrating how it
+ /// can be used to permit look-around assertions in a regex to take the
+ /// surrounding context into account.
+ ///
+ /// ```
+ /// use regex::Regex;
+ ///
+ /// let re = Regex::new(r"\bchew\b").unwrap();
+ /// let hay = "eschew";
+ /// let mut locs = re.capture_locations();
+ /// // We get a match here, but it's probably not intended.
+ /// assert!(re.captures_read(&mut locs, &hay[2..]).is_some());
+ /// // No match because the assertions take the context into account.
+ /// assert!(re.captures_read_at(&mut locs, hay, 2).is_none());
+ /// ```
+ #[inline]
+ pub fn captures_read_at<'h>(
+     &self,
+     locs: &mut CaptureLocations,
+     haystack: &'h str,
+     start: usize,
+ ) -> Option<Match<'h>> {
+     let span = start..haystack.len();
+     let input = Input::new(haystack).span(span);
+     // The search writes into the captures value wrapped by `locs`; the
+     // overall match, if there is one, is then read back from it.
+     self.meta.search_captures(&input, &mut locs.0);
+     let overall = locs.0.get_match()?;
+     Some(Match::new(haystack, overall.start(), overall.end()))
+ }
+
+ /// An undocumented alias for `captures_read_at`.
+ ///
+ /// The `regex-capi` crate previously used this routine, so to avoid
+ /// breaking that crate, we continue to provide the name as an undocumented
+ /// alias.
+ #[doc(hidden)]
+ #[inline]
+ pub fn read_captures_at<'h>(
+ &self,
+ locs: &mut CaptureLocations,
+ haystack: &'h str,
+ start: usize,
+ ) -> Option<Match<'h>> {
+ // Pure forwarding: every argument is passed to `captures_read_at`
+ // unchanged and its result is returned as-is.
+ self.captures_read_at(locs, haystack, start)
+ }
+}
+
+/// Auxiliary methods.
+impl Regex {
+ /// Returns the original string of this regex.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::Regex;
+ ///
+ /// let re = Regex::new(r"foo\w+bar").unwrap();
+ /// assert_eq!(re.as_str(), r"foo\w+bar");
+ /// ```
+ #[inline]
+ pub fn as_str(&self) -> &str {
+ // Hand back a borrow of the stored `pattern` field.
+ &self.pattern
+ }
+
+ /// Returns an iterator over the capture names in this regex.
+ ///
+ /// The iterator returned yields elements of type `Option<&str>`. That is,
+ /// the iterator yields values for all capture groups, even ones that are
+ /// unnamed. The order of the groups corresponds to the order of the group's
+ /// corresponding opening parenthesis.
+ ///
+ /// The first element of the iterator always yields the group corresponding
+ /// to the overall match, and this group is always unnamed. Therefore, the
+ /// iterator always yields at least one group.
+ ///
+ /// # Example
+ ///
+ /// This shows basic usage with a mix of named and unnamed capture groups:
+ ///
+ /// ```
+ /// use regex::Regex;
+ ///
+ /// let re = Regex::new(r"(?<a>.(?<b>.))(.)(?:.)(?<c>.)").unwrap();
+ /// let mut names = re.capture_names();
+ /// assert_eq!(names.next(), Some(None));
+ /// assert_eq!(names.next(), Some(Some("a")));
+ /// assert_eq!(names.next(), Some(Some("b")));
+ /// assert_eq!(names.next(), Some(None));
+ /// // the '(?:.)' group is non-capturing and so doesn't appear here!
+ /// assert_eq!(names.next(), Some(Some("c")));
+ /// assert_eq!(names.next(), None);
+ /// ```
+ ///
+ /// The iterator always yields at least one element, even for regexes with
+ /// no capture groups and even for regexes that can never match:
+ ///
+ /// ```
+ /// use regex::Regex;
+ ///
+ /// let re = Regex::new(r"").unwrap();
+ /// let mut names = re.capture_names();
+ /// assert_eq!(names.next(), Some(None));
+ /// assert_eq!(names.next(), None);
+ ///
+ /// let re = Regex::new(r"[a&&b]").unwrap();
+ /// let mut names = re.capture_names();
+ /// assert_eq!(names.next(), Some(None));
+ /// assert_eq!(names.next(), None);
+ /// ```
+ #[inline]
+ pub fn capture_names(&self) -> CaptureNames<'_> {
+     // The names are looked up under pattern ID zero.
+     let info = self.meta.group_info();
+     let names = info.pattern_names(PatternID::ZERO);
+     CaptureNames(names)
+ }
+
+ /// Returns the number of capture groups in this regex.
+ ///
+ /// This includes all named and unnamed groups, including the implicit
+ /// unnamed group that is always present and corresponds to the entire
+ /// match.
+ ///
+ /// Since the implicit unnamed group is always included in this length, the
+ /// length returned is guaranteed to be greater than zero.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::Regex;
+ ///
+ /// let re = Regex::new(r"foo").unwrap();
+ /// assert_eq!(1, re.captures_len());
+ ///
+ /// let re = Regex::new(r"(foo)").unwrap();
+ /// assert_eq!(2, re.captures_len());
+ ///
+ /// let re = Regex::new(r"(?<a>.(?<b>.))(.)(?:.)(?<c>.)").unwrap();
+ /// assert_eq!(5, re.captures_len());
+ ///
+ /// let re = Regex::new(r"[a&&b]").unwrap();
+ /// assert_eq!(1, re.captures_len());
+ /// ```
+ #[inline]
+ pub fn captures_len(&self) -> usize {
+     // The group count is looked up under pattern ID zero.
+     let info = self.meta.group_info();
+     info.group_len(PatternID::ZERO)
+ }
+
+ /// Returns the total number of capturing groups that appear in every
+ /// possible match.
+ ///
+ /// If the number of capture groups can vary depending on the match, then
+ /// this returns `None`. That is, a value is only returned when the number
+ /// of matching groups is invariant or "static."
+ ///
+ /// Note that like [`Regex::captures_len`], this **does** include the
+ /// implicit capturing group corresponding to the entire match. Therefore,
+ /// when a non-None value is returned, it is guaranteed to be at least `1`.
+ /// Stated differently, a return value of `Some(0)` is impossible.
+ ///
+ /// # Example
+ ///
+ /// This shows a few cases where a static number of capture groups is
+ /// available and a few cases where it is not.
+ ///
+ /// ```
+ /// use regex::Regex;
+ ///
+ /// let len = |pattern| {
+ /// Regex::new(pattern).map(|re| re.static_captures_len())
+ /// };
+ ///
+ /// assert_eq!(Some(1), len("a")?);
+ /// assert_eq!(Some(2), len("(a)")?);
+ /// assert_eq!(Some(2), len("(a)|(b)")?);
+ /// assert_eq!(Some(3), len("(a)(b)|(c)(d)")?);
+ /// assert_eq!(None, len("(a)|b")?);
+ /// assert_eq!(None, len("a|(b)")?);
+ /// assert_eq!(None, len("(b)*")?);
+ /// assert_eq!(Some(2), len("(b)+")?);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn static_captures_len(&self) -> Option<usize> {
+ self.meta.static_captures_len()
+ }
+
+ /// Returns a freshly allocated set of capture locations that can
+ /// be reused in multiple calls to [`Regex::captures_read`] or
+ /// [`Regex::captures_read_at`].
+ ///
+ /// The returned locations can be used for any subsequent search for this
+ /// particular regex. There is no guarantee that it is correct to use for
+ /// other regexes, even if they have the same number of capture groups.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::Regex;
+ ///
+ /// let re = Regex::new(r"(.)(.)(\w+)").unwrap();
+ /// let mut locs = re.capture_locations();
+ /// assert!(re.captures_read(&mut locs, "Padron").is_some());
+ /// assert_eq!(locs.get(0), Some((0, 6)));
+ /// assert_eq!(locs.get(1), Some((0, 1)));
+ /// assert_eq!(locs.get(2), Some((1, 2)));
+ /// assert_eq!(locs.get(3), Some((2, 6)));
+ /// ```
+ #[inline]
+ pub fn capture_locations(&self) -> CaptureLocations {
+ CaptureLocations(self.meta.create_captures())
+ }
+
+ /// An alias for `capture_locations` to preserve backward compatibility.
+ ///
+ /// The `regex-capi` crate used this method, so to avoid breaking that
+ /// crate, we continue to export it as an undocumented API.
+ #[doc(hidden)]
+ #[inline]
+ pub fn locations(&self) -> CaptureLocations {
+ self.capture_locations()
+ }
+}
+
+/// Represents a single match of a regex in a haystack.
+///
+/// A `Match` contains both the start and end byte offsets of the match and the
+/// actual substring corresponding to the range of those byte offsets. It is
+/// guaranteed that `start <= end`. When `start == end`, the match is empty.
+///
+/// Since this `Match` can only be produced by the top-level `Regex` APIs
+/// that only support searching UTF-8 encoded strings, the byte offsets for a
+/// `Match` are guaranteed to fall on valid UTF-8 codepoint boundaries. That
+/// is, slicing a `&str` with [`Match::range`] is guaranteed to never panic.
+///
+/// Values with this type are created by [`Regex::find`] or
+/// [`Regex::find_iter`]. Other APIs can create `Match` values too. For
+/// example, [`Captures::get`].
+///
+/// The lifetime parameter `'h` refers to the lifetime of the haystack that
+/// this match was produced from.
+///
+/// # Numbering
+///
+/// The byte offsets in a `Match` form a half-open interval. That is, the
+/// start of the range is inclusive and the end of the range is exclusive.
+/// For example, given a haystack `abcFOOxyz` and a match of `FOO`, its byte
+/// offset range starts at `3` and ends at `6`. `3` corresponds to `F` and
+/// `6` corresponds to `x`, which is one past the end of the match. This
+/// corresponds to the same kind of slicing that Rust uses.
+///
+/// For more on why this was chosen over other schemes (aside from being
+/// consistent with how Rust the language works), see [this discussion] and
+/// [Dijkstra's note on a related topic][note].
+///
+/// [this discussion]: https://github.com/rust-lang/regex/discussions/866
+/// [note]: https://www.cs.utexas.edu/users/EWD/transcriptions/EWD08xx/EWD831.html
+///
+/// # Example
+///
+/// This example shows the value of each of the methods on `Match` for a
+/// particular search.
+///
+/// ```
+/// use regex::Regex;
+///
+/// let re = Regex::new(r"\p{Greek}+").unwrap();
+/// let hay = "Greek: αβγδ";
+/// let m = re.find(hay).unwrap();
+/// assert_eq!(7, m.start());
+/// assert_eq!(15, m.end());
+/// assert!(!m.is_empty());
+/// assert_eq!(8, m.len());
+/// assert_eq!(7..15, m.range());
+/// assert_eq!("αβγδ", m.as_str());
+/// ```
+#[derive(Copy, Clone, Eq, PartialEq)]
+pub struct Match<'h> {
+ haystack: &'h str,
+ start: usize,
+ end: usize,
+}
+
+impl<'h> Match<'h> {
+ /// Returns the byte offset of the start of the match in the haystack. The
+ /// start of the match corresponds to the position where the match begins
+ /// and includes the first byte in the match.
+ ///
+ /// It is guaranteed that `Match::start() <= Match::end()`.
+ ///
+ /// This is guaranteed to fall on a valid UTF-8 codepoint boundary. That
+ /// is, it will never be an offset that appears between the UTF-8 code
+ /// units of a UTF-8 encoded Unicode scalar value. Consequently, it is
+ /// always safe to slice the corresponding haystack using this offset.
+ #[inline]
+ pub fn start(&self) -> usize {
+ self.start
+ }
+
+ /// Returns the byte offset of the end of the match in the haystack. The
+ /// end of the match corresponds to the byte immediately following the last
+ /// byte in the match. This means that `&slice[start..end]` works as one
+ /// would expect.
+ ///
+ /// It is guaranteed that `Match::start() <= Match::end()`.
+ ///
+ /// This is guaranteed to fall on a valid UTF-8 codepoint boundary. That
+ /// is, it will never be an offset that appears between the UTF-8 code
+ /// units of a UTF-8 encoded Unicode scalar value. Consequently, it is
+ /// always safe to slice the corresponding haystack using this offset.
+ #[inline]
+ pub fn end(&self) -> usize {
+ self.end
+ }
+
+ /// Returns true if and only if this match has a length of zero.
+ ///
+ /// Note that an empty match can only occur when the regex itself can
+ /// match the empty string. Here are some examples of regexes that can
+ /// all match the empty string: `^`, `^$`, `\b`, `a?`, `a*`, `a{0}`,
+ /// `(foo|\d+|quux)?`.
+ #[inline]
+ pub fn is_empty(&self) -> bool {
+ self.start == self.end
+ }
+
+ /// Returns the length, in bytes, of this match.
+ #[inline]
+ pub fn len(&self) -> usize {
+ self.end - self.start
+ }
+
+ /// Returns the range over the starting and ending byte offsets of the
+ /// match in the haystack.
+ ///
+ /// It is always correct to slice the original haystack searched with this
+ /// range. That is, because the offsets are guaranteed to fall on valid
+ /// UTF-8 boundaries, the range returned is always valid.
+ #[inline]
+ pub fn range(&self) -> core::ops::Range<usize> {
+ self.start..self.end
+ }
+
+ /// Returns the substring of the haystack that matched.
+ #[inline]
+ pub fn as_str(&self) -> &'h str {
+ &self.haystack[self.range()]
+ }
+
+ /// Creates a new match from the given haystack and byte offsets.
+ #[inline]
+ fn new(haystack: &'h str, start: usize, end: usize) -> Match<'h> {
+ Match { haystack, start, end }
+ }
+}
+
+impl<'h> core::fmt::Debug for Match<'h> {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ f.debug_struct("Match")
+ .field("start", &self.start)
+ .field("end", &self.end)
+ .field("string", &self.as_str())
+ .finish()
+ }
+}
+
+impl<'h> From<Match<'h>> for &'h str {
+ fn from(m: Match<'h>) -> &'h str {
+ m.as_str()
+ }
+}
+
+impl<'h> From<Match<'h>> for core::ops::Range<usize> {
+ fn from(m: Match<'h>) -> core::ops::Range<usize> {
+ m.range()
+ }
+}
+
+/// Represents the capture groups for a single match.
+///
+/// Capture groups refer to parts of a regex enclosed in parentheses. They can
+/// be optionally named. The purpose of capture groups is to be able to
+/// reference different parts of a match based on the original pattern. For
+/// example, say you want to match the individual letters in a 5-letter word:
+///
+/// ```text
+/// (?<first>\w)(\w)(?:\w)\w(?<last>\w)
+/// ```
+///
+/// This regex has 4 capture groups:
+///
+/// * The group at index `0` corresponds to the overall match. It is always
+/// present in every match and never has a name.
+/// * The group at index `1` with name `first` corresponding to the first
+/// letter.
+/// * The group at index `2` with no name corresponding to the second letter.
+/// * The group at index `3` with name `last` corresponding to the fifth and
+/// last letter.
+///
+/// Notice that `(?:\w)` was not listed above as a capture group despite it
+/// being enclosed in parentheses. That's because `(?:pattern)` is a special
+/// syntax that permits grouping but *without* capturing. The reason for not
+/// treating it as a capture is that tracking and reporting capture groups
+/// requires additional state that may lead to slower searches. So using as few
+/// capture groups as possible can help performance. (Although the difference
+/// in performance of a couple of capture groups is likely immaterial.)
+///
+/// Values with this type are created by [`Regex::captures`] or
+/// [`Regex::captures_iter`].
+///
+/// `'h` is the lifetime of the haystack that these captures were matched from.
+///
+/// # Example
+///
+/// ```
+/// use regex::Regex;
+///
+/// let re = Regex::new(r"(?<first>\w)(\w)(?:\w)\w(?<last>\w)").unwrap();
+/// let caps = re.captures("toady").unwrap();
+/// assert_eq!("toady", &caps[0]);
+/// assert_eq!("t", &caps["first"]);
+/// assert_eq!("o", &caps[2]);
+/// assert_eq!("y", &caps["last"]);
+/// ```
+pub struct Captures<'h> {
+ haystack: &'h str,
+ caps: captures::Captures,
+ static_captures_len: Option<usize>,
+}
+
+impl<'h> Captures<'h> {
+ /// Returns the `Match` associated with the capture group at index `i`. If
+ /// `i` does not correspond to a capture group, or if the capture group did
+ /// not participate in the match, then `None` is returned.
+ ///
+ /// When `i == 0`, this is guaranteed to return a non-`None` value.
+ ///
+ /// # Examples
+ ///
+ /// Get the substring that matched with a default of an empty string if the
+ /// group didn't participate in the match:
+ ///
+ /// ```
+ /// use regex::Regex;
+ ///
+ /// let re = Regex::new(r"[a-z]+(?:([0-9]+)|([A-Z]+))").unwrap();
+ /// let caps = re.captures("abc123").unwrap();
+ ///
+ /// let substr1 = caps.get(1).map_or("", |m| m.as_str());
+ /// let substr2 = caps.get(2).map_or("", |m| m.as_str());
+ /// assert_eq!(substr1, "123");
+ /// assert_eq!(substr2, "");
+ /// ```
+ #[inline]
+ pub fn get(&self, i: usize) -> Option<Match<'h>> {
+ self.caps
+ .get_group(i)
+ .map(|sp| Match::new(self.haystack, sp.start, sp.end))
+ }
+
+ /// Returns the `Match` associated with the capture group named `name`. If
+ /// `name` isn't a valid capture group or it refers to a group that didn't
+ /// match, then `None` is returned.
+ ///
+ /// Note that unlike `caps["name"]`, this returns a `Match` whose lifetime
+ /// matches the lifetime of the haystack in this `Captures` value.
+ /// Conversely, the substring returned by `caps["name"]` has a lifetime
+ /// of the `Captures` value, which is likely shorter than the lifetime of
+ /// the haystack. In some cases, it may be necessary to use this method to
+ /// access the matching substring instead of the `caps["name"]` notation.
+ ///
+ /// # Examples
+ ///
+ /// Get the substring that matched with a default of an empty string if the
+ /// group didn't participate in the match:
+ ///
+ /// ```
+ /// use regex::Regex;
+ ///
+ /// let re = Regex::new(
+ /// r"[a-z]+(?:(?<numbers>[0-9]+)|(?<letters>[A-Z]+))",
+ /// ).unwrap();
+ /// let caps = re.captures("abc123").unwrap();
+ ///
+ /// let numbers = caps.name("numbers").map_or("", |m| m.as_str());
+ /// let letters = caps.name("letters").map_or("", |m| m.as_str());
+ /// assert_eq!(numbers, "123");
+ /// assert_eq!(letters, "");
+ /// ```
+ #[inline]
+ pub fn name(&self, name: &str) -> Option<Match<'h>> {
+ self.caps
+ .get_group_by_name(name)
+ .map(|sp| Match::new(self.haystack, sp.start, sp.end))
+ }
+
+ /// This is a convenience routine for extracting the substrings
+ /// corresponding to matching capture groups.
+ ///
+ /// This returns a tuple where the first element corresponds to the full
+ /// substring of the haystack that matched the regex. The second element is
+ /// an array of substrings, with each corresponding to the substring that
+ /// matched for a particular capture group.
+ ///
+ /// # Panics
+ ///
+ /// This panics if the number of possible matching groups in this
+ /// `Captures` value is not fixed to `N` in all circumstances.
+ /// More precisely, this routine only works when `N` is equivalent to
+ /// [`Regex::static_captures_len`].
+ ///
+ /// Stated more plainly, if the number of matching capture groups in a
+ /// regex can vary from match to match, then this function always panics.
+ ///
+ /// For example, `(a)(b)|(c)` could produce two matching capture groups
+ /// or one matching capture group for any given match. Therefore, one
+ /// cannot use `extract` with such a pattern.
+ ///
+ /// But a pattern like `(a)(b)|(c)(d)` can be used with `extract` because
+ /// the number of capture groups in every match is always equivalent,
+ /// even if the capture _indices_ in each match are not.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::Regex;
+ ///
+ /// let re = Regex::new(r"([0-9]{4})-([0-9]{2})-([0-9]{2})").unwrap();
+ /// let hay = "On 2010-03-14, I became a Tenneessee lamb.";
+ /// let Some((full, [year, month, day])) =
+ /// re.captures(hay).map(|caps| caps.extract()) else { return };
+ /// assert_eq!("2010-03-14", full);
+ /// assert_eq!("2010", year);
+ /// assert_eq!("03", month);
+ /// assert_eq!("14", day);
+ /// ```
+ ///
+ /// # Example: iteration
+ ///
+ /// This example shows how to use this method when iterating over all
+ /// `Captures` matches in a haystack.
+ ///
+ /// ```
+ /// use regex::Regex;
+ ///
+ /// let re = Regex::new(r"([0-9]{4})-([0-9]{2})-([0-9]{2})").unwrap();
+ /// let hay = "1973-01-05, 1975-08-25 and 1980-10-18";
+ ///
+ /// let mut dates: Vec<(&str, &str, &str)> = vec![];
+ /// for (_, [y, m, d]) in re.captures_iter(hay).map(|c| c.extract()) {
+ /// dates.push((y, m, d));
+ /// }
+ /// assert_eq!(dates, vec![
+ /// ("1973", "01", "05"),
+ /// ("1975", "08", "25"),
+ /// ("1980", "10", "18"),
+ /// ]);
+ /// ```
+ ///
+ /// # Example: parsing different formats
+ ///
+ /// This API is particularly useful when you need to extract a particular
+ /// value that might occur in a different format. Consider, for example,
+ /// an identifier that might be in double quotes or single quotes:
+ ///
+ /// ```
+ /// use regex::Regex;
+ ///
+ /// let re = Regex::new(r#"id:(?:"([^"]+)"|'([^']+)')"#).unwrap();
+ /// let hay = r#"The first is id:"foo" and the second is id:'bar'."#;
+ /// let mut ids = vec![];
+ /// for (_, [id]) in re.captures_iter(hay).map(|c| c.extract()) {
+ /// ids.push(id);
+ /// }
+ /// assert_eq!(ids, vec!["foo", "bar"]);
+ /// ```
+ pub fn extract<const N: usize>(&self) -> (&'h str, [&'h str; N]) {
+ let len = self
+ .static_captures_len
+ .expect("number of capture groups can vary in a match")
+ .checked_sub(1)
+ .expect("number of groups is always greater than zero");
+ assert_eq!(N, len, "asked for {} groups, but must ask for {}", N, len);
+ // The regex-automata variant of extract is a bit more permissive.
+ // It doesn't require the number of matching capturing groups to be
+ // static, and you can even request fewer groups than what's there. So
+ // this is guaranteed to never panic because we've asserted above that
+ // the user has requested precisely the number of groups that must be
+ // present in any match for this regex.
+ self.caps.extract(self.haystack)
+ }
+
+ /// Expands all instances of `$ref` in `replacement` to the corresponding
+ /// capture group, and writes them to the `dst` buffer given. A `ref` can
+ /// be a capture group index or a name. If `ref` doesn't refer to a capture
+ /// group that participated in the match, then it is replaced with the
+ /// empty string.
+ ///
+ /// # Format
+ ///
+ /// The format of the replacement string supports two different kinds of
+ /// capture references: unbraced and braced.
+ ///
+ /// For the unbraced format, the format supported is `$ref` where `ref`
+ /// consists of characters in the class `[0-9A-Za-z_]`. `ref` is always
+ /// the longest possible parse. So for example, `$1a` corresponds to the
+ /// capture group named `1a` and not the capture group at index `1`. If
+ /// `ref` matches `^[0-9]+$`, then it is treated as a capture group index
+ /// itself and not a name.
+ ///
+ /// For the braced format, the format supported is `${ref}` where `ref` can
+ /// be any sequence of bytes except for `}`. If no closing brace occurs,
+ /// then it is not considered a capture reference. As with the unbraced
+ /// format, if `ref` matches `^[0-9]+$`, then it is treated as a capture
+ /// group index and not a name.
+ ///
+ /// The braced format is useful for exerting precise control over the name
+ /// of the capture reference. For example, `${1}a` corresponds to the
+ /// capture group reference `1` followed by the letter `a`, whereas `$1a`
+ /// (as mentioned above) corresponds to the capture group reference `1a`.
+ /// The braced format is also useful for expressing capture group names
+ /// that use characters not supported by the unbraced format. For example,
+ /// `${foo[bar].baz}` refers to the capture group named `foo[bar].baz`.
+ ///
+ /// If a capture group reference is found and it does not refer to a valid
+ /// capture group, then it will be replaced with the empty string.
+ ///
+ /// To write a literal `$`, use `$$`.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::Regex;
+ ///
+ /// let re = Regex::new(
+ /// r"(?<day>[0-9]{2})-(?<month>[0-9]{2})-(?<year>[0-9]{4})",
+ /// ).unwrap();
+ /// let hay = "On 14-03-2010, I became a Tenneessee lamb.";
+ /// let caps = re.captures(hay).unwrap();
+ ///
+ /// let mut dst = String::new();
+ /// caps.expand("year=$year, month=$month, day=$day", &mut dst);
+ /// assert_eq!(dst, "year=2010, month=03, day=14");
+ /// ```
+ #[inline]
+ pub fn expand(&self, replacement: &str, dst: &mut String) {
+ self.caps.interpolate_string_into(self.haystack, replacement, dst);
+ }
+
+ /// Returns an iterator over all capture groups. This includes both
+ /// matching and non-matching groups.
+ ///
+ /// The iterator always yields at least one matching group: the first group
+ /// (at index `0`) with no name. Subsequent groups are returned in the order
+ /// of their opening parenthesis in the regex.
+ ///
+ /// The elements yielded have type `Option<Match<'h>>`, where a non-`None`
+ /// value is present if the capture group matches.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::Regex;
+ ///
+ /// let re = Regex::new(r"(\w)(\d)?(\w)").unwrap();
+ /// let caps = re.captures("AZ").unwrap();
+ ///
+ /// let mut it = caps.iter();
+ /// assert_eq!(it.next().unwrap().map(|m| m.as_str()), Some("AZ"));
+ /// assert_eq!(it.next().unwrap().map(|m| m.as_str()), Some("A"));
+ /// assert_eq!(it.next().unwrap().map(|m| m.as_str()), None);
+ /// assert_eq!(it.next().unwrap().map(|m| m.as_str()), Some("Z"));
+ /// assert_eq!(it.next(), None);
+ /// ```
+ #[inline]
+ pub fn iter<'c>(&'c self) -> SubCaptureMatches<'c, 'h> {
+ SubCaptureMatches { haystack: self.haystack, it: self.caps.iter() }
+ }
+
+ /// Returns the total number of capture groups. This includes both
+ /// matching and non-matching groups.
+ ///
+ /// The length returned is always equivalent to the number of elements
+ /// yielded by [`Captures::iter`]. Consequently, the length is always
+ /// greater than zero since every `Captures` value always includes the
+ /// match for the entire regex.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::Regex;
+ ///
+ /// let re = Regex::new(r"(\w)(\d)?(\w)").unwrap();
+ /// let caps = re.captures("AZ").unwrap();
+ /// assert_eq!(caps.len(), 4);
+ /// ```
+ #[inline]
+ pub fn len(&self) -> usize {
+ self.caps.group_len()
+ }
+}
+
+impl<'h> core::fmt::Debug for Captures<'h> {
+ fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+ /// A little helper type to provide a nice map-like debug
+ /// representation for our capturing group spans.
+ ///
+ /// regex-automata has something similar, but it includes the pattern
+ /// ID in its debug output, which is confusing. It also doesn't include
+ /// the strings that match because a regex-automata `Captures` doesn't
+ /// borrow the haystack.
+ struct CapturesDebugMap<'a> {
+ caps: &'a Captures<'a>,
+ }
+
+ impl<'a> core::fmt::Debug for CapturesDebugMap<'a> {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ let mut map = f.debug_map();
+ let names =
+ self.caps.caps.group_info().pattern_names(PatternID::ZERO);
+ for (group_index, maybe_name) in names.enumerate() {
+ let key = Key(group_index, maybe_name);
+ match self.caps.get(group_index) {
+ None => map.entry(&key, &None::<()>),
+ Some(mat) => map.entry(&key, &Value(mat)),
+ };
+ }
+ map.finish()
+ }
+ }
+
+ struct Key<'a>(usize, Option<&'a str>);
+
+ impl<'a> core::fmt::Debug for Key<'a> {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ write!(f, "{}", self.0)?;
+ if let Some(name) = self.1 {
+ write!(f, "/{:?}", name)?;
+ }
+ Ok(())
+ }
+ }
+
+ struct Value<'a>(Match<'a>);
+
+ impl<'a> core::fmt::Debug for Value<'a> {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ write!(
+ f,
+ "{}..{}/{:?}",
+ self.0.start(),
+ self.0.end(),
+ self.0.as_str()
+ )
+ }
+ }
+
+ f.debug_tuple("Captures")
+ .field(&CapturesDebugMap { caps: self })
+ .finish()
+ }
+}
+
+/// Get a matching capture group's haystack substring by index.
+///
+/// The haystack substring returned can't outlive the `Captures` object if this
+/// method is used, because of how `Index` is defined (normally `a[i]` is part
+/// of `a` and can't outlive it). To work around this limitation, use
+/// [`Captures::get`] instead.
+///
+/// `'h` is the lifetime of the matched haystack, but the lifetime of the
+/// `&str` returned by this implementation is the lifetime of the `Captures`
+/// value itself.
+///
+/// # Panics
+///
+/// If there is no matching group at the given index.
+impl<'h> core::ops::Index<usize> for Captures<'h> {
+ type Output = str;
+
+ // The lifetime is written out to make it clear that the &str returned
+ // does NOT have a lifetime equivalent to 'h.
+ fn index<'a>(&'a self, i: usize) -> &'a str {
+ self.get(i)
+ .map(|m| m.as_str())
+ .unwrap_or_else(|| panic!("no group at index '{}'", i))
+ }
+}
+
+/// Get a matching capture group's haystack substring by name.
+///
+/// The haystack substring returned can't outlive the `Captures` object if this
+/// method is used, because of how `Index` is defined (normally `a[i]` is part
+/// of `a` and can't outlive it). To work around this limitation, use
+/// [`Captures::name`] instead.
+///
+/// `'h` is the lifetime of the matched haystack, but the lifetime of the
+/// `&str` returned by this implementation is the lifetime of the `Captures`
+/// value itself.
+///
+/// `'n` is the lifetime of the group name used to index the `Captures` value.
+///
+/// # Panics
+///
+/// If there is no matching group with the given name.
+impl<'h, 'n> core::ops::Index<&'n str> for Captures<'h> {
+ type Output = str;
+
+ fn index<'a>(&'a self, name: &'n str) -> &'a str {
+ self.name(name)
+ .map(|m| m.as_str())
+ .unwrap_or_else(|| panic!("no group named '{}'", name))
+ }
+}
+
+/// A low level representation of the byte offsets of each capture group.
+///
+/// You can think of this as a lower level [`Captures`], where this type does
+/// not support named capturing groups directly and it does not borrow the
+/// haystack that these offsets were matched on.
+///
+/// Primarily, this type is useful when using the lower level `Regex` APIs such
+/// as [`Regex::captures_read`], which permits amortizing the allocation in
+/// which capture match offsets are stored.
+///
+/// In order to build a value of this type, you'll need to call the
+/// [`Regex::capture_locations`] method. The value returned can then be reused
+/// in subsequent searches for that regex. Using it for other regexes may
+/// result in a panic or otherwise incorrect results.
+///
+/// # Example
+///
+/// This example shows how to create and use `CaptureLocations` in a search.
+///
+/// ```
+/// use regex::Regex;
+///
+/// let re = Regex::new(r"(?<first>\w+)\s+(?<last>\w+)").unwrap();
+/// let mut locs = re.capture_locations();
+/// let m = re.captures_read(&mut locs, "Bruce Springsteen").unwrap();
+/// assert_eq!(0..17, m.range());
+/// assert_eq!(Some((0, 17)), locs.get(0));
+/// assert_eq!(Some((0, 5)), locs.get(1));
+/// assert_eq!(Some((6, 17)), locs.get(2));
+///
+/// // Asking for an invalid capture group always returns None.
+/// assert_eq!(None, locs.get(3));
+/// # // literals are too big for 32-bit usize: #1041
+/// # #[cfg(target_pointer_width = "64")]
+/// assert_eq!(None, locs.get(34973498648));
+/// # #[cfg(target_pointer_width = "64")]
+/// assert_eq!(None, locs.get(9944060567225171988));
+/// ```
+#[derive(Clone, Debug)]
+pub struct CaptureLocations(captures::Captures);
+
+/// A type alias for `CaptureLocations` for backwards compatibility.
+///
+/// Previously, we exported `CaptureLocations` as `Locations` in an
+/// undocumented API. To prevent breaking that code (e.g., in `regex-capi`),
+/// we continue re-exporting the same undocumented API.
+#[doc(hidden)]
+pub type Locations = CaptureLocations;
+
+impl CaptureLocations {
+ /// Returns the start and end byte offsets of the capture group at index
+ /// `i`. This returns `None` if `i` is not a valid capture group or if the
+ /// capture group did not match.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::Regex;
+ ///
+ /// let re = Regex::new(r"(?<first>\w+)\s+(?<last>\w+)").unwrap();
+ /// let mut locs = re.capture_locations();
+ /// re.captures_read(&mut locs, "Bruce Springsteen").unwrap();
+ /// assert_eq!(Some((0, 17)), locs.get(0));
+ /// assert_eq!(Some((0, 5)), locs.get(1));
+ /// assert_eq!(Some((6, 17)), locs.get(2));
+ /// ```
+ #[inline]
+ pub fn get(&self, i: usize) -> Option<(usize, usize)> {
+ self.0.get_group(i).map(|sp| (sp.start, sp.end))
+ }
+
+ /// Returns the total number of capture groups (even if they didn't match).
+ /// That is, the length returned is unaffected by the result of a search.
+ ///
+ /// This is always at least `1` since every regex has at least `1`
+ /// capturing group that corresponds to the entire match.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::Regex;
+ ///
+ /// let re = Regex::new(r"(?<first>\w+)\s+(?<last>\w+)").unwrap();
+ /// let mut locs = re.capture_locations();
+ /// assert_eq!(3, locs.len());
+ /// re.captures_read(&mut locs, "Bruce Springsteen").unwrap();
+ /// assert_eq!(3, locs.len());
+ /// ```
+ ///
+ /// Notice that the length is always at least `1`, regardless of the regex:
+ ///
+ /// ```
+ /// use regex::Regex;
+ ///
+ /// let re = Regex::new(r"").unwrap();
+ /// let locs = re.capture_locations();
+ /// assert_eq!(1, locs.len());
+ ///
+ /// // [a&&b] is a regex that never matches anything.
+ /// let re = Regex::new(r"[a&&b]").unwrap();
+ /// let locs = re.capture_locations();
+ /// assert_eq!(1, locs.len());
+ /// ```
+ #[inline]
+ pub fn len(&self) -> usize {
+ // self.0.group_len() returns 0 if the underlying captures doesn't
+ // represent a match, but the behavior guaranteed for this method is
+ // that the length doesn't change based on a match or not.
+ self.0.group_info().group_len(PatternID::ZERO)
+ }
+
+ /// An alias for the `get` method for backwards compatibility.
+ ///
+ /// Previously, we exported `get` as `pos` in an undocumented API. To
+ /// prevent breaking that code (e.g., in `regex-capi`), we continue
+ /// re-exporting the same undocumented API.
+ #[doc(hidden)]
+ #[inline]
+ pub fn pos(&self, i: usize) -> Option<(usize, usize)> {
+ self.get(i)
+ }
+}
+
+/// An iterator over all non-overlapping matches in a haystack.
+///
+/// This iterator yields [`Match`] values. The iterator stops when no more
+/// matches can be found.
+///
+/// `'r` is the lifetime of the compiled regular expression and `'h` is the
+/// lifetime of the haystack.
+///
+/// This iterator is created by [`Regex::find_iter`].
+///
+/// # Time complexity
+///
+/// Note that since an iterator runs potentially many searches on the haystack
+/// and since each search has worst case `O(m * n)` time complexity, the
+/// overall worst case time complexity for iteration is `O(m * n^2)`.
+#[derive(Debug)]
+pub struct Matches<'r, 'h> {
+ haystack: &'h str,
+ it: meta::FindMatches<'r, 'h>,
+}
+
+impl<'r, 'h> Iterator for Matches<'r, 'h> {
+ type Item = Match<'h>;
+
+ #[inline]
+ fn next(&mut self) -> Option<Match<'h>> {
+ self.it
+ .next()
+ .map(|sp| Match::new(self.haystack, sp.start(), sp.end()))
+ }
+
+ #[inline]
+ fn count(self) -> usize {
+ // This can actually be up to 2x faster than calling `next()` until
+ // completion, because counting matches when using a DFA only requires
+ // finding the end of each match. But returning a `Match` via `next()`
+ // requires the start of each match which, with a DFA, requires a
+ // separate reverse scan to find it.
+ self.it.count()
+ }
+}
+
+impl<'r, 'h> core::iter::FusedIterator for Matches<'r, 'h> {}
+
+/// An iterator over all non-overlapping capture matches in a haystack.
+///
+/// This iterator yields [`Captures`] values. The iterator stops when no more
+/// matches can be found.
+///
+/// `'r` is the lifetime of the compiled regular expression and `'h` is the
+/// lifetime of the matched string.
+///
+/// This iterator is created by [`Regex::captures_iter`].
+///
+/// # Time complexity
+///
+/// Note that since an iterator runs potentially many searches on the haystack
+/// and since each search has worst case `O(m * n)` time complexity, the
+/// overall worst case time complexity for iteration is `O(m * n^2)`.
+#[derive(Debug)]
+pub struct CaptureMatches<'r, 'h> {
+ haystack: &'h str,
+ it: meta::CapturesMatches<'r, 'h>,
+}
+
+impl<'r, 'h> Iterator for CaptureMatches<'r, 'h> {
+ type Item = Captures<'h>;
+
+ #[inline]
+ fn next(&mut self) -> Option<Captures<'h>> {
+ let static_captures_len = self.it.regex().static_captures_len();
+ self.it.next().map(|caps| Captures {
+ haystack: self.haystack,
+ caps,
+ static_captures_len,
+ })
+ }
+
+ #[inline]
+ fn count(self) -> usize {
+ // This can actually be up to 2x faster than calling `next()` until
+ // completion, because counting matches when using a DFA only requires
+ // finding the end of each match. But returning a `Captures` via `next()`
+ // requires the start of each match which, with a DFA, requires a
+ // separate reverse scan to find it.
+ self.it.count()
+ }
+}
+
+impl<'r, 'h> core::iter::FusedIterator for CaptureMatches<'r, 'h> {}
+
+/// An iterator over all substrings delimited by a regex match.
+///
+/// `'r` is the lifetime of the compiled regular expression and `'h` is the
+/// lifetime of the string being split.
+///
+/// This iterator is created by [`Regex::split`].
+///
+/// # Time complexity
+///
+/// Note that since an iterator runs potentially many searches on the haystack
+/// and since each search has worst case `O(m * n)` time complexity, the
+/// overall worst case time complexity for iteration is `O(m * n^2)`.
+#[derive(Debug)]
+pub struct Split<'r, 'h> {
+ haystack: &'h str,
+ it: meta::Split<'r, 'h>,
+}
+
+impl<'r, 'h> Iterator for Split<'r, 'h> {
+ type Item = &'h str;
+
+ #[inline]
+ fn next(&mut self) -> Option<&'h str> {
+ self.it.next().map(|span| &self.haystack[span])
+ }
+}
+
+impl<'r, 'h> core::iter::FusedIterator for Split<'r, 'h> {}
+
+/// An iterator over at most `N` substrings delimited by a regex match.
+///
+/// The last substring yielded by this iterator will be whatever remains after
+/// `N-1` splits.
+///
+/// `'r` is the lifetime of the compiled regular expression and `'h` is the
+/// lifetime of the string being split.
+///
+/// This iterator is created by [`Regex::splitn`].
+///
+/// # Time complexity
+///
+/// Note that since an iterator runs potentially many searches on the haystack
+/// and since each search has worst case `O(m * n)` time complexity, the
+/// overall worst case time complexity for iteration is `O(m * n^2)`.
+///
+/// Although note that the worst case time here has an upper bound given
+/// by the `limit` parameter to [`Regex::splitn`].
+#[derive(Debug)]
+pub struct SplitN<'r, 'h> {
+ haystack: &'h str,
+ it: meta::SplitN<'r, 'h>,
+}
+
+impl<'r, 'h> Iterator for SplitN<'r, 'h> {
+ type Item = &'h str;
+
+ #[inline]
+ fn next(&mut self) -> Option<&'h str> {
+ self.it.next().map(|span| &self.haystack[span])
+ }
+
+ #[inline]
+ fn size_hint(&self) -> (usize, Option<usize>) {
+ self.it.size_hint()
+ }
+}
+
+impl<'r, 'h> core::iter::FusedIterator for SplitN<'r, 'h> {}
+
+/// An iterator over the names of all capture groups in a regex.
+///
+/// This iterator yields values of type `Option<&str>` in order of the opening
+/// capture group parenthesis in the regex pattern. `None` is yielded for
+/// groups with no name. The first element always corresponds to the implicit
+/// and unnamed group for the overall match.
+///
+/// `'r` is the lifetime of the compiled regular expression.
+///
+/// This iterator is created by [`Regex::capture_names`].
+#[derive(Clone, Debug)]
+pub struct CaptureNames<'r>(captures::GroupInfoPatternNames<'r>);
+
+impl<'r> Iterator for CaptureNames<'r> {
+ type Item = Option<&'r str>;
+
+ #[inline]
+ fn next(&mut self) -> Option<Option<&'r str>> {
+ self.0.next()
+ }
+
+ #[inline]
+ fn size_hint(&self) -> (usize, Option<usize>) {
+ self.0.size_hint()
+ }
+
+ #[inline]
+ fn count(self) -> usize {
+ self.0.count()
+ }
+}
+
+impl<'r> ExactSizeIterator for CaptureNames<'r> {}
+
+impl<'r> core::iter::FusedIterator for CaptureNames<'r> {}
+
+/// An iterator over all group matches in a [`Captures`] value.
+///
+/// This iterator yields values of type `Option<Match<'h>>`, where `'h` is the
+/// lifetime of the haystack that the matches are for. The order of elements
+/// yielded corresponds to the order of the opening parenthesis for the group
+/// in the regex pattern. `None` is yielded for groups that did not participate
+/// in the match.
+///
+/// The first element always corresponds to the implicit group for the overall
+/// match. Since this iterator is created by a [`Captures`] value, and a
+/// `Captures` value is only created when a match occurs, it follows that the
+/// first element yielded by this iterator is guaranteed to be non-`None`.
+///
+/// The lifetime `'c` corresponds to the lifetime of the `Captures` value that
+/// created this iterator, and the lifetime `'h` corresponds to the originally
+/// matched haystack.
+#[derive(Clone, Debug)]
+pub struct SubCaptureMatches<'c, 'h> {
+ haystack: &'h str,
+ it: captures::CapturesPatternIter<'c>,
+}
+
+impl<'c, 'h> Iterator for SubCaptureMatches<'c, 'h> {
+ type Item = Option<Match<'h>>;
+
+ #[inline]
+ fn next(&mut self) -> Option<Option<Match<'h>>> {
+ self.it.next().map(|group| {
+ group.map(|sp| Match::new(self.haystack, sp.start, sp.end))
+ })
+ }
+
+ #[inline]
+ fn size_hint(&self) -> (usize, Option<usize>) {
+ self.it.size_hint()
+ }
+
+ #[inline]
+ fn count(self) -> usize {
+ self.it.count()
+ }
+}
+
+impl<'c, 'h> ExactSizeIterator for SubCaptureMatches<'c, 'h> {}
+
+impl<'c, 'h> core::iter::FusedIterator for SubCaptureMatches<'c, 'h> {}
+
+/// A trait for types that can be used to replace matches in a haystack.
+///
+/// In general, users of this crate shouldn't need to implement this trait,
+/// since implementations are already provided for `&str` along with other
+/// variants of string types, as well as `FnMut(&Captures) -> String` (or any
+/// `FnMut(&Captures) -> T` where `T: AsRef<str>`). Those cover most use cases,
+/// but callers can implement this trait directly if necessary.
+///
+/// # Example
+///
+/// This example shows a basic implementation of the `Replacer` trait. This
+/// can be done much more simply using the replacement string interpolation
+/// support (e.g., `$first $last`), but this approach avoids needing to parse
+/// the replacement string at all.
+///
+/// ```
+/// use regex::{Captures, Regex, Replacer};
+///
+/// struct NameSwapper;
+///
+/// impl Replacer for NameSwapper {
+/// fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) {
+/// dst.push_str(&caps["first"]);
+/// dst.push_str(" ");
+/// dst.push_str(&caps["last"]);
+/// }
+/// }
+///
+/// let re = Regex::new(r"(?<last>[^,\s]+),\s+(?<first>\S+)").unwrap();
+/// let result = re.replace("Springsteen, Bruce", NameSwapper);
+/// assert_eq!(result, "Bruce Springsteen");
+/// ```
+pub trait Replacer {
+ /// Appends possibly empty data to `dst` to replace the current match.
+ ///
+ /// The current match is represented by `caps`, which is guaranteed to
+ /// have a match at capture group `0`.
+ ///
+ /// For example, a no-op replacement would be `dst.push_str(&caps[0])`.
+ fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String);
+
+ /// Return a fixed unchanging replacement string.
+ ///
+ /// When doing replacements, if access to [`Captures`] is not needed (e.g.,
+ /// the replacement string does not need `$` expansion), then it can be
+ /// beneficial to avoid finding sub-captures.
+ ///
+ /// In general, this is called once for every call to a replacement routine
+ /// such as [`Regex::replace_all`].
+ fn no_expansion<'r>(&'r mut self) -> Option<Cow<'r, str>> {
+ None
+ }
+
+ /// Returns a type that implements `Replacer`, but that borrows and wraps
+ /// this `Replacer`.
+ ///
+ /// This is useful when you want to take a generic `Replacer` (which might
+ /// not be cloneable) and use it without consuming it, so it can be used
+ /// more than once.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::{Regex, Replacer};
+ ///
+ /// fn replace_all_twice<R: Replacer>(
+ /// re: Regex,
+ /// src: &str,
+ /// mut rep: R,
+ /// ) -> String {
+ /// let dst = re.replace_all(src, rep.by_ref());
+ /// let dst = re.replace_all(&dst, rep.by_ref());
+ /// dst.into_owned()
+ /// }
+ /// ```
+ fn by_ref<'r>(&'r mut self) -> ReplacerRef<'r, Self> {
+ ReplacerRef(self)
+ }
+}
+
+impl<'a> Replacer for &'a str {
+ fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) {
+ caps.expand(*self, dst);
+ }
+
+ fn no_expansion(&mut self) -> Option<Cow<'_, str>> {
+ no_expansion(self)
+ }
+}
+
+impl<'a> Replacer for &'a String {
+ fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) {
+ self.as_str().replace_append(caps, dst)
+ }
+
+ fn no_expansion(&mut self) -> Option<Cow<'_, str>> {
+ no_expansion(self)
+ }
+}
+
+impl Replacer for String {
+ fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) {
+ self.as_str().replace_append(caps, dst)
+ }
+
+ fn no_expansion(&mut self) -> Option<Cow<'_, str>> {
+ no_expansion(self)
+ }
+}
+
+impl<'a> Replacer for Cow<'a, str> {
+ fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) {
+ self.as_ref().replace_append(caps, dst)
+ }
+
+ fn no_expansion(&mut self) -> Option<Cow<'_, str>> {
+ no_expansion(self)
+ }
+}
+
+impl<'a> Replacer for &'a Cow<'a, str> {
+ fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) {
+ self.as_ref().replace_append(caps, dst)
+ }
+
+ fn no_expansion(&mut self) -> Option<Cow<'_, str>> {
+ no_expansion(self)
+ }
+}
+
+impl<F, T> Replacer for F
+where
+ F: FnMut(&Captures<'_>) -> T,
+ T: AsRef<str>,
+{
+ fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) {
+ dst.push_str((*self)(caps).as_ref());
+ }
+}
+
+/// A by-reference adaptor for a [`Replacer`].
+///
+/// This permits reusing the same `Replacer` value in multiple calls to a
+/// replacement routine like [`Regex::replace_all`].
+///
+/// This type is created by [`Replacer::by_ref`].
+#[derive(Debug)]
+pub struct ReplacerRef<'a, R: ?Sized>(&'a mut R);
+
+impl<'a, R: Replacer + ?Sized + 'a> Replacer for ReplacerRef<'a, R> {
+ fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) {
+ self.0.replace_append(caps, dst)
+ }
+
+ fn no_expansion(&mut self) -> Option<Cow<'_, str>> {
+ self.0.no_expansion()
+ }
+}
+
+/// A helper type for forcing literal string replacement.
+///
+/// It can be used with routines like [`Regex::replace`] and
+/// [`Regex::replace_all`] to do a literal string replacement without expanding
+/// `$name` to their corresponding capture groups. This can be both convenient
+/// (to avoid escaping `$`, for example) and faster (since capture groups
+/// don't need to be found).
+///
+/// `'s` is the lifetime of the literal string to use.
+///
+/// # Example
+///
+/// ```
+/// use regex::{NoExpand, Regex};
+///
+/// let re = Regex::new(r"(?<last>[^,\s]+),\s+(\S+)").unwrap();
+/// let result = re.replace("Springsteen, Bruce", NoExpand("$2 $last"));
+/// assert_eq!(result, "$2 $last");
+/// ```
+#[derive(Clone, Debug)]
+pub struct NoExpand<'s>(pub &'s str);
+
+impl<'s> Replacer for NoExpand<'s> {
+ fn replace_append(&mut self, _: &Captures<'_>, dst: &mut String) {
+ dst.push_str(self.0);
+ }
+
+ fn no_expansion(&mut self) -> Option<Cow<'_, str>> {
+ Some(Cow::Borrowed(self.0))
+ }
+}
+
+/// Quickly checks the given replacement string for whether interpolation
+/// should be done on it. It returns `None` if a `$` was found anywhere in the
+/// given string, which suggests interpolation needs to be done. But if there's
+/// no `$` anywhere, then interpolation definitely does not need to be done. In
+/// that case, the given string is returned as a borrowed `Cow`.
+///
+/// This is meant to be used to implement the `Replacer::no_expansion` method
+/// in its various trait impls.
+fn no_expansion<T: AsRef<str>>(replacement: &T) -> Option<Cow<'_, str>> {
+ let replacement = replacement.as_ref();
+ match crate::find_byte::find_byte(b'$', replacement.as_bytes()) {
+ Some(_) => None,
+ None => Some(Cow::Borrowed(replacement)),
+ }
+}
diff --git a/vendor/regex/src/regexset/bytes.rs b/vendor/regex/src/regexset/bytes.rs
new file mode 100644
index 000000000..1220a1466
--- /dev/null
+++ b/vendor/regex/src/regexset/bytes.rs
@@ -0,0 +1,710 @@
+use alloc::string::String;
+
+use regex_automata::{meta, Input, PatternID, PatternSet, PatternSetIter};
+
+use crate::{bytes::RegexSetBuilder, Error};
+
+/// Match multiple, possibly overlapping, regexes in a single search.
+///
+/// A regex set corresponds to the union of zero or more regular expressions.
+/// That is, a regex set will match a haystack when at least one of its
+/// constituent regexes matches. A regex set as it's formulated here provides a
+/// touch more power: it will also report *which* regular expressions in the
+/// set match. Indeed, this is the key difference between regex sets and a
+/// single `Regex` with many alternates, since only one alternate can match at
+/// a time.
+///
+/// For example, consider regular expressions to match email addresses and
+/// domains: `[a-z]+@[a-z]+\.(com|org|net)` and `[a-z]+\.(com|org|net)`. If a
+/// regex set is constructed from those regexes, then searching the haystack
+/// `foo@example.com` will report both regexes as matching. Of course, one
+/// could accomplish this by compiling each regex on its own and doing two
+/// searches over the haystack. The key advantage of using a regex set is
+/// that it will report the matching regexes using a *single pass through the
+/// haystack*. If one has hundreds or thousands of regexes to match repeatedly
+/// (like a URL router for a complex web application or a user agent matcher),
+/// then a regex set *can* realize huge performance gains.
+///
+/// Unlike the top-level [`RegexSet`](crate::RegexSet), this `RegexSet`
+/// searches haystacks with type `&[u8]` instead of `&str`. Consequently, this
+/// `RegexSet` is permitted to match invalid UTF-8.
+///
+/// # Limitations
+///
+/// Regex sets are limited to answering the following two questions:
+///
+/// 1. Does any regex in the set match?
+/// 2. If so, which regexes in the set match?
+///
+/// As with the main [`Regex`][crate::bytes::Regex] type, it is cheaper to ask
+/// (1) instead of (2) since the matching engines can stop after the first
+/// match is found.
+///
+/// You cannot directly extract [`Match`][crate::bytes::Match] or
+/// [`Captures`][crate::bytes::Captures] objects from a regex set. If you need
+/// these operations, the recommended approach is to compile each pattern in
+/// the set independently and scan the exact same haystack a second time with
+/// those independently compiled patterns:
+///
+/// ```
+/// use regex::bytes::{Regex, RegexSet};
+///
+/// let patterns = ["foo", "bar"];
+/// // Both patterns will match different ranges of this string.
+/// let hay = b"barfoo";
+///
+/// // Compile a set matching any of our patterns.
+/// let set = RegexSet::new(patterns).unwrap();
+/// // Compile each pattern independently.
+/// let regexes: Vec<_> = set
+/// .patterns()
+/// .iter()
+/// .map(|pat| Regex::new(pat).unwrap())
+/// .collect();
+///
+/// // Match against the whole set first and identify the individual
+/// // matching patterns.
+/// let matches: Vec<&[u8]> = set
+/// .matches(hay)
+/// .into_iter()
+/// // Dereference the match index to get the corresponding
+/// // compiled pattern.
+/// .map(|index| &regexes[index])
+/// // To get match locations or any other info, we then have to search the
+/// // exact same haystack again, using our separately-compiled pattern.
+/// .map(|re| re.find(hay).unwrap().as_bytes())
+/// .collect();
+///
+/// // Matches arrive in the order the constituent patterns were declared,
+/// // not the order they appear in the haystack.
+/// assert_eq!(vec![&b"foo"[..], &b"bar"[..]], matches);
+/// ```
+///
+/// # Performance
+///
+/// A `RegexSet` has the same performance characteristics as `Regex`. Namely,
+/// search takes `O(m * n)` time, where `m` is proportional to the size of the
+/// regex set and `n` is proportional to the length of the haystack.
+///
+/// # Trait implementations
+///
+/// The `Default` trait is implemented for `RegexSet`. The default value
+/// is an empty set. An empty set can also be explicitly constructed via
+/// [`RegexSet::empty`].
+///
+/// # Example
+///
+/// This shows how the above two regexes (for matching email addresses and
+/// domains) might work:
+///
+/// ```
+/// use regex::bytes::RegexSet;
+///
+/// let set = RegexSet::new(&[
+/// r"[a-z]+@[a-z]+\.(com|org|net)",
+/// r"[a-z]+\.(com|org|net)",
+/// ]).unwrap();
+///
+/// // Ask whether any regexes in the set match.
+/// assert!(set.is_match(b"foo@example.com"));
+///
+/// // Identify which regexes in the set match.
+/// let matches: Vec<_> = set.matches(b"foo@example.com").into_iter().collect();
+/// assert_eq!(vec![0, 1], matches);
+///
+/// // Try again, but with a haystack that only matches one of the regexes.
+/// let matches: Vec<_> = set.matches(b"example.com").into_iter().collect();
+/// assert_eq!(vec![1], matches);
+///
+/// // Try again, but with a haystack that doesn't match any regex in the set.
+/// let matches: Vec<_> = set.matches(b"example").into_iter().collect();
+/// assert!(matches.is_empty());
+/// ```
+///
+/// Note that it would be possible to adapt the above example to using `Regex`
+/// with an expression like:
+///
+/// ```text
+/// (?P<email>[a-z]+@(?P<email_domain>[a-z]+[.](com|org|net)))|(?P<domain>[a-z]+[.](com|org|net))
+/// ```
+///
+/// After a match, one could then inspect the capture groups to figure out
+/// which alternates matched. The problem is that it is hard to make this
+/// approach scale when there are many regexes since the overlap between each
+/// alternate isn't always obvious to reason about.
+#[derive(Clone)]
+pub struct RegexSet {
+ pub(crate) meta: meta::Regex,
+ pub(crate) patterns: alloc::sync::Arc<[String]>,
+}
+
+impl RegexSet {
+ /// Create a new regex set with the given regular expressions.
+ ///
+ /// This takes an iterator of `S`, where `S` is something that can produce
+ /// a `&str`. If any of the strings in the iterator are not valid regular
+ /// expressions, then an error is returned.
+ ///
+ /// # Example
+ ///
+ /// Create a new regex set from an iterator of strings:
+ ///
+ /// ```
+ /// use regex::bytes::RegexSet;
+ ///
+ /// let set = RegexSet::new([r"\w+", r"\d+"]).unwrap();
+ /// assert!(set.is_match(b"foo"));
+ /// ```
+ pub fn new<I, S>(exprs: I) -> Result<RegexSet, Error>
+ where
+ S: AsRef<str>,
+ I: IntoIterator<Item = S>,
+ {
+ RegexSetBuilder::new(exprs).build()
+ }
+
+ /// Create a new empty regex set.
+ ///
+ /// An empty regex set never matches anything.
+ ///
+ /// This is a convenience function for `RegexSet::new([])`, but doesn't
+ /// require one to specify the type of the input.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::bytes::RegexSet;
+ ///
+ /// let set = RegexSet::empty();
+ /// assert!(set.is_empty());
+ /// // an empty set matches nothing
+ /// assert!(!set.is_match(b""));
+ /// ```
+ pub fn empty() -> RegexSet {
+ let empty: [&str; 0] = [];
+ RegexSetBuilder::new(empty).build().unwrap()
+ }
+
+ /// Returns true if and only if one of the regexes in this set matches
+ /// the haystack given.
+ ///
+ /// This method should be preferred if you only need to test whether any
+ /// of the regexes in the set should match, but don't care about *which*
+ /// regexes matched. This is because the underlying matching engine will
+ /// quit immediately after seeing the first match instead of continuing to
+ /// find all matches.
+ ///
+ /// Note that as with searches using [`Regex`](crate::bytes::Regex), the
+ /// expression is unanchored by default. That is, if the regex does not
+ /// start with `^` or `\A`, or end with `$` or `\z`, then it is permitted
+ /// to match anywhere in the haystack.
+ ///
+ /// # Example
+ ///
+ /// Tests whether a set matches somewhere in a haystack:
+ ///
+ /// ```
+ /// use regex::bytes::RegexSet;
+ ///
+ /// let set = RegexSet::new([r"\w+", r"\d+"]).unwrap();
+ /// assert!(set.is_match(b"foo"));
+ /// assert!(!set.is_match("☃".as_bytes()));
+ /// ```
+ #[inline]
+ pub fn is_match(&self, haystack: &[u8]) -> bool {
+ self.is_match_at(haystack, 0)
+ }
+
+ /// Returns true if and only if one of the regexes in this set matches the
+ /// haystack given, with the search starting at the offset given.
+ ///
+ /// The significance of the starting point is that it takes the surrounding
+ /// context into consideration. For example, the `\A` anchor can only
+ /// match when `start == 0`.
+ ///
+ /// # Panics
+ ///
+ /// This panics when `start >= haystack.len() + 1`.
+ ///
+ /// # Example
+ ///
+ /// This example shows the significance of `start`. Namely, consider a
+ /// haystack `foobar` and a desire to execute a search starting at offset
+ /// `3`. You could search a substring explicitly, but then the look-around
+ /// assertions won't work correctly. Instead, you can use this method to
+ /// specify the start position of a search.
+ ///
+ /// ```
+ /// use regex::bytes::RegexSet;
+ ///
+ /// let set = RegexSet::new([r"\bbar\b", r"(?m)^bar$"]).unwrap();
+ /// let hay = b"foobar";
+ /// // We get a match here, but it's probably not intended.
+ /// assert!(set.is_match(&hay[3..]));
+ /// // No match because the assertions take the context into account.
+ /// assert!(!set.is_match_at(hay, 3));
+ /// ```
+ #[inline]
+ pub fn is_match_at(&self, haystack: &[u8], start: usize) -> bool {
+ self.meta.is_match(Input::new(haystack).span(start..haystack.len()))
+ }
+
+ /// Returns the set of regexes that match in the given haystack.
+ ///
+ /// The set returned contains the index of each regex that matches in
+ /// the given haystack. The index is in correspondence with the order of
+ /// regular expressions given to `RegexSet`'s constructor.
+ ///
+ /// The set can also be used to iterate over the matched indices. The order
+ /// of iteration is always ascending with respect to the matching indices.
+ ///
+ /// Note that as with searches using [`Regex`](crate::bytes::Regex), the
+ /// expression is unanchored by default. That is, if the regex does not
+ /// start with `^` or `\A`, or end with `$` or `\z`, then it is permitted
+ /// to match anywhere in the haystack.
+ ///
+ /// # Example
+ ///
+ /// Tests which regular expressions match the given haystack:
+ ///
+ /// ```
+ /// use regex::bytes::RegexSet;
+ ///
+ /// let set = RegexSet::new([
+ /// r"\w+",
+ /// r"\d+",
+ /// r"\pL+",
+ /// r"foo",
+ /// r"bar",
+ /// r"barfoo",
+ /// r"foobar",
+ /// ]).unwrap();
+ /// let matches: Vec<_> = set.matches(b"foobar").into_iter().collect();
+ /// assert_eq!(matches, vec![0, 2, 3, 4, 6]);
+ ///
+ /// // You can also test whether a particular regex matched:
+ /// let matches = set.matches(b"foobar");
+ /// assert!(!matches.matched(5));
+ /// assert!(matches.matched(6));
+ /// ```
+ #[inline]
+ pub fn matches(&self, haystack: &[u8]) -> SetMatches {
+ self.matches_at(haystack, 0)
+ }
+
+ /// Returns the set of regexes that match in the given haystack.
+ ///
+ /// The set returned contains the index of each regex that matches in
+ /// the given haystack. The index is in correspondence with the order of
+ /// regular expressions given to `RegexSet`'s constructor.
+ ///
+ /// The set can also be used to iterate over the matched indices. The order
+ /// of iteration is always ascending with respect to the matching indices.
+ ///
+ /// The significance of the starting point is that it takes the surrounding
+ /// context into consideration. For example, the `\A` anchor can only
+ /// match when `start == 0`.
+ ///
+ /// # Panics
+ ///
+ /// This panics when `start >= haystack.len() + 1`.
+ ///
+ /// # Example
+ ///
+ /// Tests which regular expressions match the given haystack:
+ ///
+ /// ```
+ /// use regex::bytes::RegexSet;
+ ///
+ /// let set = RegexSet::new([r"\bbar\b", r"(?m)^bar$"]).unwrap();
+ /// let hay = b"foobar";
+ /// // We get matches here, but it's probably not intended.
+ /// let matches: Vec<_> = set.matches(&hay[3..]).into_iter().collect();
+ /// assert_eq!(matches, vec![0, 1]);
+ /// // No matches because the assertions take the context into account.
+ /// let matches: Vec<_> = set.matches_at(hay, 3).into_iter().collect();
+ /// assert_eq!(matches, vec![]);
+ /// ```
+ #[inline]
+ pub fn matches_at(&self, haystack: &[u8], start: usize) -> SetMatches {
+ let input = Input::new(haystack).span(start..haystack.len());
+ let mut patset = PatternSet::new(self.meta.pattern_len());
+ self.meta.which_overlapping_matches(&input, &mut patset);
+ SetMatches(patset)
+ }
+
+ /// Returns the same as matches, but starts the search at the given
+ /// offset and stores the matches into the slice given.
+ ///
+ /// The significance of the starting point is that it takes the surrounding
+ /// context into consideration. For example, the `\A` anchor can only
+ /// match when `start == 0`.
+ ///
+ /// `matches` must have a length that is at least the number of regexes
+ /// in this set.
+ ///
+ /// This method returns true if and only if at least one member of
+ /// `matches` is true after executing the set against `haystack`.
+ #[doc(hidden)]
+ #[inline]
+ pub fn matches_read_at(
+ &self,
+ matches: &mut [bool],
+ haystack: &[u8],
+ start: usize,
+ ) -> bool {
+ // This is pretty dumb. We should try to fix this, but the
+ // regex-automata API doesn't provide a way to store matches in an
+ // arbitrary &mut [bool]. Thankfully, this API is doc(hidden) and
+ // thus not public... But regex-capi currently uses it. We should
+ // fix regex-capi to use a PatternSet, maybe? Not sure... PatternSet
+ // is in regex-automata, not regex. So maybe we should just accept a
+ // 'SetMatches', which is basically just a newtype around PatternSet.
+ let mut patset = PatternSet::new(self.meta.pattern_len());
+ let mut input = Input::new(haystack);
+ input.set_start(start);
+ self.meta.which_overlapping_matches(&input, &mut patset);
+ for pid in patset.iter() {
+ matches[pid] = true;
+ }
+ !patset.is_empty()
+ }
+
+ /// An alias for `matches_read_at` to preserve backward compatibility.
+ ///
+ /// The `regex-capi` crate used this method, so to avoid breaking that
+ /// crate, we continue to export it as an undocumented API.
+ #[doc(hidden)]
+ #[inline]
+ pub fn read_matches_at(
+ &self,
+ matches: &mut [bool],
+ haystack: &[u8],
+ start: usize,
+ ) -> bool {
+ self.matches_read_at(matches, haystack, start)
+ }
+
+ /// Returns the total number of regexes in this set.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::bytes::RegexSet;
+ ///
+ /// assert_eq!(0, RegexSet::empty().len());
+ /// assert_eq!(1, RegexSet::new([r"[0-9]"]).unwrap().len());
+ /// assert_eq!(2, RegexSet::new([r"[0-9]", r"[a-z]"]).unwrap().len());
+ /// ```
+ #[inline]
+ pub fn len(&self) -> usize {
+ self.meta.pattern_len()
+ }
+
+ /// Returns `true` if this set contains no regexes.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::bytes::RegexSet;
+ ///
+ /// assert!(RegexSet::empty().is_empty());
+ /// assert!(!RegexSet::new([r"[0-9]"]).unwrap().is_empty());
+ /// ```
+ #[inline]
+ pub fn is_empty(&self) -> bool {
+ self.meta.pattern_len() == 0
+ }
+
+ /// Returns the regex patterns that this regex set was constructed from.
+ ///
+ /// This function can be used to determine the pattern for a match. The
+ /// slice returned has exactly as many patterns as were given to this set,
+ /// and the order of the slice is the same as the order of the patterns
+ /// provided to the set.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::bytes::RegexSet;
+ ///
+ /// let set = RegexSet::new(&[
+ /// r"\w+",
+ /// r"\d+",
+ /// r"\pL+",
+ /// r"foo",
+ /// r"bar",
+ /// r"barfoo",
+ /// r"foobar",
+ /// ]).unwrap();
+ /// let matches: Vec<_> = set
+ /// .matches(b"foobar")
+ /// .into_iter()
+ /// .map(|index| &set.patterns()[index])
+ /// .collect();
+ /// assert_eq!(matches, vec![r"\w+", r"\pL+", r"foo", r"bar", r"foobar"]);
+ /// ```
+ #[inline]
+ pub fn patterns(&self) -> &[String] {
+ &self.patterns
+ }
+}
+
+impl Default for RegexSet {
+ fn default() -> Self {
+ RegexSet::empty()
+ }
+}
+
+/// A set of matches returned by a regex set.
+///
+/// Values of this type are constructed by [`RegexSet::matches`].
+#[derive(Clone, Debug)]
+pub struct SetMatches(PatternSet);
+
+impl SetMatches {
+ /// Whether this set contains any matches.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::bytes::RegexSet;
+ ///
+ /// let set = RegexSet::new(&[
+ /// r"[a-z]+@[a-z]+\.(com|org|net)",
+ /// r"[a-z]+\.(com|org|net)",
+ /// ]).unwrap();
+ /// let matches = set.matches(b"foo@example.com");
+ /// assert!(matches.matched_any());
+ /// ```
+ #[inline]
+ pub fn matched_any(&self) -> bool {
+ !self.0.is_empty()
+ }
+
+ /// Whether the regex at the given index matched.
+ ///
+ /// The index for a regex is determined by its insertion order upon the
+ /// initial construction of a `RegexSet`, starting at `0`.
+ ///
+ /// # Panics
+ ///
+ /// If `index` is greater than or equal to the number of regexes in the
+ /// original set that produced these matches. Equivalently, when `index`
+ /// is greater than or equal to [`SetMatches::len`].
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::bytes::RegexSet;
+ ///
+ /// let set = RegexSet::new([
+ /// r"[a-z]+@[a-z]+\.(com|org|net)",
+ /// r"[a-z]+\.(com|org|net)",
+ /// ]).unwrap();
+ /// let matches = set.matches(b"example.com");
+ /// assert!(!matches.matched(0));
+ /// assert!(matches.matched(1));
+ /// ```
+ #[inline]
+ pub fn matched(&self, index: usize) -> bool {
+ self.0.contains(PatternID::new_unchecked(index))
+ }
+
+ /// The total number of regexes in the set that created these matches.
+ ///
+ /// **WARNING:** This always returns the same value as [`RegexSet::len`].
+ /// In particular, it does *not* return the number of elements yielded by
+ /// [`SetMatches::iter`]. The only way to determine the total number of
+ /// matched regexes is to iterate over them.
+ ///
+ /// # Example
+ ///
+ /// Notice that this method returns the total number of regexes in the
+ /// original set, and *not* the total number of regexes that matched.
+ ///
+ /// ```
+ /// use regex::bytes::RegexSet;
+ ///
+ /// let set = RegexSet::new([
+ /// r"[a-z]+@[a-z]+\.(com|org|net)",
+ /// r"[a-z]+\.(com|org|net)",
+ /// ]).unwrap();
+ /// let matches = set.matches(b"example.com");
+ /// // Total number of patterns that matched.
+ /// assert_eq!(1, matches.iter().count());
+ /// // Total number of patterns in the set.
+ /// assert_eq!(2, matches.len());
+ /// ```
+ #[inline]
+ pub fn len(&self) -> usize {
+ self.0.capacity()
+ }
+
+ /// Returns an iterator over the indices of the regexes that matched.
+ ///
+ /// This will always produce matches in ascending order, where the index
+ /// yielded corresponds to the index of the regex that matched with respect
+ /// to its position when initially building the set.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::bytes::RegexSet;
+ ///
+ /// let set = RegexSet::new([
+ /// r"[0-9]",
+ /// r"[a-z]",
+ /// r"[A-Z]",
+ /// r"\p{Greek}",
+ /// ]).unwrap();
+ /// let hay = "βa1".as_bytes();
+ /// let matches: Vec<_> = set.matches(hay).iter().collect();
+ /// assert_eq!(matches, vec![0, 1, 3]);
+ /// ```
+ ///
+ /// Note that `SetMatches` also implements the `IntoIterator` trait, so
+ /// this method is not always needed. For example:
+ ///
+ /// ```
+ /// use regex::bytes::RegexSet;
+ ///
+ /// let set = RegexSet::new([
+ /// r"[0-9]",
+ /// r"[a-z]",
+ /// r"[A-Z]",
+ /// r"\p{Greek}",
+ /// ]).unwrap();
+ /// let hay = "βa1".as_bytes();
+ /// let mut matches = vec![];
+ /// for index in set.matches(hay) {
+ /// matches.push(index);
+ /// }
+ /// assert_eq!(matches, vec![0, 1, 3]);
+ /// ```
+ #[inline]
+ pub fn iter(&self) -> SetMatchesIter<'_> {
+ SetMatchesIter(self.0.iter())
+ }
+}
+
+impl IntoIterator for SetMatches {
+ type IntoIter = SetMatchesIntoIter;
+ type Item = usize;
+
+ fn into_iter(self) -> Self::IntoIter {
+ let it = 0..self.0.capacity();
+ SetMatchesIntoIter { patset: self.0, it }
+ }
+}
+
+impl<'a> IntoIterator for &'a SetMatches {
+ type IntoIter = SetMatchesIter<'a>;
+ type Item = usize;
+
+ fn into_iter(self) -> Self::IntoIter {
+ self.iter()
+ }
+}
+
+/// An owned iterator over the set of matches from a regex set.
+///
+/// This will always produce matches in ascending order of index, where the
+/// index corresponds to the index of the regex that matched with respect to
+/// its position when initially building the set.
+///
+/// This iterator is created by calling `SetMatches::into_iter` via the
+/// `IntoIterator` trait. This is automatically done in `for` loops.
+///
+/// # Example
+///
+/// ```
+/// use regex::bytes::RegexSet;
+///
+/// let set = RegexSet::new([
+/// r"[0-9]",
+/// r"[a-z]",
+/// r"[A-Z]",
+/// r"\p{Greek}",
+/// ]).unwrap();
+/// let hay = "βa1".as_bytes();
+/// let mut matches = vec![];
+/// for index in set.matches(hay) {
+/// matches.push(index);
+/// }
+/// assert_eq!(matches, vec![0, 1, 3]);
+/// ```
+#[derive(Debug)]
+pub struct SetMatchesIntoIter {
+ patset: PatternSet,
+ it: core::ops::Range<usize>,
+}
+
+impl Iterator for SetMatchesIntoIter {
+ type Item = usize;
+
+ fn next(&mut self) -> Option<usize> {
+ loop {
+ let id = self.it.next()?;
+ if self.patset.contains(PatternID::new_unchecked(id)) {
+ return Some(id);
+ }
+ }
+ }
+
+ fn size_hint(&self) -> (usize, Option<usize>) {
+ (0, self.it.size_hint().1) // unscanned ids are only an upper bound
+ }
+}
+
+impl DoubleEndedIterator for SetMatchesIntoIter {
+ fn next_back(&mut self) -> Option<usize> {
+ loop {
+ let id = self.it.next_back()?;
+ if self.patset.contains(PatternID::new_unchecked(id)) {
+ return Some(id);
+ }
+ }
+ }
+}
+
+impl core::iter::FusedIterator for SetMatchesIntoIter {}
+
+/// A borrowed iterator over the set of matches from a regex set.
+///
+/// The lifetime `'a` refers to the lifetime of the [`SetMatches`] value that
+/// created this iterator.
+///
+/// This will always produce matches in ascending order, where the index
+/// corresponds to the index of the regex that matched with respect to its
+/// position when initially building the set.
+///
+/// This iterator is created by the [`SetMatches::iter`] method.
+#[derive(Clone, Debug)]
+pub struct SetMatchesIter<'a>(PatternSetIter<'a>);
+
+impl<'a> Iterator for SetMatchesIter<'a> {
+ type Item = usize;
+
+ fn next(&mut self) -> Option<usize> {
+ self.0.next().map(|pid| pid.as_usize())
+ }
+
+ fn size_hint(&self) -> (usize, Option<usize>) {
+ self.0.size_hint()
+ }
+}
+
+impl<'a> DoubleEndedIterator for SetMatchesIter<'a> {
+ fn next_back(&mut self) -> Option<usize> {
+ self.0.next_back().map(|pid| pid.as_usize())
+ }
+}
+
+impl<'a> core::iter::FusedIterator for SetMatchesIter<'a> {}
+
+impl core::fmt::Debug for RegexSet {
+ fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+ write!(f, "RegexSet({:?})", self.patterns())
+ }
+}
diff --git a/vendor/regex/src/regexset/mod.rs b/vendor/regex/src/regexset/mod.rs
new file mode 100644
index 000000000..93fadec8b
--- /dev/null
+++ b/vendor/regex/src/regexset/mod.rs
@@ -0,0 +1,2 @@
+pub(crate) mod bytes;
+pub(crate) mod string;
diff --git a/vendor/regex/src/regexset/string.rs b/vendor/regex/src/regexset/string.rs
new file mode 100644
index 000000000..2a3e7b802
--- /dev/null
+++ b/vendor/regex/src/regexset/string.rs
@@ -0,0 +1,706 @@
+use alloc::string::String;
+
+use regex_automata::{meta, Input, PatternID, PatternSet, PatternSetIter};
+
+use crate::{Error, RegexSetBuilder};
+
+/// Match multiple, possibly overlapping, regexes in a single search.
+///
+/// A regex set corresponds to the union of zero or more regular expressions.
+/// That is, a regex set will match a haystack when at least one of its
+/// constituent regexes matches. A regex set as formulated here provides a
+/// touch more power: it will also report *which* regular expressions in the
+/// set match. Indeed, this is the key difference between regex sets and a
+/// single `Regex` with many alternates, since only one alternate can match at
+/// a time.
+///
+/// For example, consider regular expressions to match email addresses and
+/// domains: `[a-z]+@[a-z]+\.(com|org|net)` and `[a-z]+\.(com|org|net)`. If a
+/// regex set is constructed from those regexes, then searching the haystack
+/// `foo@example.com` will report both regexes as matching. Of course, one
+/// could accomplish this by compiling each regex on its own and doing two
+/// searches over the haystack. The key advantage of using a regex set is
+/// that it will report the matching regexes using a *single pass through the
+/// haystack*. If one has hundreds or thousands of regexes to match repeatedly
+/// (like a URL router for a complex web application or a user agent matcher),
+/// then a regex set *can* realize huge performance gains.
+///
+/// # Limitations
+///
+/// Regex sets are limited to answering the following two questions:
+///
+/// 1. Does any regex in the set match?
+/// 2. If so, which regexes in the set match?
+///
+/// As with the main [`Regex`][crate::Regex] type, it is cheaper to ask (1)
+/// instead of (2) since the matching engines can stop after the first match
+/// is found.
+///
+/// You cannot directly extract [`Match`][crate::Match] or
+/// [`Captures`][crate::Captures] objects from a regex set. If you need these
+/// operations, the recommended approach is to compile each pattern in the set
+/// independently and scan the exact same haystack a second time with those
+/// independently compiled patterns:
+///
+/// ```
+/// use regex::{Regex, RegexSet};
+///
+/// let patterns = ["foo", "bar"];
+/// // Both patterns will match different ranges of this string.
+/// let hay = "barfoo";
+///
+/// // Compile a set matching any of our patterns.
+/// let set = RegexSet::new(patterns).unwrap();
+/// // Compile each pattern independently.
+/// let regexes: Vec<_> = set
+/// .patterns()
+/// .iter()
+/// .map(|pat| Regex::new(pat).unwrap())
+/// .collect();
+///
+/// // Match against the whole set first and identify the individual
+/// // matching patterns.
+/// let matches: Vec<&str> = set
+/// .matches(hay)
+/// .into_iter()
+/// // Dereference the match index to get the corresponding
+/// // compiled pattern.
+/// .map(|index| &regexes[index])
+/// // To get match locations or any other info, we then have to search the
+/// // exact same haystack again, using our separately-compiled pattern.
+/// .map(|re| re.find(hay).unwrap().as_str())
+/// .collect();
+///
+/// // Matches arrive in the order the constituent patterns were declared,
+/// // not the order they appear in the haystack.
+/// assert_eq!(vec!["foo", "bar"], matches);
+/// ```
+///
+/// # Performance
+///
+/// A `RegexSet` has the same performance characteristics as `Regex`. Namely,
+/// search takes `O(m * n)` time, where `m` is proportional to the size of the
+/// regex set and `n` is proportional to the length of the haystack.
+///
+/// # Trait implementations
+///
+/// The `Default` trait is implemented for `RegexSet`. The default value
+/// is an empty set. An empty set can also be explicitly constructed via
+/// [`RegexSet::empty`].
+///
+/// # Example
+///
+/// This shows how the above two regexes (for matching email addresses and
+/// domains) might work:
+///
+/// ```
+/// use regex::RegexSet;
+///
+/// let set = RegexSet::new(&[
+/// r"[a-z]+@[a-z]+\.(com|org|net)",
+/// r"[a-z]+\.(com|org|net)",
+/// ]).unwrap();
+///
+/// // Ask whether any regexes in the set match.
+/// assert!(set.is_match("foo@example.com"));
+///
+/// // Identify which regexes in the set match.
+/// let matches: Vec<_> = set.matches("foo@example.com").into_iter().collect();
+/// assert_eq!(vec![0, 1], matches);
+///
+/// // Try again, but with a haystack that only matches one of the regexes.
+/// let matches: Vec<_> = set.matches("example.com").into_iter().collect();
+/// assert_eq!(vec![1], matches);
+///
+/// // Try again, but with a haystack that doesn't match any regex in the set.
+/// let matches: Vec<_> = set.matches("example").into_iter().collect();
+/// assert!(matches.is_empty());
+/// ```
+///
+/// Note that it would be possible to adapt the above example to using `Regex`
+/// with an expression like:
+///
+/// ```text
+/// (?P<email>[a-z]+@(?P<email_domain>[a-z]+[.](com|org|net)))|(?P<domain>[a-z]+[.](com|org|net))
+/// ```
+///
+/// After a match, one could then inspect the capture groups to figure out
+/// which alternates matched. The problem is that it is hard to make this
+/// approach scale when there are many regexes since the overlap between each
+/// alternate isn't always obvious to reason about.
+#[derive(Clone)]
+pub struct RegexSet {
+ pub(crate) meta: meta::Regex,
+ pub(crate) patterns: alloc::sync::Arc<[String]>,
+}
+
+impl RegexSet {
+ /// Create a new regex set with the given regular expressions.
+ ///
+ /// This takes an iterator of `S`, where `S` is something that can produce
+ /// a `&str`. If any of the strings in the iterator are not valid regular
+ /// expressions, then an error is returned.
+ ///
+ /// # Example
+ ///
+ /// Create a new regex set from an iterator of strings:
+ ///
+ /// ```
+ /// use regex::RegexSet;
+ ///
+ /// let set = RegexSet::new([r"\w+", r"\d+"]).unwrap();
+ /// assert!(set.is_match("foo"));
+ /// ```
+ pub fn new<I, S>(exprs: I) -> Result<RegexSet, Error>
+ where
+ S: AsRef<str>,
+ I: IntoIterator<Item = S>,
+ {
+ RegexSetBuilder::new(exprs).build()
+ }
+
+ /// Create a new empty regex set.
+ ///
+ /// An empty regex set never matches anything.
+ ///
+ /// This is a convenience function for `RegexSet::new([])`, but doesn't
+ /// require one to specify the type of the input.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::RegexSet;
+ ///
+ /// let set = RegexSet::empty();
+ /// assert!(set.is_empty());
+ /// // an empty set matches nothing
+ /// assert!(!set.is_match(""));
+ /// ```
+ pub fn empty() -> RegexSet {
+ let empty: [&str; 0] = [];
+ RegexSetBuilder::new(empty).build().unwrap()
+ }
+
+ /// Returns true if and only if one of the regexes in this set matches
+ /// the haystack given.
+ ///
+ /// This method should be preferred if you only need to test whether any
+ /// of the regexes in the set should match, but don't care about *which*
+ /// regexes matched. This is because the underlying matching engine will
+ /// quit immediately after seeing the first match instead of continuing to
+ /// find all matches.
+ ///
+ /// Note that as with searches using [`Regex`](crate::Regex), the
+ /// expression is unanchored by default. That is, if the regex does not
+ /// start with `^` or `\A`, or end with `$` or `\z`, then it is permitted
+ /// to match anywhere in the haystack.
+ ///
+ /// # Example
+ ///
+ /// Tests whether a set matches somewhere in a haystack:
+ ///
+ /// ```
+ /// use regex::RegexSet;
+ ///
+ /// let set = RegexSet::new([r"\w+", r"\d+"]).unwrap();
+ /// assert!(set.is_match("foo"));
+ /// assert!(!set.is_match("☃"));
+ /// ```
+ #[inline]
+ pub fn is_match(&self, haystack: &str) -> bool {
+ self.is_match_at(haystack, 0)
+ }
+
+ /// Returns true if and only if one of the regexes in this set matches the
+ /// haystack given, with the search starting at the offset given.
+ ///
+ /// The significance of the starting point is that it takes the surrounding
+ /// context into consideration. For example, the `\A` anchor can only
+ /// match when `start == 0`.
+ ///
+ /// # Panics
+ ///
+ /// This panics when `start >= haystack.len() + 1`.
+ ///
+ /// # Example
+ ///
+ /// This example shows the significance of `start`. Namely, consider a
+ /// haystack `foobar` and a desire to execute a search starting at offset
+ /// `3`. You could search a substring explicitly, but then the look-around
+ /// assertions won't work correctly. Instead, you can use this method to
+ /// specify the start position of a search.
+ ///
+ /// ```
+ /// use regex::RegexSet;
+ ///
+ /// let set = RegexSet::new([r"\bbar\b", r"(?m)^bar$"]).unwrap();
+ /// let hay = "foobar";
+ /// // We get a match here, but it's probably not intended.
+ /// assert!(set.is_match(&hay[3..]));
+ /// // No match because the assertions take the context into account.
+ /// assert!(!set.is_match_at(hay, 3));
+ /// ```
+ #[inline]
+ pub fn is_match_at(&self, haystack: &str, start: usize) -> bool {
+ self.meta.is_match(Input::new(haystack).span(start..haystack.len()))
+ }
+
+ /// Returns the set of regexes that match in the given haystack.
+ ///
+ /// The set returned contains the index of each regex that matches in
+ /// the given haystack. The index is in correspondence with the order of
+ /// regular expressions given to `RegexSet`'s constructor.
+ ///
+ /// The set can also be used to iterate over the matched indices. The order
+ /// of iteration is always ascending with respect to the matching indices.
+ ///
+ /// Note that as with searches using [`Regex`](crate::Regex), the
+ /// expression is unanchored by default. That is, if the regex does not
+ /// start with `^` or `\A`, or end with `$` or `\z`, then it is permitted
+ /// to match anywhere in the haystack.
+ ///
+ /// # Example
+ ///
+ /// Tests which regular expressions match the given haystack:
+ ///
+ /// ```
+ /// use regex::RegexSet;
+ ///
+ /// let set = RegexSet::new([
+ /// r"\w+",
+ /// r"\d+",
+ /// r"\pL+",
+ /// r"foo",
+ /// r"bar",
+ /// r"barfoo",
+ /// r"foobar",
+ /// ]).unwrap();
+ /// let matches: Vec<_> = set.matches("foobar").into_iter().collect();
+ /// assert_eq!(matches, vec![0, 2, 3, 4, 6]);
+ ///
+ /// // You can also test whether a particular regex matched:
+ /// let matches = set.matches("foobar");
+ /// assert!(!matches.matched(5));
+ /// assert!(matches.matched(6));
+ /// ```
+ #[inline]
+ pub fn matches(&self, haystack: &str) -> SetMatches {
+ self.matches_at(haystack, 0)
+ }
+
+ /// Returns the set of regexes that match in the given haystack.
+ ///
+ /// The set returned contains the index of each regex that matches in
+ /// the given haystack. The index is in correspondence with the order of
+ /// regular expressions given to `RegexSet`'s constructor.
+ ///
+ /// The set can also be used to iterate over the matched indices. The order
+ /// of iteration is always ascending with respect to the matching indices.
+ ///
+ /// The significance of the starting point is that it takes the surrounding
+ /// context into consideration. For example, the `\A` anchor can only
+ /// match when `start == 0`.
+ ///
+ /// # Panics
+ ///
+ /// This panics when `start >= haystack.len() + 1`.
+ ///
+ /// # Example
+ ///
+ /// Tests which regular expressions match the given haystack:
+ ///
+ /// ```
+ /// use regex::RegexSet;
+ ///
+ /// let set = RegexSet::new([r"\bbar\b", r"(?m)^bar$"]).unwrap();
+ /// let hay = "foobar";
+ /// // We get matches here, but it's probably not intended.
+ /// let matches: Vec<_> = set.matches(&hay[3..]).into_iter().collect();
+ /// assert_eq!(matches, vec![0, 1]);
+ /// // No matches because the assertions take the context into account.
+ /// let matches: Vec<_> = set.matches_at(hay, 3).into_iter().collect();
+ /// assert_eq!(matches, vec![]);
+ /// ```
+ #[inline]
+ pub fn matches_at(&self, haystack: &str, start: usize) -> SetMatches {
+ let input = Input::new(haystack).span(start..haystack.len());
+ let mut patset = PatternSet::new(self.meta.pattern_len());
+ self.meta.which_overlapping_matches(&input, &mut patset);
+ SetMatches(patset)
+ }
+
+ /// Returns the same as matches, but starts the search at the given
+ /// offset and stores the matches into the slice given.
+ ///
+ /// The significance of the starting point is that it takes the surrounding
+ /// context into consideration. For example, the `\A` anchor can only
+ /// match when `start == 0`.
+ ///
+ /// `matches` must have a length that is at least the number of regexes
+ /// in this set.
+ ///
+ /// This method returns true if and only if at least one member of
+ /// `matches` is true after executing the set against `haystack`.
+ #[doc(hidden)]
+ #[inline]
+ pub fn matches_read_at(
+ &self,
+ matches: &mut [bool],
+ haystack: &str,
+ start: usize,
+ ) -> bool {
+ // This is pretty dumb. We should try to fix this, but the
+ // regex-automata API doesn't provide a way to store matches in an
+ // arbitrary &mut [bool]. Thankfully, this API is doc(hidden) and
+ // thus not public... But regex-capi currently uses it. We should
+ // fix regex-capi to use a PatternSet, maybe? Not sure... PatternSet
+ // is in regex-automata, not regex. So maybe we should just accept a
+ // 'SetMatches', which is basically just a newtype around PatternSet.
+ let mut patset = PatternSet::new(self.meta.pattern_len());
+ let mut input = Input::new(haystack);
+ input.set_start(start);
+ self.meta.which_overlapping_matches(&input, &mut patset);
+ for pid in patset.iter() {
+ matches[pid] = true;
+ }
+ !patset.is_empty()
+ }
+
+ /// An alias for `matches_read_at` to preserve backward compatibility.
+ ///
+ /// The `regex-capi` crate used this method, so to avoid breaking that
+ /// crate, we continue to export it as an undocumented API.
+ #[doc(hidden)]
+ #[inline]
+ pub fn read_matches_at(
+ &self,
+ matches: &mut [bool],
+ haystack: &str,
+ start: usize,
+ ) -> bool {
+ self.matches_read_at(matches, haystack, start)
+ }
+
+ /// Returns the total number of regexes in this set.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::RegexSet;
+ ///
+ /// assert_eq!(0, RegexSet::empty().len());
+ /// assert_eq!(1, RegexSet::new([r"[0-9]"]).unwrap().len());
+ /// assert_eq!(2, RegexSet::new([r"[0-9]", r"[a-z]"]).unwrap().len());
+ /// ```
+ #[inline]
+ pub fn len(&self) -> usize {
+ self.meta.pattern_len()
+ }
+
+ /// Returns `true` if this set contains no regexes.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::RegexSet;
+ ///
+ /// assert!(RegexSet::empty().is_empty());
+ /// assert!(!RegexSet::new([r"[0-9]"]).unwrap().is_empty());
+ /// ```
+ #[inline]
+ pub fn is_empty(&self) -> bool {
+ self.meta.pattern_len() == 0
+ }
+
+ /// Returns the regex patterns that this regex set was constructed from.
+ ///
+ /// This function can be used to determine the pattern for a match. The
+ /// slice returned has exactly as many patterns as given to this regex set,
+ /// and the order of the slice is the same as the order of the patterns
+ /// provided to the set.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::RegexSet;
+ ///
+ /// let set = RegexSet::new(&[
+ /// r"\w+",
+ /// r"\d+",
+ /// r"\pL+",
+ /// r"foo",
+ /// r"bar",
+ /// r"barfoo",
+ /// r"foobar",
+ /// ]).unwrap();
+ /// let matches: Vec<_> = set
+ /// .matches("foobar")
+ /// .into_iter()
+ /// .map(|index| &set.patterns()[index])
+ /// .collect();
+ /// assert_eq!(matches, vec![r"\w+", r"\pL+", r"foo", r"bar", r"foobar"]);
+ /// ```
+ #[inline]
+ pub fn patterns(&self) -> &[String] {
+ &self.patterns
+ }
+}
+
+impl Default for RegexSet {
+ fn default() -> Self {
+ RegexSet::empty()
+ }
+}
+
+/// A set of matches returned by a regex set.
+///
+/// Values of this type are constructed by [`RegexSet::matches`].
+#[derive(Clone, Debug)]
+pub struct SetMatches(PatternSet);
+
+impl SetMatches {
+ /// Whether this set contains any matches.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::RegexSet;
+ ///
+ /// let set = RegexSet::new(&[
+ /// r"[a-z]+@[a-z]+\.(com|org|net)",
+ /// r"[a-z]+\.(com|org|net)",
+ /// ]).unwrap();
+ /// let matches = set.matches("foo@example.com");
+ /// assert!(matches.matched_any());
+ /// ```
+ #[inline]
+ pub fn matched_any(&self) -> bool {
+ !self.0.is_empty()
+ }
+
+ /// Whether the regex at the given index matched.
+ ///
+ /// The index for a regex is determined by its insertion order upon the
+ /// initial construction of a `RegexSet`, starting at `0`.
+ ///
+ /// # Panics
+ ///
+ /// If `index` is greater than or equal to the number of regexes in the
+ /// original set that produced these matches. Equivalently, when `index`
+ /// is greater than or equal to [`SetMatches::len`].
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::RegexSet;
+ ///
+ /// let set = RegexSet::new([
+ /// r"[a-z]+@[a-z]+\.(com|org|net)",
+ /// r"[a-z]+\.(com|org|net)",
+ /// ]).unwrap();
+ /// let matches = set.matches("example.com");
+ /// assert!(!matches.matched(0));
+ /// assert!(matches.matched(1));
+ /// ```
+ #[inline]
+ pub fn matched(&self, index: usize) -> bool {
+ self.0.contains(PatternID::new_unchecked(index))
+ }
+
+ /// The total number of regexes in the set that created these matches.
+ ///
+ /// **WARNING:** This always returns the same value as [`RegexSet::len`].
+ /// In particular, it does *not* return the number of elements yielded by
+ /// [`SetMatches::iter`]. The only way to determine the total number of
+ /// matched regexes is to iterate over them.
+ ///
+ /// # Example
+ ///
+ /// Notice that this method returns the total number of regexes in the
+ /// original set, and *not* the total number of regexes that matched.
+ ///
+ /// ```
+ /// use regex::RegexSet;
+ ///
+ /// let set = RegexSet::new([
+ /// r"[a-z]+@[a-z]+\.(com|org|net)",
+ /// r"[a-z]+\.(com|org|net)",
+ /// ]).unwrap();
+ /// let matches = set.matches("example.com");
+ /// // Total number of patterns that matched.
+ /// assert_eq!(1, matches.iter().count());
+ /// // Total number of patterns in the set.
+ /// assert_eq!(2, matches.len());
+ /// ```
+ #[inline]
+ pub fn len(&self) -> usize {
+ self.0.capacity()
+ }
+
+ /// Returns an iterator over the indices of the regexes that matched.
+ ///
+ /// This will always produce matches in ascending order, where the index
+ /// yielded corresponds to the index of the regex that matched with respect
+ /// to its position when initially building the set.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::RegexSet;
+ ///
+ /// let set = RegexSet::new([
+ /// r"[0-9]",
+ /// r"[a-z]",
+ /// r"[A-Z]",
+ /// r"\p{Greek}",
+ /// ]).unwrap();
+ /// let hay = "βa1";
+ /// let matches: Vec<_> = set.matches(hay).iter().collect();
+ /// assert_eq!(matches, vec![0, 1, 3]);
+ /// ```
+ ///
+ /// Note that `SetMatches` also implements the `IntoIterator` trait, so
+ /// this method is not always needed. For example:
+ ///
+ /// ```
+ /// use regex::RegexSet;
+ ///
+ /// let set = RegexSet::new([
+ /// r"[0-9]",
+ /// r"[a-z]",
+ /// r"[A-Z]",
+ /// r"\p{Greek}",
+ /// ]).unwrap();
+ /// let hay = "βa1";
+ /// let mut matches = vec![];
+ /// for index in set.matches(hay) {
+ /// matches.push(index);
+ /// }
+ /// assert_eq!(matches, vec![0, 1, 3]);
+ /// ```
+ #[inline]
+ pub fn iter(&self) -> SetMatchesIter<'_> {
+ SetMatchesIter(self.0.iter())
+ }
+}
+
+impl IntoIterator for SetMatches {
+ type IntoIter = SetMatchesIntoIter;
+ type Item = usize;
+
+ fn into_iter(self) -> Self::IntoIter {
+ let it = 0..self.0.capacity();
+ SetMatchesIntoIter { patset: self.0, it }
+ }
+}
+
+impl<'a> IntoIterator for &'a SetMatches {
+ type IntoIter = SetMatchesIter<'a>;
+ type Item = usize;
+
+ fn into_iter(self) -> Self::IntoIter {
+ self.iter()
+ }
+}
+
+/// An owned iterator over the set of matches from a regex set.
+///
+/// This will always produce matches in ascending order of index, where the
+/// index corresponds to the index of the regex that matched with respect to
+/// its position when initially building the set.
+///
+/// This iterator is created by calling `SetMatches::into_iter` via the
+/// `IntoIterator` trait. This is automatically done in `for` loops.
+///
+/// # Example
+///
+/// ```
+/// use regex::RegexSet;
+///
+/// let set = RegexSet::new([
+/// r"[0-9]",
+/// r"[a-z]",
+/// r"[A-Z]",
+/// r"\p{Greek}",
+/// ]).unwrap();
+/// let hay = "βa1";
+/// let mut matches = vec![];
+/// for index in set.matches(hay) {
+/// matches.push(index);
+/// }
+/// assert_eq!(matches, vec![0, 1, 3]);
+/// ```
+#[derive(Debug)]
+pub struct SetMatchesIntoIter {
+ patset: PatternSet,
+ it: core::ops::Range<usize>,
+}
+
+impl Iterator for SetMatchesIntoIter {
+ type Item = usize;
+
+ fn next(&mut self) -> Option<usize> {
+ loop {
+ let id = self.it.next()?;
+ if self.patset.contains(PatternID::new_unchecked(id)) {
+ return Some(id);
+ }
+ }
+ }
+
+ fn size_hint(&self) -> (usize, Option<usize>) {
+ self.it.size_hint()
+ }
+}
+
+impl DoubleEndedIterator for SetMatchesIntoIter {
+ fn next_back(&mut self) -> Option<usize> {
+ loop {
+ let id = self.it.next_back()?;
+ if self.patset.contains(PatternID::new_unchecked(id)) {
+ return Some(id);
+ }
+ }
+ }
+}
+
+impl core::iter::FusedIterator for SetMatchesIntoIter {}
+
+/// A borrowed iterator over the set of matches from a regex set.
+///
+/// The lifetime `'a` refers to the lifetime of the [`SetMatches`] value that
+/// created this iterator.
+///
+/// This will always produce matches in ascending order, where the index
+/// corresponds to the index of the regex that matched with respect to its
+/// position when initially building the set.
+///
+/// This iterator is created by the [`SetMatches::iter`] method.
+#[derive(Clone, Debug)]
+pub struct SetMatchesIter<'a>(PatternSetIter<'a>);
+
+impl<'a> Iterator for SetMatchesIter<'a> {
+ type Item = usize;
+
+ fn next(&mut self) -> Option<usize> {
+ self.0.next().map(|pid| pid.as_usize())
+ }
+
+ fn size_hint(&self) -> (usize, Option<usize>) {
+ self.0.size_hint()
+ }
+}
+
+impl<'a> DoubleEndedIterator for SetMatchesIter<'a> {
+ fn next_back(&mut self) -> Option<usize> {
+ self.0.next_back().map(|pid| pid.as_usize())
+ }
+}
+
+impl<'a> core::iter::FusedIterator for SetMatchesIter<'a> {}
+
+impl core::fmt::Debug for RegexSet {
+ fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+ write!(f, "RegexSet({:?})", self.patterns())
+ }
+}
diff --git a/vendor/regex/src/sparse.rs b/vendor/regex/src/sparse.rs
deleted file mode 100644
index 98b726613..000000000
--- a/vendor/regex/src/sparse.rs
+++ /dev/null
@@ -1,84 +0,0 @@
-use std::fmt;
-use std::ops::Deref;
-use std::slice;
-
-/// A sparse set used for representing ordered NFA states.
-///
-/// This supports constant time addition and membership testing. Clearing an
-/// entire set can also be done in constant time. Iteration yields elements
-/// in the order in which they were inserted.
-///
-/// The data structure is based on: https://research.swtch.com/sparse
-/// Note though that we don't actually use uninitialized memory. We generally
-/// reuse allocations, so the initial allocation cost is bareable. However,
-/// its other properties listed above are extremely useful.
-#[derive(Clone)]
-pub struct SparseSet {
- /// Dense contains the instruction pointers in the order in which they
- /// were inserted.
- dense: Vec<usize>,
- /// Sparse maps instruction pointers to their location in dense.
- ///
- /// An instruction pointer is in the set if and only if
- /// sparse[ip] < dense.len() && ip == dense[sparse[ip]].
- sparse: Box<[usize]>,
-}
-
-impl SparseSet {
- pub fn new(size: usize) -> SparseSet {
- SparseSet {
- dense: Vec::with_capacity(size),
- sparse: vec![0; size].into_boxed_slice(),
- }
- }
-
- pub fn len(&self) -> usize {
- self.dense.len()
- }
-
- pub fn is_empty(&self) -> bool {
- self.dense.is_empty()
- }
-
- pub fn capacity(&self) -> usize {
- self.dense.capacity()
- }
-
- pub fn insert(&mut self, value: usize) {
- let i = self.len();
- assert!(i < self.capacity());
- self.dense.push(value);
- self.sparse[value] = i;
- }
-
- pub fn contains(&self, value: usize) -> bool {
- let i = self.sparse[value];
- self.dense.get(i) == Some(&value)
- }
-
- pub fn clear(&mut self) {
- self.dense.clear();
- }
-}
-
-impl fmt::Debug for SparseSet {
- fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
- write!(f, "SparseSet({:?})", self.dense)
- }
-}
-
-impl Deref for SparseSet {
- type Target = [usize];
-
- fn deref(&self) -> &Self::Target {
- &self.dense
- }
-}
-
-impl<'a> IntoIterator for &'a SparseSet {
- type Item = &'a usize;
- type IntoIter = slice::Iter<'a, usize>;
- fn into_iter(self) -> Self::IntoIter {
- self.iter()
- }
-}
diff --git a/vendor/regex/src/testdata/LICENSE b/vendor/regex/src/testdata/LICENSE
deleted file mode 100644
index f47dbf4c4..000000000
--- a/vendor/regex/src/testdata/LICENSE
+++ /dev/null
@@ -1,19 +0,0 @@
-The following license covers testregex.c and all associated test data.
-
-Permission is hereby granted, free of charge, to any person obtaining a
-copy of THIS SOFTWARE FILE (the "Software"), to deal in the Software
-without restriction, including without limitation the rights to use,
-copy, modify, merge, publish, distribute, and/or sell copies of the
-Software, and to permit persons to whom the Software is furnished to do
-so, subject to the following disclaimer:
-
-THIS SOFTWARE IS PROVIDED BY AT&T ``AS IS'' AND ANY EXPRESS OR IMPLIED
-WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
-MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
-IN NO EVENT SHALL AT&T BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/vendor/regex/src/testdata/README b/vendor/regex/src/testdata/README
deleted file mode 100644
index 6efc2dad3..000000000
--- a/vendor/regex/src/testdata/README
+++ /dev/null
@@ -1,17 +0,0 @@
-Test data was taken from the Go distribution, which was in turn taken from the
-testregex test suite:
-
- http://www2.research.att.com/~astopen/testregex/testregex.html
-
-The LICENSE in this directory corresponds to the LICENSE that the data was
-released under.
-
-The tests themselves were modified for RE2/Go. A couple were modified further
-by me (Andrew Gallant) (only in repetition.dat) so that RE2/Go would pass them.
-(Yes, it seems like RE2/Go includes failing test cases.) This may or may not
-have been a bad idea, but I think being consistent with an established Regex
-library is worth something.
-
-Note that these files are read by 'scripts/regex-match-tests.py' and turned
-into Rust tests found in 'regex_macros/tests/matches.rs'.
-
diff --git a/vendor/regex/src/testdata/basic.dat b/vendor/regex/src/testdata/basic.dat
deleted file mode 100644
index 632e1bb41..000000000
--- a/vendor/regex/src/testdata/basic.dat
+++ /dev/null
@@ -1,221 +0,0 @@
-NOTE all standard compliant implementations should pass these : 2002-05-31
-
-BE abracadabra$ abracadabracadabra (7,18)
-BE a...b abababbb (2,7)
-BE XXXXXX ..XXXXXX (2,8)
-E \) () (1,2)
-BE a] a]a (0,2)
-B } } (0,1)
-E \} } (0,1)
-BE \] ] (0,1)
-B ] ] (0,1)
-E ] ] (0,1)
-B { { (0,1)
-B } } (0,1)
-BE ^a ax (0,1)
-BE \^a a^a (1,3)
-BE a\^ a^ (0,2)
-BE a$ aa (1,2)
-BE a\$ a$ (0,2)
-BE ^$ NULL (0,0)
-E $^ NULL (0,0)
-E a($) aa (1,2)(2,2)
-E a*(^a) aa (0,1)(0,1)
-E (..)*(...)* a (0,0)
-E (..)*(...)* abcd (0,4)(2,4)
-E (ab|a)(bc|c) abc (0,3)(0,2)(2,3)
-E (ab)c|abc abc (0,3)(0,2)
-E a{0}b ab (1,2)
-E (a*)(b?)(b+)b{3} aaabbbbbbb (0,10)(0,3)(3,4)(4,7)
-E (a*)(b{0,1})(b{1,})b{3} aaabbbbbbb (0,10)(0,3)(3,4)(4,7)
-E a{9876543210} NULL BADBR
-E ((a|a)|a) a (0,1)(0,1)(0,1)
-E (a*)(a|aa) aaaa (0,4)(0,3)(3,4)
-E a*(a.|aa) aaaa (0,4)(2,4)
-E a(b)|c(d)|a(e)f aef (0,3)(?,?)(?,?)(1,2)
-E (a|b)?.* b (0,1)(0,1)
-E (a|b)c|a(b|c) ac (0,2)(0,1)
-E (a|b)c|a(b|c) ab (0,2)(?,?)(1,2)
-E (a|b)*c|(a|ab)*c abc (0,3)(1,2)
-E (a|b)*c|(a|ab)*c xc (1,2)
-E (.a|.b).*|.*(.a|.b) xa (0,2)(0,2)
-E a?(ab|ba)ab abab (0,4)(0,2)
-E a?(ac{0}b|ba)ab abab (0,4)(0,2)
-E ab|abab abbabab (0,2)
-E aba|bab|bba baaabbbaba (5,8)
-E aba|bab baaabbbaba (6,9)
-E (aa|aaa)*|(a|aaaaa) aa (0,2)(0,2)
-E (a.|.a.)*|(a|.a...) aa (0,2)(0,2)
-E ab|a xabc (1,3)
-E ab|a xxabc (2,4)
-Ei (?-u)(Ab|cD)* aBcD (0,4)(2,4)
-BE [^-] --a (2,3)
-BE [a-]* --a (0,3)
-BE [a-m-]* --amoma-- (0,4)
-E :::1:::0:|:::1:1:0: :::0:::1:::1:::0: (8,17)
-E :::1:::0:|:::1:1:1: :::0:::1:::1:::0: (8,17)
-{E [[:upper:]] A (0,1) [[<element>]] not supported
-E [[:lower:]]+ `az{ (1,3)
-E [[:upper:]]+ @AZ[ (1,3)
-# No collation in Go
-#BE [[-]] [[-]] (2,4)
-#BE [[.NIL.]] NULL ECOLLATE
-#BE [[=aleph=]] NULL ECOLLATE
-}
-BE$ \n \n (0,1)
-BEn$ \n \n (0,1)
-BE$ [^a] \n (0,1)
-BE$ \na \na (0,2)
-E (a)(b)(c) abc (0,3)(0,1)(1,2)(2,3)
-BE xxx xxx (0,3)
-E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) feb 6, (0,6)
-E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) 2/7 (0,3)
-E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) feb 1,Feb 6 (5,11)
-E3 ((((((((((((((((((((((((((((((x)))))))))))))))))))))))))))))) x (0,1)(0,1)(0,1)
-E3 ((((((((((((((((((((((((((((((x))))))))))))))))))))))))))))))* xx (0,2)(1,2)(1,2)
-E a?(ab|ba)* ababababababababababababababababababababababababababababababababababababababababa (0,81)(79,81)
-E abaa|abbaa|abbbaa|abbbbaa ababbabbbabbbabbbbabbbbaa (18,25)
-E abaa|abbaa|abbbaa|abbbbaa ababbabbbabbbabbbbabaa (18,22)
-E aaac|aabc|abac|abbc|baac|babc|bbac|bbbc baaabbbabac (7,11)
-BE$ .* \x01\x7f (0,2)
-E aaaa|bbbb|cccc|ddddd|eeeeee|fffffff|gggg|hhhh|iiiii|jjjjj|kkkkk|llll XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa (53,57)
-L aaaa\nbbbb\ncccc\nddddd\neeeeee\nfffffff\ngggg\nhhhh\niiiii\njjjjj\nkkkkk\nllll XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa NOMATCH
-E a*a*a*a*a*b aaaaaaaaab (0,10)
-BE ^ NULL (0,0)
-BE $ NULL (0,0)
-BE ^$ NULL (0,0)
-BE ^a$ a (0,1)
-BE abc abc (0,3)
-BE abc xabcy (1,4)
-BE abc ababc (2,5)
-BE ab*c abc (0,3)
-BE ab*bc abc (0,3)
-BE ab*bc abbc (0,4)
-BE ab*bc abbbbc (0,6)
-E ab+bc abbc (0,4)
-E ab+bc abbbbc (0,6)
-E ab?bc abbc (0,4)
-E ab?bc abc (0,3)
-E ab?c abc (0,3)
-BE ^abc$ abc (0,3)
-BE ^abc abcc (0,3)
-BE abc$ aabc (1,4)
-BE ^ abc (0,0)
-BE $ abc (3,3)
-BE a.c abc (0,3)
-BE a.c axc (0,3)
-BE a.*c axyzc (0,5)
-BE a[bc]d abd (0,3)
-BE a[b-d]e ace (0,3)
-BE a[b-d] aac (1,3)
-BE a[-b] a- (0,2)
-BE a[b-] a- (0,2)
-BE a] a] (0,2)
-BE a[]]b a]b (0,3)
-BE a[^bc]d aed (0,3)
-BE a[^-b]c adc (0,3)
-BE a[^]b]c adc (0,3)
-E ab|cd abc (0,2)
-E ab|cd abcd (0,2)
-E a\(b a(b (0,3)
-E a\(*b ab (0,2)
-E a\(*b a((b (0,4)
-E ((a)) abc (0,1)(0,1)(0,1)
-E (a)b(c) abc (0,3)(0,1)(2,3)
-E a+b+c aabbabc (4,7)
-E a* aaa (0,3)
-#E (a*)* - (0,0)(0,0)
-E (a*)* - (0,0)(?,?) RE2/Go
-E (a*)+ - (0,0)(0,0)
-#E (a*|b)* - (0,0)(0,0)
-E (a*|b)* - (0,0)(?,?) RE2/Go
-E (a+|b)* ab (0,2)(1,2)
-E (a+|b)+ ab (0,2)(1,2)
-E (a+|b)? ab (0,1)(0,1)
-BE [^ab]* cde (0,3)
-#E (^)* - (0,0)(0,0)
-E (^)* - (0,0)(?,?) RE2/Go
-BE a* NULL (0,0)
-E ([abc])*d abbbcd (0,6)(4,5)
-E ([abc])*bcd abcd (0,4)(0,1)
-E a|b|c|d|e e (0,1)
-E (a|b|c|d|e)f ef (0,2)(0,1)
-#E ((a*|b))* - (0,0)(0,0)(0,0)
-E ((a*|b))* - (0,0)(?,?)(?,?) RE2/Go
-BE abcd*efg abcdefg (0,7)
-BE ab* xabyabbbz (1,3)
-BE ab* xayabbbz (1,2)
-E (ab|cd)e abcde (2,5)(2,4)
-BE [abhgefdc]ij hij (0,3)
-E (a|b)c*d abcd (1,4)(1,2)
-E (ab|ab*)bc abc (0,3)(0,1)
-E a([bc]*)c* abc (0,3)(1,3)
-E a([bc]*)(c*d) abcd (0,4)(1,3)(3,4)
-E a([bc]+)(c*d) abcd (0,4)(1,3)(3,4)
-E a([bc]*)(c+d) abcd (0,4)(1,2)(2,4)
-E a[bcd]*dcdcde adcdcde (0,7)
-E (ab|a)b*c abc (0,3)(0,2)
-E ((a)(b)c)(d) abcd (0,4)(0,3)(0,1)(1,2)(3,4)
-BE [A-Za-z_][A-Za-z0-9_]* alpha (0,5)
-E ^a(bc+|b[eh])g|.h$ abh (1,3)
-E (bc+d$|ef*g.|h?i(j|k)) effgz (0,5)(0,5)
-E (bc+d$|ef*g.|h?i(j|k)) ij (0,2)(0,2)(1,2)
-E (bc+d$|ef*g.|h?i(j|k)) reffgz (1,6)(1,6)
-E (((((((((a))))))))) a (0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)
-BE multiple words multiple words yeah (0,14)
-E (.*)c(.*) abcde (0,5)(0,2)(3,5)
-BE abcd abcd (0,4)
-E a(bc)d abcd (0,4)(1,3)
-E a[-]?c ac (0,3)
-E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Qaddafi (0,15)(?,?)(10,12)
-E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mo'ammar Gadhafi (0,16)(?,?)(11,13)
-E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Kaddafi (0,15)(?,?)(10,12)
-E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Qadhafi (0,15)(?,?)(10,12)
-E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Gadafi (0,14)(?,?)(10,11)
-E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mu'ammar Qadafi (0,15)(?,?)(11,12)
-E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moamar Gaddafi (0,14)(?,?)(9,11)
-E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mu'ammar Qadhdhafi (0,18)(?,?)(13,15)
-E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Khaddafi (0,16)(?,?)(11,13)
-E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghaddafy (0,16)(?,?)(11,13)
-E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghadafi (0,15)(?,?)(11,12)
-E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghaddafi (0,16)(?,?)(11,13)
-E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muamar Kaddafi (0,14)(?,?)(9,11)
-E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Quathafi (0,16)(?,?)(11,13)
-E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Gheddafi (0,16)(?,?)(11,13)
-E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moammar Khadafy (0,15)(?,?)(11,12)
-E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moammar Qudhafi (0,15)(?,?)(10,12)
-E a+(b|c)*d+ aabcdd (0,6)(3,4)
-E ^.+$ vivi (0,4)
-E ^(.+)$ vivi (0,4)(0,4)
-E ^([^!.]+).att.com!(.+)$ gryphon.att.com!eby (0,19)(0,7)(16,19)
-E ^([^!]+!)?([^!]+)$ bas (0,3)(?,?)(0,3)
-E ^([^!]+!)?([^!]+)$ bar!bas (0,7)(0,4)(4,7)
-E ^([^!]+!)?([^!]+)$ foo!bas (0,7)(0,4)(4,7)
-E ^.+!([^!]+!)([^!]+)$ foo!bar!bas (0,11)(4,8)(8,11)
-E ((foo)|(bar))!bas bar!bas (0,7)(0,3)(?,?)(0,3)
-E ((foo)|(bar))!bas foo!bar!bas (4,11)(4,7)(?,?)(4,7)
-E ((foo)|(bar))!bas foo!bas (0,7)(0,3)(0,3)
-E ((foo)|bar)!bas bar!bas (0,7)(0,3)
-E ((foo)|bar)!bas foo!bar!bas (4,11)(4,7)
-E ((foo)|bar)!bas foo!bas (0,7)(0,3)(0,3)
-E (foo|(bar))!bas bar!bas (0,7)(0,3)(0,3)
-E (foo|(bar))!bas foo!bar!bas (4,11)(4,7)(4,7)
-E (foo|(bar))!bas foo!bas (0,7)(0,3)
-E (foo|bar)!bas bar!bas (0,7)(0,3)
-E (foo|bar)!bas foo!bar!bas (4,11)(4,7)
-E (foo|bar)!bas foo!bas (0,7)(0,3)
-E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bar!bas (0,11)(0,11)(?,?)(?,?)(4,8)(8,11)
-E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ bas (0,3)(?,?)(0,3)
-E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ bar!bas (0,7)(0,4)(4,7)
-E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ foo!bar!bas (0,11)(?,?)(?,?)(4,8)(8,11)
-E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ foo!bas (0,7)(0,4)(4,7)
-E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ bas (0,3)(0,3)(?,?)(0,3)
-E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ bar!bas (0,7)(0,7)(0,4)(4,7)
-E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bar!bas (0,11)(0,11)(?,?)(?,?)(4,8)(8,11)
-E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bas (0,7)(0,7)(0,4)(4,7)
-E .*(/XXX).* /XXX (0,4)(0,4)
-E .*(\\XXX).* \XXX (0,4)(0,4)
-E \\XXX \XXX (0,4)
-E .*(/000).* /000 (0,4)(0,4)
-E .*(\\000).* \000 (0,4)(0,4)
-E \\000 \000 (0,4)
diff --git a/vendor/regex/src/testdata/nullsubexpr.dat b/vendor/regex/src/testdata/nullsubexpr.dat
deleted file mode 100644
index 2e18fbb91..000000000
--- a/vendor/regex/src/testdata/nullsubexpr.dat
+++ /dev/null
@@ -1,79 +0,0 @@
-NOTE null subexpression matches : 2002-06-06
-
-E (a*)* a (0,1)(0,1)
-#E SAME x (0,0)(0,0)
-E SAME x (0,0)(?,?) RE2/Go
-E SAME aaaaaa (0,6)(0,6)
-E SAME aaaaaax (0,6)(0,6)
-E (a*)+ a (0,1)(0,1)
-E SAME x (0,0)(0,0)
-E SAME aaaaaa (0,6)(0,6)
-E SAME aaaaaax (0,6)(0,6)
-E (a+)* a (0,1)(0,1)
-E SAME x (0,0)
-E SAME aaaaaa (0,6)(0,6)
-E SAME aaaaaax (0,6)(0,6)
-E (a+)+ a (0,1)(0,1)
-E SAME x NOMATCH
-E SAME aaaaaa (0,6)(0,6)
-E SAME aaaaaax (0,6)(0,6)
-
-E ([a]*)* a (0,1)(0,1)
-#E SAME x (0,0)(0,0)
-E SAME x (0,0)(?,?) RE2/Go
-E SAME aaaaaa (0,6)(0,6)
-E SAME aaaaaax (0,6)(0,6)
-E ([a]*)+ a (0,1)(0,1)
-E SAME x (0,0)(0,0)
-E SAME aaaaaa (0,6)(0,6)
-E SAME aaaaaax (0,6)(0,6)
-E ([^b]*)* a (0,1)(0,1)
-#E SAME b (0,0)(0,0)
-E SAME b (0,0)(?,?) RE2/Go
-E SAME aaaaaa (0,6)(0,6)
-E SAME aaaaaab (0,6)(0,6)
-E ([ab]*)* a (0,1)(0,1)
-E SAME aaaaaa (0,6)(0,6)
-E SAME ababab (0,6)(0,6)
-E SAME bababa (0,6)(0,6)
-E SAME b (0,1)(0,1)
-E SAME bbbbbb (0,6)(0,6)
-E SAME aaaabcde (0,5)(0,5)
-E ([^a]*)* b (0,1)(0,1)
-E SAME bbbbbb (0,6)(0,6)
-#E SAME aaaaaa (0,0)(0,0)
-E SAME aaaaaa (0,0)(?,?) RE2/Go
-E ([^ab]*)* ccccxx (0,6)(0,6)
-#E SAME ababab (0,0)(0,0)
-E SAME ababab (0,0)(?,?) RE2/Go
-
-E ((z)+|a)* zabcde (0,2)(1,2)
-
-#{E a+? aaaaaa (0,1) no *? +? mimimal match ops
-#E (a) aaa (0,1)(0,1)
-#E (a*?) aaa (0,0)(0,0)
-#E (a)*? aaa (0,0)
-#E (a*?)*? aaa (0,0)
-#}
-
-B \(a*\)*\(x\) x (0,1)(0,0)(0,1)
-B \(a*\)*\(x\) ax (0,2)(0,1)(1,2)
-B \(a*\)*\(x\) axa (0,2)(0,1)(1,2)
-B \(a*\)*\(x\)\(\1\) x (0,1)(0,0)(0,1)(1,1)
-B \(a*\)*\(x\)\(\1\) ax (0,2)(1,1)(1,2)(2,2)
-B \(a*\)*\(x\)\(\1\) axa (0,3)(0,1)(1,2)(2,3)
-B \(a*\)*\(x\)\(\1\)\(x\) axax (0,4)(0,1)(1,2)(2,3)(3,4)
-B \(a*\)*\(x\)\(\1\)\(x\) axxa (0,3)(1,1)(1,2)(2,2)(2,3)
-
-#E (a*)*(x) x (0,1)(0,0)(0,1)
-E (a*)*(x) x (0,1)(?,?)(0,1) RE2/Go
-E (a*)*(x) ax (0,2)(0,1)(1,2)
-E (a*)*(x) axa (0,2)(0,1)(1,2)
-
-E (a*)+(x) x (0,1)(0,0)(0,1)
-E (a*)+(x) ax (0,2)(0,1)(1,2)
-E (a*)+(x) axa (0,2)(0,1)(1,2)
-
-E (a*){2}(x) x (0,1)(0,0)(0,1)
-E (a*){2}(x) ax (0,2)(1,1)(1,2)
-E (a*){2}(x) axa (0,2)(1,1)(1,2)
diff --git a/vendor/regex/src/testdata/repetition.dat b/vendor/regex/src/testdata/repetition.dat
deleted file mode 100644
index 3bb212118..000000000
--- a/vendor/regex/src/testdata/repetition.dat
+++ /dev/null
@@ -1,163 +0,0 @@
-NOTE implicit vs. explicit repetitions : 2009-02-02
-
-# Glenn Fowler <gsf@research.att.com>
-# conforming matches (column 4) must match one of the following BREs
-# NOMATCH
-# (0,.)\((\(.\),\(.\))(?,?)(\2,\3)\)*
-# (0,.)\((\(.\),\(.\))(\2,\3)(?,?)\)*
-# i.e., each 3-tuple has two identical elements and one (?,?)
-
-E ((..)|(.)) NULL NOMATCH
-E ((..)|(.))((..)|(.)) NULL NOMATCH
-E ((..)|(.))((..)|(.))((..)|(.)) NULL NOMATCH
-
-E ((..)|(.)){1} NULL NOMATCH
-E ((..)|(.)){2} NULL NOMATCH
-E ((..)|(.)){3} NULL NOMATCH
-
-E ((..)|(.))* NULL (0,0)
-
-E ((..)|(.)) a (0,1)(0,1)(?,?)(0,1)
-E ((..)|(.))((..)|(.)) a NOMATCH
-E ((..)|(.))((..)|(.))((..)|(.)) a NOMATCH
-
-E ((..)|(.)){1} a (0,1)(0,1)(?,?)(0,1)
-E ((..)|(.)){2} a NOMATCH
-E ((..)|(.)){3} a NOMATCH
-
-E ((..)|(.))* a (0,1)(0,1)(?,?)(0,1)
-
-E ((..)|(.)) aa (0,2)(0,2)(0,2)(?,?)
-E ((..)|(.))((..)|(.)) aa (0,2)(0,1)(?,?)(0,1)(1,2)(?,?)(1,2)
-E ((..)|(.))((..)|(.))((..)|(.)) aa NOMATCH
-
-E ((..)|(.)){1} aa (0,2)(0,2)(0,2)(?,?)
-E ((..)|(.)){2} aa (0,2)(1,2)(?,?)(1,2)
-E ((..)|(.)){3} aa NOMATCH
-
-E ((..)|(.))* aa (0,2)(0,2)(0,2)(?,?)
-
-E ((..)|(.)) aaa (0,2)(0,2)(0,2)(?,?)
-E ((..)|(.))((..)|(.)) aaa (0,3)(0,2)(0,2)(?,?)(2,3)(?,?)(2,3)
-E ((..)|(.))((..)|(.))((..)|(.)) aaa (0,3)(0,1)(?,?)(0,1)(1,2)(?,?)(1,2)(2,3)(?,?)(2,3)
-
-E ((..)|(.)){1} aaa (0,2)(0,2)(0,2)(?,?)
-#E ((..)|(.)){2} aaa (0,3)(2,3)(?,?)(2,3)
-E ((..)|(.)){2} aaa (0,3)(2,3)(0,2)(2,3) RE2/Go
-E ((..)|(.)){3} aaa (0,3)(2,3)(?,?)(2,3)
-
-#E ((..)|(.))* aaa (0,3)(2,3)(?,?)(2,3)
-E ((..)|(.))* aaa (0,3)(2,3)(0,2)(2,3) RE2/Go
-
-E ((..)|(.)) aaaa (0,2)(0,2)(0,2)(?,?)
-E ((..)|(.))((..)|(.)) aaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)
-E ((..)|(.))((..)|(.))((..)|(.)) aaaa (0,4)(0,2)(0,2)(?,?)(2,3)(?,?)(2,3)(3,4)(?,?)(3,4)
-
-E ((..)|(.)){1} aaaa (0,2)(0,2)(0,2)(?,?)
-E ((..)|(.)){2} aaaa (0,4)(2,4)(2,4)(?,?)
-#E ((..)|(.)){3} aaaa (0,4)(3,4)(?,?)(3,4)
-E ((..)|(.)){3} aaaa (0,4)(3,4)(0,2)(3,4) RE2/Go
-
-E ((..)|(.))* aaaa (0,4)(2,4)(2,4)(?,?)
-
-E ((..)|(.)) aaaaa (0,2)(0,2)(0,2)(?,?)
-E ((..)|(.))((..)|(.)) aaaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)
-E ((..)|(.))((..)|(.))((..)|(.)) aaaaa (0,5)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)(4,5)(?,?)(4,5)
-
-E ((..)|(.)){1} aaaaa (0,2)(0,2)(0,2)(?,?)
-E ((..)|(.)){2} aaaaa (0,4)(2,4)(2,4)(?,?)
-#E ((..)|(.)){3} aaaaa (0,5)(4,5)(?,?)(4,5)
-E ((..)|(.)){3} aaaaa (0,5)(4,5)(2,4)(4,5) RE2/Go
-
-#E ((..)|(.))* aaaaa (0,5)(4,5)(?,?)(4,5)
-E ((..)|(.))* aaaaa (0,5)(4,5)(2,4)(4,5) RE2/Go
-
-E ((..)|(.)) aaaaaa (0,2)(0,2)(0,2)(?,?)
-E ((..)|(.))((..)|(.)) aaaaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)
-E ((..)|(.))((..)|(.))((..)|(.)) aaaaaa (0,6)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)(4,6)(4,6)(?,?)
-
-E ((..)|(.)){1} aaaaaa (0,2)(0,2)(0,2)(?,?)
-E ((..)|(.)){2} aaaaaa (0,4)(2,4)(2,4)(?,?)
-E ((..)|(.)){3} aaaaaa (0,6)(4,6)(4,6)(?,?)
-
-E ((..)|(.))* aaaaaa (0,6)(4,6)(4,6)(?,?)
-
-NOTE additional repetition tests graciously provided by Chris Kuklewicz www.haskell.org 2009-02-02
-
-# These test a bug in OS X / FreeBSD / NetBSD, and libtree.
-# Linux/GLIBC gets the {8,} and {8,8} wrong.
-
-:HA#100:E X(.?){0,}Y X1234567Y (0,9)(7,8)
-:HA#101:E X(.?){1,}Y X1234567Y (0,9)(7,8)
-:HA#102:E X(.?){2,}Y X1234567Y (0,9)(7,8)
-:HA#103:E X(.?){3,}Y X1234567Y (0,9)(7,8)
-:HA#104:E X(.?){4,}Y X1234567Y (0,9)(7,8)
-:HA#105:E X(.?){5,}Y X1234567Y (0,9)(7,8)
-:HA#106:E X(.?){6,}Y X1234567Y (0,9)(7,8)
-:HA#107:E X(.?){7,}Y X1234567Y (0,9)(7,8)
-:HA#108:E X(.?){8,}Y X1234567Y (0,9)(8,8)
-#:HA#110:E X(.?){0,8}Y X1234567Y (0,9)(7,8)
-:HA#110:E X(.?){0,8}Y X1234567Y (0,9)(8,8) RE2/Go
-#:HA#111:E X(.?){1,8}Y X1234567Y (0,9)(7,8)
-:HA#111:E X(.?){1,8}Y X1234567Y (0,9)(8,8) RE2/Go
-#:HA#112:E X(.?){2,8}Y X1234567Y (0,9)(7,8)
-:HA#112:E X(.?){2,8}Y X1234567Y (0,9)(8,8) RE2/Go
-#:HA#113:E X(.?){3,8}Y X1234567Y (0,9)(7,8)
-:HA#113:E X(.?){3,8}Y X1234567Y (0,9)(8,8) RE2/Go
-#:HA#114:E X(.?){4,8}Y X1234567Y (0,9)(7,8)
-:HA#114:E X(.?){4,8}Y X1234567Y (0,9)(8,8) RE2/Go
-#:HA#115:E X(.?){5,8}Y X1234567Y (0,9)(7,8)
-:HA#115:E X(.?){5,8}Y X1234567Y (0,9)(8,8) RE2/Go
-#:HA#116:E X(.?){6,8}Y X1234567Y (0,9)(7,8)
-:HA#116:E X(.?){6,8}Y X1234567Y (0,9)(8,8) RE2/Go
-#:HA#117:E X(.?){7,8}Y X1234567Y (0,9)(7,8)
-:HA#117:E X(.?){7,8}Y X1234567Y (0,9)(8,8) RE2/Go
-:HA#118:E X(.?){8,8}Y X1234567Y (0,9)(8,8)
-
-# These test a fixed bug in my regex-tdfa that did not keep the expanded
-# form properly grouped, so right association did the wrong thing with
-# these ambiguous patterns (crafted just to test my code when I became
-# suspicious of my implementation). The first subexpression should use
-# "ab" then "a" then "bcd".
-
-# OS X / FreeBSD / NetBSD badly fail many of these, with impossible
-# results like (0,6)(4,5)(6,6).
-
-:HA#260:E (a|ab|c|bcd){0,}(d*) ababcd (0,1)(0,1)(1,1)
-:HA#261:E (a|ab|c|bcd){1,}(d*) ababcd (0,1)(0,1)(1,1)
-:HA#262:E (a|ab|c|bcd){2,}(d*) ababcd (0,6)(3,6)(6,6)
-:HA#263:E (a|ab|c|bcd){3,}(d*) ababcd (0,6)(3,6)(6,6)
-:HA#264:E (a|ab|c|bcd){4,}(d*) ababcd NOMATCH
-:HA#265:E (a|ab|c|bcd){0,10}(d*) ababcd (0,1)(0,1)(1,1)
-:HA#266:E (a|ab|c|bcd){1,10}(d*) ababcd (0,1)(0,1)(1,1)
-:HA#267:E (a|ab|c|bcd){2,10}(d*) ababcd (0,6)(3,6)(6,6)
-:HA#268:E (a|ab|c|bcd){3,10}(d*) ababcd (0,6)(3,6)(6,6)
-:HA#269:E (a|ab|c|bcd){4,10}(d*) ababcd NOMATCH
-:HA#270:E (a|ab|c|bcd)*(d*) ababcd (0,1)(0,1)(1,1)
-:HA#271:E (a|ab|c|bcd)+(d*) ababcd (0,1)(0,1)(1,1)
-
-# The above worked on Linux/GLIBC but the following often fail.
-# They also trip up OS X / FreeBSD / NetBSD:
-
-#:HA#280:E (ab|a|c|bcd){0,}(d*) ababcd (0,6)(3,6)(6,6)
-:HA#280:E (ab|a|c|bcd){0,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
-#:HA#281:E (ab|a|c|bcd){1,}(d*) ababcd (0,6)(3,6)(6,6)
-:HA#281:E (ab|a|c|bcd){1,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
-#:HA#282:E (ab|a|c|bcd){2,}(d*) ababcd (0,6)(3,6)(6,6)
-:HA#282:E (ab|a|c|bcd){2,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
-#:HA#283:E (ab|a|c|bcd){3,}(d*) ababcd (0,6)(3,6)(6,6)
-:HA#283:E (ab|a|c|bcd){3,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
-:HA#284:E (ab|a|c|bcd){4,}(d*) ababcd NOMATCH
-#:HA#285:E (ab|a|c|bcd){0,10}(d*) ababcd (0,6)(3,6)(6,6)
-:HA#285:E (ab|a|c|bcd){0,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
-#:HA#286:E (ab|a|c|bcd){1,10}(d*) ababcd (0,6)(3,6)(6,6)
-:HA#286:E (ab|a|c|bcd){1,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
-#:HA#287:E (ab|a|c|bcd){2,10}(d*) ababcd (0,6)(3,6)(6,6)
-:HA#287:E (ab|a|c|bcd){2,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
-#:HA#288:E (ab|a|c|bcd){3,10}(d*) ababcd (0,6)(3,6)(6,6)
-:HA#288:E (ab|a|c|bcd){3,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
-:HA#289:E (ab|a|c|bcd){4,10}(d*) ababcd NOMATCH
-#:HA#290:E (ab|a|c|bcd)*(d*) ababcd (0,6)(3,6)(6,6)
-:HA#290:E (ab|a|c|bcd)*(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
-#:HA#291:E (ab|a|c|bcd)+(d*) ababcd (0,6)(3,6)(6,6)
-:HA#291:E (ab|a|c|bcd)+(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
diff --git a/vendor/regex/src/utf8.rs b/vendor/regex/src/utf8.rs
deleted file mode 100644
index 2dfd2c0d1..000000000
--- a/vendor/regex/src/utf8.rs
+++ /dev/null
@@ -1,264 +0,0 @@
-/// A few elementary UTF-8 encoding and decoding functions used by the matching
-/// engines.
-///
-/// In an ideal world, the matching engines operate on `&str` and we can just
-/// lean on the standard library for all our UTF-8 needs. However, to support
-/// byte based regexes (that can match on arbitrary bytes which may contain
-/// UTF-8), we need to be capable of searching and decoding UTF-8 on a `&[u8]`.
-/// The standard library doesn't really recognize this use case, so we have
-/// to build it out ourselves.
-///
-/// Should this be factored out into a separate crate? It seems independently
-/// useful. There are other crates that already exist (e.g., `utf-8`) that have
-/// overlapping use cases. Not sure what to do.
-use std::char;
-
-const TAG_CONT: u8 = 0b1000_0000;
-const TAG_TWO: u8 = 0b1100_0000;
-const TAG_THREE: u8 = 0b1110_0000;
-const TAG_FOUR: u8 = 0b1111_0000;
-
-/// Returns the smallest possible index of the next valid UTF-8 sequence
-/// starting after `i`.
-pub fn next_utf8(text: &[u8], i: usize) -> usize {
- let b = match text.get(i) {
- None => return i + 1,
- Some(&b) => b,
- };
- let inc = if b <= 0x7F {
- 1
- } else if b <= 0b110_11111 {
- 2
- } else if b <= 0b1110_1111 {
- 3
- } else {
- 4
- };
- i + inc
-}
-
-/// Decode a single UTF-8 sequence into a single Unicode codepoint from `src`.
-///
-/// If no valid UTF-8 sequence could be found, then `None` is returned.
-/// Otherwise, the decoded codepoint and the number of bytes read is returned.
-/// The number of bytes read (for a valid UTF-8 sequence) is guaranteed to be
-/// 1, 2, 3 or 4.
-///
-/// Note that a UTF-8 sequence is invalid if it is incorrect UTF-8, encodes a
-/// codepoint that is out of range (surrogate codepoints are out of range) or
-/// is not the shortest possible UTF-8 sequence for that codepoint.
-#[inline]
-pub fn decode_utf8(src: &[u8]) -> Option<(char, usize)> {
- let b0 = match src.get(0) {
- None => return None,
- Some(&b) if b <= 0x7F => return Some((b as char, 1)),
- Some(&b) => b,
- };
- match b0 {
- 0b110_00000..=0b110_11111 => {
- if src.len() < 2 {
- return None;
- }
- let b1 = src[1];
- if 0b11_000000 & b1 != TAG_CONT {
- return None;
- }
- let cp = ((b0 & !TAG_TWO) as u32) << 6 | ((b1 & !TAG_CONT) as u32);
- match cp {
- 0x80..=0x7FF => char::from_u32(cp).map(|cp| (cp, 2)),
- _ => None,
- }
- }
- 0b1110_0000..=0b1110_1111 => {
- if src.len() < 3 {
- return None;
- }
- let (b1, b2) = (src[1], src[2]);
- if 0b11_000000 & b1 != TAG_CONT {
- return None;
- }
- if 0b11_000000 & b2 != TAG_CONT {
- return None;
- }
- let cp = ((b0 & !TAG_THREE) as u32) << 12
- | ((b1 & !TAG_CONT) as u32) << 6
- | ((b2 & !TAG_CONT) as u32);
- match cp {
- // char::from_u32 will disallow surrogate codepoints.
- 0x800..=0xFFFF => char::from_u32(cp).map(|cp| (cp, 3)),
- _ => None,
- }
- }
- 0b11110_000..=0b11110_111 => {
- if src.len() < 4 {
- return None;
- }
- let (b1, b2, b3) = (src[1], src[2], src[3]);
- if 0b11_000000 & b1 != TAG_CONT {
- return None;
- }
- if 0b11_000000 & b2 != TAG_CONT {
- return None;
- }
- if 0b11_000000 & b3 != TAG_CONT {
- return None;
- }
- let cp = ((b0 & !TAG_FOUR) as u32) << 18
- | ((b1 & !TAG_CONT) as u32) << 12
- | ((b2 & !TAG_CONT) as u32) << 6
- | ((b3 & !TAG_CONT) as u32);
- match cp {
- 0x10000..=0x0010_FFFF => char::from_u32(cp).map(|cp| (cp, 4)),
- _ => None,
- }
- }
- _ => None,
- }
-}
-
-/// Like `decode_utf8`, but decodes the last UTF-8 sequence in `src` instead
-/// of the first.
-pub fn decode_last_utf8(src: &[u8]) -> Option<(char, usize)> {
- if src.is_empty() {
- return None;
- }
- let mut start = src.len() - 1;
- if src[start] <= 0x7F {
- return Some((src[start] as char, 1));
- }
- while start > src.len().saturating_sub(4) {
- start -= 1;
- if is_start_byte(src[start]) {
- break;
- }
- }
- match decode_utf8(&src[start..]) {
- None => None,
- Some((_, n)) if n < src.len() - start => None,
- Some((cp, n)) => Some((cp, n)),
- }
-}
-
-fn is_start_byte(b: u8) -> bool {
- b & 0b11_000000 != 0b1_0000000
-}
-
-#[cfg(test)]
-mod tests {
- use std::str;
-
- use quickcheck::quickcheck;
-
- use super::{
- decode_last_utf8, decode_utf8, TAG_CONT, TAG_FOUR, TAG_THREE, TAG_TWO,
- };
-
- #[test]
- fn prop_roundtrip() {
- fn p(given_cp: char) -> bool {
- let mut tmp = [0; 4];
- let encoded_len = given_cp.encode_utf8(&mut tmp).len();
- let (got_cp, got_len) = decode_utf8(&tmp[..encoded_len]).unwrap();
- encoded_len == got_len && given_cp == got_cp
- }
- quickcheck(p as fn(char) -> bool)
- }
-
- #[test]
- fn prop_roundtrip_last() {
- fn p(given_cp: char) -> bool {
- let mut tmp = [0; 4];
- let encoded_len = given_cp.encode_utf8(&mut tmp).len();
- let (got_cp, got_len) =
- decode_last_utf8(&tmp[..encoded_len]).unwrap();
- encoded_len == got_len && given_cp == got_cp
- }
- quickcheck(p as fn(char) -> bool)
- }
-
- #[test]
- fn prop_encode_matches_std() {
- fn p(cp: char) -> bool {
- let mut got = [0; 4];
- let n = cp.encode_utf8(&mut got).len();
- let expected = cp.to_string();
- &got[..n] == expected.as_bytes()
- }
- quickcheck(p as fn(char) -> bool)
- }
-
- #[test]
- fn prop_decode_matches_std() {
- fn p(given_cp: char) -> bool {
- let mut tmp = [0; 4];
- let n = given_cp.encode_utf8(&mut tmp).len();
- let (got_cp, _) = decode_utf8(&tmp[..n]).unwrap();
- let expected_cp =
- str::from_utf8(&tmp[..n]).unwrap().chars().next().unwrap();
- got_cp == expected_cp
- }
- quickcheck(p as fn(char) -> bool)
- }
-
- #[test]
- fn prop_decode_last_matches_std() {
- fn p(given_cp: char) -> bool {
- let mut tmp = [0; 4];
- let n = given_cp.encode_utf8(&mut tmp).len();
- let (got_cp, _) = decode_last_utf8(&tmp[..n]).unwrap();
- let expected_cp = str::from_utf8(&tmp[..n])
- .unwrap()
- .chars()
- .rev()
- .next()
- .unwrap();
- got_cp == expected_cp
- }
- quickcheck(p as fn(char) -> bool)
- }
-
- #[test]
- fn reject_invalid() {
- // Invalid start byte
- assert_eq!(decode_utf8(&[0xFF]), None);
- // Surrogate pair
- assert_eq!(decode_utf8(&[0xED, 0xA0, 0x81]), None);
- // Invalid continuation byte.
- assert_eq!(decode_utf8(&[0xD4, 0xC2]), None);
- // Bad lengths
- assert_eq!(decode_utf8(&[0xC3]), None); // 2 bytes
- assert_eq!(decode_utf8(&[0xEF, 0xBF]), None); // 3 bytes
- assert_eq!(decode_utf8(&[0xF4, 0x8F, 0xBF]), None); // 4 bytes
- // Not a minimal UTF-8 sequence
- assert_eq!(decode_utf8(&[TAG_TWO, TAG_CONT | b'a']), None);
- assert_eq!(decode_utf8(&[TAG_THREE, TAG_CONT, TAG_CONT | b'a']), None);
- assert_eq!(
- decode_utf8(&[TAG_FOUR, TAG_CONT, TAG_CONT, TAG_CONT | b'a',]),
- None
- );
- }
-
- #[test]
- fn reject_invalid_last() {
- // Invalid start byte
- assert_eq!(decode_last_utf8(&[0xFF]), None);
- // Surrogate pair
- assert_eq!(decode_last_utf8(&[0xED, 0xA0, 0x81]), None);
- // Bad lengths
- assert_eq!(decode_last_utf8(&[0xC3]), None); // 2 bytes
- assert_eq!(decode_last_utf8(&[0xEF, 0xBF]), None); // 3 bytes
- assert_eq!(decode_last_utf8(&[0xF4, 0x8F, 0xBF]), None); // 4 bytes
- // Not a minimal UTF-8 sequence
- assert_eq!(decode_last_utf8(&[TAG_TWO, TAG_CONT | b'a']), None);
- assert_eq!(
- decode_last_utf8(&[TAG_THREE, TAG_CONT, TAG_CONT | b'a',]),
- None
- );
- assert_eq!(
- decode_last_utf8(
- &[TAG_FOUR, TAG_CONT, TAG_CONT, TAG_CONT | b'a',]
- ),
- None
- );
- }
-}