summaryrefslogtreecommitdiffstats
path: root/vendor/regex/src
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/regex/src')
-rw-r--r--vendor/regex/src/compile.rs297
-rw-r--r--vendor/regex/src/dfa.rs2
-rw-r--r--vendor/regex/src/error.rs22
-rw-r--r--vendor/regex/src/exec.rs196
-rw-r--r--vendor/regex/src/expand.rs10
-rw-r--r--vendor/regex/src/lib.rs40
-rw-r--r--vendor/regex/src/literal/imp.rs81
-rw-r--r--vendor/regex/src/literal/mod.rs6
-rw-r--r--vendor/regex/src/prog.rs4
-rw-r--r--vendor/regex/src/re_bytes.rs128
-rw-r--r--vendor/regex/src/re_set.rs11
-rw-r--r--vendor/regex/src/re_trait.rs2
-rw-r--r--vendor/regex/src/re_unicode.rs128
13 files changed, 700 insertions, 227 deletions
diff --git a/vendor/regex/src/compile.rs b/vendor/regex/src/compile.rs
index 90ca25015..23e63ec89 100644
--- a/vendor/regex/src/compile.rs
+++ b/vendor/regex/src/compile.rs
@@ -4,7 +4,7 @@ use std::iter;
use std::result;
use std::sync::Arc;
-use regex_syntax::hir::{self, Hir};
+use regex_syntax::hir::{self, Hir, Look};
use regex_syntax::is_word_byte;
use regex_syntax::utf8::{Utf8Range, Utf8Sequence, Utf8Sequences};
@@ -137,13 +137,24 @@ impl Compiler {
}
fn compile_one(mut self, expr: &Hir) -> result::Result<Program, Error> {
+ if self.compiled.only_utf8
+ && expr.properties().look_set().contains(Look::WordAsciiNegate)
+ {
+ return Err(Error::Syntax(
+ "ASCII-only \\B is not allowed in Unicode regexes \
+ because it may result in invalid UTF-8 matches"
+ .to_string(),
+ ));
+ }
// If we're compiling a forward DFA and we aren't anchored, then
// add a `.*?` before the first capture group.
// Other matching engines handle this by baking the logic into the
// matching engine itself.
let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 };
- self.compiled.is_anchored_start = expr.is_anchored_start();
- self.compiled.is_anchored_end = expr.is_anchored_end();
+ self.compiled.is_anchored_start =
+ expr.properties().look_set_prefix().contains(Look::Start);
+ self.compiled.is_anchored_end =
+ expr.properties().look_set_suffix().contains(Look::End);
if self.compiled.needs_dotstar() {
dotstar_patch = self.c_dotstar()?;
self.compiled.start = dotstar_patch.entry;
@@ -159,6 +170,8 @@ impl Compiler {
self.fill_to_next(patch.hole);
self.compiled.matches = vec![self.insts.len()];
self.push_compiled(Inst::Match(0));
+ self.compiled.static_captures_len =
+ expr.properties().static_explicit_captures_len();
self.compile_finish()
}
@@ -168,10 +181,12 @@ impl Compiler {
) -> result::Result<Program, Error> {
debug_assert!(exprs.len() > 1);
- self.compiled.is_anchored_start =
- exprs.iter().all(|e| e.is_anchored_start());
- self.compiled.is_anchored_end =
- exprs.iter().all(|e| e.is_anchored_end());
+ self.compiled.is_anchored_start = exprs
+ .iter()
+ .all(|e| e.properties().look_set_prefix().contains(Look::Start));
+ self.compiled.is_anchored_end = exprs
+ .iter()
+ .all(|e| e.properties().look_set_suffix().contains(Look::End));
let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 };
if self.compiled.needs_dotstar() {
dotstar_patch = self.c_dotstar()?;
@@ -272,17 +287,21 @@ impl Compiler {
self.check_size()?;
match *expr.kind() {
Empty => self.c_empty(),
- Literal(hir::Literal::Unicode(c)) => self.c_char(c),
- Literal(hir::Literal::Byte(b)) => {
- assert!(self.compiled.uses_bytes());
- self.c_byte(b)
+ Literal(hir::Literal(ref bytes)) => {
+ if self.compiled.is_reverse {
+ let mut bytes = bytes.to_vec();
+ bytes.reverse();
+ self.c_literal(&bytes)
+ } else {
+ self.c_literal(bytes)
+ }
}
Class(hir::Class::Unicode(ref cls)) => self.c_class(cls.ranges()),
Class(hir::Class::Bytes(ref cls)) => {
if self.compiled.uses_bytes() {
self.c_class_bytes(cls.ranges())
} else {
- assert!(cls.is_all_ascii());
+ assert!(cls.is_ascii());
let mut char_ranges = vec![];
for r in cls.iter() {
let (s, e) = (r.start() as char, r.end() as char);
@@ -291,92 +310,94 @@ impl Compiler {
self.c_class(&char_ranges)
}
}
- Anchor(hir::Anchor::StartLine) if self.compiled.is_reverse => {
- self.byte_classes.set_range(b'\n', b'\n');
- self.c_empty_look(prog::EmptyLook::EndLine)
- }
- Anchor(hir::Anchor::StartLine) => {
- self.byte_classes.set_range(b'\n', b'\n');
- self.c_empty_look(prog::EmptyLook::StartLine)
- }
- Anchor(hir::Anchor::EndLine) if self.compiled.is_reverse => {
- self.byte_classes.set_range(b'\n', b'\n');
- self.c_empty_look(prog::EmptyLook::StartLine)
- }
- Anchor(hir::Anchor::EndLine) => {
- self.byte_classes.set_range(b'\n', b'\n');
- self.c_empty_look(prog::EmptyLook::EndLine)
- }
- Anchor(hir::Anchor::StartText) if self.compiled.is_reverse => {
- self.c_empty_look(prog::EmptyLook::EndText)
- }
- Anchor(hir::Anchor::StartText) => {
- self.c_empty_look(prog::EmptyLook::StartText)
- }
- Anchor(hir::Anchor::EndText) if self.compiled.is_reverse => {
- self.c_empty_look(prog::EmptyLook::StartText)
- }
- Anchor(hir::Anchor::EndText) => {
- self.c_empty_look(prog::EmptyLook::EndText)
- }
- WordBoundary(hir::WordBoundary::Unicode) => {
- if !cfg!(feature = "unicode-perl") {
- return Err(Error::Syntax(
- "Unicode word boundaries are unavailable when \
- the unicode-perl feature is disabled"
- .to_string(),
- ));
+ Look(ref look) => match *look {
+ hir::Look::Start if self.compiled.is_reverse => {
+ self.c_empty_look(prog::EmptyLook::EndText)
}
- self.compiled.has_unicode_word_boundary = true;
- self.byte_classes.set_word_boundary();
- // We also make sure that all ASCII bytes are in a different
- // class from non-ASCII bytes. Otherwise, it's possible for
- // ASCII bytes to get lumped into the same class as non-ASCII
- // bytes. This in turn may cause the lazy DFA to falsely start
- // when it sees an ASCII byte that maps to a byte class with
- // non-ASCII bytes. This ensures that never happens.
- self.byte_classes.set_range(0, 0x7F);
- self.c_empty_look(prog::EmptyLook::WordBoundary)
- }
- WordBoundary(hir::WordBoundary::UnicodeNegate) => {
- if !cfg!(feature = "unicode-perl") {
+ hir::Look::Start => {
+ self.c_empty_look(prog::EmptyLook::StartText)
+ }
+ hir::Look::End if self.compiled.is_reverse => {
+ self.c_empty_look(prog::EmptyLook::StartText)
+ }
+ hir::Look::End => self.c_empty_look(prog::EmptyLook::EndText),
+ hir::Look::StartLF if self.compiled.is_reverse => {
+ self.byte_classes.set_range(b'\n', b'\n');
+ self.c_empty_look(prog::EmptyLook::EndLine)
+ }
+ hir::Look::StartLF => {
+ self.byte_classes.set_range(b'\n', b'\n');
+ self.c_empty_look(prog::EmptyLook::StartLine)
+ }
+ hir::Look::EndLF if self.compiled.is_reverse => {
+ self.byte_classes.set_range(b'\n', b'\n');
+ self.c_empty_look(prog::EmptyLook::StartLine)
+ }
+ hir::Look::EndLF => {
+ self.byte_classes.set_range(b'\n', b'\n');
+ self.c_empty_look(prog::EmptyLook::EndLine)
+ }
+ hir::Look::StartCRLF | hir::Look::EndCRLF => {
return Err(Error::Syntax(
- "Unicode word boundaries are unavailable when \
- the unicode-perl feature is disabled"
+ "CRLF-aware line anchors are not supported yet"
.to_string(),
));
}
- self.compiled.has_unicode_word_boundary = true;
- self.byte_classes.set_word_boundary();
- // See comments above for why we set the ASCII range here.
- self.byte_classes.set_range(0, 0x7F);
- self.c_empty_look(prog::EmptyLook::NotWordBoundary)
- }
- WordBoundary(hir::WordBoundary::Ascii) => {
- self.byte_classes.set_word_boundary();
- self.c_empty_look(prog::EmptyLook::WordBoundaryAscii)
- }
- WordBoundary(hir::WordBoundary::AsciiNegate) => {
- self.byte_classes.set_word_boundary();
- self.c_empty_look(prog::EmptyLook::NotWordBoundaryAscii)
- }
- Group(ref g) => match g.kind {
- hir::GroupKind::NonCapturing => self.c(&g.hir),
- hir::GroupKind::CaptureIndex(index) => {
- if index as usize >= self.compiled.captures.len() {
- self.compiled.captures.push(None);
+ hir::Look::WordAscii => {
+ self.byte_classes.set_word_boundary();
+ self.c_empty_look(prog::EmptyLook::WordBoundaryAscii)
+ }
+ hir::Look::WordAsciiNegate => {
+ self.byte_classes.set_word_boundary();
+ self.c_empty_look(prog::EmptyLook::NotWordBoundaryAscii)
+ }
+ hir::Look::WordUnicode => {
+ if !cfg!(feature = "unicode-perl") {
+ return Err(Error::Syntax(
+ "Unicode word boundaries are unavailable when \
+ the unicode-perl feature is disabled"
+ .to_string(),
+ ));
}
- self.c_capture(2 * index as usize, &g.hir)
+ self.compiled.has_unicode_word_boundary = true;
+ self.byte_classes.set_word_boundary();
+ // We also make sure that all ASCII bytes are in a different
+ // class from non-ASCII bytes. Otherwise, it's possible for
+ // ASCII bytes to get lumped into the same class as non-ASCII
+ // bytes. This in turn may cause the lazy DFA to falsely start
+ // when it sees an ASCII byte that maps to a byte class with
+ // non-ASCII bytes. This ensures that never happens.
+ self.byte_classes.set_range(0, 0x7F);
+ self.c_empty_look(prog::EmptyLook::WordBoundary)
}
- hir::GroupKind::CaptureName { index, ref name } => {
- if index as usize >= self.compiled.captures.len() {
- let n = name.to_string();
- self.compiled.captures.push(Some(n.clone()));
- self.capture_name_idx.insert(n, index as usize);
+ hir::Look::WordUnicodeNegate => {
+ if !cfg!(feature = "unicode-perl") {
+ return Err(Error::Syntax(
+ "Unicode word boundaries are unavailable when \
+ the unicode-perl feature is disabled"
+ .to_string(),
+ ));
}
- self.c_capture(2 * index as usize, &g.hir)
+ self.compiled.has_unicode_word_boundary = true;
+ self.byte_classes.set_word_boundary();
+ // See comments above for why we set the ASCII range here.
+ self.byte_classes.set_range(0, 0x7F);
+ self.c_empty_look(prog::EmptyLook::NotWordBoundary)
}
},
+ Capture(hir::Capture { index, ref name, ref sub }) => {
+ if index as usize >= self.compiled.captures.len() {
+ let name = match *name {
+ None => None,
+ Some(ref boxed_str) => Some(boxed_str.to_string()),
+ };
+ self.compiled.captures.push(name.clone());
+ if let Some(name) = name {
+ self.capture_name_idx.insert(name, index as usize);
+ }
+ }
+ self.c_capture(2 * index as usize, sub)
+ }
Concat(ref es) => {
if self.compiled.is_reverse {
self.c_concat(es.iter().rev())
@@ -420,21 +441,19 @@ impl Compiler {
}
fn c_dotstar(&mut self) -> Result {
- Ok(if !self.compiled.only_utf8() {
- self.c(&Hir::repetition(hir::Repetition {
- kind: hir::RepetitionKind::ZeroOrMore,
- greedy: false,
- hir: Box::new(Hir::any(true)),
- }))?
- .unwrap()
+ let hir = if self.compiled.only_utf8() {
+ Hir::dot(hir::Dot::AnyChar)
} else {
- self.c(&Hir::repetition(hir::Repetition {
- kind: hir::RepetitionKind::ZeroOrMore,
+ Hir::dot(hir::Dot::AnyByte)
+ };
+ Ok(self
+ .c(&Hir::repetition(hir::Repetition {
+ min: 0,
+ max: None,
greedy: false,
- hir: Box::new(Hir::any(false)),
+ sub: Box::new(hir),
}))?
- .unwrap()
- })
+ .unwrap())
}
fn c_char(&mut self, c: char) -> ResultOrEmpty {
@@ -457,7 +476,11 @@ impl Compiler {
fn c_class(&mut self, ranges: &[hir::ClassUnicodeRange]) -> ResultOrEmpty {
use std::mem::size_of;
- assert!(!ranges.is_empty());
+ if ranges.is_empty() {
+ return Err(Error::Syntax(
+ "empty character classes are not allowed".to_string(),
+ ));
+ }
if self.compiled.uses_bytes() {
Ok(Some(CompileClass { c: self, ranges }.compile()?))
} else {
@@ -482,7 +505,11 @@ impl Compiler {
&mut self,
ranges: &[hir::ClassBytesRange],
) -> ResultOrEmpty {
- debug_assert!(!ranges.is_empty());
+ if ranges.is_empty() {
+ return Err(Error::Syntax(
+ "empty character classes are not allowed".to_string(),
+ ));
+ }
let first_split_entry = self.insts.len();
let mut holes = vec![];
@@ -513,6 +540,52 @@ impl Compiler {
Ok(Some(Patch { hole, entry: self.insts.len() - 1 }))
}
+ fn c_literal(&mut self, bytes: &[u8]) -> ResultOrEmpty {
+ match core::str::from_utf8(bytes) {
+ Ok(string) => {
+ let mut it = string.chars();
+ let Patch { mut hole, entry } = loop {
+ match it.next() {
+ None => return self.c_empty(),
+ Some(ch) => {
+ if let Some(p) = self.c_char(ch)? {
+ break p;
+ }
+ }
+ }
+ };
+ for ch in it {
+ if let Some(p) = self.c_char(ch)? {
+ self.fill(hole, p.entry);
+ hole = p.hole;
+ }
+ }
+ Ok(Some(Patch { hole, entry }))
+ }
+ Err(_) => {
+ assert!(self.compiled.uses_bytes());
+ let mut it = bytes.iter().copied();
+ let Patch { mut hole, entry } = loop {
+ match it.next() {
+ None => return self.c_empty(),
+ Some(byte) => {
+ if let Some(p) = self.c_byte(byte)? {
+ break p;
+ }
+ }
+ }
+ };
+ for byte in it {
+ if let Some(p) = self.c_byte(byte)? {
+ self.fill(hole, p.entry);
+ hole = p.hole;
+ }
+ }
+ Ok(Some(Patch { hole, entry }))
+ }
+ }
+ }
+
fn c_concat<'a, I>(&mut self, exprs: I) -> ResultOrEmpty
where
I: IntoIterator<Item = &'a Hir>,
@@ -587,19 +660,15 @@ impl Compiler {
}
fn c_repeat(&mut self, rep: &hir::Repetition) -> ResultOrEmpty {
- use regex_syntax::hir::RepetitionKind::*;
- match rep.kind {
- ZeroOrOne => self.c_repeat_zero_or_one(&rep.hir, rep.greedy),
- ZeroOrMore => self.c_repeat_zero_or_more(&rep.hir, rep.greedy),
- OneOrMore => self.c_repeat_one_or_more(&rep.hir, rep.greedy),
- Range(hir::RepetitionRange::Exactly(min_max)) => {
- self.c_repeat_range(&rep.hir, rep.greedy, min_max, min_max)
- }
- Range(hir::RepetitionRange::AtLeast(min)) => {
- self.c_repeat_range_min_or_more(&rep.hir, rep.greedy, min)
+ match (rep.min, rep.max) {
+ (0, Some(1)) => self.c_repeat_zero_or_one(&rep.sub, rep.greedy),
+ (0, None) => self.c_repeat_zero_or_more(&rep.sub, rep.greedy),
+ (1, None) => self.c_repeat_one_or_more(&rep.sub, rep.greedy),
+ (min, None) => {
+ self.c_repeat_range_min_or_more(&rep.sub, rep.greedy, min)
}
- Range(hir::RepetitionRange::Bounded(min, max)) => {
- self.c_repeat_range(&rep.hir, rep.greedy, min, max)
+ (min, Some(max)) => {
+ self.c_repeat_range(&rep.sub, rep.greedy, min, max)
}
}
}
diff --git a/vendor/regex/src/dfa.rs b/vendor/regex/src/dfa.rs
index dc9952120..78ed71021 100644
--- a/vendor/regex/src/dfa.rs
+++ b/vendor/regex/src/dfa.rs
@@ -1576,7 +1576,7 @@ impl<'a> Fsm<'a> {
/// inputs, a new state could be created for every byte of input. (This is
/// bad for memory use, so we bound it with a cache.)
fn approximate_size(&self) -> usize {
- self.cache.size + self.prog.approximate_size()
+ self.cache.size
}
}
diff --git a/vendor/regex/src/error.rs b/vendor/regex/src/error.rs
index 3e0ec7521..6c341f604 100644
--- a/vendor/regex/src/error.rs
+++ b/vendor/regex/src/error.rs
@@ -6,8 +6,26 @@ use std::iter::repeat;
pub enum Error {
/// A syntax error.
Syntax(String),
- /// The compiled program exceeded the set size limit.
- /// The argument is the size limit imposed.
+ /// The compiled program exceeded the set size
+ /// limit. The argument is the size limit imposed by
+ /// [`RegexBuilder::size_limit`](crate::RegexBuilder::size_limit). Even
+ /// when not configured explicitly, it defaults to a reasonable limit.
+ ///
+ /// If you're getting this error, it occurred because your regex has been
+ /// compiled to an intermediate state that is too big. It is important to
+ /// note that exceeding this limit does _not_ mean the regex is too big to
+ /// _work_, but rather, the regex is big enough that it may wind up being
+ /// surprisingly slow when used in a search. In other words, this error is
+ /// meant to be a practical heuristic for avoiding a performance footgun,
+ /// and especially so for the case where the regex pattern is coming from
+ /// an untrusted source.
+ ///
+ /// There are generally two ways to move forward if you hit this error.
+ /// The first is to find some way to use a smaller regex. The second is to
+ /// increase the size limit via `RegexBuilder::size_limit`. However, if
+ /// your regex pattern is not from a trusted source, then neither of these
+ /// approaches may be appropriate. Instead, you'll have to determine just
+ /// how big of a regex you want to allow.
CompiledTooBig(usize),
/// Hints that destructuring should not be exhaustive.
///
diff --git a/vendor/regex/src/exec.rs b/vendor/regex/src/exec.rs
index b9abcdc04..ee8b589d2 100644
--- a/vendor/regex/src/exec.rs
+++ b/vendor/regex/src/exec.rs
@@ -4,9 +4,9 @@ use std::panic::AssertUnwindSafe;
use std::sync::Arc;
#[cfg(feature = "perf-literal")]
-use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind};
-use regex_syntax::hir::literal::Literals;
-use regex_syntax::hir::Hir;
+use aho_corasick::{AhoCorasick, MatchKind};
+use regex_syntax::hir::literal;
+use regex_syntax::hir::{Hir, Look};
use regex_syntax::ParserBuilder;
use crate::backtrack;
@@ -78,15 +78,18 @@ struct ExecReadOnly {
/// not supported.) Note that this program contains an embedded `.*?`
/// preceding the first capture group, unless the regex is anchored at the
/// beginning.
+ #[allow(dead_code)]
dfa: Program,
/// The same as above, except the program is reversed (and there is no
/// preceding `.*?`). This is used by the DFA to find the starting location
/// of matches.
+ #[allow(dead_code)]
dfa_reverse: Program,
/// A set of suffix literals extracted from the regex.
///
/// Prefix literals are stored on the `Program`, since they are used inside
/// the matching engines.
+ #[allow(dead_code)]
suffixes: LiteralSearcher,
/// An Aho-Corasick automaton with leftmost-first match semantics.
///
@@ -98,7 +101,7 @@ struct ExecReadOnly {
/// if we were to exhaust the ID space, we probably would have long
/// surpassed the compilation size limit.
#[cfg(feature = "perf-literal")]
- ac: Option<AhoCorasick<u32>>,
+ ac: Option<AhoCorasick>,
/// match_type encodes as much upfront knowledge about how we're going to
/// execute a search as possible.
match_type: MatchType,
@@ -121,8 +124,8 @@ pub struct ExecBuilder {
/// literals.
struct Parsed {
exprs: Vec<Hir>,
- prefixes: Literals,
- suffixes: Literals,
+ prefixes: literal::Seq,
+ suffixes: literal::Seq,
bytes: bool,
}
@@ -228,8 +231,8 @@ impl ExecBuilder {
/// Parse the current set of patterns into their AST and extract literals.
fn parse(&self) -> Result<Parsed, Error> {
let mut exprs = Vec::with_capacity(self.options.pats.len());
- let mut prefixes = Some(Literals::empty());
- let mut suffixes = Some(Literals::empty());
+ let mut prefixes = Some(literal::Seq::empty());
+ let mut suffixes = Some(literal::Seq::empty());
let mut bytes = false;
let is_set = self.options.pats.len() > 1;
// If we're compiling a regex set and that set has any anchored
@@ -243,54 +246,103 @@ impl ExecBuilder {
.swap_greed(self.options.swap_greed)
.ignore_whitespace(self.options.ignore_whitespace)
.unicode(self.options.unicode)
- .allow_invalid_utf8(!self.only_utf8)
+ .utf8(self.only_utf8)
.nest_limit(self.options.nest_limit)
.build();
let expr =
parser.parse(pat).map_err(|e| Error::Syntax(e.to_string()))?;
- bytes = bytes || !expr.is_always_utf8();
+ let props = expr.properties();
+ // This used to just check whether the HIR matched valid UTF-8
+ // or not, but in regex-syntax 0.7, we changed our definition of
+ // "matches valid UTF-8" to exclude zero-width matches. And in
+ // particular, previously, we considered WordAsciiNegate (that
+ // is '(?-u:\B)') to be capable of matching invalid UTF-8. Our
+ // matcher engines were built under this assumption and fixing
+ // them is not worth it with the imminent plan to switch over to
+ // regex-automata. So for now, we retain the previous behavior by
+ // just explicitly treating the presence of a negated ASCII word
+ // boundary as forcing use to use a byte oriented automaton.
+ bytes = bytes
+ || !props.is_utf8()
+ || props.look_set().contains(Look::WordAsciiNegate);
if cfg!(feature = "perf-literal") {
- if !expr.is_anchored_start() && expr.is_any_anchored_start() {
+ if !props.look_set_prefix().contains(Look::Start)
+ && props.look_set().contains(Look::Start)
+ {
// Partial anchors unfortunately make it hard to use
// prefixes, so disable them.
prefixes = None;
- } else if is_set && expr.is_anchored_start() {
+ } else if is_set
+ && props.look_set_prefix_any().contains(Look::Start)
+ {
// Regex sets with anchors do not go well with literal
// optimizations.
prefixes = None;
+ } else if props.look_set_prefix_any().contains_word() {
+ // The new literal extractor ignores look-around while
+ // the old one refused to extract prefixes from regexes
+ // that began with a \b. These old creaky regex internals
+ // can't deal with it, so we drop it.
+ prefixes = None;
+ } else if props.look_set_prefix_any().contains(Look::StartLF) {
+ // Similar to the reasoning for word boundaries, this old
+ // regex engine can't handle literal prefixes with '(?m:^)'
+ // at the beginning of a regex.
+ prefixes = None;
}
- prefixes = prefixes.and_then(|mut prefixes| {
- if !prefixes.union_prefixes(&expr) {
- None
- } else {
- Some(prefixes)
- }
- });
- if !expr.is_anchored_end() && expr.is_any_anchored_end() {
+ if !props.look_set_suffix().contains(Look::End)
+ && props.look_set().contains(Look::End)
+ {
// Partial anchors unfortunately make it hard to use
// suffixes, so disable them.
suffixes = None;
- } else if is_set && expr.is_anchored_end() {
+ } else if is_set
+ && props.look_set_suffix_any().contains(Look::End)
+ {
// Regex sets with anchors do not go well with literal
// optimizations.
suffixes = None;
+ } else if props.look_set_suffix_any().contains_word() {
+ // See the prefix case for reasoning here.
+ suffixes = None;
+ } else if props.look_set_suffix_any().contains(Look::EndLF) {
+ // See the prefix case for reasoning here.
+ suffixes = None;
}
- suffixes = suffixes.and_then(|mut suffixes| {
- if !suffixes.union_suffixes(&expr) {
- None
+
+ let (mut pres, mut suffs) =
+ if prefixes.is_none() && suffixes.is_none() {
+ (literal::Seq::infinite(), literal::Seq::infinite())
} else {
- Some(suffixes)
- }
+ literal_analysis(&expr)
+ };
+ // These old creaky regex internals can't handle cases where
+ // the literal sequences are exact but there are look-around
+ // assertions. So we make sure the sequences are inexact if
+ // there are look-around assertions anywhere. This forces the
+ // regex engines to run instead of assuming that a literal
+ // match implies an overall match.
+ if !props.look_set().is_empty() {
+ pres.make_inexact();
+ suffs.make_inexact();
+ }
+ prefixes = prefixes.and_then(|mut prefixes| {
+ prefixes.union(&mut pres);
+ Some(prefixes)
+ });
+ suffixes = suffixes.and_then(|mut suffixes| {
+ suffixes.union(&mut suffs);
+ Some(suffixes)
});
}
exprs.push(expr);
}
Ok(Parsed {
exprs,
- prefixes: prefixes.unwrap_or_else(Literals::empty),
- suffixes: suffixes.unwrap_or_else(Literals::empty),
+ prefixes: prefixes.unwrap_or_else(literal::Seq::empty),
+ suffixes: suffixes.unwrap_or_else(literal::Seq::empty),
bytes,
})
}
@@ -356,7 +408,7 @@ impl ExecBuilder {
}
#[cfg(feature = "perf-literal")]
- fn build_aho_corasick(&self, parsed: &Parsed) -> Option<AhoCorasick<u32>> {
+ fn build_aho_corasick(&self, parsed: &Parsed) -> Option<AhoCorasick> {
if parsed.exprs.len() != 1 {
return None;
}
@@ -370,10 +422,9 @@ impl ExecBuilder {
return None;
}
Some(
- AhoCorasickBuilder::new()
+ AhoCorasick::builder()
.match_kind(MatchKind::LeftmostFirst)
- .auto_configure(&lits)
- .build_with_size::<u32, _, _>(&lits)
+ .build(&lits)
// This should never happen because we'd long exceed the
// compilation limit for regexes first.
.expect("AC automaton too big"),
@@ -1311,6 +1362,12 @@ impl Exec {
pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
&self.ro.nfa.capture_name_idx
}
+
+ /// If the number of capture groups in every match is always the same, then
+ /// return that number. Otherwise return `None`.
+ pub fn static_captures_len(&self) -> Option<usize> {
+ self.ro.nfa.static_captures_len
+ }
}
impl Clone for Exec {
@@ -1382,7 +1439,18 @@ impl ExecReadOnly {
// This case shouldn't happen. When the regex isn't
// anchored, then complete prefixes should imply complete
// suffixes.
- Some(MatchType::Literal(MatchLiteralType::Unanchored))
+ //
+ // The above is wrong! This case can happen. While
+ // complete prefixes should imply complete suffixes
+ // here, that doesn't necessarily mean we have a useful
+ // prefix matcher! It could be the case that the literal
+ // searcher decided the prefixes---even though they are
+ // "complete"---weren't good enough and thus created an
+ // empty matcher. If that happens and we return Unanchored
+ // here, then we'll end up using that matcher, which is
+ // very bad because it matches at every position. So...
+ // return None.
+ None
};
}
None
@@ -1557,7 +1625,7 @@ fn alternation_literals(expr: &Hir) -> Option<Vec<Vec<u8>>> {
// optimization pipeline, because this is a terribly inflexible way to go
// about things.
- if !expr.is_alternation_literal() {
+ if !expr.properties().is_alternation_literal() {
return None;
}
let alts = match *expr.kind() {
@@ -1565,25 +1633,19 @@ fn alternation_literals(expr: &Hir) -> Option<Vec<Vec<u8>>> {
_ => return None, // one literal isn't worth it
};
- let extendlit = |lit: &Literal, dst: &mut Vec<u8>| match *lit {
- Literal::Unicode(c) => {
- let mut buf = [0; 4];
- dst.extend_from_slice(c.encode_utf8(&mut buf).as_bytes());
- }
- Literal::Byte(b) => {
- dst.push(b);
- }
- };
-
let mut lits = vec![];
for alt in alts {
let mut lit = vec![];
match *alt.kind() {
- HirKind::Literal(ref x) => extendlit(x, &mut lit),
+ HirKind::Literal(Literal(ref bytes)) => {
+ lit.extend_from_slice(bytes)
+ }
HirKind::Concat(ref exprs) => {
for e in exprs {
match *e.kind() {
- HirKind::Literal(ref x) => extendlit(x, &mut lit),
+ HirKind::Literal(Literal(ref bytes)) => {
+ lit.extend_from_slice(bytes);
+ }
_ => unreachable!("expected literal, got {:?}", e),
}
}
@@ -1595,6 +1657,48 @@ fn alternation_literals(expr: &Hir) -> Option<Vec<Vec<u8>>> {
Some(lits)
}
+#[cfg(not(feature = "perf-literal"))]
+fn literal_analysis(_: &Hir) -> (literal::Seq, literal::Seq) {
+ (literal::Seq::infinite(), literal::Seq::infinite())
+}
+
+#[cfg(feature = "perf-literal")]
+fn literal_analysis(expr: &Hir) -> (literal::Seq, literal::Seq) {
+ const ATTEMPTS: [(usize, usize); 3] = [(5, 50), (4, 30), (3, 20)];
+
+ let mut prefixes = literal::Extractor::new()
+ .kind(literal::ExtractKind::Prefix)
+ .extract(expr);
+ for (keep, limit) in ATTEMPTS {
+ let len = match prefixes.len() {
+ None => break,
+ Some(len) => len,
+ };
+ if len <= limit {
+ break;
+ }
+ prefixes.keep_first_bytes(keep);
+ prefixes.minimize_by_preference();
+ }
+
+ let mut suffixes = literal::Extractor::new()
+ .kind(literal::ExtractKind::Suffix)
+ .extract(expr);
+ for (keep, limit) in ATTEMPTS {
+ let len = match suffixes.len() {
+ None => break,
+ Some(len) => len,
+ };
+ if len <= limit {
+ break;
+ }
+ suffixes.keep_last_bytes(keep);
+ suffixes.minimize_by_preference();
+ }
+
+ (prefixes, suffixes)
+}
+
#[cfg(test)]
mod test {
#[test]
diff --git a/vendor/regex/src/expand.rs b/vendor/regex/src/expand.rs
index 67b514926..98fafc949 100644
--- a/vendor/regex/src/expand.rs
+++ b/vendor/regex/src/expand.rs
@@ -182,7 +182,8 @@ fn find_cap_ref_braced(rep: &[u8], mut i: usize) -> Option<CaptureRef<'_>> {
})
}
-/// Returns true if and only if the given byte is allowed in a capture name.
+/// Returns true if and only if the given byte is allowed in a capture name
+/// written in non-brace form.
fn is_valid_cap_letter(b: u8) -> bool {
match b {
b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' | b'_' => true,
@@ -236,4 +237,11 @@ mod tests {
find!(find_cap_ref17, "$x_$y", c!("x_", 3));
find!(find_cap_ref18, "${#}", c!("#", 4));
find!(find_cap_ref19, "${Z[}", c!("Z[", 5));
+ find!(find_cap_ref20, "${¾}", c!("¾", 5));
+ find!(find_cap_ref21, "${¾a}", c!("¾a", 6));
+ find!(find_cap_ref22, "${a¾}", c!("a¾", 6));
+ find!(find_cap_ref23, "${☃}", c!("☃", 6));
+ find!(find_cap_ref24, "${a☃}", c!("a☃", 7));
+ find!(find_cap_ref25, "${☃a}", c!("☃a", 7));
+ find!(find_cap_ref26, "${名字}", c!("名字", 9));
}
diff --git a/vendor/regex/src/lib.rs b/vendor/regex/src/lib.rs
index 6b95739c5..82c1b77ad 100644
--- a/vendor/regex/src/lib.rs
+++ b/vendor/regex/src/lib.rs
@@ -199,6 +199,8 @@ instead.)
This implementation executes regular expressions **only** on valid UTF-8
while exposing match locations as byte indices into the search string. (To
relax this restriction, use the [`bytes`](bytes/index.html) sub-module.)
+Conceptually, the regex engine works by matching a haystack as if it were a
+sequence of Unicode scalar values.
Only simple case folding is supported. Namely, when matching
case-insensitively, the characters are first mapped using the "simple" case
@@ -285,9 +287,9 @@ a separate crate, [`regex-syntax`](https://docs.rs/regex-syntax).
. any character except new line (includes new line with s flag)
\d digit (\p{Nd})
\D not digit
-\pN One-letter name Unicode character class
+\pX Unicode character class identified by a one-letter name
\p{Greek} Unicode character class (general category or script)
-\PN Negated one-letter name Unicode character class
+\PX Negated Unicode character class identified by a one-letter name
\P{Greek} negated Unicode character class (general category or script)
</pre>
@@ -325,6 +327,25 @@ xy concatenation (x followed by y)
x|y alternation (x or y, prefer x)
</pre>
+This example shows how an alternation works, and what it means to prefer a
+branch in the alternation over subsequent branches.
+
+```
+use regex::Regex;
+
+let haystack = "samwise";
+// If 'samwise' comes first in our alternation, then it is
+// preferred as a match, even if the regex engine could
+// technically detect that 'sam' led to a match earlier.
+let re = Regex::new(r"samwise|sam").unwrap();
+assert_eq!("samwise", re.find(haystack).unwrap().as_str());
+// But if 'sam' comes first, then it will match instead.
+// In this case, it is impossible for 'samwise' to match
+// because 'sam' is a prefix of it.
+let re = Regex::new(r"sam|samwise").unwrap();
+assert_eq!("sam", re.find(haystack).unwrap().as_str());
+```
+
## Repetitions
<pre class="rust">
@@ -360,12 +381,19 @@ regex matches `abc` at positions `0`, `1`, `2` and `3`.
<pre class="rust">
(exp) numbered capture group (indexed by opening parenthesis)
-(?P&lt;name&gt;exp) named (also numbered) capture group (allowed chars: [_0-9a-zA-Z.\[\]])
+(?P&lt;name&gt;exp) named (also numbered) capture group (names must be alpha-numeric)
+(?&lt;name&gt;exp) named (also numbered) capture group (names must be alpha-numeric)
(?:exp) non-capturing group
(?flags) set flags within current group
(?flags:exp) set flags for exp (non-capturing)
</pre>
+Capture group names must be any sequence of alpha-numeric Unicode codepoints,
+in addition to `.`, `_`, `[` and `]`. Names must start with either an `_` or
+an alphabetic codepoint. Alphabetic codepoints correspond to the `Alphabetic`
+Unicode property, while numeric codepoints correspond to the union of the
+`Decimal_Number`, `Letter_Number` and `Other_Number` general categories.
+
Flags are each a single character. For example, `(?x)` sets the flag `x`
and `(?-x)` clears the flag `x`. Multiple flags can be set or cleared at
the same time: `(?xy)` sets both the `x` and `y` flags and `(?x-y)` sets
@@ -379,9 +407,13 @@ m multi-line mode: ^ and $ match begin/end of line
s allow . to match \n
U swap the meaning of x* and x*?
u Unicode support (enabled by default)
-x ignore whitespace and allow line comments (starting with `#`)
+x verbose mode, ignores whitespace and allow line comments (starting with `#`)
</pre>
+Note that in verbose mode, whitespace is ignored everywhere, including within
+character classes. To insert whitespace, use its escaped form or a hex literal.
+For example, `\ ` or `\x20` for an ASCII space.
+
Flags can be toggled within a pattern. Here's an example that matches
case-insensitively for the first part but case-sensitively for the second part:
diff --git a/vendor/regex/src/literal/imp.rs b/vendor/regex/src/literal/imp.rs
index 90b2f1160..75fa6e37b 100644
--- a/vendor/regex/src/literal/imp.rs
+++ b/vendor/regex/src/literal/imp.rs
@@ -1,8 +1,8 @@
use std::mem;
-use aho_corasick::{self, packed, AhoCorasick, AhoCorasickBuilder};
+use aho_corasick::{self, packed, AhoCorasick};
use memchr::{memchr, memchr2, memchr3, memmem};
-use regex_syntax::hir::literal::{Literal, Literals};
+use regex_syntax::hir::literal::{Literal, Seq};
/// A prefix extracted from a compiled regular expression.
///
@@ -26,7 +26,7 @@ enum Matcher {
/// A single substring, using vector accelerated routines when available.
Memmem(Memmem),
/// An Aho-Corasick automaton.
- AC { ac: AhoCorasick<u32>, lits: Vec<Literal> },
+ AC { ac: AhoCorasick, lits: Vec<Literal> },
/// A packed multiple substring searcher, using SIMD.
///
/// Note that Aho-Corasick will actually use this packed searcher
@@ -39,27 +39,26 @@ enum Matcher {
impl LiteralSearcher {
/// Returns a matcher that never matches and never advances the input.
pub fn empty() -> Self {
- Self::new(Literals::empty(), Matcher::Empty)
+ Self::new(Seq::infinite(), Matcher::Empty)
}
/// Returns a matcher for literal prefixes from the given set.
- pub fn prefixes(lits: Literals) -> Self {
+ pub fn prefixes(lits: Seq) -> Self {
let matcher = Matcher::prefixes(&lits);
Self::new(lits, matcher)
}
/// Returns a matcher for literal suffixes from the given set.
- pub fn suffixes(lits: Literals) -> Self {
+ pub fn suffixes(lits: Seq) -> Self {
let matcher = Matcher::suffixes(&lits);
Self::new(lits, matcher)
}
- fn new(lits: Literals, matcher: Matcher) -> Self {
- let complete = lits.all_complete();
+ fn new(lits: Seq, matcher: Matcher) -> Self {
LiteralSearcher {
- complete,
- lcp: Memmem::new(lits.longest_common_prefix()),
- lcs: Memmem::new(lits.longest_common_suffix()),
+ complete: lits.is_exact(),
+ lcp: Memmem::new(lits.longest_common_prefix().unwrap_or(b"")),
+ lcs: Memmem::new(lits.longest_common_suffix().unwrap_or(b"")),
matcher,
}
}
@@ -150,7 +149,7 @@ impl LiteralSearcher {
Empty => 0,
Bytes(ref sset) => sset.dense.len(),
Memmem(_) => 1,
- AC { ref ac, .. } => ac.pattern_count(),
+ AC { ref ac, .. } => ac.patterns_len(),
Packed { ref lits, .. } => lits.len(),
}
}
@@ -162,27 +161,31 @@ impl LiteralSearcher {
Empty => 0,
Bytes(ref sset) => sset.approximate_size(),
Memmem(ref single) => single.approximate_size(),
- AC { ref ac, .. } => ac.heap_bytes(),
- Packed { ref s, .. } => s.heap_bytes(),
+ AC { ref ac, .. } => ac.memory_usage(),
+ Packed { ref s, .. } => s.memory_usage(),
}
}
}
impl Matcher {
- fn prefixes(lits: &Literals) -> Self {
+ fn prefixes(lits: &Seq) -> Self {
let sset = SingleByteSet::prefixes(lits);
Matcher::new(lits, sset)
}
- fn suffixes(lits: &Literals) -> Self {
+ fn suffixes(lits: &Seq) -> Self {
let sset = SingleByteSet::suffixes(lits);
Matcher::new(lits, sset)
}
- fn new(lits: &Literals, sset: SingleByteSet) -> Self {
- if lits.literals().is_empty() {
+ fn new(lits: &Seq, sset: SingleByteSet) -> Self {
+ if lits.is_empty() || lits.min_literal_len() == Some(0) {
return Matcher::Empty;
}
+ let lits = match lits.literals() {
+ None => return Matcher::Empty,
+ Some(members) => members,
+ };
if sset.dense.len() >= 26 {
// Avoid trying to match a large number of single bytes.
// This is *very* sensitive to a frequency analysis comparison
@@ -195,26 +198,26 @@ impl Matcher {
if sset.complete {
return Matcher::Bytes(sset);
}
- if lits.literals().len() == 1 {
- return Matcher::Memmem(Memmem::new(&lits.literals()[0]));
+ if lits.len() == 1 {
+ return Matcher::Memmem(Memmem::new(lits[0].as_bytes()));
}
- let pats = lits.literals().to_owned();
+ let pats: Vec<&[u8]> = lits.iter().map(|lit| lit.as_bytes()).collect();
let is_aho_corasick_fast = sset.dense.len() <= 1 && sset.all_ascii;
- if lits.literals().len() <= 100 && !is_aho_corasick_fast {
+ if lits.len() <= 100 && !is_aho_corasick_fast {
let mut builder = packed::Config::new()
.match_kind(packed::MatchKind::LeftmostFirst)
.builder();
if let Some(s) = builder.extend(&pats).build() {
- return Matcher::Packed { s, lits: pats };
+ return Matcher::Packed { s, lits: lits.to_owned() };
}
}
- let ac = AhoCorasickBuilder::new()
+ let ac = AhoCorasick::builder()
.match_kind(aho_corasick::MatchKind::LeftmostFirst)
- .dfa(true)
- .build_with_size::<u32, _, _>(&pats)
+ .kind(Some(aho_corasick::AhoCorasickKind::DFA))
+ .build(&pats)
.unwrap();
- Matcher::AC { ac, lits: pats }
+ Matcher::AC { ac, lits: lits.to_owned() }
}
}
@@ -257,7 +260,7 @@ impl<'a> Iterator for LiteralIter<'a> {
} else {
let next = &lits[0];
*lits = &lits[1..];
- Some(&**next)
+ Some(next.as_bytes())
}
}
LiteralIter::Packed(ref mut lits) => {
@@ -266,7 +269,7 @@ impl<'a> Iterator for LiteralIter<'a> {
} else {
let next = &lits[0];
*lits = &lits[1..];
- Some(&**next)
+ Some(next.as_bytes())
}
}
}
@@ -291,11 +294,15 @@ impl SingleByteSet {
}
}
- fn prefixes(lits: &Literals) -> SingleByteSet {
+ fn prefixes(lits: &Seq) -> SingleByteSet {
let mut sset = SingleByteSet::new();
- for lit in lits.literals() {
+ let lits = match lits.literals() {
+ None => return sset,
+ Some(lits) => lits,
+ };
+ for lit in lits.iter() {
sset.complete = sset.complete && lit.len() == 1;
- if let Some(&b) = lit.get(0) {
+ if let Some(&b) = lit.as_bytes().get(0) {
if !sset.sparse[b as usize] {
if b > 0x7F {
sset.all_ascii = false;
@@ -308,11 +315,15 @@ impl SingleByteSet {
sset
}
- fn suffixes(lits: &Literals) -> SingleByteSet {
+ fn suffixes(lits: &Seq) -> SingleByteSet {
let mut sset = SingleByteSet::new();
- for lit in lits.literals() {
+ let lits = match lits.literals() {
+ None => return sset,
+ Some(lits) => lits,
+ };
+ for lit in lits.iter() {
sset.complete = sset.complete && lit.len() == 1;
- if let Some(&b) = lit.get(lit.len().checked_sub(1).unwrap()) {
+ if let Some(&b) = lit.as_bytes().last() {
if !sset.sparse[b as usize] {
if b > 0x7F {
sset.all_ascii = false;
diff --git a/vendor/regex/src/literal/mod.rs b/vendor/regex/src/literal/mod.rs
index 980f52330..b9fb77aed 100644
--- a/vendor/regex/src/literal/mod.rs
+++ b/vendor/regex/src/literal/mod.rs
@@ -6,7 +6,7 @@ mod imp;
#[allow(missing_docs)]
#[cfg(not(feature = "perf-literal"))]
mod imp {
- use regex_syntax::hir::literal::Literals;
+ use regex_syntax::hir::literal::Seq;
#[derive(Clone, Debug)]
pub struct LiteralSearcher(());
@@ -16,11 +16,11 @@ mod imp {
LiteralSearcher(())
}
- pub fn prefixes(_: Literals) -> Self {
+ pub fn prefixes(_: Seq) -> Self {
LiteralSearcher(())
}
- pub fn suffixes(_: Literals) -> Self {
+ pub fn suffixes(_: Seq) -> Self {
LiteralSearcher(())
}
diff --git a/vendor/regex/src/prog.rs b/vendor/regex/src/prog.rs
index c211f71d8..100862cf1 100644
--- a/vendor/regex/src/prog.rs
+++ b/vendor/regex/src/prog.rs
@@ -27,6 +27,9 @@ pub struct Program {
pub captures: Vec<Option<String>>,
/// Pointers to all named capture groups into `captures`.
pub capture_name_idx: Arc<HashMap<String, usize>>,
+ /// If the number of capture groups is the same for all possible matches,
+ /// then this is that number.
+ pub static_captures_len: Option<usize>,
/// A pointer to the start instruction. This can vary depending on how
/// the program was compiled. For example, programs for use with the DFA
/// engine have a `.*?` inserted at the beginning of unanchored regular
@@ -83,6 +86,7 @@ impl Program {
matches: vec![],
captures: vec![],
capture_name_idx: Arc::new(HashMap::new()),
+ static_captures_len: None,
start: 0,
byte_classes: vec![0; 256],
only_utf8: true,
diff --git a/vendor/regex/src/re_bytes.rs b/vendor/regex/src/re_bytes.rs
index 07e9f98ac..e3a3b019b 100644
--- a/vendor/regex/src/re_bytes.rs
+++ b/vendor/regex/src/re_bytes.rs
@@ -17,7 +17,7 @@ use crate::re_trait::{self, RegularExpression, SubCapturesPosIter};
/// Match represents a single match of a regex in a haystack.
///
/// The lifetime parameter `'t` refers to the lifetime of the matched text.
-#[derive(Copy, Clone, Debug, Eq, PartialEq)]
+#[derive(Copy, Clone, Eq, PartialEq)]
pub struct Match<'t> {
text: &'t [u8],
start: usize,
@@ -37,6 +37,18 @@ impl<'t> Match<'t> {
self.end
}
+ /// Returns true if and only if this match has a length of zero.
+ #[inline]
+ pub fn is_empty(&self) -> bool {
+ self.start == self.end
+ }
+
+ /// Returns the length, in bytes, of this match.
+ #[inline]
+ pub fn len(&self) -> usize {
+ self.end - self.start
+ }
+
/// Returns the range over the starting and ending byte offsets of the
/// match in the haystack.
#[inline]
@@ -57,6 +69,24 @@ impl<'t> Match<'t> {
}
}
+impl<'t> std::fmt::Debug for Match<'t> {
+ fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+ let mut fmt = f.debug_struct("Match");
+ fmt.field("start", &self.start).field("end", &self.end);
+ if let Ok(s) = std::str::from_utf8(self.as_bytes()) {
+ fmt.field("bytes", &s);
+ } else {
+ // FIXME: It would be nice if this could be printed as a string
+            // with invalid UTF-8 replaced with hex escapes. An alloc would
+            // probably be okay if that makes it easier, but regex-automata does
+ // (at time of writing) have internal routines that do this. So
+ // maybe we should expose them.
+ fmt.field("bytes", &self.as_bytes());
+ }
+ fmt.finish()
+ }
+}
+
impl<'t> From<Match<'t>> for Range<usize> {
fn from(m: Match<'t>) -> Range<usize> {
m.range()
@@ -253,12 +283,7 @@ impl Regex {
/// The `0`th capture group is always unnamed, so it must always be
/// accessed with `get(0)` or `[0]`.
pub fn captures<'t>(&self, text: &'t [u8]) -> Option<Captures<'t>> {
- let mut locs = self.capture_locations();
- self.captures_read_at(&mut locs, text, 0).map(move |_| Captures {
- text,
- locs: locs.0,
- named_groups: self.0.capture_name_idx().clone(),
- })
+ self.captures_at(text, 0)
}
/// Returns an iterator over all the non-overlapping capture groups matched
@@ -537,7 +562,14 @@ impl Regex {
/// This method may have the same performance characteristics as
/// `is_match`, except it provides an end location for a match. In
/// particular, the location returned *may be shorter* than the proper end
- /// of the leftmost-first match.
+ /// of the leftmost-first match that you would find via `Regex::find`.
+ ///
+ /// Note that it is not guaranteed that this routine finds the shortest or
+ /// "earliest" possible match. Instead, the main idea of this API is that
+ /// it returns the offset at the point at which the internal regex engine
+ /// has determined that a match has occurred. This may vary depending on
+ /// which internal regex engine is used, and thus, the offset itself may
+ /// change.
///
/// # Example
///
@@ -598,6 +630,25 @@ impl Regex {
.map(|(s, e)| Match::new(text, s, e))
}
+ /// Returns the same as [`Regex::captures`], but starts the search at the
+ /// given offset.
+ ///
+ /// The significance of the starting point is that it takes the surrounding
+ /// context into consideration. For example, the `\A` anchor can only
+ /// match when `start == 0`.
+ pub fn captures_at<'t>(
+ &self,
+ text: &'t [u8],
+ start: usize,
+ ) -> Option<Captures<'t>> {
+ let mut locs = self.capture_locations();
+ self.captures_read_at(&mut locs, text, start).map(move |_| Captures {
+ text,
+ locs: locs.0,
+ named_groups: self.0.capture_name_idx().clone(),
+ })
+ }
+
/// This is like `captures`, but uses
/// [`CaptureLocations`](struct.CaptureLocations.html)
/// instead of
@@ -667,6 +718,46 @@ impl Regex {
self.0.capture_names().len()
}
+ /// Returns the total number of capturing groups that appear in every
+ /// possible match.
+ ///
+ /// If the number of capture groups can vary depending on the match, then
+ /// this returns `None`. That is, a value is only returned when the number
+ /// of matching groups is invariant or "static."
+ ///
+ /// Note that like [`Regex::captures_len`], this **does** include the
+ /// implicit capturing group corresponding to the entire match. Therefore,
+ /// when a non-None value is returned, it is guaranteed to be at least `1`.
+ /// Stated differently, a return value of `Some(0)` is impossible.
+ ///
+ /// # Example
+ ///
+ /// This shows a few cases where a static number of capture groups is
+ /// available and a few cases where it is not.
+ ///
+ /// ```
+ /// use regex::bytes::Regex;
+ ///
+ /// let len = |pattern| {
+ /// Regex::new(pattern).map(|re| re.static_captures_len())
+ /// };
+ ///
+ /// assert_eq!(Some(1), len("a")?);
+ /// assert_eq!(Some(2), len("(a)")?);
+ /// assert_eq!(Some(2), len("(a)|(b)")?);
+ /// assert_eq!(Some(3), len("(a)(b)|(c)(d)")?);
+ /// assert_eq!(None, len("(a)|b")?);
+ /// assert_eq!(None, len("a|(b)")?);
+ /// assert_eq!(None, len("(b)*")?);
+ /// assert_eq!(Some(2), len("(b)+")?);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn static_captures_len(&self) -> Option<usize> {
+ self.0.static_captures_len().map(|len| len.saturating_add(1))
+ }
+
/// Returns an empty set of capture locations that can be reused in
/// multiple calls to `captures_read` or `captures_read_at`.
pub fn capture_locations(&self) -> CaptureLocations {
@@ -856,6 +947,27 @@ impl<'r> FusedIterator for CaptureNames<'r> {}
/// In order to build a value of this type, you'll need to call the
/// `capture_locations` method on the `Regex` being used to execute the search.
/// The value returned can then be reused in subsequent searches.
+///
+/// # Example
+///
+/// This example shows how to create and use `CaptureLocations` in a search.
+///
+/// ```
+/// use regex::bytes::Regex;
+///
+/// let re = Regex::new(r"(?<first>\w+)\s+(?<last>\w+)").unwrap();
+/// let mut locs = re.capture_locations();
+/// let m = re.captures_read(&mut locs, b"Bruce Springsteen").unwrap();
+/// assert_eq!(0..17, m.range());
+/// assert_eq!(Some((0, 17)), locs.get(0));
+/// assert_eq!(Some((0, 5)), locs.get(1));
+/// assert_eq!(Some((6, 17)), locs.get(2));
+///
+/// // Asking for an invalid capture group always returns None.
+/// assert_eq!(None, locs.get(3));
+/// assert_eq!(None, locs.get(34973498648));
+/// assert_eq!(None, locs.get(9944060567225171988));
+/// ```
#[derive(Clone, Debug)]
pub struct CaptureLocations(re_trait::Locations);
diff --git a/vendor/regex/src/re_set.rs b/vendor/regex/src/re_set.rs
index a6d886d76..7c8253f0c 100644
--- a/vendor/regex/src/re_set.rs
+++ b/vendor/regex/src/re_set.rs
@@ -289,6 +289,12 @@ impl RegexSet {
}
}
+impl Default for RegexSet {
+ fn default() -> Self {
+ RegexSet::empty()
+ }
+}
+
/// A set of matches returned by a regex set.
#[derive(Clone, Debug)]
pub struct SetMatches {
@@ -315,6 +321,11 @@ impl SetMatches {
}
/// The total number of regexes in the set that created these matches.
+ ///
+ /// **WARNING:** This always returns the same value as [`RegexSet::len`].
+ /// In particular, it does *not* return the number of elements yielded by
+ /// [`SetMatches::iter`]. The only way to determine the total number of
+ /// matched regexes is to iterate over them.
pub fn len(&self) -> usize {
self.matches.len()
}
diff --git a/vendor/regex/src/re_trait.rs b/vendor/regex/src/re_trait.rs
index d0c717df5..505810c84 100644
--- a/vendor/regex/src/re_trait.rs
+++ b/vendor/regex/src/re_trait.rs
@@ -20,7 +20,7 @@ impl Locations {
/// not match anything. The positions returned are *always* byte indices
/// with respect to the original string matched.
pub fn pos(&self, i: usize) -> Option<(usize, usize)> {
- let (s, e) = (i * 2, i * 2 + 1);
+ let (s, e) = (i.checked_mul(2)?, i.checked_mul(2)?.checked_add(1)?);
match (self.0.get(s), self.0.get(e)) {
(Some(&Some(s)), Some(&Some(e))) => Some((s, e)),
_ => None,
diff --git a/vendor/regex/src/re_unicode.rs b/vendor/regex/src/re_unicode.rs
index 197510ea0..57689086d 100644
--- a/vendor/regex/src/re_unicode.rs
+++ b/vendor/regex/src/re_unicode.rs
@@ -25,7 +25,7 @@ pub fn escape(text: &str) -> String {
/// Match represents a single match of a regex in a haystack.
///
/// The lifetime parameter `'t` refers to the lifetime of the matched text.
-#[derive(Copy, Clone, Debug, Eq, PartialEq)]
+#[derive(Copy, Clone, Eq, PartialEq)]
pub struct Match<'t> {
text: &'t str,
start: usize,
@@ -45,6 +45,18 @@ impl<'t> Match<'t> {
self.end
}
+ /// Returns true if and only if this match has a length of zero.
+ #[inline]
+ pub fn is_empty(&self) -> bool {
+ self.start == self.end
+ }
+
+ /// Returns the length, in bytes, of this match.
+ #[inline]
+ pub fn len(&self) -> usize {
+ self.end - self.start
+ }
+
/// Returns the range over the starting and ending byte offsets of the
/// match in the haystack.
#[inline]
@@ -65,6 +77,16 @@ impl<'t> Match<'t> {
}
}
+impl<'t> std::fmt::Debug for Match<'t> {
+ fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+ f.debug_struct("Match")
+ .field("start", &self.start)
+ .field("end", &self.end)
+ .field("string", &self.as_str())
+ .finish()
+ }
+}
+
impl<'t> From<Match<'t>> for &'t str {
fn from(m: Match<'t>) -> &'t str {
m.as_str()
@@ -309,12 +331,7 @@ impl Regex {
/// The `0`th capture group is always unnamed, so it must always be
/// accessed with `get(0)` or `[0]`.
pub fn captures<'t>(&self, text: &'t str) -> Option<Captures<'t>> {
- let mut locs = self.capture_locations();
- self.captures_read_at(&mut locs, text, 0).map(move |_| Captures {
- text,
- locs: locs.0,
- named_groups: self.0.capture_name_idx().clone(),
- })
+ self.captures_at(text, 0)
}
/// Returns an iterator over all the non-overlapping capture groups matched
@@ -595,7 +612,14 @@ impl Regex {
/// This method may have the same performance characteristics as
/// `is_match`, except it provides an end location for a match. In
/// particular, the location returned *may be shorter* than the proper end
- /// of the leftmost-first match.
+ /// of the leftmost-first match that you would find via `Regex::find`.
+ ///
+ /// Note that it is not guaranteed that this routine finds the shortest or
+ /// "earliest" possible match. Instead, the main idea of this API is that
+ /// it returns the offset at the point at which the internal regex engine
+ /// has determined that a match has occurred. This may vary depending on
+ /// which internal regex engine is used, and thus, the offset itself may
+ /// change.
///
/// # Example
///
@@ -615,12 +639,12 @@ impl Regex {
self.shortest_match_at(text, 0)
}
- /// Returns the same as shortest_match, but starts the search at the given
- /// offset.
+ /// Returns the same as `shortest_match`, but starts the search at the
+ /// given offset.
///
/// The significance of the starting point is that it takes the surrounding
- /// context into consideration. For example, the `\A` anchor can only
- /// match when `start == 0`.
+ /// context into consideration. For example, the `\A` anchor can only match
+ /// when `start == 0`.
pub fn shortest_match_at(
&self,
text: &str,
@@ -656,6 +680,25 @@ impl Regex {
.map(|(s, e)| Match::new(text, s, e))
}
+ /// Returns the same as [`Regex::captures`], but starts the search at the
+ /// given offset.
+ ///
+ /// The significance of the starting point is that it takes the surrounding
+ /// context into consideration. For example, the `\A` anchor can only
+ /// match when `start == 0`.
+ pub fn captures_at<'t>(
+ &self,
+ text: &'t str,
+ start: usize,
+ ) -> Option<Captures<'t>> {
+ let mut locs = self.capture_locations();
+ self.captures_read_at(&mut locs, text, start).map(move |_| Captures {
+ text,
+ locs: locs.0,
+ named_groups: self.0.capture_name_idx().clone(),
+ })
+ }
+
/// This is like `captures`, but uses
/// [`CaptureLocations`](struct.CaptureLocations.html)
/// instead of
@@ -725,6 +768,46 @@ impl Regex {
self.0.capture_names().len()
}
+ /// Returns the total number of capturing groups that appear in every
+ /// possible match.
+ ///
+ /// If the number of capture groups can vary depending on the match, then
+ /// this returns `None`. That is, a value is only returned when the number
+ /// of matching groups is invariant or "static."
+ ///
+ /// Note that like [`Regex::captures_len`], this **does** include the
+ /// implicit capturing group corresponding to the entire match. Therefore,
+ /// when a non-None value is returned, it is guaranteed to be at least `1`.
+ /// Stated differently, a return value of `Some(0)` is impossible.
+ ///
+ /// # Example
+ ///
+ /// This shows a few cases where a static number of capture groups is
+ /// available and a few cases where it is not.
+ ///
+ /// ```
+ /// use regex::Regex;
+ ///
+ /// let len = |pattern| {
+ /// Regex::new(pattern).map(|re| re.static_captures_len())
+ /// };
+ ///
+ /// assert_eq!(Some(1), len("a")?);
+ /// assert_eq!(Some(2), len("(a)")?);
+ /// assert_eq!(Some(2), len("(a)|(b)")?);
+ /// assert_eq!(Some(3), len("(a)(b)|(c)(d)")?);
+ /// assert_eq!(None, len("(a)|b")?);
+ /// assert_eq!(None, len("a|(b)")?);
+ /// assert_eq!(None, len("(b)*")?);
+ /// assert_eq!(Some(2), len("(b)+")?);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn static_captures_len(&self) -> Option<usize> {
+ self.0.static_captures_len().map(|len| len.saturating_add(1))
+ }
+
/// Returns an empty set of capture locations that can be reused in
/// multiple calls to `captures_read` or `captures_read_at`.
pub fn capture_locations(&self) -> CaptureLocations {
@@ -866,6 +949,27 @@ impl<'r, 't> FusedIterator for SplitN<'r, 't> {}
/// In order to build a value of this type, you'll need to call the
/// `capture_locations` method on the `Regex` being used to execute the search.
/// The value returned can then be reused in subsequent searches.
+///
+/// # Example
+///
+/// This example shows how to create and use `CaptureLocations` in a search.
+///
+/// ```
+/// use regex::Regex;
+///
+/// let re = Regex::new(r"(?<first>\w+)\s+(?<last>\w+)").unwrap();
+/// let mut locs = re.capture_locations();
+/// let m = re.captures_read(&mut locs, "Bruce Springsteen").unwrap();
+/// assert_eq!(0..17, m.range());
+/// assert_eq!(Some((0, 17)), locs.get(0));
+/// assert_eq!(Some((0, 5)), locs.get(1));
+/// assert_eq!(Some((6, 17)), locs.get(2));
+///
+/// // Asking for an invalid capture group always returns None.
+/// assert_eq!(None, locs.get(3));
+/// assert_eq!(None, locs.get(34973498648));
+/// assert_eq!(None, locs.get(9944060567225171988));
+/// ```
#[derive(Clone, Debug)]
pub struct CaptureLocations(re_trait::Locations);