diff options
Diffstat (limited to 'vendor/regex/src')
-rw-r--r-- | vendor/regex/src/compile.rs | 297 | ||||
-rw-r--r-- | vendor/regex/src/dfa.rs | 2 | ||||
-rw-r--r-- | vendor/regex/src/error.rs | 22 | ||||
-rw-r--r-- | vendor/regex/src/exec.rs | 196 | ||||
-rw-r--r-- | vendor/regex/src/expand.rs | 10 | ||||
-rw-r--r-- | vendor/regex/src/lib.rs | 40 | ||||
-rw-r--r-- | vendor/regex/src/literal/imp.rs | 81 | ||||
-rw-r--r-- | vendor/regex/src/literal/mod.rs | 6 | ||||
-rw-r--r-- | vendor/regex/src/prog.rs | 4 | ||||
-rw-r--r-- | vendor/regex/src/re_bytes.rs | 128 | ||||
-rw-r--r-- | vendor/regex/src/re_set.rs | 11 | ||||
-rw-r--r-- | vendor/regex/src/re_trait.rs | 2 | ||||
-rw-r--r-- | vendor/regex/src/re_unicode.rs | 128 |
13 files changed, 700 insertions, 227 deletions
diff --git a/vendor/regex/src/compile.rs b/vendor/regex/src/compile.rs index 90ca25015..23e63ec89 100644 --- a/vendor/regex/src/compile.rs +++ b/vendor/regex/src/compile.rs @@ -4,7 +4,7 @@ use std::iter; use std::result; use std::sync::Arc; -use regex_syntax::hir::{self, Hir}; +use regex_syntax::hir::{self, Hir, Look}; use regex_syntax::is_word_byte; use regex_syntax::utf8::{Utf8Range, Utf8Sequence, Utf8Sequences}; @@ -137,13 +137,24 @@ impl Compiler { } fn compile_one(mut self, expr: &Hir) -> result::Result<Program, Error> { + if self.compiled.only_utf8 + && expr.properties().look_set().contains(Look::WordAsciiNegate) + { + return Err(Error::Syntax( + "ASCII-only \\B is not allowed in Unicode regexes \ + because it may result in invalid UTF-8 matches" + .to_string(), + )); + } // If we're compiling a forward DFA and we aren't anchored, then // add a `.*?` before the first capture group. // Other matching engines handle this by baking the logic into the // matching engine itself. let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 }; - self.compiled.is_anchored_start = expr.is_anchored_start(); - self.compiled.is_anchored_end = expr.is_anchored_end(); + self.compiled.is_anchored_start = + expr.properties().look_set_prefix().contains(Look::Start); + self.compiled.is_anchored_end = + expr.properties().look_set_suffix().contains(Look::End); if self.compiled.needs_dotstar() { dotstar_patch = self.c_dotstar()?; self.compiled.start = dotstar_patch.entry; @@ -159,6 +170,8 @@ impl Compiler { self.fill_to_next(patch.hole); self.compiled.matches = vec![self.insts.len()]; self.push_compiled(Inst::Match(0)); + self.compiled.static_captures_len = + expr.properties().static_explicit_captures_len(); self.compile_finish() } @@ -168,10 +181,12 @@ impl Compiler { ) -> result::Result<Program, Error> { debug_assert!(exprs.len() > 1); - self.compiled.is_anchored_start = - exprs.iter().all(|e| e.is_anchored_start()); - self.compiled.is_anchored_end = - exprs.iter().all(|e| e.is_anchored_end()); + self.compiled.is_anchored_start = exprs + .iter() + .all(|e| e.properties().look_set_prefix().contains(Look::Start)); + self.compiled.is_anchored_end = exprs + .iter() + .all(|e| e.properties().look_set_suffix().contains(Look::End)); let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 }; if self.compiled.needs_dotstar() { dotstar_patch = self.c_dotstar()?; @@ -272,17 +287,21 @@ impl Compiler { self.check_size()?; match *expr.kind() { Empty => self.c_empty(), - Literal(hir::Literal::Unicode(c)) => self.c_char(c), - Literal(hir::Literal::Byte(b)) => { - assert!(self.compiled.uses_bytes()); - self.c_byte(b) + Literal(hir::Literal(ref bytes)) => { + if self.compiled.is_reverse { + let mut bytes = bytes.to_vec(); + bytes.reverse(); + self.c_literal(&bytes) + } else { + self.c_literal(bytes) + } } Class(hir::Class::Unicode(ref cls)) => self.c_class(cls.ranges()), Class(hir::Class::Bytes(ref cls)) => { if self.compiled.uses_bytes() { self.c_class_bytes(cls.ranges()) } else { - assert!(cls.is_all_ascii()); + assert!(cls.is_ascii()); let mut char_ranges = vec![]; for r in cls.iter() { let (s, e) = (r.start() as char, r.end() as char); @@ -291,92 +310,94 @@ impl Compiler { self.c_class(&char_ranges) } } - Anchor(hir::Anchor::StartLine) if self.compiled.is_reverse => { - self.byte_classes.set_range(b'\n', b'\n'); - self.c_empty_look(prog::EmptyLook::EndLine) - } - Anchor(hir::Anchor::StartLine) => { - self.byte_classes.set_range(b'\n', b'\n'); - self.c_empty_look(prog::EmptyLook::StartLine) - } - Anchor(hir::Anchor::EndLine) if self.compiled.is_reverse => { - self.byte_classes.set_range(b'\n', b'\n'); - self.c_empty_look(prog::EmptyLook::StartLine) - } - Anchor(hir::Anchor::EndLine) => { - self.byte_classes.set_range(b'\n', b'\n'); - self.c_empty_look(prog::EmptyLook::EndLine) - } - Anchor(hir::Anchor::StartText) if self.compiled.is_reverse => { - self.c_empty_look(prog::EmptyLook::EndText) - } - Anchor(hir::Anchor::StartText) => { - self.c_empty_look(prog::EmptyLook::StartText) - } - Anchor(hir::Anchor::EndText) if self.compiled.is_reverse => { - self.c_empty_look(prog::EmptyLook::StartText) - } - Anchor(hir::Anchor::EndText) => { - self.c_empty_look(prog::EmptyLook::EndText) - } - WordBoundary(hir::WordBoundary::Unicode) => { - if !cfg!(feature = "unicode-perl") { - return Err(Error::Syntax( - "Unicode word boundaries are unavailable when \ - the unicode-perl feature is disabled" - .to_string(), - )); + Look(ref look) => match *look { + hir::Look::Start if self.compiled.is_reverse => { + self.c_empty_look(prog::EmptyLook::EndText) } - self.compiled.has_unicode_word_boundary = true; - self.byte_classes.set_word_boundary(); - // We also make sure that all ASCII bytes are in a different - // class from non-ASCII bytes. Otherwise, it's possible for - // ASCII bytes to get lumped into the same class as non-ASCII - // bytes. This in turn may cause the lazy DFA to falsely start - // when it sees an ASCII byte that maps to a byte class with - // non-ASCII bytes. This ensures that never happens. - self.byte_classes.set_range(0, 0x7F); - self.c_empty_look(prog::EmptyLook::WordBoundary) - } - WordBoundary(hir::WordBoundary::UnicodeNegate) => { - if !cfg!(feature = "unicode-perl") { + hir::Look::Start => { + self.c_empty_look(prog::EmptyLook::StartText) + } + hir::Look::End if self.compiled.is_reverse => { + self.c_empty_look(prog::EmptyLook::StartText) + } + hir::Look::End => self.c_empty_look(prog::EmptyLook::EndText), + hir::Look::StartLF if self.compiled.is_reverse => { + self.byte_classes.set_range(b'\n', b'\n'); + self.c_empty_look(prog::EmptyLook::EndLine) + } + hir::Look::StartLF => { + self.byte_classes.set_range(b'\n', b'\n'); + self.c_empty_look(prog::EmptyLook::StartLine) + } + hir::Look::EndLF if self.compiled.is_reverse => { + self.byte_classes.set_range(b'\n', b'\n'); + self.c_empty_look(prog::EmptyLook::StartLine) + } + hir::Look::EndLF => { + self.byte_classes.set_range(b'\n', b'\n'); + self.c_empty_look(prog::EmptyLook::EndLine) + } + hir::Look::StartCRLF | hir::Look::EndCRLF => { return Err(Error::Syntax( - "Unicode word boundaries are unavailable when \ - the unicode-perl feature is disabled" + "CRLF-aware line anchors are not supported yet" .to_string(), )); } - self.compiled.has_unicode_word_boundary = true; - self.byte_classes.set_word_boundary(); - // See comments above for why we set the ASCII range here. - self.byte_classes.set_range(0, 0x7F); - self.c_empty_look(prog::EmptyLook::NotWordBoundary) - } - WordBoundary(hir::WordBoundary::Ascii) => { - self.byte_classes.set_word_boundary(); - self.c_empty_look(prog::EmptyLook::WordBoundaryAscii) - } - WordBoundary(hir::WordBoundary::AsciiNegate) => { - self.byte_classes.set_word_boundary(); - self.c_empty_look(prog::EmptyLook::NotWordBoundaryAscii) - } - Group(ref g) => match g.kind { - hir::GroupKind::NonCapturing => self.c(&g.hir), - hir::GroupKind::CaptureIndex(index) => { - if index as usize >= self.compiled.captures.len() { - self.compiled.captures.push(None); + hir::Look::WordAscii => { + self.byte_classes.set_word_boundary(); + self.c_empty_look(prog::EmptyLook::WordBoundaryAscii) + } + hir::Look::WordAsciiNegate => { + self.byte_classes.set_word_boundary(); + self.c_empty_look(prog::EmptyLook::NotWordBoundaryAscii) + } + hir::Look::WordUnicode => { + if !cfg!(feature = "unicode-perl") { + return Err(Error::Syntax( + "Unicode word boundaries are unavailable when \ + the unicode-perl feature is disabled" + .to_string(), + )); } - self.c_capture(2 * index as usize, &g.hir) + self.compiled.has_unicode_word_boundary = true; + self.byte_classes.set_word_boundary(); + // We also make sure that all ASCII bytes are in a different + // class from non-ASCII bytes. Otherwise, it's possible for + // ASCII bytes to get lumped into the same class as non-ASCII + // bytes. This in turn may cause the lazy DFA to falsely start + // when it sees an ASCII byte that maps to a byte class with + // non-ASCII bytes. This ensures that never happens. + self.byte_classes.set_range(0, 0x7F); + self.c_empty_look(prog::EmptyLook::WordBoundary) } - hir::GroupKind::CaptureName { index, ref name } => { - if index as usize >= self.compiled.captures.len() { - let n = name.to_string(); - self.compiled.captures.push(Some(n.clone())); - self.capture_name_idx.insert(n, index as usize); + hir::Look::WordUnicodeNegate => { + if !cfg!(feature = "unicode-perl") { + return Err(Error::Syntax( + "Unicode word boundaries are unavailable when \ + the unicode-perl feature is disabled" + .to_string(), + )); } - self.c_capture(2 * index as usize, &g.hir) + self.compiled.has_unicode_word_boundary = true; + self.byte_classes.set_word_boundary(); + // See comments above for why we set the ASCII range here. + self.byte_classes.set_range(0, 0x7F); + self.c_empty_look(prog::EmptyLook::NotWordBoundary) } }, + Capture(hir::Capture { index, ref name, ref sub }) => { + if index as usize >= self.compiled.captures.len() { + let name = match *name { + None => None, + Some(ref boxed_str) => Some(boxed_str.to_string()), + }; + self.compiled.captures.push(name.clone()); + if let Some(name) = name { + self.capture_name_idx.insert(name, index as usize); + } + } + self.c_capture(2 * index as usize, sub) + } Concat(ref es) => { if self.compiled.is_reverse { self.c_concat(es.iter().rev()) @@ -420,21 +441,19 @@ impl Compiler { } fn c_dotstar(&mut self) -> Result { - Ok(if !self.compiled.only_utf8() { - self.c(&Hir::repetition(hir::Repetition { - kind: hir::RepetitionKind::ZeroOrMore, - greedy: false, - hir: Box::new(Hir::any(true)), - }))? - .unwrap() + let hir = if self.compiled.only_utf8() { + Hir::dot(hir::Dot::AnyChar) } else { - self.c(&Hir::repetition(hir::Repetition { - kind: hir::RepetitionKind::ZeroOrMore, + Hir::dot(hir::Dot::AnyByte) + }; + Ok(self + .c(&Hir::repetition(hir::Repetition { + min: 0, + max: None, greedy: false, - hir: Box::new(Hir::any(false)), + sub: Box::new(hir), }))? - .unwrap() - }) + .unwrap()) } fn c_char(&mut self, c: char) -> ResultOrEmpty { @@ -457,7 +476,11 @@ impl Compiler { fn c_class(&mut self, ranges: &[hir::ClassUnicodeRange]) -> ResultOrEmpty { use std::mem::size_of; - assert!(!ranges.is_empty()); + if ranges.is_empty() { + return Err(Error::Syntax( + "empty character classes are not allowed".to_string(), + )); + } if self.compiled.uses_bytes() { Ok(Some(CompileClass { c: self, ranges }.compile()?)) } else { @@ -482,7 +505,11 @@ impl Compiler { &mut self, ranges: &[hir::ClassBytesRange], ) -> ResultOrEmpty { - debug_assert!(!ranges.is_empty()); + if ranges.is_empty() { + return Err(Error::Syntax( + "empty character classes are not allowed".to_string(), + )); + } let first_split_entry = self.insts.len(); let mut holes = vec![]; @@ -513,6 +540,52 @@ impl Compiler { Ok(Some(Patch { hole, entry: self.insts.len() - 1 })) } + fn c_literal(&mut self, bytes: &[u8]) -> ResultOrEmpty { + match core::str::from_utf8(bytes) { + Ok(string) => { + let mut it = string.chars(); + let Patch { mut hole, entry } = loop { + match it.next() { + None => return self.c_empty(), + Some(ch) => { + if let Some(p) = self.c_char(ch)? { + break p; + } + } + } + }; + for ch in it { + if let Some(p) = self.c_char(ch)? { + self.fill(hole, p.entry); + hole = p.hole; + } + } + Ok(Some(Patch { hole, entry })) + } + Err(_) => { + assert!(self.compiled.uses_bytes()); + let mut it = bytes.iter().copied(); + let Patch { mut hole, entry } = loop { + match it.next() { + None => return self.c_empty(), + Some(byte) => { + if let Some(p) = self.c_byte(byte)? { + break p; + } + } + } + }; + for byte in it { + if let Some(p) = self.c_byte(byte)? { + self.fill(hole, p.entry); + hole = p.hole; + } + } + Ok(Some(Patch { hole, entry })) + } + } + } + fn c_concat<'a, I>(&mut self, exprs: I) -> ResultOrEmpty where I: IntoIterator<Item = &'a Hir>, @@ -587,19 +660,15 @@ impl Compiler { } fn c_repeat(&mut self, rep: &hir::Repetition) -> ResultOrEmpty { - use regex_syntax::hir::RepetitionKind::*; - match rep.kind { - ZeroOrOne => self.c_repeat_zero_or_one(&rep.hir, rep.greedy), - ZeroOrMore => self.c_repeat_zero_or_more(&rep.hir, rep.greedy), - OneOrMore => self.c_repeat_one_or_more(&rep.hir, rep.greedy), - Range(hir::RepetitionRange::Exactly(min_max)) => { - self.c_repeat_range(&rep.hir, rep.greedy, min_max, min_max) - } - Range(hir::RepetitionRange::AtLeast(min)) => { - self.c_repeat_range_min_or_more(&rep.hir, rep.greedy, min) + match (rep.min, rep.max) { + (0, Some(1)) => self.c_repeat_zero_or_one(&rep.sub, rep.greedy), + (0, None) => self.c_repeat_zero_or_more(&rep.sub, rep.greedy), + (1, None) => self.c_repeat_one_or_more(&rep.sub, rep.greedy), + (min, None) => { + self.c_repeat_range_min_or_more(&rep.sub, rep.greedy, min) } - Range(hir::RepetitionRange::Bounded(min, max)) => { - self.c_repeat_range(&rep.hir, rep.greedy, min, max) + (min, Some(max)) => { + self.c_repeat_range(&rep.sub, rep.greedy, min, max) } } } diff --git a/vendor/regex/src/dfa.rs b/vendor/regex/src/dfa.rs index dc9952120..78ed71021 100644 --- a/vendor/regex/src/dfa.rs +++ b/vendor/regex/src/dfa.rs @@ -1576,7 +1576,7 @@ impl<'a> Fsm<'a> { /// inputs, a new state could be created for every byte of input. (This is /// bad for memory use, so we bound it with a cache.) fn approximate_size(&self) -> usize { - self.cache.size + self.prog.approximate_size() + self.cache.size } } diff --git a/vendor/regex/src/error.rs b/vendor/regex/src/error.rs index 3e0ec7521..6c341f604 100644 --- a/vendor/regex/src/error.rs +++ b/vendor/regex/src/error.rs @@ -6,8 +6,26 @@ use std::iter::repeat; pub enum Error { /// A syntax error. Syntax(String), - /// The compiled program exceeded the set size limit. - /// The argument is the size limit imposed. + /// The compiled program exceeded the set size + /// limit. The argument is the size limit imposed by + /// [`RegexBuilder::size_limit`](crate::RegexBuilder::size_limit). Even + /// when not configured explicitly, it defaults to a reasonable limit. + /// + /// If you're getting this error, it occurred because your regex has been + /// compiled to an intermediate state that is too big. It is important to + /// note that exceeding this limit does _not_ mean the regex is too big to + /// _work_, but rather, the regex is big enough that it may wind up being + /// surprisingly slow when used in a search. In other words, this error is + /// meant to be a practical heuristic for avoiding a performance footgun, + /// and especially so for the case where the regex pattern is coming from + /// an untrusted source. + /// + /// There are generally two ways to move forward if you hit this error. + /// The first is to find some way to use a smaller regex. The second is to + /// increase the size limit via `RegexBuilder::size_limit`. However, if + /// your regex pattern is not from a trusted source, then neither of these + /// approaches may be appropriate. Instead, you'll have to determine just + /// how big of a regex you want to allow. CompiledTooBig(usize), /// Hints that destructuring should not be exhaustive. /// diff --git a/vendor/regex/src/exec.rs b/vendor/regex/src/exec.rs index b9abcdc04..ee8b589d2 100644 --- a/vendor/regex/src/exec.rs +++ b/vendor/regex/src/exec.rs @@ -4,9 +4,9 @@ use std::panic::AssertUnwindSafe; use std::sync::Arc; #[cfg(feature = "perf-literal")] -use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind}; -use regex_syntax::hir::literal::Literals; -use regex_syntax::hir::Hir; +use aho_corasick::{AhoCorasick, MatchKind}; +use regex_syntax::hir::literal; +use regex_syntax::hir::{Hir, Look}; use regex_syntax::ParserBuilder; use crate::backtrack; @@ -78,15 +78,18 @@ struct ExecReadOnly { /// not supported.) Note that this program contains an embedded `.*?` /// preceding the first capture group, unless the regex is anchored at the /// beginning. + #[allow(dead_code)] dfa: Program, /// The same as above, except the program is reversed (and there is no /// preceding `.*?`). This is used by the DFA to find the starting location /// of matches. + #[allow(dead_code)] dfa_reverse: Program, /// A set of suffix literals extracted from the regex. /// /// Prefix literals are stored on the `Program`, since they are used inside /// the matching engines. + #[allow(dead_code)] suffixes: LiteralSearcher, /// An Aho-Corasick automaton with leftmost-first match semantics. /// @@ -98,7 +101,7 @@ struct ExecReadOnly { /// if we were to exhaust the ID space, we probably would have long /// surpassed the compilation size limit. #[cfg(feature = "perf-literal")] - ac: Option<AhoCorasick<u32>>, + ac: Option<AhoCorasick>, /// match_type encodes as much upfront knowledge about how we're going to /// execute a search as possible. match_type: MatchType, @@ -121,8 +124,8 @@ pub struct ExecBuilder { /// literals. struct Parsed { exprs: Vec<Hir>, - prefixes: Literals, - suffixes: Literals, + prefixes: literal::Seq, + suffixes: literal::Seq, bytes: bool, } @@ -228,8 +231,8 @@ impl ExecBuilder { /// Parse the current set of patterns into their AST and extract literals. fn parse(&self) -> Result<Parsed, Error> { let mut exprs = Vec::with_capacity(self.options.pats.len()); - let mut prefixes = Some(Literals::empty()); - let mut suffixes = Some(Literals::empty()); + let mut prefixes = Some(literal::Seq::empty()); + let mut suffixes = Some(literal::Seq::empty()); let mut bytes = false; let is_set = self.options.pats.len() > 1; // If we're compiling a regex set and that set has any anchored @@ -243,54 +246,103 @@ impl ExecBuilder { .swap_greed(self.options.swap_greed) .ignore_whitespace(self.options.ignore_whitespace) .unicode(self.options.unicode) - .allow_invalid_utf8(!self.only_utf8) + .utf8(self.only_utf8) .nest_limit(self.options.nest_limit) .build(); let expr = parser.parse(pat).map_err(|e| Error::Syntax(e.to_string()))?; - bytes = bytes || !expr.is_always_utf8(); + let props = expr.properties(); + // This used to just check whether the HIR matched valid UTF-8 + // or not, but in regex-syntax 0.7, we changed our definition of + // "matches valid UTF-8" to exclude zero-width matches. And in + // particular, previously, we considered WordAsciiNegate (that + // is '(?-u:\B)') to be capable of matching invalid UTF-8. Our + // matcher engines were built under this assumption and fixing + // them is not worth it with the imminent plan to switch over to + // regex-automata. So for now, we retain the previous behavior by + // just explicitly treating the presence of a negated ASCII word + // boundary as forcing use to use a byte oriented automaton. + bytes = bytes + || !props.is_utf8() + || props.look_set().contains(Look::WordAsciiNegate); if cfg!(feature = "perf-literal") { - if !expr.is_anchored_start() && expr.is_any_anchored_start() { + if !props.look_set_prefix().contains(Look::Start) + && props.look_set().contains(Look::Start) + { // Partial anchors unfortunately make it hard to use // prefixes, so disable them. prefixes = None; - } else if is_set && expr.is_anchored_start() { + } else if is_set + && props.look_set_prefix_any().contains(Look::Start) + { // Regex sets with anchors do not go well with literal // optimizations. prefixes = None; + } else if props.look_set_prefix_any().contains_word() { + // The new literal extractor ignores look-around while + // the old one refused to extract prefixes from regexes + // that began with a \b. These old creaky regex internals + // can't deal with it, so we drop it. + prefixes = None; + } else if props.look_set_prefix_any().contains(Look::StartLF) { + // Similar to the reasoning for word boundaries, this old + // regex engine can't handle literal prefixes with '(?m:^)' + // at the beginning of a regex. + prefixes = None; } - prefixes = prefixes.and_then(|mut prefixes| { - if !prefixes.union_prefixes(&expr) { - None - } else { - Some(prefixes) - } - }); - if !expr.is_anchored_end() && expr.is_any_anchored_end() { + if !props.look_set_suffix().contains(Look::End) + && props.look_set().contains(Look::End) + { // Partial anchors unfortunately make it hard to use // suffixes, so disable them. suffixes = None; - } else if is_set && expr.is_anchored_end() { + } else if is_set + && props.look_set_suffix_any().contains(Look::End) + { // Regex sets with anchors do not go well with literal // optimizations. suffixes = None; + } else if props.look_set_suffix_any().contains_word() { + // See the prefix case for reasoning here. + suffixes = None; + } else if props.look_set_suffix_any().contains(Look::EndLF) { + // See the prefix case for reasoning here. + suffixes = None; } - suffixes = suffixes.and_then(|mut suffixes| { - if !suffixes.union_suffixes(&expr) { - None + + let (mut pres, mut suffs) = + if prefixes.is_none() && suffixes.is_none() { + (literal::Seq::infinite(), literal::Seq::infinite()) } else { - Some(suffixes) - } + literal_analysis(&expr) + }; + // These old creaky regex internals can't handle cases where + // the literal sequences are exact but there are look-around + // assertions. So we make sure the sequences are inexact if + // there are look-around assertions anywhere. This forces the + // regex engines to run instead of assuming that a literal + // match implies an overall match. + if !props.look_set().is_empty() { + pres.make_inexact(); + suffs.make_inexact(); + } + prefixes = prefixes.and_then(|mut prefixes| { + prefixes.union(&mut pres); + Some(prefixes) + }); + suffixes = suffixes.and_then(|mut suffixes| { + suffixes.union(&mut suffs); + Some(suffixes) }); } exprs.push(expr); } Ok(Parsed { exprs, - prefixes: prefixes.unwrap_or_else(Literals::empty), - suffixes: suffixes.unwrap_or_else(Literals::empty), + prefixes: prefixes.unwrap_or_else(literal::Seq::empty), + suffixes: suffixes.unwrap_or_else(literal::Seq::empty), bytes, }) } @@ -356,7 +408,7 @@ impl ExecBuilder { } #[cfg(feature = "perf-literal")] - fn build_aho_corasick(&self, parsed: &Parsed) -> Option<AhoCorasick<u32>> { + fn build_aho_corasick(&self, parsed: &Parsed) -> Option<AhoCorasick> { if parsed.exprs.len() != 1 { return None; } @@ -370,10 +422,9 @@ impl ExecBuilder { return None; } Some( - AhoCorasickBuilder::new() + AhoCorasick::builder() .match_kind(MatchKind::LeftmostFirst) - .auto_configure(&lits) - .build_with_size::<u32, _, _>(&lits) + .build(&lits) // This should never happen because we'd long exceed the // compilation limit for regexes first. .expect("AC automaton too big"), @@ -1311,6 +1362,12 @@ impl Exec { pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> { &self.ro.nfa.capture_name_idx } + + /// If the number of capture groups in every match is always the same, then + /// return that number. Otherwise return `None`. + pub fn static_captures_len(&self) -> Option<usize> { + self.ro.nfa.static_captures_len + } } impl Clone for Exec { @@ -1382,7 +1439,18 @@ impl ExecReadOnly { // This case shouldn't happen. When the regex isn't // anchored, then complete prefixes should imply complete // suffixes. - Some(MatchType::Literal(MatchLiteralType::Unanchored)) + // + // The above is wrong! This case can happen. While + // complete prefixes should imply complete suffixes + // here, that doesn't necessarily mean we have a useful + // prefix matcher! It could be the case that the literal + // searcher decided the prefixes---even though they are + // "complete"---weren't good enough and thus created an + // empty matcher. If that happens and we return Unanchored + // here, then we'll end up using that matcher, which is + // very bad because it matches at every position. So... + // return None. + None }; } None @@ -1557,7 +1625,7 @@ fn alternation_literals(expr: &Hir) -> Option<Vec<Vec<u8>>> { // optimization pipeline, because this is a terribly inflexible way to go // about things. - if !expr.is_alternation_literal() { + if !expr.properties().is_alternation_literal() { return None; } let alts = match *expr.kind() { @@ -1565,25 +1633,19 @@ fn alternation_literals(expr: &Hir) -> Option<Vec<Vec<u8>>> { _ => return None, // one literal isn't worth it }; - let extendlit = |lit: &Literal, dst: &mut Vec<u8>| match *lit { - Literal::Unicode(c) => { - let mut buf = [0; 4]; - dst.extend_from_slice(c.encode_utf8(&mut buf).as_bytes()); - } - Literal::Byte(b) => { - dst.push(b); - } - }; - let mut lits = vec![]; for alt in alts { let mut lit = vec![]; match *alt.kind() { - HirKind::Literal(ref x) => extendlit(x, &mut lit), + HirKind::Literal(Literal(ref bytes)) => { + lit.extend_from_slice(bytes) + } HirKind::Concat(ref exprs) => { for e in exprs { match *e.kind() { - HirKind::Literal(ref x) => extendlit(x, &mut lit), + HirKind::Literal(Literal(ref bytes)) => { + lit.extend_from_slice(bytes); + } _ => unreachable!("expected literal, got {:?}", e), } } @@ -1595,6 +1657,48 @@ fn alternation_literals(expr: &Hir) -> Option<Vec<Vec<u8>>> { Some(lits) } +#[cfg(not(feature = "perf-literal"))] +fn literal_analysis(_: &Hir) -> (literal::Seq, literal::Seq) { + (literal::Seq::infinite(), literal::Seq::infinite()) +} + +#[cfg(feature = "perf-literal")] +fn literal_analysis(expr: &Hir) -> (literal::Seq, literal::Seq) { + const ATTEMPTS: [(usize, usize); 3] = [(5, 50), (4, 30), (3, 20)]; + + let mut prefixes = literal::Extractor::new() + .kind(literal::ExtractKind::Prefix) + .extract(expr); + for (keep, limit) in ATTEMPTS { + let len = match prefixes.len() { + None => break, + Some(len) => len, + }; + if len <= limit { + break; + } + prefixes.keep_first_bytes(keep); + prefixes.minimize_by_preference(); + } + + let mut suffixes = literal::Extractor::new() + .kind(literal::ExtractKind::Suffix) + .extract(expr); + for (keep, limit) in ATTEMPTS { + let len = match suffixes.len() { + None => break, + Some(len) => len, + }; + if len <= limit { + break; + } + suffixes.keep_last_bytes(keep); + suffixes.minimize_by_preference(); + } + + (prefixes, suffixes) +} + #[cfg(test)] mod test { #[test] diff --git a/vendor/regex/src/expand.rs b/vendor/regex/src/expand.rs index 67b514926..98fafc949 100644 --- a/vendor/regex/src/expand.rs +++ b/vendor/regex/src/expand.rs @@ -182,7 +182,8 @@ fn find_cap_ref_braced(rep: &[u8], mut i: usize) -> Option<CaptureRef<'_>> { }) } -/// Returns true if and only if the given byte is allowed in a capture name. +/// Returns true if and only if the given byte is allowed in a capture name +/// written in non-brace form. fn is_valid_cap_letter(b: u8) -> bool { match b { b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' | b'_' => true, @@ -236,4 +237,11 @@ mod tests { find!(find_cap_ref17, "$x_$y", c!("x_", 3)); find!(find_cap_ref18, "${#}", c!("#", 4)); find!(find_cap_ref19, "${Z[}", c!("Z[", 5)); + find!(find_cap_ref20, "${¾}", c!("¾", 5)); + find!(find_cap_ref21, "${¾a}", c!("¾a", 6)); + find!(find_cap_ref22, "${a¾}", c!("a¾", 6)); + find!(find_cap_ref23, "${☃}", c!("☃", 6)); + find!(find_cap_ref24, "${a☃}", c!("a☃", 7)); + find!(find_cap_ref25, "${☃a}", c!("☃a", 7)); + find!(find_cap_ref26, "${名字}", c!("名字", 9)); } diff --git a/vendor/regex/src/lib.rs b/vendor/regex/src/lib.rs index 6b95739c5..82c1b77ad 100644 --- a/vendor/regex/src/lib.rs +++ b/vendor/regex/src/lib.rs @@ -199,6 +199,8 @@ instead.) This implementation executes regular expressions **only** on valid UTF-8 while exposing match locations as byte indices into the search string. (To relax this restriction, use the [`bytes`](bytes/index.html) sub-module.) +Conceptually, the regex engine works by matching a haystack as if it were a +sequence of Unicode scalar values. Only simple case folding is supported. Namely, when matching case-insensitively, the characters are first mapped using the "simple" case @@ -285,9 +287,9 @@ a separate crate, [`regex-syntax`](https://docs.rs/regex-syntax). . any character except new line (includes new line with s flag) \d digit (\p{Nd}) \D not digit -\pN One-letter name Unicode character class +\pX Unicode character class identified by a one-letter name \p{Greek} Unicode character class (general category or script) -\PN Negated one-letter name Unicode character class +\PX Negated Unicode character class identified by a one-letter name \P{Greek} negated Unicode character class (general category or script) </pre> @@ -325,6 +327,25 @@ xy concatenation (x followed by y) x|y alternation (x or y, prefer x) </pre> +This example shows how an alternation works, and what it means to prefer a +branch in the alternation over subsequent branches. + +``` +use regex::Regex; + +let haystack = "samwise"; +// If 'samwise' comes first in our alternation, then it is +// preferred as a match, even if the regex engine could +// technically detect that 'sam' led to a match earlier. +let re = Regex::new(r"samwise|sam").unwrap(); +assert_eq!("samwise", re.find(haystack).unwrap().as_str()); +// But if 'sam' comes first, then it will match instead. +// In this case, it is impossible for 'samwise' to match +// because 'sam' is a prefix of it. +let re = Regex::new(r"sam|samwise").unwrap(); +assert_eq!("sam", re.find(haystack).unwrap().as_str()); +``` + ## Repetitions <pre class="rust"> @@ -360,12 +381,19 @@ regex matches `abc` at positions `0`, `1`, `2` and `3`. <pre class="rust"> (exp) numbered capture group (indexed by opening parenthesis) -(?P<name>exp) named (also numbered) capture group (allowed chars: [_0-9a-zA-Z.\[\]]) +(?P<name>exp) named (also numbered) capture group (names must be alpha-numeric) +(?<name>exp) named (also numbered) capture group (names must be alpha-numeric) (?:exp) non-capturing group (?flags) set flags within current group (?flags:exp) set flags for exp (non-capturing) </pre> +Capture group names must be any sequence of alpha-numeric Unicode codepoints, +in addition to `.`, `_`, `[` and `]`. Names must start with either an `_` or +an alphabetic codepoint. Alphabetic codepoints correspond to the `Alphabetic` +Unicode property, while numeric codepoints correspond to the union of the +`Decimal_Number`, `Letter_Number` and `Other_Number` general categories. + Flags are each a single character. For example, `(?x)` sets the flag `x` and `(?-x)` clears the flag `x`. Multiple flags can be set or cleared at the same time: `(?xy)` sets both the `x` and `y` flags and `(?x-y)` sets @@ -379,9 +407,13 @@ m multi-line mode: ^ and $ match begin/end of line s allow . to match \n U swap the meaning of x* and x*? u Unicode support (enabled by default) -x ignore whitespace and allow line comments (starting with `#`) +x verbose mode, ignores whitespace and allow line comments (starting with `#`) </pre> +Note that in verbose mode, whitespace is ignored everywhere, including within +character classes. To insert whitespace, use its escaped form or a hex literal. +For example, `\ ` or `\x20` for an ASCII space. + Flags can be toggled within a pattern. Here's an example that matches case-insensitively for the first part but case-sensitively for the second part: diff --git a/vendor/regex/src/literal/imp.rs b/vendor/regex/src/literal/imp.rs index 90b2f1160..75fa6e37b 100644 --- a/vendor/regex/src/literal/imp.rs +++ b/vendor/regex/src/literal/imp.rs @@ -1,8 +1,8 @@ use std::mem; -use aho_corasick::{self, packed, AhoCorasick, AhoCorasickBuilder}; +use aho_corasick::{self, packed, AhoCorasick}; use memchr::{memchr, memchr2, memchr3, memmem}; -use regex_syntax::hir::literal::{Literal, Literals}; +use regex_syntax::hir::literal::{Literal, Seq}; /// A prefix extracted from a compiled regular expression. /// @@ -26,7 +26,7 @@ enum Matcher { /// A single substring, using vector accelerated routines when available. Memmem(Memmem), /// An Aho-Corasick automaton. - AC { ac: AhoCorasick<u32>, lits: Vec<Literal> }, + AC { ac: AhoCorasick, lits: Vec<Literal> }, /// A packed multiple substring searcher, using SIMD. /// /// Note that Aho-Corasick will actually use this packed searcher @@ -39,27 +39,26 @@ enum Matcher { impl LiteralSearcher { /// Returns a matcher that never matches and never advances the input. pub fn empty() -> Self { - Self::new(Literals::empty(), Matcher::Empty) + Self::new(Seq::infinite(), Matcher::Empty) } /// Returns a matcher for literal prefixes from the given set. - pub fn prefixes(lits: Literals) -> Self { + pub fn prefixes(lits: Seq) -> Self { let matcher = Matcher::prefixes(&lits); Self::new(lits, matcher) } /// Returns a matcher for literal suffixes from the given set. - pub fn suffixes(lits: Literals) -> Self { + pub fn suffixes(lits: Seq) -> Self { let matcher = Matcher::suffixes(&lits); Self::new(lits, matcher) } - fn new(lits: Literals, matcher: Matcher) -> Self { - let complete = lits.all_complete(); + fn new(lits: Seq, matcher: Matcher) -> Self { LiteralSearcher { - complete, - lcp: Memmem::new(lits.longest_common_prefix()), - lcs: Memmem::new(lits.longest_common_suffix()), + complete: lits.is_exact(), + lcp: Memmem::new(lits.longest_common_prefix().unwrap_or(b"")), + lcs: Memmem::new(lits.longest_common_suffix().unwrap_or(b"")), matcher, } } @@ -150,7 +149,7 @@ impl LiteralSearcher { Empty => 0, Bytes(ref sset) => sset.dense.len(), Memmem(_) => 1, - AC { ref ac, .. } => ac.pattern_count(), + AC { ref ac, .. } => ac.patterns_len(), Packed { ref lits, .. } => lits.len(), } } @@ -162,27 +161,31 @@ impl LiteralSearcher { Empty => 0, Bytes(ref sset) => sset.approximate_size(), Memmem(ref single) => single.approximate_size(), - AC { ref ac, .. } => ac.heap_bytes(), - Packed { ref s, .. } => s.heap_bytes(), + AC { ref ac, .. } => ac.memory_usage(), + Packed { ref s, .. } => s.memory_usage(), } } } impl Matcher { - fn prefixes(lits: &Literals) -> Self { + fn prefixes(lits: &Seq) -> Self { let sset = SingleByteSet::prefixes(lits); Matcher::new(lits, sset) } - fn suffixes(lits: &Literals) -> Self { + fn suffixes(lits: &Seq) -> Self { let sset = SingleByteSet::suffixes(lits); Matcher::new(lits, sset) } - fn new(lits: &Literals, sset: SingleByteSet) -> Self { - if lits.literals().is_empty() { + fn new(lits: &Seq, sset: SingleByteSet) -> Self { + if lits.is_empty() || lits.min_literal_len() == Some(0) { return Matcher::Empty; } + let lits = match lits.literals() { + None => return Matcher::Empty, + Some(members) => members, + }; if sset.dense.len() >= 26 { // Avoid trying to match a large number of single bytes. // This is *very* sensitive to a frequency analysis comparison @@ -195,26 +198,26 @@ impl Matcher { if sset.complete { return Matcher::Bytes(sset); } - if lits.literals().len() == 1 { - return Matcher::Memmem(Memmem::new(&lits.literals()[0])); + if lits.len() == 1 { + return Matcher::Memmem(Memmem::new(lits[0].as_bytes())); } - let pats = lits.literals().to_owned(); + let pats: Vec<&[u8]> = lits.iter().map(|lit| lit.as_bytes()).collect(); let is_aho_corasick_fast = sset.dense.len() <= 1 && sset.all_ascii; - if lits.literals().len() <= 100 && !is_aho_corasick_fast { + if lits.len() <= 100 && !is_aho_corasick_fast { let mut builder = packed::Config::new() .match_kind(packed::MatchKind::LeftmostFirst) .builder(); if let Some(s) = builder.extend(&pats).build() { - return Matcher::Packed { s, lits: pats }; + return Matcher::Packed { s, lits: lits.to_owned() }; } } - let ac = AhoCorasickBuilder::new() + let ac = AhoCorasick::builder() .match_kind(aho_corasick::MatchKind::LeftmostFirst) - .dfa(true) - .build_with_size::<u32, _, _>(&pats) + .kind(Some(aho_corasick::AhoCorasickKind::DFA)) + .build(&pats) .unwrap(); - Matcher::AC { ac, lits: pats } + Matcher::AC { ac, lits: lits.to_owned() } } } @@ -257,7 +260,7 @@ impl<'a> Iterator for LiteralIter<'a> { } else { let next = &lits[0]; *lits = &lits[1..]; - Some(&**next) + Some(next.as_bytes()) } } LiteralIter::Packed(ref mut lits) => { @@ -266,7 +269,7 @@ impl<'a> Iterator for LiteralIter<'a> { } else { let next = &lits[0]; *lits = &lits[1..]; - Some(&**next) + Some(next.as_bytes()) } } } @@ -291,11 +294,15 @@ impl SingleByteSet { } } - fn prefixes(lits: &Literals) -> SingleByteSet { + fn prefixes(lits: &Seq) -> SingleByteSet { let mut sset = SingleByteSet::new(); - for lit in lits.literals() { + let lits = match lits.literals() { + None => return sset, + Some(lits) => lits, + }; + for lit in lits.iter() { sset.complete = sset.complete && lit.len() == 1; - if let Some(&b) = lit.get(0) { + if let Some(&b) = lit.as_bytes().get(0) { if !sset.sparse[b as usize] { if b > 0x7F { sset.all_ascii = false; @@ -308,11 +315,15 @@ impl SingleByteSet { sset } - fn suffixes(lits: &Literals) -> SingleByteSet { + fn suffixes(lits: &Seq) -> SingleByteSet { let mut sset = SingleByteSet::new(); - for lit in lits.literals() { + let lits = match lits.literals() { + None => return sset, + Some(lits) => lits, + }; + for lit in lits.iter() { sset.complete = sset.complete && lit.len() == 1; - if let Some(&b) = lit.get(lit.len().checked_sub(1).unwrap()) { + if let Some(&b) = lit.as_bytes().last() { if !sset.sparse[b as usize] { if b > 0x7F { sset.all_ascii = false; diff --git a/vendor/regex/src/literal/mod.rs b/vendor/regex/src/literal/mod.rs index 980f52330..b9fb77aed 100644 --- a/vendor/regex/src/literal/mod.rs +++ b/vendor/regex/src/literal/mod.rs @@ -6,7 +6,7 @@ mod imp; #[allow(missing_docs)] #[cfg(not(feature = "perf-literal"))] mod imp { - use regex_syntax::hir::literal::Literals; + use regex_syntax::hir::literal::Seq; #[derive(Clone, Debug)] pub struct LiteralSearcher(()); @@ -16,11 +16,11 @@ mod imp { LiteralSearcher(()) } - pub fn prefixes(_: Literals) -> Self { + pub fn prefixes(_: Seq) -> Self { LiteralSearcher(()) } - pub fn suffixes(_: Literals) -> Self { + pub fn suffixes(_: Seq) -> Self { LiteralSearcher(()) } diff --git a/vendor/regex/src/prog.rs b/vendor/regex/src/prog.rs index c211f71d8..100862cf1 100644 --- a/vendor/regex/src/prog.rs +++ b/vendor/regex/src/prog.rs @@ -27,6 +27,9 @@ pub struct Program { pub captures: Vec<Option<String>>, /// Pointers to all named capture groups into `captures`. pub capture_name_idx: Arc<HashMap<String, usize>>, + /// If the number of capture groups is the same for all possible matches, + /// then this is that number. + pub static_captures_len: Option<usize>, /// A pointer to the start instruction. This can vary depending on how /// the program was compiled. For example, programs for use with the DFA /// engine have a `.*?` inserted at the beginning of unanchored regular @@ -83,6 +86,7 @@ impl Program { matches: vec![], captures: vec![], capture_name_idx: Arc::new(HashMap::new()), + static_captures_len: None, start: 0, byte_classes: vec![0; 256], only_utf8: true, diff --git a/vendor/regex/src/re_bytes.rs b/vendor/regex/src/re_bytes.rs index 07e9f98ac..e3a3b019b 100644 --- a/vendor/regex/src/re_bytes.rs +++ b/vendor/regex/src/re_bytes.rs @@ -17,7 +17,7 @@ use crate::re_trait::{self, RegularExpression, SubCapturesPosIter}; /// Match represents a single match of a regex in a haystack. /// /// The lifetime parameter `'t` refers to the lifetime of the matched text. -#[derive(Copy, Clone, Debug, Eq, PartialEq)] +#[derive(Copy, Clone, Eq, PartialEq)] pub struct Match<'t> { text: &'t [u8], start: usize, @@ -37,6 +37,18 @@ impl<'t> Match<'t> { self.end } + /// Returns true if and only if this match has a length of zero. + #[inline] + pub fn is_empty(&self) -> bool { + self.start == self.end + } + + /// Returns the length, in bytes, of this match. + #[inline] + pub fn len(&self) -> usize { + self.end - self.start + } + /// Returns the range over the starting and ending byte offsets of the /// match in the haystack. #[inline] @@ -57,6 +69,24 @@ impl<'t> Match<'t> { } } +impl<'t> std::fmt::Debug for Match<'t> { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + let mut fmt = f.debug_struct("Match"); + fmt.field("start", &self.start).field("end", &self.end); + if let Ok(s) = std::str::from_utf8(self.as_bytes()) { + fmt.field("bytes", &s); + } else { + // FIXME: It would be nice if this could be printed as a string + // with invalid UTF-8 replaced with hex escapes. A alloc would + // probably okay if that makes it easier, but regex-automata does + // (at time of writing) have internal routines that do this. So + // maybe we should expose them. + fmt.field("bytes", &self.as_bytes()); + } + fmt.finish() + } +} + impl<'t> From<Match<'t>> for Range<usize> { fn from(m: Match<'t>) -> Range<usize> { m.range() @@ -253,12 +283,7 @@ impl Regex { /// The `0`th capture group is always unnamed, so it must always be /// accessed with `get(0)` or `[0]`. pub fn captures<'t>(&self, text: &'t [u8]) -> Option<Captures<'t>> { - let mut locs = self.capture_locations(); - self.captures_read_at(&mut locs, text, 0).map(move |_| Captures { - text, - locs: locs.0, - named_groups: self.0.capture_name_idx().clone(), - }) + self.captures_at(text, 0) } /// Returns an iterator over all the non-overlapping capture groups matched @@ -537,7 +562,14 @@ impl Regex { /// This method may have the same performance characteristics as /// `is_match`, except it provides an end location for a match. In /// particular, the location returned *may be shorter* than the proper end - /// of the leftmost-first match. + /// of the leftmost-first match that you would find via `Regex::find`. + /// + /// Note that it is not guaranteed that this routine finds the shortest or + /// "earliest" possible match. Instead, the main idea of this API is that + /// it returns the offset at the point at which the internal regex engine + /// has determined that a match has occurred. This may vary depending on + /// which internal regex engine is used, and thus, the offset itself may + /// change. /// /// # Example /// @@ -598,6 +630,25 @@ impl Regex { .map(|(s, e)| Match::new(text, s, e)) } + /// Returns the same as [`Regex::captures`], but starts the search at the + /// given offset. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, the `\A` anchor can only + /// match when `start == 0`. + pub fn captures_at<'t>( + &self, + text: &'t [u8], + start: usize, + ) -> Option<Captures<'t>> { + let mut locs = self.capture_locations(); + self.captures_read_at(&mut locs, text, start).map(move |_| Captures { + text, + locs: locs.0, + named_groups: self.0.capture_name_idx().clone(), + }) + } + /// This is like `captures`, but uses /// [`CaptureLocations`](struct.CaptureLocations.html) /// instead of @@ -667,6 +718,46 @@ impl Regex { self.0.capture_names().len() } + /// Returns the total number of capturing groups that appear in every + /// possible match. + /// + /// If the number of capture groups can vary depending on the match, then + /// this returns `None`. That is, a value is only returned when the number + /// of matching groups is invariant or "static." + /// + /// Note that like [`Regex::captures_len`], this **does** include the + /// implicit capturing group corresponding to the entire match. Therefore, + /// when a non-None value is returned, it is guaranteed to be at least `1`. + /// Stated differently, a return value of `Some(0)` is impossible. + /// + /// # Example + /// + /// This shows a few cases where a static number of capture groups is + /// available and a few cases where it is not. + /// + /// ``` + /// use regex::bytes::Regex; + /// + /// let len = |pattern| { + /// Regex::new(pattern).map(|re| re.static_captures_len()) + /// }; + /// + /// assert_eq!(Some(1), len("a")?); + /// assert_eq!(Some(2), len("(a)")?); + /// assert_eq!(Some(2), len("(a)|(b)")?); + /// assert_eq!(Some(3), len("(a)(b)|(c)(d)")?); + /// assert_eq!(None, len("(a)|b")?); + /// assert_eq!(None, len("a|(b)")?); + /// assert_eq!(None, len("(b)*")?); + /// assert_eq!(Some(2), len("(b)+")?); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn static_captures_len(&self) -> Option<usize> { + self.0.static_captures_len().map(|len| len.saturating_add(1)) + } + /// Returns an empty set of capture locations that can be reused in /// multiple calls to `captures_read` or `captures_read_at`. pub fn capture_locations(&self) -> CaptureLocations { @@ -856,6 +947,27 @@ impl<'r> FusedIterator for CaptureNames<'r> {} /// In order to build a value of this type, you'll need to call the /// `capture_locations` method on the `Regex` being used to execute the search. /// The value returned can then be reused in subsequent searches. +/// +/// # Example +/// +/// This example shows how to create and use `CaptureLocations` in a search. +/// +/// ``` +/// use regex::bytes::Regex; +/// +/// let re = Regex::new(r"(?<first>\w+)\s+(?<last>\w+)").unwrap(); +/// let mut locs = re.capture_locations(); +/// let m = re.captures_read(&mut locs, b"Bruce Springsteen").unwrap(); +/// assert_eq!(0..17, m.range()); +/// assert_eq!(Some((0, 17)), locs.get(0)); +/// assert_eq!(Some((0, 5)), locs.get(1)); +/// assert_eq!(Some((6, 17)), locs.get(2)); +/// +/// // Asking for an invalid capture group always returns None. +/// assert_eq!(None, locs.get(3)); +/// assert_eq!(None, locs.get(34973498648)); +/// assert_eq!(None, locs.get(9944060567225171988)); +/// ``` #[derive(Clone, Debug)] pub struct CaptureLocations(re_trait::Locations); diff --git a/vendor/regex/src/re_set.rs b/vendor/regex/src/re_set.rs index a6d886d76..7c8253f0c 100644 --- a/vendor/regex/src/re_set.rs +++ b/vendor/regex/src/re_set.rs @@ -289,6 +289,12 @@ impl RegexSet { } } +impl Default for RegexSet { + fn default() -> Self { + RegexSet::empty() + } +} + /// A set of matches returned by a regex set. #[derive(Clone, Debug)] pub struct SetMatches { @@ -315,6 +321,11 @@ impl SetMatches { } /// The total number of regexes in the set that created these matches. + /// + /// **WARNING:** This always returns the same value as [`RegexSet::len`]. + /// In particular, it does *not* return the number of elements yielded by + /// [`SetMatches::iter`]. The only way to determine the total number of + /// matched regexes is to iterate over them. pub fn len(&self) -> usize { self.matches.len() } diff --git a/vendor/regex/src/re_trait.rs b/vendor/regex/src/re_trait.rs index d0c717df5..505810c84 100644 --- a/vendor/regex/src/re_trait.rs +++ b/vendor/regex/src/re_trait.rs @@ -20,7 +20,7 @@ impl Locations { /// not match anything. The positions returned are *always* byte indices /// with respect to the original string matched. pub fn pos(&self, i: usize) -> Option<(usize, usize)> { - let (s, e) = (i * 2, i * 2 + 1); + let (s, e) = (i.checked_mul(2)?, i.checked_mul(2)?.checked_add(1)?); match (self.0.get(s), self.0.get(e)) { (Some(&Some(s)), Some(&Some(e))) => Some((s, e)), _ => None, diff --git a/vendor/regex/src/re_unicode.rs b/vendor/regex/src/re_unicode.rs index 197510ea0..57689086d 100644 --- a/vendor/regex/src/re_unicode.rs +++ b/vendor/regex/src/re_unicode.rs @@ -25,7 +25,7 @@ pub fn escape(text: &str) -> String { /// Match represents a single match of a regex in a haystack. /// /// The lifetime parameter `'t` refers to the lifetime of the matched text. -#[derive(Copy, Clone, Debug, Eq, PartialEq)] +#[derive(Copy, Clone, Eq, PartialEq)] pub struct Match<'t> { text: &'t str, start: usize, @@ -45,6 +45,18 @@ impl<'t> Match<'t> { self.end } + /// Returns true if and only if this match has a length of zero. + #[inline] + pub fn is_empty(&self) -> bool { + self.start == self.end + } + + /// Returns the length, in bytes, of this match. + #[inline] + pub fn len(&self) -> usize { + self.end - self.start + } + /// Returns the range over the starting and ending byte offsets of the /// match in the haystack. #[inline] @@ -65,6 +77,16 @@ impl<'t> Match<'t> { } } +impl<'t> std::fmt::Debug for Match<'t> { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + f.debug_struct("Match") + .field("start", &self.start) + .field("end", &self.end) + .field("string", &self.as_str()) + .finish() + } +} + impl<'t> From<Match<'t>> for &'t str { fn from(m: Match<'t>) -> &'t str { m.as_str() @@ -309,12 +331,7 @@ impl Regex { /// The `0`th capture group is always unnamed, so it must always be /// accessed with `get(0)` or `[0]`. pub fn captures<'t>(&self, text: &'t str) -> Option<Captures<'t>> { - let mut locs = self.capture_locations(); - self.captures_read_at(&mut locs, text, 0).map(move |_| Captures { - text, - locs: locs.0, - named_groups: self.0.capture_name_idx().clone(), - }) + self.captures_at(text, 0) } /// Returns an iterator over all the non-overlapping capture groups matched @@ -595,7 +612,14 @@ impl Regex { /// This method may have the same performance characteristics as /// `is_match`, except it provides an end location for a match. In /// particular, the location returned *may be shorter* than the proper end - /// of the leftmost-first match. + /// of the leftmost-first match that you would find via `Regex::find`. + /// + /// Note that it is not guaranteed that this routine finds the shortest or + /// "earliest" possible match. Instead, the main idea of this API is that + /// it returns the offset at the point at which the internal regex engine + /// has determined that a match has occurred. This may vary depending on + /// which internal regex engine is used, and thus, the offset itself may + /// change. /// /// # Example /// @@ -615,12 +639,12 @@ impl Regex { self.shortest_match_at(text, 0) } - /// Returns the same as shortest_match, but starts the search at the given - /// offset. + /// Returns the same as `shortest_match`, but starts the search at the + /// given offset. /// /// The significance of the starting point is that it takes the surrounding - /// context into consideration. For example, the `\A` anchor can only - /// match when `start == 0`. + /// context into consideration. For example, the `\A` anchor can only match + /// when `start == 0`. pub fn shortest_match_at( &self, text: &str, @@ -656,6 +680,25 @@ impl Regex { .map(|(s, e)| Match::new(text, s, e)) } + /// Returns the same as [`Regex::captures`], but starts the search at the + /// given offset. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, the `\A` anchor can only + /// match when `start == 0`. + pub fn captures_at<'t>( + &self, + text: &'t str, + start: usize, + ) -> Option<Captures<'t>> { + let mut locs = self.capture_locations(); + self.captures_read_at(&mut locs, text, start).map(move |_| Captures { + text, + locs: locs.0, + named_groups: self.0.capture_name_idx().clone(), + }) + } + /// This is like `captures`, but uses /// [`CaptureLocations`](struct.CaptureLocations.html) /// instead of @@ -725,6 +768,46 @@ impl Regex { self.0.capture_names().len() } + /// Returns the total number of capturing groups that appear in every + /// possible match. + /// + /// If the number of capture groups can vary depending on the match, then + /// this returns `None`. That is, a value is only returned when the number + /// of matching groups is invariant or "static." + /// + /// Note that like [`Regex::captures_len`], this **does** include the + /// implicit capturing group corresponding to the entire match. Therefore, + /// when a non-None value is returned, it is guaranteed to be at least `1`. + /// Stated differently, a return value of `Some(0)` is impossible. + /// + /// # Example + /// + /// This shows a few cases where a static number of capture groups is + /// available and a few cases where it is not. + /// + /// ``` + /// use regex::Regex; + /// + /// let len = |pattern| { + /// Regex::new(pattern).map(|re| re.static_captures_len()) + /// }; + /// + /// assert_eq!(Some(1), len("a")?); + /// assert_eq!(Some(2), len("(a)")?); + /// assert_eq!(Some(2), len("(a)|(b)")?); + /// assert_eq!(Some(3), len("(a)(b)|(c)(d)")?); + /// assert_eq!(None, len("(a)|b")?); + /// assert_eq!(None, len("a|(b)")?); + /// assert_eq!(None, len("(b)*")?); + /// assert_eq!(Some(2), len("(b)+")?); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn static_captures_len(&self) -> Option<usize> { + self.0.static_captures_len().map(|len| len.saturating_add(1)) + } + /// Returns an empty set of capture locations that can be reused in /// multiple calls to `captures_read` or `captures_read_at`. pub fn capture_locations(&self) -> CaptureLocations { @@ -866,6 +949,27 @@ impl<'r, 't> FusedIterator for SplitN<'r, 't> {} /// In order to build a value of this type, you'll need to call the /// `capture_locations` method on the `Regex` being used to execute the search. /// The value returned can then be reused in subsequent searches. +/// +/// # Example +/// +/// This example shows how to create and use `CaptureLocations` in a search. +/// +/// ``` +/// use regex::Regex; +/// +/// let re = Regex::new(r"(?<first>\w+)\s+(?<last>\w+)").unwrap(); +/// let mut locs = re.capture_locations(); +/// let m = re.captures_read(&mut locs, "Bruce Springsteen").unwrap(); +/// assert_eq!(0..17, m.range()); +/// assert_eq!(Some((0, 17)), locs.get(0)); +/// assert_eq!(Some((0, 5)), locs.get(1)); +/// assert_eq!(Some((6, 17)), locs.get(2)); +/// +/// // Asking for an invalid capture group always returns None. +/// assert_eq!(None, locs.get(3)); +/// assert_eq!(None, locs.get(34973498648)); +/// assert_eq!(None, locs.get(9944060567225171988)); +/// ``` #[derive(Clone, Debug)] pub struct CaptureLocations(re_trait::Locations); |