author      Daniel Baumann <daniel.baumann@progress-linux.org>   2024-06-19 09:26:03 +0000
committer   Daniel Baumann <daniel.baumann@progress-linux.org>   2024-06-19 09:26:03 +0000
commit      9918693037dce8aa4bb6f08741b6812923486c18 (patch)
tree        21d2b40bec7e6a7ea664acee056eb3d08e15a1cf /vendor/regex-syntax
parent      Releasing progress-linux version 1.75.0+dfsg1-5~progress7.99u1. (diff)
download    rustc-9918693037dce8aa4bb6f08741b6812923486c18.tar.xz
            rustc-9918693037dce8aa4bb6f08741b6812923486c18.zip
Merging upstream version 1.76.0+dfsg1.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'vendor/regex-syntax')
-rw-r--r--   vendor/regex-syntax/.cargo-checksum.json  |   2
-rw-r--r--   vendor/regex-syntax/Cargo.toml            |  18
-rw-r--r--   vendor/regex-syntax/README.md             |   1
-rw-r--r--   vendor/regex-syntax/src/ast/mod.rs        | 440
-rw-r--r--   vendor/regex-syntax/src/ast/parse.rs      | 576
-rw-r--r--   vendor/regex-syntax/src/ast/print.rs      |  20
-rw-r--r--   vendor/regex-syntax/src/ast/visitor.rs    |  17
-rw-r--r--   vendor/regex-syntax/src/hir/literal.rs    | 167
-rw-r--r--   vendor/regex-syntax/src/hir/mod.rs        | 196
-rw-r--r--   vendor/regex-syntax/src/hir/print.rs      |  59
-rw-r--r--   vendor/regex-syntax/src/hir/translate.rs  | 297
-rw-r--r--   vendor/regex-syntax/src/hir/visitor.rs    |  15
-rw-r--r--   vendor/regex-syntax/src/lib.rs            |  10
-rw-r--r--   vendor/regex-syntax/src/parser.rs         |  25
-rwxr-xr-x   vendor/regex-syntax/test                  |   4
15 files changed, 1390 insertions, 457 deletions
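
The vendored regex-syntax moves from 0.7.2 to 0.8.2 in this diff. The user-visible additions below are the special word-boundary assertions (`\b{start}`, `\b{end}`, `\b{start-half}`, `\b{end-half}` and the `\<`/`\>` aliases), a boxed `Ast` (every variant now holds a `Box`, with constructor helpers such as `Ast::empty` and `Ast::literal`, so `Ast` fits in two machine words), and an optional `arbitrary` feature. A minimal sketch of how the new syntax surfaces in the AST, assuming the public `ast::parse::Parser` API is otherwise unchanged from 0.7:

    use regex_syntax::ast::{parse::Parser, AssertionKind, Ast};

    fn main() {
        // Parse a pattern that uses the new special word-boundary syntax.
        let ast = Parser::new()
            .parse(r"\b{start}\w+\b{end}")
            .expect("valid under regex-syntax 0.8");

        // The assertions come back as the WordBoundaryStart / WordBoundaryEnd
        // kinds added by this update; note that every Ast variant is boxed now.
        if let Ast::Concat(concat) = &ast {
            for item in &concat.asts {
                if let Ast::Assertion(a) = item {
                    assert!(matches!(
                        a.kind,
                        AssertionKind::WordBoundaryStart
                            | AssertionKind::WordBoundaryEnd
                    ));
                }
            }
        }
    }
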
diff --git a/vendor/regex-syntax/.cargo-checksum.json b/vendor/regex-syntax/.cargo-checksum.json index 94024d262..8152441ca 100644 --- a/vendor/regex-syntax/.cargo-checksum.json +++ b/vendor/regex-syntax/.cargo-checksum.json @@ -1 +1 @@ -{"files":{"Cargo.toml":"572d5198c4a39a5a55d9cbbb79de0e93f0b734f9a8cb3b7a80c0046897a79c85","LICENSE-APACHE":"a60eea817514531668d7e00765731449fe14d059d3249e0bc93b36de45f759f2","LICENSE-MIT":"6485b8ed310d3f0340bf1ad1f47645069ce4069dcc6bb46c7d5c6faf41de1fdb","README.md":"1f5f6c3e0f7e452236eb13a0a8627dceb35e7bd9e18798916b74b724ba8161fe","benches/bench.rs":"d2b6ae5b939abd6093064f144b981b7739d7f474ec0698a1268052fc92406635","src/ast/mod.rs":"f18ba2a3a082fa342512feed20da3bf5fb8b6d92d2e246809732860b446f75c9","src/ast/parse.rs":"49478a4ae5b557dc46aa7071c91c7a6905a0ce62910f8c8fefce464e5779e934","src/ast/print.rs":"62d319cd0b7e6f437dc8dcaf798046a44afa03e9aeb6a384d5cffa448383af53","src/ast/visitor.rs":"a58170758852b2a59c9232f3a027a91f0603b26dd0d9acbde73ac1f575ca600b","src/debug.rs":"7a16cca02be9715fdc8c26a32279465774623cd12fab1ec59ac25a6e3047817f","src/either.rs":"1758e3edd056884eccadd995708d1e374ba9aa65846bd0e13b1aae852607c560","src/error.rs":"01a67e3407b0d0d869119363e47a94d92158834bfe5936366c2e3f6f4ed13f36","src/hir/interval.rs":"2358e74b4d4aabfa62f79df855fd5d183779b86c4e14aae4ee42d8695bb3d010","src/hir/literal.rs":"a57f77b49998f4e4be9f9e4512f9934bc61640786dd7ac07c708825ba2b83429","src/hir/mod.rs":"48f885da2485afd638823dbdbe591f71051db398e86055820557d627d8b23309","src/hir/print.rs":"1f1fb454af939a53ea2799f55b67c2a2615c47c24dbf9f48a7c2a2b402d36e1f","src/hir/translate.rs":"1fbba4c456891ead0298ab6457ac5e9649431e52e75acc85a293d2a17886ac84","src/hir/visitor.rs":"e98aab188f92a92baee7b490d8558383373398697ae97335ae2635b6a5aa45ca","src/lib.rs":"4dbb230403b281f35dc1f9494e6e33e18bad2c2c648da78fd2b4662b85f88748","src/parser.rs":"cac139ed552a63ac4f81d60610cf0c2084421e34729475707681ef9392e1e9ae","src/rank.rs":"ff3d58b0cc5ffa69e2e8c56fc7d9ef41dd399d59a639a253a51551b858cb5bbd","src/unicode.rs":"9829458ef321b3bc22c21eae4b22805b33f8b5e67022928ffd9a9e0287bc7c31","src/unicode_tables/LICENSE-UNICODE":"74db5baf44a41b1000312c673544b3374e4198af5605c7f9080a402cec42cfa3","src/unicode_tables/age.rs":"2a2599a4e406fbbd0efd16aa6ce385c3f97b87c34820d6686a9f9113a5231c67","src/unicode_tables/case_folding_simple.rs":"9583803d4a10486da372b76979dbd26349b40766229467238eff972c1d78e47b","src/unicode_tables/general_category.rs":"36a93ba1cdeed96a00ff29a5ab5afd2c578a89541bf4dd8b18478146cebda0aa","src/unicode_tables/grapheme_cluster_break.rs":"39c388e9805a8391d3d3e69d74d831ce4fb99aa7e13e52c64dd2bd16d4765301","src/unicode_tables/mod.rs":"26c837099cd934c8062e24bc9a0aaecf15fe1de03f9c6da3f3e1e5ac3ca24bee","src/unicode_tables/perl_decimal.rs":"a98ea4afe71c2947023ae12bd25c46bf4c7de48eeb40979eca5c96ba62cee02e","src/unicode_tables/perl_space.rs":"ea2b3b84b4a48334082dadc6c37d9fcc9c9ded84b40e8f5c9c9314898638967e","src/unicode_tables/perl_word.rs":"6f1156bd6af32151ecffea4abe07a38fa04b1fc1b227ec1a8dac5d5f08d9d74b","src/unicode_tables/property_bool.rs":"0bd64f6e3228eaecf47824e238bdf1f8a9eef113ace6e790a57f045a8106701c","src/unicode_tables/property_names.rs":"5ca25437927eb70c62adf7d038e99a601cfb8a718677fd6de832589664d3c481","src/unicode_tables/property_values.rs":"5b4cc02392d382cf7af60455fc87b9980e97409b62a4b8d6c5843190d2e2d21d","src/unicode_tables/script.rs":"ea1d771b6d0a4b12d143f9bad2ea9342a0887878cbbe3c11262b6eabedaf2dd4","src/unicode_tables/script_extension.rs":"beeb8349703d903ff861beb8401bfd2599e457dc25df872e69d6ad1615f8b
5e9","src/unicode_tables/sentence_break.rs":"2befe2a27cc4e8aecb624e310ef9f371462470dd3b2f572cec1f5873a5e30aa9","src/unicode_tables/word_break.rs":"94679177731b515f0c360eff394286a1f99b59527bdbc826cbf51d32f9666187","src/utf8.rs":"e9a13623a94295b81969c5483de17219ff74bb20768be13c527010351245acbd","test":"01d6f6e9a689fb794173288a52f40f53b4f782176d0fcd648c7c6d3a2df05c63"},"package":"436b050e76ed2903236f032a59761c1eb99e1b0aead2c257922771dab1fc8c78"}
\ No newline at end of file +{"files":{"Cargo.toml":"33c96af38ed9f42d1ccbf85ecfeea1d46202943d01c595b8ee4dddef760e6bd5","LICENSE-APACHE":"a60eea817514531668d7e00765731449fe14d059d3249e0bc93b36de45f759f2","LICENSE-MIT":"6485b8ed310d3f0340bf1ad1f47645069ce4069dcc6bb46c7d5c6faf41de1fdb","README.md":"b2484aa7e66fb92d1378e9a7ce7605af18f77cb12c179866eaf92ba28cfec1d9","benches/bench.rs":"d2b6ae5b939abd6093064f144b981b7739d7f474ec0698a1268052fc92406635","src/ast/mod.rs":"700c2f779fccb529db7b444819d53c38f916b065d3d05a74282f929af581e8b1","src/ast/parse.rs":"fcd45146eaf747d15a2a519d34754638d451ab83e88b5962841cf7a0dd32e988","src/ast/print.rs":"99cb69ece252ef31e0be177fb3364797eb30b785f936532b8dcd8106e7be0738","src/ast/visitor.rs":"f0fdf758801fe70e6b299b73ab63196e814af95ef6eccad7ef4f72075743fcf6","src/debug.rs":"7a16cca02be9715fdc8c26a32279465774623cd12fab1ec59ac25a6e3047817f","src/either.rs":"1758e3edd056884eccadd995708d1e374ba9aa65846bd0e13b1aae852607c560","src/error.rs":"01a67e3407b0d0d869119363e47a94d92158834bfe5936366c2e3f6f4ed13f36","src/hir/interval.rs":"2358e74b4d4aabfa62f79df855fd5d183779b86c4e14aae4ee42d8695bb3d010","src/hir/literal.rs":"6a8108b8919fbfd9ab93072846124c51d2998489810fcd6e7a89fdccc45833e0","src/hir/mod.rs":"eca183b8e173f486c1a11a5fa10895c96067162c8ec936871f937ca7fca5f710","src/hir/print.rs":"ad51c515c933bfd67d307ba3d7e6ac59c9c5903b4f393a9f9a4785c92b88348d","src/hir/translate.rs":"5fbff527c53f217ba2bac9b0948d7de74164625d08674b91a479ced271159ebd","src/hir/visitor.rs":"71ca9c93aa48a5ed445399659fa6455093a1bbd9ef44b66bc7095c1b08b2ec1f","src/lib.rs":"5ae457d402e49443bdb23b71353693dd3b0d263b57a6eeb9eb5b5dae5c901bdd","src/parser.rs":"6b2f4f27e3331a01a25b87c89368dd2e54396bd425dac57941f9c1ebfd238ac8","src/rank.rs":"ff3d58b0cc5ffa69e2e8c56fc7d9ef41dd399d59a639a253a51551b858cb5bbd","src/unicode.rs":"9829458ef321b3bc22c21eae4b22805b33f8b5e67022928ffd9a9e0287bc7c31","src/unicode_tables/LICENSE-UNICODE":"74db5baf44a41b1000312c673544b3374e4198af5605c7f9080a402cec42cfa3","src/unicode_tables/age.rs":"2a2599a4e406fbbd0efd16aa6ce385c3f97b87c34820d6686a9f9113a5231c67","src/unicode_tables/case_folding_simple.rs":"9583803d4a10486da372b76979dbd26349b40766229467238eff972c1d78e47b","src/unicode_tables/general_category.rs":"36a93ba1cdeed96a00ff29a5ab5afd2c578a89541bf4dd8b18478146cebda0aa","src/unicode_tables/grapheme_cluster_break.rs":"39c388e9805a8391d3d3e69d74d831ce4fb99aa7e13e52c64dd2bd16d4765301","src/unicode_tables/mod.rs":"26c837099cd934c8062e24bc9a0aaecf15fe1de03f9c6da3f3e1e5ac3ca24bee","src/unicode_tables/perl_decimal.rs":"a98ea4afe71c2947023ae12bd25c46bf4c7de48eeb40979eca5c96ba62cee02e","src/unicode_tables/perl_space.rs":"ea2b3b84b4a48334082dadc6c37d9fcc9c9ded84b40e8f5c9c9314898638967e","src/unicode_tables/perl_word.rs":"6f1156bd6af32151ecffea4abe07a38fa04b1fc1b227ec1a8dac5d5f08d9d74b","src/unicode_tables/property_bool.rs":"0bd64f6e3228eaecf47824e238bdf1f8a9eef113ace6e790a57f045a8106701c","src/unicode_tables/property_names.rs":"5ca25437927eb70c62adf7d038e99a601cfb8a718677fd6de832589664d3c481","src/unicode_tables/property_values.rs":"5b4cc02392d382cf7af60455fc87b9980e97409b62a4b8d6c5843190d2e2d21d","src/unicode_tables/script.rs":"ea1d771b6d0a4b12d143f9bad2ea9342a0887878cbbe3c11262b6eabedaf2dd4","src/unicode_tables/script_extension.rs":"beeb8349703d903ff861beb8401bfd2599e457dc25df872e69d6ad1615f8b5e9","src/unicode_tables/sentence_break.rs":"2befe2a27cc4e8aecb624e310ef9f371462470dd3b2f572cec1f5873a5e30aa9","src/unicode_tables/word_break.rs":"94679177731b515f0c360eff394286a1f99b59527bdbc826cbf51d32f96661
87","src/utf8.rs":"e9a13623a94295b81969c5483de17219ff74bb20768be13c527010351245acbd","test":"c7de5fbc0010d9b5b758cd49956375a64b88601c068167fd366808950257f108"},"package":"c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f"}
\ No newline at end of file diff --git a/vendor/regex-syntax/Cargo.toml b/vendor/regex-syntax/Cargo.toml index cd4b92cb2..3602ab33c 100644 --- a/vendor/regex-syntax/Cargo.toml +++ b/vendor/regex-syntax/Cargo.toml @@ -11,16 +11,18 @@ [package] edition = "2021" -rust-version = "1.60.0" +rust-version = "1.65" name = "regex-syntax" -version = "0.7.2" -authors = ["The Rust Project Developers"] +version = "0.8.2" +authors = [ + "The Rust Project Developers", + "Andrew Gallant <jamslam@gmail.com>", +] description = "A regular expression parser." -homepage = "https://github.com/rust-lang/regex" documentation = "https://docs.rs/regex-syntax" readme = "README.md" license = "MIT OR Apache-2.0" -repository = "https://github.com/rust-lang/regex" +repository = "https://github.com/rust-lang/regex/tree/master/regex-syntax" [package.metadata.docs.rs] all-features = true @@ -29,7 +31,13 @@ rustdoc-args = [ "docsrs", ] +[dependencies.arbitrary] +version = "1.3.0" +features = ["derive"] +optional = true + [features] +arbitrary = ["dep:arbitrary"] default = [ "std", "unicode", diff --git a/vendor/regex-syntax/README.md b/vendor/regex-syntax/README.md index ff4fe094c..529513b0c 100644 --- a/vendor/regex-syntax/README.md +++ b/vendor/regex-syntax/README.md @@ -4,7 +4,6 @@ This crate provides a robust regular expression parser. [![Build status](https://github.com/rust-lang/regex/workflows/ci/badge.svg)](https://github.com/rust-lang/regex/actions) [![Crates.io](https://img.shields.io/crates/v/regex-syntax.svg)](https://crates.io/crates/regex-syntax) -[![Rust](https://img.shields.io/badge/rust-1.28.0%2B-blue.svg?maxAge=3600)](https://github.com/rust-lang/regex) ### Documentation diff --git a/vendor/regex-syntax/src/ast/mod.rs b/vendor/regex-syntax/src/ast/mod.rs index a95b1c873..6a77ee134 100644 --- a/vendor/regex-syntax/src/ast/mod.rs +++ b/vendor/regex-syntax/src/ast/mod.rs @@ -20,6 +20,7 @@ mod visitor; /// valid Unicode property name. That particular error is reported when /// translating an AST to the high-level intermediate representation (`HIR`). #[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub struct Error { /// The kind of error. kind: ErrorKind, @@ -70,6 +71,7 @@ impl Error { /// new variant is not considered a breaking change. #[non_exhaustive] #[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub enum ErrorKind { /// The capturing group limit was exceeded. /// @@ -160,6 +162,18 @@ pub enum ErrorKind { /// `(?i)*`. It is, however, possible to create a repetition operating on /// an empty sub-expression. For example, `()*` is still considered valid. RepetitionMissing, + /// The special word boundary syntax, `\b{something}`, was used, but + /// either EOF without `}` was seen, or an invalid character in the + /// braces was seen. + SpecialWordBoundaryUnclosed, + /// The special word boundary syntax, `\b{something}`, was used, but + /// `something` was not recognized as a valid word boundary kind. + SpecialWordBoundaryUnrecognized, + /// The syntax `\b{` was observed, but afterwards the end of the pattern + /// was observed without being able to tell whether it was meant to be a + /// bounded repetition on the `\b` or the beginning of a special word + /// boundary assertion. + SpecialWordOrRepetitionUnexpectedEof, /// The Unicode class is not valid. This typically occurs when a `\p` is /// followed by something other than a `{`. 
UnicodeClassInvalid, @@ -258,6 +272,29 @@ impl core::fmt::Display for ErrorKind { RepetitionMissing => { write!(f, "repetition operator missing expression") } + SpecialWordBoundaryUnclosed => { + write!( + f, + "special word boundary assertion is either \ + unclosed or contains an invalid character", + ) + } + SpecialWordBoundaryUnrecognized => { + write!( + f, + "unrecognized special word boundary assertion, \ + valid choices are: start, end, start-half \ + or end-half", + ) + } + SpecialWordOrRepetitionUnexpectedEof => { + write!( + f, + "found either the beginning of a special word \ + boundary or a bounded repetition on a \\b with \ + an opening brace, but no closing brace", + ) + } UnicodeClassInvalid => { write!(f, "invalid Unicode character class") } @@ -278,6 +315,7 @@ impl core::fmt::Display for ErrorKind { /// All span positions are absolute byte offsets that can be used on the /// original regular expression that was parsed. #[derive(Clone, Copy, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub struct Span { /// The start byte offset. pub start: Position, @@ -308,6 +346,7 @@ impl PartialOrd for Span { /// A position encodes one half of a span, and include the byte offset, line /// number and column number. #[derive(Clone, Copy, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub struct Position { /// The absolute offset of this position, starting at `0` from the /// beginning of the regular expression pattern string. @@ -396,6 +435,7 @@ impl Position { /// comment contains a span of precisely where it occurred in the original /// regular expression. #[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub struct WithComments { /// The actual ast. pub ast: Ast, @@ -408,6 +448,7 @@ pub struct WithComments { /// A regular expression can only contain comments when the `x` flag is /// enabled. #[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub struct Comment { /// The span of this comment, including the beginning `#` and ending `\n`. pub span: Span, @@ -424,31 +465,97 @@ pub struct Comment { /// This type defines its own destructor that uses constant stack space and /// heap space proportional to the size of the `Ast`. #[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub enum Ast { /// An empty regex that matches everything. - Empty(Span), + Empty(Box<Span>), /// A set of flags, e.g., `(?is)`. - Flags(SetFlags), + Flags(Box<SetFlags>), /// A single character literal, which includes escape sequences. - Literal(Literal), + Literal(Box<Literal>), /// The "any character" class. - Dot(Span), + Dot(Box<Span>), /// A single zero-width assertion. - Assertion(Assertion), - /// A single character class. This includes all forms of character classes - /// except for `.`. e.g., `\d`, `\pN`, `[a-z]` and `[[:alpha:]]`. - Class(Class), + Assertion(Box<Assertion>), + /// A single Unicode character class, e.g., `\pL` or `\p{Greek}`. + ClassUnicode(Box<ClassUnicode>), + /// A single perl character class, e.g., `\d` or `\W`. + ClassPerl(Box<ClassPerl>), + /// A single bracketed character class set, which may contain zero or more + /// character ranges and/or zero or more nested classes. e.g., + /// `[a-zA-Z\pL]`. + ClassBracketed(Box<ClassBracketed>), /// A repetition operator applied to an arbitrary regular expression. 
- Repetition(Repetition), + Repetition(Box<Repetition>), /// A grouped regular expression. - Group(Group), + Group(Box<Group>), /// An alternation of regular expressions. - Alternation(Alternation), + Alternation(Box<Alternation>), /// A concatenation of regular expressions. - Concat(Concat), + Concat(Box<Concat>), } impl Ast { + /// Create an "empty" AST item. + pub fn empty(span: Span) -> Ast { + Ast::Empty(Box::new(span)) + } + + /// Create a "flags" AST item. + pub fn flags(e: SetFlags) -> Ast { + Ast::Flags(Box::new(e)) + } + + /// Create a "literal" AST item. + pub fn literal(e: Literal) -> Ast { + Ast::Literal(Box::new(e)) + } + + /// Create a "dot" AST item. + pub fn dot(span: Span) -> Ast { + Ast::Dot(Box::new(span)) + } + + /// Create a "assertion" AST item. + pub fn assertion(e: Assertion) -> Ast { + Ast::Assertion(Box::new(e)) + } + + /// Create a "Unicode class" AST item. + pub fn class_unicode(e: ClassUnicode) -> Ast { + Ast::ClassUnicode(Box::new(e)) + } + + /// Create a "Perl class" AST item. + pub fn class_perl(e: ClassPerl) -> Ast { + Ast::ClassPerl(Box::new(e)) + } + + /// Create a "bracketed class" AST item. + pub fn class_bracketed(e: ClassBracketed) -> Ast { + Ast::ClassBracketed(Box::new(e)) + } + + /// Create a "repetition" AST item. + pub fn repetition(e: Repetition) -> Ast { + Ast::Repetition(Box::new(e)) + } + + /// Create a "group" AST item. + pub fn group(e: Group) -> Ast { + Ast::Group(Box::new(e)) + } + + /// Create a "alternation" AST item. + pub fn alternation(e: Alternation) -> Ast { + Ast::Alternation(Box::new(e)) + } + + /// Create a "concat" AST item. + pub fn concat(e: Concat) -> Ast { + Ast::Concat(Box::new(e)) + } + /// Return the span of this abstract syntax tree. pub fn span(&self) -> &Span { match *self { @@ -457,7 +564,9 @@ impl Ast { Ast::Literal(ref x) => &x.span, Ast::Dot(ref span) => span, Ast::Assertion(ref x) => &x.span, - Ast::Class(ref x) => x.span(), + Ast::ClassUnicode(ref x) => &x.span, + Ast::ClassPerl(ref x) => &x.span, + Ast::ClassBracketed(ref x) => &x.span, Ast::Repetition(ref x) => &x.span, Ast::Group(ref x) => &x.span, Ast::Alternation(ref x) => &x.span, @@ -481,8 +590,10 @@ impl Ast { | Ast::Flags(_) | Ast::Literal(_) | Ast::Dot(_) - | Ast::Assertion(_) => false, - Ast::Class(_) + | Ast::Assertion(_) + | Ast::ClassUnicode(_) + | Ast::ClassPerl(_) => false, + Ast::ClassBracketed(_) | Ast::Repetition(_) | Ast::Group(_) | Ast::Alternation(_) @@ -508,6 +619,7 @@ impl core::fmt::Display for Ast { /// An alternation of regular expressions. #[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub struct Alternation { /// The span of this alternation. pub span: Span, @@ -518,20 +630,21 @@ pub struct Alternation { impl Alternation { /// Return this alternation as an AST. /// - /// If this alternation contains zero ASTs, then Ast::Empty is - /// returned. If this alternation contains exactly 1 AST, then the - /// corresponding AST is returned. Otherwise, Ast::Alternation is returned. + /// If this alternation contains zero ASTs, then `Ast::empty` is returned. + /// If this alternation contains exactly 1 AST, then the corresponding AST + /// is returned. Otherwise, `Ast::alternation` is returned. pub fn into_ast(mut self) -> Ast { match self.asts.len() { - 0 => Ast::Empty(self.span), + 0 => Ast::empty(self.span), 1 => self.asts.pop().unwrap(), - _ => Ast::Alternation(self), + _ => Ast::alternation(self), } } } /// A concatenation of regular expressions. 
#[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub struct Concat { /// The span of this concatenation. pub span: Span, @@ -542,14 +655,14 @@ pub struct Concat { impl Concat { /// Return this concatenation as an AST. /// - /// If this concatenation contains zero ASTs, then Ast::Empty is - /// returned. If this concatenation contains exactly 1 AST, then the - /// corresponding AST is returned. Otherwise, Ast::Concat is returned. + /// If this alternation contains zero ASTs, then `Ast::empty` is returned. + /// If this alternation contains exactly 1 AST, then the corresponding AST + /// is returned. Otherwise, `Ast::concat` is returned. pub fn into_ast(mut self) -> Ast { match self.asts.len() { - 0 => Ast::Empty(self.span), + 0 => Ast::empty(self.span), 1 => self.asts.pop().unwrap(), - _ => Ast::Concat(self), + _ => Ast::concat(self), } } } @@ -560,6 +673,7 @@ impl Concat { /// represented in their literal form, e.g., `a` or in their escaped form, /// e.g., `\x61`. #[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub struct Literal { /// The span of this literal. pub span: Span, @@ -584,6 +698,7 @@ impl Literal { /// The kind of a single literal expression. #[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub enum LiteralKind { /// The literal is written verbatim, e.g., `a` or `☃`. Verbatim, @@ -613,6 +728,7 @@ pub enum LiteralKind { /// A special literal is a special escape sequence recognized by the regex /// parser, e.g., `\f` or `\n`. #[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub enum SpecialLiteralKind { /// Bell, spelled `\a` (`\x07`). Bell, @@ -637,6 +753,7 @@ pub enum SpecialLiteralKind { /// differ when used without brackets in the number of hex digits that must /// follow. #[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub enum HexLiteralKind { /// A `\x` prefix. When used without brackets, this form is limited to /// two digits. @@ -662,32 +779,9 @@ impl HexLiteralKind { } } -/// A single character class expression. -#[derive(Clone, Debug, Eq, PartialEq)] -pub enum Class { - /// A Unicode character class, e.g., `\pL` or `\p{Greek}`. - Unicode(ClassUnicode), - /// A perl character class, e.g., `\d` or `\W`. - Perl(ClassPerl), - /// A bracketed character class set, which may contain zero or more - /// character ranges and/or zero or more nested classes. e.g., - /// `[a-zA-Z\pL]`. - Bracketed(ClassBracketed), -} - -impl Class { - /// Return the span of this character class. - pub fn span(&self) -> &Span { - match *self { - Class::Perl(ref x) => &x.span, - Class::Unicode(ref x) => &x.span, - Class::Bracketed(ref x) => &x.span, - } - } -} - /// A Perl character class. #[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub struct ClassPerl { /// The span of this class. pub span: Span, @@ -700,6 +794,7 @@ pub struct ClassPerl { /// The available Perl character classes. #[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub enum ClassPerlKind { /// Decimal numbers. Digit, @@ -711,6 +806,7 @@ pub enum ClassPerlKind { /// An ASCII character class. 
#[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub struct ClassAscii { /// The span of this class. pub span: Span, @@ -723,6 +819,7 @@ pub struct ClassAscii { /// The available ASCII character classes. #[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub enum ClassAsciiKind { /// `[0-9A-Za-z]` Alnum, @@ -786,6 +883,7 @@ impl ClassAsciiKind { /// A Unicode character class. #[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub struct ClassUnicode { /// The span of this class. pub span: Span, @@ -838,8 +936,156 @@ pub enum ClassUnicodeKind { }, } +#[cfg(feature = "arbitrary")] +impl arbitrary::Arbitrary<'_> for ClassUnicodeKind { + fn arbitrary( + u: &mut arbitrary::Unstructured, + ) -> arbitrary::Result<ClassUnicodeKind> { + #[cfg(any( + feature = "unicode-age", + feature = "unicode-bool", + feature = "unicode-gencat", + feature = "unicode-perl", + feature = "unicode-script", + feature = "unicode-segment", + ))] + { + use alloc::string::ToString; + + use super::unicode_tables::{ + property_names::PROPERTY_NAMES, + property_values::PROPERTY_VALUES, + }; + + match u.choose_index(3)? { + 0 => { + let all = PROPERTY_VALUES + .iter() + .flat_map(|e| e.1.iter()) + .filter(|(name, _)| name.len() == 1) + .count(); + let idx = u.choose_index(all)?; + let value = PROPERTY_VALUES + .iter() + .flat_map(|e| e.1.iter()) + .take(idx + 1) + .last() + .unwrap() + .0 + .chars() + .next() + .unwrap(); + Ok(ClassUnicodeKind::OneLetter(value)) + } + 1 => { + let all = PROPERTY_VALUES + .iter() + .map(|e| e.1.len()) + .sum::<usize>() + + PROPERTY_NAMES.len(); + let idx = u.choose_index(all)?; + let name = PROPERTY_VALUES + .iter() + .flat_map(|e| e.1.iter()) + .chain(PROPERTY_NAMES) + .map(|(_, e)| e) + .take(idx + 1) + .last() + .unwrap(); + Ok(ClassUnicodeKind::Named(name.to_string())) + } + 2 => { + let all = PROPERTY_VALUES + .iter() + .map(|e| e.1.len()) + .sum::<usize>(); + let idx = u.choose_index(all)?; + let (prop, value) = PROPERTY_VALUES + .iter() + .flat_map(|e| { + e.1.iter().map(|(_, value)| (e.0, value)) + }) + .take(idx + 1) + .last() + .unwrap(); + Ok(ClassUnicodeKind::NamedValue { + op: u.arbitrary()?, + name: prop.to_string(), + value: value.to_string(), + }) + } + _ => unreachable!("index chosen is impossible"), + } + } + #[cfg(not(any( + feature = "unicode-age", + feature = "unicode-bool", + feature = "unicode-gencat", + feature = "unicode-perl", + feature = "unicode-script", + feature = "unicode-segment", + )))] + { + match u.choose_index(3)? 
{ + 0 => Ok(ClassUnicodeKind::OneLetter(u.arbitrary()?)), + 1 => Ok(ClassUnicodeKind::Named(u.arbitrary()?)), + 2 => Ok(ClassUnicodeKind::NamedValue { + op: u.arbitrary()?, + name: u.arbitrary()?, + value: u.arbitrary()?, + }), + _ => unreachable!("index chosen is impossible"), + } + } + } + + fn size_hint(depth: usize) -> (usize, Option<usize>) { + #[cfg(any( + feature = "unicode-age", + feature = "unicode-bool", + feature = "unicode-gencat", + feature = "unicode-perl", + feature = "unicode-script", + feature = "unicode-segment", + ))] + { + arbitrary::size_hint::and_all(&[ + usize::size_hint(depth), + usize::size_hint(depth), + arbitrary::size_hint::or( + (0, Some(0)), + ClassUnicodeOpKind::size_hint(depth), + ), + ]) + } + #[cfg(not(any( + feature = "unicode-age", + feature = "unicode-bool", + feature = "unicode-gencat", + feature = "unicode-perl", + feature = "unicode-script", + feature = "unicode-segment", + )))] + { + arbitrary::size_hint::and( + usize::size_hint(depth), + arbitrary::size_hint::or_all(&[ + char::size_hint(depth), + String::size_hint(depth), + arbitrary::size_hint::and_all(&[ + String::size_hint(depth), + String::size_hint(depth), + ClassUnicodeOpKind::size_hint(depth), + ]), + ]), + ) + } + } +} + /// The type of op used in a Unicode character class. #[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub enum ClassUnicodeOpKind { /// A property set to a specific value, e.g., `\p{scx=Katakana}`. Equal, @@ -862,6 +1108,7 @@ impl ClassUnicodeOpKind { /// A bracketed character class, e.g., `[a-z0-9]`. #[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub struct ClassBracketed { /// The span of this class. pub span: Span, @@ -880,6 +1127,7 @@ pub struct ClassBracketed { /// items (literals, ranges, other bracketed classes) or a tree of binary set /// operations. #[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub enum ClassSet { /// An item, which can be a single literal, range, nested character class /// or a union of items. @@ -913,6 +1161,7 @@ impl ClassSet { /// A single component of a character class set. #[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub enum ClassSetItem { /// An empty item. /// @@ -956,6 +1205,7 @@ impl ClassSetItem { /// A single character class range in a set. #[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub struct ClassSetRange { /// The span of this range. pub span: Span, @@ -977,6 +1227,7 @@ impl ClassSetRange { /// A union of items inside a character class set. #[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub struct ClassSetUnion { /// The span of the items in this operation. e.g., the `a-z0-9` in /// `[^a-z0-9]` @@ -1021,6 +1272,7 @@ impl ClassSetUnion { /// A Unicode character class set operation. #[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub struct ClassSetBinaryOp { /// The span of this operation. e.g., the `a-z--[h-p]` in `[a-z--h-p]`. pub span: Span, @@ -1038,6 +1290,7 @@ pub struct ClassSetBinaryOp { /// explicit union operator. Concatenation inside a character class corresponds /// to the union operation. 
#[derive(Clone, Copy, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub enum ClassSetBinaryOpKind { /// The intersection of two sets, e.g., `\pN&&[a-z]`. Intersection, @@ -1051,6 +1304,7 @@ pub enum ClassSetBinaryOpKind { /// A single zero-width assertion. #[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub struct Assertion { /// The span of this assertion. pub span: Span, @@ -1060,6 +1314,7 @@ pub struct Assertion { /// An assertion kind. #[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub enum AssertionKind { /// `^` StartLine, @@ -1073,10 +1328,23 @@ pub enum AssertionKind { WordBoundary, /// `\B` NotWordBoundary, + /// `\b{start}` + WordBoundaryStart, + /// `\b{end}` + WordBoundaryEnd, + /// `\<` (alias for `\b{start}`) + WordBoundaryStartAngle, + /// `\>` (alias for `\b{end}`) + WordBoundaryEndAngle, + /// `\b{start-half}` + WordBoundaryStartHalf, + /// `\b{end-half}` + WordBoundaryEndHalf, } /// A repetition operation applied to a regular expression. #[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub struct Repetition { /// The span of this operation. pub span: Span, @@ -1090,6 +1358,7 @@ pub struct Repetition { /// The repetition operator itself. #[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub struct RepetitionOp { /// The span of this operator. This includes things like `+`, `*?` and /// `{m,n}`. @@ -1100,6 +1369,7 @@ pub struct RepetitionOp { /// The kind of a repetition operator. #[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub enum RepetitionKind { /// `?` ZeroOrOne, @@ -1113,6 +1383,7 @@ pub enum RepetitionKind { /// A range repetition operator. #[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub enum RepetitionRange { /// `{m}` Exactly(u32), @@ -1142,6 +1413,7 @@ impl RepetitionRange { /// contains a sub-expression, e.g., `(a)`, `(?P<name>a)`, `(?:a)` and /// `(?is:a)`. #[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub struct Group { /// The span of this group. pub span: Span, @@ -1183,6 +1455,7 @@ impl Group { /// The kind of a group. #[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub enum GroupKind { /// `(a)` CaptureIndex(u32), @@ -1211,8 +1484,38 @@ pub struct CaptureName { pub index: u32, } +#[cfg(feature = "arbitrary")] +impl arbitrary::Arbitrary<'_> for CaptureName { + fn arbitrary( + u: &mut arbitrary::Unstructured, + ) -> arbitrary::Result<CaptureName> { + let len = u.arbitrary_len::<char>()?; + if len == 0 { + return Err(arbitrary::Error::NotEnoughData); + } + let mut name: String = String::new(); + for _ in 0..len { + let ch: char = u.arbitrary()?; + let cp = u32::from(ch); + let ascii_letter_offset = u8::try_from(cp % 26).unwrap(); + let ascii_letter = b'a' + ascii_letter_offset; + name.push(char::from(ascii_letter)); + } + Ok(CaptureName { span: u.arbitrary()?, name, index: u.arbitrary()? }) + } + + fn size_hint(depth: usize) -> (usize, Option<usize>) { + arbitrary::size_hint::and_all(&[ + Span::size_hint(depth), + usize::size_hint(depth), + u32::size_hint(depth), + ]) + } +} + /// A group of flags that is not applied to a particular regular expression. 
#[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub struct SetFlags { /// The span of these flags, including the grouping parentheses. pub span: Span, @@ -1224,6 +1527,7 @@ pub struct SetFlags { /// /// This corresponds only to the sequence of flags themselves, e.g., `is-u`. #[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub struct Flags { /// The span of this group of flags. pub span: Span, @@ -1276,6 +1580,7 @@ impl Flags { /// A single item in a group of flags. #[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub struct FlagsItem { /// The span of this item. pub span: Span, @@ -1285,6 +1590,7 @@ pub struct FlagsItem { /// The kind of an item in a group of flags. #[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub enum FlagsItemKind { /// A negation operator applied to all subsequent flags in the enclosing /// group. @@ -1305,6 +1611,7 @@ impl FlagsItemKind { /// A single flag. #[derive(Clone, Copy, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub enum Flag { /// `i` CaseInsensitive, @@ -1334,8 +1641,10 @@ impl Drop for Ast { | Ast::Literal(_) | Ast::Dot(_) | Ast::Assertion(_) - // Classes are recursive, so they get their own Drop impl. - | Ast::Class(_) => return, + | Ast::ClassUnicode(_) + | Ast::ClassPerl(_) + // Bracketed classes are recursive, they get their own Drop impl. + | Ast::ClassBracketed(_) => return, Ast::Repetition(ref x) if !x.ast.has_subexprs() => return, Ast::Group(ref x) if !x.ast.has_subexprs() => return, Ast::Alternation(ref x) if x.asts.is_empty() => return, @@ -1344,7 +1653,7 @@ impl Drop for Ast { } let empty_span = || Span::splat(Position::new(0, 0, 0)); - let empty_ast = || Ast::Empty(empty_span()); + let empty_ast = || Ast::empty(empty_span()); let mut stack = vec![mem::replace(self, empty_ast())]; while let Some(mut ast) = stack.pop() { match ast { @@ -1353,8 +1662,11 @@ impl Drop for Ast { | Ast::Literal(_) | Ast::Dot(_) | Ast::Assertion(_) - // Classes are recursive, so they get their own Drop impl. - | Ast::Class(_) => {} + | Ast::ClassUnicode(_) + | Ast::ClassPerl(_) + // Bracketed classes are recursive, so they get their own Drop + // impl. + | Ast::ClassBracketed(_) => {} Ast::Repetition(ref mut x) => { stack.push(mem::replace(&mut x.ast, empty_ast())); } @@ -1447,9 +1759,9 @@ mod tests { let run = || { let span = || Span::splat(Position::new(0, 0, 0)); - let mut ast = Ast::Empty(span()); + let mut ast = Ast::empty(span()); for i in 0..200 { - ast = Ast::Group(Group { + ast = Ast::group(Group { span: span(), kind: GroupKind::CaptureIndex(i), ast: Box::new(ast), @@ -1478,4 +1790,20 @@ mod tests { .join() .unwrap(); } + + // This tests that our `Ast` has a reasonable size. This isn't a hard rule + // and it can be increased if given a good enough reason. But this test + // exists because the size of `Ast` was at one point over 200 bytes on a + // 64-bit target. Wow. 
+ #[test] + fn ast_size() { + let max = 2 * core::mem::size_of::<usize>(); + let size = core::mem::size_of::<Ast>(); + assert!( + size <= max, + "Ast size of {} bytes is bigger than suggested max {}", + size, + max + ); + } } diff --git a/vendor/regex-syntax/src/ast/parse.rs b/vendor/regex-syntax/src/ast/parse.rs index 9cf64e9ec..593b14fbc 100644 --- a/vendor/regex-syntax/src/ast/parse.rs +++ b/vendor/regex-syntax/src/ast/parse.rs @@ -53,11 +53,11 @@ impl Primitive { /// Convert this primitive into a proper AST. fn into_ast(self) -> Ast { match self { - Primitive::Literal(lit) => Ast::Literal(lit), - Primitive::Assertion(assert) => Ast::Assertion(assert), - Primitive::Dot(span) => Ast::Dot(span), - Primitive::Perl(cls) => Ast::Class(ast::Class::Perl(cls)), - Primitive::Unicode(cls) => Ast::Class(ast::Class::Unicode(cls)), + Primitive::Literal(lit) => Ast::literal(lit), + Primitive::Assertion(assert) => Ast::assertion(assert), + Primitive::Dot(span) => Ast::dot(span), + Primitive::Perl(cls) => Ast::class_perl(cls), + Primitive::Unicode(cls) => Ast::class_unicode(cls), } } @@ -383,7 +383,7 @@ impl<'s, P: Borrow<Parser>> ParserI<'s, P> { /// Return a reference to the pattern being parsed. fn pattern(&self) -> &str { - self.pattern.borrow() + self.pattern } /// Create a new error with the given span and error type. @@ -691,7 +691,7 @@ impl<'s, P: Borrow<Parser>> ParserI<'s, P> { self.parser().ignore_whitespace.set(v); } - concat.asts.push(Ast::Flags(set)); + concat.asts.push(Ast::flags(set)); Ok(concat) } Either::Right(group) => { @@ -764,7 +764,7 @@ impl<'s, P: Borrow<Parser>> ParserI<'s, P> { group.ast = Box::new(group_concat.into_ast()); } } - prior_concat.asts.push(Ast::Group(group)); + prior_concat.asts.push(Ast::group(group)); Ok(prior_concat) } @@ -783,7 +783,7 @@ impl<'s, P: Borrow<Parser>> ParserI<'s, P> { Some(GroupState::Alternation(mut alt)) => { alt.span.end = self.pos(); alt.asts.push(concat.into_ast()); - Ok(Ast::Alternation(alt)) + Ok(Ast::alternation(alt)) } Some(GroupState::Group { group, .. }) => { return Err( @@ -850,7 +850,7 @@ impl<'s, P: Borrow<Parser>> ParserI<'s, P> { fn pop_class( &self, nested_union: ast::ClassSetUnion, - ) -> Result<Either<ast::ClassSetUnion, ast::Class>> { + ) -> Result<Either<ast::ClassSetUnion, ast::ClassBracketed>> { assert_eq!(self.char(), ']'); let item = ast::ClassSet::Item(nested_union.into_item()); @@ -882,7 +882,7 @@ impl<'s, P: Borrow<Parser>> ParserI<'s, P> { set.span.end = self.pos(); set.kind = prevset; if stack.is_empty() { - Ok(Either::Right(ast::Class::Bracketed(set))) + Ok(Either::Right(set)) } else { union.push(ast::ClassSetItem::Bracketed(Box::new(set))); Ok(Either::Left(union)) @@ -976,7 +976,7 @@ impl<'s, P: Borrow<Parser>> ParserI<'s, P> { '|' => concat = self.push_alternate(concat)?, '[' => { let class = self.parse_set_class()?; - concat.asts.push(Ast::Class(class)); + concat.asts.push(Ast::class_bracketed(class)); } '?' 
=> { concat = self.parse_uncounted_repetition( @@ -1057,7 +1057,7 @@ impl<'s, P: Borrow<Parser>> ParserI<'s, P> { greedy = false; self.bump(); } - concat.asts.push(Ast::Repetition(ast::Repetition { + concat.asts.push(Ast::repetition(ast::Repetition { span: ast.span().with_end(self.pos()), op: ast::RepetitionOp { span: Span::new(op_start, self.pos()), @@ -1159,7 +1159,7 @@ impl<'s, P: Borrow<Parser>> ParserI<'s, P> { self.error(op_span, ast::ErrorKind::RepetitionCountInvalid) ); } - concat.asts.push(Ast::Repetition(ast::Repetition { + concat.asts.push(Ast::repetition(ast::Repetition { span: ast.span().with_end(self.pos()), op: ast::RepetitionOp { span: op_span, @@ -1212,7 +1212,7 @@ impl<'s, P: Borrow<Parser>> ParserI<'s, P> { Ok(Either::Right(ast::Group { span: open_span, kind: ast::GroupKind::CaptureName { starts_with_p, name }, - ast: Box::new(Ast::Empty(self.span())), + ast: Box::new(Ast::empty(self.span())), })) } else if self.bump_if("?") { if self.is_eof() { @@ -1241,7 +1241,7 @@ impl<'s, P: Borrow<Parser>> ParserI<'s, P> { Ok(Either::Right(ast::Group { span: open_span, kind: ast::GroupKind::NonCapturing(flags), - ast: Box::new(Ast::Empty(self.span())), + ast: Box::new(Ast::empty(self.span())), })) } } else { @@ -1249,7 +1249,7 @@ impl<'s, P: Borrow<Parser>> ParserI<'s, P> { Ok(Either::Right(ast::Group { span: open_span, kind: ast::GroupKind::CaptureIndex(capture_index), - ast: Box::new(Ast::Empty(self.span())), + ast: Box::new(Ast::empty(self.span())), })) } } @@ -1528,18 +1528,115 @@ impl<'s, P: Borrow<Parser>> ParserI<'s, P> { span, kind: ast::AssertionKind::EndText, })), - 'b' => Ok(Primitive::Assertion(ast::Assertion { - span, - kind: ast::AssertionKind::WordBoundary, - })), + 'b' => { + let mut wb = ast::Assertion { + span, + kind: ast::AssertionKind::WordBoundary, + }; + // After a \b, we "try" to parse things like \b{start} for + // special word boundary assertions. + if !self.is_eof() && self.char() == '{' { + if let Some(kind) = + self.maybe_parse_special_word_boundary(start)? + { + wb.kind = kind; + wb.span.end = self.pos(); + } + } + Ok(Primitive::Assertion(wb)) + } 'B' => Ok(Primitive::Assertion(ast::Assertion { span, kind: ast::AssertionKind::NotWordBoundary, })), + '<' => Ok(Primitive::Assertion(ast::Assertion { + span, + kind: ast::AssertionKind::WordBoundaryStartAngle, + })), + '>' => Ok(Primitive::Assertion(ast::Assertion { + span, + kind: ast::AssertionKind::WordBoundaryEndAngle, + })), _ => Err(self.error(span, ast::ErrorKind::EscapeUnrecognized)), } } + /// Attempt to parse a specialty word boundary. That is, `\b{start}`, + /// `\b{end}`, `\b{start-half}` or `\b{end-half}`. + /// + /// This is similar to `maybe_parse_ascii_class` in that, in most cases, + /// if it fails it will just return `None` with no error. This is done + /// because `\b{5}` is a valid expression and we want to let that be parsed + /// by the existing counted repetition parsing code. (I thought about just + /// invoking the counted repetition code from here, but it seemed a little + /// ham-fisted.) + /// + /// Unlike `maybe_parse_ascii_class` though, this can return an error. + /// Namely, if we definitely know it isn't a counted repetition, then we + /// return an error specific to the specialty word boundaries. + /// + /// This assumes the parser is positioned at a `{` immediately following + /// a `\b`. When `None` is returned, the parser is returned to the position + /// at which it started: pointing at a `{`. + /// + /// The position given should correspond to the start of the `\b`. 
+ fn maybe_parse_special_word_boundary( + &self, + wb_start: Position, + ) -> Result<Option<ast::AssertionKind>> { + assert_eq!(self.char(), '{'); + + let is_valid_char = |c| match c { + 'A'..='Z' | 'a'..='z' | '-' => true, + _ => false, + }; + let start = self.pos(); + if !self.bump_and_bump_space() { + return Err(self.error( + Span::new(wb_start, self.pos()), + ast::ErrorKind::SpecialWordOrRepetitionUnexpectedEof, + )); + } + let start_contents = self.pos(); + // This is one of the critical bits: if the first non-whitespace + // character isn't in [-A-Za-z] (i.e., this can't be a special word + // boundary), then we bail and let the counted repetition parser deal + // with this. + if !is_valid_char(self.char()) { + self.parser().pos.set(start); + return Ok(None); + } + + // Now collect up our chars until we see a '}'. + let mut scratch = self.parser().scratch.borrow_mut(); + scratch.clear(); + while !self.is_eof() && is_valid_char(self.char()) { + scratch.push(self.char()); + self.bump_and_bump_space(); + } + if self.is_eof() || self.char() != '}' { + return Err(self.error( + Span::new(start, self.pos()), + ast::ErrorKind::SpecialWordBoundaryUnclosed, + )); + } + let end = self.pos(); + self.bump(); + let kind = match scratch.as_str() { + "start" => ast::AssertionKind::WordBoundaryStart, + "end" => ast::AssertionKind::WordBoundaryEnd, + "start-half" => ast::AssertionKind::WordBoundaryStartHalf, + "end-half" => ast::AssertionKind::WordBoundaryEndHalf, + _ => { + return Err(self.error( + Span::new(start_contents, end), + ast::ErrorKind::SpecialWordBoundaryUnrecognized, + )) + } + }; + Ok(Some(kind)) + } + /// Parse an octal representation of a Unicode codepoint up to 3 digits /// long. This expects the parser to be positioned at the first octal /// digit and advances the parser to the first character immediately @@ -1743,7 +1840,7 @@ impl<'s, P: Borrow<Parser>> ParserI<'s, P> { /// is successful, then the parser is advanced to the position immediately /// following the closing `]`. #[inline(never)] - fn parse_set_class(&self) -> Result<ast::Class> { + fn parse_set_class(&self) -> Result<ast::ClassBracketed> { assert_eq!(self.char(), '['); let mut union = @@ -1967,9 +2064,9 @@ impl<'s, P: Borrow<Parser>> ParserI<'s, P> { // because parsing cannot fail with any interesting error. For example, // in order to use an ASCII character class, it must be enclosed in // double brackets, e.g., `[[:alnum:]]`. Alternatively, you might think - // of it as "ASCII character characters have the syntax `[:NAME:]` - // which can only appear within character brackets." This means that - // things like `[[:lower:]A]` are legal constructs. + // of it as "ASCII character classes have the syntax `[:NAME:]` which + // can only appear within character brackets." This means that things + // like `[[:lower:]A]` are legal constructs. // // However, if one types an incorrect ASCII character class, e.g., // `[[:loower:]]`, then we treat that as a normal nested character @@ -2189,12 +2286,12 @@ impl<'p, 's, P: Borrow<Parser>> ast::Visitor for NestLimiter<'p, 's, P> { | Ast::Literal(_) | Ast::Dot(_) | Ast::Assertion(_) - | Ast::Class(ast::Class::Unicode(_)) - | Ast::Class(ast::Class::Perl(_)) => { + | Ast::ClassUnicode(_) + | Ast::ClassPerl(_) => { // These are all base cases, so we don't increment depth. 
return Ok(()); } - Ast::Class(ast::Class::Bracketed(ref x)) => &x.span, + Ast::ClassBracketed(ref x) => &x.span, Ast::Repetition(ref x) => &x.span, Ast::Group(ref x) => &x.span, Ast::Alternation(ref x) => &x.span, @@ -2210,12 +2307,12 @@ impl<'p, 's, P: Borrow<Parser>> ast::Visitor for NestLimiter<'p, 's, P> { | Ast::Literal(_) | Ast::Dot(_) | Ast::Assertion(_) - | Ast::Class(ast::Class::Unicode(_)) - | Ast::Class(ast::Class::Perl(_)) => { + | Ast::ClassUnicode(_) + | Ast::ClassPerl(_) => { // These are all base cases, so we don't decrement depth. Ok(()) } - Ast::Class(ast::Class::Bracketed(_)) + Ast::ClassBracketed(_) | Ast::Repetition(_) | Ast::Group(_) | Ast::Alternation(_) @@ -2426,12 +2523,12 @@ mod tests { /// Create a meta literal starting at the given position. fn meta_lit(c: char, span: Span) -> Ast { - Ast::Literal(ast::Literal { span, kind: ast::LiteralKind::Meta, c }) + Ast::literal(ast::Literal { span, kind: ast::LiteralKind::Meta, c }) } /// Create a verbatim literal with the given span. fn lit_with(c: char, span: Span) -> Ast { - Ast::Literal(ast::Literal { + Ast::literal(ast::Literal { span, kind: ast::LiteralKind::Verbatim, c, @@ -2445,17 +2542,17 @@ mod tests { /// Create a concatenation with the given span. fn concat_with(span: Span, asts: Vec<Ast>) -> Ast { - Ast::Concat(ast::Concat { span, asts }) + Ast::concat(ast::Concat { span, asts }) } /// Create an alternation with the given span. fn alt(range: Range<usize>, asts: Vec<Ast>) -> Ast { - Ast::Alternation(ast::Alternation { span: span(range), asts }) + Ast::alternation(ast::Alternation { span: span(range), asts }) } /// Create a capturing group with the given span. fn group(range: Range<usize>, index: u32, ast: Ast) -> Ast { - Ast::Group(ast::Group { + Ast::group(ast::Group { span: span(range), kind: ast::GroupKind::CaptureIndex(index), ast: Box::new(ast), @@ -2488,7 +2585,7 @@ mod tests { }, ); } - Ast::Flags(ast::SetFlags { + Ast::flags(ast::SetFlags { span: span_range(pat, range.clone()), flags: ast::Flags { span: span_range(pat, (range.start + 2)..(range.end - 1)), @@ -2502,7 +2599,7 @@ mod tests { // A nest limit of 0 still allows some types of regexes. 
assert_eq!( parser_nest_limit("", 0).parse(), - Ok(Ast::Empty(span(0..0))) + Ok(Ast::empty(span(0..0))) ); assert_eq!(parser_nest_limit("a", 0).parse(), Ok(lit('a', 0))); @@ -2516,7 +2613,7 @@ mod tests { ); assert_eq!( parser_nest_limit("a+", 1).parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..2), op: ast::RepetitionOp { span: span(1..2), @@ -2542,14 +2639,14 @@ mod tests { ); assert_eq!( parser_nest_limit("a+*", 2).parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..3), op: ast::RepetitionOp { span: span(2..3), kind: ast::RepetitionKind::ZeroOrMore, }, greedy: true, - ast: Box::new(Ast::Repetition(ast::Repetition { + ast: Box::new(Ast::repetition(ast::Repetition { span: span(0..2), op: ast::RepetitionOp { span: span(1..2), @@ -2606,7 +2703,7 @@ mod tests { ); assert_eq!( parser_nest_limit("[a]", 1).parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..3), negated: false, kind: ast::ClassSet::Item(ast::ClassSetItem::Literal( @@ -2616,7 +2713,7 @@ mod tests { c: 'a', } )), - }))) + })) ); assert_eq!( parser_nest_limit("[ab]", 1).parse().unwrap_err(), @@ -2776,7 +2873,7 @@ bar vec![ lit_with('a', span_range(pat, 0..1)), lit_with(' ', span_range(pat, 1..2)), - Ast::Group(ast::Group { + Ast::group(ast::Group { span: span_range(pat, 2..9), kind: ast::GroupKind::NonCapturing(ast::Flags { span: span_range(pat, 4..5), @@ -2803,7 +2900,7 @@ bar span_range(pat, 0..pat.len()), vec![ flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), - Ast::Group(ast::Group { + Ast::group(ast::Group { span: span_range(pat, 4..pat.len()), kind: ast::GroupKind::CaptureName { starts_with_p: true, @@ -2825,7 +2922,7 @@ bar span_range(pat, 0..pat.len()), vec![ flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), - Ast::Group(ast::Group { + Ast::group(ast::Group { span: span_range(pat, 4..pat.len()), kind: ast::GroupKind::CaptureIndex(1), ast: Box::new(lit_with('a', span_range(pat, 7..8))), @@ -2840,7 +2937,7 @@ bar span_range(pat, 0..pat.len()), vec![ flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), - Ast::Group(ast::Group { + Ast::group(ast::Group { span: span_range(pat, 4..pat.len()), kind: ast::GroupKind::NonCapturing(ast::Flags { span: span_range(pat, 8..8), @@ -2858,7 +2955,7 @@ bar span_range(pat, 0..pat.len()), vec![ flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), - Ast::Literal(ast::Literal { + Ast::literal(ast::Literal { span: span(4..13), kind: ast::LiteralKind::HexBrace( ast::HexLiteralKind::X @@ -2877,7 +2974,7 @@ bar span_range(pat, 0..pat.len()), vec![ flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), - Ast::Literal(ast::Literal { + Ast::literal(ast::Literal { span: span_range(pat, 4..6), kind: ast::LiteralKind::Superfluous, c: ' ', @@ -2895,9 +2992,9 @@ bar Ok(concat_with( span_range(pat, 0..3), vec![ - Ast::Dot(span_range(pat, 0..1)), + Ast::dot(span_range(pat, 0..1)), lit_with('\n', span_range(pat, 1..2)), - Ast::Dot(span_range(pat, 2..3)), + Ast::dot(span_range(pat, 2..3)), ] )) ); @@ -2933,7 +3030,7 @@ bar fn parse_uncounted_repetition() { assert_eq!( parser(r"a*").parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..2), op: ast::RepetitionOp { span: span(1..2), @@ -2945,7 +3042,7 @@ bar ); assert_eq!( parser(r"a+").parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..2), op: ast::RepetitionOp { span: 
span(1..2), @@ -2958,7 +3055,7 @@ bar assert_eq!( parser(r"a?").parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..2), op: ast::RepetitionOp { span: span(1..2), @@ -2970,7 +3067,7 @@ bar ); assert_eq!( parser(r"a??").parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..3), op: ast::RepetitionOp { span: span(1..3), @@ -2982,7 +3079,7 @@ bar ); assert_eq!( parser(r"a?").parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..2), op: ast::RepetitionOp { span: span(1..2), @@ -2997,7 +3094,7 @@ bar Ok(concat( 0..3, vec![ - Ast::Repetition(ast::Repetition { + Ast::repetition(ast::Repetition { span: span(0..2), op: ast::RepetitionOp { span: span(1..2), @@ -3015,7 +3112,7 @@ bar Ok(concat( 0..4, vec![ - Ast::Repetition(ast::Repetition { + Ast::repetition(ast::Repetition { span: span(0..3), op: ast::RepetitionOp { span: span(1..3), @@ -3034,7 +3131,7 @@ bar 0..3, vec![ lit('a', 0), - Ast::Repetition(ast::Repetition { + Ast::repetition(ast::Repetition { span: span(1..3), op: ast::RepetitionOp { span: span(2..3), @@ -3048,7 +3145,7 @@ bar ); assert_eq!( parser(r"(ab)?").parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..5), op: ast::RepetitionOp { span: span(4..5), @@ -3067,8 +3164,8 @@ bar Ok(alt( 0..3, vec![ - Ast::Empty(span(0..0)), - Ast::Repetition(ast::Repetition { + Ast::empty(span(0..0)), + Ast::repetition(ast::Repetition { span: span(1..3), op: ast::RepetitionOp { span: span(2..3), @@ -3157,7 +3254,7 @@ bar fn parse_counted_repetition() { assert_eq!( parser(r"a{5}").parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..4), op: ast::RepetitionOp { span: span(1..4), @@ -3171,7 +3268,7 @@ bar ); assert_eq!( parser(r"a{5,}").parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..5), op: ast::RepetitionOp { span: span(1..5), @@ -3185,7 +3282,7 @@ bar ); assert_eq!( parser(r"a{5,9}").parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..6), op: ast::RepetitionOp { span: span(1..6), @@ -3199,7 +3296,7 @@ bar ); assert_eq!( parser(r"a{5}?").parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..5), op: ast::RepetitionOp { span: span(1..5), @@ -3217,7 +3314,7 @@ bar 0..5, vec![ lit('a', 0), - Ast::Repetition(ast::Repetition { + Ast::repetition(ast::Repetition { span: span(1..5), op: ast::RepetitionOp { span: span(2..5), @@ -3237,7 +3334,7 @@ bar 0..6, vec![ lit('a', 0), - Ast::Repetition(ast::Repetition { + Ast::repetition(ast::Repetition { span: span(1..5), op: ast::RepetitionOp { span: span(2..5), @@ -3255,7 +3352,7 @@ bar assert_eq!( parser(r"a{ 5 }").parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..6), op: ast::RepetitionOp { span: span(1..6), @@ -3269,7 +3366,7 @@ bar ); assert_eq!( parser(r"a{ 5 , 9 }").parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..10), op: ast::RepetitionOp { span: span(1..10), @@ -3283,7 +3380,7 @@ bar ); assert_eq!( parser_ignore_whitespace(r"a{5,9} ?").parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..8), op: ast::RepetitionOp { span: span(1..8), @@ -3295,6 +3392,23 @@ bar ast: Box::new(lit('a', 0)), })) ); + assert_eq!( + 
parser(r"\b{5,9}").parse(), + Ok(Ast::repetition(ast::Repetition { + span: span(0..7), + op: ast::RepetitionOp { + span: span(2..7), + kind: ast::RepetitionKind::Range( + ast::RepetitionRange::Bounded(5, 9) + ), + }, + greedy: true, + ast: Box::new(Ast::assertion(ast::Assertion { + span: span(0..2), + kind: ast::AssertionKind::WordBoundary, + })), + })) + ); assert_eq!( parser(r"(?i){0}").parse().unwrap_err(), @@ -3414,7 +3528,7 @@ bar fn parse_alternate() { assert_eq!( parser(r"a|b").parse(), - Ok(Ast::Alternation(ast::Alternation { + Ok(Ast::alternation(ast::Alternation { span: span(0..3), asts: vec![lit('a', 0), lit('b', 2)], })) @@ -3424,7 +3538,7 @@ bar Ok(group( 0..5, 1, - Ast::Alternation(ast::Alternation { + Ast::alternation(ast::Alternation { span: span(1..4), asts: vec![lit('a', 1), lit('b', 3)], }) @@ -3433,14 +3547,14 @@ bar assert_eq!( parser(r"a|b|c").parse(), - Ok(Ast::Alternation(ast::Alternation { + Ok(Ast::alternation(ast::Alternation { span: span(0..5), asts: vec![lit('a', 0), lit('b', 2), lit('c', 4)], })) ); assert_eq!( parser(r"ax|by|cz").parse(), - Ok(Ast::Alternation(ast::Alternation { + Ok(Ast::alternation(ast::Alternation { span: span(0..8), asts: vec![ concat(0..2, vec![lit('a', 0), lit('x', 1)]), @@ -3454,7 +3568,7 @@ bar Ok(group( 0..10, 1, - Ast::Alternation(ast::Alternation { + Ast::alternation(ast::Alternation { span: span(1..9), asts: vec![ concat(1..3, vec![lit('a', 1), lit('x', 2)]), @@ -3503,7 +3617,7 @@ bar parser(r"|").parse(), Ok(alt( 0..1, - vec![Ast::Empty(span(0..0)), Ast::Empty(span(1..1)),] + vec![Ast::empty(span(0..0)), Ast::empty(span(1..1)),] )) ); assert_eq!( @@ -3511,19 +3625,19 @@ bar Ok(alt( 0..2, vec![ - Ast::Empty(span(0..0)), - Ast::Empty(span(1..1)), - Ast::Empty(span(2..2)), + Ast::empty(span(0..0)), + Ast::empty(span(1..1)), + Ast::empty(span(2..2)), ] )) ); assert_eq!( parser(r"a|").parse(), - Ok(alt(0..2, vec![lit('a', 0), Ast::Empty(span(2..2)),])) + Ok(alt(0..2, vec![lit('a', 0), Ast::empty(span(2..2)),])) ); assert_eq!( parser(r"|a").parse(), - Ok(alt(0..2, vec![Ast::Empty(span(0..0)), lit('a', 1),])) + Ok(alt(0..2, vec![Ast::empty(span(0..0)), lit('a', 1),])) ); assert_eq!( @@ -3533,7 +3647,7 @@ bar 1, alt( 1..2, - vec![Ast::Empty(span(1..1)), Ast::Empty(span(2..2)),] + vec![Ast::empty(span(1..1)), Ast::empty(span(2..2)),] ) )) ); @@ -3542,7 +3656,7 @@ bar Ok(group( 0..4, 1, - alt(1..3, vec![lit('a', 1), Ast::Empty(span(3..3)),]) + alt(1..3, vec![lit('a', 1), Ast::empty(span(3..3)),]) )) ); assert_eq!( @@ -3550,7 +3664,7 @@ bar Ok(group( 0..4, 1, - alt(1..3, vec![Ast::Empty(span(1..1)), lit('a', 2),]) + alt(1..3, vec![Ast::empty(span(1..1)), lit('a', 2),]) )) ); @@ -3606,7 +3720,7 @@ bar fn parse_group() { assert_eq!( parser("(?i)").parse(), - Ok(Ast::Flags(ast::SetFlags { + Ok(Ast::flags(ast::SetFlags { span: span(0..4), flags: ast::Flags { span: span(2..3), @@ -3621,7 +3735,7 @@ bar ); assert_eq!( parser("(?iU)").parse(), - Ok(Ast::Flags(ast::SetFlags { + Ok(Ast::flags(ast::SetFlags { span: span(0..5), flags: ast::Flags { span: span(2..4), @@ -3644,7 +3758,7 @@ bar ); assert_eq!( parser("(?i-U)").parse(), - Ok(Ast::Flags(ast::SetFlags { + Ok(Ast::flags(ast::SetFlags { span: span(0..6), flags: ast::Flags { span: span(2..5), @@ -3672,15 +3786,15 @@ bar assert_eq!( parser("()").parse(), - Ok(Ast::Group(ast::Group { + Ok(Ast::group(ast::Group { span: span(0..2), kind: ast::GroupKind::CaptureIndex(1), - ast: Box::new(Ast::Empty(span(1..1))), + ast: Box::new(Ast::empty(span(1..1))), })) ); assert_eq!( parser("(a)").parse(), - 
Ok(Ast::Group(ast::Group { + Ok(Ast::group(ast::Group { span: span(0..3), kind: ast::GroupKind::CaptureIndex(1), ast: Box::new(lit('a', 1)), @@ -3688,20 +3802,20 @@ bar ); assert_eq!( parser("(())").parse(), - Ok(Ast::Group(ast::Group { + Ok(Ast::group(ast::Group { span: span(0..4), kind: ast::GroupKind::CaptureIndex(1), - ast: Box::new(Ast::Group(ast::Group { + ast: Box::new(Ast::group(ast::Group { span: span(1..3), kind: ast::GroupKind::CaptureIndex(2), - ast: Box::new(Ast::Empty(span(2..2))), + ast: Box::new(Ast::empty(span(2..2))), })), })) ); assert_eq!( parser("(?:a)").parse(), - Ok(Ast::Group(ast::Group { + Ok(Ast::group(ast::Group { span: span(0..5), kind: ast::GroupKind::NonCapturing(ast::Flags { span: span(2..2), @@ -3713,7 +3827,7 @@ bar assert_eq!( parser("(?i:a)").parse(), - Ok(Ast::Group(ast::Group { + Ok(Ast::group(ast::Group { span: span(0..6), kind: ast::GroupKind::NonCapturing(ast::Flags { span: span(2..3), @@ -3729,7 +3843,7 @@ bar ); assert_eq!( parser("(?i-U:a)").parse(), - Ok(Ast::Group(ast::Group { + Ok(Ast::group(ast::Group { span: span(0..8), kind: ast::GroupKind::NonCapturing(ast::Flags { span: span(2..5), @@ -3818,7 +3932,7 @@ bar fn parse_capture_name() { assert_eq!( parser("(?<a>z)").parse(), - Ok(Ast::Group(ast::Group { + Ok(Ast::group(ast::Group { span: span(0..7), kind: ast::GroupKind::CaptureName { starts_with_p: false, @@ -3833,7 +3947,7 @@ bar ); assert_eq!( parser("(?P<a>z)").parse(), - Ok(Ast::Group(ast::Group { + Ok(Ast::group(ast::Group { span: span(0..8), kind: ast::GroupKind::CaptureName { starts_with_p: true, @@ -3848,7 +3962,7 @@ bar ); assert_eq!( parser("(?P<abc>z)").parse(), - Ok(Ast::Group(ast::Group { + Ok(Ast::group(ast::Group { span: span(0..10), kind: ast::GroupKind::CaptureName { starts_with_p: true, @@ -3864,7 +3978,7 @@ bar assert_eq!( parser("(?P<a_1>z)").parse(), - Ok(Ast::Group(ast::Group { + Ok(Ast::group(ast::Group { span: span(0..10), kind: ast::GroupKind::CaptureName { starts_with_p: true, @@ -3880,7 +3994,7 @@ bar assert_eq!( parser("(?P<a.1>z)").parse(), - Ok(Ast::Group(ast::Group { + Ok(Ast::group(ast::Group { span: span(0..10), kind: ast::GroupKind::CaptureName { starts_with_p: true, @@ -3896,7 +4010,7 @@ bar assert_eq!( parser("(?P<a[1]>z)").parse(), - Ok(Ast::Group(ast::Group { + Ok(Ast::group(ast::Group { span: span(0..11), kind: ast::GroupKind::CaptureName { starts_with_p: true, @@ -3912,7 +4026,7 @@ bar assert_eq!( parser("(?P<a¾>)").parse(), - Ok(Ast::Group(ast::Group { + Ok(Ast::group(ast::Group { span: Span::new( Position::new(0, 1, 1), Position::new(9, 1, 9), @@ -3928,7 +4042,7 @@ bar index: 1, } }, - ast: Box::new(Ast::Empty(Span::new( + ast: Box::new(Ast::empty(Span::new( Position::new(8, 1, 8), Position::new(8, 1, 8), ))), @@ -3936,7 +4050,7 @@ bar ); assert_eq!( parser("(?P<名字>)").parse(), - Ok(Ast::Group(ast::Group { + Ok(Ast::group(ast::Group { span: Span::new( Position::new(0, 1, 1), Position::new(12, 1, 9), @@ -3952,7 +4066,7 @@ bar index: 1, } }, - ast: Box::new(Ast::Empty(Span::new( + ast: Box::new(Ast::empty(Span::new( Position::new(11, 1, 8), Position::new(11, 1, 8), ))), @@ -4382,6 +4496,48 @@ bar })) ); assert_eq!( + parser(r"\b{start}").parse_primitive(), + Ok(Primitive::Assertion(ast::Assertion { + span: span(0..9), + kind: ast::AssertionKind::WordBoundaryStart, + })) + ); + assert_eq!( + parser(r"\b{end}").parse_primitive(), + Ok(Primitive::Assertion(ast::Assertion { + span: span(0..7), + kind: ast::AssertionKind::WordBoundaryEnd, + })) + ); + assert_eq!( + 
parser(r"\b{start-half}").parse_primitive(), + Ok(Primitive::Assertion(ast::Assertion { + span: span(0..14), + kind: ast::AssertionKind::WordBoundaryStartHalf, + })) + ); + assert_eq!( + parser(r"\b{end-half}").parse_primitive(), + Ok(Primitive::Assertion(ast::Assertion { + span: span(0..12), + kind: ast::AssertionKind::WordBoundaryEndHalf, + })) + ); + assert_eq!( + parser(r"\<").parse_primitive(), + Ok(Primitive::Assertion(ast::Assertion { + span: span(0..2), + kind: ast::AssertionKind::WordBoundaryStartAngle, + })) + ); + assert_eq!( + parser(r"\>").parse_primitive(), + Ok(Primitive::Assertion(ast::Assertion { + span: span(0..2), + kind: ast::AssertionKind::WordBoundaryEndAngle, + })) + ); + assert_eq!( parser(r"\B").parse_primitive(), Ok(Primitive::Assertion(ast::Assertion { span: span(0..2), @@ -4418,20 +4574,60 @@ bar kind: ast::ErrorKind::EscapeUnrecognized, } ); - // But also, < and > are banned, so that we may evolve them into - // start/end word boundary assertions. (Not sure if we will...) + + // Starting a special word boundary without any non-whitespace chars + // after the brace makes it ambiguous whether the user meant to write + // a counted repetition (probably not?) or an actual special word + // boundary assertion. assert_eq!( - parser(r"\<").parse_escape().unwrap_err(), + parser(r"\b{").parse_escape().unwrap_err(), TestError { - span: span(0..2), - kind: ast::ErrorKind::EscapeUnrecognized, + span: span(0..3), + kind: ast::ErrorKind::SpecialWordOrRepetitionUnexpectedEof, } ); assert_eq!( - parser(r"\>").parse_escape().unwrap_err(), + parser_ignore_whitespace(r"\b{ ").parse_escape().unwrap_err(), TestError { - span: span(0..2), - kind: ast::ErrorKind::EscapeUnrecognized, + span: span(0..4), + kind: ast::ErrorKind::SpecialWordOrRepetitionUnexpectedEof, + } + ); + // When 'x' is not enabled, the space is seen as a non-[-A-Za-z] char, + // and thus causes the parser to treat it as a counted repetition. + assert_eq!( + parser(r"\b{ ").parse().unwrap_err(), + TestError { + span: span(4..4), + kind: ast::ErrorKind::RepetitionCountDecimalEmpty, + } + ); + // In this case, we got some valid chars that makes it look like the + // user is writing one of the special word boundary assertions, but + // we forget to close the brace. + assert_eq!( + parser(r"\b{foo").parse_escape().unwrap_err(), + TestError { + span: span(2..6), + kind: ast::ErrorKind::SpecialWordBoundaryUnclosed, + } + ); + // We get the same error as above, except it is provoked by seeing a + // char that we know is invalid before seeing a closing brace. + assert_eq!( + parser(r"\b{foo!}").parse_escape().unwrap_err(), + TestError { + span: span(2..6), + kind: ast::ErrorKind::SpecialWordBoundaryUnclosed, + } + ); + // And this one occurs when, syntactically, everything looks okay, but + // we don't use a valid spelling of a word boundary assertion. 
+ assert_eq!( + parser(r"\b{foo}").parse_escape().unwrap_err(), + TestError { + span: span(3..6), + kind: ast::ErrorKind::SpecialWordBoundaryUnrecognized, } ); @@ -4494,15 +4690,15 @@ bar ); assert_eq!( parser_octal(r"\778").parse(), - Ok(Ast::Concat(ast::Concat { + Ok(Ast::concat(ast::Concat { span: span(0..4), asts: vec![ - Ast::Literal(ast::Literal { + Ast::literal(ast::Literal { span: span(0..3), kind: ast::LiteralKind::Octal, c: '?', }), - Ast::Literal(ast::Literal { + Ast::literal(ast::Literal { span: span(3..4), kind: ast::LiteralKind::Verbatim, c: '8', @@ -4512,15 +4708,15 @@ bar ); assert_eq!( parser_octal(r"\7777").parse(), - Ok(Ast::Concat(ast::Concat { + Ok(Ast::concat(ast::Concat { span: span(0..5), asts: vec![ - Ast::Literal(ast::Literal { + Ast::literal(ast::Literal { span: span(0..4), kind: ast::LiteralKind::Octal, c: '\u{01FF}', }), - Ast::Literal(ast::Literal { + Ast::literal(ast::Literal { span: span(4..5), kind: ast::LiteralKind::Verbatim, c: '7', @@ -4965,15 +5161,15 @@ bar assert_eq!( parser("[[:alnum:]]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..11), negated: false, kind: itemset(item_ascii(alnum(span(1..10), false))), - }))) + })) ); assert_eq!( parser("[[[:alnum:]]]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..13), negated: false, kind: itemset(item_bracket(ast::ClassBracketed { @@ -4981,11 +5177,11 @@ bar negated: false, kind: itemset(item_ascii(alnum(span(2..11), false))), })), - }))) + })) ); assert_eq!( parser("[[:alnum:]&&[:lower:]]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..22), negated: false, kind: intersection( @@ -4993,11 +5189,11 @@ bar itemset(item_ascii(alnum(span(1..10), false))), itemset(item_ascii(lower(span(12..21), false))), ), - }))) + })) ); assert_eq!( parser("[[:alnum:]--[:lower:]]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..22), negated: false, kind: difference( @@ -5005,11 +5201,11 @@ bar itemset(item_ascii(alnum(span(1..10), false))), itemset(item_ascii(lower(span(12..21), false))), ), - }))) + })) ); assert_eq!( parser("[[:alnum:]~~[:lower:]]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..22), negated: false, kind: symdifference( @@ -5017,20 +5213,20 @@ bar itemset(item_ascii(alnum(span(1..10), false))), itemset(item_ascii(lower(span(12..21), false))), ), - }))) + })) ); assert_eq!( parser("[a]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..3), negated: false, kind: itemset(lit(span(1..2), 'a')), - }))) + })) ); assert_eq!( parser(r"[a\]]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..5), negated: false, kind: union( @@ -5044,11 +5240,11 @@ bar }), ] ), - }))) + })) ); assert_eq!( parser(r"[a\-z]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..6), negated: false, kind: union( @@ -5063,44 +5259,44 @@ bar lit(span(4..5), 'z'), ] ), - }))) + })) ); assert_eq!( parser("[ab]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + 
Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..4), negated: false, kind: union( span(1..3), vec![lit(span(1..2), 'a'), lit(span(2..3), 'b'),] ), - }))) + })) ); assert_eq!( parser("[a-]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..4), negated: false, kind: union( span(1..3), vec![lit(span(1..2), 'a'), lit(span(2..3), '-'),] ), - }))) + })) ); assert_eq!( parser("[-a]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..4), negated: false, kind: union( span(1..3), vec![lit(span(1..2), '-'), lit(span(2..3), 'a'),] ), - }))) + })) ); assert_eq!( parser(r"[\pL]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..5), negated: false, kind: itemset(item_unicode(ast::ClassUnicode { @@ -5108,11 +5304,11 @@ bar negated: false, kind: ast::ClassUnicodeKind::OneLetter('L'), })), - }))) + })) ); assert_eq!( parser(r"[\w]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..4), negated: false, kind: itemset(item_perl(ast::ClassPerl { @@ -5120,11 +5316,11 @@ bar kind: ast::ClassPerlKind::Word, negated: false, })), - }))) + })) ); assert_eq!( parser(r"[a\wz]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..6), negated: false, kind: union( @@ -5139,20 +5335,20 @@ bar lit(span(4..5), 'z'), ] ), - }))) + })) ); assert_eq!( parser("[a-z]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..5), negated: false, kind: itemset(range(span(1..4), 'a', 'z')), - }))) + })) ); assert_eq!( parser("[a-cx-z]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..8), negated: false, kind: union( @@ -5162,11 +5358,11 @@ bar range(span(4..7), 'x', 'z'), ] ), - }))) + })) ); assert_eq!( parser(r"[\w&&a-cx-z]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..12), negated: false, kind: intersection( @@ -5184,11 +5380,11 @@ bar ] ), ), - }))) + })) ); assert_eq!( parser(r"[a-cx-z&&\w]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..12), negated: false, kind: intersection( @@ -5206,11 +5402,11 @@ bar negated: false, })), ), - }))) + })) ); assert_eq!( parser(r"[a--b--c]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..9), negated: false, kind: difference( @@ -5222,11 +5418,11 @@ bar ), itemset(lit(span(7..8), 'c')), ), - }))) + })) ); assert_eq!( parser(r"[a~~b~~c]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..9), negated: false, kind: symdifference( @@ -5238,11 +5434,11 @@ bar ), itemset(lit(span(7..8), 'c')), ), - }))) + })) ); assert_eq!( parser(r"[\^&&^]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..7), negated: false, kind: intersection( @@ -5254,11 +5450,11 @@ bar })), itemset(lit(span(5..6), '^')), ), - }))) + })) ); assert_eq!( 
parser(r"[\&&&&]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..7), negated: false, kind: intersection( @@ -5270,11 +5466,11 @@ bar })), itemset(lit(span(5..6), '&')), ), - }))) + })) ); assert_eq!( parser(r"[&&&&]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..6), negated: false, kind: intersection( @@ -5286,13 +5482,13 @@ bar ), itemset(empty(span(5..5))), ), - }))) + })) ); let pat = "[☃-⛄]"; assert_eq!( parser(pat).parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span_range(pat, 0..9), negated: false, kind: itemset(ast::ClassSetItem::Range(ast::ClassSetRange { @@ -5308,20 +5504,20 @@ bar c: '⛄', }, })), - }))) + })) ); assert_eq!( parser(r"[]]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..3), negated: false, kind: itemset(lit(span(1..2), ']')), - }))) + })) ); assert_eq!( parser(r"[]\[]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..5), negated: false, kind: union( @@ -5335,14 +5531,14 @@ bar }), ] ), - }))) + })) ); assert_eq!( parser(r"[\[]]").parse(), Ok(concat( 0..5, vec![ - Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ast::class_bracketed(ast::ClassBracketed { span: span(0..4), negated: false, kind: itemset(ast::ClassSetItem::Literal( @@ -5352,8 +5548,8 @@ bar c: '[', } )), - })), - Ast::Literal(ast::Literal { + }), + Ast::literal(ast::Literal { span: span(4..5), kind: ast::LiteralKind::Verbatim, c: ']', @@ -5914,15 +6110,15 @@ bar assert_eq!( parser(r"\pNz").parse(), - Ok(Ast::Concat(ast::Concat { + Ok(Ast::concat(ast::Concat { span: span(0..4), asts: vec![ - Ast::Class(ast::Class::Unicode(ast::ClassUnicode { + Ast::class_unicode(ast::ClassUnicode { span: span(0..3), negated: false, kind: ast::ClassUnicodeKind::OneLetter('N'), - })), - Ast::Literal(ast::Literal { + }), + Ast::literal(ast::Literal { span: span(3..4), kind: ast::LiteralKind::Verbatim, c: 'z', @@ -5932,15 +6128,15 @@ bar ); assert_eq!( parser(r"\p{Greek}z").parse(), - Ok(Ast::Concat(ast::Concat { + Ok(Ast::concat(ast::Concat { span: span(0..10), asts: vec![ - Ast::Class(ast::Class::Unicode(ast::ClassUnicode { + Ast::class_unicode(ast::ClassUnicode { span: span(0..9), negated: false, kind: ast::ClassUnicodeKind::Named(s("Greek")), - })), - Ast::Literal(ast::Literal { + }), + Ast::literal(ast::Literal { span: span(9..10), kind: ast::LiteralKind::Verbatim, c: 'z', @@ -6017,23 +6213,23 @@ bar assert_eq!( parser(r"\d").parse(), - Ok(Ast::Class(ast::Class::Perl(ast::ClassPerl { + Ok(Ast::class_perl(ast::ClassPerl { span: span(0..2), kind: ast::ClassPerlKind::Digit, negated: false, - }))) + })) ); assert_eq!( parser(r"\dz").parse(), - Ok(Ast::Concat(ast::Concat { + Ok(Ast::concat(ast::Concat { span: span(0..3), asts: vec![ - Ast::Class(ast::Class::Perl(ast::ClassPerl { + Ast::class_perl(ast::ClassPerl { span: span(0..2), kind: ast::ClassPerlKind::Digit, negated: false, - })), - Ast::Literal(ast::Literal { + }), + Ast::literal(ast::Literal { span: span(2..3), kind: ast::LiteralKind::Verbatim, c: 'z', diff --git a/vendor/regex-syntax/src/ast/print.rs b/vendor/regex-syntax/src/ast/print.rs index 86a87e143..1ceb3c7fa 100644 --- a/vendor/regex-syntax/src/ast/print.rs +++ 
b/vendor/regex-syntax/src/ast/print.rs @@ -80,27 +80,21 @@ impl<W: fmt::Write> Visitor for Writer<W> { fn visit_pre(&mut self, ast: &Ast) -> fmt::Result { match *ast { Ast::Group(ref x) => self.fmt_group_pre(x), - Ast::Class(ast::Class::Bracketed(ref x)) => { - self.fmt_class_bracketed_pre(x) - } + Ast::ClassBracketed(ref x) => self.fmt_class_bracketed_pre(x), _ => Ok(()), } } fn visit_post(&mut self, ast: &Ast) -> fmt::Result { - use crate::ast::Class; - match *ast { Ast::Empty(_) => Ok(()), Ast::Flags(ref x) => self.fmt_set_flags(x), Ast::Literal(ref x) => self.fmt_literal(x), Ast::Dot(_) => self.wtr.write_str("."), Ast::Assertion(ref x) => self.fmt_assertion(x), - Ast::Class(Class::Perl(ref x)) => self.fmt_class_perl(x), - Ast::Class(Class::Unicode(ref x)) => self.fmt_class_unicode(x), - Ast::Class(Class::Bracketed(ref x)) => { - self.fmt_class_bracketed_post(x) - } + Ast::ClassPerl(ref x) => self.fmt_class_perl(x), + Ast::ClassUnicode(ref x) => self.fmt_class_unicode(x), + Ast::ClassBracketed(ref x) => self.fmt_class_bracketed_post(x), Ast::Repetition(ref x) => self.fmt_repetition(x), Ast::Group(ref x) => self.fmt_group_post(x), Ast::Alternation(_) => Ok(()), @@ -267,6 +261,12 @@ impl<W: fmt::Write> Writer<W> { EndText => self.wtr.write_str(r"\z"), WordBoundary => self.wtr.write_str(r"\b"), NotWordBoundary => self.wtr.write_str(r"\B"), + WordBoundaryStart => self.wtr.write_str(r"\b{start}"), + WordBoundaryEnd => self.wtr.write_str(r"\b{end}"), + WordBoundaryStartAngle => self.wtr.write_str(r"\<"), + WordBoundaryEndAngle => self.wtr.write_str(r"\>"), + WordBoundaryStartHalf => self.wtr.write_str(r"\b{start-half}"), + WordBoundaryEndHalf => self.wtr.write_str(r"\b{end-half}"), } } diff --git a/vendor/regex-syntax/src/ast/visitor.rs b/vendor/regex-syntax/src/ast/visitor.rs index ab136739e..c1bb24d97 100644 --- a/vendor/regex-syntax/src/ast/visitor.rs +++ b/vendor/regex-syntax/src/ast/visitor.rs @@ -48,6 +48,11 @@ pub trait Visitor { Ok(()) } + /// This method is called between child nodes of a concatenation. + fn visit_concat_in(&mut self) -> Result<(), Self::Err> { + Ok(()) + } + /// This method is called on every [`ClassSetItem`](ast::ClassSetItem) /// before descending into child nodes. fn visit_class_set_item_pre( @@ -228,8 +233,14 @@ impl<'a> HeapVisitor<'a> { // If this is a concat/alternate, then we might have additional // inductive steps to process. if let Some(x) = self.pop(frame) { - if let Frame::Alternation { .. } = x { - visitor.visit_alternation_in()?; + match x { + Frame::Alternation { .. } => { + visitor.visit_alternation_in()?; + } + Frame::Concat { .. } => { + visitor.visit_concat_in()?; + } + _ => {} } ast = x.child(); self.stack.push((post_ast, x)); @@ -253,7 +264,7 @@ impl<'a> HeapVisitor<'a> { visitor: &mut V, ) -> Result<Option<Frame<'a>>, V::Err> { Ok(match *ast { - Ast::Class(ast::Class::Bracketed(ref x)) => { + Ast::ClassBracketed(ref x) => { self.visit_class(x, visitor)?; None } diff --git a/vendor/regex-syntax/src/hir/literal.rs b/vendor/regex-syntax/src/hir/literal.rs index bd3a2d143..a5a3737f6 100644 --- a/vendor/regex-syntax/src/hir/literal.rs +++ b/vendor/regex-syntax/src/hir/literal.rs @@ -23,7 +23,7 @@ effective literal optimizations: to lead to substring search that is only a little faster than a regex search, and thus the overhead of using literal optimizations in the first place might make things slower overall. -* The literals in your [`Seq`] shoudn't be too short. In general, longer is +* The literals in your [`Seq`] shouldn't be too short. 
In general, longer is better. A sequence corresponding to single bytes that occur frequently in the haystack, for example, is probably a bad literal optimization because it's likely to produce many false positive candidates. Longer literals are less @@ -51,7 +51,7 @@ the "trickier" parts are how to combine literal sequences, and that is all implemented on [`Seq`]. */ -use core::{cmp, mem}; +use core::{cmp, mem, num::NonZeroUsize}; use alloc::{vec, vec::Vec}; @@ -477,7 +477,7 @@ impl Extractor { } seq } - hir::Repetition { min, max: Some(max), .. } if min < max => { + hir::Repetition { min, .. } => { assert!(min > 0); // handled above let limit = u32::try_from(self.limit_repeat).unwrap_or(u32::MAX); @@ -491,10 +491,6 @@ impl Extractor { seq.make_inexact(); seq } - hir::Repetition { .. } => { - subseq.make_inexact(); - subseq - } } } @@ -692,7 +688,7 @@ impl Default for ExtractKind { /// from making assumptions about what literals are required in order to match /// a particular [`Hir`] expression. Generally speaking, when a set is in this /// state, literal optimizations are inhibited. A good example of a regex that -/// will cause this sort of set to apppear is `[A-Za-z]`. The character class +/// will cause this sort of set to appear is `[A-Za-z]`. The character class /// is just too big (and also too narrow) to be usefully expanded into 52 /// different literals. (Note that the decision for when a seq should become /// infinite is determined by the caller. A seq itself has no hard-coded @@ -1571,7 +1567,7 @@ impl Seq { /// unioning `self` with `other`. If either set is infinite, then this /// returns `None`. #[inline] - fn max_union_len(&self, other: &Seq) -> Option<usize> { + pub fn max_union_len(&self, other: &Seq) -> Option<usize> { let len1 = self.len()?; let len2 = other.len()?; Some(len1.saturating_add(len2)) @@ -1581,7 +1577,7 @@ impl Seq { /// cross product of `self` with `other`. If either set is infinite, then /// this returns `None`. #[inline] - fn max_cross_len(&self, other: &Seq) -> Option<usize> { + pub fn max_cross_len(&self, other: &Seq) -> Option<usize> { let len1 = self.len()?; let len2 = other.len()?; Some(len1.saturating_mul(len2)) @@ -1841,6 +1837,14 @@ impl Seq { None => return, Some(len) => len, }; + // Just give up now if our sequence contains an empty string. + if self.min_literal_len().map_or(false, |len| len == 0) { + // We squash the sequence so that nobody else gets any bright + // ideas to try and use it. An empty string implies a match at + // every position. A prefilter cannot help you here. + self.make_infinite(); + return; + } // Make sure we start with the smallest sequence possible. We use a // special version of preference minimization that retains exactness. // This is legal because optimization is only expected to occur once @@ -1910,34 +1914,41 @@ impl Seq { // longest common prefix to be subject to the poison check. } } - // Everything below this check is more-or-less about trying to - // heuristically reduce the false positive rate of a prefilter. But - // if our sequence is completely exact, then it's possible the regex - // engine can be skipped entirely. In this case, the false positive - // rate is zero because every literal match corresponds to a regex - // match. + // If we have an exact sequence, we *probably* just want to keep it + // as-is. But there are some cases where we don't. So we save a copy of + // the exact sequence now, and then try to do some more optimizations + // below. 
If those don't work out, we go back to this exact sequence. // - // This is OK even if the sequence contains a poison literal. Remember, - // a literal is only poisononous because of what we assume about its - // impact on the false positive rate. However, we do still check for - // an empty string. Empty strings are weird and it's best to let the - // regex engine handle those. + // The specific motivation for this is that we sometimes wind up with + // an exact sequence with a hefty number of literals. Say, 100. If we + // stuck with that, it would be too big for Teddy and would result in + // using Aho-Corasick. Which is fine... but the lazy DFA is plenty + // suitable in such cases. The real issue is that we will wind up not + // using a fast prefilter at all. So in cases like this, even though + // we have an exact sequence, it would be better to try and shrink the + // sequence (which we do below) and use it as a prefilter that can + // produce false positive matches. // - // We do currently do this check after the longest common prefix (or - // suffix) check, under the theory that single-substring search is so - // fast that we want that even if we'd end up turning an exact sequence - // into an inexact one. But this might be wrong... - if self.is_exact() - && self.min_literal_len().map_or(false, |len| len > 0) - { - return; - } + // But if the shrinking below results in a sequence that "sucks," then + // we don't want to use that because we already have an exact sequence + // in hand. + let exact: Option<Seq> = + if self.is_exact() { Some(self.clone()) } else { None }; // Now we attempt to shorten the sequence. The idea here is that we // don't want to look for too many literals, but we want to shorten // our sequence enough to improve our odds of using better algorithms // downstream (such as Teddy). + // + // The pair of numbers in this list corresponds to the maximal prefix + // (in bytes) to keep for all literals and the length of the sequence + // at which to do it. + // + // So for example, the pair (3, 500) would mean, "if we have more than + // 500 literals in our sequence, then truncate all of our literals + // such that they are at most 3 bytes in length and the minimize the + // sequence." const ATTEMPTS: [(usize, usize); 5] = - [(5, 64), (4, 64), (3, 64), (2, 64), (1, 10)]; + [(5, 10), (4, 10), (3, 64), (2, 64), (1, 10)]; for (keep, limit) in ATTEMPTS { let len = match self.len() { None => break, @@ -1951,7 +1962,11 @@ impl Seq { } else { self.keep_last_bytes(keep); } - self.minimize_by_preference(); + if prefix { + if let Some(ref mut lits) = self.literals { + PreferenceTrie::minimize(lits, true); + } + } } // Check for a poison literal. A poison literal is one that is short // and is believed to have a very high match count. These poisons @@ -1968,6 +1983,30 @@ impl Seq { self.make_infinite(); } } + // OK, if we had an exact sequence before attempting more optimizations + // above and our post-optimized sequence sucks for some reason or + // another, then we go back to the exact sequence. + if let Some(exact) = exact { + // If optimizing resulted in dropping our literals, then certainly + // backup and use the exact sequence that we had. + if !self.is_finite() { + *self = exact; + return; + } + // If our optimized sequence contains a short literal, then it's + // *probably* not so great. So throw it away and revert to the + // exact sequence. 
+ if self.min_literal_len().map_or(true, |len| len <= 2) { + *self = exact; + return; + } + // Finally, if our optimized sequence is "big" (i.e., can't use + // Teddy), then also don't use it and rely on the exact sequence. + if self.len().map_or(true, |len| len > 64) { + *self = exact; + return; + } + } } } @@ -1977,7 +2016,7 @@ impl core::fmt::Debug for Seq { if let Some(lits) = self.literals() { f.debug_list().entries(lits.iter()).finish() } else { - write!(f, "[∅]") + write!(f, "[∞]") } } } @@ -2160,12 +2199,19 @@ impl core::fmt::Debug for Literal { /// never seen this show up on a profile. Because of the heuristic limits /// imposed on literal extractions, the size of the inputs here is usually /// very small.) -#[derive(Debug, Default)] +#[derive(Debug)] struct PreferenceTrie { /// The states in this trie. The index of a state in this vector is its ID. states: Vec<State>, + /// This vec indicates which states are match states. It always has + /// the same length as `states` and is indexed by the same state ID. + /// A state with identifier `sid` is a match state if and only if + /// `matches[sid].is_some()`. The option contains the index of the literal + /// corresponding to the match. The index is offset by 1 so that it fits in + /// a NonZeroUsize. + matches: Vec<Option<NonZeroUsize>>, /// The index to allocate to the next literal added to this trie. Starts at - /// 0 and increments by 1 for every literal successfully added to the trie. + /// 1 and increments by 1 for every literal successfully added to the trie. next_literal_index: usize, } @@ -2176,9 +2222,6 @@ struct State { /// are sorted by byte. There is at most one such transition for any /// particular byte. trans: Vec<(u8, usize)>, - /// Whether this is a matching state or not. If it is, then it contains the - /// index to the matching literal. - literal_index: Option<usize>, } impl PreferenceTrie { @@ -2192,20 +2235,19 @@ impl PreferenceTrie { /// after them and because any removed literals are guaranteed to never /// match. fn minimize(literals: &mut Vec<Literal>, keep_exact: bool) { - use core::cell::RefCell; - - // MSRV(1.61): Use retain_mut here to avoid interior mutability. - let trie = RefCell::new(PreferenceTrie::default()); + let mut trie = PreferenceTrie { + states: vec![], + matches: vec![], + next_literal_index: 1, + }; let mut make_inexact = vec![]; - literals.retain(|lit| { - match trie.borrow_mut().insert(lit.as_bytes()) { - Ok(_) => true, - Err(i) => { - if !keep_exact { - make_inexact.push(i); - } - false + literals.retain_mut(|lit| match trie.insert(lit.as_bytes()) { + Ok(_) => true, + Err(i) => { + if !keep_exact { + make_inexact.push(i.checked_sub(1).unwrap()); } + false } }); for i in make_inexact { @@ -2225,15 +2267,15 @@ impl PreferenceTrie { /// search. 
fn insert(&mut self, bytes: &[u8]) -> Result<usize, usize> { let mut prev = self.root(); - if let Some(idx) = self.states[prev].literal_index { - return Err(idx); + if let Some(idx) = self.matches[prev] { + return Err(idx.get()); } for &b in bytes.iter() { match self.states[prev].trans.binary_search_by_key(&b, |t| t.0) { Ok(i) => { prev = self.states[prev].trans[i].1; - if let Some(idx) = self.states[prev].literal_index { - return Err(idx); + if let Some(idx) = self.matches[prev] { + return Err(idx.get()); } } Err(i) => { @@ -2245,7 +2287,7 @@ impl PreferenceTrie { } let idx = self.next_literal_index; self.next_literal_index += 1; - self.states[prev].literal_index = Some(idx); + self.matches[prev] = NonZeroUsize::new(idx); Ok(idx) } @@ -2262,6 +2304,7 @@ impl PreferenceTrie { fn create_state(&mut self) -> usize { let id = self.states.len(); self.states.push(State::default()); + self.matches.push(None); id } } @@ -2603,6 +2646,12 @@ mod tests { ]), e(r"(ab|cd)(ef|gh)(ij|kl)") ); + + assert_eq!(inexact([E("abab")], [E("abab")]), e(r"(ab){2}")); + + assert_eq!(inexact([I("abab")], [I("abab")]), e(r"(ab){2,3}")); + + assert_eq!(inexact([I("abab")], [I("abab")]), e(r"(ab){2,}")); } #[test] @@ -2815,13 +2864,13 @@ mod tests { // repeats. #[test] fn crazy_repeats() { - assert_eq!(inexact([I("")], [I("")]), e(r"(?:){4294967295}")); + assert_eq!(inexact([E("")], [E("")]), e(r"(?:){4294967295}")); assert_eq!( - inexact([I("")], [I("")]), + inexact([E("")], [E("")]), e(r"(?:){64}{64}{64}{64}{64}{64}") ); - assert_eq!(inexact([I("")], [I("")]), e(r"x{0}{4294967295}")); - assert_eq!(inexact([I("")], [I("")]), e(r"(?:|){4294967295}")); + assert_eq!(inexact([E("")], [E("")]), e(r"x{0}{4294967295}")); + assert_eq!(inexact([E("")], [E("")]), e(r"(?:|){4294967295}")); assert_eq!( inexact([E("")], [E("")]), diff --git a/vendor/regex-syntax/src/hir/mod.rs b/vendor/regex-syntax/src/hir/mod.rs index 062d4dcab..ce38ead7b 100644 --- a/vendor/regex-syntax/src/hir/mod.rs +++ b/vendor/regex-syntax/src/hir/mod.rs @@ -88,6 +88,9 @@ pub enum ErrorKind { /// This error occurs when translating a pattern that could match a byte /// sequence that isn't UTF-8 and `utf8` was enabled. InvalidUtf8, + /// This error occurs when one uses a non-ASCII byte for a line terminator, + /// but where Unicode mode is enabled and UTF-8 mode is disabled. + InvalidLineTerminator, /// This occurs when an unrecognized Unicode property name could not /// be found. UnicodePropertyNotFound, @@ -120,6 +123,7 @@ impl core::fmt::Display for ErrorKind { let msg = match *self { UnicodeNotAllowed => "Unicode not allowed here", InvalidUtf8 => "pattern can match invalid UTF-8", + InvalidLineTerminator => "invalid line terminator, must be ASCII", UnicodePropertyNotFound => "Unicode property not found", UnicodePropertyValueNotFound => "Unicode property value not found", UnicodePerlClassNotFound => { @@ -180,7 +184,7 @@ impl core::fmt::Display for ErrorKind { /// matches. /// /// For empty matches, those can occur at any position. It is the -/// repsonsibility of the regex engine to determine whether empty matches are +/// responsibility of the regex engine to determine whether empty matches are /// permitted between the code units of a single codepoint. /// /// # Stack space @@ -355,7 +359,13 @@ impl Hir { /// Creates a repetition HIR expression. 
#[inline] - pub fn repetition(rep: Repetition) -> Hir { + pub fn repetition(mut rep: Repetition) -> Hir { + // If the sub-expression of a repetition can only match the empty + // string, then we force its maximum to be at most 1. + if rep.sub.properties().maximum_len() == Some(0) { + rep.min = cmp::min(rep.min, 1); + rep.max = rep.max.map(|n| cmp::min(n, 1)).or(Some(1)); + } // The regex 'a{0}' is always equivalent to the empty regex. This is // true even when 'a' is an expression that never matches anything // (like '\P{any}'). @@ -547,7 +557,7 @@ impl Hir { // We rebuild the alternation by simplifying it. We proceed similarly // as the concatenation case. But in this case, there's no literal // simplification happening. We're just flattening alternations. - let mut new = vec![]; + let mut new = Vec::with_capacity(subs.len()); for sub in subs { let (kind, props) = sub.into_parts(); match kind { @@ -642,6 +652,12 @@ impl Hir { cls.push(ClassBytesRange::new(b'\0', b'\xFF')); Hir::class(Class::Bytes(cls)) } + Dot::AnyCharExcept(ch) => { + let mut cls = + ClassUnicode::new([ClassUnicodeRange::new(ch, ch)]); + cls.negate(); + Hir::class(Class::Unicode(cls)) + } Dot::AnyCharExceptLF => { let mut cls = ClassUnicode::empty(); cls.push(ClassUnicodeRange::new('\0', '\x09')); @@ -655,6 +671,12 @@ impl Hir { cls.push(ClassUnicodeRange::new('\x0E', '\u{10FFFF}')); Hir::class(Class::Unicode(cls)) } + Dot::AnyByteExcept(byte) => { + let mut cls = + ClassBytes::new([ClassBytesRange::new(byte, byte)]); + cls.negate(); + Hir::class(Class::Bytes(cls)) + } Dot::AnyByteExceptLF => { let mut cls = ClassBytes::empty(); cls.push(ClassBytesRange::new(b'\0', b'\x09')); @@ -775,13 +797,18 @@ impl core::fmt::Debug for Literal { /// The high-level intermediate representation of a character class. /// /// A character class corresponds to a set of characters. A character is either -/// defined by a Unicode scalar value or a byte. Unicode characters are used -/// by default, while bytes are used when Unicode mode (via the `u` flag) is -/// disabled. +/// defined by a Unicode scalar value or a byte. /// /// A character class, regardless of its character type, is represented by a /// sequence of non-overlapping non-adjacent ranges of characters. /// +/// There are no guarantees about which class variant is used. Generally +/// speaking, the Unicode variat is used whenever a class needs to contain +/// non-ASCII Unicode scalar values. But the Unicode variant can be used even +/// when Unicode mode is disabled. For example, at the time of writing, the +/// regex `(?-u:a|\xc2\xa0)` will compile down to HIR for the Unicode class +/// `[a\u00A0]` due to optimizations. +/// /// Note that `Bytes` variant may be produced even when it exclusively matches /// valid UTF-8. This is because a `Bytes` variant represents an intention by /// the author of the regular expression to disable Unicode mode, which in turn @@ -1304,8 +1331,9 @@ impl ClassUnicodeRange { } } -/// A set of characters represented by arbitrary bytes (where one byte -/// corresponds to one character). +/// A set of characters represented by arbitrary bytes. +/// +/// Each byte corresponds to one character. #[derive(Clone, Debug, Eq, PartialEq)] pub struct ClassBytes { set: IntervalSet<ClassBytesRange>, @@ -1607,6 +1635,42 @@ pub enum Look { WordUnicode = 1 << 8, /// Match a Unicode-aware negation of a word boundary. WordUnicodeNegate = 1 << 9, + /// Match the start of an ASCII-only word boundary. 
That is, this matches a + /// position at either the beginning of the haystack or where the previous + /// character is not a word character and the following character is a word + /// character. + WordStartAscii = 1 << 10, + /// Match the end of an ASCII-only word boundary. That is, this matches + /// a position at either the end of the haystack or where the previous + /// character is a word character and the following character is not a word + /// character. + WordEndAscii = 1 << 11, + /// Match the start of a Unicode word boundary. That is, this matches a + /// position at either the beginning of the haystack or where the previous + /// character is not a word character and the following character is a word + /// character. + WordStartUnicode = 1 << 12, + /// Match the end of a Unicode word boundary. That is, this matches a + /// position at either the end of the haystack or where the previous + /// character is a word character and the following character is not a word + /// character. + WordEndUnicode = 1 << 13, + /// Match the start half of an ASCII-only word boundary. That is, this + /// matches a position at either the beginning of the haystack or where the + /// previous character is not a word character. + WordStartHalfAscii = 1 << 14, + /// Match the end half of an ASCII-only word boundary. That is, this + /// matches a position at either the end of the haystack or where the + /// following character is not a word character. + WordEndHalfAscii = 1 << 15, + /// Match the start half of a Unicode word boundary. That is, this matches + /// a position at either the beginning of the haystack or where the + /// previous character is not a word character. + WordStartHalfUnicode = 1 << 16, + /// Match the end half of a Unicode word boundary. That is, this matches + /// a position at either the end of the haystack or where the following + /// character is not a word character. + WordEndHalfUnicode = 1 << 17, } impl Look { @@ -1628,6 +1692,14 @@ impl Look { Look::WordAsciiNegate => Look::WordAsciiNegate, Look::WordUnicode => Look::WordUnicode, Look::WordUnicodeNegate => Look::WordUnicodeNegate, + Look::WordStartAscii => Look::WordEndAscii, + Look::WordEndAscii => Look::WordStartAscii, + Look::WordStartUnicode => Look::WordEndUnicode, + Look::WordEndUnicode => Look::WordStartUnicode, + Look::WordStartHalfAscii => Look::WordEndHalfAscii, + Look::WordEndHalfAscii => Look::WordStartHalfAscii, + Look::WordStartHalfUnicode => Look::WordEndHalfUnicode, + Look::WordEndHalfUnicode => Look::WordStartHalfUnicode, } } @@ -1636,28 +1708,36 @@ impl Look { /// constructor is guaranteed to return the same look-around variant that /// one started with within a semver compatible release of this crate. #[inline] - pub const fn as_repr(self) -> u16 { + pub const fn as_repr(self) -> u32 { // AFAIK, 'as' is the only way to zero-cost convert an int enum to an // actual int. - self as u16 + self as u32 } /// Given the underlying representation of a `Look` value, return the /// corresponding `Look` value if the representation is valid. Otherwise /// `None` is returned. 
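A quick sketch of the widened representation and the new variants in use, assuming the patched regex-syntax API:

use regex_syntax::hir::Look;

fn main() {
    // The integer representation is now 32 bits wide, and it round-trips.
    let repr: u32 = Look::WordStartHalfUnicode.as_repr();
    assert_eq!(Some(Look::WordStartHalfUnicode), Look::from_repr(repr));
    // reversed() maps the new start variants to their end counterparts.
    assert_eq!(Look::WordEndAscii, Look::WordStartAscii.reversed());
}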
#[inline] - pub const fn from_repr(repr: u16) -> Option<Look> { + pub const fn from_repr(repr: u32) -> Option<Look> { match repr { - 0b00_0000_0001 => Some(Look::Start), - 0b00_0000_0010 => Some(Look::End), - 0b00_0000_0100 => Some(Look::StartLF), - 0b00_0000_1000 => Some(Look::EndLF), - 0b00_0001_0000 => Some(Look::StartCRLF), - 0b00_0010_0000 => Some(Look::EndCRLF), - 0b00_0100_0000 => Some(Look::WordAscii), - 0b00_1000_0000 => Some(Look::WordAsciiNegate), - 0b01_0000_0000 => Some(Look::WordUnicode), - 0b10_0000_0000 => Some(Look::WordUnicodeNegate), + 0b00_0000_0000_0000_0001 => Some(Look::Start), + 0b00_0000_0000_0000_0010 => Some(Look::End), + 0b00_0000_0000_0000_0100 => Some(Look::StartLF), + 0b00_0000_0000_0000_1000 => Some(Look::EndLF), + 0b00_0000_0000_0001_0000 => Some(Look::StartCRLF), + 0b00_0000_0000_0010_0000 => Some(Look::EndCRLF), + 0b00_0000_0000_0100_0000 => Some(Look::WordAscii), + 0b00_0000_0000_1000_0000 => Some(Look::WordAsciiNegate), + 0b00_0000_0001_0000_0000 => Some(Look::WordUnicode), + 0b00_0000_0010_0000_0000 => Some(Look::WordUnicodeNegate), + 0b00_0000_0100_0000_0000 => Some(Look::WordStartAscii), + 0b00_0000_1000_0000_0000 => Some(Look::WordEndAscii), + 0b00_0001_0000_0000_0000 => Some(Look::WordStartUnicode), + 0b00_0010_0000_0000_0000 => Some(Look::WordEndUnicode), + 0b00_0100_0000_0000_0000 => Some(Look::WordStartHalfAscii), + 0b00_1000_0000_0000_0000 => Some(Look::WordEndHalfAscii), + 0b01_0000_0000_0000_0000 => Some(Look::WordStartHalfUnicode), + 0b10_0000_0000_0000_0000 => Some(Look::WordEndHalfUnicode), _ => None, } } @@ -1682,6 +1762,14 @@ impl Look { Look::WordAsciiNegate => 'B', Look::WordUnicode => '𝛃', Look::WordUnicodeNegate => '𝚩', + Look::WordStartAscii => '<', + Look::WordEndAscii => '>', + Look::WordStartUnicode => '〈', + Look::WordEndUnicode => '〉', + Look::WordStartHalfAscii => '◁', + Look::WordEndHalfAscii => '▷', + Look::WordStartHalfUnicode => '◀', + Look::WordEndHalfUnicode => '▶', } } } @@ -1766,6 +1854,18 @@ pub enum Dot { /// /// This is equivalent to `(?s-u:.)` and also `(?-u:[\x00-\xFF])`. AnyByte, + /// Matches the UTF-8 encoding of any Unicode scalar value except for the + /// `char` given. + /// + /// This is equivalent to using `(?u-s:.)` with the line terminator set + /// to a particular ASCII byte. (Because of peculiarities in the regex + /// engines, a line terminator must be a single byte. It follows that when + /// UTF-8 mode is enabled, this single byte must also be a Unicode scalar + /// value. That is, ti must be ASCII.) + /// + /// (This and `AnyCharExceptLF` both exist because of legacy reasons. + /// `AnyCharExceptLF` will be dropped in the next breaking change release.) + AnyCharExcept(char), /// Matches the UTF-8 encoding of any Unicode scalar value except for `\n`. /// /// This is equivalent to `(?u-s:.)` and also `[\p{any}--\n]`. @@ -1775,6 +1875,17 @@ pub enum Dot { /// /// This is equivalent to `(?uR-s:.)` and also `[\p{any}--\r\n]`. AnyCharExceptCRLF, + /// Matches any byte value except for the `u8` given. + /// + /// This is equivalent to using `(?-us:.)` with the line terminator set + /// to a particular ASCII byte. (Because of peculiarities in the regex + /// engines, a line terminator must be a single byte. It follows that when + /// UTF-8 mode is enabled, this single byte must also be a Unicode scalar + /// value. That is, ti must be ASCII.) + /// + /// (This and `AnyByteExceptLF` both exist because of legacy reasons. + /// `AnyByteExceptLF` will be dropped in the next breaking change release.) 
+ AnyByteExcept(u8), /// Matches any byte value except for `\n`. /// /// This is equivalent to `(?-su:.)` and also `(?-u:[[\x00-\xFF]--\n])`. @@ -2410,10 +2521,10 @@ impl Properties { inner.look_set_prefix = p.look_set_prefix(); inner.look_set_suffix = p.look_set_suffix(); } - // If the static captures len of the sub-expression is not known or is - // zero, then it automatically propagates to the repetition, regardless - // of the repetition. Otherwise, it might change, but only when the - // repetition can match 0 times. + // If the static captures len of the sub-expression is not known or + // is greater than zero, then it automatically propagates to the + // repetition, regardless of the repetition. Otherwise, it might + // change, but only when the repetition can match 0 times. if rep.min == 0 && inner.static_explicit_captures_len.map_or(false, |len| len > 0) { @@ -2549,7 +2660,7 @@ pub struct LookSet { /// range of `u16` values to be represented. For example, even if the /// current implementation only makes use of the 10 least significant bits, /// it may use more bits in a future semver compatible release. - pub bits: u16, + pub bits: u32, } impl LookSet { @@ -2652,13 +2763,22 @@ impl LookSet { pub fn contains_word_unicode(self) -> bool { self.contains(Look::WordUnicode) || self.contains(Look::WordUnicodeNegate) + || self.contains(Look::WordStartUnicode) + || self.contains(Look::WordEndUnicode) + || self.contains(Look::WordStartHalfUnicode) + || self.contains(Look::WordEndHalfUnicode) } /// Returns true if and only if this set contains any ASCII word boundary /// or negated ASCII word boundary assertions. #[inline] pub fn contains_word_ascii(self) -> bool { - self.contains(Look::WordAscii) || self.contains(Look::WordAsciiNegate) + self.contains(Look::WordAscii) + || self.contains(Look::WordAsciiNegate) + || self.contains(Look::WordStartAscii) + || self.contains(Look::WordEndAscii) + || self.contains(Look::WordStartHalfAscii) + || self.contains(Look::WordEndHalfAscii) } /// Returns an iterator over all of the look-around assertions in this set. @@ -2737,29 +2857,31 @@ impl LookSet { *self = self.intersect(other); } - /// Return a `LookSet` from the slice given as a native endian 16-bit + /// Return a `LookSet` from the slice given as a native endian 32-bit /// integer. /// /// # Panics /// - /// This panics if `slice.len() < 2`. + /// This panics if `slice.len() < 4`. #[inline] pub fn read_repr(slice: &[u8]) -> LookSet { - let bits = u16::from_ne_bytes(slice[..2].try_into().unwrap()); + let bits = u32::from_ne_bytes(slice[..4].try_into().unwrap()); LookSet { bits } } - /// Write a `LookSet` as a native endian 16-bit integer to the beginning + /// Write a `LookSet` as a native endian 32-bit integer to the beginning /// of the slice given. /// /// # Panics /// - /// This panics if `slice.len() < 2`. + /// This panics if `slice.len() < 4`. #[inline] pub fn write_repr(self, slice: &mut [u8]) { let raw = self.bits.to_ne_bytes(); slice[0] = raw[0]; slice[1] = raw[1]; + slice[2] = raw[2]; + slice[3] = raw[3]; } } @@ -2792,9 +2914,9 @@ impl Iterator for LookSetIter { return None; } // We'll never have more than u8::MAX distinct look-around assertions, - // so 'repr' will always fit into a u16. - let repr = u16::try_from(self.set.bits.trailing_zeros()).unwrap(); - let look = Look::from_repr(1 << repr)?; + // so 'bit' will always fit into a u16. 
+ let bit = u16::try_from(self.set.bits.trailing_zeros()).unwrap(); + let look = Look::from_repr(1 << bit)?; self.set = self.set.remove(look); Some(look) } @@ -3716,7 +3838,7 @@ mod tests { assert_eq!(0, set.iter().count()); let set = LookSet::full(); - assert_eq!(10, set.iter().count()); + assert_eq!(18, set.iter().count()); let set = LookSet::empty().insert(Look::StartLF).insert(Look::WordUnicode); @@ -3734,6 +3856,6 @@ mod tests { let res = format!("{:?}", LookSet::empty()); assert_eq!("∅", res); let res = format!("{:?}", LookSet::full()); - assert_eq!("Az^$rRbB𝛃𝚩", res); + assert_eq!("Az^$rRbB𝛃𝚩<>〈〉◁▷◀▶", res); } } diff --git a/vendor/regex-syntax/src/hir/print.rs b/vendor/regex-syntax/src/hir/print.rs index fcb7cd252..dfa6d4032 100644 --- a/vendor/regex-syntax/src/hir/print.rs +++ b/vendor/regex-syntax/src/hir/print.rs @@ -89,9 +89,16 @@ impl<W: fmt::Write> Visitor for Writer<W> { fn visit_pre(&mut self, hir: &Hir) -> fmt::Result { match *hir.kind() { - // Empty is represented by nothing in the concrete syntax, and - // repetition operators are strictly suffix oriented. - HirKind::Empty | HirKind::Repetition(_) => {} + HirKind::Empty => { + // Technically an empty sub-expression could be "printed" by + // just ignoring it, but in practice, you could have a + // repetition operator attached to an empty expression, and you + // really need something in the concrete syntax to make that + // work as you'd expect. + self.wtr.write_str(r"(?:)")?; + } + // Repetition operators are strictly suffix oriented. + HirKind::Repetition(_) => {} HirKind::Literal(hir::Literal(ref bytes)) => { // See the comment on the 'Concat' and 'Alternation' case below // for why we put parens here. Literals are, conceptually, @@ -195,6 +202,30 @@ impl<W: fmt::Write> Visitor for Writer<W> { hir::Look::WordUnicodeNegate => { self.wtr.write_str(r"\B")?; } + hir::Look::WordStartAscii => { + self.wtr.write_str(r"(?-u:\b{start})")?; + } + hir::Look::WordEndAscii => { + self.wtr.write_str(r"(?-u:\b{end})")?; + } + hir::Look::WordStartUnicode => { + self.wtr.write_str(r"\b{start}")?; + } + hir::Look::WordEndUnicode => { + self.wtr.write_str(r"\b{end}")?; + } + hir::Look::WordStartHalfAscii => { + self.wtr.write_str(r"(?-u:\b{start-half})")?; + } + hir::Look::WordEndHalfAscii => { + self.wtr.write_str(r"(?-u:\b{end-half})")?; + } + hir::Look::WordStartHalfUnicode => { + self.wtr.write_str(r"\b{start-half}")?; + } + hir::Look::WordEndHalfUnicode => { + self.wtr.write_str(r"\b{end-half}")?; + } }, HirKind::Capture(hir::Capture { ref name, .. }) => { self.wtr.write_str("(")?; @@ -424,20 +455,20 @@ mod tests { // Test that various zero-length repetitions always translate to an // empty regex. This is more a property of HIR's smart constructors // than the printer though. 
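The observable effect, via Display on Hir, is that an empty (sub)expression now prints as (?:); a small sketch mirroring the roundtrip tests below, assuming the patched printer:

use regex_syntax::parse;

fn main() {
    // A zero repetition produces an empty HIR, which the printer now
    // spells as (?:) so suffix operators still have something to bind to.
    assert_eq!("(?:)", parse(r"(?:ab){0}").unwrap().to_string());
}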
- roundtrip("a{0}", ""); - roundtrip("(?:ab){0}", ""); + roundtrip("a{0}", "(?:)"); + roundtrip("(?:ab){0}", "(?:)"); #[cfg(feature = "unicode-gencat")] { - roundtrip(r"\p{any}{0}", ""); - roundtrip(r"\P{any}{0}", ""); + roundtrip(r"\p{any}{0}", "(?:)"); + roundtrip(r"\P{any}{0}", "(?:)"); } } #[test] fn print_group() { - roundtrip("()", "()"); - roundtrip("(?P<foo>)", "(?P<foo>)"); - roundtrip("(?:)", ""); + roundtrip("()", "((?:))"); + roundtrip("(?P<foo>)", "(?P<foo>(?:))"); + roundtrip("(?:)", "(?:)"); roundtrip("(a)", "(a)"); roundtrip("(?P<foo>a)", "(?P<foo>a)"); @@ -448,8 +479,8 @@ mod tests { #[test] fn print_alternation() { - roundtrip("|", "(?:|)"); - roundtrip("||", "(?:||)"); + roundtrip("|", "(?:(?:)|(?:))"); + roundtrip("||", "(?:(?:)|(?:)|(?:))"); roundtrip("a|b", "[ab]"); roundtrip("ab|cd", "(?:(?:ab)|(?:cd))"); @@ -503,7 +534,7 @@ mod tests { }), Hir::look(hir::Look::End), ]); - assert_eq!(r"(?:\A(?:\A\z)+\z)", expr.to_string()); + assert_eq!(r"(?:\A\A\z\z)", expr.to_string()); } // Just like regression_repetition_concat, but with the repetition using @@ -540,7 +571,7 @@ mod tests { }), Hir::look(hir::Look::End), ]); - assert_eq!(r"(?:\A(?:\A|\z)+\z)", expr.to_string()); + assert_eq!(r"(?:\A(?:\A|\z)\z)", expr.to_string()); } // This regression test is very similar in flavor to diff --git a/vendor/regex-syntax/src/hir/translate.rs b/vendor/regex-syntax/src/hir/translate.rs index ff9c5ee91..313a1e9e8 100644 --- a/vendor/regex-syntax/src/hir/translate.rs +++ b/vendor/regex-syntax/src/hir/translate.rs @@ -19,6 +19,7 @@ type Result<T> = core::result::Result<T, Error>; #[derive(Clone, Debug)] pub struct TranslatorBuilder { utf8: bool, + line_terminator: u8, flags: Flags, } @@ -31,7 +32,11 @@ impl Default for TranslatorBuilder { impl TranslatorBuilder { /// Create a new translator builder with a default c onfiguration. pub fn new() -> TranslatorBuilder { - TranslatorBuilder { utf8: true, flags: Flags::default() } + TranslatorBuilder { + utf8: true, + line_terminator: b'\n', + flags: Flags::default(), + } } /// Build a translator using the current configuration. @@ -40,6 +45,7 @@ impl TranslatorBuilder { stack: RefCell::new(vec![]), flags: Cell::new(self.flags), utf8: self.utf8, + line_terminator: self.line_terminator, } } @@ -63,6 +69,31 @@ impl TranslatorBuilder { self } + /// Sets the line terminator for use with `(?u-s:.)` and `(?-us:.)`. + /// + /// Namely, instead of `.` (by default) matching everything except for `\n`, + /// this will cause `.` to match everything except for the byte given. + /// + /// If `.` is used in a context where Unicode mode is enabled and this byte + /// isn't ASCII, then an error will be returned. When Unicode mode is + /// disabled, then any byte is permitted, but will return an error if UTF-8 + /// mode is enabled and it is a non-ASCII byte. + /// + /// In short, any ASCII value for a line terminator is always okay. But a + /// non-ASCII byte might result in an error depending on whether Unicode + /// mode or UTF-8 mode are enabled. + /// + /// Note that if `R` mode is enabled then it always takes precedence and + /// the line terminator will be treated as `\r` and `\n` simultaneously. + /// + /// Note also that this *doesn't* impact the look-around assertions + /// `(?m:^)` and `(?m:$)`. That's usually controlled by additional + /// configuration in the regex engine itself. 
+ pub fn line_terminator(&mut self, byte: u8) -> &mut TranslatorBuilder { + self.line_terminator = byte; + self + } + /// Enable or disable the case insensitive flag (`i`) by default. pub fn case_insensitive(&mut self, yes: bool) -> &mut TranslatorBuilder { self.flags.case_insensitive = if yes { Some(true) } else { None }; @@ -120,6 +151,8 @@ pub struct Translator { flags: Cell<Flags>, /// Whether we're allowed to produce HIR that can match arbitrary bytes. utf8: bool, + /// The line terminator to use for `.`. + line_terminator: u8, } impl Translator { @@ -304,7 +337,7 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { fn visit_pre(&mut self, ast: &Ast) -> Result<()> { match *ast { - Ast::Class(ast::Class::Bracketed(_)) => { + Ast::ClassBracketed(_) => { if self.flags().unicode() { let cls = hir::ClassUnicode::empty(); self.push(HirFrame::ClassUnicode(cls)); @@ -321,14 +354,14 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { .unwrap_or_else(|| self.flags()); self.push(HirFrame::Group { old_flags }); } - Ast::Concat(ref x) if x.asts.is_empty() => {} Ast::Concat(_) => { self.push(HirFrame::Concat); } - Ast::Alternation(ref x) if x.asts.is_empty() => {} - Ast::Alternation(_) => { + Ast::Alternation(ref x) => { self.push(HirFrame::Alternation); - self.push(HirFrame::AlternationBranch); + if !x.asts.is_empty() { + self.push(HirFrame::AlternationBranch); + } } _ => {} } @@ -353,29 +386,20 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { // consistency sake. self.push(HirFrame::Expr(Hir::empty())); } - Ast::Literal(ref x) => { - match self.ast_literal_to_scalar(x)? { - Either::Right(byte) => self.push_byte(byte), - Either::Left(ch) => { - if !self.flags().unicode() && ch.len_utf8() > 1 { - return Err(self - .error(x.span, ErrorKind::UnicodeNotAllowed)); - } - match self.case_fold_char(x.span, ch)? { - None => self.push_char(ch), - Some(expr) => self.push(HirFrame::Expr(expr)), - } - } - } - // self.push(HirFrame::Expr(self.hir_literal(x)?)); - } - Ast::Dot(span) => { - self.push(HirFrame::Expr(self.hir_dot(span)?)); + Ast::Literal(ref x) => match self.ast_literal_to_scalar(x)? { + Either::Right(byte) => self.push_byte(byte), + Either::Left(ch) => match self.case_fold_char(x.span, ch)? { + None => self.push_char(ch), + Some(expr) => self.push(HirFrame::Expr(expr)), + }, + }, + Ast::Dot(ref span) => { + self.push(HirFrame::Expr(self.hir_dot(**span)?)); } Ast::Assertion(ref x) => { self.push(HirFrame::Expr(self.hir_assertion(x)?)); } - Ast::Class(ast::Class::Perl(ref x)) => { + Ast::ClassPerl(ref x) => { if self.flags().unicode() { let cls = self.hir_perl_unicode_class(x)?; let hcls = hir::Class::Unicode(cls); @@ -386,11 +410,11 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { self.push(HirFrame::Expr(Hir::class(hcls))); } } - Ast::Class(ast::Class::Unicode(ref x)) => { + Ast::ClassUnicode(ref x) => { let cls = hir::Class::Unicode(self.hir_unicode_class(x)?); self.push(HirFrame::Expr(Hir::class(cls))); } - Ast::Class(ast::Class::Bracketed(ref ast)) => { + Ast::ClassBracketed(ref ast) => { if self.flags().unicode() { let mut cls = self.pop().unwrap().unwrap_class_unicode(); self.unicode_fold_and_negate( @@ -841,8 +865,8 @@ impl<'t, 'p> TranslatorI<'t, 'p> { })?; Ok(Some(Hir::class(hir::Class::Unicode(cls)))) } else { - if c.len_utf8() > 1 { - return Err(self.error(span, ErrorKind::UnicodeNotAllowed)); + if !c.is_ascii() { + return Ok(None); } // If case folding won't do anything, then don't bother trying. 
match c { @@ -862,10 +886,38 @@ impl<'t, 'p> TranslatorI<'t, 'p> { } fn hir_dot(&self, span: Span) -> Result<Hir> { - if !self.flags().unicode() && self.trans().utf8 { + let (utf8, lineterm, flags) = + (self.trans().utf8, self.trans().line_terminator, self.flags()); + if utf8 && (!flags.unicode() || !lineterm.is_ascii()) { return Err(self.error(span, ErrorKind::InvalidUtf8)); } - Ok(Hir::dot(self.flags().dot())) + let dot = if flags.dot_matches_new_line() { + if flags.unicode() { + hir::Dot::AnyChar + } else { + hir::Dot::AnyByte + } + } else { + if flags.unicode() { + if flags.crlf() { + hir::Dot::AnyCharExceptCRLF + } else { + if !lineterm.is_ascii() { + return Err( + self.error(span, ErrorKind::InvalidLineTerminator) + ); + } + hir::Dot::AnyCharExcept(char::from(lineterm)) + } + } else { + if flags.crlf() { + hir::Dot::AnyByteExceptCRLF + } else { + hir::Dot::AnyByteExcept(lineterm) + } + } + }; + Ok(Hir::dot(dot)) } fn hir_assertion(&self, asst: &ast::Assertion) -> Result<Hir> { @@ -903,6 +955,34 @@ impl<'t, 'p> TranslatorI<'t, 'p> { } else { hir::Look::WordAsciiNegate }), + ast::AssertionKind::WordBoundaryStart + | ast::AssertionKind::WordBoundaryStartAngle => { + Hir::look(if unicode { + hir::Look::WordStartUnicode + } else { + hir::Look::WordStartAscii + }) + } + ast::AssertionKind::WordBoundaryEnd + | ast::AssertionKind::WordBoundaryEndAngle => { + Hir::look(if unicode { + hir::Look::WordEndUnicode + } else { + hir::Look::WordEndAscii + }) + } + ast::AssertionKind::WordBoundaryStartHalf => { + Hir::look(if unicode { + hir::Look::WordStartHalfUnicode + } else { + hir::Look::WordStartHalfAscii + }) + } + ast::AssertionKind::WordBoundaryEndHalf => Hir::look(if unicode { + hir::Look::WordEndHalfUnicode + } else { + hir::Look::WordEndHalfAscii + }), }) } @@ -1124,9 +1204,8 @@ impl<'t, 'p> TranslatorI<'t, 'p> { match self.ast_literal_to_scalar(ast)? { Either::Right(byte) => Ok(byte), Either::Left(ch) => { - let cp = u32::from(ch); - if cp <= 0x7F { - Ok(u8::try_from(cp).unwrap()) + if ch.is_ascii() { + Ok(u8::try_from(ch).unwrap()) } else { // We can't feasibly support Unicode in // byte oriented classes. 
Byte classes don't @@ -1209,30 +1288,6 @@ impl Flags { } } - fn dot(&self) -> hir::Dot { - if self.dot_matches_new_line() { - if self.unicode() { - hir::Dot::AnyChar - } else { - hir::Dot::AnyByte - } - } else { - if self.unicode() { - if self.crlf() { - hir::Dot::AnyCharExceptCRLF - } else { - hir::Dot::AnyCharExceptLF - } - } else { - if self.crlf() { - hir::Dot::AnyByteExceptCRLF - } else { - hir::Dot::AnyByteExceptLF - } - } - } - } - fn case_insensitive(&self) -> bool { self.case_insensitive.unwrap_or(false) } @@ -1598,16 +1653,7 @@ mod tests { assert_eq!(t_bytes(r"(?-u)\x61"), hir_lit("a")); assert_eq!(t_bytes(r"(?-u)\xFF"), hir_blit(b"\xFF")); - assert_eq!( - t_err("(?-u)☃"), - TestError { - kind: hir::ErrorKind::UnicodeNotAllowed, - span: Span::new( - Position::new(5, 1, 6), - Position::new(8, 1, 7) - ), - } - ); + assert_eq!(t("(?-u)☃"), hir_lit("☃")); assert_eq!( t_err(r"(?-u)\xFF"), TestError { @@ -1685,16 +1731,7 @@ mod tests { ); assert_eq!(t_bytes(r"(?i-u)\xFF"), hir_blit(b"\xFF")); - assert_eq!( - t_err("(?i-u)β"), - TestError { - kind: hir::ErrorKind::UnicodeNotAllowed, - span: Span::new( - Position::new(6, 1, 7), - Position::new(8, 1, 8), - ), - } - ); + assert_eq!(t("(?i-u)β"), hir_lit("β"),); } #[test] @@ -3489,6 +3526,15 @@ mod tests { assert!(!props(r"(?:z|xx)@|xx").is_alternation_literal()); } + // This tests that the smart Hir::repetition constructors does some basic + // simplifications. + #[test] + fn smart_repetition() { + assert_eq!(t(r"a{0}"), Hir::empty()); + assert_eq!(t(r"a{1}"), hir_lit("a")); + assert_eq!(t(r"\B{32111}"), hir_look(hir::Look::WordUnicodeNegate)); + } + // This tests that the smart Hir::concat constructor simplifies the given // exprs in a way we expect. #[test] @@ -3580,4 +3626,99 @@ mod tests { ]), ); } + + #[test] + fn regression_alt_empty_concat() { + use crate::ast::{self, Ast}; + + let span = Span::splat(Position::new(0, 0, 0)); + let ast = Ast::alternation(ast::Alternation { + span, + asts: vec![Ast::concat(ast::Concat { span, asts: vec![] })], + }); + + let mut t = Translator::new(); + assert_eq!(Ok(Hir::empty()), t.translate("", &ast)); + } + + #[test] + fn regression_empty_alt() { + use crate::ast::{self, Ast}; + + let span = Span::splat(Position::new(0, 0, 0)); + let ast = Ast::concat(ast::Concat { + span, + asts: vec![Ast::alternation(ast::Alternation { + span, + asts: vec![], + })], + }); + + let mut t = Translator::new(); + assert_eq!(Ok(Hir::fail()), t.translate("", &ast)); + } + + #[test] + fn regression_singleton_alt() { + use crate::{ + ast::{self, Ast}, + hir::Dot, + }; + + let span = Span::splat(Position::new(0, 0, 0)); + let ast = Ast::concat(ast::Concat { + span, + asts: vec![Ast::alternation(ast::Alternation { + span, + asts: vec![Ast::dot(span)], + })], + }); + + let mut t = Translator::new(); + assert_eq!(Ok(Hir::dot(Dot::AnyCharExceptLF)), t.translate("", &ast)); + } + + // See: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=63168 + #[test] + fn regression_fuzz_match() { + let pat = "[(\u{6} \0-\u{afdf5}] \0 "; + let ast = ParserBuilder::new() + .octal(false) + .ignore_whitespace(true) + .build() + .parse(pat) + .unwrap(); + let hir = TranslatorBuilder::new() + .utf8(true) + .case_insensitive(false) + .multi_line(false) + .dot_matches_new_line(false) + .swap_greed(true) + .unicode(true) + .build() + .translate(pat, &ast) + .unwrap(); + assert_eq!( + hir, + Hir::concat(vec![ + hir_uclass(&[('\0', '\u{afdf5}')]), + hir_lit("\0"), + ]) + ); + } + + // See: 
https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=63155 + #[cfg(feature = "unicode")] + #[test] + fn regression_fuzz_difference1() { + let pat = r"\W\W|\W[^\v--\W\W\P{Script_Extensions:Pau_Cin_Hau}\u10A1A1-\U{3E3E3}--~~~~--~~~~~~~~------~~~~~~--~~~~~~]*"; + let _ = t(pat); // shouldn't panic + } + + // See: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=63153 + #[test] + fn regression_fuzz_char_decrement1() { + let pat = "w[w[^w?\rw\rw[^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w?\r\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0w?\rw[^w?\rw[^w?\rw[^w\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\u{1}\0]\0\0\0\0\0\0\0\0\0*\0\0\u{1}\0]\0\0-*\0][^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\u{1}\0]\0\0\0\0\0\0\0\0\0x\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\0\0\0*??\0\u{7f}{2}\u{10}??\0\0\0\0\0\0\0\0\0\u{3}\0\0\0}\0-*\0]\0\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\0\u{1}\0]\0\u{1}\u{1}H-i]-]\0\0\0\0\u{1}\0]\0\0\0\u{1}\0]\0\0-*\0\0\0\0\u{1}9-\u{7f}]\0'|-\u{7f}]\0'|(?i-ux)[-\u{7f}]\0'\u{3}\0\0\0}\0-*\0]<D\0\0\0\0\0\0\u{1}]\0\0\0\0]\0\0-*\0]\0\0 "; + let _ = t(pat); // shouldn't panic + } } diff --git a/vendor/regex-syntax/src/hir/visitor.rs b/vendor/regex-syntax/src/hir/visitor.rs index e5f15cf1c..f30f0a163 100644 --- a/vendor/regex-syntax/src/hir/visitor.rs +++ b/vendor/regex-syntax/src/hir/visitor.rs @@ -41,6 +41,11 @@ pub trait Visitor { fn visit_alternation_in(&mut self) -> Result<(), Self::Err> { Ok(()) } + + /// This method is called between child nodes of a concatenation. + fn visit_concat_in(&mut self) -> Result<(), Self::Err> { + Ok(()) + } } /// Executes an implementation of `Visitor` in constant stack space. @@ -131,8 +136,14 @@ impl<'a> HeapVisitor<'a> { // If this is a concat/alternate, then we might have additional // inductive steps to process. if let Some(x) = self.pop(frame) { - if let Frame::Alternation { .. } = x { - visitor.visit_alternation_in()?; + match x { + Frame::Alternation { .. } => { + visitor.visit_alternation_in()?; + } + Frame::Concat { .. } => { + visitor.visit_concat_in()?; + } + _ => {} } hir = x.child(); self.stack.push((post_hir, x)); diff --git a/vendor/regex-syntax/src/lib.rs b/vendor/regex-syntax/src/lib.rs index 754858900..20f25db71 100644 --- a/vendor/regex-syntax/src/lib.rs +++ b/vendor/regex-syntax/src/lib.rs @@ -157,6 +157,11 @@ The following features are available: [Unicode text segmentation algorithms](https://www.unicode.org/reports/tr29/). This enables using classes like `\p{gcb=Extend}`, `\p{wb=Katakana}` and `\p{sb=ATerm}`. +* **arbitrary** - + Enabling this feature introduces a public dependency on the + [`arbitrary`](https://crates.io/crates/arbitrary) + crate. Namely, it implements the `Arbitrary` trait from that crate for the + [`Ast`](crate::ast::Ast) type. This feature is disabled by default. */ #![no_std] @@ -317,6 +322,9 @@ pub fn is_escapeable_character(c: char) -> bool { // escapeable, \< and \> will result in a parse error. Thus, we can // turn them into something else in the future without it being a // backwards incompatible change. + // + // OK, now we support \< and \>, and we need to retain them as *not* + // escapeable here since the escape sequence is significant. '<' | '>' => false, _ => true, } @@ -364,7 +372,7 @@ pub fn try_is_word_character( /// Returns true if and only if the given character is an ASCII word character. 
/// /// An ASCII word character is defined by the following character class: -/// `[_0-9a-zA-Z]'. +/// `[_0-9a-zA-Z]`. pub fn is_word_byte(c: u8) -> bool { match c { b'_' | b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' => true, diff --git a/vendor/regex-syntax/src/parser.rs b/vendor/regex-syntax/src/parser.rs index 2e7a2bb80..f482b8466 100644 --- a/vendor/regex-syntax/src/parser.rs +++ b/vendor/regex-syntax/src/parser.rs @@ -165,6 +165,31 @@ impl ParserBuilder { self } + /// Sets the line terminator for use with `(?u-s:.)` and `(?-us:.)`. + /// + /// Namely, instead of `.` (by default) matching everything except for `\n`, + /// this will cause `.` to match everything except for the byte given. + /// + /// If `.` is used in a context where Unicode mode is enabled and this byte + /// isn't ASCII, then an error will be returned. When Unicode mode is + /// disabled, then any byte is permitted, but will return an error if UTF-8 + /// mode is enabled and it is a non-ASCII byte. + /// + /// In short, any ASCII value for a line terminator is always okay. But a + /// non-ASCII byte might result in an error depending on whether Unicode + /// mode or UTF-8 mode are enabled. + /// + /// Note that if `R` mode is enabled then it always takes precedence and + /// the line terminator will be treated as `\r` and `\n` simultaneously. + /// + /// Note also that this *doesn't* impact the look-around assertions + /// `(?m:^)` and `(?m:$)`. That's usually controlled by additional + /// configuration in the regex engine itself. + pub fn line_terminator(&mut self, byte: u8) -> &mut ParserBuilder { + self.hir.line_terminator(byte); + self + } + /// Enable or disable the "swap greed" flag by default. /// /// By default this is disabled. It may alternatively be selectively diff --git a/vendor/regex-syntax/test b/vendor/regex-syntax/test index a4d6cfaba..8626c3bfc 100755 --- a/vendor/regex-syntax/test +++ b/vendor/regex-syntax/test @@ -2,6 +2,10 @@ set -e +# cd to the directory containing this crate's Cargo.toml so that we don't need +# to pass --manifest-path to every `cargo` command. +cd "$(dirname "$0")" + # This is a convenience script for running a broad swath of the syntax tests. echo "===== DEFAULT FEATURES ===" cargo test |
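
The `line_terminator` knobs added above on both `TranslatorBuilder` and `ParserBuilder` change which byte `.` excludes when the `s` flag is off. Below is a minimal sketch of how a caller might exercise the new option, assuming only the API shown in this diff; the NUL terminator and the pattern are arbitrary choices for illustration.

```rust
use regex_syntax::ParserBuilder;

fn main() {
    // With a custom line terminator, `.` is translated to a class matching
    // any character except NUL, instead of the default "except `\n`".
    let mut parser = ParserBuilder::new().line_terminator(b'\x00').build();
    let hir = parser.parse(r"a.b").unwrap();
    println!("{:?}", hir);
}
```

Per the doc comment in the hunk above, a non-ASCII terminator is only an error when `.` is used with Unicode mode (or UTF-8 mode) enabled; any ASCII byte is always accepted.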
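
The translator changes above also wire the new word-boundary assertions (`\b{start}`, `\b{end}`, `\<`, `\>`, and the half-boundary forms) to dedicated `hir::Look` variants. A rough illustration of the expected mapping under the default Unicode-enabled parser (the pattern is just an example):

```rust
use regex_syntax::{
    hir::{HirKind, Look},
    Parser,
};

fn main() {
    // `\<` is shorthand for `\b{start}` and should become the Unicode
    // word-start look-around.
    let hir = Parser::new().parse(r"\<").unwrap();
    match hir.kind() {
        HirKind::Look(Look::WordStartUnicode) => println!("word-start, as expected"),
        other => println!("unexpected HIR: {:?}", other),
    }
}
```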
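
Finally, the `visit_concat_in` hook added to the HIR `Visitor` trait is invoked between the children of each concatenation, mirroring the existing `visit_alternation_in`. A small hypothetical visitor sketching how the hook might be used (the counter type and the pattern are made up for illustration):

```rust
use regex_syntax::{
    hir::{self, Hir, Visitor},
    Parser,
};

// Hypothetical visitor: counts how many times the new hook fires, i.e. the
// number of gaps between adjacent children of concatenations.
struct ConcatGaps(usize);

impl Visitor for ConcatGaps {
    type Output = usize;
    type Err = ();

    fn finish(self) -> Result<usize, ()> {
        Ok(self.0)
    }

    fn visit_concat_in(&mut self) -> Result<(), ()> {
        self.0 += 1;
        Ok(())
    }
}

fn main() {
    let parsed: Hir = Parser::new().parse("ab+c").unwrap();
    // The top-level HIR is a concatenation, so the hook fires once between
    // each pair of adjacent children.
    let gaps = hir::visit(&parsed, ConcatGaps(0)).unwrap();
    println!("concat gaps: {}", gaps);
}
```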