path: root/vendor/regex-syntax
author:     Daniel Baumann <daniel.baumann@progress-linux.org>  2024-06-19 09:26:03 +0000
committer:  Daniel Baumann <daniel.baumann@progress-linux.org>  2024-06-19 09:26:03 +0000
commit:     9918693037dce8aa4bb6f08741b6812923486c18 (patch)
tree:       21d2b40bec7e6a7ea664acee056eb3d08e15a1cf /vendor/regex-syntax
parent:     Releasing progress-linux version 1.75.0+dfsg1-5~progress7.99u1. (diff)
download:   rustc-9918693037dce8aa4bb6f08741b6812923486c18.tar.xz
            rustc-9918693037dce8aa4bb6f08741b6812923486c18.zip

Merging upstream version 1.76.0+dfsg1.

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'vendor/regex-syntax')
-rw-r--r--  vendor/regex-syntax/.cargo-checksum.json |   2
-rw-r--r--  vendor/regex-syntax/Cargo.toml           |  18
-rw-r--r--  vendor/regex-syntax/README.md            |   1
-rw-r--r--  vendor/regex-syntax/src/ast/mod.rs       | 440
-rw-r--r--  vendor/regex-syntax/src/ast/parse.rs     | 576
-rw-r--r--  vendor/regex-syntax/src/ast/print.rs     |  20
-rw-r--r--  vendor/regex-syntax/src/ast/visitor.rs   |  17
-rw-r--r--  vendor/regex-syntax/src/hir/literal.rs   | 167
-rw-r--r--  vendor/regex-syntax/src/hir/mod.rs       | 196
-rw-r--r--  vendor/regex-syntax/src/hir/print.rs     |  59
-rw-r--r--  vendor/regex-syntax/src/hir/translate.rs | 297
-rw-r--r--  vendor/regex-syntax/src/hir/visitor.rs   |  15
-rw-r--r--  vendor/regex-syntax/src/lib.rs           |  10
-rw-r--r--  vendor/regex-syntax/src/parser.rs        |  25
-rwxr-xr-x  vendor/regex-syntax/test                 |   4
15 files changed, 1390 insertions(+), 457 deletions(-)
diff --git a/vendor/regex-syntax/.cargo-checksum.json b/vendor/regex-syntax/.cargo-checksum.json
index 94024d262..8152441ca 100644
--- a/vendor/regex-syntax/.cargo-checksum.json
+++ b/vendor/regex-syntax/.cargo-checksum.json
@@ -1 +1 @@
-{"files":{"Cargo.toml":"572d5198c4a39a5a55d9cbbb79de0e93f0b734f9a8cb3b7a80c0046897a79c85","LICENSE-APACHE":"a60eea817514531668d7e00765731449fe14d059d3249e0bc93b36de45f759f2","LICENSE-MIT":"6485b8ed310d3f0340bf1ad1f47645069ce4069dcc6bb46c7d5c6faf41de1fdb","README.md":"1f5f6c3e0f7e452236eb13a0a8627dceb35e7bd9e18798916b74b724ba8161fe","benches/bench.rs":"d2b6ae5b939abd6093064f144b981b7739d7f474ec0698a1268052fc92406635","src/ast/mod.rs":"f18ba2a3a082fa342512feed20da3bf5fb8b6d92d2e246809732860b446f75c9","src/ast/parse.rs":"49478a4ae5b557dc46aa7071c91c7a6905a0ce62910f8c8fefce464e5779e934","src/ast/print.rs":"62d319cd0b7e6f437dc8dcaf798046a44afa03e9aeb6a384d5cffa448383af53","src/ast/visitor.rs":"a58170758852b2a59c9232f3a027a91f0603b26dd0d9acbde73ac1f575ca600b","src/debug.rs":"7a16cca02be9715fdc8c26a32279465774623cd12fab1ec59ac25a6e3047817f","src/either.rs":"1758e3edd056884eccadd995708d1e374ba9aa65846bd0e13b1aae852607c560","src/error.rs":"01a67e3407b0d0d869119363e47a94d92158834bfe5936366c2e3f6f4ed13f36","src/hir/interval.rs":"2358e74b4d4aabfa62f79df855fd5d183779b86c4e14aae4ee42d8695bb3d010","src/hir/literal.rs":"a57f77b49998f4e4be9f9e4512f9934bc61640786dd7ac07c708825ba2b83429","src/hir/mod.rs":"48f885da2485afd638823dbdbe591f71051db398e86055820557d627d8b23309","src/hir/print.rs":"1f1fb454af939a53ea2799f55b67c2a2615c47c24dbf9f48a7c2a2b402d36e1f","src/hir/translate.rs":"1fbba4c456891ead0298ab6457ac5e9649431e52e75acc85a293d2a17886ac84","src/hir/visitor.rs":"e98aab188f92a92baee7b490d8558383373398697ae97335ae2635b6a5aa45ca","src/lib.rs":"4dbb230403b281f35dc1f9494e6e33e18bad2c2c648da78fd2b4662b85f88748","src/parser.rs":"cac139ed552a63ac4f81d60610cf0c2084421e34729475707681ef9392e1e9ae","src/rank.rs":"ff3d58b0cc5ffa69e2e8c56fc7d9ef41dd399d59a639a253a51551b858cb5bbd","src/unicode.rs":"9829458ef321b3bc22c21eae4b22805b33f8b5e67022928ffd9a9e0287bc7c31","src/unicode_tables/LICENSE-UNICODE":"74db5baf44a41b1000312c673544b3374e4198af5605c7f9080a402cec42cfa3","src/unicode_tables/age.rs":"2a2599a4e406fbbd0efd16aa6ce385c3f97b87c34820d6686a9f9113a5231c67","src/unicode_tables/case_folding_simple.rs":"9583803d4a10486da372b76979dbd26349b40766229467238eff972c1d78e47b","src/unicode_tables/general_category.rs":"36a93ba1cdeed96a00ff29a5ab5afd2c578a89541bf4dd8b18478146cebda0aa","src/unicode_tables/grapheme_cluster_break.rs":"39c388e9805a8391d3d3e69d74d831ce4fb99aa7e13e52c64dd2bd16d4765301","src/unicode_tables/mod.rs":"26c837099cd934c8062e24bc9a0aaecf15fe1de03f9c6da3f3e1e5ac3ca24bee","src/unicode_tables/perl_decimal.rs":"a98ea4afe71c2947023ae12bd25c46bf4c7de48eeb40979eca5c96ba62cee02e","src/unicode_tables/perl_space.rs":"ea2b3b84b4a48334082dadc6c37d9fcc9c9ded84b40e8f5c9c9314898638967e","src/unicode_tables/perl_word.rs":"6f1156bd6af32151ecffea4abe07a38fa04b1fc1b227ec1a8dac5d5f08d9d74b","src/unicode_tables/property_bool.rs":"0bd64f6e3228eaecf47824e238bdf1f8a9eef113ace6e790a57f045a8106701c","src/unicode_tables/property_names.rs":"5ca25437927eb70c62adf7d038e99a601cfb8a718677fd6de832589664d3c481","src/unicode_tables/property_values.rs":"5b4cc02392d382cf7af60455fc87b9980e97409b62a4b8d6c5843190d2e2d21d","src/unicode_tables/script.rs":"ea1d771b6d0a4b12d143f9bad2ea9342a0887878cbbe3c11262b6eabedaf2dd4","src/unicode_tables/script_extension.rs":"beeb8349703d903ff861beb8401bfd2599e457dc25df872e69d6ad1615f8b5e9","src/unicode_tables/sentence_break.rs":"2befe2a27cc4e8aecb624e310ef9f371462470dd3b2f572cec1f5873a5e30aa9","src/unicode_tables/word_break.rs":"94679177731b515f0c360eff394286a1f99b59527bdbc826cbf51d32f9666187","src/utf8.rs":"e9a13623a
94295b81969c5483de17219ff74bb20768be13c527010351245acbd","test":"01d6f6e9a689fb794173288a52f40f53b4f782176d0fcd648c7c6d3a2df05c63"},"package":"436b050e76ed2903236f032a59761c1eb99e1b0aead2c257922771dab1fc8c78"} \ No newline at end of file
+{"files":{"Cargo.toml":"33c96af38ed9f42d1ccbf85ecfeea1d46202943d01c595b8ee4dddef760e6bd5","LICENSE-APACHE":"a60eea817514531668d7e00765731449fe14d059d3249e0bc93b36de45f759f2","LICENSE-MIT":"6485b8ed310d3f0340bf1ad1f47645069ce4069dcc6bb46c7d5c6faf41de1fdb","README.md":"b2484aa7e66fb92d1378e9a7ce7605af18f77cb12c179866eaf92ba28cfec1d9","benches/bench.rs":"d2b6ae5b939abd6093064f144b981b7739d7f474ec0698a1268052fc92406635","src/ast/mod.rs":"700c2f779fccb529db7b444819d53c38f916b065d3d05a74282f929af581e8b1","src/ast/parse.rs":"fcd45146eaf747d15a2a519d34754638d451ab83e88b5962841cf7a0dd32e988","src/ast/print.rs":"99cb69ece252ef31e0be177fb3364797eb30b785f936532b8dcd8106e7be0738","src/ast/visitor.rs":"f0fdf758801fe70e6b299b73ab63196e814af95ef6eccad7ef4f72075743fcf6","src/debug.rs":"7a16cca02be9715fdc8c26a32279465774623cd12fab1ec59ac25a6e3047817f","src/either.rs":"1758e3edd056884eccadd995708d1e374ba9aa65846bd0e13b1aae852607c560","src/error.rs":"01a67e3407b0d0d869119363e47a94d92158834bfe5936366c2e3f6f4ed13f36","src/hir/interval.rs":"2358e74b4d4aabfa62f79df855fd5d183779b86c4e14aae4ee42d8695bb3d010","src/hir/literal.rs":"6a8108b8919fbfd9ab93072846124c51d2998489810fcd6e7a89fdccc45833e0","src/hir/mod.rs":"eca183b8e173f486c1a11a5fa10895c96067162c8ec936871f937ca7fca5f710","src/hir/print.rs":"ad51c515c933bfd67d307ba3d7e6ac59c9c5903b4f393a9f9a4785c92b88348d","src/hir/translate.rs":"5fbff527c53f217ba2bac9b0948d7de74164625d08674b91a479ced271159ebd","src/hir/visitor.rs":"71ca9c93aa48a5ed445399659fa6455093a1bbd9ef44b66bc7095c1b08b2ec1f","src/lib.rs":"5ae457d402e49443bdb23b71353693dd3b0d263b57a6eeb9eb5b5dae5c901bdd","src/parser.rs":"6b2f4f27e3331a01a25b87c89368dd2e54396bd425dac57941f9c1ebfd238ac8","src/rank.rs":"ff3d58b0cc5ffa69e2e8c56fc7d9ef41dd399d59a639a253a51551b858cb5bbd","src/unicode.rs":"9829458ef321b3bc22c21eae4b22805b33f8b5e67022928ffd9a9e0287bc7c31","src/unicode_tables/LICENSE-UNICODE":"74db5baf44a41b1000312c673544b3374e4198af5605c7f9080a402cec42cfa3","src/unicode_tables/age.rs":"2a2599a4e406fbbd0efd16aa6ce385c3f97b87c34820d6686a9f9113a5231c67","src/unicode_tables/case_folding_simple.rs":"9583803d4a10486da372b76979dbd26349b40766229467238eff972c1d78e47b","src/unicode_tables/general_category.rs":"36a93ba1cdeed96a00ff29a5ab5afd2c578a89541bf4dd8b18478146cebda0aa","src/unicode_tables/grapheme_cluster_break.rs":"39c388e9805a8391d3d3e69d74d831ce4fb99aa7e13e52c64dd2bd16d4765301","src/unicode_tables/mod.rs":"26c837099cd934c8062e24bc9a0aaecf15fe1de03f9c6da3f3e1e5ac3ca24bee","src/unicode_tables/perl_decimal.rs":"a98ea4afe71c2947023ae12bd25c46bf4c7de48eeb40979eca5c96ba62cee02e","src/unicode_tables/perl_space.rs":"ea2b3b84b4a48334082dadc6c37d9fcc9c9ded84b40e8f5c9c9314898638967e","src/unicode_tables/perl_word.rs":"6f1156bd6af32151ecffea4abe07a38fa04b1fc1b227ec1a8dac5d5f08d9d74b","src/unicode_tables/property_bool.rs":"0bd64f6e3228eaecf47824e238bdf1f8a9eef113ace6e790a57f045a8106701c","src/unicode_tables/property_names.rs":"5ca25437927eb70c62adf7d038e99a601cfb8a718677fd6de832589664d3c481","src/unicode_tables/property_values.rs":"5b4cc02392d382cf7af60455fc87b9980e97409b62a4b8d6c5843190d2e2d21d","src/unicode_tables/script.rs":"ea1d771b6d0a4b12d143f9bad2ea9342a0887878cbbe3c11262b6eabedaf2dd4","src/unicode_tables/script_extension.rs":"beeb8349703d903ff861beb8401bfd2599e457dc25df872e69d6ad1615f8b5e9","src/unicode_tables/sentence_break.rs":"2befe2a27cc4e8aecb624e310ef9f371462470dd3b2f572cec1f5873a5e30aa9","src/unicode_tables/word_break.rs":"94679177731b515f0c360eff394286a1f99b59527bdbc826cbf51d32f9666187","src/utf8.rs":"e9a13623a
94295b81969c5483de17219ff74bb20768be13c527010351245acbd","test":"c7de5fbc0010d9b5b758cd49956375a64b88601c068167fd366808950257f108"},"package":"c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f"} \ No newline at end of file
diff --git a/vendor/regex-syntax/Cargo.toml b/vendor/regex-syntax/Cargo.toml
index cd4b92cb2..3602ab33c 100644
--- a/vendor/regex-syntax/Cargo.toml
+++ b/vendor/regex-syntax/Cargo.toml
@@ -11,16 +11,18 @@
[package]
edition = "2021"
-rust-version = "1.60.0"
+rust-version = "1.65"
name = "regex-syntax"
-version = "0.7.2"
-authors = ["The Rust Project Developers"]
+version = "0.8.2"
+authors = [
+ "The Rust Project Developers",
+ "Andrew Gallant <jamslam@gmail.com>",
+]
description = "A regular expression parser."
-homepage = "https://github.com/rust-lang/regex"
documentation = "https://docs.rs/regex-syntax"
readme = "README.md"
license = "MIT OR Apache-2.0"
-repository = "https://github.com/rust-lang/regex"
+repository = "https://github.com/rust-lang/regex/tree/master/regex-syntax"
[package.metadata.docs.rs]
all-features = true
@@ -29,7 +31,13 @@ rustdoc-args = [
"docsrs",
]
+[dependencies.arbitrary]
+version = "1.3.0"
+features = ["derive"]
+optional = true
+
[features]
+arbitrary = ["dep:arbitrary"]
default = [
"std",
"unicode",
diff --git a/vendor/regex-syntax/README.md b/vendor/regex-syntax/README.md
index ff4fe094c..529513b0c 100644
--- a/vendor/regex-syntax/README.md
+++ b/vendor/regex-syntax/README.md
@@ -4,7 +4,6 @@ This crate provides a robust regular expression parser.
[![Build status](https://github.com/rust-lang/regex/workflows/ci/badge.svg)](https://github.com/rust-lang/regex/actions)
[![Crates.io](https://img.shields.io/crates/v/regex-syntax.svg)](https://crates.io/crates/regex-syntax)
-[![Rust](https://img.shields.io/badge/rust-1.28.0%2B-blue.svg?maxAge=3600)](https://github.com/rust-lang/regex)
### Documentation
diff --git a/vendor/regex-syntax/src/ast/mod.rs b/vendor/regex-syntax/src/ast/mod.rs
index a95b1c873..6a77ee134 100644
--- a/vendor/regex-syntax/src/ast/mod.rs
+++ b/vendor/regex-syntax/src/ast/mod.rs
@@ -20,6 +20,7 @@ mod visitor;
/// valid Unicode property name. That particular error is reported when
/// translating an AST to the high-level intermediate representation (`HIR`).
#[derive(Clone, Debug, Eq, PartialEq)]
+#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub struct Error {
/// The kind of error.
kind: ErrorKind,
@@ -70,6 +71,7 @@ impl Error {
/// new variant is not considered a breaking change.
#[non_exhaustive]
#[derive(Clone, Debug, Eq, PartialEq)]
+#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub enum ErrorKind {
/// The capturing group limit was exceeded.
///
@@ -160,6 +162,18 @@ pub enum ErrorKind {
/// `(?i)*`. It is, however, possible to create a repetition operating on
/// an empty sub-expression. For example, `()*` is still considered valid.
RepetitionMissing,
+ /// The special word boundary syntax, `\b{something}`, was used, but
+ /// either EOF without `}` was seen, or an invalid character in the
+ /// braces was seen.
+ SpecialWordBoundaryUnclosed,
+ /// The special word boundary syntax, `\b{something}`, was used, but
+ /// `something` was not recognized as a valid word boundary kind.
+ SpecialWordBoundaryUnrecognized,
+ /// The syntax `\b{` was observed, but afterwards the end of the pattern
+ /// was observed without being able to tell whether it was meant to be a
+ /// bounded repetition on the `\b` or the beginning of a special word
+ /// boundary assertion.
+ SpecialWordOrRepetitionUnexpectedEof,
/// The Unicode class is not valid. This typically occurs when a `\p` is
/// followed by something other than a `{`.
UnicodeClassInvalid,
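
Note: the three new ErrorKind variants above cover the failure modes of the `\b{...}` syntax added later in this patch. A rough sketch of how one of them surfaces through the AST parser, assuming the usual `ast::parse::Parser` entry point; the pattern `\b{qux}` is just an illustrative bad input, not taken from the patch's tests:

use regex_syntax::ast::{parse::Parser, ErrorKind};

fn main() {
    // `qux` is not one of start/end/start-half/end-half, so the parser
    // should reject it with the new unrecognized-word-boundary error.
    let err = Parser::new().parse(r"\b{qux}").unwrap_err();
    assert_eq!(*err.kind(), ErrorKind::SpecialWordBoundaryUnrecognized);
}
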
@@ -258,6 +272,29 @@ impl core::fmt::Display for ErrorKind {
RepetitionMissing => {
write!(f, "repetition operator missing expression")
}
+ SpecialWordBoundaryUnclosed => {
+ write!(
+ f,
+ "special word boundary assertion is either \
+ unclosed or contains an invalid character",
+ )
+ }
+ SpecialWordBoundaryUnrecognized => {
+ write!(
+ f,
+ "unrecognized special word boundary assertion, \
+ valid choices are: start, end, start-half \
+ or end-half",
+ )
+ }
+ SpecialWordOrRepetitionUnexpectedEof => {
+ write!(
+ f,
+ "found either the beginning of a special word \
+ boundary or a bounded repetition on a \\b with \
+ an opening brace, but no closing brace",
+ )
+ }
UnicodeClassInvalid => {
write!(f, "invalid Unicode character class")
}
@@ -278,6 +315,7 @@ impl core::fmt::Display for ErrorKind {
/// All span positions are absolute byte offsets that can be used on the
/// original regular expression that was parsed.
#[derive(Clone, Copy, Eq, PartialEq)]
+#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub struct Span {
/// The start byte offset.
pub start: Position,
@@ -308,6 +346,7 @@ impl PartialOrd for Span {
/// A position encodes one half of a span, and includes the byte offset, line
/// number and column number.
#[derive(Clone, Copy, Eq, PartialEq)]
+#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub struct Position {
/// The absolute offset of this position, starting at `0` from the
/// beginning of the regular expression pattern string.
@@ -396,6 +435,7 @@ impl Position {
/// comment contains a span of precisely where it occurred in the original
/// regular expression.
#[derive(Clone, Debug, Eq, PartialEq)]
+#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub struct WithComments {
/// The actual ast.
pub ast: Ast,
@@ -408,6 +448,7 @@ pub struct WithComments {
/// A regular expression can only contain comments when the `x` flag is
/// enabled.
#[derive(Clone, Debug, Eq, PartialEq)]
+#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub struct Comment {
/// The span of this comment, including the beginning `#` and ending `\n`.
pub span: Span,
@@ -424,31 +465,97 @@ pub struct Comment {
/// This type defines its own destructor that uses constant stack space and
/// heap space proportional to the size of the `Ast`.
#[derive(Clone, Debug, Eq, PartialEq)]
+#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub enum Ast {
/// An empty regex that matches everything.
- Empty(Span),
+ Empty(Box<Span>),
/// A set of flags, e.g., `(?is)`.
- Flags(SetFlags),
+ Flags(Box<SetFlags>),
/// A single character literal, which includes escape sequences.
- Literal(Literal),
+ Literal(Box<Literal>),
/// The "any character" class.
- Dot(Span),
+ Dot(Box<Span>),
/// A single zero-width assertion.
- Assertion(Assertion),
- /// A single character class. This includes all forms of character classes
- /// except for `.`. e.g., `\d`, `\pN`, `[a-z]` and `[[:alpha:]]`.
- Class(Class),
+ Assertion(Box<Assertion>),
+ /// A single Unicode character class, e.g., `\pL` or `\p{Greek}`.
+ ClassUnicode(Box<ClassUnicode>),
+ /// A single perl character class, e.g., `\d` or `\W`.
+ ClassPerl(Box<ClassPerl>),
+ /// A single bracketed character class set, which may contain zero or more
+ /// character ranges and/or zero or more nested classes. e.g.,
+ /// `[a-zA-Z\pL]`.
+ ClassBracketed(Box<ClassBracketed>),
/// A repetition operator applied to an arbitrary regular expression.
- Repetition(Repetition),
+ Repetition(Box<Repetition>),
/// A grouped regular expression.
- Group(Group),
+ Group(Box<Group>),
/// An alternation of regular expressions.
- Alternation(Alternation),
+ Alternation(Box<Alternation>),
/// A concatenation of regular expressions.
- Concat(Concat),
+ Concat(Box<Concat>),
}
impl Ast {
+ /// Create an "empty" AST item.
+ pub fn empty(span: Span) -> Ast {
+ Ast::Empty(Box::new(span))
+ }
+
+ /// Create a "flags" AST item.
+ pub fn flags(e: SetFlags) -> Ast {
+ Ast::Flags(Box::new(e))
+ }
+
+ /// Create a "literal" AST item.
+ pub fn literal(e: Literal) -> Ast {
+ Ast::Literal(Box::new(e))
+ }
+
+ /// Create a "dot" AST item.
+ pub fn dot(span: Span) -> Ast {
+ Ast::Dot(Box::new(span))
+ }
+
+ /// Create a "assertion" AST item.
+ pub fn assertion(e: Assertion) -> Ast {
+ Ast::Assertion(Box::new(e))
+ }
+
+ /// Create a "Unicode class" AST item.
+ pub fn class_unicode(e: ClassUnicode) -> Ast {
+ Ast::ClassUnicode(Box::new(e))
+ }
+
+ /// Create a "Perl class" AST item.
+ pub fn class_perl(e: ClassPerl) -> Ast {
+ Ast::ClassPerl(Box::new(e))
+ }
+
+ /// Create a "bracketed class" AST item.
+ pub fn class_bracketed(e: ClassBracketed) -> Ast {
+ Ast::ClassBracketed(Box::new(e))
+ }
+
+ /// Create a "repetition" AST item.
+ pub fn repetition(e: Repetition) -> Ast {
+ Ast::Repetition(Box::new(e))
+ }
+
+ /// Create a "group" AST item.
+ pub fn group(e: Group) -> Ast {
+ Ast::Group(Box::new(e))
+ }
+
+ /// Create a "alternation" AST item.
+ pub fn alternation(e: Alternation) -> Ast {
+ Ast::Alternation(Box::new(e))
+ }
+
+ /// Create a "concat" AST item.
+ pub fn concat(e: Concat) -> Ast {
+ Ast::Concat(Box::new(e))
+ }
+
/// Return the span of this abstract syntax tree.
pub fn span(&self) -> &Span {
match *self {
@@ -457,7 +564,9 @@ impl Ast {
Ast::Literal(ref x) => &x.span,
Ast::Dot(ref span) => span,
Ast::Assertion(ref x) => &x.span,
- Ast::Class(ref x) => x.span(),
+ Ast::ClassUnicode(ref x) => &x.span,
+ Ast::ClassPerl(ref x) => &x.span,
+ Ast::ClassBracketed(ref x) => &x.span,
Ast::Repetition(ref x) => &x.span,
Ast::Group(ref x) => &x.span,
Ast::Alternation(ref x) => &x.span,
@@ -481,8 +590,10 @@ impl Ast {
| Ast::Flags(_)
| Ast::Literal(_)
| Ast::Dot(_)
- | Ast::Assertion(_) => false,
- Ast::Class(_)
+ | Ast::Assertion(_)
+ | Ast::ClassUnicode(_)
+ | Ast::ClassPerl(_) => false,
+ Ast::ClassBracketed(_)
| Ast::Repetition(_)
| Ast::Group(_)
| Ast::Alternation(_)
@@ -508,6 +619,7 @@ impl core::fmt::Display for Ast {
/// An alternation of regular expressions.
#[derive(Clone, Debug, Eq, PartialEq)]
+#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub struct Alternation {
/// The span of this alternation.
pub span: Span,
@@ -518,20 +630,21 @@ pub struct Alternation {
impl Alternation {
/// Return this alternation as an AST.
///
- /// If this alternation contains zero ASTs, then Ast::Empty is
- /// returned. If this alternation contains exactly 1 AST, then the
- /// corresponding AST is returned. Otherwise, Ast::Alternation is returned.
+ /// If this alternation contains zero ASTs, then `Ast::empty` is returned.
+ /// If this alternation contains exactly 1 AST, then the corresponding AST
+ /// is returned. Otherwise, `Ast::alternation` is returned.
pub fn into_ast(mut self) -> Ast {
match self.asts.len() {
- 0 => Ast::Empty(self.span),
+ 0 => Ast::empty(self.span),
1 => self.asts.pop().unwrap(),
- _ => Ast::Alternation(self),
+ _ => Ast::alternation(self),
}
}
}
/// A concatenation of regular expressions.
#[derive(Clone, Debug, Eq, PartialEq)]
+#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub struct Concat {
/// The span of this concatenation.
pub span: Span,
@@ -542,14 +655,14 @@ pub struct Concat {
impl Concat {
/// Return this concatenation as an AST.
///
- /// If this concatenation contains zero ASTs, then Ast::Empty is
- /// returned. If this concatenation contains exactly 1 AST, then the
- /// corresponding AST is returned. Otherwise, Ast::Concat is returned.
+ /// If this concatenation contains zero ASTs, then `Ast::empty` is returned.
+ /// If this concatenation contains exactly 1 AST, then the corresponding AST
+ /// is returned. Otherwise, `Ast::concat` is returned.
pub fn into_ast(mut self) -> Ast {
match self.asts.len() {
- 0 => Ast::Empty(self.span),
+ 0 => Ast::empty(self.span),
1 => self.asts.pop().unwrap(),
- _ => Ast::Concat(self),
+ _ => Ast::concat(self),
}
}
}
@@ -560,6 +673,7 @@ impl Concat {
/// represented in their literal form, e.g., `a` or in their escaped form,
/// e.g., `\x61`.
#[derive(Clone, Debug, Eq, PartialEq)]
+#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub struct Literal {
/// The span of this literal.
pub span: Span,
@@ -584,6 +698,7 @@ impl Literal {
/// The kind of a single literal expression.
#[derive(Clone, Debug, Eq, PartialEq)]
+#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub enum LiteralKind {
/// The literal is written verbatim, e.g., `a` or `☃`.
Verbatim,
@@ -613,6 +728,7 @@ pub enum LiteralKind {
/// A special literal is a special escape sequence recognized by the regex
/// parser, e.g., `\f` or `\n`.
#[derive(Clone, Debug, Eq, PartialEq)]
+#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub enum SpecialLiteralKind {
/// Bell, spelled `\a` (`\x07`).
Bell,
@@ -637,6 +753,7 @@ pub enum SpecialLiteralKind {
/// differ when used without brackets in the number of hex digits that must
/// follow.
#[derive(Clone, Debug, Eq, PartialEq)]
+#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub enum HexLiteralKind {
/// A `\x` prefix. When used without brackets, this form is limited to
/// two digits.
@@ -662,32 +779,9 @@ impl HexLiteralKind {
}
}
-/// A single character class expression.
-#[derive(Clone, Debug, Eq, PartialEq)]
-pub enum Class {
- /// A Unicode character class, e.g., `\pL` or `\p{Greek}`.
- Unicode(ClassUnicode),
- /// A perl character class, e.g., `\d` or `\W`.
- Perl(ClassPerl),
- /// A bracketed character class set, which may contain zero or more
- /// character ranges and/or zero or more nested classes. e.g.,
- /// `[a-zA-Z\pL]`.
- Bracketed(ClassBracketed),
-}
-
-impl Class {
- /// Return the span of this character class.
- pub fn span(&self) -> &Span {
- match *self {
- Class::Perl(ref x) => &x.span,
- Class::Unicode(ref x) => &x.span,
- Class::Bracketed(ref x) => &x.span,
- }
- }
-}
-
/// A Perl character class.
#[derive(Clone, Debug, Eq, PartialEq)]
+#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub struct ClassPerl {
/// The span of this class.
pub span: Span,
@@ -700,6 +794,7 @@ pub struct ClassPerl {
/// The available Perl character classes.
#[derive(Clone, Debug, Eq, PartialEq)]
+#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub enum ClassPerlKind {
/// Decimal numbers.
Digit,
@@ -711,6 +806,7 @@ pub enum ClassPerlKind {
/// An ASCII character class.
#[derive(Clone, Debug, Eq, PartialEq)]
+#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub struct ClassAscii {
/// The span of this class.
pub span: Span,
@@ -723,6 +819,7 @@ pub struct ClassAscii {
/// The available ASCII character classes.
#[derive(Clone, Debug, Eq, PartialEq)]
+#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub enum ClassAsciiKind {
/// `[0-9A-Za-z]`
Alnum,
@@ -786,6 +883,7 @@ impl ClassAsciiKind {
/// A Unicode character class.
#[derive(Clone, Debug, Eq, PartialEq)]
+#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub struct ClassUnicode {
/// The span of this class.
pub span: Span,
@@ -838,8 +936,156 @@ pub enum ClassUnicodeKind {
},
}
+#[cfg(feature = "arbitrary")]
+impl arbitrary::Arbitrary<'_> for ClassUnicodeKind {
+ fn arbitrary(
+ u: &mut arbitrary::Unstructured,
+ ) -> arbitrary::Result<ClassUnicodeKind> {
+ #[cfg(any(
+ feature = "unicode-age",
+ feature = "unicode-bool",
+ feature = "unicode-gencat",
+ feature = "unicode-perl",
+ feature = "unicode-script",
+ feature = "unicode-segment",
+ ))]
+ {
+ use alloc::string::ToString;
+
+ use super::unicode_tables::{
+ property_names::PROPERTY_NAMES,
+ property_values::PROPERTY_VALUES,
+ };
+
+ match u.choose_index(3)? {
+ 0 => {
+ let all = PROPERTY_VALUES
+ .iter()
+ .flat_map(|e| e.1.iter())
+ .filter(|(name, _)| name.len() == 1)
+ .count();
+ let idx = u.choose_index(all)?;
+ let value = PROPERTY_VALUES
+ .iter()
+ .flat_map(|e| e.1.iter())
+ .take(idx + 1)
+ .last()
+ .unwrap()
+ .0
+ .chars()
+ .next()
+ .unwrap();
+ Ok(ClassUnicodeKind::OneLetter(value))
+ }
+ 1 => {
+ let all = PROPERTY_VALUES
+ .iter()
+ .map(|e| e.1.len())
+ .sum::<usize>()
+ + PROPERTY_NAMES.len();
+ let idx = u.choose_index(all)?;
+ let name = PROPERTY_VALUES
+ .iter()
+ .flat_map(|e| e.1.iter())
+ .chain(PROPERTY_NAMES)
+ .map(|(_, e)| e)
+ .take(idx + 1)
+ .last()
+ .unwrap();
+ Ok(ClassUnicodeKind::Named(name.to_string()))
+ }
+ 2 => {
+ let all = PROPERTY_VALUES
+ .iter()
+ .map(|e| e.1.len())
+ .sum::<usize>();
+ let idx = u.choose_index(all)?;
+ let (prop, value) = PROPERTY_VALUES
+ .iter()
+ .flat_map(|e| {
+ e.1.iter().map(|(_, value)| (e.0, value))
+ })
+ .take(idx + 1)
+ .last()
+ .unwrap();
+ Ok(ClassUnicodeKind::NamedValue {
+ op: u.arbitrary()?,
+ name: prop.to_string(),
+ value: value.to_string(),
+ })
+ }
+ _ => unreachable!("index chosen is impossible"),
+ }
+ }
+ #[cfg(not(any(
+ feature = "unicode-age",
+ feature = "unicode-bool",
+ feature = "unicode-gencat",
+ feature = "unicode-perl",
+ feature = "unicode-script",
+ feature = "unicode-segment",
+ )))]
+ {
+ match u.choose_index(3)? {
+ 0 => Ok(ClassUnicodeKind::OneLetter(u.arbitrary()?)),
+ 1 => Ok(ClassUnicodeKind::Named(u.arbitrary()?)),
+ 2 => Ok(ClassUnicodeKind::NamedValue {
+ op: u.arbitrary()?,
+ name: u.arbitrary()?,
+ value: u.arbitrary()?,
+ }),
+ _ => unreachable!("index chosen is impossible"),
+ }
+ }
+ }
+
+ fn size_hint(depth: usize) -> (usize, Option<usize>) {
+ #[cfg(any(
+ feature = "unicode-age",
+ feature = "unicode-bool",
+ feature = "unicode-gencat",
+ feature = "unicode-perl",
+ feature = "unicode-script",
+ feature = "unicode-segment",
+ ))]
+ {
+ arbitrary::size_hint::and_all(&[
+ usize::size_hint(depth),
+ usize::size_hint(depth),
+ arbitrary::size_hint::or(
+ (0, Some(0)),
+ ClassUnicodeOpKind::size_hint(depth),
+ ),
+ ])
+ }
+ #[cfg(not(any(
+ feature = "unicode-age",
+ feature = "unicode-bool",
+ feature = "unicode-gencat",
+ feature = "unicode-perl",
+ feature = "unicode-script",
+ feature = "unicode-segment",
+ )))]
+ {
+ arbitrary::size_hint::and(
+ usize::size_hint(depth),
+ arbitrary::size_hint::or_all(&[
+ char::size_hint(depth),
+ String::size_hint(depth),
+ arbitrary::size_hint::and_all(&[
+ String::size_hint(depth),
+ String::size_hint(depth),
+ ClassUnicodeOpKind::size_hint(depth),
+ ]),
+ ]),
+ )
+ }
+ }
+}
+
/// The type of op used in a Unicode character class.
#[derive(Clone, Debug, Eq, PartialEq)]
+#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub enum ClassUnicodeOpKind {
/// A property set to a specific value, e.g., `\p{scx=Katakana}`.
Equal,
@@ -862,6 +1108,7 @@ impl ClassUnicodeOpKind {
/// A bracketed character class, e.g., `[a-z0-9]`.
#[derive(Clone, Debug, Eq, PartialEq)]
+#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub struct ClassBracketed {
/// The span of this class.
pub span: Span,
@@ -880,6 +1127,7 @@ pub struct ClassBracketed {
/// items (literals, ranges, other bracketed classes) or a tree of binary set
/// operations.
#[derive(Clone, Debug, Eq, PartialEq)]
+#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub enum ClassSet {
/// An item, which can be a single literal, range, nested character class
/// or a union of items.
@@ -913,6 +1161,7 @@ impl ClassSet {
/// A single component of a character class set.
#[derive(Clone, Debug, Eq, PartialEq)]
+#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub enum ClassSetItem {
/// An empty item.
///
@@ -956,6 +1205,7 @@ impl ClassSetItem {
/// A single character class range in a set.
#[derive(Clone, Debug, Eq, PartialEq)]
+#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub struct ClassSetRange {
/// The span of this range.
pub span: Span,
@@ -977,6 +1227,7 @@ impl ClassSetRange {
/// A union of items inside a character class set.
#[derive(Clone, Debug, Eq, PartialEq)]
+#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub struct ClassSetUnion {
/// The span of the items in this operation. e.g., the `a-z0-9` in
/// `[^a-z0-9]`
@@ -1021,6 +1272,7 @@ impl ClassSetUnion {
/// A Unicode character class set operation.
#[derive(Clone, Debug, Eq, PartialEq)]
+#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub struct ClassSetBinaryOp {
/// The span of this operation. e.g., the `a-z--[h-p]` in `[a-z--h-p]`.
pub span: Span,
@@ -1038,6 +1290,7 @@ pub struct ClassSetBinaryOp {
/// explicit union operator. Concatenation inside a character class corresponds
/// to the union operation.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub enum ClassSetBinaryOpKind {
/// The intersection of two sets, e.g., `\pN&&[a-z]`.
Intersection,
@@ -1051,6 +1304,7 @@ pub enum ClassSetBinaryOpKind {
/// A single zero-width assertion.
#[derive(Clone, Debug, Eq, PartialEq)]
+#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub struct Assertion {
/// The span of this assertion.
pub span: Span,
@@ -1060,6 +1314,7 @@ pub struct Assertion {
/// An assertion kind.
#[derive(Clone, Debug, Eq, PartialEq)]
+#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub enum AssertionKind {
/// `^`
StartLine,
@@ -1073,10 +1328,23 @@ pub enum AssertionKind {
WordBoundary,
/// `\B`
NotWordBoundary,
+ /// `\b{start}`
+ WordBoundaryStart,
+ /// `\b{end}`
+ WordBoundaryEnd,
+ /// `\<` (alias for `\b{start}`)
+ WordBoundaryStartAngle,
+ /// `\>` (alias for `\b{end}`)
+ WordBoundaryEndAngle,
+ /// `\b{start-half}`
+ WordBoundaryStartHalf,
+ /// `\b{end-half}`
+ WordBoundaryEndHalf,
}
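
Note: the six new AssertionKind variants correspond to the `\b{start}`, `\b{end}`, `\b{start-half}` and `\b{end-half}` assertions and the `\<`/`\>` aliases handled by the parser changes below. A small sketch of the expected shape of a parse, using only items visible in this diff (the concrete pattern is illustrative, not one of the patch's own tests):

use regex_syntax::ast::{parse::Parser, Ast, AssertionKind};

fn main() {
    // `\b{start}` should now parse as a single zero-width assertion node.
    let ast = Parser::new().parse(r"\b{start}").unwrap();
    match ast {
        Ast::Assertion(a) => assert_eq!(a.kind, AssertionKind::WordBoundaryStart),
        _ => unreachable!("expected a lone assertion"),
    }
}
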
/// A repetition operation applied to a regular expression.
#[derive(Clone, Debug, Eq, PartialEq)]
+#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub struct Repetition {
/// The span of this operation.
pub span: Span,
@@ -1090,6 +1358,7 @@ pub struct Repetition {
/// The repetition operator itself.
#[derive(Clone, Debug, Eq, PartialEq)]
+#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub struct RepetitionOp {
/// The span of this operator. This includes things like `+`, `*?` and
/// `{m,n}`.
@@ -1100,6 +1369,7 @@ pub struct RepetitionOp {
/// The kind of a repetition operator.
#[derive(Clone, Debug, Eq, PartialEq)]
+#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub enum RepetitionKind {
/// `?`
ZeroOrOne,
@@ -1113,6 +1383,7 @@ pub enum RepetitionKind {
/// A range repetition operator.
#[derive(Clone, Debug, Eq, PartialEq)]
+#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub enum RepetitionRange {
/// `{m}`
Exactly(u32),
@@ -1142,6 +1413,7 @@ impl RepetitionRange {
/// contains a sub-expression, e.g., `(a)`, `(?P<name>a)`, `(?:a)` and
/// `(?is:a)`.
#[derive(Clone, Debug, Eq, PartialEq)]
+#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub struct Group {
/// The span of this group.
pub span: Span,
@@ -1183,6 +1455,7 @@ impl Group {
/// The kind of a group.
#[derive(Clone, Debug, Eq, PartialEq)]
+#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub enum GroupKind {
/// `(a)`
CaptureIndex(u32),
@@ -1211,8 +1484,38 @@ pub struct CaptureName {
pub index: u32,
}
+#[cfg(feature = "arbitrary")]
+impl arbitrary::Arbitrary<'_> for CaptureName {
+ fn arbitrary(
+ u: &mut arbitrary::Unstructured,
+ ) -> arbitrary::Result<CaptureName> {
+ let len = u.arbitrary_len::<char>()?;
+ if len == 0 {
+ return Err(arbitrary::Error::NotEnoughData);
+ }
+ let mut name: String = String::new();
+ for _ in 0..len {
+ let ch: char = u.arbitrary()?;
+ let cp = u32::from(ch);
+ let ascii_letter_offset = u8::try_from(cp % 26).unwrap();
+ let ascii_letter = b'a' + ascii_letter_offset;
+ name.push(char::from(ascii_letter));
+ }
+ Ok(CaptureName { span: u.arbitrary()?, name, index: u.arbitrary()? })
+ }
+
+ fn size_hint(depth: usize) -> (usize, Option<usize>) {
+ arbitrary::size_hint::and_all(&[
+ Span::size_hint(depth),
+ usize::size_hint(depth),
+ u32::size_hint(depth),
+ ])
+ }
+}
+
/// A group of flags that is not applied to a particular regular expression.
#[derive(Clone, Debug, Eq, PartialEq)]
+#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub struct SetFlags {
/// The span of these flags, including the grouping parentheses.
pub span: Span,
@@ -1224,6 +1527,7 @@ pub struct SetFlags {
///
/// This corresponds only to the sequence of flags themselves, e.g., `is-u`.
#[derive(Clone, Debug, Eq, PartialEq)]
+#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub struct Flags {
/// The span of this group of flags.
pub span: Span,
@@ -1276,6 +1580,7 @@ impl Flags {
/// A single item in a group of flags.
#[derive(Clone, Debug, Eq, PartialEq)]
+#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub struct FlagsItem {
/// The span of this item.
pub span: Span,
@@ -1285,6 +1590,7 @@ pub struct FlagsItem {
/// The kind of an item in a group of flags.
#[derive(Clone, Debug, Eq, PartialEq)]
+#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub enum FlagsItemKind {
/// A negation operator applied to all subsequent flags in the enclosing
/// group.
@@ -1305,6 +1611,7 @@ impl FlagsItemKind {
/// A single flag.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub enum Flag {
/// `i`
CaseInsensitive,
@@ -1334,8 +1641,10 @@ impl Drop for Ast {
| Ast::Literal(_)
| Ast::Dot(_)
| Ast::Assertion(_)
- // Classes are recursive, so they get their own Drop impl.
- | Ast::Class(_) => return,
+ | Ast::ClassUnicode(_)
+ | Ast::ClassPerl(_)
+ // Bracketed classes are recursive, so they get their own Drop impl.
+ | Ast::ClassBracketed(_) => return,
Ast::Repetition(ref x) if !x.ast.has_subexprs() => return,
Ast::Group(ref x) if !x.ast.has_subexprs() => return,
Ast::Alternation(ref x) if x.asts.is_empty() => return,
@@ -1344,7 +1653,7 @@ impl Drop for Ast {
}
let empty_span = || Span::splat(Position::new(0, 0, 0));
- let empty_ast = || Ast::Empty(empty_span());
+ let empty_ast = || Ast::empty(empty_span());
let mut stack = vec![mem::replace(self, empty_ast())];
while let Some(mut ast) = stack.pop() {
match ast {
@@ -1353,8 +1662,11 @@ impl Drop for Ast {
| Ast::Literal(_)
| Ast::Dot(_)
| Ast::Assertion(_)
- // Classes are recursive, so they get their own Drop impl.
- | Ast::Class(_) => {}
+ | Ast::ClassUnicode(_)
+ | Ast::ClassPerl(_)
+ // Bracketed classes are recursive, so they get their own Drop
+ // impl.
+ | Ast::ClassBracketed(_) => {}
Ast::Repetition(ref mut x) => {
stack.push(mem::replace(&mut x.ast, empty_ast()));
}
@@ -1447,9 +1759,9 @@ mod tests {
let run = || {
let span = || Span::splat(Position::new(0, 0, 0));
- let mut ast = Ast::Empty(span());
+ let mut ast = Ast::empty(span());
for i in 0..200 {
- ast = Ast::Group(Group {
+ ast = Ast::group(Group {
span: span(),
kind: GroupKind::CaptureIndex(i),
ast: Box::new(ast),
@@ -1478,4 +1790,20 @@ mod tests {
.join()
.unwrap();
}
+
+ // This tests that our `Ast` has a reasonable size. This isn't a hard rule
+ // and it can be increased if given a good enough reason. But this test
+ // exists because the size of `Ast` was at one point over 200 bytes on a
+ // 64-bit target. Wow.
+ #[test]
+ fn ast_size() {
+ let max = 2 * core::mem::size_of::<usize>();
+ let size = core::mem::size_of::<Ast>();
+ assert!(
+ size <= max,
+ "Ast size of {} bytes is bigger than suggested max {}",
+ size,
+ max
+ );
+ }
}
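
Note on the mod.rs changes as a whole: every `Ast` variant now carries a `Box`, and the new `Ast::empty`, `Ast::literal`, etc. constructors do the boxing, which is what lets the `ast_size` test above pin the enum at two machine words. A minimal sketch of the new calling convention, using only items visible in this diff:

use regex_syntax::ast::{Ast, Position, Span};

fn main() {
    let span = Span::splat(Position::new(0, 0, 0));
    // Constructor helpers box the payload; callers no longer write
    // Ast::Empty(span) or box variant payloads by hand.
    let ast = Ast::empty(span);
    assert!(matches!(ast, Ast::Empty(_)));
    // Matches the ast_size test: the enum stays at most two words wide.
    assert!(core::mem::size_of::<Ast>() <= 2 * core::mem::size_of::<usize>());
}
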
diff --git a/vendor/regex-syntax/src/ast/parse.rs b/vendor/regex-syntax/src/ast/parse.rs
index 9cf64e9ec..593b14fbc 100644
--- a/vendor/regex-syntax/src/ast/parse.rs
+++ b/vendor/regex-syntax/src/ast/parse.rs
@@ -53,11 +53,11 @@ impl Primitive {
/// Convert this primitive into a proper AST.
fn into_ast(self) -> Ast {
match self {
- Primitive::Literal(lit) => Ast::Literal(lit),
- Primitive::Assertion(assert) => Ast::Assertion(assert),
- Primitive::Dot(span) => Ast::Dot(span),
- Primitive::Perl(cls) => Ast::Class(ast::Class::Perl(cls)),
- Primitive::Unicode(cls) => Ast::Class(ast::Class::Unicode(cls)),
+ Primitive::Literal(lit) => Ast::literal(lit),
+ Primitive::Assertion(assert) => Ast::assertion(assert),
+ Primitive::Dot(span) => Ast::dot(span),
+ Primitive::Perl(cls) => Ast::class_perl(cls),
+ Primitive::Unicode(cls) => Ast::class_unicode(cls),
}
}
@@ -383,7 +383,7 @@ impl<'s, P: Borrow<Parser>> ParserI<'s, P> {
/// Return a reference to the pattern being parsed.
fn pattern(&self) -> &str {
- self.pattern.borrow()
+ self.pattern
}
/// Create a new error with the given span and error type.
@@ -691,7 +691,7 @@ impl<'s, P: Borrow<Parser>> ParserI<'s, P> {
self.parser().ignore_whitespace.set(v);
}
- concat.asts.push(Ast::Flags(set));
+ concat.asts.push(Ast::flags(set));
Ok(concat)
}
Either::Right(group) => {
@@ -764,7 +764,7 @@ impl<'s, P: Borrow<Parser>> ParserI<'s, P> {
group.ast = Box::new(group_concat.into_ast());
}
}
- prior_concat.asts.push(Ast::Group(group));
+ prior_concat.asts.push(Ast::group(group));
Ok(prior_concat)
}
@@ -783,7 +783,7 @@ impl<'s, P: Borrow<Parser>> ParserI<'s, P> {
Some(GroupState::Alternation(mut alt)) => {
alt.span.end = self.pos();
alt.asts.push(concat.into_ast());
- Ok(Ast::Alternation(alt))
+ Ok(Ast::alternation(alt))
}
Some(GroupState::Group { group, .. }) => {
return Err(
@@ -850,7 +850,7 @@ impl<'s, P: Borrow<Parser>> ParserI<'s, P> {
fn pop_class(
&self,
nested_union: ast::ClassSetUnion,
- ) -> Result<Either<ast::ClassSetUnion, ast::Class>> {
+ ) -> Result<Either<ast::ClassSetUnion, ast::ClassBracketed>> {
assert_eq!(self.char(), ']');
let item = ast::ClassSet::Item(nested_union.into_item());
@@ -882,7 +882,7 @@ impl<'s, P: Borrow<Parser>> ParserI<'s, P> {
set.span.end = self.pos();
set.kind = prevset;
if stack.is_empty() {
- Ok(Either::Right(ast::Class::Bracketed(set)))
+ Ok(Either::Right(set))
} else {
union.push(ast::ClassSetItem::Bracketed(Box::new(set)));
Ok(Either::Left(union))
@@ -976,7 +976,7 @@ impl<'s, P: Borrow<Parser>> ParserI<'s, P> {
'|' => concat = self.push_alternate(concat)?,
'[' => {
let class = self.parse_set_class()?;
- concat.asts.push(Ast::Class(class));
+ concat.asts.push(Ast::class_bracketed(class));
}
'?' => {
concat = self.parse_uncounted_repetition(
@@ -1057,7 +1057,7 @@ impl<'s, P: Borrow<Parser>> ParserI<'s, P> {
greedy = false;
self.bump();
}
- concat.asts.push(Ast::Repetition(ast::Repetition {
+ concat.asts.push(Ast::repetition(ast::Repetition {
span: ast.span().with_end(self.pos()),
op: ast::RepetitionOp {
span: Span::new(op_start, self.pos()),
@@ -1159,7 +1159,7 @@ impl<'s, P: Borrow<Parser>> ParserI<'s, P> {
self.error(op_span, ast::ErrorKind::RepetitionCountInvalid)
);
}
- concat.asts.push(Ast::Repetition(ast::Repetition {
+ concat.asts.push(Ast::repetition(ast::Repetition {
span: ast.span().with_end(self.pos()),
op: ast::RepetitionOp {
span: op_span,
@@ -1212,7 +1212,7 @@ impl<'s, P: Borrow<Parser>> ParserI<'s, P> {
Ok(Either::Right(ast::Group {
span: open_span,
kind: ast::GroupKind::CaptureName { starts_with_p, name },
- ast: Box::new(Ast::Empty(self.span())),
+ ast: Box::new(Ast::empty(self.span())),
}))
} else if self.bump_if("?") {
if self.is_eof() {
@@ -1241,7 +1241,7 @@ impl<'s, P: Borrow<Parser>> ParserI<'s, P> {
Ok(Either::Right(ast::Group {
span: open_span,
kind: ast::GroupKind::NonCapturing(flags),
- ast: Box::new(Ast::Empty(self.span())),
+ ast: Box::new(Ast::empty(self.span())),
}))
}
} else {
@@ -1249,7 +1249,7 @@ impl<'s, P: Borrow<Parser>> ParserI<'s, P> {
Ok(Either::Right(ast::Group {
span: open_span,
kind: ast::GroupKind::CaptureIndex(capture_index),
- ast: Box::new(Ast::Empty(self.span())),
+ ast: Box::new(Ast::empty(self.span())),
}))
}
}
@@ -1528,18 +1528,115 @@ impl<'s, P: Borrow<Parser>> ParserI<'s, P> {
span,
kind: ast::AssertionKind::EndText,
})),
- 'b' => Ok(Primitive::Assertion(ast::Assertion {
- span,
- kind: ast::AssertionKind::WordBoundary,
- })),
+ 'b' => {
+ let mut wb = ast::Assertion {
+ span,
+ kind: ast::AssertionKind::WordBoundary,
+ };
+ // After a \b, we "try" to parse things like \b{start} for
+ // special word boundary assertions.
+ if !self.is_eof() && self.char() == '{' {
+ if let Some(kind) =
+ self.maybe_parse_special_word_boundary(start)?
+ {
+ wb.kind = kind;
+ wb.span.end = self.pos();
+ }
+ }
+ Ok(Primitive::Assertion(wb))
+ }
'B' => Ok(Primitive::Assertion(ast::Assertion {
span,
kind: ast::AssertionKind::NotWordBoundary,
})),
+ '<' => Ok(Primitive::Assertion(ast::Assertion {
+ span,
+ kind: ast::AssertionKind::WordBoundaryStartAngle,
+ })),
+ '>' => Ok(Primitive::Assertion(ast::Assertion {
+ span,
+ kind: ast::AssertionKind::WordBoundaryEndAngle,
+ })),
_ => Err(self.error(span, ast::ErrorKind::EscapeUnrecognized)),
}
}
+ /// Attempt to parse a specialty word boundary. That is, `\b{start}`,
+ /// `\b{end}`, `\b{start-half}` or `\b{end-half}`.
+ ///
+ /// This is similar to `maybe_parse_ascii_class` in that, in most cases,
+ /// if it fails it will just return `None` with no error. This is done
+ /// because `\b{5}` is a valid expression and we want to let that be parsed
+ /// by the existing counted repetition parsing code. (I thought about just
+ /// invoking the counted repetition code from here, but it seemed a little
+ /// ham-fisted.)
+ ///
+ /// Unlike `maybe_parse_ascii_class` though, this can return an error.
+ /// Namely, if we definitely know it isn't a counted repetition, then we
+ /// return an error specific to the specialty word boundaries.
+ ///
+ /// This assumes the parser is positioned at a `{` immediately following
+ /// a `\b`. When `None` is returned, the parser is returned to the position
+ /// at which it started: pointing at a `{`.
+ ///
+ /// The position given should correspond to the start of the `\b`.
+ fn maybe_parse_special_word_boundary(
+ &self,
+ wb_start: Position,
+ ) -> Result<Option<ast::AssertionKind>> {
+ assert_eq!(self.char(), '{');
+
+ let is_valid_char = |c| match c {
+ 'A'..='Z' | 'a'..='z' | '-' => true,
+ _ => false,
+ };
+ let start = self.pos();
+ if !self.bump_and_bump_space() {
+ return Err(self.error(
+ Span::new(wb_start, self.pos()),
+ ast::ErrorKind::SpecialWordOrRepetitionUnexpectedEof,
+ ));
+ }
+ let start_contents = self.pos();
+ // This is one of the critical bits: if the first non-whitespace
+ // character isn't in [-A-Za-z] (i.e., this can't be a special word
+ // boundary), then we bail and let the counted repetition parser deal
+ // with this.
+ if !is_valid_char(self.char()) {
+ self.parser().pos.set(start);
+ return Ok(None);
+ }
+
+ // Now collect up our chars until we see a '}'.
+ let mut scratch = self.parser().scratch.borrow_mut();
+ scratch.clear();
+ while !self.is_eof() && is_valid_char(self.char()) {
+ scratch.push(self.char());
+ self.bump_and_bump_space();
+ }
+ if self.is_eof() || self.char() != '}' {
+ return Err(self.error(
+ Span::new(start, self.pos()),
+ ast::ErrorKind::SpecialWordBoundaryUnclosed,
+ ));
+ }
+ let end = self.pos();
+ self.bump();
+ let kind = match scratch.as_str() {
+ "start" => ast::AssertionKind::WordBoundaryStart,
+ "end" => ast::AssertionKind::WordBoundaryEnd,
+ "start-half" => ast::AssertionKind::WordBoundaryStartHalf,
+ "end-half" => ast::AssertionKind::WordBoundaryEndHalf,
+ _ => {
+ return Err(self.error(
+ Span::new(start_contents, end),
+ ast::ErrorKind::SpecialWordBoundaryUnrecognized,
+ ))
+ }
+ };
+ Ok(Some(kind))
+ }
+
/// Parse an octal representation of a Unicode codepoint up to 3 digits
/// long. This expects the parser to be positioned at the first octal
/// digit and advances the parser to the first character immediately
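
Note: the doc comment on maybe_parse_special_word_boundary above describes how a `{` after `\b` is disambiguated. A quick sketch of the two outcomes from the caller's point of view, hedged and with illustrative patterns rather than the patch's own tests:

use regex_syntax::ast::{parse::Parser, Ast};

fn main() {
    // Digits after the brace: the `{` is handed back to the counted
    // repetition parser, so `\b{5}` is a bounded repetition of `\b`.
    assert!(matches!(Parser::new().parse(r"\b{5}").unwrap(), Ast::Repetition(_)));
    // A recognized name after the brace: a special word boundary assertion.
    assert!(matches!(Parser::new().parse(r"\b{start}").unwrap(), Ast::Assertion(_)));
}
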
@@ -1743,7 +1840,7 @@ impl<'s, P: Borrow<Parser>> ParserI<'s, P> {
/// is successful, then the parser is advanced to the position immediately
/// following the closing `]`.
#[inline(never)]
- fn parse_set_class(&self) -> Result<ast::Class> {
+ fn parse_set_class(&self) -> Result<ast::ClassBracketed> {
assert_eq!(self.char(), '[');
let mut union =
@@ -1967,9 +2064,9 @@ impl<'s, P: Borrow<Parser>> ParserI<'s, P> {
// because parsing cannot fail with any interesting error. For example,
// in order to use an ASCII character class, it must be enclosed in
// double brackets, e.g., `[[:alnum:]]`. Alternatively, you might think
- // of it as "ASCII character characters have the syntax `[:NAME:]`
- // which can only appear within character brackets." This means that
- // things like `[[:lower:]A]` are legal constructs.
+ // of it as "ASCII character classes have the syntax `[:NAME:]` which
+ // can only appear within character brackets." This means that things
+ // like `[[:lower:]A]` are legal constructs.
//
// However, if one types an incorrect ASCII character class, e.g.,
// `[[:loower:]]`, then we treat that as a normal nested character
@@ -2189,12 +2286,12 @@ impl<'p, 's, P: Borrow<Parser>> ast::Visitor for NestLimiter<'p, 's, P> {
| Ast::Literal(_)
| Ast::Dot(_)
| Ast::Assertion(_)
- | Ast::Class(ast::Class::Unicode(_))
- | Ast::Class(ast::Class::Perl(_)) => {
+ | Ast::ClassUnicode(_)
+ | Ast::ClassPerl(_) => {
// These are all base cases, so we don't increment depth.
return Ok(());
}
- Ast::Class(ast::Class::Bracketed(ref x)) => &x.span,
+ Ast::ClassBracketed(ref x) => &x.span,
Ast::Repetition(ref x) => &x.span,
Ast::Group(ref x) => &x.span,
Ast::Alternation(ref x) => &x.span,
@@ -2210,12 +2307,12 @@ impl<'p, 's, P: Borrow<Parser>> ast::Visitor for NestLimiter<'p, 's, P> {
| Ast::Literal(_)
| Ast::Dot(_)
| Ast::Assertion(_)
- | Ast::Class(ast::Class::Unicode(_))
- | Ast::Class(ast::Class::Perl(_)) => {
+ | Ast::ClassUnicode(_)
+ | Ast::ClassPerl(_) => {
// These are all base cases, so we don't decrement depth.
Ok(())
}
- Ast::Class(ast::Class::Bracketed(_))
+ Ast::ClassBracketed(_)
| Ast::Repetition(_)
| Ast::Group(_)
| Ast::Alternation(_)
@@ -2426,12 +2523,12 @@ mod tests {
/// Create a meta literal starting at the given position.
fn meta_lit(c: char, span: Span) -> Ast {
- Ast::Literal(ast::Literal { span, kind: ast::LiteralKind::Meta, c })
+ Ast::literal(ast::Literal { span, kind: ast::LiteralKind::Meta, c })
}
/// Create a verbatim literal with the given span.
fn lit_with(c: char, span: Span) -> Ast {
- Ast::Literal(ast::Literal {
+ Ast::literal(ast::Literal {
span,
kind: ast::LiteralKind::Verbatim,
c,
@@ -2445,17 +2542,17 @@ mod tests {
/// Create a concatenation with the given span.
fn concat_with(span: Span, asts: Vec<Ast>) -> Ast {
- Ast::Concat(ast::Concat { span, asts })
+ Ast::concat(ast::Concat { span, asts })
}
/// Create an alternation with the given span.
fn alt(range: Range<usize>, asts: Vec<Ast>) -> Ast {
- Ast::Alternation(ast::Alternation { span: span(range), asts })
+ Ast::alternation(ast::Alternation { span: span(range), asts })
}
/// Create a capturing group with the given span.
fn group(range: Range<usize>, index: u32, ast: Ast) -> Ast {
- Ast::Group(ast::Group {
+ Ast::group(ast::Group {
span: span(range),
kind: ast::GroupKind::CaptureIndex(index),
ast: Box::new(ast),
@@ -2488,7 +2585,7 @@ mod tests {
},
);
}
- Ast::Flags(ast::SetFlags {
+ Ast::flags(ast::SetFlags {
span: span_range(pat, range.clone()),
flags: ast::Flags {
span: span_range(pat, (range.start + 2)..(range.end - 1)),
@@ -2502,7 +2599,7 @@ mod tests {
// A nest limit of 0 still allows some types of regexes.
assert_eq!(
parser_nest_limit("", 0).parse(),
- Ok(Ast::Empty(span(0..0)))
+ Ok(Ast::empty(span(0..0)))
);
assert_eq!(parser_nest_limit("a", 0).parse(), Ok(lit('a', 0)));
@@ -2516,7 +2613,7 @@ mod tests {
);
assert_eq!(
parser_nest_limit("a+", 1).parse(),
- Ok(Ast::Repetition(ast::Repetition {
+ Ok(Ast::repetition(ast::Repetition {
span: span(0..2),
op: ast::RepetitionOp {
span: span(1..2),
@@ -2542,14 +2639,14 @@ mod tests {
);
assert_eq!(
parser_nest_limit("a+*", 2).parse(),
- Ok(Ast::Repetition(ast::Repetition {
+ Ok(Ast::repetition(ast::Repetition {
span: span(0..3),
op: ast::RepetitionOp {
span: span(2..3),
kind: ast::RepetitionKind::ZeroOrMore,
},
greedy: true,
- ast: Box::new(Ast::Repetition(ast::Repetition {
+ ast: Box::new(Ast::repetition(ast::Repetition {
span: span(0..2),
op: ast::RepetitionOp {
span: span(1..2),
@@ -2606,7 +2703,7 @@ mod tests {
);
assert_eq!(
parser_nest_limit("[a]", 1).parse(),
- Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
+ Ok(Ast::class_bracketed(ast::ClassBracketed {
span: span(0..3),
negated: false,
kind: ast::ClassSet::Item(ast::ClassSetItem::Literal(
@@ -2616,7 +2713,7 @@ mod tests {
c: 'a',
}
)),
- })))
+ }))
);
assert_eq!(
parser_nest_limit("[ab]", 1).parse().unwrap_err(),
@@ -2776,7 +2873,7 @@ bar
vec![
lit_with('a', span_range(pat, 0..1)),
lit_with(' ', span_range(pat, 1..2)),
- Ast::Group(ast::Group {
+ Ast::group(ast::Group {
span: span_range(pat, 2..9),
kind: ast::GroupKind::NonCapturing(ast::Flags {
span: span_range(pat, 4..5),
@@ -2803,7 +2900,7 @@ bar
span_range(pat, 0..pat.len()),
vec![
flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
- Ast::Group(ast::Group {
+ Ast::group(ast::Group {
span: span_range(pat, 4..pat.len()),
kind: ast::GroupKind::CaptureName {
starts_with_p: true,
@@ -2825,7 +2922,7 @@ bar
span_range(pat, 0..pat.len()),
vec![
flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
- Ast::Group(ast::Group {
+ Ast::group(ast::Group {
span: span_range(pat, 4..pat.len()),
kind: ast::GroupKind::CaptureIndex(1),
ast: Box::new(lit_with('a', span_range(pat, 7..8))),
@@ -2840,7 +2937,7 @@ bar
span_range(pat, 0..pat.len()),
vec![
flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
- Ast::Group(ast::Group {
+ Ast::group(ast::Group {
span: span_range(pat, 4..pat.len()),
kind: ast::GroupKind::NonCapturing(ast::Flags {
span: span_range(pat, 8..8),
@@ -2858,7 +2955,7 @@ bar
span_range(pat, 0..pat.len()),
vec![
flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
- Ast::Literal(ast::Literal {
+ Ast::literal(ast::Literal {
span: span(4..13),
kind: ast::LiteralKind::HexBrace(
ast::HexLiteralKind::X
@@ -2877,7 +2974,7 @@ bar
span_range(pat, 0..pat.len()),
vec![
flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
- Ast::Literal(ast::Literal {
+ Ast::literal(ast::Literal {
span: span_range(pat, 4..6),
kind: ast::LiteralKind::Superfluous,
c: ' ',
@@ -2895,9 +2992,9 @@ bar
Ok(concat_with(
span_range(pat, 0..3),
vec![
- Ast::Dot(span_range(pat, 0..1)),
+ Ast::dot(span_range(pat, 0..1)),
lit_with('\n', span_range(pat, 1..2)),
- Ast::Dot(span_range(pat, 2..3)),
+ Ast::dot(span_range(pat, 2..3)),
]
))
);
@@ -2933,7 +3030,7 @@ bar
fn parse_uncounted_repetition() {
assert_eq!(
parser(r"a*").parse(),
- Ok(Ast::Repetition(ast::Repetition {
+ Ok(Ast::repetition(ast::Repetition {
span: span(0..2),
op: ast::RepetitionOp {
span: span(1..2),
@@ -2945,7 +3042,7 @@ bar
);
assert_eq!(
parser(r"a+").parse(),
- Ok(Ast::Repetition(ast::Repetition {
+ Ok(Ast::repetition(ast::Repetition {
span: span(0..2),
op: ast::RepetitionOp {
span: span(1..2),
@@ -2958,7 +3055,7 @@ bar
assert_eq!(
parser(r"a?").parse(),
- Ok(Ast::Repetition(ast::Repetition {
+ Ok(Ast::repetition(ast::Repetition {
span: span(0..2),
op: ast::RepetitionOp {
span: span(1..2),
@@ -2970,7 +3067,7 @@ bar
);
assert_eq!(
parser(r"a??").parse(),
- Ok(Ast::Repetition(ast::Repetition {
+ Ok(Ast::repetition(ast::Repetition {
span: span(0..3),
op: ast::RepetitionOp {
span: span(1..3),
@@ -2982,7 +3079,7 @@ bar
);
assert_eq!(
parser(r"a?").parse(),
- Ok(Ast::Repetition(ast::Repetition {
+ Ok(Ast::repetition(ast::Repetition {
span: span(0..2),
op: ast::RepetitionOp {
span: span(1..2),
@@ -2997,7 +3094,7 @@ bar
Ok(concat(
0..3,
vec![
- Ast::Repetition(ast::Repetition {
+ Ast::repetition(ast::Repetition {
span: span(0..2),
op: ast::RepetitionOp {
span: span(1..2),
@@ -3015,7 +3112,7 @@ bar
Ok(concat(
0..4,
vec![
- Ast::Repetition(ast::Repetition {
+ Ast::repetition(ast::Repetition {
span: span(0..3),
op: ast::RepetitionOp {
span: span(1..3),
@@ -3034,7 +3131,7 @@ bar
0..3,
vec![
lit('a', 0),
- Ast::Repetition(ast::Repetition {
+ Ast::repetition(ast::Repetition {
span: span(1..3),
op: ast::RepetitionOp {
span: span(2..3),
@@ -3048,7 +3145,7 @@ bar
);
assert_eq!(
parser(r"(ab)?").parse(),
- Ok(Ast::Repetition(ast::Repetition {
+ Ok(Ast::repetition(ast::Repetition {
span: span(0..5),
op: ast::RepetitionOp {
span: span(4..5),
@@ -3067,8 +3164,8 @@ bar
Ok(alt(
0..3,
vec![
- Ast::Empty(span(0..0)),
- Ast::Repetition(ast::Repetition {
+ Ast::empty(span(0..0)),
+ Ast::repetition(ast::Repetition {
span: span(1..3),
op: ast::RepetitionOp {
span: span(2..3),
@@ -3157,7 +3254,7 @@ bar
fn parse_counted_repetition() {
assert_eq!(
parser(r"a{5}").parse(),
- Ok(Ast::Repetition(ast::Repetition {
+ Ok(Ast::repetition(ast::Repetition {
span: span(0..4),
op: ast::RepetitionOp {
span: span(1..4),
@@ -3171,7 +3268,7 @@ bar
);
assert_eq!(
parser(r"a{5,}").parse(),
- Ok(Ast::Repetition(ast::Repetition {
+ Ok(Ast::repetition(ast::Repetition {
span: span(0..5),
op: ast::RepetitionOp {
span: span(1..5),
@@ -3185,7 +3282,7 @@ bar
);
assert_eq!(
parser(r"a{5,9}").parse(),
- Ok(Ast::Repetition(ast::Repetition {
+ Ok(Ast::repetition(ast::Repetition {
span: span(0..6),
op: ast::RepetitionOp {
span: span(1..6),
@@ -3199,7 +3296,7 @@ bar
);
assert_eq!(
parser(r"a{5}?").parse(),
- Ok(Ast::Repetition(ast::Repetition {
+ Ok(Ast::repetition(ast::Repetition {
span: span(0..5),
op: ast::RepetitionOp {
span: span(1..5),
@@ -3217,7 +3314,7 @@ bar
0..5,
vec![
lit('a', 0),
- Ast::Repetition(ast::Repetition {
+ Ast::repetition(ast::Repetition {
span: span(1..5),
op: ast::RepetitionOp {
span: span(2..5),
@@ -3237,7 +3334,7 @@ bar
0..6,
vec![
lit('a', 0),
- Ast::Repetition(ast::Repetition {
+ Ast::repetition(ast::Repetition {
span: span(1..5),
op: ast::RepetitionOp {
span: span(2..5),
@@ -3255,7 +3352,7 @@ bar
assert_eq!(
parser(r"a{ 5 }").parse(),
- Ok(Ast::Repetition(ast::Repetition {
+ Ok(Ast::repetition(ast::Repetition {
span: span(0..6),
op: ast::RepetitionOp {
span: span(1..6),
@@ -3269,7 +3366,7 @@ bar
);
assert_eq!(
parser(r"a{ 5 , 9 }").parse(),
- Ok(Ast::Repetition(ast::Repetition {
+ Ok(Ast::repetition(ast::Repetition {
span: span(0..10),
op: ast::RepetitionOp {
span: span(1..10),
@@ -3283,7 +3380,7 @@ bar
);
assert_eq!(
parser_ignore_whitespace(r"a{5,9} ?").parse(),
- Ok(Ast::Repetition(ast::Repetition {
+ Ok(Ast::repetition(ast::Repetition {
span: span(0..8),
op: ast::RepetitionOp {
span: span(1..8),
@@ -3295,6 +3392,23 @@ bar
ast: Box::new(lit('a', 0)),
}))
);
+ assert_eq!(
+ parser(r"\b{5,9}").parse(),
+ Ok(Ast::repetition(ast::Repetition {
+ span: span(0..7),
+ op: ast::RepetitionOp {
+ span: span(2..7),
+ kind: ast::RepetitionKind::Range(
+ ast::RepetitionRange::Bounded(5, 9)
+ ),
+ },
+ greedy: true,
+ ast: Box::new(Ast::assertion(ast::Assertion {
+ span: span(0..2),
+ kind: ast::AssertionKind::WordBoundary,
+ })),
+ }))
+ );
assert_eq!(
parser(r"(?i){0}").parse().unwrap_err(),
@@ -3414,7 +3528,7 @@ bar
fn parse_alternate() {
assert_eq!(
parser(r"a|b").parse(),
- Ok(Ast::Alternation(ast::Alternation {
+ Ok(Ast::alternation(ast::Alternation {
span: span(0..3),
asts: vec![lit('a', 0), lit('b', 2)],
}))
@@ -3424,7 +3538,7 @@ bar
Ok(group(
0..5,
1,
- Ast::Alternation(ast::Alternation {
+ Ast::alternation(ast::Alternation {
span: span(1..4),
asts: vec![lit('a', 1), lit('b', 3)],
})
@@ -3433,14 +3547,14 @@ bar
assert_eq!(
parser(r"a|b|c").parse(),
- Ok(Ast::Alternation(ast::Alternation {
+ Ok(Ast::alternation(ast::Alternation {
span: span(0..5),
asts: vec![lit('a', 0), lit('b', 2), lit('c', 4)],
}))
);
assert_eq!(
parser(r"ax|by|cz").parse(),
- Ok(Ast::Alternation(ast::Alternation {
+ Ok(Ast::alternation(ast::Alternation {
span: span(0..8),
asts: vec![
concat(0..2, vec![lit('a', 0), lit('x', 1)]),
@@ -3454,7 +3568,7 @@ bar
Ok(group(
0..10,
1,
- Ast::Alternation(ast::Alternation {
+ Ast::alternation(ast::Alternation {
span: span(1..9),
asts: vec![
concat(1..3, vec![lit('a', 1), lit('x', 2)]),
@@ -3503,7 +3617,7 @@ bar
parser(r"|").parse(),
Ok(alt(
0..1,
- vec![Ast::Empty(span(0..0)), Ast::Empty(span(1..1)),]
+ vec![Ast::empty(span(0..0)), Ast::empty(span(1..1)),]
))
);
assert_eq!(
@@ -3511,19 +3625,19 @@ bar
Ok(alt(
0..2,
vec![
- Ast::Empty(span(0..0)),
- Ast::Empty(span(1..1)),
- Ast::Empty(span(2..2)),
+ Ast::empty(span(0..0)),
+ Ast::empty(span(1..1)),
+ Ast::empty(span(2..2)),
]
))
);
assert_eq!(
parser(r"a|").parse(),
- Ok(alt(0..2, vec![lit('a', 0), Ast::Empty(span(2..2)),]))
+ Ok(alt(0..2, vec![lit('a', 0), Ast::empty(span(2..2)),]))
);
assert_eq!(
parser(r"|a").parse(),
- Ok(alt(0..2, vec![Ast::Empty(span(0..0)), lit('a', 1),]))
+ Ok(alt(0..2, vec![Ast::empty(span(0..0)), lit('a', 1),]))
);
assert_eq!(
@@ -3533,7 +3647,7 @@ bar
1,
alt(
1..2,
- vec![Ast::Empty(span(1..1)), Ast::Empty(span(2..2)),]
+ vec![Ast::empty(span(1..1)), Ast::empty(span(2..2)),]
)
))
);
@@ -3542,7 +3656,7 @@ bar
Ok(group(
0..4,
1,
- alt(1..3, vec![lit('a', 1), Ast::Empty(span(3..3)),])
+ alt(1..3, vec![lit('a', 1), Ast::empty(span(3..3)),])
))
);
assert_eq!(
@@ -3550,7 +3664,7 @@ bar
Ok(group(
0..4,
1,
- alt(1..3, vec![Ast::Empty(span(1..1)), lit('a', 2),])
+ alt(1..3, vec![Ast::empty(span(1..1)), lit('a', 2),])
))
);
@@ -3606,7 +3720,7 @@ bar
fn parse_group() {
assert_eq!(
parser("(?i)").parse(),
- Ok(Ast::Flags(ast::SetFlags {
+ Ok(Ast::flags(ast::SetFlags {
span: span(0..4),
flags: ast::Flags {
span: span(2..3),
@@ -3621,7 +3735,7 @@ bar
);
assert_eq!(
parser("(?iU)").parse(),
- Ok(Ast::Flags(ast::SetFlags {
+ Ok(Ast::flags(ast::SetFlags {
span: span(0..5),
flags: ast::Flags {
span: span(2..4),
@@ -3644,7 +3758,7 @@ bar
);
assert_eq!(
parser("(?i-U)").parse(),
- Ok(Ast::Flags(ast::SetFlags {
+ Ok(Ast::flags(ast::SetFlags {
span: span(0..6),
flags: ast::Flags {
span: span(2..5),
@@ -3672,15 +3786,15 @@ bar
assert_eq!(
parser("()").parse(),
- Ok(Ast::Group(ast::Group {
+ Ok(Ast::group(ast::Group {
span: span(0..2),
kind: ast::GroupKind::CaptureIndex(1),
- ast: Box::new(Ast::Empty(span(1..1))),
+ ast: Box::new(Ast::empty(span(1..1))),
}))
);
assert_eq!(
parser("(a)").parse(),
- Ok(Ast::Group(ast::Group {
+ Ok(Ast::group(ast::Group {
span: span(0..3),
kind: ast::GroupKind::CaptureIndex(1),
ast: Box::new(lit('a', 1)),
@@ -3688,20 +3802,20 @@ bar
);
assert_eq!(
parser("(())").parse(),
- Ok(Ast::Group(ast::Group {
+ Ok(Ast::group(ast::Group {
span: span(0..4),
kind: ast::GroupKind::CaptureIndex(1),
- ast: Box::new(Ast::Group(ast::Group {
+ ast: Box::new(Ast::group(ast::Group {
span: span(1..3),
kind: ast::GroupKind::CaptureIndex(2),
- ast: Box::new(Ast::Empty(span(2..2))),
+ ast: Box::new(Ast::empty(span(2..2))),
})),
}))
);
assert_eq!(
parser("(?:a)").parse(),
- Ok(Ast::Group(ast::Group {
+ Ok(Ast::group(ast::Group {
span: span(0..5),
kind: ast::GroupKind::NonCapturing(ast::Flags {
span: span(2..2),
@@ -3713,7 +3827,7 @@ bar
assert_eq!(
parser("(?i:a)").parse(),
- Ok(Ast::Group(ast::Group {
+ Ok(Ast::group(ast::Group {
span: span(0..6),
kind: ast::GroupKind::NonCapturing(ast::Flags {
span: span(2..3),
@@ -3729,7 +3843,7 @@ bar
);
assert_eq!(
parser("(?i-U:a)").parse(),
- Ok(Ast::Group(ast::Group {
+ Ok(Ast::group(ast::Group {
span: span(0..8),
kind: ast::GroupKind::NonCapturing(ast::Flags {
span: span(2..5),
@@ -3818,7 +3932,7 @@ bar
fn parse_capture_name() {
assert_eq!(
parser("(?<a>z)").parse(),
- Ok(Ast::Group(ast::Group {
+ Ok(Ast::group(ast::Group {
span: span(0..7),
kind: ast::GroupKind::CaptureName {
starts_with_p: false,
@@ -3833,7 +3947,7 @@ bar
);
assert_eq!(
parser("(?P<a>z)").parse(),
- Ok(Ast::Group(ast::Group {
+ Ok(Ast::group(ast::Group {
span: span(0..8),
kind: ast::GroupKind::CaptureName {
starts_with_p: true,
@@ -3848,7 +3962,7 @@ bar
);
assert_eq!(
parser("(?P<abc>z)").parse(),
- Ok(Ast::Group(ast::Group {
+ Ok(Ast::group(ast::Group {
span: span(0..10),
kind: ast::GroupKind::CaptureName {
starts_with_p: true,
@@ -3864,7 +3978,7 @@ bar
assert_eq!(
parser("(?P<a_1>z)").parse(),
- Ok(Ast::Group(ast::Group {
+ Ok(Ast::group(ast::Group {
span: span(0..10),
kind: ast::GroupKind::CaptureName {
starts_with_p: true,
@@ -3880,7 +3994,7 @@ bar
assert_eq!(
parser("(?P<a.1>z)").parse(),
- Ok(Ast::Group(ast::Group {
+ Ok(Ast::group(ast::Group {
span: span(0..10),
kind: ast::GroupKind::CaptureName {
starts_with_p: true,
@@ -3896,7 +4010,7 @@ bar
assert_eq!(
parser("(?P<a[1]>z)").parse(),
- Ok(Ast::Group(ast::Group {
+ Ok(Ast::group(ast::Group {
span: span(0..11),
kind: ast::GroupKind::CaptureName {
starts_with_p: true,
@@ -3912,7 +4026,7 @@ bar
assert_eq!(
parser("(?P<a¾>)").parse(),
- Ok(Ast::Group(ast::Group {
+ Ok(Ast::group(ast::Group {
span: Span::new(
Position::new(0, 1, 1),
Position::new(9, 1, 9),
@@ -3928,7 +4042,7 @@ bar
index: 1,
}
},
- ast: Box::new(Ast::Empty(Span::new(
+ ast: Box::new(Ast::empty(Span::new(
Position::new(8, 1, 8),
Position::new(8, 1, 8),
))),
@@ -3936,7 +4050,7 @@ bar
);
assert_eq!(
parser("(?P<名字>)").parse(),
- Ok(Ast::Group(ast::Group {
+ Ok(Ast::group(ast::Group {
span: Span::new(
Position::new(0, 1, 1),
Position::new(12, 1, 9),
@@ -3952,7 +4066,7 @@ bar
index: 1,
}
},
- ast: Box::new(Ast::Empty(Span::new(
+ ast: Box::new(Ast::empty(Span::new(
Position::new(11, 1, 8),
Position::new(11, 1, 8),
))),
@@ -4382,6 +4496,48 @@ bar
}))
);
assert_eq!(
+ parser(r"\b{start}").parse_primitive(),
+ Ok(Primitive::Assertion(ast::Assertion {
+ span: span(0..9),
+ kind: ast::AssertionKind::WordBoundaryStart,
+ }))
+ );
+ assert_eq!(
+ parser(r"\b{end}").parse_primitive(),
+ Ok(Primitive::Assertion(ast::Assertion {
+ span: span(0..7),
+ kind: ast::AssertionKind::WordBoundaryEnd,
+ }))
+ );
+ assert_eq!(
+ parser(r"\b{start-half}").parse_primitive(),
+ Ok(Primitive::Assertion(ast::Assertion {
+ span: span(0..14),
+ kind: ast::AssertionKind::WordBoundaryStartHalf,
+ }))
+ );
+ assert_eq!(
+ parser(r"\b{end-half}").parse_primitive(),
+ Ok(Primitive::Assertion(ast::Assertion {
+ span: span(0..12),
+ kind: ast::AssertionKind::WordBoundaryEndHalf,
+ }))
+ );
+ assert_eq!(
+ parser(r"\<").parse_primitive(),
+ Ok(Primitive::Assertion(ast::Assertion {
+ span: span(0..2),
+ kind: ast::AssertionKind::WordBoundaryStartAngle,
+ }))
+ );
+ assert_eq!(
+ parser(r"\>").parse_primitive(),
+ Ok(Primitive::Assertion(ast::Assertion {
+ span: span(0..2),
+ kind: ast::AssertionKind::WordBoundaryEndAngle,
+ }))
+ );
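
These added tests cover the new special word boundary assertions (`\b{start}`, `\b{end}`, `\b{start-half}`, `\b{end-half}`) plus the `\<`/`\>` aliases, which used to be rejected escapes. A hedged sketch of parsing them through the public AST parser (assuming the `ast::parse::Parser` API of the vendored version):

    use regex_syntax::ast::parse::Parser;

    fn main() {
        // Each of these now parses into an Ast::Assertion with the
        // corresponding AssertionKind instead of returning an error.
        let pats = [
            r"\b{start}", r"\b{end}",
            r"\b{start-half}", r"\b{end-half}",
            r"\<", r"\>",
        ];
        for pat in pats {
            let mut parser = Parser::new();
            assert!(parser.parse(pat).is_ok());
        }
    }
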
+ assert_eq!(
parser(r"\B").parse_primitive(),
Ok(Primitive::Assertion(ast::Assertion {
span: span(0..2),
@@ -4418,20 +4574,60 @@ bar
kind: ast::ErrorKind::EscapeUnrecognized,
}
);
- // But also, < and > are banned, so that we may evolve them into
- // start/end word boundary assertions. (Not sure if we will...)
+
+ // Starting a special word boundary without any non-whitespace chars
+ // after the brace makes it ambiguous whether the user meant to write
+ // a counted repetition (probably not?) or an actual special word
+ // boundary assertion.
assert_eq!(
- parser(r"\<").parse_escape().unwrap_err(),
+ parser(r"\b{").parse_escape().unwrap_err(),
TestError {
- span: span(0..2),
- kind: ast::ErrorKind::EscapeUnrecognized,
+ span: span(0..3),
+ kind: ast::ErrorKind::SpecialWordOrRepetitionUnexpectedEof,
}
);
assert_eq!(
- parser(r"\>").parse_escape().unwrap_err(),
+ parser_ignore_whitespace(r"\b{ ").parse_escape().unwrap_err(),
TestError {
- span: span(0..2),
- kind: ast::ErrorKind::EscapeUnrecognized,
+ span: span(0..4),
+ kind: ast::ErrorKind::SpecialWordOrRepetitionUnexpectedEof,
+ }
+ );
+ // When 'x' is not enabled, the space is seen as a non-[-A-Za-z] char,
+ // and thus causes the parser to treat it as a counted repetition.
+ assert_eq!(
+ parser(r"\b{ ").parse().unwrap_err(),
+ TestError {
+ span: span(4..4),
+ kind: ast::ErrorKind::RepetitionCountDecimalEmpty,
+ }
+ );
+ // In this case, we got some valid chars that make it look like the
+ // user is writing one of the special word boundary assertions, but
+ // we forgot to close the brace.
+ assert_eq!(
+ parser(r"\b{foo").parse_escape().unwrap_err(),
+ TestError {
+ span: span(2..6),
+ kind: ast::ErrorKind::SpecialWordBoundaryUnclosed,
+ }
+ );
+ // We get the same error as above, except it is provoked by seeing a
+ // char that we know is invalid before seeing a closing brace.
+ assert_eq!(
+ parser(r"\b{foo!}").parse_escape().unwrap_err(),
+ TestError {
+ span: span(2..6),
+ kind: ast::ErrorKind::SpecialWordBoundaryUnclosed,
+ }
+ );
+ // And this one occurs when, syntactically, everything looks okay, but
+ // we don't use a valid spelling of a word boundary assertion.
+ assert_eq!(
+ parser(r"\b{foo}").parse_escape().unwrap_err(),
+ TestError {
+ span: span(3..6),
+ kind: ast::ErrorKind::SpecialWordBoundaryUnrecognized,
}
);
@@ -4494,15 +4690,15 @@ bar
);
assert_eq!(
parser_octal(r"\778").parse(),
- Ok(Ast::Concat(ast::Concat {
+ Ok(Ast::concat(ast::Concat {
span: span(0..4),
asts: vec![
- Ast::Literal(ast::Literal {
+ Ast::literal(ast::Literal {
span: span(0..3),
kind: ast::LiteralKind::Octal,
c: '?',
}),
- Ast::Literal(ast::Literal {
+ Ast::literal(ast::Literal {
span: span(3..4),
kind: ast::LiteralKind::Verbatim,
c: '8',
@@ -4512,15 +4708,15 @@ bar
);
assert_eq!(
parser_octal(r"\7777").parse(),
- Ok(Ast::Concat(ast::Concat {
+ Ok(Ast::concat(ast::Concat {
span: span(0..5),
asts: vec![
- Ast::Literal(ast::Literal {
+ Ast::literal(ast::Literal {
span: span(0..4),
kind: ast::LiteralKind::Octal,
c: '\u{01FF}',
}),
- Ast::Literal(ast::Literal {
+ Ast::literal(ast::Literal {
span: span(4..5),
kind: ast::LiteralKind::Verbatim,
c: '7',
@@ -4965,15 +5161,15 @@ bar
assert_eq!(
parser("[[:alnum:]]").parse(),
- Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
+ Ok(Ast::class_bracketed(ast::ClassBracketed {
span: span(0..11),
negated: false,
kind: itemset(item_ascii(alnum(span(1..10), false))),
- })))
+ }))
);
assert_eq!(
parser("[[[:alnum:]]]").parse(),
- Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
+ Ok(Ast::class_bracketed(ast::ClassBracketed {
span: span(0..13),
negated: false,
kind: itemset(item_bracket(ast::ClassBracketed {
@@ -4981,11 +5177,11 @@ bar
negated: false,
kind: itemset(item_ascii(alnum(span(2..11), false))),
})),
- })))
+ }))
);
assert_eq!(
parser("[[:alnum:]&&[:lower:]]").parse(),
- Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
+ Ok(Ast::class_bracketed(ast::ClassBracketed {
span: span(0..22),
negated: false,
kind: intersection(
@@ -4993,11 +5189,11 @@ bar
itemset(item_ascii(alnum(span(1..10), false))),
itemset(item_ascii(lower(span(12..21), false))),
),
- })))
+ }))
);
assert_eq!(
parser("[[:alnum:]--[:lower:]]").parse(),
- Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
+ Ok(Ast::class_bracketed(ast::ClassBracketed {
span: span(0..22),
negated: false,
kind: difference(
@@ -5005,11 +5201,11 @@ bar
itemset(item_ascii(alnum(span(1..10), false))),
itemset(item_ascii(lower(span(12..21), false))),
),
- })))
+ }))
);
assert_eq!(
parser("[[:alnum:]~~[:lower:]]").parse(),
- Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
+ Ok(Ast::class_bracketed(ast::ClassBracketed {
span: span(0..22),
negated: false,
kind: symdifference(
@@ -5017,20 +5213,20 @@ bar
itemset(item_ascii(alnum(span(1..10), false))),
itemset(item_ascii(lower(span(12..21), false))),
),
- })))
+ }))
);
assert_eq!(
parser("[a]").parse(),
- Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
+ Ok(Ast::class_bracketed(ast::ClassBracketed {
span: span(0..3),
negated: false,
kind: itemset(lit(span(1..2), 'a')),
- })))
+ }))
);
assert_eq!(
parser(r"[a\]]").parse(),
- Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
+ Ok(Ast::class_bracketed(ast::ClassBracketed {
span: span(0..5),
negated: false,
kind: union(
@@ -5044,11 +5240,11 @@ bar
}),
]
),
- })))
+ }))
);
assert_eq!(
parser(r"[a\-z]").parse(),
- Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
+ Ok(Ast::class_bracketed(ast::ClassBracketed {
span: span(0..6),
negated: false,
kind: union(
@@ -5063,44 +5259,44 @@ bar
lit(span(4..5), 'z'),
]
),
- })))
+ }))
);
assert_eq!(
parser("[ab]").parse(),
- Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
+ Ok(Ast::class_bracketed(ast::ClassBracketed {
span: span(0..4),
negated: false,
kind: union(
span(1..3),
vec![lit(span(1..2), 'a'), lit(span(2..3), 'b'),]
),
- })))
+ }))
);
assert_eq!(
parser("[a-]").parse(),
- Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
+ Ok(Ast::class_bracketed(ast::ClassBracketed {
span: span(0..4),
negated: false,
kind: union(
span(1..3),
vec![lit(span(1..2), 'a'), lit(span(2..3), '-'),]
),
- })))
+ }))
);
assert_eq!(
parser("[-a]").parse(),
- Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
+ Ok(Ast::class_bracketed(ast::ClassBracketed {
span: span(0..4),
negated: false,
kind: union(
span(1..3),
vec![lit(span(1..2), '-'), lit(span(2..3), 'a'),]
),
- })))
+ }))
);
assert_eq!(
parser(r"[\pL]").parse(),
- Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
+ Ok(Ast::class_bracketed(ast::ClassBracketed {
span: span(0..5),
negated: false,
kind: itemset(item_unicode(ast::ClassUnicode {
@@ -5108,11 +5304,11 @@ bar
negated: false,
kind: ast::ClassUnicodeKind::OneLetter('L'),
})),
- })))
+ }))
);
assert_eq!(
parser(r"[\w]").parse(),
- Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
+ Ok(Ast::class_bracketed(ast::ClassBracketed {
span: span(0..4),
negated: false,
kind: itemset(item_perl(ast::ClassPerl {
@@ -5120,11 +5316,11 @@ bar
kind: ast::ClassPerlKind::Word,
negated: false,
})),
- })))
+ }))
);
assert_eq!(
parser(r"[a\wz]").parse(),
- Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
+ Ok(Ast::class_bracketed(ast::ClassBracketed {
span: span(0..6),
negated: false,
kind: union(
@@ -5139,20 +5335,20 @@ bar
lit(span(4..5), 'z'),
]
),
- })))
+ }))
);
assert_eq!(
parser("[a-z]").parse(),
- Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
+ Ok(Ast::class_bracketed(ast::ClassBracketed {
span: span(0..5),
negated: false,
kind: itemset(range(span(1..4), 'a', 'z')),
- })))
+ }))
);
assert_eq!(
parser("[a-cx-z]").parse(),
- Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
+ Ok(Ast::class_bracketed(ast::ClassBracketed {
span: span(0..8),
negated: false,
kind: union(
@@ -5162,11 +5358,11 @@ bar
range(span(4..7), 'x', 'z'),
]
),
- })))
+ }))
);
assert_eq!(
parser(r"[\w&&a-cx-z]").parse(),
- Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
+ Ok(Ast::class_bracketed(ast::ClassBracketed {
span: span(0..12),
negated: false,
kind: intersection(
@@ -5184,11 +5380,11 @@ bar
]
),
),
- })))
+ }))
);
assert_eq!(
parser(r"[a-cx-z&&\w]").parse(),
- Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
+ Ok(Ast::class_bracketed(ast::ClassBracketed {
span: span(0..12),
negated: false,
kind: intersection(
@@ -5206,11 +5402,11 @@ bar
negated: false,
})),
),
- })))
+ }))
);
assert_eq!(
parser(r"[a--b--c]").parse(),
- Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
+ Ok(Ast::class_bracketed(ast::ClassBracketed {
span: span(0..9),
negated: false,
kind: difference(
@@ -5222,11 +5418,11 @@ bar
),
itemset(lit(span(7..8), 'c')),
),
- })))
+ }))
);
assert_eq!(
parser(r"[a~~b~~c]").parse(),
- Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
+ Ok(Ast::class_bracketed(ast::ClassBracketed {
span: span(0..9),
negated: false,
kind: symdifference(
@@ -5238,11 +5434,11 @@ bar
),
itemset(lit(span(7..8), 'c')),
),
- })))
+ }))
);
assert_eq!(
parser(r"[\^&&^]").parse(),
- Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
+ Ok(Ast::class_bracketed(ast::ClassBracketed {
span: span(0..7),
negated: false,
kind: intersection(
@@ -5254,11 +5450,11 @@ bar
})),
itemset(lit(span(5..6), '^')),
),
- })))
+ }))
);
assert_eq!(
parser(r"[\&&&&]").parse(),
- Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
+ Ok(Ast::class_bracketed(ast::ClassBracketed {
span: span(0..7),
negated: false,
kind: intersection(
@@ -5270,11 +5466,11 @@ bar
})),
itemset(lit(span(5..6), '&')),
),
- })))
+ }))
);
assert_eq!(
parser(r"[&&&&]").parse(),
- Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
+ Ok(Ast::class_bracketed(ast::ClassBracketed {
span: span(0..6),
negated: false,
kind: intersection(
@@ -5286,13 +5482,13 @@ bar
),
itemset(empty(span(5..5))),
),
- })))
+ }))
);
let pat = "[☃-⛄]";
assert_eq!(
parser(pat).parse(),
- Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
+ Ok(Ast::class_bracketed(ast::ClassBracketed {
span: span_range(pat, 0..9),
negated: false,
kind: itemset(ast::ClassSetItem::Range(ast::ClassSetRange {
@@ -5308,20 +5504,20 @@ bar
c: '⛄',
},
})),
- })))
+ }))
);
assert_eq!(
parser(r"[]]").parse(),
- Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
+ Ok(Ast::class_bracketed(ast::ClassBracketed {
span: span(0..3),
negated: false,
kind: itemset(lit(span(1..2), ']')),
- })))
+ }))
);
assert_eq!(
parser(r"[]\[]").parse(),
- Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
+ Ok(Ast::class_bracketed(ast::ClassBracketed {
span: span(0..5),
negated: false,
kind: union(
@@ -5335,14 +5531,14 @@ bar
}),
]
),
- })))
+ }))
);
assert_eq!(
parser(r"[\[]]").parse(),
Ok(concat(
0..5,
vec![
- Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
+ Ast::class_bracketed(ast::ClassBracketed {
span: span(0..4),
negated: false,
kind: itemset(ast::ClassSetItem::Literal(
@@ -5352,8 +5548,8 @@ bar
c: '[',
}
)),
- })),
- Ast::Literal(ast::Literal {
+ }),
+ Ast::literal(ast::Literal {
span: span(4..5),
kind: ast::LiteralKind::Verbatim,
c: ']',
@@ -5914,15 +6110,15 @@ bar
assert_eq!(
parser(r"\pNz").parse(),
- Ok(Ast::Concat(ast::Concat {
+ Ok(Ast::concat(ast::Concat {
span: span(0..4),
asts: vec![
- Ast::Class(ast::Class::Unicode(ast::ClassUnicode {
+ Ast::class_unicode(ast::ClassUnicode {
span: span(0..3),
negated: false,
kind: ast::ClassUnicodeKind::OneLetter('N'),
- })),
- Ast::Literal(ast::Literal {
+ }),
+ Ast::literal(ast::Literal {
span: span(3..4),
kind: ast::LiteralKind::Verbatim,
c: 'z',
@@ -5932,15 +6128,15 @@ bar
);
assert_eq!(
parser(r"\p{Greek}z").parse(),
- Ok(Ast::Concat(ast::Concat {
+ Ok(Ast::concat(ast::Concat {
span: span(0..10),
asts: vec![
- Ast::Class(ast::Class::Unicode(ast::ClassUnicode {
+ Ast::class_unicode(ast::ClassUnicode {
span: span(0..9),
negated: false,
kind: ast::ClassUnicodeKind::Named(s("Greek")),
- })),
- Ast::Literal(ast::Literal {
+ }),
+ Ast::literal(ast::Literal {
span: span(9..10),
kind: ast::LiteralKind::Verbatim,
c: 'z',
@@ -6017,23 +6213,23 @@ bar
assert_eq!(
parser(r"\d").parse(),
- Ok(Ast::Class(ast::Class::Perl(ast::ClassPerl {
+ Ok(Ast::class_perl(ast::ClassPerl {
span: span(0..2),
kind: ast::ClassPerlKind::Digit,
negated: false,
- })))
+ }))
);
assert_eq!(
parser(r"\dz").parse(),
- Ok(Ast::Concat(ast::Concat {
+ Ok(Ast::concat(ast::Concat {
span: span(0..3),
asts: vec![
- Ast::Class(ast::Class::Perl(ast::ClassPerl {
+ Ast::class_perl(ast::ClassPerl {
span: span(0..2),
kind: ast::ClassPerlKind::Digit,
negated: false,
- })),
- Ast::Literal(ast::Literal {
+ }),
+ Ast::literal(ast::Literal {
span: span(2..3),
kind: ast::LiteralKind::Verbatim,
c: 'z',
diff --git a/vendor/regex-syntax/src/ast/print.rs b/vendor/regex-syntax/src/ast/print.rs
index 86a87e143..1ceb3c7fa 100644
--- a/vendor/regex-syntax/src/ast/print.rs
+++ b/vendor/regex-syntax/src/ast/print.rs
@@ -80,27 +80,21 @@ impl<W: fmt::Write> Visitor for Writer<W> {
fn visit_pre(&mut self, ast: &Ast) -> fmt::Result {
match *ast {
Ast::Group(ref x) => self.fmt_group_pre(x),
- Ast::Class(ast::Class::Bracketed(ref x)) => {
- self.fmt_class_bracketed_pre(x)
- }
+ Ast::ClassBracketed(ref x) => self.fmt_class_bracketed_pre(x),
_ => Ok(()),
}
}
fn visit_post(&mut self, ast: &Ast) -> fmt::Result {
- use crate::ast::Class;
-
match *ast {
Ast::Empty(_) => Ok(()),
Ast::Flags(ref x) => self.fmt_set_flags(x),
Ast::Literal(ref x) => self.fmt_literal(x),
Ast::Dot(_) => self.wtr.write_str("."),
Ast::Assertion(ref x) => self.fmt_assertion(x),
- Ast::Class(Class::Perl(ref x)) => self.fmt_class_perl(x),
- Ast::Class(Class::Unicode(ref x)) => self.fmt_class_unicode(x),
- Ast::Class(Class::Bracketed(ref x)) => {
- self.fmt_class_bracketed_post(x)
- }
+ Ast::ClassPerl(ref x) => self.fmt_class_perl(x),
+ Ast::ClassUnicode(ref x) => self.fmt_class_unicode(x),
+ Ast::ClassBracketed(ref x) => self.fmt_class_bracketed_post(x),
Ast::Repetition(ref x) => self.fmt_repetition(x),
Ast::Group(ref x) => self.fmt_group_post(x),
Ast::Alternation(_) => Ok(()),
@@ -267,6 +261,12 @@ impl<W: fmt::Write> Writer<W> {
EndText => self.wtr.write_str(r"\z"),
WordBoundary => self.wtr.write_str(r"\b"),
NotWordBoundary => self.wtr.write_str(r"\B"),
+ WordBoundaryStart => self.wtr.write_str(r"\b{start}"),
+ WordBoundaryEnd => self.wtr.write_str(r"\b{end}"),
+ WordBoundaryStartAngle => self.wtr.write_str(r"\<"),
+ WordBoundaryEndAngle => self.wtr.write_str(r"\>"),
+ WordBoundaryStartHalf => self.wtr.write_str(r"\b{start-half}"),
+ WordBoundaryEndHalf => self.wtr.write_str(r"\b{end-half}"),
}
}
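
The printer gains arms for the new assertion kinds so an `Ast` containing them can be written back out as concrete syntax. A small sketch of a parse-then-print round trip (the `Printer` type here is assumed from this crate's `ast::print` module):

    use regex_syntax::ast::parse::Parser;
    use regex_syntax::ast::print::Printer;

    fn main() {
        let pattern = r"\b{start-half}\w+\b{end-half}";
        let ast = Parser::new().parse(pattern).unwrap();
        let mut out = String::new();
        Printer::new().print(&ast, &mut out).unwrap();
        // Printing the AST reproduces the original concrete syntax.
        assert_eq!(out, pattern);
    }
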
diff --git a/vendor/regex-syntax/src/ast/visitor.rs b/vendor/regex-syntax/src/ast/visitor.rs
index ab136739e..c1bb24d97 100644
--- a/vendor/regex-syntax/src/ast/visitor.rs
+++ b/vendor/regex-syntax/src/ast/visitor.rs
@@ -48,6 +48,11 @@ pub trait Visitor {
Ok(())
}
+ /// This method is called between child nodes of a concatenation.
+ fn visit_concat_in(&mut self) -> Result<(), Self::Err> {
+ Ok(())
+ }
+
/// This method is called on every [`ClassSetItem`](ast::ClassSetItem)
/// before descending into child nodes.
fn visit_class_set_item_pre(
@@ -228,8 +233,14 @@ impl<'a> HeapVisitor<'a> {
// If this is a concat/alternate, then we might have additional
// inductive steps to process.
if let Some(x) = self.pop(frame) {
- if let Frame::Alternation { .. } = x {
- visitor.visit_alternation_in()?;
+ match x {
+ Frame::Alternation { .. } => {
+ visitor.visit_alternation_in()?;
+ }
+ Frame::Concat { .. } => {
+ visitor.visit_concat_in()?;
+ }
+ _ => {}
}
ast = x.child();
self.stack.push((post_ast, x));
@@ -253,7 +264,7 @@ impl<'a> HeapVisitor<'a> {
visitor: &mut V,
) -> Result<Option<Frame<'a>>, V::Err> {
Ok(match *ast {
- Ast::Class(ast::Class::Bracketed(ref x)) => {
+ Ast::ClassBracketed(ref x) => {
self.visit_class(x, visitor)?;
None
}
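
The new `visit_concat_in` hook mirrors the existing `visit_alternation_in`: it fires between the children of every concatenation. A minimal sketch of a visitor that counts those seams, assuming the `ast::visit` entry point and the default-method structure of the `Visitor` trait:

    use regex_syntax::ast::{self, parse::Parser, Visitor};

    struct ConcatSeams(usize);

    impl Visitor for ConcatSeams {
        type Output = usize;
        type Err = ();

        fn finish(self) -> Result<usize, ()> {
            Ok(self.0)
        }

        // Called between child nodes of a concatenation.
        fn visit_concat_in(&mut self) -> Result<(), ()> {
            self.0 += 1;
            Ok(())
        }
    }

    fn main() {
        let ast = Parser::new().parse("abc").unwrap();
        // "abc" is a concatenation of three literals, so there are two seams.
        assert_eq!(ast::visit(&ast, ConcatSeams(0)), Ok(2));
    }
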
diff --git a/vendor/regex-syntax/src/hir/literal.rs b/vendor/regex-syntax/src/hir/literal.rs
index bd3a2d143..a5a3737f6 100644
--- a/vendor/regex-syntax/src/hir/literal.rs
+++ b/vendor/regex-syntax/src/hir/literal.rs
@@ -23,7 +23,7 @@ effective literal optimizations:
to lead to substring search that is only a little faster than a regex search,
and thus the overhead of using literal optimizations in the first place might
make things slower overall.
-* The literals in your [`Seq`] shoudn't be too short. In general, longer is
+* The literals in your [`Seq`] shouldn't be too short. In general, longer is
better. A sequence corresponding to single bytes that occur frequently in the
haystack, for example, is probably a bad literal optimization because it's
likely to produce many false positive candidates. Longer literals are less
@@ -51,7 +51,7 @@ the "trickier" parts are how to combine literal sequences, and that is all
implemented on [`Seq`].
*/
-use core::{cmp, mem};
+use core::{cmp, mem, num::NonZeroUsize};
use alloc::{vec, vec::Vec};
@@ -477,7 +477,7 @@ impl Extractor {
}
seq
}
- hir::Repetition { min, max: Some(max), .. } if min < max => {
+ hir::Repetition { min, .. } => {
assert!(min > 0); // handled above
let limit =
u32::try_from(self.limit_repeat).unwrap_or(u32::MAX);
@@ -491,10 +491,6 @@ impl Extractor {
seq.make_inexact();
seq
}
- hir::Repetition { .. } => {
- subseq.make_inexact();
- subseq
- }
}
}
@@ -692,7 +688,7 @@ impl Default for ExtractKind {
/// from making assumptions about what literals are required in order to match
/// a particular [`Hir`] expression. Generally speaking, when a set is in this
/// state, literal optimizations are inhibited. A good example of a regex that
-/// will cause this sort of set to apppear is `[A-Za-z]`. The character class
+/// will cause this sort of set to appear is `[A-Za-z]`. The character class
/// is just too big (and also too narrow) to be usefully expanded into 52
/// different literals. (Note that the decision for when a seq should become
/// infinite is determined by the caller. A seq itself has no hard-coded
@@ -1571,7 +1567,7 @@ impl Seq {
/// unioning `self` with `other`. If either set is infinite, then this
/// returns `None`.
#[inline]
- fn max_union_len(&self, other: &Seq) -> Option<usize> {
+ pub fn max_union_len(&self, other: &Seq) -> Option<usize> {
let len1 = self.len()?;
let len2 = other.len()?;
Some(len1.saturating_add(len2))
@@ -1581,7 +1577,7 @@ impl Seq {
/// cross product of `self` with `other`. If either set is infinite, then
/// this returns `None`.
#[inline]
- fn max_cross_len(&self, other: &Seq) -> Option<usize> {
+ pub fn max_cross_len(&self, other: &Seq) -> Option<usize> {
let len1 = self.len()?;
let len2 = other.len()?;
Some(len1.saturating_mul(len2))
@@ -1841,6 +1837,14 @@ impl Seq {
None => return,
Some(len) => len,
};
+ // Just give up now if our sequence contains an empty string.
+ if self.min_literal_len().map_or(false, |len| len == 0) {
+ // We squash the sequence so that nobody else gets any bright
+ // ideas to try and use it. An empty string implies a match at
+ // every position. A prefilter cannot help you here.
+ self.make_infinite();
+ return;
+ }
// Make sure we start with the smallest sequence possible. We use a
// special version of preference minimization that retains exactness.
// This is legal because optimization is only expected to occur once
@@ -1910,34 +1914,41 @@ impl Seq {
// longest common prefix to be subject to the poison check.
}
}
- // Everything below this check is more-or-less about trying to
- // heuristically reduce the false positive rate of a prefilter. But
- // if our sequence is completely exact, then it's possible the regex
- // engine can be skipped entirely. In this case, the false positive
- // rate is zero because every literal match corresponds to a regex
- // match.
+ // If we have an exact sequence, we *probably* just want to keep it
+ // as-is. But there are some cases where we don't. So we save a copy of
+ // the exact sequence now, and then try to do some more optimizations
+ // below. If those don't work out, we go back to this exact sequence.
//
- // This is OK even if the sequence contains a poison literal. Remember,
- // a literal is only poisononous because of what we assume about its
- // impact on the false positive rate. However, we do still check for
- // an empty string. Empty strings are weird and it's best to let the
- // regex engine handle those.
+ // The specific motivation for this is that we sometimes wind up with
+ // an exact sequence with a hefty number of literals. Say, 100. If we
+ // stuck with that, it would be too big for Teddy and would result in
+ // using Aho-Corasick. Which is fine... but the lazy DFA is plenty
+ // suitable in such cases. The real issue is that we will wind up not
+ // using a fast prefilter at all. So in cases like this, even though
+ // we have an exact sequence, it would be better to try and shrink the
+ // sequence (which we do below) and use it as a prefilter that can
+ // produce false positive matches.
//
- // We do currently do this check after the longest common prefix (or
- // suffix) check, under the theory that single-substring search is so
- // fast that we want that even if we'd end up turning an exact sequence
- // into an inexact one. But this might be wrong...
- if self.is_exact()
- && self.min_literal_len().map_or(false, |len| len > 0)
- {
- return;
- }
+ // But if the shrinking below results in a sequence that "sucks," then
+ // we don't want to use that because we already have an exact sequence
+ // in hand.
+ let exact: Option<Seq> =
+ if self.is_exact() { Some(self.clone()) } else { None };
// Now we attempt to shorten the sequence. The idea here is that we
// don't want to look for too many literals, but we want to shorten
// our sequence enough to improve our odds of using better algorithms
// downstream (such as Teddy).
+ //
+ // The pair of numbers in this list corresponds to the maximal prefix
+ // (in bytes) to keep for all literals and the length of the sequence
+ // at which to do it.
+ //
+ // So for example, the pair (3, 500) would mean, "if we have more than
+ // 500 literals in our sequence, then truncate all of our literals
+ // such that they are at most 3 bytes in length and then minimize the
+ // sequence."
const ATTEMPTS: [(usize, usize); 5] =
- [(5, 64), (4, 64), (3, 64), (2, 64), (1, 10)];
+ [(5, 10), (4, 10), (3, 64), (2, 64), (1, 10)];
for (keep, limit) in ATTEMPTS {
let len = match self.len() {
None => break,
@@ -1951,7 +1962,11 @@ impl Seq {
} else {
self.keep_last_bytes(keep);
}
- self.minimize_by_preference();
+ if prefix {
+ if let Some(ref mut lits) = self.literals {
+ PreferenceTrie::minimize(lits, true);
+ }
+ }
}
// Check for a poison literal. A poison literal is one that is short
// and is believed to have a very high match count. These poisons
@@ -1968,6 +1983,30 @@ impl Seq {
self.make_infinite();
}
}
+ // OK, if we had an exact sequence before attempting more optimizations
+ // above and our post-optimized sequence sucks for some reason or
+ // another, then we go back to the exact sequence.
+ if let Some(exact) = exact {
+ // If optimizing resulted in dropping our literals, then certainly
+ // backup and use the exact sequence that we had.
+ if !self.is_finite() {
+ *self = exact;
+ return;
+ }
+ // If our optimized sequence contains a short literal, then it's
+ // *probably* not so great. So throw it away and revert to the
+ // exact sequence.
+ if self.min_literal_len().map_or(true, |len| len <= 2) {
+ *self = exact;
+ return;
+ }
+ // Finally, if our optimized sequence is "big" (i.e., can't use
+ // Teddy), then also don't use it and rely on the exact sequence.
+ if self.len().map_or(true, |len| len > 64) {
+ *self = exact;
+ return;
+ }
+ }
}
}
@@ -1977,7 +2016,7 @@ impl core::fmt::Debug for Seq {
if let Some(lits) = self.literals() {
f.debug_list().entries(lits.iter()).finish()
} else {
- write!(f, "[∅]")
+ write!(f, "[∞]")
}
}
}
@@ -2160,12 +2199,19 @@ impl core::fmt::Debug for Literal {
/// never seen this show up on a profile. Because of the heuristic limits
/// imposed on literal extractions, the size of the inputs here is usually
/// very small.)
-#[derive(Debug, Default)]
+#[derive(Debug)]
struct PreferenceTrie {
/// The states in this trie. The index of a state in this vector is its ID.
states: Vec<State>,
+ /// This vec indicates which states are match states. It always has
+ /// the same length as `states` and is indexed by the same state ID.
+ /// A state with identifier `sid` is a match state if and only if
+ /// `matches[sid].is_some()`. The option contains the index of the literal
+ /// corresponding to the match. The index is offset by 1 so that it fits in
+ /// a NonZeroUsize.
+ matches: Vec<Option<NonZeroUsize>>,
/// The index to allocate to the next literal added to this trie. Starts at
- /// 0 and increments by 1 for every literal successfully added to the trie.
+ /// 1 and increments by 1 for every literal successfully added to the trie.
next_literal_index: usize,
}
@@ -2176,9 +2222,6 @@ struct State {
/// are sorted by byte. There is at most one such transition for any
/// particular byte.
trans: Vec<(u8, usize)>,
- /// Whether this is a matching state or not. If it is, then it contains the
- /// index to the matching literal.
- literal_index: Option<usize>,
}
impl PreferenceTrie {
@@ -2192,20 +2235,19 @@ impl PreferenceTrie {
/// after them and because any removed literals are guaranteed to never
/// match.
fn minimize(literals: &mut Vec<Literal>, keep_exact: bool) {
- use core::cell::RefCell;
-
- // MSRV(1.61): Use retain_mut here to avoid interior mutability.
- let trie = RefCell::new(PreferenceTrie::default());
+ let mut trie = PreferenceTrie {
+ states: vec![],
+ matches: vec![],
+ next_literal_index: 1,
+ };
let mut make_inexact = vec![];
- literals.retain(|lit| {
- match trie.borrow_mut().insert(lit.as_bytes()) {
- Ok(_) => true,
- Err(i) => {
- if !keep_exact {
- make_inexact.push(i);
- }
- false
+ literals.retain_mut(|lit| match trie.insert(lit.as_bytes()) {
+ Ok(_) => true,
+ Err(i) => {
+ if !keep_exact {
+ make_inexact.push(i.checked_sub(1).unwrap());
}
+ false
}
});
for i in make_inexact {
@@ -2225,15 +2267,15 @@ impl PreferenceTrie {
/// search.
fn insert(&mut self, bytes: &[u8]) -> Result<usize, usize> {
let mut prev = self.root();
- if let Some(idx) = self.states[prev].literal_index {
- return Err(idx);
+ if let Some(idx) = self.matches[prev] {
+ return Err(idx.get());
}
for &b in bytes.iter() {
match self.states[prev].trans.binary_search_by_key(&b, |t| t.0) {
Ok(i) => {
prev = self.states[prev].trans[i].1;
- if let Some(idx) = self.states[prev].literal_index {
- return Err(idx);
+ if let Some(idx) = self.matches[prev] {
+ return Err(idx.get());
}
}
Err(i) => {
@@ -2245,7 +2287,7 @@ impl PreferenceTrie {
}
let idx = self.next_literal_index;
self.next_literal_index += 1;
- self.states[prev].literal_index = Some(idx);
+ self.matches[prev] = NonZeroUsize::new(idx);
Ok(idx)
}
@@ -2262,6 +2304,7 @@ impl PreferenceTrie {
fn create_state(&mut self) -> usize {
let id = self.states.len();
self.states.push(State::default());
+ self.matches.push(None);
id
}
}
@@ -2603,6 +2646,12 @@ mod tests {
]),
e(r"(ab|cd)(ef|gh)(ij|kl)")
);
+
+ assert_eq!(inexact([E("abab")], [E("abab")]), e(r"(ab){2}"));
+
+ assert_eq!(inexact([I("abab")], [I("abab")]), e(r"(ab){2,3}"));
+
+ assert_eq!(inexact([I("abab")], [I("abab")]), e(r"(ab){2,}"));
}
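
The changed `hir::Repetition` arm above means bounded and unbounded repeats now both contribute (inexact) literals, which is what the three new `(ab){…}` tests assert. A hedged sketch of driving the extractor directly (assuming the public `Extractor`/`Seq` API of this version, including `optimize_for_prefix_by_preference`):

    use regex_syntax::hir::literal::Extractor;

    fn main() {
        let hir = regex_syntax::parse(r"(ab){2,}").unwrap();
        let mut seq = Extractor::new().extract(&hir);
        // Per the new test above, `(ab){2,}` yields the single inexact
        // prefix literal "abab".
        println!("before optimization: {:?}", seq);
        seq.optimize_for_prefix_by_preference();
        println!("after optimization:  {:?}", seq);
    }
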
#[test]
@@ -2815,13 +2864,13 @@ mod tests {
// repeats.
#[test]
fn crazy_repeats() {
- assert_eq!(inexact([I("")], [I("")]), e(r"(?:){4294967295}"));
+ assert_eq!(inexact([E("")], [E("")]), e(r"(?:){4294967295}"));
assert_eq!(
- inexact([I("")], [I("")]),
+ inexact([E("")], [E("")]),
e(r"(?:){64}{64}{64}{64}{64}{64}")
);
- assert_eq!(inexact([I("")], [I("")]), e(r"x{0}{4294967295}"));
- assert_eq!(inexact([I("")], [I("")]), e(r"(?:|){4294967295}"));
+ assert_eq!(inexact([E("")], [E("")]), e(r"x{0}{4294967295}"));
+ assert_eq!(inexact([E("")], [E("")]), e(r"(?:|){4294967295}"));
assert_eq!(
inexact([E("")], [E("")]),
diff --git a/vendor/regex-syntax/src/hir/mod.rs b/vendor/regex-syntax/src/hir/mod.rs
index 062d4dcab..ce38ead7b 100644
--- a/vendor/regex-syntax/src/hir/mod.rs
+++ b/vendor/regex-syntax/src/hir/mod.rs
@@ -88,6 +88,9 @@ pub enum ErrorKind {
/// This error occurs when translating a pattern that could match a byte
/// sequence that isn't UTF-8 and `utf8` was enabled.
InvalidUtf8,
+ /// This error occurs when one uses a non-ASCII byte for a line terminator,
+ /// but where Unicode mode is enabled and UTF-8 mode is disabled.
+ InvalidLineTerminator,
/// This occurs when an unrecognized Unicode property name could not
/// be found.
UnicodePropertyNotFound,
@@ -120,6 +123,7 @@ impl core::fmt::Display for ErrorKind {
let msg = match *self {
UnicodeNotAllowed => "Unicode not allowed here",
InvalidUtf8 => "pattern can match invalid UTF-8",
+ InvalidLineTerminator => "invalid line terminator, must be ASCII",
UnicodePropertyNotFound => "Unicode property not found",
UnicodePropertyValueNotFound => "Unicode property value not found",
UnicodePerlClassNotFound => {
@@ -180,7 +184,7 @@ impl core::fmt::Display for ErrorKind {
/// matches.
///
/// For empty matches, those can occur at any position. It is the
-/// repsonsibility of the regex engine to determine whether empty matches are
+/// responsibility of the regex engine to determine whether empty matches are
/// permitted between the code units of a single codepoint.
///
/// # Stack space
@@ -355,7 +359,13 @@ impl Hir {
/// Creates a repetition HIR expression.
#[inline]
- pub fn repetition(rep: Repetition) -> Hir {
+ pub fn repetition(mut rep: Repetition) -> Hir {
+ // If the sub-expression of a repetition can only match the empty
+ // string, then we force its maximum to be at most 1.
+ if rep.sub.properties().maximum_len() == Some(0) {
+ rep.min = cmp::min(rep.min, 1);
+ rep.max = rep.max.map(|n| cmp::min(n, 1)).or(Some(1));
+ }
// The regex 'a{0}' is always equivalent to the empty regex. This is
// true even when 'a' is an expression that never matches anything
// (like '\P{any}').
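
The new clamp means a repetition whose sub-expression can only match the empty string is reduced to at most one iteration, which lines up with the `crazy_repeats` expectations changing from inexact to exact in the literal tests above. A small illustrative sketch using the smart constructor (field names assumed from this version's `hir::Repetition`):

    use regex_syntax::hir::{Hir, Repetition};

    fn main() {
        // The sub-expression matches only the empty string, so min/max are
        // clamped to at most 1 by the smart constructor.
        let rep = Hir::repetition(Repetition {
            min: 5,
            max: None,
            greedy: true,
            sub: Box::new(Hir::empty()),
        });
        println!("{:?}", rep);
    }
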
@@ -547,7 +557,7 @@ impl Hir {
// We rebuild the alternation by simplifying it. We proceed similarly
// as the concatenation case. But in this case, there's no literal
// simplification happening. We're just flattening alternations.
- let mut new = vec![];
+ let mut new = Vec::with_capacity(subs.len());
for sub in subs {
let (kind, props) = sub.into_parts();
match kind {
@@ -642,6 +652,12 @@ impl Hir {
cls.push(ClassBytesRange::new(b'\0', b'\xFF'));
Hir::class(Class::Bytes(cls))
}
+ Dot::AnyCharExcept(ch) => {
+ let mut cls =
+ ClassUnicode::new([ClassUnicodeRange::new(ch, ch)]);
+ cls.negate();
+ Hir::class(Class::Unicode(cls))
+ }
Dot::AnyCharExceptLF => {
let mut cls = ClassUnicode::empty();
cls.push(ClassUnicodeRange::new('\0', '\x09'));
@@ -655,6 +671,12 @@ impl Hir {
cls.push(ClassUnicodeRange::new('\x0E', '\u{10FFFF}'));
Hir::class(Class::Unicode(cls))
}
+ Dot::AnyByteExcept(byte) => {
+ let mut cls =
+ ClassBytes::new([ClassBytesRange::new(byte, byte)]);
+ cls.negate();
+ Hir::class(Class::Bytes(cls))
+ }
Dot::AnyByteExceptLF => {
let mut cls = ClassBytes::empty();
cls.push(ClassBytesRange::new(b'\0', b'\x09'));
@@ -775,13 +797,18 @@ impl core::fmt::Debug for Literal {
/// The high-level intermediate representation of a character class.
///
/// A character class corresponds to a set of characters. A character is either
-/// defined by a Unicode scalar value or a byte. Unicode characters are used
-/// by default, while bytes are used when Unicode mode (via the `u` flag) is
-/// disabled.
+/// defined by a Unicode scalar value or a byte.
///
/// A character class, regardless of its character type, is represented by a
/// sequence of non-overlapping non-adjacent ranges of characters.
///
+/// There are no guarantees about which class variant is used. Generally
+/// speaking, the Unicode variant is used whenever a class needs to contain
+/// non-ASCII Unicode scalar values. But the Unicode variant can be used even
+/// when Unicode mode is disabled. For example, at the time of writing, the
+/// regex `(?-u:a|\xc2\xa0)` will compile down to HIR for the Unicode class
+/// `[a\u00A0]` due to optimizations.
+///
/// Note that `Bytes` variant may be produced even when it exclusively matches
/// valid UTF-8. This is because a `Bytes` variant represents an intention by
/// the author of the regular expression to disable Unicode mode, which in turn
@@ -1304,8 +1331,9 @@ impl ClassUnicodeRange {
}
}
-/// A set of characters represented by arbitrary bytes (where one byte
-/// corresponds to one character).
+/// A set of characters represented by arbitrary bytes.
+///
+/// Each byte corresponds to one character.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct ClassBytes {
set: IntervalSet<ClassBytesRange>,
@@ -1607,6 +1635,42 @@ pub enum Look {
WordUnicode = 1 << 8,
/// Match a Unicode-aware negation of a word boundary.
WordUnicodeNegate = 1 << 9,
+ /// Match the start of an ASCII-only word boundary. That is, this matches a
+ /// position at either the beginning of the haystack or where the previous
+ /// character is not a word character and the following character is a word
+ /// character.
+ WordStartAscii = 1 << 10,
+ /// Match the end of an ASCII-only word boundary. That is, this matches
+ /// a position at either the end of the haystack or where the previous
+ /// character is a word character and the following character is not a word
+ /// character.
+ WordEndAscii = 1 << 11,
+ /// Match the start of a Unicode word boundary. That is, this matches a
+ /// position at either the beginning of the haystack or where the previous
+ /// character is not a word character and the following character is a word
+ /// character.
+ WordStartUnicode = 1 << 12,
+ /// Match the end of a Unicode word boundary. That is, this matches a
+ /// position at either the end of the haystack or where the previous
+ /// character is a word character and the following character is not a word
+ /// character.
+ WordEndUnicode = 1 << 13,
+ /// Match the start half of an ASCII-only word boundary. That is, this
+ /// matches a position at either the beginning of the haystack or where the
+ /// previous character is not a word character.
+ WordStartHalfAscii = 1 << 14,
+ /// Match the end half of an ASCII-only word boundary. That is, this
+ /// matches a position at either the end of the haystack or where the
+ /// following character is not a word character.
+ WordEndHalfAscii = 1 << 15,
+ /// Match the start half of a Unicode word boundary. That is, this matches
+ /// a position at either the beginning of the haystack or where the
+ /// previous character is not a word character.
+ WordStartHalfUnicode = 1 << 16,
+ /// Match the end half of a Unicode word boundary. That is, this matches
+ /// a position at either the end of the haystack or where the following
+ /// character is not a word character.
+ WordEndHalfUnicode = 1 << 17,
}
impl Look {
@@ -1628,6 +1692,14 @@ impl Look {
Look::WordAsciiNegate => Look::WordAsciiNegate,
Look::WordUnicode => Look::WordUnicode,
Look::WordUnicodeNegate => Look::WordUnicodeNegate,
+ Look::WordStartAscii => Look::WordEndAscii,
+ Look::WordEndAscii => Look::WordStartAscii,
+ Look::WordStartUnicode => Look::WordEndUnicode,
+ Look::WordEndUnicode => Look::WordStartUnicode,
+ Look::WordStartHalfAscii => Look::WordEndHalfAscii,
+ Look::WordEndHalfAscii => Look::WordStartHalfAscii,
+ Look::WordStartHalfUnicode => Look::WordEndHalfUnicode,
+ Look::WordEndHalfUnicode => Look::WordStartHalfUnicode,
}
}
@@ -1636,28 +1708,36 @@ impl Look {
/// constructor is guaranteed to return the same look-around variant that
/// one started with within a semver compatible release of this crate.
#[inline]
- pub const fn as_repr(self) -> u16 {
+ pub const fn as_repr(self) -> u32 {
// AFAIK, 'as' is the only way to zero-cost convert an int enum to an
// actual int.
- self as u16
+ self as u32
}
/// Given the underlying representation of a `Look` value, return the
/// corresponding `Look` value if the representation is valid. Otherwise
/// `None` is returned.
#[inline]
- pub const fn from_repr(repr: u16) -> Option<Look> {
+ pub const fn from_repr(repr: u32) -> Option<Look> {
match repr {
- 0b00_0000_0001 => Some(Look::Start),
- 0b00_0000_0010 => Some(Look::End),
- 0b00_0000_0100 => Some(Look::StartLF),
- 0b00_0000_1000 => Some(Look::EndLF),
- 0b00_0001_0000 => Some(Look::StartCRLF),
- 0b00_0010_0000 => Some(Look::EndCRLF),
- 0b00_0100_0000 => Some(Look::WordAscii),
- 0b00_1000_0000 => Some(Look::WordAsciiNegate),
- 0b01_0000_0000 => Some(Look::WordUnicode),
- 0b10_0000_0000 => Some(Look::WordUnicodeNegate),
+ 0b00_0000_0000_0000_0001 => Some(Look::Start),
+ 0b00_0000_0000_0000_0010 => Some(Look::End),
+ 0b00_0000_0000_0000_0100 => Some(Look::StartLF),
+ 0b00_0000_0000_0000_1000 => Some(Look::EndLF),
+ 0b00_0000_0000_0001_0000 => Some(Look::StartCRLF),
+ 0b00_0000_0000_0010_0000 => Some(Look::EndCRLF),
+ 0b00_0000_0000_0100_0000 => Some(Look::WordAscii),
+ 0b00_0000_0000_1000_0000 => Some(Look::WordAsciiNegate),
+ 0b00_0000_0001_0000_0000 => Some(Look::WordUnicode),
+ 0b00_0000_0010_0000_0000 => Some(Look::WordUnicodeNegate),
+ 0b00_0000_0100_0000_0000 => Some(Look::WordStartAscii),
+ 0b00_0000_1000_0000_0000 => Some(Look::WordEndAscii),
+ 0b00_0001_0000_0000_0000 => Some(Look::WordStartUnicode),
+ 0b00_0010_0000_0000_0000 => Some(Look::WordEndUnicode),
+ 0b00_0100_0000_0000_0000 => Some(Look::WordStartHalfAscii),
+ 0b00_1000_0000_0000_0000 => Some(Look::WordEndHalfAscii),
+ 0b01_0000_0000_0000_0000 => Some(Look::WordStartHalfUnicode),
+ 0b10_0000_0000_0000_0000 => Some(Look::WordEndHalfUnicode),
_ => None,
}
}
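
Because the eight new variants push the bit positions past 16, `as_repr`/`from_repr` now trade in `u32`. A quick sketch mirroring the bit assignments shown above:

    use regex_syntax::hir::Look;

    fn main() {
        let repr: u32 = Look::WordStartHalfUnicode.as_repr();
        assert_eq!(repr, 1 << 16);
        assert_eq!(Look::from_repr(repr), Some(Look::WordStartHalfUnicode));
        // Unused bits still map to None.
        assert_eq!(Look::from_repr(1 << 31), None);
    }
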
@@ -1682,6 +1762,14 @@ impl Look {
Look::WordAsciiNegate => 'B',
Look::WordUnicode => '𝛃',
Look::WordUnicodeNegate => '𝚩',
+ Look::WordStartAscii => '<',
+ Look::WordEndAscii => '>',
+ Look::WordStartUnicode => '〈',
+ Look::WordEndUnicode => '〉',
+ Look::WordStartHalfAscii => '◁',
+ Look::WordEndHalfAscii => '▷',
+ Look::WordStartHalfUnicode => '◀',
+ Look::WordEndHalfUnicode => '▶',
}
}
}
@@ -1766,6 +1854,18 @@ pub enum Dot {
///
/// This is equivalent to `(?s-u:.)` and also `(?-u:[\x00-\xFF])`.
AnyByte,
+ /// Matches the UTF-8 encoding of any Unicode scalar value except for the
+ /// `char` given.
+ ///
+ /// This is equivalent to using `(?u-s:.)` with the line terminator set
+ /// to a particular ASCII byte. (Because of peculiarities in the regex
+ /// engines, a line terminator must be a single byte. It follows that when
+ /// UTF-8 mode is enabled, this single byte must also be a Unicode scalar
+ /// value. That is, it must be ASCII.)
+ ///
+ /// (This and `AnyCharExceptLF` both exist because of legacy reasons.
+ /// `AnyCharExceptLF` will be dropped in the next breaking change release.)
+ AnyCharExcept(char),
/// Matches the UTF-8 encoding of any Unicode scalar value except for `\n`.
///
/// This is equivalent to `(?u-s:.)` and also `[\p{any}--\n]`.
@@ -1775,6 +1875,17 @@ pub enum Dot {
///
/// This is equivalent to `(?uR-s:.)` and also `[\p{any}--\r\n]`.
AnyCharExceptCRLF,
+ /// Matches any byte value except for the `u8` given.
+ ///
+ /// This is equivalent to using `(?-us:.)` with the line terminator set
+ /// to a particular ASCII byte. (Because of peculiarities in the regex
+ /// engines, a line terminator must be a single byte. It follows that when
+ /// UTF-8 mode is enabled, this single byte must also be a Unicode scalar
+ /// value. That is, it must be ASCII.)
+ ///
+ /// (This and `AnyByteExceptLF` both exist because of legacy reasons.
+ /// `AnyByteExceptLF` will be dropped in the next breaking change release.)
+ AnyByteExcept(u8),
/// Matches any byte value except for `\n`.
///
/// This is equivalent to `(?-su:.)` and also `(?-u:[[\x00-\xFF]--\n])`.
@@ -2410,10 +2521,10 @@ impl Properties {
inner.look_set_prefix = p.look_set_prefix();
inner.look_set_suffix = p.look_set_suffix();
}
- // If the static captures len of the sub-expression is not known or is
- // zero, then it automatically propagates to the repetition, regardless
- // of the repetition. Otherwise, it might change, but only when the
- // repetition can match 0 times.
+ // If the static captures len of the sub-expression is not known or
+ // is greater than zero, then it automatically propagates to the
+ // repetition, regardless of the repetition. Otherwise, it might
+ // change, but only when the repetition can match 0 times.
if rep.min == 0
&& inner.static_explicit_captures_len.map_or(false, |len| len > 0)
{
@@ -2549,7 +2660,7 @@ pub struct LookSet {
/// range of `u16` values to be represented. For example, even if the
/// current implementation only makes use of the 10 least significant bits,
/// it may use more bits in a future semver compatible release.
- pub bits: u16,
+ pub bits: u32,
}
impl LookSet {
@@ -2652,13 +2763,22 @@ impl LookSet {
pub fn contains_word_unicode(self) -> bool {
self.contains(Look::WordUnicode)
|| self.contains(Look::WordUnicodeNegate)
+ || self.contains(Look::WordStartUnicode)
+ || self.contains(Look::WordEndUnicode)
+ || self.contains(Look::WordStartHalfUnicode)
+ || self.contains(Look::WordEndHalfUnicode)
}
/// Returns true if and only if this set contains any ASCII word boundary
/// or negated ASCII word boundary assertions.
#[inline]
pub fn contains_word_ascii(self) -> bool {
- self.contains(Look::WordAscii) || self.contains(Look::WordAsciiNegate)
+ self.contains(Look::WordAscii)
+ || self.contains(Look::WordAsciiNegate)
+ || self.contains(Look::WordStartAscii)
+ || self.contains(Look::WordEndAscii)
+ || self.contains(Look::WordStartHalfAscii)
+ || self.contains(Look::WordEndHalfAscii)
}
/// Returns an iterator over all of the look-around assertions in this set.
@@ -2737,29 +2857,31 @@ impl LookSet {
*self = self.intersect(other);
}
- /// Return a `LookSet` from the slice given as a native endian 16-bit
+ /// Return a `LookSet` from the slice given as a native endian 32-bit
/// integer.
///
/// # Panics
///
- /// This panics if `slice.len() < 2`.
+ /// This panics if `slice.len() < 4`.
#[inline]
pub fn read_repr(slice: &[u8]) -> LookSet {
- let bits = u16::from_ne_bytes(slice[..2].try_into().unwrap());
+ let bits = u32::from_ne_bytes(slice[..4].try_into().unwrap());
LookSet { bits }
}
- /// Write a `LookSet` as a native endian 16-bit integer to the beginning
+ /// Write a `LookSet` as a native endian 32-bit integer to the beginning
/// of the slice given.
///
/// # Panics
///
- /// This panics if `slice.len() < 2`.
+ /// This panics if `slice.len() < 4`.
#[inline]
pub fn write_repr(self, slice: &mut [u8]) {
let raw = self.bits.to_ne_bytes();
slice[0] = raw[0];
slice[1] = raw[1];
+ slice[2] = raw[2];
+ slice[3] = raw[3];
}
}
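
With the widened representation, the serialized form of a `LookSet` is now four bytes. A small sketch of the round trip through `write_repr`/`read_repr`:

    use regex_syntax::hir::{Look, LookSet};

    fn main() {
        let set = LookSet::empty()
            .insert(Look::WordStartAscii)
            .insert(Look::WordEndHalfUnicode);
        // The native-endian u32 now needs a 4-byte buffer; 2 bytes would panic.
        let mut buf = [0u8; 4];
        set.write_repr(&mut buf);
        assert_eq!(LookSet::read_repr(&buf), set);
    }
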
@@ -2792,9 +2914,9 @@ impl Iterator for LookSetIter {
return None;
}
// We'll never have more than u8::MAX distinct look-around assertions,
- // so 'repr' will always fit into a u16.
- let repr = u16::try_from(self.set.bits.trailing_zeros()).unwrap();
- let look = Look::from_repr(1 << repr)?;
+ // so 'bit' will always fit into a u16.
+ let bit = u16::try_from(self.set.bits.trailing_zeros()).unwrap();
+ let look = Look::from_repr(1 << bit)?;
self.set = self.set.remove(look);
Some(look)
}
@@ -3716,7 +3838,7 @@ mod tests {
assert_eq!(0, set.iter().count());
let set = LookSet::full();
- assert_eq!(10, set.iter().count());
+ assert_eq!(18, set.iter().count());
let set =
LookSet::empty().insert(Look::StartLF).insert(Look::WordUnicode);
@@ -3734,6 +3856,6 @@ mod tests {
let res = format!("{:?}", LookSet::empty());
assert_eq!("∅", res);
let res = format!("{:?}", LookSet::full());
- assert_eq!("Az^$rRbB𝛃𝚩", res);
+ assert_eq!("Az^$rRbB𝛃𝚩<>〈〉◁▷◀▶", res);
}
}
diff --git a/vendor/regex-syntax/src/hir/print.rs b/vendor/regex-syntax/src/hir/print.rs
index fcb7cd252..dfa6d4032 100644
--- a/vendor/regex-syntax/src/hir/print.rs
+++ b/vendor/regex-syntax/src/hir/print.rs
@@ -89,9 +89,16 @@ impl<W: fmt::Write> Visitor for Writer<W> {
fn visit_pre(&mut self, hir: &Hir) -> fmt::Result {
match *hir.kind() {
- // Empty is represented by nothing in the concrete syntax, and
- // repetition operators are strictly suffix oriented.
- HirKind::Empty | HirKind::Repetition(_) => {}
+ HirKind::Empty => {
+ // Technically an empty sub-expression could be "printed" by
+ // just ignoring it, but in practice, you could have a
+ // repetition operator attached to an empty expression, and you
+ // really need something in the concrete syntax to make that
+ // work as you'd expect.
+ self.wtr.write_str(r"(?:)")?;
+ }
+ // Repetition operators are strictly suffix oriented.
+ HirKind::Repetition(_) => {}
HirKind::Literal(hir::Literal(ref bytes)) => {
// See the comment on the 'Concat' and 'Alternation' case below
// for why we put parens here. Literals are, conceptually,
@@ -195,6 +202,30 @@ impl<W: fmt::Write> Visitor for Writer<W> {
hir::Look::WordUnicodeNegate => {
self.wtr.write_str(r"\B")?;
}
+ hir::Look::WordStartAscii => {
+ self.wtr.write_str(r"(?-u:\b{start})")?;
+ }
+ hir::Look::WordEndAscii => {
+ self.wtr.write_str(r"(?-u:\b{end})")?;
+ }
+ hir::Look::WordStartUnicode => {
+ self.wtr.write_str(r"\b{start}")?;
+ }
+ hir::Look::WordEndUnicode => {
+ self.wtr.write_str(r"\b{end}")?;
+ }
+ hir::Look::WordStartHalfAscii => {
+ self.wtr.write_str(r"(?-u:\b{start-half})")?;
+ }
+ hir::Look::WordEndHalfAscii => {
+ self.wtr.write_str(r"(?-u:\b{end-half})")?;
+ }
+ hir::Look::WordStartHalfUnicode => {
+ self.wtr.write_str(r"\b{start-half}")?;
+ }
+ hir::Look::WordEndHalfUnicode => {
+ self.wtr.write_str(r"\b{end-half}")?;
+ }
},
HirKind::Capture(hir::Capture { ref name, .. }) => {
self.wtr.write_str("(")?;
@@ -424,20 +455,20 @@ mod tests {
// Test that various zero-length repetitions always translate to an
// empty regex. This is more a property of HIR's smart constructors
// than the printer though.
- roundtrip("a{0}", "");
- roundtrip("(?:ab){0}", "");
+ roundtrip("a{0}", "(?:)");
+ roundtrip("(?:ab){0}", "(?:)");
#[cfg(feature = "unicode-gencat")]
{
- roundtrip(r"\p{any}{0}", "");
- roundtrip(r"\P{any}{0}", "");
+ roundtrip(r"\p{any}{0}", "(?:)");
+ roundtrip(r"\P{any}{0}", "(?:)");
}
}
#[test]
fn print_group() {
- roundtrip("()", "()");
- roundtrip("(?P<foo>)", "(?P<foo>)");
- roundtrip("(?:)", "");
+ roundtrip("()", "((?:))");
+ roundtrip("(?P<foo>)", "(?P<foo>(?:))");
+ roundtrip("(?:)", "(?:)");
roundtrip("(a)", "(a)");
roundtrip("(?P<foo>a)", "(?P<foo>a)");
@@ -448,8 +479,8 @@ mod tests {
#[test]
fn print_alternation() {
- roundtrip("|", "(?:|)");
- roundtrip("||", "(?:||)");
+ roundtrip("|", "(?:(?:)|(?:))");
+ roundtrip("||", "(?:(?:)|(?:)|(?:))");
roundtrip("a|b", "[ab]");
roundtrip("ab|cd", "(?:(?:ab)|(?:cd))");
@@ -503,7 +534,7 @@ mod tests {
}),
Hir::look(hir::Look::End),
]);
- assert_eq!(r"(?:\A(?:\A\z)+\z)", expr.to_string());
+ assert_eq!(r"(?:\A\A\z\z)", expr.to_string());
}
// Just like regression_repetition_concat, but with the repetition using
@@ -540,7 +571,7 @@ mod tests {
}),
Hir::look(hir::Look::End),
]);
- assert_eq!(r"(?:\A(?:\A|\z)+\z)", expr.to_string());
+ assert_eq!(r"(?:\A(?:\A|\z)\z)", expr.to_string());
}
// This regression test is very similar in flavor to
diff --git a/vendor/regex-syntax/src/hir/translate.rs b/vendor/regex-syntax/src/hir/translate.rs
index ff9c5ee91..313a1e9e8 100644
--- a/vendor/regex-syntax/src/hir/translate.rs
+++ b/vendor/regex-syntax/src/hir/translate.rs
@@ -19,6 +19,7 @@ type Result<T> = core::result::Result<T, Error>;
#[derive(Clone, Debug)]
pub struct TranslatorBuilder {
utf8: bool,
+ line_terminator: u8,
flags: Flags,
}
@@ -31,7 +32,11 @@ impl Default for TranslatorBuilder {
impl TranslatorBuilder {
/// Create a new translator builder with a default configuration.
pub fn new() -> TranslatorBuilder {
- TranslatorBuilder { utf8: true, flags: Flags::default() }
+ TranslatorBuilder {
+ utf8: true,
+ line_terminator: b'\n',
+ flags: Flags::default(),
+ }
}
/// Build a translator using the current configuration.
@@ -40,6 +45,7 @@ impl TranslatorBuilder {
stack: RefCell::new(vec![]),
flags: Cell::new(self.flags),
utf8: self.utf8,
+ line_terminator: self.line_terminator,
}
}
@@ -63,6 +69,31 @@ impl TranslatorBuilder {
self
}
+ /// Sets the line terminator for use with `(?u-s:.)` and `(?-us:.)`.
+ ///
+ /// Namely, instead of `.` (by default) matching everything except for `\n`,
+ /// this will cause `.` to match everything except for the byte given.
+ ///
+ /// If `.` is used in a context where Unicode mode is enabled and this byte
+ /// isn't ASCII, then an error will be returned. When Unicode mode is
+ /// disabled, then any byte is permitted, but will return an error if UTF-8
+ /// mode is enabled and it is a non-ASCII byte.
+ ///
+ /// In short, any ASCII value for a line terminator is always okay. But a
+ /// non-ASCII byte might result in an error depending on whether Unicode
+ /// mode or UTF-8 mode are enabled.
+ ///
+ /// Note that if `R` mode is enabled then it always takes precedence and
+ /// the line terminator will be treated as `\r` and `\n` simultaneously.
+ ///
+ /// Note also that this *doesn't* impact the look-around assertions
+ /// `(?m:^)` and `(?m:$)`. That's usually controlled by additional
+ /// configuration in the regex engine itself.
+ pub fn line_terminator(&mut self, byte: u8) -> &mut TranslatorBuilder {
+ self.line_terminator = byte;
+ self
+ }
+
/// Enable or disable the case insensitive flag (`i`) by default.
pub fn case_insensitive(&mut self, yes: bool) -> &mut TranslatorBuilder {
self.flags.case_insensitive = if yes { Some(true) } else { None };
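
The new `line_terminator` knob changes which byte `.` excludes when the `s` flag is not set. A hedged sketch of wiring it through the AST parser and translator (module paths assumed from this crate's `ast::parse` and `hir::translate`):

    use regex_syntax::ast::parse::Parser;
    use regex_syntax::hir::translate::TranslatorBuilder;

    fn main() {
        let pattern = ".";
        let ast = Parser::new().parse(pattern).unwrap();
        let hir = TranslatorBuilder::new()
            .line_terminator(b'\x00')
            .build()
            .translate(pattern, &ast)
            .unwrap();
        // With a NUL line terminator, `.` compiles to "any char except '\0'"
        // rather than the default "any char except '\n'".
        println!("{}", hir);
    }
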
@@ -120,6 +151,8 @@ pub struct Translator {
flags: Cell<Flags>,
/// Whether we're allowed to produce HIR that can match arbitrary bytes.
utf8: bool,
+ /// The line terminator to use for `.`.
+ line_terminator: u8,
}
impl Translator {
@@ -304,7 +337,7 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> {
fn visit_pre(&mut self, ast: &Ast) -> Result<()> {
match *ast {
- Ast::Class(ast::Class::Bracketed(_)) => {
+ Ast::ClassBracketed(_) => {
if self.flags().unicode() {
let cls = hir::ClassUnicode::empty();
self.push(HirFrame::ClassUnicode(cls));
@@ -321,14 +354,14 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> {
.unwrap_or_else(|| self.flags());
self.push(HirFrame::Group { old_flags });
}
- Ast::Concat(ref x) if x.asts.is_empty() => {}
Ast::Concat(_) => {
self.push(HirFrame::Concat);
}
- Ast::Alternation(ref x) if x.asts.is_empty() => {}
- Ast::Alternation(_) => {
+ Ast::Alternation(ref x) => {
self.push(HirFrame::Alternation);
- self.push(HirFrame::AlternationBranch);
+ if !x.asts.is_empty() {
+ self.push(HirFrame::AlternationBranch);
+ }
}
_ => {}
}
@@ -353,29 +386,20 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> {
// consistency sake.
self.push(HirFrame::Expr(Hir::empty()));
}
- Ast::Literal(ref x) => {
- match self.ast_literal_to_scalar(x)? {
- Either::Right(byte) => self.push_byte(byte),
- Either::Left(ch) => {
- if !self.flags().unicode() && ch.len_utf8() > 1 {
- return Err(self
- .error(x.span, ErrorKind::UnicodeNotAllowed));
- }
- match self.case_fold_char(x.span, ch)? {
- None => self.push_char(ch),
- Some(expr) => self.push(HirFrame::Expr(expr)),
- }
- }
- }
- // self.push(HirFrame::Expr(self.hir_literal(x)?));
- }
- Ast::Dot(span) => {
- self.push(HirFrame::Expr(self.hir_dot(span)?));
+ Ast::Literal(ref x) => match self.ast_literal_to_scalar(x)? {
+ Either::Right(byte) => self.push_byte(byte),
+ Either::Left(ch) => match self.case_fold_char(x.span, ch)? {
+ None => self.push_char(ch),
+ Some(expr) => self.push(HirFrame::Expr(expr)),
+ },
+ },
+ Ast::Dot(ref span) => {
+ self.push(HirFrame::Expr(self.hir_dot(**span)?));
}
Ast::Assertion(ref x) => {
self.push(HirFrame::Expr(self.hir_assertion(x)?));
}
- Ast::Class(ast::Class::Perl(ref x)) => {
+ Ast::ClassPerl(ref x) => {
if self.flags().unicode() {
let cls = self.hir_perl_unicode_class(x)?;
let hcls = hir::Class::Unicode(cls);
@@ -386,11 +410,11 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> {
self.push(HirFrame::Expr(Hir::class(hcls)));
}
}
- Ast::Class(ast::Class::Unicode(ref x)) => {
+ Ast::ClassUnicode(ref x) => {
let cls = hir::Class::Unicode(self.hir_unicode_class(x)?);
self.push(HirFrame::Expr(Hir::class(cls)));
}
- Ast::Class(ast::Class::Bracketed(ref ast)) => {
+ Ast::ClassBracketed(ref ast) => {
if self.flags().unicode() {
let mut cls = self.pop().unwrap().unwrap_class_unicode();
self.unicode_fold_and_negate(
@@ -841,8 +865,8 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
})?;
Ok(Some(Hir::class(hir::Class::Unicode(cls))))
} else {
- if c.len_utf8() > 1 {
- return Err(self.error(span, ErrorKind::UnicodeNotAllowed));
+ if !c.is_ascii() {
+ return Ok(None);
}
// If case folding won't do anything, then don't bother trying.
match c {
@@ -862,10 +886,38 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
}
fn hir_dot(&self, span: Span) -> Result<Hir> {
- if !self.flags().unicode() && self.trans().utf8 {
+ let (utf8, lineterm, flags) =
+ (self.trans().utf8, self.trans().line_terminator, self.flags());
+ if utf8 && (!flags.unicode() || !lineterm.is_ascii()) {
return Err(self.error(span, ErrorKind::InvalidUtf8));
}
- Ok(Hir::dot(self.flags().dot()))
+ let dot = if flags.dot_matches_new_line() {
+ if flags.unicode() {
+ hir::Dot::AnyChar
+ } else {
+ hir::Dot::AnyByte
+ }
+ } else {
+ if flags.unicode() {
+ if flags.crlf() {
+ hir::Dot::AnyCharExceptCRLF
+ } else {
+ if !lineterm.is_ascii() {
+ return Err(
+ self.error(span, ErrorKind::InvalidLineTerminator)
+ );
+ }
+ hir::Dot::AnyCharExcept(char::from(lineterm))
+ }
+ } else {
+ if flags.crlf() {
+ hir::Dot::AnyByteExceptCRLF
+ } else {
+ hir::Dot::AnyByteExcept(lineterm)
+ }
+ }
+ };
+ Ok(Hir::dot(dot))
}
fn hir_assertion(&self, asst: &ast::Assertion) -> Result<Hir> {
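The rewritten `hir_dot` folds the old `Flags::dot` logic together with the configured line terminator. A hedged sketch of the resulting flag-to-`Dot` mapping via the crate-level parser, assuming default settings and that the `s` and `R` inline flags behave as described in the doc comments above:

```rust
use regex_syntax::{
    hir::{Dot, Hir},
    Parser,
};

fn main() {
    // Default flags: `.` is any char except `\n`.
    assert_eq!(Parser::new().parse(".").unwrap(), Hir::dot(Dot::AnyCharExceptLF));
    // `s` flag: `.` is truly any char.
    assert_eq!(Parser::new().parse("(?s).").unwrap(), Hir::dot(Dot::AnyChar));
    // `R` (CRLF) mode takes precedence over any configured line terminator.
    assert_eq!(
        Parser::new().parse("(?R).").unwrap(),
        Hir::dot(Dot::AnyCharExceptCRLF)
    );
}
```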
@@ -903,6 +955,34 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
} else {
hir::Look::WordAsciiNegate
}),
+ ast::AssertionKind::WordBoundaryStart
+ | ast::AssertionKind::WordBoundaryStartAngle => {
+ Hir::look(if unicode {
+ hir::Look::WordStartUnicode
+ } else {
+ hir::Look::WordStartAscii
+ })
+ }
+ ast::AssertionKind::WordBoundaryEnd
+ | ast::AssertionKind::WordBoundaryEndAngle => {
+ Hir::look(if unicode {
+ hir::Look::WordEndUnicode
+ } else {
+ hir::Look::WordEndAscii
+ })
+ }
+ ast::AssertionKind::WordBoundaryStartHalf => {
+ Hir::look(if unicode {
+ hir::Look::WordStartHalfUnicode
+ } else {
+ hir::Look::WordStartHalfAscii
+ })
+ }
+ ast::AssertionKind::WordBoundaryEndHalf => Hir::look(if unicode {
+ hir::Look::WordEndHalfUnicode
+ } else {
+ hir::Look::WordEndHalfAscii
+ }),
})
}
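The new assertion kinds map one-to-one onto the `hir::Look` word-boundary variants. A hedged sketch, assuming default Unicode mode and the `\<`/`\>` spellings noted in the `lib.rs` hunk further down:

```rust
use regex_syntax::{
    hir::{Hir, Look},
    Parser,
};

fn main() {
    // Start-of-word and end-of-word boundary assertions.
    assert_eq!(Parser::new().parse(r"\<").unwrap(), Hir::look(Look::WordStartUnicode));
    assert_eq!(Parser::new().parse(r"\>").unwrap(), Hir::look(Look::WordEndUnicode));
}
```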
@@ -1124,9 +1204,8 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
match self.ast_literal_to_scalar(ast)? {
Either::Right(byte) => Ok(byte),
Either::Left(ch) => {
- let cp = u32::from(ch);
- if cp <= 0x7F {
- Ok(u8::try_from(cp).unwrap())
+ if ch.is_ascii() {
+ Ok(u8::try_from(ch).unwrap())
} else {
// We can't feasibly support Unicode in
// byte oriented classes. Byte classes don't
@@ -1209,30 +1288,6 @@ impl Flags {
}
}
- fn dot(&self) -> hir::Dot {
- if self.dot_matches_new_line() {
- if self.unicode() {
- hir::Dot::AnyChar
- } else {
- hir::Dot::AnyByte
- }
- } else {
- if self.unicode() {
- if self.crlf() {
- hir::Dot::AnyCharExceptCRLF
- } else {
- hir::Dot::AnyCharExceptLF
- }
- } else {
- if self.crlf() {
- hir::Dot::AnyByteExceptCRLF
- } else {
- hir::Dot::AnyByteExceptLF
- }
- }
- }
- }
-
fn case_insensitive(&self) -> bool {
self.case_insensitive.unwrap_or(false)
}
@@ -1598,16 +1653,7 @@ mod tests {
assert_eq!(t_bytes(r"(?-u)\x61"), hir_lit("a"));
assert_eq!(t_bytes(r"(?-u)\xFF"), hir_blit(b"\xFF"));
- assert_eq!(
- t_err("(?-u)☃"),
- TestError {
- kind: hir::ErrorKind::UnicodeNotAllowed,
- span: Span::new(
- Position::new(5, 1, 6),
- Position::new(8, 1, 7)
- ),
- }
- );
+ assert_eq!(t("(?-u)☃"), hir_lit("☃"));
assert_eq!(
t_err(r"(?-u)\xFF"),
TestError {
@@ -1685,16 +1731,7 @@ mod tests {
);
assert_eq!(t_bytes(r"(?i-u)\xFF"), hir_blit(b"\xFF"));
- assert_eq!(
- t_err("(?i-u)β"),
- TestError {
- kind: hir::ErrorKind::UnicodeNotAllowed,
- span: Span::new(
- Position::new(6, 1, 7),
- Position::new(8, 1, 8),
- ),
- }
- );
+ assert_eq!(t("(?i-u)β"), hir_lit("β"),);
}
#[test]
@@ -3489,6 +3526,15 @@ mod tests {
assert!(!props(r"(?:z|xx)@|xx").is_alternation_literal());
}
+ // This tests that the smart Hir::repetition constructor does some basic
+ // simplifications.
+ #[test]
+ fn smart_repetition() {
+ assert_eq!(t(r"a{0}"), Hir::empty());
+ assert_eq!(t(r"a{1}"), hir_lit("a"));
+ assert_eq!(t(r"\B{32111}"), hir_look(hir::Look::WordUnicodeNegate));
+ }
+
// This tests that the smart Hir::concat constructor simplifies the given
// exprs in a way we expect.
#[test]
@@ -3580,4 +3626,99 @@ mod tests {
]),
);
}
+
+ #[test]
+ fn regression_alt_empty_concat() {
+ use crate::ast::{self, Ast};
+
+ let span = Span::splat(Position::new(0, 0, 0));
+ let ast = Ast::alternation(ast::Alternation {
+ span,
+ asts: vec![Ast::concat(ast::Concat { span, asts: vec![] })],
+ });
+
+ let mut t = Translator::new();
+ assert_eq!(Ok(Hir::empty()), t.translate("", &ast));
+ }
+
+ #[test]
+ fn regression_empty_alt() {
+ use crate::ast::{self, Ast};
+
+ let span = Span::splat(Position::new(0, 0, 0));
+ let ast = Ast::concat(ast::Concat {
+ span,
+ asts: vec![Ast::alternation(ast::Alternation {
+ span,
+ asts: vec![],
+ })],
+ });
+
+ let mut t = Translator::new();
+ assert_eq!(Ok(Hir::fail()), t.translate("", &ast));
+ }
+
+ #[test]
+ fn regression_singleton_alt() {
+ use crate::{
+ ast::{self, Ast},
+ hir::Dot,
+ };
+
+ let span = Span::splat(Position::new(0, 0, 0));
+ let ast = Ast::concat(ast::Concat {
+ span,
+ asts: vec![Ast::alternation(ast::Alternation {
+ span,
+ asts: vec![Ast::dot(span)],
+ })],
+ });
+
+ let mut t = Translator::new();
+ assert_eq!(Ok(Hir::dot(Dot::AnyCharExceptLF)), t.translate("", &ast));
+ }
+
+ // See: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=63168
+ #[test]
+ fn regression_fuzz_match() {
+ let pat = "[(\u{6} \0-\u{afdf5}] \0 ";
+ let ast = ParserBuilder::new()
+ .octal(false)
+ .ignore_whitespace(true)
+ .build()
+ .parse(pat)
+ .unwrap();
+ let hir = TranslatorBuilder::new()
+ .utf8(true)
+ .case_insensitive(false)
+ .multi_line(false)
+ .dot_matches_new_line(false)
+ .swap_greed(true)
+ .unicode(true)
+ .build()
+ .translate(pat, &ast)
+ .unwrap();
+ assert_eq!(
+ hir,
+ Hir::concat(vec![
+ hir_uclass(&[('\0', '\u{afdf5}')]),
+ hir_lit("\0"),
+ ])
+ );
+ }
+
+ // See: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=63155
+ #[cfg(feature = "unicode")]
+ #[test]
+ fn regression_fuzz_difference1() {
+ let pat = r"\W\W|\W[^\v--\W\W\P{Script_Extensions:Pau_Cin_Hau}\u10A1A1-\U{3E3E3}--~~~~--~~~~~~~~------~~~~~~--~~~~~~]*";
+ let _ = t(pat); // shouldn't panic
+ }
+
+ // See: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=63153
+ #[test]
+ fn regression_fuzz_char_decrement1() {
+ let pat = "w[w[^w?\rw\rw[^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w?\r\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0w?\rw[^w?\rw[^w?\rw[^w\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\u{1}\0]\0\0\0\0\0\0\0\0\0*\0\0\u{1}\0]\0\0-*\0][^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\u{1}\0]\0\0\0\0\0\0\0\0\0x\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\0\0\0*??\0\u{7f}{2}\u{10}??\0\0\0\0\0\0\0\0\0\u{3}\0\0\0}\0-*\0]\0\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\0\u{1}\0]\0\u{1}\u{1}H-i]-]\0\0\0\0\u{1}\0]\0\0\0\u{1}\0]\0\0-*\0\0\0\0\u{1}9-\u{7f}]\0'|-\u{7f}]\0'|(?i-ux)[-\u{7f}]\0'\u{3}\0\0\0}\0-*\0]<D\0\0\0\0\0\0\u{1}]\0\0\0\0]\0\0-*\0]\0\0 ";
+ let _ = t(pat); // shouldn't panic
+ }
}
diff --git a/vendor/regex-syntax/src/hir/visitor.rs b/vendor/regex-syntax/src/hir/visitor.rs
index e5f15cf1c..f30f0a163 100644
--- a/vendor/regex-syntax/src/hir/visitor.rs
+++ b/vendor/regex-syntax/src/hir/visitor.rs
@@ -41,6 +41,11 @@ pub trait Visitor {
fn visit_alternation_in(&mut self) -> Result<(), Self::Err> {
Ok(())
}
+
+ /// This method is called between child nodes of a concatenation.
+ fn visit_concat_in(&mut self) -> Result<(), Self::Err> {
+ Ok(())
+ }
}
/// Executes an implementation of `Visitor` in constant stack space.
@@ -131,8 +136,14 @@ impl<'a> HeapVisitor<'a> {
// If this is a concat/alternate, then we might have additional
// inductive steps to process.
if let Some(x) = self.pop(frame) {
- if let Frame::Alternation { .. } = x {
- visitor.visit_alternation_in()?;
+ match x {
+ Frame::Alternation { .. } => {
+ visitor.visit_alternation_in()?;
+ }
+ Frame::Concat { .. } => {
+ visitor.visit_concat_in()?;
+ }
+ _ => {}
}
hir = x.child();
self.stack.push((post_hir, x));
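The new `visit_concat_in` hook mirrors `visit_alternation_in`. A hedged sketch of a `Visitor` implementation that uses both hooks; the counter itself is illustrative only:

```rust
use regex_syntax::{
    hir::{visit, Hir, Visitor},
    Parser,
};

/// Counts how many times the walker steps between children of
/// concatenations and alternations.
#[derive(Default)]
struct SeparatorCounter {
    concat_seps: usize,
    alt_seps: usize,
}

impl Visitor for SeparatorCounter {
    type Output = (usize, usize);
    type Err = ();

    fn finish(self) -> Result<Self::Output, Self::Err> {
        Ok((self.concat_seps, self.alt_seps))
    }

    fn visit_concat_in(&mut self) -> Result<(), Self::Err> {
        self.concat_seps += 1;
        Ok(())
    }

    fn visit_alternation_in(&mut self) -> Result<(), Self::Err> {
        self.alt_seps += 1;
        Ok(())
    }
}

fn main() {
    let hir: Hir = Parser::new().parse("a.b|cd").unwrap();
    let (concats, alts) = visit(&hir, SeparatorCounter::default()).unwrap();
    println!("concat separators: {}, alternation separators: {}", concats, alts);
}
```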
diff --git a/vendor/regex-syntax/src/lib.rs b/vendor/regex-syntax/src/lib.rs
index 754858900..20f25db71 100644
--- a/vendor/regex-syntax/src/lib.rs
+++ b/vendor/regex-syntax/src/lib.rs
@@ -157,6 +157,11 @@ The following features are available:
[Unicode text segmentation algorithms](https://www.unicode.org/reports/tr29/).
This enables using classes like `\p{gcb=Extend}`, `\p{wb=Katakana}` and
`\p{sb=ATerm}`.
+* **arbitrary** -
+ Enabling this feature introduces a public dependency on the
+ [`arbitrary`](https://crates.io/crates/arbitrary)
+ crate. Namely, it implements the `Arbitrary` trait from that crate for the
+ [`Ast`](crate::ast::Ast) type. This feature is disabled by default.
*/
#![no_std]
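The `Arbitrary` impl is primarily useful for fuzzing. A hedged sketch of what the feature enables, assuming the `arbitrary` feature is turned on and the `arbitrary` crate is a direct dependency of the caller; `ast_from_fuzz_input` is a made-up helper name:

```rust
use arbitrary::{Arbitrary, Unstructured};
use regex_syntax::ast::Ast;

/// Builds a structured `Ast` directly from raw fuzzer bytes.
fn ast_from_fuzz_input(fuzz_input: &[u8]) -> arbitrary::Result<Ast> {
    let mut u = Unstructured::new(fuzz_input);
    Ast::arbitrary(&mut u)
}

fn main() {
    println!("{:?}", ast_from_fuzz_input(b"some arbitrary bytes"));
}
```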
@@ -317,6 +322,9 @@ pub fn is_escapeable_character(c: char) -> bool {
// escapeable, \< and \> will result in a parse error. Thus, we can
// turn them into something else in the future without it being a
// backwards incompatible change.
+ //
+ // OK, now we support \< and \>, and we need to retain them as *not*
+ // escapeable here since the escape sequence is significant.
'<' | '>' => false,
_ => true,
}
@@ -364,7 +372,7 @@ pub fn try_is_word_character(
/// Returns true if and only if the given character is an ASCII word character.
///
/// An ASCII word character is defined by the following character class:
-/// `[_0-9a-zA-Z]'.
+/// `[_0-9a-zA-Z]`.
pub fn is_word_byte(c: u8) -> bool {
match c {
b'_' | b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' => true,
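For reference, a minimal sketch of the doc comment's character class in use:

```rust
use regex_syntax::is_word_byte;

fn main() {
    assert!(is_word_byte(b'_'));
    assert!(is_word_byte(b'7'));
    assert!(is_word_byte(b'Z'));
    assert!(!is_word_byte(b'-'));
    assert!(!is_word_byte(b' '));
}
```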
diff --git a/vendor/regex-syntax/src/parser.rs b/vendor/regex-syntax/src/parser.rs
index 2e7a2bb80..f482b8466 100644
--- a/vendor/regex-syntax/src/parser.rs
+++ b/vendor/regex-syntax/src/parser.rs
@@ -165,6 +165,31 @@ impl ParserBuilder {
self
}
+ /// Sets the line terminator for use with `(?u-s:.)` and `(?-us:.)`.
+ ///
+ /// Namely, instead of `.` (by default) matching everything except for `\n`,
+ /// this will cause `.` to match everything except for the byte given.
+ ///
+ /// If `.` is used in a context where Unicode mode is enabled and this byte
+ /// isn't ASCII, then an error will be returned. When Unicode mode is
+ /// disabled, any byte is permitted, but an error will still be returned
+ /// if UTF-8 mode is enabled and the byte is not ASCII.
+ ///
+ /// In short, any ASCII value for a line terminator is always okay. But a
+ /// non-ASCII byte might result in an error depending on whether Unicode
+ /// mode or UTF-8 mode is enabled.
+ ///
+ /// Note that if `R` mode is enabled then it always takes precedence and
+ /// the line terminator will be treated as `\r` and `\n` simultaneously.
+ ///
+ /// Note also that this *doesn't* impact the look-around assertions
+ /// `(?m:^)` and `(?m:$)`. That's usually controlled by additional
+ /// configuration in the regex engine itself.
+ pub fn line_terminator(&mut self, byte: u8) -> &mut ParserBuilder {
+ self.hir.line_terminator(byte);
+ self
+ }
+
/// Enable or disable the "swap greed" flag by default.
///
/// By default this is disabled. It may alternatively be selectively
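`ParserBuilder::line_terminator` simply forwards to the HIR translator option above. A hedged end-to-end sketch, assuming the crate-level parser routes `.` through the new `Dot::AnyCharExcept` variant when given a custom ASCII terminator:

```rust
use regex_syntax::{
    hir::{Dot, Hir},
    ParserBuilder,
};

fn main() {
    // With a NUL line terminator, `.` translates to "any char except NUL"
    // rather than "any char except \n".
    let hir = ParserBuilder::new()
        .line_terminator(b'\x00')
        .build()
        .parse(".")
        .unwrap();
    assert_eq!(hir, Hir::dot(Dot::AnyCharExcept('\0')));
}
```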
diff --git a/vendor/regex-syntax/test b/vendor/regex-syntax/test
index a4d6cfaba..8626c3bfc 100755
--- a/vendor/regex-syntax/test
+++ b/vendor/regex-syntax/test
@@ -2,6 +2,10 @@
set -e
+# cd to the directory containing this crate's Cargo.toml so that we don't need
+# to pass --manifest-path to every `cargo` command.
+cd "$(dirname "$0")"
+
# This is a convenience script for running a broad swath of the syntax tests.
echo "===== DEFAULT FEATURES ==="
cargo test