diff options
Diffstat (limited to 'vendor/regex-syntax/src/lib.rs')
-rw-r--r-- | vendor/regex-syntax/src/lib.rs | 175 |
1 files changed, 143 insertions, 32 deletions
diff --git a/vendor/regex-syntax/src/lib.rs b/vendor/regex-syntax/src/lib.rs index 1dfb38af3..4953641d7 100644 --- a/vendor/regex-syntax/src/lib.rs +++ b/vendor/regex-syntax/src/lib.rs @@ -3,14 +3,14 @@ This crate provides a robust regular expression parser. This crate defines two primary types: -* [`Ast`](ast/enum.Ast.html) is the abstract syntax of a regular expression. +* [`Ast`](ast::Ast) is the abstract syntax of a regular expression. An abstract syntax corresponds to a *structured representation* of the concrete syntax of a regular expression, where the concrete syntax is the pattern string itself (e.g., `foo(bar)+`). Given some abstract syntax, it can be converted back to the original concrete syntax (modulo some details, like whitespace). To a first approximation, the abstract syntax is complex and difficult to analyze. -* [`Hir`](hir/struct.Hir.html) is the high-level intermediate representation +* [`Hir`](hir::Hir) is the high-level intermediate representation ("HIR" or "high-level IR" for short) of regular expression. It corresponds to an intermediate state of a regular expression that sits between the abstract syntax and the low level compiled opcodes that are eventually responsible for @@ -22,14 +22,15 @@ This crate defines two primary types: These two types come with conversion routines: -* An [`ast::parse::Parser`](ast/parse/struct.Parser.html) converts concrete - syntax (a `&str`) to an [`Ast`](ast/enum.Ast.html). -* A [`hir::translate::Translator`](hir/translate/struct.Translator.html) - converts an [`Ast`](ast/enum.Ast.html) to a [`Hir`](hir/struct.Hir.html). +* An [`ast::parse::Parser`] converts concrete syntax (a `&str`) to an +[`Ast`](ast::Ast). +* A [`hir::translate::Translator`] converts an [`Ast`](ast::Ast) to a +[`Hir`](hir::Hir). As a convenience, the above two conversion routines are combined into one via -the top-level [`Parser`](struct.Parser.html) type. This `Parser` will first -convert your pattern to an `Ast` and then convert the `Ast` to an `Hir`. +the top-level [`Parser`] type. This `Parser` will first convert your pattern to +an `Ast` and then convert the `Ast` to an `Hir`. It's also exposed as top-level +[`parse`] free function. # Example @@ -37,14 +38,14 @@ convert your pattern to an `Ast` and then convert the `Ast` to an `Hir`. This example shows how to parse a pattern string into its HIR: ``` -use regex_syntax::Parser; -use regex_syntax::hir::{self, Hir}; +use regex_syntax::{hir::Hir, parse}; -let hir = Parser::new().parse("a|b").unwrap(); +let hir = parse("a|b")?; assert_eq!(hir, Hir::alternation(vec![ - Hir::literal(hir::Literal::Unicode('a')), - Hir::literal(hir::Literal::Unicode('b')), + Hir::literal("a".as_bytes()), + Hir::literal("b".as_bytes()), ])); +# Ok::<(), Box<dyn std::error::Error>>(()) ``` @@ -81,10 +82,9 @@ in a monospace font. # Literal extraction -This crate provides limited support for -[literal extraction from `Hir` values](hir/literal/struct.Literals.html). -Be warned that literal extraction currently uses recursion, and therefore, -stack size proportional to the size of the `Hir`. +This crate provides limited support for [literal extraction from `Hir` +values](hir::literal). Be warned that literal extraction uses recursion, and +therefore, stack size proportional to the size of the `Hir`. The purpose of literal extraction is to speed up searches. That is, if you know a regular expression must match a prefix or suffix literal, then it is @@ -116,6 +116,11 @@ match semantics of a regular expression. The following features are available: +* **std** - + Enables support for the standard library. This feature is enabled by default. + When disabled, only `core` and `alloc` are used. Otherwise, enabling `std` + generally just enables `std::error::Error` trait impls for the various error + types. * **unicode** - Enables all Unicode features. This feature is enabled by default, and will always cover all Unicode features, even if more are added in the future. @@ -154,19 +159,32 @@ The following features are available: `\p{sb=ATerm}`. */ -#![deny(missing_docs)] -#![warn(missing_debug_implementations)] +#![no_std] #![forbid(unsafe_code)] +#![deny(missing_docs, rustdoc::broken_intra_doc_links)] +#![warn(missing_debug_implementations)] +#![cfg_attr(docsrs, feature(doc_auto_cfg))] + +#[cfg(any(test, feature = "std"))] +extern crate std; -pub use crate::error::{Error, Result}; -pub use crate::parser::{Parser, ParserBuilder}; -pub use crate::unicode::UnicodeWordError; +extern crate alloc; + +pub use crate::{ + error::Error, + parser::{parse, Parser, ParserBuilder}, + unicode::UnicodeWordError, +}; + +use alloc::string::String; pub mod ast; +mod debug; mod either; mod error; pub mod hir; mod parser; +mod rank; mod unicode; mod unicode_tables; pub mod utf8; @@ -197,13 +215,43 @@ pub fn escape_into(text: &str, buf: &mut String) { /// Returns true if the given character has significance in a regex. /// -/// These are the only characters that are allowed to be escaped, with one -/// exception: an ASCII space character may be escaped when extended mode (with -/// the `x` flag) is enabled. In particular, `is_meta_character(' ')` returns -/// `false`. +/// Generally speaking, these are the only characters which _must_ be escaped +/// in order to match their literal meaning. For example, to match a literal +/// `|`, one could write `\|`. Sometimes escaping isn't always necessary. For +/// example, `-` is treated as a meta character because of its significance +/// for writing ranges inside of character classes, but the regex `-` will +/// match a literal `-` because `-` has no special meaning outside of character +/// classes. +/// +/// In order to determine whether a character may be escaped at all, the +/// [`is_escapeable_character`] routine should be used. The difference between +/// `is_meta_character` and `is_escapeable_character` is that the latter will +/// return true for some characters that are _not_ meta characters. For +/// example, `%` and `\%` both match a literal `%` in all contexts. In other +/// words, `is_escapeable_character` includes "superfluous" escapes. /// /// Note that the set of characters for which this function returns `true` or -/// `false` is fixed and won't change in a semver compatible release. +/// `false` is fixed and won't change in a semver compatible release. (In this +/// case, "semver compatible release" actually refers to the `regex` crate +/// itself, since reducing or expanding the set of meta characters would be a +/// breaking change for not just `regex-syntax` but also `regex` itself.) +/// +/// # Example +/// +/// ``` +/// use regex_syntax::is_meta_character; +/// +/// assert!(is_meta_character('?')); +/// assert!(is_meta_character('-')); +/// assert!(is_meta_character('&')); +/// assert!(is_meta_character('#')); +/// +/// assert!(!is_meta_character('%')); +/// assert!(!is_meta_character('/')); +/// assert!(!is_meta_character('!')); +/// assert!(!is_meta_character('"')); +/// assert!(!is_meta_character('e')); +/// ``` pub fn is_meta_character(c: char) -> bool { match c { '\\' | '.' | '+' | '*' | '?' | '(' | ')' | '|' | '[' | ']' | '{' @@ -212,6 +260,68 @@ pub fn is_meta_character(c: char) -> bool { } } +/// Returns true if the given character can be escaped in a regex. +/// +/// This returns true in all cases that `is_meta_character` returns true, but +/// also returns true in some cases where `is_meta_character` returns false. +/// For example, `%` is not a meta character, but it is escapeable. That is, +/// `%` and `\%` both match a literal `%` in all contexts. +/// +/// The purpose of this routine is to provide knowledge about what characters +/// may be escaped. Namely, most regex engines permit "superfluous" escapes +/// where characters without any special significance may be escaped even +/// though there is no actual _need_ to do so. +/// +/// This will return false for some characters. For example, `e` is not +/// escapeable. Therefore, `\e` will either result in a parse error (which is +/// true today), or it could backwards compatibly evolve into a new construct +/// with its own meaning. Indeed, that is the purpose of banning _some_ +/// superfluous escapes: it provides a way to evolve the syntax in a compatible +/// manner. +/// +/// # Example +/// +/// ``` +/// use regex_syntax::is_escapeable_character; +/// +/// assert!(is_escapeable_character('?')); +/// assert!(is_escapeable_character('-')); +/// assert!(is_escapeable_character('&')); +/// assert!(is_escapeable_character('#')); +/// assert!(is_escapeable_character('%')); +/// assert!(is_escapeable_character('/')); +/// assert!(is_escapeable_character('!')); +/// assert!(is_escapeable_character('"')); +/// +/// assert!(!is_escapeable_character('e')); +/// ``` +pub fn is_escapeable_character(c: char) -> bool { + // Certainly escapeable if it's a meta character. + if is_meta_character(c) { + return true; + } + // Any character that isn't ASCII is definitely not escapeable. There's + // no real need to allow things like \☃ right? + if !c.is_ascii() { + return false; + } + // Otherwise, we basically say that everything is escapeable unless it's a + // letter or digit. Things like \3 are either octal (when enabled) or an + // error, and we should keep it that way. Otherwise, letters are reserved + // for adding new syntax in a backwards compatible way. + match c { + '0'..='9' | 'A'..='Z' | 'a'..='z' => false, + // While not currently supported, we keep these as not escapeable to + // give us some flexibility with respect to supporting the \< and + // \> word boundary assertions in the future. By rejecting them as + // escapeable, \< and \> will result in a parse error. Thus, we can + // turn them into something else in the future without it being a + // backwards incompatible change. + '<' | '>' => false, + _ => true, + } +} + /// Returns true if and only if the given character is a Unicode word /// character. /// @@ -224,10 +334,9 @@ pub fn is_meta_character(c: char) -> bool { /// /// # Panics /// -/// If the `unicode-perl` feature is not enabled, then this function panics. -/// For this reason, it is recommended that callers use -/// [`try_is_word_character`](fn.try_is_word_character.html) -/// instead. +/// If the `unicode-perl` feature is not enabled, then this function +/// panics. For this reason, it is recommended that callers use +/// [`try_is_word_character`] instead. pub fn is_word_character(c: char) -> bool { try_is_word_character(c).expect("unicode-perl feature must be enabled") } @@ -248,7 +357,7 @@ pub fn is_word_character(c: char) -> bool { /// returns an error. pub fn try_is_word_character( c: char, -) -> std::result::Result<bool, UnicodeWordError> { +) -> core::result::Result<bool, UnicodeWordError> { unicode::is_word_character(c) } @@ -265,6 +374,8 @@ pub fn is_word_byte(c: u8) -> bool { #[cfg(test)] mod tests { + use alloc::string::ToString; + use super::*; #[test] |