summaryrefslogtreecommitdiffstats
path: root/third_party/rust/regex-syntax/src/hir/translate.rs
diff options
context:
space:
mode:
Diffstat (limited to 'third_party/rust/regex-syntax/src/hir/translate.rs')
-rw-r--r--third_party/rust/regex-syntax/src/hir/translate.rs3207
1 files changed, 3207 insertions, 0 deletions
diff --git a/third_party/rust/regex-syntax/src/hir/translate.rs b/third_party/rust/regex-syntax/src/hir/translate.rs
new file mode 100644
index 0000000000..890e1608b3
--- /dev/null
+++ b/third_party/rust/regex-syntax/src/hir/translate.rs
@@ -0,0 +1,3207 @@
+/*!
+Defines a translator that converts an `Ast` to an `Hir`.
+*/
+
+use std::cell::{Cell, RefCell};
+use std::result;
+
+use crate::ast::{self, Ast, Span, Visitor};
+use crate::hir::{self, Error, ErrorKind, Hir};
+use crate::unicode::{self, ClassQuery};
+
+type Result<T> = result::Result<T, Error>;
+
+/// A builder for constructing an AST->HIR translator.
+///
+/// The builder holds the two pieces of configuration that `build` copies
+/// into every `Translator` it creates.
+#[derive(Clone, Debug)]
+pub struct TranslatorBuilder {
+ /// Whether built translators may produce HIR that matches invalid UTF-8.
+ allow_invalid_utf8: bool,
+ /// The flag settings that built translators start out with.
+ flags: Flags,
+}
+
+impl Default for TranslatorBuilder {
+    /// The default builder is the one returned by `TranslatorBuilder::new`.
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl TranslatorBuilder {
+    /// Create a new translator builder with a default configuration.
+    ///
+    /// The default configuration forbids invalid UTF-8 and sets no flags.
+    pub fn new() -> TranslatorBuilder {
+        let flags = Flags::default();
+        TranslatorBuilder { allow_invalid_utf8: false, flags }
+    }
+
+    /// Build a translator using the current configuration.
+    ///
+    /// The translator starts with an empty stack and a copy of this
+    /// builder's flags.
+    pub fn build(&self) -> Translator {
+        let stack = RefCell::new(Vec::new());
+        let flags = Cell::new(self.flags);
+        Translator { stack, flags, allow_invalid_utf8: self.allow_invalid_utf8 }
+    }
+
+    /// When enabled, translation will permit the construction of a regular
+    /// expression that may match invalid UTF-8.
+    ///
+    /// When disabled (the default), the translator is guaranteed to produce
+    /// an expression that will only ever match valid UTF-8 (otherwise, the
+    /// translator will return an error).
+    ///
+    /// Perhaps surprisingly, when invalid UTF-8 isn't allowed, a negated
+    /// ASCII word boundary (uttered as `(?-u:\B)` in the concrete syntax)
+    /// will cause the translator to return an error. Namely, a negated ASCII
+    /// word boundary can result in matching positions that aren't valid
+    /// UTF-8 boundaries.
+    pub fn allow_invalid_utf8(&mut self, yes: bool) -> &mut TranslatorBuilder {
+        self.allow_invalid_utf8 = yes;
+        self
+    }
+
+    /// Enable or disable the case insensitive flag (`i`) by default.
+    ///
+    /// Passing `false` leaves the flag unset rather than explicitly
+    /// disabled.
+    pub fn case_insensitive(&mut self, yes: bool) -> &mut TranslatorBuilder {
+        let setting = if yes { Some(true) } else { None };
+        self.flags.case_insensitive = setting;
+        self
+    }
+
+    /// Enable or disable the multi-line matching flag (`m`) by default.
+    ///
+    /// Passing `false` leaves the flag unset rather than explicitly
+    /// disabled.
+    pub fn multi_line(&mut self, yes: bool) -> &mut TranslatorBuilder {
+        let setting = if yes { Some(true) } else { None };
+        self.flags.multi_line = setting;
+        self
+    }
+
+    /// Enable or disable the "dot matches any character" flag (`s`) by
+    /// default.
+    ///
+    /// Passing `false` leaves the flag unset rather than explicitly
+    /// disabled.
+    pub fn dot_matches_new_line(
+        &mut self,
+        yes: bool,
+    ) -> &mut TranslatorBuilder {
+        let setting = if yes { Some(true) } else { None };
+        self.flags.dot_matches_new_line = setting;
+        self
+    }
+
+    /// Enable or disable the "swap greed" flag (`U`) by default.
+    ///
+    /// Passing `false` leaves the flag unset rather than explicitly
+    /// disabled.
+    pub fn swap_greed(&mut self, yes: bool) -> &mut TranslatorBuilder {
+        let setting = if yes { Some(true) } else { None };
+        self.flags.swap_greed = setting;
+        self
+    }
+
+    /// Enable or disable the Unicode flag (`u`) by default.
+    ///
+    /// Unlike the other flags, this one is stored inverted: `true` leaves
+    /// the flag unset, while `false` records an explicit "disabled".
+    pub fn unicode(&mut self, yes: bool) -> &mut TranslatorBuilder {
+        let setting = if yes { None } else { Some(false) };
+        self.flags.unicode = setting;
+        self
+    }
+}
+
+/// A translator maps abstract syntax to a high level intermediate
+/// representation.
+///
+/// A translator may benefit from reuse. That is, a translator can translate
+/// many abstract syntax trees.
+///
+/// A `Translator` can be configured in more detail via a
+/// [`TranslatorBuilder`](struct.TranslatorBuilder.html).
+#[derive(Clone, Debug)]
+pub struct Translator {
+ /// Our call stack, but on the heap.
+ stack: RefCell<Vec<HirFrame>>,
+ /// The current flag settings.
+ flags: Cell<Flags>,
+ /// Whether we're allowed to produce HIR that can match arbitrary bytes.
+ allow_invalid_utf8: bool,
+}
+
+impl Translator {
+    /// Create a new translator using the default configuration.
+    pub fn new() -> Translator {
+        TranslatorBuilder::new().build()
+    }
+
+    /// Translate the given abstract syntax tree (AST) into a high level
+    /// intermediate representation (HIR).
+    ///
+    /// If there was a problem doing the translation, then an HIR-specific
+    /// error is returned.
+    ///
+    /// The original pattern string used to produce the `Ast` *must* also be
+    /// provided. The translator does not use the pattern string during any
+    /// correct translation, but is used for error reporting.
+    ///
+    /// Each call is independent of the previous ones: once the translation
+    /// finishes (successfully or not), the translator's flags are restored
+    /// to their configured values and its internal stack is emptied.
+    pub fn translate(&mut self, pattern: &str, ast: &Ast) -> Result<Hir> {
+        // Flags set at the top level of a pattern (e.g. `(?i)abc`) are
+        // written into `self.flags` and nothing undoes them, so remember the
+        // configured flags in order to restore them afterwards.
+        let configured_flags = self.flags.get();
+        let result = ast::visit(ast, TranslatorI::new(self, pattern));
+        // A translation that fails part way through leaves its frames on the
+        // stack. Drop them so the next translation starts from a clean
+        // slate instead of tripping over stale frames.
+        self.stack.borrow_mut().clear();
+        self.flags.set(configured_flags);
+        result
+    }
+}
+
+/// An HirFrame is a single stack frame, represented explicitly, which is
+/// created for each item in the Ast that we traverse.
+///
+/// Note that technically, this type doesn't represent our entire stack
+/// frame. In particular, the Ast visitor represents any state associated with
+/// traversing the Ast itself.
+#[derive(Clone, Debug)]
+enum HirFrame {
+ /// An arbitrary HIR expression. These get pushed whenever we hit a base
+ /// case in the Ast. They get popped after an inductive (i.e., recursive)
+ /// step is complete.
+ Expr(Hir),
+ /// A Unicode character class. This frame is mutated as we descend into
+ /// the Ast of a character class (which is itself its own mini recursive
+ /// structure).
+ ClassUnicode(hir::ClassUnicode),
+ /// A byte-oriented character class. This frame is mutated as we descend
+ /// into the Ast of a character class (which is itself its own mini
+ /// recursive structure).
+ ///
+ /// Byte character classes are created when Unicode mode (`u`) is disabled.
+ /// If `allow_invalid_utf8` is disabled (the default), then a byte
+ /// character class is only permitted to match ASCII text.
+ ClassBytes(hir::ClassBytes),
+ /// This is pushed on to the stack upon first seeing any kind of group,
+ /// indicated by parentheses (including non-capturing groups). It is popped
+ /// upon leaving a group.
+ Group {
+ /// The old active flags when this group was opened.
+ ///
+ /// If this group sets flags, then the new active flags are set to the
+ /// result of merging the old flags with the flags introduced by this
+ /// group. If the group doesn't set any flags, then this is simply
+ /// equivalent to whatever flags were set when the group was opened.
+ ///
+ /// When this group is popped, the active flags should be restored to
+ /// the flags set here.
+ ///
+ /// The "active" flags correspond to whatever flags are set in the
+ /// Translator.
+ old_flags: Flags,
+ },
+ /// This is pushed whenever a non-empty concatenation is observed. After
+ /// visiting every sub-expression in the concatenation, the translator's
+ /// stack is popped until it sees a Concat frame.
+ Concat,
+ /// This is pushed whenever a non-empty alternation is observed. After
+ /// visiting every sub-expression in the alternation, the translator's
+ /// stack is popped until it sees an Alternation frame.
+ Alternation,
+}
+
+impl HirFrame {
+    /// Return the Hir expression held by this frame.
+    ///
+    /// Panics if this frame is not an `Expr` frame.
+    fn unwrap_expr(self) -> Hir {
+        match self {
+            HirFrame::Expr(expr) => expr,
+            other => {
+                panic!("tried to unwrap expr from HirFrame, got: {:?}", other)
+            }
+        }
+    }
+
+    /// Return the Unicode class held by this frame.
+    ///
+    /// Panics if this frame is not a `ClassUnicode` frame.
+    fn unwrap_class_unicode(self) -> hir::ClassUnicode {
+        match self {
+            HirFrame::ClassUnicode(cls) => cls,
+            other => panic!(
+                "tried to unwrap Unicode class \
+                 from HirFrame, got: {:?}",
+                other
+            ),
+        }
+    }
+
+    /// Return the byte class held by this frame.
+    ///
+    /// Panics if this frame is not a `ClassBytes` frame.
+    fn unwrap_class_bytes(self) -> hir::ClassBytes {
+        match self {
+            HirFrame::ClassBytes(cls) => cls,
+            other => panic!(
+                "tried to unwrap byte class \
+                 from HirFrame, got: {:?}",
+                other
+            ),
+        }
+    }
+
+    /// Return the flags recorded by this group frame, i.e. the flags that
+    /// were active at the time the group was entered.
+    ///
+    /// Panics if this frame is not a `Group` frame.
+    fn unwrap_group(self) -> Flags {
+        match self {
+            HirFrame::Group { old_flags } => old_flags,
+            other => {
+                panic!("tried to unwrap group from HirFrame, got: {:?}", other)
+            }
+        }
+    }
+}
+
+impl<'t, 'p> Visitor for TranslatorI<'t, 'p> {
+ type Output = Hir;
+ type Err = Error;
+
+ /// Finish the traversal and hand back the translated expression.
+ ///
+ /// Panics if the stack does not hold exactly one frame, or if that frame
+ /// is not an expression.
+ fn finish(self) -> Result<Hir> {
+     // A completed traversal leaves exactly one frame behind: the HIR for
+     // the whole pattern.
+     let depth = self.trans().stack.borrow().len();
+     assert_eq!(depth, 1);
+     let frame = self.pop().unwrap();
+     Ok(frame.unwrap_expr())
+ }
+
+ /// Called before the children of `ast` are visited.
+ ///
+ /// Pushes the marker or accumulator frame that `visit_post` later relies
+ /// on: an empty class for bracketed classes, a `Group` frame carrying the
+ /// previous flags for groups, and `Concat`/`Alternation` markers for
+ /// non-empty concatenations and alternations. Other nodes push nothing.
+ fn visit_pre(&mut self, ast: &Ast) -> Result<()> {
+     match *ast {
+         Ast::Class(ast::Class::Bracketed(_)) => {
+             // The kind of accumulator depends on whether Unicode mode is
+             // active at the point the class is opened.
+             let frame = if self.flags().unicode() {
+                 HirFrame::ClassUnicode(hir::ClassUnicode::empty())
+             } else {
+                 HirFrame::ClassBytes(hir::ClassBytes::empty())
+             };
+             self.push(frame);
+         }
+         Ast::Group(ref group) => {
+             // If the group carries flags, apply them now; either way,
+             // remember what was active before the group was entered.
+             let old_flags = match group.flags() {
+                 Some(ast_flags) => self.set_flags(ast_flags),
+                 None => self.flags(),
+             };
+             self.push(HirFrame::Group { old_flags });
+         }
+         Ast::Concat(ref concat) => {
+             if !concat.asts.is_empty() {
+                 self.push(HirFrame::Concat);
+             }
+         }
+         Ast::Alternation(ref alternation) => {
+             if !alternation.asts.is_empty() {
+                 self.push(HirFrame::Alternation);
+             }
+         }
+         _ => {}
+     }
+     Ok(())
+ }
+
+ /// Called after the children of `ast` have been visited.
+ ///
+ /// Every arm ends by pushing exactly one `Expr` frame holding the HIR for
+ /// `ast`, after consuming whatever frames its children (and `visit_pre`)
+ /// left on the stack. Returns an error if the node cannot be translated
+ /// under the active flags.
+ fn visit_post(&mut self, ast: &Ast) -> Result<()> {
+ match *ast {
+ Ast::Empty(_) => {
+ self.push(HirFrame::Expr(Hir::empty()));
+ }
+ Ast::Flags(ref x) => {
+ self.set_flags(&x.flags);
+ // Flags in the AST are generally considered directives and
+ // not actual sub-expressions. However, they can be used in
+ // the concrete syntax like `((?i))`, and we need some kind of
+ // indication of an expression there, and Empty is the correct
+ // choice.
+ //
+ // There can also be things like `(?i)+`, but we rule those out
+ // in the parser. In the future, we might allow them for
+ // consistency sake.
+ self.push(HirFrame::Expr(Hir::empty()));
+ }
+ Ast::Literal(ref x) => {
+ self.push(HirFrame::Expr(self.hir_literal(x)?));
+ }
+ Ast::Dot(span) => {
+ self.push(HirFrame::Expr(self.hir_dot(span)?));
+ }
+ Ast::Assertion(ref x) => {
+ self.push(HirFrame::Expr(self.hir_assertion(x)?));
+ }
+ Ast::Class(ast::Class::Perl(ref x)) => {
+ if self.flags().unicode() {
+ let cls = self.hir_perl_unicode_class(x)?;
+ let hcls = hir::Class::Unicode(cls);
+ self.push(HirFrame::Expr(Hir::class(hcls)));
+ } else {
+ let cls = self.hir_perl_byte_class(x);
+ let hcls = hir::Class::Bytes(cls);
+ self.push(HirFrame::Expr(Hir::class(hcls)));
+ }
+ }
+ Ast::Class(ast::Class::Unicode(ref x)) => {
+ let cls = hir::Class::Unicode(self.hir_unicode_class(x)?);
+ self.push(HirFrame::Expr(Hir::class(cls)));
+ }
+ Ast::Class(ast::Class::Bracketed(ref ast)) => {
+ // The class frame popped here is the accumulator pushed by
+ // `visit_pre`, filled in by the class set item callbacks.
+ if self.flags().unicode() {
+ let mut cls = self.pop().unwrap().unwrap_class_unicode();
+ self.unicode_fold_and_negate(
+ &ast.span,
+ ast.negated,
+ &mut cls,
+ )?;
+ if cls.ranges().is_empty() {
+ return Err(self.error(
+ ast.span,
+ ErrorKind::EmptyClassNotAllowed,
+ ));
+ }
+ let expr = Hir::class(hir::Class::Unicode(cls));
+ self.push(HirFrame::Expr(expr));
+ } else {
+ let mut cls = self.pop().unwrap().unwrap_class_bytes();
+ self.bytes_fold_and_negate(
+ &ast.span,
+ ast.negated,
+ &mut cls,
+ )?;
+ if cls.ranges().is_empty() {
+ return Err(self.error(
+ ast.span,
+ ErrorKind::EmptyClassNotAllowed,
+ ));
+ }
+
+ let expr = Hir::class(hir::Class::Bytes(cls));
+ self.push(HirFrame::Expr(expr));
+ }
+ }
+ Ast::Repetition(ref x) => {
+ // The top of the stack is the expression being repeated.
+ let expr = self.pop().unwrap().unwrap_expr();
+ self.push(HirFrame::Expr(self.hir_repetition(x, expr)));
+ }
+ Ast::Group(ref x) => {
+ // The group's sub-expression sits on top of the `Group` frame
+ // pushed by `visit_pre`; leaving the group restores the flags
+ // that were active before it was entered.
+ let expr = self.pop().unwrap().unwrap_expr();
+ let old_flags = self.pop().unwrap().unwrap_group();
+ self.trans().flags.set(old_flags);
+ self.push(HirFrame::Expr(self.hir_group(x, expr)));
+ }
+ Ast::Concat(_) => {
+ // Pop expressions until a non-`Expr` frame is popped (or the
+ // stack is empty). The frame that ends the loop is discarded.
+ // Empty sub-expressions are dropped from the concatenation.
+ let mut exprs = vec![];
+ while let Some(HirFrame::Expr(expr)) = self.pop() {
+ if !expr.kind().is_empty() {
+ exprs.push(expr);
+ }
+ }
+ // Popping yields the sub-expressions last-to-first.
+ exprs.reverse();
+ self.push(HirFrame::Expr(Hir::concat(exprs)));
+ }
+ Ast::Alternation(_) => {
+ // Same as for concatenations, except that empty
+ // sub-expressions are kept.
+ let mut exprs = vec![];
+ while let Some(HirFrame::Expr(expr)) = self.pop() {
+ exprs.push(expr);
+ }
+ exprs.reverse();
+ self.push(HirFrame::Expr(Hir::alternation(exprs)));
+ }
+ }
+ Ok(())
+ }
+
+ /// Called before a class set item is visited.
+ ///
+ /// A nested bracketed class gets its own empty accumulator frame; every
+ /// other kind of item pushes nothing here.
+ fn visit_class_set_item_pre(
+     &mut self,
+     ast: &ast::ClassSetItem,
+ ) -> Result<()> {
+     // Unions need no handling of their own: the visitor walks their
+     // items for us.
+     if let ast::ClassSetItem::Bracketed(_) = *ast {
+         let frame = if self.flags().unicode() {
+             HirFrame::ClassUnicode(hir::ClassUnicode::empty())
+         } else {
+             HirFrame::ClassBytes(hir::ClassBytes::empty())
+         };
+         self.push(frame);
+     }
+     Ok(())
+ }
+
+ /// Called after a class set item has been visited.
+ ///
+ /// Each non-empty item pops the class frame on top of the stack, adds the
+ /// item's characters (or bytes, when Unicode mode is disabled) to it and
+ /// pushes the frame back. Returns an error if the item cannot be
+ /// represented under the active flags.
+ fn visit_class_set_item_post(
+ &mut self,
+ ast: &ast::ClassSetItem,
+ ) -> Result<()> {
+ match *ast {
+ ast::ClassSetItem::Empty(_) => {}
+ ast::ClassSetItem::Literal(ref x) => {
+ // A literal is added as a single-element range.
+ if self.flags().unicode() {
+ let mut cls = self.pop().unwrap().unwrap_class_unicode();
+ cls.push(hir::ClassUnicodeRange::new(x.c, x.c));
+ self.push(HirFrame::ClassUnicode(cls));
+ } else {
+ let mut cls = self.pop().unwrap().unwrap_class_bytes();
+ let byte = self.class_literal_byte(x)?;
+ cls.push(hir::ClassBytesRange::new(byte, byte));
+ self.push(HirFrame::ClassBytes(cls));
+ }
+ }
+ ast::ClassSetItem::Range(ref x) => {
+ if self.flags().unicode() {
+ let mut cls = self.pop().unwrap().unwrap_class_unicode();
+ cls.push(hir::ClassUnicodeRange::new(x.start.c, x.end.c));
+ self.push(HirFrame::ClassUnicode(cls));
+ } else {
+ let mut cls = self.pop().unwrap().unwrap_class_bytes();
+ let start = self.class_literal_byte(&x.start)?;
+ let end = self.class_literal_byte(&x.end)?;
+ cls.push(hir::ClassBytesRange::new(start, end));
+ self.push(HirFrame::ClassBytes(cls));
+ }
+ }
+ ast::ClassSetItem::Ascii(ref x) => {
+ if self.flags().unicode() {
+ let xcls = self.hir_ascii_unicode_class(x)?;
+ let mut cls = self.pop().unwrap().unwrap_class_unicode();
+ cls.union(&xcls);
+ self.push(HirFrame::ClassUnicode(cls));
+ } else {
+ let xcls = self.hir_ascii_byte_class(x)?;
+ let mut cls = self.pop().unwrap().unwrap_class_bytes();
+ cls.union(&xcls);
+ self.push(HirFrame::ClassBytes(cls));
+ }
+ }
+ ast::ClassSetItem::Unicode(ref x) => {
+ // `hir_unicode_class` returns an error when Unicode mode is
+ // disabled, so only the Unicode frame needs handling here.
+ let xcls = self.hir_unicode_class(x)?;
+ let mut cls = self.pop().unwrap().unwrap_class_unicode();
+ cls.union(&xcls);
+ self.push(HirFrame::ClassUnicode(cls));
+ }
+ ast::ClassSetItem::Perl(ref x) => {
+ if self.flags().unicode() {
+ let xcls = self.hir_perl_unicode_class(x)?;
+ let mut cls = self.pop().unwrap().unwrap_class_unicode();
+ cls.union(&xcls);
+ self.push(HirFrame::ClassUnicode(cls));
+ } else {
+ let xcls = self.hir_perl_byte_class(x);
+ let mut cls = self.pop().unwrap().unwrap_class_bytes();
+ cls.union(&xcls);
+ self.push(HirFrame::ClassBytes(cls));
+ }
+ }
+ ast::ClassSetItem::Bracketed(ref ast) => {
+ // `cls1` is the nested class (its frame was pushed by
+ // `visit_class_set_item_pre`); `cls2` is the frame beneath it,
+ // into which the finished nested class is merged.
+ if self.flags().unicode() {
+ let mut cls1 = self.pop().unwrap().unwrap_class_unicode();
+ self.unicode_fold_and_negate(
+ &ast.span,
+ ast.negated,
+ &mut cls1,
+ )?;
+
+ let mut cls2 = self.pop().unwrap().unwrap_class_unicode();
+ cls2.union(&cls1);
+ self.push(HirFrame::ClassUnicode(cls2));
+ } else {
+ let mut cls1 = self.pop().unwrap().unwrap_class_bytes();
+ self.bytes_fold_and_negate(
+ &ast.span,
+ ast.negated,
+ &mut cls1,
+ )?;
+
+ let mut cls2 = self.pop().unwrap().unwrap_class_bytes();
+ cls2.union(&cls1);
+ self.push(HirFrame::ClassBytes(cls2));
+ }
+ }
+ // This is handled automatically by the visitor.
+ ast::ClassSetItem::Union(_) => {}
+ }
+ Ok(())
+ }
+
+ /// Called before the left-hand side of a class set binary operation is
+ /// visited. Pushes an empty class frame for that operand to fill in.
+ fn visit_class_set_binary_op_pre(
+     &mut self,
+     _op: &ast::ClassSetBinaryOp,
+ ) -> Result<()> {
+     let frame = if self.flags().unicode() {
+         HirFrame::ClassUnicode(hir::ClassUnicode::empty())
+     } else {
+         HirFrame::ClassBytes(hir::ClassBytes::empty())
+     };
+     self.push(frame);
+     Ok(())
+ }
+
+ /// Called between the two operands of a class set binary operation.
+ /// Pushes an empty class frame for the right-hand side to fill in.
+ fn visit_class_set_binary_op_in(
+     &mut self,
+     _op: &ast::ClassSetBinaryOp,
+ ) -> Result<()> {
+     let frame = if self.flags().unicode() {
+         HirFrame::ClassUnicode(hir::ClassUnicode::empty())
+     } else {
+         HirFrame::ClassBytes(hir::ClassBytes::empty())
+     };
+     self.push(frame);
+     Ok(())
+ }
+
+ /// Called after both operands of a class set binary operation have been
+ /// visited.
+ ///
+ /// Pops three class frames: the right operand (pushed by `_in`), the left
+ /// operand (pushed by `_pre`) and the frame beneath them (presumably the
+ /// class being built by the enclosing context). The operation is applied
+ /// to the operands and the result is merged into that third frame, which
+ /// is pushed back.
+ fn visit_class_set_binary_op_post(
+ &mut self,
+ op: &ast::ClassSetBinaryOp,
+ ) -> Result<()> {
+ use crate::ast::ClassSetBinaryOpKind::*;
+
+ if self.flags().unicode() {
+ // Frames come off the stack in the reverse of the order in which
+ // they were pushed.
+ let mut rhs = self.pop().unwrap().unwrap_class_unicode();
+ let mut lhs = self.pop().unwrap().unwrap_class_unicode();
+ let mut cls = self.pop().unwrap().unwrap_class_unicode();
+ // Case folding is applied to each operand before the operation.
+ if self.flags().case_insensitive() {
+ rhs.try_case_fold_simple().map_err(|_| {
+ self.error(
+ op.rhs.span().clone(),
+ ErrorKind::UnicodeCaseUnavailable,
+ )
+ })?;
+ lhs.try_case_fold_simple().map_err(|_| {
+ self.error(
+ op.lhs.span().clone(),
+ ErrorKind::UnicodeCaseUnavailable,
+ )
+ })?;
+ }
+ // Each operation updates `lhs` in place.
+ match op.kind {
+ Intersection => lhs.intersect(&rhs),
+ Difference => lhs.difference(&rhs),
+ SymmetricDifference => lhs.symmetric_difference(&rhs),
+ }
+ cls.union(&lhs);
+ self.push(HirFrame::ClassUnicode(cls));
+ } else {
+ let mut rhs = self.pop().unwrap().unwrap_class_bytes();
+ let mut lhs = self.pop().unwrap().unwrap_class_bytes();
+ let mut cls = self.pop().unwrap().unwrap_class_bytes();
+ if self.flags().case_insensitive() {
+ rhs.case_fold_simple();
+ lhs.case_fold_simple();
+ }
+ match op.kind {
+ Intersection => lhs.intersect(&rhs),
+ Difference => lhs.difference(&rhs),
+ SymmetricDifference => lhs.symmetric_difference(&rhs),
+ }
+ cls.union(&lhs);
+ self.push(HirFrame::ClassBytes(cls));
+ }
+ Ok(())
+ }
+}
+
+/// The internal implementation of a translator.
+///
+/// This type is responsible for carrying around the original pattern string,
+/// which is not tied to the internal state of a translator.
+///
+/// A TranslatorI exists for the time it takes to translate a single Ast.
+#[derive(Clone, Debug)]
+struct TranslatorI<'t, 'p> {
+ /// The translator whose stack and flags are used during translation.
+ trans: &'t Translator,
+ /// The pattern string, copied into any error that is reported.
+ pattern: &'p str,
+}
+
+impl<'t, 'p> TranslatorI<'t, 'p> {
+ /// Build a new internal translator that borrows the given translator and
+ /// pattern string.
+ fn new(trans: &'t Translator, pattern: &'p str) -> TranslatorI<'t, 'p> {
+     Self { trans, pattern }
+ }
+
+ /// Return a reference to the underlying translator.
+ fn trans(&self) -> &Translator {
+     self.trans
+ }
+
+ /// Push the given frame on to the translator's stack.
+ fn push(&self, frame: HirFrame) {
+     let mut stack = self.trans().stack.borrow_mut();
+     stack.push(frame);
+ }
+
+ /// Pop the top frame off the translator's stack, or return None if the
+ /// stack is empty.
+ fn pop(&self) -> Option<HirFrame> {
+     let mut stack = self.trans().stack.borrow_mut();
+     stack.pop()
+ }
+
+ /// Create a new error with the given span and error type.
+ fn error(&self, span: Span, kind: ErrorKind) -> Error {
+ Error { kind, pattern: self.pattern.to_string(), span }
+ }
+
+ /// Return a copy of the flags that are currently active.
+ fn flags(&self) -> Flags {
+     let active = self.trans().flags.get();
+     active
+ }
+
+ /// Make the flags in the given AST active and return the flags that were
+ /// active before.
+ ///
+ /// Any flag the AST does not mention keeps its previous setting.
+ fn set_flags(&self, ast_flags: &ast::Flags) -> Flags {
+     let previous = self.flags();
+     let mut merged = Flags::from_ast(ast_flags);
+     merged.merge(&previous);
+     self.trans().flags.set(merged);
+     previous
+ }
+
+ /// Translate a literal into an HIR expression.
+ ///
+ /// Raw bytes are emitted as-is. Characters go through the case
+ /// insensitive path when the `i` flag is active.
+ fn hir_literal(&self, lit: &ast::Literal) -> Result<Hir> {
+     match self.literal_to_char(lit)? {
+         byte @ hir::Literal::Byte(_) => Ok(Hir::literal(byte)),
+         hir::Literal::Unicode(ch) if self.flags().case_insensitive() => {
+             self.hir_from_char_case_insensitive(lit.span, ch)
+         }
+         hir::Literal::Unicode(ch) => self.hir_from_char(lit.span, ch),
+     }
+ }
+
+ /// Convert an Ast literal to its scalar representation.
+ ///
+ /// In Unicode mode, or when the literal has no byte form, the literal's
+ /// `char` is returned. Otherwise the literal's byte is returned: as a
+ /// `char` when it is ASCII, and as a raw byte when it is not. A non-ASCII
+ /// byte is an error unless invalid UTF-8 is allowed.
+ fn literal_to_char(&self, lit: &ast::Literal) -> Result<hir::Literal> {
+     if self.flags().unicode() {
+         return Ok(hir::Literal::Unicode(lit.c));
+     }
+     match lit.byte() {
+         None => Ok(hir::Literal::Unicode(lit.c)),
+         Some(byte) if byte <= 0x7F => {
+             Ok(hir::Literal::Unicode(byte as char))
+         }
+         Some(_) if !self.trans().allow_invalid_utf8 => {
+             Err(self.error(lit.span, ErrorKind::InvalidUtf8))
+         }
+         Some(byte) => Ok(hir::Literal::Byte(byte)),
+     }
+ }
+
+ /// Build a literal expression for the given character.
+ ///
+ /// Characters that need more than one byte of UTF-8 are rejected when
+ /// Unicode mode is disabled.
+ fn hir_from_char(&self, span: Span, c: char) -> Result<Hir> {
+     let multi_byte = c.len_utf8() > 1;
+     if multi_byte && !self.flags().unicode() {
+         Err(self.error(span, ErrorKind::UnicodeNotAllowed))
+     } else {
+         Ok(Hir::literal(hir::Literal::Unicode(c)))
+     }
+ }
+
+ /// Build an expression matching the given character case insensitively.
+ ///
+ /// If case folding would not change anything, this falls back to a plain
+ /// literal. Otherwise the result is a class holding the character and its
+ /// case variants.
+ fn hir_from_char_case_insensitive(
+     &self,
+     span: Span,
+     c: char,
+ ) -> Result<Hir> {
+     if !self.flags().unicode() {
+         if c.len_utf8() > 1 {
+             return Err(self.error(span, ErrorKind::UnicodeNotAllowed));
+         }
+         // Only ASCII letters have another case to fold to.
+         if !c.is_ascii_alphabetic() {
+             return self.hir_from_char(span, c);
+         }
+         let byte = c as u8;
+         let range = hir::ClassBytesRange::new(byte, byte);
+         let mut cls = hir::ClassBytes::new(vec![range]);
+         cls.case_fold_simple();
+         return Ok(Hir::class(hir::Class::Bytes(cls)));
+     }
+
+     // Skip building a class when the character has no simple case
+     // mapping at all.
+     let has_mapping = unicode::contains_simple_case_mapping(c, c)
+         .map_err(|_| self.error(span, ErrorKind::UnicodeCaseUnavailable))?;
+     if !has_mapping {
+         return self.hir_from_char(span, c);
+     }
+     let range = hir::ClassUnicodeRange::new(c, c);
+     let mut cls = hir::ClassUnicode::new(vec![range]);
+     if cls.try_case_fold_simple().is_err() {
+         return Err(self.error(span, ErrorKind::UnicodeCaseUnavailable));
+     }
+     Ok(Hir::class(hir::Class::Unicode(cls)))
+ }
+
+ /// Translate `.` according to the active `u` and `s` flags.
+ ///
+ /// With Unicode mode disabled, `.` is only permitted when invalid UTF-8
+ /// is allowed.
+ fn hir_dot(&self, span: Span) -> Result<Hir> {
+     let not_unicode = !self.flags().unicode();
+     if not_unicode && !self.trans().allow_invalid_utf8 {
+         return Err(self.error(span, ErrorKind::InvalidUtf8));
+     }
+     if self.flags().dot_matches_new_line() {
+         Ok(Hir::any(not_unicode))
+     } else {
+         Ok(Hir::dot(not_unicode))
+     }
+ }
+
+ /// Translate an assertion (anchor or word boundary).
+ ///
+ /// Line anchors become text anchors unless multi-line mode is active, and
+ /// word boundaries are Unicode or ASCII depending on the `u` flag. A
+ /// negated ASCII word boundary is an error unless invalid UTF-8 is
+ /// allowed.
+ fn hir_assertion(&self, asst: &ast::Assertion) -> Result<Hir> {
+     let unicode = self.flags().unicode();
+     let multi_line = self.flags().multi_line();
+     let hir = match asst.kind {
+         ast::AssertionKind::StartLine if multi_line => {
+             Hir::anchor(hir::Anchor::StartLine)
+         }
+         ast::AssertionKind::StartLine | ast::AssertionKind::StartText => {
+             Hir::anchor(hir::Anchor::StartText)
+         }
+         ast::AssertionKind::EndLine if multi_line => {
+             Hir::anchor(hir::Anchor::EndLine)
+         }
+         ast::AssertionKind::EndLine | ast::AssertionKind::EndText => {
+             Hir::anchor(hir::Anchor::EndText)
+         }
+         ast::AssertionKind::WordBoundary if unicode => {
+             Hir::word_boundary(hir::WordBoundary::Unicode)
+         }
+         ast::AssertionKind::WordBoundary => {
+             Hir::word_boundary(hir::WordBoundary::Ascii)
+         }
+         ast::AssertionKind::NotWordBoundary if unicode => {
+             Hir::word_boundary(hir::WordBoundary::UnicodeNegate)
+         }
+         ast::AssertionKind::NotWordBoundary => {
+             // It is possible for negated ASCII word boundaries to match
+             // at invalid UTF-8 boundaries, even when searching valid
+             // UTF-8.
+             if !self.trans().allow_invalid_utf8 {
+                 return Err(self.error(asst.span, ErrorKind::InvalidUtf8));
+             }
+             Hir::word_boundary(hir::WordBoundary::AsciiNegate)
+         }
+     };
+     Ok(hir)
+ }
+
+ /// Wrap `expr` in an HIR group of the same kind as the given AST group.
+ fn hir_group(&self, group: &ast::Group, expr: Hir) -> Hir {
+     let kind = match group.kind {
+         ast::GroupKind::CaptureIndex(index) => {
+             hir::GroupKind::CaptureIndex(index)
+         }
+         ast::GroupKind::CaptureName(ref capname) => {
+             let name = capname.name.clone();
+             hir::GroupKind::CaptureName { name, index: capname.index }
+         }
+         ast::GroupKind::NonCapturing(_) => hir::GroupKind::NonCapturing,
+     };
+     let inner = Box::new(expr);
+     Hir::group(hir::Group { kind, hir: inner })
+ }
+
+ /// Wrap `expr` in an HIR repetition matching the given AST repetition.
+ ///
+ /// The repetition's greediness is inverted when the swap greed flag is
+ /// active.
+ fn hir_repetition(&self, rep: &ast::Repetition, expr: Hir) -> Hir {
+     let kind = match rep.op.kind {
+         ast::RepetitionKind::ZeroOrOne => hir::RepetitionKind::ZeroOrOne,
+         ast::RepetitionKind::ZeroOrMore => hir::RepetitionKind::ZeroOrMore,
+         ast::RepetitionKind::OneOrMore => hir::RepetitionKind::OneOrMore,
+         ast::RepetitionKind::Range(ref ast_range) => {
+             let range = match *ast_range {
+                 ast::RepetitionRange::Exactly(m) => {
+                     hir::RepetitionRange::Exactly(m)
+                 }
+                 ast::RepetitionRange::AtLeast(m) => {
+                     hir::RepetitionRange::AtLeast(m)
+                 }
+                 ast::RepetitionRange::Bounded(m, n) => {
+                     hir::RepetitionRange::Bounded(m, n)
+                 }
+             };
+             hir::RepetitionKind::Range(range)
+         }
+     };
+     // Swapping greed flips the greediness written in the pattern.
+     let greedy = rep.greedy != self.flags().swap_greed();
+     let inner = Box::new(expr);
+     Hir::repetition(hir::Repetition { kind, greedy, hir: inner })
+ }
+
+ /// Translate a Unicode class into a class of Unicode ranges.
+ ///
+ /// Fails when Unicode mode is disabled, when the class lookup fails, or
+ /// when the class is empty after case folding and negation.
+ fn hir_unicode_class(
+     &self,
+     ast_class: &ast::ClassUnicode,
+ ) -> Result<hir::ClassUnicode> {
+     use crate::ast::ClassUnicodeKind::*;
+
+     if !self.flags().unicode() {
+         return Err(
+             self.error(ast_class.span, ErrorKind::UnicodeNotAllowed)
+         );
+     }
+     let query = match ast_class.kind {
+         OneLetter(name) => ClassQuery::OneLetter(name),
+         Named(ref name) => ClassQuery::Binary(name),
+         NamedValue { ref name, ref value, .. } => ClassQuery::ByValue {
+             property_name: name,
+             property_value: value,
+         },
+     };
+     let looked_up = unicode::class(query);
+     let mut class =
+         self.convert_unicode_class_error(&ast_class.span, looked_up)?;
+     self.unicode_fold_and_negate(
+         &ast_class.span,
+         ast_class.negated,
+         &mut class,
+     )?;
+     if class.ranges().is_empty() {
+         return Err(
+             self.error(ast_class.span, ErrorKind::EmptyClassNotAllowed)
+         );
+     }
+     Ok(class)
+ }
+
+ /// Translate an ASCII class into a class of Unicode ranges, applying
+ /// case folding (when active) and negation.
+ fn hir_ascii_unicode_class(
+     &self,
+     ast: &ast::ClassAscii,
+ ) -> Result<hir::ClassUnicode> {
+     let ranges = ascii_class(&ast.kind)
+         .iter()
+         .map(|&(start, end)| hir::ClassUnicodeRange::new(start, end));
+     let mut cls = hir::ClassUnicode::new(ranges);
+     self.unicode_fold_and_negate(&ast.span, ast.negated, &mut cls)?;
+     Ok(cls)
+ }
+
+ /// Translate an ASCII class into a class of byte ranges, applying case
+ /// folding (when active) and negation.
+ fn hir_ascii_byte_class(
+     &self,
+     ast: &ast::ClassAscii,
+ ) -> Result<hir::ClassBytes> {
+     let ranges = ascii_class(&ast.kind).iter().map(|&(start, end)| {
+         hir::ClassBytesRange::new(start as u8, end as u8)
+     });
+     let mut cls = hir::ClassBytes::new(ranges);
+     self.bytes_fold_and_negate(&ast.span, ast.negated, &mut cls)?;
+     Ok(cls)
+ }
+
+ /// Translate a Perl class (`\d`, `\s`, `\w`) into a class of Unicode
+ /// ranges.
+ ///
+ /// Panics if Unicode mode is disabled.
+ fn hir_perl_unicode_class(
+     &self,
+     ast_class: &ast::ClassPerl,
+ ) -> Result<hir::ClassUnicode> {
+     use crate::ast::ClassPerlKind::*;
+
+     assert!(self.flags().unicode());
+     let looked_up = match ast_class.kind {
+         Digit => unicode::perl_digit(),
+         Space => unicode::perl_space(),
+         Word => unicode::perl_word(),
+     };
+     let mut class =
+         self.convert_unicode_class_error(&ast_class.span, looked_up)?;
+     // No case folding is needed: the Perl Unicode classes are already
+     // closed under Unicode simple case folding.
+     if ast_class.negated {
+         class.negate();
+     }
+     Ok(class)
+ }
+
+ /// Translate a Perl class (`\d`, `\s`, `\w`) into a class of byte ranges,
+ /// using the corresponding ASCII class.
+ ///
+ /// Panics if Unicode mode is enabled.
+ fn hir_perl_byte_class(
+     &self,
+     ast_class: &ast::ClassPerl,
+ ) -> hir::ClassBytes {
+     use crate::ast::ClassPerlKind::*;
+
+     assert!(!self.flags().unicode());
+     let ascii_kind = match ast_class.kind {
+         Digit => ast::ClassAsciiKind::Digit,
+         Space => ast::ClassAsciiKind::Space,
+         Word => ast::ClassAsciiKind::Word,
+     };
+     let mut class = hir_ascii_class_bytes(&ascii_kind);
+     // No case folding is needed: the Perl ASCII classes are already
+     // closed under ASCII case folding.
+     if ast_class.negated {
+         class.negate();
+     }
+     class
+ }
+
+ /// Converts the given Unicode specific error to an HIR translation error.
+ ///
+ /// The span given should approximate the position at which an error would
+ /// occur. A successful result is passed through unchanged.
+ fn convert_unicode_class_error(
+     &self,
+     span: &Span,
+     result: unicode::Result<hir::ClassUnicode>,
+ ) -> Result<hir::ClassUnicode> {
+     result.map_err(|err| {
+         let kind = match err {
+             unicode::Error::PropertyNotFound => {
+                 ErrorKind::UnicodePropertyNotFound
+             }
+             unicode::Error::PropertyValueNotFound => {
+                 ErrorKind::UnicodePropertyValueNotFound
+             }
+             unicode::Error::PerlClassNotFound => {
+                 ErrorKind::UnicodePerlClassNotFound
+             }
+         };
+         self.error(span.clone(), kind)
+     })
+ }
+
+ /// Apply case folding (when the `i` flag is active) and then negation
+ /// (when `negated` is true) to the given Unicode class, in place.
+ ///
+ /// Fails if case folding is required but unavailable.
+ fn unicode_fold_and_negate(
+     &self,
+     span: &Span,
+     negated: bool,
+     class: &mut hir::ClassUnicode,
+ ) -> Result<()> {
+     // Case folding must come before negation. Consider `(?i)[^x]`: if
+     // negation were applied first, folding the result would produce a
+     // class matching any Unicode scalar value.
+     if self.flags().case_insensitive() {
+         if class.try_case_fold_simple().is_err() {
+             return Err(
+                 self.error(span.clone(), ErrorKind::UnicodeCaseUnavailable)
+             );
+         }
+     }
+     if negated {
+         class.negate();
+     }
+     Ok(())
+ }
+
+ /// Apply case folding (when the `i` flag is active) and then negation
+ /// (when `negated` is true) to the given byte class, in place.
+ ///
+ /// Fails if the resulting class can match non-ASCII bytes while invalid
+ /// UTF-8 is not allowed.
+ fn bytes_fold_and_negate(
+     &self,
+     span: &Span,
+     negated: bool,
+     class: &mut hir::ClassBytes,
+ ) -> Result<()> {
+     // Case folding must come before negation. Consider `(?i)[^x]`: if
+     // negation were applied first, folding the result would produce a
+     // class that matches everything.
+     if self.flags().case_insensitive() {
+         class.case_fold_simple();
+     }
+     if negated {
+         class.negate();
+     }
+     let ascii_only = class.is_all_ascii();
+     if self.trans().allow_invalid_utf8 || ascii_only {
+         Ok(())
+     } else {
+         Err(self.error(span.clone(), ErrorKind::InvalidUtf8))
+     }
+ }
+
+ /// Return a scalar byte value suitable for use as a literal in a byte
+ /// character class.
+ ///
+ /// Fails if the literal is a non-ASCII character.
+ fn class_literal_byte(&self, ast: &ast::Literal) -> Result<u8> {
+     match self.literal_to_char(ast)? {
+         hir::Literal::Byte(byte) => Ok(byte),
+         hir::Literal::Unicode(ch) if ch.is_ascii() => Ok(ch as u8),
+         // We can't feasibly support Unicode in byte oriented classes.
+         // Byte classes don't do Unicode case folding.
+         hir::Literal::Unicode(_) => {
+             Err(self.error(ast.span, ErrorKind::UnicodeNotAllowed))
+         }
+     }
+ }
+}
+
+/// A translator's representation of a regular expression's flags at any given
+/// moment in time.
+///
+/// Each flag can be in one of three states: absent, present but disabled or
+/// present but enabled.
+///
+/// An absent flag is treated as disabled, except for `unicode`, which is
+/// treated as enabled when absent.
+#[derive(Clone, Copy, Debug, Default)]
+struct Flags {
+ /// The case insensitive flag (`i`).
+ case_insensitive: Option<bool>,
+ /// The multi-line flag (`m`).
+ multi_line: Option<bool>,
+ /// The "dot matches any character" flag (`s`).
+ dot_matches_new_line: Option<bool>,
+ /// The "swap greed" flag (`U`).
+ swap_greed: Option<bool>,
+ /// The Unicode flag (`u`).
+ unicode: Option<bool>,
+ // Note that `ignore_whitespace` is omitted here because it is handled
+ // entirely in the parser.
+}
+
+impl Flags {
+    /// Build a set of flags from the flags written in the AST.
+    ///
+    /// Flags listed before a negation are enabled and flags listed after
+    /// it are disabled. Flags that are not mentioned stay absent.
+    fn from_ast(ast: &ast::Flags) -> Flags {
+        let mut flags = Flags::default();
+        let mut enable = true;
+        for item in &ast.items {
+            let flag = match item.kind {
+                ast::FlagsItemKind::Negation => {
+                    enable = false;
+                    continue;
+                }
+                ast::FlagsItemKind::Flag(ref flag) => flag,
+            };
+            let setting = Some(enable);
+            match *flag {
+                ast::Flag::CaseInsensitive => {
+                    flags.case_insensitive = setting;
+                }
+                ast::Flag::MultiLine => {
+                    flags.multi_line = setting;
+                }
+                ast::Flag::DotMatchesNewLine => {
+                    flags.dot_matches_new_line = setting;
+                }
+                ast::Flag::SwapGreed => {
+                    flags.swap_greed = setting;
+                }
+                ast::Flag::Unicode => {
+                    flags.unicode = setting;
+                }
+                // Not tracked by the translator.
+                ast::Flag::IgnoreWhitespace => {}
+            }
+        }
+        flags
+    }
+
+    /// Fill in every flag that is absent from `self` with its setting from
+    /// `previous`. Flags already present in `self` are left alone.
+    fn merge(&mut self, previous: &Flags) {
+        self.case_insensitive =
+            self.case_insensitive.or(previous.case_insensitive);
+        self.multi_line = self.multi_line.or(previous.multi_line);
+        self.dot_matches_new_line =
+            self.dot_matches_new_line.or(previous.dot_matches_new_line);
+        self.swap_greed = self.swap_greed.or(previous.swap_greed);
+        self.unicode = self.unicode.or(previous.unicode);
+    }
+
+    /// Whether case insensitive matching is enabled (absent means no).
+    fn case_insensitive(&self) -> bool {
+        self.case_insensitive == Some(true)
+    }
+
+    /// Whether multi-line mode is enabled (absent means no).
+    fn multi_line(&self) -> bool {
+        self.multi_line == Some(true)
+    }
+
+    /// Whether `.` matches new lines (absent means no).
+    fn dot_matches_new_line(&self) -> bool {
+        self.dot_matches_new_line == Some(true)
+    }
+
+    /// Whether greediness is swapped (absent means no).
+    fn swap_greed(&self) -> bool {
+        self.swap_greed == Some(true)
+    }
+
+    /// Whether Unicode mode is enabled (absent means yes).
+    fn unicode(&self) -> bool {
+        self.unicode != Some(false)
+    }
+}
+
+/// Build a byte class from the ranges of the given ASCII class.
+fn hir_ascii_class_bytes(kind: &ast::ClassAsciiKind) -> hir::ClassBytes {
+    let mut ranges = Vec::new();
+    for &(start, end) in ascii_class(kind) {
+        ranges.push(hir::ClassBytesRange::new(start as u8, end as u8));
+    }
+    hir::ClassBytes::new(ranges)
+}
+
+/// Returns the inclusive character ranges that make up an ASCII class.
+///
+/// Every range in the table lies within `'\x00'..='\x7F'`.
+fn ascii_class(kind: &ast::ClassAsciiKind) -> &'static [(char, char)] {
+    use crate::ast::ClassAsciiKind as Kind;
+
+    match *kind {
+        Kind::Alnum => &[('0', '9'), ('A', 'Z'), ('a', 'z')],
+        Kind::Alpha => &[('A', 'Z'), ('a', 'z')],
+        Kind::Ascii => &[('\x00', '\x7F')],
+        Kind::Blank => &[('\t', '\t'), (' ', ' ')],
+        Kind::Cntrl => &[('\x00', '\x1F'), ('\x7F', '\x7F')],
+        Kind::Digit => &[('0', '9')],
+        Kind::Graph => &[('!', '~')],
+        Kind::Lower => &[('a', 'z')],
+        Kind::Print => &[(' ', '~')],
+        Kind::Punct => &[('!', '/'), (':', '@'), ('[', '`'), ('{', '~')],
+        Kind::Space => &[
+            ('\t', '\t'),
+            ('\n', '\n'),
+            ('\x0B', '\x0B'),
+            ('\x0C', '\x0C'),
+            ('\r', '\r'),
+            (' ', ' '),
+        ],
+        Kind::Upper => &[('A', 'Z')],
+        Kind::Word => &[('0', '9'), ('A', 'Z'), ('_', '_'), ('a', 'z')],
+        Kind::Xdigit => &[('0', '9'), ('A', 'F'), ('a', 'f')],
+    }
+}
+
+#[cfg(test)]
+mod tests {
+ use crate::ast::parse::ParserBuilder;
+ use crate::ast::{self, Ast, Position, Span};
+ use crate::hir::{self, Hir, HirKind};
+ use crate::unicode::{self, ClassQuery};
+
+ use super::{ascii_class, TranslatorBuilder};
+
+ // We create these errors to compare with real hir::Errors in the tests.
+ // We define equality between TestError and hir::Error to disregard the
+ // pattern string in hir::Error, which is annoying to provide in tests.
+ #[derive(Clone, Debug)]
+ struct TestError {
+ // Where in the pattern the error is expected to be reported.
+ span: Span,
+ // Which kind of translation error is expected.
+ kind: hir::ErrorKind,
+ }
+
+ impl PartialEq<hir::Error> for TestError {
+ fn eq(&self, other: &hir::Error) -> bool {
+ self.span == other.span && self.kind == other.kind
+ }
+ }
+
+    // Mirror image of the comparison above so that `assert_eq!` accepts the
+    // operands in either order.
+    impl PartialEq<TestError> for hir::Error {
+        fn eq(&self, other: &TestError) -> bool {
+            (&self.span, &self.kind) == (&other.span, &other.kind)
+        }
+    }
+
+    // Parses `pattern` into an `Ast` with octal syntax enabled, panicking if
+    // the pattern does not parse.
+    fn parse(pattern: &str) -> Ast {
+        let mut builder = ParserBuilder::new();
+        builder.octal(true);
+        builder.build().parse(pattern).unwrap()
+    }
+
+    // Translates `pattern` with invalid UTF-8 disallowed and returns the
+    // resulting `Hir`, panicking if translation fails.
+    fn t(pattern: &str) -> Hir {
+        let mut builder = TranslatorBuilder::new();
+        builder.allow_invalid_utf8(false);
+        builder.build().translate(pattern, &parse(pattern)).unwrap()
+    }
+
+    // Translates `pattern` with invalid UTF-8 disallowed and returns the
+    // translation error, panicking if translation succeeds.
+    fn t_err(pattern: &str) -> hir::Error {
+        let mut builder = TranslatorBuilder::new();
+        builder.allow_invalid_utf8(false);
+        builder.build().translate(pattern, &parse(pattern)).unwrap_err()
+    }
+
+    // Translates `pattern` with invalid UTF-8 allowed and returns the
+    // resulting `Hir`, panicking if translation fails.
+    fn t_bytes(pattern: &str) -> Hir {
+        let mut builder = TranslatorBuilder::new();
+        builder.allow_invalid_utf8(true);
+        builder.build().translate(pattern, &parse(pattern)).unwrap()
+    }
+
+    // Expected HIR for a literal string: empty for "", otherwise the
+    // concatenation of one Unicode literal per character of `s`.
+    fn hir_lit(s: &str) -> Hir {
+        if s.is_empty() {
+            return Hir::empty();
+        }
+        let mut lits = Vec::new();
+        for ch in s.chars() {
+            lits.push(Hir::literal(hir::Literal::Unicode(ch)));
+        }
+        Hir::concat(lits)
+    }
+
+    // Expected HIR for a literal byte string: empty for b"", a lone byte
+    // literal for one byte, otherwise a concatenation of byte literals.
+    fn hir_blit(s: &[u8]) -> Hir {
+        if s.is_empty() {
+            return Hir::empty();
+        }
+        if s.len() == 1 {
+            return Hir::literal(hir::Literal::Byte(s[0]));
+        }
+        let mut lits = Vec::with_capacity(s.len());
+        for &byte in s {
+            lits.push(Hir::literal(hir::Literal::Byte(byte)));
+        }
+        Hir::concat(lits)
+    }
+
+    // Expected HIR for an unnamed capturing group with capture index `i`.
+    fn hir_group(i: u32, expr: Hir) -> Hir {
+        let kind = hir::GroupKind::CaptureIndex(i);
+        let hir = Box::new(expr);
+        Hir::group(hir::Group { kind, hir })
+    }
+
+    // Expected HIR for a named capturing group with capture index `i`.
+    fn hir_group_name(i: u32, name: &str, expr: Hir) -> Hir {
+        let kind = hir::GroupKind::CaptureName {
+            name: String::from(name),
+            index: i,
+        };
+        let hir = Box::new(expr);
+        Hir::group(hir::Group { kind, hir })
+    }
+
+    // Expected HIR for a non-capturing group around `expr`.
+    fn hir_group_nocap(expr: Hir) -> Hir {
+        let kind = hir::GroupKind::NonCapturing;
+        let hir = Box::new(expr);
+        Hir::group(hir::Group { kind, hir })
+    }
+
+    // Expected HIR for a zero-or-one repetition of `expr`.
+    fn hir_quest(greedy: bool, expr: Hir) -> Hir {
+        let kind = hir::RepetitionKind::ZeroOrOne;
+        let hir = Box::new(expr);
+        Hir::repetition(hir::Repetition { kind, greedy, hir })
+    }
+
+    // Expected HIR for a zero-or-more repetition of `expr`.
+    fn hir_star(greedy: bool, expr: Hir) -> Hir {
+        let kind = hir::RepetitionKind::ZeroOrMore;
+        let hir = Box::new(expr);
+        Hir::repetition(hir::Repetition { kind, greedy, hir })
+    }
+
+    // Expected HIR for a one-or-more repetition of `expr`.
+    fn hir_plus(greedy: bool, expr: Hir) -> Hir {
+        let kind = hir::RepetitionKind::OneOrMore;
+        let hir = Box::new(expr);
+        Hir::repetition(hir::Repetition { kind, greedy, hir })
+    }
+
+    // Expected HIR for a counted repetition of `expr` described by `range`.
+    fn hir_range(greedy: bool, range: hir::RepetitionRange, expr: Hir) -> Hir {
+        let kind = hir::RepetitionKind::Range(range);
+        let hir = Box::new(expr);
+        Hir::repetition(hir::Repetition { kind, greedy, hir })
+    }
+
+ // Shorthand for `Hir::alternation` over the given branches.
+ fn hir_alt(alts: Vec<Hir>) -> Hir {
+ Hir::alternation(alts)
+ }
+
+ // Shorthand for `Hir::concat` over the given sub-expressions.
+ fn hir_cat(exprs: Vec<Hir>) -> Hir {
+ Hir::concat(exprs)
+ }
+
+ // Expected HIR for the Unicode class that `query` resolves to. Panics if
+ // the lookup fails.
+ // NOTE(review): the callers visible in this part of the file are all
+ // behind `cfg(feature = ...)`, which is presumably why dead code is
+ // allowed here.
+ #[allow(dead_code)]
+ fn hir_uclass_query(query: ClassQuery<'_>) -> Hir {
+ Hir::class(hir::Class::Unicode(unicode::class(query).unwrap()))
+ }
+
+ // Expected HIR for the Unicode-aware `\w` class. Panics if
+ // `unicode::perl_word` returns an error.
+ #[allow(dead_code)]
+ fn hir_uclass_perl_word() -> Hir {
+ Hir::class(hir::Class::Unicode(unicode::perl_word().unwrap()))
+ }
+
+    // Expected HIR for a Unicode class made of the given inclusive ranges.
+    fn hir_uclass(ranges: &[(char, char)]) -> Hir {
+        let mut converted = Vec::with_capacity(ranges.len());
+        for &(start, end) in ranges {
+            converted.push(hir::ClassUnicodeRange::new(start, end));
+        }
+        let class = hir::ClassUnicode::new(converted);
+        Hir::class(hir::Class::Unicode(class))
+    }
+
+    // Expected HIR for a byte class made of the given inclusive ranges.
+    fn hir_bclass(ranges: &[(u8, u8)]) -> Hir {
+        let mut converted = Vec::with_capacity(ranges.len());
+        for &(start, end) in ranges {
+            converted.push(hir::ClassBytesRange::new(start, end));
+        }
+        let class = hir::ClassBytes::new(converted);
+        Hir::class(hir::Class::Bytes(class))
+    }
+
+    // Expected HIR for a byte class given as `char` ranges. Panics if any
+    // endpoint is above 0x7F, since such a value is not a single ASCII byte.
+    fn hir_bclass_from_char(ranges: &[(char, char)]) -> Hir {
+        let mut converted = Vec::with_capacity(ranges.len());
+        for &(start, end) in ranges {
+            assert!(start as u32 <= 0x7F);
+            assert!(end as u32 <= 0x7F);
+            converted.push(hir::ClassBytesRange::new(start as u8, end as u8));
+        }
+        let class = hir::ClassBytes::new(converted);
+        Hir::class(hir::Class::Bytes(class))
+    }
+
+    // Applies simple case folding to a class expression. Panics if `expr` is
+    // not a character class.
+    fn hir_case_fold(expr: Hir) -> Hir {
+        if let HirKind::Class(mut cls) = expr.into_kind() {
+            cls.case_fold_simple();
+            return Hir::class(cls);
+        }
+        panic!("cannot case fold non-class Hir expr")
+    }
+
+    // Negates a class expression. Panics if `expr` is not a character class.
+    fn hir_negate(expr: Hir) -> Hir {
+        if let HirKind::Class(mut cls) = expr.into_kind() {
+            cls.negate();
+            return Hir::class(cls);
+        }
+        panic!("cannot negate non-class Hir expr")
+    }
+
+    // Unions two class expressions of the same flavour (both Unicode or both
+    // bytes). Panics for any other combination of operands.
+    #[allow(dead_code)]
+    fn hir_union(expr1: Hir, expr2: Hir) -> Hir {
+        let lhs = expr1.into_kind();
+        let rhs = expr2.into_kind();
+        let merged = match (lhs, rhs) {
+            (
+                HirKind::Class(hir::Class::Unicode(mut acc)),
+                HirKind::Class(hir::Class::Unicode(other)),
+            ) => {
+                acc.union(&other);
+                hir::Class::Unicode(acc)
+            }
+            (
+                HirKind::Class(hir::Class::Bytes(mut acc)),
+                HirKind::Class(hir::Class::Bytes(other)),
+            ) => {
+                acc.union(&other);
+                hir::Class::Bytes(acc)
+            }
+            _ => panic!("cannot union non-class Hir exprs"),
+        };
+        Hir::class(merged)
+    }
+
+    // Subtracts the second class from the first; both must be of the same
+    // flavour (both Unicode or both bytes). Panics for any other combination
+    // of operands.
+    #[allow(dead_code)]
+    fn hir_difference(expr1: Hir, expr2: Hir) -> Hir {
+        let lhs = expr1.into_kind();
+        let rhs = expr2.into_kind();
+        let remaining = match (lhs, rhs) {
+            (
+                HirKind::Class(hir::Class::Unicode(mut acc)),
+                HirKind::Class(hir::Class::Unicode(other)),
+            ) => {
+                acc.difference(&other);
+                hir::Class::Unicode(acc)
+            }
+            (
+                HirKind::Class(hir::Class::Bytes(mut acc)),
+                HirKind::Class(hir::Class::Bytes(other)),
+            ) => {
+                acc.difference(&other);
+                hir::Class::Bytes(acc)
+            }
+            _ => panic!("cannot difference non-class Hir exprs"),
+        };
+        Hir::class(remaining)
+    }
+
+ // Shorthand for `Hir::anchor`.
+ fn hir_anchor(anchor: hir::Anchor) -> Hir {
+ Hir::anchor(anchor)
+ }
+
+ // Shorthand for `Hir::word_boundary`.
+ fn hir_word(wb: hir::WordBoundary) -> Hir {
+ Hir::word_boundary(wb)
+ }
+
+ // Empty patterns, flag-only patterns, empty groups and empty alternation
+ // branches: each empty position is expected to be `Hir::empty()`.
+ #[test]
+ fn empty() {
+ assert_eq!(t(""), Hir::empty());
+ assert_eq!(t("(?i)"), Hir::empty());
+ assert_eq!(t("()"), hir_group(1, Hir::empty()));
+ assert_eq!(t("(?:)"), hir_group_nocap(Hir::empty()));
+ assert_eq!(t("(?P<wat>)"), hir_group_name(1, "wat", Hir::empty()));
+ assert_eq!(t("|"), hir_alt(vec![Hir::empty(), Hir::empty()]));
+ assert_eq!(
+ t("()|()"),
+ hir_alt(vec![
+ hir_group(1, Hir::empty()),
+ hir_group(2, Hir::empty()),
+ ])
+ );
+ assert_eq!(
+ t("(|b)"),
+ hir_group(1, hir_alt(vec![Hir::empty(), hir_lit("b"),]))
+ );
+ assert_eq!(
+ t("(a|)"),
+ hir_group(1, hir_alt(vec![hir_lit("a"), Hir::empty(),]))
+ );
+ assert_eq!(
+ t("(a||c)"),
+ hir_group(
+ 1,
+ hir_alt(vec![hir_lit("a"), Hir::empty(), hir_lit("c"),])
+ )
+ );
+ assert_eq!(
+ t("(||)"),
+ hir_group(
+ 1,
+ hir_alt(vec![Hir::empty(), Hir::empty(), Hir::empty(),])
+ )
+ );
+ }
+
+ // Literal translation in Unicode and byte modes, plus the errors expected
+ // when a non-ASCII literal or a non-UTF-8 byte appears while invalid UTF-8
+ // is disallowed.
+ #[test]
+ fn literal() {
+ assert_eq!(t("a"), hir_lit("a"));
+ assert_eq!(t("(?-u)a"), hir_lit("a"));
+ assert_eq!(t("☃"), hir_lit("☃"));
+ assert_eq!(t("abcd"), hir_lit("abcd"));
+
+ assert_eq!(t_bytes("(?-u)a"), hir_lit("a"));
+ // Non-raw string: Rust itself turns `\x61` into `a` before parsing.
+ assert_eq!(t_bytes("(?-u)\x61"), hir_lit("a"));
+ // Raw string: the regex parser sees the `\x61` escape.
+ assert_eq!(t_bytes(r"(?-u)\x61"), hir_lit("a"));
+ assert_eq!(t_bytes(r"(?-u)\xFF"), hir_blit(b"\xFF"));
+
+ assert_eq!(
+ t_err("(?-u)☃"),
+ TestError {
+ kind: hir::ErrorKind::UnicodeNotAllowed,
+ span: Span::new(
+ Position::new(5, 1, 6),
+ Position::new(8, 1, 7)
+ ),
+ }
+ );
+ assert_eq!(
+ t_err(r"(?-u)\xFF"),
+ TestError {
+ kind: hir::ErrorKind::InvalidUtf8,
+ span: Span::new(
+ Position::new(5, 1, 6),
+ Position::new(9, 1, 10)
+ ),
+ }
+ );
+ }
+
+ // Case-insensitive literals: a letter is expected to become a class of its
+ // case variants (Unicode class by default, byte class under `-u`), while
+ // `@` and the byte `\xFF` are expected to stay plain literals.
+ #[test]
+ fn literal_case_insensitive() {
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(t("(?i)a"), hir_uclass(&[('A', 'A'), ('a', 'a'),]));
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t("(?i:a)"),
+ hir_group_nocap(hir_uclass(&[('A', 'A'), ('a', 'a')],))
+ );
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t("a(?i)a(?-i)a"),
+ hir_cat(vec![
+ hir_lit("a"),
+ hir_uclass(&[('A', 'A'), ('a', 'a')]),
+ hir_lit("a"),
+ ])
+ );
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t("(?i)ab@c"),
+ hir_cat(vec![
+ hir_uclass(&[('A', 'A'), ('a', 'a')]),
+ hir_uclass(&[('B', 'B'), ('b', 'b')]),
+ hir_lit("@"),
+ hir_uclass(&[('C', 'C'), ('c', 'c')]),
+ ])
+ );
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t("(?i)β"),
+ hir_uclass(&[('Β', 'Β'), ('β', 'β'), ('ϐ', 'ϐ'),])
+ );
+
+ assert_eq!(t("(?i-u)a"), hir_bclass(&[(b'A', b'A'), (b'a', b'a'),]));
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t("(?-u)a(?i)a(?-i)a"),
+ hir_cat(vec![
+ hir_lit("a"),
+ hir_bclass(&[(b'A', b'A'), (b'a', b'a')]),
+ hir_lit("a"),
+ ])
+ );
+ assert_eq!(
+ t("(?i-u)ab@c"),
+ hir_cat(vec![
+ hir_bclass(&[(b'A', b'A'), (b'a', b'a')]),
+ hir_bclass(&[(b'B', b'B'), (b'b', b'b')]),
+ hir_lit("@"),
+ hir_bclass(&[(b'C', b'C'), (b'c', b'c')]),
+ ])
+ );
+
+ assert_eq!(
+ t_bytes("(?i-u)a"),
+ hir_bclass(&[(b'A', b'A'), (b'a', b'a'),])
+ );
+ assert_eq!(
+ t_bytes("(?i-u)\x61"),
+ hir_bclass(&[(b'A', b'A'), (b'a', b'a'),])
+ );
+ assert_eq!(
+ t_bytes(r"(?i-u)\x61"),
+ hir_bclass(&[(b'A', b'A'), (b'a', b'a'),])
+ );
+ assert_eq!(t_bytes(r"(?i-u)\xFF"), hir_blit(b"\xFF"));
+
+ assert_eq!(
+ t_err("(?i-u)β"),
+ TestError {
+ kind: hir::ErrorKind::UnicodeNotAllowed,
+ span: Span::new(
+ Position::new(6, 1, 7),
+ Position::new(8, 1, 8),
+ ),
+ }
+ );
+ }
+
+ // `.` is expected to exclude only `\n` unless the `s` flag is set, in both
+ // Unicode and byte modes.
+ #[test]
+ fn dot() {
+ assert_eq!(
+ t("."),
+ hir_uclass(&[('\0', '\t'), ('\x0B', '\u{10FFFF}'),])
+ );
+ assert_eq!(t("(?s)."), hir_uclass(&[('\0', '\u{10FFFF}'),]));
+ assert_eq!(
+ t_bytes("(?-u)."),
+ hir_bclass(&[(b'\0', b'\t'), (b'\x0B', b'\xFF'),])
+ );
+ assert_eq!(t_bytes("(?s-u)."), hir_bclass(&[(b'\0', b'\xFF'),]));
+
+ // If invalid UTF-8 isn't allowed, then non-Unicode `.` isn't allowed.
+ assert_eq!(
+ t_err("(?-u)."),
+ TestError {
+ kind: hir::ErrorKind::InvalidUtf8,
+ span: Span::new(
+ Position::new(5, 1, 6),
+ Position::new(6, 1, 7)
+ ),
+ }
+ );
+ assert_eq!(
+ t_err("(?s-u)."),
+ TestError {
+ kind: hir::ErrorKind::InvalidUtf8,
+ span: Span::new(
+ Position::new(6, 1, 7),
+ Position::new(7, 1, 8)
+ ),
+ }
+ );
+ }
+
+ // Anchors and word boundaries. `(?m)` is expected to turn `^`/`$` into
+ // line anchors while `\A`/`\z` stay text anchors; `(?-u)` selects the
+ // ASCII word boundaries.
+ #[test]
+ fn assertions() {
+ assert_eq!(t("^"), hir_anchor(hir::Anchor::StartText));
+ assert_eq!(t("$"), hir_anchor(hir::Anchor::EndText));
+ assert_eq!(t(r"\A"), hir_anchor(hir::Anchor::StartText));
+ assert_eq!(t(r"\z"), hir_anchor(hir::Anchor::EndText));
+ assert_eq!(t("(?m)^"), hir_anchor(hir::Anchor::StartLine));
+ assert_eq!(t("(?m)$"), hir_anchor(hir::Anchor::EndLine));
+ assert_eq!(t(r"(?m)\A"), hir_anchor(hir::Anchor::StartText));
+ assert_eq!(t(r"(?m)\z"), hir_anchor(hir::Anchor::EndText));
+
+ assert_eq!(t(r"\b"), hir_word(hir::WordBoundary::Unicode));
+ assert_eq!(t(r"\B"), hir_word(hir::WordBoundary::UnicodeNegate));
+ assert_eq!(t(r"(?-u)\b"), hir_word(hir::WordBoundary::Ascii));
+ assert_eq!(
+ t_bytes(r"(?-u)\B"),
+ hir_word(hir::WordBoundary::AsciiNegate)
+ );
+
+ // The negated ASCII boundary is rejected when invalid UTF-8 is
+ // disallowed, although the non-negated one above is accepted.
+ assert_eq!(
+ t_err(r"(?-u)\B"),
+ TestError {
+ kind: hir::ErrorKind::InvalidUtf8,
+ span: Span::new(
+ Position::new(5, 1, 6),
+ Position::new(7, 1, 8)
+ ),
+ }
+ );
+ }
+
+ // Capturing, named and non-capturing groups. Capture indices are expected
+ // to count capturing groups only, in order of appearance, and a group
+ // holding only a flag directive is expected to be empty.
+ #[test]
+ fn group() {
+ assert_eq!(t("(a)"), hir_group(1, hir_lit("a")));
+ assert_eq!(
+ t("(a)(b)"),
+ hir_cat(vec![
+ hir_group(1, hir_lit("a")),
+ hir_group(2, hir_lit("b")),
+ ])
+ );
+ assert_eq!(
+ t("(a)|(b)"),
+ hir_alt(vec![
+ hir_group(1, hir_lit("a")),
+ hir_group(2, hir_lit("b")),
+ ])
+ );
+ assert_eq!(t("(?P<foo>)"), hir_group_name(1, "foo", Hir::empty()));
+ assert_eq!(t("(?P<foo>a)"), hir_group_name(1, "foo", hir_lit("a")));
+ assert_eq!(
+ t("(?P<foo>a)(?P<bar>b)"),
+ hir_cat(vec![
+ hir_group_name(1, "foo", hir_lit("a")),
+ hir_group_name(2, "bar", hir_lit("b")),
+ ])
+ );
+ assert_eq!(t("(?:)"), hir_group_nocap(Hir::empty()));
+ assert_eq!(t("(?:a)"), hir_group_nocap(hir_lit("a")));
+ assert_eq!(
+ t("(?:a)(b)"),
+ hir_cat(vec![
+ hir_group_nocap(hir_lit("a")),
+ hir_group(1, hir_lit("b")),
+ ])
+ );
+ assert_eq!(
+ t("(a)(?:b)(c)"),
+ hir_cat(vec![
+ hir_group(1, hir_lit("a")),
+ hir_group_nocap(hir_lit("b")),
+ hir_group(2, hir_lit("c")),
+ ])
+ );
+ assert_eq!(
+ t("(a)(?P<foo>b)(c)"),
+ hir_cat(vec![
+ hir_group(1, hir_lit("a")),
+ hir_group_name(2, "foo", hir_lit("b")),
+ hir_group(3, hir_lit("c")),
+ ])
+ );
+ assert_eq!(t("()"), hir_group(1, Hir::empty()));
+ assert_eq!(t("((?i))"), hir_group(1, Hir::empty()));
+ assert_eq!(t("((?x))"), hir_group(1, Hir::empty()));
+ assert_eq!(t("(((?x)))"), hir_group(1, hir_group(2, Hir::empty())));
+ }
+
+ // Flag scoping: flags set inside a group are expected to stop applying at
+ // the end of that group, and a later directive overrides an earlier one
+ // for the rest of the enclosing scope.
+ #[test]
+ fn flags() {
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t("(?i:a)a"),
+ hir_cat(vec![
+ hir_group_nocap(hir_uclass(&[('A', 'A'), ('a', 'a')])),
+ hir_lit("a"),
+ ])
+ );
+ assert_eq!(
+ t("(?i-u:a)β"),
+ hir_cat(vec![
+ hir_group_nocap(hir_bclass(&[(b'A', b'A'), (b'a', b'a')])),
+ hir_lit("β"),
+ ])
+ );
+ assert_eq!(
+ t("(?:(?i-u)a)b"),
+ hir_cat(vec![
+ hir_group_nocap(hir_bclass(&[(b'A', b'A'), (b'a', b'a')])),
+ hir_lit("b"),
+ ])
+ );
+ assert_eq!(
+ t("((?i-u)a)b"),
+ hir_cat(vec![
+ hir_group(1, hir_bclass(&[(b'A', b'A'), (b'a', b'a')])),
+ hir_lit("b"),
+ ])
+ );
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t("(?i)(?-i:a)a"),
+ hir_cat(vec![
+ hir_group_nocap(hir_lit("a")),
+ hir_uclass(&[('A', 'A'), ('a', 'a')]),
+ ])
+ );
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t("(?im)a^"),
+ hir_cat(vec![
+ hir_uclass(&[('A', 'A'), ('a', 'a')]),
+ hir_anchor(hir::Anchor::StartLine),
+ ])
+ );
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t("(?im)a^(?i-m)a^"),
+ hir_cat(vec![
+ hir_uclass(&[('A', 'A'), ('a', 'a')]),
+ hir_anchor(hir::Anchor::StartLine),
+ hir_uclass(&[('A', 'A'), ('a', 'a')]),
+ hir_anchor(hir::Anchor::StartText),
+ ])
+ );
+ // Under `U` the meaning of a trailing `?` is inverted: `a*` is lazy
+ // and `a*?` is greedy, until `(?-U)` restores the default.
+ assert_eq!(
+ t("(?U)a*a*?(?-U)a*a*?"),
+ hir_cat(vec![
+ hir_star(false, hir_lit("a")),
+ hir_star(true, hir_lit("a")),
+ hir_star(true, hir_lit("a")),
+ hir_star(false, hir_lit("a")),
+ ])
+ );
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t("(?:a(?i)a)a"),
+ hir_cat(vec![
+ hir_group_nocap(hir_cat(vec![
+ hir_lit("a"),
+ hir_uclass(&[('A', 'A'), ('a', 'a')]),
+ ])),
+ hir_lit("a"),
+ ])
+ );
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t("(?i)(?:a(?-i)a)a"),
+ hir_cat(vec![
+ hir_group_nocap(hir_cat(vec![
+ hir_uclass(&[('A', 'A'), ('a', 'a')]),
+ hir_lit("a"),
+ ])),
+ hir_uclass(&[('A', 'A'), ('a', 'a')]),
+ ])
+ );
+ }
+
+ // Each backslash-escaped meta character is expected to translate to the
+ // literal character itself.
+ #[test]
+ fn escape() {
+ assert_eq!(
+ t(r"\\\.\+\*\?\(\)\|\[\]\{\}\^\$\#"),
+ hir_lit(r"\.+*?()|[]{}^$#")
+ );
+ }
+
+ // Repetition operators: `?`, `*`, `+` and counted forms, each in greedy
+ // form and in lazy form (trailing `?`), plus how a repetition binds within
+ // concatenation, groups and alternation.
+ #[test]
+ fn repetition() {
+ assert_eq!(t("a?"), hir_quest(true, hir_lit("a")));
+ assert_eq!(t("a*"), hir_star(true, hir_lit("a")));
+ assert_eq!(t("a+"), hir_plus(true, hir_lit("a")));
+ assert_eq!(t("a??"), hir_quest(false, hir_lit("a")));
+ assert_eq!(t("a*?"), hir_star(false, hir_lit("a")));
+ assert_eq!(t("a+?"), hir_plus(false, hir_lit("a")));
+
+ assert_eq!(
+ t("a{1}"),
+ hir_range(true, hir::RepetitionRange::Exactly(1), hir_lit("a"),)
+ );
+ assert_eq!(
+ t("a{1,}"),
+ hir_range(true, hir::RepetitionRange::AtLeast(1), hir_lit("a"),)
+ );
+ assert_eq!(
+ t("a{1,2}"),
+ hir_range(true, hir::RepetitionRange::Bounded(1, 2), hir_lit("a"),)
+ );
+ assert_eq!(
+ t("a{1}?"),
+ hir_range(false, hir::RepetitionRange::Exactly(1), hir_lit("a"),)
+ );
+ assert_eq!(
+ t("a{1,}?"),
+ hir_range(false, hir::RepetitionRange::AtLeast(1), hir_lit("a"),)
+ );
+ assert_eq!(
+ t("a{1,2}?"),
+ hir_range(
+ false,
+ hir::RepetitionRange::Bounded(1, 2),
+ hir_lit("a"),
+ )
+ );
+
+ // The operator applies to the immediately preceding atom only.
+ assert_eq!(
+ t("ab?"),
+ hir_cat(vec![hir_lit("a"), hir_quest(true, hir_lit("b")),])
+ );
+ assert_eq!(
+ t("(ab)?"),
+ hir_quest(
+ true,
+ hir_group(1, hir_cat(vec![hir_lit("a"), hir_lit("b"),]))
+ )
+ );
+ assert_eq!(
+ t("a|b?"),
+ hir_alt(vec![hir_lit("a"), hir_quest(true, hir_lit("b")),])
+ );
+ }
+
+ // Concatenation and alternation, alone and nested inside capturing
+ // groups.
+ #[test]
+ fn cat_alt() {
+ assert_eq!(
+ t("(ab)"),
+ hir_group(1, hir_cat(vec![hir_lit("a"), hir_lit("b"),]))
+ );
+ assert_eq!(t("a|b"), hir_alt(vec![hir_lit("a"), hir_lit("b"),]));
+ assert_eq!(
+ t("a|b|c"),
+ hir_alt(vec![hir_lit("a"), hir_lit("b"), hir_lit("c"),])
+ );
+ assert_eq!(
+ t("ab|bc|cd"),
+ hir_alt(vec![hir_lit("ab"), hir_lit("bc"), hir_lit("cd"),])
+ );
+ assert_eq!(
+ t("(a|b)"),
+ hir_group(1, hir_alt(vec![hir_lit("a"), hir_lit("b"),]))
+ );
+ assert_eq!(
+ t("(a|b|c)"),
+ hir_group(
+ 1,
+ hir_alt(vec![hir_lit("a"), hir_lit("b"), hir_lit("c"),])
+ )
+ );
+ assert_eq!(
+ t("(ab|bc|cd)"),
+ hir_group(
+ 1,
+ hir_alt(vec![hir_lit("ab"), hir_lit("bc"), hir_lit("cd"),])
+ )
+ );
+ assert_eq!(
+ t("(ab|(bc|(cd)))"),
+ hir_group(
+ 1,
+ hir_alt(vec![
+ hir_lit("ab"),
+ hir_group(
+ 2,
+ hir_alt(vec![
+ hir_lit("bc"),
+ hir_group(3, hir_lit("cd")),
+ ])
+ ),
+ ])
+ )
+ );
+ }
+
+ // POSIX-style ASCII classes such as `[[:alnum:]]`. Each is expected to
+ // match the `ascii_class` table: as a Unicode class by default and as a
+ // byte class under `(?-u)`.
+ #[test]
+ fn class_ascii() {
+ assert_eq!(
+ t("[[:alnum:]]"),
+ hir_uclass(ascii_class(&ast::ClassAsciiKind::Alnum))
+ );
+ assert_eq!(
+ t("[[:alpha:]]"),
+ hir_uclass(ascii_class(&ast::ClassAsciiKind::Alpha))
+ );
+ assert_eq!(
+ t("[[:ascii:]]"),
+ hir_uclass(ascii_class(&ast::ClassAsciiKind::Ascii))
+ );
+ assert_eq!(
+ t("[[:blank:]]"),
+ hir_uclass(ascii_class(&ast::ClassAsciiKind::Blank))
+ );
+ assert_eq!(
+ t("[[:cntrl:]]"),
+ hir_uclass(ascii_class(&ast::ClassAsciiKind::Cntrl))
+ );
+ assert_eq!(
+ t("[[:digit:]]"),
+ hir_uclass(ascii_class(&ast::ClassAsciiKind::Digit))
+ );
+ assert_eq!(
+ t("[[:graph:]]"),
+ hir_uclass(ascii_class(&ast::ClassAsciiKind::Graph))
+ );
+ assert_eq!(
+ t("[[:lower:]]"),
+ hir_uclass(ascii_class(&ast::ClassAsciiKind::Lower))
+ );
+ assert_eq!(
+ t("[[:print:]]"),
+ hir_uclass(ascii_class(&ast::ClassAsciiKind::Print))
+ );
+ assert_eq!(
+ t("[[:punct:]]"),
+ hir_uclass(ascii_class(&ast::ClassAsciiKind::Punct))
+ );
+ assert_eq!(
+ t("[[:space:]]"),
+ hir_uclass(ascii_class(&ast::ClassAsciiKind::Space))
+ );
+ assert_eq!(
+ t("[[:upper:]]"),
+ hir_uclass(ascii_class(&ast::ClassAsciiKind::Upper))
+ );
+ assert_eq!(
+ t("[[:word:]]"),
+ hir_uclass(ascii_class(&ast::ClassAsciiKind::Word))
+ );
+ assert_eq!(
+ t("[[:xdigit:]]"),
+ hir_uclass(ascii_class(&ast::ClassAsciiKind::Xdigit))
+ );
+
+ assert_eq!(
+ t("[[:^lower:]]"),
+ hir_negate(hir_uclass(ascii_class(&ast::ClassAsciiKind::Lower)))
+ );
+ // Unicode case folding of a-z also pulls in U+017F and U+212A.
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t("(?i)[[:lower:]]"),
+ hir_uclass(&[
+ ('A', 'Z'),
+ ('a', 'z'),
+ ('\u{17F}', '\u{17F}'),
+ ('\u{212A}', '\u{212A}'),
+ ])
+ );
+
+ assert_eq!(
+ t("(?-u)[[:lower:]]"),
+ hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Lower))
+ );
+ assert_eq!(
+ t("(?i-u)[[:lower:]]"),
+ hir_case_fold(hir_bclass_from_char(ascii_class(
+ &ast::ClassAsciiKind::Lower
+ )))
+ );
+
+ // A negated ASCII class in byte mode is rejected when invalid UTF-8
+ // is disallowed.
+ assert_eq!(
+ t_err("(?-u)[[:^lower:]]"),
+ TestError {
+ kind: hir::ErrorKind::InvalidUtf8,
+ span: Span::new(
+ Position::new(6, 1, 7),
+ Position::new(16, 1, 17)
+ ),
+ }
+ );
+ assert_eq!(
+ t_err("(?i-u)[[:^lower:]]"),
+ TestError {
+ kind: hir::ErrorKind::InvalidUtf8,
+ span: Span::new(
+ Position::new(7, 1, 8),
+ Position::new(17, 1, 18)
+ ),
+ }
+ );
+ }
+
+ // A bracketed class holding one plain and one negated ASCII class is
+ // expected to be the union of the two, in Unicode and in byte mode.
+ #[test]
+ fn class_ascii_multiple() {
+ // See: https://github.com/rust-lang/regex/issues/680
+ assert_eq!(
+ t("[[:alnum:][:^ascii:]]"),
+ hir_union(
+ hir_uclass(ascii_class(&ast::ClassAsciiKind::Alnum)),
+ hir_uclass(&[('\u{80}', '\u{10FFFF}')]),
+ ),
+ );
+ assert_eq!(
+ t_bytes("(?-u)[[:alnum:][:^ascii:]]"),
+ hir_union(
+ hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Alnum)),
+ hir_bclass(&[(0x80, 0xFF)]),
+ ),
+ );
+ }
+
+ // Perl classes `\d`, `\s`, `\w` and their negations, in Unicode mode and
+ // in ASCII-only (`-u`) mode. The expected class is the same with and
+ // without the `i` flag in every case below.
+ #[test]
+ #[cfg(feature = "unicode-perl")]
+ fn class_perl() {
+ // Unicode
+ assert_eq!(t(r"\d"), hir_uclass_query(ClassQuery::Binary("digit")));
+ assert_eq!(t(r"\s"), hir_uclass_query(ClassQuery::Binary("space")));
+ assert_eq!(t(r"\w"), hir_uclass_perl_word());
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t(r"(?i)\d"),
+ hir_uclass_query(ClassQuery::Binary("digit"))
+ );
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t(r"(?i)\s"),
+ hir_uclass_query(ClassQuery::Binary("space"))
+ );
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(t(r"(?i)\w"), hir_uclass_perl_word());
+
+ // Unicode, negated
+ assert_eq!(
+ t(r"\D"),
+ hir_negate(hir_uclass_query(ClassQuery::Binary("digit")))
+ );
+ assert_eq!(
+ t(r"\S"),
+ hir_negate(hir_uclass_query(ClassQuery::Binary("space")))
+ );
+ assert_eq!(t(r"\W"), hir_negate(hir_uclass_perl_word()));
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t(r"(?i)\D"),
+ hir_negate(hir_uclass_query(ClassQuery::Binary("digit")))
+ );
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t(r"(?i)\S"),
+ hir_negate(hir_uclass_query(ClassQuery::Binary("space")))
+ );
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(t(r"(?i)\W"), hir_negate(hir_uclass_perl_word()));
+
+ // ASCII only
+ assert_eq!(
+ t(r"(?-u)\d"),
+ hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Digit))
+ );
+ assert_eq!(
+ t(r"(?-u)\s"),
+ hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Space))
+ );
+ assert_eq!(
+ t(r"(?-u)\w"),
+ hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Word))
+ );
+ assert_eq!(
+ t(r"(?i-u)\d"),
+ hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Digit))
+ );
+ assert_eq!(
+ t(r"(?i-u)\s"),
+ hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Space))
+ );
+ assert_eq!(
+ t(r"(?i-u)\w"),
+ hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Word))
+ );
+
+ // ASCII only, negated
+ assert_eq!(
+ t(r"(?-u)\D"),
+ hir_negate(hir_bclass_from_char(ascii_class(
+ &ast::ClassAsciiKind::Digit
+ )))
+ );
+ assert_eq!(
+ t(r"(?-u)\S"),
+ hir_negate(hir_bclass_from_char(ascii_class(
+ &ast::ClassAsciiKind::Space
+ )))
+ );
+ assert_eq!(
+ t(r"(?-u)\W"),
+ hir_negate(hir_bclass_from_char(ascii_class(
+ &ast::ClassAsciiKind::Word
+ )))
+ );
+ assert_eq!(
+ t(r"(?i-u)\D"),
+ hir_negate(hir_bclass_from_char(ascii_class(
+ &ast::ClassAsciiKind::Digit
+ )))
+ );
+ assert_eq!(
+ t(r"(?i-u)\S"),
+ hir_negate(hir_bclass_from_char(ascii_class(
+ &ast::ClassAsciiKind::Space
+ )))
+ );
+ assert_eq!(
+ t(r"(?i-u)\W"),
+ hir_negate(hir_bclass_from_char(ascii_class(
+ &ast::ClassAsciiKind::Word
+ )))
+ );
+ }
+
+ // Without the `unicode-perl` feature, Unicode `\w` is expected to fail
+ // with `UnicodePerlClassNotFound` spanning the whole escape.
+ #[test]
+ #[cfg(not(feature = "unicode-perl"))]
+ fn class_perl_word_disabled() {
+ assert_eq!(
+ t_err(r"\w"),
+ TestError {
+ kind: hir::ErrorKind::UnicodePerlClassNotFound,
+ span: Span::new(
+ Position::new(0, 1, 1),
+ Position::new(2, 1, 3)
+ ),
+ }
+ );
+ }
+
+ // With neither `unicode-perl` nor `unicode-bool` enabled, Unicode `\s` is
+ // expected to fail with `UnicodePerlClassNotFound`.
+ #[test]
+ #[cfg(all(not(feature = "unicode-perl"), not(feature = "unicode-bool")))]
+ fn class_perl_space_disabled() {
+ assert_eq!(
+ t_err(r"\s"),
+ TestError {
+ kind: hir::ErrorKind::UnicodePerlClassNotFound,
+ span: Span::new(
+ Position::new(0, 1, 1),
+ Position::new(2, 1, 3)
+ ),
+ }
+ );
+ }
+
+ // With neither `unicode-perl` nor `unicode-gencat` enabled, Unicode `\d`
+ // is expected to fail with `UnicodePerlClassNotFound`.
+ #[test]
+ #[cfg(all(
+ not(feature = "unicode-perl"),
+ not(feature = "unicode-gencat")
+ ))]
+ fn class_perl_digit_disabled() {
+ assert_eq!(
+ t_err(r"\d"),
+ TestError {
+ kind: hir::ErrorKind::UnicodePerlClassNotFound,
+ span: Span::new(
+ Position::new(0, 1, 1),
+ Position::new(2, 1, 3)
+ ),
+ }
+ );
+ }
+
+ // General category classes via `\p`/`\P`. The different spellings of the
+ // same category (short name, long name, mixed case with spaces, `gc:` and
+ // `gc=` prefixes) are all expected to resolve to the same class.
+ #[test]
+ #[cfg(feature = "unicode-gencat")]
+ fn class_unicode_gencat() {
+ assert_eq!(t(r"\pZ"), hir_uclass_query(ClassQuery::Binary("Z")));
+ assert_eq!(t(r"\pz"), hir_uclass_query(ClassQuery::Binary("Z")));
+ assert_eq!(
+ t(r"\p{Separator}"),
+ hir_uclass_query(ClassQuery::Binary("Z"))
+ );
+ assert_eq!(
+ t(r"\p{se PaRa ToR}"),
+ hir_uclass_query(ClassQuery::Binary("Z"))
+ );
+ assert_eq!(
+ t(r"\p{gc:Separator}"),
+ hir_uclass_query(ClassQuery::Binary("Z"))
+ );
+ assert_eq!(
+ t(r"\p{gc=Separator}"),
+ hir_uclass_query(ClassQuery::Binary("Z"))
+ );
+ assert_eq!(
+ t(r"\p{Other}"),
+ hir_uclass_query(ClassQuery::Binary("Other"))
+ );
+ assert_eq!(t(r"\pC"), hir_uclass_query(ClassQuery::Binary("Other")));
+
+ assert_eq!(
+ t(r"\PZ"),
+ hir_negate(hir_uclass_query(ClassQuery::Binary("Z")))
+ );
+ assert_eq!(
+ t(r"\P{separator}"),
+ hir_negate(hir_uclass_query(ClassQuery::Binary("Z")))
+ );
+ assert_eq!(
+ t(r"\P{gc!=separator}"),
+ hir_negate(hir_uclass_query(ClassQuery::Binary("Z")))
+ );
+
+ assert_eq!(t(r"\p{any}"), hir_uclass_query(ClassQuery::Binary("Any")));
+ assert_eq!(
+ t(r"\p{assigned}"),
+ hir_uclass_query(ClassQuery::Binary("Assigned"))
+ );
+ assert_eq!(
+ t(r"\p{ascii}"),
+ hir_uclass_query(ClassQuery::Binary("ASCII"))
+ );
+ assert_eq!(
+ t(r"\p{gc:any}"),
+ hir_uclass_query(ClassQuery::Binary("Any"))
+ );
+ assert_eq!(
+ t(r"\p{gc:assigned}"),
+ hir_uclass_query(ClassQuery::Binary("Assigned"))
+ );
+ assert_eq!(
+ t(r"\p{gc:ascii}"),
+ hir_uclass_query(ClassQuery::Binary("ASCII"))
+ );
+
+ // Unicode classes are rejected outright when Unicode mode is off.
+ assert_eq!(
+ t_err(r"(?-u)\pZ"),
+ TestError {
+ kind: hir::ErrorKind::UnicodeNotAllowed,
+ span: Span::new(
+ Position::new(5, 1, 6),
+ Position::new(8, 1, 9)
+ ),
+ }
+ );
+ assert_eq!(
+ t_err(r"(?-u)\p{Separator}"),
+ TestError {
+ kind: hir::ErrorKind::UnicodeNotAllowed,
+ span: Span::new(
+ Position::new(5, 1, 6),
+ Position::new(18, 1, 19)
+ ),
+ }
+ );
+ // An unknown bare name is a missing property; an unknown value after
+ // `gc:` is a missing property value.
+ assert_eq!(
+ t_err(r"\pE"),
+ TestError {
+ kind: hir::ErrorKind::UnicodePropertyNotFound,
+ span: Span::new(
+ Position::new(0, 1, 1),
+ Position::new(3, 1, 4)
+ ),
+ }
+ );
+ assert_eq!(
+ t_err(r"\p{Foo}"),
+ TestError {
+ kind: hir::ErrorKind::UnicodePropertyNotFound,
+ span: Span::new(
+ Position::new(0, 1, 1),
+ Position::new(7, 1, 8)
+ ),
+ }
+ );
+ assert_eq!(
+ t_err(r"\p{gc:Foo}"),
+ TestError {
+ kind: hir::ErrorKind::UnicodePropertyValueNotFound,
+ span: Span::new(
+ Position::new(0, 1, 1),
+ Position::new(10, 1, 11)
+ ),
+ }
+ );
+ }
+
+ // Without the `unicode-gencat` feature, general category names (including
+ // `Any`) are expected to be reported as unknown properties.
+ #[test]
+ #[cfg(not(feature = "unicode-gencat"))]
+ fn class_unicode_gencat_disabled() {
+ assert_eq!(
+ t_err(r"\p{Separator}"),
+ TestError {
+ kind: hir::ErrorKind::UnicodePropertyNotFound,
+ span: Span::new(
+ Position::new(0, 1, 1),
+ Position::new(13, 1, 14)
+ ),
+ }
+ );
+
+ assert_eq!(
+ t_err(r"\p{Any}"),
+ TestError {
+ kind: hir::ErrorKind::UnicodePropertyNotFound,
+ span: Span::new(
+ Position::new(0, 1, 1),
+ Position::new(7, 1, 8)
+ ),
+ }
+ );
+ }
+
+ // Script classes via `\p{...}`, including case folding under `(?i)` and
+ // the error expected for an unknown `sc:`/`scx:` value.
+ #[test]
+ #[cfg(feature = "unicode-script")]
+ fn class_unicode_script() {
+ assert_eq!(
+ t(r"\p{Greek}"),
+ hir_uclass_query(ClassQuery::Binary("Greek"))
+ );
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t(r"(?i)\p{Greek}"),
+ hir_case_fold(hir_uclass_query(ClassQuery::Binary("Greek")))
+ );
+ // Folding is applied before negation in the expected value.
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t(r"(?i)\P{Greek}"),
+ hir_negate(hir_case_fold(hir_uclass_query(ClassQuery::Binary(
+ "Greek"
+ ))))
+ );
+
+ assert_eq!(
+ t_err(r"\p{sc:Foo}"),
+ TestError {
+ kind: hir::ErrorKind::UnicodePropertyValueNotFound,
+ span: Span::new(
+ Position::new(0, 1, 1),
+ Position::new(10, 1, 11)
+ ),
+ }
+ );
+ assert_eq!(
+ t_err(r"\p{scx:Foo}"),
+ TestError {
+ kind: hir::ErrorKind::UnicodePropertyValueNotFound,
+ span: Span::new(
+ Position::new(0, 1, 1),
+ Position::new(11, 1, 12)
+ ),
+ }
+ );
+ }
+
+ // Without the `unicode-script` feature, script names and `scx:` queries
+ // are expected to be reported as unknown properties.
+ #[test]
+ #[cfg(not(feature = "unicode-script"))]
+ fn class_unicode_script_disabled() {
+ assert_eq!(
+ t_err(r"\p{Greek}"),
+ TestError {
+ kind: hir::ErrorKind::UnicodePropertyNotFound,
+ span: Span::new(
+ Position::new(0, 1, 1),
+ Position::new(9, 1, 10)
+ ),
+ }
+ );
+
+ assert_eq!(
+ t_err(r"\p{scx:Greek}"),
+ TestError {
+ kind: hir::ErrorKind::UnicodePropertyNotFound,
+ span: Span::new(
+ Position::new(0, 1, 1),
+ Position::new(13, 1, 14)
+ ),
+ }
+ );
+ }
+
+ // With the `unicode-age` feature, an unknown age value is expected to be
+ // reported as a missing property value.
+ #[test]
+ #[cfg(feature = "unicode-age")]
+ fn class_unicode_age() {
+ assert_eq!(
+ t_err(r"\p{age:Foo}"),
+ TestError {
+ kind: hir::ErrorKind::UnicodePropertyValueNotFound,
+ span: Span::new(
+ Position::new(0, 1, 1),
+ Position::new(11, 1, 12)
+ ),
+ }
+ );
+ }
+
+ // The negation of `any` is expected to be rejected with
+ // `EmptyClassNotAllowed`.
+ #[test]
+ #[cfg(feature = "unicode-gencat")]
+ fn class_unicode_any_empty() {
+ assert_eq!(
+ t_err(r"\P{any}"),
+ TestError {
+ kind: hir::ErrorKind::EmptyClassNotAllowed,
+ span: Span::new(
+ Position::new(0, 1, 1),
+ Position::new(7, 1, 8)
+ ),
+ }
+ );
+ }
+
+ // Without the `unicode-age` feature, an `age:` query is expected to be
+ // reported as an unknown property.
+ #[test]
+ #[cfg(not(feature = "unicode-age"))]
+ fn class_unicode_age_disabled() {
+ assert_eq!(
+ t_err(r"\p{age:3.0}"),
+ TestError {
+ kind: hir::ErrorKind::UnicodePropertyNotFound,
+ span: Span::new(
+ Position::new(0, 1, 1),
+ Position::new(11, 1, 12)
+ ),
+ }
+ );
+ }
+
+ // Bracketed classes: single items, ranges (overlapping and adjacent
+ // ranges are expected to be merged), nested Perl and Unicode classes,
+ // negation, case folding, byte mode, and the characters `[`, `&`, `~`
+ // and `-` used as ordinary class members.
+ #[test]
+ fn class_bracketed() {
+ assert_eq!(t("[a]"), hir_uclass(&[('a', 'a')]));
+ assert_eq!(t("[^[a]]"), hir_negate(hir_uclass(&[('a', 'a')])));
+ assert_eq!(t("[a-z]"), hir_uclass(&[('a', 'z')]));
+ assert_eq!(t("[a-fd-h]"), hir_uclass(&[('a', 'h')]));
+ assert_eq!(t("[a-fg-m]"), hir_uclass(&[('a', 'm')]));
+ assert_eq!(t(r"[\x00]"), hir_uclass(&[('\0', '\0')]));
+ assert_eq!(t(r"[\n]"), hir_uclass(&[('\n', '\n')]));
+ assert_eq!(t("[\n]"), hir_uclass(&[('\n', '\n')]));
+ #[cfg(any(feature = "unicode-perl", feature = "unicode-gencat"))]
+ assert_eq!(t(r"[\d]"), hir_uclass_query(ClassQuery::Binary("digit")));
+ #[cfg(feature = "unicode-gencat")]
+ assert_eq!(
+ t(r"[\pZ]"),
+ hir_uclass_query(ClassQuery::Binary("separator"))
+ );
+ #[cfg(feature = "unicode-gencat")]
+ assert_eq!(
+ t(r"[\p{separator}]"),
+ hir_uclass_query(ClassQuery::Binary("separator"))
+ );
+ // Double negation: `[^\D]` is expected to equal `\d`, and likewise
+ // for the property classes below.
+ #[cfg(any(feature = "unicode-perl", feature = "unicode-gencat"))]
+ assert_eq!(t(r"[^\D]"), hir_uclass_query(ClassQuery::Binary("digit")));
+ #[cfg(feature = "unicode-gencat")]
+ assert_eq!(
+ t(r"[^\PZ]"),
+ hir_uclass_query(ClassQuery::Binary("separator"))
+ );
+ #[cfg(feature = "unicode-gencat")]
+ assert_eq!(
+ t(r"[^\P{separator}]"),
+ hir_uclass_query(ClassQuery::Binary("separator"))
+ );
+ #[cfg(all(
+ feature = "unicode-case",
+ any(feature = "unicode-perl", feature = "unicode-gencat")
+ ))]
+ assert_eq!(
+ t(r"(?i)[^\D]"),
+ hir_uclass_query(ClassQuery::Binary("digit"))
+ );
+ #[cfg(all(feature = "unicode-case", feature = "unicode-script"))]
+ assert_eq!(
+ t(r"(?i)[^\P{greek}]"),
+ hir_case_fold(hir_uclass_query(ClassQuery::Binary("greek")))
+ );
+
+ assert_eq!(t("(?-u)[a]"), hir_bclass(&[(b'a', b'a')]));
+ assert_eq!(t(r"(?-u)[\x00]"), hir_bclass(&[(b'\0', b'\0')]));
+ assert_eq!(t_bytes(r"(?-u)[\xFF]"), hir_bclass(&[(b'\xFF', b'\xFF')]));
+
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(t("(?i)[a]"), hir_uclass(&[('A', 'A'), ('a', 'a')]));
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t("(?i)[k]"),
+ hir_uclass(&[('K', 'K'), ('k', 'k'), ('\u{212A}', '\u{212A}'),])
+ );
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t("(?i)[β]"),
+ hir_uclass(&[('Β', 'Β'), ('β', 'β'), ('ϐ', 'ϐ'),])
+ );
+ assert_eq!(t("(?i-u)[k]"), hir_bclass(&[(b'K', b'K'), (b'k', b'k'),]));
+
+ assert_eq!(t("[^a]"), hir_negate(hir_uclass(&[('a', 'a')])));
+ assert_eq!(t(r"[^\x00]"), hir_negate(hir_uclass(&[('\0', '\0')])));
+ assert_eq!(
+ t_bytes("(?-u)[^a]"),
+ hir_negate(hir_bclass(&[(b'a', b'a')]))
+ );
+ #[cfg(any(feature = "unicode-perl", feature = "unicode-gencat"))]
+ assert_eq!(
+ t(r"[^\d]"),
+ hir_negate(hir_uclass_query(ClassQuery::Binary("digit")))
+ );
+ #[cfg(feature = "unicode-gencat")]
+ assert_eq!(
+ t(r"[^\pZ]"),
+ hir_negate(hir_uclass_query(ClassQuery::Binary("separator")))
+ );
+ #[cfg(feature = "unicode-gencat")]
+ assert_eq!(
+ t(r"[^\p{separator}]"),
+ hir_negate(hir_uclass_query(ClassQuery::Binary("separator")))
+ );
+ #[cfg(all(feature = "unicode-case", feature = "unicode-script"))]
+ assert_eq!(
+ t(r"(?i)[^\p{greek}]"),
+ hir_negate(hir_case_fold(hir_uclass_query(ClassQuery::Binary(
+ "greek"
+ ))))
+ );
+ #[cfg(all(feature = "unicode-case", feature = "unicode-script"))]
+ assert_eq!(
+ t(r"(?i)[\P{greek}]"),
+ hir_negate(hir_case_fold(hir_uclass_query(ClassQuery::Binary(
+ "greek"
+ ))))
+ );
+
+ // Test some weird cases.
+ assert_eq!(t(r"[\[]"), hir_uclass(&[('[', '[')]));
+
+ assert_eq!(t(r"[&]"), hir_uclass(&[('&', '&')]));
+ assert_eq!(t(r"[\&]"), hir_uclass(&[('&', '&')]));
+ assert_eq!(t(r"[\&\&]"), hir_uclass(&[('&', '&')]));
+ assert_eq!(t(r"[\x00-&]"), hir_uclass(&[('\0', '&')]));
+ assert_eq!(t(r"[&-\xFF]"), hir_uclass(&[('&', '\u{FF}')]));
+
+ assert_eq!(t(r"[~]"), hir_uclass(&[('~', '~')]));
+ assert_eq!(t(r"[\~]"), hir_uclass(&[('~', '~')]));
+ assert_eq!(t(r"[\~\~]"), hir_uclass(&[('~', '~')]));
+ assert_eq!(t(r"[\x00-~]"), hir_uclass(&[('\0', '~')]));
+ assert_eq!(t(r"[~-\xFF]"), hir_uclass(&[('~', '\u{FF}')]));
+
+ assert_eq!(t(r"[-]"), hir_uclass(&[('-', '-')]));
+ assert_eq!(t(r"[\-]"), hir_uclass(&[('-', '-')]));
+ assert_eq!(t(r"[\-\-]"), hir_uclass(&[('-', '-')]));
+ assert_eq!(t(r"[\x00-\-]"), hir_uclass(&[('\0', '-')]));
+ assert_eq!(t(r"[\--\xFF]"), hir_uclass(&[('-', '\u{FF}')]));
+
+ assert_eq!(
+ t_err("(?-u)[^a]"),
+ TestError {
+ kind: hir::ErrorKind::InvalidUtf8,
+ span: Span::new(
+ Position::new(5, 1, 6),
+ Position::new(9, 1, 10)
+ ),
+ }
+ );
+ // `[^\s\S]` negates a class that matches everything, leaving nothing.
+ #[cfg(any(feature = "unicode-perl", feature = "unicode-bool"))]
+ assert_eq!(
+ t_err(r"[^\s\S]"),
+ TestError {
+ kind: hir::ErrorKind::EmptyClassNotAllowed,
+ span: Span::new(
+ Position::new(0, 1, 1),
+ Position::new(7, 1, 8)
+ ),
+ }
+ );
+ #[cfg(any(feature = "unicode-perl", feature = "unicode-bool"))]
+ assert_eq!(
+ t_err(r"(?-u)[^\s\S]"),
+ TestError {
+ kind: hir::ErrorKind::EmptyClassNotAllowed,
+ span: Span::new(
+ Position::new(5, 1, 6),
+ Position::new(12, 1, 13)
+ ),
+ }
+ );
+ }
+
+ #[test]
+ fn class_bracketed_union() {
+ // Checks the HIR produced for bracketed classes that list several
+ // items side by side. Expected values are built with the `hir_union`
+ // helper over the individual member classes.
+
+ // Two literal ranges: the expected class lists them in sorted order.
+ assert_eq!(t("[a-zA-Z]"), hir_uclass(&[('A', 'Z'), ('a', 'z')]));
+ // Literals on either side of `\pZ` end up in one `a-b` range.
+ #[cfg(feature = "unicode-gencat")]
+ assert_eq!(
+ t(r"[a\pZb]"),
+ hir_union(
+ hir_uclass(&[('a', 'b')]),
+ hir_uclass_query(ClassQuery::Binary("separator"))
+ )
+ );
+ // Two Unicode property classes in a single bracketed class.
+ #[cfg(all(feature = "unicode-gencat", feature = "unicode-script"))]
+ assert_eq!(
+ t(r"[\pZ\p{Greek}]"),
+ hir_union(
+ hir_uclass_query(ClassQuery::Binary("greek")),
+ hir_uclass_query(ClassQuery::Binary("separator"))
+ )
+ );
+ // Three property classes, including a `name:value` style query.
+ #[cfg(all(
+ feature = "unicode-age",
+ feature = "unicode-gencat",
+ feature = "unicode-script"
+ ))]
+ assert_eq!(
+ t(r"[\p{age:3.0}\pZ\p{Greek}]"),
+ hir_union(
+ hir_uclass_query(ClassQuery::ByValue {
+ property_name: "age",
+ property_value: "3.0",
+ }),
+ hir_union(
+ hir_uclass_query(ClassQuery::Binary("greek")),
+ hir_uclass_query(ClassQuery::Binary("separator"))
+ )
+ )
+ );
+ // Nested bracketed classes: the expected value is the union of all
+ // four member classes.
+ #[cfg(all(
+ feature = "unicode-age",
+ feature = "unicode-gencat",
+ feature = "unicode-script"
+ ))]
+ assert_eq!(
+ t(r"[[[\p{age:3.0}\pZ]\p{Greek}][\p{Cyrillic}]]"),
+ hir_union(
+ hir_uclass_query(ClassQuery::ByValue {
+ property_name: "age",
+ property_value: "3.0",
+ }),
+ hir_union(
+ hir_uclass_query(ClassQuery::Binary("cyrillic")),
+ hir_union(
+ hir_uclass_query(ClassQuery::Binary("greek")),
+ hir_uclass_query(ClassQuery::Binary("separator"))
+ )
+ )
+ )
+ );
+
+ // With `(?i)`: the expected value wraps the whole union in
+ // `hir_case_fold`.
+ #[cfg(all(
+ feature = "unicode-age",
+ feature = "unicode-case",
+ feature = "unicode-gencat",
+ feature = "unicode-script"
+ ))]
+ assert_eq!(
+ t(r"(?i)[\p{age:3.0}\pZ\p{Greek}]"),
+ hir_case_fold(hir_union(
+ hir_uclass_query(ClassQuery::ByValue {
+ property_name: "age",
+ property_value: "3.0",
+ }),
+ hir_union(
+ hir_uclass_query(ClassQuery::Binary("greek")),
+ hir_uclass_query(ClassQuery::Binary("separator"))
+ )
+ ))
+ );
+ // With a leading `^`: the expected value negates the whole union.
+ #[cfg(all(
+ feature = "unicode-age",
+ feature = "unicode-gencat",
+ feature = "unicode-script"
+ ))]
+ assert_eq!(
+ t(r"[^\p{age:3.0}\pZ\p{Greek}]"),
+ hir_negate(hir_union(
+ hir_uclass_query(ClassQuery::ByValue {
+ property_name: "age",
+ property_value: "3.0",
+ }),
+ hir_union(
+ hir_uclass_query(ClassQuery::Binary("greek")),
+ hir_uclass_query(ClassQuery::Binary("separator"))
+ )
+ ))
+ );
+ // Both `(?i)` and `^`: the expected value case folds the union first
+ // and negates the folded result.
+ #[cfg(all(
+ feature = "unicode-age",
+ feature = "unicode-case",
+ feature = "unicode-gencat",
+ feature = "unicode-script"
+ ))]
+ assert_eq!(
+ t(r"(?i)[^\p{age:3.0}\pZ\p{Greek}]"),
+ hir_negate(hir_case_fold(hir_union(
+ hir_uclass_query(ClassQuery::ByValue {
+ property_name: "age",
+ property_value: "3.0",
+ }),
+ hir_union(
+ hir_uclass_query(ClassQuery::Binary("greek")),
+ hir_uclass_query(ClassQuery::Binary("separator"))
+ )
+ )))
+ );
+ }
+
+ #[test]
+ fn class_bracketed_nested() {
+ // Checks bracketed classes that contain a negated class as a member.
+
+ // `a` and `a-b` are already inside `[^c]`, so the expected class is
+ // just "everything except `c`".
+ assert_eq!(t(r"[a[^c]]"), hir_negate(hir_uclass(&[('c', 'c')])));
+ assert_eq!(t(r"[a-b[^c]]"), hir_negate(hir_uclass(&[('c', 'c')])));
+ // `a-c` together with `[^c]` leaves nothing out: the expected class
+ // is the negation of the empty class.
+ assert_eq!(t(r"[a-c[^c]]"), hir_negate(hir_uclass(&[])));
+
+ // Negating the outer class leaves only `c`.
+ assert_eq!(t(r"[^a[^c]]"), hir_uclass(&[('c', 'c')]));
+ assert_eq!(t(r"[^a-b[^c]]"), hir_uclass(&[('c', 'c')]));
+
+ // Same patterns under `(?i)`: the expected value case folds `c`
+ // before negating.
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t(r"(?i)[a[^c]]"),
+ hir_negate(hir_case_fold(hir_uclass(&[('c', 'c')])))
+ );
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t(r"(?i)[a-b[^c]]"),
+ hir_negate(hir_case_fold(hir_uclass(&[('c', 'c')])))
+ );
+
+ // Negated outer class under `(?i)`: both cases of `c` remain.
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(t(r"(?i)[^a[^c]]"), hir_uclass(&[('C', 'C'), ('c', 'c')]));
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t(r"(?i)[^a-b[^c]]"),
+ hir_uclass(&[('C', 'C'), ('c', 'c')])
+ );
+
+ // Negating a class that already matches everything is expected to
+ // fail with `EmptyClassNotAllowed`. The expected span runs from
+ // offset 0 to 10, the full length of the pattern.
+ assert_eq!(
+ t_err(r"[^a-c[^c]]"),
+ TestError {
+ kind: hir::ErrorKind::EmptyClassNotAllowed,
+ span: Span::new(
+ Position::new(0, 1, 1),
+ Position::new(10, 1, 11)
+ ),
+ }
+ );
+ // Same error under `(?i)`. The expected span starts at offset 4,
+ // right after the flag group.
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t_err(r"(?i)[^a-c[^c]]"),
+ TestError {
+ kind: hir::ErrorKind::EmptyClassNotAllowed,
+ span: Span::new(
+ Position::new(4, 1, 5),
+ Position::new(14, 1, 15)
+ ),
+ }
+ );
+ }
+
+ #[test]
+ fn class_bracketed_intersect() {
+ // Checks the `&&` (intersection) operator inside bracketed classes.
+
+ // Unicode classes: plain, nested, chained and multi-item operands.
+ assert_eq!(t("[abc&&b-c]"), hir_uclass(&[('b', 'c')]));
+ assert_eq!(t("[abc&&[b-c]]"), hir_uclass(&[('b', 'c')]));
+ assert_eq!(t("[[abc]&&[b-c]]"), hir_uclass(&[('b', 'c')]));
+ assert_eq!(t("[a-z&&b-y&&c-x]"), hir_uclass(&[('c', 'x')]));
+ assert_eq!(t("[c-da-b&&a-d]"), hir_uclass(&[('a', 'd')]));
+ assert_eq!(t("[a-d&&c-da-b]"), hir_uclass(&[('a', 'd')]));
+ assert_eq!(t(r"[a-z&&a-c]"), hir_uclass(&[('a', 'c')]));
+ assert_eq!(t(r"[[a-z&&a-c]]"), hir_uclass(&[('a', 'c')]));
+ assert_eq!(t(r"[^[a-z&&a-c]]"), hir_negate(hir_uclass(&[('a', 'c')])));
+
+ // The same patterns with Unicode disabled yield byte classes.
+ assert_eq!(t("(?-u)[abc&&b-c]"), hir_bclass(&[(b'b', b'c')]));
+ assert_eq!(t("(?-u)[abc&&[b-c]]"), hir_bclass(&[(b'b', b'c')]));
+ assert_eq!(t("(?-u)[[abc]&&[b-c]]"), hir_bclass(&[(b'b', b'c')]));
+ assert_eq!(t("(?-u)[a-z&&b-y&&c-x]"), hir_bclass(&[(b'c', b'x')]));
+ assert_eq!(t("(?-u)[c-da-b&&a-d]"), hir_bclass(&[(b'a', b'd')]));
+ assert_eq!(t("(?-u)[a-d&&c-da-b]"), hir_bclass(&[(b'a', b'd')]));
+
+ // Case-insensitive Unicode classes: the expected value is the case
+ // fold of the intersection.
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t("(?i)[abc&&b-c]"),
+ hir_case_fold(hir_uclass(&[('b', 'c')]))
+ );
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t("(?i)[abc&&[b-c]]"),
+ hir_case_fold(hir_uclass(&[('b', 'c')]))
+ );
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t("(?i)[[abc]&&[b-c]]"),
+ hir_case_fold(hir_uclass(&[('b', 'c')]))
+ );
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t("(?i)[a-z&&b-y&&c-x]"),
+ hir_case_fold(hir_uclass(&[('c', 'x')]))
+ );
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t("(?i)[c-da-b&&a-d]"),
+ hir_case_fold(hir_uclass(&[('a', 'd')]))
+ );
+ #[cfg(feature = "unicode-case")]
+ assert_eq!(
+ t("(?i)[a-d&&c-da-b]"),
+ hir_case_fold(hir_uclass(&[('a', 'd')]))
+ );
+
+ // Case-insensitive byte classes.
+ assert_eq!(
+ t("(?i-u)[abc&&b-c]"),
+ hir_case_fold(hir_bclass(&[(b'b', b'c')]))
+ );
+ assert_eq!(
+ t("(?i-u)[abc&&[b-c]]"),
+ hir_case_fold(hir_bclass(&[(b'b', b'c')]))
+ );
+ assert_eq!(
+ t("(?i-u)[[abc]&&[b-c]]"),
+ hir_case_fold(hir_bclass(&[(b'b', b'c')]))
+ );
+ assert_eq!(
+ t("(?i-u)[a-z&&b-y&&c-x]"),
+ hir_case_fold(hir_bclass(&[(b'c', b'x')]))
+ );
+ assert_eq!(
+ t("(?i-u)[c-da-b&&a-d]"),
+ hir_case_fold(hir_bclass(&[(b'a', b'd')]))
+ );
+ assert_eq!(
+ t("(?i-u)[a-d&&c-da-b]"),
+ hir_case_fold(hir_bclass(&[(b'a', b'd')]))
+ );
+
+ // In `[a^]`, `^` does not need to be escaped, so it makes sense that
+ // `^` is also allowed to be unescaped after `&&`.
+ assert_eq!(t(r"[\^&&^]"), hir_uclass(&[('^', '^')]));
+ // `]` needs to be escaped after `&&` since it's not at start of class.
+ assert_eq!(t(r"[]&&\]]"), hir_uclass(&[(']', ']')]));
+ // A literal `-` on both sides of `&&`.
+ assert_eq!(t(r"[-&&-]"), hir_uclass(&[('-', '-')]));
+ // Literal `&` operands, escaped and unescaped, around the operator.
+ assert_eq!(t(r"[\&&&&]"), hir_uclass(&[('&', '&')]));
+ assert_eq!(t(r"[\&&&\&]"), hir_uclass(&[('&', '&')]));
+ // Test precedence.
+ // The expected result excludes `z`, so `[^c-g]z` is combined into one
+ // operand before it is intersected with `a-w`.
+ assert_eq!(
+ t(r"[a-w&&[^c-g]z]"),
+ hir_uclass(&[('a', 'b'), ('h', 'w')])
+ );
+ }
+
+ #[test]
+ fn class_bracketed_intersect_negate() {
+ // Checks how `^` negation interacts with `&&` intersection, first for
+ // Unicode classes and then for byte classes via `t_bytes`.
+
+ // A leading `^` negates the whole intersection: the expected value is
+ // the negated digit class, not `[^\w]` intersected with `\d`.
+ #[cfg(feature = "unicode-perl")]
+ assert_eq!(
+ t(r"[^\w&&\d]"),
+ hir_negate(hir_uclass_query(ClassQuery::Binary("digit")))
+ );
+ assert_eq!(t(r"[^[a-z&&a-c]]"), hir_negate(hir_uclass(&[('a', 'c')])));
+ #[cfg(feature = "unicode-perl")]
+ assert_eq!(
+ t(r"[^[\w&&\d]]"),
+ hir_negate(hir_uclass_query(ClassQuery::Binary("digit")))
+ );
+ // Double negation yields the plain digit class.
+ #[cfg(feature = "unicode-perl")]
+ assert_eq!(
+ t(r"[^[^\w&&\d]]"),
+ hir_uclass_query(ClassQuery::Binary("digit"))
+ );
+ // Intersecting two negated classes: the expected value is the negated
+ // word class.
+ #[cfg(feature = "unicode-perl")]
+ assert_eq!(t(r"[[[^\w]&&[^\d]]]"), hir_negate(hir_uclass_perl_word()));
+
+ // Byte-class (`(?-u)`) counterparts of the cases above.
+ #[cfg(feature = "unicode-perl")]
+ assert_eq!(
+ t_bytes(r"(?-u)[^\w&&\d]"),
+ hir_negate(hir_bclass_from_char(ascii_class(
+ &ast::ClassAsciiKind::Digit
+ )))
+ );
+ assert_eq!(
+ t_bytes(r"(?-u)[^[a-z&&a-c]]"),
+ hir_negate(hir_bclass(&[(b'a', b'c')]))
+ );
+ assert_eq!(
+ t_bytes(r"(?-u)[^[\w&&\d]]"),
+ hir_negate(hir_bclass_from_char(ascii_class(
+ &ast::ClassAsciiKind::Digit
+ )))
+ );
+ assert_eq!(
+ t_bytes(r"(?-u)[^[^\w&&\d]]"),
+ hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Digit))
+ );
+ assert_eq!(
+ t_bytes(r"(?-u)[[[^\w]&&[^\d]]]"),
+ hir_negate(hir_bclass_from_char(ascii_class(
+ &ast::ClassAsciiKind::Word
+ )))
+ );
+ }
+
+ #[test]
+ fn class_bracketed_difference() {
+ // Checks the `--` (difference) operator inside bracketed classes.
+
+ // Unicode: letters minus the ASCII range `\0`-`\x7F`.
+ #[cfg(feature = "unicode-gencat")]
+ assert_eq!(
+ t(r"[\pL--[:ascii:]]"),
+ hir_difference(
+ hir_uclass_query(ClassQuery::Binary("letter")),
+ hir_uclass(&[('\0', '\x7F')])
+ )
+ );
+
+ // Bytes: `[:alpha:]` minus `[:lower:]` leaves only `A-Z`.
+ assert_eq!(
+ t(r"(?-u)[[:alpha:]--[:lower:]]"),
+ hir_bclass(&[(b'A', b'Z')])
+ );
+ }
+
+ #[test]
+ fn class_bracketed_symmetric_difference() {
+ // Checks the `~~` (symmetric difference) operator inside bracketed
+ // classes: the result holds what is in exactly one of the operands.
+
+ // `sc:Greek` versus `scx:Greek`: the expected class lists the code
+ // points in which the two queries differ.
+ #[cfg(feature = "unicode-script")]
+ assert_eq!(
+ t(r"[\p{sc:Greek}~~\p{scx:Greek}]"),
+ hir_uclass(&[
+ ('\u{0342}', '\u{0342}'),
+ ('\u{0345}', '\u{0345}'),
+ ('\u{1DC0}', '\u{1DC1}'),
+ ])
+ );
+ // `a-g` and `c-j` overlap on `c-g`, leaving `a-b` and `h-j`.
+ assert_eq!(t(r"[a-g~~c-j]"), hir_uclass(&[('a', 'b'), ('h', 'j')]));
+
+ // Same operands as a byte class.
+ assert_eq!(
+ t(r"(?-u)[a-g~~c-j]"),
+ hir_bclass(&[(b'a', b'b'), (b'h', b'j')])
+ );
+ }
+
+ #[test]
+ fn ignore_whitespace() {
+ // Checks translation with the `(?x)` flag, where whitespace and
+ // `# ...` comments inside the pattern are expected to be ignored.
+ // The multi-line raw strings below are part of the patterns under
+ // test, so their layout is significant and must not be reformatted.
+
+ // The space after `\12` is dropped; the expected literal is a newline
+ // followed by `3`.
+ assert_eq!(t(r"(?x)\12 3"), hir_lit("\n3"));
+ // Whitespace and comments inside a braced hex escape.
+ assert_eq!(t(r"(?x)\x { 53 }"), hir_lit("S"));
+ assert_eq!(
+ t(r"(?x)\x # comment
+{ # comment
+ 53 # comment
+} #comment"),
+ hir_lit("S")
+ );
+
+ // Whitespace and comments inside an unbraced two-digit hex escape.
+ assert_eq!(t(r"(?x)\x 53"), hir_lit("S"));
+ assert_eq!(
+ t(r"(?x)\x # comment
+ 53 # comment"),
+ hir_lit("S")
+ );
+ assert_eq!(t(r"(?x)\x5 3"), hir_lit("S"));
+
+ // Whitespace and comments inside a Unicode property escape.
+ #[cfg(feature = "unicode-gencat")]
+ assert_eq!(
+ t(r"(?x)\p # comment
+{ # comment
+ Separator # comment
+} # comment"),
+ hir_uclass_query(ClassQuery::Binary("separator"))
+ );
+
+ // Whitespace and comments inside a counted repetition `{5,10}`.
+ assert_eq!(
+ t(r"(?x)a # comment
+{ # comment
+ 5 # comment
+ , # comment
+ 10 # comment
+} # comment"),
+ hir_range(
+ true,
+ hir::RepetitionRange::Bounded(5, 10),
+ hir_lit("a")
+ )
+ );
+
+ // An escaped space is kept as a literal; the trailing comment is not.
+ assert_eq!(t(r"(?x)a\ # hi there"), hir_lit("a "));
+ }
+
+ #[test]
+ fn analysis_is_always_utf8() {
+ // Patterns whose HIR, as produced by `t_bytes`, must report
+ // `is_always_utf8()`.
+ let always_utf8 = [
+ r"a",
+ r"ab",
+ r"(?-u)a",
+ r"(?-u)ab",
+ r"\xFF",
+ r"\xFF\xFF",
+ r"[^a]",
+ r"[^a][^a]",
+ r"\b",
+ r"\B",
+ r"(?-u)\b",
+ ];
+ for &pattern in always_utf8.iter() {
+ assert!(t_bytes(pattern).is_always_utf8(), "pattern: {:?}", pattern);
+ }
+
+ // Patterns whose HIR must NOT report `is_always_utf8()`.
+ let not_always_utf8 = [
+ r"(?-u)\xFF",
+ r"(?-u)\xFF\xFF",
+ r"(?-u)[^a]",
+ r"(?-u)[^a][^a]",
+ r"(?-u)\B",
+ ];
+ for &pattern in not_always_utf8.iter() {
+ assert!(!t_bytes(pattern).is_always_utf8(), "pattern: {:?}", pattern);
+ }
+ }
+
+ #[test]
+ fn analysis_is_all_assertions() {
+ // Patterns built only from anchors and word boundaries (possibly
+ // grouped, alternated or repeated) must report `is_all_assertions()`.
+ let only_assertions = [
+ r"\b",
+ r"\B",
+ r"^",
+ r"$",
+ r"\A",
+ r"\z",
+ r"$^\z\A\b\B",
+ r"$|^|\z|\A|\b|\B",
+ r"^$|$^",
+ r"((\b)+())*^",
+ ];
+ for &pattern in only_assertions.iter() {
+ assert!(t(pattern).is_all_assertions(), "pattern: {:?}", pattern);
+ }
+
+ // A pattern that also contains a literal must not.
+ let with_literal = [r"^a"];
+ for &pattern in with_literal.iter() {
+ assert!(!t(pattern).is_all_assertions(), "pattern: {:?}", pattern);
+ }
+ }
+
+ #[test]
+ fn analysis_is_anchored() {
+ // Checks the four anchoring predicates on translated patterns that do
+ // not rely on multi-line mode at the top level: `is_anchored_start`,
+ // `is_anchored_end`, `is_line_anchored_start` and
+ // `is_line_anchored_end`. Each group below exercises all four.
+
+ // Positive examples.
+ assert!(t(r"^").is_anchored_start());
+ assert!(t(r"$").is_anchored_end());
+ assert!(t(r"^").is_line_anchored_start());
+ assert!(t(r"$").is_line_anchored_end());
+
+ assert!(t(r"^^").is_anchored_start());
+ assert!(t(r"$$").is_anchored_end());
+ assert!(t(r"^^").is_line_anchored_start());
+ assert!(t(r"$$").is_line_anchored_end());
+
+ assert!(t(r"^$").is_anchored_start());
+ assert!(t(r"^$").is_anchored_end());
+ assert!(t(r"^$").is_line_anchored_start());
+ assert!(t(r"^$").is_line_anchored_end());
+
+ assert!(t(r"^foo").is_anchored_start());
+ assert!(t(r"foo$").is_anchored_end());
+ assert!(t(r"^foo").is_line_anchored_start());
+ assert!(t(r"foo$").is_line_anchored_end());
+
+ // Alternations where every branch carries the anchor.
+ assert!(t(r"^foo|^bar").is_anchored_start());
+ assert!(t(r"foo$|bar$").is_anchored_end());
+ assert!(t(r"^foo|^bar").is_line_anchored_start());
+ assert!(t(r"foo$|bar$").is_line_anchored_end());
+
+ assert!(t(r"^(foo|bar)").is_anchored_start());
+ assert!(t(r"(foo|bar)$").is_anchored_end());
+ assert!(t(r"^(foo|bar)").is_line_anchored_start());
+ assert!(t(r"(foo|bar)$").is_line_anchored_end());
+
+ // `+` repetitions keep the anchor.
+ assert!(t(r"^+").is_anchored_start());
+ assert!(t(r"$+").is_anchored_end());
+ assert!(t(r"^+").is_line_anchored_start());
+ assert!(t(r"$+").is_line_anchored_end());
+ assert!(t(r"^++").is_anchored_start());
+ assert!(t(r"$++").is_anchored_end());
+ assert!(t(r"^++").is_line_anchored_start());
+ assert!(t(r"$++").is_line_anchored_end());
+ assert!(t(r"(^)+").is_anchored_start());
+ assert!(t(r"($)+").is_anchored_end());
+ assert!(t(r"(^)+").is_line_anchored_start());
+ assert!(t(r"($)+").is_line_anchored_end());
+
+ // `$^` is expected to satisfy all four predicates, consistent with
+ // the `$^|^$` assertions right below. Each predicate is checked once.
+ assert!(t(r"$^").is_anchored_start());
+ assert!(t(r"$^").is_anchored_end());
+ assert!(t(r"$^").is_line_anchored_start());
+ assert!(t(r"$^").is_line_anchored_end());
+ assert!(t(r"$^|^$").is_anchored_start());
+ assert!(t(r"$^|^$").is_anchored_end());
+ assert!(t(r"$^|^$").is_line_anchored_start());
+ assert!(t(r"$^|^$").is_line_anchored_end());
+
+ // Other assertions sitting next to the anchor do not hide it.
+ assert!(t(r"\b^").is_anchored_start());
+ assert!(t(r"$\b").is_anchored_end());
+ assert!(t(r"\b^").is_line_anchored_start());
+ assert!(t(r"$\b").is_line_anchored_end());
+ assert!(t(r"^(?m:^)").is_anchored_start());
+ assert!(t(r"(?m:$)$").is_anchored_end());
+ assert!(t(r"^(?m:^)").is_line_anchored_start());
+ assert!(t(r"(?m:$)$").is_line_anchored_end());
+ assert!(t(r"(?m:^)^").is_anchored_start());
+ assert!(t(r"$(?m:$)").is_anchored_end());
+ assert!(t(r"(?m:^)^").is_line_anchored_start());
+ assert!(t(r"$(?m:$)").is_line_anchored_end());
+
+ // Negative examples.
+ assert!(!t(r"(?m)^").is_anchored_start());
+ assert!(!t(r"(?m)$").is_anchored_end());
+ assert!(!t(r"(?m:^$)|$^").is_anchored_start());
+ assert!(!t(r"(?m:^$)|$^").is_anchored_end());
+ assert!(!t(r"$^|(?m:^$)").is_anchored_start());
+ assert!(!t(r"$^|(?m:^$)").is_anchored_end());
+
+ // A literal on the relevant side of the anchor breaks anchoring.
+ assert!(!t(r"a^").is_anchored_start());
+ assert!(!t(r"$a").is_anchored_start());
+ assert!(!t(r"a^").is_line_anchored_start());
+ assert!(!t(r"$a").is_line_anchored_start());
+
+ assert!(!t(r"a^").is_anchored_end());
+ assert!(!t(r"$a").is_anchored_end());
+ assert!(!t(r"a^").is_line_anchored_end());
+ assert!(!t(r"$a").is_line_anchored_end());
+
+ // Only one branch of the alternation is anchored.
+ assert!(!t(r"^foo|bar").is_anchored_start());
+ assert!(!t(r"foo|bar$").is_anchored_end());
+ assert!(!t(r"^foo|bar").is_line_anchored_start());
+ assert!(!t(r"foo|bar$").is_line_anchored_end());
+
+ // A `*` repetition makes the anchor optional.
+ assert!(!t(r"^*").is_anchored_start());
+ assert!(!t(r"$*").is_anchored_end());
+ assert!(!t(r"^*").is_line_anchored_start());
+ assert!(!t(r"$*").is_line_anchored_end());
+ assert!(!t(r"^*+").is_anchored_start());
+ assert!(!t(r"$*+").is_anchored_end());
+ assert!(!t(r"^*+").is_line_anchored_start());
+ assert!(!t(r"$*+").is_line_anchored_end());
+ assert!(!t(r"^+*").is_anchored_start());
+ assert!(!t(r"$+*").is_anchored_end());
+ assert!(!t(r"^+*").is_line_anchored_start());
+ assert!(!t(r"$+*").is_line_anchored_end());
+ assert!(!t(r"(^)*").is_anchored_start());
+ assert!(!t(r"($)*").is_anchored_end());
+ assert!(!t(r"(^)*").is_line_anchored_start());
+ assert!(!t(r"($)*").is_line_anchored_end());
+ }
+
+ #[test]
+ fn analysis_is_line_anchored() {
+ // Patterns using multi-line anchors that must report
+ // `is_line_anchored_start()`.
+ let line_start = [
+ r"(?m)^(foo|bar)",
+ r"(?m)^foo|^bar",
+ r"(?m)^",
+ r"(?m:^$)|$^",
+ r"$^|(?m:^$)",
+ ];
+ for &pattern in line_start.iter() {
+ assert!(
+ t(pattern).is_line_anchored_start(),
+ "pattern: {:?}",
+ pattern
+ );
+ }
+
+ // Patterns using multi-line anchors that must report
+ // `is_line_anchored_end()`.
+ let line_end = [
+ r"(?m)(foo|bar)$",
+ r"(?m)foo$|bar$",
+ r"(?m)$",
+ r"(?m:^$)|$^",
+ r"$^|(?m:^$)",
+ ];
+ for &pattern in line_end.iter() {
+ assert!(t(pattern).is_line_anchored_end(), "pattern: {:?}", pattern);
+ }
+ }
+
+ #[test]
+ fn analysis_is_any_anchored() {
+ // Text anchors must be reported by the matching predicate.
+ for &pattern in [r"^", r"\A"].iter() {
+ assert!(t(pattern).is_any_anchored_start(), "pattern: {:?}", pattern);
+ }
+ for &pattern in [r"$", r"\z"].iter() {
+ assert!(t(pattern).is_any_anchored_end(), "pattern: {:?}", pattern);
+ }
+
+ // Multi-line anchors, and anchors for the opposite end, must not.
+ for &pattern in [r"(?m)^", r"$"].iter() {
+ assert!(
+ !t(pattern).is_any_anchored_start(),
+ "pattern: {:?}",
+ pattern
+ );
+ }
+ for &pattern in [r"(?m)$", r"^"].iter() {
+ assert!(!t(pattern).is_any_anchored_end(), "pattern: {:?}", pattern);
+ }
+ }
+
+ #[test]
+ fn analysis_is_match_empty() {
+ // Patterns that must report `is_match_empty()`.
+ let matches_empty = [
+ r"",
+ r"()",
+ r"()*",
+ r"()+",
+ r"()?",
+ r"a*",
+ r"a?",
+ r"a{0}",
+ r"a{0,}",
+ r"a{0,1}",
+ r"a{0,10}",
+ r"a*|b",
+ r"b|a*",
+ r"a|",
+ r"|a",
+ r"a||b",
+ r"a*a?(abcd)*",
+ r"^",
+ r"$",
+ r"(?m)^",
+ r"(?m)$",
+ r"\A",
+ r"\z",
+ r"\B",
+ r"\b",
+ r"(?-u)\b",
+ ];
+ for &pattern in matches_empty.iter() {
+ assert!(t(pattern).is_match_empty(), "pattern: {:?}", pattern);
+ }
+ // Needs Unicode general category data, so it is gated separately.
+ #[cfg(feature = "unicode-gencat")]
+ assert!(t(r"\pL*").is_match_empty());
+ // This pattern goes through `t_bytes` rather than `t`.
+ assert!(t_bytes(r"(?-u)\B").is_match_empty());
+
+ // Patterns that must NOT report `is_match_empty()`.
+ let never_empty = [
+ r"a+",
+ r"a{1}",
+ r"a{1,}",
+ r"a{1,2}",
+ r"a{1,10}",
+ r"b|a",
+ r"a*a+(abcd)*",
+ ];
+ for &pattern in never_empty.iter() {
+ assert!(!t(pattern).is_match_empty(), "pattern: {:?}", pattern);
+ }
+ }
+
+ #[test]
+ fn analysis_is_literal() {
+ // Plain character sequences, with or without a flag prefix, must
+ // report `is_literal()`.
+ let literals = [r"a", r"ab", r"abc", r"(?m)abc"];
+ for &pattern in literals.iter() {
+ assert!(t(pattern).is_literal(), "pattern: {:?}", pattern);
+ }
+
+ // The empty pattern, anchors, alternations, groups, repetitions and
+ // classes must not.
+ let non_literals = [
+ r"",
+ r"^",
+ r"a|b",
+ r"(a)",
+ r"a+",
+ r"foo(a)",
+ r"(a)foo",
+ r"[a]",
+ ];
+ for &pattern in non_literals.iter() {
+ assert!(!t(pattern).is_literal(), "pattern: {:?}", pattern);
+ }
+ }
+
+ #[test]
+ fn analysis_is_alternation_literal() {
+ // Single literals and alternations made only of literals must report
+ // `is_alternation_literal()`.
+ let literal_alternations = [
+ r"a",
+ r"ab",
+ r"abc",
+ r"(?m)abc",
+ r"a|b",
+ r"a|b|c",
+ r"foo|bar",
+ r"foo|bar|baz",
+ ];
+ for &pattern in literal_alternations.iter() {
+ assert!(
+ t(pattern).is_alternation_literal(),
+ "pattern: {:?}",
+ pattern
+ );
+ }
+
+ // Anything containing a non-literal piece (or nothing at all) must
+ // not.
+ let others = [
+ r"",
+ r"^",
+ r"(a)",
+ r"a+",
+ r"foo(a)",
+ r"(a)foo",
+ r"[a]",
+ r"[a]|b",
+ r"a|[b]",
+ r"(a)|b",
+ r"a|(b)",
+ ];
+ for &pattern in others.iter() {
+ assert!(
+ !t(pattern).is_alternation_literal(),
+ "pattern: {:?}",
+ pattern
+ );
+ }
+ }
+}