/*!
Defines a translator that converts an `Ast` to an `Hir`.
*/

use core::cell::{Cell, RefCell};

use alloc::{boxed::Box, string::ToString, vec, vec::Vec};

use crate::{
    ast::{self, Ast, Span, Visitor},
    either::Either,
    hir::{self, Error, ErrorKind, Hir, HirKind},
    unicode::{self, ClassQuery},
};

/// The result type used throughout this module. The error is always an HIR
/// translation [`Error`].
type Result<T> = core::result::Result<T, Error>;

/// A builder for constructing an AST->HIR translator.
#[derive(Clone, Debug)]
pub struct TranslatorBuilder {
    /// Whether the translator must reject HIR that can match invalid UTF-8.
    utf8: bool,
    /// The byte that `.` refuses to match when `s` mode is disabled.
    line_terminator: u8,
    /// The flags that are active when translation begins.
    flags: Flags,
}

impl Default for TranslatorBuilder {
    fn default() -> TranslatorBuilder {
        TranslatorBuilder::new()
    }
}

impl TranslatorBuilder {
    /// Create a new translator builder with a default configuration.
    ///
    /// By default, UTF-8 mode is enabled, the line terminator is `\n` and no
    /// flags are explicitly set.
    pub fn new() -> TranslatorBuilder {
        TranslatorBuilder {
            utf8: true,
            line_terminator: b'\n',
            flags: Flags::default(),
        }
    }

    /// Build a translator using the current configuration.
    ///
    /// The builder is not consumed, so it may be used to build additional
    /// translators. Each translator starts with an empty stack.
    pub fn build(&self) -> Translator {
        Translator {
            stack: RefCell::new(vec![]),
            flags: Cell::new(self.flags),
            utf8: self.utf8,
            line_terminator: self.line_terminator,
        }
    }

    /// When disabled, translation will permit the construction of a regular
    /// expression that may match invalid UTF-8.
    ///
    /// When enabled (the default), the translator is guaranteed to produce an
    /// expression that, for non-empty matches, will only ever produce spans
    /// that are entirely valid UTF-8 (otherwise, the translator will return an
    /// error).
    ///
    /// Perhaps surprisingly, when UTF-8 is enabled, an empty regex or even
    /// a negated ASCII word boundary (uttered as `(?-u:\B)` in the concrete
    /// syntax) will be allowed even though they can produce matches that split
    /// a UTF-8 encoded codepoint. This only applies to zero-width or "empty"
    /// matches, and it is expected that the regex engine itself must handle
    /// these cases if necessary (perhaps by suppressing any zero-width matches
    /// that split a codepoint).
    pub fn utf8(&mut self, yes: bool) -> &mut TranslatorBuilder {
        self.utf8 = yes;
        self
    }

    /// Sets the line terminator for use with `(?u-s:.)` and `(?-us:.)`.
    ///
    /// Namely, instead of `.` (by default) matching everything except for `\n`,
    /// this will cause `.` to match everything except for the byte given.
    ///
    /// If `.` is used in a context where Unicode mode is enabled and this byte
    /// isn't ASCII, then an error will be returned. When Unicode mode is
    /// disabled, then any byte is permitted, but will return an error if UTF-8
    /// mode is enabled and it is a non-ASCII byte.
    ///
    /// In short, any ASCII value for a line terminator is always okay. But a
    /// non-ASCII byte might result in an error depending on whether Unicode
    /// mode or UTF-8 mode are enabled.
    ///
    /// Note that if `R` mode is enabled then it always takes precedence and
    /// the line terminator will be treated as `\r` and `\n` simultaneously.
    ///
    /// Note also that this *doesn't* impact the look-around assertions
    /// `(?m:^)` and `(?m:$)`. That's usually controlled by additional
    /// configuration in the regex engine itself.
    pub fn line_terminator(&mut self, byte: u8) -> &mut TranslatorBuilder {
        self.line_terminator = byte;
        self
    }

    /// Enable or disable the case insensitive flag (`i`) by default.
    ///
    /// Disabling leaves the flag unset (as opposed to explicitly disabled).
    pub fn case_insensitive(&mut self, yes: bool) -> &mut TranslatorBuilder {
        self.flags.case_insensitive = if yes { Some(true) } else { None };
        self
    }

    /// Enable or disable the multi-line matching flag (`m`) by default.
    ///
    /// Disabling leaves the flag unset (as opposed to explicitly disabled).
    pub fn multi_line(&mut self, yes: bool) -> &mut TranslatorBuilder {
        self.flags.multi_line = if yes { Some(true) } else { None };
        self
    }

    /// Enable or disable the "dot matches any character" flag (`s`) by
    /// default.
    ///
    /// Disabling leaves the flag unset (as opposed to explicitly disabled).
    pub fn dot_matches_new_line(
        &mut self,
        yes: bool,
    ) -> &mut TranslatorBuilder {
        self.flags.dot_matches_new_line = if yes { Some(true) } else { None };
        self
    }

    /// Enable or disable the CRLF mode flag (`R`) by default.
    ///
    /// Disabling leaves the flag unset (as opposed to explicitly disabled).
    pub fn crlf(&mut self, yes: bool) -> &mut TranslatorBuilder {
        self.flags.crlf = if yes { Some(true) } else { None };
        self
    }

    /// Enable or disable the "swap greed" flag (`U`) by default.
    ///
    /// Disabling leaves the flag unset (as opposed to explicitly disabled).
    pub fn swap_greed(&mut self, yes: bool) -> &mut TranslatorBuilder {
        self.flags.swap_greed = if yes { Some(true) } else { None };
        self
    }

    /// Enable or disable the Unicode flag (`u`) by default.
    pub fn unicode(&mut self, yes: bool) -> &mut TranslatorBuilder {
        // Unlike the other flags, an unset Unicode flag is treated as enabled
        // (see `Flags::unicode`), so "enabled" is represented by `None` and
        // only "disabled" needs an explicit value.
        self.flags.unicode = if yes { None } else { Some(false) };
        self
    }
}

/// A translator maps abstract syntax to a high level intermediate
/// representation.
///
/// A translator may benefit from reuse. That is, a translator can translate
/// many abstract syntax trees.
///
/// A `Translator` can be configured in more detail via a
/// [`TranslatorBuilder`].
#[derive(Clone, Debug)]
pub struct Translator {
    /// Our call stack, but on the heap.
    stack: RefCell<Vec<HirFrame>>,
    /// The current flag settings.
    flags: Cell<Flags>,
    /// When true, translation fails with an error instead of producing HIR
    /// that could match invalid UTF-8. When false, HIR that matches arbitrary
    /// bytes is permitted.
    utf8: bool,
    /// The line terminator to use for `.`.
    line_terminator: u8,
}

impl Translator {
    /// Create a new translator using the default configuration.
    pub fn new() -> Translator {
        TranslatorBuilder::new().build()
    }

    /// Translate the given abstract syntax tree (AST) into a high level
    /// intermediate representation (HIR).
    ///
    /// If there was a problem doing the translation, then an HIR-specific
    /// error is returned.
    ///
    /// The original pattern string used to produce the `Ast` *must* also be
    /// provided. The translator does not use the pattern string during any
    /// correct translation, but it is used for error reporting.
    pub fn translate(&mut self, pattern: &str, ast: &Ast) -> Result<Hir> {
        ast::visit(ast, TranslatorI::new(self, pattern))
    }
}

/// An HirFrame is a single stack frame, represented explicitly, which is
/// created for each item in the Ast that we traverse.
///
/// Note that technically, this type doesn't represent our entire stack
/// frame.
In particular, the Ast visitor represents any state associated with /// traversing the Ast itself. #[derive(Clone, Debug)] enum HirFrame { /// An arbitrary HIR expression. These get pushed whenever we hit a base /// case in the Ast. They get popped after an inductive (i.e., recursive) /// step is complete. Expr(Hir), /// A literal that is being constructed, character by character, from the /// AST. We need this because the AST gives each individual character its /// own node. So as we see characters, we peek at the top-most HirFrame. /// If it's a literal, then we add to it. Otherwise, we push a new literal. /// When it comes time to pop it, we convert it to an Hir via Hir::literal. Literal(Vec), /// A Unicode character class. This frame is mutated as we descend into /// the Ast of a character class (which is itself its own mini recursive /// structure). ClassUnicode(hir::ClassUnicode), /// A byte-oriented character class. This frame is mutated as we descend /// into the Ast of a character class (which is itself its own mini /// recursive structure). /// /// Byte character classes are created when Unicode mode (`u`) is disabled. /// If `utf8` is enabled (the default), then a byte character is only /// permitted to match ASCII text. ClassBytes(hir::ClassBytes), /// This is pushed whenever a repetition is observed. After visiting every /// sub-expression in the repetition, the translator's stack is expected to /// have this sentinel at the top. /// /// This sentinel only exists to stop other things (like flattening /// literals) from reaching across repetition operators. Repetition, /// This is pushed on to the stack upon first seeing any kind of capture, /// indicated by parentheses (including non-capturing groups). It is popped /// upon leaving a group. Group { /// The old active flags when this group was opened. 
/// /// If this group sets flags, then the new active flags are set to the /// result of merging the old flags with the flags introduced by this /// group. If the group doesn't set any flags, then this is simply /// equivalent to whatever flags were set when the group was opened. /// /// When this group is popped, the active flags should be restored to /// the flags set here. /// /// The "active" flags correspond to whatever flags are set in the /// Translator. old_flags: Flags, }, /// This is pushed whenever a concatenation is observed. After visiting /// every sub-expression in the concatenation, the translator's stack is /// popped until it sees a Concat frame. Concat, /// This is pushed whenever an alternation is observed. After visiting /// every sub-expression in the alternation, the translator's stack is /// popped until it sees an Alternation frame. Alternation, /// This is pushed immediately before each sub-expression in an /// alternation. This separates the branches of an alternation on the /// stack and prevents literal flattening from reaching across alternation /// branches. /// /// It is popped after each expression in a branch until an 'Alternation' /// frame is observed when doing a post visit on an alternation. AlternationBranch, } impl HirFrame { /// Assert that the current stack frame is an Hir expression and return it. fn unwrap_expr(self) -> Hir { match self { HirFrame::Expr(expr) => expr, HirFrame::Literal(lit) => Hir::literal(lit), _ => panic!("tried to unwrap expr from HirFrame, got: {:?}", self), } } /// Assert that the current stack frame is a Unicode class expression and /// return it. fn unwrap_class_unicode(self) -> hir::ClassUnicode { match self { HirFrame::ClassUnicode(cls) => cls, _ => panic!( "tried to unwrap Unicode class \ from HirFrame, got: {:?}", self ), } } /// Assert that the current stack frame is a byte class expression and /// return it. 
fn unwrap_class_bytes(self) -> hir::ClassBytes { match self { HirFrame::ClassBytes(cls) => cls, _ => panic!( "tried to unwrap byte class \ from HirFrame, got: {:?}", self ), } } /// Assert that the current stack frame is a repetition sentinel. If it /// isn't, then panic. fn unwrap_repetition(self) { match self { HirFrame::Repetition => {} _ => { panic!( "tried to unwrap repetition from HirFrame, got: {:?}", self ) } } } /// Assert that the current stack frame is a group indicator and return /// its corresponding flags (the flags that were active at the time the /// group was entered). fn unwrap_group(self) -> Flags { match self { HirFrame::Group { old_flags } => old_flags, _ => { panic!("tried to unwrap group from HirFrame, got: {:?}", self) } } } /// Assert that the current stack frame is an alternation pipe sentinel. If /// it isn't, then panic. fn unwrap_alternation_pipe(self) { match self { HirFrame::AlternationBranch => {} _ => { panic!( "tried to unwrap alt pipe from HirFrame, got: {:?}", self ) } } } } impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { type Output = Hir; type Err = Error; fn finish(self) -> Result { // ... otherwise, we should have exactly one HIR on the stack. 
assert_eq!(self.trans().stack.borrow().len(), 1); Ok(self.pop().unwrap().unwrap_expr()) } fn visit_pre(&mut self, ast: &Ast) -> Result<()> { match *ast { Ast::Class(ast::Class::Bracketed(_)) => { if self.flags().unicode() { let cls = hir::ClassUnicode::empty(); self.push(HirFrame::ClassUnicode(cls)); } else { let cls = hir::ClassBytes::empty(); self.push(HirFrame::ClassBytes(cls)); } } Ast::Repetition(_) => self.push(HirFrame::Repetition), Ast::Group(ref x) => { let old_flags = x .flags() .map(|ast| self.set_flags(ast)) .unwrap_or_else(|| self.flags()); self.push(HirFrame::Group { old_flags }); } Ast::Concat(ref x) if x.asts.is_empty() => {} Ast::Concat(_) => { self.push(HirFrame::Concat); } Ast::Alternation(ref x) if x.asts.is_empty() => {} Ast::Alternation(_) => { self.push(HirFrame::Alternation); self.push(HirFrame::AlternationBranch); } _ => {} } Ok(()) } fn visit_post(&mut self, ast: &Ast) -> Result<()> { match *ast { Ast::Empty(_) => { self.push(HirFrame::Expr(Hir::empty())); } Ast::Flags(ref x) => { self.set_flags(&x.flags); // Flags in the AST are generally considered directives and // not actual sub-expressions. However, they can be used in // the concrete syntax like `((?i))`, and we need some kind of // indication of an expression there, and Empty is the correct // choice. // // There can also be things like `(?i)+`, but we rule those out // in the parser. In the future, we might allow them for // consistency sake. self.push(HirFrame::Expr(Hir::empty())); } Ast::Literal(ref x) => { match self.ast_literal_to_scalar(x)? { Either::Right(byte) => self.push_byte(byte), Either::Left(ch) => { if !self.flags().unicode() && ch.len_utf8() > 1 { return Err(self .error(x.span, ErrorKind::UnicodeNotAllowed)); } match self.case_fold_char(x.span, ch)? 
{ None => self.push_char(ch), Some(expr) => self.push(HirFrame::Expr(expr)), } } } // self.push(HirFrame::Expr(self.hir_literal(x)?)); } Ast::Dot(span) => { self.push(HirFrame::Expr(self.hir_dot(span)?)); } Ast::Assertion(ref x) => { self.push(HirFrame::Expr(self.hir_assertion(x)?)); } Ast::Class(ast::Class::Perl(ref x)) => { if self.flags().unicode() { let cls = self.hir_perl_unicode_class(x)?; let hcls = hir::Class::Unicode(cls); self.push(HirFrame::Expr(Hir::class(hcls))); } else { let cls = self.hir_perl_byte_class(x)?; let hcls = hir::Class::Bytes(cls); self.push(HirFrame::Expr(Hir::class(hcls))); } } Ast::Class(ast::Class::Unicode(ref x)) => { let cls = hir::Class::Unicode(self.hir_unicode_class(x)?); self.push(HirFrame::Expr(Hir::class(cls))); } Ast::Class(ast::Class::Bracketed(ref ast)) => { if self.flags().unicode() { let mut cls = self.pop().unwrap().unwrap_class_unicode(); self.unicode_fold_and_negate( &ast.span, ast.negated, &mut cls, )?; let expr = Hir::class(hir::Class::Unicode(cls)); self.push(HirFrame::Expr(expr)); } else { let mut cls = self.pop().unwrap().unwrap_class_bytes(); self.bytes_fold_and_negate( &ast.span, ast.negated, &mut cls, )?; let expr = Hir::class(hir::Class::Bytes(cls)); self.push(HirFrame::Expr(expr)); } } Ast::Repetition(ref x) => { let expr = self.pop().unwrap().unwrap_expr(); self.pop().unwrap().unwrap_repetition(); self.push(HirFrame::Expr(self.hir_repetition(x, expr))); } Ast::Group(ref x) => { let expr = self.pop().unwrap().unwrap_expr(); let old_flags = self.pop().unwrap().unwrap_group(); self.trans().flags.set(old_flags); self.push(HirFrame::Expr(self.hir_capture(x, expr))); } Ast::Concat(_) => { let mut exprs = vec![]; while let Some(expr) = self.pop_concat_expr() { if !matches!(*expr.kind(), HirKind::Empty) { exprs.push(expr); } } exprs.reverse(); self.push(HirFrame::Expr(Hir::concat(exprs))); } Ast::Alternation(_) => { let mut exprs = vec![]; while let Some(expr) = self.pop_alt_expr() { 
self.pop().unwrap().unwrap_alternation_pipe(); exprs.push(expr); } exprs.reverse(); self.push(HirFrame::Expr(Hir::alternation(exprs))); } } Ok(()) } fn visit_alternation_in(&mut self) -> Result<()> { self.push(HirFrame::AlternationBranch); Ok(()) } fn visit_class_set_item_pre( &mut self, ast: &ast::ClassSetItem, ) -> Result<()> { match *ast { ast::ClassSetItem::Bracketed(_) => { if self.flags().unicode() { let cls = hir::ClassUnicode::empty(); self.push(HirFrame::ClassUnicode(cls)); } else { let cls = hir::ClassBytes::empty(); self.push(HirFrame::ClassBytes(cls)); } } // We needn't handle the Union case here since the visitor will // do it for us. _ => {} } Ok(()) } fn visit_class_set_item_post( &mut self, ast: &ast::ClassSetItem, ) -> Result<()> { match *ast { ast::ClassSetItem::Empty(_) => {} ast::ClassSetItem::Literal(ref x) => { if self.flags().unicode() { let mut cls = self.pop().unwrap().unwrap_class_unicode(); cls.push(hir::ClassUnicodeRange::new(x.c, x.c)); self.push(HirFrame::ClassUnicode(cls)); } else { let mut cls = self.pop().unwrap().unwrap_class_bytes(); let byte = self.class_literal_byte(x)?; cls.push(hir::ClassBytesRange::new(byte, byte)); self.push(HirFrame::ClassBytes(cls)); } } ast::ClassSetItem::Range(ref x) => { if self.flags().unicode() { let mut cls = self.pop().unwrap().unwrap_class_unicode(); cls.push(hir::ClassUnicodeRange::new(x.start.c, x.end.c)); self.push(HirFrame::ClassUnicode(cls)); } else { let mut cls = self.pop().unwrap().unwrap_class_bytes(); let start = self.class_literal_byte(&x.start)?; let end = self.class_literal_byte(&x.end)?; cls.push(hir::ClassBytesRange::new(start, end)); self.push(HirFrame::ClassBytes(cls)); } } ast::ClassSetItem::Ascii(ref x) => { if self.flags().unicode() { let xcls = self.hir_ascii_unicode_class(x)?; let mut cls = self.pop().unwrap().unwrap_class_unicode(); cls.union(&xcls); self.push(HirFrame::ClassUnicode(cls)); } else { let xcls = self.hir_ascii_byte_class(x)?; let mut cls = 
self.pop().unwrap().unwrap_class_bytes(); cls.union(&xcls); self.push(HirFrame::ClassBytes(cls)); } } ast::ClassSetItem::Unicode(ref x) => { let xcls = self.hir_unicode_class(x)?; let mut cls = self.pop().unwrap().unwrap_class_unicode(); cls.union(&xcls); self.push(HirFrame::ClassUnicode(cls)); } ast::ClassSetItem::Perl(ref x) => { if self.flags().unicode() { let xcls = self.hir_perl_unicode_class(x)?; let mut cls = self.pop().unwrap().unwrap_class_unicode(); cls.union(&xcls); self.push(HirFrame::ClassUnicode(cls)); } else { let xcls = self.hir_perl_byte_class(x)?; let mut cls = self.pop().unwrap().unwrap_class_bytes(); cls.union(&xcls); self.push(HirFrame::ClassBytes(cls)); } } ast::ClassSetItem::Bracketed(ref ast) => { if self.flags().unicode() { let mut cls1 = self.pop().unwrap().unwrap_class_unicode(); self.unicode_fold_and_negate( &ast.span, ast.negated, &mut cls1, )?; let mut cls2 = self.pop().unwrap().unwrap_class_unicode(); cls2.union(&cls1); self.push(HirFrame::ClassUnicode(cls2)); } else { let mut cls1 = self.pop().unwrap().unwrap_class_bytes(); self.bytes_fold_and_negate( &ast.span, ast.negated, &mut cls1, )?; let mut cls2 = self.pop().unwrap().unwrap_class_bytes(); cls2.union(&cls1); self.push(HirFrame::ClassBytes(cls2)); } } // This is handled automatically by the visitor. 
ast::ClassSetItem::Union(_) => {} } Ok(()) } fn visit_class_set_binary_op_pre( &mut self, _op: &ast::ClassSetBinaryOp, ) -> Result<()> { if self.flags().unicode() { let cls = hir::ClassUnicode::empty(); self.push(HirFrame::ClassUnicode(cls)); } else { let cls = hir::ClassBytes::empty(); self.push(HirFrame::ClassBytes(cls)); } Ok(()) } fn visit_class_set_binary_op_in( &mut self, _op: &ast::ClassSetBinaryOp, ) -> Result<()> { if self.flags().unicode() { let cls = hir::ClassUnicode::empty(); self.push(HirFrame::ClassUnicode(cls)); } else { let cls = hir::ClassBytes::empty(); self.push(HirFrame::ClassBytes(cls)); } Ok(()) } fn visit_class_set_binary_op_post( &mut self, op: &ast::ClassSetBinaryOp, ) -> Result<()> { use crate::ast::ClassSetBinaryOpKind::*; if self.flags().unicode() { let mut rhs = self.pop().unwrap().unwrap_class_unicode(); let mut lhs = self.pop().unwrap().unwrap_class_unicode(); let mut cls = self.pop().unwrap().unwrap_class_unicode(); if self.flags().case_insensitive() { rhs.try_case_fold_simple().map_err(|_| { self.error( op.rhs.span().clone(), ErrorKind::UnicodeCaseUnavailable, ) })?; lhs.try_case_fold_simple().map_err(|_| { self.error( op.lhs.span().clone(), ErrorKind::UnicodeCaseUnavailable, ) })?; } match op.kind { Intersection => lhs.intersect(&rhs), Difference => lhs.difference(&rhs), SymmetricDifference => lhs.symmetric_difference(&rhs), } cls.union(&lhs); self.push(HirFrame::ClassUnicode(cls)); } else { let mut rhs = self.pop().unwrap().unwrap_class_bytes(); let mut lhs = self.pop().unwrap().unwrap_class_bytes(); let mut cls = self.pop().unwrap().unwrap_class_bytes(); if self.flags().case_insensitive() { rhs.case_fold_simple(); lhs.case_fold_simple(); } match op.kind { Intersection => lhs.intersect(&rhs), Difference => lhs.difference(&rhs), SymmetricDifference => lhs.symmetric_difference(&rhs), } cls.union(&lhs); self.push(HirFrame::ClassBytes(cls)); } Ok(()) } } /// The internal implementation of a translator. 
/// /// This type is responsible for carrying around the original pattern string, /// which is not tied to the internal state of a translator. /// /// A TranslatorI exists for the time it takes to translate a single Ast. #[derive(Clone, Debug)] struct TranslatorI<'t, 'p> { trans: &'t Translator, pattern: &'p str, } impl<'t, 'p> TranslatorI<'t, 'p> { /// Build a new internal translator. fn new(trans: &'t Translator, pattern: &'p str) -> TranslatorI<'t, 'p> { TranslatorI { trans, pattern } } /// Return a reference to the underlying translator. fn trans(&self) -> &Translator { &self.trans } /// Push the given frame on to the call stack. fn push(&self, frame: HirFrame) { self.trans().stack.borrow_mut().push(frame); } /// Push the given literal char on to the call stack. /// /// If the top-most element of the stack is a literal, then the char /// is appended to the end of that literal. Otherwise, a new literal /// containing just the given char is pushed to the top of the stack. fn push_char(&self, ch: char) { let mut buf = [0; 4]; let bytes = ch.encode_utf8(&mut buf).as_bytes(); let mut stack = self.trans().stack.borrow_mut(); if let Some(HirFrame::Literal(ref mut literal)) = stack.last_mut() { literal.extend_from_slice(bytes); } else { stack.push(HirFrame::Literal(bytes.to_vec())); } } /// Push the given literal byte on to the call stack. /// /// If the top-most element of the stack is a literal, then the byte /// is appended to the end of that literal. Otherwise, a new literal /// containing just the given byte is pushed to the top of the stack. fn push_byte(&self, byte: u8) { let mut stack = self.trans().stack.borrow_mut(); if let Some(HirFrame::Literal(ref mut literal)) = stack.last_mut() { literal.push(byte); } else { stack.push(HirFrame::Literal(vec![byte])); } } /// Pop the top of the call stack. If the call stack is empty, return None. 
fn pop(&self) -> Option { self.trans().stack.borrow_mut().pop() } /// Pop an HIR expression from the top of the stack for a concatenation. /// /// This returns None if the stack is empty or when a concat frame is seen. /// Otherwise, it panics if it could not find an HIR expression. fn pop_concat_expr(&self) -> Option { let frame = self.pop()?; match frame { HirFrame::Concat => None, HirFrame::Expr(expr) => Some(expr), HirFrame::Literal(lit) => Some(Hir::literal(lit)), HirFrame::ClassUnicode(_) => { unreachable!("expected expr or concat, got Unicode class") } HirFrame::ClassBytes(_) => { unreachable!("expected expr or concat, got byte class") } HirFrame::Repetition => { unreachable!("expected expr or concat, got repetition") } HirFrame::Group { .. } => { unreachable!("expected expr or concat, got group") } HirFrame::Alternation => { unreachable!("expected expr or concat, got alt marker") } HirFrame::AlternationBranch => { unreachable!("expected expr or concat, got alt branch marker") } } } /// Pop an HIR expression from the top of the stack for an alternation. /// /// This returns None if the stack is empty or when an alternation frame is /// seen. Otherwise, it panics if it could not find an HIR expression. fn pop_alt_expr(&self) -> Option { let frame = self.pop()?; match frame { HirFrame::Alternation => None, HirFrame::Expr(expr) => Some(expr), HirFrame::Literal(lit) => Some(Hir::literal(lit)), HirFrame::ClassUnicode(_) => { unreachable!("expected expr or alt, got Unicode class") } HirFrame::ClassBytes(_) => { unreachable!("expected expr or alt, got byte class") } HirFrame::Repetition => { unreachable!("expected expr or alt, got repetition") } HirFrame::Group { .. } => { unreachable!("expected expr or alt, got group") } HirFrame::Concat => { unreachable!("expected expr or alt, got concat marker") } HirFrame::AlternationBranch => { unreachable!("expected expr or alt, got alt branch marker") } } } /// Create a new error with the given span and error type. 
fn error(&self, span: Span, kind: ErrorKind) -> Error { Error { kind, pattern: self.pattern.to_string(), span } } /// Return a copy of the active flags. fn flags(&self) -> Flags { self.trans().flags.get() } /// Set the flags of this translator from the flags set in the given AST. /// Then, return the old flags. fn set_flags(&self, ast_flags: &ast::Flags) -> Flags { let old_flags = self.flags(); let mut new_flags = Flags::from_ast(ast_flags); new_flags.merge(&old_flags); self.trans().flags.set(new_flags); old_flags } /// Convert an Ast literal to its scalar representation. /// /// When Unicode mode is enabled, then this always succeeds and returns a /// `char` (Unicode scalar value). /// /// When Unicode mode is disabled, then a `char` will still be returned /// whenever possible. A byte is returned only when invalid UTF-8 is /// allowed and when the byte is not ASCII. Otherwise, a non-ASCII byte /// will result in an error when invalid UTF-8 is not allowed. fn ast_literal_to_scalar( &self, lit: &ast::Literal, ) -> Result> { if self.flags().unicode() { return Ok(Either::Left(lit.c)); } let byte = match lit.byte() { None => return Ok(Either::Left(lit.c)), Some(byte) => byte, }; if byte <= 0x7F { return Ok(Either::Left(char::try_from(byte).unwrap())); } if self.trans().utf8 { return Err(self.error(lit.span, ErrorKind::InvalidUtf8)); } Ok(Either::Right(byte)) } fn case_fold_char(&self, span: Span, c: char) -> Result> { if !self.flags().case_insensitive() { return Ok(None); } if self.flags().unicode() { // If case folding won't do anything, then don't bother trying. 
let map = unicode::SimpleCaseFolder::new() .map(|f| f.overlaps(c, c)) .map_err(|_| { self.error(span, ErrorKind::UnicodeCaseUnavailable) })?; if !map { return Ok(None); } let mut cls = hir::ClassUnicode::new(vec![hir::ClassUnicodeRange::new( c, c, )]); cls.try_case_fold_simple().map_err(|_| { self.error(span, ErrorKind::UnicodeCaseUnavailable) })?; Ok(Some(Hir::class(hir::Class::Unicode(cls)))) } else { if c.len_utf8() > 1 { return Err(self.error(span, ErrorKind::UnicodeNotAllowed)); } // If case folding won't do anything, then don't bother trying. match c { 'A'..='Z' | 'a'..='z' => {} _ => return Ok(None), } let mut cls = hir::ClassBytes::new(vec![hir::ClassBytesRange::new( // OK because 'c.len_utf8() == 1' which in turn implies // that 'c' is ASCII. u8::try_from(c).unwrap(), u8::try_from(c).unwrap(), )]); cls.case_fold_simple(); Ok(Some(Hir::class(hir::Class::Bytes(cls)))) } } fn hir_dot(&self, span: Span) -> Result { let (utf8, lineterm, flags) = (self.trans().utf8, self.trans().line_terminator, self.flags()); if utf8 && (!flags.unicode() || !lineterm.is_ascii()) { return Err(self.error(span, ErrorKind::InvalidUtf8)); } let dot = if flags.dot_matches_new_line() { if flags.unicode() { hir::Dot::AnyChar } else { hir::Dot::AnyByte } } else { if flags.unicode() { if flags.crlf() { hir::Dot::AnyCharExceptCRLF } else { if !lineterm.is_ascii() { return Err( self.error(span, ErrorKind::InvalidLineTerminator) ); } hir::Dot::AnyCharExcept(char::from(lineterm)) } } else { if flags.crlf() { hir::Dot::AnyByteExceptCRLF } else { hir::Dot::AnyByteExcept(lineterm) } } }; Ok(Hir::dot(dot)) } fn hir_assertion(&self, asst: &ast::Assertion) -> Result { let unicode = self.flags().unicode(); let multi_line = self.flags().multi_line(); let crlf = self.flags().crlf(); Ok(match asst.kind { ast::AssertionKind::StartLine => Hir::look(if multi_line { if crlf { hir::Look::StartCRLF } else { hir::Look::StartLF } } else { hir::Look::Start }), ast::AssertionKind::EndLine => Hir::look(if 
multi_line { if crlf { hir::Look::EndCRLF } else { hir::Look::EndLF } } else { hir::Look::End }), ast::AssertionKind::StartText => Hir::look(hir::Look::Start), ast::AssertionKind::EndText => Hir::look(hir::Look::End), ast::AssertionKind::WordBoundary => Hir::look(if unicode { hir::Look::WordUnicode } else { hir::Look::WordAscii }), ast::AssertionKind::NotWordBoundary => Hir::look(if unicode { hir::Look::WordUnicodeNegate } else { hir::Look::WordAsciiNegate }), }) } fn hir_capture(&self, group: &ast::Group, expr: Hir) -> Hir { let (index, name) = match group.kind { ast::GroupKind::CaptureIndex(index) => (index, None), ast::GroupKind::CaptureName { ref name, .. } => { (name.index, Some(name.name.clone().into_boxed_str())) } // The HIR doesn't need to use non-capturing groups, since the way // in which the data type is defined handles this automatically. ast::GroupKind::NonCapturing(_) => return expr, }; Hir::capture(hir::Capture { index, name, sub: Box::new(expr) }) } fn hir_repetition(&self, rep: &ast::Repetition, expr: Hir) -> Hir { let (min, max) = match rep.op.kind { ast::RepetitionKind::ZeroOrOne => (0, Some(1)), ast::RepetitionKind::ZeroOrMore => (0, None), ast::RepetitionKind::OneOrMore => (1, None), ast::RepetitionKind::Range(ast::RepetitionRange::Exactly(m)) => { (m, Some(m)) } ast::RepetitionKind::Range(ast::RepetitionRange::AtLeast(m)) => { (m, None) } ast::RepetitionKind::Range(ast::RepetitionRange::Bounded( m, n, )) => (m, Some(n)), }; let greedy = if self.flags().swap_greed() { !rep.greedy } else { rep.greedy }; Hir::repetition(hir::Repetition { min, max, greedy, sub: Box::new(expr), }) } fn hir_unicode_class( &self, ast_class: &ast::ClassUnicode, ) -> Result { use crate::ast::ClassUnicodeKind::*; if !self.flags().unicode() { return Err( self.error(ast_class.span, ErrorKind::UnicodeNotAllowed) ); } let query = match ast_class.kind { OneLetter(name) => ClassQuery::OneLetter(name), Named(ref name) => ClassQuery::Binary(name), NamedValue { ref name, ref 
value, .. } => ClassQuery::ByValue { property_name: name, property_value: value, }, }; let mut result = self.convert_unicode_class_error( &ast_class.span, unicode::class(query), ); if let Ok(ref mut class) = result { self.unicode_fold_and_negate( &ast_class.span, ast_class.negated, class, )?; } result } fn hir_ascii_unicode_class( &self, ast: &ast::ClassAscii, ) -> Result { let mut cls = hir::ClassUnicode::new( ascii_class_as_chars(&ast.kind) .map(|(s, e)| hir::ClassUnicodeRange::new(s, e)), ); self.unicode_fold_and_negate(&ast.span, ast.negated, &mut cls)?; Ok(cls) } fn hir_ascii_byte_class( &self, ast: &ast::ClassAscii, ) -> Result { let mut cls = hir::ClassBytes::new( ascii_class(&ast.kind) .map(|(s, e)| hir::ClassBytesRange::new(s, e)), ); self.bytes_fold_and_negate(&ast.span, ast.negated, &mut cls)?; Ok(cls) } fn hir_perl_unicode_class( &self, ast_class: &ast::ClassPerl, ) -> Result { use crate::ast::ClassPerlKind::*; assert!(self.flags().unicode()); let result = match ast_class.kind { Digit => unicode::perl_digit(), Space => unicode::perl_space(), Word => unicode::perl_word(), }; let mut class = self.convert_unicode_class_error(&ast_class.span, result)?; // We needn't apply case folding here because the Perl Unicode classes // are already closed under Unicode simple case folding. if ast_class.negated { class.negate(); } Ok(class) } fn hir_perl_byte_class( &self, ast_class: &ast::ClassPerl, ) -> Result { use crate::ast::ClassPerlKind::*; assert!(!self.flags().unicode()); let mut class = match ast_class.kind { Digit => hir_ascii_class_bytes(&ast::ClassAsciiKind::Digit), Space => hir_ascii_class_bytes(&ast::ClassAsciiKind::Space), Word => hir_ascii_class_bytes(&ast::ClassAsciiKind::Word), }; // We needn't apply case folding here because the Perl ASCII classes // are already closed (under ASCII case folding). if ast_class.negated { class.negate(); } // Negating a Perl byte class is likely to cause it to match invalid // UTF-8. 
That's only OK if the translator is configured to allow such // things. if self.trans().utf8 && !class.is_ascii() { return Err(self.error(ast_class.span, ErrorKind::InvalidUtf8)); } Ok(class) } /// Converts the given Unicode specific error to an HIR translation error. /// /// The span given should approximate the position at which an error would /// occur. fn convert_unicode_class_error( &self, span: &Span, result: core::result::Result, ) -> Result { result.map_err(|err| { let sp = span.clone(); match err { unicode::Error::PropertyNotFound => { self.error(sp, ErrorKind::UnicodePropertyNotFound) } unicode::Error::PropertyValueNotFound => { self.error(sp, ErrorKind::UnicodePropertyValueNotFound) } unicode::Error::PerlClassNotFound => { self.error(sp, ErrorKind::UnicodePerlClassNotFound) } } }) } fn unicode_fold_and_negate( &self, span: &Span, negated: bool, class: &mut hir::ClassUnicode, ) -> Result<()> { // Note that we must apply case folding before negation! // Consider `(?i)[^x]`. If we applied negation first, then // the result would be the character class that matched any // Unicode scalar value. if self.flags().case_insensitive() { class.try_case_fold_simple().map_err(|_| { self.error(span.clone(), ErrorKind::UnicodeCaseUnavailable) })?; } if negated { class.negate(); } Ok(()) } fn bytes_fold_and_negate( &self, span: &Span, negated: bool, class: &mut hir::ClassBytes, ) -> Result<()> { // Note that we must apply case folding before negation! // Consider `(?i)[^x]`. If we applied negation first, then // the result would be the character class that matched any // Unicode scalar value. if self.flags().case_insensitive() { class.case_fold_simple(); } if negated { class.negate(); } if self.trans().utf8 && !class.is_ascii() { return Err(self.error(span.clone(), ErrorKind::InvalidUtf8)); } Ok(()) } /// Return a scalar byte value suitable for use as a literal in a byte /// character class. 
fn class_literal_byte(&self, ast: &ast::Literal) -> Result { match self.ast_literal_to_scalar(ast)? { Either::Right(byte) => Ok(byte), Either::Left(ch) => { let cp = u32::from(ch); if cp <= 0x7F { Ok(u8::try_from(cp).unwrap()) } else { // We can't feasibly support Unicode in // byte oriented classes. Byte classes don't // do Unicode case folding. Err(self.error(ast.span, ErrorKind::UnicodeNotAllowed)) } } } } } /// A translator's representation of a regular expression's flags at any given /// moment in time. /// /// Each flag can be in one of three states: absent, present but disabled or /// present but enabled. #[derive(Clone, Copy, Debug, Default)] struct Flags { case_insensitive: Option, multi_line: Option, dot_matches_new_line: Option, swap_greed: Option, unicode: Option, crlf: Option, // Note that `ignore_whitespace` is omitted here because it is handled // entirely in the parser. } impl Flags { fn from_ast(ast: &ast::Flags) -> Flags { let mut flags = Flags::default(); let mut enable = true; for item in &ast.items { match item.kind { ast::FlagsItemKind::Negation => { enable = false; } ast::FlagsItemKind::Flag(ast::Flag::CaseInsensitive) => { flags.case_insensitive = Some(enable); } ast::FlagsItemKind::Flag(ast::Flag::MultiLine) => { flags.multi_line = Some(enable); } ast::FlagsItemKind::Flag(ast::Flag::DotMatchesNewLine) => { flags.dot_matches_new_line = Some(enable); } ast::FlagsItemKind::Flag(ast::Flag::SwapGreed) => { flags.swap_greed = Some(enable); } ast::FlagsItemKind::Flag(ast::Flag::Unicode) => { flags.unicode = Some(enable); } ast::FlagsItemKind::Flag(ast::Flag::CRLF) => { flags.crlf = Some(enable); } ast::FlagsItemKind::Flag(ast::Flag::IgnoreWhitespace) => {} } } flags } fn merge(&mut self, previous: &Flags) { if self.case_insensitive.is_none() { self.case_insensitive = previous.case_insensitive; } if self.multi_line.is_none() { self.multi_line = previous.multi_line; } if self.dot_matches_new_line.is_none() { self.dot_matches_new_line = 
previous.dot_matches_new_line; } if self.swap_greed.is_none() { self.swap_greed = previous.swap_greed; } if self.unicode.is_none() { self.unicode = previous.unicode; } if self.crlf.is_none() { self.crlf = previous.crlf; } } fn case_insensitive(&self) -> bool { self.case_insensitive.unwrap_or(false) } fn multi_line(&self) -> bool { self.multi_line.unwrap_or(false) } fn dot_matches_new_line(&self) -> bool { self.dot_matches_new_line.unwrap_or(false) } fn swap_greed(&self) -> bool { self.swap_greed.unwrap_or(false) } fn unicode(&self) -> bool { self.unicode.unwrap_or(true) } fn crlf(&self) -> bool { self.crlf.unwrap_or(false) } } fn hir_ascii_class_bytes(kind: &ast::ClassAsciiKind) -> hir::ClassBytes { let ranges: Vec<_> = ascii_class(kind) .map(|(s, e)| hir::ClassBytesRange::new(s, e)) .collect(); hir::ClassBytes::new(ranges) } fn ascii_class(kind: &ast::ClassAsciiKind) -> impl Iterator { use crate::ast::ClassAsciiKind::*; let slice: &'static [(u8, u8)] = match *kind { Alnum => &[(b'0', b'9'), (b'A', b'Z'), (b'a', b'z')], Alpha => &[(b'A', b'Z'), (b'a', b'z')], Ascii => &[(b'\x00', b'\x7F')], Blank => &[(b'\t', b'\t'), (b' ', b' ')], Cntrl => &[(b'\x00', b'\x1F'), (b'\x7F', b'\x7F')], Digit => &[(b'0', b'9')], Graph => &[(b'!', b'~')], Lower => &[(b'a', b'z')], Print => &[(b' ', b'~')], Punct => &[(b'!', b'/'), (b':', b'@'), (b'[', b'`'), (b'{', b'~')], Space => &[ (b'\t', b'\t'), (b'\n', b'\n'), (b'\x0B', b'\x0B'), (b'\x0C', b'\x0C'), (b'\r', b'\r'), (b' ', b' '), ], Upper => &[(b'A', b'Z')], Word => &[(b'0', b'9'), (b'A', b'Z'), (b'_', b'_'), (b'a', b'z')], Xdigit => &[(b'0', b'9'), (b'A', b'F'), (b'a', b'f')], }; slice.iter().copied() } fn ascii_class_as_chars( kind: &ast::ClassAsciiKind, ) -> impl Iterator { ascii_class(kind).map(|(s, e)| (char::from(s), char::from(e))) } #[cfg(test)] mod tests { use crate::{ ast::{self, parse::ParserBuilder, Ast, Position, Span}, hir::{self, Hir, HirKind, Look, Properties}, unicode::{self, ClassQuery}, }; use super::*; // We 
create these errors to compare with real hir::Errors in the tests. // We define equality between TestError and hir::Error to disregard the // pattern string in hir::Error, which is annoying to provide in tests. #[derive(Clone, Debug)] struct TestError { span: Span, kind: hir::ErrorKind, } impl PartialEq for TestError { fn eq(&self, other: &hir::Error) -> bool { self.span == other.span && self.kind == other.kind } } impl PartialEq for hir::Error { fn eq(&self, other: &TestError) -> bool { self.span == other.span && self.kind == other.kind } } fn parse(pattern: &str) -> Ast { ParserBuilder::new().octal(true).build().parse(pattern).unwrap() } fn t(pattern: &str) -> Hir { TranslatorBuilder::new() .utf8(true) .build() .translate(pattern, &parse(pattern)) .unwrap() } fn t_err(pattern: &str) -> hir::Error { TranslatorBuilder::new() .utf8(true) .build() .translate(pattern, &parse(pattern)) .unwrap_err() } fn t_bytes(pattern: &str) -> Hir { TranslatorBuilder::new() .utf8(false) .build() .translate(pattern, &parse(pattern)) .unwrap() } fn props(pattern: &str) -> Properties { t(pattern).properties().clone() } fn props_bytes(pattern: &str) -> Properties { t_bytes(pattern).properties().clone() } fn hir_lit(s: &str) -> Hir { hir_blit(s.as_bytes()) } fn hir_blit(s: &[u8]) -> Hir { Hir::literal(s) } fn hir_capture(index: u32, expr: Hir) -> Hir { Hir::capture(hir::Capture { index, name: None, sub: Box::new(expr) }) } fn hir_capture_name(index: u32, name: &str, expr: Hir) -> Hir { Hir::capture(hir::Capture { index, name: Some(name.into()), sub: Box::new(expr), }) } fn hir_quest(greedy: bool, expr: Hir) -> Hir { Hir::repetition(hir::Repetition { min: 0, max: Some(1), greedy, sub: Box::new(expr), }) } fn hir_star(greedy: bool, expr: Hir) -> Hir { Hir::repetition(hir::Repetition { min: 0, max: None, greedy, sub: Box::new(expr), }) } fn hir_plus(greedy: bool, expr: Hir) -> Hir { Hir::repetition(hir::Repetition { min: 1, max: None, greedy, sub: Box::new(expr), }) } fn hir_range(greedy: 
bool, min: u32, max: Option, expr: Hir) -> Hir { Hir::repetition(hir::Repetition { min, max, greedy, sub: Box::new(expr), }) } fn hir_alt(alts: Vec) -> Hir { Hir::alternation(alts) } fn hir_cat(exprs: Vec) -> Hir { Hir::concat(exprs) } #[allow(dead_code)] fn hir_uclass_query(query: ClassQuery<'_>) -> Hir { Hir::class(hir::Class::Unicode(unicode::class(query).unwrap())) } #[allow(dead_code)] fn hir_uclass_perl_word() -> Hir { Hir::class(hir::Class::Unicode(unicode::perl_word().unwrap())) } fn hir_ascii_uclass(kind: &ast::ClassAsciiKind) -> Hir { Hir::class(hir::Class::Unicode(hir::ClassUnicode::new( ascii_class_as_chars(kind) .map(|(s, e)| hir::ClassUnicodeRange::new(s, e)), ))) } fn hir_ascii_bclass(kind: &ast::ClassAsciiKind) -> Hir { Hir::class(hir::Class::Bytes(hir::ClassBytes::new( ascii_class(kind).map(|(s, e)| hir::ClassBytesRange::new(s, e)), ))) } fn hir_uclass(ranges: &[(char, char)]) -> Hir { Hir::class(uclass(ranges)) } fn hir_bclass(ranges: &[(u8, u8)]) -> Hir { Hir::class(bclass(ranges)) } fn hir_case_fold(expr: Hir) -> Hir { match expr.into_kind() { HirKind::Class(mut cls) => { cls.case_fold_simple(); Hir::class(cls) } _ => panic!("cannot case fold non-class Hir expr"), } } fn hir_negate(expr: Hir) -> Hir { match expr.into_kind() { HirKind::Class(mut cls) => { cls.negate(); Hir::class(cls) } _ => panic!("cannot negate non-class Hir expr"), } } fn uclass(ranges: &[(char, char)]) -> hir::Class { let ranges: Vec = ranges .iter() .map(|&(s, e)| hir::ClassUnicodeRange::new(s, e)) .collect(); hir::Class::Unicode(hir::ClassUnicode::new(ranges)) } fn bclass(ranges: &[(u8, u8)]) -> hir::Class { let ranges: Vec = ranges .iter() .map(|&(s, e)| hir::ClassBytesRange::new(s, e)) .collect(); hir::Class::Bytes(hir::ClassBytes::new(ranges)) } #[cfg(feature = "unicode-case")] fn class_case_fold(mut cls: hir::Class) -> Hir { cls.case_fold_simple(); Hir::class(cls) } fn class_negate(mut cls: hir::Class) -> Hir { cls.negate(); Hir::class(cls) } #[allow(dead_code)] fn 
hir_union(expr1: Hir, expr2: Hir) -> Hir { use crate::hir::Class::{Bytes, Unicode}; match (expr1.into_kind(), expr2.into_kind()) { (HirKind::Class(Unicode(mut c1)), HirKind::Class(Unicode(c2))) => { c1.union(&c2); Hir::class(hir::Class::Unicode(c1)) } (HirKind::Class(Bytes(mut c1)), HirKind::Class(Bytes(c2))) => { c1.union(&c2); Hir::class(hir::Class::Bytes(c1)) } _ => panic!("cannot union non-class Hir exprs"), } } #[allow(dead_code)] fn hir_difference(expr1: Hir, expr2: Hir) -> Hir { use crate::hir::Class::{Bytes, Unicode}; match (expr1.into_kind(), expr2.into_kind()) { (HirKind::Class(Unicode(mut c1)), HirKind::Class(Unicode(c2))) => { c1.difference(&c2); Hir::class(hir::Class::Unicode(c1)) } (HirKind::Class(Bytes(mut c1)), HirKind::Class(Bytes(c2))) => { c1.difference(&c2); Hir::class(hir::Class::Bytes(c1)) } _ => panic!("cannot difference non-class Hir exprs"), } } fn hir_look(look: hir::Look) -> Hir { Hir::look(look) } #[test] fn empty() { assert_eq!(t(""), Hir::empty()); assert_eq!(t("(?i)"), Hir::empty()); assert_eq!(t("()"), hir_capture(1, Hir::empty())); assert_eq!(t("(?:)"), Hir::empty()); assert_eq!(t("(?P)"), hir_capture_name(1, "wat", Hir::empty())); assert_eq!(t("|"), hir_alt(vec![Hir::empty(), Hir::empty()])); assert_eq!( t("()|()"), hir_alt(vec![ hir_capture(1, Hir::empty()), hir_capture(2, Hir::empty()), ]) ); assert_eq!( t("(|b)"), hir_capture(1, hir_alt(vec![Hir::empty(), hir_lit("b"),])) ); assert_eq!( t("(a|)"), hir_capture(1, hir_alt(vec![hir_lit("a"), Hir::empty(),])) ); assert_eq!( t("(a||c)"), hir_capture( 1, hir_alt(vec![hir_lit("a"), Hir::empty(), hir_lit("c"),]) ) ); assert_eq!( t("(||)"), hir_capture( 1, hir_alt(vec![Hir::empty(), Hir::empty(), Hir::empty(),]) ) ); } #[test] fn literal() { assert_eq!(t("a"), hir_lit("a")); assert_eq!(t("(?-u)a"), hir_lit("a")); assert_eq!(t("☃"), hir_lit("☃")); assert_eq!(t("abcd"), hir_lit("abcd")); assert_eq!(t_bytes("(?-u)a"), hir_lit("a")); assert_eq!(t_bytes("(?-u)\x61"), hir_lit("a")); 
assert_eq!(t_bytes(r"(?-u)\x61"), hir_lit("a")); assert_eq!(t_bytes(r"(?-u)\xFF"), hir_blit(b"\xFF")); assert_eq!( t_err("(?-u)☃"), TestError { kind: hir::ErrorKind::UnicodeNotAllowed, span: Span::new( Position::new(5, 1, 6), Position::new(8, 1, 7) ), } ); assert_eq!( t_err(r"(?-u)\xFF"), TestError { kind: hir::ErrorKind::InvalidUtf8, span: Span::new( Position::new(5, 1, 6), Position::new(9, 1, 10) ), } ); } #[test] fn literal_case_insensitive() { #[cfg(feature = "unicode-case")] assert_eq!(t("(?i)a"), hir_uclass(&[('A', 'A'), ('a', 'a'),])); #[cfg(feature = "unicode-case")] assert_eq!(t("(?i:a)"), hir_uclass(&[('A', 'A'), ('a', 'a')])); #[cfg(feature = "unicode-case")] assert_eq!( t("a(?i)a(?-i)a"), hir_cat(vec![ hir_lit("a"), hir_uclass(&[('A', 'A'), ('a', 'a')]), hir_lit("a"), ]) ); #[cfg(feature = "unicode-case")] assert_eq!( t("(?i)ab@c"), hir_cat(vec![ hir_uclass(&[('A', 'A'), ('a', 'a')]), hir_uclass(&[('B', 'B'), ('b', 'b')]), hir_lit("@"), hir_uclass(&[('C', 'C'), ('c', 'c')]), ]) ); #[cfg(feature = "unicode-case")] assert_eq!( t("(?i)β"), hir_uclass(&[('Β', 'Β'), ('β', 'β'), ('ϐ', 'ϐ'),]) ); assert_eq!(t("(?i-u)a"), hir_bclass(&[(b'A', b'A'), (b'a', b'a'),])); #[cfg(feature = "unicode-case")] assert_eq!( t("(?-u)a(?i)a(?-i)a"), hir_cat(vec![ hir_lit("a"), hir_bclass(&[(b'A', b'A'), (b'a', b'a')]), hir_lit("a"), ]) ); assert_eq!( t("(?i-u)ab@c"), hir_cat(vec![ hir_bclass(&[(b'A', b'A'), (b'a', b'a')]), hir_bclass(&[(b'B', b'B'), (b'b', b'b')]), hir_lit("@"), hir_bclass(&[(b'C', b'C'), (b'c', b'c')]), ]) ); assert_eq!( t_bytes("(?i-u)a"), hir_bclass(&[(b'A', b'A'), (b'a', b'a'),]) ); assert_eq!( t_bytes("(?i-u)\x61"), hir_bclass(&[(b'A', b'A'), (b'a', b'a'),]) ); assert_eq!( t_bytes(r"(?i-u)\x61"), hir_bclass(&[(b'A', b'A'), (b'a', b'a'),]) ); assert_eq!(t_bytes(r"(?i-u)\xFF"), hir_blit(b"\xFF")); assert_eq!( t_err("(?i-u)β"), TestError { kind: hir::ErrorKind::UnicodeNotAllowed, span: Span::new( Position::new(6, 1, 7), Position::new(8, 1, 8), ), } ); } 
#[test] fn dot() { assert_eq!( t("."), hir_uclass(&[('\0', '\t'), ('\x0B', '\u{10FFFF}')]) ); assert_eq!( t("(?R)."), hir_uclass(&[ ('\0', '\t'), ('\x0B', '\x0C'), ('\x0E', '\u{10FFFF}'), ]) ); assert_eq!(t("(?s)."), hir_uclass(&[('\0', '\u{10FFFF}')])); assert_eq!(t("(?Rs)."), hir_uclass(&[('\0', '\u{10FFFF}')])); assert_eq!( t_bytes("(?-u)."), hir_bclass(&[(b'\0', b'\t'), (b'\x0B', b'\xFF')]) ); assert_eq!( t_bytes("(?R-u)."), hir_bclass(&[ (b'\0', b'\t'), (b'\x0B', b'\x0C'), (b'\x0E', b'\xFF'), ]) ); assert_eq!(t_bytes("(?s-u)."), hir_bclass(&[(b'\0', b'\xFF'),])); assert_eq!(t_bytes("(?Rs-u)."), hir_bclass(&[(b'\0', b'\xFF'),])); // If invalid UTF-8 isn't allowed, then non-Unicode `.` isn't allowed. assert_eq!( t_err("(?-u)."), TestError { kind: hir::ErrorKind::InvalidUtf8, span: Span::new( Position::new(5, 1, 6), Position::new(6, 1, 7) ), } ); assert_eq!( t_err("(?R-u)."), TestError { kind: hir::ErrorKind::InvalidUtf8, span: Span::new( Position::new(6, 1, 7), Position::new(7, 1, 8) ), } ); assert_eq!( t_err("(?s-u)."), TestError { kind: hir::ErrorKind::InvalidUtf8, span: Span::new( Position::new(6, 1, 7), Position::new(7, 1, 8) ), } ); assert_eq!( t_err("(?Rs-u)."), TestError { kind: hir::ErrorKind::InvalidUtf8, span: Span::new( Position::new(7, 1, 8), Position::new(8, 1, 9) ), } ); } #[test] fn assertions() { assert_eq!(t("^"), hir_look(hir::Look::Start)); assert_eq!(t("$"), hir_look(hir::Look::End)); assert_eq!(t(r"\A"), hir_look(hir::Look::Start)); assert_eq!(t(r"\z"), hir_look(hir::Look::End)); assert_eq!(t("(?m)^"), hir_look(hir::Look::StartLF)); assert_eq!(t("(?m)$"), hir_look(hir::Look::EndLF)); assert_eq!(t(r"(?m)\A"), hir_look(hir::Look::Start)); assert_eq!(t(r"(?m)\z"), hir_look(hir::Look::End)); assert_eq!(t(r"\b"), hir_look(hir::Look::WordUnicode)); assert_eq!(t(r"\B"), hir_look(hir::Look::WordUnicodeNegate)); assert_eq!(t(r"(?-u)\b"), hir_look(hir::Look::WordAscii)); assert_eq!(t(r"(?-u)\B"), hir_look(hir::Look::WordAsciiNegate)); } #[test] fn 
group() { assert_eq!(t("(a)"), hir_capture(1, hir_lit("a"))); assert_eq!( t("(a)(b)"), hir_cat(vec![ hir_capture(1, hir_lit("a")), hir_capture(2, hir_lit("b")), ]) ); assert_eq!( t("(a)|(b)"), hir_alt(vec![ hir_capture(1, hir_lit("a")), hir_capture(2, hir_lit("b")), ]) ); assert_eq!(t("(?P)"), hir_capture_name(1, "foo", Hir::empty())); assert_eq!(t("(?Pa)"), hir_capture_name(1, "foo", hir_lit("a"))); assert_eq!( t("(?Pa)(?Pb)"), hir_cat(vec![ hir_capture_name(1, "foo", hir_lit("a")), hir_capture_name(2, "bar", hir_lit("b")), ]) ); assert_eq!(t("(?:)"), Hir::empty()); assert_eq!(t("(?:a)"), hir_lit("a")); assert_eq!( t("(?:a)(b)"), hir_cat(vec![hir_lit("a"), hir_capture(1, hir_lit("b")),]) ); assert_eq!( t("(a)(?:b)(c)"), hir_cat(vec![ hir_capture(1, hir_lit("a")), hir_lit("b"), hir_capture(2, hir_lit("c")), ]) ); assert_eq!( t("(a)(?Pb)(c)"), hir_cat(vec![ hir_capture(1, hir_lit("a")), hir_capture_name(2, "foo", hir_lit("b")), hir_capture(3, hir_lit("c")), ]) ); assert_eq!(t("()"), hir_capture(1, Hir::empty())); assert_eq!(t("((?i))"), hir_capture(1, Hir::empty())); assert_eq!(t("((?x))"), hir_capture(1, Hir::empty())); assert_eq!( t("(((?x)))"), hir_capture(1, hir_capture(2, Hir::empty())) ); } #[test] fn line_anchors() { assert_eq!(t("^"), hir_look(hir::Look::Start)); assert_eq!(t("$"), hir_look(hir::Look::End)); assert_eq!(t(r"\A"), hir_look(hir::Look::Start)); assert_eq!(t(r"\z"), hir_look(hir::Look::End)); assert_eq!(t(r"(?m)\A"), hir_look(hir::Look::Start)); assert_eq!(t(r"(?m)\z"), hir_look(hir::Look::End)); assert_eq!(t("(?m)^"), hir_look(hir::Look::StartLF)); assert_eq!(t("(?m)$"), hir_look(hir::Look::EndLF)); assert_eq!(t(r"(?R)\A"), hir_look(hir::Look::Start)); assert_eq!(t(r"(?R)\z"), hir_look(hir::Look::End)); assert_eq!(t("(?R)^"), hir_look(hir::Look::Start)); assert_eq!(t("(?R)$"), hir_look(hir::Look::End)); assert_eq!(t(r"(?Rm)\A"), hir_look(hir::Look::Start)); assert_eq!(t(r"(?Rm)\z"), hir_look(hir::Look::End)); assert_eq!(t("(?Rm)^"), 
hir_look(hir::Look::StartCRLF)); assert_eq!(t("(?Rm)$"), hir_look(hir::Look::EndCRLF)); } #[test] fn flags() { #[cfg(feature = "unicode-case")] assert_eq!( t("(?i:a)a"), hir_cat( vec![hir_uclass(&[('A', 'A'), ('a', 'a')]), hir_lit("a"),] ) ); assert_eq!( t("(?i-u:a)β"), hir_cat(vec![ hir_bclass(&[(b'A', b'A'), (b'a', b'a')]), hir_lit("β"), ]) ); assert_eq!( t("(?:(?i-u)a)b"), hir_cat(vec![ hir_bclass(&[(b'A', b'A'), (b'a', b'a')]), hir_lit("b"), ]) ); assert_eq!( t("((?i-u)a)b"), hir_cat(vec![ hir_capture(1, hir_bclass(&[(b'A', b'A'), (b'a', b'a')])), hir_lit("b"), ]) ); #[cfg(feature = "unicode-case")] assert_eq!( t("(?i)(?-i:a)a"), hir_cat( vec![hir_lit("a"), hir_uclass(&[('A', 'A'), ('a', 'a')]),] ) ); #[cfg(feature = "unicode-case")] assert_eq!( t("(?im)a^"), hir_cat(vec![ hir_uclass(&[('A', 'A'), ('a', 'a')]), hir_look(hir::Look::StartLF), ]) ); #[cfg(feature = "unicode-case")] assert_eq!( t("(?im)a^(?i-m)a^"), hir_cat(vec![ hir_uclass(&[('A', 'A'), ('a', 'a')]), hir_look(hir::Look::StartLF), hir_uclass(&[('A', 'A'), ('a', 'a')]), hir_look(hir::Look::Start), ]) ); assert_eq!( t("(?U)a*a*?(?-U)a*a*?"), hir_cat(vec![ hir_star(false, hir_lit("a")), hir_star(true, hir_lit("a")), hir_star(true, hir_lit("a")), hir_star(false, hir_lit("a")), ]) ); #[cfg(feature = "unicode-case")] assert_eq!( t("(?:a(?i)a)a"), hir_cat(vec![ hir_cat(vec![ hir_lit("a"), hir_uclass(&[('A', 'A'), ('a', 'a')]), ]), hir_lit("a"), ]) ); #[cfg(feature = "unicode-case")] assert_eq!( t("(?i)(?:a(?-i)a)a"), hir_cat(vec![ hir_cat(vec![ hir_uclass(&[('A', 'A'), ('a', 'a')]), hir_lit("a"), ]), hir_uclass(&[('A', 'A'), ('a', 'a')]), ]) ); } #[test] fn escape() { assert_eq!( t(r"\\\.\+\*\?\(\)\|\[\]\{\}\^\$\#"), hir_lit(r"\.+*?()|[]{}^$#") ); } #[test] fn repetition() { assert_eq!(t("a?"), hir_quest(true, hir_lit("a"))); assert_eq!(t("a*"), hir_star(true, hir_lit("a"))); assert_eq!(t("a+"), hir_plus(true, hir_lit("a"))); assert_eq!(t("a??"), hir_quest(false, hir_lit("a"))); assert_eq!(t("a*?"), 
hir_star(false, hir_lit("a"))); assert_eq!(t("a+?"), hir_plus(false, hir_lit("a"))); assert_eq!(t("a{1}"), hir_range(true, 1, Some(1), hir_lit("a"),)); assert_eq!(t("a{1,}"), hir_range(true, 1, None, hir_lit("a"),)); assert_eq!(t("a{1,2}"), hir_range(true, 1, Some(2), hir_lit("a"),)); assert_eq!(t("a{1}?"), hir_range(false, 1, Some(1), hir_lit("a"),)); assert_eq!(t("a{1,}?"), hir_range(false, 1, None, hir_lit("a"),)); assert_eq!(t("a{1,2}?"), hir_range(false, 1, Some(2), hir_lit("a"),)); assert_eq!( t("ab?"), hir_cat(vec![hir_lit("a"), hir_quest(true, hir_lit("b")),]) ); assert_eq!(t("(ab)?"), hir_quest(true, hir_capture(1, hir_lit("ab")))); assert_eq!( t("a|b?"), hir_alt(vec![hir_lit("a"), hir_quest(true, hir_lit("b")),]) ); } #[test] fn cat_alt() { let a = || hir_look(hir::Look::Start); let b = || hir_look(hir::Look::End); let c = || hir_look(hir::Look::WordUnicode); let d = || hir_look(hir::Look::WordUnicodeNegate); assert_eq!(t("(^$)"), hir_capture(1, hir_cat(vec![a(), b()]))); assert_eq!(t("^|$"), hir_alt(vec![a(), b()])); assert_eq!(t(r"^|$|\b"), hir_alt(vec![a(), b(), c()])); assert_eq!( t(r"^$|$\b|\b\B"), hir_alt(vec![ hir_cat(vec![a(), b()]), hir_cat(vec![b(), c()]), hir_cat(vec![c(), d()]), ]) ); assert_eq!(t("(^|$)"), hir_capture(1, hir_alt(vec![a(), b()]))); assert_eq!( t(r"(^|$|\b)"), hir_capture(1, hir_alt(vec![a(), b(), c()])) ); assert_eq!( t(r"(^$|$\b|\b\B)"), hir_capture( 1, hir_alt(vec![ hir_cat(vec![a(), b()]), hir_cat(vec![b(), c()]), hir_cat(vec![c(), d()]), ]) ) ); assert_eq!( t(r"(^$|($\b|(\b\B)))"), hir_capture( 1, hir_alt(vec![ hir_cat(vec![a(), b()]), hir_capture( 2, hir_alt(vec![ hir_cat(vec![b(), c()]), hir_capture(3, hir_cat(vec![c(), d()])), ]) ), ]) ) ); } // Tests the HIR transformation of things like '[a-z]|[A-Z]' into // '[A-Za-z]'. In other words, an alternation of just classes is always // equivalent to a single class corresponding to the union of the branches // in that class. 
(Unless some branches match invalid UTF-8 and others // match non-ASCII Unicode.) #[test] fn cat_class_flattened() { assert_eq!(t(r"[a-z]|[A-Z]"), hir_uclass(&[('A', 'Z'), ('a', 'z')])); // Combining all of the letter properties should give us the one giant // letter property. #[cfg(feature = "unicode-gencat")] assert_eq!( t(r"(?x) \p{Lowercase_Letter} |\p{Uppercase_Letter} |\p{Titlecase_Letter} |\p{Modifier_Letter} |\p{Other_Letter} "), hir_uclass_query(ClassQuery::Binary("letter")) ); // Byte classes that can truly match invalid UTF-8 cannot be combined // with Unicode classes. assert_eq!( t_bytes(r"[Δδ]|(?-u:[\x90-\xFF])|[Λλ]"), hir_alt(vec![ hir_uclass(&[('Δ', 'Δ'), ('δ', 'δ')]), hir_bclass(&[(b'\x90', b'\xFF')]), hir_uclass(&[('Λ', 'Λ'), ('λ', 'λ')]), ]) ); // Byte classes on their own can be combined, even if some are ASCII // and others are invalid UTF-8. assert_eq!( t_bytes(r"[a-z]|(?-u:[\x90-\xFF])|[A-Z]"), hir_bclass(&[(b'A', b'Z'), (b'a', b'z'), (b'\x90', b'\xFF')]), ); } #[test] fn class_ascii() { assert_eq!( t("[[:alnum:]]"), hir_ascii_uclass(&ast::ClassAsciiKind::Alnum) ); assert_eq!( t("[[:alpha:]]"), hir_ascii_uclass(&ast::ClassAsciiKind::Alpha) ); assert_eq!( t("[[:ascii:]]"), hir_ascii_uclass(&ast::ClassAsciiKind::Ascii) ); assert_eq!( t("[[:blank:]]"), hir_ascii_uclass(&ast::ClassAsciiKind::Blank) ); assert_eq!( t("[[:cntrl:]]"), hir_ascii_uclass(&ast::ClassAsciiKind::Cntrl) ); assert_eq!( t("[[:digit:]]"), hir_ascii_uclass(&ast::ClassAsciiKind::Digit) ); assert_eq!( t("[[:graph:]]"), hir_ascii_uclass(&ast::ClassAsciiKind::Graph) ); assert_eq!( t("[[:lower:]]"), hir_ascii_uclass(&ast::ClassAsciiKind::Lower) ); assert_eq!( t("[[:print:]]"), hir_ascii_uclass(&ast::ClassAsciiKind::Print) ); assert_eq!( t("[[:punct:]]"), hir_ascii_uclass(&ast::ClassAsciiKind::Punct) ); assert_eq!( t("[[:space:]]"), hir_ascii_uclass(&ast::ClassAsciiKind::Space) ); assert_eq!( t("[[:upper:]]"), hir_ascii_uclass(&ast::ClassAsciiKind::Upper) ); assert_eq!( 
t("[[:word:]]"), hir_ascii_uclass(&ast::ClassAsciiKind::Word) ); assert_eq!( t("[[:xdigit:]]"), hir_ascii_uclass(&ast::ClassAsciiKind::Xdigit) ); assert_eq!( t("[[:^lower:]]"), hir_negate(hir_ascii_uclass(&ast::ClassAsciiKind::Lower)) ); #[cfg(feature = "unicode-case")] assert_eq!( t("(?i)[[:lower:]]"), hir_uclass(&[ ('A', 'Z'), ('a', 'z'), ('\u{17F}', '\u{17F}'), ('\u{212A}', '\u{212A}'), ]) ); assert_eq!( t("(?-u)[[:lower:]]"), hir_ascii_bclass(&ast::ClassAsciiKind::Lower) ); assert_eq!( t("(?i-u)[[:lower:]]"), hir_case_fold(hir_ascii_bclass(&ast::ClassAsciiKind::Lower)) ); assert_eq!( t_err("(?-u)[[:^lower:]]"), TestError { kind: hir::ErrorKind::InvalidUtf8, span: Span::new( Position::new(6, 1, 7), Position::new(16, 1, 17) ), } ); assert_eq!( t_err("(?i-u)[[:^lower:]]"), TestError { kind: hir::ErrorKind::InvalidUtf8, span: Span::new( Position::new(7, 1, 8), Position::new(17, 1, 18) ), } ); } #[test] fn class_ascii_multiple() { // See: https://github.com/rust-lang/regex/issues/680 assert_eq!( t("[[:alnum:][:^ascii:]]"), hir_union( hir_ascii_uclass(&ast::ClassAsciiKind::Alnum), hir_uclass(&[('\u{80}', '\u{10FFFF}')]), ), ); assert_eq!( t_bytes("(?-u)[[:alnum:][:^ascii:]]"), hir_union( hir_ascii_bclass(&ast::ClassAsciiKind::Alnum), hir_bclass(&[(0x80, 0xFF)]), ), ); } #[test] #[cfg(feature = "unicode-perl")] fn class_perl_unicode() { // Unicode assert_eq!(t(r"\d"), hir_uclass_query(ClassQuery::Binary("digit"))); assert_eq!(t(r"\s"), hir_uclass_query(ClassQuery::Binary("space"))); assert_eq!(t(r"\w"), hir_uclass_perl_word()); #[cfg(feature = "unicode-case")] assert_eq!( t(r"(?i)\d"), hir_uclass_query(ClassQuery::Binary("digit")) ); #[cfg(feature = "unicode-case")] assert_eq!( t(r"(?i)\s"), hir_uclass_query(ClassQuery::Binary("space")) ); #[cfg(feature = "unicode-case")] assert_eq!(t(r"(?i)\w"), hir_uclass_perl_word()); // Unicode, negated assert_eq!( t(r"\D"), hir_negate(hir_uclass_query(ClassQuery::Binary("digit"))) ); assert_eq!( t(r"\S"), 
hir_negate(hir_uclass_query(ClassQuery::Binary("space"))) ); assert_eq!(t(r"\W"), hir_negate(hir_uclass_perl_word())); #[cfg(feature = "unicode-case")] assert_eq!( t(r"(?i)\D"), hir_negate(hir_uclass_query(ClassQuery::Binary("digit"))) ); #[cfg(feature = "unicode-case")] assert_eq!( t(r"(?i)\S"), hir_negate(hir_uclass_query(ClassQuery::Binary("space"))) ); #[cfg(feature = "unicode-case")] assert_eq!(t(r"(?i)\W"), hir_negate(hir_uclass_perl_word())); } #[test] fn class_perl_ascii() { // ASCII only assert_eq!( t(r"(?-u)\d"), hir_ascii_bclass(&ast::ClassAsciiKind::Digit) ); assert_eq!( t(r"(?-u)\s"), hir_ascii_bclass(&ast::ClassAsciiKind::Space) ); assert_eq!( t(r"(?-u)\w"), hir_ascii_bclass(&ast::ClassAsciiKind::Word) ); assert_eq!( t(r"(?i-u)\d"), hir_ascii_bclass(&ast::ClassAsciiKind::Digit) ); assert_eq!( t(r"(?i-u)\s"), hir_ascii_bclass(&ast::ClassAsciiKind::Space) ); assert_eq!( t(r"(?i-u)\w"), hir_ascii_bclass(&ast::ClassAsciiKind::Word) ); // ASCII only, negated assert_eq!( t_bytes(r"(?-u)\D"), hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Digit)) ); assert_eq!( t_bytes(r"(?-u)\S"), hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Space)) ); assert_eq!( t_bytes(r"(?-u)\W"), hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Word)) ); assert_eq!( t_bytes(r"(?i-u)\D"), hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Digit)) ); assert_eq!( t_bytes(r"(?i-u)\S"), hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Space)) ); assert_eq!( t_bytes(r"(?i-u)\W"), hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Word)) ); // ASCII only, negated, with UTF-8 mode enabled. // In this case, negating any Perl class results in an error because // all such classes can match invalid UTF-8. 
assert_eq!( t_err(r"(?-u)\D"), TestError { kind: hir::ErrorKind::InvalidUtf8, span: Span::new( Position::new(5, 1, 6), Position::new(7, 1, 8), ), }, ); assert_eq!( t_err(r"(?-u)\S"), TestError { kind: hir::ErrorKind::InvalidUtf8, span: Span::new( Position::new(5, 1, 6), Position::new(7, 1, 8), ), }, ); assert_eq!( t_err(r"(?-u)\W"), TestError { kind: hir::ErrorKind::InvalidUtf8, span: Span::new( Position::new(5, 1, 6), Position::new(7, 1, 8), ), }, ); assert_eq!( t_err(r"(?i-u)\D"), TestError { kind: hir::ErrorKind::InvalidUtf8, span: Span::new( Position::new(6, 1, 7), Position::new(8, 1, 9), ), }, ); assert_eq!( t_err(r"(?i-u)\S"), TestError { kind: hir::ErrorKind::InvalidUtf8, span: Span::new( Position::new(6, 1, 7), Position::new(8, 1, 9), ), }, ); assert_eq!( t_err(r"(?i-u)\W"), TestError { kind: hir::ErrorKind::InvalidUtf8, span: Span::new( Position::new(6, 1, 7), Position::new(8, 1, 9), ), }, ); } #[test] #[cfg(not(feature = "unicode-perl"))] fn class_perl_word_disabled() { assert_eq!( t_err(r"\w"), TestError { kind: hir::ErrorKind::UnicodePerlClassNotFound, span: Span::new( Position::new(0, 1, 1), Position::new(2, 1, 3) ), } ); } #[test] #[cfg(all(not(feature = "unicode-perl"), not(feature = "unicode-bool")))] fn class_perl_space_disabled() { assert_eq!( t_err(r"\s"), TestError { kind: hir::ErrorKind::UnicodePerlClassNotFound, span: Span::new( Position::new(0, 1, 1), Position::new(2, 1, 3) ), } ); } #[test] #[cfg(all( not(feature = "unicode-perl"), not(feature = "unicode-gencat") ))] fn class_perl_digit_disabled() { assert_eq!( t_err(r"\d"), TestError { kind: hir::ErrorKind::UnicodePerlClassNotFound, span: Span::new( Position::new(0, 1, 1), Position::new(2, 1, 3) ), } ); } #[test] #[cfg(feature = "unicode-gencat")] fn class_unicode_gencat() { assert_eq!(t(r"\pZ"), hir_uclass_query(ClassQuery::Binary("Z"))); assert_eq!(t(r"\pz"), hir_uclass_query(ClassQuery::Binary("Z"))); assert_eq!( t(r"\p{Separator}"), hir_uclass_query(ClassQuery::Binary("Z")) ); 
assert_eq!( t(r"\p{se PaRa ToR}"), hir_uclass_query(ClassQuery::Binary("Z")) ); assert_eq!( t(r"\p{gc:Separator}"), hir_uclass_query(ClassQuery::Binary("Z")) ); assert_eq!( t(r"\p{gc=Separator}"), hir_uclass_query(ClassQuery::Binary("Z")) ); assert_eq!( t(r"\p{Other}"), hir_uclass_query(ClassQuery::Binary("Other")) ); assert_eq!(t(r"\pC"), hir_uclass_query(ClassQuery::Binary("Other"))); assert_eq!( t(r"\PZ"), hir_negate(hir_uclass_query(ClassQuery::Binary("Z"))) ); assert_eq!( t(r"\P{separator}"), hir_negate(hir_uclass_query(ClassQuery::Binary("Z"))) ); assert_eq!( t(r"\P{gc!=separator}"), hir_negate(hir_uclass_query(ClassQuery::Binary("Z"))) ); assert_eq!(t(r"\p{any}"), hir_uclass_query(ClassQuery::Binary("Any"))); assert_eq!( t(r"\p{assigned}"), hir_uclass_query(ClassQuery::Binary("Assigned")) ); assert_eq!( t(r"\p{ascii}"), hir_uclass_query(ClassQuery::Binary("ASCII")) ); assert_eq!( t(r"\p{gc:any}"), hir_uclass_query(ClassQuery::Binary("Any")) ); assert_eq!( t(r"\p{gc:assigned}"), hir_uclass_query(ClassQuery::Binary("Assigned")) ); assert_eq!( t(r"\p{gc:ascii}"), hir_uclass_query(ClassQuery::Binary("ASCII")) ); assert_eq!( t_err(r"(?-u)\pZ"), TestError { kind: hir::ErrorKind::UnicodeNotAllowed, span: Span::new( Position::new(5, 1, 6), Position::new(8, 1, 9) ), } ); assert_eq!( t_err(r"(?-u)\p{Separator}"), TestError { kind: hir::ErrorKind::UnicodeNotAllowed, span: Span::new( Position::new(5, 1, 6), Position::new(18, 1, 19) ), } ); assert_eq!( t_err(r"\pE"), TestError { kind: hir::ErrorKind::UnicodePropertyNotFound, span: Span::new( Position::new(0, 1, 1), Position::new(3, 1, 4) ), } ); assert_eq!( t_err(r"\p{Foo}"), TestError { kind: hir::ErrorKind::UnicodePropertyNotFound, span: Span::new( Position::new(0, 1, 1), Position::new(7, 1, 8) ), } ); assert_eq!( t_err(r"\p{gc:Foo}"), TestError { kind: hir::ErrorKind::UnicodePropertyValueNotFound, span: Span::new( Position::new(0, 1, 1), Position::new(10, 1, 11) ), } ); } #[test] #[cfg(not(feature = 
"unicode-gencat"))]
    fn class_unicode_gencat_disabled() {
        // Both general-category lookups are expected to fail with
        // `UnicodePropertyNotFound`; the spans cover the whole `\p{..}` item.
        assert_eq!(
            t_err(r"\p{Separator}"),
            TestError {
                kind: hir::ErrorKind::UnicodePropertyNotFound,
                span: Span::new(
                    Position::new(0, 1, 1),
                    Position::new(13, 1, 14)
                ),
            }
        );

        assert_eq!(
            t_err(r"\p{Any}"),
            TestError {
                kind: hir::ErrorKind::UnicodePropertyNotFound,
                span: Span::new(
                    Position::new(0, 1, 1),
                    Position::new(7, 1, 8)
                ),
            }
        );
    }

    // Script classes: plain, case-folded and negated lookups, plus the error
    // reported for an unknown `sc`/`scx` property value.
    #[test]
    #[cfg(feature = "unicode-script")]
    fn class_unicode_script() {
        assert_eq!(
            t(r"\p{Greek}"),
            hir_uclass_query(ClassQuery::Binary("Greek"))
        );
        #[cfg(feature = "unicode-case")]
        assert_eq!(
            t(r"(?i)\p{Greek}"),
            hir_case_fold(hir_uclass_query(ClassQuery::Binary("Greek")))
        );
        #[cfg(feature = "unicode-case")]
        assert_eq!(
            t(r"(?i)\P{Greek}"),
            hir_negate(hir_case_fold(hir_uclass_query(ClassQuery::Binary(
                "Greek"
            ))))
        );

        assert_eq!(
            t_err(r"\p{sc:Foo}"),
            TestError {
                kind: hir::ErrorKind::UnicodePropertyValueNotFound,
                span: Span::new(
                    Position::new(0, 1, 1),
                    Position::new(10, 1, 11)
                ),
            }
        );
        assert_eq!(
            t_err(r"\p{scx:Foo}"),
            TestError {
                kind: hir::ErrorKind::UnicodePropertyValueNotFound,
                span: Span::new(
                    Position::new(0, 1, 1),
                    Position::new(11, 1, 12)
                ),
            }
        );
    }

    // Counterpart of `class_unicode_script` for builds without the
    // `unicode-script` feature: the lookups must report
    // `UnicodePropertyNotFound`.
    #[test]
    #[cfg(not(feature = "unicode-script"))]
    fn class_unicode_script_disabled() {
        assert_eq!(
            t_err(r"\p{Greek}"),
            TestError {
                kind: hir::ErrorKind::UnicodePropertyNotFound,
                span: Span::new(
                    Position::new(0, 1, 1),
                    Position::new(9, 1, 10)
                ),
            }
        );

        assert_eq!(
            t_err(r"\p{scx:Greek}"),
            TestError {
                kind: hir::ErrorKind::UnicodePropertyNotFound,
                span: Span::new(
                    Position::new(0, 1, 1),
                    Position::new(13, 1, 14)
                ),
            }
        );
    }

    // An unknown `age` value is reported as `UnicodePropertyValueNotFound`.
    #[test]
    #[cfg(feature = "unicode-age")]
    fn class_unicode_age() {
        assert_eq!(
            t_err(r"\p{age:Foo}"),
            TestError {
                kind: hir::ErrorKind::UnicodePropertyValueNotFound,
                span: Span::new(
                    Position::new(0, 1, 1),
                    Position::new(11, 1, 12)
                ),
            }
        );
    }

    // The negation of `any` is expected to be the empty Unicode class.
    #[test]
    #[cfg(feature = "unicode-gencat")]
    fn class_unicode_any_empty() {
        assert_eq!(t(r"\P{any}"), hir_uclass(&[]),);
    }

    // Without the `unicode-age` feature, an `age` query reports
    // `UnicodePropertyNotFound`.
    #[test]
    #[cfg(not(feature = "unicode-age"))]
    fn class_unicode_age_disabled() {
        assert_eq!(
            t_err(r"\p{age:3.0}"),
            TestError {
                kind: hir::ErrorKind::UnicodePropertyNotFound,
                span: Span::new(
                    Position::new(0, 1, 1),
                    Position::new(11, 1, 12)
                ),
            }
        );
    }

    // Bracketed classes: literals, ranges, Perl/Unicode classes, negation,
    // byte-oriented (`(?-u)`) classes and case-insensitive variants.
    #[test]
    fn class_bracketed() {
        // A single-character class is expected to come back as a literal.
        assert_eq!(t("[a]"), hir_lit("a"));
        assert_eq!(t("[ab]"), hir_uclass(&[('a', 'b')]));
        assert_eq!(t("[^[a]]"), class_negate(uclass(&[('a', 'a')])));
        assert_eq!(t("[a-z]"), hir_uclass(&[('a', 'z')]));
        // Overlapping and adjacent ranges are expected to be merged.
        assert_eq!(t("[a-fd-h]"), hir_uclass(&[('a', 'h')]));
        assert_eq!(t("[a-fg-m]"), hir_uclass(&[('a', 'm')]));
        assert_eq!(t(r"[\x00]"), hir_uclass(&[('\0', '\0')]));
        assert_eq!(t(r"[\n]"), hir_uclass(&[('\n', '\n')]));
        assert_eq!(t("[\n]"), hir_uclass(&[('\n', '\n')]));
        #[cfg(any(feature = "unicode-perl", feature = "unicode-gencat"))]
        assert_eq!(t(r"[\d]"), hir_uclass_query(ClassQuery::Binary("digit")));
        #[cfg(feature = "unicode-gencat")]
        assert_eq!(
            t(r"[\pZ]"),
            hir_uclass_query(ClassQuery::Binary("separator"))
        );
        #[cfg(feature = "unicode-gencat")]
        assert_eq!(
            t(r"[\p{separator}]"),
            hir_uclass_query(ClassQuery::Binary("separator"))
        );
        // Double negation (`[^\D]` etc.) yields the positive class.
        #[cfg(any(feature = "unicode-perl", feature = "unicode-gencat"))]
        assert_eq!(t(r"[^\D]"), hir_uclass_query(ClassQuery::Binary("digit")));
        #[cfg(feature = "unicode-gencat")]
        assert_eq!(
            t(r"[^\PZ]"),
            hir_uclass_query(ClassQuery::Binary("separator"))
        );
        #[cfg(feature = "unicode-gencat")]
        assert_eq!(
            t(r"[^\P{separator}]"),
            hir_uclass_query(ClassQuery::Binary("separator"))
        );
        #[cfg(all(
            feature = "unicode-case",
            any(feature = "unicode-perl", feature = "unicode-gencat")
        ))]
        assert_eq!(
            t(r"(?i)[^\D]"),
            hir_uclass_query(ClassQuery::Binary("digit"))
        );
        #[cfg(all(feature = "unicode-case", feature = "unicode-script"))]
        assert_eq!(
            t(r"(?i)[^\P{greek}]"),
            hir_case_fold(hir_uclass_query(ClassQuery::Binary("greek")))
        );

        // Byte-oriented classes when Unicode mode is disabled.
        assert_eq!(t("(?-u)[a]"), hir_bclass(&[(b'a', b'a')]));
        assert_eq!(t(r"(?-u)[\x00]"), hir_bclass(&[(b'\0', b'\0')]));
        assert_eq!(t_bytes(r"(?-u)[\xFF]"), hir_bclass(&[(b'\xFF', b'\xFF')]));

        // Case-insensitive classes.
        #[cfg(feature =
"unicode-case")]
        assert_eq!(t("(?i)[a]"), hir_uclass(&[('A', 'A'), ('a', 'a')]));
        #[cfg(feature = "unicode-case")]
        assert_eq!(
            t("(?i)[k]"),
            hir_uclass(&[('K', 'K'), ('k', 'k'), ('\u{212A}', '\u{212A}'),])
        );
        #[cfg(feature = "unicode-case")]
        assert_eq!(
            t("(?i)[β]"),
            hir_uclass(&[('Β', 'Β'), ('β', 'β'), ('ϐ', 'ϐ'),])
        );
        assert_eq!(t("(?i-u)[k]"), hir_bclass(&[(b'K', b'K'), (b'k', b'k'),]));

        // Negated classes.
        assert_eq!(t("[^a]"), class_negate(uclass(&[('a', 'a')])));
        assert_eq!(t(r"[^\x00]"), class_negate(uclass(&[('\0', '\0')])));
        assert_eq!(
            t_bytes("(?-u)[^a]"),
            class_negate(bclass(&[(b'a', b'a')]))
        );
        #[cfg(any(feature = "unicode-perl", feature = "unicode-gencat"))]
        assert_eq!(
            t(r"[^\d]"),
            hir_negate(hir_uclass_query(ClassQuery::Binary("digit")))
        );
        #[cfg(feature = "unicode-gencat")]
        assert_eq!(
            t(r"[^\pZ]"),
            hir_negate(hir_uclass_query(ClassQuery::Binary("separator")))
        );
        #[cfg(feature = "unicode-gencat")]
        assert_eq!(
            t(r"[^\p{separator}]"),
            hir_negate(hir_uclass_query(ClassQuery::Binary("separator")))
        );
        #[cfg(all(feature = "unicode-case", feature = "unicode-script"))]
        assert_eq!(
            t(r"(?i)[^\p{greek}]"),
            hir_negate(hir_case_fold(hir_uclass_query(ClassQuery::Binary(
                "greek"
            ))))
        );
        #[cfg(all(feature = "unicode-case", feature = "unicode-script"))]
        assert_eq!(
            t(r"(?i)[\P{greek}]"),
            hir_negate(hir_case_fold(hir_uclass_query(ClassQuery::Binary(
                "greek"
            ))))
        );

        // Test some weird cases.
assert_eq!(t(r"[\[]"), hir_uclass(&[('[', '[')]));

        // `&`, `~` and `-` are set-operator characters inside a class; check
        // them bare, escaped, repeated and as range endpoints.
        assert_eq!(t(r"[&]"), hir_uclass(&[('&', '&')]));
        assert_eq!(t(r"[\&]"), hir_uclass(&[('&', '&')]));
        assert_eq!(t(r"[\&\&]"), hir_uclass(&[('&', '&')]));
        assert_eq!(t(r"[\x00-&]"), hir_uclass(&[('\0', '&')]));
        assert_eq!(t(r"[&-\xFF]"), hir_uclass(&[('&', '\u{FF}')]));

        assert_eq!(t(r"[~]"), hir_uclass(&[('~', '~')]));
        assert_eq!(t(r"[\~]"), hir_uclass(&[('~', '~')]));
        assert_eq!(t(r"[\~\~]"), hir_uclass(&[('~', '~')]));
        assert_eq!(t(r"[\x00-~]"), hir_uclass(&[('\0', '~')]));
        assert_eq!(t(r"[~-\xFF]"), hir_uclass(&[('~', '\u{FF}')]));

        assert_eq!(t(r"[-]"), hir_uclass(&[('-', '-')]));
        assert_eq!(t(r"[\-]"), hir_uclass(&[('-', '-')]));
        assert_eq!(t(r"[\-\-]"), hir_uclass(&[('-', '-')]));
        assert_eq!(t(r"[\x00-\-]"), hir_uclass(&[('\0', '-')]));
        assert_eq!(t(r"[\--\xFF]"), hir_uclass(&[('-', '\u{FF}')]));

        // A negated byte class is rejected with `InvalidUtf8` when translated
        // through `t_err`; the span covers the bracketed class only.
        assert_eq!(
            t_err("(?-u)[^a]"),
            TestError {
                kind: hir::ErrorKind::InvalidUtf8,
                span: Span::new(
                    Position::new(5, 1, 6),
                    Position::new(9, 1, 10)
                ),
            }
        );
        // `[^\s\S]` matches nothing, so the expected class is empty.
        #[cfg(any(feature = "unicode-perl", feature = "unicode-bool"))]
        assert_eq!(t(r"[^\s\S]"), hir_uclass(&[]),);
        #[cfg(any(feature = "unicode-perl", feature = "unicode-bool"))]
        assert_eq!(t_bytes(r"(?-u)[^\s\S]"), hir_bclass(&[]),);
    }

    // Unions of class items, including nested, case-folded and negated
    // unions. Expected values are built with the `hir_union` helper.
    #[test]
    fn class_bracketed_union() {
        assert_eq!(t("[a-zA-Z]"), hir_uclass(&[('A', 'Z'), ('a', 'z')]));
        #[cfg(feature = "unicode-gencat")]
        assert_eq!(
            t(r"[a\pZb]"),
            hir_union(
                hir_uclass(&[('a', 'b')]),
                hir_uclass_query(ClassQuery::Binary("separator"))
            )
        );
        #[cfg(all(feature = "unicode-gencat", feature = "unicode-script"))]
        assert_eq!(
            t(r"[\pZ\p{Greek}]"),
            hir_union(
                hir_uclass_query(ClassQuery::Binary("greek")),
                hir_uclass_query(ClassQuery::Binary("separator"))
            )
        );
        #[cfg(all(
            feature = "unicode-age",
            feature = "unicode-gencat",
            feature = "unicode-script"
        ))]
        assert_eq!(
            t(r"[\p{age:3.0}\pZ\p{Greek}]"),
            hir_union(
                hir_uclass_query(ClassQuery::ByValue {
                    property_name: "age",
                    property_value: "3.0",
                }),
                hir_union(
                    hir_uclass_query(ClassQuery::Binary("greek")),
                    hir_uclass_query(ClassQuery::Binary("separator"))
                )
            )
        );
        // Nesting the same items in sub-classes.
        #[cfg(all(
            feature = "unicode-age",
            feature = "unicode-gencat",
            feature = "unicode-script"
        ))]
        assert_eq!(
            t(r"[[[\p{age:3.0}\pZ]\p{Greek}][\p{Cyrillic}]]"),
            hir_union(
                hir_uclass_query(ClassQuery::ByValue {
                    property_name: "age",
                    property_value: "3.0",
                }),
                hir_union(
                    hir_uclass_query(ClassQuery::Binary("cyrillic")),
                    hir_union(
                        hir_uclass_query(ClassQuery::Binary("greek")),
                        hir_uclass_query(ClassQuery::Binary("separator"))
                    )
                )
            )
        );

        #[cfg(all(
            feature = "unicode-age",
            feature = "unicode-case",
            feature = "unicode-gencat",
            feature = "unicode-script"
        ))]
        assert_eq!(
            t(r"(?i)[\p{age:3.0}\pZ\p{Greek}]"),
            hir_case_fold(hir_union(
                hir_uclass_query(ClassQuery::ByValue {
                    property_name: "age",
                    property_value: "3.0",
                }),
                hir_union(
                    hir_uclass_query(ClassQuery::Binary("greek")),
                    hir_uclass_query(ClassQuery::Binary("separator"))
                )
            ))
        );
        #[cfg(all(
            feature = "unicode-age",
            feature = "unicode-gencat",
            feature = "unicode-script"
        ))]
        assert_eq!(
            t(r"[^\p{age:3.0}\pZ\p{Greek}]"),
            hir_negate(hir_union(
                hir_uclass_query(ClassQuery::ByValue {
                    property_name: "age",
                    property_value: "3.0",
                }),
                hir_union(
                    hir_uclass_query(ClassQuery::Binary("greek")),
                    hir_uclass_query(ClassQuery::Binary("separator"))
                )
            ))
        );
        #[cfg(all(
            feature = "unicode-age",
            feature = "unicode-case",
            feature = "unicode-gencat",
            feature = "unicode-script"
        ))]
        assert_eq!(
            t(r"(?i)[^\p{age:3.0}\pZ\p{Greek}]"),
            hir_negate(hir_case_fold(hir_union(
                hir_uclass_query(ClassQuery::ByValue {
                    property_name: "age",
                    property_value: "3.0",
                }),
                hir_union(
                    hir_uclass_query(ClassQuery::Binary("greek")),
                    hir_uclass_query(ClassQuery::Binary("separator"))
                )
            )))
        );
    }

    // Nested classes where the inner class is negated.
    #[test]
    fn class_bracketed_nested() {
        assert_eq!(t(r"[a[^c]]"), class_negate(uclass(&[('c', 'c')])));
        assert_eq!(t(r"[a-b[^c]]"), class_negate(uclass(&[('c', 'c')])));
        // `a-c` already contains `c`, so the union covers everything.
        assert_eq!(t(r"[a-c[^c]]"), class_negate(uclass(&[])));

        assert_eq!(t(r"[^a[^c]]"), hir_uclass(&[('c', 'c')]));
assert_eq!(t(r"[^a-b[^c]]"), hir_uclass(&[('c', 'c')]));

        // Case-insensitive variants of the nested cases.
        #[cfg(feature = "unicode-case")]
        assert_eq!(
            t(r"(?i)[a[^c]]"),
            hir_negate(class_case_fold(uclass(&[('c', 'c')])))
        );
        #[cfg(feature = "unicode-case")]
        assert_eq!(
            t(r"(?i)[a-b[^c]]"),
            hir_negate(class_case_fold(uclass(&[('c', 'c')])))
        );

        #[cfg(feature = "unicode-case")]
        assert_eq!(t(r"(?i)[^a[^c]]"), hir_uclass(&[('C', 'C'), ('c', 'c')]));
        #[cfg(feature = "unicode-case")]
        assert_eq!(
            t(r"(?i)[^a-b[^c]]"),
            hir_uclass(&[('C', 'C'), ('c', 'c')])
        );

        // Negating a union that covers everything yields the empty class.
        assert_eq!(t(r"[^a-c[^c]]"), hir_uclass(&[]),);
        #[cfg(feature = "unicode-case")]
        assert_eq!(t(r"(?i)[^a-c[^c]]"), hir_uclass(&[]),);
    }

    // Class intersection (`&&`) for Unicode and byte classes, with and
    // without case folding, plus escaping rules around `&&`.
    #[test]
    fn class_bracketed_intersect() {
        assert_eq!(t("[abc&&b-c]"), hir_uclass(&[('b', 'c')]));
        assert_eq!(t("[abc&&[b-c]]"), hir_uclass(&[('b', 'c')]));
        assert_eq!(t("[[abc]&&[b-c]]"), hir_uclass(&[('b', 'c')]));
        assert_eq!(t("[a-z&&b-y&&c-x]"), hir_uclass(&[('c', 'x')]));
        assert_eq!(t("[c-da-b&&a-d]"), hir_uclass(&[('a', 'd')]));
        assert_eq!(t("[a-d&&c-da-b]"), hir_uclass(&[('a', 'd')]));
        assert_eq!(t(r"[a-z&&a-c]"), hir_uclass(&[('a', 'c')]));
        assert_eq!(t(r"[[a-z&&a-c]]"), hir_uclass(&[('a', 'c')]));
        assert_eq!(t(r"[^[a-z&&a-c]]"), hir_negate(hir_uclass(&[('a', 'c')])));

        assert_eq!(t("(?-u)[abc&&b-c]"), hir_bclass(&[(b'b', b'c')]));
        assert_eq!(t("(?-u)[abc&&[b-c]]"), hir_bclass(&[(b'b', b'c')]));
        assert_eq!(t("(?-u)[[abc]&&[b-c]]"), hir_bclass(&[(b'b', b'c')]));
        assert_eq!(t("(?-u)[a-z&&b-y&&c-x]"), hir_bclass(&[(b'c', b'x')]));
        assert_eq!(t("(?-u)[c-da-b&&a-d]"), hir_bclass(&[(b'a', b'd')]));
        assert_eq!(t("(?-u)[a-d&&c-da-b]"), hir_bclass(&[(b'a', b'd')]));

        #[cfg(feature = "unicode-case")]
        assert_eq!(
            t("(?i)[abc&&b-c]"),
            hir_case_fold(hir_uclass(&[('b', 'c')]))
        );
        #[cfg(feature = "unicode-case")]
        assert_eq!(
            t("(?i)[abc&&[b-c]]"),
            hir_case_fold(hir_uclass(&[('b', 'c')]))
        );
        #[cfg(feature = "unicode-case")]
        assert_eq!(
            t("(?i)[[abc]&&[b-c]]"),
            hir_case_fold(hir_uclass(&[('b', 'c')]))
        );
        #[cfg(feature = "unicode-case")]
        assert_eq!(
            t("(?i)[a-z&&b-y&&c-x]"),
            hir_case_fold(hir_uclass(&[('c', 'x')]))
        );
        #[cfg(feature = "unicode-case")]
        assert_eq!(
            t("(?i)[c-da-b&&a-d]"),
            hir_case_fold(hir_uclass(&[('a', 'd')]))
        );
        #[cfg(feature = "unicode-case")]
        assert_eq!(
            t("(?i)[a-d&&c-da-b]"),
            hir_case_fold(hir_uclass(&[('a', 'd')]))
        );

        assert_eq!(
            t("(?i-u)[abc&&b-c]"),
            hir_case_fold(hir_bclass(&[(b'b', b'c')]))
        );
        assert_eq!(
            t("(?i-u)[abc&&[b-c]]"),
            hir_case_fold(hir_bclass(&[(b'b', b'c')]))
        );
        assert_eq!(
            t("(?i-u)[[abc]&&[b-c]]"),
            hir_case_fold(hir_bclass(&[(b'b', b'c')]))
        );
        assert_eq!(
            t("(?i-u)[a-z&&b-y&&c-x]"),
            hir_case_fold(hir_bclass(&[(b'c', b'x')]))
        );
        assert_eq!(
            t("(?i-u)[c-da-b&&a-d]"),
            hir_case_fold(hir_bclass(&[(b'a', b'd')]))
        );
        assert_eq!(
            t("(?i-u)[a-d&&c-da-b]"),
            hir_case_fold(hir_bclass(&[(b'a', b'd')]))
        );

        // In `[a^]`, `^` does not need to be escaped, so it makes sense that
        // `^` is also allowed to be unescaped after `&&`.
        assert_eq!(t(r"[\^&&^]"), hir_uclass(&[('^', '^')]));
        // `]` needs to be escaped after `&&` since it's not at start of class.
        assert_eq!(t(r"[]&&\]]"), hir_uclass(&[(']', ']')]));
        assert_eq!(t(r"[-&&-]"), hir_uclass(&[('-', '-')]));
        assert_eq!(t(r"[\&&&&]"), hir_uclass(&[('&', '&')]));
        assert_eq!(t(r"[\&&&\&]"), hir_uclass(&[('&', '&')]));
        // Test precedence.
assert_eq!(
            t(r"[a-w&&[^c-g]z]"),
            hir_uclass(&[('a', 'b'), ('h', 'w')])
        );
    }

    // Intersections combined with negation, at the outer and nested levels,
    // for both Unicode and byte-oriented classes.
    #[test]
    fn class_bracketed_intersect_negate() {
        #[cfg(feature = "unicode-perl")]
        assert_eq!(
            t(r"[^\w&&\d]"),
            hir_negate(hir_uclass_query(ClassQuery::Binary("digit")))
        );
        assert_eq!(t(r"[^[a-z&&a-c]]"), hir_negate(hir_uclass(&[('a', 'c')])));
        #[cfg(feature = "unicode-perl")]
        assert_eq!(
            t(r"[^[\w&&\d]]"),
            hir_negate(hir_uclass_query(ClassQuery::Binary("digit")))
        );
        #[cfg(feature = "unicode-perl")]
        assert_eq!(
            t(r"[^[^\w&&\d]]"),
            hir_uclass_query(ClassQuery::Binary("digit"))
        );
        #[cfg(feature = "unicode-perl")]
        assert_eq!(t(r"[[[^\w]&&[^\d]]]"), hir_negate(hir_uclass_perl_word()));

        #[cfg(feature = "unicode-perl")]
        assert_eq!(
            t_bytes(r"(?-u)[^\w&&\d]"),
            hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Digit))
        );
        assert_eq!(
            t_bytes(r"(?-u)[^[a-z&&a-c]]"),
            hir_negate(hir_bclass(&[(b'a', b'c')]))
        );
        assert_eq!(
            t_bytes(r"(?-u)[^[\w&&\d]]"),
            hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Digit))
        );
        assert_eq!(
            t_bytes(r"(?-u)[^[^\w&&\d]]"),
            hir_ascii_bclass(&ast::ClassAsciiKind::Digit)
        );
        assert_eq!(
            t_bytes(r"(?-u)[[[^\w]&&[^\d]]]"),
            hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Word))
        );
    }

    // Class difference (`--`).
    #[test]
    fn class_bracketed_difference() {
        #[cfg(feature = "unicode-gencat")]
        assert_eq!(
            t(r"[\pL--[:ascii:]]"),
            hir_difference(
                hir_uclass_query(ClassQuery::Binary("letter")),
                hir_uclass(&[('\0', '\x7F')])
            )
        );

        assert_eq!(
            t(r"(?-u)[[:alpha:]--[:lower:]]"),
            hir_bclass(&[(b'A', b'Z')])
        );
    }

    // Symmetric class difference (`~~`).
    #[test]
    fn class_bracketed_symmetric_difference() {
        #[cfg(feature = "unicode-script")]
        assert_eq!(
            t(r"[\p{sc:Greek}~~\p{scx:Greek}]"),
            hir_uclass(&[
                ('\u{0342}', '\u{0342}'),
                ('\u{0345}', '\u{0345}'),
                ('\u{1DC0}', '\u{1DC1}'),
            ])
        );
        assert_eq!(t(r"[a-g~~c-j]"), hir_uclass(&[('a', 'b'), ('h', 'j')]));

        assert_eq!(
            t(r"(?-u)[a-g~~c-j]"),
            hir_bclass(&[(b'a', b'b'), (b'h', b'j')])
        );
    }

    // Patterns written with the `x` flag, where whitespace inside escapes is
    // expected to be skipped.
    #[test]
    fn ignore_whitespace() {
        assert_eq!(t(r"(?x)\12 3"), hir_lit("\n3"));
        assert_eq!(t(r"(?x)\x { 53 }"), hir_lit("S"));
assert_eq!(
            // Each `# comment` must end at a newline; on a single line the
            // first `#` would comment out the rest of the pattern.
            t(r"(?x)\x # comment
{ # comment
    53 # comment
} #comment"),
            hir_lit("S")
        );

        assert_eq!(t(r"(?x)\x 53"), hir_lit("S"));
        assert_eq!(
            t(r"(?x)\x # comment
        53 # comment"),
            hir_lit("S")
        );
        assert_eq!(t(r"(?x)\x5 3"), hir_lit("S"));

        #[cfg(feature = "unicode-gencat")]
        assert_eq!(
            t(r"(?x)\p # comment
{ # comment
    Separator # comment
} # comment"),
            hir_uclass_query(ClassQuery::Binary("separator"))
        );

        assert_eq!(
            t(r"(?x)a # comment
{ # comment
    5 # comment
    , # comment
    10 # comment
} # comment"),
            hir_range(true, 5, Some(10), hir_lit("a"))
        );

        // An escaped space stays a literal space even in `x` mode.
        assert_eq!(t(r"(?x)a\ # hi there"), hir_lit("a "));
    }

    // Checks the `is_utf8` property reported for a range of patterns.
    #[test]
    fn analysis_is_utf8() {
        // Positive examples.
        assert!(props_bytes(r"a").is_utf8());
        assert!(props_bytes(r"ab").is_utf8());
        assert!(props_bytes(r"(?-u)a").is_utf8());
        assert!(props_bytes(r"(?-u)ab").is_utf8());
        assert!(props_bytes(r"\xFF").is_utf8());
        assert!(props_bytes(r"\xFF\xFF").is_utf8());
        assert!(props_bytes(r"[^a]").is_utf8());
        assert!(props_bytes(r"[^a][^a]").is_utf8());
        assert!(props_bytes(r"\b").is_utf8());
        assert!(props_bytes(r"\B").is_utf8());
        assert!(props_bytes(r"(?-u)\b").is_utf8());
        assert!(props_bytes(r"(?-u)\B").is_utf8());

        // Negative examples.
assert!(!props_bytes(r"(?-u)\xFF").is_utf8());
        assert!(!props_bytes(r"(?-u)\xFF\xFF").is_utf8());
        assert!(!props_bytes(r"(?-u)[^a]").is_utf8());
        assert!(!props_bytes(r"(?-u)[^a][^a]").is_utf8());
    }

    // Checks `explicit_captures_len` for patterns with zero, one and two
    // capture groups; non-capturing and flag groups must not be counted.
    #[test]
    fn analysis_captures_len() {
        assert_eq!(0, props(r"a").explicit_captures_len());
        assert_eq!(0, props(r"(?:a)").explicit_captures_len());
        assert_eq!(0, props(r"(?i-u:a)").explicit_captures_len());
        assert_eq!(0, props(r"(?i-u)a").explicit_captures_len());
        assert_eq!(1, props(r"(a)").explicit_captures_len());
        // Named group: the `<foo>` name is required for the pattern to parse.
        assert_eq!(1, props(r"(?P<foo>a)").explicit_captures_len());
        assert_eq!(1, props(r"()").explicit_captures_len());
        assert_eq!(1, props(r"()a").explicit_captures_len());
        assert_eq!(1, props(r"(a)+").explicit_captures_len());
        assert_eq!(2, props(r"(a)(b)").explicit_captures_len());
        assert_eq!(2, props(r"(a)|(b)").explicit_captures_len());
        assert_eq!(2, props(r"((a))").explicit_captures_len());
        assert_eq!(1, props(r"([a&&b])").explicit_captures_len());
    }

    // Checks `static_explicit_captures_len`: `Some(n)` is expected when
    // every match has the same number of participating groups, `None`
    // otherwise.
    #[test]
    fn analysis_static_captures_len() {
        let len = |pattern| props(pattern).static_explicit_captures_len();
        assert_eq!(Some(0), len(r""));
        assert_eq!(Some(0), len(r"foo|bar"));
        assert_eq!(None, len(r"(foo)|bar"));
        assert_eq!(None, len(r"foo|(bar)"));
        assert_eq!(Some(1), len(r"(foo|bar)"));
        assert_eq!(Some(1), len(r"(a|b|c|d|e|f)"));
        assert_eq!(Some(1), len(r"(a)|(b)|(c)|(d)|(e)|(f)"));
        assert_eq!(Some(2), len(r"(a)(b)|(c)(d)|(e)(f)"));
        assert_eq!(Some(6), len(r"(a)(b)(c)(d)(e)(f)"));
        assert_eq!(Some(3), len(r"(a)(b)(extra)|(a)(b)()"));
        assert_eq!(Some(3), len(r"(a)(b)((?:extra)?)"));
        assert_eq!(None, len(r"(a)(b)(extra)?"));
        assert_eq!(Some(1), len(r"(foo)|(bar)"));
        assert_eq!(Some(2), len(r"(foo)(bar)"));
        assert_eq!(Some(2), len(r"(foo)+(bar)"));
        assert_eq!(None, len(r"(foo)*(bar)"));
        assert_eq!(Some(0), len(r"(foo)?{0}"));
        assert_eq!(None, len(r"(foo)?{1}"));
        assert_eq!(Some(1), len(r"(foo){1}"));
        assert_eq!(Some(1), len(r"(foo){1,}"));
        assert_eq!(Some(1), len(r"(foo){1,}?"));
        assert_eq!(None, len(r"(foo){1,}??"));
assert_eq!(None, len(r"(foo){0,}"));
        assert_eq!(Some(1), len(r"(foo)(?:bar)"));
        assert_eq!(Some(2), len(r"(foo(?:bar)+)(?:baz(boo))"));
        // Named group plus one plain group; the `<name>` part is required
        // for the pattern to parse.
        assert_eq!(Some(2), len(r"(?P<name>foo)(?:bar)(bal|loon)"));
        assert_eq!(
            Some(2),
            len(r#"<(a)[^>]+href="([^"]+)"|<(img)[^>]+src="([^"]+)""#)
        );
    }

    // Patterns built only from look-around assertions must report a
    // non-empty look set and a minimum length of zero.
    #[test]
    fn analysis_is_all_assertions() {
        // Positive examples.
        let p = props(r"\b");
        assert!(!p.look_set().is_empty());
        assert_eq!(p.minimum_len(), Some(0));

        let p = props(r"\B");
        assert!(!p.look_set().is_empty());
        assert_eq!(p.minimum_len(), Some(0));

        let p = props(r"^");
        assert!(!p.look_set().is_empty());
        assert_eq!(p.minimum_len(), Some(0));

        let p = props(r"$");
        assert!(!p.look_set().is_empty());
        assert_eq!(p.minimum_len(), Some(0));

        let p = props(r"\A");
        assert!(!p.look_set().is_empty());
        assert_eq!(p.minimum_len(), Some(0));

        let p = props(r"\z");
        assert!(!p.look_set().is_empty());
        assert_eq!(p.minimum_len(), Some(0));

        let p = props(r"$^\z\A\b\B");
        assert!(!p.look_set().is_empty());
        assert_eq!(p.minimum_len(), Some(0));

        let p = props(r"$|^|\z|\A|\b|\B");
        assert!(!p.look_set().is_empty());
        assert_eq!(p.minimum_len(), Some(0));

        let p = props(r"^$|$^");
        assert!(!p.look_set().is_empty());
        assert_eq!(p.minimum_len(), Some(0));

        let p = props(r"((\b)+())*^");
        assert!(!p.look_set().is_empty());
        assert_eq!(p.minimum_len(), Some(0));

        // Negative examples.
        let p = props(r"^a");
        assert!(!p.look_set().is_empty());
        assert_eq!(p.minimum_len(), Some(1));
    }

    // The "any" prefix look set must contain the ASCII word boundary even
    // though only one branch of each alternation starts with it.
    #[test]
    fn analysis_look_set_prefix_any() {
        let p = props(r"(?-u)(?i:(?:\b|_)win(?:32|64|dows)?(?:\b|_))");
        assert!(p.look_set_prefix_any().contains(Look::WordAscii));
    }

    // Checks whether `Look::Start` / `Look::End` appear in the prefix /
    // suffix look sets, i.e. whether the pattern is anchored.
    #[test]
    fn analysis_is_anchored() {
        let is_start = |p| props(p).look_set_prefix().contains(Look::Start);
        let is_end = |p| props(p).look_set_suffix().contains(Look::End);

        // Positive examples.
assert!(is_start(r"^")); assert!(is_end(r"$")); assert!(is_start(r"^^")); assert!(props(r"$$").look_set_suffix().contains(Look::End)); assert!(is_start(r"^$")); assert!(is_end(r"^$")); assert!(is_start(r"^foo")); assert!(is_end(r"foo$")); assert!(is_start(r"^foo|^bar")); assert!(is_end(r"foo$|bar$")); assert!(is_start(r"^(foo|bar)")); assert!(is_end(r"(foo|bar)$")); assert!(is_start(r"^+")); assert!(is_end(r"$+")); assert!(is_start(r"^++")); assert!(is_end(r"$++")); assert!(is_start(r"(^)+")); assert!(is_end(r"($)+")); assert!(is_start(r"$^")); assert!(is_start(r"$^")); assert!(is_start(r"$^|^$")); assert!(is_end(r"$^|^$")); assert!(is_start(r"\b^")); assert!(is_end(r"$\b")); assert!(is_start(r"^(?m:^)")); assert!(is_end(r"(?m:$)$")); assert!(is_start(r"(?m:^)^")); assert!(is_end(r"$(?m:$)")); // Negative examples. assert!(!is_start(r"(?m)^")); assert!(!is_end(r"(?m)$")); assert!(!is_start(r"(?m:^$)|$^")); assert!(!is_end(r"(?m:^$)|$^")); assert!(!is_start(r"$^|(?m:^$)")); assert!(!is_end(r"$^|(?m:^$)")); assert!(!is_start(r"a^")); assert!(!is_start(r"$a")); assert!(!is_end(r"a^")); assert!(!is_end(r"$a")); assert!(!is_start(r"^foo|bar")); assert!(!is_end(r"foo|bar$")); assert!(!is_start(r"^*")); assert!(!is_end(r"$*")); assert!(!is_start(r"^*+")); assert!(!is_end(r"$*+")); assert!(!is_start(r"^+*")); assert!(!is_end(r"$+*")); assert!(!is_start(r"(^)*")); assert!(!is_end(r"($)*")); } #[test] fn analysis_is_any_anchored() { let is_start = |p| props(p).look_set().contains(Look::Start); let is_end = |p| props(p).look_set().contains(Look::End); // Positive examples. assert!(is_start(r"^")); assert!(is_end(r"$")); assert!(is_start(r"\A")); assert!(is_end(r"\z")); // Negative examples. assert!(!is_start(r"(?m)^")); assert!(!is_end(r"(?m)$")); assert!(!is_start(r"$")); assert!(!is_end(r"^")); } #[test] fn analysis_can_empty() { // Positive examples. 
let assert_empty = |p| assert_eq!(Some(0), props_bytes(p).minimum_len()); assert_empty(r""); assert_empty(r"()"); assert_empty(r"()*"); assert_empty(r"()+"); assert_empty(r"()?"); assert_empty(r"a*"); assert_empty(r"a?"); assert_empty(r"a{0}"); assert_empty(r"a{0,}"); assert_empty(r"a{0,1}"); assert_empty(r"a{0,10}"); #[cfg(feature = "unicode-gencat")] assert_empty(r"\pL*"); assert_empty(r"a*|b"); assert_empty(r"b|a*"); assert_empty(r"a|"); assert_empty(r"|a"); assert_empty(r"a||b"); assert_empty(r"a*a?(abcd)*"); assert_empty(r"^"); assert_empty(r"$"); assert_empty(r"(?m)^"); assert_empty(r"(?m)$"); assert_empty(r"\A"); assert_empty(r"\z"); assert_empty(r"\B"); assert_empty(r"(?-u)\B"); assert_empty(r"\b"); assert_empty(r"(?-u)\b"); // Negative examples. let assert_non_empty = |p| assert_ne!(Some(0), props_bytes(p).minimum_len()); assert_non_empty(r"a+"); assert_non_empty(r"a{1}"); assert_non_empty(r"a{1,}"); assert_non_empty(r"a{1,2}"); assert_non_empty(r"a{1,10}"); assert_non_empty(r"b|a"); assert_non_empty(r"a*a+(abcd)*"); #[cfg(feature = "unicode-gencat")] assert_non_empty(r"\P{any}"); assert_non_empty(r"[a--a]"); assert_non_empty(r"[a&&b]"); } #[test] fn analysis_is_literal() { // Positive examples. assert!(props(r"a").is_literal()); assert!(props(r"ab").is_literal()); assert!(props(r"abc").is_literal()); assert!(props(r"(?m)abc").is_literal()); assert!(props(r"(?:a)").is_literal()); assert!(props(r"foo(?:a)").is_literal()); assert!(props(r"(?:a)foo").is_literal()); assert!(props(r"[a]").is_literal()); // Negative examples. assert!(!props(r"").is_literal()); assert!(!props(r"^").is_literal()); assert!(!props(r"a|b").is_literal()); assert!(!props(r"(a)").is_literal()); assert!(!props(r"a+").is_literal()); assert!(!props(r"foo(a)").is_literal()); assert!(!props(r"(a)foo").is_literal()); assert!(!props(r"[ab]").is_literal()); } #[test] fn analysis_is_alternation_literal() { // Positive examples. 
assert!(props(r"a").is_alternation_literal()); assert!(props(r"ab").is_alternation_literal()); assert!(props(r"abc").is_alternation_literal()); assert!(props(r"(?m)abc").is_alternation_literal()); assert!(props(r"foo|bar").is_alternation_literal()); assert!(props(r"foo|bar|baz").is_alternation_literal()); assert!(props(r"[a]").is_alternation_literal()); assert!(props(r"(?:ab)|cd").is_alternation_literal()); assert!(props(r"ab|(?:cd)").is_alternation_literal()); // Negative examples. assert!(!props(r"").is_alternation_literal()); assert!(!props(r"^").is_alternation_literal()); assert!(!props(r"(a)").is_alternation_literal()); assert!(!props(r"a+").is_alternation_literal()); assert!(!props(r"foo(a)").is_alternation_literal()); assert!(!props(r"(a)foo").is_alternation_literal()); assert!(!props(r"[ab]").is_alternation_literal()); assert!(!props(r"[ab]|b").is_alternation_literal()); assert!(!props(r"a|[ab]").is_alternation_literal()); assert!(!props(r"(a)|b").is_alternation_literal()); assert!(!props(r"a|(b)").is_alternation_literal()); assert!(!props(r"a|b").is_alternation_literal()); assert!(!props(r"a|b|c").is_alternation_literal()); assert!(!props(r"[a]|b").is_alternation_literal()); assert!(!props(r"a|[b]").is_alternation_literal()); assert!(!props(r"(?:a)|b").is_alternation_literal()); assert!(!props(r"a|(?:b)").is_alternation_literal()); assert!(!props(r"(?:z|xx)@|xx").is_alternation_literal()); } // This tests that the smart Hir::repetition constructors does some basic // simplifications. #[test] fn smart_repetition() { assert_eq!(t(r"a{0}"), Hir::empty()); assert_eq!(t(r"a{1}"), hir_lit("a")); assert_eq!(t(r"\B{32111}"), hir_look(hir::Look::WordUnicodeNegate)); } // This tests that the smart Hir::concat constructor simplifies the given // exprs in a way we expect. 
#[test] fn smart_concat() { assert_eq!(t(""), Hir::empty()); assert_eq!(t("(?:)"), Hir::empty()); assert_eq!(t("abc"), hir_lit("abc")); assert_eq!(t("(?:foo)(?:bar)"), hir_lit("foobar")); assert_eq!(t("quux(?:foo)(?:bar)baz"), hir_lit("quuxfoobarbaz")); assert_eq!( t("foo(?:bar^baz)quux"), hir_cat(vec![ hir_lit("foobar"), hir_look(hir::Look::Start), hir_lit("bazquux"), ]) ); assert_eq!( t("foo(?:ba(?:r^b)az)quux"), hir_cat(vec![ hir_lit("foobar"), hir_look(hir::Look::Start), hir_lit("bazquux"), ]) ); } // This tests that the smart Hir::alternation constructor simplifies the // given exprs in a way we expect. #[test] fn smart_alternation() { assert_eq!( t("(?:foo)|(?:bar)"), hir_alt(vec![hir_lit("foo"), hir_lit("bar")]) ); assert_eq!( t("quux|(?:abc|def|xyz)|baz"), hir_alt(vec![ hir_lit("quux"), hir_lit("abc"), hir_lit("def"), hir_lit("xyz"), hir_lit("baz"), ]) ); assert_eq!( t("quux|(?:abc|(?:def|mno)|xyz)|baz"), hir_alt(vec![ hir_lit("quux"), hir_lit("abc"), hir_lit("def"), hir_lit("mno"), hir_lit("xyz"), hir_lit("baz"), ]) ); assert_eq!( t("a|b|c|d|e|f|x|y|z"), hir_uclass(&[('a', 'f'), ('x', 'z')]), ); // Tests that we lift common prefixes out of an alternation. assert_eq!( t("[A-Z]foo|[A-Z]quux"), hir_cat(vec![ hir_uclass(&[('A', 'Z')]), hir_alt(vec![hir_lit("foo"), hir_lit("quux")]), ]), ); assert_eq!( t("[A-Z][A-Z]|[A-Z]quux"), hir_cat(vec![ hir_uclass(&[('A', 'Z')]), hir_alt(vec![hir_uclass(&[('A', 'Z')]), hir_lit("quux")]), ]), ); assert_eq!( t("[A-Z][A-Z]|[A-Z][A-Z]quux"), hir_cat(vec![ hir_uclass(&[('A', 'Z')]), hir_uclass(&[('A', 'Z')]), hir_alt(vec![Hir::empty(), hir_lit("quux")]), ]), ); assert_eq!( t("[A-Z]foo|[A-Z]foobar"), hir_cat(vec![ hir_uclass(&[('A', 'Z')]), hir_alt(vec![hir_lit("foo"), hir_lit("foobar")]), ]), ); } }