From 698f8c2f01ea549d77d7dc3338a12e04c11057b9 Mon Sep 17 00:00:00 2001
From: Daniel Baumann
Date: Wed, 17 Apr 2024 14:02:58 +0200
Subject: Adding upstream version 1.64.0+dfsg1.

Signed-off-by: Daniel Baumann
---
 compiler/rustc_expand/src/mbe/macro_parser.rs | 704 ++++++++++++++++++++++++++
 1 file changed, 704 insertions(+)
 create mode 100644 compiler/rustc_expand/src/mbe/macro_parser.rs

diff --git a/compiler/rustc_expand/src/mbe/macro_parser.rs b/compiler/rustc_expand/src/mbe/macro_parser.rs
new file mode 100644
index 000000000..4fa91dfea
--- /dev/null
+++ b/compiler/rustc_expand/src/mbe/macro_parser.rs
@@ -0,0 +1,704 @@
+//! This is an NFA-based parser, which calls out to the main Rust parser for named non-terminals
+//! (which it commits to fully when it hits one in a grammar). There's a set of current NFA threads
+//! and a set of next ones. Instead of NTs, we have a special case for Kleene star. The big-O, in
+//! pathological cases, is worse than traditional use of NFA or Earley parsing, but it's an easier
+//! fit for Macro-by-Example-style rules.
+//!
+//! (In order to prevent the pathological case, we'd need to lazily construct the resulting
+//! `NamedMatch`es at the very end. It'd be a pain, and require more memory to keep around old
+//! matcher positions, but it would also save overhead)
+//!
+//! We don't say this parser uses the Earley algorithm, because it's unnecessarily inaccurate.
+//! The macro parser restricts itself to the features of finite state automata. Earley parsers
+//! can be described as an extension of NFAs with completion rules, prediction rules, and recursion.
+//!
+//! Quick intro to how the parser works:
+//!
+//! A "matcher position" (a.k.a. "position" or "mp") is a dot in the middle of a matcher, usually
+//! written as a `·`. For example `· a $( a )* a b` is one, as is `a $( · a )* a b`.
+//!
+//! The parser walks through the input a token at a time, maintaining a list
+//! of threads consistent with the current position in the input string: `cur_mps`.
+//!
+//! As it processes them, it fills up `eof_mps` with threads that would be valid if
+//! the macro invocation is now over, `bb_mps` with threads that are waiting on
+//! a Rust non-terminal like `$e:expr`, and `next_mps` with threads that are waiting
+//! on a particular token. Most of the logic concerns moving the · through the
+//! repetitions indicated by Kleene stars. The rules for moving the · without
+//! consuming any input are called epsilon transitions. It only advances or calls
+//! out to the real Rust parser when no `cur_mps` threads remain.
+//!
+//! Example:
+//!
+//! ```text, ignore
+//! Start parsing a a a a b against [· a $( a )* a b].
+//!
+//! Remaining input: a a a a b
+//! next: [· a $( a )* a b]
+//!
+//! - - - Advance over an a. - - -
+//!
+//! Remaining input: a a a b
+//! cur: [a · $( a )* a b]
+//! Descend/Skip (first position).
+//! next: [a $( · a )* a b] [a $( a )* · a b].
+//!
+//! - - - Advance over an a. - - -
+//!
+//! Remaining input: a a b
+//! cur: [a $( a · )* a b] [a $( a )* a · b]
+//! Follow epsilon transition: Finish/Repeat (first position)
+//! next: [a $( a )* · a b] [a $( · a )* a b] [a $( a )* a · b]
+//!
+//! - - - Advance over an a. - - - (this looks exactly like the last step)
+//!
+//! Remaining input: a b
+//! cur: [a $( a · )* a b] [a $( a )* a · b]
+//! Follow epsilon transition: Finish/Repeat (first position)
+//! next: [a $( a )* · a b] [a $( · a )* a b] [a $( a )* a · b]
+//!
+//! - - - Advance over an a. - - - (this looks exactly like the last step)
+//!
+//! Remaining input: b
+//! cur: [a $( a · )* a b] [a $( a )* a · b]
+//! Follow epsilon transition: Finish/Repeat (first position)
+//! next: [a $( a )* · a b] [a $( · a )* a b] [a $( a )* a · b]
+//!
+//! - - - Advance over a b. - - -
+//!
+//! Remaining input: ''
+//! eof: [a $( a )* a b ·]
+//! ```
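// Editorial sketch, not part of the upstream file: the trace in the doc comment above
// corresponds to an ordinary `macro_rules!` matcher built from literal `a`/`b` tokens.
// The macro name `trace_demo` is made up for the example.
macro_rules! trace_demo {
    (a $( a )* a b) => { true };
}

fn trace_demo_usage() -> bool {
    // Matches the input `a a a a b` walked through step by step above.
    trace_demo!(a a a a b)
}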
+
+pub(crate) use NamedMatch::*;
+pub(crate) use ParseResult::*;
+
+use crate::mbe::{KleeneOp, TokenTree};
+
+use rustc_ast::token::{self, DocComment, Nonterminal, NonterminalKind, Token};
+use rustc_lint_defs::pluralize;
+use rustc_parse::parser::{NtOrTt, Parser};
+use rustc_span::symbol::MacroRulesNormalizedIdent;
+use rustc_span::Span;
+
+use rustc_data_structures::fx::FxHashMap;
+use rustc_data_structures::sync::Lrc;
+use rustc_span::symbol::Ident;
+use std::borrow::Cow;
+use std::collections::hash_map::Entry::{Occupied, Vacant};
+
+/// A unit within a matcher that a `MatcherPos` can refer to. Similar to (and derived from)
+/// `mbe::TokenTree`, but designed specifically for fast and easy traversal during matching.
+/// Notable differences to `mbe::TokenTree`:
+/// - It is non-recursive, i.e. there is no nesting.
+/// - The end pieces of each sequence (the separator, if present, and the Kleene op) are
+///   represented explicitly, as is the very end of the matcher.
+///
+/// This means a matcher can be represented by `&[MatcherLoc]`, and traversal mostly involves
+/// simply incrementing the current matcher position index by one.
+pub(super) enum MatcherLoc {
+    Token {
+        token: Token,
+    },
+    Delimited,
+    Sequence {
+        op: KleeneOp,
+        num_metavar_decls: usize,
+        idx_first_after: usize,
+        next_metavar: usize,
+        seq_depth: usize,
+    },
+    SequenceKleeneOpNoSep {
+        op: KleeneOp,
+        idx_first: usize,
+    },
+    SequenceSep {
+        separator: Token,
+    },
+    SequenceKleeneOpAfterSep {
+        idx_first: usize,
+    },
+    MetaVarDecl {
+        span: Span,
+        bind: Ident,
+        kind: Option<NonterminalKind>,
+        next_metavar: usize,
+        seq_depth: usize,
+    },
+    Eof,
+}
+
+pub(super) fn compute_locs(matcher: &[TokenTree]) -> Vec<MatcherLoc> {
+    fn inner(
+        tts: &[TokenTree],
+        locs: &mut Vec<MatcherLoc>,
+        next_metavar: &mut usize,
+        seq_depth: usize,
+    ) {
+        for tt in tts {
+            match tt {
+                TokenTree::Token(token) => {
+                    locs.push(MatcherLoc::Token { token: token.clone() });
+                }
+                TokenTree::Delimited(span, delimited) => {
+                    let open_token = Token::new(token::OpenDelim(delimited.delim), span.open);
+                    let close_token = Token::new(token::CloseDelim(delimited.delim), span.close);
+
+                    locs.push(MatcherLoc::Delimited);
+                    locs.push(MatcherLoc::Token { token: open_token });
+                    inner(&delimited.tts, locs, next_metavar, seq_depth);
+                    locs.push(MatcherLoc::Token { token: close_token });
+                }
+                TokenTree::Sequence(_, seq) => {
+                    // We can't determine `idx_first_after` and construct the final
+                    // `MatcherLoc::Sequence` until after `inner()` is called and the sequence end
+                    // pieces are processed. So we push a dummy value (`Eof` is cheapest to
+                    // construct) now, and overwrite it with the proper value below.
+                    let dummy = MatcherLoc::Eof;
+                    locs.push(dummy);
+
+                    let next_metavar_orig = *next_metavar;
+                    let op = seq.kleene.op;
+                    let idx_first = locs.len();
+                    let idx_seq = idx_first - 1;
+                    inner(&seq.tts, locs, next_metavar, seq_depth + 1);
+
+                    if let Some(separator) = &seq.separator {
+                        locs.push(MatcherLoc::SequenceSep { separator: separator.clone() });
+                        locs.push(MatcherLoc::SequenceKleeneOpAfterSep { idx_first });
+                    } else {
+                        locs.push(MatcherLoc::SequenceKleeneOpNoSep { op, idx_first });
+                    }
+
+                    // Overwrite the dummy value pushed above with the proper value.
+                    locs[idx_seq] = MatcherLoc::Sequence {
+                        op,
+                        num_metavar_decls: seq.num_captures,
+                        idx_first_after: locs.len(),
+                        next_metavar: next_metavar_orig,
+                        seq_depth,
+                    };
+                }
+                &TokenTree::MetaVarDecl(span, bind, kind) => {
+                    locs.push(MatcherLoc::MetaVarDecl {
+                        span,
+                        bind,
+                        kind,
+                        next_metavar: *next_metavar,
+                        seq_depth,
+                    });
+                    *next_metavar += 1;
+                }
+                TokenTree::MetaVar(..) | TokenTree::MetaVarExpr(..) => unreachable!(),
+            }
+        }
+    }
+
+    let mut locs = vec![];
+    let mut next_metavar = 0;
+    inner(matcher, &mut locs, &mut next_metavar, /* seq_depth */ 0);
+
+    // A final entry is needed for eof.
+    locs.push(MatcherLoc::Eof);
+
+    locs
+}
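// Editorial sketch, not part of the upstream file: roughly what `compute_locs` produces
// for a matcher like `a $( b )* c`, using a simplified stand-in enum (`DemoLoc` and the
// function name are made up). `idx_first_after` on the `Sequence` entry points just past
// the repetition, and the Kleene-op entry points back to the first loc inside it.
#[derive(Debug)]
enum DemoLoc {
    Token(char),
    Sequence { idx_first_after: usize },
    SequenceKleeneOpNoSep { idx_first: usize },
    Eof,
}

fn demo_locs_for_a_star_b_c() -> Vec<DemoLoc> {
    vec![
        DemoLoc::Token('a'),                             // 0
        DemoLoc::Sequence { idx_first_after: 4 },        // 1: header for `$( ... )*`
        DemoLoc::Token('b'),                             // 2: body of the repetition
        DemoLoc::SequenceKleeneOpNoSep { idx_first: 2 }, // 3: loop back to index 2
        DemoLoc::Token('c'),                             // 4
        DemoLoc::Eof,                                    // 5
    ]
}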
+
+/// A single matcher position, representing the state of matching.
+struct MatcherPos {
+    /// The index into `TtParser::locs`, which represents the "dot".
+    idx: usize,
+
+    /// The matches made against metavar decls so far. On a successful match, this vector ends up
+    /// with one element per metavar decl in the matcher. Each element records token trees matched
+    /// against the relevant metavar by the black box parser. An element will be a `MatchedSeq` if
+    /// the corresponding metavar decl is within a sequence.
+    ///
+    /// It is critical to performance that this is an `Lrc`, because it gets cloned frequently when
+    /// processing sequences. Mostly for sequence-ending possibilities that must be tried but end
+    /// up failing.
+    matches: Lrc<Vec<NamedMatch>>,
+}
+
+// This type is used a lot. Make sure it doesn't unintentionally get bigger.
+#[cfg(all(target_arch = "x86_64", target_pointer_width = "64"))]
+rustc_data_structures::static_assert_size!(MatcherPos, 16);
+
+impl MatcherPos {
+    /// Adds `m` as a named match for the `metavar_idx`-th metavar. There are only two call sites,
+    /// and both are hot enough to be always worth inlining.
+    #[inline(always)]
+    fn push_match(&mut self, metavar_idx: usize, seq_depth: usize, m: NamedMatch) {
+        let matches = Lrc::make_mut(&mut self.matches);
+        match seq_depth {
+            0 => {
+                // We are not within a sequence. Just append `m`.
+                assert_eq!(metavar_idx, matches.len());
+                matches.push(m);
+            }
+            _ => {
+                // We are within a sequence. Find the final `MatchedSeq` at the appropriate depth
+                // and append `m` to its vector.
+                let mut curr = &mut matches[metavar_idx];
+                for _ in 0..seq_depth - 1 {
+                    match curr {
+                        MatchedSeq(seq) => curr = seq.last_mut().unwrap(),
+                        _ => unreachable!(),
+                    }
+                }
+                match curr {
+                    MatchedSeq(seq) => seq.push(m),
+                    _ => unreachable!(),
+                }
+            }
+        }
+    }
+}
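// Editorial sketch, not part of the upstream file: the nested-sequence walk that
// `push_match` performs, reduced to a self-contained toy. `DemoMatch` and
// `demo_push_at_depth` are made-up names; the real code does the same descent over
// `MatchedSeq`s before pushing.
#[derive(Debug)]
enum DemoMatch {
    Seq(Vec<DemoMatch>),
    Leaf(&'static str),
}

fn demo_push_at_depth(root: &mut DemoMatch, seq_depth: usize, m: DemoMatch) {
    // Descend `seq_depth - 1` levels, always into the most recent sequence, then push.
    let mut curr = root;
    for _ in 0..seq_depth - 1 {
        curr = match curr {
            DemoMatch::Seq(seq) => seq.last_mut().unwrap(),
            DemoMatch::Leaf(_) => unreachable!(),
        };
    }
    match curr {
        DemoMatch::Seq(seq) => seq.push(m),
        DemoMatch::Leaf(_) => unreachable!(),
    }
}

fn demo_push_usage() {
    // Two nesting levels of sequences, as in a matcher like `$( $( $x:ident )* )*`.
    let mut root = DemoMatch::Seq(vec![DemoMatch::Seq(vec![])]);
    demo_push_at_depth(&mut root, 2, DemoMatch::Leaf("a"));
    demo_push_at_depth(&mut root, 2, DemoMatch::Leaf("b"));
    // `root` is now Seq([Seq([Leaf("a"), Leaf("b")])]).
}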
+
+enum EofMatcherPositions {
+    None,
+    One(MatcherPos),
+    Multiple,
+}
+
+/// Represents the possible results of an attempted parse.
+pub(crate) enum ParseResult<T> {
+    /// Parsed successfully.
+    Success(T),
+    /// Arm failed to match. If the second parameter is `token::Eof`, it indicates an unexpected
+    /// end of macro invocation. Otherwise, it indicates that no rules expected the given token.
+    Failure(Token, &'static str),
+    /// Fatal error (malformed macro?). Abort compilation.
+    Error(rustc_span::Span, String),
+    ErrorReported,
+}
+
+/// A `ParseResult` where the `Success` variant contains a mapping of
+/// `MacroRulesNormalizedIdent`s to `NamedMatch`es. This represents the mapping
+/// of metavars to the token trees they bind to.
+pub(crate) type NamedParseResult = ParseResult<FxHashMap<MacroRulesNormalizedIdent, NamedMatch>>;
+
+/// Count how many metavar declarations are in `matcher`.
+pub(super) fn count_metavar_decls(matcher: &[TokenTree]) -> usize {
+    matcher
+        .iter()
+        .map(|tt| match tt {
+            TokenTree::MetaVarDecl(..) => 1,
+            TokenTree::Sequence(_, seq) => seq.num_captures,
+            TokenTree::Delimited(_, delim) => count_metavar_decls(&delim.tts),
+            TokenTree::Token(..) => 0,
+            TokenTree::MetaVar(..) | TokenTree::MetaVarExpr(..) => unreachable!(),
+        })
+        .sum()
+}
+
+/// `NamedMatch` is a pattern-match result for a single metavar. All
+/// `MatchedNonterminal`s in the `NamedMatch` have the same non-terminal type
+/// (expr, item, etc).
+///
+/// The in-memory structure of a particular `NamedMatch` represents the match
+/// that occurred when a particular subset of a matcher was applied to a
+/// particular token tree.
+///
+/// The width of each `MatchedSeq` in the `NamedMatch`, and the identity of
+/// the `MatchedNtNonTts`s, will depend on the token tree it was applied
+/// to: each `MatchedSeq` corresponds to a single repetition in the originating
+/// token tree. The depth of the `NamedMatch` structure will therefore depend
+/// only on the nesting depth of repetitions in the originating token tree it
+/// was derived from.
+///
+/// In layperson's terms: `NamedMatch` will form a tree representing nested matches of a particular
+/// meta variable. For example, if we are matching the following macro against the following
+/// invocation...
+///
+/// ```rust
+/// macro_rules! foo {
+///     ($($($x:ident),+);+) => {}
+/// }
+///
+/// foo!(a, b, c, d; a, b, c, d, e);
+/// ```
+///
+/// Then, the tree will have the following shape:
+///
+/// ```ignore (private-internal)
+/// # use NamedMatch::*;
+/// MatchedSeq([
+///     MatchedSeq([
+///         MatchedNonterminal(a),
+///         MatchedNonterminal(b),
+///         MatchedNonterminal(c),
+///         MatchedNonterminal(d),
+///     ]),
+///     MatchedSeq([
+///         MatchedNonterminal(a),
+///         MatchedNonterminal(b),
+///         MatchedNonterminal(c),
+///         MatchedNonterminal(d),
+///         MatchedNonterminal(e),
+///     ])
+/// ])
+/// ```
+#[derive(Debug, Clone)]
+pub(crate) enum NamedMatch {
+    MatchedSeq(Vec<NamedMatch>),
+
+    // A metavar match of type `tt`.
+    MatchedTokenTree(rustc_ast::tokenstream::TokenTree),
+
+    // A metavar match of any type other than `tt`.
+    MatchedNonterminal(Lrc<Nonterminal>),
+}
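// Editorial sketch, not part of the upstream file: the nested-repetition macro from the
// `NamedMatch` doc comment really does accept the invocation shown there; each inner
// `$( ... ),+` level corresponds to one `MatchedSeq` layer, and every matched identifier
// becomes one leaf. The names `named_match_demo`/`named_match_demo_usage` are made up.
macro_rules! named_match_demo {
    ($($($x:ident),+);+) => {
        // One `stringify!($x)` per leaf, flattened into a single array.
        [$($(stringify!($x)),+),+].len()
    };
}

fn named_match_demo_usage() -> usize {
    // 4 + 5 leaves, mirroring the two `MatchedSeq`s in the doc comment's tree.
    named_match_demo!(a, b, c, d; a, b, c, d, e)
}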
+
+/// Performs a token equality check, ignoring syntax context (that is, an unhygienic comparison)
+fn token_name_eq(t1: &Token, t2: &Token) -> bool {
+    if let (Some((ident1, is_raw1)), Some((ident2, is_raw2))) = (t1.ident(), t2.ident()) {
+        ident1.name == ident2.name && is_raw1 == is_raw2
+    } else if let (Some(ident1), Some(ident2)) = (t1.lifetime(), t2.lifetime()) {
+        ident1.name == ident2.name
+    } else {
+        t1.kind == t2.kind
+    }
+}
+
+// Note: the vectors could be created and dropped within `parse_tt`, but to avoid excess
+// allocations we have a single vector for each kind that is cleared and reused repeatedly.
+pub struct TtParser {
+    macro_name: Ident,
+
+    /// The set of current mps to be processed. This should be empty by the end of a successful
+    /// execution of `parse_tt_inner`.
+    cur_mps: Vec<MatcherPos>,
+
+    /// The set of newly generated mps. These are used to replenish `cur_mps` in the function
+    /// `parse_tt`.
+    next_mps: Vec<MatcherPos>,
+
+    /// The set of mps that are waiting for the black-box parser.
+    bb_mps: Vec<MatcherPos>,
+
+    /// Pre-allocate an empty match array, so it can be cloned cheaply for macros with many rules
+    /// that have no metavars.
+    empty_matches: Lrc<Vec<NamedMatch>>,
+}
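// Editorial sketch, not part of the upstream file: the `Sequence` handling in
// `parse_tt_inner` below also tries skipping a `*`/`?` repetition entirely, which is why
// zero occurrences are accepted. A made-up macro that exercises this behaviour:
macro_rules! zero_or_more_demo {
    (start $( again )* end) => { true };
}

fn zero_or_more_demo_usage() -> (bool, bool) {
    (
        zero_or_more_demo!(start end),             // zero repetitions: the sequence is skipped
        zero_or_more_demo!(start again again end), // two repetitions: the sequence is entered
    )
}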
+
+impl TtParser {
+    pub(super) fn new(macro_name: Ident) -> TtParser {
+        TtParser {
+            macro_name,
+            cur_mps: vec![],
+            next_mps: vec![],
+            bb_mps: vec![],
+            empty_matches: Lrc::new(vec![]),
+        }
+    }
+
+    /// Process the matcher positions of `cur_mps` until it is empty. In the process, this will
+    /// produce more mps in `next_mps` and `bb_mps`.
+    ///
+    /// # Returns
+    ///
+    /// `Some(result)` if everything is finished, `None` otherwise. Note that matches are kept
+    /// track of through the mps generated.
+    fn parse_tt_inner(
+        &mut self,
+        matcher: &[MatcherLoc],
+        token: &Token,
+    ) -> Option<NamedParseResult> {
+        // Matcher positions that would be valid if the macro invocation was over now. Only
+        // modified if `token == Eof`.
+        let mut eof_mps = EofMatcherPositions::None;
+
+        while let Some(mut mp) = self.cur_mps.pop() {
+            match &matcher[mp.idx] {
+                MatcherLoc::Token { token: t } => {
+                    // If it's a doc comment, we just ignore it and move on to the next tt in the
+                    // matcher. This is a bug, but #95267 showed that existing programs rely on
+                    // this behaviour, and changing it would require some care and a transition
+                    // period.
+                    //
+                    // If the token matches, we can just advance the parser.
+                    //
+                    // Otherwise, this match has failed, there is nothing to do, and hopefully
+                    // another mp in `cur_mps` will match.
+                    if matches!(t, Token { kind: DocComment(..), .. }) {
+                        mp.idx += 1;
+                        self.cur_mps.push(mp);
+                    } else if token_name_eq(&t, token) {
+                        mp.idx += 1;
+                        self.next_mps.push(mp);
+                    }
+                }
+                MatcherLoc::Delimited => {
+                    // Entering the delimiter is trivial.
+                    mp.idx += 1;
+                    self.cur_mps.push(mp);
+                }
+                &MatcherLoc::Sequence {
+                    op,
+                    num_metavar_decls,
+                    idx_first_after,
+                    next_metavar,
+                    seq_depth,
+                } => {
+                    // Install an empty vec for each metavar within the sequence.
+                    for metavar_idx in next_metavar..next_metavar + num_metavar_decls {
+                        mp.push_match(metavar_idx, seq_depth, MatchedSeq(vec![]));
+                    }
+
+                    if op == KleeneOp::ZeroOrMore || op == KleeneOp::ZeroOrOne {
+                        // Try zero matches of this sequence, by skipping over it.
+                        self.cur_mps.push(MatcherPos {
+                            idx: idx_first_after,
+                            matches: mp.matches.clone(), // a cheap clone
+                        });
+                    }
+
+                    // Try one or more matches of this sequence, by entering it.
+                    mp.idx += 1;
+                    self.cur_mps.push(mp);
+                }
+                &MatcherLoc::SequenceKleeneOpNoSep { op, idx_first } => {
+                    // We are past the end of a sequence with no separator. Try ending the
+                    // sequence. If that's not possible, `ending_mp` will fail quietly when it is
+                    // processed next time around the loop.
+                    let ending_mp = MatcherPos {
+                        idx: mp.idx + 1, // +1 skips the Kleene op
+                        matches: mp.matches.clone(), // a cheap clone
+                    };
+                    self.cur_mps.push(ending_mp);
+
+                    if op != KleeneOp::ZeroOrOne {
+                        // Try another repetition.
+                        mp.idx = idx_first;
+                        self.cur_mps.push(mp);
+                    }
+                }
+                MatcherLoc::SequenceSep { separator } => {
+                    // We are past the end of a sequence with a separator but we haven't seen the
+                    // separator yet. Try ending the sequence. If that's not possible, `ending_mp`
+                    // will fail quietly when it is processed next time around the loop.
+                    let ending_mp = MatcherPos {
+                        idx: mp.idx + 2, // +2 skips the separator and the Kleene op
+                        matches: mp.matches.clone(), // a cheap clone
+                    };
+                    self.cur_mps.push(ending_mp);
+
+                    if token_name_eq(token, separator) {
+                        // The separator matches the current token. Advance past it.
+                        mp.idx += 1;
+                        self.next_mps.push(mp);
+                    }
+                }
+                &MatcherLoc::SequenceKleeneOpAfterSep { idx_first } => {
+                    // We are past the sequence separator. This can't be a `?` Kleene op, because
+                    // they don't permit separators. Try another repetition.
+                    mp.idx = idx_first;
+                    self.cur_mps.push(mp);
+                }
+                &MatcherLoc::MetaVarDecl { span, kind, .. } => {
+                    // Built-in nonterminals never start with these tokens, so we can eliminate
+                    // them from consideration. We use the span of the metavariable declaration
+                    // to determine any edition-specific matching behavior for non-terminals.
+                    if let Some(kind) = kind {
+                        if Parser::nonterminal_may_begin_with(kind, token) {
+                            self.bb_mps.push(mp);
+                        }
+                    } else {
+                        // E.g. `$e` instead of `$e:expr`, reported as a hard error if actually used.
+                        // Both this check and the one in `nameize` are necessary, surprisingly.
+                        return Some(Error(span, "missing fragment specifier".to_string()));
+                    }
+                }
+                MatcherLoc::Eof => {
+                    // We are past the matcher's end, and not in a sequence. Try to end things.
+                    debug_assert_eq!(mp.idx, matcher.len() - 1);
+                    if *token == token::Eof {
+                        eof_mps = match eof_mps {
+                            EofMatcherPositions::None => EofMatcherPositions::One(mp),
+                            EofMatcherPositions::One(_) | EofMatcherPositions::Multiple => {
+                                EofMatcherPositions::Multiple
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        // If we reached the end of input, check that there is EXACTLY ONE possible matcher.
+        // Otherwise, either the parse is ambiguous (which is an error) or there is a syntax error.
+        if *token == token::Eof {
+            Some(match eof_mps {
+                EofMatcherPositions::One(mut eof_mp) => {
+                    // Need to take ownership of the matches from within the `Lrc`.
+                    Lrc::make_mut(&mut eof_mp.matches);
+                    let matches = Lrc::try_unwrap(eof_mp.matches).unwrap().into_iter();
+                    self.nameize(matcher, matches)
+                }
+                EofMatcherPositions::Multiple => {
+                    Error(token.span, "ambiguity: multiple successful parses".to_string())
+                }
+                EofMatcherPositions::None => Failure(
+                    Token::new(
+                        token::Eof,
+                        if token.span.is_dummy() { token.span } else { token.span.shrink_to_hi() },
+                    ),
+                    "missing tokens in macro arguments",
+                ),
+            })
+        } else {
+            None
+        }
+    }
+
+    /// Match the token stream from `parser` against `matcher`.
+    pub(super) fn parse_tt(
+        &mut self,
+        parser: &mut Cow<'_, Parser<'_>>,
+        matcher: &[MatcherLoc],
+    ) -> NamedParseResult {
+        // A queue of possible matcher positions. We initialize it with the matcher position in
+        // which the "dot" is before the first token of the first token tree in `matcher`.
+        // `parse_tt_inner` then processes all of these possible matcher positions and produces
+        // possible next positions into `next_mps`. After some post-processing, the contents of
+        // `next_mps` replenish `cur_mps` and we start over again.
+        self.cur_mps.clear();
+        self.cur_mps.push(MatcherPos { idx: 0, matches: self.empty_matches.clone() });
+
+        loop {
+            self.next_mps.clear();
+            self.bb_mps.clear();
+
+            // Process `cur_mps` until either we have finished the input or we need to get some
+            // parsing from the black-box parser done.
+            if let Some(res) = self.parse_tt_inner(matcher, &parser.token) {
+                return res;
+            }
+
+            // `parse_tt_inner` handled all of `cur_mps`, so it's empty.
+            assert!(self.cur_mps.is_empty());
+
+            // Error messages here could be improved with links to original rules.
+            match (self.next_mps.len(), self.bb_mps.len()) {
+                (0, 0) => {
+                    // There are no possible next positions AND we aren't waiting for the black-box
+                    // parser: syntax error.
+                    return Failure(
+                        parser.token.clone(),
+                        "no rules expected this token in macro call",
+                    );
+                }
+
+                (_, 0) => {
+                    // Dump all possible `next_mps` into `cur_mps` for the next iteration. Then
+                    // process the next token.
+                    self.cur_mps.append(&mut self.next_mps);
+                    parser.to_mut().bump();
+                }
+
+                (0, 1) => {
+                    // We need to call the black-box parser to get some nonterminal.
+                    let mut mp = self.bb_mps.pop().unwrap();
+                    let loc = &matcher[mp.idx];
+                    if let &MatcherLoc::MetaVarDecl {
+                        span,
+                        kind: Some(kind),
+                        next_metavar,
+                        seq_depth,
+                        ..
+                    } = loc
+                    {
+                        // We use the span of the metavariable declaration to determine any
+                        // edition-specific matching behavior for non-terminals.
+                        let nt = match parser.to_mut().parse_nonterminal(kind) {
+                            Err(mut err) => {
+                                err.span_label(
+                                    span,
+                                    format!(
+                                        "while parsing argument for this `{kind}` macro fragment"
+                                    ),
+                                )
+                                .emit();
+                                return ErrorReported;
+                            }
+                            Ok(nt) => nt,
+                        };
+                        let m = match nt {
+                            NtOrTt::Nt(nt) => MatchedNonterminal(Lrc::new(nt)),
+                            NtOrTt::Tt(tt) => MatchedTokenTree(tt),
+                        };
+                        mp.push_match(next_metavar, seq_depth, m);
+                        mp.idx += 1;
+                    } else {
+                        unreachable!()
+                    }
+                    self.cur_mps.push(mp);
+                }
+
+                (_, _) => {
+                    // Too many possibilities!
+                    return self.ambiguity_error(matcher, parser.token.span);
+                }
+            }
+
+            assert!(!self.cur_mps.is_empty());
+        }
+    }
+
+    fn ambiguity_error(
+        &self,
+        matcher: &[MatcherLoc],
+        token_span: rustc_span::Span,
+    ) -> NamedParseResult {
+        let nts = self
+            .bb_mps
+            .iter()
+            .map(|mp| match &matcher[mp.idx] {
+                MatcherLoc::MetaVarDecl { bind, kind: Some(kind), .. } => {
+                    format!("{} ('{}')", kind, bind)
+                }
+                _ => unreachable!(),
+            })
+            .collect::<Vec<String>>()
+            .join(" or ");
+
+        Error(
+            token_span,
+            format!(
+                "local ambiguity when calling macro `{}`: multiple parsing options: {}",
+                self.macro_name,
+                match self.next_mps.len() {
+                    0 => format!("built-in NTs {}.", nts),
+                    n => format!("built-in NTs {} or {n} other option{s}.", nts, s = pluralize!(n)),
+                }
+            ),
+        )
+    }
+
+    fn nameize<I: Iterator<Item = NamedMatch>>(
+        &self,
+        matcher: &[MatcherLoc],
+        mut res: I,
+    ) -> NamedParseResult {
+        // Make sure that each metavar has _exactly one_ binding. If so, insert the binding into
+        // the `NamedParseResult`. Otherwise, it's an error.
+        let mut ret_val = FxHashMap::default();
+        for loc in matcher {
+            if let &MatcherLoc::MetaVarDecl { span, bind, kind, .. } = loc {
+                if kind.is_some() {
+                    match ret_val.entry(MacroRulesNormalizedIdent::new(bind)) {
+                        Vacant(spot) => spot.insert(res.next().unwrap()),
+                        Occupied(..) => {
+                            return Error(span, format!("duplicated bind name: {}", bind));
+                        }
+                    };
+                } else {
+                    // E.g. `$e` instead of `$e:expr`, reported as a hard error if actually used.
+                    // Both this check and the one in `parse_tt_inner` are necessary, surprisingly.
+                    return Error(span, "missing fragment specifier".to_string());
+                }
+            }
+        }
+        Success(ret_val)
+    }
+}
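// Editorial sketch, not part of the upstream file: separator handling as seen from the
// outside. `$( $x:ident ),*` only repeats when the separator token matches, and a trailing
// `$(,)?` (a ZeroOrOne sequence) is the usual way to allow an optional trailing comma.
// The macro name `sep_demo` is made up for the example.
macro_rules! sep_demo {
    ($( $x:ident ),* $(,)?) => { [$( stringify!($x) ),*].len() };
}

fn sep_demo_usage() -> (usize, usize) {
    (
        sep_demo!(a, b, c),  // separators consumed between repetitions
        sep_demo!(a, b, c,), // trailing comma matched by the `$(,)?` sequence
    )
}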
--
cgit v1.2.3