From 698f8c2f01ea549d77d7dc3338a12e04c11057b9 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Wed, 17 Apr 2024 14:02:58 +0200 Subject: Adding upstream version 1.64.0+dfsg1. Signed-off-by: Daniel Baumann --- compiler/rustc_ast/src/util/classify.rs | 52 ++++ compiler/rustc_ast/src/util/comments.rs | 255 ++++++++++++++++ compiler/rustc_ast/src/util/comments/tests.rs | 61 ++++ compiler/rustc_ast/src/util/literal.rs | 336 +++++++++++++++++++++ compiler/rustc_ast/src/util/parser.rs | 406 ++++++++++++++++++++++++++ compiler/rustc_ast/src/util/unicode.rs | 35 +++ 6 files changed, 1145 insertions(+) create mode 100644 compiler/rustc_ast/src/util/classify.rs create mode 100644 compiler/rustc_ast/src/util/comments.rs create mode 100644 compiler/rustc_ast/src/util/comments/tests.rs create mode 100644 compiler/rustc_ast/src/util/literal.rs create mode 100644 compiler/rustc_ast/src/util/parser.rs create mode 100644 compiler/rustc_ast/src/util/unicode.rs (limited to 'compiler/rustc_ast/src/util') diff --git a/compiler/rustc_ast/src/util/classify.rs b/compiler/rustc_ast/src/util/classify.rs new file mode 100644 index 000000000..6ea3db6d3 --- /dev/null +++ b/compiler/rustc_ast/src/util/classify.rs @@ -0,0 +1,52 @@ +//! Routines the parser uses to classify AST nodes + +// Predicates on exprs and stmts that the pretty-printer and parser use + +use crate::ast; + +/// Does this expression require a semicolon to be treated +/// as a statement? The negation of this: 'can this expression +/// be used as a statement without a semicolon' -- is used +/// as an early-bail-out in the parser so that, for instance, +/// if true {...} else {...} +/// |x| 5 +/// isn't parsed as (if true {...} else {...} | x) | 5 +pub fn expr_requires_semi_to_be_stmt(e: &ast::Expr) -> bool { + !matches!( + e.kind, + ast::ExprKind::If(..) + | ast::ExprKind::Match(..) + | ast::ExprKind::Block(..) + | ast::ExprKind::While(..) + | ast::ExprKind::Loop(..) + | ast::ExprKind::ForLoop(..) + | ast::ExprKind::TryBlock(..) + ) +} + +/// If an expression ends with `}`, returns the innermost expression ending in the `}` +pub fn expr_trailing_brace(mut expr: &ast::Expr) -> Option<&ast::Expr> { + use ast::ExprKind::*; + + loop { + match &expr.kind { + AddrOf(_, _, e) + | Assign(_, e, _) + | AssignOp(_, _, e) + | Binary(_, _, e) + | Box(e) + | Break(_, Some(e)) + | Closure(.., e, _) + | Let(_, e, _) + | Range(_, Some(e), _) + | Ret(Some(e)) + | Unary(_, e) + | Yield(Some(e)) => { + expr = e; + } + Async(..) | Block(..) | ForLoop(..) | If(..) | Loop(..) | Match(..) | Struct(..) + | TryBlock(..) | While(..) => break Some(expr), + _ => break None, + } + } +} diff --git a/compiler/rustc_ast/src/util/comments.rs b/compiler/rustc_ast/src/util/comments.rs new file mode 100644 index 000000000..c96474ccb --- /dev/null +++ b/compiler/rustc_ast/src/util/comments.rs @@ -0,0 +1,255 @@ +use crate::token::CommentKind; +use rustc_span::source_map::SourceMap; +use rustc_span::{BytePos, CharPos, FileName, Pos, Symbol}; + +#[cfg(test)] +mod tests; + +#[derive(Clone, Copy, PartialEq, Debug)] +pub enum CommentStyle { + /// No code on either side of each line of the comment + Isolated, + /// Code exists to the left of the comment + Trailing, + /// Code before /* foo */ and after the comment + Mixed, + /// Just a manual blank line "\n\n", for layout + BlankLine, +} + +#[derive(Clone)] +pub struct Comment { + pub style: CommentStyle, + pub lines: Vec, + pub pos: BytePos, +} + +/// A fast conservative estimate on whether the string can contain documentation links. +/// A pair of square brackets `[]` must exist in the string, but we only search for the +/// opening bracket because brackets always go in pairs in practice. +#[inline] +pub fn may_have_doc_links(s: &str) -> bool { + s.contains('[') +} + +/// Makes a doc string more presentable to users. +/// Used by rustdoc and perhaps other tools, but not by rustc. +pub fn beautify_doc_string(data: Symbol, kind: CommentKind) -> Symbol { + fn get_vertical_trim(lines: &[&str]) -> Option<(usize, usize)> { + let mut i = 0; + let mut j = lines.len(); + // first line of all-stars should be omitted + if !lines.is_empty() && lines[0].chars().all(|c| c == '*') { + i += 1; + } + + // like the first, a last line of all stars should be omitted + if j > i && !lines[j - 1].is_empty() && lines[j - 1].chars().all(|c| c == '*') { + j -= 1; + } + + if i != 0 || j != lines.len() { Some((i, j)) } else { None } + } + + fn get_horizontal_trim<'a>(lines: &'a [&str], kind: CommentKind) -> Option { + let mut i = usize::MAX; + let mut first = true; + + // In case we have doc comments like `/**` or `/*!`, we want to remove stars if they are + // present. However, we first need to strip the empty lines so they don't get in the middle + // when we try to compute the "horizontal trim". + let lines = if kind == CommentKind::Block { + // Whatever happens, we skip the first line. + let mut i = lines + .get(0) + .map(|l| if l.trim_start().starts_with('*') { 0 } else { 1 }) + .unwrap_or(0); + let mut j = lines.len(); + + while i < j && lines[i].trim().is_empty() { + i += 1; + } + while j > i && lines[j - 1].trim().is_empty() { + j -= 1; + } + &lines[i..j] + } else { + lines + }; + + for line in lines { + for (j, c) in line.chars().enumerate() { + if j > i || !"* \t".contains(c) { + return None; + } + if c == '*' { + if first { + i = j; + first = false; + } else if i != j { + return None; + } + break; + } + } + if i >= line.len() { + return None; + } + } + if lines.is_empty() { None } else { Some(lines[0][..i].into()) } + } + + let data_s = data.as_str(); + if data_s.contains('\n') { + let mut lines = data_s.lines().collect::>(); + let mut changes = false; + let lines = if let Some((i, j)) = get_vertical_trim(&lines) { + changes = true; + // remove whitespace-only lines from the start/end of lines + &mut lines[i..j] + } else { + &mut lines + }; + if let Some(horizontal) = get_horizontal_trim(&lines, kind) { + changes = true; + // remove a "[ \t]*\*" block from each line, if possible + for line in lines.iter_mut() { + if let Some(tmp) = line.strip_prefix(&horizontal) { + *line = tmp; + if kind == CommentKind::Block + && (*line == "*" || line.starts_with("* ") || line.starts_with("**")) + { + *line = &line[1..]; + } + } + } + } + if changes { + return Symbol::intern(&lines.join("\n")); + } + } + data +} + +/// Returns `None` if the first `col` chars of `s` contain a non-whitespace char. +/// Otherwise returns `Some(k)` where `k` is first char offset after that leading +/// whitespace. Note that `k` may be outside bounds of `s`. +fn all_whitespace(s: &str, col: CharPos) -> Option { + let mut idx = 0; + for (i, ch) in s.char_indices().take(col.to_usize()) { + if !ch.is_whitespace() { + return None; + } + idx = i + ch.len_utf8(); + } + Some(idx) +} + +fn trim_whitespace_prefix(s: &str, col: CharPos) -> &str { + let len = s.len(); + match all_whitespace(&s, col) { + Some(col) => { + if col < len { + &s[col..] + } else { + "" + } + } + None => s, + } +} + +fn split_block_comment_into_lines(text: &str, col: CharPos) -> Vec { + let mut res: Vec = vec![]; + let mut lines = text.lines(); + // just push the first line + res.extend(lines.next().map(|it| it.to_string())); + // for other lines, strip common whitespace prefix + for line in lines { + res.push(trim_whitespace_prefix(line, col).to_string()) + } + res +} + +// it appears this function is called only from pprust... that's +// probably not a good thing. +pub fn gather_comments(sm: &SourceMap, path: FileName, src: String) -> Vec { + let sm = SourceMap::new(sm.path_mapping().clone()); + let source_file = sm.new_source_file(path, src); + let text = (*source_file.src.as_ref().unwrap()).clone(); + + let text: &str = text.as_str(); + let start_bpos = source_file.start_pos; + let mut pos = 0; + let mut comments: Vec = Vec::new(); + let mut code_to_the_left = false; + + if let Some(shebang_len) = rustc_lexer::strip_shebang(text) { + comments.push(Comment { + style: CommentStyle::Isolated, + lines: vec![text[..shebang_len].to_string()], + pos: start_bpos, + }); + pos += shebang_len; + } + + for token in rustc_lexer::tokenize(&text[pos..]) { + let token_text = &text[pos..pos + token.len as usize]; + match token.kind { + rustc_lexer::TokenKind::Whitespace => { + if let Some(mut idx) = token_text.find('\n') { + code_to_the_left = false; + while let Some(next_newline) = &token_text[idx + 1..].find('\n') { + idx += 1 + next_newline; + comments.push(Comment { + style: CommentStyle::BlankLine, + lines: vec![], + pos: start_bpos + BytePos((pos + idx) as u32), + }); + } + } + } + rustc_lexer::TokenKind::BlockComment { doc_style, .. } => { + if doc_style.is_none() { + let code_to_the_right = !matches!( + text[pos + token.len as usize..].chars().next(), + Some('\r' | '\n') + ); + let style = match (code_to_the_left, code_to_the_right) { + (_, true) => CommentStyle::Mixed, + (false, false) => CommentStyle::Isolated, + (true, false) => CommentStyle::Trailing, + }; + + // Count the number of chars since the start of the line by rescanning. + let pos_in_file = start_bpos + BytePos(pos as u32); + let line_begin_in_file = source_file.line_begin_pos(pos_in_file); + let line_begin_pos = (line_begin_in_file - start_bpos).to_usize(); + let col = CharPos(text[line_begin_pos..pos].chars().count()); + + let lines = split_block_comment_into_lines(token_text, col); + comments.push(Comment { style, lines, pos: pos_in_file }) + } + } + rustc_lexer::TokenKind::LineComment { doc_style } => { + if doc_style.is_none() { + comments.push(Comment { + style: if code_to_the_left { + CommentStyle::Trailing + } else { + CommentStyle::Isolated + }, + lines: vec![token_text.to_string()], + pos: start_bpos + BytePos(pos as u32), + }) + } + } + _ => { + code_to_the_left = true; + } + } + pos += token.len as usize; + } + + comments +} diff --git a/compiler/rustc_ast/src/util/comments/tests.rs b/compiler/rustc_ast/src/util/comments/tests.rs new file mode 100644 index 000000000..11d50603a --- /dev/null +++ b/compiler/rustc_ast/src/util/comments/tests.rs @@ -0,0 +1,61 @@ +use super::*; +use rustc_span::create_default_session_globals_then; + +#[test] +fn test_block_doc_comment_1() { + create_default_session_globals_then(|| { + let comment = "\n * Test \n ** Test\n * Test\n"; + let stripped = beautify_doc_string(Symbol::intern(comment), CommentKind::Block); + assert_eq!(stripped.as_str(), " Test \n* Test\n Test"); + }) +} + +#[test] +fn test_block_doc_comment_2() { + create_default_session_globals_then(|| { + let comment = "\n * Test\n * Test\n"; + let stripped = beautify_doc_string(Symbol::intern(comment), CommentKind::Block); + assert_eq!(stripped.as_str(), " Test\n Test"); + }) +} + +#[test] +fn test_block_doc_comment_3() { + create_default_session_globals_then(|| { + let comment = "\n let a: *i32;\n *a = 5;\n"; + let stripped = beautify_doc_string(Symbol::intern(comment), CommentKind::Block); + assert_eq!(stripped.as_str(), "let a: *i32;\n*a = 5;"); + }) +} + +#[test] +fn test_line_doc_comment() { + create_default_session_globals_then(|| { + let stripped = beautify_doc_string(Symbol::intern(" test"), CommentKind::Line); + assert_eq!(stripped.as_str(), " test"); + let stripped = beautify_doc_string(Symbol::intern("! test"), CommentKind::Line); + assert_eq!(stripped.as_str(), "! test"); + let stripped = beautify_doc_string(Symbol::intern("test"), CommentKind::Line); + assert_eq!(stripped.as_str(), "test"); + let stripped = beautify_doc_string(Symbol::intern("!test"), CommentKind::Line); + assert_eq!(stripped.as_str(), "!test"); + }) +} + +#[test] +fn test_doc_blocks() { + create_default_session_globals_then(|| { + let stripped = + beautify_doc_string(Symbol::intern(" # Returns\n *\n "), CommentKind::Block); + assert_eq!(stripped.as_str(), " # Returns\n\n"); + + let stripped = beautify_doc_string( + Symbol::intern("\n * # Returns\n *\n "), + CommentKind::Block, + ); + assert_eq!(stripped.as_str(), " # Returns\n\n"); + + let stripped = beautify_doc_string(Symbol::intern("\n * a\n "), CommentKind::Block); + assert_eq!(stripped.as_str(), " a\n"); + }) +} diff --git a/compiler/rustc_ast/src/util/literal.rs b/compiler/rustc_ast/src/util/literal.rs new file mode 100644 index 000000000..9c18f55c0 --- /dev/null +++ b/compiler/rustc_ast/src/util/literal.rs @@ -0,0 +1,336 @@ +//! Code related to parsing literals. + +use crate::ast::{self, Lit, LitKind}; +use crate::token::{self, Token}; + +use rustc_lexer::unescape::{unescape_byte, unescape_char}; +use rustc_lexer::unescape::{unescape_byte_literal, unescape_literal, Mode}; +use rustc_span::symbol::{kw, sym, Symbol}; +use rustc_span::Span; + +use std::ascii; +use tracing::debug; + +pub enum LitError { + NotLiteral, + LexerError, + InvalidSuffix, + InvalidIntSuffix, + InvalidFloatSuffix, + NonDecimalFloat(u32), + IntTooLarge, +} + +impl LitKind { + /// Converts literal token into a semantic literal. + pub fn from_lit_token(lit: token::Lit) -> Result { + let token::Lit { kind, symbol, suffix } = lit; + if suffix.is_some() && !kind.may_have_suffix() { + return Err(LitError::InvalidSuffix); + } + + Ok(match kind { + token::Bool => { + assert!(symbol.is_bool_lit()); + LitKind::Bool(symbol == kw::True) + } + token::Byte => { + return unescape_byte(symbol.as_str()) + .map(LitKind::Byte) + .map_err(|_| LitError::LexerError); + } + token::Char => { + return unescape_char(symbol.as_str()) + .map(LitKind::Char) + .map_err(|_| LitError::LexerError); + } + + // There are some valid suffixes for integer and float literals, + // so all the handling is done internally. + token::Integer => return integer_lit(symbol, suffix), + token::Float => return float_lit(symbol, suffix), + + token::Str => { + // If there are no characters requiring special treatment we can + // reuse the symbol from the token. Otherwise, we must generate a + // new symbol because the string in the LitKind is different to the + // string in the token. + let s = symbol.as_str(); + let symbol = if s.contains(&['\\', '\r']) { + let mut buf = String::with_capacity(s.len()); + let mut error = Ok(()); + // Force-inlining here is aggressive but the closure is + // called on every char in the string, so it can be + // hot in programs with many long strings. + unescape_literal( + &s, + Mode::Str, + &mut #[inline(always)] + |_, unescaped_char| match unescaped_char { + Ok(c) => buf.push(c), + Err(err) => { + if err.is_fatal() { + error = Err(LitError::LexerError); + } + } + }, + ); + error?; + Symbol::intern(&buf) + } else { + symbol + }; + LitKind::Str(symbol, ast::StrStyle::Cooked) + } + token::StrRaw(n) => { + // Ditto. + let s = symbol.as_str(); + let symbol = + if s.contains('\r') { + let mut buf = String::with_capacity(s.len()); + let mut error = Ok(()); + unescape_literal(&s, Mode::RawStr, &mut |_, unescaped_char| { + match unescaped_char { + Ok(c) => buf.push(c), + Err(err) => { + if err.is_fatal() { + error = Err(LitError::LexerError); + } + } + } + }); + error?; + Symbol::intern(&buf) + } else { + symbol + }; + LitKind::Str(symbol, ast::StrStyle::Raw(n)) + } + token::ByteStr => { + let s = symbol.as_str(); + let mut buf = Vec::with_capacity(s.len()); + let mut error = Ok(()); + unescape_byte_literal(&s, Mode::ByteStr, &mut |_, unescaped_byte| { + match unescaped_byte { + Ok(c) => buf.push(c), + Err(err) => { + if err.is_fatal() { + error = Err(LitError::LexerError); + } + } + } + }); + error?; + LitKind::ByteStr(buf.into()) + } + token::ByteStrRaw(_) => { + let s = symbol.as_str(); + let bytes = if s.contains('\r') { + let mut buf = Vec::with_capacity(s.len()); + let mut error = Ok(()); + unescape_byte_literal(&s, Mode::RawByteStr, &mut |_, unescaped_byte| { + match unescaped_byte { + Ok(c) => buf.push(c), + Err(err) => { + if err.is_fatal() { + error = Err(LitError::LexerError); + } + } + } + }); + error?; + buf + } else { + symbol.to_string().into_bytes() + }; + + LitKind::ByteStr(bytes.into()) + } + token::Err => LitKind::Err(symbol), + }) + } + + /// Attempts to recover a token from semantic literal. + /// This function is used when the original token doesn't exist (e.g. the literal is created + /// by an AST-based macro) or unavailable (e.g. from HIR pretty-printing). + pub fn to_lit_token(&self) -> token::Lit { + let (kind, symbol, suffix) = match *self { + LitKind::Str(symbol, ast::StrStyle::Cooked) => { + // Don't re-intern unless the escaped string is different. + let s = symbol.as_str(); + let escaped = s.escape_default().to_string(); + let symbol = if s == escaped { symbol } else { Symbol::intern(&escaped) }; + (token::Str, symbol, None) + } + LitKind::Str(symbol, ast::StrStyle::Raw(n)) => (token::StrRaw(n), symbol, None), + LitKind::ByteStr(ref bytes) => { + let string = bytes + .iter() + .cloned() + .flat_map(ascii::escape_default) + .map(Into::::into) + .collect::(); + (token::ByteStr, Symbol::intern(&string), None) + } + LitKind::Byte(byte) => { + let string: String = ascii::escape_default(byte).map(Into::::into).collect(); + (token::Byte, Symbol::intern(&string), None) + } + LitKind::Char(ch) => { + let string: String = ch.escape_default().map(Into::::into).collect(); + (token::Char, Symbol::intern(&string), None) + } + LitKind::Int(n, ty) => { + let suffix = match ty { + ast::LitIntType::Unsigned(ty) => Some(ty.name()), + ast::LitIntType::Signed(ty) => Some(ty.name()), + ast::LitIntType::Unsuffixed => None, + }; + (token::Integer, sym::integer(n), suffix) + } + LitKind::Float(symbol, ty) => { + let suffix = match ty { + ast::LitFloatType::Suffixed(ty) => Some(ty.name()), + ast::LitFloatType::Unsuffixed => None, + }; + (token::Float, symbol, suffix) + } + LitKind::Bool(value) => { + let symbol = if value { kw::True } else { kw::False }; + (token::Bool, symbol, None) + } + LitKind::Err(symbol) => (token::Err, symbol, None), + }; + + token::Lit::new(kind, symbol, suffix) + } +} + +impl Lit { + /// Converts literal token into an AST literal. + pub fn from_lit_token(token: token::Lit, span: Span) -> Result { + Ok(Lit { token, kind: LitKind::from_lit_token(token)?, span }) + } + + /// Converts arbitrary token into an AST literal. + /// + /// Keep this in sync with `Token::can_begin_literal_or_bool` excluding unary negation. + pub fn from_token(token: &Token) -> Result { + let lit = match token.uninterpolate().kind { + token::Ident(name, false) if name.is_bool_lit() => { + token::Lit::new(token::Bool, name, None) + } + token::Literal(lit) => lit, + token::Interpolated(ref nt) => { + if let token::NtExpr(expr) | token::NtLiteral(expr) = &**nt + && let ast::ExprKind::Lit(lit) = &expr.kind + { + return Ok(lit.clone()); + } + return Err(LitError::NotLiteral); + } + _ => return Err(LitError::NotLiteral), + }; + + Lit::from_lit_token(lit, token.span) + } + + /// Attempts to recover an AST literal from semantic literal. + /// This function is used when the original token doesn't exist (e.g. the literal is created + /// by an AST-based macro) or unavailable (e.g. from HIR pretty-printing). + pub fn from_lit_kind(kind: LitKind, span: Span) -> Lit { + Lit { token: kind.to_lit_token(), kind, span } + } + + /// Losslessly convert an AST literal into a token. + pub fn to_token(&self) -> Token { + let kind = match self.token.kind { + token::Bool => token::Ident(self.token.symbol, false), + _ => token::Literal(self.token), + }; + Token::new(kind, self.span) + } +} + +fn strip_underscores(symbol: Symbol) -> Symbol { + // Do not allocate a new string unless necessary. + let s = symbol.as_str(); + if s.contains('_') { + let mut s = s.to_string(); + s.retain(|c| c != '_'); + return Symbol::intern(&s); + } + symbol +} + +fn filtered_float_lit( + symbol: Symbol, + suffix: Option, + base: u32, +) -> Result { + debug!("filtered_float_lit: {:?}, {:?}, {:?}", symbol, suffix, base); + if base != 10 { + return Err(LitError::NonDecimalFloat(base)); + } + Ok(match suffix { + Some(suf) => LitKind::Float( + symbol, + ast::LitFloatType::Suffixed(match suf { + sym::f32 => ast::FloatTy::F32, + sym::f64 => ast::FloatTy::F64, + _ => return Err(LitError::InvalidFloatSuffix), + }), + ), + None => LitKind::Float(symbol, ast::LitFloatType::Unsuffixed), + }) +} + +fn float_lit(symbol: Symbol, suffix: Option) -> Result { + debug!("float_lit: {:?}, {:?}", symbol, suffix); + filtered_float_lit(strip_underscores(symbol), suffix, 10) +} + +fn integer_lit(symbol: Symbol, suffix: Option) -> Result { + debug!("integer_lit: {:?}, {:?}", symbol, suffix); + let symbol = strip_underscores(symbol); + let s = symbol.as_str(); + + let base = match s.as_bytes() { + [b'0', b'x', ..] => 16, + [b'0', b'o', ..] => 8, + [b'0', b'b', ..] => 2, + _ => 10, + }; + + let ty = match suffix { + Some(suf) => match suf { + sym::isize => ast::LitIntType::Signed(ast::IntTy::Isize), + sym::i8 => ast::LitIntType::Signed(ast::IntTy::I8), + sym::i16 => ast::LitIntType::Signed(ast::IntTy::I16), + sym::i32 => ast::LitIntType::Signed(ast::IntTy::I32), + sym::i64 => ast::LitIntType::Signed(ast::IntTy::I64), + sym::i128 => ast::LitIntType::Signed(ast::IntTy::I128), + sym::usize => ast::LitIntType::Unsigned(ast::UintTy::Usize), + sym::u8 => ast::LitIntType::Unsigned(ast::UintTy::U8), + sym::u16 => ast::LitIntType::Unsigned(ast::UintTy::U16), + sym::u32 => ast::LitIntType::Unsigned(ast::UintTy::U32), + sym::u64 => ast::LitIntType::Unsigned(ast::UintTy::U64), + sym::u128 => ast::LitIntType::Unsigned(ast::UintTy::U128), + // `1f64` and `2f32` etc. are valid float literals, and + // `fxxx` looks more like an invalid float literal than invalid integer literal. + _ if suf.as_str().starts_with('f') => return filtered_float_lit(symbol, suffix, base), + _ => return Err(LitError::InvalidIntSuffix), + }, + _ => ast::LitIntType::Unsuffixed, + }; + + let s = &s[if base != 10 { 2 } else { 0 }..]; + u128::from_str_radix(s, base).map(|i| LitKind::Int(i, ty)).map_err(|_| { + // Small bases are lexed as if they were base 10, e.g, the string + // might be `0b10201`. This will cause the conversion above to fail, + // but these kinds of errors are already reported by the lexer. + let from_lexer = + base < 10 && s.chars().any(|c| c.to_digit(10).map_or(false, |d| d >= base)); + if from_lexer { LitError::LexerError } else { LitError::IntTooLarge } + }) +} diff --git a/compiler/rustc_ast/src/util/parser.rs b/compiler/rustc_ast/src/util/parser.rs new file mode 100644 index 000000000..74b7fe9e2 --- /dev/null +++ b/compiler/rustc_ast/src/util/parser.rs @@ -0,0 +1,406 @@ +use crate::ast::{self, BinOpKind}; +use crate::token::{self, BinOpToken, Token}; +use rustc_span::symbol::kw; + +/// Associative operator with precedence. +/// +/// This is the enum which specifies operator precedence and fixity to the parser. +#[derive(Copy, Clone, PartialEq, Debug)] +pub enum AssocOp { + /// `+` + Add, + /// `-` + Subtract, + /// `*` + Multiply, + /// `/` + Divide, + /// `%` + Modulus, + /// `&&` + LAnd, + /// `||` + LOr, + /// `^` + BitXor, + /// `&` + BitAnd, + /// `|` + BitOr, + /// `<<` + ShiftLeft, + /// `>>` + ShiftRight, + /// `==` + Equal, + /// `<` + Less, + /// `<=` + LessEqual, + /// `!=` + NotEqual, + /// `>` + Greater, + /// `>=` + GreaterEqual, + /// `=` + Assign, + /// `?=` where ? is one of the BinOpToken + AssignOp(BinOpToken), + /// `as` + As, + /// `..` range + DotDot, + /// `..=` range + DotDotEq, + /// `:` + Colon, +} + +#[derive(PartialEq, Debug)] +pub enum Fixity { + /// The operator is left-associative + Left, + /// The operator is right-associative + Right, + /// The operator is not associative + None, +} + +impl AssocOp { + /// Creates a new AssocOP from a token + pub fn from_token(t: &Token) -> Option { + use AssocOp::*; + match t.kind { + token::BinOpEq(k) => Some(AssignOp(k)), + token::Eq => Some(Assign), + token::BinOp(BinOpToken::Star) => Some(Multiply), + token::BinOp(BinOpToken::Slash) => Some(Divide), + token::BinOp(BinOpToken::Percent) => Some(Modulus), + token::BinOp(BinOpToken::Plus) => Some(Add), + token::BinOp(BinOpToken::Minus) => Some(Subtract), + token::BinOp(BinOpToken::Shl) => Some(ShiftLeft), + token::BinOp(BinOpToken::Shr) => Some(ShiftRight), + token::BinOp(BinOpToken::And) => Some(BitAnd), + token::BinOp(BinOpToken::Caret) => Some(BitXor), + token::BinOp(BinOpToken::Or) => Some(BitOr), + token::Lt => Some(Less), + token::Le => Some(LessEqual), + token::Ge => Some(GreaterEqual), + token::Gt => Some(Greater), + token::EqEq => Some(Equal), + token::Ne => Some(NotEqual), + token::AndAnd => Some(LAnd), + token::OrOr => Some(LOr), + token::DotDot => Some(DotDot), + token::DotDotEq => Some(DotDotEq), + // DotDotDot is no longer supported, but we need some way to display the error + token::DotDotDot => Some(DotDotEq), + token::Colon => Some(Colon), + // `<-` should probably be `< -` + token::LArrow => Some(Less), + _ if t.is_keyword(kw::As) => Some(As), + _ => None, + } + } + + /// Creates a new AssocOp from ast::BinOpKind. + pub fn from_ast_binop(op: BinOpKind) -> Self { + use AssocOp::*; + match op { + BinOpKind::Lt => Less, + BinOpKind::Gt => Greater, + BinOpKind::Le => LessEqual, + BinOpKind::Ge => GreaterEqual, + BinOpKind::Eq => Equal, + BinOpKind::Ne => NotEqual, + BinOpKind::Mul => Multiply, + BinOpKind::Div => Divide, + BinOpKind::Rem => Modulus, + BinOpKind::Add => Add, + BinOpKind::Sub => Subtract, + BinOpKind::Shl => ShiftLeft, + BinOpKind::Shr => ShiftRight, + BinOpKind::BitAnd => BitAnd, + BinOpKind::BitXor => BitXor, + BinOpKind::BitOr => BitOr, + BinOpKind::And => LAnd, + BinOpKind::Or => LOr, + } + } + + /// Gets the precedence of this operator + pub fn precedence(&self) -> usize { + use AssocOp::*; + match *self { + As | Colon => 14, + Multiply | Divide | Modulus => 13, + Add | Subtract => 12, + ShiftLeft | ShiftRight => 11, + BitAnd => 10, + BitXor => 9, + BitOr => 8, + Less | Greater | LessEqual | GreaterEqual | Equal | NotEqual => 7, + LAnd => 6, + LOr => 5, + DotDot | DotDotEq => 4, + Assign | AssignOp(_) => 2, + } + } + + /// Gets the fixity of this operator + pub fn fixity(&self) -> Fixity { + use AssocOp::*; + // NOTE: it is a bug to have an operators that has same precedence but different fixities! + match *self { + Assign | AssignOp(_) => Fixity::Right, + As | Multiply | Divide | Modulus | Add | Subtract | ShiftLeft | ShiftRight | BitAnd + | BitXor | BitOr | Less | Greater | LessEqual | GreaterEqual | Equal | NotEqual + | LAnd | LOr | Colon => Fixity::Left, + DotDot | DotDotEq => Fixity::None, + } + } + + pub fn is_comparison(&self) -> bool { + use AssocOp::*; + match *self { + Less | Greater | LessEqual | GreaterEqual | Equal | NotEqual => true, + Assign | AssignOp(_) | As | Multiply | Divide | Modulus | Add | Subtract + | ShiftLeft | ShiftRight | BitAnd | BitXor | BitOr | LAnd | LOr | DotDot | DotDotEq + | Colon => false, + } + } + + pub fn is_assign_like(&self) -> bool { + use AssocOp::*; + match *self { + Assign | AssignOp(_) => true, + Less | Greater | LessEqual | GreaterEqual | Equal | NotEqual | As | Multiply + | Divide | Modulus | Add | Subtract | ShiftLeft | ShiftRight | BitAnd | BitXor + | BitOr | LAnd | LOr | DotDot | DotDotEq | Colon => false, + } + } + + pub fn to_ast_binop(&self) -> Option { + use AssocOp::*; + match *self { + Less => Some(BinOpKind::Lt), + Greater => Some(BinOpKind::Gt), + LessEqual => Some(BinOpKind::Le), + GreaterEqual => Some(BinOpKind::Ge), + Equal => Some(BinOpKind::Eq), + NotEqual => Some(BinOpKind::Ne), + Multiply => Some(BinOpKind::Mul), + Divide => Some(BinOpKind::Div), + Modulus => Some(BinOpKind::Rem), + Add => Some(BinOpKind::Add), + Subtract => Some(BinOpKind::Sub), + ShiftLeft => Some(BinOpKind::Shl), + ShiftRight => Some(BinOpKind::Shr), + BitAnd => Some(BinOpKind::BitAnd), + BitXor => Some(BinOpKind::BitXor), + BitOr => Some(BinOpKind::BitOr), + LAnd => Some(BinOpKind::And), + LOr => Some(BinOpKind::Or), + Assign | AssignOp(_) | As | DotDot | DotDotEq | Colon => None, + } + } + + /// This operator could be used to follow a block unambiguously. + /// + /// This is used for error recovery at the moment, providing a suggestion to wrap blocks with + /// parentheses while having a high degree of confidence on the correctness of the suggestion. + pub fn can_continue_expr_unambiguously(&self) -> bool { + use AssocOp::*; + matches!( + self, + BitXor | // `{ 42 } ^ 3` + Assign | // `{ 42 } = { 42 }` + Divide | // `{ 42 } / 42` + Modulus | // `{ 42 } % 2` + ShiftRight | // `{ 42 } >> 2` + LessEqual | // `{ 42 } <= 3` + Greater | // `{ 42 } > 3` + GreaterEqual | // `{ 42 } >= 3` + AssignOp(_) | // `{ 42 } +=` + As | // `{ 42 } as usize` + // Equal | // `{ 42 } == { 42 }` Accepting these here would regress incorrect + // NotEqual | // `{ 42 } != { 42 } struct literals parser recovery. + Colon, // `{ 42 }: usize` + ) + } +} + +pub const PREC_CLOSURE: i8 = -40; +pub const PREC_JUMP: i8 = -30; +pub const PREC_RANGE: i8 = -10; +// The range 2..=14 is reserved for AssocOp binary operator precedences. +pub const PREC_PREFIX: i8 = 50; +pub const PREC_POSTFIX: i8 = 60; +pub const PREC_PAREN: i8 = 99; +pub const PREC_FORCE_PAREN: i8 = 100; + +#[derive(Debug, Clone, Copy)] +pub enum ExprPrecedence { + Closure, + Break, + Continue, + Ret, + Yield, + Yeet, + + Range, + + Binary(BinOpKind), + + Cast, + Type, + + Assign, + AssignOp, + + Box, + AddrOf, + Let, + Unary, + + Call, + MethodCall, + Field, + Index, + Try, + InlineAsm, + Mac, + + Array, + Repeat, + Tup, + Lit, + Path, + Paren, + If, + While, + ForLoop, + Loop, + Match, + ConstBlock, + Block, + TryBlock, + Struct, + Async, + Await, + Err, +} + +impl ExprPrecedence { + pub fn order(self) -> i8 { + match self { + ExprPrecedence::Closure => PREC_CLOSURE, + + ExprPrecedence::Break | + ExprPrecedence::Continue | + ExprPrecedence::Ret | + ExprPrecedence::Yield | + ExprPrecedence::Yeet => PREC_JUMP, + + // `Range` claims to have higher precedence than `Assign`, but `x .. x = x` fails to + // parse, instead of parsing as `(x .. x) = x`. Giving `Range` a lower precedence + // ensures that `pprust` will add parentheses in the right places to get the desired + // parse. + ExprPrecedence::Range => PREC_RANGE, + + // Binop-like expr kinds, handled by `AssocOp`. + ExprPrecedence::Binary(op) => AssocOp::from_ast_binop(op).precedence() as i8, + ExprPrecedence::Cast => AssocOp::As.precedence() as i8, + ExprPrecedence::Type => AssocOp::Colon.precedence() as i8, + + ExprPrecedence::Assign | + ExprPrecedence::AssignOp => AssocOp::Assign.precedence() as i8, + + // Unary, prefix + ExprPrecedence::Box | + ExprPrecedence::AddrOf | + // Here `let pats = expr` has `let pats =` as a "unary" prefix of `expr`. + // However, this is not exactly right. When `let _ = a` is the LHS of a binop we + // need parens sometimes. E.g. we can print `(let _ = a) && b` as `let _ = a && b` + // but we need to print `(let _ = a) < b` as-is with parens. + ExprPrecedence::Let | + ExprPrecedence::Unary => PREC_PREFIX, + + // Unary, postfix + ExprPrecedence::Await | + ExprPrecedence::Call | + ExprPrecedence::MethodCall | + ExprPrecedence::Field | + ExprPrecedence::Index | + ExprPrecedence::Try | + ExprPrecedence::InlineAsm | + ExprPrecedence::Mac => PREC_POSTFIX, + + // Never need parens + ExprPrecedence::Array | + ExprPrecedence::Repeat | + ExprPrecedence::Tup | + ExprPrecedence::Lit | + ExprPrecedence::Path | + ExprPrecedence::Paren | + ExprPrecedence::If | + ExprPrecedence::While | + ExprPrecedence::ForLoop | + ExprPrecedence::Loop | + ExprPrecedence::Match | + ExprPrecedence::ConstBlock | + ExprPrecedence::Block | + ExprPrecedence::TryBlock | + ExprPrecedence::Async | + ExprPrecedence::Struct | + ExprPrecedence::Err => PREC_PAREN, + } + } +} + +/// In `let p = e`, operators with precedence `<=` this one requires parentheses in `e`. +pub fn prec_let_scrutinee_needs_par() -> usize { + AssocOp::LAnd.precedence() +} + +/// Suppose we have `let _ = e` and the `order` of `e`. +/// Is the `order` such that `e` in `let _ = e` needs parentheses when it is on the RHS? +/// +/// Conversely, suppose that we have `(let _ = a) OP b` and `order` is that of `OP`. +/// Can we print this as `let _ = a OP b`? +pub fn needs_par_as_let_scrutinee(order: i8) -> bool { + order <= prec_let_scrutinee_needs_par() as i8 +} + +/// Expressions that syntactically contain an "exterior" struct literal i.e., not surrounded by any +/// parens or other delimiters, e.g., `X { y: 1 }`, `X { y: 1 }.method()`, `foo == X { y: 1 }` and +/// `X { y: 1 } == foo` all do, but `(X { y: 1 }) == foo` does not. +pub fn contains_exterior_struct_lit(value: &ast::Expr) -> bool { + match value.kind { + ast::ExprKind::Struct(..) => true, + + ast::ExprKind::Assign(ref lhs, ref rhs, _) + | ast::ExprKind::AssignOp(_, ref lhs, ref rhs) + | ast::ExprKind::Binary(_, ref lhs, ref rhs) => { + // X { y: 1 } + X { y: 2 } + contains_exterior_struct_lit(&lhs) || contains_exterior_struct_lit(&rhs) + } + ast::ExprKind::Await(ref x) + | ast::ExprKind::Unary(_, ref x) + | ast::ExprKind::Cast(ref x, _) + | ast::ExprKind::Type(ref x, _) + | ast::ExprKind::Field(ref x, _) + | ast::ExprKind::Index(ref x, _) => { + // &X { y: 1 }, X { y: 1 }.y + contains_exterior_struct_lit(&x) + } + + ast::ExprKind::MethodCall(.., ref exprs, _) => { + // X { y: 1 }.bar(...) + contains_exterior_struct_lit(&exprs[0]) + } + + _ => false, + } +} diff --git a/compiler/rustc_ast/src/util/unicode.rs b/compiler/rustc_ast/src/util/unicode.rs new file mode 100644 index 000000000..f009f7b30 --- /dev/null +++ b/compiler/rustc_ast/src/util/unicode.rs @@ -0,0 +1,35 @@ +pub const TEXT_FLOW_CONTROL_CHARS: &[char] = &[ + '\u{202A}', '\u{202B}', '\u{202D}', '\u{202E}', '\u{2066}', '\u{2067}', '\u{2068}', '\u{202C}', + '\u{2069}', +]; + +#[inline] +pub fn contains_text_flow_control_chars(s: &str) -> bool { + // Char - UTF-8 + // U+202A - E2 80 AA + // U+202B - E2 80 AB + // U+202C - E2 80 AC + // U+202D - E2 80 AD + // U+202E - E2 80 AE + // U+2066 - E2 81 A6 + // U+2067 - E2 81 A7 + // U+2068 - E2 81 A8 + // U+2069 - E2 81 A9 + let mut bytes = s.as_bytes(); + loop { + match core::slice::memchr::memchr(0xE2, &bytes) { + Some(idx) => { + // bytes are valid UTF-8 -> E2 must be followed by two bytes + let ch = &bytes[idx..idx + 3]; + match ch { + [_, 0x80, 0xAA..=0xAE] | [_, 0x81, 0xA6..=0xA9] => break true, + _ => {} + } + bytes = &bytes[idx + 3..]; + } + None => { + break false; + } + } + } +} -- cgit v1.2.3