diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-18 02:49:42 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-18 02:49:42 +0000 |
commit | 837b550238aa671a591ccf282dddeab29cadb206 (patch) | |
tree | 914b6b8862bace72bd3245ca184d374b08d8a672 /compiler/rustc_lexer | |
parent | Adding debian version 1.70.0+dfsg2-1. (diff) | |
download | rustc-837b550238aa671a591ccf282dddeab29cadb206.tar.xz rustc-837b550238aa671a591ccf282dddeab29cadb206.zip |
Merging upstream version 1.71.1+dfsg1.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'compiler/rustc_lexer')
-rw-r--r-- | compiler/rustc_lexer/src/lib.rs | 111 | ||||
-rw-r--r-- | compiler/rustc_lexer/src/unescape.rs | 278 |
2 files changed, 242 insertions, 147 deletions
diff --git a/compiler/rustc_lexer/src/lib.rs b/compiler/rustc_lexer/src/lib.rs index b3f4b5cd5..29335a8c0 100644 --- a/compiler/rustc_lexer/src/lib.rs +++ b/compiler/rustc_lexer/src/lib.rs @@ -186,12 +186,16 @@ pub enum LiteralKind { Str { terminated: bool }, /// "b"abc"", "b"abc" ByteStr { terminated: bool }, + /// `c"abc"`, `c"abc` + CStr { terminated: bool }, /// "r"abc"", "r#"abc"#", "r####"ab"###"c"####", "r#"a". `None` indicates /// an invalid literal. RawStr { n_hashes: Option<u8> }, /// "br"abc"", "br#"abc"#", "br####"ab"###"c"####", "br#"a". `None` /// indicates an invalid literal. RawByteStr { n_hashes: Option<u8> }, + /// `cr"abc"`, "cr#"abc"#", `cr#"a`. `None` indicates an invalid literal. + RawCStr { n_hashes: Option<u8> }, } #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] @@ -357,39 +361,11 @@ impl Cursor<'_> { }, // Byte literal, byte string literal, raw byte string literal or identifier. - 'b' => match (self.first(), self.second()) { - ('\'', _) => { - self.bump(); - let terminated = self.single_quoted_string(); - let suffix_start = self.pos_within_token(); - if terminated { - self.eat_literal_suffix(); - } - let kind = Byte { terminated }; - Literal { kind, suffix_start } - } - ('"', _) => { - self.bump(); - let terminated = self.double_quoted_string(); - let suffix_start = self.pos_within_token(); - if terminated { - self.eat_literal_suffix(); - } - let kind = ByteStr { terminated }; - Literal { kind, suffix_start } - } - ('r', '"') | ('r', '#') => { - self.bump(); - let res = self.raw_double_quoted_string(2); - let suffix_start = self.pos_within_token(); - if res.is_ok() { - self.eat_literal_suffix(); - } - let kind = RawByteStr { n_hashes: res.ok() }; - Literal { kind, suffix_start } - } - _ => self.ident_or_unknown_prefix(), - }, + 'b' => self.c_or_byte_string( + |terminated| ByteStr { terminated }, + |n_hashes| RawByteStr { n_hashes }, + Some(|terminated| Byte { terminated }), + ), // Identifier (this should be checked after other variant that can // start as identifier). @@ -553,39 +529,84 @@ impl Cursor<'_> { } } + fn c_or_byte_string( + &mut self, + mk_kind: impl FnOnce(bool) -> LiteralKind, + mk_kind_raw: impl FnOnce(Option<u8>) -> LiteralKind, + single_quoted: Option<fn(bool) -> LiteralKind>, + ) -> TokenKind { + match (self.first(), self.second(), single_quoted) { + ('\'', _, Some(mk_kind)) => { + self.bump(); + let terminated = self.single_quoted_string(); + let suffix_start = self.pos_within_token(); + if terminated { + self.eat_literal_suffix(); + } + let kind = mk_kind(terminated); + Literal { kind, suffix_start } + } + ('"', _, _) => { + self.bump(); + let terminated = self.double_quoted_string(); + let suffix_start = self.pos_within_token(); + if terminated { + self.eat_literal_suffix(); + } + let kind = mk_kind(terminated); + Literal { kind, suffix_start } + } + ('r', '"', _) | ('r', '#', _) => { + self.bump(); + let res = self.raw_double_quoted_string(2); + let suffix_start = self.pos_within_token(); + if res.is_ok() { + self.eat_literal_suffix(); + } + let kind = mk_kind_raw(res.ok()); + Literal { kind, suffix_start } + } + _ => self.ident_or_unknown_prefix(), + } + } + fn number(&mut self, first_digit: char) -> LiteralKind { debug_assert!('0' <= self.prev() && self.prev() <= '9'); let mut base = Base::Decimal; if first_digit == '0' { // Attempt to parse encoding base. - let has_digits = match self.first() { + match self.first() { 'b' => { base = Base::Binary; self.bump(); - self.eat_decimal_digits() + if !self.eat_decimal_digits() { + return Int { base, empty_int: true }; + } } 'o' => { base = Base::Octal; self.bump(); - self.eat_decimal_digits() + if !self.eat_decimal_digits() { + return Int { base, empty_int: true }; + } } 'x' => { base = Base::Hexadecimal; self.bump(); - self.eat_hexadecimal_digits() + if !self.eat_hexadecimal_digits() { + return Int { base, empty_int: true }; + } } - // Not a base prefix. - '0'..='9' | '_' | '.' | 'e' | 'E' => { + // Not a base prefix; consume additional digits. + '0'..='9' | '_' => { self.eat_decimal_digits(); - true } + + // Also not a base prefix; nothing more to do here. + '.' | 'e' | 'E' => {} + // Just a 0. _ => return Int { base, empty_int: false }, - }; - // Base prefix was provided, but there were no digits - // after it, e.g. "0x". - if !has_digits { - return Int { base, empty_int: true }; } } else { // No base prefix, parse number in the usual way. diff --git a/compiler/rustc_lexer/src/unescape.rs b/compiler/rustc_lexer/src/unescape.rs index bb4d91247..c9ad54d8d 100644 --- a/compiler/rustc_lexer/src/unescape.rs +++ b/compiler/rustc_lexer/src/unescape.rs @@ -86,10 +86,45 @@ where let res = unescape_char_or_byte(&mut chars, mode == Mode::Byte); callback(0..(src.len() - chars.as_str().len()), res); } - Mode::Str | Mode::ByteStr => unescape_str_or_byte_str(src, mode == Mode::ByteStr, callback), + Mode::Str | Mode::ByteStr => unescape_str_common(src, mode, callback), + Mode::RawStr | Mode::RawByteStr => { unescape_raw_str_or_raw_byte_str(src, mode == Mode::RawByteStr, callback) } + Mode::CStr | Mode::RawCStr => unreachable!(), + } +} + +/// A unit within CStr. Must not be a nul character. +pub enum CStrUnit { + Byte(u8), + Char(char), +} + +impl From<u8> for CStrUnit { + fn from(value: u8) -> Self { + CStrUnit::Byte(value) + } +} + +impl From<char> for CStrUnit { + fn from(value: char) -> Self { + CStrUnit::Char(value) + } +} + +pub fn unescape_c_string<F>(src: &str, mode: Mode, callback: &mut F) +where + F: FnMut(Range<usize>, Result<CStrUnit, EscapeError>), +{ + if mode == Mode::RawCStr { + unescape_raw_str_or_raw_byte_str( + src, + mode.characters_should_be_ascii(), + &mut |r, result| callback(r, result.map(CStrUnit::Char)), + ); + } else { + unescape_str_common(src, mode, callback); } } @@ -114,34 +149,69 @@ pub enum Mode { ByteStr, RawStr, RawByteStr, + CStr, + RawCStr, } impl Mode { pub fn in_double_quotes(self) -> bool { match self { - Mode::Str | Mode::ByteStr | Mode::RawStr | Mode::RawByteStr => true, + Mode::Str + | Mode::ByteStr + | Mode::RawStr + | Mode::RawByteStr + | Mode::CStr + | Mode::RawCStr => true, Mode::Char | Mode::Byte => false, } } - pub fn is_byte(self) -> bool { + /// Non-byte literals should have `\xXX` escapes that are within the ASCII range. + pub fn ascii_escapes_should_be_ascii(self) -> bool { + match self { + Mode::Char | Mode::Str | Mode::RawStr => true, + Mode::Byte | Mode::ByteStr | Mode::RawByteStr | Mode::CStr | Mode::RawCStr => false, + } + } + + /// Whether characters within the literal must be within the ASCII range + pub fn characters_should_be_ascii(self) -> bool { + match self { + Mode::Byte | Mode::ByteStr | Mode::RawByteStr => true, + Mode::Char | Mode::Str | Mode::RawStr | Mode::CStr | Mode::RawCStr => false, + } + } + + /// Byte literals do not allow unicode escape. + pub fn is_unicode_escape_disallowed(self) -> bool { match self { Mode::Byte | Mode::ByteStr | Mode::RawByteStr => true, - Mode::Char | Mode::Str | Mode::RawStr => false, + Mode::Char | Mode::Str | Mode::RawStr | Mode::CStr | Mode::RawCStr => false, + } + } + + pub fn prefix_noraw(self) -> &'static str { + match self { + Mode::Byte | Mode::ByteStr | Mode::RawByteStr => "b", + Mode::CStr | Mode::RawCStr => "c", + Mode::Char | Mode::Str | Mode::RawStr => "", } } } -fn scan_escape(chars: &mut Chars<'_>, is_byte: bool) -> Result<char, EscapeError> { +fn scan_escape<T: From<u8> + From<char>>( + chars: &mut Chars<'_>, + mode: Mode, +) -> Result<T, EscapeError> { // Previous character was '\\', unescape what follows. let res = match chars.next().ok_or(EscapeError::LoneSlash)? { - '"' => '"', - 'n' => '\n', - 'r' => '\r', - 't' => '\t', - '\\' => '\\', - '\'' => '\'', - '0' => '\0', + '"' => b'"', + 'n' => b'\n', + 'r' => b'\r', + 't' => b'\t', + '\\' => b'\\', + '\'' => b'\'', + '0' => b'\0', 'x' => { // Parse hexadecimal character code. @@ -154,76 +224,78 @@ fn scan_escape(chars: &mut Chars<'_>, is_byte: bool) -> Result<char, EscapeError let value = hi * 16 + lo; - // For a non-byte literal verify that it is within ASCII range. - if !is_byte && !is_ascii(value) { + if mode.ascii_escapes_should_be_ascii() && !is_ascii(value) { return Err(EscapeError::OutOfRangeHexEscape); } - let value = value as u8; - value as char + value as u8 } - 'u' => { - // We've parsed '\u', now we have to parse '{..}'. + 'u' => return scan_unicode(chars, mode.is_unicode_escape_disallowed()).map(Into::into), + _ => return Err(EscapeError::InvalidEscape), + }; + Ok(res.into()) +} + +fn scan_unicode( + chars: &mut Chars<'_>, + is_unicode_escape_disallowed: bool, +) -> Result<char, EscapeError> { + // We've parsed '\u', now we have to parse '{..}'. - if chars.next() != Some('{') { - return Err(EscapeError::NoBraceInUnicodeEscape); - } + if chars.next() != Some('{') { + return Err(EscapeError::NoBraceInUnicodeEscape); + } - // First character must be a hexadecimal digit. - let mut n_digits = 1; - let mut value: u32 = match chars.next().ok_or(EscapeError::UnclosedUnicodeEscape)? { - '_' => return Err(EscapeError::LeadingUnderscoreUnicodeEscape), - '}' => return Err(EscapeError::EmptyUnicodeEscape), - c => c.to_digit(16).ok_or(EscapeError::InvalidCharInUnicodeEscape)?, - }; - - // First character is valid, now parse the rest of the number - // and closing brace. - loop { - match chars.next() { - None => return Err(EscapeError::UnclosedUnicodeEscape), - Some('_') => continue, - Some('}') => { - if n_digits > 6 { - return Err(EscapeError::OverlongUnicodeEscape); - } - - // Incorrect syntax has higher priority for error reporting - // than unallowed value for a literal. - if is_byte { - return Err(EscapeError::UnicodeEscapeInByte); - } - - break std::char::from_u32(value).ok_or_else(|| { - if value > 0x10FFFF { - EscapeError::OutOfRangeUnicodeEscape - } else { - EscapeError::LoneSurrogateUnicodeEscape - } - })?; - } - Some(c) => { - let digit: u32 = - c.to_digit(16).ok_or(EscapeError::InvalidCharInUnicodeEscape)?; - n_digits += 1; - if n_digits > 6 { - // Stop updating value since we're sure that it's incorrect already. - continue; - } - value = value * 16 + digit; + // First character must be a hexadecimal digit. + let mut n_digits = 1; + let mut value: u32 = match chars.next().ok_or(EscapeError::UnclosedUnicodeEscape)? { + '_' => return Err(EscapeError::LeadingUnderscoreUnicodeEscape), + '}' => return Err(EscapeError::EmptyUnicodeEscape), + c => c.to_digit(16).ok_or(EscapeError::InvalidCharInUnicodeEscape)?, + }; + + // First character is valid, now parse the rest of the number + // and closing brace. + loop { + match chars.next() { + None => return Err(EscapeError::UnclosedUnicodeEscape), + Some('_') => continue, + Some('}') => { + if n_digits > 6 { + return Err(EscapeError::OverlongUnicodeEscape); + } + + // Incorrect syntax has higher priority for error reporting + // than unallowed value for a literal. + if is_unicode_escape_disallowed { + return Err(EscapeError::UnicodeEscapeInByte); + } + + break std::char::from_u32(value).ok_or_else(|| { + if value > 0x10FFFF { + EscapeError::OutOfRangeUnicodeEscape + } else { + EscapeError::LoneSurrogateUnicodeEscape } - }; + }); } - } - _ => return Err(EscapeError::InvalidEscape), - }; - Ok(res) + Some(c) => { + let digit: u32 = c.to_digit(16).ok_or(EscapeError::InvalidCharInUnicodeEscape)?; + n_digits += 1; + if n_digits > 6 { + // Stop updating value since we're sure that it's incorrect already. + continue; + } + value = value * 16 + digit; + } + }; + } } #[inline] -fn ascii_check(c: char, is_byte: bool) -> Result<char, EscapeError> { - if is_byte && !c.is_ascii() { +fn ascii_check(c: char, characters_should_be_ascii: bool) -> Result<char, EscapeError> { + if characters_should_be_ascii && !c.is_ascii() { // Byte literal can't be a non-ascii character. Err(EscapeError::NonAsciiCharInByte) } else { @@ -234,7 +306,7 @@ fn ascii_check(c: char, is_byte: bool) -> Result<char, EscapeError> { fn unescape_char_or_byte(chars: &mut Chars<'_>, is_byte: bool) -> Result<char, EscapeError> { let c = chars.next().ok_or(EscapeError::ZeroChars)?; let res = match c { - '\\' => scan_escape(chars, is_byte), + '\\' => scan_escape(chars, if is_byte { Mode::Byte } else { Mode::Char }), '\n' | '\t' | '\'' => Err(EscapeError::EscapeOnlyChar), '\r' => Err(EscapeError::BareCarriageReturn), _ => ascii_check(c, is_byte), @@ -247,9 +319,9 @@ fn unescape_char_or_byte(chars: &mut Chars<'_>, is_byte: bool) -> Result<char, E /// Takes a contents of a string literal (without quotes) and produces a /// sequence of escaped characters or errors. -fn unescape_str_or_byte_str<F>(src: &str, is_byte: bool, callback: &mut F) +fn unescape_str_common<F, T: From<u8> + From<char>>(src: &str, mode: Mode, callback: &mut F) where - F: FnMut(Range<usize>, Result<char, EscapeError>), + F: FnMut(Range<usize>, Result<T, EscapeError>), { let mut chars = src.chars(); @@ -266,47 +338,49 @@ where // if unescaped '\' character is followed by '\n'. // For details see [Rust language reference] // (https://doc.rust-lang.org/reference/tokens.html#string-literals). - skip_ascii_whitespace(&mut chars, start, callback); + skip_ascii_whitespace(&mut chars, start, &mut |range, err| { + callback(range, Err(err)) + }); continue; } - _ => scan_escape(&mut chars, is_byte), + _ => scan_escape::<T>(&mut chars, mode), } } - '\n' => Ok('\n'), - '\t' => Ok('\t'), + '\n' => Ok(b'\n'.into()), + '\t' => Ok(b'\t'.into()), '"' => Err(EscapeError::EscapeOnlyChar), '\r' => Err(EscapeError::BareCarriageReturn), - _ => ascii_check(c, is_byte), + _ => ascii_check(c, mode.characters_should_be_ascii()).map(Into::into), }; let end = src.len() - chars.as_str().len(); - callback(start..end, res); + callback(start..end, res.map(Into::into)); } +} - fn skip_ascii_whitespace<F>(chars: &mut Chars<'_>, start: usize, callback: &mut F) - where - F: FnMut(Range<usize>, Result<char, EscapeError>), - { - let tail = chars.as_str(); - let first_non_space = tail - .bytes() - .position(|b| b != b' ' && b != b'\t' && b != b'\n' && b != b'\r') - .unwrap_or(tail.len()); - if tail[1..first_non_space].contains('\n') { - // The +1 accounts for the escaping slash. - let end = start + first_non_space + 1; - callback(start..end, Err(EscapeError::MultipleSkippedLinesWarning)); - } - let tail = &tail[first_non_space..]; - if let Some(c) = tail.chars().nth(0) { - if c.is_whitespace() { - // For error reporting, we would like the span to contain the character that was not - // skipped. The +1 is necessary to account for the leading \ that started the escape. - let end = start + first_non_space + c.len_utf8() + 1; - callback(start..end, Err(EscapeError::UnskippedWhitespaceWarning)); - } +fn skip_ascii_whitespace<F>(chars: &mut Chars<'_>, start: usize, callback: &mut F) +where + F: FnMut(Range<usize>, EscapeError), +{ + let tail = chars.as_str(); + let first_non_space = tail + .bytes() + .position(|b| b != b' ' && b != b'\t' && b != b'\n' && b != b'\r') + .unwrap_or(tail.len()); + if tail[1..first_non_space].contains('\n') { + // The +1 accounts for the escaping slash. + let end = start + first_non_space + 1; + callback(start..end, EscapeError::MultipleSkippedLinesWarning); + } + let tail = &tail[first_non_space..]; + if let Some(c) = tail.chars().nth(0) { + if c.is_whitespace() { + // For error reporting, we would like the span to contain the character that was not + // skipped. The +1 is necessary to account for the leading \ that started the escape. + let end = start + first_non_space + c.len_utf8() + 1; + callback(start..end, EscapeError::UnskippedWhitespaceWarning); } - *chars = tail.chars(); } + *chars = tail.chars(); } /// Takes a contents of a string literal (without quotes) and produces a |