summary | refs | log | tree | commit | diff | stats
path: root/compiler/rustc_lexer
diff options
context:
space:
mode:
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-05-18 02:49:42 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-05-18 02:49:42 +0000
commit: 837b550238aa671a591ccf282dddeab29cadb206 (patch)
tree: 914b6b8862bace72bd3245ca184d374b08d8a672 /compiler/rustc_lexer
parent: Adding debian version 1.70.0+dfsg2-1. (diff)
download: rustc-837b550238aa671a591ccf282dddeab29cadb206.tar.xz
download: rustc-837b550238aa671a591ccf282dddeab29cadb206.zip
Merging upstream version 1.71.1+dfsg1.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'compiler/rustc_lexer')
-rw-r--r-- compiler/rustc_lexer/src/lib.rs 111
-rw-r--r-- compiler/rustc_lexer/src/unescape.rs 278
2 files changed, 242 insertions, 147 deletions
diff --git a/compiler/rustc_lexer/src/lib.rs b/compiler/rustc_lexer/src/lib.rs
index b3f4b5cd5..29335a8c0 100644
--- a/compiler/rustc_lexer/src/lib.rs
+++ b/compiler/rustc_lexer/src/lib.rs
@@ -186,12 +186,16 @@ pub enum LiteralKind {
Str { terminated: bool },
/// "b"abc"", "b"abc"
ByteStr { terminated: bool },
+ /// `c"abc"`, `c"abc`
+ CStr { terminated: bool },
/// "r"abc"", "r#"abc"#", "r####"ab"###"c"####", "r#"a". `None` indicates
/// an invalid literal.
RawStr { n_hashes: Option<u8> },
/// "br"abc"", "br#"abc"#", "br####"ab"###"c"####", "br#"a". `None`
/// indicates an invalid literal.
RawByteStr { n_hashes: Option<u8> },
+ /// `cr"abc"`, "cr#"abc"#", `cr#"a`. `None` indicates an invalid literal.
+ RawCStr { n_hashes: Option<u8> },
}
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
@@ -357,39 +361,11 @@ impl Cursor<'_> {
},
// Byte literal, byte string literal, raw byte string literal or identifier.
- 'b' => match (self.first(), self.second()) {
- ('\'', _) => {
- self.bump();
- let terminated = self.single_quoted_string();
- let suffix_start = self.pos_within_token();
- if terminated {
- self.eat_literal_suffix();
- }
- let kind = Byte { terminated };
- Literal { kind, suffix_start }
- }
- ('"', _) => {
- self.bump();
- let terminated = self.double_quoted_string();
- let suffix_start = self.pos_within_token();
- if terminated {
- self.eat_literal_suffix();
- }
- let kind = ByteStr { terminated };
- Literal { kind, suffix_start }
- }
- ('r', '"') | ('r', '#') => {
- self.bump();
- let res = self.raw_double_quoted_string(2);
- let suffix_start = self.pos_within_token();
- if res.is_ok() {
- self.eat_literal_suffix();
- }
- let kind = RawByteStr { n_hashes: res.ok() };
- Literal { kind, suffix_start }
- }
- _ => self.ident_or_unknown_prefix(),
- },
+ 'b' => self.c_or_byte_string(
+ |terminated| ByteStr { terminated },
+ |n_hashes| RawByteStr { n_hashes },
+ Some(|terminated| Byte { terminated }),
+ ),
// Identifier (this should be checked after other variant that can
// start as identifier).
@@ -553,39 +529,84 @@ impl Cursor<'_> {
}
}
+ fn c_or_byte_string(
+ &mut self,
+ mk_kind: impl FnOnce(bool) -> LiteralKind,
+ mk_kind_raw: impl FnOnce(Option<u8>) -> LiteralKind,
+ single_quoted: Option<fn(bool) -> LiteralKind>,
+ ) -> TokenKind {
+ match (self.first(), self.second(), single_quoted) {
+ ('\'', _, Some(mk_kind)) => {
+ self.bump();
+ let terminated = self.single_quoted_string();
+ let suffix_start = self.pos_within_token();
+ if terminated {
+ self.eat_literal_suffix();
+ }
+ let kind = mk_kind(terminated);
+ Literal { kind, suffix_start }
+ }
+ ('"', _, _) => {
+ self.bump();
+ let terminated = self.double_quoted_string();
+ let suffix_start = self.pos_within_token();
+ if terminated {
+ self.eat_literal_suffix();
+ }
+ let kind = mk_kind(terminated);
+ Literal { kind, suffix_start }
+ }
+ ('r', '"', _) | ('r', '#', _) => {
+ self.bump();
+ let res = self.raw_double_quoted_string(2);
+ let suffix_start = self.pos_within_token();
+ if res.is_ok() {
+ self.eat_literal_suffix();
+ }
+ let kind = mk_kind_raw(res.ok());
+ Literal { kind, suffix_start }
+ }
+ _ => self.ident_or_unknown_prefix(),
+ }
+ }
+
fn number(&mut self, first_digit: char) -> LiteralKind {
debug_assert!('0' <= self.prev() && self.prev() <= '9');
let mut base = Base::Decimal;
if first_digit == '0' {
// Attempt to parse encoding base.
- let has_digits = match self.first() {
+ match self.first() {
'b' => {
base = Base::Binary;
self.bump();
- self.eat_decimal_digits()
+ if !self.eat_decimal_digits() {
+ return Int { base, empty_int: true };
+ }
}
'o' => {
base = Base::Octal;
self.bump();
- self.eat_decimal_digits()
+ if !self.eat_decimal_digits() {
+ return Int { base, empty_int: true };
+ }
}
'x' => {
base = Base::Hexadecimal;
self.bump();
- self.eat_hexadecimal_digits()
+ if !self.eat_hexadecimal_digits() {
+ return Int { base, empty_int: true };
+ }
}
- // Not a base prefix.
- '0'..='9' | '_' | '.' | 'e' | 'E' => {
+ // Not a base prefix; consume additional digits.
+ '0'..='9' | '_' => {
self.eat_decimal_digits();
- true
}
+
+ // Also not a base prefix; nothing more to do here.
+ '.' | 'e' | 'E' => {}
+
// Just a 0.
_ => return Int { base, empty_int: false },
- };
- // Base prefix was provided, but there were no digits
- // after it, e.g. "0x".
- if !has_digits {
- return Int { base, empty_int: true };
}
} else {
// No base prefix, parse number in the usual way.
diff --git a/compiler/rustc_lexer/src/unescape.rs b/compiler/rustc_lexer/src/unescape.rs
index bb4d91247..c9ad54d8d 100644
--- a/compiler/rustc_lexer/src/unescape.rs
+++ b/compiler/rustc_lexer/src/unescape.rs
@@ -86,10 +86,45 @@ where
let res = unescape_char_or_byte(&mut chars, mode == Mode::Byte);
callback(0..(src.len() - chars.as_str().len()), res);
}
- Mode::Str | Mode::ByteStr => unescape_str_or_byte_str(src, mode == Mode::ByteStr, callback),
+ Mode::Str | Mode::ByteStr => unescape_str_common(src, mode, callback),
+
Mode::RawStr | Mode::RawByteStr => {
unescape_raw_str_or_raw_byte_str(src, mode == Mode::RawByteStr, callback)
}
+ Mode::CStr | Mode::RawCStr => unreachable!(),
+ }
+}
+
+/// A unit within CStr. Must not be a nul character.
+pub enum CStrUnit {
+ Byte(u8),
+ Char(char),
+}
+
+impl From<u8> for CStrUnit {
+ fn from(value: u8) -> Self {
+ CStrUnit::Byte(value)
+ }
+}
+
+impl From<char> for CStrUnit {
+ fn from(value: char) -> Self {
+ CStrUnit::Char(value)
+ }
+}
+
+pub fn unescape_c_string<F>(src: &str, mode: Mode, callback: &mut F)
+where
+ F: FnMut(Range<usize>, Result<CStrUnit, EscapeError>),
+{
+ if mode == Mode::RawCStr {
+ unescape_raw_str_or_raw_byte_str(
+ src,
+ mode.characters_should_be_ascii(),
+ &mut |r, result| callback(r, result.map(CStrUnit::Char)),
+ );
+ } else {
+ unescape_str_common(src, mode, callback);
}
}
@@ -114,34 +149,69 @@ pub enum Mode {
ByteStr,
RawStr,
RawByteStr,
+ CStr,
+ RawCStr,
}
impl Mode {
pub fn in_double_quotes(self) -> bool {
match self {
- Mode::Str | Mode::ByteStr | Mode::RawStr | Mode::RawByteStr => true,
+ Mode::Str
+ | Mode::ByteStr
+ | Mode::RawStr
+ | Mode::RawByteStr
+ | Mode::CStr
+ | Mode::RawCStr => true,
Mode::Char | Mode::Byte => false,
}
}
- pub fn is_byte(self) -> bool {
+ /// Non-byte literals should have `\xXX` escapes that are within the ASCII range.
+ pub fn ascii_escapes_should_be_ascii(self) -> bool {
+ match self {
+ Mode::Char | Mode::Str | Mode::RawStr => true,
+ Mode::Byte | Mode::ByteStr | Mode::RawByteStr | Mode::CStr | Mode::RawCStr => false,
+ }
+ }
+
+ /// Whether characters within the literal must be within the ASCII range
+ pub fn characters_should_be_ascii(self) -> bool {
+ match self {
+ Mode::Byte | Mode::ByteStr | Mode::RawByteStr => true,
+ Mode::Char | Mode::Str | Mode::RawStr | Mode::CStr | Mode::RawCStr => false,
+ }
+ }
+
+ /// Byte literals do not allow unicode escape.
+ pub fn is_unicode_escape_disallowed(self) -> bool {
match self {
Mode::Byte | Mode::ByteStr | Mode::RawByteStr => true,
- Mode::Char | Mode::Str | Mode::RawStr => false,
+ Mode::Char | Mode::Str | Mode::RawStr | Mode::CStr | Mode::RawCStr => false,
+ }
+ }
+
+ pub fn prefix_noraw(self) -> &'static str {
+ match self {
+ Mode::Byte | Mode::ByteStr | Mode::RawByteStr => "b",
+ Mode::CStr | Mode::RawCStr => "c",
+ Mode::Char | Mode::Str | Mode::RawStr => "",
}
}
}
-fn scan_escape(chars: &mut Chars<'_>, is_byte: bool) -> Result<char, EscapeError> {
+fn scan_escape<T: From<u8> + From<char>>(
+ chars: &mut Chars<'_>,
+ mode: Mode,
+) -> Result<T, EscapeError> {
// Previous character was '\\', unescape what follows.
let res = match chars.next().ok_or(EscapeError::LoneSlash)? {
- '"' => '"',
- 'n' => '\n',
- 'r' => '\r',
- 't' => '\t',
- '\\' => '\\',
- '\'' => '\'',
- '0' => '\0',
+ '"' => b'"',
+ 'n' => b'\n',
+ 'r' => b'\r',
+ 't' => b'\t',
+ '\\' => b'\\',
+ '\'' => b'\'',
+ '0' => b'\0',
'x' => {
// Parse hexadecimal character code.
@@ -154,76 +224,78 @@ fn scan_escape(chars: &mut Chars<'_>, is_byte: bool) -> Result<char, EscapeError
let value = hi * 16 + lo;
- // For a non-byte literal verify that it is within ASCII range.
- if !is_byte && !is_ascii(value) {
+ if mode.ascii_escapes_should_be_ascii() && !is_ascii(value) {
return Err(EscapeError::OutOfRangeHexEscape);
}
- let value = value as u8;
- value as char
+ value as u8
}
- 'u' => {
- // We've parsed '\u', now we have to parse '{..}'.
+ 'u' => return scan_unicode(chars, mode.is_unicode_escape_disallowed()).map(Into::into),
+ _ => return Err(EscapeError::InvalidEscape),
+ };
+ Ok(res.into())
+}
+
+fn scan_unicode(
+ chars: &mut Chars<'_>,
+ is_unicode_escape_disallowed: bool,
+) -> Result<char, EscapeError> {
+ // We've parsed '\u', now we have to parse '{..}'.
- if chars.next() != Some('{') {
- return Err(EscapeError::NoBraceInUnicodeEscape);
- }
+ if chars.next() != Some('{') {
+ return Err(EscapeError::NoBraceInUnicodeEscape);
+ }
- // First character must be a hexadecimal digit.
- let mut n_digits = 1;
- let mut value: u32 = match chars.next().ok_or(EscapeError::UnclosedUnicodeEscape)? {
- '_' => return Err(EscapeError::LeadingUnderscoreUnicodeEscape),
- '}' => return Err(EscapeError::EmptyUnicodeEscape),
- c => c.to_digit(16).ok_or(EscapeError::InvalidCharInUnicodeEscape)?,
- };
-
- // First character is valid, now parse the rest of the number
- // and closing brace.
- loop {
- match chars.next() {
- None => return Err(EscapeError::UnclosedUnicodeEscape),
- Some('_') => continue,
- Some('}') => {
- if n_digits > 6 {
- return Err(EscapeError::OverlongUnicodeEscape);
- }
-
- // Incorrect syntax has higher priority for error reporting
- // than unallowed value for a literal.
- if is_byte {
- return Err(EscapeError::UnicodeEscapeInByte);
- }
-
- break std::char::from_u32(value).ok_or_else(|| {
- if value > 0x10FFFF {
- EscapeError::OutOfRangeUnicodeEscape
- } else {
- EscapeError::LoneSurrogateUnicodeEscape
- }
- })?;
- }
- Some(c) => {
- let digit: u32 =
- c.to_digit(16).ok_or(EscapeError::InvalidCharInUnicodeEscape)?;
- n_digits += 1;
- if n_digits > 6 {
- // Stop updating value since we're sure that it's incorrect already.
- continue;
- }
- value = value * 16 + digit;
+ // First character must be a hexadecimal digit.
+ let mut n_digits = 1;
+ let mut value: u32 = match chars.next().ok_or(EscapeError::UnclosedUnicodeEscape)? {
+ '_' => return Err(EscapeError::LeadingUnderscoreUnicodeEscape),
+ '}' => return Err(EscapeError::EmptyUnicodeEscape),
+ c => c.to_digit(16).ok_or(EscapeError::InvalidCharInUnicodeEscape)?,
+ };
+
+ // First character is valid, now parse the rest of the number
+ // and closing brace.
+ loop {
+ match chars.next() {
+ None => return Err(EscapeError::UnclosedUnicodeEscape),
+ Some('_') => continue,
+ Some('}') => {
+ if n_digits > 6 {
+ return Err(EscapeError::OverlongUnicodeEscape);
+ }
+
+ // Incorrect syntax has higher priority for error reporting
+ // than unallowed value for a literal.
+ if is_unicode_escape_disallowed {
+ return Err(EscapeError::UnicodeEscapeInByte);
+ }
+
+ break std::char::from_u32(value).ok_or_else(|| {
+ if value > 0x10FFFF {
+ EscapeError::OutOfRangeUnicodeEscape
+ } else {
+ EscapeError::LoneSurrogateUnicodeEscape
}
- };
+ });
}
- }
- _ => return Err(EscapeError::InvalidEscape),
- };
- Ok(res)
+ Some(c) => {
+ let digit: u32 = c.to_digit(16).ok_or(EscapeError::InvalidCharInUnicodeEscape)?;
+ n_digits += 1;
+ if n_digits > 6 {
+ // Stop updating value since we're sure that it's incorrect already.
+ continue;
+ }
+ value = value * 16 + digit;
+ }
+ };
+ }
}
#[inline]
-fn ascii_check(c: char, is_byte: bool) -> Result<char, EscapeError> {
- if is_byte && !c.is_ascii() {
+fn ascii_check(c: char, characters_should_be_ascii: bool) -> Result<char, EscapeError> {
+ if characters_should_be_ascii && !c.is_ascii() {
// Byte literal can't be a non-ascii character.
Err(EscapeError::NonAsciiCharInByte)
} else {
@@ -234,7 +306,7 @@ fn ascii_check(c: char, is_byte: bool) -> Result<char, EscapeError> {
fn unescape_char_or_byte(chars: &mut Chars<'_>, is_byte: bool) -> Result<char, EscapeError> {
let c = chars.next().ok_or(EscapeError::ZeroChars)?;
let res = match c {
- '\\' => scan_escape(chars, is_byte),
+ '\\' => scan_escape(chars, if is_byte { Mode::Byte } else { Mode::Char }),
'\n' | '\t' | '\'' => Err(EscapeError::EscapeOnlyChar),
'\r' => Err(EscapeError::BareCarriageReturn),
_ => ascii_check(c, is_byte),
@@ -247,9 +319,9 @@ fn unescape_char_or_byte(chars: &mut Chars<'_>, is_byte: bool) -> Result<char, E
/// Takes a contents of a string literal (without quotes) and produces a
/// sequence of escaped characters or errors.
-fn unescape_str_or_byte_str<F>(src: &str, is_byte: bool, callback: &mut F)
+fn unescape_str_common<F, T: From<u8> + From<char>>(src: &str, mode: Mode, callback: &mut F)
where
- F: FnMut(Range<usize>, Result<char, EscapeError>),
+ F: FnMut(Range<usize>, Result<T, EscapeError>),
{
let mut chars = src.chars();
@@ -266,47 +338,49 @@ where
// if unescaped '\' character is followed by '\n'.
// For details see [Rust language reference]
// (https://doc.rust-lang.org/reference/tokens.html#string-literals).
- skip_ascii_whitespace(&mut chars, start, callback);
+ skip_ascii_whitespace(&mut chars, start, &mut |range, err| {
+ callback(range, Err(err))
+ });
continue;
}
- _ => scan_escape(&mut chars, is_byte),
+ _ => scan_escape::<T>(&mut chars, mode),
}
}
- '\n' => Ok('\n'),
- '\t' => Ok('\t'),
+ '\n' => Ok(b'\n'.into()),
+ '\t' => Ok(b'\t'.into()),
'"' => Err(EscapeError::EscapeOnlyChar),
'\r' => Err(EscapeError::BareCarriageReturn),
- _ => ascii_check(c, is_byte),
+ _ => ascii_check(c, mode.characters_should_be_ascii()).map(Into::into),
};
let end = src.len() - chars.as_str().len();
- callback(start..end, res);
+ callback(start..end, res.map(Into::into));
}
+}
- fn skip_ascii_whitespace<F>(chars: &mut Chars<'_>, start: usize, callback: &mut F)
- where
- F: FnMut(Range<usize>, Result<char, EscapeError>),
- {
- let tail = chars.as_str();
- let first_non_space = tail
- .bytes()
- .position(|b| b != b' ' && b != b'\t' && b != b'\n' && b != b'\r')
- .unwrap_or(tail.len());
- if tail[1..first_non_space].contains('\n') {
- // The +1 accounts for the escaping slash.
- let end = start + first_non_space + 1;
- callback(start..end, Err(EscapeError::MultipleSkippedLinesWarning));
- }
- let tail = &tail[first_non_space..];
- if let Some(c) = tail.chars().nth(0) {
- if c.is_whitespace() {
- // For error reporting, we would like the span to contain the character that was not
- // skipped. The +1 is necessary to account for the leading \ that started the escape.
- let end = start + first_non_space + c.len_utf8() + 1;
- callback(start..end, Err(EscapeError::UnskippedWhitespaceWarning));
- }
+fn skip_ascii_whitespace<F>(chars: &mut Chars<'_>, start: usize, callback: &mut F)
+where
+ F: FnMut(Range<usize>, EscapeError),
+{
+ let tail = chars.as_str();
+ let first_non_space = tail
+ .bytes()
+ .position(|b| b != b' ' && b != b'\t' && b != b'\n' && b != b'\r')
+ .unwrap_or(tail.len());
+ if tail[1..first_non_space].contains('\n') {
+ // The +1 accounts for the escaping slash.
+ let end = start + first_non_space + 1;
+ callback(start..end, EscapeError::MultipleSkippedLinesWarning);
+ }
+ let tail = &tail[first_non_space..];
+ if let Some(c) = tail.chars().nth(0) {
+ if c.is_whitespace() {
+ // For error reporting, we would like the span to contain the character that was not
+ // skipped. The +1 is necessary to account for the leading \ that started the escape.
+ let end = start + first_non_space + c.len_utf8() + 1;
+ callback(start..end, EscapeError::UnskippedWhitespaceWarning);
}
- *chars = tail.chars();
}
+ *chars = tail.chars();
}
/// Takes a contents of a string literal (without quotes) and produces a