summary refs log tree commit diff stats
path: root/compiler/rustc_lexer/src/unescape.rs
diff options
context:
space:
mode:
Diffstat (limited to 'compiler/rustc_lexer/src/unescape.rs')
-rw-r--r-- compiler/rustc_lexer/src/unescape.rs 278
1 files changed, 176 insertions, 102 deletions
diff --git a/compiler/rustc_lexer/src/unescape.rs b/compiler/rustc_lexer/src/unescape.rs
index bb4d91247..c9ad54d8d 100644
--- a/compiler/rustc_lexer/src/unescape.rs
+++ b/compiler/rustc_lexer/src/unescape.rs
@@ -86,10 +86,45 @@ where
let res = unescape_char_or_byte(&mut chars, mode == Mode::Byte);
callback(0..(src.len() - chars.as_str().len()), res);
}
- Mode::Str | Mode::ByteStr => unescape_str_or_byte_str(src, mode == Mode::ByteStr, callback),
+ Mode::Str | Mode::ByteStr => unescape_str_common(src, mode, callback),
+
Mode::RawStr | Mode::RawByteStr => {
unescape_raw_str_or_raw_byte_str(src, mode == Mode::RawByteStr, callback)
}
+ Mode::CStr | Mode::RawCStr => unreachable!(),
+ }
+}
+
+/// A unit within CStr. Must not be a nul character.
+pub enum CStrUnit {
+ Byte(u8),
+ Char(char),
+}
+
+impl From<u8> for CStrUnit {
+ fn from(value: u8) -> Self {
+ CStrUnit::Byte(value)
+ }
+}
+
+impl From<char> for CStrUnit {
+ fn from(value: char) -> Self {
+ CStrUnit::Char(value)
+ }
+}
+
+pub fn unescape_c_string<F>(src: &str, mode: Mode, callback: &mut F)
+where
+ F: FnMut(Range<usize>, Result<CStrUnit, EscapeError>),
+{
+ if mode == Mode::RawCStr {
+ unescape_raw_str_or_raw_byte_str(
+ src,
+ mode.characters_should_be_ascii(),
+ &mut |r, result| callback(r, result.map(CStrUnit::Char)),
+ );
+ } else {
+ unescape_str_common(src, mode, callback);
}
}
@@ -114,34 +149,69 @@ pub enum Mode {
ByteStr,
RawStr,
RawByteStr,
+ CStr,
+ RawCStr,
}
impl Mode {
pub fn in_double_quotes(self) -> bool {
match self {
- Mode::Str | Mode::ByteStr | Mode::RawStr | Mode::RawByteStr => true,
+ Mode::Str
+ | Mode::ByteStr
+ | Mode::RawStr
+ | Mode::RawByteStr
+ | Mode::CStr
+ | Mode::RawCStr => true,
Mode::Char | Mode::Byte => false,
}
}
- pub fn is_byte(self) -> bool {
+ /// Non-byte literals should have `\xXX` escapes that are within the ASCII range.
+ pub fn ascii_escapes_should_be_ascii(self) -> bool {
+ match self {
+ Mode::Char | Mode::Str | Mode::RawStr => true,
+ Mode::Byte | Mode::ByteStr | Mode::RawByteStr | Mode::CStr | Mode::RawCStr => false,
+ }
+ }
+
+ /// Whether characters within the literal must be within the ASCII range
+ pub fn characters_should_be_ascii(self) -> bool {
+ match self {
+ Mode::Byte | Mode::ByteStr | Mode::RawByteStr => true,
+ Mode::Char | Mode::Str | Mode::RawStr | Mode::CStr | Mode::RawCStr => false,
+ }
+ }
+
+ /// Byte literals do not allow unicode escape.
+ pub fn is_unicode_escape_disallowed(self) -> bool {
match self {
Mode::Byte | Mode::ByteStr | Mode::RawByteStr => true,
- Mode::Char | Mode::Str | Mode::RawStr => false,
+ Mode::Char | Mode::Str | Mode::RawStr | Mode::CStr | Mode::RawCStr => false,
+ }
+ }
+
+ pub fn prefix_noraw(self) -> &'static str {
+ match self {
+ Mode::Byte | Mode::ByteStr | Mode::RawByteStr => "b",
+ Mode::CStr | Mode::RawCStr => "c",
+ Mode::Char | Mode::Str | Mode::RawStr => "",
}
}
}
-fn scan_escape(chars: &mut Chars<'_>, is_byte: bool) -> Result<char, EscapeError> {
+fn scan_escape<T: From<u8> + From<char>>(
+ chars: &mut Chars<'_>,
+ mode: Mode,
+) -> Result<T, EscapeError> {
// Previous character was '\\', unescape what follows.
let res = match chars.next().ok_or(EscapeError::LoneSlash)? {
- '"' => '"',
- 'n' => '\n',
- 'r' => '\r',
- 't' => '\t',
- '\\' => '\\',
- '\'' => '\'',
- '0' => '\0',
+ '"' => b'"',
+ 'n' => b'\n',
+ 'r' => b'\r',
+ 't' => b'\t',
+ '\\' => b'\\',
+ '\'' => b'\'',
+ '0' => b'\0',
'x' => {
// Parse hexadecimal character code.
@@ -154,76 +224,78 @@ fn scan_escape(chars: &mut Chars<'_>, is_byte: bool) -> Result<char, EscapeError
let value = hi * 16 + lo;
- // For a non-byte literal verify that it is within ASCII range.
- if !is_byte && !is_ascii(value) {
+ if mode.ascii_escapes_should_be_ascii() && !is_ascii(value) {
return Err(EscapeError::OutOfRangeHexEscape);
}
- let value = value as u8;
- value as char
+ value as u8
}
- 'u' => {
- // We've parsed '\u', now we have to parse '{..}'.
+ 'u' => return scan_unicode(chars, mode.is_unicode_escape_disallowed()).map(Into::into),
+ _ => return Err(EscapeError::InvalidEscape),
+ };
+ Ok(res.into())
+}
+
+fn scan_unicode(
+ chars: &mut Chars<'_>,
+ is_unicode_escape_disallowed: bool,
+) -> Result<char, EscapeError> {
+ // We've parsed '\u', now we have to parse '{..}'.
- if chars.next() != Some('{') {
- return Err(EscapeError::NoBraceInUnicodeEscape);
- }
+ if chars.next() != Some('{') {
+ return Err(EscapeError::NoBraceInUnicodeEscape);
+ }
- // First character must be a hexadecimal digit.
- let mut n_digits = 1;
- let mut value: u32 = match chars.next().ok_or(EscapeError::UnclosedUnicodeEscape)? {
- '_' => return Err(EscapeError::LeadingUnderscoreUnicodeEscape),
- '}' => return Err(EscapeError::EmptyUnicodeEscape),
- c => c.to_digit(16).ok_or(EscapeError::InvalidCharInUnicodeEscape)?,
- };
-
- // First character is valid, now parse the rest of the number
- // and closing brace.
- loop {
- match chars.next() {
- None => return Err(EscapeError::UnclosedUnicodeEscape),
- Some('_') => continue,
- Some('}') => {
- if n_digits > 6 {
- return Err(EscapeError::OverlongUnicodeEscape);
- }
-
- // Incorrect syntax has higher priority for error reporting
- // than unallowed value for a literal.
- if is_byte {
- return Err(EscapeError::UnicodeEscapeInByte);
- }
-
- break std::char::from_u32(value).ok_or_else(|| {
- if value > 0x10FFFF {
- EscapeError::OutOfRangeUnicodeEscape
- } else {
- EscapeError::LoneSurrogateUnicodeEscape
- }
- })?;
- }
- Some(c) => {
- let digit: u32 =
- c.to_digit(16).ok_or(EscapeError::InvalidCharInUnicodeEscape)?;
- n_digits += 1;
- if n_digits > 6 {
- // Stop updating value since we're sure that it's incorrect already.
- continue;
- }
- value = value * 16 + digit;
+ // First character must be a hexadecimal digit.
+ let mut n_digits = 1;
+ let mut value: u32 = match chars.next().ok_or(EscapeError::UnclosedUnicodeEscape)? {
+ '_' => return Err(EscapeError::LeadingUnderscoreUnicodeEscape),
+ '}' => return Err(EscapeError::EmptyUnicodeEscape),
+ c => c.to_digit(16).ok_or(EscapeError::InvalidCharInUnicodeEscape)?,
+ };
+
+ // First character is valid, now parse the rest of the number
+ // and closing brace.
+ loop {
+ match chars.next() {
+ None => return Err(EscapeError::UnclosedUnicodeEscape),
+ Some('_') => continue,
+ Some('}') => {
+ if n_digits > 6 {
+ return Err(EscapeError::OverlongUnicodeEscape);
+ }
+
+ // Incorrect syntax has higher priority for error reporting
+ // than unallowed value for a literal.
+ if is_unicode_escape_disallowed {
+ return Err(EscapeError::UnicodeEscapeInByte);
+ }
+
+ break std::char::from_u32(value).ok_or_else(|| {
+ if value > 0x10FFFF {
+ EscapeError::OutOfRangeUnicodeEscape
+ } else {
+ EscapeError::LoneSurrogateUnicodeEscape
}
- };
+ });
}
- }
- _ => return Err(EscapeError::InvalidEscape),
- };
- Ok(res)
+ Some(c) => {
+ let digit: u32 = c.to_digit(16).ok_or(EscapeError::InvalidCharInUnicodeEscape)?;
+ n_digits += 1;
+ if n_digits > 6 {
+ // Stop updating value since we're sure that it's incorrect already.
+ continue;
+ }
+ value = value * 16 + digit;
+ }
+ };
+ }
}
#[inline]
-fn ascii_check(c: char, is_byte: bool) -> Result<char, EscapeError> {
- if is_byte && !c.is_ascii() {
+fn ascii_check(c: char, characters_should_be_ascii: bool) -> Result<char, EscapeError> {
+ if characters_should_be_ascii && !c.is_ascii() {
// Byte literal can't be a non-ascii character.
Err(EscapeError::NonAsciiCharInByte)
} else {
@@ -234,7 +306,7 @@ fn ascii_check(c: char, is_byte: bool) -> Result<char, EscapeError> {
fn unescape_char_or_byte(chars: &mut Chars<'_>, is_byte: bool) -> Result<char, EscapeError> {
let c = chars.next().ok_or(EscapeError::ZeroChars)?;
let res = match c {
- '\\' => scan_escape(chars, is_byte),
+ '\\' => scan_escape(chars, if is_byte { Mode::Byte } else { Mode::Char }),
'\n' | '\t' | '\'' => Err(EscapeError::EscapeOnlyChar),
'\r' => Err(EscapeError::BareCarriageReturn),
_ => ascii_check(c, is_byte),
@@ -247,9 +319,9 @@ fn unescape_char_or_byte(chars: &mut Chars<'_>, is_byte: bool) -> Result<char, E
/// Takes a contents of a string literal (without quotes) and produces a
/// sequence of escaped characters or errors.
-fn unescape_str_or_byte_str<F>(src: &str, is_byte: bool, callback: &mut F)
+fn unescape_str_common<F, T: From<u8> + From<char>>(src: &str, mode: Mode, callback: &mut F)
where
- F: FnMut(Range<usize>, Result<char, EscapeError>),
+ F: FnMut(Range<usize>, Result<T, EscapeError>),
{
let mut chars = src.chars();
@@ -266,47 +338,49 @@ where
// if unescaped '\' character is followed by '\n'.
// For details see [Rust language reference]
// (https://doc.rust-lang.org/reference/tokens.html#string-literals).
- skip_ascii_whitespace(&mut chars, start, callback);
+ skip_ascii_whitespace(&mut chars, start, &mut |range, err| {
+ callback(range, Err(err))
+ });
continue;
}
- _ => scan_escape(&mut chars, is_byte),
+ _ => scan_escape::<T>(&mut chars, mode),
}
}
- '\n' => Ok('\n'),
- '\t' => Ok('\t'),
+ '\n' => Ok(b'\n'.into()),
+ '\t' => Ok(b'\t'.into()),
'"' => Err(EscapeError::EscapeOnlyChar),
'\r' => Err(EscapeError::BareCarriageReturn),
- _ => ascii_check(c, is_byte),
+ _ => ascii_check(c, mode.characters_should_be_ascii()).map(Into::into),
};
let end = src.len() - chars.as_str().len();
- callback(start..end, res);
+ callback(start..end, res.map(Into::into));
}
+}
- fn skip_ascii_whitespace<F>(chars: &mut Chars<'_>, start: usize, callback: &mut F)
- where
- F: FnMut(Range<usize>, Result<char, EscapeError>),
- {
- let tail = chars.as_str();
- let first_non_space = tail
- .bytes()
- .position(|b| b != b' ' && b != b'\t' && b != b'\n' && b != b'\r')
- .unwrap_or(tail.len());
- if tail[1..first_non_space].contains('\n') {
- // The +1 accounts for the escaping slash.
- let end = start + first_non_space + 1;
- callback(start..end, Err(EscapeError::MultipleSkippedLinesWarning));
- }
- let tail = &tail[first_non_space..];
- if let Some(c) = tail.chars().nth(0) {
- if c.is_whitespace() {
- // For error reporting, we would like the span to contain the character that was not
- // skipped. The +1 is necessary to account for the leading \ that started the escape.
- let end = start + first_non_space + c.len_utf8() + 1;
- callback(start..end, Err(EscapeError::UnskippedWhitespaceWarning));
- }
+fn skip_ascii_whitespace<F>(chars: &mut Chars<'_>, start: usize, callback: &mut F)
+where
+ F: FnMut(Range<usize>, EscapeError),
+{
+ let tail = chars.as_str();
+ let first_non_space = tail
+ .bytes()
+ .position(|b| b != b' ' && b != b'\t' && b != b'\n' && b != b'\r')
+ .unwrap_or(tail.len());
+ if tail[1..first_non_space].contains('\n') {
+ // The +1 accounts for the escaping slash.
+ let end = start + first_non_space + 1;
+ callback(start..end, EscapeError::MultipleSkippedLinesWarning);
+ }
+ let tail = &tail[first_non_space..];
+ if let Some(c) = tail.chars().nth(0) {
+ if c.is_whitespace() {
+ // For error reporting, we would like the span to contain the character that was not
+ // skipped. The +1 is necessary to account for the leading \ that started the escape.
+ let end = start + first_non_space + c.len_utf8() + 1;
+ callback(start..end, EscapeError::UnskippedWhitespaceWarning);
}
- *chars = tail.chars();
}
+ *chars = tail.chars();
}
/// Takes a contents of a string literal (without quotes) and produces a