From fbaf0bb26397aa498eb9156f06d5a6fe34dd7dd8 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Fri, 19 Apr 2024 03:14:29 +0200 Subject: Merging upstream version 125.0.1. Signed-off-by: Daniel Baumann --- third_party/rust/litrs/src/escape.rs | 262 +++++++++++++++++++++++++++++++++++ 1 file changed, 262 insertions(+) create mode 100644 third_party/rust/litrs/src/escape.rs (limited to 'third_party/rust/litrs/src/escape.rs') diff --git a/third_party/rust/litrs/src/escape.rs b/third_party/rust/litrs/src/escape.rs new file mode 100644 index 0000000000..5eb8382bc4 --- /dev/null +++ b/third_party/rust/litrs/src/escape.rs @@ -0,0 +1,262 @@ +use crate::{ParseError, err::{perr, ParseErrorKind::*}, parse::{hex_digit_value, check_suffix}}; + + +/// Must start with `\` +pub(crate) fn unescape(input: &str, offset: usize) -> Result<(E, usize), ParseError> { + let first = input.as_bytes().get(1) + .ok_or(perr(offset, UnterminatedEscape))?; + let out = match first { + // Quote escapes + b'\'' => (E::from_byte(b'\''), 2), + b'"' => (E::from_byte(b'"'), 2), + + // Ascii escapes + b'n' => (E::from_byte(b'\n'), 2), + b'r' => (E::from_byte(b'\r'), 2), + b't' => (E::from_byte(b'\t'), 2), + b'\\' => (E::from_byte(b'\\'), 2), + b'0' => (E::from_byte(b'\0'), 2), + b'x' => { + let hex_string = input.get(2..4) + .ok_or(perr(offset..offset + input.len(), UnterminatedEscape))? + .as_bytes(); + let first = hex_digit_value(hex_string[0]) + .ok_or(perr(offset..offset + 4, InvalidXEscape))?; + let second = hex_digit_value(hex_string[1]) + .ok_or(perr(offset..offset + 4, InvalidXEscape))?; + let value = second + 16 * first; + + if E::SUPPORTS_UNICODE && value > 0x7F { + return Err(perr(offset..offset + 4, NonAsciiXEscape)); + } + + (E::from_byte(value), 4) + }, + + // Unicode escape + b'u' => { + if !E::SUPPORTS_UNICODE { + return Err(perr(offset..offset + 2, UnicodeEscapeInByteLiteral)); + } + + if input.as_bytes().get(2) != Some(&b'{') { + return Err(perr(offset..offset + 2, UnicodeEscapeWithoutBrace)); + } + + let closing_pos = input.bytes().position(|b| b == b'}') + .ok_or(perr(offset..offset + input.len(), UnterminatedUnicodeEscape))?; + + let inner = &input[3..closing_pos]; + if inner.as_bytes().first() == Some(&b'_') { + return Err(perr(4, InvalidStartOfUnicodeEscape)); + } + + let mut v: u32 = 0; + let mut digit_count = 0; + for (i, b) in inner.bytes().enumerate() { + if b == b'_'{ + continue; + } + + let digit = hex_digit_value(b) + .ok_or(perr(offset + 3 + i, NonHexDigitInUnicodeEscape))?; + + if digit_count == 6 { + return Err(perr(offset + 3 + i, TooManyDigitInUnicodeEscape)); + } + digit_count += 1; + v = 16 * v + digit as u32; + } + + let c = std::char::from_u32(v) + .ok_or(perr(offset..closing_pos + 1, InvalidUnicodeEscapeChar))?; + + (E::from_char(c), closing_pos + 1) + } + + _ => return Err(perr(offset..offset + 2, UnknownEscape)), + }; + + Ok(out) +} + +pub(crate) trait Escapee: Into { + const SUPPORTS_UNICODE: bool; + fn from_byte(b: u8) -> Self; + fn from_char(c: char) -> Self; +} + +impl Escapee for u8 { + const SUPPORTS_UNICODE: bool = false; + fn from_byte(b: u8) -> Self { + b + } + fn from_char(_: char) -> Self { + panic!("bug: `::from_char` was called"); + } +} + +impl Escapee for char { + const SUPPORTS_UNICODE: bool = true; + fn from_byte(b: u8) -> Self { + b.into() + } + fn from_char(c: char) -> Self { + c + } +} + +/// Checks whether the character is skipped after a string continue start +/// (unescaped backlash followed by `\n`). +fn is_string_continue_skipable_whitespace(b: u8) -> bool { + b == b' ' || b == b'\t' || b == b'\n' || b == b'\r' +} + +/// Unescapes a whole string or byte string. +#[inline(never)] +pub(crate) fn unescape_string( + input: &str, + offset: usize, +) -> Result<(Option, usize), ParseError> { + let mut closing_quote_pos = None; + let mut i = offset; + let mut end_last_escape = offset; + let mut value = String::new(); + while i < input.len() { + match input.as_bytes()[i] { + // Handle "string continue". + b'\\' if input.as_bytes().get(i + 1) == Some(&b'\n') => { + value.push_str(&input[end_last_escape..i]); + + // Find the first non-whitespace character. + let end_escape = input[i + 2..].bytes() + .position(|b| !is_string_continue_skipable_whitespace(b)) + .ok_or(perr(None, UnterminatedString))?; + + i += 2 + end_escape; + end_last_escape = i; + } + b'\\' => { + let (c, len) = unescape::(&input[i..input.len() - 1], i)?; + value.push_str(&input[end_last_escape..i]); + value.push(c.into()); + i += len; + end_last_escape = i; + } + b'\r' => { + if input.as_bytes().get(i + 1) == Some(&b'\n') { + value.push_str(&input[end_last_escape..i]); + value.push('\n'); + i += 2; + end_last_escape = i; + } else { + return Err(perr(i, IsolatedCr)) + } + } + b'"' => { + closing_quote_pos = Some(i); + break; + }, + b if !E::SUPPORTS_UNICODE && !b.is_ascii() + => return Err(perr(i, NonAsciiInByteLiteral)), + _ => i += 1, + } + } + + let closing_quote_pos = closing_quote_pos.ok_or(perr(None, UnterminatedString))?; + + let start_suffix = closing_quote_pos + 1; + let suffix = &input[start_suffix..]; + check_suffix(suffix).map_err(|kind| perr(start_suffix, kind))?; + + // `value` is only empty if there was no escape in the input string + // (with the special case of the input being empty). This means the + // string value basically equals the input, so we store `None`. + let value = if value.is_empty() { + None + } else { + // There was an escape in the string, so we need to push the + // remaining unescaped part of the string still. + value.push_str(&input[end_last_escape..closing_quote_pos]); + Some(value) + }; + + Ok((value, start_suffix)) +} + +/// Reads and checks a raw (byte) string literal, converting `\r\n` sequences to +/// just `\n` sequences. Returns an optional new string (if the input contained +/// any `\r\n`) and the number of hashes used by the literal. +#[inline(never)] +pub(crate) fn scan_raw_string( + input: &str, + offset: usize, +) -> Result<(Option, u32, usize), ParseError> { + // Raw string literal + let num_hashes = input[offset..].bytes().position(|b| b != b'#') + .ok_or(perr(None, InvalidLiteral))?; + + if input.as_bytes().get(offset + num_hashes) != Some(&b'"') { + return Err(perr(None, InvalidLiteral)); + } + let start_inner = offset + num_hashes + 1; + let hashes = &input[offset..num_hashes + offset]; + + let mut closing_quote_pos = None; + let mut i = start_inner; + let mut end_last_escape = start_inner; + let mut value = String::new(); + while i < input.len() { + let b = input.as_bytes()[i]; + if b == b'"' && input[i + 1..].starts_with(hashes) { + closing_quote_pos = Some(i); + break; + } + + if b == b'\r' { + // Convert `\r\n` into `\n`. This is currently not well documented + // in the Rust reference, but is done even for raw strings. That's + // because rustc simply converts all line endings when reading + // source files. + if input.as_bytes().get(i + 1) == Some(&b'\n') { + value.push_str(&input[end_last_escape..i]); + value.push('\n'); + i += 2; + end_last_escape = i; + continue; + } else if E::SUPPORTS_UNICODE { + // If no \n follows the \r and we are scanning a raw string + // (not raw byte string), we error. + return Err(perr(i, IsolatedCr)) + } + } + + if !E::SUPPORTS_UNICODE { + if !b.is_ascii() { + return Err(perr(i, NonAsciiInByteLiteral)); + } + } + + i += 1; + } + + let closing_quote_pos = closing_quote_pos.ok_or(perr(None, UnterminatedRawString))?; + + let start_suffix = closing_quote_pos + num_hashes + 1; + let suffix = &input[start_suffix..]; + check_suffix(suffix).map_err(|kind| perr(start_suffix, kind))?; + + // `value` is only empty if there was no \r\n in the input string (with the + // special case of the input being empty). This means the string value + // equals the input, so we store `None`. + let value = if value.is_empty() { + None + } else { + // There was an \r\n in the string, so we need to push the remaining + // unescaped part of the string still. + value.push_str(&input[end_last_escape..closing_quote_pos]); + Some(value) + }; + + Ok((value, num_hashes as u32, start_suffix)) +} -- cgit v1.2.3