summaryrefslogtreecommitdiffstats
path: root/third_party/rust/litrs/src/escape.rs
diff options
context:
space:
mode:
Diffstat (limited to 'third_party/rust/litrs/src/escape.rs')
-rw-r--r--third_party/rust/litrs/src/escape.rs262
1 files changed, 262 insertions, 0 deletions
diff --git a/third_party/rust/litrs/src/escape.rs b/third_party/rust/litrs/src/escape.rs
new file mode 100644
index 0000000000..5eb8382bc4
--- /dev/null
+++ b/third_party/rust/litrs/src/escape.rs
@@ -0,0 +1,262 @@
+use crate::{ParseError, err::{perr, ParseErrorKind::*}, parse::{hex_digit_value, check_suffix}};
+
+
+/// Parses the single escape sequence at the start of `input`.
+///
+/// `input` must start with `\`. `offset` is the index of that backslash in the
+/// enclosing literal; it is only used to compute the spans of errors.
+///
+/// Returns the unescaped value together with the number of bytes of `input`
+/// that the escape sequence occupies.
+///
+/// # Errors
+///
+/// Fails if the escape is cut short, uses an unknown escape character,
+/// contains a non-hex digit, is a `\x` escape above `0x7F` while `E` supports
+/// Unicode, is a `\u` escape while `E` does not support Unicode, has more than
+/// six digits, or does not denote a valid `char`.
+///
+/// NOTE(review): an empty `\u{}` is currently accepted and yields U+0000,
+/// whereas the Rust reference requires at least one hex digit. Rejecting it
+/// needs an error kind that is not visible from this file.
+pub(crate) fn unescape<E: Escapee>(input: &str, offset: usize) -> Result<(E, usize), ParseError> {
+    let first = input.as_bytes().get(1)
+        .ok_or_else(|| perr(offset, UnterminatedEscape))?;
+    let out = match first {
+        // Quote escapes
+        b'\'' => (E::from_byte(b'\''), 2),
+        b'"' => (E::from_byte(b'"'), 2),
+
+        // Ascii escapes
+        b'n' => (E::from_byte(b'\n'), 2),
+        b'r' => (E::from_byte(b'\r'), 2),
+        b't' => (E::from_byte(b'\t'), 2),
+        b'\\' => (E::from_byte(b'\\'), 2),
+        b'0' => (E::from_byte(b'\0'), 2),
+        b'x' => {
+            // `get` also returns `None` if the two bytes do not end on a char
+            // boundary, so this never panics on non-ASCII input.
+            let hex_string = input.get(2..4)
+                .ok_or_else(|| perr(offset..offset + input.len(), UnterminatedEscape))?
+                .as_bytes();
+            let first = hex_digit_value(hex_string[0])
+                .ok_or_else(|| perr(offset..offset + 4, InvalidXEscape))?;
+            let second = hex_digit_value(hex_string[1])
+                .ok_or_else(|| perr(offset..offset + 4, InvalidXEscape))?;
+            let value = second + 16 * first;
+
+            if E::SUPPORTS_UNICODE && value > 0x7F {
+                return Err(perr(offset..offset + 4, NonAsciiXEscape));
+            }
+
+            (E::from_byte(value), 4)
+        },
+
+        // Unicode escape
+        b'u' => {
+            if !E::SUPPORTS_UNICODE {
+                return Err(perr(offset..offset + 2, UnicodeEscapeInByteLiteral));
+            }
+
+            if input.as_bytes().get(2) != Some(&b'{') {
+                return Err(perr(offset..offset + 2, UnicodeEscapeWithoutBrace));
+            }
+
+            // `closing_pos` is relative to the start of `input`, so `offset`
+            // has to be added whenever it ends up in an error span.
+            let closing_pos = input.bytes().position(|b| b == b'}')
+                .ok_or_else(|| perr(offset..offset + input.len(), UnterminatedUnicodeEscape))?;
+
+            let inner = &input[3..closing_pos];
+            if inner.as_bytes().first() == Some(&b'_') {
+                // The offending `_` directly follows `\u{`, i.e. it is the
+                // fourth byte of the escape.
+                return Err(perr(offset + 3, InvalidStartOfUnicodeEscape));
+            }
+
+            let mut v: u32 = 0;
+            let mut digit_count = 0;
+            for (i, b) in inner.bytes().enumerate() {
+                // Underscores are separators and do not count as digits.
+                if b == b'_' {
+                    continue;
+                }
+
+                let digit = hex_digit_value(b)
+                    .ok_or_else(|| perr(offset + 3 + i, NonHexDigitInUnicodeEscape))?;
+
+                if digit_count == 6 {
+                    return Err(perr(offset + 3 + i, TooManyDigitInUnicodeEscape));
+                }
+                digit_count += 1;
+                v = 16 * v + digit as u32;
+            }
+
+            let c = std::char::from_u32(v)
+                .ok_or_else(|| perr(offset..offset + closing_pos + 1, InvalidUnicodeEscapeChar))?;
+
+            (E::from_char(c), closing_pos + 1)
+        }
+
+        _ => return Err(perr(offset..offset + 2, UnknownEscape)),
+    };
+
+    Ok(out)
+}
+
+/// A value that a single escape sequence can be decoded into.
+///
+/// The `Into<char>` bound lets callers append any decoded value to a `String`.
+pub(crate) trait Escapee: Into<char> {
+    /// Wraps the character denoted by a `\u{...}` escape.
+    fn from_char(c: char) -> Self;
+
+    /// Wraps the byte denoted by a quote, ASCII or `\x` escape.
+    fn from_byte(b: u8) -> Self;
+
+    /// Whether `\u{...}` escapes are allowed for this kind of value.
+    const SUPPORTS_UNICODE: bool;
+}
+
+/// Byte flavour: no Unicode escapes, so only `from_byte` may ever be reached.
+impl Escapee for u8 {
+    const SUPPORTS_UNICODE: bool = false;
+
+    fn from_char(_: char) -> Self {
+        // `unescape` rejects `\u` escapes before decoding them whenever
+        // `SUPPORTS_UNICODE` is false, so getting here is a bug.
+        panic!("bug: `<u8 as Escapee>::from_char` was called");
+    }
+
+    fn from_byte(byte: u8) -> Self {
+        byte
+    }
+}
+
+/// Character flavour: accepts both byte-sized and Unicode escapes.
+impl Escapee for char {
+    const SUPPORTS_UNICODE: bool = true;
+
+    fn from_char(ch: char) -> Self {
+        ch
+    }
+
+    fn from_byte(byte: u8) -> Self {
+        // A `u8` converts to the `char` with the same code point.
+        char::from(byte)
+    }
+}
+
+/// Tells whether `b` is one of the whitespace bytes that are dropped after a
+/// string continue (an unescaped backslash followed by `\n`): space, tab,
+/// line feed or carriage return.
+fn is_string_continue_skipable_whitespace(b: u8) -> bool {
+    matches!(b, b' ' | b'\t' | b'\n' | b'\r')
+}
+
+/// Unescapes a whole string or byte string.
+///
+/// Scanning starts at byte `offset` of `input` (presumably just past the
+/// opening quote; confirm against the callers) and stops at the first
+/// unescaped `"`. Whatever follows that quote is validated with
+/// `check_suffix`.
+///
+/// Returns the unescaped value and the index at which the suffix starts. The
+/// value is `None` if nothing had to be rewritten (no escape, no string
+/// continue, no `\r\n`), i.e. if it equals the scanned slice of `input`.
+///
+/// # Errors
+///
+/// Fails on an invalid escape, an isolated `\r`, a non-ASCII byte when `E`
+/// does not support Unicode, a missing closing quote, or an invalid suffix.
+#[inline(never)]
+pub(crate) fn unescape_string<E: Escapee>(
+    input: &str,
+    offset: usize,
+) -> Result<(Option<String>, usize), ParseError> {
+    let mut closing_quote_pos = None;
+    let mut i = offset;
+    let mut end_last_escape = offset;
+    let mut value = String::new();
+    // Tracked explicitly: `value` can legitimately be empty even though the
+    // input was rewritten (e.g. a body consisting only of a string continue).
+    let mut saw_escape = false;
+
+    // `unescape` is handed the input without its final character, so a
+    // backslash directly in front of that character is reported as
+    // `UnterminatedEscape`. Cut at the start of the last *character* instead
+    // of at `len - 1`: the latter is not a char boundary when the input ends
+    // in a multi-byte character and would make the slice below panic.
+    let escape_end = input.char_indices().next_back().map_or(0, |(pos, _)| pos);
+
+    while i < input.len() {
+        match input.as_bytes()[i] {
+            // Handle "string continue".
+            b'\\' if input.as_bytes().get(i + 1) == Some(&b'\n') => {
+                value.push_str(&input[end_last_escape..i]);
+
+                // Find the first non-whitespace character.
+                let end_escape = input[i + 2..].bytes()
+                    .position(|b| !is_string_continue_skipable_whitespace(b))
+                    .ok_or_else(|| perr(None, UnterminatedString))?;
+
+                i += 2 + end_escape;
+                end_last_escape = i;
+                saw_escape = true;
+            }
+            b'\\' => {
+                // `i` is the index of an ASCII byte and therefore never lies
+                // behind the start of the last character.
+                let (c, len) = unescape::<E>(&input[i..escape_end], i)?;
+                value.push_str(&input[end_last_escape..i]);
+                value.push(c.into());
+                i += len;
+                end_last_escape = i;
+                saw_escape = true;
+            }
+            b'\r' => {
+                // `\r\n` is normalized to `\n`; a lone `\r` is an error.
+                if input.as_bytes().get(i + 1) == Some(&b'\n') {
+                    value.push_str(&input[end_last_escape..i]);
+                    value.push('\n');
+                    i += 2;
+                    end_last_escape = i;
+                    saw_escape = true;
+                } else {
+                    return Err(perr(i, IsolatedCr))
+                }
+            }
+            b'"' => {
+                closing_quote_pos = Some(i);
+                break;
+            },
+            b if !E::SUPPORTS_UNICODE && !b.is_ascii()
+                => return Err(perr(i, NonAsciiInByteLiteral)),
+            _ => i += 1,
+        }
+    }
+
+    let closing_quote_pos = closing_quote_pos
+        .ok_or_else(|| perr(None, UnterminatedString))?;
+
+    let start_suffix = closing_quote_pos + 1;
+    let suffix = &input[start_suffix..];
+    check_suffix(suffix).map_err(|kind| perr(start_suffix, kind))?;
+
+    // Without any escape the string value equals the input, so `None` is
+    // stored. Otherwise the part after the last escape still has to be
+    // appended.
+    let value = if saw_escape {
+        value.push_str(&input[end_last_escape..closing_quote_pos]);
+        Some(value)
+    } else {
+        None
+    };
+
+    Ok((value, start_suffix))
+}
+
+/// Scans a raw string or raw byte string literal, normalizing every `\r\n`
+/// to `\n`.
+///
+/// Scanning starts at byte `offset` of `input`, where the run of `#`s (if any)
+/// in front of the opening quote is expected.
+///
+/// On success the tuple holds, in order: the normalized contents (`None` if
+/// no `\r\n` occurred, in which case the contents equal the input slice), the
+/// number of `#`s delimiting the literal, and the index at which the suffix
+/// starts.
+#[inline(never)]
+pub(crate) fn scan_raw_string<E: Escapee>(
+    input: &str,
+    offset: usize,
+) -> Result<(Option<String>, u32, usize), ParseError> {
+    let bytes = input.as_bytes();
+
+    // Count the leading `#`s. If the input ends before anything else shows
+    // up, the lookup of the opening quote below fails with the same error.
+    let num_hashes = input[offset..].bytes().take_while(|&b| b == b'#').count();
+    if bytes.get(offset + num_hashes) != Some(&b'"') {
+        return Err(perr(None, InvalidLiteral));
+    }
+    let hashes = &input[offset..offset + num_hashes];
+    let start_inner = offset + num_hashes + 1;
+
+    let mut closing_quote_pos = None;
+    let mut cursor = start_inner;
+    let mut copied_up_to = start_inner;
+    let mut normalized = String::new();
+    while let Some(&byte) = bytes.get(cursor) {
+        match byte {
+            // A quote only terminates the literal if the right number of
+            // hashes follows it.
+            b'"' if input[cursor + 1..].starts_with(hashes) => {
+                closing_quote_pos = Some(cursor);
+                break;
+            }
+
+            // Convert `\r\n` into `\n`. This happens even for raw strings
+            // because rustc converts all line endings when reading source
+            // files.
+            b'\r' if bytes.get(cursor + 1) == Some(&b'\n') => {
+                normalized.push_str(&input[copied_up_to..cursor]);
+                normalized.push('\n');
+                cursor += 2;
+                copied_up_to = cursor;
+            }
+
+            // A `\r` without a following `\n` is only an error in raw
+            // strings, not in raw byte strings.
+            b'\r' if E::SUPPORTS_UNICODE => return Err(perr(cursor, IsolatedCr)),
+
+            _ if !E::SUPPORTS_UNICODE && !byte.is_ascii() => {
+                return Err(perr(cursor, NonAsciiInByteLiteral));
+            }
+
+            _ => cursor += 1,
+        }
+    }
+
+    let closing_quote_pos = match closing_quote_pos {
+        Some(pos) => pos,
+        None => return Err(perr(None, UnterminatedRawString)),
+    };
+
+    let start_suffix = closing_quote_pos + num_hashes + 1;
+    if let Err(kind) = check_suffix(&input[start_suffix..]) {
+        return Err(perr(start_suffix, kind));
+    }
+
+    // Every conversion pushes a `\n`, so `normalized` is non-empty exactly
+    // when something was converted. In that case the part after the last
+    // `\r\n` still has to be appended; otherwise the value equals the input
+    // and `None` is stored.
+    if !normalized.is_empty() {
+        normalized.push_str(&input[copied_up_to..closing_quote_pos]);
+    }
+    let value = Some(normalized).filter(|s| !s.is_empty());
+
+    Ok((value, num_hashes as u32, start_suffix))
+}