diff options
Diffstat (limited to 'third_party/rust/cstr/src/parse.rs')
-rw-r--r-- | third_party/rust/cstr/src/parse.rs | 225 |
1 files changed, 225 insertions, 0 deletions
diff --git a/third_party/rust/cstr/src/parse.rs b/third_party/rust/cstr/src/parse.rs new file mode 100644 index 0000000000..aff419d654 --- /dev/null +++ b/third_party/rust/cstr/src/parse.rs @@ -0,0 +1,225 @@ +use crate::Error; +use proc_macro2::{Delimiter, Ident, Literal, Span, TokenStream, TokenTree}; +use std::char; + +macro_rules! unexpected_content { + () => { + "expected one of: byte string literal, string literal, identifier" + }; +} + +pub(crate) fn parse_input(mut input: TokenStream) -> Result<(Vec<u8>, Span), Error> { + loop { + let mut tokens = input.into_iter(); + let token = match tokens.next() { + Some(token) => token, + None => { + return Err(Error( + Span::call_site(), + concat!("unexpected end of input, ", unexpected_content!()), + )) + } + }; + let span = token.span(); + let result = match token { + // Unwrap any empty group which may be created from macro expansion. + TokenTree::Group(group) if group.delimiter() == Delimiter::None => Err(group), + TokenTree::Literal(literal) => match parse_literal(literal) { + Ok(result) => Ok(result), + Err(msg) => return Err(Error(span, msg)), + }, + TokenTree::Ident(ident) => Ok(parse_ident(ident)), + _ => return Err(Error(span, unexpected_content!())), + }; + if let Some(token) = tokens.next() { + return Err(Error(token.span(), "unexpected token")); + } + match result { + Ok(result) => return Ok((result, span)), + Err(group) => input = group.stream(), + } + } +} + +fn parse_literal(literal: Literal) -> Result<Vec<u8>, &'static str> { + let s = literal.to_string(); + let s = s.as_bytes(); + match s[0] { + b'"' => Ok(parse_cooked_content(s)), + b'r' => Ok(parse_raw_content(&s[1..])), + b'b' => match s[1] { + b'"' => Ok(parse_cooked_content(&s[1..])), + b'r' => Ok(parse_raw_content(&s[2..])), + _ => Err(unexpected_content!()), + }, + _ => Err(unexpected_content!()), + } +} + +fn all_pounds(bytes: &[u8]) -> bool { + bytes.iter().all(|b| *b == b'#') +} + +/// Parses raw string / bytes content after `r` prefix. +fn parse_raw_content(s: &[u8]) -> Vec<u8> { + let q_start = s.iter().position(|b| *b == b'"').unwrap(); + let q_end = s.iter().rposition(|b| *b == b'"').unwrap(); + assert!(all_pounds(&s[0..q_start])); + assert!(all_pounds(&s[q_end + 1..q_end + q_start + 1])); + Vec::from(&s[q_start + 1..q_end]) +} + +/// Parses the cooked string / bytes content within quotes. +fn parse_cooked_content(mut s: &[u8]) -> Vec<u8> { + s = &s[1..s.iter().rposition(|b| *b == b'"').unwrap()]; + let mut result = Vec::new(); + while !s.is_empty() { + match s[0] { + b'\\' => {} + b'\r' => { + assert_eq!(s[1], b'\n'); + result.push(b'\n'); + s = &s[2..]; + continue; + } + b => { + result.push(b); + s = &s[1..]; + continue; + } + } + let b = s[1]; + s = &s[2..]; + match b { + b'x' => { + let (b, rest) = backslash_x(s); + result.push(b); + s = rest; + } + b'u' => { + let (c, rest) = backslash_u(s); + result.extend_from_slice(c.encode_utf8(&mut [0; 4]).as_bytes()); + s = rest; + } + b'n' => result.push(b'\n'), + b'r' => result.push(b'\r'), + b't' => result.push(b'\t'), + b'\\' => result.push(b'\\'), + b'0' => result.push(b'\0'), + b'\'' => result.push(b'\''), + b'"' => result.push(b'"'), + b'\r' | b'\n' => { + let next = s.iter().position(|b| { + let ch = char::from_u32(u32::from(*b)).unwrap(); + !ch.is_whitespace() + }); + match next { + Some(pos) => s = &s[pos..], + None => s = b"", + } + } + b => panic!("unexpected byte {:?} after \\", b), + } + } + result +} + +fn backslash_x(s: &[u8]) -> (u8, &[u8]) { + let ch = hex_to_u8(s[0]) * 0x10 + hex_to_u8(s[1]); + (ch, &s[2..]) +} + +fn hex_to_u8(b: u8) -> u8 { + match b { + b'0'..=b'9' => b - b'0', + b'a'..=b'f' => b - b'a' + 10, + b'A'..=b'F' => b - b'A' + 10, + _ => unreachable!("unexpected non-hex character {:?} after \\x", b), + } +} + +fn backslash_u(s: &[u8]) -> (char, &[u8]) { + assert_eq!(s[0], b'{'); + let end = s[1..].iter().position(|b| *b == b'}').unwrap(); + let mut ch = 0; + for b in &s[1..=end] { + ch *= 0x10; + ch += u32::from(hex_to_u8(*b)); + } + (char::from_u32(ch).unwrap(), &s[end + 2..]) +} + +fn parse_ident(ident: Ident) -> Vec<u8> { + ident.to_string().into_bytes() +} + +#[cfg(test)] +mod tests { + use super::*; + use std::str::FromStr; + + // Tests below were modified from + // https://github.com/dtolnay/syn/blob/cd5fdc0f530f822446fccaf831669cd0cf4a0fc9/tests/test_lit.rs + + fn lit(s: &str) -> Vec<u8> { + match TokenStream::from_str(s) + .unwrap() + .into_iter() + .next() + .unwrap() + { + TokenTree::Literal(lit) => parse_literal(lit).unwrap(), + _ => panic!(), + } + } + + #[test] + fn strings() { + #[track_caller] + fn test_string(s: &str, value: &[u8]) { + assert_eq!(lit(s), value); + } + + test_string("\"a\"", b"a"); + test_string("\"\\n\"", b"\n"); + test_string("\"\\r\"", b"\r"); + test_string("\"\\t\"", b"\t"); + test_string("\"🐕\"", b"\xf0\x9f\x90\x95"); // NOTE: This is an emoji + test_string("\"\\\"\"", b"\""); + test_string("\"'\"", b"'"); + test_string("\"\"", b""); + test_string("\"\\u{1F415}\"", b"\xf0\x9f\x90\x95"); + test_string( + "\"contains\nnewlines\\\nescaped newlines\"", + b"contains\nnewlinesescaped newlines", + ); + test_string("r\"raw\nstring\\\nhere\"", b"raw\nstring\\\nhere"); + test_string("\"...\"q", b"..."); + test_string("r\"...\"q", b"..."); + test_string("r##\"...\"##q", b"..."); + } + + #[test] + fn byte_strings() { + #[track_caller] + fn test_byte_string(s: &str, value: &[u8]) { + assert_eq!(lit(s), value); + } + + test_byte_string("b\"a\"", b"a"); + test_byte_string("b\"\\n\"", b"\n"); + test_byte_string("b\"\\r\"", b"\r"); + test_byte_string("b\"\\t\"", b"\t"); + test_byte_string("b\"\\\"\"", b"\""); + test_byte_string("b\"'\"", b"'"); + test_byte_string("b\"\"", b""); + test_byte_string( + "b\"contains\nnewlines\\\nescaped newlines\"", + b"contains\nnewlinesescaped newlines", + ); + test_byte_string("br\"raw\nstring\\\nhere\"", b"raw\nstring\\\nhere"); + test_byte_string("b\"...\"q", b"..."); + test_byte_string("br\"...\"q", b"..."); + test_byte_string("br##\"...\"##q", b"..."); + } +} |