use std::borrow::Cow; use std::char; use std::ops::RangeInclusive; use winnow::combinator::alt; use winnow::combinator::cut_err; use winnow::combinator::delimited; use winnow::combinator::fail; use winnow::combinator::opt; use winnow::combinator::peek; use winnow::combinator::preceded; use winnow::combinator::repeat; use winnow::combinator::success; use winnow::combinator::terminated; use winnow::prelude::*; use winnow::stream::Stream; use winnow::token::any; use winnow::token::none_of; use winnow::token::one_of; use winnow::token::tag; use winnow::token::take_while; use winnow::trace::trace; use crate::parser::errors::CustomError; use crate::parser::numbers::HEXDIG; use crate::parser::prelude::*; use crate::parser::trivia::{from_utf8_unchecked, newline, ws, ws_newlines, NON_ASCII, WSCHAR}; // ;; String // string = ml-basic-string / basic-string / ml-literal-string / literal-string pub(crate) fn string<'i>(input: &mut Input<'i>) -> PResult> { trace( "string", alt(( ml_basic_string, basic_string, ml_literal_string, literal_string.map(Cow::Borrowed), )), ) .parse_next(input) } // ;; Basic String // basic-string = quotation-mark *basic-char quotation-mark pub(crate) fn basic_string<'i>(input: &mut Input<'i>) -> PResult> { trace("basic-string", |input: &mut Input<'i>| { let _ = one_of(QUOTATION_MARK).parse_next(input)?; let mut c = Cow::Borrowed(""); if let Some(ci) = opt(basic_chars).parse_next(input)? { c = ci; } while let Some(ci) = opt(basic_chars).parse_next(input)? { c.to_mut().push_str(&ci); } let _ = cut_err(one_of(QUOTATION_MARK)) .context(StrContext::Label("basic string")) .parse_next(input)?; Ok(c) }) .parse_next(input) } // quotation-mark = %x22 ; " pub(crate) const QUOTATION_MARK: u8 = b'"'; // basic-char = basic-unescaped / escaped fn basic_chars<'i>(input: &mut Input<'i>) -> PResult> { alt(( // Deviate from the official grammar by batching the unescaped chars so we build a string a // chunk at a time, rather than a `char` at a time. take_while(1.., BASIC_UNESCAPED) .try_map(std::str::from_utf8) .map(Cow::Borrowed), escaped.map(|c| Cow::Owned(String::from(c))), )) .parse_next(input) } // basic-unescaped = wschar / %x21 / %x23-5B / %x5D-7E / non-ascii pub(crate) const BASIC_UNESCAPED: ( (u8, u8), u8, RangeInclusive, RangeInclusive, RangeInclusive, ) = (WSCHAR, 0x21, 0x23..=0x5B, 0x5D..=0x7E, NON_ASCII); // escaped = escape escape-seq-char fn escaped(input: &mut Input<'_>) -> PResult { preceded(ESCAPE, escape_seq_char).parse_next(input) } // escape = %x5C ; \ pub(crate) const ESCAPE: u8 = b'\\'; // escape-seq-char = %x22 ; " quotation mark U+0022 // escape-seq-char =/ %x5C ; \ reverse solidus U+005C // escape-seq-char =/ %x62 ; b backspace U+0008 // escape-seq-char =/ %x66 ; f form feed U+000C // escape-seq-char =/ %x6E ; n line feed U+000A // escape-seq-char =/ %x72 ; r carriage return U+000D // escape-seq-char =/ %x74 ; t tab U+0009 // escape-seq-char =/ %x75 4HEXDIG ; uXXXX U+XXXX // escape-seq-char =/ %x55 8HEXDIG ; UXXXXXXXX U+XXXXXXXX fn escape_seq_char(input: &mut Input<'_>) -> PResult { dispatch! {any; b'b' => success('\u{8}'), b'f' => success('\u{c}'), b'n' => success('\n'), b'r' => success('\r'), b't' => success('\t'), b'u' => cut_err(hexescape::<4>).context(StrContext::Label("unicode 4-digit hex code")), b'U' => cut_err(hexescape::<8>).context(StrContext::Label("unicode 8-digit hex code")), b'\\' => success('\\'), b'"' => success('"'), _ => { cut_err(fail::<_, char, _>) .context(StrContext::Label("escape sequence")) .context(StrContext::Expected(StrContextValue::CharLiteral('b'))) .context(StrContext::Expected(StrContextValue::CharLiteral('f'))) .context(StrContext::Expected(StrContextValue::CharLiteral('n'))) .context(StrContext::Expected(StrContextValue::CharLiteral('r'))) .context(StrContext::Expected(StrContextValue::CharLiteral('t'))) .context(StrContext::Expected(StrContextValue::CharLiteral('u'))) .context(StrContext::Expected(StrContextValue::CharLiteral('U'))) .context(StrContext::Expected(StrContextValue::CharLiteral('\\'))) .context(StrContext::Expected(StrContextValue::CharLiteral('"'))) } } .parse_next(input) } pub(crate) fn hexescape(input: &mut Input<'_>) -> PResult { take_while(0..=N, HEXDIG) .verify(|b: &[u8]| b.len() == N) .map(|b: &[u8]| unsafe { from_utf8_unchecked(b, "`is_ascii_digit` filters out on-ASCII") }) .verify_map(|s| u32::from_str_radix(s, 16).ok()) .try_map(|h| char::from_u32(h).ok_or(CustomError::OutOfRange)) .parse_next(input) } // ;; Multiline Basic String // ml-basic-string = ml-basic-string-delim [ newline ] ml-basic-body // ml-basic-string-delim fn ml_basic_string<'i>(input: &mut Input<'i>) -> PResult> { trace( "ml-basic-string", delimited( ML_BASIC_STRING_DELIM, preceded(opt(newline), cut_err(ml_basic_body)), cut_err(ML_BASIC_STRING_DELIM), ) .context(StrContext::Label("multiline basic string")), ) .parse_next(input) } // ml-basic-string-delim = 3quotation-mark pub(crate) const ML_BASIC_STRING_DELIM: &[u8] = b"\"\"\""; // ml-basic-body = *mlb-content *( mlb-quotes 1*mlb-content ) [ mlb-quotes ] fn ml_basic_body<'i>(input: &mut Input<'i>) -> PResult> { let mut c = Cow::Borrowed(""); if let Some(ci) = opt(mlb_content).parse_next(input)? { c = ci; } while let Some(ci) = opt(mlb_content).parse_next(input)? { c.to_mut().push_str(&ci); } while let Some(qi) = opt(mlb_quotes(none_of(b'\"').value(()))).parse_next(input)? { if let Some(ci) = opt(mlb_content).parse_next(input)? { c.to_mut().push_str(qi); c.to_mut().push_str(&ci); while let Some(ci) = opt(mlb_content).parse_next(input)? { c.to_mut().push_str(&ci); } } else { break; } } if let Some(qi) = opt(mlb_quotes(tag(ML_BASIC_STRING_DELIM).value(()))).parse_next(input)? { c.to_mut().push_str(qi); } Ok(c) } // mlb-content = mlb-char / newline / mlb-escaped-nl // mlb-char = mlb-unescaped / escaped fn mlb_content<'i>(input: &mut Input<'i>) -> PResult> { alt(( // Deviate from the official grammar by batching the unescaped chars so we build a string a // chunk at a time, rather than a `char` at a time. take_while(1.., MLB_UNESCAPED) .try_map(std::str::from_utf8) .map(Cow::Borrowed), // Order changed fromg grammar so `escaped` can more easily `cut_err` on bad escape sequences mlb_escaped_nl.map(|_| Cow::Borrowed("")), escaped.map(|c| Cow::Owned(String::from(c))), newline.map(|_| Cow::Borrowed("\n")), )) .parse_next(input) } // mlb-quotes = 1*2quotation-mark fn mlb_quotes<'i>( mut term: impl winnow::Parser, (), ContextError>, ) -> impl Parser, &'i str, ContextError> { move |input: &mut Input<'i>| { let start = input.checkpoint(); let res = terminated(b"\"\"", peek(term.by_ref())) .map(|b| unsafe { from_utf8_unchecked(b, "`bytes` out non-ASCII") }) .parse_next(input); match res { Err(winnow::error::ErrMode::Backtrack(_)) => { input.reset(start); terminated(b"\"", peek(term.by_ref())) .map(|b| unsafe { from_utf8_unchecked(b, "`bytes` out non-ASCII") }) .parse_next(input) } res => res, } } } // mlb-unescaped = wschar / %x21 / %x23-5B / %x5D-7E / non-ascii pub(crate) const MLB_UNESCAPED: ( (u8, u8), u8, RangeInclusive, RangeInclusive, RangeInclusive, ) = (WSCHAR, 0x21, 0x23..=0x5B, 0x5D..=0x7E, NON_ASCII); // mlb-escaped-nl = escape ws newline *( wschar / newline // When the last non-whitespace character on a line is a \, // it will be trimmed along with all whitespace // (including newlines) up to the next non-whitespace // character or closing delimiter. fn mlb_escaped_nl(input: &mut Input<'_>) -> PResult<()> { repeat(1.., (ESCAPE, ws, ws_newlines)) .map(|()| ()) .value(()) .parse_next(input) } // ;; Literal String // literal-string = apostrophe *literal-char apostrophe pub(crate) fn literal_string<'i>(input: &mut Input<'i>) -> PResult<&'i str> { trace( "literal-string", delimited( APOSTROPHE, cut_err(take_while(0.., LITERAL_CHAR)), cut_err(APOSTROPHE), ) .try_map(std::str::from_utf8) .context(StrContext::Label("literal string")), ) .parse_next(input) } // apostrophe = %x27 ; ' apostrophe pub(crate) const APOSTROPHE: u8 = b'\''; // literal-char = %x09 / %x20-26 / %x28-7E / non-ascii pub(crate) const LITERAL_CHAR: ( u8, RangeInclusive, RangeInclusive, RangeInclusive, ) = (0x9, 0x20..=0x26, 0x28..=0x7E, NON_ASCII); // ;; Multiline Literal String // ml-literal-string = ml-literal-string-delim [ newline ] ml-literal-body // ml-literal-string-delim fn ml_literal_string<'i>(input: &mut Input<'i>) -> PResult> { trace( "ml-literal-string", delimited( (ML_LITERAL_STRING_DELIM, opt(newline)), cut_err(ml_literal_body.map(|t| { if t.contains("\r\n") { Cow::Owned(t.replace("\r\n", "\n")) } else { Cow::Borrowed(t) } })), cut_err(ML_LITERAL_STRING_DELIM), ) .context(StrContext::Label("multiline literal string")), ) .parse_next(input) } // ml-literal-string-delim = 3apostrophe pub(crate) const ML_LITERAL_STRING_DELIM: &[u8] = b"'''"; // ml-literal-body = *mll-content *( mll-quotes 1*mll-content ) [ mll-quotes ] fn ml_literal_body<'i>(input: &mut Input<'i>) -> PResult<&'i str> { ( repeat(0.., mll_content).map(|()| ()), repeat( 0.., ( mll_quotes(none_of(APOSTROPHE).value(())), repeat(1.., mll_content).map(|()| ()), ), ) .map(|()| ()), opt(mll_quotes(tag(ML_LITERAL_STRING_DELIM).value(()))), ) .recognize() .try_map(std::str::from_utf8) .parse_next(input) } // mll-content = mll-char / newline fn mll_content(input: &mut Input<'_>) -> PResult { alt((one_of(MLL_CHAR), newline)).parse_next(input) } // mll-char = %x09 / %x20-26 / %x28-7E / non-ascii const MLL_CHAR: ( u8, RangeInclusive, RangeInclusive, RangeInclusive, ) = (0x9, 0x20..=0x26, 0x28..=0x7E, NON_ASCII); // mll-quotes = 1*2apostrophe fn mll_quotes<'i>( mut term: impl winnow::Parser, (), ContextError>, ) -> impl Parser, &'i str, ContextError> { move |input: &mut Input<'i>| { let start = input.checkpoint(); let res = terminated(b"''", peek(term.by_ref())) .map(|b| unsafe { from_utf8_unchecked(b, "`bytes` out non-ASCII") }) .parse_next(input); match res { Err(winnow::error::ErrMode::Backtrack(_)) => { input.reset(start); terminated(b"'", peek(term.by_ref())) .map(|b| unsafe { from_utf8_unchecked(b, "`bytes` out non-ASCII") }) .parse_next(input) } res => res, } } } #[cfg(test)] mod test { use super::*; #[test] fn basic_string() { let input = r#""I'm a string. \"You can quote me\". Name\tJos\u00E9\nLocation\tSF. \U0002070E""#; let expected = "I\'m a string. \"You can quote me\". Name\tJosé\nLocation\tSF. \u{2070E}"; let parsed = string.parse(new_input(input)); assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}"); } #[test] fn ml_basic_string() { let cases = [ ( r#"""" Roses are red Violets are blue""""#, r#"Roses are red Violets are blue"#, ), (r#"""" \""" """"#, " \"\"\" "), (r#"""" \\""""#, " \\"), ]; for &(input, expected) in &cases { let parsed = string.parse(new_input(input)); assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}"); } let invalid_cases = [r#"""" """#, r#"""" \""""#]; for input in &invalid_cases { let parsed = string.parse(new_input(input)); assert!(parsed.is_err()); } } #[test] fn ml_basic_string_escape_ws() { let inputs = [ r#"""" The quick brown \ fox jumps over \ the lazy dog.""""#, r#""""\ The quick brown \ fox jumps over \ the lazy dog.\ """"#, ]; for input in &inputs { let expected = "The quick brown fox jumps over the lazy dog."; let parsed = string.parse(new_input(input)); assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}"); } let empties = [ r#""""\ """"#, r#"""" \ \ """"#, ]; for input in &empties { let expected = ""; let parsed = string.parse(new_input(input)); assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}"); } } #[test] fn literal_string() { let inputs = [ r#"'C:\Users\nodejs\templates'"#, r#"'\\ServerX\admin$\system32\'"#, r#"'Tom "Dubs" Preston-Werner'"#, r#"'<\i\c*\s*>'"#, ]; for input in &inputs { let expected = &input[1..input.len() - 1]; let parsed = string.parse(new_input(input)); assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}"); } } #[test] fn ml_literal_string() { let inputs = [ r#"'''I [dw]on't need \d{2} apples'''"#, r#"''''one_quote''''"#, ]; for input in &inputs { let expected = &input[3..input.len() - 3]; let parsed = string.parse(new_input(input)); assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}"); } let input = r#"''' The first newline is trimmed in raw strings. All other whitespace is preserved. '''"#; let expected = &input[4..input.len() - 3]; let parsed = string.parse(new_input(input)); assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}"); } }