diff options
Diffstat (limited to 'vendor/gix-config/src/parse/nom/mod.rs')
-rw-r--r-- | vendor/gix-config/src/parse/nom/mod.rs | 460 |
1 files changed, 460 insertions, 0 deletions
diff --git a/vendor/gix-config/src/parse/nom/mod.rs b/vendor/gix-config/src/parse/nom/mod.rs new file mode 100644 index 000000000..11d1dea6b --- /dev/null +++ b/vendor/gix-config/src/parse/nom/mod.rs @@ -0,0 +1,460 @@ +use std::borrow::Cow; + +use bstr::{BStr, BString, ByteSlice, ByteVec}; +use nom::{ + branch::alt, + bytes::complete::{tag, take_till, take_while}, + character::{ + complete::{char, one_of}, + is_space, + }, + combinator::{map, opt}, + error::{Error as NomError, ErrorKind}, + multi::{fold_many0, fold_many1}, + sequence::delimited, + IResult, +}; + +use crate::parse::{error::ParseNode, section, Comment, Error, Event}; + +/// Attempt to zero-copy parse the provided bytes, passing results to `dispatch`. +pub fn from_bytes<'a>(input: &'a [u8], mut dispatch: impl FnMut(Event<'a>)) -> Result<(), Error> { + let bom = unicode_bom::Bom::from(input); + let mut newlines = 0; + let (i, _) = fold_many0( + alt(( + map(comment, Event::Comment), + map(take_spaces, |whitespace| Event::Whitespace(Cow::Borrowed(whitespace))), + map(take_newlines, |(newline, counter)| { + newlines += counter; + Event::Newline(Cow::Borrowed(newline)) + }), + )), + || (), + |_acc, event| dispatch(event), + )(&input[bom.len()..]) + // I don't think this can panic. many0 errors if the child parser returns + // a success where the input was not consumed, but alt will only return Ok + // if one of its children succeed. However, all of it's children are + // guaranteed to consume something if they succeed, so the Ok(i) == i case + // can never occur. + .expect("many0(alt(...)) panicked. Likely a bug in one of the children parsers."); + + if i.is_empty() { + return Ok(()); + } + + let mut node = ParseNode::SectionHeader; + + let res = fold_many1( + |i| section(i, &mut node, &mut dispatch), + || (), + |_acc, additional_newlines| { + newlines += additional_newlines; + }, + )(i); + let (i, _) = res.map_err(|_| Error { + line_number: newlines, + last_attempted_parser: node, + parsed_until: i.as_bstr().into(), + })?; + + // This needs to happen after we collect sections, otherwise the line number + // will be off. + if !i.is_empty() { + return Err(Error { + line_number: newlines, + last_attempted_parser: node, + parsed_until: i.as_bstr().into(), + }); + } + + Ok(()) +} + +fn comment(i: &[u8]) -> IResult<&[u8], Comment<'_>> { + let (i, comment_tag) = one_of(";#")(i)?; + let (i, comment) = take_till(|c| c == b'\n')(i)?; + Ok(( + i, + Comment { + tag: comment_tag as u8, + text: Cow::Borrowed(comment.as_bstr()), + }, + )) +} + +#[cfg(test)] +mod tests; + +fn section<'a>(i: &'a [u8], node: &mut ParseNode, dispatch: &mut impl FnMut(Event<'a>)) -> IResult<&'a [u8], usize> { + let (mut i, header) = section_header(i)?; + dispatch(Event::SectionHeader(header)); + + let mut newlines = 0; + + // This would usually be a many0(alt(...)), the manual loop allows us to + // optimize vec insertions + loop { + let old_i = i; + + if let Ok((new_i, v)) = take_spaces(i) { + if old_i != new_i { + i = new_i; + dispatch(Event::Whitespace(Cow::Borrowed(v.as_bstr()))); + } + } + + if let Ok((new_i, (v, new_newlines))) = take_newlines(i) { + if old_i != new_i { + i = new_i; + newlines += new_newlines; + dispatch(Event::Newline(Cow::Borrowed(v.as_bstr()))); + } + } + + if let Ok((new_i, new_newlines)) = key_value_pair(i, node, dispatch) { + if old_i != new_i { + i = new_i; + newlines += new_newlines; + } + } + + if let Ok((new_i, comment)) = comment(i) { + if old_i != new_i { + i = new_i; + dispatch(Event::Comment(comment)); + } + } + + if old_i == i { + break; + } + } + + Ok((i, newlines)) +} + +fn section_header(i: &[u8]) -> IResult<&[u8], section::Header<'_>> { + let (i, _) = char('[')(i)?; + // No spaces must be between section name and section start + let (i, name) = take_while(|c: u8| c.is_ascii_alphanumeric() || c == b'-' || c == b'.')(i)?; + + let name = name.as_bstr(); + if let Ok((i, _)) = char::<_, NomError<&[u8]>>(']')(i) { + // Either section does not have a subsection or using deprecated + // subsection syntax at this point. + let header = match memchr::memrchr(b'.', name.as_bytes()) { + Some(index) => section::Header { + name: section::Name(Cow::Borrowed(name[..index].as_bstr())), + separator: name.get(index..=index).map(|s| Cow::Borrowed(s.as_bstr())), + subsection_name: name.get(index + 1..).map(|s| Cow::Borrowed(s.as_bstr())), + }, + None => section::Header { + name: section::Name(Cow::Borrowed(name.as_bstr())), + separator: None, + subsection_name: None, + }, + }; + + if header.name.is_empty() { + return Err(nom::Err::Error(NomError { + input: i, + code: ErrorKind::NoneOf, + })); + } + return Ok((i, header)); + } + + // Section header must be using modern subsection syntax at this point. + + let (i, whitespace) = take_spaces(i)?; + let (i, subsection_name) = delimited(char('"'), opt(sub_section), tag("\"]"))(i)?; + + Ok(( + i, + section::Header { + name: section::Name(Cow::Borrowed(name)), + separator: Some(Cow::Borrowed(whitespace)), + subsection_name, + }, + )) +} + +fn sub_section(i: &[u8]) -> IResult<&[u8], Cow<'_, BStr>> { + let (rest, (found_escape, consumed)) = sub_section_delegate(i, &mut |_| ())?; + if found_escape { + let mut buf = BString::default(); + sub_section_delegate(i, &mut |b| buf.push_byte(b)).map(|(i, _)| (i, buf.into())) + } else { + Ok((rest, i[..consumed].as_bstr().into())) + } +} + +fn sub_section_delegate<'a>(i: &'a [u8], push_byte: &mut dyn FnMut(u8)) -> IResult<&'a [u8], (bool, usize)> { + let mut cursor = 0; + let mut bytes = i.iter().copied(); + let mut found_terminator = false; + let mut found_escape = false; + while let Some(mut b) = bytes.next() { + cursor += 1; + if b == b'\n' || b == 0 { + return Err(nom::Err::Error(NomError { + input: &i[cursor..], + code: ErrorKind::NonEmpty, + })); + } + if b == b'"' { + found_terminator = true; + break; + } + if b == b'\\' { + b = bytes.next().ok_or_else(|| { + nom::Err::Error(NomError { + input: &i[cursor..], + code: ErrorKind::NonEmpty, + }) + })?; + found_escape = true; + cursor += 1; + if b == b'\n' { + return Err(nom::Err::Error(NomError { + input: &i[cursor..], + code: ErrorKind::NonEmpty, + })); + } + } + push_byte(b); + } + + if !found_terminator { + return Err(nom::Err::Error(NomError { + input: &i[cursor..], + code: ErrorKind::NonEmpty, + })); + } + + Ok((&i[cursor - 1..], (found_escape, cursor - 1))) +} + +fn key_value_pair<'a>( + i: &'a [u8], + node: &mut ParseNode, + dispatch: &mut impl FnMut(Event<'a>), +) -> IResult<&'a [u8], usize> { + *node = ParseNode::Name; + let (i, name) = config_name(i)?; + + dispatch(Event::SectionKey(section::Key(Cow::Borrowed(name)))); + + let (i, whitespace) = opt(take_spaces)(i)?; + if let Some(whitespace) = whitespace { + dispatch(Event::Whitespace(Cow::Borrowed(whitespace))); + } + + *node = ParseNode::Value; + let (i, newlines) = config_value(i, dispatch)?; + Ok((i, newlines)) +} + +/// Parses the config name of a config pair. Assumes the input has already been +/// trimmed of any leading whitespace. +fn config_name(i: &[u8]) -> IResult<&[u8], &BStr> { + if i.is_empty() { + return Err(nom::Err::Error(NomError { + input: i, + code: ErrorKind::NonEmpty, + })); + } + + if !i[0].is_ascii_alphabetic() { + return Err(nom::Err::Error(NomError { + input: i, + code: ErrorKind::Alpha, + })); + } + + let (i, name) = take_while(|c: u8| c.is_ascii_alphanumeric() || c == b'-')(i)?; + Ok((i, name.as_bstr())) +} + +fn config_value<'a>(i: &'a [u8], dispatch: &mut impl FnMut(Event<'a>)) -> IResult<&'a [u8], usize> { + if let (i, Some(_)) = opt(char('='))(i)? { + dispatch(Event::KeyValueSeparator); + let (i, whitespace) = opt(take_spaces)(i)?; + if let Some(whitespace) = whitespace { + dispatch(Event::Whitespace(Cow::Borrowed(whitespace))); + } + let (i, newlines) = value_impl(i, dispatch)?; + Ok((i, newlines)) + } else { + // This is a special way of denoting 'empty' values which a lot of code depends on. + // Hence, rather to fix this everywhere else, leave it here and fix it where it matters, namely + // when it's about differentiating between a missing key-value separator, and one followed by emptiness. + dispatch(Event::Value(Cow::Borrowed("".into()))); + Ok((i, 0)) + } +} + +/// Handles parsing of known-to-be values. This function handles both single +/// line values as well as values that are continuations. +fn value_impl<'a>(i: &'a [u8], dispatch: &mut impl FnMut(Event<'a>)) -> IResult<&'a [u8], usize> { + let (i, value_end, newlines, mut dispatch) = { + let new_err = |code| nom::Err::Error(NomError { input: i, code }); + let mut value_end = None::<usize>; + let mut value_start: usize = 0; + let mut newlines = 0; + + let mut prev_char_was_backslash = false; + // This is required to ignore comment markers if they're in a quote. + let mut is_in_quotes = false; + // Used to determine if we return a Value or Value{Not,}Done + let mut partial_value_found = false; + let mut last_value_index: usize = 0; + + let mut bytes = i.iter(); + while let Some(mut c) = bytes.next() { + if prev_char_was_backslash { + prev_char_was_backslash = false; + let mut consumed = 1; + if *c == b'\r' { + c = bytes.next().ok_or_else(|| new_err(ErrorKind::Escaped))?; + if *c != b'\n' { + return Err(new_err(ErrorKind::Tag)); + } + consumed += 1; + } + + match c { + b'\n' => { + partial_value_found = true; + let backslash = 1; + dispatch(Event::ValueNotDone(Cow::Borrowed( + i[value_start..last_value_index - backslash].as_bstr(), + ))); + let nl_end = last_value_index + consumed; + dispatch(Event::Newline(Cow::Borrowed(i[last_value_index..nl_end].as_bstr()))); + value_start = nl_end; + value_end = None; + newlines += 1; + + last_value_index += consumed; + } + b'n' | b't' | b'\\' | b'b' | b'"' => { + last_value_index += 1; + } + _ => { + return Err(new_err(ErrorKind::Escaped)); + } + } + } else { + match c { + b'\n' => { + value_end = last_value_index.into(); + break; + } + b';' | b'#' if !is_in_quotes => { + value_end = last_value_index.into(); + break; + } + b'\\' => prev_char_was_backslash = true, + b'"' => is_in_quotes = !is_in_quotes, + _ => {} + } + last_value_index += 1; + } + } + + if prev_char_was_backslash { + return Err(new_err(ErrorKind::Escaped)); + } + + if is_in_quotes { + return Err(new_err(ErrorKind::Tag)); + } + + let value_end = match value_end { + None => { + if last_value_index == 0 { + dispatch(Event::Value(Cow::Borrowed("".into()))); + return Ok((&i[0..], newlines)); + } else { + i.len() + } + } + Some(idx) => idx, + }; + + let dispatch = move |value: &'a [u8]| { + if partial_value_found { + dispatch(Event::ValueDone(Cow::Borrowed(value.as_bstr()))); + } else { + dispatch(Event::Value(Cow::Borrowed(value.as_bstr()))); + } + }; + (&i[value_start..], value_end - value_start, newlines, dispatch) + }; + + let (i, remainder_value) = { + let value_end_no_trailing_whitespace = i[..value_end] + .iter() + .enumerate() + .rev() + .find_map(|(idx, b)| (!b.is_ascii_whitespace()).then_some(idx + 1)) + .unwrap_or(0); + ( + &i[value_end_no_trailing_whitespace..], + &i[..value_end_no_trailing_whitespace], + ) + }; + + dispatch(remainder_value); + + Ok((i, newlines)) +} + +fn take_spaces(i: &[u8]) -> IResult<&[u8], &BStr> { + let (i, v) = take_while(|c: u8| c.is_ascii() && is_space(c))(i)?; + if v.is_empty() { + Err(nom::Err::Error(NomError { + input: i, + code: ErrorKind::Eof, + })) + } else { + Ok((i, v.as_bstr())) + } +} + +fn take_newlines(i: &[u8]) -> IResult<&[u8], (&BStr, usize)> { + let mut counter = 0; + let mut consumed_bytes = 0; + let mut next_must_be_newline = false; + for b in i.iter().copied() { + if !b.is_ascii() { + break; + }; + if b == b'\r' { + if next_must_be_newline { + break; + } + next_must_be_newline = true; + continue; + }; + if b == b'\n' { + counter += 1; + consumed_bytes += if next_must_be_newline { 2 } else { 1 }; + next_must_be_newline = false; + } else { + break; + } + } + let (v, i) = i.split_at(consumed_bytes); + if v.is_empty() { + Err(nom::Err::Error(NomError { + input: i, + code: ErrorKind::Eof, + })) + } else { + Ok((i, (v.as_bstr(), counter))) + } +} |