summaryrefslogtreecommitdiffstats
path: root/vendor/gix-config/src/parse/nom/mod.rs
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/gix-config/src/parse/nom/mod.rs')
-rw-r--r--vendor/gix-config/src/parse/nom/mod.rs460
1 files changed, 460 insertions, 0 deletions
diff --git a/vendor/gix-config/src/parse/nom/mod.rs b/vendor/gix-config/src/parse/nom/mod.rs
new file mode 100644
index 000000000..11d1dea6b
--- /dev/null
+++ b/vendor/gix-config/src/parse/nom/mod.rs
@@ -0,0 +1,460 @@
+use std::borrow::Cow;
+
+use bstr::{BStr, BString, ByteSlice, ByteVec};
+use nom::{
+ branch::alt,
+ bytes::complete::{tag, take_till, take_while},
+ character::{
+ complete::{char, one_of},
+ is_space,
+ },
+ combinator::{map, opt},
+ error::{Error as NomError, ErrorKind},
+ multi::{fold_many0, fold_many1},
+ sequence::delimited,
+ IResult,
+};
+
+use crate::parse::{error::ParseNode, section, Comment, Error, Event};
+
+/// Parse `input` without copying, handing each [`Event`] to `dispatch` as soon as it is recognized.
+///
+/// A leading unicode byte-order mark is skipped. Comments, whitespace and newlines that precede
+/// the first section header are dispatched first, followed by the events of every section.
+///
+/// # Errors
+///
+/// If input remains after the leading comments/whitespace/newlines and it cannot be parsed
+/// completely as a sequence of sections, an [`Error`] is returned carrying the number of newlines
+/// counted so far, the parser that was attempted last and the input that was left over.
+pub fn from_bytes<'a>(input: &'a [u8], mut dispatch: impl FnMut(Event<'a>)) -> Result<(), Error> {
+    let bom = unicode_bom::Bom::from(input);
+    let mut line_count = 0;
+
+    let (after_frontmatter, _) = fold_many0(
+        alt((
+            map(comment, Event::Comment),
+            map(take_spaces, |spaces| Event::Whitespace(Cow::Borrowed(spaces))),
+            map(take_newlines, |(newline, count)| {
+                line_count += count;
+                Event::Newline(Cow::Borrowed(newline))
+            }),
+        )),
+        || (),
+        |_, event| dispatch(event),
+    )(&input[bom.len()..])
+    // This is not expected to panic: many0 only errors if the child parser
+    // succeeds without consuming input, and alt only succeeds if one of its
+    // children does. All of its children consume something whenever they
+    // succeed, so a success that leaves the input untouched cannot happen.
+    .expect("many0(alt(...)) panicked. Likely a bug in one of the children parsers.");
+
+    if after_frontmatter.is_empty() {
+        return Ok(());
+    }
+
+    let mut node = ParseNode::SectionHeader;
+
+    let sections = fold_many1(
+        |bytes| section(bytes, &mut node, &mut dispatch),
+        || (),
+        |_, section_newlines| line_count += section_newlines,
+    )(after_frontmatter);
+
+    // The error is only assembled once all sections were attempted, as the
+    // line number would be off otherwise.
+    let unparsed = match sections {
+        Ok((leftover, _)) if leftover.is_empty() => return Ok(()),
+        Ok((leftover, _)) => leftover,
+        // Not even a single section could be parsed: report the input the sections started at.
+        Err(_) => after_frontmatter,
+    };
+
+    Err(Error {
+        line_number: line_count,
+        last_attempted_parser: node,
+        parsed_until: unparsed.as_bstr().into(),
+    })
+}
+
+/// Parses a comment: a `;` or `#` marker followed by everything up to, but excluding, the next newline.
+fn comment(i: &[u8]) -> IResult<&[u8], Comment<'_>> {
+    let (after_marker, marker) = one_of(";#")(i)?;
+    let (rest, text) = take_till(|byte| byte == b'\n')(after_marker)?;
+    let parsed = Comment {
+        tag: marker as u8,
+        text: Cow::Borrowed(text.as_bstr()),
+    };
+    Ok((rest, parsed))
+}
+
+#[cfg(test)]
+mod tests;
+
+/// Parses one section: its header followed by any mix of whitespace, newlines, key-value pairs
+/// and comments.
+///
+/// Events are passed to `dispatch` as they are recognized and `node` is handed to the key-value
+/// parser so it can record what it attempted last. On success, the remaining input is returned
+/// along with the number of newlines counted in the section body.
+fn section<'a>(i: &'a [u8], node: &mut ParseNode, dispatch: &mut impl FnMut(Event<'a>)) -> IResult<&'a [u8], usize> {
+    let (mut rest, header) = section_header(i)?;
+    dispatch(Event::SectionHeader(header));
+
+    let mut line_count = 0;
+
+    // This would usually be a many0(alt(...)); the manual loop allows us to
+    // optimize vec insertions. Each round tries every parser once, in order,
+    // and the loop ends as soon as a whole round consumed nothing.
+    loop {
+        let round_start = rest;
+
+        match take_spaces(rest) {
+            Ok((after, spaces)) if after != round_start => {
+                rest = after;
+                dispatch(Event::Whitespace(Cow::Borrowed(spaces.as_bstr())));
+            }
+            _ => {}
+        }
+
+        match take_newlines(rest) {
+            Ok((after, (newline, count))) if after != round_start => {
+                rest = after;
+                line_count += count;
+                dispatch(Event::Newline(Cow::Borrowed(newline.as_bstr())));
+            }
+            _ => {}
+        }
+
+        match key_value_pair(rest, node, dispatch) {
+            Ok((after, count)) if after != round_start => {
+                rest = after;
+                line_count += count;
+            }
+            _ => {}
+        }
+
+        match comment(rest) {
+            Ok((after, found)) if after != round_start => {
+                rest = after;
+                dispatch(Event::Comment(found));
+            }
+            _ => {}
+        }
+
+        if rest == round_start {
+            break;
+        }
+    }
+
+    Ok((rest, line_count))
+}
+
+/// Parses a section header in either of its two forms.
+///
+/// * `[name]` or the deprecated `[name.subsection]`, where the last `.` splits name and subsection.
+/// * `[name "subsection"]`, where the whitespace in front of the quote becomes the separator.
+///
+/// In the first form an empty section name is rejected with `ErrorKind::NoneOf`.
+fn section_header(i: &[u8]) -> IResult<&[u8], section::Header<'_>> {
+    let (after_open, _) = char('[')(i)?;
+    // No spaces must be between section name and section start
+    let (after_name, raw_name) =
+        take_while(|c: u8| c.is_ascii_alphanumeric() || c == b'-' || c == b'.')(after_open)?;
+    let name = raw_name.as_bstr();
+
+    let rest = match char::<_, NomError<&[u8]>>(']')(after_name) {
+        Ok((rest, _)) => rest,
+        Err(_) => {
+            // Section header must be using modern subsection syntax at this point.
+            let (rest, whitespace) = take_spaces(after_name)?;
+            let (rest, subsection_name) = delimited(char('"'), opt(sub_section), tag("\"]"))(rest)?;
+            return Ok((
+                rest,
+                section::Header {
+                    name: section::Name(Cow::Borrowed(name)),
+                    separator: Some(Cow::Borrowed(whitespace)),
+                    subsection_name,
+                },
+            ));
+        }
+    };
+
+    // Either the section does not have a subsection or it is using the
+    // deprecated subsection syntax at this point.
+    let (section_name, separator, subsection_name) = match memchr::memrchr(b'.', name.as_bytes()) {
+        Some(dot) => (
+            name[..dot].as_bstr(),
+            name.get(dot..=dot).map(|s| Cow::Borrowed(s.as_bstr())),
+            name.get(dot + 1..).map(|s| Cow::Borrowed(s.as_bstr())),
+        ),
+        None => (name.as_bstr(), None, None),
+    };
+    let header = section::Header {
+        name: section::Name(Cow::Borrowed(section_name)),
+        separator,
+        subsection_name,
+    };
+
+    if header.name.is_empty() {
+        return Err(nom::Err::Error(NomError {
+            input: rest,
+            code: ErrorKind::NoneOf,
+        }));
+    }
+    Ok((rest, header))
+}
+
+/// Parses the text of a quoted subsection name, stopping in front of the closing quote.
+///
+/// The name is borrowed from the input unless it contains backslash escapes, in which case a
+/// second pass collects the unescaped bytes into an owned buffer.
+fn sub_section(i: &[u8]) -> IResult<&[u8], Cow<'_, BStr>> {
+    // First pass: only find out where the name ends and whether anything is escaped.
+    let (rest, (found_escape, consumed)) = sub_section_delegate(i, &mut |_| ())?;
+    if !found_escape {
+        return Ok((rest, Cow::Borrowed(i[..consumed].as_bstr())));
+    }
+
+    let mut unescaped = BString::default();
+    let (rest, _) = sub_section_delegate(i, &mut |byte| unescaped.push_byte(byte))?;
+    Ok((rest, Cow::Owned(unescaped)))
+}
+
+/// Scans a subsection name up to its closing `"`, feeding every byte of the name to `push_byte`.
+///
+/// A backslash makes the byte after it part of the name, with only that byte being pushed. On
+/// success the returned input starts at the closing quote, and the tuple holds whether an escape
+/// was seen and how many input bytes precede the quote.
+///
+/// Fails with `ErrorKind::NonEmpty` on a newline (escaped or not), an unescaped NUL byte, a
+/// backslash at the end of the input, or if the closing quote is missing.
+fn sub_section_delegate<'a>(i: &'a [u8], push_byte: &mut dyn FnMut(u8)) -> IResult<&'a [u8], (bool, usize)> {
+    /// Builds the one error this parser produces, pointing at `rest`.
+    fn invalid(rest: &[u8]) -> nom::Err<NomError<&[u8]>> {
+        nom::Err::Error(NomError {
+            input: rest,
+            code: ErrorKind::NonEmpty,
+        })
+    }
+
+    // Always the offset just past the byte that was read last.
+    let mut pos = 0;
+    let mut saw_escape = false;
+    loop {
+        let byte = match i.get(pos) {
+            Some(byte) => *byte,
+            // Ran out of input without seeing the closing quote.
+            None => return Err(invalid(&i[pos..])),
+        };
+        pos += 1;
+
+        match byte {
+            b'\n' | 0 => return Err(invalid(&i[pos..])),
+            b'"' => break,
+            b'\\' => {
+                let escaped = match i.get(pos) {
+                    Some(escaped) => *escaped,
+                    None => return Err(invalid(&i[pos..])),
+                };
+                saw_escape = true;
+                pos += 1;
+                if escaped == b'\n' {
+                    return Err(invalid(&i[pos..]));
+                }
+                push_byte(escaped);
+            }
+            plain => push_byte(plain),
+        }
+    }
+
+    // `pos` is one past the closing quote, which itself must remain in the input.
+    let name_len = pos - 1;
+    Ok((&i[name_len..], (saw_escape, name_len)))
+}
+
+/// Parses a key, optional whitespace after it, and the value that may follow.
+///
+/// `node` is set to `ParseNode::Name` before the key is parsed and to `ParseNode::Value` before
+/// the value is parsed, so it reflects the step that was attempted last if parsing fails.
+/// Returns the remaining input and the number of newlines reported by the value parser.
+fn key_value_pair<'a>(
+    i: &'a [u8],
+    node: &mut ParseNode,
+    dispatch: &mut impl FnMut(Event<'a>),
+) -> IResult<&'a [u8], usize> {
+    *node = ParseNode::Name;
+    let (after_key, key) = config_name(i)?;
+    dispatch(Event::SectionKey(section::Key(Cow::Borrowed(key))));
+
+    let (after_spaces, spaces) = opt(take_spaces)(after_key)?;
+    if let Some(spaces) = spaces {
+        dispatch(Event::Whitespace(Cow::Borrowed(spaces)));
+    }
+
+    *node = ParseNode::Value;
+    config_value(after_spaces, dispatch)
+}
+
+/// Parses the name (key) of a key-value pair, expecting leading whitespace to be trimmed already.
+///
+/// The name must start with an ASCII letter and continues for as long as ASCII alphanumerics or
+/// `-` follow. Empty input fails with `ErrorKind::NonEmpty`, any other first byte with
+/// `ErrorKind::Alpha`.
+fn config_name(i: &[u8]) -> IResult<&[u8], &BStr> {
+    let rejection = match i.first() {
+        None => Some(ErrorKind::NonEmpty),
+        Some(first) if !first.is_ascii_alphabetic() => Some(ErrorKind::Alpha),
+        Some(_) => None,
+    };
+    if let Some(code) = rejection {
+        return Err(nom::Err::Error(NomError { input: i, code }));
+    }
+
+    let (rest, name) = take_while(|byte: u8| byte.is_ascii_alphanumeric() || byte == b'-')(i)?;
+    Ok((rest, name.as_bstr()))
+}
+
+/// Parses what follows a key: either `=`, optional whitespace and a value, or nothing at all.
+///
+/// Without a `=`, an empty `Value` event is dispatched and no input is consumed. Returns the
+/// remaining input and the number of newlines reported by the value parser.
+fn config_value<'a>(i: &'a [u8], dispatch: &mut impl FnMut(Event<'a>)) -> IResult<&'a [u8], usize> {
+    let (after_separator, separator) = opt(char('='))(i)?;
+    if separator.is_none() {
+        // This is a special way of denoting 'empty' values which a lot of code depends on.
+        // Hence, rather than fixing this everywhere else, it is left here and fixed where it
+        // matters, namely when differentiating between a missing key-value separator and one
+        // followed by emptiness.
+        dispatch(Event::Value(Cow::Borrowed("".into())));
+        return Ok((i, 0));
+    }
+
+    dispatch(Event::KeyValueSeparator);
+    let (after_spaces, spaces) = opt(take_spaces)(after_separator)?;
+    if let Some(spaces) = spaces {
+        dispatch(Event::Whitespace(Cow::Borrowed(spaces)));
+    }
+    value_impl(after_spaces, dispatch)
+}
+
+/// Parses a value, dispatching `Value` if it fits on a single line, or `ValueNotDone` plus `Newline` for each backslash-continued line followed by a final `ValueDone`; trailing whitespace is stripped from the dispatched value.
+/// Returns the input left after the value (starting at its trailing whitespace, if any) and the number of continuation newlines; fails with `Escaped` or `Tag`, pointing at the start of `i`, on invalid escape sequences or unbalanced quotes.
+fn value_impl<'a>(i: &'a [u8], dispatch: &mut impl FnMut(Event<'a>)) -> IResult<&'a [u8], usize> {
+ let (i, value_end, newlines, mut dispatch) = {
+ let new_err = |code| nom::Err::Error(NomError { input: i, code }); // every error reports the start of the value as its input
+ let mut value_end = None::<usize>; // offset of the newline or comment marker that terminates the value, once seen
+ let mut value_start: usize = 0; // offset at which the current (possibly continued) line of the value starts
+ let mut newlines = 0; // number of line continuations consumed
+
+ let mut prev_char_was_backslash = false;
+ // Needed so that comment markers inside of quotes do not terminate the value.
+ let mut is_in_quotes = false;
+ // Decides whether the final event is a `Value` (no continuation seen) or a `ValueDone`.
+ let mut partial_value_found = false;
+ let mut last_value_index: usize = 0; // offset into `i` of the next byte that was not yet accounted for
+
+ let mut bytes = i.iter();
+ while let Some(mut c) = bytes.next() {
+ if prev_char_was_backslash {
+ prev_char_was_backslash = false;
+ let mut consumed = 1; // length of the escaped newline sequence: 1 for `\n`, 2 for `\r\n`
+ if *c == b'\r' {
+ c = bytes.next().ok_or_else(|| new_err(ErrorKind::Escaped))?;
+ if *c != b'\n' {
+ return Err(new_err(ErrorKind::Tag));
+ }
+ consumed += 1;
+ }
+
+ match c {
+ b'\n' => { // line continuation: emit what was collected so far, then the newline itself
+ partial_value_found = true;
+ let backslash = 1;
+ dispatch(Event::ValueNotDone(Cow::Borrowed(
+ i[value_start..last_value_index - backslash].as_bstr(), // excludes the backslash in front of the newline
+ )));
+ let nl_end = last_value_index + consumed;
+ dispatch(Event::Newline(Cow::Borrowed(i[last_value_index..nl_end].as_bstr())));
+ value_start = nl_end; // the next part of the value starts right after the newline
+ value_end = None;
+ newlines += 1;
+
+ last_value_index += consumed;
+ }
+ b'n' | b't' | b'\\' | b'b' | b'"' => { // recognized escapes remain in the value verbatim
+ last_value_index += 1;
+ }
+ _ => {
+ return Err(new_err(ErrorKind::Escaped));
+ }
+ }
+ } else {
+ match c {
+ b'\n' => {
+ value_end = last_value_index.into(); // the value ends in front of the newline
+ break;
+ }
+ b';' | b'#' if !is_in_quotes => {
+ value_end = last_value_index.into(); // the value ends in front of the comment marker
+ break;
+ }
+ b'\\' => prev_char_was_backslash = true,
+ b'"' => is_in_quotes = !is_in_quotes,
+ _ => {}
+ }
+ last_value_index += 1;
+ }
+ }
+
+ if prev_char_was_backslash { // the input ended right after a backslash
+ return Err(new_err(ErrorKind::Escaped));
+ }
+
+ if is_in_quotes { // a quote was opened but never closed
+ return Err(new_err(ErrorKind::Tag));
+ }
+
+ let value_end = match value_end {
+ None => { // the input ended without a terminating newline or comment marker
+ if last_value_index == 0 {
+ dispatch(Event::Value(Cow::Borrowed("".into())));
+ return Ok((&i[0..], newlines));
+ } else {
+ i.len()
+ }
+ }
+ Some(idx) => idx,
+ };
+
+ let dispatch = move |value: &'a [u8]| {
+ if partial_value_found {
+ dispatch(Event::ValueDone(Cow::Borrowed(value.as_bstr())));
+ } else {
+ dispatch(Event::Value(Cow::Borrowed(value.as_bstr())));
+ }
+ };
+ (&i[value_start..], value_end - value_start, newlines, dispatch) // from here on offsets are relative to the last line of the value
+ };
+
+ let (i, remainder_value) = {
+ let value_end_no_trailing_whitespace = i[..value_end]
+ .iter()
+ .enumerate()
+ .rev()
+ .find_map(|(idx, b)| (!b.is_ascii_whitespace()).then_some(idx + 1)) // one past the last non-whitespace byte
+ .unwrap_or(0); // only whitespace (or nothing) left: the remaining value is empty
+ (
+ &i[value_end_no_trailing_whitespace..],
+ &i[..value_end_no_trailing_whitespace],
+ )
+ };
+
+ dispatch(remainder_value);
+
+ Ok((i, newlines))
+}
+
+/// Consumes one or more ASCII bytes for which nom's `is_space` holds, failing with
+/// `ErrorKind::Eof` if the input does not start with any.
+fn take_spaces(i: &[u8]) -> IResult<&[u8], &BStr> {
+    let (rest, spaces) = take_while(|byte: u8| byte.is_ascii() && is_space(byte))(i)?;
+    if !spaces.is_empty() {
+        return Ok((rest, spaces.as_bstr()));
+    }
+    Err(nom::Err::Error(NomError {
+        input: rest,
+        code: ErrorKind::Eof,
+    }))
+}
+
+/// Consumes a run of line endings, each being either `\n` or `\r\n`.
+///
+/// Returns the consumed bytes along with the number of line endings they contain. A `\r` that is
+/// not directly followed by `\n` ends the run and is not consumed. Fails with `ErrorKind::Eof`
+/// if the input does not start with a line ending.
+fn take_newlines(i: &[u8]) -> IResult<&[u8], (&BStr, usize)> {
+    let mut line_count = 0;
+    let mut consumed = 0;
+    // Set after a `\r`, which only counts if the very next byte is `\n`.
+    let mut pending_cr = false;
+    for byte in i.iter().copied() {
+        match byte {
+            b'\r' if !pending_cr => pending_cr = true,
+            b'\n' => {
+                line_count += 1;
+                consumed += if pending_cr { 2 } else { 1 };
+                pending_cr = false;
+            }
+            // Anything else, including a second `\r` in a row, ends the run.
+            _ => break,
+        }
+    }
+
+    let (line_endings, rest) = i.split_at(consumed);
+    if line_endings.is_empty() {
+        return Err(nom::Err::Error(NomError {
+            input: rest,
+            code: ErrorKind::Eof,
+        }));
+    }
+    Ok((rest, (line_endings.as_bstr(), line_count)))
+}