diff options
Diffstat (limited to '')
-rw-r--r-- | third_party/rust/nom/doc/nom_recipes.md | 395 |
1 files changed, 395 insertions, 0 deletions
diff --git a/third_party/rust/nom/doc/nom_recipes.md b/third_party/rust/nom/doc/nom_recipes.md new file mode 100644 index 0000000000..e8626344a7 --- /dev/null +++ b/third_party/rust/nom/doc/nom_recipes.md @@ -0,0 +1,395 @@ +# Nom Recipes + +These are short recipes for accomplishing common tasks with nom. + +* [Whitespace](#whitespace) + + [Wrapper combinators that eat whitespace before and after a parser](#wrapper-combinators-that-eat-whitespace-before-and-after-a-parser) +* [Comments](#comments) + + [`// C++/EOL-style comments`](#-ceol-style-comments) + + [`/* C-style comments */`](#-c-style-comments-) +* [Identifiers](#identifiers) + + [`Rust-Style Identifiers`](#rust-style-identifiers) +* [Literal Values](#literal-values) + + [Escaped Strings](#escaped-strings) + + [Integers](#integers) + - [Hexadecimal](#hexadecimal) + - [Octal](#octal) + - [Binary](#binary) + - [Decimal](#decimal) + + [Floating Point Numbers](#floating-point-numbers) + +## Whitespace + + + +### Wrapper combinators that eat whitespace before and after a parser + +```rust +use nom::{ + IResult, + error::ParseError, + combinator::value, + sequence::delimited, + character::complete::multispace0, +}; + +/// A combinator that takes a parser `inner` and produces a parser that also consumes both leading and +/// trailing whitespace, returning the output of `inner`. +fn ws<'a, F: 'a, O, E: ParseError<&'a str>>(inner: F) -> impl FnMut(&'a str) -> IResult<&'a str, O, E> + where + F: Fn(&'a str) -> IResult<&'a str, O, E>, +{ + delimited( + multispace0, + inner, + multispace0 + ) +} +``` + +To eat only trailing whitespace, replace `delimited(...)` with `terminated(&inner, multispace0)`. +Likewise, the eat only leading whitespace, replace `delimited(...)` with `preceded(multispace0, +&inner)`. You can use your own parser instead of `multispace0` if you want to skip a different set +of lexemes. + +## Comments + +### `// C++/EOL-style comments` + +This version uses `%` to start a comment, does not consume the newline character, and returns an +output of `()`. + +```rust +use nom::{ + IResult, + error::ParseError, + combinator::value, + sequence::pair, + bytes::complete::is_not, + character::complete::char, +}; + +pub fn peol_comment<'a, E: ParseError<&'a str>>(i: &'a str) -> IResult<&'a str, (), E> +{ + value( + (), // Output is thrown away. + pair(char('%'), is_not("\n\r")) + )(i) +} +``` + +### `/* C-style comments */` + +Inline comments surrounded with sentinel tags `(*` and `*)`. This version returns an output of `()` +and does not handle nested comments. + +```rust +use nom::{ + IResult, + error::ParseError, + combinator::value, + sequence::tuple, + bytes::complete::{tag, take_until}, +}; + +pub fn pinline_comment<'a, E: ParseError<&'a str>>(i: &'a str) -> IResult<&'a str, (), E> { + value( + (), // Output is thrown away. + tuple(( + tag("(*"), + take_until("*)"), + tag("*)") + )) + )(i) +} +``` + +## Identifiers + +### `Rust-Style Identifiers` + +Parsing identifiers that may start with a letter (or underscore) and may contain underscores, +letters and numbers may be parsed like this: + +```rust +use nom::{ + IResult, + branch::alt, + multi::many0_count, + combinator::recognize, + sequence::pair, + character::complete::{alpha1, alphanumeric1}, + bytes::complete::tag, +}; + +pub fn identifier(input: &str) -> IResult<&str, &str> { + recognize( + pair( + alt((alpha1, tag("_"))), + many0_count(alt((alphanumeric1, tag("_")))) + ) + )(input) +} +``` + +Let's say we apply this to the identifier `hello_world123abc`. The first `alt` parser would +recognize `h`. The `pair` combinator ensures that `ello_world123abc` will be piped to the next +`alphanumeric0` parser, which recognizes every remaining character. However, the `pair` combinator +returns a tuple of the results of its sub-parsers. The `recognize` parser produces a `&str` of the +input text that was parsed, which in this case is the entire `&str` `hello_world123abc`. + +## Literal Values + +### Escaped Strings + +This is [one of the examples](https://github.com/Geal/nom/blob/main/examples/string.rs) in the +examples directory. + +### Integers + +The following recipes all return string slices rather than integer values. How to obtain an +integer value instead is demonstrated for hexadecimal integers. The others are similar. + +The parsers allow the grouping character `_`, which allows one to group the digits by byte, for +example: `0xA4_3F_11_28`. If you prefer to exclude the `_` character, the lambda to convert from a +string slice to an integer value is slightly simpler. You can also strip the `_` from the string +slice that is returned, which is demonstrated in the second hexdecimal number parser. + +If you wish to limit the number of digits in a valid integer literal, replace `many1` with +`many_m_n` in the recipes. + +#### Hexadecimal + +The parser outputs the string slice of the digits without the leading `0x`/`0X`. + +```rust +use nom::{ + IResult, + branch::alt, + multi::{many0, many1}, + combinator::recognize, + sequence::{preceded, terminated}, + character::complete::{char, one_of}, + bytes::complete::tag, +}; + +fn hexadecimal(input: &str) -> IResult<&str, &str> { // <'a, E: ParseError<&'a str>> + preceded( + alt((tag("0x"), tag("0X"))), + recognize( + many1( + terminated(one_of("0123456789abcdefABCDEF"), many0(char('_'))) + ) + ) + )(input) +} +``` + +If you want it to return the integer value instead, use map: + +```rust +use nom::{ + IResult, + branch::alt, + multi::{many0, many1}, + combinator::{map_res, recognize}, + sequence::{preceded, terminated}, + character::complete::{char, one_of}, + bytes::complete::tag, +}; + +fn hexadecimal_value(input: &str) -> IResult<&str, i64> { + map_res( + preceded( + alt((tag("0x"), tag("0X"))), + recognize( + many1( + terminated(one_of("0123456789abcdefABCDEF"), many0(char('_'))) + ) + ) + ), + |out: &str| i64::from_str_radix(&str::replace(&out, "_", ""), 16) + )(input) +} +``` + +#### Octal + +```rust +use nom::{ + IResult, + branch::alt, + multi::{many0, many1}, + combinator::recognize, + sequence::{preceded, terminated}, + character::complete::{char, one_of}, + bytes::complete::tag, +}; + +fn octal(input: &str) -> IResult<&str, &str> { + preceded( + alt((tag("0o"), tag("0O"))), + recognize( + many1( + terminated(one_of("01234567"), many0(char('_'))) + ) + ) + )(input) +} +``` + +#### Binary + +```rust +use nom::{ + IResult, + branch::alt, + multi::{many0, many1}, + combinator::recognize, + sequence::{preceded, terminated}, + character::complete::{char, one_of}, + bytes::complete::tag, +}; + +fn binary(input: &str) -> IResult<&str, &str> { + preceded( + alt((tag("0b"), tag("0B"))), + recognize( + many1( + terminated(one_of("01"), many0(char('_'))) + ) + ) + )(input) +} +``` + +#### Decimal + +```rust +use nom::{ + IResult, + multi::{many0, many1}, + combinator::recognize, + sequence::terminated, + character::complete::{char, one_of}, +}; + +fn decimal(input: &str) -> IResult<&str, &str> { + recognize( + many1( + terminated(one_of("0123456789"), many0(char('_'))) + ) + )(input) +} +``` + +### Floating Point Numbers + +The following is adapted from [the Python parser by Valentin Lorentz (ProgVal)](https://github.com/ProgVal/rust-python-parser/blob/master/src/numbers.rs). + +```rust +use nom::{ + IResult, + branch::alt, + multi::{many0, many1}, + combinator::{opt, recognize}, + sequence::{preceded, terminated, tuple}, + character::complete::{char, one_of}, +}; + +fn float(input: &str) -> IResult<&str, &str> { + alt(( + // Case one: .42 + recognize( + tuple(( + char('.'), + decimal, + opt(tuple(( + one_of("eE"), + opt(one_of("+-")), + decimal + ))) + )) + ) + , // Case two: 42e42 and 42.42e42 + recognize( + tuple(( + decimal, + opt(preceded( + char('.'), + decimal, + )), + one_of("eE"), + opt(one_of("+-")), + decimal + )) + ) + , // Case three: 42. and 42.42 + recognize( + tuple(( + decimal, + char('.'), + opt(decimal) + )) + ) + ))(input) +} + +fn decimal(input: &str) -> IResult<&str, &str> { + recognize( + many1( + terminated(one_of("0123456789"), many0(char('_'))) + ) + )(input) +} +``` + +# implementing FromStr + +The [FromStr trait](https://doc.rust-lang.org/std/str/trait.FromStr.html) provides +a common interface to parse from a string. + +```rust +use nom::{ + IResult, Finish, error::Error, + bytes::complete::{tag, take_while}, +}; +use std::str::FromStr; + +// will recognize the name in "Hello, name!" +fn parse_name(input: &str) -> IResult<&str, &str> { + let (i, _) = tag("Hello, ")(input)?; + let (i, name) = take_while(|c:char| c.is_alphabetic())(i)?; + let (i, _) = tag("!")(i)?; + + Ok((i, name)) +} + +// with FromStr, the result cannot be a reference to the input, it must be owned +#[derive(Debug)] +pub struct Name(pub String); + +impl FromStr for Name { + // the error must be owned as well + type Err = Error<String>; + + fn from_str(s: &str) -> Result<Self, Self::Err> { + match parse_name(s).finish() { + Ok((_remaining, name)) => Ok(Name(name.to_string())), + Err(Error { input, code }) => Err(Error { + input: input.to_string(), + code, + }) + } + } +} + +fn main() { + // parsed: Ok(Name("nom")) + println!("parsed: {:?}", "Hello, nom!".parse::<Name>()); + + // parsed: Err(Error { input: "123!", code: Tag }) + println!("parsed: {:?}", "Hello, 123!".parse::<Name>()); +} +``` + |