diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-17 12:02:58 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-17 12:02:58 +0000 |
commit | 698f8c2f01ea549d77d7dc3338a12e04c11057b9 (patch) | |
tree | 173a775858bd501c378080a10dca74132f05bc50 /vendor/rowan/examples/s_expressions.rs | |
parent | Initial commit. (diff) | |
download | rustc-698f8c2f01ea549d77d7dc3338a12e04c11057b9.tar.xz rustc-698f8c2f01ea549d77d7dc3338a12e04c11057b9.zip |
Adding upstream version 1.64.0+dfsg1.upstream/1.64.0+dfsg1
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'vendor/rowan/examples/s_expressions.rs')
-rw-r--r-- | vendor/rowan/examples/s_expressions.rs | 425 |
1 files changed, 425 insertions, 0 deletions
diff --git a/vendor/rowan/examples/s_expressions.rs b/vendor/rowan/examples/s_expressions.rs new file mode 100644 index 000000000..e7df4c644 --- /dev/null +++ b/vendor/rowan/examples/s_expressions.rs @@ -0,0 +1,425 @@ +//! In this tutorial, we will write parser +//! and evaluator of arithmetic S-expressions, +//! which look like this: +//! ``` +//! (+ (* 15 2) 62) +//! ``` +//! +//! It's suggested to read the conceptual overview of the design +//! alongside this tutorial: +//! https://github.com/rust-analyzer/rust-analyzer/blob/master/docs/dev/syntax.md + +/// Let's start with defining all kinds of tokens and +/// composite nodes. +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +#[allow(non_camel_case_types)] +#[repr(u16)] +enum SyntaxKind { + L_PAREN = 0, // '(' + R_PAREN, // ')' + WORD, // '+', '15' + WHITESPACE, // whitespaces is explicit + ERROR, // as well as errors + + // composite nodes + LIST, // `(+ 2 3)` + ATOM, // `+`, `15`, wraps a WORD token + ROOT, // top-level node: a list of s-expressions +} +use SyntaxKind::*; + +/// Some boilerplate is needed, as rowan settled on using its own +/// `struct SyntaxKind(u16)` internally, instead of accepting the +/// user's `enum SyntaxKind` as a type parameter. +/// +/// First, to easily pass the enum variants into rowan via `.into()`: +impl From<SyntaxKind> for rowan::SyntaxKind { + fn from(kind: SyntaxKind) -> Self { + Self(kind as u16) + } +} + +/// Second, implementing the `Language` trait teaches rowan to convert between +/// these two SyntaxKind types, allowing for a nicer SyntaxNode API where +/// "kinds" are values from our `enum SyntaxKind`, instead of plain u16 values. +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +enum Lang {} +impl rowan::Language for Lang { + type Kind = SyntaxKind; + fn kind_from_raw(raw: rowan::SyntaxKind) -> Self::Kind { + assert!(raw.0 <= ROOT as u16); + unsafe { std::mem::transmute::<u16, SyntaxKind>(raw.0) } + } + fn kind_to_raw(kind: Self::Kind) -> rowan::SyntaxKind { + kind.into() + } +} + +/// GreenNode is an immutable tree, which is cheap to change, +/// but doesn't contain offsets and parent pointers. +use rowan::GreenNode; + +/// You can construct GreenNodes by hand, but a builder +/// is helpful for top-down parsers: it maintains a stack +/// of currently in-progress nodes +use rowan::GreenNodeBuilder; + +/// The parse results are stored as a "green tree". +/// We'll discuss working with the results later +struct Parse { + green_node: GreenNode, + #[allow(unused)] + errors: Vec<String>, +} + +/// Now, let's write a parser. +/// Note that `parse` does not return a `Result`: +/// by design, syntax tree can be built even for +/// completely invalid source code. +fn parse(text: &str) -> Parse { + struct Parser { + /// input tokens, including whitespace, + /// in *reverse* order. + tokens: Vec<(SyntaxKind, String)>, + /// the in-progress tree. + builder: GreenNodeBuilder<'static>, + /// the list of syntax errors we've accumulated + /// so far. + errors: Vec<String>, + } + + /// The outcome of parsing a single S-expression + enum SexpRes { + /// An S-expression (i.e. an atom, or a list) was successfully parsed + Ok, + /// Nothing was parsed, as no significant tokens remained + Eof, + /// An unexpected ')' was found + RParen, + } + + impl Parser { + fn parse(mut self) -> Parse { + // Make sure that the root node covers all source + self.builder.start_node(ROOT.into()); + // Parse zero or more S-expressions + loop { + match self.sexp() { + SexpRes::Eof => break, + SexpRes::RParen => { + self.builder.start_node(ERROR.into()); + self.errors.push("unmatched `)`".to_string()); + self.bump(); // be sure to chug along in case of error + self.builder.finish_node(); + } + SexpRes::Ok => (), + } + } + // Don't forget to eat *trailing* whitespace + self.skip_ws(); + // Close the root node. + self.builder.finish_node(); + + // Turn the builder into a GreenNode + Parse { green_node: self.builder.finish(), errors: self.errors } + } + fn list(&mut self) { + assert_eq!(self.current(), Some(L_PAREN)); + // Start the list node + self.builder.start_node(LIST.into()); + self.bump(); // '(' + loop { + match self.sexp() { + SexpRes::Eof => { + self.errors.push("expected `)`".to_string()); + break; + } + SexpRes::RParen => { + self.bump(); + break; + } + SexpRes::Ok => (), + } + } + // close the list node + self.builder.finish_node(); + } + fn sexp(&mut self) -> SexpRes { + // Eat leading whitespace + self.skip_ws(); + // Either a list, an atom, a closing paren, + // or an eof. + let t = match self.current() { + None => return SexpRes::Eof, + Some(R_PAREN) => return SexpRes::RParen, + Some(t) => t, + }; + match t { + L_PAREN => self.list(), + WORD => { + self.builder.start_node(ATOM.into()); + self.bump(); + self.builder.finish_node(); + } + ERROR => self.bump(), + _ => unreachable!(), + } + SexpRes::Ok + } + /// Advance one token, adding it to the current branch of the tree builder. + fn bump(&mut self) { + let (kind, text) = self.tokens.pop().unwrap(); + self.builder.token(kind.into(), text.as_str()); + } + /// Peek at the first unprocessed token + fn current(&self) -> Option<SyntaxKind> { + self.tokens.last().map(|(kind, _)| *kind) + } + fn skip_ws(&mut self) { + while self.current() == Some(WHITESPACE) { + self.bump() + } + } + } + + let mut tokens = lex(text); + tokens.reverse(); + Parser { tokens, builder: GreenNodeBuilder::new(), errors: Vec::new() }.parse() +} + +/// To work with the parse results we need a view into the +/// green tree - the Syntax tree. +/// It is also immutable, like a GreenNode, +/// but it contains parent pointers, offsets, and +/// has identity semantics. + +type SyntaxNode = rowan::SyntaxNode<Lang>; +#[allow(unused)] +type SyntaxToken = rowan::SyntaxToken<Lang>; +#[allow(unused)] +type SyntaxElement = rowan::NodeOrToken<SyntaxNode, SyntaxToken>; + +impl Parse { + fn syntax(&self) -> SyntaxNode { + SyntaxNode::new_root(self.green_node.clone()) + } +} + +/// Let's check that the parser works as expected +#[test] +fn test_parser() { + let text = "(+ (* 15 2) 62)"; + let node = parse(text).syntax(); + assert_eq!( + format!("{:?}", node), + "ROOT@0..15", // root node, spanning 15 bytes + ); + assert_eq!(node.children().count(), 1); + let list = node.children().next().unwrap(); + let children = list + .children_with_tokens() + .map(|child| format!("{:?}@{:?}", child.kind(), child.text_range())) + .collect::<Vec<_>>(); + + assert_eq!( + children, + vec![ + "L_PAREN@0..1".to_string(), + "ATOM@1..2".to_string(), + "WHITESPACE@2..3".to_string(), // note, explicit whitespace! + "LIST@3..11".to_string(), + "WHITESPACE@11..12".to_string(), + "ATOM@12..14".to_string(), + "R_PAREN@14..15".to_string(), + ] + ); +} + +/// So far, we've been working with a homogeneous untyped tree. +/// It's nice to provide generic tree operations, like traversals, +/// but it's a bad fit for semantic analysis. +/// This crate itself does not provide AST facilities directly, +/// but it is possible to layer AST on top of `SyntaxNode` API. +/// Let's write a function to evaluate S-expression. +/// +/// For that, let's define AST nodes. +/// It'll be quite a bunch of repetitive code, so we'll use a macro. +/// +/// For a real language, you'd want to generate an AST. I find a +/// combination of `serde`, `ron` and `tera` crates invaluable for that! +macro_rules! ast_node { + ($ast:ident, $kind:ident) => { + #[derive(PartialEq, Eq, Hash)] + #[repr(transparent)] + struct $ast(SyntaxNode); + impl $ast { + #[allow(unused)] + fn cast(node: SyntaxNode) -> Option<Self> { + if node.kind() == $kind { + Some(Self(node)) + } else { + None + } + } + } + }; +} + +ast_node!(Root, ROOT); +ast_node!(Atom, ATOM); +ast_node!(List, LIST); + +// Sexp is slightly different, so let's do it by hand. +#[derive(PartialEq, Eq, Hash)] +#[repr(transparent)] +struct Sexp(SyntaxNode); + +enum SexpKind { + Atom(Atom), + List(List), +} + +impl Sexp { + fn cast(node: SyntaxNode) -> Option<Self> { + if Atom::cast(node.clone()).is_some() || List::cast(node.clone()).is_some() { + Some(Sexp(node)) + } else { + None + } + } + + fn kind(&self) -> SexpKind { + Atom::cast(self.0.clone()) + .map(SexpKind::Atom) + .or_else(|| List::cast(self.0.clone()).map(SexpKind::List)) + .unwrap() + } +} + +// Let's enhance AST nodes with ancillary functions and +// eval. +impl Root { + fn sexps(&self) -> impl Iterator<Item = Sexp> + '_ { + self.0.children().filter_map(Sexp::cast) + } +} + +enum Op { + Add, + Sub, + Div, + Mul, +} + +impl Atom { + fn eval(&self) -> Option<i64> { + self.text().parse().ok() + } + fn as_op(&self) -> Option<Op> { + let op = match self.text().as_str() { + "+" => Op::Add, + "-" => Op::Sub, + "*" => Op::Mul, + "/" => Op::Div, + _ => return None, + }; + Some(op) + } + fn text(&self) -> String { + match self.0.green().children().next() { + Some(rowan::NodeOrToken::Token(token)) => token.text().to_string(), + _ => unreachable!(), + } + } +} + +impl List { + fn sexps(&self) -> impl Iterator<Item = Sexp> + '_ { + self.0.children().filter_map(Sexp::cast) + } + fn eval(&self) -> Option<i64> { + let op = match self.sexps().nth(0)?.kind() { + SexpKind::Atom(atom) => atom.as_op()?, + _ => return None, + }; + let arg1 = self.sexps().nth(1)?.eval()?; + let arg2 = self.sexps().nth(2)?.eval()?; + let res = match op { + Op::Add => arg1 + arg2, + Op::Sub => arg1 - arg2, + Op::Mul => arg1 * arg2, + Op::Div if arg2 == 0 => return None, + Op::Div => arg1 / arg2, + }; + Some(res) + } +} + +impl Sexp { + fn eval(&self) -> Option<i64> { + match self.kind() { + SexpKind::Atom(atom) => atom.eval(), + SexpKind::List(list) => list.eval(), + } + } +} + +impl Parse { + fn root(&self) -> Root { + Root::cast(self.syntax()).unwrap() + } +} + +/// Let's test the eval! +fn main() { + let sexps = " +92 +(+ 62 30) +(/ 92 0) +nan +(+ (* 15 2) 62) +"; + let root = parse(sexps).root(); + let res = root.sexps().map(|it| it.eval()).collect::<Vec<_>>(); + eprintln!("{:?}", res); + assert_eq!(res, vec![Some(92), Some(92), None, None, Some(92),]) +} + +/// Split the input string into a flat list of tokens +/// (such as L_PAREN, WORD, and WHITESPACE) +fn lex(text: &str) -> Vec<(SyntaxKind, String)> { + fn tok(t: SyntaxKind) -> m_lexer::TokenKind { + m_lexer::TokenKind(rowan::SyntaxKind::from(t).0) + } + fn kind(t: m_lexer::TokenKind) -> SyntaxKind { + match t.0 { + 0 => L_PAREN, + 1 => R_PAREN, + 2 => WORD, + 3 => WHITESPACE, + 4 => ERROR, + _ => unreachable!(), + } + } + + let lexer = m_lexer::LexerBuilder::new() + .error_token(tok(ERROR)) + .tokens(&[ + (tok(L_PAREN), r"\("), + (tok(R_PAREN), r"\)"), + (tok(WORD), r"[^\s()]+"), + (tok(WHITESPACE), r"\s+"), + ]) + .build(); + + lexer + .tokenize(text) + .into_iter() + .map(|t| (t.len, kind(t.kind))) + .scan(0usize, |start_offset, (len, kind)| { + let s: String = text[*start_offset..*start_offset + len].into(); + *start_offset += len; + Some((kind, s)) + }) + .collect() +} |