
update S-expression example/tutorial

Domenic Quirl 2021-02-10 23:56:17 +01:00
parent 1b7985b8bd
commit d5d3f7afa8


@@ -1,19 +1,13 @@
-//! In this tutorial, we will write parser
-//! and evaluator of arithmetic S-expressions,
-//! which look like this:
+//! In this tutorial, we will write a parser and evaluator of arithmetic S-expressions, which look
+//! like this:
 //! ```
 //! (+ (* 15 2) 62)
 //! ```
 //!
-//! It's suggested to read the conceptual overview of the design
-//! alongside this tutorial:
+//! You may want to follow the conceptual overview of the design alongside this tutorial:
 //! https://github.com/rust-analyzer/rust-analyzer/blob/master/docs/dev/syntax.md
-/// cstree uses `TextSize` and `TextRange` types to
-/// represent utf8 offsets and ranges.
-/// Let's start with defining all kinds of tokens and
-/// composite nodes.
+/// Let's start by defining all kinds of tokens and composite nodes.
 #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
 #[allow(non_camel_case_types)]
 #[repr(u16)]
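
The unchanged variants of `enum SyntaxKind` fall outside this hunk. Judging from the kinds used throughout the example (`L_PAREN`, `R_PAREN`, `WORD`, `WHITESPACE`, `ERROR`, `LIST`, `ATOM`, `ROOT`), the full declaration presumably looks roughly like this; the variant order is an assumption:

```rust
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[allow(non_camel_case_types)]
#[repr(u16)]
enum SyntaxKind {
    L_PAREN = 0, // token kinds: `(`
    R_PAREN,     // `)`
    WORD,        // `+`, `15`, any bare word
    WHITESPACE,  // whitespace between words
    ERROR,       // as a token: unexpected input; as a node: wraps it
    LIST,        // composite node: `( ... )`
    ATOM,        // `+`, `15`, wraps a WORD token
    ROOT,        // top-level node: a list of s-expressions
}
```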
@@ -29,11 +23,12 @@ enum SyntaxKind {
     ATOM, // `+`, `15`, wraps a WORD token
     ROOT, // top-level node: a list of s-expressions
 }
+use std::collections::VecDeque;
 use SyntaxKind::*;
-/// Some boilerplate is needed, as cstree settled on using its own
-/// `struct SyntaxKind(u16)` internally, instead of accepting the
-/// user's `enum SyntaxKind` as a type parameter.
+/// Some boilerplate is needed, as cstree represents kinds as `struct SyntaxKind(u16)` internally,
+/// in order to not need the user's `enum SyntaxKind` as a type parameter.
 ///
 /// First, to easily pass the enum variants into cstree via `.into()`:
 impl From<SyntaxKind> for cstree::SyntaxKind {
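
The body of this `From` impl is cut off by the hunk boundary; for a `#[repr(u16)]` enum it is presumably just the numeric cast:

```rust
impl From<SyntaxKind> for cstree::SyntaxKind {
    fn from(kind: SyntaxKind) -> Self {
        Self(kind as u16)
    }
}
```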
@@ -42,9 +37,9 @@ impl From<SyntaxKind> for cstree::SyntaxKind {
     }
 }
-/// Second, implementing the `Language` trait teaches cstree to convert between
-/// these two SyntaxKind types, allowing for a nicer SyntaxNode API where
-/// "kinds" are values from our `enum SyntaxKind`, instead of plain u16 values.
+/// Second, implementing the `Language` trait teaches cstree to convert between these two SyntaxKind
+/// types, allowing for a nicer SyntaxNode API where "kinds" are values from our `enum SyntaxKind`,
+/// instead of plain u16 values.
 #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
 enum Lang {}
 impl cstree::Language for Lang {
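
The `Language` impl itself is also elided by the diff. A minimal sketch of the two required conversions, assuming `ROOT` is the numerically largest kind so the transmute stays in bounds:

```rust
impl cstree::Language for Lang {
    type Kind = SyntaxKind;

    fn kind_from_raw(raw: cstree::SyntaxKind) -> Self::Kind {
        // sound because `SyntaxKind` is `#[repr(u16)]` and the bound is checked
        assert!(raw.0 <= ROOT as u16);
        unsafe { std::mem::transmute::<u16, SyntaxKind>(raw.0) }
    }

    fn kind_to_raw(kind: Self::Kind) -> cstree::SyntaxKind {
        kind.into()
    }
}
```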
@@ -60,17 +55,18 @@ impl cstree::Language for Lang {
     }
 }
-/// GreenNode is an immutable tree, which is cheap to change,
-/// but doesn't contain offsets and parent pointers.
+/// GreenNode is an immutable tree, which caches identical nodes and tokens, but doesn't contain
+/// offsets and parent pointers.
+/// cstree also deduplicates the actual source string in addition to the tree nodes, so we will need
+/// the Resolver to get the real text back from the interned representation.
 use cstree::{interning::Resolver, GreenNode};
-/// You can construct GreenNodes by hand, but a builder
-/// is helpful for top-down parsers: it maintains a stack
-/// of currently in-progress nodes
+/// You can construct GreenNodes by hand, but a builder is helpful for top-down parsers: it maintains
+/// a stack of currently in-progress nodes.
 use cstree::GreenNodeBuilder;
 /// The parse results are stored as a "green tree".
-/// We'll discuss working with the results later
+/// We'll discuss how to work with the results later.
 struct Parse<I> {
     green_node: GreenNode,
     resolver: I,
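
To make the builder's stack discipline concrete, here is a minimal hand-built sketch for the input `(+ 1)`, using only calls that appear in this diff (`start_node`, `token`, `finish_node`, `finish`); the exact node structure is illustrative:

```rust
let mut builder = GreenNodeBuilder::new();
builder.start_node(ROOT.into()); // push ROOT onto the stack of open nodes
builder.start_node(LIST.into()); //   push LIST
builder.token(L_PAREN.into(), "(");
builder.start_node(ATOM.into()); //     push ATOM
builder.token(WORD.into(), "+");
builder.finish_node(); //     pop ATOM
builder.token(WHITESPACE.into(), " ");
builder.start_node(ATOM.into());
builder.token(WORD.into(), "1");
builder.finish_node();
builder.token(R_PAREN.into(), ")");
builder.finish_node(); //   pop LIST
builder.finish_node(); // pop ROOT
// returns the finished tree plus the interner the builder created for us
let (green, interner) = builder.finish();
```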
@@ -80,17 +76,14 @@ struct Parse<I> {
 /// Now, let's write a parser.
 /// Note that `parse` does not return a `Result`:
-/// by design, syntax tree can be built even for
-/// completely invalid source code.
+/// By design, syntax trees can be built even for completely invalid source code.
 fn parse(text: &str) -> Parse<impl Resolver> {
     struct Parser<'input> {
-        /// input tokens, including whitespace,
-        /// in *reverse* order.
-        tokens: Vec<(SyntaxKind, &'input str)>,
-        /// the in-progress tree.
+        /// input tokens, including whitespace.
+        tokens: VecDeque<(SyntaxKind, &'input str)>,
+        /// the in-progress green tree.
         builder: GreenNodeBuilder<'static, 'static>,
-        /// the list of syntax errors we've accumulated
-        /// so far.
+        /// the list of syntax errors we've accumulated so far.
         errors: Vec<String>,
     }
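
The `SexpRes` values matched on in the hunks below come from a helper enum that the diff never shows; it is presumably just:

```rust
/// The outcome of trying to parse a single S-expression.
enum SexpRes {
    /// the input ran out
    Eof,
    /// an unexpected `)` was found
    RParen,
    /// one S-expression was parsed
    Ok,
}
```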
@@ -115,10 +108,10 @@ fn parse(text: &str) -> Parse<impl Resolver> {
                     SexpRes::RParen => {
                         self.builder.start_node(ERROR.into());
                         self.errors.push("unmatched `)`".to_string());
-                        self.bump(); // be sure to chug along in case of error
+                        self.bump(); // be sure to advance even in case of an error, so as to not get stuck
                         self.builder.finish_node();
                     }
-                    SexpRes::Ok => (),
+                    SexpRes::Ok => {}
                 }
             }
             // Don't forget to eat *trailing* whitespace
@@ -126,11 +119,13 @@ fn parse(text: &str) -> Parse<impl Resolver> {
             // Close the root node.
             self.builder.finish_node();
-            // Turn the builder into a GreenNode
-            let (tree, resolver) = self.builder.finish();
+            // Get the green tree from the builder.
+            // Note that, since we didn't provide our own interner to the builder, it has
+            // instantiated one for us and now returns it together with the tree.
+            let (tree, interner) = self.builder.finish();
             Parse {
                 green_node: tree,
-                resolver: resolver.unwrap().into_resolver(),
+                resolver: interner.unwrap().into_resolver(),
                 errors: self.errors,
             }
         }
@@ -150,7 +145,7 @@ fn parse(text: &str) -> Parse<impl Resolver> {
                         self.bump();
                         break;
                     }
-                    SexpRes::Ok => (),
+                    SexpRes::Ok => {}
                 }
             }
             // close the list node
@@ -160,8 +155,7 @@ fn parse(text: &str) -> Parse<impl Resolver> {
         fn sexp(&mut self) -> SexpRes {
             // Eat leading whitespace
             self.skip_ws();
-            // Either a list, an atom, a closing paren,
-            // or an eof.
+            // Either a list, an atom, a closing paren, or an eof.
             let t = match self.current() {
                 None => return SexpRes::Eof,
                 Some(R_PAREN) => return SexpRes::RParen,
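
The remaining arms of this `match` fall outside the hunk. From the surrounding machinery, they presumably continue roughly like this (a sketch, not the verbatim file):

```rust
// a nested list: recurse, then report success
Some(L_PAREN) => {
    self.list();
    return SexpRes::Ok;
}
// any other token (a WORD) falls through as `t`
// and gets wrapped in an ATOM node afterwards
Some(kind) => kind,
```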
@@ -182,13 +176,13 @@ fn parse(text: &str) -> Parse<impl Resolver> {
         /// Advance one token, adding it to the current branch of the tree builder.
         fn bump(&mut self) {
-            let (kind, text) = self.tokens.pop().unwrap();
+            let (kind, text) = self.tokens.pop_front().unwrap();
             self.builder.token(kind.into(), text);
         }
         /// Peek at the first unprocessed token
         fn current(&self) -> Option<SyntaxKind> {
-            self.tokens.last().map(|(kind, _)| *kind)
+            self.tokens.front().map(|(kind, _)| *kind)
         }
         fn skip_ws(&mut self) {
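
The switch from `Vec` to `VecDeque` is what removes the reversal step in the next hunk: `Vec::pop` takes from the back, so the old code had to reverse the token list to consume tokens in source order, while `VecDeque::pop_front` yields them front-to-back directly. A standalone illustration of the equivalence:

```rust
use std::collections::VecDeque;

fn main() {
    let lexed = vec![("(", 1u32), ("+", 2), (")", 3)];

    // old approach: reverse once, then pop from the back
    let mut stack = lexed.clone();
    stack.reverse();
    assert_eq!(stack.pop(), Some(("(", 1)));

    // new approach: pop from the front, no reversal needed
    let mut queue: VecDeque<_> = lexed.into_iter().collect();
    assert_eq!(queue.pop_front(), Some(("(", 1)));
}
```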
@@ -198,30 +192,29 @@ fn parse(text: &str) -> Parse<impl Resolver> {
         }
     }
-    let mut tokens = lex(text);
-    tokens.reverse();
     Parser {
-        tokens,
+        tokens: lex(text),
         builder: GreenNodeBuilder::new(),
         errors: Vec::new(),
     }
     .parse()
 }
-/// To work with the parse results we need a view into the
-/// green tree - the Syntax tree.
-/// It is also immutable, like a GreenNode,
-/// but it contains parent pointers, offsets, and
-/// has identity semantics.
+/// To work with the parse results we need a view into the green tree - the syntax tree.
+/// It is also immutable, like a GreenNode, but it contains parent pointers, offsets, and has
+/// identity semantics.
 type SyntaxNode = cstree::SyntaxNode<Lang>;
 #[allow(unused)]
 type SyntaxToken = cstree::SyntaxToken<Lang>;
 #[allow(unused)]
-type SyntaxElement = cstree::NodeOrToken<SyntaxNode, SyntaxToken>;
+type SyntaxElement = cstree::SyntaxElement<Lang>;
 impl<I> Parse<I> {
     fn syntax(&self) -> SyntaxNode {
+        // If we owned `self`, we could use `new_root_with_resolver` instead at this point to attach
+        // `self.resolver` to the tree. This simplifies retrieving text and provides automatic
+        // implementations for useful traits like `Display`, but also consumes the resolver (it can
+        // still be accessed indirectly via the `resolver` method).
         SyntaxNode::new_root(self.green_node.clone())
     }
 }
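
As a hedged sketch of what the inserted comment describes: an owning counterpart to `syntax` might look roughly like the following. The method name `new_root_with_resolver` is taken from the comment itself, but `into_syntax_text`, the bounds, and the exact return type of the call are assumptions that depend on the cstree version, not confirmed API:

```rust
// hypothetical sketch, not confirmed cstree API
impl<I: Resolver> Parse<I> {
    fn into_syntax_text(self) -> String {
        // consumes `self.resolver`; the resulting tree can resolve its own
        // text, which is what enables the `Display` implementation
        let node = SyntaxNode::new_root_with_resolver(self.green_node, self.resolver);
        format!("{}", node)
    }
}
```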
@@ -234,6 +227,7 @@ fn test_parser() {
     let node = parse.syntax();
     let resolver = &parse.resolver;
     assert_eq!(
+        // note how, since we didn't attach the resolver in `syntax`, we now need to provide it
         node.debug(resolver, false),
         "ROOT@0..15", // root node, spanning 15 bytes
     );
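
The second argument to `debug` toggles recursion; with `false` only the root line is rendered. Passing `true` instead would dump the nested tree, along these lines (output shape illustrative, abbreviated):

```rust
// ROOT@0..15
//   LIST@0..15
//     L_PAREN@0..1 "("
//     ... (tokens and nested nodes follow, with their byte ranges)
println!("{}", node.debug(resolver, true));
```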
@@ -259,17 +253,13 @@
 }
 /// So far, we've been working with a homogeneous untyped tree.
-/// It's nice to provide generic tree operations, like traversals,
-/// but it's a bad fit for semantic analysis.
-/// This crate itself does not provide AST facilities directly,
-/// but it is possible to layer AST on top of `SyntaxNode` API.
-/// Let's write a function to evaluate S-expression.
+/// Such a tree is nice for providing generic tree operations, like traversals, but it's a bad fit
+/// for semantic analysis. cstree itself does not provide AST facilities directly, but it is
+/// possible to layer an AST on top of the `SyntaxNode` API. Let's write a function to evaluate S-expressions.
 ///
 /// For that, let's define AST nodes.
 /// It'll be quite a bunch of repetitive code, so we'll use a macro.
-///
-/// For a real language, you'd want to generate an AST. I find a
-/// combination of `serde`, `ron` and `tera` crates invaluable for that!
+/// For a real language, you may want to automatically generate the AST implementations with a build task.
 macro_rules! ast_node {
     ($ast:ident, $kind:ident) => {
         #[derive(PartialEq, Eq, Hash)]
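
The macro body continues past the hunk boundary; from how `cast` is used further down, the expansion presumably pairs the newtype with a kind-checked constructor along these lines:

```rust
macro_rules! ast_node {
    ($ast:ident, $kind:ident) => {
        #[derive(PartialEq, Eq, Hash)]
        #[repr(transparent)]
        struct $ast(SyntaxNode);
        impl $ast {
            #[allow(unused)]
            fn cast(node: SyntaxNode) -> Option<Self> {
                // only wrap nodes of the matching kind
                if node.kind() == $kind {
                    Some(Self(node))
                } else {
                    None
                }
            }
        }
    };
}
```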
@@ -292,7 +282,7 @@ ast_node!(Root, ROOT);
 ast_node!(Atom, ATOM);
 ast_node!(List, LIST);
-// Sexp is slightly different, so let's do it by hand.
+// Sexp is slightly different because it can be either an atom or a list, so let's do it by hand.
 #[derive(PartialEq, Eq, Hash)]
 #[repr(transparent)]
 struct Sexp(SyntaxNode);
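
A sketch of the hand-written part: since a `Sexp` is either an atom or a list, `cast` accepts both kinds, and an accessor dispatches between them (the names `SexpKind` and `kind` are assumptions about the elided code):

```rust
enum SexpKind {
    Atom(Atom),
    List(List),
}

impl Sexp {
    fn cast(node: SyntaxNode) -> Option<Self> {
        // a Sexp is any node that is either an ATOM or a LIST
        if Atom::cast(node.clone()).is_some() || List::cast(node.clone()).is_some() {
            Some(Sexp(node))
        } else {
            None
        }
    }

    fn kind(&self) -> SexpKind {
        Atom::cast(self.0.clone())
            .map(SexpKind::Atom)
            .unwrap_or_else(|| SexpKind::List(List::cast(self.0.clone()).unwrap()))
    }
}
```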
@@ -319,8 +309,7 @@ impl Sexp {
     }
 }
-// Let's enhance AST nodes with ancillary functions and
-// eval.
+// Let's enhance AST nodes with ancillary functions and eval.
 impl Root {
     fn sexps(&self) -> impl Iterator<Item = Sexp> + '_ {
         self.0.children().cloned().filter_map(Sexp::cast)
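
And a minimal sketch of evaluation itself: atoms evaluate to integer literals, lists apply their head operator to the evaluated tail. Since reading an atom's source text goes through the resolver and the exact text API is version-dependent, the `text_of` closure below is a hypothetical stand-in for that step:

```rust
// `text_of` is a hypothetical helper: "read this atom's text via the resolver"
fn eval(sexp: &Sexp, text_of: &dyn Fn(&Atom) -> String) -> Option<i64> {
    match sexp.kind() {
        // an atom evaluates to itself if it parses as an integer
        SexpKind::Atom(atom) => text_of(atom).parse().ok(),
        // a list is `(op arg*)`: evaluate the args, then fold them with the op
        SexpKind::List(list) => {
            let mut items = list.0.children().cloned().filter_map(Sexp::cast);
            let op = match items.next()?.kind() {
                SexpKind::Atom(op) => text_of(&op),
                SexpKind::List(_) => return None,
            };
            let args: Vec<i64> = items.map(|s| eval(&s, text_of)).collect::<Option<_>>()?;
            match op.as_str() {
                "+" => Some(args.into_iter().sum()),
                "*" => Some(args.into_iter().product()),
                _ => None,
            }
        }
    }
}
```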
@@ -413,9 +402,8 @@ nan
     assert_eq!(res, vec![Some(92), Some(92), None, None, Some(92),])
 }
-/// Split the input string into a flat list of tokens
-/// (such as L_PAREN, WORD, and WHITESPACE)
-fn lex(text: &str) -> Vec<(SyntaxKind, &str)> {
+/// Split the input string into a flat list of tokens (such as L_PAREN, WORD, and WHITESPACE)
+fn lex(text: &str) -> VecDeque<(SyntaxKind, &str)> {
     fn tok(t: SyntaxKind) -> m_lexer::TokenKind {
         m_lexer::TokenKind(cstree::SyntaxKind::from(t).0)
     }
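
For intuition, the lexer's output for a small input would look like this (kinds as defined at the top of the tutorial):

```rust
let tokens: Vec<_> = lex("(+ 1)").into_iter().collect();
assert_eq!(
    tokens,
    vec![
        (L_PAREN, "("),
        (WORD, "+"),
        (WHITESPACE, " "),
        (WORD, "1"),
        (R_PAREN, ")"),
    ]
);
```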
@@ -445,6 +433,7 @@ fn lex(text: &str) -> Vec<(SyntaxKind, &str)> {
         .into_iter()
         .map(|t| (t.len, kind(t.kind)))
         .scan(0usize, |start_offset, (len, kind)| {
+            // reconstruct the item's source text from offset and len
             let s = &text[*start_offset..*start_offset + len];
             *start_offset += len;
             Some((kind, s))
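
The `scan` threads a running byte offset through the token stream so that each token's text can be sliced back out of the original input; the same pattern in isolation:

```rust
fn main() {
    let text = "(+ 1)";
    let lens = [1usize, 1, 1, 1, 1]; // per-token lengths, as a lexer would report them
    let slices: Vec<&str> = lens
        .iter()
        .scan(0usize, |start_offset, &len| {
            // slice this token's text out of the original input
            let s = &text[*start_offset..*start_offset + len];
            *start_offset += len;
            Some(s)
        })
        .collect();
    assert_eq!(slices, vec!["(", "+", " ", "1", ")"]);
}
```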