mirror of
https://github.com/RGBCube/cstree
synced 2025-07-27 09:07:44 +00:00
update S-expression example/tutorial
This commit is contained in:
parent
1b7985b8bd
commit
d5d3f7afa8
1 changed files with 54 additions and 65 deletions
|
@ -1,19 +1,13 @@
|
||||||
//! In this tutorial, we will write parser
|
//! In this tutorial, we will write a parser and an evaluator for arithmetic S-expressions, which look like
|
||||||
//! and evaluator of arithmetic S-expressions,
|
//! this:
|
||||||
//! which look like this:
|
|
||||||
//! ```
|
//! ```
|
||||||
//! (+ (* 15 2) 62)
|
//! (+ (* 15 2) 62)
|
||||||
//! ```
|
//! ```
|
||||||
//!
|
//!
|
||||||
//! It's suggested to read the conceptual overview of the design
|
//! You may want to follow the conceptual overview of the design alongside this tutorial:
|
||||||
//! alongside this tutorial:
|
|
||||||
//! https://github.com/rust-analyzer/rust-analyzer/blob/master/docs/dev/syntax.md
|
//! https://github.com/rust-analyzer/rust-analyzer/blob/master/docs/dev/syntax.md
|
||||||
|
|
||||||
/// cstree uses `TextSize` and `TextRange` types to
|
/// Let's start with defining all kinds of tokens and composite nodes.
|
||||||
/// represent utf8 offsets and ranges.
|
|
||||||
|
|
||||||
/// Let's start with defining all kinds of tokens and
|
|
||||||
/// composite nodes.
|
|
||||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
|
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
|
||||||
#[allow(non_camel_case_types)]
|
#[allow(non_camel_case_types)]
|
||||||
#[repr(u16)]
|
#[repr(u16)]
|
||||||
|
@ -29,11 +23,12 @@ enum SyntaxKind {
|
||||||
ATOM, // `+`, `15`, wraps a WORD token
|
ATOM, // `+`, `15`, wraps a WORD token
|
||||||
ROOT, // top-level node: a list of s-expressions
|
ROOT, // top-level node: a list of s-expressions
|
||||||
}
|
}
|
||||||
|
use std::collections::VecDeque;
|
||||||
|
|
||||||
use SyntaxKind::*;
|
use SyntaxKind::*;
|
||||||
|
|
||||||
/// Some boilerplate is needed, as cstree settled on using its own
|
/// Some boilerplate is needed, as cstree represents kinds as `struct SyntaxKind(u16)` internally,
|
||||||
/// `struct SyntaxKind(u16)` internally, instead of accepting the
|
/// in order to not need the user's `enum SyntaxKind` as a type parameter.
|
||||||
/// user's `enum SyntaxKind` as a type parameter.
|
|
||||||
///
|
///
|
||||||
/// First, to easily pass the enum variants into cstree via `.into()`:
|
/// First, to easily pass the enum variants into cstree via `.into()`:
|
||||||
impl From<SyntaxKind> for cstree::SyntaxKind {
|
impl From<SyntaxKind> for cstree::SyntaxKind {
|
||||||
|
@ -42,9 +37,9 @@ impl From<SyntaxKind> for cstree::SyntaxKind {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Second, implementing the `Language` trait teaches cstree to convert between
|
/// Second, implementing the `Language` trait teaches cstree to convert between these two SyntaxKind
|
||||||
/// these two SyntaxKind types, allowing for a nicer SyntaxNode API where
|
/// types, allowing for a nicer SyntaxNode API where "kinds" are values from our `enum SyntaxKind`,
|
||||||
/// "kinds" are values from our `enum SyntaxKind`, instead of plain u16 values.
|
/// instead of plain u16 values.
|
||||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
|
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
|
||||||
enum Lang {}
|
enum Lang {}
|
||||||
impl cstree::Language for Lang {
|
impl cstree::Language for Lang {
|
||||||
|
@ -60,17 +55,18 @@ impl cstree::Language for Lang {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// GreenNode is an immutable tree, which is cheap to change,
|
/// GreenNode is an immutable tree, which caches identical nodes and tokens, but doesn't contain
|
||||||
/// but doesn't contain offsets and parent pointers.
|
/// offsets and parent pointers.
|
||||||
|
/// cstree also deduplicates the actual source string in addition to the tree nodes, so we will need
|
||||||
|
/// the Resolver to get the real text back from the interned representation.
|
||||||
use cstree::{interning::Resolver, GreenNode};
|
use cstree::{interning::Resolver, GreenNode};
|
||||||
|
|
||||||
/// You can construct GreenNodes by hand, but a builder
|
/// You can construct GreenNodes by hand, but a builder is helpful for top-down parsers: it maintains
|
||||||
/// is helpful for top-down parsers: it maintains a stack
|
/// a stack of currently in-progress nodes.
|
||||||
/// of currently in-progress nodes
|
|
||||||
use cstree::GreenNodeBuilder;
|
use cstree::GreenNodeBuilder;
|
||||||
|
|
||||||
/// The parse results are stored as a "green tree".
|
/// The parse results are stored as a "green tree".
|
||||||
/// We'll discuss working with the results later
|
/// We'll discuss how to work with the results later.
|
||||||
struct Parse<I> {
|
struct Parse<I> {
|
||||||
green_node: GreenNode,
|
green_node: GreenNode,
|
||||||
resolver: I,
|
resolver: I,
|
||||||
|
@ -80,17 +76,14 @@ struct Parse<I> {
|
||||||
|
|
||||||
/// Now, let's write a parser.
|
/// Now, let's write a parser.
|
||||||
/// Note that `parse` does not return a `Result`:
|
/// Note that `parse` does not return a `Result`:
|
||||||
/// by design, syntax tree can be built even for
|
/// By design, syntax trees can be built even for completely invalid source code.
|
||||||
/// completely invalid source code.
|
|
||||||
fn parse(text: &str) -> Parse<impl Resolver> {
|
fn parse(text: &str) -> Parse<impl Resolver> {
|
||||||
struct Parser<'input> {
|
struct Parser<'input> {
|
||||||
/// input tokens, including whitespace,
|
/// input tokens, including whitespace.
|
||||||
/// in *reverse* order.
|
tokens: VecDeque<(SyntaxKind, &'input str)>,
|
||||||
tokens: Vec<(SyntaxKind, &'input str)>,
|
/// the in-progress green tree.
|
||||||
/// the in-progress tree.
|
|
||||||
builder: GreenNodeBuilder<'static, 'static>,
|
builder: GreenNodeBuilder<'static, 'static>,
|
||||||
/// the list of syntax errors we've accumulated
|
/// the list of syntax errors we've accumulated so far.
|
||||||
/// so far.
|
|
||||||
errors: Vec<String>,
|
errors: Vec<String>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -115,10 +108,10 @@ fn parse(text: &str) -> Parse<impl Resolver> {
|
||||||
SexpRes::RParen => {
|
SexpRes::RParen => {
|
||||||
self.builder.start_node(ERROR.into());
|
self.builder.start_node(ERROR.into());
|
||||||
self.errors.push("unmatched `)`".to_string());
|
self.errors.push("unmatched `)`".to_string());
|
||||||
self.bump(); // be sure to chug along in case of error
|
self.bump(); // be sure to advance even in case of an error, so as to not get stuck
|
||||||
self.builder.finish_node();
|
self.builder.finish_node();
|
||||||
}
|
}
|
||||||
SexpRes::Ok => (),
|
SexpRes::Ok => {}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// Don't forget to eat *trailing* whitespace
|
// Don't forget to eat *trailing* whitespace
|
||||||
|
@ -126,11 +119,13 @@ fn parse(text: &str) -> Parse<impl Resolver> {
|
||||||
// Close the root node.
|
// Close the root node.
|
||||||
self.builder.finish_node();
|
self.builder.finish_node();
|
||||||
|
|
||||||
// Turn the builder into a GreenNode
|
// Get the green tree from the builder.
|
||||||
let (tree, resolver) = self.builder.finish();
|
// Note that, since we didn't provide our own interner to the builder, it has
|
||||||
|
// instantiated one for us and now returns it together with the tree.
|
||||||
|
let (tree, interner) = self.builder.finish();
|
||||||
Parse {
|
Parse {
|
||||||
green_node: tree,
|
green_node: tree,
|
||||||
resolver: resolver.unwrap().into_resolver(),
|
resolver: interner.unwrap().into_resolver(),
|
||||||
errors: self.errors,
|
errors: self.errors,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -150,7 +145,7 @@ fn parse(text: &str) -> Parse<impl Resolver> {
|
||||||
self.bump();
|
self.bump();
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
SexpRes::Ok => (),
|
SexpRes::Ok => {}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// close the list node
|
// close the list node
|
||||||
|
@ -160,8 +155,7 @@ fn parse(text: &str) -> Parse<impl Resolver> {
|
||||||
fn sexp(&mut self) -> SexpRes {
|
fn sexp(&mut self) -> SexpRes {
|
||||||
// Eat leading whitespace
|
// Eat leading whitespace
|
||||||
self.skip_ws();
|
self.skip_ws();
|
||||||
// Either a list, an atom, a closing paren,
|
// Either a list, an atom, a closing paren, or an eof.
|
||||||
// or an eof.
|
|
||||||
let t = match self.current() {
|
let t = match self.current() {
|
||||||
None => return SexpRes::Eof,
|
None => return SexpRes::Eof,
|
||||||
Some(R_PAREN) => return SexpRes::RParen,
|
Some(R_PAREN) => return SexpRes::RParen,
|
||||||
|
@ -182,13 +176,13 @@ fn parse(text: &str) -> Parse<impl Resolver> {
|
||||||
|
|
||||||
/// Advance one token, adding it to the current branch of the tree builder.
|
/// Advance one token, adding it to the current branch of the tree builder.
|
||||||
fn bump(&mut self) {
|
fn bump(&mut self) {
|
||||||
let (kind, text) = self.tokens.pop().unwrap();
|
let (kind, text) = self.tokens.pop_front().unwrap();
|
||||||
self.builder.token(kind.into(), text);
|
self.builder.token(kind.into(), text);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Peek at the first unprocessed token
|
/// Peek at the first unprocessed token
|
||||||
fn current(&self) -> Option<SyntaxKind> {
|
fn current(&self) -> Option<SyntaxKind> {
|
||||||
self.tokens.last().map(|(kind, _)| *kind)
|
self.tokens.front().map(|(kind, _)| *kind)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn skip_ws(&mut self) {
|
fn skip_ws(&mut self) {
|
||||||
|
@ -198,30 +192,29 @@ fn parse(text: &str) -> Parse<impl Resolver> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let mut tokens = lex(text);
|
|
||||||
tokens.reverse();
|
|
||||||
Parser {
|
Parser {
|
||||||
tokens,
|
tokens: lex(text),
|
||||||
builder: GreenNodeBuilder::new(),
|
builder: GreenNodeBuilder::new(),
|
||||||
errors: Vec::new(),
|
errors: Vec::new(),
|
||||||
}
|
}
|
||||||
.parse()
|
.parse()
|
||||||
}
|
}
|
||||||
|
|
||||||
/// To work with the parse results we need a view into the
|
/// To work with the parse results we need a view into the green tree - the syntax tree.
|
||||||
/// green tree - the Syntax tree.
|
/// It is also immutable, like a GreenNode, but it contains parent pointers, offsets, and has
|
||||||
/// It is also immutable, like a GreenNode,
|
/// identity semantics.
|
||||||
/// but it contains parent pointers, offsets, and
|
|
||||||
/// has identity semantics.
|
|
||||||
|
|
||||||
type SyntaxNode = cstree::SyntaxNode<Lang>;
|
type SyntaxNode = cstree::SyntaxNode<Lang>;
|
||||||
#[allow(unused)]
|
#[allow(unused)]
|
||||||
type SyntaxToken = cstree::SyntaxToken<Lang>;
|
type SyntaxToken = cstree::SyntaxToken<Lang>;
|
||||||
#[allow(unused)]
|
#[allow(unused)]
|
||||||
type SyntaxElement = cstree::NodeOrToken<SyntaxNode, SyntaxToken>;
|
type SyntaxElement = cstree::SyntaxElement<Lang>;
|
||||||
|
|
||||||
impl<I> Parse<I> {
|
impl<I> Parse<I> {
|
||||||
fn syntax(&self) -> SyntaxNode {
|
fn syntax(&self) -> SyntaxNode {
|
||||||
|
// If we owned `self`, we could use `new_root_with_resolver` instead at this point to attach
|
||||||
|
// `self.resolver` to the tree. This simplifies retrieving text and provides automatic
|
||||||
|
// implementations for useful traits like `Display`, but also consumes the resolver (it can
|
||||||
|
// still be accessed indirectly via the `resolver` method).
|
||||||
SyntaxNode::new_root(self.green_node.clone())
|
SyntaxNode::new_root(self.green_node.clone())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -234,6 +227,7 @@ fn test_parser() {
|
||||||
let node = parse.syntax();
|
let node = parse.syntax();
|
||||||
let resolver = &parse.resolver;
|
let resolver = &parse.resolver;
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
|
// note how, since we didn't attach the resolver in `syntax`, we now need to provide it
|
||||||
node.debug(resolver, false),
|
node.debug(resolver, false),
|
||||||
"ROOT@0..15", // root node, spanning 15 bytes
|
"ROOT@0..15", // root node, spanning 15 bytes
|
||||||
);
|
);
|
||||||
|
@ -259,17 +253,13 @@ fn test_parser() {
|
||||||
}
|
}
|
||||||
|
|
||||||
/// So far, we've been working with a homogeneous untyped tree.
|
/// So far, we've been working with a homogeneous untyped tree.
|
||||||
/// It's nice to provide generic tree operations, like traversals,
|
/// That tree is nice for providing generic tree operations, like traversals, but it's a bad fit for
|
||||||
/// but it's a bad fit for semantic analysis.
|
/// semantic analysis. cstree itself does not provide AST facilities directly, but it is possible to
|
||||||
/// This crate itself does not provide AST facilities directly,
|
/// layer an AST on top of the `SyntaxNode` API. Let's write a function to evaluate S-expressions.
|
||||||
/// but it is possible to layer AST on top of `SyntaxNode` API.
|
|
||||||
/// Let's write a function to evaluate S-expression.
|
|
||||||
///
|
///
|
||||||
/// For that, let's define AST nodes.
|
/// For that, let's define AST nodes.
|
||||||
/// It'll be quite a bunch of repetitive code, so we'll use a macro.
|
/// It'll be quite a bunch of repetitive code, so we'll use a macro.
|
||||||
///
|
/// For a real language, you may want to automatically generate the AST implementations with a task.
|
||||||
/// For a real language, you'd want to generate an AST. I find a
|
|
||||||
/// combination of `serde`, `ron` and `tera` crates invaluable for that!
|
|
||||||
macro_rules! ast_node {
|
macro_rules! ast_node {
|
||||||
($ast:ident, $kind:ident) => {
|
($ast:ident, $kind:ident) => {
|
||||||
#[derive(PartialEq, Eq, Hash)]
|
#[derive(PartialEq, Eq, Hash)]
|
||||||
|
@ -292,7 +282,7 @@ ast_node!(Root, ROOT);
|
||||||
ast_node!(Atom, ATOM);
|
ast_node!(Atom, ATOM);
|
||||||
ast_node!(List, LIST);
|
ast_node!(List, LIST);
|
||||||
|
|
||||||
// Sexp is slightly different, so let's do it by hand.
|
// Sexp is slightly different because it can be both an atom and a list, so let's do it by hand.
|
||||||
#[derive(PartialEq, Eq, Hash)]
|
#[derive(PartialEq, Eq, Hash)]
|
||||||
#[repr(transparent)]
|
#[repr(transparent)]
|
||||||
struct Sexp(SyntaxNode);
|
struct Sexp(SyntaxNode);
|
||||||
|
@ -319,8 +309,7 @@ impl Sexp {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Let's enhance AST nodes with ancillary functions and
|
// Let's enhance AST nodes with ancillary functions and eval.
|
||||||
// eval.
|
|
||||||
impl Root {
|
impl Root {
|
||||||
fn sexps(&self) -> impl Iterator<Item = Sexp> + '_ {
|
fn sexps(&self) -> impl Iterator<Item = Sexp> + '_ {
|
||||||
self.0.children().cloned().filter_map(Sexp::cast)
|
self.0.children().cloned().filter_map(Sexp::cast)
|
||||||
|
@ -413,9 +402,8 @@ nan
|
||||||
assert_eq!(res, vec![Some(92), Some(92), None, None, Some(92),])
|
assert_eq!(res, vec![Some(92), Some(92), None, None, Some(92),])
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Split the input string into a flat list of tokens
|
/// Split the input string into a flat list of tokens (such as L_PAREN, WORD, and WHITESPACE)
|
||||||
/// (such as L_PAREN, WORD, and WHITESPACE)
|
fn lex(text: &str) -> VecDeque<(SyntaxKind, &str)> {
|
||||||
fn lex(text: &str) -> Vec<(SyntaxKind, &str)> {
|
|
||||||
fn tok(t: SyntaxKind) -> m_lexer::TokenKind {
|
fn tok(t: SyntaxKind) -> m_lexer::TokenKind {
|
||||||
m_lexer::TokenKind(cstree::SyntaxKind::from(t).0)
|
m_lexer::TokenKind(cstree::SyntaxKind::from(t).0)
|
||||||
}
|
}
|
||||||
|
@ -445,6 +433,7 @@ fn lex(text: &str) -> Vec<(SyntaxKind, &str)> {
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.map(|t| (t.len, kind(t.kind)))
|
.map(|t| (t.len, kind(t.kind)))
|
||||||
.scan(0usize, |start_offset, (len, kind)| {
|
.scan(0usize, |start_offset, (len, kind)| {
|
||||||
|
// reconstruct the item's source text from offset and len
|
||||||
let s = &text[*start_offset..*start_offset + len];
|
let s = &text[*start_offset..*start_offset + len];
|
||||||
*start_offset += len;
|
*start_offset += len;
|
||||||
Some((kind, s))
|
Some((kind, s))
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue