From 16f7a3bd80b2057437adba8a7225c722c8a57c63 Mon Sep 17 00:00:00 2001 From: DQ Date: Fri, 7 Apr 2023 18:06:51 +0200 Subject: [PATCH] Set up a module structure (#44) --- .github/workflows/main.yml | 20 +- CHANGELOG.md | 47 ++- Cargo.toml | 40 +- README.md | 287 ++++++++++++- benches/main.rs | 33 +- examples/math.rs | 24 +- examples/readme.rs | 334 +++++++++++++++ examples/s_expressions.rs | 29 +- examples/salsa.rs | 50 +++ src/green.rs | 19 +- src/green/builder.rs | 81 ++-- src/green/element.rs | 10 +- src/green/interner.rs | 126 ------ src/green/iter.rs | 2 +- src/green/node.rs | 15 +- src/green/token.rs | 20 +- src/interning.rs | 205 ++++++++-- src/interning/default_interner.rs | 70 ++++ src/interning/lasso_compat.rs | 9 + src/interning/lasso_compat/token_interner.rs | 109 +++++ src/interning/lasso_compat/traits.rs | 166 ++++++++ src/interning/salsa_compat.rs | 228 +++++++++++ src/interning/traits.rs | 67 +++ src/lib.rs | 405 +++++++++++++++++-- src/serde_impls.rs | 23 +- src/syntax/element.rs | 32 +- src/syntax/iter.rs | 6 +- src/syntax/mod.rs | 1 + src/syntax/node.rs | 42 +- src/syntax/resolved.rs | 25 +- src/syntax/text.rs | 57 +-- src/syntax/token.rs | 38 +- src/utility_types.rs | 4 +- tests/it/basic.rs | 40 +- tests/it/main.rs | 44 +- tests/it/regressions.rs | 8 +- tests/it/sendsync.rs | 22 +- tests/it/serde.rs | 7 +- 38 files changed, 2291 insertions(+), 454 deletions(-) create mode 100644 examples/readme.rs create mode 100644 examples/salsa.rs delete mode 100644 src/green/interner.rs create mode 100644 src/interning/default_interner.rs create mode 100644 src/interning/lasso_compat.rs create mode 100644 src/interning/lasso_compat/token_interner.rs create mode 100644 src/interning/lasso_compat/traits.rs create mode 100644 src/interning/salsa_compat.rs create mode 100644 src/interning/traits.rs diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 6462f23..12a686b 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ 
-31,8 +31,19 @@ jobs: - uses: hecrj/setup-rust-action@v1 with: rust-version: ${{ matrix.rust }} - - run: cargo test --verbose --all-features - - run: cargo test --release --verbose --all-features + + - uses: actions-rs/cargo@v1 + with: + command: test + args: --all-targets --verbose + - uses: actions-rs/cargo@v1 + with: + command: test + args: --all-targets --verbose --all-features + - uses: actions-rs/cargo@v1 + with: + command: test + args: --all-targets --verbose --all-features --release check: name: Check @@ -48,6 +59,7 @@ jobs: - uses: actions-rs/cargo@v1 with: command: check + args: --all-targets --all-features clippy: name: Clippy @@ -79,11 +91,13 @@ jobs: name: Check doc links runs-on: ubuntu-latest env: - RUSTDOCFLAGS: -Dwarnings + RUSTDOCFLAGS: -Dwarnings --cfg doc_cfg steps: - uses: actions/checkout@v2 - uses: hecrj/setup-rust-action@v1 + with: + rust-version: nightly - run: cargo doc --all-features --document-private-items --no-deps miri-test: diff --git a/CHANGELOG.md b/CHANGELOG.md index c798315..caa6ac2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,7 +2,52 @@ ## `v0.12.0` + * Documentation has been improved in most areas, together with a switch to a more principled module structure that allows explicitly documenting submodules. + * The `interning` module has been rewritten. It now provides fuctions for obtaining a default interner (`new_interner` and `new_threaded_interner`) and provides a small, dependency-free interner implementation. + * Compatibility with other interners can be enable via feature flags. + * **Note** that compatibilty with `lasso` is not enabled by default. Use the `lasso_compat` feature to match the previous default. * Introduced `Language::static_text` to optimize tokens that always appear with the same text (estimated 10-15% faster tree building when used, depending on the ratio of static to dynamic tokens). 
* Since `cstree`s are lossless, `GreenNodeBuilder::token` must still be passed the source text even for static tokens. * Internal performance improvements for up to 10% faster tree building by avoiding unnecessary duplication of elements. - * Use `NonNull` for the internal representation of `SyntaxNode`, meaning it now benefits from niche optimizations (`Option` is now the same size as `SyntaxNode` itself: the size of a pointer). \ No newline at end of file + * Use `NonNull` for the internal representation of `SyntaxNode`, meaning it now benefits from niche optimizations (`Option` is now the same size as `SyntaxNode` itself: the size of a pointer). + * `SyntaxKind` has been renamed to `RawSyntaxKind` to no longer conflict with user-defined `SyntaxKind` enumerations. + * The crate's export module structure has been reorganized to give different groups of definitions their own submodules. A `cstree::prelude` module is available, containing the most commonly needed types that were previously accessible via `use cstree::*`. 
Otherwise, the module structure is now as follows: + * `cstree` + * `Language` + * `RawSyntaxKind` + * `build` + * `GreenNodeBuilder` + * `NodeCache` + * `Checkpoint` + * `green` + * `GreenNode` + * `GreenToken` + * `GreenNodeChildren` + * `syntax` + * `{Syntax,Resolved}Node` + * `{Syntax,Resolved}Token` + * `{Syntax,Resolved}Element` + * `{Syntax,Resolved}ElementRef` + * `SyntaxNodeChildren` + * `SyntaxElementChildren` + * `SyntaxText` + * `interning` + * `TokenKey` and the `InternKey` trait + * `Interner` and `Resolver` traits + * `new_interner` and `TokenInterner` + * `new_threaded_interner` and `MultiThreadedTokenInterner` (with the `multi_threaded_interning` feature enabled) + * compatibility implementations for interning crates depending on selected feature flags + * `text` + * `TextSize` + * `TextRange` + * `SyntaxText` (re-export) + * `traversal` + * `Direction` + * `WalkEvent` + * `util` + * `NodeOrToken` + * `TokenAtOffset` + * `sync` + * `Arc` + * `prelude` + * re-exports of the most-used items \ No newline at end of file diff --git a/Cargo.toml b/Cargo.toml index 9850143..96e64a7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,7 +1,7 @@ [package] edition = "2021" name = "cstree" -version = "0.11.1" +version = "0.12.0-rc.0" # when updating, also update `#![doc(html_root_url)]` authors = [ "Domenic Quirl ", "Aleksey Kladov ", @@ -15,7 +15,6 @@ readme = "README.md" debug = true [dependencies] -lasso = { version = "0.6", features = ["inline-more", "multi-threaded"] } text-size = "1.1.0" fxhash = "0.2.1" parking_lot = "0.11.2" @@ -24,6 +23,20 @@ parking_lot = "0.11.2" triomphe = "0.1.7" sptr = "0.3.2" +# Default Interner +indexmap = "1.9" + +[dependencies.lasso] +version = "0.6" +features = ["inline-more"] +optional = true + +[dependencies.salsa] +git = "https://github.com/salsa-rs/salsa/" +version = "0.1" +optional = true +package = "salsa-2022" + [dependencies.serde] version = "1.0" optional = true @@ -42,8 +55,25 @@ name = "main" harness = false [features] 
-default = [] -serialize = ["serde", "lasso/serialize"] +default = [] +# Implementations of `serde::{De,}Serialize` for CSTrees. +serialize = ["serde", "lasso?/serialize"] +# Interoperability with the `lasso` interning crate. +# When enabled, `cstree`'s default interners will use `lasso` internally, too. +lasso_compat = ["lasso"] +# Additionally provide threadsafe interner types. +# Where applicable (and if the corresponding features are selected), provide compatibility +# implementations for multi-thread interners from other crates. +multi_threaded_interning = ["lasso_compat", "lasso/multi-threaded"] +# Interoperability with the `salsa` framework for incremental computation. +# Use this feature for "Salsa 2022". +# WARNING: This feature is considered unstable! +salsa_2022_compat = ["salsa"] + +[[example]] +name = "salsa" +required-features = ["salsa_2022_compat"] [package.metadata.docs.rs] -features = ["serialize"] +all-features = true +rustdoc-args = ["--cfg", "doc_cfg"] diff --git a/README.md b/README.md index 36e1e61..44416ea 100644 --- a/README.md +++ b/README.md @@ -32,8 +32,291 @@ Notable differences of `cstree` compared to `rowan`: - Performance optimizations for tree traversal: persisting red nodes allows tree traversal methods to return references. You can still `clone` to obtain an owned node, but you only pay that cost when you need to. ## Getting Started -The main entry points for constructing syntax trees are `GreenNodeBuilder` and `SyntaxNode::new_root` for green and red trees respectively. -See `examples/s_expressions` for a guided tutorial to `cstree`. + +If you're looking at `cstree`, you're probably looking at or already writing a parser and are considering using +concrete syntax trees as its output. We'll talk more about parsing below -- first, let's have a look at what needs +to happen to go from input text to a `cstree` syntax tree: + + 1. 
Define an enumeration of the types of tokens (like keywords) and nodes (like "an expression") + that you want to have in your syntax and implement `Language` + + 2. Create a `GreenNodeBuilder` and call `start_node`, `token` and `finish_node` from your parser + + 3. Call `SyntaxNode::new_root` or `SyntaxNode::new_root_with_resolver` with the resulting + `GreenNode` to obtain a syntax tree that you can traverse + +Let's walk through the motions of parsing a (very) simple language into `cstree` syntax trees. +We'll just support addition and subtraction on integers, from which the user is allowed to construct a single, +compound expression. They will, however, be allowed to write nested expressions in parentheses, like `1 - (2 + 5)`. + +### Defining the language +First, we need to list the different part of our language's grammar. +We can do that using an `enum` with a unit variant for any terminal and non-terminal. +The `enum` needs to be convertible to a `u16`, so we use the `repr` attribute to ensure it uses the correct +representation. + +```rust +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +#[repr(u16)] +enum SyntaxKind { + /* Tokens */ + Int, // 42 + Plus, // + + Minus, // - + LParen, // ( + RParen, // ) + /* Nodes */ + Expr, + Root, +} +``` + +Most of these are tokens to lex the input string into, like numbers (`Int`) and operators (`Plus`, `Minus`). +We only really need one type of node; expressions. +Our syntax tree's root node will have the special kind `Root`, all other nodes will be +expressions containing a sequence of arithmetic operations potentially involving further, nested +expression nodes. + +To use our `SyntaxKind`s with `cstree`, we need to tell it how to convert it back to just a number (the +`#[repr(u16)]` that we added) by implementing the `Language` trait. We can also tell `cstree` about tokens that +always have the same text through the `static_text` method on the trait. 
This is useful for the operators and +parentheses, but not possible for numbers, since an integer token may be produced from the input `3`, but also from +other numbers like `7` or `12`. We implement `Language` on an empty type, just so we can give it a name. + +```rust +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct Calculator; + +impl Language for Calculator { + // The tokens and nodes we just defined + type Kind = SyntaxKind; + + fn kind_from_raw(raw: RawSyntaxKind) -> Self::Kind { + // This just needs to be the inverse of `kind_to_raw`, but could also + // be an `impl TryFrom for SyntaxKind` or any other conversion. + match raw.0 { + 0 => SyntaxKind::Int, + 1 => SyntaxKind::Plus, + 2 => SyntaxKind::Minus, + 3 => SyntaxKind::LParen, + 4 => SyntaxKind::RParen, + 5 => SyntaxKind::Expr, + 6 => SyntaxKind::Root, + n => panic!("Unknown raw syntax kind: {n}"), + } + } + + fn kind_to_raw(kind: Self::Kind) -> RawSyntaxKind { + RawSyntaxKind(kind as u16) + } + + fn static_text(kind: Self::Kind) -> Option<&'static str> { + match kind { + SyntaxKind::Plus => Some("+"), + SyntaxKind::Minus => Some("-"), + SyntaxKind::LParen => Some("("), + SyntaxKind::RParen => Some(")"), + _ => None, + } + } +} +``` + +### Parsing into a green tree +With that out of the way, we can start writing the parser for our expressions. 
+For the purposes of this introduction to `cstree`, I'll assume that there is a lexer that yields the following +tokens: + +```rust +#[derive(Debug, PartialEq, Eq, Clone, Copy)] +pub enum Token<'input> { + // Note that number strings are not yet parsed into actual numbers, + // we just remember the slice of the input that contains their digits + Int(&'input str), + Plus, + Minus, + LParen, + RParen, + // A special token that indicates that we have reached the end of the file + EoF, +} +``` + +A simple lexer that yields such tokens is part of the full `readme` example, but we'll be busy enough with the +combination of `cstree` and the actual parser, which we define like this: + +```rust +pub struct Parser<'input> { + // `Peekable` is a standard library iterator adapter that allows + // looking ahead at the next item without removing it from the iterator yet + lexer: Peekable>, + builder: GreenNodeBuilder<'static, 'static, Calculator>, +} + +impl<'input> Parser<'input> { + pub fn new(input: &'input str) -> Self { + Self { + // we get `peekable` from implementing `Iterator` on `Lexer` + lexer: Lexer::new(input).peekable(), + builder: GreenNodeBuilder::new(), + } + } + + pub fn bump(&mut self) -> Option> { + self.lexer.next() + } +} +``` + +In contrast to parsers that return abstract syntax trees, with `cstree` the syntax tree nodes for +all element in the language grammar will have the same type: `GreenNode` for the inner ("green") +tree and `SyntaxNode` for the outer ("red") tree. Different kinds of nodes (and tokens) are +differentiated by their `SyntaxKind` tag, which we defined above. + +You can implement many types of parsers with `cstree`. To get a feel for how it works, consider +a typical recursive descent parser. With a more traditional AST, one would define different AST +structs for struct or function definitions, statements, expressions and so on. 
Inside the +parser, the components of any element, such as all fields of a struct or all statements inside a +function, are parsed first and then the parser wraps them in the matching AST type, which is +returned from the corresponding parser function. + +Because `cstree`'s syntax trees are untyped, there is no explicit AST representation that the parser +would build. Instead, parsing into a CST using the `GreenNodeBuilder` follows the source code more +closely in that you tell `cstree` about each new element you enter and all tokens that the parser +consumes. So, for example, to parse a struct definition the parser first "enters" the struct +definition node, then parses the `struct` keyword and type name, then parses each field, and finally +"finishes" parsing the struct node. + +The most trivial example is the root node for our parser, which just creates a root node +containing the whole expression (we could do without a specific root node if any expression was +a node, in particular if we wrapped integer literal tokens inside `Expr` nodes). + +```rust +pub fn parse(&mut self) -> Result<(), String> { + self.builder.start_node(SyntaxKind::Root); + self.parse_expr()?; + self.builder.finish_node(); + Ok(()) +} +``` + +As there isn't a static AST type to return, the parser is very flexible as to what is part of a +node. In the previous example, if the user is adding a new field to the struct and has not yet +typed the field's type, the CST node for the struct doesn't care if there is no child node for +it. Similarly, if the user is deleting fields and the source code currently contains a leftover +field name, this additional identifier can be a part of the struct node without any +modifications to the syntax tree definition. This property is the key to why CSTs are such a +good fit as a lossless input representation, which necessitates the syntax tree to mirror the +user-specific layout of whitespace and comments around the AST items. 
+ +In the parser for our simple expression language, we'll also have to deal with the fact that, +when we see a number the parser doesn't yet know whether there will be additional operations +following that number. That is, in the expression `1 + 2`, it can only know that it is parsing +a binary operation once it sees the `+`. The event-like model of building trees in `cstree`, +however, implies that when reaching the `+`, the parser would have to have already entered an +expression node in order for the whole input to be part of the expression. + +To get around this, `GreenNodeBuilder` provides the `checkpoint` method, which we can call to +"remember" the current position in the input. For example, we can create a checkpoint before the +parser parses the first `1`. Later, when it sees the following `+`, it can create an `Expr` node +for the whole expression using `start_node_at`: + +```rust +fn parse_lhs(&mut self) -> Result<(), String> { + // An expression may start either with a number, or with an opening parenthesis that is + // the start of a parenthesized expression + let next_token = *self.lexer.peek().unwrap(); + match next_token { + Token::Int(n) => { + self.bump(); + self.builder.token(SyntaxKind::Int, n); + } + Token::LParen => { + // Wrap the grouped expression inside a node containing it and its parentheses + self.builder.start_node(SyntaxKind::Expr); + self.bump(); + self.builder.static_token(SyntaxKind::LParen); + self.parse_expr()?; // Inner expression + if self.bump() != Some(Token::RParen) { + return Err("Missing ')'".to_string()); + } + self.builder.static_token(SyntaxKind::RParen); + self.builder.finish_node(); + } + Token::EoF => return Err("Unexpected end of file: expected expression".to_string()), + t => return Err(format!("Unexpected start of expression: '{t:?}'")), + } + Ok(()) +} + +fn parse_expr(&mut self) -> Result<(), String> { + // Remember our current position + let before_expr = self.builder.checkpoint(); + + // Parse the start of 
the expression + self.parse_lhs()?; + + // Check if the expression continues with `+ ` or `- ` + let Some(next_token) = self.lexer.peek() else { + return Ok(()); + }; + let op = match *next_token { + Token::Plus => SyntaxKind::Plus, + Token::Minus => SyntaxKind::Minus, + Token::RParen | Token::EoF => return Ok(()), + t => return Err(format!("Expected operator, found '{t:?}'")), + }; + + // If so, retroactively wrap the (already parsed) LHS and the following RHS + // inside an `Expr` node + self.builder.start_node_at(before_expr, SyntaxKind::Expr); + self.bump(); + self.builder.static_token(op); + self.parse_expr()?; // RHS + self.builder.finish_node(); + Ok(()) +} +``` + +### Obtaining the parser result + +Our parser is now capable of parsing our little arithmetic language, but it's methods don't return +anything. So how do we get our syntax tree out? The answer lies in `GreenNodeBuilder::finish`, which +finally returns the tree that we have painstakingly constructed. + +```rust +impl Parser<'_> { + pub fn finish(mut self) -> (GreenNode, impl Interner) { + assert!(self.lexer.next().map(|t| t == Token::EoF).unwrap_or(true)); + let (tree, cache) = self.builder.finish(); + (tree, cache.unwrap().into_interner().unwrap()) + } +} +``` + +`finish` also returns the cache it used to deduplicate tree nodes and tokens, so you can re-use it +for parsing related inputs (e.g., different source files from the same crate may share a lot of +common function and type names that can be deduplicated). See `GreenNodeBuilder`'s documentation for +more information on this, in particular the `with_cache` and `from_cache` methods. Most importantly +for us, we can extract the `Interner` that contains the source text of the tree's tokens from the +cache, which we need if we want to look up things like variable names or the value of numbers for +our calculator. + +To work with the syntax tree, you'll want to upgrade it to a `SyntaxNode` using +`SyntaxNode::new_root`. 
You can also use `SyntaxNode::new_root_with_resolver` to combine tree and +interner, which lets you directly retrieve source text and makes the nodes implement `Display` and +`Debug`. The same output can be produced from `SyntaxNode`s by calling the `debug` or `display` +method with a `Resolver`. To visualize the whole syntax tree, pass `true` for the `recursive` +parameter on `debug`, or simply debug-print a `ResolvedNode`: + +```rust +let input = "11 + 2-(5 + 4)"; +let mut parser = Parser::new(input); +parser.parse().unwrap(); +let (tree, interner) = parser.finish(); +let root = SyntaxNode::::new_root_with_resolver(tree, interner); +dbg!(root); +``` ## AST Layer While `cstree` is built for concrete syntax trees, applications are quite easily able to work with either a CST or an AST representation, or freely switch between them. diff --git a/benches/main.rs b/benches/main.rs index 2ce7cf3..f1373b5 100644 --- a/benches/main.rs +++ b/benches/main.rs @@ -1,6 +1,10 @@ use criterion::{black_box, criterion_group, criterion_main, Criterion, Throughput}; -use cstree::*; -use lasso::{Interner, Rodeo}; +use cstree::{ + build::*, + green::GreenNode, + interning::{new_interner, Interner}, + Language, RawSyntaxKind, +}; use std::{fmt, hash::Hash}; #[derive(Debug)] @@ -40,7 +44,7 @@ impl Bool for UseStaticText { impl Language for TestLang { type Kind = TestKind; - fn kind_from_raw(raw: SyntaxKind) -> Self::Kind { + fn kind_from_raw(raw: RawSyntaxKind) -> Self::Kind { if raw.0 == u16::MAX - 1 { TestKind::Plus } else { @@ -48,10 +52,10 @@ impl Language for TestLang { } } - fn kind_to_raw(kind: Self::Kind) -> SyntaxKind { + fn kind_to_raw(kind: Self::Kind) -> RawSyntaxKind { match kind { - TestKind::Element { n } => SyntaxKind(n), - TestKind::Plus => SyntaxKind(u16::MAX - 1), + TestKind::Element { n } => RawSyntaxKind(n), + TestKind::Plus => RawSyntaxKind(u16::MAX - 1), } } @@ -67,7 +71,7 @@ impl Language for TestLang { } } -pub fn build_tree_with_cache<'c, 'i, T: Bool, I>(root: 
&Element<'_>, cache: &'c mut NodeCache<'i, I>) -> GreenNode +pub fn build_tree_with_cache(root: &Element<'_>, cache: &mut NodeCache<'_, I>) -> GreenNode where I: Interner, { @@ -78,9 +82,9 @@ where node } -pub fn build_recursive<'c, 'i, T: Bool, I>( +pub fn build_recursive( root: &Element<'_>, - builder: &mut GreenNodeBuilder<'c, 'i, TestLang, I>, + builder: &mut GreenNodeBuilder<'_, '_, TestLang, I>, mut from: u16, ) -> u16 where @@ -95,7 +99,7 @@ where builder.finish_node(); } Element::Token(text) => { - builder.token(TestKind::Element { n: from }, *text); + builder.token(TestKind::Element { n: from }, text); } Element::Plus => { builder.token(TestKind::Plus, "+"); @@ -114,10 +118,15 @@ fn two_level_tree() -> Element<'static> { } pub fn create(c: &mut Criterion) { - let mut group = c.benchmark_group("two-level tree"); + #[cfg(not(feature = "lasso_compat"))] + const GROUP_NAME: &str = "two-level tree (default interner)"; + #[cfg(feature = "lasso_compat")] + const GROUP_NAME: &str = "two-level tree (lasso)"; + + let mut group = c.benchmark_group(GROUP_NAME); group.throughput(Throughput::Elements(1)); - let mut interner = Rodeo::new(); + let mut interner = new_interner(); let mut cache = NodeCache::with_interner(&mut interner); let tree = two_level_tree(); diff --git a/examples/math.rs b/examples/math.rs index ad0ce2d..e2346ac 100644 --- a/examples/math.rs +++ b/examples/math.rs @@ -13,10 +13,7 @@ //! - "+" Token(Add) //! 
- "4" Token(Number) -use cstree::{ - interning::{IntoResolver, Resolver}, - GreenNodeBuilder, NodeOrToken, -}; +use cstree::{build::GreenNodeBuilder, interning::Resolver, util::NodeOrToken}; use std::iter::Peekable; #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] @@ -36,7 +33,7 @@ enum SyntaxKind { } use SyntaxKind::*; -impl From for cstree::SyntaxKind { +impl From for cstree::RawSyntaxKind { fn from(kind: SyntaxKind) -> Self { Self(kind as u16) } @@ -47,12 +44,12 @@ enum Lang {} impl cstree::Language for Lang { type Kind = SyntaxKind; - fn kind_from_raw(raw: cstree::SyntaxKind) -> Self::Kind { + fn kind_from_raw(raw: cstree::RawSyntaxKind) -> Self::Kind { assert!(raw.0 <= Root as u16); unsafe { std::mem::transmute::(raw.0) } } - fn kind_to_raw(kind: Self::Kind) -> cstree::SyntaxKind { + fn kind_to_raw(kind: Self::Kind) -> cstree::RawSyntaxKind { kind.into() } @@ -67,12 +64,12 @@ impl cstree::Language for Lang { } } -type SyntaxNode = cstree::SyntaxNode; +type SyntaxNode = cstree::syntax::SyntaxNode; #[allow(unused)] -type SyntaxToken = cstree::SyntaxToken; +type SyntaxToken = cstree::syntax::SyntaxToken; #[allow(unused)] -type SyntaxElement = cstree::NodeOrToken; -type SyntaxElementRef<'a> = cstree::NodeOrToken<&'a SyntaxNode, &'a SyntaxToken>; +type SyntaxElement = cstree::util::NodeOrToken; +type SyntaxElementRef<'a> = cstree::util::NodeOrToken<&'a SyntaxNode, &'a SyntaxToken>; struct Parser<'input, I: Iterator> { builder: GreenNodeBuilder<'static, 'static, Lang>, @@ -128,10 +125,7 @@ impl<'input, I: Iterator> Parser<'input, I> { self.builder.finish_node(); let (tree, cache) = self.builder.finish(); - ( - SyntaxNode::new_root(tree), - cache.unwrap().into_interner().unwrap().into_resolver(), - ) + (SyntaxNode::new_root(tree), cache.unwrap().into_interner().unwrap()) } } diff --git a/examples/readme.rs b/examples/readme.rs new file mode 100644 index 0000000..6e3890d --- /dev/null +++ b/examples/readme.rs @@ -0,0 +1,334 @@ +use std::{io::Write, 
iter::Peekable}; + +use cstree::{ + interning::Interner, + prelude::*, + syntax::{ResolvedElementRef, ResolvedNode}, +}; + +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +#[repr(u16)] +pub enum SyntaxKind { + /* Tokens */ + Int, // 42 + Plus, // + + Minus, // - + LParen, // ( + RParen, // ) + /* Nodes */ + Expr, + Root, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct Calculator; +impl Language for Calculator { + // The tokens and nodes we just defined + type Kind = SyntaxKind; + + fn kind_from_raw(raw: RawSyntaxKind) -> Self::Kind { + // This just needs to be the inverse of `kind_to_raw`, but could also + // be an `impl TryFrom for SyntaxKind` or any other conversion. + match raw.0 { + 0 => SyntaxKind::Int, + 1 => SyntaxKind::Plus, + 2 => SyntaxKind::Minus, + 3 => SyntaxKind::LParen, + 4 => SyntaxKind::RParen, + 5 => SyntaxKind::Expr, + 6 => SyntaxKind::Root, + n => panic!("Unknown raw syntax kind: {n}"), + } + } + + fn kind_to_raw(kind: Self::Kind) -> RawSyntaxKind { + RawSyntaxKind(kind as u16) + } + + fn static_text(kind: Self::Kind) -> Option<&'static str> { + match kind { + SyntaxKind::Plus => Some("+"), + SyntaxKind::Minus => Some("-"), + SyntaxKind::LParen => Some("("), + SyntaxKind::RParen => Some(")"), + _ => None, + } + } +} + +#[derive(Debug, PartialEq, Eq, Clone, Copy)] +pub enum Token<'input> { + Int(&'input str), + Plus, + Minus, + LParen, + RParen, + EoF, +} + +pub struct Lexer<'input> { + input: &'input str, + at_eof: bool, +} + +impl<'input> Lexer<'input> { + pub fn new(input: &'input str) -> Self { + Self { input, at_eof: false } + } + + fn next_token(&mut self) -> Result, String> { + loop { + let Some(next_char) = self.input.chars().next() else { + self.at_eof = true; + return Ok(Token::EoF); + }; + + let token = match next_char { + '+' => Token::Plus, + '-' => Token::Minus, + '(' => Token::LParen, + ')' => Token::RParen, + c if c.is_ascii_digit() => { + let (last_digit_idx, _char) = 
self + .input + .char_indices() + .take_while(|(_idx, c)| c.is_ascii_digit()) + .last() + .expect("matched at least one"); + // Advance lexer + let number = Token::Int(&self.input[..=last_digit_idx]); + self.input = &self.input[(last_digit_idx + 1)..]; + return Ok(number); + } + c if c.is_whitespace() => { + // Skip whitespace + let (last_ws_idx, _char) = self + .input + .char_indices() + .take_while(|(_idx, c)| c.is_whitespace()) + .last() + .expect("matched at least one"); + // Advance lexer + self.input = &self.input[(last_ws_idx + 1)..]; + continue; + } + c => return Err(format!("Unknown start of token: '{c}'")), + }; + + // Advance lexer + self.input = &self.input[1..]; + return Ok(token); + } + } +} + +impl<'input> Iterator for Lexer<'input> { + type Item = Token<'input>; + + fn next(&mut self) -> Option { + if self.at_eof { + None + } else { + Some(self.next_token().expect("Failed to lex input")) + } + } +} + +pub struct Parser<'input> { + lexer: Peekable>, + builder: GreenNodeBuilder<'static, 'static, Calculator>, +} + +impl<'input> Parser<'input> { + pub fn new(input: &'input str) -> Self { + Self { + lexer: Lexer::new(input).peekable(), + builder: GreenNodeBuilder::new(), + } + } + + pub fn bump(&mut self) -> Option> { + self.lexer.next() + } + + pub fn parse(&mut self) -> Result<(), String> { + self.builder.start_node(SyntaxKind::Root); + self.parse_expr()?; + self.builder.finish_node(); + Ok(()) + } + + fn parse_lhs(&mut self) -> Result<(), String> { + // An expression may start either with a number, or with an opening parenthesis that is the start of a + // parenthesized expression + let next_token = *self.lexer.peek().unwrap(); + match next_token { + Token::Int(n) => { + self.bump(); + self.builder.token(SyntaxKind::Int, n); + } + Token::LParen => { + // Wrap the grouped expression inside a node containing it and its parentheses + self.builder.start_node(SyntaxKind::Expr); + self.bump(); + self.builder.static_token(SyntaxKind::LParen); + 
self.parse_expr()?; // Inner expression + if self.bump() != Some(Token::RParen) { + return Err("Missing ')'".to_string()); + } + self.builder.static_token(SyntaxKind::RParen); + self.builder.finish_node(); + } + Token::EoF => return Err("Unexpected end of file: expected expression".to_string()), + t => return Err(format!("Unexpected start of expression: '{t:?}'")), + } + Ok(()) + } + + fn parse_expr(&mut self) -> Result<(), String> { + // Remember our current position + let before_expr = self.builder.checkpoint(); + + // Parse the start of the expression + self.parse_lhs()?; + + // Check if the expression continues with `+ ` or `- ` + let Some(next_token) = self.lexer.peek() else { + return Ok(()); + }; + let op = match *next_token { + Token::Plus => SyntaxKind::Plus, + Token::Minus => SyntaxKind::Minus, + Token::RParen | Token::EoF => return Ok(()), + t => return Err(format!("Expected operator, found '{t:?}'")), + }; + + // If so, retroactively wrap the (already parsed) LHS and the following RHS inside an `Expr` node + self.builder.start_node_at(before_expr, SyntaxKind::Expr); + self.bump(); + self.builder.static_token(op); + self.parse_expr()?; // RHS + self.builder.finish_node(); + Ok(()) + } + + pub fn finish(mut self) -> (GreenNode, impl Interner) { + assert!(self.lexer.next().map(|t| t == Token::EoF).unwrap_or(true)); + let (tree, cache) = self.builder.finish(); + (tree, cache.unwrap().into_interner().unwrap()) + } +} + +fn main() { + use std::io; + + let mut buf = String::new(); + loop { + print!("Enter expression: "); + io::stdout().flush().unwrap(); + buf.clear(); + if let Err(e) = io::stdin().read_line(&mut buf) { + eprintln!("Error reading input: {e}"); + continue; + } + let mut parser = Parser::new(&buf); + if let Err(e) = parser.parse() { + eprintln!("Parse error: {e}"); + continue; + } + + let (tree, interner) = parser.finish(); + let root = SyntaxNode::::new_root_with_resolver(tree, interner); + + if let Some(expr) = root.first_child_or_token() { + 
let result = eval_elem(expr, &mut root.children_with_tokens()); + println!("Result: {result}"); + } + } +} + +fn eval(expr: &ResolvedNode) -> i64 { + let mut children = expr.children_with_tokens(); + let lhs = eval_elem(children.next().expect("empty expr"), &mut children); + let Some(op) = children.next().map(|elem| elem.kind()) else { + // Literal expression + return lhs; + }; + let rhs = eval_elem(children.next().expect("missing RHS"), &mut children); + + match op { + SyntaxKind::Plus => lhs + rhs, + SyntaxKind::Minus => lhs - rhs, + _ => unreachable!("invalid op"), + } +} + +fn eval_elem<'e>( + expr: ResolvedElementRef<'_, Calculator>, + children: &mut impl Iterator>, +) -> i64 { + use cstree::util::NodeOrToken; + + match expr { + NodeOrToken::Node(n) => { + assert_eq!(n.kind(), SyntaxKind::Expr); + eval(n) + } + NodeOrToken::Token(t) => match t.kind() { + SyntaxKind::Int => { + let number_str = t.text(); + number_str.parse().expect("parsed int could not be evaluated") + } + SyntaxKind::LParen => { + let inner = children.next().expect("missing content inside parens"); + // It's important that we consume the `)` here, as otherwise `eval` might mistake it for an operator + assert_eq!( + children + .next() + .and_then(|elem| elem.into_token()) + .map(|token| token.kind()), + Some(SyntaxKind::RParen) + ); + eval_elem(inner, children) + } + _ => unreachable!("invalid start of expression"), + }, + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn lex() { + let input = "11 + 2-(5 + 4)"; + let lexer = Lexer::new(input); + let tokens: Vec<_> = lexer.into_iter().collect(); + assert_eq!( + tokens, + vec![ + Token::Int("11"), + Token::Plus, + Token::Int("2"), + Token::Minus, + Token::LParen, + Token::Int("5"), + Token::Plus, + Token::Int("4"), + Token::RParen, + Token::EoF + ] + ); + } + + #[test] + fn parse() { + let input = "11 + 2-(5 + 4)"; + let mut parser = Parser::new(input); + parser.parse().unwrap(); + let (tree, interner) = parser.finish(); + let 
root = SyntaxNode::::new_root_with_resolver(tree, interner); + dbg!(root); + } +} diff --git a/examples/s_expressions.rs b/examples/s_expressions.rs index dbe8128..bed6832 100644 --- a/examples/s_expressions.rs +++ b/examples/s_expressions.rs @@ -30,7 +30,7 @@ use SyntaxKind::*; /// in order to not need the user's `enum SyntaxKind` as a type parameter. /// /// First, to easily pass the enum variants into cstree via `.into()`: -impl From for cstree::SyntaxKind { +impl From for cstree::RawSyntaxKind { fn from(kind: SyntaxKind) -> Self { Self(kind as u16) } @@ -44,12 +44,12 @@ pub enum Lang {} impl cstree::Language for Lang { type Kind = SyntaxKind; - fn kind_from_raw(raw: cstree::SyntaxKind) -> Self::Kind { + fn kind_from_raw(raw: cstree::RawSyntaxKind) -> Self::Kind { assert!(raw.0 <= Root as u16); unsafe { std::mem::transmute::(raw.0) } } - fn kind_to_raw(kind: Self::Kind) -> cstree::SyntaxKind { + fn kind_to_raw(kind: Self::Kind) -> cstree::RawSyntaxKind { kind.into() } @@ -66,14 +66,11 @@ impl cstree::Language for Lang { /// offsets and parent pointers. /// cstree also deduplicates the actual source string in addition to the tree nodes, so we will need /// the Resolver to get the real text back from the interned representation. -use cstree::{ - interning::{IntoResolver, Resolver}, - GreenNode, Language, -}; +use cstree::{green::GreenNode, interning::Resolver, Language}; /// You can construct GreenNodes by hand, but a builder is helpful for top-down parsers: it maintains /// a stack of currently in-progress nodes. -use cstree::GreenNodeBuilder; +use cstree::build::GreenNodeBuilder; /// The parse results are stored as a "green tree". /// We'll discuss how to work with the results later. 
@@ -135,7 +132,7 @@ fn parse(text: &str) -> Parse { let (tree, cache) = self.builder.finish(); Parse { green_node: tree, - resolver: cache.unwrap().into_interner().unwrap().into_resolver(), + resolver: cache.unwrap().into_interner().unwrap(), errors: self.errors, } } @@ -213,11 +210,11 @@ fn parse(text: &str) -> Parse { /// To work with the parse results we need a view into the green tree - the syntax tree. /// It is also immutable, like a GreenNode, but it contains parent pointers, offsets, and has /// identity semantics. -type SyntaxNode = cstree::SyntaxNode; +type SyntaxNode = cstree::syntax::SyntaxNode; #[allow(unused)] -type SyntaxToken = cstree::SyntaxToken; +type SyntaxToken = cstree::syntax::SyntaxToken; #[allow(unused)] -type SyntaxElement = cstree::SyntaxElement; +type SyntaxElement = cstree::syntax::SyntaxElement; impl Parse { fn syntax(&self) -> SyntaxNode { @@ -355,8 +352,10 @@ impl ast::Atom { } fn text<'r>(&self, resolver: &'r impl Resolver) -> &'r str { - match &self.0.green().children().next() { - Some(cstree::NodeOrToken::Token(token)) => Lang::static_text(Lang::kind_from_raw(token.kind())) + use cstree::util::NodeOrToken; + + match self.0.green().children().next() { + Some(NodeOrToken::Token(token)) => Lang::static_text(Lang::kind_from_raw(token.kind())) .or_else(|| token.text(resolver)) .unwrap(), _ => unreachable!(), @@ -422,7 +421,7 @@ nan /// Split the input string into a flat list of tokens (such as L_PAREN, WORD, and WHITESPACE) fn lex(text: &str) -> VecDeque<(SyntaxKind, &str)> { fn tok(t: SyntaxKind) -> m_lexer::TokenKind { - m_lexer::TokenKind(cstree::SyntaxKind::from(t).0) + m_lexer::TokenKind(cstree::RawSyntaxKind::from(t).0) } fn kind(t: m_lexer::TokenKind) -> SyntaxKind { match t.0 { diff --git a/examples/salsa.rs b/examples/salsa.rs new file mode 100644 index 0000000..110133e --- /dev/null +++ b/examples/salsa.rs @@ -0,0 +1,50 @@ +#![cfg(feature = "salsa_2022_compat")] + +use cstree::{build::GreenNodeBuilder, 
impl_cstree_interning_for_salsa}; + +#[salsa::jar(db = Db)] +pub struct Jar(crate::SourceId); + +pub trait Db: salsa::DbWithJar {} +impl Db for DB where DB: ?Sized + salsa::DbWithJar {} + +#[salsa::interned] +pub struct SourceId { + #[return_ref] + pub text: String, +} + +#[derive(Default)] +#[salsa::db(crate::Jar)] +struct Database { + storage: salsa::Storage, +} + +impl salsa::Database for Database {} + +impl_cstree_interning_for_salsa!(impl Interning for Database => text as SourceId); + +use cstree::{syntax::SyntaxNode, testing::*}; + +fn main() { + let db = Database::default(); + let interned = SourceId::new(&db, "foo".to_string()); + let original = interned.text(&db); + assert_eq!(original, "foo"); + + let interner = db.as_interner(); + let mut shared_interner = &interner; + let mut builder: GreenNodeBuilder = GreenNodeBuilder::with_interner(&mut shared_interner); + let (tree, _no_interner_because_it_was_borrowed) = { + builder.start_node(TestSyntaxKind::Plus); + builder.token(TestSyntaxKind::Float, "2.05"); + builder.token(TestSyntaxKind::Whitespace, " "); + builder.token(TestSyntaxKind::Plus, "+"); + builder.token(TestSyntaxKind::Whitespace, " "); + builder.token(TestSyntaxKind::Float, "7.32"); + builder.finish_node(); + builder.finish() + }; + let tree: SyntaxNode = SyntaxNode::new_root(tree); + assert_eq!(tree.resolve_text(shared_interner), "2.05 + 7.32"); +} diff --git a/src/green.rs b/src/green.rs index ac5cf50..9489be4 100644 --- a/src/green.rs +++ b/src/green.rs @@ -1,10 +1,9 @@ //! Implementation of the inner, "green" tree. -//! The [`GreenNodeBuilder`] is the main entry point to constructing [`GreenNode`]s and -//! [`GreenToken`]s. +//! The [`GreenNodeBuilder`](crate::build::GreenNodeBuilder) from the [`build` module](crate::build) is the main entry +//! point to constructing [`GreenNode`]s and [`GreenToken`]s. 
-mod builder; +pub(super) mod builder; mod element; -mod interner; mod iter; mod node; mod token; @@ -12,17 +11,7 @@ mod token; pub(crate) use self::element::GreenElementRef; use self::element::{GreenElement, PackedGreenElement}; -pub use self::{ - builder::{Checkpoint, GreenNodeBuilder, NodeCache}, - interner::TokenInterner, - iter::GreenNodeChildren, - node::GreenNode, - token::GreenToken, -}; - -/// SyntaxKind is a type tag for each token or node. -#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] -pub struct SyntaxKind(pub u16); +pub use self::{iter::GreenNodeChildren, node::GreenNode, token::GreenToken}; #[cfg(test)] mod tests { diff --git a/src/green/builder.rs b/src/green/builder.rs index dcfb939..48ab437 100644 --- a/src/green/builder.rs +++ b/src/green/builder.rs @@ -4,10 +4,11 @@ use fxhash::{FxHashMap, FxHasher32}; use text_size::TextSize; use crate::{ - green::{interner::TokenInterner, GreenElement, GreenNode, GreenToken, SyntaxKind}, - interning::{Interner, Key}, + green::{GreenElement, GreenNode, GreenToken}, + interning::{new_interner, Interner, TokenInterner, TokenKey}, + util::NodeOrToken, utility_types::MaybeOwned, - Language, NodeOrToken, + Language, RawSyntaxKind, }; use super::{node::GreenNodeHead, token::GreenTokenData}; @@ -35,6 +36,8 @@ impl NodeCache<'static> { /// # Examples /// ``` /// # use cstree::testing::{*, Language as _}; + /// use cstree::build::NodeCache; + /// /// // Build a tree /// let mut cache = NodeCache::new(); /// let mut builder: GreenNodeBuilder = GreenNodeBuilder::with_cache(&mut cache); @@ -53,7 +56,7 @@ impl NodeCache<'static> { Self { nodes: FxHashMap::default(), tokens: FxHashMap::default(), - interner: MaybeOwned::Owned(TokenInterner::new()), + interner: MaybeOwned::Owned(new_interner()), } } } @@ -66,19 +69,21 @@ impl Default for NodeCache<'static> { impl<'i, I> NodeCache<'i, I> where - I: Interner, + I: Interner, { /// Constructs a new, empty cache that will use the given interner to deduplicate 
source text /// (strings) across tokens. /// # Examples /// ``` /// # use cstree::testing::{*, Language as _}; - /// use lasso::Rodeo; + /// # use cstree::interning::*; + /// use cstree::build::NodeCache; /// - /// // Create the builder from a custom `Rodeo` - /// let mut interner = Rodeo::new(); + /// // Create the builder from a custom interner + /// let mut interner = new_interner(); /// let mut cache = NodeCache::with_interner(&mut interner); - /// let mut builder: GreenNodeBuilder = GreenNodeBuilder::with_cache(&mut cache); + /// let mut builder: GreenNodeBuilder = + /// GreenNodeBuilder::with_cache(&mut cache); /// /// // Construct the tree /// # builder.start_node(Root); @@ -107,12 +112,14 @@ where /// # Examples /// ``` /// # use cstree::testing::{*, Language as _}; - /// use lasso::Rodeo; + /// # use cstree::interning::*; + /// use cstree::build::NodeCache; /// - /// // Create the builder from a custom `Rodeo` - /// let mut interner = Rodeo::new(); + /// // Create the builder from a custom interner + /// let mut interner = new_interner(); /// let cache = NodeCache::from_interner(interner); - /// let mut builder: GreenNodeBuilder = GreenNodeBuilder::from_cache(cache); + /// let mut builder: GreenNodeBuilder = + /// GreenNodeBuilder::from_cache(cache); /// /// // Construct the tree /// # builder.start_node(Root); @@ -142,22 +149,23 @@ where /// See also [`interner_mut`](NodeCache::interner_mut). #[inline] pub fn interner(&self) -> &I { - &*self.interner + &self.interner } /// Get a mutable reference to the interner used to deduplicate source text (strings). 
/// # Examples /// ``` /// # use cstree::*; + /// # use cstree::build::*; /// # use cstree::interning::*; /// let mut cache = NodeCache::new(); /// let interner = cache.interner_mut(); /// let key = interner.get_or_intern("foo"); - /// assert_eq!(interner.resolve(&key), "foo"); + /// assert_eq!(interner.resolve(key), "foo"); /// ``` #[inline] pub fn interner_mut(&mut self) -> &mut I { - &mut *self.interner + &mut self.interner } /// If this node cache was constructed with [`new`](NodeCache::new) or @@ -196,7 +204,7 @@ where } #[inline(always)] - fn intern(&mut self, text: &str) -> Key { + fn intern(&mut self, text: &str) -> TokenKey { self.interner.get_or_intern(text) } @@ -205,7 +213,7 @@ where #[inline] fn get_cached_node( &mut self, - kind: SyntaxKind, + kind: RawSyntaxKind, children: std::vec::Drain<'_, GreenElement>, text_len: TextSize, child_hash: u32, @@ -221,7 +229,7 @@ where .clone() } - fn token(&mut self, kind: L::Kind, text: Option, len: u32) -> GreenToken { + fn token(&mut self, kind: L::Kind, text: Option, len: u32) -> GreenToken { let text_len = TextSize::from(len); let kind = L::kind_to_raw(kind); let data = GreenTokenData { kind, text, text_len }; @@ -246,7 +254,6 @@ pub struct Checkpoint(usize); /// # Examples /// ``` /// # use cstree::testing::{*, Language as _}; -/// # use cstree::interning::IntoResolver; /// // Build a tree /// let mut builder: GreenNodeBuilder = GreenNodeBuilder::new(); /// builder.start_node(Root); @@ -258,7 +265,7 @@ pub struct Checkpoint(usize); /// assert_eq!(tree.kind(), MyLanguage::kind_to_raw(Root)); /// let int = tree.children().next().unwrap(); /// assert_eq!(int.kind(), MyLanguage::kind_to_raw(Int)); -/// let resolver = cache.unwrap().into_interner().unwrap().into_resolver(); +/// let resolver = cache.unwrap().into_interner().unwrap(); /// assert_eq!(int.as_token().unwrap().text(&resolver), Some("42")); /// ``` #[derive(Debug)] @@ -288,7 +295,7 @@ impl Default for GreenNodeBuilder<'static, 'static, L> { impl<'cache, 
'interner, L, I> GreenNodeBuilder<'cache, 'interner, L, I> where L: Language, - I: Interner, + I: Interner, { /// Reusing a [`NodeCache`] between multiple builders saves memory, as it allows to structurally /// share underlying trees. @@ -306,6 +313,7 @@ where /// # Examples /// ``` /// # use cstree::testing::{*, Language as _}; + /// # use cstree::build::*; /// // Construct a builder from our own cache /// let cache = NodeCache::new(); /// let mut builder: GreenNodeBuilder = GreenNodeBuilder::from_cache(cache); @@ -358,7 +366,7 @@ where /// See also [`interner_mut`](GreenNodeBuilder::interner_mut). #[inline] pub fn interner(&self) -> &I { - &*self.cache.interner + &self.cache.interner } /// Get a mutable reference to the interner used to deduplicate source text (strings). @@ -367,20 +375,19 @@ where /// # Examples /// ``` /// # use cstree::testing::*; + /// # use cstree::build::*; /// # use cstree::interning::*; /// let mut builder: GreenNodeBuilder = GreenNodeBuilder::new(); /// let interner = builder.interner_mut(); /// let key = interner.get_or_intern("foo"); - /// assert_eq!(interner.resolve(&key), "foo"); + /// assert_eq!(interner.resolve(key), "foo"); /// ``` #[inline] pub fn interner_mut(&mut self) -> &mut I { - &mut *self.cache.interner + &mut self.cache.interner } - /// Add a new token to the current branch without storing an explicit section of text. - /// This is be useful if the text can always be inferred from the token's `kind`, for example - /// when using kinds for specific operators or punctuation. + /// Add a new token with the given `text` to the current node. /// /// ## Panics /// In debug mode, if `kind` has static text, this function will verify that `text` matches that text. @@ -403,6 +410,22 @@ where self.children.push(token.into()); } + /// Add a new token to the current node without storing an explicit section of text. 
+ /// This can be useful if the text can always be inferred from the token's `kind`, for example + /// when using kinds for specific operators or punctuation. + /// + /// For tokens whose textual representation is not static, such as numbers or identifiers, use + /// [`token`](GreenNodeBuilder::token). + /// + /// ## Panics + /// If `kind` does not have static text, i.e., `L::static_text(kind)` returns `None`. + #[inline] + pub fn static_token(&mut self, kind: L::Kind) { + let static_text = L::static_text(kind).unwrap_or_else(|| panic!("Missing static text for '{kind:?}'")); + let token = self.cache.token::(kind, None, static_text.len() as u32); + self.children.push(token.into()); + } + /// Start new node of the given `kind` and make it current. #[inline] pub fn start_node(&mut self, kind: L::Kind) { @@ -427,7 +450,7 @@ where /// # Examples /// ``` /// # use cstree::testing::*; - /// # use cstree::{GreenNodeBuilder, Language}; + /// # use cstree::{build::GreenNodeBuilder, Language}; /// # struct Parser; /// # impl Parser { /// # fn peek(&self) -> Option { None } diff --git a/src/green/element.rs b/src/green/element.rs index 5ab0826..9298551 100644 --- a/src/green/element.rs +++ b/src/green/element.rs @@ -7,8 +7,10 @@ type ErasedPtr = *const u8; use sptr::Strict; use crate::{ - green::{GreenNode, GreenToken, SyntaxKind}, - NodeOrToken, TextSize, + green::{GreenNode, GreenToken}, + text::TextSize, + util::NodeOrToken, + RawSyntaxKind, }; pub(super) type GreenElement = NodeOrToken; @@ -64,7 +66,7 @@ impl From for PackedGreenElement { impl GreenElement { /// Returns kind of this element. #[inline] - pub fn kind(&self) -> SyntaxKind { + pub fn kind(&self) -> RawSyntaxKind { self.as_ref().kind() } @@ -78,7 +80,7 @@ impl GreenElement { impl GreenElementRef<'_> { /// Returns kind of this element. 
#[inline] - pub fn kind(&self) -> SyntaxKind { + pub fn kind(&self) -> RawSyntaxKind { match self { NodeOrToken::Node(it) => it.kind(), NodeOrToken::Token(it) => it.kind(), diff --git a/src/green/interner.rs b/src/green/interner.rs deleted file mode 100644 index db995fc..0000000 --- a/src/green/interner.rs +++ /dev/null @@ -1,126 +0,0 @@ -use std::num::NonZeroUsize; - -use crate::interning::{ - Capacity, Interner, IntoReader, IntoReaderAndResolver, IntoResolver, Key, Reader, Resolver, Rodeo, -}; -use fxhash::FxBuildHasher; - -/// The default [`Interner`] used to deduplicate green token strings. -#[derive(Debug)] -pub struct TokenInterner { - rodeo: Rodeo, -} - -impl TokenInterner { - pub(super) fn new() -> Self { - Self { - rodeo: Rodeo::with_capacity_and_hasher( - // capacity values suggested by author of `lasso` - Capacity::new(512, unsafe { NonZeroUsize::new_unchecked(4096) }), - FxBuildHasher::default(), - ), - } - } -} - -impl Resolver for TokenInterner { - #[inline] - fn resolve<'a>(&'a self, key: &Key) -> &'a str { - self.rodeo.resolve(key) - } - - #[inline] - fn try_resolve<'a>(&'a self, key: &Key) -> Option<&'a str> { - self.rodeo.try_resolve(key) - } - - #[inline] - unsafe fn resolve_unchecked<'a>(&'a self, key: &Key) -> &'a str { - self.rodeo.resolve_unchecked(key) - } - - #[inline] - fn contains_key(&self, key: &Key) -> bool { - self.rodeo.contains_key(key) - } - - #[inline] - fn len(&self) -> usize { - self.rodeo.len() - } -} - -impl Reader for TokenInterner { - #[inline] - fn get(&self, val: &str) -> Option { - self.rodeo.get(val) - } - - #[inline] - fn contains(&self, val: &str) -> bool { - self.rodeo.contains(val) - } -} - -impl IntoResolver for TokenInterner { - type Resolver = ::Resolver; - - #[inline] - fn into_resolver(self) -> Self::Resolver - where - Self: 'static, - { - self.rodeo.into_resolver() - } - - #[inline] - fn into_resolver_boxed(self: Box) -> Self::Resolver - where - Self: 'static, - { - 
Rodeo::into_resolver_boxed(Box::new(self.rodeo)) - } -} - -impl Interner for TokenInterner { - #[inline] - fn get_or_intern(&mut self, val: &str) -> Key { - self.rodeo.get_or_intern(val) - } - - #[inline] - fn try_get_or_intern(&mut self, val: &str) -> lasso::LassoResult { - self.rodeo.try_get_or_intern(val) - } - - #[inline] - fn get_or_intern_static(&mut self, val: &'static str) -> Key { - self.rodeo.get_or_intern_static(val) - } - - #[inline] - fn try_get_or_intern_static(&mut self, val: &'static str) -> lasso::LassoResult { - self.rodeo.try_get_or_intern_static(val) - } -} - -impl IntoReader for TokenInterner { - type Reader = ::Reader; - - #[inline] - fn into_reader(self) -> Self::Reader - where - Self: 'static, - { - self.rodeo.into_reader() - } - - fn into_reader_boxed(self: Box) -> Self::Reader - where - Self: 'static, - { - Rodeo::into_reader_boxed(Box::new(self.rodeo)) - } -} - -impl IntoReaderAndResolver for TokenInterner {} diff --git a/src/green/iter.rs b/src/green/iter.rs index ca78c8f..d4e32d9 100644 --- a/src/green/iter.rs +++ b/src/green/iter.rs @@ -4,7 +4,7 @@ use std::{iter::FusedIterator, slice}; use super::{element::PackedGreenElement, GreenElementRef}; -/// An iterator over a [`GreenNode`](crate::GreenNode)'s children. +/// An iterator over a [`GreenNode`](crate::green::GreenNode)'s children. #[derive(Debug, Clone)] pub struct GreenNodeChildren<'a> { pub(super) inner: slice::Iter<'a, PackedGreenElement>, diff --git a/src/green/node.rs b/src/green/node.rs index 6eb119e..9288d4c 100644 --- a/src/green/node.rs +++ b/src/green/node.rs @@ -6,15 +6,16 @@ use std::{ use fxhash::FxHasher32; use crate::{ - green::{iter::GreenNodeChildren, GreenElement, PackedGreenElement, SyntaxKind}, - TextSize, + green::{iter::GreenNodeChildren, GreenElement, PackedGreenElement}, + text::TextSize, + RawSyntaxKind, }; use triomphe::{Arc, HeaderWithLength, ThinArc}; #[repr(align(2))] //to use 1 bit for pointer tagging. 
NB: this is an at-least annotation #[derive(Debug, Clone, PartialEq, Eq, Hash)] pub(super) struct GreenNodeHead { - pub(super) kind: SyntaxKind, + pub(super) kind: RawSyntaxKind, pub(super) text_len: TextSize, pub(super) child_hash: u32, } @@ -35,7 +36,7 @@ impl std::fmt::Debug for GreenNode { impl GreenNode { /// Creates a new Node. #[inline] - pub fn new(kind: SyntaxKind, children: I) -> GreenNode + pub fn new(kind: RawSyntaxKind, children: I) -> GreenNode where I: IntoIterator, I::IntoIter: ExactSizeIterator, @@ -72,7 +73,7 @@ impl GreenNode { /// Creates a new Node. #[inline] pub(super) fn new_with_len_and_hash( - kind: SyntaxKind, + kind: RawSyntaxKind, children: I, text_len: TextSize, child_hash: u32, @@ -115,9 +116,9 @@ impl GreenNode { } } - /// [`SyntaxKind`] of this node. + /// [`RawSyntaxKind`] of this node. #[inline] - pub fn kind(&self) -> SyntaxKind { + pub fn kind(&self) -> RawSyntaxKind { self.data.header.header.kind } diff --git a/src/green/token.rs b/src/green/token.rs index e58ceef..d5543c1 100644 --- a/src/green/token.rs +++ b/src/green/token.rs @@ -1,9 +1,9 @@ use std::{fmt, hash, mem::ManuallyDrop, ptr::NonNull}; use crate::{ - green::SyntaxKind, - interning::{Key, Resolver}, - TextSize, + interning::{Resolver, TokenKey}, + text::TextSize, + RawSyntaxKind, }; use sptr::Strict; use triomphe::Arc; @@ -11,8 +11,8 @@ use triomphe::Arc; #[repr(align(2))] // to use 1 bit for pointer tagging. NB: this is an at-least annotation #[derive(Debug, PartialEq, Eq, Hash, Copy, Clone)] pub(super) struct GreenTokenData { - pub(super) kind: SyntaxKind, - pub(super) text: Option, + pub(super) kind: RawSyntaxKind, + pub(super) text: Option, pub(super) text_len: TextSize, } @@ -54,9 +54,9 @@ impl GreenToken { } } - /// [`SyntaxKind`] of this Token. + /// [`RawSyntaxKind`] of this Token. 
#[inline] - pub fn kind(&self) -> SyntaxKind { + pub fn kind(&self) -> RawSyntaxKind { self.data().kind } @@ -64,9 +64,9 @@ impl GreenToken { #[inline] pub fn text<'i, I>(&self, resolver: &'i I) -> Option<&'i str> where - I: Resolver + ?Sized, + I: Resolver + ?Sized, { - self.data().text.map(|key| resolver.resolve(&key)) + self.data().text.map(|key| resolver.resolve(key)) } /// Returns the length of text covered by this token. @@ -80,7 +80,7 @@ impl GreenToken { /// /// See also [`text`](GreenToken::text). #[inline] - pub fn text_key(&self) -> Option { + pub fn text_key(&self) -> Option { self.data().text } } diff --git a/src/interning.rs b/src/interning.rs index dbe2d41..d1fd913 100644 --- a/src/interning.rs +++ b/src/interning.rs @@ -1,47 +1,186 @@ //! Types and Traits for efficient String storage and deduplication. //! -//! Interning functionality is provided by the [`lasso`](lasso) crate. +//! Because `cstree` is aimed at _concrete_ syntax trees that faithfully represent all of the original program input, +//! `cstree` asks for the text of each token when building a syntax tree. You'll notice this when looking at +//! [`GreenNodeBuilder::token`], which takes the kind of token and a reference to the text of the token in the source. +//! +//! Of course, there are tokens whose text will always be the same, such as punctuation (like a semicolon), keywords +//! (like `fn`), or operators (like `<=`). Use [`Language::static_text`] when implementing `Language` to make `cstree` +//! aware of such tokens. +//! +//! There is, however, another category of tokens whose text will appear repeatedly, but for which we cannot know the +//! text upfront. Any variable, type, or method that is user-defined will likely be named more than once, but there is +//! no way to know beforehand what names a user will choose. +//! +//! In order to avoid storing the source text for these tokens many times over, `cstree` _interns_ the text of its +//! tokens (if that text is not static). 
What this means is that each unique string is only stored once. When a new +//! token is added - say, a variable -, we check if we already know its contents (the variable name). If the text is +//! new, we save it and give it a unique Id. If we have seen the text before, we look up its unique Id and don't need to +//! keep the new data around. As an additional benefit, interning also makes it much cheaper to copy source text around +//! and also to compare it with other source text, since what is actually being copied or compared is just an integer. +//! +//! ## I just want to build a syntax tree +//! +//! If you don't want to worry about this for now, you (mostly) can! All required functionality is implemented in +//! `cstree` and you can just use [`GreenNodeBuilder::new`] to obtain a tree builder with everything set up (see the +//! [crate documentation] for more on how to get started). This will create an interner, which the builder returns +//! together with the syntax tree on [`finish`] as part of its node cache (call [`NodeCache::into_interner`] on the +//! result to get the interner out). +//! +//! Here begins the part where you do have to think about interning: `cstree` needs the interner you get when you want +//! to look at the source text for some part of the syntax tree, so you'll have to keep it around somehow until the +//! point where you need it. +//! +//! How best to do this depends on what you need the text for. If the code that accesses the text is close-by, it might +//! be enough to pass the return value to the functions that need it (within `cstree` or in your code). Other options +//! could be to store the interner together with the syntax tree. If you use [`SyntaxNode::new_root_with_resolver`], you +//! get a syntax tree that can handle text without any need to manage and pass an interner (the reason the method is +//! called `_with_resolver` and not `_with_interner` is that it doesn't actually need a full [`Interner`] -- once the 
tree is created, no more text will be added, so it just needs to be able to look up text. This part is called a +//! [`Resolver`]). Or you could put the interner somewhere "global", where you can easily access it from anywhere. +//! +//! ## Using other interners +//! +//! By default, `cstree` uses its own, simple interner implementation. You can obtain an interner by calling +//! [`new_interner`], or bring your own by implementing the [`Resolver`] and [`Interner`] traits defined in this module. +//! Most methods in `cstree` require that you support interning [`TokenKey`]s. `TokenKey` implements [`InternKey`], so +//! your implementation can use that to convert to whatever types it uses for its internal representation. Note that +//! there is no way to change the size of the internal representation. +//! +//! ### `lasso` +//! Using features, you can enable support for some third-party interners. The primary one is [`lasso`], a crate focused +//! on efficient interning of text strings. This is enabled via the `lasso_compat` feature and adds the necessary trait +//! implementation to make `lasso`'s interners work with `cstree` (as well as a re-export of the matching version of +//! `lasso` here). If enabled, `cstree`'s built-in interning functionality is replaced with `lasso`'s more efficient one +//! transparently, so you'll now be returned a `lasso` interner from [`new_interner`]. +//! +//! ### `salsa` +//! If you are using the "2022" version of the `salsa` incremental query framework, it is possible to use its interning +//! capabilities with `cstree` as well. Support for this is experimental, and you have to opt in via the +//! `salsa_2022_compat` feature. For instructions on how to do this, and whether you actually want to, please refer to +//! [the `salsa_compat` module documentation]. +//! +//! ## Multi-threaded interners +//! If you want to use your interner on more than one thread, the interner needs to support interning new text through +//! shared access. 
With the `multi_threaded_interning` feature, you can get such an interner by calling +//! [`new_threaded_interner`]. The feature also enables support for `ThreadedRodeo`, the multi-threaded interner from +//! `lasso`. +//! +//! **You can pass a reference to that interner to anything that expects an [`Interner`]!** +//! While the interning methods on [`Interner`] require a `&mut self` to also work for single-threaded interners, both +//! [`Resolver`] and [`Interner`] will be implemented for `&interner` if `interner` is multi-threaded: +//! +//! ``` +//! # use cstree::testing::{*, Language as _}; +//! # use cstree::interning::*; +//! +//! let interner = new_threaded_interner(); +//! let mut builder: GreenNodeBuilder = +//! GreenNodeBuilder::from_interner(&interner); +//! +//! # builder.start_node(Root); +//! # builder.token(Int, "42"); +//! # builder.finish_node(); +//! parse(&mut builder, "42"); +//! let (tree, cache) = builder.finish(); +//! +//! // Note that we get a cache and interner back, because we passed an "owned" reference to `from_interner` +//! let used_interner = cache.unwrap().into_interner().unwrap(); +//! assert_eq!(used_interner as *const _, &interner as *const _); +//! +//! let int = tree.children().next().unwrap(); +//! assert_eq!(int.as_token().unwrap().text(&interner), Some("42")); +//! ``` +//! +//! Here, we use `from_interner`, but pass it only a shared reference to "own". Take care to denote the type signature +//! of the `GreenNodeBuilder` appropriately. +//! +//! [crate documentation]: crate +//! [`Language::static_text`]: crate::Language::static_text +//! [`GreenNodeBuilder::token`]: crate::build::GreenNodeBuilder::token +//! [`GreenNodeBuilder::new`]: crate::build::GreenNodeBuilder::new +//! [`finish`]: crate::build::GreenNodeBuilder::finish +//! [`NodeCache::into_interner`]: crate::build::NodeCache::into_interner +//! [`SyntaxNode::new_root_with_resolver`]: crate::syntax::SyntaxNode::new_root_with_resolver +//! [`lasso`]: lasso +//! 
[the `salsa_compat` module documentation]: salsa_compat -pub use fxhash::FxBuildHasher as Hasher; +mod traits; +pub use self::traits::*; -pub use crate::green::TokenInterner; +mod default_interner; -/// The index type for all interners. Each key represents -pub type Key = lasso::Spur; -pub use lasso::{Interner, IntoReader, IntoReaderAndResolver, IntoResolver, Reader, Resolver}; +#[cfg(not(feature = "lasso_compat"))] +#[doc(inline)] +pub use default_interner::TokenInterner; -/// A string interner that caches strings quickly with a minimal memory footprint, returning a unique key to re-access -/// it with `O(1)` times. By default, `Rodeo` uses an [`fxhash`] [`Hasher`]. -pub type Rodeo = lasso::Rodeo; +#[cfg(feature = "lasso_compat")] +mod lasso_compat; -/// Constructs a new, single-threaded interner. +#[cfg(feature = "lasso_compat")] +#[doc(inline)] +pub use lasso_compat::TokenInterner; + +#[cfg(feature = "multi_threaded_interning")] +#[doc(inline)] +pub use lasso_compat::MultiThreadedTokenInterner; + +#[cfg(feature = "lasso_compat")] +#[cfg_attr(doc_cfg, doc(cfg(feature = "lasso_compat")))] +pub use lasso; + +#[cfg(feature = "salsa_2022_compat")] +#[cfg_attr(doc_cfg, doc(cfg(feature = "salsa_2022_compat")))] +pub mod salsa_compat; + +use core::fmt; +use std::num::NonZeroU32; + +/// The intern key type for the source text of [`GreenToken`s](crate::green::GreenToken). +/// Each unique key uniquely identifies a deduplicated, interned source string. +#[derive(Clone, Copy, PartialEq, Eq, Hash)] +#[repr(transparent)] +pub struct TokenKey { + inner: NonZeroU32, +} + +// Safety: we match `+ 1` and `- 1`, so it is always possible to round-trip. +unsafe impl InternKey for TokenKey { + #[inline] + fn into_u32(self) -> u32 { + self.inner.get() - 1 + } + + fn try_from_u32(key: u32) -> Option { + (key < u32::MAX).then(|| Self { + // Safety: non-zero by increment. + // Overflow is impossible under the check above. 
+ inner: unsafe { NonZeroU32::new_unchecked(key + 1) }, + }) + } +} + +impl fmt::Debug for TokenKey { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_fmt(format_args!("TokenKey({})", self.inner)) + } +} + +/// Constructs a new, single-threaded [`Interner`](traits::Interner). /// /// If you need the interner to be multi-threaded, see [`new_threaded_interner`]. #[inline] -pub fn new_interner() -> Rodeo { - Rodeo::with_hasher(Hasher::default()) +pub fn new_interner() -> TokenInterner { + TokenInterner::new() } -/// A string interner that caches strings quickly with a minimal memory footprint, returning a unique key to re-access -/// it with `O(1)` times. By default, `ThreadedRodeo` uses an [`fxhash`] [`Hasher`]. -pub type ThreadedRodeo = lasso::ThreadedRodeo; - -/// Constructs a new interner that can be used across multiple threads. +/// Constructs a new [`Interner`](traits::Interner) that can be used across multiple threads. +/// +/// Note that you can use `&MultiThreadTokenInterner` to access interning methods through a shared reference, as well as +/// construct new syntax trees. See [the module documentation](self) for more information and examples. +#[cfg(feature = "multi_threaded_interning")] +#[cfg_attr(doc_cfg, doc(cfg(feature = "multi_threaded_interning")))] #[inline] -pub fn new_threaded_interner() -> ThreadedRodeo { - ThreadedRodeo::with_hasher(Hasher::default()) +pub fn new_threaded_interner() -> MultiThreadedTokenInterner { + MultiThreadedTokenInterner::new() } - -/// A read-only view of a [`Rodeo`] or [`ThreadedRodeo`] that allows contention-free access to interned strings, both -/// key to string resolution and string to key lookups. -/// -/// The hasher is the same as the Rodeo or ThreadedRodeo that created it. -/// Can be acquired with the `into_reader` methods (see also [`IntoReader`]). 
-pub type RodeoReader = lasso::RodeoReader; - -/// A read-only view of a [`Rodeo`] or [`ThreadedRodeo`] that allows contention-free access to interned strings with -/// only key to string resolution. -/// -/// Can be acquired with the `into_resolver` methods (see also [`IntoResolver`]). -pub type RodeoResolver = lasso::RodeoResolver; -pub use lasso::{Capacity, Iter, LassoError, LassoErrorKind, LassoResult, MemoryLimits, Strings}; diff --git a/src/interning/default_interner.rs b/src/interning/default_interner.rs new file mode 100644 index 0000000..cf25a67 --- /dev/null +++ b/src/interning/default_interner.rs @@ -0,0 +1,70 @@ +#![cfg(not(feature = "lasso_compat"))] + +use core::fmt; + +use fxhash::FxBuildHasher as Hasher; +use indexmap::IndexSet; + +use super::{InternKey, Interner, Resolver, TokenKey}; + +/// The default [`Interner`] used to deduplicate green token strings. +#[derive(Debug)] +pub struct TokenInterner { + id_set: IndexSet, +} + +impl TokenInterner { + pub(in crate::interning) fn new() -> Self { + Self { + id_set: IndexSet::default(), + } + } +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub enum InternerError { + KeySpaceExhausted, +} + +impl fmt::Display for InternerError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + InternerError::KeySpaceExhausted => write!(f, "key space exhausted"), + } + } +} + +impl std::error::Error for InternerError {} + +impl Resolver for TokenInterner { + fn try_resolve(&self, key: TokenKey) -> Option<&str> { + let index = key.into_u32() as usize; + self.id_set.get_index(index).map(String::as_str) + } +} + +// `TokenKey` can represent `1` to `u32::MAX` (due to the `NonNull` niche), so `u32::MAX` elements. +// Set indices start at 0, so everything shifts down by 1. 
+const N_INDICES: usize = u32::MAX as usize; + +impl Interner for TokenInterner { + type Error = InternerError; + + fn try_get_or_intern(&mut self, text: &str) -> Result { + if let Some(index) = self.id_set.get_index_of(text) { + let raw_key = u32::try_from(index).unwrap_or_else(|_| { + panic!("found interned text with invalid index `{index}` (index too high for keyspace)") + }); + return Ok(TokenKey::try_from_u32(raw_key).unwrap_or_else(|| { + panic!("found interned text with invalid index `{index}` (index too high for keyspace)") + })); + } else if self.id_set.len() >= N_INDICES { + return Err(InternerError::KeySpaceExhausted); + } + + let (index, added) = self.id_set.insert_full(text.to_string()); + debug_assert!(added, "tried to intern duplicate text"); + let raw_key = u32::try_from(index).unwrap_or_else(|_| panic!("interned `{index}` despite keyspace exhaustion")); + TokenKey::try_from_u32(raw_key).ok_or(InternerError::KeySpaceExhausted) + } +} diff --git a/src/interning/lasso_compat.rs b/src/interning/lasso_compat.rs new file mode 100644 index 0000000..620b1a8 --- /dev/null +++ b/src/interning/lasso_compat.rs @@ -0,0 +1,9 @@ +//! Bridge between `cstree`'s and `lasso`'s types and traits. + +#![cfg(feature = "lasso_compat")] + +mod token_interner; +#[doc(inline)] +pub use token_interner::*; + +mod traits; diff --git a/src/interning/lasso_compat/token_interner.rs b/src/interning/lasso_compat/token_interner.rs new file mode 100644 index 0000000..55ba236 --- /dev/null +++ b/src/interning/lasso_compat/token_interner.rs @@ -0,0 +1,109 @@ +//! Default interner implementations based on `lasso`. + +#![cfg(feature = "lasso_compat")] + +use std::{hash::BuildHasher, num::NonZeroUsize}; + +use fxhash::FxBuildHasher as Hasher; +use lasso::{Capacity, Rodeo, ThreadedRodeo}; + +use crate::interning::{Interner, Resolver, TokenKey}; + +/// Default number of strings that the interner will initially allocate space for. +/// Value recommended by the author of `lasso`. 
+const DEFAULT_STRING_CAPACITY: usize = 512; + +/// Default memory in bytes that the interner will initially allocate space for. +/// Value recommended by the author of `lasso`. +const DEFAULT_BYTE_CAPACITY: NonZeroUsize = unsafe { NonZeroUsize::new_unchecked(4096) }; + +macro_rules! impl_traits { + (for $interner:ty $(, if #[cfg(feature = $feature:literal)])?) => { + $(#[cfg_attr(doc_cfg, doc(cfg(feature = $feature)))])? + impl Resolver for $interner { + #[inline] + fn try_resolve(&self, key: TokenKey) -> Option<&str> { + self.rodeo.try_resolve(&key) + } + + #[inline] + fn resolve(&self, key: TokenKey) -> &str { + self.rodeo.resolve(&key) + } + } + + $(#[cfg_attr(doc_cfg, doc(cfg(feature = $feature)))])? + impl Interner for $interner { + type Error = lasso::LassoError; + + #[inline] + fn try_get_or_intern(&mut self, text: &str) -> Result { + self.rodeo.try_get_or_intern(text) + } + + #[inline] + fn get_or_intern(&mut self, text: &str) -> TokenKey { + self.rodeo.get_or_intern(text) + } + } + }; +} + +/// The default [`Interner`] used to deduplicate green token strings. +#[derive(Debug)] +pub struct TokenInterner { + rodeo: Rodeo, +} + +impl TokenInterner { + pub(in crate::interning) fn new() -> Self { + Self { + rodeo: Rodeo::with_capacity_and_hasher( + Capacity::new(DEFAULT_STRING_CAPACITY, DEFAULT_BYTE_CAPACITY), + Hasher::default(), + ), + } + } + + /// Returns the [`Rodeo`] backing this interner. + #[cfg_attr(doc_cfg, doc(cfg(feature = "lasso_compat")))] + #[inline] + pub fn into_inner(self) -> Rodeo { + self.rodeo + } +} + +impl_traits!(for TokenInterner); + +#[cfg(feature = "multi_threaded_interning")] +pub use multi_threaded::MultiThreadedTokenInterner; + +#[cfg(feature = "multi_threaded_interning")] +mod multi_threaded { + use super::*; + + /// A threadsafe [`Interner`] for deduplicating [`GreenToken`](crate::green::GreenToken) strings. 
+ /// + /// Note that [`Interner`] and [`Resolver`] are also implemented for `&MultiThreadTokenInterner` so you can pass + /// `&mut &interner` in shared contexts. + #[cfg_attr(doc_cfg, doc(cfg(feature = "multi_threaded_interning")))] + #[derive(Debug)] + pub struct MultiThreadedTokenInterner { + rodeo: ThreadedRodeo, + } + + impl MultiThreadedTokenInterner { + pub(in crate::interning) fn new() -> Self { + Self { + rodeo: ThreadedRodeo::with_capacity_and_hasher( + Capacity::new(DEFAULT_STRING_CAPACITY, DEFAULT_BYTE_CAPACITY), + Hasher::default(), + ), + } + } + } + + impl_traits!(for MultiThreadedTokenInterner, if #[cfg(feature = "multi_threaded_interning")]); + + impl_traits!(for &MultiThreadedTokenInterner, if #[cfg(feature = "multi_threaded_interning")]); +} diff --git a/src/interning/lasso_compat/traits.rs b/src/interning/lasso_compat/traits.rs new file mode 100644 index 0000000..9a09af8 --- /dev/null +++ b/src/interning/lasso_compat/traits.rs @@ -0,0 +1,166 @@ +#![cfg(feature = "lasso_compat")] + +use core::fmt; +use std::hash::{BuildHasher, Hash}; + +use crate::interning::{ + traits::{InternKey, Interner, Resolver}, + TokenKey, +}; + +// Safety: `InternKey` has the same invariant as `lasso::Key` +unsafe impl lasso::Key for TokenKey { + fn into_usize(self) -> usize { + self.into_u32() as usize + } + + fn try_from_usize(int: usize) -> Option { + let raw_key = u32::try_from(int).ok()?; + Self::try_from_u32(raw_key) + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub enum LassoCompatError { + LassoError(lasso::LassoError), + KeyConversionError { lasso_key: usize }, +} + +impl From for LassoCompatError { + #[inline] + fn from(error: lasso::LassoError) -> Self { + Self::LassoError(error) + } +} + +impl fmt::Display for LassoCompatError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + LassoCompatError::LassoError(lasso_error) => write!(f, "{lasso_error}"), + LassoCompatError::KeyConversionError { lasso_key } => write!( + f, + 
"invalid key: failed to convert `lasso::Key` `{lasso_key}` to `InternKey`" + ), + } + } +} + +impl std::error::Error for LassoCompatError {} + +macro_rules! compat_resolver { + ($resolver:ident $(where $($t:ident : $bound:ident),+)? $(if #[cfg(feature = $feature:literal)])?) => { + $(#[cfg_attr(doc_cfg, doc(cfg(feature = $feature)))])? + impl Resolver for lasso::$resolver + where + K: lasso::Key, + $($($t: $bound),+)? + { + fn try_resolve(&self, key: TokenKey) -> Option<&str> { + let raw_key = TokenKey::into_u32(key); + let lasso_key = K::try_from_usize(raw_key as usize)?; + >::try_resolve(self, &lasso_key) + } + + fn resolve(&self, key: TokenKey) -> &str { + let raw_key = TokenKey::into_u32(key); + let lasso_key = K::try_from_usize(raw_key as usize).expect(&format!( + "invalid key: failed to convert `{key:?}` to `lasso::Key`" + )); + >::resolve(self, &lasso_key) + } + } + }; +} + +macro_rules! compat_interner { + ($interner:ident $(where $($t:ident : $bound:ident),+)? if #[cfg(feature = $feature:literal)]) => { + #[cfg_attr(doc_cfg, doc(cfg(feature = $feature)))] + impl Interner for lasso::$interner + where + K: lasso::Key, + S: BuildHasher, + $($($t: $bound),+)? 
+ { + type Error = LassoCompatError; + + fn try_get_or_intern(&mut self, text: &str) -> Result { + let lasso_key = >::try_get_or_intern(self, text)?; + let raw_key = K::into_usize(lasso_key); + u32::try_from(raw_key) + .ok() + .and_then(TokenKey::try_from_u32) + .ok_or(LassoCompatError::KeyConversionError { lasso_key: raw_key }) + } + + fn get_or_intern(&mut self, text: &str) -> TokenKey { + let lasso_key = >::get_or_intern(self, text); + let raw_key = K::into_usize(lasso_key); + u32::try_from(raw_key) + .ok() + .and_then(TokenKey::try_from_u32) + .ok_or(LassoCompatError::KeyConversionError { lasso_key: raw_key }) + .unwrap_or_else(|_| panic!("invalid key: failed to convert `lasso::Key` `{raw_key}` to `InternKey` (failed to intern {text:?})")) + } + } + }; +} + +compat_resolver!(RodeoReader if #[cfg(feature = "lasso_compat")]); +compat_resolver!(RodeoResolver if #[cfg(feature = "lasso_compat")]); + +compat_resolver!(Rodeo if #[cfg(feature = "lasso_compat")]); +compat_interner!(Rodeo if #[cfg(feature = "lasso_compat")]); + +#[cfg(feature = "multi_threaded_interning")] +mod multi_threaded { + use super::*; + + compat_resolver!(ThreadedRodeo where K: Hash, S: BuildHasher, S: Clone if #[cfg(feature = "multi_threaded_interning")]); + + compat_interner!(ThreadedRodeo where K: Hash, S: Clone if #[cfg(feature = "multi_threaded_interning")]); + + #[cfg_attr(doc_cfg, doc(cfg(feature = "multi_threaded_interning")))] + impl Resolver for &lasso::ThreadedRodeo + where + K: lasso::Key + Hash, + S: BuildHasher + Clone, + { + #[inline] + fn try_resolve(&self, key: TokenKey) -> Option<&str> { + as Resolver>::try_resolve(self, key) + } + + #[inline] + fn resolve(&self, key: TokenKey) -> &str { + as Resolver>::resolve(self, key) + } + } + + #[cfg_attr(doc_cfg, doc(cfg(feature = "multi_threaded_interning")))] + impl Interner for &lasso::ThreadedRodeo + where + K: lasso::Key + Hash, + S: BuildHasher + Clone, + { + type Error = as Interner>::Error; + + fn try_get_or_intern(&mut self, 
text: &str) -> Result { + let lasso_key = >::try_get_or_intern(self, text)?; + let raw_key = K::into_usize(lasso_key); + u32::try_from(raw_key) + .ok() + .and_then(TokenKey::try_from_u32) + .ok_or(LassoCompatError::KeyConversionError { lasso_key: raw_key }) + } + + fn get_or_intern(&mut self, text: &str) -> TokenKey { + let lasso_key = >::get_or_intern(self, text); + let raw_key = K::into_usize(lasso_key); + u32::try_from(raw_key) + .ok() + .and_then(TokenKey::try_from_u32) + .ok_or(LassoCompatError::KeyConversionError { lasso_key: raw_key }) + .unwrap_or_else(|_| panic!("invalid key: failed to convert `lasso::Key` `{raw_key}` to `InternKey` (failed to intern {text:?})")) + } + } +} diff --git a/src/interning/salsa_compat.rs b/src/interning/salsa_compat.rs new file mode 100644 index 0000000..081c22c --- /dev/null +++ b/src/interning/salsa_compat.rs @@ -0,0 +1,228 @@ +//! # Using a `salsa` database as the interner for `cstree` +//! +//!

+//! Warning: Compatibility is only provided for "Salsa 2022".
+//! This version is currently under active development and `cstree`'s
+//! compatibility features are unstable until there is an official
+//! release.
+//! Older versions of `salsa` are not supported.
+//!

+//!
+//! If you are using the `salsa` query system, you already have access to an implementation of interning through
+//! [`#[salsa::interned]`](macro@salsa::interned). This is all that is needed to use `cstree` and this module provides
+//! the utilities needed to use `salsa`'s interners for working with syntax trees.
+//!
+//! Note that the primary benefit of this is that it avoids additional dependencies because it uses an interner that you
+//! already depend on, but it can also be beneficial to use an interner that is more specialized towards string
+//! interning. In particular, using `salsa`'s interning requires allocating all strings that are interned even if they
+//! are deduplicated because they already exist in the interner.
+//!
+//! ## How to do it
+//!
+//! ```
+//! # use cstree::testing::*;
+//! # use cstree::interning::salsa_compat::salsa;
+//! # use cstree::impl_cstree_interning_for_salsa;
+//! // Define the `salsa` jar, database and intern Id
+//! #[salsa::jar(db = Db)]
+//! pub struct Jar(SourceId);
+//!
+//! pub trait Db: salsa::DbWithJar {}
+//! impl Db for DB where DB: ?Sized + salsa::DbWithJar {}
+//!
+//! // If you are not a doctest and can put `Jar` at the root of your crate,
+//! // this can just be `#[salsa::interned]`.
+//! #[salsa::interned(jar = Jar)]
+//! pub struct SourceId {
+//!     #[return_ref]
+//!     pub text: String,
+//! }
+//!
+//! #[derive(Default)]
+//! #[salsa::db(Jar)]
+//! struct Database {
+//!     storage: salsa::Storage,
+//! }
+//! impl salsa::Database for Database {}
+//!
+//! // Let `cstree` define a conversion trait and implement it for your database.
+//! // `Database` is your db type, `SourceId` is your interning id, and `text` is
+//! // its text field (all as defined above).
+//! impl_cstree_interning_for_salsa!(impl Interning for Database => text as SourceId);
+//!
+//! // Build a tree with the `salsa` interner
+//! let db = Database::default();
+//! let interner = db.as_interner(); // <-- conversion happens here
+//! let mut shared_interner = &interner;
+//! let mut builder: GreenNodeBuilder = GreenNodeBuilder::with_interner(&mut shared_interner);
+//! let (tree, _no_interner_because_it_was_borrowed) = {
+//!     builder.start_node(TestSyntaxKind::Plus);
+//!     builder.token(TestSyntaxKind::Float, "2.05");
+//!     builder.token(TestSyntaxKind::Whitespace, " ");
+//!     builder.token(TestSyntaxKind::Plus, "+");
+//!     builder.token(TestSyntaxKind::Whitespace, " ");
+//!     builder.token(TestSyntaxKind::Float, "7.32");
+//!     builder.finish_node();
+//!     builder.finish()
+//! };
+//! let tree: SyntaxNode = SyntaxNode::new_root(tree);
+//! assert_eq!(tree.resolve_text(shared_interner), "2.05 + 7.32");
+//! ```
+//!
+//! The full code is also available in the `salsa` example.
+//!
+//! ## Working with `InternWithDb` directly
+//! If you don't want the trait, or macros, or if you just need more control about what happens during interning and
+//! resolution, you can skip using [`impl_cstree_interning_for_salsa`](crate::impl_cstree_interning_for_salsa) and use
+//! [`InternWithDb`] directly.
+//!
+//! Because `salsa` generates inherent methods (and not, for example, a trait implementation), we need information about
+//! the used interning id either way. All that `as_interner` does is construct an instance of `InternWithDb` that uses
+//! the generated methods to invoke `salsa`'s interner. The implementation expands to
+//! ```text
+//! InternWithDb::new(
+//!     db,
+//!     |db, text| SourceId::new(db, text),
+//!     |db, id| id.text(db),
+//! )
+//! ```
+//! but you may provide any function that doesn't capture.
+ +#![cfg(feature = "salsa_2022_compat")] + +#[cfg_attr(doc_cfg, doc(cfg(feature = "salsa_2022_compat")))] +pub use salsa; + +use core::fmt; + +use super::{InternKey, Interner, Resolver, TokenKey}; + +#[cfg_attr(doc_cfg, doc(cfg(feature = "salsa_2022_compat")))] +impl salsa::AsId for TokenKey { + fn as_id(self) -> salsa::Id { + salsa::Id::from_u32(self.into_u32()) + } + + /// Create an instance of the intern-key from an ID. + /// + /// # Panics + /// Panics if the given `id` from `salsa` cannot be represented by a [`TokenKey`]. + fn from_id(id: salsa::Id) -> Self { + TokenKey::try_from_u32(id.as_u32()) + .unwrap_or_else(|| panic!("`salsa::Id` is invalid for `TokenKey`'s keyspace: {id:?}")) + } +} + +/// Generates an extension trait `SalsaAsInterner` that lets you call `db.as_interner()` on your [`salsa::Database`] to +/// obtain a `cstree` compatible [`Interner`]. +/// +/// The `as_interner` method returns an instance of [`InternWithDb`] that uses the functions generated by `salsa` for +/// your Id type to perform interning and resolution. +/// +/// If you have defined your interned text as +/// ```ignore +/// #[salsa::interned] +/// pub struct SourceId { +/// #[return_ref] +/// pub text: String, +/// } +/// ``` +/// the syntax is +/// ```ignore +/// impl_cstree_interning_for_salsa!(impl Interning for YourDatabase => text as SourceId); +/// ``` +/// where `text` the name of the interned field. +/// Note that the use of `#[return_ref]` is required. +#[macro_export] +#[cfg_attr(doc_cfg, doc(cfg(feature = "salsa_2022_compat")))] +macro_rules! 
impl_cstree_interning_for_salsa {
+    (impl Interning for $db:ty => $name:ident as $id:ty) => {
+        trait SalsaAsInterner {
+            fn as_interner(&self) -> ::cstree::interning::salsa_compat::InternWithDb<'_, $db, $id>;
+        }
+
+        impl SalsaAsInterner for $db {
+            fn as_interner(&self) -> ::cstree::interning::salsa_compat::InternWithDb<'_, $db, $id> {
+                ::cstree::interning::salsa_compat::InternWithDb::new(
+                    self,
+                    |db, text| <$id>::new(db, text),
+                    |db, id| id.$name(db),
+                )
+            }
+        }
+    };
+}
+
+/// This type allows you to wrap access to a [`salsa::Database`] together with an interning and a lookup function, which
+/// makes it implement [`Interner`] and [`Resolver`]. The [module documentation](self) shows how to use this with your
+/// own database, or you can use [`impl_cstree_interning_for_salsa`](crate::impl_cstree_interning_for_salsa).
+///
+/// The interning traits are also implemented by `&InternWithDb`, as the `salsa` database supports interning through
+/// shared references (see also [the `interning` module documentation](super)).
+#[cfg_attr(doc_cfg, doc(cfg(feature = "salsa_2022_compat")))]
+pub struct InternWithDb<'db, Db: salsa::Database, Id: salsa::interned::InternedId> {
+    db: &'db Db,
+    intern: fn(&Db, text: String) -> Id,
+    lookup: fn(&Db, Id) -> &str,
+}
+
+impl<'db, Db: salsa::Database, Id: salsa::interned::InternedId> fmt::Debug for InternWithDb<'db, Db, Id> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.write_str("InternWithDb")
+    }
+}
+
+impl<'db, Db: salsa::Database, Id: salsa::interned::InternedId> InternWithDb<'db, Db, Id> {
+    /// Create an [`Interner`] that works with `cstree` but uses the given `db` from `salsa`.
+    /// To do this, you need to provide a function for interning new strings that creates the [`InternedId`] that you
+    /// defined with [`#[salsa::interned]`](macro@salsa::interned), and a second one that resolves an Id using your
+    /// database. See the [module documentation](self) for an example.
+ /// + /// [`InternedId`]: salsa::interned::InternedId + pub fn new(db: &'db Db, intern: fn(&Db, text: String) -> Id, lookup: fn(&Db, Id) -> &str) -> Self { + Self { db, intern, lookup } + } +} + +impl<'db, Db: salsa::Database, Id: salsa::interned::InternedId> Resolver for InternWithDb<'db, Db, Id> { + fn try_resolve(&self, key: TokenKey) -> Option<&'db str> { + use salsa::AsId; + + let key = Id::from_id(key.as_id()); + let text = (self.lookup)(self.db, key); + Some(text) + } +} + +impl<'db, Db: salsa::Database, Id: salsa::interned::InternedId> Interner for InternWithDb<'db, Db, Id> { + type Error = std::convert::Infallible; + + fn try_get_or_intern(&mut self, text: &str) -> Result { + use salsa::AsId; + + let id = (self.intern)(self.db, text.to_string()); + Ok(TokenKey::from_id(id.as_id())) + } +} + +impl<'db, Db: salsa::Database, Id: salsa::interned::InternedId> Resolver for &InternWithDb<'db, Db, Id> { + fn try_resolve(&self, key: TokenKey) -> Option<&'db str> { + use salsa::AsId; + + let key = Id::from_id(key.as_id()); + let text = (self.lookup)(self.db, key); + Some(text) + } +} + +impl<'db, Db: salsa::Database, Id: salsa::interned::InternedId> Interner for &InternWithDb<'db, Db, Id> { + type Error = std::convert::Infallible; + + fn try_get_or_intern(&mut self, text: &str) -> Result { + use salsa::AsId; + + let id = (self.intern)(self.db, text.to_string()); + Ok(TokenKey::from_id(id.as_id())) + } +} diff --git a/src/interning/traits.rs b/src/interning/traits.rs new file mode 100644 index 0000000..cf222e4 --- /dev/null +++ b/src/interning/traits.rs @@ -0,0 +1,67 @@ +use core::fmt; + +use super::TokenKey; + +/// Common interface for all intern keys via conversion to and from `u32`. +/// +/// # Safety +/// Implementations must guarantee that keys can round-trip in both directions: going from `Self` to `u32` to `Self` and +/// going from `u32` to `Self` to `u32` must each yield the original value. 
+pub unsafe trait InternKey: Copy + Eq + fmt::Debug { + /// Convert `self` into its raw representation. + fn into_u32(self) -> u32; + + /// Try to reconstruct an intern key from its raw representation. + /// Returns `None` if `key` is not a valid key. + fn try_from_u32(key: u32) -> Option; +} + +/// The read-only part of an interner. +/// Allows to perform lookups of intern keys to resolve them to their interned text. +pub trait Resolver { + /// Tries to resolve the given `key` and return its interned text. + /// + /// If `self` does not contain any text for `key`, `None` is returned. + fn try_resolve(&self, key: Key) -> Option<&str>; + + /// Resolves `key` to its interned text. + /// + /// # Panics + /// Panics if there is no text for `key`. + /// + /// Compatibility implementations for interners from other crates may also panic if `key` cannot be converted to the + /// key type of the external interner. Please ensure you configure any external interners appropriately (for + /// example by choosing an appropriately sized key type). + fn resolve(&self, key: Key) -> &str { + self.try_resolve(key) + .unwrap_or_else(|| panic!("failed to resolve `{key:?}`")) + } +} + +/// A full interner, which can intern new strings returning intern keys and also resolve intern keys to the interned +/// value. +/// +/// **Note:** Because single-threaded interners may require mutable access, the methods on this trait take `&mut self`. +/// In order to use a multi- (or single)-threaded interner that allows access through a shared reference, it is +/// implemented for `&`[`MultiThreadedTokenInterner`](crate::interning::MultiThreadedTokenInterner), allowing it to be +/// used with a `&mut &MultiThreadTokenInterner`. +pub trait Interner: Resolver { + /// Represents possible ways in which interning may fail. + /// For example, this might be running out of fresh intern keys, or failure to allocate sufficient space for a new + /// value. 
+ type Error; + + /// Interns `text` and returns a new intern key for it. + /// If `text` was already previously interned, it will not be used and the existing intern key for its value will be + /// returned. + fn try_get_or_intern(&mut self, text: &str) -> Result; + + /// Interns `text` and returns a new intern key for it. + /// + /// # Panics + /// Panics if the internment process raises an [`Error`](Interner::Error). + fn get_or_intern(&mut self, text: &str) -> Key { + self.try_get_or_intern(text) + .unwrap_or_else(|_| panic!("failed to intern `{text:?}`")) + } +} diff --git a/src/lib.rs b/src/lib.rs index 1200339..dfa934b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -2,7 +2,7 @@ //! "Traditional" abstract syntax trees (ASTs) usually contain different types of nodes which represent information //! about the source text of a document and reduce this information to the minimal amount necessary to correctly //! interpret it. In contrast, CSTs are lossless representations of the entire input where all tree nodes are -//! represented uniformly (i.e. the nodes are _untyped_), but include a [`SyntaxKind`] field to determine the kind of +//! represented uniformly (i.e. the nodes are _untyped_), but include a [`RawSyntaxKind`] field to determine the kind of //! node. //! One of the big advantages of this representation is not only that it can recreate the original source exactly, but //! also that it lends itself very well to the representation of _incomplete or erroneous_ trees and is thus very suited @@ -35,41 +35,385 @@ //! references. You can still `clone` to obtain an owned node, but you only pay that cost when you need to. //! //! ## Getting Started -//! The main entry points for constructing syntax trees are [`GreenNodeBuilder`] and [`SyntaxNode::new_root`] for green -//! and red trees respectively. See `examples/s_expressions.rs` for a guided tutorial to `cstree`. +//! 
If you're looking at `cstree`, you're probably looking at or already writing a parser and are considering using +//! concrete syntax trees as its output. We'll talk more about parsing below -- first, let's have a look at what needs +//! to happen to go from input text to a `cstree` syntax tree: +//! +//! 1. Define an enumeration of the types of tokens (like keywords) and nodes (like "an expression") that you want to +//! have in your syntax and implement [`Language`] +//! +//! 2. Create a [`GreenNodeBuilder`](build::GreenNodeBuilder) and call +//! [`start_node`](build::GreenNodeBuilder::start_node), [`token`](build::GreenNodeBuilder::token) and +//! [`finish_node`](build::GreenNodeBuilder::finish_node) from your parser +//! +//! 3. Call [`SyntaxNode::new_root`](syntax::SyntaxNode::new_root) or +//! [`SyntaxNode::new_root_with_resolver`](syntax::SyntaxNode::new_root_with_resolver) with the resulting +//! [`GreenNode`](green::GreenNode) to obtain a syntax tree that you can traverse +//! +//! Let's walk through the motions of parsing a (very) simple language into `cstree` syntax trees. +//! We'll just support addition and subtraction on integers, from which the user is allowed to construct a single, +//! compound expression. They will, however, be allowed to write nested expressions in parentheses, like `1 - (2 + 5)`. +//! +//! ### Defining the language +//! +//! First, we need to list the different part of our language's grammar. +//! We can do that using an `enum` with a unit variant for any terminal and non-terminal. +//! The `enum` needs to be convertible to a `u16`, so we use the `repr` attribute to ensure it uses the correct +//! representation. +//! +//! ```rust,ignore +//! #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +//! #[repr(u16)] +//! enum SyntaxKind { +//! /* Tokens */ +//! Int, // 42 +//! Plus, // + +//! Minus, // - +//! LParen, // ( +//! RParen, // ) +//! /* Nodes */ +//! Expr, +//! Root, +//! } +//! ``` +//! +//! 
Most of these are tokens to lex the input string into, like numbers (`Int`) and operators (`Plus`, `Minus`). +//! We only really need one type of node; expressions. +//! Our syntax tree's root node will have the special kind `Root`, all other nodes will be +//! expressions containing a sequence of arithmetic operations potentially involving further, nested +//! expression nodes. +//! +//! To use our `SyntaxKind`s with `cstree`, we need to tell it how to convert it back to just a number (the +//! `#[repr(u16)]` that we added) by implementing the [`Language`] trait. We can also tell `cstree` about tokens that +//! always have the same text through the `static_text` method on the trait. This is useful for the operators and +//! parentheses, but not possible for numbers, since an integer token may be produced from the input `3`, but also from +//! other numbers like `7` or `12`. We implement `Language` on an empty type, just so we can give it a name. +//! +//! ```rust,ignore +//! #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +//! pub struct Calculator; +//! impl Language for Calculator { +//! // The tokens and nodes we just defined +//! type Kind = SyntaxKind; +//! +//! fn kind_from_raw(raw: RawSyntaxKind) -> Self::Kind { +//! // This just needs to be the inverse of `kind_to_raw`, but could also +//! // be an `impl TryFrom for SyntaxKind` or any other conversion. +//! match raw.0 { +//! 0 => SyntaxKind::Int, +//! 1 => SyntaxKind::Plus, +//! 2 => SyntaxKind::Minus, +//! 3 => SyntaxKind::LParen, +//! 4 => SyntaxKind::RParen, +//! 5 => SyntaxKind::Expr, +//! 6 => SyntaxKind::Root, +//! n => panic!("Unknown raw syntax kind: {n}"), +//! } +//! } +//! +//! fn kind_to_raw(kind: Self::Kind) -> RawSyntaxKind { +//! RawSyntaxKind(kind as u16) +//! } +//! +//! fn static_text(kind: Self::Kind) -> Option<&'static str> { +//! match kind { +//! SyntaxKind::Plus => Some("+"), +//! SyntaxKind::Minus => Some("-"), +//! SyntaxKind::LParen => Some("("), +//! 
SyntaxKind::RParen => Some(")"), +//! _ => None, +//! } +//! } +//! } +//! ``` +//! +//! ### Parsing into a green tree +//! With that out of the way, we can start writing the parser for our expressions. +//! For the purposes of this introduction to `cstree`, I'll assume that there is a lexer that yields the following +//! tokens: +//! +//! ```rust,ignore +//! #[derive(Debug, PartialEq, Eq, Clone, Copy)] +//! pub enum Token<'input> { +//! // Note that number strings are not yet parsed into actual numbers, +//! // we just remember the slice of the input that contains their digits +//! Int(&'input str), +//! Plus, +//! Minus, +//! LParen, +//! RParen, +//! // A special token that indicates that we have reached the end of the file +//! EoF, +//! } +//! ``` +//! +//! A simple lexer that yields such tokens is part of the full `readme` example, but we'll be busy enough with the +//! combination of `cstree` and the actual parser, which we define like this: +//! +//! ```rust,ignore +//! pub struct Parser<'input> { +//! // `Peekable` is a standard library iterator adapter that allows +//! // looking ahead at the next item without removing it from the iterator yet +//! lexer: Peekable>, +//! builder: GreenNodeBuilder<'static, 'static, Calculator>, +//! } +//! +//! impl<'input> Parser<'input> { +//! pub fn new(input: &'input str) -> Self { +//! Self { +//! // we get `peekable` from implementing `Iterator` on `Lexer` +//! lexer: Lexer::new(input).peekable(), +//! builder: GreenNodeBuilder::new(), +//! } +//! } +//! +//! pub fn bump(&mut self) -> Option> { +//! self.lexer.next() +//! } +//! } +//! ``` +//! +//! In contrast to parsers that return abstract syntax trees, with `cstree` the syntax tree nodes +//! for all element in the language grammar will have the same type: [`GreenNode`](green::GreenNode) +//! for the inner ("green") tree and [`SyntaxNode`](syntax::SyntaxNode) for the outer ("red") tree. +//! 
Different kinds of nodes (and tokens) are differentiated by their `SyntaxKind` tag, which we defined above. +//! +//! You can implement many types of parsers with `cstree`. To get a feel for how it works, consider +//! a typical recursive descent parser. With a more traditional AST, one would define different AST +//! structs for struct or function definitions, statements, expressions and so on. Inside the +//! parser, the components of any element, such as all fields of a struct or all statements inside a +//! function, are parsed first and then the parser wraps them in the matching AST type, which is +//! returned from the corresponding parser function. +//! +//! Because `cstree`'s syntax trees are untyped, there is no explicit AST representation that the +//! parser would build. Instead, parsing into a CST using the +//! [`GreenNodeBuilder`](build::GreenNodeBuilder) follows the source code more closely in that you +//! tell `cstree` about each new element you enter and all tokens that the parser consumes. So, for +//! example, to parse a struct definition the parser first "enters" the struct definition node, then +//! parses the `struct` keyword and type name, then parses each field, and finally "finishes" +//! parsing the struct node. +//! +//! The most trivial example is the root node for our parser, which just creates a root node +//! containing the whole expression (we could do without a specific root node if any expression was +//! a node, in particular if we wrapped integer literal tokens inside `Expr` nodes). +//! +//! ```rust,ignore +//! pub fn parse(&mut self) -> Result<(), String> { +//! self.builder.start_node(SyntaxKind::Root); +//! self.parse_expr()?; +//! self.builder.finish_node(); +//! Ok(()) +//! } +//! ``` +//! +//! As there isn't a static AST type to return, the parser is very flexible as to what is part of a +//! node. In the previous example, if the user is adding a new field to the struct and has not yet +//! 
typed the field's type, the CST node for the struct doesn't care if there is no child node for +//! it. Similarly, if the user is deleting fields and the source code currently contains a leftover +//! field name, this additional identifier can be a part of the struct node without any +//! modifications to the syntax tree definition. This property is the key to why CSTs are such a +//! good fit as a lossless input representation, which necessitates the syntax tree to mirror the +//! user-specific layout of whitespace and comments around the AST items. +//! +//! In the parser for our simple expression language, we'll also have to deal with the fact that, +//! when we see a number, the parser doesn't yet know whether there will be additional operations +//! following that number. That is, in the expression `1 + 2`, it can only know that it is parsing +//! a binary operation once it sees the `+`. The event-like model of building trees in `cstree`, +//! however, implies that when reaching the `+`, the parser would have to have already entered an +//! expression node in order for the whole input to be part of the expression. +//! +//! To get around this, `GreenNodeBuilder` provides the +//! [`checkpoint`](build::GreenNodeBuilder::checkpoint) method, which we can call to "remember" the +//! current position in the input. For example, we can create a checkpoint before the parser parses +//! the first `1`. Later, when it sees the following `+`, it can create an `Expr` node for the +//! whole expression using [`start_node_at`](build::GreenNodeBuilder::start_node_at): +//! +//! ```rust,ignore +//! fn parse_lhs(&mut self) -> Result<(), String> { +//! // An expression may start either with a number, or with an opening parenthesis that is +//! // the start of a parenthesized expression +//! let next_token = *self.lexer.peek().unwrap(); +//! match next_token { +//! Token::Int(n) => { +//! self.bump(); +//! self.builder.token(SyntaxKind::Int, n); +//! } +//! Token::LParen => { +//! 
// Wrap the grouped expression inside a node containing it and its parentheses +//! self.builder.start_node(SyntaxKind::Expr); +//! self.bump(); +//! self.builder.static_token(SyntaxKind::LParen); +//! self.parse_expr()?; // Inner expression +//! if self.bump() != Some(Token::RParen) { +//! return Err("Missing ')'".to_string()); +//! } +//! self.builder.static_token(SyntaxKind::RParen); +//! self.builder.finish_node(); +//! } +//! Token::EoF => return Err("Unexpected end of file: expected expression".to_string()), +//! t => return Err(format!("Unexpected start of expression: '{t:?}'")), +//! } +//! Ok(()) +//! } +//! +//! fn parse_expr(&mut self) -> Result<(), String> { +//! // Remember our current position +//! let before_expr = self.builder.checkpoint(); +//! +//! // Parse the start of the expression +//! self.parse_lhs()?; +//! +//! // Check if the expression continues with `+ ` or `- ` +//! let Some(next_token) = self.lexer.peek() else { +//! return Ok(()); +//! }; +//! let op = match *next_token { +//! Token::Plus => SyntaxKind::Plus, +//! Token::Minus => SyntaxKind::Minus, +//! Token::RParen | Token::EoF => return Ok(()), +//! t => return Err(format!("Expected operator, found '{t:?}'")), +//! }; +//! +//! // If so, retroactively wrap the (already parsed) LHS and the following RHS +//! // inside an `Expr` node +//! self.builder.start_node_at(before_expr, SyntaxKind::Expr); +//! self.bump(); +//! self.builder.static_token(op); +//! self.parse_expr()?; // RHS +//! self.builder.finish_node(); +//! Ok(()) +//! } +//! ``` +//! +//! ### Obtaining the parser result +//! +//! Our parser is now capable of parsing our little arithmetic language, but its methods don't +//! return anything. So how do we get our syntax tree out? The answer lies in +//! [`GreenNodeBuilder::finish`](build::GreenNodeBuilder::finish), which finally returns the tree +//! that we have painstakingly constructed. +//! +//! ```rust,ignore +//! impl Parser<'_> { +//! 
pub fn finish(mut self) -> (GreenNode, impl Interner) { +//! assert!(self.lexer.next().map(|t| t == Token::EoF).unwrap_or(true)); +//! let (tree, cache) = self.builder.finish(); +//! (tree, cache.unwrap().into_interner().unwrap()) +//! } +//! } +//! ``` +//! +//! `finish` also returns the cache it used to deduplicate tree nodes and tokens, so you can re-use +//! it for parsing related inputs (e.g., different source files from the same crate may share a lot +//! of common function and type names that can be deduplicated). See `GreenNodeBuilder`'s +//! documentation for more information on this, in particular the `with_cache` and `from_cache` +//! methods. Most importantly for us, we can extract the [`Interner`](interning::Interner) that +//! contains the source text of the tree's tokens from the cache, which we need if we want to look +//! up things like variable names or the value of numbers for our calculator. +//! +//! To work with the syntax tree, you'll want to upgrade it to a [`SyntaxNode`](syntax::SyntaxNode) +//! using [`SyntaxNode::new_root`](syntax::SyntaxNode::new_root). You can also use +//! [`SyntaxNode::new_root_with_resolver`](syntax::SyntaxNode::new_root_with_resolver) to combine +//! tree and interner, which lets you directly retrieve source text and makes the nodes implement +//! `Display` and `Debug`. The same output can be produced from `SyntaxNode`s by calling the +//! `debug` or `display` method with a [`Resolver`](interning::Resolver). To visualize the whole +//! syntax tree, pass `true` for the `recursive` parameter on `debug`, or simply debug-print a +//! [`ResolvedNode`](syntax::ResolvedNode): +//! +//! ```rust,ignore +//! let input = "11 + 2-(5 + 4)"; +//! let mut parser = Parser::new(input); +//! parser.parse().unwrap(); +//! let (tree, interner) = parser.finish(); +//! let root = SyntaxNode::::new_root_with_resolver(tree, interner); +//! dbg!(root); +//! ``` +//! +//! ### Further examples +//! 
The parser we just built is available in full in the runnable `readme` example, which includes some additional code +//! to read expressions from the terminal and evaluate the parsed expressions - have it do a few calculations if you +//! like. +//! There are several more examples in the `examples/` folder in the repository. +//! A good starting point is the `s_expressions` example, which implements a parser for a small S-Expression language +//! with guiding comments. //! //! ## AST Layer //! While `cstree` is built for concrete syntax trees, applications are quite easily able to work with either a CST or //! an AST representation, or freely switch between them. To do so, use `cstree` to build syntax and underlying green -//! tree and provide AST wrappers for your different kinds of nodes. An example of how this is done can be seen [here](https://github.com/rust-analyzer/rust-analyzer/blob/master/crates/syntax/src/ast/generated.rs) and [here](https://github.com/rust-analyzer/rust-analyzer/blob/master/crates/syntax/src/ast/generated/nodes.rs) (note that the latter file is automatically generated by a task). +//! tree and provide AST wrappers for your different kinds of nodes. An example of how this is done can be seen [here](https://github.com/rust-analyzer/rust-analyzer/blob/master/crates/syntax/src/ast/generated.rs) +//! and [here](https://github.com/rust-analyzer/rust-analyzer/blob/master/crates/syntax/src/ast/generated/nodes.rs) +//! (note that the latter file is automatically generated by a task using [`ungrammar`](https://crates.io/crates/ungrammar)). 
#![forbid(missing_debug_implementations, unconditional_recursion)] -#![deny(unsafe_code, missing_docs, future_incompatible)] +#![deny(unsafe_code, future_incompatible)] #![allow(unstable_name_collisions)] // strict provenance - must come after `future_incompatible` to take precedence +#![warn(missing_docs)] +// Docs.rs +#![doc(html_root_url = "https://docs.rs/cstree/0.12.0-rc.0")] +#![cfg_attr(doc_cfg, feature(doc_cfg))] #[allow(unsafe_code)] -mod green; +pub mod green; #[allow(unsafe_code)] -mod syntax; +pub mod syntax; + +#[allow(unsafe_code)] +pub mod interning; #[cfg(feature = "serialize")] mod serde_impls; #[allow(missing_docs)] mod utility_types; -pub mod interning; use std::fmt; -// Reexport types for working with strings. -pub use text_size::{TextLen, TextRange, TextSize}; +/// `RawSyntaxKind` is a type tag for each token or node. +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct RawSyntaxKind(pub u16); -#[doc(inline)] -pub use crate::syntax::*; -pub use crate::{ - green::{Checkpoint, GreenNode, GreenNodeBuilder, GreenNodeChildren, GreenToken, NodeCache, SyntaxKind}, - utility_types::{Direction, NodeOrToken, TokenAtOffset, WalkEvent}, -}; -pub use triomphe::Arc; +/// Typesafe representations of text ranges and sizes. +pub mod text { + pub use crate::syntax::SyntaxText; + pub use text_size::{TextLen, TextRange, TextSize}; +} + +/// A tree builder for the construction of syntax trees. +/// +/// Please refer to the documentation on [`GreenNodeBuilder`](build::GreenNodeBuilder) itself and the ["getting started" +/// section](../index.html#getting-started) from the top-level documentation for an introduction to how to build a +/// syntax tree. +pub mod build { + pub use crate::green::builder::{Checkpoint, GreenNodeBuilder, NodeCache}; +} + +/// A convenient collection of the most used parts of `cstree`. 
+pub mod prelude { + pub use crate::{ + build::GreenNodeBuilder, + green::{GreenNode, GreenToken}, + syntax::{SyntaxElement, SyntaxNode, SyntaxToken}, + Language, RawSyntaxKind, + }; +} + +/// Types for syntax tree traversal / moving through trees. +pub mod traversal { + pub use crate::utility_types::{Direction, WalkEvent}; +} + +/// Utility types. It shouldn't be needed to reference these directly, but they are returned in several places in +/// `cstree` and may come in handy. +pub mod util { + pub use crate::utility_types::{NodeOrToken, TokenAtOffset}; +} + +/// Synchronization primitives. +pub mod sync { + /// An atomically reference counted shared pointer. + /// + /// This is like [`Arc`](std::sync::Arc) in the standard library, but more efficient for how `cstree` stores + /// syntax trees internally. This Arc does not support weak reference counting. + pub use triomphe::Arc; +} /// The `Language` trait is the bridge between the internal `cstree` representation and your /// language's types. @@ -97,13 +441,13 @@ pub use triomphe::Arc; /// impl cstree::Language for Lang { /// type Kind = SyntaxKind; /// -/// fn kind_from_raw(raw: cstree::SyntaxKind) -> Self::Kind { +/// fn kind_from_raw(raw: cstree::RawSyntaxKind) -> Self::Kind { /// assert!(raw.0 <= __LAST as u16); /// unsafe { std::mem::transmute::(raw.0) } /// } /// -/// fn kind_to_raw(kind: Self::Kind) -> cstree::SyntaxKind { -/// cstree::SyntaxKind(kind as u16) +/// fn kind_to_raw(kind: Self::Kind) -> cstree::RawSyntaxKind { +/// cstree::RawSyntaxKind(kind as u16) /// } /// /// fn static_text(kind: Self::Kind) -> Option<&'static str> { @@ -115,29 +459,34 @@ pub use triomphe::Arc; /// } /// } /// ``` +/// +/// [`SyntaxNode`]: crate::syntax::SyntaxNode pub trait Language: Sized + Clone + Copy + fmt::Debug + Eq + Ord + std::hash::Hash { /// A type that represents what items in your Language can be. /// Typically, this is an `enum` with variants such as `Identifier`, `Literal`, ... 
type Kind: Sized + Clone + Copy + fmt::Debug; /// Construct a semantic item kind from the compact representation. - fn kind_from_raw(raw: SyntaxKind) -> Self::Kind; + fn kind_from_raw(raw: RawSyntaxKind) -> Self::Kind; /// Convert a semantic item kind into a more compact representation. - fn kind_to_raw(kind: Self::Kind) -> SyntaxKind; + fn kind_to_raw(kind: Self::Kind) -> RawSyntaxKind; /// Fixed text for a particular syntax kind. - /// /// Implement for kinds that will only ever represent the same text, such as punctuation (like a /// semicolon), keywords (like `fn`), or operators (like `<=`). + /// + /// Indicating tokens that have a `static_text` this way allows `cstree` to store them more efficiently, which makes + /// it faster to add them to a syntax tree and to look up their text. Since there can often be many occurrences + /// of these tokens inside a file, doing so will improve the performance of using `cstree`. fn static_text(kind: Self::Kind) -> Option<&'static str>; } #[doc(hidden)] #[allow(unsafe_code, unused)] pub mod testing { - pub use crate::*; - pub fn parse(_b: &mut super::GreenNodeBuilder, _s: &str) {} + pub use crate::prelude::*; + pub fn parse(_b: &mut GreenNodeBuilder, _s: &str) {} #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] #[repr(u16)] @@ -160,13 +509,13 @@ pub mod testing { impl Language for TestLang { type Kind = TestSyntaxKind; - fn kind_from_raw(raw: SyntaxKind) -> Self::Kind { + fn kind_from_raw(raw: RawSyntaxKind) -> Self::Kind { assert!(raw.0 <= TestSyntaxKind::__LAST as u16); unsafe { std::mem::transmute::(raw.0) } } - fn kind_to_raw(kind: Self::Kind) -> SyntaxKind { - SyntaxKind(kind as u16) + fn kind_to_raw(kind: Self::Kind) -> RawSyntaxKind { + RawSyntaxKind(kind as u16) } fn static_text(kind: Self::Kind) -> Option<&'static str> { diff --git a/src/serde_impls.rs b/src/serde_impls.rs index 204c3a4..ecf0731 100644 --- a/src/serde_impls.rs +++ b/src/serde_impls.rs @@ -1,8 +1,12 @@ //! 
Serialization and Deserialization for syntax trees. use crate::{ - interning::{IntoResolver, Resolver}, - GreenNodeBuilder, Language, NodeOrToken, ResolvedNode, SyntaxKind, SyntaxNode, WalkEvent, + build::GreenNodeBuilder, + interning::{Resolver, TokenKey}, + syntax::{ResolvedNode, SyntaxNode}, + traversal::WalkEvent, + util::NodeOrToken, + Language, RawSyntaxKind, }; use serde::{ de::{Error, SeqAccess, Visitor}, @@ -77,8 +81,8 @@ enum Event<'text> { /// The second parameter indicates if this node needs data. /// If the boolean is true, the next element inside the data list /// must be attached to this node. - EnterNode(SyntaxKind, bool), - Token(SyntaxKind, &'text str), + EnterNode(RawSyntaxKind, bool), + Token(RawSyntaxKind, &'text str), LeaveNode, } @@ -97,7 +101,7 @@ pub(crate) struct SerializeWithData<'node, 'resolver, L: Language, D: 'static, R impl Serialize for SerializeWithData<'_, '_, L, D, R> where L: Language, - R: Resolver + ?Sized, + R: Resolver + ?Sized, D: Serialize, { fn serialize(&self, serializer: S) -> Result @@ -112,7 +116,7 @@ where impl Serialize for SerializeWithResolver<'_, '_, L, D, R> where L: Language, - R: Resolver + ?Sized, + R: Resolver + ?Sized, { fn serialize(&self, serializer: S) -> Result where @@ -192,8 +196,7 @@ where } let (tree, cache) = builder.finish(); - let tree = - ResolvedNode::new_root_with_resolver(tree, cache.unwrap().into_interner().unwrap().into_resolver()); + let tree = ResolvedNode::new_root_with_resolver(tree, cache.unwrap().into_interner().unwrap()); Ok((tree, data_indices)) } } @@ -236,7 +239,7 @@ where } } -impl Serialize for SyntaxKind { +impl Serialize for RawSyntaxKind { fn serialize(&self, serializer: S) -> Result where S: serde::Serializer, @@ -245,7 +248,7 @@ impl Serialize for SyntaxKind { } } -impl<'de> Deserialize<'de> for SyntaxKind { +impl<'de> Deserialize<'de> for RawSyntaxKind { fn deserialize(deserializer: D) -> Result where D: serde::Deserializer<'de>, diff --git a/src/syntax/element.rs 
b/src/syntax/element.rs index 69fd80a..e6d74f9 100644 --- a/src/syntax/element.rs +++ b/src/syntax/element.rs @@ -1,10 +1,14 @@ use std::{fmt, sync::atomic::AtomicU32}; -use lasso::Resolver; use text_size::{TextRange, TextSize}; use super::*; -use crate::{green::GreenElementRef, Language, NodeOrToken, SyntaxKind, TokenAtOffset}; +use crate::{ + green::GreenElementRef, + interning::{Resolver, TokenKey}, + util::{NodeOrToken, TokenAtOffset}, + Language, RawSyntaxKind, +}; /// An element of the tree, can be either a node or a token. pub type SyntaxElement = NodeOrToken, SyntaxToken>; @@ -27,7 +31,7 @@ impl SyntaxElement { /// To avoid allocating for every element, see [`write_display`](type.SyntaxElement.html#method.write_display). pub fn display(&self, resolver: &R) -> String where - R: Resolver + ?Sized, + R: Resolver + ?Sized, { match self { NodeOrToken::Node(it) => it.display(resolver), @@ -38,7 +42,7 @@ impl SyntaxElement { /// Writes this element's [`Display`](fmt::Display) representation into the given `target`. pub fn write_display(&self, resolver: &R, target: &mut impl fmt::Write) -> fmt::Result where - R: Resolver + ?Sized, + R: Resolver + ?Sized, { match self { NodeOrToken::Node(it) => it.write_display(resolver, target), @@ -53,7 +57,7 @@ impl SyntaxElement { /// To avoid allocating for every element, see [`write_debug`](type.SyntaxElement.html#method.write_debug). pub fn debug(&self, resolver: &R, recursive: bool) -> String where - R: Resolver + ?Sized, + R: Resolver + ?Sized, { match self { NodeOrToken::Node(it) => it.debug(resolver, recursive), @@ -66,7 +70,7 @@ impl SyntaxElement { /// Otherwise, only this element's kind and range are written. 
pub fn write_debug(&self, resolver: &R, target: &mut impl fmt::Write, recursive: bool) -> fmt::Result where - R: Resolver + ?Sized, + R: Resolver + ?Sized, { match self { NodeOrToken::Node(it) => it.write_debug(resolver, target, recursive), @@ -105,7 +109,7 @@ impl<'a, L: Language, D> SyntaxElementRef<'a, L, D> { /// To avoid allocating for every element, see [`write_display`](type.SyntaxElementRef.html#method.write_display). pub fn display(&self, resolver: &R) -> String where - R: Resolver + ?Sized, + R: Resolver + ?Sized, { match self { NodeOrToken::Node(it) => it.display(resolver), @@ -116,7 +120,7 @@ impl<'a, L: Language, D> SyntaxElementRef<'a, L, D> { /// Writes this element's [`Display`](fmt::Display) representation into the given `target`. pub fn write_display(&self, resolver: &R, target: &mut impl fmt::Write) -> fmt::Result where - R: Resolver + ?Sized, + R: Resolver + ?Sized, { match self { NodeOrToken::Node(it) => it.write_display(resolver, target), @@ -131,7 +135,7 @@ impl<'a, L: Language, D> SyntaxElementRef<'a, L, D> { /// To avoid allocating for every element, see [`write_debug`](type.SyntaxElementRef.html#method.write_debug). pub fn debug(&self, resolver: &R, recursive: bool) -> String where - R: Resolver + ?Sized, + R: Resolver + ?Sized, { match self { NodeOrToken::Node(it) => it.debug(resolver, recursive), @@ -144,7 +148,7 @@ impl<'a, L: Language, D> SyntaxElementRef<'a, L, D> { /// Otherwise, only this element's kind and range are written. 
pub fn write_debug(&self, resolver: &R, target: &mut impl fmt::Write, recursive: bool) -> fmt::Result where - R: Resolver + ?Sized, + R: Resolver + ?Sized, { match self { NodeOrToken::Node(it) => it.write_debug(resolver, target, recursive), @@ -162,8 +166,8 @@ impl SyntaxElement { ref_count: *mut AtomicU32, ) -> SyntaxElement { match element { - NodeOrToken::Node(node) => SyntaxNode::new_child(node, parent, index as u32, offset, ref_count).into(), - NodeOrToken::Token(_) => SyntaxToken::new(parent, index as u32, offset).into(), + NodeOrToken::Node(node) => SyntaxNode::new_child(node, parent, index, offset, ref_count).into(), + NodeOrToken::Token(_) => SyntaxToken::new(parent, index, offset).into(), } } @@ -178,7 +182,7 @@ impl SyntaxElement { /// The internal representation of the kind of this element. #[inline] - pub fn syntax_kind(&self) -> SyntaxKind { + pub fn syntax_kind(&self) -> RawSyntaxKind { match self { NodeOrToken::Node(it) => it.syntax_kind(), NodeOrToken::Token(it) => it.syntax_kind(), @@ -261,7 +265,7 @@ impl<'a, L: Language, D> SyntaxElementRef<'a, L, D> { /// The internal representation of the kind of this element. 
#[inline] - pub fn syntax_kind(&self) -> SyntaxKind { + pub fn syntax_kind(&self) -> RawSyntaxKind { match self { NodeOrToken::Node(it) => it.syntax_kind(), NodeOrToken::Token(it) => it.syntax_kind(), diff --git a/src/syntax/iter.rs b/src/syntax/iter.rs index 645b709..cb711e9 100644 --- a/src/syntax/iter.rs +++ b/src/syntax/iter.rs @@ -4,7 +4,11 @@ use std::iter::FusedIterator; use text_size::TextSize; -use crate::{green::GreenElementRef, GreenNodeChildren, Language, SyntaxElementRef, SyntaxNode}; +use crate::{ + green::{GreenElementRef, GreenNodeChildren}, + syntax::{SyntaxElementRef, SyntaxNode}, + Language, +}; #[derive(Clone, Debug)] struct Iter<'n> { diff --git a/src/syntax/mod.rs b/src/syntax/mod.rs index 9d6dd4a..dd274eb 100644 --- a/src/syntax/mod.rs +++ b/src/syntax/mod.rs @@ -36,6 +36,7 @@ pub use text::SyntaxText; #[cfg(test)] mod tests { + use super::*; use crate::testing::*; #[test] diff --git a/src/syntax/node.rs b/src/syntax/node.rs index 64eba62..b103761 100644 --- a/src/syntax/node.rs +++ b/src/syntax/node.rs @@ -2,9 +2,12 @@ use super::*; #[cfg(feature = "serialize")] use crate::serde_impls::{SerializeWithData, SerializeWithResolver}; use crate::{ - green::{GreenElementRef, SyntaxKind}, - interning::Resolver, - *, + green::{GreenElementRef, GreenNode}, + interning::{Resolver, TokenKey}, + text::*, + traversal::*, + util::*, + Language, RawSyntaxKind, }; use parking_lot::RwLock; use std::{ @@ -39,7 +42,7 @@ impl SyntaxNode { /// Otherwise, only this node's kind and range are written. 
pub fn write_debug(&self, resolver: &R, target: &mut impl fmt::Write, recursive: bool) -> fmt::Result where - R: Resolver + ?Sized, + R: Resolver + ?Sized, { if recursive { let mut level = 0; @@ -71,7 +74,7 @@ impl SyntaxNode { #[inline] pub fn debug(&self, resolver: &R, recursive: bool) -> String where - R: Resolver + ?Sized, + R: Resolver + ?Sized, { // NOTE: `fmt::Write` methods on `String` never fail let mut res = String::new(); @@ -82,7 +85,7 @@ impl SyntaxNode { /// Writes this node's [`Display`](fmt::Display) representation into the given `target`. pub fn write_display(&self, resolver: &R, target: &mut impl fmt::Write) -> fmt::Result where - R: Resolver + ?Sized, + R: Resolver + ?Sized, { self.preorder_with_tokens() .filter_map(|event| match event { @@ -98,7 +101,7 @@ impl SyntaxNode { #[inline] pub fn display(&self, resolver: &R) -> String where - R: Resolver + ?Sized, + R: Resolver + ?Sized, { // NOTE: `fmt::Write` methods on `String` never fail let mut res = String::new(); @@ -107,21 +110,22 @@ impl SyntaxNode { } /// If there is a resolver associated with this tree, returns it. - pub fn resolver(&self) -> Option<&StdArc> { + pub fn resolver(&self) -> Option<&StdArc>> { match &self.root().data().kind { Kind::Root(_, resolver) => resolver.as_ref(), _ => unreachable!(), } } - /// Turns this node into a [`ResolvedNode`], but only if there is a resolver associated with this tree. + /// Turns this node into a [`ResolvedNode`](crate::syntax::ResolvedNode), but only if there is a resolver associated + /// with this tree. #[inline] pub fn try_resolved(&self) -> Option<&ResolvedNode> { // safety: we only coerce if `resolver` exists self.resolver().map(|_| unsafe { ResolvedNode::coerce_ref(self) }) } - /// Turns this node into a [`ResolvedNode`]. + /// Turns this node into a [`ResolvedNode`](crate::syntax::ResolvedNode). /// # Panics /// If there is no resolver associated with this tree. 
#[inline] @@ -233,7 +237,7 @@ impl Hash for SyntaxNode { } enum Kind { - Root(GreenNode, Option>), + Root(GreenNode, Option>>), Child { parent: SyntaxNode, index: u32, @@ -300,7 +304,7 @@ impl SyntaxNode { Self { data } } - fn make_new_root(green: GreenNode, resolver: Option>) -> Self { + fn make_new_root(green: GreenNode, resolver: Option>>) -> Self { let ref_count = Box::new(AtomicU32::new(1)); let n_children = green.children().count(); let data = NodeData::new( @@ -328,6 +332,8 @@ impl SyntaxNode { /// # Example /// ``` /// # use cstree::testing::*; + /// use cstree::syntax::ResolvedNode; + /// /// let mut builder: GreenNodeBuilder = GreenNodeBuilder::new(); /// builder.start_node(Root); /// builder.token(Identifier, "content"); @@ -342,8 +348,8 @@ impl SyntaxNode { /// assert_eq!(root.text(), "content"); /// ``` #[inline] - pub fn new_root_with_resolver(green: GreenNode, resolver: impl Resolver + 'static) -> ResolvedNode { - let ptr: StdArc = StdArc::new(resolver); + pub fn new_root_with_resolver(green: GreenNode, resolver: impl Resolver + 'static) -> ResolvedNode { + let ptr: StdArc> = StdArc::new(resolver); ResolvedNode { syntax: SyntaxNode::make_new_root(green, Some(ptr)), } @@ -517,7 +523,7 @@ impl SyntaxNode { /// The internal representation of the kind of this node. #[inline] - pub fn syntax_kind(&self) -> SyntaxKind { + pub fn syntax_kind(&self) -> RawSyntaxKind { self.green().kind() } @@ -543,7 +549,7 @@ impl SyntaxNode { #[inline] pub fn resolve_text<'n, 'i, I>(&'n self, resolver: &'i I) -> SyntaxText<'n, 'i, I, L, D> where - I: Resolver + ?Sized, + I: Resolver + ?Sized, { SyntaxText::new(self, resolver) } @@ -911,7 +917,7 @@ where /// including the data and by using an external resolver. 
pub fn as_serialize_with_data_with_resolver<'node>( &'node self, - resolver: &'node impl Resolver, + resolver: &'node impl Resolver, ) -> impl serde::Serialize + 'node where D: serde::Serialize, @@ -923,7 +929,7 @@ where /// which uses the given resolver instead of the resolver inside the tree. pub fn as_serialize_with_resolver<'node>( &'node self, - resolver: &'node impl Resolver, + resolver: &'node impl Resolver, ) -> impl serde::Serialize + 'node { SerializeWithResolver { node: self, resolver } } diff --git a/src/syntax/resolved.rs b/src/syntax/resolved.rs index 504ea4c..6d80072 100644 --- a/src/syntax/resolved.rs +++ b/src/syntax/resolved.rs @@ -9,12 +9,15 @@ use std::{ sync::Arc as StdArc, }; -use lasso::Resolver; use text_size::{TextRange, TextSize}; use crate::{ - Direction, GreenNode, Language, NodeOrToken, SyntaxElementRef, SyntaxKind, SyntaxNode, SyntaxText, SyntaxToken, - TokenAtOffset, WalkEvent, + green::GreenNode, + interning::{Resolver, TokenKey}, + syntax::*, + traversal::*, + util::*, + Language, RawSyntaxKind, }; /// Syntax tree node that is guaranteed to belong to a tree that contains an associated @@ -109,7 +112,7 @@ impl DerefMut for ResolvedToken { /// An element of the tree that is guaranteed to belong to a tree that contains an associated /// [`Resolver`](lasso::Resolver), can be either a node or a token. /// # See also -/// [`SyntaxElement`](crate::SyntaxElement) +/// [`SyntaxElement`](crate::syntax::SyntaxElement) pub type ResolvedElement = NodeOrToken, ResolvedToken>; impl From> for ResolvedElement { @@ -126,7 +129,7 @@ impl From> for ResolvedElement { impl ResolvedElement { #[allow(missing_docs)] - pub fn display(&self, resolver: &impl Resolver) -> String { + pub fn display(&self, resolver: &impl Resolver) -> String { match self { NodeOrToken::Node(it) => it.display(resolver), NodeOrToken::Token(it) => it.display(resolver), @@ -177,7 +180,7 @@ impl ResolvedNode { /// source text covered by this node, i.e. 
the combined text of all token leafs of the subtree /// originating in this node. #[inline] - pub fn text(&self) -> SyntaxText<'_, '_, dyn Resolver, L, D> { + pub fn text(&self) -> SyntaxText<'_, '_, dyn Resolver, L, D> { SyntaxText::new(self, &**self.resolver()) } } @@ -266,13 +269,13 @@ macro_rules! forward_node { impl ResolvedNode { /// Returns the [`Resolver`] associated with this tree. - pub fn resolver(&self) -> &StdArc { + pub fn resolver(&self) -> &StdArc> { self.syntax.resolver().unwrap() } /// See [`SyntaxNode::new_root_with_resolver`]. #[inline] - pub fn new_root_with_resolver(green: GreenNode, resolver: impl Resolver + 'static) -> Self { + pub fn new_root_with_resolver(green: GreenNode, resolver: impl Resolver + 'static) -> Self { SyntaxNode::new_root_with_resolver(green, resolver) } @@ -498,7 +501,7 @@ impl ResolvedNode { impl ResolvedToken { /// Returns the [`Resolver`] associated with this tree. - pub fn resolver(&self) -> &StdArc { + pub fn resolver(&self) -> &StdArc> { self.syntax.resolver().unwrap() } @@ -575,7 +578,7 @@ impl ResolvedElement { /// The internal representation of the kind of this element. #[inline] - pub fn syntax_kind(&self) -> SyntaxKind { + pub fn syntax_kind(&self) -> RawSyntaxKind { match self { NodeOrToken::Node(it) => it.syntax_kind(), NodeOrToken::Token(it) => it.syntax_kind(), @@ -658,7 +661,7 @@ impl<'a, L: Language, D> ResolvedElementRef<'a, L, D> { /// The internal representation of the kind of this element. 
#[inline] - pub fn syntax_kind(&self) -> SyntaxKind { + pub fn syntax_kind(&self) -> RawSyntaxKind { match self { NodeOrToken::Node(it) => it.syntax_kind(), NodeOrToken::Token(it) => it.syntax_kind(), diff --git a/src/syntax/text.rs b/src/syntax/text.rs index 712ce30..c8966ac 100644 --- a/src/syntax/text.rs +++ b/src/syntax/text.rs @@ -2,7 +2,12 @@ use std::fmt; -use crate::{interning::Resolver, Language, SyntaxNode, SyntaxToken, TextRange, TextSize}; +use crate::{ + interning::{Resolver, TokenKey}, + syntax::{SyntaxNode, SyntaxToken}, + text::{TextRange, TextSize}, + Language, +}; /// An efficient representation of the text that is covered by a [`SyntaxNode`], i.e. the combined /// source text of all tokens that are descendants of the node. @@ -14,7 +19,7 @@ use crate::{interning::Resolver, Language, SyntaxNode, SyntaxToken, TextRange, T /// # Example /// ``` /// # use cstree::testing::*; -/// # use cstree::interning::IntoResolver; +/// # use cstree::syntax::ResolvedNode; /// # /// fn parse_float_literal(s: &str) -> ResolvedNode { /// // parsing... 
@@ -23,7 +28,7 @@ use crate::{interning::Resolver, Language, SyntaxNode, SyntaxToken, TextRange, T /// # builder.token(Float, s); /// # builder.finish_node(); /// # let (root, cache) = builder.finish(); -/// # let resolver = cache.unwrap().into_interner().unwrap().into_resolver(); +/// # let resolver = cache.unwrap().into_interner().unwrap(); /// # SyntaxNode::new_root_with_resolver(root, resolver) /// } /// let float_node = parse_float_literal("2.748E2"); @@ -42,7 +47,7 @@ pub struct SyntaxText<'n, 'i, I: ?Sized, L: Language, D: 'static = ()> { resolver: &'i I, } -impl<'n, 'i, I: Resolver + ?Sized, L: Language, D> SyntaxText<'n, 'i, I, L, D> { +impl<'n, 'i, I: Resolver + ?Sized, L: Language, D> SyntaxText<'n, 'i, I, L, D> { pub(crate) fn new(node: &'n SyntaxNode, resolver: &'i I) -> Self { let range = node.text_range(); SyntaxText { node, range, resolver } @@ -203,25 +208,25 @@ fn found(res: Result<(), T>) -> Option { } } -impl fmt::Debug for SyntaxText<'_, '_, I, L, D> { +impl + ?Sized, L: Language, D> fmt::Debug for SyntaxText<'_, '_, I, L, D> { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { fmt::Debug::fmt(&self.to_string(), f) } } -impl fmt::Display for SyntaxText<'_, '_, I, L, D> { +impl + ?Sized, L: Language, D> fmt::Display for SyntaxText<'_, '_, I, L, D> { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { self.try_for_each_chunk(|chunk| fmt::Display::fmt(chunk, f)) } } -impl From> for String { +impl + ?Sized, L: Language, D> From> for String { fn from(text: SyntaxText<'_, '_, I, L, D>) -> String { text.to_string() } } -impl PartialEq for SyntaxText<'_, '_, I, L, D> { +impl + ?Sized, L: Language, D> PartialEq for SyntaxText<'_, '_, I, L, D> { fn eq(&self, mut rhs: &str) -> bool { self.try_for_each_chunk(|chunk| { if !rhs.starts_with(chunk) { @@ -235,19 +240,19 @@ impl PartialEq for SyntaxText<'_, '_, } } -impl PartialEq> for str { +impl + ?Sized, L: Language, D> PartialEq> for str { fn eq(&self, rhs: &SyntaxText<'_, '_, I, L, D>) -> bool { rhs 
== self } } -impl PartialEq<&'_ str> for SyntaxText<'_, '_, I, L, D> { +impl + ?Sized, L: Language, D> PartialEq<&'_ str> for SyntaxText<'_, '_, I, L, D> { fn eq(&self, rhs: &&str) -> bool { self == *rhs } } -impl PartialEq> for &'_ str { +impl + ?Sized, L: Language, D> PartialEq> for &'_ str { fn eq(&self, rhs: &SyntaxText<'_, '_, I, L, D>) -> bool { rhs == self } @@ -258,8 +263,8 @@ impl<'n1, 'i1, 'n2, 'i2, I1, I2, L1, L2, D1, D2> PartialEq + ?Sized, + I2: Resolver + ?Sized, { fn eq(&self, other: &SyntaxText<'_, '_, I2, L2, D2>) -> bool { if self.range.len() != other.range.len() { @@ -282,8 +287,8 @@ fn zip_texts<'it1, 'it2, It1, It2, I1, I2, L1, L2, D1, D2>( where It1: Iterator, TextRange)>, It2: Iterator, TextRange)>, - I1: Resolver + ?Sized, - I2: Resolver + ?Sized, + I1: Resolver + ?Sized, + I2: Resolver + ?Sized, D1: 'static, D2: 'static, L1: Language + 'it1, @@ -309,12 +314,12 @@ where } } -impl Eq for SyntaxText<'_, '_, I, L, D> {} +impl + ?Sized, L: Language, D> Eq for SyntaxText<'_, '_, I, L, D> {} mod private { use std::ops; - use crate::{TextRange, TextSize}; + use crate::text::{TextRange, TextSize}; pub trait SyntaxTextRange { fn start(&self) -> Option; @@ -374,27 +379,27 @@ mod private { #[cfg(test)] mod tests { - use crate::{green::SyntaxKind, GreenNodeBuilder}; + use crate::{build::GreenNodeBuilder, RawSyntaxKind}; use super::*; #[derive(Debug, Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash)] pub enum TestLang {} impl Language for TestLang { - type Kind = SyntaxKind; + type Kind = RawSyntaxKind; - fn kind_from_raw(raw: SyntaxKind) -> Self::Kind { + fn kind_from_raw(raw: RawSyntaxKind) -> Self::Kind { raw } - fn kind_to_raw(kind: Self::Kind) -> SyntaxKind { + fn kind_to_raw(kind: Self::Kind) -> RawSyntaxKind { kind } fn static_text(kind: Self::Kind) -> Option<&'static str> { - if kind == SyntaxKind(1) { + if kind == RawSyntaxKind(1) { Some("{") - } else if kind == SyntaxKind(2) { + } else if kind == RawSyntaxKind(2) { Some("}") } else { None @@ 
-402,16 +407,16 @@ mod tests { } } - fn build_tree(chunks: &[&str]) -> (SyntaxNode, impl Resolver) { + fn build_tree(chunks: &[&str]) -> (SyntaxNode, impl Resolver) { let mut builder: GreenNodeBuilder = GreenNodeBuilder::new(); - builder.start_node(SyntaxKind(62)); + builder.start_node(RawSyntaxKind(62)); for &chunk in chunks.iter() { let kind = match chunk { "{" => 1, "}" => 2, _ => 3, }; - builder.token(SyntaxKind(kind), chunk); + builder.token(RawSyntaxKind(kind), chunk); } builder.finish_node(); let (node, cache) = builder.finish(); diff --git a/src/syntax/token.rs b/src/syntax/token.rs index 661a4d3..6e27f7d 100644 --- a/src/syntax/token.rs +++ b/src/syntax/token.rs @@ -5,11 +5,15 @@ use std::{ sync::Arc as StdArc, }; -use lasso::Resolver; use text_size::{TextRange, TextSize}; use super::*; -use crate::{interning::Key, Direction, GreenNode, GreenToken, Language, SyntaxKind}; +use crate::{ + green::{GreenNode, GreenToken}, + interning::{Resolver, TokenKey}, + traversal::Direction, + Language, RawSyntaxKind, +}; /// Syntax tree token. #[derive(Debug)] @@ -49,7 +53,7 @@ impl SyntaxToken { /// Writes this token's [`Debug`](fmt::Debug) representation into the given `target`. 
pub fn write_debug(&self, resolver: &R, target: &mut impl fmt::Write) -> fmt::Result where - R: Resolver + ?Sized, + R: Resolver + ?Sized, { write!(target, "{:?}@{:?}", self.kind(), self.text_range())?; let text = self.resolve_text(resolver); @@ -72,7 +76,7 @@ impl SyntaxToken { #[inline] pub fn debug(&self, resolver: &R) -> String where - R: Resolver + ?Sized, + R: Resolver + ?Sized, { // NOTE: `fmt::Write` methods on `String` never fail let mut res = String::new(); @@ -84,7 +88,7 @@ impl SyntaxToken { #[inline] pub fn write_display(&self, resolver: &R, target: &mut impl fmt::Write) -> fmt::Result where - R: Resolver + ?Sized, + R: Resolver + ?Sized, { write!(target, "{}", self.resolve_text(resolver)) } @@ -95,25 +99,26 @@ impl SyntaxToken { #[inline] pub fn display(&self, resolver: &R) -> String where - R: Resolver + ?Sized, + R: Resolver + ?Sized, { self.resolve_text(resolver).to_string() } /// If there is a resolver associated with this tree, returns it. #[inline] - pub fn resolver(&self) -> Option<&StdArc> { + pub fn resolver(&self) -> Option<&StdArc>> { self.parent.resolver() } - /// Turns this token into a [`ResolvedToken`], but only if there is a resolver associated with this tree. + /// Turns this token into a [`ResolvedToken`](crate::syntax::ResolvedToken), but only if there is a resolver + /// associated with this tree. #[inline] pub fn try_resolved(&self) -> Option<&ResolvedToken> { // safety: we only coerce if `resolver` exists self.resolver().map(|_| unsafe { ResolvedToken::coerce_ref(self) }) } - /// Turns this token into a [`ResolvedToken`]. + /// Turns this token into a [`ResolvedToken`](crate::syntax::ResolvedToken). /// # Panics /// If there is no resolver associated with this tree. #[inline] @@ -153,7 +158,7 @@ impl SyntaxToken { /// The internal representation of the kind of this token. 
#[inline] - pub fn syntax_kind(&self) -> SyntaxKind { + pub fn syntax_kind(&self) -> RawSyntaxKind { self.green().kind() } @@ -176,7 +181,7 @@ impl SyntaxToken { #[inline] pub fn resolve_text<'i, I>(&self, resolver: &'i I) -> &'i str where - I: Resolver + ?Sized, + I: Resolver + ?Sized, { // one of the two must be present upon construction self.static_text().or_else(|| self.green().text(resolver)).unwrap() @@ -191,6 +196,7 @@ impl SyntaxToken { /// /// ``` /// # use cstree::testing::*; + /// # use cstree::build::*; /// let mut builder: GreenNodeBuilder = GreenNodeBuilder::new(); /// # builder.start_node(Root); /// # builder.token(Identifier, "x"); @@ -278,18 +284,18 @@ impl SyntaxToken { /// implementation by re-using the interner in both. /// ``` /// # use cstree::testing::*; - /// use cstree::interning::{new_interner, Hasher, Key, Rodeo}; + /// use cstree::interning::{new_interner, TokenInterner, TokenKey}; /// struct TypeTable { /// // ... /// } /// impl TypeTable { - /// fn type_of(&self, ident: Key) -> &str { + /// fn type_of(&self, ident: TokenKey) -> &str { /// // ... 
/// # "" /// } /// } /// # struct State { - /// # interner: Rodeo, + /// # interner: TokenInterner, /// # type_table: TypeTable, /// # } /// let interner = new_interner(); @@ -297,7 +303,7 @@ impl SyntaxToken { /// interner, /// type_table: TypeTable{ /* stuff */}, /// }; - /// let mut builder: GreenNodeBuilder = + /// let mut builder: GreenNodeBuilder = /// GreenNodeBuilder::with_interner(&mut state.interner); /// # let input = ""; /// # builder.start_node(Root); @@ -315,7 +321,7 @@ impl SyntaxToken { /// let typ = type_table.type_of(ident.text_key().unwrap()); /// ``` #[inline] - pub fn text_key(&self) -> Option { + pub fn text_key(&self) -> Option { self.green().text_key() } diff --git a/src/utility_types.rs b/src/utility_types.rs index 77b770c..80ef3f4 100644 --- a/src/utility_types.rs +++ b/src/utility_types.rs @@ -109,7 +109,7 @@ impl std::ops::Deref for MaybeOwned<'_, T> { fn deref(&self) -> &T { match self { MaybeOwned::Owned(it) => it, - MaybeOwned::Borrowed(it) => *it, + MaybeOwned::Borrowed(it) => it, } } } @@ -118,7 +118,7 @@ impl std::ops::DerefMut for MaybeOwned<'_, T> { fn deref_mut(&mut self) -> &mut T { match self { MaybeOwned::Owned(it) => it, - MaybeOwned::Borrowed(it) => *it, + MaybeOwned::Borrowed(it) => it, } } } diff --git a/tests/it/basic.rs b/tests/it/basic.rs index 176198e..0e528fe 100644 --- a/tests/it/basic.rs +++ b/tests/it/basic.rs @@ -1,6 +1,10 @@ use super::*; -use cstree::{GreenNodeBuilder, NodeCache, SyntaxKind, TextRange}; -use lasso::{Resolver, Rodeo}; +use cstree::{ + build::{GreenNodeBuilder, NodeCache}, + interning::{new_interner, Resolver}, + text::TextRange, + RawSyntaxKind, +}; fn build_tree(root: &Element<'_>) -> (SyntaxNode, impl Resolver) { let mut builder: GreenNodeBuilder = GreenNodeBuilder::new(); @@ -31,20 +35,20 @@ fn tree_with_eq_tokens() -> Element<'static> { fn create() { let tree = two_level_tree(); let (tree, resolver) = build_tree::<()>(&tree); - assert_eq!(tree.syntax_kind(), SyntaxKind(0)); - 
assert_eq!(tree.kind(), SyntaxKind(0)); + assert_eq!(tree.syntax_kind(), RawSyntaxKind(0)); + assert_eq!(tree.kind(), RawSyntaxKind(0)); { let leaf1_0 = tree.children().nth(1).unwrap().children_with_tokens().next().unwrap(); let leaf1_0 = leaf1_0.into_token().unwrap(); - assert_eq!(leaf1_0.syntax_kind(), SyntaxKind(5)); - assert_eq!(leaf1_0.kind(), SyntaxKind(5)); + assert_eq!(leaf1_0.syntax_kind(), RawSyntaxKind(5)); + assert_eq!(leaf1_0.kind(), RawSyntaxKind(5)); assert_eq!(leaf1_0.resolve_text(&resolver), "1.0"); assert_eq!(leaf1_0.text_range(), TextRange::at(6.into(), 3.into())); } { let node2 = tree.children().nth(2).unwrap(); - assert_eq!(node2.syntax_kind(), SyntaxKind(6)); - assert_eq!(node2.kind(), SyntaxKind(6)); + assert_eq!(node2.syntax_kind(), RawSyntaxKind(6)); + assert_eq!(node2.kind(), RawSyntaxKind(6)); assert_eq!(node2.children_with_tokens().count(), 3); assert_eq!(node2.resolve_text(&resolver), "2.02.12.2"); } @@ -54,7 +58,7 @@ fn create() { fn token_text_eq() { let tree = tree_with_eq_tokens(); let (tree, _) = build_tree::<()>(&tree); - assert_eq!(tree.kind(), SyntaxKind(0)); + assert_eq!(tree.kind(), RawSyntaxKind(0)); let leaf0_0 = tree.children().next().unwrap().children_with_tokens().next().unwrap(); let leaf0_0 = leaf0_0.into_token().unwrap(); @@ -115,7 +119,7 @@ fn data() { #[test] fn with_interner() { - let mut interner = Rodeo::new(); + let mut interner = new_interner(); let mut cache = NodeCache::with_interner(&mut interner); let tree = two_level_tree(); let tree = build_tree_with_cache(&tree, &mut cache); @@ -135,7 +139,7 @@ fn with_interner() { #[test] fn inline_resolver() { - let mut interner = Rodeo::new(); + let mut interner = new_interner(); let mut cache = NodeCache::with_interner(&mut interner); let tree = two_level_tree(); let tree = build_tree_with_cache(&tree, &mut cache); @@ -146,7 +150,7 @@ fn inline_resolver() { assert_eq!(leaf1_0.text(), "1.0"); assert_eq!(leaf1_0.text_range(), TextRange::at(6.into(), 3.into())); 
assert_eq!(format!("{}", leaf1_0), leaf1_0.text()); - assert_eq!(format!("{:?}", leaf1_0), "SyntaxKind(5)@6..9 \"1.0\""); + assert_eq!(format!("{:?}", leaf1_0), "RawSyntaxKind(5)@6..9 \"1.0\""); } { let node2 = tree.children().nth(2).unwrap(); @@ -154,13 +158,13 @@ fn inline_resolver() { let resolver = node2.resolver(); assert_eq!(node2.resolve_text(resolver.as_ref()), node2.text()); assert_eq!(format!("{}", node2).as_str(), node2.text()); - assert_eq!(format!("{:?}", node2), "SyntaxKind(6)@9..18"); + assert_eq!(format!("{:?}", node2), "RawSyntaxKind(6)@9..18"); assert_eq!( format!("{:#?}", node2), - r#"SyntaxKind(6)@9..18 - SyntaxKind(7)@9..12 "2.0" - SyntaxKind(8)@12..15 "2.1" - SyntaxKind(9)@15..18 "2.2" + r#"RawSyntaxKind(6)@9..18 + RawSyntaxKind(7)@9..12 "2.0" + RawSyntaxKind(8)@12..15 "2.1" + RawSyntaxKind(9)@15..18 "2.2" "# ); } @@ -175,7 +179,7 @@ fn assert_debug_display() { f::(); f::(); f::>(); - f::>(); + f::>(); fn dbg() {} dbg::>(); diff --git a/tests/it/main.rs b/tests/it/main.rs index 78de746..c3777c3 100644 --- a/tests/it/main.rs +++ b/tests/it/main.rs @@ -4,18 +4,22 @@ mod sendsync; #[cfg(feature = "serialize")] mod serde; -use cstree::{GreenNode, GreenNodeBuilder, Language, NodeCache, SyntaxKind}; -use lasso::Interner; +use cstree::{ + build::{GreenNodeBuilder, NodeCache}, + green::GreenNode, + interning::Interner, + Language, RawSyntaxKind, +}; -pub type SyntaxNode = cstree::SyntaxNode; -pub type SyntaxToken = cstree::SyntaxToken; -pub type SyntaxElement = cstree::SyntaxElement; -pub type SyntaxElementRef<'a, D = ()> = cstree::SyntaxElementRef<'a, TestLang, D>; +pub type SyntaxNode = cstree::syntax::SyntaxNode; +pub type SyntaxToken = cstree::syntax::SyntaxToken; +pub type SyntaxElement = cstree::syntax::SyntaxElement; +pub type SyntaxElementRef<'a, D = ()> = cstree::syntax::SyntaxElementRef<'a, TestLang, D>; -pub type ResolvedNode = cstree::ResolvedNode; -pub type ResolvedToken = cstree::ResolvedToken; -pub type ResolvedElement = 
cstree::ResolvedElement; -pub type ResolvedElementRef<'a, D = ()> = cstree::ResolvedElementRef<'a, TestLang, D>; +pub type ResolvedNode = cstree::syntax::ResolvedNode; +pub type ResolvedToken = cstree::syntax::ResolvedToken; +pub type ResolvedElement = cstree::syntax::ResolvedElement; +pub type ResolvedElementRef<'a, D = ()> = cstree::syntax::ResolvedElementRef<'a, TestLang, D>; #[derive(Debug)] pub enum Element<'s> { @@ -26,13 +30,13 @@ pub enum Element<'s> { #[derive(Debug, Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash)] pub enum TestLang {} impl Language for TestLang { - type Kind = SyntaxKind; + type Kind = RawSyntaxKind; - fn kind_from_raw(raw: SyntaxKind) -> Self::Kind { + fn kind_from_raw(raw: RawSyntaxKind) -> Self::Kind { raw } - fn kind_to_raw(kind: Self::Kind) -> SyntaxKind { + fn kind_to_raw(kind: Self::Kind) -> RawSyntaxKind { kind } @@ -41,7 +45,7 @@ impl Language for TestLang { } } -pub fn build_tree_with_cache<'c, 'i, I>(root: &Element<'_>, cache: &'c mut NodeCache<'i, I>) -> GreenNode +pub fn build_tree_with_cache(root: &Element<'_>, cache: &mut NodeCache<'_, I>) -> GreenNode where I: Interner, { @@ -52,25 +56,21 @@ where node } -pub fn build_recursive<'c, 'i, L, I>( - root: &Element<'_>, - builder: &mut GreenNodeBuilder<'c, 'i, L, I>, - mut from: u16, -) -> u16 +pub fn build_recursive(root: &Element<'_>, builder: &mut GreenNodeBuilder<'_, '_, L, I>, mut from: u16) -> u16 where - L: Language, + L: Language, I: Interner, { match root { Element::Node(children) => { - builder.start_node(SyntaxKind(from)); + builder.start_node(RawSyntaxKind(from)); for child in children { from = build_recursive(child, builder, from + 1); } builder.finish_node(); } Element::Token(text) => { - builder.token(SyntaxKind(from), *text); + builder.token(RawSyntaxKind(from), text); } } from diff --git a/tests/it/regressions.rs b/tests/it/regressions.rs index 3ad3cdd..12d5294 100644 --- a/tests/it/regressions.rs +++ b/tests/it/regressions.rs @@ -3,7 +3,7 @@ fn 
empty_tree_arc() { // this test is not here for the test itself, but to run it through MIRI, who complained about out-of-bound // `ThinArc` pointers for a root `GreenNode` with no children - use cstree::*; + use cstree::{build::GreenNodeBuilder, syntax::SyntaxNode}; #[allow(non_camel_case_types)] #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] #[repr(u16)] @@ -16,13 +16,13 @@ fn empty_tree_arc() { // ... type Kind = SyntaxKind; - fn kind_from_raw(raw: cstree::SyntaxKind) -> Self::Kind { + fn kind_from_raw(raw: cstree::RawSyntaxKind) -> Self::Kind { assert!(raw.0 <= SyntaxKind::Root as u16); unsafe { std::mem::transmute::(raw.0) } } - fn kind_to_raw(kind: Self::Kind) -> cstree::SyntaxKind { - cstree::SyntaxKind(kind as u16) + fn kind_to_raw(kind: Self::Kind) -> cstree::RawSyntaxKind { + cstree::RawSyntaxKind(kind as u16) } fn static_text(_kind: Self::Kind) -> Option<&'static str> { diff --git a/tests/it/sendsync.rs b/tests/it/sendsync.rs index a41a6b4..ad501e0 100644 --- a/tests/it/sendsync.rs +++ b/tests/it/sendsync.rs @@ -4,13 +4,29 @@ use crossbeam_utils::thread::scope; use std::{thread, time::Duration}; use super::{build_recursive, Element, ResolvedNode, SyntaxNode, TestLang}; -use cstree::{interning::IntoResolver, GreenNodeBuilder}; +use cstree::build::GreenNodeBuilder; + +// Excercise the multi-threaded interner when the corresponding feature is enabled. 
+ +#[cfg(feature = "multi_threaded_interning")] +use cstree::interning::{new_threaded_interner, MultiThreadedTokenInterner}; + +#[cfg(not(feature = "multi_threaded_interning"))] +fn get_builder() -> GreenNodeBuilder<'static, 'static, TestLang> { + GreenNodeBuilder::new() +} + +#[cfg(feature = "multi_threaded_interning")] +fn get_builder() -> GreenNodeBuilder<'static, 'static, TestLang, MultiThreadedTokenInterner> { + let interner = new_threaded_interner(); + GreenNodeBuilder::from_interner(interner) +} fn build_tree(root: &Element<'_>) -> ResolvedNode { - let mut builder: GreenNodeBuilder = GreenNodeBuilder::new(); + let mut builder = get_builder(); build_recursive(root, &mut builder, 0); let (node, cache) = builder.finish(); - SyntaxNode::new_root_with_resolver(node, cache.unwrap().into_interner().unwrap().into_resolver()) + SyntaxNode::new_root_with_resolver(node, cache.unwrap().into_interner().unwrap()) } fn two_level_tree() -> Element<'static> { diff --git a/tests/it/serde.rs b/tests/it/serde.rs index e7358f1..f8432b2 100644 --- a/tests/it/serde.rs +++ b/tests/it/serde.rs @@ -2,8 +2,9 @@ use crate::{build_recursive, build_tree_with_cache, ResolvedNode}; use super::{Element, SyntaxNode, TestLang}; use cstree::{ - interning::{new_interner, IntoResolver}, - GreenNodeBuilder, NodeCache, NodeOrToken, + build::{GreenNodeBuilder, NodeCache}, + interning::new_interner, + util::NodeOrToken, }; use serde_test::Token; use std::fmt; @@ -227,7 +228,7 @@ fn build_tree(root: Element<'_>) -> ResolvedNode { let mut builder: GreenNodeBuilder = GreenNodeBuilder::new(); build_recursive(&root, &mut builder, 0); let (node, cache) = builder.finish(); - SyntaxNode::new_root_with_resolver(node, cache.unwrap().into_interner().unwrap().into_resolver()) + SyntaxNode::new_root_with_resolver(node, cache.unwrap().into_interner().unwrap()) } fn attach_data(node: &SyntaxNode) {