From d602fe22eb4459b67b1c0136299a3851822da745 Mon Sep 17 00:00:00 2001 From: DQ Date: Fri, 1 Nov 2024 13:47:49 +0100 Subject: [PATCH] Support backtracking and `Checkpoint`s across nodes (#68) * Add [`GreenTreeBuilder::revert`] to support backtracking parsers Rowan, and hence CSTree, is designed around hand-written parsers. In particular, the APIs for *building* trees require that each token is recorded only once. Some parsers, and especially parser combinators, use backtracking instead, where the same token may be seen multiple times. To support this, add a new `revert` function which discards all tokens seen since the last checkpoint. * allow checkpointing across nodes (within reason) * clean up asserts and expand documentation * add Changelog entry * prepare v0.12.2 release --------- Co-authored-by: jyn Co-authored-by: Domenic Quirl --- CHANGELOG.md | 7 + Cargo.toml | 2 +- cstree/Cargo.toml | 2 +- cstree/src/green/builder.rs | 152 ++++++++++++++++-- cstree/src/lib.rs | 2 +- cstree/tests/it/main.rs | 35 ++++- cstree/tests/it/rollback.rs | 306 ++++++++++++++++++++++++++++++++++++ rustfmt.toml | 8 +- 8 files changed, 491 insertions(+), 23 deletions(-) create mode 100644 cstree/tests/it/rollback.rs diff --git a/CHANGELOG.md b/CHANGELOG.md index 4423b8f..3aedcf6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,13 @@ ## Unreleased +## `v0.12.2` + + * `Checkpoint`s for the `GreenNodeBuilder` can now be used across node boundaries, meaning you can use them to wrap (finished) nodes in addition to just tokens. + * A new method `Checkpoint::revert_to` has been added which resets a `GreenNodeBuilder` to the state it was in when the checkpoint was taken, allowing a parser to backtrack to the checkpoint. + +## `v0.12.1` + * Implement `Hash` and `Eq` for `ResolvedNode` and `ResolvedToken` ## `v0.12.0` diff --git a/Cargo.toml b/Cargo.toml index 3b7c718..c5ff1d0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,7 +8,7 @@ resolver = "2" [workspace.package] edition = "2021" -version = "0.12.1" # when updating, also update `#![doc(html_root_url)]` and any inter-crate dependencies (such as `cstree`'s dependency on `cstree-derive`) +version = "0.12.2" # when updating, also update `#![doc(html_root_url)]` and any inter-crate dependencies (such as `cstree`'s dependency on `cstree-derive`) authors = [ "Domenic Quirl ", "Aleksey Kladov ", diff --git a/cstree/Cargo.toml b/cstree/Cargo.toml index 56ec836..08cc158 100644 --- a/cstree/Cargo.toml +++ b/cstree/Cargo.toml @@ -25,7 +25,7 @@ indexmap = "2.4.0" [dependencies.cstree_derive] path = "../cstree-derive" -version = "0.12.1" # must match the `cstree` version in the virtual workspace manifest +version = "0.12.2" # must match the `cstree` version in the virtual workspace manifest optional = true [dependencies.lasso] diff --git a/cstree/src/green/builder.rs b/cstree/src/green/builder.rs index cf4e967..2a6636b 100644 --- a/cstree/src/green/builder.rs +++ b/cstree/src/green/builder.rs @@ -242,7 +242,10 @@ where /// A checkpoint for maybe wrapping a node. See [`GreenNodeBuilder::checkpoint`] for details. #[derive(Clone, Copy, Debug)] -pub struct Checkpoint(usize); +pub struct Checkpoint { + parent_idx: usize, + child_idx: usize, +} /// A builder for green trees. /// Construct with [`new`](GreenNodeBuilder::new), [`with_cache`](GreenNodeBuilder::with_cache), or @@ -442,10 +445,12 @@ where self.children.push(node.into()); } - /// Prepare for maybe wrapping the next node with a surrounding node. + /// Take a snapshot of the builder's state, which can be used to retroactively insert surrounding nodes by calling + /// [`start_node_at`](GreenNodeBuilder::start_node_at), or for backtracking by allowing to + /// [`revert_to`](GreenNodeBuilder::revert_to) the returned checkpoint. /// - /// The way wrapping works is that you first get a checkpoint, then you add nodes and tokens as - /// normal, and then you *maybe* call [`start_node_at`](GreenNodeBuilder::start_node_at). + /// Checkpoints are _invalidated_ and can no longer be used if nodes older than (started before) the checkpoint are + /// finished, and also if the builder is reverted past them to an even earlier checkpoint. /// /// # Examples /// ``` @@ -469,27 +474,144 @@ where /// ``` #[inline] pub fn checkpoint(&self) -> Checkpoint { - Checkpoint(self.children.len()) + Checkpoint { + parent_idx: self.parents.len(), + child_idx: self.children.len(), + } } - /// Wrap the previous branch marked by [`checkpoint`](GreenNodeBuilder::checkpoint) in a new - /// branch and make it current. - #[inline] - pub fn start_node_at(&mut self, checkpoint: Checkpoint, kind: S) { - let Checkpoint(checkpoint) = checkpoint; + /// Restore the builder's state to when the [`Checkpoint`] was taken. + /// + /// This is useful for backtracking parsers. It will delete any nodes that were started but not finished since the + /// checkpoint was taken, as well as any tokens that were recorded since. Checkpoints can be nested, but reverting + /// to a checkpoint invalidates all other checkpoints that were taken after it. + /// + /// ## Panics + /// + /// When using checkpoints, calls to [`start_node`](GreenNodeBuilder::start_node) and + /// [`finish_node`](GreenNodeBuilder::finish_node) must always be balanced in between the call + /// to [`checkpoint`](GreenNodeBuilder::checkpoint) and a call to either `revert_to` or + /// [`start_node_at`](GreenNodeBuilder::start_node_at). + /// Since it is impossible to revert the finishing of any node started before taking the checkpoint, this function + /// will panic if it detects that this is the case. + /// + /// Example: + /// ```rust + /// # use cstree::testing::*; + /// # use cstree::build::GreenNodeBuilder; + /// # struct Parser; + /// # impl Parser { + /// # fn peek(&self) -> Option { None } + /// # fn parse_expr(&mut self) {} + /// # } + /// # let mut builder: GreenNodeBuilder = GreenNodeBuilder::new(); + /// # let mut parser = Parser; + /// let checkpoint = builder.checkpoint(); + /// parser.parse_expr(); + /// if let Some(Plus) = parser.peek() { + /// // 1 + 2 = Add(1, 2) + /// builder.start_node_at(checkpoint, Operation); + /// parser.parse_expr(); + /// builder.finish_node(); + /// } else { + /// builder.revert_to(checkpoint); + /// } + /// ``` + pub fn revert_to(&mut self, checkpoint: Checkpoint) { + let Checkpoint { parent_idx, child_idx } = checkpoint; + // NOTE: This doesn't catch scenarios where we've read more tokens since the previous revert + // (see test `misuse3`), but it'll catch simple cases of reverting to the past and then + // trying to revert to the future (see tests `misuse` and `misuse2`). + // + // See `start_node_at` for a general explanation of the conditions here. As opposed to `start_node_at`, + // unfinished nodes are fine here (since we're going to revert them), so a `parent_idx` less than + // `self.parents.len()` is valid. assert!( - checkpoint <= self.children.len(), - "checkpoint no longer valid, was finish_node called early?" + parent_idx <= self.parents.len(), + "checkpoint no longer valid, was `finish_node` called early or did you already `revert_to`?" + ); + assert!( + child_idx <= self.children.len(), + "checkpoint no longer valid after reverting to an earlier checkpoint" ); - if let Some(&(_, first_child)) = self.parents.last() { assert!( - checkpoint >= first_child, + child_idx >= first_child, "checkpoint no longer valid, was an unmatched start_node_at called?" ); } - self.parents.push((kind, checkpoint)); + self.parents.truncate(parent_idx); + self.children.truncate(child_idx); + } + + /// Start a node at the given [`checkpoint`](GreenNodeBuilder::checkpoint), wrapping all nodes + /// and tokens created since the checkpoint was taken. + /// + /// ## Panics + /// + /// When using checkpoints, calls to [`start_node`](GreenNodeBuilder::start_node) and + /// [`finish_node`](GreenNodeBuilder::finish_node) must always be balanced in between the call + /// to [`checkpoint`](GreenNodeBuilder::checkpoint) and a call to either + /// [`revert_to`](GreenNodeBuilder::revert_to) or `start_node_at`: + /// + /// It is impossible to start a node retroactively if one or more nodes started before taking + /// the checkpoint have already been finished when trying to call this function. This function + /// will panic if it detects that this is the case. + /// + /// This function will also panic if one or more nodes were started _after_ the checkpoint was + /// taken and were not yet finished, which would leave these nodes simultaneously inside and + /// outside the newly started node (neither node would contain the other). + #[inline] + pub fn start_node_at(&mut self, checkpoint: Checkpoint, kind: S) { + let Checkpoint { parent_idx, child_idx } = checkpoint; + // NOTE: Due to the requirement to balance `start_node` and `finish_node` in between the calls to `checkpoint` + // and this function, there must be exactly the same number of `self.parents` now as there were when the + // checkpoint was created (`start_node` creates a new parent and `finish_node` moves it to `self.children`). + // + // We assert both inequalities separately to give better error messages: if the checkpoint points past the end + // of `self.parents`, this indicates that a node outside the scope of the checkpoint was already finished (or + // reverted), while there being parents past the checkpointed index indicates that new nodes were started that + // were left unfinished. + assert!( + parent_idx <= self.parents.len(), + "checkpoint no longer valid, was `finish_node` called early or did you already `revert_to`?" + ); + assert!( + parent_idx >= self.parents.len(), + "checkpoint contains one or more unfinished nodes" + ); + + // NOTE: The checkpoint can only point past the end of children if there were children on the stack when it was + // taken that got removed before it was used. Since any nodes started after taking the checkpoint will + // point to a `first_child` >= the checkpoint's `child_idx`, they cannot remove such children when they + // are finished. The are only two ways to remove them: + // 1. to finish a node which was already started at the time of taking the checkpoint and which may thus point + // to an earlier child as its `first_child`, or + // 2. to `revert_to` an earlier checkpoint before trying to start a new node at this one. + // + // Since we require calls to `start_node` and `finish_node` to be balanced between taking and using the + // checkpoint, (1) will be caught by the assert against the `parent_idx` above. This will also catch (2) in + // cases where at least one node is started between the earlier checkpoint and this one. However, it may be the + // case that only tokens are recorded in between the two, in which case we need to check the `child_idx` too. + assert!( + child_idx <= self.children.len(), + "checkpoint no longer valid after reverting to an earlier checkpoint" + ); + + // NOTE: When `start_node` is called after `checkpoint`, the new node gets pushed to `parents`. + // However, `start_node` and `finish_node` must be balanced inbetween the calls to `checkpoint` and + // `start_node_at`, and `finish_node` causes the topmost parent to become a child instead. + // Therefore, the topmost parent _at the time of calling `start_node_at`_ must still be pointing + // "behind" the checkpoint in the list of children. + if let Some(&(_, first_child)) = self.parents.last() { + assert!( + child_idx >= first_child, + "checkpoint no longer valid, was an unmatched `start_node` or `start_node_at` called?" + ); + } + + self.parents.push((kind, child_idx)); } /// Complete building the tree. diff --git a/cstree/src/lib.rs b/cstree/src/lib.rs index 7b789a6..c79e70f 100644 --- a/cstree/src/lib.rs +++ b/cstree/src/lib.rs @@ -93,7 +93,7 @@ )] #![warn(missing_docs)] // Docs.rs -#![doc(html_root_url = "https://docs.rs/cstree/0.12.1")] +#![doc(html_root_url = "https://docs.rs/cstree/0.12.2")] #![cfg_attr(doc_cfg, feature(doc_cfg))] pub mod getting_started; diff --git a/cstree/tests/it/main.rs b/cstree/tests/it/main.rs index a952a23..024bc73 100644 --- a/cstree/tests/it/main.rs +++ b/cstree/tests/it/main.rs @@ -1,5 +1,6 @@ mod basic; mod regressions; +mod rollback; mod sendsync; #[cfg(feature = "serialize")] mod serde; @@ -7,7 +8,8 @@ mod serde; use cstree::{ build::{GreenNodeBuilder, NodeCache}, green::GreenNode, - interning::Interner, + interning::{Interner, Resolver}, + util::NodeOrToken, RawSyntaxKind, Syntax, }; @@ -78,3 +80,34 @@ where } from } + +#[track_caller] +pub fn assert_tree_eq( + (left, left_res): (&SyntaxNode, &impl Resolver), + (right, right_res): (&SyntaxNode, &impl Resolver), +) { + if left.green() == right.green() { + return; + } + + if left.kind() != right.kind() || left.children_with_tokens().len() != right.children_with_tokens().len() { + panic!("{} !=\n{}", left.debug(left_res, true), right.debug(right_res, true)) + } + + for elem in left.children_with_tokens().zip(right.children_with_tokens()) { + match elem { + (NodeOrToken::Node(ln), NodeOrToken::Node(rn)) => assert_tree_eq((ln, left_res), (rn, right_res)), + (NodeOrToken::Node(n), NodeOrToken::Token(t)) => { + panic!("{} != {}", n.debug(left_res, true), t.debug(right_res)) + } + (NodeOrToken::Token(t), NodeOrToken::Node(n)) => { + panic!("{} != {}", t.debug(left_res), n.debug(right_res, true)) + } + (NodeOrToken::Token(lt), NodeOrToken::Token(rt)) => { + if lt.syntax_kind() != rt.syntax_kind() || lt.resolve_text(left_res) != rt.resolve_text(right_res) { + panic!("{} != {}", lt.debug(left_res), rt.debug(right_res)) + } + } + } + } +} diff --git a/cstree/tests/it/rollback.rs b/cstree/tests/it/rollback.rs new file mode 100644 index 0000000..c826ce5 --- /dev/null +++ b/cstree/tests/it/rollback.rs @@ -0,0 +1,306 @@ +use super::*; +use cstree::interning::Resolver; + +type GreenNodeBuilder<'cache, 'interner> = cstree::build::GreenNodeBuilder<'cache, 'interner, SyntaxKind>; + +fn with_builder(f: impl FnOnce(&mut GreenNodeBuilder)) -> (SyntaxNode, impl Resolver) { + let mut builder = GreenNodeBuilder::new(); + f(&mut builder); + let (node, cache) = builder.finish(); + (SyntaxNode::new_root(node), cache.unwrap().into_interner().unwrap()) +} + +#[test] +#[should_panic = "`left == right` failed"] +fn comparison_works() { + let (first, res1) = with_builder(|_| {}); + let (second, res2) = with_builder(|builder| { + builder.start_node(SyntaxKind(0)); + builder.token(SyntaxKind(1), "hi"); + builder.finish_node(); + }); + assert_tree_eq((&first, &res1), (&second, &res2)); +} + +#[test] +fn no_rollback_token() { + let (first, res1) = with_builder(|builder| { + builder.start_node(SyntaxKind(0)); + builder.token(SyntaxKind(1), "hi"); + builder.finish_node(); + }); + let (second, res2) = with_builder(|builder| { + let checkpoint = builder.checkpoint(); + builder.token(SyntaxKind(1), "hi"); + builder.start_node_at(checkpoint, SyntaxKind(0)); + builder.finish_node(); + }); + assert_tree_eq((&first, &res1), (&second, &res2)); +} + +#[test] +fn no_rollback_node() { + let (first, res1) = with_builder(|builder| { + builder.start_node(SyntaxKind(2)); + builder.start_node(SyntaxKind(0)); + builder.token(SyntaxKind(1), "hi"); + builder.finish_node(); + builder.finish_node(); + }); + let (second, res2) = with_builder(|builder| { + let checkpoint = builder.checkpoint(); + builder.start_node(SyntaxKind(0)); + builder.token(SyntaxKind(1), "hi"); + builder.finish_node(); + builder.start_node_at(checkpoint, SyntaxKind(2)); + builder.finish_node(); + }); + assert_tree_eq((&first, &res1), (&second, &res2)); +} + +#[test] +#[should_panic = "unfinished nodes"] +fn no_rollback_unfinished_node() { + let (second, res2) = with_builder(|builder| { + let checkpoint = builder.checkpoint(); + builder.start_node(SyntaxKind(0)); + builder.token(SyntaxKind(1), "hi"); + builder.start_node_at(checkpoint, SyntaxKind(2)); + builder.finish_node(); + builder.finish_node(); + }); + println!("{}", second.debug(&res2, true)); +} + +#[test] +fn simple() { + let (first, res1) = with_builder(|builder| { + builder.start_node(SyntaxKind(0)); + builder.finish_node(); + }); + let (second, res2) = with_builder(|builder| { + builder.start_node(SyntaxKind(0)); + + // Add a token, then remove it. + let initial = builder.checkpoint(); + builder.token(SyntaxKind(1), "hi"); + builder.revert_to(initial); + + builder.finish_node(); + }); + assert_tree_eq((&first, &res1), (&second, &res2)); +} + +#[test] +fn nested() { + let (first, res1) = with_builder(|builder| { + builder.start_node(SyntaxKind(0)); + builder.finish_node(); + }); + + let (second, res2) = with_builder(|builder| { + builder.start_node(SyntaxKind(0)); + // Add two tokens, then remove both. + let initial = builder.checkpoint(); + builder.token(SyntaxKind(1), "hi"); + builder.token(SyntaxKind(2), "hello"); + builder.revert_to(initial); + + builder.finish_node(); + }); + + let (third, res3) = with_builder(|builder| { + builder.start_node(SyntaxKind(0)); + + // Add two tokens, then remove one after the other. + let initial = builder.checkpoint(); + builder.token(SyntaxKind(1), "hi"); + let second = builder.checkpoint(); + builder.token(SyntaxKind(2), "hello"); + builder.revert_to(second); + builder.revert_to(initial); + + builder.finish_node(); + }); + + assert_tree_eq((&first, &res1), (&second, &res2)); + assert_tree_eq((&first, &res1), (&third, &res3)); +} + +#[test] +fn unfinished_node() { + let (first, res1) = with_builder(|builder| { + builder.start_node(SyntaxKind(2)); + builder.finish_node(); + }); + let (second, res2) = with_builder(|builder| { + builder.start_node(SyntaxKind(2)); + let checkpoint = builder.checkpoint(); + builder.start_node(SyntaxKind(0)); + builder.token(SyntaxKind(1), "hi"); + builder.revert_to(checkpoint); + builder.finish_node(); + }); + assert_tree_eq((&first, &res1), (&second, &res2)); +} + +#[test] +#[should_panic = "checkpoint no longer valid after reverting to an earlier checkpoint"] +fn misuse() { + let (first, res1) = with_builder(|builder| { + builder.start_node(SyntaxKind(0)); + builder.finish_node(); + }); + let (second, res2) = with_builder(|builder| { + builder.start_node(SyntaxKind(0)); + + // Add two tokens, but remove them in the wrong order. + let initial = builder.checkpoint(); + builder.token(SyntaxKind(1), "hi"); + let new = builder.checkpoint(); + builder.token(SyntaxKind(2), "hello"); + builder.revert_to(initial); + builder.revert_to(new); + + builder.finish_node(); + }); + + assert_tree_eq((&first, &res1), (&second, &res2)); +} + +#[test] +#[should_panic = "did you already `revert_to`?"] +fn misuse2() { + with_builder(|builder| { + builder.start_node(SyntaxKind(0)); + + // Take two snapshots across a node boundary, but revert them in the wrong order. + let initial = builder.checkpoint(); + builder.start_node(SyntaxKind(3)); + builder.token(SyntaxKind(1), "hi"); + let new = builder.checkpoint(); + builder.token(SyntaxKind(2), "hello"); + builder.revert_to(initial); + builder.revert_to(new); + + builder.finish_node(); + }); +} + +#[test] +fn misuse3() { + let (first, res1) = with_builder(|builder| { + builder.start_node(SyntaxKind(0)); + builder.token(SyntaxKind(3), "no"); + builder.finish_node(); + }); + + let (second, res2) = with_builder(|builder| { + builder.start_node(SyntaxKind(0)); + + // Add two tokens, revert to the initial state, add three tokens, and try to revert to an earlier checkpoint. + let initial = builder.checkpoint(); + builder.token(SyntaxKind(1), "hi"); + let new = builder.checkpoint(); + builder.token(SyntaxKind(2), "hello"); + builder.revert_to(initial); + + // This is wrong, but there's not a whole lot the library can do about it. + builder.token(SyntaxKind(3), "no"); + builder.token(SyntaxKind(4), "bad"); + builder.token(SyntaxKind(4), "wrong"); + builder.revert_to(new); + + builder.finish_node(); + }); + + assert_tree_eq((&first, &res1), (&second, &res2)); +} + +#[test] +#[should_panic = "was `finish_node` called early or did you already `revert_to`"] +fn misuse_combined() { + with_builder(|builder| { + builder.start_node(SyntaxKind(0)); + + // Take two snapshots across a node boundary, revert to the earlier one but then try to start a node at the + // later one. + let initial = builder.checkpoint(); + builder.start_node(SyntaxKind(3)); + builder.token(SyntaxKind(1), "hi"); + let new = builder.checkpoint(); + builder.token(SyntaxKind(2), "hello"); + builder.revert_to(initial); + builder.start_node_at(new, SyntaxKind(4)); + + builder.finish_node(); + }); +} + +#[test] +#[should_panic = "reverting to an earlier checkpoint"] +fn misuse_combined2() { + with_builder(|builder| { + builder.start_node(SyntaxKind(0)); + + // Take two snapshots with only tokens between them, revert to the earlier one but then try to start a node at + // the later one. + let initial = builder.checkpoint(); + builder.token(SyntaxKind(1), "hi"); + let new = builder.checkpoint(); + builder.token(SyntaxKind(2), "hello"); + builder.revert_to(initial); + builder.start_node_at(new, SyntaxKind(3)); + + builder.finish_node(); + }); +} + +#[test] +fn revert_then_start() { + let (first, res1) = with_builder(|builder| { + builder.start_node(SyntaxKind(0)); + builder.start_node(SyntaxKind(3)); + builder.token(SyntaxKind(2), "hello"); + builder.finish_node(); + builder.finish_node(); + }); + let (second, res2) = with_builder(|builder| { + builder.start_node(SyntaxKind(0)); + + // Take two snapshots with only tokens between them, revert to the earlier one but then try to start a node at + // the later one. + let initial = builder.checkpoint(); + builder.token(SyntaxKind(1), "hi"); + builder.revert_to(initial); + builder.start_node_at(initial, SyntaxKind(3)); + builder.token(SyntaxKind(2), "hello"); + builder.finish_node(); + + builder.finish_node(); + }); + assert_tree_eq((&first, &res1), (&second, &res2)); +} + +#[test] +fn start_then_revert() { + let (first, res1) = with_builder(|builder| { + builder.start_node(SyntaxKind(0)); + builder.token(SyntaxKind(2), "hello"); + builder.finish_node(); + }); + let (second, res2) = with_builder(|builder| { + builder.start_node(SyntaxKind(0)); + + // Take two snapshots with only tokens between them, revert to the earlier one but then try to start a node at + // the later one. + let initial = builder.checkpoint(); + builder.token(SyntaxKind(1), "hi"); + builder.start_node_at(initial, SyntaxKind(3)); + builder.revert_to(initial); + builder.token(SyntaxKind(2), "hello"); + + builder.finish_node(); + }); + assert_tree_eq((&first, &res1), (&second, &res2)); +} diff --git a/rustfmt.toml b/rustfmt.toml index 2351305..e193002 100644 --- a/rustfmt.toml +++ b/rustfmt.toml @@ -1,15 +1,15 @@ unstable_features = true -edition = "2018" +edition = "2021" -max_width = 120 +max_width = 120 comment_width = 120 wrap_comments = true format_code_in_doc_comments = true -format_macro_matchers = true +format_macro_matchers = true -imports_granularity= "Crate" +imports_granularity = "Crate" reorder_impl_items = true