From 2aaf4169daa5039bec901ab68a9636ce71ce4545 Mon Sep 17 00:00:00 2001 From: Domenic Quirl Date: Fri, 17 Sep 2021 16:37:15 +0200 Subject: [PATCH] refactor re-exports of lasso interning components and expose interned string keys --- Cargo.toml | 8 ++--- src/green/builder.rs | 20 +++++++++++ src/green/interner.rs | 32 +++++++++-------- src/green/token.rs | 15 +++++--- src/interning.rs | 47 +++++++++++++++++++++++++ src/lib.rs | 11 +++--- src/syntax/token.rs | 81 ++++++++++++++++++++++++++++++++++++++++--- tests/it/serde.rs | 7 ++-- 8 files changed, 184 insertions(+), 37 deletions(-) create mode 100644 src/interning.rs diff --git a/Cargo.toml b/Cargo.toml index 1cde72c..e098e43 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,7 +15,7 @@ readme = "README.md" debug = true [dependencies] -lasso = { version = "0.6", features = ["inline-more"] } +lasso = { version = "0.6", features = ["inline-more", "multi-threaded"] } text-size = "1.1.0" fxhash = "0.2.1" parking_lot = "0.11.2" @@ -41,8 +41,8 @@ name = "main" harness = false [features] -default = [] -serde1 = ["serde"] +default = [] +serialize = ["serde", "lasso/serialize"] [package.metadata.docs.rs] -features = ["serde1"] +features = ["serialize"] diff --git a/src/green/builder.rs b/src/green/builder.rs index 7b74419..3b374fc 100644 --- a/src/green/builder.rs +++ b/src/green/builder.rs @@ -356,6 +356,26 @@ where } } + /// Shortcut to construct a builder that uses an existing interner. + /// + /// This is equivalent to using [`from_cache`](GreenNodeBuilder::from_cache) with a node cache + /// obtained from [`NodeCache::with_interner`]. + #[inline] + pub fn with_interner(interner: &'interner mut I) -> Self { + let cache = NodeCache::with_interner(interner); + Self::from_cache(cache) + } + + /// Shortcut to construct a builder that uses an existing interner. + /// + /// This is equivalent to using [`from_cache`](GreenNodeBuilder::from_cache) with a node cache + /// obtained from [`NodeCache::from_interner`]. 
+ #[inline] + pub fn from_interner(interner: I) -> Self { + let cache = NodeCache::from_interner(interner); + Self::from_cache(cache) + } + /// Get a reference to the interner used to deduplicate source text (strings). /// /// This is the same interner as used by the underlying [`NodeCache`]. diff --git a/src/green/interner.rs b/src/green/interner.rs index 18cb3a8..db995fc 100644 --- a/src/green/interner.rs +++ b/src/green/interner.rs @@ -1,12 +1,14 @@ use std::num::NonZeroUsize; +use crate::interning::{ + Capacity, Interner, IntoReader, IntoReaderAndResolver, IntoResolver, Key, Reader, Resolver, Rodeo, +}; use fxhash::FxBuildHasher; -use lasso::{Capacity, Interner, IntoReader, IntoReaderAndResolver, IntoResolver, Reader, Resolver, Rodeo, Spur}; /// The default [`Interner`] used to deduplicate green token strings. #[derive(Debug)] pub struct TokenInterner { - rodeo: Rodeo, + rodeo: Rodeo, } impl TokenInterner { @@ -23,22 +25,22 @@ impl TokenInterner { impl Resolver for TokenInterner { #[inline] - fn resolve<'a>(&'a self, key: &Spur) -> &'a str { + fn resolve<'a>(&'a self, key: &Key) -> &'a str { self.rodeo.resolve(key) } #[inline] - fn try_resolve<'a>(&'a self, key: &Spur) -> Option<&'a str> { + fn try_resolve<'a>(&'a self, key: &Key) -> Option<&'a str> { self.rodeo.try_resolve(key) } #[inline] - unsafe fn resolve_unchecked<'a>(&'a self, key: &Spur) -> &'a str { + unsafe fn resolve_unchecked<'a>(&'a self, key: &Key) -> &'a str { self.rodeo.resolve_unchecked(key) } #[inline] - fn contains_key(&self, key: &Spur) -> bool { + fn contains_key(&self, key: &Key) -> bool { self.rodeo.contains_key(key) } @@ -50,7 +52,7 @@ impl Resolver for TokenInterner { impl Reader for TokenInterner { #[inline] - fn get(&self, val: &str) -> Option { + fn get(&self, val: &str) -> Option { self.rodeo.get(val) } @@ -61,7 +63,7 @@ impl Reader for TokenInterner { } impl IntoResolver for TokenInterner { - type Resolver = as IntoResolver>::Resolver; + type Resolver = ::Resolver; #[inline] fn 
into_resolver(self) -> Self::Resolver @@ -76,34 +78,34 @@ impl IntoResolver for TokenInterner { where Self: 'static, { - Rodeo::::into_resolver_boxed(Box::new(self.rodeo)) + Rodeo::into_resolver_boxed(Box::new(self.rodeo)) } } impl Interner for TokenInterner { #[inline] - fn get_or_intern(&mut self, val: &str) -> Spur { + fn get_or_intern(&mut self, val: &str) -> Key { self.rodeo.get_or_intern(val) } #[inline] - fn try_get_or_intern(&mut self, val: &str) -> lasso::LassoResult { + fn try_get_or_intern(&mut self, val: &str) -> lasso::LassoResult { self.rodeo.try_get_or_intern(val) } #[inline] - fn get_or_intern_static(&mut self, val: &'static str) -> Spur { + fn get_or_intern_static(&mut self, val: &'static str) -> Key { self.rodeo.get_or_intern_static(val) } #[inline] - fn try_get_or_intern_static(&mut self, val: &'static str) -> lasso::LassoResult { + fn try_get_or_intern_static(&mut self, val: &'static str) -> lasso::LassoResult { self.rodeo.try_get_or_intern_static(val) } } impl IntoReader for TokenInterner { - type Reader = as IntoReader>::Reader; + type Reader = ::Reader; #[inline] fn into_reader(self) -> Self::Reader @@ -117,7 +119,7 @@ impl IntoReader for TokenInterner { where Self: 'static, { - Rodeo::::into_reader_boxed(Box::new(self.rodeo)) + Rodeo::into_reader_boxed(Box::new(self.rodeo)) } } diff --git a/src/green/token.rs b/src/green/token.rs index 74f40db..df54e93 100644 --- a/src/green/token.rs +++ b/src/green/token.rs @@ -1,14 +1,17 @@ use std::{fmt, hash, mem::ManuallyDrop, ptr}; -use crate::{green::SyntaxKind, interning::Resolver, TextSize}; -use lasso::Spur; +use crate::{ + green::SyntaxKind, + interning::{Key, Resolver}, + TextSize, +}; use triomphe::Arc; #[repr(align(2))] // to use 1 bit for pointer tagging. 
NB: this is an at-least annotation #[derive(Debug, PartialEq, Eq, Hash, Copy, Clone)] pub(super) struct GreenTokenData { pub(super) kind: SyntaxKind, - pub(super) text: Spur, + pub(super) text: Key, pub(super) text_len: TextSize, } @@ -70,8 +73,12 @@ impl GreenToken { self.data().text_len } + /// Returns the interned key of text covered by this token. + /// This key may be used for comparisons with other keys of strings interned by the same interner. + /// + /// See also [`text`](GreenToken::text). #[inline] - pub(crate) fn text_key(&self) -> Spur { + pub fn text_key(&self) -> Key { self.data().text } } diff --git a/src/interning.rs b/src/interning.rs new file mode 100644 index 0000000..dbe2d41 --- /dev/null +++ b/src/interning.rs @@ -0,0 +1,47 @@ +//! Types and Traits for efficient String storage and deduplication. +//! +//! Interning functionality is provided by the [`lasso`](lasso) crate. + +pub use fxhash::FxBuildHasher as Hasher; + +pub use crate::green::TokenInterner; + +/// The index type for all interners. Each key represents a single interned string. +pub type Key = lasso::Spur; +pub use lasso::{Interner, IntoReader, IntoReaderAndResolver, IntoResolver, Reader, Resolver}; + +/// A string interner that caches strings quickly with a minimal memory footprint, returning a unique key to re-access +/// it in `O(1)` time. By default, `Rodeo` uses an [`fxhash`] [`Hasher`]. +pub type Rodeo = lasso::Rodeo; + +/// Constructs a new, single-threaded interner. +/// +/// If you need the interner to be multi-threaded, see [`new_threaded_interner`]. +#[inline] +pub fn new_interner() -> Rodeo { + Rodeo::with_hasher(Hasher::default()) +} + +/// A string interner that caches strings quickly with a minimal memory footprint, returning a unique key to re-access +/// it in `O(1)` time. By default, `ThreadedRodeo` uses an [`fxhash`] [`Hasher`]. +pub type ThreadedRodeo = lasso::ThreadedRodeo; + +/// Constructs a new interner that can be used across multiple threads. 
+#[inline] +pub fn new_threaded_interner() -> ThreadedRodeo { + ThreadedRodeo::with_hasher(Hasher::default()) +} + +/// A read-only view of a [`Rodeo`] or [`ThreadedRodeo`] that allows contention-free access to interned strings, both +/// key to string resolution and string to key lookups. +/// +/// The hasher is the same as the Rodeo or ThreadedRodeo that created it. +/// Can be acquired with the `into_reader` methods (see also [`IntoReader`]). +pub type RodeoReader = lasso::RodeoReader; + +/// A read-only view of a [`Rodeo`] or [`ThreadedRodeo`] that allows contention-free access to interned strings with +/// only key to string resolution. +/// +/// Can be acquired with the `into_resolver` methods (see also [`IntoResolver`]). +pub type RodeoResolver = lasso::RodeoResolver; +pub use lasso::{Capacity, Iter, LassoError, LassoErrorKind, LassoResult, MemoryLimits, Strings}; diff --git a/src/lib.rs b/src/lib.rs index e1b17a0..775fcd0 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -49,26 +49,23 @@ #[allow(unsafe_code)] mod green; #[allow(unsafe_code)] -pub mod syntax; +mod syntax; #[cfg(feature = "serde1")] mod serde_impls; #[allow(missing_docs)] mod utility_types; -/// Types and Traits for efficient String storage and deduplication. -pub mod interning { - pub use crate::green::TokenInterner; - pub use lasso::{Interner, IntoReader, IntoReaderAndResolver, IntoResolver, Reader, Resolver}; -} +pub mod interning; use std::fmt; // Reexport types for working with strings. 
pub use text_size::{TextLen, TextRange, TextSize}; +#[doc(inline)] +pub use crate::syntax::*; pub use crate::{ green::{Checkpoint, Children, GreenNode, GreenNodeBuilder, GreenToken, NodeCache, SyntaxKind}, - syntax::*, utility_types::{Direction, NodeOrToken, TokenAtOffset, WalkEvent}, }; pub use triomphe::Arc; diff --git a/src/syntax/token.rs b/src/syntax/token.rs index 757cbb7..b59c9c9 100644 --- a/src/syntax/token.rs +++ b/src/syntax/token.rs @@ -9,7 +9,7 @@ use lasso::Resolver; use text_size::{TextRange, TextSize}; use super::*; -use crate::{Direction, GreenNode, GreenToken, Language, SyntaxKind}; +use crate::{interning::Key, Direction, GreenNode, GreenToken, Language, SyntaxKind}; /// Syntax tree token. #[derive(Debug)] @@ -69,6 +69,7 @@ impl SyntaxToken { /// Returns this token's [`Debug`](fmt::Debug) representation as a string. /// /// To avoid allocating for every token, see [`write_debug`](SyntaxToken::write_debug). + #[inline] pub fn debug(&self, resolver: &R) -> String where R: Resolver + ?Sized, @@ -182,16 +183,86 @@ impl SyntaxToken { /// This method is different from the `PartialEq` and `Eq` implementations in that it compares /// the text and not the token position. /// It is more efficient than comparing the result of - /// [`resolve_text`](SyntaxToken::resolve_text) because it compares the tokens' interned string - /// keys. + /// [`resolve_text`](SyntaxToken::resolve_text) because it compares the tokens' interned + /// [`text_key`s](SyntaxToken::text_key). /// Therefore, it also does not require a [`Resolver`]. /// **Note** that the result of the comparison may be wrong when comparing two tokens from /// different trees that use different interners. + #[inline] pub fn text_eq(&self, other: &Self) -> bool { - self.green().text_key() == other.green().text_key() + self.text_key() == other.text_key() + } + + /// Returns the interned key of text covered by this token. 
+ /// This key may be used for comparisons with other keys of strings interned by the same interner. + /// + /// See also [`resolve_text`](SyntaxToken::resolve_text) and [`text_eq`](SyntaxToken::text_eq). + /// + /// # Examples + /// If you intern strings inside of your application, e.g. inside of a compiler, you can use + /// tokens' text keys to cross-reference between the syntax tree and the rest of your + /// implementation by re-using the interner in both. + /// ``` + /// # use cstree::*; + /// # use cstree::interning::{Hasher, Rodeo, Key, new_interner}; + /// # #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] + /// # #[repr(u16)] + /// # enum SyntaxKind { + /// # ROOT, + /// # INT, + /// # } + /// # #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] + /// # enum Lang {} + /// # impl cstree::Language for Lang { + /// # type Kind = SyntaxKind; + /// # + /// # fn kind_from_raw(raw: cstree::SyntaxKind) -> Self::Kind { + /// # assert!(raw.0 <= SyntaxKind::INT as u16); + /// # unsafe { std::mem::transmute::(raw.0) } + /// # } + /// # + /// # fn kind_to_raw(kind: Self::Kind) -> cstree::SyntaxKind { + /// # cstree::SyntaxKind(kind as u16) + /// # } + /// # } + /// # type SyntaxNode = cstree::SyntaxNode; + /// # const ROOT: cstree::SyntaxKind = cstree::SyntaxKind(0); + /// # const IDENT: cstree::SyntaxKind = cstree::SyntaxKind(1); + /// # fn parse(b: &mut GreenNodeBuilder, s: &str) {} + /// # + /// struct TypeTable { + /// // ... + /// } + /// impl TypeTable { + /// fn type_of(&self, ident: Key) -> &str { + /// // ... 
+ /// # "" + /// } + /// } + /// # struct State { + /// # interner: Rodeo, + /// # type_table: TypeTable, + /// # } + /// # let interner = new_interner(); + /// # let state = &mut State { interner, type_table: TypeTable{} }; + /// let mut builder = GreenNodeBuilder::with_interner(&mut state.interner); + /// # let input = ""; + /// # builder.start_node(ROOT); + /// # builder.token(IDENT, "x"); + /// # builder.finish_node(); + /// let tree = parse(&mut builder, "x"); + /// # let tree = SyntaxNode::::new_root(builder.finish().0); + /// let type_table = &state.type_table; + /// let ident = tree.children_with_tokens().next().unwrap().into_token().unwrap(); + /// let typ = type_table.type_of(ident.text_key()); + /// ``` + #[inline] + pub fn text_key(&self) -> Key { + self.green().text_key() } /// Returns the unterlying green tree token of this token. + #[inline] pub fn green(&self) -> &GreenToken { self.parent .green() @@ -242,6 +313,7 @@ impl SyntaxToken { /// Returns the next token in the tree. /// This is not necessary a direct sibling of this token, but will always be further right in the tree. + #[inline] pub fn next_token(&self) -> Option<&SyntaxToken> { match self.next_sibling_or_token() { Some(element) => element.first_token(), @@ -255,6 +327,7 @@ impl SyntaxToken { /// Returns the previous token in the tree. /// This is not necessary a direct sibling of this token, but will always be further left in the tree. 
+ #[inline] pub fn prev_token(&self) -> Option<&SyntaxToken> { match self.prev_sibling_or_token() { Some(element) => element.last_token(), diff --git a/tests/it/serde.rs b/tests/it/serde.rs index 6ea5c71..e6625ff 100644 --- a/tests/it/serde.rs +++ b/tests/it/serde.rs @@ -1,12 +1,13 @@ use crate::{build_recursive, build_tree_with_cache, ResolvedNode}; use super::{Element, SyntaxNode}; -use cstree::{interning::IntoResolver, GreenNodeBuilder, NodeCache, NodeOrToken}; +use cstree::{ + interning::{IntoResolver, Rodeo}, + GreenNodeBuilder, NodeCache, NodeOrToken, +}; use serde_test::Token; use std::fmt; -type Rodeo = lasso::Rodeo; - /// Macro for generating a list of `serde_test::Token`s using a simpler DSL. macro_rules! event_tokens { ($($name:ident($($token:tt)*)),*) => {