1
Fork 0
mirror of https://github.com/RGBCube/serenity synced 2025-07-25 19:57:35 +00:00

LibJS: Implement string concatenation using ropes

Instead of concatenating string data every time you add two strings
together in JavaScript, we now create a new PrimitiveString that points
to the two concatenated strings instead.

This turns concatenated strings into a tree structure that doesn't have
to be serialized until someone wants the characters in the string.

This *dramatically* reduces the peak memory footprint when running
the SunSpider benchmark (from ~6G to ~1G on my machine). It's also
significantly faster (1.39x) :^)
This commit is contained in:
Andreas Kling 2022-08-05 23:58:47 +02:00
parent cf62d08b2a
commit 64b29eb459
3 changed files with 172 additions and 59 deletions

View file

@ -15,15 +15,22 @@
namespace JS { namespace JS {
PrimitiveString::PrimitiveString(PrimitiveString& lhs, PrimitiveString& rhs)
{
m_is_rope = true;
m_left = &lhs;
m_right = &rhs;
}
PrimitiveString::PrimitiveString(String string) PrimitiveString::PrimitiveString(String string)
: m_utf8_string(move(string)) : m_has_utf8_string(true)
, m_has_utf8_string(true) , m_utf8_string(move(string))
{ {
} }
PrimitiveString::PrimitiveString(Utf16String string) PrimitiveString::PrimitiveString(Utf16String string)
: m_utf16_string(move(string)) : m_has_utf16_string(true)
, m_has_utf16_string(true) , m_utf16_string(move(string))
{ {
} }
@ -32,8 +39,22 @@ PrimitiveString::~PrimitiveString()
vm().string_cache().remove(m_utf8_string); vm().string_cache().remove(m_utf8_string);
} }
void PrimitiveString::visit_edges(Cell::Visitor& visitor)
{
Cell::visit_edges(visitor);
if (m_is_rope) {
visitor.visit(m_left);
visitor.visit(m_right);
}
}
bool PrimitiveString::is_empty() const bool PrimitiveString::is_empty() const
{ {
if (m_is_rope) {
// NOTE: We never make an empty rope string.
return false;
}
if (m_has_utf16_string) if (m_has_utf16_string)
return m_utf16_string.is_empty(); return m_utf16_string.is_empty();
if (m_has_utf8_string) if (m_has_utf8_string)
@ -43,6 +64,7 @@ bool PrimitiveString::is_empty() const
String const& PrimitiveString::string() const String const& PrimitiveString::string() const
{ {
resolve_rope_if_needed();
if (!m_has_utf8_string) { if (!m_has_utf8_string) {
m_utf8_string = m_utf16_string.to_utf8(); m_utf8_string = m_utf16_string.to_utf8();
m_has_utf8_string = true; m_has_utf8_string = true;
@ -52,6 +74,7 @@ String const& PrimitiveString::string() const
Utf16String const& PrimitiveString::utf16_string() const Utf16String const& PrimitiveString::utf16_string() const
{ {
resolve_rope_if_needed();
if (!m_has_utf16_string) { if (!m_has_utf16_string) {
m_utf16_string = Utf16String(m_utf8_string); m_utf16_string = Utf16String(m_utf8_string);
m_has_utf16_string = true; m_has_utf16_string = true;
@ -139,4 +162,135 @@ PrimitiveString* js_string(VM& vm, String string)
return js_string(vm.heap(), move(string)); return js_string(vm.heap(), move(string));
} }
PrimitiveString* js_rope_string(VM& vm, PrimitiveString& lhs, PrimitiveString& rhs)
{
// We're here to concatenate two strings into a new rope string.
// However, if any of them are empty, no rope is required.
bool lhs_empty = lhs.is_empty();
bool rhs_empty = rhs.is_empty();
if (lhs_empty && rhs_empty)
return &vm.empty_string();
if (lhs_empty)
return &rhs;
if (rhs_empty)
return &lhs;
return vm.heap().allocate_without_global_object<PrimitiveString>(lhs, rhs);
}
void PrimitiveString::resolve_rope_if_needed() const
{
if (!m_is_rope)
return;
// NOTE: Special case for two concatenated UTF-16 strings.
// This is here as an optimization, although I'm unsure how valuable it is.
if (m_left->has_utf16_string() && m_right->has_utf16_string()) {
auto const& lhs_string = m_left->utf16_string();
auto const& rhs_string = m_right->utf16_string();
Vector<u16, 1> combined;
combined.ensure_capacity(lhs_string.length_in_code_units() + rhs_string.length_in_code_units());
combined.extend(lhs_string.string());
combined.extend(rhs_string.string());
m_utf16_string = Utf16String(move(combined));
m_has_utf16_string = true;
m_is_rope = false;
m_left = nullptr;
m_right = nullptr;
return;
}
// This vector will hold all the pieces of the rope that need to be assembled
// into the resolved string.
Vector<PrimitiveString const*> pieces;
// NOTE: We traverse the rope tree without using recursion, since we'd run out of
// stack space quickly when handling a long sequence of unresolved concatenations.
Vector<PrimitiveString const*> stack;
stack.append(m_right);
stack.append(m_left);
while (!stack.is_empty()) {
auto* current = stack.take_last();
if (current->m_is_rope) {
stack.append(current->m_right);
stack.append(current->m_left);
continue;
}
pieces.append(current);
}
// Now that we have all the pieces, we can concatenate them using a StringBuilder.
StringBuilder builder;
// We keep track of the previous piece in order to handle surrogate pairs spread across two pieces.
PrimitiveString const* previous = nullptr;
for (auto const* current : pieces) {
if (!previous) {
// This is the very first piece, just append it and continue.
builder.append(current->string());
previous = current;
continue;
}
// Get the UTF-8 representations for both strings.
auto const& previous_string_as_utf8 = previous->string();
auto const& current_string_as_utf8 = current->string();
// NOTE: Now we need to look at the end of the previous string and the start
// of the current string, to see if they should be combined into a surrogate.
// Surrogates encoded as UTF-8 are 3 bytes.
if ((previous_string_as_utf8.length() < 3) || (current_string_as_utf8.length() < 3)) {
builder.append(current->string());
previous = current;
continue;
}
// Might the previous string end with a UTF-8 encoded surrogate?
if ((static_cast<u8>(previous_string_as_utf8[previous_string_as_utf8.length() - 3]) & 0xf0) != 0xe0) {
// If not, just append the current string and continue.
builder.append(current->string());
previous = current;
continue;
}
// Might the current string begin with a UTF-8 encoded surrogate?
if ((static_cast<u8>(current_string_as_utf8[0]) & 0xf0) != 0xe0) {
// If not, just append the current string and continue.
builder.append(current->string());
previous = current;
continue;
}
auto high_surrogate = *Utf8View(previous_string_as_utf8.substring_view(previous_string_as_utf8.length() - 3)).begin();
auto low_surrogate = *Utf8View(current_string_as_utf8).begin();
if (!Utf16View::is_high_surrogate(high_surrogate) || !Utf16View::is_low_surrogate(low_surrogate)) {
builder.append(current->string());
previous = current;
continue;
}
// Remove 3 bytes from the builder and replace them with the UTF-8 encoded code point.
builder.trim(3);
builder.append_code_point(Utf16View::decode_surrogate_pair(high_surrogate, low_surrogate));
// Append the remaining part of the current string.
builder.append(current_string_as_utf8.substring_view(3));
previous = current;
}
m_utf8_string = builder.to_string();
m_has_utf8_string = true;
m_is_rope = false;
m_left = nullptr;
m_right = nullptr;
}
} }

View file

@ -17,6 +17,7 @@ namespace JS {
class PrimitiveString final : public Cell { class PrimitiveString final : public Cell {
public: public:
explicit PrimitiveString(PrimitiveString&, PrimitiveString&);
explicit PrimitiveString(String); explicit PrimitiveString(String);
explicit PrimitiveString(Utf16String); explicit PrimitiveString(Utf16String);
virtual ~PrimitiveString(); virtual ~PrimitiveString();
@ -37,12 +38,20 @@ public:
private: private:
virtual StringView class_name() const override { return "PrimitiveString"sv; } virtual StringView class_name() const override { return "PrimitiveString"sv; }
virtual void visit_edges(Cell::Visitor&) override;
void resolve_rope_if_needed() const;
mutable bool m_is_rope { false };
mutable bool m_has_utf8_string { false };
mutable bool m_has_utf16_string { false };
mutable PrimitiveString* m_left { nullptr };
mutable PrimitiveString* m_right { nullptr };
mutable String m_utf8_string; mutable String m_utf8_string;
mutable bool m_has_utf8_string { false };
mutable Utf16String m_utf16_string; mutable Utf16String m_utf16_string;
mutable bool m_has_utf16_string { false };
}; };
PrimitiveString* js_string(Heap&, Utf16View const&); PrimitiveString* js_string(Heap&, Utf16View const&);
@ -54,4 +63,6 @@ PrimitiveString* js_string(VM&, Utf16String);
PrimitiveString* js_string(Heap&, String); PrimitiveString* js_string(Heap&, String);
PrimitiveString* js_string(VM&, String); PrimitiveString* js_string(VM&, String);
PrimitiveString* js_rope_string(VM&, PrimitiveString&, PrimitiveString&);
} }

View file

@ -1074,58 +1074,6 @@ ThrowCompletionOr<Value> unsigned_right_shift(GlobalObject& global_object, Value
return vm.throw_completion<TypeError>(global_object, ErrorType::BigIntBadOperator, "unsigned right-shift"); return vm.throw_completion<TypeError>(global_object, ErrorType::BigIntBadOperator, "unsigned right-shift");
} }
// https://tc39.es/ecma262/#string-concatenation
static PrimitiveString* concatenate_strings(GlobalObject& global_object, PrimitiveString const& lhs, PrimitiveString const& rhs)
{
auto& vm = global_object.vm();
if (lhs.has_utf16_string() && rhs.has_utf16_string()) {
auto const& lhs_string = lhs.utf16_string();
auto const& rhs_string = rhs.utf16_string();
Vector<u16, 1> combined;
combined.ensure_capacity(lhs_string.length_in_code_units() + rhs_string.length_in_code_units());
combined.extend(lhs_string.string());
combined.extend(rhs_string.string());
return js_string(vm, Utf16String(move(combined)));
}
auto const& lhs_string = lhs.string();
auto const& rhs_string = rhs.string();
StringBuilder builder(lhs_string.length() + rhs_string.length());
auto return_combined_strings = [&]() {
builder.append(lhs_string);
builder.append(rhs_string);
return js_string(vm, builder.to_string());
};
// Surrogates encoded as UTF-8 are 3 bytes.
if ((lhs_string.length() < 3) || (rhs_string.length() < 3))
return return_combined_strings();
auto lhs_leading_byte = static_cast<u8>(lhs_string[lhs_string.length() - 3]);
auto rhs_leading_byte = static_cast<u8>(rhs_string[0]);
if ((lhs_leading_byte & 0xf0) != 0xe0)
return return_combined_strings();
if ((rhs_leading_byte & 0xf0) != 0xe0)
return return_combined_strings();
auto high_surrogate = *Utf8View(lhs_string.substring_view(lhs_string.length() - 3)).begin();
auto low_surrogate = *Utf8View(rhs_string).begin();
if (!Utf16View::is_high_surrogate(high_surrogate) || !Utf16View::is_low_surrogate(low_surrogate))
return return_combined_strings();
builder.append(lhs_string.substring_view(0, lhs_string.length() - 3));
builder.append_code_point(Utf16View::decode_surrogate_pair(high_surrogate, low_surrogate));
builder.append(rhs_string.substring_view(3));
return js_string(vm, builder.to_string());
}
// 13.8.1 The Addition Operator ( + ), https://tc39.es/ecma262/#sec-addition-operator-plus // 13.8.1 The Addition Operator ( + ), https://tc39.es/ecma262/#sec-addition-operator-plus
ThrowCompletionOr<Value> add(GlobalObject& global_object, Value lhs, Value rhs) ThrowCompletionOr<Value> add(GlobalObject& global_object, Value lhs, Value rhs)
{ {
@ -1146,7 +1094,7 @@ ThrowCompletionOr<Value> add(GlobalObject& global_object, Value lhs, Value rhs)
if (lhs_primitive.is_string() || rhs_primitive.is_string()) { if (lhs_primitive.is_string() || rhs_primitive.is_string()) {
auto lhs_string = TRY(lhs_primitive.to_primitive_string(global_object)); auto lhs_string = TRY(lhs_primitive.to_primitive_string(global_object));
auto rhs_string = TRY(rhs_primitive.to_primitive_string(global_object)); auto rhs_string = TRY(rhs_primitive.to_primitive_string(global_object));
return concatenate_strings(global_object, *lhs_string, *rhs_string); return js_rope_string(global_object.vm(), *lhs_string, *rhs_string);
} }
auto lhs_numeric = TRY(lhs_primitive.to_numeric(global_object)); auto lhs_numeric = TRY(lhs_primitive.to_numeric(global_object));