1
Fork 0
mirror of https://github.com/RGBCube/serenity synced 2025-10-26 08:42:35 +00:00
serenity/Userland/Libraries/LibJS/Runtime/PrimitiveString.cpp
Andreas Kling 0ad4be3d78 LibJS: Skip redundant UTF-8 validation in rope string resolution
When resolving a rope, we've already taken care to resolve it to
a UTF-8 byte stream. There's no need to do a separate pass just for
validating the data again.

This was noticeable in some profiles. I made a simple microbenchmark
that gets a 30% speed-up:

    ("x" + "y".repeat(100_000_000)).trimStart()
2023-12-30 13:49:50 +01:00

358 lines
11 KiB
C++

/*
* Copyright (c) 2020, Andreas Kling <kling@serenityos.org>
* Copyright (c) 2022, Linus Groh <linusg@serenityos.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#include <AK/CharacterTypes.h>
#include <AK/FlyString.h>
#include <AK/StringBuilder.h>
#include <AK/Utf16View.h>
#include <AK/Utf8View.h>
#include <LibJS/Runtime/AbstractOperations.h>
#include <LibJS/Runtime/GlobalObject.h>
#include <LibJS/Runtime/PrimitiveString.h>
#include <LibJS/Runtime/PropertyKey.h>
#include <LibJS/Runtime/VM.h>
#include <LibJS/Runtime/Value.h>
namespace JS {
JS_DEFINE_ALLOCATOR(PrimitiveString);
PrimitiveString::PrimitiveString(PrimitiveString& lhs, PrimitiveString& rhs)
: m_is_rope(true)
, m_lhs(&lhs)
, m_rhs(&rhs)
{
}
PrimitiveString::PrimitiveString(String string)
: m_utf8_string(move(string))
{
}
PrimitiveString::PrimitiveString(ByteString string)
: m_byte_string(move(string))
{
}
PrimitiveString::PrimitiveString(Utf16String string)
: m_utf16_string(move(string))
{
}
PrimitiveString::~PrimitiveString()
{
if (has_utf8_string())
vm().string_cache().remove(*m_utf8_string);
if (has_byte_string())
vm().byte_string_cache().remove(*m_byte_string);
}
void PrimitiveString::visit_edges(Cell::Visitor& visitor)
{
Base::visit_edges(visitor);
if (m_is_rope) {
visitor.visit(m_lhs);
visitor.visit(m_rhs);
}
}
bool PrimitiveString::is_empty() const
{
if (m_is_rope) {
// NOTE: We never make an empty rope string.
return false;
}
if (has_utf16_string())
return m_utf16_string->is_empty();
if (has_utf8_string())
return m_utf8_string->is_empty();
if (has_byte_string())
return m_byte_string->is_empty();
VERIFY_NOT_REACHED();
}
String PrimitiveString::utf8_string() const
{
resolve_rope_if_needed(EncodingPreference::UTF8);
if (!has_utf8_string()) {
if (has_byte_string())
m_utf8_string = MUST(String::from_byte_string(*m_byte_string));
else if (has_utf16_string())
m_utf8_string = m_utf16_string->to_utf8();
else
VERIFY_NOT_REACHED();
}
return *m_utf8_string;
}
StringView PrimitiveString::utf8_string_view() const
{
(void)utf8_string();
return m_utf8_string->bytes_as_string_view();
}
ByteString PrimitiveString::byte_string() const
{
resolve_rope_if_needed(EncodingPreference::UTF8);
if (!has_byte_string()) {
if (has_utf8_string())
m_byte_string = m_utf8_string->to_byte_string();
else if (has_utf16_string())
m_byte_string = m_utf16_string->to_byte_string();
else
VERIFY_NOT_REACHED();
}
return *m_byte_string;
}
Utf16String PrimitiveString::utf16_string() const
{
resolve_rope_if_needed(EncodingPreference::UTF16);
if (!has_utf16_string()) {
if (has_utf8_string()) {
m_utf16_string = Utf16String::create(m_utf8_string->bytes_as_string_view());
} else {
VERIFY(has_byte_string());
m_utf16_string = Utf16String::create(*m_byte_string);
}
}
return *m_utf16_string;
}
Utf16View PrimitiveString::utf16_string_view() const
{
(void)utf16_string();
return m_utf16_string->view();
}
ThrowCompletionOr<Optional<Value>> PrimitiveString::get(VM& vm, PropertyKey const& property_key) const
{
if (property_key.is_symbol())
return Optional<Value> {};
if (property_key.is_string()) {
if (property_key.as_string() == vm.names.length.as_string()) {
auto length = utf16_string().length_in_code_units();
return Value(static_cast<double>(length));
}
}
auto index = canonical_numeric_index_string(property_key, CanonicalIndexMode::IgnoreNumericRoundtrip);
if (!index.is_index())
return Optional<Value> {};
auto str = utf16_string_view();
auto length = str.length_in_code_units();
if (length <= index.as_index())
return Optional<Value> {};
return create(vm, Utf16String::create(str.substring_view(index.as_index(), 1)));
}
NonnullGCPtr<PrimitiveString> PrimitiveString::create(VM& vm, Utf16String string)
{
if (string.is_empty())
return vm.empty_string();
if (string.length_in_code_units() == 1) {
u16 code_unit = string.code_unit_at(0);
if (is_ascii(code_unit))
return vm.single_ascii_character_string(static_cast<u8>(code_unit));
}
return vm.heap().allocate_without_realm<PrimitiveString>(move(string));
}
NonnullGCPtr<PrimitiveString> PrimitiveString::create(VM& vm, String string)
{
if (string.is_empty())
return vm.empty_string();
if (auto bytes = string.bytes_as_string_view(); bytes.length() == 1) {
auto ch = static_cast<u8>(bytes[0]);
if (is_ascii(ch))
return vm.single_ascii_character_string(ch);
}
auto& string_cache = vm.string_cache();
if (auto it = string_cache.find(string); it != string_cache.end())
return *it->value;
auto new_string = vm.heap().allocate_without_realm<PrimitiveString>(string);
string_cache.set(move(string), new_string);
return *new_string;
}
NonnullGCPtr<PrimitiveString> PrimitiveString::create(VM& vm, FlyString const& string)
{
return create(vm, string.to_string());
}
NonnullGCPtr<PrimitiveString> PrimitiveString::create(VM& vm, StringView string)
{
return create(vm, String::from_utf8(string).release_value());
}
NonnullGCPtr<PrimitiveString> PrimitiveString::create(VM& vm, ByteString string)
{
if (string.is_empty())
return vm.empty_string();
if (string.length() == 1) {
auto ch = static_cast<u8>(string.characters()[0]);
if (is_ascii(ch))
return vm.single_ascii_character_string(ch);
}
auto& string_cache = vm.byte_string_cache();
auto it = string_cache.find(string);
if (it == string_cache.end()) {
auto new_string = vm.heap().allocate_without_realm<PrimitiveString>(string);
string_cache.set(move(string), new_string);
return *new_string;
}
return *it->value;
}
NonnullGCPtr<PrimitiveString> PrimitiveString::create(VM& vm, DeprecatedFlyString const& string)
{
if (string.is_empty())
return vm.empty_string();
return create(vm, *string.impl());
}
NonnullGCPtr<PrimitiveString> PrimitiveString::create(VM& vm, PrimitiveString& lhs, PrimitiveString& rhs)
{
// We're here to concatenate two strings into a new rope string.
// However, if any of them are empty, no rope is required.
bool lhs_empty = lhs.is_empty();
bool rhs_empty = rhs.is_empty();
if (lhs_empty && rhs_empty)
return vm.empty_string();
if (lhs_empty)
return rhs;
if (rhs_empty)
return lhs;
return vm.heap().allocate_without_realm<PrimitiveString>(lhs, rhs);
}
void PrimitiveString::resolve_rope_if_needed(EncodingPreference preference) const
{
if (!m_is_rope)
return;
// This vector will hold all the pieces of the rope that need to be assembled
// into the resolved string.
Vector<PrimitiveString const*> pieces;
// NOTE: We traverse the rope tree without using recursion, since we'd run out of
// stack space quickly when handling a long sequence of unresolved concatenations.
Vector<PrimitiveString const*> stack;
stack.append(m_rhs);
stack.append(m_lhs);
while (!stack.is_empty()) {
auto const* current = stack.take_last();
if (current->m_is_rope) {
stack.append(current->m_rhs);
stack.append(current->m_lhs);
continue;
}
pieces.append(current);
}
if (preference == EncodingPreference::UTF16) {
// The caller wants a UTF-16 string, so we can simply concatenate all the pieces
// into a UTF-16 code unit buffer and create a Utf16String from it.
Utf16Data code_units;
for (auto const* current : pieces)
code_units.extend(current->utf16_string().string());
m_utf16_string = Utf16String::create(move(code_units));
m_is_rope = false;
m_lhs = nullptr;
m_rhs = nullptr;
return;
}
// Now that we have all the pieces, we can concatenate them using a StringBuilder.
StringBuilder builder;
// We keep track of the previous piece in order to handle surrogate pairs spread across two pieces.
PrimitiveString const* previous = nullptr;
for (auto const* current : pieces) {
if (!previous) {
// This is the very first piece, just append it and continue.
builder.append(current->utf8_string());
previous = current;
continue;
}
// Get the UTF-8 representations for both strings.
auto current_string_as_utf8 = current->utf8_string_view();
auto previous_string_as_utf8 = previous->utf8_string_view();
// NOTE: Now we need to look at the end of the previous string and the start
// of the current string, to see if they should be combined into a surrogate.
// Surrogates encoded as UTF-8 are 3 bytes.
if ((previous_string_as_utf8.length() < 3) || (current_string_as_utf8.length() < 3)) {
builder.append(current_string_as_utf8);
previous = current;
continue;
}
// Might the previous string end with a UTF-8 encoded surrogate?
if ((static_cast<u8>(previous_string_as_utf8[previous_string_as_utf8.length() - 3]) & 0xf0) != 0xe0) {
// If not, just append the current string and continue.
builder.append(current_string_as_utf8);
previous = current;
continue;
}
// Might the current string begin with a UTF-8 encoded surrogate?
if ((static_cast<u8>(current_string_as_utf8[0]) & 0xf0) != 0xe0) {
// If not, just append the current string and continue.
builder.append(current_string_as_utf8);
previous = current;
continue;
}
auto high_surrogate = *Utf8View(previous_string_as_utf8.substring_view(previous_string_as_utf8.length() - 3)).begin();
auto low_surrogate = *Utf8View(current_string_as_utf8).begin();
if (!Utf16View::is_high_surrogate(high_surrogate) || !Utf16View::is_low_surrogate(low_surrogate)) {
builder.append(current_string_as_utf8);
previous = current;
continue;
}
// Remove 3 bytes from the builder and replace them with the UTF-8 encoded code point.
builder.trim(3);
builder.append_code_point(Utf16View::decode_surrogate_pair(high_surrogate, low_surrogate));
// Append the remaining part of the current string.
builder.append(current_string_as_utf8.substring_view(3));
previous = current;
}
// NOTE: We've already produced valid UTF-8 above, so there's no need for additional validation.
m_utf8_string = builder.to_string_without_validation();
m_is_rope = false;
m_lhs = nullptr;
m_rhs = nullptr;
}
}