mirror of
				https://github.com/RGBCube/serenity
				synced 2025-10-26 02:02:34 +00:00 
			
		
		
		
	 a3e4535f34
			
		
	
	
		a3e4535f34
		
	
	
	
	
		
			
			When someone calls PrimitiveString::utf16_string() on a rope string, we know for sure that the client wants a UTF-16 string and may not be interested in a UTF-8 version at all. To avoid round-tripping through UTF-8 in this scenario, callers can now inform resolve_rope_if_needed() about their preferred encoding, should rope resolution take place. The UTF-16 case is actually a lot simpler than the UTF-8 case, since we can simply ask for UTF-16 data for each fiber of the rope, and then concatenate all the fibers. Since LibJS always uses UTF-16 for regular expression matching, this avoids round-tripping through UTF-8 whenever the input to a regex test is already UTF-16. :^)
		
			
				
	
	
		
			357 lines
		
	
	
	
		
			11 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
			
		
		
	
	
			357 lines
		
	
	
	
		
			11 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
| /*
 | |
|  * Copyright (c) 2020, Andreas Kling <kling@serenityos.org>
 | |
|  * Copyright (c) 2022, Linus Groh <linusg@serenityos.org>
 | |
|  *
 | |
|  * SPDX-License-Identifier: BSD-2-Clause
 | |
|  */
 | |
| 
 | |
| #include <AK/CharacterTypes.h>
 | |
| #include <AK/FlyString.h>
 | |
| #include <AK/Utf16View.h>
 | |
| #include <AK/Utf8View.h>
 | |
| #include <LibJS/Runtime/AbstractOperations.h>
 | |
| #include <LibJS/Runtime/GlobalObject.h>
 | |
| #include <LibJS/Runtime/PrimitiveString.h>
 | |
| #include <LibJS/Runtime/PropertyKey.h>
 | |
| #include <LibJS/Runtime/ThrowableStringBuilder.h>
 | |
| #include <LibJS/Runtime/VM.h>
 | |
| #include <LibJS/Runtime/Value.h>
 | |
| 
 | |
| namespace JS {
 | |
| 
 | |
| PrimitiveString::PrimitiveString(PrimitiveString& lhs, PrimitiveString& rhs)
 | |
|     : m_is_rope(true)
 | |
|     , m_lhs(&lhs)
 | |
|     , m_rhs(&rhs)
 | |
| {
 | |
| }
 | |
| 
 | |
| PrimitiveString::PrimitiveString(String string)
 | |
|     : m_utf8_string(move(string))
 | |
| {
 | |
| }
 | |
| 
 | |
| PrimitiveString::PrimitiveString(DeprecatedString string)
 | |
|     : m_deprecated_string(move(string))
 | |
| {
 | |
| }
 | |
| 
 | |
| PrimitiveString::PrimitiveString(Utf16String string)
 | |
|     : m_utf16_string(move(string))
 | |
| {
 | |
| }
 | |
| 
 | |
| PrimitiveString::~PrimitiveString()
 | |
| {
 | |
|     if (has_utf8_string())
 | |
|         vm().string_cache().remove(*m_utf8_string);
 | |
|     if (has_deprecated_string())
 | |
|         vm().deprecated_string_cache().remove(*m_deprecated_string);
 | |
| }
 | |
| 
 | |
| void PrimitiveString::visit_edges(Cell::Visitor& visitor)
 | |
| {
 | |
|     Base::visit_edges(visitor);
 | |
|     if (m_is_rope) {
 | |
|         visitor.visit(m_lhs);
 | |
|         visitor.visit(m_rhs);
 | |
|     }
 | |
| }
 | |
| 
 | |
| bool PrimitiveString::is_empty() const
 | |
| {
 | |
|     if (m_is_rope) {
 | |
|         // NOTE: We never make an empty rope string.
 | |
|         return false;
 | |
|     }
 | |
| 
 | |
|     if (has_utf16_string())
 | |
|         return m_utf16_string->is_empty();
 | |
|     if (has_utf8_string())
 | |
|         return m_utf8_string->is_empty();
 | |
|     if (has_deprecated_string())
 | |
|         return m_deprecated_string->is_empty();
 | |
|     VERIFY_NOT_REACHED();
 | |
| }
 | |
| 
 | |
| ThrowCompletionOr<String> PrimitiveString::utf8_string() const
 | |
| {
 | |
|     auto& vm = this->vm();
 | |
|     TRY(resolve_rope_if_needed(EncodingPreference::UTF8));
 | |
| 
 | |
|     if (!has_utf8_string()) {
 | |
|         if (has_deprecated_string())
 | |
|             m_utf8_string = TRY_OR_THROW_OOM(vm, String::from_deprecated_string(*m_deprecated_string));
 | |
|         else if (has_utf16_string())
 | |
|             m_utf8_string = TRY(m_utf16_string->to_utf8(vm));
 | |
|         else
 | |
|             VERIFY_NOT_REACHED();
 | |
|     }
 | |
| 
 | |
|     return *m_utf8_string;
 | |
| }
 | |
| 
 | |
| ThrowCompletionOr<StringView> PrimitiveString::utf8_string_view() const
 | |
| {
 | |
|     (void)TRY(utf8_string());
 | |
|     return m_utf8_string->bytes_as_string_view();
 | |
| }
 | |
| 
 | |
| ThrowCompletionOr<DeprecatedString> PrimitiveString::deprecated_string() const
 | |
| {
 | |
|     TRY(resolve_rope_if_needed(EncodingPreference::UTF8));
 | |
| 
 | |
|     if (!has_deprecated_string()) {
 | |
|         if (has_utf8_string())
 | |
|             m_deprecated_string = m_utf8_string->to_deprecated_string();
 | |
|         else if (has_utf16_string())
 | |
|             m_deprecated_string = TRY(m_utf16_string->to_deprecated_string(vm()));
 | |
|         else
 | |
|             VERIFY_NOT_REACHED();
 | |
|     }
 | |
| 
 | |
|     return *m_deprecated_string;
 | |
| }
 | |
| 
 | |
| ThrowCompletionOr<Utf16String> PrimitiveString::utf16_string() const
 | |
| {
 | |
|     TRY(resolve_rope_if_needed(EncodingPreference::UTF16));
 | |
| 
 | |
|     if (!has_utf16_string()) {
 | |
|         if (has_utf8_string()) {
 | |
|             m_utf16_string = TRY(Utf16String::create(vm(), m_utf8_string->bytes_as_string_view()));
 | |
|         } else {
 | |
|             VERIFY(has_deprecated_string());
 | |
|             m_utf16_string = TRY(Utf16String::create(vm(), *m_deprecated_string));
 | |
|         }
 | |
|     }
 | |
| 
 | |
|     return *m_utf16_string;
 | |
| }
 | |
| 
 | |
| ThrowCompletionOr<Utf16View> PrimitiveString::utf16_string_view() const
 | |
| {
 | |
|     (void)TRY(utf16_string());
 | |
|     return m_utf16_string->view();
 | |
| }
 | |
| 
 | |
| ThrowCompletionOr<Optional<Value>> PrimitiveString::get(VM& vm, PropertyKey const& property_key) const
 | |
| {
 | |
|     if (property_key.is_symbol())
 | |
|         return Optional<Value> {};
 | |
|     if (property_key.is_string()) {
 | |
|         if (property_key.as_string() == vm.names.length.as_string()) {
 | |
|             auto length = TRY(utf16_string()).length_in_code_units();
 | |
|             return Value(static_cast<double>(length));
 | |
|         }
 | |
|     }
 | |
|     auto index = MUST_OR_THROW_OOM(canonical_numeric_index_string(vm, property_key, CanonicalIndexMode::IgnoreNumericRoundtrip));
 | |
|     if (!index.is_index())
 | |
|         return Optional<Value> {};
 | |
|     auto str = TRY(utf16_string_view());
 | |
|     auto length = str.length_in_code_units();
 | |
|     if (length <= index.as_index())
 | |
|         return Optional<Value> {};
 | |
|     return create(vm, TRY(Utf16String::create(vm, str.substring_view(index.as_index(), 1))));
 | |
| }
 | |
| 
 | |
| NonnullGCPtr<PrimitiveString> PrimitiveString::create(VM& vm, Utf16String string)
 | |
| {
 | |
|     if (string.is_empty())
 | |
|         return vm.empty_string();
 | |
| 
 | |
|     if (string.length_in_code_units() == 1) {
 | |
|         u16 code_unit = string.code_unit_at(0);
 | |
|         if (is_ascii(code_unit))
 | |
|             return vm.single_ascii_character_string(static_cast<u8>(code_unit));
 | |
|     }
 | |
| 
 | |
|     return vm.heap().allocate_without_realm<PrimitiveString>(move(string));
 | |
| }
 | |
| 
 | |
| NonnullGCPtr<PrimitiveString> PrimitiveString::create(VM& vm, String string)
 | |
| {
 | |
|     if (string.is_empty())
 | |
|         return vm.empty_string();
 | |
| 
 | |
|     if (auto bytes = string.bytes_as_string_view(); bytes.length() == 1) {
 | |
|         auto ch = static_cast<u8>(bytes[0]);
 | |
|         if (is_ascii(ch))
 | |
|             return vm.single_ascii_character_string(ch);
 | |
|     }
 | |
| 
 | |
|     auto& string_cache = vm.string_cache();
 | |
|     if (auto it = string_cache.find(string); it != string_cache.end())
 | |
|         return *it->value;
 | |
| 
 | |
|     auto new_string = vm.heap().allocate_without_realm<PrimitiveString>(string);
 | |
|     string_cache.set(move(string), new_string);
 | |
|     return *new_string;
 | |
| }
 | |
| 
 | |
| NonnullGCPtr<PrimitiveString> PrimitiveString::create(VM& vm, FlyString const& string)
 | |
| {
 | |
|     return create(vm, string.to_string());
 | |
| }
 | |
| 
 | |
| ThrowCompletionOr<NonnullGCPtr<PrimitiveString>> PrimitiveString::create(VM& vm, StringView string)
 | |
| {
 | |
|     return create(vm, TRY_OR_THROW_OOM(vm, String::from_utf8(string)));
 | |
| }
 | |
| 
 | |
| NonnullGCPtr<PrimitiveString> PrimitiveString::create(VM& vm, DeprecatedString string)
 | |
| {
 | |
|     if (string.is_empty())
 | |
|         return vm.empty_string();
 | |
| 
 | |
|     if (string.length() == 1) {
 | |
|         auto ch = static_cast<u8>(string.characters()[0]);
 | |
|         if (is_ascii(ch))
 | |
|             return vm.single_ascii_character_string(ch);
 | |
|     }
 | |
| 
 | |
|     auto& string_cache = vm.deprecated_string_cache();
 | |
|     auto it = string_cache.find(string);
 | |
|     if (it == string_cache.end()) {
 | |
|         auto new_string = vm.heap().allocate_without_realm<PrimitiveString>(string);
 | |
|         string_cache.set(move(string), new_string);
 | |
|         return *new_string;
 | |
|     }
 | |
|     return *it->value;
 | |
| }
 | |
| 
 | |
| NonnullGCPtr<PrimitiveString> PrimitiveString::create(VM& vm, DeprecatedFlyString const& string)
 | |
| {
 | |
|     return create(vm, string.impl());
 | |
| }
 | |
| 
 | |
| NonnullGCPtr<PrimitiveString> PrimitiveString::create(VM& vm, PrimitiveString& lhs, PrimitiveString& rhs)
 | |
| {
 | |
|     // We're here to concatenate two strings into a new rope string.
 | |
|     // However, if any of them are empty, no rope is required.
 | |
| 
 | |
|     bool lhs_empty = lhs.is_empty();
 | |
|     bool rhs_empty = rhs.is_empty();
 | |
| 
 | |
|     if (lhs_empty && rhs_empty)
 | |
|         return vm.empty_string();
 | |
| 
 | |
|     if (lhs_empty)
 | |
|         return rhs;
 | |
| 
 | |
|     if (rhs_empty)
 | |
|         return lhs;
 | |
| 
 | |
|     return vm.heap().allocate_without_realm<PrimitiveString>(lhs, rhs);
 | |
| }
 | |
| 
 | |
| ThrowCompletionOr<void> PrimitiveString::resolve_rope_if_needed(EncodingPreference preference) const
 | |
| {
 | |
|     if (!m_is_rope)
 | |
|         return {};
 | |
| 
 | |
|     auto& vm = this->vm();
 | |
| 
 | |
|     // This vector will hold all the pieces of the rope that need to be assembled
 | |
|     // into the resolved string.
 | |
|     Vector<PrimitiveString const*> pieces;
 | |
| 
 | |
|     // NOTE: We traverse the rope tree without using recursion, since we'd run out of
 | |
|     //       stack space quickly when handling a long sequence of unresolved concatenations.
 | |
|     Vector<PrimitiveString const*> stack;
 | |
|     TRY_OR_THROW_OOM(vm, stack.try_append(m_rhs));
 | |
|     TRY_OR_THROW_OOM(vm, stack.try_append(m_lhs));
 | |
|     while (!stack.is_empty()) {
 | |
|         auto const* current = stack.take_last();
 | |
|         if (current->m_is_rope) {
 | |
|             TRY_OR_THROW_OOM(vm, stack.try_append(current->m_rhs));
 | |
|             TRY_OR_THROW_OOM(vm, stack.try_append(current->m_lhs));
 | |
|             continue;
 | |
|         }
 | |
|         TRY_OR_THROW_OOM(vm, pieces.try_append(current));
 | |
|     }
 | |
| 
 | |
|     if (preference == EncodingPreference::UTF16) {
 | |
|         // The caller wants a UTF-16 string, so we can simply concatenate all the pieces
 | |
|         // into a UTF-16 code unit buffer and create a Utf16String from it.
 | |
| 
 | |
|         Utf16Data code_units;
 | |
|         for (auto const* current : pieces)
 | |
|             code_units.extend(TRY(current->utf16_string()).string());
 | |
| 
 | |
|         m_utf16_string = TRY(Utf16String::create(vm, move(code_units)));
 | |
|         m_is_rope = false;
 | |
|         m_lhs = nullptr;
 | |
|         m_rhs = nullptr;
 | |
|         return {};
 | |
|     }
 | |
| 
 | |
|     // Now that we have all the pieces, we can concatenate them using a StringBuilder.
 | |
|     ThrowableStringBuilder builder(vm);
 | |
| 
 | |
|     // We keep track of the previous piece in order to handle surrogate pairs spread across two pieces.
 | |
|     PrimitiveString const* previous = nullptr;
 | |
|     for (auto const* current : pieces) {
 | |
|         if (!previous) {
 | |
|             // This is the very first piece, just append it and continue.
 | |
|             TRY(builder.append(TRY(current->utf8_string())));
 | |
|             previous = current;
 | |
|             continue;
 | |
|         }
 | |
| 
 | |
|         // Get the UTF-8 representations for both strings.
 | |
|         auto current_string_as_utf8 = TRY(current->utf8_string_view());
 | |
|         auto previous_string_as_utf8 = TRY(previous->utf8_string_view());
 | |
| 
 | |
|         // NOTE: Now we need to look at the end of the previous string and the start
 | |
|         //       of the current string, to see if they should be combined into a surrogate.
 | |
| 
 | |
|         // Surrogates encoded as UTF-8 are 3 bytes.
 | |
|         if ((previous_string_as_utf8.length() < 3) || (current_string_as_utf8.length() < 3)) {
 | |
|             TRY(builder.append(current_string_as_utf8));
 | |
|             previous = current;
 | |
|             continue;
 | |
|         }
 | |
| 
 | |
|         // Might the previous string end with a UTF-8 encoded surrogate?
 | |
|         if ((static_cast<u8>(previous_string_as_utf8[previous_string_as_utf8.length() - 3]) & 0xf0) != 0xe0) {
 | |
|             // If not, just append the current string and continue.
 | |
|             TRY(builder.append(current_string_as_utf8));
 | |
|             previous = current;
 | |
|             continue;
 | |
|         }
 | |
| 
 | |
|         // Might the current string begin with a UTF-8 encoded surrogate?
 | |
|         if ((static_cast<u8>(current_string_as_utf8[0]) & 0xf0) != 0xe0) {
 | |
|             // If not, just append the current string and continue.
 | |
|             TRY(builder.append(current_string_as_utf8));
 | |
|             previous = current;
 | |
|             continue;
 | |
|         }
 | |
| 
 | |
|         auto high_surrogate = *Utf8View(previous_string_as_utf8.substring_view(previous_string_as_utf8.length() - 3)).begin();
 | |
|         auto low_surrogate = *Utf8View(current_string_as_utf8).begin();
 | |
| 
 | |
|         if (!Utf16View::is_high_surrogate(high_surrogate) || !Utf16View::is_low_surrogate(low_surrogate)) {
 | |
|             TRY(builder.append(current_string_as_utf8));
 | |
|             previous = current;
 | |
|             continue;
 | |
|         }
 | |
| 
 | |
|         // Remove 3 bytes from the builder and replace them with the UTF-8 encoded code point.
 | |
|         builder.trim(3);
 | |
|         TRY(builder.append_code_point(Utf16View::decode_surrogate_pair(high_surrogate, low_surrogate)));
 | |
| 
 | |
|         // Append the remaining part of the current string.
 | |
|         TRY(builder.append(current_string_as_utf8.substring_view(3)));
 | |
|         previous = current;
 | |
|     }
 | |
| 
 | |
|     m_utf8_string = TRY(builder.to_string());
 | |
|     m_is_rope = false;
 | |
|     m_lhs = nullptr;
 | |
|     m_rhs = nullptr;
 | |
|     return {};
 | |
| }
 | |
| 
 | |
| }
 |