mirror of
				https://github.com/RGBCube/serenity
				synced 2025-10-31 17:22:43 +00:00 
			
		
		
		
	 0c42aece36
			
		
	
	
		0c42aece36
		
	
	
	
	
		
			
			LibJS parses JavaScript as UTF-8, so when creating a string, we must
transcode it to UTF-16 to handle encoded surrogate pairs.
For example, consider the following string:
    "\ud83d\ude00"
The UTF-8 encoding of this surrogate pair is:
    0xf0 0x9f 0x98 0x80
However, LibJS will currently store the two surrogates individually as
UTF-8 encoded bytes, rather than combining the pair:
    0xed 0xa0 0xb8, 0xed 0xb8 0x80
These are not equivalent. So, as String.prototype becomes UTF-16 aware,
this encoding will no longer work for abstractions like strict equality.
		
	
			
		
			
				
	
	
		
			85 lines
		
	
	
	
		
			2.3 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
			
		
		
	
	
			85 lines
		
	
	
	
		
			2.3 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
| /*
 | |
|  * Copyright (c) 2020, Andreas Kling <kling@serenityos.org>
 | |
|  *
 | |
|  * SPDX-License-Identifier: BSD-2-Clause
 | |
|  */
 | |
| 
 | |
| #include <AK/CharacterTypes.h>
 | |
| #include <AK/Utf16View.h>
 | |
| #include <LibJS/Runtime/PrimitiveString.h>
 | |
| #include <LibJS/Runtime/VM.h>
 | |
| 
 | |
| namespace JS {
 | |
| 
 | |
| PrimitiveString::PrimitiveString(String string)
 | |
|     : m_string(move(string))
 | |
| {
 | |
| }
 | |
| 
 | |
| PrimitiveString::~PrimitiveString()
 | |
| {
 | |
| }
 | |
| 
 | |
| Vector<u16> const& PrimitiveString::utf16_string() const
 | |
| {
 | |
|     if (m_utf16_string.is_empty() && !m_string.is_empty())
 | |
|         m_utf16_string = AK::utf8_to_utf16(m_string);
 | |
|     return m_utf16_string;
 | |
| }
 | |
| 
 | |
| Utf16View PrimitiveString::utf16_string_view() const
 | |
| {
 | |
|     return Utf16View { utf16_string() };
 | |
| }
 | |
| 
 | |
| PrimitiveString* js_string(Heap& heap, Utf16View const& string)
 | |
| {
 | |
|     if (string.is_empty())
 | |
|         return &heap.vm().empty_string();
 | |
| 
 | |
|     if (string.length_in_code_units() == 1) {
 | |
|         u16 code_unit = string.code_unit_at(0);
 | |
|         if (is_ascii(code_unit))
 | |
|             return &heap.vm().single_ascii_character_string(static_cast<u8>(code_unit));
 | |
|     }
 | |
| 
 | |
|     auto utf8_string = string.to_utf8(Utf16View::AllowInvalidCodeUnits::Yes);
 | |
|     return heap.allocate_without_global_object<PrimitiveString>(move(utf8_string));
 | |
| }
 | |
| 
 | |
| PrimitiveString* js_string(VM& vm, Utf16View const& string)
 | |
| {
 | |
|     return js_string(vm.heap(), string);
 | |
| }
 | |
| 
 | |
| PrimitiveString* js_string(Heap& heap, String string)
 | |
| {
 | |
|     if (string.is_empty())
 | |
|         return &heap.vm().empty_string();
 | |
| 
 | |
|     if (string.length() == 1) {
 | |
|         auto ch = static_cast<u8>(string.characters()[0]);
 | |
|         if (is_ascii(ch))
 | |
|             return &heap.vm().single_ascii_character_string(ch);
 | |
|     }
 | |
| 
 | |
|     // UTF-8 strings must first be transcoded to UTF-16, even though they are stored as String objects
 | |
|     // internally, to parse encoded surrogate pairs. As an optimization to reduce string copying, only
 | |
|     // perform that transcoding if there are non-ASCII codepoints in the string.
 | |
|     for (auto it : string) {
 | |
|         auto ch = static_cast<u8>(it);
 | |
|         if (!is_ascii(ch)) {
 | |
|             auto utf16_string = AK::utf8_to_utf16(string);
 | |
|             return js_string(heap, Utf16View { utf16_string });
 | |
|         }
 | |
|     }
 | |
| 
 | |
|     return heap.allocate_without_global_object<PrimitiveString>(move(string));
 | |
| }
 | |
| 
 | |
| PrimitiveString* js_string(VM& vm, String string)
 | |
| {
 | |
|     return js_string(vm.heap(), move(string));
 | |
| }
 | |
| 
 | |
| }
 |