mirror of
https://github.com/RGBCube/serenity
synced 2025-07-25 23:07:35 +00:00
LibJS: Transcode UTF-8 strings to UTF-16 and add UTF-16 accessors
LibJS parses JavaScript as UTF-8, so when creating a string, we must transcode it to UTF-16 to handle encoded surrogate pairs. For example, consider the following string: "\ud83d\ude00". The UTF-8 encoding of the code point represented by this surrogate pair (U+1F600) is: 0xf0 0x9f 0x98 0x80. However, LibJS currently stores the two surrogates individually as UTF-8 encoded bytes, rather than combining the pair: 0xed 0xa0 0xbd, 0xed 0xb8 0x80. These are not equivalent. So, as String.prototype becomes UTF-16 aware, this encoding will no longer work for abstractions like strict equality.
This commit is contained in:
parent
0e25d2393f
commit
0c42aece36
4 changed files with 73 additions and 3 deletions
|
@ -9,6 +9,7 @@
|
|||
#include <AK/FlyString.h>
|
||||
#include <AK/String.h>
|
||||
#include <AK/StringBuilder.h>
|
||||
#include <AK/Utf16View.h>
|
||||
#include <AK/Utf8View.h>
|
||||
#include <LibCrypto/BigInt/SignedBigInteger.h>
|
||||
#include <LibCrypto/NumberTheory/ModularFunctions.h>
|
||||
|
@ -365,6 +366,18 @@ String Value::to_string(GlobalObject& global_object, bool legacy_null_to_empty_s
|
|||
}
|
||||
}
|
||||
|
||||
// Converts this value to a UTF-16 string, returned as a Vector<u16>.
// - String values return the result of their primitive string's utf16_string().
// - Any other type is first converted with to_string(), and that result is
//   passed through AK::utf8_to_utf16().
// Returns an empty vector if the VM has an exception set after to_string().
Vector<u16> Value::to_utf16_string(GlobalObject& global_object) const
|
||||
{
|
||||
// Strings already provide a UTF-16 accessor; no UTF-8 round trip is needed.
if (m_type == Type::String)
|
||||
return m_value.as_string->utf16_string();
|
||||
|
||||
auto utf8_string = to_string(global_object);
|
||||
// An exception is set on the VM after to_string(); hand back an empty result.
if (global_object.vm().exception())
|
||||
return {};
|
||||
|
||||
return AK::utf8_to_utf16(utf8_string);
|
||||
}
|
||||
|
||||
// 7.1.2 ToBoolean ( argument ), https://tc39.es/ecma262/#sec-toboolean
|
||||
bool Value::to_boolean() const
|
||||
{
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue