From 4c2cc419f964009e44330b7d1404763209f97575 Mon Sep 17 00:00:00 2001 From: Timothy Flynn Date: Mon, 2 Aug 2021 10:06:42 -0400 Subject: [PATCH] LibJS: Decode UTF-16 surrogate pairs during string literal construction Rather than deferring this decoding to PrimitiveString, we can decode surrogate pairs when parsing the string. This prevents a string copy when constructing the PrimitiveString. --- .../LibJS/Runtime/PrimitiveString.cpp | 11 ------- Userland/Libraries/LibJS/Token.cpp | 31 +++++++++++++++++-- 2 files changed, 28 insertions(+), 14 deletions(-) diff --git a/Userland/Libraries/LibJS/Runtime/PrimitiveString.cpp b/Userland/Libraries/LibJS/Runtime/PrimitiveString.cpp index b35982fafa..bc40b4df19 100644 --- a/Userland/Libraries/LibJS/Runtime/PrimitiveString.cpp +++ b/Userland/Libraries/LibJS/Runtime/PrimitiveString.cpp @@ -63,17 +63,6 @@ PrimitiveString* js_string(Heap& heap, String string) return &heap.vm().single_ascii_character_string(ch); } - // UTF-8 strings must first be transcoded to UTF-16, even though they are stored as String objects - // internally, to parse encoded surrogate pairs. As an optimization to reduce string copying, only - // perform that transcoding if there are non-ASCII codepoints in the string. - for (auto it : string) { - auto ch = static_cast(it); - if (!is_ascii(ch)) { - auto utf16_string = AK::utf8_to_utf16(string); - return js_string(heap, Utf16View { utf16_string }); - } - } - return heap.allocate_without_global_object(move(string)); } diff --git a/Userland/Libraries/LibJS/Token.cpp b/Userland/Libraries/LibJS/Token.cpp index 44b4859b71..ffa611e80b 100644 --- a/Userland/Libraries/LibJS/Token.cpp +++ b/Userland/Libraries/LibJS/Token.cpp @@ -10,6 +10,7 @@ #include #include #include +#include namespace JS { @@ -102,6 +103,16 @@ String Token::string_value(StringValueStatus& status) const return {}; }; + auto decode_surrogate = [&lexer]() -> Optional { + u16 surrogate = 0; + for (int j = 0; j < 4; ++j) { + if (!lexer.next_is(is_ascii_hex_digit)) + return {}; + surrogate = (surrogate << 4u) | hex2int(lexer.consume()); + } + return surrogate; + }; + StringBuilder builder; while (!lexer.is_eof()) { // No escape, consume one char and continue @@ -157,10 +168,24 @@ String Token::string_value(StringValueStatus& status) const } lexer.ignore(); } else { - for (int j = 0; j < 4; ++j) { - if (!lexer.next_is(is_ascii_hex_digit)) + auto high_surrogate = decode_surrogate(); + if (!high_surrogate.has_value()) + return encoding_failure(StringValueStatus::MalformedUnicodeEscape); + + if (Utf16View::is_high_surrogate(*high_surrogate) && lexer.consume_specific("\\u"sv)) { + auto low_surrogate = decode_surrogate(); + if (!low_surrogate.has_value()) return encoding_failure(StringValueStatus::MalformedUnicodeEscape); - code_point = (code_point << 4u) | hex2int(lexer.consume()); + + if (Utf16View::is_low_surrogate(*low_surrogate)) { + code_point = Utf16View::decode_surrogate_pair(*high_surrogate, *low_surrogate); + } else { + builder.append_code_point(*high_surrogate); + code_point = *low_surrogate; + } + + } else { + code_point = *high_surrogate; } } builder.append_code_point(code_point);