From 4c2cc419f964009e44330b7d1404763209f97575 Mon Sep 17 00:00:00 2001
From: Timothy Flynn <trflynn89@pm.me>
Date: Mon, 2 Aug 2021 10:06:42 -0400
Subject: [PATCH] LibJS: Decode UTF-16 surrogate pairs during string literal
 construction

Rather than deferring this decoding to PrimitiveString, we can decode
surrogate pairs when parsing the string. This prevents a string copy
when constructing the PrimitiveString.
---
 .../LibJS/Runtime/PrimitiveString.cpp         | 11 -------
 Userland/Libraries/LibJS/Token.cpp            | 31 +++++++++++++++++--
 2 files changed, 28 insertions(+), 14 deletions(-)
diff --git a/Userland/Libraries/LibJS/Runtime/PrimitiveString.cpp b/Userland/Libraries/LibJS/Runtime/PrimitiveString.cpp
index b35982fafa..bc40b4df19 100644
--- a/Userland/Libraries/LibJS/Runtime/PrimitiveString.cpp
+++ b/Userland/Libraries/LibJS/Runtime/PrimitiveString.cpp
@@ -63,17 +63,6 @@ PrimitiveString* js_string(Heap& heap, String string)
             return &heap.vm().single_ascii_character_string(ch);
     }
 
-    // UTF-8 strings must first be transcoded to UTF-16, even though they are stored as String objects
-    // internally, to parse encoded surrogate pairs. As an optimization to reduce string copying, only
-    // perform that transcoding if there are non-ASCII codepoints in the string.
-    for (auto it : string) {
-        auto ch = static_cast<u8>(it);
-        if (!is_ascii(ch)) {
-            auto utf16_string = AK::utf8_to_utf16(string);
-            return js_string(heap, Utf16View { utf16_string });
-        }
-    }
-
     return heap.allocate_without_global_object<PrimitiveString>(move(string));
 }
 
diff --git a/Userland/Libraries/LibJS/Token.cpp b/Userland/Libraries/LibJS/Token.cpp
index 44b4859b71..ffa611e80b 100644
--- a/Userland/Libraries/LibJS/Token.cpp
+++ b/Userland/Libraries/LibJS/Token.cpp
@@ -10,6 +10,7 @@
 #include <AK/CharacterTypes.h>
 #include <AK/GenericLexer.h>
 #include <AK/StringBuilder.h>
+#include <AK/Utf16View.h>
 
 namespace JS {
 
@@ -102,6 +103,16 @@ String Token::string_value(StringValueStatus& status) const
         return {};
     };
 
+    auto decode_surrogate = [&lexer]() -> Optional<u16> { // Reads exactly four hex digits from the lexer as one 16-bit code unit.
+        u16 surrogate = 0;
+        for (int j = 0; j < 4; ++j) {
+            if (!lexer.next_is(is_ascii_hex_digit))
+                return {}; // Non-hex character (or end of input): empty result; digits consumed so far stay consumed.
+            surrogate = (surrogate << 4u) | hex2int(lexer.consume()); // Shift in the next nibble, most significant digit first.
+        }
+        return surrogate;
+    };
+
     StringBuilder builder;
     while (!lexer.is_eof()) {
         // No escape, consume one char and continue
@@ -157,10 +168,24 @@ String Token::string_value(StringValueStatus& status) const
                 }
                 lexer.ignore();
             } else {
-                for (int j = 0; j < 4; ++j) {
-                    if (!lexer.next_is(is_ascii_hex_digit))
+                auto high_surrogate = decode_surrogate(); // First \uXXXX code unit; may or may not be a surrogate.
+                if (!high_surrogate.has_value())
+                    return encoding_failure(StringValueStatus::MalformedUnicodeEscape);
+                // Only a directly following 4-digit "\uXXXX" can complete a pair here; a braced "\u{...}" is left unconsumed for the enclosing loop.
+                if (Utf16View::is_high_surrogate(*high_surrogate) && !lexer.next_is("\\u{"sv) && lexer.consume_specific("\\u"sv)) {
+                    auto low_surrogate = decode_surrogate();
+                    if (!low_surrogate.has_value())
                         return encoding_failure(StringValueStatus::MalformedUnicodeEscape);
-                    code_point = (code_point << 4u) | hex2int(lexer.consume());
+                    // A valid high/low pair becomes one code point; otherwise emit the lone high surrogate now and carry the second unit forward.
+                    if (Utf16View::is_low_surrogate(*low_surrogate)) {
+                        code_point = Utf16View::decode_surrogate_pair(*high_surrogate, *low_surrogate);
+                    } else {
+                        builder.append_code_point(*high_surrogate);
+                        code_point = *low_surrogate;
+                    }
+
+                } else {
+                    code_point = *high_surrogate;
                 }
             }
             builder.append_code_point(code_point);