From 444b2d9ec2e962235e2ea7c15c8457c8627d7ca2 Mon Sep 17 00:00:00 2001 From: Timothy Flynn Date: Mon, 17 Jan 2022 22:02:50 -0500 Subject: [PATCH] LibJS: Implement UTF-16 surrogate pair concatenation without iteration Performance of string concatenation regressed in a57e2f9. That commit iterates over the LHS string to find the last code unit, to check if it is a high surrogate. Instead, first look at the 3rd-to-last byte in the UTF-8 encoded string to check if it is a 3-byte code point; then decode just those bytes to check if we have a high surrogate. Similarly, check the first 3 bytes of the RHS string to check if we have a low surrogate. --- Userland/Libraries/LibJS/Runtime/Value.cpp | 47 +++++++++++++--------- 1 file changed, 27 insertions(+), 20 deletions(-) diff --git a/Userland/Libraries/LibJS/Runtime/Value.cpp b/Userland/Libraries/LibJS/Runtime/Value.cpp index 61bccf04cc..4612614e84 100644 --- a/Userland/Libraries/LibJS/Runtime/Value.cpp +++ b/Userland/Libraries/LibJS/Runtime/Value.cpp @@ -995,31 +995,38 @@ static PrimitiveString* concatenate_strings(GlobalObject& global_object, Primiti return js_string(vm, Utf16String(move(combined))); } - Utf8View lhs_string { lhs.string() }; - Utf8View rhs_string { rhs.string() }; - + auto const& lhs_string = lhs.string(); + auto const& rhs_string = rhs.string(); StringBuilder builder(lhs_string.length() + rhs_string.length()); - Optional<u32> high_surrogate; - for (auto it = lhs_string.begin(); it != lhs_string.end(); ++it) { - if (!it.peek(1).has_value() && Utf16View::is_high_surrogate(*it) && !rhs_string.is_empty()) - high_surrogate = *it; - else - builder.append_code_point(*it); - } + auto return_combined_strings = [&]() { + builder.append(lhs_string); + builder.append(rhs_string); + return js_string(vm, builder.to_string()); + }; - if (high_surrogate.has_value()) { - auto low_surrogate = *rhs_string.begin(); + // Surrogates encoded as UTF-8 are 3 bytes. 
+ if ((lhs_string.length() < 3) || (rhs_string.length() < 3)) + return return_combined_strings(); - if (Utf16View::is_low_surrogate(low_surrogate)) { - builder.append_code_point(Utf16View::decode_surrogate_pair(*high_surrogate, low_surrogate)); - rhs_string = rhs_string.substring_view(3); // A low surrogate encoded as UTF-8 is 3 bytes. - } else { - builder.append_code_point(*high_surrogate); - } - } + auto lhs_leading_byte = static_cast<u8>(lhs_string[lhs_string.length() - 3]); + auto rhs_leading_byte = static_cast<u8>(rhs_string[0]); + + if ((lhs_leading_byte & 0xf0) != 0xe0) + return return_combined_strings(); + if ((rhs_leading_byte & 0xf0) != 0xe0) + return return_combined_strings(); + + auto high_surrogate = *Utf8View(lhs_string.substring_view(lhs_string.length() - 3)).begin(); + auto low_surrogate = *Utf8View(rhs_string).begin(); + + if (!Utf16View::is_high_surrogate(high_surrogate) || !Utf16View::is_low_surrogate(low_surrogate)) + return return_combined_strings(); + + builder.append(lhs_string.substring_view(0, lhs_string.length() - 3)); + builder.append_code_point(Utf16View::decode_surrogate_pair(high_surrogate, low_surrogate)); + builder.append(rhs_string.substring_view(3)); - builder.append(rhs_string.as_string()); return js_string(vm, builder.to_string()); }