AK+LibRegex: Add Utf16View::code_point_at and use it in RegexStringView

The current method of iterating through the string to access a code point hurts performance quite badly for very large strings. The test262 test "RegExp/property-escapes/generated/Any.js" previously took 3 hours to complete; this one change brings it down to under 10 seconds.
2025-07-25 04:17:35 +00:00 · 2021-08-01 18:56:52 -04:00 · 2021-08-01 18:56:52 -04:00 · 510bbcd8e0
commit 510bbcd8e0
parent bed51d856a
3 changed files with 21 additions and 0 deletions
--- a/AK/Utf16View.cpp
+++ b/AK/Utf16View.cpp
@@ -111,6 +111,23 @@ u16 Utf16View::code_unit_at(size_t index) const
    return m_code_units[index];
 }

+u32 Utf16View::code_point_at(size_t index) const // Returns the code point that starts at code unit `index`, reading at most one further code unit.
+{
+    VERIFY(index < length_in_code_units()); // Precondition: `index` is a code unit index inside the view.

+    u32 code_point = code_unit_at(index);
+    if (!is_high_surrogate(code_point) && !is_low_surrogate(code_point)) // Not a surrogate: the code unit itself is the result.
+        return code_point;
+    if (is_low_surrogate(code_point) || (index + 1 == length_in_code_units())) // A low surrogate first, or no following code unit: no pair can start here.
+        return code_point;

+    auto second = code_unit_at(index + 1); // Reached only for a high surrogate that has a following code unit.
+    if (!is_low_surrogate(second)) // Not followed by a low surrogate: return the first code unit unchanged.
+        return code_point;

+    return decode_surrogate_pair(code_point, second); // High surrogate followed by a low surrogate: decode the two as one code point.
+}
+
 size_t Utf16View::code_point_offset_of(size_t code_unit_offset) const
 {
    size_t code_point_offset = 0;