From 4812b957952dbac912c6436bf82785aa41fd2443 Mon Sep 17 00:00:00 2001 From: Timothy Flynn Date: Thu, 15 Jul 2021 09:24:29 -0400 Subject: [PATCH] LibJS: Implement RegExp AdvanceStringIndex abstraction This isn't particularly useful yet because the underlying LibRegex engine doesn't support unicode matching yet. But the debt of FIXMEs related to AdvanceStringIndex has added up, so let's get this out of the way. --- .../LibJS/Runtime/RegExpPrototype.cpp | 64 +++++++++++++++---- .../Libraries/LibJS/Runtime/RegExpPrototype.h | 1 + .../Runtime/RegExpStringIteratorPrototype.cpp | 3 +- 3 files changed, 53 insertions(+), 15 deletions(-) diff --git a/Userland/Libraries/LibJS/Runtime/RegExpPrototype.cpp b/Userland/Libraries/LibJS/Runtime/RegExpPrototype.cpp index 55563ffeca..16c40ed36e 100644 --- a/Userland/Libraries/LibJS/Runtime/RegExpPrototype.cpp +++ b/Userland/Libraries/LibJS/Runtime/RegExpPrototype.cpp @@ -5,7 +5,9 @@ * SPDX-License-Identifier: BSD-2-Clause */ +#include #include +#include #include #include #include @@ -87,7 +89,34 @@ static String escape_regexp_pattern(const RegExpObject& regexp_object) return pattern; } -static void increment_last_index(GlobalObject& global_object, Object& regexp_object) +// 22.2.5.2.3 AdvanceStringIndex ( S, index, unicode ), https://tc39.es/ecma262/#sec-advancestringindex +size_t advance_string_index(String const& string, size_t index, bool unicode) +{ + if (!unicode) + return index + 1; + + Utf8View view(string); + + if (index + 1 >= view.length()) + return index + 1; + + auto it = view.begin(); + for (size_t i = 0; i < index; ++i) + ++it; + + // See https://tc39.es/ecma262/#sec-codepointat for details on [[CodeUnitCount]]. 
+ auto code_unit_count = 1; + if (is_unicode_surrogate(*it)) { + ++it; + + if ((it != view.end()) && is_unicode_surrogate(*it)) + code_unit_count = 2; + } + + return index + code_unit_count; +} + +static void increment_last_index(GlobalObject& global_object, Object& regexp_object, String const& string, bool unicode) { auto& vm = global_object.vm(); @@ -98,10 +127,7 @@ static void increment_last_index(GlobalObject& global_object, Object& regexp_obj if (vm.exception()) return; - // FIXME: Implement AdvanceStringIndex to take Unicode code points into account - https://tc39.es/ecma262/#sec-advancestringindex - // Once implemented, step (8a) of the @@replace algorithm must also be implemented. - ++last_index; - + last_index = advance_string_index(string, last_index, unicode); regexp_object.set(vm.names.lastIndex, Value(last_index), true); } @@ -194,6 +220,7 @@ static Value regexp_builtin_exec(GlobalObject& global_object, RegExpObject& rege auto& regex = regexp_object.regex(); bool global = regex.options().has_flag_set(ECMAScriptFlags::Global); bool sticky = regex.options().has_flag_set(ECMAScriptFlags::Sticky); + bool unicode = regex.options().has_flag_set(ECMAScriptFlags::Unicode); bool has_indices = regexp_object.flags().find('d').has_value(); if (!global && !sticky) @@ -226,8 +253,7 @@ static Value regexp_builtin_exec(GlobalObject& global_object, RegExpObject& rege return js_null(); } - // FIXME: Implement AdvanceStringIndex to take Unicode code points into account - https://tc39.es/ecma262/#sec-advancestringindex - ++last_index; + last_index = advance_string_index(string, last_index, unicode); } auto& match = result.matches[0]; @@ -472,6 +498,11 @@ JS_DEFINE_NATIVE_FUNCTION(RegExpPrototype::symbol_match) if (vm.exception()) return {}; + auto unicode_value = regexp_object->get(vm.names.unicode); + if (vm.exception()) + return {}; + bool unicode = unicode_value.to_boolean(); + size_t n = 0; while (true) { @@ -500,7 +531,7 @@ 
JS_DEFINE_NATIVE_FUNCTION(RegExpPrototype::symbol_match) return {}; if (match_str.is_empty()) { - increment_last_index(global_object, *regexp_object); + increment_last_index(global_object, *regexp_object, s, unicode); if (vm.exception()) return {}; } @@ -586,7 +617,14 @@ JS_DEFINE_NATIVE_FUNCTION(RegExpPrototype::symbol_replace) return {}; bool global = global_value.to_boolean(); + bool unicode = false; + if (global) { + auto unicode_value = regexp_object->get(vm.names.unicode); + if (vm.exception()) + return {}; + unicode = unicode_value.to_boolean(); + regexp_object->set(vm.names.lastIndex, Value(0), true); if (vm.exception()) return {}; @@ -617,7 +655,7 @@ JS_DEFINE_NATIVE_FUNCTION(RegExpPrototype::symbol_replace) return {}; if (match_str.is_empty()) { - increment_last_index(global_object, *regexp_object); + increment_last_index(global_object, *regexp_object, string, unicode); if (vm.exception()) return {}; } @@ -793,6 +831,8 @@ JS_DEFINE_NATIVE_FUNCTION(RegExpPrototype::symbol_split) auto flags = flags_object.to_string(global_object); if (vm.exception()) return {}; + + bool unicode = flags.find('u').has_value(); auto new_flags = flags.find('y').has_value() ? 
move(flags) : String::formatted("{}y", flags); MarkedValueList arguments(vm.heap()); @@ -839,8 +879,7 @@ JS_DEFINE_NATIVE_FUNCTION(RegExpPrototype::symbol_split) if (vm.exception()) return {}; if (result.is_null()) { - // FIXME: Implement AdvanceStringIndex to take Unicode code points into account - https://tc39.es/ecma262/#sec-advancestringindex - ++next_search_from; + next_search_from = advance_string_index(string, next_search_from, unicode); continue; } @@ -853,8 +892,7 @@ JS_DEFINE_NATIVE_FUNCTION(RegExpPrototype::symbol_split) last_index = min(last_index, string.length()); if (last_index == last_match_end) { - // FIXME: Implement AdvanceStringIndex to take Unicode code points into account - https://tc39.es/ecma262/#sec-advancestringindex - ++next_search_from; + next_search_from = advance_string_index(string, next_search_from, unicode); continue; } diff --git a/Userland/Libraries/LibJS/Runtime/RegExpPrototype.h b/Userland/Libraries/LibJS/Runtime/RegExpPrototype.h index fce9688abb..aa9a1570f6 100644 --- a/Userland/Libraries/LibJS/Runtime/RegExpPrototype.h +++ b/Userland/Libraries/LibJS/Runtime/RegExpPrototype.h @@ -11,6 +11,7 @@ namespace JS { Value regexp_exec(GlobalObject& global_object, Object& regexp_object, String const& string); +size_t advance_string_index(String const& string, size_t index, bool unicode); class RegExpPrototype final : public Object { JS_OBJECT(RegExpPrototype, Object); diff --git a/Userland/Libraries/LibJS/Runtime/RegExpStringIteratorPrototype.cpp b/Userland/Libraries/LibJS/Runtime/RegExpStringIteratorPrototype.cpp index 9b61656d20..b06661957d 100644 --- a/Userland/Libraries/LibJS/Runtime/RegExpStringIteratorPrototype.cpp +++ b/Userland/Libraries/LibJS/Runtime/RegExpStringIteratorPrototype.cpp @@ -75,8 +75,7 @@ JS_DEFINE_NATIVE_FUNCTION(RegExpStringIteratorPrototype::next) if (vm.exception()) return {}; - // FIXME: Implement AdvanceStringIndex to take Unicode code points into account - https://tc39.es/ecma262/#sec-advancestringindex - 
++last_index; + last_index = advance_string_index(iterator.string(), last_index, iterator.unicode()); iterator.regexp_object().set(vm.names.lastIndex, Value(last_index), true); if (vm.exception())