1
Fork 0
mirror of https://github.com/RGBCube/serenity synced 2025-07-26 03:37:43 +00:00

LibJS: Implement RegExp.prototype [ @@replace ] with UTF-16 code units

This also converts the GetSubstitution abstract operation to take its input
strings as UTF-16 now that all callers are UTF-16 capable. This means
String.prototype.replace (and replaceAll) no longer needs UTF-8 and
UTF-16 copies of these strings.
This commit is contained in:
Timothy Flynn 2021-07-22 10:38:10 -04:00 committed by Linus Groh
parent ee7b04f7bb
commit 5a8f870594
6 changed files with 67 additions and 71 deletions

View file

@ -576,59 +576,48 @@ Value canonical_numeric_index_string(GlobalObject& global_object, PropertyName c
} }
// 22.1.3.17.1 GetSubstitution ( matched, str, position, captures, namedCaptures, replacement ), https://tc39.es/ecma262/#sec-getsubstitution // 22.1.3.17.1 GetSubstitution ( matched, str, position, captures, namedCaptures, replacement ), https://tc39.es/ecma262/#sec-getsubstitution
String get_substitution(GlobalObject& global_object, String const& matched, String const& str, size_t position, Vector<Value> const& captures, Value named_captures, Value replacement) String get_substitution(GlobalObject& global_object, Utf16View const& matched, Utf16View const& str, size_t position, Vector<Value> const& captures, Value named_captures, Value replacement)
{ {
auto& vm = global_object.vm(); auto& vm = global_object.vm();
auto replace_string = replacement.to_string(global_object); auto replace_string = replacement.to_utf16_string(global_object);
if (vm.exception()) if (vm.exception())
return {}; return {};
Utf16View replace_view { replace_string };
// FIXME: Once RegExp.prototype supports UTF-16, this AO can take UTF-16 strings as parameters instead of having to transcode here.
auto utf16_matched = AK::utf8_to_utf16(matched);
auto match_length = utf16_matched.size();
auto utf16_string = AK::utf8_to_utf16(str);
Utf16View utf16_string_view { utf16_string };
auto string_length = utf16_string_view.length_in_code_units();
auto utf16_replace = AK::utf8_to_utf16(replace_string);
Utf16View utf16_replace_view { utf16_replace };
auto replace_length = utf16_replace_view.length_in_code_units();
StringBuilder result; StringBuilder result;
for (size_t i = 0; i < replace_length; ++i) { for (size_t i = 0; i < replace_view.length_in_code_units(); ++i) {
u16 curr = utf16_replace_view.code_unit_at(i); u16 curr = replace_view.code_unit_at(i);
if ((curr != '$') || (i + 1 >= replace_length)) { if ((curr != '$') || (i + 1 >= replace_view.length_in_code_units())) {
result.append(curr); result.append(curr);
continue; continue;
} }
u16 next = utf16_replace_view.code_unit_at(i + 1); u16 next = replace_view.code_unit_at(i + 1);
if (next == '$') { if (next == '$') {
result.append('$'); result.append('$');
++i; ++i;
} else if (next == '&') { } else if (next == '&') {
result.append(matched); result.append(matched.to_utf8(Utf16View::AllowInvalidCodeUnits::Yes));
++i; ++i;
} else if (next == '`') { } else if (next == '`') {
auto substring = utf16_string_view.substring_view(0, position); auto substring = str.substring_view(0, position);
result.append(substring.to_utf8(Utf16View::AllowInvalidCodeUnits::Yes)); result.append(substring.to_utf8(Utf16View::AllowInvalidCodeUnits::Yes));
++i; ++i;
} else if (next == '\'') { } else if (next == '\'') {
auto tail_pos = position + match_length; auto tail_pos = position + matched.length_in_code_units();
if (tail_pos < string_length) { if (tail_pos < str.length_in_code_units()) {
auto substring = utf16_string_view.substring_view(tail_pos); auto substring = str.substring_view(tail_pos);
result.append(substring.to_utf8(Utf16View::AllowInvalidCodeUnits::Yes)); result.append(substring.to_utf8(Utf16View::AllowInvalidCodeUnits::Yes));
} }
++i; ++i;
} else if (is_ascii_digit(next)) { } else if (is_ascii_digit(next)) {
bool is_two_digits = (i + 2 < replace_length) && is_ascii_digit(utf16_replace_view.code_unit_at(i + 2)); bool is_two_digits = (i + 2 < replace_view.length_in_code_units()) && is_ascii_digit(replace_view.code_unit_at(i + 2));
auto capture_postition_string = utf16_replace_view.substring_view(i + 1, is_two_digits ? 2 : 1).to_utf8(); auto capture_postition_string = replace_view.substring_view(i + 1, is_two_digits ? 2 : 1).to_utf8();
auto capture_position = capture_postition_string.to_uint(); auto capture_position = capture_postition_string.to_uint();
if (capture_position.has_value() && (*capture_position > 0) && (*capture_position <= captures.size())) { if (capture_position.has_value() && (*capture_position > 0) && (*capture_position <= captures.size())) {
@ -650,8 +639,8 @@ String get_substitution(GlobalObject& global_object, String const& matched, Stri
auto start_position = i + 2; auto start_position = i + 2;
Optional<size_t> end_position; Optional<size_t> end_position;
for (size_t j = start_position; j < replace_length; ++j) { for (size_t j = start_position; j < replace_view.length_in_code_units(); ++j) {
if (utf16_replace_view.code_unit_at(j) == '>') { if (replace_view.code_unit_at(j) == '>') {
end_position = j; end_position = j;
break; break;
} }
@ -660,7 +649,7 @@ String get_substitution(GlobalObject& global_object, String const& matched, Stri
if (named_captures.is_undefined() || !end_position.has_value()) { if (named_captures.is_undefined() || !end_position.has_value()) {
result.append(curr); result.append(curr);
} else { } else {
auto group_name_view = utf16_replace_view.substring_view(start_position, *end_position - start_position); auto group_name_view = replace_view.substring_view(start_position, *end_position - start_position);
auto group_name = group_name_view.to_utf8(Utf16View::AllowInvalidCodeUnits::Yes); auto group_name = group_name_view.to_utf8(Utf16View::AllowInvalidCodeUnits::Yes);
auto capture = named_captures.as_object().get(group_name); auto capture = named_captures.as_object().get(group_name);

View file

@ -30,7 +30,7 @@ Object* get_prototype_from_constructor(GlobalObject&, FunctionObject const& cons
Object* create_unmapped_arguments_object(GlobalObject&, Vector<Value> const& arguments); Object* create_unmapped_arguments_object(GlobalObject&, Vector<Value> const& arguments);
Object* create_mapped_arguments_object(GlobalObject&, FunctionObject&, Vector<FunctionNode::Parameter> const&, Vector<Value> const& arguments, Environment&); Object* create_mapped_arguments_object(GlobalObject&, FunctionObject&, Vector<FunctionNode::Parameter> const&, Vector<Value> const& arguments, Environment&);
Value canonical_numeric_index_string(GlobalObject&, PropertyName const&); Value canonical_numeric_index_string(GlobalObject&, PropertyName const&);
String get_substitution(GlobalObject&, String const& matched, String const& str, size_t position, Vector<Value> const& captures, Value named_captures, Value replacement); String get_substitution(GlobalObject&, Utf16View const& matched, Utf16View const& str, size_t position, Vector<Value> const& captures, Value named_captures, Value replacement);
enum class CallerMode { enum class CallerMode {
Strict, Strict,

View file

@ -127,14 +127,6 @@ static void increment_last_index(GlobalObject& global_object, Object& regexp_obj
regexp_object.set(vm.names.lastIndex, Value(last_index), Object::ShouldThrowExceptions::Yes); regexp_object.set(vm.names.lastIndex, Value(last_index), Object::ShouldThrowExceptions::Yes);
} }
static void increment_last_index(GlobalObject& global_object, Object& regexp_object, String const& string, bool unicode)
{
auto utf16_string = AK::utf8_to_utf16(string);
Utf16View utf16_string_view { utf16_string };
return increment_last_index(global_object, regexp_object, utf16_string_view, unicode);
}
// 1.1.2.1 Match Records, https://tc39.es/proposal-regexp-match-indices/#sec-match-records // 1.1.2.1 Match Records, https://tc39.es/proposal-regexp-match-indices/#sec-match-records
struct Match { struct Match {
static Match create(regex::Match const& match) static Match create(regex::Match const& match)
@ -619,9 +611,10 @@ JS_DEFINE_NATIVE_FUNCTION(RegExpPrototype::symbol_replace)
auto* regexp_object = this_object_from(vm, global_object); auto* regexp_object = this_object_from(vm, global_object);
if (!regexp_object) if (!regexp_object)
return {}; return {};
auto string = string_value.to_string(global_object); auto string = string_value.to_utf16_string(global_object);
if (vm.exception()) if (vm.exception())
return {}; return {};
Utf16View string_view { string };
if (!replace_value.is_function()) { if (!replace_value.is_function()) {
auto replace_string = replace_value.to_string(global_object); auto replace_string = replace_value.to_string(global_object);
@ -654,7 +647,7 @@ JS_DEFINE_NATIVE_FUNCTION(RegExpPrototype::symbol_replace)
MarkedValueList results(vm.heap()); MarkedValueList results(vm.heap());
while (true) { while (true) {
auto result = regexp_exec(global_object, *regexp_object, string); auto result = regexp_exec(global_object, *regexp_object, string_view);
if (vm.exception()) if (vm.exception())
return {}; return {};
if (result.is_null()) if (result.is_null())
@ -676,7 +669,7 @@ JS_DEFINE_NATIVE_FUNCTION(RegExpPrototype::symbol_replace)
return {}; return {};
if (match_str.is_empty()) { if (match_str.is_empty()) {
increment_last_index(global_object, *regexp_object, string, unicode); increment_last_index(global_object, *regexp_object, string_view, unicode);
if (vm.exception()) if (vm.exception())
return {}; return {};
} }
@ -693,10 +686,10 @@ JS_DEFINE_NATIVE_FUNCTION(RegExpPrototype::symbol_replace)
auto matched_value = result.get(0); auto matched_value = result.get(0);
if (vm.exception()) if (vm.exception())
return {}; return {};
auto matched = matched_value.to_utf16_string(global_object);
auto matched = matched_value.to_string(global_object);
if (vm.exception()) if (vm.exception())
return {}; return {};
Utf16View matched_view { matched };
auto position_value = result.get(vm.names.index); auto position_value = result.get(vm.names.index);
if (vm.exception()) if (vm.exception())
@ -706,7 +699,7 @@ JS_DEFINE_NATIVE_FUNCTION(RegExpPrototype::symbol_replace)
if (vm.exception()) if (vm.exception())
return {}; return {};
position = clamp(position, static_cast<double>(0), static_cast<double>(string.length())); position = clamp(position, static_cast<double>(0), static_cast<double>(string_view.length_in_code_units()));
MarkedValueList captures(vm.heap()); MarkedValueList captures(vm.heap());
for (size_t n = 1; n <= n_captures; ++n) { for (size_t n = 1; n <= n_captures; ++n) {
@ -735,10 +728,10 @@ JS_DEFINE_NATIVE_FUNCTION(RegExpPrototype::symbol_replace)
if (replace_value.is_function()) { if (replace_value.is_function()) {
MarkedValueList replacer_args(vm.heap()); MarkedValueList replacer_args(vm.heap());
replacer_args.append(js_string(vm, matched)); replacer_args.append(js_string(vm, matched_view));
replacer_args.extend(move(captures)); replacer_args.extend(move(captures));
replacer_args.append(Value(position)); replacer_args.append(Value(position));
replacer_args.append(js_string(vm, string)); replacer_args.append(js_string(vm, string_view));
if (!named_captures.is_undefined()) { if (!named_captures.is_undefined()) {
replacer_args.append(move(named_captures)); replacer_args.append(move(named_captures));
} }
@ -758,28 +751,32 @@ JS_DEFINE_NATIVE_FUNCTION(RegExpPrototype::symbol_replace)
return {}; return {};
} }
replacement = get_substitution(global_object, matched, string, position, captures, named_captures_object, replace_value); replacement = get_substitution(global_object, matched_view, string_view, position, captures, named_captures_object, replace_value);
if (vm.exception()) if (vm.exception())
return {}; return {};
} }
if (position >= next_source_position) { if (position >= next_source_position) {
auto substring = string_view.substring_view(next_source_position, position - next_source_position);
StringBuilder builder; StringBuilder builder;
builder.append(accumulated_result); builder.append(accumulated_result);
builder.append(string.substring(next_source_position, position - next_source_position)); builder.append(substring.to_utf8(Utf16View::AllowInvalidCodeUnits::Yes));
builder.append(replacement); builder.append(replacement);
accumulated_result = builder.build(); accumulated_result = builder.build();
next_source_position = position + matched.length(); next_source_position = position + matched_view.length_in_code_units();
} }
} }
if (next_source_position >= string.length()) if (next_source_position >= string_view.length_in_code_units())
return js_string(vm, accumulated_result); return js_string(vm, accumulated_result);
auto substring = string_view.substring_view(next_source_position);
StringBuilder builder; StringBuilder builder;
builder.append(accumulated_result); builder.append(accumulated_result);
builder.append(string.substring(next_source_position)); builder.append(substring.to_utf8(Utf16View::AllowInvalidCodeUnits::Yes));
return js_string(vm, builder.build()); return js_string(vm, builder.build());
} }

View file

@ -910,10 +910,10 @@ JS_DEFINE_NATIVE_FUNCTION(StringPrototype::replace)
return {}; return {};
} }
auto string = this_object.to_string(global_object); auto string = this_object.to_utf16_string(global_object);
if (vm.exception()) if (vm.exception())
return {}; return {};
auto search_string = search_value.to_string(global_object); auto search_string = search_value.to_utf16_string(global_object);
if (vm.exception()) if (vm.exception())
return {}; return {};
@ -926,11 +926,8 @@ JS_DEFINE_NATIVE_FUNCTION(StringPrototype::replace)
return {}; return {};
} }
auto utf16_string = AK::utf8_to_utf16(string); Utf16View utf16_string_view { string };
Utf16View utf16_string_view { utf16_string }; Utf16View utf16_search_view { search_string };
auto utf16_search_string = AK::utf8_to_utf16(search_string);
Utf16View utf16_search_view { utf16_search_string };
Optional<size_t> position = string_index_of(utf16_string_view, utf16_search_view, 0); Optional<size_t> position = string_index_of(utf16_string_view, utf16_search_view, 0);
if (!position.has_value()) if (!position.has_value())
@ -948,7 +945,7 @@ JS_DEFINE_NATIVE_FUNCTION(StringPrototype::replace)
if (vm.exception()) if (vm.exception())
return {}; return {};
} else { } else {
replacement = get_substitution(global_object, search_string, string, *position, {}, js_undefined(), replace_value); replacement = get_substitution(global_object, utf16_search_view, utf16_string_view, *position, {}, js_undefined(), replace_value);
if (vm.exception()) if (vm.exception())
return {}; return {};
} }
@ -1004,10 +1001,10 @@ JS_DEFINE_NATIVE_FUNCTION(StringPrototype::replace_all)
} }
} }
auto string = this_object.to_string(global_object); auto string = this_object.to_utf16_string(global_object);
if (vm.exception()) if (vm.exception())
return {}; return {};
auto search_string = search_value.to_string(global_object); auto search_string = search_value.to_utf16_string(global_object);
if (vm.exception()) if (vm.exception())
return {}; return {};
@ -1020,12 +1017,10 @@ JS_DEFINE_NATIVE_FUNCTION(StringPrototype::replace_all)
return {}; return {};
} }
auto utf16_string = AK::utf8_to_utf16(string); Utf16View utf16_string_view { string };
Utf16View utf16_string_view { utf16_string };
auto string_length = utf16_string_view.length_in_code_units(); auto string_length = utf16_string_view.length_in_code_units();
auto utf16_search_string = AK::utf8_to_utf16(search_string); Utf16View utf16_search_view { search_string };
Utf16View utf16_search_view { utf16_search_string };
auto search_length = utf16_search_view.length_in_code_units(); auto search_length = utf16_search_view.length_in_code_units();
Vector<size_t> match_positions; Vector<size_t> match_positions;
@ -1053,7 +1048,7 @@ JS_DEFINE_NATIVE_FUNCTION(StringPrototype::replace_all)
if (vm.exception()) if (vm.exception())
return {}; return {};
} else { } else {
replacement = get_substitution(global_object, search_string, string, position, {}, js_undefined(), replace_value); replacement = get_substitution(global_object, utf16_search_view, utf16_string_view, position, {}, js_undefined(), replace_value);
if (vm.exception()) if (vm.exception())
return {}; return {};
} }

View file

@ -238,7 +238,11 @@ test("UTF-16", () => {
expect("😀".replace("\ud83d", "")).toBe("\ude00"); expect("😀".replace("\ud83d", "")).toBe("\ude00");
expect("😀".replace("\ude00", "")).toBe("\ud83d"); expect("😀".replace("\ude00", "")).toBe("\ud83d");
// FIXME: RegExp.prototype [ @@replace ] also needs to support UTF-16. expect("😀".replace(/\ud83d/, "")).toBe("\ude00");
// expect("😀".replace(/\ud83d/, "")).toBe("\ude00"); expect("😀".replace(/\ude00/, "")).toBe("\ud83d");
// expect("😀".replace(/\ude00/, "")).toBe("\ud83d"); expect("😀".replace(/\ud83d\ude00/, "")).toBe("");
expect("😀".replace(/\ud83d/u, "")).toBe("😀");
expect("😀".replace(/\ude00/u, "")).toBe("😀");
expect("😀".replace(/\ud83d\ude00/u, "")).toBe("");
}); });

View file

@ -151,7 +151,18 @@ test("UTF-16", () => {
expect("😀😀😀".replaceAll("\ud83d", "")).toBe("\ude00\ude00\ude00"); expect("😀😀😀".replaceAll("\ud83d", "")).toBe("\ude00\ude00\ude00");
expect("😀😀😀".replaceAll("\ude00", "")).toBe("\ud83d\ud83d\ud83d"); expect("😀😀😀".replaceAll("\ude00", "")).toBe("\ud83d\ud83d\ud83d");
// FIXME: RegExp.prototype [ @@replace ] also needs to support UTF-16. expect("😀".replaceAll(/\ud83d/g, "")).toBe("\ude00");
// expect("😀".replaceAll(/\ud83d/g, "")).toBe("\ude00"); expect("😀".replaceAll(/\ude00/g, "")).toBe("\ud83d");
// expect("😀".replaceAll(/\ude00/g, "")).toBe("\ud83d"); expect("😀".replaceAll(/\ud83d\ude00/g, "")).toBe("");
expect("😀😀😀".replaceAll(/\ud83d/g, "")).toBe("\ude00\ude00\ude00");
expect("😀😀😀".replaceAll(/\ude00/g, "")).toBe("\ud83d\ud83d\ud83d");
expect("😀😀😀".replaceAll(/\ud83d\ude00/g, "")).toBe("");
expect("😀".replaceAll(/\ud83d/gu, "")).toBe("😀");
expect("😀".replaceAll(/\ude00/gu, "")).toBe("😀");
expect("😀".replaceAll(/\ud83d\ude00/gu, "")).toBe("");
expect("😀😀😀".replaceAll(/\ud83d/gu, "")).toBe("😀😀😀");
expect("😀😀😀".replaceAll(/\ude00/gu, "")).toBe("😀😀😀");
expect("😀😀😀".replaceAll(/\ude00/gu, "")).toBe("😀😀😀");
expect("😀😀😀".replaceAll(/\ud83d\ude00/gu, "")).toBe("");
}); });