1
Fork 0
mirror of https://github.com/RGBCube/serenity synced 2025-07-26 03:37:43 +00:00

LibJS: Implement RegExp.prototype [ @@replace ] with UTF-16 code units

This also converts the GetSubstitution abstract operation to take its input
strings as UTF-16 now that all callers are UTF-16 capable. This means
String.prototype.replace (and replaceAll) no longer needs UTF-8 and
UTF-16 copies of these strings.
This commit is contained in:
Timothy Flynn 2021-07-22 10:38:10 -04:00 committed by Linus Groh
parent ee7b04f7bb
commit 5a8f870594
6 changed files with 67 additions and 71 deletions

View file

@ -576,59 +576,48 @@ Value canonical_numeric_index_string(GlobalObject& global_object, PropertyName c
}
// 22.1.3.17.1 GetSubstitution ( matched, str, position, captures, namedCaptures, replacement ), https://tc39.es/ecma262/#sec-getsubstitution
String get_substitution(GlobalObject& global_object, String const& matched, String const& str, size_t position, Vector<Value> const& captures, Value named_captures, Value replacement)
String get_substitution(GlobalObject& global_object, Utf16View const& matched, Utf16View const& str, size_t position, Vector<Value> const& captures, Value named_captures, Value replacement)
{
auto& vm = global_object.vm();
auto replace_string = replacement.to_string(global_object);
auto replace_string = replacement.to_utf16_string(global_object);
if (vm.exception())
return {};
// FIXME: Once RegExp.prototype supports UTF-16, this AO can take UTF-16 strings as parameters instead of having to transcode here.
auto utf16_matched = AK::utf8_to_utf16(matched);
auto match_length = utf16_matched.size();
auto utf16_string = AK::utf8_to_utf16(str);
Utf16View utf16_string_view { utf16_string };
auto string_length = utf16_string_view.length_in_code_units();
auto utf16_replace = AK::utf8_to_utf16(replace_string);
Utf16View utf16_replace_view { utf16_replace };
auto replace_length = utf16_replace_view.length_in_code_units();
Utf16View replace_view { replace_string };
StringBuilder result;
for (size_t i = 0; i < replace_length; ++i) {
u16 curr = utf16_replace_view.code_unit_at(i);
for (size_t i = 0; i < replace_view.length_in_code_units(); ++i) {
u16 curr = replace_view.code_unit_at(i);
if ((curr != '$') || (i + 1 >= replace_length)) {
if ((curr != '$') || (i + 1 >= replace_view.length_in_code_units())) {
result.append(curr);
continue;
}
u16 next = utf16_replace_view.code_unit_at(i + 1);
u16 next = replace_view.code_unit_at(i + 1);
if (next == '$') {
result.append('$');
++i;
} else if (next == '&') {
result.append(matched);
result.append(matched.to_utf8(Utf16View::AllowInvalidCodeUnits::Yes));
++i;
} else if (next == '`') {
auto substring = utf16_string_view.substring_view(0, position);
auto substring = str.substring_view(0, position);
result.append(substring.to_utf8(Utf16View::AllowInvalidCodeUnits::Yes));
++i;
} else if (next == '\'') {
auto tail_pos = position + match_length;
if (tail_pos < string_length) {
auto substring = utf16_string_view.substring_view(tail_pos);
auto tail_pos = position + matched.length_in_code_units();
if (tail_pos < str.length_in_code_units()) {
auto substring = str.substring_view(tail_pos);
result.append(substring.to_utf8(Utf16View::AllowInvalidCodeUnits::Yes));
}
++i;
} else if (is_ascii_digit(next)) {
bool is_two_digits = (i + 2 < replace_length) && is_ascii_digit(utf16_replace_view.code_unit_at(i + 2));
bool is_two_digits = (i + 2 < replace_view.length_in_code_units()) && is_ascii_digit(replace_view.code_unit_at(i + 2));
auto capture_postition_string = utf16_replace_view.substring_view(i + 1, is_two_digits ? 2 : 1).to_utf8();
auto capture_postition_string = replace_view.substring_view(i + 1, is_two_digits ? 2 : 1).to_utf8();
auto capture_position = capture_postition_string.to_uint();
if (capture_position.has_value() && (*capture_position > 0) && (*capture_position <= captures.size())) {
@ -650,8 +639,8 @@ String get_substitution(GlobalObject& global_object, String const& matched, Stri
auto start_position = i + 2;
Optional<size_t> end_position;
for (size_t j = start_position; j < replace_length; ++j) {
if (utf16_replace_view.code_unit_at(j) == '>') {
for (size_t j = start_position; j < replace_view.length_in_code_units(); ++j) {
if (replace_view.code_unit_at(j) == '>') {
end_position = j;
break;
}
@ -660,7 +649,7 @@ String get_substitution(GlobalObject& global_object, String const& matched, Stri
if (named_captures.is_undefined() || !end_position.has_value()) {
result.append(curr);
} else {
auto group_name_view = utf16_replace_view.substring_view(start_position, *end_position - start_position);
auto group_name_view = replace_view.substring_view(start_position, *end_position - start_position);
auto group_name = group_name_view.to_utf8(Utf16View::AllowInvalidCodeUnits::Yes);
auto capture = named_captures.as_object().get(group_name);

View file

@ -30,7 +30,7 @@ Object* get_prototype_from_constructor(GlobalObject&, FunctionObject const& cons
Object* create_unmapped_arguments_object(GlobalObject&, Vector<Value> const& arguments);
Object* create_mapped_arguments_object(GlobalObject&, FunctionObject&, Vector<FunctionNode::Parameter> const&, Vector<Value> const& arguments, Environment&);
Value canonical_numeric_index_string(GlobalObject&, PropertyName const&);
String get_substitution(GlobalObject&, String const& matched, String const& str, size_t position, Vector<Value> const& captures, Value named_captures, Value replacement);
String get_substitution(GlobalObject&, Utf16View const& matched, Utf16View const& str, size_t position, Vector<Value> const& captures, Value named_captures, Value replacement);
enum class CallerMode {
Strict,

View file

@ -127,14 +127,6 @@ static void increment_last_index(GlobalObject& global_object, Object& regexp_obj
regexp_object.set(vm.names.lastIndex, Value(last_index), Object::ShouldThrowExceptions::Yes);
}
// Convenience overload for callers that hold the subject string as a UTF-8 String.
// It transcodes the string with AK::utf8_to_utf16, wraps the result in a Utf16View,
// and forwards to the overload of increment_last_index that accepts a Utf16View.
static void increment_last_index(GlobalObject& global_object, Object& regexp_object, String const& string, bool unicode)
{
    // Keep the transcoded data in a named local; the view below is constructed from it.
    auto transcoded = AK::utf8_to_utf16(string);
    Utf16View transcoded_view { transcoded };

    increment_last_index(global_object, regexp_object, transcoded_view, unicode);
}
// 1.1.2.1 Match Records, https://tc39.es/proposal-regexp-match-indices/#sec-match-records
struct Match {
static Match create(regex::Match const& match)
@ -619,9 +611,10 @@ JS_DEFINE_NATIVE_FUNCTION(RegExpPrototype::symbol_replace)
auto* regexp_object = this_object_from(vm, global_object);
if (!regexp_object)
return {};
auto string = string_value.to_string(global_object);
auto string = string_value.to_utf16_string(global_object);
if (vm.exception())
return {};
Utf16View string_view { string };
if (!replace_value.is_function()) {
auto replace_string = replace_value.to_string(global_object);
@ -654,7 +647,7 @@ JS_DEFINE_NATIVE_FUNCTION(RegExpPrototype::symbol_replace)
MarkedValueList results(vm.heap());
while (true) {
auto result = regexp_exec(global_object, *regexp_object, string);
auto result = regexp_exec(global_object, *regexp_object, string_view);
if (vm.exception())
return {};
if (result.is_null())
@ -676,7 +669,7 @@ JS_DEFINE_NATIVE_FUNCTION(RegExpPrototype::symbol_replace)
return {};
if (match_str.is_empty()) {
increment_last_index(global_object, *regexp_object, string, unicode);
increment_last_index(global_object, *regexp_object, string_view, unicode);
if (vm.exception())
return {};
}
@ -693,10 +686,10 @@ JS_DEFINE_NATIVE_FUNCTION(RegExpPrototype::symbol_replace)
auto matched_value = result.get(0);
if (vm.exception())
return {};
auto matched = matched_value.to_string(global_object);
auto matched = matched_value.to_utf16_string(global_object);
if (vm.exception())
return {};
Utf16View matched_view { matched };
auto position_value = result.get(vm.names.index);
if (vm.exception())
@ -706,7 +699,7 @@ JS_DEFINE_NATIVE_FUNCTION(RegExpPrototype::symbol_replace)
if (vm.exception())
return {};
position = clamp(position, static_cast<double>(0), static_cast<double>(string.length()));
position = clamp(position, static_cast<double>(0), static_cast<double>(string_view.length_in_code_units()));
MarkedValueList captures(vm.heap());
for (size_t n = 1; n <= n_captures; ++n) {
@ -735,10 +728,10 @@ JS_DEFINE_NATIVE_FUNCTION(RegExpPrototype::symbol_replace)
if (replace_value.is_function()) {
MarkedValueList replacer_args(vm.heap());
replacer_args.append(js_string(vm, matched));
replacer_args.append(js_string(vm, matched_view));
replacer_args.extend(move(captures));
replacer_args.append(Value(position));
replacer_args.append(js_string(vm, string));
replacer_args.append(js_string(vm, string_view));
if (!named_captures.is_undefined()) {
replacer_args.append(move(named_captures));
}
@ -758,28 +751,32 @@ JS_DEFINE_NATIVE_FUNCTION(RegExpPrototype::symbol_replace)
return {};
}
replacement = get_substitution(global_object, matched, string, position, captures, named_captures_object, replace_value);
replacement = get_substitution(global_object, matched_view, string_view, position, captures, named_captures_object, replace_value);
if (vm.exception())
return {};
}
if (position >= next_source_position) {
auto substring = string_view.substring_view(next_source_position, position - next_source_position);
StringBuilder builder;
builder.append(accumulated_result);
builder.append(string.substring(next_source_position, position - next_source_position));
builder.append(substring.to_utf8(Utf16View::AllowInvalidCodeUnits::Yes));
builder.append(replacement);
accumulated_result = builder.build();
next_source_position = position + matched.length();
next_source_position = position + matched_view.length_in_code_units();
}
}
if (next_source_position >= string.length())
if (next_source_position >= string_view.length_in_code_units())
return js_string(vm, accumulated_result);
auto substring = string_view.substring_view(next_source_position);
StringBuilder builder;
builder.append(accumulated_result);
builder.append(string.substring(next_source_position));
builder.append(substring.to_utf8(Utf16View::AllowInvalidCodeUnits::Yes));
return js_string(vm, builder.build());
}

View file

@ -910,10 +910,10 @@ JS_DEFINE_NATIVE_FUNCTION(StringPrototype::replace)
return {};
}
auto string = this_object.to_string(global_object);
auto string = this_object.to_utf16_string(global_object);
if (vm.exception())
return {};
auto search_string = search_value.to_string(global_object);
auto search_string = search_value.to_utf16_string(global_object);
if (vm.exception())
return {};
@ -926,11 +926,8 @@ JS_DEFINE_NATIVE_FUNCTION(StringPrototype::replace)
return {};
}
auto utf16_string = AK::utf8_to_utf16(string);
Utf16View utf16_string_view { utf16_string };
auto utf16_search_string = AK::utf8_to_utf16(search_string);
Utf16View utf16_search_view { utf16_search_string };
Utf16View utf16_string_view { string };
Utf16View utf16_search_view { search_string };
Optional<size_t> position = string_index_of(utf16_string_view, utf16_search_view, 0);
if (!position.has_value())
@ -948,7 +945,7 @@ JS_DEFINE_NATIVE_FUNCTION(StringPrototype::replace)
if (vm.exception())
return {};
} else {
replacement = get_substitution(global_object, search_string, string, *position, {}, js_undefined(), replace_value);
replacement = get_substitution(global_object, utf16_search_view, utf16_string_view, *position, {}, js_undefined(), replace_value);
if (vm.exception())
return {};
}
@ -1004,10 +1001,10 @@ JS_DEFINE_NATIVE_FUNCTION(StringPrototype::replace_all)
}
}
auto string = this_object.to_string(global_object);
auto string = this_object.to_utf16_string(global_object);
if (vm.exception())
return {};
auto search_string = search_value.to_string(global_object);
auto search_string = search_value.to_utf16_string(global_object);
if (vm.exception())
return {};
@ -1020,12 +1017,10 @@ JS_DEFINE_NATIVE_FUNCTION(StringPrototype::replace_all)
return {};
}
auto utf16_string = AK::utf8_to_utf16(string);
Utf16View utf16_string_view { utf16_string };
Utf16View utf16_string_view { string };
auto string_length = utf16_string_view.length_in_code_units();
auto utf16_search_string = AK::utf8_to_utf16(search_string);
Utf16View utf16_search_view { utf16_search_string };
Utf16View utf16_search_view { search_string };
auto search_length = utf16_search_view.length_in_code_units();
Vector<size_t> match_positions;
@ -1053,7 +1048,7 @@ JS_DEFINE_NATIVE_FUNCTION(StringPrototype::replace_all)
if (vm.exception())
return {};
} else {
replacement = get_substitution(global_object, search_string, string, position, {}, js_undefined(), replace_value);
replacement = get_substitution(global_object, utf16_search_view, utf16_string_view, position, {}, js_undefined(), replace_value);
if (vm.exception())
return {};
}

View file

@ -238,7 +238,11 @@ test("UTF-16", () => {
expect("😀".replace("\ud83d", "")).toBe("\ude00");
expect("😀".replace("\ude00", "")).toBe("\ud83d");
// FIXME: RegExp.prototype [ @@replace ] also needs to support UTF-16.
// expect("😀".replace(/\ud83d/, "")).toBe("\ude00");
// expect("😀".replace(/\ude00/, "")).toBe("\ud83d");
expect("😀".replace(/\ud83d/, "")).toBe("\ude00");
expect("😀".replace(/\ude00/, "")).toBe("\ud83d");
expect("😀".replace(/\ud83d\ude00/, "")).toBe("");
expect("😀".replace(/\ud83d/u, "")).toBe("😀");
expect("😀".replace(/\ude00/u, "")).toBe("😀");
expect("😀".replace(/\ud83d\ude00/u, "")).toBe("");
});

View file

@ -151,7 +151,18 @@ test("UTF-16", () => {
expect("😀😀😀".replaceAll("\ud83d", "")).toBe("\ude00\ude00\ude00");
expect("😀😀😀".replaceAll("\ude00", "")).toBe("\ud83d\ud83d\ud83d");
// FIXME: RegExp.prototype [ @@replace ] also needs to support UTF-16.
// expect("😀".replaceAll(/\ud83d/g, "")).toBe("\ude00");
// expect("😀".replaceAll(/\ude00/g, "")).toBe("\ud83d");
expect("😀".replaceAll(/\ud83d/g, "")).toBe("\ude00");
expect("😀".replaceAll(/\ude00/g, "")).toBe("\ud83d");
expect("😀".replaceAll(/\ud83d\ude00/g, "")).toBe("");
expect("😀😀😀".replaceAll(/\ud83d/g, "")).toBe("\ude00\ude00\ude00");
expect("😀😀😀".replaceAll(/\ude00/g, "")).toBe("\ud83d\ud83d\ud83d");
expect("😀😀😀".replaceAll(/\ud83d\ude00/g, "")).toBe("");
expect("😀".replaceAll(/\ud83d/gu, "")).toBe("😀");
expect("😀".replaceAll(/\ude00/gu, "")).toBe("😀");
expect("😀".replaceAll(/\ud83d\ude00/gu, "")).toBe("");
expect("😀😀😀".replaceAll(/\ud83d/gu, "")).toBe("😀😀😀");
expect("😀😀😀".replaceAll(/\ude00/gu, "")).toBe("😀😀😀");
expect("😀😀😀".replaceAll(/\ude00/gu, "")).toBe("😀😀😀");
expect("😀😀😀".replaceAll(/\ud83d\ude00/gu, "")).toBe("");
});