mirror of
https://github.com/RGBCube/serenity
synced 2025-07-26 03:37:43 +00:00
LibJS: Implement RegExp.prototype [ @@replace ] with UTF-16 code units
This also converts the GetSubstitution abstract operation to take its input strings as UTF-16 now that all callers are UTF-16 capable. This means String.prototype.replace (and replaceAll) no longer needs UTF-8 and UTF-16 copies of these strings.
This commit is contained in:
parent
ee7b04f7bb
commit
5a8f870594
6 changed files with 67 additions and 71 deletions
|
@ -576,59 +576,48 @@ Value canonical_numeric_index_string(GlobalObject& global_object, PropertyName c
|
|||
}
|
||||
|
||||
// 22.1.3.17.1 GetSubstitution ( matched, str, position, captures, namedCaptures, replacement ), https://tc39.es/ecma262/#sec-getsubstitution
|
||||
String get_substitution(GlobalObject& global_object, String const& matched, String const& str, size_t position, Vector<Value> const& captures, Value named_captures, Value replacement)
|
||||
String get_substitution(GlobalObject& global_object, Utf16View const& matched, Utf16View const& str, size_t position, Vector<Value> const& captures, Value named_captures, Value replacement)
|
||||
{
|
||||
auto& vm = global_object.vm();
|
||||
|
||||
auto replace_string = replacement.to_string(global_object);
|
||||
auto replace_string = replacement.to_utf16_string(global_object);
|
||||
if (vm.exception())
|
||||
return {};
|
||||
|
||||
// FIXME: Once RegExp.prototype supports UTF-16, this AO can take UTF-16 strings as parameters instead of having to transcode here.
|
||||
auto utf16_matched = AK::utf8_to_utf16(matched);
|
||||
auto match_length = utf16_matched.size();
|
||||
|
||||
auto utf16_string = AK::utf8_to_utf16(str);
|
||||
Utf16View utf16_string_view { utf16_string };
|
||||
auto string_length = utf16_string_view.length_in_code_units();
|
||||
|
||||
auto utf16_replace = AK::utf8_to_utf16(replace_string);
|
||||
Utf16View utf16_replace_view { utf16_replace };
|
||||
auto replace_length = utf16_replace_view.length_in_code_units();
|
||||
Utf16View replace_view { replace_string };
|
||||
|
||||
StringBuilder result;
|
||||
|
||||
for (size_t i = 0; i < replace_length; ++i) {
|
||||
u16 curr = utf16_replace_view.code_unit_at(i);
|
||||
for (size_t i = 0; i < replace_view.length_in_code_units(); ++i) {
|
||||
u16 curr = replace_view.code_unit_at(i);
|
||||
|
||||
if ((curr != '$') || (i + 1 >= replace_length)) {
|
||||
if ((curr != '$') || (i + 1 >= replace_view.length_in_code_units())) {
|
||||
result.append(curr);
|
||||
continue;
|
||||
}
|
||||
|
||||
u16 next = utf16_replace_view.code_unit_at(i + 1);
|
||||
u16 next = replace_view.code_unit_at(i + 1);
|
||||
|
||||
if (next == '$') {
|
||||
result.append('$');
|
||||
++i;
|
||||
} else if (next == '&') {
|
||||
result.append(matched);
|
||||
result.append(matched.to_utf8(Utf16View::AllowInvalidCodeUnits::Yes));
|
||||
++i;
|
||||
} else if (next == '`') {
|
||||
auto substring = utf16_string_view.substring_view(0, position);
|
||||
auto substring = str.substring_view(0, position);
|
||||
result.append(substring.to_utf8(Utf16View::AllowInvalidCodeUnits::Yes));
|
||||
++i;
|
||||
} else if (next == '\'') {
|
||||
auto tail_pos = position + match_length;
|
||||
if (tail_pos < string_length) {
|
||||
auto substring = utf16_string_view.substring_view(tail_pos);
|
||||
auto tail_pos = position + matched.length_in_code_units();
|
||||
if (tail_pos < str.length_in_code_units()) {
|
||||
auto substring = str.substring_view(tail_pos);
|
||||
result.append(substring.to_utf8(Utf16View::AllowInvalidCodeUnits::Yes));
|
||||
}
|
||||
++i;
|
||||
} else if (is_ascii_digit(next)) {
|
||||
bool is_two_digits = (i + 2 < replace_length) && is_ascii_digit(utf16_replace_view.code_unit_at(i + 2));
|
||||
bool is_two_digits = (i + 2 < replace_view.length_in_code_units()) && is_ascii_digit(replace_view.code_unit_at(i + 2));
|
||||
|
||||
auto capture_postition_string = utf16_replace_view.substring_view(i + 1, is_two_digits ? 2 : 1).to_utf8();
|
||||
auto capture_postition_string = replace_view.substring_view(i + 1, is_two_digits ? 2 : 1).to_utf8();
|
||||
auto capture_position = capture_postition_string.to_uint();
|
||||
|
||||
if (capture_position.has_value() && (*capture_position > 0) && (*capture_position <= captures.size())) {
|
||||
|
@ -650,8 +639,8 @@ String get_substitution(GlobalObject& global_object, String const& matched, Stri
|
|||
auto start_position = i + 2;
|
||||
Optional<size_t> end_position;
|
||||
|
||||
for (size_t j = start_position; j < replace_length; ++j) {
|
||||
if (utf16_replace_view.code_unit_at(j) == '>') {
|
||||
for (size_t j = start_position; j < replace_view.length_in_code_units(); ++j) {
|
||||
if (replace_view.code_unit_at(j) == '>') {
|
||||
end_position = j;
|
||||
break;
|
||||
}
|
||||
|
@ -660,7 +649,7 @@ String get_substitution(GlobalObject& global_object, String const& matched, Stri
|
|||
if (named_captures.is_undefined() || !end_position.has_value()) {
|
||||
result.append(curr);
|
||||
} else {
|
||||
auto group_name_view = utf16_replace_view.substring_view(start_position, *end_position - start_position);
|
||||
auto group_name_view = replace_view.substring_view(start_position, *end_position - start_position);
|
||||
auto group_name = group_name_view.to_utf8(Utf16View::AllowInvalidCodeUnits::Yes);
|
||||
|
||||
auto capture = named_captures.as_object().get(group_name);
|
||||
|
|
|
@ -30,7 +30,7 @@ Object* get_prototype_from_constructor(GlobalObject&, FunctionObject const& cons
|
|||
Object* create_unmapped_arguments_object(GlobalObject&, Vector<Value> const& arguments);
|
||||
Object* create_mapped_arguments_object(GlobalObject&, FunctionObject&, Vector<FunctionNode::Parameter> const&, Vector<Value> const& arguments, Environment&);
|
||||
Value canonical_numeric_index_string(GlobalObject&, PropertyName const&);
|
||||
String get_substitution(GlobalObject&, String const& matched, String const& str, size_t position, Vector<Value> const& captures, Value named_captures, Value replacement);
|
||||
String get_substitution(GlobalObject&, Utf16View const& matched, Utf16View const& str, size_t position, Vector<Value> const& captures, Value named_captures, Value replacement);
|
||||
|
||||
enum class CallerMode {
|
||||
Strict,
|
||||
|
|
|
@ -127,14 +127,6 @@ static void increment_last_index(GlobalObject& global_object, Object& regexp_obj
|
|||
regexp_object.set(vm.names.lastIndex, Value(last_index), Object::ShouldThrowExceptions::Yes);
|
||||
}
|
||||
|
||||
static void increment_last_index(GlobalObject& global_object, Object& regexp_object, String const& string, bool unicode)
|
||||
{
|
||||
auto utf16_string = AK::utf8_to_utf16(string);
|
||||
Utf16View utf16_string_view { utf16_string };
|
||||
|
||||
return increment_last_index(global_object, regexp_object, utf16_string_view, unicode);
|
||||
}
|
||||
|
||||
// 1.1.2.1 Match Records, https://tc39.es/proposal-regexp-match-indices/#sec-match-records
|
||||
struct Match {
|
||||
static Match create(regex::Match const& match)
|
||||
|
@ -619,9 +611,10 @@ JS_DEFINE_NATIVE_FUNCTION(RegExpPrototype::symbol_replace)
|
|||
auto* regexp_object = this_object_from(vm, global_object);
|
||||
if (!regexp_object)
|
||||
return {};
|
||||
auto string = string_value.to_string(global_object);
|
||||
auto string = string_value.to_utf16_string(global_object);
|
||||
if (vm.exception())
|
||||
return {};
|
||||
Utf16View string_view { string };
|
||||
|
||||
if (!replace_value.is_function()) {
|
||||
auto replace_string = replace_value.to_string(global_object);
|
||||
|
@ -654,7 +647,7 @@ JS_DEFINE_NATIVE_FUNCTION(RegExpPrototype::symbol_replace)
|
|||
MarkedValueList results(vm.heap());
|
||||
|
||||
while (true) {
|
||||
auto result = regexp_exec(global_object, *regexp_object, string);
|
||||
auto result = regexp_exec(global_object, *regexp_object, string_view);
|
||||
if (vm.exception())
|
||||
return {};
|
||||
if (result.is_null())
|
||||
|
@ -676,7 +669,7 @@ JS_DEFINE_NATIVE_FUNCTION(RegExpPrototype::symbol_replace)
|
|||
return {};
|
||||
|
||||
if (match_str.is_empty()) {
|
||||
increment_last_index(global_object, *regexp_object, string, unicode);
|
||||
increment_last_index(global_object, *regexp_object, string_view, unicode);
|
||||
if (vm.exception())
|
||||
return {};
|
||||
}
|
||||
|
@ -693,10 +686,10 @@ JS_DEFINE_NATIVE_FUNCTION(RegExpPrototype::symbol_replace)
|
|||
auto matched_value = result.get(0);
|
||||
if (vm.exception())
|
||||
return {};
|
||||
|
||||
auto matched = matched_value.to_string(global_object);
|
||||
auto matched = matched_value.to_utf16_string(global_object);
|
||||
if (vm.exception())
|
||||
return {};
|
||||
Utf16View matched_view { matched };
|
||||
|
||||
auto position_value = result.get(vm.names.index);
|
||||
if (vm.exception())
|
||||
|
@ -706,7 +699,7 @@ JS_DEFINE_NATIVE_FUNCTION(RegExpPrototype::symbol_replace)
|
|||
if (vm.exception())
|
||||
return {};
|
||||
|
||||
position = clamp(position, static_cast<double>(0), static_cast<double>(string.length()));
|
||||
position = clamp(position, static_cast<double>(0), static_cast<double>(string_view.length_in_code_units()));
|
||||
|
||||
MarkedValueList captures(vm.heap());
|
||||
for (size_t n = 1; n <= n_captures; ++n) {
|
||||
|
@ -735,10 +728,10 @@ JS_DEFINE_NATIVE_FUNCTION(RegExpPrototype::symbol_replace)
|
|||
|
||||
if (replace_value.is_function()) {
|
||||
MarkedValueList replacer_args(vm.heap());
|
||||
replacer_args.append(js_string(vm, matched));
|
||||
replacer_args.append(js_string(vm, matched_view));
|
||||
replacer_args.extend(move(captures));
|
||||
replacer_args.append(Value(position));
|
||||
replacer_args.append(js_string(vm, string));
|
||||
replacer_args.append(js_string(vm, string_view));
|
||||
if (!named_captures.is_undefined()) {
|
||||
replacer_args.append(move(named_captures));
|
||||
}
|
||||
|
@ -758,28 +751,32 @@ JS_DEFINE_NATIVE_FUNCTION(RegExpPrototype::symbol_replace)
|
|||
return {};
|
||||
}
|
||||
|
||||
replacement = get_substitution(global_object, matched, string, position, captures, named_captures_object, replace_value);
|
||||
replacement = get_substitution(global_object, matched_view, string_view, position, captures, named_captures_object, replace_value);
|
||||
if (vm.exception())
|
||||
return {};
|
||||
}
|
||||
|
||||
if (position >= next_source_position) {
|
||||
auto substring = string_view.substring_view(next_source_position, position - next_source_position);
|
||||
|
||||
StringBuilder builder;
|
||||
builder.append(accumulated_result);
|
||||
builder.append(string.substring(next_source_position, position - next_source_position));
|
||||
builder.append(substring.to_utf8(Utf16View::AllowInvalidCodeUnits::Yes));
|
||||
builder.append(replacement);
|
||||
|
||||
accumulated_result = builder.build();
|
||||
next_source_position = position + matched.length();
|
||||
next_source_position = position + matched_view.length_in_code_units();
|
||||
}
|
||||
}
|
||||
|
||||
if (next_source_position >= string.length())
|
||||
if (next_source_position >= string_view.length_in_code_units())
|
||||
return js_string(vm, accumulated_result);
|
||||
|
||||
auto substring = string_view.substring_view(next_source_position);
|
||||
|
||||
StringBuilder builder;
|
||||
builder.append(accumulated_result);
|
||||
builder.append(string.substring(next_source_position));
|
||||
builder.append(substring.to_utf8(Utf16View::AllowInvalidCodeUnits::Yes));
|
||||
|
||||
return js_string(vm, builder.build());
|
||||
}
|
||||
|
|
|
@ -910,10 +910,10 @@ JS_DEFINE_NATIVE_FUNCTION(StringPrototype::replace)
|
|||
return {};
|
||||
}
|
||||
|
||||
auto string = this_object.to_string(global_object);
|
||||
auto string = this_object.to_utf16_string(global_object);
|
||||
if (vm.exception())
|
||||
return {};
|
||||
auto search_string = search_value.to_string(global_object);
|
||||
auto search_string = search_value.to_utf16_string(global_object);
|
||||
if (vm.exception())
|
||||
return {};
|
||||
|
||||
|
@ -926,11 +926,8 @@ JS_DEFINE_NATIVE_FUNCTION(StringPrototype::replace)
|
|||
return {};
|
||||
}
|
||||
|
||||
auto utf16_string = AK::utf8_to_utf16(string);
|
||||
Utf16View utf16_string_view { utf16_string };
|
||||
|
||||
auto utf16_search_string = AK::utf8_to_utf16(search_string);
|
||||
Utf16View utf16_search_view { utf16_search_string };
|
||||
Utf16View utf16_string_view { string };
|
||||
Utf16View utf16_search_view { search_string };
|
||||
|
||||
Optional<size_t> position = string_index_of(utf16_string_view, utf16_search_view, 0);
|
||||
if (!position.has_value())
|
||||
|
@ -948,7 +945,7 @@ JS_DEFINE_NATIVE_FUNCTION(StringPrototype::replace)
|
|||
if (vm.exception())
|
||||
return {};
|
||||
} else {
|
||||
replacement = get_substitution(global_object, search_string, string, *position, {}, js_undefined(), replace_value);
|
||||
replacement = get_substitution(global_object, utf16_search_view, utf16_string_view, *position, {}, js_undefined(), replace_value);
|
||||
if (vm.exception())
|
||||
return {};
|
||||
}
|
||||
|
@ -1004,10 +1001,10 @@ JS_DEFINE_NATIVE_FUNCTION(StringPrototype::replace_all)
|
|||
}
|
||||
}
|
||||
|
||||
auto string = this_object.to_string(global_object);
|
||||
auto string = this_object.to_utf16_string(global_object);
|
||||
if (vm.exception())
|
||||
return {};
|
||||
auto search_string = search_value.to_string(global_object);
|
||||
auto search_string = search_value.to_utf16_string(global_object);
|
||||
if (vm.exception())
|
||||
return {};
|
||||
|
||||
|
@ -1020,12 +1017,10 @@ JS_DEFINE_NATIVE_FUNCTION(StringPrototype::replace_all)
|
|||
return {};
|
||||
}
|
||||
|
||||
auto utf16_string = AK::utf8_to_utf16(string);
|
||||
Utf16View utf16_string_view { utf16_string };
|
||||
Utf16View utf16_string_view { string };
|
||||
auto string_length = utf16_string_view.length_in_code_units();
|
||||
|
||||
auto utf16_search_string = AK::utf8_to_utf16(search_string);
|
||||
Utf16View utf16_search_view { utf16_search_string };
|
||||
Utf16View utf16_search_view { search_string };
|
||||
auto search_length = utf16_search_view.length_in_code_units();
|
||||
|
||||
Vector<size_t> match_positions;
|
||||
|
@ -1053,7 +1048,7 @@ JS_DEFINE_NATIVE_FUNCTION(StringPrototype::replace_all)
|
|||
if (vm.exception())
|
||||
return {};
|
||||
} else {
|
||||
replacement = get_substitution(global_object, search_string, string, position, {}, js_undefined(), replace_value);
|
||||
replacement = get_substitution(global_object, utf16_search_view, utf16_string_view, position, {}, js_undefined(), replace_value);
|
||||
if (vm.exception())
|
||||
return {};
|
||||
}
|
||||
|
|
|
@ -238,7 +238,11 @@ test("UTF-16", () => {
|
|||
expect("😀".replace("\ud83d", "")).toBe("\ude00");
|
||||
expect("😀".replace("\ude00", "")).toBe("\ud83d");
|
||||
|
||||
// FIXME: RegExp.prototype [ @@replace ] also needs to support UTF-16.
|
||||
// expect("😀".replace(/\ud83d/, "")).toBe("\ude00");
|
||||
// expect("😀".replace(/\ude00/, "")).toBe("\ud83d");
|
||||
expect("😀".replace(/\ud83d/, "")).toBe("\ude00");
|
||||
expect("😀".replace(/\ude00/, "")).toBe("\ud83d");
|
||||
expect("😀".replace(/\ud83d\ude00/, "")).toBe("");
|
||||
|
||||
expect("😀".replace(/\ud83d/u, "")).toBe("😀");
|
||||
expect("😀".replace(/\ude00/u, "")).toBe("😀");
|
||||
expect("😀".replace(/\ud83d\ude00/u, "")).toBe("");
|
||||
});
|
||||
|
|
|
@ -151,7 +151,18 @@ test("UTF-16", () => {
|
|||
expect("😀😀😀".replaceAll("\ud83d", "")).toBe("\ude00\ude00\ude00");
|
||||
expect("😀😀😀".replaceAll("\ude00", "")).toBe("\ud83d\ud83d\ud83d");
|
||||
|
||||
// FIXME: RegExp.prototype [ @@replace ] also needs to support UTF-16.
|
||||
// expect("😀".replaceAll(/\ud83d/g, "")).toBe("\ude00");
|
||||
// expect("😀".replaceAll(/\ude00/g, "")).toBe("\ud83d");
|
||||
expect("😀".replaceAll(/\ud83d/g, "")).toBe("\ude00");
|
||||
expect("😀".replaceAll(/\ude00/g, "")).toBe("\ud83d");
|
||||
expect("😀".replaceAll(/\ud83d\ude00/g, "")).toBe("");
|
||||
expect("😀😀😀".replaceAll(/\ud83d/g, "")).toBe("\ude00\ude00\ude00");
|
||||
expect("😀😀😀".replaceAll(/\ude00/g, "")).toBe("\ud83d\ud83d\ud83d");
|
||||
expect("😀😀😀".replaceAll(/\ud83d\ude00/g, "")).toBe("");
|
||||
|
||||
expect("😀".replaceAll(/\ud83d/gu, "")).toBe("😀");
|
||||
expect("😀".replaceAll(/\ude00/gu, "")).toBe("😀");
|
||||
expect("😀".replaceAll(/\ud83d\ude00/gu, "")).toBe("");
|
||||
expect("😀😀😀".replaceAll(/\ud83d/gu, "")).toBe("😀😀😀");
|
||||
expect("😀😀😀".replaceAll(/\ude00/gu, "")).toBe("😀😀😀");
|
||||
expect("😀😀😀".replaceAll(/\ude00/gu, "")).toBe("😀😀😀");
|
||||
expect("😀😀😀".replaceAll(/\ud83d\ude00/gu, "")).toBe("");
|
||||
});
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue