diff --git a/Userland/Libraries/LibJS/Forward.h b/Userland/Libraries/LibJS/Forward.h index ddafc41776..4b4b7869d2 100644 --- a/Userland/Libraries/LibJS/Forward.h +++ b/Userland/Libraries/LibJS/Forward.h @@ -124,13 +124,14 @@ __JS_ENUMERATE(toPrimitive, to_primitive) \ __JS_ENUMERATE(toStringTag, to_string_tag) -#define JS_ENUMERATE_REGEXP_FLAGS \ - __JS_ENUMERATE(hasIndices, has_indices, d) \ - __JS_ENUMERATE(global, global, g) \ - __JS_ENUMERATE(ignoreCase, ignore_case, i) \ - __JS_ENUMERATE(multiline, multiline, m) \ - __JS_ENUMERATE(dotAll, dot_all, s) \ - __JS_ENUMERATE(unicode, unicode, u) \ +#define JS_ENUMERATE_REGEXP_FLAGS \ + __JS_ENUMERATE(hasIndices, has_indices, d) \ + __JS_ENUMERATE(global, global, g) \ + __JS_ENUMERATE(ignoreCase, ignore_case, i) \ + __JS_ENUMERATE(multiline, multiline, m) \ + __JS_ENUMERATE(dotAll, dot_all, s) \ + __JS_ENUMERATE(unicodeSets, unicode_sets, v) \ + __JS_ENUMERATE(unicode, unicode, u) \ __JS_ENUMERATE(sticky, sticky, y) namespace JS { diff --git a/Userland/Libraries/LibJS/Parser.cpp b/Userland/Libraries/LibJS/Parser.cpp index 48dc4c4c85..30ee0cc961 100644 --- a/Userland/Libraries/LibJS/Parser.cpp +++ b/Userland/Libraries/LibJS/Parser.cpp @@ -1522,7 +1522,14 @@ NonnullRefPtr Parser::parse_regexp_literal() parsed_flags = parsed_flags_or_error.release_value(); } - auto parsed_pattern = parse_regex_pattern(pattern, parsed_flags.has_flag_set(ECMAScriptFlags::Unicode)); + String parsed_pattern; + auto parsed_pattern_result = parse_regex_pattern(pattern, parsed_flags.has_flag_set(ECMAScriptFlags::Unicode), parsed_flags.has_flag_set(ECMAScriptFlags::UnicodeSets)); + if (parsed_pattern_result.is_error()) { + syntax_error(parsed_pattern_result.release_error().error, rule_start.position()); + parsed_pattern = String::empty(); + } else { + parsed_pattern = parsed_pattern_result.release_value(); + } auto parsed_regex = Regex::parse_pattern(parsed_pattern, parsed_flags); if (parsed_regex.error != regex::Error::NoError) diff --git a/Userland/Libraries/LibJS/Runtime/CommonPropertyNames.h b/Userland/Libraries/LibJS/Runtime/CommonPropertyNames.h index fa43ef2a49..6bbf4ffd07 100644 --- a/Userland/Libraries/LibJS/Runtime/CommonPropertyNames.h +++ b/Userland/Libraries/LibJS/Runtime/CommonPropertyNames.h @@ -526,11 +526,12 @@ namespace JS { P(undefined) \ P(unescape) \ P(unicode) \ + P(unicodeSets) \ P(unit) \ P(unitDisplay) \ - P(until) \ P(unregister) \ P(unshift) \ + P(until) \ P(usage) \ P(useGrouping) \ P(value) \ diff --git a/Userland/Libraries/LibJS/Runtime/ErrorTypes.h b/Userland/Libraries/LibJS/Runtime/ErrorTypes.h index 4bf8692948..af9a88bb27 100644 --- a/Userland/Libraries/LibJS/Runtime/ErrorTypes.h +++ b/Userland/Libraries/LibJS/Runtime/ErrorTypes.h @@ -209,6 +209,7 @@ M(RegExpCompileError, "RegExp compile error: {}") \ M(RegExpObjectBadFlag, "Invalid RegExp flag '{}'") \ M(RegExpObjectRepeatedFlag, "Repeated RegExp flag '{}'") \ + M(RegExpObjectIncompatibleFlags, "RegExp flag '{}' is incompatible with flag '{}'") \ M(RestrictedFunctionPropertiesAccess, "Restricted function properties like 'callee', 'caller' and 'arguments' may " \ "not be accessed in strict mode") \ M(RestrictedGlobalProperty, "Cannot declare global property '{}'") \ diff --git a/Userland/Libraries/LibJS/Runtime/RegExpObject.cpp b/Userland/Libraries/LibJS/Runtime/RegExpObject.cpp index 7a011a09c1..d6a1dc4cfa 100644 --- a/Userland/Libraries/LibJS/Runtime/RegExpObject.cpp +++ b/Userland/Libraries/LibJS/Runtime/RegExpObject.cpp @@ -16,7 +16,7 @@ namespace JS { Result, String> regex_flags_from_string(StringView flags) { - bool d = false, g = false, i = false, m = false, s = false, u = false, y = false; + bool d = false, g = false, i = false, m = false, s = false, u = false, y = false, v = false; auto options = RegExpObject::default_flags; for (auto ch : flags) { @@ -68,6 +68,12 @@ Result, String> regex_flags_from_string(Str options |= (regex::ECMAScriptFlags)regex::AllFlags::Internal_Stateful; options |= regex::ECMAScriptFlags::Sticky; break; + case 'v': + if (v) + return String::formatted(ErrorType::RegExpObjectRepeatedFlag.message(), ch); + v = true; + options |= regex::ECMAScriptFlags::UnicodeSets; + break; default: return String::formatted(ErrorType::RegExpObjectBadFlag.message(), ch); } @@ -76,8 +82,11 @@ Result, String> regex_flags_from_string(Str return options; } -String parse_regex_pattern(StringView pattern, bool unicode) +ErrorOr parse_regex_pattern(StringView pattern, bool unicode, bool unicode_sets) { + if (unicode && unicode_sets) + return ParseRegexPatternError { String::formatted(ErrorType::RegExpObjectIncompatibleFlags.message(), 'u', 'v') }; + auto utf16_pattern = AK::utf8_to_utf16(pattern); Utf16View utf16_pattern_view { utf16_pattern }; StringBuilder builder; @@ -85,7 +94,7 @@ String parse_regex_pattern(StringView pattern, bool unicode) // If the Unicode flag is set, append each code point to the pattern. Otherwise, append each // code unit. But unlike the spec, multi-byte code units must be escaped for LibRegex to parse. for (size_t i = 0; i < utf16_pattern_view.length_in_code_units();) { - if (unicode) { + if (unicode || unicode_sets) { auto code_point = code_point_at(utf16_pattern_view, i); builder.append_code_point(code_point.code_point); i += code_point.code_unit_count; @@ -104,6 +113,15 @@ String parse_regex_pattern(StringView pattern, bool unicode) return builder.build(); } +ThrowCompletionOr parse_regex_pattern(StringView pattern, VM& vm, GlobalObject& global_object, bool unicode, bool unicode_sets) +{ + auto result = parse_regex_pattern(pattern, unicode, unicode_sets); + if (result.is_error()) + return vm.throw_completion(global_object, result.release_error().error); + + return result.release_value(); +} + RegExpObject* RegExpObject::create(GlobalObject& global_object) { return global_object.heap().allocate(global_object, *global_object.regexp_prototype()); @@ -156,7 +174,8 @@ ThrowCompletionOr RegExpObject::regexp_initialize(GlobalObject& g } else { original_pattern = TRY(pattern.to_string(global_object)); bool unicode = f.find('u').has_value(); - parsed_pattern = parse_regex_pattern(original_pattern, unicode); + bool unicode_sets = f.find('v').has_value(); + parsed_pattern = TRY(parse_regex_pattern(original_pattern, vm, global_object, unicode, unicode_sets)); } auto parsed_flags_or_error = regex_flags_from_string(f); @@ -181,7 +200,7 @@ String RegExpObject::escape_regexp_pattern() const { if (m_pattern.is_empty()) return "(?:)"; - // FIXME: Check u flag and escape accordingly + // FIXME: Check the 'u' and 'v' flags and escape accordingly return m_pattern.replace("\n"sv, "\\n"sv, ReplaceMode::All).replace("\r"sv, "\\r"sv, ReplaceMode::All).replace(LINE_SEPARATOR_STRING, "\\u2028"sv, ReplaceMode::All).replace(PARAGRAPH_SEPARATOR_STRING, "\\u2029"sv, ReplaceMode::All).replace("/"sv, "\\/"sv, ReplaceMode::All); } diff --git a/Userland/Libraries/LibJS/Runtime/RegExpObject.h b/Userland/Libraries/LibJS/Runtime/RegExpObject.h index 7ae8f74f4d..3d4eb7a99e 100644 --- a/Userland/Libraries/LibJS/Runtime/RegExpObject.h +++ b/Userland/Libraries/LibJS/Runtime/RegExpObject.h @@ -17,7 +17,11 @@ namespace JS { ThrowCompletionOr regexp_create(GlobalObject&, Value pattern, Value flags); Result, String> regex_flags_from_string(StringView flags); -String parse_regex_pattern(StringView pattern, bool unicode); +struct ParseRegexPatternError { + String error; +}; +ErrorOr parse_regex_pattern(StringView pattern, bool unicode, bool unicode_sets); +ThrowCompletionOr parse_regex_pattern(StringView pattern, VM& vm, GlobalObject& global_object, bool unicode, bool unicode_sets); class RegExpObject : public Object { JS_OBJECT(RegExpObject, Object); diff --git a/Userland/Libraries/LibJS/Runtime/RegExpPrototype.cpp b/Userland/Libraries/LibJS/Runtime/RegExpPrototype.cpp index 608e318cdd..534bcea5cf 100644 --- a/Userland/Libraries/LibJS/Runtime/RegExpPrototype.cpp +++ b/Userland/Libraries/LibJS/Runtime/RegExpPrototype.cpp @@ -410,6 +410,7 @@ size_t advance_string_index(Utf16View const& string, size_t index, bool unicode) // 22.2.5.10 get RegExp.prototype.multiline, https://tc39.es/ecma262/#sec-get-regexp.prototype.multiline // 22.2.5.15 get RegExp.prototype.sticky, https://tc39.es/ecma262/#sec-get-regexp.prototype.sticky // 22.2.5.18 get RegExp.prototype.unicode, https://tc39.es/ecma262/#sec-get-regexp.prototype.unicode +// 22.2.5.18 get RegExp.prototype.unicodeSets, https://arai-a.github.io/ecma262-compare/?pr=2418&id=sec-get-regexp.prototype.unicodeSets #define __JS_ENUMERATE(flagName, flag_name, flag_char) \ JS_DEFINE_NATIVE_FUNCTION(RegExpPrototype::flag_name) \ { \ @@ -467,10 +468,12 @@ JS_DEFINE_NATIVE_FUNCTION(RegExpPrototype::flags) // 11. If multiline is true, append the code unit 0x006D (LATIN SMALL LETTER M) as the last code unit of result. // 12. Let dotAll be ToBoolean(? Get(R, "dotAll")). // 13. If dotAll is true, append the code unit 0x0073 (LATIN SMALL LETTER S) as the last code unit of result. - // 14. Let unicode be ToBoolean(? Get(R, "unicode")). - // 15. If unicode is true, append the code unit 0x0075 (LATIN SMALL LETTER U) as the last code unit of result. - // 16. Let sticky be ToBoolean(? Get(R, "sticky")). - // 17. If sticky is true, append the code unit 0x0079 (LATIN SMALL LETTER Y) as the last code unit of result. + // 14. Let unicodeSets be ! ToBoolean(? Get(R, "unicodeSets")). + // 15. If unicodeSets is true, append the code unit 0x0076 (LATIN SMALL LETTER V) as the last code unit of result. + // 16. Let unicode be ToBoolean(? Get(R, "unicode")). + // 17. If unicode is true, append the code unit 0x0075 (LATIN SMALL LETTER U) as the last code unit of result. + // 18. Let sticky be ToBoolean(? Get(R, "sticky")). + // 19. If sticky is true, append the code unit 0x0079 (LATIN SMALL LETTER Y) as the last code unit of result. #define __JS_ENUMERATE(flagName, flag_name, flag_char) \ auto flag_##flag_name = TRY(regexp_object->get(vm.names.flagName)); \ if (flag_##flag_name.to_boolean()) \ @@ -483,6 +486,7 @@ JS_DEFINE_NATIVE_FUNCTION(RegExpPrototype::flags) } // 22.2.5.8 RegExp.prototype [ @@match ] ( string ), https://tc39.es/ecma262/#sec-regexp.prototype-@@match +// With changes from https://arai-a.github.io/ecma262-compare/?pr=2418&id=sec-regexp.prototype-%2540%2540match JS_DEFINE_NATIVE_FUNCTION(RegExpPrototype::symbol_match) { // 1. Let rx be the this value. @@ -504,19 +508,23 @@ JS_DEFINE_NATIVE_FUNCTION(RegExpPrototype::symbol_match) // 6. Else, // a. Assert: global is true. - // b. Let fullUnicode be ToBoolean(? Get(rx, "unicode")). - bool full_unicode = TRY(regexp_object->get(vm.names.unicode)).to_boolean(); + // b. Let fullUnicode be ToBoolean(? Get(rx, "unicodeSets")). + bool full_unicode = TRY(regexp_object->get(vm.names.unicodeSets)).to_boolean(); - // c. Perform ? Set(rx, "lastIndex", +0𝔽, true). + // c. If fullUnicode is false, set fullUnicode to ! ToBoolean(? Get(rx, "unicode")). + if (!full_unicode) + full_unicode = TRY(regexp_object->get(vm.names.unicode)).to_boolean(); + + // d. Perform ? Set(rx, "lastIndex", +0𝔽, true). TRY(regexp_object->set(vm.names.lastIndex, Value(0), Object::ShouldThrowExceptions::Yes)); - // d. Let A be ! ArrayCreate(0). + // e. Let A be ! ArrayCreate(0). auto* array = MUST(Array::create(global_object, 0)); - // e. Let n be 0. + // f. Let n be 0. size_t n = 0; - // f. Repeat, + // g. Repeat, while (true) { // i. Let result be ? RegExpExec(rx, S). auto result = TRY(regexp_exec(global_object, *regexp_object, string)); @@ -552,6 +560,7 @@ JS_DEFINE_NATIVE_FUNCTION(RegExpPrototype::symbol_match) } // 22.2.5.9 RegExp.prototype [ @@matchAll ] ( string ), https://tc39.es/ecma262/#sec-regexp-prototype-matchall +// With changes from https://arai-a.github.io/ecma262-compare/?pr=2418&id=sec-regexp-prototype-matchall JS_DEFINE_NATIVE_FUNCTION(RegExpPrototype::symbol_match_all) { // 1. Let R be the this value. @@ -576,7 +585,7 @@ JS_DEFINE_NATIVE_FUNCTION(RegExpPrototype::symbol_match_all) // 11. If flags contains "u", let fullUnicode be true. // 12. Else, let fullUnicode be false. - bool full_unicode = flags.contains('u'); + bool full_unicode = flags.contains('u') || flags.contains('v'); // 6. Let matcher be ? Construct(C, « R, flags »). auto* matcher = TRY(construct(global_object, *constructor, regexp_object, js_string(vm, move(flags)))); @@ -593,6 +602,7 @@ JS_DEFINE_NATIVE_FUNCTION(RegExpPrototype::symbol_match_all) } // 22.2.5.11 RegExp.prototype [ @@replace ] ( string, replaceValue ), https://tc39.es/ecma262/#sec-regexp.prototype-@@replace +// With changes from https://arai-a.github.io/ecma262-compare/?pr=2418&id=sec-regexp.prototype-@@replace JS_DEFINE_NATIVE_FUNCTION(RegExpPrototype::symbol_replace) { auto string_value = vm.argument(0); @@ -621,10 +631,14 @@ JS_DEFINE_NATIVE_FUNCTION(RegExpPrototype::symbol_replace) // 8. If global is true, then if (global) { - // a. Let fullUnicode be ToBoolean(? Get(rx, "unicode")). - full_unicode = TRY(regexp_object->get(vm.names.unicode)).to_boolean(); + // a. Let fullUnicode be ToBoolean(? Get(rx, "unicodeSets")). + full_unicode = TRY(regexp_object->get(vm.names.unicodeSets)).to_boolean(); - // b. Perform ? Set(rx, "lastIndex", +0𝔽, true). + // b. If fullUnicode is false, set fullUnicode to ! ToBoolean(? Get(rx, "unicode")). + if (!full_unicode) + full_unicode = TRY(regexp_object->get(vm.names.unicode)).to_boolean(); + + // c. Perform ? Set(rx, "lastIndex", +0𝔽, true). TRY(regexp_object->set(vm.names.lastIndex, Value(0), Object::ShouldThrowExceptions::Yes)); } @@ -863,7 +877,7 @@ JS_DEFINE_NATIVE_FUNCTION(RegExpPrototype::symbol_split) // 6. If flags contains "u", let unicodeMatching be true. // 7. Else, let unicodeMatching be false. - bool unicode_matching = flags.find('u').has_value(); + bool unicode_matching = flags.contains('u') || flags.contains('v'); // 8. If flags contains "y", let newFlags be flags. // 9. Else, let newFlags be the string-concatenation of flags and "y". diff --git a/Userland/Libraries/LibJS/Tests/builtins/RegExp/RegExp.prototype.flags.js b/Userland/Libraries/LibJS/Tests/builtins/RegExp/RegExp.prototype.flags.js index 6d548ee935..aea62d8d0c 100644 --- a/Userland/Libraries/LibJS/Tests/builtins/RegExp/RegExp.prototype.flags.js +++ b/Userland/Libraries/LibJS/Tests/builtins/RegExp/RegExp.prototype.flags.js @@ -5,8 +5,11 @@ test("basic functionality", () => { expect(/foo/i.flags).toBe("i"); expect(/foo/m.flags).toBe("m"); expect(/foo/s.flags).toBe("s"); + expect(/foo/v.flags).toBe("v"); expect(/foo/u.flags).toBe("u"); expect(/foo/y.flags).toBe("y"); // prettier-ignore expect(/foo/dsgimyu.flags).toBe("dgimsuy"); + // prettier-ignore + expect(/foo/dgimsvy.flags).toBe("dgimsvy"); });