diff --git a/Userland/Libraries/LibJS/AST.cpp b/Userland/Libraries/LibJS/AST.cpp index 5867f5b56d..32f7f21f5f 100644 --- a/Userland/Libraries/LibJS/AST.cpp +++ b/Userland/Libraries/LibJS/AST.cpp @@ -2020,7 +2020,9 @@ void RegExpLiteral::dump(int indent) const Value RegExpLiteral::execute(Interpreter& interpreter, GlobalObject& global_object) const { InterpreterNodeScope node_scope { interpreter, *this }; - return regexp_create(global_object, js_string(interpreter.heap(), pattern()), js_string(interpreter.heap(), flags())); + + Regex regex(parsed_regex(), parsed_pattern(), parsed_flags()); + return RegExpObject::create(global_object, move(regex), pattern(), flags()); } void ArrayExpression::dump(int indent) const diff --git a/Userland/Libraries/LibJS/AST.h b/Userland/Libraries/LibJS/AST.h index 7235cba96d..1ae404b175 100644 --- a/Userland/Libraries/LibJS/AST.h +++ b/Userland/Libraries/LibJS/AST.h @@ -19,6 +19,7 @@ #include #include #include +#include namespace JS { @@ -758,8 +759,11 @@ public: class RegExpLiteral final : public Literal { public: - explicit RegExpLiteral(SourceRange source_range, String pattern, String flags) + RegExpLiteral(SourceRange source_range, regex::Parser::Result parsed_regex, String parsed_pattern, regex::RegexOptions parsed_flags, String pattern, String flags) : Literal(source_range) + , m_parsed_regex(move(parsed_regex)) + , m_parsed_pattern(move(parsed_pattern)) + , m_parsed_flags(move(parsed_flags)) , m_pattern(move(pattern)) , m_flags(move(flags)) { @@ -769,10 +773,16 @@ public: virtual void dump(int indent) const override; virtual void generate_bytecode(Bytecode::Generator&) const override; + regex::Parser::Result const& parsed_regex() const { return m_parsed_regex; } + String const& parsed_pattern() const { return m_parsed_pattern; } + regex::RegexOptions const& parsed_flags() const { return m_parsed_flags; } String const& pattern() const { return m_pattern; } String const& flags() const { return m_flags; } private: + regex::Parser::Result m_parsed_regex; + String m_parsed_pattern; + regex::RegexOptions m_parsed_flags; String m_pattern; String m_flags; }; diff --git a/Userland/Libraries/LibJS/Parser.cpp b/Userland/Libraries/LibJS/Parser.cpp index 7018375c5e..b5d9a67c56 100644 --- a/Userland/Libraries/LibJS/Parser.cpp +++ b/Userland/Libraries/LibJS/Parser.cpp @@ -12,6 +12,8 @@ #include #include #include +#include +#include namespace JS { @@ -848,21 +850,29 @@ NonnullRefPtr Parser::parse_regexp_literal() auto pattern = consume().value(); // Remove leading and trailing slash. pattern = pattern.substring_view(1, pattern.length() - 2); + auto flags = String::empty(); + auto parsed_flags = RegExpObject::default_flags; + if (match(TokenType::RegexFlags)) { auto flags_start = position(); flags = consume().value(); - HashTable seen_flags; - for (size_t i = 0; i < flags.length(); ++i) { - auto flag = flags.substring_view(i, 1); - if (!flag.is_one_of("d", "g", "i", "m", "s", "u", "y")) - syntax_error(String::formatted("Invalid RegExp flag '{}'", flag), Position { flags_start.line, flags_start.column + i }); - if (seen_flags.contains(*flag.characters_without_null_termination())) - syntax_error(String::formatted("Repeated RegExp flag '{}'", flag), Position { flags_start.line, flags_start.column + i }); - seen_flags.set(*flag.characters_without_null_termination()); - } + + auto parsed_flags_or_error = regex_flags_from_string(flags); + if (parsed_flags_or_error.is_error()) + syntax_error(parsed_flags_or_error.release_error(), flags_start); + else + parsed_flags = parsed_flags_or_error.release_value(); } - return create_ast_node({ m_state.current_token.filename(), rule_start.position(), position() }, pattern, flags); + + auto parsed_pattern = parse_regex_pattern(pattern, parsed_flags.has_flag_set(ECMAScriptFlags::Unicode)); + auto parsed_regex = Regex::parse_pattern(parsed_pattern, parsed_flags); + + if (parsed_regex.error != regex::Error::NoError) + syntax_error(String::formatted("RegExp compile error: {}", Regex(parsed_regex, parsed_pattern, parsed_flags).error_string()), rule_start.position()); + + SourceRange range { m_state.current_token.filename(), rule_start.position(), position() }; + return create_ast_node(move(range), move(parsed_regex), move(parsed_pattern), move(parsed_flags), pattern.to_string(), move(flags)); } NonnullRefPtr Parser::parse_unary_prefixed_expression() diff --git a/Userland/Libraries/LibJS/Runtime/RegExpObject.cpp b/Userland/Libraries/LibJS/Runtime/RegExpObject.cpp index 8955c82b3a..9e2832667a 100644 --- a/Userland/Libraries/LibJS/Runtime/RegExpObject.cpp +++ b/Userland/Libraries/LibJS/Runtime/RegExpObject.cpp @@ -14,97 +14,108 @@ namespace JS { -static Flags options_from(GlobalObject& global_object, const String& flags) +Result, String> regex_flags_from_string(StringView flags) { - auto& vm = global_object.vm(); bool d = false, g = false, i = false, m = false, s = false, u = false, y = false; - Flags options { - // JS regexps are all 'global' by default as per our definition, but the "global" flag enables "stateful". - // FIXME: Enable 'BrowserExtended' only if in a browser context. - .effective_flags = { (regex::ECMAScriptFlags)regex::AllFlags::Global | (regex::ECMAScriptFlags)regex::AllFlags::SkipTrimEmptyMatches | regex::ECMAScriptFlags::BrowserExtended }, - .declared_flags = {}, - }; + auto options = RegExpObject::default_flags; for (auto ch : flags) { switch (ch) { case 'd': if (d) - vm.throw_exception(global_object, ErrorType::RegExpObjectRepeatedFlag, ch); + return String::formatted(ErrorType::RegExpObjectRepeatedFlag.message(), ch); d = true; break; case 'g': if (g) - vm.throw_exception(global_object, ErrorType::RegExpObjectRepeatedFlag, ch); + return String::formatted(ErrorType::RegExpObjectRepeatedFlag.message(), ch); g = true; - options.effective_flags |= regex::ECMAScriptFlags::Global; - options.declared_flags |= regex::ECMAScriptFlags::Global; + options |= regex::ECMAScriptFlags::Global; break; case 'i': if (i) - vm.throw_exception(global_object, ErrorType::RegExpObjectRepeatedFlag, ch); + return String::formatted(ErrorType::RegExpObjectRepeatedFlag.message(), ch); i = true; - options.effective_flags |= regex::ECMAScriptFlags::Insensitive; - options.declared_flags |= regex::ECMAScriptFlags::Insensitive; + options |= regex::ECMAScriptFlags::Insensitive; break; case 'm': if (m) - vm.throw_exception(global_object, ErrorType::RegExpObjectRepeatedFlag, ch); + return String::formatted(ErrorType::RegExpObjectRepeatedFlag.message(), ch); m = true; - options.effective_flags |= regex::ECMAScriptFlags::Multiline; - options.declared_flags |= regex::ECMAScriptFlags::Multiline; + options |= regex::ECMAScriptFlags::Multiline; break; case 's': if (s) - vm.throw_exception(global_object, ErrorType::RegExpObjectRepeatedFlag, ch); + return String::formatted(ErrorType::RegExpObjectRepeatedFlag.message(), ch); s = true; - options.effective_flags |= regex::ECMAScriptFlags::SingleLine; - options.declared_flags |= regex::ECMAScriptFlags::SingleLine; + options |= regex::ECMAScriptFlags::SingleLine; break; case 'u': if (u) - vm.throw_exception(global_object, ErrorType::RegExpObjectRepeatedFlag, ch); + return String::formatted(ErrorType::RegExpObjectRepeatedFlag.message(), ch); u = true; - options.effective_flags |= regex::ECMAScriptFlags::Unicode; - options.declared_flags |= regex::ECMAScriptFlags::Unicode; + options |= regex::ECMAScriptFlags::Unicode; break; case 'y': if (y) - vm.throw_exception(global_object, ErrorType::RegExpObjectRepeatedFlag, ch); + return String::formatted(ErrorType::RegExpObjectRepeatedFlag.message(), ch); y = true; // Now for the more interesting flag, 'sticky' actually unsets 'global', part of which is the default. - options.effective_flags.reset_flag(regex::ECMAScriptFlags::Global); + options.reset_flag(regex::ECMAScriptFlags::Global); // "What's the difference between sticky and global, then", that's simple. // all the other flags imply 'global', and the "global" flag implies 'stateful'; // however, the "sticky" flag does *not* imply 'global', only 'stateful'. - options.effective_flags |= (regex::ECMAScriptFlags)regex::AllFlags::Internal_Stateful; - options.effective_flags |= regex::ECMAScriptFlags::Sticky; - options.declared_flags |= regex::ECMAScriptFlags::Sticky; + options |= (regex::ECMAScriptFlags)regex::AllFlags::Internal_Stateful; + options |= regex::ECMAScriptFlags::Sticky; break; default: - vm.throw_exception(global_object, ErrorType::RegExpObjectBadFlag, ch); - return options; + return String::formatted(ErrorType::RegExpObjectBadFlag.message(), ch); } } return options; } -RegExpObject* RegExpObject::create(GlobalObject& global_object, String original_pattern, String parsed_pattern, String flags) +String parse_regex_pattern(StringView pattern, bool unicode) { - return global_object.heap().allocate(global_object, move(original_pattern), move(parsed_pattern), move(flags), *global_object.regexp_prototype()); + auto utf16_pattern = AK::utf8_to_utf16(pattern); + Utf16View utf16_pattern_view { utf16_pattern }; + StringBuilder builder; + + // If the Unicode flag is set, append each code point to the pattern. Otherwise, append each + // code unit. But unlike the spec, multi-byte code units must be escaped for LibRegex to parse. + for (size_t i = 0; i < utf16_pattern_view.length_in_code_units();) { + if (unicode) { + auto code_point = code_point_at(utf16_pattern_view, i); + builder.append_code_point(code_point.code_point); + i += code_point.code_unit_count; + continue; + } + + u16 code_unit = utf16_pattern_view.code_unit_at(i); + ++i; + + if (code_unit > 0x7f) + builder.appendff("\\u{:04x}", code_unit); + else + builder.append_code_point(code_unit); + } + + return builder.build(); } -RegExpObject::RegExpObject(String original_pattern, String parsed_pattern, String flags, Object& prototype) - : Object(prototype) - , m_original_pattern(move(original_pattern)) - , m_parsed_pattern(move(parsed_pattern)) - , m_flags(move(flags)) - , m_active_flags(options_from(global_object(), m_flags)) - , m_regex(m_parsed_pattern, m_active_flags.effective_flags) +RegExpObject* RegExpObject::create(GlobalObject& global_object, Regex regex, String pattern, String flags) { - if (m_regex.parser_result.error != regex::Error::NoError) { - vm().throw_exception(global_object(), ErrorType::RegExpCompileError, m_regex.error_string()); - } + return global_object.heap().allocate(global_object, move(regex), move(pattern), move(flags), *global_object.regexp_prototype()); +} + +RegExpObject::RegExpObject(Regex regex, String pattern, String flags, Object& prototype) + : Object(prototype) + , m_pattern(move(pattern)) + , m_flags(move(flags)) + , m_regex(move(regex)) +{ + VERIFY(m_regex.parser_result.error == regex::Error::NoError); } RegExpObject::~RegExpObject() @@ -115,7 +126,7 @@ void RegExpObject::initialize(GlobalObject& global_object) { auto& vm = this->vm(); Object::initialize(global_object); - define_direct_property(vm.names.lastIndex, {}, Attribute::Writable); + define_direct_property(vm.names.lastIndex, Value(0), Attribute::Writable); } // 22.2.3.2.4 RegExpCreate ( P, F ), https://tc39.es/ecma262/#sec-regexpcreate @@ -139,38 +150,27 @@ RegExpObject* regexp_create(GlobalObject& global_object, Value pattern, Value fl original_pattern = String::empty(); parsed_pattern = String::empty(); } else { - auto utf16_pattern = pattern.to_utf16_string(global_object); + original_pattern = pattern.to_string(global_object); if (vm.exception()) return {}; - Utf16View utf16_pattern_view { utf16_pattern }; bool unicode = f.find('u').has_value(); - StringBuilder builder; - - // If the Unicode flag is set, append each code point to the pattern. Otherwise, append each - // code unit. But unlike the spec, multi-byte code units must be escaped for LibRegex to parse. - for (size_t i = 0; i < utf16_pattern_view.length_in_code_units();) { - if (unicode) { - auto code_point = code_point_at(utf16_pattern_view, i); - builder.append_code_point(code_point.code_point); - i += code_point.code_unit_count; - continue; - } - - u16 code_unit = utf16_pattern_view.code_unit_at(i); - ++i; - - if (code_unit > 0x7f) - builder.appendff("\\u{:04x}", code_unit); - else - builder.append_code_point(code_unit); - } - - original_pattern = utf16_pattern_view.to_utf8(Utf16View::AllowInvalidCodeUnits::Yes); - parsed_pattern = builder.build(); + parsed_pattern = parse_regex_pattern(original_pattern, unicode); } - auto* object = RegExpObject::create(global_object, move(original_pattern), move(parsed_pattern), move(f)); + auto parsed_flags_or_error = regex_flags_from_string(f); + if (parsed_flags_or_error.is_error()) { + vm.throw_exception(global_object, SyntaxError::create(global_object, parsed_flags_or_error.release_error())); + return {}; + } + + Regex regex(move(parsed_pattern), parsed_flags_or_error.release_value()); + if (regex.parser_result.error != regex::Error::NoError) { + vm.throw_exception(global_object, ErrorType::RegExpCompileError, regex.error_string()); + return {}; + } + + auto* object = RegExpObject::create(global_object, move(regex), move(original_pattern), move(f)); object->set(vm.names.lastIndex, Value(0), Object::ShouldThrowExceptions::Yes); if (vm.exception()) return {}; diff --git a/Userland/Libraries/LibJS/Runtime/RegExpObject.h b/Userland/Libraries/LibJS/Runtime/RegExpObject.h index 528a9f1129..d181d76303 100644 --- a/Userland/Libraries/LibJS/Runtime/RegExpObject.h +++ b/Userland/Libraries/LibJS/Runtime/RegExpObject.h @@ -6,40 +6,40 @@ #pragma once +#include #include #include #include -struct Flags { - regex::RegexOptions effective_flags; - regex::RegexOptions declared_flags; -}; - namespace JS { RegExpObject* regexp_create(GlobalObject&, Value pattern, Value flags); +Result, String> regex_flags_from_string(StringView flags); +String parse_regex_pattern(StringView pattern, bool unicode); + class RegExpObject : public Object { JS_OBJECT(RegExpObject, Object); public: - static RegExpObject* create(GlobalObject&, String original_pattern, String parsed_pattern, String flags); + // JS regexps are all 'global' by default as per our definition, but the "global" flag enables "stateful". + // FIXME: Enable 'BrowserExtended' only if in a browser context. + static constexpr regex::RegexOptions default_flags { (regex::ECMAScriptFlags)regex::AllFlags::Global | (regex::ECMAScriptFlags)regex::AllFlags::SkipTrimEmptyMatches | regex::ECMAScriptFlags::BrowserExtended }; - RegExpObject(String original_pattern, String parsed_pattern, String flags, Object& prototype); + static RegExpObject* create(GlobalObject&, Regex regex, String pattern, String flags); + + RegExpObject(Regex regex, String pattern, String flags, Object& prototype); virtual void initialize(GlobalObject&) override; virtual ~RegExpObject() override; - const String& pattern() const { return m_original_pattern; } + const String& pattern() const { return m_pattern; } const String& flags() const { return m_flags; } - const regex::RegexOptions& declared_options() { return m_active_flags.declared_flags; } const Regex& regex() { return m_regex; } const Regex& regex() const { return m_regex; } private: - String m_original_pattern; - String m_parsed_pattern; + String m_pattern; String m_flags; - Flags m_active_flags; Regex m_regex; }; diff --git a/Userland/Libraries/LibJS/Tests/builtins/RegExp/RegExp.js b/Userland/Libraries/LibJS/Tests/builtins/RegExp/RegExp.js index 183d79284b..ccd2509b6e 100644 --- a/Userland/Libraries/LibJS/Tests/builtins/RegExp/RegExp.js +++ b/Userland/Libraries/LibJS/Tests/builtins/RegExp/RegExp.js @@ -52,3 +52,11 @@ test("regexp object as pattern parameter", () => { expect(RegExp(regex_like_object_with_flags, "").toString()).toBe("/foo/"); expect(RegExp(regex_like_object_with_flags, "y").toString()).toBe("/foo/y"); }); + +test("regexp literals are re-useable", () => { + for (var i = 0; i < 2; ++i) { + const re = /test/; + expect(re.test("te")).toBeFalse(); + expect(re.test("test")).toBeTrue(); + } +});