LibRegex: Implement an ECMA262-compatible parser

This also adds support for lookarounds and individually-negated comparisons. The only unimplemented part of the parser spec is the unicode stuff.
2025-07-25 02:27:35 +00:00 · 2020-11-27 19:33:53 +03:30 · 2020-11-27 19:33:53 +03:30 · dbef2b1ee9
commit dbef2b1ee9
parent 3200ff5f4f
11 changed files with 1321 additions and 25 deletions
--- a/Libraries/LibRegex/RegexParser.h
+++ b/Libraries/LibRegex/RegexParser.h
@ -39,6 +39,7 @@
 namespace regex {

 class PosixExtendedParser;
+class ECMA262Parser;

 template<typename T>
 struct GenericParserTraits {
@ -53,6 +54,10 @@ template<>
 struct ParserTraits<PosixExtendedParser> : public GenericParserTraits<PosixOptions> {
 };

+template<>
+struct ParserTraits<ECMA262Parser> : public GenericParserTraits<ECMAScriptOptions> {
+};
+
 class Parser {
 public:
    struct Result {
@ -88,6 +93,7 @@ protected:
    ALWAYS_INLINE Token consume();
    ALWAYS_INLINE Token consume(TokenType type, Error error);
    ALWAYS_INLINE bool consume(const String&);
+    ALWAYS_INLINE bool try_skip(StringView);
    ALWAYS_INLINE void reset();
    ALWAYS_INLINE bool done() const;
    ALWAYS_INLINE bool set_error(Error error);
@ -102,6 +108,10 @@ protected:
        size_t named_capture_groups_count { 0 };
        size_t match_length_minimum { 0 };
        AllOptions regex_options;
+        HashMap<int, size_t> capture_group_minimum_lengths;
+        HashMap<FlyString, size_t> named_capture_group_minimum_lengths;
+        HashMap<size_t, FlyString> named_capture_groups;
+
        explicit ParserState(Lexer& lexer)
            : lexer(lexer)
            , current_token(lexer.next())
@ -144,8 +154,54 @@ private:
    ALWAYS_INLINE bool parse_repetition_symbol(ByteCode&, size_t&);
 };

+class ECMA262Parser final : public Parser {
+public:
+    explicit ECMA262Parser(Lexer& lexer)
+        : Parser(lexer)
+    {
+    }
+
+    ECMA262Parser(Lexer& lexer, Optional<typename ParserTraits<ECMA262Parser>::OptionsType> regex_options)
+        : Parser(lexer, regex_options.value_or({}))
+    {
+    }
+
+    ~ECMA262Parser() = default;
+
+private:
+    bool parse_internal(ByteCode&, size_t&) override;
+
+    enum class ReadDigitsInitialZeroState {
+        Allow,
+        Disallow,
+        Require,
+    };
+    enum class ReadDigitFollowPolicy {
+        Any,
+        DisallowDigit,
+        DisallowNonDigit,
+    };
+    Optional<unsigned> read_digits(ReadDigitsInitialZeroState initial_zero = ReadDigitsInitialZeroState::Allow, ReadDigitFollowPolicy follow_policy = ReadDigitFollowPolicy::Any, bool hex = false, int max_count = -1);
+    StringView read_capture_group_specifier(bool take_starting_angle_bracket = false);
+
+    bool parse_pattern(ByteCode&, size_t&, bool unicode, bool named);
+    bool parse_disjunction(ByteCode&, size_t&, bool unicode, bool named);
+    bool parse_alternative(ByteCode&, size_t&, bool unicode, bool named);
+    bool parse_term(ByteCode&, size_t&, bool unicode, bool named);
+    bool parse_assertion(ByteCode&, size_t&, bool unicode, bool named);
+    bool parse_atom(ByteCode&, size_t&, bool unicode, bool named);
+    bool parse_quantifier(ByteCode&, size_t&, bool unicode, bool named);
+    bool parse_atom_escape(ByteCode&, size_t&, bool unicode, bool named);
+    bool parse_character_class(ByteCode&, size_t&, bool unicode, bool named);
+    bool parse_capture_group(ByteCode&, size_t&, bool unicode, bool named);
+    Optional<CharClass> parse_character_class_escape(bool& out_inverse, bool expect_backslash = false);
+    bool parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>&, bool unicode);
+};
+
 using PosixExtended = PosixExtendedParser;
+using ECMA262 = ECMA262Parser;

 }

+using regex::ECMA262;
 using regex::PosixExtended;