LibRegex: Allow references to capture groups that aren't parsed yet

This only applies to the ECMA262 parser. This behaviour is an ECMA262-specific quirk: such references always generate zero-length matches (even on subsequent passes). Also adds a test in LibJS's test suite. Fixes #6039.
2025-07-25 20:17:44 +00:00 · 2021-04-01 18:30:47 +04:30 · 2021-04-01 18:30:47 +04:30 · 6bbb26fdaf
commit 6bbb26fdaf
parent 804ab79995
6 changed files with 80 additions and 6 deletions
--- a/Userland/Libraries/LibRegex/RegexParser.cpp
+++ b/Userland/Libraries/LibRegex/RegexParser.cpp
@ -1114,12 +1114,19 @@ bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_mini
 {
    if (auto escape_str = read_digits_as_string(ReadDigitsInitialZeroState::Disallow, ReadDigitFollowPolicy::DisallowNonDigit); !escape_str.is_empty()) {
        if (auto escape = escape_str.to_uint(); escape.has_value()) {
+            // See if this is a "back"-reference (we've already parsed the group it refers to)
            auto maybe_length = m_parser_state.capture_group_minimum_lengths.get(escape.value());
            if (maybe_length.has_value()) {
                match_length_minimum += maybe_length.value();
                stack.insert_bytecode_compare_values({ { CharacterCompareType::Reference, (ByteCodeValueType)escape.value() } });
                return true;
            }
+            // It's not a pattern seen before, so we have to see if it's a valid reference to a future group.
+            if (escape.value() <= ensure_total_number_of_capturing_parenthesis()) {
+                // This refers to a future group, and it will _always_ be matching an empty string
+                // So just match nothing and move on.
+                return true;
+            }
            if (!m_should_use_browser_extended_grammar) {
                set_error(Error::InvalidNumber);
                return false;
@ -1729,4 +1736,47 @@ bool ECMA262Parser::parse_capture_group(ByteCode& stack, size_t& match_length_mi

    return true;
 }
+
+// Returns the number of capturing groups (plain '(' and named '(?<name>') found in the
+// pattern source. The source is scanned once; the result is cached in
+// m_total_number_of_capturing_parenthesis and returned directly on later calls.
+//
+// Every switch arm below consumes exactly the characters it recognises, so nothing
+// after the switch advances the lexer a second time.
+size_t ECMA262Parser::ensure_total_number_of_capturing_parenthesis()
+{
+    if (m_total_number_of_capturing_parenthesis.has_value())
+        return m_total_number_of_capturing_parenthesis.value();
+
+    GenericLexer lexer { m_parser_state.lexer.source() };
+    size_t count = 0;
+    while (!lexer.is_eof()) {
+        switch (lexer.peek()) {
+        case '\\':
+            // Skip the backslash together with the character it escapes, so that an
+            // escaped '(' or '[' is not mistaken for a group or a class.
+            lexer.consume(2);
+            break;
+        case '[':
+            // Skip the whole character class: a '(' inside it is a literal.
+            lexer.consume();
+            while (!lexer.is_eof()) {
+                if (lexer.peek() == '\\') {
+                    // Escaped class member (e.g. '\]'): skip both characters and nothing more.
+                    lexer.consume(2);
+                    continue;
+                }
+                if (lexer.consume() == ']')
+                    break;
+            }
+            break;
+        case '(':
+            // The '(' itself has to be consumed before the '?' that may follow it can be seen.
+            lexer.consume();
+            if (!lexer.consume_specific('?')) {
+                // Plain capturing group.
+                ++count;
+                break;
+            }
+
+            // non-capturing group '(?:', lookahead '(?='/'(?!': anything but '(?<' does not capture
+            if (!lexer.consume_specific('<'))
+                break;
+
+            // lookbehind '(?<='/'(?<!' does not capture either
+            if (lexer.next_is(is_any_of("=!")))
+                break;
+
+            // named capture '(?<name>'
+            ++count;
+            break;
+        default:
+            lexer.consume();
+            break;
+        }
+    }
+
+    m_total_number_of_capturing_parenthesis = count;
+    return count;
+}
 }