LibRegex: Disallow invalid interval quantifiers in Unicode mode

Fixes all remaining 'built-ins/RegExp/property-escapes' test262 tests.
2025-10-31 19:42:43 +00:00 · 2021-08-10 16:35:45 -04:00 · 2021-08-10 16:35:45 -04:00 · df14d11a11
commit df14d11a11
parent a98d3a1a85
3 changed files with 63 additions and 45 deletions
--- a/Tests/LibRegex/Regex.cpp
+++ b/Tests/LibRegex/Regex.cpp
@ -522,6 +522,9 @@ TEST_CASE(ECMA262_parse)
        { "\\p{hello friends}", regex::Error::InvalidNameForProperty, ECMAScriptFlags::Unicode },
        { "\\p{Prepended_Concatenation_Mark}", regex::Error::InvalidNameForProperty, ECMAScriptFlags::Unicode },
        { "\\p{ASCII}", regex::Error::NoError, ECMAScriptFlags::Unicode },
+        { "\\\\p{1}", regex::Error::NoError, ECMAScriptFlags::Unicode },
+        { "\\\\p{AsCiI}", regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
+        { "\\\\p{ASCII}", regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
    };

    for (auto& test : tests) {
--- a/Userland/Libraries/LibRegex/RegexParser.cpp
+++ b/Userland/Libraries/LibRegex/RegexParser.cpp
@ -1121,7 +1121,7 @@ Optional<unsigned> ECMA262Parser::read_digits(ECMA262Parser::ReadDigitsInitialZe
    return str.to_uint();
 }

-bool ECMA262Parser::parse_quantifier(ByteCode& stack, size_t& match_length_minimum, bool, bool)
+bool ECMA262Parser::parse_quantifier(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool)
 {
    enum class Repetition {
        OneOrMore,
@ -1144,52 +1144,13 @@ bool ECMA262Parser::parse_quantifier(ByteCode& stack, size_t& match_length_minim
        consume();
        repetition_mark = Repetition::Optional;
    } else if (match(TokenType::LeftCurly)) {
-        consume();
-        auto chars_consumed = 1;
        repetition_mark = Repetition::Explicit;
-
-        auto low_bound_string = read_digits_as_string();
-        chars_consumed += low_bound_string.length();
-
-        auto low_bound = low_bound_string.to_uint();
-
-        if (!low_bound.has_value()) {
-            if (!m_should_use_browser_extended_grammar && done())
-                return set_error(Error::MismatchingBrace);
-
-            back(chars_consumed + !done());
-            return true;
-        }
-
-        repeat_min = low_bound.value();
-
-        if (match(TokenType::Comma)) {
-            consume();
-            ++chars_consumed;
-            auto high_bound_string = read_digits_as_string();
-            auto high_bound = high_bound_string.to_uint();
-            if (high_bound.has_value()) {
-                repeat_max = high_bound.value();
-                chars_consumed += high_bound_string.length();
+        if (!parse_interval_quantifier(repeat_min, repeat_max)) {
+            if (unicode) {
+                // Invalid interval quantifiers are disallowed in Unicode mode - they must be escaped with '\{'.
+                set_error(Error::InvalidPattern);
            }
-        } else {
-            repeat_max = repeat_min;
-        }
-
-        if (!match(TokenType::RightCurly)) {
-            if (!m_should_use_browser_extended_grammar && done())
-                return set_error(Error::MismatchingBrace);
-
-            back(chars_consumed + !done());
-            return true;
-        }
-
-        consume();
-        ++chars_consumed;
-
-        if (repeat_max.has_value()) {
-            if (repeat_min.value() > repeat_max.value())
-                set_error(Error::InvalidBraceContent);
+            return !has_error();
        }
    } else {
        return true;
@ -1223,6 +1184,59 @@ bool ECMA262Parser::parse_quantifier(ByteCode& stack, size_t& match_length_minim
    return true;
 }

+bool ECMA262Parser::parse_interval_quantifier(Optional<size_t>& repeat_min, Optional<size_t>& repeat_max)
+{ // Parses a braced interval ('{min}', '{min,}' or '{min,max}') into repeat_min / repeat_max; returns true only after the closing RightCurly was consumed.
+    VERIFY(match(TokenType::LeftCurly)); // Must only be called while positioned on the LeftCurly token.
+    consume();
+    auto chars_consumed = 1; // Running count handed to back() when the interval turns out to be malformed.
+
+    auto low_bound_string = read_digits_as_string();
+    chars_consumed += low_bound_string.length();
+
+    auto low_bound = low_bound_string.to_uint();
+
+    if (!low_bound.has_value()) { // No usable lower bound after the LeftCurly.
+        if (!m_should_use_browser_extended_grammar && done()) // Input ended inside the braces and the browser-extended grammar is off.
+            return set_error(Error::MismatchingBrace);
+
+        back(chars_consumed + !done()); // Rewind what was consumed, plus one extra step unless the input is exhausted.
+        return false; // Not an interval; no error is set on this path.
+    }
+
+    repeat_min = low_bound.value();
+
+    if (match(TokenType::Comma)) {
+        consume();
+        ++chars_consumed;
+        auto high_bound_string = read_digits_as_string();
+        auto high_bound = high_bound_string.to_uint();
+        if (high_bound.has_value()) { // NOTE(review): when to_uint() yields no value, repeat_max is left untouched and the digit length is not added to chars_consumed - confirm the rewind below stays exact for non-empty digit strings.
+            repeat_max = high_bound.value();
+            chars_consumed += high_bound_string.length();
+        }
+    } else {
+        repeat_max = repeat_min; // No comma: the upper bound equals the lower bound.
+    }
+
+    if (!match(TokenType::RightCurly)) { // Closing brace is missing.
+        if (!m_should_use_browser_extended_grammar && done())
+            return set_error(Error::MismatchingBrace);
+
+        back(chars_consumed + !done()); // Same rewind as for a missing lower bound; repeat_min (and possibly repeat_max) have already been written.
+        return false;
+    }
+
+    consume();
+    ++chars_consumed;
+
+    if (repeat_max.has_value()) {
+        if (repeat_min.value() > repeat_max.value()) // Reversed bounds: the error is set, but this function still returns true below.
+            set_error(Error::InvalidBraceContent);
+    }
+
+    return true;
+}
+
 bool ECMA262Parser::parse_atom(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool named)
 {
    if (match(TokenType::EscapeSequence)) {
--- a/Userland/Libraries/LibRegex/RegexParser.h
+++ b/Userland/Libraries/LibRegex/RegexParser.h
@ -228,6 +228,7 @@ private:
    bool parse_assertion(ByteCode&, size_t&, bool unicode, bool named);
    bool parse_atom(ByteCode&, size_t&, bool unicode, bool named);
    bool parse_quantifier(ByteCode&, size_t&, bool unicode, bool named);
+    bool parse_interval_quantifier(Optional<size_t>& repeat_min, Optional<size_t>& repeat_max);
    bool parse_atom_escape(ByteCode&, size_t&, bool unicode, bool named);
    bool parse_character_class(ByteCode&, size_t&, bool unicode, bool named);
    bool parse_capture_group(ByteCode&, size_t&, bool unicode, bool named);