LibRegex: Implement an ECMA262-compatible parser

This also adds support for lookarounds and individually-negated comparisons. The only unimplemented part of the parser spec is the unicode stuff.
2025-10-16 15:52:27 +00:00 · 2020-11-27 19:33:53 +03:30 · 2020-11-27 19:33:53 +03:30 · dbef2b1ee9
commit dbef2b1ee9
parent 3200ff5f4f
11 changed files with 1321 additions and 25 deletions
--- a/Libraries/LibRegex/RegexParser.cpp
+++ b/Libraries/LibRegex/RegexParser.cpp
@ -28,7 +28,7 @@
 #include "RegexDebug.h"
 #include <AK/String.h>
 #include <AK/StringBuilder.h>
-#include <cstdio>
+#include <AK/StringUtils.h>

 namespace regex {

@ -88,6 +88,26 @@ ALWAYS_INLINE bool Parser::consume(const String& str)
    return true;
 }

+// Tries to consume the literal `str` from the input. The current token has to
+// be a prefix of `str`; the remaining characters are skipped one at a time via
+// the lexer. On success the current token is replaced by the next lexed token.
+// On a mismatch part-way through, the lexer is moved back by the number of
+// characters skipped here and false is returned.
+ALWAYS_INLINE bool Parser::try_skip(StringView str)
+{
+    auto current_value = m_parser_state.current_token.value();
+    if (!str.starts_with(current_value))
+        return false;
+
+    // Whatever the current token does not cover has to come from the lexer.
+    auto remainder = str.substring_view(current_value.length(), str.length() - current_value.length());
+
+    size_t skipped_count { 0 };
+    for (auto ch : remainder) {
+        if (m_parser_state.lexer.try_skip(ch)) {
+            ++skipped_count;
+            continue;
+        }
+
+        m_parser_state.lexer.back(skipped_count);
+        return false;
+    }
+
+    m_parser_state.current_token = m_parser_state.lexer.next();
+    return true;
+}
+
 ALWAYS_INLINE void Parser::reset()
 {
    m_parser_state.bytecode.clear();
@ -595,4 +615,762 @@ bool PosixExtendedParser::parse_root(ByteCode& stack, size_t& match_length_minim
    return !has_error();
 }

+// =============================
+// ECMA262 Parser
+// =============================
+
+// Entry point of the ECMA262 parser.
+// With the Unicode flag set, the pattern is parsed once with both `unicode` and
+// `named` enabled. Otherwise it is first parsed into scratch storage with
+// `named` disabled; if that pass registered any named capture group, the parser
+// is reset and the pattern is parsed again with `named` enabled.
+bool ECMA262Parser::parse_internal(ByteCode& stack, size_t& match_length_minimum)
+{
+    if (m_parser_state.regex_options & AllFlags::Unicode)
+        return parse_pattern(stack, match_length_minimum, true, true);
+
+    ByteCode scratch_bytecode;
+    size_t scratch_min_length = 0;
+    auto first_pass_ok = parse_pattern(scratch_bytecode, scratch_min_length, false, false);
+
+    // The result of the first pass is discarded entirely when a named group was seen.
+    if (m_parser_state.named_capture_groups_count > 0) {
+        reset();
+        return parse_pattern(stack, match_length_minimum, false, true);
+    }
+
+    if (!first_pass_ok)
+        return false;
+
+    stack.append(scratch_bytecode);
+    match_length_minimum = scratch_min_length;
+    return true;
+}
+
+// A Pattern is just a Disjunction; this forwards all arguments unchanged.
+bool ECMA262Parser::parse_pattern(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool named)
+{
+    auto pattern_ok = parse_disjunction(stack, match_length_minimum, unicode, named);
+    return pattern_ok;
+}
+
+// Parses `Alternative ('|' Disjunction)?`.
+// A lone alternative is appended to `stack` as-is; with a '|' the two sides are
+// combined into an alternation and the minimum match length becomes the smaller
+// of the two sides. Returns false as soon as either side fails to parse.
+bool ECMA262Parser::parse_disjunction(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool named)
+{
+    ByteCode first_branch;
+    size_t first_branch_min_length = 0;
+    if (!parse_alternative(first_branch, first_branch_min_length, unicode, named))
+        return false;
+
+    if (match(TokenType::Pipe)) {
+        consume();
+
+        // Everything after the '|' is itself a disjunction (right recursion).
+        ByteCode remaining_branches;
+        size_t remaining_min_length = 0;
+        if (!parse_disjunction(remaining_branches, remaining_min_length, unicode, named))
+            return false;
+
+        stack.insert_bytecode_alternation(move(first_branch), move(remaining_branches));
+        match_length_minimum = min(first_branch_min_length, remaining_min_length);
+        return true;
+    }
+
+    stack.append(first_branch);
+    match_length_minimum = first_branch_min_length;
+    return true;
+}
+
+// Parses terms until the input ends or a term cannot be parsed.
+// Reaching Eof is a success; a term that fails to parse ends the alternative
+// and counts as success only when no parser error has been recorded.
+bool ECMA262Parser::parse_alternative(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool named)
+{
+    while (!match(TokenType::Eof)) {
+        if (!parse_term(stack, match_length_minimum, unicode, named))
+            return !has_error();
+    }
+
+    return true;
+}
+
+// Parses one Term: either an assertion, or an atom followed by an optional
+// quantifier. The atom is built in a separate bytecode buffer so the quantifier
+// can wrap it before it is appended to `stack`.
+bool ECMA262Parser::parse_term(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool named)
+{
+    if (parse_assertion(stack, match_length_minimum, unicode, named))
+        return true;
+
+    ByteCode term_bytecode;
+    size_t term_min_length = 0;
+
+    // The quantifier is only attempted once an atom has been parsed.
+    auto term_ok = parse_atom(term_bytecode, term_min_length, unicode, named)
+        && parse_quantifier(term_bytecode, term_min_length, unicode, named);
+    if (!term_ok)
+        return false;
+
+    stack.append(move(term_bytecode));
+    match_length_minimum += term_min_length;
+    return true;
+}
+
+// Tries to parse an Assertion: '^', '$', '\b', '\B' or one of the lookaround
+// groups '(?=', '(?!', '(?<=', '(?<!'.
+// Returns true when an assertion was consumed and its bytecode was added to
+// `stack`; returns false otherwise. `match_length_minimum` is never modified
+// here.
+bool ECMA262Parser::parse_assertion(ByteCode& stack, [[maybe_unused]] size_t& match_length_minimum, bool unicode, bool named)
+{
+    if (match(TokenType::Circumflex)) {
+        consume();
+        stack.empend((ByteCodeValueType)OpCodeId::CheckBegin);
+        return true;
+    }
+
+    if (match(TokenType::Dollar)) {
+        consume();
+        stack.empend((ByteCodeValueType)OpCodeId::CheckEnd);
+        return true;
+    }
+
+    if (try_skip("\\b")) {
+        stack.insert_bytecode_check_boundary(BoundaryCheckType::Word);
+        return true;
+    }
+
+    if (try_skip("\\B")) {
+        stack.insert_bytecode_check_boundary(BoundaryCheckType::NonWord);
+        return true;
+    }
+
+    if (match(TokenType::LeftParen)) {
+        // Only groups starting with '(?' can be lookarounds.
+        if (!try_skip("(?"))
+            return false;
+
+        ByteCode assertion_stack;
+        size_t length_dummy = 0;
+
+        // Parses the body of the lookaround into `assertion_stack` and then
+        // expects the closing ')'. Note that the result of that consume() is not
+        // inspected here; a missing ')' is only recorded as a parser error.
+        auto parse_inner_disjunction = [&] {
+            auto disjunction_ok = parse_disjunction(assertion_stack, length_dummy, unicode, named);
+            if (!disjunction_ok)
+                return false;
+            consume(TokenType::RightParen, Error::MismatchingParen);
+            return true;
+        };
+
+        if (try_skip("=")) {
+            if (!parse_inner_disjunction())
+                return false;
+            stack.insert_bytecode_lookaround(move(assertion_stack), ByteCode::LookAroundType::LookAhead);
+            return true;
+        }
+        if (try_skip("!")) {
+            if (!parse_inner_disjunction())
+                return false;
+            stack.insert_bytecode_lookaround(move(assertion_stack), ByteCode::LookAroundType::NegatedLookAhead);
+            return true;
+        }
+        if (try_skip("<=")) {
+            if (!parse_inner_disjunction())
+                return false;
+            // FIXME: Somehow ensure that this assertion regexp has a fixed length.
+            // The minimum length of the body is handed over as the lookbehind length.
+            stack.insert_bytecode_lookaround(move(assertion_stack), ByteCode::LookAroundType::LookBehind, length_dummy);
+            return true;
+        }
+        if (try_skip("<!")) {
+            if (!parse_inner_disjunction())
+                return false;
+            stack.insert_bytecode_lookaround(move(assertion_stack), ByteCode::LookAroundType::NegatedLookBehind, length_dummy);
+            return true;
+        }
+
+        // If none of these matched, put the '(?' back.
+        // NOTE(review): the rewind is 3, not 2 - presumably because try_skip()
+        // already lexed the token following '(?' as the current token; confirm
+        // against the lexer (this assumes that token is one character long).
+        m_parser_state.lexer.back(3);
+        m_parser_state.current_token = m_parser_state.lexer.next();
+        return false;
+    }
+
+    return false;
+}
+
+// Consumes a run of Char tokens and converts the consumed text to a number.
+//  - initial_zero: whether the first token may / must not / must be "0".
+//  - follow_policy: DisallowDigit stops at the first digit token,
+//    DisallowNonDigit stops at the first non-digit token; any other policy
+//    does not stop on token content.
+//  - hex: classify and convert tokens as hexadecimal instead of decimal.
+//  - max_count: when positive, the maximum number of tokens to consume.
+// Returns an empty Optional when the current token is not a Char, when the
+// initial-zero requirement is violated, or when the consumed text does not
+// convert to a number.
+Optional<unsigned> ECMA262Parser::read_digits(ECMA262Parser::ReadDigitsInitialZeroState initial_zero, ECMA262Parser::ReadDigitFollowPolicy follow_policy, bool hex, int max_count)
+{
+    if (!match(TokenType::Char))
+        return {};
+
+    auto starts_with_zero = m_parser_state.current_token.value() == "0";
+    if (initial_zero == ReadDigitsInitialZeroState::Disallow && starts_with_zero)
+        return {};
+    if (initial_zero == ReadDigitsInitialZeroState::Require && !starts_with_zero)
+        return {};
+
+    size_t consumed_length = 0;
+    for (int tokens_read = 0; match(TokenType::Char); ++tokens_read) {
+        auto token_text = m_parser_state.current_token.value();
+        auto is_digit = hex
+            ? AK::StringUtils::convert_to_uint_from_hex(token_text).has_value()
+            : token_text.to_uint().has_value();
+
+        if (follow_policy == ReadDigitFollowPolicy::DisallowDigit && is_digit)
+            break;
+        if (follow_policy == ReadDigitFollowPolicy::DisallowNonDigit && !is_digit)
+            break;
+
+        if (max_count > 0 && tokens_read >= max_count)
+            break;
+
+        consumed_length += consume().value().length();
+    }
+
+    // Re-read everything consumed above as one string and convert it in one go.
+    auto digits = m_parser_state.lexer.slice_back(consumed_length);
+    if (hex)
+        return AK::StringUtils::convert_to_uint_from_hex(digits);
+
+    return digits.to_uint();
+}
+
+// Parses an optional Quantifier ('*', '+', '?', '{n}', '{n,}' or '{n,m}',
+// optionally followed by a lazy '?') and wraps the atom bytecode in `stack`
+// accordingly. `match_length_minimum` is the atom's minimum length on entry and
+// the quantified minimum length on return.
+// Returns true when there is no quantifier or it was applied; returns false
+// (with a parser error set) on malformed brace content, a missing '}', a
+// minimum larger than the maximum, or a lazy marker after a brace quantifier
+// (which this implementation does not support).
+bool ECMA262Parser::parse_quantifier(ByteCode& stack, size_t& match_length_minimum, bool, bool)
+{
+    enum class Repetition {
+        OneOrMore,
+        ZeroOrMore,
+        Optional,
+        Explicit,
+        None,
+    } repetition_mark { Repetition::None };
+
+    bool ungreedy = false;
+    Optional<size_t> repeat_min, repeat_max;
+
+    if (match(TokenType::Asterisk)) {
+        consume();
+        repetition_mark = Repetition::ZeroOrMore;
+    } else if (match(TokenType::Plus)) {
+        consume();
+        repetition_mark = Repetition::OneOrMore;
+    } else if (match(TokenType::Questionmark)) {
+        consume();
+        repetition_mark = Repetition::Optional;
+    } else if (match(TokenType::LeftCurly)) {
+        consume();
+        repetition_mark = Repetition::Explicit;
+
+        auto low_bound = read_digits();
+
+        if (!low_bound.has_value()) {
+            set_error(Error::InvalidBraceContent);
+            return false;
+        }
+
+        repeat_min = low_bound.value();
+
+        if (match(TokenType::Comma)) {
+            consume();
+            // '{n,}' has no upper bound: leave repeat_max empty.
+            // NOTE(review): this relies on insert_bytecode_repetition_min_max
+            // treating an absent maximum as "unbounded" - confirm.
+            if (!match(TokenType::RightCurly)) {
+                auto high_bound = read_digits();
+                if (!high_bound.has_value()) {
+                    set_error(Error::InvalidBraceContent);
+                    return false;
+                }
+
+                repeat_max = high_bound.value();
+            }
+        } else {
+            // '{n}' means exactly n repetitions, so the maximum equals the minimum.
+            repeat_max = repeat_min;
+        }
+
+        if (!match(TokenType::RightCurly)) {
+            set_error(Error::MismatchingBrace);
+            return false;
+        }
+        consume();
+
+        // Do not generate bytecode for an inverted range such as '{3,1}'.
+        if (repeat_max.has_value() && repeat_min.value() > repeat_max.value()) {
+            set_error(Error::InvalidBraceContent);
+            return false;
+        }
+    } else {
+        // No quantifier follows the atom; nothing to do.
+        return true;
+    }
+
+    if (match(TokenType::Questionmark)) {
+        if (repetition_mark == Repetition::Explicit) {
+            set_error(Error::InvalidRepetitionMarker);
+            return false;
+        }
+        consume();
+        ungreedy = true;
+    }
+
+    ByteCode new_bytecode;
+    switch (repetition_mark) {
+    case Repetition::OneOrMore:
+        new_bytecode.insert_bytecode_repetition_min_one(stack, !ungreedy);
+        break;
+    case Repetition::ZeroOrMore:
+        new_bytecode.insert_bytecode_repetition_any(stack, !ungreedy);
+        match_length_minimum = 0;
+        break;
+    case Repetition::Optional:
+        new_bytecode.insert_bytecode_repetition_zero_or_one(stack, !ungreedy);
+        match_length_minimum = 0;
+        break;
+    case Repetition::Explicit:
+        new_bytecode.insert_bytecode_repetition_min_max(stack, repeat_min.value(), repeat_max);
+        match_length_minimum *= repeat_min.value();
+        break;
+    case Repetition::None:
+        ASSERT_NOT_REACHED();
+    }
+
+    return true;
+}
+
+// Parses a single Atom: an escape, a character class, a group, '.', or a
+// literal character. Returns false without setting an error when the current
+// token is one that cannot start an atom (anchors, closing brackets, '|' and
+// the quantifier characters); any other unexpected token is reported as
+// InvalidPattern.
+bool ECMA262Parser::parse_atom(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool named)
+{
+    // AtomEscape.
+    if (try_skip("\\"))
+        return parse_atom_escape(stack, match_length_minimum, unicode, named);
+
+    // Character class.
+    if (match(TokenType::LeftBracket))
+        return parse_character_class(stack, match_length_minimum, unicode, named);
+
+    // Non-capturing group, or a capture group.
+    if (match(TokenType::LeftParen))
+        return parse_capture_group(stack, match_length_minimum, unicode, named);
+
+    if (match(TokenType::Period)) {
+        consume();
+        match_length_minimum += 1;
+        stack.insert_bytecode_compare_values({ { CharacterCompareType::AnyChar, 0 } });
+        return true;
+    }
+
+    auto is_anchor = match(TokenType::Circumflex) || match(TokenType::Dollar);
+    auto is_closer_or_pipe = match(TokenType::RightBracket) || match(TokenType::RightCurly)
+        || match(TokenType::RightParen) || match(TokenType::Pipe);
+    auto is_quantifier_char = match(TokenType::Plus) || match(TokenType::Asterisk) || match(TokenType::Questionmark);
+    if (is_anchor || is_closer_or_pipe || is_quantifier_char)
+        return false;
+
+    if (!match(TokenType::Char)) {
+        set_error(Error::InvalidPattern);
+        return false;
+    }
+
+    // A plain character matches itself; only the first byte of the token is used.
+    auto literal = consume().value();
+    match_length_minimum += 1;
+    stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)literal[0] } });
+    return true;
+}
+
+// Parses the part of an AtomEscape that follows the backslash (the caller has
+// already skipped the '\'). Handled, in this order: numeric backreferences,
+// control escapes (\f \n \r \t \v), control letters (\cX), '\0', hex escapes
+// (\xHH), identity escapes, named backreferences (\k<name>, only when `named`
+// is set) and finally the character class escapes (\d \D \s \S \w \W).
+// On success the matching compare is added to `stack`, `match_length_minimum`
+// is increased by the escape's minimum length, and true is returned. On
+// failure a parser error is set and false is returned.
+bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool named)
+{
+    // DecimalEscape: a backreference to an already-parsed capture group.
+    if (auto escape = read_digits(ReadDigitsInitialZeroState::Disallow, ReadDigitFollowPolicy::DisallowNonDigit); escape.has_value()) {
+        auto maybe_length = m_parser_state.capture_group_minimum_lengths.get(escape.value());
+        if (!maybe_length.has_value()) {
+            set_error(Error::InvalidNumber);
+            return false;
+        }
+        match_length_minimum += maybe_length.value();
+        stack.insert_bytecode_compare_values({ { CharacterCompareType::Reference, (ByteCodeValueType)escape.value() } });
+        return true;
+    }
+
+    // CharacterEscape > ControlEscape
+    if (try_skip("f")) {
+        match_length_minimum += 1;
+        stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'\f' } });
+        return true;
+    }
+
+    if (try_skip("n")) {
+        match_length_minimum += 1;
+        stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'\n' } });
+        return true;
+    }
+
+    if (try_skip("r")) {
+        match_length_minimum += 1;
+        stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'\r' } });
+        return true;
+    }
+
+    if (try_skip("t")) {
+        match_length_minimum += 1;
+        stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'\t' } });
+        return true;
+    }
+
+    if (try_skip("v")) {
+        match_length_minimum += 1;
+        stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'\v' } });
+        return true;
+    }
+
+    // CharacterEscape > ControlLetter
+    // NOTE(review): the range 'A'..'z' also covers the six punctuation
+    // characters between 'Z' and 'a', and when no candidate matches the 'c' has
+    // already been consumed before falling through - confirm both are intended.
+    if (try_skip("c")) {
+        for (auto c = 'A'; c <= 'z'; ++c) {
+            if (try_skip({ &c, 1 })) {
+                match_length_minimum += 1;
+                // The control character is the letter's code point modulo 32, so
+                // that e.g. both \cA and \ca yield 0x01.
+                stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)(c % 32) } });
+                return true;
+            }
+        }
+    }
+
+    // '\0'
+    if (read_digits(ReadDigitsInitialZeroState::Require, ReadDigitFollowPolicy::DisallowDigit).has_value()) {
+        match_length_minimum += 1;
+        stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)0 } });
+        return true;
+    }
+
+    // HexEscape
+    if (try_skip("x")) {
+        if (auto hex_escape = read_digits(ReadDigitsInitialZeroState::Allow, ReadDigitFollowPolicy::Any, true, 2); hex_escape.has_value()) {
+            match_length_minimum += 1;
+            stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)hex_escape.value() } });
+            return true;
+        }
+    }
+
+    if (try_skip("u")) {
+        // FIXME: Implement this path, unicode escape sequence.
+        TODO();
+    }
+
+    // IdentityEscape
+    if (match(TokenType::EscapeSequence)) {
+        match_length_minimum += 1;
+        auto token = consume().value();
+        // The escaped character is the last character of the token.
+        stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)token[token.length() - 1] } });
+        return true;
+    }
+
+    // Named backreference; the group must already have been parsed so that its
+    // minimum length is known.
+    if (named && try_skip("k")) {
+        auto name = read_capture_group_specifier(true);
+        if (name.is_empty()) {
+            set_error(Error::InvalidNameForCaptureGroup);
+            return false;
+        }
+        auto maybe_length = m_parser_state.named_capture_group_minimum_lengths.get(name);
+        if (!maybe_length.has_value()) {
+            set_error(Error::InvalidNameForCaptureGroup);
+            return false;
+        }
+        match_length_minimum += maybe_length.value();
+
+        stack.insert_bytecode_compare_named_reference(name, name.length());
+        return true;
+    }
+
+    if (unicode) {
+        if (try_skip("p{")) {
+            // FIXME: Implement this path, Unicode property match.
+            TODO();
+        }
+        if (try_skip("P{")) {
+            // FIXME: Implement this path, Unicode property match.
+            TODO();
+        }
+    }
+
+    // CharacterClassEscape; the backslash has already been consumed by the caller.
+    bool negate = false;
+    auto ch = parse_character_class_escape(negate);
+    if (!ch.has_value()) {
+        set_error(Error::InvalidCharacterClass);
+        return false;
+    }
+
+    Vector<CompareTypeAndValuePair> compares;
+    if (negate)
+        compares.empend(CharacterCompareType::Inverse, 0);
+    compares.empend(CharacterCompareType::CharClass, (ByteCodeValueType)ch.value());
+    match_length_minimum += 1;
+    stack.insert_bytecode_compare_values(move(compares));
+    return true;
+}
+
+// Parses one of the CharacterClassEscapes d, D, s, S, w, W (preceded by a
+// backslash when `expect_backslash` is set). Returns the matching CharClass and
+// sets `negate` to true for the upper-case forms; `negate` is left untouched
+// otherwise. Returns an empty Optional when nothing matched.
+Optional<CharClass> ECMA262Parser::parse_character_class_escape(bool& negate, bool expect_backslash)
+{
+    if (expect_backslash && !try_skip("\\"))
+        return {};
+
+    // CharacterClassEscape
+    if (try_skip("d"))
+        return CharClass::Digit;
+
+    if (try_skip("D")) {
+        negate = true;
+        return CharClass::Digit;
+    }
+
+    if (try_skip("s"))
+        return CharClass::Space;
+
+    if (try_skip("S")) {
+        negate = true;
+        return CharClass::Space;
+    }
+
+    if (try_skip("w"))
+        return CharClass::Word;
+
+    if (try_skip("W")) {
+        negate = true;
+        return CharClass::Word;
+    }
+
+    return {};
+}
+
+// Parses a bracketed character class starting at '['. A leading '^' adds an
+// Inverse compare in front of the class members. An immediately closed class
+// ('[]' or '[^]') returns true without adding anything to `stack`; otherwise
+// the members are parsed and added as a single compare with a minimum length
+// of one.
+bool ECMA262Parser::parse_character_class(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool)
+{
+    consume(TokenType::LeftBracket, Error::InvalidPattern);
+
+    Vector<CompareTypeAndValuePair> class_compares;
+
+    // Negated charclass
+    auto is_negated = match(TokenType::Circumflex);
+    if (is_negated) {
+        consume();
+        class_compares.empend(CharacterCompareType::Inverse, 0);
+    }
+
+    if (match(TokenType::RightBracket)) {
+        consume();
+        return true;
+    }
+
+    if (parse_nonempty_class_ranges(class_compares, unicode)) {
+        match_length_minimum += 1;
+        stack.insert_bytecode_compare_values(move(class_compares));
+        return true;
+    }
+
+    return false;
+}
+
+// One member of a bracketed character class: either a single code point or a
+// predefined character class (such as \d), possibly negated.
+struct CharClassRangeElement {
+    // Which member is meaningful is decided by `is_character_class` below.
+    union {
+        CharClass character_class;
+        u32 code_point { 0 };
+    };
+
+    // Set for the negated class escapes (\D, \S, \W).
+    bool is_negated { false };
+    // True when `character_class` is the active union member, false for `code_point`.
+    bool is_character_class { false };
+};
+
+// Parses the members of a character class up to and including the closing ']'
+// and appends one compare per member (or per 'a-b' range) to `ranges`.
+// Returns false when a member cannot be read, when a range has a character
+// class as an endpoint or has its endpoints in descending order, or when the
+// input ends before the closing ']' (reported as MismatchingBracket).
+bool ECMA262Parser::parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>& ranges, bool unicode)
+{
+    // Reads one class member that is not a bare '-'. Returns an empty Optional
+    // at ']' or '-', or when the input has run out.
+    auto read_class_atom_no_dash = [&]() -> Optional<CharClassRangeElement> {
+        if (match(TokenType::EscapeSequence)) {
+            auto token = consume().value();
+            // The escaped character follows the backslash at index 0.
+            return { { .code_point = (u32)token[1], .is_character_class = false } };
+        }
+
+        if (try_skip("\\")) {
+            if (try_skip("f"))
+                return { { .code_point = '\f', .is_character_class = false } };
+            if (try_skip("n"))
+                return { { .code_point = '\n', .is_character_class = false } };
+            if (try_skip("r"))
+                return { { .code_point = '\r', .is_character_class = false } };
+            if (try_skip("t"))
+                return { { .code_point = '\t', .is_character_class = false } };
+            if (try_skip("v"))
+                return { { .code_point = '\v', .is_character_class = false } };
+            // Inside a class, \b is a backspace rather than a word boundary.
+            if (try_skip("b"))
+                return { { .code_point = '\b', .is_character_class = false } };
+
+            // CharacterEscape > ControlLetter
+            if (try_skip("c")) {
+                for (auto c = 'A'; c <= 'z'; ++c) {
+                    // The control character is the code point modulo 32, so that
+                    // e.g. both \cA and \ca yield 0x01.
+                    if (try_skip({ &c, 1 }))
+                        return { { .code_point = (u32)(c % 32), .is_character_class = false } };
+                }
+            }
+
+            // '\0'
+            if (read_digits(ReadDigitsInitialZeroState::Require, ReadDigitFollowPolicy::DisallowDigit).has_value())
+                return { { .code_point = 0, .is_character_class = false } };
+
+            // HexEscape
+            if (try_skip("x")) {
+                if (auto hex_escape = read_digits(ReadDigitsInitialZeroState::Allow, ReadDigitFollowPolicy::Any, true, 2); hex_escape.has_value())
+                    return { { .code_point = hex_escape.value(), .is_character_class = false } };
+            }
+
+            if (try_skip("u")) {
+                // FIXME: Implement this path, unicode escape sequence.
+                TODO();
+            }
+
+            if (unicode) {
+                if (try_skip("-"))
+                    return { { .code_point = '-', .is_character_class = false } };
+            }
+
+            if (try_skip("p{") || try_skip("P{")) {
+                // FIXME: Implement these; unicode properties.
+                TODO();
+            }
+
+            if (try_skip("d"))
+                return { { .character_class = CharClass::Digit, .is_character_class = true } };
+            if (try_skip("s"))
+                return { { .character_class = CharClass::Space, .is_character_class = true } };
+            if (try_skip("w"))
+                return { { .character_class = CharClass::Word, .is_character_class = true } };
+            if (try_skip("D"))
+                return { { .character_class = CharClass::Digit, .is_negated = true, .is_character_class = true } };
+            if (try_skip("S"))
+                return { { .character_class = CharClass::Space, .is_negated = true, .is_character_class = true } };
+            if (try_skip("W"))
+                return { { .character_class = CharClass::Word, .is_negated = true, .is_character_class = true } };
+        }
+
+        if (match(TokenType::RightBracket) || match(TokenType::HyphenMinus))
+            return {};
+
+        // Out of input (e.g. '[a-'): there is no character to read, so report
+        // the unterminated class instead of indexing into the end-of-input token.
+        if (match(TokenType::Eof)) {
+            set_error(Error::MismatchingBracket);
+            return {};
+        }
+
+        auto token = consume(TokenType::Char, Error::InvalidCharacterClass);
+
+        return { { .code_point = (u32)token.value()[0], .is_character_class = false } };
+    };
+    // Like read_class_atom_no_dash, but a '-' is accepted as a literal member.
+    auto read_class_atom = [&]() -> Optional<CharClassRangeElement> {
+        if (match(TokenType::HyphenMinus)) {
+            consume();
+            return { { .code_point = '-', .is_character_class = false } };
+        }
+
+        return read_class_atom_no_dash();
+    };
+
+    while (!match(TokenType::RightBracket)) {
+        // An unterminated class such as '[abc' would otherwise never leave this
+        // loop, because a ']' can no longer show up.
+        if (match(TokenType::Eof)) {
+            set_error(Error::MismatchingBracket);
+            return false;
+        }
+
+        auto first_atom = read_class_atom();
+        if (!first_atom.has_value())
+            return false;
+
+        if (match(TokenType::HyphenMinus)) {
+            consume();
+            auto second_atom = read_class_atom();
+            if (!second_atom.has_value())
+                return false;
+
+            // A range needs two plain code points, given in ascending order.
+            if (first_atom.value().is_character_class || second_atom.value().is_character_class) {
+                set_error(Error::InvalidRange);
+                return false;
+            }
+
+            if (first_atom.value().code_point > second_atom.value().code_point) {
+                set_error(Error::InvalidRange);
+                return false;
+            }
+
+            ASSERT(!first_atom.value().is_negated);
+            ASSERT(!second_atom.value().is_negated);
+
+            ranges.empend(CharacterCompareType::CharRange, CharRange { first_atom.value().code_point, second_atom.value().code_point });
+            continue;
+        }
+
+        auto atom = first_atom.value();
+
+        if (atom.is_character_class) {
+            // A negated class escape is emitted as a TemporaryInverse followed by the class.
+            if (atom.is_negated)
+                ranges.empend(CharacterCompareType::TemporaryInverse, 0);
+            ranges.empend(CharacterCompareType::CharClass, (ByteCodeValueType)first_atom.value().character_class);
+        } else {
+            ASSERT(!atom.is_negated);
+            ranges.empend(CharacterCompareType::Char, first_atom.value().code_point);
+        }
+    }
+
+    consume(TokenType::RightBracket, Error::MismatchingBracket);
+
+    return true;
+}
+
+// Reads a capture group name terminated by '>'. When
+// `take_starting_angle_bracket` is set, a leading '<' is consumed first and an
+// empty view is returned if it is missing. The name is every Char token up to
+// (not including) the '>'. A missing '>' or an empty name is recorded as
+// InvalidNameForCaptureGroup; the (possibly empty) name is returned either way.
+StringView ECMA262Parser::read_capture_group_specifier(bool take_starting_angle_bracket)
+{
+    if (take_starting_angle_bracket && !consume("<"))
+        return {};
+
+    size_t name_length = 0;
+    while (match(TokenType::Char) && !(m_parser_state.current_token.value() == ">"))
+        name_length += consume().value().length();
+
+    auto group_name = m_parser_state.lexer.slice_back(name_length);
+
+    // The closing '>' is consumed before the name itself is checked.
+    auto has_closing_bracket = consume(">");
+    if (!has_closing_bracket || group_name.is_empty())
+        set_error(Error::InvalidNameForCaptureGroup);
+
+    return group_name;
+}
+
+// Parses a parenthesised group starting at '(': a non-capturing group '(?:...)',
+// a named capture group '(?<name>...)', or a plain numbered capture group.
+// The group's bytecode is added to `stack` and its minimum length is added to
+// `match_length_minimum`. For capturing groups the minimum length is also
+// recorded in the parser state, where backreferences look it up.
+// Returns false with a parser error set when the group body fails to parse,
+// the name is empty, or '(?' is followed by neither ':' nor '<'.
+bool ECMA262Parser::parse_capture_group(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool named)
+{
+    consume(TokenType::LeftParen, Error::InvalidPattern);
+
+    if (match(TokenType::Questionmark)) {
+        // Non-capturing group or group with specifier.
+        consume();
+
+        if (match(TokenType::Colon)) {
+            consume();
+            ByteCode noncapture_group_bytecode;
+            size_t length = 0;
+            if (!parse_disjunction(noncapture_group_bytecode, length, unicode, named))
+                return set_error(Error::InvalidPattern);
+
+            consume(TokenType::RightParen, Error::MismatchingParen);
+
+            // No capture markers: the body is spliced into the output directly.
+            stack.append(move(noncapture_group_bytecode));
+            match_length_minimum += length;
+            return true;
+        }
+
+        if (consume("<")) {
+            // Counted before the name is validated; parse_internal() checks this
+            // counter to decide whether to parse again with `named` enabled.
+            ++m_parser_state.named_capture_groups_count;
+            auto name = read_capture_group_specifier();
+
+            if (name.is_empty()) {
+                set_error(Error::InvalidNameForCaptureGroup);
+                return false;
+            }
+
+            ByteCode capture_group_bytecode;
+            size_t length = 0;
+            if (!parse_disjunction(capture_group_bytecode, length, unicode, named))
+                return set_error(Error::InvalidPattern);
+
+            consume(TokenType::RightParen, Error::MismatchingParen);
+
+            stack.insert_bytecode_group_capture_left(name);
+            stack.append(move(capture_group_bytecode));
+            stack.insert_bytecode_group_capture_right(name);
+
+            match_length_minimum += length;
+
+            // Recorded only after the body is parsed, so the length is not yet
+            // available to a \k<name> reference inside the group itself.
+            m_parser_state.named_capture_group_minimum_lengths.set(name, length);
+            return true;
+        }
+
+        set_error(Error::InvalidCaptureGroup);
+        return false;
+    }
+
+    // Plain capture group: the index is assigned (pre-increment) before the
+    // body is parsed.
+    auto group_index = ++m_parser_state.capture_groups_count;
+    stack.insert_bytecode_group_capture_left(group_index);
+
+    ByteCode capture_group_bytecode;
+    size_t length = 0;
+
+    if (!parse_disjunction(capture_group_bytecode, length, unicode, named))
+        return set_error(Error::InvalidPattern);
+
+    stack.append(move(capture_group_bytecode));
+
+    m_parser_state.capture_group_minimum_lengths.set(group_index, length);
+
+    consume(TokenType::RightParen, Error::MismatchingParen);
+
+    stack.insert_bytecode_group_capture_right(group_index);
+
+    match_length_minimum += length;
+
+    return true;
+}
 }