From 7fefb8148bc65fe61562f773956d2c41b2646a8a Mon Sep 17 00:00:00 2001 From: Ali Mohammad Pur Date: Tue, 7 Sep 2021 14:33:06 +0430 Subject: [PATCH] LibRegex: Use the correct capture group index in ERE bytecode generation Otherwise the left and right capture instructions wouldn't point to the same capture group if there was another nested group there. --- Tests/LibRegex/Regex.cpp | 12 ++++++++++++ Userland/Libraries/LibRegex/RegexParser.cpp | 19 ++++++++++--------- 2 files changed, 22 insertions(+), 9 deletions(-) diff --git a/Tests/LibRegex/Regex.cpp b/Tests/LibRegex/Regex.cpp index be2b026fd0..5203a277a0 100644 --- a/Tests/LibRegex/Regex.cpp +++ b/Tests/LibRegex/Regex.cpp @@ -485,6 +485,18 @@ TEST_CASE(simple_period_end_benchmark) EXPECT_EQ(re.search("hello?", m), true); } +TEST_CASE(posix_extended_nested_capture_group) +{ + Regex<PosixExtended> re("(h(e(?<llo>llo)))"); // group 0 -> "hello", group 1 -> "ello", group 2/"llo" -> "llo" + auto result = re.match("hello"); + EXPECT(result.success); + EXPECT_EQ(result.capture_group_matches.size(), 1u); + EXPECT_EQ(result.capture_group_matches[0].size(), 3u); + EXPECT_EQ(result.capture_group_matches[0][0].view, "hello"sv); + EXPECT_EQ(result.capture_group_matches[0][1].view, "ello"sv); + EXPECT_EQ(result.capture_group_matches[0][2].view, "llo"sv); +} + TEST_CASE(ECMA262_parse) { struct _test { diff --git a/Userland/Libraries/LibRegex/RegexParser.cpp b/Userland/Libraries/LibRegex/RegexParser.cpp index 6f95c3d678..8fa9dd9b18 100644 --- a/Userland/Libraries/LibRegex/RegexParser.cpp +++ b/Userland/Libraries/LibRegex/RegexParser.cpp @@ -799,6 +799,7 @@ ALWAYS_INLINE bool PosixExtendedParser::parse_sub_expression(ByteCode& stack, si last_token = consume(); } capture_group_name = StringView(start_token.value().characters_without_null_termination(), capture_group_name_length); + ++m_parser_state.named_capture_groups_count; } else if (match(TokenType::EqualSign)) { // positive lookahead consume(); @@ -817,8 +818,11 @@ ALWAYS_INLINE bool
PosixExtendedParser::parse_sub_expression(ByteCode& stack, si } } - if (!(m_parser_state.regex_options & AllFlags::SkipSubExprResults || prevent_capture_group)) - bytecode.insert_bytecode_group_capture_left(m_parser_state.capture_groups_count); + auto current_capture_group = m_parser_state.capture_groups_count; + if (!(m_parser_state.regex_options & AllFlags::SkipSubExprResults || prevent_capture_group)) { + bytecode.insert_bytecode_group_capture_left(current_capture_group); + m_parser_state.capture_groups_count++; + } ByteCode capture_group_bytecode; @@ -846,13 +850,10 @@ ALWAYS_INLINE bool PosixExtendedParser::parse_sub_expression(ByteCode& stack, si consume(TokenType::RightParen, Error::MismatchingParen); if (!(m_parser_state.regex_options & AllFlags::SkipSubExprResults || prevent_capture_group)) { - if (capture_group_name.has_value()) { - bytecode.insert_bytecode_group_capture_right(m_parser_state.capture_groups_count, capture_group_name.value()); - ++m_parser_state.named_capture_groups_count; - } else { - bytecode.insert_bytecode_group_capture_right(m_parser_state.capture_groups_count); - } - ++m_parser_state.capture_groups_count; + if (capture_group_name.has_value()) + bytecode.insert_bytecode_group_capture_right(current_capture_group, capture_group_name.value()); + else + bytecode.insert_bytecode_group_capture_right(current_capture_group); } should_parse_repetition_symbol = true; break;