1
Fork 0
mirror of https://github.com/RGBCube/serenity synced 2025-07-25 05:17:35 +00:00

LibRegex: Allow references to capture groups that aren't parsed yet

This only applies to the ECMA262 parser.
This behaviour is an ECMA262-specific quirk: such references always
generate zero-length matches (even on subsequent passes).
Also adds a test in LibJS's test suite.

Fixes #6039.
This commit is contained in:
AnotherTest 2021-04-01 18:30:47 +04:30 committed by Andreas Kling
parent 804ab79995
commit 6bbb26fdaf
6 changed files with 80 additions and 6 deletions

View file

@ -1114,12 +1114,19 @@ bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_mini
{
if (auto escape_str = read_digits_as_string(ReadDigitsInitialZeroState::Disallow, ReadDigitFollowPolicy::DisallowNonDigit); !escape_str.is_empty()) {
if (auto escape = escape_str.to_uint(); escape.has_value()) {
// See if this is a "back"-reference (we've already parsed the group it refers to)
auto maybe_length = m_parser_state.capture_group_minimum_lengths.get(escape.value());
if (maybe_length.has_value()) {
match_length_minimum += maybe_length.value();
stack.insert_bytecode_compare_values({ { CharacterCompareType::Reference, (ByteCodeValueType)escape.value() } });
return true;
}
// It's not a pattern seen before, so we have to see if it's a valid reference to a future group.
if (escape.value() <= ensure_total_number_of_capturing_parenthesis()) {
// This refers to a future group, and it will _always_ be matching an empty string
// So just match nothing and move on.
return true;
}
if (!m_should_use_browser_extended_grammar) {
set_error(Error::InvalidNumber);
return false;
@ -1729,4 +1736,47 @@ bool ECMA262Parser::parse_capture_group(ByteCode& stack, size_t& match_length_mi
return true;
}
// Returns the total number of capturing groups in the pattern, computing it on first use.
//
// The count is obtained by scanning the full pattern source with a separate lexer built from
// m_parser_state.lexer.source(), and is cached in m_total_number_of_capturing_parenthesis so
// that subsequent calls return the stored value without rescanning.
//
// Counted:     plain groups '(' and named groups '(?<name>'.
// Not counted: '(?:' non-capturing groups, '(?=' / '(?!' lookaheads, '(?<=' / '(?<!' lookbehinds.
// Escaped characters and everything inside a character class '[...]' are skipped, since a '('
// there is a literal and not a group opener.
size_t ECMA262Parser::ensure_total_number_of_capturing_parenthesis()
{
    if (m_total_number_of_capturing_parenthesis.has_value())
        return m_total_number_of_capturing_parenthesis.value();

    GenericLexer lexer { m_parser_state.lexer.source() };
    size_t count = 0;
    while (!lexer.is_eof()) {
        switch (lexer.peek()) {
        case '\\':
            // Skip the backslash together with the character it escapes, so '\(' is not counted.
            lexer.consume(2);
            break;
        case '[':
            // Skip the whole character class.
            lexer.consume();
            while (!lexer.is_eof()) {
                if (lexer.consume_specific('\\')) {
                    // An escaped character (e.g. '\]') never terminates the class.
                    // Guard against a pattern that ends right after the backslash.
                    if (!lexer.is_eof())
                        lexer.consume();
                    continue;
                }
                if (lexer.consume_specific(']'))
                    break;
                lexer.consume();
            }
            // Do not consume anything past ']' here: the next character may itself be a '('.
            break;
        case '(':
            // The '(' must be consumed before looking at what follows it; peek() alone
            // leaves the lexer on the parenthesis and the '?' check could never succeed.
            lexer.consume();
            if (lexer.consume_specific('?')) {
                // non-capturing group '(?:', lookaround '(?<='/'(?<!', or named capture '(?<'
                if (!lexer.consume_specific('<'))
                    break;
                if (lexer.next_is(is_any_of("=!")))
                    break;
                ++count;
            } else {
                ++count;
            }
            break;
        default:
            lexer.consume();
            break;
        }
    }

    m_total_number_of_capturing_parenthesis = count;
    return count;
}
}