mirror of
https://github.com/RGBCube/serenity
synced 2025-07-24 12:07:44 +00:00
LibRegex: Implement an ECMA262 Regex quirk with negative lookarounds
This implements the quirk defined by "Note 3" in section "Canonicalize" (https://tc39.es/ecma262/#sec-runtime-semantics-canonicalize-ch). Crosses off another quirk from #6042.
This commit is contained in:
parent
bfe8f312f3
commit
c11be92e23
4 changed files with 54 additions and 16 deletions
|
@ -682,6 +682,7 @@ TEST_CASE(ECMA262_match)
|
||||||
{ "[\\0]"sv, "\0"sv, true, combine_flags(ECMAScriptFlags::Unicode, ECMAScriptFlags::BrowserExtended) },
|
{ "[\\0]"sv, "\0"sv, true, combine_flags(ECMAScriptFlags::Unicode, ECMAScriptFlags::BrowserExtended) },
|
||||||
{ "[\\01]"sv, "\1"sv, true, ECMAScriptFlags::BrowserExtended },
|
{ "[\\01]"sv, "\1"sv, true, ECMAScriptFlags::BrowserExtended },
|
||||||
{ "(\0|a)"sv, "a"sv, true }, // #9686, Should allow null bytes in pattern
|
{ "(\0|a)"sv, "a"sv, true }, // #9686, Should allow null bytes in pattern
|
||||||
|
{ "(.*?)a(?!(a+)b\\2c)\\2(.*)"sv, "baaabaac"sv, true }, // #6042, Groups inside lookarounds may be referenced outside, but their contents appear empty if the pattern in the lookaround fails.
|
||||||
};
|
};
|
||||||
// clang-format on
|
// clang-format on
|
||||||
|
|
||||||
|
|
|
@ -486,7 +486,14 @@ Optional<bool> Matcher<Parser>::execute(MatchInput const& input, MatchState& sta
|
||||||
return true;
|
return true;
|
||||||
case ExecutionResult::Failed:
|
case ExecutionResult::Failed:
|
||||||
if (!states_to_try_next.is_empty()) {
|
if (!states_to_try_next.is_empty()) {
|
||||||
state = states_to_try_next.take_last();
|
auto next_state = states_to_try_next.take_last();
|
||||||
|
// Note: ECMA262 quirk: Note 3, https://tc39.es/ecma262/#sec-runtime-semantics-canonicalize-ch
|
||||||
|
// capture groups defined in lookarounds "leak" outside the regex,
|
||||||
|
// but their contents are empty if the lookaround fails.
|
||||||
|
// This is done by manually clearing the groups where needed, and leaking their contents here.
|
||||||
|
if constexpr (IsSame<Parser, ECMA262>)
|
||||||
|
swap(next_state.capture_group_matches, state.capture_group_matches);
|
||||||
|
state = move(next_state);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
|
@ -496,7 +503,11 @@ Optional<bool> Matcher<Parser>::execute(MatchInput const& input, MatchState& sta
|
||||||
return {};
|
return {};
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
state = states_to_try_next.take_last();
|
auto next_state = states_to_try_next.take_last();
|
||||||
|
// See note above about an ECMA262 quirk.
|
||||||
|
if constexpr (IsSame<Parser, ECMA262>)
|
||||||
|
swap(next_state.capture_group_matches, state.capture_group_matches);
|
||||||
|
state = move(next_state);
|
||||||
++recursion_level;
|
++recursion_level;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
|
@ -10,6 +10,7 @@
|
||||||
#include <AK/AnyOf.h>
|
#include <AK/AnyOf.h>
|
||||||
#include <AK/CharacterTypes.h>
|
#include <AK/CharacterTypes.h>
|
||||||
#include <AK/GenericLexer.h>
|
#include <AK/GenericLexer.h>
|
||||||
|
#include <AK/ScopeGuard.h>
|
||||||
#include <AK/String.h>
|
#include <AK/String.h>
|
||||||
#include <AK/StringBuilder.h>
|
#include <AK/StringBuilder.h>
|
||||||
#include <AK/StringUtils.h>
|
#include <AK/StringUtils.h>
|
||||||
|
@ -1065,9 +1066,17 @@ bool ECMA262Parser::parse_assertion(ByteCode& stack, [[maybe_unused]] size_t& ma
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
if (should_parse_forward_assertion && try_skip("!")) {
|
if (should_parse_forward_assertion && try_skip("!")) {
|
||||||
|
enter_capture_group_scope();
|
||||||
|
ScopeGuard quit_scope {
|
||||||
|
[this] {
|
||||||
|
exit_capture_group_scope();
|
||||||
|
}
|
||||||
|
};
|
||||||
if (!parse_inner_disjunction(assertion_stack, length_dummy, unicode, named))
|
if (!parse_inner_disjunction(assertion_stack, length_dummy, unicode, named))
|
||||||
return false;
|
return false;
|
||||||
stack.insert_bytecode_lookaround(move(assertion_stack), ByteCode::LookAroundType::NegatedLookAhead);
|
stack.insert_bytecode_lookaround(move(assertion_stack), ByteCode::LookAroundType::NegatedLookAhead);
|
||||||
|
clear_all_capture_groups_in_scope(stack);
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
if (m_should_use_browser_extended_grammar) {
|
if (m_should_use_browser_extended_grammar) {
|
||||||
|
@ -1086,9 +1095,16 @@ bool ECMA262Parser::parse_assertion(ByteCode& stack, [[maybe_unused]] size_t& ma
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
if (try_skip("<!")) {
|
if (try_skip("<!")) {
|
||||||
|
enter_capture_group_scope();
|
||||||
|
ScopeGuard quit_scope {
|
||||||
|
[this] {
|
||||||
|
exit_capture_group_scope();
|
||||||
|
}
|
||||||
|
};
|
||||||
if (!parse_inner_disjunction(assertion_stack, length_dummy, unicode, named))
|
if (!parse_inner_disjunction(assertion_stack, length_dummy, unicode, named))
|
||||||
return false;
|
return false;
|
||||||
stack.insert_bytecode_lookaround(move(assertion_stack), ByteCode::LookAroundType::NegatedLookBehind, length_dummy);
|
stack.insert_bytecode_lookaround(move(assertion_stack), ByteCode::LookAroundType::NegatedLookBehind, length_dummy);
|
||||||
|
clear_all_capture_groups_in_scope(stack);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1124,10 +1140,17 @@ bool ECMA262Parser::parse_quantifiable_assertion(ByteCode& stack, size_t&, bool
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
if (try_skip("!")) {
|
if (try_skip("!")) {
|
||||||
|
enter_capture_group_scope();
|
||||||
|
ScopeGuard quit_scope {
|
||||||
|
[this] {
|
||||||
|
exit_capture_group_scope();
|
||||||
|
}
|
||||||
|
};
|
||||||
if (!parse_inner_disjunction(assertion_stack, match_length_minimum, false, named))
|
if (!parse_inner_disjunction(assertion_stack, match_length_minimum, false, named))
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
stack.insert_bytecode_lookaround(move(assertion_stack), ByteCode::LookAroundType::NegatedLookAhead);
|
stack.insert_bytecode_lookaround(move(assertion_stack), ByteCode::LookAroundType::NegatedLookAhead);
|
||||||
|
clear_all_capture_groups_in_scope(stack);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -2189,20 +2212,9 @@ bool ECMA262Parser::parse_capture_group(ByteCode& stack, size_t& match_length_mi
|
||||||
{
|
{
|
||||||
consume(TokenType::LeftParen, Error::InvalidPattern);
|
consume(TokenType::LeftParen, Error::InvalidPattern);
|
||||||
|
|
||||||
auto enter_capture_group_scope = [&] {
|
|
||||||
m_capture_groups_in_scope.empend();
|
|
||||||
};
|
|
||||||
auto exit_capture_group_scope = [&] {
|
|
||||||
auto last = m_capture_groups_in_scope.take_last();
|
|
||||||
m_capture_groups_in_scope.last().extend(move(last));
|
|
||||||
};
|
|
||||||
auto register_capture_group_in_current_scope = [&](auto identifier) {
|
auto register_capture_group_in_current_scope = [&](auto identifier) {
|
||||||
m_capture_groups_in_scope.last().empend(identifier);
|
m_capture_groups_in_scope.last().empend(identifier);
|
||||||
};
|
};
|
||||||
auto clear_all_capture_groups_in_scope = [&] {
|
|
||||||
for (auto& index : m_capture_groups_in_scope.last())
|
|
||||||
stack.insert_bytecode_clear_capture_group(index);
|
|
||||||
};
|
|
||||||
|
|
||||||
if (match(TokenType::Questionmark)) {
|
if (match(TokenType::Questionmark)) {
|
||||||
// Non-capturing group or group with specifier.
|
// Non-capturing group or group with specifier.
|
||||||
|
@ -2216,7 +2228,7 @@ bool ECMA262Parser::parse_capture_group(ByteCode& stack, size_t& match_length_mi
|
||||||
enter_capture_group_scope();
|
enter_capture_group_scope();
|
||||||
if (!parse_disjunction(noncapture_group_bytecode, length, unicode, named))
|
if (!parse_disjunction(noncapture_group_bytecode, length, unicode, named))
|
||||||
return set_error(Error::InvalidPattern);
|
return set_error(Error::InvalidPattern);
|
||||||
clear_all_capture_groups_in_scope();
|
clear_all_capture_groups_in_scope(stack);
|
||||||
exit_capture_group_scope();
|
exit_capture_group_scope();
|
||||||
|
|
||||||
consume(TokenType::RightParen, Error::MismatchingParen);
|
consume(TokenType::RightParen, Error::MismatchingParen);
|
||||||
|
@ -2246,7 +2258,7 @@ bool ECMA262Parser::parse_capture_group(ByteCode& stack, size_t& match_length_mi
|
||||||
enter_capture_group_scope();
|
enter_capture_group_scope();
|
||||||
if (!parse_disjunction(capture_group_bytecode, length, unicode, named))
|
if (!parse_disjunction(capture_group_bytecode, length, unicode, named))
|
||||||
return set_error(Error::InvalidPattern);
|
return set_error(Error::InvalidPattern);
|
||||||
clear_all_capture_groups_in_scope();
|
clear_all_capture_groups_in_scope(stack);
|
||||||
exit_capture_group_scope();
|
exit_capture_group_scope();
|
||||||
|
|
||||||
register_capture_group_in_current_scope(group_index);
|
register_capture_group_in_current_scope(group_index);
|
||||||
|
@ -2277,7 +2289,7 @@ bool ECMA262Parser::parse_capture_group(ByteCode& stack, size_t& match_length_mi
|
||||||
if (!parse_disjunction(capture_group_bytecode, length, unicode, named))
|
if (!parse_disjunction(capture_group_bytecode, length, unicode, named))
|
||||||
return set_error(Error::InvalidPattern);
|
return set_error(Error::InvalidPattern);
|
||||||
|
|
||||||
clear_all_capture_groups_in_scope();
|
clear_all_capture_groups_in_scope(stack);
|
||||||
exit_capture_group_scope();
|
exit_capture_group_scope();
|
||||||
|
|
||||||
register_capture_group_in_current_scope(group_index);
|
register_capture_group_in_current_scope(group_index);
|
||||||
|
|
|
@ -254,6 +254,20 @@ private:
|
||||||
|
|
||||||
size_t ensure_total_number_of_capturing_parenthesis();
|
size_t ensure_total_number_of_capturing_parenthesis();
|
||||||
|
|
||||||
|
void enter_capture_group_scope() { m_capture_groups_in_scope.empend(); }
|
||||||
|
|
||||||
|
void exit_capture_group_scope()
|
||||||
|
{
|
||||||
|
auto last = m_capture_groups_in_scope.take_last();
|
||||||
|
m_capture_groups_in_scope.last().extend(move(last));
|
||||||
|
}
|
||||||
|
|
||||||
|
void clear_all_capture_groups_in_scope(ByteCode& stack)
|
||||||
|
{
|
||||||
|
for (auto& index : m_capture_groups_in_scope.last())
|
||||||
|
stack.insert_bytecode_clear_capture_group(index);
|
||||||
|
};
|
||||||
|
|
||||||
// ECMA-262's flavour of regex is a bit weird in that it allows backrefs to reference "future" captures, and such backrefs
|
// ECMA-262's flavour of regex is a bit weird in that it allows backrefs to reference "future" captures, and such backrefs
|
||||||
// always match the empty string. So we have to know how many capturing parenthesis there are, but we don't want to always
|
// always match the empty string. So we have to know how many capturing parenthesis there are, but we don't want to always
|
||||||
// parse it twice, so we'll just do so when it's actually needed.
|
// parse it twice, so we'll just do so when it's actually needed.
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue