1
Fork 0
mirror of https://github.com/RGBCube/serenity synced 2025-07-24 21:47:43 +00:00

LibRegex: Clear previous capture group contents in ECMA262 mode

ECMA262 requires that the capture groups inside a repeated group only
contain the values from the last iteration, e.g. `((c)(a)?(b))+` should
_not_ retain 'a' in the `(a)` capture group after matching "cabcb".
This commit is contained in:
Ali Mohammad Pur 2021-07-23 19:37:18 +04:30 committed by Ali Mohammad Pur
parent 34ec0fa8ad
commit c8b2199251
4 changed files with 109 additions and 1 deletions

View file

@ -132,6 +132,12 @@ void ByteCode::ensure_opcodes_initialized()
case OpCodeId::CheckBegin:
s_opcodes[i] = make<OpCode_CheckBegin>();
break;
case OpCodeId::ClearCaptureGroup:
s_opcodes[i] = make<OpCode_ClearCaptureGroup>();
break;
case OpCodeId::ClearNamedCaptureGroup:
s_opcodes[i] = make<OpCode_ClearNamedCaptureGroup>();
break;
case OpCodeId::SaveLeftCaptureGroup:
s_opcodes[i] = make<OpCode_SaveLeftCaptureGroup>();
break;
@ -288,6 +294,16 @@ ALWAYS_INLINE ExecutionResult OpCode_CheckEnd::execute(const MatchInput& input,
return ExecutionResult::Failed_ExecuteLowPrioForks;
}
// Resets the numbered capture group `id()` of the match at `input.match_index`
// to a default-constructed (empty) value.
//
// The instruction is a no-op when either the match index or the group id is out of
// range for `state.capture_group_matches`. Execution always continues afterwards.
ALWAYS_INLINE ExecutionResult OpCode_ClearCaptureGroup::execute(const MatchInput& input, MatchState& state, MatchOutput&) const
{
    auto const match_index = input.match_index;

    // Nothing has been recorded for this match yet, so there is nothing to clear.
    if (match_index >= state.capture_group_matches.size())
        return ExecutionResult::Continue;

    auto& captures = state.capture_group_matches[match_index];
    auto const group_id = id();
    if (group_id < captures.size())
        captures[group_id] = {};

    return ExecutionResult::Continue;
}
ALWAYS_INLINE ExecutionResult OpCode_SaveLeftCaptureGroup::execute(const MatchInput& input, MatchState& state, MatchOutput&) const
{
if (input.match_index >= state.capture_group_matches.size()) {
@ -333,6 +349,15 @@ ALWAYS_INLINE ExecutionResult OpCode_SaveRightCaptureGroup::execute(const MatchI
return ExecutionResult::Continue;
}
// Removes the named capture group `name()` from the named captures recorded for the
// match at `input.match_index`.
//
// The instruction is a no-op when no named captures exist for that match index.
// Execution always continues afterwards.
ALWAYS_INLINE ExecutionResult OpCode_ClearNamedCaptureGroup::execute(const MatchInput& input, MatchState& state, MatchOutput&) const
{
    // Bounds-check against the container that is actually indexed below.
    // Checking `capture_group_matches` here would not protect the access into
    // `named_capture_group_matches`, whose size is not tied to it by this function.
    if (input.match_index < state.named_capture_group_matches.size()) {
        auto& group = state.named_capture_group_matches[input.match_index];
        group.remove(name());
    }
    return ExecutionResult::Continue;
}
ALWAYS_INLINE ExecutionResult OpCode_SaveLeftNamedCaptureGroup::execute(const MatchInput& input, MatchState& state, MatchOutput&) const
{
if (input.match_index >= state.named_capture_group_matches.size()) {

View file

@ -39,6 +39,8 @@ using ByteCodeValueType = u64;
__ENUMERATE_OPCODE(Save) \
__ENUMERATE_OPCODE(Restore) \
__ENUMERATE_OPCODE(GoBack) \
__ENUMERATE_OPCODE(ClearCaptureGroup) \
__ENUMERATE_OPCODE(ClearNamedCaptureGroup) \
__ENUMERATE_OPCODE(Exit)
// clang-format off
@ -174,6 +176,19 @@ public:
extend(move(bytecode));
}
// Appends a ClearCaptureGroup instruction to this bytecode.
// Two values are emitted: the opcode id, followed by the capture group `index`.
void insert_bytecode_clear_capture_group(size_t index)
{
    auto const opcode = static_cast<ByteCodeValueType>(OpCodeId::ClearCaptureGroup);
    empend(opcode);
    empend(index);
}
// Appends a ClearNamedCaptureGroup instruction to this bytecode.
// Three values are emitted: the opcode id, the address of the name's characters,
// and the name's length.
//
// NOTE(review): only the address of the characters is stored, not a copy of them, so
// the storage behind `name` presumably has to stay alive for as long as this bytecode
// may be executed or printed; confirm against the callers.
void insert_bytecode_clear_named_capture_group(StringView name)
{
    auto const opcode = static_cast<ByteCodeValueType>(OpCodeId::ClearNamedCaptureGroup);
    auto const characters = reinterpret_cast<ByteCodeValueType>(name.characters_without_null_termination());

    empend(opcode);
    empend(characters);
    empend(name.length());
}
void insert_bytecode_compare_string(StringView view)
{
ByteCode bytecode;
@ -626,6 +641,28 @@ public:
const String arguments_string() const override { return String::formatted("kind={} ({})", (long unsigned int)argument(0), boundary_check_type_name(type())); }
};
// Bytecode instruction that clears a single numbered capture group.
// It occupies two bytecode values (see size()); argument(0) is the capture group id.
class OpCode_ClearCaptureGroup final : public OpCode {
public:
    ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;

    ALWAYS_INLINE OpCodeId opcode_id() const override
    {
        return OpCodeId::ClearCaptureGroup;
    }

    // Number of bytecode values taken up by this instruction.
    ALWAYS_INLINE size_t size() const override
    {
        return 2;
    }

    // Id of the capture group to clear.
    ALWAYS_INLINE size_t id() const
    {
        return argument(0);
    }

    // Human-readable rendering of the instruction's arguments.
    const String arguments_string() const override
    {
        return String::formatted("id={}", id());
    }
};
// Bytecode instruction that clears a single named capture group.
// It occupies three bytecode values (see size()); argument(0) is the address of the
// name's characters and argument(1) is the name's length.
class OpCode_ClearNamedCaptureGroup final : public OpCode {
public:
    ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;

    ALWAYS_INLINE OpCodeId opcode_id() const override
    {
        return OpCodeId::ClearNamedCaptureGroup;
    }

    // Number of bytecode values taken up by this instruction.
    ALWAYS_INLINE size_t size() const override
    {
        return 3;
    }

    // Length of the group name, in characters.
    ALWAYS_INLINE size_t length() const
    {
        return argument(1);
    }

    // View over the group name. The characters are not owned by the instruction;
    // the view is rebuilt from the stored address and length().
    ALWAYS_INLINE StringView name() const
    {
        auto* characters = reinterpret_cast<char*>(argument(0));
        return { characters, length() };
    }

    // Human-readable rendering of the instruction's arguments.
    const String arguments_string() const override
    {
        return String::formatted("name={}, length={}", name(), length());
    }
};
class OpCode_SaveLeftCaptureGroup final : public OpCode {
public:
ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;

View file

@ -1877,6 +1877,28 @@ bool ECMA262Parser::parse_capture_group(ByteCode& stack, size_t& match_length_mi
{
consume(TokenType::LeftParen, Error::InvalidPattern);
// Opens a new, empty innermost scope for capture groups.
auto enter_capture_group_scope = [&] {
m_capture_groups_in_scope.empend();
};
// Closes the innermost scope and hands the groups it collected to the enclosing scope,
// so that they remain visible to the enclosing group.
// Requires an enclosing scope to exist once the innermost one has been taken off.
auto exit_capture_group_scope = [&] {
auto last = m_capture_groups_in_scope.take_last();
m_capture_groups_in_scope.last().extend(move(last));
};
// Records a capture group (by index or by name) in the innermost scope.
auto register_capture_group_in_current_scope = [&](auto identifier) {
m_capture_groups_in_scope.last().empend(identifier);
};
// Emits one clear instruction into `stack` for every capture group recorded in the
// innermost scope: numbered groups are cleared by index, named groups by name.
auto clear_all_capture_groups_in_scope = [&] {
for (auto& entry : m_capture_groups_in_scope.last()) {
entry.visit(
[&](size_t index) {
stack.insert_bytecode_clear_capture_group(index);
},
[&](String const& name) {
stack.insert_bytecode_clear_named_capture_group(name);
});
}
};
if (match(TokenType::Questionmark)) {
// Non-capturing group or group with specifier.
consume();
@ -1885,8 +1907,12 @@ bool ECMA262Parser::parse_capture_group(ByteCode& stack, size_t& match_length_mi
consume();
ByteCode noncapture_group_bytecode;
size_t length = 0;
enter_capture_group_scope();
if (!parse_disjunction(noncapture_group_bytecode, length, unicode, named))
return set_error(Error::InvalidPattern);
clear_all_capture_groups_in_scope();
exit_capture_group_scope();
consume(TokenType::RightParen, Error::MismatchingParen);
@ -1907,8 +1933,14 @@ bool ECMA262Parser::parse_capture_group(ByteCode& stack, size_t& match_length_mi
ByteCode capture_group_bytecode;
size_t length = 0;
enter_capture_group_scope();
if (!parse_disjunction(capture_group_bytecode, length, unicode, named))
return set_error(Error::InvalidPattern);
clear_all_capture_groups_in_scope();
exit_capture_group_scope();
register_capture_group_in_current_scope(name);
register_capture_group_in_current_scope(group_index);
consume(TokenType::RightParen, Error::MismatchingParen);
@ -1930,7 +1962,7 @@ bool ECMA262Parser::parse_capture_group(ByteCode& stack, size_t& match_length_mi
}
auto group_index = ++m_parser_state.capture_groups_count;
stack.insert_bytecode_group_capture_left(group_index);
enter_capture_group_scope();
ByteCode capture_group_bytecode;
size_t length = 0;
@ -1938,6 +1970,12 @@ bool ECMA262Parser::parse_capture_group(ByteCode& stack, size_t& match_length_mi
if (!parse_disjunction(capture_group_bytecode, length, unicode, named))
return set_error(Error::InvalidPattern);
clear_all_capture_groups_in_scope();
exit_capture_group_scope();
register_capture_group_in_current_scope(group_index);
stack.insert_bytecode_group_capture_left(group_index);
stack.extend(move(capture_group_bytecode));
m_parser_state.capture_group_minimum_lengths.set(group_index, length);

View file

@ -190,12 +190,14 @@ public:
// Constructs a parser reading from `lexer`.
explicit ECMA262Parser(Lexer& lexer)
: Parser(lexer)
{
// Push the outermost capture group scope so that `m_capture_groups_in_scope.last()` is valid from the start.
m_capture_groups_in_scope.empend();
}
// Constructs a parser reading from `lexer` with optional `regex_options`.
// When no options are given, the base parser receives default-constructed options.
ECMA262Parser(Lexer& lexer, Optional<typename ParserTraits<ECMA262Parser>::OptionsType> regex_options)
    : Parser(lexer, regex_options.value_or({}))
{
    // Push the outermost capture group scope so that `m_capture_groups_in_scope.last()` is valid from the start.
    m_capture_groups_in_scope.empend();

    // The browser-extended grammar is only enabled when options were passed and carry the flag.
    bool const browser_extended = regex_options.has_value() && regex_options->has_flag_set(ECMAScriptFlags::BrowserExtended);
    m_should_use_browser_extended_grammar = browser_extended;
}
~ECMA262Parser() = default;
@ -242,6 +244,12 @@ private:
// Keep the Annex B. behaviour behind a flag, the users can enable it by passing the `ECMAScriptFlags::BrowserExtended` flag.
bool m_should_use_browser_extended_grammar { false };
// ECMA-262 basically requires that we clear the inner captures of a capture group before trying to match it,
// by requiring that (...)+ only contain the matches for the last iteration.
// To do that, we have to keep track of which capture groups are "in scope", so we can clear them as needed.
// A capture group is identified either by its index (size_t) or by its name (String).
using CaptureGroup = Variant<size_t, String>;
// Stack of scopes, innermost last; each scope lists the capture groups registered in it.
Vector<Vector<CaptureGroup>> m_capture_groups_in_scope;
};
using PosixExtended = PosixExtendedParser;