mirror of
https://github.com/RGBCube/serenity
synced 2025-07-24 22:07:34 +00:00
LibRegex: Clear previous capture group contents in ECMA262 mode
ECMA262 requires that the capture groups inside a repeated group only contain the values from the last iteration, e.g. `((c)(a)?(b))+` should _not_ contain 'a' in the second inner capture group `(a)` when matching "cabcb", because the final iteration ("cb") does not match it.
This commit is contained in:
parent
34ec0fa8ad
commit
c8b2199251
4 changed files with 109 additions and 1 deletions
|
@ -132,6 +132,12 @@ void ByteCode::ensure_opcodes_initialized()
|
|||
case OpCodeId::CheckBegin:
|
||||
s_opcodes[i] = make<OpCode_CheckBegin>();
|
||||
break;
|
||||
case OpCodeId::ClearCaptureGroup:
|
||||
s_opcodes[i] = make<OpCode_ClearCaptureGroup>();
|
||||
break;
|
||||
case OpCodeId::ClearNamedCaptureGroup:
|
||||
s_opcodes[i] = make<OpCode_ClearNamedCaptureGroup>();
|
||||
break;
|
||||
case OpCodeId::SaveLeftCaptureGroup:
|
||||
s_opcodes[i] = make<OpCode_SaveLeftCaptureGroup>();
|
||||
break;
|
||||
|
@ -288,6 +294,16 @@ ALWAYS_INLINE ExecutionResult OpCode_CheckEnd::execute(const MatchInput& input,
|
|||
return ExecutionResult::Failed_ExecuteLowPrioForks;
|
||||
}
|
||||
|
||||
// Resets the numbered capture group `id()` of the match at `input.match_index`
// to a default-constructed value. If either the match index or the group id is
// out of range for the stored captures, nothing is modified.
// Always returns ExecutionResult::Continue.
ALWAYS_INLINE ExecutionResult OpCode_ClearCaptureGroup::execute(const MatchInput& input, MatchState& state, MatchOutput&) const
{
    // No capture storage for this match yet: nothing to clear.
    if (input.match_index >= state.capture_group_matches.size())
        return ExecutionResult::Continue;

    auto& groups_for_match = state.capture_group_matches[input.match_index];
    auto const group_id = id();
    if (group_id < groups_for_match.size())
        groups_for_match[group_id] = {};

    return ExecutionResult::Continue;
}
|
||||
|
||||
ALWAYS_INLINE ExecutionResult OpCode_SaveLeftCaptureGroup::execute(const MatchInput& input, MatchState& state, MatchOutput&) const
|
||||
{
|
||||
if (input.match_index >= state.capture_group_matches.size()) {
|
||||
|
@ -333,6 +349,15 @@ ALWAYS_INLINE ExecutionResult OpCode_SaveRightCaptureGroup::execute(const MatchI
|
|||
return ExecutionResult::Continue;
|
||||
}
|
||||
|
||||
// Removes the named capture group `name()` from the named captures recorded
// for the match at `input.match_index`. If no named-capture storage exists for
// that match index, nothing is modified.
// Always returns ExecutionResult::Continue.
ALWAYS_INLINE ExecutionResult OpCode_ClearNamedCaptureGroup::execute(const MatchInput& input, MatchState& state, MatchOutput&) const
{
    // Bounds-check the container that is actually indexed below. The previous
    // guard compared against `capture_group_matches.size()`, which says nothing
    // about whether `named_capture_group_matches[match_index]` is in range.
    if (input.match_index < state.named_capture_group_matches.size()) {
        auto& group = state.named_capture_group_matches[input.match_index];
        group.remove(name());
    }
    return ExecutionResult::Continue;
}
|
||||
|
||||
ALWAYS_INLINE ExecutionResult OpCode_SaveLeftNamedCaptureGroup::execute(const MatchInput& input, MatchState& state, MatchOutput&) const
|
||||
{
|
||||
if (input.match_index >= state.named_capture_group_matches.size()) {
|
||||
|
|
|
@ -39,6 +39,8 @@ using ByteCodeValueType = u64;
|
|||
__ENUMERATE_OPCODE(Save) \
|
||||
__ENUMERATE_OPCODE(Restore) \
|
||||
__ENUMERATE_OPCODE(GoBack) \
|
||||
__ENUMERATE_OPCODE(ClearCaptureGroup) \
|
||||
__ENUMERATE_OPCODE(ClearNamedCaptureGroup) \
|
||||
__ENUMERATE_OPCODE(Exit)
|
||||
|
||||
// clang-format off
|
||||
|
@ -174,6 +176,19 @@ public:
|
|||
extend(move(bytecode));
|
||||
}
|
||||
|
||||
// Appends a ClearCaptureGroup instruction to this bytecode: the opcode id
// followed by the numeric capture group `index`.
void insert_bytecode_clear_capture_group(size_t index)
{
    auto const opcode = static_cast<ByteCodeValueType>(OpCodeId::ClearCaptureGroup);
    empend(opcode);
    empend(index);
}
|
||||
|
||||
// Appends a ClearNamedCaptureGroup instruction to this bytecode: the opcode id,
// then the address of the name's characters, then the name's length.
// NOTE(review): only the pointer is stored, not a copy of the characters, so the
// buffer behind `name` presumably has to stay alive for as long as this bytecode
// is used - confirm that callers guarantee this.
void insert_bytecode_clear_named_capture_group(StringView name)
{
    auto const opcode = static_cast<ByteCodeValueType>(OpCodeId::ClearNamedCaptureGroup);
    auto const name_address = reinterpret_cast<ByteCodeValueType>(name.characters_without_null_termination());

    empend(opcode);
    empend(name_address);
    empend(name.length());
}
|
||||
|
||||
void insert_bytecode_compare_string(StringView view)
|
||||
{
|
||||
ByteCode bytecode;
|
||||
|
@ -626,6 +641,28 @@ public:
|
|||
const String arguments_string() const override { return String::formatted("kind={} ({})", (long unsigned int)argument(0), boundary_check_type_name(type())); }
|
||||
};
|
||||
|
||||
class OpCode_ClearCaptureGroup final : public OpCode {
|
||||
public:
|
||||
ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
|
||||
ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::ClearCaptureGroup; }
|
||||
ALWAYS_INLINE size_t size() const override { return 2; }
|
||||
ALWAYS_INLINE size_t id() const { return argument(0); }
|
||||
const String arguments_string() const override { return String::formatted("id={}", id()); }
|
||||
};
|
||||
|
||||
class OpCode_ClearNamedCaptureGroup final : public OpCode {
|
||||
public:
|
||||
ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
|
||||
ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::ClearNamedCaptureGroup; }
|
||||
ALWAYS_INLINE size_t size() const override { return 3; }
|
||||
ALWAYS_INLINE StringView name() const { return { reinterpret_cast<char*>(argument(0)), length() }; }
|
||||
ALWAYS_INLINE size_t length() const { return argument(1); }
|
||||
const String arguments_string() const override
|
||||
{
|
||||
return String::formatted("name={}, length={}", name(), length());
|
||||
}
|
||||
};
|
||||
|
||||
class OpCode_SaveLeftCaptureGroup final : public OpCode {
|
||||
public:
|
||||
ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
|
||||
|
|
|
@ -1877,6 +1877,28 @@ bool ECMA262Parser::parse_capture_group(ByteCode& stack, size_t& match_length_mi
|
|||
{
|
||||
consume(TokenType::LeftParen, Error::InvalidPattern);
|
||||
|
||||
auto enter_capture_group_scope = [&] {
|
||||
m_capture_groups_in_scope.empend();
|
||||
};
|
||||
auto exit_capture_group_scope = [&] {
|
||||
auto last = m_capture_groups_in_scope.take_last();
|
||||
m_capture_groups_in_scope.last().extend(move(last));
|
||||
};
|
||||
auto register_capture_group_in_current_scope = [&](auto identifier) {
|
||||
m_capture_groups_in_scope.last().empend(identifier);
|
||||
};
|
||||
auto clear_all_capture_groups_in_scope = [&] {
|
||||
for (auto& entry : m_capture_groups_in_scope.last()) {
|
||||
entry.visit(
|
||||
[&](size_t index) {
|
||||
stack.insert_bytecode_clear_capture_group(index);
|
||||
},
|
||||
[&](String const& name) {
|
||||
stack.insert_bytecode_clear_named_capture_group(name);
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
if (match(TokenType::Questionmark)) {
|
||||
// Non-capturing group or group with specifier.
|
||||
consume();
|
||||
|
@ -1885,8 +1907,12 @@ bool ECMA262Parser::parse_capture_group(ByteCode& stack, size_t& match_length_mi
|
|||
consume();
|
||||
ByteCode noncapture_group_bytecode;
|
||||
size_t length = 0;
|
||||
|
||||
enter_capture_group_scope();
|
||||
if (!parse_disjunction(noncapture_group_bytecode, length, unicode, named))
|
||||
return set_error(Error::InvalidPattern);
|
||||
clear_all_capture_groups_in_scope();
|
||||
exit_capture_group_scope();
|
||||
|
||||
consume(TokenType::RightParen, Error::MismatchingParen);
|
||||
|
||||
|
@ -1907,8 +1933,14 @@ bool ECMA262Parser::parse_capture_group(ByteCode& stack, size_t& match_length_mi
|
|||
|
||||
ByteCode capture_group_bytecode;
|
||||
size_t length = 0;
|
||||
enter_capture_group_scope();
|
||||
if (!parse_disjunction(capture_group_bytecode, length, unicode, named))
|
||||
return set_error(Error::InvalidPattern);
|
||||
clear_all_capture_groups_in_scope();
|
||||
exit_capture_group_scope();
|
||||
|
||||
register_capture_group_in_current_scope(name);
|
||||
register_capture_group_in_current_scope(group_index);
|
||||
|
||||
consume(TokenType::RightParen, Error::MismatchingParen);
|
||||
|
||||
|
@ -1930,7 +1962,7 @@ bool ECMA262Parser::parse_capture_group(ByteCode& stack, size_t& match_length_mi
|
|||
}
|
||||
|
||||
auto group_index = ++m_parser_state.capture_groups_count;
|
||||
stack.insert_bytecode_group_capture_left(group_index);
|
||||
enter_capture_group_scope();
|
||||
|
||||
ByteCode capture_group_bytecode;
|
||||
size_t length = 0;
|
||||
|
@ -1938,6 +1970,12 @@ bool ECMA262Parser::parse_capture_group(ByteCode& stack, size_t& match_length_mi
|
|||
if (!parse_disjunction(capture_group_bytecode, length, unicode, named))
|
||||
return set_error(Error::InvalidPattern);
|
||||
|
||||
clear_all_capture_groups_in_scope();
|
||||
exit_capture_group_scope();
|
||||
|
||||
register_capture_group_in_current_scope(group_index);
|
||||
|
||||
stack.insert_bytecode_group_capture_left(group_index);
|
||||
stack.extend(move(capture_group_bytecode));
|
||||
|
||||
m_parser_state.capture_group_minimum_lengths.set(group_index, length);
|
||||
|
|
|
@ -190,12 +190,14 @@ public:
|
|||
// Constructs a parser over `lexer` and seeds the capture-group scope stack
// with one initial (outermost) scope.
explicit ECMA262Parser(Lexer& lexer)
    : Parser(lexer)
{
    // The scope helpers call last() on this stack, so it starts with one entry.
    m_capture_groups_in_scope.empend();
}
|
||||
|
||||
// Constructs a parser over `lexer` with optional regex options (defaulted when
// absent). Enables the browser-extended grammar only when options were supplied
// and carry the BrowserExtended flag, and seeds the capture-group scope stack
// with one initial (outermost) scope.
ECMA262Parser(Lexer& lexer, Optional<typename ParserTraits<ECMA262Parser>::OptionsType> regex_options)
    : Parser(lexer, regex_options.value_or({}))
{
    auto const wants_browser_extended = regex_options.has_value() && regex_options->has_flag_set(ECMAScriptFlags::BrowserExtended);
    m_should_use_browser_extended_grammar = wants_browser_extended;

    m_capture_groups_in_scope.empend();
}
|
||||
|
||||
~ECMA262Parser() = default;
|
||||
|
@ -242,6 +244,12 @@ private:
|
|||
|
||||
// Keep the Annex B. behaviour behind a flag, the users can enable it by passing the `ECMAScriptFlags::BrowserExtended` flag.
|
||||
bool m_should_use_browser_extended_grammar { false };
|
||||
|
||||
// ECMA-262 basically requires that we clear the inner captures of a capture group before trying to match it,
|
||||
// by requiring that (...)+ only contain the matches for the last iteration.
|
||||
// To do that, we have to keep track of which capture groups are "in scope", so we can clear them as needed.
|
||||
using CaptureGroup = Variant<size_t, String>;
|
||||
Vector<Vector<CaptureGroup>> m_capture_groups_in_scope;
|
||||
};
|
||||
|
||||
using PosixExtended = PosixExtendedParser;
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue