LibRegex: add a ResetRepeat opcode and use separate repetition marks for
the minimum and maximum parts of an explicit {min,max} quantifier.

Summary of the changes carried by this patch:
  * RegexByteCode.h: a ResetRepeat entry is added to the opcode list and a new
    OpCode_ResetRepeat class (two bytecode slots wide, one argument: the mark id).
  * RegexByteCode.cpp: the opcode table registers OpCode_ResetRepeat, and
    its execute() grows state.repetition_marks when the id is out of range,
    then sets the mark at that id to 0.
  * transform_bytecode_repetition_min_max() takes min_repetition_mark_id and
    max_repetition_mark_id instead of one id. The min id is passed to
    insert_bytecode_repetition_n(); the max id is written into the Repeat
    instruction and the trailing ResetRepeat, both of which are emitted only
    when (max - min) > 1. A leftover dbgln() call is removed.
  * RegexParser.cpp: the POSIX basic, POSIX extended and ECMA262 parsers each
    allocate two repetition mark ids per explicit quantifier.

Note: indentation and column alignment inside the hunks are reconstructed. If
context does not match byte-for-byte, apply with whitespace-insensitive
matching (for example "git apply --ignore-whitespace" or "patch -l").

diff --git a/Userland/Libraries/LibRegex/RegexByteCode.cpp b/Userland/Libraries/LibRegex/RegexByteCode.cpp
index 00b4c0a299..9e1c7b2251 100644
--- a/Userland/Libraries/LibRegex/RegexByteCode.cpp
+++ b/Userland/Libraries/LibRegex/RegexByteCode.cpp
@@ -189,6 +189,9 @@ void ByteCode::ensure_opcodes_initialized()
         case OpCodeId::Repeat:
             s_opcodes[i] = make<OpCode_Repeat>();
             break;
+        case OpCodeId::ResetRepeat:
+            s_opcodes[i] = make<OpCode_ResetRepeat>();
+            break;
         }
     }
     s_opcodes_initialized = true;
@@ -883,4 +886,13 @@ ALWAYS_INLINE ExecutionResult OpCode_Repeat::execute(MatchInput const&, MatchSta
     return ExecutionResult::Continue;
 }
 
+ALWAYS_INLINE ExecutionResult OpCode_ResetRepeat::execute(MatchInput const&, MatchState& state) const
+{
+    if (id() >= state.repetition_marks.size())
+        state.repetition_marks.resize(id() + 1);
+
+    state.repetition_marks.at(id()) = 0;
+    return ExecutionResult::Continue;
+}
+
 }
diff --git a/Userland/Libraries/LibRegex/RegexByteCode.h b/Userland/Libraries/LibRegex/RegexByteCode.h
index cb22806459..7ebadf10c7 100644
--- a/Userland/Libraries/LibRegex/RegexByteCode.h
+++ b/Userland/Libraries/LibRegex/RegexByteCode.h
@@ -41,6 +41,7 @@ using ByteCodeValueType = u64;
     __ENUMERATE_OPCODE(GoBack)            \
     __ENUMERATE_OPCODE(ClearCaptureGroup) \
     __ENUMERATE_OPCODE(Repeat)            \
+    __ENUMERATE_OPCODE(ResetRepeat)       \
     __ENUMERATE_OPCODE(Exit)
 
 // clang-format off
@@ -333,40 +334,46 @@ public:
     }
 
     template<typename T>
-    static void transform_bytecode_repetition_min_max(ByteCode& bytecode_to_repeat, T minimum, Optional<T> maximum, size_t repetition_mark_id, bool greedy = true) requires(IsIntegral<T>)
+    static void transform_bytecode_repetition_min_max(ByteCode& bytecode_to_repeat, T minimum, Optional<T> maximum, size_t min_repetition_mark_id, size_t max_repetition_mark_id, bool greedy = true) requires(IsIntegral<T>)
     {
         ByteCode new_bytecode;
-        new_bytecode.insert_bytecode_repetition_n(bytecode_to_repeat, minimum, repetition_mark_id);
+        new_bytecode.insert_bytecode_repetition_n(bytecode_to_repeat, minimum, min_repetition_mark_id);
 
         if (maximum.has_value()) {
             // (REPEAT REGEXP MIN)
-            // LABEL _MAX_LOOP            |
-            // FORK END                   |
-            // REGEXP                     |
-            // REPEAT _MAX_LOOP MAX-1     | if max > min
-            // REGEXP                     |
-            // FORK END                   |
-            // LABEL END                  |
+            // LABEL _MAX_LOOP            |
+            // FORK END                   |
+            // REGEXP                     |
+            // REPEAT _MAX_LOOP MAX-MIN   | if max > min
+            // FORK END                   |
+            // REGEXP                     |
+            // LABEL END                  |
+            // RESET _MAX_LOOP            |
             auto jump_kind = static_cast<ByteCodeValueType>(greedy ? OpCodeId::ForkStay : OpCodeId::ForkJump);
             if (maximum.value() > minimum) {
                 new_bytecode.empend(jump_kind);
                 new_bytecode.empend((ByteCodeValueType)0); // Placeholder for the jump target.
                 auto pre_loop_fork_jump_index = new_bytecode.size();
+                new_bytecode.extend(bytecode_to_repeat);
                 auto repetitions = maximum.value() - minimum;
-                dbgln("max {}, min {}, reps {}", *maximum, minimum, repetitions);
+                auto fork_jump_address = new_bytecode.size();
                 if (repetitions > 1) {
-                    new_bytecode.extend(bytecode_to_repeat);
                     new_bytecode.empend((ByteCodeValueType)OpCodeId::Repeat);
                     new_bytecode.empend(bytecode_to_repeat.size() + 2);
                     new_bytecode.empend(static_cast<ByteCodeValueType>(repetitions - 1));
-                    new_bytecode.empend(repetition_mark_id);
+                    new_bytecode.empend(max_repetition_mark_id);
+                    new_bytecode.empend(jump_kind);
+                    new_bytecode.empend((ByteCodeValueType)0); // Placeholder for the jump target.
+                    auto post_loop_fork_jump_index = new_bytecode.size();
+                    new_bytecode.extend(bytecode_to_repeat);
+                    fork_jump_address = new_bytecode.size();
+
+                    new_bytecode[post_loop_fork_jump_index - 1] = (ByteCodeValueType)(fork_jump_address - post_loop_fork_jump_index);
+
+                    new_bytecode.empend((ByteCodeValueType)OpCodeId::ResetRepeat);
+                    new_bytecode.empend((ByteCodeValueType)max_repetition_mark_id);
                 }
-                new_bytecode.extend(bytecode_to_repeat);
-                new_bytecode.empend(jump_kind);
-                new_bytecode.empend((ByteCodeValueType)0); // Placeholder for the jump target.
-                auto post_loop_fork_jump_index = new_bytecode.size();
-                new_bytecode[pre_loop_fork_jump_index - 1] = (ByteCodeValueType)(new_bytecode.size() - pre_loop_fork_jump_index);
-                new_bytecode[post_loop_fork_jump_index - 1] = (ByteCodeValueType)(new_bytecode.size() - post_loop_fork_jump_index);
+                new_bytecode[pre_loop_fork_jump_index - 1] = (ByteCodeValueType)(fork_jump_address - pre_loop_fork_jump_index);
             }
         } else {
             // no maximum value set, repeat finding if possible
@@ -724,6 +731,19 @@ public:
     }
 };
 
+class OpCode_ResetRepeat : public OpCode {
+public:
+    ExecutionResult execute(MatchInput const& input, MatchState& state) const override;
+    ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::ResetRepeat; }
+    ALWAYS_INLINE size_t size() const override { return 2; }
+    ALWAYS_INLINE size_t id() const { return argument(0); }
+    String const arguments_string() const override
+    {
+        auto reps = id() < state().repetition_marks.size() ? state().repetition_marks.at(id()) : 0;
+        return String::formatted("id={} rep={}", id(), reps + 1);
+    }
+};
+
 template<typename T>
 bool is(OpCode const&);
 
diff --git a/Userland/Libraries/LibRegex/RegexParser.cpp b/Userland/Libraries/LibRegex/RegexParser.cpp
index 343eb7a553..6f95c3d678 100644
--- a/Userland/Libraries/LibRegex/RegexParser.cpp
+++ b/Userland/Libraries/LibRegex/RegexParser.cpp
@@ -446,8 +446,9 @@ bool PosixBasicParser::parse_simple_re(ByteCode& bytecode, size_t& match_length_
             if (min_limit > s_maximum_repetition_count || (max_limit.has_value() && *max_limit > s_maximum_repetition_count))
                 return set_error(Error::InvalidBraceContent);
 
-            auto repetition_mark_id = m_parser_state.repetition_mark_count++;
-            ByteCode::transform_bytecode_repetition_min_max(simple_re_bytecode, min_limit, max_limit, repetition_mark_id, true);
+            auto min_repetition_mark_id = m_parser_state.repetition_mark_count++;
+            auto max_repetition_mark_id = m_parser_state.repetition_mark_count++;
+            ByteCode::transform_bytecode_repetition_min_max(simple_re_bytecode, min_limit, max_limit, min_repetition_mark_id, max_repetition_mark_id, true);
             match_length_minimum += re_match_length_minimum * min_limit;
         } else {
             match_length_minimum += re_match_length_minimum;
@@ -620,8 +621,9 @@ ALWAYS_INLINE bool PosixExtendedParser::parse_repetition_symbol(ByteCode& byteco
             maybe_maximum = value.value();
         }
 
-        auto repetition_mark_id = m_parser_state.repetition_mark_count++;
-        ByteCode::transform_bytecode_repetition_min_max(bytecode_to_repeat, minimum, maybe_maximum, repetition_mark_id);
+        auto min_repetition_mark_id = m_parser_state.repetition_mark_count++;
+        auto max_repetition_mark_id = m_parser_state.repetition_mark_count++;
+        ByteCode::transform_bytecode_repetition_min_max(bytecode_to_repeat, minimum, maybe_maximum, min_repetition_mark_id, max_repetition_mark_id);
 
         consume(TokenType::RightCurly, Error::MismatchingBrace);
         return !has_error();
@@ -1219,8 +1221,9 @@ bool ECMA262Parser::parse_quantifier(ByteCode& stack, size_t& match_length_minim
         match_length_minimum = 0;
         break;
     case Repetition::Explicit: {
-        auto repetition_mark_id = m_parser_state.repetition_mark_count++;
-        ByteCode::transform_bytecode_repetition_min_max(stack, repeat_min.value(), repeat_max, repetition_mark_id, !ungreedy);
+        auto min_repetition_mark_id = m_parser_state.repetition_mark_count++;
+        auto max_repetition_mark_id = m_parser_state.repetition_mark_count++;
+        ByteCode::transform_bytecode_repetition_min_max(stack, repeat_min.value(), repeat_max, min_repetition_mark_id, max_repetition_mark_id, !ungreedy);
         match_length_minimum *= repeat_min.value();
         break;
     }