mirror of
https://github.com/RGBCube/serenity
synced 2025-05-31 14:38:11 +00:00
LibRegex: Implement and use a REPEAT operation for bytecode repetition
Currently, when we need to repeat an instruction N times, we simply add that instruction N times in a for-loop. This doesn't scale well to extremely large values of N, and ECMA-262 allows N to be as large as 2^53 - 1. Instead, add a new REPEAT bytecode operation that defers this loop from the parser to the runtime executor. This lets the parser complete without any loops (for this instruction), and lets the executor bail out early if the repeated bytecode fails. Note: The ByteCode methods are templated so that the Posix parsers can continue using u32, because they are limited to N = 2^20.
This commit is contained in:
parent
a0b72f5ad3
commit
9509433e25
7 changed files with 103 additions and 16 deletions
|
@ -17,6 +17,7 @@
|
|||
namespace regex {
|
||||
|
||||
static constexpr size_t s_maximum_repetition_count = 1024 * 1024;
|
||||
static constexpr u64 s_ecma262_maximum_repetition_count = (1ull << 53) - 1;
|
||||
static constexpr auto s_alphabetic_characters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"sv;
|
||||
static constexpr auto s_decimal_characters = "0123456789"sv;
|
||||
|
||||
|
@ -419,7 +420,8 @@ bool PosixBasicParser::parse_simple_re(ByteCode& bytecode, size_t& match_length_
|
|||
if (min_limit > s_maximum_repetition_count || (max_limit.has_value() && *max_limit > s_maximum_repetition_count))
|
||||
return set_error(Error::InvalidBraceContent);
|
||||
|
||||
ByteCode::transform_bytecode_repetition_min_max(simple_re_bytecode, min_limit, max_limit, true);
|
||||
auto repetition_mark_id = m_parser_state.repetition_mark_count++;
|
||||
ByteCode::transform_bytecode_repetition_min_max(simple_re_bytecode, min_limit, max_limit, repetition_mark_id, true);
|
||||
match_length_minimum += re_match_length_minimum * min_limit;
|
||||
} else {
|
||||
match_length_minimum += re_match_length_minimum;
|
||||
|
@ -564,15 +566,17 @@ ALWAYS_INLINE bool PosixExtendedParser::parse_repetition_symbol(ByteCode& byteco
|
|||
if (match(TokenType::Comma)) {
|
||||
consume();
|
||||
} else {
|
||||
auto repetition_mark_id = m_parser_state.repetition_mark_count++;
|
||||
|
||||
ByteCode bytecode;
|
||||
bytecode.insert_bytecode_repetition_n(bytecode_to_repeat, minimum);
|
||||
bytecode.insert_bytecode_repetition_n(bytecode_to_repeat, minimum, repetition_mark_id);
|
||||
bytecode_to_repeat = move(bytecode);
|
||||
|
||||
consume(TokenType::RightCurly, Error::MismatchingBrace);
|
||||
return !has_error();
|
||||
}
|
||||
|
||||
Optional<size_t> maybe_maximum {};
|
||||
Optional<u32> maybe_maximum {};
|
||||
number_builder.clear();
|
||||
while (match(TokenType::Char)) {
|
||||
number_builder.append(consume().value());
|
||||
|
@ -585,7 +589,8 @@ ALWAYS_INLINE bool PosixExtendedParser::parse_repetition_symbol(ByteCode& byteco
|
|||
maybe_maximum = value.value();
|
||||
}
|
||||
|
||||
ByteCode::transform_bytecode_repetition_min_max(bytecode_to_repeat, minimum, maybe_maximum);
|
||||
auto repetition_mark_id = m_parser_state.repetition_mark_count++;
|
||||
ByteCode::transform_bytecode_repetition_min_max(bytecode_to_repeat, minimum, maybe_maximum, repetition_mark_id);
|
||||
|
||||
consume(TokenType::RightCurly, Error::MismatchingBrace);
|
||||
return !has_error();
|
||||
|
@ -1141,7 +1146,7 @@ bool ECMA262Parser::parse_quantifier(ByteCode& stack, size_t& match_length_minim
|
|||
} repetition_mark { Repetition::None };
|
||||
|
||||
bool ungreedy = false;
|
||||
Optional<size_t> repeat_min, repeat_max;
|
||||
Optional<u64> repeat_min, repeat_max;
|
||||
|
||||
if (match(TokenType::Asterisk)) {
|
||||
consume();
|
||||
|
@ -1182,10 +1187,12 @@ bool ECMA262Parser::parse_quantifier(ByteCode& stack, size_t& match_length_minim
|
|||
ByteCode::transform_bytecode_repetition_zero_or_one(stack, !ungreedy);
|
||||
match_length_minimum = 0;
|
||||
break;
|
||||
case Repetition::Explicit:
|
||||
ByteCode::transform_bytecode_repetition_min_max(stack, repeat_min.value(), repeat_max, !ungreedy);
|
||||
case Repetition::Explicit: {
|
||||
auto repetition_mark_id = m_parser_state.repetition_mark_count++;
|
||||
ByteCode::transform_bytecode_repetition_min_max(stack, repeat_min.value(), repeat_max, repetition_mark_id, !ungreedy);
|
||||
match_length_minimum *= repeat_min.value();
|
||||
break;
|
||||
}
|
||||
case Repetition::None:
|
||||
VERIFY_NOT_REACHED();
|
||||
}
|
||||
|
@ -1193,7 +1200,7 @@ bool ECMA262Parser::parse_quantifier(ByteCode& stack, size_t& match_length_minim
|
|||
return true;
|
||||
}
|
||||
|
||||
bool ECMA262Parser::parse_interval_quantifier(Optional<size_t>& repeat_min, Optional<size_t>& repeat_max)
|
||||
bool ECMA262Parser::parse_interval_quantifier(Optional<u64>& repeat_min, Optional<u64>& repeat_max)
|
||||
{
|
||||
VERIFY(match(TokenType::LeftCurly));
|
||||
consume();
|
||||
|
@ -1202,7 +1209,7 @@ bool ECMA262Parser::parse_interval_quantifier(Optional<size_t>& repeat_min, Opti
|
|||
auto low_bound_string = read_digits_as_string();
|
||||
chars_consumed += low_bound_string.length();
|
||||
|
||||
auto low_bound = low_bound_string.to_uint();
|
||||
auto low_bound = low_bound_string.to_uint<u64>();
|
||||
|
||||
if (!low_bound.has_value()) {
|
||||
if (!m_should_use_browser_extended_grammar && done())
|
||||
|
@ -1218,7 +1225,7 @@ bool ECMA262Parser::parse_interval_quantifier(Optional<size_t>& repeat_min, Opti
|
|||
consume();
|
||||
++chars_consumed;
|
||||
auto high_bound_string = read_digits_as_string();
|
||||
auto high_bound = high_bound_string.to_uint();
|
||||
auto high_bound = high_bound_string.to_uint<u64>();
|
||||
if (high_bound.has_value()) {
|
||||
repeat_max = high_bound.value();
|
||||
chars_consumed += high_bound_string.length();
|
||||
|
@ -1243,6 +1250,9 @@ bool ECMA262Parser::parse_interval_quantifier(Optional<size_t>& repeat_min, Opti
|
|||
set_error(Error::InvalidBraceContent);
|
||||
}
|
||||
|
||||
if ((*repeat_min > s_ecma262_maximum_repetition_count) || (repeat_max.has_value() && (*repeat_max > s_ecma262_maximum_repetition_count)))
|
||||
return set_error(Error::InvalidBraceContent);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue