mirror of
https://github.com/RGBCube/serenity
synced 2025-07-26 01:57:45 +00:00
LibRegex: Implement legacy octal escape parsing closer to the spec
The grammar for the ECMA-262 CharacterEscape is:

    CharacterEscape[U, N] ::
        ControlEscape
        c ControlLetter
        0 [lookahead ∉ DecimalDigit]
        HexEscapeSequence
        RegExpUnicodeEscapeSequence[?U]
        [~U]LegacyOctalEscapeSequence
        IdentityEscape[?U, ?N]

It's important to parse the standalone "\0 [lookahead ∉ DecimalDigit]" before parsing LegacyOctalEscapeSequence. Otherwise, all standalone "\0" patterns are parsed as octal escapes, which are disallowed in Unicode mode.

Further, LegacyOctalEscapeSequence should also be parsed while parsing character classes.
This commit is contained in:
parent
83ca8c7e38
commit
6a485f612f
2 changed files with 43 additions and 9 deletions
|
@ -18,6 +18,7 @@ namespace regex {
|
|||
|
||||
static constexpr size_t s_maximum_repetition_count = 1024 * 1024;
|
||||
static constexpr auto s_alphabetic_characters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"sv;
|
||||
static constexpr auto s_decimal_characters = "0123456789"sv;
|
||||
|
||||
ALWAYS_INLINE bool Parser::set_error(Error error)
|
||||
{
|
||||
|
@ -1430,6 +1431,17 @@ bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_mini
|
|||
return true;
|
||||
}
|
||||
|
||||
// '\0'
|
||||
if (try_skip("0")) {
|
||||
if (!lookahead_any(s_decimal_characters)) {
|
||||
match_length_minimum += 1;
|
||||
stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)0 } });
|
||||
return true;
|
||||
}
|
||||
|
||||
back();
|
||||
}
|
||||
|
||||
// LegacyOctalEscapeSequence
|
||||
if (m_should_use_browser_extended_grammar) {
|
||||
if (!unicode) {
|
||||
|
@ -1441,13 +1453,6 @@ bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_mini
|
|||
}
|
||||
}
|
||||
|
||||
// '\0'
|
||||
if (try_skip("0")) {
|
||||
match_length_minimum += 1;
|
||||
stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)0 } });
|
||||
return true;
|
||||
}
|
||||
|
||||
// HexEscape
|
||||
if (try_skip("x")) {
|
||||
if (auto hex_escape = read_digits(ReadDigitsInitialZeroState::Allow, true, 2, 2); hex_escape.has_value()) {
|
||||
|
@ -1797,8 +1802,17 @@ bool ECMA262Parser::parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>&
|
|||
}
|
||||
|
||||
// '\0'
|
||||
if (try_skip("0"))
|
||||
return { CharClassRangeElement { .code_point = 0, .is_character_class = false } };
|
||||
if (try_skip("0")) {
|
||||
if (!lookahead_any(s_decimal_characters))
|
||||
return { CharClassRangeElement { .code_point = 0, .is_character_class = false } };
|
||||
back();
|
||||
}
|
||||
|
||||
// LegacyOctalEscapeSequence
|
||||
if (m_should_use_browser_extended_grammar && !unicode) {
|
||||
if (auto escape = parse_legacy_octal_escape(); escape.has_value())
|
||||
return { CharClassRangeElement { .code_point = escape.value(), .is_character_class = false } };
|
||||
}
|
||||
|
||||
// HexEscape
|
||||
if (try_skip("x")) {
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue