1
Fork 0
mirror of https://github.com/RGBCube/serenity synced 2025-07-26 01:57:45 +00:00

LibRegex: Implement legacy octal escape parsing closer to the spec

The grammar for the ECMA-262 CharacterEscape is:

  CharacterEscape[U, N] ::
    ControlEscape
    c ControlLetter
    0 [lookahead ∉ DecimalDigit]
    HexEscapeSequence
    RegExpUnicodeEscapeSequence[?U]
    [~U]LegacyOctalEscapeSequence
    IdentityEscape[?U, ?N]

It's important to parse the standalone "\0 [lookahead ∉ DecimalDigit]"
before parsing LegacyOctalEscapeSequence. Otherwise, every standalone "\0"
pattern is parsed as an octal escape, which is disallowed in Unicode mode.

Further, LegacyOctalEscapeSequence should also be parsed while parsing
character classes.
This commit is contained in:
Timothy Flynn 2021-08-11 16:41:57 -04:00 committed by Linus Groh
parent 83ca8c7e38
commit 6a485f612f
2 changed files with 43 additions and 9 deletions

View file

@ -18,6 +18,7 @@ namespace regex {
// Upper bound of 1024 * 1024 (1,048,576); presumably caps the repetition
// count accepted for a quantifier -- its use is not visible in this hunk.
static constexpr size_t s_maximum_repetition_count = 1024 * 1024;
// The ASCII letters: uppercase A-Z followed by lowercase a-z.
static constexpr auto s_alphabetic_characters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"sv;
// The ASCII decimal digits 0-9. Passed to lookahead_any() after a "\0" has been
// consumed to check whether another decimal digit follows.
static constexpr auto s_decimal_characters = "0123456789"sv;
ALWAYS_INLINE bool Parser::set_error(Error error)
{
@ -1430,6 +1431,17 @@ bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_mini
return true;
}
// '\0'
if (try_skip("0")) {
if (!lookahead_any(s_decimal_characters)) {
match_length_minimum += 1;
stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)0 } });
return true;
}
back();
}
// LegacyOctalEscapeSequence
if (m_should_use_browser_extended_grammar) {
if (!unicode) {
@ -1441,13 +1453,6 @@ bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_mini
}
}
// '\0'
if (try_skip("0")) {
match_length_minimum += 1;
stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)0 } });
return true;
}
// HexEscape
if (try_skip("x")) {
if (auto hex_escape = read_digits(ReadDigitsInitialZeroState::Allow, true, 2, 2); hex_escape.has_value()) {
@ -1797,8 +1802,17 @@ bool ECMA262Parser::parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>&
}
// '\0'
if (try_skip("0"))
return { CharClassRangeElement { .code_point = 0, .is_character_class = false } };
if (try_skip("0")) {
if (!lookahead_any(s_decimal_characters))
return { CharClassRangeElement { .code_point = 0, .is_character_class = false } };
back();
}
// LegacyOctalEscapeSequence
if (m_should_use_browser_extended_grammar && !unicode) {
if (auto escape = parse_legacy_octal_escape(); escape.has_value())
return { CharClassRangeElement { .code_point = escape.value(), .is_character_class = false } };
}
// HexEscape
if (try_skip("x")) {