1
Fork 0
mirror of https://github.com/RGBCube/serenity synced 2025-07-27 03:17:35 +00:00

LibRegex: Implement legacy octal escape parsing closer to the spec

The grammar for the ECMA-262 CharacterEscape is:

  CharacterEscape[U, N] ::
    ControlEscape
    c ControlLetter
    0 [lookahead ∉ DecimalDigit]
    HexEscapeSequence
    RegExpUnicodeEscapeSequence[?U]
    [~U]LegacyOctalEscapeSequence
    IdentityEscape[?U, ?N]

It's important to parse the standalone "\0 [lookahead ∉ DecimalDigit]"
before parsing LegacyOctalEscapeSequence. Otherwise, every standalone "\0"
pattern is parsed as a legacy octal escape, which is disallowed in Unicode mode.

Further, LegacyOctalEscapeSequence should also be parsed while parsing
character classes.
This commit is contained in:
Timothy Flynn 2021-08-11 16:41:57 -04:00 committed by Linus Groh
parent 83ca8c7e38
commit 6a485f612f
2 changed files with 43 additions and 9 deletions

View file

@ -22,6 +22,12 @@ static PosixOptions match_test_api_options(const PosixOptions options)
return options;
}
// Combines one or more ECMAScriptFlags values into a single ECMAScriptFlags
// by OR-ing their underlying integer representations together.
//
// Every argument must be exactly of type ECMAScriptFlags (enforced by the
// requires-clause). The pack is taken by value rather than by forwarding
// reference: with `Flags&&`, an lvalue argument would deduce `Flags` as
// `ECMAScriptFlags&`, which fails the IsSame constraint and would make the
// helper usable with rvalues only.
template<typename... Flags>
static constexpr ECMAScriptFlags combine_flags(Flags... flags) requires((IsSame<Flags, ECMAScriptFlags> && ...))
{
    using Underlying = regex::FlagsUnderlyingType;
    return static_cast<ECMAScriptFlags>((static_cast<Underlying>(flags) | ...));
}
TEST_CASE(regex_options_ecmascript)
{
ECMAScriptOptions eo;
@ -543,6 +549,14 @@ TEST_CASE(ECMA262_parse)
{ "\\A"sv, regex::Error::InvalidCharacterClass, ECMAScriptFlags::Unicode },
{ "[\\A]"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended },
{ "[\\A]"sv, regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
{ "\\0"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended },
{ "\\0"sv, regex::Error::NoError, combine_flags(ECMAScriptFlags::Unicode, ECMAScriptFlags::BrowserExtended) },
{ "\\00"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended },
{ "\\00"sv, regex::Error::InvalidCharacterClass, combine_flags(ECMAScriptFlags::Unicode, ECMAScriptFlags::BrowserExtended) },
{ "[\\0]"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended },
{ "[\\0]"sv, regex::Error::NoError, combine_flags(ECMAScriptFlags::Unicode, ECMAScriptFlags::BrowserExtended) },
{ "[\\00]"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended },
{ "[\\00]"sv, regex::Error::InvalidPattern, combine_flags(ECMAScriptFlags::Unicode, ECMAScriptFlags::BrowserExtended) },
};
for (auto& test : tests) {
@ -606,6 +620,12 @@ TEST_CASE(ECMA262_match)
"return /xx/"sv, true, ECMAScriptFlags::BrowserExtended
}, // #5517, appears to be matching JS expressions that involve regular expressions...
{ "a{2,}"sv, "aaaa"sv }, // #5518
{ "\\0"sv, "\0"sv, true, ECMAScriptFlags::BrowserExtended },
{ "\\0"sv, "\0"sv, true, combine_flags(ECMAScriptFlags::Unicode, ECMAScriptFlags::BrowserExtended) },
{ "\\01"sv, "\1"sv, true, ECMAScriptFlags::BrowserExtended },
{ "[\\0]"sv, "\0"sv, true, ECMAScriptFlags::BrowserExtended },
{ "[\\0]"sv, "\0"sv, true, combine_flags(ECMAScriptFlags::Unicode, ECMAScriptFlags::BrowserExtended) },
{ "[\\01]"sv, "\1"sv, true, ECMAScriptFlags::BrowserExtended },
};
// clang-format on