From 6a485f612fbda354c8c67e9ccc3c119356687354 Mon Sep 17 00:00:00 2001 From: Timothy Flynn Date: Wed, 11 Aug 2021 16:41:57 -0400 Subject: [PATCH] LibRegex: Implement legacy octal escape parsing closer to the spec MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The grammar for the ECMA-262 CharacterEscape is: CharacterEscape[U, N] :: ControlEscape c ControlLetter 0 [lookahead ∉ DecimalDigit] HexEscapeSequence RegExpUnicodeEscapeSequence[?U] [~U]LegacyOctalEscapeSequence IdentityEscape[?U, ?N] It's important to parse the standalone "\0 [lookahead ∉ DecimalDigit]" before parsing LegacyOctalEscapeSequence. Otherwise, all standalone "\0" patterns are parsed as octal, which are disallowed in Unicode mode. Further, LegacyOctalEscapeSequence should also be parsed while parsing character classes. --- Tests/LibRegex/Regex.cpp | 20 +++++++++++++ Userland/Libraries/LibRegex/RegexParser.cpp | 32 +++++++++++++++------ 2 files changed, 43 insertions(+), 9 deletions(-) diff --git a/Tests/LibRegex/Regex.cpp b/Tests/LibRegex/Regex.cpp index 36ac6f1498..2e8cce5c30 100644 --- a/Tests/LibRegex/Regex.cpp +++ b/Tests/LibRegex/Regex.cpp @@ -22,6 +22,12 @@ static PosixOptions match_test_api_options(const PosixOptions options) return options; } +template<typename... Flags> +static constexpr ECMAScriptFlags combine_flags(Flags&&... 
flags) requires((IsSame<Flags, ECMAScriptFlags> && ...)) +{ + return static_cast<ECMAScriptFlags>((static_cast<regex::FlagsUnderlyingType>(flags) | ...)); +} + TEST_CASE(regex_options_ecmascript) { ECMAScriptOptions eo; @@ -543,6 +549,14 @@ TEST_CASE(ECMA262_parse) { "\\A"sv, regex::Error::InvalidCharacterClass, ECMAScriptFlags::Unicode }, { "[\\A]"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended }, { "[\\A]"sv, regex::Error::InvalidPattern, ECMAScriptFlags::Unicode }, + { "\\0"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended }, + { "\\0"sv, regex::Error::NoError, combine_flags(ECMAScriptFlags::Unicode, ECMAScriptFlags::BrowserExtended) }, + { "\\00"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended }, + { "\\00"sv, regex::Error::InvalidCharacterClass, combine_flags(ECMAScriptFlags::Unicode, ECMAScriptFlags::BrowserExtended) }, + { "[\\0]"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended }, + { "[\\0]"sv, regex::Error::NoError, combine_flags(ECMAScriptFlags::Unicode, ECMAScriptFlags::BrowserExtended) }, + { "[\\00]"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended }, + { "[\\00]"sv, regex::Error::InvalidPattern, combine_flags(ECMAScriptFlags::Unicode, ECMAScriptFlags::BrowserExtended) }, }; for (auto& test : tests) { @@ -606,6 +620,12 @@ TEST_CASE(ECMA262_match) "return /xx/"sv, true, ECMAScriptFlags::BrowserExtended }, // #5517, appears to be matching JS expressions that involve regular expressions... 
{ "a{2,}"sv, "aaaa"sv }, // #5518 + { "\\0"sv, "\0"sv, true, ECMAScriptFlags::BrowserExtended }, + { "\\0"sv, "\0"sv, true, combine_flags(ECMAScriptFlags::Unicode, ECMAScriptFlags::BrowserExtended) }, + { "\\01"sv, "\1"sv, true, ECMAScriptFlags::BrowserExtended }, + { "[\\0]"sv, "\0"sv, true, ECMAScriptFlags::BrowserExtended }, + { "[\\0]"sv, "\0"sv, true, combine_flags(ECMAScriptFlags::Unicode, ECMAScriptFlags::BrowserExtended) }, + { "[\\01]"sv, "\1"sv, true, ECMAScriptFlags::BrowserExtended }, }; // clang-format on diff --git a/Userland/Libraries/LibRegex/RegexParser.cpp b/Userland/Libraries/LibRegex/RegexParser.cpp index a19faa02f0..a0393a6ac0 100644 --- a/Userland/Libraries/LibRegex/RegexParser.cpp +++ b/Userland/Libraries/LibRegex/RegexParser.cpp @@ -18,6 +18,7 @@ namespace regex { static constexpr size_t s_maximum_repetition_count = 1024 * 1024; static constexpr auto s_alphabetic_characters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"sv; +static constexpr auto s_decimal_characters = "0123456789"sv; ALWAYS_INLINE bool Parser::set_error(Error error) { @@ -1430,6 +1431,17 @@ bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_mini return true; } + // '\0' + if (try_skip("0")) { + if (!lookahead_any(s_decimal_characters)) { + match_length_minimum += 1; + stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)0 } }); + return true; + } + + back(); + } + // LegacyOctalEscapeSequence if (m_should_use_browser_extended_grammar) { if (!unicode) { @@ -1441,13 +1453,6 @@ bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_mini } } - // '\0' - if (try_skip("0")) { - match_length_minimum += 1; - stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)0 } }); - return true; - } - // HexEscape if (try_skip("x")) { if (auto hex_escape = read_digits(ReadDigitsInitialZeroState::Allow, true, 2, 2); hex_escape.has_value()) { @@ -1797,8 +1802,17 @@ bool 
ECMA262Parser::parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>& } // '\0' - if (try_skip("0")) - return { CharClassRangeElement { .code_point = 0, .is_character_class = false } }; + if (try_skip("0")) { + if (!lookahead_any(s_decimal_characters)) + return { CharClassRangeElement { .code_point = 0, .is_character_class = false } }; + back(); + } + + // LegacyOctalEscapeSequence + if (m_should_use_browser_extended_grammar && !unicode) { + if (auto escape = parse_legacy_octal_escape(); escape.has_value()) + return { CharClassRangeElement { .code_point = escape.value(), .is_character_class = false } }; + } // HexEscape if (try_skip("x")) {