From 77349149092dea81849eac1fee1799df8b3919a6 Mon Sep 17 00:00:00 2001 From: Ali Mohammad Pur Date: Wed, 20 Jul 2022 23:19:43 +0430 Subject: [PATCH] LibRegex: Refactor parsing 'CharacterEscape' out of 'AtomEscape' The ECMA262 spec has this as a separate production, and we need it to be split up for a future commit. --- Userland/Libraries/LibRegex/RegexParser.cpp | 257 ++++++++++---------- Userland/Libraries/LibRegex/RegexParser.h | 2 + 2 files changed, 136 insertions(+), 123 deletions(-) diff --git a/Userland/Libraries/LibRegex/RegexParser.cpp b/Userland/Libraries/LibRegex/RegexParser.cpp index 442a662528..2503f38558 100644 --- a/Userland/Libraries/LibRegex/RegexParser.cpp +++ b/Userland/Libraries/LibRegex/RegexParser.cpp @@ -1427,6 +1427,137 @@ bool ECMA262Parser::parse_invalid_braced_quantifier() } bool ECMA262Parser::parse_character_escape(Vector& compares, size_t& match_length_minimum, ParseFlags flags) +{ + // CharacterEscape > ControlEscape + if (try_skip("f"sv)) { + match_length_minimum += 1; + compares.append({ CharacterCompareType::Char, (ByteCodeValueType)'\f' }); + return true; + } + + if (try_skip("n"sv)) { + match_length_minimum += 1; + compares.append({ CharacterCompareType::Char, (ByteCodeValueType)'\n' }); + return true; + } + + if (try_skip("r"sv)) { + match_length_minimum += 1; + compares.append({ CharacterCompareType::Char, (ByteCodeValueType)'\r' }); + return true; + } + + if (try_skip("t"sv)) { + match_length_minimum += 1; + compares.append({ CharacterCompareType::Char, (ByteCodeValueType)'\t' }); + return true; + } + + if (try_skip("v"sv)) { + match_length_minimum += 1; + compares.append({ CharacterCompareType::Char, (ByteCodeValueType)'\v' }); + return true; + } + + // CharacterEscape > ControlLetter + if (try_skip("c"sv)) { + for (auto c : s_alphabetic_characters) { + if (try_skip({ &c, 1 })) { + match_length_minimum += 1; + compares.append({ CharacterCompareType::Char, (ByteCodeValueType)(c % 32) }); + return true; + } + } + + if (flags.unicode) { + set_error(Error::InvalidPattern); + return false; + } + + if (m_should_use_browser_extended_grammar) { + back(1 + (done() ? 0 : 1)); + compares.append({ CharacterCompareType::Char, (ByteCodeValueType)'\\' }); + match_length_minimum += 1; + return true; + } + + // Allow '\c' in non-unicode mode, just matches 'c'. + match_length_minimum += 1; + compares.append({ CharacterCompareType::Char, (ByteCodeValueType)'c' }); + return true; + } + + // '\0' + if (try_skip("0"sv)) { + if (!lookahead_any(s_decimal_characters)) { + match_length_minimum += 1; + compares.append({ CharacterCompareType::Char, (ByteCodeValueType)0 }); + return true; + } + + back(); + } + + // LegacyOctalEscapeSequence + if (m_should_use_browser_extended_grammar) { + if (!flags.unicode) { + if (auto escape = parse_legacy_octal_escape(); escape.has_value()) { + compares.append({ CharacterCompareType::Char, (ByteCodeValueType)escape.value() }); + match_length_minimum += 1; + return true; + } + } + } + + // HexEscape + if (try_skip("x"sv)) { + if (auto hex_escape = read_digits(ReadDigitsInitialZeroState::Allow, true, 2, 2); hex_escape.has_value()) { + match_length_minimum += 1; + compares.append({ CharacterCompareType::Char, (ByteCodeValueType)hex_escape.value() }); + return true; + } + if (!flags.unicode) { + // '\x' is allowed in non-unicode mode, just matches 'x'. + match_length_minimum += 1; + compares.append({ CharacterCompareType::Char, (ByteCodeValueType)'x' }); + return true; + } + + set_error(Error::InvalidPattern); + return false; + } + + if (try_skip("u"sv)) { + if (auto code_point = consume_escaped_code_point(flags.unicode); code_point.has_value()) { + match_length_minimum += 1; + compares.append({ CharacterCompareType::Char, (ByteCodeValueType)code_point.value() }); + return true; + } + + return false; + } + + // IdentityEscape + for (auto ch : identity_escape_characters(flags.unicode, m_should_use_browser_extended_grammar)) { + if (try_skip({ &ch, 1 })) { + match_length_minimum += 1; + compares.append({ CharacterCompareType::Char, (ByteCodeValueType)ch }); + return true; + } + } + + if (flags.unicode) { + if (try_skip("/"sv)) { + match_length_minimum += 1; + compares.append({ CharacterCompareType::Char, (ByteCodeValueType)'/' }); + return true; + } + } + + return false; +} + +bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_minimum, ParseFlags flags) { if (auto escape_str = read_digits_as_string(ReadDigitsInitialZeroState::Disallow); !escape_str.is_empty()) { if (auto escape = escape_str.to_uint(); escape.has_value()) { @@ -1453,132 +1584,12 @@ bool ECMA262Parser::parse_character_escape(Vector& comp back(escape_str.length()); } - // CharacterEscape > ControlEscape - if (try_skip("f"sv)) { - match_length_minimum += 1; - stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'\f' } }); + Vector escape_compares; + if (parse_character_escape(escape_compares, match_length_minimum, flags)) { + stack.insert_bytecode_compare_values(move(escape_compares)); return true; } - if (try_skip("n"sv)) { - match_length_minimum += 1; - stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'\n' } }); - return true; - } - - if (try_skip("r"sv)) { - match_length_minimum += 1; - stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'\r' } }); - return true; - } - - if (try_skip("t"sv)) { - match_length_minimum += 1; - stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'\t' } }); - return true; - } - - if (try_skip("v"sv)) { - match_length_minimum += 1; - stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'\v' } }); - return true; - } - - // CharacterEscape > ControlLetter - if (try_skip("c"sv)) { - for (auto c : s_alphabetic_characters) { - if (try_skip({ &c, 1 })) { - match_length_minimum += 1; - stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)(c % 32) } }); - return true; - } - } - - if (flags.unicode) { - set_error(Error::InvalidPattern); - return false; - } - - if (m_should_use_browser_extended_grammar) { - back(1 + !done()); - stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'\\' } }); - match_length_minimum += 1; - return true; - } - - // Allow '\c' in non-unicode mode, just matches 'c'. - match_length_minimum += 1; - stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'c' } }); - return true; - } - - // '\0' - if (try_skip("0"sv)) { - if (!lookahead_any(s_decimal_characters)) { - match_length_minimum += 1; - stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)0 } }); - return true; - } - - back(); - } - - // LegacyOctalEscapeSequence - if (m_should_use_browser_extended_grammar) { - if (!flags.unicode) { - if (auto escape = parse_legacy_octal_escape(); escape.has_value()) { - stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)escape.value() } }); - match_length_minimum += 1; - return true; - } - } - } - - // HexEscape - if (try_skip("x"sv)) { - if (auto hex_escape = read_digits(ReadDigitsInitialZeroState::Allow, true, 2, 2); hex_escape.has_value()) { - match_length_minimum += 1; - stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)hex_escape.value() } }); - return true; - } - if (!flags.unicode) { - // '\x' is allowed in non-unicode mode, just matches 'x'. - match_length_minimum += 1; - stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'x' } }); - return true; - } - - set_error(Error::InvalidPattern); - return false; - } - - if (try_skip("u"sv)) { - if (auto code_point = consume_escaped_code_point(flags.unicode); code_point.has_value()) { - match_length_minimum += 1; - stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)code_point.value() } }); - return true; - } - - return false; - } - - // IdentityEscape - for (auto ch : identity_escape_characters(flags.unicode, m_should_use_browser_extended_grammar)) { - if (try_skip({ &ch, 1 })) { - match_length_minimum += 1; - stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)ch } }); - return true; - } - } - - if (flags.unicode) { - if (try_skip("/"sv)) { - match_length_minimum += 1; - stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'/' } }); - return true; - } - } - if (flags.named && try_skip("k"sv)) { auto name = read_capture_group_specifier(true); if (name.is_empty()) { diff --git a/Userland/Libraries/LibRegex/RegexParser.h b/Userland/Libraries/LibRegex/RegexParser.h index 7f4e8104a2..3dbe5438d3 100644 --- a/Userland/Libraries/LibRegex/RegexParser.h +++ b/Userland/Libraries/LibRegex/RegexParser.h @@ -255,6 +255,8 @@ private: bool parse_nonempty_class_ranges(Vector&, ParseFlags); bool parse_unicode_property_escape(PropertyEscape& property, bool& negated); + bool parse_character_escape(Vector&, size_t&, ParseFlags); + // Used only by B.1.4, Regular Expression Patterns (Extended for use in browsers) bool parse_quantifiable_assertion(ByteCode&, size_t&, ParseFlags); bool parse_extended_atom(ByteCode&, size_t&, ParseFlags);