From cdec23a68c581f7ce0ab8ef4992bb2efb3c5084e Mon Sep 17 00:00:00 2001 From: Ali Mohammad Pur Date: Mon, 12 Jun 2023 20:00:19 +0330 Subject: [PATCH] LibRegex: Treat \ as unescaped in POSIX BRE/ERE This is undefined according to the spec, but glibc ignores the backslash and some applications seem to prefer this behaviour (e.g. sed). --- Userland/Libraries/LibRegex/RegexParser.cpp | 27 +++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/Userland/Libraries/LibRegex/RegexParser.cpp b/Userland/Libraries/LibRegex/RegexParser.cpp index 24d378e8a9..7dd1530d96 100644 --- a/Userland/Libraries/LibRegex/RegexParser.cpp +++ b/Userland/Libraries/LibRegex/RegexParser.cpp @@ -205,7 +205,7 @@ ALWAYS_INLINE bool Parser::match_ordinary_characters() // NOTE: This method must not be called during bracket and repetition parsing! // FIXME: Add assertion for that? auto type = m_parser_state.current_token.type(); - return (type == TokenType::Char + return ((type == TokenType::Char && m_parser_state.current_token.value() != "\\"sv) // NOTE: Backslash will only be matched as 'char' if it does not form a valid escape. || type == TokenType::Comma || type == TokenType::Slash || type == TokenType::EqualSign @@ -529,8 +529,23 @@ bool PosixBasicParser::parse_one_char_or_collation_element(ByteCode& bytecode, s back(2); } + if (match(TokenType::Char)) { + auto ch = consume().value()[0]; + if (ch == '\\') { + if (m_parser_state.regex_options.has_flag_set(AllFlags::Extra)) + return set_error(Error::InvalidPattern); + + // This was \, the spec does not define any behaviour for this but glibc regex ignores it - and so do we. + return true; + } + + bytecode.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)ch } }); + match_length_minimum += 1; + return true; + } + // None of these are special in BRE. - if (match(TokenType::Char) || match(TokenType::Questionmark) || match(TokenType::RightParen) || match(TokenType::HyphenMinus) + if (match(TokenType::Questionmark) || match(TokenType::RightParen) || match(TokenType::HyphenMinus) || match(TokenType::Circumflex) || match(TokenType::RightCurly) || match(TokenType::Comma) || match(TokenType::Colon) || match(TokenType::Dollar) || match(TokenType::EqualSign) || match(TokenType::LeftCurly) || match(TokenType::LeftParen) || match(TokenType::Pipe) || match(TokenType::Slash) || match(TokenType::RightBracket) || match(TokenType::RightParen)) { @@ -721,6 +736,14 @@ ALWAYS_INLINE bool PosixExtendedParser::parse_sub_expression(ByteCode& stack, si break; } + if (m_parser_state.current_token.value() == "\\"sv) { + if (m_parser_state.regex_options.has_flag_set(AllFlags::Extra)) + return set_error(Error::InvalidPattern); + + consume(); + continue; + } + if (match_repetition_symbol()) return set_error(Error::InvalidRepetitionMarker);