1
Fork 0
mirror of https://github.com/RGBCube/serenity synced 2025-05-31 14:18:12 +00:00

LibRegex: Allow unknown escapes in non-unicode mode (for ECMA262)

This makes regexps like `/\x/` work as normal.
Partially deals with #4189.
This commit is contained in:
AnotherTest 2020-11-28 12:27:59 +03:30 committed by Andreas Kling
parent 801750b95a
commit e2fa1b40c4
4 changed files with 64 additions and 4 deletions

View file

@ -108,6 +108,20 @@ ALWAYS_INLINE bool Parser::try_skip(StringView str)
return true;
}
// Skips a single character of input and returns it, then advances the parser
// to the next token.
//
// If the current token's text is exactly one character long, that character is
// returned directly. Otherwise the lexer is moved back by the length of the
// current token's text and the result of `lexer.skip()` is returned instead
// (presumably the first character of that token -- confirm against
// Lexer::back()/Lexer::skip()).
//
// In both cases `current_token` is replaced with the lexer's next token before
// returning.
ALWAYS_INLINE char Parser::skip()
{
    auto const token_length = m_parser_state.current_token.value().length();

    char skipped;
    if (token_length != 1) {
        // Rewind over the whole token text and let the lexer hand back a
        // character itself.
        m_parser_state.lexer.back(token_length);
        skipped = m_parser_state.lexer.skip();
    } else {
        // Single-character token: its only character is the one being skipped.
        skipped = m_parser_state.current_token.value()[0];
    }

    // Re-prime the lookahead token from wherever the lexer now stands.
    m_parser_state.current_token = m_parser_state.lexer.next();
    return skipped;
}
ALWAYS_INLINE void Parser::reset()
{
m_parser_state.bytecode.clear();
@ -1017,6 +1031,16 @@ bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_mini
return true;
}
}
if (unicode) {
set_error(Error::InvalidPattern);
return false;
}
// Allow '\c' in non-unicode mode, just matches 'c'.
match_length_minimum += 1;
stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'c' } });
return true;
}
// '\0'
@ -1032,6 +1056,14 @@ bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_mini
match_length_minimum += 1;
stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)hex_escape.value() } });
return true;
} else if (!unicode) {
// '\x' is allowed in non-unicode mode, just matches 'x'.
match_length_minimum += 1;
stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'x' } });
return true;
} else {
set_error(Error::InvalidPattern);
return false;
}
}
@ -1088,6 +1120,14 @@ bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_mini
bool negate = false;
auto ch = parse_character_class_escape(negate);
if (!ch.has_value()) {
if (!unicode) {
// Allow all SourceCharacter's as escapes here.
auto token = consume();
match_length_minimum += 1;
stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)token.value()[0] } });
return true;
}
set_error(Error::InvalidCharacterClass);
return false;
}
@ -1203,8 +1243,15 @@ bool ECMA262Parser::parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>&
// HexEscape
if (try_skip("x")) {
if (auto hex_escape = read_digits(ReadDigitsInitialZeroState::Allow, ReadDigitFollowPolicy::Any, true, 2); hex_escape.has_value())
if (auto hex_escape = read_digits(ReadDigitsInitialZeroState::Allow, ReadDigitFollowPolicy::Any, true, 2); hex_escape.has_value()) {
return { { .code_point = hex_escape.value(), .is_character_class = false } };
} else if (!unicode) {
// '\x' is allowed in non-unicode mode, just matches 'x'.
return { { .code_point = 'x', .is_character_class = false } };
} else {
set_error(Error::InvalidPattern);
return {};
}
}
if (try_skip("u")) {
@ -1234,14 +1281,18 @@ bool ECMA262Parser::parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>&
return { { .character_class = CharClass::Space, .is_negated = true, .is_character_class = true } };
if (try_skip("W"))
return { { .character_class = CharClass::Word, .is_negated = true, .is_character_class = true } };
if (!unicode) {
// Any unrecognised escape is allowed in non-unicode mode.
return { { .code_point = (u32)skip(), .is_character_class = false } };
}
}
if (match(TokenType::RightBracket) || match(TokenType::HyphenMinus))
return {};
auto token = consume(TokenType::Char, Error::InvalidCharacterClass);
return { { .code_point = (u32)token.value()[0], .is_character_class = false } };
// Allow any (other) SourceCharacter.
return { { .code_point = (u32)skip(), .is_character_class = false } };
};
auto read_class_atom = [&]() -> Optional<CharClassRangeElement> {
if (match(TokenType::HyphenMinus)) {