mirror of
https://github.com/RGBCube/serenity
synced 2025-07-25 07:17:35 +00:00
LibRegex: Use GenericLexer to consume escaped code points
This commit is contained in:
parent
5ff9596678
commit
6131c0485e
2 changed files with 29 additions and 61 deletions
|
@ -92,6 +92,29 @@ ALWAYS_INLINE bool Parser::consume(String const& str)
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Consumes the code point of a '\u' escape by delegating to the lexer's
// consume_escaped_code_point(). Yields the code point for 'u' in non-Unicode
// mode when a '{' token follows or when the lexer reports an error; in Unicode
// mode a lexer error sets Error::InvalidPattern and yields an empty Optional.
ALWAYS_INLINE Optional<u32> Parser::consume_escaped_code_point(bool unicode)
{
    constexpr auto literal_u = static_cast<u32>('u');

    // In non-Unicode mode, a following '{' should be parsed as a repetition symbol (repeating the 'u').
    if (match(TokenType::LeftCurly) && !unicode)
        return literal_u;

    // Go back to just before '\u' (+1 char when not done, because we will have consumed an extra character).
    auto const rewind_by = 2 + !done();
    m_parser_state.lexer.retreat(rewind_by);

    auto escape_result = m_parser_state.lexer.consume_escaped_code_point(unicode);
    if (escape_result.is_error()) {
        // '\u' is allowed in non-unicode mode, just matches 'u'.
        if (!unicode)
            return literal_u;

        set_error(Error::InvalidPattern);
        return {};
    }

    // Refresh the current token from the lexer now that the escape has been consumed.
    m_parser_state.current_token = m_parser_state.lexer.next();
    return escape_result.value();
}
|
||||||
|
|
||||||
ALWAYS_INLINE bool Parser::try_skip(StringView str)
|
ALWAYS_INLINE bool Parser::try_skip(StringView str)
|
||||||
{
|
{
|
||||||
if (str.starts_with(m_parser_state.current_token.value()))
|
if (str.starts_with(m_parser_state.current_token.value()))
|
||||||
|
@ -1489,64 +1512,13 @@ bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_mini
|
||||||
}
|
}
|
||||||
|
|
||||||
if (try_skip("u")) {
|
if (try_skip("u")) {
|
||||||
if (match(TokenType::LeftCurly)) {
|
if (auto code_point = consume_escaped_code_point(unicode); code_point.has_value()) {
|
||||||
if (!unicode) {
|
|
||||||
// In non-Unicode mode, this should be parsed as a repetition symbol (repeating the 'u').
|
|
||||||
match_length_minimum += 1;
|
|
||||||
stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'u' } });
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
consume();
|
|
||||||
|
|
||||||
auto code_point = read_digits(ReadDigitsInitialZeroState::Allow, true, 6);
|
|
||||||
if (code_point.has_value() && is_unicode(*code_point) && match(TokenType::RightCurly)) {
|
|
||||||
consume();
|
|
||||||
match_length_minimum += 1;
|
|
||||||
stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)code_point.value() } });
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
set_error(Error::InvalidPattern);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (auto code_point = read_digits(ReadDigitsInitialZeroState::Allow, true, 4, 4); code_point.has_value()) {
|
|
||||||
// In Unicode mode, we need to combine surrogate pairs into a single code point. But we also need to be
|
|
||||||
// rather forgiving if the surrogate pairs are invalid. So if a second code unit follows this code unit,
|
|
||||||
// but doesn't form a valid surrogate pair, insert bytecode for both code units individually.
|
|
||||||
Optional<u32> low_surrogate;
|
|
||||||
if (unicode && Utf16View::is_high_surrogate(*code_point) && try_skip("\\u")) {
|
|
||||||
low_surrogate = read_digits(ReadDigitsInitialZeroState::Allow, true, 4);
|
|
||||||
if (!low_surrogate.has_value()) {
|
|
||||||
set_error(Error::InvalidPattern);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (Utf16View::is_low_surrogate(*low_surrogate)) {
|
|
||||||
*code_point = Utf16View::decode_surrogate_pair(*code_point, *low_surrogate);
|
|
||||||
low_surrogate.clear();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
match_length_minimum += 1;
|
match_length_minimum += 1;
|
||||||
stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)code_point.value() } });
|
stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)code_point.value() } });
|
||||||
|
|
||||||
if (low_surrogate.has_value()) {
|
|
||||||
match_length_minimum += 1;
|
|
||||||
stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)low_surrogate.value() } });
|
|
||||||
}
|
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
} else if (!unicode) {
|
|
||||||
// '\u' is allowed in non-unicode mode, just matches 'u'.
|
|
||||||
match_length_minimum += 1;
|
|
||||||
stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'u' } });
|
|
||||||
return true;
|
|
||||||
} else {
|
|
||||||
set_error(Error::InvalidPattern);
|
|
||||||
return false;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
// IdentityEscape
|
// IdentityEscape
|
||||||
|
@ -1847,16 +1819,11 @@ bool ECMA262Parser::parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>&
|
||||||
}
|
}
|
||||||
|
|
||||||
if (try_skip("u")) {
|
if (try_skip("u")) {
|
||||||
if (auto code_point = read_digits(ReadDigitsInitialZeroState::Allow, true, 4, 4); code_point.has_value()) {
|
if (auto code_point = consume_escaped_code_point(unicode); code_point.has_value()) {
|
||||||
// FIXME: While code point ranges are supported, code point matches as "Char" are not!
|
// FIXME: While code point ranges are supported, code point matches as "Char" are not!
|
||||||
return { CharClassRangeElement { .code_point = code_point.value(), .is_character_class = false } };
|
return { CharClassRangeElement { .code_point = code_point.value(), .is_character_class = false } };
|
||||||
} else if (!unicode) {
|
|
||||||
// '\u' is allowed in non-unicode mode, just matches 'u'.
|
|
||||||
return { CharClassRangeElement { .code_point = 'u', .is_character_class = false } };
|
|
||||||
} else {
|
|
||||||
set_error(Error::InvalidPattern);
|
|
||||||
return {};
|
|
||||||
}
|
}
|
||||||
|
return {};
|
||||||
}
|
}
|
||||||
|
|
||||||
// IdentityEscape
|
// IdentityEscape
|
||||||
|
|
|
@ -80,6 +80,7 @@ protected:
|
||||||
ALWAYS_INLINE Token consume();
|
ALWAYS_INLINE Token consume();
|
||||||
ALWAYS_INLINE Token consume(TokenType type, Error error);
|
ALWAYS_INLINE Token consume(TokenType type, Error error);
|
||||||
ALWAYS_INLINE bool consume(String const&);
|
ALWAYS_INLINE bool consume(String const&);
|
||||||
|
ALWAYS_INLINE Optional<u32> consume_escaped_code_point(bool unicode);
|
||||||
ALWAYS_INLINE bool try_skip(StringView);
|
ALWAYS_INLINE bool try_skip(StringView);
|
||||||
ALWAYS_INLINE bool lookahead_any(StringView);
|
ALWAYS_INLINE bool lookahead_any(StringView);
|
||||||
ALWAYS_INLINE char skip();
|
ALWAYS_INLINE char skip();
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue