1
Fork 0
mirror of https://github.com/RGBCube/serenity synced 2025-07-26 10:47:35 +00:00

LibJS: Fix some small remaining issues with parsing unicode escapes

Added a test to ensure the behavior stays the same.
We now throw a specific error on direct usage of an escaped keyword,
to make the problem clearer to the user.
This commit is contained in:
davidot 2021-08-21 11:27:20 +02:00 committed by Linus Groh
parent b012170d69
commit 7bcffd1b6a
6 changed files with 125 additions and 9 deletions

View file

@ -372,11 +372,14 @@ bool Lexer::is_whitespace() const
return false;
}
Optional<u32> Lexer::is_unicode_escape(size_t& identifier_length) const
// UnicodeEscapeSequence :: https://tc39.es/ecma262/#prod-UnicodeEscapeSequence
// u Hex4Digits
// u{ CodePoint }
Optional<u32> Lexer::is_identifier_unicode_escape(size_t& identifier_length) const
{
GenericLexer lexer(source().substring_view(m_position - 1));
if (auto code_point_or_error = lexer.consume_escaped_code_point(); !code_point_or_error.is_error()) {
if (auto code_point_or_error = lexer.consume_escaped_code_point(false); !code_point_or_error.is_error()) {
identifier_length = lexer.tell();
return code_point_or_error.value();
}
@ -384,13 +387,18 @@ Optional<u32> Lexer::is_unicode_escape(size_t& identifier_length) const
return {};
}
// IdentifierStart :: https://tc39.es/ecma262/#prod-IdentifierStart
// UnicodeIDStart
// $
// _
// \ UnicodeEscapeSequence
Optional<u32> Lexer::is_identifier_start(size_t& identifier_length) const
{
u32 code_point = current_code_point();
identifier_length = 1;
if (code_point == '\\') {
if (auto maybe_code_point = is_unicode_escape(identifier_length); maybe_code_point.has_value())
if (auto maybe_code_point = is_identifier_unicode_escape(identifier_length); maybe_code_point.has_value())
code_point = *maybe_code_point;
else
return {};
@ -406,13 +414,19 @@ Optional<u32> Lexer::is_identifier_start(size_t& identifier_length) const
return {};
}
// IdentifierPart :: https://tc39.es/ecma262/#prod-IdentifierPart
// UnicodeIDContinue
// $
// \ UnicodeEscapeSequence
// <ZWNJ>
// <ZWJ>
Optional<u32> Lexer::is_identifier_middle(size_t& identifier_length) const
{
u32 code_point = current_code_point();
identifier_length = 1;
if (code_point == '\\') {
if (auto maybe_code_point = is_unicode_escape(identifier_length); maybe_code_point.has_value())
if (auto maybe_code_point = is_identifier_unicode_escape(identifier_length); maybe_code_point.has_value())
code_point = *maybe_code_point;
else
return {};
@ -574,6 +588,7 @@ Token Lexer::next()
token_type = TokenType::TemplateLiteralString;
}
} else if (auto code_point = is_identifier_start(identifier_length); code_point.has_value()) {
bool has_escaped_character = false;
// identifier or keyword
StringBuilder builder;
do {
@ -581,6 +596,8 @@ Token Lexer::next()
for (size_t i = 0; i < identifier_length; ++i)
consume();
has_escaped_character |= identifier_length > 1;
code_point = is_identifier_middle(identifier_length);
} while (code_point.has_value());
@ -592,7 +609,7 @@ Token Lexer::next()
if (it == s_keywords.end())
token_type = TokenType::Identifier;
else
token_type = it->value;
token_type = has_escaped_character ? TokenType::EscapedKeyword : it->value;
} else if (is_numeric_literal_start()) {
token_type = TokenType::NumericLiteral;
bool is_invalid_numeric_literal = false;