1
Fork 0
mirror of https://github.com/RGBCube/serenity synced 2025-05-31 22:48:11 +00:00

LibJS: Allow Unicode escape sequences in identifiers

For example, "property.br\u{64}wn" should resolve to "property.brown".

To support this behavior, this commit changes the Token class to hold
both the evaluated identifier name and a view into the original source
for the unevaluated name. There are some contexts in which identifiers
are not allowed to contain Unicode escape sequences; for example, export
statements of the form "export {} from foo.js" forbid escapes in the
identifier "from".

The test file is added to .prettierignore because prettier would otherwise
replace all escaped Unicode sequences with their unescaped values.
This commit is contained in:
Timothy Flynn 2021-08-18 16:34:25 -04:00 committed by Andreas Kling
parent c5b5c779ff
commit 1259dc3623
7 changed files with 163 additions and 54 deletions

View file

@ -8,6 +8,7 @@
#include "Lexer.h"
#include <AK/CharacterTypes.h>
#include <AK/Debug.h>
#include <AK/GenericLexer.h>
#include <AK/HashMap.h>
#include <AK/Utf8View.h>
#include <LibUnicode/CharacterTypes.h>
@ -350,6 +351,8 @@ u32 Lexer::current_code_point() const
if (m_position == 0)
return REPLACEMENT_CHARACTER;
Utf8View utf_8_view { m_source.substring_view(m_position - 1) };
if (utf_8_view.is_empty())
return REPLACEMENT_CHARACTER;
return *utf_8_view.begin();
}
@ -369,30 +372,60 @@ bool Lexer::is_whitespace() const
return false;
}
// NOTE(review): this span looks like a flattened diff hunk with the +/- markers stripped.
// The `bool Lexer::is_identifier_start() const` line and the statements that use
// is_unicode_character() / id_start_category / `return false;` appear to be the removed side;
// the is_unicode_escape() lines appear to be the added side. Confirm against the repository
// before treating this text as compilable code.
bool Lexer::is_identifier_start() const
// Tries to read an escaped code point from the source, starting at m_position - 1.
// On success, writes lexer.tell() into identifier_length (presumably the number of source
// characters the escape occupies -- TODO confirm) and returns the decoded code point.
// On failure, returns an empty Optional; identifier_length is not written on that path.
Optional<u32> Lexer::is_unicode_escape(size_t& identifier_length) const
{
if (!is_unicode_character())
return is_ascii_alpha(m_current_char) || m_current_char == '_' || m_current_char == '$';
auto code_point = current_code_point();
// A separate GenericLexer is run over the remaining source, so this const method does not
// advance the Lexer's own position.
GenericLexer lexer(source().substring_view(m_position - 1));
static auto id_start_category = Unicode::property_from_string("ID_Start"sv);
if (id_start_category.has_value())
return Unicode::code_point_has_property(code_point, *id_start_category);
return false;
if (auto code_point_or_error = lexer.consume_escaped_code_point(); !code_point_or_error.is_error()) {
identifier_length = lexer.tell();
return code_point_or_error.value();
}
return {};
}
// NOTE(review): this span looks like a flattened diff hunk with the +/- markers stripped.
// The `bool Lexer::is_identifier_middle() const` line and the statements down to `return true;`
// appear to be the removed side (note the second declaration of `code_point` below);
// the Optional<u32> version appears to be the added side. Confirm against the repository.
bool Lexer::is_identifier_middle() const
// Checks whether the current position can begin an identifier.
// Returns the (possibly unescaped) code point if so, otherwise an empty Optional.
// identifier_length is set to 1, or to the length reported by is_unicode_escape() when the
// current code point is a backslash that starts a valid escape.
Optional<u32> Lexer::is_identifier_start(size_t& identifier_length) const
{
if (!is_unicode_character())
return is_identifier_start() || is_ascii_digit(m_current_char);
auto code_point = current_code_point();
if (code_point == ZERO_WIDTH_NON_JOINER || code_point == ZERO_WIDTH_JOINER)
return true;
u32 code_point = current_code_point();
identifier_length = 1;
// A backslash is only acceptable here if it introduces a valid escape; the decoded code point
// then goes through the same checks as a literal one.
if (code_point == '\\') {
if (auto maybe_code_point = is_unicode_escape(identifier_length); maybe_code_point.has_value())
code_point = *maybe_code_point;
else
return {};
}
// ASCII letters, '_' and '$' are accepted without consulting the Unicode property data.
if (is_ascii_alpha(code_point) || code_point == '_' || code_point == '$')
return code_point;
// The property lookup is stored in a function-local static, so it is performed once.
static auto id_start_category = Unicode::property_from_string("ID_Start"sv);
if (id_start_category.has_value() && Unicode::code_point_has_property(code_point, *id_start_category))
return code_point;
return {};
}
// Checks whether the current position can continue an identifier.
// Returns the (possibly unescaped) code point if so, otherwise an empty Optional.
// identifier_length is set to 1, or to the length reported by is_unicode_escape() when the
// current code point is a backslash that starts a valid escape.
// NOTE(review): this span looks like a flattened diff hunk with the +/- markers stripped.
// The three lines `if (id_continue_category.has_value())` /
// `return Unicode::code_point_has_property(...)` / `return false;` appear to be the removed
// side. Confirm against the repository.
Optional<u32> Lexer::is_identifier_middle(size_t& identifier_length) const
{
u32 code_point = current_code_point();
identifier_length = 1;
// A backslash is only acceptable here if it introduces a valid escape; the decoded code point
// then goes through the same checks as a literal one.
if (code_point == '\\') {
if (auto maybe_code_point = is_unicode_escape(identifier_length); maybe_code_point.has_value())
code_point = *maybe_code_point;
else
return {};
}
// Accepted without consulting the Unicode property data: ASCII alphanumerics, '$', and the
// zero-width (non-)joiner code points.
// NOTE(review): unlike is_identifier_start(), '_' is not listed here, so it appears to be
// accepted only through the ID_Continue lookup below. If that lookup yields no value, '_'
// would be rejected mid-identifier -- confirm this is intended.
if (is_ascii_alphanumeric(code_point) || (code_point == '$') || (code_point == ZERO_WIDTH_NON_JOINER) || (code_point == ZERO_WIDTH_JOINER))
return code_point;
// The property lookup is stored in a function-local static, so it is performed once.
static auto id_continue_category = Unicode::property_from_string("ID_Continue"sv);
if (id_continue_category.has_value())
return Unicode::code_point_has_property(code_point, *id_continue_category);
return false;
if (id_continue_category.has_value() && Unicode::code_point_has_property(code_point, *id_continue_category))
return code_point;
return {};
}
bool Lexer::is_line_comment_start(bool line_has_token_yet) const
@ -494,6 +527,9 @@ Token Lexer::next()
// bunch of Invalid* tokens (bad numeric literals, unterminated comments etc.)
String token_message;
Optional<FlyString> identifier;
size_t identifier_length = 0;
if (m_current_token.type() == TokenType::RegexLiteral && !is_eof() && is_ascii_alpha(m_current_char) && !did_consume_whitespace_or_comments) {
token_type = TokenType::RegexFlags;
while (!is_eof() && is_ascii_alpha(m_current_char))
@ -537,19 +573,26 @@ Token Lexer::next()
else
token_type = TokenType::TemplateLiteralString;
}
} else if (is_identifier_start()) {
} else if (auto code_point = is_identifier_start(identifier_length); code_point.has_value()) {
// identifier or keyword
StringBuilder builder;
do {
consume();
} while (is_identifier_middle());
builder.append_code_point(*code_point);
for (size_t i = 0; i < identifier_length; ++i)
consume();
StringView value = m_source.substring_view(value_start - 1, m_position - value_start);
auto it = s_keywords.find(value.hash(), [&](auto& entry) { return entry.key == value; });
if (it == s_keywords.end()) {
code_point = is_identifier_middle(identifier_length);
} while (code_point.has_value());
identifier = builder.build();
if (!m_parsed_identifiers.contains_slow(*identifier))
m_parsed_identifiers.append(*identifier);
auto it = s_keywords.find(identifier->hash(), [&](auto& entry) { return entry.key == identifier; });
if (it == s_keywords.end())
token_type = TokenType::Identifier;
} else {
else
token_type = it->value;
}
} else if (is_numeric_literal_start()) {
token_type = TokenType::NumericLiteral;
bool is_invalid_numeric_literal = false;
@ -708,15 +751,28 @@ Token Lexer::next()
}
}
m_current_token = Token(
token_type,
token_message,
m_source.substring_view(trivia_start - 1, value_start - trivia_start),
m_source.substring_view(value_start - 1, m_position - value_start),
m_filename,
value_start_line_number,
value_start_column_number,
m_position);
if (identifier.has_value()) {
m_current_token = Token(
token_type,
token_message,
m_source.substring_view(trivia_start - 1, value_start - trivia_start),
m_source.substring_view(value_start - 1, m_position - value_start),
identifier.release_value(),
m_filename,
value_start_line_number,
value_start_column_number,
m_position);
} else {
m_current_token = Token(
token_type,
token_message,
m_source.substring_view(trivia_start - 1, value_start - trivia_start),
m_source.substring_view(value_start - 1, m_position - value_start),
m_filename,
value_start_line_number,
value_start_column_number,
m_position);
}
if constexpr (LEXER_DEBUG) {
dbgln("------------------------------");