mirror of
https://github.com/RGBCube/serenity
synced 2025-05-31 22:48:11 +00:00
LibJS: Allow Unicode escape sequences in identifiers
For example, "property.br\u{64}wn" should resolve to "property.brown". To support this behavior, this commit changes the Token class to hold both the evaluated identifier name and a view into the original source for the unevaluated name. There are some contexts in which identifiers are not allowed to contain Unicode escape sequences; for example, export statements of the form `export {} from "foo.js"` forbid escapes in the identifier "from". The test file is added to .prettierignore because prettier would otherwise replace all escaped Unicode sequences with their unescaped values.
This commit is contained in:
parent
c5b5c779ff
commit
1259dc3623
7 changed files with 163 additions and 54 deletions
|
@ -8,6 +8,7 @@
|
|||
#include "Lexer.h"
|
||||
#include <AK/CharacterTypes.h>
|
||||
#include <AK/Debug.h>
|
||||
#include <AK/GenericLexer.h>
|
||||
#include <AK/HashMap.h>
|
||||
#include <AK/Utf8View.h>
|
||||
#include <LibUnicode/CharacterTypes.h>
|
||||
|
@ -350,6 +351,8 @@ u32 Lexer::current_code_point() const
|
|||
if (m_position == 0)
|
||||
return REPLACEMENT_CHARACTER;
|
||||
Utf8View utf_8_view { m_source.substring_view(m_position - 1) };
|
||||
if (utf_8_view.is_empty())
|
||||
return REPLACEMENT_CHARACTER;
|
||||
return *utf_8_view.begin();
|
||||
}
|
||||
|
||||
|
@ -369,30 +372,60 @@ bool Lexer::is_whitespace() const
|
|||
return false;
|
||||
}
|
||||
|
||||
bool Lexer::is_identifier_start() const
|
||||
Optional<u32> Lexer::is_unicode_escape(size_t& identifier_length) const
|
||||
{
|
||||
if (!is_unicode_character())
|
||||
return is_ascii_alpha(m_current_char) || m_current_char == '_' || m_current_char == '$';
|
||||
auto code_point = current_code_point();
|
||||
GenericLexer lexer(source().substring_view(m_position - 1));
|
||||
|
||||
static auto id_start_category = Unicode::property_from_string("ID_Start"sv);
|
||||
if (id_start_category.has_value())
|
||||
return Unicode::code_point_has_property(code_point, *id_start_category);
|
||||
return false;
|
||||
if (auto code_point_or_error = lexer.consume_escaped_code_point(); !code_point_or_error.is_error()) {
|
||||
identifier_length = lexer.tell();
|
||||
return code_point_or_error.value();
|
||||
}
|
||||
|
||||
return {};
|
||||
}
|
||||
|
||||
bool Lexer::is_identifier_middle() const
|
||||
Optional<u32> Lexer::is_identifier_start(size_t& identifier_length) const
|
||||
{
|
||||
if (!is_unicode_character())
|
||||
return is_identifier_start() || is_ascii_digit(m_current_char);
|
||||
auto code_point = current_code_point();
|
||||
if (code_point == ZERO_WIDTH_NON_JOINER || code_point == ZERO_WIDTH_JOINER)
|
||||
return true;
|
||||
u32 code_point = current_code_point();
|
||||
identifier_length = 1;
|
||||
|
||||
if (code_point == '\\') {
|
||||
if (auto maybe_code_point = is_unicode_escape(identifier_length); maybe_code_point.has_value())
|
||||
code_point = *maybe_code_point;
|
||||
else
|
||||
return {};
|
||||
}
|
||||
|
||||
if (is_ascii_alpha(code_point) || code_point == '_' || code_point == '$')
|
||||
return code_point;
|
||||
|
||||
static auto id_start_category = Unicode::property_from_string("ID_Start"sv);
|
||||
if (id_start_category.has_value() && Unicode::code_point_has_property(code_point, *id_start_category))
|
||||
return code_point;
|
||||
|
||||
return {};
|
||||
}
|
||||
|
||||
Optional<u32> Lexer::is_identifier_middle(size_t& identifier_length) const
|
||||
{
|
||||
u32 code_point = current_code_point();
|
||||
identifier_length = 1;
|
||||
|
||||
if (code_point == '\\') {
|
||||
if (auto maybe_code_point = is_unicode_escape(identifier_length); maybe_code_point.has_value())
|
||||
code_point = *maybe_code_point;
|
||||
else
|
||||
return {};
|
||||
}
|
||||
|
||||
if (is_ascii_alphanumeric(code_point) || (code_point == '$') || (code_point == ZERO_WIDTH_NON_JOINER) || (code_point == ZERO_WIDTH_JOINER))
|
||||
return code_point;
|
||||
|
||||
static auto id_continue_category = Unicode::property_from_string("ID_Continue"sv);
|
||||
if (id_continue_category.has_value())
|
||||
return Unicode::code_point_has_property(code_point, *id_continue_category);
|
||||
return false;
|
||||
if (id_continue_category.has_value() && Unicode::code_point_has_property(code_point, *id_continue_category))
|
||||
return code_point;
|
||||
|
||||
return {};
|
||||
}
|
||||
|
||||
bool Lexer::is_line_comment_start(bool line_has_token_yet) const
|
||||
|
@ -494,6 +527,9 @@ Token Lexer::next()
|
|||
// bunch of Invalid* tokens (bad numeric literals, unterminated comments etc.)
|
||||
String token_message;
|
||||
|
||||
Optional<FlyString> identifier;
|
||||
size_t identifier_length = 0;
|
||||
|
||||
if (m_current_token.type() == TokenType::RegexLiteral && !is_eof() && is_ascii_alpha(m_current_char) && !did_consume_whitespace_or_comments) {
|
||||
token_type = TokenType::RegexFlags;
|
||||
while (!is_eof() && is_ascii_alpha(m_current_char))
|
||||
|
@ -537,19 +573,26 @@ Token Lexer::next()
|
|||
else
|
||||
token_type = TokenType::TemplateLiteralString;
|
||||
}
|
||||
} else if (is_identifier_start()) {
|
||||
} else if (auto code_point = is_identifier_start(identifier_length); code_point.has_value()) {
|
||||
// identifier or keyword
|
||||
StringBuilder builder;
|
||||
do {
|
||||
consume();
|
||||
} while (is_identifier_middle());
|
||||
builder.append_code_point(*code_point);
|
||||
for (size_t i = 0; i < identifier_length; ++i)
|
||||
consume();
|
||||
|
||||
StringView value = m_source.substring_view(value_start - 1, m_position - value_start);
|
||||
auto it = s_keywords.find(value.hash(), [&](auto& entry) { return entry.key == value; });
|
||||
if (it == s_keywords.end()) {
|
||||
code_point = is_identifier_middle(identifier_length);
|
||||
} while (code_point.has_value());
|
||||
|
||||
identifier = builder.build();
|
||||
if (!m_parsed_identifiers.contains_slow(*identifier))
|
||||
m_parsed_identifiers.append(*identifier);
|
||||
|
||||
auto it = s_keywords.find(identifier->hash(), [&](auto& entry) { return entry.key == identifier; });
|
||||
if (it == s_keywords.end())
|
||||
token_type = TokenType::Identifier;
|
||||
} else {
|
||||
else
|
||||
token_type = it->value;
|
||||
}
|
||||
} else if (is_numeric_literal_start()) {
|
||||
token_type = TokenType::NumericLiteral;
|
||||
bool is_invalid_numeric_literal = false;
|
||||
|
@ -708,15 +751,28 @@ Token Lexer::next()
|
|||
}
|
||||
}
|
||||
|
||||
m_current_token = Token(
|
||||
token_type,
|
||||
token_message,
|
||||
m_source.substring_view(trivia_start - 1, value_start - trivia_start),
|
||||
m_source.substring_view(value_start - 1, m_position - value_start),
|
||||
m_filename,
|
||||
value_start_line_number,
|
||||
value_start_column_number,
|
||||
m_position);
|
||||
if (identifier.has_value()) {
|
||||
m_current_token = Token(
|
||||
token_type,
|
||||
token_message,
|
||||
m_source.substring_view(trivia_start - 1, value_start - trivia_start),
|
||||
m_source.substring_view(value_start - 1, m_position - value_start),
|
||||
identifier.release_value(),
|
||||
m_filename,
|
||||
value_start_line_number,
|
||||
value_start_column_number,
|
||||
m_position);
|
||||
} else {
|
||||
m_current_token = Token(
|
||||
token_type,
|
||||
token_message,
|
||||
m_source.substring_view(trivia_start - 1, value_start - trivia_start),
|
||||
m_source.substring_view(value_start - 1, m_position - value_start),
|
||||
m_filename,
|
||||
value_start_line_number,
|
||||
value_start_column_number,
|
||||
m_position);
|
||||
}
|
||||
|
||||
if constexpr (LEXER_DEBUG) {
|
||||
dbgln("------------------------------");
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue