1
Fork 0
mirror of https://github.com/RGBCube/serenity synced 2025-07-25 17:07:34 +00:00

LibJS: Allow Unicode escape sequences in identifiers

For example, "property.br\u{6f}wn" should resolve to "property.brown".

To support this behavior, this commit changes the Token class to hold
both the evaluated identifier name and a view into the original source
for the unevaluated name. There are some contexts in which identifiers
are not allowed to contain Unicode escape sequences; for example, export
statements of the form 'export {} from "foo.js"' forbid escapes in the
identifier "from".

The test file is added to .prettierignore because prettier will replace
all escaped Unicode sequences with their unescaped value.
This commit is contained in:
Timothy Flynn 2021-08-18 16:34:25 -04:00 committed by Andreas Kling
parent c5b5c779ff
commit 1259dc3623
7 changed files with 163 additions and 54 deletions

View file

@ -1,3 +1,3 @@
Base/home/anon/Source/js Base/home/anon/Source/js
Userland/Libraries/LibJS/Tests/eval-aliasing.js Userland/Libraries/LibJS/Tests/eval-aliasing.js
Userland/Libraries/LibJS/Tests/unicode-identifier-escape.js

View file

@ -8,6 +8,7 @@
#include "Lexer.h" #include "Lexer.h"
#include <AK/CharacterTypes.h> #include <AK/CharacterTypes.h>
#include <AK/Debug.h> #include <AK/Debug.h>
#include <AK/GenericLexer.h>
#include <AK/HashMap.h> #include <AK/HashMap.h>
#include <AK/Utf8View.h> #include <AK/Utf8View.h>
#include <LibUnicode/CharacterTypes.h> #include <LibUnicode/CharacterTypes.h>
@ -350,6 +351,8 @@ u32 Lexer::current_code_point() const
if (m_position == 0) if (m_position == 0)
return REPLACEMENT_CHARACTER; return REPLACEMENT_CHARACTER;
Utf8View utf_8_view { m_source.substring_view(m_position - 1) }; Utf8View utf_8_view { m_source.substring_view(m_position - 1) };
if (utf_8_view.is_empty())
return REPLACEMENT_CHARACTER;
return *utf_8_view.begin(); return *utf_8_view.begin();
} }
@ -369,30 +372,60 @@ bool Lexer::is_whitespace() const
return false; return false;
} }
bool Lexer::is_identifier_start() const Optional<u32> Lexer::is_unicode_escape(size_t& identifier_length) const
{ {
if (!is_unicode_character()) GenericLexer lexer(source().substring_view(m_position - 1));
return is_ascii_alpha(m_current_char) || m_current_char == '_' || m_current_char == '$';
auto code_point = current_code_point();
static auto id_start_category = Unicode::property_from_string("ID_Start"sv); if (auto code_point_or_error = lexer.consume_escaped_code_point(); !code_point_or_error.is_error()) {
if (id_start_category.has_value()) identifier_length = lexer.tell();
return Unicode::code_point_has_property(code_point, *id_start_category); return code_point_or_error.value();
return false; }
return {};
} }
bool Lexer::is_identifier_middle() const Optional<u32> Lexer::is_identifier_start(size_t& identifier_length) const
{ {
if (!is_unicode_character()) u32 code_point = current_code_point();
return is_identifier_start() || is_ascii_digit(m_current_char); identifier_length = 1;
auto code_point = current_code_point();
if (code_point == ZERO_WIDTH_NON_JOINER || code_point == ZERO_WIDTH_JOINER) if (code_point == '\\') {
return true; if (auto maybe_code_point = is_unicode_escape(identifier_length); maybe_code_point.has_value())
code_point = *maybe_code_point;
else
return {};
}
if (is_ascii_alpha(code_point) || code_point == '_' || code_point == '$')
return code_point;
static auto id_start_category = Unicode::property_from_string("ID_Start"sv);
if (id_start_category.has_value() && Unicode::code_point_has_property(code_point, *id_start_category))
return code_point;
return {};
}
Optional<u32> Lexer::is_identifier_middle(size_t& identifier_length) const
{
u32 code_point = current_code_point();
identifier_length = 1;
if (code_point == '\\') {
if (auto maybe_code_point = is_unicode_escape(identifier_length); maybe_code_point.has_value())
code_point = *maybe_code_point;
else
return {};
}
if (is_ascii_alphanumeric(code_point) || (code_point == '$') || (code_point == ZERO_WIDTH_NON_JOINER) || (code_point == ZERO_WIDTH_JOINER))
return code_point;
static auto id_continue_category = Unicode::property_from_string("ID_Continue"sv); static auto id_continue_category = Unicode::property_from_string("ID_Continue"sv);
if (id_continue_category.has_value()) if (id_continue_category.has_value() && Unicode::code_point_has_property(code_point, *id_continue_category))
return Unicode::code_point_has_property(code_point, *id_continue_category); return code_point;
return false;
return {};
} }
bool Lexer::is_line_comment_start(bool line_has_token_yet) const bool Lexer::is_line_comment_start(bool line_has_token_yet) const
@ -494,6 +527,9 @@ Token Lexer::next()
// bunch of Invalid* tokens (bad numeric literals, unterminated comments etc.) // bunch of Invalid* tokens (bad numeric literals, unterminated comments etc.)
String token_message; String token_message;
Optional<FlyString> identifier;
size_t identifier_length = 0;
if (m_current_token.type() == TokenType::RegexLiteral && !is_eof() && is_ascii_alpha(m_current_char) && !did_consume_whitespace_or_comments) { if (m_current_token.type() == TokenType::RegexLiteral && !is_eof() && is_ascii_alpha(m_current_char) && !did_consume_whitespace_or_comments) {
token_type = TokenType::RegexFlags; token_type = TokenType::RegexFlags;
while (!is_eof() && is_ascii_alpha(m_current_char)) while (!is_eof() && is_ascii_alpha(m_current_char))
@ -537,19 +573,26 @@ Token Lexer::next()
else else
token_type = TokenType::TemplateLiteralString; token_type = TokenType::TemplateLiteralString;
} }
} else if (is_identifier_start()) { } else if (auto code_point = is_identifier_start(identifier_length); code_point.has_value()) {
// identifier or keyword // identifier or keyword
StringBuilder builder;
do { do {
consume(); builder.append_code_point(*code_point);
} while (is_identifier_middle()); for (size_t i = 0; i < identifier_length; ++i)
consume();
StringView value = m_source.substring_view(value_start - 1, m_position - value_start); code_point = is_identifier_middle(identifier_length);
auto it = s_keywords.find(value.hash(), [&](auto& entry) { return entry.key == value; }); } while (code_point.has_value());
if (it == s_keywords.end()) {
identifier = builder.build();
if (!m_parsed_identifiers.contains_slow(*identifier))
m_parsed_identifiers.append(*identifier);
auto it = s_keywords.find(identifier->hash(), [&](auto& entry) { return entry.key == identifier; });
if (it == s_keywords.end())
token_type = TokenType::Identifier; token_type = TokenType::Identifier;
} else { else
token_type = it->value; token_type = it->value;
}
} else if (is_numeric_literal_start()) { } else if (is_numeric_literal_start()) {
token_type = TokenType::NumericLiteral; token_type = TokenType::NumericLiteral;
bool is_invalid_numeric_literal = false; bool is_invalid_numeric_literal = false;
@ -708,15 +751,28 @@ Token Lexer::next()
} }
} }
m_current_token = Token( if (identifier.has_value()) {
token_type, m_current_token = Token(
token_message, token_type,
m_source.substring_view(trivia_start - 1, value_start - trivia_start), token_message,
m_source.substring_view(value_start - 1, m_position - value_start), m_source.substring_view(trivia_start - 1, value_start - trivia_start),
m_filename, m_source.substring_view(value_start - 1, m_position - value_start),
value_start_line_number, identifier.release_value(),
value_start_column_number, m_filename,
m_position); value_start_line_number,
value_start_column_number,
m_position);
} else {
m_current_token = Token(
token_type,
token_message,
m_source.substring_view(trivia_start - 1, value_start - trivia_start),
m_source.substring_view(value_start - 1, m_position - value_start),
m_filename,
value_start_line_number,
value_start_column_number,
m_position);
}
if constexpr (LEXER_DEBUG) { if constexpr (LEXER_DEBUG) {
dbgln("------------------------------"); dbgln("------------------------------");

View file

@ -41,8 +41,9 @@ private:
bool is_eof() const; bool is_eof() const;
bool is_line_terminator() const; bool is_line_terminator() const;
bool is_whitespace() const; bool is_whitespace() const;
bool is_identifier_start() const; Optional<u32> is_unicode_escape(size_t& identifier_length) const;
bool is_identifier_middle() const; Optional<u32> is_identifier_start(size_t& identifier_length) const;
Optional<u32> is_identifier_middle(size_t& identifier_length) const;
bool is_line_comment_start(bool line_has_token_yet) const; bool is_line_comment_start(bool line_has_token_yet) const;
bool is_block_comment_start() const; bool is_block_comment_start() const;
bool is_block_comment_end() const; bool is_block_comment_end() const;
@ -80,6 +81,10 @@ private:
static HashMap<String, TokenType> s_three_char_tokens; static HashMap<String, TokenType> s_three_char_tokens;
static HashMap<String, TokenType> s_two_char_tokens; static HashMap<String, TokenType> s_two_char_tokens;
static HashMap<char, TokenType> s_single_char_tokens; static HashMap<char, TokenType> s_single_char_tokens;
// Resolved identifiers must be kept alive for the duration of the parsing stage, otherwise
// the only references to these strings are deleted by the Token destructor.
Vector<FlyString> m_parsed_identifiers;
}; };
} }

View file

@ -210,7 +210,6 @@ constexpr OperatorPrecedenceTable g_operator_precedence;
Parser::ParserState::ParserState(Lexer l, Program::Type program_type) Parser::ParserState::ParserState(Lexer l, Program::Type program_type)
: lexer(move(l)) : lexer(move(l))
, current_token(TokenType::Invalid, {}, {}, {}, {}, 0, 0, 0)
{ {
if (program_type == Program::Type::Module) if (program_type == Program::Type::Module)
lexer.disallow_html_comments(); lexer.disallow_html_comments();
@ -680,7 +679,7 @@ NonnullRefPtr<ClassExpression> Parser::parse_class_expression(bool expect_class_
if (match_property_key()) { if (match_property_key()) {
StringView name; StringView name;
if (!is_generator && m_state.current_token.value() == "static"sv) { if (!is_generator && m_state.current_token.original_value() == "static"sv) {
if (match(TokenType::Identifier)) { if (match(TokenType::Identifier)) {
consume(); consume();
is_static = true; is_static = true;
@ -2524,7 +2523,7 @@ NonnullRefPtr<Statement> Parser::parse_for_statement()
{ {
auto rule_start = push_start(); auto rule_start = push_start();
auto match_for_in_of = [&]() { auto match_for_in_of = [&]() {
return match(TokenType::In) || (match(TokenType::Identifier) && m_state.current_token.value() == "of"); return match(TokenType::In) || (match(TokenType::Identifier) && m_state.current_token.original_value() == "of");
}; };
consume(TokenType::For); consume(TokenType::For);
@ -3019,7 +3018,7 @@ NonnullRefPtr<ImportStatement> Parser::parse_import_statement(Program& program)
}; };
auto match_as = [&] { auto match_as = [&] {
return match(TokenType::Identifier) && m_state.current_token.value() == "as"sv; return match(TokenType::Identifier) && m_state.current_token.original_value() == "as"sv;
}; };
bool continue_parsing = true; bool continue_parsing = true;
@ -3134,11 +3133,15 @@ NonnullRefPtr<ExportStatement> Parser::parse_export_statement(Program& program)
syntax_error("Cannot use export statement outside a module"); syntax_error("Cannot use export statement outside a module");
auto match_as = [&] { auto match_as = [&] {
return match(TokenType::Identifier) && m_state.current_token.value() == "as"sv; return match(TokenType::Identifier) && m_state.current_token.original_value() == "as"sv;
}; };
auto match_from = [&] { auto match_from = [&] {
return match(TokenType::Identifier) && m_state.current_token.value() == "from"sv; return match(TokenType::Identifier) && m_state.current_token.original_value() == "from"sv;
};
auto match_default = [&] {
return match(TokenType::Default) && m_state.current_token.original_value() == "default"sv;
}; };
consume(TokenType::Export); consume(TokenType::Export);
@ -3158,7 +3161,7 @@ NonnullRefPtr<ExportStatement> Parser::parse_export_statement(Program& program)
RefPtr<ASTNode> expression = {}; RefPtr<ASTNode> expression = {};
if (match(TokenType::Default)) { if (match_default()) {
auto default_position = position(); auto default_position = position();
consume(TokenType::Default); consume(TokenType::Default);

View file

@ -0,0 +1,19 @@
// Identifiers may contain Unicode escape sequences; an escaped spelling must
// resolve to the same binding/property as the unescaped one.
// NOTE: this file is in .prettierignore — prettier would rewrite the escapes
// back to their literal characters and defeat the test.
test("basic escapes", () => {
var foo = {};
foo.brown = 12389;
// Plain access, then \uXXXX and \u{X...} escapes for 'o' (U+006F),
// and finally an identifier built entirely from escapes.
expect(foo.brown).toBe(12389);
expect(foo.br\u006fwn).toBe(12389);
expect(foo.br\u{6f}wn).toBe(12389);
expect(foo.\u{62}\u{72}\u{6f}\u{77}\u{6e}).toBe(12389);
});
test("non-ascii escapes", () => {
var foo = {};
// Mathematical script letters (astral plane): exercises surrogate-pair
// \uD835\uDCF8 escapes as well as single \u{1D4F8} code-point escapes.
foo.𝓑𝓻𝓸𝔀𝓷 = 12389;
expect(foo.𝓑𝓻𝓸𝔀𝓷).toBe(12389);
expect(foo.𝓑𝓻\ud835\udcf8𝔀𝓷).toBe(12389);
expect(foo.𝓑𝓻\u{1d4f8}𝔀𝓷).toBe(12389);
expect(foo.\u{1d4d1}\u{1d4fb}\u{1d4f8}\u{1d500}\u{1d4f7}).toBe(12389);
});

View file

@ -56,7 +56,7 @@ double Token::double_value() const
StringBuilder builder; StringBuilder builder;
for (auto ch : m_value) { for (auto ch : value()) {
if (ch == '_') if (ch == '_')
continue; continue;
builder.append(ch); builder.append(ch);
@ -75,7 +75,7 @@ double Token::double_value() const
return static_cast<double>(strtoul(value_string.characters() + 2, nullptr, 2)); return static_cast<double>(strtoul(value_string.characters() + 2, nullptr, 2));
} else if (is_ascii_digit(value_string[1])) { } else if (is_ascii_digit(value_string[1])) {
// also octal, but syntax error in strict mode // also octal, but syntax error in strict mode
if (!m_value.contains('8') && !m_value.contains('9')) if (!value().contains('8') && !value().contains('9'))
return static_cast<double>(strtoul(value_string.characters() + 1, nullptr, 8)); return static_cast<double>(strtoul(value_string.characters() + 1, nullptr, 8));
} }
} }
@ -95,7 +95,7 @@ String Token::string_value(StringValueStatus& status) const
VERIFY(type() == TokenType::StringLiteral || type() == TokenType::TemplateLiteralString); VERIFY(type() == TokenType::StringLiteral || type() == TokenType::TemplateLiteralString);
auto is_template = type() == TokenType::TemplateLiteralString; auto is_template = type() == TokenType::TemplateLiteralString;
GenericLexer lexer(is_template ? m_value : m_value.substring_view(1, m_value.length() - 2)); GenericLexer lexer(is_template ? value() : value().substring_view(1, value().length() - 2));
auto encoding_failure = [&status](StringValueStatus parse_status) -> String { auto encoding_failure = [&status](StringValueStatus parse_status) -> String {
status = parse_status; status = parse_status;
@ -195,7 +195,7 @@ String Token::string_value(StringValueStatus& status) const
bool Token::bool_value() const bool Token::bool_value() const
{ {
VERIFY(type() == TokenType::BoolLiteral); VERIFY(type() == TokenType::BoolLiteral);
return m_value == "true"; return value() == "true";
} }
bool Token::is_identifier_name() const bool Token::is_identifier_name() const

View file

@ -6,8 +6,10 @@
#pragma once #pragma once
#include <AK/FlyString.h>
#include <AK/String.h> #include <AK/String.h>
#include <AK/StringView.h> #include <AK/StringView.h>
#include <AK/Variant.h>
namespace JS { namespace JS {
@ -172,10 +174,13 @@ enum class TokenCategory {
class Token { class Token {
public: public:
Token() = default;
Token(TokenType type, String message, StringView trivia, StringView value, StringView filename, size_t line_number, size_t line_column, size_t offset) Token(TokenType type, String message, StringView trivia, StringView value, StringView filename, size_t line_number, size_t line_column, size_t offset)
: m_type(type) : m_type(type)
, m_message(message) , m_message(message)
, m_trivia(trivia) , m_trivia(trivia)
, m_original_value(value)
, m_value(value) , m_value(value)
, m_filename(filename) , m_filename(filename)
, m_line_number(line_number) , m_line_number(line_number)
@ -184,6 +189,19 @@ public:
{ {
} }
Token(TokenType type, String message, StringView trivia, StringView original_value, FlyString value, StringView filename, size_t line_number, size_t line_column, size_t offset)
: m_type(type)
, m_message(message)
, m_trivia(trivia)
, m_original_value(original_value)
, m_value(move(value))
, m_filename(filename)
, m_line_number(line_number)
, m_line_column(line_column)
, m_offset(offset)
{
}
TokenType type() const { return m_type; } TokenType type() const { return m_type; }
TokenCategory category() const; TokenCategory category() const;
static TokenCategory category(TokenType); static TokenCategory category(TokenType);
@ -192,7 +210,14 @@ public:
const String& message() const { return m_message; } const String& message() const { return m_message; }
const StringView& trivia() const { return m_trivia; } const StringView& trivia() const { return m_trivia; }
const StringView& value() const { return m_value; } const StringView& original_value() const { return m_original_value; }
StringView value() const
{
return m_value.visit(
[](StringView const& view) { return view; },
[](FlyString const& identifier) { return identifier.view(); },
[](Empty) -> StringView { VERIFY_NOT_REACHED(); });
}
const StringView& filename() const { return m_filename; } const StringView& filename() const { return m_filename; }
size_t line_number() const { return m_line_number; } size_t line_number() const { return m_line_number; }
size_t line_column() const { return m_line_column; } size_t line_column() const { return m_line_column; }
@ -213,14 +238,15 @@ public:
bool trivia_contains_line_terminator() const; bool trivia_contains_line_terminator() const;
private: private:
TokenType m_type; TokenType m_type { TokenType::Invalid };
String m_message; String m_message;
StringView m_trivia; StringView m_trivia;
StringView m_value; StringView m_original_value;
Variant<Empty, StringView, FlyString> m_value { Empty {} };
StringView m_filename; StringView m_filename;
size_t m_line_number; size_t m_line_number { 0 };
size_t m_line_column; size_t m_line_column { 0 };
size_t m_offset; size_t m_offset { 0 };
}; };
} }