diff --git a/.prettierignore b/.prettierignore index b8b02877b1..ad481c8ac4 100644 --- a/.prettierignore +++ b/.prettierignore @@ -1,3 +1,3 @@ Base/home/anon/Source/js Userland/Libraries/LibJS/Tests/eval-aliasing.js - +Userland/Libraries/LibJS/Tests/unicode-identifier-escape.js diff --git a/Userland/Libraries/LibJS/Lexer.cpp b/Userland/Libraries/LibJS/Lexer.cpp index e50e0a3956..11071b98fe 100644 --- a/Userland/Libraries/LibJS/Lexer.cpp +++ b/Userland/Libraries/LibJS/Lexer.cpp @@ -8,6 +8,7 @@ #include "Lexer.h" #include #include +#include #include #include #include @@ -350,6 +351,8 @@ u32 Lexer::current_code_point() const if (m_position == 0) return REPLACEMENT_CHARACTER; Utf8View utf_8_view { m_source.substring_view(m_position - 1) }; + if (utf_8_view.is_empty()) + return REPLACEMENT_CHARACTER; return *utf_8_view.begin(); } @@ -369,30 +372,60 @@ bool Lexer::is_whitespace() const return false; } -bool Lexer::is_identifier_start() const +Optional Lexer::is_unicode_escape(size_t& identifier_length) const { - if (!is_unicode_character()) - return is_ascii_alpha(m_current_char) || m_current_char == '_' || m_current_char == '$'; - auto code_point = current_code_point(); + GenericLexer lexer(source().substring_view(m_position - 1)); - static auto id_start_category = Unicode::property_from_string("ID_Start"sv); - if (id_start_category.has_value()) - return Unicode::code_point_has_property(code_point, *id_start_category); - return false; + if (auto code_point_or_error = lexer.consume_escaped_code_point(); !code_point_or_error.is_error()) { + identifier_length = lexer.tell(); + return code_point_or_error.value(); + } + + return {}; } -bool Lexer::is_identifier_middle() const +Optional Lexer::is_identifier_start(size_t& identifier_length) const { - if (!is_unicode_character()) - return is_identifier_start() || is_ascii_digit(m_current_char); - auto code_point = current_code_point(); - if (code_point == ZERO_WIDTH_NON_JOINER || code_point == ZERO_WIDTH_JOINER) - return true; + u32 code_point = current_code_point(); + identifier_length = 1; + + if (code_point == '\\') { + if (auto maybe_code_point = is_unicode_escape(identifier_length); maybe_code_point.has_value()) + code_point = *maybe_code_point; + else + return {}; + } + + if (is_ascii_alpha(code_point) || code_point == '_' || code_point == '$') + return code_point; + + static auto id_start_category = Unicode::property_from_string("ID_Start"sv); + if (id_start_category.has_value() && Unicode::code_point_has_property(code_point, *id_start_category)) + return code_point; + + return {}; +} + +Optional Lexer::is_identifier_middle(size_t& identifier_length) const +{ + u32 code_point = current_code_point(); + identifier_length = 1; + + if (code_point == '\\') { + if (auto maybe_code_point = is_unicode_escape(identifier_length); maybe_code_point.has_value()) + code_point = *maybe_code_point; + else + return {}; + } + + if (is_ascii_alphanumeric(code_point) || (code_point == '$') || (code_point == ZERO_WIDTH_NON_JOINER) || (code_point == ZERO_WIDTH_JOINER)) + return code_point; static auto id_continue_category = Unicode::property_from_string("ID_Continue"sv); - if (id_continue_category.has_value()) - return Unicode::code_point_has_property(code_point, *id_continue_category); - return false; + if (id_continue_category.has_value() && Unicode::code_point_has_property(code_point, *id_continue_category)) + return code_point; + + return {}; } bool Lexer::is_line_comment_start(bool line_has_token_yet) const @@ -494,6 +527,9 @@ Token Lexer::next() // bunch of Invalid* tokens (bad numeric literals, unterminated comments etc.) String token_message; + Optional identifier; + size_t identifier_length = 0; + if (m_current_token.type() == TokenType::RegexLiteral && !is_eof() && is_ascii_alpha(m_current_char) && !did_consume_whitespace_or_comments) { token_type = TokenType::RegexFlags; while (!is_eof() && is_ascii_alpha(m_current_char)) @@ -537,19 +573,26 @@ Token Lexer::next() else token_type = TokenType::TemplateLiteralString; } - } else if (is_identifier_start()) { + } else if (auto code_point = is_identifier_start(identifier_length); code_point.has_value()) { // identifier or keyword + StringBuilder builder; do { - consume(); - } while (is_identifier_middle()); + builder.append_code_point(*code_point); + for (size_t i = 0; i < identifier_length; ++i) + consume(); - StringView value = m_source.substring_view(value_start - 1, m_position - value_start); - auto it = s_keywords.find(value.hash(), [&](auto& entry) { return entry.key == value; }); - if (it == s_keywords.end()) { + code_point = is_identifier_middle(identifier_length); + } while (code_point.has_value()); + + identifier = builder.build(); + if (!m_parsed_identifiers.contains_slow(*identifier)) + m_parsed_identifiers.append(*identifier); + + auto it = s_keywords.find(identifier->hash(), [&](auto& entry) { return entry.key == identifier; }); + if (it == s_keywords.end()) token_type = TokenType::Identifier; - } else { + else token_type = it->value; - } } else if (is_numeric_literal_start()) { token_type = TokenType::NumericLiteral; bool is_invalid_numeric_literal = false; @@ -708,15 +751,28 @@ Token Lexer::next() } } - m_current_token = Token( - token_type, - token_message, - m_source.substring_view(trivia_start - 1, value_start - trivia_start), - m_source.substring_view(value_start - 1, m_position - value_start), - m_filename, - value_start_line_number, - value_start_column_number, - m_position); + if (identifier.has_value()) { + m_current_token = Token( + token_type, + token_message, + m_source.substring_view(trivia_start - 1, value_start - trivia_start), + m_source.substring_view(value_start - 1, m_position - value_start), + identifier.release_value(), + m_filename, + value_start_line_number, + value_start_column_number, + m_position); + } else { + m_current_token = Token( + token_type, + token_message, + m_source.substring_view(trivia_start - 1, value_start - trivia_start), + m_source.substring_view(value_start - 1, m_position - value_start), + m_filename, + value_start_line_number, + value_start_column_number, + m_position); + } if constexpr (LEXER_DEBUG) { dbgln("------------------------------"); diff --git a/Userland/Libraries/LibJS/Lexer.h b/Userland/Libraries/LibJS/Lexer.h index f4b9af4b20..ea4da7e14a 100644 --- a/Userland/Libraries/LibJS/Lexer.h +++ b/Userland/Libraries/LibJS/Lexer.h @@ -41,8 +41,9 @@ private: bool is_eof() const; bool is_line_terminator() const; bool is_whitespace() const; - bool is_identifier_start() const; - bool is_identifier_middle() const; + Optional is_unicode_escape(size_t& identifier_length) const; + Optional is_identifier_start(size_t& identifier_length) const; + Optional is_identifier_middle(size_t& identifier_length) const; bool is_line_comment_start(bool line_has_token_yet) const; bool is_block_comment_start() const; bool is_block_comment_end() const; @@ -80,6 +81,10 @@ private: static HashMap s_three_char_tokens; static HashMap s_two_char_tokens; static HashMap s_single_char_tokens; + + // Resolved identifiers must be kept alive for the duration of the parsing stage, otherwise + // the only references to these strings are deleted by the Token destructor. + Vector m_parsed_identifiers; }; } diff --git a/Userland/Libraries/LibJS/Parser.cpp b/Userland/Libraries/LibJS/Parser.cpp index 962d228598..ec04f51657 100644 --- a/Userland/Libraries/LibJS/Parser.cpp +++ b/Userland/Libraries/LibJS/Parser.cpp @@ -210,7 +210,6 @@ constexpr OperatorPrecedenceTable g_operator_precedence; Parser::ParserState::ParserState(Lexer l, Program::Type program_type) : lexer(move(l)) - , current_token(TokenType::Invalid, {}, {}, {}, {}, 0, 0, 0) { if (program_type == Program::Type::Module) lexer.disallow_html_comments(); @@ -680,7 +679,7 @@ NonnullRefPtr Parser::parse_class_expression(bool expect_class_ if (match_property_key()) { StringView name; - if (!is_generator && m_state.current_token.value() == "static"sv) { + if (!is_generator && m_state.current_token.original_value() == "static"sv) { if (match(TokenType::Identifier)) { consume(); is_static = true; @@ -2524,7 +2523,7 @@ NonnullRefPtr Parser::parse_for_statement() { auto rule_start = push_start(); auto match_for_in_of = [&]() { - return match(TokenType::In) || (match(TokenType::Identifier) && m_state.current_token.value() == "of"); + return match(TokenType::In) || (match(TokenType::Identifier) && m_state.current_token.original_value() == "of"); }; consume(TokenType::For); @@ -3019,7 +3018,7 @@ NonnullRefPtr Parser::parse_import_statement(Program& program) }; auto match_as = [&] { - return match(TokenType::Identifier) && m_state.current_token.value() == "as"sv; + return match(TokenType::Identifier) && m_state.current_token.original_value() == "as"sv; }; bool continue_parsing = true; @@ -3134,11 +3133,15 @@ NonnullRefPtr Parser::parse_export_statement(Program& program) syntax_error("Cannot use export statement outside a module"); auto match_as = [&] { - return match(TokenType::Identifier) && m_state.current_token.value() == "as"sv; + return match(TokenType::Identifier) && m_state.current_token.original_value() == "as"sv; }; auto match_from = [&] { - return match(TokenType::Identifier) && m_state.current_token.value() == "from"sv; + return match(TokenType::Identifier) && m_state.current_token.original_value() == "from"sv; + }; + + auto match_default = [&] { + return match(TokenType::Default) && m_state.current_token.original_value() == "default"sv; }; consume(TokenType::Export); @@ -3158,7 +3161,7 @@ NonnullRefPtr Parser::parse_export_statement(Program& program) RefPtr expression = {}; - if (match(TokenType::Default)) { + if (match_default()) { auto default_position = position(); consume(TokenType::Default); diff --git a/Userland/Libraries/LibJS/Tests/unicode-identifier-escape.js b/Userland/Libraries/LibJS/Tests/unicode-identifier-escape.js new file mode 100644 index 0000000000..4d089f21d9 --- /dev/null +++ b/Userland/Libraries/LibJS/Tests/unicode-identifier-escape.js @@ -0,0 +1,19 @@ +test("basic escapes", () => { + var foo = {}; + foo.brown = 12389; + + expect(foo.brown).toBe(12389); + expect(foo.br\u006fwn).toBe(12389); + expect(foo.br\u{6f}wn).toBe(12389); + expect(foo.\u{62}\u{72}\u{6f}\u{77}\u{6e}).toBe(12389); +}); + +test("non-ascii escapes", () => { + var foo = {}; + foo.𝓑𝓻𝓸𝔀𝓷 = 12389; + + expect(foo.𝓑𝓻𝓸𝔀𝓷).toBe(12389); + expect(foo.𝓑𝓻\ud835\udcf8𝔀𝓷).toBe(12389); + expect(foo.𝓑𝓻\u{1d4f8}𝔀𝓷).toBe(12389); + expect(foo.\u{1d4d1}\u{1d4fb}\u{1d4f8}\u{1d500}\u{1d4f7}).toBe(12389); +}); diff --git a/Userland/Libraries/LibJS/Token.cpp b/Userland/Libraries/LibJS/Token.cpp index 5e918b6557..bbce9d6e4f 100644 --- a/Userland/Libraries/LibJS/Token.cpp +++ b/Userland/Libraries/LibJS/Token.cpp @@ -56,7 +56,7 @@ double Token::double_value() const StringBuilder builder; - for (auto ch : m_value) { + for (auto ch : value()) { if (ch == '_') continue; builder.append(ch); @@ -75,7 +75,7 @@ double Token::double_value() const return static_cast(strtoul(value_string.characters() + 2, nullptr, 2)); } else if (is_ascii_digit(value_string[1])) { // also octal, but syntax error in strict mode - if (!m_value.contains('8') && !m_value.contains('9')) + if (!value().contains('8') && !value().contains('9')) return static_cast(strtoul(value_string.characters() + 1, nullptr, 8)); } } @@ -95,7 +95,7 @@ String Token::string_value(StringValueStatus& status) const VERIFY(type() == TokenType::StringLiteral || type() == TokenType::TemplateLiteralString); auto is_template = type() == TokenType::TemplateLiteralString; - GenericLexer lexer(is_template ? m_value : m_value.substring_view(1, m_value.length() - 2)); + GenericLexer lexer(is_template ? value() : value().substring_view(1, value().length() - 2)); auto encoding_failure = [&status](StringValueStatus parse_status) -> String { status = parse_status; @@ -195,7 +195,7 @@ String Token::string_value(StringValueStatus& status) const bool Token::bool_value() const { VERIFY(type() == TokenType::BoolLiteral); - return m_value == "true"; + return value() == "true"; } bool Token::is_identifier_name() const diff --git a/Userland/Libraries/LibJS/Token.h b/Userland/Libraries/LibJS/Token.h index f5755589bf..12c58ae10c 100644 --- a/Userland/Libraries/LibJS/Token.h +++ b/Userland/Libraries/LibJS/Token.h @@ -6,8 +6,10 @@ #pragma once +#include #include #include +#include namespace JS { @@ -172,10 +174,13 @@ enum class TokenCategory { class Token { public: + Token() = default; + Token(TokenType type, String message, StringView trivia, StringView value, StringView filename, size_t line_number, size_t line_column, size_t offset) : m_type(type) , m_message(message) , m_trivia(trivia) + , m_original_value(value) , m_value(value) , m_filename(filename) , m_line_number(line_number) @@ -184,6 +189,19 @@ public: { } + Token(TokenType type, String message, StringView trivia, StringView original_value, FlyString value, StringView filename, size_t line_number, size_t line_column, size_t offset) + : m_type(type) + , m_message(message) + , m_trivia(trivia) + , m_original_value(original_value) + , m_value(move(value)) + , m_filename(filename) + , m_line_number(line_number) + , m_line_column(line_column) + , m_offset(offset) + { + } + TokenType type() const { return m_type; } TokenCategory category() const; static TokenCategory category(TokenType); @@ -192,7 +210,14 @@ public: const String& message() const { return m_message; } const StringView& trivia() const { return m_trivia; } - const StringView& value() const { return m_value; } + const StringView& original_value() const { return m_original_value; } + StringView value() const + { + return m_value.visit( + [](StringView const& view) { return view; }, + [](FlyString const& identifier) { return identifier.view(); }, + [](Empty) -> StringView { VERIFY_NOT_REACHED(); }); + } const StringView& filename() const { return m_filename; } size_t line_number() const { return m_line_number; } size_t line_column() const { return m_line_column; } @@ -213,14 +238,15 @@ public: bool trivia_contains_line_terminator() const; private: - TokenType m_type; + TokenType m_type { TokenType::Invalid }; String m_message; StringView m_trivia; - StringView m_value; + StringView m_original_value; + Variant m_value { Empty {} }; StringView m_filename; - size_t m_line_number; - size_t m_line_column; - size_t m_offset; + size_t m_line_number { 0 }; + size_t m_line_column { 0 }; + size_t m_offset { 0 }; }; }