1
Fork 0
mirror of https://github.com/RGBCube/serenity synced 2025-10-24 09:02:32 +00:00
serenity/Userland/Libraries/LibJS/Lexer.h
Andreas Kling b0b022507b LibJS: Reduce AST memory usage by shrink-wrapping source range info
Before this change, each AST node had a 64-byte SourceRange member.
This SourceRange had the following layout:

    filename:       StringView (16 bytes)
    start:          Position (24 bytes)
    end:            Position (24 bytes)

The Position structs have { line, column, offset }, all members size_t.

To reduce memory consumption, AST nodes now only store the following:

    source_code:    NonnullRefPtr<SourceCode> (8 bytes)
    start_offset:   u32 (4 bytes)
    end_offset:     u32 (4 bytes)

SourceCode is a new ref-counted data structure that keeps the filename
and original parsed source code in a single location, and all AST nodes
have a pointer to it.

The start_offset and end_offset can be turned into (line, column) when
necessary by calling SourceCode::range_from_offsets(). This will walk
the source code string and compute line/column numbers on the fly, so
it's not necessarily fast, but it should be rare since this information
is primarily used for diagnostics and exception stack traces.

With this, ASTNode shrinks from 80 bytes to 32 bytes. This gives us a
~23% reduction in memory usage when loading twitter.com/awesomekling
(330 MiB before, 253 MiB after!) :^)
2022-11-22 21:13:35 +01:00

96 lines
2.7 KiB
C++

/*
* Copyright (c) 2020, Stephan Unverwerth <s.unverwerth@serenityos.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#pragma once
#include "Token.h"
#include <AK/HashMap.h>
#include <AK/String.h>
#include <AK/StringView.h>
namespace JS {
class Lexer {
public:
explicit Lexer(StringView source, StringView filename = "(unknown)"sv, size_t line_number = 1, size_t line_column = 0);
Token next();
String const& source() const { return m_source; };
String const& filename() const { return m_filename; };
void disallow_html_comments() { m_allow_html_comments = false; };
Token force_slash_as_regex();
private:
void consume();
bool consume_exponent();
bool consume_octal_number();
bool consume_hexadecimal_number();
bool consume_binary_number();
bool consume_decimal_number();
bool is_unicode_character() const;
u32 current_code_point() const;
bool is_eof() const;
bool is_line_terminator() const;
bool is_whitespace() const;
Optional<u32> is_identifier_unicode_escape(size_t& identifier_length) const;
Optional<u32> is_identifier_start(size_t& identifier_length) const;
Optional<u32> is_identifier_middle(size_t& identifier_length) const;
bool is_line_comment_start(bool line_has_token_yet) const;
bool is_block_comment_start() const;
bool is_block_comment_end() const;
bool is_numeric_literal_start() const;
bool match(char, char) const;
bool match(char, char, char) const;
bool match(char, char, char, char) const;
template<typename Callback>
bool match_numeric_literal_separator_followed_by(Callback) const;
bool slash_means_division() const;
TokenType consume_regex_literal();
String m_source;
size_t m_position { 0 };
Token m_current_token;
char m_current_char { 0 };
bool m_eof { false };
String m_filename;
size_t m_line_number { 1 };
size_t m_line_column { 0 };
bool m_regex_is_in_character_class { false };
struct TemplateState {
bool in_expr;
u8 open_bracket_count;
};
Vector<TemplateState> m_template_states;
bool m_allow_html_comments { true };
Optional<size_t> m_hit_invalid_unicode;
static HashMap<FlyString, TokenType> s_keywords;
static HashMap<String, TokenType> s_three_char_tokens;
static HashMap<String, TokenType> s_two_char_tokens;
static HashMap<char, TokenType> s_single_char_tokens;
struct ParsedIdentifiers : public RefCounted<ParsedIdentifiers> {
// Resolved identifiers must be kept alive for the duration of the parsing stage, otherwise
// the only references to these strings are deleted by the Token destructor.
HashTable<FlyString> identifiers;
};
RefPtr<ParsedIdentifiers> m_parsed_identifiers;
};
}