1
Fork 0
mirror of https://github.com/RGBCube/serenity synced 2025-10-24 21:02:06 +00:00
serenity/Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.h
Lorenz Steinert db789813c9 LibWeb: Add basic support for dynamic markup insertion
This implements basic support for dynamic markup insertion, adding
 * Document::open()
 * Document::write(Vector<String> const&)
 * Document::writeln(Vector<String> const&)
 * Document::close()

The HTMLParser is modified to make it possible to create a
script-created parser which initially only contains an HTMLTokenizer
without any data. Additionally, the HTMLParser::run method gains an
overload which does not modify the Document and does not run
HTMLParser::the_end() so that we can reenter the parser at a later time.
Furthermore, all FIXMEs that concern the insertion point, which is
defined in the HTMLTokenizer, are implemented. Additionally, the following
member variables of the HTMLParser are now exposed by getter functions:
 * m_tokenizer
 * m_aborted
 * m_script_nesting_level

The HTMLTokenizer is modified so that it contains an insertion
point which keeps track of where the next input from the Document::write
functions will be inserted. The insertion point is implemented as the
character offset into m_decoded_input and a boolean describing if the
insertion point is defined. Functions to update, check and {re}store the
insertion point are also added.
The function HTMLTokenizer::insert_eof is added to tell a script-created
parser that Document::close was called and HTMLParser::the_end() should
be called.
Lastly, an explicit default constructor is added to HTMLTokenizer to
create an empty HTMLTokenizer into which data can be inserted.
2022-02-21 18:26:43 +01:00

213 lines
9.6 KiB
C++

/*
* Copyright (c) 2020, Andreas Kling <kling@serenityos.org>
* Copyright (c) 2022, Linus Groh <linusg@serenityos.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#pragma once
#include <AK/Queue.h>
#include <AK/StringBuilder.h>
#include <AK/StringView.h>
#include <AK/Types.h>
#include <AK/Utf8View.h>
#include <LibWeb/Forward.h>
#include <LibWeb/HTML/Parser/HTMLToken.h>
namespace Web::HTML {
// X-macro listing every tokenizer state.
//
// Usage: define __ENUMERATE_TOKENIZER_STATE(state) to whatever should be generated
// per state, expand ENUMERATE_TOKENIZER_STATES, then #undef the helper again.
// The helper is invoked once per entry, in the order written below, so this
// order also fixes the enumerator order of any enum generated from the list.
//
// NOTE(review): the names appear to follow the tokenization states of the HTML
// Standard; confirm against the spec before renaming or reordering entries.
#define ENUMERATE_TOKENIZER_STATES \
__ENUMERATE_TOKENIZER_STATE(Data) \
__ENUMERATE_TOKENIZER_STATE(RCDATA) \
__ENUMERATE_TOKENIZER_STATE(RAWTEXT) \
__ENUMERATE_TOKENIZER_STATE(ScriptData) \
__ENUMERATE_TOKENIZER_STATE(PLAINTEXT) \
__ENUMERATE_TOKENIZER_STATE(TagOpen) \
__ENUMERATE_TOKENIZER_STATE(EndTagOpen) \
__ENUMERATE_TOKENIZER_STATE(TagName) \
__ENUMERATE_TOKENIZER_STATE(RCDATALessThanSign) \
__ENUMERATE_TOKENIZER_STATE(RCDATAEndTagOpen) \
__ENUMERATE_TOKENIZER_STATE(RCDATAEndTagName) \
__ENUMERATE_TOKENIZER_STATE(RAWTEXTLessThanSign) \
__ENUMERATE_TOKENIZER_STATE(RAWTEXTEndTagOpen) \
__ENUMERATE_TOKENIZER_STATE(RAWTEXTEndTagName) \
__ENUMERATE_TOKENIZER_STATE(ScriptDataLessThanSign) \
__ENUMERATE_TOKENIZER_STATE(ScriptDataEndTagOpen) \
__ENUMERATE_TOKENIZER_STATE(ScriptDataEndTagName) \
__ENUMERATE_TOKENIZER_STATE(ScriptDataEscapeStart) \
__ENUMERATE_TOKENIZER_STATE(ScriptDataEscapeStartDash) \
__ENUMERATE_TOKENIZER_STATE(ScriptDataEscaped) \
__ENUMERATE_TOKENIZER_STATE(ScriptDataEscapedDash) \
__ENUMERATE_TOKENIZER_STATE(ScriptDataEscapedDashDash) \
__ENUMERATE_TOKENIZER_STATE(ScriptDataEscapedLessThanSign) \
__ENUMERATE_TOKENIZER_STATE(ScriptDataEscapedEndTagOpen) \
__ENUMERATE_TOKENIZER_STATE(ScriptDataEscapedEndTagName) \
__ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscapeStart) \
__ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscaped) \
__ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscapedDash) \
__ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscapedDashDash) \
__ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscapedLessThanSign) \
__ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscapeEnd) \
__ENUMERATE_TOKENIZER_STATE(BeforeAttributeName) \
__ENUMERATE_TOKENIZER_STATE(AttributeName) \
__ENUMERATE_TOKENIZER_STATE(AfterAttributeName) \
__ENUMERATE_TOKENIZER_STATE(BeforeAttributeValue) \
__ENUMERATE_TOKENIZER_STATE(AttributeValueDoubleQuoted) \
__ENUMERATE_TOKENIZER_STATE(AttributeValueSingleQuoted) \
__ENUMERATE_TOKENIZER_STATE(AttributeValueUnquoted) \
__ENUMERATE_TOKENIZER_STATE(AfterAttributeValueQuoted) \
__ENUMERATE_TOKENIZER_STATE(SelfClosingStartTag) \
__ENUMERATE_TOKENIZER_STATE(BogusComment) \
__ENUMERATE_TOKENIZER_STATE(MarkupDeclarationOpen) \
__ENUMERATE_TOKENIZER_STATE(CommentStart) \
__ENUMERATE_TOKENIZER_STATE(CommentStartDash) \
__ENUMERATE_TOKENIZER_STATE(Comment) \
__ENUMERATE_TOKENIZER_STATE(CommentLessThanSign) \
__ENUMERATE_TOKENIZER_STATE(CommentLessThanSignBang) \
__ENUMERATE_TOKENIZER_STATE(CommentLessThanSignBangDash) \
__ENUMERATE_TOKENIZER_STATE(CommentLessThanSignBangDashDash) \
__ENUMERATE_TOKENIZER_STATE(CommentEndDash) \
__ENUMERATE_TOKENIZER_STATE(CommentEnd) \
__ENUMERATE_TOKENIZER_STATE(CommentEndBang) \
__ENUMERATE_TOKENIZER_STATE(DOCTYPE) \
__ENUMERATE_TOKENIZER_STATE(BeforeDOCTYPEName) \
__ENUMERATE_TOKENIZER_STATE(DOCTYPEName) \
__ENUMERATE_TOKENIZER_STATE(AfterDOCTYPEName) \
__ENUMERATE_TOKENIZER_STATE(AfterDOCTYPEPublicKeyword) \
__ENUMERATE_TOKENIZER_STATE(BeforeDOCTYPEPublicIdentifier) \
__ENUMERATE_TOKENIZER_STATE(DOCTYPEPublicIdentifierDoubleQuoted) \
__ENUMERATE_TOKENIZER_STATE(DOCTYPEPublicIdentifierSingleQuoted) \
__ENUMERATE_TOKENIZER_STATE(AfterDOCTYPEPublicIdentifier) \
__ENUMERATE_TOKENIZER_STATE(BetweenDOCTYPEPublicAndSystemIdentifiers) \
__ENUMERATE_TOKENIZER_STATE(AfterDOCTYPESystemKeyword) \
__ENUMERATE_TOKENIZER_STATE(BeforeDOCTYPESystemIdentifier) \
__ENUMERATE_TOKENIZER_STATE(DOCTYPESystemIdentifierDoubleQuoted) \
__ENUMERATE_TOKENIZER_STATE(DOCTYPESystemIdentifierSingleQuoted) \
__ENUMERATE_TOKENIZER_STATE(AfterDOCTYPESystemIdentifier) \
__ENUMERATE_TOKENIZER_STATE(BogusDOCTYPE) \
__ENUMERATE_TOKENIZER_STATE(CDATASection) \
__ENUMERATE_TOKENIZER_STATE(CDATASectionBracket) \
__ENUMERATE_TOKENIZER_STATE(CDATASectionEnd) \
__ENUMERATE_TOKENIZER_STATE(CharacterReference) \
__ENUMERATE_TOKENIZER_STATE(NamedCharacterReference) \
__ENUMERATE_TOKENIZER_STATE(AmbiguousAmpersand) \
__ENUMERATE_TOKENIZER_STATE(NumericCharacterReference) \
__ENUMERATE_TOKENIZER_STATE(HexadecimalCharacterReferenceStart) \
__ENUMERATE_TOKENIZER_STATE(DecimalCharacterReferenceStart) \
__ENUMERATE_TOKENIZER_STATE(HexadecimalCharacterReference) \
__ENUMERATE_TOKENIZER_STATE(DecimalCharacterReference) \
__ENUMERATE_TOKENIZER_STATE(NumericCharacterReferenceEnd)
class HTMLTokenizer {
public:
explicit HTMLTokenizer();
explicit HTMLTokenizer(StringView input, String const& encoding);
enum class State {
#define __ENUMERATE_TOKENIZER_STATE(state) state,
ENUMERATE_TOKENIZER_STATES
#undef __ENUMERATE_TOKENIZER_STATE
};
Optional<HTMLToken> next_token();
void set_parser(Badge<HTMLParser>, HTMLParser& parser) { m_parser = &parser; }
void switch_to(Badge<HTMLParser>, State new_state);
void switch_to(State new_state)
{
m_state = new_state;
}
void set_blocked(bool b) { m_blocked = b; }
bool is_blocked() const { return m_blocked; }
String source() const { return m_decoded_input; }
void insert_input_at_insertion_point(String const& input);
void insert_eof();
bool is_eof_inserted();
bool is_insertion_point_defined() const { return m_insertion_point.defined; }
bool is_insertion_point_reached()
{
return m_insertion_point.defined && m_insertion_point.position >= m_utf8_view.iterator_offset(m_utf8_iterator);
}
void undefine_insertion_point() { m_insertion_point.defined = false; }
void store_insertion_point() { m_old_insertion_point = m_insertion_point; }
void restore_insertion_point() { m_insertion_point = m_old_insertion_point; }
void update_insertion_point()
{
m_insertion_point.defined = true;
m_insertion_point.position = m_utf8_view.iterator_offset(m_utf8_iterator);
}
private:
void skip(size_t count);
Optional<u32> next_code_point();
Optional<u32> peek_code_point(size_t offset) const;
bool consume_next_if_match(StringView, CaseSensitivity = CaseSensitivity::CaseSensitive);
void create_new_token(HTMLToken::Type);
bool current_end_tag_token_is_appropriate() const;
String consume_current_builder();
static char const* state_name(State state)
{
switch (state) {
#define __ENUMERATE_TOKENIZER_STATE(state) \
case State::state: \
return #state;
ENUMERATE_TOKENIZER_STATES
#undef __ENUMERATE_TOKENIZER_STATE
};
VERIFY_NOT_REACHED();
}
void will_emit(HTMLToken&);
void will_switch_to(State);
void will_reconsume_in(State);
bool consumed_as_part_of_an_attribute() const;
void restore_to(Utf8CodePointIterator const& new_iterator);
HTMLToken::Position nth_last_position(size_t n = 0);
HTMLParser* m_parser { nullptr };
State m_state { State::Data };
State m_return_state { State::Data };
Vector<u32> m_temporary_buffer;
String m_decoded_input;
struct InsertionPoint {
size_t position { 0 };
bool defined { false };
};
InsertionPoint m_insertion_point {};
InsertionPoint m_old_insertion_point {};
Utf8View m_utf8_view;
Utf8CodePointIterator m_utf8_iterator;
Utf8CodePointIterator m_prev_utf8_iterator;
HTMLToken m_current_token;
StringBuilder m_current_builder;
Optional<String> m_last_emitted_start_tag_name;
bool m_explicit_eof_inserted { false };
bool m_has_emitted_eof { false };
Queue<HTMLToken> m_queued_tokens;
u32 m_character_reference_code { 0 };
bool m_blocked { false };
Vector<HTMLToken::Position> m_source_positions;
};
}