1
Fork 0
mirror of https://github.com/RGBCube/serenity synced 2025-07-25 06:47:35 +00:00

LibWeb: More work on the HTML parser and tokenizer

The parser can now switch the state of the tokenizer! Very webby. :^)
This commit is contained in:
Andreas Kling 2020-05-24 20:24:43 +02:00
parent 31db3f21ae
commit 20911efd4d
8 changed files with 186 additions and 14 deletions

View file

@ -37,6 +37,7 @@ class EventTarget;
class Frame; class Frame;
class HTMLBodyElement; class HTMLBodyElement;
class HTMLCanvasElement; class HTMLCanvasElement;
class HTMLDocumentParser;
class HTMLElement; class HTMLElement;
class HTMLFormElement; class HTMLFormElement;
class HTMLHeadElement; class HTMLHeadElement;

View file

@ -34,6 +34,11 @@
#include <LibWeb/Parser/HTMLDocumentParser.h> #include <LibWeb/Parser/HTMLDocumentParser.h>
#include <LibWeb/Parser/HTMLToken.h> #include <LibWeb/Parser/HTMLToken.h>
// Marks a code path that is not implemented yet. Expands to
// ASSERT_NOT_REACHED() wrapped in do { } while (0), so `TODO();` behaves as a
// single statement wherever it is used.
#define TODO() \
do { \
ASSERT_NOT_REACHED(); \
} while (0)
namespace Web { namespace Web {
HTMLDocumentParser::HTMLDocumentParser(const StringView& input) HTMLDocumentParser::HTMLDocumentParser(const StringView& input)
@ -176,6 +181,19 @@ void HTMLDocumentParser::handle_before_head(HTMLToken& token)
void HTMLDocumentParser::handle_in_head(HTMLToken& token) void HTMLDocumentParser::handle_in_head(HTMLToken& token)
{ {
if (token.is_parser_whitespace()) {
insert_character(token.codepoint());
return;
}
if (token.is_start_tag() && token.tag_name() == "title") {
insert_html_element(token);
m_tokenizer.switch_to({}, HTMLTokenizer::State::RCDATA);
m_original_insertion_mode = m_insertion_mode;
m_insertion_mode = InsertionMode::Text;
return;
}
if (token.is_start_tag() && token.tag_name() == "meta") { if (token.is_start_tag() && token.tag_name() == "meta") {
auto element = insert_html_element(token); auto element = insert_html_element(token);
m_stack_of_open_elements.pop(); m_stack_of_open_elements.pop();
@ -381,8 +399,20 @@ void HTMLDocumentParser::handle_in_body(HTMLToken& token)
ASSERT_NOT_REACHED(); ASSERT_NOT_REACHED();
} }
void HTMLDocumentParser::handle_text(HTMLToken&) void HTMLDocumentParser::handle_text(HTMLToken& token)
{ {
if (token.is_character()) {
insert_character(token.codepoint());
return;
}
if (token.is_end_tag() && token.tag_name() == "script") {
ASSERT_NOT_REACHED();
}
if (token.is_end_tag()) {
m_stack_of_open_elements.pop();
m_insertion_mode = m_original_insertion_mode;
return;
}
ASSERT_NOT_REACHED(); ASSERT_NOT_REACHED();
} }

View file

@ -100,6 +100,7 @@ private:
void process_using_the_rules_for(InsertionMode, HTMLToken&); void process_using_the_rules_for(InsertionMode, HTMLToken&);
InsertionMode m_insertion_mode { InsertionMode::Initial }; InsertionMode m_insertion_mode { InsertionMode::Initial };
InsertionMode m_original_insertion_mode { InsertionMode::Initial };
StackOfOpenElements m_stack_of_open_elements; StackOfOpenElements m_stack_of_open_elements;

View file

@ -54,6 +54,8 @@ String HTMLToken::to_string() const
case HTMLToken::Type::EndOfFile: case HTMLToken::Type::EndOfFile:
builder.append("EndOfFile"); builder.append("EndOfFile");
break; break;
case HTMLToken::Type::Invalid:
ASSERT_NOT_REACHED();
} }
if (type() == HTMLToken::Type::StartTag || type() == HTMLToken::Type::EndTag) { if (type() == HTMLToken::Type::StartTag || type() == HTMLToken::Type::EndTag) {
@ -72,7 +74,7 @@ String HTMLToken::to_string() const
if (type() == HTMLToken::Type::Comment || type() == HTMLToken::Type::Character) { if (type() == HTMLToken::Type::Comment || type() == HTMLToken::Type::Character) {
builder.append(" { data: '"); builder.append(" { data: '");
builder.append(m_comment_or_character.data.to_string()); builder.append(m_comment_or_character.data.to_string());
builder.append(" }"); builder.append("' }");
} }
return builder.to_string(); return builder.to_string();

View file

@ -39,6 +39,7 @@ class HTMLToken {
public: public:
enum class Type { enum class Type {
Invalid,
DOCTYPE, DOCTYPE,
StartTag, StartTag,
EndTag, EndTag,
@ -101,7 +102,7 @@ private:
StringBuilder value_builder; StringBuilder value_builder;
}; };
Type m_type; Type m_type { Type::Invalid };
// Type::DOCTYPE // Type::DOCTYPE
struct { struct {

View file

@ -52,6 +52,7 @@
#define SWITCH_TO_AND_EMIT_CURRENT_TOKEN(new_state) \ #define SWITCH_TO_AND_EMIT_CURRENT_TOKEN(new_state) \
will_switch_to(State::new_state); \ will_switch_to(State::new_state); \
m_state = State::new_state; \ m_state = State::new_state; \
will_emit(m_current_token); \
return m_current_token; return m_current_token;
#define DONT_CONSUME_NEXT_INPUT_CHARACTER --m_cursor; #define DONT_CONSUME_NEXT_INPUT_CHARACTER --m_cursor;
@ -68,6 +69,9 @@
#define ON_ASCII_UPPER_ALPHA \ #define ON_ASCII_UPPER_ALPHA \
if (current_input_character.has_value() && current_input_character.value() >= 'A' && current_input_character.value() <= 'Z') if (current_input_character.has_value() && current_input_character.value() >= 'A' && current_input_character.value() <= 'Z')
// Opens an `if` that matches when there is a current input character and it
// lies in the range 'a'..'z'. The caller supplies the body that follows.
#define ON_ASCII_LOWER_ALPHA \
if (current_input_character.has_value() && current_input_character.value() >= 'a' && current_input_character.value() <= 'z')
#define ON_WHITESPACE \ #define ON_WHITESPACE \
if (current_input_character.has_value() && (current_input_character.value() == '\t' || current_input_character.value() == '\n' || current_input_character.value() == '\f' || current_input_character.value() == ' ')) if (current_input_character.has_value() && (current_input_character.value() == '\t' || current_input_character.value() == '\n' || current_input_character.value() == '\f' || current_input_character.value() == ' '))
@ -78,11 +82,22 @@
return {}; \ return {}; \
m_has_emitted_eof = true; \ m_has_emitted_eof = true; \
create_new_token(HTMLToken::Type::EndOfFile); \ create_new_token(HTMLToken::Type::EndOfFile); \
will_emit(m_current_token); \
return m_current_token; return m_current_token;
#define EMIT_CURRENT_TOKEN \ #define EMIT_CURRENT_TOKEN \
will_emit(m_current_token); \
return m_current_token; return m_current_token;
// Starts a new Character token via create_new_token(), appends `codepoint` to
// its data, notifies will_emit(), and returns the token from the enclosing
// function. This expands to several statements ending in `return`, so:
//  - use it only inside a braced block, and
//  - nothing written after it in the same block is ever executed.
#define EMIT_CHARACTER(codepoint) \
create_new_token(HTMLToken::Type::Character); \
m_current_token.m_comment_or_character.data.append(codepoint); \
will_emit(m_current_token); \
return m_current_token;
// Emits the current input character as a Character token (see EMIT_CHARACTER).
// Calls .value() on current_input_character, so it must only be used on paths
// where a character is known to be present (i.e. not at end of input).
// The expansion already ends with ';'.
#define EMIT_CURRENT_CHARACTER \
EMIT_CHARACTER(current_input_character.value());
#define BEGIN_STATE(state) \ #define BEGIN_STATE(state) \
state: \ state: \
case State::state: { \ case State::state: { \
@ -134,9 +149,7 @@ Optional<HTMLToken> HTMLTokenizer::next_token()
} }
ANYTHING_ELSE ANYTHING_ELSE
{ {
create_new_token(HTMLToken::Type::Character); EMIT_CURRENT_CHARACTER;
m_current_token.m_comment_or_character.data.append(current_input_character.value());
return m_current_token;
} }
} }
END_STATE END_STATE
@ -721,6 +734,99 @@ Optional<HTMLToken> HTMLTokenizer::next_token()
} }
END_STATE END_STATE
// RCDATA state: plain characters are emitted one Character token at a time;
// '&' and '<' divert to other states.
BEGIN_STATE(RCDATA)
{
ON('&')
{
// Record RCDATA as the return state before handing off.
// NOTE(review): presumably CharacterReference resumes in m_return_state — confirm.
m_return_state = State::RCDATA;
SWITCH_TO(CharacterReference);
}
ON('<')
{
SWITCH_TO(RCDATALessThanSign);
}
ON(0)
{
// NUL handling is not implemented; TODO() asserts.
TODO();
}
ON_EOF
{
EMIT_EOF;
}
ANYTHING_ELSE
{
EMIT_CURRENT_CHARACTER;
}
}
END_STATE
// RCDATA less-than sign state: reached from RCDATA after a '<' was consumed.
// "</" may begin an end tag; any other character means the '<' was literal
// text.
BEGIN_STATE(RCDATALessThanSign)
{
    ON('/')
    {
        // Start with an empty temporary buffer for the potential end tag name.
        m_temporary_buffer.clear();
        SWITCH_TO(RCDATAEndTagOpen);
    }
    ANYTHING_ELSE
    {
        // Emit '<' as a character token and reconsume the current character
        // in RCDATA. EMIT_CHARACTER returns from next_token(), so the state
        // change and the un-consume have to happen *before* it: a
        // RECONSUME_IN placed after the emit would never run, leaving the
        // tokenizer in this state and dropping the current character.
        will_reconsume_in(State::RCDATA);
        m_state = State::RCDATA;
        // Step back only if a character was actually consumed.
        // NOTE(review): assumes next_codepoint() does not advance m_cursor at
        // end of input, and advances it by one per character — confirm.
        if (current_input_character.has_value()) {
            DONT_CONSUME_NEXT_INPUT_CHARACTER;
        }
        EMIT_CHARACTER('<');
    }
}
END_STATE
// RCDATA end tag open state: reached from RCDATALessThanSign after "</".
BEGIN_STATE(RCDATAEndTagOpen)
{
ON_ASCII_ALPHA
{
// Begin an end tag token; the letter itself is reconsumed so that
// RCDATAEndTagName appends it to the tag name.
create_new_token(HTMLToken::Type::EndTag);
RECONSUME_IN(RCDATAEndTagName);
}
ANYTHING_ELSE
{
// Not implemented yet; TODO() asserts.
// FIXME: Emit a U+003C LESS-THAN SIGN character token and a U+002F SOLIDUS character token. Reconsume in the RCDATA state.
TODO();
}
}
END_STATE
// RCDATA end tag name state: accumulates the name of a potential end tag.
// Only the "letters followed by '>' with a matching start tag" path is
// implemented; every other path hits TODO() and asserts.
BEGIN_STATE(RCDATAEndTagName)
{
ON_WHITESPACE
{
TODO();
}
ON('/')
{
TODO();
}
ON('>')
{
// The end tag only counts if its name matches the last emitted start tag.
if (!current_end_tag_token_is_appropriate()) {
// FIXME: Otherwise, treat it as per the "anything else" entry below.
TODO();
}
// Emit the end tag token and continue tokenizing in the Data state.
SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
}
ON_ASCII_UPPER_ALPHA
{
// The tag name gets the lowercased letter; the temporary buffer keeps the
// character exactly as it appeared in the input.
m_current_token.m_tag.tag_name.append(tolower(current_input_character.value()));
m_temporary_buffer.append(current_input_character.value());
// Stay in this state (presumably `continue` re-enters the enclosing
// tokenizer loop for the next character — confirm).
continue;
}
ON_ASCII_LOWER_ALPHA
{
m_current_token.m_tag.tag_name.append(current_input_character.value());
m_temporary_buffer.append(current_input_character.value());
continue;
}
ANYTHING_ELSE
{
TODO();
}
}
END_STATE
default: default:
ASSERT_NOT_REACHED(); ASSERT_NOT_REACHED();
} }
@ -771,4 +877,26 @@ void HTMLTokenizer::will_reconsume_in([[maybe_unused]] State new_state)
#endif #endif
} }
// Forces the tokenizer into `new_state` on behalf of the parser (the visible
// caller is HTMLDocumentParser::handle_in_head, which passes `{}` as the badge).
// Unlike the SWITCH_TO_AND_EMIT_CURRENT_TOKEN macro, this does not call
// will_switch_to(); when TOKENIZER_TRACE is defined it prints its own trace
// line instead.
void HTMLTokenizer::switch_to(Badge<HTMLDocumentParser>, State new_state)
{
#ifdef TOKENIZER_TRACE
dbg() << "[" << state_name(m_state) << "] Parser switches tokenizer state to " << state_name(new_state);
#endif
m_state = new_state;
}
// Hook invoked just before a token is handed to the caller. Remembers the
// most recent start tag so that current_end_tag_token_is_appropriate() can
// compare end tag names against it; all other token types are ignored.
void HTMLTokenizer::will_emit(HTMLToken& token)
{
    if (!token.is_start_tag())
        return;

    m_last_emitted_start_tag = token;
}
// Returns true when the end tag currently being built has the same tag name
// as the last start tag that was emitted. Returns false if no start tag has
// been recorded yet. Asserts that the current token is an end tag.
bool HTMLTokenizer::current_end_tag_token_is_appropriate() const
{
    ASSERT(m_current_token.is_end_tag());

    const bool have_start_tag = m_last_emitted_start_tag.is_start_tag();
    return have_start_tag
        && m_current_token.tag_name() == m_last_emitted_start_tag.tag_name();
}
} }

View file

@ -28,6 +28,7 @@
#include <AK/StringView.h> #include <AK/StringView.h>
#include <AK/Types.h> #include <AK/Types.h>
#include <LibWeb/Forward.h>
#include <LibWeb/Parser/HTMLToken.h> #include <LibWeb/Parser/HTMLToken.h>
#define ENUMERATE_TOKENIZER_STATES \ #define ENUMERATE_TOKENIZER_STATES \
@ -118,20 +119,23 @@ class HTMLTokenizer {
public: public:
explicit HTMLTokenizer(const StringView& input); explicit HTMLTokenizer(const StringView& input);
enum class State {
#define __ENUMERATE_TOKENIZER_STATE(state) state,
ENUMERATE_TOKENIZER_STATES
#undef __ENUMERATE_TOKENIZER_STATE
};
Optional<HTMLToken> next_token(); Optional<HTMLToken> next_token();
void switch_to(Badge<HTMLDocumentParser>, State new_state);
private: private:
Optional<u32> next_codepoint(); Optional<u32> next_codepoint();
Optional<u32> peek_codepoint(size_t offset) const; Optional<u32> peek_codepoint(size_t offset) const;
bool next_few_characters_are(const StringView&) const; bool next_few_characters_are(const StringView&) const;
void consume(const StringView&); void consume(const StringView&);
void create_new_token(HTMLToken::Type); void create_new_token(HTMLToken::Type);
bool current_end_tag_token_is_appropriate() const;
enum class State {
#define __ENUMERATE_TOKENIZER_STATE(state) state,
ENUMERATE_TOKENIZER_STATES
#undef __ENUMERATE_TOKENIZER_STATE
};
static const char* state_name(State state) static const char* state_name(State state)
{ {
@ -145,17 +149,22 @@ private:
ASSERT_NOT_REACHED(); ASSERT_NOT_REACHED();
} }
void will_emit(HTMLToken&);
void will_switch_to(State); void will_switch_to(State);
void will_reconsume_in(State); void will_reconsume_in(State);
State m_state { State::Data }; State m_state { State::Data };
State m_return_state { State::Data }; State m_return_state { State::Data };
StringBuilder m_temporary_buffer;
StringView m_input; StringView m_input;
size_t m_cursor { 0 }; size_t m_cursor { 0 };
HTMLToken m_current_token; HTMLToken m_current_token;
HTMLToken m_last_emitted_start_tag;
bool m_has_emitted_eof { false }; bool m_has_emitted_eof { false };
}; };
} }

View file

@ -38,7 +38,7 @@ int main(int argc, char** argv)
Core::EventLoop loop; Core::EventLoop loop;
// This is a temporary test program to aid with bringing up the new HTML parser. :^) // This is a temporary test program to aid with bringing up the new HTML parser. :^)
const char* input_path = "/home/anon/www/simple.html"; const char* input_path = "/home/anon/www/welcome.html";
if (argc > 1) if (argc > 1)
input_path = argv[1]; input_path = argv[1];