From 20911efd4d4a8bfe461171625b21e6bf85e6ecd2 Mon Sep 17 00:00:00 2001 From: Andreas Kling Date: Sun, 24 May 2020 20:24:43 +0200 Subject: [PATCH] LibWeb: More work on the HTML parser and tokenizer The parser can now switch the state of the tokenizer! Very webby. :^) --- Libraries/LibWeb/Forward.h | 1 + .../LibWeb/Parser/HTMLDocumentParser.cpp | 32 ++++- Libraries/LibWeb/Parser/HTMLDocumentParser.h | 1 + Libraries/LibWeb/Parser/HTMLToken.cpp | 4 +- Libraries/LibWeb/Parser/HTMLToken.h | 3 +- Libraries/LibWeb/Parser/HTMLTokenizer.cpp | 136 +++++++++++++++++- Libraries/LibWeb/Parser/HTMLTokenizer.h | 21 ++- Userland/ht.cpp | 2 +- 8 files changed, 186 insertions(+), 14 deletions(-) diff --git a/Libraries/LibWeb/Forward.h b/Libraries/LibWeb/Forward.h index 2ebcac7f2e..3431118c7e 100644 --- a/Libraries/LibWeb/Forward.h +++ b/Libraries/LibWeb/Forward.h @@ -37,6 +37,7 @@ class EventTarget; class Frame; class HTMLBodyElement; class HTMLCanvasElement; +class HTMLDocumentParser; class HTMLElement; class HTMLFormElement; class HTMLHeadElement; diff --git a/Libraries/LibWeb/Parser/HTMLDocumentParser.cpp b/Libraries/LibWeb/Parser/HTMLDocumentParser.cpp index 090d33e9a7..a318d2a895 100644 --- a/Libraries/LibWeb/Parser/HTMLDocumentParser.cpp +++ b/Libraries/LibWeb/Parser/HTMLDocumentParser.cpp @@ -34,6 +34,11 @@ #include #include +#define TODO() \ + do { \ + ASSERT_NOT_REACHED(); \ + } while (0) + namespace Web { HTMLDocumentParser::HTMLDocumentParser(const StringView& input) @@ -176,6 +181,19 @@ void HTMLDocumentParser::handle_before_head(HTMLToken& token) void HTMLDocumentParser::handle_in_head(HTMLToken& token) { + if (token.is_parser_whitespace()) { + insert_character(token.codepoint()); + return; + } + + if (token.is_start_tag() && token.tag_name() == "title") { + insert_html_element(token); + m_tokenizer.switch_to({}, HTMLTokenizer::State::RCDATA); + m_original_insertion_mode = m_insertion_mode; + m_insertion_mode = InsertionMode::Text; + return; + } + if (token.is_start_tag() && token.tag_name() == "meta") { auto element = insert_html_element(token); m_stack_of_open_elements.pop(); @@ -381,8 +399,20 @@ void HTMLDocumentParser::handle_in_body(HTMLToken& token) ASSERT_NOT_REACHED(); } -void HTMLDocumentParser::handle_text(HTMLToken&) +void HTMLDocumentParser::handle_text(HTMLToken& token) { + if (token.is_character()) { + insert_character(token.codepoint()); + return; + } + if (token.is_end_tag() && token.tag_name() == "script") { + ASSERT_NOT_REACHED(); + } + if (token.is_end_tag()) { + m_stack_of_open_elements.pop(); + m_insertion_mode = m_original_insertion_mode; + return; + } ASSERT_NOT_REACHED(); } diff --git a/Libraries/LibWeb/Parser/HTMLDocumentParser.h b/Libraries/LibWeb/Parser/HTMLDocumentParser.h index 5d3725591e..f8cd1f28e1 100644 --- a/Libraries/LibWeb/Parser/HTMLDocumentParser.h +++ b/Libraries/LibWeb/Parser/HTMLDocumentParser.h @@ -100,6 +100,7 @@ private: void process_using_the_rules_for(InsertionMode, HTMLToken&); InsertionMode m_insertion_mode { InsertionMode::Initial }; + InsertionMode m_original_insertion_mode { InsertionMode::Initial }; StackOfOpenElements m_stack_of_open_elements; diff --git a/Libraries/LibWeb/Parser/HTMLToken.cpp b/Libraries/LibWeb/Parser/HTMLToken.cpp index 88cf5991ca..587d1ae02d 100644 --- a/Libraries/LibWeb/Parser/HTMLToken.cpp +++ b/Libraries/LibWeb/Parser/HTMLToken.cpp @@ -54,6 +54,8 @@ String HTMLToken::to_string() const case HTMLToken::Type::EndOfFile: builder.append("EndOfFile"); break; + case HTMLToken::Type::Invalid: + ASSERT_NOT_REACHED(); } if (type() == HTMLToken::Type::StartTag || type() == HTMLToken::Type::EndTag) { @@ -72,7 +74,7 @@ String HTMLToken::to_string() const if (type() == HTMLToken::Type::Comment || type() == HTMLToken::Type::Character) { builder.append(" { data: '"); builder.append(m_comment_or_character.data.to_string()); - builder.append(" }"); + builder.append("' }"); } return builder.to_string(); diff --git a/Libraries/LibWeb/Parser/HTMLToken.h b/Libraries/LibWeb/Parser/HTMLToken.h index 2a64c06478..aeeb92e0d2 100644 --- a/Libraries/LibWeb/Parser/HTMLToken.h +++ b/Libraries/LibWeb/Parser/HTMLToken.h @@ -39,6 +39,7 @@ class HTMLToken { public: enum class Type { + Invalid, DOCTYPE, StartTag, EndTag, @@ -101,7 +102,7 @@ private: StringBuilder value_builder; }; - Type m_type; + Type m_type { Type::Invalid }; // Type::DOCTYPE struct { diff --git a/Libraries/LibWeb/Parser/HTMLTokenizer.cpp b/Libraries/LibWeb/Parser/HTMLTokenizer.cpp index 5c46e36249..a364fc8af6 100644 --- a/Libraries/LibWeb/Parser/HTMLTokenizer.cpp +++ b/Libraries/LibWeb/Parser/HTMLTokenizer.cpp @@ -52,6 +52,7 @@ #define SWITCH_TO_AND_EMIT_CURRENT_TOKEN(new_state) \ will_switch_to(State::new_state); \ m_state = State::new_state; \ + will_emit(m_current_token); \ return m_current_token; #define DONT_CONSUME_NEXT_INPUT_CHARACTER --m_cursor; @@ -68,6 +69,9 @@ #define ON_ASCII_UPPER_ALPHA \ if (current_input_character.has_value() && current_input_character.value() >= 'A' && current_input_character.value() <= 'Z') +#define ON_ASCII_LOWER_ALPHA \ + if (current_input_character.has_value() && current_input_character.value() >= 'a' && current_input_character.value() <= 'z') + #define ON_WHITESPACE \ if (current_input_character.has_value() && (current_input_character.value() == '\t' || current_input_character.value() == '\n' || current_input_character.value() == '\f' || current_input_character.value() == ' ')) @@ -78,11 +82,22 @@ return {}; \ m_has_emitted_eof = true; \ create_new_token(HTMLToken::Type::EndOfFile); \ + will_emit(m_current_token); \ return m_current_token; -#define EMIT_CURRENT_TOKEN \ +#define EMIT_CURRENT_TOKEN \ + will_emit(m_current_token); \ return m_current_token; +#define EMIT_CHARACTER(codepoint) \ + create_new_token(HTMLToken::Type::Character); \ + m_current_token.m_comment_or_character.data.append(codepoint); \ + will_emit(m_current_token); \ + return m_current_token; + +#define EMIT_CURRENT_CHARACTER \ + EMIT_CHARACTER(current_input_character.value()); + #define BEGIN_STATE(state) \ state: \ case State::state: { \ @@ -134,9 +149,7 @@ Optional HTMLTokenizer::next_token() } ANYTHING_ELSE { - create_new_token(HTMLToken::Type::Character); - m_current_token.m_comment_or_character.data.append(current_input_character.value()); - return m_current_token; + EMIT_CURRENT_CHARACTER; } } END_STATE @@ -721,6 +734,99 @@ Optional HTMLTokenizer::next_token() } END_STATE + BEGIN_STATE(RCDATA) + { + ON('&') + { + m_return_state = State::RCDATA; + SWITCH_TO(CharacterReference); + } + ON('<') + { + SWITCH_TO(RCDATALessThanSign); + } + ON(0) + { + TODO(); + } + ON_EOF + { + EMIT_EOF; + } + ANYTHING_ELSE + { + EMIT_CURRENT_CHARACTER; + } + } + END_STATE + + BEGIN_STATE(RCDATALessThanSign) + { + ON('/') + { + m_temporary_buffer.clear(); + SWITCH_TO(RCDATAEndTagOpen); + } + ANYTHING_ELSE + { + EMIT_CHARACTER('<'); + RECONSUME_IN(RCDATA); + } + } + END_STATE + + BEGIN_STATE(RCDATAEndTagOpen) + { + ON_ASCII_ALPHA + { + create_new_token(HTMLToken::Type::EndTag); + RECONSUME_IN(RCDATAEndTagName); + } + ANYTHING_ELSE + { + // FIXME: Emit a U+003C LESS-THAN SIGN character token and a U+002F SOLIDUS character token. Reconsume in the RCDATA state. + TODO(); + } + } + END_STATE + + BEGIN_STATE(RCDATAEndTagName) + { + ON_WHITESPACE + { + TODO(); + } + ON('/') + { + TODO(); + } + ON('>') + { + if (!current_end_tag_token_is_appropriate()) { + // FIXME: Otherwise, treat it as per the "anything else" entry below. + TODO(); + } + SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data); + } + ON_ASCII_UPPER_ALPHA + { + m_current_token.m_tag.tag_name.append(tolower(current_input_character.value())); + m_temporary_buffer.append(current_input_character.value()); + continue; + } + ON_ASCII_LOWER_ALPHA + { + m_current_token.m_tag.tag_name.append(current_input_character.value()); + m_temporary_buffer.append(current_input_character.value()); + continue; + } + ANYTHING_ELSE + { + TODO(); + } + } + END_STATE + default: ASSERT_NOT_REACHED(); } @@ -771,4 +877,26 @@ void HTMLTokenizer::will_reconsume_in([[maybe_unused]] State new_state) #endif } +void HTMLTokenizer::switch_to(Badge, State new_state) +{ +#ifdef TOKENIZER_TRACE + dbg() << "[" << state_name(m_state) << "] Parser switches tokenizer state to " << state_name(new_state); +#endif + m_state = new_state; +} + +void HTMLTokenizer::will_emit(HTMLToken& token) +{ + if (token.is_start_tag()) + m_last_emitted_start_tag = token; +} + +bool HTMLTokenizer::current_end_tag_token_is_appropriate() const +{ + ASSERT(m_current_token.is_end_tag()); + if (!m_last_emitted_start_tag.is_start_tag()) + return false; + return m_current_token.tag_name() == m_last_emitted_start_tag.tag_name(); +} + } diff --git a/Libraries/LibWeb/Parser/HTMLTokenizer.h b/Libraries/LibWeb/Parser/HTMLTokenizer.h index b01b0e200e..3f6aa4f191 100644 --- a/Libraries/LibWeb/Parser/HTMLTokenizer.h +++ b/Libraries/LibWeb/Parser/HTMLTokenizer.h @@ -28,6 +28,7 @@ #include #include +#include #include #define ENUMERATE_TOKENIZER_STATES \ @@ -118,20 +119,23 @@ class HTMLTokenizer { public: explicit HTMLTokenizer(const StringView& input); + enum class State { +#define __ENUMERATE_TOKENIZER_STATE(state) state, + ENUMERATE_TOKENIZER_STATES +#undef __ENUMERATE_TOKENIZER_STATE + }; + Optional next_token(); + void switch_to(Badge, State new_state); + private: Optional next_codepoint(); Optional peek_codepoint(size_t offset) const; bool next_few_characters_are(const StringView&) const; void consume(const StringView&); void create_new_token(HTMLToken::Type); - - enum class State { -#define __ENUMERATE_TOKENIZER_STATE(state) state, - ENUMERATE_TOKENIZER_STATES -#undef __ENUMERATE_TOKENIZER_STATE - }; + bool current_end_tag_token_is_appropriate() const; static const char* state_name(State state) { @@ -145,17 +149,22 @@ private: ASSERT_NOT_REACHED(); } + void will_emit(HTMLToken&); void will_switch_to(State); void will_reconsume_in(State); State m_state { State::Data }; State m_return_state { State::Data }; + StringBuilder m_temporary_buffer; + StringView m_input; size_t m_cursor { 0 }; HTMLToken m_current_token; + HTMLToken m_last_emitted_start_tag; + bool m_has_emitted_eof { false }; }; } diff --git a/Userland/ht.cpp b/Userland/ht.cpp index 03f33455a0..27667530cb 100644 --- a/Userland/ht.cpp +++ b/Userland/ht.cpp @@ -38,7 +38,7 @@ int main(int argc, char** argv) Core::EventLoop loop; // This is a temporary test program to aid with bringing up the new HTML parser. :^) - const char* input_path = "/home/anon/www/simple.html"; + const char* input_path = "/home/anon/www/welcome.html"; if (argc > 1) input_path = argv[1];