diff --git a/Libraries/LibWeb/CMakeLists.txt b/Libraries/LibWeb/CMakeLists.txt index 631ff1e534..e2789da5aa 100644 --- a/Libraries/LibWeb/CMakeLists.txt +++ b/Libraries/LibWeb/CMakeLists.txt @@ -84,7 +84,9 @@ set(SOURCES Layout/LineBox.cpp Layout/LineBoxFragment.cpp Parser/CSSParser.cpp + Parser/HTMLDocumentParser.cpp Parser/HTMLParser.cpp + Parser/HTMLToken.cpp Parser/HTMLTokenizer.cpp ResourceLoader.cpp StylePropertiesModel.cpp diff --git a/Libraries/LibWeb/Parser/HTMLDocumentParser.cpp b/Libraries/LibWeb/Parser/HTMLDocumentParser.cpp new file mode 100644 index 0000000000..3d61d29004 --- /dev/null +++ b/Libraries/LibWeb/Parser/HTMLDocumentParser.cpp @@ -0,0 +1,263 @@ +/* + * Copyright (c) 2020, Andreas Kling + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include +#include + +namespace Web { +/* Builds a parser whose tokenizer is initialized with `input`. */ +HTMLDocumentParser::HTMLDocumentParser(const StringView& input) + : m_tokenizer(input) +{ +} + +HTMLDocumentParser::~HTMLDocumentParser() +{ +} +/* Allocates a new Document, then repeatedly takes the next token from the tokenizer, logs it together with the current insertion mode name via dbg(), and hands it to the handler for the current insertion mode. Returns when next_token() yields no value or the token type is EndOfFile; an insertion mode without a case in the switch hits ASSERT_NOT_REACHED(). */ +void HTMLDocumentParser::run() +{ + m_document = adopt(*new Document); + /* Pull tokens one at a time until the tokenizer has nothing more to give. */ + for (;;) { + auto optional_token = m_tokenizer.next_token(); + if (!optional_token.has_value()) + return; + auto& token = optional_token.value(); + + dbg() << "[" << insertion_mode_name() << "] " << token.to_string(); + /* An EndOfFile token ends parsing; it is never dispatched to an insertion-mode handler. */ + if (token.type() == HTMLToken::Type::EndOfFile) + return; + + switch (m_insertion_mode) { + case InsertionMode::Initial: + handle_initial(token); + break; + case InsertionMode::BeforeHTML: + handle_before_html(token); + break; + case InsertionMode::BeforeHead: + handle_before_head(token); + break; + case InsertionMode::InHead: + handle_in_head(token); + break; + case InsertionMode::InHeadNoscript: + handle_in_head_noscript(token); + break; + case InsertionMode::AfterHead: + handle_after_head(token); + break; + case InsertionMode::InBody: + handle_in_body(token); + break; + case InsertionMode::Text: + handle_text(token); + break; + default: + ASSERT_NOT_REACHED(); + } + } +} +/* Handler for InsertionMode::Initial: a DOCTYPE token produces a DocumentType node named after the token, which is appended to the document before the mode becomes BeforeHTML. Any other token hits ASSERT_NOT_REACHED(). */ +void HTMLDocumentParser::handle_initial(HTMLToken& token) +{ + if (token.type() == HTMLToken::Type::DOCTYPE) { + auto doctype = adopt(*new DocumentType(document())); + doctype->set_name(token.m_doctype.name.to_string()); + 
document().append_child(move(doctype)); + m_insertion_mode = InsertionMode::BeforeHTML; + return; + } + ASSERT_NOT_REACHED(); +} +/* Handler for InsertionMode::BeforeHTML: an "html" start tag creates the element, appends it to the document, pushes it on the stack of open elements and switches to BeforeHead. Any other token hits ASSERT_NOT_REACHED(). */ +void HTMLDocumentParser::handle_before_html(HTMLToken& token) +{ + if (token.is_start_tag() && token.tag_name() == "html") { + auto element = create_element_for(token); + document().append_child(element); + m_stack_of_open_elements.append(element); + m_insertion_mode = InsertionMode::BeforeHead; + return; + } + ASSERT_NOT_REACHED(); +} +/* Returns the last entry of the stack of open elements. */ +NonnullRefPtr HTMLDocumentParser::current_node() +{ + return m_stack_of_open_elements.last(); +} +/* Picks the node that new elements get appended to: always the current node for now; the m_foster_parenting case is unimplemented and asserts. */ +RefPtr HTMLDocumentParser::find_appropriate_place_for_inserting_node() +{ + auto target = current_node(); + if (m_foster_parenting) { + ASSERT_NOT_REACHED(); + } + return target; +} +/* Creates an element named after the token's tag name in this document and copies every attribute name/value pair from the token onto it. */ +NonnullRefPtr HTMLDocumentParser::create_element_for(HTMLToken& token) +{ + auto element = create_element(document(), token.tag_name()); + for (auto& attribute : token.m_tag.attributes) { + element->set_attribute(attribute.name_builder.to_string(), attribute.value_builder.to_string()); + } + return element; +} +/* Creates an element for `token`, appends it as a child of the insertion location, pushes it on the stack of open elements and returns it. */ +RefPtr HTMLDocumentParser::insert_html_element(HTMLToken& token) +{ + auto adjusted_insertion_location = find_appropriate_place_for_inserting_node(); + auto element = create_element_for(token); + // FIXME: Check if it's possible to insert `element` at `adjusted_insertion_location` + adjusted_insertion_location->append_child(element); + m_stack_of_open_elements.append(element); + return element; +} +/* Handler for InsertionMode::BeforeHead: a "head" start tag is inserted, remembered in m_head_element, and the mode becomes InHead. Any other token hits ASSERT_NOT_REACHED(). */ +void HTMLDocumentParser::handle_before_head(HTMLToken& token) +{ + if (token.is_start_tag() && token.tag_name() == "head") { + auto element = insert_html_element(token); + m_head_element = to(element); + m_insertion_mode = InsertionMode::InHead; + return; + } + ASSERT_NOT_REACHED(); +} +/* Handler for InsertionMode::InHead: a "meta" start tag is inserted and immediately popped off the stack of open elements (a self-closing one asserts); a "head" end tag pops the current node and switches to AfterHead. Any other token hits ASSERT_NOT_REACHED(). */ +void HTMLDocumentParser::handle_in_head(HTMLToken& token) +{ + if (token.is_start_tag() && token.tag_name() == "meta") { + auto element = insert_html_element(token); + m_stack_of_open_elements.take_last(); + if 
(token.is_self_closing()) { + ASSERT_NOT_REACHED(); + } + return; + } + if (token.is_end_tag() && token.tag_name() == "head") { + m_stack_of_open_elements.take_last(); + m_insertion_mode = InsertionMode::AfterHead; + return; + } + ASSERT_NOT_REACHED(); +} +/* Handler for InsertionMode::InHeadNoscript. Not implemented: every token hits ASSERT_NOT_REACHED(). */ +void HTMLDocumentParser::handle_in_head_noscript(HTMLToken&) +{ + ASSERT_NOT_REACHED(); +} +/* Handler for InsertionMode::AfterHead. Only the fallback path is implemented: "body", "html" and "br" end tags, and any start tag not matched by a branch below, cause a synthetic "body" element to be inserted and the mode to become InBody. Every branch that would need real handling hits ASSERT_NOT_REACHED(). */ +void HTMLDocumentParser::handle_after_head(HTMLToken& token) +{ + if (token.is_character()) { + ASSERT_NOT_REACHED(); + } + + if (token.is_comment()) { + ASSERT_NOT_REACHED(); + } + + if (token.is_doctype()) { + ASSERT_NOT_REACHED(); + } + + if (token.is_start_tag() && token.tag_name() == "html") { + ASSERT_NOT_REACHED(); + } + + if (token.is_start_tag() && token.tag_name() == "body") { + ASSERT_NOT_REACHED(); + } + + if (token.is_start_tag() && token.tag_name() == "frameset") { + ASSERT_NOT_REACHED(); + } + /* The HTML Standard lists these names as START tags in "after head"; matching them as end tags let e.g. a "title" start tag slip through to AnythingElse and fabricate a body. Handling is not implemented, so assert like the other unhandled branches. */ + { + Vector names = { "base", "basefont", "bgsound", "link", "meta", "noframes", "script", "style", "template", "title" }; + if (token.is_start_tag() && names.contains_slow(token.tag_name())) { + ASSERT_NOT_REACHED(); + } + } + + if (token.is_end_tag() && token.tag_name() == "template") { + ASSERT_NOT_REACHED(); + } + + if (token.is_end_tag() && (token.tag_name() == "body" || token.tag_name() == "html" || token.tag_name() == "br")) { + goto AnythingElse; + } + + if ((token.is_start_tag() && token.tag_name() == "head") || token.is_end_tag()) { + ASSERT_NOT_REACHED(); + } +/* Fallback: synthesize a "body" start tag, insert it and continue in InBody. The token that triggered the fallback is not handed to handle_in_body() here. */ +AnythingElse: + HTMLToken fake_body_token; + fake_body_token.m_type = HTMLToken::Type::StartTag; + fake_body_token.m_tag.tag_name.append("body"); + insert_html_element(fake_body_token); + m_insertion_mode = InsertionMode::InBody; +} +/* Handler for InsertionMode::InBody. Not implemented: every token hits ASSERT_NOT_REACHED(). */ +void HTMLDocumentParser::handle_in_body(HTMLToken&) +{ + ASSERT_NOT_REACHED(); +} +/* Handler for InsertionMode::Text. Not implemented: every token hits ASSERT_NOT_REACHED(). */ +void HTMLDocumentParser::handle_text(HTMLToken&) +{ + ASSERT_NOT_REACHED(); +} +/* Returns the name of the current insertion mode as a string literal; the cases are generated by expanding ENUMERATE_INSERTION_MODES. */ +const char* HTMLDocumentParser::insertion_mode_name() const +{ + switch (m_insertion_mode) { +#define __ENUMERATE_INSERTION_MODE(mode) \ + case 
InsertionMode::mode: \ + return #mode; + ENUMERATE_INSERTION_MODES +#undef __ENUMERATE_INSERTION_MODE + } + ASSERT_NOT_REACHED(); +} +/* Returns the document under construction by dereferencing m_document, which run() assigns before processing any token. */ +Document& HTMLDocumentParser::document() +{ + return *m_document; +} + +} diff --git a/Libraries/LibWeb/Parser/HTMLDocumentParser.h b/Libraries/LibWeb/Parser/HTMLDocumentParser.h new file mode 100644 index 0000000000..060cc31d45 --- /dev/null +++ b/Libraries/LibWeb/Parser/HTMLDocumentParser.h @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2020, Andreas Kling + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#pragma once + +#include +#include +#include +/* List of every insertion mode name. Users define __ENUMERATE_INSERTION_MODE(mode) before expanding this macro and undefine it afterwards. */ +#define ENUMERATE_INSERTION_MODES \ + __ENUMERATE_INSERTION_MODE(Initial) \ + __ENUMERATE_INSERTION_MODE(BeforeHTML) \ + __ENUMERATE_INSERTION_MODE(BeforeHead) \ + __ENUMERATE_INSERTION_MODE(InHead) \ + __ENUMERATE_INSERTION_MODE(InHeadNoscript) \ + __ENUMERATE_INSERTION_MODE(AfterHead) \ + __ENUMERATE_INSERTION_MODE(InBody) \ + __ENUMERATE_INSERTION_MODE(Text) \ + __ENUMERATE_INSERTION_MODE(InTable) \ + __ENUMERATE_INSERTION_MODE(InTableText) \ + __ENUMERATE_INSERTION_MODE(InCaption) \ + __ENUMERATE_INSERTION_MODE(InColumnGroup) \ + __ENUMERATE_INSERTION_MODE(InTableBody) \ + __ENUMERATE_INSERTION_MODE(InRow) \ + __ENUMERATE_INSERTION_MODE(InCell) \ + __ENUMERATE_INSERTION_MODE(InSelect) \ + __ENUMERATE_INSERTION_MODE(InSelectInTable) \ + __ENUMERATE_INSERTION_MODE(InTemplate) \ + __ENUMERATE_INSERTION_MODE(AfterBody) \ + __ENUMERATE_INSERTION_MODE(InFrameset) \ + __ENUMERATE_INSERTION_MODE(AfterFrameset) \ + __ENUMERATE_INSERTION_MODE(AfterAfterBody) \ + __ENUMERATE_INSERTION_MODE(AfterAfterFrameset) + +namespace Web { +/* Consumes tokens from an HTMLTokenizer and builds a Document from them, tracking the current insertion mode and a stack of open elements. */ +class HTMLDocumentParser { +public: + explicit HTMLDocumentParser(const StringView& input); + ~HTMLDocumentParser(); + /* Runs the parser over the input; the resulting tree is reachable through document(). */ + void run(); + + Document& document(); + /* One enumerator per entry of ENUMERATE_INSERTION_MODES. */ + enum class InsertionMode { +#define __ENUMERATE_INSERTION_MODE(mode) mode, + ENUMERATE_INSERTION_MODES +#undef __ENUMERATE_INSERTION_MODE + }; + + InsertionMode insertion_mode() const { return m_insertion_mode; } + +private: + const char* insertion_mode_name() const; + /* Per-insertion-mode token handlers that run() dispatches to. */ + void handle_initial(HTMLToken&); + void handle_before_html(HTMLToken&); + void handle_before_head(HTMLToken&); + void handle_in_head(HTMLToken&); + void handle_in_head_noscript(HTMLToken&); + void handle_after_head(HTMLToken&); + void handle_in_body(HTMLToken&); + void handle_text(HTMLToken&); + /* Helpers for creating elements from tokens and placing them in the tree. */ + NonnullRefPtr create_element_for(HTMLToken&); + RefPtr find_appropriate_place_for_inserting_node(); + RefPtr insert_html_element(HTMLToken&); + NonnullRefPtr 
current_node(); + /* Parser state: begins in the Initial insertion mode. */ + InsertionMode m_insertion_mode { InsertionMode::Initial }; + NonnullRefPtrVector m_stack_of_open_elements; + /* Source of tokens; constructed from the input handed to the constructor. */ + HTMLTokenizer m_tokenizer; + /* When true, find_appropriate_place_for_inserting_node() asserts (foster parenting is not implemented). */ + bool m_foster_parenting { false }; + /* m_document is assigned by run(); m_head_element is set when a "head" start tag is inserted. NOTE(review): m_form_element is not assigned in the HTMLDocumentParser.cpp shown in this change - confirm intended use. */ + RefPtr m_document; + RefPtr m_head_element; + RefPtr m_form_element; +}; + +} diff --git a/Libraries/LibWeb/Parser/HTMLToken.cpp b/Libraries/LibWeb/Parser/HTMLToken.cpp new file mode 100644 index 0000000000..88cf5991ca --- /dev/null +++ b/Libraries/LibWeb/Parser/HTMLToken.cpp @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2020, Andreas Kling + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include + +namespace Web { +/* Renders a human-readable description of this token for debug logging: the token type name, followed by the DOCTYPE name, the tag name with its attributes, or the comment/character data, depending on the type. */ +String HTMLToken::to_string() const +{ + StringBuilder builder; + /* First the type name (plus the name payload for DOCTYPE tokens). */ + switch (type()) { + case HTMLToken::Type::DOCTYPE: + builder.append("DOCTYPE"); + builder.append(" { name: '"); + builder.append(m_doctype.name.to_string()); + builder.append("' }"); + break; + case HTMLToken::Type::StartTag: + builder.append("StartTag"); + break; + case HTMLToken::Type::EndTag: + builder.append("EndTag"); + break; + case HTMLToken::Type::Comment: + builder.append("Comment"); + break; + case HTMLToken::Type::Character: + builder.append("Character"); + break; + case HTMLToken::Type::EndOfFile: + builder.append("EndOfFile"); + break; + } + /* Start and end tags share one payload format: the tag name and every attribute as name="value". */ + if (type() == HTMLToken::Type::StartTag || type() == HTMLToken::Type::EndTag) { + builder.append(" { name: '"); + builder.append(m_tag.tag_name.to_string()); + builder.append("', { "); + for (auto& attribute : m_tag.attributes) { + builder.append(attribute.name_builder.to_string()); + builder.append("=\""); + builder.append(attribute.value_builder.to_string()); + builder.append("\" "); + } + builder.append("} }"); + } + /* Comment and character tokens print their data; the closing quote matches the one opened before the data, as in the DOCTYPE branch. */ + if (type() == HTMLToken::Type::Comment || type() == HTMLToken::Type::Character) { + builder.append(" { data: '"); + builder.append(m_comment_or_character.data.to_string()); + builder.append("' }"); + } + + return builder.to_string(); + + //dbg() << "[" << String::format("%42s", state_name(m_state)) << "] " << builder.to_string(); + //m_current_token = {}; +} + +} diff --git a/Libraries/LibWeb/Parser/HTMLToken.h b/Libraries/LibWeb/Parser/HTMLToken.h index 13c773d270..e956c4ac27 100644 --- a/Libraries/LibWeb/Parser/HTMLToken.h +++ b/Libraries/LibWeb/Parser/HTMLToken.h @@ -34,6 +34,7 @@ namespace Web { class HTMLToken { + friend class HTMLDocumentParser; friend class HTMLTokenizer; public: @@ -46,8 +47,29 @@ public: EndOfFile, }; + bool is_doctype() const { return m_type == Type::DOCTYPE; } + bool is_start_tag() const { return m_type == Type::StartTag; } + bool is_end_tag() const { return 
m_type == Type::EndTag; } + bool is_comment() const { return m_type == Type::Comment; } + bool is_character() const { return m_type == Type::Character; } + bool is_end_of_file() const { return m_type == Type::EndOfFile; } + + String tag_name() const + { + ASSERT(is_start_tag() || is_end_tag()); + return m_tag.tag_name.to_string(); + } + + bool is_self_closing() const + { + ASSERT(is_start_tag() || is_end_tag()); + return m_tag.self_closing; + } + Type type() const { return m_type; } + String to_string() const; + private: struct AttributeBuilder { StringBuilder name_builder; diff --git a/Libraries/LibWeb/Parser/HTMLTokenizer.cpp b/Libraries/LibWeb/Parser/HTMLTokenizer.cpp index b5835446d3..7badf5af65 100644 --- a/Libraries/LibWeb/Parser/HTMLTokenizer.cpp +++ b/Libraries/LibWeb/Parser/HTMLTokenizer.cpp @@ -28,6 +28,8 @@ #include #include +#pragma GCC diagnostic ignored "-Wunused-label" + //#define TOKENIZER_TRACE #define TODO() \ @@ -47,6 +49,11 @@ m_state = State::new_state; \ goto new_state; +#define SWITCH_TO_AND_EMIT_CURRENT_TOKEN(new_state) \ + will_switch_to(State::new_state); \ + m_state = State::new_state; \ + return m_current_token; + #define DONT_CONSUME_NEXT_INPUT_CHARACTER --m_cursor; #define ON(codepoint) \ @@ -66,10 +73,12 @@ #define ANYTHING_ELSE if (1) -#define EMIT_EOF_AND_RETURN \ +#define EMIT_EOF \ create_new_token(HTMLToken::Type::EndOfFile); \ - emit_current_token(); \ - return; + return m_current_token; + +#define EMIT_CURRENT_TOKEN \ + return m_current_token; #define BEGIN_STATE(state) \ state: \ @@ -100,7 +109,7 @@ Optional HTMLTokenizer::peek_codepoint(size_t offset) const return m_input[m_cursor + offset]; } -void HTMLTokenizer::run() +Optional HTMLTokenizer::next_token() { for (;;) { auto current_input_character = next_codepoint(); @@ -118,7 +127,7 @@ void HTMLTokenizer::run() } ON_EOF { - EMIT_EOF_AND_RETURN; + EMIT_EOF; } ANYTHING_ELSE { @@ -168,8 +177,7 @@ void HTMLTokenizer::run() } ON('>') { - emit_current_token(); - SWITCH_TO(Data); 
+ SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data); } ANYTHING_ELSE { @@ -266,8 +274,7 @@ void HTMLTokenizer::run() } ON('>') { - emit_current_token(); - SWITCH_TO(Data); + SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data); } ON_ASCII_UPPER_ALPHA { @@ -297,8 +304,7 @@ void HTMLTokenizer::run() } ON('>') { - emit_current_token(); - SWITCH_TO(Data); + SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data); } ON_EOF { @@ -473,8 +479,7 @@ void HTMLTokenizer::run() } ON('>') { - emit_current_token(); - SWITCH_TO(Data); + SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data); } ON(0) { @@ -504,8 +509,7 @@ void HTMLTokenizer::run() } ON('>') { - emit_current_token(); - SWITCH_TO(Data); + SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data); } ON_EOF { @@ -588,8 +592,7 @@ void HTMLTokenizer::run() { ON('>') { - emit_current_token(); - SWITCH_TO(Data); + SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data); } ON('!') { @@ -741,57 +744,6 @@ bool HTMLTokenizer::next_few_characters_are(const StringView& string) const return true; } -void HTMLTokenizer::emit_current_token() -{ - StringBuilder builder; - - switch (m_current_token.type()) { - case HTMLToken::Type::DOCTYPE: - builder.append("DOCTYPE"); - builder.append(" { name: '"); - builder.append(m_current_token.m_doctype.name.to_string()); - builder.append("' }"); - break; - case HTMLToken::Type::StartTag: - builder.append("StartTag"); - break; - case HTMLToken::Type::EndTag: - builder.append("EndTag"); - break; - case HTMLToken::Type::Comment: - builder.append("Comment"); - break; - case HTMLToken::Type::Character: - builder.append("Character"); - break; - case HTMLToken::Type::EndOfFile: - builder.append("EndOfFile"); - break; - } - - if (m_current_token.type() == HTMLToken::Type::StartTag || m_current_token.type() == HTMLToken::Type::EndTag) { - builder.append(" { name: '"); - builder.append(m_current_token.m_tag.tag_name.to_string()); - builder.append("', { "); - for (auto& attribute : m_current_token.m_tag.attributes) { - builder.append(attribute.name_builder.to_string()); - builder.append("=\""); - 
builder.append(attribute.value_builder.to_string()); - builder.append("\" "); - } - builder.append("} }"); - } - - if (m_current_token.type() == HTMLToken::Type::Comment || m_current_token.type() == HTMLToken::Type::Character) { - builder.append(" { data: '"); - builder.append(m_current_token.m_comment_or_character.data.to_string()); - builder.append(" }"); - } - - dbg() << "[" << String::format("%42s", state_name(m_state)) << "] " << builder.to_string(); - m_current_token = {}; -} - void HTMLTokenizer::create_new_token(HTMLToken::Type type) { flush_current_character_or_comment_if_needed(); @@ -822,8 +774,8 @@ void HTMLTokenizer::will_reconsume_in([[maybe_unused]] State new_state) void HTMLTokenizer::flush_current_character_or_comment_if_needed() { - if (m_current_token.type() == HTMLToken::Type::Character || m_current_token.type() == HTMLToken::Type::Comment) - emit_current_token(); + //if (m_current_token.type() == HTMLToken::Type::Character || m_current_token.type() == HTMLToken::Type::Comment) +// emit_current_token(); } } diff --git a/Libraries/LibWeb/Parser/HTMLTokenizer.h b/Libraries/LibWeb/Parser/HTMLTokenizer.h index 5573cdd46c..2476e85be8 100644 --- a/Libraries/LibWeb/Parser/HTMLTokenizer.h +++ b/Libraries/LibWeb/Parser/HTMLTokenizer.h @@ -118,14 +118,13 @@ class HTMLTokenizer { public: explicit HTMLTokenizer(const StringView& input); - void run(); + Optional next_token(); private: Optional next_codepoint(); Optional peek_codepoint(size_t offset) const; bool next_few_characters_are(const StringView&) const; void consume(const StringView&); - void emit_current_token(); void create_new_token(HTMLToken::Type); enum class State { diff --git a/Userland/ht.cpp b/Userland/ht.cpp index 02c01913ce..03f33455a0 100644 --- a/Userland/ht.cpp +++ b/Userland/ht.cpp @@ -24,13 +24,19 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#include -#include #include #include +#include +#include +#include +#include +#include +#include int main(int argc, char** argv) { + Core::EventLoop loop; + // This is a temporary test program to aid with bringing up the new HTML parser. :^) const char* input_path = "/home/anon/www/simple.html"; if (argc > 1) @@ -40,7 +46,12 @@ int main(int argc, char** argv) if (file_or_error.is_error()) return 1; auto contents = file_or_error.value()->read_all(); - Web::HTMLTokenizer tokenizer(contents); - tokenizer.run(); + + Web::HTMLDocumentParser parser(contents); + parser.run(); + + auto& document = parser.document(); + Web::dump_tree(document); + return 0; }