From 31db3f21ae9da7bd87f0d4a8ca41a86953a41335 Mon Sep 17 00:00:00 2001
From: Andreas Kling <kling@serenityos.org>
Date: Sun, 24 May 2020 19:51:50 +0200
Subject: [PATCH] LibWeb: Start implementing character token parsing

Now that we've gotten rid of the misguided character buffering in the
tokenizer, it actually spits out character tokens that we have to deal
with in the parser.

This patch implements enough to bring us back to speed with simple.html
---
 Libraries/LibWeb/DOM/CharacterData.h          |   1 +
 .../LibWeb/Parser/HTMLDocumentParser.cpp      | 134 +++++++++++++-----
 Libraries/LibWeb/Parser/HTMLDocumentParser.h  |   6 +
 Libraries/LibWeb/Parser/HTMLToken.h           |  25 ++++
 4 files changed, 132 insertions(+), 34 deletions(-)

diff --git a/Libraries/LibWeb/DOM/CharacterData.h b/Libraries/LibWeb/DOM/CharacterData.h
index dece5fe338..3589069db5 100644
--- a/Libraries/LibWeb/DOM/CharacterData.h
+++ b/Libraries/LibWeb/DOM/CharacterData.h
@@ -36,6 +36,7 @@ public:
     virtual ~CharacterData() override;
 
     const String& data() const { return m_data; }
+    void set_data(const String& data) { m_data = data; } // Overwrites m_data with the given string.
 
     virtual String text_content() const override { return m_data; }
 
diff --git a/Libraries/LibWeb/Parser/HTMLDocumentParser.cpp b/Libraries/LibWeb/Parser/HTMLDocumentParser.cpp
index 3ebd4e329e..090d33e9a7 100644
--- a/Libraries/LibWeb/Parser/HTMLDocumentParser.cpp
+++ b/Libraries/LibWeb/Parser/HTMLDocumentParser.cpp
@@ -24,11 +24,13 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
+#include <AK/Utf32View.h>
 #include <LibWeb/DOM/Document.h>
 #include <LibWeb/DOM/DocumentType.h>
 #include <LibWeb/DOM/ElementFactory.h>
 #include <LibWeb/DOM/HTMLFormElement.h>
 #include <LibWeb/DOM/HTMLHeadElement.h>
+#include <LibWeb/DOM/Text.h>
 #include <LibWeb/Parser/HTMLDocumentParser.h>
 #include <LibWeb/Parser/HTMLToken.h>
 
@@ -54,41 +56,45 @@ void HTMLDocumentParser::run()
         auto& token = optional_token.value();
 
         dbg() << "[" << insertion_mode_name() << "] " << token.to_string();
+        process_using_the_rules_for(m_insertion_mode, token);
+    }
+}
 
-        switch (m_insertion_mode) {
-        case InsertionMode::Initial:
-            handle_initial(token);
-            break;
-        case InsertionMode::BeforeHTML:
-            handle_before_html(token);
-            break;
-        case InsertionMode::BeforeHead:
-            handle_before_head(token);
-            break;
-        case InsertionMode::InHead:
-            handle_in_head(token);
-            break;
-        case InsertionMode::InHeadNoscript:
-            handle_in_head_noscript(token);
-            break;
-        case InsertionMode::AfterHead:
-            handle_after_head(token);
-            break;
-        case InsertionMode::InBody:
-            handle_in_body(token);
-            break;
-        case InsertionMode::AfterBody:
-            handle_after_body(token);
-            break;
-        case InsertionMode::AfterAfterBody:
-            handle_after_after_body(token);
-            break;
-        case InsertionMode::Text:
-            handle_text(token);
-            break;
-        default:
-            ASSERT_NOT_REACHED();
-        }
+void HTMLDocumentParser::process_using_the_rules_for(InsertionMode mode, HTMLToken& token)
+{
+    switch (mode) { // Dispatch on the mode passed in, which callers may choose independently of m_insertion_mode.
+    case InsertionMode::Initial:
+        handle_initial(token);
+        break;
+    case InsertionMode::BeforeHTML:
+        handle_before_html(token);
+        break;
+    case InsertionMode::BeforeHead:
+        handle_before_head(token);
+        break;
+    case InsertionMode::InHead:
+        handle_in_head(token);
+        break;
+    case InsertionMode::InHeadNoscript:
+        handle_in_head_noscript(token);
+        break;
+    case InsertionMode::AfterHead:
+        handle_after_head(token);
+        break;
+    case InsertionMode::InBody:
+        handle_in_body(token);
+        break;
+    case InsertionMode::AfterBody:
+        handle_after_body(token);
+        break;
+    case InsertionMode::AfterAfterBody:
+        handle_after_after_body(token);
+        break;
+    case InsertionMode::Text:
+        handle_text(token);
+        break;
+    default:
+        ASSERT_NOT_REACHED(); // Any mode without a case above ends up here.
     }
 }
 
@@ -106,6 +112,10 @@ void HTMLDocumentParser::handle_initial(HTMLToken& token)
 
 void HTMLDocumentParser::handle_before_html(HTMLToken& token)
 {
+    if (token.is_character() && token.is_parser_whitespace()) {
+        return;
+    }
+
     if (token.is_start_tag() && token.tag_name() == "html") {
         auto element = create_element_for(token);
         document().append_child(element);
@@ -151,6 +161,10 @@ RefPtr<Element> HTMLDocumentParser::insert_html_element(HTMLToken& token)
 
 void HTMLDocumentParser::handle_before_head(HTMLToken& token)
 {
+    if (token.is_character() && token.is_parser_whitespace()) {
+        return;
+    }
+
     if (token.is_start_tag() && token.tag_name() == "head") {
         auto element = insert_html_element(token);
         m_head_element = to<HTMLHeadElement>(element);
@@ -183,9 +197,32 @@ void HTMLDocumentParser::handle_in_head_noscript(HTMLToken&)
     ASSERT_NOT_REACHED();
 }
 
+// Inserts code point `data` at the appropriate insertion place, growing a trailing Text child if any.
+void HTMLDocumentParser::insert_character(u32 data)
+{
+    auto parent = find_appropriate_place_for_inserting_node();
+    if (parent->is_document())
+        return; // Nothing is inserted when the insertion place is the document node.
+    auto last_child = parent->last_child();
+    bool extends_existing_text = last_child && last_child->is_text();
+    StringBuilder text;
+    if (extends_existing_text)
+        text.append(to<Text>(*last_child).data());
+    text.append(Utf32View { &data, 1 });
+    if (extends_existing_text)
+        to<Text>(*last_child).set_data(text.to_string());
+    else
+        parent->append_child(adopt(*new Text(document(), text.to_string())));
+}
+
 void HTMLDocumentParser::handle_after_head(HTMLToken& token)
 {
     if (token.is_character()) {
+        if (token.is_parser_whitespace()) {
+            insert_character(token.codepoint());
+            return;
+        }
+
         ASSERT_NOT_REACHED();
     }
 
@@ -249,6 +286,11 @@ void HTMLDocumentParser::generate_implied_end_tags()
 
 void HTMLDocumentParser::handle_after_body(HTMLToken& token)
 {
+    if (token.is_character() && token.is_parser_whitespace()) {
+        process_using_the_rules_for(InsertionMode::InBody, token);
+        return;
+    }
+
     if (token.is_end_tag() && token.tag_name() == "html") {
         if (m_parsing_fragment) {
             ASSERT_NOT_REACHED();
@@ -261,6 +303,11 @@ void HTMLDocumentParser::handle_after_body(HTMLToken& token)
 
 void HTMLDocumentParser::handle_after_after_body(HTMLToken& token)
 {
+    if (token.is_doctype() || token.is_parser_whitespace() || (token.is_start_tag() && token.tag_name() == "html")) {
+        process_using_the_rules_for(InsertionMode::InBody, token);
+        return;
+    }
+
     if (token.is_end_of_file()) {
         dbg() << "Stop parsing! :^)";
         return;
@@ -268,8 +315,27 @@ void HTMLDocumentParser::handle_after_after_body(HTMLToken& token)
     ASSERT_NOT_REACHED();
 }
 
+// Stub: an empty list of active formatting elements needs no work, so this is a
+// no-op in that case. A non-empty list is not handled yet and hits the assertion.
+void HTMLDocumentParser::reconstruct_the_active_formatting_elements()
+{
+    if (!m_list_of_active_formatting_elements.is_empty())
+        ASSERT_NOT_REACHED();
+}
+
 void HTMLDocumentParser::handle_in_body(HTMLToken& token)
 {
+    if (token.is_character()) {
+        if (token.codepoint() == 0) {
+            ASSERT_NOT_REACHED();
+        }
+        if (token.is_parser_whitespace()) {
+            reconstruct_the_active_formatting_elements();
+            insert_character(token.codepoint());
+            return;
+        }
+    }
+
     if (token.is_end_tag() && token.tag_name() == "body") {
         if (!m_stack_of_open_elements.has_in_scope("body")) {
             ASSERT_NOT_REACHED();
diff --git a/Libraries/LibWeb/Parser/HTMLDocumentParser.h b/Libraries/LibWeb/Parser/HTMLDocumentParser.h
index a31aa96dd0..5d3725591e 100644
--- a/Libraries/LibWeb/Parser/HTMLDocumentParser.h
+++ b/Libraries/LibWeb/Parser/HTMLDocumentParser.h
@@ -95,10 +95,16 @@ private:
     RefPtr<Node> find_appropriate_place_for_inserting_node();
     RefPtr<Element> insert_html_element(HTMLToken&);
     Element& current_node();
+    void insert_character(u32 data);
+    void reconstruct_the_active_formatting_elements();
+    void process_using_the_rules_for(InsertionMode, HTMLToken&);
 
     InsertionMode m_insertion_mode { InsertionMode::Initial };
+
     StackOfOpenElements m_stack_of_open_elements;
 
+    NonnullRefPtrVector<Element> m_list_of_active_formatting_elements;
+
     HTMLTokenizer m_tokenizer;
 
     bool m_foster_parenting { false };
diff --git a/Libraries/LibWeb/Parser/HTMLToken.h b/Libraries/LibWeb/Parser/HTMLToken.h
index e956c4ac27..2a64c06478 100644
--- a/Libraries/LibWeb/Parser/HTMLToken.h
+++ b/Libraries/LibWeb/Parser/HTMLToken.h
@@ -54,6 +54,31 @@ public:
     bool is_character() const { return m_type == Type::Character; }
     bool is_end_of_file() const { return m_type == Type::EndOfFile; }
 
+    u32 codepoint() const // Returns the single code point held by a character token.
+    {
+        ASSERT(is_character());
+        // FIXME: Handle non-ASCII codepoints properly (only one-byte character data is accepted for now).
+        ASSERT(m_comment_or_character.data.length() == 1);
+        return static_cast<u8>(m_comment_or_character.data.string_view()[0]); // u8 first: a signed char >= 0x80 must not sign-extend.
+    }
+
+    // Returns true when this token is a character token holding one of the
+    // five characters the parser treats as whitespace: tab, line feed,
+    // form feed, carriage return or space.
+    // NOTE: The parser considers '\r' to be whitespace, while the tokenizer does not.
+    bool is_parser_whitespace() const
+    {
+        // Non-character tokens are never whitespace; codepoint() would assert on them.
+        if (!is_character())
+            return false;
+        const u32 value = codepoint();
+        return value == '\t'
+            || value == '\n'
+            || value == '\f'
+            || value == '\r'
+            || value == ' ';
+    }
+
     String tag_name() const
     {
         ASSERT(is_start_tag() || is_end_tag());