From 5cdb39440056c88485cd28b25f0f1bb2f6872462 Mon Sep 17 00:00:00 2001 From: Andreas Kling Date: Mon, 3 Jul 2023 10:00:20 +0200 Subject: [PATCH] LibWeb: Make HTML parser flush all pending tokens in "in table text" There were multiple bugs in the parsing algorithm for handling text occurring inside a `table` element: - When there was pending non-whitespace text inside a table, we only flushed one token instead of all pending tokens. - Also, we didn't even flush one of the right tokens, but instead the token that caused the flush to happen. - Once we started flushing the right tokens, it turned out we had not yet implemented character insertion points expressed as "before X". - Finally, we were not exiting the "in table text" mode after flushing pending tokens, effectively getting us stuck in that mode until EOF. --- .../html-parser-text-in-table-hoisting.txt | 1 + .../html-parser-text-in-table-hoisting.html | 10 +++++++ .../LibWeb/HTML/Parser/HTMLParser.cpp | 26 ++++++++++--------- 3 files changed, 25 insertions(+), 12 deletions(-) create mode 100644 Tests/LibWeb/Text/expected/html-parser-text-in-table-hoisting.txt create mode 100644 Tests/LibWeb/Text/input/html-parser-text-in-table-hoisting.html diff --git a/Tests/LibWeb/Text/expected/html-parser-text-in-table-hoisting.txt b/Tests/LibWeb/Text/expected/html-parser-text-in-table-hoisting.txt new file mode 100644 index 0000000000..fb8524e50e --- /dev/null +++ b/Tests/LibWeb/Text/expected/html-parser-text-in-table-hoisting.txt @@ -0,0 +1 @@ +PASS \ No newline at end of file diff --git a/Tests/LibWeb/Text/input/html-parser-text-in-table-hoisting.html b/Tests/LibWeb/Text/input/html-parser-text-in-table-hoisting.html new file mode 100644 index 0000000000..61176585af --- /dev/null +++ b/Tests/LibWeb/Text/input/html-parser-text-in-table-hoisting.html @@ -0,0 +1,10 @@ + +PASS
+ diff --git a/Userland/Libraries/LibWeb/HTML/Parser/HTMLParser.cpp b/Userland/Libraries/LibWeb/HTML/Parser/HTMLParser.cpp index f2475d80e5..36cda26e14 100644 --- a/Userland/Libraries/LibWeb/HTML/Parser/HTMLParser.cpp +++ b/Userland/Libraries/LibWeb/HTML/Parser/HTMLParser.cpp @@ -1001,7 +1001,11 @@ DOM::Text* HTMLParser::find_character_insertion_node() { auto adjusted_insertion_location = find_appropriate_place_for_inserting_node(); if (adjusted_insertion_location.insert_before_sibling) { - TODO(); + if (adjusted_insertion_location.insert_before_sibling->previous_sibling() && adjusted_insertion_location.insert_before_sibling->previous_sibling()->is_text()) + return static_cast<DOM::Text*>(adjusted_insertion_location.insert_before_sibling->previous_sibling()); + auto new_text_node = realm().heap().allocate<DOM::Text>(realm(), document(), "").release_allocated_value_but_fixme_should_propagate_errors(); + adjusted_insertion_location.parent->insert_before(*new_text_node, *adjusted_insertion_location.insert_before_sibling); + return new_text_node; } if (adjusted_insertion_location.parent->is_document()) return nullptr; @@ -2661,20 +2665,18 @@ void HTMLParser::handle_in_table_text(HTMLToken& token) // are character tokens that are not ASCII whitespace, then this is a parse error: // reprocess the character tokens in the pending table character tokens list using // the rules given in the "anything else" entry in the "in table" insertion mode. 
- for (auto& pending_token : m_pending_table_character_tokens) { - VERIFY(pending_token.is_character()); - if (!pending_token.is_parser_whitespace()) { - log_parse_error(); + if (any_of(m_pending_table_character_tokens, [](auto const& token) { return !token.is_parser_whitespace(); })) { + log_parse_error(); + for (auto& pending_token : m_pending_table_character_tokens) { m_foster_parenting = true; - process_using_the_rules_for(InsertionMode::InBody, token); + process_using_the_rules_for(InsertionMode::InBody, pending_token); m_foster_parenting = false; - return; } - } - - // Otherwise, insert the characters given by the pending table character tokens list. - for (auto& pending_token : m_pending_table_character_tokens) { - insert_character(pending_token.code_point()); + } else { + // Otherwise, insert the characters given by the pending table character tokens list. + for (auto& pending_token : m_pending_table_character_tokens) { + insert_character(pending_token.code_point()); + } } // Switch the insertion mode to the original insertion mode and reprocess the token.