From 5530040b3c990311141d721a3f9622db3abaa251 Mon Sep 17 00:00:00 2001 From: Andreas Kling Date: Sat, 29 Oct 2022 12:48:35 +0200 Subject: [PATCH] LibWeb: Annotate and simplify the HTML fragment parsing algorithm This patch adds inline spec comments, and then adjusts the code a bit so it reads more like the spec. --- .../LibWeb/HTML/Parser/HTMLParser.cpp | 69 +++++++++++++++---- 1 file changed, 56 insertions(+), 13 deletions(-) diff --git a/Userland/Libraries/LibWeb/HTML/Parser/HTMLParser.cpp b/Userland/Libraries/LibWeb/HTML/Parser/HTMLParser.cpp index 1348665048..289ced3ed7 100644 --- a/Userland/Libraries/LibWeb/HTML/Parser/HTMLParser.cpp +++ b/Userland/Libraries/LibWeb/HTML/Parser/HTMLParser.cpp @@ -3388,48 +3388,91 @@ DOM::Document& HTMLParser::document() return *m_document; } +// https://html.spec.whatwg.org/multipage/parsing.html#parsing-html-fragments Vector> HTMLParser::parse_html_fragment(DOM::Element& context_element, StringView markup) { + // 1. Create a new Document node, and mark it as being an HTML document. auto temp_document = DOM::Document::create(context_element.realm()); + temp_document->set_document_type(DOM::Document::Type::HTML); + + // 2. If the node document of the context element is in quirks mode, then let the Document be in quirks mode. + // Otherwise, the node document of the context element is in limited-quirks mode, then let the Document be in limited-quirks mode. + // Otherwise, leave the Document in no-quirks mode. + temp_document->set_quirks_mode(context_element.document().mode()); + + // 3. Create a new HTML parser, and associate it with the just created Document node. auto parser = HTMLParser::create(*temp_document, markup, "utf-8"); parser->m_context_element = JS::make_handle(context_element); parser->m_parsing_fragment = true; - parser->document().set_quirks_mode(context_element.document().mode()); + // 4. Set the state of the HTML parser's tokenization stage as follows, switching on the context element: + // - title + // - textarea if (context_element.local_name().is_one_of(HTML::TagNames::title, HTML::TagNames::textarea)) { + // Switch the tokenizer to the RCDATA state. parser->m_tokenizer.switch_to({}, HTMLTokenizer::State::RCDATA); - } else if (context_element.local_name().is_one_of(HTML::TagNames::style, HTML::TagNames::xmp, HTML::TagNames::iframe, HTML::TagNames::noembed, HTML::TagNames::noframes)) { + } + // - style + // - xmp + // - iframe + // - noembed + // - noframes + else if (context_element.local_name().is_one_of(HTML::TagNames::style, HTML::TagNames::xmp, HTML::TagNames::iframe, HTML::TagNames::noembed, HTML::TagNames::noframes)) { + // Switch the tokenizer to the RAWTEXT state. parser->m_tokenizer.switch_to({}, HTMLTokenizer::State::RAWTEXT); - } else if (context_element.local_name().is_one_of(HTML::TagNames::script)) { + } + // - script + else if (context_element.local_name().is_one_of(HTML::TagNames::script)) { + // Switch the tokenizer to the script data state. parser->m_tokenizer.switch_to({}, HTMLTokenizer::State::ScriptData); - } else if (context_element.local_name().is_one_of(HTML::TagNames::noscript)) { + } + // - noscript + else if (context_element.local_name().is_one_of(HTML::TagNames::noscript)) { + // If the scripting flag is enabled, switch the tokenizer to the RAWTEXT state. Otherwise, leave the tokenizer in the data state. if (context_element.document().is_scripting_enabled()) parser->m_tokenizer.switch_to({}, HTMLTokenizer::State::RAWTEXT); - } else if (context_element.local_name().is_one_of(HTML::TagNames::plaintext)) { + } + // - plaintext + else if (context_element.local_name().is_one_of(HTML::TagNames::plaintext)) { + // Switch the tokenizer to the PLAINTEXT state. parser->m_tokenizer.switch_to({}, HTMLTokenizer::State::PLAINTEXT); } + // Any other element + else { + // Leave the tokenizer in the data state. + } + // 5. Let root be a new html element with no attributes. auto root = create_element(context_element.document(), HTML::TagNames::html, Namespace::HTML); - parser->document().append_child(root); + + // 6. Append the element root to the Document node created above. + temp_document->append_child(root); + + // 7. Set up the parser's stack of open elements so that it contains just the single element root. parser->m_stack_of_open_elements.push(root); + // 8. If the context element is a template element, if (context_element.local_name() == HTML::TagNames::template_) { + // push "in template" onto the stack of template insertion modes so that it is the new current template insertion mode. parser->m_stack_of_template_insertion_modes.append(InsertionMode::InTemplate); } - // FIXME: Create a start tag token whose name is the local name of context and whose attributes are the attributes of context. + // FIXME: 9. Create a start tag token whose name is the local name of context and whose attributes are the attributes of context. + // Let this start tag token be the start tag token of the context node, e.g. for the purposes of determining if it is an HTML integration point. + // 10. Reset the parser's insertion mode appropriately. parser->reset_the_insertion_mode_appropriately(); - for (auto* form_candidate = &context_element; form_candidate; form_candidate = form_candidate->parent_element()) { - if (is(*form_candidate)) { - parser->m_form_element = JS::make_handle(verify_cast(*form_candidate)); - break; - } - } + // 11. Set the parser's form element pointer to the nearest node to the context element that is a form element + // (going straight up the ancestor chain, and including the element itself, if it is a form element), if any. + // (If there is no such form element, the form element pointer keeps its initial value, null.) + parser->m_form_element = context_element.first_ancestor_of_type(); + // 12. Place the input into the input stream for the HTML parser just created. The encoding confidence is irrelevant. + // 13. Start the parser and let it run until it has consumed all the characters just inserted into the input stream. parser->run(context_element.document().url()); + // 14. Return the child nodes of root, in tree order. Vector> children; while (JS::GCPtr child = root->first_child()) { root->remove_child(*child);