mirror of
https://github.com/RGBCube/serenity
synced 2025-07-25 21:57:43 +00:00
LibHTML: Add Comment and CharacterData nodes and improve HTML parsing
This patch adds the CharacterData subclass of Node, which is now the parent class of Text and of a new Comment class. A Comment node represents an HTML comment such as <!--hello friends-->. Since these occur somewhat frequently on the web, we need to be able to parse them. This patch also adds a child rejection mechanism to the DOM tree: nodes can now override is_child_allowed(const Node&) and return false if they don't want a particular Node to become a child of theirs. This is used to prevent Document from taking on unwanted children.
This commit is contained in:
parent
6d150df58a
commit
b083a233d8
15 changed files with 158 additions and 25 deletions
|
@ -1,6 +1,8 @@
|
||||||
|
<!DOCTYPE html>
|
||||||
<html>
|
<html>
|
||||||
<head>
|
<head>
|
||||||
<title>Welcome!</title>
|
<title>Welcome!</title>
|
||||||
|
<!-- this is a comment -->
|
||||||
<style type="text/css">
|
<style type="text/css">
|
||||||
body {
|
body {
|
||||||
background-color: #fff;
|
background-color: #fff;
|
||||||
|
|
11
Libraries/LibHTML/DOM/CharacterData.cpp
Normal file
11
Libraries/LibHTML/DOM/CharacterData.cpp
Normal file
|
@ -0,0 +1,11 @@
|
||||||
|
#include <LibHTML/DOM/CharacterData.h>
|
||||||
|
|
||||||
|
CharacterData::CharacterData(Document& document, NodeType type, const String& data)
|
||||||
|
: Node(document, type)
|
||||||
|
, m_data(data)
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
|
CharacterData::~CharacterData()
|
||||||
|
{
|
||||||
|
}
|
25
Libraries/LibHTML/DOM/CharacterData.h
Normal file
25
Libraries/LibHTML/DOM/CharacterData.h
Normal file
|
@ -0,0 +1,25 @@
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <AK/String.h>
|
||||||
|
#include <LibHTML/DOM/Node.h>
|
||||||
|
|
||||||
|
class CharacterData : public Node {
|
||||||
|
public:
|
||||||
|
virtual ~CharacterData() override;
|
||||||
|
|
||||||
|
const String& data() const { return m_data; }
|
||||||
|
|
||||||
|
virtual String text_content() const override { return m_data; }
|
||||||
|
|
||||||
|
protected:
|
||||||
|
explicit CharacterData(Document&, NodeType, const String&);
|
||||||
|
|
||||||
|
private:
|
||||||
|
String m_data;
|
||||||
|
};
|
||||||
|
|
||||||
|
template<>
|
||||||
|
inline bool is<CharacterData>(const Node& node)
|
||||||
|
{
|
||||||
|
return node.is_character_data();
|
||||||
|
}
|
11
Libraries/LibHTML/DOM/Comment.cpp
Normal file
11
Libraries/LibHTML/DOM/Comment.cpp
Normal file
|
@ -0,0 +1,11 @@
|
||||||
|
#include <LibHTML/DOM/Comment.h>
|
||||||
|
#include <LibHTML/Layout/LayoutText.h>
|
||||||
|
|
||||||
|
Comment::Comment(Document& document, const String& data)
|
||||||
|
: CharacterData(document, NodeType::COMMENT_NODE, data)
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
|
Comment::~Comment()
|
||||||
|
{
|
||||||
|
}
|
18
Libraries/LibHTML/DOM/Comment.h
Normal file
18
Libraries/LibHTML/DOM/Comment.h
Normal file
|
@ -0,0 +1,18 @@
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <AK/String.h>
|
||||||
|
#include <LibHTML/DOM/CharacterData.h>
|
||||||
|
|
||||||
|
class Comment final : public CharacterData {
|
||||||
|
public:
|
||||||
|
explicit Comment(Document&, const String&);
|
||||||
|
virtual ~Comment() override;
|
||||||
|
|
||||||
|
virtual String tag_name() const override { return "#comment"; }
|
||||||
|
};
|
||||||
|
|
||||||
|
template<>
|
||||||
|
inline bool is<Comment>(const Node& node)
|
||||||
|
{
|
||||||
|
return node.is_comment();
|
||||||
|
}
|
|
@ -29,6 +29,23 @@ StyleResolver& Document::style_resolver()
|
||||||
return *m_style_resolver;
|
return *m_style_resolver;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool Document::is_child_allowed(const Node& node) const
|
||||||
|
{
|
||||||
|
switch (node.type()) {
|
||||||
|
case NodeType::DOCUMENT_NODE:
|
||||||
|
case NodeType::TEXT_NODE:
|
||||||
|
return false;
|
||||||
|
case NodeType::COMMENT_NODE:
|
||||||
|
return true;
|
||||||
|
case NodeType::DOCUMENT_TYPE_NODE:
|
||||||
|
return !first_child_of_type<DocumentType>();
|
||||||
|
case NodeType::ELEMENT_NODE:
|
||||||
|
return !first_child_of_type<Element>();
|
||||||
|
default:
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
void Document::fixup()
|
void Document::fixup()
|
||||||
{
|
{
|
||||||
if (!is<DocumentType>(first_child()))
|
if (!is<DocumentType>(first_child()))
|
||||||
|
|
|
@ -67,6 +67,8 @@ public:
|
||||||
void invalidate_layout();
|
void invalidate_layout();
|
||||||
Function<void()> on_invalidate_layout;
|
Function<void()> on_invalidate_layout;
|
||||||
|
|
||||||
|
virtual bool is_child_allowed(const Node&) const override;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
virtual RefPtr<LayoutNode> create_layout_node(const StyleResolver&, const StyleProperties* parent_style) const override;
|
virtual RefPtr<LayoutNode> create_layout_node(const StyleResolver&, const StyleProperties* parent_style) const override;
|
||||||
|
|
||||||
|
|
|
@ -7,7 +7,7 @@ public:
|
||||||
explicit DocumentType(Document&);
|
explicit DocumentType(Document&);
|
||||||
virtual ~DocumentType() override;
|
virtual ~DocumentType() override;
|
||||||
|
|
||||||
virtual String tag_name() const override { return "!DOCTYPE"; }
|
virtual String tag_name() const override { return "#doctype"; }
|
||||||
};
|
};
|
||||||
|
|
||||||
template<>
|
template<>
|
||||||
|
|
|
@ -10,6 +10,7 @@ enum class NodeType : unsigned {
|
||||||
INVALID = 0,
|
INVALID = 0,
|
||||||
ELEMENT_NODE = 1,
|
ELEMENT_NODE = 1,
|
||||||
TEXT_NODE = 3,
|
TEXT_NODE = 3,
|
||||||
|
COMMENT_NODE = 8,
|
||||||
DOCUMENT_NODE = 9,
|
DOCUMENT_NODE = 9,
|
||||||
DOCUMENT_TYPE_NODE = 10,
|
DOCUMENT_TYPE_NODE = 10,
|
||||||
};
|
};
|
||||||
|
@ -32,6 +33,8 @@ public:
|
||||||
bool is_text() const { return type() == NodeType::TEXT_NODE; }
|
bool is_text() const { return type() == NodeType::TEXT_NODE; }
|
||||||
bool is_document() const { return type() == NodeType::DOCUMENT_NODE; }
|
bool is_document() const { return type() == NodeType::DOCUMENT_NODE; }
|
||||||
bool is_document_type() const { return type() == NodeType::DOCUMENT_TYPE_NODE; }
|
bool is_document_type() const { return type() == NodeType::DOCUMENT_TYPE_NODE; }
|
||||||
|
bool is_comment() const { return type() == NodeType::COMMENT_NODE; }
|
||||||
|
bool is_character_data() const { return type() == NodeType::TEXT_NODE || type() == NodeType::COMMENT_NODE; }
|
||||||
bool is_parent_node() const { return is_element() || is_document(); }
|
bool is_parent_node() const { return is_element() || is_document(); }
|
||||||
|
|
||||||
virtual RefPtr<LayoutNode> create_layout_node(const StyleResolver&, const StyleProperties* parent_style) const;
|
virtual RefPtr<LayoutNode> create_layout_node(const StyleResolver&, const StyleProperties* parent_style) const;
|
||||||
|
@ -66,6 +69,8 @@ public:
|
||||||
const Element* previous_element_sibling() const;
|
const Element* previous_element_sibling() const;
|
||||||
const Element* next_element_sibling() const;
|
const Element* next_element_sibling() const;
|
||||||
|
|
||||||
|
virtual bool is_child_allowed(const Node&) const { return true; }
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
Node(Document&, NodeType);
|
Node(Document&, NodeType);
|
||||||
|
|
||||||
|
|
|
@ -2,8 +2,7 @@
|
||||||
#include <LibHTML/Layout/LayoutText.h>
|
#include <LibHTML/Layout/LayoutText.h>
|
||||||
|
|
||||||
Text::Text(Document& document, const String& data)
|
Text::Text(Document& document, const String& data)
|
||||||
: Node(document, NodeType::TEXT_NODE)
|
: CharacterData(document, NodeType::TEXT_NODE, data)
|
||||||
, m_data(data)
|
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1,23 +1,17 @@
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include <AK/String.h>
|
#include <AK/String.h>
|
||||||
#include <LibHTML/DOM/Node.h>
|
#include <LibHTML/DOM/CharacterData.h>
|
||||||
|
|
||||||
class Text final : public Node {
|
class Text final : public CharacterData {
|
||||||
public:
|
public:
|
||||||
explicit Text(Document&, const String&);
|
explicit Text(Document&, const String&);
|
||||||
virtual ~Text() override;
|
virtual ~Text() override;
|
||||||
|
|
||||||
const String& data() const { return m_data; }
|
|
||||||
|
|
||||||
virtual String tag_name() const override { return "#text"; }
|
virtual String tag_name() const override { return "#text"; }
|
||||||
|
|
||||||
virtual String text_content() const override { return m_data; }
|
|
||||||
|
|
||||||
private:
|
private:
|
||||||
virtual RefPtr<LayoutNode> create_layout_node(const StyleResolver&, const StyleProperties* parent_style) const override;
|
virtual RefPtr<LayoutNode> create_layout_node(const StyleResolver&, const StyleProperties* parent_style) const override;
|
||||||
|
|
||||||
String m_data;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
template<>
|
template<>
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
#include <AK/Utf8View.h>
|
#include <AK/Utf8View.h>
|
||||||
#include <LibHTML/CSS/StyleSheet.h>
|
#include <LibHTML/CSS/StyleSheet.h>
|
||||||
|
#include <LibHTML/DOM/Comment.h>
|
||||||
#include <LibHTML/DOM/Document.h>
|
#include <LibHTML/DOM/Document.h>
|
||||||
#include <LibHTML/DOM/DocumentType.h>
|
#include <LibHTML/DOM/DocumentType.h>
|
||||||
#include <LibHTML/DOM/Element.h>
|
#include <LibHTML/DOM/Element.h>
|
||||||
|
@ -27,6 +28,8 @@ void dump_tree(const Node& node)
|
||||||
dbgprintf("\"%s\"\n", static_cast<const Text&>(node).data().characters());
|
dbgprintf("\"%s\"\n", static_cast<const Text&>(node).data().characters());
|
||||||
} else if (is<DocumentType>(node)) {
|
} else if (is<DocumentType>(node)) {
|
||||||
dbgprintf("<!DOCTYPE>\n");
|
dbgprintf("<!DOCTYPE>\n");
|
||||||
|
} else if (is<Comment>(node)) {
|
||||||
|
dbgprintf("<!--%s-->\n", to<Comment>(node).data().characters());
|
||||||
}
|
}
|
||||||
++indent;
|
++indent;
|
||||||
if (is<ParentNode>(node)) {
|
if (is<ParentNode>(node)) {
|
||||||
|
|
|
@ -17,6 +17,8 @@ LIBHTML_OBJS = \
|
||||||
DOM/HTMLBlinkElement.o \
|
DOM/HTMLBlinkElement.o \
|
||||||
DOM/HTMLBRElement.o \
|
DOM/HTMLBRElement.o \
|
||||||
DOM/Document.o \
|
DOM/Document.o \
|
||||||
|
DOM/CharacterData.o \
|
||||||
|
DOM/Comment.o \
|
||||||
DOM/Text.o \
|
DOM/Text.o \
|
||||||
DOM/DocumentType.o \
|
DOM/DocumentType.o \
|
||||||
DOM/ElementFactory.o \
|
DOM/ElementFactory.o \
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
#include <AK/Function.h>
|
#include <AK/Function.h>
|
||||||
#include <AK/NonnullRefPtrVector.h>
|
#include <AK/NonnullRefPtrVector.h>
|
||||||
#include <AK/StringBuilder.h>
|
#include <AK/StringBuilder.h>
|
||||||
|
#include <LibHTML/DOM/Comment.h>
|
||||||
#include <LibHTML/DOM/DocumentType.h>
|
#include <LibHTML/DOM/DocumentType.h>
|
||||||
#include <LibHTML/DOM/Element.h>
|
#include <LibHTML/DOM/Element.h>
|
||||||
#include <LibHTML/DOM/ElementFactory.h>
|
#include <LibHTML/DOM/ElementFactory.h>
|
||||||
|
@ -44,6 +45,8 @@ NonnullRefPtr<Document> parse_html(const StringView& html, const URL& url)
|
||||||
Free = 0,
|
Free = 0,
|
||||||
BeforeTagName,
|
BeforeTagName,
|
||||||
InTagName,
|
InTagName,
|
||||||
|
InDoctype,
|
||||||
|
InComment,
|
||||||
InAttributeList,
|
InAttributeList,
|
||||||
InAttributeName,
|
InAttributeName,
|
||||||
BeforeAttributeValue,
|
BeforeAttributeValue,
|
||||||
|
@ -101,19 +104,16 @@ NonnullRefPtr<Document> parse_html(const StringView& html, const URL& url)
|
||||||
close_tag();
|
close_tag();
|
||||||
};
|
};
|
||||||
|
|
||||||
auto handle_exclamation_tag = [&] {
|
auto commit_doctype = [&] {
|
||||||
auto name = String::copy(tag_name_buffer);
|
node_stack.last().append_child(adopt(*new DocumentType(document)), false);
|
||||||
tag_name_buffer.clear();
|
};
|
||||||
ASSERT(name == "DOCTYPE");
|
|
||||||
if (node_stack.size() != 1)
|
auto commit_comment = [&] {
|
||||||
node_stack[node_stack.size() - 2].append_child(adopt(*new DocumentType(document)), false);
|
node_stack.last().append_child(adopt(*new Comment(document, text_buffer.to_string())), false);
|
||||||
close_tag();
|
|
||||||
};
|
};
|
||||||
|
|
||||||
auto commit_tag = [&] {
|
auto commit_tag = [&] {
|
||||||
if (is_exclamation_tag)
|
if (is_slash_tag)
|
||||||
handle_exclamation_tag();
|
|
||||||
else if (is_slash_tag)
|
|
||||||
close_tag();
|
close_tag();
|
||||||
else
|
else
|
||||||
open_tag();
|
open_tag();
|
||||||
|
@ -124,12 +124,16 @@ NonnullRefPtr<Document> parse_html(const StringView& html, const URL& url)
|
||||||
};
|
};
|
||||||
|
|
||||||
for (int i = 0; i < html.length(); ++i) {
|
for (int i = 0; i < html.length(); ++i) {
|
||||||
|
auto peek = [&](int offset) -> char {
|
||||||
|
if (i + offset >= html.length())
|
||||||
|
return '\0';
|
||||||
|
return html[i + offset];
|
||||||
|
};
|
||||||
char ch = html[i];
|
char ch = html[i];
|
||||||
switch (state) {
|
switch (state) {
|
||||||
case State::Free:
|
case State::Free:
|
||||||
if (ch == '<') {
|
if (ch == '<') {
|
||||||
is_slash_tag = false;
|
is_slash_tag = false;
|
||||||
is_exclamation_tag = false;
|
|
||||||
move_to_state(State::BeforeTagName);
|
move_to_state(State::BeforeTagName);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
@ -165,7 +169,22 @@ NonnullRefPtr<Document> parse_html(const StringView& html, const URL& url)
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
if (ch == '!') {
|
if (ch == '!') {
|
||||||
is_exclamation_tag = true;
|
if (peek(1) == 'D'
|
||||||
|
&& peek(2) == 'O'
|
||||||
|
&& peek(3) == 'C'
|
||||||
|
&& peek(4) == 'T'
|
||||||
|
&& peek(5) == 'Y'
|
||||||
|
&& peek(6) == 'P'
|
||||||
|
&& peek(7) == 'E') {
|
||||||
|
i += 7;
|
||||||
|
move_to_state(State::InDoctype);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (peek(1) == '-' && peek(2) == '-') {
|
||||||
|
i += 2;
|
||||||
|
move_to_state(State::InComment);
|
||||||
|
break;
|
||||||
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
if (ch == '>') {
|
if (ch == '>') {
|
||||||
|
@ -188,6 +207,22 @@ NonnullRefPtr<Document> parse_html(const StringView& html, const URL& url)
|
||||||
}
|
}
|
||||||
tag_name_buffer.append(ch);
|
tag_name_buffer.append(ch);
|
||||||
break;
|
break;
|
||||||
|
case State::InDoctype:
|
||||||
|
if (ch == '>') {
|
||||||
|
commit_doctype();
|
||||||
|
move_to_state(State::Free);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case State::InComment:
|
||||||
|
if (ch == '-' && peek(1) == '-' && peek(2) == '>') {
|
||||||
|
commit_comment();
|
||||||
|
i += 2;
|
||||||
|
move_to_state(State::Free);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
text_buffer.append(ch);
|
||||||
|
break;
|
||||||
case State::InAttributeList:
|
case State::InAttributeList:
|
||||||
if (ch == '>') {
|
if (ch == '>') {
|
||||||
commit_tag();
|
commit_tag();
|
||||||
|
|
|
@ -50,8 +50,10 @@ public:
|
||||||
void append_child(NonnullRefPtr<T> node, bool call_inserted_into = true);
|
void append_child(NonnullRefPtr<T> node, bool call_inserted_into = true);
|
||||||
void donate_all_children_to(T& node);
|
void donate_all_children_to(T& node);
|
||||||
|
|
||||||
|
bool is_child_allowed(const T&) const { return true; }
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
TreeNode() { }
|
TreeNode() {}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
int m_ref_count { 1 };
|
int m_ref_count { 1 };
|
||||||
|
@ -66,6 +68,10 @@ template<typename T>
|
||||||
inline void TreeNode<T>::append_child(NonnullRefPtr<T> node, bool call_inserted_into)
|
inline void TreeNode<T>::append_child(NonnullRefPtr<T> node, bool call_inserted_into)
|
||||||
{
|
{
|
||||||
ASSERT(!node->m_parent);
|
ASSERT(!node->m_parent);
|
||||||
|
|
||||||
|
if (!static_cast<T*>(this)->is_child_allowed(*node))
|
||||||
|
return;
|
||||||
|
|
||||||
if (m_last_child)
|
if (m_last_child)
|
||||||
m_last_child->m_next_sibling = node.ptr();
|
m_last_child->m_next_sibling = node.ptr();
|
||||||
node->m_previous_sibling = m_last_child;
|
node->m_previous_sibling = m_last_child;
|
||||||
|
@ -82,6 +88,10 @@ template<typename T>
|
||||||
inline void TreeNode<T>::prepend_child(NonnullRefPtr<T> node, bool call_inserted_into)
|
inline void TreeNode<T>::prepend_child(NonnullRefPtr<T> node, bool call_inserted_into)
|
||||||
{
|
{
|
||||||
ASSERT(!node->m_parent);
|
ASSERT(!node->m_parent);
|
||||||
|
|
||||||
|
if (!static_cast<T*>(this)->is_child_allowed(*node))
|
||||||
|
return;
|
||||||
|
|
||||||
if (m_first_child)
|
if (m_first_child)
|
||||||
m_first_child->m_previous_sibling = node.ptr();
|
m_first_child->m_previous_sibling = node.ptr();
|
||||||
node->m_next_sibling = m_first_child;
|
node->m_next_sibling = m_first_child;
|
||||||
|
@ -112,7 +122,6 @@ inline void TreeNode<T>::donate_all_children_to(T& node)
|
||||||
m_last_child = nullptr;
|
m_last_child = nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
template<typename T>
|
template<typename T>
|
||||||
inline bool TreeNode<T>::is_ancestor_of(const TreeNode<T>& other) const
|
inline bool TreeNode<T>::is_ancestor_of(const TreeNode<T>& other) const
|
||||||
{
|
{
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue