diff --git a/Base/home/anon/small.html b/Base/home/anon/small.html
index f44d46836e..eebdeadb19 100644
--- a/Base/home/anon/small.html
+++ b/Base/home/anon/small.html
@@ -1,6 +1,7 @@
Small test page
-
+
+ Hello friends!
This is a very small test page :^)
diff --git a/LibHTML/Dump.cpp b/LibHTML/Dump.cpp
index 8826eb4444..595769fa8f 100644
--- a/LibHTML/Dump.cpp
+++ b/LibHTML/Dump.cpp
@@ -12,7 +12,11 @@ void dump_tree(Node& node)
if (node.is_document()) {
printf("*Document*\n");
} else if (node.is_element()) {
- printf("<%s>\n", static_cast(node).tag_name().characters());
+ printf("<%s", static_cast(node).tag_name().characters());
+ static_cast(node).for_each_attribute([](auto& name, auto& value) {
+ printf(" %s=%s", name.characters(), value.characters());
+ });
+ printf(">\n");
} else if (node.is_text()) {
printf("\"%s\"\n", static_cast(node).data().characters());
}
diff --git a/LibHTML/Element.cpp b/LibHTML/Element.cpp
index 5c284b6f74..263774409d 100644
--- a/LibHTML/Element.cpp
+++ b/LibHTML/Element.cpp
@@ -10,3 +10,40 @@ Element::~Element()
{
}
+// Mutable lookup: returns a pointer to the stored attribute whose name equals
+// `name`, or nullptr when the element carries no such attribute.
+Attribute* Element::find_attribute(const String& name)
+{
+    for (int index = 0; index < m_attributes.size(); ++index) {
+        auto& candidate = m_attributes[index];
+        if (candidate.name() == name)
+            return &candidate;
+    }
+    return nullptr;
+}
+
+// Read-only lookup: walks the attribute list in order and returns the first
+// entry whose name equals `name`; nullptr signals "not present".
+const Attribute* Element::find_attribute(const String& name) const
+{
+    for (auto& candidate : m_attributes) {
+        if (!(candidate.name() == name))
+            continue;
+        return &candidate;
+    }
+    return nullptr;
+}
+
+// Returns a copy of the value stored for attribute `name`. When the element
+// has no attribute by that name, a default-constructed String is returned.
+String Element::attribute(const String& name) const
+{
+    const Attribute* found = find_attribute(name);
+    if (!found)
+        return { };
+    return found->value();
+}
+
+// Sets attribute `name` to `value`: an existing attribute is updated in place,
+// otherwise a new { name, value } entry is appended to the attribute list.
+void Element::set_attribute(const String& name, const String& value)
+{
+    Attribute* existing = find_attribute(name);
+    if (!existing) {
+        m_attributes.append({ name, value });
+        return;
+    }
+    existing->set_value(value);
+}
+
+// Replaces this element's whole attribute list with `attributes`. The argument
+// is taken by rvalue reference and moved into m_attributes, so whatever the
+// element held before is discarded.
+void Element::set_attributes(Vector&& attributes)
+{
+ m_attributes = move(attributes);
+}
diff --git a/LibHTML/Element.h b/LibHTML/Element.h
index 5b9f390603..c21c0f085b 100644
--- a/LibHTML/Element.h
+++ b/LibHTML/Element.h
@@ -11,6 +11,11 @@ public:
{
}
+ // Read-only access to the attribute's name (m_name).
+ const String& name() const { return m_name; }
+ // Read-only access to the attribute's current value (m_value).
+ const String& value() const { return m_value; }
+
+ // Overwrites the stored value; the name is left untouched.
+ void set_value(const String& value) { m_value = value; }
+
private:
String m_name;
String m_value;
@@ -23,7 +28,22 @@ public:
const String& tag_name() const { return m_tag_name; }
+ // Returns the value of the attribute called `name`, or a default-constructed
+ // String when the element has no such attribute.
+ String attribute(const String& name) const;
+ // Updates the attribute called `name` if it exists, otherwise appends a new one.
+ void set_attribute(const String& name, const String& value);
+
+ // Replaces the entire attribute list by moving from the argument.
+ void set_attributes(Vector&&);
+
+ // Invokes callback(name, value) once per attribute, in the order they are stored.
+ // NOTE(review): the parameter list after `template` appears to have been lost
+ // in extraction; presumably `<typename Callback>` — confirm against the real header.
+ template
+ void for_each_attribute(Callback callback)
+ {
+ for (auto& attribute : m_attributes)
+ callback(attribute.name(), attribute.value());
+ }
+
private:
+ Attribute* find_attribute(const String& name);
+ const Attribute* find_attribute(const String& name) const;
+
String m_tag_name;
Vector m_attributes;
};
diff --git a/LibHTML/Parser.cpp b/LibHTML/Parser.cpp
index ea6dd65b95..53541570ff 100644
--- a/LibHTML/Parser.cpp
+++ b/LibHTML/Parser.cpp
@@ -2,12 +2,18 @@
#include
#include
#include
+#include
+// Allocates a new Element for `tag_name` with `new` and returns the result of
+// passing it through adopt().
static Retained create_element(const String& tag_name)
{
return adopt(*new Element(tag_name));
}
+// Returns true when `ch` may appear inside an attribute name: any character
+// isalnum() accepts, plus '_' and '-'.
+static bool is_valid_in_attribute_name(char ch)
+{
+    // The <ctype.h> classifiers require a value representable as unsigned char
+    // (or EOF). Plain char may be signed, so bytes >= 0x80 (e.g. UTF-8 input)
+    // would otherwise be passed as negative ints, which is undefined behaviour.
+    return isalnum(static_cast<unsigned char>(ch)) || ch == '_' || ch == '-';
+}
+
static bool is_self_closing_tag(const String& tag_name)
{
return tag_name == "area"
@@ -34,11 +40,12 @@ Retained parse(const String& html)
node_stack.append(doc);
enum class State {
- Free,
+ Free = 0,
BeforeTagName,
InTagName,
InAttributeList,
InAttributeName,
+ BeforeAttributeValue,
InAttributeValueNoQuote,
InAttributeValueSingleQuote,
InAttributeValueDoubleQuote,
@@ -46,19 +53,33 @@ Retained parse(const String& html)
auto state = State::Free;
- Vector buffer;
+ Vector text_buffer;
+
+ Vector tag_name_buffer;
+
+ Vector attributes;
+ Vector attribute_name_buffer;
+ Vector attribute_value_buffer;
bool is_slash_tag = false;
auto move_to_state = [&](State new_state) {
- if (new_state == State::BeforeTagName)
+ if (new_state == State::BeforeTagName) {
is_slash_tag = false;
- if (state == State::Free && !buffer.is_empty()) {
- auto text_node = adopt(*new Text(String::copy(buffer)));
+ tag_name_buffer.clear();
+ attributes.clear();
+ }
+ if (new_state == State::InAttributeName)
+ attribute_name_buffer.clear();
+ if (new_state == State::BeforeAttributeValue)
+ attribute_value_buffer.clear();
+ if (state == State::Free && !text_buffer.is_empty()) {
+ auto text_node = adopt(*new Text(String::copy(text_buffer)));
+ text_buffer.clear();
node_stack.last()->append_child(text_node);
}
state = new_state;
- buffer.clear();
+ text_buffer.clear();
};
auto close_tag = [&] {
@@ -67,7 +88,9 @@ Retained parse(const String& html)
};
auto open_tag = [&] {
- auto new_element = create_element(String::copy(buffer));
+ auto new_element = create_element(String::copy(tag_name_buffer));
+ tag_name_buffer.clear();
+ new_element->set_attributes(move(attributes));
node_stack.append(new_element);
if (node_stack.size() != 1)
node_stack[node_stack.size() - 2]->append_child(new_element);
@@ -76,15 +99,27 @@ Retained parse(const String& html)
close_tag();
};
+    // Finishes the tag currently being parsed: a tag that began with '/'
+    // closes the current element, anything else opens a new one.
+    auto commit_tag = [&] {
+        if (!is_slash_tag) {
+            open_tag();
+            return;
+        }
+        close_tag();
+    };
+
+    // Snapshots the name and value buffers into Strings and records them as
+    // one attribute of the tag currently being parsed.
+    auto commit_attribute = [&] {
+        auto attribute_name = String::copy(attribute_name_buffer);
+        auto attribute_value = String::copy(attribute_value_buffer);
+        attributes.append({ attribute_name, attribute_value });
+    };
+
for (int i = 0; i < html.length(); ++i) {
char ch = html[i];
switch (state) {
case State::Free:
if (ch == '<') {
+ is_slash_tag = false;
move_to_state(State::BeforeTagName);
break;
}
- buffer.append(ch);
+ text_buffer.append(ch);
break;
case State::BeforeTagName:
if (ch == '/') {
@@ -95,25 +130,105 @@ Retained parse(const String& html)
move_to_state(State::Free);
break;
}
- if (!isascii(ch))
+ if (!isalpha(ch))
break;
move_to_state(State::InTagName);
[[fallthrough]];
case State::InTagName:
- if (ch == ' ') {
+ if (isspace(ch)) {
move_to_state(State::InAttributeList);
break;
}
if (ch == '>') {
- if (is_slash_tag)
- close_tag();
- else
- open_tag();
+ commit_tag();
move_to_state(State::Free);
break;
}
- buffer.append(ch);
+ tag_name_buffer.append(ch);
break;
+        // Between attributes: skip everything until the tag ends or a letter
+        // starts the next attribute name.
+        case State::InAttributeList:
+            if (ch == '>') {
+                commit_tag();
+                move_to_state(State::Free);
+                break;
+            }
+            // Cast first: ctype classifiers are undefined for negative char values.
+            if (!isalpha(static_cast<unsigned char>(ch)))
+                break;
+            // Entering InAttributeName clears attribute_name_buffer; the current
+            // character is then consumed by the case below.
+            move_to_state(State::InAttributeName);
+            [[fallthrough]];
+        case State::InAttributeName:
+            if (is_valid_in_attribute_name(ch)) {
+                attribute_name_buffer.append(ch);
+                break;
+            }
+            if (isspace(static_cast<unsigned char>(ch))) {
+                // Valueless attribute (e.g. "disabled"). The value buffer is only
+                // reset on entering BeforeAttributeValue, so clear it here or the
+                // previous attribute's value would be reused.
+                attribute_value_buffer.clear();
+                commit_attribute();
+                // Leave the name state so the next attribute starts with a fresh
+                // name buffer instead of being appended to this one.
+                // Limitation: "name = value" with whitespace before '=' is treated
+                // as a valueless attribute followed by unrelated input.
+                move_to_state(State::InAttributeList);
+                break;
+            }
+
+            if (ch == '>') {
+                // The tag ends right after a valueless attribute: record the
+                // attribute before the tag is committed, otherwise it is lost.
+                attribute_value_buffer.clear();
+                commit_attribute();
+                commit_tag();
+                move_to_state(State::Free);
+                break;
+            }
+
+            if (ch == '=') {
+                // Entering BeforeAttributeValue clears attribute_value_buffer.
+                move_to_state(State::BeforeAttributeValue);
+                break;
+            }
+            // Any other character is ignored.
+            break;
+        // Just after '=': decide how the value is delimited.
+        case State::BeforeAttributeValue:
+            if (ch == '\'') {
+                move_to_state(State::InAttributeValueSingleQuote);
+                break;
+            }
+            if (ch == '"') {
+                move_to_state(State::InAttributeValueDoubleQuote);
+                break;
+            }
+            if (ch == '>') {
+                // "name=" immediately followed by '>': keep the attribute with an
+                // empty value rather than dropping it.
+                commit_attribute();
+                commit_tag();
+                move_to_state(State::Free);
+                break;
+            }
+            if (isspace(static_cast<unsigned char>(ch))) {
+                commit_attribute();
+                move_to_state(State::InAttributeList);
+                break;
+            }
+            // Anything else is the first character of an unquoted value. The value
+            // buffer was cleared on entry to this state, so switch states and keep
+            // the character.
+            move_to_state(State::InAttributeValueNoQuote);
+            attribute_value_buffer.append(ch);
+            break;
+        // Quoted values run until the matching quote character.
+        case State::InAttributeValueSingleQuote:
+            if (ch == '\'') {
+                commit_attribute();
+                move_to_state(State::InAttributeList);
+                break;
+            }
+            attribute_value_buffer.append(ch);
+            break;
+        case State::InAttributeValueDoubleQuote:
+            if (ch == '"') {
+                commit_attribute();
+                move_to_state(State::InAttributeList);
+                break;
+            }
+            attribute_value_buffer.append(ch);
+            break;
+        // Unquoted values run until whitespace or the end of the tag.
+        case State::InAttributeValueNoQuote:
+            if (isspace(static_cast<unsigned char>(ch))) {
+                commit_attribute();
+                move_to_state(State::InAttributeList);
+                break;
+            }
+            if (ch == '>') {
+                // Record the pending attribute before the tag is committed.
+                commit_attribute();
+                commit_tag();
+                move_to_state(State::Free);
+                break;
+            }
+            attribute_value_buffer.append(ch);
+            break;
+ default:
+ fprintf(stderr, "Unhandled state %d\n", (int)state);
+ ASSERT_NOT_REACHED();
}
}
return doc;