diff --git a/Base/home/anon/small.html b/Base/home/anon/small.html index f44d46836e..eebdeadb19 100644 --- a/Base/home/anon/small.html +++ b/Base/home/anon/small.html @@ -1,6 +1,7 @@ Small test page - + +

Hello friends!

This is a very small test page :^)

diff --git a/LibHTML/Dump.cpp b/LibHTML/Dump.cpp index 8826eb4444..595769fa8f 100644 --- a/LibHTML/Dump.cpp +++ b/LibHTML/Dump.cpp @@ -12,7 +12,11 @@ void dump_tree(Node& node) if (node.is_document()) { printf("*Document*\n"); } else if (node.is_element()) { - printf("<%s>\n", static_cast(node).tag_name().characters()); + /* Element nodes print as the tag name followed by one space-separated name=value pair per attribute (values are printed without quotes), then the closing angle bracket. */ printf("<%s", static_cast(node).tag_name().characters()); + static_cast(node).for_each_attribute([](auto& name, auto& value) { + printf(" %s=%s", name.characters(), value.characters()); + }); + printf(">\n"); } else if (node.is_text()) { printf("\"%s\"\n", static_cast(node).data().characters()); } diff --git a/LibHTML/Element.cpp b/LibHTML/Element.cpp index 5c284b6f74..263774409d 100644 --- a/LibHTML/Element.cpp +++ b/LibHTML/Element.cpp @@ -10,3 +10,40 @@ Element::~Element() { } +/* Linear scan of m_attributes: returns a pointer to the first attribute whose name() equals the given name, or nullptr when none matches. */ Attribute* Element::find_attribute(const String& name) +{ + for (auto& attribute : m_attributes) { + if (attribute.name() == name) + return &attribute; + } + return nullptr; +} + +/* Const overload of find_attribute: same scan, but yields a pointer to a const Attribute. */ const Attribute* Element::find_attribute(const String& name) const +{ + for (auto& attribute : m_attributes) { + if (attribute.name() == name) + return &attribute; + } + return nullptr; +} + +/* Returns a copy of the value of the named attribute, or a brace-initialized empty String when the element has no such attribute. */ String Element::attribute(const String& name) const +{ + if (auto* attribute = find_attribute(name)) + return attribute->value(); + return { }; +} + +/* Overwrites the value of an existing attribute with this name; otherwise appends a new name/value entry to m_attributes. */ void Element::set_attribute(const String& name, const String& value) +{ + if (auto* attribute = find_attribute(name)) + attribute->set_value(value); + else + m_attributes.append({ name, value }); +} + +/* Replaces the whole attribute list by moving from the argument; nothing from the previous list is merged. */ void Element::set_attributes(Vector&& attributes) +{ + m_attributes = move(attributes); +} diff --git a/LibHTML/Element.h b/LibHTML/Element.h index 5b9f390603..c21c0f085b 100644 --- a/LibHTML/Element.h +++ b/LibHTML/Element.h @@ -11,6 +11,11 @@ public: { } + /* Read accessors return references to the stored strings; this hunk adds a setter for the value only. */ const String& name() const { return m_name; } + const String& value() const { return m_value; } + + void set_value(const String& value) { m_value = value; } + private: String m_name; String m_value; @@ 
-23,7 +28,22 @@ public: const String& tag_name() const { return m_tag_name; } + /* Attribute lookup by name; returns a String by value. */ String attribute(const String& name) const; + void set_attribute(const String& name, const String& value); + + void set_attributes(Vector&&); + + /* Invokes callback(name, value) once per attribute, iterating m_attributes in order. */ template + void for_each_attribute(Callback callback) + { + for (auto& attribute : m_attributes) + callback(attribute.name(), attribute.value()); + } + private: + Attribute* find_attribute(const String& name); + const Attribute* find_attribute(const String& name) const; + String m_tag_name; Vector m_attributes; }; diff --git a/LibHTML/Parser.cpp b/LibHTML/Parser.cpp index ea6dd65b95..53541570ff 100644 --- a/LibHTML/Parser.cpp +++ b/LibHTML/Parser.cpp @@ -2,12 +2,18 @@ #include #include #include +#include static Retained create_element(const String& tag_name) { return adopt(*new Element(tag_name)); } +/* True for characters accepted inside an attribute name: anything isalnum accepts, plus underscore and hyphen. NOTE(review): ch is a plain char and the C character-classification functions require a value representable as unsigned char or EOF - TODO confirm char signedness on the target or cast first. */ static bool is_valid_in_attribute_name(char ch) +{ + return isalnum(ch) || ch == '_' || ch == '-'; +} + static bool is_self_closing_tag(const String& tag_name) { return tag_name == "area" @@ -34,11 +40,12 @@ Retained parse(const String& html) node_stack.append(doc); enum class State { - Free, + Free = 0, BeforeTagName, InTagName, InAttributeList, InAttributeName, + /* Entered from InAttributeName when an equals sign is seen. */ BeforeAttributeValue, InAttributeValueNoQuote, InAttributeValueSingleQuote, InAttributeValueDoubleQuote, @@ -46,19 +53,33 @@ Retained parse(const String& html) auto state = State::Free; - Vector buffer; + /* Characters of the text run collected while in State::Free. */ Vector text_buffer; + + /* Characters of the tag name collected while in State::InTagName. */ Vector tag_name_buffer; + + /* Attributes committed for the tag being parsed, plus the scratch buffers for the attribute currently being read. */ Vector attributes; + Vector attribute_name_buffer; + Vector attribute_value_buffer; bool is_slash_tag = false; /* Transition helper: resets the buffers that belong to the state being entered, flushes pending text into a Text node when leaving State::Free, then records the new state. */ auto move_to_state = [&](State new_state) { - if (new_state == State::BeforeTagName) + if (new_state == State::BeforeTagName) { is_slash_tag = false; - if (state == State::Free && !buffer.is_empty()) { - auto text_node = adopt(*new Text(String::copy(buffer))); + tag_name_buffer.clear(); + attributes.clear(); + } + if (new_state == State::InAttributeName) + attribute_name_buffer.clear(); + if (new_state == 
State::BeforeAttributeValue) + attribute_value_buffer.clear(); + if (state == State::Free && !text_buffer.is_empty()) { + auto text_node = adopt(*new Text(String::copy(text_buffer))); + text_buffer.clear(); node_stack.last()->append_child(text_node); } state = new_state; - buffer.clear(); + /* text_buffer is cleared on every transition, so the clear inside the flush branch is redundant with this one. */ text_buffer.clear(); }; auto close_tag = [&] { @@ -67,7 +88,9 @@ Retained parse(const String& html) }; auto open_tag = [&] { - auto new_element = create_element(String::copy(buffer)); + auto new_element = create_element(String::copy(tag_name_buffer)); + tag_name_buffer.clear(); + /* The attributes collected for this tag are moved into the new element. */ new_element->set_attributes(move(attributes)); node_stack.append(new_element); if (node_stack.size() != 1) node_stack[node_stack.size() - 2]->append_child(new_element); @@ -76,15 +99,27 @@ Retained parse(const String& html) close_tag(); }; + /* Finishes the current tag: calls close_tag when is_slash_tag is set, otherwise open_tag. */ auto commit_tag = [&] { + if (is_slash_tag) + close_tag(); + else + open_tag(); + }; + + /* Appends a name/value pair built from copies of the current attribute name and value buffers. NOTE(review): attribute_value_buffer is cleared only on entry to BeforeAttributeValue, so an attribute committed without ever reaching that state reuses whatever value a previous attribute left behind - confirm whether that is intended. */ auto commit_attribute = [&] { + attributes.append({ String::copy(attribute_name_buffer), String::copy(attribute_value_buffer) }); + }; + for (int i = 0; i < html.length(); ++i) { char ch = html[i]; switch (state) { case State::Free: if (ch == '<') { + is_slash_tag = false; move_to_state(State::BeforeTagName); break; } - buffer.append(ch); + text_buffer.append(ch); break; case State::BeforeTagName: if (ch == '/') { @@ -95,25 +130,105 @@ Retained parse(const String& html) move_to_state(State::Free); break; } - if (!isascii(ch)) + if (!isalpha(ch)) break; move_to_state(State::InTagName); [[fallthrough]]; case State::InTagName: - if (ch == ' ') { + if (isspace(ch)) { move_to_state(State::InAttributeList); break; } if (ch == '>') { - if (is_slash_tag) - close_tag(); - else - open_tag(); + commit_tag(); move_to_state(State::Free); break; } - buffer.append(ch); + tag_name_buffer.append(ch); break; + /* Between attributes: a closing angle bracket commits the tag; an alphabetic character starts an attribute name and falls through so that character is stored; every other character is skipped. */ case State::InAttributeList: + if (ch == '>') { + commit_tag(); + move_to_state(State::Free); + break; + } + if (!isalpha(ch)) + break; + move_to_state(State::InAttributeName); + 
[[fallthrough]]; + /* Accumulates the attribute name. NOTE(review): on whitespace the attribute is committed but the state is left unchanged, so later name characters keep appending to the same name buffer; on a closing angle bracket the tag is committed without calling commit_attribute, so the attribute being named at that point is never added - confirm both against the intended behavior. */ case State::InAttributeName: + if (is_valid_in_attribute_name(ch)) { + attribute_name_buffer.append(ch); + break; + } + if (isspace(ch)) { + commit_attribute(); + break; + } + + if (ch == '>') { + commit_tag(); + move_to_state(State::Free); + break; + } + + if (ch == '=') { + move_to_state(State::BeforeAttributeValue); + break; + } + break; + /* Picks the quoting style of the value from the first character after the equals sign. NOTE(review): no transition visible in this patch enters InAttributeValueNoQuote, so characters of an unquoted value are skipped here instead of being stored; a closing angle bracket here, and also in the InAttributeValueNoQuote case further down, commits the tag without first committing the pending attribute - confirm whether unquoted values are meant to work yet. */ case State::BeforeAttributeValue: + if (ch == '\'') { + move_to_state(State::InAttributeValueSingleQuote); + break; + } + if (ch == '"') { + move_to_state(State::InAttributeValueDoubleQuote); + break; + } + if (ch == '>') { + commit_tag(); + move_to_state(State::Free); + break; + } + if (isspace(ch)) { + commit_attribute(); + move_to_state(State::InAttributeList); + break; + } + break; + case State::InAttributeValueSingleQuote: + if (ch == '\'') { + commit_attribute(); + move_to_state(State::InAttributeList); + break; + } + attribute_value_buffer.append(ch); + break; + case State::InAttributeValueDoubleQuote: + if (ch == '"') { + commit_attribute(); + move_to_state(State::InAttributeList); + break; + } + attribute_value_buffer.append(ch); + break; + case State::InAttributeValueNoQuote: + if (isspace(ch)) { + commit_attribute(); + move_to_state(State::InAttributeList); + break; + } + if (ch == '>') { + commit_tag(); + move_to_state(State::Free); + break; + } + attribute_value_buffer.append(ch); + break; + default: + fprintf(stderr, "Unhandled state %d\n", (int)state); + ASSERT_NOT_REACHED(); } } return doc;