mirror of
https://github.com/RGBCube/serenity
synced 2025-05-31 10:48:11 +00:00
LibHTML: Basic element attribute parsing.
This commit is contained in:
parent
581d6b00c8
commit
f8a86b5164
5 changed files with 194 additions and 17 deletions
|
@ -1,6 +1,7 @@
|
||||||
<html>
|
<html>
|
||||||
<head><title>Small test page</title></head>
|
<head><title>Small test page</title></head>
|
||||||
<body>
|
<body bgcolor="#408080" text="#ffffff">
|
||||||
|
<h1>Hello friends!</h1>
|
||||||
<p>This is a <b>very small</b> test page :^)</p>
|
<p>This is a <b>very small</b> test page :^)</p>
|
||||||
</body>
|
</body>
|
||||||
</html>
|
</html>
|
||||||
|
|
|
@ -12,7 +12,11 @@ void dump_tree(Node& node)
|
||||||
if (node.is_document()) {
|
if (node.is_document()) {
|
||||||
printf("*Document*\n");
|
printf("*Document*\n");
|
||||||
} else if (node.is_element()) {
|
} else if (node.is_element()) {
|
||||||
printf("<%s>\n", static_cast<Element&>(node).tag_name().characters());
|
printf("<%s", static_cast<Element&>(node).tag_name().characters());
|
||||||
|
static_cast<Element&>(node).for_each_attribute([](auto& name, auto& value) {
|
||||||
|
printf(" %s=%s", name.characters(), value.characters());
|
||||||
|
});
|
||||||
|
printf(">\n");
|
||||||
} else if (node.is_text()) {
|
} else if (node.is_text()) {
|
||||||
printf("\"%s\"\n", static_cast<Text&>(node).data().characters());
|
printf("\"%s\"\n", static_cast<Text&>(node).data().characters());
|
||||||
}
|
}
|
||||||
|
|
|
@ -10,3 +10,40 @@ Element::~Element()
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Searches m_attributes for an entry whose name compares equal to `name`.
// Returns a pointer to that entry, or nullptr if none matches.
Attribute* Element::find_attribute(const String& name)
{
    for (auto& candidate : m_attributes) {
        if (!(candidate.name() == name))
            continue;
        return &candidate;
    }
    return nullptr;
}
|
||||||
|
|
||||||
|
// Const overload: searches m_attributes for an entry whose name compares equal to `name`.
// Returns a pointer to the first such entry, or nullptr if none matches.
const Attribute* Element::find_attribute(const String& name) const
{
    const Attribute* match = nullptr;
    for (auto& candidate : m_attributes) {
        if (candidate.name() == name) {
            match = &candidate;
            break;
        }
    }
    return match;
}
|
||||||
|
|
||||||
|
// Returns the value of the attribute called `name`, or a default-constructed
// String when find_attribute() reports no such attribute.
String Element::attribute(const String& name) const
{
    const auto* found = find_attribute(name);
    if (!found)
        return { };
    return found->value();
}
|
||||||
|
|
||||||
|
void Element::set_attribute(const String& name, const String& value)
|
||||||
|
{
|
||||||
|
if (auto* attribute = find_attribute(name))
|
||||||
|
attribute->set_value(value);
|
||||||
|
else
|
||||||
|
m_attributes.append({ name, value });
|
||||||
|
}
|
||||||
|
|
||||||
|
// Replaces this element's attribute list with `attributes`.
void Element::set_attributes(Vector<Attribute>&& attributes)
{
    // Move-assign so the incoming vector's contents are taken over rather than copied.
    m_attributes = move(attributes);
}
|
||||||
|
|
|
@ -11,6 +11,11 @@ public:
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Returns a reference to the stored attribute name (m_name).
const String& name() const { return m_name; }
|
||||||
|
// Returns a reference to the stored attribute value (m_value).
const String& value() const { return m_value; }
|
||||||
|
|
||||||
|
// Overwrites the stored attribute value (m_value) with `value`; the name is left untouched.
void set_value(const String& value) { m_value = value; }
|
||||||
|
|
||||||
private:
|
private:
|
||||||
String m_name;
|
String m_name;
|
||||||
String m_value;
|
String m_value;
|
||||||
|
@ -23,7 +28,22 @@ public:
|
||||||
|
|
||||||
const String& tag_name() const { return m_tag_name; }
|
const String& tag_name() const { return m_tag_name; }
|
||||||
|
|
||||||
|
String attribute(const String& name) const;
|
||||||
|
void set_attribute(const String& name, const String& value);
|
||||||
|
|
||||||
|
void set_attributes(Vector<Attribute>&&);
|
||||||
|
|
||||||
|
template<typename Callback>
|
||||||
|
void for_each_attribute(Callback callback)
|
||||||
|
{
|
||||||
|
for (auto& attribute : m_attributes)
|
||||||
|
callback(attribute.name(), attribute.value());
|
||||||
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
Attribute* find_attribute(const String& name);
|
||||||
|
const Attribute* find_attribute(const String& name) const;
|
||||||
|
|
||||||
String m_tag_name;
|
String m_tag_name;
|
||||||
Vector<Attribute> m_attributes;
|
Vector<Attribute> m_attributes;
|
||||||
};
|
};
|
||||||
|
|
|
@ -2,12 +2,18 @@
|
||||||
#include <LibHTML/Parser.h>
|
#include <LibHTML/Parser.h>
|
||||||
#include <LibHTML/Text.h>
|
#include <LibHTML/Text.h>
|
||||||
#include <ctype.h>
|
#include <ctype.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
|
||||||
// Heap-allocates an Element with the given tag name and hands it to adopt(),
// returning the resulting Retained<Element>.
static Retained<Element> create_element(const String& tag_name)
{
    auto* element = new Element(tag_name);
    return adopt(*element);
}
|
||||||
|
|
||||||
|
// Returns true if `ch` is accepted inside an attribute name:
// any character isalnum() classifies as alphanumeric, plus '_' and '-'.
static bool is_valid_in_attribute_name(char ch)
{
    // The <ctype.h> classifiers are only defined for values representable as
    // unsigned char (or EOF). Plain char may be signed, so a non-ASCII input
    // byte would arrive as a negative value; convert before classifying.
    return isalnum(static_cast<unsigned char>(ch)) || ch == '_' || ch == '-';
}
|
||||||
|
|
||||||
static bool is_self_closing_tag(const String& tag_name)
|
static bool is_self_closing_tag(const String& tag_name)
|
||||||
{
|
{
|
||||||
return tag_name == "area"
|
return tag_name == "area"
|
||||||
|
@ -34,11 +40,12 @@ Retained<Document> parse(const String& html)
|
||||||
node_stack.append(doc);
|
node_stack.append(doc);
|
||||||
|
|
||||||
enum class State {
|
enum class State {
|
||||||
Free,
|
Free = 0,
|
||||||
BeforeTagName,
|
BeforeTagName,
|
||||||
InTagName,
|
InTagName,
|
||||||
InAttributeList,
|
InAttributeList,
|
||||||
InAttributeName,
|
InAttributeName,
|
||||||
|
BeforeAttributeValue,
|
||||||
InAttributeValueNoQuote,
|
InAttributeValueNoQuote,
|
||||||
InAttributeValueSingleQuote,
|
InAttributeValueSingleQuote,
|
||||||
InAttributeValueDoubleQuote,
|
InAttributeValueDoubleQuote,
|
||||||
|
@ -46,19 +53,33 @@ Retained<Document> parse(const String& html)
|
||||||
|
|
||||||
auto state = State::Free;
|
auto state = State::Free;
|
||||||
|
|
||||||
Vector<char, 256> buffer;
|
Vector<char, 256> text_buffer;
|
||||||
|
|
||||||
|
Vector<char, 32> tag_name_buffer;
|
||||||
|
|
||||||
|
Vector<Attribute> attributes;
|
||||||
|
Vector<char, 256> attribute_name_buffer;
|
||||||
|
Vector<char, 256> attribute_value_buffer;
|
||||||
|
|
||||||
bool is_slash_tag = false;
|
bool is_slash_tag = false;
|
||||||
|
|
||||||
auto move_to_state = [&](State new_state) {
|
auto move_to_state = [&](State new_state) {
|
||||||
if (new_state == State::BeforeTagName)
|
if (new_state == State::BeforeTagName) {
|
||||||
is_slash_tag = false;
|
is_slash_tag = false;
|
||||||
if (state == State::Free && !buffer.is_empty()) {
|
tag_name_buffer.clear();
|
||||||
auto text_node = adopt(*new Text(String::copy(buffer)));
|
attributes.clear();
|
||||||
|
}
|
||||||
|
if (new_state == State::InAttributeName)
|
||||||
|
attribute_name_buffer.clear();
|
||||||
|
if (new_state == State::BeforeAttributeValue)
|
||||||
|
attribute_value_buffer.clear();
|
||||||
|
if (state == State::Free && !text_buffer.is_empty()) {
|
||||||
|
auto text_node = adopt(*new Text(String::copy(text_buffer)));
|
||||||
|
text_buffer.clear();
|
||||||
node_stack.last()->append_child(text_node);
|
node_stack.last()->append_child(text_node);
|
||||||
}
|
}
|
||||||
state = new_state;
|
state = new_state;
|
||||||
buffer.clear();
|
text_buffer.clear();
|
||||||
};
|
};
|
||||||
|
|
||||||
auto close_tag = [&] {
|
auto close_tag = [&] {
|
||||||
|
@ -67,7 +88,9 @@ Retained<Document> parse(const String& html)
|
||||||
};
|
};
|
||||||
|
|
||||||
auto open_tag = [&] {
|
auto open_tag = [&] {
|
||||||
auto new_element = create_element(String::copy(buffer));
|
auto new_element = create_element(String::copy(tag_name_buffer));
|
||||||
|
tag_name_buffer.clear();
|
||||||
|
new_element->set_attributes(move(attributes));
|
||||||
node_stack.append(new_element);
|
node_stack.append(new_element);
|
||||||
if (node_stack.size() != 1)
|
if (node_stack.size() != 1)
|
||||||
node_stack[node_stack.size() - 2]->append_child(new_element);
|
node_stack[node_stack.size() - 2]->append_child(new_element);
|
||||||
|
@ -76,15 +99,27 @@ Retained<Document> parse(const String& html)
|
||||||
close_tag();
|
close_tag();
|
||||||
};
|
};
|
||||||
|
|
||||||
|
auto commit_tag = [&] {
|
||||||
|
if (is_slash_tag)
|
||||||
|
close_tag();
|
||||||
|
else
|
||||||
|
open_tag();
|
||||||
|
};
|
||||||
|
|
||||||
|
auto commit_attribute = [&] {
|
||||||
|
attributes.append({ String::copy(attribute_name_buffer), String::copy(attribute_value_buffer) });
|
||||||
|
};
|
||||||
|
|
||||||
for (int i = 0; i < html.length(); ++i) {
|
for (int i = 0; i < html.length(); ++i) {
|
||||||
char ch = html[i];
|
char ch = html[i];
|
||||||
switch (state) {
|
switch (state) {
|
||||||
case State::Free:
|
case State::Free:
|
||||||
if (ch == '<') {
|
if (ch == '<') {
|
||||||
|
is_slash_tag = false;
|
||||||
move_to_state(State::BeforeTagName);
|
move_to_state(State::BeforeTagName);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
buffer.append(ch);
|
text_buffer.append(ch);
|
||||||
break;
|
break;
|
||||||
case State::BeforeTagName:
|
case State::BeforeTagName:
|
||||||
if (ch == '/') {
|
if (ch == '/') {
|
||||||
|
@ -95,25 +130,105 @@ Retained<Document> parse(const String& html)
|
||||||
move_to_state(State::Free);
|
move_to_state(State::Free);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
if (!isascii(ch))
|
if (!isalpha(ch))
|
||||||
break;
|
break;
|
||||||
move_to_state(State::InTagName);
|
move_to_state(State::InTagName);
|
||||||
[[fallthrough]];
|
[[fallthrough]];
|
||||||
case State::InTagName:
|
case State::InTagName:
|
||||||
if (ch == ' ') {
|
if (isspace(ch)) {
|
||||||
move_to_state(State::InAttributeList);
|
move_to_state(State::InAttributeList);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
if (ch == '>') {
|
if (ch == '>') {
|
||||||
if (is_slash_tag)
|
commit_tag();
|
||||||
close_tag();
|
|
||||||
else
|
|
||||||
open_tag();
|
|
||||||
move_to_state(State::Free);
|
move_to_state(State::Free);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
buffer.append(ch);
|
tag_name_buffer.append(ch);
|
||||||
break;
|
break;
|
||||||
|
case State::InAttributeList:
|
||||||
|
if (ch == '>') {
|
||||||
|
commit_tag();
|
||||||
|
move_to_state(State::Free);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (!isalpha(ch))
|
||||||
|
break;
|
||||||
|
move_to_state(State::InAttributeName);
|
||||||
|
[[fallthrough]];
|
||||||
|
case State::InAttributeName:
|
||||||
|
if (is_valid_in_attribute_name(ch)) {
|
||||||
|
attribute_name_buffer.append(ch);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (isspace(ch)) {
|
||||||
|
commit_attribute();
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (ch == '>') {
|
||||||
|
commit_tag();
|
||||||
|
move_to_state(State::Free);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (ch == '=') {
|
||||||
|
move_to_state(State::BeforeAttributeValue);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case State::BeforeAttributeValue:
|
||||||
|
if (ch == '\'') {
|
||||||
|
move_to_state(State::InAttributeValueSingleQuote);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (ch == '"') {
|
||||||
|
move_to_state(State::InAttributeValueDoubleQuote);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (ch == '>') {
|
||||||
|
commit_tag();
|
||||||
|
move_to_state(State::Free);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (isspace(ch)) {
|
||||||
|
commit_attribute();
|
||||||
|
move_to_state(State::InAttributeList);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case State::InAttributeValueSingleQuote:
|
||||||
|
if (ch == '\'') {
|
||||||
|
commit_attribute();
|
||||||
|
move_to_state(State::InAttributeList);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
attribute_value_buffer.append(ch);
|
||||||
|
break;
|
||||||
|
case State::InAttributeValueDoubleQuote:
|
||||||
|
if (ch == '"') {
|
||||||
|
commit_attribute();
|
||||||
|
move_to_state(State::InAttributeList);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
attribute_value_buffer.append(ch);
|
||||||
|
break;
|
||||||
|
case State::InAttributeValueNoQuote:
|
||||||
|
if (isspace(ch)) {
|
||||||
|
commit_attribute();
|
||||||
|
move_to_state(State::InAttributeList);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (ch == '>') {
|
||||||
|
commit_tag();
|
||||||
|
move_to_state(State::Free);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
attribute_value_buffer.append(ch);
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
fprintf(stderr, "Unhandled state %d\n", (int)state);
|
||||||
|
ASSERT_NOT_REACHED();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return doc;
|
return doc;
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue