mirror of
https://github.com/RGBCube/serenity
synced 2025-05-31 14:38:11 +00:00
LibHTML: Basic element attribute parsing.
This commit is contained in:
parent
581d6b00c8
commit
f8a86b5164
5 changed files with 194 additions and 17 deletions
|
@ -1,6 +1,7 @@
|
|||
<html>
|
||||
<head><title>Small test page</title></head>
|
||||
<body>
|
||||
<body bgcolor="#408080" text="#ffffff">
|
||||
<h1>Hello friends!</h1>
|
||||
<p>This is a <b>very small</b> test page :^)</p>
|
||||
</body>
|
||||
</html>
|
||||
|
|
|
@ -12,7 +12,11 @@ void dump_tree(Node& node)
|
|||
if (node.is_document()) {
|
||||
printf("*Document*\n");
|
||||
} else if (node.is_element()) {
|
||||
printf("<%s>\n", static_cast<Element&>(node).tag_name().characters());
|
||||
printf("<%s", static_cast<Element&>(node).tag_name().characters());
|
||||
static_cast<Element&>(node).for_each_attribute([](auto& name, auto& value) {
|
||||
printf(" %s=%s", name.characters(), value.characters());
|
||||
});
|
||||
printf(">\n");
|
||||
} else if (node.is_text()) {
|
||||
printf("\"%s\"\n", static_cast<Text&>(node).data().characters());
|
||||
}
|
||||
|
|
|
@ -10,3 +10,40 @@ Element::~Element()
|
|||
{
|
||||
}
|
||||
|
||||
// Mutable lookup of the attribute called `name`.
// Yields a pointer to the matching entry of m_attributes, or nullptr if no
// attribute carries that name. The search itself lives in the const overload;
// casting the result back is safe because `this` is non-const here.
Attribute* Element::find_attribute(const String& name)
{
    const Element* self = this;
    return const_cast<Attribute*>(self->find_attribute(name));
}
|
||||
|
||||
// Read-only lookup of the attribute called `name`.
// Walks m_attributes in order and yields a pointer to the first entry whose
// name compares equal to `name`; yields nullptr if there is none.
const Attribute* Element::find_attribute(const String& name) const
{
    for (auto& candidate : m_attributes) {
        if (!(candidate.name() == name))
            continue;
        return &candidate;
    }
    return nullptr;
}
|
||||
|
||||
// Returns the value of the attribute called `name`.
// If the element has no such attribute, a default-constructed String is returned.
String Element::attribute(const String& name) const
{
    const Attribute* match = find_attribute(name);
    if (!match)
        return { };
    return match->value();
}
|
||||
|
||||
void Element::set_attribute(const String& name, const String& value)
|
||||
{
|
||||
if (auto* attribute = find_attribute(name))
|
||||
attribute->set_value(value);
|
||||
else
|
||||
m_attributes.append({ name, value });
|
||||
}
|
||||
|
||||
// Replaces this element's entire attribute list with `attributes`.
// The vector is moved from, so the caller should not rely on its contents afterwards.
void Element::set_attributes(Vector<Attribute>&& attributes)
{
    m_attributes = move(attributes);
}
|
||||
|
|
|
@ -11,6 +11,11 @@ public:
|
|||
{
|
||||
}
|
||||
|
||||
// Returns a reference to the stored attribute name (m_name).
const String& name() const { return m_name; }
|
||||
// Returns a reference to the stored attribute value (m_value).
const String& value() const { return m_value; }
|
||||
|
||||
// Overwrites the stored attribute value with `value`; the name is left unchanged.
void set_value(const String& value) { m_value = value; }
|
||||
|
||||
private:
|
||||
String m_name;
|
||||
String m_value;
|
||||
|
@ -23,7 +28,22 @@ public:
|
|||
|
||||
// Returns a reference to this element's tag name (m_tag_name).
const String& tag_name() const { return m_tag_name; }
|
||||
|
||||
String attribute(const String& name) const;
|
||||
void set_attribute(const String& name, const String& value);
|
||||
|
||||
void set_attributes(Vector<Attribute>&&);
|
||||
|
||||
template<typename Callback>
|
||||
void for_each_attribute(Callback callback)
|
||||
{
|
||||
for (auto& attribute : m_attributes)
|
||||
callback(attribute.name(), attribute.value());
|
||||
}
|
||||
|
||||
private:
|
||||
Attribute* find_attribute(const String& name);
|
||||
const Attribute* find_attribute(const String& name) const;
|
||||
|
||||
String m_tag_name;
|
||||
Vector<Attribute> m_attributes;
|
||||
};
|
||||
|
|
|
@ -2,12 +2,18 @@
|
|||
#include <LibHTML/Parser.h>
|
||||
#include <LibHTML/Text.h>
|
||||
#include <ctype.h>
|
||||
#include <stdio.h>
|
||||
|
||||
// Allocates a new Element with the given tag name and hands the freshly
// created object to adopt(), returning the resulting Retained<Element>.
static Retained<Element> create_element(const String& tag_name)
{
    auto* element = new Element(tag_name);
    return adopt(*element);
}
|
||||
|
||||
// Returns true if `ch` may appear inside an attribute name as far as this
// parser is concerned: anything isalnum() accepts, plus '_' and '-'.
static bool is_valid_in_attribute_name(char ch)
{
    // The <ctype.h> classifiers require an argument representable as
    // unsigned char (or EOF). Plain `char` may be signed, so bytes >= 0x80
    // would be passed as negative values, which is undefined behavior.
    return isalnum(static_cast<unsigned char>(ch)) || ch == '_' || ch == '-';
}
|
||||
|
||||
static bool is_self_closing_tag(const String& tag_name)
|
||||
{
|
||||
return tag_name == "area"
|
||||
|
@ -34,11 +40,12 @@ Retained<Document> parse(const String& html)
|
|||
node_stack.append(doc);
|
||||
|
||||
enum class State {
|
||||
Free,
|
||||
Free = 0,
|
||||
BeforeTagName,
|
||||
InTagName,
|
||||
InAttributeList,
|
||||
InAttributeName,
|
||||
BeforeAttributeValue,
|
||||
InAttributeValueNoQuote,
|
||||
InAttributeValueSingleQuote,
|
||||
InAttributeValueDoubleQuote,
|
||||
|
@ -46,19 +53,33 @@ Retained<Document> parse(const String& html)
|
|||
|
||||
auto state = State::Free;
|
||||
|
||||
Vector<char, 256> buffer;
|
||||
Vector<char, 256> text_buffer;
|
||||
|
||||
Vector<char, 32> tag_name_buffer;
|
||||
|
||||
Vector<Attribute> attributes;
|
||||
Vector<char, 256> attribute_name_buffer;
|
||||
Vector<char, 256> attribute_value_buffer;
|
||||
|
||||
bool is_slash_tag = false;
|
||||
|
||||
auto move_to_state = [&](State new_state) {
|
||||
if (new_state == State::BeforeTagName)
|
||||
if (new_state == State::BeforeTagName) {
|
||||
is_slash_tag = false;
|
||||
if (state == State::Free && !buffer.is_empty()) {
|
||||
auto text_node = adopt(*new Text(String::copy(buffer)));
|
||||
tag_name_buffer.clear();
|
||||
attributes.clear();
|
||||
}
|
||||
if (new_state == State::InAttributeName)
|
||||
attribute_name_buffer.clear();
|
||||
if (new_state == State::BeforeAttributeValue)
|
||||
attribute_value_buffer.clear();
|
||||
if (state == State::Free && !text_buffer.is_empty()) {
|
||||
auto text_node = adopt(*new Text(String::copy(text_buffer)));
|
||||
text_buffer.clear();
|
||||
node_stack.last()->append_child(text_node);
|
||||
}
|
||||
state = new_state;
|
||||
buffer.clear();
|
||||
text_buffer.clear();
|
||||
};
|
||||
|
||||
auto close_tag = [&] {
|
||||
|
@ -67,7 +88,9 @@ Retained<Document> parse(const String& html)
|
|||
};
|
||||
|
||||
auto open_tag = [&] {
|
||||
auto new_element = create_element(String::copy(buffer));
|
||||
auto new_element = create_element(String::copy(tag_name_buffer));
|
||||
tag_name_buffer.clear();
|
||||
new_element->set_attributes(move(attributes));
|
||||
node_stack.append(new_element);
|
||||
if (node_stack.size() != 1)
|
||||
node_stack[node_stack.size() - 2]->append_child(new_element);
|
||||
|
@ -76,15 +99,27 @@ Retained<Document> parse(const String& html)
|
|||
close_tag();
|
||||
};
|
||||
|
||||
auto commit_tag = [&] {
|
||||
if (is_slash_tag)
|
||||
close_tag();
|
||||
else
|
||||
open_tag();
|
||||
};
|
||||
|
||||
auto commit_attribute = [&] {
|
||||
attributes.append({ String::copy(attribute_name_buffer), String::copy(attribute_value_buffer) });
|
||||
};
|
||||
|
||||
for (int i = 0; i < html.length(); ++i) {
|
||||
char ch = html[i];
|
||||
switch (state) {
|
||||
case State::Free:
|
||||
if (ch == '<') {
|
||||
is_slash_tag = false;
|
||||
move_to_state(State::BeforeTagName);
|
||||
break;
|
||||
}
|
||||
buffer.append(ch);
|
||||
text_buffer.append(ch);
|
||||
break;
|
||||
case State::BeforeTagName:
|
||||
if (ch == '/') {
|
||||
|
@ -95,25 +130,105 @@ Retained<Document> parse(const String& html)
|
|||
move_to_state(State::Free);
|
||||
break;
|
||||
}
|
||||
if (!isascii(ch))
|
||||
if (!isalpha(ch))
|
||||
break;
|
||||
move_to_state(State::InTagName);
|
||||
[[fallthrough]];
|
||||
case State::InTagName:
|
||||
if (ch == ' ') {
|
||||
if (isspace(ch)) {
|
||||
move_to_state(State::InAttributeList);
|
||||
break;
|
||||
}
|
||||
if (ch == '>') {
|
||||
if (is_slash_tag)
|
||||
close_tag();
|
||||
else
|
||||
open_tag();
|
||||
commit_tag();
|
||||
move_to_state(State::Free);
|
||||
break;
|
||||
}
|
||||
buffer.append(ch);
|
||||
tag_name_buffer.append(ch);
|
||||
break;
|
||||
case State::InAttributeList:
|
||||
if (ch == '>') {
|
||||
commit_tag();
|
||||
move_to_state(State::Free);
|
||||
break;
|
||||
}
|
||||
if (!isalpha(ch))
|
||||
break;
|
||||
move_to_state(State::InAttributeName);
|
||||
[[fallthrough]];
|
||||
case State::InAttributeName:
|
||||
if (is_valid_in_attribute_name(ch)) {
|
||||
attribute_name_buffer.append(ch);
|
||||
break;
|
||||
}
|
||||
if (isspace(ch)) {
|
||||
commit_attribute();
|
||||
break;
|
||||
}
|
||||
|
||||
if (ch == '>') {
|
||||
commit_tag();
|
||||
move_to_state(State::Free);
|
||||
break;
|
||||
}
|
||||
|
||||
if (ch == '=') {
|
||||
move_to_state(State::BeforeAttributeValue);
|
||||
break;
|
||||
}
|
||||
break;
|
||||
case State::BeforeAttributeValue:
|
||||
if (ch == '\'') {
|
||||
move_to_state(State::InAttributeValueSingleQuote);
|
||||
break;
|
||||
}
|
||||
if (ch == '"') {
|
||||
move_to_state(State::InAttributeValueDoubleQuote);
|
||||
break;
|
||||
}
|
||||
if (ch == '>') {
|
||||
commit_tag();
|
||||
move_to_state(State::Free);
|
||||
break;
|
||||
}
|
||||
if (isspace(ch)) {
|
||||
commit_attribute();
|
||||
move_to_state(State::InAttributeList);
|
||||
break;
|
||||
}
|
||||
break;
|
||||
case State::InAttributeValueSingleQuote:
|
||||
if (ch == '\'') {
|
||||
commit_attribute();
|
||||
move_to_state(State::InAttributeList);
|
||||
break;
|
||||
}
|
||||
attribute_value_buffer.append(ch);
|
||||
break;
|
||||
case State::InAttributeValueDoubleQuote:
|
||||
if (ch == '"') {
|
||||
commit_attribute();
|
||||
move_to_state(State::InAttributeList);
|
||||
break;
|
||||
}
|
||||
attribute_value_buffer.append(ch);
|
||||
break;
|
||||
case State::InAttributeValueNoQuote:
|
||||
if (isspace(ch)) {
|
||||
commit_attribute();
|
||||
move_to_state(State::InAttributeList);
|
||||
break;
|
||||
}
|
||||
if (ch == '>') {
|
||||
commit_tag();
|
||||
move_to_state(State::Free);
|
||||
break;
|
||||
}
|
||||
attribute_value_buffer.append(ch);
|
||||
break;
|
||||
default:
|
||||
fprintf(stderr, "Unhandled state %d\n", (int)state);
|
||||
ASSERT_NOT_REACHED();
|
||||
}
|
||||
}
|
||||
return doc;
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue