1
Fork 0
mirror of https://github.com/RGBCube/serenity synced 2025-07-27 13:37:44 +00:00

LibXML: Add a fairly basic XML parser

Currently this can parse XML and resolve external resources/references,
and read a DTD (but not apply or verify its rules).
That's good enough for _most_ XHTML documents as the HTML 5 spec
enforces its own rules about document well-formedness, and does not make
use of XML DTDs (aside from a list of predefined entities).

An accompanying `xml` utility is provided that can read and dump XML
documents, and can also run the XML conformance test suite.
This commit is contained in:
Ali Mohammad Pur 2022-03-26 21:32:57 +04:30 committed by Andreas Kling
parent 06cedf5bae
commit 67357fe984
15 changed files with 2895 additions and 0 deletions

View file

@ -0,0 +1,53 @@
/*
* Copyright (c) 2022, Ali Mohammad Pur <mpfard@serenityos.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#pragma once
#include <AK/HashMap.h>
#include <AK/NonnullOwnPtr.h>
#include <LibXML/DOM/DocumentTypeDeclaration.h>
#include <LibXML/DOM/Node.h>
#include <LibXML/Forward.h>
namespace XML {
// XML version tag carried by a Document (see Document::version()).
// NOTE(review): the enumerator names suggest XML 1.0 and XML 1.1; how the
// parser maps a version string onto them is not visible in this header.
enum class Version {
Version10,
Version11,
};
// Doctype information attached to a Document (see Document::doctype()).
struct Doctype {
// NOTE(review): presumably the name given in the doctype declaration — confirm against the parser.
String type;
// Markup declarations collected for this doctype.
Vector<MarkupDeclaration> markup_declarations;
// External identifier of the doctype; empty when there is none.
Optional<ExternalID> external_id;
};
// An XML document: owns the root Node and stores the doctype, processing
// instructions and version it was constructed with.
class Document {
public:
// Takes ownership of `root` and moves each argument into the matching member.
// `doctype` may be empty.
explicit Document(NonnullOwnPtr<Node> root, Optional<Doctype> doctype, HashMap<Name, String> processing_instructions, Version version)
: m_root(move(root))
, m_processing_instructions(move(processing_instructions))
, m_version(version)
, m_explicit_doctype(move(doctype))
{
}
// Root node of the document; the non-const overload hands out a mutable reference.
Node& root() { return *m_root; }
Node const& root() const { return *m_root; }
// Processing instructions keyed by Name.
// NOTE(review): presumably target name -> instruction data — confirm against the parser.
HashMap<Name, String> const& processing_instructions() const { return m_processing_instructions; }
// Version value passed to the constructor.
Version version() const { return m_version; }
// Doctype passed to the constructor; empty Optional when none was supplied.
Optional<Doctype> const& doctype() const { return m_explicit_doctype; }
private:
NonnullOwnPtr<Node> m_root;
HashMap<Name, String> m_processing_instructions;
Version m_version;
Optional<Doctype> m_explicit_doctype;
};
}

View file

@ -0,0 +1,138 @@
/*
* Copyright (c) 2022, Ali Mohammad Pur <mpfard@serenityos.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#pragma once
#include <AK/HashTable.h>
#include <AK/String.h>
#include <AK/Variant.h>
#include <AK/Vector.h>
#include <LibXML/FundamentalTypes.h>
namespace XML {
// One element declaration: the declared element `type` name together with
// its content specification (one of Empty, Any, Mixed or Children).
struct ElementDeclaration {
// Tag alternative of ContentSpec; carries no data.
struct Empty {
};
// Tag alternative of ContentSpec; carries no data.
struct Any {
};
// Content specification holding a set of names plus a flag.
struct Mixed {
HashTable<Name> types;
// NOTE(review): presumably whether the mixed group may repeat — confirm against the parser.
bool many;
};
// Content specification built from nested choices and sequences.
struct Children {
// Forward-declared because Choice and Sequence store Vector<Entry>,
// while Entry itself can hold a Choice or a Sequence.
struct Entry;
// Occurrence qualifier attached to entries and groups.
// NOTE(review): presumably maps to no suffix, `?`, `*` and `+` in a content model — confirm.
enum class Qualifier {
ExactlyOnce,
Optional,
Any,
OneOrMore,
};
// A group of entries with its own qualifier.
struct Choice {
Vector<Entry> entries;
Qualifier qualifier;
};
// A group of entries with its own qualifier.
struct Sequence {
Vector<Entry> entries;
Qualifier qualifier;
};
// One item of a group: a plain Name or a nested Choice/Sequence, plus a qualifier.
struct Entry {
Variant<Name, Choice, Sequence> sub_entries;
Qualifier qualifier;
};
// Top-level group of this content model and its qualifier.
Variant<Choice, Sequence> contents;
Qualifier qualifier;
};
using ContentSpec = Variant<Empty, Any, Mixed, Children>;
// Name of the element being declared.
Name type;
ContentSpec content_spec;
};
// One attribute-list declaration: a `type` name plus the list of attribute
// definitions declared for it.
struct AttributeListDeclaration {
// String-valued attribute type; CData is the only enumerator.
enum class StringType {
CData,
};
// Tokenized attribute types.
enum class TokenizedType {
ID,
IDRef,
IDRefs,
Entity,
Entities,
NMToken,
NMTokens,
};
// Attribute type carrying a set of names.
struct NotationType {
HashTable<Name> names;
};
// Attribute type carrying a set of string tokens.
struct Enumeration {
// FIXME: NMToken
HashTable<String> tokens;
};
// An attribute's declared type is exactly one of these alternatives.
using Type = Variant<StringType, TokenizedType, NotationType, Enumeration>;
// Tag alternative of Default; carries no data.
struct Required {
};
// Tag alternative of Default; carries no data.
struct Implied {
};
// Default alternative carrying a string value.
struct Fixed {
String value;
};
// Default alternative carrying a string value.
struct DefaultValue {
String value;
};
// An attribute's default declaration is exactly one of these alternatives.
using Default = Variant<Required, Implied, Fixed, DefaultValue>;
// A single attribute definition: its name, declared type and default declaration.
struct Definition {
Name name;
Type type;
// Trailing underscore because `default` is a C++ keyword.
Default default_;
};
// NOTE(review): presumably the element name these attributes belong to — confirm against the parser.
Name type;
Vector<Definition> attributes;
};
// Wrapper around the literal text of a public identifier.
struct PublicID {
String public_literal;
};
// Wrapper around the literal text of a system identifier.
struct SystemID {
String system_literal;
};
// External identifier: a mandatory system identifier and an optional public identifier.
struct ExternalID {
Optional<PublicID> public_id;
SystemID system_id;
};
// Entity definition given by an external identifier, with an optional notation name.
// Used as the non-string alternative of GEDeclaration::definition.
struct EntityDefinition {
ExternalID id;
Optional<Name> notation;
};
// Named entity declaration whose definition is either a String or an EntityDefinition.
// NOTE(review): "GE" presumably stands for "general entity" — confirm.
struct GEDeclaration {
Name name;
Variant<String, EntityDefinition> definition;
};
// Named entity declaration whose definition is either a String or an ExternalID.
// NOTE(review): "PE" presumably stands for "parameter entity" — confirm.
struct PEDeclaration {
Name name;
Variant<String, ExternalID> definition;
};
// An entity declaration is either a GEDeclaration or a PEDeclaration.
using EntityDeclaration = Variant<GEDeclaration, PEDeclaration>;
// Named notation declaration identified by either a full ExternalID or a PublicID alone.
struct NotationDeclaration {
Name name;
Variant<ExternalID, PublicID> notation;
};
// Any one of the four declaration kinds defined in this header.
using MarkupDeclaration = Variant<ElementDeclaration, AttributeListDeclaration, EntityDeclaration, NotationDeclaration>;
}

View file

@ -0,0 +1,54 @@
/*
* Copyright (c) 2022, Ali Mohammad Pur <mpfard@serenityos.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#include <AK/HashMap.h>
#include <LibXML/DOM/Node.h>
namespace XML {
// Deep structural comparison of two nodes.
// Two nodes are equal when both hold the same content alternative and:
//  - Text:    the accumulated text is identical;
//  - Comment: the comment text is identical;
//  - Element: the names match, the attribute maps hold the same key/value
//             pairs, and the children are pairwise equal in order.
// The `parent` pointer does not take part in the comparison.
bool Node::operator==(Node const& other) const
{
    return content.visit(
        [&other](Text const& lhs) -> bool {
            if (auto const* rhs = other.content.get_pointer<Text>())
                return lhs.builder.string_view() == rhs->builder.string_view();
            return false;
        },
        [&other](Comment const& lhs) -> bool {
            if (auto const* rhs = other.content.get_pointer<Comment>())
                return lhs.text == rhs->text;
            return false;
        },
        [&other](Element const& lhs) -> bool {
            auto const* rhs = other.content.get_pointer<Element>();
            if (!rhs)
                return false;

            // Cheap rejections first: name and both container sizes.
            if (lhs.name != rhs->name
                || lhs.attributes.size() != rhs->attributes.size()
                || lhs.children.size() != rhs->children.size())
                return false;

            // Sizes match, so it suffices that every attribute on the left
            // is present on the right with the same value.
            for (auto const& attribute : lhs.attributes) {
                auto match = rhs->attributes.find(attribute.key);
                if (match == rhs->attributes.end() || match->value != attribute.value)
                    return false;
            }

            // Children are compared positionally, recursing through this operator.
            auto const child_count = lhs.children.size();
            for (size_t index = 0; index < child_count; ++index) {
                if (!(lhs.children[index] == rhs->children[index]))
                    return false;
            }
            return true;
        });
}
}

View file

@ -0,0 +1,40 @@
/*
* Copyright (c) 2022, Ali Mohammad Pur <mpfard@serenityos.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#pragma once
#include <AK/NonnullOwnPtrVector.h>
#include <AK/String.h>
#include <AK/Variant.h>
#include <AK/Vector.h>
#include <LibXML/FundamentalTypes.h>
namespace XML {
// A single attribute as a name/value pair.
// Note that Node::Element in this header stores its attributes as a
// HashMap<Name, String> rather than as a list of Attribute.
struct Attribute {
Name name;
String value;
};
// A node of the XML tree. `content` holds exactly one of Text, Comment or Element.
// NOTE(review): this struct uses HashMap and StringBuilder, but the header does not
// include <AK/HashMap.h> or <AK/StringBuilder.h> directly — confirm they arrive transitively.
struct Node {
// Text content, kept in a StringBuilder.
struct Text {
StringBuilder builder;
};
// Comment content.
struct Comment {
String text;
};
// Element content: name, attribute map and owned child nodes.
struct Element {
Name name;
HashMap<Name, String> attributes;
NonnullOwnPtrVector<Node> children;
};
// Structural equality; defined out of line.
bool operator==(Node const&) const;
Variant<Text, Comment, Element> content;
// Non-owning pointer to the parent node; defaults to nullptr.
Node* parent { nullptr };
};
}