diff --git a/Meta/Lagom/Tools/CodeGenerators/JSSpecCompiler/Parser/Lexer.cpp b/Meta/Lagom/Tools/CodeGenerators/JSSpecCompiler/Parser/Lexer.cpp
new file mode 100644
index 0000000000..5e9fd95406
--- /dev/null
+++ b/Meta/Lagom/Tools/CodeGenerators/JSSpecCompiler/Parser/Lexer.cpp
@@ -0,0 +1,187 @@
+/*
+ * Copyright (c) 2023, Dan Klishch
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+// NOTE(review): the targets of the two #include directives below are missing in this copy
+// of the patch; restore them from the original change before applying it.
+#include
+#include
+
+#include "Parser/Lexer.h"
+#include "Parser/XMLUtils.h"
+
+namespace JSSpecCompiler {
+
+namespace {
+// Tries to lex a number at the lexer's current position: an optional '-', one or more
+// digits and, optionally, a '.' followed by one or more digits. On success the lexer is
+// left just past the number and a Number token covering its text is returned. Otherwise
+// the lexer is rewound to where it started and an empty Optional is returned.
+Optional<Token> consume_number(GenericLexer& lexer, XML::Node const* node)
+{
+    u64 start = lexer.tell();
+
+    if (lexer.next_is('-'))
+        lexer.consume(1);
+
+    if (!lexer.next_is(is_ascii_digit)) {
+        // Not a number: give back the '-' consumed above, if any.
+        lexer.retreat(lexer.tell() - start);
+        return {};
+    }
+
+    lexer.consume_while(is_ascii_digit);
+
+    if (lexer.next_is('.')) {
+        lexer.consume(1);
+        // A '.' with no digits after it is not part of the number, so give it back.
+        if (lexer.consume_while(is_ascii_digit).length() == 0)
+            lexer.retreat(1);
+    }
+
+    // Rewind and consume the whole number again to get one view over its text.
+    auto length = lexer.tell() - start;
+    lexer.retreat(length);
+    return { Token { TokenType::Number, lexer.consume(length), node } };
+}
+
+// Returns true for the characters that end a Word token: ASCII whitespace, '.' and ','.
+bool can_end_word_token(char c)
+{
+    return is_ascii_space(c) || ".,"sv.contains(c);
+}
+}
+
+// Splits `view` into tokens and appends them to `tokens`; every token records `node` as the
+// XML node it came from. At each position a number is tried first, then the fixed spellings
+// listed below in that order, and anything else becomes a Word token that runs up to the
+// next character accepted by can_end_word_token(). This function always returns success.
+ParseErrorOr<void> tokenize_string(XML::Node const* node, StringView view, Vector<Token>& tokens)
+{
+// If the remaining input starts with `view`, emits it as a token of the given type and
+// restarts the loop. NOTE(review): this looks like a plain prefix test, so a word that
+// merely starts with a keyword (e.g. "is") would be split -- confirm that is acceptable.
+#define CONSUME_IF_NEXT(view, type)                                      \
+    if (lexer.next_is(view##sv)) {                                       \
+        size_t length = __builtin_strlen(view);                          \
+        tokens.append({ TokenType::type, lexer.consume(length), node }); \
+        continue;                                                        \
+    }
+
+    GenericLexer lexer(view);
+    while (!lexer.is_eof()) {
+        lexer.ignore_while(is_ascii_space);
+
+        // Numbers go first, so a '-' directly followed by a digit becomes part of a Number
+        // token instead of an AmbiguousMinus.
+        if (auto result = consume_number(lexer, node); result.has_value()) {
+            tokens.append(result.release_value());
+            continue;
+        }
+
+        // The order is significant: a longer spelling has to be tried before any spelling
+        // that is a prefix of it (". " before ".", "is equal to" before "is").
+        CONSUME_IF_NEXT("(", ParenOpen);
+        CONSUME_IF_NEXT(")", ParenClose);
+        CONSUME_IF_NEXT("{", BraceOpen);
+        CONSUME_IF_NEXT("}", BraceClose);
+        CONSUME_IF_NEXT(",", Comma);
+        CONSUME_IF_NEXT(". ", Dot);
+        CONSUME_IF_NEXT(".\n", Dot);
+        CONSUME_IF_NEXT(":", Colon);
+        CONSUME_IF_NEXT(".", MemberAccess);
+        CONSUME_IF_NEXT("<", Less);
+        CONSUME_IF_NEXT(">", Greater);
+        CONSUME_IF_NEXT("is not equal to", NotEquals);
+        CONSUME_IF_NEXT("≠", NotEquals);
+        CONSUME_IF_NEXT("is equal to", Equals);
+        CONSUME_IF_NEXT("=", Equals);
+        CONSUME_IF_NEXT("+", Plus);
+        CONSUME_IF_NEXT("-", AmbiguousMinus);
+        CONSUME_IF_NEXT("×", Multiplication);
+        CONSUME_IF_NEXT("/", Division);
+        CONSUME_IF_NEXT("!", ExclamationMark);
+        CONSUME_IF_NEXT("is", Is);
+
+        StringView word = lexer.consume_until(can_end_word_token);
+        if (word.length())
+            tokens.append({ TokenType::Word, word, node });
+    }
+    return {};
+
+#undef CONSUME_IF_NEXT
+}
+
+// Tokenizes the children of the element `node`. Text children are passed to
+// tokenize_string(); <var>, <span class="secnum">, <emu-val> and <emu-xref> children
+// produce one token each. An <ol> child is not tokenized: it is returned as `substeps`,
+// is only accepted when `allow_substeps` is true, and may only be followed by comments
+// and whitespace-only text. Any other child element is reported as a ParseError.
+ParseErrorOr<TokenizeTreeResult> tokenize_tree(XML::Node const* node, bool allow_substeps)
+{
+    TokenizeTreeResult result;
+    auto& tokens = result.tokens;
+
+    for (auto const& child : node->as_element().children) {
+        TRY(child->content.visit(
+            [&](XML::Node::Element const& element) -> ParseErrorOr<void> {
+                // Once the substeps list has been seen, no further element may follow it.
+                if (result.substeps != nullptr)
+                    return ParseError::create("Substeps list must be the last non-empty child"sv, child);
+
+                if (element.name == tag_var) {
+                    tokens.append({ TokenType::Identifier, TRY(get_text_contents(child)), child });
+                    return {};
+                }
+
+                if (element.name == tag_span) {
+                    auto element_class = TRY(get_attribute_by_name(child, attribute_class));
+                    if (element_class != class_secnum)
+                        return ParseError::create(String::formatted("Expected 'secnum' as a class name of <span>, but found '{}'", element_class), child);
+                    tokens.append({ TokenType::SectionNumber, TRY(get_text_contents(child)), child });
+                    return {};
+                }
+
+                if (element.name == tag_emu_val) {
+                    auto contents = TRY(get_text_contents(child));
+                    // Text wrapped in double quotes is a String token; the quotes are dropped.
+                    if (contents.length() >= 2 && contents.starts_with('"') && contents.ends_with('"'))
+                        tokens.append({ TokenType::String, contents.substring_view(1, contents.length() - 2), child });
+                    else if (contents == "undefined")
+                        tokens.append({ TokenType::Undefined, contents, child });
+                    else
+                        tokens.append({ TokenType::Identifier, contents, child });
+                    return {};
+                }
+
+                if (element.name == tag_emu_xref) {
+                    // The identifier is the text of the <a> element that must be the only child.
+                    auto contents = TRY(get_text_contents(TRY(get_only_child(child, "a"sv))));
+                    tokens.append({ TokenType::Identifier, contents, child });
+                    return {};
+                }
+
+                if (element.name == tag_ol) {
+                    if (!allow_substeps)
+                        return ParseError::create("Found nested list but substeps are not allowed"sv, child);
+                    result.substeps = child;
+                    return {};
+                }
+
+                return ParseError::create(String::formatted("Unexpected child element with tag {}", element.name), child);
+            },
+            [&](XML::Node::Text const& text) -> ParseErrorOr<void> {
+                auto view = text.builder.string_view();
+                // After the substeps list only whitespace-only text is allowed.
+                if (result.substeps && !contains_empty_text(child))
+                    return ParseError::create("Substeps list must be the last non-empty child"sv, child);
+                return tokenize_string(child, view, tokens);
+            },
+            move(ignore_comments)));
+    }
+    return result;
+}
+
+}
diff --git a/Meta/Lagom/Tools/CodeGenerators/JSSpecCompiler/Parser/Lexer.h b/Meta/Lagom/Tools/CodeGenerators/JSSpecCompiler/Parser/Lexer.h
new file mode 100644
index 0000000000..6f039d850d
--- /dev/null
+++ b/Meta/Lagom/Tools/CodeGenerators/JSSpecCompiler/Parser/Lexer.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2023, Dan Klishch
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#pragma once
+
+#include "Parser/ParseError.h"
+#include "Parser/Token.h"
+
+namespace JSSpecCompiler {
+
+// XML element names.
+inline constexpr StringView tag_emu_alg = "emu-alg"sv;
+inline constexpr StringView tag_emu_clause = "emu-clause"sv;
+inline constexpr StringView tag_emu_val = "emu-val"sv;
+inline constexpr StringView tag_emu_xref = "emu-xref"sv;
+inline constexpr StringView tag_h1 = "h1"sv;
+inline constexpr StringView tag_li = "li"sv;
+inline constexpr StringView tag_ol = "ol"sv;
+inline constexpr StringView tag_p = "p"sv;
+inline constexpr StringView tag_span = "span"sv;
+inline constexpr StringView tag_var = "var"sv;
+
+// XML attribute names.
+inline constexpr StringView attribute_aoid = "aoid"sv;
+inline constexpr StringView attribute_class = "class"sv;
+inline constexpr StringView attribute_id = "id"sv;
+
+// Values of the `class` attribute.
+inline constexpr StringView class_secnum = "secnum"sv;
+
+// Tokenizes the plain text `view` and appends the tokens to `tokens`; `node` is recorded
+// in every token as the XML node the text belongs to.
+ParseErrorOr<void> tokenize_string(XML::Node const* node, StringView view, Vector<Token>& tokens);
+
+// Result of tokenize_tree(): the tokens of an element's children and, if one was found,
+// the nested list element holding the substeps (nullptr otherwise).
+struct TokenizeTreeResult {
+    Vector<Token> tokens;
+    XML::Node const* substeps = nullptr;
+};
+
+// Tokenizes the children of the element `node`. A nested list is only accepted when
+// `allow_substeps` is true; it is returned in the result rather than tokenized. Fails
+// with a ParseError on unexpected content.
+ParseErrorOr<TokenizeTreeResult> tokenize_tree(XML::Node const* node, bool allow_substeps = false);
+
+}
diff --git a/Meta/Lagom/Tools/CodeGenerators/JSSpecCompiler/Parser/ParseError.cpp b/Meta/Lagom/Tools/CodeGenerators/JSSpecCompiler/Parser/ParseError.cpp
new file mode 100644
index 0000000000..81f5b60fef
--- /dev/null
+++ b/Meta/Lagom/Tools/CodeGenerators/JSSpecCompiler/Parser/ParseError.cpp
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2023, Dan Klishch
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#include "Parser/ParseError.h"
+
+namespace JSSpecCompiler {
+
+// Creates an error with the given message that refers to `node`.
+NonnullRefPtr<ParseError> ParseError::create(String message, XML::Node const* node)
+{
+    return make_ref_counted<ParseError>(move(message), node);
+}
+
+// Same as above for a StringView message; the conversion to String is unwrapped with MUST.
+NonnullRefPtr<ParseError> ParseError::create(StringView message, XML::Node const* node)
+{
+    return create(MUST(String::from_utf8(message)), node);
+}
+
+// Same as above for the result of String::formatted(); it is unwrapped with MUST.
+// FIXME: Remove once String::formatted becomes infallible.
+NonnullRefPtr<ParseError> ParseError::create(ErrorOr<String> message, XML::Node const* node)
+{
+    return create(MUST(message), node);
+}
+
+// Renders the error as "error: <message>" followed by one line for m_node and for each
+// of its ancestors. Every such line gives the node's line and column (stored offset + 1)
+// and the opening tag with attributes, the trimmed text, or "comment", as appropriate.
+String ParseError::to_string() const
+{
+    StringBuilder builder;
+    builder.appendff("error: {}\n", m_message);
+
+    // Walk from the offending node up through its parents.
+    XML::Node const* current = m_node;
+    while (current != nullptr) {
+        builder.appendff(" at {}:{} ", current->offset.line + 1, current->offset.column + 1);
+        if (current->is_element()) {
+            builder.append("<"sv);
+            builder.append(current->as_element().name);
+            for (auto const& [key, value] : current->as_element().attributes)
+                builder.appendff(" {}=\"{}\"", key, value);
+            builder.append(">\n"sv);
+        } else if (current->is_text()) {
+            builder.appendff("text \"{}\"\n", current->as_text().builder.string_view().trim_whitespace());
+        } else {
+            builder.append("comment\n"sv);
+        }
+        current = current->parent;
+    }
+    return MUST(builder.to_string());
+}
+
+}
diff --git a/Meta/Lagom/Tools/CodeGenerators/JSSpecCompiler/Parser/ParseError.h b/Meta/Lagom/Tools/CodeGenerators/JSSpecCompiler/Parser/ParseError.h
new file mode 100644
index 0000000000..76c44eed7f
--- /dev/null
+++ b/Meta/Lagom/Tools/CodeGenerators/JSSpecCompiler/Parser/ParseError.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2023, Dan Klishch
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#pragma once
+
+// NOTE(review): the targets of the two #include directives below are missing in this copy
+// of the patch; restore them from the original change before applying it.
+#include
+#include
+
+namespace JSSpecCompiler {
+
+// A reference-counted parse error: a message plus the XML node it refers to (may be null,
+// in which case to_string() prints the message only).
+class ParseError : public RefCounted<ParseError> {
+public:
+    ParseError(String&& message, XML::Node const* node)
+        : m_message(move(message))
+        , m_node(node)
+    {
+    }
+
+    // Factory functions; the StringView and ErrorOr<String> overloads unwrap with MUST.
+    static NonnullRefPtr<ParseError> create(String message, XML::Node const* node);
+    static NonnullRefPtr<ParseError> create(StringView message, XML::Node const* node);
+    static NonnullRefPtr<ParseError> create(ErrorOr<String> message, XML::Node const* node);
+
+    // Formats the message followed by the chain of nodes from m_node up through its parents.
+    String to_string() const;
+
+private:
+    String m_message;
+    XML::Node const* m_node;
+    // TODO: Support chained parse errors
+};
+
+// Result type used by the parser functions: a T on success or a ParseError on failure.
+template<typename T>
+using ParseErrorOr = ErrorOr<T, NonnullRefPtr<ParseError>>;
+
+}
diff --git a/Meta/Lagom/Tools/CodeGenerators/JSSpecCompiler/Parser/XMLUtils.cpp
b/Meta/Lagom/Tools/CodeGenerators/JSSpecCompiler/Parser/XMLUtils.cpp
new file mode 100644
index 0000000000..a97d1eb5c3
--- /dev/null
+++ b/Meta/Lagom/Tools/CodeGenerators/JSSpecCompiler/Parser/XMLUtils.cpp
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2023, Dan Klishch
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+// NOTE(review): the targets of the two #include directives below are missing in this copy
+// of the patch; restore them from the original change before applying it.
+#include
+#include
+
+#include "Parser/XMLUtils.h"
+
+namespace JSSpecCompiler {
+
+// Returns true if the text node `node` is empty or consists of whitespace only.
+// `node` is accessed with as_text(), so callers must pass a text node.
+bool contains_empty_text(XML::Node const* node)
+{
+    return node->as_text().builder.string_view().trim_whitespace().is_empty();
+}
+
+// Returns the value of the attribute `attribute_name` of the element `node`, or a
+// ParseError attached to `node` if the element has no such attribute.
+// NOTE(review): the template argument of the return type is missing in this copy of the
+// patch; StringView is a reconstruction based on how callers use the result -- confirm.
+ParseErrorOr<StringView> get_attribute_by_name(XML::Node const* node, StringView attribute_name)
+{
+    auto const& attribute = node->as_element().attributes.get(attribute_name);
+
+    if (!attribute.has_value())
+        return ParseError::create(String::formatted("Attribute {} is not present", attribute_name), node);
+    return attribute.value();
+}
+
+// Returns the text of the element `node`, which must have exactly one child and that
+// child must be a text node; otherwise a ParseError attached to `node` is returned.
+ParseErrorOr<StringView> get_text_contents(XML::Node const* node)
+{
+    auto const& children = node->as_element().children;
+
+    if (children.size() != 1 || !children[0]->is_text())
+        return ParseError::create("Expected single text node in a child list of the node"sv, node);
+    return children[0]->as_text().builder.string_view();
+}
+
+// Returns the single child element of `element`, which must be named `tag_name`.
+// Comments and whitespace-only text around it are ignored. Fails with a ParseError if
+// there is no child element, more than one, one with another name, or any other text.
+ParseErrorOr<XML::Node const*> get_only_child(XML::Node const* element, StringView tag_name)
+{
+    XML::Node const* result = nullptr;
+
+    for (auto const& child : element->as_element().children) {
+        TRY(child->content.visit(
+            [&](XML::Node::Element const& element) -> ParseErrorOr<void> {
+                if (element.name != tag_name)
+                    return ParseError::create(String::formatted("Expected child with the tag name {} but found {}", tag_name, element.name), child);
+                if (result != nullptr)
+                    return ParseError::create("Element must have only one child"sv, child);
+                result = child;
+                return {};
+            },
+            [&](XML::Node::Text const&) -> ParseErrorOr<void> {
+                if (!contains_empty_text(child))
+                    return ParseError::create("Element should not have non-empty child text nodes"sv, element);
+                return {};
+            },
+            move(ignore_comments)));
+    }
+
+    if (result == nullptr)
+        return ParseError::create("Element must have only one child"sv, element);
+    return result;
+}
+
+}
diff --git a/Meta/Lagom/Tools/CodeGenerators/JSSpecCompiler/Parser/XMLUtils.h b/Meta/Lagom/Tools/CodeGenerators/JSSpecCompiler/Parser/XMLUtils.h
new file mode 100644
index 0000000000..fdf5d4bc90
--- /dev/null
+++ b/Meta/Lagom/Tools/CodeGenerators/JSSpecCompiler/Parser/XMLUtils.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2023, Dan Klishch
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#pragma once
+
+// NOTE(review): the target of the #include directive below is missing in this copy of the
+// patch; restore it from the original change before applying it.
+#include
+
+#include "Parser/ParseError.h"
+
+namespace JSSpecCompiler {
+
+// Visitor for XML comment nodes that accepts them and does nothing.
+struct IgnoreComments {
+    ParseErrorOr<void> operator()(XML::Node::Comment const&) { return {}; }
+};
+
+inline constexpr IgnoreComments ignore_comments {};
+
+// Returns true if the text node `node` is empty or whitespace-only.
+bool contains_empty_text(XML::Node const* node);
+
+// Returns the value of the named attribute of the element `node`, or a ParseError.
+ParseErrorOr<StringView> get_attribute_by_name(XML::Node const* node, StringView attribute_name);
+
+// Returns the text of an element whose only child is a text node, or a ParseError.
+ParseErrorOr<StringView> get_text_contents(XML::Node const* node);
+
+// Returns the only child element of `element`, which must be named `tag_name`, or a ParseError.
+ParseErrorOr<XML::Node const*> get_only_child(XML::Node const* element, StringView tag_name);
+
+}