From f70e39d50180683f7519d8881644af4cab503497 Mon Sep 17 00:00:00 2001
From: Dan Klishch
Date: Thu, 17 Aug 2023 22:30:01 -0400
Subject: [PATCH] JSSpecCompiler: Add TextParser for converting algorithm steps into AST

---
 .../JSSpecCompiler/Parser/TextParser.cpp      | 539 ++++++++++++++++++
 .../JSSpecCompiler/Parser/TextParser.h        |  72 +++
 2 files changed, 611 insertions(+)
 create mode 100644 Meta/Lagom/Tools/CodeGenerators/JSSpecCompiler/Parser/TextParser.cpp
 create mode 100644 Meta/Lagom/Tools/CodeGenerators/JSSpecCompiler/Parser/TextParser.h

diff --git a/Meta/Lagom/Tools/CodeGenerators/JSSpecCompiler/Parser/TextParser.cpp b/Meta/Lagom/Tools/CodeGenerators/JSSpecCompiler/Parser/TextParser.cpp
new file mode 100644
index 0000000000..c0452da8c1
--- /dev/null
+++ b/Meta/Lagom/Tools/CodeGenerators/JSSpecCompiler/Parser/TextParser.cpp
@@ -0,0 +1,539 @@
+/*
+ * Copyright (c) 2023, Dan Klishch
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#include <AK/ScopeGuard.h>
+
+#include "Parser/TextParser.h"
+
+namespace JSSpecCompiler {
+
+// Moves the cursor back by one token.
+void TextParser::retreat()
+{
+    --m_next_token_index;
+}
+
+// Captures the current cursor position in an armed scope guard whose callback
+// resets the cursor to that position; successful parses call disarm() on it.
+auto TextParser::rollback_point()
+{
+    return ArmedScopeGuard {
+        [this, index = this->m_next_token_index] {
+            m_next_token_index = index;
+        }
+    };
+}
+
+// Returns the token under the cursor without advancing, or an error at end of input.
+ParseErrorOr<Token const*> TextParser::peek_token()
+{
+    if (m_next_token_index == m_tokens.size())
+        return ParseError::create("Expected token but found EOF"sv, m_node);
+    return &m_tokens[m_next_token_index];
+}
+
+// Like peek_token(), but advances the cursor when a token is available.
+ParseErrorOr<Token const*> TextParser::consume_token()
+{
+    auto result = peek_token();
+    if (!result.is_error())
+        ++m_next_token_index;
+    return result;
+}
+
+// Consumes the next token if its type is one of `types`; otherwise leaves the
+// cursor where it was and reports the unexpected token.
+ParseErrorOr<Token const*> TextParser::consume_token_with_one_of_types(std::initializer_list<TokenType> types)
+{
+    auto token = TRY(consume_token());
+    for (TokenType type : types)
+        if (token->type == type)
+            return token;
+    retreat();
+
+    return ParseError::create(String::formatted("Unexpected token type {}", token->name()), token->node);
+}
+
+// Single-type convenience wrapper around consume_token_with_one_of_types().
+ParseErrorOr<Token const*> TextParser::consume_token_with_type(TokenType type)
+{
+    return consume_token_with_one_of_types({ type });
+}
+
+// Consumes a Word token equal to `word` (ASCII case-insensitive); on mismatch the
+// cursor is left untouched.
+ParseErrorOr<void> TextParser::consume_word(StringView word)
+{
+    auto token = TRY(consume_token_with_type(TokenType::Word));
+    if (!token->data.equals_ignoring_ascii_case(word)) {
+        retreat();
+        return ParseError::create("Unexpected word"sv, token->node);
+    }
+    return {};
+}
+
+// Consumes `words` in order. Words matched before a failure stay consumed, so
+// callers that need atomicity should hold a rollback_point().
+ParseErrorOr<void> TextParser::consume_words(std::initializer_list<StringView> words)
+{
+    for (auto word : words)
+        TRY(consume_word(word));
+    return {};
+}
+
+bool TextParser::is_eof() const
+{
+    return m_next_token_index == m_tokens.size();
+}
+
+// Succeeds only when every token has been consumed.
+ParseErrorOr<void> TextParser::expect_eof() const
+{
+    if (!is_eof())
+        return ParseError::create("Expected EOF"sv, m_node);
+    return {};
+}
+
+// (the)? <record_name> { (<name>: <value>,)* }
+ParseErrorOr<Tree> TextParser::parse_record_direct_list_initialization()
+{
+    auto rollback = rollback_point();
+
+    // The leading article is optional, so a failure here is deliberately ignored.
+    (void)consume_word("the"sv);
+
+    auto identifier = TRY(consume_token_with_type(TokenType::Identifier));
+    TRY(consume_token_with_type(TokenType::BraceOpen));
+    Vector<RecordDirectListInitialization::Argument> arguments;
+    while (true) {
+        auto name = TRY(consume_token_with_one_of_types({ TokenType::Identifier, TokenType::BraceClose }));
+
+        if (name->is_bracket()) {
+            break;
+        } else {
+            TRY(consume_token_with_type(TokenType::Colon));
+            auto value = TRY(parse_expression());
+            // The separator is not enforced: a missing comma is silently accepted.
+            (void)consume_token_with_type(TokenType::Comma);
+            arguments.append({ make_ref_counted<UnresolvedReference>(name->data), value });
+        }
+    }
+
+    rollback.disarm();
+    return make_ref_counted<RecordDirectListInitialization>(
+        make_ref_counted<UnresolvedReference>(identifier->data), move(arguments));
+}
+
+// <expr>
+// Stack-based parser driven by operator precedence: operands (Tree) and pending
+// operators or brackets (Token) share one stack and are folded as precedence allows.
+ParseErrorOr<Tree> TextParser::parse_expression()
+{
+    auto rollback = rollback_point();
+
+    // (the)? <record_name> { (<name>: <value>,)* }
+    if (auto record_init = parse_record_direct_list_initialization(); !record_init.is_error()) {
+        rollback.disarm();
+        return record_init.release_value();
+    }
+
+#define THROW_PARSE_ERROR_IF(expr)                                      \
+    do {                                                                \
+        if (expr)                                                       \
+            return ParseError::create("Expected expression"sv, m_node); \
+    } while (false)
+#define THROW_PARSE_ERROR THROW_PARSE_ERROR_IF(true)
+
+    // Operands and pending operators/brackets, in source order.
+    Vector<Variant<Tree, Token>> stack;
+
+    // Folds operators on top of the stack into their operands: unary operators
+    // always, binary operators only while their precedence() is below `precedence`.
+    auto merge_stack = [&](i32 precedence) {
+        if (!stack.last().has<Tree>())
+            return;
+
+        while (stack.size() >= 2) {
+            auto const& maybe_operator = stack[stack.size() - 2];
+            if (!maybe_operator.has<Token>())
+                break;
+            auto last_operator = maybe_operator.get<Token>();
+
+            auto right = stack.last().get<Tree>();
+
+            if (last_operator.is_unary_operator()) {
+                auto operation = make_ref_counted<UnaryOperation>(last_operator.as_unary_operator(), right);
+                stack.shrink(stack.size() - 2);
+                stack.empend(operation);
+            } else if (last_operator.is_binary_operator() && last_operator.precedence() < precedence) {
+                auto left = stack[stack.size() - 3].get<Tree>();
+                auto operation = make_ref_counted<BinaryOperation>(last_operator.as_binary_operator(), left, right);
+                stack.shrink(stack.size() - 3);
+                stack.empend(operation);
+            } else {
+                break;
+            }
+        }
+    };
+
+    // Folds a trailing <tree> <pre-merged binary operator> <tree> triple, if present.
+    auto merge_pre_merged = [&] {
+        if (stack.size() < 3)
+            return;
+
+        auto const& maybe_left = stack[stack.size() - 3];
+        auto const& maybe_operator = stack[stack.size() - 2];
+        auto const& maybe_right = stack.last();
+
+        if (!maybe_left.has<Tree>() || !maybe_operator.has<Token>() || !maybe_right.has<Tree>())
+            return;
+
+        auto last_operator = maybe_operator.get<Token>();
+        if (!last_operator.is_pre_merged_binary_operator())
+            return;
+
+        auto expression = make_ref_counted<BinaryOperation>(last_operator.as_binary_operator(), maybe_left.get<Tree>(), maybe_right.get<Tree>());
+
+        stack.shrink(stack.size() - 3);
+        stack.empend(expression);
+    };
+
+    // Counts unmatched opening brackets seen by this invocation.
+    i32 bracket_balance = 0;
+
+    while (true) {
+        auto token_or_error = peek_token();
+        if (token_or_error.is_error())
+            break;
+        // `token` is a copy, so rewriting its type below does not touch m_tokens.
+        auto token = *token_or_error.release_value();
+
+        enum {
+            NoneType,
+            ExpressionType,
+            PreMergedBinaryOperatorType,
+            UnaryOperatorType,
+            BinaryOperatorType,
+            BracketType,
+        } last_element_type;
+
+        if (stack.is_empty())
+            last_element_type = NoneType;
+        else if (stack.last().has<Tree>())
+            last_element_type = ExpressionType;
+        else if (stack.last().get<Token>().is_pre_merged_binary_operator())
+            last_element_type = PreMergedBinaryOperatorType;
+        else if (stack.last().get<Token>().is_unary_operator())
+            last_element_type = UnaryOperatorType;
+        else if (stack.last().get<Token>().is_binary_operator())
+            last_element_type = BinaryOperatorType;
+        else if (stack.last().get<Token>().is_bracket())
+            last_element_type = BracketType;
+        else
+            VERIFY_NOT_REACHED();
+
+        if (token.is_ambiguous_operator()) {
+            if (token.type == TokenType::AmbiguousMinus)
+                token.type = last_element_type == ExpressionType ? TokenType::BinaryMinus : TokenType::UnaryMinus;
+            else
+                VERIFY_NOT_REACHED();
+        }
+
+        bracket_balance += token.is_opening_bracket();
+        bracket_balance -= token.is_closing_bracket();
+
+        // A closing bracket that was not opened here belongs to the caller.
+        if (bracket_balance < 0)
+            break;
+
+        if (token.type == TokenType::ParenOpen) {
+            // "(" right after an operand starts a call's argument list.
+            if (last_element_type == ExpressionType)
+                stack.append(Token { TokenType::FunctionCall, ""sv, m_node });
+            stack.append(token);
+        } else if (token.is_pre_merged_binary_operator()) {
+            THROW_PARSE_ERROR_IF(last_element_type != ExpressionType);
+            stack.append(token);
+        } else if (token.is_unary_operator()) {
+            THROW_PARSE_ERROR_IF(last_element_type == PreMergedBinaryOperatorType);
+            stack.append(token);
+        } else if (token.is_binary_operator() || token.is_closing_bracket()) {
+            // A top-level comma ends this expression (e.g. between record fields).
+            if (bracket_balance == 0 && token.type == TokenType::Comma)
+                break;
+
+            THROW_PARSE_ERROR_IF(last_element_type != ExpressionType);
+
+            merge_stack(token.precedence());
+            if (token.is_closing_bracket()) {
+                THROW_PARSE_ERROR_IF(stack.size() == 1);
+                THROW_PARSE_ERROR_IF(!stack[stack.size() - 2].get<Token>().matches_with(token));
+                stack.remove(stack.size() - 2);
+                merge_pre_merged();
+            } else {
+                stack.append(token);
+            }
+        } else {
+            NullableTree expression;
+            if (token.type == TokenType::Identifier) {
+                expression = make_ref_counted<UnresolvedReference>(token.data);
+            } else if (token.type == TokenType::Number) {
+                // Report unparsable or out-of-range numbers instead of crashing.
+                auto number = token.data.to_int<i64>();
+                THROW_PARSE_ERROR_IF(!number.has_value());
+                expression = make_ref_counted<MathematicalConstant>(number.value());
+            } else if (token.type == TokenType::String) {
+                expression = make_ref_counted<StringLiteral>(token.data);
+            } else {
+                break;
+            }
+            THROW_PARSE_ERROR_IF(last_element_type == ExpressionType);
+            stack.append(expression.release_nonnull());
+            merge_pre_merged();
+        }
+
+        MUST(consume_token());
+    }
+
+    // Fold everything that is left; the result must be exactly one tree.
+    THROW_PARSE_ERROR_IF(stack.is_empty());
+    merge_stack(closing_bracket_precedence);
+    THROW_PARSE_ERROR_IF(stack.size() != 1 || !stack[0].has<Tree>());
+
+    rollback.disarm();
+    return stack[0].get<Tree>();
+#undef THROW_PARSE_ERROR
+#undef THROW_PARSE_ERROR_IF
+}
+
+// <condition> :== <expr> | (<expr> is <expr> (or <expr>)?)
+ParseErrorOr<Tree> TextParser::parse_condition()
+{
+    auto rollback = rollback_point();
+    auto expression = TRY(parse_expression());
+
+    if (!consume_token_with_type(TokenType::Is).is_error()) {
+        Vector<Tree> compare_values { TRY(parse_expression()) };
+        if (!consume_word("or"sv).is_error())
+            compare_values.append(TRY(parse_expression()));
+
+        rollback.disarm();
+        return make_ref_counted<IsOneOfOperation>(expression, move(compare_values));
+    }
+
+    rollback.disarm();
+    return expression;
+}
+
+// return <expr>
+ParseErrorOr<Tree> TextParser::parse_return_statement()
+{
+    auto rollback = rollback_point();
+
+    TRY(consume_word("return"sv));
+    auto return_value = TRY(parse_expression());
+
+    rollback.disarm();
+    return make_ref_counted<ReturnExpression>(return_value);
+}
+
+// assert: <condition>
+ParseErrorOr<Tree> TextParser::parse_assert()
+{
+    auto rollback = rollback_point();
+
+    auto identifier = TRY(consume_token_with_type(TokenType::Identifier))->data;
+    if (!identifier.equals_ignoring_ascii_case("assert"sv)) {
+        return ParseError::create("Expected identifier \"Assert\""sv, m_node);
+    }
+
+    TRY(consume_token_with_type(TokenType::Colon));
+    auto condition = TRY(parse_condition());
+
+    rollback.disarm();
+    return make_ref_counted<AssertExpression>(condition);
+}
+
+// (let <expr> be <expr>) | (set <expr> to <expr>)
+ParseErrorOr<Tree> TextParser::parse_assignment()
+{
+    auto rollback = rollback_point();
+
+    bool is_let = !consume_word("let"sv).is_error();
+    if (!is_let)
+        TRY(consume_word("set"sv));
+    auto lvalue = TRY(parse_expression());
+    TRY(consume_word(is_let ? "be"sv : "to"sv));
+    auto rvalue = TRY(parse_expression());
+
+    rollback.disarm();
+    auto op = is_let ? BinaryOperator::Declaration : BinaryOperator::Assignment;
+    return make_ref_counted<BinaryOperation>(op, lvalue, rvalue);
+}
+
+// <simple_step>
+ParseErrorOr<Tree> TextParser::parse_simple_step_or_inline_if_branch()
+{
+    auto rollback = rollback_point();
+
+    // Return <expr>.$
+    if (auto result = parse_return_statement(); !result.is_error()) {
+        TRY(consume_token_with_type(TokenType::Dot));
+        TRY(expect_eof());
+        rollback.disarm();
+        return result.release_value();
+    }
+
+    // Assert: <condition>.$
+    if (auto result = parse_assert(); !result.is_error()) {
+        TRY(consume_token_with_type(TokenType::Dot));
+        TRY(expect_eof());
+        rollback.disarm();
+        return result.release_value();
+    }
+
+    // Let <expr> be <expr>.$
+    // Set <expr> to <expr>.$
+    if (auto result = parse_assignment(); !result.is_error()) {
+        TRY(consume_token_with_type(TokenType::Dot));
+        TRY(expect_eof());
+        rollback.disarm();
+        return result.release_value();
+    }
+
+    return ParseError::create("Unable to parse simple step or inline if branch"sv, m_node);
+}
+
+// <if_condition> :== (If <condition>) | (Else) | (Else if <condition>),
+ParseErrorOr<TextParser::IfConditionParseResult> TextParser::parse_if_beginning()
+{
+    auto rollback = rollback_point();
+
+    bool is_if_branch = !consume_word("if"sv).is_error();
+    Optional<Tree> condition;
+    if (is_if_branch) {
+        condition = TRY(parse_condition());
+    } else {
+        TRY(consume_word("else"sv));
+        if (!consume_word("if"sv).is_error())
+            condition = TRY(parse_condition());
+    }
+    TRY(consume_token_with_type(TokenType::Comma));
+
+    rollback.disarm();
+    return IfConditionParseResult { is_if_branch, condition };
+}
+
+// <inline_if> :== <if_condition> <simple_step>.$
+ParseErrorOr<Tree> TextParser::parse_inline_if_else()
+{
+    auto rollback = rollback_point();
+
+    auto [is_if_branch, condition] = TRY(parse_if_beginning());
+    auto then_branch = TRY(parse_simple_step_or_inline_if_branch());
+
+    rollback.disarm();
+    // `condition` is always set for an If branch; it may be empty for a plain Else.
+    if (is_if_branch)
+        return make_ref_counted<IfBranch>(*condition, then_branch);
+    else
+        return make_ref_counted<ElseIfBranch>(condition, then_branch);
+}
+
+// <if> :== <if_condition> then$ <substeps>
+ParseErrorOr<Tree> TextParser::parse_if(Tree then_branch)
+{
+    auto rollback = rollback_point();
+
+    auto [is_if_branch, condition] = TRY(parse_if_beginning());
+    TRY(consume_word("then"sv));
+    TRY(expect_eof());
+
+    rollback.disarm();
+    // `condition` is always set for an If branch; it may be empty for a plain Else.
+    if (is_if_branch)
+        return make_ref_counted<IfBranch>(*condition, then_branch);
+    else
+        return make_ref_counted<ElseIfBranch>(condition, then_branch);
+}
+
+// <else> :== Else,$ <substeps>
+ParseErrorOr<Tree> TextParser::parse_else(Tree else_branch)
+{
+    auto rollback = rollback_point();
+
+    TRY(consume_word("else"sv));
+    TRY(consume_token_with_type(TokenType::Comma));
+    TRY(expect_eof());
+
+    rollback.disarm();
+    return make_ref_counted<ElseIfBranch>(Optional<Tree> {}, else_branch);
+}
+
+// <simple_step> | <inline_if>
+ParseErrorOr<Tree> TextParser::parse_step_without_substeps()
+{
+    auto rollback = rollback_point();
+
+    // <simple_step>
+    if (auto result = parse_simple_step_or_inline_if_branch(); !result.is_error()) {
+        rollback.disarm();
+        return result.release_value();
+    }
+
+    // <inline_if>
+    if (auto result = parse_inline_if_else(); !result.is_error()) {
+        rollback.disarm();
+        return result.release_value();
+    }
+
+    return ParseError::create("Unable to parse step without substeps"sv, m_node);
+}
+
+// <if> | <else>
+ParseErrorOr<Tree> TextParser::parse_step_with_substeps(Tree substeps)
+{
+    auto rollback = rollback_point();
+
+    // <if>
+    if (auto result = parse_if(substeps); !result.is_error()) {
+        rollback.disarm();
+        return result.release_value();
+    }
+
+    // <else>
+    if (auto result = parse_else(substeps); !result.is_error()) {
+        rollback.disarm();
+        return result.release_value();
+    }
+
+    return ParseError::create("Unable to parse step with substeps"sv, m_node);
+}
+
+// <section_number> <function_name> ( <identifier> (, <identifier>)* )$
+ParseErrorOr<TextParser::DefinitionParseResult> TextParser::parse_definition()
+{
+    DefinitionParseResult result;
+
+    auto section_number_token = TRY(consume_token_with_type(TokenType::SectionNumber));
+    result.section_number = section_number_token->data;
+
+    // Any token kind is accepted as the function name.
+    result.function_name = TRY(consume_token())->data;
+
+    TRY(consume_token_with_type(TokenType::ParenOpen));
+    while (true) {
+        result.arguments.append({ TRY(consume_token_with_type(TokenType::Identifier))->data });
+        auto next_token = TRY(consume_token_with_one_of_types({ TokenType::ParenClose, TokenType::Comma }));
+        if (next_token->type == TokenType::ParenClose)
+            break;
+    }
+    TRY(expect_eof());
+
+    return result;
+}
+
+}
diff --git a/Meta/Lagom/Tools/CodeGenerators/JSSpecCompiler/Parser/TextParser.h b/Meta/Lagom/Tools/CodeGenerators/JSSpecCompiler/Parser/TextParser.h
new file mode 100644
index 0000000000..47e6b171ad
--- /dev/null
+++ b/Meta/Lagom/Tools/CodeGenerators/JSSpecCompiler/Parser/TextParser.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2023, Dan Klishch
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#pragma once
+
+#include "AST/AST.h"
+#include "Parser/ParseError.h"
+#include "Parser/Token.h"
+
+namespace JSSpecCompiler {
+
+// Turns a token sequence (a definition header or one algorithm step) into AST
+// nodes; each parse_* helper restores the cursor when it fails.
+class TextParser {
+public:
+    struct DefinitionParseResult {
+        StringView section_number;
+        StringView function_name;
+        Vector<StringView> arguments;
+    };
+
+    // `tokens_` is stored by reference and must outlive the parser; `node_` is
+    // attached to the errors this parser reports.
+    TextParser(Vector<Token>& tokens_, XML::Node const* node_)
+        : m_tokens(tokens_)
+        , m_node(node_)
+    {
+    }
+
+    ParseErrorOr<DefinitionParseResult> parse_definition();
+    ParseErrorOr<Tree> parse_step_without_substeps();
+    ParseErrorOr<Tree> parse_step_with_substeps(Tree substeps);
+
+private:
+    struct IfConditionParseResult {
+        bool is_if_branch;
+        Optional<Tree> condition;
+    };
+
+    void retreat();
+    [[nodiscard]] auto rollback_point();
+    ParseErrorOr<Token const*> peek_token();
+    ParseErrorOr<Token const*> consume_token();
+    ParseErrorOr<Token const*> consume_token_with_one_of_types(std::initializer_list<TokenType> types);
+    ParseErrorOr<Token const*> consume_token_with_type(TokenType type);
+    ParseErrorOr<void> consume_word(StringView word);
+    ParseErrorOr<void> consume_words(std::initializer_list<StringView> words);
+    bool is_eof() const;
+    ParseErrorOr<void> expect_eof() const;
+
+    ParseErrorOr<Tree> parse_record_direct_list_initialization();
+    ParseErrorOr<Tree> parse_expression();
+    ParseErrorOr<Tree> parse_condition();
+    ParseErrorOr<Tree> parse_return_statement();
+    ParseErrorOr<Tree> parse_assert();
+    ParseErrorOr<Tree> parse_assignment();
+    ParseErrorOr<Tree> parse_simple_step_or_inline_if_branch();
+    ParseErrorOr<IfConditionParseResult> parse_if_beginning();
+    ParseErrorOr<Tree> parse_inline_if_else();
+    ParseErrorOr<Tree> parse_if(Tree then_branch);
+    ParseErrorOr<Tree> parse_else(Tree else_branch);
+
+    Vector<Token> const& m_tokens;
+    // Index of the next token to be consumed.
+    size_t m_next_token_index = 0;
+    XML::Node const* m_node;
+};
+
+}