From 3d8cc2257fd2db8139937cae3638772d8844383b Mon Sep 17 00:00:00 2001
From: Sam Atkins
Date: Fri, 3 Mar 2023 12:12:56 +0000
Subject: [PATCH] LibCMake: Introduce a CMake lexer

---
 AK/Debug.h.in                              |   4 +
 Meta/CMake/all_the_debug_macros.cmake      |   1 +
 Userland/Libraries/CMakeLists.txt          |   1 +
 Userland/Libraries/LibCMake/CMakeLists.txt |   7 +
 Userland/Libraries/LibCMake/Lexer.cpp      | 418 +++++++++++++++++++++
 Userland/Libraries/LibCMake/Lexer.h        |  51 +++
 Userland/Libraries/LibCMake/Token.cpp      |  44 ++
 Userland/Libraries/LibCMake/Token.h        | 102 +++++
 8 files changed, 628 insertions(+)
 create mode 100644 Userland/Libraries/LibCMake/CMakeLists.txt
 create mode 100644 Userland/Libraries/LibCMake/Lexer.cpp
 create mode 100644 Userland/Libraries/LibCMake/Lexer.h
 create mode 100644 Userland/Libraries/LibCMake/Token.cpp
 create mode 100644 Userland/Libraries/LibCMake/Token.h

diff --git a/AK/Debug.h.in b/AK/Debug.h.in
index 045db27c0b..5428534534 100644
--- a/AK/Debug.h.in
+++ b/AK/Debug.h.in
@@ -34,6 +34,10 @@
 #    cmakedefine01 CANVAS_RENDERING_CONTEXT_2D_DEBUG
 #endif
 
+#ifndef CMAKE_DEBUG
+#    cmakedefine01 CMAKE_DEBUG
+#endif
+
 #ifndef COMPOSE_DEBUG
 #    cmakedefine01 COMPOSE_DEBUG
 #endif
diff --git a/Meta/CMake/all_the_debug_macros.cmake b/Meta/CMake/all_the_debug_macros.cmake
index 9905999fa7..c1a17b8475 100644
--- a/Meta/CMake/all_the_debug_macros.cmake
+++ b/Meta/CMake/all_the_debug_macros.cmake
@@ -13,6 +13,7 @@ set(BXVGA_DEBUG ON)
 set(CACHE_DEBUG ON)
 set(CALLBACK_MACHINE_DEBUG ON)
 set(CANVAS_RENDERING_CONTEXT_2D_DEBUG ON)
+set(CMAKE_DEBUG ON)
 set(COMMIT_DEBUG ON)
 set(COMPOSE_DEBUG ON)
 set(CONTEXT_SWITCH_DEBUG ON)
diff --git a/Userland/Libraries/CMakeLists.txt b/Userland/Libraries/CMakeLists.txt
index 1700d65457..bb5403fbd6 100644
--- a/Userland/Libraries/CMakeLists.txt
+++ b/Userland/Libraries/CMakeLists.txt
@@ -3,6 +3,7 @@ add_subdirectory(LibAudio)
 add_subdirectory(LibC)
 add_subdirectory(LibCards)
 add_subdirectory(LibChess)
+add_subdirectory(LibCMake)
 add_subdirectory(LibCodeComprehension)
 add_subdirectory(LibCompress)
 add_subdirectory(LibConfig)
diff --git a/Userland/Libraries/LibCMake/CMakeLists.txt b/Userland/Libraries/LibCMake/CMakeLists.txt
new file mode 100644
index 0000000000..58f7df5f0a
--- /dev/null
+++ b/Userland/Libraries/LibCMake/CMakeLists.txt
@@ -0,0 +1,7 @@
+set(SOURCES
+    Lexer.cpp
+    Token.cpp
+)
+
+serenity_lib(LibCMake cmake)
+target_link_libraries(LibCMake PRIVATE LibSyntax)
diff --git a/Userland/Libraries/LibCMake/Lexer.cpp b/Userland/Libraries/LibCMake/Lexer.cpp
new file mode 100644
index 0000000000..72b467fc34
--- /dev/null
+++ b/Userland/Libraries/LibCMake/Lexer.cpp
@@ -0,0 +1,418 @@
+/*
+ * Copyright (c) 2023, Sam Atkins
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#include "Lexer.h"
+#include <AK/CharacterTypes.h>
+#include <AK/Debug.h>
+#include <AK/Format.h>
+#include <AK/ScopeLogger.h>
+
+namespace CMake {
+
+static bool is_valid_identifier_initial_char(char c)
+{
+    return is_ascii_alpha(c) || c == '_';
+}
+
+static bool is_valid_identifier_char(char c)
+{
+    return is_ascii_alphanumeric(c) || c == '_';
+}
+
+// Lexes `input` as a CMake file and returns every token found, comments included.
+ErrorOr<Vector<Token>> Lexer::lex(StringView input)
+{
+    Lexer lexer { input };
+    return lexer.lex_file();
+}
+
+Lexer::Lexer(StringView input)
+    : GenericLexer(input)
+{
+}
+
+// Tokenizes the whole input: whitespace and comments are skipped or emitted first, then anything starting
+// with an identifier character is lexed as a command invocation and anything else becomes a Garbage token.
+ErrorOr<Vector<Token>> Lexer::lex_file()
+{
+    m_tokens.clear_with_capacity();
+
+    while (!is_eof()) {
+        consume_whitespace_or_comments();
+
+        if (is_eof())
+            break;
+
+        if (is_valid_identifier_initial_char(peek())) {
+            consume_command_invocation();
+        } else {
+            consume_garbage();
+        }
+    }
+
+    return m_tokens;
+}
+
+// Skips ASCII whitespace, sending every '\n' through next_line() so that line/column tracking stays correct.
+void Lexer::skip_whitespace()
+{
+    while (!is_eof()) {
+        if (next_is('\n')) {
+            next_line();
+            continue;
+        }
+        // This has to skip everything is_ascii_space() accepts (e.g. the '\r' of a CRLF line ending):
+        // consume_garbage() and consume_unquoted_argument() stop at any such character without consuming
+        // it, so leaving one behind would make their callers loop forever.
+        auto consumed = consume_while([&](char c) {
+            return c != '\n' && is_ascii_space(c);
+        });
+        if (consumed.is_empty())
+            break;
+    }
+}
+
+void Lexer::consume_whitespace_or_comments()
+{
+    ScopeLogger<CMAKE_DEBUG> log;
+    while (!is_eof()) {
+        skip_whitespace();
+
+        if (next_is('#')) {
+            consume_comment();
+        } else {
+            break;
+        }
+    }
+}
+
+// https://cmake.org/cmake/help/latest/manual/cmake-language.7.html#command-invocations
+void Lexer::consume_command_invocation()
+{
+    ScopeLogger<CMAKE_DEBUG> log;
+    auto identifier_start = position();
+    auto identifier = consume_while(is_valid_identifier_char);
+    auto control_keyword = control_keyword_from_string(identifier);
+    if (control_keyword.has_value()) {
+        emit_token(Token::Type::ControlKeyword, identifier, identifier_start, position(), control_keyword.release_value());
+    } else {
+        emit_token(Token::Type::Identifier, identifier, identifier_start, position());
+    }
+
+    consume_whitespace_or_comments();
+
+    if (next_is('('))
+        consume_open_paren();
+
+    consume_arguments();
+
+    if (next_is(')'))
+        consume_close_paren();
+}
+
+// Consumes arguments, including nested parenthesized groups, until the `)` that closes the current group
+// (which is left for the caller) or the end of the input.
+void Lexer::consume_arguments()
+{
+    ScopeLogger<CMAKE_DEBUG> log;
+    while (!is_eof()) {
+        consume_whitespace_or_comments();
+
+        // Only whitespace and comments remained, so there is no further argument to emit.
+        if (is_eof())
+            return;
+
+        if (next_is('(')) {
+            consume_open_paren();
+
+            consume_whitespace_or_comments();
+            consume_arguments();
+            consume_whitespace_or_comments();
+
+            if (next_is(')'))
+                consume_close_paren();
+
+            continue;
+        }
+
+        if (next_is(')'))
+            return;
+
+        consume_argument();
+    }
+}
+
+// https://cmake.org/cmake/help/latest/manual/cmake-language.7.html#command-arguments
+void Lexer::consume_argument()
+{
+    ScopeLogger<CMAKE_DEBUG> log;
+    consume_whitespace_or_comments();
+
+    if (next_is('[')) {
+        consume_bracket_argument();
+        return;
+    }
+
+    if (next_is('"')) {
+        consume_quoted_argument();
+        return;
+    }
+
+    consume_unquoted_argument();
+}
+
+// https://cmake.org/cmake/help/latest/manual/cmake-language.7.html#bracket-argument
+void Lexer::consume_bracket_argument()
+{
+    ScopeLogger<CMAKE_DEBUG> log;
+    auto start = position();
+    auto value = read_bracket_argument();
+    emit_token(Token::Type::BracketArgument, value, start, position());
+}
+
+// https://cmake.org/cmake/help/latest/manual/cmake-language.7.html#quoted-argument
+void Lexer::consume_quoted_argument()
+{
+    ScopeLogger<CMAKE_DEBUG> log;
+    auto start = position();
+    auto start_offset = tell();
+    auto found_closing_quote = false;
+
+    VERIFY(consume_specific('"'));
+    while (!is_eof()) {
+        if (next_is('"')) {
+            ignore();
+            found_closing_quote = true;
+            break;
+        }
+
+        if (next_is('\\')) {
+            // Skip the backslash together with the character it escapes, so that neither `\"` nor the second
+            // half of `\\` can be mistaken for the closing quote. A newline is left for the branch below so
+            // that line continuations still go through next_line().
+            ignore();
+            if (!is_eof() && !next_is('\n'))
+                ignore();
+            continue;
+        }
+
+        if (next_is('\n')) {
+            next_line();
+            continue;
+        }
+
+        ignore();
+    }
+
+    // The value excludes the opening quote and, unless the input ended before one was seen, the closing quote.
+    auto whole_token = m_input.substring_view(start_offset, tell() - start_offset);
+    size_t quote_count = found_closing_quote ? 2 : 1;
+    auto value = whole_token.substring_view(1, whole_token.length() - quote_count);
+    auto variable_references = parse_variable_references_from_argument(whole_token, start);
+    emit_token(Token::Type::QuotedArgument, value, start, position(), {}, move(variable_references));
+}
+
+// https://cmake.org/cmake/help/latest/manual/cmake-language.7.html#unquoted-argument
+void Lexer::consume_unquoted_argument()
+{
+    ScopeLogger<CMAKE_DEBUG> log;
+    auto start_offset = tell();
+    auto start = position();
+
+    while (!is_eof()) {
+        if (next_is('\\')) {
+            // Step over the backslash and the character it escapes. A newline is never skipped here, so that
+            // it is still seen by skip_whitespace() and counted by next_line().
+            ignore();
+            if (!is_eof() && !next_is('\n'))
+                ignore();
+            continue;
+        }
+
+        // Per the CMake grammar an unquoted element ends at whitespace or at one of `()#"\`.
+        // A single quote is an ordinary character and must not end the argument.
+        auto consumed = consume_until([](char c) { return is_ascii_space(c) || "()#\"\\"sv.contains(c); });
+        if (consumed.is_empty())
+            break;
+
+        // FIXME: `unquoted_legacy`
+    }
+
+    auto value = m_input.substring_view(start_offset, tell() - start_offset);
+    auto variable_references = parse_variable_references_from_argument(value, start);
+    emit_token(Token::Type::UnquotedArgument, value, start, position(), {}, move(variable_references));
+}
+
+// https://cmake.org/cmake/help/latest/manual/cmake-language.7.html#comments
+void Lexer::consume_comment()
+{
+    ScopeLogger<CMAKE_DEBUG> log;
+    auto start = position();
+
+    VERIFY(consume_specific('#'));
+    if (next_is('[')) {
+        // Bracket comment
+        // https://cmake.org/cmake/help/latest/manual/cmake-language.7.html#bracket-comment
+        auto comment = read_bracket_argument();
+        emit_token(Token::Type::BracketComment, comment, start, position());
+        return;
+    }
+
+    // Line comment
+    // https://cmake.org/cmake/help/latest/manual/cmake-language.7.html#line-comment
+    auto comment = consume_until('\n');
+    emit_token(Token::Type::LineComment, comment, start, position());
+}
+
+void Lexer::consume_open_paren()
+{
+    auto start = position();
+    VERIFY(consume_specific('('));
+    emit_token(Token::Type::OpenParen, "("sv, start, position());
+}
+
+void Lexer::consume_close_paren()
+{
+    auto start = position();
+    VERIFY(consume_specific(')'));
+    emit_token(Token::Type::CloseParen, ")"sv, start, position());
+}
+
+// Emits everything up to the next whitespace character as a single Garbage token.
+void Lexer::consume_garbage()
+{
+    ScopeLogger<CMAKE_DEBUG> log;
+    auto start = position();
+    auto contents = consume_until(is_ascii_space);
+    if (!contents.is_empty())
+        emit_token(Token::Type::Garbage, contents, start, position());
+}
+
+// https://cmake.org/cmake/help/latest/manual/cmake-language.7.html#bracket-argument
+// Used by both bracket arguments and bracket comments.
+// Returns the text between the opening and closing brackets.
+StringView Lexer::read_bracket_argument()
+{
+    auto bracket_start = tell();
+    VERIFY(consume_specific('['));
+    auto leading_equals_signs = consume_while([](char c) { return c == '='; });
+    consume_specific('[');
+    auto start = tell();
+    auto end = start;
+    while (!is_eof()) {
+        // Read everything until we see `]={len}]`.
+        ignore_until(']');
+        end = tell();
+        ignore();
+        // The `]` only begins the closing bracket if the same number of `=` as in the opening bracket follow.
+        if (!leading_equals_signs.is_empty()) {
+            if (!next_is(leading_equals_signs))
+                continue;
+            ignore(leading_equals_signs.length());
+        }
+        if (consume_specific(']'))
+            break;
+    }
+
+    // The loop above steps over newlines without calling next_line(), so account for them here to keep
+    // position() correct for this token's end and for everything after it.
+    auto consumed = m_input.substring_view(bracket_start, tell() - bracket_start);
+    for (size_t i = 0; i < consumed.length(); ++i) {
+        if (consumed[i] == '\n') {
+            m_line++;
+            m_string_offset_after_previous_newline = bracket_start + i + 1;
+        }
+    }
+
+    return m_input.substring_view(start, end - start);
+}
+
+// https://cmake.org/cmake/help/latest/manual/cmake-language.7.html#variable-references
+Vector<VariableReference> Lexer::parse_variable_references_from_argument(StringView argument_value, Position argument_start)
+{
+    auto position = argument_start;
+    GenericLexer lexer { argument_value };
+    Vector<VariableReference> variable_references;
+
+    while (!lexer.is_eof()) {
+        if (lexer.next_is('\n')) {
+            lexer.ignore();
+            position.column = 0;
+            position.line++;
+            continue;
+        }
+
+        if (lexer.next_is('\\')) {
+            lexer.ignore();
+            if (lexer.next_is('\n')) {
+                lexer.ignore();
+                position.column = 0;
+                position.line++;
+                continue;
+            }
+            lexer.ignore();
+            position.column += 2;
+            // Re-enter the loop so that a newline right after the escape is counted as a line break.
+            continue;
+        }
+
+        if (lexer.next_is('$')) {
+            auto start = position;
+            lexer.ignore();
+            position.column++;
+
+            if (lexer.next_is("ENV{"sv)) {
+                lexer.ignore(4);
+                position.column += 4;
+            } else if (lexer.next_is('{')) {
+                lexer.ignore();
+                position.column++;
+            } else {
+                auto skipped = lexer.consume_until(is_any_of("$ \n"sv));
+                position.column += skipped.length();
+                continue;
+            }
+
+            auto variable_name = lexer.consume_until(is_any_of("} \n"sv));
+            position.column += variable_name.length();
+            if (lexer.next_is('}')) {
+                lexer.ignore();
+                position.column++;
+                variable_references.empend(variable_name, start, position);
+            }
+
+            continue;
+        }
+
+        lexer.ignore();
+        position.column++;
+    }
+
+    return variable_references;
+}
+
+Position Lexer::position() const
+{
+    return Position {
+        .line = m_line,
+        .column = tell() - m_string_offset_after_previous_newline,
+    };
+}
+
+void Lexer::next_line()
+{
+    VERIFY(consume_specific('\n'));
+    m_string_offset_after_previous_newline = tell();
+    m_line++;
+}
+
+void Lexer::emit_token(Token::Type type, StringView value, Position start, Position end, Optional<ControlKeywordType> control_keyword, Vector<VariableReference> variable_references)
+{
+    dbgln_if(CMAKE_DEBUG, "Emitting {} token: `{}` ({}:{} to {}:{})", to_string(type), value, start.line, start.column, end.line, end.column);
+    m_tokens.empend(type, value, start, end, move(control_keyword), move(variable_references));
+}
+
+}
diff --git a/Userland/Libraries/LibCMake/Lexer.h b/Userland/Libraries/LibCMake/Lexer.h
new file mode 100644
index 0000000000..833623439a
--- /dev/null
+++ b/Userland/Libraries/LibCMake/Lexer.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2023, Sam Atkins
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#pragma once
+
+#include <AK/GenericLexer.h>
+#include <AK/Vector.h>
+#include <LibCMake/Token.h>
+
+namespace CMake {
+
+class Lexer : private GenericLexer {
+public:
+    static ErrorOr<Vector<Token>> lex(StringView input);
+
+private:
+    Lexer(StringView input);
+
+    ErrorOr<Vector<Token>> lex_file();
+
+    void skip_whitespace();
+
+    void consume_whitespace_or_comments();
+    void consume_command_invocation();
+    void consume_arguments();
+    void consume_argument();
+    void consume_bracket_argument();
+    void consume_quoted_argument();
+    void consume_unquoted_argument();
+    void consume_comment();
+    void consume_open_paren();
+    void consume_close_paren();
+    void consume_garbage();
+
+    StringView read_bracket_argument();
+    static Vector<VariableReference> parse_variable_references_from_argument(StringView argument_value, Position argument_start);
+
+    Position position() const;
+    void next_line();
+
+    void emit_token(Token::Type, StringView value, Position start, Position end, Optional<ControlKeywordType> = {}, Vector<VariableReference> = {});
+
+    Vector<Token> m_tokens;
+    size_t m_line { 0 };
+    size_t m_string_offset_after_previous_newline { 0 };
+};
+
+}
diff --git a/Userland/Libraries/LibCMake/Token.cpp b/Userland/Libraries/LibCMake/Token.cpp
new file mode 100644
index 0000000000..6442ce5331
--- /dev/null
+++ b/Userland/Libraries/LibCMake/Token.cpp
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2023, Sam Atkins
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#include "Token.h"
+
+namespace CMake {
+
+Optional<ControlKeywordType> control_keyword_from_string(StringView value)
+{
+    if (value.equals_ignoring_case("if"sv))
+        return ControlKeywordType::If;
+    if (value.equals_ignoring_case("elseif"sv))
+        return ControlKeywordType::ElseIf;
+    if (value.equals_ignoring_case("else"sv))
+        return ControlKeywordType::Else;
+    if (value.equals_ignoring_case("endif"sv))
+        return ControlKeywordType::EndIf;
+    if (value.equals_ignoring_case("foreach"sv))
+        return ControlKeywordType::ForEach;
+    if (value.equals_ignoring_case("endforeach"sv))
+        return ControlKeywordType::EndForEach;
+    if (value.equals_ignoring_case("while"sv))
+        return ControlKeywordType::While;
+    if (value.equals_ignoring_case("endwhile"sv))
+        return ControlKeywordType::EndWhile;
+    if (value.equals_ignoring_case("break"sv))
+        return ControlKeywordType::Break;
+    if (value.equals_ignoring_case("continue"sv))
+        return ControlKeywordType::Continue;
+    if (value.equals_ignoring_case("macro"sv))
+        return ControlKeywordType::Macro;
+    if (value.equals_ignoring_case("endmacro"sv))
+        return ControlKeywordType::EndMacro;
+    if (value.equals_ignoring_case("function"sv))
+        return ControlKeywordType::Function;
+    if (value.equals_ignoring_case("endfunction"sv))
+        return ControlKeywordType::EndFunction;
+    return {};
+}
+
+}
diff --git a/Userland/Libraries/LibCMake/Token.h b/Userland/Libraries/LibCMake/Token.h
new file mode 100644
index 0000000000..de902604f2
--- /dev/null
+++ b/Userland/Libraries/LibCMake/Token.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2023, Sam Atkins
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#pragma once
+
+#include <AK/StringView.h>
+#include <AK/Vector.h>
+
+namespace CMake {
+
+struct Position {
+    size_t line { 0 };
+    size_t column { 0 };
+};
+
+struct VariableReference {
+    StringView value;
+    Position start;
+    Position end;
+};
+
+enum class ControlKeywordType {
+    If,
+    ElseIf,
+    Else,
+    EndIf,
+    ForEach,
+    EndForEach,
+    While,
+    EndWhile,
+    Break,
+    Continue,
+    Macro,
+    EndMacro,
+    Function,
+    EndFunction,
+};
+
+struct Token {
+    enum class Type {
+        BracketComment,
+        LineComment,
+        Identifier,
+        ControlKeyword,
+        OpenParen,
+        CloseParen,
+        BracketArgument,
+        QuotedArgument,
+        UnquotedArgument,
+        Garbage,
+
+        // These are elements inside argument tokens
+        VariableReference,
+    };
+
+    Type type;
+    StringView value;
+
+    Position start;
+    Position end;
+
+    // Type-specific
+    Optional<ControlKeywordType> control_keyword {};
+    Vector<VariableReference> variable_references {};
+};
+
+static constexpr StringView to_string(Token::Type type)
+{
+    switch (type) {
+    case Token::Type::BracketComment:
+        return "BracketComment"sv;
+    case Token::Type::LineComment:
+        return "LineComment"sv;
+    case Token::Type::Identifier:
+        return "Identifier"sv;
+    case Token::Type::ControlKeyword:
+        return "ControlKeyword"sv;
+    case Token::Type::OpenParen:
+        return "OpenParen"sv;
+    case Token::Type::CloseParen:
+        return "CloseParen"sv;
+    case Token::Type::BracketArgument:
+        return "BracketArgument"sv;
+    case Token::Type::QuotedArgument:
+        return "QuotedArgument"sv;
+    case Token::Type::UnquotedArgument:
+        return "UnquotedArgument"sv;
+    case Token::Type::Garbage:
+        return "Garbage"sv;
+    case Token::Type::VariableReference:
+        return "VariableReference"sv;
+    }
+
+    VERIFY_NOT_REACHED();
+}
+
+Optional<ControlKeywordType> control_keyword_from_string(StringView value);
+
+}