
Shell: Add support for heredocs to the POSIX parser

Ali Mohammad Pur 2023-02-16 09:52:13 +03:30 committed by Ali Mohammad Pur
parent a5e4bc4faf
commit 2881bb4c3a
4 changed files with 333 additions and 20 deletions
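
For context, a heredoc feeds a block of literal text to a command's input: <<KEY reads the following lines verbatim until a line containing only KEY, <<-KEY additionally strips leading tab characters, and single-quoting the key disables interpolation of the body. A minimal driver sketch (not part of this commit; it assumes the usual Serenity include paths and uses a plain main() for brevity):

#include <AK/RefPtr.h>
#include <Shell/PosixParser.h>

int main()
{
    // <<-'EOF': tab-stripped body, no interpolation (single-quoted key).
    auto source = "cat <<-'EOF'\n\thello $USER\nEOF\n"sv;
    Shell::Posix::Parser parser { source };
    // On success this yields `cat` carrying an AST::Heredoc redirection.
    RefPtr<Shell::AST::Node> ast = parser.parse();
    return (ast && parser.errors().is_empty()) ? 0 : 1;
}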

Userland/Shell/PosixLexer.cpp

@@ -23,8 +23,11 @@ static bool is_part_of_operator(StringView text, char ch)
namespace Shell::Posix {
Vector<Token> Lexer::batch_next()
Vector<Token> Lexer::batch_next(Optional<Reduction> starting_reduction)
{
if (starting_reduction.has_value())
m_next_reduction = *starting_reduction;
for (; m_next_reduction != Reduction::None;) {
auto result = reduce(m_next_reduction);
m_next_reduction = result.next_reduction;
@@ -55,6 +58,18 @@ char Lexer::consume()
return ch;
}
void Lexer::reconsume(StringView string)
{
for (auto byte : string.bytes()) {
if (byte == '\n') {
m_state.position.end_line.line_number++;
m_state.position.end_line.line_column = 0;
}
m_state.position.end_offset++;
}
}
bool Lexer::consume_specific(char ch)
{
if (m_lexer.peek() == ch) {
@@ -95,6 +110,8 @@ Lexer::ReductionResult Lexer::reduce(Reduction reduction)
return reduce_command_or_arithmetic_substitution_expansion();
case Reduction::ExtendedParameterExpansion:
return reduce_extended_parameter_expansion();
case Reduction::HeredocContents:
return reduce_heredoc_contents();
}
VERIFY_NOT_REACHED();
@@ -108,6 +125,91 @@ Lexer::ReductionResult Lexer::reduce_end()
};
}
Lexer::HeredocKeyResult Lexer::process_heredoc_key(Token const& token)
{
StringBuilder builder;
enum ParseState {
Free,
InDoubleQuotes,
InSingleQuotes,
};
Vector<ParseState, 4> parse_state;
parse_state.append(Free);
bool escaped = false;
bool had_a_single_quote_segment = false;
for (auto byte : token.value.bytes()) {
switch (parse_state.last()) {
case Free:
switch (byte) {
case '"':
if (escaped) {
builder.append(byte);
escaped = false;
} else {
parse_state.append(InDoubleQuotes);
}
break;
case '\'':
if (escaped) {
builder.append(byte);
escaped = false;
} else {
had_a_single_quote_segment = true;
parse_state.append(InSingleQuotes);
}
break;
case '\\':
if (escaped) {
builder.append(byte);
escaped = false;
} else {
escaped = true;
}
break;
default:
if (escaped) {
builder.append('\\');
escaped = false;
}
builder.append(byte);
break;
}
break;
case InDoubleQuotes:
if (!escaped && byte == '"') {
parse_state.take_last();
break;
}
if (escaped) {
if (byte != '"')
builder.append('\\');
builder.append(byte);
break;
}
if (byte == '\\')
escaped = true;
else
builder.append(byte);
break;
case InSingleQuotes:
if (byte == '\'') {
parse_state.take_last();
break;
}
builder.append(byte);
break;
}
}
// NOTE: Not checking the final state as any garbage that even partially parses is allowed to be used as a key :/
return {
.key = builder.to_deprecated_string(),
.allow_interpolation = !had_a_single_quote_segment,
};
}
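
process_heredoc_key() strips one level of quoting from the delimiter and records whether it saw a single-quoted segment; note that, as written, only a single-quoted segment disables interpolation, so a purely double-quoted key still interpolates. A few assumed input/output pairs (illustrative, not from the commit's tests):

// Assumed behaviour for various redirection keys:
//   <<EOF    -> key = "EOF",  allow_interpolation = true
//   <<'EOF'  -> key = "EOF",  allow_interpolation = false
//   <<"EOF"  -> key = "EOF",  allow_interpolation = true (no single-quote segment)
//   <<EO\F   -> key = "EO\F", allow_interpolation = true (backslash is kept before
//                ordinary characters, dropped before a quote it escapes)
auto [key, allow_interpolation] = Lexer::process_heredoc_key(token); // `token` holds the delimiter word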
Lexer::ReductionResult Lexer::reduce_operator()
{
if (m_lexer.is_eof()) {
@@ -142,8 +244,25 @@ Lexer::ReductionResult Lexer::reduce_operator()
m_state.position.start_line = m_state.position.end_line;
}
auto expect_heredoc_entry = !tokens.is_empty() && (tokens.last().type == Token::Type::DoubleLessDash || tokens.last().type == Token::Type::DoubleLess);
auto result = reduce(Reduction::Start);
tokens.extend(move(result.tokens));
while (expect_heredoc_entry && tokens.size() == 1) {
result = reduce(result.next_reduction);
tokens.extend(move(result.tokens));
}
if (expect_heredoc_entry && tokens.size() > 1) {
auto [key, interpolation] = process_heredoc_key(tokens[1]);
m_state.heredoc_entries.enqueue(HeredocEntry {
.key = key,
.allow_interpolation = interpolation,
.dedent = tokens[0].type == Token::Type::DoubleLessDash,
});
}
return {
.tokens = move(tokens),
.next_reduction = result.next_reduction,
@@ -160,6 +279,7 @@ Lexer::ReductionResult Lexer::reduce_comment()
}
if (consume() == '\n') {
m_state.on_new_line = true;
return {
.tokens = { Token::newline() },
.next_reduction = Reduction::Start,
@@ -352,7 +472,7 @@ Lexer::ReductionResult Lexer::reduce_command_expansion()
};
}
Lexer::ReductionResult Lexer::reduce_start()
Lexer::ReductionResult Lexer::reduce_heredoc_contents()
{
if (m_lexer.is_eof()) {
auto tokens = Token::maybe_from_state(m_state);
@@ -366,6 +486,107 @@ Lexer::ReductionResult Lexer::reduce_start()
};
}
if (!m_state.escaping && consume_specific('\\')) {
m_state.escaping = true;
m_state.buffer.append('\\');
return {
.tokens = {},
.next_reduction = Reduction::HeredocContents,
};
}
if (!m_state.escaping && consume_specific('$')) {
m_state.buffer.append('$');
if (m_lexer.next_is("("))
m_state.expansions.empend(CommandExpansion { .command = StringBuilder {}, .range = range() });
else
m_state.expansions.empend(ParameterExpansion { .parameter = StringBuilder {}, .range = range() });
return {
.tokens = {},
.next_reduction = Reduction::Expansion,
};
}
if (!m_state.escaping && consume_specific('`')) {
m_state.buffer.append('`');
m_state.expansions.empend(CommandExpansion { .command = StringBuilder {}, .range = range() });
return {
.tokens = {},
.next_reduction = Reduction::CommandExpansion,
};
}
m_state.escaping = false;
m_state.buffer.append(consume());
return {
.tokens = {},
.next_reduction = Reduction::HeredocContents,
};
}
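
reduce_heredoc_contents() lexes a heredoc body much like ordinary word contents, handling backslash escapes and entering the $-expansion and backtick reductions. The body itself is carved out in reduce_start() below: on a fresh line with queued heredoc entries, the lexer scans ahead for a line consisting of the key (leading tabs stripped first for <<-) and turns everything before it into a single HeredocContents token. A standalone model of that scan, simplified into plain C++ (std::string_view instead of AK; not the commit's code):

#include <optional>
#include <string_view>

// Returns the offset of the newline that ends the heredoc body starting at
// `from`, or an empty optional if the terminator line is never found.
std::optional<size_t> find_heredoc_end(std::string_view input, size_t from,
    std::string_view key, bool dedent)
{
    for (size_t i = from; i < input.size(); ++i) {
        if (input[i] != '\n')
            continue;
        size_t line = i + 1; // a terminator candidate starts after this newline
        if (dedent) // <<-KEY ignores leading tabs on the terminator line
            while (line < input.size() && input[line] == '\t')
                ++line;
        if (input.compare(line, key.size(), key) == 0) {
            size_t after = line + key.size();
            if (after == input.size() || input[after] == '\n')
                return i; // the body ends just before this newline
        }
    }
    return {};
}

For example, find_heredoc_end("hi\nEOF\n", 0, "EOF", false) returns 2, the offset of the newline that closes the body. Like the lexer's loop, this model never matches a terminator on the very first body line, since it requires a preceding newline.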
Lexer::ReductionResult Lexer::reduce_start()
{
auto was_on_new_line = m_state.on_new_line;
m_state.on_new_line = false;
if (m_lexer.is_eof()) {
auto tokens = Token::maybe_from_state(m_state);
m_state.buffer.clear();
m_state.position.start_offset = m_state.position.end_offset;
m_state.position.start_line = m_state.position.end_line;
return {
.tokens = move(tokens),
.next_reduction = Reduction::End,
};
}
if (was_on_new_line && !m_state.heredoc_entries.is_empty()) {
auto const& entry = m_state.heredoc_entries.head();
auto start_index = m_lexer.tell();
Optional<size_t> end_index;
for (; !m_lexer.is_eof();) {
auto index = m_lexer.tell();
auto possible_end_index = m_lexer.tell();
if (m_lexer.consume_specific('\n')) {
if (entry.dedent)
m_lexer.ignore_while(is_any_of("\t"sv));
if (m_lexer.consume_specific(entry.key.view())) {
if (m_lexer.consume_specific('\n') || m_lexer.is_eof()) {
end_index = possible_end_index;
break;
}
}
}
if (m_lexer.tell() == index)
m_lexer.ignore();
}
auto contents = m_lexer.input().substring_view(start_index, end_index.value_or(m_lexer.tell()) - start_index);
reconsume(contents);
m_state.buffer.clear();
m_state.buffer.append(contents);
auto token = Token::maybe_from_state(m_state).first();
token.relevant_heredoc_key = entry.key;
token.type = Token::Type::HeredocContents;
m_state.heredoc_entries.dequeue();
m_state.on_new_line = true;
m_state.buffer.clear();
return {
.tokens = { move(token) },
.next_reduction = Reduction::Start,
};
}
if (m_state.escaping && consume_specific('\n')) {
m_state.escaping = false;
@@ -391,6 +612,8 @@ Lexer::ReductionResult Lexer::reduce_start()
auto tokens = Token::maybe_from_state(m_state);
tokens.append(Token::newline());
m_state.on_new_line = true;
m_state.buffer.clear();
m_state.position.start_offset = m_state.position.end_offset;
m_state.position.start_line = m_state.position.end_line;
@@ -678,6 +901,8 @@ StringView Token::type_name() const
return "Clobber"sv;
case Type::Semicolon:
return "Semicolon"sv;
case Type::HeredocContents:
return "HeredocContents"sv;
case Type::AssignmentWord:
return "AssignmentWord"sv;
case Type::Bang:

Userland/Shell/PosixLexer.h

@@ -8,6 +8,7 @@
#include <AK/DeprecatedString.h>
#include <AK/GenericLexer.h>
#include <AK/Queue.h>
#include <AK/Variant.h>
#include <AK/Vector.h>
#include <Shell/AST.h>
@@ -29,6 +30,9 @@ enum class Reduction {
ParameterExpansion,
CommandOrArithmeticSubstitutionExpansion,
ExtendedParameterExpansion,
// Separate rule, not used by the main flow.
HeredocContents,
};
struct ExpansionRange {
@@ -177,6 +181,12 @@ struct ResolvedCommandExpansion {
using ResolvedExpansion = Variant<ResolvedParameterExpansion, ResolvedCommandExpansion>;
struct HeredocEntry {
DeprecatedString key;
bool allow_interpolation;
bool dedent;
};
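
For example, a <<-'END' redirection queues an entry like this (field values following process_heredoc_key() in the lexer above):

HeredocEntry { .key = "END", .allow_interpolation = false, .dedent = true }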
struct State {
StringBuilder buffer {};
Reduction previous_reduction { Reduction::Start };
@@ -194,6 +204,8 @@ struct State {
},
};
Vector<Expansion> expansions {};
Queue<HeredocEntry> heredoc_entries {};
bool on_new_line { true };
};
struct Token {
@@ -219,6 +231,7 @@ struct Token {
DoubleLessDash,
Clobber,
Semicolon,
HeredocContents,
// Not produced by this lexer, but generated in later stages.
AssignmentWord,
@@ -249,6 +262,7 @@ struct Token {
Vector<Expansion> expansions;
Vector<ResolvedExpansion> resolved_expansions {};
StringView original_text;
Optional<DeprecatedString> relevant_heredoc_key {};
bool could_be_start_of_a_simple_command { false };
static Vector<Token> maybe_from_state(State const& state)
@@ -378,7 +392,14 @@ public:
{
}
Vector<Token> batch_next();
Vector<Token> batch_next(Optional<Reduction> starting_reduction = {});
struct HeredocKeyResult {
DeprecatedString key;
bool allow_interpolation;
};
static HeredocKeyResult process_heredoc_key(Token const&);
private:
struct ReductionResult {
@@ -400,9 +421,11 @@ private:
ReductionResult reduce_parameter_expansion();
ReductionResult reduce_command_or_arithmetic_substitution_expansion();
ReductionResult reduce_extended_parameter_expansion();
ReductionResult reduce_heredoc_contents();
char consume();
bool consume_specific(char);
void reconsume(StringView);
ExpansionRange range(ssize_t offset = 0) const;
GenericLexer m_lexer;

Userland/Shell/PosixParser.cpp

@@ -9,6 +9,11 @@
#include <AK/StringUtils.h>
#include <Shell/PosixParser.h>
static Shell::AST::Position empty_position()
{
return { 0, 0, { 0, 0 }, { 0, 0 } };
}
template<typename T, typename... Ts>
static inline bool is_one_of(T const& value, Ts const&... values)
{
@@ -22,7 +27,8 @@ static inline bool is_io_operator(Shell::Posix::Token const& token)
Token::Type::Less, Token::Type::Great,
Token::Type::LessAnd, Token::Type::GreatAnd,
Token::Type::DoubleLess, Token::Type::DoubleGreat,
Token::Type::LessGreat, Token::Type::Clobber);
Token::Type::DoubleLessDash, Token::Type::LessGreat,
Token::Type::Clobber);
}
static inline bool is_separator(Shell::Posix::Token const& token)
@@ -95,10 +101,10 @@ static inline bool is_valid_name(StringView word)
}
namespace Shell::Posix {
void Parser::fill_token_buffer()
void Parser::fill_token_buffer(Optional<Reduction> starting_reduction)
{
for (;;) {
auto token = next_expanded_token();
auto token = next_expanded_token(starting_reduction);
if (!token.has_value())
break;
#if SHELL_POSIX_PARSER_DEBUG
@@ -126,10 +132,36 @@ RefPtr<AST::Node> Parser::parse()
return parse_complete_command();
}
Optional<Token> Parser::next_expanded_token()
void Parser::handle_heredoc_contents()
{
while (!eof() && m_token_buffer[m_token_index].type == Token::Type::HeredocContents) {
auto& token = m_token_buffer[m_token_index++];
auto entry = m_unprocessed_heredoc_entries.get(token.relevant_heredoc_key.value());
if (!entry.has_value()) {
error(token, "Discarding unexpected heredoc contents for key '{}'", *token.relevant_heredoc_key);
continue;
}
auto& heredoc = **entry;
RefPtr<AST::Node> contents;
if (heredoc.allow_interpolation()) {
Parser parser { token.value, m_in_interactive_mode, Reduction::HeredocContents };
contents = parser.parse_word();
} else {
contents = make_ref_counted<AST::StringLiteral>(token.position.value_or(empty_position()), token.value, AST::StringLiteral::EnclosureType::None);
}
if (contents)
heredoc.set_contents(contents);
m_unprocessed_heredoc_entries.remove(*token.relevant_heredoc_key);
}
}
Optional<Token> Parser::next_expanded_token(Optional<Reduction> starting_reduction)
{
while (m_token_buffer.find_if([](auto& token) { return token.type == Token::Type::Eof; }).is_end()) {
auto tokens = m_lexer.batch_next();
auto tokens = m_lexer.batch_next(starting_reduction);
auto expanded = perform_expansions(move(tokens));
m_token_buffer.extend(expanded);
}
@@ -589,11 +621,6 @@ Vector<Token> Parser::perform_expansions(Vector<Token> tokens)
return tokens;
}
static AST::Position empty_position()
{
return { 0, 0, { 0, 0 }, { 0, 0 } };
}
RefPtr<AST::Node> Parser::parse_complete_command()
{
auto list = [&] {
@@ -1835,13 +1862,47 @@ RefPtr<AST::Node> Parser::parse_io_redirect()
if (auto io_file = parse_io_file(start_position, io_number))
return io_file;
// if (auto io_here = parse_io_here(start_position, io_number))
// return io_here;
if (auto io_here = parse_io_here(start_position, io_number))
return io_here;
m_token_index = start_index;
return nullptr;
}
RefPtr<AST::Node> Parser::parse_io_here(AST::Position start_position, Optional<int> fd)
{
// io_here: IO_NUMBER? (DLESS | DLESSDASH) WORD
auto io_operator = peek().type;
if (!is_one_of(io_operator, Token::Type::DoubleLess, Token::Type::DoubleLessDash))
return nullptr;
auto io_operator_token = consume();
auto redirection_fd = fd.value_or(0);
auto end_keyword = consume();
if (!is_one_of(end_keyword.type, Token::Type::Word, Token::Type::Token))
return make_ref_counted<AST::SyntaxError>(io_operator_token.position.value_or(start_position), "Expected a heredoc keyword", true);
auto [end_keyword_text, allow_interpolation] = Lexer::process_heredoc_key(end_keyword);
RefPtr<AST::SyntaxError> error;
auto position = start_position.with_end(peek().position.value_or(empty_position()));
auto result = make_ref_counted<AST::Heredoc>(
position,
end_keyword_text,
allow_interpolation,
io_operator == Token::Type::DoubleLessDash,
Optional<int> { redirection_fd });
m_unprocessed_heredoc_entries.set(end_keyword_text, result);
if (error)
result->set_is_syntax_error(*error);
return result;
}
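
Heredocs are thus resolved in two phases: parse_io_here() builds an AST::Heredoc with empty contents and registers it in m_unprocessed_heredoc_entries under its processed key, and handle_heredoc_contents() later attaches the body once the lexer's HeredocContents token is consumed. An assumed trace for one input (positions elided):

// Input: "cat <<EOF\nhi $name\nEOF\n"
// 1. parse_io_here() consumes `<<` and `EOF`, creates an AST::Heredoc
//    (key "EOF", interpolation enabled, no dedent, fd 0, contents still null)
//    and records it: m_unprocessed_heredoc_entries.set("EOF", result);
// 2. on the next line the lexer emits one HeredocContents token whose value
//    is the body text, with relevant_heredoc_key = "EOF";
// 3. peek()/consume() call handle_heredoc_contents(), which re-parses the body
//    (Parser { body, interactive, Reduction::HeredocContents } -> parse_word())
//    and calls heredoc.set_contents(...), completing the node.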
RefPtr<AST::Node> Parser::parse_io_file(AST::Position start_position, Optional<int> fd)
{
auto start_index = m_token_index;

Userland/Shell/PosixParser.h

@@ -13,12 +13,12 @@ namespace Shell::Posix {
class Parser {
public:
Parser(StringView input, bool interactive = false)
Parser(StringView input, bool interactive = false, Optional<Reduction> starting_reduction = {})
: m_lexer(input)
, m_in_interactive_mode(interactive)
, m_eof_token(Token::eof())
{
fill_token_buffer();
fill_token_buffer(starting_reduction);
}
RefPtr<AST::Node> parse();
@@ -31,20 +31,23 @@ public:
auto& errors() const { return m_errors; }
private:
Optional<Token> next_expanded_token();
Optional<Token> next_expanded_token(Optional<Reduction> starting_reduction = {});
Vector<Token> perform_expansions(Vector<Token> tokens);
void fill_token_buffer();
void fill_token_buffer(Optional<Reduction> starting_reduction = {});
void handle_heredoc_contents();
Token const& peek() const
Token const& peek()
{
if (eof())
return m_eof_token;
handle_heredoc_contents();
return m_token_buffer[m_token_index];
}
Token const& consume()
{
if (eof())
return m_eof_token;
handle_heredoc_contents();
return m_token_buffer[m_token_index++];
}
void skip()
@@ -108,6 +111,7 @@ private:
Vector<Token> m_previous_token_buffer;
Vector<Error> m_errors;
HashMap<DeprecatedString, NonnullRefPtr<AST::Heredoc>> m_unprocessed_heredoc_entries;
Token m_eof_token;