Shell: Add support for heredocs to the POSIX parser
commit 2881bb4c3a (parent a5e4bc4faf)

4 changed files with 333 additions and 20 deletions
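For context: a heredoc feeds a block of inline text to a command's standard input, introduced by the << operator (or <<- for tab-stripped bodies) followed by a delimiter word. A minimal illustration of the syntax this commit teaches the POSIX lexer and parser to accept (example shell input, not part of the diff):

    cat <<EOF
    hello from a heredoc
    EOF

The changes below queue the delimiter in the lexer when the operator is seen, collect the body after the next newline, and attach the parsed contents to an AST::Heredoc node in the parser.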
@@ -23,8 +23,11 @@ static bool is_part_of_operator(StringView text, char ch)
 
 namespace Shell::Posix {
 
-Vector<Token> Lexer::batch_next()
+Vector<Token> Lexer::batch_next(Optional<Reduction> starting_reduction)
 {
+    if (starting_reduction.has_value())
+        m_next_reduction = *starting_reduction;
+
     for (; m_next_reduction != Reduction::None;) {
         auto result = reduce(m_next_reduction);
         m_next_reduction = result.next_reduction;
@@ -55,6 +58,18 @@ char Lexer::consume()
     return ch;
 }
 
+void Lexer::reconsume(StringView string)
+{
+    for (auto byte : string.bytes()) {
+        if (byte == '\n') {
+            m_state.position.end_line.line_number++;
+            m_state.position.end_line.line_column = 0;
+        }
+
+        m_state.position.end_offset++;
+    }
+}
+
 bool Lexer::consume_specific(char ch)
 {
     if (m_lexer.peek() == ch) {
@@ -95,6 +110,8 @@ Lexer::ReductionResult Lexer::reduce(Reduction reduction)
         return reduce_command_or_arithmetic_substitution_expansion();
     case Reduction::ExtendedParameterExpansion:
         return reduce_extended_parameter_expansion();
+    case Reduction::HeredocContents:
+        return reduce_heredoc_contents();
     }
 
     VERIFY_NOT_REACHED();
@@ -108,6 +125,91 @@ Lexer::ReductionResult Lexer::reduce_end()
     };
 }
 
+Lexer::HeredocKeyResult Lexer::process_heredoc_key(Token const& token)
+{
+    StringBuilder builder;
+    enum ParseState {
+        Free,
+        InDoubleQuotes,
+        InSingleQuotes,
+    };
+    Vector<ParseState, 4> parse_state;
+    parse_state.append(Free);
+    bool escaped = false;
+    bool had_a_single_quote_segment = false;
+
+    for (auto byte : token.value.bytes()) {
+        switch (parse_state.last()) {
+        case Free:
+            switch (byte) {
+            case '"':
+                if (escaped) {
+                    builder.append(byte);
+                    escaped = false;
+                } else {
+                    parse_state.append(InDoubleQuotes);
+                }
+                break;
+            case '\'':
+                if (escaped) {
+                    builder.append(byte);
+                    escaped = false;
+                } else {
+                    had_a_single_quote_segment = true;
+                    parse_state.append(InSingleQuotes);
+                }
+                break;
+            case '\\':
+                if (escaped) {
+                    builder.append(byte);
+                    escaped = false;
+                } else {
+                    escaped = true;
+                }
+                break;
+            default:
+                if (escaped) {
+                    builder.append('\\');
+                    escaped = false;
+                }
+                builder.append(byte);
+                break;
+            }
+            break;
+        case InDoubleQuotes:
+            if (!escaped && byte == '"') {
+                parse_state.take_last();
+                break;
+            }
+            if (escaped) {
+                if (byte != '"')
+                    builder.append('\\');
+                builder.append(byte);
+                break;
+            }
+            if (byte == '\\')
+                escaped = true;
+            else
+                builder.append(byte);
+            break;
+        case InSingleQuotes:
+            if (byte == '\'') {
+                parse_state.take_last();
+                break;
+            }
+            builder.append(byte);
+            break;
+        }
+    }
+
+    // NOTE: Not checking the final state as any garbage that even partially parses is allowed to be used as a key :/
+
+    return {
+        .key = builder.to_deprecated_string(),
+        .allow_interpolation = !had_a_single_quote_segment,
+    };
+}
+
 Lexer::ReductionResult Lexer::reduce_operator()
 {
     if (m_lexer.is_eof()) {
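process_heredoc_key() above performs quote removal on the delimiter word and records whether any single-quoted segment was present. Illustrative shell input (not from the diff) showing the resulting behavior:

    # Unquoted delimiter: the body undergoes expansion.
    cat <<END
    $HOME
    END

    # A single-quoted segment in the delimiter makes the body literal.
    cat <<'END'
    $HOME
    END

Per the NOTE in the code, unbalanced quoting is tolerated: any word that even partially parses is used as a key. As written, only a single-quoted segment clears allow_interpolation; POSIX specifies that any quoting of the delimiter suppresses expansion, so a purely double-quoted key still interpolates here.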
@@ -142,8 +244,25 @@ Lexer::ReductionResult Lexer::reduce_operator()
         m_state.position.start_line = m_state.position.end_line;
     }
 
+    auto expect_heredoc_entry = !tokens.is_empty() && (tokens.last().type == Token::Type::DoubleLessDash || tokens.last().type == Token::Type::DoubleLess);
+
     auto result = reduce(Reduction::Start);
     tokens.extend(move(result.tokens));
+
+    while (expect_heredoc_entry && tokens.size() == 1) {
+        result = reduce(result.next_reduction);
+        tokens.extend(move(result.tokens));
+    }
+
+    if (expect_heredoc_entry && tokens.size() > 1) {
+        auto [key, interpolation] = process_heredoc_key(tokens[1]);
+        m_state.heredoc_entries.enqueue(HeredocEntry {
+            .key = key,
+            .allow_interpolation = interpolation,
+            .dedent = tokens[0].type == Token::Type::DoubleLessDash,
+        });
+    }
+
     return {
         .tokens = move(tokens),
         .next_reduction = result.next_reduction,
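Since heredoc_entries is a FIFO queue, several heredocs may be pending on one line, and their bodies are read in the order the operators appeared once the newline is reached. Illustrative input (not part of the diff):

    cat <<ONE; cat <<TWO
    first body
    ONE
    second body
    TWO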
@@ -160,6 +279,7 @@ Lexer::ReductionResult Lexer::reduce_comment()
     }
 
     if (consume() == '\n') {
+        m_state.on_new_line = true;
         return {
             .tokens = { Token::newline() },
             .next_reduction = Reduction::Start,
@@ -352,7 +472,7 @@ Lexer::ReductionResult Lexer::reduce_command_expansion()
     };
 }
 
-Lexer::ReductionResult Lexer::reduce_start()
+Lexer::ReductionResult Lexer::reduce_heredoc_contents()
 {
     if (m_lexer.is_eof()) {
         auto tokens = Token::maybe_from_state(m_state);
@@ -366,6 +486,107 @@ Lexer::ReductionResult Lexer::reduce_start()
         };
     }
 
+    if (!m_state.escaping && consume_specific('\\')) {
+        m_state.escaping = true;
+        m_state.buffer.append('\\');
+        return {
+            .tokens = {},
+            .next_reduction = Reduction::HeredocContents,
+        };
+    }
+
+    if (!m_state.escaping && consume_specific('$')) {
+        m_state.buffer.append('$');
+        if (m_lexer.next_is("("))
+            m_state.expansions.empend(CommandExpansion { .command = StringBuilder {}, .range = range() });
+        else
+            m_state.expansions.empend(ParameterExpansion { .parameter = StringBuilder {}, .range = range() });
+
+        return {
+            .tokens = {},
+            .next_reduction = Reduction::Expansion,
+        };
+    }
+
+    if (!m_state.escaping && consume_specific('`')) {
+        m_state.buffer.append('`');
+        m_state.expansions.empend(CommandExpansion { .command = StringBuilder {}, .range = range() });
+        return {
+            .tokens = {},
+            .next_reduction = Reduction::CommandExpansion,
+        };
+    }
+
+    m_state.escaping = false;
+    m_state.buffer.append(consume());
+    return {
+        .tokens = {},
+        .next_reduction = Reduction::HeredocContents,
+    };
+}
+
+Lexer::ReductionResult Lexer::reduce_start()
+{
+    auto was_on_new_line = m_state.on_new_line;
+    m_state.on_new_line = false;
+
+    if (m_lexer.is_eof()) {
+        auto tokens = Token::maybe_from_state(m_state);
+        m_state.buffer.clear();
+        m_state.position.start_offset = m_state.position.end_offset;
+        m_state.position.start_line = m_state.position.end_line;
+
+        return {
+            .tokens = move(tokens),
+            .next_reduction = Reduction::End,
+        };
+    }
+
+    if (was_on_new_line && !m_state.heredoc_entries.is_empty()) {
+        auto const& entry = m_state.heredoc_entries.head();
+
+        auto start_index = m_lexer.tell();
+        Optional<size_t> end_index;
+
+        for (; !m_lexer.is_eof();) {
+            auto index = m_lexer.tell();
+            auto possible_end_index = m_lexer.tell();
+            if (m_lexer.consume_specific('\n')) {
+                if (entry.dedent)
+                    m_lexer.ignore_while(is_any_of("\t"sv));
+                if (m_lexer.consume_specific(entry.key.view())) {
+                    if (m_lexer.consume_specific('\n') || m_lexer.is_eof()) {
+                        end_index = possible_end_index;
+                        break;
+                    }
+                }
+            }
+            if (m_lexer.tell() == index)
+                m_lexer.ignore();
+        }
+
+        auto contents = m_lexer.input().substring_view(start_index, end_index.value_or(m_lexer.tell()) - start_index);
+        reconsume(contents);
+
+        m_state.buffer.clear();
+        m_state.buffer.append(contents);
+
+        auto token = Token::maybe_from_state(m_state).first();
+        token.relevant_heredoc_key = entry.key;
+        token.type = Token::Type::HeredocContents;
+
+        m_state.heredoc_entries.dequeue();
+
+        m_state.on_new_line = true;
+
+        m_state.buffer.clear();
+
+        return {
+            .tokens = { move(token) },
+            .next_reduction = Reduction::Start,
+        };
+    }
+
     if (m_state.escaping && consume_specific('\n')) {
         m_state.escaping = false;
 
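The new reduce_start() path above scans line by line for the terminating key whenever a pending heredoc entry exists at the start of a line; for a DoubleLessDash entry (dedent) it skips leading tabs before attempting the match, so the terminator may be tab-indented. A sketch of the accepted input (not from the diff; the indentation must be tabs):

    cat <<-EOF
    	indented body
    	EOF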
@@ -391,6 +612,8 @@ Lexer::ReductionResult Lexer::reduce_start()
         auto tokens = Token::maybe_from_state(m_state);
         tokens.append(Token::newline());
 
+        m_state.on_new_line = true;
+
         m_state.buffer.clear();
         m_state.position.start_offset = m_state.position.end_offset;
         m_state.position.start_line = m_state.position.end_line;
@@ -678,6 +901,8 @@ StringView Token::type_name() const
         return "Clobber"sv;
     case Type::Semicolon:
         return "Semicolon"sv;
+    case Type::HeredocContents:
+        return "HeredocContents"sv;
     case Type::AssignmentWord:
         return "AssignmentWord"sv;
     case Type::Bang:

@@ -8,6 +8,7 @@
 
 #include <AK/DeprecatedString.h>
 #include <AK/GenericLexer.h>
+#include <AK/Queue.h>
 #include <AK/Variant.h>
 #include <AK/Vector.h>
 #include <Shell/AST.h>
@@ -29,6 +30,9 @@ enum class Reduction {
     ParameterExpansion,
     CommandOrArithmeticSubstitutionExpansion,
     ExtendedParameterExpansion,
+
+    // Separate rule, not used by the main flow.
+    HeredocContents,
 };
 
 struct ExpansionRange {
@@ -177,6 +181,12 @@ struct ResolvedCommandExpansion {
 
 using ResolvedExpansion = Variant<ResolvedParameterExpansion, ResolvedCommandExpansion>;
 
+struct HeredocEntry {
+    DeprecatedString key;
+    bool allow_interpolation;
+    bool dedent;
+};
+
 struct State {
     StringBuilder buffer {};
     Reduction previous_reduction { Reduction::Start };
@@ -194,6 +204,8 @@ struct State {
         },
     };
     Vector<Expansion> expansions {};
+    Queue<HeredocEntry> heredoc_entries {};
+    bool on_new_line { true };
 };
 
 struct Token {
@@ -219,6 +231,7 @@ struct Token {
         DoubleLessDash,
         Clobber,
         Semicolon,
+        HeredocContents,
 
         // Not produced by this lexer, but generated in later stages.
         AssignmentWord,
@@ -249,6 +262,7 @@ struct Token {
     Vector<Expansion> expansions;
     Vector<ResolvedExpansion> resolved_expansions {};
     StringView original_text;
+    Optional<DeprecatedString> relevant_heredoc_key {};
     bool could_be_start_of_a_simple_command { false };
 
     static Vector<Token> maybe_from_state(State const& state)
@@ -378,7 +392,14 @@ public:
     {
     }
 
-    Vector<Token> batch_next();
+    Vector<Token> batch_next(Optional<Reduction> starting_reduction = {});
+
+    struct HeredocKeyResult {
+        DeprecatedString key;
+        bool allow_interpolation;
+    };
+
+    static HeredocKeyResult process_heredoc_key(Token const&);
 
 private:
     struct ReductionResult {
@@ -400,9 +421,11 @@ private:
     ReductionResult reduce_parameter_expansion();
     ReductionResult reduce_command_or_arithmetic_substitution_expansion();
     ReductionResult reduce_extended_parameter_expansion();
+    ReductionResult reduce_heredoc_contents();
 
     char consume();
     bool consume_specific(char);
+    void reconsume(StringView);
     ExpansionRange range(ssize_t offset = 0) const;
 
     GenericLexer m_lexer;

@@ -9,6 +9,11 @@
 #include <AK/StringUtils.h>
 #include <Shell/PosixParser.h>
 
+static Shell::AST::Position empty_position()
+{
+    return { 0, 0, { 0, 0 }, { 0, 0 } };
+}
+
 template<typename T, typename... Ts>
 static inline bool is_one_of(T const& value, Ts const&... values)
 {
@@ -22,7 +27,8 @@ static inline bool is_io_operator(Shell::Posix::Token const& token)
         Token::Type::Less, Token::Type::Great,
         Token::Type::LessAnd, Token::Type::GreatAnd,
         Token::Type::DoubleLess, Token::Type::DoubleGreat,
-        Token::Type::LessGreat, Token::Type::Clobber);
+        Token::Type::DoubleLessDash, Token::Type::LessGreat,
+        Token::Type::Clobber);
 }
 
 static inline bool is_separator(Shell::Posix::Token const& token)
@@ -95,10 +101,10 @@ static inline bool is_valid_name(StringView word)
 }
 
 namespace Shell::Posix {
-void Parser::fill_token_buffer()
+void Parser::fill_token_buffer(Optional<Reduction> starting_reduction)
 {
     for (;;) {
-        auto token = next_expanded_token();
+        auto token = next_expanded_token(starting_reduction);
         if (!token.has_value())
             break;
 #if SHELL_POSIX_PARSER_DEBUG
@@ -126,10 +132,36 @@ RefPtr<AST::Node> Parser::parse()
     return parse_complete_command();
 }
 
-Optional<Token> Parser::next_expanded_token()
+void Parser::handle_heredoc_contents()
 {
+    while (!eof() && m_token_buffer[m_token_index].type == Token::Type::HeredocContents) {
+        auto& token = m_token_buffer[m_token_index++];
+        auto entry = m_unprocessed_heredoc_entries.get(token.relevant_heredoc_key.value());
+        if (!entry.has_value()) {
+            error(token, "Discarding unexpected heredoc contents for key '{}'", *token.relevant_heredoc_key);
+            continue;
+        }
+
+        auto& heredoc = **entry;
+
+        RefPtr<AST::Node> contents;
+        if (heredoc.allow_interpolation()) {
+            Parser parser { token.value, m_in_interactive_mode, Reduction::HeredocContents };
+            contents = parser.parse_word();
+        } else {
+            contents = make_ref_counted<AST::StringLiteral>(token.position.value_or(empty_position()), token.value, AST::StringLiteral::EnclosureType::None);
+        }
+
+        if (contents)
+            heredoc.set_contents(contents);
+        m_unprocessed_heredoc_entries.remove(*token.relevant_heredoc_key);
+    }
+}
+
+Optional<Token> Parser::next_expanded_token(Optional<Reduction> starting_reduction)
+{
     while (m_token_buffer.find_if([](auto& token) { return token.type == Token::Type::Eof; }).is_end()) {
-        auto tokens = m_lexer.batch_next();
+        auto tokens = m_lexer.batch_next(starting_reduction);
         auto expanded = perform_expansions(move(tokens));
         m_token_buffer.extend(expanded);
     }
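handle_heredoc_contents() connects each HeredocContents token back to its AST::Heredoc node: if interpolation is allowed, the body is re-parsed as a word by a nested Parser started in the HeredocContents reduction; otherwise it becomes a plain StringLiteral. The observable difference, as illustrative shell input (not part of the diff):

    cat <<INTERP
    today is $(date)
    INTERP

    cat <<'LITERAL'
    today is $(date)
    LITERAL

The first body runs the command substitution; the second prints the line verbatim.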
@@ -589,11 +621,6 @@ Vector<Token> Parser::perform_expansions(Vector<Token> tokens)
     return tokens;
 }
 
-static AST::Position empty_position()
-{
-    return { 0, 0, { 0, 0 }, { 0, 0 } };
-}
-
 RefPtr<AST::Node> Parser::parse_complete_command()
 {
     auto list = [&] {
@@ -1835,13 +1862,47 @@ RefPtr<AST::Node> Parser::parse_io_redirect()
     if (auto io_file = parse_io_file(start_position, io_number))
         return io_file;
 
-    // if (auto io_here = parse_io_here(start_position, io_number))
-    //     return io_here;
+    if (auto io_here = parse_io_here(start_position, io_number))
+        return io_here;
 
     m_token_index = start_index;
     return nullptr;
 }
 
+RefPtr<AST::Node> Parser::parse_io_here(AST::Position start_position, Optional<int> fd)
+{
+    // io_here: IO_NUMBER? (DLESS | DLESSDASH) WORD
+    auto io_operator = peek().type;
+    if (!is_one_of(io_operator, Token::Type::DoubleLess, Token::Type::DoubleLessDash))
+        return nullptr;
+
+    auto io_operator_token = consume();
+
+    auto redirection_fd = fd.value_or(0);
+
+    auto end_keyword = consume();
+    if (!is_one_of(end_keyword.type, Token::Type::Word, Token::Type::Token))
+        return make_ref_counted<AST::SyntaxError>(io_operator_token.position.value_or(start_position), "Expected a heredoc keyword", true);
+
+    auto [end_keyword_text, allow_interpolation] = Lexer::process_heredoc_key(end_keyword);
+    RefPtr<AST::SyntaxError> error;
+
+    auto position = start_position.with_end(peek().position.value_or(empty_position()));
+    auto result = make_ref_counted<AST::Heredoc>(
+        position,
+        end_keyword_text,
+        allow_interpolation,
+        io_operator == Token::Type::DoubleLessDash,
+        Optional<int> { redirection_fd });
+
+    m_unprocessed_heredoc_entries.set(end_keyword_text, result);
+
+    if (error)
+        result->set_is_syntax_error(*error);
+
+    return result;
+}
+
 RefPtr<AST::Node> Parser::parse_io_file(AST::Position start_position, Optional<int> fd)
 {
     auto start_index = m_token_index;

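parse_io_here() implements the grammar noted in its comment (io_here: IO_NUMBER? (DLESS | DLESSDASH) WORD), so a heredoc can be attached to a descriptor other than stdin, which defaults to fd 0 via fd.value_or(0). Illustrative POSIX input (not from the diff):

    exec 3<<EOF
    hello from fd 3
    EOF
    cat <&3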
@@ -13,12 +13,12 @@ namespace Shell::Posix {
 
 class Parser {
 public:
-    Parser(StringView input, bool interactive = false)
+    Parser(StringView input, bool interactive = false, Optional<Reduction> starting_reduction = {})
         : m_lexer(input)
         , m_in_interactive_mode(interactive)
         , m_eof_token(Token::eof())
     {
-        fill_token_buffer();
+        fill_token_buffer(starting_reduction);
     }
 
     RefPtr<AST::Node> parse();
@@ -31,20 +31,23 @@ public:
     auto& errors() const { return m_errors; }
 
 private:
-    Optional<Token> next_expanded_token();
+    Optional<Token> next_expanded_token(Optional<Reduction> starting_reduction = {});
     Vector<Token> perform_expansions(Vector<Token> tokens);
-    void fill_token_buffer();
+    void fill_token_buffer(Optional<Reduction> starting_reduction = {});
+    void handle_heredoc_contents();
 
-    Token const& peek() const
+    Token const& peek()
     {
         if (eof())
             return m_eof_token;
+        handle_heredoc_contents();
         return m_token_buffer[m_token_index];
     }
     Token const& consume()
     {
         if (eof())
             return m_eof_token;
+        handle_heredoc_contents();
         return m_token_buffer[m_token_index++];
     }
     void skip()
@@ -108,6 +111,7 @@ private:
     Vector<Token> m_previous_token_buffer;
 
     Vector<Error> m_errors;
+    HashMap<DeprecatedString, NonnullRefPtr<AST::Heredoc>> m_unprocessed_heredoc_entries;
 
     Token m_eof_token;
 