
Shell: Add support for heredocs to the POSIX parser

Ali Mohammad Pur 2023-02-16 09:52:13 +03:30 committed by Ali Mohammad Pur
parent a5e4bc4faf
commit 2881bb4c3a
4 changed files with 333 additions and 20 deletions
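For context, heredocs are standard POSIX shell syntax: <<WORD reads the lines that follow as the command's standard input until a line consisting of exactly WORD, and <<-WORD additionally strips leading tab characters from each body line and from the delimiter line. A minimal sketch of the input this commit teaches the lexer to handle, assuming plain POSIX sh semantics:

cat <<EOF
Hello, $USER.
EOF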


@@ -23,8 +23,11 @@ static bool is_part_of_operator(StringView text, char ch)
namespace Shell::Posix {
-Vector<Token> Lexer::batch_next()
+Vector<Token> Lexer::batch_next(Optional<Reduction> starting_reduction)
{
if (starting_reduction.has_value())
m_next_reduction = *starting_reduction;
for (; m_next_reduction != Reduction::None;) {
auto result = reduce(m_next_reduction);
m_next_reduction = result.next_reduction;
@@ -55,6 +58,18 @@ char Lexer::consume()
return ch;
}
void Lexer::reconsume(StringView string)
{
for (auto byte : string.bytes()) {
if (byte == '\n') {
m_state.position.end_line.line_number++;
m_state.position.end_line.line_column = 0;
}
m_state.position.end_offset++;
}
}
bool Lexer::consume_specific(char ch)
{
if (m_lexer.peek() == ch) {
@@ -95,6 +110,8 @@ Lexer::ReductionResult Lexer::reduce(Reduction reduction)
return reduce_command_or_arithmetic_substitution_expansion();
case Reduction::ExtendedParameterExpansion:
return reduce_extended_parameter_expansion();
case Reduction::HeredocContents:
return reduce_heredoc_contents();
}
VERIFY_NOT_REACHED();
@@ -108,6 +125,91 @@ Lexer::ReductionResult Lexer::reduce_end()
};
}
Lexer::HeredocKeyResult Lexer::process_heredoc_key(Token const& token)
{
StringBuilder builder;
enum ParseState {
Free,
InDoubleQuotes,
InSingleQuotes,
};
Vector<ParseState, 4> parse_state;
parse_state.append(Free);
bool escaped = false;
bool had_a_single_quote_segment = false;
for (auto byte : token.value.bytes()) {
switch (parse_state.last()) {
case Free:
switch (byte) {
case '"':
if (escaped) {
builder.append(byte);
escaped = false;
} else {
parse_state.append(InDoubleQuotes);
}
break;
case '\'':
if (escaped) {
builder.append(byte);
escaped = false;
} else {
had_a_single_quote_segment = true;
parse_state.append(InSingleQuotes);
}
break;
case '\\':
if (escaped) {
builder.append(byte);
escaped = false;
} else {
escaped = true;
}
break;
default:
if (escaped) {
builder.append('\\');
escaped = false;
}
builder.append(byte);
break;
}
break;
case InDoubleQuotes:
if (!escaped && byte == '"') {
parse_state.take_last();
break;
}
if (escaped) {
if (byte != '"')
builder.append('\\');
builder.append(byte);
break;
}
if (byte == '\\')
escaped = true;
else
builder.append(byte);
break;
case InSingleQuotes:
if (byte == '\'') {
parse_state.take_last();
break;
}
builder.append(byte);
break;
}
}
// NOTE: Not checking the final state as any garbage that even partially parses is allowed to be used as a key :/
return {
.key = builder.to_deprecated_string(),
.allow_interpolation = !had_a_single_quote_segment,
};
}
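The observable effect of process_heredoc_key() is that the delimiter word is unquoted before it is matched, and any single-quoted segment in it turns interpolation off for the body. A sketch of the difference, assuming standard POSIX semantics:

cat <<KEY
$HOME
KEY
# prints your home directory; an unquoted key allows interpolation

cat <<'KEY'
$HOME
KEY
# prints the literal text $HOME; a quoted key disables interpolation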
Lexer::ReductionResult Lexer::reduce_operator()
{
if (m_lexer.is_eof()) {
@@ -142,8 +244,25 @@ Lexer::ReductionResult Lexer::reduce_operator()
m_state.position.start_line = m_state.position.end_line;
}
auto expect_heredoc_entry = !tokens.is_empty() && (tokens.last().type == Token::Type::DoubleLessDash || tokens.last().type == Token::Type::DoubleLess);
auto result = reduce(Reduction::Start);
tokens.extend(move(result.tokens));
while (expect_heredoc_entry && tokens.size() == 1) {
result = reduce(result.next_reduction);
tokens.extend(move(result.tokens));
}
if (expect_heredoc_entry && tokens.size() > 1) {
auto [key, interpolation] = process_heredoc_key(tokens[1]);
m_state.heredoc_entries.enqueue(HeredocEntry {
.key = key,
.allow_interpolation = interpolation,
.dedent = tokens[0].type == Token::Type::DoubleLessDash,
});
}
return {
.tokens = move(tokens),
.next_reduction = result.next_reduction,
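Since completed keys are enqueued rather than stored one at a time, several heredocs can be opened on a single line, and their bodies are then consumed in the order the redirections appeared. A sketch, assuming standard POSIX semantics:

cat <<ONE; cat <<TWO
first body
ONE
second body
TWO
# prints "first body", then "second body"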
@@ -160,6 +279,7 @@ Lexer::ReductionResult Lexer::reduce_comment()
}
if (consume() == '\n') {
m_state.on_new_line = true;
return {
.tokens = { Token::newline() },
.next_reduction = Reduction::Start,
@@ -352,7 +472,7 @@ Lexer::ReductionResult Lexer::reduce_command_expansion()
};
}
-Lexer::ReductionResult Lexer::reduce_start()
+Lexer::ReductionResult Lexer::reduce_heredoc_contents()
{
if (m_lexer.is_eof()) {
auto tokens = Token::maybe_from_state(m_state);
@@ -366,6 +486,107 @@ Lexer::ReductionResult Lexer::reduce_start()
};
}
if (!m_state.escaping && consume_specific('\\')) {
m_state.escaping = true;
m_state.buffer.append('\\');
return {
.tokens = {},
.next_reduction = Reduction::HeredocContents,
};
}
if (!m_state.escaping && consume_specific('$')) {
m_state.buffer.append('$');
if (m_lexer.next_is("("))
m_state.expansions.empend(CommandExpansion { .command = StringBuilder {}, .range = range() });
else
m_state.expansions.empend(ParameterExpansion { .parameter = StringBuilder {}, .range = range() });
return {
.tokens = {},
.next_reduction = Reduction::Expansion,
};
}
if (!m_state.escaping && consume_specific('`')) {
m_state.buffer.append('`');
m_state.expansions.empend(CommandExpansion { .command = StringBuilder {}, .range = range() });
return {
.tokens = {},
.next_reduction = Reduction::CommandExpansion,
};
}
m_state.escaping = false;
m_state.buffer.append(consume());
return {
.tokens = {},
.next_reduction = Reduction::HeredocContents,
};
}
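Note that reduce_heredoc_contents() only treats backslash, $, and backquote specially, mirroring the POSIX rule that an interpolating heredoc body behaves roughly like a double-quoted string without the surrounding quotes. A sketch:

cat <<END
user: $USER
cwd: $(pwd)
literal dollar: \$USER
END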
Lexer::ReductionResult Lexer::reduce_start()
{
auto was_on_new_line = m_state.on_new_line;
m_state.on_new_line = false;
if (m_lexer.is_eof()) {
auto tokens = Token::maybe_from_state(m_state);
m_state.buffer.clear();
m_state.position.start_offset = m_state.position.end_offset;
m_state.position.start_line = m_state.position.end_line;
return {
.tokens = move(tokens),
.next_reduction = Reduction::End,
};
}
if (was_on_new_line && !m_state.heredoc_entries.is_empty()) {
auto const& entry = m_state.heredoc_entries.head();
auto start_index = m_lexer.tell();
Optional<size_t> end_index;
for (; !m_lexer.is_eof();) {
auto index = m_lexer.tell();
auto possible_end_index = m_lexer.tell();
if (m_lexer.consume_specific('\n')) {
if (entry.dedent)
m_lexer.ignore_while(is_any_of("\t"sv));
if (m_lexer.consume_specific(entry.key.view())) {
if (m_lexer.consume_specific('\n') || m_lexer.is_eof()) {
end_index = possible_end_index;
break;
}
}
}
if (m_lexer.tell() == index)
m_lexer.ignore();
}
auto contents = m_lexer.input().substring_view(start_index, end_index.value_or(m_lexer.tell()) - start_index);
reconsume(contents);
m_state.buffer.clear();
m_state.buffer.append(contents);
auto token = Token::maybe_from_state(m_state).first();
token.relevant_heredoc_key = entry.key;
token.type = Token::Type::HeredocContents;
m_state.heredoc_entries.dequeue();
m_state.on_new_line = true;
m_state.buffer.clear();
return {
.tokens = { move(token) },
.next_reduction = Reduction::Start,
};
}
if (m_state.escaping && consume_specific('\n')) {
m_state.escaping = false;
@@ -391,6 +612,8 @@ Lexer::ReductionResult Lexer::reduce_start()
auto tokens = Token::maybe_from_state(m_state);
tokens.append(Token::newline());
m_state.on_new_line = true;
m_state.buffer.clear();
m_state.position.start_offset = m_state.position.end_offset;
m_state.position.start_line = m_state.position.end_line;
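The body scan in reduce_start() above looks for the key only at the start of a line, and for dedenting (<<-) entries skips leading tabs before matching, which is what allows heredocs to be indented inside compound commands. A sketch, where the indentation must be tab characters rather than spaces:

if true; then
	cat <<-END
		indented body
	END
fi
# prints "indented body" with the leading tabs stripped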
@@ -678,6 +901,8 @@ StringView Token::type_name() const
return "Clobber"sv;
case Type::Semicolon:
return "Semicolon"sv;
case Type::HeredocContents:
return "HeredocContents"sv;
case Type::AssignmentWord:
return "AssignmentWord"sv;
case Type::Bang: