
Shell: Add support for heredocs to the POSIX parser

Ali Mohammad Pur 2023-02-16 09:52:13 +03:30 committed by Ali Mohammad Pur
parent a5e4bc4faf
commit 2881bb4c3a
4 changed files with 333 additions and 20 deletions
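
For context, a heredoc feeds a block of literal text to a command's input: <<KEY reads the following lines verbatim until a line containing only KEY, <<-KEY additionally strips leading tab characters, and single-quoting the key disables interpolation of the body. A minimal driver sketch (not part of this commit; it assumes the usual Serenity include paths and uses a plain main() for brevity):

#include <AK/RefPtr.h>
#include <Shell/PosixParser.h>

int main()
{
    // <<-'EOF': tab-stripped body, no interpolation (single-quoted key).
    auto source = "cat <<-'EOF'\n\thello $USER\nEOF\n"sv;
    Shell::Posix::Parser parser { source };
    // On success this yields `cat` carrying an AST::Heredoc redirection.
    RefPtr<Shell::AST::Node> ast = parser.parse();
    return (ast && parser.errors().is_empty()) ? 0 : 1;
}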

Userland/Shell/PosixLexer.cpp

@@ -23,8 +23,11 @@ static bool is_part_of_operator(StringView text, char ch)
namespace Shell::Posix {
Vector<Token> Lexer::batch_next()
Vector<Token> Lexer::batch_next(Optional<Reduction> starting_reduction)
{
if (starting_reduction.has_value())
m_next_reduction = *starting_reduction;
for (; m_next_reduction != Reduction::None;) {
auto result = reduce(m_next_reduction);
m_next_reduction = result.next_reduction;
@@ -55,6 +58,18 @@ char Lexer::consume()
return ch;
}
void Lexer::reconsume(StringView string)
{
for (auto byte : string.bytes()) {
if (byte == '\n') {
m_state.position.end_line.line_number++;
m_state.position.end_line.line_column = 0;
}
m_state.position.end_offset++;
}
}
bool Lexer::consume_specific(char ch)
{
if (m_lexer.peek() == ch) {
@@ -95,6 +110,8 @@ Lexer::ReductionResult Lexer::reduce(Reduction reduction)
return reduce_command_or_arithmetic_substitution_expansion();
case Reduction::ExtendedParameterExpansion:
return reduce_extended_parameter_expansion();
case Reduction::HeredocContents:
return reduce_heredoc_contents();
}
VERIFY_NOT_REACHED();
@@ -108,6 +125,91 @@ Lexer::ReductionResult Lexer::reduce_end()
};
}
Lexer::HeredocKeyResult Lexer::process_heredoc_key(Token const& token)
{
StringBuilder builder;
enum ParseState {
Free,
InDoubleQuotes,
InSingleQuotes,
};
Vector<ParseState, 4> parse_state;
parse_state.append(Free);
bool escaped = false;
bool had_a_single_quote_segment = false;
for (auto byte : token.value.bytes()) {
switch (parse_state.last()) {
case Free:
switch (byte) {
case '"':
if (escaped) {
builder.append(byte);
escaped = false;
} else {
parse_state.append(InDoubleQuotes);
}
break;
case '\'':
if (escaped) {
builder.append(byte);
escaped = false;
} else {
had_a_single_quote_segment = true;
parse_state.append(InSingleQuotes);
}
break;
case '\\':
if (escaped) {
builder.append(byte);
escaped = false;
} else {
escaped = true;
}
break;
default:
if (escaped) {
builder.append('\\');
escaped = false;
}
builder.append(byte);
break;
}
break;
case InDoubleQuotes:
if (!escaped && byte == '"') {
parse_state.take_last();
break;
}
if (escaped) {
if (byte != '"')
builder.append('\\');
builder.append(byte);
break;
}
if (byte == '\\')
escaped = true;
else
builder.append(byte);
break;
case InSingleQuotes:
if (byte == '\'') {
parse_state.take_last();
break;
}
builder.append(byte);
break;
}
}
// NOTE: Not checking the final state as any garbage that even partially parses is allowed to be used as a key :/
return {
.key = builder.to_deprecated_string(),
.allow_interpolation = !had_a_single_quote_segment,
};
}
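
process_heredoc_key() strips one level of quoting from the delimiter and records whether it saw a single-quoted segment; note that, as written, only a single-quoted segment disables interpolation, so a purely double-quoted key still interpolates. A few assumed input/output pairs (illustrative, not from the commit's tests):

// Assumed behaviour for various redirection keys:
//   <<EOF    -> key = "EOF",  allow_interpolation = true
//   <<'EOF'  -> key = "EOF",  allow_interpolation = false
//   <<"EOF"  -> key = "EOF",  allow_interpolation = true (no single-quote segment)
//   <<EO\F   -> key = "EO\F", allow_interpolation = true (backslash is kept before
//                ordinary characters, dropped before a quote it escapes)
auto [key, allow_interpolation] = Lexer::process_heredoc_key(token); // `token` holds the delimiter word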
Lexer::ReductionResult Lexer::reduce_operator()
{
if (m_lexer.is_eof()) {
@@ -142,8 +244,25 @@ Lexer::ReductionResult Lexer::reduce_operator()
m_state.position.start_line = m_state.position.end_line;
}
auto expect_heredoc_entry = !tokens.is_empty() && (tokens.last().type == Token::Type::DoubleLessDash || tokens.last().type == Token::Type::DoubleLess);
auto result = reduce(Reduction::Start);
tokens.extend(move(result.tokens));
while (expect_heredoc_entry && tokens.size() == 1) {
result = reduce(result.next_reduction);
tokens.extend(move(result.tokens));
}
if (expect_heredoc_entry && tokens.size() > 1) {
auto [key, interpolation] = process_heredoc_key(tokens[1]);
m_state.heredoc_entries.enqueue(HeredocEntry {
.key = key,
.allow_interpolation = interpolation,
.dedent = tokens[0].type == Token::Type::DoubleLessDash,
});
}
return {
.tokens = move(tokens),
.next_reduction = result.next_reduction,
@@ -160,6 +279,7 @@ Lexer::ReductionResult Lexer::reduce_comment()
}
if (consume() == '\n') {
m_state.on_new_line = true;
return {
.tokens = { Token::newline() },
.next_reduction = Reduction::Start,
@@ -352,7 +472,7 @@ Lexer::ReductionResult Lexer::reduce_command_expansion()
};
}
Lexer::ReductionResult Lexer::reduce_start()
Lexer::ReductionResult Lexer::reduce_heredoc_contents()
{
if (m_lexer.is_eof()) {
auto tokens = Token::maybe_from_state(m_state);
@@ -366,6 +486,107 @@ Lexer::ReductionResult Lexer::reduce_start()
};
}
if (!m_state.escaping && consume_specific('\\')) {
m_state.escaping = true;
m_state.buffer.append('\\');
return {
.tokens = {},
.next_reduction = Reduction::HeredocContents,
};
}
if (!m_state.escaping && consume_specific('$')) {
m_state.buffer.append('$');
if (m_lexer.next_is("("))
m_state.expansions.empend(CommandExpansion { .command = StringBuilder {}, .range = range() });
else
m_state.expansions.empend(ParameterExpansion { .parameter = StringBuilder {}, .range = range() });
return {
.tokens = {},
.next_reduction = Reduction::Expansion,
};
}
if (!m_state.escaping && consume_specific('`')) {
m_state.buffer.append('`');
m_state.expansions.empend(CommandExpansion { .command = StringBuilder {}, .range = range() });
return {
.tokens = {},
.next_reduction = Reduction::CommandExpansion,
};
}
m_state.escaping = false;
m_state.buffer.append(consume());
return {
.tokens = {},
.next_reduction = Reduction::HeredocContents,
};
}
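
reduce_heredoc_contents() lexes a heredoc body much like ordinary word contents, handling backslash escapes and entering the $-expansion and backtick reductions. The body itself is carved out in reduce_start() below: on a fresh line with queued heredoc entries, the lexer scans ahead for a line consisting of the key (leading tabs stripped first for <<-) and turns everything before it into a single HeredocContents token. A standalone model of that scan, simplified into plain C++ (std::string_view instead of AK; not the commit's code):

#include <optional>
#include <string_view>

// Returns the offset of the newline that ends the heredoc body starting at
// `from`, or an empty optional if the terminator line is never found.
std::optional<size_t> find_heredoc_end(std::string_view input, size_t from,
    std::string_view key, bool dedent)
{
    for (size_t i = from; i < input.size(); ++i) {
        if (input[i] != '\n')
            continue;
        size_t line = i + 1; // a terminator candidate starts after this newline
        if (dedent) // <<-KEY ignores leading tabs on the terminator line
            while (line < input.size() && input[line] == '\t')
                ++line;
        if (input.compare(line, key.size(), key) == 0) {
            size_t after = line + key.size();
            if (after == input.size() || input[after] == '\n')
                return i; // the body ends just before this newline
        }
    }
    return {};
}

For example, find_heredoc_end("hi\nEOF\n", 0, "EOF", false) returns 2, the offset of the newline that closes the body. Like the lexer's loop, this model never matches a terminator on the very first body line, since it requires a preceding newline.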
Lexer::ReductionResult Lexer::reduce_start()
{
auto was_on_new_line = m_state.on_new_line;
m_state.on_new_line = false;
if (m_lexer.is_eof()) {
auto tokens = Token::maybe_from_state(m_state);
m_state.buffer.clear();
m_state.position.start_offset = m_state.position.end_offset;
m_state.position.start_line = m_state.position.end_line;
return {
.tokens = move(tokens),
.next_reduction = Reduction::End,
};
}
if (was_on_new_line && !m_state.heredoc_entries.is_empty()) {
auto const& entry = m_state.heredoc_entries.head();
auto start_index = m_lexer.tell();
Optional<size_t> end_index;
for (; !m_lexer.is_eof();) {
auto index = m_lexer.tell();
auto possible_end_index = m_lexer.tell();
if (m_lexer.consume_specific('\n')) {
if (entry.dedent)
m_lexer.ignore_while(is_any_of("\t"sv));
if (m_lexer.consume_specific(entry.key.view())) {
if (m_lexer.consume_specific('\n') || m_lexer.is_eof()) {
end_index = possible_end_index;
break;
}
}
}
if (m_lexer.tell() == index)
m_lexer.ignore();
}
auto contents = m_lexer.input().substring_view(start_index, end_index.value_or(m_lexer.tell()) - start_index);
reconsume(contents);
m_state.buffer.clear();
m_state.buffer.append(contents);
auto token = Token::maybe_from_state(m_state).first();
token.relevant_heredoc_key = entry.key;
token.type = Token::Type::HeredocContents;
m_state.heredoc_entries.dequeue();
m_state.on_new_line = true;
m_state.buffer.clear();
return {
.tokens = { move(token) },
.next_reduction = Reduction::Start,
};
}
if (m_state.escaping && consume_specific('\n')) {
m_state.escaping = false;
@@ -391,6 +612,8 @@ Lexer::ReductionResult Lexer::reduce_start()
auto tokens = Token::maybe_from_state(m_state);
tokens.append(Token::newline());
m_state.on_new_line = true;
m_state.buffer.clear();
m_state.position.start_offset = m_state.position.end_offset;
m_state.position.start_line = m_state.position.end_line;
@@ -678,6 +901,8 @@ StringView Token::type_name() const
return "Clobber"sv;
case Type::Semicolon:
return "Semicolon"sv;
case Type::HeredocContents:
return "HeredocContents"sv;
case Type::AssignmentWord:
return "AssignmentWord"sv;
case Type::Bang:

Userland/Shell/PosixLexer.h

@@ -8,6 +8,7 @@
#include <AK/DeprecatedString.h>
#include <AK/GenericLexer.h>
#include <AK/Queue.h>
#include <AK/Variant.h>
#include <AK/Vector.h>
#include <Shell/AST.h>
@@ -29,6 +30,9 @@ enum class Reduction {
ParameterExpansion,
CommandOrArithmeticSubstitutionExpansion,
ExtendedParameterExpansion,
// Separate rule, not used by the main flow.
HeredocContents,
};
struct ExpansionRange {
@@ -177,6 +181,12 @@ struct ResolvedCommandExpansion {
using ResolvedExpansion = Variant<ResolvedParameterExpansion, ResolvedCommandExpansion>;
struct HeredocEntry {
DeprecatedString key;
bool allow_interpolation;
bool dedent;
};
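
For example, a <<-'END' redirection queues an entry like this (field values following process_heredoc_key() in the lexer above):

HeredocEntry { .key = "END", .allow_interpolation = false, .dedent = true }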
struct State {
StringBuilder buffer {};
Reduction previous_reduction { Reduction::Start };
@@ -194,6 +204,8 @@ struct State {
},
};
Vector<Expansion> expansions {};
Queue<HeredocEntry> heredoc_entries {};
bool on_new_line { true };
};
struct Token {
@@ -219,6 +231,7 @@ struct Token {
DoubleLessDash,
Clobber,
Semicolon,
HeredocContents,
// Not produced by this lexer, but generated in later stages.
AssignmentWord,
@@ -249,6 +262,7 @@ struct Token {
Vector<Expansion> expansions;
Vector<ResolvedExpansion> resolved_expansions {};
StringView original_text;
Optional<DeprecatedString> relevant_heredoc_key {};
bool could_be_start_of_a_simple_command { false };
static Vector<Token> maybe_from_state(State const& state)
@@ -378,7 +392,14 @@ public:
{
}
Vector<Token> batch_next();
Vector<Token> batch_next(Optional<Reduction> starting_reduction = {});
struct HeredocKeyResult {
DeprecatedString key;
bool allow_interpolation;
};
static HeredocKeyResult process_heredoc_key(Token const&);
private:
struct ReductionResult {
@@ -400,9 +421,11 @@ private:
ReductionResult reduce_parameter_expansion();
ReductionResult reduce_command_or_arithmetic_substitution_expansion();
ReductionResult reduce_extended_parameter_expansion();
ReductionResult reduce_heredoc_contents();
char consume();
bool consume_specific(char);
void reconsume(StringView);
ExpansionRange range(ssize_t offset = 0) const;
GenericLexer m_lexer;

Userland/Shell/PosixParser.cpp

@@ -9,6 +9,11 @@
#include <AK/StringUtils.h>
#include <Shell/PosixParser.h>
static Shell::AST::Position empty_position()
{
return { 0, 0, { 0, 0 }, { 0, 0 } };
}
template<typename T, typename... Ts>
static inline bool is_one_of(T const& value, Ts const&... values)
{
@@ -22,7 +27,8 @@ static inline bool is_io_operator(Shell::Posix::Token const& token)
Token::Type::Less, Token::Type::Great,
Token::Type::LessAnd, Token::Type::GreatAnd,
Token::Type::DoubleLess, Token::Type::DoubleGreat,
Token::Type::LessGreat, Token::Type::Clobber);
Token::Type::DoubleLessDash, Token::Type::LessGreat,
Token::Type::Clobber);
}
static inline bool is_separator(Shell::Posix::Token const& token)
@@ -95,10 +101,10 @@ static inline bool is_valid_name(StringView word)
}
namespace Shell::Posix {
void Parser::fill_token_buffer()
void Parser::fill_token_buffer(Optional<Reduction> starting_reduction)
{
for (;;) {
auto token = next_expanded_token();
auto token = next_expanded_token(starting_reduction);
if (!token.has_value())
break;
#if SHELL_POSIX_PARSER_DEBUG
@@ -126,10 +132,36 @@ RefPtr<AST::Node> Parser::parse()
return parse_complete_command();
}
Optional<Token> Parser::next_expanded_token()
void Parser::handle_heredoc_contents()
{
while (!eof() && m_token_buffer[m_token_index].type == Token::Type::HeredocContents) {
auto& token = m_token_buffer[m_token_index++];
auto entry = m_unprocessed_heredoc_entries.get(token.relevant_heredoc_key.value());
if (!entry.has_value()) {
error(token, "Discarding unexpected heredoc contents for key '{}'", *token.relevant_heredoc_key);
continue;
}
auto& heredoc = **entry;
RefPtr<AST::Node> contents;
if (heredoc.allow_interpolation()) {
Parser parser { token.value, m_in_interactive_mode, Reduction::HeredocContents };
contents = parser.parse_word();
} else {
contents = make_ref_counted<AST::StringLiteral>(token.position.value_or(empty_position()), token.value, AST::StringLiteral::EnclosureType::None);
}
if (contents)
heredoc.set_contents(contents);
m_unprocessed_heredoc_entries.remove(*token.relevant_heredoc_key);
}
}
Optional<Token> Parser::next_expanded_token(Optional<Reduction> starting_reduction)
{
while (m_token_buffer.find_if([](auto& token) { return token.type == Token::Type::Eof; }).is_end()) {
auto tokens = m_lexer.batch_next();
auto tokens = m_lexer.batch_next(starting_reduction);
auto expanded = perform_expansions(move(tokens));
m_token_buffer.extend(expanded);
}
@@ -589,11 +621,6 @@ Vector<Token> Parser::perform_expansions(Vector<Token> tokens)
return tokens;
}
static AST::Position empty_position()
{
return { 0, 0, { 0, 0 }, { 0, 0 } };
}
RefPtr<AST::Node> Parser::parse_complete_command()
{
auto list = [&] {
@@ -1835,13 +1862,47 @@ RefPtr<AST::Node> Parser::parse_io_redirect()
if (auto io_file = parse_io_file(start_position, io_number))
return io_file;
// if (auto io_here = parse_io_here(start_position, io_number))
// return io_here;
if (auto io_here = parse_io_here(start_position, io_number))
return io_here;
m_token_index = start_index;
return nullptr;
}
RefPtr<AST::Node> Parser::parse_io_here(AST::Position start_position, Optional<int> fd)
{
// io_here: IO_NUMBER? (DLESS | DLESSDASH) WORD
auto io_operator = peek().type;
if (!is_one_of(io_operator, Token::Type::DoubleLess, Token::Type::DoubleLessDash))
return nullptr;
auto io_operator_token = consume();
auto redirection_fd = fd.value_or(0);
auto end_keyword = consume();
if (!is_one_of(end_keyword.type, Token::Type::Word, Token::Type::Token))
return make_ref_counted<AST::SyntaxError>(io_operator_token.position.value_or(start_position), "Expected a heredoc keyword", true);
auto [end_keyword_text, allow_interpolation] = Lexer::process_heredoc_key(end_keyword);
RefPtr<AST::SyntaxError> error;
auto position = start_position.with_end(peek().position.value_or(empty_position()));
auto result = make_ref_counted<AST::Heredoc>(
position,
end_keyword_text,
allow_interpolation,
io_operator == Token::Type::DoubleLessDash,
Optional<int> { redirection_fd });
m_unprocessed_heredoc_entries.set(end_keyword_text, result);
if (error)
result->set_is_syntax_error(*error);
return result;
}
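
Heredocs are thus resolved in two phases: parse_io_here() builds an AST::Heredoc with empty contents and registers it in m_unprocessed_heredoc_entries under its processed key, and handle_heredoc_contents() later attaches the body once the lexer's HeredocContents token is consumed. An assumed trace for one input (positions elided):

// Input: "cat <<EOF\nhi $name\nEOF\n"
// 1. parse_io_here() consumes `<<` and `EOF`, creates an AST::Heredoc
//    (key "EOF", interpolation enabled, no dedent, fd 0, contents still null)
//    and records it: m_unprocessed_heredoc_entries.set("EOF", result);
// 2. on the next line the lexer emits one HeredocContents token whose value
//    is the body text, with relevant_heredoc_key = "EOF";
// 3. peek()/consume() call handle_heredoc_contents(), which re-parses the body
//    (Parser { body, interactive, Reduction::HeredocContents } -> parse_word())
//    and calls heredoc.set_contents(...), completing the node.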
RefPtr<AST::Node> Parser::parse_io_file(AST::Position start_position, Optional<int> fd)
{
auto start_index = m_token_index;

Userland/Shell/PosixParser.h

@@ -13,12 +13,12 @@ namespace Shell::Posix {
class Parser {
public:
Parser(StringView input, bool interactive = false)
Parser(StringView input, bool interactive = false, Optional<Reduction> starting_reduction = {})
: m_lexer(input)
, m_in_interactive_mode(interactive)
, m_eof_token(Token::eof())
{
fill_token_buffer();
fill_token_buffer(starting_reduction);
}
RefPtr<AST::Node> parse();
@@ -31,20 +31,23 @@ public:
auto& errors() const { return m_errors; }
private:
Optional<Token> next_expanded_token();
Optional<Token> next_expanded_token(Optional<Reduction> starting_reduction = {});
Vector<Token> perform_expansions(Vector<Token> tokens);
void fill_token_buffer();
void fill_token_buffer(Optional<Reduction> starting_reduction = {});
void handle_heredoc_contents();
Token const& peek() const
Token const& peek()
{
if (eof())
return m_eof_token;
handle_heredoc_contents();
return m_token_buffer[m_token_index];
}
Token const& consume()
{
if (eof())
return m_eof_token;
handle_heredoc_contents();
return m_token_buffer[m_token_index++];
}
void skip()
@@ -108,6 +111,7 @@ private:
Vector<Token> m_previous_token_buffer;
Vector<Error> m_errors;
HashMap<DeprecatedString, NonnullRefPtr<AST::Heredoc>> m_unprocessed_heredoc_entries;
Token m_eof_token;