From ef45221c215fead985198d4762de2f356d21c25c Mon Sep 17 00:00:00 2001 From: Shannon Booth Date: Sun, 2 Jul 2023 19:19:22 +1200 Subject: [PATCH] LibDiff: Make parsing of unified hunks more robust Parsing of the unified hunks now verifies that the expected number of lines given by the unified location at the beginning of that hunk are actually in that hunk. Furthermore, we no longer crash when given a bogus unified range. As a further benefit, this begins the scaffolding for a patch parser which should assist us in parsing full patches - even when we are not aware of the format that a patch has been written in. --- Userland/Libraries/LibDiff/Forward.h | 2 + Userland/Libraries/LibDiff/Hunks.cpp | 156 +++++++++++++++------------ Userland/Libraries/LibDiff/Hunks.h | 16 ++- 3 files changed, 102 insertions(+), 72 deletions(-) diff --git a/Userland/Libraries/LibDiff/Forward.h b/Userland/Libraries/LibDiff/Forward.h index 484bbf0dd9..c21cad11da 100644 --- a/Userland/Libraries/LibDiff/Forward.h +++ b/Userland/Libraries/LibDiff/Forward.h @@ -8,6 +8,8 @@ namespace Diff { +class Parser; + struct Hunk; struct HunkLocation; struct Line; diff --git a/Userland/Libraries/LibDiff/Hunks.cpp b/Userland/Libraries/LibDiff/Hunks.cpp index 10c17ad685..cecce7c9b3 100644 --- a/Userland/Libraries/LibDiff/Hunks.cpp +++ b/Userland/Libraries/LibDiff/Hunks.cpp @@ -10,45 +10,100 @@ namespace Diff { -ErrorOr> parse_hunks(StringView diff) +Optional Parser::consume_unified_location() { - Vector diff_lines = diff.split_view('\n'); - if (diff_lines.is_empty()) - return Vector {}; + auto consume_range = [this](Range& range) { + if (!consume_line_number(range.start_line)) + return false; + if (consume_specific(',')) { + if (!consume_line_number(range.number_of_lines)) + return false; + } else { + range.number_of_lines = 1; + } + return true; + }; + + if (!consume_specific("@@ -")) + return {}; + + HunkLocation location; + + if (!consume_range(location.old_range)) + return {}; + + if (!consume_specific(" +")) + return {}; + + if (!consume_range(location.new_range)) + return {}; + + if (!consume_specific(" @@")) + return {}; + + return location; +} + +bool Parser::consume_line_number(size_t& number) +{ + auto line = consume_while(is_ascii_digit); + + auto maybe_number = line.to_uint(); + if (!maybe_number.has_value()) + return false; + + number = maybe_number.value(); + return true; +} + +ErrorOr> Parser::parse_hunks() +{ Vector hunks; - size_t line_index = 0; - HunkLocation current_location {}; + while (!is_eof()) { + // Try an locate a hunk location in this hunk. It may be prefixed with information. + auto maybe_location = consume_unified_location(); + consume_line(); - // Skip to first hunk - while (diff_lines[line_index][0] != '@') { - ++line_index; - } - - while (line_index < diff_lines.size()) { - if (diff_lines[line_index][0] == '@') { - current_location = parse_hunk_location(diff_lines[line_index]); - ++line_index; + if (!maybe_location.has_value()) continue; - } - Hunk hunk {}; - hunk.location = current_location; + Hunk hunk { *maybe_location, {} }; - while (line_index < diff_lines.size()) { - auto const& line = diff_lines[line_index]; + auto old_lines_expected = hunk.location.old_range.number_of_lines; + auto new_lines_expected = hunk.location.new_range.number_of_lines; - char const operation = line[0]; - if (operation != ' ' && operation != '+' && operation != '-') - break; + // We've found a location. Now parse out all of the expected content lines. + while (old_lines_expected != 0 || new_lines_expected != 0) { + StringView line = consume_line(); + + if (line.is_empty()) + return Error::from_string_literal("Malformed empty content line in patch"); + + if (line[0] != ' ' && line[0] != '+' && line[0] != '-') + return Error::from_string_literal("Invaid operation in patch"); + + auto const operation = Line::operation_from_symbol(line[0]); + + if (operation != Line::Operation::Removal) { + if (new_lines_expected == 0) + return Error::from_string_literal("Found more removal and context lines in patch than expected"); + + --new_lines_expected; + } + + if (operation != Line::Operation::Addition) { + if (old_lines_expected == 0) + return Error::from_string_literal("Found more addition and context lines in patch than expected"); + + --old_lines_expected; + } auto const content = line.substring_view(1, line.length() - 1); - - TRY(hunk.lines.try_append(Line { Line::operation_from_symbol(operation), TRY(String::from_utf8(content)) })); - - ++line_index; + TRY(hunk.lines.try_append(Line { operation, TRY(String::from_utf8(content)) })); } + TRY(hunks.try_append(hunk)); } @@ -63,48 +118,9 @@ ErrorOr> parse_hunks(StringView diff) return hunks; } -HunkLocation parse_hunk_location(StringView location_line) +ErrorOr> parse_hunks(StringView diff) { - size_t char_index = 0; - auto parse_start_and_length_pair = [](StringView raw) { - auto maybe_index_of_separator = raw.find(','); - - size_t start = 0; - size_t length = 0; - if (maybe_index_of_separator.has_value()) { - auto index_of_separator = maybe_index_of_separator.value(); - start = raw.substring_view(0, index_of_separator).to_uint().value(); - length = raw.substring_view(index_of_separator + 1, raw.length() - index_of_separator - 1).to_uint().value(); - } else { - length = 1; - start = raw.to_uint().value(); - } - - return Range { start, length }; - }; - while (char_index < location_line.length() && location_line[char_index++] != '-') { - } - VERIFY(char_index < location_line.length()); - - size_t original_location_start_index = char_index; - - while (char_index < location_line.length() && location_line[char_index++] != ' ') { - } - VERIFY(char_index < location_line.length() && location_line[char_index] == '+'); - size_t original_location_end_index = char_index - 2; - - size_t target_location_start_index = char_index + 1; - - char_index += 1; - while (char_index < location_line.length() && location_line[char_index++] != ' ') { - } - VERIFY(char_index < location_line.length()); - - size_t target_location_end_index = char_index - 2; - - auto old_range = parse_start_and_length_pair(location_line.substring_view(original_location_start_index, original_location_end_index - original_location_start_index + 1)); - auto new_range = parse_start_and_length_pair(location_line.substring_view(target_location_start_index, target_location_end_index - target_location_start_index + 1)); - return { old_range, new_range }; + Parser lexer(diff); + return lexer.parse_hunks(); +} } - -}; diff --git a/Userland/Libraries/LibDiff/Hunks.h b/Userland/Libraries/LibDiff/Hunks.h index ca89d35e63..17b3d4a4bc 100644 --- a/Userland/Libraries/LibDiff/Hunks.h +++ b/Userland/Libraries/LibDiff/Hunks.h @@ -9,6 +9,7 @@ #include #include +#include #include #include #include @@ -58,10 +59,21 @@ struct Hunk { Vector lines; }; -ErrorOr> parse_hunks(StringView diff); -HunkLocation parse_hunk_location(StringView location_line); +class Parser : public GenericLexer { +public: + using GenericLexer::GenericLexer; + + ErrorOr> parse_hunks(); + +private: + Optional consume_unified_location(); + bool consume_line_number(size_t& number); }; +ErrorOr> parse_hunks(StringView diff); + +} + template<> struct AK::Formatter : Formatter { ErrorOr format(FormatBuilder& builder, Diff::Line::Operation operation)