From c2ad29c85f3b0f6c7e9e6f2aeb62e4807042c8ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Julian=20Offenh=C3=A4user?= Date: Thu, 10 Nov 2022 23:32:41 +0100 Subject: [PATCH] LibPDF: Implement png predictor decoding for flate filter For flate and lzw filters, the data can be transformed by this predictor function to make it compress better. For us this means that we have to undo this step in order to get the right result. Although this feature is meant for images, I found at least a few documents that use it all over the place, making this step very important. --- Userland/Libraries/LibPDF/CommonNames.h | 5 ++ Userland/Libraries/LibPDF/Filter.cpp | 113 +++++++++++++++++++++--- Userland/Libraries/LibPDF/Filter.h | 6 +- Userland/Libraries/LibPDF/Parser.cpp | 28 +++++- 4 files changed, 137 insertions(+), 15 deletions(-) diff --git a/Userland/Libraries/LibPDF/CommonNames.h b/Userland/Libraries/LibPDF/CommonNames.h index 6f9b613935..9ef7481faf 100644 --- a/Userland/Libraries/LibPDF/CommonNames.h +++ b/Userland/Libraries/LibPDF/CommonNames.h @@ -18,6 +18,7 @@ A(BM) \ A(BaseEncoding) \ A(BaseFont) \ + A(BitsPerComponent) \ A(BlackPoint) \ A(C) \ A(CA) \ @@ -25,7 +26,9 @@ A(CalRGB) \ A(CIDSystemInfo) \ A(CIDToGIDMap) \ + A(Colors) \ A(ColorSpace) \ + A(Columns) \ A(Contents) \ A(Count) \ A(CropBox) \ @@ -33,6 +36,7 @@ A(D) \ A(DW) \ A(DCTDecode) \ + A(DecodeParms) \ A(DescendantFonts) \ A(Dest) \ A(Dests) \ @@ -101,6 +105,7 @@ A(Pages) \ A(Parent) \ A(Pattern) \ + A(Predictor) \ A(Prev) \ A(R) \ A(RI) \ diff --git a/Userland/Libraries/LibPDF/Filter.cpp b/Userland/Libraries/LibPDF/Filter.cpp index c184952be0..021fe65006 100644 --- a/Userland/Libraries/LibPDF/Filter.cpp +++ b/Userland/Libraries/LibPDF/Filter.cpp @@ -12,8 +12,24 @@ namespace PDF { -ErrorOr Filter::decode(ReadonlyBytes bytes, FlyString const& encoding_type) +ErrorOr Filter::decode(ReadonlyBytes bytes, FlyString const& encoding_type, RefPtr decode_parms) { + int predictor = 1; + int columns = 1; + int colors = 1; + 
int bits_per_component = 8; + + if (decode_parms) { + if (decode_parms->contains(CommonNames::Predictor)) + predictor = decode_parms->get_value(CommonNames::Predictor).get(); + if (decode_parms->contains(CommonNames::Columns)) + columns = decode_parms->get_value(CommonNames::Columns).get(); + if (decode_parms->contains(CommonNames::Colors)) + colors = decode_parms->get_value(CommonNames::Colors).get(); + if (decode_parms->contains(CommonNames::BitsPerComponent)) + bits_per_component = decode_parms->get_value(CommonNames::BitsPerComponent).get(); + } + if (encoding_type == CommonNames::ASCIIHexDecode) return decode_ascii_hex(bytes); if (encoding_type == CommonNames::ASCII85Decode) @@ -21,7 +37,7 @@ ErrorOr Filter::decode(ReadonlyBytes bytes, FlyString const& encodin if (encoding_type == CommonNames::LZWDecode) return decode_lzw(bytes); if (encoding_type == CommonNames::FlateDecode) - return decode_flate(bytes); + return decode_flate(bytes, predictor, columns, colors, bits_per_component); if (encoding_type == CommonNames::RunLengthDecode) return decode_run_length(bytes); if (encoding_type == CommonNames::CCITTFaxDecode) @@ -35,7 +51,7 @@ ErrorOr Filter::decode(ReadonlyBytes bytes, FlyString const& encodin if (encoding_type == CommonNames::Crypt) return decode_crypt(bytes); - return Error::from_string_literal("Unrecognized filter encoding"); + return AK::Error::from_string_literal("Unrecognized filter encoding"); } ErrorOr Filter::decode_ascii_hex(ReadonlyBytes bytes) @@ -50,11 +66,11 @@ ErrorOr Filter::decode_ascii_hex(ReadonlyBytes bytes) for (size_t i = 0; i < bytes.size() / 2; ++i) { auto const c1 = decode_hex_digit(static_cast(bytes[i * 2])); if (c1 >= 16) - return Error::from_string_literal("Hex string contains invalid digit"); + return AK::Error::from_string_literal("Hex string contains invalid digit"); auto const c2 = decode_hex_digit(static_cast(bytes[i * 2 + 1])); if (c2 >= 16) - return Error::from_string_literal("Hex string contains invalid digit"); + 
return AK::Error::from_string_literal("Hex string contains invalid digit");
 
         output[i] = (c1 << 4) + c2;
     }
@@ -120,20 +136,131 @@ ErrorOr Filter::decode_ascii85(ReadonlyBytes bytes)
     return ByteBuffer::copy(buff.span());
 };
 
+// Undoes the PNG row predictors (algorithm tags 0-4) on data that consists of
+// rows of bytes_per_row bytes, each beginning with a one-byte algorithm tag.
+// The rows are decoded in place and returned concatenated without their tags.
+// bytes_per_pixel is the distance in bytes to the matching byte of the pixel
+// to the left, as used by the Sub, Average and Paeth algorithms.
+ErrorOr Filter::decode_png_prediction(Bytes bytes, int bytes_per_row, int bytes_per_pixel)
+{
+    if (bytes_per_row < 1 || bytes_per_pixel < 1)
+        return AK::Error::from_string_literal("Invalid PNG predictor row layout");
+
+    int number_of_rows = bytes.size() / bytes_per_row;
+
+    ByteBuffer decoded;
+    decoded.ensure_capacity(bytes.size() - number_of_rows);
+
+    // The row above the first row is treated as all zeroes
+    auto empty_row = TRY(ByteBuffer::create_zeroed(bytes_per_row));
+    auto previous_row = empty_row.data();
+
+    for (int row_index = 0; row_index < number_of_rows; ++row_index) {
+        auto row = bytes.data() + row_index * bytes_per_row;
+
+        u8 algorithm_tag = row[0];
+        switch (algorithm_tag) {
+        case 0:
+            break;
+        case 1:
+            // Bytes of the first pixel have no left neighbor and stay unchanged
+            for (int i = 1 + bytes_per_pixel; i < bytes_per_row; ++i)
+                row[i] += row[i - bytes_per_pixel];
+            break;
+        case 2:
+            for (int i = 1; i < bytes_per_row; ++i)
+                row[i] += previous_row[i];
+            break;
+        case 3:
+            for (int i = 1; i < bytes_per_row; ++i) {
+                int left = 0;
+                if (i > bytes_per_pixel)
+                    left = row[i - bytes_per_pixel];
+                int above = previous_row[i];
+                row[i] += (left + above) / 2;
+            }
+            break;
+        case 4:
+            for (int i = 1; i < bytes_per_row; ++i) {
+                int left = 0;
+                int upper_left = 0;
+                if (i > bytes_per_pixel) {
+                    left = row[i - bytes_per_pixel];
+                    upper_left = previous_row[i - bytes_per_pixel];
+                }
+                int above = previous_row[i];
+
+                // The estimate must not wrap around, so it is computed as an int
+                int p = left + above - upper_left;
+                int left_distance = abs(p - left);
+                int above_distance = abs(p - above);
+                int upper_left_distance = abs(p - upper_left);
+
+                // Add the neighbor closest to the estimate, not the distance itself.
+                // Ties are resolved in the order left, above, upper left.
+                int paeth = upper_left;
+                if (left_distance <= above_distance && left_distance <= upper_left_distance)
+                    paeth = left;
+                else if (above_distance <= upper_left_distance)
+                    paeth = above;
+
+                row[i] += paeth;
+            }
+            break;
+        default:
+            return AK::Error::from_string_literal("Unknown PNG algorithm tag");
+        }
+
+        previous_row = row;
+        decoded.append(row + 1, bytes_per_row - 1);
+    }
+
+    return decoded;
+}
+
 ErrorOr Filter::decode_lzw(ReadonlyBytes)
 {
     dbgln("LZW decoding is not supported");
     VERIFY_NOT_REACHED();
 };
 
-ErrorOr Filter::decode_flate(ReadonlyBytes bytes)
+// Inflates bytes and, for predictor values 10-15, also undoes the PNG prediction
+// described by columns, colors and bits_per_component. A predictor of 1 returns
+// the inflated data unchanged.
+ErrorOr Filter::decode_flate(ReadonlyBytes bytes, int predictor, int columns, int colors, int bits_per_component)
 {
-    // FIXME: The spec says Flate decoding is "based on" zlib, does that mean they
-    // aren't exactly the same?
+    // The first two bytes are skipped before inflating (presumably the zlib header)
+    if (bytes.size() < 2)
+        return AK::Error::from_string_literal("Flate input data is too short");
+
+    auto maybe_buff = Compress::DeflateDecompressor::decompress_all(bytes.slice(2));
+    if (!maybe_buff.has_value())
+        return AK::Error::from_string_literal("Flate decompression failed");
+
+    auto buff = maybe_buff.release_value();
+    if (predictor == 1)
+        return buff;
 
-    auto buff = Compress::DeflateDecompressor::decompress_all(bytes.slice(2));
-    VERIFY(buff.has_value());
-    return buff.value();
+    // Check if we are dealing with a PNG prediction
+    if (predictor == 2)
+        return AK::Error::from_string_literal("The TIFF predictor is not supported");
+    if (predictor < 10 || predictor > 15)
+        return AK::Error::from_string_literal("Invalid predictor value");
+
+    // The parameters are read from the stream's DecodeParms dictionary, so reject
+    // values that would lead to an empty or negative row length
+    if (columns < 1 || colors < 1 || bits_per_component < 1)
+        return AK::Error::from_string_literal("Invalid predictor parameters");
+
+    // Rows are always a whole number of bytes long, starting with an algorithm tag
+    int bytes_per_row = AK::ceil_div(columns * colors * bits_per_component, 8) + 1;
+    if (buff.size() % bytes_per_row)
+        return AK::Error::from_string_literal("Flate input data is not divisible into columns");
+
+    // Sub, Average and Paeth refer to the pixel to the left, which is a whole
+    // number of bytes away (one byte when a pixel is smaller than that)
+    int bytes_per_pixel = AK::ceil_div(colors * bits_per_component, 8);
+    return decode_png_prediction(buff, bytes_per_row, bytes_per_pixel);
 };
 
 ErrorOr Filter::decode_run_length(ReadonlyBytes)
diff --git a/Userland/Libraries/LibPDF/Filter.h b/Userland/Libraries/LibPDF/Filter.h
index aaa389788f..dd68eb36e6 100644
--- a/Userland/Libraries/LibPDF/Filter.h
+++ b/Userland/Libraries/LibPDF/Filter.h
@@ -9,18 +9,20 @@
 #include
 #include
 #include
+#include
 
 namespace PDF {
 
 class Filter {
 public:
-    static ErrorOr decode(ReadonlyBytes bytes, FlyString const& encoding_type);
+    static ErrorOr decode(ReadonlyBytes bytes, FlyString const& encoding_type, RefPtr decode_parms);
 
 private:
     static ErrorOr decode_ascii_hex(ReadonlyBytes bytes);
     static ErrorOr decode_ascii85(ReadonlyBytes bytes);
+    static ErrorOr decode_png_prediction(Bytes bytes, int bytes_per_row, int bytes_per_pixel = 1);
     static ErrorOr decode_lzw(ReadonlyBytes bytes);
-    static ErrorOr decode_flate(ReadonlyBytes bytes);
+    static ErrorOr decode_flate(ReadonlyBytes bytes, int predictor, int columns, int colors, int bits_per_component);
     static ErrorOr decode_run_length(ReadonlyBytes bytes);
     static ErrorOr
decode_ccitt(ReadonlyBytes bytes); static ErrorOr decode_jbig2(ReadonlyBytes bytes); diff --git a/Userland/Libraries/LibPDF/Parser.cpp b/Userland/Libraries/LibPDF/Parser.cpp index 5c3095efc6..e888cbbf38 100644 --- a/Userland/Libraries/LibPDF/Parser.cpp +++ b/Userland/Libraries/LibPDF/Parser.cpp @@ -488,8 +488,32 @@ PDFErrorOr> Parser::parse_stream(NonnullRefPtrcast()->name()); } - for (auto const& filter_type : filters) - stream_object->buffer() = TRY(Filter::decode(stream_object->bytes(), filter_type)); + // Every filter may get its own parameter dictionary + Vector> decode_parms_vector; + RefPtr decode_parms_object; + if (dict->contains(CommonNames::DecodeParms)) { + decode_parms_object = TRY(dict->get_object(m_document, CommonNames::DecodeParms)); + if (decode_parms_object->is()) { + auto decode_parms_array = decode_parms_object->cast(); + for (size_t i = 0; i < decode_parms_array->size(); ++i) { + // FIXME: This entry may be the null object instead + RefPtr decode_parms = decode_parms_array->at(i).get>()->cast(); + decode_parms_vector.append(decode_parms); + } + } else { + decode_parms_vector.append(decode_parms_object->cast()); + } + } + + VERIFY(decode_parms_vector.is_empty() || decode_parms_vector.size() == filters.size()); + + for (size_t i = 0; i < filters.size(); ++i) { + RefPtr decode_parms; + if (!decode_parms_vector.is_empty()) + decode_parms = decode_parms_vector.at(i); + + stream_object->buffer() = TRY(Filter::decode(stream_object->bytes(), filters.at(i), decode_parms)); + } } return stream_object;