From c2ad29c85f3b0f6c7e9e6f2aeb62e4807042c8ea Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Julian=20Offenh=C3=A4user?= <offenhaeuser@protonmail.com>
Date: Thu, 10 Nov 2022 23:32:41 +0100
Subject: [PATCH] LibPDF: Implement png predictor decoding for flate filter

For Flate and LZW filters, the data can be transformed by a predictor
function before compression to make it compress better. When decoding,
we have to undo this step in order to get the right result.

Although this feature is meant for images, I found at least a few
documents that use it all over the place, making this step very
important.
---
 Userland/Libraries/LibPDF/CommonNames.h |   5 ++
 Userland/Libraries/LibPDF/Filter.cpp    | 113 +++++++++++++++++++++---
 Userland/Libraries/LibPDF/Filter.h      |   6 +-
 Userland/Libraries/LibPDF/Parser.cpp    |  28 +++++-
 4 files changed, 137 insertions(+), 15 deletions(-)
diff --git a/Userland/Libraries/LibPDF/CommonNames.h b/Userland/Libraries/LibPDF/CommonNames.h
index 6f9b613935..9ef7481faf 100644
--- a/Userland/Libraries/LibPDF/CommonNames.h
+++ b/Userland/Libraries/LibPDF/CommonNames.h
@@ -18,6 +18,7 @@
     A(BM)                         \
     A(BaseEncoding)               \
     A(BaseFont)                   \
+    A(BitsPerComponent)           \
     A(BlackPoint)                 \
     A(C)                          \
     A(CA)                         \
@@ -25,7 +26,9 @@
     A(CalRGB)                     \
     A(CIDSystemInfo)              \
     A(CIDToGIDMap)                \
+    A(Colors)                     \
     A(ColorSpace)                 \
+    A(Columns)                    \
     A(Contents)                   \
     A(Count)                      \
     A(CropBox)                    \
@@ -33,6 +36,7 @@
     A(D)                          \
     A(DW)                         \
     A(DCTDecode)                  \
+    A(DecodeParms)                \
     A(DescendantFonts)            \
     A(Dest)                       \
     A(Dests)                      \
@@ -101,6 +105,7 @@
     A(Pages)                      \
     A(Parent)                     \
     A(Pattern)                    \
+    A(Predictor)                  \
     A(Prev)                       \
     A(R)                          \
     A(RI)                         \
diff --git a/Userland/Libraries/LibPDF/Filter.cpp b/Userland/Libraries/LibPDF/Filter.cpp
index c184952be0..021fe65006 100644
--- a/Userland/Libraries/LibPDF/Filter.cpp
+++ b/Userland/Libraries/LibPDF/Filter.cpp
@@ -12,8 +12,24 @@
 
 namespace PDF {
 
-ErrorOr<ByteBuffer> Filter::decode(ReadonlyBytes bytes, FlyString const& encoding_type)
+ErrorOr<ByteBuffer> Filter::decode(ReadonlyBytes bytes, FlyString const& encoding_type, RefPtr<DictObject> decode_parms)
 {
+    int predictor = 1;
+    int columns = 1;
+    int colors = 1;
+    int bits_per_component = 8;
+
+    if (decode_parms) {
+        if (decode_parms->contains(CommonNames::Predictor))
+            predictor = decode_parms->get_value(CommonNames::Predictor).get<int>();
+        if (decode_parms->contains(CommonNames::Columns))
+            columns = decode_parms->get_value(CommonNames::Columns).get<int>();
+        if (decode_parms->contains(CommonNames::Colors))
+            colors = decode_parms->get_value(CommonNames::Colors).get<int>();
+        if (decode_parms->contains(CommonNames::BitsPerComponent))
+            bits_per_component = decode_parms->get_value(CommonNames::BitsPerComponent).get<int>();
+    }
+
     if (encoding_type == CommonNames::ASCIIHexDecode)
         return decode_ascii_hex(bytes);
     if (encoding_type == CommonNames::ASCII85Decode)
@@ -21,7 +37,7 @@ ErrorOr<ByteBuffer> Filter::decode(ReadonlyBytes bytes, FlyString const& encodin
     if (encoding_type == CommonNames::LZWDecode)
         return decode_lzw(bytes);
     if (encoding_type == CommonNames::FlateDecode)
-        return decode_flate(bytes);
+        return decode_flate(bytes, predictor, columns, colors, bits_per_component);
     if (encoding_type == CommonNames::RunLengthDecode)
         return decode_run_length(bytes);
     if (encoding_type == CommonNames::CCITTFaxDecode)
@@ -35,7 +51,7 @@ ErrorOr<ByteBuffer> Filter::decode(ReadonlyBytes bytes, FlyString const& encodin
     if (encoding_type == CommonNames::Crypt)
         return decode_crypt(bytes);
 
-    return Error::from_string_literal("Unrecognized filter encoding");
+    return AK::Error::from_string_literal("Unrecognized filter encoding");
 }
 
 ErrorOr<ByteBuffer> Filter::decode_ascii_hex(ReadonlyBytes bytes)
@@ -50,11 +66,11 @@ ErrorOr<ByteBuffer> Filter::decode_ascii_hex(ReadonlyBytes bytes)
     for (size_t i = 0; i < bytes.size() / 2; ++i) {
         auto const c1 = decode_hex_digit(static_cast<char>(bytes[i * 2]));
         if (c1 >= 16)
-            return Error::from_string_literal("Hex string contains invalid digit");
+            return AK::Error::from_string_literal("Hex string contains invalid digit");
 
         auto const c2 = decode_hex_digit(static_cast<char>(bytes[i * 2 + 1]));
         if (c2 >= 16)
-            return Error::from_string_literal("Hex string contains invalid digit");
+            return AK::Error::from_string_literal("Hex string contains invalid digit");
 
         output[i] = (c1 << 4) + c2;
     }
@@ -120,20 +136,142 @@ ErrorOr<ByteBuffer> Filter::decode_ascii85(ReadonlyBytes bytes)
     return ByteBuffer::copy(buff.span());
 };
 
+// Reverses PNG row prediction on `bytes`, modifying the rows in place.
+//
+// `bytes` is treated as consecutive rows of `bytes_per_row` bytes, each one
+// starting with a single tag byte that selects the algorithm for that row
+// (0 = none, 1 = sub, 2 = up, 3 = average, 4 = Paeth). Returns the
+// reconstructed rows with the tag bytes stripped, or an error if the row
+// size is unusable or a row carries an unknown tag.
+//
+// FIXME: "Left" and "upper left" are taken from the directly preceding byte,
+//        which is only right when one pixel occupies one byte. The PNG
+//        specification uses the corresponding byte of the previous pixel.
+ErrorOr<ByteBuffer> Filter::decode_png_prediction(Bytes bytes, int bytes_per_row)
+{
+    // Every row needs at least its tag byte; this also rules out a division by zero.
+    if (bytes_per_row < 1)
+        return AK::Error::from_string_literal("Invalid PNG predictor row size");
+
+    int number_of_rows = bytes.size() / bytes_per_row;
+
+    ByteBuffer decoded;
+    decoded.ensure_capacity(bytes.size() - number_of_rows);
+
+    // The row above the first row is defined to be all zeroes.
+    auto empty_row = TRY(ByteBuffer::create_zeroed(bytes_per_row));
+    auto previous_row = empty_row.data();
+
+    for (int row_index = 0; row_index < number_of_rows; ++row_index) {
+        auto row = bytes.data() + row_index * bytes_per_row;
+
+        u8 algorithm_tag = row[0];
+        switch (algorithm_tag) {
+        case 0:
+            break;
+        case 1:
+            // Sub: add the byte to the left. The first data byte has no left neighbour.
+            for (int i = 2; i < bytes_per_row; ++i)
+                row[i] += row[i - 1];
+            break;
+        case 2:
+            // Up: add the byte directly above.
+            for (int i = 1; i < bytes_per_row; ++i)
+                row[i] += previous_row[i];
+            break;
+        case 3:
+            // Average: add the rounded-down mean of left and above.
+            for (int i = 1; i < bytes_per_row; ++i) {
+                u8 left = 0;
+                if (i > 1)
+                    left = row[i - 1];
+                u8 above = previous_row[i];
+                row[i] += (left + above) / 2;
+            }
+            break;
+        case 4:
+            // Paeth: add whichever neighbour is closest to left + above - upper_left.
+            for (int i = 1; i < bytes_per_row; ++i) {
+                u8 left = 0;
+                u8 upper_left = 0;
+                if (i > 1) {
+                    left = row[i - 1];
+                    upper_left = previous_row[i - 1];
+                }
+                u8 above = previous_row[i];
+                // Keep the estimate in an int, it can leave the range of a u8.
+                int p = left + above - upper_left;
+
+                int left_distance = abs(p - left);
+                int above_distance = abs(p - above);
+                int upper_left_distance = abs(p - upper_left);
+
+                // Ties are broken in the order left, above, upper left.
+                u8 paeth = upper_left;
+                if (left_distance <= above_distance && left_distance <= upper_left_distance)
+                    paeth = left;
+                else if (above_distance <= upper_left_distance)
+                    paeth = above;
+
+                row[i] += paeth;
+            }
+            break;
+        default:
+            return AK::Error::from_string_literal("Unknown PNG algorithm tag");
+        }
+
+        previous_row = row;
+        decoded.append(row + 1, bytes_per_row - 1);
+    }
+
+    return decoded;
+}
+
 ErrorOr<ByteBuffer> Filter::decode_lzw(ReadonlyBytes)
 {
     dbgln("LZW decoding is not supported");
     VERIFY_NOT_REACHED();
 };
 
-ErrorOr<ByteBuffer> Filter::decode_flate(ReadonlyBytes bytes)
+// Inflates `bytes` and, if `predictor` asks for it, reverses PNG prediction.
+//
+// A `predictor` of 1 returns the inflated data as is; 10 to 15 select PNG
+// prediction, with `columns`, `colors` and `bits_per_component` giving the
+// size of one row. The TIFF predictor (2) is not supported and any other
+// value is rejected. Also returns an error if the input is too short, does
+// not inflate, or the row parameters are unusable.
+ErrorOr<ByteBuffer> Filter::decode_flate(ReadonlyBytes bytes, int predictor, int columns, int colors, int bits_per_component)
 {
-    // FIXME: The spec says Flate decoding is "based on" zlib, does that mean they
-    // aren't exactly the same?
+    // The first two bytes are skipped before inflating (presumably the zlib header).
+    if (bytes.size() < 2)
+        return AK::Error::from_string_literal("Flate input data is too short");
+
+    // Report corrupt streams to the caller instead of asserting on them.
+    auto maybe_buff = Compress::DeflateDecompressor::decompress_all(bytes.slice(2));
+    if (!maybe_buff.has_value())
+        return AK::Error::from_string_literal("Failed to decompress Flate data");
+    auto buff = maybe_buff.release_value();
+
+    if (predictor == 1)
+        return buff;
 
-    auto buff = Compress::DeflateDecompressor::decompress_all(bytes.slice(2));
-    VERIFY(buff.has_value());
-    return buff.value();
+    // Check if we are dealing with a PNG prediction
+    if (predictor == 2)
+        return AK::Error::from_string_literal("The TIFF predictor is not supported");
+    if (predictor < 10 || predictor > 15)
+        return AK::Error::from_string_literal("Invalid predictor value");
+
+    // These values come from the document. Non-positive ones could give a
+    // row size that is not positive below.
+    if (columns < 1 || colors < 1 || bits_per_component < 1)
+        return AK::Error::from_string_literal("Invalid predictor parameters");
+
+    // Rows are always a whole number of bytes long, starting with an algorithm tag
+    int bytes_per_row = AK::ceil_div(columns * colors * bits_per_component, 8) + 1;
+    if (buff.size() % bytes_per_row)
+        return AK::Error::from_string_literal("Flate input data is not divisible into columns");
+
+    return decode_png_prediction(buff, bytes_per_row);
 };
 
 ErrorOr<ByteBuffer> Filter::decode_run_length(ReadonlyBytes)
diff --git a/Userland/Libraries/LibPDF/Filter.h b/Userland/Libraries/LibPDF/Filter.h
index aaa389788f..dd68eb36e6 100644
--- a/Userland/Libraries/LibPDF/Filter.h
+++ b/Userland/Libraries/LibPDF/Filter.h
@@ -9,18 +9,20 @@
 #include <AK/ByteBuffer.h>
 #include <AK/Error.h>
 #include <AK/FlyString.h>
+#include <LibPDF/ObjectDerivatives.h>
 
 namespace PDF {
 
 class Filter {
 public:
-    static ErrorOr<ByteBuffer> decode(ReadonlyBytes bytes, FlyString const& encoding_type);
+    static ErrorOr<ByteBuffer> decode(ReadonlyBytes bytes, FlyString const& encoding_type, RefPtr<DictObject> decode_parms);
 
 private:
     static ErrorOr<ByteBuffer> decode_ascii_hex(ReadonlyBytes bytes);
     static ErrorOr<ByteBuffer> decode_ascii85(ReadonlyBytes bytes);
+    static ErrorOr<ByteBuffer> decode_png_prediction(Bytes bytes, int bytes_per_row);
     static ErrorOr<ByteBuffer> decode_lzw(ReadonlyBytes bytes);
-    static ErrorOr<ByteBuffer> decode_flate(ReadonlyBytes bytes);
+    static ErrorOr<ByteBuffer> decode_flate(ReadonlyBytes bytes, int predictor, int columns, int colors, int bits_per_component);
     static ErrorOr<ByteBuffer> decode_run_length(ReadonlyBytes bytes);
     static ErrorOr<ByteBuffer> decode_ccitt(ReadonlyBytes bytes);
     static ErrorOr<ByteBuffer> decode_jbig2(ReadonlyBytes bytes);
diff --git a/Userland/Libraries/LibPDF/Parser.cpp b/Userland/Libraries/LibPDF/Parser.cpp
index 5c3095efc6..e888cbbf38 100644
--- a/Userland/Libraries/LibPDF/Parser.cpp
+++ b/Userland/Libraries/LibPDF/Parser.cpp
@@ -488,8 +488,32 @@ PDFErrorOr<NonnullRefPtr<StreamObject>> Parser::parse_stream(NonnullRefPtr<DictO
             filters.append(filter_object->cast<NameObject>()->name());
         }
 
-        for (auto const& filter_type : filters)
-            stream_object->buffer() = TRY(Filter::decode(stream_object->bytes(), filter_type));
+        // Every filter may get its own parameter dictionary
+        Vector<RefPtr<DictObject>> decode_parms_vector;
+        RefPtr<Object> decode_parms_object;
+        if (dict->contains(CommonNames::DecodeParms)) {
+            decode_parms_object = TRY(dict->get_object(m_document, CommonNames::DecodeParms));
+            if (decode_parms_object->is<ArrayObject>()) {
+                auto decode_parms_array = decode_parms_object->cast<ArrayObject>();
+                for (size_t i = 0; i < decode_parms_array->size(); ++i) {
+                    // FIXME: This entry may be the null object instead
+                    RefPtr<DictObject> decode_parms = decode_parms_array->at(i).get<NonnullRefPtr<Object>>()->cast<DictObject>();
+                    decode_parms_vector.append(decode_parms);
+                }
+            } else {
+                decode_parms_vector.append(decode_parms_object->cast<DictObject>());
+            }
+        }
+
+        VERIFY(decode_parms_vector.is_empty() || decode_parms_vector.size() == filters.size());
+
+        for (size_t i = 0; i < filters.size(); ++i) {
+            RefPtr<DictObject> decode_parms;
+            if (!decode_parms_vector.is_empty())
+                decode_parms = decode_parms_vector.at(i);
+
+            stream_object->buffer() = TRY(Filter::decode(stream_object->bytes(), filters.at(i), decode_parms));
+        }
     }
 
     return stream_object;