From 93f54202822ba28fb5d2b73b883724af1fba5200 Mon Sep 17 00:00:00 2001
From: Nico Weber <thakis@chromium.org>
Date: Mon, 15 Jan 2024 19:28:13 -0500
Subject: [PATCH] LibPDF: Start implementing the TIFF predictor

This codepath is separate from the predictor in the TIFF decoder.
The TIFF decoder currently does bits->Color conversion before
processing the predictor. That doesn't fit the PDF model where
filters are processed before converting streams into bitmaps.

If this code here ever grows to handle all cases, maybe we can move
it over to the TIFF decoder and then make it do predictions before
decoding to colors, to share this code.

(TIFF prediction is pretty messy since it's bits-per-pixel-dependent.
PNG prediction is always byte-based, which makes things easier.)
---
 Userland/Libraries/LibPDF/Filter.cpp | 70 +++++++++++++++++++++++++---
 Userland/Libraries/LibPDF/Filter.h   |  1 +
 2 files changed, 64 insertions(+), 7 deletions(-)
diff --git a/Userland/Libraries/LibPDF/Filter.cpp b/Userland/Libraries/LibPDF/Filter.cpp
index 2fd170391f..2569c67035 100644
--- a/Userland/Libraries/LibPDF/Filter.cpp
+++ b/Userland/Libraries/LibPDF/Filter.cpp
@@ -175,6 +175,59 @@ PDFErrorOr<ByteBuffer> Filter::decode_png_prediction(Bytes bytes, size_t bytes_p
     return decoded;
 }
 
+PDFErrorOr<ByteBuffer> Filter::decode_tiff_prediction(Bytes bytes, int columns, int colors, int bits_per_component)
+{
+    // TIFF 6 spec, Section 14: Differencing Predictor.
+    // "Assuming 8-bit grayscale pixels for the moment, a basic C implementation might look something like this:
+    //     char image[][]; int row, col;
+    //     /* take horizontal differences: */
+    //     for (row = 0; row < nrows; row++)
+    //         for (col = ncols - 1; col >= 1; col--)
+    //             image[row][col] -= image[row][col-1];
+    //  If we don’t have 8-bit components, we need to work a little harder [...]
+    //  Suppose we have 4-bit components packed two per byte in the normal TIFF uncompressed [...]
+    //  first expand each 4-bit component into an 8-bit byte, so that we have one component per byte, low-order justified.
+    //  We then perform the horizontal differencing [...] we then repack the 4-bit differences two to a byte [...]
+    //  If the components are greater than 8 bits deep, expanding the components into 16-bit words instead of 8-bit bytes
+    //  seems like the best way to perform the subtraction on most computers. [...]
+    //  An additional consideration arises in the case of color images. [...]
+    //  we will do our horizontal differences with an offset of SamplesPerPixel [...
+    //  In other words, we will subtract red from red, green from green, and blue from blue"
+    //
+    // The spec text describes the forward transform; this implements the inverse.
+    //
+    // Section 8: Baseline Field Reference Guide, entry for Compression:
+    // "Each row is padded to the next BYTE/SHORT/LONG boundary, consistent with
+    //  the preceding BitsPerSample rule."
+    //
+    // That's consistent with PDF. PDF allows bits_per_component to be 1, 2, 4, 8, or 16.
+
+    // FIXME: Support bits_per_component != 8.
+    if (bits_per_component != 8)
+        return AK::Error::from_string_literal("PDF: TIFF Predictor with bits per component != 8 not yet implemented");
+
+    // Translate from PDF spec language to TIFF 6 spec language:
+    int samples_per_pixel = colors;
+
+    ByteBuffer decoded;
+    decoded.ensure_capacity(bytes.size());
+
+    while (!bytes.is_empty()) {
+        auto row = bytes.slice(0, columns * samples_per_pixel);
+        for (int column = 1; column < columns; ++column) {
+            for (int sample = 0; sample < samples_per_pixel; ++sample) {
+                int index = column * samples_per_pixel + sample;
+                row[index] += row[index - samples_per_pixel];
+            }
+        }
+        decoded.append(row);
+
+        bytes = bytes.slice(row.size());
+    }
+
+    return decoded;
+}
+
 PDFErrorOr<ByteBuffer> Filter::handle_lzw_and_flate_parameters(ByteBuffer buffer, int predictor, int columns, int colors, int bits_per_component)
 {
     // Table 3.7 Optional parameters for LZWDecode and FlateDecode filters
@@ -182,16 +235,19 @@ PDFErrorOr<ByteBuffer> Filter::handle_lzw_and_flate_parameters(ByteBuffer buffer
     if (predictor == 1)
         return buffer;
 
-    // Check if we are dealing with a PNG prediction
-    if (predictor == 2)
-        return AK::Error::from_string_literal("The TIFF predictor is not supported");
-    if (predictor < 10 || predictor > 15)
+    // Check if we are dealing with a TIFF or PNG prediction.
+    if (predictor != 2 && (predictor < 10 || predictor > 15))
         return AK::Error::from_string_literal("Invalid predictor value");
 
-    // Rows are always a whole number of bytes long, starting with an algorithm tag
-    size_t const bytes_per_row = ceil_div(columns * colors * bits_per_component, 8) + 1;
+    // Rows are always a whole number of bytes long, for PNG starting with an algorithm tag.
+    size_t bytes_per_row = ceil_div(columns * colors * bits_per_component, 8);
+    if (predictor != 2)
+        bytes_per_row++;
     if (buffer.size() % bytes_per_row)
-        return AK::Error::from_string_literal("Flate input data is not divisible into columns");
+        return AK::Error::from_string_literal("Predictor input data is not divisible into columns");
+
+    if (predictor == 2)
+        return decode_tiff_prediction(buffer, columns, colors, bits_per_component);
 
     size_t bytes_per_pixel = ceil_div(colors * bits_per_component, 8);
     return decode_png_prediction(buffer, bytes_per_row, bytes_per_pixel);
diff --git a/Userland/Libraries/LibPDF/Filter.h b/Userland/Libraries/LibPDF/Filter.h
index ee897b1ad9..16f51ce3bd 100644
--- a/Userland/Libraries/LibPDF/Filter.h
+++ b/Userland/Libraries/LibPDF/Filter.h
@@ -21,6 +21,7 @@ private:
     static PDFErrorOr<ByteBuffer> decode_ascii_hex(ReadonlyBytes bytes);
     static PDFErrorOr<ByteBuffer> decode_ascii85(ReadonlyBytes bytes);
     static PDFErrorOr<ByteBuffer> decode_png_prediction(Bytes bytes, size_t bytes_per_row, size_t bytes_per_pixel);
+    static PDFErrorOr<ByteBuffer> decode_tiff_prediction(Bytes bytes, int columns, int colors, int bits_per_component);
     static PDFErrorOr<ByteBuffer> decode_lzw(ReadonlyBytes bytes, int predictor, int columns, int colors, int bits_per_component, int early_change);
     static PDFErrorOr<ByteBuffer> decode_flate(ReadonlyBytes bytes, int predictor, int columns, int colors, int bits_per_component);
     static PDFErrorOr<ByteBuffer> decode_run_length(ReadonlyBytes bytes);