From f562c470e21a2e81df8bd6d630a6b661af155324 Mon Sep 17 00:00:00 2001
From: Nico Weber <thakis@chromium.org>
Date: Fri, 2 Feb 2024 12:05:12 -0500
Subject: [PATCH] LibGfx+LibPDF: Simpler and faster N-D linear sampling

Previously, if we wanted to to e.g. do linear interpolation in 2-D,
we'd get a sample point like (1.3, 4.4), then get 4 samples around
it at (1, 4), (2, 4), (1, 5), (2, 5), then reduce the 4 samples
to 2 samples by computing the combined samples
`0.3 * f(1, 4) + 0.7 * f(2, 4)` and `0.3 * f(1, 5) + 0.8 * f(2, 5)`,
and then 1-D linearly blending between these two samples with the
factor 0.4. In the end we'd multiply the first value by 0.3 * 0.4,
the second by 0.7 * 0.4, the third by 0.3 * 0.6, and the third by
0.7 * 0.6, and then sum them all up.

This requires computing and storing 2**N samples, followed by
another 2**N iterations to combine the 2**N sampls to a single value.
(N is in practice either 4 or 3, so 2**N isn't super huge.)

Instead, for every sample we can directly compute the product of
weights and sum them up directly. This lets us omit the second loop
and storing 2**N values, in exchange for doing an additional O(n)
work to compute the product.

Takes

    Build/lagom/bin/image --no-output --invert-cmyk \
        --assign-color-profile \
            Build/lagom/Root/res/icc/Adobe/CMYK/USWebCoatedSWOP.icc \
        --convert-to-color-profile serenity-sRGB.icc \
        cmyk.jpg

form 3.42s to 3.08s on my machine, almost 10% faster (and less code).

Here cmyk.jpg is a 2253x3080 cmyk jpeg, and USWebCoatedSWOP.icc is an
mft2 profile with input tables with 256 samples and a 9x9x9x9 CLUT.

The LibPDF change is covered by TEST_CASE(sampled) in LibPDF.cpp,
and the LibGfx change is basically the same change as the one in
LibPDF (where the test results don't change) and the output
subjectively looks identical. So hopefully this causes indeed no
behavior change :^)
---
 Userland/Libraries/LibGfx/ICC/TagTypes.h | 17 +++++++----------
 Userland/Libraries/LibPDF/Function.cpp   | 19 +++++++++----------
 2 files changed, 16 insertions(+), 20 deletions(-)
diff --git a/Userland/Libraries/LibGfx/ICC/TagTypes.h b/Userland/Libraries/LibGfx/ICC/TagTypes.h
index eaaaf4579c..7fa00d4b2c 100644
--- a/Userland/Libraries/LibGfx/ICC/TagTypes.h
+++ b/Userland/Libraries/LibGfx/ICC/TagTypes.h
@@ -46,23 +46,20 @@ inline FloatVector3 lerp_nd(Function<unsigned(size_t)> size, Function<FloatVecto
         factor.append(ec - left_index[i]);
     }
 
-    Vector<FloatVector3> samples;
-    samples.resize(1u << x.size());
+    FloatVector3 sample_output {};
     // The i'th bit of mask indicates if the i'th coordinate is rounded up or down.
     Vector<unsigned> coordinates;
     coordinates.resize(x.size());
     for (size_t mask = 0; mask < (1u << x.size()); ++mask) {
-        for (size_t i = 0; i < x.size(); ++i)
+        float sample_weight = 1.0f;
+        for (size_t i = 0; i < x.size(); ++i) {
             coordinates[i] = left_index[i] + ((mask >> i) & 1u);
-        samples[mask] = sample(coordinates);
+            sample_weight *= ((mask >> i) & 1u) ? factor[i] : 1.0f - factor[i];
+        }
+        sample_output += sample(coordinates) * sample_weight;
     }
 
-    for (int i = static_cast<int>(x.size() - 1); i >= 0; --i) {
-        for (size_t mask = 0; mask < (1u << i); ++mask)
-            samples[mask] = mix(samples[mask], samples[mask | (1u << i)], factor[i]);
-    }
-
-    return samples[0];
+    return sample_output;
 }
 
 using S15Fixed16 = FixedPoint<16, i32>;
diff --git a/Userland/Libraries/LibPDF/Function.cpp b/Userland/Libraries/LibPDF/Function.cpp
index fd9c05b350..36834499d4 100644
--- a/Userland/Libraries/LibPDF/Function.cpp
+++ b/Userland/Libraries/LibPDF/Function.cpp
@@ -206,23 +206,22 @@ PDFErrorOr<ReadonlySpan<float>> SampledFunction::evaluate(ReadonlySpan<float> xs
         // then 2 by interpolating along y, then 1 by interpolating along x.
         // So for the general case, we create 2**N samples, and then for each coordinate, we cut the number of samples in half
         // by interpolating along that coordinate.
-        Vector<float> samples;
-        samples.resize(1 << m_domain.size());
+        // Instead of storing all the 2**N samples, we can calculate the product of weights for each corner,
+        // and sum up the weighted samples.
+        float sample_output = 0;
         // The i'th bit of mask indicates if the i'th coordinate is rounded up or down.
         Vector<int> coordinates;
         coordinates.resize(m_domain.size());
         for (size_t mask = 0; mask < (1u << m_domain.size()); ++mask) {
-            for (size_t i = 0; i < m_domain.size(); ++i)
+            float sample_weight = 1.0f;
+            for (size_t i = 0; i < m_domain.size(); ++i) {
                 coordinates[i] = m_left_index[i] + ((mask >> i) & 1u);
-            samples[mask] = sample(coordinates, r);
+                sample_weight *= ((mask >> i) & 1u) ? m_inputs[i] : (1.0f - m_inputs[i]);
+            }
+            sample_output += sample(coordinates, r) * sample_weight;
         }
 
-        for (int i = static_cast<int>(m_domain.size() - 1); i >= 0; --i) {
-            for (size_t mask = 0; mask < (1u << i); ++mask)
-                samples[mask] = mix(samples[mask], samples[mask | (1 << i)], m_inputs[i]);
-        }
-
-        float result = interpolate(samples[0], 0.0f, 255.0f, m_decode[r].lower, m_decode[r].upper);
+        float result = interpolate(sample_output, 0.0f, 255.0f, m_decode[r].lower, m_decode[r].upper);
         m_outputs[r] = clamp(result, m_range[r].lower, m_range[r].upper);
     }