LibPDF: Parse linearized PDF files

This is a big step, as most PDFs which are downloaded online will be linearized. Pretty much the only difference is that the xref structure is slightly different.
2025-07-27 01:17:35 +00:00 · 2021-05-26 22:52:05 -07:00 · 2021-05-26 22:52:05 -07:00 · e23bfd7252
commit e23bfd7252
parent be1be47613
8 changed files with 270 additions and 45 deletions
--- a/Userland/Libraries/LibPDF/XRefTable.h
+++ b/Userland/Libraries/LibPDF/XRefTable.h
@ -10,8 +10,10 @@

 namespace PDF {

+constexpr long invalid_byte_offset = NumericLimits<long>::max();
+
 struct XRefEntry {
-    long byte_offset { -1L };
+    long byte_offset { invalid_byte_offset };
    u16 generation_number { 0 };
    bool in_use { false };
 };
@ -22,8 +24,34 @@ struct XRefSection {
    Vector<XRefEntry> entries;
 };

-class XRefTable {
+class XRefTable final : public RefCounted<XRefTable> {
 public:
+    bool merge(XRefTable&& other)
+    {
+        auto this_size = m_entries.size();
+        auto other_size = other.m_entries.size();
+        m_entries.ensure_capacity(other_size);
+
+        for (size_t i = 0; i < other_size; i++) {
+            auto other_entry = other.m_entries[i];
+            if (i >= this_size) {
+                m_entries.unchecked_append(other_entry);
+                continue;
+            }
+
+            auto this_entry = m_entries[i];
+
+            if (this_entry.byte_offset == invalid_byte_offset) {
+                m_entries[i] = other_entry;
+            } else if (other_entry.byte_offset != invalid_byte_offset) {
+                // Both xref tables have an entry for the same object index
+                return false;
+            }
+        }
+
+        return true;
+    }
+
    void add_section(const XRefSection& section)
    {
        m_entries.ensure_capacity(section.starting_index + section.count);