LibPDF: Parse nested Page Tree structures

We now follow nested page tree nodes to find all of the actual page dicts, whereas previously we simply assumed that the root-level page tree node contained all of the page children directly.
2025-09-18 23:06:17 +00:00 · 2021-05-02 18:53:07 -07:00 · 2021-05-02 18:53:07 -07:00 · 3aeaceb727
commit 3aeaceb727
parent 8c745ad0d9
4 changed files with 77 additions and 5 deletions
--- a/Userland/Libraries/LibPDF/Parser.cpp
+++ b/Userland/Libraries/LibPDF/Parser.cpp
@ -363,14 +363,14 @@ Value Parser::parse_number()
        }
    }

+    consume_whitespace();
+
    auto string = String(m_reader.bytes().slice(start_offset, m_reader.offset() - start_offset));
    float f = strtof(string.characters(), nullptr);
    if (is_float)
        return Value(f);

    VERIFY(floorf(f) == f);
-    consume_whitespace();
-
    return Value(static_cast<int>(f));
 }

@ -567,6 +567,50 @@ NonnullRefPtr<DictObject> Parser::parse_dict()
    return make_object<DictObject>(map);
 }

+// Parses the dictionary of the object at the given offset, but only if it is a page tree node.
+// An empty RefPtr is returned as soon as a key or the Type value shows that it is not one.
+RefPtr<DictObject> Parser::conditionally_parse_page_tree_node_at_offset(size_t offset)
+{
+    // Jump to the object and skip the two numbers and the "obj" keyword before its dictionary.
+    m_reader.move_to(offset);
+    parse_number();
+    parse_number();
+    VERIFY(m_reader.matches("obj"));
+    m_reader.move_by(3);
+    consume_whitespace();
+
+    consume('<');
+    consume('<');
+    consume_whitespace();
+
+    HashMap<FlyString, Value> entries;
+    while (!m_reader.matches(">>")) {
+        auto key = parse_name();
+        auto key_string = key->name();
+        // Any key other than these four means this is a page, not a page tree node.
+        bool is_node_key = key_string.is_one_of("Type", "Parent", "Kids", "Count");
+        if (!is_node_key)
+            return {};
+
+        auto entry = parse_value();
+        if (key_string == "Type") {
+            // The Type entry has to be the name "Pages"; anything else is rejected.
+            if (!entry.is_object())
+                return {};
+            auto type_object = entry.as_object();
+            if (!type_object->is_name() || object_cast<NameObject>(type_object)->name() != "Pages")
+                return {};
+        }
+        entries.set(key->name(), entry);
+    }
+
+    consume('>');
+    consume('>');
+    consume_whitespace();
+
+    return make_object<DictObject>(entries);
+}
+
 NonnullRefPtr<StreamObject> Parser::parse_stream(NonnullRefPtr<DictObject> dict)
 {
    VERIFY(m_reader.matches("stream"));