From 3aeaceb72729097c8ba8c59f8beab2cff87e02b1 Mon Sep 17 00:00:00 2001 From: Matthew Olsson Date: Sun, 2 May 2021 18:53:07 -0700 Subject: [PATCH] LibPDF: Parse nested Page Tree structures We now follow nested page tree nodes to find all of the actual page dicts, whereas previously we just assumed the root level page tree node contained all of the page children directly. --- Userland/Libraries/LibPDF/Document.cpp | 25 ++++++++++++-- Userland/Libraries/LibPDF/Document.h | 7 ++++ Userland/Libraries/LibPDF/Parser.cpp | 48 ++++++++++++++++++++++++-- Userland/Libraries/LibPDF/Parser.h | 2 ++ 4 files changed, 77 insertions(+), 5 deletions(-) diff --git a/Userland/Libraries/LibPDF/Document.cpp b/Userland/Libraries/LibPDF/Document.cpp index 38dab2c4d3..a4a7abc945 100644 --- a/Userland/Libraries/LibPDF/Document.cpp +++ b/Userland/Libraries/LibPDF/Document.cpp @@ -117,14 +117,33 @@ UnwrappedValueType Document::resolve_to(const Value& value) void Document::build_page_tree() { auto page_tree = m_catalog->get_dict(this, "Pages"); - auto kids_array = page_tree->get_array(this, "Kids"); + add_page_tree_node_to_page_tree(page_tree); +} +void Document::add_page_tree_node_to_page_tree(NonnullRefPtr page_tree) +{ + auto kids_array = page_tree->get_array(this, "Kids"); auto page_count = page_tree->get("Count").value().as_int(); + if (static_cast(page_count) != kids_array->elements().size()) { - // FIXME: Support recursive PDF page tree structures - VERIFY_NOT_REACHED(); + // This page tree contains child page trees, so we recursively add + // these pages to the overall page tree + + for (auto& value : *kids_array) { + auto reference = resolve_to(value); + auto byte_offset = m_xref_table.byte_offset_for_object(reference->index()); + auto maybe_page_tree_node = m_parser.conditionally_parse_page_tree_node_at_offset(byte_offset); + if (maybe_page_tree_node) { + add_page_tree_node_to_page_tree(maybe_page_tree_node.release_nonnull()); + } else { + 
m_page_object_indices.append(reference->index()); + } + } + + return; } + // We know all of the kids are leaf nodes for (auto& value : *kids_array) { auto reference = resolve_to(value); m_page_object_indices.append(reference->index()); diff --git a/Userland/Libraries/LibPDF/Document.h b/Userland/Libraries/LibPDF/Document.h index f94f6354c6..f521025087 100644 --- a/Userland/Libraries/LibPDF/Document.h +++ b/Userland/Libraries/LibPDF/Document.h @@ -66,7 +66,14 @@ public: UnwrappedValueType resolve_to(const Value& value); private: + // FIXME: Currently, to improve performance, we don't load any pages at Document + // construction, rather we just load the page structure and populate + // m_page_object_indices. However, we can be even lazier and defer page tree node + // parsing, as good PDF writers will lay out the page tree in a balanced tree to + // improve lookup time. This would reduce the initial overhead by not loading + // every page tree node of, say, a 1000+ page PDF file. void build_page_tree(); + void add_page_tree_node_to_page_tree(NonnullRefPtr page_tree); Parser m_parser; XRefTable m_xref_table; diff --git a/Userland/Libraries/LibPDF/Parser.cpp b/Userland/Libraries/LibPDF/Parser.cpp index eb37bf47d7..2e7d5ab57e 100644 --- a/Userland/Libraries/LibPDF/Parser.cpp +++ b/Userland/Libraries/LibPDF/Parser.cpp @@ -363,14 +363,14 @@ Value Parser::parse_number() } } + consume_whitespace(); + auto string = String(m_reader.bytes().slice(start_offset, m_reader.offset() - start_offset)); float f = strtof(string.characters(), nullptr); if (is_float) return Value(f); VERIFY(floorf(f) == f); - consume_whitespace(); - return Value(static_cast(f)); } @@ -567,6 +567,50 @@ NonnullRefPtr Parser::parse_dict() return make_object(map); } +RefPtr Parser::conditionally_parse_page_tree_node_at_offset(size_t offset) +{ + m_reader.move_to(offset); + parse_number(); + parse_number(); + VERIFY(m_reader.matches("obj")); + m_reader.move_by(3); + consume_whitespace(); + + consume('<'); +
consume('<'); + consume_whitespace(); + HashMap map; + + while (true) { + if (m_reader.matches(">>")) + break; + auto name = parse_name(); + auto name_string = name->name(); + if (!name_string.is_one_of("Type", "Parent", "Kids", "Count")) { + // This is a page, not a page tree node + return {}; + } + auto value = parse_value(); + if (name_string == "Type") { + if (!value.is_object()) + return {}; + auto type_object = value.as_object(); + if (!type_object->is_name()) + return {}; + auto type_name = object_cast(type_object); + if (type_name->name() != "Pages") + return {}; + } + map.set(name->name(), value); + } + + consume('>'); + consume('>'); + consume_whitespace(); + + return make_object(map); +} + NonnullRefPtr Parser::parse_stream(NonnullRefPtr dict) { VERIFY(m_reader.matches("stream")); diff --git a/Userland/Libraries/LibPDF/Parser.h b/Userland/Libraries/LibPDF/Parser.h index 56921916a3..3158f9d795 100644 --- a/Userland/Libraries/LibPDF/Parser.h +++ b/Userland/Libraries/LibPDF/Parser.h @@ -29,6 +29,8 @@ public: NonnullRefPtr parse_indirect_value_at_offset(size_t offset); + RefPtr conditionally_parse_page_tree_node_at_offset(size_t offset); + private: bool parse_header(); XRefTable parse_xref_table();