diff --git a/Userland/Libraries/LibPDF/Document.cpp b/Userland/Libraries/LibPDF/Document.cpp index 38dab2c4d3..a4a7abc945 100644 --- a/Userland/Libraries/LibPDF/Document.cpp +++ b/Userland/Libraries/LibPDF/Document.cpp @@ -117,14 +117,33 @@ UnwrappedValueType Document::resolve_to(const Value& value) void Document::build_page_tree() { auto page_tree = m_catalog->get_dict(this, "Pages"); - auto kids_array = page_tree->get_array(this, "Kids"); + add_page_tree_node_to_page_tree(page_tree); +} +void Document::add_page_tree_node_to_page_tree(NonnullRefPtr page_tree) +{ + auto kids_array = page_tree->get_array(this, "Kids"); auto page_count = page_tree->get("Count").value().as_int(); + if (static_cast(page_count) != kids_array->elements().size()) { - // FIXME: Support recursive PDF page tree structures - VERIFY_NOT_REACHED(); + // Count differs from the number of kids, so this node is taken to contain + // child page tree nodes; recurse into those and append plain pages directly + + for (auto& value : *kids_array) { + auto reference = resolve_to(value); + auto byte_offset = m_xref_table.byte_offset_for_object(reference->index()); + auto maybe_page_tree_node = m_parser.conditionally_parse_page_tree_node_at_offset(byte_offset); + if (maybe_page_tree_node) { + add_page_tree_node_to_page_tree(maybe_page_tree_node.release_nonnull()); + } else { + m_page_object_indices.append(reference->index()); + } + } + + return; } + // Count equals the number of kids, so every kid is treated as a leaf page (NOTE(review): a kid that is itself a page tree node holding exactly one page would also satisfy this check and be recorded as a page; confirm against the PDF page tree spec) for (auto& value : *kids_array) { auto reference = resolve_to(value); m_page_object_indices.append(reference->index()); diff --git a/Userland/Libraries/LibPDF/Document.h b/Userland/Libraries/LibPDF/Document.h index f94f6354c6..f521025087 100644 --- a/Userland/Libraries/LibPDF/Document.h +++ b/Userland/Libraries/LibPDF/Document.h @@ -66,7 +66,14 @@ public: UnwrappedValueType resolve_to(const Value& value); private: + // FIXME: Currently, to improve performance, we don't load any pages at Document + // construction; rather, we just load the page structure and populate 
+ // m_page_object_indices. However, we can be even lazier and defer page tree node + // parsing, as good PDF writers will lay out the page tree as a balanced tree to + // improve lookup time. This would reduce the initial overhead by not loading + // every page tree node of, say, a 1000+ page PDF file. void build_page_tree(); + void add_page_tree_node_to_page_tree(NonnullRefPtr page_tree); Parser m_parser; XRefTable m_xref_table; diff --git a/Userland/Libraries/LibPDF/Parser.cpp b/Userland/Libraries/LibPDF/Parser.cpp index eb37bf47d7..2e7d5ab57e 100644 --- a/Userland/Libraries/LibPDF/Parser.cpp +++ b/Userland/Libraries/LibPDF/Parser.cpp @@ -363,14 +363,14 @@ Value Parser::parse_number() } } + consume_whitespace(); + auto string = String(m_reader.bytes().slice(start_offset, m_reader.offset() - start_offset)); float f = strtof(string.characters(), nullptr); if (is_float) return Value(f); VERIFY(floorf(f) == f); - consume_whitespace(); - return Value(static_cast(f)); } @@ -567,6 +567,50 @@ NonnullRefPtr Parser::parse_dict() return make_object(map); } +RefPtr Parser::conditionally_parse_page_tree_node_at_offset(size_t offset) +{ + m_reader.move_to(offset); + parse_number(); + parse_number(); + VERIFY(m_reader.matches("obj")); + m_reader.move_by(3); + consume_whitespace(); + + consume('<'); + consume('<'); + consume_whitespace(); + HashMap map; + + while (true) { + if (m_reader.matches(">>")) + break; + auto name = parse_name(); + auto name_string = name->name(); + if (!name_string.is_one_of("Type", "Parent", "Kids", "Count")) { + // Key is not one of Type, Parent, Kids or Count: treat the object as a page, not a page tree node + return {}; + } + auto value = parse_value(); + if (name_string == "Type") { + if (!value.is_object()) + return {}; + auto type_object = value.as_object(); + if (!type_object->is_name()) + return {}; + auto type_name = object_cast(type_object); + if (type_name->name() != "Pages") + return {}; + } + map.set(name->name(), value); + } + + consume('>'); + consume('>'); + consume_whitespace(); + + return 
make_object(map); +} + NonnullRefPtr Parser::parse_stream(NonnullRefPtr dict) { VERIFY(m_reader.matches("stream")); diff --git a/Userland/Libraries/LibPDF/Parser.h b/Userland/Libraries/LibPDF/Parser.h index 56921916a3..3158f9d795 100644 --- a/Userland/Libraries/LibPDF/Parser.h +++ b/Userland/Libraries/LibPDF/Parser.h @@ -29,6 +29,8 @@ public: NonnullRefPtr parse_indirect_value_at_offset(size_t offset); + RefPtr conditionally_parse_page_tree_node_at_offset(size_t offset); + private: bool parse_header(); XRefTable parse_xref_table();