From 78bc9d1539ad89565bb6e57b27298e7007f7af29 Mon Sep 17 00:00:00 2001 From: Matthew Olsson Date: Tue, 25 May 2021 08:55:15 -0700 Subject: [PATCH] LibPDF: Refine the distinction between the Document and Parser The Parser should hold information relevant for parsing, whereas the Document should hold information relevant for displaying pages. With this in mind, there is no reason for the Document to hold the xref table and trailer. These objects have been moved to the Parser, which allows the Parser to expose fewer public methods (which will be even more evident once linearized PDFs are supported). --- Userland/Libraries/LibPDF/Document.cpp | 24 ++++---------- Userland/Libraries/LibPDF/Document.h | 10 ------ Userland/Libraries/LibPDF/Parser.cpp | 43 +++++++++++++++----------- Userland/Libraries/LibPDF/Parser.h | 22 +++++++------ 4 files changed, 43 insertions(+), 56 deletions(-) diff --git a/Userland/Libraries/LibPDF/Document.cpp b/Userland/Libraries/LibPDF/Document.cpp index deeb3ca46c..a127a4359f 100644 --- a/Userland/Libraries/LibPDF/Document.cpp +++ b/Userland/Libraries/LibPDF/Document.cpp @@ -39,17 +39,10 @@ RefPtr Document::create(const ReadonlyBytes& bytes) auto parser = adopt_ref(*new Parser({}, bytes)); auto document = adopt_ref(*new Document(parser)); - VERIFY(parser->perform_validation()); - auto xref_table_and_trailer_opt = parser->parse_last_xref_table_and_trailer(); - if (!xref_table_and_trailer_opt.has_value()) + if (!parser->initialize()) return {}; - auto [xref_table, trailer] = xref_table_and_trailer_opt.value(); - - document->m_xref_table = xref_table; - document->m_trailer = trailer; - - document->m_catalog = document->m_trailer->get_dict(document, CommonNames::Root); + document->m_catalog = parser->trailer()->get_dict(document, CommonNames::Root); document->build_page_tree(); document->build_outline(); @@ -68,13 +61,9 @@ Value Document::get_or_load_value(u32 index) if (value) return value; - VERIFY(m_xref_table.has_object(index)); - auto 
byte_offset = m_xref_table.byte_offset_for_object(index); - auto indirect_value = m_parser->parse_indirect_value_at_offset(byte_offset); - VERIFY(indirect_value->index() == index); - value = indirect_value->value(); - m_values.set(index, value); - return value; + auto object = m_parser->parse_object_with_index(index); + m_values.set(index, object); + return object; } u32 Document::get_first_page_index() const @@ -179,9 +168,8 @@ bool Document::add_page_tree_node_to_page_tree(NonnullRefPtr page_tr for (auto& value : *kids_array) { auto reference_index = value.as_ref_index(); - auto byte_offset = m_xref_table.byte_offset_for_object(reference_index); bool ok; - auto maybe_page_tree_node = m_parser->conditionally_parse_page_tree_node_at_offset(byte_offset, ok); + auto maybe_page_tree_node = m_parser->conditionally_parse_page_tree_node(reference_index, ok); if (!ok) return false; if (maybe_page_tree_node) { diff --git a/Userland/Libraries/LibPDF/Document.h b/Userland/Libraries/LibPDF/Document.h index 655b0450eb..270c8c5b44 100644 --- a/Userland/Libraries/LibPDF/Document.h +++ b/Userland/Libraries/LibPDF/Document.h @@ -75,8 +75,6 @@ class Document final : public RefCounted { public: static RefPtr create(const ReadonlyBytes& bytes); - ALWAYS_INLINE const XRefTable& xref_table() const { return m_xref_table; } - ALWAYS_INLINE const DictObject& trailer() const { return *m_trailer; } ALWAYS_INLINE const RefPtr& outline() const { return m_outline; } [[nodiscard]] Value get_or_load_value(u32 index); @@ -92,12 +90,6 @@ public: return m_values.get(index).value_or({}); } - ALWAYS_INLINE void set_value(u32 index, const Value& value) - { - m_values.ensure_capacity(index); - m_values.set(index, value); - } - // Strips away the layer of indirection by turning indirect value // refs into the value they reference, and indirect values into // the value being wrapped. 
@@ -139,8 +131,6 @@ private: NonnullRefPtrVector build_outline_item_chain(const Value& first_ref, const Value& last_ref); NonnullRefPtr m_parser; - XRefTable m_xref_table; - RefPtr m_trailer; RefPtr m_catalog; Vector m_page_object_indices; HashMap m_pages; diff --git a/Userland/Libraries/LibPDF/Parser.cpp b/Userland/Libraries/LibPDF/Parser.cpp index f868629525..0f919618c2 100644 --- a/Userland/Libraries/LibPDF/Parser.cpp +++ b/Userland/Libraries/LibPDF/Parser.cpp @@ -38,43 +38,47 @@ Parser::Parser(const ReadonlyBytes& bytes) { } -bool Parser::perform_validation() +bool Parser::initialize() { - return !sloppy_is_linearized() && parse_header(); -} + if (!parse_header()) + return {}; -Optional Parser::parse_last_xref_table_and_trailer() -{ m_reader.move_to(m_reader.bytes().size() - 1); if (!navigate_to_before_eof_marker()) - return {}; + return false; if (!navigate_to_after_startxref()) - return {}; + return false; if (m_reader.done()) - return {}; + return false; m_reader.set_reading_forwards(); auto xref_offset_value = parse_number(); if (!xref_offset_value.is_int()) - return {}; + return false; auto xref_offset = xref_offset_value.as_int(); m_reader.move_to(xref_offset); auto xref_table = parse_xref_table(); if (!xref_table.has_value()) - return {}; + return false; auto trailer = parse_file_trailer(); if (!trailer) - return {}; + return false; - return XRefTableAndTrailer { xref_table.value(), trailer.release_nonnull() }; + m_xref_table = xref_table.value(); + m_trailer = trailer; + return true; } -RefPtr Parser::parse_indirect_value_at_offset(size_t offset) +Value Parser::parse_object_with_index(u32 index) { - m_reader.set_reading_forwards(); - m_reader.move_to(offset); - return parse_indirect_value(); + VERIFY(m_xref_table.has_object(index)); + auto byte_offset = m_xref_table.byte_offset_for_object(index); + m_reader.move_to(byte_offset); + auto indirect_value = parse_indirect_value(); + VERIFY(indirect_value); + VERIFY(indirect_value->index() == index); + return 
indirect_value->value(); } bool Parser::parse_header() @@ -647,11 +651,14 @@ RefPtr Parser::parse_dict() return make_object(map); } -RefPtr Parser::conditionally_parse_page_tree_node_at_offset(size_t offset, bool& ok) +RefPtr Parser::conditionally_parse_page_tree_node(u32 object_index, bool& ok) { ok = true; - m_reader.move_to(offset); + VERIFY(m_xref_table.has_object(object_index)); + auto byte_offset = m_xref_table.byte_offset_for_object(object_index); + + m_reader.move_to(byte_offset); parse_number(); parse_number(); if (!m_reader.matches("obj")) { diff --git a/Userland/Libraries/LibPDF/Parser.h b/Userland/Libraries/LibPDF/Parser.h index 6774acb93b..6227898a1d 100644 --- a/Userland/Libraries/LibPDF/Parser.h +++ b/Userland/Libraries/LibPDF/Parser.h @@ -22,19 +22,19 @@ public: Parser(Badge, const ReadonlyBytes&); - void set_document(RefPtr document) { m_document = document; } + [[nodiscard]] ALWAYS_INLINE const RefPtr& trailer() const { return m_trailer; } + void set_document(const RefPtr& document) { m_document = document; } - bool perform_validation(); + // Parses the header and initializes the xref table and trailer + bool initialize(); - struct XRefTableAndTrailer { - XRefTable xref_table; - NonnullRefPtr trailer; - }; - Optional parse_last_xref_table_and_trailer(); + Value parse_object_with_index(u32 index); - RefPtr parse_indirect_value_at_offset(size_t offset); - - RefPtr conditionally_parse_page_tree_node_at_offset(size_t offset, bool& ok); + // Specialized version of parse_dict which aborts early if the dict being parsed + // is not a page object. A null RefPtr return indicates that the dict at this index + // is not a page tree node, whereas ok == false indicates a malformed PDF file and + // should cause an abort of the current operation. 
+ RefPtr conditionally_parse_page_tree_node(u32 object_index, bool& ok); private: explicit Parser(const ReadonlyBytes&); @@ -85,6 +85,8 @@ private: Reader m_reader; RefPtr m_document; + XRefTable m_xref_table; + RefPtr m_trailer; }; }