From 8c745ad0d9198db1cd078dff573353457294a840 Mon Sep 17 00:00:00 2001 From: Matthew Olsson Date: Sat, 8 May 2021 14:57:49 -0700 Subject: [PATCH] LibPDF: Parse page structures This commit introduces the ability to parse the document catalog dict, as well as the page tree and individual pages. Pages obviously aren't fully parsed, as we won't care about most of the fields until we start actually rendering PDFs. One of the primary benefits of the PDF format is laziness. PDFs are not meant to be parsed all at once, and the same is true for pages. When a Document is constructed, it builds a map of page number to object index, but it does not fetch and parse any of the pages. A page is only parsed when a caller requests that particular page (and is cached from then on). This commit also adds an object_cast function which logs bad casts if PDF_DEBUG is set. Additionally, utility functions were added to ArrayObject and DictObject to get all types of objects from the collections, to avoid having to cast manually. 
--- AK/Debug.h.in | 4 + Meta/CMake/all_the_debug_macros.cmake | 1 + Userland/Libraries/LibPDF/CMakeLists.txt | 1 + Userland/Libraries/LibPDF/Document.cpp | 112 +++++++++++++++++++++++ Userland/Libraries/LibPDF/Document.h | 65 +++++++++++++ Userland/Libraries/LibPDF/Forward.h | 9 ++ Userland/Libraries/LibPDF/Object.cpp | 24 +++++ Userland/Libraries/LibPDF/Object.h | 73 ++++++++++++++- Userland/Libraries/LibPDF/Parser.cpp | 28 +++++- Userland/Libraries/LibPDF/Parser.h | 5 + Userland/Libraries/LibPDF/Reader.h | 4 +- 11 files changed, 320 insertions(+), 6 deletions(-) diff --git a/AK/Debug.h.in b/AK/Debug.h.in index 89c0fb1b5d..5b7d50fce7 100644 --- a/AK/Debug.h.in +++ b/AK/Debug.h.in @@ -286,6 +286,10 @@ #cmakedefine01 PATH_DEBUG #endif +#ifndef PDF_DEBUG +#cmakedefine01 PDF_DEBUG +#endif + #ifndef PNG_DEBUG #cmakedefine01 PNG_DEBUG #endif diff --git a/Meta/CMake/all_the_debug_macros.cmake b/Meta/CMake/all_the_debug_macros.cmake index 1f11e383d8..fd5c5f909a 100644 --- a/Meta/CMake/all_the_debug_macros.cmake +++ b/Meta/CMake/all_the_debug_macros.cmake @@ -176,6 +176,7 @@ set(LINE_EDITOR_DEBUG ON) set(LANGUAGE_SERVER_DEBUG ON) set(GL_DEBUG ON) set(WASM_BINPARSER_DEBUG ON) +set(PDF_DEBUG ON) # False positive: DEBUG is a flag but it works differently. 
# set(DEBUG ON) diff --git a/Userland/Libraries/LibPDF/CMakeLists.txt b/Userland/Libraries/LibPDF/CMakeLists.txt index c184d28653..1f1cd4738d 100644 --- a/Userland/Libraries/LibPDF/CMakeLists.txt +++ b/Userland/Libraries/LibPDF/CMakeLists.txt @@ -1,6 +1,7 @@ set(SOURCES Object.cpp Document.cpp + Object.cpp Parser.cpp Value.cpp ) diff --git a/Userland/Libraries/LibPDF/Document.cpp b/Userland/Libraries/LibPDF/Document.cpp index 4934e7a331..38dab2c4d3 100644 --- a/Userland/Libraries/LibPDF/Document.cpp +++ b/Userland/Libraries/LibPDF/Document.cpp @@ -17,6 +17,118 @@ Document::Document(const ReadonlyBytes& bytes) m_xref_table = xref_table; m_trailer = trailer; + + m_catalog = m_trailer->get_dict(this, "Root"); + build_page_tree(); +} + +Value Document::get_or_load_value(u32 index) +{ + auto value = get_value(index); + if (value) + return value; + + VERIFY(m_xref_table.has_object(index)); + auto byte_offset = m_xref_table.byte_offset_for_object(index); + auto indirect_value = m_parser.parse_indirect_value_at_offset(byte_offset); + VERIFY(indirect_value->index() == index); + value = indirect_value->value(); + m_values.set(index, value); + return value; +} + +u32 Document::get_first_page_index() const +{ + // FIXME: A PDF can have a different default first page, which + // should be fetched and returned here + return 0; +} + +u32 Document::get_page_count() const +{ + return m_page_object_indices.size(); +} + +Page Document::get_page(u32 index) +{ + VERIFY(index < m_page_object_indices.size()); + + auto cached_page = m_pages.get(index); + if (cached_page.has_value()) + return cached_page.value(); + + auto page_object_index = m_page_object_indices[index]; + auto raw_page_object = resolve_to(get_or_load_value(page_object_index)); + + auto resources = raw_page_object->get_dict(this, "Resources"); + auto media_box_array = raw_page_object->get_array(this, "MediaBox"); + auto media_box = Rectangle { + media_box_array->at(0).to_float(), + media_box_array->at(1).to_float(), + 
media_box_array->at(2).to_float(), + media_box_array->at(3).to_float(), + }; + auto contents = raw_page_object->get_object(this, "Contents"); + + Page page { resources, media_box, contents }; + m_pages.set(index, page); + return page; +} + +Value Document::resolve(const Value& value) +{ + if (!value.is_object()) + return value; + + auto obj = value.as_object(); + + // FIXME: Surely indirect PDF objects can't contain another indirect PDF object, + // right? Unsure from the spec, but if they can, these return values would have + // to be wrapped with another resolve() call. + + if (obj->is_indirect_value_ref()) { + auto object_index = static_cast>(obj)->index(); + return get_or_load_value(object_index); + } + + if (obj->is_indirect_value()) + return static_cast>(obj)->value(); + + return obj; +} + +template +UnwrappedValueType Document::resolve_to(const Value& value) +{ + auto resolved = resolve(value); + + if constexpr (IsSame) + return resolved.as_bool(); + if constexpr (IsSame) + return resolved.as_int(); + if constexpr (IsSame) + return resolved.as_float(); + if constexpr (IsObject) + return object_cast(resolved.as_object()); + + VERIFY_NOT_REACHED(); +} + +void Document::build_page_tree() +{ + auto page_tree = m_catalog->get_dict(this, "Pages"); + auto kids_array = page_tree->get_array(this, "Kids"); + + auto page_count = page_tree->get("Count").value().as_int(); + if (static_cast(page_count) != kids_array->elements().size()) { + // FIXME: Support recursive PDF page tree structures + VERIFY_NOT_REACHED(); + } + + for (auto& value : *kids_array) { + auto reference = resolve_to(value); + m_page_object_indices.append(reference->index()); + } } } diff --git a/Userland/Libraries/LibPDF/Document.h b/Userland/Libraries/LibPDF/Document.h index b218019968..f94f6354c6 100644 --- a/Userland/Libraries/LibPDF/Document.h +++ b/Userland/Libraries/LibPDF/Document.h @@ -6,6 +6,8 @@ #pragma once +#include +#include #include #include #include @@ -13,6 +15,19 @@ namespace PDF { 
+struct Rectangle { + float lower_left_x; + float lower_left_y; + float upper_right_x; + float upper_right_y; +}; + +struct Page { + NonnullRefPtr resources; + Rectangle media_box; + NonnullRefPtr contents; +}; + class Document final : public RefCounted { public: explicit Document(const ReadonlyBytes& bytes); @@ -21,6 +36,14 @@ public: ALWAYS_INLINE const DictObject& trailer() const { return *m_trailer; } + [[nodiscard]] Value get_or_load_value(u32 index); + + [[nodiscard]] u32 get_first_page_index() const; + + [[nodiscard]] u32 get_page_count() const; + + [[nodiscard]] Page get_page(u32 index); + ALWAYS_INLINE Value get_value(u32 index) const { return m_values.get(index).value_or({}); @@ -32,11 +55,53 @@ public: m_values.set(index, value); } + // Strips away the layer of indirection by turning indirect value + // refs into the value they reference, and indirect values into + // the value being wrapped. + Value resolve(const Value& value); + + // Like resolve, but unwraps the Value into the given type. Accepts + // any object type, and the three primitive Value types. 
+ template + UnwrappedValueType resolve_to(const Value& value); + private: + void build_page_tree(); + Parser m_parser; XRefTable m_xref_table; RefPtr m_trailer; + RefPtr m_catalog; + Vector m_page_object_indices; + HashMap m_pages; HashMap m_values; }; } + +namespace AK { + +template<> +struct Formatter : Formatter { + void format(FormatBuilder& builder, const PDF::Rectangle& rectangle) + { + Formatter::format(builder, + String::formatted("Rectangle {{ ll=({}, {}), ur=({}, {}) }}", + rectangle.lower_left_x, + rectangle.lower_left_y, + rectangle.upper_right_x, + rectangle.upper_right_y)); + } +}; + +template<> +struct Formatter : Formatter { + void format(FormatBuilder& builder, const PDF::Page& page) + { + constexpr auto fmt_string = "Page {{\n resources={}\n contents={}\n media_box={}\n}}"; + auto str = String::formatted(fmt_string, page.resources->to_string(1), page.contents->to_string(1), page.media_box); + Formatter::format(builder, str); + } +}; + +} diff --git a/Userland/Libraries/LibPDF/Forward.h b/Userland/Libraries/LibPDF/Forward.h index dd9825cff8..f821af05cb 100644 --- a/Userland/Libraries/LibPDF/Forward.h +++ b/Userland/Libraries/LibPDF/Forward.h @@ -30,4 +30,13 @@ ENUMERATE_OBJECT_TYPES(FORWARD_DECL) template concept IsObject = IsBaseOf; +template +concept IsValuePrimitive = IsSame || IsSame || IsSame; + +template +concept IsValueType = IsValuePrimitive || IsObject; + +template +using UnwrappedValueType = Conditional, NonnullRefPtr, T>; + } diff --git a/Userland/Libraries/LibPDF/Object.cpp b/Userland/Libraries/LibPDF/Object.cpp index 4fca01a497..3cd861cb61 100644 --- a/Userland/Libraries/LibPDF/Object.cpp +++ b/Userland/Libraries/LibPDF/Object.cpp @@ -5,10 +5,34 @@ */ #include +#include #include namespace PDF { +NonnullRefPtr ArrayObject::get_object_at(Document* document, size_t index) const +{ + return document->resolve_to(m_elements[index]); +} + +NonnullRefPtr DictObject::get_object(Document* document, const FlyString& key) const +{ + return 
document->resolve_to(get_value(key)); +} + +#define DEFINE_ACCESSORS(class_name, snake_name) \ + NonnullRefPtr ArrayObject::get_##snake_name##_at(Document* document, size_t index) const \ + { \ + return document->resolve_to(m_elements[index]); \ + } \ + \ + NonnullRefPtr DictObject::get_##snake_name(Document* document, const FlyString& key) const \ + { \ + return document->resolve_to(get(key).value()); \ + } +ENUMERATE_DIRECT_OBJECT_TYPES(DEFINE_ACCESSORS) +#undef DEFINE_INDEXER + static void append_indent(StringBuilder& builder, int indent) { for (int i = 0; i < indent; i++) diff --git a/Userland/Libraries/LibPDF/Object.h b/Userland/Libraries/LibPDF/Object.h index 088204e397..322c3a143f 100644 --- a/Userland/Libraries/LibPDF/Object.h +++ b/Userland/Libraries/LibPDF/Object.h @@ -6,10 +6,12 @@ #pragma once +#include #include #include #include #include +#include #include #include @@ -27,6 +29,10 @@ public: ENUMERATE_OBJECT_TYPES(DEFINE_ID) #undef DEFINE_ID + template + NonnullRefPtr resolved_to(Document*) const; + + virtual const char* type_name() const = 0; virtual String to_string(int indent) const = 0; private: @@ -47,6 +53,7 @@ public: [[nodiscard]] ALWAYS_INLINE bool is_binary() const { return m_is_binary; } ALWAYS_INLINE bool is_string() const override { return true; } + ALWAYS_INLINE const char* type_name() const override { return "string"; } String to_string(int indent) const override; private: @@ -66,6 +73,7 @@ public: [[nodiscard]] ALWAYS_INLINE FlyString name() const { return m_name; } ALWAYS_INLINE bool is_name() const override { return true; } + ALWAYS_INLINE const char* type_name() const override { return "name"; } String to_string(int indent) const override; private: @@ -83,7 +91,24 @@ public: [[nodiscard]] ALWAYS_INLINE Vector elements() const { return m_elements; } - ALWAYS_INLINE bool is_array() const override { return true; } + ALWAYS_INLINE auto begin() const { return m_elements.begin(); } + ALWAYS_INLINE auto end() const { return 
m_elements.end(); } + + ALWAYS_INLINE const Value& operator[](size_t index) const { return at(index); } + ALWAYS_INLINE const Value& at(size_t index) const { return m_elements[index]; } + + NonnullRefPtr get_object_at(Document*, size_t index) const; + +#define DEFINE_INDEXER(class_name, snake_name) \ + NonnullRefPtr get_##snake_name##_at(Document*, size_t index) const; + ENUMERATE_OBJECT_TYPES(DEFINE_INDEXER) +#undef DEFINE_INDEXER + + ALWAYS_INLINE bool is_array() const override + { + return true; + } + ALWAYS_INLINE const char* type_name() const override { return "array"; } String to_string(int indent) const override; private: @@ -99,9 +124,26 @@ public: ~DictObject() override = default; - [[nodiscard]] ALWAYS_INLINE HashMap map() const { return m_map; } + [[nodiscard]] ALWAYS_INLINE const HashMap& map() const { return m_map; } - ALWAYS_INLINE bool is_dict() const override { return true; } + ALWAYS_INLINE bool contains(const FlyString& key) const { return m_map.contains(key); } + + ALWAYS_INLINE Optional get(const FlyString& key) const { return m_map.get(key); } + + Value get_value(const FlyString& key) const { return get(key).value(); } + + NonnullRefPtr get_object(Document*, const FlyString& key) const; + +#define DEFINE_GETTER(class_name, snake_name) \ + NonnullRefPtr get_##snake_name(Document*, const FlyString& key) const; + ENUMERATE_OBJECT_TYPES(DEFINE_GETTER) +#undef DEFINE_GETTER + + ALWAYS_INLINE bool is_dict() const override + { + return true; + } + ALWAYS_INLINE const char* type_name() const override { return "dict"; } String to_string(int indent) const override; private: @@ -122,6 +164,7 @@ public: [[nodiscard]] ALWAYS_INLINE const ReadonlyBytes& bytes() const { return m_bytes; } ALWAYS_INLINE bool is_stream() const override { return true; } + ALWAYS_INLINE const char* type_name() const override { return "stream"; } String to_string(int indent) const override; private: @@ -144,6 +187,7 @@ public: [[nodiscard]] ALWAYS_INLINE const Value& value() const 
{ return m_value; } ALWAYS_INLINE bool is_indirect_value() const override { return true; } + ALWAYS_INLINE const char* type_name() const override { return "indirect_object"; } String to_string(int indent) const override; private: @@ -164,12 +208,35 @@ public: [[nodiscard]] ALWAYS_INLINE u32 index() const { return m_index; } ALWAYS_INLINE bool is_indirect_value_ref() const override { return true; } + ALWAYS_INLINE const char* type_name() const override { return "indirect_object_ref"; } String to_string(int indent) const override; private: u32 m_index; }; +template +[[nodiscard]] ALWAYS_INLINE static NonnullRefPtr object_cast(NonnullRefPtr obj +#ifdef PDF_DEBUG + , + SourceLocation loc = SourceLocation::current() +#endif +) +{ +#ifdef PDF_DEBUG +# define ENUMERATE_TYPES(class_name, snake_name) \ + if constexpr (IsSame) { \ + if (!obj->is_##snake_name()) { \ + dbgln("{} invalid cast from type {} to type " #snake_name, loc, obj->type_name()); \ + } \ + } + ENUMERATE_OBJECT_TYPES(ENUMERATE_TYPES) +# undef ENUMERATE_TYPES +#endif + + return static_cast>(obj); +} + } namespace AK { diff --git a/Userland/Libraries/LibPDF/Parser.cpp b/Userland/Libraries/LibPDF/Parser.cpp index 12da69dcb2..eb37bf47d7 100644 --- a/Userland/Libraries/LibPDF/Parser.cpp +++ b/Userland/Libraries/LibPDF/Parser.cpp @@ -48,6 +48,13 @@ Parser::XRefTableAndTrailer Parser::parse_last_xref_table_and_trailer() return { xref_table, trailer }; } +NonnullRefPtr Parser::parse_indirect_value_at_offset(size_t offset) +{ + m_reader.set_reading_forwards(); + m_reader.move_to(offset); + return parse_indirect_value(); +} + bool Parser::parse_header() { // FIXME: Do something with the version? 
@@ -323,11 +330,18 @@ NonnullRefPtr Parser::parse_indirect_value(int index, int generat auto value = parse_value(); VERIFY(value.is_object()); VERIFY(m_reader.matches("endobj")); - VERIFY(consume_whitespace()); return make_object(index, generation, value.as_object()); } +NonnullRefPtr Parser::parse_indirect_value() +{ + auto first_number = parse_number(); + auto second_number = parse_number(); + VERIFY(first_number.is_int() && second_number.is_int()); + return parse_indirect_value(first_number.as_int(), second_number.as_int()); +} + Value Parser::parse_number() { size_t start_offset = m_reader.offset(); @@ -366,7 +380,7 @@ NonnullRefPtr Parser::parse_name() StringBuilder builder; while (true) { - if (matches_whitespace()) + if (!matches_regular_character()) break; if (m_reader.matches('#')) { @@ -587,6 +601,16 @@ bool Parser::matches_number() const return isdigit(ch) || ch == '-' || ch == '+'; } +bool Parser::matches_delimiter() const +{ + return m_reader.matches_any('(', ')', '<', '>', '[', ']', '{', '}', '/', '%'); +} + +bool Parser::matches_regular_character() const +{ + return !matches_delimiter() && !matches_whitespace(); +} + void Parser::consume_eol() { if (m_reader.matches("\r\n")) { diff --git a/Userland/Libraries/LibPDF/Parser.h b/Userland/Libraries/LibPDF/Parser.h index c983628e49..56921916a3 100644 --- a/Userland/Libraries/LibPDF/Parser.h +++ b/Userland/Libraries/LibPDF/Parser.h @@ -27,6 +27,8 @@ public: }; XRefTableAndTrailer parse_last_xref_table_and_trailer(); + NonnullRefPtr parse_indirect_value_at_offset(size_t offset); + private: bool parse_header(); XRefTable parse_xref_table(); @@ -48,6 +50,7 @@ private: Value parse_value(); Value parse_possible_indirect_value_or_ref(); NonnullRefPtr parse_indirect_value(int index, int generation); + NonnullRefPtr parse_indirect_value(); Value parse_number(); NonnullRefPtr parse_name(); NonnullRefPtr parse_string(); @@ -60,6 +63,8 @@ private: bool matches_eol() const; bool matches_whitespace() const; bool 
matches_number() const; + bool matches_delimiter() const; + bool matches_regular_character() const; void consume_eol(); bool consume_whitespace(); diff --git a/Userland/Libraries/LibPDF/Reader.h b/Userland/Libraries/LibPDF/Reader.h index bc32416527..2300c43041 100644 --- a/Userland/Libraries/LibPDF/Reader.h +++ b/Userland/Libraries/LibPDF/Reader.h @@ -123,7 +123,8 @@ public: ALWAYS_INLINE void load() { m_offset = m_saved_offsets.take_last(); } ALWAYS_INLINE void discard() { m_saved_offsets.take_last(); } - void dump_state() +#ifdef PDF_DEBUG + void dump_state() const { StringBuilder builder; builder.append("Reader State Dump\n\n"); @@ -143,6 +144,7 @@ public: auto str = builder.to_string(); dbgputstr(str.characters(), str.length()); } +#endif private: ReadonlyBytes m_bytes;