diff --git a/AK/Debug.h.in b/AK/Debug.h.in index 89c0fb1b5d..5b7d50fce7 100644 --- a/AK/Debug.h.in +++ b/AK/Debug.h.in @@ -286,6 +286,10 @@ #cmakedefine01 PATH_DEBUG #endif +#ifndef PDF_DEBUG +#cmakedefine01 PDF_DEBUG +#endif + #ifndef PNG_DEBUG #cmakedefine01 PNG_DEBUG #endif diff --git a/Meta/CMake/all_the_debug_macros.cmake b/Meta/CMake/all_the_debug_macros.cmake index 1f11e383d8..fd5c5f909a 100644 --- a/Meta/CMake/all_the_debug_macros.cmake +++ b/Meta/CMake/all_the_debug_macros.cmake @@ -176,6 +176,7 @@ set(LINE_EDITOR_DEBUG ON) set(LANGUAGE_SERVER_DEBUG ON) set(GL_DEBUG ON) set(WASM_BINPARSER_DEBUG ON) +set(PDF_DEBUG ON) # False positive: DEBUG is a flag but it works differently. # set(DEBUG ON) diff --git a/Userland/Libraries/LibPDF/CMakeLists.txt b/Userland/Libraries/LibPDF/CMakeLists.txt index c184d28653..1f1cd4738d 100644 --- a/Userland/Libraries/LibPDF/CMakeLists.txt +++ b/Userland/Libraries/LibPDF/CMakeLists.txt @@ -1,6 +1,6 @@ set(SOURCES - Object.cpp Document.cpp + Object.cpp Parser.cpp Value.cpp ) diff --git a/Userland/Libraries/LibPDF/Document.cpp b/Userland/Libraries/LibPDF/Document.cpp index 4934e7a331..38dab2c4d3 100644 --- a/Userland/Libraries/LibPDF/Document.cpp +++ b/Userland/Libraries/LibPDF/Document.cpp @@ -17,6 +17,118 @@ Document::Document(const ReadonlyBytes& bytes) m_xref_table = xref_table; m_trailer = trailer; + + m_catalog = m_trailer->get_dict(this, "Root"); + build_page_tree(); +} + +Value Document::get_or_load_value(u32 index) +{ + auto value = get_value(index); + if (value) + return value; + + VERIFY(m_xref_table.has_object(index)); + auto byte_offset = m_xref_table.byte_offset_for_object(index); + auto indirect_value = m_parser.parse_indirect_value_at_offset(byte_offset); + VERIFY(indirect_value->index() == index); + value = indirect_value->value(); + m_values.set(index, value); + return value; +} + +u32 Document::get_first_page_index() const +{ + // FIXME: A PDF can have a different default first page, which + // should be fetched 
and returned here + return 0; +} + +u32 Document::get_page_count() const +{ + return m_page_object_indices.size(); +} + +Page Document::get_page(u32 index) +{ + VERIFY(index < m_page_object_indices.size()); + + auto cached_page = m_pages.get(index); + if (cached_page.has_value()) + return cached_page.value(); + + auto page_object_index = m_page_object_indices[index]; + auto raw_page_object = resolve_to(get_or_load_value(page_object_index)); + + auto resources = raw_page_object->get_dict(this, "Resources"); + auto media_box_array = raw_page_object->get_array(this, "MediaBox"); + auto media_box = Rectangle { + media_box_array->at(0).to_float(), + media_box_array->at(1).to_float(), + media_box_array->at(2).to_float(), + media_box_array->at(3).to_float(), + }; + auto contents = raw_page_object->get_object(this, "Contents"); + + Page page { resources, media_box, contents }; + m_pages.set(index, page); + return page; +} + +Value Document::resolve(const Value& value) +{ + if (!value.is_object()) + return value; + + auto obj = value.as_object(); + + // FIXME: Surely indirect PDF objects can't contain another indirect PDF object, + // right? Unsure from the spec, but if they can, these return values would have + // to be wrapped with another resolve() call. 
+ + if (obj->is_indirect_value_ref()) { + auto object_index = static_cast>(obj)->index(); + return get_or_load_value(object_index); + } + + if (obj->is_indirect_value()) + return static_cast>(obj)->value(); + + return obj; +} + +template +UnwrappedValueType Document::resolve_to(const Value& value) +{ + auto resolved = resolve(value); + + if constexpr (IsSame) + return resolved.as_bool(); + if constexpr (IsSame) + return resolved.as_int(); + if constexpr (IsSame) + return resolved.as_float(); + if constexpr (IsObject) + return object_cast(resolved.as_object()); + + VERIFY_NOT_REACHED(); +} + +void Document::build_page_tree() +{ + auto page_tree = m_catalog->get_dict(this, "Pages"); + auto kids_array = page_tree->get_array(this, "Kids"); + + auto page_count = page_tree->get("Count").value().as_int(); + if (static_cast(page_count) != kids_array->elements().size()) { + // FIXME: Support recursive PDF page tree structures + VERIFY_NOT_REACHED(); + } + + for (auto& value : *kids_array) { + auto reference = resolve_to(value); + m_page_object_indices.append(reference->index()); + } } } diff --git a/Userland/Libraries/LibPDF/Document.h b/Userland/Libraries/LibPDF/Document.h index b218019968..f94f6354c6 100644 --- a/Userland/Libraries/LibPDF/Document.h +++ b/Userland/Libraries/LibPDF/Document.h @@ -6,6 +6,8 @@ #pragma once +#include +#include #include #include #include @@ -13,6 +15,19 @@ namespace PDF { +struct Rectangle { + float lower_left_x; + float lower_left_y; + float upper_right_x; + float upper_right_y; +}; + +struct Page { + NonnullRefPtr resources; + Rectangle media_box; + NonnullRefPtr contents; +}; + class Document final : public RefCounted { public: explicit Document(const ReadonlyBytes& bytes); @@ -21,6 +36,14 @@ public: ALWAYS_INLINE const DictObject& trailer() const { return *m_trailer; } + [[nodiscard]] Value get_or_load_value(u32 index); + + [[nodiscard]] u32 get_first_page_index() const; + + [[nodiscard]] u32 get_page_count() const; + + [[nodiscard]] Page 
get_page(u32 index); + ALWAYS_INLINE Value get_value(u32 index) const { return m_values.get(index).value_or({}); @@ -32,11 +55,53 @@ public: m_values.set(index, value); } + // Strips away the layer of indirection by turning indirect value + // refs into the value they reference, and indirect values into + // the value being wrapped. + Value resolve(const Value& value); + + // Like resolve, but unwraps the Value into the given type. Accepts + // any object type, and the three primitive Value types. + template + UnwrappedValueType resolve_to(const Value& value); + private: + void build_page_tree(); + Parser m_parser; XRefTable m_xref_table; RefPtr m_trailer; + RefPtr m_catalog; + Vector m_page_object_indices; + HashMap m_pages; HashMap m_values; }; } + +namespace AK { + +template<> +struct Formatter : Formatter { + void format(FormatBuilder& builder, const PDF::Rectangle& rectangle) + { + Formatter::format(builder, + String::formatted("Rectangle {{ ll=({}, {}), ur=({}, {}) }}", + rectangle.lower_left_x, + rectangle.lower_left_y, + rectangle.upper_right_x, + rectangle.upper_right_y)); + } +}; + +template<> +struct Formatter : Formatter { + void format(FormatBuilder& builder, const PDF::Page& page) + { + constexpr auto fmt_string = "Page {{\n resources={}\n contents={}\n media_box={}\n}}"; + auto str = String::formatted(fmt_string, page.resources->to_string(1), page.contents->to_string(1), page.media_box); + Formatter::format(builder, str); + } +}; + +} diff --git a/Userland/Libraries/LibPDF/Forward.h b/Userland/Libraries/LibPDF/Forward.h index dd9825cff8..f821af05cb 100644 --- a/Userland/Libraries/LibPDF/Forward.h +++ b/Userland/Libraries/LibPDF/Forward.h @@ -30,4 +30,13 @@ ENUMERATE_OBJECT_TYPES(FORWARD_DECL) template concept IsObject = IsBaseOf; +template +concept IsValuePrimitive = IsSame || IsSame || IsSame; + +template +concept IsValueType = IsValuePrimitive || IsObject; + +template +using UnwrappedValueType = Conditional, NonnullRefPtr, T>; + } diff --git 
a/Userland/Libraries/LibPDF/Object.cpp b/Userland/Libraries/LibPDF/Object.cpp index 4fca01a497..3cd861cb61 100644 --- a/Userland/Libraries/LibPDF/Object.cpp +++ b/Userland/Libraries/LibPDF/Object.cpp @@ -5,10 +5,34 @@ */ #include +#include #include namespace PDF { +NonnullRefPtr ArrayObject::get_object_at(Document* document, size_t index) const +{ + return document->resolve_to(m_elements[index]); +} + +NonnullRefPtr DictObject::get_object(Document* document, const FlyString& key) const +{ + return document->resolve_to(get_value(key)); +} + +#define DEFINE_ACCESSORS(class_name, snake_name) \ + NonnullRefPtr ArrayObject::get_##snake_name##_at(Document* document, size_t index) const \ + { \ + return document->resolve_to(m_elements[index]); \ + } \ + \ + NonnullRefPtr DictObject::get_##snake_name(Document* document, const FlyString& key) const \ + { \ + return document->resolve_to(get(key).value()); \ + } +ENUMERATE_DIRECT_OBJECT_TYPES(DEFINE_ACCESSORS) +#undef DEFINE_ACCESSORS + static void append_indent(StringBuilder& builder, int indent) { for (int i = 0; i < indent; i++) diff --git a/Userland/Libraries/LibPDF/Object.h b/Userland/Libraries/LibPDF/Object.h index 088204e397..322c3a143f 100644 --- a/Userland/Libraries/LibPDF/Object.h +++ b/Userland/Libraries/LibPDF/Object.h @@ -6,10 +6,12 @@ #pragma once +#include #include #include #include #include +#include #include #include @@ -27,6 +29,10 @@ public: ENUMERATE_OBJECT_TYPES(DEFINE_ID) #undef DEFINE_ID + template + NonnullRefPtr resolved_to(Document*) const; + + virtual const char* type_name() const = 0; virtual String to_string(int indent) const = 0; private: @@ -47,6 +53,7 @@ public: [[nodiscard]] ALWAYS_INLINE bool is_binary() const { return m_is_binary; } ALWAYS_INLINE bool is_string() const override { return true; } + ALWAYS_INLINE const char* type_name() const override { return "string"; } String to_string(int indent) const override; private: @@ -66,6 +73,7 @@ public: [[nodiscard]] ALWAYS_INLINE FlyString name() 
const { return m_name; } ALWAYS_INLINE bool is_name() const override { return true; } + ALWAYS_INLINE const char* type_name() const override { return "name"; } String to_string(int indent) const override; private: @@ -83,7 +91,24 @@ public: [[nodiscard]] ALWAYS_INLINE Vector elements() const { return m_elements; } - ALWAYS_INLINE bool is_array() const override { return true; } + ALWAYS_INLINE auto begin() const { return m_elements.begin(); } + ALWAYS_INLINE auto end() const { return m_elements.end(); } + + ALWAYS_INLINE const Value& operator[](size_t index) const { return at(index); } + ALWAYS_INLINE const Value& at(size_t index) const { return m_elements[index]; } + + NonnullRefPtr get_object_at(Document*, size_t index) const; + +#define DEFINE_INDEXER(class_name, snake_name) \ + NonnullRefPtr get_##snake_name##_at(Document*, size_t index) const; + ENUMERATE_DIRECT_OBJECT_TYPES(DEFINE_INDEXER) +#undef DEFINE_INDEXER + + ALWAYS_INLINE bool is_array() const override + { + return true; + } + ALWAYS_INLINE const char* type_name() const override { return "array"; } String to_string(int indent) const override; private: @@ -99,9 +124,26 @@ public: ~DictObject() override = default; - [[nodiscard]] ALWAYS_INLINE HashMap map() const { return m_map; } + [[nodiscard]] ALWAYS_INLINE const HashMap& map() const { return m_map; } - ALWAYS_INLINE bool is_dict() const override { return true; } + ALWAYS_INLINE bool contains(const FlyString& key) const { return m_map.contains(key); } + + ALWAYS_INLINE Optional get(const FlyString& key) const { return m_map.get(key); } + + Value get_value(const FlyString& key) const { return get(key).value(); } + + NonnullRefPtr get_object(Document*, const FlyString& key) const; + +#define DEFINE_GETTER(class_name, snake_name) \ + NonnullRefPtr get_##snake_name(Document*, const FlyString& key) const; + ENUMERATE_DIRECT_OBJECT_TYPES(DEFINE_GETTER) +#undef DEFINE_GETTER + + ALWAYS_INLINE bool is_dict() const override + { + return true; + } + ALWAYS_INLINE const char* 
type_name() const override { return "dict"; } String to_string(int indent) const override; private: @@ -122,6 +164,7 @@ public: [[nodiscard]] ALWAYS_INLINE const ReadonlyBytes& bytes() const { return m_bytes; } ALWAYS_INLINE bool is_stream() const override { return true; } + ALWAYS_INLINE const char* type_name() const override { return "stream"; } String to_string(int indent) const override; private: @@ -144,6 +187,7 @@ public: [[nodiscard]] ALWAYS_INLINE const Value& value() const { return m_value; } ALWAYS_INLINE bool is_indirect_value() const override { return true; } + ALWAYS_INLINE const char* type_name() const override { return "indirect_object"; } String to_string(int indent) const override; private: @@ -164,12 +208,35 @@ public: [[nodiscard]] ALWAYS_INLINE u32 index() const { return m_index; } ALWAYS_INLINE bool is_indirect_value_ref() const override { return true; } + ALWAYS_INLINE const char* type_name() const override { return "indirect_object_ref"; } String to_string(int indent) const override; private: u32 m_index; }; +template +[[nodiscard]] ALWAYS_INLINE static NonnullRefPtr object_cast(NonnullRefPtr obj +#if PDF_DEBUG + , + SourceLocation loc = SourceLocation::current() +#endif +) +{ +#if PDF_DEBUG +# define ENUMERATE_TYPES(class_name, snake_name) \ + if constexpr (IsSame) { \ + if (!obj->is_##snake_name()) { \ + dbgln("{} invalid cast from type {} to type " #snake_name, loc, obj->type_name()); \ + } \ + } + ENUMERATE_OBJECT_TYPES(ENUMERATE_TYPES) +# undef ENUMERATE_TYPES +#endif + + return static_cast>(obj); +} + } namespace AK { diff --git a/Userland/Libraries/LibPDF/Parser.cpp b/Userland/Libraries/LibPDF/Parser.cpp index 12da69dcb2..eb37bf47d7 100644 --- a/Userland/Libraries/LibPDF/Parser.cpp +++ b/Userland/Libraries/LibPDF/Parser.cpp @@ -48,6 +48,13 @@ Parser::XRefTableAndTrailer Parser::parse_last_xref_table_and_trailer() return { xref_table, trailer }; } +NonnullRefPtr Parser::parse_indirect_value_at_offset(size_t offset) +{ + 
m_reader.set_reading_forwards(); + m_reader.move_to(offset); + return parse_indirect_value(); +} + bool Parser::parse_header() { // FIXME: Do something with the version? @@ -323,11 +330,18 @@ NonnullRefPtr Parser::parse_indirect_value(int index, int generat auto value = parse_value(); VERIFY(value.is_object()); VERIFY(m_reader.matches("endobj")); - VERIFY(consume_whitespace()); return make_object(index, generation, value.as_object()); } +NonnullRefPtr Parser::parse_indirect_value() +{ + auto first_number = parse_number(); + auto second_number = parse_number(); + VERIFY(first_number.is_int() && second_number.is_int()); + return parse_indirect_value(first_number.as_int(), second_number.as_int()); +} + Value Parser::parse_number() { size_t start_offset = m_reader.offset(); @@ -366,7 +380,7 @@ NonnullRefPtr Parser::parse_name() StringBuilder builder; while (true) { - if (matches_whitespace()) + if (!matches_regular_character()) break; if (m_reader.matches('#')) { @@ -587,6 +601,16 @@ bool Parser::matches_number() const return isdigit(ch) || ch == '-' || ch == '+'; } +bool Parser::matches_delimiter() const +{ + return m_reader.matches_any('(', ')', '<', '>', '[', ']', '{', '}', '/', '%'); +} + +bool Parser::matches_regular_character() const +{ + return !matches_delimiter() && !matches_whitespace(); +} + void Parser::consume_eol() { if (m_reader.matches("\r\n")) { diff --git a/Userland/Libraries/LibPDF/Parser.h b/Userland/Libraries/LibPDF/Parser.h index c983628e49..56921916a3 100644 --- a/Userland/Libraries/LibPDF/Parser.h +++ b/Userland/Libraries/LibPDF/Parser.h @@ -27,6 +27,8 @@ public: }; XRefTableAndTrailer parse_last_xref_table_and_trailer(); + NonnullRefPtr parse_indirect_value_at_offset(size_t offset); + private: bool parse_header(); XRefTable parse_xref_table(); @@ -48,6 +50,7 @@ private: Value parse_value(); Value parse_possible_indirect_value_or_ref(); NonnullRefPtr parse_indirect_value(int index, int generation); + NonnullRefPtr parse_indirect_value(); Value 
parse_number(); NonnullRefPtr parse_name(); NonnullRefPtr parse_string(); @@ -60,6 +63,8 @@ private: bool matches_eol() const; bool matches_whitespace() const; bool matches_number() const; + bool matches_delimiter() const; + bool matches_regular_character() const; void consume_eol(); bool consume_whitespace(); diff --git a/Userland/Libraries/LibPDF/Reader.h b/Userland/Libraries/LibPDF/Reader.h index bc32416527..2300c43041 100644 --- a/Userland/Libraries/LibPDF/Reader.h +++ b/Userland/Libraries/LibPDF/Reader.h @@ -123,7 +123,8 @@ public: ALWAYS_INLINE void load() { m_offset = m_saved_offsets.take_last(); } ALWAYS_INLINE void discard() { m_saved_offsets.take_last(); } - void dump_state() +#if PDF_DEBUG + void dump_state() const { StringBuilder builder; builder.append("Reader State Dump\n\n"); @@ -143,6 +144,7 @@ public: auto str = builder.to_string(); dbgputstr(str.characters(), str.length()); } +#endif private: ReadonlyBytes m_bytes;