1
Fork 0
mirror of https://github.com/RGBCube/serenity synced 2025-07-28 19:27:36 +00:00

LibPDF: Parse page structures

This commit introduces the ability to parse the document catalog dict,
as well as the page tree and individual pages. Pages obviously aren't
fully parsed, as we won't care about most of the fields until we
start actually rendering PDFs.

One of the primary benefits of the PDF format is laziness. PDFs are
not meant to be parsed all at once, and the same is true for pages.
When a Document is constructed, it builds a map of page number to
object index, but it does not fetch or parse any of the pages. A page
is parsed only when a caller requests that particular page, and it is
cached from then on.

This commit also adds an object_cast function, which logs bad casts
if DEBUG_PDF is set. In addition, utility functions were added to
ArrayObject and DictObject to fetch each type of object from these
collections without having to cast manually.
This commit is contained in:
Matthew Olsson 2021-05-08 14:57:49 -07:00 committed by Andreas Kling
parent 72f693e9ed
commit 8c745ad0d9
11 changed files with 320 additions and 6 deletions

View file

@ -6,6 +6,8 @@
#pragma once
#include <AK/Format.h>
#include <AK/HashMap.h>
#include <AK/RefCounted.h>
#include <LibPDF/Object.h>
#include <LibPDF/Parser.h>
@ -13,6 +15,19 @@
namespace PDF {
// An axis-aligned rectangle stored as two corner points: the lower-left
// corner and the upper-right corner. Used as the type of Page::media_box.
struct Rectangle {
// Coordinates of the lower-left corner.
float lower_left_x;
float lower_left_y;
// Coordinates of the upper-right corner.
float upper_right_x;
float upper_right_y;
};
// The subset of a page's data that Document currently exposes. Returned by
// value from Document::get_page() and stored in Document's page cache.
struct Page {
// Resource dictionary for the page (never null).
// NOTE(review): presumably the page's /Resources entry - confirm in Document::get_page().
NonnullRefPtr<DictObject> resources;
// Media box of the page, as a lower-left/upper-right rectangle.
Rectangle media_box;
// Contents of the page, held as a generic Object (never null).
// NOTE(review): presumably the page's /Contents entry - confirm in Document::get_page().
NonnullRefPtr<Object> contents;
};
class Document final : public RefCounted<Document> {
public:
explicit Document(const ReadonlyBytes& bytes);
@ -21,6 +36,14 @@ public:
ALWAYS_INLINE const DictObject& trailer() const { return *m_trailer; }
[[nodiscard]] Value get_or_load_value(u32 index);
[[nodiscard]] u32 get_first_page_index() const;
[[nodiscard]] u32 get_page_count() const;
[[nodiscard]] Page get_page(u32 index);
ALWAYS_INLINE Value get_value(u32 index) const
{
return m_values.get(index).value_or({});
@ -32,11 +55,53 @@ public:
m_values.set(index, value);
}
// Strips away the layer of indirection by turning indirect value
// refs into the value they reference, and indirect values into
// the value being wrapped.
Value resolve(const Value& value);
// Like resolve, but unwraps the Value into the given type. Accepts
// any object type, and the three primitive Value types.
template<IsValueType T>
UnwrappedValueType<T> resolve_to(const Value& value);
private:
void build_page_tree();
Parser m_parser;
XRefTable m_xref_table;
RefPtr<DictObject> m_trailer;
RefPtr<DictObject> m_catalog;
Vector<u32> m_page_object_indices;
HashMap<u32, Page> m_pages;
HashMap<u32, Value> m_values;
};
}
namespace AK {
// Formatter specialization that lets a PDF::Rectangle be passed to the AK
// formatting functions. The rectangle is rendered as its lower-left ("ll")
// and upper-right ("ur") corner coordinates.
template<>
struct Formatter<PDF::Rectangle> : Formatter<StringView> {
    void format(FormatBuilder& builder, const PDF::Rectangle& rectangle)
    {
        // Build the textual form first, then delegate the actual output to
        // the StringView formatter this specialization derives from.
        auto rendered = String::formatted(
            "Rectangle {{ ll=({}, {}), ur=({}, {}) }}",
            rectangle.lower_left_x,
            rectangle.lower_left_y,
            rectangle.upper_right_x,
            rectangle.upper_right_y);

        Formatter<StringView>::format(builder, rendered);
    }
};
// Formatter specialization that lets a PDF::Page be passed to the AK
// formatting functions. The output is a multi-line "Page { ... }" listing of
// the page's resources, contents and media box.
template<>
struct Formatter<PDF::Page> : Formatter<StringView> {
    void format(FormatBuilder& builder, const PDF::Page& page)
    {
        // Stringify the two object-valued fields; the argument 1 is forwarded
        // to to_string() unchanged (presumably an indentation level - confirm
        // against Object::to_string()).
        auto resources_text = page.resources->to_string(1);
        auto contents_text = page.contents->to_string(1);

        // media_box is formatted through Formatter<PDF::Rectangle>.
        auto rendered = String::formatted(
            "Page {{\n resources={}\n contents={}\n media_box={}\n}}",
            resources_text,
            contents_text,
            page.media_box);

        Formatter<StringView>::format(builder, rendered);
    }
};
}