1
Fork 0
mirror of https://github.com/RGBCube/serenity synced 2025-05-31 14:28:12 +00:00
serenity/Userland/Libraries/LibPDF/Document.cpp
Matthew Olsson 8c745ad0d9 LibPDF: Parse page structures
This commit introduces the ability to parse the document catalog dict,
as well as the page tree and individual pages. Pages obviously aren't
fully parsed, as we won't care about most of the fields until we
start actually rendering PDFs.

One of the primary benefits of the PDF format is laziness. PDFs are
not meant to be parsed all at once, and the same is true for pages.
When a Document is constructed, it builds a map of page number to
object index, but it does not fetch and parse any of the pages. A page
is only parsed when a caller requests that particular page (and is
cached going forwards).

This commit also adds an object_cast function which logs bad casts
if DEBUG_PDF is set. In addition, utility functions were added to
ArrayObject and DictObject to get all types of objects from the
collections without having to cast manually.
2021-05-10 10:32:39 +02:00

134 lines
3.7 KiB
C++

/*
* Copyright (c) 2021, Matthew Olsson <mattco@serenityos.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#include <LibPDF/Document.h>
#include <LibPDF/Parser.h>
namespace PDF {
// Constructs a Document over the given bytes: runs the parser's validation,
// reads the last xref table and trailer, fetches the catalog dictionary from
// the trailer's "Root" entry and records the object index of every page.
// Page structures themselves are only built when get_page() is called.
Document::Document(const ReadonlyBytes& bytes)
    : m_parser(Parser({}, bytes))
{
    VERIFY(m_parser.perform_validation());

    auto [last_xref_table, last_trailer] = m_parser.parse_last_xref_table_and_trailer();
    m_xref_table = last_xref_table;
    m_trailer = last_trailer;

    // The catalog is looked up through the trailer that was just stored.
    m_catalog = m_trailer->get_dict(this, "Root");

    build_page_tree();
}
// Returns the value of the object with the given index. If get_value() already
// yields a value it is returned directly; otherwise the object is parsed at
// the byte offset recorded in the xref table, stored in m_values, and returned.
// The index must be present in the xref table.
Value Document::get_or_load_value(u32 index)
{
    if (auto known_value = get_value(index))
        return known_value;

    VERIFY(m_xref_table.has_object(index));
    auto offset = m_xref_table.byte_offset_for_object(index);

    auto parsed = m_parser.parse_indirect_value_at_offset(offset);
    // The object found at that offset has to be the one that was asked for.
    VERIFY(parsed->index() == index);

    Value loaded_value = parsed->value();
    m_values.set(index, loaded_value);
    return loaded_value;
}
// Returns the index of the page that should be shown first.
// Currently this is always the very first page.
u32 Document::get_first_page_index() const
{
    // FIXME: A PDF can have a different default first page, which
    //        should be fetched and returned here
    constexpr u32 default_first_page_index = 0;
    return default_first_page_index;
}
// Returns the number of pages recorded in m_page_object_indices.
u32 Document::get_page_count() const
{
    return static_cast<u32>(m_page_object_indices.size());
}
// Returns the page with the given zero-based index, which must be smaller than
// the number of known pages. A page is built from its dictionary the first
// time it is requested and served from m_pages on every later request.
Page Document::get_page(u32 index)
{
    VERIFY(index < m_page_object_indices.size());

    if (auto cached = m_pages.get(index); cached.has_value())
        return cached.value();

    auto object_index = m_page_object_indices[index];
    auto page_dict = resolve_to<DictObject>(get_or_load_value(object_index));

    auto resources = page_dict->get_dict(this, "Resources");

    // "MediaBox" is read as an array of four numbers, passed to Rectangle in order.
    auto box_values = page_dict->get_array(this, "MediaBox");
    auto coordinate = [&](size_t position) {
        return box_values->at(position).to_float();
    };
    auto media_box = Rectangle {
        coordinate(0),
        coordinate(1),
        coordinate(2),
        coordinate(3),
    };

    auto contents = page_dict->get_object(this, "Contents");

    Page page { resources, media_box, contents };
    m_pages.set(index, page);
    return page;
}
// Strips one level of indirection from a value:
//  - non-object values are returned unchanged,
//  - an indirect reference yields the (possibly freshly loaded) referenced value,
//  - an indirect value yields the value it wraps,
//  - any other object is returned as is.
Value Document::resolve(const Value& value)
{
    if (!value.is_object())
        return value;

    auto object = value.as_object();

    // FIXME: Surely indirect PDF objects can't contain another indirect PDF object,
    //        right? Unsure from the spec, but if they can, these return values would have
    //        to be wrapped with another resolve() call.

    if (object->is_indirect_value_ref()) {
        auto reference = static_cast<NonnullRefPtr<IndirectValueRef>>(object);
        return get_or_load_value(reference->index());
    }

    if (object->is_indirect_value()) {
        auto indirect_value = static_cast<NonnullRefPtr<IndirectValue>>(object);
        return indirect_value->value();
    }

    return object;
}
// Resolves the given value (see resolve()) and unwraps it as T: bool, int and
// float use the matching Value accessor, object types go through object_cast.
template<IsValueType T>
UnwrappedValueType<T> Document::resolve_to(const Value& value)
{
    auto resolved = resolve(value);

    if constexpr (IsSame<T, bool>) {
        return resolved.as_bool();
    } else if constexpr (IsSame<T, int>) {
        return resolved.as_int();
    } else if constexpr (IsSame<T, float>) {
        return resolved.as_float();
    } else if constexpr (IsObject<T>) {
        return object_cast<T>(resolved.as_object());
    } else {
        // None of the supported kinds matched T.
        VERIFY_NOT_REACHED();
    }
}
void Document::build_page_tree()
{
auto page_tree = m_catalog->get_dict(this, "Pages");
auto kids_array = page_tree->get_array(this, "Kids");
auto page_count = page_tree->get("Count").value().as_int();
if (static_cast<size_t>(page_count) != kids_array->elements().size()) {
// FIXME: Support recursive PDF page tree structures
VERIFY_NOT_REACHED();
}
for (auto& value : *kids_array) {
auto reference = resolve_to<IndirectValueRef>(value);
m_page_object_indices.append(reference->index());
}
}
}