mirror of
https://github.com/RGBCube/serenity
synced 2025-06-01 03:08:13 +00:00

This commit introduces the ability to parse the document catalog dict, as well as the page tree and individual pages. Pages obviously aren't fully parsed, as we won't care about most of the fields until we start actually rendering PDFs. One of the primary benefits of the PDF format is laziness. PDFs are not meant to be parsed all at once, and the same is true for pages. When a Document is constructed, it builds a map of page number to object index, but it does not fetch and parse any of the pages. A page is only parsed when a caller requests that particular page (and is cached from then on). This commit also adds an object_cast function which logs bad casts if DEBUG_PDF is set. Finally, utility functions were added to ArrayObject and DictObject to get all types of objects from the collections, to avoid having to cast manually.
77 lines
2.2 KiB
C++
77 lines
2.2 KiB
C++
/*
|
|
* Copyright (c) 2021, Matthew Olsson <mattco@serenityos.org>
|
|
*
|
|
* SPDX-License-Identifier: BSD-2-Clause
|
|
*/
|
|
|
|
#pragma once
|
|
|
|
#include <AK/NonnullRefPtrVector.h>
|
|
#include <LibPDF/Object.h>
|
|
#include <LibPDF/Reader.h>
|
|
#include <LibPDF/XRefTable.h>
|
|
|
|
namespace PDF {
|
|
|
|
class Document;
|
|
|
|
class Parser {
|
|
public:
|
|
Parser(Badge<Document>, const ReadonlyBytes&);
|
|
|
|
bool perform_validation();
|
|
|
|
struct XRefTableAndTrailer {
|
|
XRefTable xref_table;
|
|
NonnullRefPtr<DictObject> trailer;
|
|
};
|
|
XRefTableAndTrailer parse_last_xref_table_and_trailer();
|
|
|
|
NonnullRefPtr<IndirectValue> parse_indirect_value_at_offset(size_t offset);
|
|
|
|
private:
|
|
bool parse_header();
|
|
XRefTable parse_xref_table();
|
|
NonnullRefPtr<DictObject> parse_file_trailer();
|
|
|
|
bool navigate_to_before_eof_marker();
|
|
bool navigate_to_after_startxref();
|
|
|
|
// If the PDF is linearized, the first object will be the linearization
|
|
// parameter dictionary, and it will always occur within the first 1024 bytes.
|
|
// We do a very sloppy and context-free search for this object. A return value
|
|
// of true does not necessarily mean this PDF is linearized, but a return value
|
|
// of false does mean this PDF is not linearized.
|
|
// FIXME: false doesn't guarantee non-linearization, but we VERIFY the result!
|
|
bool sloppy_is_linearized();
|
|
|
|
String parse_comment();
|
|
|
|
Value parse_value();
|
|
Value parse_possible_indirect_value_or_ref();
|
|
NonnullRefPtr<IndirectValue> parse_indirect_value(int index, int generation);
|
|
NonnullRefPtr<IndirectValue> parse_indirect_value();
|
|
Value parse_number();
|
|
NonnullRefPtr<NameObject> parse_name();
|
|
NonnullRefPtr<StringObject> parse_string();
|
|
String parse_literal_string();
|
|
String parse_hex_string();
|
|
NonnullRefPtr<ArrayObject> parse_array();
|
|
NonnullRefPtr<DictObject> parse_dict();
|
|
NonnullRefPtr<StreamObject> parse_stream(NonnullRefPtr<DictObject> dict);
|
|
|
|
bool matches_eol() const;
|
|
bool matches_whitespace() const;
|
|
bool matches_number() const;
|
|
bool matches_delimiter() const;
|
|
bool matches_regular_character() const;
|
|
|
|
void consume_eol();
|
|
bool consume_whitespace();
|
|
char consume();
|
|
void consume(char);
|
|
|
|
Reader m_reader;
|
|
};
|
|
|
|
}
|