From f9beff7b5e6adad71922fc080a89b27f45d544ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Julian=20Offenh=C3=A4user?= Date: Mon, 15 Aug 2022 12:04:59 +0200 Subject: [PATCH] LibPDF: Initial work on parsing xref streams Since PDF version 1.5, a document may omit the xref table in favor of a new kind of xref stream object. This is used to reference so-called "compressed" objects that are part of an object stream. With this patch we are able to parse this new kind of xref object, but we'll have to implement object streams to use them correctly. --- Userland/Libraries/LibPDF/CommonNames.h | 1 + Userland/Libraries/LibPDF/DocumentParser.cpp | 93 +++++++++++++++++++- Userland/Libraries/LibPDF/DocumentParser.h | 1 + Userland/Libraries/LibPDF/XRefTable.h | 17 ++++ 4 files changed, 108 insertions(+), 4 deletions(-) diff --git a/Userland/Libraries/LibPDF/CommonNames.h b/Userland/Libraries/LibPDF/CommonNames.h index 790690d45d..a436d1b36e 100644 --- a/Userland/Libraries/LibPDF/CommonNames.h +++ b/Userland/Libraries/LibPDF/CommonNames.h @@ -70,6 +70,7 @@ A(HTO) \ A(ICCBased) \ A(ID) \ + A(Index) \ A(JBIG2Decode) \ A(JPXDecode) \ A(Kids) \ diff --git a/Userland/Libraries/LibPDF/DocumentParser.cpp b/Userland/Libraries/LibPDF/DocumentParser.cpp index e34302928a..0dee963839 100644 --- a/Userland/Libraries/LibPDF/DocumentParser.cpp +++ b/Userland/Libraries/LibPDF/DocumentParser.cpp @@ -6,6 +6,7 @@ #include #include +#include #include #include #include @@ -178,7 +179,8 @@ PDFErrorOr DocumentParser::initialize_linearized_xref_table() // The linearization parameter dictionary has just been parsed, and the xref table // comes immediately after it. We are in the correct spot. m_xref_table = TRY(parse_xref_table()); - m_trailer = TRY(parse_file_trailer()); + if (!m_trailer) + m_trailer = TRY(parse_file_trailer()); // Also parse the main xref table and merge into the first-page xref table. Note // that we don't use the main xref table offset from the linearization dict because @@ -188,6 +190,7 @@ PDFErrorOr DocumentParser::initialize_linearized_xref_table() m_reader.move_to(main_xref_table_offset); auto main_xref_table = TRY(parse_xref_table()); TRY(m_xref_table->merge(move(*main_xref_table))); + return {}; } @@ -264,14 +267,96 @@ PDFErrorOr DocumentParser::initialize_non_linearized_xref_table() m_reader.move_to(xref_offset); m_xref_table = TRY(parse_xref_table()); - m_trailer = TRY(parse_file_trailer()); + if (!m_trailer) + m_trailer = TRY(parse_file_trailer()); return {}; } +PDFErrorOr> DocumentParser::parse_xref_stream() +{ + auto first_number = TRY(parse_number()); + auto second_number = TRY(parse_number()); + + if (!m_reader.matches("obj")) + return error("Malformed xref object"); + m_reader.move_by(3); + if (m_reader.matches_eol()) + m_reader.consume_eol(); + + auto dict = TRY(parse_dict()); + auto type = TRY(dict->get_name(m_document, CommonNames::Type))->name(); + if (type != "XRef") + return error("Malformed xref dictionary"); + + auto field_sizes = TRY(dict->get_array(m_document, "W")); + if (field_sizes->size() != 3) + return error("Malformed xref dictionary"); + + auto object_count = dict->get_value("Size").get(); + + Vector> subsection_indices; + if (dict->contains(CommonNames::Index)) { + auto index_array = TRY(dict->get_array(m_document, CommonNames::Index)); + if (index_array->size() % 2 != 0) + return error("Malformed xref dictionary"); + + for (size_t i = 0; i < index_array->size(); i += 2) + subsection_indices.append({ index_array->at(i).get(), index_array->at(i + 1).get() - 1 }); + } else { + subsection_indices.append({ 0, object_count - 1 }); + } + auto stream = TRY(parse_stream(dict)); + auto table = adopt_ref(*new XRefTable()); + + auto field_to_long = [](Span field) -> long { + long value = 0; + const u8 max = (field.size() - 1) * 8; + for (size_t i = 0; i < field.size(); ++i) { + value |= static_cast(field[i]) << (max - (i * 8)); + } + return value; + }; + + size_t byte_index = 0; + size_t subsection_index = 0; + + Vector entries; + + for (int entry_index = 0; entry_index < object_count; ++entry_index) { + Array fields; + for (size_t field_index = 0; field_index < 3; ++field_index) { + auto field_size = field_sizes->at(field_index).get_u32(); + auto field = stream->bytes().slice(byte_index, field_size); + fields[field_index] = field_to_long(field); + byte_index += field_size; + } + + u8 type = fields[0]; + if (!field_sizes->at(0).get_u32()) + type = 1; + + entries.append({ fields[1], static_cast(fields[2]), type != 0, type == 2 }); + + auto indices = subsection_indices[subsection_index]; + if (entry_index >= indices.get<1>()) { + table->add_section({ indices.get<0>(), indices.get<1>(), entries }); + entries.clear(); + subsection_index++; + } + } + + m_trailer = dict; + + return table; +} + PDFErrorOr> DocumentParser::parse_xref_table() { - if (!m_reader.matches("xref")) - return error("Expected \"xref\""); + if (!m_reader.matches("xref")) { + // Since version 1.5, there may be a cross-reference stream instead + return parse_xref_stream(); + } + m_reader.move_by(4); if (!m_reader.consume_eol()) return error("Expected newline after \"xref\""); diff --git a/Userland/Libraries/LibPDF/DocumentParser.h b/Userland/Libraries/LibPDF/DocumentParser.h index fef5f40ad4..6b6814bb75 100644 --- a/Userland/Libraries/LibPDF/DocumentParser.h +++ b/Userland/Libraries/LibPDF/DocumentParser.h @@ -82,6 +82,7 @@ private: PDFErrorOr initialize_hint_tables(); PDFErrorOr parse_page_offset_hint_table(ReadonlyBytes hint_stream_bytes); Vector parse_all_page_offset_hint_table_entries(PageOffsetHintTable const&, ReadonlyBytes hint_stream_bytes); + PDFErrorOr> parse_xref_stream(); PDFErrorOr> parse_xref_table(); PDFErrorOr> parse_file_trailer(); diff --git a/Userland/Libraries/LibPDF/XRefTable.h b/Userland/Libraries/LibPDF/XRefTable.h index 01c77197db..8c5abe14c9 100644 --- a/Userland/Libraries/LibPDF/XRefTable.h +++ b/Userland/Libraries/LibPDF/XRefTable.h @@ -19,6 +19,7 @@ struct XRefEntry { long byte_offset { invalid_byte_offset }; u16 generation_number { 0 }; bool in_use { false }; + bool compressed { false }; }; struct XRefSection { @@ -77,18 +78,34 @@ public: return m_entries[index].byte_offset; } + [[nodiscard]] ALWAYS_INLINE long object_stream_for_object(size_t index) const + { + return byte_offset_for_object(index); + } + [[nodiscard]] ALWAYS_INLINE u16 generation_number_for_object(size_t index) const { VERIFY(has_object(index)); return m_entries[index].generation_number; } + [[nodiscard]] ALWAYS_INLINE u16 object_stream_index_for_object(size_t index) const + { + return generation_number_for_object(index); + } + [[nodiscard]] ALWAYS_INLINE bool is_object_in_use(size_t index) const { VERIFY(has_object(index)); return m_entries[index].in_use; } + [[nodiscard]] ALWAYS_INLINE bool is_object_compressed(size_t index) const + { + VERIFY(has_object(index)); + return m_entries[index].compressed; + } + private: friend struct AK::Formatter;