From d1bc89e30b962d2e4805c0fcc7f2caf4d51267b7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Julian=20Offenh=C3=A4user?=
Date: Wed, 23 Nov 2022 11:33:14 +0100
Subject: [PATCH] LibPDF: Try to repair XRef tables with broken indices

An XRef table usually starts with an object number of zero. While it
could technically start at any other number, this is a tell-tale sign
of a broken table.

For the "broken" documents I encountered, this always meant that some
objects must have been removed from the start of the table, without
updating the following indices. When this is the case, the document is
not able to be read normally. However, most other PDF parsers seem to
know of this quirk and fix the XRef table automatically.

Likewise, we now check for this exact case, and if it matches up with
what we expect, we update the XRef table such that all object numbers
match the actual objects found in the file again.
---
Review notes (this section is ignored by `git am`):
* The scan for the first valid xref entry is bounded by the table size, and
  the function returns early when the table has no valid entry at all.
* A compressed entry whose object stream is not present in the table now
  cancels the rebuild instead of being looked up.
* The blob hash on the DocumentParser.cpp "index" line predates these changes.

 Userland/Libraries/LibPDF/DocumentParser.cpp | 77 +++++++++++++++++++-
 Userland/Libraries/LibPDF/DocumentParser.h   |  1 +
 Userland/Libraries/LibPDF/XRefTable.h        |  2 +
 3 files changed, 79 insertions(+), 1 deletion(-)

diff --git a/Userland/Libraries/LibPDF/DocumentParser.cpp b/Userland/Libraries/LibPDF/DocumentParser.cpp
index 3b40fd1eb4..13161ffd54 100644
--- a/Userland/Libraries/LibPDF/DocumentParser.cpp
+++ b/Userland/Libraries/LibPDF/DocumentParser.cpp
@@ -197,7 +197,7 @@ PDFErrorOr DocumentParser::initialize_linearized_xref_table()
     auto main_xref_table = TRY(parse_xref_table());
     TRY(m_xref_table->merge(move(*main_xref_table)));
 
-    return {};
+    return validate_xref_table_and_fix_if_necessary();
 }
 
 PDFErrorOr DocumentParser::initialize_hint_tables()
@@ -275,6 +275,81 @@ PDFErrorOr DocumentParser::initialize_non_linearized_xref_table()
     m_xref_table = TRY(parse_xref_table());
     if (!m_trailer)
         m_trailer = TRY(parse_file_trailer());
+    return validate_xref_table_and_fix_if_necessary();
+}
+
+PDFErrorOr DocumentParser::validate_xref_table_and_fix_if_necessary()
+{
+    // Heuristic repair for xref tables whose entries are all shifted by the same
+    // number of slots.
+    //
+    // While an xref table may start with an object number other than zero, this is
+    // very uncommon and likely a sign of a document with broken indices.
+    // Like most other PDF parsers seem to do, we still try to salvage the situation:
+    // if every in-use entry leads to an object whose number equals the entry's index
+    // minus the count of leading invalid entries, those leading entries are removed
+    // so that table indices and object numbers line up again.
+    // NOTE: This is probably not spec-compliant behavior.
+    //
+    // Errors from parse_number() / parse_indirect_value() are propagated via TRY().
+    // NOTE(review): m_reader is moved while probing and not moved back here;
+    // presumably callers reposition it before parsing anything else - confirm.
+    auto& entries = m_xref_table->entries();
+
+    // Count the leading entries without a byte offset. The scan is bounded by the
+    // table size so that an empty table, or one without a single valid entry, can
+    // never be indexed past its end.
+    size_t first_valid_index = 0;
+    while (first_valid_index < entries.size()
+        && m_xref_table->byte_offset_for_object(first_valid_index) == invalid_byte_offset)
+        first_valid_index++;
+
+    // Nothing to fix if the table starts at object zero, and nothing to check
+    // against the file if no entry is valid at all.
+    if (first_valid_index == 0 || first_valid_index == entries.size())
+        return {};
+
+    bool need_to_rebuild_table = true;
+    for (size_t i = first_valid_index; i < entries.size(); ++i) {
+        if (!entries[i].in_use)
+            continue;
+
+        size_t actual_object_number = 0;
+        if (entries[i].compressed) {
+            auto object_stream_index = m_xref_table->object_stream_for_object(i);
+            if (!m_xref_table->has_object(object_stream_index)) {
+                // The containing object stream is unknown to the table, so this is
+                // not the simple "everything is shifted" case we know how to fix.
+                need_to_rebuild_table = false;
+                break;
+            }
+            // NOTE(review): the number read here is whatever comes first at the
+            // object stream's offset; verify that this really is object i's number.
+            auto stream_offset = m_xref_table->byte_offset_for_object(object_stream_index);
+            m_reader.move_to(stream_offset);
+            auto first_number = TRY(parse_number());
+            actual_object_number = first_number.get_u32();
+        } else {
+            auto byte_offset = m_xref_table->byte_offset_for_object(i);
+            m_reader.move_to(byte_offset);
+            auto indirect_value = TRY(parse_indirect_value());
+            actual_object_number = indirect_value->index();
+        }
+
+        if (actual_object_number != i - first_valid_index) {
+            // Our suspicion was wrong, not all object numbers are shifted equally.
+            // This could mean that the document is hopelessly broken, or it just
+            // starts at a non-zero object index for some reason.
+            need_to_rebuild_table = false;
+            break;
+        }
+    }
+
+    if (need_to_rebuild_table) {
+        warnln("Broken xref table detected, trying to fix it.");
+        entries.remove(0, first_valid_index);
+    }
+
     return {};
 }
 
diff --git a/Userland/Libraries/LibPDF/DocumentParser.h b/Userland/Libraries/LibPDF/DocumentParser.h
index 9f544f4b22..fdcb06d886 100644
--- a/Userland/Libraries/LibPDF/DocumentParser.h
+++ b/Userland/Libraries/LibPDF/DocumentParser.h
@@ -79,6 +79,7 @@ private:
     PDFErrorOr initialize_linearization_dict();
     PDFErrorOr initialize_linearized_xref_table();
     PDFErrorOr initialize_non_linearized_xref_table();
+    PDFErrorOr validate_xref_table_and_fix_if_necessary();
     PDFErrorOr initialize_hint_tables();
     PDFErrorOr parse_page_offset_hint_table(ReadonlyBytes hint_stream_bytes);
     Vector parse_all_page_offset_hint_table_entries(PageOffsetHintTable const&, ReadonlyBytes hint_stream_bytes);
diff --git a/Userland/Libraries/LibPDF/XRefTable.h b/Userland/Libraries/LibPDF/XRefTable.h
index 3aeb75d449..20b4cef5c7 100644
--- a/Userland/Libraries/LibPDF/XRefTable.h
+++ b/Userland/Libraries/LibPDF/XRefTable.h
@@ -68,6 +68,8 @@ public:
         m_entries.append(entry);
     }
 
+    ALWAYS_INLINE Vector& entries() { return m_entries; }
+
     [[nodiscard]] ALWAYS_INLINE bool has_object(size_t index) const
     {
         return index < m_entries.size() && m_entries[index].byte_offset != -1;