From a533ea7ae694a0e72be05e5ce63a6098613a3852 Mon Sep 17 00:00:00 2001 From: Rodrigo Tobar Date: Mon, 6 Feb 2023 00:05:33 +0800 Subject: [PATCH] LibPDF: Improve stream parsing When parsing streams we rely on a /Length item being defined in the stream's dictionary to know how much data comprises the stream. Its value is usually a direct value, but it can be indirect. There was, however, a contradiction in the code: the condition that allowed it to read and use the /Length value required it to be a direct value, but the actual code using the value would have worked with indirect ones. This meant that indirect /Length values triggered the fallback, "manual" stream parsing code. On the other hand, this latter code was also buggy, because it relied on the "endstream" keyword appearing on a separate line, which isn't always the case. This commit fixes the bug in the manual stream parsing scenario, and also allows indirect /Length values to be used to parse streams more directly and avoid the manual approach. The main caveat to this second change is that for a brief period of time the Document is not able to resolve references (i.e., before the xref table itself is parsed). Any parsing happening before that (e.g., the linearization dictionary) must therefore use the manual stream parsing approach. 
--- Userland/Libraries/LibPDF/Document.h | 5 +++++ Userland/Libraries/LibPDF/DocumentParser.h | 2 ++ Userland/Libraries/LibPDF/Parser.cpp | 18 +++++++----------- 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/Userland/Libraries/LibPDF/Document.h b/Userland/Libraries/LibPDF/Document.h index 85a64baa00..bbb7bc164f 100644 --- a/Userland/Libraries/LibPDF/Document.h +++ b/Userland/Libraries/LibPDF/Document.h @@ -119,6 +119,11 @@ public: return cast_to(TRY(resolve(value))); } + /// Whether this Document is ready to resolve references, which is usually + /// true, except just before the XRef table is parsed (and while the linearization + /// dict is being read). + bool can_resolve_refefences() { return m_parser->can_resolve_references(); } + private: explicit Document(NonnullRefPtr const& parser); diff --git a/Userland/Libraries/LibPDF/DocumentParser.h b/Userland/Libraries/LibPDF/DocumentParser.h index 590bca7f1a..0f58821a73 100644 --- a/Userland/Libraries/LibPDF/DocumentParser.h +++ b/Userland/Libraries/LibPDF/DocumentParser.h @@ -25,6 +25,8 @@ public: // Parses the header and initializes the xref table and trailer PDFErrorOr initialize(); + bool can_resolve_references() { return m_xref_table; }; + PDFErrorOr parse_object_with_index(u32 index); // Specialized version of parse_dict which aborts early if the dict being parsed diff --git a/Userland/Libraries/LibPDF/Parser.cpp b/Userland/Libraries/LibPDF/Parser.cpp index a51f0f8e53..374230a49b 100644 --- a/Userland/Libraries/LibPDF/Parser.cpp +++ b/Userland/Libraries/LibPDF/Parser.cpp @@ -446,7 +446,7 @@ PDFErrorOr> Parser::parse_stream(NonnullRefPtrget(CommonNames::Length); - if (maybe_length.has_value() && (!maybe_length->has())) { + if (maybe_length.has_value() && m_document->can_resolve_refefences()) { // The PDF writer has kindly provided us with the direct length of the stream m_reader.save(); auto length = TRY(m_document->resolve_to(maybe_length.value())); @@ -457,17 +457,13 @@ PDFErrorOr> 
Parser::parse_stream(NonnullRefPtr