From cf26fc23931332daaa9989a74f530bce1ec290f7 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Fri, 20 Oct 2023 22:36:10 -0400 Subject: [PATCH] LibPDF: Make parser skip whitespace after header 0000990.pdf from 0000.zip from https://pdfa.org/new-large-scale-pdf-corpus-now-publicly-available/ starts like so: ``` %PDF-1.7 4 0 obj ``` parse_header() used to put the cursor at the start of the 2nd, empty, line. initialize_linearization_dict() would then check `m_reader.matches_number()` to see if there could possibly be a linearization dict. In this case, there isn't one, but we should detect linearization dicts even if they're separated by whitespace from the first line. --- Userland/Libraries/LibPDF/DocumentParser.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/Userland/Libraries/LibPDF/DocumentParser.cpp b/Userland/Libraries/LibPDF/DocumentParser.cpp index 1f427109dd..0731da6c6d 100644 --- a/Userland/Libraries/LibPDF/DocumentParser.cpp +++ b/Userland/Libraries/LibPDF/DocumentParser.cpp @@ -92,6 +92,7 @@ PDFErrorOr DocumentParser::parse_header() return error(DeprecatedString::formatted("Unknown minor version \"{}\"", minor_ver)); m_reader.consume_eol(); + m_reader.consume_whitespace(); // Parse optional high-byte comment, which signifies a binary file // FIXME: Do something with this?