From a1f17bd6432e842816cd851d319b85683f63b4be Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Sat, 21 Oct 2023 22:19:06 -0400 Subject: [PATCH] LibPDF: Skip inline image data in operator stream Inline images can contain arbitrary binary data in the operator stream, greatly confusing the operator parser. Just skip them for now. They'll produce a `Rendering of feature not supported: draw operation: inline_image_begin` diag as usual, so we won't forget about it. After #21536, reduces number of crashes on 300 random PDFs from the web (the first 300 from 0000.zip from https://pdfa.org/new-large-scale-pdf-corpus-now-publicly-available/) from 23 (7%) to 22 (7%). On a larger sample (`Meta/test_pdf.py -n 500 ~/Downloads/0000`), reduces number of crashes from 53 (10.6%) with 36 distinct crash stacks to 46 (9.2%) with 33 distinct stacks. --- Userland/Libraries/LibPDF/Parser.cpp | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/Userland/Libraries/LibPDF/Parser.cpp b/Userland/Libraries/LibPDF/Parser.cpp index ebdc4c8002..c16872b20e 100644 --- a/Userland/Libraries/LibPDF/Parser.cpp +++ b/Userland/Libraries/LibPDF/Parser.cpp @@ -531,10 +531,35 @@ PDFErrorOr<Vector<Operator>> Parser::parse_operators() } auto operator_string = StringView(m_reader.bytes().slice(operator_start, m_reader.offset() - operator_start)); + m_reader.consume_whitespace(); + auto operator_type = Operator::operator_type_from_symbol(operator_string); + + // Inline images contain a dictionary containing arbitrary values between BI and ID, + // and then arbitrary binary data between ID and EI. + // This means they need a special code path in the parser, so that image data in there doesn't confuse the operator parser. 
+ if (operator_type == OperatorType::InlineImageBegin) { + if (!operator_args.is_empty()) + return error("operator args not empty on start of inline image"); + + while (!m_reader.done()) { + if (m_reader.matches("EI")) { + break; + } + m_reader.consume(); + } + + if (m_reader.done()) + return error("operator stream ended inside inline image"); + + m_reader.consume(2); // "EI" + m_reader.consume_whitespace(); + + // FIXME: Do more with inline images than just skipping them. + } + operators.append(Operator(operator_type, move(operator_args))); operator_args = Vector(); - m_reader.consume_whitespace(); continue; }