1
Fork 0
mirror of https://github.com/RGBCube/serenity synced 2025-07-25 21:47:46 +00:00

LibPDF: Scan for PDF file start in first 1024 bytes

Other PDF readers do this too, and some real-world files depend on it.

Fixes opening these four files from the PDFA 0000.zip dataset:

* 0000015.pdf
  Starts with `C:\web\webeuncet\_cat\_docs\_publics\` before header
* 0000408.pdf
  Starts with UTF-8 BOM
* 0000524.pdf
  Starts with 867 bytes of HTML containing a PHP backtrace
* 0000680.pdf
  Starts with `C:\web\webeuncet\_cat\_docs\_publics\` too
This commit is contained in:
Nico Weber 2023-10-22 21:24:52 -04:00 committed by Andreas Kling
parent 9495f64f91
commit 0bb0c7dac2
3 changed files with 21 additions and 0 deletions

View file

@ -97,6 +97,12 @@ ByteString Document::text_string_to_utf8(ByteString const& text_string)
PDFErrorOr<NonnullRefPtr<Document>> Document::create(ReadonlyBytes bytes)
{
size_t offset_to_start = TRY(DocumentParser::scan_for_header_start(bytes));
if (offset_to_start != 0) {
dbgln("warning: PDF header not at start of file, skipping {} bytes", offset_to_start);
bytes = bytes.slice(offset_to_start);
}
auto parser = adopt_ref(*new DocumentParser({}, bytes));
auto document = adopt_ref(*new Document(parser));