1
Fork 0
mirror of https://github.com/RGBCube/serenity synced 2025-07-25 15:47:44 +00:00

LibPDF: Scan for PDF file start in first 1024 bytes

Other readers do this too, and files depend on this.

Fixes opening these four files from the PDFA 0000.zip dataset:

* 0000015.pdf
  Starts with `C:\web\webeuncet\_cat\_docs\_publics\` before header
* 0000408.pdf
  Starts with UTF-8 BOM
* 0000524.pdf
  Starts with 867 bytes of HTML containing a PHP backtrace
* 0000680.pdf
  Starts with `C:\web\webeuncet\_cat\_docs\_publics\` too
This commit is contained in:
Nico Weber 2023-10-22 21:24:52 -04:00 committed by Andreas Kling
parent 9495f64f91
commit 0bb0c7dac2
3 changed files with 21 additions and 0 deletions

View file

@ -72,6 +72,19 @@ PDFErrorOr<Value> DocumentParser::parse_object_with_index(u32 index)
return indirect_value->value();
}
// Finds the offset of the "%PDF-" marker near the beginning of `bytes`.
// Returns that offset on success, or a Parse error if the marker is not
// present inside the search window.
PDFErrorOr<size_t> DocumentParser::scan_for_header_start(ReadonlyBytes bytes)
{
    // PDF 1.7 spec, APPENDIX H, 3.4.1 "File Header":
    // "13. Acrobat viewers require only that the header appear somewhere within the first 1024 bytes of the file."
    // ...which of course means files depend on it.
    // All offsets in the file are relative to the header start, not to the start of the file.

    // The window is capped at 1024 bytes minus the length of a version string like "1.4"
    // (presumably so the complete "%PDF-x.y" header still ends within the first 1024 bytes),
    // and never extends past the end of the input.
    size_t const window_length = min(bytes.size(), 1024 - "1.4"sv.length());
    StringView search_window { bytes.data(), window_length };

    if (Optional<size_t> header_offset = search_window.find("%PDF-"sv); header_offset.has_value())
        return header_offset.value();

    return Error { Error::Type::Parse, "Failed to find PDF start" };
}
PDFErrorOr<Version> DocumentParser::parse_header()
{
m_reader.move_to(0);