1
Fork 0
mirror of https://github.com/RGBCube/serenity synced 2025-07-27 06:37:44 +00:00

LibPDF: Scan for PDF file start in first 1024 bytes

Other PDF readers do this too, and real-world files depend on it.

Fixes opening these four files from the PDFA 0000.zip dataset:

* 0000015.pdf
  Starts with `C:\web\webeuncet\_cat\_docs\_publics\` before header
* 0000408.pdf
  Starts with UTF-8 BOM
* 0000524.pdf
  Starts with 867 bytes of HTML containing a PHP backtrace
* 0000680.pdf
  Starts with `C:\web\webeuncet\_cat\_docs\_publics\` too
This commit is contained in:
Nico Weber 2023-10-22 21:24:52 -04:00 committed by Andreas Kling
parent 9495f64f91
commit 0bb0c7dac2
3 changed files with 21 additions and 0 deletions

View file

@@ -18,6 +18,8 @@ struct Version {
class DocumentParser final : public RefCounted<DocumentParser>
, public Parser {
public:
static PDFErrorOr<size_t> scan_for_header_start(ReadonlyBytes);
DocumentParser(Document*, ReadonlyBytes);
enum class LinearizationResult {