LibPDF: Scan for PDF file start in first 1024 bytes

Other PDF readers do this too, and real-world files depend on it.

Fixes opening these four files from the PDFA 0000.zip dataset:

* 0000015.pdf: Starts with `C:\web\webeuncet\_cat\_docs\_publics\` before the header
* 0000408.pdf: Starts with a UTF-8 BOM
* 0000524.pdf: Starts with 867 bytes of HTML containing a PHP backtrace
* 0000680.pdf: Starts with `C:\web\webeuncet\_cat\_docs\_publics\` too
2025-07-27 08:07:34 +00:00 · 2023-10-22 21:24:52 -04:00 · 2023-10-22 21:24:52 -04:00 · 0bb0c7dac2
commit 0bb0c7dac2
parent 9495f64f91
3 changed files with 21 additions and 0 deletions
--- a/Userland/Libraries/LibPDF/DocumentParser.h
+++ b/Userland/Libraries/LibPDF/DocumentParser.h
@@ -18,6 +18,8 @@ struct Version {
 class DocumentParser final : public RefCounted<DocumentParser>
    , public Parser {
 public:
+    static PDFErrorOr<size_t> scan_for_header_start(ReadonlyBytes);
+
    DocumentParser(Document*, ReadonlyBytes);

    enum class LinearizationResult {