mirror of
https://github.com/RGBCube/serenity
synced 2025-05-31 13:48:12 +00:00
LibPDF: Scan for PDF file start in first 1024 bytes
Other readers do this too, and files depend on this.

Fixes opening these four files from the PDFA 0000.zip dataset:
* 0000015.pdf: starts with `C:\web\webeuncet\_cat\_docs\_publics\` before the header
* 0000408.pdf: starts with a UTF-8 BOM
* 0000524.pdf: starts with 867 bytes of HTML containing a PHP backtrace
* 0000680.pdf: starts with `C:\web\webeuncet\_cat\_docs\_publics\` too
This commit is contained in:
parent
9495f64f91
commit
0bb0c7dac2
3 changed files with 21 additions and 0 deletions
|
@ -97,6 +97,12 @@ ByteString Document::text_string_to_utf8(ByteString const& text_string)
|
||||||
|
|
||||||
PDFErrorOr<NonnullRefPtr<Document>> Document::create(ReadonlyBytes bytes)
|
PDFErrorOr<NonnullRefPtr<Document>> Document::create(ReadonlyBytes bytes)
|
||||||
{
|
{
|
||||||
|
size_t offset_to_start = TRY(DocumentParser::scan_for_header_start(bytes));
|
||||||
|
if (offset_to_start != 0) {
|
||||||
|
dbgln("warning: PDF header not at start of file, skipping {} bytes", offset_to_start);
|
||||||
|
bytes = bytes.slice(offset_to_start);
|
||||||
|
}
|
||||||
|
|
||||||
auto parser = adopt_ref(*new DocumentParser({}, bytes));
|
auto parser = adopt_ref(*new DocumentParser({}, bytes));
|
||||||
auto document = adopt_ref(*new Document(parser));
|
auto document = adopt_ref(*new Document(parser));
|
||||||
|
|
||||||
|
|
|
@ -72,6 +72,19 @@ PDFErrorOr<Value> DocumentParser::parse_object_with_index(u32 index)
|
||||||
return indirect_value->value();
|
return indirect_value->value();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
PDFErrorOr<size_t> DocumentParser::scan_for_header_start(ReadonlyBytes bytes)
{
    // Locates the "%PDF-" marker near the beginning of `bytes` and returns its byte offset.
    // Returns a parse error if the marker is not found inside the search window.
    //
    // PDF 1.7 spec, APPENDIX H, 3.4.1 "File Header":
    // "13. Acrobat viewers require only that the header appear somewhere within the first 1024 bytes of the file."
    // ...which of course means files depend on it.
    // All offsets in the file are relative to the header start, not to the start of the file.

    // The window is capped at 1024 bytes minus the length of a version string such as "1.4"
    // (presumably so that the version following the marker still fits within the first 1024 bytes).
    size_t search_window_size = min(bytes.size(), 1024 - "1.4"sv.length());
    StringView search_window { bytes.data(), search_window_size };

    auto header_offset = search_window.find("%PDF-"sv);
    if (header_offset.has_value())
        return header_offset.value();

    return Error { Error::Type::Parse, "Failed to find PDF start" };
}
|
||||||
|
|
||||||
PDFErrorOr<Version> DocumentParser::parse_header()
|
PDFErrorOr<Version> DocumentParser::parse_header()
|
||||||
{
|
{
|
||||||
m_reader.move_to(0);
|
m_reader.move_to(0);
|
||||||
|
|
|
@ -18,6 +18,8 @@ struct Version {
|
||||||
class DocumentParser final : public RefCounted<DocumentParser>
|
class DocumentParser final : public RefCounted<DocumentParser>
|
||||||
, public Parser {
|
, public Parser {
|
||||||
public:
|
public:
|
||||||
|
static PDFErrorOr<size_t> scan_for_header_start(ReadonlyBytes);
|
||||||
|
|
||||||
DocumentParser(Document*, ReadonlyBytes);
|
DocumentParser(Document*, ReadonlyBytes);
|
||||||
|
|
||||||
enum class LinearizationResult {
|
enum class LinearizationResult {
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue