1
Fork 0
mirror of https://github.com/RGBCube/serenity synced 2025-07-25 19:37:35 +00:00

LibPDF: Port 59b50fa43f8c2 to xref and object streams

0000440.pdf contains an xref stream object (at offset 3643676) starting:

```
294 0 obj <<
/Type /XRef
/Index [0 295]
/Size 295
```

and an object stream object (at offset 3640121) starting:

```
230 0 obj <<
/Type /ObjStm
/N 73
/First 614
```

In both cases, the `obj` and the `<<` are separated by non-newline
whitespace.

633e1632d0 made parse_indirect_value() tolerate this, but it updated
neither parse_xref_stream() (which parses xref streams) nor
parse_compressed_object_with_index() (which parses object streams),
despite all three changes being part of #14873.

Make parse_xref_stream() and parse_compressed_object_with_index()
call parse_indirect_value() to pick up the fix over there. It's a bit
less code too.

(0000440.pdf is the only PDF in my 1000 test PDFs that this helps,
somewhat surprisingly.)
This commit is contained in:
Nico Weber 2024-01-03 17:30:02 -05:00 committed by Andreas Kling
parent a545935997
commit e16345555b

View file

@ -390,18 +390,23 @@ PDFErrorOr<void> DocumentParser::validate_xref_table_and_fix_if_necessary()
return {};
}
// Extracts the stream object held by `indirect_value`.
//
// Returns the contained object cast to StreamObject when the indirect value
// holds an Object and that object is a StreamObject. In every other case a
// Parse error with the message "Expected indirect value to be a stream" is
// returned.
static PDFErrorOr<NonnullRefPtr<StreamObject>> indirect_value_as_stream(NonnullRefPtr<IndirectValue> indirect_value)
{
    auto wrapped = indirect_value->value();

    // Only an Object alternative can possibly be a stream; anything else
    // falls through to the shared error below.
    if (wrapped.has<NonnullRefPtr<Object>>()) {
        auto object = wrapped.get<NonnullRefPtr<Object>>();
        if (object->is<StreamObject>())
            return object->cast<StreamObject>();
    }

    return Error { Error::Type::Parse, "Expected indirect value to be a stream" };
}
PDFErrorOr<NonnullRefPtr<XRefTable>> DocumentParser::parse_xref_stream()
{
auto first_number = TRY(parse_number());
auto second_number = TRY(parse_number());
auto xref_stream = TRY(parse_indirect_value());
auto stream = TRY(indirect_value_as_stream(xref_stream));
if (!m_reader.matches("obj"))
return error("Malformed xref object");
m_reader.move_by(3);
if (m_reader.matches_eol())
m_reader.consume_eol();
auto dict = TRY(parse_dict());
auto dict = stream->dict();
auto type = TRY(dict->get_name(m_document, CommonNames::Type))->name();
if (type != "XRef")
return error("Malformed xref dictionary");
@ -425,7 +430,6 @@ PDFErrorOr<NonnullRefPtr<XRefTable>> DocumentParser::parse_xref_stream()
} else {
subsections.append({ 0, number_of_object_entries });
}
auto stream = TRY(parse_stream(dict));
auto table = adopt_ref(*new XRefTable());
auto field_to_long = [](ReadonlyBytes field) -> long {
@ -562,22 +566,13 @@ PDFErrorOr<Value> DocumentParser::parse_compressed_object_with_index(u32 index)
m_reader.move_to(stream_offset);
auto first_number = TRY(parse_number());
auto second_number = TRY(parse_number());
auto obj_stream = TRY(parse_indirect_value());
auto stream = TRY(indirect_value_as_stream(obj_stream));
if (first_number.get<int>() != object_stream_index)
if (obj_stream->index() != object_stream_index)
return error("Mismatching object stream index");
if (second_number.get<int>() != 0)
return error("Non-zero object stream generation number");
if (!m_reader.matches("obj"))
return error("Malformed object stream");
m_reader.move_by(3);
if (m_reader.matches_eol())
m_reader.consume_eol();
push_reference({ static_cast<u32>(first_number.get<int>()), static_cast<u32>(second_number.get<int>()) });
auto dict = TRY(parse_dict());
auto dict = stream->dict();
auto type = TRY(dict->get_name(m_document, CommonNames::Type))->name();
if (type != "ObjStm")
@ -586,9 +581,6 @@ PDFErrorOr<Value> DocumentParser::parse_compressed_object_with_index(u32 index)
auto object_count = dict->get_value("N").get_u32();
auto first_object_offset = dict->get_value("First").get_u32();
auto stream = TRY(parse_stream(dict));
pop_reference();
Parser stream_parser(m_document, stream->bytes());
// The data was already decrypted when reading the outer compressed ObjStm.