LibPDF: Port 59b50fa43f8c2 to xref and object streams

0000440.pdf contains an xref stream object (at offset 3643676) starting: ``` 294 0 obj << /Type /XRef /Index [0 295] /Size 295 ``` and an object stream object (at offset 3640121) starting: ``` 230 0 obj << /Type /ObjStm /N 73 /First 614 ``` In both cases, the `obj` and the `<<` are separated by non-newline whitespace. 633e1632d0 made parse_indirect_value() tolerate this, but it updated neither parse_xref_stream() (which parses xref streams) nor parse_compressed_object_with_index() (which parses object streams), despite all three changes being part of #14873. Make parse_xref_stream() and parse_compressed_object_with_index() call parse_indirect_value() to pick up the fix over there. It's a bit less code too. (0000440.pdf is the only PDF in my 1000 test PDFs that this helps, somewhat surprisingly.)
2025-07-26 11:27:34 +00:00 · 2024-01-03 17:30:02 -05:00 · 2024-01-03 17:30:02 -05:00 · e16345555b
commit e16345555b
parent a545935997
1 changed files with 18 additions and 26 deletions
--- a/Userland/Libraries/LibPDF/DocumentParser.cpp
+++ b/Userland/Libraries/LibPDF/DocumentParser.cpp
@ -390,18 +390,23 @@ PDFErrorOr<void> DocumentParser::validate_xref_table_and_fix_if_necessary()
    return {};
 }

+static PDFErrorOr<NonnullRefPtr<StreamObject>> indirect_value_as_stream(NonnullRefPtr<IndirectValue> indirect_value)
+{
+    auto value = indirect_value->value();
+    if (!value.has<NonnullRefPtr<Object>>())
+        return Error { Error::Type::Parse, "Expected indirect value to be a stream" };
+    auto value_object = value.get<NonnullRefPtr<Object>>();
+    if (!value_object->is<StreamObject>())
+        return Error { Error::Type::Parse, "Expected indirect value to be a stream" };
+    return value_object->cast<StreamObject>();
+}
+
 PDFErrorOr<NonnullRefPtr<XRefTable>> DocumentParser::parse_xref_stream()
 {
-    auto first_number = TRY(parse_number());
-    auto second_number = TRY(parse_number());
+    auto xref_stream = TRY(parse_indirect_value());
+    auto stream = TRY(indirect_value_as_stream(xref_stream));

-    if (!m_reader.matches("obj"))
-        return error("Malformed xref object");
-    m_reader.move_by(3);
-    if (m_reader.matches_eol())
-        m_reader.consume_eol();
-
-    auto dict = TRY(parse_dict());
+    auto dict = stream->dict();
    auto type = TRY(dict->get_name(m_document, CommonNames::Type))->name();
    if (type != "XRef")
        return error("Malformed xref dictionary");
@ -425,7 +430,6 @@ PDFErrorOr<NonnullRefPtr<XRefTable>> DocumentParser::parse_xref_stream()
    } else {
        subsections.append({ 0, number_of_object_entries });
    }
-    auto stream = TRY(parse_stream(dict));
    auto table = adopt_ref(*new XRefTable());

    auto field_to_long = [](ReadonlyBytes field) -> long {
@ -562,22 +566,13 @@ PDFErrorOr<Value> DocumentParser::parse_compressed_object_with_index(u32 index)

    m_reader.move_to(stream_offset);

-    auto first_number = TRY(parse_number());
-    auto second_number = TRY(parse_number());
+    auto obj_stream = TRY(parse_indirect_value());
+    auto stream = TRY(indirect_value_as_stream(obj_stream));

-    if (first_number.get<int>() != object_stream_index)
+    if (obj_stream->index() != object_stream_index)
        return error("Mismatching object stream index");
-    if (second_number.get<int>() != 0)
-        return error("Non-zero object stream generation number");

-    if (!m_reader.matches("obj"))
-        return error("Malformed object stream");
-    m_reader.move_by(3);
-    if (m_reader.matches_eol())
-        m_reader.consume_eol();
-
-    push_reference({ static_cast<u32>(first_number.get<int>()), static_cast<u32>(second_number.get<int>()) });
-    auto dict = TRY(parse_dict());
+    auto dict = stream->dict();

    auto type = TRY(dict->get_name(m_document, CommonNames::Type))->name();
    if (type != "ObjStm")
@ -586,9 +581,6 @@ PDFErrorOr<Value> DocumentParser::parse_compressed_object_with_index(u32 index)
    auto object_count = dict->get_value("N").get_u32();
    auto first_object_offset = dict->get_value("First").get_u32();

-    auto stream = TRY(parse_stream(dict));
-    pop_reference();
-
    Parser stream_parser(m_document, stream->bytes());

    // The data was already decrypted when reading the outer compressed ObjStm.