1
Fork 0
mirror of https://github.com/RGBCube/serenity synced 2025-05-14 05:54:58 +00:00

LibPDF: Always treat /Subtype /Image as binary data when dumping

Sometimes, the "is mostly text" heuristic fails for images.

Before:

    Build/lagom/bin/pdf --render out.png ~/Downloads/0000/0000521.pdf \
        --page 10 --dump-contents 2>&1 | wc -l
       25709

After:

    Build/lagom/bin/pdf --render out.png ~/Downloads/0000/0000521.pdf \
        --page 10 --dump-contents 2>&1 | wc -l
       11376
This commit is contained in:
Nico Weber 2024-02-05 19:36:35 -05:00 committed by Tim Flynn
parent a9df60ff1c
commit 92a628c07c

View file

@@ -5,6 +5,7 @@
*/
#include <AK/Hex.h>
#include <LibPDF/CommonNames.h>
#include <LibPDF/Document.h>
#include <LibPDF/ObjectDerivatives.h>
@@ -136,6 +137,9 @@ ByteString StreamObject::to_byte_string(int indent) const
percentage_ascii = ascii_count * 100 / bytes().size();
bool is_mostly_text = percentage_ascii > 95;
if (dict()->contains(CommonNames::Subtype) && dict()->get_name(CommonNames::Subtype)->name() == "Image")
is_mostly_text = false;
if (is_mostly_text) {
for (size_t i = 0; i < bytes().size(); ++i) {
auto c = bytes()[i];