LibPDF: Stop converting encodings in object parser

Per 1.7 spec 3.8.1, there are multiple logical text string types:

* text strings
* ASCII strings
* byte strings

Text strings can be in UTF-16BE, PDFDocEncoding, or (since PDF 2.0) UTF-8. Byte strings, however, shouldn't be converted; they must be treated as binary data.

This makes us no longer convert strings used for drawing page text. TABLE 5.6 "Text-showing operators" lists the operands for text-showing operators as just "string", not "text string" (even though these strings confusingly are called "text strings" in the body text), so not doing this there is correct (and matches other viewers).

We also no longer incorrectly convert strings used for crypto data (such as passwords) if they start with a UTF-16BE or UTF-8 marker.

No behavior change for outlines and info dict entries.

https://pdfa.org/understanding-utf-8-in-pdf-2-0/ has a good overview of this.

(ASCII strings only contain ASCII characters and behave the same anyway.)
2025-07-25 21:17:44 +00:00 · 2023-11-20 21:04:31 -05:00 · 2023-11-20 21:04:31 -05:00 · e39a790c82
commit e39a790c82
parent 8ee0c75f43
3 changed files with 38 additions and 20 deletions
--- a/Userland/Libraries/LibPDF/Document.cpp
+++ b/Userland/Libraries/LibPDF/Document.cpp
@ -7,6 +7,7 @@
 #include <LibPDF/CommonNames.h>
 #include <LibPDF/Document.h>
 #include <LibPDF/Parser.h>
+#include <LibTextCodec/Decoder.h>

 namespace PDF {

@ -36,32 +37,32 @@ DeprecatedString OutlineItem::to_deprecated_string(int indent) const

 PDFErrorOr<Optional<DeprecatedString>> InfoDict::title() const
 {
-    return get(CommonNames::Title);
+    return get_text(CommonNames::Title);
 }

 PDFErrorOr<Optional<DeprecatedString>> InfoDict::author() const
 {
-    return get(CommonNames::Author);
+    return get_text(CommonNames::Author);
 }

 PDFErrorOr<Optional<DeprecatedString>> InfoDict::subject() const
 {
-    return get(CommonNames::Subject);
+    return get_text(CommonNames::Subject);
 }

 PDFErrorOr<Optional<DeprecatedString>> InfoDict::keywords() const
 {
-    return get(CommonNames::Keywords);
+    return get_text(CommonNames::Keywords);
 }

 PDFErrorOr<Optional<DeprecatedString>> InfoDict::creator() const
 {
-    return get(CommonNames::Creator);
+    return get_text(CommonNames::Creator);
 }

 PDFErrorOr<Optional<DeprecatedString>> InfoDict::producer() const
 {
-    return get(CommonNames::Producer);
+    return get_text(CommonNames::Producer);
 }

 PDFErrorOr<Optional<DeprecatedString>> InfoDict::creation_date() const
@ -74,6 +75,28 @@ PDFErrorOr<Optional<DeprecatedString>> InfoDict::modification_date() const
    return get(CommonNames::ModDate);
 }

+// Fetches the info dict entry `name` through get() and, if a value is
+// present, passes it through Document::text_string_to_utf8(). An error from
+// get() is propagated to the caller by TRY.
+PDFErrorOr<Optional<DeprecatedString>> InfoDict::get_text(DeprecatedFlyString const& name) const
+{
+    auto maybe_raw_string = TRY(get(name));
+    return maybe_raw_string.map(Document::text_string_to_utf8);
+}
+
+// Converts a PDF text string to UTF-8, keyed on the marker bytes it starts with:
+//  - FE FF:    the string is handed to the "utf-16be" decoder.
+//  - EF BB BF: the string is already UTF-8; only the 3-byte marker is dropped.
+//  - otherwise the string is returned unchanged (see the FIXME below).
+DeprecatedString Document::text_string_to_utf8(DeprecatedString const& text_string)
+{
+    auto const raw_bytes = text_string.bytes();
+
+    // UTF-16BE marker.
+    constexpr Array<u8, 2> utf16be_marker { 0xfe, 0xff };
+    if (raw_bytes.starts_with(utf16be_marker)) {
+        auto decoded = TextCodec::decoder_for("utf-16be"sv)->to_utf8(text_string).release_value_but_fixme_should_propagate_errors();
+        return decoded.to_deprecated_string();
+    }
+
+    // UTF-8 marker (0xef 0xbb 0xbf is 239 187 191 in decimal).
+    constexpr Array<u8, 3> utf8_marker { 0xef, 0xbb, 0xbf };
+    if (raw_bytes.starts_with(utf8_marker))
+        return text_string.substring(3);
+
+    // FIXME: Convert from PDFDocEncoding to UTF-8.
+    return text_string;
+}
+
 PDFErrorOr<NonnullRefPtr<Document>> Document::create(ReadonlyBytes bytes)
 {
    auto parser = adopt_ref(*new DocumentParser({}, bytes));
@ -544,7 +567,7 @@ PDFErrorOr<NonnullRefPtr<OutlineItem>> Document::build_outline_item(NonnullRefPt
        outline_item->children = move(children);
    }

-    outline_item->title = TRY(outline_item_dict->get_string(this, CommonNames::Title))->string();
+    outline_item->title = text_string_to_utf8(TRY(outline_item_dict->get_string(this, CommonNames::Title))->string());

    if (outline_item_dict->contains(CommonNames::Count))
        outline_item->count = outline_item_dict->get_value(CommonNames::Count).get<int>();