diff --git a/Userland/Libraries/LibPDF/Document.cpp b/Userland/Libraries/LibPDF/Document.cpp index 5b73431530..fb63d20b48 100644 --- a/Userland/Libraries/LibPDF/Document.cpp +++ b/Userland/Libraries/LibPDF/Document.cpp @@ -7,6 +7,7 @@ #include #include #include +#include namespace PDF { @@ -36,32 +37,32 @@ DeprecatedString OutlineItem::to_deprecated_string(int indent) const PDFErrorOr> InfoDict::title() const { - return get(CommonNames::Title); + return get_text(CommonNames::Title); } PDFErrorOr> InfoDict::author() const { - return get(CommonNames::Author); + return get_text(CommonNames::Author); } PDFErrorOr> InfoDict::subject() const { - return get(CommonNames::Subject); + return get_text(CommonNames::Subject); } PDFErrorOr> InfoDict::keywords() const { - return get(CommonNames::Keywords); + return get_text(CommonNames::Keywords); } PDFErrorOr> InfoDict::creator() const { - return get(CommonNames::Creator); + return get_text(CommonNames::Creator); } PDFErrorOr> InfoDict::producer() const { - return get(CommonNames::Producer); + return get_text(CommonNames::Producer); } PDFErrorOr> InfoDict::creation_date() const @@ -74,6 +75,28 @@ PDFErrorOr> InfoDict::modification_date() const return get(CommonNames::ModDate); } +PDFErrorOr> InfoDict::get_text(DeprecatedFlyString const& name) const +{ + return TRY(get(name)).map(Document::text_string_to_utf8); +} + +DeprecatedString Document::text_string_to_utf8(DeprecatedString const& text_string) +{ + if (text_string.bytes().starts_with(Array { 0xfe, 0xff })) { + // The string is encoded in UTF16-BE + return TextCodec::decoder_for("utf-16be"sv)->to_utf8(text_string).release_value_but_fixme_should_propagate_errors().to_deprecated_string(); + } + + if (text_string.bytes().starts_with(Array { 239, 187, 191 })) { + // The string is encoded in UTF-8. + return text_string.substring(3); + } + + // FIXME: Convert from PDFDocEncoding to UTF-8. + + return text_string; +} + PDFErrorOr> Document::create(ReadonlyBytes bytes) { auto parser = adopt_ref(*new DocumentParser({}, bytes)); @@ -544,7 +567,7 @@ PDFErrorOr> Document::build_outline_item(NonnullRefPt outline_item->children = move(children); } - outline_item->title = TRY(outline_item_dict->get_string(this, CommonNames::Title))->string(); + outline_item->title = text_string_to_utf8(TRY(outline_item_dict->get_string(this, CommonNames::Title))->string()); if (outline_item_dict->contains(CommonNames::Count)) outline_item->count = outline_item_dict->get_value(CommonNames::Count).get(); diff --git a/Userland/Libraries/LibPDF/Document.h b/Userland/Libraries/LibPDF/Document.h index e5cb41b9b1..40675bccc7 100644 --- a/Userland/Libraries/LibPDF/Document.h +++ b/Userland/Libraries/LibPDF/Document.h @@ -39,7 +39,7 @@ struct Destination { struct OutlineItem final : public RefCounted { RefPtr parent; Vector> children; - DeprecatedString title; + DeprecatedString title; // Already converted to UTF-8. i32 count { 0 }; Destination dest; Gfx::Color color { Color::NamedColor::Black }; // 'C' in the PDF spec @@ -66,6 +66,8 @@ public: { } + // These all return strings that are already converted to UTF-8. + PDFErrorOr> title() const; PDFErrorOr> author() const; PDFErrorOr> subject() const; @@ -89,6 +91,8 @@ private: return TRY(m_info_dict->get_string(m_document, name))->string(); } + PDFErrorOr> get_text(DeprecatedFlyString const& name) const; + WeakPtr m_document; NonnullRefPtr m_info_dict; }; @@ -97,6 +101,9 @@ class Document final : public RefCounted , public Weakable { public: + // Converts a text string (PDF 1.7 spec, 3.8.1. "String Types") to UTF-8. + static DeprecatedString text_string_to_utf8(DeprecatedString const&); + static PDFErrorOr> create(ReadonlyBytes bytes); // If a security handler is present, it is the caller's responsibility to ensure diff --git a/Userland/Libraries/LibPDF/Parser.cpp b/Userland/Libraries/LibPDF/Parser.cpp index cd52d641c3..3a4e592eae 100644 --- a/Userland/Libraries/LibPDF/Parser.cpp +++ b/Userland/Libraries/LibPDF/Parser.cpp @@ -9,7 +9,6 @@ #include #include #include -#include #include namespace PDF { @@ -262,17 +261,6 @@ PDFErrorOr> Parser::parse_string() if (m_document->security_handler() && m_enable_encryption) m_document->security_handler()->decrypt(string_object, m_current_reference_stack.last()); - auto unencrypted_string = string_object->string(); - - if (unencrypted_string.bytes().starts_with(Array { 0xfe, 0xff })) { - // The string is encoded in UTF16-BE - string_object->set_string(TextCodec::decoder_for("utf-16be"sv)->to_utf8(unencrypted_string).release_value_but_fixme_should_propagate_errors().to_deprecated_string()); - } else if (unencrypted_string.bytes().starts_with(Array { 239, 187, 191 })) { - // The string is encoded in UTF-8. This is the default anyways, but if these bytes - // are explicitly included, we have to trim them - string_object->set_string(unencrypted_string.substring(3)); - } - return string_object; }