mirror of
https://github.com/RGBCube/serenity
synced 2025-07-27 05:27:45 +00:00
LibPDF: Stop converting encodings in object parser
Per the PDF 1.7 spec, section 3.8.1, there are multiple logical string types: text strings, ASCII strings, and byte strings. Text strings can be in UTF-16BE, PDFDocEncoding, or (since PDF 2.0) UTF-8. Byte strings, however, must not be converted; they should be treated as binary data. This makes us no longer convert strings used for drawing page text. TABLE 5.6 "Text-showing operators" lists the operands for text-showing operators as just "string", not "text string" (even though these strings are, confusingly, called "text strings" in the body text), so not converting there is correct (and matches other viewers). We also no longer incorrectly convert strings used for crypto data (such as passwords) if they start with a UTF-16BE or UTF-8 marker. No behavior change for outlines and info dict entries. https://pdfa.org/understanding-utf-8-in-pdf-2-0/ has a good overview of this. (ASCII strings only contain ASCII characters and behave the same anyway.)
This commit is contained in:
parent
8ee0c75f43
commit
e39a790c82
3 changed files with 38 additions and 20 deletions
|
@ -39,7 +39,7 @@ struct Destination {
|
|||
struct OutlineItem final : public RefCounted<OutlineItem> {
|
||||
RefPtr<OutlineItem> parent;
|
||||
Vector<NonnullRefPtr<OutlineItem>> children;
|
||||
DeprecatedString title;
|
||||
DeprecatedString title; // Already converted to UTF-8.
|
||||
i32 count { 0 };
|
||||
Destination dest;
|
||||
Gfx::Color color { Color::NamedColor::Black }; // 'C' in the PDF spec
|
||||
|
@ -66,6 +66,8 @@ public:
|
|||
{
|
||||
}
|
||||
|
||||
// These all return strings that are already converted to UTF-8.
|
||||
|
||||
PDFErrorOr<Optional<DeprecatedString>> title() const;
|
||||
PDFErrorOr<Optional<DeprecatedString>> author() const;
|
||||
PDFErrorOr<Optional<DeprecatedString>> subject() const;
|
||||
|
@ -89,6 +91,8 @@ private:
|
|||
return TRY(m_info_dict->get_string(m_document, name))->string();
|
||||
}
|
||||
|
||||
PDFErrorOr<Optional<DeprecatedString>> get_text(DeprecatedFlyString const& name) const;
|
||||
|
||||
WeakPtr<Document> m_document;
|
||||
NonnullRefPtr<DictObject> m_info_dict;
|
||||
};
|
||||
|
@ -97,6 +101,9 @@ class Document final
|
|||
: public RefCounted<Document>
|
||||
, public Weakable<Document> {
|
||||
public:
|
||||
// Converts a text string (PDF 1.7 spec, 3.8.1. "String Types") to UTF-8.
|
||||
static DeprecatedString text_string_to_utf8(DeprecatedString const&);
|
||||
|
||||
static PDFErrorOr<NonnullRefPtr<Document>> create(ReadonlyBytes bytes);
|
||||
|
||||
// If a security handler is present, it is the caller's responsibility to ensure
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue