LibPDF: Handle string encodings

Strings can be encoded in either UTF-16BE or UTF-8. In either case, the string starts with a few marker bytes (FE FF for UTF-16BE, EF BB BF for UTF-8) that identify the encoding; these bytes must be checked for and then removed from the final string.
2025-09-14 22:46:17 +00:00 · 2021-05-23 21:27:17 -07:00 · 2021-05-23 21:27:17 -07:00 · 67b65dffa8
commit 67b65dffa8
parent a08922d2f6
2 changed files with 23 additions and 4 deletions
--- a/Userland/Libraries/LibPDF/CMakeLists.txt
+++ b/Userland/Libraries/LibPDF/CMakeLists.txt
@ -8,4 +8,4 @@ set(SOURCES
    )
 serenity_lib(LibPDF pdf)
-target_link_libraries(LibPDF LibC LibCore LibIPC LibGfx)
+target_link_libraries(LibPDF LibC LibCore LibIPC LibGfx LibTextCodec)
--- a/Userland/Libraries/LibPDF/Parser.cpp
+++ b/Userland/Libraries/LibPDF/Parser.cpp
@ -9,6 +9,7 @@
 #include <LibPDF/Document.h>
 #include <LibPDF/Filter.h>
 #include <LibPDF/Parser.h>
 #include <LibTextCodec/Decoder.h>
 #include <ctype.h>
 #include <math.h>
@ -422,9 +423,27 @@ NonnullRefPtr<StringObject> Parser::parse_string()
 {
    ScopeGuard guard([&] { consume_whitespace(); });
-    if (m_reader.matches('('))
+    String string;
-        return make_object<StringObject>(parse_literal_string(), false);
+    bool is_binary_string;
-    return make_object<StringObject>(parse_hex_string(), true);
+
    if (m_reader.matches('(')) {
        string = parse_literal_string();
        is_binary_string = false;
    } else {
        string = parse_hex_string();
        is_binary_string = true;
    }
    if (string.bytes().starts_with(Array<u8, 2> { 0xfe, 0xff })) {
        // The string is encoded in UTF16-BE
        string = TextCodec::decoder_for("utf-16be")->to_utf8(string.substring(2));
    } else if (string.bytes().starts_with(Array<u8, 3> { 239, 187, 191 })) {
        // The string is encoded in UTF-8. This is the default anyways, but if these bytes
        // are explicitly included, we have to trim them
        string = string.substring(3);
    }
    return make_object<StringObject>(string, is_binary_string);
 }
 String Parser::parse_literal_string()