1
Fork 0
mirror of https://github.com/RGBCube/serenity synced 2025-07-25 05:07:34 +00:00

LibPDF: Handle string encodings

Strings can be encoded in either UTF-16BE or UTF-8. In either case,
the string begins with a few marker bytes that identify the encoding;
these must be checked for and then removed from the final string.
This commit is contained in:
Matthew Olsson 2021-05-23 21:27:17 -07:00 committed by Ali Mohammad Pur
parent a08922d2f6
commit 67b65dffa8
2 changed files with 23 additions and 4 deletions

View file

@ -9,6 +9,7 @@
#include <LibPDF/Document.h>
#include <LibPDF/Filter.h>
#include <LibPDF/Parser.h>
#include <LibTextCodec/Decoder.h>
#include <ctype.h>
#include <math.h>
@ -422,9 +423,27 @@ NonnullRefPtr<StringObject> Parser::parse_string()
{
// Parses one string object at the reader's current position: a literal string
// when the next character is '(', otherwise a hex string. A leading UTF-16BE or
// UTF-8 marker is removed from the parsed bytes before the StringObject is built.
//
// The guard below calls consume_whitespace(); presumably ScopeGuard runs its
// callback when this scope exits, i.e. after the string has been parsed — confirm.
ScopeGuard guard([&] { consume_whitespace(); });
// NOTE(review): the next three lines look like the pre-change implementation that
// this commit removes (the diff view has lost its +/- markers). Read as plain
// code they return unconditionally, so nothing after them would execute —
// confirm against the actual file before relying on this listing.
if (m_reader.matches('('))
return make_object<StringObject>(parse_literal_string(), false);
return make_object<StringObject>(parse_hex_string(), true);
String string;
// Assigned in both branches below: false for a literal string, true for a hex string.
bool is_binary_string;
if (m_reader.matches('(')) {
string = parse_literal_string();
is_binary_string = false;
} else {
string = parse_hex_string();
is_binary_string = true;
}
// 0xFE 0xFF is checked first; those two bytes are dropped and the remainder is
// converted to UTF-8 by the "utf-16be" decoder.
if (string.bytes().starts_with(Array<u8, 2> { 0xfe, 0xff })) {
// The string is encoded in UTF16-BE
string = TextCodec::decoder_for("utf-16be")->to_utf8(string.substring(2));
} else if (string.bytes().starts_with(Array<u8, 3> { 239, 187, 191 })) {
// 239, 187, 191 are the bytes 0xEF 0xBB 0xBF.
// The string is encoded in UTF-8. This is the default anyways, but if these bytes
// are explicitly included, we have to trim them
string = string.substring(3);
}
// Strings with neither prefix are passed through unchanged.
return make_object<StringObject>(string, is_binary_string);
}
String Parser::parse_literal_string()