mirror of
https://github.com/RGBCube/serenity
synced 2025-07-26 02:37:35 +00:00
LibPDF: Handle string encodings
Strings can be encoded in either UTF-16BE or UTF-8. In either case, the string starts with a few marker bytes that identify the encoding; these bytes must be checked for and then removed from the final string.
This commit is contained in:
parent
a08922d2f6
commit
67b65dffa8
2 changed files with 23 additions and 4 deletions
|
@ -8,4 +8,4 @@ set(SOURCES
|
||||||
)
|
)
|
||||||
|
|
||||||
serenity_lib(LibPDF pdf)
|
serenity_lib(LibPDF pdf)
|
||||||
target_link_libraries(LibPDF LibC LibCore LibIPC LibGfx)
|
target_link_libraries(LibPDF LibC LibCore LibIPC LibGfx LibTextCodec)
|
||||||
|
|
|
@ -9,6 +9,7 @@
|
||||||
#include <LibPDF/Document.h>
|
#include <LibPDF/Document.h>
|
||||||
#include <LibPDF/Filter.h>
|
#include <LibPDF/Filter.h>
|
||||||
#include <LibPDF/Parser.h>
|
#include <LibPDF/Parser.h>
|
||||||
|
#include <LibTextCodec/Decoder.h>
|
||||||
#include <ctype.h>
|
#include <ctype.h>
|
||||||
#include <math.h>
|
#include <math.h>
|
||||||
|
|
||||||
|
@ -422,9 +423,27 @@ NonnullRefPtr<StringObject> Parser::parse_string()
|
||||||
{
|
{
|
||||||
ScopeGuard guard([&] { consume_whitespace(); });
|
ScopeGuard guard([&] { consume_whitespace(); });
|
||||||
|
|
||||||
if (m_reader.matches('('))
|
String string;
|
||||||
return make_object<StringObject>(parse_literal_string(), false);
|
bool is_binary_string;
|
||||||
return make_object<StringObject>(parse_hex_string(), true);
|
|
||||||
|
if (m_reader.matches('(')) {
|
||||||
|
string = parse_literal_string();
|
||||||
|
is_binary_string = false;
|
||||||
|
} else {
|
||||||
|
string = parse_hex_string();
|
||||||
|
is_binary_string = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (string.bytes().starts_with(Array<u8, 2> { 0xfe, 0xff })) {
|
||||||
|
// The string is encoded in UTF16-BE
|
||||||
|
string = TextCodec::decoder_for("utf-16be")->to_utf8(string.substring(2));
|
||||||
|
} else if (string.bytes().starts_with(Array<u8, 3> { 239, 187, 191 })) {
|
||||||
|
// The string is encoded in UTF-8. This is the default anyways, but if these bytes
|
||||||
|
// are explicitly included, we have to trim them
|
||||||
|
string = string.substring(3);
|
||||||
|
}
|
||||||
|
|
||||||
|
return make_object<StringObject>(string, is_binary_string);
|
||||||
}
|
}
|
||||||
|
|
||||||
String Parser::parse_literal_string()
|
String Parser::parse_literal_string()
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue