1
Fork 0
mirror of https://github.com/RGBCube/serenity synced 2025-07-27 11:57:35 +00:00

LibPDF: Implement some of the AdobeGlyphList algorithm

Turns out there's a spec that goes with the table.

The big change here is that we can now map `uni1234` to 0x1234 and
`u123456` to 0x123456.

The parts where we split a name on `_` and map each component
and the part where we're supposed to allow multiple groups of 4
after `uni` aren't implemented yet.

The ZapfDingbats lookup is also still missing.

I haven't seen this have an effect in practice, but it's easy to
construct a PDF with a custom encoding where it would make a
difference.
This commit is contained in:
Nico Weber 2024-02-28 17:08:07 -05:00 committed by Andreas Kling
parent f8b8d1b3be
commit 2eb099aabe

View file

@ -31,6 +31,8 @@ print(f'}};')
where glyphlist.txt is from https://github.com/adobe-type-tools/agl-aglfn/blob/master/glyphlist.txt where glyphlist.txt is from https://github.com/adobe-type-tools/agl-aglfn/blob/master/glyphlist.txt
*/ */
#include <AK/CharacterTypes.h>
#include <AK/Format.h>
#include <AK/HashMap.h> #include <AK/HashMap.h>
#include <LibPDF/Fonts/AdobeGlyphList.h> #include <LibPDF/Fonts/AdobeGlyphList.h>
@ -4239,9 +4241,98 @@ static HashMap<StringView, u32> const glyph_list = {
{ "zukatakana"sv, 0x30BA }, { "zukatakana"sv, 0x30BA },
}; };
static bool are_all_uppercase_hex(StringView component)
{
for (auto c : component)
if (!is_ascii_uppercase_hex_digit(c))
return false;
return true;
}
static u32 decode_hex(StringView hex_string)
{
u32 code_point = 0;
for (auto c : hex_string) {
VERIFY(is_ascii_uppercase_hex_digit(c));
code_point = (code_point << 4) | parse_ascii_hex_digit(c);
}
return code_point;
}
Optional<u32> glyph_name_to_unicode(StringView name) Optional<u32> glyph_name_to_unicode(StringView name)
{ {
return glyph_list.get(name); // https://github.com/adobe-type-tools/agl-specification?tab=readme-ov-file#2-the-mapping
// "To map a glyph name to a character string, follow the three steps below:
//
// 1. Drop all the characters from the glyph name starting with the first occurrence of a period (U+002E FULL STOP), if any.
if (auto index = name.find('.'); index.has_value())
name = name.substring_view(0, index.value());
// 2. Split the remaining string into a sequence of components, using underscore (U+005F LOW LINE) as the delimiter.
if (auto index = name.find('_'); index.has_value()) {
dbgln("FIXME: splitting on _ not yet implemented, ignoring all but first component");
name = name.substring_view(0, index.value());
}
// 3. Map each component to a character string according to the procedure below, and concatenate those strings; the result is the character string to which the glyph name is mapped.
StringView component = name;
// If the font is Zapf Dingbats (PostScript FontName: ZapfDingbats), and the component is in the ITC Zapf Dingbats Glyph List, then map it to the corresponding character in that list."
// FIXME: Implement.
// "Otherwise, if the component is in AGL, then map it to the corresponding character in that list.
auto agl_entry = glyph_list.get(component);
if (agl_entry.has_value())
return agl_entry.value();
// Otherwise, if the component is of the form uni (U+0075, U+006E, and U+0069)"
if (component.starts_with("uni"sv)) {
component = component.substring_view(3);
// Implementor's note: The spec allows 0 groups of four hex digits and maps them to nothing. Get this special case out of the way early.
if (component.is_empty())
return OptionalNone {};
// "followed by a sequence of uppercase hexadecimal digits (09 and AF, meaning U+0030 through U+0039 and U+0041 through U+0046),
if (are_all_uppercase_hex(component)) {
bool all_are_ucs2 = true;
// if the length of that sequence is a multiple of four,
if (component.length() % 4 == 0) {
for (size_t i = 0; i < component.length(); i += 4) {
// and if each group of four digits represents a value in the ranges 0000 through D7FF or E000 through FFFF,
u32 code_point = decode_hex(component.substring_view(i, 4));
bool is_ucs2 = code_point <= 0xFFFF && !is_unicode_surrogate(code_point);
if (!is_ucs2) {
all_are_ucs2 = false;
break;
}
}
}
if (all_are_ucs2) {
// then interpret each as a Unicode scalar value and map the component to the string made of those scalar values.
// Note that the range and digit-length restrictions mean that the uni glyph name prefix can be used only with UVs in the Basic Multilingual Plane (BMP).
if (component.length() > 4)
dbgln("FIXME: Returning multiple uni components not yet implemented, returning only the first one");
return decode_hex(component.substring_view(0, 4));
}
}
}
// Otherwise, if the component is of the form u (U+0075)
else if (component.starts_with('u')) {
component = component.substring_view(1);
// followed by a sequence of four to six uppercase hexadecimal digits (09 and AF, meaning U+0030 through U+0039 and U+0041 through U+0046),
if (4 <= component.length() && component.length() <= 6 && are_all_uppercase_hex(component)) {
// and those digits represents a value in the ranges 0000 through D7FF or E000 through 10FFFF,
u32 code_point = decode_hex(component);
if (is_unicode(code_point) && !is_unicode_surrogate(code_point)) {
// then interpret it as a Unicode scalar value and map the component to the string made of this scalar value.
return code_point;
}
}
}
// Otherwise, map the component to an empty string."
return OptionalNone {};
} }
} }