mirror of
https://github.com/RGBCube/serenity
synced 2025-07-27 18:27:35 +00:00
LibPDF: Implement some of the AdobeGlyphList algorithm
Turns out there's a spec that goes with the table. The big change here is that we can now map `uni1234` to 0x1234 and `u123456` to 0x123456. The parts where we split a name on `_` and map each component and the part where we're supposed to allow multiple groups of 4 after `uni` aren't implemented yet. The ZapfDingbats lookup is also still missing. I haven't seen this have an effect in practice, but it's easy to construct a PDF with a custom encoding where it would make a difference.
This commit is contained in:
parent
f8b8d1b3be
commit
2eb099aabe
1 changed files with 92 additions and 1 deletions
|
@ -31,6 +31,8 @@ print(f'}};')
|
|||
where glyphlist.txt is from https://github.com/adobe-type-tools/agl-aglfn/blob/master/glyphlist.txt
|
||||
*/
|
||||
|
||||
#include <AK/CharacterTypes.h>
|
||||
#include <AK/Format.h>
|
||||
#include <AK/HashMap.h>
|
||||
#include <LibPDF/Fonts/AdobeGlyphList.h>
|
||||
|
||||
|
@ -4239,9 +4241,98 @@ static HashMap<StringView, u32> const glyph_list = {
|
|||
{ "zukatakana"sv, 0x30BA },
|
||||
};
|
||||
|
||||
static bool are_all_uppercase_hex(StringView component)
|
||||
{
|
||||
for (auto c : component)
|
||||
if (!is_ascii_uppercase_hex_digit(c))
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
static u32 decode_hex(StringView hex_string)
|
||||
{
|
||||
u32 code_point = 0;
|
||||
for (auto c : hex_string) {
|
||||
VERIFY(is_ascii_uppercase_hex_digit(c));
|
||||
code_point = (code_point << 4) | parse_ascii_hex_digit(c);
|
||||
}
|
||||
return code_point;
|
||||
}
|
||||
|
||||
Optional<u32> glyph_name_to_unicode(StringView name)
|
||||
{
|
||||
return glyph_list.get(name);
|
||||
// https://github.com/adobe-type-tools/agl-specification?tab=readme-ov-file#2-the-mapping
|
||||
// "To map a glyph name to a character string, follow the three steps below:
|
||||
//
|
||||
// 1. Drop all the characters from the glyph name starting with the first occurrence of a period (U+002E FULL STOP), if any.
|
||||
if (auto index = name.find('.'); index.has_value())
|
||||
name = name.substring_view(0, index.value());
|
||||
|
||||
// 2. Split the remaining string into a sequence of components, using underscore (U+005F LOW LINE) as the delimiter.
|
||||
if (auto index = name.find('_'); index.has_value()) {
|
||||
dbgln("FIXME: splitting on _ not yet implemented, ignoring all but first component");
|
||||
name = name.substring_view(0, index.value());
|
||||
}
|
||||
|
||||
// 3. Map each component to a character string according to the procedure below, and concatenate those strings; the result is the character string to which the glyph name is mapped.
|
||||
StringView component = name;
|
||||
|
||||
// If the font is Zapf Dingbats (PostScript FontName: ZapfDingbats), and the component is in the ITC Zapf Dingbats Glyph List, then map it to the corresponding character in that list."
|
||||
// FIXME: Implement.
|
||||
|
||||
// "Otherwise, if the component is in AGL, then map it to the corresponding character in that list.
|
||||
auto agl_entry = glyph_list.get(component);
|
||||
if (agl_entry.has_value())
|
||||
return agl_entry.value();
|
||||
|
||||
// Otherwise, if the component is of the form ‘uni’ (U+0075, U+006E, and U+0069)"
|
||||
if (component.starts_with("uni"sv)) {
|
||||
component = component.substring_view(3);
|
||||
|
||||
// Implementor's note: The spec allows 0 groups of four hex digits and maps them to nothing. Get this special case out of the way early.
|
||||
if (component.is_empty())
|
||||
return OptionalNone {};
|
||||
|
||||
// "followed by a sequence of uppercase hexadecimal digits (0–9 and A–F, meaning U+0030 through U+0039 and U+0041 through U+0046),
|
||||
if (are_all_uppercase_hex(component)) {
|
||||
bool all_are_ucs2 = true;
|
||||
// if the length of that sequence is a multiple of four,
|
||||
if (component.length() % 4 == 0) {
|
||||
for (size_t i = 0; i < component.length(); i += 4) {
|
||||
// and if each group of four digits represents a value in the ranges 0000 through D7FF or E000 through FFFF,
|
||||
u32 code_point = decode_hex(component.substring_view(i, 4));
|
||||
bool is_ucs2 = code_point <= 0xFFFF && !is_unicode_surrogate(code_point);
|
||||
if (!is_ucs2) {
|
||||
all_are_ucs2 = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (all_are_ucs2) {
|
||||
// then interpret each as a Unicode scalar value and map the component to the string made of those scalar values.
|
||||
// Note that the range and digit-length restrictions mean that the ‘uni’ glyph name prefix can be used only with UVs in the Basic Multilingual Plane (BMP).
|
||||
if (component.length() > 4)
|
||||
dbgln("FIXME: Returning multiple uni components not yet implemented, returning only the first one");
|
||||
return decode_hex(component.substring_view(0, 4));
|
||||
}
|
||||
}
|
||||
}
|
||||
// Otherwise, if the component is of the form ‘u’ (U+0075)
|
||||
else if (component.starts_with('u')) {
|
||||
component = component.substring_view(1);
|
||||
// followed by a sequence of four to six uppercase hexadecimal digits (0–9 and A–F, meaning U+0030 through U+0039 and U+0041 through U+0046),
|
||||
if (4 <= component.length() && component.length() <= 6 && are_all_uppercase_hex(component)) {
|
||||
// and those digits represents a value in the ranges 0000 through D7FF or E000 through 10FFFF,
|
||||
u32 code_point = decode_hex(component);
|
||||
if (is_unicode(code_point) && !is_unicode_surrogate(code_point)) {
|
||||
// then interpret it as a Unicode scalar value and map the component to the string made of this scalar value.
|
||||
return code_point;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Otherwise, map the component to an empty string."
|
||||
return OptionalNone {};
|
||||
}
|
||||
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue