mirror of
https://github.com/RGBCube/serenity
synced 2025-07-27 11:07:45 +00:00
LibPDF: Implement some of the AdobeGlyphList algorithm
Turns out there's a spec that goes with the table. The big change here is that we can now map `uni1234` to 0x1234 and `u123456` to 0x123456. The parts where we split a name on `_` and map each component and the part where we're supposed to allow multiple groups of 4 after `uni` aren't implemented yet. The ZapfDingbats lookup is also still missing. I haven't seen this have an effect in practice, but it's easy to construct a PDF with a custom encoding where it would make a difference.
This commit is contained in:
parent
f8b8d1b3be
commit
2eb099aabe
1 changed files with 92 additions and 1 deletions
|
@ -31,6 +31,8 @@ print(f'}};')
|
||||||
where glyphlist.txt is from https://github.com/adobe-type-tools/agl-aglfn/blob/master/glyphlist.txt
|
where glyphlist.txt is from https://github.com/adobe-type-tools/agl-aglfn/blob/master/glyphlist.txt
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
#include <AK/CharacterTypes.h>
|
||||||
|
#include <AK/Format.h>
|
||||||
#include <AK/HashMap.h>
|
#include <AK/HashMap.h>
|
||||||
#include <LibPDF/Fonts/AdobeGlyphList.h>
|
#include <LibPDF/Fonts/AdobeGlyphList.h>
|
||||||
|
|
||||||
|
@ -4239,9 +4241,98 @@ static HashMap<StringView, u32> const glyph_list = {
|
||||||
{ "zukatakana"sv, 0x30BA },
|
{ "zukatakana"sv, 0x30BA },
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Reports whether every character of `component` passes is_ascii_uppercase_hex_digit().
// An empty view yields true, since there is no character that could fail the check.
static bool are_all_uppercase_hex(StringView component)
{
    for (size_t i = 0; i < component.length(); ++i) {
        if (is_ascii_uppercase_hex_digit(component[i]))
            continue;
        return false;
    }
    return true;
}
|
||||||
|
|
||||||
|
// Folds the hex digits of `hex_string` into a u32, most significant digit first.
// Every character must pass is_ascii_uppercase_hex_digit(); anything else trips VERIFY.
// An empty view decodes to 0. No overflow check is done here, so callers are expected
// to bound the length of the view they pass in.
static u32 decode_hex(StringView hex_string)
{
    u32 value = 0;
    for (size_t i = 0; i < hex_string.length(); ++i) {
        auto const digit = hex_string[i];
        VERIFY(is_ascii_uppercase_hex_digit(digit));
        // Make room for the next nibble, then merge it in.
        value <<= 4;
        value |= parse_ascii_hex_digit(digit);
    }
    return value;
}
|
||||||
|
|
||||||
// Maps a glyph name to a Unicode code point following the Adobe Glyph List specification.
//
// Returns the mapped code point, or an empty Optional when the name maps to the empty string.
//
// Known gaps (each is marked with a FIXME below):
// - Only the first `_`-separated component is mapped.
// - Only the first group of four hex digits after `uni` is returned.
// - The ZapfDingbats glyph list is not consulted.
Optional<u32> glyph_name_to_unicode(StringView name)
{
    // https://github.com/adobe-type-tools/agl-specification?tab=readme-ov-file#2-the-mapping
    // "To map a glyph name to a character string, follow the three steps below:
    //
    // 1. Drop all the characters from the glyph name starting with the first occurrence of a period (U+002E FULL STOP), if any.
    if (auto index = name.find('.'); index.has_value())
        name = name.substring_view(0, index.value());

    // 2. Split the remaining string into a sequence of components, using underscore (U+005F LOW LINE) as the delimiter.
    if (auto index = name.find('_'); index.has_value()) {
        dbgln("FIXME: splitting on _ not yet implemented, ignoring all but first component");
        name = name.substring_view(0, index.value());
    }

    // 3. Map each component to a character string according to the procedure below, and concatenate those strings; the result is the character string to which the glyph name is mapped.
    StringView component = name;

    // If the font is Zapf Dingbats (PostScript FontName: ZapfDingbats), and the component is in the ITC Zapf Dingbats Glyph List, then map it to the corresponding character in that list."
    // FIXME: Implement.

    // "Otherwise, if the component is in AGL, then map it to the corresponding character in that list.
    auto agl_entry = glyph_list.get(component);
    if (agl_entry.has_value())
        return agl_entry.value();

    // Otherwise, if the component is of the form ‘uni’ (U+0075, U+006E, and U+0069)"
    if (component.starts_with("uni"sv)) {
        component = component.substring_view(3);

        // Implementor's note: The spec allows 0 groups of four hex digits and maps them to nothing. Get this special case out of the way early.
        if (component.is_empty())
            return OptionalNone {};

        // "followed by a sequence of uppercase hexadecimal digits (0–9 and A–F, meaning U+0030 through U+0039 and U+0041 through U+0046),
        // if the length of that sequence is a multiple of four,
        //
        // The length check has to gate the whole branch: the code below slices the component
        // into groups of exactly four characters, which would run past the end of a component
        // whose length is not a multiple of four.
        if (are_all_uppercase_hex(component) && component.length() % 4 == 0) {
            bool all_are_ucs2 = true;
            for (size_t i = 0; i < component.length(); i += 4) {
                // and if each group of four digits represents a value in the ranges 0000 through D7FF or E000 through FFFF,
                u32 code_point = decode_hex(component.substring_view(i, 4));
                bool is_ucs2 = code_point <= 0xFFFF && !is_unicode_surrogate(code_point);
                if (!is_ucs2) {
                    all_are_ucs2 = false;
                    break;
                }
            }

            if (all_are_ucs2) {
                // then interpret each as a Unicode scalar value and map the component to the string made of those scalar values.
                // Note that the range and digit-length restrictions mean that the ‘uni’ glyph name prefix can be used only with UVs in the Basic Multilingual Plane (BMP).
                if (component.length() > 4)
                    dbgln("FIXME: Returning multiple uni components not yet implemented, returning only the first one");
                return decode_hex(component.substring_view(0, 4));
            }
        }
    }
    // Otherwise, if the component is of the form ‘u’ (U+0075)
    else if (component.starts_with('u')) {
        component = component.substring_view(1);

        // followed by a sequence of four to six uppercase hexadecimal digits (0–9 and A–F, meaning U+0030 through U+0039 and U+0041 through U+0046),
        if (4 <= component.length() && component.length() <= 6 && are_all_uppercase_hex(component)) {
            // and those digits represents a value in the ranges 0000 through D7FF or E000 through 10FFFF,
            u32 code_point = decode_hex(component);
            if (is_unicode(code_point) && !is_unicode_surrogate(code_point)) {
                // then interpret it as a Unicode scalar value and map the component to the string made of this scalar value.
                return code_point;
            }
        }
    }

    // Otherwise, map the component to an empty string."
    return OptionalNone {};
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue