From c4b45a82cd98821b31e48b00520eccc81907c9c9 Mon Sep 17 00:00:00 2001 From: Rodrigo Tobar Date: Wed, 18 Jan 2023 21:34:48 +0800 Subject: [PATCH] LibPDF: Add initial CFF parsing The Compact Font Format specification (Adobe's Technical Note #5176) is used by PDF's Type1C fonts to store their data. While being similar in spirit to PS1 Type 1 Font Programs, it was designed for a more compact representation and thus space reduction (at the cost of increased complexity). It also shares most of the charstring encoding logic, which is why the CFF class also inherits from Type1FontProgram. This initial implementation is still lacking many details, e.g.: * It doesn't include all the built-in CFF SIDs * It doesn't support CFF-provided SIDs (defaults those glyphs to the space character) * More checks in general --- Userland/Libraries/LibPDF/CMakeLists.txt | 1 + Userland/Libraries/LibPDF/Fonts/CFF.cpp | 431 +++++++++++++++++++++++ Userland/Libraries/LibPDF/Fonts/CFF.h | 86 +++++ 3 files changed, 518 insertions(+) create mode 100644 Userland/Libraries/LibPDF/Fonts/CFF.cpp create mode 100644 Userland/Libraries/LibPDF/Fonts/CFF.h diff --git a/Userland/Libraries/LibPDF/CMakeLists.txt b/Userland/Libraries/LibPDF/CMakeLists.txt index dc34aca20e..955d3b7c37 100644 --- a/Userland/Libraries/LibPDF/CMakeLists.txt +++ b/Userland/Libraries/LibPDF/CMakeLists.txt @@ -6,6 +6,7 @@ set(SOURCES Encoding.cpp Encryption.cpp Filter.cpp + Fonts/CFF.cpp Fonts/PDFFont.cpp Fonts/PS1FontProgram.cpp Fonts/TrueTypeFont.cpp diff --git a/Userland/Libraries/LibPDF/Fonts/CFF.cpp b/Userland/Libraries/LibPDF/Fonts/CFF.cpp new file mode 100644 index 0000000000..db7dce8343 --- /dev/null +++ b/Userland/Libraries/LibPDF/Fonts/CFF.cpp @@ -0,0 +1,431 @@ +/* + * Copyright (c) 2023, Rodrigo Tobar . 
+ * + * SPDX-License-Identifier: BSD-2-Clause + */ + +#include +#include +#include +#include +#include +#include + +namespace PDF { + +PDFErrorOr> CFF::create(ReadonlyBytes const& cff_bytes, RefPtr encoding) +{ + Reader reader(cff_bytes); + + // Header + // skip major, minor version + reader.consume(2); + auto header_size = TRY(reader.try_read()); + // skip offset size + reader.consume(1); + reader.move_to(header_size); + + // Name INDEX + Vector font_names; + TRY(parse_index(reader, [&](ReadonlyBytes const& data) -> PDFErrorOr { + auto string = TRY(String::from_utf8(data)); + return TRY(font_names.try_append(string)); + })); + + auto cff = adopt_ref(*new CFF()); + cff->set_font_matrix({ 0.001f, 0.0f, 0.0f, 0.001f, 0.0f, 0.0f }); + + // Top DICT INDEX + int charset_offset = 0; + Vector encoding_codes; + auto charstrings_offset = 0; + Vector subroutines; + int defaultWidthX = 0; + int nominalWidthX = 0; + TRY(parse_index(reader, [&](ReadonlyBytes const& element_data) { + Reader element_reader { element_data }; + return parse_dict(element_reader, [&](TopDictOperator op, Vector const& operands) -> PDFErrorOr { + switch (op) { + case TopDictOperator::Encoding: { + auto encoding_offset = 0; + if (!operands.is_empty()) + encoding_offset = operands[0].get(); + encoding_codes = TRY(parse_encoding(Reader(cff_bytes.slice(encoding_offset)))); + break; + } + case TopDictOperator::Charset: { + if (!operands.is_empty()) + charset_offset = operands[0].get(); + break; + } + case TopDictOperator::CharStrings: { + if (!operands.is_empty()) + charstrings_offset = operands[0].get(); + break; + } + case TopDictOperator::Private: { + auto private_dict_size = operands[0].get(); + auto private_dict_offset = operands[1].get(); + Reader priv_dict_reader { cff_bytes.slice(private_dict_offset, private_dict_size) }; + TRY(parse_dict(priv_dict_reader, [&](PrivDictOperator op, Vector const& operands) -> PDFErrorOr { + switch (op) { + case PrivDictOperator::Subrs: { + auto subrs_offset = 
operands[0].get(); + Reader subrs_reader { cff_bytes.slice(private_dict_offset + subrs_offset) }; + dbgln("Parsing Subrs INDEX"); + TRY(parse_index(subrs_reader, [&](ReadonlyBytes const& subroutine_bytes) -> PDFErrorOr { + return TRY(subroutines.try_append(TRY(ByteBuffer::copy(subroutine_bytes)))); + })); + break; + } + case PrivDictOperator::DefaultWidthX: + defaultWidthX = operands[0].get(); + break; + case PrivDictOperator::NominalWidthX: + nominalWidthX = operands[0].get(); + break; + } + return {}; + })); + break; + } + default:; + } + return {}; + }); + })); + + // Create glpyhs (now that we have the subroutines) and associate missing information to store them and their encoding + auto glyphs = TRY(parse_charstrings(Reader(cff_bytes.slice(charstrings_offset)), subroutines)); + auto charset = TRY(parse_charset(Reader { cff_bytes.slice(charset_offset) }, glyphs.size())); + + // Adjust glyphs' widths as they are deltas from nominalWidthX + for (auto& glyph : glyphs) { + if (!glyph.width_specified) + glyph.width = float(defaultWidthX); + else + glyph.width += float(nominalWidthX); + } + + // Encoding given or read + if (encoding) { + for (size_t i = 0; i < glyphs.size(); i++) { + if (i == 0) { + TRY(cff->add_glyph(0, move(glyphs[0]))); + continue; + } + auto const& name = charset[i - 1]; + u16 code = encoding->get_char_code(name); + TRY(cff->add_glyph(code, move(glyphs[i]))); + } + cff->set_encoding(move(encoding)); + } else { + HashMap descriptors; + for (size_t i = 0; i < glyphs.size(); i++) { + if (i == 0) { + TRY(cff->add_glyph(0, move(glyphs[0]))); + descriptors.set(0, CharDescriptor { ".notdef", 0 }); + continue; + } + auto code = encoding_codes[i - 1]; + auto char_name = charset[i - 1]; + TRY(cff->add_glyph(code, move(glyphs[i]))); + descriptors.set(code, CharDescriptor { char_name, code }); + } + cff->set_encoding(TRY(Encoding::create(descriptors))); + } + + return cff; +} + +HashMap CFF::builtin_names { + { 0, ".notdef" }, + { 1, "space" }, + { 9, 
"parenleft" }, + { 10, "parenright" }, + { 13, "comma" }, + { 14, "hyphen" }, + { 15, "period" }, + + { 17, "zero" }, + { 18, "one" }, + { 19, "two" }, + { 20, "three" }, + { 21, "four" }, + { 22, "five" }, + { 23, "six" }, + { 24, "seven" }, + { 25, "eight" }, + { 26, "nine" }, + { 27, "colon" }, + { 28, "semicolon" }, + + { 34, "A" }, + { 35, "B" }, + { 36, "C" }, + { 37, "D" }, + { 38, "E" }, + { 39, "F" }, + { 40, "G" }, + { 41, "H" }, + { 42, "I" }, + { 43, "J" }, + { 44, "K" }, + { 45, "L" }, + { 46, "M" }, + { 47, "N" }, + { 48, "O" }, + { 49, "P" }, + { 50, "Q" }, + { 51, "R" }, + { 52, "S" }, + { 53, "T" }, + { 54, "U" }, + { 55, "V" }, + { 56, "W" }, + { 57, "X" }, + { 58, "Y" }, + { 59, "Z" }, + { 66, "a" }, + { 67, "b" }, + { 68, "c" }, + { 69, "d" }, + { 70, "e" }, + { 71, "f" }, + { 72, "g" }, + { 73, "h" }, + { 74, "i" }, + { 75, "j" }, + { 76, "k" }, + { 77, "l" }, + { 78, "m" }, + { 79, "n" }, + { 80, "o" }, + { 81, "p" }, + { 82, "q" }, + { 83, "r" }, + { 84, "s" }, + { 85, "t" }, + { 86, "u" }, + { 87, "v" }, + { 88, "w" }, + { 89, "x" }, + { 90, "y" }, + { 91, "z" }, + + { 104, "quotesingle" }, + { 105, "quotedblleft" }, + + { 111, "endash" }, + + { 116, "bullet" }, + + { 119, "quotedblright" }, + + { 137, "emdash" }, + + { 170, "copyright" }, +}; + +PDFErrorOr> CFF::parse_charset(Reader&& reader, size_t glyph_count) +{ + Vector names; + auto resolve = [](SID sid) { + auto x = builtin_names.find(sid); + if (x == builtin_names.end()) { + dbgln("Cound't find string for SID {}, going with space", sid); + return DeprecatedFlyString("space"); + } + return x->value; + }; + + auto format = TRY(reader.try_read()); + if (format == 0) { + for (u8 i = 0; i < glyph_count - 1; i++) { + SID sid = TRY(reader.try_read>()); + TRY(names.try_append(resolve(sid))); + } + } else if (format == 1) { + while (names.size() < glyph_count - 1) { + auto first_sid = TRY(reader.try_read>()); + int left = TRY(reader.try_read()); + for (u8 sid = first_sid; left >= 0; left--, 
sid++) + TRY(names.try_append(resolve(sid))); + } + } + return names; +} + +PDFErrorOr> CFF::parse_charstrings(Reader&& reader, Vector const& subroutines) +{ + Vector glyphs; + TRY(parse_index(reader, [&](ReadonlyBytes const& charstring_data) -> PDFErrorOr { + GlyphParserState state; + auto glyph = TRY(parse_glyph(charstring_data, subroutines, state, true)); + return TRY(glyphs.try_append(glyph)); + })); + return glyphs; +} + +PDFErrorOr> CFF::parse_encoding(Reader&& reader) +{ + Vector encoding_codes; + auto format = TRY(reader.try_read()); + if (format == 0) { + auto n_codes = TRY(reader.try_read()); + for (u8 i = 0; i < n_codes; i++) { + TRY(encoding_codes.try_append(TRY(reader.try_read()))); + } + } else if (format == 1) { + auto n_ranges = TRY(reader.try_read()); + for (u8 i = 0; i < n_ranges; i++) { + auto first_code = TRY(reader.try_read()); + int left = TRY(reader.try_read()); + for (u8 code = first_code; left >= 0; left--, code++) + TRY(encoding_codes.try_append(code)); + } + } else + return error(DeprecatedString::formatted("Invalid encoding format: {}", format)); + return encoding_codes; +} + +template +PDFErrorOr CFF::parse_dict(Reader& reader, DictEntryHandler&& handler) +{ + Vector operands; + while (reader.remaining() > 0) { + auto b0 = reader.read(); + // A command + if (b0 <= 21) { + auto op = TRY(parse_dict_operator(b0, reader)); + TRY(handler(op, operands)); + operands.clear(); + continue; + } + // An operand + TRY(operands.try_append(TRY(load_dict_operand(b0, reader)))); + } + return {}; +} + +template PDFErrorOr CFF::parse_dict(Reader&, DictEntryHandler&&); +template PDFErrorOr CFF::parse_dict(Reader&, DictEntryHandler&&); + +template +PDFErrorOr CFF::parse_dict_operator(u8 b0, Reader& reader) +{ + VERIFY(b0 <= 21); + if (b0 != 12) + return OperatorT { (int)b0 }; + auto b1 = TRY(reader.try_read()); + return OperatorT { b0 << 8 | b1 }; +} + +template PDFErrorOr CFF::parse_dict_operator(u8, Reader&); + +PDFErrorOr CFF::parse_index(Reader& reader, 
IndexDataHandler&& data_handler) +{ + Card16 count = TRY(reader.try_read>()); + if (count == 0) + return {}; + auto offset_size = TRY(reader.try_read()); + if (offset_size == 1) + return parse_index_data(count, reader, data_handler); + if (offset_size == 2) + return parse_index_data(count, reader, data_handler); + if (offset_size == 4) + return parse_index_data(count, reader, data_handler); + VERIFY_NOT_REACHED(); +} + +template +PDFErrorOr CFF::parse_index_data(Card16 count, Reader& reader, IndexDataHandler& handler) +{ + OffsetType last_data_end = 1; + auto offset_refpoint = reader.offset() + sizeof(OffsetType) * (count + 1) - 1; + for (u16 i = 0; i < count; i++) { + reader.save(); + reader.move_by(sizeof(OffsetType) * i); + OffsetType data_start = reader.read>(); + last_data_end = reader.read>(); + auto data_size = last_data_end - data_start; + reader.move_to(offset_refpoint + data_start); + TRY(handler(reader.bytes().slice(reader.offset(), data_size))); + reader.load(); + } + reader.move_to(offset_refpoint + last_data_end); + return {}; +} + +template PDFErrorOr CFF::parse_index_data(Card16, Reader&, IndexDataHandler&); +template PDFErrorOr CFF::parse_index_data(Card16, Reader&, IndexDataHandler&); +template PDFErrorOr CFF::parse_index_data(Card16, Reader&, IndexDataHandler&); + +// 4 DICT DATA, Table 3 Operand Encoding +int CFF::load_int_dict_operand(u8 b0, Reader& reader) +{ + if (b0 >= 32 && b0 <= 246) { + return b0 - 139; + } + if (b0 >= 247 && b0 <= 250) { + auto b1 = reader.read(); + return (b0 - 247) * 256 + b1 + 108; + } + if (b0 >= 251 && b0 <= 254) { + auto b1 = reader.read(); + return -(b0 - 251) * 256 - b1 - 108; + } + if (b0 == 28) { + auto b1 = reader.read(); + auto b2 = reader.read(); + return b1 << 8 | b2; + } + if (b0 == 29) { + auto b1 = reader.read(); + auto b2 = reader.read(); + auto b3 = reader.read(); + auto b4 = reader.read(); + return b1 << 24 | b2 << 16 | b3 << 8 | b4; + } + VERIFY_NOT_REACHED(); +} + +float 
CFF::load_float_dict_operand(Reader& reader) +{ + StringBuilder sb; + auto add_nibble = [&](char nibble) { + if (nibble < 0xa) + sb.append('0' + nibble); + else if (nibble == 0xa) + sb.append('.'); + else if (nibble == 0xb) + sb.append('E'); + else if (nibble == 0xc) + sb.append("E-"sv); + else if (nibble == 0xe) + sb.append('-'); + }; + while (true) { + auto byte = reader.read(); + char nibble1 = (byte & 0xf0) >> 4; + char nibble2 = byte & 0x0f; + if (nibble1 == 0xf) + break; + add_nibble(nibble1); + if (nibble2 == 0xf) + break; + add_nibble(nibble2); + } + auto result = AK::StringUtils::convert_to_floating_point(sb.string_view()); + return result.release_value(); +} + +PDFErrorOr CFF::load_dict_operand(u8 b0, Reader& reader) +{ + if (b0 == 30) + return load_float_dict_operand(reader); + if (b0 >= 28) + return load_int_dict_operand(b0, reader); + return Error { Error::Type::MalformedPDF, DeprecatedString::formatted("Unknown CFF dict element prefix: {}", b0) }; +} +} diff --git a/Userland/Libraries/LibPDF/Fonts/CFF.h b/Userland/Libraries/LibPDF/Fonts/CFF.h new file mode 100644 index 0000000000..7f911ae9ef --- /dev/null +++ b/Userland/Libraries/LibPDF/Fonts/CFF.h @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2023, Rodrigo Tobar . 
+ * + * SPDX-License-Identifier: BSD-2-Clause + */ + +#pragma once + +#include +#include +#include +#include + +namespace PDF { + +class Reader; + +class CFF : public Type1FontProgram { + +private: + // Table 9: Top DICT Operator Entries + enum class TopDictOperator { + Version = 0, + Notice, + FullName, + FamilyName, + Weight, + FontBBox, + // UniqueID = 13, + // XUID, + Charset = 15, + Encoding, + CharStrings, + Private, + // IsFixedPitch = (12 << 8 | 1), + // ItalicAngle, + // UnderlinePosition, + // UnderlineThickness, + // PaintType, + }; + + enum class PrivDictOperator { + Subrs = 19, + DefaultWidthX, + NominalWidthX, + }; + +public: + static PDFErrorOr> create(ReadonlyBytes const&, RefPtr encoding); + + // to private + using Card8 = u8; + using Card16 = u16; + using Offset = i32; + using OffSize = u8; + using SID = u16; + using DictOperand = Variant; + + static int load_int_dict_operand(u8 b0, Reader&); + static float load_float_dict_operand(Reader&); + static PDFErrorOr load_dict_operand(u8, Reader&); + + using IndexDataHandler = Function(ReadonlyBytes const&)>; + static PDFErrorOr parse_index(Reader& reader, IndexDataHandler&&); + + template + static PDFErrorOr parse_index_data(Card16 count, Reader& reader, IndexDataHandler&); + + template + using DictEntryHandler = Function(OperatorT, Vector const&)>; + + template + static PDFErrorOr parse_dict(Reader& reader, DictEntryHandler&& handler); + + template + static PDFErrorOr parse_dict_operator(u8, Reader&); + + static PDFErrorOr> parse_charstrings(Reader&&, Vector const& subroutines); + + static PDFErrorOr> parse_charset(Reader&&, size_t); + static PDFErrorOr> parse_encoding(Reader&&); + + static HashMap builtin_names; +}; + +}