mirror of
https://github.com/RGBCube/serenity
synced 2025-05-15 09:44:58 +00:00
LibUnicode: Add decomposition mappings and Unicode normalization
The mappings are exposed via `Unicode::code_point_decomposition(u32)` and `Unicode::code_point_decompositions()`, the latter being useful for reverse searching a code point from its decomposition. The normalization code does not make use of `Quick_Check` props (https://www.unicode.org/reports/tr44/#Decompositions_and_Normalization), meaning no quick check optimizations.
This commit is contained in:
parent
e8410bc2ee
commit
70d0c1616f
5 changed files with 392 additions and 2 deletions
|
@ -44,6 +44,14 @@ struct SpecialCasing {
|
|||
String condition;
|
||||
};
|
||||
|
||||
// Field descriptions: https://www.unicode.org/reports/tr44/#Character_Decomposition_Mappings
|
||||
struct CodePointDecomposition {
|
||||
// `tag` is a string since it's used for codegen as an enum value.
|
||||
String tag { "Canonical"sv };
|
||||
size_t decomposition_index { 0 };
|
||||
size_t decomposition_size { 0 };
|
||||
};
|
||||
|
||||
// PropList source: https://www.unicode.org/Public/13.0.0/ucd/PropList.txt
|
||||
// Property descriptions: https://www.unicode.org/reports/tr44/tr44-13.html#PropList.txt
|
||||
using PropList = HashMap<String, Vector<CodePointRange>>;
|
||||
|
@ -78,7 +86,7 @@ struct CodePointData {
|
|||
Optional<StringIndexType> abbreviation;
|
||||
u8 canonical_combining_class { 0 };
|
||||
String bidi_class;
|
||||
String decomposition_type;
|
||||
Optional<CodePointDecomposition> decomposition_mapping;
|
||||
Optional<i8> numeric_value_decimal;
|
||||
Optional<i8> numeric_value_digit;
|
||||
Optional<i8> numeric_value_numeric;
|
||||
|
@ -101,6 +109,10 @@ struct UnicodeData {
|
|||
|
||||
u32 code_points_with_non_zero_combining_class { 0 };
|
||||
|
||||
u32 code_points_with_decomposition_mapping { 0 };
|
||||
Vector<u32> decomposition_mappings;
|
||||
Vector<String> compatibility_tags;
|
||||
|
||||
u32 simple_uppercase_mapping_size { 0 };
|
||||
u32 simple_lowercase_mapping_size { 0 };
|
||||
|
||||
|
@ -534,6 +546,35 @@ static void add_canonical_code_point_name(CodePointRange range, StringView name,
|
|||
unicode_data.code_point_display_names.append({ range, index });
|
||||
}
|
||||
|
||||
static Optional<CodePointDecomposition> parse_decomposition_mapping(StringView string, UnicodeData& unicode_data)
|
||||
{
|
||||
if (string.is_empty())
|
||||
return {};
|
||||
|
||||
CodePointDecomposition mapping;
|
||||
|
||||
auto parts = string.split_view(' ');
|
||||
|
||||
VERIFY(parts.size() > 0);
|
||||
|
||||
if (parts.first().starts_with('<')) {
|
||||
auto const tag = parts.take_first().trim("<>"sv);
|
||||
|
||||
mapping.tag = String::formatted("{:c}{}", to_ascii_uppercase(tag[0]), tag.substring_view(1));
|
||||
|
||||
if (!unicode_data.compatibility_tags.contains_slow(mapping.tag))
|
||||
unicode_data.compatibility_tags.append(mapping.tag);
|
||||
}
|
||||
|
||||
mapping.decomposition_index = unicode_data.decomposition_mappings.size();
|
||||
mapping.decomposition_size = parts.size();
|
||||
for (auto part : parts) {
|
||||
unicode_data.decomposition_mappings.append(AK::StringUtils::convert_to_uint_from_hex<u32>(part).value());
|
||||
}
|
||||
|
||||
return mapping;
|
||||
}
|
||||
|
||||
static ErrorOr<void> parse_block_display_names(Core::Stream::BufferedFile& file, UnicodeData& unicode_data)
|
||||
{
|
||||
Array<u8, 1024> buffer;
|
||||
|
@ -581,7 +622,7 @@ static ErrorOr<void> parse_unicode_data(Core::Stream::BufferedFile& file, Unicod
|
|||
data.name = segments[1];
|
||||
data.canonical_combining_class = AK::StringUtils::convert_to_uint<u8>(segments[3]).value();
|
||||
data.bidi_class = segments[4];
|
||||
data.decomposition_type = segments[5];
|
||||
data.decomposition_mapping = parse_decomposition_mapping(segments[5], unicode_data);
|
||||
data.numeric_value_decimal = AK::StringUtils::convert_to_int<i8>(segments[6]);
|
||||
data.numeric_value_digit = AK::StringUtils::convert_to_int<i8>(segments[7]);
|
||||
data.numeric_value_numeric = AK::StringUtils::convert_to_int<i8>(segments[8]);
|
||||
|
@ -639,6 +680,7 @@ static ErrorOr<void> parse_unicode_data(Core::Stream::BufferedFile& file, Unicod
|
|||
unicode_data.code_points_with_non_zero_combining_class += data.canonical_combining_class != 0;
|
||||
unicode_data.simple_uppercase_mapping_size += data.simple_uppercase_mapping.has_value();
|
||||
unicode_data.simple_lowercase_mapping_size += data.simple_lowercase_mapping.has_value();
|
||||
unicode_data.code_points_with_decomposition_mapping += data.decomposition_mapping.has_value();
|
||||
|
||||
unicode_data.code_points_with_special_casing += has_special_casing;
|
||||
unicode_data.largest_special_casing_size = max(unicode_data.largest_special_casing_size, data.special_casing_indices.size());
|
||||
|
@ -714,6 +756,7 @@ namespace Unicode {
|
|||
generate_enum("GraphemeBreakProperty"sv, {}, unicode_data.grapheme_break_props.keys());
|
||||
generate_enum("WordBreakProperty"sv, {}, unicode_data.word_break_props.keys());
|
||||
generate_enum("SentenceBreakProperty"sv, {}, unicode_data.sentence_break_props.keys());
|
||||
generate_enum("CompatibilityFormattingTag"sv, "Canonical"sv, unicode_data.compatibility_tags);
|
||||
|
||||
generator.append(R"~~~(
|
||||
struct SpecialCasing {
|
||||
|
@ -732,6 +775,12 @@ struct SpecialCasing {
|
|||
Condition condition { Condition::None };
|
||||
};
|
||||
|
||||
struct CodePointDecomposition {
|
||||
u32 code_point { 0 };
|
||||
CompatibilityFormattingTag tag { CompatibilityFormattingTag::Canonical };
|
||||
Span<u32 const> decomposition;
|
||||
};
|
||||
|
||||
Optional<Locale> locale_from_string(StringView locale);
|
||||
|
||||
}
|
||||
|
@ -760,6 +809,7 @@ static ErrorOr<void> generate_unicode_data_implementation(Core::Stream::Buffered
|
|||
#include <AK/StringView.h>
|
||||
#include <LibUnicode/CharacterTypes.h>
|
||||
#include <LibUnicode/UnicodeData.h>
|
||||
#include <LibUnicode/Normalize.h>
|
||||
|
||||
namespace Unicode {
|
||||
)~~~");
|
||||
|
@ -863,6 +913,11 @@ struct CodePointNameComparator : public CodePointRangeComparator {
|
|||
};
|
||||
)~~~");
|
||||
|
||||
generator.set("decomposition_mappings_size", String::number(unicode_data.decomposition_mappings.size()));
|
||||
generator.append("\nstatic constexpr Array<u32, @decomposition_mappings_size@> s_decomposition_mappings_data { ");
|
||||
generator.append(String::join(", "sv, unicode_data.decomposition_mappings, "{:#x}"sv));
|
||||
generator.append(" };\n");
|
||||
|
||||
auto append_code_point_mappings = [&](StringView name, StringView mapping_type, u32 size, auto mapping_getter) {
|
||||
generator.set("name", name);
|
||||
generator.set("mapping_type", mapping_type);
|
||||
|
@ -895,6 +950,11 @@ static constexpr Array<@mapping_type@, @size@> s_@name@_mappings { {
|
|||
if constexpr (IsSame<decltype(mapping), Optional<u32>> || IsSame<decltype(mapping), Optional<StringIndexType>>) {
|
||||
generator.set("mapping", String::formatted("{:#x}", *mapping));
|
||||
generator.append(", @mapping@ },");
|
||||
} else if constexpr (IsSame<decltype(mapping), Optional<CodePointDecomposition>>) {
|
||||
generator.set("tag", mapping->tag);
|
||||
generator.set("start", String::number(mapping->decomposition_index));
|
||||
generator.set("size", String::number(mapping->decomposition_size));
|
||||
generator.append(", CompatibilityFormattingTag::@tag@, Span<u32 const> { s_decomposition_mappings_data.data() + @start@, @size@ } },");
|
||||
} else {
|
||||
append_list_and_size(data.special_casing_indices, "&s_special_casing[{}]"sv);
|
||||
generator.append(" },");
|
||||
|
@ -921,6 +981,11 @@ static constexpr Array<@mapping_type@, @size@> s_@name@_mappings { {
|
|||
append_code_point_mappings("special_case"sv, "SpecialCaseMapping"sv, unicode_data.code_points_with_special_casing, [](auto const& data) { return data.special_casing_indices; });
|
||||
append_code_point_mappings("abbreviation"sv, "CodePointAbbreviation"sv, unicode_data.code_point_abbreviations.size(), [](auto const& data) { return data.abbreviation; });
|
||||
|
||||
append_code_point_mappings("decomposition"sv, "CodePointDecomposition"sv, unicode_data.code_points_with_decomposition_mapping,
|
||||
[](auto const& data) {
|
||||
return data.decomposition_mapping;
|
||||
});
|
||||
|
||||
auto append_code_point_range_list = [&](String name, Vector<CodePointRange> const& ranges) {
|
||||
generator.set("name", name);
|
||||
generator.set("size", String::number(ranges.size()));
|
||||
|
@ -1094,6 +1159,19 @@ Optional<StringView> code_point_abbreviation(u32 code_point)
|
|||
|
||||
return decode_string(mapping->abbreviation);
|
||||
}
|
||||
|
||||
Optional<CodePointDecomposition const&> code_point_decomposition(u32 code_point)
|
||||
{
|
||||
auto const* mapping = binary_search(s_decomposition_mappings, code_point, nullptr, CodePointComparator<CodePointDecomposition> {});
|
||||
if (mapping == nullptr)
|
||||
return {};
|
||||
return *mapping;
|
||||
}
|
||||
|
||||
Span<CodePointDecomposition const> code_point_decompositions()
|
||||
{
|
||||
return s_decomposition_mappings;
|
||||
}
|
||||
)~~~");
|
||||
|
||||
auto append_prop_search = [&](StringView enum_title, StringView enum_snake, StringView collection_name) {
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue