mirror of
https://github.com/RGBCube/serenity
synced 2025-05-16 17:55:06 +00:00
LibUnicode: Parse and generate case folding code point data
Case folding rules have a similar mapping style as special casing rules, where one code point may map to zero or more case folding rules. These will be used for case-insensitive string comparisons. To see how case folding can differ from other casing rules, consider "ß" (U+00DF): >>> "ß".lower() 'ß' >>> "ß".upper() 'SS' >>> "ß".title() 'Ss' >>> "ß".casefold() 'ss'
This commit is contained in:
parent
9226cf7272
commit
8f2589b3b0
7 changed files with 207 additions and 5 deletions
|
@ -41,6 +41,13 @@ struct SpecialCasing {
|
|||
DeprecatedString condition;
|
||||
};
|
||||
|
||||
// https://www.unicode.org/reports/tr44/#CaseFolding.txt
|
||||
struct CaseFolding {
|
||||
u32 code_point { 0 };
|
||||
StringView status { "Common"sv };
|
||||
Vector<u32> mapping { 0 };
|
||||
};
|
||||
|
||||
// https://www.unicode.org/reports/tr44/#Character_Decomposition_Mappings
|
||||
struct CodePointDecomposition {
|
||||
// `tag` is a string since it's used for codegen as an enum value.
|
||||
|
@ -90,6 +97,7 @@ struct CodePointData {
|
|||
Optional<u32> simple_lowercase_mapping;
|
||||
Optional<u32> simple_titlecase_mapping;
|
||||
Vector<u32> special_casing_indices;
|
||||
Vector<u32> case_folding_indices;
|
||||
};
|
||||
|
||||
struct BlockName {
|
||||
|
@ -117,6 +125,12 @@ struct UnicodeData {
|
|||
Vector<DeprecatedString> conditions;
|
||||
Vector<DeprecatedString> locales;
|
||||
|
||||
Vector<CaseFolding> case_folding;
|
||||
u32 code_points_with_case_folding { 0 };
|
||||
u32 largest_case_folding_mapping_size { 0 };
|
||||
u32 largest_case_folding_size { 0 };
|
||||
Vector<StringView> statuses;
|
||||
|
||||
Vector<CodePointData> code_point_data;
|
||||
|
||||
HashMap<u32, size_t> code_point_abbreviations;
|
||||
|
@ -276,6 +290,54 @@ static ErrorOr<void> parse_special_casing(Core::Stream::BufferedFile& file, Unic
|
|||
return {};
|
||||
}
|
||||
|
||||
static ErrorOr<void> parse_case_folding(Core::Stream::BufferedFile& file, UnicodeData& unicode_data)
|
||||
{
|
||||
Array<u8, 1024> buffer;
|
||||
|
||||
while (TRY(file.can_read_line())) {
|
||||
auto line = TRY(file.read_line(buffer));
|
||||
if (line.is_empty() || line.starts_with('#'))
|
||||
continue;
|
||||
|
||||
auto segments = line.split_view(';', SplitBehavior::KeepEmpty);
|
||||
VERIFY(segments.size() == 4);
|
||||
|
||||
CaseFolding folding {};
|
||||
folding.code_point = AK::StringUtils::convert_to_uint_from_hex<u32>(segments[0]).value();
|
||||
folding.mapping = parse_code_point_list(segments[2]);
|
||||
|
||||
switch (segments[1].trim_whitespace()[0]) {
|
||||
case 'C':
|
||||
folding.status = "Common"sv;
|
||||
break;
|
||||
case 'F':
|
||||
folding.status = "Full"sv;
|
||||
break;
|
||||
case 'S':
|
||||
folding.status = "Simple"sv;
|
||||
break;
|
||||
case 'T':
|
||||
folding.status = "Special"sv;
|
||||
break;
|
||||
}
|
||||
|
||||
unicode_data.largest_case_folding_mapping_size = max(unicode_data.largest_case_folding_mapping_size, folding.mapping.size());
|
||||
|
||||
if (!unicode_data.statuses.contains_slow(folding.status))
|
||||
unicode_data.statuses.append(folding.status);
|
||||
|
||||
unicode_data.case_folding.append(move(folding));
|
||||
}
|
||||
|
||||
quick_sort(unicode_data.case_folding, [](auto const& lhs, auto const& rhs) {
|
||||
if (lhs.code_point != rhs.code_point)
|
||||
return lhs.code_point < rhs.code_point;
|
||||
return lhs.status < rhs.status;
|
||||
});
|
||||
|
||||
return {};
|
||||
}
|
||||
|
||||
static ErrorOr<void> parse_prop_list(Core::Stream::BufferedFile& file, PropList& prop_list, bool multi_value_property = false, bool sanitize_property = false)
|
||||
{
|
||||
Array<u8, 1024> buffer;
|
||||
|
@ -667,6 +729,14 @@ static ErrorOr<void> parse_unicode_data(Core::Stream::BufferedFile& file, Unicod
|
|||
}
|
||||
}
|
||||
|
||||
bool has_case_folding { false };
|
||||
for (size_t i = 0; i < unicode_data.case_folding.size(); ++i) {
|
||||
if (auto const& folding = unicode_data.case_folding[i]; folding.code_point == data.code_point) {
|
||||
data.case_folding_indices.append(i);
|
||||
has_case_folding = true;
|
||||
}
|
||||
}
|
||||
|
||||
unicode_data.code_points_with_non_zero_combining_class += data.canonical_combining_class != 0;
|
||||
unicode_data.simple_uppercase_mapping_size += data.simple_uppercase_mapping.has_value();
|
||||
unicode_data.simple_lowercase_mapping_size += data.simple_lowercase_mapping.has_value();
|
||||
|
@ -675,8 +745,11 @@ static ErrorOr<void> parse_unicode_data(Core::Stream::BufferedFile& file, Unicod
|
|||
|
||||
unicode_data.code_points_with_special_casing += has_special_casing;
|
||||
unicode_data.largest_special_casing_size = max(unicode_data.largest_special_casing_size, data.special_casing_indices.size());
|
||||
previous_code_point = data.code_point;
|
||||
|
||||
unicode_data.code_points_with_case_folding += has_case_folding;
|
||||
unicode_data.largest_case_folding_size = max(unicode_data.largest_case_folding_size, data.case_folding_indices.size());
|
||||
|
||||
previous_code_point = data.code_point;
|
||||
unicode_data.code_point_data.append(move(data));
|
||||
}
|
||||
|
||||
|
@ -688,8 +761,9 @@ static ErrorOr<void> generate_unicode_data_header(Core::Stream::BufferedFile& fi
|
|||
StringBuilder builder;
|
||||
SourceGenerator generator { builder };
|
||||
generator.set("special_casing_mapping_size", DeprecatedString::number(unicode_data.largest_special_casing_mapping_size));
|
||||
generator.set("case_folding_mapping_size", DeprecatedString::number(unicode_data.largest_case_folding_mapping_size));
|
||||
|
||||
auto generate_enum = [&](StringView name, StringView default_, Vector<DeprecatedString> values, Vector<Alias> aliases = {}) {
|
||||
auto generate_enum = [&](StringView name, StringView default_, auto values, Vector<Alias> aliases = {}) {
|
||||
quick_sort(values);
|
||||
quick_sort(aliases, [](auto& alias1, auto& alias2) { return alias1.alias < alias2.alias; });
|
||||
|
||||
|
@ -737,6 +811,7 @@ namespace Unicode {
|
|||
|
||||
generate_enum("Locale"sv, "None"sv, unicode_data.locales);
|
||||
generate_enum("Condition"sv, "None"sv, move(unicode_data.conditions));
|
||||
generate_enum("CaseFoldingStatus"sv, {}, move(unicode_data.statuses));
|
||||
generate_enum("GeneralCategory"sv, {}, unicode_data.general_categories.keys(), unicode_data.general_category_aliases);
|
||||
generate_enum("Property"sv, {}, unicode_data.prop_list.keys(), unicode_data.prop_aliases);
|
||||
generate_enum("Script"sv, {}, unicode_data.script_list.keys(), unicode_data.script_aliases);
|
||||
|
@ -763,6 +838,14 @@ struct SpecialCasing {
|
|||
Condition condition { Condition::None };
|
||||
};
|
||||
|
||||
struct CaseFolding {
|
||||
u32 code_point { 0 };
|
||||
CaseFoldingStatus status { CaseFoldingStatus::Common };
|
||||
|
||||
u32 mapping[@case_folding_mapping_size@];
|
||||
u32 mapping_size { 0 };
|
||||
};
|
||||
|
||||
struct CodePointDecompositionRaw {
|
||||
u32 code_point { 0 };
|
||||
CompatibilityFormattingTag tag { CompatibilityFormattingTag::Canonical };
|
||||
|
@ -779,6 +862,7 @@ struct CodePointDecomposition {
|
|||
Optional<Locale> locale_from_string(StringView locale);
|
||||
|
||||
Span<SpecialCasing const* const> special_case_mapping(u32 code_point);
|
||||
Span<CaseFolding const* const> case_folding_mapping(u32 code_point);
|
||||
|
||||
}
|
||||
)~~~");
|
||||
|
@ -795,6 +879,8 @@ static ErrorOr<void> generate_unicode_data_implementation(Core::Stream::Buffered
|
|||
generator.set("string_index_type"sv, unicode_data.unique_strings.type_that_fits());
|
||||
generator.set("largest_special_casing_size", DeprecatedString::number(unicode_data.largest_special_casing_size));
|
||||
generator.set("special_casing_size", DeprecatedString::number(unicode_data.special_casing.size()));
|
||||
generator.set("largest_case_folding_size", DeprecatedString::number(unicode_data.largest_case_folding_size));
|
||||
generator.set("case_folding_size", DeprecatedString::number(unicode_data.case_folding.size()));
|
||||
|
||||
generator.append(R"~~~(
|
||||
#include <AK/Array.h>
|
||||
|
@ -830,7 +916,7 @@ namespace Unicode {
|
|||
};
|
||||
|
||||
generator.append(R"~~~(
|
||||
static constexpr Array<SpecialCasing, @special_casing_size@> s_special_casing { {)~~~");
|
||||
static constexpr Array<SpecialCasing, @special_casing_size@> s_special_case { {)~~~");
|
||||
|
||||
for (auto const& casing : unicode_data.special_casing) {
|
||||
generator.set("code_point", DeprecatedString::formatted("{:#x}", casing.code_point));
|
||||
|
@ -854,6 +940,21 @@ static constexpr Array<SpecialCasing, @special_casing_size@> s_special_casing {
|
|||
generator.append(R"~~~(
|
||||
} };
|
||||
|
||||
static constexpr Array<CaseFolding, @case_folding_size@> s_case_folding { {)~~~");
|
||||
|
||||
for (auto const& folding : unicode_data.case_folding) {
|
||||
generator.set("code_point", DeprecatedString::formatted("{:#x}", folding.code_point));
|
||||
generator.set("status", folding.status);
|
||||
generator.append(R"~~~(
|
||||
{ @code_point@, CaseFoldingStatus::@status@)~~~");
|
||||
|
||||
append_list_and_size(folding.mapping, "0x{:x}"sv);
|
||||
generator.append(" },");
|
||||
}
|
||||
|
||||
generator.append(R"~~~(
|
||||
} };
|
||||
|
||||
struct CodePointMapping {
|
||||
u32 code_point { 0 };
|
||||
u32 mapping { 0 };
|
||||
|
@ -865,6 +966,12 @@ struct SpecialCaseMapping {
|
|||
u32 special_casing_size { 0 };
|
||||
};
|
||||
|
||||
struct CaseFoldingMapping {
|
||||
u32 code_point { 0 };
|
||||
Array<CaseFolding const*, @largest_case_folding_size@> case_folding {};
|
||||
u32 case_folding_size { 0 };
|
||||
};
|
||||
|
||||
struct CodePointAbbreviation {
|
||||
u32 code_point { 0 };
|
||||
@string_index_type@ abbreviation { 0 };
|
||||
|
@ -953,7 +1060,7 @@ static constexpr Array<@mapping_type@, @size@> s_@name@_mappings { {
|
|||
generator.set("size", DeprecatedString::number(mapping->decomposition_size));
|
||||
generator.append(", CompatibilityFormattingTag::@tag@, @start@, @size@ },");
|
||||
} else {
|
||||
append_list_and_size(data.special_casing_indices, "&s_special_casing[{}]"sv);
|
||||
append_list_and_size(mapping, "&s_@name@[{}]"sv);
|
||||
generator.append(" },");
|
||||
}
|
||||
|
||||
|
@ -977,6 +1084,7 @@ static constexpr Array<@mapping_type@, @size@> s_@name@_mappings { {
|
|||
append_code_point_mappings("lowercase"sv, "CodePointMapping"sv, unicode_data.simple_lowercase_mapping_size, [](auto const& data) { return data.simple_lowercase_mapping; });
|
||||
append_code_point_mappings("titlecase"sv, "CodePointMapping"sv, unicode_data.simple_titlecase_mapping_size, [](auto const& data) { return data.simple_titlecase_mapping; });
|
||||
append_code_point_mappings("special_case"sv, "SpecialCaseMapping"sv, unicode_data.code_points_with_special_casing, [](auto const& data) { return data.special_casing_indices; });
|
||||
append_code_point_mappings("case_folding"sv, "CaseFoldingMapping"sv, unicode_data.code_points_with_case_folding, [](auto const& data) { return data.case_folding_indices; });
|
||||
append_code_point_mappings("abbreviation"sv, "CodePointAbbreviation"sv, unicode_data.code_point_abbreviations.size(), [](auto const& data) { return data.abbreviation; });
|
||||
|
||||
append_code_point_mappings("decomposition"sv, "CodePointDecompositionRaw"sv, unicode_data.code_points_with_decomposition_mapping,
|
||||
|
@ -1148,6 +1256,15 @@ Span<SpecialCasing const* const> special_case_mapping(u32 code_point)
|
|||
return mapping->special_casing.span().slice(0, mapping->special_casing_size);
|
||||
}
|
||||
|
||||
Span<CaseFolding const* const> case_folding_mapping(u32 code_point)
|
||||
{
|
||||
auto const* mapping = binary_search(s_case_folding_mappings, code_point, nullptr, CodePointComparator<CaseFoldingMapping> {});
|
||||
if (mapping == nullptr)
|
||||
return {};
|
||||
|
||||
return mapping->case_folding.span().slice(0, mapping->case_folding_size);
|
||||
}
|
||||
|
||||
Optional<StringView> code_point_abbreviation(u32 code_point)
|
||||
{
|
||||
auto const* mapping = binary_search(s_abbreviation_mappings, code_point, nullptr, CodePointComparator<CodePointAbbreviation> {});
|
||||
|
@ -1373,6 +1490,7 @@ ErrorOr<int> serenity_main(Main::Arguments arguments)
|
|||
StringView generated_implementation_path;
|
||||
StringView unicode_data_path;
|
||||
StringView special_casing_path;
|
||||
StringView case_folding_path;
|
||||
StringView derived_general_category_path;
|
||||
StringView prop_list_path;
|
||||
StringView derived_core_prop_path;
|
||||
|
@ -1394,6 +1512,7 @@ ErrorOr<int> serenity_main(Main::Arguments arguments)
|
|||
args_parser.add_option(generated_implementation_path, "Path to the Unicode Data implementation file to generate", "generated-implementation-path", 'c', "generated-implementation-path");
|
||||
args_parser.add_option(unicode_data_path, "Path to UnicodeData.txt file", "unicode-data-path", 'u', "unicode-data-path");
|
||||
args_parser.add_option(special_casing_path, "Path to SpecialCasing.txt file", "special-casing-path", 's', "special-casing-path");
|
||||
args_parser.add_option(case_folding_path, "Path to CaseFolding.txt file", "case-folding-path", 'o', "case-folding-path");
|
||||
args_parser.add_option(derived_general_category_path, "Path to DerivedGeneralCategory.txt file", "derived-general-category-path", 'g', "derived-general-category-path");
|
||||
args_parser.add_option(prop_list_path, "Path to PropList.txt file", "prop-list-path", 'p', "prop-list-path");
|
||||
args_parser.add_option(derived_core_prop_path, "Path to DerivedCoreProperties.txt file", "derived-core-prop-path", 'd', "derived-core-prop-path");
|
||||
|
@ -1416,6 +1535,7 @@ ErrorOr<int> serenity_main(Main::Arguments arguments)
|
|||
auto unicode_data_file = TRY(open_file(unicode_data_path, Core::Stream::OpenMode::Read));
|
||||
auto derived_general_category_file = TRY(open_file(derived_general_category_path, Core::Stream::OpenMode::Read));
|
||||
auto special_casing_file = TRY(open_file(special_casing_path, Core::Stream::OpenMode::Read));
|
||||
auto case_folding_file = TRY(open_file(case_folding_path, Core::Stream::OpenMode::Read));
|
||||
auto prop_list_file = TRY(open_file(prop_list_path, Core::Stream::OpenMode::Read));
|
||||
auto derived_core_prop_file = TRY(open_file(derived_core_prop_path, Core::Stream::OpenMode::Read));
|
||||
auto derived_binary_prop_file = TRY(open_file(derived_binary_prop_path, Core::Stream::OpenMode::Read));
|
||||
|
@ -1433,6 +1553,7 @@ ErrorOr<int> serenity_main(Main::Arguments arguments)
|
|||
|
||||
UnicodeData unicode_data {};
|
||||
TRY(parse_special_casing(*special_casing_file, unicode_data));
|
||||
TRY(parse_case_folding(*case_folding_file, unicode_data));
|
||||
TRY(parse_prop_list(*derived_general_category_file, unicode_data.general_categories));
|
||||
TRY(parse_prop_list(*prop_list_file, unicode_data.prop_list));
|
||||
TRY(parse_prop_list(*derived_core_prop_file, unicode_data.prop_list));
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue