1
Fork 0
mirror of https://github.com/RGBCube/serenity synced 2025-07-25 21:17:44 +00:00

LibUnicode: Separate code point case information into its own structure

There is no functional change here. This information will make up the
multistage casing tables introduced in an upcoming patch. Extract it into
its own struct to prepare for that.
This commit is contained in:
Timothy Flynn 2023-07-26 12:41:15 -04:00 committed by Andreas Kling
parent a332a8ad19
commit 0ee133af90

View file

@ -70,12 +70,30 @@ struct CodePointName {
size_t name { 0 }; size_t name { 0 };
}; };
// Groups the casing-related data recorded for a single code point. It is embedded in
// CodePointData as the `casing` member and filled in by parse_unicode_data().
struct CasingTable {
    // Read from segments[3] of a parsed line in parse_unicode_data(); 0 is treated as
    // "no combining class" when the combining_class mappings are generated.
    u8 canonical_combining_class { 0 };

    // Simple one-to-one case mappings, parsed as hexadecimal code points from
    // segments[12], segments[13] and segments[14] respectively. Empty when parsing fails.
    Optional<u32> simple_uppercase_mapping;
    Optional<u32> simple_lowercase_mapping;
    Optional<u32> simple_titlecase_mapping;

    // The `index` values of the special casing entries whose code point matches this one.
    Vector<u32> special_casing_indices;

    // Positions within the parsed case folding list whose code point matches this one.
    Vector<u32> case_folding_indices;

    // Two tables are equal only if every member compares equal.
    bool operator==(CasingTable const& other) const
    {
        if (!(canonical_combining_class == other.canonical_combining_class))
            return false;
        if (!(simple_lowercase_mapping == other.simple_lowercase_mapping))
            return false;
        if (!(simple_uppercase_mapping == other.simple_uppercase_mapping))
            return false;
        if (!(simple_titlecase_mapping == other.simple_titlecase_mapping))
            return false;
        if (!(special_casing_indices == other.special_casing_indices))
            return false;
        return case_folding_indices == other.case_folding_indices;
    }
};
// https://www.unicode.org/reports/tr44/#UnicodeData.txt // https://www.unicode.org/reports/tr44/#UnicodeData.txt
struct CodePointData { struct CodePointData {
u32 code_point { 0 }; u32 code_point { 0 };
DeprecatedString name; DeprecatedString name;
Optional<size_t> abbreviation; Optional<size_t> abbreviation;
u8 canonical_combining_class { 0 };
DeprecatedString bidi_class; DeprecatedString bidi_class;
Optional<CodePointDecomposition> decomposition_mapping; Optional<CodePointDecomposition> decomposition_mapping;
Optional<i8> numeric_value_decimal; Optional<i8> numeric_value_decimal;
@ -84,11 +102,7 @@ struct CodePointData {
bool bidi_mirrored { false }; bool bidi_mirrored { false };
DeprecatedString unicode_1_name; DeprecatedString unicode_1_name;
DeprecatedString iso_comment; DeprecatedString iso_comment;
Optional<u32> simple_uppercase_mapping; CasingTable casing;
Optional<u32> simple_lowercase_mapping;
Optional<u32> simple_titlecase_mapping;
Vector<u32> special_casing_indices;
Vector<u32> case_folding_indices;
}; };
struct BlockName { struct BlockName {
@ -172,6 +186,7 @@ struct UnicodeData {
PropList word_break_props; PropList word_break_props;
PropList sentence_break_props; PropList sentence_break_props;
CodePointTables<CasingTable> casing_tables;
CodePointTables<PropertyTable> general_category_tables; CodePointTables<PropertyTable> general_category_tables;
CodePointTables<PropertyTable> property_tables; CodePointTables<PropertyTable> property_tables;
CodePointTables<PropertyTable> script_tables; CodePointTables<PropertyTable> script_tables;
@ -683,7 +698,7 @@ static ErrorOr<void> parse_unicode_data(Core::InputBufferedFile& file, UnicodeDa
CodePointData data {}; CodePointData data {};
data.code_point = AK::StringUtils::convert_to_uint_from_hex<u32>(segments[0]).value(); data.code_point = AK::StringUtils::convert_to_uint_from_hex<u32>(segments[0]).value();
data.name = segments[1]; data.name = segments[1];
data.canonical_combining_class = AK::StringUtils::convert_to_uint<u8>(segments[3]).value(); data.casing.canonical_combining_class = AK::StringUtils::convert_to_uint<u8>(segments[3]).value();
data.bidi_class = segments[4]; data.bidi_class = segments[4];
data.decomposition_mapping = parse_decomposition_mapping(segments[5], unicode_data); data.decomposition_mapping = parse_decomposition_mapping(segments[5], unicode_data);
data.numeric_value_decimal = AK::StringUtils::convert_to_int<i8>(segments[6]); data.numeric_value_decimal = AK::StringUtils::convert_to_int<i8>(segments[6]);
@ -692,9 +707,9 @@ static ErrorOr<void> parse_unicode_data(Core::InputBufferedFile& file, UnicodeDa
data.bidi_mirrored = segments[9] == "Y"sv; data.bidi_mirrored = segments[9] == "Y"sv;
data.unicode_1_name = segments[10]; data.unicode_1_name = segments[10];
data.iso_comment = segments[11]; data.iso_comment = segments[11];
data.simple_uppercase_mapping = AK::StringUtils::convert_to_uint_from_hex<u32>(segments[12]); data.casing.simple_uppercase_mapping = AK::StringUtils::convert_to_uint_from_hex<u32>(segments[12]);
data.simple_lowercase_mapping = AK::StringUtils::convert_to_uint_from_hex<u32>(segments[13]); data.casing.simple_lowercase_mapping = AK::StringUtils::convert_to_uint_from_hex<u32>(segments[13]);
data.simple_titlecase_mapping = AK::StringUtils::convert_to_uint_from_hex<u32>(segments[14]); data.casing.simple_titlecase_mapping = AK::StringUtils::convert_to_uint_from_hex<u32>(segments[14]);
if (auto abbreviation = unicode_data.code_point_abbreviations.get(data.code_point); abbreviation.has_value()) if (auto abbreviation = unicode_data.code_point_abbreviations.get(data.code_point); abbreviation.has_value())
data.abbreviation = *abbreviation; data.abbreviation = *abbreviation;
@ -734,7 +749,7 @@ static ErrorOr<void> parse_unicode_data(Core::InputBufferedFile& file, UnicodeDa
bool has_special_casing { false }; bool has_special_casing { false };
for (auto const& casing : unicode_data.special_casing) { for (auto const& casing : unicode_data.special_casing) {
if (casing.code_point == data.code_point) { if (casing.code_point == data.code_point) {
data.special_casing_indices.append(casing.index); data.casing.special_casing_indices.append(casing.index);
has_special_casing = true; has_special_casing = true;
} }
} }
@ -742,22 +757,22 @@ static ErrorOr<void> parse_unicode_data(Core::InputBufferedFile& file, UnicodeDa
bool has_case_folding { false }; bool has_case_folding { false };
for (size_t i = 0; i < unicode_data.case_folding.size(); ++i) { for (size_t i = 0; i < unicode_data.case_folding.size(); ++i) {
if (auto const& folding = unicode_data.case_folding[i]; folding.code_point == data.code_point) { if (auto const& folding = unicode_data.case_folding[i]; folding.code_point == data.code_point) {
data.case_folding_indices.append(i); data.casing.case_folding_indices.append(i);
has_case_folding = true; has_case_folding = true;
} }
} }
unicode_data.code_points_with_non_zero_combining_class += data.canonical_combining_class != 0; unicode_data.code_points_with_non_zero_combining_class += data.casing.canonical_combining_class != 0;
unicode_data.simple_uppercase_mapping_size += data.simple_uppercase_mapping.has_value(); unicode_data.simple_uppercase_mapping_size += data.casing.simple_uppercase_mapping.has_value();
unicode_data.simple_lowercase_mapping_size += data.simple_lowercase_mapping.has_value(); unicode_data.simple_lowercase_mapping_size += data.casing.simple_lowercase_mapping.has_value();
unicode_data.simple_titlecase_mapping_size += data.simple_titlecase_mapping.has_value(); unicode_data.simple_titlecase_mapping_size += data.casing.simple_titlecase_mapping.has_value();
unicode_data.code_points_with_decomposition_mapping += data.decomposition_mapping.has_value(); unicode_data.code_points_with_decomposition_mapping += data.decomposition_mapping.has_value();
unicode_data.code_points_with_special_casing += has_special_casing; unicode_data.code_points_with_special_casing += has_special_casing;
unicode_data.largest_special_casing_size = max(unicode_data.largest_special_casing_size, data.special_casing_indices.size()); unicode_data.largest_special_casing_size = max(unicode_data.largest_special_casing_size, data.casing.special_casing_indices.size());
unicode_data.code_points_with_case_folding += has_case_folding; unicode_data.code_points_with_case_folding += has_case_folding;
unicode_data.largest_case_folding_size = max(unicode_data.largest_case_folding_size, data.case_folding_indices.size()); unicode_data.largest_case_folding_size = max(unicode_data.largest_case_folding_size, data.casing.case_folding_indices.size());
previous_code_point = data.code_point; previous_code_point = data.code_point;
unicode_data.code_point_data.append(move(data)); unicode_data.code_point_data.append(move(data));
@ -1081,21 +1096,17 @@ static constexpr Array<@mapping_type@, @size@> s_@name@_mappings { {
append_code_point_mappings("combining_class"sv, "CodePointMapping"sv, unicode_data.code_points_with_non_zero_combining_class, append_code_point_mappings("combining_class"sv, "CodePointMapping"sv, unicode_data.code_points_with_non_zero_combining_class,
[](auto const& data) -> Optional<u32> { [](auto const& data) -> Optional<u32> {
if (data.canonical_combining_class == 0) if (data.casing.canonical_combining_class == 0)
return {}; return {};
return data.canonical_combining_class; return data.casing.canonical_combining_class;
}); });
append_code_point_mappings("uppercase"sv, "CodePointMapping"sv, unicode_data.simple_uppercase_mapping_size, [](auto const& data) { return data.simple_uppercase_mapping; }); append_code_point_mappings("uppercase"sv, "CodePointMapping"sv, unicode_data.simple_uppercase_mapping_size, [](auto const& data) { return data.casing.simple_uppercase_mapping; });
append_code_point_mappings("lowercase"sv, "CodePointMapping"sv, unicode_data.simple_lowercase_mapping_size, [](auto const& data) { return data.simple_lowercase_mapping; }); append_code_point_mappings("lowercase"sv, "CodePointMapping"sv, unicode_data.simple_lowercase_mapping_size, [](auto const& data) { return data.casing.simple_lowercase_mapping; });
append_code_point_mappings("titlecase"sv, "CodePointMapping"sv, unicode_data.simple_titlecase_mapping_size, [](auto const& data) { return data.simple_titlecase_mapping; }); append_code_point_mappings("titlecase"sv, "CodePointMapping"sv, unicode_data.simple_titlecase_mapping_size, [](auto const& data) { return data.casing.simple_titlecase_mapping; });
append_code_point_mappings("special_case"sv, "SpecialCaseMapping"sv, unicode_data.code_points_with_special_casing, [](auto const& data) { return data.special_casing_indices; }); append_code_point_mappings("special_case"sv, "SpecialCaseMapping"sv, unicode_data.code_points_with_special_casing, [](auto const& data) { return data.casing.special_casing_indices; });
append_code_point_mappings("case_folding"sv, "CaseFoldingMapping"sv, unicode_data.code_points_with_case_folding, [](auto const& data) { return data.case_folding_indices; }); append_code_point_mappings("case_folding"sv, "CaseFoldingMapping"sv, unicode_data.code_points_with_case_folding, [](auto const& data) { return data.casing.case_folding_indices; });
append_code_point_mappings("abbreviation"sv, "CodePointAbbreviation"sv, unicode_data.code_point_abbreviations.size(), [](auto const& data) { return data.abbreviation; }); append_code_point_mappings("abbreviation"sv, "CodePointAbbreviation"sv, unicode_data.code_point_abbreviations.size(), [](auto const& data) { return data.abbreviation; });
append_code_point_mappings("decomposition"sv, "CodePointDecompositionRaw"sv, unicode_data.code_points_with_decomposition_mapping, [](auto const& data) { return data.decomposition_mapping; });
append_code_point_mappings("decomposition"sv, "CodePointDecompositionRaw"sv, unicode_data.code_points_with_decomposition_mapping,
[](auto const& data) {
return data.decomposition_mapping;
});
auto append_property_table = [&](auto collection_snake, auto const& unique_properties) -> ErrorOr<void> { auto append_property_table = [&](auto collection_snake, auto const& unique_properties) -> ErrorOr<void> {
TRY(generator.set("name", TRY(String::formatted("{}_unique_properties", collection_snake)))); TRY(generator.set("name", TRY(String::formatted("{}_unique_properties", collection_snake))));