From 0ee133af90b165eee531e5654abee07ef9509f86 Mon Sep 17 00:00:00 2001 From: Timothy Flynn Date: Wed, 26 Jul 2023 12:41:15 -0400 Subject: [PATCH] LibUnicode: Separate code point case information into its own structure There is no functional change here. This information will compose the multistage casing tables introduced in an upcoming patch. Extract it to its own struct to prepare for that. --- .../LibUnicode/GenerateUnicodeData.cpp | 71 +++++++++++-------- 1 file changed, 41 insertions(+), 30 deletions(-) diff --git a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp index fcd209c3e8..903e28ccee 100644 --- a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp +++ b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp @@ -70,12 +70,30 @@ struct CodePointName { size_t name { 0 }; }; +struct CasingTable { + bool operator==(CasingTable const& other) const + { + return canonical_combining_class == other.canonical_combining_class + && simple_lowercase_mapping == other.simple_lowercase_mapping + && simple_uppercase_mapping == other.simple_uppercase_mapping + && simple_titlecase_mapping == other.simple_titlecase_mapping + && special_casing_indices == other.special_casing_indices + && case_folding_indices == other.case_folding_indices; + } + + u8 canonical_combining_class { 0 }; + Optional simple_uppercase_mapping; + Optional simple_lowercase_mapping; + Optional simple_titlecase_mapping; + Vector special_casing_indices; + Vector case_folding_indices; +}; + // https://www.unicode.org/reports/tr44/#UnicodeData.txt struct CodePointData { u32 code_point { 0 }; DeprecatedString name; Optional abbreviation; - u8 canonical_combining_class { 0 }; DeprecatedString bidi_class; Optional decomposition_mapping; Optional numeric_value_decimal; @@ -84,11 +102,7 @@ struct CodePointData { bool bidi_mirrored { false }; DeprecatedString unicode_1_name; DeprecatedString 
iso_comment; - Optional simple_uppercase_mapping; - Optional simple_lowercase_mapping; - Optional simple_titlecase_mapping; - Vector special_casing_indices; - Vector case_folding_indices; + CasingTable casing; }; struct BlockName { @@ -172,6 +186,7 @@ struct UnicodeData { PropList word_break_props; PropList sentence_break_props; + CodePointTables casing_tables; CodePointTables general_category_tables; CodePointTables property_tables; CodePointTables script_tables; @@ -683,7 +698,7 @@ static ErrorOr parse_unicode_data(Core::InputBufferedFile& file, UnicodeDa CodePointData data {}; data.code_point = AK::StringUtils::convert_to_uint_from_hex(segments[0]).value(); data.name = segments[1]; - data.canonical_combining_class = AK::StringUtils::convert_to_uint(segments[3]).value(); + data.casing.canonical_combining_class = AK::StringUtils::convert_to_uint(segments[3]).value(); data.bidi_class = segments[4]; data.decomposition_mapping = parse_decomposition_mapping(segments[5], unicode_data); data.numeric_value_decimal = AK::StringUtils::convert_to_int(segments[6]); @@ -692,9 +707,9 @@ static ErrorOr parse_unicode_data(Core::InputBufferedFile& file, UnicodeDa data.bidi_mirrored = segments[9] == "Y"sv; data.unicode_1_name = segments[10]; data.iso_comment = segments[11]; - data.simple_uppercase_mapping = AK::StringUtils::convert_to_uint_from_hex(segments[12]); - data.simple_lowercase_mapping = AK::StringUtils::convert_to_uint_from_hex(segments[13]); - data.simple_titlecase_mapping = AK::StringUtils::convert_to_uint_from_hex(segments[14]); + data.casing.simple_uppercase_mapping = AK::StringUtils::convert_to_uint_from_hex(segments[12]); + data.casing.simple_lowercase_mapping = AK::StringUtils::convert_to_uint_from_hex(segments[13]); + data.casing.simple_titlecase_mapping = AK::StringUtils::convert_to_uint_from_hex(segments[14]); if (auto abbreviation = unicode_data.code_point_abbreviations.get(data.code_point); abbreviation.has_value()) data.abbreviation = *abbreviation; @@ 
-734,7 +749,7 @@ static ErrorOr parse_unicode_data(Core::InputBufferedFile& file, UnicodeDa bool has_special_casing { false }; for (auto const& casing : unicode_data.special_casing) { if (casing.code_point == data.code_point) { - data.special_casing_indices.append(casing.index); + data.casing.special_casing_indices.append(casing.index); has_special_casing = true; } } @@ -742,22 +757,22 @@ static ErrorOr parse_unicode_data(Core::InputBufferedFile& file, UnicodeDa bool has_case_folding { false }; for (size_t i = 0; i < unicode_data.case_folding.size(); ++i) { if (auto const& folding = unicode_data.case_folding[i]; folding.code_point == data.code_point) { - data.case_folding_indices.append(i); + data.casing.case_folding_indices.append(i); has_case_folding = true; } } - unicode_data.code_points_with_non_zero_combining_class += data.canonical_combining_class != 0; - unicode_data.simple_uppercase_mapping_size += data.simple_uppercase_mapping.has_value(); - unicode_data.simple_lowercase_mapping_size += data.simple_lowercase_mapping.has_value(); - unicode_data.simple_titlecase_mapping_size += data.simple_titlecase_mapping.has_value(); + unicode_data.code_points_with_non_zero_combining_class += data.casing.canonical_combining_class != 0; + unicode_data.simple_uppercase_mapping_size += data.casing.simple_uppercase_mapping.has_value(); + unicode_data.simple_lowercase_mapping_size += data.casing.simple_lowercase_mapping.has_value(); + unicode_data.simple_titlecase_mapping_size += data.casing.simple_titlecase_mapping.has_value(); unicode_data.code_points_with_decomposition_mapping += data.decomposition_mapping.has_value(); unicode_data.code_points_with_special_casing += has_special_casing; - unicode_data.largest_special_casing_size = max(unicode_data.largest_special_casing_size, data.special_casing_indices.size()); + unicode_data.largest_special_casing_size = max(unicode_data.largest_special_casing_size, data.casing.special_casing_indices.size()); 
unicode_data.code_points_with_case_folding += has_case_folding; - unicode_data.largest_case_folding_size = max(unicode_data.largest_case_folding_size, data.case_folding_indices.size()); + unicode_data.largest_case_folding_size = max(unicode_data.largest_case_folding_size, data.casing.case_folding_indices.size()); previous_code_point = data.code_point; unicode_data.code_point_data.append(move(data)); @@ -1081,21 +1096,17 @@ static constexpr Array<@mapping_type@, @size@> s_@name@_mappings { { append_code_point_mappings("combining_class"sv, "CodePointMapping"sv, unicode_data.code_points_with_non_zero_combining_class, [](auto const& data) -> Optional { - if (data.canonical_combining_class == 0) + if (data.casing.canonical_combining_class == 0) return {}; - return data.canonical_combining_class; + return data.casing.canonical_combining_class; }); - append_code_point_mappings("uppercase"sv, "CodePointMapping"sv, unicode_data.simple_uppercase_mapping_size, [](auto const& data) { return data.simple_uppercase_mapping; }); - append_code_point_mappings("lowercase"sv, "CodePointMapping"sv, unicode_data.simple_lowercase_mapping_size, [](auto const& data) { return data.simple_lowercase_mapping; }); - append_code_point_mappings("titlecase"sv, "CodePointMapping"sv, unicode_data.simple_titlecase_mapping_size, [](auto const& data) { return data.simple_titlecase_mapping; }); - append_code_point_mappings("special_case"sv, "SpecialCaseMapping"sv, unicode_data.code_points_with_special_casing, [](auto const& data) { return data.special_casing_indices; }); - append_code_point_mappings("case_folding"sv, "CaseFoldingMapping"sv, unicode_data.code_points_with_case_folding, [](auto const& data) { return data.case_folding_indices; }); + append_code_point_mappings("uppercase"sv, "CodePointMapping"sv, unicode_data.simple_uppercase_mapping_size, [](auto const& data) { return data.casing.simple_uppercase_mapping; }); + append_code_point_mappings("lowercase"sv, "CodePointMapping"sv, 
unicode_data.simple_lowercase_mapping_size, [](auto const& data) { return data.casing.simple_lowercase_mapping; }); + append_code_point_mappings("titlecase"sv, "CodePointMapping"sv, unicode_data.simple_titlecase_mapping_size, [](auto const& data) { return data.casing.simple_titlecase_mapping; }); + append_code_point_mappings("special_case"sv, "SpecialCaseMapping"sv, unicode_data.code_points_with_special_casing, [](auto const& data) { return data.casing.special_casing_indices; }); + append_code_point_mappings("case_folding"sv, "CaseFoldingMapping"sv, unicode_data.code_points_with_case_folding, [](auto const& data) { return data.casing.case_folding_indices; }); append_code_point_mappings("abbreviation"sv, "CodePointAbbreviation"sv, unicode_data.code_point_abbreviations.size(), [](auto const& data) { return data.abbreviation; }); - - append_code_point_mappings("decomposition"sv, "CodePointDecompositionRaw"sv, unicode_data.code_points_with_decomposition_mapping, - [](auto const& data) { - return data.decomposition_mapping; - }); + append_code_point_mappings("decomposition"sv, "CodePointDecompositionRaw"sv, unicode_data.code_points_with_decomposition_mapping, [](auto const& data) { return data.decomposition_mapping; }); auto append_property_table = [&](auto collection_snake, auto const& unique_properties) -> ErrorOr { TRY(generator.set("name", TRY(String::formatted("{}_unique_properties", collection_snake))));