diff --git a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp index 06127b272a..cb51b5591b 100644 --- a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp +++ b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp @@ -88,6 +88,8 @@ struct CodePointData { }; struct UnicodeData { + u32 code_points_with_non_zero_combining_class { 0 }; + u32 simple_uppercase_mapping_size { 0 }; u32 simple_lowercase_mapping_size { 0 }; @@ -438,6 +440,7 @@ static void parse_unicode_data(Core::File& file, UnicodeData& unicode_data) } } + unicode_data.code_points_with_non_zero_combining_class += data.canonical_combining_class != 0; unicode_data.simple_uppercase_mapping_size += data.simple_uppercase_mapping.has_value(); unicode_data.simple_lowercase_mapping_size += data.simple_lowercase_mapping.has_value(); @@ -564,6 +567,8 @@ namespace Detail { Optional unicode_data_for_code_point(u32 code_point); +u32 canonical_combining_class(u32 code_point); + u32 simple_uppercase_mapping(u32 code_point); u32 simple_lowercase_mapping(u32 code_point); Span special_case_mapping(u32 code_point); @@ -750,6 +755,12 @@ static constexpr Array<@mapping_type@, @size@> s_@name@_mappings { { )~~~"); }; + append_code_point_mappings("combining_class"sv, "CodePointMapping"sv, unicode_data.code_points_with_non_zero_combining_class, + [](auto const& data) -> Optional { + if (data.canonical_combining_class == 0) + return {}; + return data.canonical_combining_class; + }); append_code_point_mappings("uppercase"sv, "CodePointMapping"sv, unicode_data.simple_uppercase_mapping_size, [](auto const& data) { return data.simple_uppercase_mapping; }); append_code_point_mappings("lowercase"sv, "CodePointMapping"sv, unicode_data.simple_lowercase_mapping_size, [](auto const& data) { return data.simple_lowercase_mapping; }); append_code_point_mappings("special_case"sv, "SpecialCaseMapping"sv, unicode_data.code_points_with_special_casing, [](auto const& data) { return data.special_casing_indices; }); @@ -877,20 +888,22 @@ Optional unicode_data_for_code_point(u32 code_point) } )~~~"); - auto append_code_point_mapping_search = [&](StringView method, StringView mappings) { + auto append_code_point_mapping_search = [&](StringView method, StringView mappings, StringView fallback) { generator.set("method", method); generator.set("mappings", mappings); + generator.set("fallback", fallback); generator.append(R"~~~( u32 @method@(u32 code_point) { auto const* mapping = binary_search(@mappings@, code_point, nullptr, CodePointComparator {}); - return mapping ? mapping->mapping : code_point; + return mapping ? mapping->mapping : @fallback@; } )~~~"); }; - append_code_point_mapping_search("simple_uppercase_mapping"sv, "s_uppercase_mappings"sv); - append_code_point_mapping_search("simple_lowercase_mapping"sv, "s_lowercase_mappings"sv); + append_code_point_mapping_search("canonical_combining_class"sv, "s_combining_class_mappings"sv, "0"sv); + append_code_point_mapping_search("simple_uppercase_mapping"sv, "s_uppercase_mappings"sv, "code_point"sv); + append_code_point_mapping_search("simple_lowercase_mapping"sv, "s_lowercase_mappings"sv, "code_point"sv); generator.append(R"~~~( Span special_case_mapping(u32 code_point) diff --git a/Userland/Libraries/LibUnicode/CharacterTypes.cpp b/Userland/Libraries/LibUnicode/CharacterTypes.cpp index 83350e8a75..90788feb64 100644 --- a/Userland/Libraries/LibUnicode/CharacterTypes.cpp +++ b/Userland/Libraries/LibUnicode/CharacterTypes.cpp @@ -36,13 +36,11 @@ static bool is_after_uppercase_i(Utf8View const& string, size_t index) continue; } - auto unicode_data = Detail::unicode_data_for_code_point(code_point); - if (!unicode_data.has_value()) - return false; + u32 canonical_combining_class = Detail::canonical_combining_class(code_point); - if (unicode_data->canonical_combining_class == 0) + if (canonical_combining_class == 0) found_uppercase_i = false; - else if (unicode_data->canonical_combining_class == 230) + else if (canonical_combining_class == 230) found_uppercase_i = false; } @@ -62,13 +60,11 @@ static bool is_after_soft_dotted_code_point(Utf8View const& string, size_t index continue; } - auto unicode_data = Detail::unicode_data_for_code_point(code_point); - if (!unicode_data.has_value()) - return false; + u32 canonical_combining_class = Detail::canonical_combining_class(code_point); - if (unicode_data->canonical_combining_class == 0) + if (canonical_combining_class == 0) found_soft_dotted_code_point = false; - else if (unicode_data->canonical_combining_class == 230) + else if (canonical_combining_class == 230) found_soft_dotted_code_point = false; } @@ -123,12 +119,11 @@ static bool is_followed_by_combining_class_above(Utf8View const& string, size_t : Utf8View {}; for (auto code_point : following_view) { - auto unicode_data = Detail::unicode_data_for_code_point(code_point); - if (!unicode_data.has_value()) + u32 canonical_combining_class = Detail::canonical_combining_class(code_point); + + if (canonical_combining_class == 0) return false; - if (unicode_data->canonical_combining_class == 0) - return false; - if (unicode_data->canonical_combining_class == 230) + if (canonical_combining_class == 230) return true; } @@ -147,12 +142,11 @@ static bool is_followed_by_combining_dot_above(Utf8View const& string, size_t in if (code_point == 0x307) return true; - auto unicode_data = Detail::unicode_data_for_code_point(code_point); - if (!unicode_data.has_value()) + u32 canonical_combining_class = Detail::canonical_combining_class(code_point); + + if (canonical_combining_class == 0) return false; - if (unicode_data->canonical_combining_class == 0) - return false; - if (unicode_data->canonical_combining_class == 230) + if (canonical_combining_class == 230) return false; }