From da4b8897a7d507bacfcb05766215e9d65551165f Mon Sep 17 00:00:00 2001 From: Timothy Flynn Date: Sat, 9 Oct 2021 18:12:57 -0400 Subject: [PATCH] LibUnicode: Generate standalone compile-time arrays for simple casing Currently, all casing information (simple and special) is stored in a compile-time array of size 34,626, then statically copied to a hash map at runtime. In an effort to reduce the resulting memory usage, store the simple casing rules in standalone compile-time arrays. The uppercase map is size 1,450 and the lowercase map is size 1,433. Any code point not in a map will implicitly have an identity mapping. --- .../LibUnicode/GenerateUnicodeData.cpp | 74 +++++++++++++++++++ .../Libraries/LibUnicode/CharacterTypes.cpp | 14 +--- 2 files changed, 78 insertions(+), 10 deletions(-) diff --git a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp index d02523b47c..b24a8ecb32 100644 --- a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp +++ b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp @@ -88,6 +88,9 @@ struct CodePointData { }; struct UnicodeData { + u32 simple_uppercase_mapping_size { 0 }; + u32 simple_lowercase_mapping_size { 0 }; + Vector special_casing; u32 largest_casing_transform_size { 0 }; u32 largest_special_casing_size { 0 }; @@ -430,6 +433,9 @@ static void parse_unicode_data(Core::File& file, UnicodeData& unicode_data) data.special_casing_indices.append(casing.index); } + unicode_data.simple_uppercase_mapping_size += data.simple_uppercase_mapping.has_value(); + unicode_data.simple_lowercase_mapping_size += data.simple_lowercase_mapping.has_value(); + unicode_data.largest_special_casing_size = max(unicode_data.largest_special_casing_size, data.special_casing_indices.size()); previous_code_point = data.code_point; @@ -551,6 +557,9 @@ namespace Detail { Optional unicode_data_for_code_point(u32 code_point); +u32 
simple_uppercase_mapping(u32 code_point); +u32 simple_lowercase_mapping(u32 code_point); + bool code_point_has_general_category(u32 code_point, GeneralCategory general_category); Optional general_category_from_string(StringView const& general_category); @@ -665,6 +674,56 @@ static constexpr Array s_unicode_data { {)~ generator.append(R"~~~( } }; +struct CodePointMapping { + u32 code_point { 0 }; + u32 mapping { 0 }; +}; + +struct CodePointComparator { + constexpr int operator()(u32 code_point, CodePointMapping const& mapping) + { + return code_point - mapping.code_point; + } +}; +)~~~"); + + auto append_code_point_mappings = [&](StringView name, u32 size, auto mapping_getter) { + generator.set("name", name); + generator.set("size", String::number(size)); + + generator.append(R"~~~( +static constexpr Array s_@name@_mappings { { + )~~~"); + + constexpr size_t max_mappings_per_row = 20; + size_t mappings_in_current_row = 0; + + for (auto const& data : unicode_data.code_point_data) { + auto mapping = mapping_getter(data); + if (!mapping.has_value()) + continue; + + if (mappings_in_current_row++ > 0) + generator.append(" "); + + generator.set("code_point", String::formatted("{:#x}", data.code_point)); + generator.set("mapping", String::formatted("{:#x}", *mapping)); + generator.append("{ @code_point@, @mapping@ },"); + + if (mappings_in_current_row == max_mappings_per_row) { + mappings_in_current_row = 0; + generator.append("\n "); + } + } + generator.append(R"~~~( +} }; +)~~~"); + }; + + append_code_point_mappings("uppercase"sv, unicode_data.simple_uppercase_mapping_size, [](auto const& data) { return data.simple_uppercase_mapping; }); + append_code_point_mappings("lowercase"sv, unicode_data.simple_lowercase_mapping_size, [](auto const& data) { return data.simple_lowercase_mapping; }); + + generator.append(R"~~~( struct CodePointRange { u32 first { 0 }; u32 last { 0 }; @@ -787,6 +846,21 @@ Optional unicode_data_for_code_point(u32 code_point) } )~~~"); + auto 
append_code_point_mapping_search = [&](StringView method, StringView mappings) { + generator.set("method", method); + generator.set("mappings", mappings); + generator.append(R"~~~( +u32 @method@(u32 code_point) +{ + auto const* mapping = binary_search(@mappings@, code_point, nullptr, CodePointComparator {}); + return mapping ? mapping->mapping : code_point; +} +)~~~"); + }; + + append_code_point_mapping_search("simple_uppercase_mapping"sv, "s_uppercase_mappings"sv); + append_code_point_mapping_search("simple_lowercase_mapping"sv, "s_lowercase_mappings"sv); + auto append_prop_search = [&](StringView enum_title, StringView enum_snake, StringView collection_name) { generator.set("enum_title", enum_title); generator.set("enum_snake", enum_snake); diff --git a/Userland/Libraries/LibUnicode/CharacterTypes.cpp b/Userland/Libraries/LibUnicode/CharacterTypes.cpp index 0c2c7b2e90..0398c92bf3 100644 --- a/Userland/Libraries/LibUnicode/CharacterTypes.cpp +++ b/Userland/Libraries/LibUnicode/CharacterTypes.cpp @@ -213,10 +213,7 @@ static SpecialCasing const* find_matching_special_case(Utf8View const& string, O u32 to_unicode_lowercase(u32 code_point) { #if ENABLE_UNICODE_DATA - auto unicode_data = Detail::unicode_data_for_code_point(code_point); - if (unicode_data.has_value()) - return unicode_data->simple_lowercase_mapping; - return code_point; + return Detail::simple_lowercase_mapping(code_point); #else return AK::to_ascii_lowercase(code_point); #endif @@ -225,10 +222,7 @@ u32 to_unicode_lowercase(u32 code_point) u32 to_unicode_uppercase(u32 code_point) { #if ENABLE_UNICODE_DATA - auto unicode_data = Detail::unicode_data_for_code_point(code_point); - if (unicode_data.has_value()) - return unicode_data->simple_uppercase_mapping; - return code_point; + return Detail::simple_uppercase_mapping(code_point); #else return AK::to_ascii_uppercase(code_point); #endif @@ -255,7 +249,7 @@ String to_unicode_lowercase_full(StringView const& string, [[maybe_unused]] Opti auto const* 
special_casing = find_matching_special_case(view, locale, index, byte_length, *unicode_data); if (!special_casing) { - builder.append_code_point(unicode_data->simple_lowercase_mapping); + builder.append_code_point(to_unicode_lowercase(code_point)); continue; } @@ -290,7 +284,7 @@ String to_unicode_uppercase_full(StringView const& string, [[maybe_unused]] Opti auto const* special_casing = find_matching_special_case(view, locale, index, byte_length, *unicode_data); if (!special_casing) { - builder.append_code_point(unicode_data->simple_uppercase_mapping); + builder.append_code_point(to_unicode_uppercase(code_point)); continue; }