mirror of
				https://github.com/RGBCube/serenity
				synced 2025-10-31 05:32:44 +00:00 
			
		
		
		
	LibUnicode: Perform code point property lookups in constant time
We currently produce a single table for all categories of code point
properties (GeneralCategory, Script, etc.). Each row contains a field
indicating the range of code points to which that property applies. At
runtime, we then do a binary search through that table to decide if a
code point has a property.
This changes our approach to generate a 2-stage lookup table for each of
those categories. There is an in-depth explanation of these tables above
the new `create_code_point_tables` method. The end effect is that code
point property lookup is reduced from a binary search to constant-time
array lookups.
In total, this change:
    * Increases the size of libunicode.so from 2.7 MB to 2.9 MB.
    * Reduces the runtime of the new benchmark test case added here from
      3.576s to 1.020s (a 3.5x speedup).
    * In a profile of resizing a TextEditor window with a 3MB file open,
      the runtime of checking if a code point has a word break property
      reduces from ~81% to ~56%.
			
			
This commit is contained in:
		
							parent
							
								
									8f1d73abde
								
							
						
					
					
						commit
						0652cc48c0
					
				
					 2 changed files with 247 additions and 61 deletions
				
			
		|  | @ -7,6 +7,7 @@ | ||||||
| #include "GeneratorUtil.h" | #include "GeneratorUtil.h" | ||||||
| #include <AK/AllOf.h> | #include <AK/AllOf.h> | ||||||
| #include <AK/Array.h> | #include <AK/Array.h> | ||||||
|  | #include <AK/BinarySearch.h> | ||||||
| #include <AK/CharacterTypes.h> | #include <AK/CharacterTypes.h> | ||||||
| #include <AK/DeprecatedString.h> | #include <AK/DeprecatedString.h> | ||||||
| #include <AK/Error.h> | #include <AK/Error.h> | ||||||
|  | @ -96,6 +97,18 @@ struct BlockName { | ||||||
|     size_t name { 0 }; |     size_t name { 0 }; | ||||||
| }; | }; | ||||||
| 
 | 
 | ||||||
|  | struct CodePointTables { | ||||||
|  |     static constexpr auto MSB_COUNT = 16u; | ||||||
|  |     static_assert(MSB_COUNT < 24u); | ||||||
|  | 
 | ||||||
|  |     static constexpr auto LSB_COUNT = 24u - MSB_COUNT; | ||||||
|  |     static constexpr auto LSB_MASK = NumericLimits<u32>::max() >> (NumericLimits<u32>::digits() - LSB_COUNT); | ||||||
|  | 
 | ||||||
|  |     Vector<size_t> stage1; | ||||||
|  |     Vector<size_t> stage2; | ||||||
|  |     Vector<Vector<bool>> unique_properties; | ||||||
|  | }; | ||||||
|  | 
 | ||||||
| struct UnicodeData { | struct UnicodeData { | ||||||
|     UniqueStringStorage unique_strings; |     UniqueStringStorage unique_strings; | ||||||
| 
 | 
 | ||||||
|  | @ -156,6 +169,14 @@ struct UnicodeData { | ||||||
|     PropList grapheme_break_props; |     PropList grapheme_break_props; | ||||||
|     PropList word_break_props; |     PropList word_break_props; | ||||||
|     PropList sentence_break_props; |     PropList sentence_break_props; | ||||||
|  | 
 | ||||||
|  |     CodePointTables general_category_tables; | ||||||
|  |     CodePointTables property_tables; | ||||||
|  |     CodePointTables script_tables; | ||||||
|  |     CodePointTables script_extension_tables; | ||||||
|  |     CodePointTables grapheme_break_tables; | ||||||
|  |     CodePointTables word_break_tables; | ||||||
|  |     CodePointTables sentence_break_tables; | ||||||
| }; | }; | ||||||
| 
 | 
 | ||||||
| static DeprecatedString sanitize_entry(DeprecatedString const& entry) | static DeprecatedString sanitize_entry(DeprecatedString const& entry) | ||||||
|  | @ -1071,67 +1092,79 @@ static constexpr Array<@mapping_type@, @size@> s_@name@_mappings { { | ||||||
|             return data.decomposition_mapping; |             return data.decomposition_mapping; | ||||||
|         }); |         }); | ||||||
| 
 | 
 | ||||||
|     auto append_code_point_range_list = [&](DeprecatedString name, Vector<Unicode::CodePointRange> const& ranges) { |     auto append_code_point_tables = [&](StringView collection_snake, CodePointTables const& tables) -> ErrorOr<void> { | ||||||
|         generator.set("name", name); |         auto append_stage = [&](auto const& stage, auto name, auto type) -> ErrorOr<void> { | ||||||
|         generator.set("size", DeprecatedString::number(ranges.size())); |             TRY(generator.set("name", TRY(String::formatted("{}_{}", collection_snake, name)))); | ||||||
|         generator.append(R"~~~( |             TRY(generator.set("size", TRY(String::number(stage.size())))); | ||||||
| static constexpr Array<CodePointRange, @size@> @name@ { { |             generator.set("type", type); | ||||||
|  | 
 | ||||||
|  |             generator.append(R"~~~( | ||||||
|  | static constexpr Array<@type@, @size@> @name@ { { | ||||||
|     )~~~"); |     )~~~"); | ||||||
| 
 | 
 | ||||||
|         constexpr size_t max_ranges_per_row = 20; |             static constexpr size_t max_values_per_row = 300; | ||||||
|         size_t ranges_in_current_row = 0; |             size_t values_in_current_row = 0; | ||||||
| 
 | 
 | ||||||
|         for (auto const& range : ranges) { |             for (auto value : stage) { | ||||||
|             if (ranges_in_current_row++ > 0) |                 if (values_in_current_row++ > 0) | ||||||
|                 generator.append(" "); |                     generator.append(", "); | ||||||
| 
 | 
 | ||||||
|             generator.set("first", DeprecatedString::formatted("{:#x}", range.first)); |                 TRY(generator.set("value", TRY(String::number(value)))); | ||||||
|             generator.set("last", DeprecatedString::formatted("{:#x}", range.last)); |                 generator.append("@value@"); | ||||||
|             generator.append("{ @first@, @last@ },"); |  | ||||||
| 
 | 
 | ||||||
|             if (ranges_in_current_row == max_ranges_per_row) { |                 if (values_in_current_row == max_values_per_row) { | ||||||
|                 ranges_in_current_row = 0; |                     values_in_current_row = 0; | ||||||
|                 generator.append("\n    "); |                     generator.append(",\n    "); | ||||||
|  |                 } | ||||||
|             } |             } | ||||||
|         } |  | ||||||
| 
 | 
 | ||||||
|         generator.append(R"~~~( |  | ||||||
| } }; |  | ||||||
| )~~~"); |  | ||||||
|     }; |  | ||||||
| 
 |  | ||||||
|     auto append_prop_list = [&](StringView collection_name, StringView property_format, PropList const& property_list) { |  | ||||||
|         for (auto const& property : property_list) { |  | ||||||
|             auto name = DeprecatedString::formatted(property_format, property.key); |  | ||||||
|             append_code_point_range_list(move(name), property.value); |  | ||||||
|         } |  | ||||||
| 
 |  | ||||||
|         auto property_names = property_list.keys(); |  | ||||||
|         quick_sort(property_names); |  | ||||||
| 
 |  | ||||||
|         generator.set("name", collection_name); |  | ||||||
|         generator.set("size", DeprecatedString::number(property_names.size())); |  | ||||||
|         generator.append(R"~~~( |  | ||||||
| static constexpr Array<ReadonlySpan<CodePointRange>, @size@> @name@ { {)~~~"); |  | ||||||
| 
 |  | ||||||
|         for (auto const& property_name : property_names) { |  | ||||||
|             generator.set("name", DeprecatedString::formatted(property_format, property_name)); |  | ||||||
|             generator.append(R"~~~( |             generator.append(R"~~~( | ||||||
|     @name@.span(),)~~~"); |  | ||||||
|         } |  | ||||||
| 
 |  | ||||||
|         generator.append(R"~~~( |  | ||||||
| } }; | } }; | ||||||
| )~~~"); | )~~~"); | ||||||
|  | 
 | ||||||
|  |             return {}; | ||||||
|  |         }; | ||||||
|  | 
 | ||||||
|  |         auto append_unique_properties = [&](auto const& unique_properties) -> ErrorOr<void> { | ||||||
|  |             TRY(generator.set("name", TRY(String::formatted("{}_unique_properties", collection_snake)))); | ||||||
|  |             TRY(generator.set("outer_size", TRY(String::number(unique_properties.size())))); | ||||||
|  |             TRY(generator.set("inner_size", TRY(String::number(unique_properties[0].size())))); | ||||||
|  | 
 | ||||||
|  |             generator.append(R"~~~( | ||||||
|  | static constexpr Array<Array<bool, @inner_size@>, @outer_size@> @name@ { {)~~~"); | ||||||
|  | 
 | ||||||
|  |             for (auto const& property_set : unique_properties) { | ||||||
|  |                 generator.append(R"~~~( | ||||||
|  |     { )~~~"); | ||||||
|  | 
 | ||||||
|  |                 for (auto value : property_set) { | ||||||
|  |                     TRY(generator.set("value", TRY(String::formatted("{}", value)))); | ||||||
|  |                     generator.append("@value@, "); | ||||||
|  |                 } | ||||||
|  | 
 | ||||||
|  |                 generator.append(" },"); | ||||||
|  |             } | ||||||
|  | 
 | ||||||
|  |             generator.append(R"~~~( | ||||||
|  | } }; | ||||||
|  | )~~~"); | ||||||
|  | 
 | ||||||
|  |             return {}; | ||||||
|  |         }; | ||||||
|  | 
 | ||||||
|  |         TRY(append_stage(tables.stage1, "stage1"sv, "u16"sv)); | ||||||
|  |         TRY(append_stage(tables.stage2, "stage2"sv, "u16"sv)); | ||||||
|  |         TRY(append_unique_properties(tables.unique_properties)); | ||||||
|  |         return {}; | ||||||
|     }; |     }; | ||||||
| 
 | 
 | ||||||
|     append_prop_list("s_general_categories"sv, "s_general_category_{}"sv, unicode_data.general_categories); |     TRY(append_code_point_tables("s_general_categories"sv, unicode_data.general_category_tables)); | ||||||
|     append_prop_list("s_properties"sv, "s_property_{}"sv, unicode_data.prop_list); |     TRY(append_code_point_tables("s_properties"sv, unicode_data.property_tables)); | ||||||
|     append_prop_list("s_scripts"sv, "s_script_{}"sv, unicode_data.script_list); |     TRY(append_code_point_tables("s_scripts"sv, unicode_data.script_tables)); | ||||||
|     append_prop_list("s_script_extensions"sv, "s_script_extension_{}"sv, unicode_data.script_extensions); |     TRY(append_code_point_tables("s_script_extensions"sv, unicode_data.script_extension_tables)); | ||||||
|     append_prop_list("s_grapheme_break_properties"sv, "s_grapheme_break_property_{}"sv, unicode_data.grapheme_break_props); |     TRY(append_code_point_tables("s_grapheme_break_properties"sv, unicode_data.grapheme_break_tables)); | ||||||
|     append_prop_list("s_word_break_properties"sv, "s_word_break_property_{}"sv, unicode_data.word_break_props); |     TRY(append_code_point_tables("s_word_break_properties"sv, unicode_data.word_break_tables)); | ||||||
|     append_prop_list("s_sentence_break_properties"sv, "s_sentence_break_property_{}"sv, unicode_data.sentence_break_props); |     TRY(append_code_point_tables("s_sentence_break_properties"sv, unicode_data.sentence_break_tables)); | ||||||
| 
 | 
 | ||||||
|     auto append_code_point_display_names = [&](StringView type, StringView name, auto const& display_names) { |     auto append_code_point_display_names = [&](StringView type, StringView name, auto const& display_names) { | ||||||
|         constexpr size_t max_values_per_row = 30; |         constexpr size_t max_values_per_row = 30; | ||||||
|  | @ -1271,20 +1304,27 @@ Optional<CodePointDecomposition const> code_point_decomposition_by_index(size_t | ||||||
| } | } | ||||||
| )~~~"); | )~~~"); | ||||||
| 
 | 
 | ||||||
|     auto append_prop_search = [&](StringView enum_title, StringView enum_snake, StringView collection_name) { |     auto append_prop_search = [&](StringView enum_title, StringView enum_snake, StringView collection_name) -> ErrorOr<void> { | ||||||
|         generator.set("enum_title", enum_title); |         generator.set("enum_title", enum_title); | ||||||
|         generator.set("enum_snake", enum_snake); |         generator.set("enum_snake", enum_snake); | ||||||
|         generator.set("collection_name", collection_name); |         generator.set("collection_name", collection_name); | ||||||
|  | 
 | ||||||
|  |         TRY(generator.set("LSB_COUNT", TRY(String::number(CodePointTables::LSB_COUNT)))); | ||||||
|  |         TRY(generator.set("LSB_MASK", TRY(String::formatted("{:#x}", CodePointTables::LSB_MASK)))); | ||||||
|  | 
 | ||||||
|         generator.append(R"~~~( |         generator.append(R"~~~( | ||||||
| bool code_point_has_@enum_snake@(u32 code_point, @enum_title@ @enum_snake@) | bool code_point_has_@enum_snake@(u32 code_point, @enum_title@ @enum_snake@) | ||||||
| { | { | ||||||
|     auto index = static_cast<@enum_title@UnderlyingType>(@enum_snake@); |     auto stage1_index = code_point >> @LSB_COUNT@; | ||||||
|     auto const& ranges = @collection_name@.at(index); |     auto stage2_index = @collection_name@_stage1[stage1_index] + (code_point & @LSB_MASK@); | ||||||
|  |     auto unique_properties_index = @collection_name@_stage2[stage2_index]; | ||||||
| 
 | 
 | ||||||
|     auto const* range = binary_search(ranges, code_point, nullptr, CodePointRangeComparator {}); |     auto const& property_set = @collection_name@_unique_properties[unique_properties_index]; | ||||||
|     return range != nullptr; |     return property_set[to_underlying(@enum_snake@)]; | ||||||
| } | } | ||||||
| )~~~"); | )~~~"); | ||||||
|  | 
 | ||||||
|  |         return {}; | ||||||
|     }; |     }; | ||||||
| 
 | 
 | ||||||
|     auto append_from_string = [&](StringView enum_title, StringView enum_snake, auto const& prop_list, Vector<Alias> const& aliases) -> ErrorOr<void> { |     auto append_from_string = [&](StringView enum_title, StringView enum_snake, auto const& prop_list, Vector<Alias> const& aliases) -> ErrorOr<void> { | ||||||
|  | @ -1312,19 +1352,19 @@ bool code_point_has_@enum_snake@(u32 code_point, @enum_title@ @enum_snake@) | ||||||
| 
 | 
 | ||||||
|     TRY(append_from_string("Locale"sv, "locale"sv, unicode_data.locales, {})); |     TRY(append_from_string("Locale"sv, "locale"sv, unicode_data.locales, {})); | ||||||
| 
 | 
 | ||||||
|     append_prop_search("GeneralCategory"sv, "general_category"sv, "s_general_categories"sv); |     TRY(append_prop_search("GeneralCategory"sv, "general_category"sv, "s_general_categories"sv)); | ||||||
|     TRY(append_from_string("GeneralCategory"sv, "general_category"sv, unicode_data.general_categories, unicode_data.general_category_aliases)); |     TRY(append_from_string("GeneralCategory"sv, "general_category"sv, unicode_data.general_categories, unicode_data.general_category_aliases)); | ||||||
| 
 | 
 | ||||||
|     append_prop_search("Property"sv, "property"sv, "s_properties"sv); |     TRY(append_prop_search("Property"sv, "property"sv, "s_properties"sv)); | ||||||
|     TRY(append_from_string("Property"sv, "property"sv, unicode_data.prop_list, unicode_data.prop_aliases)); |     TRY(append_from_string("Property"sv, "property"sv, unicode_data.prop_list, unicode_data.prop_aliases)); | ||||||
| 
 | 
 | ||||||
|     append_prop_search("Script"sv, "script"sv, "s_scripts"sv); |     TRY(append_prop_search("Script"sv, "script"sv, "s_scripts"sv)); | ||||||
|     append_prop_search("Script"sv, "script_extension"sv, "s_script_extensions"sv); |     TRY(append_prop_search("Script"sv, "script_extension"sv, "s_script_extensions"sv)); | ||||||
|     TRY(append_from_string("Script"sv, "script"sv, unicode_data.script_list, unicode_data.script_aliases)); |     TRY(append_from_string("Script"sv, "script"sv, unicode_data.script_list, unicode_data.script_aliases)); | ||||||
| 
 | 
 | ||||||
|     append_prop_search("GraphemeBreakProperty"sv, "grapheme_break_property"sv, "s_grapheme_break_properties"sv); |     TRY(append_prop_search("GraphemeBreakProperty"sv, "grapheme_break_property"sv, "s_grapheme_break_properties"sv)); | ||||||
|     append_prop_search("WordBreakProperty"sv, "word_break_property"sv, "s_word_break_properties"sv); |     TRY(append_prop_search("WordBreakProperty"sv, "word_break_property"sv, "s_word_break_properties"sv)); | ||||||
|     append_prop_search("SentenceBreakProperty"sv, "sentence_break_property"sv, "s_sentence_break_properties"sv); |     TRY(append_prop_search("SentenceBreakProperty"sv, "sentence_break_property"sv, "s_sentence_break_properties"sv)); | ||||||
| 
 | 
 | ||||||
|     generator.append(R"~~~( |     generator.append(R"~~~( | ||||||
| } | } | ||||||
|  | @ -1460,6 +1500,128 @@ static ErrorOr<void> normalize_script_extensions(PropList& script_extensions, Pr | ||||||
|     return {}; |     return {}; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | // The goal here is to produce a set of tables that represent a category of code point properties for every code point.
 | ||||||
|  | // The most naive method would be to generate a single table per category, each with one entry per code point. Each of
 | ||||||
|  | // those tables would have a size of 0x10ffff though, which is a non-starter. Instead, we create a set of 2-stage lookup
 | ||||||
|  | // tables per category.
 | ||||||
|  | //
 | ||||||
|  | // To do so, it's important to note that Unicode tends to organize code points with similar properties together. This
 | ||||||
|  | // leads to long series of code points with identical properties. Therefore, if we divide the 0x10ffff code points into
 | ||||||
|  | // fixed-size blocks, many of those blocks will also be identical.
 | ||||||
|  | //
 | ||||||
|  | // So we iterate over every code point, classifying each one for the category of interest. We represent a classification
 | ||||||
|  | // as a list of booleans. We store the classification in the CodePointTables::unique_properties list for this category.
 | ||||||
|  | // As the name implies, this list is de-duplicated; we store the index into this list in a separate list, which we call
 | ||||||
|  | // a "block".
 | ||||||
|  | //
 | ||||||
|  | // As we iterate, we "pause" every BLOCK_SIZE code points to examine the block. If the block is unique so far, we extend
 | ||||||
|  | // CodePointTables::stage2 with the entries of that block (so CodePointTables::stage2 is also a list of indices into
 | ||||||
|  | // CodePointTables::unique_properties). We then append the index of the start of that block in CodePointTables::stage2
 | ||||||
|  | // to CodePointTables::stage1.
 | ||||||
|  | //
 | ||||||
|  | // The value of BLOCK_SIZE is determined by CodePointTables::MSB_COUNT and CodePointTables::LSB_COUNT. We need 24 bits
 | ||||||
|  | // to describe all code points; the blocks we create are based on splitting these bits into 2 segments. We currently use
 | ||||||
|  | // a 16:8 bit split. So when perform a runtime lookup of a code point in the 2-stage tables, we:
 | ||||||
|  | //
 | ||||||
|  | //     1. Use most-significant 16 bits of the code point as the index into CodePointTables::stage1. That value is the
 | ||||||
|  | //        index into CodePointTables::stage2 of the start of the block that contains properties for this code point.
 | ||||||
|  | //
 | ||||||
|  | //     2. Add the least-significant 8 bits of the code point to that value, to use as the index into
 | ||||||
|  | //        CodePointTables::stage2. As described above, that value is the index into CodePointTables::unique_properties,
 | ||||||
|  | //        which contains the classification for this code point.
 | ||||||
|  | //
 | ||||||
|  | // Using the code point GeneralCategory as an example, we end up with a CodePointTables::stage1 with a size of ~4000,
 | ||||||
|  | // a CodePointTables::stage2 with a size of ~40,000, and a CodePointTables::unique_properties with a size of ~30. So
 | ||||||
|  | // this process reduces over 1 million entries (0x10ffff) to ~44,030.
 | ||||||
|  | //
 | ||||||
|  | // For much more in-depth reading, see: https://icu.unicode.org/design/struct/utrie
 | ||||||
|  | static ErrorOr<void> create_code_point_tables(UnicodeData& unicode_data) | ||||||
|  | { | ||||||
|  |     static constexpr auto MAX_CODE_POINT = 0x10ffffu; | ||||||
|  | 
 | ||||||
|  |     struct TableMetadata { | ||||||
|  |         static ErrorOr<TableMetadata> create(PropList const& property_list) | ||||||
|  |         { | ||||||
|  |             TableMetadata data; | ||||||
|  |             TRY(data.property_values.try_ensure_capacity(property_list.size())); | ||||||
|  |             TRY(data.property_set.try_ensure_capacity(property_list.size())); | ||||||
|  | 
 | ||||||
|  |             auto property_names = property_list.keys(); | ||||||
|  |             quick_sort(property_names); | ||||||
|  | 
 | ||||||
|  |             for (auto const& property_name : property_names) { | ||||||
|  |                 auto const& code_point_rages = property_list.get(property_name).value(); | ||||||
|  |                 data.property_values.unchecked_append(code_point_rages); | ||||||
|  |             } | ||||||
|  | 
 | ||||||
|  |             return data; | ||||||
|  |         } | ||||||
|  | 
 | ||||||
|  |         Vector<typename PropList::ValueType const&> property_values; | ||||||
|  |         Vector<bool> property_set; | ||||||
|  | 
 | ||||||
|  |         Vector<size_t> current_block; | ||||||
|  |         HashMap<decltype(current_block), size_t> unique_blocks; | ||||||
|  |     }; | ||||||
|  | 
 | ||||||
|  |     auto update_tables = [](auto code_point, auto& tables, auto& metadata) -> ErrorOr<void> { | ||||||
|  |         static constexpr auto BLOCK_SIZE = CodePointTables::LSB_MASK + 1; | ||||||
|  | 
 | ||||||
|  |         for (auto const& property_values : metadata.property_values) { | ||||||
|  |             auto has_property = binary_search(property_values, code_point, nullptr, Unicode::CodePointRangeComparator {}) != nullptr; | ||||||
|  |             metadata.property_set.unchecked_append(has_property); | ||||||
|  |         } | ||||||
|  | 
 | ||||||
|  |         size_t unique_properties_index = 0; | ||||||
|  |         if (auto block_index = tables.unique_properties.find_first_index(metadata.property_set); block_index.has_value()) { | ||||||
|  |             unique_properties_index = *block_index; | ||||||
|  |         } else { | ||||||
|  |             unique_properties_index = tables.unique_properties.size(); | ||||||
|  |             TRY(tables.unique_properties.try_append(metadata.property_set)); | ||||||
|  |         } | ||||||
|  | 
 | ||||||
|  |         TRY(metadata.current_block.try_append(unique_properties_index)); | ||||||
|  |         metadata.property_set.clear_with_capacity(); | ||||||
|  | 
 | ||||||
|  |         if (metadata.current_block.size() == BLOCK_SIZE || code_point == MAX_CODE_POINT) { | ||||||
|  |             size_t stage2_index = 0; | ||||||
|  |             if (auto block_index = metadata.unique_blocks.get(metadata.current_block); block_index.has_value()) { | ||||||
|  |                 stage2_index = *block_index; | ||||||
|  |             } else { | ||||||
|  |                 stage2_index = tables.stage2.size(); | ||||||
|  |                 TRY(tables.stage2.try_extend(metadata.current_block)); | ||||||
|  | 
 | ||||||
|  |                 TRY(metadata.unique_blocks.try_set(metadata.current_block, stage2_index)); | ||||||
|  |             } | ||||||
|  | 
 | ||||||
|  |             TRY(tables.stage1.try_append(stage2_index)); | ||||||
|  |             metadata.current_block.clear_with_capacity(); | ||||||
|  |         } | ||||||
|  | 
 | ||||||
|  |         return {}; | ||||||
|  |     }; | ||||||
|  | 
 | ||||||
|  |     auto general_category_metadata = TRY(TableMetadata::create(unicode_data.general_categories)); | ||||||
|  |     auto property_metadata = TRY(TableMetadata::create(unicode_data.prop_list)); | ||||||
|  |     auto script_metadata = TRY(TableMetadata::create(unicode_data.script_list)); | ||||||
|  |     auto script_extension_metadata = TRY(TableMetadata::create(unicode_data.script_extensions)); | ||||||
|  |     auto grapheme_break_metadata = TRY(TableMetadata::create(unicode_data.grapheme_break_props)); | ||||||
|  |     auto word_break_metadata = TRY(TableMetadata::create(unicode_data.word_break_props)); | ||||||
|  |     auto sentence_break_metadata = TRY(TableMetadata::create(unicode_data.sentence_break_props)); | ||||||
|  | 
 | ||||||
|  |     for (u32 code_point = 0; code_point <= MAX_CODE_POINT; ++code_point) { | ||||||
|  |         TRY(update_tables(code_point, unicode_data.general_category_tables, general_category_metadata)); | ||||||
|  |         TRY(update_tables(code_point, unicode_data.property_tables, property_metadata)); | ||||||
|  |         TRY(update_tables(code_point, unicode_data.script_tables, script_metadata)); | ||||||
|  |         TRY(update_tables(code_point, unicode_data.script_extension_tables, script_extension_metadata)); | ||||||
|  |         TRY(update_tables(code_point, unicode_data.grapheme_break_tables, grapheme_break_metadata)); | ||||||
|  |         TRY(update_tables(code_point, unicode_data.word_break_tables, word_break_metadata)); | ||||||
|  |         TRY(update_tables(code_point, unicode_data.sentence_break_tables, sentence_break_metadata)); | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     return {}; | ||||||
|  | } | ||||||
|  | 
 | ||||||
| ErrorOr<int> serenity_main(Main::Arguments arguments) | ErrorOr<int> serenity_main(Main::Arguments arguments) | ||||||
| { | { | ||||||
|     StringView generated_header_path; |     StringView generated_header_path; | ||||||
|  | @ -1551,6 +1713,8 @@ ErrorOr<int> serenity_main(Main::Arguments arguments) | ||||||
|     TRY(parse_value_alias_list(*prop_value_alias_file, "sc"sv, unicode_data.script_list.keys(), unicode_data.script_aliases, false)); |     TRY(parse_value_alias_list(*prop_value_alias_file, "sc"sv, unicode_data.script_list.keys(), unicode_data.script_aliases, false)); | ||||||
|     TRY(normalize_script_extensions(unicode_data.script_extensions, unicode_data.script_list, unicode_data.script_aliases)); |     TRY(normalize_script_extensions(unicode_data.script_extensions, unicode_data.script_list, unicode_data.script_aliases)); | ||||||
| 
 | 
 | ||||||
|  |     TRY(create_code_point_tables(unicode_data)); | ||||||
|  | 
 | ||||||
|     TRY(generate_unicode_data_header(*generated_header_file, unicode_data)); |     TRY(generate_unicode_data_header(*generated_header_file, unicode_data)); | ||||||
|     TRY(generate_unicode_data_implementation(*generated_implementation_file, unicode_data)); |     TRY(generate_unicode_data_implementation(*generated_implementation_file, unicode_data)); | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -594,6 +594,28 @@ TEST_CASE(general_category) | ||||||
|     } |     } | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | BENCHMARK_CASE(general_category_performance) | ||||||
|  | { | ||||||
|  |     auto general_category_cased_letter = Unicode::general_category_from_string("Cased_Letter"sv).value(); | ||||||
|  | 
 | ||||||
|  |     for (size_t i = 0; i < 1'000'000; ++i) { | ||||||
|  |         for (u32 code_point = 0; code_point <= 0x1f; ++code_point) | ||||||
|  |             EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_cased_letter)); | ||||||
|  | 
 | ||||||
|  |         for (u32 code_point = 0x41; code_point <= 0x5a; ++code_point) | ||||||
|  |             EXPECT(Unicode::code_point_has_general_category(code_point, general_category_cased_letter)); | ||||||
|  | 
 | ||||||
|  |         for (u32 code_point = 0x61; code_point <= 0x7a; ++code_point) | ||||||
|  |             EXPECT(Unicode::code_point_has_general_category(code_point, general_category_cased_letter)); | ||||||
|  | 
 | ||||||
|  |         for (u32 code_point = 0xe000; code_point <= 0xe100; ++code_point) | ||||||
|  |             EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_cased_letter)); | ||||||
|  | 
 | ||||||
|  |         for (u32 code_point = 0x101fe; code_point <= 0x1027f; ++code_point) | ||||||
|  |             EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_cased_letter)); | ||||||
|  |     } | ||||||
|  | } | ||||||
|  | 
 | ||||||
| TEST_CASE(property) | TEST_CASE(property) | ||||||
| { | { | ||||||
|     auto property = [](StringView name) { |     auto property = [](StringView name) { | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Timothy Flynn
						Timothy Flynn