diff --git a/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp b/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp index a71fd0d9f3..481855fd06 100644 --- a/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp +++ b/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp @@ -213,3 +213,76 @@ TEST_CASE(to_unicode_uppercase_unconditional_special_casing) result = Unicode::to_unicode_uppercase_full("\u1FF7"sv); EXPECT_EQ(result, "\u03A9\u0342\u0399"); } + +TEST_CASE(general_category) +{ + auto general_category = [](StringView name) { + auto general_category = Unicode::general_category_from_string(name); + VERIFY(general_category.has_value()); + return *general_category; + }; + + auto general_category_c = general_category("C"sv); + auto general_category_other = general_category("Other"sv); + EXPECT_EQ(general_category_c, general_category_other); + + auto general_category_cc = general_category("Cc"sv); + auto general_category_control = general_category("Control"sv); + EXPECT_EQ(general_category_cc, general_category_control); + + auto general_category_co = general_category("Co"sv); + auto general_category_private_use = general_category("Private_Use"sv); + EXPECT_EQ(general_category_co, general_category_private_use); + + auto general_category_lc = general_category("LC"sv); + auto general_category_cased_letter = general_category("Cased_Letter"sv); + EXPECT_EQ(general_category_lc, general_category_cased_letter); + + auto general_category_ll = general_category("Ll"sv); + auto general_category_lowercase_letter = general_category("Lowercase_Letter"sv); + EXPECT_EQ(general_category_ll, general_category_lowercase_letter); + + auto general_category_lu = general_category("Lu"sv); + auto general_category_uppercase_letter = general_category("Uppercase_Letter"sv); + EXPECT_EQ(general_category_lu, general_category_uppercase_letter); + + for (u32 code_point = 0; code_point <= 0x1f; ++code_point) { + EXPECT(Unicode::code_point_has_general_category(code_point, general_category_c)); + EXPECT(Unicode::code_point_has_general_category(code_point, general_category_cc)); + + EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_co)); + EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_lc)); + EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_ll)); + EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_lu)); + } + + for (u32 code_point = 0xe000; code_point <= 0xe100; ++code_point) { + EXPECT(Unicode::code_point_has_general_category(code_point, general_category_c)); + EXPECT(Unicode::code_point_has_general_category(code_point, general_category_co)); + + EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_cc)); + EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_lc)); + EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_ll)); + EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_lu)); + } + + for (u32 code_point = 0x61; code_point <= 0x7a; ++code_point) { + EXPECT(Unicode::code_point_has_general_category(code_point, general_category_lc)); + EXPECT(Unicode::code_point_has_general_category(code_point, general_category_ll)); + + EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_c)); + EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_cc)); + EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_co)); + EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_lu)); + } + + for (u32 code_point = 0x41; code_point <= 0x5a; ++code_point) { + EXPECT(Unicode::code_point_has_general_category(code_point, general_category_lc)); + EXPECT(Unicode::code_point_has_general_category(code_point, general_category_lu)); + + EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_c)); + EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_cc)); + EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_co)); + EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_ll)); + } +} diff --git a/Userland/Libraries/LibUnicode/CharacterTypes.cpp b/Userland/Libraries/LibUnicode/CharacterTypes.cpp index a6de8ff331..667f5e4548 100644 --- a/Userland/Libraries/LibUnicode/CharacterTypes.cpp +++ b/Userland/Libraries/LibUnicode/CharacterTypes.cpp @@ -22,11 +22,6 @@ namespace Unicode { #if ENABLE_UNICODE_DATA -static bool has_general_category(UnicodeData const& unicode_data, GeneralCategory general_category) -{ - return (unicode_data.general_category & general_category) != GeneralCategory::None; -} - static bool has_property(UnicodeData const& unicode_data, Property property) { return (unicode_data.properties & property) == property; @@ -214,11 +209,7 @@ Optional general_category_from_string([[maybe_unused]] StringVi bool code_point_has_general_category([[maybe_unused]] u32 code_point, [[maybe_unused]] GeneralCategory general_category) { #if ENABLE_UNICODE_DATA - auto unicode_data = Detail::unicode_data_for_code_point(code_point); - if (!unicode_data.has_value()) - return false; - - return has_general_category(*unicode_data, general_category); + return Detail::code_point_has_general_category(code_point, general_category); #else return {}; #endif diff --git a/Userland/Libraries/LibUnicode/CodeGenerators/GenerateUnicodeData.cpp b/Userland/Libraries/LibUnicode/CodeGenerators/GenerateUnicodeData.cpp index 3403fa4e6c..58fb81cc0b 100644 --- a/Userland/Libraries/LibUnicode/CodeGenerators/GenerateUnicodeData.cpp +++ b/Userland/Libraries/LibUnicode/CodeGenerators/GenerateUnicodeData.cpp @@ -55,7 +55,6 @@ struct Alias { struct CodePointData { u32 code_point { 0 }; String name; - String general_category; u8 canonical_combining_class { 0 }; String bidi_class; String decomposition_type; @@ -84,20 +83,7 @@ struct UnicodeData { Vector code_point_data; Vector code_point_ranges; - // The Unicode standard defines General Category values which are not in any UCD file. These - // values are simply unions of other values. - // https://www.unicode.org/reports/tr44/#GC_Values_Table - Vector general_categories; - Vector general_category_unions { - { "Ll | Lu | Lt"sv, "LC"sv }, - { "Lu | Ll | Lt | Lm | Lo"sv, "L"sv }, - { "Mn | Mc | Me"sv, "M"sv }, - { "Nd | Nl | No"sv, "N"sv }, - { "Pc | Pd | Ps | Pe | Pi | Pf | Po"sv, "P"sv }, - { "Sm | Sc | Sk | So"sv, "S"sv }, - { "Zs | Zl | Zp"sv, "Z"sv }, - { "Cc | Cf | Cs | Co"sv, "C"sv }, // FIXME: This union should also contain "Cn" (Unassigned), which we don't parse yet. - }; + PropList general_categories; Vector general_category_aliases; // The Unicode standard defines additional properties (Any, Assigned, ASCII) which are not in @@ -120,7 +106,6 @@ struct UnicodeData { }; static constexpr auto s_desired_fields = Array { - "general_category"sv, "simple_uppercase_mapping"sv, "simple_lowercase_mapping"sv, }; @@ -278,7 +263,7 @@ static void parse_alias_list(Core::File& file, PropList const& prop_list, Vector } } -static void parse_value_alias_list(Core::File& file, StringView desired_category, Vector const& value_list, Vector const& prop_unions, Vector& prop_aliases, bool primary_value_is_first = true) +static void parse_value_alias_list(Core::File& file, StringView desired_category, Vector const& value_list, Vector& prop_aliases, bool primary_value_is_first = true) { VERIFY(file.seek(0)); @@ -288,7 +273,7 @@ static void parse_value_alias_list(Core::File& file, StringView desired_category return; // FIXME: We will, eventually, need to find where missing properties are located and parse them. - if (!value_list.contains_slow(value) && !any_of(prop_unions, [&](auto const& u) { return value == u.alias; })) + if (!value_list.contains_slow(value)) return; prop_aliases.append({ value, alias }); @@ -363,7 +348,6 @@ static void parse_unicode_data(Core::File& file, UnicodeData& unicode_data) CodePointData data {}; data.code_point = AK::StringUtils::convert_to_uint_from_hex(segments[0]).value(); data.name = move(segments[1]); - data.general_category = move(segments[2]); data.canonical_combining_class = AK::StringUtils::convert_to_uint(segments[3]).value(); data.bidi_class = move(segments[4]); data.decomposition_type = move(segments[5]); @@ -402,10 +386,6 @@ static void parse_unicode_data(Core::File& file, UnicodeData& unicode_data) unicode_data.largest_special_casing_size = max(unicode_data.largest_special_casing_size, data.special_casing_indices.size()); unicode_data.largest_script_extensions_size = max(unicode_data.largest_script_extensions_size, data.script_extensions.size()); - - if (!unicode_data.general_categories.contains_slow(data.general_category)) - unicode_data.general_categories.append(data.general_category); - unicode_data.code_point_data.append(move(data)); } } @@ -418,24 +398,19 @@ static void generate_unicode_data_header(Core::File& file, UnicodeData& unicode_ generator.set("special_casing_size", String::number(unicode_data.largest_special_casing_size)); generator.set("script_extensions_size", String::number(unicode_data.largest_script_extensions_size)); - auto generate_enum = [&](StringView name, StringView default_, Vector values, Vector unions = {}, Vector aliases = {}, bool as_bitmask = false) { + auto generate_enum = [&](StringView name, StringView default_, Vector values, Vector aliases = {}, bool as_bitmask = false) { VERIFY(!as_bitmask || (values.size() <= 64)); quick_sort(values); - quick_sort(unions, [](auto& union1, auto& union2) { return union1.alias < union2.alias; }); quick_sort(aliases, [](auto& alias1, auto& alias2) { return alias1.alias < alias2.alias; }); generator.set("name", name); generator.set("underlying", String::formatted("{}UnderlyingType", name)); + generator.set("underlying_type", as_bitmask ? "u64"sv : "u8"sv); - if (as_bitmask) { - generator.append(R"~~~( -using @underlying@ = u64; + generator.append(R"~~~( +using @underlying@ = @underlying_type@; enum class @name@ : @underlying@ {)~~~"); - } else { - generator.append(R"~~~( -enum class @name@ {)~~~"); - } if (!default_.is_empty()) { generator.set("default", default_); @@ -457,12 +432,6 @@ enum class @name@ {)~~~"); } } - for (auto const& union_ : unions) { - generator.set("union", union_.alias); - generator.set("value", union_.property); - generator.append(R"~~~( - @union@ = @value@,)~~~"); - } for (auto const& alias : aliases) { generator.set("alias", alias.alias); generator.set("value", alias.property); @@ -501,9 +470,9 @@ namespace Unicode { generate_enum("Locale"sv, "None"sv, move(unicode_data.locales)); generate_enum("Condition"sv, "None"sv, move(unicode_data.conditions)); - generate_enum("GeneralCategory"sv, "None"sv, unicode_data.general_categories, unicode_data.general_category_unions, unicode_data.general_category_aliases, true); - generate_enum("Property"sv, "Assigned"sv, unicode_data.prop_list.keys(), {}, unicode_data.prop_aliases, true); - generate_enum("Script"sv, {}, unicode_data.script_list.keys(), {}, unicode_data.script_aliases); + generate_enum("GeneralCategory"sv, {}, unicode_data.general_categories.keys(), unicode_data.general_category_aliases); + generate_enum("Property"sv, "Assigned"sv, unicode_data.prop_list.keys(), unicode_data.prop_aliases, true); + generate_enum("Script"sv, {}, unicode_data.script_list.keys(), unicode_data.script_aliases); generator.append(R"~~~( struct SpecialCasing { @@ -537,7 +506,6 @@ struct UnicodeData { // Note: For compile-time performance, only primitive types are used. append_field("char const*"sv, "name"sv); - append_field("GeneralCategory"sv, "general_category"sv); append_field("u8"sv, "canonical_combining_class"sv); append_field("char const*"sv, "bidi_class"sv); append_field("char const*"sv, "decomposition_type"sv); @@ -566,8 +534,12 @@ struct UnicodeData { namespace Detail { Optional unicode_data_for_code_point(u32 code_point); -Optional property_from_string(StringView const& property); + +bool code_point_has_general_category(u32 code_point, GeneralCategory general_category); Optional general_category_from_string(StringView const& general_category); + +Optional property_from_string(StringView const& property); + Optional