diff --git a/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp b/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp index 6450418c47..24c9e0c881 100644 --- a/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp +++ b/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp @@ -305,3 +305,73 @@ TEST_CASE(general_category) EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_ll)); } } + +TEST_CASE(property) +{ + auto property = [](StringView name) { + auto property = Unicode::property_from_string(name); + VERIFY(property.has_value()); + return *property; + }; + + auto property_any = property("Any"sv); + auto property_assigned = property("Assigned"sv); + auto property_ascii = property("ASCII"sv); + + auto property_white_space = property("White_Space"sv); + auto property_wspace = property("WSpace"sv); + auto property_space = property("space"sv); + EXPECT_EQ(property_white_space, property_wspace); + EXPECT_EQ(property_white_space, property_space); + + auto property_emoji_presentation = property("Emoji_Presentation"sv); + auto property_epres = property("EPres"sv); + EXPECT_EQ(property_emoji_presentation, property_epres); + + for (u32 code_point = 0; code_point <= 0x10ffff; code_point += 1000) + EXPECT(Unicode::code_point_has_property(code_point, property_any)); + + for (u32 code_point = 0x101d0; code_point <= 0x101fd; ++code_point) { + EXPECT(Unicode::code_point_has_property(code_point, property_any)); + EXPECT(Unicode::code_point_has_property(code_point, property_assigned)); + + EXPECT(!Unicode::code_point_has_property(code_point, property_ascii)); + EXPECT(!Unicode::code_point_has_property(code_point, property_white_space)); + EXPECT(!Unicode::code_point_has_property(code_point, property_emoji_presentation)); + } + + for (u32 code_point = 0x101fe; code_point <= 0x1027f; ++code_point) { + EXPECT(Unicode::code_point_has_property(code_point, property_any)); + + EXPECT(!Unicode::code_point_has_property(code_point, property_assigned)); + EXPECT(!Unicode::code_point_has_property(code_point, property_ascii)); + EXPECT(!Unicode::code_point_has_property(code_point, property_white_space)); + EXPECT(!Unicode::code_point_has_property(code_point, property_emoji_presentation)); + } + + for (u32 code_point = 0; code_point <= 0x7f; ++code_point) { + EXPECT(Unicode::code_point_has_property(code_point, property_any)); + EXPECT(Unicode::code_point_has_property(code_point, property_assigned)); + EXPECT(Unicode::code_point_has_property(code_point, property_ascii)); + + EXPECT(!Unicode::code_point_has_property(code_point, property_emoji_presentation)); + } + + for (u32 code_point = 0x9; code_point <= 0xd; ++code_point) { + EXPECT(Unicode::code_point_has_property(code_point, property_any)); + EXPECT(Unicode::code_point_has_property(code_point, property_assigned)); + EXPECT(Unicode::code_point_has_property(code_point, property_ascii)); + EXPECT(Unicode::code_point_has_property(code_point, property_white_space)); + + EXPECT(!Unicode::code_point_has_property(code_point, property_emoji_presentation)); + } + + for (u32 code_point = 0x1f3e5; code_point <= 0x1f3f0; ++code_point) { + EXPECT(Unicode::code_point_has_property(code_point, property_any)); + EXPECT(Unicode::code_point_has_property(code_point, property_assigned)); + EXPECT(Unicode::code_point_has_property(code_point, property_emoji_presentation)); + + EXPECT(!Unicode::code_point_has_property(code_point, property_ascii)); + EXPECT(!Unicode::code_point_has_property(code_point, property_white_space)); + } +} diff --git a/Userland/Libraries/LibUnicode/CharacterTypes.cpp b/Userland/Libraries/LibUnicode/CharacterTypes.cpp index 667f5e4548..c5211390d8 100644 --- a/Userland/Libraries/LibUnicode/CharacterTypes.cpp +++ b/Userland/Libraries/LibUnicode/CharacterTypes.cpp @@ -22,11 +22,6 @@ namespace Unicode { #if ENABLE_UNICODE_DATA -static bool has_property(UnicodeData const& unicode_data, Property property) -{ - return (unicode_data.properties & property) == property; -} - static bool is_final_code_point(Utf8View const& string, size_t index, size_t byte_length) { // C is preceded by a sequence consisting of a cased letter and then zero or more case-ignorable @@ -40,12 +35,8 @@ static bool is_final_code_point(Utf8View const& string, size_t index, size_t byt size_t cased_letter_count = 0; for (auto code_point : preceding_view) { - auto unicode_data = Detail::unicode_data_for_code_point(code_point); - if (!unicode_data.has_value()) - return false; - - bool is_cased = has_property(*unicode_data, Property::Cased); - bool is_case_ignorable = has_property(*unicode_data, Property::Case_Ignorable); + bool is_cased = code_point_has_property(code_point, Property::Cased); + bool is_case_ignorable = code_point_has_property(code_point, Property::Case_Ignorable); if (is_cased && !is_case_ignorable) ++cased_letter_count; @@ -57,12 +48,8 @@ static bool is_final_code_point(Utf8View const& string, size_t index, size_t byt return false; for (auto code_point : following_view) { - auto unicode_data = Detail::unicode_data_for_code_point(code_point); - if (!unicode_data.has_value()) - return false; - - bool is_cased = has_property(*unicode_data, Property::Cased); - bool is_case_ignorable = has_property(*unicode_data, Property::Case_Ignorable); + bool is_cased = code_point_has_property(code_point, Property::Cased); + bool is_case_ignorable = code_point_has_property(code_point, Property::Case_Ignorable); if (is_case_ignorable) continue; @@ -227,14 +214,7 @@ Optional property_from_string([[maybe_unused]] StringView const& prope bool code_point_has_property([[maybe_unused]] u32 code_point, [[maybe_unused]] Property property) { #if ENABLE_UNICODE_DATA - if (property == Property::Any) - return is_unicode(code_point); - - auto unicode_data = Detail::unicode_data_for_code_point(code_point); - if (!unicode_data.has_value()) - return false; - - return has_property(*unicode_data, property); + return Detail::code_point_has_property(code_point, property); #else return false; #endif diff --git a/Userland/Libraries/LibUnicode/CodeGenerators/GenerateUnicodeData.cpp b/Userland/Libraries/LibUnicode/CodeGenerators/GenerateUnicodeData.cpp index 6a689d206d..c31e5f9c31 100644 --- a/Userland/Libraries/LibUnicode/CodeGenerators/GenerateUnicodeData.cpp +++ b/Userland/Libraries/LibUnicode/CodeGenerators/GenerateUnicodeData.cpp @@ -68,7 +68,6 @@ struct CodePointData { Optional simple_lowercase_mapping; Optional simple_titlecase_mapping; Vector special_casing_indices; - Vector prop_list; StringView script; Vector script_extensions; }; @@ -87,12 +86,11 @@ struct UnicodeData { Vector general_category_aliases; // The Unicode standard defines additional properties (Any, Assigned, ASCII) which are not in - // any UCD file. Assigned is set as the default enum value 0 so "property & Assigned == Assigned" - // is always true. Any is not assigned code points here because this file only parses assigned - // code points, whereas Any will include unassigned code points. + // any UCD file. Assigned code point ranges are derived as this generator is executed. // https://unicode.org/reports/tr18/#General_Category_Property PropList prop_list { - { "Any"sv, {} }, + { "Any"sv, { { 0, 0x10ffff } } }, + { "Assigned"sv, {} }, { "ASCII"sv, { { 0, 0x7f } } }, }; Vector prop_aliases; @@ -309,6 +307,10 @@ static void parse_unicode_data(Core::File& file, UnicodeData& unicode_data) { Optional code_point_range_start; + auto& assigned_code_points = unicode_data.prop_list.find("Assigned"sv)->value; + Optional assigned_code_point_range_start = 0; + u32 previous_code_point = 0; + auto assign_code_point_property = [&](u32 code_point, auto const& list, auto& property, StringView default_) { using PropertyType = RemoveCVReference; constexpr bool is_single_item = IsSame; @@ -361,18 +363,31 @@ static void parse_unicode_data(Core::File& file, UnicodeData& unicode_data) data.simple_lowercase_mapping = AK::StringUtils::convert_to_uint_from_hex(segments[13]); data.simple_titlecase_mapping = AK::StringUtils::convert_to_uint_from_hex(segments[14]); + if (!assigned_code_point_range_start.has_value()) + assigned_code_point_range_start = data.code_point; + if (data.name.starts_with("<"sv) && data.name.ends_with(", First>")) { - VERIFY(!code_point_range_start.has_value()); + VERIFY(!code_point_range_start.has_value() && assigned_code_point_range_start.has_value()); code_point_range_start = data.code_point; data.name = data.name.substring(1, data.name.length() - 9); + + assigned_code_points.append({ *assigned_code_point_range_start, previous_code_point }); + assigned_code_point_range_start.clear(); } else if (data.name.starts_with("<"sv) && data.name.ends_with(", Last>")) { VERIFY(code_point_range_start.has_value()); - unicode_data.code_point_ranges.append({ *code_point_range_start, data.code_point }); - data.name = data.name.substring(1, data.name.length() - 8); + CodePointRange code_point_range { *code_point_range_start, data.code_point }; + unicode_data.code_point_ranges.append(code_point_range); + assigned_code_points.append(code_point_range); + data.name = data.name.substring(1, data.name.length() - 8); code_point_range_start.clear(); + } else if ((data.code_point > 0) && (data.code_point - previous_code_point) != 1) { + VERIFY(assigned_code_point_range_start.has_value()); + + assigned_code_points.append({ *assigned_code_point_range_start, previous_code_point }); + assigned_code_point_range_start = data.code_point; } for (auto const& casing : unicode_data.special_casing) { @@ -380,12 +395,13 @@ static void parse_unicode_data(Core::File& file, UnicodeData& unicode_data) data.special_casing_indices.append(casing.index); } - assign_code_point_property(data.code_point, unicode_data.prop_list, data.prop_list, "Assigned"sv); assign_code_point_property(data.code_point, unicode_data.script_list, data.script, "Unknown"sv); assign_code_point_property(data.code_point, unicode_data.script_extensions, data.script_extensions, {}); unicode_data.largest_special_casing_size = max(unicode_data.largest_special_casing_size, data.special_casing_indices.size()); unicode_data.largest_script_extensions_size = max(unicode_data.largest_script_extensions_size, data.script_extensions.size()); + + previous_code_point = data.code_point; unicode_data.code_point_data.append(move(data)); } } @@ -398,17 +414,15 @@ static void generate_unicode_data_header(Core::File& file, UnicodeData& unicode_ generator.set("special_casing_size", String::number(unicode_data.largest_special_casing_size)); generator.set("script_extensions_size", String::number(unicode_data.largest_script_extensions_size)); - auto generate_enum = [&](StringView name, StringView default_, Vector values, Vector aliases = {}, bool as_bitmask = false) { - VERIFY(!as_bitmask || (values.size() <= 64)); + auto generate_enum = [&](StringView name, StringView default_, Vector values, Vector aliases = {}) { quick_sort(values); quick_sort(aliases, [](auto& alias1, auto& alias2) { return alias1.alias < alias2.alias; }); generator.set("name", name); generator.set("underlying", String::formatted("{}UnderlyingType", name)); - generator.set("underlying_type", as_bitmask ? "u64"sv : "u8"sv); generator.append(R"~~~( -using @underlying@ = @underlying_type@; +using @underlying@ = u8; enum class @name@ : @underlying@ {)~~~"); @@ -418,18 +432,10 @@ enum class @name@ : @underlying@ {)~~~"); @default@,)~~~"); } - u8 index = 0; for (auto const& value : values) { generator.set("value", value); - - if (as_bitmask) { - generator.set("index", String::number(index++)); - generator.append(R"~~~( - @value@ = static_cast<@underlying@>(1) << @index@,)~~~"); - } else { - generator.append(R"~~~( + generator.append(R"~~~( @value@,)~~~"); - } } for (auto const& alias : aliases) { @@ -442,20 +448,6 @@ enum class @name@ : @underlying@ {)~~~"); generator.append(R"~~~( }; )~~~"); - - if (as_bitmask) { - generator.append(R"~~~( -constexpr @name@ operator&(@name@ value1, @name@ value2) -{ - return static_cast<@name@>(static_cast<@underlying@>(value1) & static_cast<@underlying@>(value2)); -} - -constexpr @name@ operator|(@name@ value1, @name@ value2) -{ - return static_cast<@name@>(static_cast<@underlying@>(value1) | static_cast<@underlying@>(value2)); -} -)~~~"); - } }; generator.append(R"~~~( @@ -471,7 +463,7 @@ namespace Unicode { generate_enum("Locale"sv, "None"sv, move(unicode_data.locales)); generate_enum("Condition"sv, "None"sv, move(unicode_data.conditions)); generate_enum("GeneralCategory"sv, {}, unicode_data.general_categories.keys(), unicode_data.general_category_aliases); - generate_enum("Property"sv, "Assigned"sv, unicode_data.prop_list.keys(), unicode_data.prop_aliases, true); + generate_enum("Property"sv, {}, unicode_data.prop_list.keys(), unicode_data.prop_aliases); generate_enum("Script"sv, {}, unicode_data.script_list.keys(), unicode_data.script_aliases); generator.append(R"~~~( @@ -524,8 +516,6 @@ struct UnicodeData { SpecialCasing const* special_casing[@special_casing_size@] {}; u32 special_casing_size { 0 }; - Property properties { Property::Assigned }; - Script script { Script::Unknown }; Script script_extensions[@script_extensions_size@]; u32 script_extensions_size { 0 }; @@ -538,6 +528,7 @@ Optional unicode_data_for_code_point(u32 code_point); bool code_point_has_general_category(u32 code_point, GeneralCategory general_category); Optional general_category_from_string(StringView const& general_category); +bool code_point_has_property(u32 code_point, Property property); Optional property_from_string(StringView const& property); Optional