mirror of
https://github.com/RGBCube/serenity
synced 2025-07-25 05:27:43 +00:00
LibUnicode: Generate General Category unions and aliases
This downloads the PropertyValueAliases.txt UCD file, which contains a set of General Category aliases. The General Category enumeration is now generated as a bitmask so that General Category unions can be expressed easily. For example, the LC (Cased_Letter) category is the union of the Ll, Lu, and Lt categories.
This commit is contained in:
parent
f87cc85cd3
commit
16e86ae743
3 changed files with 76 additions and 7 deletions
|
@ -85,9 +85,24 @@ struct UnicodeData {
|
||||||
|
|
||||||
Vector<CodePointData> code_point_data;
|
Vector<CodePointData> code_point_data;
|
||||||
Vector<CodePointRange> code_point_ranges;
|
Vector<CodePointRange> code_point_ranges;
|
||||||
Vector<String> general_categories;
|
|
||||||
u32 last_contiguous_code_point { 0 };
|
u32 last_contiguous_code_point { 0 };
|
||||||
|
|
||||||
|
// The Unicode standard defines General Category values which are not in any UCD file. These
|
||||||
|
// values are simply unions of other values.
|
||||||
|
// https://www.unicode.org/reports/tr44/#GC_Values_Table
|
||||||
|
Vector<String> general_categories;
|
||||||
|
Vector<Alias> general_category_unions {
|
||||||
|
{ "Ll | Lu | Lt"sv, "LC"sv },
|
||||||
|
{ "Lu | Ll | Lt | Lm | Lo"sv, "L"sv },
|
||||||
|
{ "Mn | Mc | Me"sv, "M"sv },
|
||||||
|
{ "Nd | Nl | No"sv, "N"sv },
|
||||||
|
{ "Pc | Pd | Ps | Pe | Pi | Pf | Po"sv, "P"sv },
|
||||||
|
{ "Sm | Sc | Sk | So"sv, "S"sv },
|
||||||
|
{ "Zs | Zl | Zp"sv, "Z"sv },
|
||||||
|
{ "Cc | Cf | Cs | Co"sv, "C"sv }, // FIXME: This union should also contain "Cn" (Unassigned), which we don't parse yet.
|
||||||
|
};
|
||||||
|
Vector<Alias> general_category_aliases;
|
||||||
|
|
||||||
PropList prop_list;
|
PropList prop_list;
|
||||||
Vector<Alias> prop_aliases;
|
Vector<Alias> prop_aliases;
|
||||||
PropList word_break_prop_list;
|
PropList word_break_prop_list;
|
||||||
|
@ -233,6 +248,42 @@ static void parse_alias_list(Core::File& file, PropList const& prop_list, Vector
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Reads `file` line by line and collects value aliases for a single property category.
//
// Each usable line is split on ';'. Only lines whose first field (whitespace-trimmed) equals
// `desired_category` are considered; such a line must have three or four fields. The second field
// is the value and the third (plus the optional fourth) field is an alias for that value.
//
// An alias is appended to `prop_aliases` only when its value is either present in `value_list` or
// equal to the `alias` member of an entry in `prop_unions`. All other values are silently dropped.
static void parse_value_alias_list(Core::File& file, StringView desired_category, Vector<String> const& value_list, Vector<Alias>& prop_unions, Vector<Alias>& prop_aliases)
{
    // FIXME: We will, eventually, need to find where missing properties are located and parse them.
    auto is_known_value = [&](auto candidate) {
        return value_list.contains_slow(candidate)
            || any_of(prop_unions, [&](auto const& entry) { return candidate == entry.alias; });
    };

    while (file.can_read_line()) {
        auto line = file.read_line();

        // Blank lines and lines that start with '#' carry no data.
        if (line.is_empty() || line.starts_with('#'))
            continue;

        // Cut the line off at the first '#', discarding the rest.
        if (auto comment_start = line.find('#'); comment_start.has_value())
            line = line.substring(0, *comment_start);

        auto fields = line.split_view(';', true);
        if (fields[0].trim_whitespace() != desired_category)
            continue;

        VERIFY((fields.size() == 3) || (fields.size() == 4));

        // The same value applies to every alias on this line, so it only needs to be vetted once.
        auto value = fields[1].trim_whitespace();
        if (!is_known_value(value))
            continue;

        // Fields from index 2 onwards (one or two of them) are the aliases.
        for (size_t index = 2; index < fields.size(); ++index)
            prop_aliases.append({ value, fields[index].trim_whitespace() });
    }
}
|
||||||
|
|
||||||
static void parse_unicode_data(Core::File& file, UnicodeData& unicode_data)
|
static void parse_unicode_data(Core::File& file, UnicodeData& unicode_data)
|
||||||
{
|
{
|
||||||
Optional<u32> code_point_range_start;
|
Optional<u32> code_point_range_start;
|
||||||
|
@ -335,9 +386,10 @@ static void generate_unicode_data_header(UnicodeData& unicode_data)
|
||||||
generator.set("casing_transform_size", String::number(unicode_data.largest_casing_transform_size));
|
generator.set("casing_transform_size", String::number(unicode_data.largest_casing_transform_size));
|
||||||
generator.set("special_casing_size", String::number(unicode_data.largest_special_casing_size));
|
generator.set("special_casing_size", String::number(unicode_data.largest_special_casing_size));
|
||||||
|
|
||||||
auto generate_enum = [&](StringView name, StringView default_, Vector<String> values, Vector<Alias> aliases = {}, bool as_bitmask = false) {
|
auto generate_enum = [&](StringView name, StringView default_, Vector<String> values, Vector<Alias> unions = {}, Vector<Alias> aliases = {}, bool as_bitmask = false) {
|
||||||
VERIFY((values.size() + !default_.is_empty()) <= 64);
|
VERIFY((values.size() + !default_.is_empty()) <= 64);
|
||||||
quick_sort(values);
|
quick_sort(values);
|
||||||
|
quick_sort(unions, [](auto& union1, auto& union2) { return union1.alias < union2.alias; });
|
||||||
quick_sort(aliases, [](auto& alias1, auto& alias2) { return alias1.alias < alias2.alias; });
|
quick_sort(aliases, [](auto& alias1, auto& alias2) { return alias1.alias < alias2.alias; });
|
||||||
|
|
||||||
generator.set("name", name);
|
generator.set("name", name);
|
||||||
|
@ -373,6 +425,12 @@ enum class @name@ {)~~~");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
for (auto const& union_ : unions) {
|
||||||
|
generator.set("union", union_.alias);
|
||||||
|
generator.set("value", union_.property);
|
||||||
|
generator.append(R"~~~(
|
||||||
|
@union@ = @value@,)~~~");
|
||||||
|
}
|
||||||
for (auto const& alias : aliases) {
|
for (auto const& alias : aliases) {
|
||||||
generator.set("alias", alias.alias);
|
generator.set("alias", alias.alias);
|
||||||
generator.set("value", alias.property);
|
generator.set("value", alias.property);
|
||||||
|
@ -411,8 +469,8 @@ namespace Unicode {
|
||||||
|
|
||||||
generate_enum("Locale"sv, "None"sv, move(unicode_data.locales));
|
generate_enum("Locale"sv, "None"sv, move(unicode_data.locales));
|
||||||
generate_enum("Condition"sv, "None"sv, move(unicode_data.conditions));
|
generate_enum("Condition"sv, "None"sv, move(unicode_data.conditions));
|
||||||
generate_enum("GeneralCategory"sv, {}, move(unicode_data.general_categories));
|
generate_enum("GeneralCategory"sv, "None"sv, move(unicode_data.general_categories), move(unicode_data.general_category_unions), move(unicode_data.general_category_aliases), true);
|
||||||
generate_enum("Property"sv, "Assigned"sv, unicode_data.prop_list.keys(), unicode_data.prop_aliases, true);
|
generate_enum("Property"sv, "Assigned"sv, unicode_data.prop_list.keys(), {}, unicode_data.prop_aliases, true);
|
||||||
generate_enum("WordBreakProperty"sv, "Other"sv, unicode_data.word_break_prop_list.keys());
|
generate_enum("WordBreakProperty"sv, "Other"sv, unicode_data.word_break_prop_list.keys());
|
||||||
|
|
||||||
generator.append(R"~~~(
|
generator.append(R"~~~(
|
||||||
|
@ -665,6 +723,7 @@ int main(int argc, char** argv)
|
||||||
char const* prop_list_path = nullptr;
|
char const* prop_list_path = nullptr;
|
||||||
char const* derived_core_prop_path = nullptr;
|
char const* derived_core_prop_path = nullptr;
|
||||||
char const* prop_alias_path = nullptr;
|
char const* prop_alias_path = nullptr;
|
||||||
|
char const* prop_value_alias_path = nullptr;
|
||||||
char const* word_break_path = nullptr;
|
char const* word_break_path = nullptr;
|
||||||
|
|
||||||
Core::ArgsParser args_parser;
|
Core::ArgsParser args_parser;
|
||||||
|
@ -675,6 +734,7 @@ int main(int argc, char** argv)
|
||||||
args_parser.add_option(prop_list_path, "Path to PropList.txt file", "prop-list-path", 'p', "prop-list-path");
|
args_parser.add_option(prop_list_path, "Path to PropList.txt file", "prop-list-path", 'p', "prop-list-path");
|
||||||
args_parser.add_option(derived_core_prop_path, "Path to DerivedCoreProperties.txt file", "derived-core-prop-path", 'd', "derived-core-prop-path");
|
args_parser.add_option(derived_core_prop_path, "Path to DerivedCoreProperties.txt file", "derived-core-prop-path", 'd', "derived-core-prop-path");
|
||||||
args_parser.add_option(prop_alias_path, "Path to PropertyAliases.txt file", "prop-alias-path", 'a', "prop-alias-path");
|
args_parser.add_option(prop_alias_path, "Path to PropertyAliases.txt file", "prop-alias-path", 'a', "prop-alias-path");
|
||||||
|
args_parser.add_option(prop_value_alias_path, "Path to PropertyValueAliases.txt file", "prop-value-alias-path", 'v', "prop-value-alias-path");
|
||||||
args_parser.add_option(word_break_path, "Path to WordBreakProperty.txt file", "word-break-path", 'w', "word-break-path");
|
args_parser.add_option(word_break_path, "Path to WordBreakProperty.txt file", "word-break-path", 'w', "word-break-path");
|
||||||
args_parser.parse(argc, argv);
|
args_parser.parse(argc, argv);
|
||||||
|
|
||||||
|
@ -705,6 +765,7 @@ int main(int argc, char** argv)
|
||||||
auto prop_list_file = open_file(prop_list_path, "-p/--prop-list-path");
|
auto prop_list_file = open_file(prop_list_path, "-p/--prop-list-path");
|
||||||
auto derived_core_prop_file = open_file(derived_core_prop_path, "-d/--derived-core-prop-path");
|
auto derived_core_prop_file = open_file(derived_core_prop_path, "-d/--derived-core-prop-path");
|
||||||
auto prop_alias_file = open_file(prop_alias_path, "-a/--prop-alias-path");
|
auto prop_alias_file = open_file(prop_alias_path, "-a/--prop-alias-path");
|
||||||
|
auto prop_value_alias_file = open_file(prop_value_alias_path, "-v/--prop-value-alias-path");
|
||||||
auto word_break_file = open_file(word_break_path, "-w/--word-break-path");
|
auto word_break_file = open_file(word_break_path, "-w/--word-break-path");
|
||||||
|
|
||||||
UnicodeData unicode_data {};
|
UnicodeData unicode_data {};
|
||||||
|
@ -723,6 +784,7 @@ int main(int argc, char** argv)
|
||||||
unicode_data.prop_list.set("ASCII"sv, { { 0, 0, 0x7f } });
|
unicode_data.prop_list.set("ASCII"sv, { { 0, 0, 0x7f } });
|
||||||
|
|
||||||
parse_unicode_data(unicode_data_file, unicode_data);
|
parse_unicode_data(unicode_data_file, unicode_data);
|
||||||
|
parse_value_alias_list(prop_value_alias_file, "gc"sv, unicode_data.general_categories, unicode_data.general_category_unions, unicode_data.general_category_aliases);
|
||||||
|
|
||||||
if (generate_header)
|
if (generate_header)
|
||||||
generate_unicode_data_header(unicode_data);
|
generate_unicode_data_header(unicode_data);
|
||||||
|
|
|
@ -11,7 +11,7 @@
|
||||||
namespace Unicode {
|
namespace Unicode {
|
||||||
|
|
||||||
enum class Condition;
|
enum class Condition;
|
||||||
enum class GeneralCategory;
|
enum class GeneralCategory : u64;
|
||||||
enum class Locale;
|
enum class Locale;
|
||||||
enum class Property : u64;
|
enum class Property : u64;
|
||||||
enum class WordBreakProperty;
|
enum class WordBreakProperty;
|
||||||
|
|
|
@ -15,6 +15,9 @@ set(DERIVED_CORE_PROP_PATH ${CMAKE_BINARY_DIR}/UCD/DerivedCoreProperties.txt)
|
||||||
set(PROP_ALIAS_URL https://www.unicode.org/Public/13.0.0/ucd/PropertyAliases.txt)
|
set(PROP_ALIAS_URL https://www.unicode.org/Public/13.0.0/ucd/PropertyAliases.txt)
|
||||||
set(PROP_ALIAS_PATH ${CMAKE_BINARY_DIR}/UCD/PropertyAliases.txt)
|
set(PROP_ALIAS_PATH ${CMAKE_BINARY_DIR}/UCD/PropertyAliases.txt)
|
||||||
|
|
||||||
|
set(PROP_VALUE_ALIAS_URL https://www.unicode.org/Public/13.0.0/ucd/PropertyValueAliases.txt)
|
||||||
|
set(PROP_VALUE_ALIAS_PATH ${CMAKE_BINARY_DIR}/UCD/PropertyValueAliases.txt)
|
||||||
|
|
||||||
set(WORD_BREAK_URL https://www.unicode.org/Public/13.0.0/ucd/auxiliary/WordBreakProperty.txt)
|
set(WORD_BREAK_URL https://www.unicode.org/Public/13.0.0/ucd/auxiliary/WordBreakProperty.txt)
|
||||||
set(WORD_BREAK_PATH ${CMAKE_BINARY_DIR}/UCD/WordBreakProperty.txt)
|
set(WORD_BREAK_PATH ${CMAKE_BINARY_DIR}/UCD/WordBreakProperty.txt)
|
||||||
|
|
||||||
|
@ -39,6 +42,10 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD)
|
||||||
message(STATUS "Downloading UCD PropertyAliases.txt from ${PROP_ALIAS_URL}...")
|
message(STATUS "Downloading UCD PropertyAliases.txt from ${PROP_ALIAS_URL}...")
|
||||||
file(DOWNLOAD ${PROP_ALIAS_URL} ${PROP_ALIAS_PATH} INACTIVITY_TIMEOUT 10)
|
file(DOWNLOAD ${PROP_ALIAS_URL} ${PROP_ALIAS_PATH} INACTIVITY_TIMEOUT 10)
|
||||||
endif()
|
endif()
|
||||||
|
if (NOT EXISTS ${PROP_VALUE_ALIAS_PATH})
|
||||||
|
message(STATUS "Downloading UCD PropertyValueAliases.txt from ${PROP_VALUE_ALIAS_URL}...")
|
||||||
|
file(DOWNLOAD ${PROP_VALUE_ALIAS_URL} ${PROP_VALUE_ALIAS_PATH} INACTIVITY_TIMEOUT 10)
|
||||||
|
endif()
|
||||||
if (NOT EXISTS ${WORD_BREAK_PATH})
|
if (NOT EXISTS ${WORD_BREAK_PATH})
|
||||||
message(STATUS "Downloading UCD WordBreakProperty.txt from ${WORD_BREAK_URL}...")
|
message(STATUS "Downloading UCD WordBreakProperty.txt from ${WORD_BREAK_URL}...")
|
||||||
file(DOWNLOAD ${WORD_BREAK_URL} ${WORD_BREAK_PATH} INACTIVITY_TIMEOUT 10)
|
file(DOWNLOAD ${WORD_BREAK_URL} ${WORD_BREAK_PATH} INACTIVITY_TIMEOUT 10)
|
||||||
|
@ -54,7 +61,7 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD)
|
||||||
|
|
||||||
add_custom_command(
|
add_custom_command(
|
||||||
OUTPUT ${UNICODE_DATA_HEADER}
|
OUTPUT ${UNICODE_DATA_HEADER}
|
||||||
COMMAND ${write_if_different} ${UNICODE_DATA_HEADER} $<TARGET_FILE:GenerateUnicodeData> -h -u ${UNICODE_DATA_PATH} -s ${SPECIAL_CASING_PATH} -p ${PROP_LIST_PATH} -d ${DERIVED_CORE_PROP_PATH} -a ${PROP_ALIAS_PATH} -w ${WORD_BREAK_PATH}
|
COMMAND ${write_if_different} ${UNICODE_DATA_HEADER} $<TARGET_FILE:GenerateUnicodeData> -h -u ${UNICODE_DATA_PATH} -s ${SPECIAL_CASING_PATH} -p ${PROP_LIST_PATH} -d ${DERIVED_CORE_PROP_PATH} -a ${PROP_ALIAS_PATH} -v ${PROP_VALUE_ALIAS_PATH} -w ${WORD_BREAK_PATH}
|
||||||
VERBATIM
|
VERBATIM
|
||||||
DEPENDS GenerateUnicodeData
|
DEPENDS GenerateUnicodeData
|
||||||
MAIN_DEPENDENCY ${UNICODE_DATA_PATH} ${SPECIAL_CASING_PATH}
|
MAIN_DEPENDENCY ${UNICODE_DATA_PATH} ${SPECIAL_CASING_PATH}
|
||||||
|
@ -62,7 +69,7 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD)
|
||||||
|
|
||||||
add_custom_command(
|
add_custom_command(
|
||||||
OUTPUT ${UNICODE_DATA_IMPLEMENTATION}
|
OUTPUT ${UNICODE_DATA_IMPLEMENTATION}
|
||||||
COMMAND ${write_if_different} ${UNICODE_DATA_IMPLEMENTATION} $<TARGET_FILE:GenerateUnicodeData> -c -u ${UNICODE_DATA_PATH} -s ${SPECIAL_CASING_PATH} -p ${PROP_LIST_PATH} -d ${DERIVED_CORE_PROP_PATH} -a ${PROP_ALIAS_PATH} -w ${WORD_BREAK_PATH}
|
COMMAND ${write_if_different} ${UNICODE_DATA_IMPLEMENTATION} $<TARGET_FILE:GenerateUnicodeData> -c -u ${UNICODE_DATA_PATH} -s ${SPECIAL_CASING_PATH} -p ${PROP_LIST_PATH} -d ${DERIVED_CORE_PROP_PATH} -a ${PROP_ALIAS_PATH} -v ${PROP_VALUE_ALIAS_PATH} -w ${WORD_BREAK_PATH}
|
||||||
VERBATIM
|
VERBATIM
|
||||||
DEPENDS GenerateUnicodeData
|
DEPENDS GenerateUnicodeData
|
||||||
MAIN_DEPENDENCY ${UNICODE_DATA_PATH} ${SPECIAL_CASING_PATH}
|
MAIN_DEPENDENCY ${UNICODE_DATA_PATH} ${SPECIAL_CASING_PATH}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue