mirror of
https://github.com/RGBCube/serenity
synced 2025-07-25 23:07:35 +00:00
LibUnicode: Remove WordBreakProperty from generated Unicode data
This was originally used for the "is_final_code_point" algorithm in LibUnicode/CharacterTypes.cpp. However, it has since been superseded by DerivedCoreProperties and is now unused. Remove it, as processing the data is currently a waste of time, and it is trivial to add back if we need it again.
This commit is contained in:
parent
7f50805903
commit
4e546cee97
2 changed files with 2 additions and 22 deletions
|
@ -41,7 +41,6 @@ struct SpecialCasing {
|
||||||
|
|
||||||
// PropList source: https://www.unicode.org/Public/13.0.0/ucd/PropList.txt
|
// PropList source: https://www.unicode.org/Public/13.0.0/ucd/PropList.txt
|
||||||
// Property descriptions: https://www.unicode.org/reports/tr44/tr44-13.html#PropList.txt
|
// Property descriptions: https://www.unicode.org/reports/tr44/tr44-13.html#PropList.txt
|
||||||
// https://www.unicode.org/reports/tr44/tr44-13.html#WordBreakProperty.txt
|
|
||||||
using PropList = HashMap<String, Vector<CodePointRange>>;
|
using PropList = HashMap<String, Vector<CodePointRange>>;
|
||||||
|
|
||||||
// PropertyAliases source: https://www.unicode.org/Public/13.0.0/ucd/PropertyAliases.txt
|
// PropertyAliases source: https://www.unicode.org/Public/13.0.0/ucd/PropertyAliases.txt
|
||||||
|
@ -73,7 +72,6 @@ struct CodePointData {
|
||||||
Vector<StringView> prop_list;
|
Vector<StringView> prop_list;
|
||||||
StringView script;
|
StringView script;
|
||||||
Vector<StringView> script_extensions;
|
Vector<StringView> script_extensions;
|
||||||
StringView word_break_property;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
struct UnicodeData {
|
struct UnicodeData {
|
||||||
|
@ -119,8 +117,6 @@ struct UnicodeData {
|
||||||
Vector<Alias> script_aliases;
|
Vector<Alias> script_aliases;
|
||||||
PropList script_extensions;
|
PropList script_extensions;
|
||||||
u32 largest_script_extensions_size { 0 };
|
u32 largest_script_extensions_size { 0 };
|
||||||
|
|
||||||
PropList word_break_prop_list;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
static constexpr auto s_desired_fields = Array {
|
static constexpr auto s_desired_fields = Array {
|
||||||
|
@ -403,7 +399,6 @@ static void parse_unicode_data(Core::File& file, UnicodeData& unicode_data)
|
||||||
assign_code_point_property(data.code_point, unicode_data.prop_list, data.prop_list, "Assigned"sv);
|
assign_code_point_property(data.code_point, unicode_data.prop_list, data.prop_list, "Assigned"sv);
|
||||||
assign_code_point_property(data.code_point, unicode_data.script_list, data.script, "Unknown"sv);
|
assign_code_point_property(data.code_point, unicode_data.script_list, data.script, "Unknown"sv);
|
||||||
assign_code_point_property(data.code_point, unicode_data.script_extensions, data.script_extensions, {});
|
assign_code_point_property(data.code_point, unicode_data.script_extensions, data.script_extensions, {});
|
||||||
assign_code_point_property(data.code_point, unicode_data.word_break_prop_list, data.word_break_property, "Other"sv);
|
|
||||||
|
|
||||||
unicode_data.largest_special_casing_size = max(unicode_data.largest_special_casing_size, data.special_casing_indices.size());
|
unicode_data.largest_special_casing_size = max(unicode_data.largest_special_casing_size, data.special_casing_indices.size());
|
||||||
unicode_data.largest_script_extensions_size = max(unicode_data.largest_script_extensions_size, data.script_extensions.size());
|
unicode_data.largest_script_extensions_size = max(unicode_data.largest_script_extensions_size, data.script_extensions.size());
|
||||||
|
@ -509,7 +504,6 @@ namespace Unicode {
|
||||||
generate_enum("GeneralCategory"sv, "None"sv, unicode_data.general_categories, unicode_data.general_category_unions, unicode_data.general_category_aliases, true);
|
generate_enum("GeneralCategory"sv, "None"sv, unicode_data.general_categories, unicode_data.general_category_unions, unicode_data.general_category_aliases, true);
|
||||||
generate_enum("Property"sv, "Assigned"sv, unicode_data.prop_list.keys(), {}, unicode_data.prop_aliases, true);
|
generate_enum("Property"sv, "Assigned"sv, unicode_data.prop_list.keys(), {}, unicode_data.prop_aliases, true);
|
||||||
generate_enum("Script"sv, {}, unicode_data.script_list.keys(), {}, unicode_data.script_aliases);
|
generate_enum("Script"sv, {}, unicode_data.script_list.keys(), {}, unicode_data.script_aliases);
|
||||||
generate_enum("WordBreakProperty"sv, "Other"sv, unicode_data.word_break_prop_list.keys());
|
|
||||||
|
|
||||||
generator.append(R"~~~(
|
generator.append(R"~~~(
|
||||||
struct SpecialCasing {
|
struct SpecialCasing {
|
||||||
|
@ -567,8 +561,6 @@ struct UnicodeData {
|
||||||
Script script { Script::Unknown };
|
Script script { Script::Unknown };
|
||||||
Script script_extensions[@script_extensions_size@];
|
Script script_extensions[@script_extensions_size@];
|
||||||
u32 script_extensions_size { 0 };
|
u32 script_extensions_size { 0 };
|
||||||
|
|
||||||
WordBreakProperty word_break_property { WordBreakProperty::Other };
|
|
||||||
};
|
};
|
||||||
|
|
||||||
namespace Detail {
|
namespace Detail {
|
||||||
|
@ -685,7 +677,6 @@ static constexpr Array<UnicodeData, @code_point_data_size@> s_unicode_data { {)~
|
||||||
|
|
||||||
generator.append(String::formatted(", Script::{}", data.script));
|
generator.append(String::formatted(", Script::{}", data.script));
|
||||||
append_list_and_size(data.script_extensions, "Script::{}"sv);
|
append_list_and_size(data.script_extensions, "Script::{}"sv);
|
||||||
generator.append(String::formatted(", WordBreakProperty::{}", data.word_break_property));
|
|
||||||
generator.append(" },");
|
generator.append(" },");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -828,7 +819,6 @@ int main(int argc, char** argv)
|
||||||
char const* prop_value_alias_path = nullptr;
|
char const* prop_value_alias_path = nullptr;
|
||||||
char const* scripts_path = nullptr;
|
char const* scripts_path = nullptr;
|
||||||
char const* script_extensions_path = nullptr;
|
char const* script_extensions_path = nullptr;
|
||||||
char const* word_break_path = nullptr;
|
|
||||||
char const* emoji_data_path = nullptr;
|
char const* emoji_data_path = nullptr;
|
||||||
|
|
||||||
Core::ArgsParser args_parser;
|
Core::ArgsParser args_parser;
|
||||||
|
@ -843,7 +833,6 @@ int main(int argc, char** argv)
|
||||||
args_parser.add_option(prop_value_alias_path, "Path to PropertyValueAliases.txt file", "prop-value-alias-path", 'v', "prop-value-alias-path");
|
args_parser.add_option(prop_value_alias_path, "Path to PropertyValueAliases.txt file", "prop-value-alias-path", 'v', "prop-value-alias-path");
|
||||||
args_parser.add_option(scripts_path, "Path to Scripts.txt file", "scripts-path", 'r', "scripts-path");
|
args_parser.add_option(scripts_path, "Path to Scripts.txt file", "scripts-path", 'r', "scripts-path");
|
||||||
args_parser.add_option(script_extensions_path, "Path to ScriptExtensions.txt file", "script-extensions-path", 'x', "script-extensions-path");
|
args_parser.add_option(script_extensions_path, "Path to ScriptExtensions.txt file", "script-extensions-path", 'x', "script-extensions-path");
|
||||||
args_parser.add_option(word_break_path, "Path to WordBreakProperty.txt file", "word-break-path", 'w', "word-break-path");
|
|
||||||
args_parser.add_option(emoji_data_path, "Path to emoji-data.txt file", "emoji-data-path", 'e', "emoji-data-path");
|
args_parser.add_option(emoji_data_path, "Path to emoji-data.txt file", "emoji-data-path", 'e', "emoji-data-path");
|
||||||
args_parser.parse(argc, argv);
|
args_parser.parse(argc, argv);
|
||||||
|
|
||||||
|
@ -874,7 +863,6 @@ int main(int argc, char** argv)
|
||||||
auto prop_value_alias_file = open_file(prop_value_alias_path, "-v/--prop-value-alias-path");
|
auto prop_value_alias_file = open_file(prop_value_alias_path, "-v/--prop-value-alias-path");
|
||||||
auto scripts_file = open_file(scripts_path, "-r/--scripts-path");
|
auto scripts_file = open_file(scripts_path, "-r/--scripts-path");
|
||||||
auto script_extensions_file = open_file(script_extensions_path, "-x/--script-extensions-path");
|
auto script_extensions_file = open_file(script_extensions_path, "-x/--script-extensions-path");
|
||||||
auto word_break_file = open_file(word_break_path, "-w/--word-break-path");
|
|
||||||
auto emoji_data_file = open_file(emoji_data_path, "-e/--emoji-data-path");
|
auto emoji_data_file = open_file(emoji_data_path, "-e/--emoji-data-path");
|
||||||
|
|
||||||
UnicodeData unicode_data {};
|
UnicodeData unicode_data {};
|
||||||
|
@ -886,7 +874,6 @@ int main(int argc, char** argv)
|
||||||
parse_alias_list(prop_alias_file, unicode_data.prop_list, unicode_data.prop_aliases);
|
parse_alias_list(prop_alias_file, unicode_data.prop_list, unicode_data.prop_aliases);
|
||||||
parse_prop_list(scripts_file, unicode_data.script_list);
|
parse_prop_list(scripts_file, unicode_data.script_list);
|
||||||
parse_prop_list(script_extensions_file, unicode_data.script_extensions, true);
|
parse_prop_list(script_extensions_file, unicode_data.script_extensions, true);
|
||||||
parse_prop_list(word_break_file, unicode_data.word_break_prop_list);
|
|
||||||
|
|
||||||
parse_unicode_data(unicode_data_file, unicode_data);
|
parse_unicode_data(unicode_data_file, unicode_data);
|
||||||
parse_value_alias_list(prop_value_alias_file, "gc"sv, unicode_data.general_categories, unicode_data.general_category_unions, unicode_data.general_category_aliases);
|
parse_value_alias_list(prop_value_alias_file, "gc"sv, unicode_data.general_categories, unicode_data.general_category_unions, unicode_data.general_category_aliases);
|
||||||
|
|
|
@ -27,9 +27,6 @@ set(SCRIPTS_PATH ${CMAKE_BINARY_DIR}/UCD/Scripts.txt)
|
||||||
set(SCRIPT_EXTENSIONS_URL https://www.unicode.org/Public/13.0.0/ucd/ScriptExtensions.txt)
|
set(SCRIPT_EXTENSIONS_URL https://www.unicode.org/Public/13.0.0/ucd/ScriptExtensions.txt)
|
||||||
set(SCRIPT_EXTENSIONS_PATH ${CMAKE_BINARY_DIR}/UCD/ScriptExtensions.txt)
|
set(SCRIPT_EXTENSIONS_PATH ${CMAKE_BINARY_DIR}/UCD/ScriptExtensions.txt)
|
||||||
|
|
||||||
set(WORD_BREAK_URL https://www.unicode.org/Public/13.0.0/ucd/auxiliary/WordBreakProperty.txt)
|
|
||||||
set(WORD_BREAK_PATH ${CMAKE_BINARY_DIR}/UCD/WordBreakProperty.txt)
|
|
||||||
|
|
||||||
set(EMOJI_DATA_URL https://www.unicode.org/Public/13.0.0/ucd/emoji/emoji-data.txt)
|
set(EMOJI_DATA_URL https://www.unicode.org/Public/13.0.0/ucd/emoji/emoji-data.txt)
|
||||||
set(EMOJI_DATA_PATH ${CMAKE_BINARY_DIR}/UCD/emoji-data.txt)
|
set(EMOJI_DATA_PATH ${CMAKE_BINARY_DIR}/UCD/emoji-data.txt)
|
||||||
|
|
||||||
|
@ -70,10 +67,6 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD)
|
||||||
message(STATUS "Downloading UCD ScriptExtensions.txt from ${SCRIPT_EXTENSIONS_URL}...")
|
message(STATUS "Downloading UCD ScriptExtensions.txt from ${SCRIPT_EXTENSIONS_URL}...")
|
||||||
file(DOWNLOAD ${SCRIPT_EXTENSIONS_URL} ${SCRIPT_EXTENSIONS_PATH} INACTIVITY_TIMEOUT 10)
|
file(DOWNLOAD ${SCRIPT_EXTENSIONS_URL} ${SCRIPT_EXTENSIONS_PATH} INACTIVITY_TIMEOUT 10)
|
||||||
endif()
|
endif()
|
||||||
if (NOT EXISTS ${WORD_BREAK_PATH})
|
|
||||||
message(STATUS "Downloading UCD WordBreakProperty.txt from ${WORD_BREAK_URL}...")
|
|
||||||
file(DOWNLOAD ${WORD_BREAK_URL} ${WORD_BREAK_PATH} INACTIVITY_TIMEOUT 10)
|
|
||||||
endif()
|
|
||||||
if (NOT EXISTS ${EMOJI_DATA_PATH})
|
if (NOT EXISTS ${EMOJI_DATA_PATH})
|
||||||
message(STATUS "Downloading UCD emoji-data.txt from ${EMOJI_DATA_URL}...")
|
message(STATUS "Downloading UCD emoji-data.txt from ${EMOJI_DATA_URL}...")
|
||||||
file(DOWNLOAD ${EMOJI_DATA_URL} ${EMOJI_DATA_PATH} INACTIVITY_TIMEOUT 10)
|
file(DOWNLOAD ${EMOJI_DATA_URL} ${EMOJI_DATA_PATH} INACTIVITY_TIMEOUT 10)
|
||||||
|
@ -89,9 +82,9 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD)
|
||||||
|
|
||||||
add_custom_command(
|
add_custom_command(
|
||||||
OUTPUT ${UNICODE_DATA_HEADER} ${UNICODE_DATA_IMPLEMENTATION}
|
OUTPUT ${UNICODE_DATA_HEADER} ${UNICODE_DATA_IMPLEMENTATION}
|
||||||
COMMAND $<TARGET_FILE:GenerateUnicodeData> -h ${UNICODE_DATA_HEADER} -c ${UNICODE_DATA_IMPLEMENTATION} -u ${UNICODE_DATA_PATH} -s ${SPECIAL_CASING_PATH} -p ${PROP_LIST_PATH} -d ${DERIVED_CORE_PROP_PATH} -b ${DERIVED_BINARY_PROP_PATH} -a ${PROP_ALIAS_PATH} -v ${PROP_VALUE_ALIAS_PATH} -r ${SCRIPTS_PATH} -x ${SCRIPT_EXTENSIONS_PATH} -w ${WORD_BREAK_PATH} -e ${EMOJI_DATA_PATH}
|
COMMAND $<TARGET_FILE:GenerateUnicodeData> -h ${UNICODE_DATA_HEADER} -c ${UNICODE_DATA_IMPLEMENTATION} -u ${UNICODE_DATA_PATH} -s ${SPECIAL_CASING_PATH} -p ${PROP_LIST_PATH} -d ${DERIVED_CORE_PROP_PATH} -b ${DERIVED_BINARY_PROP_PATH} -a ${PROP_ALIAS_PATH} -v ${PROP_VALUE_ALIAS_PATH} -r ${SCRIPTS_PATH} -x ${SCRIPT_EXTENSIONS_PATH} -e ${EMOJI_DATA_PATH}
|
||||||
VERBATIM
|
VERBATIM
|
||||||
DEPENDS GenerateUnicodeData ${UNICODE_DATA_PATH} ${SPECIAL_CASING_PATH} ${PROP_LIST_PATH} ${DERIVED_CORE_PROP_PATH} ${DERIVED_BINARY_PROP_PATH} ${PROP_ALIAS_PATH} ${PROP_VALUE_ALIAS_PATH} ${SCRIPTS_PATH} ${SCRIPT_EXTENSIONS_PATH} ${WORD_BREAK_PATH} ${EMOJI_DATA_PATH}
|
DEPENDS GenerateUnicodeData ${UNICODE_DATA_PATH} ${SPECIAL_CASING_PATH} ${PROP_LIST_PATH} ${DERIVED_CORE_PROP_PATH} ${DERIVED_BINARY_PROP_PATH} ${PROP_ALIAS_PATH} ${PROP_VALUE_ALIAS_PATH} ${SCRIPTS_PATH} ${SCRIPT_EXTENSIONS_PATH} ${EMOJI_DATA_PATH}
|
||||||
)
|
)
|
||||||
|
|
||||||
set(UNICODE_DATA_SOURCES ${UNICODE_DATA_HEADER} ${UNICODE_DATA_IMPLEMENTATION})
|
set(UNICODE_DATA_SOURCES ${UNICODE_DATA_HEADER} ${UNICODE_DATA_IMPLEMENTATION})
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue