mirror of
https://github.com/RGBCube/serenity
synced 2025-07-25 16:37:35 +00:00
LibUnicode: Parse UCD ScriptExtensions.txt and generate property
This commit is contained in:
parent
6bdb19fe21
commit
5edd458420
4 changed files with 66 additions and 15 deletions
|
@ -340,4 +340,25 @@ bool code_point_has_script([[maybe_unused]] u32 code_point, [[maybe_unused]] Scr
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool code_point_has_script_extension([[maybe_unused]] u32 code_point, [[maybe_unused]] Script script)
|
||||||
|
{
|
||||||
|
#if ENABLE_UNICODE_DATA
|
||||||
|
auto unicode_data = Detail::unicode_data_for_code_point(code_point);
|
||||||
|
if (!unicode_data.has_value())
|
||||||
|
return false;
|
||||||
|
|
||||||
|
if (unicode_data->script == script)
|
||||||
|
return true;
|
||||||
|
|
||||||
|
for (u32 i = 0; i < unicode_data->script_extensions_size; ++i) {
|
||||||
|
if (unicode_data->script_extensions[i] == script)
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
#else
|
||||||
|
return false;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -30,5 +30,6 @@ bool is_ecma262_property(Property);
|
||||||
|
|
||||||
Optional<Script> script_from_string(StringView const&);
|
Optional<Script> script_from_string(StringView const&);
|
||||||
bool code_point_has_script(u32 code_point, Script script);
|
bool code_point_has_script(u32 code_point, Script script);
|
||||||
|
bool code_point_has_script_extension(u32 code_point, Script script);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -72,6 +72,7 @@ struct CodePointData {
|
||||||
Vector<u32> special_casing_indices;
|
Vector<u32> special_casing_indices;
|
||||||
Vector<StringView> prop_list;
|
Vector<StringView> prop_list;
|
||||||
StringView script;
|
StringView script;
|
||||||
|
Vector<StringView> script_extensions;
|
||||||
StringView word_break_property;
|
StringView word_break_property;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -116,6 +117,8 @@ struct UnicodeData {
|
||||||
{ "Unknown"sv, {} },
|
{ "Unknown"sv, {} },
|
||||||
};
|
};
|
||||||
Vector<Alias> script_aliases;
|
Vector<Alias> script_aliases;
|
||||||
|
PropList script_extensions;
|
||||||
|
u32 largest_script_extensions_size { 0 };
|
||||||
|
|
||||||
PropList word_break_prop_list;
|
PropList word_break_prop_list;
|
||||||
};
|
};
|
||||||
|
@ -198,7 +201,7 @@ static void parse_special_casing(Core::File& file, UnicodeData& unicode_data)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void parse_prop_list(Core::File& file, PropList& prop_list)
|
static void parse_prop_list(Core::File& file, PropList& prop_list, bool multi_value_property = false)
|
||||||
{
|
{
|
||||||
while (file.can_read_line()) {
|
while (file.can_read_line()) {
|
||||||
auto line = file.read_line();
|
auto line = file.read_line();
|
||||||
|
@ -212,9 +215,15 @@ static void parse_prop_list(Core::File& file, PropList& prop_list)
|
||||||
VERIFY(segments.size() == 2);
|
VERIFY(segments.size() == 2);
|
||||||
|
|
||||||
auto code_point_range = segments[0].trim_whitespace();
|
auto code_point_range = segments[0].trim_whitespace();
|
||||||
auto property = segments[1].trim_whitespace();
|
Vector<StringView> properties;
|
||||||
|
|
||||||
auto& code_points = prop_list.ensure(property);
|
if (multi_value_property)
|
||||||
|
properties = segments[1].trim_whitespace().split_view(' ');
|
||||||
|
else
|
||||||
|
properties = { segments[1].trim_whitespace() };
|
||||||
|
|
||||||
|
for (auto const& property : properties) {
|
||||||
|
auto& code_points = prop_list.ensure(property.trim_whitespace());
|
||||||
|
|
||||||
if (code_point_range.contains(".."sv)) {
|
if (code_point_range.contains(".."sv)) {
|
||||||
segments = code_point_range.split_view(".."sv);
|
segments = code_point_range.split_view(".."sv);
|
||||||
|
@ -228,6 +237,7 @@ static void parse_prop_list(Core::File& file, PropList& prop_list)
|
||||||
code_points.append({ code_point, code_point });
|
code_points.append({ code_point, code_point });
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void parse_alias_list(Core::File& file, PropList const& prop_list, Vector<Alias>& prop_aliases)
|
static void parse_alias_list(Core::File& file, PropList const& prop_list, Vector<Alias>& prop_aliases)
|
||||||
|
@ -342,7 +352,7 @@ static void parse_unicode_data(Core::File& file, UnicodeData& unicode_data)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (property.is_empty())
|
if (property.is_empty() && !default_.is_empty())
|
||||||
assign_property(default_);
|
assign_property(default_);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -392,9 +402,11 @@ static void parse_unicode_data(Core::File& file, UnicodeData& unicode_data)
|
||||||
|
|
||||||
assign_code_point_property(data.code_point, unicode_data.prop_list, data.prop_list, "Assigned"sv);
|
assign_code_point_property(data.code_point, unicode_data.prop_list, data.prop_list, "Assigned"sv);
|
||||||
assign_code_point_property(data.code_point, unicode_data.script_list, data.script, "Unknown"sv);
|
assign_code_point_property(data.code_point, unicode_data.script_list, data.script, "Unknown"sv);
|
||||||
|
assign_code_point_property(data.code_point, unicode_data.script_extensions, data.script_extensions, {});
|
||||||
assign_code_point_property(data.code_point, unicode_data.word_break_prop_list, data.word_break_property, "Other"sv);
|
assign_code_point_property(data.code_point, unicode_data.word_break_prop_list, data.word_break_property, "Other"sv);
|
||||||
|
|
||||||
unicode_data.largest_special_casing_size = max(unicode_data.largest_special_casing_size, data.special_casing_indices.size());
|
unicode_data.largest_special_casing_size = max(unicode_data.largest_special_casing_size, data.special_casing_indices.size());
|
||||||
|
unicode_data.largest_script_extensions_size = max(unicode_data.largest_script_extensions_size, data.script_extensions.size());
|
||||||
|
|
||||||
if (!unicode_data.general_categories.contains_slow(data.general_category))
|
if (!unicode_data.general_categories.contains_slow(data.general_category))
|
||||||
unicode_data.general_categories.append(data.general_category);
|
unicode_data.general_categories.append(data.general_category);
|
||||||
|
@ -409,6 +421,7 @@ static void generate_unicode_data_header(Core::File& file, UnicodeData& unicode_
|
||||||
SourceGenerator generator { builder };
|
SourceGenerator generator { builder };
|
||||||
generator.set("casing_transform_size", String::number(unicode_data.largest_casing_transform_size));
|
generator.set("casing_transform_size", String::number(unicode_data.largest_casing_transform_size));
|
||||||
generator.set("special_casing_size", String::number(unicode_data.largest_special_casing_size));
|
generator.set("special_casing_size", String::number(unicode_data.largest_special_casing_size));
|
||||||
|
generator.set("script_extensions_size", String::number(unicode_data.largest_script_extensions_size));
|
||||||
|
|
||||||
auto generate_enum = [&](StringView name, StringView default_, Vector<String> values, Vector<Alias> unions = {}, Vector<Alias> aliases = {}, bool as_bitmask = false) {
|
auto generate_enum = [&](StringView name, StringView default_, Vector<String> values, Vector<Alias> unions = {}, Vector<Alias> aliases = {}, bool as_bitmask = false) {
|
||||||
VERIFY(!as_bitmask || (values.size() <= 64));
|
VERIFY(!as_bitmask || (values.size() <= 64));
|
||||||
|
@ -550,7 +563,11 @@ struct UnicodeData {
|
||||||
u32 special_casing_size { 0 };
|
u32 special_casing_size { 0 };
|
||||||
|
|
||||||
Property properties { Property::Assigned };
|
Property properties { Property::Assigned };
|
||||||
|
|
||||||
Script script { Script::Unknown };
|
Script script { Script::Unknown };
|
||||||
|
Script script_extensions[@script_extensions_size@];
|
||||||
|
u32 script_extensions_size { 0 };
|
||||||
|
|
||||||
WordBreakProperty word_break_property { WordBreakProperty::Other };
|
WordBreakProperty word_break_property { WordBreakProperty::Other };
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -667,6 +684,7 @@ static constexpr Array<UnicodeData, @code_point_data_size@> s_unicode_data { {)~
|
||||||
}
|
}
|
||||||
|
|
||||||
generator.append(String::formatted(", Script::{}", data.script));
|
generator.append(String::formatted(", Script::{}", data.script));
|
||||||
|
append_list_and_size(data.script_extensions, "Script::{}"sv);
|
||||||
generator.append(String::formatted(", WordBreakProperty::{}", data.word_break_property));
|
generator.append(String::formatted(", WordBreakProperty::{}", data.word_break_property));
|
||||||
generator.append(" },");
|
generator.append(" },");
|
||||||
}
|
}
|
||||||
|
@ -808,6 +826,7 @@ int main(int argc, char** argv)
|
||||||
char const* prop_alias_path = nullptr;
|
char const* prop_alias_path = nullptr;
|
||||||
char const* prop_value_alias_path = nullptr;
|
char const* prop_value_alias_path = nullptr;
|
||||||
char const* scripts_path = nullptr;
|
char const* scripts_path = nullptr;
|
||||||
|
char const* script_extensions_path = nullptr;
|
||||||
char const* word_break_path = nullptr;
|
char const* word_break_path = nullptr;
|
||||||
|
|
||||||
Core::ArgsParser args_parser;
|
Core::ArgsParser args_parser;
|
||||||
|
@ -820,6 +839,7 @@ int main(int argc, char** argv)
|
||||||
args_parser.add_option(prop_alias_path, "Path to PropertyAliases.txt file", "prop-alias-path", 'a', "prop-alias-path");
|
args_parser.add_option(prop_alias_path, "Path to PropertyAliases.txt file", "prop-alias-path", 'a', "prop-alias-path");
|
||||||
args_parser.add_option(prop_value_alias_path, "Path to PropertyValueAliases.txt file", "prop-value-alias-path", 'v', "prop-value-alias-path");
|
args_parser.add_option(prop_value_alias_path, "Path to PropertyValueAliases.txt file", "prop-value-alias-path", 'v', "prop-value-alias-path");
|
||||||
args_parser.add_option(scripts_path, "Path to Scripts.txt file", "scripts-path", 'r', "scripts-path");
|
args_parser.add_option(scripts_path, "Path to Scripts.txt file", "scripts-path", 'r', "scripts-path");
|
||||||
|
args_parser.add_option(script_extensions_path, "Path to ScriptExtensions.txt file", "script-extensions-path", 'x', "script-extensions-path");
|
||||||
args_parser.add_option(word_break_path, "Path to WordBreakProperty.txt file", "word-break-path", 'w', "word-break-path");
|
args_parser.add_option(word_break_path, "Path to WordBreakProperty.txt file", "word-break-path", 'w', "word-break-path");
|
||||||
args_parser.parse(argc, argv);
|
args_parser.parse(argc, argv);
|
||||||
|
|
||||||
|
@ -848,6 +868,7 @@ int main(int argc, char** argv)
|
||||||
auto prop_alias_file = open_file(prop_alias_path, "-a/--prop-alias-path");
|
auto prop_alias_file = open_file(prop_alias_path, "-a/--prop-alias-path");
|
||||||
auto prop_value_alias_file = open_file(prop_value_alias_path, "-v/--prop-value-alias-path");
|
auto prop_value_alias_file = open_file(prop_value_alias_path, "-v/--prop-value-alias-path");
|
||||||
auto scripts_file = open_file(scripts_path, "-r/--scripts-path");
|
auto scripts_file = open_file(scripts_path, "-r/--scripts-path");
|
||||||
|
auto script_extensions_file = open_file(script_extensions_path, "-x/--script-extensions-path");
|
||||||
auto word_break_file = open_file(word_break_path, "-w/--word-break-path");
|
auto word_break_file = open_file(word_break_path, "-w/--word-break-path");
|
||||||
|
|
||||||
UnicodeData unicode_data {};
|
UnicodeData unicode_data {};
|
||||||
|
@ -856,6 +877,7 @@ int main(int argc, char** argv)
|
||||||
parse_prop_list(derived_core_prop_file, unicode_data.prop_list);
|
parse_prop_list(derived_core_prop_file, unicode_data.prop_list);
|
||||||
parse_alias_list(prop_alias_file, unicode_data.prop_list, unicode_data.prop_aliases);
|
parse_alias_list(prop_alias_file, unicode_data.prop_list, unicode_data.prop_aliases);
|
||||||
parse_prop_list(scripts_file, unicode_data.script_list);
|
parse_prop_list(scripts_file, unicode_data.script_list);
|
||||||
|
parse_prop_list(script_extensions_file, unicode_data.script_extensions, true);
|
||||||
parse_prop_list(word_break_file, unicode_data.word_break_prop_list);
|
parse_prop_list(word_break_file, unicode_data.word_break_prop_list);
|
||||||
|
|
||||||
parse_unicode_data(unicode_data_file, unicode_data);
|
parse_unicode_data(unicode_data_file, unicode_data);
|
||||||
|
|
|
@ -21,6 +21,9 @@ set(PROP_VALUE_ALIAS_PATH ${CMAKE_BINARY_DIR}/UCD/PropertyValueAliases.txt)
|
||||||
set(SCRIPTS_URL https://www.unicode.org/Public/13.0.0/ucd/Scripts.txt)
|
set(SCRIPTS_URL https://www.unicode.org/Public/13.0.0/ucd/Scripts.txt)
|
||||||
set(SCRIPTS_PATH ${CMAKE_BINARY_DIR}/UCD/Scripts.txt)
|
set(SCRIPTS_PATH ${CMAKE_BINARY_DIR}/UCD/Scripts.txt)
|
||||||
|
|
||||||
|
set(SCRIPT_EXTENSIONS_URL https://www.unicode.org/Public/13.0.0/ucd/ScriptExtensions.txt)
|
||||||
|
set(SCRIPT_EXTENSIONS_PATH ${CMAKE_BINARY_DIR}/UCD/ScriptExtensions.txt)
|
||||||
|
|
||||||
set(WORD_BREAK_URL https://www.unicode.org/Public/13.0.0/ucd/auxiliary/WordBreakProperty.txt)
|
set(WORD_BREAK_URL https://www.unicode.org/Public/13.0.0/ucd/auxiliary/WordBreakProperty.txt)
|
||||||
set(WORD_BREAK_PATH ${CMAKE_BINARY_DIR}/UCD/WordBreakProperty.txt)
|
set(WORD_BREAK_PATH ${CMAKE_BINARY_DIR}/UCD/WordBreakProperty.txt)
|
||||||
|
|
||||||
|
@ -53,6 +56,10 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD)
|
||||||
message(STATUS "Downloading UCD Scripts.txt from ${SCRIPTS_URL}...")
|
message(STATUS "Downloading UCD Scripts.txt from ${SCRIPTS_URL}...")
|
||||||
file(DOWNLOAD ${SCRIPTS_URL} ${SCRIPTS_PATH} INACTIVITY_TIMEOUT 10)
|
file(DOWNLOAD ${SCRIPTS_URL} ${SCRIPTS_PATH} INACTIVITY_TIMEOUT 10)
|
||||||
endif()
|
endif()
|
||||||
|
if (NOT EXISTS ${SCRIPT_EXTENSIONS_PATH})
|
||||||
|
message(STATUS "Downloading UCD ScriptExtensions.txt from ${SCRIPT_EXTENSIONS_URL}...")
|
||||||
|
file(DOWNLOAD ${SCRIPT_EXTENSIONS_URL} ${SCRIPT_EXTENSIONS_PATH} INACTIVITY_TIMEOUT 10)
|
||||||
|
endif()
|
||||||
if (NOT EXISTS ${WORD_BREAK_PATH})
|
if (NOT EXISTS ${WORD_BREAK_PATH})
|
||||||
message(STATUS "Downloading UCD WordBreakProperty.txt from ${WORD_BREAK_URL}...")
|
message(STATUS "Downloading UCD WordBreakProperty.txt from ${WORD_BREAK_URL}...")
|
||||||
file(DOWNLOAD ${WORD_BREAK_URL} ${WORD_BREAK_PATH} INACTIVITY_TIMEOUT 10)
|
file(DOWNLOAD ${WORD_BREAK_URL} ${WORD_BREAK_PATH} INACTIVITY_TIMEOUT 10)
|
||||||
|
@ -68,9 +75,9 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD)
|
||||||
|
|
||||||
add_custom_command(
|
add_custom_command(
|
||||||
OUTPUT ${UNICODE_DATA_HEADER} ${UNICODE_DATA_IMPLEMENTATION}
|
OUTPUT ${UNICODE_DATA_HEADER} ${UNICODE_DATA_IMPLEMENTATION}
|
||||||
COMMAND $<TARGET_FILE:GenerateUnicodeData> -h ${UNICODE_DATA_HEADER} -c ${UNICODE_DATA_IMPLEMENTATION} -u ${UNICODE_DATA_PATH} -s ${SPECIAL_CASING_PATH} -p ${PROP_LIST_PATH} -d ${DERIVED_CORE_PROP_PATH} -a ${PROP_ALIAS_PATH} -v ${PROP_VALUE_ALIAS_PATH} -r ${SCRIPTS_PATH} -w ${WORD_BREAK_PATH}
|
COMMAND $<TARGET_FILE:GenerateUnicodeData> -h ${UNICODE_DATA_HEADER} -c ${UNICODE_DATA_IMPLEMENTATION} -u ${UNICODE_DATA_PATH} -s ${SPECIAL_CASING_PATH} -p ${PROP_LIST_PATH} -d ${DERIVED_CORE_PROP_PATH} -a ${PROP_ALIAS_PATH} -v ${PROP_VALUE_ALIAS_PATH} -r ${SCRIPTS_PATH} -x ${SCRIPT_EXTENSIONS_PATH} -w ${WORD_BREAK_PATH}
|
||||||
VERBATIM
|
VERBATIM
|
||||||
DEPENDS GenerateUnicodeData ${UNICODE_DATA_PATH} ${SPECIAL_CASING_PATH} ${PROP_LIST_PATH} ${DERIVED_CORE_PROP_PATH} ${PROP_ALIAS_PATH} ${PROP_VALUE_ALIAS_PATH} ${SCRIPTS_PATH} ${WORD_BREAK_PATH}
|
DEPENDS GenerateUnicodeData ${UNICODE_DATA_PATH} ${SPECIAL_CASING_PATH} ${PROP_LIST_PATH} ${DERIVED_CORE_PROP_PATH} ${PROP_ALIAS_PATH} ${PROP_VALUE_ALIAS_PATH} ${SCRIPTS_PATH} ${SCRIPT_EXTENSIONS_PATH} ${WORD_BREAK_PATH}
|
||||||
)
|
)
|
||||||
|
|
||||||
set(UNICODE_DATA_SOURCES ${UNICODE_DATA_HEADER} ${UNICODE_DATA_IMPLEMENTATION})
|
set(UNICODE_DATA_SOURCES ${UNICODE_DATA_HEADER} ${UNICODE_DATA_IMPLEMENTATION})
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue