mirror of
				https://github.com/RGBCube/serenity
				synced 2025-10-31 20:32:44 +00:00 
			
		
		
		
	LibUnicode: Download and parse {Grapheme,Word,Sentence} break props
This commit is contained in:
		
							parent
							
								
									6efbafa6e0
								
							
						
					
					
						commit
						2d50c08f34
					
				
					 5 changed files with 49 additions and 1 deletion
				
			
		|  | @ -48,6 +48,15 @@ set(EMOJI_DATA_PATH "${UCD_PATH}/emoji-data.txt") | |||
| set(NORM_PROPS_URL "https://www.unicode.org/Public/${UCD_VERSION}/ucd/DerivedNormalizationProps.txt") | ||||
| set(NORM_PROPS_PATH "${UCD_PATH}/DerivedNormalizationProps.txt") | ||||
| 
 | ||||
| set(GRAPHEME_BREAK_PROP_URL "https://www.unicode.org/Public/${UCD_VERSION}/ucd/auxiliary/GraphemeBreakProperty.txt") | ||||
| set(GRAPHEME_BREAK_PROP_PATH "${UCD_PATH}/GraphemeBreakProperty.txt") | ||||
| 
 | ||||
| set(WORD_BREAK_PROP_URL "https://www.unicode.org/Public/${UCD_VERSION}/ucd/auxiliary/WordBreakProperty.txt") | ||||
| set(WORD_BREAK_PROP_PATH "${UCD_PATH}/WordBreakProperty.txt") | ||||
| 
 | ||||
| set(SENTENCE_BREAK_PROP_URL "https://www.unicode.org/Public/${UCD_VERSION}/ucd/auxiliary/SentenceBreakProperty.txt") | ||||
| set(SENTENCE_BREAK_PROP_PATH "${UCD_PATH}/SentenceBreakProperty.txt") | ||||
| 
 | ||||
| set(CLDR_ZIP_URL "https://github.com/unicode-org/cldr-json/releases/download/${CLDR_VERSION}/cldr-${CLDR_VERSION}-json-modern.zip") | ||||
| set(CLDR_ZIP_PATH "${CLDR_PATH}/cldr.zip") | ||||
| 
 | ||||
|  | @ -96,6 +105,9 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD) | |||
|     download_file("${SCRIPT_EXTENSIONS_URL}" "${SCRIPT_EXTENSIONS_PATH}") | ||||
|     download_file("${EMOJI_DATA_URL}" "${EMOJI_DATA_PATH}") | ||||
|     download_file("${NORM_PROPS_URL}" "${NORM_PROPS_PATH}") | ||||
|     download_file("${GRAPHEME_BREAK_PROP_URL}" "${GRAPHEME_BREAK_PROP_PATH}") | ||||
|     download_file("${WORD_BREAK_PROP_URL}" "${WORD_BREAK_PROP_PATH}") | ||||
|     download_file("${SENTENCE_BREAK_PROP_URL}" "${SENTENCE_BREAK_PROP_PATH}") | ||||
| 
 | ||||
|     download_file("${CLDR_ZIP_URL}" "${CLDR_ZIP_PATH}") | ||||
|     extract_cldr_file("${CLDR_CORE_SOURCE}" "${CLDR_CORE_PATH}") | ||||
|  | @ -148,7 +160,7 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD) | |||
|         "${UNICODE_META_TARGET_PREFIX}" | ||||
|         "${UNICODE_DATA_HEADER}" | ||||
|         "${UNICODE_DATA_IMPLEMENTATION}" | ||||
|         arguments -u "${UNICODE_DATA_PATH}" -s "${SPECIAL_CASING_PATH}" -g "${DERIVED_GENERAL_CATEGORY_PATH}" -p "${PROP_LIST_PATH}" -d "${DERIVED_CORE_PROP_PATH}" -b "${DERIVED_BINARY_PROP_PATH}" -a "${PROP_ALIAS_PATH}" -v "${PROP_VALUE_ALIAS_PATH}" -r "${SCRIPTS_PATH}" -x "${SCRIPT_EXTENSIONS_PATH}" -e "${EMOJI_DATA_PATH}" -m "${NAME_ALIAS_PATH}" -n "${NORM_PROPS_PATH}" | ||||
|         arguments -u "${UNICODE_DATA_PATH}" -s "${SPECIAL_CASING_PATH}" -g "${DERIVED_GENERAL_CATEGORY_PATH}" -p "${PROP_LIST_PATH}" -d "${DERIVED_CORE_PROP_PATH}" -b "${DERIVED_BINARY_PROP_PATH}" -a "${PROP_ALIAS_PATH}" -v "${PROP_VALUE_ALIAS_PATH}" -r "${SCRIPTS_PATH}" -x "${SCRIPT_EXTENSIONS_PATH}" -e "${EMOJI_DATA_PATH}" -m "${NAME_ALIAS_PATH}" -n "${NORM_PROPS_PATH}" -f "${GRAPHEME_BREAK_PROP_PATH}" -w "${WORD_BREAK_PROP_PATH}" -i "${SENTENCE_BREAK_PROP_PATH}" | ||||
|     ) | ||||
|     invoke_generator( | ||||
|         "UnicodeDateTimeFormat" | ||||
|  |  | |||
|  | @ -127,6 +127,10 @@ struct UnicodeData { | |||
| 
 | ||||
|     // FIXME: We are not yet doing anything with this data. It will be needed for String.prototype.normalize.
 | ||||
|     NormalizationProps normalization_props; | ||||
| 
 | ||||
|     PropList grapheme_break_props; | ||||
|     PropList word_break_props; | ||||
|     PropList sentence_break_props; | ||||
| }; | ||||
| 
 | ||||
| static Vector<u32> parse_code_point_list(StringView list) | ||||
|  | @ -591,6 +595,9 @@ namespace Unicode { | |||
|     generate_enum("GeneralCategory"sv, {}, unicode_data.general_categories.keys(), unicode_data.general_category_aliases); | ||||
|     generate_enum("Property"sv, {}, unicode_data.prop_list.keys(), unicode_data.prop_aliases); | ||||
|     generate_enum("Script"sv, {}, unicode_data.script_list.keys(), unicode_data.script_aliases); | ||||
|     generate_enum("GraphemeBreakProperty"sv, {}, unicode_data.grapheme_break_props.keys()); | ||||
|     generate_enum("WordBreakProperty"sv, {}, unicode_data.word_break_props.keys()); | ||||
|     generate_enum("SentenceBreakProperty"sv, {}, unicode_data.sentence_break_props.keys()); | ||||
| 
 | ||||
|     generator.append(R"~~~( | ||||
| struct SpecialCasing { | ||||
|  | @ -837,6 +844,9 @@ static constexpr Array<Span<CodePointRange const>, @size@> @name@ { {)~~~"); | |||
|     append_prop_list("s_properties"sv, "s_property_{}"sv, unicode_data.prop_list); | ||||
|     append_prop_list("s_scripts"sv, "s_script_{}"sv, unicode_data.script_list); | ||||
|     append_prop_list("s_script_extensions"sv, "s_script_extension_{}"sv, unicode_data.script_extensions); | ||||
|     append_prop_list("s_grapheme_break_properties"sv, "s_grapheme_break_property_{}"sv, unicode_data.grapheme_break_props); | ||||
|     append_prop_list("s_word_break_properties"sv, "s_word_break_property_{}"sv, unicode_data.word_break_props); | ||||
|     append_prop_list("s_sentence_break_properties"sv, "s_sentence_break_property_{}"sv, unicode_data.sentence_break_props); | ||||
| 
 | ||||
|     generator.append(R"~~~( | ||||
| struct CodePointName { | ||||
|  | @ -955,6 +965,10 @@ bool code_point_has_@enum_snake@(u32 code_point, @enum_title@ @enum_snake@) | |||
|     append_prop_search("Script"sv, "script_extension"sv, "s_script_extensions"sv); | ||||
|     append_from_string("Script"sv, "script"sv, unicode_data.script_list, unicode_data.script_aliases); | ||||
| 
 | ||||
|     append_prop_search("GraphemeBreakProperty"sv, "grapheme_break_property"sv, "s_grapheme_break_properties"sv); | ||||
|     append_prop_search("WordBreakProperty"sv, "word_break_property"sv, "s_word_break_properties"sv); | ||||
|     append_prop_search("SentenceBreakProperty"sv, "sentence_break_property"sv, "s_sentence_break_properties"sv); | ||||
| 
 | ||||
|     generator.append(R"~~~( | ||||
| } | ||||
| )~~~"); | ||||
|  | @ -1104,6 +1118,9 @@ ErrorOr<int> serenity_main(Main::Arguments arguments) | |||
|     StringView script_extensions_path; | ||||
|     StringView emoji_data_path; | ||||
|     StringView normalization_path; | ||||
|     StringView grapheme_break_path; | ||||
|     StringView word_break_path; | ||||
|     StringView sentence_break_path; | ||||
| 
 | ||||
|     Core::ArgsParser args_parser; | ||||
|     args_parser.add_option(generated_header_path, "Path to the Unicode Data header file to generate", "generated-header-path", 'h', "generated-header-path"); | ||||
|  | @ -1121,6 +1138,9 @@ ErrorOr<int> serenity_main(Main::Arguments arguments) | |||
|     args_parser.add_option(script_extensions_path, "Path to ScriptExtensions.txt file", "script-extensions-path", 'x', "script-extensions-path"); | ||||
|     args_parser.add_option(emoji_data_path, "Path to emoji-data.txt file", "emoji-data-path", 'e', "emoji-data-path"); | ||||
|     args_parser.add_option(normalization_path, "Path to DerivedNormalizationProps.txt file", "normalization-path", 'n', "normalization-path"); | ||||
|     args_parser.add_option(grapheme_break_path, "Path to GraphemeBreakProperty.txt file", "grapheme-break-path", 'f', "grapheme-break-path"); | ||||
|     args_parser.add_option(word_break_path, "Path to WordBreakProperty.txt file", "word-break-path", 'w', "word-break-path"); | ||||
|     args_parser.add_option(sentence_break_path, "Path to SentenceBreakProperty.txt file", "sentence-break-path", 'i', "sentence-break-path"); | ||||
|     args_parser.parse(arguments); | ||||
| 
 | ||||
|     auto open_file = [&](StringView path, Core::OpenMode mode = Core::OpenMode::ReadOnly) -> ErrorOr<NonnullRefPtr<Core::File>> { | ||||
|  | @ -1147,6 +1167,9 @@ ErrorOr<int> serenity_main(Main::Arguments arguments) | |||
|     auto script_extensions_file = TRY(open_file(script_extensions_path)); | ||||
|     auto emoji_data_file = TRY(open_file(emoji_data_path)); | ||||
|     auto normalization_file = TRY(open_file(normalization_path)); | ||||
|     auto grapheme_break_file = TRY(open_file(grapheme_break_path)); | ||||
|     auto word_break_file = TRY(open_file(word_break_path)); | ||||
|     auto sentence_break_file = TRY(open_file(sentence_break_path)); | ||||
| 
 | ||||
|     UnicodeData unicode_data {}; | ||||
|     parse_special_casing(special_casing_file, unicode_data); | ||||
|  | @ -1160,6 +1183,9 @@ ErrorOr<int> serenity_main(Main::Arguments arguments) | |||
|     parse_prop_list(scripts_file, unicode_data.script_list); | ||||
|     parse_prop_list(script_extensions_file, unicode_data.script_extensions, true); | ||||
|     parse_name_aliases(name_alias_file, unicode_data); | ||||
|     parse_prop_list(grapheme_break_file, unicode_data.grapheme_break_props); | ||||
|     parse_prop_list(word_break_file, unicode_data.word_break_props); | ||||
|     parse_prop_list(sentence_break_file, unicode_data.sentence_break_props); | ||||
| 
 | ||||
|     populate_general_category_unions(unicode_data.general_categories); | ||||
|     parse_unicode_data(unicode_data_file, unicode_data); | ||||
|  |  | |||
|  | @ -353,4 +353,8 @@ Optional<Script> __attribute__((weak)) script_from_string(StringView) { return { | |||
| bool __attribute__((weak)) code_point_has_script(u32, Script) { return {}; } | ||||
| bool __attribute__((weak)) code_point_has_script_extension(u32, Script) { return {}; } | ||||
| 
 | ||||
| bool __attribute__((weak)) code_point_has_grapheme_break_property(u32, GraphemeBreakProperty) { return {}; } | ||||
| bool __attribute__((weak)) code_point_has_word_break_property(u32, WordBreakProperty) { return {}; } | ||||
| bool __attribute__((weak)) code_point_has_sentence_break_property(u32, SentenceBreakProperty) { return {}; } | ||||
| 
 | ||||
| } | ||||
|  |  | |||
|  | @ -40,4 +40,8 @@ Optional<Script> script_from_string(StringView); | |||
| bool code_point_has_script(u32 code_point, Script script); | ||||
| bool code_point_has_script_extension(u32 code_point, Script script); | ||||
| 
 | ||||
| bool code_point_has_grapheme_break_property(u32 code_point, GraphemeBreakProperty property); | ||||
| bool code_point_has_word_break_property(u32 code_point, WordBreakProperty property); | ||||
| bool code_point_has_sentence_break_property(u32 code_point, SentenceBreakProperty property); | ||||
| 
 | ||||
| } | ||||
|  |  | |||
|  | @ -22,6 +22,7 @@ enum class DateField : u8; | |||
| enum class DayPeriod : u8; | ||||
| enum class Era : u8; | ||||
| enum class GeneralCategory : u8; | ||||
| enum class GraphemeBreakProperty : u8; | ||||
| enum class HourCycle : u8; | ||||
| enum class HourCycleRegion : u8; | ||||
| enum class Key : u8; | ||||
|  | @ -35,6 +36,7 @@ enum class NumericSymbol : u8; | |||
| enum class Property : u8; | ||||
| enum class Script : u8; | ||||
| enum class ScriptTag : u8; | ||||
| enum class SentenceBreakProperty : u8; | ||||
| enum class StandardNumberFormatType : u8; | ||||
| enum class Style : u8; | ||||
| enum class Territory : u8; | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Idan Horowitz
						Idan Horowitz