diff --git a/Userland/Libraries/LibUnicode/CharacterTypes.cpp b/Userland/Libraries/LibUnicode/CharacterTypes.cpp index efe19ed61d..0cdf901721 100644 --- a/Userland/Libraries/LibUnicode/CharacterTypes.cpp +++ b/Userland/Libraries/LibUnicode/CharacterTypes.cpp @@ -28,51 +28,6 @@ static bool has_property(UnicodeData const& unicode_data, Property property) return (unicode_data.properties & property) == property; } -static bool is_cased_letter(UnicodeData const& unicode_data) -{ - // A character C is defined to be cased if and only if C has the Lowercase or Uppercase property - // or has a General_Category value of Titlecase_Letter. - switch (unicode_data.general_category) { - case GeneralCategory::Ll: - case GeneralCategory::Lu: - case GeneralCategory::Lt: - return true; - default: - break; - } - - return has_property(unicode_data, Property::OtherLowercase) || has_property(unicode_data, Property::OtherUppercase); -} - -static bool is_case_ignorable(UnicodeData const& unicode_data) -{ - // A character C is defined to be case-ignorable if C has the value MidLetter (ML), - // MidNumLet (MB), or Single_Quote (SQ) for the Word_Break property or its General_Category is - // one of Nonspacing_Mark (Mn), Enclosing_Mark (Me), Format (Cf), Modifier_Letter (Lm), or - // Modifier_Symbol (Sk). - switch (unicode_data.general_category) { - case GeneralCategory::Mn: - case GeneralCategory::Me: - case GeneralCategory::Cf: - case GeneralCategory::Lm: - case GeneralCategory::Sk: - return true; - default: - break; - } - - switch (unicode_data.word_break_property) { - case WordBreakProperty::MidLetter: - case WordBreakProperty::MidNumLet: - case WordBreakProperty::SingleQuote: - return true; - default: - break; - } - - return false; -} - static bool is_final_code_point(Utf8View const& string, size_t index, size_t byte_length) { // C is preceded by a sequence consisting of a cased letter and then zero or more case-ignorable @@ -90,9 +45,12 @@ static bool is_final_code_point(Utf8View const& string, size_t index, size_t byt if (!unicode_data.has_value()) return false; - if (is_cased_letter(*unicode_data) && !is_case_ignorable(*unicode_data)) + bool is_cased = has_property(*unicode_data, Property::Cased); + bool is_case_ignorable = has_property(*unicode_data, Property::Case_Ignorable); + + if (is_cased && !is_case_ignorable) ++cased_letter_count; - else if (!is_case_ignorable(*unicode_data)) + else if (!is_case_ignorable) cased_letter_count = 0; } @@ -104,9 +62,12 @@ static bool is_final_code_point(Utf8View const& string, size_t index, size_t byt if (!unicode_data.has_value()) return false; - if (is_case_ignorable(*unicode_data)) + bool is_cased = has_property(*unicode_data, Property::Cased); + bool is_case_ignorable = has_property(*unicode_data, Property::Case_Ignorable); + + if (is_case_ignorable) continue; - if (is_cased_letter(*unicode_data)) + if (is_cased) return false; break; diff --git a/Userland/Libraries/LibUnicode/CodeGenerators/GenerateUnicodeData.cpp b/Userland/Libraries/LibUnicode/CodeGenerators/GenerateUnicodeData.cpp index 66acb2af51..64652d6aac 100644 --- a/Userland/Libraries/LibUnicode/CodeGenerators/GenerateUnicodeData.cpp +++ b/Userland/Libraries/LibUnicode/CodeGenerators/GenerateUnicodeData.cpp @@ -573,6 +573,7 @@ int main(int argc, char** argv) char const* unicode_data_path = nullptr; char const* special_casing_path = nullptr; char const* prop_list_path = nullptr; + char const* derived_core_prop_path = nullptr; char const* word_break_path = nullptr; Core::ArgsParser args_parser; @@ -581,6 +582,7 @@ int main(int argc, char** argv) args_parser.add_option(unicode_data_path, "Path to UnicodeData.txt file", "unicode-data-path", 'u', "unicode-data-path"); args_parser.add_option(special_casing_path, "Path to SpecialCasing.txt file", "special-casing-path", 's', "special-casing-path"); args_parser.add_option(prop_list_path, "Path to PropList.txt file", "prop-list-path", 'p', "prop-list-path"); + args_parser.add_option(derived_core_prop_path, "Path to DerivedCoreProperties.txt file", "derived-core-prop-path", 'd', "derived-core-prop-path"); args_parser.add_option(word_break_path, "Path to WordBreakProperty.txt file", "word-break-path", 'w', "word-break-path"); args_parser.parse(argc, argv); @@ -609,11 +611,13 @@ int main(int argc, char** argv) auto unicode_data_file = open_file(unicode_data_path, "-u/--unicode-data-path"); auto special_casing_file = open_file(special_casing_path, "-s/--special-casing-path"); auto prop_list_file = open_file(prop_list_path, "-p/--prop-list-path"); + auto derived_core_prop_file = open_file(derived_core_prop_path, "-d/--derived-core-prop-path"); auto word_break_file = open_file(word_break_path, "-w/--word-break-path"); UnicodeData unicode_data {}; parse_special_casing(special_casing_file, unicode_data); parse_prop_list(prop_list_file, unicode_data.prop_list); + parse_prop_list(derived_core_prop_file, unicode_data.prop_list); parse_prop_list(word_break_file, unicode_data.word_break_prop_list); parse_unicode_data(unicode_data_file, unicode_data); diff --git a/Userland/Libraries/LibUnicode/unicode_data.cmake b/Userland/Libraries/LibUnicode/unicode_data.cmake index 6650d0dec5..777b81680e 100644 --- a/Userland/Libraries/LibUnicode/unicode_data.cmake +++ b/Userland/Libraries/LibUnicode/unicode_data.cmake @@ -9,6 +9,9 @@ set(SPECIAL_CASING_PATH ${CMAKE_BINARY_DIR}/UCD/SpecialCasing.txt) set(PROP_LIST_URL https://www.unicode.org/Public/13.0.0/ucd/PropList.txt) set(PROP_LIST_PATH ${CMAKE_BINARY_DIR}/UCD/PropList.txt) +set(DERIVED_CORE_PROP_URL https://www.unicode.org/Public/13.0.0/ucd/DerivedCoreProperties.txt) +set(DERIVED_CORE_PROP_PATH ${CMAKE_BINARY_DIR}/UCD/DerivedCoreProperties.txt) + set(WORD_BREAK_URL https://www.unicode.org/Public/13.0.0/ucd/auxiliary/WordBreakProperty.txt) set(WORD_BREAK_PATH ${CMAKE_BINARY_DIR}/UCD/WordBreakProperty.txt) @@ -25,6 +28,10 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD) message(STATUS "Downloading UCD PropList.txt from ${PROP_LIST_URL}...") file(DOWNLOAD ${PROP_LIST_URL} ${PROP_LIST_PATH} INACTIVITY_TIMEOUT 10) endif() + if (NOT EXISTS ${DERIVED_CORE_PROP_PATH}) + message(STATUS "Downloading UCD DerivedCoreProperties.txt from ${DERIVED_CORE_PROP_URL}...") + file(DOWNLOAD ${DERIVED_CORE_PROP_URL} ${DERIVED_CORE_PROP_PATH} INACTIVITY_TIMEOUT 10) + endif() if (NOT EXISTS ${WORD_BREAK_PATH}) message(STATUS "Downloading UCD WordBreakProperty.txt from ${WORD_BREAK_URL}...") file(DOWNLOAD ${WORD_BREAK_URL} ${WORD_BREAK_PATH} INACTIVITY_TIMEOUT 10) @@ -40,7 +47,7 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD) add_custom_command( OUTPUT ${UNICODE_DATA_HEADER} - COMMAND ${write_if_different} ${UNICODE_DATA_HEADER} $ -h -u ${UNICODE_DATA_PATH} -s ${SPECIAL_CASING_PATH} -p ${PROP_LIST_PATH} -w ${WORD_BREAK_PATH} + COMMAND ${write_if_different} ${UNICODE_DATA_HEADER} $ -h -u ${UNICODE_DATA_PATH} -s ${SPECIAL_CASING_PATH} -p ${PROP_LIST_PATH} -d ${DERIVED_CORE_PROP_PATH} -w ${WORD_BREAK_PATH} VERBATIM DEPENDS GenerateUnicodeData MAIN_DEPENDENCY ${UNICODE_DATA_PATH} ${SPECIAL_CASING_PATH} @@ -48,7 +55,7 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD) add_custom_command( OUTPUT ${UNICODE_DATA_IMPLEMENTATION} - COMMAND ${write_if_different} ${UNICODE_DATA_IMPLEMENTATION} $ -c -u ${UNICODE_DATA_PATH} -s ${SPECIAL_CASING_PATH} -p ${PROP_LIST_PATH} -w ${WORD_BREAK_PATH} + COMMAND ${write_if_different} ${UNICODE_DATA_IMPLEMENTATION} $ -c -u ${UNICODE_DATA_PATH} -s ${SPECIAL_CASING_PATH} -p ${PROP_LIST_PATH} -d ${DERIVED_CORE_PROP_PATH} -w ${WORD_BREAK_PATH} VERBATIM DEPENDS GenerateUnicodeData MAIN_DEPENDENCY ${UNICODE_DATA_PATH} ${SPECIAL_CASING_PATH}