LibUnicode: Parse and utilize DerivedCoreProperties

DerivedCoreProperties are pseudo-properties, each defined as the union of other categories and properties. For example, the derived property Math is the union of the general category Sm and the property Other_Math. Parsing these is necessary for implementing Unicode property escapes. It also has the added benefit that LibUnicode no longer needs to derive some of these properties at runtime.
2025-09-13 17:07:36 +00:00 · 2021-07-28 18:39:41 -04:00 · 2021-07-28 18:39:41 -04:00 · 761c16d873
commit 761c16d873
parent 4eb4b06688
3 changed files with 23 additions and 51 deletions
--- a/Userland/Libraries/LibUnicode/CharacterTypes.cpp
+++ b/Userland/Libraries/LibUnicode/CharacterTypes.cpp
@ -28,51 +28,6 @@ static bool has_property(UnicodeData const& unicode_data, Property property)
    return (unicode_data.properties & property) == property;
 }

-static bool is_cased_letter(UnicodeData const& unicode_data)
-{
-    // A character C is defined to be cased if and only if C has the Lowercase or Uppercase property
-    // or has a General_Category value of Titlecase_Letter.
-    switch (unicode_data.general_category) {
-    case GeneralCategory::Ll:
-    case GeneralCategory::Lu:
-    case GeneralCategory::Lt:
-        return true;
-    default:
-        break;
-    }
-
-    return has_property(unicode_data, Property::OtherLowercase) || has_property(unicode_data, Property::OtherUppercase);
-}
-
-static bool is_case_ignorable(UnicodeData const& unicode_data)
-{
-    // A character C is defined to be case-ignorable if C has the value MidLetter (ML),
-    // MidNumLet (MB), or Single_Quote (SQ) for the Word_Break property or its General_Category is
-    // one of Nonspacing_Mark (Mn), Enclosing_Mark (Me), Format (Cf), Modifier_Letter (Lm), or
-    // Modifier_Symbol (Sk).
-    switch (unicode_data.general_category) {
-    case GeneralCategory::Mn:
-    case GeneralCategory::Me:
-    case GeneralCategory::Cf:
-    case GeneralCategory::Lm:
-    case GeneralCategory::Sk:
-        return true;
-    default:
-        break;
-    }
-
-    switch (unicode_data.word_break_property) {
-    case WordBreakProperty::MidLetter:
-    case WordBreakProperty::MidNumLet:
-    case WordBreakProperty::SingleQuote:
-        return true;
-    default:
-        break;
-    }
-
-    return false;
-}
-
 static bool is_final_code_point(Utf8View const& string, size_t index, size_t byte_length)
 {
    // C is preceded by a sequence consisting of a cased letter and then zero or more case-ignorable
@ -90,9 +45,12 @@ static bool is_final_code_point(Utf8View const& string, size_t index, size_t byt
        if (!unicode_data.has_value())
            return false;

-        if (is_cased_letter(*unicode_data) && !is_case_ignorable(*unicode_data))
+        bool is_cased = has_property(*unicode_data, Property::Cased);
+        bool is_case_ignorable = has_property(*unicode_data, Property::Case_Ignorable);
+
+        if (is_cased && !is_case_ignorable)
            ++cased_letter_count;
-        else if (!is_case_ignorable(*unicode_data))
+        else if (!is_case_ignorable)
            cased_letter_count = 0;
    }

@ -104,9 +62,12 @@ static bool is_final_code_point(Utf8View const& string, size_t index, size_t byt
        if (!unicode_data.has_value())
            return false;

-        if (is_case_ignorable(*unicode_data))
+        bool is_cased = has_property(*unicode_data, Property::Cased);
+        bool is_case_ignorable = has_property(*unicode_data, Property::Case_Ignorable);
+
+        if (is_case_ignorable)
            continue;
-        if (is_cased_letter(*unicode_data))
+        if (is_cased)
            return false;

        break;
--- a/Userland/Libraries/LibUnicode/CodeGenerators/GenerateUnicodeData.cpp
+++ b/Userland/Libraries/LibUnicode/CodeGenerators/GenerateUnicodeData.cpp
@ -573,6 +573,7 @@ int main(int argc, char** argv)
    char const* unicode_data_path = nullptr;
    char const* special_casing_path = nullptr;
    char const* prop_list_path = nullptr;
+    char const* derived_core_prop_path = nullptr;
    char const* word_break_path = nullptr;

    Core::ArgsParser args_parser;
@ -581,6 +582,7 @@ int main(int argc, char** argv)
    args_parser.add_option(unicode_data_path, "Path to UnicodeData.txt file", "unicode-data-path", 'u', "unicode-data-path");
    args_parser.add_option(special_casing_path, "Path to SpecialCasing.txt file", "special-casing-path", 's', "special-casing-path");
    args_parser.add_option(prop_list_path, "Path to PropList.txt file", "prop-list-path", 'p', "prop-list-path");
+    args_parser.add_option(derived_core_prop_path, "Path to DerivedCoreProperties.txt file", "derived-core-prop-path", 'd', "derived-core-prop-path");
    args_parser.add_option(word_break_path, "Path to WordBreakProperty.txt file", "word-break-path", 'w', "word-break-path");
    args_parser.parse(argc, argv);

@ -609,11 +611,13 @@ int main(int argc, char** argv)
    auto unicode_data_file = open_file(unicode_data_path, "-u/--unicode-data-path");
    auto special_casing_file = open_file(special_casing_path, "-s/--special-casing-path");
    auto prop_list_file = open_file(prop_list_path, "-p/--prop-list-path");
+    auto derived_core_prop_file = open_file(derived_core_prop_path, "-d/--derived-core-prop-path");
    auto word_break_file = open_file(word_break_path, "-w/--word-break-path");

    UnicodeData unicode_data {};
    parse_special_casing(special_casing_file, unicode_data);
    parse_prop_list(prop_list_file, unicode_data.prop_list);
+    parse_prop_list(derived_core_prop_file, unicode_data.prop_list);
    parse_prop_list(word_break_file, unicode_data.word_break_prop_list);
    parse_unicode_data(unicode_data_file, unicode_data);

--- a/Userland/Libraries/LibUnicode/unicode_data.cmake
+++ b/Userland/Libraries/LibUnicode/unicode_data.cmake
@ -9,6 +9,9 @@ set(SPECIAL_CASING_PATH ${CMAKE_BINARY_DIR}/UCD/SpecialCasing.txt)
 set(PROP_LIST_URL https://www.unicode.org/Public/13.0.0/ucd/PropList.txt)
 set(PROP_LIST_PATH ${CMAKE_BINARY_DIR}/UCD/PropList.txt)

+set(DERIVED_CORE_PROP_URL https://www.unicode.org/Public/13.0.0/ucd/DerivedCoreProperties.txt)
+set(DERIVED_CORE_PROP_PATH ${CMAKE_BINARY_DIR}/UCD/DerivedCoreProperties.txt)
+
 set(WORD_BREAK_URL https://www.unicode.org/Public/13.0.0/ucd/auxiliary/WordBreakProperty.txt)
 set(WORD_BREAK_PATH ${CMAKE_BINARY_DIR}/UCD/WordBreakProperty.txt)

@ -25,6 +28,10 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD)
        message(STATUS "Downloading UCD PropList.txt from ${PROP_LIST_URL}...")
        file(DOWNLOAD ${PROP_LIST_URL} ${PROP_LIST_PATH} INACTIVITY_TIMEOUT 10)
    endif()
+    if (NOT EXISTS ${DERIVED_CORE_PROP_PATH})
+        message(STATUS "Downloading UCD DerivedCoreProperties.txt from ${DERIVED_CORE_PROP_URL}...")
+        file(DOWNLOAD ${DERIVED_CORE_PROP_URL} ${DERIVED_CORE_PROP_PATH} INACTIVITY_TIMEOUT 10)
+    endif()
    if (NOT EXISTS ${WORD_BREAK_PATH})
        message(STATUS "Downloading UCD WordBreakProperty.txt from ${WORD_BREAK_URL}...")
        file(DOWNLOAD ${WORD_BREAK_URL} ${WORD_BREAK_PATH} INACTIVITY_TIMEOUT 10)
@ -40,7 +47,7 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD)

    add_custom_command(
        OUTPUT ${UNICODE_DATA_HEADER}
-        COMMAND ${write_if_different} ${UNICODE_DATA_HEADER} $<TARGET_FILE:GenerateUnicodeData> -h -u ${UNICODE_DATA_PATH} -s ${SPECIAL_CASING_PATH} -p ${PROP_LIST_PATH} -w ${WORD_BREAK_PATH}
+        COMMAND ${write_if_different} ${UNICODE_DATA_HEADER} $<TARGET_FILE:GenerateUnicodeData> -h -u ${UNICODE_DATA_PATH} -s ${SPECIAL_CASING_PATH} -p ${PROP_LIST_PATH} -d ${DERIVED_CORE_PROP_PATH} -w ${WORD_BREAK_PATH}
        VERBATIM
        DEPENDS GenerateUnicodeData
        MAIN_DEPENDENCY ${UNICODE_DATA_PATH} ${SPECIAL_CASING_PATH}
@ -48,7 +55,7 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD)

    add_custom_command(
        OUTPUT ${UNICODE_DATA_IMPLEMENTATION}
-        COMMAND ${write_if_different} ${UNICODE_DATA_IMPLEMENTATION} $<TARGET_FILE:GenerateUnicodeData> -c -u ${UNICODE_DATA_PATH} -s ${SPECIAL_CASING_PATH} -p ${PROP_LIST_PATH} -w ${WORD_BREAK_PATH}
+        COMMAND ${write_if_different} ${UNICODE_DATA_IMPLEMENTATION} $<TARGET_FILE:GenerateUnicodeData> -c -u ${UNICODE_DATA_PATH} -s ${SPECIAL_CASING_PATH} -p ${PROP_LIST_PATH} -d ${DERIVED_CORE_PROP_PATH} -w ${WORD_BREAK_PATH}
        VERBATIM
        DEPENDS GenerateUnicodeData
        MAIN_DEPENDENCY ${UNICODE_DATA_PATH} ${SPECIAL_CASING_PATH}