1
Fork 0
mirror of https://github.com/RGBCube/serenity synced 2025-05-31 05:48:12 +00:00

LibUnicode: Generate separate tables for General Category properties

Previously, each code point's General Category was part of the generated
UnicodeData structure. This ultimately presented two problems, one
functional and one performance related:

  * Some General Categories are applied to unassigned code points, for
    example the Unassigned (Cn) category. Unassigned code points are
    strictly excluded from UnicodeData.txt, so by relying on that file,
    the generator is unable to handle these categories.

  * Lookups for General Categories are slower when searching through the
    large UnicodeData hash map. Even though lookups are O(1), the hash
    function turned out to be slower than binary searching through a
    category-specific table.

So, now a table is generated for each General Category. When querying a
code point for a category, a binary search is done on each code point
range in that category's table to check if code point has that category.

Further, General Categories are now parsed from the UCD file
DerivedGeneralCategory.txt. This file is a normal "prop list" file and
contains the categories for unassigned code points.
This commit is contained in:
Timothy Flynn 2021-08-10 07:17:24 -04:00 committed by Andreas Kling
parent 4e546cee97
commit 7dce2bfe23
5 changed files with 254 additions and 89 deletions

View file

@ -6,6 +6,9 @@ set(UNICODE_DATA_PATH ${CMAKE_BINARY_DIR}/UCD/UnicodeData.txt)
set(SPECIAL_CASING_URL https://www.unicode.org/Public/13.0.0/ucd/SpecialCasing.txt)
set(SPECIAL_CASING_PATH ${CMAKE_BINARY_DIR}/UCD/SpecialCasing.txt)
set(DERIVED_GENERAL_CATEGORY_URL https://www.unicode.org/Public/13.0.0/ucd/extracted/DerivedGeneralCategory.txt)
set(DERIVED_GENERAL_CATEGORY_PATH ${CMAKE_BINARY_DIR}/UCD/DerivedGeneralCategory.txt)
set(PROP_LIST_URL https://www.unicode.org/Public/13.0.0/ucd/PropList.txt)
set(PROP_LIST_PATH ${CMAKE_BINARY_DIR}/UCD/PropList.txt)
@ -39,6 +42,10 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD)
message(STATUS "Downloading UCD SpecialCasing.txt from ${SPECIAL_CASING_URL}...")
file(DOWNLOAD ${SPECIAL_CASING_URL} ${SPECIAL_CASING_PATH} INACTIVITY_TIMEOUT 10)
endif()
if (NOT EXISTS ${DERIVED_GENERAL_CATEGORY_PATH})
message(STATUS "Downloading UCD DerivedGeneralCategory.txt from ${DERIVED_GENERAL_CATEGORY_URL}...")
file(DOWNLOAD ${DERIVED_GENERAL_CATEGORY_URL} ${DERIVED_GENERAL_CATEGORY_PATH} INACTIVITY_TIMEOUT 10)
endif()
if (NOT EXISTS ${PROP_LIST_PATH})
message(STATUS "Downloading UCD PropList.txt from ${PROP_LIST_URL}...")
file(DOWNLOAD ${PROP_LIST_URL} ${PROP_LIST_PATH} INACTIVITY_TIMEOUT 10)
@ -82,9 +89,9 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD)
add_custom_command(
OUTPUT ${UNICODE_DATA_HEADER} ${UNICODE_DATA_IMPLEMENTATION}
COMMAND $<TARGET_FILE:GenerateUnicodeData> -h ${UNICODE_DATA_HEADER} -c ${UNICODE_DATA_IMPLEMENTATION} -u ${UNICODE_DATA_PATH} -s ${SPECIAL_CASING_PATH} -p ${PROP_LIST_PATH} -d ${DERIVED_CORE_PROP_PATH} -b ${DERIVED_BINARY_PROP_PATH} -a ${PROP_ALIAS_PATH} -v ${PROP_VALUE_ALIAS_PATH} -r ${SCRIPTS_PATH} -x ${SCRIPT_EXTENSIONS_PATH} -e ${EMOJI_DATA_PATH}
COMMAND $<TARGET_FILE:GenerateUnicodeData> -h ${UNICODE_DATA_HEADER} -c ${UNICODE_DATA_IMPLEMENTATION} -u ${UNICODE_DATA_PATH} -s ${SPECIAL_CASING_PATH} -g ${DERIVED_GENERAL_CATEGORY_PATH} -p ${PROP_LIST_PATH} -d ${DERIVED_CORE_PROP_PATH} -b ${DERIVED_BINARY_PROP_PATH} -a ${PROP_ALIAS_PATH} -v ${PROP_VALUE_ALIAS_PATH} -r ${SCRIPTS_PATH} -x ${SCRIPT_EXTENSIONS_PATH} -e ${EMOJI_DATA_PATH}
VERBATIM
DEPENDS GenerateUnicodeData ${UNICODE_DATA_PATH} ${SPECIAL_CASING_PATH} ${PROP_LIST_PATH} ${DERIVED_CORE_PROP_PATH} ${DERIVED_BINARY_PROP_PATH} ${PROP_ALIAS_PATH} ${PROP_VALUE_ALIAS_PATH} ${SCRIPTS_PATH} ${SCRIPT_EXTENSIONS_PATH} ${EMOJI_DATA_PATH}
DEPENDS GenerateUnicodeData ${UNICODE_DATA_PATH} ${SPECIAL_CASING_PATH} ${DERIVED_GENERAL_CATEGORY_PATH} ${PROP_LIST_PATH} ${DERIVED_CORE_PROP_PATH} ${DERIVED_BINARY_PROP_PATH} ${PROP_ALIAS_PATH} ${PROP_VALUE_ALIAS_PATH} ${SCRIPTS_PATH} ${SCRIPT_EXTENSIONS_PATH} ${EMOJI_DATA_PATH}
)
set(UNICODE_DATA_SOURCES ${UNICODE_DATA_HEADER} ${UNICODE_DATA_IMPLEMENTATION})