mirror of
https://github.com/RGBCube/serenity
synced 2025-05-16 07:44:57 +00:00

Previously, each code point's General Category was part of the generated UnicodeData structure. This ultimately presented two problems, one functional and one performance related: * Some General Categories are applied to unassigned code points, for example the Unassigned (Cn) category. Unassigned code points are strictly excluded from UnicodeData.txt, so by relying on that file, the generator is unable to handle these categories. * Lookups for General Categories are slower when searching through the large UnicodeData hash map. Even though lookups are O(1), the hash function turned out to be slower than binary searching through a category-specific table. So, now a table is generated for each General Category. When querying a code point for a category, a binary search is done on each code point range in that category's table to check if code point has that category. Further, General Categories are now parsed from the UCD file DerivedGeneralCategory.txt. This file is a normal "prop list" file and contains the categories for unassigned code points.
101 lines
5.7 KiB
CMake
101 lines
5.7 KiB
CMake
option(ENABLE_UNICODE_DATABASE_DOWNLOAD "Enable download of Unicode UCD files at build time" ON)
|
|
|
|
set(UNICODE_DATA_URL https://www.unicode.org/Public/13.0.0/ucd/UnicodeData.txt)
|
|
set(UNICODE_DATA_PATH ${CMAKE_BINARY_DIR}/UCD/UnicodeData.txt)
|
|
|
|
set(SPECIAL_CASING_URL https://www.unicode.org/Public/13.0.0/ucd/SpecialCasing.txt)
|
|
set(SPECIAL_CASING_PATH ${CMAKE_BINARY_DIR}/UCD/SpecialCasing.txt)
|
|
|
|
set(DERIVED_GENERAL_CATEGORY_URL https://www.unicode.org/Public/13.0.0/ucd/extracted/DerivedGeneralCategory.txt)
|
|
set(DERIVED_GENERAL_CATEGORY_PATH ${CMAKE_BINARY_DIR}/UCD/DerivedGeneralCategory.txt)
|
|
|
|
set(PROP_LIST_URL https://www.unicode.org/Public/13.0.0/ucd/PropList.txt)
|
|
set(PROP_LIST_PATH ${CMAKE_BINARY_DIR}/UCD/PropList.txt)
|
|
|
|
set(DERIVED_CORE_PROP_URL https://www.unicode.org/Public/13.0.0/ucd/DerivedCoreProperties.txt)
|
|
set(DERIVED_CORE_PROP_PATH ${CMAKE_BINARY_DIR}/UCD/DerivedCoreProperties.txt)
|
|
|
|
set(DERIVED_BINARY_PROP_URL https://www.unicode.org/Public/13.0.0/ucd/extracted/DerivedBinaryProperties.txt)
|
|
set(DERIVED_BINARY_PROP_PATH ${CMAKE_BINARY_DIR}/UCD/DerivedBinaryProperties.txt)
|
|
|
|
set(PROP_ALIAS_URL https://www.unicode.org/Public/13.0.0/ucd/PropertyAliases.txt)
|
|
set(PROP_ALIAS_PATH ${CMAKE_BINARY_DIR}/UCD/PropertyAliases.txt)
|
|
|
|
set(PROP_VALUE_ALIAS_URL https://www.unicode.org/Public/13.0.0/ucd/PropertyValueAliases.txt)
|
|
set(PROP_VALUE_ALIAS_PATH ${CMAKE_BINARY_DIR}/UCD/PropertyValueAliases.txt)
|
|
|
|
set(SCRIPTS_URL https://www.unicode.org/Public/13.0.0/ucd/Scripts.txt)
|
|
set(SCRIPTS_PATH ${CMAKE_BINARY_DIR}/UCD/Scripts.txt)
|
|
|
|
set(SCRIPT_EXTENSIONS_URL https://www.unicode.org/Public/13.0.0/ucd/ScriptExtensions.txt)
|
|
set(SCRIPT_EXTENSIONS_PATH ${CMAKE_BINARY_DIR}/UCD/ScriptExtensions.txt)
|
|
|
|
set(EMOJI_DATA_URL https://www.unicode.org/Public/13.0.0/ucd/emoji/emoji-data.txt)
|
|
set(EMOJI_DATA_PATH ${CMAKE_BINARY_DIR}/UCD/emoji-data.txt)
|
|
|
|
if (ENABLE_UNICODE_DATABASE_DOWNLOAD)
|
|
if (NOT EXISTS ${UNICODE_DATA_PATH})
|
|
message(STATUS "Downloading UCD UnicodeData.txt from ${UNICODE_DATA_URL}...")
|
|
file(DOWNLOAD ${UNICODE_DATA_URL} ${UNICODE_DATA_PATH} INACTIVITY_TIMEOUT 10)
|
|
endif()
|
|
if (NOT EXISTS ${SPECIAL_CASING_PATH})
|
|
message(STATUS "Downloading UCD SpecialCasing.txt from ${SPECIAL_CASING_URL}...")
|
|
file(DOWNLOAD ${SPECIAL_CASING_URL} ${SPECIAL_CASING_PATH} INACTIVITY_TIMEOUT 10)
|
|
endif()
|
|
if (NOT EXISTS ${DERIVED_GENERAL_CATEGORY_PATH})
|
|
message(STATUS "Downloading UCD DerivedGeneralCategory.txt from ${DERIVED_GENERAL_CATEGORY_URL}...")
|
|
file(DOWNLOAD ${DERIVED_GENERAL_CATEGORY_URL} ${DERIVED_GENERAL_CATEGORY_PATH} INACTIVITY_TIMEOUT 10)
|
|
endif()
|
|
if (NOT EXISTS ${PROP_LIST_PATH})
|
|
message(STATUS "Downloading UCD PropList.txt from ${PROP_LIST_URL}...")
|
|
file(DOWNLOAD ${PROP_LIST_URL} ${PROP_LIST_PATH} INACTIVITY_TIMEOUT 10)
|
|
endif()
|
|
if (NOT EXISTS ${DERIVED_CORE_PROP_PATH})
|
|
message(STATUS "Downloading UCD DerivedCoreProperties.txt from ${DERIVED_CORE_PROP_URL}...")
|
|
file(DOWNLOAD ${DERIVED_CORE_PROP_URL} ${DERIVED_CORE_PROP_PATH} INACTIVITY_TIMEOUT 10)
|
|
endif()
|
|
if (NOT EXISTS ${DERIVED_BINARY_PROP_PATH})
|
|
message(STATUS "Downloading UCD DerivedBinaryProperties.txt from ${DERIVED_BINARY_PROP_URL}...")
|
|
file(DOWNLOAD ${DERIVED_BINARY_PROP_URL} ${DERIVED_BINARY_PROP_PATH} INACTIVITY_TIMEOUT 10)
|
|
endif()
|
|
if (NOT EXISTS ${PROP_ALIAS_PATH})
|
|
message(STATUS "Downloading UCD PropertyAliases.txt from ${PROP_ALIAS_URL}...")
|
|
file(DOWNLOAD ${PROP_ALIAS_URL} ${PROP_ALIAS_PATH} INACTIVITY_TIMEOUT 10)
|
|
endif()
|
|
if (NOT EXISTS ${PROP_VALUE_ALIAS_PATH})
|
|
message(STATUS "Downloading UCD PropertyValueAliases.txt from ${PROP_VALUE_ALIAS_URL}...")
|
|
file(DOWNLOAD ${PROP_VALUE_ALIAS_URL} ${PROP_VALUE_ALIAS_PATH} INACTIVITY_TIMEOUT 10)
|
|
endif()
|
|
if (NOT EXISTS ${SCRIPTS_PATH})
|
|
message(STATUS "Downloading UCD Scripts.txt from ${SCRIPTS_URL}...")
|
|
file(DOWNLOAD ${SCRIPTS_URL} ${SCRIPTS_PATH} INACTIVITY_TIMEOUT 10)
|
|
endif()
|
|
if (NOT EXISTS ${SCRIPT_EXTENSIONS_PATH})
|
|
message(STATUS "Downloading UCD ScriptExtensions.txt from ${SCRIPT_EXTENSIONS_URL}...")
|
|
file(DOWNLOAD ${SCRIPT_EXTENSIONS_URL} ${SCRIPT_EXTENSIONS_PATH} INACTIVITY_TIMEOUT 10)
|
|
endif()
|
|
if (NOT EXISTS ${EMOJI_DATA_PATH})
|
|
message(STATUS "Downloading UCD emoji-data.txt from ${EMOJI_DATA_URL}...")
|
|
file(DOWNLOAD ${EMOJI_DATA_URL} ${EMOJI_DATA_PATH} INACTIVITY_TIMEOUT 10)
|
|
endif()
|
|
|
|
set(UNICODE_DATA_HEADER LibUnicode/UnicodeData.h)
|
|
set(UNICODE_DATA_IMPLEMENTATION LibUnicode/UnicodeData.cpp)
|
|
|
|
if (CMAKE_CURRENT_BINARY_DIR MATCHES ".*/LibUnicode") # Serenity build.
|
|
set(UNICODE_DATA_HEADER UnicodeData.h)
|
|
set(UNICODE_DATA_IMPLEMENTATION UnicodeData.cpp)
|
|
endif()
|
|
|
|
add_custom_command(
|
|
OUTPUT ${UNICODE_DATA_HEADER} ${UNICODE_DATA_IMPLEMENTATION}
|
|
COMMAND $<TARGET_FILE:GenerateUnicodeData> -h ${UNICODE_DATA_HEADER} -c ${UNICODE_DATA_IMPLEMENTATION} -u ${UNICODE_DATA_PATH} -s ${SPECIAL_CASING_PATH} -g ${DERIVED_GENERAL_CATEGORY_PATH} -p ${PROP_LIST_PATH} -d ${DERIVED_CORE_PROP_PATH} -b ${DERIVED_BINARY_PROP_PATH} -a ${PROP_ALIAS_PATH} -v ${PROP_VALUE_ALIAS_PATH} -r ${SCRIPTS_PATH} -x ${SCRIPT_EXTENSIONS_PATH} -e ${EMOJI_DATA_PATH}
|
|
VERBATIM
|
|
DEPENDS GenerateUnicodeData ${UNICODE_DATA_PATH} ${SPECIAL_CASING_PATH} ${DERIVED_GENERAL_CATEGORY_PATH} ${PROP_LIST_PATH} ${DERIVED_CORE_PROP_PATH} ${DERIVED_BINARY_PROP_PATH} ${PROP_ALIAS_PATH} ${PROP_VALUE_ALIAS_PATH} ${SCRIPTS_PATH} ${SCRIPT_EXTENSIONS_PATH} ${EMOJI_DATA_PATH}
|
|
)
|
|
|
|
set(UNICODE_DATA_SOURCES ${UNICODE_DATA_HEADER} ${UNICODE_DATA_IMPLEMENTATION})
|
|
add_compile_definitions(ENABLE_UNICODE_DATA=1)
|
|
else()
|
|
add_compile_definitions(ENABLE_UNICODE_DATA=0)
|
|
endif()
|