diff --git a/Meta/CMake/unicode_data.cmake b/Meta/CMake/unicode_data.cmake index dca7f69fe4..1425a6bd5f 100644 --- a/Meta/CMake/unicode_data.cmake +++ b/Meta/CMake/unicode_data.cmake @@ -68,6 +68,9 @@ set(EMOJI_RES_PATH "${SerenityOS_SOURCE_DIR}/Base/res/emoji") set(EMOJI_SERENITY_PATH "${SerenityOS_SOURCE_DIR}/Base/home/anon/Documents/emoji-serenity.txt") set(EMOJI_INSTALL_PATH "${CMAKE_BINARY_DIR}/Root/home/anon/Documents/emoji.txt") +set(IDNA_MAPPING_TABLE_URL "https://www.unicode.org/Public/idna/${UCD_VERSION}/IdnaMappingTable.txt") +set(IDNA_MAPPING_TABLE_PATH "${UCD_PATH}/IdnaMappingTable.txt") + if (ENABLE_UNICODE_DATABASE_DOWNLOAD) remove_path_if_version_changed("${UCD_VERSION}" "${UCD_VERSION_FILE}" "${UCD_PATH}") @@ -98,12 +101,17 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD) message(STATUS "Skipping download of ${EMOJI_TEST_URL}, expecting the archive to have been extracted to ${EMOJI_TEST_PATH}") endif() + download_file("${IDNA_MAPPING_TABLE_URL}" "${IDNA_MAPPING_TABLE_PATH}") + set(UNICODE_DATA_HEADER UnicodeData.h) set(UNICODE_DATA_IMPLEMENTATION UnicodeData.cpp) set(EMOJI_DATA_HEADER EmojiData.h) set(EMOJI_DATA_IMPLEMENTATION EmojiData.cpp) + set(IDNA_DATA_HEADER IDNAData.h) + set(IDNA_DATA_IMPLEMENTATION IDNAData.cpp) + if (SERENITYOS) set(EMOJI_INSTALL_ARG -i "${EMOJI_INSTALL_PATH}") endif() @@ -130,11 +138,21 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD) # the generated emoji.txt file. dependencies "${EMOJI_RES_PATH}" "${EMOJI_SERENITY_PATH}" ) + invoke_generator( + "IDNAData" + Lagom::GenerateIDNAData + "${UCD_VERSION_FILE}" + "${IDNA_DATA_HEADER}" + "${IDNA_DATA_IMPLEMENTATION}" + arguments -m "${IDNA_MAPPING_TABLE_PATH}" + ) set(UNICODE_DATA_SOURCES ${UNICODE_DATA_HEADER} ${UNICODE_DATA_IMPLEMENTATION} ${EMOJI_DATA_HEADER} ${EMOJI_DATA_IMPLEMENTATION} + ${IDNA_DATA_HEADER} + ${IDNA_DATA_IMPLEMENTATION} ) endif() diff --git a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/CMakeLists.txt b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/CMakeLists.txt index b18637a184..a03d4cda73 100644 --- a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/CMakeLists.txt +++ b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/CMakeLists.txt @@ -1,2 +1,3 @@ lagom_tool(GenerateUnicodeData SOURCES GenerateUnicodeData.cpp LIBS LibMain) lagom_tool(GenerateEmojiData SOURCES GenerateEmojiData.cpp LIBS LibMain) +lagom_tool(GenerateIDNAData SOURCES GenerateIDNAData.cpp LIBS LibMain) diff --git a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateIDNAData.cpp b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateIDNAData.cpp new file mode 100644 index 0000000000..11da74ca78 --- /dev/null +++ b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateIDNAData.cpp @@ -0,0 +1,240 @@ +/* + * Copyright (c) 2023, Simon Wanner + * + * SPDX-License-Identifier: BSD-2-Clause + */ + +#include "GeneratorUtil.h" +#include +#include +#include +#include + +enum class MappingStatus : u8 { + Valid, + Ignored, + Mapped, + Deviation, + Disallowed, + DisallowedStd3Valid, + DisallowedStd3Mapped, +}; + +static constexpr Array mapping_status_names { "Valid"sv, "Ignored"sv, "Mapped"sv, "Deviation"sv, "Disallowed"sv, "DisallowedStd3Valid"sv, "DisallowedStd3Mapped"sv }; + +enum class IDNA2008Status : u8 { + NV8, + XV8, +}; + +static constexpr Array idna_2008_status_names { "NV8"sv, "XV8"sv }; + +struct IDNAMapping { + Unicode::CodePointRange code_points; + MappingStatus status; + IDNA2008Status idna_2008_status; + Vector mapped_to {}; +}; + +struct IDNAData { + Vector mapping_table; +}; + +static MappingStatus parse_mapping_status(StringView status) +{ + if (status == "valid"sv) + return MappingStatus::Valid; + if (status == "ignored"sv) + return MappingStatus::Ignored; + if (status == "mapped"sv) + return MappingStatus::Mapped; + if (status == "deviation"sv) + return MappingStatus::Deviation; + if (status == "disallowed"sv) + return MappingStatus::Disallowed; + if (status == "disallowed_STD3_valid"sv) + return MappingStatus::DisallowedStd3Valid; + if (status == "disallowed_STD3_mapped"sv) + return MappingStatus::DisallowedStd3Mapped; + VERIFY_NOT_REACHED(); +} + +static ErrorOr parse_idna_mapping_table(Core::InputBufferedFile& file, Vector& mapping_table) +{ + Array buffer; + + while (TRY(file.can_read_line())) { + auto line = TRY(file.read_line(buffer)); + + if (line.is_empty() || line.starts_with('#')) + continue; + + if (auto index = line.find('#'); index.has_value()) + line = line.substring_view(0, *index); + + auto segments = line.split_view(';', SplitBehavior::KeepEmpty); + VERIFY(segments.size() >= 2); + + IDNAMapping idna_mapping {}; + idna_mapping.code_points = parse_code_point_range(segments[0].trim_whitespace()); + idna_mapping.status = parse_mapping_status(segments[1].trim_whitespace()); + + if (segments.size() >= 3) + idna_mapping.mapped_to = parse_code_point_list(segments[2].trim_whitespace()); + + if (segments.size() >= 4) { + auto trimmed = segments[3].trim_whitespace(); + if (trimmed == "NV8"sv) { + idna_mapping.idna_2008_status = IDNA2008Status::NV8; + } else { + VERIFY(trimmed == "XV8"sv); + idna_mapping.idna_2008_status = IDNA2008Status::XV8; + } + } + + TRY(mapping_table.try_append(move(idna_mapping))); + } + + return {}; +} + +static ErrorOr generate_idna_data_header(Core::InputBufferedFile& file, IDNAData&) +{ + StringBuilder builder; + SourceGenerator generator { builder }; + + generator.append(R"~~~( +#pragma once + +#include +#include + +namespace Unicode::IDNA { + +Optional get_idna_mapping(u32 code_point); + +} +)~~~"); + + TRY(file.write_until_depleted(generator.as_string_view().bytes())); + return {}; +} + +static ErrorOr generate_idna_data_implementation(Core::InputBufferedFile& file, IDNAData& idna_data) +{ + StringBuilder builder; + SourceGenerator generator { builder }; + + generator.set("idna_table_size", TRY(String::number(idna_data.mapping_table.size()))); + + generator.append(R"~~~( + +#include +#include +#include +#include + +namespace Unicode::IDNA { + +struct MappingEntry { + CodePointRange code_points {}; + MappingStatus status : 3 { MappingStatus::Valid }; + IDNA2008Status idna_2008_status : 1 { IDNA2008Status::NV8 }; + size_t mapping_offset : 20 { 0 }; + size_t mapping_length : 8 { 0 }; +}; + +static constexpr Array s_idna_mapping_table { {)~~~"); + + { + size_t mapping_offset = 0; + for (auto const& mapping : idna_data.mapping_table) { + generator.set("code_points", TRY(String::formatted("{:#x}, {:#x}", mapping.code_points.first, mapping.code_points.last))); + generator.set("status", mapping_status_names[to_underlying(mapping.status)]); + generator.set("idna_2008_status", idna_2008_status_names[to_underlying(mapping.idna_2008_status)]); + + if (mapping.mapped_to.is_empty()) { + generator.set("mapping_offset", "0"sv); + generator.set("mapping_length", "0"sv); + } else { + generator.set("mapping_offset", TRY(String::number(mapping_offset))); + generator.set("mapping_length", TRY(String::number(mapping.mapped_to.size()))); + mapping_offset += mapping.mapped_to.size(); + } + + generator.append(R"~~~( + { { @code_points@ }, MappingStatus::@status@, IDNA2008Status::@idna_2008_status@, @mapping_offset@, @mapping_length@ },)~~~"); + } + + generator.set("mapping_length_total", TRY(String::number(mapping_offset))); + } + + generator.append(R"~~~( +} }; + +static constexpr Array s_mapping_code_points { )~~~"); + + { + for (auto const& mapping : idna_data.mapping_table) { + if (mapping.mapped_to.is_empty()) + continue; + + for (u32 code_point : mapping.mapped_to) + generator.append(TRY(String::formatted("{:#x}, ", code_point))); + + generator.append(R"~~~( + )~~~"); + } + } + + generator.append(R"~~~( +}; + +Optional get_idna_mapping(u32 code_point) +{ + auto* entry = binary_search(s_idna_mapping_table, code_point, nullptr, [](auto code_point, auto entry) { + if (code_point < entry.code_points.first) + return -1; + if (code_point > entry.code_points.last) + return 1; + return 0; + }); + + if (!entry) + return {}; + + auto mapped_to = Utf32View { entry->mapping_length ? s_mapping_code_points.data() + entry->mapping_offset : nullptr, entry->mapping_length }; + return Mapping { entry->status, entry->idna_2008_status, move(mapped_to) }; +} + +} +)~~~"); + + TRY(file.write_until_depleted(generator.as_string_view().bytes())); + return {}; +} + +ErrorOr serenity_main(Main::Arguments arguments) +{ + StringView generated_header_path; + StringView generated_implementation_path; + StringView idna_mapping_table_path; + + Core::ArgsParser args_parser; + args_parser.add_option(generated_header_path, "Path to the IDNA Data header file to generate", "generated-header-path", 'h', "generated-header-path"); + args_parser.add_option(generated_implementation_path, "Path to the IDNA Data implementation file to generate", "generated-implementation-path", 'c', "generated-implementation-path"); + args_parser.add_option(idna_mapping_table_path, "Path to IdnaMappingTable.txt file", "idna-mapping-table-path", 'm', "idna-mapping-table-path"); + args_parser.parse(arguments); + + auto generated_header_file = TRY(open_file(generated_header_path, Core::File::OpenMode::Write)); + auto generated_implementation_file = TRY(open_file(generated_implementation_path, Core::File::OpenMode::Write)); + auto idna_mapping_table_file = TRY(open_file(idna_mapping_table_path, Core::File::OpenMode::Read)); + + IDNAData idna_data {}; + TRY(parse_idna_mapping_table(*idna_mapping_table_file, idna_data.mapping_table)); + + TRY(generate_idna_data_header(*generated_header_file, idna_data)); + TRY(generate_idna_data_implementation(*generated_implementation_file, idna_data)); + + return 0; +} diff --git a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp index 36afc1fda1..95f8f33725 100644 --- a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp +++ b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp @@ -212,36 +212,6 @@ static DeprecatedString sanitize_entry(DeprecatedString const& entry) return builder.to_deprecated_string(); } -static Vector parse_code_point_list(StringView list) -{ - Vector code_points; - - auto segments = list.split_view(' '); - for (auto const& code_point : segments) - code_points.append(AK::StringUtils::convert_to_uint_from_hex(code_point).value()); - - return code_points; -} - -static Unicode::CodePointRange parse_code_point_range(StringView list) -{ - Unicode::CodePointRange code_point_range {}; - - if (list.contains(".."sv)) { - auto segments = list.split_view(".."sv); - VERIFY(segments.size() == 2); - - auto begin = AK::StringUtils::convert_to_uint_from_hex(segments[0]).value(); - auto end = AK::StringUtils::convert_to_uint_from_hex(segments[1]).value(); - code_point_range = { begin, end }; - } else { - auto code_point = AK::StringUtils::convert_to_uint_from_hex(list).value(); - code_point_range = { code_point, code_point }; - } - - return code_point_range; -} - static ErrorOr parse_special_casing(Core::InputBufferedFile& file, UnicodeData& unicode_data) { Array buffer; diff --git a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GeneratorUtil.h b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GeneratorUtil.h index d251e1de8a..03501c5b9c 100644 --- a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GeneratorUtil.h +++ b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GeneratorUtil.h @@ -22,6 +22,7 @@ #include #include #include +#include template inline constexpr bool StorageTypeIsList = false; @@ -598,3 +599,33 @@ ReadonlySpan @name@() } )~~~"); } + +inline Vector parse_code_point_list(StringView list) +{ + Vector code_points; + + auto segments = list.split_view(' '); + for (auto const& code_point : segments) + code_points.append(AK::StringUtils::convert_to_uint_from_hex(code_point).value()); + + return code_points; +} + +inline Unicode::CodePointRange parse_code_point_range(StringView list) +{ + Unicode::CodePointRange code_point_range {}; + + if (list.contains(".."sv)) { + auto segments = list.split_view(".."sv); + VERIFY(segments.size() == 2); + + auto begin = AK::StringUtils::convert_to_uint_from_hex(segments[0]).value(); + auto end = AK::StringUtils::convert_to_uint_from_hex(segments[1]).value(); + code_point_range = { begin, end }; + } else { + auto code_point = AK::StringUtils::convert_to_uint_from_hex(list).value(); + code_point_range = { code_point, code_point }; + } + + return code_point_range; +} diff --git a/Userland/Libraries/LibUnicode/IDNA.h b/Userland/Libraries/LibUnicode/IDNA.h new file mode 100644 index 0000000000..bdd30cae80 --- /dev/null +++ b/Userland/Libraries/LibUnicode/IDNA.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2023, Simon Wanner + * + * SPDX-License-Identifier: BSD-2-Clause + */ + +#pragma once + +#include + +namespace Unicode::IDNA { + +enum class MappingStatus : u8 { + Valid, + Ignored, + Mapped, + Deviation, + Disallowed, + DisallowedStd3Valid, + DisallowedStd3Mapped, +}; + +enum class IDNA2008Status : u8 { + NV8, + XV8, +}; + +struct Mapping { + MappingStatus status; + IDNA2008Status idna_2008_status; + Utf32View mapped_to; +}; + +}