From 5bcb019106d870e57b8ad7412a6949ec27a96761 Mon Sep 17 00:00:00 2001
From: Simon Wanner
Date: Thu, 15 Jun 2023 21:57:13 +0200
Subject: [PATCH] LibUnicode: Add IDNA::to_ascii

This implements the ToASCII operation of Unicode Technical Standard 46
---
 Tests/LibUnicode/CMakeLists.txt              |   1 +
 Tests/LibUnicode/TestIDNA.cpp                |  49 ++++
 Userland/Libraries/LibUnicode/CMakeLists.txt |   1 +
 Userland/Libraries/LibUnicode/IDNA.cpp       | 263 +++++++++++++++++++
 Userland/Libraries/LibUnicode/IDNA.h         |  45 ++++
 5 files changed, 359 insertions(+)
 create mode 100644 Tests/LibUnicode/TestIDNA.cpp
 create mode 100644 Userland/Libraries/LibUnicode/IDNA.cpp

diff --git a/Tests/LibUnicode/CMakeLists.txt b/Tests/LibUnicode/CMakeLists.txt
index 37d2a69020..4e850cb894 100644
--- a/Tests/LibUnicode/CMakeLists.txt
+++ b/Tests/LibUnicode/CMakeLists.txt
@@ -1,5 +1,6 @@
 set(TEST_SOURCES
     TestEmoji.cpp
+    TestIDNA.cpp
     TestPunycode.cpp
     TestSegmentation.cpp
     TestUnicodeCharacterTypes.cpp
diff --git a/Tests/LibUnicode/TestIDNA.cpp b/Tests/LibUnicode/TestIDNA.cpp
new file mode 100644
index 0000000000..c57e914898
--- /dev/null
+++ b/Tests/LibUnicode/TestIDNA.cpp
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2023, Simon Wanner
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#include <LibTest/TestCase.h>
+
+#include <LibUnicode/IDNA.h>
+
+namespace Unicode::IDNA {
+
+TEST_CASE(to_ascii)
+{
+#define TEST_TO_ASCII(input, expected, ...) EXPECT_EQ(TRY_OR_FAIL(to_ascii(Utf8View(input), ##__VA_ARGS__)), expected)
+
+    ToAsciiOptions const options_with_transitional_processing {
+        .transitional_processing = TransitionalProcessing::Yes
+    };
+#define TEST_TO_ASCII_T(input, expected) TEST_TO_ASCII(input, expected, options_with_transitional_processing)
+    TEST_TO_ASCII("www.аррӏе.com"sv, "www.xn--80ak6aa92e.com"sv);
+    TEST_TO_ASCII("ö.com"sv, "xn--nda.com"sv);
+    TEST_TO_ASCII("o\u0308.com"sv, "xn--nda.com"sv);
+
+    // A single hyphen in the third or the fourth position is fine; only a hyphen in both positions is an error.
+    TEST_TO_ASCII("my-site.com"sv, "my-site.com"sv);
+    TEST_TO_ASCII("foo-bar.com"sv, "foo-bar.com"sv);
+
+    // Select cases from IdnaTestV2.txt
+    // FIXME: Download, parse and test all cases
+    TEST_TO_ASCII("Faß.de"sv, "xn--fa-hia.de"sv);
+    TEST_TO_ASCII_T("Faß.de"sv, "fass.de"sv);
+    TEST_TO_ASCII("¡"sv, "xn--7a"sv);
+    TEST_TO_ASCII("Bücher.de"sv, "xn--bcher-kva.de"sv);
+    TEST_TO_ASCII("\u0646\u0627\u0645\u0647\u0627\u06CC"sv, "xn--mgba3gch31f"sv);
+    TEST_TO_ASCII("A.b.c。D。"sv, "a.b.c.d."sv);
+    TEST_TO_ASCII("βόλος"sv, "xn--nxasmm1c"sv);
+    TEST_TO_ASCII_T("βόλος"sv, "xn--nxasmq6b"sv);
+#undef TEST_TO_ASCII_T
+#undef TEST_TO_ASCII
+
+    EXPECT(to_ascii(Utf8View("xn--o-ccb.com"sv)).is_error());
+    EXPECT(to_ascii(Utf8View("wh--f.com"sv)).is_error());
+    EXPECT(to_ascii(Utf8View("xn--whf-cec.com"sv)).is_error());
+    EXPECT(to_ascii(Utf8View("-whf.com"sv)).is_error());
+    EXPECT(to_ascii(Utf8View("whf-.com"sv)).is_error());
+}
+
+}
diff --git a/Userland/Libraries/LibUnicode/CMakeLists.txt b/Userland/Libraries/LibUnicode/CMakeLists.txt
index 57269720a3..55e1d77cec 100644
--- a/Userland/Libraries/LibUnicode/CMakeLists.txt
+++ b/Userland/Libraries/LibUnicode/CMakeLists.txt
@@ -4,6 +4,7 @@ set(SOURCES
     CharacterTypes.cpp
     CurrencyCode.cpp
     Emoji.cpp
+    IDNA.cpp
     Normalize.cpp
     Punycode.cpp
     Segmentation.cpp
diff --git a/Userland/Libraries/LibUnicode/IDNA.cpp b/Userland/Libraries/LibUnicode/IDNA.cpp
new file mode 100644
index 0000000000..8c2b2e84fc
--- /dev/null
+++ b/Userland/Libraries/LibUnicode/IDNA.cpp
@@ -0,0 +1,263 @@
+/*
+ * Copyright (c) 2023, Simon Wanner
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#include <AK/Utf32View.h>
+#include <AK/Utf8View.h>
+#include <LibUnicode/CharacterTypes.h>
+#include <LibUnicode/IDNA.h>
+#include <LibUnicode/Normalize.h>
+#include <LibUnicode/Punycode.h>
+
+#if ENABLE_UNICODE_DATA
+#    include <LibUnicode/IDNAData.h>
+#    include <LibUnicode/UnicodeData.h>
+#endif
+
+namespace Unicode::IDNA {
+
+#if not ENABLE_UNICODE_DATA
+
+// Fallback used when ENABLE_UNICODE_DATA is off: no code point has a mapping, so every lookup yields an empty Optional.
+Optional<Mapping> get_idna_mapping(u32)
+{
+    return {};
+}
+
+#endif
+
+// Outcome of apply_main_processing_steps(): the processed labels and whether any step recorded an error.
+struct ProcessingResult {
+    Vector<String> result {};
+    bool has_error { false };
+};
+
+// Folds the two STD3-dependent statuses into Disallowed (when UseSTD3ASCIIRules is set) or into plain Valid / Mapped.
+static MappingStatus translate_status(MappingStatus status, UseStd3AsciiRules use_std3_ascii_rules)
+{
+    switch (status) {
+    case MappingStatus::DisallowedStd3Valid:
+        return use_std3_ascii_rules == UseStd3AsciiRules::Yes ? MappingStatus::Disallowed : MappingStatus::Valid;
+    case MappingStatus::DisallowedStd3Mapped:
+        return use_std3_ascii_rules == UseStd3AsciiRules::Yes ? MappingStatus::Disallowed : MappingStatus::Mapped;
+    default:
+        return status;
+    }
+}
+
+// https://www.unicode.org/reports/tr46/#Validity_Criteria
+// Returns true if the label passes criteria 1-6 below; criteria 7 (CheckJoiners) and 8 (CheckBidi) are not implemented yet.
+static bool is_valid_label(String const& label, CheckHyphens check_hyphens, CheckBidi check_bidi, CheckJoiners check_joiners, UseStd3AsciiRules use_std3_ascii_rules, TransitionalProcessing transitional_processing)
+{
+    // 1. The label must be in Unicode Normalization Form NFC.
+    auto normalized = normalize(label, NormalizationForm::NFC);
+    if (normalized != label)
+        return false;
+
+    size_t position = 0;
+    bool has_hyphen_in_third_position = false;
+    for (auto code_point : label.code_points()) {
+        // 2. If CheckHyphens, the label must not contain a U+002D HYPHEN-MINUS character in both the third and fourth positions.
+        // NOTE: A hyphen in only one of the two positions (e.g. "my-site") is valid, so remember the third and decide at the fourth.
+        if (check_hyphens == CheckHyphens::Yes && code_point == '-') {
+            if (position == 2)
+                has_hyphen_in_third_position = true;
+            else if (position == 3 && has_hyphen_in_third_position)
+                return false;
+        }
+
+        // 4. The label must not contain a U+002E ( . ) FULL STOP.
+        if (code_point == '.')
+            return false;
+
+        // 5. The label must not begin with a combining mark, that is: General_Category=Mark.
+        static auto general_category_mark = general_category_from_string("Mark"sv);
+        if (position == 0 && general_category_mark.has_value() && code_point_has_general_category(code_point, general_category_mark.value()))
+            return false;
+
+        // 6. Each code point in the label must only have certain status values according to Section 5, IDNA Mapping Table:
+        Optional<Mapping> mapping = get_idna_mapping(code_point);
+        if (!mapping.has_value())
+            return false;
+
+        auto status = translate_status(mapping->status, use_std3_ascii_rules);
+        if (transitional_processing == TransitionalProcessing::Yes) {
+            // 1. For Transitional Processing, each value must be valid.
+            if (status != MappingStatus::Valid)
+                return false;
+        } else {
+            // 2. For Nontransitional Processing, each value must be either valid or deviation.
+            if (status != MappingStatus::Valid && status != MappingStatus::Deviation)
+                return false;
+        }
+        position++;
+    }
+
+    // 3. If CheckHyphens, the label must neither begin nor end with a U+002D HYPHEN-MINUS character.
+    if (check_hyphens == CheckHyphens::Yes && (label.starts_with('-') || label.ends_with('-')))
+        return false;
+
+    // FIXME: 7. If CheckJoiners, the label must satisfy the ContextJ rules from Appendix A, in The Unicode Code Points and Internationalized Domain Names for Applications (IDNA) [IDNA2008].
+    (void)check_joiners;
+
+    // FIXME: 8. If CheckBidi, and if the domain name is a Bidi domain name, then the label must satisfy all six of the numbered conditions in [IDNA2008] RFC 5893, Section 2.
+    (void)check_bidi;
+
+    return true;
+}
+
+// https://www.unicode.org/reports/tr46/#Processing
+// Maps, normalizes, splits and validates domain_name. Processing failures are recorded in has_error; only TRY failures are returned.
+static ErrorOr<ProcessingResult> apply_main_processing_steps(Utf8View domain_name, ToAsciiOptions const& options)
+{
+    bool has_error = false;
+    StringBuilder mapped;
+    // 1. Map. For each code point in the domain_name string, look up the status value in Section 5, IDNA Mapping Table, and take the following actions:
+    for (u32 code_point : domain_name) {
+        Optional<Mapping> mapping = get_idna_mapping(code_point);
+        if (!mapping.has_value()) {
+            has_error = true;
+            continue;
+        }
+        switch (translate_status(mapping->status, options.use_std3_ascii_rules)) {
+        // disallowed: Leave the code point unchanged in the string, and record that there was an error.
+        case MappingStatus::Disallowed:
+            TRY(mapped.try_append_code_point(code_point));
+            has_error = true;
+            break;
+        // ignored: Remove the code point from the string. This is equivalent to mapping the code point to an empty string.
+        case MappingStatus::Ignored:
+            break;
+        // mapped: Replace the code point in the string by the value for the mapping in Section 5, IDNA Mapping Table.
+        case MappingStatus::Mapped:
+            TRY(mapped.try_append(mapping->mapped_to));
+            break;
+        // deviation:
+        case MappingStatus::Deviation:
+            if (options.transitional_processing == TransitionalProcessing::Yes) {
+                // If Transitional_Processing, replace the code point in the string by the value for the mapping in Section 5, IDNA Mapping Table .
+                TRY(mapped.try_append(mapping->mapped_to));
+            } else {
+                TRY(mapped.try_append_code_point(code_point));
+            }
+            break;
+        // valid: Leave the code point unchanged in the string.
+        case MappingStatus::Valid:
+            TRY(mapped.try_append_code_point(code_point));
+            break;
+
+        default:
+            VERIFY_NOT_REACHED();
+        }
+    }
+
+    // 2. Normalize. Normalize the domain_name string to Unicode Normalization Form C.
+    auto normalized = normalize(mapped.string_view(), NormalizationForm::NFC);
+
+    // 3. Break. Break the string into labels at U+002E ( . ) FULL STOP.
+    auto labels = TRY(normalized.split('.', SplitBehavior::KeepEmpty));
+
+    // 4. Convert/Validate. For each label in the domain_name string:
+    for (auto& label : labels) {
+        // If the label starts with “xn--”:
+        if (label.starts_with_bytes("xn--"sv)) {
+            // 1. Attempt to convert the rest of the label to Unicode according to Punycode [RFC3492]. If that conversion fails, record that there was an error, and continue with the next label.
+            //    Otherwise replace the original label in the string by the results of the conversion.
+            auto punycode = Punycode::decode(label.bytes_as_string_view().substring_view(4));
+            if (punycode.is_error()) {
+                has_error = true;
+                continue;
+            }
+
+            label = punycode.release_value();
+
+            // 2. Verify that the label meets the validity criteria in Section 4.1, Validity Criteria for Nontransitional Processing.
+            //    If any of the validity criteria are not satisfied, record that there was an error.
+            if (!is_valid_label(label, options.check_hyphens, options.check_bidi, options.check_joiners, options.use_std3_ascii_rules, TransitionalProcessing::No))
+                has_error = true;
+        }
+        // If the label does not start with “xn--”:
+        else {
+            // Verify that the label meets the validity criteria in Section 4.1, Validity Criteria for the input Processing choice (Transitional or Nontransitional).
+            // If any of the validity criteria are not satisfied, record that there was an error.
+            if (!is_valid_label(label, options.check_hyphens, options.check_bidi, options.check_joiners, options.use_std3_ascii_rules, options.transitional_processing))
+                has_error = true;
+        }
+    }
+
+    return ProcessingResult {
+        .result = move(labels),
+        .has_error = has_error,
+    };
+}
+
+// https://www.unicode.org/reports/tr46/#ToASCII
+// Returns the ASCII form of domain_name, or an Error if processing recorded an error or a checked DNS length limit is violated.
+ErrorOr<String> to_ascii(Utf8View domain_name, ToAsciiOptions const& options)
+{
+    // 1. To the input domain_name, apply the Processing Steps in Section 4, Processing, using the input boolean flags Transitional_Processing, CheckHyphens, CheckBidi, CheckJoiners, and UseSTD3ASCIIRules. This may record an error.
+    auto processed = TRY(apply_main_processing_steps(domain_name, options));
+    bool has_error = processed.has_error;
+
+    // 2. Break the result into labels at U+002E FULL STOP.
+    auto labels = move(processed.result);
+
+    // 3. Convert each label with non-ASCII characters into Punycode [RFC3492], and prefix by “xn--”. This may record an error.
+    for (auto& label : labels) {
+        auto all_ascii = true;
+        for (auto code_point : label.code_points()) {
+            if (!is_ascii(code_point)) {
+                all_ascii = false;
+                break;
+            }
+        }
+
+        if (!all_ascii) {
+            auto punycode = Punycode::encode(label);
+            if (punycode.is_error()) {
+                has_error = true;
+                continue;
+            }
+            auto punycode_result = punycode.release_value();
+
+            StringBuilder builder;
+            TRY(builder.try_append("xn--"sv));
+            TRY(builder.try_append(punycode_result));
+            label = TRY(builder.to_string());
+        }
+    }
+
+    // 4. If the VerifyDnsLength flag is true, then verify DNS length restrictions. This may record an error. For more information, see [STD13] and [STD3].
+    if (options.verify_dns_length == VerifyDnsLength::Yes) {
+        // 1. The length of the domain name, excluding the root label and its dot, is from 1 to 253.
+        size_t total_length = 0;
+        auto* root_label = !labels.is_empty() && labels.last().is_empty() ? &labels.last() : nullptr;
+        for (auto& label : labels) {
+            // 2. The length of each label is from 1 to 63.
+            auto length = label.bytes().size();
+            if (label.is_empty() && &label != root_label)
+                return Error::from_string_literal("Invalid empty label");
+            if (length > 63)
+                return Error::from_string_literal("Label too long");
+            total_length += length;
+        }
+
+        // Add one dot between each pair of non-root labels; with no such labels nothing is added, so the count cannot wrap around.
+        size_t non_root_label_count = labels.size() - (root_label ? 1 : 0);
+        if (non_root_label_count > 0)
+            total_length += non_root_label_count - 1;
+        if (total_length == 0 || total_length > 253)
+            return Error::from_string_literal("Domain too long");
+    }
+
+    // 5. If an error was recorded in steps 1-4, then the operation has failed and a failure value is returned. No DNS lookup should be done.
+    if (has_error)
+        return Error::from_string_literal("Invalid domain name");
+
+    // 6. Otherwise join the labels using U+002E FULL STOP as a separator, and return the result.
+    return String::join('.', labels);
+}
+
+}
diff --git a/Userland/Libraries/LibUnicode/IDNA.h b/Userland/Libraries/LibUnicode/IDNA.h
index bdd30cae80..294f1bc9e7 100644
--- a/Userland/Libraries/LibUnicode/IDNA.h
+++ b/Userland/Libraries/LibUnicode/IDNA.h
@@ -6,7 +6,9 @@
 
 #pragma once
 
+#include <AK/String.h>
 #include <AK/Utf32View.h>
+#include <AK/Utf8View.h>
 
 namespace Unicode::IDNA {
 
@@ -31,4 +33,47 @@ struct Mapping {
     Utf32View mapped_to;
 };
 
+enum class CheckHyphens {
+    No,
+    Yes,
+};
+
+enum class CheckBidi {
+    No,
+    Yes,
+};
+
+enum class CheckJoiners {
+    No,
+    Yes,
+};
+
+enum class UseStd3AsciiRules {
+    No,
+    Yes,
+};
+
+enum class TransitionalProcessing {
+    No,
+    Yes,
+};
+
+enum class VerifyDnsLength {
+    No,
+    Yes,
+};
+
+// Input flags of the ToASCII operation, see https://www.unicode.org/reports/tr46/#ToASCII
+struct ToAsciiOptions {
+    CheckHyphens check_hyphens { CheckHyphens::Yes };
+    CheckBidi check_bidi { CheckBidi::Yes };
+    CheckJoiners check_joiners { CheckJoiners::Yes };
+    UseStd3AsciiRules use_std3_ascii_rules { UseStd3AsciiRules::No };
+    TransitionalProcessing transitional_processing { TransitionalProcessing::No };
+    VerifyDnsLength verify_dns_length { VerifyDnsLength::Yes };
+};
+
+// Converts domain_name to its ASCII form following the ToASCII operation; returns an Error when the operation fails.
+ErrorOr<String> to_ascii(Utf8View domain_name, ToAsciiOptions const& = {});
+
 }