diff --git a/AK/URLParser.cpp b/AK/URLParser.cpp
index f16e3ea2ba..8ab4e841ce 100644
--- a/AK/URLParser.cpp
+++ b/AK/URLParser.cpp
@@ -603,7 +603,8 @@ static Optional<URL::Host> parse_host(StringView input, bool is_opaque = false)
     // FIXME: 4. Let domain be the result of running UTF-8 decode without BOM on the percent-decoding of input.
     auto domain = URL::percent_decode(input);
 
-    // FIXME: 5. Let asciiDomain be the result of running domain to ASCII on domain.
+    // NOTE: This is handled in Unicode::create_unicode_url, to work around the fact that we can't call into LibUnicode here
+    // FIXME: 5. Let asciiDomain be the result of running domain to ASCII with domain and false.
     // FIXME: 6. If asciiDomain is failure, then return failure.
     auto ascii_domain_or_error = String::from_deprecated_string(domain);
     if (ascii_domain_or_error.is_error())
diff --git a/Userland/Libraries/LibUnicode/CMakeLists.txt b/Userland/Libraries/LibUnicode/CMakeLists.txt
index 55e1d77cec..3a6e44ffab 100644
--- a/Userland/Libraries/LibUnicode/CMakeLists.txt
+++ b/Userland/Libraries/LibUnicode/CMakeLists.txt
@@ -10,6 +10,7 @@ set(SOURCES
     Segmentation.cpp
     String.cpp
     UnicodeUtils.cpp
+    URL.cpp
     ${UNICODE_DATA_SOURCES}
 )
 set(GENERATED_SOURCES ${CURRENT_LIB_GENERATED})
diff --git a/Userland/Libraries/LibUnicode/URL.cpp b/Userland/Libraries/LibUnicode/URL.cpp
new file mode 100644
index 0000000000..5c69f447d5
--- /dev/null
+++ b/Userland/Libraries/LibUnicode/URL.cpp
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2023, Simon Wanner
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#include <LibUnicode/IDNA.h>
+#include <LibUnicode/URL.h>
+
+namespace Unicode {
+
+// https://url.spec.whatwg.org/#concept-domain-to-ascii
+// Runs Unicode ToASCII (UTS #46) on `domain` and returns the resulting string.
+// `be_strict` selects both UseSTD3ASCIIRules and VerifyDnsLength; the other options are fixed as listed in step 1.
+// Fails if ToASCII fails or if its result is the empty string.
+static ErrorOr<String> domain_to_ascii(StringView domain, bool be_strict)
+{
+    // 1. Let result be the result of running Unicode ToASCII with domain_name set to domain, UseSTD3ASCIIRules set to beStrict, CheckHyphens set to false, CheckBidi set to true, CheckJoiners set to true, Transitional_Processing set to false, and VerifyDnsLength set to beStrict. [UTS46]
+    // 2. If result is a failure value, domain-to-ASCII validation error, return failure.
+    Unicode::IDNA::ToAsciiOptions const options {
+        Unicode::IDNA::CheckHyphens::No,
+        Unicode::IDNA::CheckBidi::Yes,
+        Unicode::IDNA::CheckJoiners::Yes,
+        be_strict ? Unicode::IDNA::UseStd3AsciiRules::Yes : Unicode::IDNA::UseStd3AsciiRules::No,
+        Unicode::IDNA::TransitionalProcessing::No,
+        be_strict ? Unicode::IDNA::VerifyDnsLength::Yes : Unicode::IDNA::VerifyDnsLength::No
+    };
+    auto result = TRY(Unicode::IDNA::to_ascii(Utf8View(domain), options));
+
+    // 3. If result is the empty string, domain-to-ASCII validation error, return failure.
+    if (result.is_empty())
+        return Error::from_string_literal("Empty domain");
+
+    // 4. Return result.
+    return result;
+}
+
+// https://url.spec.whatwg.org/#concept-host-parser
+// Builds a URL from `url_string` and replaces a non-empty string (domain) host with its domain-to-ASCII form.
+// URLs that are invalid, or whose host is not a string or is empty, are returned unchanged.
+// Returns an error if domain to ASCII fails.
+ErrorOr<URL> create_unicode_url(String const& url_string)
+{
+    // NOTE: 1.-4. are implemented in URLParser::parse_host
+
+    URL url = url_string;
+    if (!url.is_valid() || !url.host().has<String>())
+        return url;
+
+    auto& domain = url.host().get<String>();
+    if (domain.is_empty())
+        return url;
+
+    // 5. Let asciiDomain be the result of running domain to ASCII with domain and false.
+    // 6. If asciiDomain is failure, then return failure.
+    auto ascii_domain = TRY(domain_to_ascii(domain.bytes_as_string_view(), false));
+
+    // FIXME: Reimplement 7. or call into URLParser::parse_host using ascii_domain (8. & 9. do not apply)
+    url.set_host(ascii_domain);
+    return url;
+}
+
+}
diff --git a/Userland/Libraries/LibUnicode/URL.h b/Userland/Libraries/LibUnicode/URL.h
new file mode 100644
index 0000000000..ab19cdd7d9
--- /dev/null
+++ b/Userland/Libraries/LibUnicode/URL.h
@@ -0,0 +1,18 @@
+/*
+ * Copyright (c) 2023, Simon Wanner
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#pragma once
+
+#include <AK/Error.h>
+#include <AK/URL.h>
+
+namespace Unicode {
+
+// Creates a URL from the given string, converting a non-empty domain host to ASCII via IDNA ToASCII.
+// Returns an error if that conversion fails.
+ErrorOr<URL> create_unicode_url(String const&);
+
+}