LibUnicode: Parse locale extensions of the Unicode other_extensions form

Summary of this patch:

  * Locale.h: adds struct OtherExtension (a single-character key plus a list
    of values) and adds it as an alternative of the Extension variant.

  * Locale.cpp: adds parse_other_extension(key, lexer), implementing
        other_extensions = sep [alphanum-[tTuUxX]] (sep alphanum{2,8})+ ;
    from https://unicode.org/reports/tr35/#other_extensions
      - The key must be ASCII alphanumeric and must not be 'x' or 'X'.
        'u'/'U' never reach this function because parse_extension() handles
        them in earlier switch cases (the 't'/'T' cases are presumably handled
        the same way; they fall outside the visible hunk context).
      - Each following segment must be 2-8 ASCII alphanumeric characters.
        The first segment that does not qualify is handed back to the lexer
        via retreat(length + 1) (the +1 presumably covers the separator) so
        that the caller can try to interpret it as something else.
      - At least one value is required; otherwise an empty Optional is
        returned.
    parse_extension() now names the header character ("key") and calls
    parse_other_extension() from its default case, replacing the old FIXME.

  * TestUnicodeLocale.cpp: adds parse_unicode_locale_id_with_other_extension,
    covering missing values, too-short (1) and too-long (9) values, trailing
    separators, and one/two/three valid values.

diff --git a/Tests/LibUnicode/TestUnicodeLocale.cpp b/Tests/LibUnicode/TestUnicodeLocale.cpp
index 4a477dd79d..dc0aa45974 100644
--- a/Tests/LibUnicode/TestUnicodeLocale.cpp
+++ b/Tests/LibUnicode/TestUnicodeLocale.cpp
@@ -212,6 +212,40 @@ TEST_CASE(parse_unicode_locale_id_with_transformed_extension)
     pass("en-t-en-k0-aaa"sv, { Unicode::LanguageID { false, "en"sv }, { { "k0"sv, { "aaa"sv } } } });
 }
 
+TEST_CASE(parse_unicode_locale_id_with_other_extension)
+{
+    auto fail = [](StringView locale) {
+        auto locale_id = Unicode::parse_unicode_locale_id(locale);
+        EXPECT(!locale_id.has_value());
+    };
+    auto pass = [](StringView locale, Unicode::OtherExtension const& expected_extension) {
+        auto locale_id = Unicode::parse_unicode_locale_id(locale);
+        VERIFY(locale_id.has_value());
+        EXPECT_EQ(locale_id->extensions.size(), 1u);
+
+        auto const& actual_extension = locale_id->extensions[0].get<Unicode::OtherExtension>();
+        EXPECT_EQ(actual_extension.key, expected_extension.key);
+        EXPECT_EQ(actual_extension.values, expected_extension.values);
+    };
+
+    fail("en-z"sv);
+    fail("en-0"sv);
+    fail("en-z-"sv);
+    fail("en-0-"sv);
+    fail("en-z-a"sv);
+    fail("en-0-a"sv);
+    fail("en-z-aaaaaaaaa"sv);
+    fail("en-0-aaaaaaaaa"sv);
+    fail("en-z-aaa-"sv);
+    fail("en-0-aaa-"sv);
+    fail("en-z-aaa-a"sv);
+    fail("en-0-aaa-a"sv);
+
+    pass("en-z-aa", { 'z', { "aa"sv } });
+    pass("en-z-aa-bbb", { 'z', { "aa"sv, "bbb"sv } });
+    pass("en-z-aa-bbb-cccccccc", { 'z', { "aa"sv, "bbb"sv, "cccccccc"sv } });
+}
+
 TEST_CASE(canonicalize_unicode_locale_id)
 {
     auto test = [](StringView locale, StringView expected_canonical_locale) {
diff --git a/Userland/Libraries/LibUnicode/Locale.cpp b/Userland/Libraries/LibUnicode/Locale.cpp
index d85829dd4b..d886c4a812 100644
--- a/Userland/Libraries/LibUnicode/Locale.cpp
+++ b/Userland/Libraries/LibUnicode/Locale.cpp
@@ -340,6 +340,34 @@ static Optional<TransformedExtension> parse_transformed_extension(GenericLexer&
     return transformed_extension;
 }
 
+static Optional<OtherExtension> parse_other_extension(char key, GenericLexer& lexer)
+{
+    // https://unicode.org/reports/tr35/#other_extensions
+    //
+    // other_extensions = sep [alphanum-[tTuUxX]] (sep alphanum{2,8})+ ;
+    OtherExtension other_extension { .key = key };
+
+    if (!is_ascii_alphanumeric(key) || (key == 'x') || (key == 'X'))
+        return {};
+
+    while (true) {
+        auto segment = consume_next_segment(lexer);
+        if (!segment.has_value())
+            break;
+
+        if ((segment->length() < 2) || (segment->length() > 8) || !all_of(*segment, is_ascii_alphanumeric)) {
+            lexer.retreat(segment->length() + 1);
+            break;
+        }
+
+        other_extension.values.append(*segment);
+    }
+
+    if (other_extension.values.is_empty())
+        return {};
+    return other_extension;
+}
+
 static Optional<Extension> parse_extension(GenericLexer& lexer)
 {
     // https://unicode.org/reports/tr35/#extensions
@@ -348,7 +376,7 @@ static Optional<Extension> parse_extension(GenericLexer& lexer)
     size_t starting_position = lexer.tell();
 
     if (auto header = consume_next_segment(lexer); header.has_value() && (header->length() == 1)) {
-        switch ((*header)[0]) {
+        switch (char key = (*header)[0]) {
         case 'u':
         case 'U':
             if (auto extension = parse_unicode_locale_extension(lexer); extension.has_value())
@@ -362,7 +390,8 @@ static Optional<Extension> parse_extension(GenericLexer& lexer)
             break;
 
         default:
-            // FIXME: Handle other_extensions
+            if (auto extension = parse_other_extension(key, lexer); extension.has_value())
+                return Extension { extension.release_value() };
             break;
         }
     }
diff --git a/Userland/Libraries/LibUnicode/Locale.h b/Userland/Libraries/LibUnicode/Locale.h
index 3ac31026f5..b8fc43d1e4 100644
--- a/Userland/Libraries/LibUnicode/Locale.h
+++ b/Userland/Libraries/LibUnicode/Locale.h
@@ -43,7 +43,12 @@ struct TransformedExtension {
     Vector<TransformedField> fields {};
 };
 
-using Extension = Variant<LocaleExtension, TransformedExtension>;
+struct OtherExtension {
+    char key {};
+    Vector<StringView> values {};
+};
+
+using Extension = Variant<LocaleExtension, TransformedExtension, OtherExtension>;
 
 struct LocaleID {
     LanguageID language_id {};