diff --git a/Tests/LibUnicode/TestUnicodeLocale.cpp b/Tests/LibUnicode/TestUnicodeLocale.cpp
index a9e0fc209a..4a477dd79d 100644
--- a/Tests/LibUnicode/TestUnicodeLocale.cpp
+++ b/Tests/LibUnicode/TestUnicodeLocale.cpp
@@ -146,6 +146,78 @@ TEST_CASE(parse_unicode_locale_id_with_unicode_locale_extension)
     pass("en-u-fff-gggg-xx-yyyy"sv, { { "fff"sv, "gggg"sv }, { { "xx"sv, { "yyyy"sv } } } });
 }
 
+TEST_CASE(parse_unicode_locale_id_with_transformed_extension)
+{
+    // Asserts that the whole locale string is rejected by the parser.
+    auto fail = [](StringView locale) {
+        auto locale_id = Unicode::parse_unicode_locale_id(locale);
+        EXPECT(!locale_id.has_value());
+    };
+    // Asserts that the locale parses to exactly one transformed extension equal to expected_extension.
+    auto pass = [](StringView locale, Unicode::TransformedExtension const& expected_extension) {
+        auto locale_id = Unicode::parse_unicode_locale_id(locale);
+        VERIFY(locale_id.has_value());
+        // extensions[0] is accessed below, so a wrong extension count must stop the test here.
+        VERIFY(locale_id->extensions.size() == 1u);
+
+        auto const& actual_extension = locale_id->extensions[0].get<Unicode::TransformedExtension>();
+
+        VERIFY(actual_extension.language.has_value() == expected_extension.language.has_value());
+        if (actual_extension.language.has_value()) {
+            EXPECT_EQ(actual_extension.language->language, expected_extension.language->language);
+            EXPECT_EQ(actual_extension.language->script, expected_extension.language->script);
+            EXPECT_EQ(actual_extension.language->region, expected_extension.language->region);
+            EXPECT_EQ(actual_extension.language->variants, expected_extension.language->variants);
+        }
+
+        // Both field lists are indexed in lockstep below, so their sizes must agree before the loop.
+        VERIFY(actual_extension.fields.size() == expected_extension.fields.size());
+
+        for (size_t i = 0; i < actual_extension.fields.size(); ++i) {
+            auto const& actual_field = actual_extension.fields[i];
+            auto const& expected_field = expected_extension.fields[i];
+
+            EXPECT_EQ(actual_field.key, expected_field.key);
+            EXPECT_EQ(actual_field.values, expected_field.values);
+        }
+    };
+
+    // Inputs that must not parse.
+    fail("en-t"sv);
+    fail("en-t-"sv);
+    fail("en-t-a"sv);
+    fail("en-t-en-"sv);
+    fail("en-t-root"sv);
+    fail("en-t-aaaaaaaaa"sv);
+    fail("en-t-en-aaa"sv);
+    fail("en-t-en-latn-latn"sv);
+    fail("en-t-en-a"sv);
+    fail("en-t-en-00"sv);
+    fail("en-t-en-latn-0"sv);
+    fail("en-t-en-latn-00"sv);
+    fail("en-t-en-latn-xyz"sv);
+    fail("en-t-en-aaaaaaaaa"sv);
+    fail("en-t-en-latn-gb-aaaa"sv);
+    fail("en-t-en-latn-gb-aaaaaaaaa"sv);
+    fail("en-t-k0"sv);
+    fail("en-t-k0-aa"sv);
+    fail("en-t-k0-aaaaaaaaa"sv);
+
+    // Language only, fields only, and a language followed by a field.
+    pass("en-t-en"sv, { Unicode::LanguageID { false, "en"sv }, {} });
+    pass("en-t-en-latn"sv, { Unicode::LanguageID { false, "en"sv, "latn"sv }, {} });
+    pass("en-t-en-us"sv, { Unicode::LanguageID { false, "en"sv, {}, "us"sv }, {} });
+    pass("en-t-en-latn-us"sv, { Unicode::LanguageID { false, "en"sv, "latn"sv, "us"sv }, {} });
+    pass("en-t-en-posix"sv, { Unicode::LanguageID { false, "en"sv, {}, {}, { "posix"sv } }, {} });
+    pass("en-t-en-latn-posix"sv, { Unicode::LanguageID { false, "en"sv, "latn"sv, {}, { "posix"sv } }, {} });
+    pass("en-t-en-us-posix"sv, { Unicode::LanguageID { false, "en"sv, {}, "us"sv, { "posix"sv } }, {} });
+    pass("en-t-en-latn-us-posix"sv, { Unicode::LanguageID { false, "en"sv, "latn"sv, "us"sv, { "posix"sv } }, {} });
+    pass("en-t-k0-aaa"sv, { {}, { { "k0"sv, { "aaa"sv } } } });
+    pass("en-t-k0-aaa-bbbb"sv, { {}, { { "k0"sv, { "aaa"sv, "bbbb"sv } } } });
+    pass("en-t-k0-aaa-k1-bbbb"sv, { {}, { { "k0"sv, { "aaa"sv } }, { "k1"sv, { "bbbb"sv } } } });
+    pass("en-t-en-k0-aaa"sv, { Unicode::LanguageID { false, "en"sv }, { { "k0"sv, { "aaa"sv } } } });
+}
+
 TEST_CASE(canonicalize_unicode_locale_id)
 {
     auto test = [](StringView locale, StringView expected_canonical_locale) {
diff --git a/Userland/Libraries/LibUnicode/Locale.cpp b/Userland/Libraries/LibUnicode/Locale.cpp
index 6f52ef25a8..d85829dd4b 100644
--- a/Userland/Libraries/LibUnicode/Locale.cpp
+++ b/Userland/Libraries/LibUnicode/Locale.cpp
@@ -78,6 +78,25 @@ static bool is_attribute(StringView type)
     return all_of(type, is_ascii_alphanumeric);
 }
 
+// Returns true if key is exactly one ASCII letter followed by one ASCII digit.
+static bool is_transformed_key(StringView key)
+{
+    // tkey = alpha digit
+    if (key.length() != 2)
+        return false;
+    return is_ascii_alpha(key[0]) && is_ascii_digit(key[1]);
+}
+
+// Returns true if value is 3 to 8 ASCII alphanumeric characters, i.e. a single subtag of a tvalue.
+static bool is_single_transformed_value(StringView value)
+{
+    // tvalue = (sep alphanum{3,8})+
+    // Note: Consecutive values are not handled here, that is left to the caller.
+    if ((value.length() < 3) || (value.length() > 8))
+        return false;
+    return all_of(value, is_ascii_alphanumeric);
+}
+
 static Optional<StringView> consume_next_segment(GenericLexer& lexer, bool with_separator = true)
 {
     constexpr auto is_separator = is_any_of("-_"sv);
@@ -248,6 +267,88 @@ static Optional<LocaleExtension> parse_unicode_locale_extension(GenericLexer& le
     return locale_extension;
 }
 
+// Parses the part of a transformed extension that follows the "t" singleton: an optional tlang, then tfields.
+// Returns an empty Optional if a segment cannot be read, the tlang is invalid, a tkey has no tvalue, or nothing was parsed.
+static Optional<TransformedExtension> parse_transformed_extension(GenericLexer& lexer)
+{
+    // https://unicode.org/reports/tr35/#transformed_extensions
+    //
+    // transformed_extensions = sep [tT] ((sep tlang (sep tfield)*) | (sep tfield)+)
+    TransformedExtension transformed_extension {};
+
+    enum class ParseState {
+        ParsingLanguageOrField,
+        ParsingLanguage,
+        ParsingField,
+        Done,
+    };
+
+    auto state = ParseState::ParsingLanguageOrField;
+
+    while (!lexer.is_eof() && (state != ParseState::Done)) {
+        auto segment = consume_next_segment(lexer);
+        if (!segment.has_value())
+            return {};
+
+        if (state == ParseState::ParsingLanguageOrField)
+            state = is_unicode_language_subtag(*segment) ? ParseState::ParsingLanguage : ParseState::ParsingField;
+
+        switch (state) {
+        case ParseState::ParsingLanguage:
+            // Rewind over the segment just read (but not its separator) so the language ID parser re-reads it.
+            lexer.retreat(segment->length());
+
+            if (auto language_id = parse_unicode_language_id(lexer); language_id.has_value()) {
+                transformed_extension.language = language_id.release_value();
+                state = ParseState::ParsingField;
+                break;
+            }
+
+            return {};
+
+        case ParseState::ParsingField: {
+            // tfield = tkey tvalue;
+            if (!is_transformed_key(*segment)) {
+                // Not a tkey: rewind over the segment and its separator, then stop parsing this extension.
+                lexer.retreat(segment->length() + 1);
+                state = ParseState::Done;
+                break;
+            }
+
+            TransformedField field { .key = *segment };
+
+            while (true) {
+                auto value = consume_next_segment(lexer);
+
+                if (!value.has_value() || !is_single_transformed_value(*value)) {
+                    // Not a tvalue subtag: rewind over it and its separator so the outer loop reads it next.
+                    if (value.has_value())
+                        lexer.retreat(value->length() + 1);
+                    break;
+                }
+
+                field.values.append(*value);
+            }
+
+            // A tkey must be followed by at least one tvalue subtag.
+            if (field.values.is_empty())
+                return {};
+
+            transformed_extension.fields.append(move(field));
+            break;
+        }
+
+        default:
+            VERIFY_NOT_REACHED();
+        }
+    }
+
+    // An extension with neither a tlang nor any tfield is invalid.
+    if (!transformed_extension.language.has_value() && transformed_extension.fields.is_empty())
+        return {};
+    return transformed_extension;
+}
+
 static Optional<Extension> parse_extension(GenericLexer& lexer)
 {
     // https://unicode.org/reports/tr35/#extensions
@@ -263,8 +364,14 @@ static Optional<Extension> parse_extension(GenericLexer& lexer)
                 return Extension { extension.release_value() };
             break;
 
+        case 't':
+        case 'T':
+            if (auto extension = parse_transformed_extension(lexer); extension.has_value())
+                return Extension { extension.release_value() };
+            break;
+
         default:
-            // FIXME: Handle transformed_extensions / other_extensions
+            // FIXME: Handle other_extensions
             break;
         }
     }
diff --git a/Userland/Libraries/LibUnicode/Locale.h b/Userland/Libraries/LibUnicode/Locale.h
index 742970a125..3ac31026f5 100644
--- a/Userland/Libraries/LibUnicode/Locale.h
+++ b/Userland/Libraries/LibUnicode/Locale.h
@@ -33,7 +33,19 @@ struct LocaleExtension {
     Vector<Keyword> keywords {};
 };
 
-using Extension = Variant<LocaleExtension>;
+// One tfield of a transformed extension: a tkey and the tvalue subtags that follow it.
+struct TransformedField {
+    StringView key;
+    Vector<StringView> values {};
+};
+
+// A transformed ("t") extension: an optional language ID (tlang) and a list of fields.
+struct TransformedExtension {
+    Optional<LanguageID> language {};
+    Vector<TransformedField> fields {};
+};
+
+using Extension = Variant<LocaleExtension, TransformedExtension>;
 
 struct LocaleID {
     LanguageID language_id {};