From eda92d15e4c5f6eb91263695c42997b45de99c98 Mon Sep 17 00:00:00 2001 From: Timothy Flynn Date: Fri, 27 Aug 2021 16:38:06 -0400 Subject: [PATCH] LibUnicode: Parse locale extensions of the Unicode locale extension form --- Tests/LibUnicode/TestUnicodeLocale.cpp | 46 ++++++++ Userland/Libraries/LibUnicode/Locale.cpp | 136 ++++++++++++++++++++++- Userland/Libraries/LibUnicode/Locale.h | 14 +++ 3 files changed, 192 insertions(+), 4 deletions(-) diff --git a/Tests/LibUnicode/TestUnicodeLocale.cpp b/Tests/LibUnicode/TestUnicodeLocale.cpp index 95ae10cb4a..a9e0fc209a 100644 --- a/Tests/LibUnicode/TestUnicodeLocale.cpp +++ b/Tests/LibUnicode/TestUnicodeLocale.cpp @@ -100,6 +100,52 @@ TEST_CASE(parse_unicode_locale_id) pass("aaa-bbbb-cc-1234-5678"sv, "aaa"sv, "bbbb"sv, "cc"sv, { "1234"sv, "5678"sv }); } +TEST_CASE(parse_unicode_locale_id_with_unicode_locale_extension) +{ + auto fail = [](StringView locale) { + auto locale_id = Unicode::parse_unicode_locale_id(locale); + EXPECT(!locale_id.has_value()); + }; + auto pass = [](StringView locale, Unicode::LocaleExtension const& expected_extension) { + auto locale_id = Unicode::parse_unicode_locale_id(locale); + VERIFY(locale_id.has_value()); + EXPECT_EQ(locale_id->extensions.size(), 1u); + + auto const& actual_extension = locale_id->extensions[0].get(); + VERIFY(actual_extension.attributes == expected_extension.attributes); + EXPECT_EQ(actual_extension.keywords.size(), expected_extension.keywords.size()); + + for (size_t i = 0; i < actual_extension.keywords.size(); ++i) { + auto const& actual_keyword = actual_extension.keywords[i]; + auto const& expected_keyword = expected_extension.keywords[i]; + + EXPECT_EQ(actual_keyword.key, expected_keyword.key); + EXPECT_EQ(actual_keyword.types, expected_keyword.types); + } + }; + + fail("en-u"sv); + fail("en-u-"sv); + fail("en-u-x"sv); + fail("en-u-xx-"sv); + fail("en-u--xx"sv); + fail("en-u-xx-xxxxx-"sv); + fail("en-u-xx--xxxxx"sv); + fail("en-u-xx-xxxxxxxxx"sv); + fail("en-u-xxxxx-"sv); + fail("en-u-xxxxxxxxx"sv); + + pass("en-u-xx"sv, { {}, { { "xx"sv, {} } } }); + pass("en-u-xx-yyyy"sv, { {}, { { "xx"sv, { "yyyy"sv } } } }); + pass("en-u-xx-yyyy-zzzz"sv, { {}, { { "xx"sv, { "yyyy"sv, "zzzz"sv } } } }); + pass("en-u-xx-yyyy-zzzz-aa"sv, { {}, { { "xx"sv, { "yyyy"sv, "zzzz"sv } }, { "aa"sv, {} } } }); + pass("en-u-xxx"sv, { { "xxx"sv }, {} }); + pass("en-u-fff-gggg"sv, { { "fff"sv, "gggg"sv }, {} }); + pass("en-u-fff-xx"sv, { { "fff"sv }, { { "xx"sv, {} } } }); + pass("en-u-fff-xx-yyyy"sv, { { "fff"sv }, { { "xx"sv, { "yyyy"sv } } } }); + pass("en-u-fff-gggg-xx-yyyy"sv, { { "fff"sv, "gggg"sv }, { { "xx"sv, { "yyyy"sv } } } }); +} + TEST_CASE(canonicalize_unicode_locale_id) { auto test = [](StringView locale, StringView expected_canonical_locale) { diff --git a/Userland/Libraries/LibUnicode/Locale.cpp b/Userland/Libraries/LibUnicode/Locale.cpp index 4ccf9e9a89..6f52ef25a8 100644 --- a/Userland/Libraries/LibUnicode/Locale.cpp +++ b/Userland/Libraries/LibUnicode/Locale.cpp @@ -53,7 +53,32 @@ bool is_unicode_variant_subtag(StringView subtag) return false; } -static Optional consume_next_segment(GenericLexer& lexer, bool with_separator) +static bool is_key(StringView key) +{ + // key = alphanum alpha + if (key.length() != 2) + return false; + return is_ascii_alphanumeric(key[0]) && is_ascii_alpha(key[1]); +} + +static bool is_single_type(StringView type) +{ + // type = alphanum{3,8} (sep alphanum{3,8})* + // Note: Consecutive types are not handled here, that is left to the caller. + if ((type.length() < 3) || (type.length() > 8)) + return false; + return all_of(type, is_ascii_alphanumeric); +} + +static bool is_attribute(StringView type) +{ + // attribute = alphanum{3,8} + if ((type.length() < 3) || (type.length() > 8)) + return false; + return all_of(type, is_ascii_alphanumeric); +} + +static Optional consume_next_segment(GenericLexer& lexer, bool with_separator = true) { constexpr auto is_separator = is_any_of("-_"sv); @@ -153,6 +178,101 @@ static Optional parse_unicode_language_id(GenericLexer& lexer) return language_id; } +static Optional parse_unicode_locale_extension(GenericLexer& lexer) +{ + // https://unicode.org/reports/tr35/#unicode_locale_extensions + // + // unicode_locale_extensions = sep [uU] ((sep keyword)+ | (sep attribute)+ (sep keyword)*) + LocaleExtension locale_extension {}; + + enum class ParseState { + ParsingAttributeOrKeyword, + ParsingAttribute, + ParsingKeyword, + Done, + }; + + auto state = ParseState::ParsingAttributeOrKeyword; + + while (!lexer.is_eof() && (state != ParseState::Done)) { + auto segment = consume_next_segment(lexer); + if (!segment.has_value()) + return {}; + + if (state == ParseState::ParsingAttributeOrKeyword) + state = is_key(*segment) ? ParseState::ParsingKeyword : ParseState::ParsingAttribute; + + switch (state) { + case ParseState::ParsingAttribute: + if (is_attribute(*segment)) { + locale_extension.attributes.append(*segment); + break; + } + + state = ParseState::ParsingKeyword; + [[fallthrough]]; + + case ParseState::ParsingKeyword: { + // keyword = key (sep type)? + Keyword keyword { .key = *segment }; + + if (!is_key(*segment)) { + lexer.retreat(segment->length() + 1); + state = ParseState::Done; + break; + } + + while (true) { + auto type = consume_next_segment(lexer); + + if (!type.has_value() || !is_single_type(*type)) { + if (type.has_value()) + lexer.retreat(type->length() + 1); + break; + } + + keyword.types.append(*type); + } + + locale_extension.keywords.append(move(keyword)); + break; + } + + default: + VERIFY_NOT_REACHED(); + } + } + + if (locale_extension.attributes.is_empty() && locale_extension.keywords.is_empty()) + return {}; + return locale_extension; +} + +static Optional parse_extension(GenericLexer& lexer) +{ + // https://unicode.org/reports/tr35/#extensions + // + // extensions = unicode_locale_extensions | transformed_extensions | other_extensions + size_t starting_position = lexer.tell(); + + if (auto header = consume_next_segment(lexer); header.has_value() && (header->length() == 1)) { + switch ((*header)[0]) { + case 'u': + case 'U': + if (auto extension = parse_unicode_locale_extension(lexer); extension.has_value()) + return Extension { extension.release_value() }; + break; + + default: + // FIXME: Handle transformed_extensions / other_extensions + break; + } + } + + lexer.retreat(lexer.tell() - starting_position); + return {}; +} + Optional parse_unicode_language_id(StringView language) { GenericLexer lexer { language }; @@ -167,7 +287,6 @@ Optional parse_unicode_language_id(StringView language) Optional parse_unicode_locale_id(StringView locale) { GenericLexer lexer { locale }; - LocaleID locale_id {}; // https://unicode.org/reports/tr35/#Unicode_locale_identifier // @@ -178,12 +297,21 @@ Optional parse_unicode_locale_id(StringView locale) if (!language_id.has_value()) return {}; - // FIXME: Handle extensions and pu_extensions. + LocaleID locale_id { language_id.release_value() }; + + while (true) { + auto extension = parse_extension(lexer); + if (!extension.has_value()) + break; + locale_id.extensions.append(extension.release_value()); + } + + // FIXME: Handle pu_extensions. if (!lexer.is_eof()) return {}; - return LocaleID { language_id.release_value() }; + return locale_id; } Optional canonicalize_unicode_locale_id(LocaleID& locale_id) diff --git a/Userland/Libraries/LibUnicode/Locale.h b/Userland/Libraries/LibUnicode/Locale.h index d86cb97d6f..742970a125 100644 --- a/Userland/Libraries/LibUnicode/Locale.h +++ b/Userland/Libraries/LibUnicode/Locale.h @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -22,8 +23,21 @@ struct LanguageID { Vector variants {}; }; +struct Keyword { + StringView key {}; + Vector types {}; +}; + +struct LocaleExtension { + Vector attributes {}; + Vector keywords {}; +}; + +using Extension = Variant; + struct LocaleID { LanguageID language_id {}; + Vector extensions {}; }; // Note: These methods only verify that the provided strings match the EBNF grammar of the