From dd89901b070c07cd54095931c4b03fc5beb3316d Mon Sep 17 00:00:00 2001 From: Timothy Flynn Date: Fri, 27 Aug 2021 10:12:17 -0400 Subject: [PATCH] LibUnicode: Use GenericLexer to parse Unicode language IDs This is preparatory work to read locale extensions. The parser currently enforces that the entire string is consumed. But to parse extensions, parse_unicode_locale_id() will need parse_unicode_language_id() to just stop parsing on the first segment that does not match the language ID grammar. It will also need to know where the parsing stopped. Both of these needs are fulfilled by GenericLexer. The caveat is that we can no longer simply split the parsed string on separator characters. So parse_unicode_language_id() now operates as a small state machine. --- Userland/Libraries/LibUnicode/Locale.cpp | 125 +++++++++++++++++------ 1 file changed, 95 insertions(+), 30 deletions(-) diff --git a/Userland/Libraries/LibUnicode/Locale.cpp b/Userland/Libraries/LibUnicode/Locale.cpp index adb3c9ba60..4ccf9e9a89 100644 --- a/Userland/Libraries/LibUnicode/Locale.cpp +++ b/Userland/Libraries/LibUnicode/Locale.cpp @@ -53,7 +53,26 @@ bool is_unicode_variant_subtag(StringView subtag) return false; } -Optional parse_unicode_language_id(StringView language) +static Optional consume_next_segment(GenericLexer& lexer, bool with_separator) +{ + constexpr auto is_separator = is_any_of("-_"sv); + + if (with_separator) { + if (!lexer.next_is(is_separator)) + return {}; + lexer.ignore(); + } + + auto segment = lexer.consume_until(is_separator); + if (segment.is_empty()) { + lexer.retreat(with_separator); + return {}; + } + + return segment; +} + +static Optional parse_unicode_language_id(GenericLexer& lexer) { // https://unicode.org/reports/tr35/#Unicode_language_identifier // @@ -64,48 +83,90 @@ Optional parse_unicode_language_id(StringView language) // (sep unicode_variant_subtag)* LanguageID language_id {}; - if (language == "root"sv) { + if (lexer.consume_specific("root"sv)) { language_id.is_root = true; return language_id; } - auto segments = language.split_view_if(is_any_of("-_"sv), true); // keep_empty=true to ensure valid data follows a separator. - size_t index = 0; + enum class ParseState { + ParsingLanguageOrScript, + ParsingScript, + ParsingRegion, + ParsingVariant, + Done, + }; - if (segments.size() == index) - return {}; + auto state = ParseState::ParsingLanguageOrScript; - if (is_unicode_language_subtag(segments[index])) { - language_id.language = segments[index]; - if (segments.size() == ++index) - return language_id; - } - - if (is_unicode_script_subtag(segments[index])) { - language_id.script = segments[index]; - if (segments.size() == ++index) - return language_id; - } else if (!language_id.language.has_value()) { - return {}; - } - - if (is_unicode_region_subtag(segments[index])) { - language_id.region = segments[index]; - if (segments.size() == ++index) - return language_id; - } - - while (index < segments.size()) { - if (!is_unicode_variant_subtag(segments[index])) + while (!lexer.is_eof() && (state != ParseState::Done)) { + auto segment = consume_next_segment(lexer, state != ParseState::ParsingLanguageOrScript); + if (!segment.has_value()) return {}; - language_id.variants.append(segments[index++]); + + switch (state) { + case ParseState::ParsingLanguageOrScript: + if (is_unicode_language_subtag(*segment)) { + state = ParseState::ParsingScript; + language_id.language = *segment; + } else if (is_unicode_script_subtag(*segment)) { + state = ParseState::ParsingRegion; + language_id.script = *segment; + } else { + return {}; + } + break; + + case ParseState::ParsingScript: + if (is_unicode_script_subtag(*segment)) { + state = ParseState::ParsingRegion; + language_id.script = *segment; + break; + } + + state = ParseState::ParsingRegion; + [[fallthrough]]; + + case ParseState::ParsingRegion: + if (is_unicode_region_subtag(*segment)) { + state = ParseState::ParsingVariant; + language_id.region = *segment; + break; + } + + state = ParseState::ParsingVariant; + [[fallthrough]]; + + case ParseState::ParsingVariant: + if (is_unicode_variant_subtag(*segment)) { + language_id.variants.append(*segment); + } else { + lexer.retreat(segment->length() + 1); + state = ParseState::Done; + } + break; + + default: + VERIFY_NOT_REACHED(); + } } return language_id; } +Optional parse_unicode_language_id(StringView language) +{ + GenericLexer lexer { language }; + + auto language_id = parse_unicode_language_id(lexer); + if (!lexer.is_eof()) + return {}; + + return language_id; +} + Optional parse_unicode_locale_id(StringView locale) { + GenericLexer lexer { locale }; LocaleID locale_id {}; // https://unicode.org/reports/tr35/#Unicode_locale_identifier @@ -113,11 +174,15 @@ Optional parse_unicode_locale_id(StringView locale) // unicode_locale_id = unicode_language_id // extensions* // pu_extensions? - auto language_id = parse_unicode_language_id(locale); + auto language_id = parse_unicode_language_id(lexer); if (!language_id.has_value()) return {}; // FIXME: Handle extensions and pu_extensions. + + if (!lexer.is_eof()) + return {}; + return LocaleID { language_id.release_value() }; }