From 04b8b87c17ba21d3969c0e5c52dc1b01eddd8b80 Mon Sep 17 00:00:00 2001 From: Timothy Flynn Date: Tue, 16 Nov 2021 13:53:45 -0500 Subject: [PATCH] LibJS+LibUnicode: Support multiple identifiers within format pattern This wasn't the case for compact patterns, but unit patterns can contain multiple (up to 2, really) identifiers that must each be recognized by LibJS. Each generated NumberFormat object now stores an array of identifiers parsed. The format pattern itself is encoded with the index into this array for that identifier, e.g. the compact format string "0K" will become "{number}{compactIdentifier:0}". --- .../GenerateUnicodeNumberFormat.cpp | 103 +++++++++++------- .../LibJS/Runtime/Intl/NumberFormat.cpp | 7 +- Userland/Libraries/LibUnicode/Locale.h | 2 +- 3 files changed, 72 insertions(+), 40 deletions(-) diff --git a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeNumberFormat.cpp b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeNumberFormat.cpp index de68f3ab01..a1b20e404b 100644 --- a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeNumberFormat.cpp +++ b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeNumberFormat.cpp @@ -57,7 +57,7 @@ struct NumberFormat : public Unicode::NumberFormat { StringIndexType zero_format_index { 0 }; StringIndexType positive_format_index { 0 }; StringIndexType negative_format_index { 0 }; - StringIndexType identifier_index { 0 }; + Vector identifier_indices {}; }; struct NumberSystem { @@ -88,46 +88,61 @@ struct UnicodeLocaleData { UniqueStringStorage unique_strings; HashMap locales; Vector numeric_symbols; + size_t max_identifier_count { 0 }; }; -static String parse_identifier(String pattern, StringView replacement, UnicodeLocaleData& locale_data, NumberFormat& format) +static String parse_identifiers(String pattern, StringView replacement, UnicodeLocaleData& locale_data, NumberFormat& format) { static Utf8View whitespace { "\u0020\u00a0"sv }; - Utf8View utf8_pattern { pattern }; - Optional start_index; - Optional end_index; - bool inside_replacement = false; + while (true) { + Utf8View utf8_pattern { pattern }; + Optional start_index; + Optional end_index; + bool inside_replacement = false; - for (auto it = utf8_pattern.begin(); it != utf8_pattern.end(); ++it) { - if (*it == '{') { - if (start_index.has_value()) { - end_index = utf8_pattern.byte_offset_of(it); - break; + for (auto it = utf8_pattern.begin(); it != utf8_pattern.end(); ++it) { + if (*it == '{') { + if (start_index.has_value()) { + end_index = utf8_pattern.byte_offset_of(it); + break; + } + + inside_replacement = true; + } else if (*it == '}') { + inside_replacement = false; + } else if (!inside_replacement && !start_index.has_value() && !whitespace.contains(*it)) { + start_index = utf8_pattern.byte_offset_of(it); } - - inside_replacement = true; - } else if (*it == '}') { - inside_replacement = false; - } else if (!inside_replacement && !start_index.has_value() && !whitespace.contains(*it)) { - start_index = utf8_pattern.byte_offset_of(it); } + + if (!start_index.has_value()) + return pattern; + + end_index = end_index.value_or(pattern.length()); + + utf8_pattern = utf8_pattern.substring_view(*start_index, *end_index - *start_index); + utf8_pattern = utf8_pattern.trim(whitespace); + + auto identifier = utf8_pattern.as_string().replace("'.'"sv, "."sv); + auto identifier_index = locale_data.unique_strings.ensure(move(identifier)); + size_t replacement_index = 0; + + if (auto index = format.identifier_indices.find_first_index(identifier_index); index.has_value()) { + replacement_index = *index; + } else { + replacement_index = format.identifier_indices.size(); + format.identifier_indices.append(identifier_index); + + locale_data.max_identifier_count = max(locale_data.max_identifier_count, format.identifier_indices.size()); + } + + pattern = String::formatted("{}{{{}:{}}}{}", + *start_index > 0 ? pattern.substring_view(0, *start_index) : ""sv, + replacement, + replacement_index, + pattern.substring_view(*start_index + utf8_pattern.byte_length())); } - - if (!start_index.has_value()) - return pattern; - end_index = end_index.value_or(pattern.length()); - - utf8_pattern = utf8_pattern.substring_view(*start_index, *end_index - *start_index); - utf8_pattern = utf8_pattern.trim(whitespace); - - auto identifier = utf8_pattern.as_string().replace("'.'"sv, "."sv); - format.identifier_index = locale_data.unique_strings.ensure(move(identifier)); - - return String::formatted("{}{}{}", - *start_index > 0 ? pattern.substring_view(0, *start_index) : ""sv, - replacement, - pattern.substring_view(*start_index + utf8_pattern.byte_length())); } static void parse_number_pattern(Vector patterns, UnicodeLocaleData& locale_data, NumberFormatType type, NumberFormat& format, NumberSystem* number_system_for_groupings = nullptr) @@ -188,7 +203,7 @@ static void parse_number_pattern(Vector patterns, UnicodeLocaleData& loc } if (type == NumberFormatType::Compact) - return parse_identifier(move(pattern), "{compactIdentifier}"sv, locale_data, format); + return parse_identifiers(move(pattern), "compactIdentifier"sv, locale_data, format); return pattern; }; @@ -413,6 +428,7 @@ static void generate_unicode_locale_implementation(Core::File& file, UnicodeLoca SourceGenerator generator { builder }; generator.set("string_index_type"sv, s_string_index_type); generator.set("numeric_symbols_size", String::number(locale_data.numeric_symbols.size())); + generator.set("identifier_count", String::number(locale_data.max_identifier_count)); generator.append(R"~~~( #include @@ -437,7 +453,10 @@ struct NumberFormat { number_format.zero_format = s_string_list[zero_format]; number_format.positive_format = s_string_list[positive_format]; number_format.negative_format = s_string_list[negative_format]; - number_format.identifier = s_string_list[identifier]; + + number_format.identifiers.ensure_capacity(identifiers.size()); + for (@string_index_type@ identifier : identifiers) + number_format.identifiers.append(s_string_list[identifier]); return number_format; } @@ -448,7 +467,7 @@ struct NumberFormat { @string_index_type@ zero_format { 0 }; @string_index_type@ positive_format { 0 }; @string_index_type@ negative_format { 0 }; - @string_index_type@ identifier { 0 }; + Array<@string_index_type@, @identifier_count@> identifiers {}; }; struct NumberSystem { @@ -479,8 +498,18 @@ struct NumberSystem { generator.set("zero_format"sv, String::number(number_format.zero_format_index)); generator.set("positive_format"sv, String::number(number_format.positive_format_index)); generator.set("negative_format"sv, String::number(number_format.negative_format_index)); - generator.set("identifier"sv, String::number(number_format.identifier_index)); - generator.append("{ @magnitude@, @exponent@, @plurality@, @zero_format@, @positive_format@, @negative_format@, @identifier@ },"); + generator.append("{ @magnitude@, @exponent@, @plurality@, @zero_format@, @positive_format@, @negative_format@, { "); + + bool first = true; + for (auto identifier_index : number_format.identifier_indices) { + if (!first) + generator.append(", "); + + generator.append(String::number(identifier_index)); + first = false; + } + + generator.append(" } },"); }; auto append_number_formats = [&](String name, auto const& number_formats) { diff --git a/Userland/Libraries/LibJS/Runtime/Intl/NumberFormat.cpp b/Userland/Libraries/LibJS/Runtime/Intl/NumberFormat.cpp index f2bfecefd9..9373105fa4 100644 --- a/Userland/Libraries/LibJS/Runtime/Intl/NumberFormat.cpp +++ b/Userland/Libraries/LibJS/Runtime/Intl/NumberFormat.cpp @@ -962,11 +962,14 @@ Vector partition_notation_sub_pattern(NumberFormat& number_for } // iv. Else if p is equal to "compactSymbol", then // v. Else if p is equal to "compactName", then - else if (part == "compactIdentifier"sv) { + else if (part.starts_with("compactIdentifier:"sv)) { // Note: Our implementation combines "compactSymbol" and "compactName" into one field, "compactIdentifier". + auto identifier_index = part.substring_view("compactIdentifier:"sv.length()).to_uint(); + VERIFY(identifier_index.has_value()); + // 1. Let compactSymbol be an ILD string representing exponent in short form, which may depend on x in languages having different plural forms. The implementation must be able to provide this string, or else the pattern would not have a "{compactSymbol}" placeholder. - auto compact_identifier = number_format.compact_format().identifier; + auto compact_identifier = number_format.compact_format().identifiers[*identifier_index]; // 2. Append a new Record { [[Type]]: "compact", [[Value]]: compactSymbol } as the last element of result. result.append({ "compact"sv, compact_identifier }); diff --git a/Userland/Libraries/LibUnicode/Locale.h b/Userland/Libraries/LibUnicode/Locale.h index a82856b3e6..0979e4851f 100644 --- a/Userland/Libraries/LibUnicode/Locale.h +++ b/Userland/Libraries/LibUnicode/Locale.h @@ -122,7 +122,7 @@ struct NumberFormat { StringView zero_format {}; StringView positive_format {}; StringView negative_format {}; - StringView identifier {}; + Vector identifiers {}; }; struct ListPatterns {