mirror of
				https://github.com/RGBCube/serenity
				synced 2025-10-31 21:22:46 +00:00 
			
		
		
		
	LibUnicode: Parse compact identifiers and replace them with a format key
For example, in en-US, the decimal, long compact pattern for numbers
between 10,000 and 100,000 is "00 thousand". In that pattern, "thousand"
is the compact identifier, and the generated format pattern is now
"{number} {compactIdentifier}". This also generates that identifier as
its own field in the NumberFormat structure.
			
			
This commit is contained in:
		
							parent
							
								
									1533123263
								
							
						
					
					
						commit
						48d5684780
					
				
					 2 changed files with 54 additions and 8 deletions
				
			
		|  | @ -17,6 +17,7 @@ | ||||||
| #include <AK/SourceGenerator.h> | #include <AK/SourceGenerator.h> | ||||||
| #include <AK/String.h> | #include <AK/String.h> | ||||||
| #include <AK/StringBuilder.h> | #include <AK/StringBuilder.h> | ||||||
|  | #include <AK/Utf8View.h> | ||||||
| #include <LibCore/ArgsParser.h> | #include <LibCore/ArgsParser.h> | ||||||
| #include <LibCore/DirIterator.h> | #include <LibCore/DirIterator.h> | ||||||
| #include <LibCore/File.h> | #include <LibCore/File.h> | ||||||
|  | @ -26,6 +27,11 @@ | ||||||
| using StringIndexType = u16; | using StringIndexType = u16; | ||||||
| constexpr auto s_string_index_type = "u16"sv; | constexpr auto s_string_index_type = "u16"sv; | ||||||
| 
 | 
 | ||||||
|  | enum class NumberFormatType { | ||||||
|  |     Standard, | ||||||
|  |     Compact, | ||||||
|  | }; | ||||||
|  | 
 | ||||||
| struct NumberFormat : public Unicode::NumberFormat { | struct NumberFormat : public Unicode::NumberFormat { | ||||||
|     using Base = Unicode::NumberFormat; |     using Base = Unicode::NumberFormat; | ||||||
| 
 | 
 | ||||||
|  | @ -51,6 +57,7 @@ struct NumberFormat : public Unicode::NumberFormat { | ||||||
|     StringIndexType zero_format_index { 0 }; |     StringIndexType zero_format_index { 0 }; | ||||||
|     StringIndexType positive_format_index { 0 }; |     StringIndexType positive_format_index { 0 }; | ||||||
|     StringIndexType negative_format_index { 0 }; |     StringIndexType negative_format_index { 0 }; | ||||||
|  |     StringIndexType compact_identifier_index { 0 }; | ||||||
| }; | }; | ||||||
| 
 | 
 | ||||||
| struct NumberSystem { | struct NumberSystem { | ||||||
|  | @ -83,7 +90,7 @@ struct UnicodeLocaleData { | ||||||
|     Vector<String> numeric_symbols; |     Vector<String> numeric_symbols; | ||||||
| }; | }; | ||||||
| 
 | 
 | ||||||
| static void parse_number_pattern(String pattern, UnicodeLocaleData& locale_data, NumberFormat& format, NumberSystem* number_system_for_groupings = nullptr) | static void parse_number_pattern(String pattern, UnicodeLocaleData& locale_data, NumberFormatType type, NumberFormat& format, NumberSystem* number_system_for_groupings = nullptr) | ||||||
| { | { | ||||||
|     // https://unicode.org/reports/tr35/tr35-numbers.html#Number_Format_Patterns
 |     // https://unicode.org/reports/tr35/tr35-numbers.html#Number_Format_Patterns
 | ||||||
|     // https://cldr.unicode.org/translation/number-currency-formats/number-and-currency-patterns
 |     // https://cldr.unicode.org/translation/number-currency-formats/number-and-currency-patterns
 | ||||||
|  | @ -138,6 +145,41 @@ static void parse_number_pattern(String pattern, UnicodeLocaleData& locale_data, | ||||||
|                 pattern = pattern.replace("0"sv, "{scientificExponent}"sv); |                 pattern = pattern.replace("0"sv, "{scientificExponent}"sv); | ||||||
|         } |         } | ||||||
| 
 | 
 | ||||||
|  |         if (type == NumberFormatType::Compact) { | ||||||
|  |             static Utf8View whitespace { "\u0020\u00a0"sv }; | ||||||
|  | 
 | ||||||
|  |             Utf8View utf8_pattern { pattern }; | ||||||
|  |             Optional<size_t> start_compact_index; | ||||||
|  |             Optional<size_t> end_compact_index; | ||||||
|  |             bool inside_replacement = false; | ||||||
|  | 
 | ||||||
|  |             for (auto it = utf8_pattern.begin(); it != utf8_pattern.end(); ++it) { | ||||||
|  |                 if (*it == '{') { | ||||||
|  |                     if (start_compact_index.has_value()) { | ||||||
|  |                         end_compact_index = utf8_pattern.byte_offset_of(it); | ||||||
|  |                         break; | ||||||
|  |                     } | ||||||
|  | 
 | ||||||
|  |                     inside_replacement = true; | ||||||
|  |                 } else if (*it == '}') { | ||||||
|  |                     inside_replacement = false; | ||||||
|  |                 } else if (!inside_replacement && !start_compact_index.has_value() && !whitespace.contains(*it)) { | ||||||
|  |                     start_compact_index = utf8_pattern.byte_offset_of(it); | ||||||
|  |                 } | ||||||
|  |             } | ||||||
|  | 
 | ||||||
|  |             if (!start_compact_index.has_value()) | ||||||
|  |                 return pattern; | ||||||
|  | 
 | ||||||
|  |             utf8_pattern = utf8_pattern.substring_view(*start_compact_index, end_compact_index.value_or(pattern.length()) - *start_compact_index); | ||||||
|  |             utf8_pattern = utf8_pattern.trim(whitespace); | ||||||
|  | 
 | ||||||
|  |             auto identifier = utf8_pattern.as_string().replace("'.'"sv, "."sv); | ||||||
|  |             format.compact_identifier_index = locale_data.unique_strings.ensure(move(identifier)); | ||||||
|  | 
 | ||||||
|  |             pattern = pattern.replace(utf8_pattern.as_string(), "{compactIdentifier}"); | ||||||
|  |         } | ||||||
|  | 
 | ||||||
|         return pattern; |         return pattern; | ||||||
|     }; |     }; | ||||||
| 
 | 
 | ||||||
|  | @ -206,7 +248,7 @@ static void parse_number_systems(String locale_numbers_path, UnicodeLocaleData& | ||||||
|             } |             } | ||||||
| 
 | 
 | ||||||
|             format.plurality = NumberFormat::plurality_from_string(split_key[2]); |             format.plurality = NumberFormat::plurality_from_string(split_key[2]); | ||||||
|             parse_number_pattern(value.as_string(), locale_data, format); |             parse_number_pattern(value.as_string(), locale_data, NumberFormatType::Compact, format); | ||||||
| 
 | 
 | ||||||
|             result.append(move(format)); |             result.append(move(format)); | ||||||
|         }); |         }); | ||||||
|  | @ -237,7 +279,7 @@ static void parse_number_systems(String locale_numbers_path, UnicodeLocaleData& | ||||||
|             auto& number_system = ensure_number_system(system); |             auto& number_system = ensure_number_system(system); | ||||||
| 
 | 
 | ||||||
|             auto format_object = value.as_object().get("standard"sv); |             auto format_object = value.as_object().get("standard"sv); | ||||||
|             parse_number_pattern(format_object.as_string(), locale_data, number_system.decimal_format, &number_system); |             parse_number_pattern(format_object.as_string(), locale_data, NumberFormatType::Standard, number_system.decimal_format, &number_system); | ||||||
| 
 | 
 | ||||||
|             auto const& long_format = value.as_object().get("long"sv).as_object().get("decimalFormat"sv); |             auto const& long_format = value.as_object().get("long"sv).as_object().get("decimalFormat"sv); | ||||||
|             number_system.decimal_long_formats = parse_number_format(long_format.as_object()); |             number_system.decimal_long_formats = parse_number_format(long_format.as_object()); | ||||||
|  | @ -249,10 +291,10 @@ static void parse_number_systems(String locale_numbers_path, UnicodeLocaleData& | ||||||
|             auto& number_system = ensure_number_system(system); |             auto& number_system = ensure_number_system(system); | ||||||
| 
 | 
 | ||||||
|             auto format_object = value.as_object().get("standard"sv); |             auto format_object = value.as_object().get("standard"sv); | ||||||
|             parse_number_pattern(format_object.as_string(), locale_data, number_system.currency_format); |             parse_number_pattern(format_object.as_string(), locale_data, NumberFormatType::Standard, number_system.currency_format); | ||||||
| 
 | 
 | ||||||
|             format_object = value.as_object().get("accounting"sv); |             format_object = value.as_object().get("accounting"sv); | ||||||
|             parse_number_pattern(format_object.as_string(), locale_data, number_system.accounting_format); |             parse_number_pattern(format_object.as_string(), locale_data, NumberFormatType::Standard, number_system.accounting_format); | ||||||
| 
 | 
 | ||||||
|             number_system.currency_unit_formats = parse_number_format(value.as_object()); |             number_system.currency_unit_formats = parse_number_format(value.as_object()); | ||||||
| 
 | 
 | ||||||
|  | @ -265,13 +307,13 @@ static void parse_number_systems(String locale_numbers_path, UnicodeLocaleData& | ||||||
|             auto& number_system = ensure_number_system(system); |             auto& number_system = ensure_number_system(system); | ||||||
| 
 | 
 | ||||||
|             auto format_object = value.as_object().get("standard"sv); |             auto format_object = value.as_object().get("standard"sv); | ||||||
|             parse_number_pattern(format_object.as_string(), locale_data, number_system.percent_format); |             parse_number_pattern(format_object.as_string(), locale_data, NumberFormatType::Standard, number_system.percent_format); | ||||||
|         } else if (key.starts_with(scientific_formats_prefix)) { |         } else if (key.starts_with(scientific_formats_prefix)) { | ||||||
|             auto system = key.substring(scientific_formats_prefix.length()); |             auto system = key.substring(scientific_formats_prefix.length()); | ||||||
|             auto& number_system = ensure_number_system(system); |             auto& number_system = ensure_number_system(system); | ||||||
| 
 | 
 | ||||||
|             auto format_object = value.as_object().get("standard"sv); |             auto format_object = value.as_object().get("standard"sv); | ||||||
|             parse_number_pattern(format_object.as_string(), locale_data, number_system.scientific_format); |             parse_number_pattern(format_object.as_string(), locale_data, NumberFormatType::Standard, number_system.scientific_format); | ||||||
|         } |         } | ||||||
|     }); |     }); | ||||||
| } | } | ||||||
|  | @ -387,6 +429,7 @@ struct NumberFormat { | ||||||
|         number_format.zero_format = s_string_list[zero_format]; |         number_format.zero_format = s_string_list[zero_format]; | ||||||
|         number_format.positive_format = s_string_list[positive_format]; |         number_format.positive_format = s_string_list[positive_format]; | ||||||
|         number_format.negative_format = s_string_list[negative_format]; |         number_format.negative_format = s_string_list[negative_format]; | ||||||
|  |         number_format.compact_identifier = s_string_list[compact_identifier]; | ||||||
| 
 | 
 | ||||||
|         return number_format; |         return number_format; | ||||||
|     } |     } | ||||||
|  | @ -397,6 +440,7 @@ struct NumberFormat { | ||||||
|     @string_index_type@ zero_format { 0 }; |     @string_index_type@ zero_format { 0 }; | ||||||
|     @string_index_type@ positive_format { 0 }; |     @string_index_type@ positive_format { 0 }; | ||||||
|     @string_index_type@ negative_format { 0 }; |     @string_index_type@ negative_format { 0 }; | ||||||
|  |     @string_index_type@ compact_identifier { 0 }; | ||||||
| }; | }; | ||||||
| 
 | 
 | ||||||
| struct NumberSystem { | struct NumberSystem { | ||||||
|  | @ -427,7 +471,8 @@ struct NumberSystem { | ||||||
|         generator.set("zero_format"sv, String::number(number_format.zero_format_index)); |         generator.set("zero_format"sv, String::number(number_format.zero_format_index)); | ||||||
|         generator.set("positive_format"sv, String::number(number_format.positive_format_index)); |         generator.set("positive_format"sv, String::number(number_format.positive_format_index)); | ||||||
|         generator.set("negative_format"sv, String::number(number_format.negative_format_index)); |         generator.set("negative_format"sv, String::number(number_format.negative_format_index)); | ||||||
|         generator.append("{ @magnitude@, @compact_scale@, @plurality@, @zero_format@, @positive_format@, @negative_format@ },"); |         generator.set("compact_identifier"sv, String::number(number_format.compact_identifier_index)); | ||||||
|  |         generator.append("{ @magnitude@, @compact_scale@, @plurality@, @zero_format@, @positive_format@, @negative_format@, @compact_identifier@ },"); | ||||||
|     }; |     }; | ||||||
| 
 | 
 | ||||||
|     auto append_number_formats = [&](String name, auto const& number_formats) { |     auto append_number_formats = [&](String name, auto const& number_formats) { | ||||||
|  |  | ||||||
|  | @ -122,6 +122,7 @@ struct NumberFormat { | ||||||
|     StringView zero_format {}; |     StringView zero_format {}; | ||||||
|     StringView positive_format {}; |     StringView positive_format {}; | ||||||
|     StringView negative_format {}; |     StringView negative_format {}; | ||||||
|  |     StringView compact_identifier {}; | ||||||
| }; | }; | ||||||
| 
 | 
 | ||||||
| struct ListPatterns { | struct ListPatterns { | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Timothy Flynn
						Timothy Flynn