From cafb717486eed8e24d9f1d6b0a845b270674562c Mon Sep 17 00:00:00 2001 From: Timothy Flynn Date: Tue, 16 Nov 2021 09:31:15 -0500 Subject: [PATCH] LibUnicode: Parse and generate CLDR unit data for Intl.NumberFormat The units data is in another CLDR package, cldr-units. --- Meta/CMake/unicode_data.cmake | 8 +- .../GenerateUnicodeNumberFormat.cpp | 213 +++++++++++++++++- Userland/Libraries/LibUnicode/Forward.h | 1 + Userland/Libraries/LibUnicode/Locale.cpp | 9 + Userland/Libraries/LibUnicode/Locale.h | 1 + 5 files changed, 228 insertions(+), 4 deletions(-) diff --git a/Meta/CMake/unicode_data.cmake b/Meta/CMake/unicode_data.cmake index 4b4a377b31..08c649b52f 100644 --- a/Meta/CMake/unicode_data.cmake +++ b/Meta/CMake/unicode_data.cmake @@ -58,6 +58,9 @@ set(CLDR_MISC_PATH "${CLDR_PATH}/${CLDR_MISC_SOURCE}") set(CLDR_NUMBERS_SOURCE cldr-numbers-modern) set(CLDR_NUMBERS_PATH "${CLDR_PATH}/${CLDR_NUMBERS_SOURCE}") +set(CLDR_UNITS_SOURCE cldr-units-modern) +set(CLDR_UNITS_PATH "${CLDR_PATH}/${CLDR_UNITS_SOURCE}") + function(remove_unicode_data_if_version_changed version version_file cache_path) set(version_differs YES) @@ -119,6 +122,7 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD) extract_cldr_file("${CLDR_LOCALES_SOURCE}" "${CLDR_LOCALES_PATH}") extract_cldr_file("${CLDR_MISC_SOURCE}" "${CLDR_MISC_PATH}") extract_cldr_file("${CLDR_NUMBERS_SOURCE}" "${CLDR_NUMBERS_PATH}") + extract_cldr_file("${CLDR_UNITS_SOURCE}" "${CLDR_UNITS_PATH}") set(UNICODE_DATA_HEADER LibUnicode/UnicodeData.h) set(UNICODE_DATA_IMPLEMENTATION LibUnicode/UnicodeData.cpp) @@ -170,12 +174,12 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD) add_custom_command( OUTPUT ${UNICODE_NUMBER_FORMAT_HEADER} ${UNICODE_NUMBER_FORMAT_IMPLEMENTATION} - COMMAND $ -h ${UNICODE_NUMBER_FORMAT_HEADER}.tmp -c ${UNICODE_NUMBER_FORMAT_IMPLEMENTATION}.tmp -r ${CLDR_CORE_PATH} -n ${CLDR_NUMBERS_PATH} + COMMAND $ -h ${UNICODE_NUMBER_FORMAT_HEADER}.tmp -c ${UNICODE_NUMBER_FORMAT_IMPLEMENTATION}.tmp -r ${CLDR_CORE_PATH} -n ${CLDR_NUMBERS_PATH} -u ${CLDR_UNITS_PATH} COMMAND "${CMAKE_COMMAND}" -E copy_if_different ${UNICODE_NUMBER_FORMAT_HEADER}.tmp ${UNICODE_NUMBER_FORMAT_HEADER} COMMAND "${CMAKE_COMMAND}" -E copy_if_different ${UNICODE_NUMBER_FORMAT_IMPLEMENTATION}.tmp ${UNICODE_NUMBER_FORMAT_IMPLEMENTATION} COMMAND "${CMAKE_COMMAND}" -E remove ${UNICODE_NUMBER_FORMAT_HEADER}.tmp ${UNICODE_NUMBER_FORMAT_IMPLEMENTATION}.tmp VERBATIM - DEPENDS Lagom::GenerateUnicodeNumberFormat ${CLDR_CORE_PATH} ${CLDR_LOCALES_PATH} ${CLDR_MISC_PATH} ${CLDR_NUMBERS_PATH} + DEPENDS Lagom::GenerateUnicodeNumberFormat ${CLDR_CORE_PATH} ${CLDR_LOCALES_PATH} ${CLDR_MISC_PATH} ${CLDR_NUMBERS_PATH} ${CLDR_UNITS_PATH} ) add_custom_target(generate_${UNICODE_META_TARGET_PREFIX}UnicodeNumberFormat DEPENDS ${UNICODE_NUMBER_FORMAT_HEADER} ${UNICODE_NUMBER_FORMAT_IMPLEMENTATION}) add_dependencies(all_generated generate_${UNICODE_META_TARGET_PREFIX}UnicodeNumberFormat) diff --git a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeNumberFormat.cpp b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeNumberFormat.cpp index 40d6d504cd..12815147bb 100644 --- a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeNumberFormat.cpp +++ b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeNumberFormat.cpp @@ -6,7 +6,9 @@ #include "GeneratorUtil.h" #include +#include #include +#include #include #include #include @@ -80,8 +82,16 @@ struct NumberSystem { NumberFormat scientific_format {}; }; +struct Unit { + StringIndexType unit { 0 }; + Vector long_formats {}; + Vector short_formats {}; + Vector narrow_formats {}; +}; + struct Locale { HashMap number_systems; + HashMap units {}; }; struct UnicodeLocaleData { @@ -341,9 +351,106 @@ static void parse_number_systems(String locale_numbers_path, UnicodeLocaleData& }); } -static void parse_all_locales(String core_path, String numbers_path, UnicodeLocaleData& locale_data) +static void parse_units(String locale_units_path, UnicodeLocaleData& locale_data, Locale& locale) +{ + LexicalPath units_path(move(locale_units_path)); + units_path = units_path.append("units.json"sv); + VERIFY(Core::File::exists(units_path.string())); + + auto units_file_or_error = Core::File::open(units_path.string(), Core::OpenMode::ReadOnly); + VERIFY(!units_file_or_error.is_error()); + + auto units = JsonParser(units_file_or_error.value()->read_all()).parse(); + VERIFY(units.has_value()); + + auto const& main_object = units->as_object().get("main"sv); + auto const& locale_object = main_object.as_object().get(units_path.parent().basename()); + auto const& locale_units_object = locale_object.as_object().get("units"sv); + auto const& long_object = locale_units_object.as_object().get("long"sv); + auto const& short_object = locale_units_object.as_object().get("short"sv); + auto const& narrow_object = locale_units_object.as_object().get("narrow"sv); + + auto ensure_unit = [&](auto const& unit) -> Unit& { + return locale.units.ensure(unit, [&]() { + auto unit_index = locale_data.unique_strings.ensure(unit); + return Unit { .unit = unit_index }; + }); + }; + + auto is_sanctioned_unit = [](StringView unit_name) { + // This is a copy of the units sanctioned for use within ECMA-402. LibUnicode generally tries to + // avoid being directly dependent on ECMA-402, but this rather significantly reduces the amount + // of data generated here, and ECMA-402 is currently the only consumer of this data. + // https://tc39.es/ecma402/#table-sanctioned-simple-unit-identifiers + constexpr auto sanctioned_units = AK::Array { "acre"sv, "bit"sv, "byte"sv, "celsius"sv, "centimeter"sv, "day"sv, "degree"sv, "fahrenheit"sv, "fluid-ounce"sv, "foot"sv, "gallon"sv, "gigabit"sv, "gigabyte"sv, "gram"sv, "hectare"sv, "hour"sv, "inch"sv, "kilobit"sv, "kilobyte"sv, "kilogram"sv, "kilometer"sv, "liter"sv, "megabit"sv, "megabyte"sv, "meter"sv, "mile"sv, "mile-scandinavian"sv, "milliliter"sv, "millimeter"sv, "millisecond"sv, "minute"sv, "month"sv, "ounce"sv, "percent"sv, "petabyte"sv, "pound"sv, "second"sv, "stone"sv, "terabit"sv, "terabyte"sv, "week"sv, "yard"sv, "year"sv }; + return find(sanctioned_units.begin(), sanctioned_units.end(), unit_name) != sanctioned_units.end(); + }; + + auto parse_units_object = [&](auto const& units_object, Unicode::Style style) { + constexpr auto unit_pattern_prefix = "unitPattern-count-"sv; + constexpr auto combined_unit_separator = "-per-"sv; + + units_object.for_each_member([&](auto const& key, JsonValue const& value) { + auto end_of_category = key.find('-'); + if (!end_of_category.has_value()) + return; + + auto unit_name = key.substring(*end_of_category + 1); + + if (!is_sanctioned_unit(unit_name)) { + auto indices = unit_name.find_all(combined_unit_separator); + if (indices.size() != 1) + return; + + auto numerator = unit_name.substring_view(0, indices[0]); + auto denominator = unit_name.substring_view(indices[0] + combined_unit_separator.length()); + if (!is_sanctioned_unit(numerator) || !is_sanctioned_unit(denominator)) + return; + } + + value.as_object().for_each_member([&](auto const& unit_key, JsonValue const& pattern_value) { + if (!unit_key.starts_with(unit_pattern_prefix)) + return; + + auto& unit = ensure_unit(unit_name); + NumberFormat format {}; + + auto plurality = unit_key.substring_view(unit_pattern_prefix.length()); + format.plurality = NumberFormat::plurality_from_string(plurality); + + auto zero_format = pattern_value.as_string().replace("{0}"sv, "{number}"sv); + zero_format = parse_identifiers(zero_format, "unitIdentifier"sv, locale_data, format); + + format.positive_format_index = locale_data.unique_strings.ensure(zero_format.replace("{number}"sv, "{plusSign}{number}"sv)); + format.negative_format_index = locale_data.unique_strings.ensure(zero_format.replace("{number}"sv, "{minusSign}{number}"sv)); + format.zero_format_index = locale_data.unique_strings.ensure(move(zero_format)); + + switch (style) { + case Unicode::Style::Long: + unit.long_formats.append(move(format)); + break; + case Unicode::Style::Short: + unit.short_formats.append(move(format)); + break; + case Unicode::Style::Narrow: + unit.narrow_formats.append(move(format)); + break; + default: + VERIFY_NOT_REACHED(); + } + }); + }); + }; + + parse_units_object(long_object.as_object(), Unicode::Style::Long); + parse_units_object(short_object.as_object(), Unicode::Style::Short); + parse_units_object(narrow_object.as_object(), Unicode::Style::Narrow); +} + +static void parse_all_locales(String core_path, String numbers_path, String units_path, UnicodeLocaleData& locale_data) { auto numbers_iterator = path_to_dir_iterator(move(numbers_path)); + auto units_iterator = path_to_dir_iterator(move(units_path)); auto remove_variants_from_path = [&](String path) -> Optional { auto parsed_locale = CanonicalLanguageID::parse(locale_data.unique_strings, LexicalPath::basename(path)); @@ -372,6 +479,18 @@ static void parse_all_locales(String core_path, String numbers_path, UnicodeLoca parse_number_systems(numbers_path, locale_data, locale); } + while (units_iterator.has_next()) { + auto units_path = units_iterator.next_full_path(); + VERIFY(Core::File::is_directory(units_path)); + + auto language = remove_variants_from_path(units_path); + if (!language.has_value()) + continue; + + auto& locale = locale_data.locales.ensure(*language); + parse_units(units_path, locale_data, locale); + } + parse_default_content_locales(move(core_path), locale_data); } @@ -412,6 +531,7 @@ Optional get_number_system_symbol(StringView locale, StringView syst Optional get_number_system_groupings(StringView locale, StringView system); Optional get_standard_number_system_format(StringView locale, StringView system, StandardNumberFormatType type); Vector get_compact_number_system_formats(StringView locale, StringView system, CompactNumberFormatType type); +Vector get_unit_formats(StringView locale, StringView unit, Style style); Optional numeric_symbol_from_string(StringView numeric_symbol); } @@ -489,6 +609,13 @@ struct NumberSystem { NumberFormat percent_format {}; NumberFormat scientific_format {}; }; + +struct Unit { + @string_index_type@ unit { 0 }; + Span long_formats {}; + Span short_formats {}; + Span narrow_formats {}; +}; )~~~"); auto append_number_format = [&](auto const& number_format) { @@ -593,7 +720,40 @@ static constexpr Array @name@ { {)~~~"); )~~~"); }; + auto append_units = [&](String name, auto const& units) { + auto format_name = [&](String unit, StringView format) { + unit = unit.replace("-"sv, "_"sv, true); + return String::formatted("{}_{}_{}", name, unit, format); + }; + + for (auto const& unit : units) { + append_number_formats(format_name(unit.key, "l"sv), unit.value.long_formats); + append_number_formats(format_name(unit.key, "s"sv), unit.value.short_formats); + append_number_formats(format_name(unit.key, "n"sv), unit.value.narrow_formats); + } + + generator.set("name", name); + generator.set("size", String::number(units.size())); + + generator.append(R"~~~( +static constexpr Array @name@ { {)~~~"); + + for (auto const& unit : units) { + generator.set("unit"sv, String::number(unit.value.unit)); + generator.set("long_formats"sv, format_name(unit.key, "l"sv)); + generator.set("short_formats"sv, format_name(unit.key, "s"sv)); + generator.set("narrow_formats"sv, format_name(unit.key, "n"sv)); + generator.append(R"~~~( + { @unit@, @long_formats@.span(), @short_formats@.span(), @narrow_formats@.span() },)~~~"); + } + + generator.append(R"~~~( +} }; +)~~~"); + }; + generate_mapping(generator, locale_data.locales, "NumberSystem"sv, "s_number_systems"sv, "s_number_systems_{}", [&](auto const& name, auto const& value) { append_number_systems(name, value.number_systems); }); + generate_mapping(generator, locale_data.locales, "Unit"sv, "s_units"sv, "s_units_{}", [&](auto const& name, auto const& value) { append_units(name, value.units); }); auto append_from_string = [&](StringView enum_title, StringView enum_snake, auto const& values) { HashValueMap hashes; @@ -697,6 +857,53 @@ Vector get_compact_number_system_formats(StringView local return formats; } +static Unit const* find_units(StringView locale, StringView unit) +{ + auto locale_value = locale_from_string(locale); + if (!locale_value.has_value()) + return nullptr; + + auto locale_index = to_underlying(*locale_value) - 1; // Subtract 1 because 0 == Locale::None. + auto const& locale_units = s_units.at(locale_index); + + for (auto const& units : locale_units) { + if (unit == s_string_list[units.unit]) + return &units; + }; + + return nullptr; +} + +Vector get_unit_formats(StringView locale, StringView unit, Style style) +{ + Vector formats; + + if (auto const* units = find_units(locale, unit); units != nullptr) { + Span number_formats; + + switch (style) { + case Style::Long: + number_formats = units->long_formats; + break; + case Style::Short: + number_formats = units->short_formats; + break; + case Style::Narrow: + number_formats = units->narrow_formats; + break; + default: + VERIFY_NOT_REACHED(); + } + + formats.ensure_capacity(number_formats.size()); + + for (auto const& number_format : number_formats) + formats.append(number_format.to_unicode_number_format()); + } + + return formats; +} + } )~~~"); @@ -709,12 +916,14 @@ int main(int argc, char** argv) char const* generated_implementation_path = nullptr; char const* core_path = nullptr; char const* numbers_path = nullptr; + char const* units_path = nullptr; Core::ArgsParser args_parser; args_parser.add_option(generated_header_path, "Path to the Unicode locale header file to generate", "generated-header-path", 'h', "generated-header-path"); args_parser.add_option(generated_implementation_path, "Path to the Unicode locale implementation file to generate", "generated-implementation-path", 'c', "generated-implementation-path"); args_parser.add_option(core_path, "Path to cldr-core directory", "core-path", 'r', "core-path"); args_parser.add_option(numbers_path, "Path to cldr-numbers directory", "numbers-path", 'n', "numbers-path"); + args_parser.add_option(units_path, "Path to cldr-units directory", "units-path", 'u', "units-path"); args_parser.parse(argc, argv); auto open_file = [&](StringView path, StringView flags, Core::OpenMode mode = Core::OpenMode::ReadOnly) { @@ -737,7 +946,7 @@ int main(int argc, char** argv) auto generated_implementation_file = open_file(generated_implementation_path, "-c/--generated-implementation-path", Core::OpenMode::ReadWrite); UnicodeLocaleData locale_data; - parse_all_locales(core_path, numbers_path, locale_data); + parse_all_locales(core_path, numbers_path, units_path, locale_data); generate_unicode_locale_header(generated_header_file, locale_data); generate_unicode_locale_implementation(generated_implementation_file, locale_data); diff --git a/Userland/Libraries/LibUnicode/Forward.h b/Userland/Libraries/LibUnicode/Forward.h index b18426ce34..d44859abf9 100644 --- a/Userland/Libraries/LibUnicode/Forward.h +++ b/Userland/Libraries/LibUnicode/Forward.h @@ -20,6 +20,7 @@ enum class Locale : u16; enum class Property : u8; enum class Script : u8; enum class StandardNumberFormatType : u8; +enum class Style : u8; enum class Territory : u8; enum class WordBreakProperty : u8; diff --git a/Userland/Libraries/LibUnicode/Locale.cpp b/Userland/Libraries/LibUnicode/Locale.cpp index f437491b3b..a2a9454e88 100644 --- a/Userland/Libraries/LibUnicode/Locale.cpp +++ b/Userland/Libraries/LibUnicode/Locale.cpp @@ -851,6 +851,15 @@ Optional get_standard_number_system_format([[maybe_unused]] String #endif } +Vector get_unit_formats([[maybe_unused]] StringView locale, [[maybe_unused]] StringView unit, [[maybe_unused]] Style style) +{ +#if ENABLE_UNICODE_DATA + return Detail::get_unit_formats(locale, unit, style); +#else + return {}; +#endif +} + Optional get_locale_list_patterns([[maybe_unused]] StringView locale, [[maybe_unused]] StringView type, [[maybe_unused]] StringView style) { #if ENABLE_UNICODE_DATA diff --git a/Userland/Libraries/LibUnicode/Locale.h b/Userland/Libraries/LibUnicode/Locale.h index 0979e4851f..cfb4821120 100644 --- a/Userland/Libraries/LibUnicode/Locale.h +++ b/Userland/Libraries/LibUnicode/Locale.h @@ -191,6 +191,7 @@ Optional get_number_system_symbol(StringView locale, StringView syst Optional get_number_system_groupings(StringView locale, StringView system); Optional get_standard_number_system_format(StringView locale, StringView system, StandardNumberFormatType type); Vector get_compact_number_system_formats(StringView locale, StringView system, CompactNumberFormatType type); +Vector get_unit_formats(StringView locale, StringView unit, Style style); Optional get_locale_list_patterns(StringView locale, StringView type, StringView style); Optional resolve_language_alias(StringView language);