From 48ce72e472e1ea5ba01b1ac8dc2bc409840cd23b Mon Sep 17 00:00:00 2001 From: Timothy Flynn Date: Sat, 27 Nov 2021 20:57:21 -0500 Subject: [PATCH] LibUnicode: Parse and generate regional hour cycles Unlike most data in the CLDR, hour cycles are not stored on a per-locale basis. Instead, they are keyed by a string that is usually a region, but sometimes is a locale. Therefore, given a locale, to determine the hour cycles for that locale, we: 1. Check if the locale itself is assigned hour cycles. 2. If the locale has a region, check if that region is assigned hour cycles. 3. Otherwise, maximize that locale, and if the maximized locale has a region, check if that region is assigned hour cycles. 4. If the above all fail, fall back to the "001" region. Further, each locale's default hour cycle is the first assigned hour cycle. --- Meta/CMake/unicode_data.cmake | 2 +- .../GenerateUnicodeDateTimeFormat.cpp | 108 +++++++++++++++++- .../Libraries/LibUnicode/DateTimeFormat.cpp | 68 +++++++++++ .../Libraries/LibUnicode/DateTimeFormat.h | 11 ++ Userland/Libraries/LibUnicode/Forward.h | 1 + 5 files changed, 183 insertions(+), 7 deletions(-) diff --git a/Meta/CMake/unicode_data.cmake b/Meta/CMake/unicode_data.cmake index c041577ef5..794a674d98 100644 --- a/Meta/CMake/unicode_data.cmake +++ b/Meta/CMake/unicode_data.cmake @@ -191,7 +191,7 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD) Lagom::GenerateUnicodeDateTimeFormat "${UNICODE_DATE_TIME_FORMAT_HEADER}" "${UNICODE_DATE_TIME_FORMAT_IMPLEMENTATION}" - arguments -d "${CLDR_DATES_PATH}" + arguments -r "${CLDR_CORE_PATH}" -d "${CLDR_DATES_PATH}" ) invoke_generator( "UnicodeLocale" diff --git a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeDateTimeFormat.cpp b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeDateTimeFormat.cpp index f3fa4be49b..2700571676 100644 --- a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeDateTimeFormat.cpp +++ 
b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeDateTimeFormat.cpp @@ -49,6 +49,9 @@ struct UnicodeLocaleData { UniqueStringStorage unique_strings; HashMap locales; + HashMap> hour_cycles; + Vector hour_cycle_regions; + Vector calendars; Vector calendar_aliases { // FIXME: Aliases should come from BCP47. See: https://unicode-org.atlassian.net/browse/CLDR-15158 @@ -58,6 +61,50 @@ struct UnicodeLocaleData { size_t max_available_formats_size { 0 }; }; +static ErrorOr parse_hour_cycles(String core_path, UnicodeLocaleData& locale_data) +{ + // https://unicode.org/reports/tr35/tr35-dates.html#Time_Data + LexicalPath time_data_path(move(core_path)); + time_data_path = time_data_path.append("supplemental"sv); + time_data_path = time_data_path.append("timeData.json"sv); + + auto time_data_file = TRY(Core::File::open(time_data_path.string(), Core::OpenMode::ReadOnly)); + auto time_data = TRY(JsonValue::from_string(time_data_file->read_all())); + auto const& supplemental_object = time_data.as_object().get("supplemental"sv); + auto const& time_data_object = supplemental_object.as_object().get("timeData"sv); + + auto parse_hour_cycle = [](StringView hour_cycle) -> Optional { + if (hour_cycle == "h"sv) + return Unicode::HourCycle::H12; + if (hour_cycle == "H"sv) + return Unicode::HourCycle::H23; + if (hour_cycle == "K"sv) + return Unicode::HourCycle::H11; + if (hour_cycle == "k"sv) + return Unicode::HourCycle::H24; + return {}; + }; + + time_data_object.as_object().for_each_member([&](auto const& key, JsonValue const& value) { + auto allowed_hour_cycles_string = value.as_object().get("_allowed"sv).as_string(); + auto allowed_hour_cycles = allowed_hour_cycles_string.split_view(' '); + + Vector hour_cycles; + + for (auto allowed_hour_cycle : allowed_hour_cycles) { + if (auto hour_cycle = parse_hour_cycle(allowed_hour_cycle); hour_cycle.has_value()) + hour_cycles.append(*hour_cycle); + } + + locale_data.hour_cycles.set(key, move(hour_cycles)); + + if 
(!locale_data.hour_cycle_regions.contains_slow(key)) + locale_data.hour_cycle_regions.append(key); + }); + + return {}; +}; + static void parse_date_time_pattern(CalendarPattern& format, String pattern, UnicodeLocaleData& locale_data) { // FIXME: This is very incomplete. Similar to NumberFormat, the pattern string will need to be @@ -131,8 +178,9 @@ static ErrorOr parse_calendars(String locale_calendars_path, UnicodeLocale return {}; } -static ErrorOr parse_all_locales(String dates_path, UnicodeLocaleData& locale_data) +static ErrorOr parse_all_locales(String core_path, String dates_path, UnicodeLocaleData& locale_data) { + TRY(parse_hour_cycles(move(core_path), locale_data)); auto dates_iterator = TRY(path_to_dir_iterator(move(dates_path))); auto remove_variants_from_path = [&](String path) -> ErrorOr { @@ -164,9 +212,15 @@ static ErrorOr parse_all_locales(String dates_path, UnicodeLocaleData& loc return {}; } -static String format_identifier(StringView, StringView identifier) +static String format_identifier(StringView owner, String identifier) { - return identifier.to_titlecase_string(); + identifier = identifier.replace("-"sv, "_"sv, true); + + if (all_of(identifier, is_ascii_digit)) + return String::formatted("{}_{}", owner[0], identifier); + if (is_ascii_lower_alpha(identifier[0])) + return String::formatted("{:c}{}", to_ascii_uppercase(identifier[0]), identifier.substring_view(1)); + return identifier; } static void generate_unicode_locale_header(Core::File& file, UnicodeLocaleData& locale_data) @@ -185,11 +239,16 @@ namespace Unicode { )~~~"); generate_enum(generator, format_identifier, "Calendar"sv, {}, locale_data.calendars, locale_data.calendar_aliases); + generate_enum(generator, format_identifier, "HourCycleRegion"sv, {}, locale_data.hour_cycle_regions); generator.append(R"~~~( namespace Detail { Optional calendar_from_string(StringView calendar); + +Optional hour_cycle_region_from_string(StringView hour_cycle_region); +Vector 
get_regional_hour_cycles(StringView region); + Optional get_calendar_date_format(StringView locale, StringView calendar); Optional get_calendar_time_format(StringView locale, StringView calendar); Optional get_calendar_date_time_format(StringView locale, StringView calendar); @@ -313,9 +372,25 @@ static constexpr Array @name@ { {)~~~"); )~~~"); }; - generate_mapping(generator, locale_data.locales, "CalendarData"sv, "s_calendars"sv, "s_calendars_{}", [&](auto const& name, auto const& value) { append_calendars(name, value.calendars); }); + auto append_hour_cycles = [&](String name, auto const& hour_cycles) { + generator.set("name", name); + generator.set("size", String::number(hour_cycles.size())); - auto append_from_string = [&](StringView enum_title, StringView enum_snake, auto const& values, auto const& aliases) { + generator.append(R"~~~( +static constexpr Array @name@ { { )~~~"); + + for (auto hour_cycle : hour_cycles) { + generator.set("hour_cycle", String::number(static_cast(hour_cycle))); + generator.append("@hour_cycle@, "); + } + + generator.append("} };"); + }; + + generate_mapping(generator, locale_data.locales, "CalendarData"sv, "s_calendars"sv, "s_calendars_{}", [&](auto const& name, auto const& value) { append_calendars(name, value.calendars); }); + generate_mapping(generator, locale_data.hour_cycles, "u8"sv, "s_hour_cycles"sv, "s_hour_cycles_{}", [&](auto const& name, auto const& value) { append_hour_cycles(name, value); }); + + auto append_from_string = [&](StringView enum_title, StringView enum_snake, auto const& values, Vector const& aliases = {}) { HashValueMap hashes; hashes.ensure_capacity(values.size()); @@ -328,8 +403,27 @@ static constexpr Array @name@ { {)~~~"); }; append_from_string("Calendar"sv, "calendar"sv, locale_data.calendars, locale_data.calendar_aliases); + append_from_string("HourCycleRegion"sv, "hour_cycle_region"sv, locale_data.hour_cycle_regions); generator.append(R"~~~( +Vector get_regional_hour_cycles(StringView region) +{ + 
auto region_value = hour_cycle_region_from_string(region); + if (!region_value.has_value()) + return {}; + + auto region_index = to_underlying(*region_value); + auto const& regional_hour_cycles = s_hour_cycles.at(region_index); + + Vector hour_cycles; + hour_cycles.ensure_capacity(regional_hour_cycles.size()); + + for (auto hour_cycle : regional_hour_cycles) + hour_cycles.unchecked_append(static_cast(hour_cycle)); + + return hour_cycles; +} + static CalendarData const* find_calendar_data(StringView locale, StringView calendar) { auto locale_value = locale_from_string(locale); @@ -392,11 +486,13 @@ ErrorOr serenity_main(Main::Arguments arguments) { StringView generated_header_path; StringView generated_implementation_path; + StringView core_path; StringView dates_path; Core::ArgsParser args_parser; args_parser.add_option(generated_header_path, "Path to the Unicode locale header file to generate", "generated-header-path", 'h', "generated-header-path"); args_parser.add_option(generated_implementation_path, "Path to the Unicode locale implementation file to generate", "generated-implementation-path", 'c', "generated-implementation-path"); + args_parser.add_option(core_path, "Path to cldr-core directory", "core-path", 'r', "core-path"); args_parser.add_option(dates_path, "Path to cldr-dates directory", "dates-path", 'd', "dates-path"); args_parser.parse(arguments); @@ -413,7 +509,7 @@ ErrorOr serenity_main(Main::Arguments arguments) auto generated_implementation_file = TRY(open_file(generated_implementation_path)); UnicodeLocaleData locale_data; - TRY(parse_all_locales(dates_path, locale_data)); + TRY(parse_all_locales(core_path, dates_path, locale_data)); generate_unicode_locale_header(generated_header_file, locale_data); generate_unicode_locale_implementation(generated_implementation_file, locale_data); diff --git a/Userland/Libraries/LibUnicode/DateTimeFormat.cpp b/Userland/Libraries/LibUnicode/DateTimeFormat.cpp index 2df9b8f2dc..b63908d39f 100644 --- 
a/Userland/Libraries/LibUnicode/DateTimeFormat.cpp +++ b/Userland/Libraries/LibUnicode/DateTimeFormat.cpp @@ -5,6 +5,7 @@ */ #include +#include #if ENABLE_UNICODE_DATA # include @@ -12,6 +13,35 @@ namespace Unicode { +HourCycle hour_cycle_from_string(StringView hour_cycle) +{ + if (hour_cycle == "h11"sv) + return Unicode::HourCycle::H11; + else if (hour_cycle == "h12"sv) + return Unicode::HourCycle::H12; + else if (hour_cycle == "h23"sv) + return Unicode::HourCycle::H23; + else if (hour_cycle == "h24"sv) + return Unicode::HourCycle::H24; + VERIFY_NOT_REACHED(); +} + +StringView hour_cycle_to_string(HourCycle hour_cycle) +{ + switch (hour_cycle) { + case HourCycle::H11: + return "h11"sv; + case HourCycle::H12: + return "h12"sv; + case HourCycle::H23: + return "h23"sv; + case HourCycle::H24: + return "h24"sv; + default: + VERIFY_NOT_REACHED(); + } +} + CalendarPatternStyle calendar_pattern_style_from_string(StringView style) { if (style == "narrow"sv) @@ -45,6 +75,44 @@ StringView calendar_pattern_style_to_string(CalendarPatternStyle style) } } +// https://unicode.org/reports/tr35/tr35-dates.html#Date_Field_Symbol_Table +Vector get_regional_hour_cycles([[maybe_unused]] StringView locale) +{ +#if ENABLE_UNICODE_DATA + if (auto hour_cycles = Detail::get_regional_hour_cycles(locale); !hour_cycles.is_empty()) + return hour_cycles; + + auto return_default_hour_cycles = []() { + auto hour_cycles = Detail::get_regional_hour_cycles("001"sv); + VERIFY(!hour_cycles.is_empty()); + return hour_cycles; + }; + + auto language = parse_unicode_language_id(locale); + if (!language.has_value()) + return return_default_hour_cycles(); + + if (!language->region.has_value()) + language = add_likely_subtags(*language); + if (!language.has_value() || !language->region.has_value()) + return return_default_hour_cycles(); + + if (auto hour_cycles = Detail::get_regional_hour_cycles(*language->region); !hour_cycles.is_empty()) + return hour_cycles; + + return return_default_hour_cycles(); +#else 
+ return {}; +#endif +} + +Optional get_default_regional_hour_cycle(StringView locale) +{ + if (auto hour_cycles = get_regional_hour_cycles(locale); !hour_cycles.is_empty()) + return hour_cycles.first(); + return {}; +} + Optional get_calendar_format([[maybe_unused]] StringView locale, [[maybe_unused]] StringView calendar, [[maybe_unused]] CalendarFormatType type) { #if ENABLE_UNICODE_DATA diff --git a/Userland/Libraries/LibUnicode/DateTimeFormat.h b/Userland/Libraries/LibUnicode/DateTimeFormat.h index 3faa31d0b5..0bb2abb27e 100644 --- a/Userland/Libraries/LibUnicode/DateTimeFormat.h +++ b/Userland/Libraries/LibUnicode/DateTimeFormat.h @@ -15,6 +15,13 @@ namespace Unicode { +enum class HourCycle : u8 { + H11, + H12, + H23, + H24, +}; + enum class CalendarPatternStyle : u8 { Narrow, Short, @@ -54,8 +61,12 @@ struct CalendarFormat { CalendarPattern short_format {}; }; +HourCycle hour_cycle_from_string(StringView hour_cycle); +StringView hour_cycle_to_string(HourCycle hour_cycle); CalendarPatternStyle calendar_pattern_style_from_string(StringView style); StringView calendar_pattern_style_to_string(CalendarPatternStyle style); +Vector get_regional_hour_cycles(StringView locale); +Optional get_default_regional_hour_cycle(StringView locale); Optional get_calendar_format(StringView locale, StringView calendar, CalendarFormatType type); Vector get_calendar_available_formats(StringView locale, StringView calendar); diff --git a/Userland/Libraries/LibUnicode/Forward.h b/Userland/Libraries/LibUnicode/Forward.h index 7db7514e8a..8888508687 100644 --- a/Userland/Libraries/LibUnicode/Forward.h +++ b/Userland/Libraries/LibUnicode/Forward.h @@ -15,6 +15,7 @@ enum class CalendarPatternStyle : u8; enum class CompactNumberFormatType : u8; enum class Condition : u8; enum class GeneralCategory : u8; +enum class HourCycle : u8; enum class Language : u8; enum class ListPatternStyle : u8; enum class ListPatternType : u8;