1
Fork 0
mirror of https://github.com/RGBCube/serenity synced 2025-05-14 18:44:59 +00:00

LibUnicode: Parse number system digits from the CLDR

We had a hard-coded table of number system digits copied from ECMA-402.
Turns out these digits are in the CLDR, so let's parse the digits from
there instead of hard-coding them.
This commit is contained in:
Timothy Flynn 2022-01-11 18:42:07 -05:00 committed by Linus Groh
parent 05ed8d1738
commit c5138f0f2b
5 changed files with 112 additions and 86 deletions

View file

@ -268,10 +268,47 @@ struct UnicodeLocaleData {
UniqueStorage<NumberSystem, NumberSystemIndexType> unique_systems;
UniqueStorage<Unit, UnitIndexType> unique_units;
HashMap<String, Array<u32, 10>> number_system_digits;
Vector<String> number_systems;
HashMap<String, Locale> locales;
size_t max_identifier_count { 0 };
};
static ErrorOr<void> parse_number_system_digits(String core_supplemental_path, UnicodeLocaleData& locale_data)
{
LexicalPath number_systems_path(move(core_supplemental_path));
number_systems_path = number_systems_path.append("numberingSystems.json"sv);
auto number_systems_file = TRY(Core::File::open(number_systems_path.string(), Core::OpenMode::ReadOnly));
auto number_systems = TRY(JsonValue::from_string(number_systems_file->read_all()));
auto const& supplemental_object = number_systems.as_object().get("supplemental"sv);
auto const& number_systems_object = supplemental_object.as_object().get("numberingSystems"sv);
number_systems_object.as_object().for_each_member([&](auto const& number_system, auto const& digits_object) {
auto type = digits_object.as_object().get("_type"sv).as_string();
if (type != "numeric"sv)
return;
auto digits = digits_object.as_object().get("_digits"sv).as_string();
Utf8View utf8_digits { digits };
VERIFY(utf8_digits.length() == 10);
auto& number_system_digits = locale_data.number_system_digits.ensure(number_system);
size_t index = 0;
for (u32 digit : utf8_digits)
number_system_digits[index++] = digit;
if (!locale_data.number_systems.contains_slow(number_system))
locale_data.number_systems.append(number_system);
});
return {};
}
static String parse_identifiers(String pattern, StringView replacement, UnicodeLocaleData& locale_data, NumberFormat& format)
{
static Utf8View whitespace { "\u0020\u00a0\u200f"sv };
@ -671,11 +708,17 @@ static ErrorOr<void> parse_units(String locale_units_path, UnicodeLocaleData& lo
return {};
}
static ErrorOr<void> parse_all_locales(String numbers_path, String units_path, UnicodeLocaleData& locale_data)
static ErrorOr<void> parse_all_locales(String core_path, String numbers_path, String units_path, UnicodeLocaleData& locale_data)
{
auto numbers_iterator = TRY(path_to_dir_iterator(move(numbers_path)));
auto units_iterator = TRY(path_to_dir_iterator(move(units_path)));
LexicalPath core_supplemental_path(move(core_path));
core_supplemental_path = core_supplemental_path.append("supplemental"sv);
VERIFY(Core::File::is_directory(core_supplemental_path.string()));
TRY(parse_number_system_digits(core_supplemental_path.string(), locale_data));
auto remove_variants_from_path = [&](String path) -> ErrorOr<String> {
auto parsed_locale = TRY(CanonicalLanguageID<StringIndexType>::parse(locale_data.unique_strings, LexicalPath::basename(path)));
@ -708,14 +751,28 @@ static ErrorOr<void> parse_all_locales(String numbers_path, String units_path, U
return {};
}
static void generate_unicode_locale_header(Core::File& file, UnicodeLocaleData&)
static String format_identifier(StringView, String identifier)
{
return identifier.to_titlecase();
}
static void generate_unicode_locale_header(Core::File& file, UnicodeLocaleData& locale_data)
{
StringBuilder builder;
SourceGenerator generator { builder };
// FIXME: Update unicode_data.cmake to not require a header.
generator.append(R"~~~(
#include <AK/Types.h>
#pragma once
namespace Unicode {
)~~~");
generate_enum(generator, format_identifier, "NumberSystem"sv, {}, locale_data.number_systems);
generator.append(R"~~~(
}
)~~~");
VERIFY(file.write(generator.as_string_view()));
@ -775,7 +832,7 @@ struct NumberFormatImpl {
Array<@string_index_type@, @identifier_count@> identifiers {};
};
struct NumberSystem {
struct NumberSystemData {
@string_index_type@ system { 0 };
@numeric_symbol_list_index_type@ symbols { 0 };
@ -806,7 +863,7 @@ struct Unit {
locale_data.unique_formats.generate(generator, "NumberFormatImpl"sv, "s_number_formats"sv, 10);
locale_data.unique_format_lists.generate(generator, s_number_format_index_type, "s_number_format_lists"sv);
locale_data.unique_symbols.generate(generator, s_string_index_type, "s_numeric_symbol_lists"sv);
locale_data.unique_systems.generate(generator, "NumberSystem"sv, "s_number_systems"sv, 10);
locale_data.unique_systems.generate(generator, "NumberSystemData"sv, "s_number_systems"sv, 10);
locale_data.unique_units.generate(generator, "Unit"sv, "s_units"sv, 10);
auto append_map = [&](String name, auto type, auto const& map) {
@ -820,18 +877,44 @@ static constexpr Array<@type@, @size@> @name@ { {)~~~");
bool first = true;
for (auto const& item : map) {
generator.append(first ? " " : ", ");
generator.append(String::number(item.value));
if constexpr (requires { item.value; })
generator.append(String::number(item.value));
else
generator.append(String::number(item));
first = false;
}
generator.append(" } };");
};
generate_mapping(generator, locale_data.number_system_digits, "u32"sv, "s_number_systems_digits"sv, "s_number_systems_digits_{}", nullptr, [&](auto const& name, auto const& value) { append_map(name, "u32"sv, value); });
generate_mapping(generator, locale_data.locales, s_number_system_index_type, "s_locale_number_systems"sv, "s_number_systems_{}", nullptr, [&](auto const& name, auto const& value) { append_map(name, s_number_system_index_type, value.number_systems); });
generate_mapping(generator, locale_data.locales, s_unit_index_type, "s_locale_units"sv, "s_units_{}", nullptr, [&](auto const& name, auto const& value) { append_map(name, s_unit_index_type, value.units); });
auto append_from_string = [&](StringView enum_title, StringView enum_snake, auto const& values) {
HashValueMap<String> hashes;
hashes.ensure_capacity(values.size());
for (auto const& value : values)
hashes.set(value.hash(), format_identifier(enum_title, value));
generate_value_from_string(generator, "{}_from_string"sv, enum_title, enum_snake, move(hashes));
};
append_from_string("NumberSystem"sv, "number_system"sv, locale_data.number_systems);
generator.append(R"~~~(
static NumberSystem const* find_number_system(StringView locale, StringView system)
Optional<Span<u32 const>> get_digits_for_number_system(StringView system)
{
auto number_system_value = number_system_from_string(system);
if (!number_system_value.has_value())
return {};
auto number_system_index = to_underlying(*number_system_value);
return s_number_systems_digits[number_system_index];
}
static NumberSystemData const* find_number_system(StringView locale, StringView system)
{
auto locale_value = locale_from_string(locale);
if (!locale_value.has_value())
@ -991,14 +1074,16 @@ Vector<NumberFormat> get_unit_formats(StringView locale, StringView unit, Style
ErrorOr<int> serenity_main(Main::Arguments arguments)
{
StringView generated_header_path = nullptr;
StringView generated_implementation_path = nullptr;
StringView numbers_path = nullptr;
StringView units_path = nullptr;
StringView generated_header_path;
StringView generated_implementation_path;
StringView core_path;
StringView numbers_path;
StringView units_path;
Core::ArgsParser args_parser;
args_parser.add_option(generated_header_path, "Path to the Unicode locale header file to generate", "generated-header-path", 'h', "generated-header-path");
args_parser.add_option(generated_implementation_path, "Path to the Unicode locale implementation file to generate", "generated-implementation-path", 'c', "generated-implementation-path");
args_parser.add_option(core_path, "Path to cldr-core directory", "core-path", 'r', "core-path");
args_parser.add_option(numbers_path, "Path to cldr-numbers directory", "numbers-path", 'n', "numbers-path");
args_parser.add_option(units_path, "Path to cldr-units directory", "units-path", 'u', "units-path");
args_parser.parse(arguments);
@ -1016,7 +1101,7 @@ ErrorOr<int> serenity_main(Main::Arguments arguments)
auto generated_implementation_file = TRY(open_file(generated_implementation_path));
UnicodeLocaleData locale_data;
TRY(parse_all_locales(numbers_path, units_path, locale_data));
TRY(parse_all_locales(core_path, numbers_path, units_path, locale_data));
generate_unicode_locale_header(generated_header_file, locale_data);
generate_unicode_locale_implementation(generated_implementation_file, locale_data);