From becec3578f0ec967126f5329e50d975c24d65529 Mon Sep 17 00:00:00 2001 From: Timothy Flynn Date: Mon, 15 Aug 2022 13:01:42 -0400 Subject: [PATCH] LibTimeZone+LibUnicode: Generate string data with run-length encoding Currently, the unique string lists are stored in the initialized data sections of their shared libraries. In order to move the data to the read-only section, generate the strings using RLE arrays. We generate two arrays: the first is the RLE data itself, the second is a list of indices into the RLE array for each string. We then generate a decoding method to convert an RLE string to a StringView. --- .../LibTimeZone/GenerateTimeZoneData.cpp | 6 +- .../GenerateUnicodeDateTimeFormat.cpp | 38 ++++----- .../LibUnicode/GenerateUnicodeLocale.cpp | 54 ++++++------ .../GenerateUnicodeNumberFormat.cpp | 12 +-- .../GenerateUnicodeRelativeTimeFormat.cpp | 4 +- .../CodeGenerators/LibUnicode/GeneratorUtil.h | 83 ++++++++++++++++++- 6 files changed, 138 insertions(+), 59 deletions(-) diff --git a/Meta/Lagom/Tools/CodeGenerators/LibTimeZone/GenerateTimeZoneData.cpp b/Meta/Lagom/Tools/CodeGenerators/LibTimeZone/GenerateTimeZoneData.cpp index 07e0ac578b..750b1ef983 100644 --- a/Meta/Lagom/Tools/CodeGenerators/LibTimeZone/GenerateTimeZoneData.cpp +++ b/Meta/Lagom/Tools/CodeGenerators/LibTimeZone/GenerateTimeZoneData.cpp @@ -724,8 +724,8 @@ Optional> get_named_time_zone_offsets(TimeZone time_zone, auto format_name = [](auto format, auto offset) -> String { if (offset == 0) - return s_string_list[format].replace("{}"sv, ""sv, ReplaceMode::FirstOnly); - return String::formatted(s_string_list[format], s_string_list[offset]); + return decode_string(format).replace("{}"sv, ""sv, ReplaceMode::FirstOnly); + return String::formatted(decode_string(format), decode_string(offset)); }; auto set_named_offset = [&](auto& named_offset, auto dst_offset, auto in_dst, auto format, auto offset) { @@ -776,7 +776,7 @@ Vector time_zones_in_region(StringView region) time_zones.ensure_capacity(regional_time_zones.size()); for (auto time_zone : regional_time_zones) - time_zones.unchecked_append(s_string_list[time_zone]); + time_zones.unchecked_append(decode_string(time_zone)); return time_zones; } diff --git a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeDateTimeFormat.cpp b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeDateTimeFormat.cpp index 841cc976f8..d6ed3cac34 100644 --- a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeDateTimeFormat.cpp +++ b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeDateTimeFormat.cpp @@ -1811,10 +1811,10 @@ struct CalendarPatternImpl { CalendarPattern to_unicode_calendar_pattern() const { CalendarPattern calendar_pattern {}; - calendar_pattern.skeleton = s_string_list[skeleton]; - calendar_pattern.pattern = s_string_list[pattern]; + calendar_pattern.skeleton = decode_string(skeleton); + calendar_pattern.pattern = decode_string(pattern); if (pattern12 != 0) - calendar_pattern.pattern12 = s_string_list[pattern12]; + calendar_pattern.pattern12 = decode_string(pattern12); convert_calendar_fields(*this, calendar_pattern); return calendar_pattern; @@ -1843,9 +1843,9 @@ struct CalendarRangePatternImpl { if (field != -1) calendar_range_pattern.field = static_cast(field); - calendar_range_pattern.start_range = s_string_list[start_range]; - calendar_range_pattern.separator = s_string_list[separator]; - calendar_range_pattern.end_range = s_string_list[end_range]; + calendar_range_pattern.start_range = decode_string(start_range); + calendar_range_pattern.separator = decode_string(separator); + calendar_range_pattern.end_range = decode_string(end_range); convert_calendar_fields(*this, calendar_range_pattern); return calendar_range_pattern; @@ -1929,12 +1929,12 @@ struct TimeZoneFormatImpl { TimeZoneFormat to_time_zone_format() const { TimeZoneFormat time_zone_format {}; - time_zone_format.symbol_ahead_sign = s_string_list[symbol_ahead_sign]; - time_zone_format.symbol_ahead_separator = s_string_list[symbol_ahead_separator]; - time_zone_format.symbol_behind_sign = s_string_list[symbol_behind_sign]; - time_zone_format.symbol_behind_separator = s_string_list[symbol_behind_separator]; - time_zone_format.gmt_format = s_string_list[gmt_format]; - time_zone_format.gmt_zero_format = s_string_list[gmt_zero_format]; + time_zone_format.symbol_ahead_sign = decode_string(symbol_ahead_sign); + time_zone_format.symbol_ahead_separator = decode_string(symbol_ahead_separator); + time_zone_format.symbol_behind_sign = decode_string(symbol_behind_sign); + time_zone_format.symbol_behind_separator = decode_string(symbol_behind_separator); + time_zone_format.gmt_format = decode_string(gmt_format); + time_zone_format.gmt_zero_format = decode_string(gmt_zero_format); return time_zone_format; } @@ -2200,7 +2200,7 @@ Vector get_calendar_range_formats(StringView locale, Strin for (auto format : range_formats) { auto const& pattern = s_calendar_range_patterns[format]; - if (skeleton == s_string_list[pattern.skeleton]) + if (skeleton == decode_string(pattern.skeleton)) result.append(pattern.to_unicode_calendar_range_pattern()); } } @@ -2218,7 +2218,7 @@ Vector get_calendar_range12_formats(StringView locale, Str for (auto format : range12_formats) { auto const& pattern = s_calendar_range_patterns[format]; - if (skeleton == s_string_list[pattern.skeleton]) + if (skeleton == decode_string(pattern.skeleton)) result.append(pattern.to_unicode_calendar_range_pattern()); } } @@ -2263,7 +2263,7 @@ Optional get_calendar_era_symbol(StringView locale, StringView calen if (auto value_index = to_underlying(value); value_index < symbols.size()) { if (auto symbol_index = symbols.at(value_index); symbol_index != 0) - return s_string_list[symbol_index]; + return decode_string(symbol_index); } return {}; @@ -2275,7 +2275,7 @@ Optional get_calendar_month_symbol(StringView locale, StringView cal if (auto value_index = to_underlying(value); value_index < symbols.size()) { if (auto symbol_index = symbols.at(value_index); symbol_index != 0) - return s_string_list[symbol_index]; + return decode_string(symbol_index); } return {}; @@ -2287,7 +2287,7 @@ Optional get_calendar_weekday_symbol(StringView locale, StringView c if (auto value_index = to_underlying(value); value_index < symbols.size()) { if (auto symbol_index = symbols.at(value_index); symbol_index != 0) - return s_string_list[symbol_index]; + return decode_string(symbol_index); } return {}; @@ -2299,7 +2299,7 @@ Optional get_calendar_day_period_symbol(StringView locale, StringVie if (auto value_index = to_underlying(value); value_index < symbols.size()) { if (auto symbol_index = symbols.at(value_index); symbol_index != 0) - return s_string_list[symbol_index]; + return decode_string(symbol_index); } return {}; @@ -2400,7 +2400,7 @@ Optional get_time_zone_name(StringView locale, StringView time_zone, } if (name_index != 0) - return s_string_list[name_index]; + return decode_string(name_index); } return {}; diff --git a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeLocale.cpp b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeLocale.cpp index 366c066386..3eee8dc1e4 100644 --- a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeLocale.cpp +++ b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeLocale.cpp @@ -1117,8 +1117,8 @@ struct DisplayPatternImpl { DisplayPattern to_display_pattern() const { DisplayPattern display_patterns {}; - display_patterns.locale_pattern = s_string_list[locale_pattern]; - display_patterns.locale_separator = s_string_list[locale_separator]; + display_patterns.locale_pattern = decode_string(locale_pattern); + display_patterns.locale_separator = decode_string(locale_separator); return display_patterns; } @@ -1266,13 +1266,13 @@ struct CanonicalLanguageID { LanguageID language_id {}; language_id.variants.ensure_capacity(variants_size); - language_id.language = s_string_list[language]; + language_id.language = decode_string(language); if (script != 0) - language_id.script = s_string_list[script]; + language_id.script = decode_string(script); if (region != 0) - language_id.region = s_string_list[region]; + language_id.region = decode_string(region); for (size_t i = 0; i < variants_size; ++i) - language_id.variants.append(s_string_list[variants[i]]); + language_id.variants.append(decode_string(variants[i])); return language_id; } @@ -1284,7 +1284,7 @@ struct CanonicalLanguageID { return false; for (size_t i = 0; i < variants_size; ++i) { - if (s_string_list[variants[i]] != other_variants[i]) + if (decode_string(variants[i]) != other_variants[i]) return false; } @@ -1415,9 +1415,9 @@ static LanguageMapping const* resolve_likely_subtag(LanguageID const& language_i } for (auto const& map : s_likely_subtags) { - auto const& key_language = s_string_list[map.key.language]; - auto const& key_script = s_string_list[map.key.script]; - auto const& key_region = s_string_list[map.key.region]; + auto const& key_language = decode_string(map.key.language); + auto const& key_script = decode_string(map.key.script); + auto const& key_region = decode_string(map.key.region); if (key_language != search_key.language) continue; @@ -1463,7 +1463,7 @@ Optional get_locale_@enum_snake@_mapping(StringView locale, StringVi auto const& mappings = @unique_list@.at(mapping_index); auto @enum_snake@_string_index = mappings.at(@enum_snake@_index); - auto @enum_snake@_mapping = s_string_list.at(@enum_snake@_string_index); + auto @enum_snake@_mapping = decode_string(@enum_snake@_string_index); if (@enum_snake@_mapping.is_empty()) return {}; @@ -1493,7 +1493,7 @@ Optional get_locale_@enum_snake@_mapping(StringView locale, StringVi ValueFromStringOptions options {}; options.return_type = "StringView"sv; - options.return_format = "s_string_list[{}]"sv; + options.return_format = "decode_string({})"sv; generate_value_from_string(generator, "resolve_{}_alias"sv, s_string_index_type, enum_snake, move(hashes), options); }; @@ -1606,7 +1606,7 @@ Optional get_preferred_keyword_value_for_locale(StringView locale, S if (keyword_indices.is_empty()) return {}; - return s_string_list[keyword_indices[0]]; + return decode_string(keyword_indices[0]); } Vector get_keywords_for_locale(StringView locale, StringView key) @@ -1636,7 +1636,7 @@ Vector get_keywords_for_locale(StringView locale, StringView key) keywords.ensure_capacity(keyword_indices.size()); for (auto keyword : keyword_indices) - keywords.unchecked_append(s_string_list[keyword]); + keywords.unchecked_append(decode_string(keyword)); return keywords; } @@ -1673,10 +1673,10 @@ Optional get_locale_list_patterns(StringView locale, StringView li auto const& list_patterns = s_list_patterns.at(list_patterns_index); if ((list_patterns.type == type_value) && (list_patterns.style == list_pattern_style)) { - auto const& start = s_string_list[list_patterns.start]; - auto const& middle = s_string_list[list_patterns.middle]; - auto const& end = s_string_list[list_patterns.end]; - auto const& pair = s_string_list[list_patterns.pair]; + auto const& start = decode_string(list_patterns.start); + auto const& middle = decode_string(list_patterns.middle); + auto const& end = decode_string(list_patterns.end); + auto const& pair = decode_string(list_patterns.pair); return ListPatterns { start, middle, end, pair }; } @@ -1707,9 +1707,9 @@ Optional character_order_for_locale(StringView locale) void resolve_complex_language_aliases(LanguageID& language_id) { for (auto const& map : s_complex_alias) { - auto const& key_language = s_string_list[map.key.language]; - auto const& key_script = s_string_list[map.key.script]; - auto const& key_region = s_string_list[map.key.region]; + auto const& key_language = decode_string(map.key.language); + auto const& key_script = decode_string(map.key.script); + auto const& key_region = decode_string(map.key.region); if ((key_language != language_id.language) && (key_language != "und"sv)) continue; @@ -1745,12 +1745,12 @@ Optional add_likely_subtags(LanguageID const& language_id) auto maximized = language_id; - auto const& key_script = s_string_list[likely_subtag->key.script]; - auto const& key_region = s_string_list[likely_subtag->key.region]; + auto const& key_script = decode_string(likely_subtag->key.script); + auto const& key_region = decode_string(likely_subtag->key.region); - auto const& alias_language = s_string_list[likely_subtag->alias.language]; - auto const& alias_script = s_string_list[likely_subtag->alias.script]; - auto const& alias_region = s_string_list[likely_subtag->alias.region]; + auto const& alias_language = decode_string(likely_subtag->alias.language); + auto const& alias_script = decode_string(likely_subtag->alias.script); + auto const& alias_region = decode_string(likely_subtag->alias.region); if (maximized.language == "und"sv) maximized.language = alias_language; @@ -1765,7 +1765,7 @@ Optional add_likely_subtags(LanguageID const& language_id) Optional resolve_most_likely_territory(LanguageID const& language_id) { if (auto const* likely_subtag = resolve_likely_subtag(language_id); likely_subtag != nullptr) - return s_string_list[likely_subtag->alias.region]; + return decode_string(likely_subtag->alias.region); return {}; } diff --git a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeNumberFormat.cpp b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeNumberFormat.cpp index 506c514887..76243ce0df 100644 --- a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeNumberFormat.cpp +++ b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeNumberFormat.cpp @@ -825,13 +825,13 @@ struct NumberFormatImpl { number_format.magnitude = magnitude; number_format.exponent = exponent; number_format.plurality = static_cast(plurality); - number_format.zero_format = s_string_list[zero_format]; - number_format.positive_format = s_string_list[positive_format]; - number_format.negative_format = s_string_list[negative_format]; + number_format.zero_format = decode_string(zero_format); + number_format.positive_format = decode_string(positive_format); + number_format.negative_format = decode_string(negative_format); number_format.identifiers.ensure_capacity(identifiers.size()); for (@string_index_type@ identifier : identifiers) - number_format.identifiers.append(s_string_list[identifier]); + number_format.identifiers.append(decode_string(identifier)); return number_format; } @@ -996,7 +996,7 @@ Optional get_number_system_symbol(StringView locale, StringView syst if (symbol_index >= symbols.size()) return {}; - return s_string_list[symbols[symbol_index]]; + return decode_string(symbols[symbol_index]); } return {}; @@ -1088,7 +1088,7 @@ static Unit const* find_units(StringView locale, StringView unit) for (auto unit_index : locale_units) { auto const& units = s_units.at(unit_index); - if (unit == s_string_list[units.unit]) + if (unit == decode_string(units.unit)) return &units; }; diff --git a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeRelativeTimeFormat.cpp b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeRelativeTimeFormat.cpp index 12bd97c26e..de9087d59b 100644 --- a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeRelativeTimeFormat.cpp +++ b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeRelativeTimeFormat.cpp @@ -218,7 +218,7 @@ struct RelativeTimeFormatImpl { { RelativeTimeFormat relative_time_format {}; relative_time_format.plurality = plurality; - relative_time_format.pattern = s_string_list[pattern]; + relative_time_format.pattern = decode_string(pattern); return relative_time_format; } @@ -271,7 +271,7 @@ Vector get_relative_time_format_patterns(StringView locale, continue; if (locale_format.style != style) continue; - if (s_string_list[locale_format.tense_or_number] != tense_or_number) + if (decode_string(locale_format.tense_or_number) != tense_or_number) continue; formats.append(locale_format.to_relative_time_format()); diff --git a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GeneratorUtil.h b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GeneratorUtil.h index a0888b531c..aaf68a14b4 100644 --- a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GeneratorUtil.h +++ b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GeneratorUtil.h @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -174,7 +175,7 @@ static constexpr Array, @size@ + 1> @name@ { { // clang-format off // clang-format gets confused by the requires() clauses above, and formats this section very weirdly. -private: +protected: Vector m_storage; HashMap m_storage_indices; // clang-format on @@ -185,9 +186,87 @@ class UniqueStringStorage : public UniqueStorage { using Base = UniqueStorage; public: + // The goal of the string table generator is to ensure the table is located within the read-only + // section of the shared library. If StringViews are generated directly, the table will be located + // in the initialized data section. So instead, we generate run-length encoded (RLE) arrays to + // represent the strings. void generate(SourceGenerator& generator) { - Base::generate(generator, "StringView"sv, "s_string_list"sv, 40); + constexpr size_t max_values_per_row = 300; + size_t values_in_current_row = 0; + + auto append_hex_value = [&](auto value) { + if (values_in_current_row++ > 0) + generator.append(", "); + + generator.append(String::formatted("{:#x}", value)); + + if (values_in_current_row == max_values_per_row) { + values_in_current_row = 0; + generator.append(",\n "); + } + }; + + Vector string_indices; + string_indices.ensure_capacity(Base::m_storage.size()); + u32 next_index { 0 }; + + for (auto const& string : Base::m_storage) { + // Ensure the string length may be encoded as two u8s. + VERIFY(string.length() <= NumericLimits::max()); + + string_indices.unchecked_append(next_index); + next_index += string.length() + 2; + } + + generator.set("size", String::number(next_index)); + generator.append(R"~~~( +static constexpr Array s_encoded_strings { { + )~~~"); + + for (auto const& string : Base::m_storage) { + auto length = string.length(); + append_hex_value((length & 0xff00) >> 8); + append_hex_value(length & 0x00ff); + + for (auto ch : string) + append_hex_value(static_cast(ch)); + } + + generator.append(R"~~~( +} }; +)~~~"); + + generator.set("size", String::number(string_indices.size())); + generator.append(R"~~~( +static constexpr Array s_encoded_string_indices { { + )~~~"); + + values_in_current_row = 0; + for (auto index : string_indices) + append_hex_value(index); + + generator.append(R"~~~( +} }; + +static constexpr StringView decode_string(size_t index) +{ + if (index == 0) + return {}; + + index = s_encoded_string_indices[index - 1]; + + auto length_high = s_encoded_strings[index]; + auto length_low = s_encoded_strings[index + 1]; + + size_t length = (length_high << 8) | length_low; + if (length == 0) + return {}; + + auto const* start = &s_encoded_strings[index + 2]; + return { reinterpret_cast(start), length }; +} +)~~~"); } };