Meta: Do not hard-code index types for UCD/CLDR/TZDB code generators

Hand-picking the smallest index type that fits a particular generated array started with commit 3ad159537e, in order to reduce the size of the generated library. Since then, the number of types using UniqueStorage has grown considerably, creating a long list of types whose index types must be picked manually. When a new UCD/CLDR/TZDB version is released and the current index type no longer fits the generated data, code generation fails, and tracking down which index type caused the failure is a tedious process. Instead, the generators can simply use size_t internally, then automatically pick the smallest index type that fits when emitting the generated code.
2025-07-24 21:57:35 +00:00 · 2022-11-18 11:04:33 -05:00 · 2022-11-18 11:04:33 -05:00 · b2164ad979
commit b2164ad979
parent fa2579ffa9
9 changed files with 268 additions and 379 deletions
--- a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateEmojiData.cpp
+++ b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateEmojiData.cpp
@ -15,11 +15,8 @@
 #include <LibCore/Stream.h>
 #include <LibUnicode/Emoji.h>

-using StringIndexType = u16;
-constexpr auto s_string_index_type = "u16"sv;
-
 struct Emoji {
-    StringIndexType name { 0 };
+    size_t name { 0 };
    Optional<String> image_path;
    Unicode::EmojiGroup group;
    String subgroup;
@ -31,7 +28,7 @@ struct Emoji {
 };

 struct EmojiData {
-    UniqueStringStorage<StringIndexType> unique_strings;
+    UniqueStringStorage unique_strings;
    Vector<Emoji> emojis;
 };

@ -180,7 +177,7 @@ static ErrorOr<void> generate_emoji_data_implementation(Core::Stream::BufferedFi
    StringBuilder builder;
    SourceGenerator generator { builder };

-    generator.set("string_index_type"sv, s_string_index_type);
+    generator.set("string_index_type"sv, emoji_data.unique_strings.type_that_fits());
    generator.set("emojis_size"sv, String::number(emoji_data.emojis.size()));

    generator.append(R"~~~(
--- a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp
+++ b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp
@ -20,9 +20,6 @@
 #include <LibCore/ArgsParser.h>
 #include <LibCore/Stream.h>

-using StringIndexType = u16;
-constexpr auto s_string_index_type = "u16"sv;
-
 // Some code points are excluded from UnicodeData.txt, and instead are part of a "range" of code
 // points, as indicated by the "name" field. For example:
 //     3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
@ -74,7 +71,7 @@ using NormalizationProps = HashMap<String, Vector<Normalization>>;

 struct CodePointName {
    CodePointRange code_point_range;
-    StringIndexType name { 0 };
+    size_t name { 0 };
 };

 // UnicodeData source: https://www.unicode.org/Public/13.0.0/ucd/UnicodeData.txt
@ -83,7 +80,7 @@ struct CodePointName {
 struct CodePointData {
    u32 code_point { 0 };
    String name;
-    Optional<StringIndexType> abbreviation;
+    Optional<size_t> abbreviation;
    u8 canonical_combining_class { 0 };
    String bidi_class;
    Optional<CodePointDecomposition> decomposition_mapping;
@ -101,11 +98,11 @@ struct CodePointData {

 struct BlockName {
    CodePointRange code_point_range;
-    StringIndexType name { 0 };
+    size_t name { 0 };
 };

 struct UnicodeData {
-    UniqueStringStorage<StringIndexType> unique_strings;
+    UniqueStringStorage unique_strings;

    u32 code_points_with_non_zero_combining_class { 0 };

@ -125,8 +122,8 @@ struct UnicodeData {

    Vector<CodePointData> code_point_data;

-    HashMap<u32, StringIndexType> code_point_abbreviations;
-    HashMap<u32, StringIndexType> code_point_display_name_aliases;
+    HashMap<u32, size_t> code_point_abbreviations;
+    HashMap<u32, size_t> code_point_display_name_aliases;
    Vector<CodePointName> code_point_display_names;

    PropList general_categories;
@ -795,7 +792,7 @@ static ErrorOr<void> generate_unicode_data_implementation(Core::Stream::Buffered
    StringBuilder builder;
    SourceGenerator generator { builder };

-    generator.set("string_index_type"sv, s_string_index_type);
+    generator.set("string_index_type"sv, unicode_data.unique_strings.type_that_fits());
    generator.set("largest_special_casing_size", String::number(unicode_data.largest_special_casing_size));
    generator.set("special_casing_size", String::number(unicode_data.special_casing.size()));

@ -947,7 +944,7 @@ static constexpr Array<@mapping_type@, @size@> s_@name@_mappings { {
            generator.set("code_point", String::formatted("{:#x}", data.code_point));
            generator.append("{ @code_point@");

-            if constexpr (IsSame<decltype(mapping), Optional<u32>> || IsSame<decltype(mapping), Optional<StringIndexType>>) {
+            if constexpr (IsSame<decltype(mapping), Optional<u32>> || IsSame<decltype(mapping), Optional<size_t>>) {
                generator.set("mapping", String::formatted("{:#x}", *mapping));
                generator.append(", @mapping@ },");
            } else if constexpr (IsSame<decltype(mapping), Optional<CodePointDecomposition>>) {
--- a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GeneratorUtil.h
+++ b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GeneratorUtil.h
@ -51,10 +51,10 @@ struct AK::Traits<Vector<T>> : public GenericTraits<Vector<T>> {
    }
 };

-template<typename StorageType, typename IndexType>
+template<typename StorageType>
 class UniqueStorage {
 public:
-    IndexType ensure(StorageType value)
+    size_t ensure(StorageType value)
    {
        // We maintain a set of unique values in two structures: a vector which stores the values in
        // the order they are added, and a hash map which maps that value to its index in the vector.
@ -68,17 +68,14 @@ public:
            return *index;

        m_storage.append(move(value));
-        size_t index = m_storage.size();

-        VERIFY(index < NumericLimits<IndexType>::max());
-
-        auto storage_index = static_cast<IndexType>(index);
+        auto storage_index = m_storage.size();
        m_storage_indices.set(m_storage.last(), storage_index);

        return storage_index;
    }

-    StorageType const& get(IndexType index) const
+    StorageType const& get(size_t index) const
    {
        if (index == 0) {
            static StorageType empty {};
@ -89,6 +86,17 @@ public:
        return m_storage.at(index - 1);
    }

+    StringView type_that_fits() const
+    {
+        if (m_storage.size() <= NumericLimits<u8>::max())
+            return "u8"sv;
+        if (m_storage.size() <= NumericLimits<u16>::max())
+            return "u16"sv;
+        if (m_storage.size() <= NumericLimits<u32>::max())
+            return "u32"sv;
+        return "u64"sv;
+    }
+
    void generate(SourceGenerator& generator, StringView type, StringView name, size_t max_values_per_row) requires(!StorageTypeIsList<StorageType>)
    {
        generator.set("type"sv, type);
@ -177,13 +185,12 @@ static constexpr Array<Span<@type@ const>, @size@ + 1> @name@ { {
    // clang-format gets confused by the requires() clauses above, and formats this section very weirdly.
 protected:
    Vector<StorageType> m_storage;
-    HashMap<StorageType, IndexType> m_storage_indices;
+    HashMap<StorageType, size_t> m_storage_indices;
    // clang-format on
 };

-template<typename StringIndexType>
-class UniqueStringStorage : public UniqueStorage<String, StringIndexType> {
-    using Base = UniqueStorage<String, StringIndexType>;
+class UniqueStringStorage : public UniqueStorage<String> {
+    using Base = UniqueStorage<String>;

 public:
    // The goal of the string table generator is to ensure the table is located within the read-only
@ -275,9 +282,8 @@ struct Alias {
    String alias;
 };

-template<typename StringIndexType>
 struct CanonicalLanguageID {
-    static ErrorOr<CanonicalLanguageID> parse(UniqueStringStorage<StringIndexType>& unique_strings, StringView language)
+    static ErrorOr<CanonicalLanguageID> parse(UniqueStringStorage& unique_strings, StringView language)
    {
        CanonicalLanguageID language_id {};

@ -314,10 +320,10 @@ struct CanonicalLanguageID {
        return language_id;
    }

-    StringIndexType language { 0 };
-    StringIndexType script { 0 };
-    StringIndexType region { 0 };
-    Vector<StringIndexType> variants {};
+    size_t language { 0 };
+    size_t script { 0 };
+    size_t region { 0 };
+    Vector<size_t> variants {};
 };

 inline ErrorOr<NonnullOwnPtr<Core::Stream::BufferedFile>> open_file(StringView path, Core::Stream::OpenMode mode)