1
Fork 0
mirror of https://github.com/RGBCube/serenity synced 2025-05-31 13:38:11 +00:00

Meta: Do not hard-code index types for UCD/CLDR/TZDB code generators

Hand-picking the smallest index type that fits a particular generated
array started with commit 3ad159537e. This
was to reduce the size of the generated library.

Since then, the number of types using UniqueStorage has grown a ton,
creating a long list of types for which index types are manually picked.
When a new UCD/CLDR/TZDB is released, and the current index type no
longer fits the generated data, we fail to generate. Tracking down which
index caused the failure is a pretty annoying process.

Instead, we can just use size_t within the generators themselves, then
automatically pick the smallest index type that fits when emitting the generated code.
This commit is contained in:
Timothy Flynn 2022-11-18 11:04:33 -05:00 committed by Linus Groh
parent fa2579ffa9
commit b2164ad979
9 changed files with 268 additions and 379 deletions

View file

@ -15,11 +15,8 @@
#include <LibCore/Stream.h>
#include <LibUnicode/Emoji.h>
using StringIndexType = u16;
constexpr auto s_string_index_type = "u16"sv;
struct Emoji {
StringIndexType name { 0 };
size_t name { 0 };
Optional<String> image_path;
Unicode::EmojiGroup group;
String subgroup;
@ -31,7 +28,7 @@ struct Emoji {
};
struct EmojiData {
UniqueStringStorage<StringIndexType> unique_strings;
UniqueStringStorage unique_strings;
Vector<Emoji> emojis;
};
@ -180,7 +177,7 @@ static ErrorOr<void> generate_emoji_data_implementation(Core::Stream::BufferedFi
StringBuilder builder;
SourceGenerator generator { builder };
generator.set("string_index_type"sv, s_string_index_type);
generator.set("string_index_type"sv, emoji_data.unique_strings.type_that_fits());
generator.set("emojis_size"sv, String::number(emoji_data.emojis.size()));
generator.append(R"~~~(

View file

@ -20,9 +20,6 @@
#include <LibCore/ArgsParser.h>
#include <LibCore/Stream.h>
using StringIndexType = u16;
constexpr auto s_string_index_type = "u16"sv;
// Some code points are excluded from UnicodeData.txt, and instead are part of a "range" of code
// points, as indicated by the "name" field. For example:
// 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
@ -74,7 +71,7 @@ using NormalizationProps = HashMap<String, Vector<Normalization>>;
struct CodePointName {
CodePointRange code_point_range;
StringIndexType name { 0 };
size_t name { 0 };
};
// UnicodeData source: https://www.unicode.org/Public/13.0.0/ucd/UnicodeData.txt
@ -83,7 +80,7 @@ struct CodePointName {
struct CodePointData {
u32 code_point { 0 };
String name;
Optional<StringIndexType> abbreviation;
Optional<size_t> abbreviation;
u8 canonical_combining_class { 0 };
String bidi_class;
Optional<CodePointDecomposition> decomposition_mapping;
@ -101,11 +98,11 @@ struct CodePointData {
struct BlockName {
CodePointRange code_point_range;
StringIndexType name { 0 };
size_t name { 0 };
};
struct UnicodeData {
UniqueStringStorage<StringIndexType> unique_strings;
UniqueStringStorage unique_strings;
u32 code_points_with_non_zero_combining_class { 0 };
@ -125,8 +122,8 @@ struct UnicodeData {
Vector<CodePointData> code_point_data;
HashMap<u32, StringIndexType> code_point_abbreviations;
HashMap<u32, StringIndexType> code_point_display_name_aliases;
HashMap<u32, size_t> code_point_abbreviations;
HashMap<u32, size_t> code_point_display_name_aliases;
Vector<CodePointName> code_point_display_names;
PropList general_categories;
@ -795,7 +792,7 @@ static ErrorOr<void> generate_unicode_data_implementation(Core::Stream::Buffered
StringBuilder builder;
SourceGenerator generator { builder };
generator.set("string_index_type"sv, s_string_index_type);
generator.set("string_index_type"sv, unicode_data.unique_strings.type_that_fits());
generator.set("largest_special_casing_size", String::number(unicode_data.largest_special_casing_size));
generator.set("special_casing_size", String::number(unicode_data.special_casing.size()));
@ -947,7 +944,7 @@ static constexpr Array<@mapping_type@, @size@> s_@name@_mappings { {
generator.set("code_point", String::formatted("{:#x}", data.code_point));
generator.append("{ @code_point@");
if constexpr (IsSame<decltype(mapping), Optional<u32>> || IsSame<decltype(mapping), Optional<StringIndexType>>) {
if constexpr (IsSame<decltype(mapping), Optional<u32>> || IsSame<decltype(mapping), Optional<size_t>>) {
generator.set("mapping", String::formatted("{:#x}", *mapping));
generator.append(", @mapping@ },");
} else if constexpr (IsSame<decltype(mapping), Optional<CodePointDecomposition>>) {

View file

@ -51,10 +51,10 @@ struct AK::Traits<Vector<T>> : public GenericTraits<Vector<T>> {
}
};
template<typename StorageType, typename IndexType>
template<typename StorageType>
class UniqueStorage {
public:
IndexType ensure(StorageType value)
size_t ensure(StorageType value)
{
// We maintain a set of unique values in two structures: a vector which stores the values in
// the order they are added, and a hash map which maps that value to its index in the vector.
@ -68,17 +68,14 @@ public:
return *index;
m_storage.append(move(value));
size_t index = m_storage.size();
VERIFY(index < NumericLimits<IndexType>::max());
auto storage_index = static_cast<IndexType>(index);
auto storage_index = m_storage.size();
m_storage_indices.set(m_storage.last(), storage_index);
return storage_index;
}
StorageType const& get(IndexType index) const
StorageType const& get(size_t index) const
{
if (index == 0) {
static StorageType empty {};
@ -89,6 +86,17 @@ public:
return m_storage.at(index - 1);
}
// Returns the name of the narrowest unsigned integer type ("u8", "u16", "u32" or "u64") whose
// maximum value is at least the number of values currently held in m_storage. The result is a
// type name as text, meant to be substituted into generated source.
StringView type_that_fits() const
{
    auto const stored_count = m_storage.size();

    // Check from the widest type downwards, so each guard rules out every narrower candidate.
    if (stored_count > NumericLimits<u32>::max())
        return "u64"sv;
    if (stored_count > NumericLimits<u16>::max())
        return "u32"sv;
    if (stored_count > NumericLimits<u8>::max())
        return "u16"sv;

    return "u8"sv;
}
void generate(SourceGenerator& generator, StringView type, StringView name, size_t max_values_per_row) requires(!StorageTypeIsList<StorageType>)
{
generator.set("type"sv, type);
@ -177,13 +185,12 @@ static constexpr Array<Span<@type@ const>, @size@ + 1> @name@ { {
// clang-format gets confused by the requires() clauses above, and formats this section very weirdly.
protected:
Vector<StorageType> m_storage;
HashMap<StorageType, IndexType> m_storage_indices;
HashMap<StorageType, size_t> m_storage_indices;
// clang-format on
};
template<typename StringIndexType>
class UniqueStringStorage : public UniqueStorage<String, StringIndexType> {
using Base = UniqueStorage<String, StringIndexType>;
class UniqueStringStorage : public UniqueStorage<String> {
using Base = UniqueStorage<String>;
public:
// The goal of the string table generator is to ensure the table is located within the read-only
@ -275,9 +282,8 @@ struct Alias {
String alias;
};
template<typename StringIndexType>
struct CanonicalLanguageID {
static ErrorOr<CanonicalLanguageID> parse(UniqueStringStorage<StringIndexType>& unique_strings, StringView language)
static ErrorOr<CanonicalLanguageID> parse(UniqueStringStorage& unique_strings, StringView language)
{
CanonicalLanguageID language_id {};
@ -314,10 +320,10 @@ struct CanonicalLanguageID {
return language_id;
}
StringIndexType language { 0 };
StringIndexType script { 0 };
StringIndexType region { 0 };
Vector<StringIndexType> variants {};
size_t language { 0 };
size_t script { 0 };
size_t region { 0 };
Vector<size_t> variants {};
};
inline ErrorOr<NonnullOwnPtr<Core::Stream::BufferedFile>> open_file(StringView path, Core::Stream::OpenMode mode)