1
Fork 0
mirror of https://github.com/RGBCube/serenity synced 2025-05-31 06:58:11 +00:00

LibUnicode: Add public methods to compare and lookup Unicode properties

Adds methods to retrieve a Unicode property from a string and to check
if a code point matches a Unicode property.

Also adds a <LibUnicode/Forward.h> header.
This commit is contained in:
Timothy Flynn 2021-07-28 21:45:09 -04:00 committed by Linus Groh
parent 3f80791ed5
commit f1809db994
4 changed files with 92 additions and 9 deletions

View file

@ -4,6 +4,7 @@
* SPDX-License-Identifier: BSD-2-Clause
*/
#include <AK/CharacterTypes.h>
#include <AK/Platform.h>
#include <AK/StringBuilder.h>
#include <AK/Types.h>
@ -12,8 +13,6 @@
#if ENABLE_UNICODE_DATA
# include <LibUnicode/UnicodeData.h>
#else
# include <AK/CharacterTypes.h>
#endif
// For details on the algorithms used here, see Section 3.13 Default Case Algorithms
@ -41,7 +40,7 @@ static bool is_final_code_point(Utf8View const& string, size_t index, size_t byt
size_t cased_letter_count = 0;
for (auto code_point : preceding_view) {
auto unicode_data = unicode_data_for_code_point(code_point);
auto unicode_data = Detail::unicode_data_for_code_point(code_point);
if (!unicode_data.has_value())
return false;
@ -58,7 +57,7 @@ static bool is_final_code_point(Utf8View const& string, size_t index, size_t byt
return false;
for (auto code_point : following_view) {
auto unicode_data = unicode_data_for_code_point(code_point);
auto unicode_data = Detail::unicode_data_for_code_point(code_point);
if (!unicode_data.has_value())
return false;
@ -107,7 +106,7 @@ static SpecialCasing const* find_matching_special_case(Utf8View const& string, s
u32 to_unicode_lowercase(u32 code_point)
{
#if ENABLE_UNICODE_DATA
auto unicode_data = unicode_data_for_code_point(code_point);
auto unicode_data = Detail::unicode_data_for_code_point(code_point);
if (unicode_data.has_value())
return unicode_data->simple_lowercase_mapping;
return code_point;
@ -119,7 +118,7 @@ u32 to_unicode_lowercase(u32 code_point)
u32 to_unicode_uppercase(u32 code_point)
{
#if ENABLE_UNICODE_DATA
auto unicode_data = unicode_data_for_code_point(code_point);
auto unicode_data = Detail::unicode_data_for_code_point(code_point);
if (unicode_data.has_value())
return unicode_data->simple_uppercase_mapping;
return code_point;
@ -139,7 +138,7 @@ String to_unicode_lowercase_full(StringView const& string)
u32 code_point = *it;
size_t byte_length = it.underlying_code_point_length_in_bytes();
auto unicode_data = unicode_data_for_code_point(code_point);
auto unicode_data = Detail::unicode_data_for_code_point(code_point);
if (!unicode_data.has_value()) {
builder.append_code_point(code_point);
index += byte_length;
@ -174,7 +173,7 @@ String to_unicode_uppercase_full(StringView const& string)
u32 code_point = *it;
size_t byte_length = it.underlying_code_point_length_in_bytes();
auto unicode_data = unicode_data_for_code_point(code_point);
auto unicode_data = Detail::unicode_data_for_code_point(code_point);
if (!unicode_data.has_value()) {
builder.append_code_point(code_point);
index += byte_length;
@ -198,4 +197,29 @@ String to_unicode_uppercase_full(StringView const& string)
#endif
}
// Public entry point for resolving a Unicode property from its textual name.
// With Unicode data compiled in, the lookup is delegated to the generated
// Detail::property_from_string(); otherwise no property can be resolved and an
// empty Optional is returned.
Optional<Property> property_from_string([[maybe_unused]] StringView const& property)
{
#if ENABLE_UNICODE_DATA
    auto resolved_property = Detail::property_from_string(property);
    return resolved_property;
#else
    return Optional<Property> {};
#endif
}
// Reports whether `code_point` carries the Unicode property `property`.
//
// Returns false when:
//   - LibUnicode was built without ENABLE_UNICODE_DATA,
//   - `code_point` is rejected by is_unicode(), or
//   - no Unicode data is available for the code point.
// Property::Any matches every value accepted by is_unicode().
bool code_point_has_property([[maybe_unused]] u32 code_point, [[maybe_unused]] Property property)
{
#if ENABLE_UNICODE_DATA
    // Detail::unicode_data_for_code_point() VERIFYs is_unicode(code_point), so
    // reject anything else up front rather than aborting on untrusted input.
    if (!is_unicode(code_point))
        return false;

    if (property == Property::Any)
        return true;

    auto unicode_data = Detail::unicode_data_for_code_point(code_point);
    if (!unicode_data.has_value())
        return false;

    return has_property(*unicode_data, property);
#else
    return false;
#endif
}
}

View file

@ -9,6 +9,7 @@
#include <AK/Forward.h>
#include <AK/String.h>
#include <AK/Types.h>
#include <LibUnicode/Forward.h>
namespace Unicode {
@ -20,4 +21,7 @@ u32 to_unicode_uppercase(u32 code_point);
String to_unicode_lowercase_full(StringView const&);
String to_unicode_uppercase_full(StringView const&);
// Looks up a Unicode property by name. Yields an empty Optional when the name is
// not recognized, or when LibUnicode is built without ENABLE_UNICODE_DATA.
Optional<Property> property_from_string(StringView const&);
// Reports whether the code point carries the given property. Always false when
// LibUnicode is built without ENABLE_UNICODE_DATA.
bool code_point_has_property(u32 code_point, Property property);
}

View file

@ -404,6 +404,7 @@ constexpr @name@ operator|(@name@ value1, @name@ value2)
#include <AK/Optional.h>
#include <AK/Types.h>
#include <LibUnicode/Forward.h>
namespace Unicode {
)~~~");
@ -411,7 +412,7 @@ namespace Unicode {
generate_enum("Locale"sv, "None"sv, move(unicode_data.locales));
generate_enum("Condition"sv, "None"sv, move(unicode_data.conditions));
generate_enum("GeneralCategory"sv, {}, move(unicode_data.general_categories));
generate_enum("Property"sv, "Assigned"sv, unicode_data.prop_list.keys(), move(unicode_data.prop_aliases), true);
generate_enum("Property"sv, "Assigned"sv, unicode_data.prop_list.keys(), unicode_data.prop_aliases, true);
generate_enum("WordBreakProperty"sv, "Other"sv, unicode_data.word_break_prop_list.keys());
generator.append(R"~~~(
@ -469,7 +470,12 @@ struct UnicodeData {
WordBreakProperty word_break_property { WordBreakProperty::Other };
};
namespace Detail {
Optional<UnicodeData> unicode_data_for_code_point(u32 code_point);
Optional<Property> property_from_string(StringView const& property);
}
})~~~");
@ -489,6 +495,7 @@ static void generate_unicode_data_implementation(UnicodeData unicode_data)
#include <AK/Array.h>
#include <AK/CharacterTypes.h>
#include <AK/Find.h>
#include <AK/StringView.h>
#include <LibUnicode/UnicodeData.h>
namespace Unicode {
@ -597,6 +604,8 @@ static Optional<u32> index_of_code_point_in_range(u32 code_point)
return {};
}
namespace Detail {
Optional<UnicodeData> unicode_data_for_code_point(u32 code_point)
{
VERIFY(is_unicode(code_point));
@ -618,6 +627,30 @@ Optional<UnicodeData> unicode_data_for_code_point(u32 code_point)
return {};
}
Optional<Property> property_from_string(StringView const& property)
{
if (property == "Assigned"sv)
return Property::Assigned;)~~~");
for (auto const& property : unicode_data.prop_list) {
generator.set("property", property.key);
generator.append(R"~~~(
if (property == "@property@"sv)
return Property::@property@;)~~~");
}
for (auto const& alias : unicode_data.prop_aliases) {
generator.set("property", alias.alias);
generator.append(R"~~~(
if (property == "@property@"sv)
return Property::@property@;)~~~");
}
generator.append(R"~~~(
return {};
}
}
})~~~");
outln("{}", generator.as_string_view());

View file

@ -0,0 +1,22 @@
/*
* Copyright (c) 2021, Tim Flynn <trflynn89@pm.me>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#pragma once
#include <AK/Types.h>
// Forward declarations of LibUnicode's enums and structs, so that headers can
// mention these types in declarations without including their full definitions.
namespace Unicode {
enum class Condition;
enum class GeneralCategory;
enum class Locale;
// Declared with an explicit u64 underlying type; an opaque enum declaration
// must agree with the underlying type used where the enum is defined.
enum class Property : u64;
enum class WordBreakProperty;
struct SpecialCasing;
struct UnicodeData;
}