mirror of
https://github.com/RGBCube/serenity
synced 2025-05-31 08:48:11 +00:00
LibUnicode: Add public methods to compare and lookup Unicode properties
Adds methods to retrieve a Unicode property from a string and to check if a code point matches a Unicode property. Also adds a <LibUnicode/Forward.h> header.
This commit is contained in:
parent
3f80791ed5
commit
f1809db994
4 changed files with 92 additions and 9 deletions
|
@ -4,6 +4,7 @@
|
|||
* SPDX-License-Identifier: BSD-2-Clause
|
||||
*/
|
||||
|
||||
#include <AK/CharacterTypes.h>
|
||||
#include <AK/Platform.h>
|
||||
#include <AK/StringBuilder.h>
|
||||
#include <AK/Types.h>
|
||||
|
@ -12,8 +13,6 @@
|
|||
|
||||
#if ENABLE_UNICODE_DATA
|
||||
# include <LibUnicode/UnicodeData.h>
|
||||
#else
|
||||
# include <AK/CharacterTypes.h>
|
||||
#endif
|
||||
|
||||
// For details on the algorithms used here, see Section 3.13 Default Case Algorithms
|
||||
|
@ -41,7 +40,7 @@ static bool is_final_code_point(Utf8View const& string, size_t index, size_t byt
|
|||
size_t cased_letter_count = 0;
|
||||
|
||||
for (auto code_point : preceding_view) {
|
||||
auto unicode_data = unicode_data_for_code_point(code_point);
|
||||
auto unicode_data = Detail::unicode_data_for_code_point(code_point);
|
||||
if (!unicode_data.has_value())
|
||||
return false;
|
||||
|
||||
|
@ -58,7 +57,7 @@ static bool is_final_code_point(Utf8View const& string, size_t index, size_t byt
|
|||
return false;
|
||||
|
||||
for (auto code_point : following_view) {
|
||||
auto unicode_data = unicode_data_for_code_point(code_point);
|
||||
auto unicode_data = Detail::unicode_data_for_code_point(code_point);
|
||||
if (!unicode_data.has_value())
|
||||
return false;
|
||||
|
||||
|
@ -107,7 +106,7 @@ static SpecialCasing const* find_matching_special_case(Utf8View const& string, s
|
|||
u32 to_unicode_lowercase(u32 code_point)
|
||||
{
|
||||
#if ENABLE_UNICODE_DATA
|
||||
auto unicode_data = unicode_data_for_code_point(code_point);
|
||||
auto unicode_data = Detail::unicode_data_for_code_point(code_point);
|
||||
if (unicode_data.has_value())
|
||||
return unicode_data->simple_lowercase_mapping;
|
||||
return code_point;
|
||||
|
@ -119,7 +118,7 @@ u32 to_unicode_lowercase(u32 code_point)
|
|||
u32 to_unicode_uppercase(u32 code_point)
|
||||
{
|
||||
#if ENABLE_UNICODE_DATA
|
||||
auto unicode_data = unicode_data_for_code_point(code_point);
|
||||
auto unicode_data = Detail::unicode_data_for_code_point(code_point);
|
||||
if (unicode_data.has_value())
|
||||
return unicode_data->simple_uppercase_mapping;
|
||||
return code_point;
|
||||
|
@ -139,7 +138,7 @@ String to_unicode_lowercase_full(StringView const& string)
|
|||
u32 code_point = *it;
|
||||
size_t byte_length = it.underlying_code_point_length_in_bytes();
|
||||
|
||||
auto unicode_data = unicode_data_for_code_point(code_point);
|
||||
auto unicode_data = Detail::unicode_data_for_code_point(code_point);
|
||||
if (!unicode_data.has_value()) {
|
||||
builder.append_code_point(code_point);
|
||||
index += byte_length;
|
||||
|
@ -174,7 +173,7 @@ String to_unicode_uppercase_full(StringView const& string)
|
|||
u32 code_point = *it;
|
||||
size_t byte_length = it.underlying_code_point_length_in_bytes();
|
||||
|
||||
auto unicode_data = unicode_data_for_code_point(code_point);
|
||||
auto unicode_data = Detail::unicode_data_for_code_point(code_point);
|
||||
if (!unicode_data.has_value()) {
|
||||
builder.append_code_point(code_point);
|
||||
index += byte_length;
|
||||
|
@ -198,4 +197,29 @@ String to_unicode_uppercase_full(StringView const& string)
|
|||
#endif
|
||||
}
|
||||
|
||||
// Public lookup of a Unicode property by name.
// When ENABLE_UNICODE_DATA is set, the call is forwarded unchanged to
// Detail::property_from_string(), whose definition is emitted by the Unicode data generator.
// Otherwise an empty Optional is returned for every input; the parameter is marked
// [[maybe_unused]] because that branch never reads it.
Optional<Property> property_from_string([[maybe_unused]] StringView const& property)
|
||||
{
|
||||
#if ENABLE_UNICODE_DATA
|
||||
return Detail::property_from_string(property);
|
||||
#else
|
||||
return {};
|
||||
#endif
|
||||
}
|
||||
|
||||
// Reports whether `code_point` matches the Unicode property `property`.
// With ENABLE_UNICODE_DATA set:
//   - Property::Any is decided by is_unicode(code_point) alone, without a data lookup.
//   - Any other property needs a data entry for the code point; if the lookup yields no
//     entry the result is false, otherwise has_property() decides.
// Without ENABLE_UNICODE_DATA the result is always false, which is why both parameters
// are marked [[maybe_unused]].
// NOTE(review): in this commit the generated Detail::unicode_data_for_code_point() starts with
// VERIFY(is_unicode(code_point)), so a value that is not a valid code point combined with a
// property other than Any appears to hit that assertion instead of returning false —
// confirm whether callers are expected to pass only valid code points.
bool code_point_has_property([[maybe_unused]] u32 code_point, [[maybe_unused]] Property property)
|
||||
{
|
||||
#if ENABLE_UNICODE_DATA
|
||||
if (property == Property::Any)
|
||||
return is_unicode(code_point);
|
||||
|
||||
auto unicode_data = Detail::unicode_data_for_code_point(code_point);
|
||||
if (!unicode_data.has_value())
|
||||
return false;
|
||||
|
||||
return has_property(*unicode_data, property);
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -9,6 +9,7 @@
|
|||
#include <AK/Forward.h>
|
||||
#include <AK/String.h>
|
||||
#include <AK/Types.h>
|
||||
#include <LibUnicode/Forward.h>
|
||||
|
||||
namespace Unicode {
|
||||
|
||||
|
@ -20,4 +21,7 @@ u32 to_unicode_uppercase(u32 code_point);
|
|||
String to_unicode_lowercase_full(StringView const&);
|
||||
String to_unicode_uppercase_full(StringView const&);
|
||||
|
||||
Optional<Property> property_from_string(StringView const&);
|
||||
bool code_point_has_property(u32 code_point, Property property);
|
||||
|
||||
}
|
||||
|
|
|
@ -404,6 +404,7 @@ constexpr @name@ operator|(@name@ value1, @name@ value2)
|
|||
|
||||
#include <AK/Optional.h>
|
||||
#include <AK/Types.h>
|
||||
#include <LibUnicode/Forward.h>
|
||||
|
||||
namespace Unicode {
|
||||
)~~~");
|
||||
|
@ -411,7 +412,7 @@ namespace Unicode {
|
|||
generate_enum("Locale"sv, "None"sv, move(unicode_data.locales));
|
||||
generate_enum("Condition"sv, "None"sv, move(unicode_data.conditions));
|
||||
generate_enum("GeneralCategory"sv, {}, move(unicode_data.general_categories));
|
||||
generate_enum("Property"sv, "Assigned"sv, unicode_data.prop_list.keys(), move(unicode_data.prop_aliases), true);
|
||||
generate_enum("Property"sv, "Assigned"sv, unicode_data.prop_list.keys(), unicode_data.prop_aliases, true);
|
||||
generate_enum("WordBreakProperty"sv, "Other"sv, unicode_data.word_break_prop_list.keys());
|
||||
|
||||
generator.append(R"~~~(
|
||||
|
@ -469,7 +470,12 @@ struct UnicodeData {
|
|||
WordBreakProperty word_break_property { WordBreakProperty::Other };
|
||||
};
|
||||
|
||||
namespace Detail {
|
||||
|
||||
Optional<UnicodeData> unicode_data_for_code_point(u32 code_point);
|
||||
Optional<Property> property_from_string(StringView const& property);
|
||||
|
||||
}
|
||||
|
||||
})~~~");
|
||||
|
||||
|
@ -489,6 +495,7 @@ static void generate_unicode_data_implementation(UnicodeData unicode_data)
|
|||
#include <AK/Array.h>
|
||||
#include <AK/CharacterTypes.h>
|
||||
#include <AK/Find.h>
|
||||
#include <AK/StringView.h>
|
||||
#include <LibUnicode/UnicodeData.h>
|
||||
|
||||
namespace Unicode {
|
||||
|
@ -597,6 +604,8 @@ static Optional<u32> index_of_code_point_in_range(u32 code_point)
|
|||
return {};
|
||||
}
|
||||
|
||||
namespace Detail {
|
||||
|
||||
Optional<UnicodeData> unicode_data_for_code_point(u32 code_point)
|
||||
{
|
||||
VERIFY(is_unicode(code_point));
|
||||
|
@ -618,6 +627,30 @@ Optional<UnicodeData> unicode_data_for_code_point(u32 code_point)
|
|||
return {};
|
||||
}
|
||||
|
||||
Optional<Property> property_from_string(StringView const& property)
|
||||
{
|
||||
if (property == "Assigned"sv)
|
||||
return Property::Assigned;)~~~");
|
||||
|
||||
for (auto const& property : unicode_data.prop_list) {
|
||||
generator.set("property", property.key);
|
||||
generator.append(R"~~~(
|
||||
if (property == "@property@"sv)
|
||||
return Property::@property@;)~~~");
|
||||
}
|
||||
for (auto const& alias : unicode_data.prop_aliases) {
|
||||
generator.set("property", alias.alias);
|
||||
generator.append(R"~~~(
|
||||
if (property == "@property@"sv)
|
||||
return Property::@property@;)~~~");
|
||||
}
|
||||
|
||||
generator.append(R"~~~(
|
||||
return {};
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
})~~~");
|
||||
|
||||
outln("{}", generator.as_string_view());
|
||||
|
|
22
Userland/Libraries/LibUnicode/Forward.h
Normal file
22
Userland/Libraries/LibUnicode/Forward.h
Normal file
|
@ -0,0 +1,22 @@
|
|||
/*
|
||||
* Copyright (c) 2021, Tim Flynn <trflynn89@pm.me>
|
||||
*
|
||||
* SPDX-License-Identifier: BSD-2-Clause
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <AK/Types.h>
|
||||
|
||||
// Forward declarations of the LibUnicode types, so that other headers can name them
// (for example in function signatures) by including only this file.
// The enum names correspond to the enums emitted by the Unicode data generator
// (Locale, Condition, GeneralCategory, Property, WordBreakProperty).
namespace Unicode {
|
||||
|
||||
enum class Condition;
|
||||
enum class GeneralCategory;
|
||||
enum class Locale;
|
||||
// Declared with an explicit u64 underlying type; presumably this mirrors the generated
// definition of Property (a redeclaration must agree on the underlying type) — confirm
// against the generator output.
enum class Property : u64;
|
||||
enum class WordBreakProperty;
|
||||
|
||||
// Structs declared by name only; their definitions live elsewhere.
struct SpecialCasing;
|
||||
struct UnicodeData;
|
||||
|
||||
}
|
Loading…
Add table
Add a link
Reference in a new issue