mirror of
https://github.com/RGBCube/serenity
synced 2025-05-31 11:38:11 +00:00
LibUnicode: Implement locale-aware AFTER_I special casing
This commit is contained in:
parent
68b2680040
commit
0053d48c41
3 changed files with 94 additions and 12 deletions
|
@ -155,6 +155,49 @@ TEST_CASE(to_unicode_lowercase_special_casing_sigma)
|
||||||
EXPECT_EQ(result, "a\u180E\u03C3\u180Eb");
|
EXPECT_EQ(result, "a\u180E\u03C3\u180Eb");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
TEST_CASE(to_unicode_lowercase_special_casing_i)
{
    // Exercises locale-sensitive lowercasing of the letter I. Each group below checks the same
    // input against the "en", "az" and "tr" locales, in that order.

    // U+0049 LATIN CAPITAL LETTER I on its own:
    // "en" expects a plain "i"; "az" and "tr" expect U+0131.
    EXPECT_EQ(Unicode::to_unicode_lowercase_full("I"sv, "en"sv), "i"sv);
    EXPECT_EQ(Unicode::to_unicode_lowercase_full("I"sv, "az"sv), "\u0131"sv);
    EXPECT_EQ(Unicode::to_unicode_lowercase_full("I"sv, "tr"sv), "\u0131"sv);

    // U+0130 LATIN CAPITAL LETTER I WITH DOT ABOVE:
    // "en" expects "i" followed by U+0307; "az" and "tr" expect a plain "i".
    EXPECT_EQ(Unicode::to_unicode_lowercase_full("\u0130"sv, "en"sv), "\u0069\u0307"sv);
    EXPECT_EQ(Unicode::to_unicode_lowercase_full("\u0130"sv, "az"sv), "i"sv);
    EXPECT_EQ(Unicode::to_unicode_lowercase_full("\u0130"sv, "tr"sv), "i"sv);

    // U+0049 immediately followed by U+0307 COMBINING DOT ABOVE:
    // "en" keeps the combining dot; "az" and "tr" expect it to be dropped.
    EXPECT_EQ(Unicode::to_unicode_lowercase_full("I\u0307"sv, "en"sv), "i\u0307"sv);
    EXPECT_EQ(Unicode::to_unicode_lowercase_full("I\u0307"sv, "az"sv), "i"sv);
    EXPECT_EQ(Unicode::to_unicode_lowercase_full("I\u0307"sv, "tr"sv), "i"sv);

    // U+0049, then a combining class 0 character ("A"), then U+0307:
    // the combining dot is kept in every locale; "az" and "tr" expect U+0131 for the I.
    EXPECT_EQ(Unicode::to_unicode_lowercase_full("IA\u0307"sv, "en"sv), "ia\u0307"sv);
    EXPECT_EQ(Unicode::to_unicode_lowercase_full("IA\u0307"sv, "az"sv), "\u0131a\u0307"sv);
    EXPECT_EQ(Unicode::to_unicode_lowercase_full("IA\u0307"sv, "tr"sv), "\u0131a\u0307"sv);
}
|
||||||
|
|
||||||
TEST_CASE(to_unicode_uppercase_unconditional_special_casing)
|
TEST_CASE(to_unicode_uppercase_unconditional_special_casing)
|
||||||
{
|
{
|
||||||
// LATIN SMALL LETTER SHARP S
|
// LATIN SMALL LETTER SHARP S
|
||||||
|
|
|
@ -10,6 +10,7 @@
|
||||||
#include <AK/Types.h>
|
#include <AK/Types.h>
|
||||||
#include <AK/Utf8View.h>
|
#include <AK/Utf8View.h>
|
||||||
#include <LibUnicode/CharacterTypes.h>
|
#include <LibUnicode/CharacterTypes.h>
|
||||||
|
#include <LibUnicode/Locale.h>
|
||||||
|
|
||||||
#if ENABLE_UNICODE_DATA
|
#if ENABLE_UNICODE_DATA
|
||||||
# include <LibUnicode/UnicodeData.h>
|
# include <LibUnicode/UnicodeData.h>
|
||||||
|
@ -22,6 +23,32 @@ namespace Unicode {
|
||||||
|
|
||||||
#if ENABLE_UNICODE_DATA
|
#if ENABLE_UNICODE_DATA
|
||||||
|
|
||||||
|
// Evaluates the "After_I" casing context for the code point C located at `index` in `string`:
// there is an uppercase I before C, and there is no intervening combining character class 230
// (Above) or 0.
//
// Returns true when the most recent 'I' in the text preceding `index` is followed only by
// code points whose canonical combining class is neither 0 nor 230.
//
// NOTE(review): `index` is handed directly to Utf8View::substring_view(), so it is presumably a
// byte offset rather than a code point count - confirm against the callers.
static bool is_after_uppercase_i(Utf8View const& string, size_t index)
{
    auto preceding_view = string.substring_view(0, index);
    bool found_uppercase_i = false;

    // FIXME: Would be better if Utf8View supported reverse iteration.
    // Until then, scan forwards and track whether the latest 'I' is still "live", i.e. has not
    // been separated from C by a blocking code point.
    for (auto code_point : preceding_view) {
        if (code_point == 'I') {
            found_uppercase_i = true;
            continue;
        }

        auto unicode_data = Detail::unicode_data_for_code_point(code_point);

        // Code points without an explicit entry default to combining class 0 (Not_Reordered).
        // They block any earlier 'I', but must not end the scan: a later 'I' may still directly
        // precede C.
        if (!unicode_data.has_value()) {
            found_uppercase_i = false;
            continue;
        }

        auto combining_class = unicode_data->canonical_combining_class;
        if (combining_class == 0 || combining_class == 230)
            found_uppercase_i = false;
    }

    return found_uppercase_i;
}
|
||||||
|
|
||||||
static bool is_final_code_point(Utf8View const& string, size_t index, size_t byte_length)
|
static bool is_final_code_point(Utf8View const& string, size_t index, size_t byte_length)
|
||||||
{
|
{
|
||||||
// C is preceded by a sequence consisting of a cased letter and then zero or more case-ignorable
|
// C is preceded by a sequence consisting of a cased letter and then zero or more case-ignorable
|
||||||
|
@ -62,19 +89,30 @@ static bool is_final_code_point(Utf8View const& string, size_t index, size_t byt
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
static SpecialCasing const* find_matching_special_case(Utf8View const& string, size_t index, size_t byte_length, UnicodeData const& unicode_data)
|
static SpecialCasing const* find_matching_special_case(Utf8View const& string, Optional<StringView> locale, size_t index, size_t byte_length, UnicodeData const& unicode_data)
|
||||||
{
|
{
|
||||||
|
auto requested_locale = Locale::None;
|
||||||
|
|
||||||
|
if (locale.has_value()) {
|
||||||
|
if (auto maybe_locale = locale_from_string(*locale); maybe_locale.has_value())
|
||||||
|
requested_locale = *maybe_locale;
|
||||||
|
}
|
||||||
|
|
||||||
for (size_t i = 0; i < unicode_data.special_casing_size; ++i) {
|
for (size_t i = 0; i < unicode_data.special_casing_size; ++i) {
|
||||||
auto const* special_casing = unicode_data.special_casing[i];
|
auto const* special_casing = unicode_data.special_casing[i];
|
||||||
|
|
||||||
if ((special_casing->locale == Locale::None) && (special_casing->condition == Condition::None))
|
if (special_casing->locale != Locale::None && special_casing->locale != requested_locale)
|
||||||
return special_casing;
|
|
||||||
|
|
||||||
// FIXME: Handle locale.
|
|
||||||
if (special_casing->locale != Locale::None)
|
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
switch (special_casing->condition) {
|
switch (special_casing->condition) {
|
||||||
|
case Condition::None:
|
||||||
|
return special_casing;
|
||||||
|
|
||||||
|
case Condition::AfterI:
|
||||||
|
if (is_after_uppercase_i(string, index))
|
||||||
|
return special_casing;
|
||||||
|
break;
|
||||||
|
|
||||||
case Condition::FinalSigma:
|
case Condition::FinalSigma:
|
||||||
if (is_final_code_point(string, index, byte_length))
|
if (is_final_code_point(string, index, byte_length))
|
||||||
return special_casing;
|
return special_casing;
|
||||||
|
@ -114,7 +152,7 @@ u32 to_unicode_uppercase(u32 code_point)
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
String to_unicode_lowercase_full(StringView const& string)
|
String to_unicode_lowercase_full(StringView const& string, [[maybe_unused]] Optional<StringView> locale)
|
||||||
{
|
{
|
||||||
#if ENABLE_UNICODE_DATA
|
#if ENABLE_UNICODE_DATA
|
||||||
Utf8View view { string };
|
Utf8View view { string };
|
||||||
|
@ -133,7 +171,7 @@ String to_unicode_lowercase_full(StringView const& string)
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
auto const* special_casing = find_matching_special_case(view, index, byte_length, *unicode_data);
|
auto const* special_casing = find_matching_special_case(view, locale, index, byte_length, *unicode_data);
|
||||||
if (!special_casing) {
|
if (!special_casing) {
|
||||||
builder.append_code_point(unicode_data->simple_lowercase_mapping);
|
builder.append_code_point(unicode_data->simple_lowercase_mapping);
|
||||||
continue;
|
continue;
|
||||||
|
@ -149,7 +187,7 @@ String to_unicode_lowercase_full(StringView const& string)
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
String to_unicode_uppercase_full(StringView const& string)
|
String to_unicode_uppercase_full(StringView const& string, [[maybe_unused]] Optional<StringView> locale)
|
||||||
{
|
{
|
||||||
#if ENABLE_UNICODE_DATA
|
#if ENABLE_UNICODE_DATA
|
||||||
Utf8View view { string };
|
Utf8View view { string };
|
||||||
|
@ -168,7 +206,7 @@ String to_unicode_uppercase_full(StringView const& string)
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
auto const* special_casing = find_matching_special_case(view, index, byte_length, *unicode_data);
|
auto const* special_casing = find_matching_special_case(view, locale, index, byte_length, *unicode_data);
|
||||||
if (!special_casing) {
|
if (!special_casing) {
|
||||||
builder.append_code_point(unicode_data->simple_uppercase_mapping);
|
builder.append_code_point(unicode_data->simple_uppercase_mapping);
|
||||||
continue;
|
continue;
|
||||||
|
|
|
@ -7,6 +7,7 @@
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include <AK/Forward.h>
|
#include <AK/Forward.h>
|
||||||
|
#include <AK/Optional.h>
|
||||||
#include <AK/String.h>
|
#include <AK/String.h>
|
||||||
#include <AK/Types.h>
|
#include <AK/Types.h>
|
||||||
#include <LibUnicode/Forward.h>
|
#include <LibUnicode/Forward.h>
|
||||||
|
@ -18,8 +19,8 @@ namespace Unicode {
|
||||||
u32 to_unicode_lowercase(u32 code_point);
|
u32 to_unicode_lowercase(u32 code_point);
|
||||||
u32 to_unicode_uppercase(u32 code_point);
|
u32 to_unicode_uppercase(u32 code_point);
|
||||||
|
|
||||||
String to_unicode_lowercase_full(StringView const&);
|
String to_unicode_lowercase_full(StringView const&, Optional<StringView> locale = {});
|
||||||
String to_unicode_uppercase_full(StringView const&);
|
String to_unicode_uppercase_full(StringView const&, Optional<StringView> locale = {});
|
||||||
|
|
||||||
Optional<GeneralCategory> general_category_from_string(StringView const&);
|
Optional<GeneralCategory> general_category_from_string(StringView const&);
|
||||||
bool code_point_has_general_category(u32 code_point, GeneralCategory general_category);
|
bool code_point_has_general_category(u32 code_point, GeneralCategory general_category);
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue