mirror of
https://github.com/RGBCube/serenity
synced 2025-05-31 10:58:12 +00:00
LibUnicode: Implement locale-aware AFTER_SOFT_DOTTED special casing
This commit is contained in:
parent
0053d48c41
commit
1427ebc622
2 changed files with 62 additions and 0 deletions
|
@ -257,6 +257,37 @@ TEST_CASE(to_unicode_uppercase_unconditional_special_casing)
|
||||||
EXPECT_EQ(result, "\u03A9\u0342\u0399");
|
EXPECT_EQ(result, "\u03A9\u0342\u0399");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
TEST_CASE(to_unicode_uppercase_special_casing_soft_dotted)
{
    // Each input is uppercased under an English ("en") and a Lithuanian ("lt") locale.
    // The expectations below require "lt" to drop a COMBINING DOT ABOVE (U+0307) that
    // follows "i" or "j", while "en" keeps it.
    auto const english = "en"sv;
    auto const lithuanian = "lt"sv;

    // LATIN SMALL LETTER I on its own: identical result in both locales.
    EXPECT_EQ(Unicode::to_unicode_uppercase_full("i"sv, english), "I"sv);
    EXPECT_EQ(Unicode::to_unicode_uppercase_full("i"sv, lithuanian), "I"sv);

    // LATIN SMALL LETTER J on its own: identical result in both locales.
    EXPECT_EQ(Unicode::to_unicode_uppercase_full("j"sv, english), "J"sv);
    EXPECT_EQ(Unicode::to_unicode_uppercase_full("j"sv, lithuanian), "J"sv);

    // LATIN SMALL LETTER I followed by COMBINING DOT ABOVE: the dot survives only in "en".
    EXPECT_EQ(Unicode::to_unicode_uppercase_full("i\u0307"sv, english), "I\u0307"sv);
    EXPECT_EQ(Unicode::to_unicode_uppercase_full("i\u0307"sv, lithuanian), "I"sv);

    // LATIN SMALL LETTER J followed by COMBINING DOT ABOVE: the dot survives only in "en".
    EXPECT_EQ(Unicode::to_unicode_uppercase_full("j\u0307"sv, english), "J\u0307"sv);
    EXPECT_EQ(Unicode::to_unicode_uppercase_full("j\u0307"sv, lithuanian), "J"sv);
}
|
||||||
|
|
||||||
TEST_CASE(general_category)
|
TEST_CASE(general_category)
|
||||||
{
|
{
|
||||||
auto general_category = [](StringView name) {
|
auto general_category = [](StringView name) {
|
||||||
|
|
|
@ -49,6 +49,32 @@ static bool is_after_uppercase_i(Utf8View const& string, size_t index)
|
||||||
return found_uppercase_i;
|
return found_uppercase_i;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Evaluates the After_Soft_Dotted casing context for the code point C located at `index`
// within `string`.
//
// Returns true when a Soft_Dotted code point precedes C and nothing between that code point
// and C has canonical combining class 0 or 230 (Above); returns false otherwise, including
// when nothing precedes C.
static bool is_after_soft_dotted_code_point(Utf8View const& string, size_t index)
{
    // There is a Soft_Dotted character before C, with no intervening character of combining class 0 or 230 (Above).
    auto preceding_view = string.substring_view(0, index);
    bool found_soft_dotted_code_point = false;

    // FIXME: Would be better if Utf8View supported reverse iteration.
    // Until then, scan forwards and keep only the most recent state: the flag is raised by a
    // Soft_Dotted code point and lowered again by any later blocking code point.
    for (auto code_point : preceding_view) {
        if (code_point_has_property(code_point, Property::Soft_Dotted)) {
            found_soft_dotted_code_point = true;
            continue;
        }

        auto unicode_data = Detail::unicode_data_for_code_point(code_point);

        // A code point without a data entry takes the Unicode default combining class of 0,
        // so it blocks like any other class-0 character. It must not end the scan early,
        // because a Soft_Dotted code point may still follow it directly before C.
        bool const is_blocking_code_point = !unicode_data.has_value()
            || unicode_data->canonical_combining_class == 0
            || unicode_data->canonical_combining_class == 230;

        if (is_blocking_code_point)
            found_soft_dotted_code_point = false;
    }

    return found_soft_dotted_code_point;
}
|
||||||
|
|
||||||
static bool is_final_code_point(Utf8View const& string, size_t index, size_t byte_length)
|
static bool is_final_code_point(Utf8View const& string, size_t index, size_t byte_length)
|
||||||
{
|
{
|
||||||
// C is preceded by a sequence consisting of a cased letter and then zero or more case-ignorable
|
// C is preceded by a sequence consisting of a cased letter and then zero or more case-ignorable
|
||||||
|
@ -113,6 +139,11 @@ static SpecialCasing const* find_matching_special_case(Utf8View const& string, O
|
||||||
return special_casing;
|
return special_casing;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
case Condition::AfterSoftDotted:
|
||||||
|
if (is_after_soft_dotted_code_point(string, index))
|
||||||
|
return special_casing;
|
||||||
|
break;
|
||||||
|
|
||||||
case Condition::FinalSigma:
|
case Condition::FinalSigma:
|
||||||
if (is_final_code_point(string, index, byte_length))
|
if (is_final_code_point(string, index, byte_length))
|
||||||
return special_casing;
|
return special_casing;
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue