From c4bfda7f7fecdea9b255eee00c1d4beca25aa0c2 Mon Sep 17 00:00:00 2001
From: Timothy Flynn
Date: Tue, 27 Jul 2021 18:47:41 -0400
Subject: [PATCH] LibUnicode: Handle code points that are both cased and case-ignorable

Apparently, some code points fit both categories, for example U+0345
(COMBINING GREEK YPOGEGRAMMENI). Handle this fact when determining if a
code point is a final code point in a string.
---
 Tests/LibUnicode/TestUnicodeCharacterTypes.cpp   | 4 ++++
 Userland/Libraries/LibUnicode/CharacterTypes.cpp | 2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp b/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp
index 43f779ebd5..a71fd0d9f3 100644
--- a/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp
+++ b/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp
@@ -118,6 +118,10 @@ TEST_CASE(to_unicode_lowercase_special_casing_sigma)
     result = Unicode::to_unicode_lowercase_full("\u2160\u03A3"sv);
     EXPECT_EQ(result, "\u2170\u03C2");
 
+    // Sigma preceded by COMBINING GREEK YPOGEGRAMMENI
+    result = Unicode::to_unicode_lowercase_full("\u0345\u03A3"sv);
+    EXPECT_EQ(result, "\u0345\u03C3");
+
     // Sigma preceded by A and FULL STOP
     result = Unicode::to_unicode_lowercase_full("A.\u03A3"sv);
     EXPECT_EQ(result, "a.\u03C2");
diff --git a/Userland/Libraries/LibUnicode/CharacterTypes.cpp b/Userland/Libraries/LibUnicode/CharacterTypes.cpp
index 901acb5277..9fd675fc84 100644
--- a/Userland/Libraries/LibUnicode/CharacterTypes.cpp
+++ b/Userland/Libraries/LibUnicode/CharacterTypes.cpp
@@ -95,7 +95,7 @@ static bool is_final_code_point(Utf8View const& string, size_t index, size_t byt
         if (!unicode_data.has_value())
             return false;
 
-        if (is_cased_letter(*unicode_data))
+        if (is_cased_letter(*unicode_data) && !is_case_ignorable(*unicode_data))
             ++cased_letter_count;
         else if (!is_case_ignorable(*unicode_data))
             cased_letter_count = 0;