# Patch summary (descriptive preamble placed ahead of the first "diff --git" header).
#
# Tests/LibUnicode/TestUnicodeCharacterTypes.cpp
#   Adds TEST_CASE(to_unicode_lowercase_special_casing_not_before_dot). It calls
#   Unicode::to_unicode_lowercase_full and expects:
#     "I"        with locale "en"          -> "i"
#     "I"        with locale "az" or "tr"  -> "\u0131"
#     "I\u0307"  with locale "en"          -> "i\u0307"
#     "I\u0307"  with locale "az" or "tr"  -> "i"
#
# Userland/Libraries/LibUnicode/CharacterTypes.cpp
#   Adds the file-local helper is_followed_by_combining_dot_above(string, index, byte_length).
#   It walks the code points that start at byte offset (index + byte_length), or an empty
#   view when that offset is not below string.byte_length(), and:
#     - returns true as soon as it sees code point 0x307;
#     - returns false when a code point has no Unicode data, or when its canonical
#       combining class is 0 or 230;
#     - returns false when the view is exhausted without finding 0x307.
#
#   In find_matching_special_case, the switch label "default:" is replaced by
#   "case Condition::NotBeforeDot:", which returns the current special_casing entry when
#   the helper above reports false and otherwise breaks out of the switch.
#
# NOTE(review): the context line "Optional locale" in the second hunk looks like it is
# missing a template argument -- confirm against the target file before applying.
# NOTE(review): only part of find_matching_special_case is visible in the last hunk; whether
# the switch still covers every Condition value without "default:" cannot be confirmed here.
diff --git a/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp b/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp
index e53b951945..c20f0e4849 100644
--- a/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp
+++ b/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp
@@ -243,6 +243,29 @@ TEST_CASE(to_unicode_lowercase_special_casing_more_above)
     EXPECT_EQ(result, "\u012f\u0307\u0300"sv);
 }
 
+TEST_CASE(to_unicode_lowercase_special_casing_not_before_dot)
+{
+    // LATIN CAPITAL LETTER I
+    auto result = Unicode::to_unicode_lowercase_full("I"sv, "en"sv);
+    EXPECT_EQ(result, "i"sv);
+
+    result = Unicode::to_unicode_lowercase_full("I"sv, "az"sv);
+    EXPECT_EQ(result, "\u0131"sv);
+
+    result = Unicode::to_unicode_lowercase_full("I"sv, "tr"sv);
+    EXPECT_EQ(result, "\u0131"sv);
+
+    // LATIN CAPITAL LETTER I followed by COMBINING DOT ABOVE
+    result = Unicode::to_unicode_lowercase_full("I\u0307"sv, "en"sv);
+    EXPECT_EQ(result, "i\u0307"sv);
+
+    result = Unicode::to_unicode_lowercase_full("I\u0307"sv, "az"sv);
+    EXPECT_EQ(result, "i"sv);
+
+    result = Unicode::to_unicode_lowercase_full("I\u0307"sv, "tr"sv);
+    EXPECT_EQ(result, "i"sv);
+}
+
 TEST_CASE(to_unicode_uppercase_unconditional_special_casing)
 {
     // LATIN SMALL LETTER SHARP S
diff --git a/Userland/Libraries/LibUnicode/CharacterTypes.cpp b/Userland/Libraries/LibUnicode/CharacterTypes.cpp
index 347d2f0c04..0c2c7b2e90 100644
--- a/Userland/Libraries/LibUnicode/CharacterTypes.cpp
+++ b/Userland/Libraries/LibUnicode/CharacterTypes.cpp
@@ -135,6 +135,30 @@ static bool is_followed_by_combining_class_above(Utf8View const& string, size_t
     return false;
 }
 
+static bool is_followed_by_combining_dot_above(Utf8View const& string, size_t index, size_t byte_length)
+{
+    // C is followed by combining dot above (U+0307). Any sequence of characters with a combining class that is neither 0 nor 230 may
+    // intervene between the current character and the combining dot above.
+    auto following_view = ((index + byte_length) < string.byte_length())
+        ? string.substring_view(index + byte_length)
+        : Utf8View {};
+
+    for (auto code_point : following_view) {
+        if (code_point == 0x307)
+            return true;
+
+        auto unicode_data = Detail::unicode_data_for_code_point(code_point);
+        if (!unicode_data.has_value())
+            return false;
+        if (unicode_data->canonical_combining_class == 0)
+            return false;
+        if (unicode_data->canonical_combining_class == 230)
+            return false;
+    }
+
+    return false;
+}
+
 static SpecialCasing const* find_matching_special_case(Utf8View const& string, Optional locale, size_t index, size_t byte_length, UnicodeData const& unicode_data)
 {
     auto requested_locale = Locale::None;
@@ -174,7 +198,9 @@ static SpecialCasing const* find_matching_special_case(Utf8View const& string, O
                 return special_casing;
             break;
 
-        default:
+        case Condition::NotBeforeDot:
+            if (!is_followed_by_combining_dot_above(string, index, byte_length))
+                return special_casing;
             break;
         }
     }