diff --git a/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp b/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp
index bc5190c50c..7badcf9e86 100644
--- a/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp
+++ b/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp
@@ -6,6 +6,7 @@
 
 #include
 
+#include
 #include
 #include
 
@@ -48,3 +49,157 @@ TEST_CASE(to_unicode_uppercase)
     EXPECT_EQ(Unicode::to_unicode_uppercase(0x3402u), 0x3402u);
     EXPECT_EQ(Unicode::to_unicode_uppercase(0x4dbfu), 0x4dbfu);
 }
+
+TEST_CASE(to_unicode_lowercase_unconditional_special_casing)
+{
+    // LATIN SMALL LETTER SHARP S
+    auto result = Unicode::to_unicode_lowercase_full("\u00DF"sv);
+    EXPECT_EQ(result, "\u00DF");
+
+    // LATIN CAPITAL LETTER I WITH DOT ABOVE
+    result = Unicode::to_unicode_lowercase_full("\u0130"sv);
+    EXPECT_EQ(result, "\u0069\u0307");
+
+    // LATIN SMALL LIGATURE FF
+    result = Unicode::to_unicode_lowercase_full("\uFB00"sv);
+    EXPECT_EQ(result, "\uFB00");
+
+    // LATIN SMALL LIGATURE FI
+    result = Unicode::to_unicode_lowercase_full("\uFB01"sv);
+    EXPECT_EQ(result, "\uFB01");
+
+    // LATIN SMALL LIGATURE FL
+    result = Unicode::to_unicode_lowercase_full("\uFB02"sv);
+    EXPECT_EQ(result, "\uFB02");
+
+    // LATIN SMALL LIGATURE FFI
+    result = Unicode::to_unicode_lowercase_full("\uFB03"sv);
+    EXPECT_EQ(result, "\uFB03");
+
+    // LATIN SMALL LIGATURE FFL
+    result = Unicode::to_unicode_lowercase_full("\uFB04"sv);
+    EXPECT_EQ(result, "\uFB04");
+
+    // LATIN SMALL LIGATURE LONG S T
+    result = Unicode::to_unicode_lowercase_full("\uFB05"sv);
+    EXPECT_EQ(result, "\uFB05");
+
+    // LATIN SMALL LIGATURE ST
+    result = Unicode::to_unicode_lowercase_full("\uFB06"sv);
+    EXPECT_EQ(result, "\uFB06");
+
+    // GREEK SMALL LETTER ALPHA WITH PERISPOMENI AND YPOGEGRAMMENI
+    result = Unicode::to_unicode_lowercase_full("\u1FB7"sv);
+    EXPECT_EQ(result, "\u1FB7");
+
+    // GREEK SMALL LETTER ETA WITH PERISPOMENI AND YPOGEGRAMMENI
+    result = Unicode::to_unicode_lowercase_full("\u1FC7"sv);
+    EXPECT_EQ(result, "\u1FC7");
+
+    // GREEK SMALL LETTER OMEGA WITH PERISPOMENI AND YPOGEGRAMMENI
+    result = Unicode::to_unicode_lowercase_full("\u1FF7"sv);
+    EXPECT_EQ(result, "\u1FF7");
+}
+
+TEST_CASE(to_unicode_lowercase_special_casing_sigma)
+{
+    // Plain ASCII input: no special casing entry is expected to match, so simple mappings apply.
+    auto result = Unicode::to_unicode_lowercase_full("ABCI"sv);
+    EXPECT_EQ(result, "abci");
+
+    // Sigma preceded by A
+    result = Unicode::to_unicode_lowercase_full("A\u03A3"sv);
+    EXPECT_EQ(result, "a\u03C2");
+
+    // Sigma preceded by A and MONGOLIAN VOWEL SEPARATOR
+    result = Unicode::to_unicode_lowercase_full("A\u180E\u03A3"sv);
+    EXPECT_EQ(result, "a\u180E\u03C2");
+
+    // Sigma preceded by A and MONGOLIAN VOWEL SEPARATOR, followed by B
+    result = Unicode::to_unicode_lowercase_full("A\u180E\u03A3B"sv);
+    EXPECT_EQ(result, "a\u180E\u03C3b");
+
+    // Sigma followed by A
+    result = Unicode::to_unicode_lowercase_full("\u03A3A"sv);
+    EXPECT_EQ(result, "\u03C3a");
+
+    // Sigma preceded by A, followed by MONGOLIAN VOWEL SEPARATOR
+    result = Unicode::to_unicode_lowercase_full("A\u03A3\u180E"sv);
+    EXPECT_EQ(result, "a\u03C2\u180E");
+
+    // Sigma preceded by A, followed by MONGOLIAN VOWEL SEPARATOR and B
+    result = Unicode::to_unicode_lowercase_full("A\u03A3\u180EB"sv);
+    EXPECT_EQ(result, "a\u03C3\u180Eb");
+
+    // Sigma preceded by A and MONGOLIAN VOWEL SEPARATOR, followed by MONGOLIAN VOWEL SEPARATOR
+    result = Unicode::to_unicode_lowercase_full("A\u180E\u03A3\u180E"sv);
+    EXPECT_EQ(result, "a\u180E\u03C2\u180E");
+
+    // Sigma preceded by A and MONGOLIAN VOWEL SEPARATOR, followed by MONGOLIAN VOWEL SEPARATOR and B
+    result = Unicode::to_unicode_lowercase_full("A\u180E\u03A3\u180EB"sv);
+    EXPECT_EQ(result, "a\u180E\u03C3\u180Eb");
+
+    // Final sigma preceded by a code point with an unconditional special casing (LATIN CAPITAL
+    // LETTER I WITH DOT ABOVE) and A; the byte offset passed on for sigma must account for both.
+    result = Unicode::to_unicode_lowercase_full("\u0130A\u03A3"sv);
+    EXPECT_EQ(result, "\u0069\u0307a\u03C2");
+}
+
+TEST_CASE(to_unicode_uppercase_unconditional_special_casing)
+{
+    // LATIN SMALL LETTER SHARP S
+    auto result = Unicode::to_unicode_uppercase_full("\u00DF"sv);
+    EXPECT_EQ(result, "\u0053\u0053");
+
+    // LATIN CAPITAL LETTER I WITH DOT ABOVE
+    result = Unicode::to_unicode_uppercase_full("\u0130"sv);
+    EXPECT_EQ(result, "\u0130");
+
+    // LATIN SMALL LIGATURE FF
+    result = Unicode::to_unicode_uppercase_full("\uFB00"sv);
+    EXPECT_EQ(result, "\u0046\u0046");
+
+    // LATIN SMALL LIGATURE FI
+    result = Unicode::to_unicode_uppercase_full("\uFB01"sv);
+    EXPECT_EQ(result, "\u0046\u0049");
+
+    // LATIN SMALL LIGATURE FL
+    result = Unicode::to_unicode_uppercase_full("\uFB02"sv);
+    EXPECT_EQ(result, "\u0046\u004C");
+
+    // LATIN SMALL LIGATURE FFI
+    result = Unicode::to_unicode_uppercase_full("\uFB03"sv);
+    EXPECT_EQ(result, "\u0046\u0046\u0049");
+
+    // LATIN SMALL LIGATURE FFL
+    result = Unicode::to_unicode_uppercase_full("\uFB04"sv);
+    EXPECT_EQ(result, "\u0046\u0046\u004C");
+
+    // LATIN SMALL LIGATURE LONG S T
+    result = Unicode::to_unicode_uppercase_full("\uFB05"sv);
+    EXPECT_EQ(result, "\u0053\u0054");
+
+    // LATIN SMALL LIGATURE ST
+    result = Unicode::to_unicode_uppercase_full("\uFB06"sv);
+    EXPECT_EQ(result, "\u0053\u0054");
+
+    // GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
+    result = Unicode::to_unicode_uppercase_full("\u0390"sv);
+    EXPECT_EQ(result, "\u0399\u0308\u0301");
+
+    // GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
+    result = Unicode::to_unicode_uppercase_full("\u03B0"sv);
+    EXPECT_EQ(result, "\u03A5\u0308\u0301");
+
+    // GREEK SMALL LETTER ALPHA WITH PERISPOMENI AND YPOGEGRAMMENI
+    result = Unicode::to_unicode_uppercase_full("\u1FB7"sv);
+    EXPECT_EQ(result, "\u0391\u0342\u0399");
+
+    // GREEK SMALL LETTER ETA WITH PERISPOMENI AND YPOGEGRAMMENI
+    result = Unicode::to_unicode_uppercase_full("\u1FC7"sv);
+    EXPECT_EQ(result, "\u0397\u0342\u0399");
+
+    // GREEK SMALL LETTER OMEGA WITH PERISPOMENI AND YPOGEGRAMMENI
+    result = Unicode::to_unicode_uppercase_full("\u1FF7"sv);
+    EXPECT_EQ(result, "\u03A9\u0342\u0399");
+}
diff --git a/Userland/Libraries/LibUnicode/CharacterTypes.cpp b/Userland/Libraries/LibUnicode/CharacterTypes.cpp
index 103284195d..768f2b874d 100644
--- a/Userland/Libraries/LibUnicode/CharacterTypes.cpp
+++ b/Userland/Libraries/LibUnicode/CharacterTypes.cpp
@@ -5,7 +5,9 @@
  */
 
 #include
+#include
 #include
+#include
 #include
 
 #if ENABLE_UNICODE_DATA
@@ -14,8 +16,129 @@
 #    include
 #endif
 
+// For details on the algorithms used here, see Section 3.13 Default Case Algorithms
+// https://www.unicode.org/versions/Unicode13.0.0/ch03.pdf
+
 namespace Unicode {
 
+#if ENABLE_UNICODE_DATA
+
+// Returns whether the code point described by unicode_data counts as "cased". Only the
+// General_Category is consulted for now (see the FIXMEs below).
+static bool is_cased_letter(UnicodeData const& unicode_data)
+{
+    // A character C is defined to be cased if and only if C has the Lowercase or Uppercase property
+    // or has a General_Category value of Titlecase_Letter.
+    switch (unicode_data.general_category) {
+    case GeneralCategory::Ll: // FIXME: Should be Ll + Other_Lowercase (PropList.txt).
+    case GeneralCategory::Lu: // FIXME: Should be Lu + Other_Uppercase (PropList.txt).
+    case GeneralCategory::Lt:
+        return true;
+    default:
+        return false;
+    }
+}
+
+// Returns whether the code point described by unicode_data counts as "case-ignorable". Only the
+// General_Category is consulted for now; Word_Break properties are not handled yet.
+static bool is_case_ignorable(UnicodeData const& unicode_data)
+{
+    // A character C is defined to be case-ignorable if C has the value MidLetter (ML),
+    // MidNumLet (MB), or Single_Quote (SQ) for the Word_Break property or its General_Category is
+    // one of Nonspacing_Mark (Mn), Enclosing_Mark (Me), Format (Cf), Modifier_Letter (Lm), or
+    // Modifier_Symbol (Sk).
+    switch (unicode_data.general_category) {
+    case GeneralCategory::Mn:
+    case GeneralCategory::Me:
+    case GeneralCategory::Cf:
+    case GeneralCategory::Lm:
+    case GeneralCategory::Sk:
+        return true;
+    default:
+        // FIXME: Handle word break properties (auxiliary/WordBreakProperty.txt).
+        return false;
+    }
+}
+
+// Evaluates the Final_Sigma condition for the code point C that starts at byte offset index of
+// string and occupies byte_length bytes. Both values are measured in bytes of the UTF-8 data, not
+// in code points.
+static bool is_final_code_point(Utf8View const& string, size_t index, size_t byte_length)
+{
+    // C is preceded by a sequence consisting of a cased letter and then zero or more case-ignorable
+    // characters, and C is not followed by a sequence consisting of zero or more case-ignorable
+    // characters and then a cased letter.
+    auto preceding_view = string.substring_view(0, index);
+    auto following_view = ((index + byte_length) < string.byte_length())
+        ? string.substring_view(index + byte_length)
+        : Utf8View {};
+
+    // Non-zero only while the text seen so far ends in cased letters plus optional case-ignorables.
+    size_t cased_letter_count = 0;
+
+    for (auto code_point : preceding_view) {
+        auto unicode_data = unicode_data_for_code_point(code_point);
+        // NOTE(review): A single code point without data anywhere before C makes the condition
+        // fail; confirm that treating it as "neither cased nor case-ignorable" is not preferable.
+        if (!unicode_data.has_value())
+            return false;
+
+        if (is_cased_letter(*unicode_data))
+            ++cased_letter_count;
+        else if (!is_case_ignorable(*unicode_data))
+            cased_letter_count = 0;
+    }
+
+    if (cased_letter_count == 0)
+        return false;
+
+    // Skip case-ignorable code points after C; a cased letter behind them means C is not final.
+    for (auto code_point : following_view) {
+        auto unicode_data = unicode_data_for_code_point(code_point);
+        if (!unicode_data.has_value())
+            return false;
+
+        if (is_case_ignorable(*unicode_data))
+            continue;
+        if (is_cased_letter(*unicode_data))
+            return false;
+
+        break;
+    }
+
+    return true;
+}
+
+// Returns the first special casing entry of unicode_data that applies to the code point located at
+// byte offset index (spanning byte_length bytes) of string, or nullptr if none applies.
+static SpecialCasing const* find_matching_special_case(Utf8View const& string, size_t index, size_t byte_length, UnicodeData const& unicode_data)
+{
+    for (size_t i = 0; i < unicode_data.special_casing_size; ++i) {
+        auto const* special_casing = unicode_data.special_casing[i];
+
+        if ((special_casing->locale == Locale::None) && (special_casing->condition == Condition::None))
+            return special_casing;
+
+        // FIXME: Handle locale.
+        if (special_casing->locale != Locale::None)
+            continue;
+
+        switch (special_casing->condition) {
+        case Condition::FinalSigma:
+            if (is_final_code_point(string, index, byte_length))
+                return special_casing;
+            break;
+
+        default:
+            break;
+        }
+    }
+
+    return nullptr;
+}
+
+#endif
+
 u32 to_unicode_lowercase(u32 code_point)
 {
 #if ENABLE_UNICODE_DATA
@@ -40,4 +163,86 @@ u32 to_unicode_uppercase(u32 code_point)
 #endif
 }
 
+// Lowercases string using full case mappings: a matching special casing entry (unconditional or
+// Final_Sigma) takes precedence over the simple lowercase mapping. Code points without Unicode
+// data are copied through unchanged.
+String to_unicode_lowercase_full(StringView const& string)
+{
+#if ENABLE_UNICODE_DATA
+    Utf8View view { string };
+    StringBuilder builder;
+
+    // index is the byte offset of the current code point within view. It is advanced in the loop
+    // header so that every path through the body, including the special casing path, keeps it in
+    // sync with the iterator.
+    size_t index = 0;
+    size_t byte_length = 0;
+
+    for (auto it = view.begin(); it != view.end(); ++it, index += byte_length) {
+        u32 code_point = *it;
+        byte_length = it.underlying_code_point_length_in_bytes();
+
+        auto unicode_data = unicode_data_for_code_point(code_point);
+        if (!unicode_data.has_value()) {
+            builder.append_code_point(code_point);
+            continue;
+        }
+
+        auto const* special_casing = find_matching_special_case(view, index, byte_length, *unicode_data);
+        if (!special_casing) {
+            builder.append_code_point(unicode_data->simple_lowercase_mapping);
+            continue;
+        }
+
+        for (size_t i = 0; i < special_casing->lowercase_mapping_size; ++i)
+            builder.append_code_point(special_casing->lowercase_mapping[i]);
+    }
+
+    return builder.build();
+#else
+    return string.to_lowercase_string();
+#endif
+}
+
+// Uppercases string using full case mappings: a matching special casing entry (unconditional or
+// Final_Sigma) takes precedence over the simple uppercase mapping. Code points without Unicode
+// data are copied through unchanged.
+String to_unicode_uppercase_full(StringView const& string)
+{
+#if ENABLE_UNICODE_DATA
+    Utf8View view { string };
+    StringBuilder builder;
+
+    // index is the byte offset of the current code point within view. It is advanced in the loop
+    // header so that every path through the body, including the special casing path, keeps it in
+    // sync with the iterator.
+    size_t index = 0;
+    size_t byte_length = 0;
+
+    for (auto it = view.begin(); it != view.end(); ++it, index += byte_length) {
+        u32 code_point = *it;
+        byte_length = it.underlying_code_point_length_in_bytes();
+
+        auto unicode_data = unicode_data_for_code_point(code_point);
+        if (!unicode_data.has_value()) {
+            builder.append_code_point(code_point);
+            continue;
+        }
+
+        auto const* special_casing = find_matching_special_case(view, index, byte_length, *unicode_data);
+        if (!special_casing) {
+            builder.append_code_point(unicode_data->simple_uppercase_mapping);
+            continue;
+        }
+
+        for (size_t i = 0; i < special_casing->uppercase_mapping_size; ++i)
+            builder.append_code_point(special_casing->uppercase_mapping[i]);
+    }
+
+    return builder.build();
+#else
+    return string.to_uppercase_string();
+#endif
+}
+
 }
diff --git a/Userland/Libraries/LibUnicode/CharacterTypes.h b/Userland/Libraries/LibUnicode/CharacterTypes.h
index d390d7a3d3..beb2288cfb 100644
--- a/Userland/Libraries/LibUnicode/CharacterTypes.h
+++ b/Userland/Libraries/LibUnicode/CharacterTypes.h
@@ -6,11 +6,18 @@
 
 #pragma once
 
+#include
+#include
 #include
 
 namespace Unicode {
 
+// Note: The single code point case conversions only perform simple case mapping.
+// Use the full-string transformations for full case mapping.
 u32 to_unicode_lowercase(u32 code_point);
 u32 to_unicode_uppercase(u32 code_point);
 
+String to_unicode_lowercase_full(StringView const&);
+String to_unicode_uppercase_full(StringView const&);
+
 }