1
Fork 0
mirror of https://github.com/RGBCube/serenity synced 2025-05-14 11:14:58 +00:00

LibUnicode: Begin implementing special Unicode case folding

This implements unconditional special case folding, and conditional
folding for non-locale cases. Worth noting that the only conditional,
non-locale special case is for converting an uppercase sigma to
lowercase.
This commit is contained in:
Timothy Flynn 2021-07-27 13:46:08 -04:00 committed by Linus Groh
parent 5b110034dd
commit 39f971e42b
3 changed files with 336 additions and 0 deletions

View file

@ -6,6 +6,7 @@
#include <LibTest/TestCase.h>
#include <AK/StringView.h>
#include <LibUnicode/CharacterTypes.h>
#include <ctype.h>
@ -48,3 +49,151 @@ TEST_CASE(to_unicode_uppercase)
EXPECT_EQ(Unicode::to_unicode_uppercase(0x3402u), 0x3402u);
EXPECT_EQ(Unicode::to_unicode_uppercase(0x4dbfu), 0x4dbfu);
}
// Exercises Unicode::to_unicode_lowercase_full() on single code points that are subject to
// unconditional special casing. Every input below is expected to come back unchanged, except
// U+0130, whose lowercase form expands to two code points (U+0069 followed by U+0307).
TEST_CASE(to_unicode_lowercase_unconditional_special_casing)
{
    // LATIN SMALL LETTER SHARP S
    EXPECT_EQ(Unicode::to_unicode_lowercase_full("\u00DF"sv), "\u00DF");

    // LATIN CAPITAL LETTER I WITH DOT ABOVE
    EXPECT_EQ(Unicode::to_unicode_lowercase_full("\u0130"sv), "\u0069\u0307");

    // LATIN SMALL LIGATURE FF
    EXPECT_EQ(Unicode::to_unicode_lowercase_full("\uFB00"sv), "\uFB00");

    // LATIN SMALL LIGATURE FI
    EXPECT_EQ(Unicode::to_unicode_lowercase_full("\uFB01"sv), "\uFB01");

    // LATIN SMALL LIGATURE FL
    EXPECT_EQ(Unicode::to_unicode_lowercase_full("\uFB02"sv), "\uFB02");

    // LATIN SMALL LIGATURE FFI
    EXPECT_EQ(Unicode::to_unicode_lowercase_full("\uFB03"sv), "\uFB03");

    // LATIN SMALL LIGATURE FFL
    EXPECT_EQ(Unicode::to_unicode_lowercase_full("\uFB04"sv), "\uFB04");

    // LATIN SMALL LIGATURE LONG S T
    EXPECT_EQ(Unicode::to_unicode_lowercase_full("\uFB05"sv), "\uFB05");

    // LATIN SMALL LIGATURE ST
    EXPECT_EQ(Unicode::to_unicode_lowercase_full("\uFB06"sv), "\uFB06");

    // GREEK SMALL LETTER ALPHA WITH PERISPOMENI AND YPOGEGRAMMENI
    EXPECT_EQ(Unicode::to_unicode_lowercase_full("\u1FB7"sv), "\u1FB7");

    // GREEK SMALL LETTER ETA WITH PERISPOMENI AND YPOGEGRAMMENI
    EXPECT_EQ(Unicode::to_unicode_lowercase_full("\u1FC7"sv), "\u1FC7");

    // GREEK SMALL LETTER OMEGA WITH PERISPOMENI AND YPOGEGRAMMENI
    EXPECT_EQ(Unicode::to_unicode_lowercase_full("\u1FF7"sv), "\u1FF7");
}
// Exercises the context-dependent lowercasing of GREEK CAPITAL LETTER SIGMA (U+03A3).
// In the cases below the sigma becomes the final form U+03C2 when a letter precedes it and no
// letter follows it, and the regular form U+03C3 when a letter follows it. A MONGOLIAN VOWEL
// SEPARATOR (U+180E) adjacent to the sigma is passed through unchanged and does not affect
// which form is chosen.
TEST_CASE(to_unicode_lowercase_special_casing_sigma)
{
    // Plain ASCII input without any sigma, as a baseline.
    EXPECT_EQ(Unicode::to_unicode_lowercase_full("ABCI"sv), "abci");

    // Sigma preceded by A
    EXPECT_EQ(Unicode::to_unicode_lowercase_full("A\u03A3"sv), "a\u03C2");

    // Sigma preceded by A and MONGOLIAN VOWEL SEPARATOR
    EXPECT_EQ(Unicode::to_unicode_lowercase_full("A\u180E\u03A3"sv), "a\u180E\u03C2");

    // Sigma preceded by A and MONGOLIAN VOWEL SEPARATOR, followed by B
    EXPECT_EQ(Unicode::to_unicode_lowercase_full("A\u180E\u03A3B"sv), "a\u180E\u03C3b");

    // Sigma followed by A
    EXPECT_EQ(Unicode::to_unicode_lowercase_full("\u03A3A"sv), "\u03C3a");

    // Sigma preceded by A, followed by MONGOLIAN VOWEL SEPARATOR
    EXPECT_EQ(Unicode::to_unicode_lowercase_full("A\u03A3\u180E"sv), "a\u03C2\u180E");

    // Sigma preceded by A, followed by MONGOLIAN VOWEL SEPARATOR and B
    EXPECT_EQ(Unicode::to_unicode_lowercase_full("A\u03A3\u180EB"sv), "a\u03C3\u180Eb");

    // Sigma preceded by A and MONGOLIAN VOWEL SEPARATOR, followed by MONGOLIAN VOWEL SEPARATOR
    EXPECT_EQ(Unicode::to_unicode_lowercase_full("A\u180E\u03A3\u180E"sv), "a\u180E\u03C2\u180E");

    // Sigma preceded by A and MONGOLIAN VOWEL SEPARATOR, followed by MONGOLIAN VOWEL SEPARATOR and B
    EXPECT_EQ(Unicode::to_unicode_lowercase_full("A\u180E\u03A3\u180EB"sv), "a\u180E\u03C3\u180Eb");
}
// Exercises Unicode::to_unicode_uppercase_full() on single code points that are subject to
// unconditional special casing. Every input below is expected to expand to two or three code
// points, except U+0130, which is already a capital letter and is expected to stay unchanged.
TEST_CASE(to_unicode_uppercase_unconditional_special_casing)
{
    // LATIN SMALL LETTER SHARP S
    EXPECT_EQ(Unicode::to_unicode_uppercase_full("\u00DF"sv), "\u0053\u0053");

    // LATIN CAPITAL LETTER I WITH DOT ABOVE
    EXPECT_EQ(Unicode::to_unicode_uppercase_full("\u0130"sv), "\u0130");

    // LATIN SMALL LIGATURE FF
    EXPECT_EQ(Unicode::to_unicode_uppercase_full("\uFB00"sv), "\u0046\u0046");

    // LATIN SMALL LIGATURE FI
    EXPECT_EQ(Unicode::to_unicode_uppercase_full("\uFB01"sv), "\u0046\u0049");

    // LATIN SMALL LIGATURE FL
    EXPECT_EQ(Unicode::to_unicode_uppercase_full("\uFB02"sv), "\u0046\u004C");

    // LATIN SMALL LIGATURE FFI
    EXPECT_EQ(Unicode::to_unicode_uppercase_full("\uFB03"sv), "\u0046\u0046\u0049");

    // LATIN SMALL LIGATURE FFL
    EXPECT_EQ(Unicode::to_unicode_uppercase_full("\uFB04"sv), "\u0046\u0046\u004C");

    // LATIN SMALL LIGATURE LONG S T
    EXPECT_EQ(Unicode::to_unicode_uppercase_full("\uFB05"sv), "\u0053\u0054");

    // LATIN SMALL LIGATURE ST
    EXPECT_EQ(Unicode::to_unicode_uppercase_full("\uFB06"sv), "\u0053\u0054");

    // GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
    EXPECT_EQ(Unicode::to_unicode_uppercase_full("\u0390"sv), "\u0399\u0308\u0301");

    // GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
    EXPECT_EQ(Unicode::to_unicode_uppercase_full("\u03B0"sv), "\u03A5\u0308\u0301");

    // GREEK SMALL LETTER ALPHA WITH PERISPOMENI AND YPOGEGRAMMENI
    EXPECT_EQ(Unicode::to_unicode_uppercase_full("\u1FB7"sv), "\u0391\u0342\u0399");

    // GREEK SMALL LETTER ETA WITH PERISPOMENI AND YPOGEGRAMMENI
    EXPECT_EQ(Unicode::to_unicode_uppercase_full("\u1FC7"sv), "\u0397\u0342\u0399");

    // GREEK SMALL LETTER OMEGA WITH PERISPOMENI AND YPOGEGRAMMENI
    EXPECT_EQ(Unicode::to_unicode_uppercase_full("\u1FF7"sv), "\u03A9\u0342\u0399");
}