LibUnicode: Introduce a Unicode library for interacting with UCD files

The Unicode standard publishes the Unicode Character Database (UCD) with information about every code point, such as each code point's upper case mapping. LibUnicode exists to download and parse UCD files at build time and to provide accessors to that data. As a start, LibUnicode includes upper- and lower-case code point converters.
2025-09-18 04:16:17 +00:00 · 2021-07-25 15:10:51 -04:00 · 2021-07-25 15:10:51 -04:00 · 4dda3edc9e
commit 4dda3edc9e
parent 83f88df757
11 changed files with 473 additions and 0 deletions
--- a/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp
+++ b/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2021, Tim Flynn <trflynn89@pm.me>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#include <LibTest/TestCase.h>
+
+#include <LibUnicode/CharacterTypes.h>
+#include <ctype.h>
+
+// Asserts that new_function returns the same value as old_function for every code point in 0x00-0x7f.
+static void compare_to_ascii(auto& old_function, auto& new_function)
+{
+    for (u32 code_point = 0; code_point < 0x80; ++code_point) {
+        i64 const expected = old_function(code_point); // Evaluated up front so EXPECT_EQ's arguments have no side effects.
+        i64 const actual = new_function(code_point); // Both results are widened to i64 so they compare as one type.
+        EXPECT_EQ(expected, actual);
+        if (expected != actual)
+            dbgln("Function input value was {}.", code_point); // EXPECT_EQ only sees the two results, so log the input separately.
+    }
+}
+
+TEST_CASE(to_unicode_lowercase) // Checks Unicode::to_unicode_lowercase on ASCII, one Greek cased pair, and range-encoded code points.
+{
+    compare_to_ascii(tolower, Unicode::to_unicode_lowercase); // Must agree with tolower() for every code point below 0x80.
+
+    EXPECT_EQ(Unicode::to_unicode_lowercase(0x03c9u), 0x03c9u); // "ω" to "ω": already lower case, so it is returned unchanged
+    EXPECT_EQ(Unicode::to_unicode_lowercase(0x03a9u), 0x03c9u); // "Ω" to "ω": upper case maps to its lower-case counterpart
+
+    // Code points encoded by ranges in UnicodeData.txt; each one is expected to map to itself.
+    EXPECT_EQ(Unicode::to_unicode_lowercase(0x3400u), 0x3400u); // NOTE(review): presumably the first code point of such a range - confirm against UnicodeData.txt
+    EXPECT_EQ(Unicode::to_unicode_lowercase(0x3401u), 0x3401u);
+    EXPECT_EQ(Unicode::to_unicode_lowercase(0x3402u), 0x3402u);
+    EXPECT_EQ(Unicode::to_unicode_lowercase(0x4dbfu), 0x4dbfu); // NOTE(review): presumably the last code point of that range - confirm against UnicodeData.txt
+}
+
+TEST_CASE(to_unicode_uppercase) // Checks Unicode::to_unicode_uppercase on ASCII, one Greek cased pair, and range-encoded code points.
+{
+    compare_to_ascii(toupper, Unicode::to_unicode_uppercase); // Must agree with toupper() for every code point below 0x80.
+
+    EXPECT_EQ(Unicode::to_unicode_uppercase(0x03c9u), 0x03a9u); // "ω" to "Ω": lower case maps to its upper-case counterpart
+    EXPECT_EQ(Unicode::to_unicode_uppercase(0x03a9u), 0x03a9u); // "Ω" to "Ω": already upper case, so it is returned unchanged
+
+    // Code points encoded by ranges in UnicodeData.txt; each one is expected to map to itself.
+    EXPECT_EQ(Unicode::to_unicode_uppercase(0x3400u), 0x3400u); // NOTE(review): presumably the first code point of such a range - confirm against UnicodeData.txt
+    EXPECT_EQ(Unicode::to_unicode_uppercase(0x3401u), 0x3401u);
+    EXPECT_EQ(Unicode::to_unicode_uppercase(0x3402u), 0x3402u);
+    EXPECT_EQ(Unicode::to_unicode_uppercase(0x4dbfu), 0x4dbfu); // NOTE(review): presumably the last code point of that range - confirm against UnicodeData.txt
+}