AK: Add a Utf8View type for iterating over UTF-8 codepoints

Utf8View wraps a StringView and implements begin() and end(), which return a Utf8CodepointIterator. The iterator parses UTF-8-encoded Unicode codepoints and returns them as 32-bit integers. This is the first step towards supporting emojis in Serenity :^) See https://github.com/SerenityOS/serenity/issues/490
2025-07-28 16:17:47 +00:00 · 2019-08-28 00:57:15 +03:00 · 2019-08-28 00:57:15 +03:00 · 5d3696174b
commit 5d3696174b
parent 970e0147f7
4 changed files with 241 additions and 1 deletions
--- a/AK/Tests/Makefile
+++ b/AK/Tests/Makefile
@@ -1,4 +1,4 @@
-PROGRAMS = TestString TestQueue TestVector TestHashMap TestJSON TestWeakPtr TestNonnullRefPtr TestRefPtr TestFixedArray TestFileSystemPath TestURL TestStringView
+PROGRAMS = TestString TestQueue TestVector TestHashMap TestJSON TestWeakPtr TestNonnullRefPtr TestRefPtr TestFixedArray TestFileSystemPath TestURL TestStringView TestUtf8

 CXXFLAGS = -std=c++17 -Wall -Wextra -ggdb3 -O2 -I../ -I../../

@@ -14,6 +14,7 @@ SHARED_TEST_OBJS = \
 	../JsonParser.o \
    ../FileSystemPath.o \
    ../URL.o \
+    ../Utf8View.o \

 .cpp.o:
 	@echo "HOST_CXX $<"; $(PRE_CXX) $(CXX) $(CXXFLAGS) -o $@ -c $<
@@ -65,6 +66,9 @@ TestURL: TestURL.o $(SHARED_TEST_OBJS)
 TestStringView: TestStringView.o $(SHARED_TEST_OBJS)
 	$(PRE_CXX) $(CXX) $(CXXFLAGS) -o $@ TestStringView.o $(SHARED_TEST_OBJS)

+TestUtf8: TestUtf8.o $(SHARED_TEST_OBJS) # link the Utf8View test program from its own object plus the same shared objects the TestStringView rule uses
+	$(PRE_CXX) $(CXX) $(CXXFLAGS) -o $@ TestUtf8.o $(SHARED_TEST_OBJS)
+
 clean:
 	rm -f $(SHARED_TEST_OBJS)
 	rm -f $(PROGRAMS)
--- a/AK/Tests/TestUtf8.cpp
+++ b/AK/Tests/TestUtf8.cpp
@@ -0,0 +1,58 @@
+#include <AK/TestSuite.h>
+
+#include <AK/Utf8View.h>
+
+TEST_CASE(decode_ascii) // A pure-ASCII string must validate and iterate as one codepoint per character.
+{
+    Utf8View utf8 { "Hello World!11" };
+    EXPECT(utf8.validate()); // ASCII-only input is expected to be reported as valid
+
+    u32 expected[] = { 72, 101, 108, 108, 111, 32, 87, 111, 114, 108, 100, 33, 49, 49 }; // ASCII codes of "Hello World!11", in order
+    size_t expected_size = sizeof(expected) / sizeof(expected[0]); // element count of expected[]
+
+    size_t i = 0; // number of codepoints seen so far; doubles as the index into expected[]
+    for (u32 codepoint : utf8) {
+        ASSERT(i < expected_size); // guards the expected[i] read below if the iterator yields too many codepoints
+        EXPECT_EQ(codepoint, expected[i]);
+        i++;
+    }
+    EXPECT_EQ(i, expected_size); // iteration must also not end early
+}
+
+TEST_CASE(decode_utf8) // Multi-byte text must validate and decode to the expected Unicode codepoints.
+{
+    Utf8View utf8 { "Привет, мир! 😀 γειά σου κόσμος こんにちは世界" }; // mixes ASCII, Cyrillic/Greek, Japanese and an emoji: 1-, 2-, 3- and 4-byte sequences (assuming this file is saved as UTF-8)
+    EXPECT(utf8.validate()); // well-formed input is expected to be reported as valid
+
+    u32 expected[] = { 1055, 1088, 1080, 1074, 1077, 1090, 44, 32, 1084, 1080, 1088, 33, 32, 128512, 32, 947, 949, 953, 940, 32, 963, 959, 965, 32, 954, 972, 963, 956, 959, 962, 32, 12371, 12435, 12395, 12385, 12399, 19990, 30028 }; // codepoints of the string above, in order; 128512 is U+1F600 (the emoji)
+    size_t expected_size = sizeof(expected) / sizeof(expected[0]); // element count of expected[]
+
+    size_t i = 0; // number of codepoints seen so far; doubles as the index into expected[]
+    for (u32 codepoint : utf8) {
+        ASSERT(i < expected_size); // guards the expected[i] read below if the iterator yields too many codepoints
+        EXPECT_EQ(codepoint, expected[i]);
+        i++;
+    }
+    EXPECT_EQ(i, expected_size); // iteration must also not end early
+}
+
+TEST_CASE(validate_invalid_ut8) // validate() must reject each malformed sequence below. NOTE(review): "ut8" in the test name looks like a typo for "utf8".
+{
+    char invalid_utf8_1[] = { 42, 35, (char)182, 9, 0 }; // 182 (0xB6) is a continuation byte with no preceding lead byte
+    Utf8View utf8_1 { invalid_utf8_1 };
+    EXPECT(!utf8_1.validate());
+
+    char invalid_utf8_2[] = { 42, 35, (char)208, (char)208, 0 }; // 208 (0xD0) is a 2-byte lead byte, followed here by another lead byte instead of a continuation byte
+    Utf8View utf8_2 { invalid_utf8_2 };
+    EXPECT(!utf8_2.validate());
+
+    char invalid_utf8_3[] = { (char)208, 0 }; // lead byte 0xD0 with the string ending before its continuation byte
+    Utf8View utf8_3 { invalid_utf8_3 };
+    EXPECT(!utf8_3.validate());
+
+    char invalid_utf8_4[] = { (char)208, 35, 0 }; // lead byte 0xD0 followed by an ASCII byte (35, '#') instead of a continuation byte
+    Utf8View utf8_4 { invalid_utf8_4 };
+    EXPECT(!utf8_4.validate());
+}
+
+TEST_MAIN(UTF8) // presumably expands to this test program's entry point; the macro is not defined in this file (likely AK/TestSuite.h) — confirm there