From 5d3696174be8ab88ddad8397753b307a3a3ef94d Mon Sep 17 00:00:00 2001
From: Sergey Bugaev <bugaevc@gmail.com>
Date: Wed, 28 Aug 2019 00:57:15 +0300
Subject: [PATCH] AK: Add a Utf8View type for iterating over UTF-8 codepoints

Utf8View wraps a StringView and implements begin() and end() that
return a Utf8CodepointIterator, which parses UTF-8-encoded Unicode
codepoints and returns them as 32-bit integers.

This is the first step towards supporting emojis in Serenity ^)
https://github.com/SerenityOS/serenity/issues/490
---
 AK/Tests/Makefile     |   6 +-
 AK/Tests/TestUtf8.cpp |  58 +++++++++++++++++++
 AK/Utf8View.cpp       | 130 ++++++++++++++++++++++++++++++++++++++++++
 AK/Utf8View.h         |  48 ++++++++++++++++
 4 files changed, 241 insertions(+), 1 deletion(-)
 create mode 100644 AK/Tests/TestUtf8.cpp
 create mode 100644 AK/Utf8View.cpp
 create mode 100644 AK/Utf8View.h

diff --git a/AK/Tests/Makefile b/AK/Tests/Makefile
index a87819f174..a20db0fbf8 100644
--- a/AK/Tests/Makefile
+++ b/AK/Tests/Makefile
@@ -1,4 +1,4 @@
-PROGRAMS = TestString TestQueue TestVector TestHashMap TestJSON TestWeakPtr TestNonnullRefPtr TestRefPtr TestFixedArray TestFileSystemPath TestURL TestStringView
+PROGRAMS = TestString TestQueue TestVector TestHashMap TestJSON TestWeakPtr TestNonnullRefPtr TestRefPtr TestFixedArray TestFileSystemPath TestURL TestStringView TestUtf8
 
 CXXFLAGS = -std=c++17 -Wall -Wextra -ggdb3 -O2 -I../ -I../../
 
@@ -14,6 +14,7 @@ SHARED_TEST_OBJS = \
 	../JsonParser.o \
     ../FileSystemPath.o \
     ../URL.o \
+    ../Utf8View.o \
 
 .cpp.o:
 	@echo "HOST_CXX $<"; $(PRE_CXX) $(CXX) $(CXXFLAGS) -o $@ -c $<
@@ -65,6 +66,9 @@ TestURL: TestURL.o $(SHARED_TEST_OBJS)
 TestStringView: TestStringView.o $(SHARED_TEST_OBJS)
 	$(PRE_CXX) $(CXX) $(CXXFLAGS) -o $@ TestStringView.o $(SHARED_TEST_OBJS)
 
+TestUtf8: TestUtf8.o $(SHARED_TEST_OBJS)
+	$(PRE_CXX) $(CXX) $(CXXFLAGS) -o $@ TestUtf8.o $(SHARED_TEST_OBJS)
+
 clean:
 	rm -f $(SHARED_TEST_OBJS)
 	rm -f $(PROGRAMS)
diff --git a/AK/Tests/TestUtf8.cpp b/AK/Tests/TestUtf8.cpp
new file mode 100644
index 0000000000..8107ba9bac
--- /dev/null
+++ b/AK/Tests/TestUtf8.cpp
@@ -0,0 +1,58 @@
+#include <AK/TestSuite.h>
+
+#include <AK/Utf8View.h>
+
+TEST_CASE(decode_ascii) // Pure ASCII input: every byte must decode to a codepoint equal to its own value.
+{
+    Utf8View utf8 { "Hello World!11" };
+    EXPECT(utf8.validate());
+
+    u32 expected[] = { 72, 101, 108, 108, 111, 32, 87, 111, 114, 108, 100, 33, 49, 49 }; // ASCII codes of the string above.
+    size_t expected_size = sizeof(expected) / sizeof(expected[0]);
+
+    size_t i = 0;
+    for (u32 codepoint : utf8) {
+        ASSERT(i < expected_size); // Guards the expected[i] access if the iterator yields too many codepoints.
+        EXPECT_EQ(codepoint, expected[i]);
+        i++;
+    }
+    EXPECT_EQ(i, expected_size); // The iterator must also not stop early.
+}
+
+TEST_CASE(decode_utf8) // Mixed input covering 1-byte (ASCII), 2-byte (Cyrillic, Greek), 3-byte (Japanese) and 4-byte (emoji) sequences.
+{
+    Utf8View utf8 { "Привет, мир! 😀 γειά σου κόσμος こんにちは世界" };
+    EXPECT(utf8.validate());
+
+    u32 expected[] = { 1055, 1088, 1080, 1074, 1077, 1090, 44, 32, 1084, 1080, 1088, 33, 32, 128512, 32, 947, 949, 953, 940, 32, 963, 959, 965, 32, 954, 972, 963, 956, 959, 962, 32, 12371, 12435, 12395, 12385, 12399, 19990, 30028 }; // Decimal codepoints; 128512 is U+1F600.
+    size_t expected_size = sizeof(expected) / sizeof(expected[0]);
+
+    size_t i = 0;
+    for (u32 codepoint : utf8) {
+        ASSERT(i < expected_size); // Guards the expected[i] access if the iterator yields too many codepoints.
+        EXPECT_EQ(codepoint, expected[i]);
+        i++;
+    }
+    EXPECT_EQ(i, expected_size); // The iterator must also not stop early.
+}
+
+TEST_CASE(validate_invalid_ut8) // Malformed inputs that validate() must reject. NOTE(review): name is missing an 'f' ("ut8" -> "utf8").
+{
+    char invalid_utf8_1[] = { 42, 35, (char)182, 9, 0 }; // 182 = 10110110: a continuation byte where a lead byte is expected.
+    Utf8View utf8_1 { invalid_utf8_1 };
+    EXPECT(!utf8_1.validate());
+
+    char invalid_utf8_2[] = { 42, 35, (char)208, (char)208, 0 }; // 208 = 11010000: a 2-byte lead followed by another lead byte.
+    Utf8View utf8_2 { invalid_utf8_2 };
+    EXPECT(!utf8_2.validate());
+
+    char invalid_utf8_3[] = { (char)208, 0 }; // A 2-byte lead as the last byte: the sequence is truncated.
+    Utf8View utf8_3 { invalid_utf8_3 };
+    EXPECT(!utf8_3.validate());
+
+    char invalid_utf8_4[] = { (char)208, 35, 0 }; // A 2-byte lead followed by an ASCII byte instead of a continuation byte.
+    Utf8View utf8_4 { invalid_utf8_4 };
+    EXPECT(!utf8_4.validate());
+}
+
+TEST_MAIN(UTF8)
diff --git a/AK/Utf8View.cpp b/AK/Utf8View.cpp
new file mode 100644
index 0000000000..b3904a1eef
--- /dev/null
+++ b/AK/Utf8View.cpp
@@ -0,0 +1,130 @@
+#include <AK/Utf8View.h>
+
+namespace AK {
+
+Utf8View::Utf8View(const StringView& string) // Wraps `string`; the StringView itself is copied into m_string.
+    : m_string(string)
+{
+}
+
+const unsigned char* Utf8View::begin_ptr() const // Start of the wrapped StringView's character data.
+{
+    return (const unsigned char*)m_string.characters_without_null_termination(); // Unsigned so the bit tests and shifts see values 0..255.
+}
+
+const unsigned char* Utf8View::end_ptr() const // One past the last byte: data start plus m_string.length().
+{
+    return (const unsigned char*)m_string.characters_without_null_termination() + m_string.length();
+}
+
+Utf8CodepointIterator Utf8View::begin() const // Iterator at the first byte, with the whole string length remaining.
+{
+    return { begin_ptr(), m_string.length() };
+}
+
+Utf8CodepointIterator Utf8View::end() const // Past-the-end iterator: end pointer with zero bytes remaining.
+{
+    return { end_ptr(), 0 };
+}
+
+static inline bool decode_first_byte( // Classifies a UTF-8 lead byte; returns false (outputs untouched) if it cannot start a sequence.
+    unsigned char byte,
+    int& out_codepoint_length_in_bytes, // On success: total length of the sequence in bytes (1-4).
+    u32& out_value) // On success: the payload bits carried by the lead byte itself.
+{
+    if ((byte & 128) == 0) { // 0xxxxxxx: single-byte (ASCII) codepoint, the byte is the value.
+        out_value = byte;
+        out_codepoint_length_in_bytes = 1;
+        return true;
+    }
+    if ((byte & 64) == 0) { // 10xxxxxx: a continuation byte, never valid as a first byte.
+        return false;
+    }
+    if ((byte & 32) == 0) { // 110xxxxx: lead byte of a 2-byte sequence, 5 payload bits.
+        out_value = byte & 31;
+        out_codepoint_length_in_bytes = 2;
+        return true;
+    }
+    if ((byte & 16) == 0) { // 1110xxxx: lead byte of a 3-byte sequence, 4 payload bits.
+        out_value = byte & 15;
+        out_codepoint_length_in_bytes = 3;
+        return true;
+    }
+    if ((byte & 8) == 0) { // 11110xxx: lead byte of a 4-byte sequence, 3 payload bits.
+        out_value = byte & 7;
+        out_codepoint_length_in_bytes = 4;
+        return true;
+    }
+
+    return false; // 11111xxx: not a lead byte this decoder accepts.
+}
+
+bool Utf8View::validate() const // Structural check only: lead bytes and continuation bytes; overlong forms and surrogates are not rejected.
+{
+    for (auto ptr = begin_ptr(); ptr < end_ptr(); ptr++) { // Each pass consumes one whole sequence; ptr++ steps past its last byte.
+        int codepoint_length_in_bytes;
+        u32 value; // Required by decode_first_byte() but not used here.
+        bool first_byte_makes_sense = decode_first_byte(*ptr, codepoint_length_in_bytes, value);
+        if (!first_byte_makes_sense)
+            return false;
+
+        for (int i = 1; i < codepoint_length_in_bytes; i++) { // Check the continuation bytes the lead byte announced.
+            ptr++;
+            if (ptr >= end_ptr()) // Sequence is cut off by the end of the string.
+                return false;
+            if (*ptr >> 6 != 2) // Continuation bytes must have the form 10xxxxxx.
+                return false;
+        }
+    }
+
+    return true;
+}
+
+Utf8CodepointIterator::Utf8CodepointIterator(const unsigned char* ptr, int length) // `length` is the number of bytes remaining from `ptr`.
+    : m_ptr(ptr)
+    , m_length(length)
+{
+}
+
+bool Utf8CodepointIterator::operator==(const Utf8CodepointIterator& other) const // Equal only if both position and remaining byte count match.
+{
+    return m_ptr == other.m_ptr && m_length == other.m_length;
+}
+
+bool Utf8CodepointIterator::operator!=(const Utf8CodepointIterator& other) const // Negation of operator==.
+{
+    return !(*this == other);
+}
+
+Utf8CodepointIterator& Utf8CodepointIterator::operator++() // Advances to the start of the next codepoint, or to the end of the view.
+{
+    do { // Step over the current byte, then over every continuation byte (10xxxxxx) that follows it.
+        ASSERT(m_length > 0); // Incrementing an exhausted iterator is a caller bug.
+        m_length--;
+        m_ptr++;
+    } while (m_length > 0 && m_ptr[0] >> 6 == 2); // Check the length first: m_ptr[0] is out of bounds once the view is exhausted.
+
+    return *this; // Now at the next non-continuation byte, or at the end with m_length == 0.
+}
+
+u32 Utf8CodepointIterator::operator*() const // Decodes the codepoint at the current position; asserts on malformed or truncated input.
+{
+    ASSERT(m_length > 0); // Dereferencing a past-the-end iterator is a caller bug.
+
+    u32 codepoint_value_so_far;
+    int codepoint_length_in_bytes;
+
+    bool first_byte_makes_sense = decode_first_byte(m_ptr[0], codepoint_length_in_bytes, codepoint_value_so_far);
+    ASSERT(first_byte_makes_sense);
+    ASSERT(codepoint_length_in_bytes <= m_length); // The whole sequence must lie within the remaining bytes.
+
+    for (int offset = 1; offset < codepoint_length_in_bytes; offset++) {
+        ASSERT(m_ptr[offset] >> 6 == 2); // Each following byte must be a continuation byte (10xxxxxx).
+        codepoint_value_so_far <<= 6; // Make room for the next 6 payload bits.
+        codepoint_value_so_far |= m_ptr[offset] & 63;
+    }
+
+    return codepoint_value_so_far;
+}
+
+}
diff --git a/AK/Utf8View.h b/AK/Utf8View.h
new file mode 100644
index 0000000000..016b3f81f2
--- /dev/null
+++ b/AK/Utf8View.h
@@ -0,0 +1,48 @@
+#pragma once
+
+#include <AK/StringView.h>
+#include <AK/Types.h>
+
+namespace AK {
+
+class Utf8View;
+
+class Utf8CodepointIterator { // Forward iterator over a UTF-8 byte range that yields one decoded codepoint per step.
+    friend class Utf8View;
+
+public:
+    ~Utf8CodepointIterator() {}
+
+    bool operator==(const Utf8CodepointIterator&) const;
+    bool operator!=(const Utf8CodepointIterator&) const;
+    Utf8CodepointIterator& operator++(); // Pre-increment only; no post-increment is declared.
+    u32 operator*() const; // Returns the codepoint by value as a 32-bit integer.
+
+private:
+    Utf8CodepointIterator(const unsigned char*, int); // Private: only Utf8View (a friend) creates iterators.
+    const unsigned char* m_ptr { nullptr }; // Current position in the byte range.
+    int m_length { -1 }; // Bytes remaining from m_ptr; the constructor always overwrites the -1 default.
+};
+
+class Utf8View { // Wraps a StringView and exposes its contents as a range of UTF-8 codepoints.
+public:
+    explicit Utf8View(const StringView&);
+    ~Utf8View() {}
+
+    const StringView& as_string() const { return m_string; } // Access to the wrapped view.
+
+    Utf8CodepointIterator begin() const; // begin()/end() make the view usable in a range-based for loop.
+    Utf8CodepointIterator end() const;
+
+    bool validate() const; // Checks the byte structure; the iterator asserts on malformed input, so call this first on untrusted data.
+
+private:
+    const unsigned char* begin_ptr() const; // First byte of the wrapped string.
+    const unsigned char* end_ptr() const; // One past the last byte of the wrapped string.
+
+    StringView m_string;
+};
+
+}
+
+using AK::Utf8View;