From 5d3696174be8ab88ddad8397753b307a3a3ef94d Mon Sep 17 00:00:00 2001
From: Sergey Bugaev
Date: Wed, 28 Aug 2019 00:57:15 +0300
Subject: [PATCH] AK: Add a Utf8View type for iterating over UTF-8 codepoints

Utf8View wraps a StringView and implements begin() and end() that
return a Utf8CodepointIterator, which parses UTF-8-encoded Unicode
codepoints and returns them as 32-bit integers.

This is the first step towards supporting emojis in Serenity ^)

https://github.com/SerenityOS/serenity/issues/490
---
 AK/Tests/Makefile     |   6 +-
 AK/Tests/TestUtf8.cpp |  58 +++++++++++++++++
 AK/Utf8View.cpp       | 143 ++++++++++++++++++++++++++++++++++++++++++
 AK/Utf8View.h         |  57 +++++++++++++++++
 4 files changed, 263 insertions(+), 1 deletion(-)
 create mode 100644 AK/Tests/TestUtf8.cpp
 create mode 100644 AK/Utf8View.cpp
 create mode 100644 AK/Utf8View.h

diff --git a/AK/Tests/Makefile b/AK/Tests/Makefile
index a87819f174..a20db0fbf8 100644
--- a/AK/Tests/Makefile
+++ b/AK/Tests/Makefile
@@ -1,4 +1,4 @@
-PROGRAMS = TestString TestQueue TestVector TestHashMap TestJSON TestWeakPtr TestNonnullRefPtr TestRefPtr TestFixedArray TestFileSystemPath TestURL TestStringView
+PROGRAMS = TestString TestQueue TestVector TestHashMap TestJSON TestWeakPtr TestNonnullRefPtr TestRefPtr TestFixedArray TestFileSystemPath TestURL TestStringView TestUtf8
 
 CXXFLAGS = -std=c++17 -Wall -Wextra -ggdb3 -O2 -I../ -I../../
 
@@ -14,6 +14,7 @@ SHARED_TEST_OBJS = \
     ../JsonParser.o \
     ../FileSystemPath.o \
     ../URL.o \
+    ../Utf8View.o \
 
 .cpp.o:
 	@echo "HOST_CXX $<"; $(PRE_CXX) $(CXX) $(CXXFLAGS) -o $@ -c $<
@@ -65,6 +66,9 @@ TestURL: TestURL.o $(SHARED_TEST_OBJS)
 TestStringView: TestStringView.o $(SHARED_TEST_OBJS)
 	$(PRE_CXX) $(CXX) $(CXXFLAGS) -o $@ TestStringView.o $(SHARED_TEST_OBJS)
 
+TestUtf8: TestUtf8.o $(SHARED_TEST_OBJS)
+	$(PRE_CXX) $(CXX) $(CXXFLAGS) -o $@ TestUtf8.o $(SHARED_TEST_OBJS)
+
 clean:
 	rm -f $(SHARED_TEST_OBJS)
 	rm -f $(PROGRAMS)
diff --git a/AK/Tests/TestUtf8.cpp b/AK/Tests/TestUtf8.cpp
new file mode 100644
index 0000000000..8107ba9bac
--- /dev/null
+++ b/AK/Tests/TestUtf8.cpp
@@ -0,0 +1,58 @@
+#include <AK/TestSuite.h>
+
+#include <AK/Utf8View.h>
+
+TEST_CASE(decode_ascii)
+{
+    Utf8View utf8 { "Hello World!11" };
+    EXPECT(utf8.validate());
+
+    u32 expected[] = { 72, 101, 108, 108, 111, 32, 87, 111, 114, 108, 100, 33, 49, 49 };
+    size_t expected_size = sizeof(expected) / sizeof(expected[0]);
+
+    size_t i = 0;
+    for (u32 codepoint : utf8) {
+        ASSERT(i < expected_size);
+        EXPECT_EQ(codepoint, expected[i]);
+        i++;
+    }
+    EXPECT_EQ(i, expected_size);
+}
+
+TEST_CASE(decode_utf8)
+{
+    Utf8View utf8 { "Привет, мир! 😀 γειά σου κόσμος こんにちは世界" };
+    EXPECT(utf8.validate());
+
+    u32 expected[] = { 1055, 1088, 1080, 1074, 1077, 1090, 44, 32, 1084, 1080, 1088, 33, 32, 128512, 32, 947, 949, 953, 940, 32, 963, 959, 965, 32, 954, 972, 963, 956, 959, 962, 32, 12371, 12435, 12395, 12385, 12399, 19990, 30028 };
+    size_t expected_size = sizeof(expected) / sizeof(expected[0]);
+
+    size_t i = 0;
+    for (u32 codepoint : utf8) {
+        ASSERT(i < expected_size);
+        EXPECT_EQ(codepoint, expected[i]);
+        i++;
+    }
+    EXPECT_EQ(i, expected_size);
+}
+
+TEST_CASE(validate_invalid_utf8)
+{
+    char invalid_utf8_1[] = { 42, 35, (char)182, 9, 0 };
+    Utf8View utf8_1 { invalid_utf8_1 };
+    EXPECT(!utf8_1.validate());
+
+    char invalid_utf8_2[] = { 42, 35, (char)208, (char)208, 0 };
+    Utf8View utf8_2 { invalid_utf8_2 };
+    EXPECT(!utf8_2.validate());
+
+    char invalid_utf8_3[] = { (char)208, 0 };
+    Utf8View utf8_3 { invalid_utf8_3 };
+    EXPECT(!utf8_3.validate());
+
+    char invalid_utf8_4[] = { (char)208, 35, 0 };
+    Utf8View utf8_4 { invalid_utf8_4 };
+    EXPECT(!utf8_4.validate());
+}
+
+TEST_MAIN(UTF8)
diff --git a/AK/Utf8View.cpp b/AK/Utf8View.cpp
new file mode 100644
index 0000000000..b3904a1eef
--- /dev/null
+++ b/AK/Utf8View.cpp
@@ -0,0 +1,143 @@
+#include <AK/Utf8View.h>
+
+namespace AK {
+
+Utf8View::Utf8View(const StringView& string)
+    : m_string(string)
+{
+}
+
+const unsigned char* Utf8View::begin_ptr() const
+{
+    return (const unsigned char*)m_string.characters_without_null_termination();
+}
+
+const unsigned char* Utf8View::end_ptr() const
+{
+    return (const unsigned char*)m_string.characters_without_null_termination() + m_string.length();
+}
+
+Utf8CodepointIterator Utf8View::begin() const
+{
+    return { begin_ptr(), m_string.length() };
+}
+
+Utf8CodepointIterator Utf8View::end() const
+{
+    return { end_ptr(), 0 };
+}
+
+// Classifies the first byte of an encoded codepoint. On success, stores the
+// total length of the encoding (1 to 4 bytes) and the payload bits carried by
+// this byte, and returns true. Returns false for a continuation byte
+// (10xxxxxx) and for bytes of the form 11111xxx.
+static inline bool decode_first_byte(
+    unsigned char byte,
+    int& out_codepoint_length_in_bytes,
+    u32& out_value)
+{
+    if ((byte & 128) == 0) {
+        out_value = byte;
+        out_codepoint_length_in_bytes = 1;
+        return true;
+    }
+    if ((byte & 64) == 0) {
+        return false;
+    }
+    if ((byte & 32) == 0) {
+        out_value = byte & 31;
+        out_codepoint_length_in_bytes = 2;
+        return true;
+    }
+    if ((byte & 16) == 0) {
+        out_value = byte & 15;
+        out_codepoint_length_in_bytes = 3;
+        return true;
+    }
+    if ((byte & 8) == 0) {
+        out_value = byte & 7;
+        out_codepoint_length_in_bytes = 4;
+        return true;
+    }
+
+    return false;
+}
+
+// Checks the structure of the encoding only: every lead byte must be valid
+// and followed, inside the view, by the continuation bytes it announces.
+// The decoded values themselves are not inspected.
+bool Utf8View::validate() const
+{
+    for (auto ptr = begin_ptr(); ptr < end_ptr(); ptr++) {
+        int codepoint_length_in_bytes;
+        u32 value;
+        bool first_byte_makes_sense = decode_first_byte(*ptr, codepoint_length_in_bytes, value);
+        if (!first_byte_makes_sense)
+            return false;
+
+        for (int i = 1; i < codepoint_length_in_bytes; i++) {
+            ptr++;
+            if (ptr >= end_ptr())
+                return false;
+            if (*ptr >> 6 != 2)
+                return false;
+        }
+    }
+
+    return true;
+}
+
+Utf8CodepointIterator::Utf8CodepointIterator(const unsigned char* ptr, int length)
+    : m_ptr(ptr)
+    , m_length(length)
+{
+}
+
+bool Utf8CodepointIterator::operator==(const Utf8CodepointIterator& other) const
+{
+    return m_ptr == other.m_ptr && m_length == other.m_length;
+}
+
+bool Utf8CodepointIterator::operator!=(const Utf8CodepointIterator& other) const
+{
+    return !(*this == other);
+}
+
+// Advances to the start of the next codepoint by skipping continuation
+// bytes, or to the end of the view if no further codepoint follows.
+Utf8CodepointIterator& Utf8CodepointIterator::operator++()
+{
+    do {
+        ASSERT(m_length > 0);
+        m_length--;
+        m_ptr++;
+        // Check the remaining length first so that the byte past the end of
+        // the view is never read.
+    } while (m_length > 0 && m_ptr[0] >> 6 == 2);
+
+    return *this;
+}
+
+// Decodes the codepoint at the current position. ASSERTs that the position
+// holds a valid lead byte followed by enough continuation bytes.
+u32 Utf8CodepointIterator::operator*() const
+{
+    ASSERT(m_length > 0);
+
+    u32 codepoint_value_so_far;
+    int codepoint_length_in_bytes;
+
+    bool first_byte_makes_sense = decode_first_byte(m_ptr[0], codepoint_length_in_bytes, codepoint_value_so_far);
+    ASSERT(first_byte_makes_sense);
+    ASSERT(codepoint_length_in_bytes <= m_length);
+
+    for (int offset = 1; offset < codepoint_length_in_bytes; offset++) {
+        ASSERT(m_ptr[offset] >> 6 == 2);
+        codepoint_value_so_far <<= 6;
+        codepoint_value_so_far |= m_ptr[offset] & 63;
+    }
+
+    return codepoint_value_so_far;
+}
+
+}
diff --git a/AK/Utf8View.h b/AK/Utf8View.h
new file mode 100644
index 0000000000..016b3f81f2
--- /dev/null
+++ b/AK/Utf8View.h
@@ -0,0 +1,57 @@
+#pragma once
+
+#include <AK/StringView.h>
+#include <AK/Types.h>
+
+namespace AK {
+
+class Utf8View;
+
+// Forward iterator over the codepoints of a Utf8View. Dereferencing decodes
+// the codepoint at the current position; incrementing skips to the next
+// byte that is not a UTF-8 continuation byte.
+class Utf8CodepointIterator {
+    friend class Utf8View;
+
+public:
+    ~Utf8CodepointIterator() {}
+
+    bool operator==(const Utf8CodepointIterator&) const;
+    bool operator!=(const Utf8CodepointIterator&) const;
+    Utf8CodepointIterator& operator++();
+    u32 operator*() const;
+
+private:
+    // Only Utf8View creates iterators: a position plus the bytes left from it.
+    Utf8CodepointIterator(const unsigned char*, int);
+    const unsigned char* m_ptr { nullptr };
+    int m_length { -1 };
+};
+
+// Wraps a StringView and exposes its bytes as a sequence of UTF-8 encoded
+// codepoints. Dereferencing an iterator ASSERTs on malformed input, so call
+// validate() first when the data is untrusted.
+class Utf8View {
+public:
+    explicit Utf8View(const StringView&);
+    ~Utf8View() {}
+
+    const StringView& as_string() const { return m_string; }
+
+    Utf8CodepointIterator begin() const;
+    Utf8CodepointIterator end() const;
+
+    // Returns true if every lead byte is well-formed and is followed by the
+    // number of continuation bytes it announces.
+    bool validate() const;
+
+private:
+    const unsigned char* begin_ptr() const;
+    const unsigned char* end_ptr() const;
+
+    StringView m_string;
+};
+
+}
+
+using AK::Utf8View;