diff --git a/AK/Utf8View.h b/AK/Utf8View.h index 98a15bfc84..49382ef146 100644 --- a/AK/Utf8View.h +++ b/AK/Utf8View.h @@ -7,7 +7,6 @@ #pragma once -#include #include #include #include @@ -142,7 +141,7 @@ public: code_point |= code_point_bits; } - if (!is_unicode(code_point)) + if (!is_valid_code_point(code_point, byte_length)) return false; valid_bytes += byte_length; @@ -162,13 +161,15 @@ private: size_t byte_length { 0 }; u8 encoding_bits { 0 }; u8 encoding_mask { 0 }; + u32 first_code_point { 0 }; + u32 last_code_point { 0 }; }; static constexpr Array utf8_encoded_byte_data { { - { 1, 0b0000'0000, 0b1000'0000 }, - { 2, 0b1100'0000, 0b1110'0000 }, - { 3, 0b1110'0000, 0b1111'0000 }, - { 4, 0b1111'0000, 0b1111'1000 }, + { 1, 0b0000'0000, 0b1000'0000, 0x0000, 0x007F }, + { 2, 0b1100'0000, 0b1110'0000, 0x0080, 0x07FF }, + { 3, 0b1110'0000, 0b1111'0000, 0x0800, 0xFFFF }, + { 4, 0b1111'0000, 0b1111'1000, 0x10000, 0x10FFFF }, } }; struct LeadingByte { @@ -208,6 +209,16 @@ private: return { .is_valid = false }; } + static constexpr bool is_valid_code_point(u32 code_point, size_t byte_length) + { + for (auto const& data : utf8_encoded_byte_data) { + if (code_point >= data.first_code_point && code_point <= data.last_code_point) + return byte_length == data.byte_length; + } + + return false; + } + StringView m_string; mutable size_t m_length { 0 }; mutable bool m_have_length { false }; diff --git a/Tests/AK/TestUtf8.cpp b/Tests/AK/TestUtf8.cpp index 8c97246632..8dcc7d67fb 100644 --- a/Tests/AK/TestUtf8.cpp +++ b/Tests/AK/TestUtf8.cpp @@ -82,6 +82,47 @@ TEST_CASE(validate_invalid_ut8) EXPECT(valid_bytes == 0); } +TEST_CASE(validate_overlong_utf8) +{ + size_t valid_bytes = 0; + + // Overlong 2-byte encoding of U+002F + char invalid_utf8_1[] = { 42, 35, static_cast(0xc0), static_cast(0xaf) }; + Utf8View utf8_1 { StringView { invalid_utf8_1, sizeof(invalid_utf8_1) } }; + EXPECT(!utf8_1.validate(valid_bytes)); + EXPECT(valid_bytes == 2); + + // Overlong 3-byte encoding of 
U+002F + char invalid_utf8_2[] = { 42, 35, static_cast(0xe0), static_cast(0x80), static_cast(0xaf) }; + Utf8View utf8_2 { StringView { invalid_utf8_2, sizeof(invalid_utf8_2) } }; + EXPECT(!utf8_2.validate(valid_bytes)); + EXPECT(valid_bytes == 2); + + // Overlong 4-byte encoding of U+002F + char invalid_utf8_3[] = { 42, 35, static_cast(0xf0), static_cast(0x80), static_cast(0x80), static_cast(0xaf) }; + Utf8View utf8_3 { StringView { invalid_utf8_3, sizeof(invalid_utf8_3) } }; + EXPECT(!utf8_3.validate(valid_bytes)); + EXPECT(valid_bytes == 2); + + // Overlong 3-byte encoding of U+00FF + char invalid_utf8_4[] = { 42, 35, static_cast(0xe0), static_cast(0x83), static_cast(0xbf) }; + Utf8View utf8_4 { StringView { invalid_utf8_4, sizeof(invalid_utf8_4) } }; + EXPECT(!utf8_4.validate(valid_bytes)); + EXPECT(valid_bytes == 2); + + // Overlong 4-byte encoding of U+00FF + char invalid_utf8_5[] = { 42, 35, static_cast(0xf0), static_cast(0x80), static_cast(0x83), static_cast(0xbf) }; + Utf8View utf8_5 { StringView { invalid_utf8_5, sizeof(invalid_utf8_5) } }; + EXPECT(!utf8_5.validate(valid_bytes)); + EXPECT(valid_bytes == 2); + + // Overlong 4-byte encoding of U+FFFF + char invalid_utf8_6[] = { 42, 35, static_cast(0xf0), static_cast(0x8f), static_cast(0xbf), static_cast(0xbf) }; + Utf8View utf8_6 { StringView { invalid_utf8_6, sizeof(invalid_utf8_6) } }; + EXPECT(!utf8_6.validate(valid_bytes)); + EXPECT(valid_bytes == 2); +} + TEST_CASE(iterate_utf8) { Utf8View view("Some weird characters \u00A9\u266A\uA755"sv);