mirror of
				https://github.com/RGBCube/serenity
				synced 2025-10-31 15:32:46 +00:00 
			
		
		
		
	AK: Invalidate overlong UTF-8 code point encodings
For example, the code point U+002F could be encoded as UTF-8 with the overlong two-byte sequence 0xC0 0xAF instead of the single byte 0x2F. This trick has historically been used to bypass security checks.
This commit is contained in:
		
							parent
							
								
									796a615bc1
								
							
						
					
					
						commit
						c4d78c29a2
					
				
					 2 changed files with 58 additions and 6 deletions
				
			
		|  | @ -7,7 +7,6 @@ | |||
| 
 | ||||
| #pragma once | ||||
| 
 | ||||
| #include <AK/CharacterTypes.h> | ||||
| #include <AK/DeprecatedString.h> | ||||
| #include <AK/Format.h> | ||||
| #include <AK/StringView.h> | ||||
|  | @ -142,7 +141,7 @@ public: | |||
|                 code_point |= code_point_bits; | ||||
|             } | ||||
| 
 | ||||
|             if (!is_unicode(code_point)) | ||||
|             if (!is_valid_code_point(code_point, byte_length)) | ||||
|                 return false; | ||||
| 
 | ||||
|             valid_bytes += byte_length; | ||||
|  | @ -162,13 +161,15 @@ private: | |||
|         size_t byte_length { 0 }; | ||||
|         u8 encoding_bits { 0 }; | ||||
|         u8 encoding_mask { 0 }; | ||||
|         u32 first_code_point { 0 }; | ||||
|         u32 last_code_point { 0 }; | ||||
|     }; | ||||
| 
 | ||||
|     static constexpr Array<Utf8EncodedByteData, 4> utf8_encoded_byte_data { { | ||||
|         { 1, 0b0000'0000, 0b1000'0000 }, | ||||
|         { 2, 0b1100'0000, 0b1110'0000 }, | ||||
|         { 3, 0b1110'0000, 0b1111'0000 }, | ||||
|         { 4, 0b1111'0000, 0b1111'1000 }, | ||||
|         { 1, 0b0000'0000, 0b1000'0000, 0x0000, 0x007F }, | ||||
|         { 2, 0b1100'0000, 0b1110'0000, 0x0080, 0x07FF }, | ||||
|         { 3, 0b1110'0000, 0b1111'0000, 0x0800, 0xFFFF }, | ||||
|         { 4, 0b1111'0000, 0b1111'1000, 0x10000, 0x10FFFF }, | ||||
|     } }; | ||||
| 
 | ||||
|     struct LeadingByte { | ||||
|  | @ -208,6 +209,16 @@ private: | |||
|         return { .is_valid = false }; | ||||
|     } | ||||
| 
 | ||||
|     static constexpr bool is_valid_code_point(u32 code_point, size_t byte_length) | ||||
|     { | ||||
|         for (auto const& data : utf8_encoded_byte_data) { | ||||
|             if (code_point >= data.first_code_point && code_point <= data.last_code_point) | ||||
|                 return byte_length == data.byte_length; | ||||
|         } | ||||
| 
 | ||||
|         return false; | ||||
|     } | ||||
| 
 | ||||
|     StringView m_string; | ||||
|     mutable size_t m_length { 0 }; | ||||
|     mutable bool m_have_length { false }; | ||||
|  |  | |||
|  | @ -82,6 +82,47 @@ TEST_CASE(validate_invalid_ut8) | |||
|     EXPECT(valid_bytes == 0); | ||||
| } | ||||
| 
 | ||||
| TEST_CASE(validate_overlong_utf8) | ||||
| { | ||||
|     size_t valid_bytes = 0; | ||||
| 
 | ||||
|     // Overlong 2-byte encoding of U+002F
 | ||||
|     char invalid_utf8_1[] = { 42, 35, static_cast<char>(0xc0), static_cast<char>(0xaf) }; | ||||
|     Utf8View utf8_1 { StringView { invalid_utf8_1, sizeof(invalid_utf8_1) } }; | ||||
|     EXPECT(!utf8_1.validate(valid_bytes)); | ||||
|     EXPECT(valid_bytes == 2); | ||||
| 
 | ||||
|     // Overlong 3-byte encoding of U+002F
 | ||||
|     char invalid_utf8_2[] = { 42, 35, static_cast<char>(0xe0), static_cast<char>(0x80), static_cast<char>(0xaf) }; | ||||
|     Utf8View utf8_2 { StringView { invalid_utf8_2, sizeof(invalid_utf8_2) } }; | ||||
|     EXPECT(!utf8_2.validate(valid_bytes)); | ||||
|     EXPECT(valid_bytes == 2); | ||||
| 
 | ||||
|     // Overlong 4-byte encoding of U+002F
 | ||||
|     char invalid_utf8_3[] = { 42, 35, static_cast<char>(0xf0), static_cast<char>(0x80), static_cast<char>(0x80), static_cast<char>(0xaf) }; | ||||
|     Utf8View utf8_3 { StringView { invalid_utf8_3, sizeof(invalid_utf8_3) } }; | ||||
|     EXPECT(!utf8_3.validate(valid_bytes)); | ||||
|     EXPECT(valid_bytes == 2); | ||||
| 
 | ||||
|     // Overlong 3-byte encoding of U+00FF
 | ||||
|     char invalid_utf8_4[] = { 42, 35, static_cast<char>(0xe0), static_cast<char>(0x83), static_cast<char>(0xbf) }; | ||||
|     Utf8View utf8_4 { StringView { invalid_utf8_4, sizeof(invalid_utf8_4) } }; | ||||
|     EXPECT(!utf8_4.validate(valid_bytes)); | ||||
|     EXPECT(valid_bytes == 2); | ||||
| 
 | ||||
|     // Overlong 4-byte encoding of U+00FF
 | ||||
|     char invalid_utf8_5[] = { 42, 35, static_cast<char>(0xf0), static_cast<char>(0x80), static_cast<char>(0x83), static_cast<char>(0xbf) }; | ||||
|     Utf8View utf8_5 { StringView { invalid_utf8_5, sizeof(invalid_utf8_5) } }; | ||||
|     EXPECT(!utf8_5.validate(valid_bytes)); | ||||
|     EXPECT(valid_bytes == 2); | ||||
| 
 | ||||
|     // Overlong 4-byte encoding of U+0FFF
 | ||||
|     char invalid_utf8_6[] = { 42, 35, static_cast<char>(0xf0), static_cast<char>(0x8f), static_cast<char>(0xbf), static_cast<char>(0xbf) }; | ||||
|     Utf8View utf8_6 { StringView { invalid_utf8_6, sizeof(invalid_utf8_6) } }; | ||||
|     EXPECT(!utf8_6.validate(valid_bytes)); | ||||
|     EXPECT(valid_bytes == 2); | ||||
| } | ||||
| 
 | ||||
| TEST_CASE(iterate_utf8) | ||||
| { | ||||
|     Utf8View view("Some weird characters \u00A9\u266A\uA755"sv); | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Timothy Flynn
						Timothy Flynn