mirror of
				https://github.com/RGBCube/serenity
				synced 2025-10-31 17:02:45 +00:00 
			
		
		
		
	AK: Ensure short String instances are valid UTF-8
We are currently only validating long strings.
This commit is contained in:
		
							parent
							
								
									434ca78425
								
							
						
					
					
						commit
						da0d000909
					
				
					 3 changed files with 37 additions and 15 deletions
				
			
		|  | @ -11,7 +11,6 @@ | |||
| #include <AK/MemMem.h> | ||||
| #include <AK/Stream.h> | ||||
| #include <AK/String.h> | ||||
| #include <AK/Utf8View.h> | ||||
| #include <AK/Vector.h> | ||||
| #include <stdlib.h> | ||||
| 
 | ||||
|  | @ -132,10 +131,6 @@ ErrorOr<NonnullRefPtr<StringData>> StringData::from_utf8(char const* utf8_data, | |||
|     // Strings of MAX_SHORT_STRING_BYTE_COUNT bytes or less should be handled by the String short string optimization.
 | ||||
|     VERIFY(byte_count > String::MAX_SHORT_STRING_BYTE_COUNT); | ||||
| 
 | ||||
|     Utf8View view(StringView(utf8_data, byte_count)); | ||||
|     if (!view.validate()) | ||||
|         return Error::from_string_literal("StringData::from_utf8: Input was not valid UTF-8"); | ||||
| 
 | ||||
|     VERIFY(utf8_data); | ||||
|     u8* buffer = nullptr; | ||||
|     auto new_string_data = TRY(create_uninitialized(byte_count, buffer)); | ||||
|  | @ -143,6 +138,16 @@ ErrorOr<NonnullRefPtr<StringData>> StringData::from_utf8(char const* utf8_data, | |||
|     return new_string_data; | ||||
| } | ||||
| 
 | ||||
| static ErrorOr<void> read_stream_into_buffer(Stream& stream, Bytes buffer) | ||||
| { | ||||
|     TRY(stream.read_entire_buffer(buffer)); | ||||
| 
 | ||||
|     if (!Utf8View { StringView { buffer } }.validate()) | ||||
|         return Error::from_string_literal("String::from_stream: Input was not valid UTF-8"); | ||||
| 
 | ||||
|     return {}; | ||||
| } | ||||
| 
 | ||||
| ErrorOr<NonnullRefPtr<StringData>> StringData::from_stream(Stream& stream, size_t byte_count) | ||||
| { | ||||
|     // Strings of MAX_SHORT_STRING_BYTE_COUNT bytes or less should be handled by the String short string optimization.
 | ||||
|  | @ -150,12 +155,7 @@ ErrorOr<NonnullRefPtr<StringData>> StringData::from_stream(Stream& stream, size_ | |||
| 
 | ||||
|     u8* buffer = nullptr; | ||||
|     auto new_string_data = TRY(create_uninitialized(byte_count, buffer)); | ||||
|     Bytes new_string_bytes = { buffer, byte_count }; | ||||
|     TRY(stream.read_entire_buffer(new_string_bytes)); | ||||
| 
 | ||||
|     Utf8View view(StringView { new_string_bytes }); | ||||
|     if (!view.validate()) | ||||
|         return Error::from_string_literal("StringData::from_stream: Input was not valid UTF-8"); | ||||
|     TRY(read_stream_into_buffer(stream, { buffer, byte_count })); | ||||
| 
 | ||||
|     return new_string_data; | ||||
| } | ||||
|  | @ -230,6 +230,9 @@ void String::destroy_string() | |||
| 
 | ||||
| ErrorOr<String> String::from_utf8(StringView view) | ||||
| { | ||||
|     if (!Utf8View { view }.validate()) | ||||
|         return Error::from_string_literal("String::from_utf8: Input was not valid UTF-8"); | ||||
| 
 | ||||
|     if (view.length() <= MAX_SHORT_STRING_BYTE_COUNT) { | ||||
|         ShortString short_string; | ||||
|         if (!view.is_empty()) | ||||
|  | @ -246,7 +249,7 @@ ErrorOr<String> String::from_stream(Stream& stream, size_t byte_count) | |||
|     if (byte_count <= MAX_SHORT_STRING_BYTE_COUNT) { | ||||
|         ShortString short_string; | ||||
|         if (byte_count > 0) | ||||
|             TRY(stream.read_entire_buffer({ short_string.storage, byte_count })); | ||||
|             TRY(Detail::read_stream_into_buffer(stream, { short_string.storage, byte_count })); | ||||
|         short_string.byte_count_and_short_string_flag = (byte_count << 1) | SHORT_STRING_FLAG; | ||||
|         return String { short_string }; | ||||
|     } | ||||
|  | @ -587,9 +590,6 @@ DeprecatedString String::to_deprecated_string() const | |||
| 
 | ||||
| ErrorOr<String> String::from_deprecated_string(DeprecatedString const& deprecated_string) | ||||
| { | ||||
|     Utf8View view(deprecated_string); | ||||
|     if (!view.validate()) | ||||
|         return Error::from_string_literal("String::from_deprecated_string: Input was not valid UTF-8"); | ||||
|     return String::from_utf8(deprecated_string.view()); | ||||
| } | ||||
| 
 | ||||
|  |  | |||
|  | @ -20,6 +20,7 @@ | |||
| #include <AK/Traits.h> | ||||
| #include <AK/Types.h> | ||||
| #include <AK/UnicodeUtils.h> | ||||
| #include <AK/Utf8View.h> | ||||
| #include <AK/Vector.h> | ||||
| 
 | ||||
| namespace AK { | ||||
|  | @ -72,6 +73,7 @@ public: | |||
|     static AK_SHORT_STRING_CONSTEVAL String from_utf8_short_string(StringView string) | ||||
|     { | ||||
|         VERIFY(string.length() <= MAX_SHORT_STRING_BYTE_COUNT); | ||||
|         VERIFY(Utf8View { string }.validate()); | ||||
| 
 | ||||
|         ShortString short_string; | ||||
|         for (size_t i = 0; i < string.length(); ++i) | ||||
|  |  | |||
|  | @ -140,6 +140,26 @@ TEST_CASE(long_streams) | |||
|     } | ||||
| } | ||||
| 
 | ||||
| TEST_CASE(invalid_utf8) | ||||
| { | ||||
|     auto string1 = String::from_utf8("long string \xf4\x8f\xbf\xc0"sv); // U+110000
 | ||||
|     EXPECT(string1.is_error()); | ||||
|     EXPECT(string1.error().string_literal().contains("Input was not valid UTF-8"sv)); | ||||
| 
 | ||||
|     auto string2 = String::from_utf8("\xf4\xa1\xb0\xbd"sv); // U+121C3D
 | ||||
|     EXPECT(string2.is_error()); | ||||
|     EXPECT(string2.error().string_literal().contains("Input was not valid UTF-8"sv)); | ||||
| 
 | ||||
|     AllocatingMemoryStream stream; | ||||
|     MUST(stream.write_value<u8>(0xf4)); | ||||
|     MUST(stream.write_value<u8>(0xa1)); | ||||
|     MUST(stream.write_value<u8>(0xb0)); | ||||
|     MUST(stream.write_value<u8>(0xbd)); | ||||
|     auto string3 = String::from_stream(stream, stream.used_buffer_size()); | ||||
|     EXPECT_EQ(string3.is_error(), true); | ||||
|     EXPECT(string3.error().string_literal().contains("Input was not valid UTF-8"sv)); | ||||
| } | ||||
| 
 | ||||
| TEST_CASE(from_code_points) | ||||
| { | ||||
|     for (u32 code_point = 0; code_point < 0x80; ++code_point) { | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Timothy Flynn
						Timothy Flynn