mirror of
				https://github.com/RGBCube/serenity
				synced 2025-10-31 15:32:46 +00:00 
			
		
		
		
	AK: Compute UTF-8 code point lengths using only leading bytes
We don't need to decode the entire code point to know its length. This reduces the runtime of decoding a string containing 5 million instances of U+10FFFF from over 4 seconds to 0.9 seconds.
This commit is contained in:
		
							parent
							
								
									516d2f4892
								
							
						
					
					
						commit
						1d5b45f7d9
					
				
					 1 changed file with 7 additions and 2 deletions
				
			
		|  | @ -73,9 +73,14 @@ Utf8View Utf8View::unicode_substring_view(size_t code_point_offset, size_t code_ | |||
| size_t Utf8View::calculate_length() const | ||||
| { | ||||
|     size_t length = 0; | ||||
|     for ([[maybe_unused]] auto code_point : *this) { | ||||
|         ++length; | ||||
| 
 | ||||
|     for (size_t i = 0; i < m_string.length(); ++length) { | ||||
|         auto [byte_length, code_point, is_valid] = decode_leading_byte(static_cast<u8>(m_string[i])); | ||||
| 
 | ||||
|         // Similar to Utf8CodePointIterator::operator++, if the byte is invalid, try the next byte.
 | ||||
|         i += is_valid ? byte_length : 1; | ||||
|     } | ||||
| 
 | ||||
|     return length; | ||||
| } | ||||
| 
 | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Timothy Flynn
						Timothy Flynn