mirror of
				https://github.com/RGBCube/serenity
				synced 2025-10-31 17:02:45 +00:00 
			
		
		
		
	LibWeb: Make the new HTML parser parse input as UTF-8
We already convert the input to UTF-8 before starting the tokenizer, so all this patch had to do was switch the tokenizer to use a Utf8View for its input (and to emit 32-bit codepoints).
This commit is contained in:
		
							parent
							
								
									23dad305e9
								
							
						
					
					
						commit
						b6288163f1
					
				
					 3 changed files with 75 additions and 49 deletions
				
			
		|  | @ -30,6 +30,7 @@ | |||
| #include <AK/String.h> | ||||
| #include <AK/StringBuilder.h> | ||||
| #include <AK/Types.h> | ||||
| #include <AK/Utf8View.h> | ||||
| #include <AK/Vector.h> | ||||
| 
 | ||||
| namespace Web { | ||||
|  | @ -67,9 +68,9 @@ public: | |||
|     u32 codepoint() const | ||||
|     { | ||||
|         ASSERT(is_character()); | ||||
|         // FIXME: Handle non-ASCII codepoints properly.
 | ||||
|         ASSERT(m_comment_or_character.data.length() == 1); | ||||
|         return m_comment_or_character.data.string_view()[0]; | ||||
|         Utf8View view(m_comment_or_character.data.string_view()); | ||||
|         ASSERT(view.length_in_codepoints() == 1); | ||||
|         return *view.begin(); | ||||
|     } | ||||
| 
 | ||||
|     bool is_parser_whitespace() const | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Andreas Kling
						Andreas Kling