mirror of
				https://github.com/RGBCube/serenity
				synced 2025-10-31 18:42:43 +00:00 
			
		
		
		
	LibWeb: Implement encoding sniffing algorithm
This patch implements the HTML specification's "encoding sniffing algorithm", which is used when no encoding can be obtained from the Content-Type header (either because it doesn't contain a charset=... value or because the file has not been opened via HTTP, as with local files). It also modifies the creator of the HTMLDocumentParser to use the new HTMLDocumentParser::create_with_uncertain_encoding static method, which runs the encoding sniffing algorithm before instantiating the parser. This now allows us to load local HTML pages (or remote pages without a charset specified in the 'Content-Type' header) with a non-UTF-8 encoding such as 'windows-1252'. This would previously crash the browser. :^)
This commit is contained in:
		
							parent
							
								
									67a9ebc817
								
							
						
					
					
						commit
						f808279769
					
				
					 6 changed files with 261 additions and 2 deletions
				
			
		|  | @ -147,6 +147,7 @@ set(SOURCES | |||
|     HTML/ImageData.cpp | ||||
|     HTML/Parser/Entities.cpp | ||||
|     HTML/Parser/HTMLDocumentParser.cpp | ||||
|     HTML/Parser/HTMLEncodingDetection.cpp | ||||
|     HTML/Parser/HTMLToken.cpp | ||||
|     HTML/Parser/HTMLTokenizer.cpp | ||||
|     HTML/Parser/ListOfActiveFormattingElements.cpp | ||||
|  |  | |||
|  | @ -22,6 +22,7 @@ | |||
| #include <LibWeb/HTML/HTMLTableElement.h> | ||||
| #include <LibWeb/HTML/HTMLTemplateElement.h> | ||||
| #include <LibWeb/HTML/Parser/HTMLDocumentParser.h> | ||||
| #include <LibWeb/HTML/Parser/HTMLEncodingDetection.h> | ||||
| #include <LibWeb/HTML/Parser/HTMLToken.h> | ||||
| #include <LibWeb/Namespace.h> | ||||
| #include <LibWeb/SVG/TagNames.h> | ||||
|  | @ -3039,4 +3040,14 @@ NonnullRefPtrVector<DOM::Node> HTMLDocumentParser::parse_html_fragment(DOM::Elem | |||
|     } | ||||
|     return children; | ||||
| } | ||||
| 
 | ||||
| NonnullOwnPtr<HTMLDocumentParser> HTMLDocumentParser::create_with_uncertain_encoding(DOM::Document& document, const ByteBuffer& input) | ||||
| { | ||||
|     if (document.has_encoding()) | ||||
|         return make<HTMLDocumentParser>(document, input, document.encoding().value()); | ||||
|     auto encoding = run_encoding_sniffing_algorithm(input); | ||||
|     dbgln("The encoding sniffing algorithm returned encoding '{}'", encoding); | ||||
|     return make<HTMLDocumentParser>(document, input, encoding); | ||||
| } | ||||
| 
 | ||||
| } | ||||
|  |  | |||
|  | @ -46,6 +46,8 @@ public: | |||
|     HTMLDocumentParser(DOM::Document&, const StringView& input, const String& encoding); | ||||
|     ~HTMLDocumentParser(); | ||||
| 
 | ||||
|     static NonnullOwnPtr<HTMLDocumentParser> create_with_uncertain_encoding(DOM::Document&, const ByteBuffer& input); | ||||
| 
 | ||||
|     void run(const URL&); | ||||
| 
 | ||||
|     DOM::Document& document(); | ||||
|  |  | |||
							
								
								
									
										223
									
								
								Userland/Libraries/LibWeb/HTML/Parser/HTMLEncodingDetection.cpp
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										223
									
								
								Userland/Libraries/LibWeb/HTML/Parser/HTMLEncodingDetection.cpp
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,223 @@ | |||
| /*
 | ||||
|  * Copyright (c) 2021, Max Wipfli <mail@maxwipfli.ch> | ||||
|  * | ||||
|  * SPDX-License-Identifier: BSD-2-Clause | ||||
|  */ | ||||
| 
 | ||||
| #include <AK/StringView.h> | ||||
| #include <AK/Utf8View.h> | ||||
| #include <LibTextCodec/Decoder.h> | ||||
| #include <LibWeb/HTML/Parser/HTMLEncodingDetection.h> | ||||
| #include <ctype.h> | ||||
| 
 | ||||
| namespace Web::HTML { | ||||
| 
 | ||||
| bool prescan_should_abort(const ByteBuffer& input, const size_t& position) | ||||
| { | ||||
|     return position >= input.size() || position >= 1024; | ||||
| } | ||||
| 
 | ||||
| bool prescan_is_whitespace_or_slash(const u8& byte) | ||||
| { | ||||
|     return byte == '\t' || byte == '\n' || byte == '\f' || byte == '\r' || byte == ' ' || byte == '/'; | ||||
| } | ||||
| 
 | ||||
| bool prescan_skip_whitespace_and_slashes(const ByteBuffer& input, size_t& position) | ||||
| { | ||||
|     while (!prescan_should_abort(input, position) && (input[position] == '\t' || input[position] == '\n' || input[position] == '\f' || input[position] == '\r' || input[position] == ' ' || input[position] == '/')) | ||||
|         ++position; | ||||
|     return !prescan_should_abort(input, position); | ||||
| } | ||||
| 
 | ||||
| Optional<Attribute> prescan_get_attribute(const ByteBuffer& input, size_t& position) | ||||
| { | ||||
|     if (!prescan_skip_whitespace_and_slashes(input, position)) | ||||
|         return {}; | ||||
|     if (input[position] == '>') | ||||
|         return {}; | ||||
| 
 | ||||
|     StringBuilder attribute_name; | ||||
|     while (true) { | ||||
|         if (input[position] == '=' && !attribute_name.is_empty()) { | ||||
|             ++position; | ||||
|             goto value; | ||||
|         } else if (input[position] == '\t' || input[position] == '\n' || input[position] == '\f' || input[position] == '\r' || input[position] == ' ') | ||||
|             goto spaces; | ||||
|         else if (input[position] == '/' || input[position] == '>') | ||||
|             return Attribute(attribute_name.to_string(), ""); | ||||
|         else | ||||
|             attribute_name.append_as_lowercase(input[position]); | ||||
|         ++position; | ||||
|         if (prescan_should_abort(input, position)) | ||||
|             return {}; | ||||
|     } | ||||
| 
 | ||||
| spaces: | ||||
|     if (!prescan_skip_whitespace_and_slashes(input, position)) | ||||
|         return {}; | ||||
|     if (input[position] != '=') | ||||
|         return Attribute(attribute_name.to_string(), ""); | ||||
|     ++position; | ||||
| 
 | ||||
| value: | ||||
|     if (!prescan_skip_whitespace_and_slashes(input, position)) | ||||
|         return {}; | ||||
| 
 | ||||
|     StringBuilder attribute_value; | ||||
|     if (input[position] == '"' || input[position] == '\'') { | ||||
|         u8 quote_character = input[position]; | ||||
|         ++position; | ||||
|         for (; !prescan_should_abort(input, position); ++position) { | ||||
|             if (input[position] == quote_character) | ||||
|                 return Attribute(attribute_name.to_string(), attribute_value.to_string()); | ||||
|             else | ||||
|                 attribute_value.append_as_lowercase(input[position]); | ||||
|         } | ||||
|         return {}; | ||||
|     } else if (input[position] == '>') | ||||
|         return Attribute(attribute_name.to_string(), ""); | ||||
|     else | ||||
|         attribute_value.append_as_lowercase(input[position]); | ||||
| 
 | ||||
|     ++position; | ||||
|     if (prescan_should_abort(input, position)) | ||||
|         return {}; | ||||
| 
 | ||||
|     for (; !prescan_should_abort(input, position); ++position) { | ||||
|         if (input[position] == '\t' || input[position] == '\n' || input[position] == '\f' || input[position] == '\r' || input[position] == ' ' || input[position] == '>') | ||||
|             return Attribute(attribute_name.to_string(), attribute_value.to_string()); | ||||
|         else | ||||
|             attribute_value.append_as_lowercase(input[position]); | ||||
|     } | ||||
|     return {}; | ||||
| } | ||||
| 
 | ||||
| // https://html.spec.whatwg.org/multipage/parsing.html#prescan-a-byte-stream-to-determine-its-encoding
 | ||||
| Optional<String> run_prescan_byte_stream_algorithm(const ByteBuffer& input) | ||||
| { | ||||
|     // https://html.spec.whatwg.org/multipage/parsing.html#prescan-a-byte-stream-to-determine-its-encoding
 | ||||
| 
 | ||||
|     // Detects '<?x'
 | ||||
|     if (!prescan_should_abort(input, 6)) { | ||||
|         if (input[0] == 0x3C && input[1] == 0x00 && input[2] == 0x3F && input[3] == 0x00 && input[4] == 0x78 && input[5] == 0x00) | ||||
|             return "utf-16le"; | ||||
|         if (input[0] == 0x00 && input[1] == 0x3C && input[2] == 0x00 && input[4] == 0x3F && input[5] == 0x00 && input[6] == 0x78) | ||||
|             return "utf-16be"; | ||||
|     } | ||||
| 
 | ||||
|     for (size_t position = 0; !prescan_should_abort(input, position); ++position) { | ||||
|         if (!prescan_should_abort(input, position + 5) && input[position] == '<' && input[position + 1] == '!' | ||||
|             && input[position + 2] == '-' && input[position + 3] == '-') { | ||||
|             position += 2; | ||||
|             for (; !prescan_should_abort(input, position + 3); ++position) { | ||||
|                 if (input[position] == '-' && input[position + 1] == '-' && input[position + 2] == '>') { | ||||
|                     position += 2; | ||||
|                     break; | ||||
|                 } | ||||
|             } | ||||
|         } else if (!prescan_should_abort(input, position + 6) | ||||
|             && input[position] == '<' | ||||
|             && (input[position + 1] == 'M' || input[position + 1] == 'm') | ||||
|             && (input[position + 2] == 'E' || input[position + 2] == 'e') | ||||
|             && (input[position + 3] == 'T' || input[position + 3] == 't') | ||||
|             && (input[position + 4] == 'A' || input[position + 4] == 'a') | ||||
|             && prescan_is_whitespace_or_slash(input[position + 5])) { | ||||
|             position += 6; | ||||
|             Vector<String> attribute_list {}; | ||||
|             bool got_pragma = false; | ||||
|             Optional<bool> need_pragma {}; | ||||
|             Optional<String> charset {}; | ||||
| 
 | ||||
|             while (true) { | ||||
|                 auto attribute = prescan_get_attribute(input, position); | ||||
|                 if (!attribute.has_value()) | ||||
|                     break; | ||||
|                 if (attribute_list.contains_slow(attribute.value().name())) | ||||
|                     continue; | ||||
|                 auto& attribute_name = attribute.value().name(); | ||||
|                 attribute_list.append(attribute.value().name()); | ||||
| 
 | ||||
|                 if (attribute_name == "http-equiv" && attribute.value().value() == "content-type") | ||||
|                     got_pragma = true; | ||||
|                 else if (attribute_name == "charset") { | ||||
|                     auto maybe_charset = TextCodec::get_standardized_encoding(attribute.value().value()); | ||||
|                     if (maybe_charset.has_value()) { | ||||
|                         charset = Optional<String> { maybe_charset }; | ||||
|                         need_pragma = { false }; | ||||
|                     } | ||||
|                 } | ||||
| 
 | ||||
|                 // FIXME: For attribute name "content", do this:
 | ||||
|                 //        Apply the "algorithm for extracting a character encoding from a meta
 | ||||
|                 //        element", giving the attribute's value as the string to parse. If a
 | ||||
|                 //        character encoding is returned, and if charset is still set to null,
 | ||||
|                 //        let charset be the encoding returned, and set need pragma to true.
 | ||||
|             } | ||||
| 
 | ||||
|             if (!need_pragma.has_value() || (need_pragma.value() && !got_pragma) || !charset.has_value()) | ||||
|                 continue; | ||||
|             if (charset.value() == "UTF-16BE/LE") | ||||
|                 return "UTF-8"; | ||||
|             else if (charset.value() == "x-user-defined") | ||||
|                 return "windows-1252"; | ||||
|             else | ||||
|                 return charset.value(); | ||||
|         } else if (!prescan_should_abort(input, position + 3) && input[position] == '<' | ||||
|             && ((input[position + 1] == '/' && isalpha(input[position + 2])) || isalpha(input[position + 1]))) { | ||||
|             position += 2; | ||||
|             prescan_skip_whitespace_and_slashes(input, position); | ||||
|             while (prescan_get_attribute(input, position).has_value()) { }; | ||||
|         } else if (!prescan_should_abort(input, position + 1) && input[position] == '<' && (input[position + 1] == '!' || input[position + 1] == '/' || input[position + 1] == '?')) { | ||||
|             position += 2; | ||||
|             while (input[position] != '>') { | ||||
|                 ++position; | ||||
|                 if (prescan_should_abort(input, position)) | ||||
|                     return {}; | ||||
|             } | ||||
|         } else { | ||||
|             // Do nothing.
 | ||||
|         } | ||||
|     } | ||||
|     return {}; | ||||
| } | ||||
| 
 | ||||
| // https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding
 | ||||
| String run_encoding_sniffing_algorithm(const ByteBuffer& input) | ||||
| { | ||||
|     if (input.size() >= 2) { | ||||
|         if (input[0] == 0xFE && input[1] == 0xFF) { | ||||
|             return "UTF-16BE"; | ||||
|         } else if (input[0] == 0xFF && input[1] == 0xFE) { | ||||
|             return "UTF-16LE"; | ||||
|         } else if (input.size() >= 3 && input[0] == 0xEF && input[1] == 0xBB && input[2] == 0xBF) { | ||||
|             return "UTF-8"; | ||||
|         } | ||||
|     } | ||||
| 
 | ||||
|     // FIXME: If the user has explicitly instructed the user agent to override the document's character
 | ||||
|     //        encoding with a specific encoding.
 | ||||
|     // FIXME: The user agent may wait for more bytes of the resource to be available, either in this step or
 | ||||
|     //        at any later step in this algorithm.
 | ||||
|     // FIXME: If the transport layer specifies a character encoding, and it is supported.
 | ||||
| 
 | ||||
|     auto optional_encoding = run_prescan_byte_stream_algorithm(input); | ||||
|     if (optional_encoding.has_value()) { | ||||
|         return optional_encoding.value(); | ||||
|     } | ||||
| 
 | ||||
|     // FIXME: If the HTML parser for which this algorithm is being run is associated with a Document whose browsing context
 | ||||
|     //        is non-null and a child browsing context.
 | ||||
|     // FIXME: If the user agent has information on the likely encoding for this page, e.g. based on the encoding of the page
 | ||||
|     //        when it was last visited.
 | ||||
| 
 | ||||
|     if (!Utf8View(StringView(input)).validate()) { | ||||
|         // FIXME: As soon as Locale is supported, this should sometimes return a different encoding based on the locale.
 | ||||
|         return "windows-1252"; | ||||
|     } | ||||
| 
 | ||||
|     // NOTE: This is the authoritative place to actually decide on using the default encoding as per the HTML specification.
 | ||||
|     //       "Otherwise, return an implementation-defined or user-specified default character encoding, [...]."
 | ||||
|     return "UTF-8"; | ||||
| } | ||||
| 
 | ||||
| } | ||||
|  | @ -0,0 +1,22 @@ | |||
| /*
 | ||||
|  * Copyright (c) 2021, Max Wipfli <mail@maxwipfli.ch> | ||||
|  * | ||||
|  * SPDX-License-Identifier: BSD-2-Clause | ||||
|  */ | ||||
| 
 | ||||
| #pragma once | ||||
| 
 | ||||
| #include <AK/Optional.h> | ||||
| #include <AK/String.h> | ||||
| #include <LibWeb/DOM/Attribute.h> | ||||
| 
 | ||||
| namespace Web::HTML { | ||||
| 
 | ||||
// Returns true when `position` is at/after the end of `input` or at/after byte 1024.
| bool prescan_should_abort(const ByteBuffer& input, const size_t& position); | ||||
// Returns true if `byte` is TAB, LF, FF, CR, SPACE or '/'.
| bool prescan_is_whitespace_or_slash(const u8& byte); | ||||
// Advances `position` past whitespace and '/' bytes; returns false if the prescan must abort.
| bool prescan_skip_whitespace_and_slashes(const ByteBuffer& input, size_t& position); | ||||
// Reads one attribute starting at `position`; empty when there is none or the prescan must abort.
| Optional<Attribute> prescan_get_attribute(const ByteBuffer& input, size_t& position); | ||||
// Prescans the start of `input` for an encoding declaration; empty when none is found.
| Optional<String> run_prescan_byte_stream_algorithm(const ByteBuffer& input); | ||||
// Picks an encoding for `input` from a BOM, the prescan, or a UTF-8 validity check.
| String run_encoding_sniffing_algorithm(const ByteBuffer& input); | ||||
| 
 | ||||
| } | ||||
|  | @ -113,8 +113,8 @@ bool FrameLoader::parse_document(DOM::Document& document, const ByteBuffer& data | |||
| { | ||||
|     auto& mime_type = document.content_type(); | ||||
|     if (mime_type == "text/html" || mime_type == "image/svg+xml") { | ||||
|         HTML::HTMLDocumentParser parser(document, data, document.encoding_or_default()); | ||||
|         parser.run(document.url()); | ||||
|         auto parser = HTML::HTMLDocumentParser::create_with_uncertain_encoding(document, data); | ||||
|         parser->run(document.url()); | ||||
|         return true; | ||||
|     } | ||||
|     if (mime_type.starts_with("image/")) | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Max Wipfli
						Max Wipfli