diff --git a/Userland/Libraries/LibWeb/CMakeLists.txt b/Userland/Libraries/LibWeb/CMakeLists.txt
index cd3d7cf247..ecf3c02675 100644
--- a/Userland/Libraries/LibWeb/CMakeLists.txt
+++ b/Userland/Libraries/LibWeb/CMakeLists.txt
@@ -147,6 +147,7 @@ set(SOURCES
     HTML/ImageData.cpp
     HTML/Parser/Entities.cpp
     HTML/Parser/HTMLDocumentParser.cpp
+    HTML/Parser/HTMLEncodingDetection.cpp
     HTML/Parser/HTMLToken.cpp
     HTML/Parser/HTMLTokenizer.cpp
     HTML/Parser/ListOfActiveFormattingElements.cpp
NOTE(review): in the copy of this patch under review, the include targets and template arguments of the next two file sections were missing. The added include and the template arguments are restored from how the names are used; the six context include lines are restored from the upstream file and must be confirmed before applying.
diff --git a/Userland/Libraries/LibWeb/HTML/Parser/HTMLDocumentParser.cpp b/Userland/Libraries/LibWeb/HTML/Parser/HTMLDocumentParser.cpp
index e0c5efc4b2..77e49a5155 100644
--- a/Userland/Libraries/LibWeb/HTML/Parser/HTMLDocumentParser.cpp
+++ b/Userland/Libraries/LibWeb/HTML/Parser/HTMLDocumentParser.cpp
@@ -22,6 +22,7 @@
 #include <LibWeb/HTML/HTMLTableElement.h>
 #include <LibWeb/HTML/HTMLTemplateElement.h>
 #include <LibWeb/HTML/Parser/HTMLDocumentParser.h>
+#include <LibWeb/HTML/Parser/HTMLEncodingDetection.h>
 #include <LibWeb/HTML/Parser/HTMLToken.h>
 #include <LibWeb/Namespace.h>
 #include <LibWeb/SVG/TagNames.h>
@@ -3039,4 +3040,16 @@ NonnullRefPtrVector<DOM::Node> HTMLDocumentParser::parse_html_fragment(DOM::Elem
     }
     return children;
 }
+
+// Creates a parser for `input` when its encoding may not be known yet: an encoding already set on
+// `document` is used as is, otherwise the encoding sniffing algorithm is run on `input`.
+NonnullOwnPtr<HTMLDocumentParser> HTMLDocumentParser::create_with_uncertain_encoding(DOM::Document& document, const ByteBuffer& input)
+{
+    if (document.has_encoding())
+        return make<HTMLDocumentParser>(document, input, document.encoding().value());
+    auto encoding = run_encoding_sniffing_algorithm(input);
+    dbgln("The encoding sniffing algorithm returned encoding '{}'", encoding);
+    return make<HTMLDocumentParser>(document, input, encoding);
+}
+
 }
diff --git a/Userland/Libraries/LibWeb/HTML/Parser/HTMLDocumentParser.h b/Userland/Libraries/LibWeb/HTML/Parser/HTMLDocumentParser.h
index 15bb114e95..3f56db98c1 100644
--- a/Userland/Libraries/LibWeb/HTML/Parser/HTMLDocumentParser.h
+++ b/Userland/Libraries/LibWeb/HTML/Parser/HTMLDocumentParser.h
@@ -46,6 +46,8 @@ public:
     HTMLDocumentParser(DOM::Document&, const StringView& input, const String& encoding);
     ~HTMLDocumentParser();
 
+    static NonnullOwnPtr<HTMLDocumentParser> create_with_uncertain_encoding(DOM::Document&, const ByteBuffer& input);
+
     void run(const URL&);
 
     DOM::Document& document();
diff --git 
a/Userland/Libraries/LibWeb/HTML/Parser/HTMLEncodingDetection.cpp b/Userland/Libraries/LibWeb/HTML/Parser/HTMLEncodingDetection.cpp
new file mode 100644
index 0000000000..1afe85ba05
--- /dev/null
+++ b/Userland/Libraries/LibWeb/HTML/Parser/HTMLEncodingDetection.cpp
@@ -0,0 +1,263 @@
+/*
+ * Copyright (c) 2021, Max Wipfli
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#include <AK/StringView.h>
+#include <AK/Utf8View.h>
+#include <LibTextCodec/Decoder.h>
+#include <LibWeb/DOM/Attribute.h>
+#include <LibWeb/HTML/Parser/HTMLEncodingDetection.h>
+
+// NOTE(review): The include targets above and the template arguments in this file were missing from
+// the copy of the patch under review; they are restored from how the names are used. Confirm them.
+
+namespace Web::HTML {
+
+// Number of leading bytes the prescan is allowed to look at.
+static constexpr size_t prescan_byte_limit = 1024;
+
+// Returns true for the whitespace bytes HT, LF, FF, CR and SP.
+static bool prescan_is_whitespace(u8 byte)
+{
+    return byte == '\t' || byte == '\n' || byte == '\f' || byte == '\r' || byte == ' ';
+}
+
+// Returns true once `position` lies outside `input` or outside the prescan window, i.e. when
+// `input[position]` must no longer be read.
+bool prescan_should_abort(const ByteBuffer& input, const size_t& position)
+{
+    return position >= input.size() || position >= prescan_byte_limit;
+}
+
+// Returns true for whitespace bytes and for '/'.
+bool prescan_is_whitespace_or_slash(const u8& byte)
+{
+    return prescan_is_whitespace(byte) || byte == '/';
+}
+
+// Advances `position` past a run of whitespace bytes.
+// Returns false if the prescan has to stop, true if `input[position]` can be read.
+static bool prescan_skip_whitespace(const ByteBuffer& input, size_t& position)
+{
+    while (!prescan_should_abort(input, position) && prescan_is_whitespace(input[position]))
+        ++position;
+    return !prescan_should_abort(input, position);
+}
+
+// Advances `position` past a run of whitespace and '/' bytes.
+// Returns false if the prescan has to stop, true if `input[position]` can be read.
+bool prescan_skip_whitespace_and_slashes(const ByteBuffer& input, size_t& position)
+{
+    while (!prescan_should_abort(input, position) && prescan_is_whitespace_or_slash(input[position]))
+        ++position;
+    return !prescan_should_abort(input, position);
+}
+
+// https://html.spec.whatwg.org/multipage/parsing.html#concept-get-attributes-when-sniffing
+// Reads one attribute starting at `position`, lowercasing its name and value.
+// Returns an empty Optional if the tag ends ('>') or the prescan has to stop before the attribute is complete.
+Optional<Attribute> prescan_get_attribute(const ByteBuffer& input, size_t& position)
+{
+    if (!prescan_skip_whitespace_and_slashes(input, position))
+        return {};
+    if (input[position] == '>')
+        return {};
+
+    // Attribute name: everything up to '=', whitespace, '/' or '>'.
+    StringBuilder attribute_name;
+    bool consumed_equals_sign = false;
+    while (true) {
+        u8 byte = input[position];
+        if (byte == '=' && !attribute_name.is_empty()) {
+            ++position;
+            consumed_equals_sign = true;
+            break;
+        }
+        if (prescan_is_whitespace(byte))
+            break;
+        if (byte == '/' || byte == '>')
+            return Attribute(attribute_name.to_string(), "");
+        attribute_name.append_as_lowercase(byte);
+        ++position;
+        if (prescan_should_abort(input, position))
+            return {};
+    }
+
+    // Spaces: only whitespace may separate the name from '='. The spec does not skip '/' here.
+    if (!consumed_equals_sign) {
+        if (!prescan_skip_whitespace(input, position))
+            return {};
+        if (input[position] != '=')
+            return Attribute(attribute_name.to_string(), "");
+        ++position;
+    }
+
+    // Value: skip whitespace only, so that a leading '/' stays part of the value.
+    if (!prescan_skip_whitespace(input, position))
+        return {};
+
+    StringBuilder attribute_value;
+    u8 first_value_byte = input[position];
+    if (first_value_byte == '"' || first_value_byte == '\'') {
+        for (++position; !prescan_should_abort(input, position); ++position) {
+            if (input[position] == first_value_byte) {
+                // Step past the closing quote so it is not read as the start of the next attribute name.
+                ++position;
+                return Attribute(attribute_name.to_string(), attribute_value.to_string());
+            }
+            attribute_value.append_as_lowercase(input[position]);
+        }
+        return {};
+    }
+    if (first_value_byte == '>')
+        return Attribute(attribute_name.to_string(), "");
+
+    // Unquoted value: everything up to the next whitespace or '>'.
+    attribute_value.append_as_lowercase(first_value_byte);
+    for (++position; !prescan_should_abort(input, position); ++position) {
+        if (prescan_is_whitespace(input[position]) || input[position] == '>')
+            return Attribute(attribute_name.to_string(), attribute_value.to_string());
+        attribute_value.append_as_lowercase(input[position]);
+    }
+    return {};
+}
+
+// https://html.spec.whatwg.org/multipage/parsing.html#prescan-a-byte-stream-to-determine-its-encoding
+// Looks for an encoding declaration (a UTF-16 XML declaration or a <meta> element) at the start of
+// `input`. Returns the declared encoding, or an empty Optional if none is found.
+Optional<String> run_prescan_byte_stream_algorithm(const ByteBuffer& input)
+{
+    // NOTE(review): In the copy of the patch under review, the code from here up to the end of the
+    // "<!--" branch was missing. It is rebuilt from the steps of the specification linked above.
+    // Confirm it against the original change.
+
+    // Detects '<?x' encoded as UTF-16, little-endian or big-endian.
+    if (!prescan_should_abort(input, 5)) {
+        if (input[0] == 0x3C && input[1] == 0x00 && input[2] == 0x3F && input[3] == 0x00 && input[4] == 0x78 && input[5] == 0x00)
+            return "UTF-16LE";
+        if (input[0] == 0x00 && input[1] == 0x3C && input[2] == 0x00 && input[3] == 0x3F && input[4] == 0x00 && input[5] == 0x78)
+            return "UTF-16BE";
+    }
+
+    for (size_t position = 0; !prescan_should_abort(input, position); ++position) {
+        if (!prescan_should_abort(input, position + 3) && input[position] == '<' && input[position + 1] == '!'
+            && input[position + 2] == '-' && input[position + 3] == '-') {
+            // Comment: move to the '>' of the first "-->". Its dashes may be the ones of "<!--".
+            bool found_comment_end = false;
+            for (position += 2; !prescan_should_abort(input, position + 2); ++position) {
+                if (input[position] == '-' && input[position + 1] == '-' && input[position + 2] == '>') {
+                    position += 2;
+                    found_comment_end = true;
+                    break;
+                }
+            }
+            if (!found_comment_end)
+                return {};
+        } else if (!prescan_should_abort(input, position + 6)
+            && input[position] == '<'
+            && (input[position + 1] == 'M' || input[position + 1] == 'm')
+            && (input[position + 2] == 'E' || input[position + 2] == 'e')
+            && (input[position + 3] == 'T' || input[position + 3] == 't')
+            && (input[position + 4] == 'A' || input[position + 4] == 'a')
+            && prescan_is_whitespace_or_slash(input[position + 5])) {
+            position += 6;
+            Vector<String> attribute_list {};
+            bool got_pragma = false;
+            Optional<bool> need_pragma {};
+            Optional<String> charset {};
+
+            while (true) {
+                auto attribute = prescan_get_attribute(input, position);
+                if (!attribute.has_value())
+                    break;
+                if (attribute_list.contains_slow(attribute.value().name()))
+                    continue;
+                auto& attribute_name = attribute.value().name();
+                attribute_list.append(attribute.value().name());
+
+                if (attribute_name == "http-equiv" && attribute.value().value() == "content-type")
+                    got_pragma = true;
+                else if (attribute_name == "charset") {
+                    auto maybe_charset = TextCodec::get_standardized_encoding(attribute.value().value());
+                    if (maybe_charset.has_value()) {
+                        charset = Optional<String> { maybe_charset };
+                        need_pragma = { false };
+                    }
+                }
+
+                // FIXME: For attribute name "content", do this:
+                //        Apply the "algorithm for extracting a character encoding from a meta
+                //        element", giving the attribute's value as the string to parse. If a
+                //        character encoding is returned, and if charset is still set to null,
+                //        let charset be the encoding returned, and set need pragma to true.
+            }
+
+            if (!need_pragma.has_value() || (need_pragma.value() && !got_pragma) || !charset.has_value())
+                continue;
+            // "UTF-16BE/LE" in the spec stands for either UTF-16 variant, so both names are checked.
+            if (charset.value() == "UTF-16BE" || charset.value() == "UTF-16LE")
+                return "UTF-8";
+            if (charset.value() == "x-user-defined")
+                return "windows-1252";
+            return charset.value();
+        } else if (!prescan_should_abort(input, position + 3) && input[position] == '<'
+            && ((input[position + 1] == '/' && isalpha(input[position + 2])) || isalpha(input[position + 1]))) {
+            // Start or end tag: skip the tag name, then consume (and ignore) all of its attributes.
+            position += 2;
+            while (!prescan_should_abort(input, position) && !prescan_is_whitespace(input[position]) && input[position] != '>')
+                ++position;
+            while (prescan_get_attribute(input, position).has_value()) { };
+        } else if (!prescan_should_abort(input, position + 1) && input[position] == '<' && (input[position + 1] == '!' || input[position + 1] == '/' || input[position + 1] == '?')) {
+            // Move to the next '>'. The bounds check comes first: `position` may already be past the input.
+            position += 2;
+            while (!prescan_should_abort(input, position) && input[position] != '>')
+                ++position;
+        }
+    }
+    return {};
+}
+
+// https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding
+// Picks the encoding to decode `input` with: a byte order mark wins, then the result of the prescan,
+// then "windows-1252" if the bytes are not valid UTF-8, and "UTF-8" otherwise.
+String run_encoding_sniffing_algorithm(const ByteBuffer& input)
+{
+    if (input.size() >= 2) {
+        if (input[0] == 0xFE && input[1] == 0xFF) {
+            return "UTF-16BE";
+        } else if (input[0] == 0xFF && input[1] == 0xFE) {
+            return "UTF-16LE";
+        } else if (input.size() >= 3 && input[0] == 0xEF && input[1] == 0xBB && input[2] == 0xBF) {
+            return "UTF-8";
+        }
+    }
+
+    // FIXME: If the user has explicitly instructed the user agent to override the document's character
+    //        encoding with a specific encoding.
+    // FIXME: The user agent may wait for more bytes of the resource to be available, either in this step or
+    //        at any later step in this algorithm.
+    // FIXME: If the transport layer specifies a character encoding, and it is supported.
+
+    auto optional_encoding = run_prescan_byte_stream_algorithm(input);
+    if (optional_encoding.has_value()) {
+        return optional_encoding.value();
+    }
+
+    // FIXME: If the HTML parser for which this algorithm is being run is associated with a Document whose browsing context
+    //        is non-null and a child browsing context.
+    // FIXME: If the user agent has information on the likely encoding for this page, e.g. based on the encoding of the page
+    //        when it was last visited.
+
+    if (!Utf8View(StringView(input)).validate()) {
+        // FIXME: As soon as Locale is supported, this should sometimes return a different encoding based on the locale.
+        return "windows-1252";
+    }
+
+    // NOTE: This is the authoritative place to actually decide on using the default encoding as per the HTML specification.
+    //       "Otherwise, return an implementation-defined or user-specified default character encoding, [...]."
+    return "UTF-8";
+}
+
+}
diff --git a/Userland/Libraries/LibWeb/HTML/Parser/HTMLEncodingDetection.h b/Userland/Libraries/LibWeb/HTML/Parser/HTMLEncodingDetection.h
new file mode 100644
index 0000000000..b81d1f365c
--- /dev/null
+++ b/Userland/Libraries/LibWeb/HTML/Parser/HTMLEncodingDetection.h
@@ -0,0 +1,22 @@
+/*
+ * Copyright (c) 2021, Max Wipfli
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#pragma once
+
+#include <AK/Optional.h>
+#include <AK/String.h>
+#include <LibWeb/DOM/Attribute.h>
+
+namespace Web::HTML {
+
+bool prescan_should_abort(const ByteBuffer& input, const size_t& position);
+bool prescan_is_whitespace_or_slash(const u8& byte);
+bool prescan_skip_whitespace_and_slashes(const ByteBuffer& input, size_t& position);
+Optional<Attribute> prescan_get_attribute(const ByteBuffer& input, size_t& position);
+Optional<String> run_prescan_byte_stream_algorithm(const ByteBuffer& input);
+String run_encoding_sniffing_algorithm(const ByteBuffer& input);
+
+}
diff --git a/Userland/Libraries/LibWeb/Loader/FrameLoader.cpp b/Userland/Libraries/LibWeb/Loader/FrameLoader.cpp
index adaba8905e..55a251deee 100644
--- a/Userland/Libraries/LibWeb/Loader/FrameLoader.cpp
+++ b/Userland/Libraries/LibWeb/Loader/FrameLoader.cpp
@@ -113,8 +113,8 @@ bool FrameLoader::parse_document(DOM::Document& document, const ByteBuffer& data
 {
     auto& mime_type = document.content_type();
     if (mime_type == "text/html" || mime_type == "image/svg+xml") {
-        HTML::HTMLDocumentParser parser(document, data, document.encoding_or_default());
-        parser.run(document.url());
+        auto parser = HTML::HTMLDocumentParser::create_with_uncertain_encoding(document, data);
+        parser->run(document.url());
         return true;
     }
     if (mime_type.starts_with("image/"))