diff --git a/Tests/LibWeb/CMakeLists.txt b/Tests/LibWeb/CMakeLists.txt index 0bba6cf7f4..b62aa719c1 100644 --- a/Tests/LibWeb/CMakeLists.txt +++ b/Tests/LibWeb/CMakeLists.txt @@ -2,6 +2,7 @@ set(TEST_SOURCES TestCSSIDSpeed.cpp TestCSSPixels.cpp TestHTMLTokenizer.cpp + TestMimeSniff.cpp TestNumbers.cpp ) diff --git a/Tests/LibWeb/TestMimeSniff.cpp b/Tests/LibWeb/TestMimeSniff.cpp new file mode 100644 index 0000000000..e28cb44c17 --- /dev/null +++ b/Tests/LibWeb/TestMimeSniff.cpp @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2023, Kemal Zebari . + * + * SPDX-License-Identifier: BSD-2-Clause + */ + +#include + +#include + +TEST_CASE(determine_computed_mime_type_given_no_sniff_is_set) +{ + auto mime_type = MUST(Web::MimeSniff::MimeType::create("text"_string, "html"_string)); + auto computed_mime_type = MUST(Web::MimeSniff::Resource::sniff("\x00"sv.bytes(), Web::MimeSniff::SniffingConfiguration { .supplied_type = mime_type, .no_sniff = true })); + + EXPECT_EQ("text/html"sv, MUST(computed_mime_type.serialized())); +} + +TEST_CASE(determine_computed_mime_type_given_no_sniff_is_unset) +{ + auto supplied_type = MUST(Web::MimeSniff::MimeType::create("text"_string, "html"_string)); + auto computed_mime_type = MUST(Web::MimeSniff::Resource::sniff("\x00"sv.bytes(), Web::MimeSniff::SniffingConfiguration { .supplied_type = supplied_type })); + + EXPECT_EQ("application/octet-stream"sv, MUST(computed_mime_type.serialized())); + + // Make sure we cover the XML code path in the mime type sniffing algorithm. + auto xml_mime_type = "application/rss+xml"sv; + supplied_type = MUST(Web::MimeSniff::MimeType::parse(xml_mime_type)).release_value(); + computed_mime_type = MUST(Web::MimeSniff::Resource::sniff("\x00"sv.bytes(), Web::MimeSniff::SniffingConfiguration { .supplied_type = supplied_type })); + + EXPECT_EQ(xml_mime_type, MUST(computed_mime_type.serialized())); +} diff --git a/Userland/Libraries/LibWeb/CMakeLists.txt b/Userland/Libraries/LibWeb/CMakeLists.txt index 977303f206..89ee2f2a1b 100644 --- a/Userland/Libraries/LibWeb/CMakeLists.txt +++ b/Userland/Libraries/LibWeb/CMakeLists.txt @@ -472,6 +472,7 @@ set(SOURCES MathML/MathMLElement.cpp MathML/TagNames.cpp MimeSniff/MimeType.cpp + MimeSniff/Resource.cpp Namespace.cpp NavigationTiming/EntryNames.cpp NavigationTiming/PerformanceTiming.cpp diff --git a/Userland/Libraries/LibWeb/MimeSniff/Resource.cpp b/Userland/Libraries/LibWeb/MimeSniff/Resource.cpp new file mode 100644 index 0000000000..4f331abe8c --- /dev/null +++ b/Userland/Libraries/LibWeb/MimeSniff/Resource.cpp @@ -0,0 +1,172 @@ +/* + * Copyright (c) 2023, Kemal Zebari . + * + * SPDX-License-Identifier: BSD-2-Clause + */ + +#include +#include + +namespace Web::MimeSniff { + +ErrorOr Resource::create(ReadonlyBytes data, SniffingConfiguration configuration) +{ + // NOTE: Non-standard but for cases where pattern matching fails, let's fall back to the safest MIME type. + auto default_computed_mime_type = TRY(MimeType::create("application"_string, "octet-stream"_string)); + auto resource = Resource { data, configuration.no_sniff, move(default_computed_mime_type) }; + + TRY(resource.supplied_mime_type_detection_algorithm(configuration.scheme, move(configuration.supplied_type))); + TRY(resource.mime_type_sniffing_algorithm()); + + return resource; +} + +ErrorOr Resource::sniff(ReadonlyBytes data, SniffingConfiguration configuration) +{ + auto resource = TRY(create(data, move(configuration))); + return move(resource.m_computed_mime_type); +} + +Resource::Resource(ReadonlyBytes data, bool no_sniff, MimeType&& default_computed_mime_type) + : m_no_sniff(no_sniff) + , m_computed_mime_type(move(default_computed_mime_type)) +{ + read_the_resource_header(data); +} + +Resource::~Resource() = default; + +// https://mimesniff.spec.whatwg.org/#supplied-mime-type-detection-algorithm +// NOTE: Parameters are non-standard. +ErrorOr Resource::supplied_mime_type_detection_algorithm(StringView scheme, Optional supplied_type) +{ + // 1. Let supplied-type be null. + // 2. If the resource is retrieved via HTTP, execute the following steps: + // 1. If one or more Content-Type headers are associated with the resource, execute the following steps: + // 1. Set supplied-type to the value of the last Content-Type header associated with the resource. + // 2. Set the check-for-apache-bug flag if supplied-type is exactly equal to one of the values in the following table: + // NOTE: Non-standard but this algorithm expects the caller to handle step 2.1.1. + if (supplied_type.has_value()) { + if (Fetch::Infrastructure::is_http_or_https_scheme(scheme)) { + static Array constexpr apache_bug_mime_types = { + "text/plain"sv, + "text/plain; charset=ISO-8859-1"sv, + "text/plain; charset=iso-8859-1"sv, + "text/plain; charset=UTF-8"sv + }; + + auto serialized_supplied_type = TRY(supplied_type->serialized()); + for (auto apache_bug_mime_type : apache_bug_mime_types) { + if (serialized_supplied_type == apache_bug_mime_type) { + m_check_for_apache_bug_flag = true; + break; + } + } + } + } + + // 3. If the resource is retrieved directly from the file system, set supplied-type + // to the MIME type provided by the file system. + // 4. If the resource is retrieved via another protocol (such as FTP), set + // supplied-type to the MIME type as determined by that protocol, if any. + // 5. If supplied-type is not a MIME type, the supplied MIME type is undefined. + // Abort these steps. + // 6. The supplied MIME type is supplied-type. + // NOTE: The expectation is for the caller to handle these spec steps. + m_supplied_mime_type = supplied_type; + + return {}; +} + +// https://mimesniff.spec.whatwg.org/#read-the-resource-header +void Resource::read_the_resource_header(ReadonlyBytes data) +{ + // 1. Let buffer be a byte sequence. + ByteBuffer buffer; + + // 2. Read bytes of the resource into buffer until one of the following conditions is met: + // - the end of the resource is reached. + // - the number of bytes in buffer is greater than or equal to 1445. + // - a reasonable amount of time has elapsed, as determined by the user agent. + // FIXME: The spec expects us to be reading from a stream. Reimplement this spec step once + // we have greater support for streaming in areas that calls on this API. + static size_t constexpr MAX_SNIFF_SIZE = 1445; + buffer.append(data.slice(0, min(data.size(), MAX_SNIFF_SIZE))); + + // 3. The resource header is buffer. + m_resource_header = move(buffer); +} + +// https://mimesniff.spec.whatwg.org/#mime-type-sniffing-algorithm +ErrorOr Resource::mime_type_sniffing_algorithm() +{ + // 1. If the supplied MIME type is undefined or if the supplied MIME type’s essence + // is "unknown/unknown", "application/unknown", or "*/*", execute the rules for + // identifying an unknown MIME type with the sniff-scriptable flag equal to the + // inverse of the no-sniff flag and abort these steps. + if (!m_supplied_mime_type.has_value() || m_supplied_mime_type->essence().is_one_of("unknown/unknown", "application/unknown", "*/*")) { + + // FIXME: Execute the rules for identifying an unknown MIME type with the + // sniff-scriptable flag equal to the inverse of the no-sniff flag and abort + // these steps. + return {}; + } + + // 2. If the no-sniff flag is set, the computed MIME type is the supplied MIME type. + // Abort these steps. + if (m_no_sniff) { + m_computed_mime_type = m_supplied_mime_type.value(); + return {}; + } + + // 3. If the check-for-apache-bug flag is set, execute the rules for distinguishing + // if a resource is text or binary and abort these steps. + if (m_check_for_apache_bug_flag) { + // FIXME: Execute the rules for distinguishing if a resource is text or binary and abort these steps. + return {}; + } + + // 4. If the supplied MIME type is an XML MIME type, the computed MIME type is the supplied MIME type. + // Abort these steps. + if (m_supplied_mime_type->is_xml()) { + m_computed_mime_type = m_supplied_mime_type.value(); + return {}; + } + + // 5. If the supplied MIME type’s essence is "text/html", execute the rules for distinguishing if a + // resource is a feed or HTML and abort these steps. + if (m_supplied_mime_type->essence() == "text/html") { + // FIXME: Execute the rules for distinguishing if a resource is a feed or HTML and abort these steps. + return {}; + } + + // FIXME: 6. If the supplied MIME type is an image MIME type supported by the user agent, let matched-type be + // the result of executing the image type pattern matching algorithm with the resource header as + // the byte sequence to be matched. + Optional matched_type; + + // 7. If matched-type is not undefined, the computed MIME type is matched-type. + // Abort these steps. + if (matched_type.has_value()) { + m_computed_mime_type = matched_type.release_value(); + return {}; + } + + // FIXME: 8. If the supplied MIME type is an audio or video MIME type supported by the user agent, let matched-type be + // the result of executing the audio or video type pattern matching algorithm with the resource header as + // the byte sequence to be matched. + + // 9. If matched-type is not undefined, the computed MIME type is matched-type. + // Abort these steps. + if (matched_type.has_value()) { + m_computed_mime_type = matched_type.release_value(); + return {}; + } + + // 10. The computed MIME type is the supplied MIME type. + m_computed_mime_type = m_supplied_mime_type.value(); + + return {}; +} + +} diff --git a/Userland/Libraries/LibWeb/MimeSniff/Resource.h b/Userland/Libraries/LibWeb/MimeSniff/Resource.h new file mode 100644 index 0000000000..d1b90192b2 --- /dev/null +++ b/Userland/Libraries/LibWeb/MimeSniff/Resource.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2023, Kemal Zebari . + * + * SPDX-License-Identifier: BSD-2-Clause + */ + +#pragma once + +#include + +namespace Web::MimeSniff { + +struct SniffingConfiguration { + StringView scheme { ""sv }; + Optional supplied_type = {}; + bool no_sniff { false }; +}; + +// https://mimesniff.spec.whatwg.org/#resource +class Resource { +public: + static ErrorOr create(ReadonlyBytes data, SniffingConfiguration configuration = {}); + static ErrorOr sniff(ReadonlyBytes data, SniffingConfiguration configuration = {}); + + ~Resource(); + + MimeType const& computed_mime_type() const { return m_computed_mime_type; } + ReadonlyBytes resource_header() const { return m_resource_header; } + +private: + Resource(ReadonlyBytes data, bool no_sniff, MimeType&& default_computed_mime_type); + + void read_the_resource_header(ReadonlyBytes data); + ErrorOr supplied_mime_type_detection_algorithm(StringView scheme, Optional supplied_type); + ErrorOr mime_type_sniffing_algorithm(); + + // https://mimesniff.spec.whatwg.org/#supplied-mime-type + // A supplied MIME type, the MIME type determined by the supplied MIME type detection algorithm. + Optional m_supplied_mime_type; + + // https://mimesniff.spec.whatwg.org/#check-for-apache-bug-flag + // A check-for-apache-bug flag, which defaults to unset. + bool m_check_for_apache_bug_flag { false }; + + // https://mimesniff.spec.whatwg.org/#no-sniff-flag + // A no-sniff flag, which defaults to set if the user agent does not wish to perform sniffing on the resource and unset otherwise. + bool m_no_sniff { false }; + + // https://mimesniff.spec.whatwg.org/#computed-mime-type + // A computed MIME type, the MIME type determined by the MIME type sniffing algorithm. + MimeType m_computed_mime_type; + + // https://mimesniff.spec.whatwg.org/#resource-header + // A resource header is the byte sequence at the beginning of a resource, as determined by reading the resource header. + ByteBuffer m_resource_header; +}; + +}