Mirror of https://github.com/RGBCube/serenity, synced 2025-07-25 22:27:35 +00:00
LibWeb: Add extracting character encoding from a meta content attribute
Some Gmail emails contain this.
Commit e9eae9d880 (parent b919789db2)
2 changed files with 75 additions and 9 deletions
First changed file (implementation):

@@ -4,6 +4,8 @@
  * SPDX-License-Identifier: BSD-2-Clause
  */
 
+#include <AK/CharacterTypes.h>
+#include <AK/GenericLexer.h>
 #include <AK/StringView.h>
 #include <AK/Utf8View.h>
 #include <LibTextCodec/Decoder.h>
@@ -29,6 +31,69 @@ bool prescan_skip_whitespace_and_slashes(const ByteBuffer& input, size_t& position)
     return !prescan_should_abort(input, position);
 }
 
+// https://html.spec.whatwg.org/multipage/urls-and-fetching.html#algorithm-for-extracting-a-character-encoding-from-a-meta-element
+Optional<String> extract_character_encoding_from_meta_element(String const& string)
+{
+    // Checking for "charset" is case insensitive, as is getting an encoding.
+    // Therefore, stick to lowercase from the start for simplicity.
+    auto lowercase_string = string.to_lowercase();
+    GenericLexer lexer(lowercase_string);
+
+    for (;;) {
+        auto charset_index = lexer.remaining().find("charset");
+        if (!charset_index.has_value())
+            return {};
+
+        // 7 is the length of "charset".
+        lexer.ignore(charset_index.value() + 7);
+
+        lexer.ignore_while([](char c) {
+            // FIXME: Not the exact same ASCII whitespace. The spec does not include vertical tab (\v).
+            return is_ascii_space(c);
+        });
+
+        if (lexer.peek() != '=')
+            continue;
+
+        break;
+    }
+
+    // Ignore the '='.
+    lexer.ignore();
+
+    lexer.ignore_while([](char c) {
+        // FIXME: Not the exact same ASCII whitespace. The spec does not include vertical tab (\v).
+        return is_ascii_space(c);
+    });
+
+    if (lexer.is_eof())
+        return {};
+
+    if (lexer.consume_specific('"')) {
+        auto matching_double_quote = lexer.remaining().find("\"");
+        if (!matching_double_quote.has_value())
+            return {};
+
+        auto encoding = lexer.remaining().substring_view(0, matching_double_quote.value());
+        return TextCodec::get_standardized_encoding(encoding);
+    }
+
+    if (lexer.consume_specific('\'')) {
+        auto matching_single_quote = lexer.remaining().find("'");
+        if (!matching_single_quote.has_value())
+            return {};
+
+        auto encoding = lexer.remaining().substring_view(0, matching_single_quote.value());
+        return TextCodec::get_standardized_encoding(encoding);
+    }
+
+    auto encoding = lexer.consume_until([](char c) {
+        // FIXME: Not the exact same ASCII whitespace. The spec does not include vertical tab (\v).
+        return is_ascii_space(c) || c == ';';
+    });
+    return TextCodec::get_standardized_encoding(encoding);
+}
+
 Optional<Attribute> prescan_get_attribute(const ByteBuffer& input, size_t& position)
 {
     if (!prescan_skip_whitespace_and_slashes(input, position))
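For illustration only, not part of the commit: a minimal sketch of how the new helper could be exercised on typical meta "content" values. The main() harness and the literal inputs are made up; only the function declaration is taken from this change.

// Sketch only, not from the commit: exercising the new helper on hypothetical
// <meta content="..."> values. The declaration mirrors the one added to the
// header below; the harness itself is invented.
#include <AK/Optional.h>
#include <AK/String.h>

namespace Web::HTML {
Optional<String> extract_character_encoding_from_meta_element(String const&);
}

int main()
{
    using Web::HTML::extract_character_encoding_from_meta_element;

    // Unquoted value: the bytes after "charset=" up to whitespace or ';' are taken.
    auto utf8 = extract_character_encoding_from_meta_element("text/html; charset=utf-8");

    // Quoted value: the bytes between the matching quotes are taken.
    auto latin1 = extract_character_encoding_from_meta_element("text/html; charset=\"iso-8859-1\"");

    // No "charset" anywhere: an empty Optional comes back.
    auto none = extract_character_encoding_from_meta_element("text/html");

    return (utf8.has_value() && latin1.has_value() && !none.has_value()) ? 0 : 1;
}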
@@ -137,21 +202,21 @@ Optional<String> run_prescan_byte_stream_algorithm(const ByteBuffer& input)
         auto& attribute_name = attribute.value().name();
         attribute_list.append(attribute.value().name());
 
-        if (attribute_name == "http-equiv" && attribute.value().value() == "content-type")
-            got_pragma = true;
-        else if (attribute_name == "charset") {
+        if (attribute_name == "http-equiv") {
+            got_pragma = attribute.value().value() == "content-type";
+        } else if (attribute_name == "content") {
+            auto encoding = extract_character_encoding_from_meta_element(attribute.value().value());
+            if (encoding.has_value() && !charset.has_value()) {
+                charset = encoding.value();
+                need_pragma = true;
+            }
+        } else if (attribute_name == "charset") {
             auto maybe_charset = TextCodec::get_standardized_encoding(attribute.value().value());
             if (maybe_charset.has_value()) {
                 charset = Optional<String> { maybe_charset };
                 need_pragma = { false };
             }
         }
-
-        // FIXME: For attribute name "content", do this:
-        // Apply the "algorithm for extracting a character encoding from a meta
-        // element", giving the attribute's value as the string to parse. If a
-        // character encoding is returned, and if charset is still set to null,
-        // let charset be the encoding returned, and set need pragma to true.
     }
 
     if (!need_pragma.has_value() || (need_pragma.value() && !got_pragma) || !charset.has_value())
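As a side note, a standalone sketch (plain C++17, not SerenityOS code) of the gating performed by the final check above: a charset extracted from the "content" attribute sets need_pragma to true, so it only takes effect when an http-equiv="content-type" pragma was also seen, whereas the charset attribute clears the pragma requirement.

// Standalone sketch, not from the commit: the accept/reject decision made by
// the final check in run_prescan_byte_stream_algorithm, reproduced with
// std::optional so it compiles on its own.
#include <cstdio>
#include <optional>

static bool charset_accepted(bool got_pragma, std::optional<bool> need_pragma, bool have_charset)
{
    // Same condition as in the diff above:
    // if (!need_pragma.has_value() || (need_pragma.value() && !got_pragma) || !charset.has_value())
    if (!need_pragma.has_value() || (need_pragma.value() && !got_pragma) || !have_charset)
        return false;
    return true;
}

int main()
{
    // Charset found via content="...; charset=utf-8" (need_pragma = true):
    std::printf("%d\n", charset_accepted(true, true, true));   // 1: pragma present, accepted
    std::printf("%d\n", charset_accepted(false, true, true));  // 0: no pragma, rejected
    // Charset found via charset="utf-8" (need_pragma = false): no pragma required.
    std::printf("%d\n", charset_accepted(false, false, true)); // 1: accepted
    return 0;
}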
Second changed file (header):

@@ -15,6 +15,7 @@ namespace Web::HTML {
 bool prescan_should_abort(const ByteBuffer& input, const size_t& position);
 bool prescan_is_whitespace_or_slash(const u8& byte);
 bool prescan_skip_whitespace_and_slashes(const ByteBuffer& input, size_t& position);
+Optional<String> extract_character_encoding_from_meta_element(String const&);
 Optional<Attribute> prescan_get_attribute(const ByteBuffer& input, size_t& position);
 Optional<String> run_prescan_byte_stream_algorithm(const ByteBuffer& input);
 String run_encoding_sniffing_algorithm(const ByteBuffer& input);