mirror of
https://github.com/RGBCube/serenity
synced 2025-07-07 20:17:34 +00:00
LibWeb: Handle iso-8859-1 web content a little bit better
We now look at the HTTP response headers for a Content-Type header and try to parse it if present to find the text encoding. If the text encoding is iso-8859-1, we turn all non-ASCII characters into question marks. This makes Swedish Google load on my machine! :^)
This commit is contained in:
parent
eb6e35a1be
commit
f3676ebef5
3 changed files with 44 additions and 6 deletions
|
@ -343,6 +343,15 @@ static RefPtr<Document> create_image_document(const ByteBuffer& data, const URL&
|
||||||
return document;
|
return document;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Extracts the value of the "charset" parameter from a Content-Type header
// value such as `text/html; charset="ISO-8859-1"; foo=bar`.
//
// Returns the charset name in lowercase, without surrounding double quotes and
// without any parameters that follow it. Returns "utf-8" when the header has
// no charset parameter or the parameter value is empty.
String encoding_from_content_type(const String& content_type)
{
    // Parameter names are case-insensitive, so search a lowercased copy. The
    // result is returned in lowercase anyway, so we can slice this copy directly.
    auto lowercase_content_type = content_type.to_lowercase();

    auto offset = lowercase_content_type.index_of("charset=");
    if (!offset.has_value())
        return "utf-8";

    size_t length = lowercase_content_type.length();
    size_t start = offset.value() + 8; // 8 == strlen("charset=")
    size_t end = start;

    if (start < length && lowercase_content_type[start] == '"') {
        // Quoted value: take everything up to the closing quote (or the end of
        // the header if the quote is never closed).
        ++start;
        end = start;
        while (end < length && lowercase_content_type[end] != '"')
            ++end;
    } else {
        // Unquoted value: stop at the next parameter separator or whitespace so
        // that trailing parameters do not become part of the encoding name.
        while (end < length) {
            char ch = lowercase_content_type[end];
            if (ch == ';' || ch == ' ' || ch == '\t')
                break;
            ++end;
        }
    }

    // "charset=" with nothing after it carries no information; use the default.
    if (end == start)
        return "utf-8";

    return lowercase_content_type.substring(start, end - start);
}
|
||||||
|
|
||||||
void HtmlView::load(const URL& url)
|
void HtmlView::load(const URL& url)
|
||||||
{
|
{
|
||||||
dbg() << "HtmlView::load: " << url.to_string();
|
dbg() << "HtmlView::load: " << url.to_string();
|
||||||
|
@ -370,7 +379,15 @@ void HtmlView::load(const URL& url)
|
||||||
if (url.path().ends_with(".png") || url.path().ends_with(".gif")) {
|
if (url.path().ends_with(".png") || url.path().ends_with(".gif")) {
|
||||||
document = create_image_document(data, url);
|
document = create_image_document(data, url);
|
||||||
} else {
|
} else {
|
||||||
document = parse_html_document(data, url);
|
String encoding = "utf-8";
|
||||||
|
|
||||||
|
auto content_type = response_headers.get("Content-Type");
|
||||||
|
if (content_type.has_value()) {
|
||||||
|
encoding = encoding_from_content_type(content_type.value());
|
||||||
|
dbg() << "I think this content has encoding '" << encoding << "'";
|
||||||
|
}
|
||||||
|
|
||||||
|
document = parse_html_document(data, url, encoding);
|
||||||
}
|
}
|
||||||
ASSERT(document);
|
ASSERT(document);
|
||||||
set_document(document);
|
set_document(document);
|
||||||
|
|
|
@ -383,16 +383,37 @@ static bool parse_html_document(const StringView& html, Document& document, Pare
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
RefPtr<DocumentFragment> parse_html_fragment(Document& document, const StringView& html)
|
// Converts `input`, whose bytes are in the text encoding named by `encoding`,
// into a String that is safe to hand to the HTML parser.
//
// - "utf-8": the input is copied through unchanged.
// - "iso-8859-1": every byte >= 0x80 is replaced with '?'; ASCII bytes are kept.
// - anything else: handled like "iso-8859-1" after logging the unknown name.
//
// The encoding name is derived from a server-supplied Content-Type header, so
// an unrecognized value must not bring the process down; degrading to the
// lossy ASCII-only conversion keeps the page loadable.
// FIXME: Transcode non-ASCII characters properly instead of replacing them.
String to_utf8(const StringView& input, const String& encoding)
{
    if (encoding == "utf-8")
        return String(input);

    if (encoding != "iso-8859-1")
        dbg() << "Unknown encoding " << encoding << ", replacing non-ASCII bytes with '?'";

    StringBuilder builder(input.length());
    for (size_t i = 0; i < input.length(); ++i) {
        u8 ch = input[i];
        // Bytes with the high bit set are not valid standalone UTF-8, so they
        // are substituted rather than copied.
        builder.append(ch >= 0x80 ? '?' : static_cast<char>(ch));
    }
    return builder.to_string();
}
|
||||||
|
|
||||||
|
// Parses `raw_html` (bytes in the given `encoding`) into a new DocumentFragment
// belonging to `document`. Returns the fragment on success, or nullptr if the
// parser reports failure.
RefPtr<DocumentFragment> parse_html_fragment(Document& document, const StringView& raw_html, const String& encoding)
{
    auto fragment = adopt(*new DocumentFragment(document));

    // The parser is fed the converted text, never the raw bytes.
    auto html = to_utf8(raw_html, encoding);
    if (parse_html_document(html, document, *fragment))
        return fragment;

    return nullptr;
}
|
||||||
|
|
||||||
RefPtr<Document> parse_html_document(const StringView& html, const URL& url)
|
RefPtr<Document> parse_html_document(const StringView& raw_html, const URL& url, const String& encoding)
|
||||||
{
|
{
|
||||||
|
String html = to_utf8(raw_html, encoding);
|
||||||
|
|
||||||
auto document = adopt(*new Document(url));
|
auto document = adopt(*new Document(url));
|
||||||
document->set_source(html);
|
document->set_source(html);
|
||||||
|
|
||||||
|
|
|
@ -33,7 +33,7 @@ namespace Web {
|
||||||
|
|
||||||
class DocumentFragment;
|
class DocumentFragment;
|
||||||
|
|
||||||
RefPtr<Document> parse_html_document(const StringView&, const URL& = URL());
|
RefPtr<Document> parse_html_document(const StringView&, const URL& = URL(), const String& encoding = "utf-8");
|
||||||
RefPtr<DocumentFragment> parse_html_fragment(Document&, const StringView&);
|
RefPtr<DocumentFragment> parse_html_fragment(Document&, const StringView&, const String& encoding = "utf-8");
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue