LibWeb: Plumb content encoding into the new HTML parser

We still don't handle non-ASCII input correctly, but at least now we'll convert e.g. ISO-8859-1 to UTF-8 before starting to tokenize. This patch also makes "view source" work with the new parser. :^)
2025-07-26 17:37:35 +00:00 · 2020-05-28 12:35:19 +02:00 · 2020-05-28 12:35:19 +02:00 · 5e53c45113
commit 5e53c45113
parent 772b51038e
6 changed files with 18 additions and 9 deletions
--- a/Libraries/LibWeb/Parser/HTMLDocumentParser.cpp
+++ b/Libraries/LibWeb/Parser/HTMLDocumentParser.cpp
@ -24,7 +24,7 @@
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

-#define PARSER_DEBUG
+//#define PARSER_DEBUG

 #include <AK/Utf32View.h>
 #include <LibWeb/DOM/Comment.h>
@ -51,8 +51,8 @@

 namespace Web {

-HTMLDocumentParser::HTMLDocumentParser(const StringView& input)
-    : m_tokenizer(input)
+HTMLDocumentParser::HTMLDocumentParser(const StringView& input, const String& encoding)
+    : m_tokenizer(input, encoding)
 {
 }

@ -64,6 +64,7 @@ void HTMLDocumentParser::run(const URL& url)
 {
    m_document = adopt(*new Document);
    m_document->set_url(url);
+    m_document->set_source(m_tokenizer.source());

    for (;;) {
        auto optional_token = m_tokenizer.next_token();
--- a/Libraries/LibWeb/Parser/HTMLDocumentParser.h
+++ b/Libraries/LibWeb/Parser/HTMLDocumentParser.h
@ -61,7 +61,7 @@ namespace Web {

 class HTMLDocumentParser {
 public:
-    explicit HTMLDocumentParser(const StringView& input);
+    HTMLDocumentParser(const StringView& input, const String& encoding);
    ~HTMLDocumentParser();

    void run(const URL&);
--- a/Libraries/LibWeb/Parser/HTMLTokenizer.cpp
+++ b/Libraries/LibWeb/Parser/HTMLTokenizer.cpp
@ -24,6 +24,7 @@
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

+#include <LibTextCodec/Decoder.h>
 #include <LibWeb/Parser/Entities.h>
 #include <LibWeb/Parser/HTMLToken.h>
 #include <LibWeb/Parser/HTMLTokenizer.h>
@ -1711,9 +1712,12 @@ void HTMLTokenizer::create_new_token(HTMLToken::Type type)
    m_current_token.m_type = type;
 }

-HTMLTokenizer::HTMLTokenizer(const StringView& input)
-    : m_input(input)
+HTMLTokenizer::HTMLTokenizer(const StringView& input, const String& encoding)
 {
+    auto* decoder = TextCodec::decoder_for(encoding);
+    ASSERT(decoder);
+    m_decoded_input = decoder->to_utf8(input);
+    m_input = m_decoded_input;
 }

 void HTMLTokenizer::will_switch_to([[maybe_unused]] State new_state)
--- a/Libraries/LibWeb/Parser/HTMLTokenizer.h
+++ b/Libraries/LibWeb/Parser/HTMLTokenizer.h
@ -118,7 +118,7 @@ namespace Web {

 class HTMLTokenizer {
 public:
-    explicit HTMLTokenizer(const StringView& input);
+    explicit HTMLTokenizer(const StringView& input, const String& encoding);

    enum class State {
 #define __ENUMERATE_TOKENIZER_STATE(state) state,
@ -133,6 +133,8 @@ public:
    void set_blocked(bool b) { m_blocked = b; }
    bool is_blocked() const { return m_blocked; }

+    String source() const { return m_decoded_input; }
+
 private:
    Optional<u32> next_codepoint();
    Optional<u32> peek_codepoint(size_t offset) const;
@ -163,6 +165,8 @@ private:

    Vector<u32> m_temporary_buffer;

+    String m_decoded_input;
+
    StringView m_input;
    size_t m_cursor { 0 };