1
Fork 0
mirror of https://github.com/RGBCube/serenity synced 2025-05-31 04:38:11 +00:00

LibWeb: Plumb content encoding into the new HTML parser

We still don't handle non-ASCII input correctly, but at least now we'll
convert e.g. ISO-8859-1 to UTF-8 before starting to tokenize.
This patch also makes "view source" work with the new parser. :^)
This commit is contained in:
Andreas Kling 2020-05-28 12:35:19 +02:00
parent 772b51038e
commit 5e53c45113
6 changed files with 18 additions and 9 deletions

View file

@@ -118,7 +118,7 @@ namespace Web {
class HTMLTokenizer {
public:
explicit HTMLTokenizer(const StringView& input);
explicit HTMLTokenizer(const StringView& input, const String& encoding);
enum class State {
#define __ENUMERATE_TOKENIZER_STATE(state) state,
@@ -133,6 +133,8 @@ public:
void set_blocked(bool b) { m_blocked = b; }
bool is_blocked() const { return m_blocked; }
String source() const { return m_decoded_input; }
private:
Optional<u32> next_codepoint();
Optional<u32> peek_codepoint(size_t offset) const;
@@ -163,6 +165,8 @@ private:
Vector<u32> m_temporary_buffer;
String m_decoded_input;
StringView m_input;
size_t m_cursor { 0 };