LibWeb: Make the new HTML parser parse input as UTF-8

We already convert the input to UTF-8 before starting the tokenizer, so all this patch had to do was switch the tokenizer to use a Utf8View for its input (and to emit 32-bit codepoints).
2025-10-31 17:02:45 +00:00 · 2020-06-04 21:06:54 +02:00 · 2020-06-04 21:06:54 +02:00 · b6288163f1
commit b6288163f1
parent 23dad305e9
3 changed files with 75 additions and 49 deletions
--- a/Libraries/LibWeb/Parser/HTMLToken.h
+++ b/Libraries/LibWeb/Parser/HTMLToken.h
@ -30,6 +30,7 @@
 #include <AK/String.h>
 #include <AK/StringBuilder.h>
 #include <AK/Types.h>
+#include <AK/Utf8View.h>
 #include <AK/Vector.h>

 namespace Web {
@ -67,9 +68,9 @@ public:
    u32 codepoint() const
    {
        ASSERT(is_character());
-        // FIXME: Handle non-ASCII codepoints properly.
-        ASSERT(m_comment_or_character.data.length() == 1);
-        return m_comment_or_character.data.string_view()[0];
+        Utf8View view(m_comment_or_character.data.string_view());
+        ASSERT(view.length_in_codepoints() == 1);
+        return *view.begin();
    }

    bool is_parser_whitespace() const