1
Fork 0
mirror of https://github.com/RGBCube/serenity synced 2025-05-31 07:48:11 +00:00

LibWeb: Make the new HTML parser parse input as UTF-8

We already convert the input to UTF-8 before starting the tokenizer,
so all this patch had to do was switch the tokenizer to use a Utf8View
for its input (and to emit 32-bit codepoints).
This commit is contained in:
Andreas Kling 2020-06-04 21:06:54 +02:00
parent 23dad305e9
commit b6288163f1
3 changed files with 75 additions and 49 deletions

View file

@@ -30,6 +30,7 @@
#include <AK/String.h>
#include <AK/StringBuilder.h>
#include <AK/Types.h>
#include <AK/Utf8View.h>
#include <AK/Vector.h>
namespace Web {
@@ -67,9 +68,9 @@ public:
// Returns the codepoint held by a character token; asserts is_character() first.
// NOTE(review): this listing looks like a diff view whose +/- markers were
// stripped, so two alternative bodies appear back to back below. Read literally,
// everything after the first `return` is unreachable — confirm against the commit.
u32 codepoint() const
{
ASSERT(is_character());
// Presumably the pre-patch (removed) lines: assert that the stored data has
// length 1 and return element 0 of its string view.
// FIXME: Handle non-ASCII codepoints properly.
ASSERT(m_comment_or_character.data.length() == 1);
return m_comment_or_character.data.string_view()[0];
// Presumably the post-patch (added) lines: wrap the same data in a Utf8View,
// assert that it contains exactly one codepoint, and return the first one.
Utf8View view(m_comment_or_character.data.string_view());
ASSERT(view.length_in_codepoints() == 1);
return *view.begin();
}
bool is_parser_whitespace() const