mirror of
https://github.com/RGBCube/serenity
synced 2025-05-31 07:48:11 +00:00
LibWeb: Make the new HTML parser parse input as UTF-8
We already convert the input to UTF-8 before starting the tokenizer, so all this patch had to do was switch the tokenizer to use a Utf8View for its input (and to emit 32-bit codepoints).
This commit is contained in:
parent
23dad305e9
commit
b6288163f1
3 changed files with 75 additions and 49 deletions
|
@ -30,6 +30,7 @@
|
|||
#include <AK/String.h>
|
||||
#include <AK/StringBuilder.h>
|
||||
#include <AK/Types.h>
|
||||
#include <AK/Utf8View.h>
|
||||
#include <AK/Vector.h>
|
||||
|
||||
namespace Web {
|
||||
|
@ -67,9 +68,9 @@ public:
|
|||
u32 codepoint() const
|
||||
{
|
||||
ASSERT(is_character());
|
||||
// FIXME: Handle non-ASCII codepoints properly.
|
||||
ASSERT(m_comment_or_character.data.length() == 1);
|
||||
return m_comment_or_character.data.string_view()[0];
|
||||
Utf8View view(m_comment_or_character.data.string_view());
|
||||
ASSERT(view.length_in_codepoints() == 1);
|
||||
return *view.begin();
|
||||
}
|
||||
|
||||
bool is_parser_whitespace() const
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue