diff --git a/Libraries/LibWeb/Parser/HTMLToken.h b/Libraries/LibWeb/Parser/HTMLToken.h
index 17979eedc9..0727321f8d 100644
--- a/Libraries/LibWeb/Parser/HTMLToken.h
+++ b/Libraries/LibWeb/Parser/HTMLToken.h
@@ -30,6 +30,7 @@
 #include
 #include
 #include
+#include <AK/Utf8View.h>
 #include

 namespace Web {
@@ -67,9 +68,9 @@ public:
     u32 codepoint() const
     {
         ASSERT(is_character());
-        // FIXME: Handle non-ASCII codepoints properly.
-        ASSERT(m_comment_or_character.data.length() == 1);
-        return m_comment_or_character.data.string_view()[0];
+        Utf8View view(m_comment_or_character.data.string_view());
+        ASSERT(view.length_in_codepoints() == 1);
+        return *view.begin();
     }

     bool is_parser_whitespace() const
diff --git a/Libraries/LibWeb/Parser/HTMLTokenizer.cpp b/Libraries/LibWeb/Parser/HTMLTokenizer.cpp
index 1f73adaad9..f1934e10d3 100644
--- a/Libraries/LibWeb/Parser/HTMLTokenizer.cpp
+++ b/Libraries/LibWeb/Parser/HTMLTokenizer.cpp
@@ -35,12 +35,12 @@
 //#define TOKENIZER_TRACE

 #ifdef TOKENIZER_TRACE
-#define PARSE_ERROR() \
-    do { \
-        dbg() << "Parse error (tokenization)" << __PRETTY_FUNCTION__ << " @ " << __LINE__; \
-    } while (0)
+#    define PARSE_ERROR() \
+        do { \
+            dbg() << "Parse error (tokenization)" << __PRETTY_FUNCTION__ << " @ " << __LINE__; \
+        } while (0)
 #else
-#define PARSE_ERROR()
+#    define PARSE_ERROR()
 #endif

 #define CONSUME_NEXT_INPUT_CHARACTER \
@@ -92,20 +92,23 @@
         goto new_state; \
     } while (0)

-#define FLUSH_CODEPOINTS_CONSUMED_AS_A_CHARACTER_REFERENCE \
-    do { \
-        for (auto codepoint : m_temporary_buffer) { \
-            if (consumed_as_part_of_an_attribute()) { \
-                m_current_token.m_tag.attributes.last().value_builder.append(codepoint); \
-            } else { \
-                create_new_token(HTMLToken::Type::Character); \
-                m_current_token.m_comment_or_character.data.append(codepoint); \
-                m_queued_tokens.enqueue(m_current_token); \
-            } \
-        } \
+#define FLUSH_CODEPOINTS_CONSUMED_AS_A_CHARACTER_REFERENCE \
+    do { \
+        for (auto codepoint : m_temporary_buffer) { \
+            if (consumed_as_part_of_an_attribute()) { \
+                m_current_token.m_tag.attributes.last().value_builder.append_codepoint(codepoint); \
+            } else { \
+                create_new_token(HTMLToken::Type::Character); \
+                m_current_token.m_comment_or_character.data.append_codepoint(codepoint); \
+                m_queued_tokens.enqueue(m_current_token); \
+            } \
+        } \
     } while (0)

-#define DONT_CONSUME_NEXT_INPUT_CHARACTER --m_cursor;
+#define DONT_CONSUME_NEXT_INPUT_CHARACTER \
+    do { \
+        m_utf8_iterator = m_prev_utf8_iterator; \
+    } while (0)

 #define ON(codepoint) \
     if (current_input_character.has_value() && current_input_character.value() == codepoint)
@@ -157,7 +160,7 @@
 #define EMIT_CHARACTER(codepoint) \
     do { \
         create_new_token(HTMLToken::Type::Character); \
-        m_current_token.m_comment_or_character.data.append(codepoint); \
+        m_current_token.m_comment_or_character.data.append_codepoint(codepoint); \
         m_queued_tokens.enqueue(m_current_token); \
         return m_queued_tokens.dequeue(); \
     } while (0)
@@ -202,9 +205,11 @@ namespace Web {

 Optional<u32> HTMLTokenizer::next_codepoint()
 {
-    if (m_cursor >= m_input.length())
+    if (m_utf8_iterator == m_utf8_view.end())
         return {};
-    return m_input[m_cursor++];
+    m_prev_utf8_iterator = m_utf8_iterator;
+    ++m_utf8_iterator;
+    return *m_prev_utf8_iterator;
 }

 Optional<u32> HTMLTokenizer::peek_codepoint(size_t offset) const
@@ -306,7 +311,7 @@ _StartOfFunction:
             ON(0)
             {
                 PARSE_ERROR();
-                m_current_token.m_tag.tag_name.append("\uFFFD");
+                m_current_token.m_tag.tag_name.append_codepoint(0xFFFD);
                 continue;
             }
             ON_EOF
@@ -316,7 +321,7 @@ _StartOfFunction:
             }
             ANYTHING_ELSE
             {
-                m_current_token.m_tag.tag_name.append(current_input_character.value());
+                m_current_token.m_tag.tag_name.append_codepoint(current_input_character.value());
                 continue;
             }
         }
@@ -359,6 +364,15 @@ _StartOfFunction:
             if (consume_next_if_match("DOCTYPE", CaseSensitivity::CaseInsensitive)) {
                 SWITCH_TO(DOCTYPE);
             }
+            if (consume_next_if_match("[CDATA[")) {
+                TODO();
+            }
+            ANYTHING_ELSE
+            {
+                PARSE_ERROR();
+                create_new_token(HTMLToken::Type::Comment);
+                SWITCH_TO(BogusComment);
+            }
         }
         END_STATE

@@ -366,19 +380,23 @@ _StartOfFunction:
         {
             ON('>')
             {
-                TODO();
+                SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
             }
             ON_EOF
             {
-                TODO();
+                m_queued_tokens.enqueue(m_current_token);
+                EMIT_EOF;
             }
             ON(0)
             {
-                TODO();
+                PARSE_ERROR();
+                m_current_token.m_comment_or_character.data.append_codepoint(0xFFFD);
+                continue;
             }
             ANYTHING_ELSE
             {
-                TODO();
+                m_current_token.m_comment_or_character.data.append_codepoint(current_input_character.value());
+                continue;
             }
         }
         END_STATE
@@ -431,7 +449,7 @@ _StartOfFunction:
             ANYTHING_ELSE
             {
                 create_new_token(HTMLToken::Type::DOCTYPE);
-                m_current_token.m_doctype.name.append(current_input_character.value());
+                m_current_token.m_doctype.name.append_codepoint(current_input_character.value());
                 SWITCH_TO(DOCTYPEName);
             }
         }
@@ -461,7 +479,7 @@ _StartOfFunction:
             }
             ANYTHING_ELSE
             {
-                m_current_token.m_doctype.name.append(current_input_character.value());
+                m_current_token.m_doctype.name.append_codepoint(current_input_character.value());
                 continue;
             }
         }
@@ -634,7 +652,7 @@ _StartOfFunction:
             }
             ANYTHING_ELSE
             {
-                m_current_token.m_doctype.public_identifier.append(current_input_character.value());
+                m_current_token.m_doctype.public_identifier.append_codepoint(current_input_character.value());
                 continue;
             }
         }
@@ -660,7 +678,7 @@ _StartOfFunction:
             }
             ANYTHING_ELSE
             {
-                m_current_token.m_doctype.public_identifier.append(current_input_character.value());
+                m_current_token.m_doctype.public_identifier.append_codepoint(current_input_character.value());
                 continue;
             }
         }
@@ -686,7 +704,7 @@ _StartOfFunction:
             }
             ANYTHING_ELSE
             {
-                m_current_token.m_doctype.system_identifier.append(current_input_character.value());
+                m_current_token.m_doctype.system_identifier.append_codepoint(current_input_character.value());
                 continue;
             }
         }
@@ -712,7 +730,7 @@ _StartOfFunction:
             }
             ANYTHING_ELSE
            {
-                m_current_token.m_doctype.system_identifier.append(current_input_character.value());
+                m_current_token.m_doctype.system_identifier.append_codepoint(current_input_character.value());
                 continue;
             }
         }
@@ -873,7 +891,7 @@ _StartOfFunction:
             }
             ANYTHING_ELSE
             {
-                m_current_token.m_tag.attributes.last().name_builder.append(current_input_character.value());
+                m_current_token.m_tag.attributes.last().name_builder.append_codepoint(current_input_character.value());
                 continue;
             }
         }
@@ -956,7 +974,7 @@ _StartOfFunction:
             }
             ANYTHING_ELSE
             {
-                m_current_token.m_tag.attributes.last().value_builder.append(current_input_character.value());
+                m_current_token.m_tag.attributes.last().value_builder.append_codepoint(current_input_character.value());
                 continue;
             }
         }
@@ -983,7 +1001,7 @@ _StartOfFunction:
             }
             ANYTHING_ELSE
             {
-                m_current_token.m_tag.attributes.last().value_builder.append(current_input_character.value());
+                m_current_token.m_tag.attributes.last().value_builder.append_codepoint(current_input_character.value());
                 continue;
             }
         }
@@ -1014,7 +1032,7 @@ _StartOfFunction:
             }
             ANYTHING_ELSE
             {
-                m_current_token.m_tag.attributes.last().value_builder.append(current_input_character.value());
+                m_current_token.m_tag.attributes.last().value_builder.append_codepoint(current_input_character.value());
                 continue;
             }
         }
@@ -1090,7 +1108,7 @@ _StartOfFunction:
         {
             ON('<')
             {
-                m_current_token.m_comment_or_character.data.append(current_input_character.value());
+                m_current_token.m_comment_or_character.data.append_codepoint(current_input_character.value());
                 SWITCH_TO(CommentLessThanSign);
             }
             ON('-')
@@ -1107,7 +1125,7 @@ _StartOfFunction:
             }
             ANYTHING_ELSE
             {
-                m_current_token.m_comment_or_character.data.append(current_input_character.value());
+                m_current_token.m_comment_or_character.data.append_codepoint(current_input_character.value());
                 continue;
             }
         }
@@ -1185,12 +1203,12 @@ _StartOfFunction:
         {
             ON('!')
             {
-                m_current_token.m_comment_or_character.data.append(current_input_character.value());
+                m_current_token.m_comment_or_character.data.append_codepoint(current_input_character.value());
                 SWITCH_TO(CommentLessThanSignBang);
             }
             ON('<')
             {
-                m_current_token.m_comment_or_character.data.append(current_input_character.value());
+                m_current_token.m_comment_or_character.data.append_codepoint(current_input_character.value());
                 continue;
             }
             ANYTHING_ELSE
@@ -1299,7 +1317,7 @@ _StartOfFunction:
             ON_ASCII_ALPHANUMERIC
             {
                 if (consumed_as_part_of_an_attribute()) {
-                    m_current_token.m_tag.attributes.last().value_builder.append(current_input_character.value());
+                    m_current_token.m_tag.attributes.last().value_builder.append_codepoint(current_input_character.value());
                     continue;
                 } else {
                     EMIT_CURRENT_CHARACTER;
                 }
             }
@@ -1495,7 +1513,7 @@ _StartOfFunction:
             ON(0)
             {
                 PARSE_ERROR();
-                EMIT_CHARACTER("\uFFFD");
+                EMIT_CHARACTER(0xFFFD);
             }
             ON_EOF
             {
@@ -1583,7 +1601,7 @@ _StartOfFunction:
             }
             ON_ASCII_LOWER_ALPHA
             {
-                m_current_token.m_tag.tag_name.append(current_input_character.value());
+                m_current_token.m_tag.tag_name.append_codepoint(current_input_character.value());
                 m_temporary_buffer.append(current_input_character.value());
                 continue;
             }
@@ -1607,7 +1625,7 @@ _StartOfFunction:
             ON(0)
             {
                 PARSE_ERROR();
-                EMIT_CHARACTER("\uFFFD");
+                EMIT_CHARACTER(0xFFFD);
             }
             ON_EOF
             {
@@ -1718,7 +1736,7 @@ _StartOfFunction:
             ON(0)
             {
                 PARSE_ERROR();
-                EMIT_CHARACTER("\uFFFD");
+                EMIT_CHARACTER(0xFFFD);
             }
             ON_EOF
             {
@@ -1736,7 +1754,7 @@ _StartOfFunction:
             ON(0)
             {
                 PARSE_ERROR();
-                EMIT_CHARACTER("\uFFFD");
+                EMIT_CHARACTER(0xFFFD);
             }
             ON_EOF
             {
@@ -2076,6 +2094,8 @@ HTMLTokenizer::HTMLTokenizer(const StringView& input, const String& encoding)
     ASSERT(decoder);
     m_decoded_input = decoder->to_utf8(input);
     m_input = m_decoded_input;
+    m_utf8_view = Utf8View(m_decoded_input);
+    m_utf8_iterator = m_utf8_view.begin();
 }

 void HTMLTokenizer::will_switch_to([[maybe_unused]] State new_state)
diff --git a/Libraries/LibWeb/Parser/HTMLTokenizer.h b/Libraries/LibWeb/Parser/HTMLTokenizer.h
index e21bbfdcf2..a5550e0fcd 100644
--- a/Libraries/LibWeb/Parser/HTMLTokenizer.h
+++ b/Libraries/LibWeb/Parser/HTMLTokenizer.h
@@ -29,6 +29,7 @@
 #include
 #include
 #include
+#include <AK/Utf8View.h>
 #include
 #include

@@ -170,6 +171,10 @@ private:
     StringView m_input;
     size_t m_cursor { 0 };

+    Utf8View m_utf8_view;
+    AK::Utf8CodepointIterator m_utf8_iterator;
+    AK::Utf8CodepointIterator m_prev_utf8_iterator;
+
     HTMLToken m_current_token;

     HTMLToken m_last_emitted_start_tag;
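Note (not part of the patch): the core of this change is the iteration scheme in next_codepoint() and DONT_CONSUME_NEXT_INPUT_CHARACTER, which walks the decoded input one UTF-8 codepoint at a time and keeps the previous iterator so exactly one codepoint can be "un-consumed" (something the old --m_cursor could not do for multi-byte sequences). The sketch below is purely illustrative: the class name CodepointStream is invented for this example, and Utf8View / AK::Utf8CodepointIterator are assumed to behave as the calls in the diff suggest (begin(), end(), pre-increment, dereference, default construction, assignment).

// Illustrative sketch only -- not part of the patch above.
#include <AK/Optional.h>
#include <AK/StringView.h>
#include <AK/Types.h>
#include <AK/Utf8View.h>

class CodepointStream {
public:
    explicit CodepointStream(const StringView& input)
        : m_view(input)
    {
        // Mirror the patch: default-construct the iterators, then assign from begin().
        m_iterator = m_view.begin();
        m_prev_iterator = m_iterator;
    }

    // Same shape as HTMLTokenizer::next_codepoint(): hand out one codepoint
    // at a time, or an empty Optional at end of input.
    Optional<u32> next()
    {
        if (m_iterator == m_view.end())
            return {};
        m_prev_iterator = m_iterator; // remember where this codepoint started
        ++m_iterator;                 // advance by one whole codepoint, not one byte
        return *m_prev_iterator;
    }

    // Same idea as DONT_CONSUME_NEXT_INPUT_CHARACTER: step back exactly one codepoint.
    void unconsume() { m_iterator = m_prev_iterator; }

private:
    Utf8View m_view;
    AK::Utf8CodepointIterator m_iterator;
    AK::Utf8CodepointIterator m_prev_iterator;
};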