mirror of
https://github.com/RGBCube/serenity
synced 2025-07-25 06:47:35 +00:00
LibWeb: More work on the HTML parser and tokenizer
The parser can now switch the state of the tokenizer! Very webby. :^)
This commit is contained in:
parent
31db3f21ae
commit
20911efd4d
8 changed files with 186 additions and 14 deletions
|
@ -37,6 +37,7 @@ class EventTarget;
|
||||||
class Frame;
|
class Frame;
|
||||||
class HTMLBodyElement;
|
class HTMLBodyElement;
|
||||||
class HTMLCanvasElement;
|
class HTMLCanvasElement;
|
||||||
|
class HTMLDocumentParser;
|
||||||
class HTMLElement;
|
class HTMLElement;
|
||||||
class HTMLFormElement;
|
class HTMLFormElement;
|
||||||
class HTMLHeadElement;
|
class HTMLHeadElement;
|
||||||
|
|
|
@ -34,6 +34,11 @@
|
||||||
#include <LibWeb/Parser/HTMLDocumentParser.h>
|
#include <LibWeb/Parser/HTMLDocumentParser.h>
|
||||||
#include <LibWeb/Parser/HTMLToken.h>
|
#include <LibWeb/Parser/HTMLToken.h>
|
||||||
|
|
||||||
|
#define TODO() \
|
||||||
|
do { \
|
||||||
|
ASSERT_NOT_REACHED(); \
|
||||||
|
} while (0)
|
||||||
|
|
||||||
namespace Web {
|
namespace Web {
|
||||||
|
|
||||||
HTMLDocumentParser::HTMLDocumentParser(const StringView& input)
|
HTMLDocumentParser::HTMLDocumentParser(const StringView& input)
|
||||||
|
@ -176,6 +181,19 @@ void HTMLDocumentParser::handle_before_head(HTMLToken& token)
|
||||||
|
|
||||||
void HTMLDocumentParser::handle_in_head(HTMLToken& token)
|
void HTMLDocumentParser::handle_in_head(HTMLToken& token)
|
||||||
{
|
{
|
||||||
|
if (token.is_parser_whitespace()) {
|
||||||
|
insert_character(token.codepoint());
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (token.is_start_tag() && token.tag_name() == "title") {
|
||||||
|
insert_html_element(token);
|
||||||
|
m_tokenizer.switch_to({}, HTMLTokenizer::State::RCDATA);
|
||||||
|
m_original_insertion_mode = m_insertion_mode;
|
||||||
|
m_insertion_mode = InsertionMode::Text;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
if (token.is_start_tag() && token.tag_name() == "meta") {
|
if (token.is_start_tag() && token.tag_name() == "meta") {
|
||||||
auto element = insert_html_element(token);
|
auto element = insert_html_element(token);
|
||||||
m_stack_of_open_elements.pop();
|
m_stack_of_open_elements.pop();
|
||||||
|
@ -381,8 +399,20 @@ void HTMLDocumentParser::handle_in_body(HTMLToken& token)
|
||||||
ASSERT_NOT_REACHED();
|
ASSERT_NOT_REACHED();
|
||||||
}
|
}
|
||||||
|
|
||||||
// Processes one token while the parser is in the Text insertion mode.
// Character tokens are appended to the document, any end tag other than
// </script> pops the current element and restores the insertion mode that was
// saved in m_original_insertion_mode, and every other token asserts.
void HTMLDocumentParser::handle_text(HTMLToken&)
|
void HTMLDocumentParser::handle_text(HTMLToken& token)
|
||||||
{
|
{
|
||||||
|
if (token.is_character()) {
|
||||||
|
insert_character(token.codepoint());
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
// A </script> end tag is not handled yet; reaching it trips the assertion.
if (token.is_end_tag() && token.tag_name() == "script") {
|
||||||
|
ASSERT_NOT_REACHED();
|
||||||
|
}
|
||||||
|
// Any other end tag closes the element that put the parser into Text mode.
if (token.is_end_tag()) {
|
||||||
|
m_stack_of_open_elements.pop();
|
||||||
|
m_insertion_mode = m_original_insertion_mode;
|
||||||
|
return;
|
||||||
|
}
|
||||||
// Remaining token kinds (e.g. end-of-file) have no handling here yet.
ASSERT_NOT_REACHED();
|
ASSERT_NOT_REACHED();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -100,6 +100,7 @@ private:
|
||||||
void process_using_the_rules_for(InsertionMode, HTMLToken&);
|
void process_using_the_rules_for(InsertionMode, HTMLToken&);
|
||||||
|
|
||||||
InsertionMode m_insertion_mode { InsertionMode::Initial };
|
InsertionMode m_insertion_mode { InsertionMode::Initial };
|
||||||
|
InsertionMode m_original_insertion_mode { InsertionMode::Initial };
|
||||||
|
|
||||||
StackOfOpenElements m_stack_of_open_elements;
|
StackOfOpenElements m_stack_of_open_elements;
|
||||||
|
|
||||||
|
|
|
@ -54,6 +54,8 @@ String HTMLToken::to_string() const
|
||||||
case HTMLToken::Type::EndOfFile:
|
case HTMLToken::Type::EndOfFile:
|
||||||
builder.append("EndOfFile");
|
builder.append("EndOfFile");
|
||||||
break;
|
break;
|
||||||
|
case HTMLToken::Type::Invalid:
|
||||||
|
ASSERT_NOT_REACHED();
|
||||||
}
|
}
|
||||||
|
|
||||||
if (type() == HTMLToken::Type::StartTag || type() == HTMLToken::Type::EndTag) {
|
if (type() == HTMLToken::Type::StartTag || type() == HTMLToken::Type::EndTag) {
|
||||||
|
@ -72,7 +74,7 @@ String HTMLToken::to_string() const
|
||||||
if (type() == HTMLToken::Type::Comment || type() == HTMLToken::Type::Character) {
|
if (type() == HTMLToken::Type::Comment || type() == HTMLToken::Type::Character) {
|
||||||
builder.append(" { data: '");
|
builder.append(" { data: '");
|
||||||
builder.append(m_comment_or_character.data.to_string());
|
builder.append(m_comment_or_character.data.to_string());
|
||||||
builder.append(" }");
|
builder.append("' }");
|
||||||
}
|
}
|
||||||
|
|
||||||
return builder.to_string();
|
return builder.to_string();
|
||||||
|
|
|
@ -39,6 +39,7 @@ class HTMLToken {
|
||||||
|
|
||||||
public:
|
public:
|
||||||
enum class Type {
|
enum class Type {
|
||||||
|
Invalid,
|
||||||
DOCTYPE,
|
DOCTYPE,
|
||||||
StartTag,
|
StartTag,
|
||||||
EndTag,
|
EndTag,
|
||||||
|
@ -101,7 +102,7 @@ private:
|
||||||
StringBuilder value_builder;
|
StringBuilder value_builder;
|
||||||
};
|
};
|
||||||
|
|
||||||
Type m_type;
|
Type m_type { Type::Invalid };
|
||||||
|
|
||||||
// Type::DOCTYPE
|
// Type::DOCTYPE
|
||||||
struct {
|
struct {
|
||||||
|
|
|
@ -52,6 +52,7 @@
|
||||||
#define SWITCH_TO_AND_EMIT_CURRENT_TOKEN(new_state) \
|
#define SWITCH_TO_AND_EMIT_CURRENT_TOKEN(new_state) \
|
||||||
will_switch_to(State::new_state); \
|
will_switch_to(State::new_state); \
|
||||||
m_state = State::new_state; \
|
m_state = State::new_state; \
|
||||||
|
will_emit(m_current_token); \
|
||||||
return m_current_token;
|
return m_current_token;
|
||||||
|
|
||||||
#define DONT_CONSUME_NEXT_INPUT_CHARACTER --m_cursor;
|
#define DONT_CONSUME_NEXT_INPUT_CHARACTER --m_cursor;
|
||||||
|
@ -68,6 +69,9 @@
|
||||||
#define ON_ASCII_UPPER_ALPHA \
|
#define ON_ASCII_UPPER_ALPHA \
|
||||||
if (current_input_character.has_value() && current_input_character.value() >= 'A' && current_input_character.value() <= 'Z')
|
if (current_input_character.has_value() && current_input_character.value() >= 'A' && current_input_character.value() <= 'Z')
|
||||||
|
|
||||||
|
#define ON_ASCII_LOWER_ALPHA \
|
||||||
|
if (current_input_character.has_value() && current_input_character.value() >= 'a' && current_input_character.value() <= 'z')
|
||||||
|
|
||||||
#define ON_WHITESPACE \
|
#define ON_WHITESPACE \
|
||||||
if (current_input_character.has_value() && (current_input_character.value() == '\t' || current_input_character.value() == '\n' || current_input_character.value() == '\f' || current_input_character.value() == ' '))
|
if (current_input_character.has_value() && (current_input_character.value() == '\t' || current_input_character.value() == '\n' || current_input_character.value() == '\f' || current_input_character.value() == ' '))
|
||||||
|
|
||||||
|
@ -78,11 +82,22 @@
|
||||||
return {}; \
|
return {}; \
|
||||||
m_has_emitted_eof = true; \
|
m_has_emitted_eof = true; \
|
||||||
create_new_token(HTMLToken::Type::EndOfFile); \
|
create_new_token(HTMLToken::Type::EndOfFile); \
|
||||||
|
will_emit(m_current_token); \
|
||||||
return m_current_token;
|
return m_current_token;
|
||||||
|
|
||||||
#define EMIT_CURRENT_TOKEN \
|
#define EMIT_CURRENT_TOKEN \
|
||||||
|
will_emit(m_current_token); \
|
||||||
return m_current_token;
|
return m_current_token;
|
||||||
|
|
||||||
|
// Starts a new Character token, appends `codepoint` to its data, calls
// will_emit() on it and returns it from the enclosing function.
// The expansion ends in `return`, so any statement written after
// EMIT_CHARACTER(...) in the same branch is never executed.
#define EMIT_CHARACTER(codepoint) \
|
||||||
|
create_new_token(HTMLToken::Type::Character); \
|
||||||
|
m_current_token.m_comment_or_character.data.append(codepoint); \
|
||||||
|
will_emit(m_current_token); \
|
||||||
|
return m_current_token;
|
||||||
|
|
||||||
|
#define EMIT_CURRENT_CHARACTER \
|
||||||
|
EMIT_CHARACTER(current_input_character.value());
|
||||||
|
|
||||||
#define BEGIN_STATE(state) \
|
#define BEGIN_STATE(state) \
|
||||||
state: \
|
state: \
|
||||||
case State::state: { \
|
case State::state: { \
|
||||||
|
@ -134,9 +149,7 @@ Optional<HTMLToken> HTMLTokenizer::next_token()
|
||||||
}
|
}
|
||||||
ANYTHING_ELSE
|
ANYTHING_ELSE
|
||||||
{
|
{
|
||||||
create_new_token(HTMLToken::Type::Character);
|
EMIT_CURRENT_CHARACTER;
|
||||||
m_current_token.m_comment_or_character.data.append(current_input_character.value());
|
|
||||||
return m_current_token;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
END_STATE
|
END_STATE
|
||||||
|
@ -721,6 +734,99 @@ Optional<HTMLToken> HTMLTokenizer::next_token()
|
||||||
}
|
}
|
||||||
END_STATE
|
END_STATE
|
||||||
|
|
||||||
|
BEGIN_STATE(RCDATA)
|
||||||
|
{
|
||||||
|
ON('&')
|
||||||
|
{
|
||||||
|
m_return_state = State::RCDATA;
|
||||||
|
SWITCH_TO(CharacterReference);
|
||||||
|
}
|
||||||
|
ON('<')
|
||||||
|
{
|
||||||
|
SWITCH_TO(RCDATALessThanSign);
|
||||||
|
}
|
||||||
|
ON(0)
|
||||||
|
{
|
||||||
|
TODO();
|
||||||
|
}
|
||||||
|
ON_EOF
|
||||||
|
{
|
||||||
|
EMIT_EOF;
|
||||||
|
}
|
||||||
|
ANYTHING_ELSE
|
||||||
|
{
|
||||||
|
EMIT_CURRENT_CHARACTER;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
END_STATE
|
||||||
|
|
||||||
|
// Tokenizer state entered after a '<' was seen in RCDATA content.
// A following '/' clears the temporary buffer and moves on to
// RCDATAEndTagOpen; anything else emits the '<' as a character token.
BEGIN_STATE(RCDATALessThanSign)
|
||||||
|
{
|
||||||
|
ON('/')
|
||||||
|
{
|
||||||
|
m_temporary_buffer.clear();
|
||||||
|
SWITCH_TO(RCDATAEndTagOpen);
|
||||||
|
}
|
||||||
|
ANYTHING_ELSE
|
||||||
|
{
|
||||||
|
EMIT_CHARACTER('<');
|
||||||
|
// NOTE(review): EMIT_CHARACTER expands to code ending in
// `return m_current_token;`, so this RECONSUME_IN is unreachable. As written
// the tokenizer appears to stay in RCDATALessThanSign and the current input
// character is not reconsumed in RCDATA - confirm against the dispatch loop
// and RECONSUME_IN's definition, then switch state before emitting.
RECONSUME_IN(RCDATA);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
END_STATE
|
||||||
|
|
||||||
|
BEGIN_STATE(RCDATAEndTagOpen)
|
||||||
|
{
|
||||||
|
ON_ASCII_ALPHA
|
||||||
|
{
|
||||||
|
create_new_token(HTMLToken::Type::EndTag);
|
||||||
|
RECONSUME_IN(RCDATAEndTagName);
|
||||||
|
}
|
||||||
|
ANYTHING_ELSE
|
||||||
|
{
|
||||||
|
// FIXME: Emit a U+003C LESS-THAN SIGN character token and a U+002F SOLIDUS character token. Reconsume in the RCDATA state.
|
||||||
|
TODO();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
END_STATE
|
||||||
|
|
||||||
|
BEGIN_STATE(RCDATAEndTagName)
|
||||||
|
{
|
||||||
|
ON_WHITESPACE
|
||||||
|
{
|
||||||
|
TODO();
|
||||||
|
}
|
||||||
|
ON('/')
|
||||||
|
{
|
||||||
|
TODO();
|
||||||
|
}
|
||||||
|
ON('>')
|
||||||
|
{
|
||||||
|
if (!current_end_tag_token_is_appropriate()) {
|
||||||
|
// FIXME: Otherwise, treat it as per the "anything else" entry below.
|
||||||
|
TODO();
|
||||||
|
}
|
||||||
|
SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
|
||||||
|
}
|
||||||
|
ON_ASCII_UPPER_ALPHA
|
||||||
|
{
|
||||||
|
m_current_token.m_tag.tag_name.append(tolower(current_input_character.value()));
|
||||||
|
m_temporary_buffer.append(current_input_character.value());
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
ON_ASCII_LOWER_ALPHA
|
||||||
|
{
|
||||||
|
m_current_token.m_tag.tag_name.append(current_input_character.value());
|
||||||
|
m_temporary_buffer.append(current_input_character.value());
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
ANYTHING_ELSE
|
||||||
|
{
|
||||||
|
TODO();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
END_STATE
|
||||||
|
|
||||||
default:
|
default:
|
||||||
ASSERT_NOT_REACHED();
|
ASSERT_NOT_REACHED();
|
||||||
}
|
}
|
||||||
|
@ -771,4 +877,26 @@ void HTMLTokenizer::will_reconsume_in([[maybe_unused]] State new_state)
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Overwrites the tokenizer's current state with `new_state` on request of the
// parser (HTMLDocumentParser::handle_in_head uses this to enter RCDATA after a
// <title> start tag). The Badge argument is unnamed and unused in the body.
// When TOKENIZER_TRACE is defined, the old and new state names are logged.
void HTMLTokenizer::switch_to(Badge<HTMLDocumentParser>, State new_state)
|
||||||
|
{
|
||||||
|
#ifdef TOKENIZER_TRACE
|
||||||
|
dbg() << "[" << state_name(m_state) << "] Parser switches tokenizer state to " << state_name(new_state);
|
||||||
|
#endif
|
||||||
|
m_state = new_state;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Hook called by the emit macros right before a token is returned.
// If the token is a start tag, a copy is kept in m_last_emitted_start_tag;
// other token types leave the remembered start tag untouched.
void HTMLTokenizer::will_emit(HTMLToken& token)
|
||||||
|
{
|
||||||
|
if (token.is_start_tag())
|
||||||
|
m_last_emitted_start_tag = token;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Returns true when the end tag currently being built has the same tag name as
// the last start tag that was emitted. Asserts that the current token is an
// end tag. Returns false if no start tag has been recorded yet
// (m_last_emitted_start_tag is not a start tag).
bool HTMLTokenizer::current_end_tag_token_is_appropriate() const
|
||||||
|
{
|
||||||
|
ASSERT(m_current_token.is_end_tag());
|
||||||
|
if (!m_last_emitted_start_tag.is_start_tag())
|
||||||
|
return false;
|
||||||
|
return m_current_token.tag_name() == m_last_emitted_start_tag.tag_name();
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -28,6 +28,7 @@
|
||||||
|
|
||||||
#include <AK/StringView.h>
|
#include <AK/StringView.h>
|
||||||
#include <AK/Types.h>
|
#include <AK/Types.h>
|
||||||
|
#include <LibWeb/Forward.h>
|
||||||
#include <LibWeb/Parser/HTMLToken.h>
|
#include <LibWeb/Parser/HTMLToken.h>
|
||||||
|
|
||||||
#define ENUMERATE_TOKENIZER_STATES \
|
#define ENUMERATE_TOKENIZER_STATES \
|
||||||
|
@ -118,20 +119,23 @@ class HTMLTokenizer {
|
||||||
public:
|
public:
|
||||||
explicit HTMLTokenizer(const StringView& input);
|
explicit HTMLTokenizer(const StringView& input);
|
||||||
|
|
||||||
|
enum class State {
|
||||||
|
#define __ENUMERATE_TOKENIZER_STATE(state) state,
|
||||||
|
ENUMERATE_TOKENIZER_STATES
|
||||||
|
#undef __ENUMERATE_TOKENIZER_STATE
|
||||||
|
};
|
||||||
|
|
||||||
Optional<HTMLToken> next_token();
|
Optional<HTMLToken> next_token();
|
||||||
|
|
||||||
|
void switch_to(Badge<HTMLDocumentParser>, State new_state);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
Optional<u32> next_codepoint();
|
Optional<u32> next_codepoint();
|
||||||
Optional<u32> peek_codepoint(size_t offset) const;
|
Optional<u32> peek_codepoint(size_t offset) const;
|
||||||
bool next_few_characters_are(const StringView&) const;
|
bool next_few_characters_are(const StringView&) const;
|
||||||
void consume(const StringView&);
|
void consume(const StringView&);
|
||||||
void create_new_token(HTMLToken::Type);
|
void create_new_token(HTMLToken::Type);
|
||||||
|
bool current_end_tag_token_is_appropriate() const;
|
||||||
enum class State {
|
|
||||||
#define __ENUMERATE_TOKENIZER_STATE(state) state,
|
|
||||||
ENUMERATE_TOKENIZER_STATES
|
|
||||||
#undef __ENUMERATE_TOKENIZER_STATE
|
|
||||||
};
|
|
||||||
|
|
||||||
static const char* state_name(State state)
|
static const char* state_name(State state)
|
||||||
{
|
{
|
||||||
|
@ -145,17 +149,22 @@ private:
|
||||||
ASSERT_NOT_REACHED();
|
ASSERT_NOT_REACHED();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void will_emit(HTMLToken&);
|
||||||
void will_switch_to(State);
|
void will_switch_to(State);
|
||||||
void will_reconsume_in(State);
|
void will_reconsume_in(State);
|
||||||
|
|
||||||
State m_state { State::Data };
|
State m_state { State::Data };
|
||||||
State m_return_state { State::Data };
|
State m_return_state { State::Data };
|
||||||
|
|
||||||
|
StringBuilder m_temporary_buffer;
|
||||||
|
|
||||||
StringView m_input;
|
StringView m_input;
|
||||||
size_t m_cursor { 0 };
|
size_t m_cursor { 0 };
|
||||||
|
|
||||||
HTMLToken m_current_token;
|
HTMLToken m_current_token;
|
||||||
|
|
||||||
|
HTMLToken m_last_emitted_start_tag;
|
||||||
|
|
||||||
bool m_has_emitted_eof { false };
|
bool m_has_emitted_eof { false };
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
|
@ -38,7 +38,7 @@ int main(int argc, char** argv)
|
||||||
Core::EventLoop loop;
|
Core::EventLoop loop;
|
||||||
|
|
||||||
// This is a temporary test program to aid with bringing up the new HTML parser. :^)
|
// This is a temporary test program to aid with bringing up the new HTML parser. :^)
|
||||||
const char* input_path = "/home/anon/www/simple.html";
|
const char* input_path = "/home/anon/www/welcome.html";
|
||||||
if (argc > 1)
|
if (argc > 1)
|
||||||
input_path = argv[1];
|
input_path = argv[1];
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue