mirror of
https://github.com/RGBCube/serenity
synced 2025-05-31 10:38:11 +00:00
LibWeb: Add position tracking information to HTML tokens
This commit is contained in:
parent
fd982f6562
commit
aa7939bc6c
4 changed files with 108 additions and 21 deletions
|
@ -57,6 +57,10 @@ String HTMLToken::to_string() const
|
||||||
builder.append("' }");
|
builder.append("' }");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
builder.appendff("@{}:{}-{}:{}",
|
||||||
|
m_start_position.line, m_start_position.column,
|
||||||
|
m_end_position.line, m_end_position.column);
|
||||||
|
|
||||||
return builder.to_string();
|
return builder.to_string();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -164,12 +164,30 @@ public:
|
||||||
|
|
||||||
String to_string() const;
|
String to_string() const;
|
||||||
|
|
||||||
|
// Returns the source position recorded as the start of this token (m_start_position).
// Both fields stay 0 unless something has assigned the member.
const auto& start_position() const { return m_start_position; }
|
||||||
|
// Returns the source position recorded as the end of this token (m_end_position).
// Both fields stay 0 unless something has assigned the member.
const auto& end_position() const { return m_end_position; }
|
||||||
|
|
||||||
|
const auto& attributes() const
|
||||||
|
{
|
||||||
|
VERIFY(is_start_tag() || is_end_tag());
|
||||||
|
return m_tag.attributes;
|
||||||
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
// A line/column pair describing a location in the tokenizer input.
// Both fields default to 0, and the type stays a plain aggregate so it can be
// constructed from two values (line, column).
struct Position {
    size_t line = 0;
    size_t column = 0;
};
|
||||||
|
|
||||||
struct AttributeBuilder {
|
struct AttributeBuilder {
|
||||||
StringBuilder prefix_builder;
|
StringBuilder prefix_builder;
|
||||||
StringBuilder local_name_builder;
|
StringBuilder local_name_builder;
|
||||||
StringBuilder namespace_builder;
|
StringBuilder namespace_builder;
|
||||||
StringBuilder value_builder;
|
StringBuilder value_builder;
|
||||||
|
Position name_start_position;
|
||||||
|
Position value_start_position;
|
||||||
|
Position name_end_position;
|
||||||
|
Position value_end_position;
|
||||||
};
|
};
|
||||||
|
|
||||||
Type m_type { Type::Invalid };
|
Type m_type { Type::Invalid };
|
||||||
|
@ -201,6 +219,9 @@ private:
|
||||||
struct {
|
struct {
|
||||||
StringBuilder data;
|
StringBuilder data;
|
||||||
} m_comment_or_character;
|
} m_comment_or_character;
|
||||||
|
|
||||||
|
Position m_start_position;
|
||||||
|
Position m_end_position;
|
||||||
};
|
};
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -42,13 +42,13 @@ namespace Web::HTML {
|
||||||
goto _StartOfFunction; \
|
goto _StartOfFunction; \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
#define RECONSUME_IN_RETURN_STATE \
|
#define RECONSUME_IN_RETURN_STATE \
|
||||||
do { \
|
do { \
|
||||||
will_reconsume_in(m_return_state); \
|
will_reconsume_in(m_return_state); \
|
||||||
m_state = m_return_state; \
|
m_state = m_return_state; \
|
||||||
if (current_input_character.has_value()) \
|
if (current_input_character.has_value()) \
|
||||||
m_utf8_iterator = m_prev_utf8_iterator; \
|
restore_to(m_prev_utf8_iterator); \
|
||||||
goto _StartOfFunction; \
|
goto _StartOfFunction; \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
#define SWITCH_TO_AND_EMIT_CURRENT_TOKEN(new_state) \
|
#define SWITCH_TO_AND_EMIT_CURRENT_TOKEN(new_state) \
|
||||||
|
@ -81,9 +81,9 @@ namespace Web::HTML {
|
||||||
} \
|
} \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
#define DONT_CONSUME_NEXT_INPUT_CHARACTER \
|
#define DONT_CONSUME_NEXT_INPUT_CHARACTER \
|
||||||
do { \
|
do { \
|
||||||
m_utf8_iterator = m_prev_utf8_iterator; \
|
restore_to(m_prev_utf8_iterator); \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
#define ON(code_point) \
|
#define ON(code_point) \
|
||||||
|
@ -196,12 +196,27 @@ Optional<u32> HTMLTokenizer::next_code_point()
|
||||||
{
|
{
|
||||||
if (m_utf8_iterator == m_utf8_view.end())
|
if (m_utf8_iterator == m_utf8_view.end())
|
||||||
return {};
|
return {};
|
||||||
m_prev_utf8_iterator = m_utf8_iterator;
|
skip(1);
|
||||||
++m_utf8_iterator;
|
|
||||||
dbgln_if(TOKENIZER_TRACE_DEBUG, "(Tokenizer) Next code_point: {}", (char)*m_prev_utf8_iterator);
|
dbgln_if(TOKENIZER_TRACE_DEBUG, "(Tokenizer) Next code_point: {}", (char)*m_prev_utf8_iterator);
|
||||||
return *m_prev_utf8_iterator;
|
return *m_prev_utf8_iterator;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Advances the input iterator by `count` code points and records the resulting
// source position.
//
// - m_prev_utf8_iterator is set to the iterator value from before the skip.
// - Exactly one entry is pushed onto m_source_positions per call, regardless of
//   `count`; it starts as a copy of the current last entry and is advanced once
//   per consumed code point ('\n' resets the column to 0 and bumps the line,
//   anything else bumps the column).
//
// The iterator is dereferenced without an end-of-input check, so callers must
// ensure at least `count` code points remain.
void HTMLTokenizer::skip(size_t count)
{
    m_prev_utf8_iterator = m_utf8_iterator;

    // Work on a local copy and publish it once the whole run has been consumed.
    auto position = m_source_positions.last();
    for (size_t remaining = count; remaining > 0; --remaining) {
        if (*m_utf8_iterator == '\n') {
            position.column = 0;
            ++position.line;
        } else {
            ++position.column;
        }
        ++m_utf8_iterator;
    }
    m_source_positions.append(position);
}
|
||||||
|
|
||||||
Optional<u32> HTMLTokenizer::peek_code_point(size_t offset) const
|
Optional<u32> HTMLTokenizer::peek_code_point(size_t offset) const
|
||||||
{
|
{
|
||||||
auto it = m_utf8_iterator;
|
auto it = m_utf8_iterator;
|
||||||
|
@ -287,35 +302,42 @@ _StartOfFunction:
|
||||||
{
|
{
|
||||||
ON_WHITESPACE
|
ON_WHITESPACE
|
||||||
{
|
{
|
||||||
|
m_current_token.m_end_position = nth_last_position(1);
|
||||||
SWITCH_TO(BeforeAttributeName);
|
SWITCH_TO(BeforeAttributeName);
|
||||||
}
|
}
|
||||||
ON('/')
|
ON('/')
|
||||||
{
|
{
|
||||||
|
m_current_token.m_end_position = nth_last_position(1);
|
||||||
SWITCH_TO(SelfClosingStartTag);
|
SWITCH_TO(SelfClosingStartTag);
|
||||||
}
|
}
|
||||||
ON('>')
|
ON('>')
|
||||||
{
|
{
|
||||||
|
m_current_token.m_end_position = nth_last_position(1);
|
||||||
SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
|
SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
|
||||||
}
|
}
|
||||||
ON_ASCII_UPPER_ALPHA
|
ON_ASCII_UPPER_ALPHA
|
||||||
{
|
{
|
||||||
m_current_token.m_tag.tag_name.append(tolower(current_input_character.value()));
|
m_current_token.m_tag.tag_name.append(tolower(current_input_character.value()));
|
||||||
|
m_current_token.m_end_position = nth_last_position(0);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
ON(0)
|
ON(0)
|
||||||
{
|
{
|
||||||
log_parse_error();
|
log_parse_error();
|
||||||
m_current_token.m_tag.tag_name.append_code_point(0xFFFD);
|
m_current_token.m_tag.tag_name.append_code_point(0xFFFD);
|
||||||
|
m_current_token.m_end_position = nth_last_position(0);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
ON_EOF
|
ON_EOF
|
||||||
{
|
{
|
||||||
log_parse_error();
|
log_parse_error();
|
||||||
|
m_current_token.m_end_position = nth_last_position(1);
|
||||||
EMIT_EOF;
|
EMIT_EOF;
|
||||||
}
|
}
|
||||||
ANYTHING_ELSE
|
ANYTHING_ELSE
|
||||||
{
|
{
|
||||||
m_current_token.m_tag.tag_name.append_code_point(current_input_character.value());
|
m_current_token.m_tag.tag_name.append_code_point(current_input_character.value());
|
||||||
|
m_current_token.m_end_position = nth_last_position(0);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -966,6 +988,8 @@ _StartOfFunction:
|
||||||
}
|
}
|
||||||
ON('/')
|
ON('/')
|
||||||
{
|
{
|
||||||
|
if (!m_current_token.m_tag.attributes.is_empty())
|
||||||
|
m_current_token.m_tag.attributes.last().name_end_position = nth_last_position(1);
|
||||||
RECONSUME_IN(AfterAttributeName);
|
RECONSUME_IN(AfterAttributeName);
|
||||||
}
|
}
|
||||||
ON('>')
|
ON('>')
|
||||||
|
@ -980,13 +1004,16 @@ _StartOfFunction:
|
||||||
{
|
{
|
||||||
log_parse_error();
|
log_parse_error();
|
||||||
auto new_attribute = HTMLToken::AttributeBuilder();
|
auto new_attribute = HTMLToken::AttributeBuilder();
|
||||||
|
new_attribute.name_start_position = nth_last_position(1);
|
||||||
new_attribute.local_name_builder.append_code_point(current_input_character.value());
|
new_attribute.local_name_builder.append_code_point(current_input_character.value());
|
||||||
m_current_token.m_tag.attributes.append(new_attribute);
|
m_current_token.m_tag.attributes.append(new_attribute);
|
||||||
SWITCH_TO(AttributeName);
|
SWITCH_TO(AttributeName);
|
||||||
}
|
}
|
||||||
ANYTHING_ELSE
|
ANYTHING_ELSE
|
||||||
{
|
{
|
||||||
m_current_token.m_tag.attributes.append(HTMLToken::AttributeBuilder());
|
auto new_attribute = HTMLToken::AttributeBuilder();
|
||||||
|
new_attribute.name_start_position = nth_last_position(1);
|
||||||
|
m_current_token.m_tag.attributes.append(move(new_attribute));
|
||||||
RECONSUME_IN(AttributeName);
|
RECONSUME_IN(AttributeName);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1081,6 +1108,7 @@ _StartOfFunction:
|
||||||
}
|
}
|
||||||
ON('=')
|
ON('=')
|
||||||
{
|
{
|
||||||
|
m_current_token.m_tag.attributes.last().name_end_position = nth_last_position(1);
|
||||||
SWITCH_TO(BeforeAttributeValue);
|
SWITCH_TO(BeforeAttributeValue);
|
||||||
}
|
}
|
||||||
ON('>')
|
ON('>')
|
||||||
|
@ -1095,6 +1123,7 @@ _StartOfFunction:
|
||||||
ANYTHING_ELSE
|
ANYTHING_ELSE
|
||||||
{
|
{
|
||||||
m_current_token.m_tag.attributes.append(HTMLToken::AttributeBuilder());
|
m_current_token.m_tag.attributes.append(HTMLToken::AttributeBuilder());
|
||||||
|
m_current_token.m_tag.attributes.last().name_start_position = m_source_positions.last();
|
||||||
RECONSUME_IN(AttributeName);
|
RECONSUME_IN(AttributeName);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1102,6 +1131,7 @@ _StartOfFunction:
|
||||||
|
|
||||||
BEGIN_STATE(BeforeAttributeValue)
|
BEGIN_STATE(BeforeAttributeValue)
|
||||||
{
|
{
|
||||||
|
m_current_token.m_tag.attributes.last().value_start_position = nth_last_position(1);
|
||||||
ON_WHITESPACE
|
ON_WHITESPACE
|
||||||
{
|
{
|
||||||
continue;
|
continue;
|
||||||
|
@ -1190,6 +1220,7 @@ _StartOfFunction:
|
||||||
{
|
{
|
||||||
ON_WHITESPACE
|
ON_WHITESPACE
|
||||||
{
|
{
|
||||||
|
m_current_token.m_tag.attributes.last().value_end_position = nth_last_position(2);
|
||||||
SWITCH_TO(BeforeAttributeName);
|
SWITCH_TO(BeforeAttributeName);
|
||||||
}
|
}
|
||||||
ON('&')
|
ON('&')
|
||||||
|
@ -1199,6 +1230,7 @@ _StartOfFunction:
|
||||||
}
|
}
|
||||||
ON('>')
|
ON('>')
|
||||||
{
|
{
|
||||||
|
m_current_token.m_tag.attributes.last().value_end_position = nth_last_position(2);
|
||||||
SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
|
SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
|
||||||
}
|
}
|
||||||
ON(0)
|
ON(0)
|
||||||
|
@ -1248,6 +1280,7 @@ _StartOfFunction:
|
||||||
|
|
||||||
BEGIN_STATE(AfterAttributeValueQuoted)
|
BEGIN_STATE(AfterAttributeValueQuoted)
|
||||||
{
|
{
|
||||||
|
m_current_token.m_tag.attributes.last().value_end_position = nth_last_position(2);
|
||||||
ON_WHITESPACE
|
ON_WHITESPACE
|
||||||
{
|
{
|
||||||
SWITCH_TO(BeforeAttributeName);
|
SWITCH_TO(BeforeAttributeName);
|
||||||
|
@ -1514,10 +1547,7 @@ _StartOfFunction:
|
||||||
auto match = HTML::code_points_from_entity(m_decoded_input.substring_view(byte_offset, m_decoded_input.length() - byte_offset - 1));
|
auto match = HTML::code_points_from_entity(m_decoded_input.substring_view(byte_offset, m_decoded_input.length() - byte_offset - 1));
|
||||||
|
|
||||||
if (match.has_value()) {
|
if (match.has_value()) {
|
||||||
for (size_t i = 0; i < match.value().entity.length() - 1; ++i) {
|
skip(match->entity.length() - 1);
|
||||||
m_prev_utf8_iterator = m_utf8_iterator;
|
|
||||||
++m_utf8_iterator;
|
|
||||||
}
|
|
||||||
for (auto ch : match.value().entity)
|
for (auto ch : match.value().entity)
|
||||||
m_temporary_buffer.append(ch);
|
m_temporary_buffer.append(ch);
|
||||||
|
|
||||||
|
@ -2571,10 +2601,7 @@ bool HTMLTokenizer::consume_next_if_match(const StringView& string, CaseSensitiv
|
||||||
if (code_point.value() != (u32)string[i])
|
if (code_point.value() != (u32)string[i])
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
for (size_t i = 0; i < string.length(); ++i) {
|
skip(string.length());
|
||||||
m_prev_utf8_iterator = m_utf8_iterator;
|
|
||||||
++m_utf8_iterator;
|
|
||||||
}
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -2582,6 +2609,19 @@ void HTMLTokenizer::create_new_token(HTMLToken::Type type)
|
||||||
{
|
{
|
||||||
m_current_token = {};
|
m_current_token = {};
|
||||||
m_current_token.m_type = type;
|
m_current_token.m_type = type;
|
||||||
|
size_t offset = 0;
|
||||||
|
switch (type) {
|
||||||
|
case HTMLToken::Type::StartTag:
|
||||||
|
offset = 1;
|
||||||
|
break;
|
||||||
|
case HTMLToken::Type::EndTag:
|
||||||
|
offset = 2;
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
m_current_token.m_start_position = nth_last_position(offset);
|
||||||
}
|
}
|
||||||
|
|
||||||
HTMLTokenizer::HTMLTokenizer(const StringView& input, const String& encoding)
|
HTMLTokenizer::HTMLTokenizer(const StringView& input, const String& encoding)
|
||||||
|
@ -2591,6 +2631,7 @@ HTMLTokenizer::HTMLTokenizer(const StringView& input, const String& encoding)
|
||||||
m_decoded_input = decoder->to_utf8(input);
|
m_decoded_input = decoder->to_utf8(input);
|
||||||
m_utf8_view = Utf8View(m_decoded_input);
|
m_utf8_view = Utf8View(m_decoded_input);
|
||||||
m_utf8_iterator = m_utf8_view.begin();
|
m_utf8_iterator = m_utf8_view.begin();
|
||||||
|
m_source_positions.empend(0u, 0u);
|
||||||
}
|
}
|
||||||
|
|
||||||
void HTMLTokenizer::will_switch_to([[maybe_unused]] State new_state)
|
void HTMLTokenizer::will_switch_to([[maybe_unused]] State new_state)
|
||||||
|
@ -2613,6 +2654,7 @@ void HTMLTokenizer::will_emit(HTMLToken& token)
|
||||||
{
|
{
|
||||||
if (token.is_start_tag())
|
if (token.is_start_tag())
|
||||||
m_last_emitted_start_tag = token;
|
m_last_emitted_start_tag = token;
|
||||||
|
token.m_end_position = m_source_positions.last();
|
||||||
}
|
}
|
||||||
|
|
||||||
bool HTMLTokenizer::current_end_tag_token_is_appropriate() const
|
bool HTMLTokenizer::current_end_tag_token_is_appropriate() const
|
||||||
|
@ -2628,4 +2670,18 @@ bool HTMLTokenizer::consumed_as_part_of_an_attribute() const
|
||||||
return m_return_state == State::AttributeValueUnquoted || m_return_state == State::AttributeValueSingleQuoted || m_return_state == State::AttributeValueDoubleQuoted;
|
return m_return_state == State::AttributeValueUnquoted || m_return_state == State::AttributeValueSingleQuoted || m_return_state == State::AttributeValueDoubleQuoted;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void HTMLTokenizer::restore_to(const Utf8CodepointIterator& new_iterator)
|
||||||
|
{
|
||||||
|
if (new_iterator != m_prev_utf8_iterator) {
|
||||||
|
auto diff = m_prev_utf8_iterator - new_iterator;
|
||||||
|
if (diff > 0) {
|
||||||
|
for (ssize_t i = 0; i < diff; ++i)
|
||||||
|
m_source_positions.take_last();
|
||||||
|
} else {
|
||||||
|
// Going forwards...?
|
||||||
|
TODO();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -117,6 +117,7 @@ public:
|
||||||
String source() const { return m_decoded_input; }
|
String source() const { return m_decoded_input; }
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
void skip(size_t count);
|
||||||
Optional<u32> next_code_point();
|
Optional<u32> next_code_point();
|
||||||
Optional<u32> peek_code_point(size_t offset) const;
|
Optional<u32> peek_code_point(size_t offset) const;
|
||||||
bool consume_next_if_match(const StringView&, CaseSensitivity = CaseSensitivity::CaseSensitive);
|
bool consume_next_if_match(const StringView&, CaseSensitivity = CaseSensitivity::CaseSensitive);
|
||||||
|
@ -141,6 +142,9 @@ private:
|
||||||
|
|
||||||
bool consumed_as_part_of_an_attribute() const;
|
bool consumed_as_part_of_an_attribute() const;
|
||||||
|
|
||||||
|
void restore_to(const Utf8CodepointIterator& new_iterator);
|
||||||
|
// Returns a mutable reference to the n-th most recent entry of m_source_positions
// (n = 0 is the last entry). The index is computed as size() - 1 - n in unsigned
// arithmetic, so n must be smaller than the current size; any range checking is
// whatever Vector::at() provides.
auto& nth_last_position(size_t n = 0) { return m_source_positions.at(m_source_positions.size() - 1 - n); }
|
||||||
|
|
||||||
State m_state { State::Data };
|
State m_state { State::Data };
|
||||||
State m_return_state { State::Data };
|
State m_return_state { State::Data };
|
||||||
|
|
||||||
|
@ -165,6 +169,8 @@ private:
|
||||||
u32 m_character_reference_code { 0 };
|
u32 m_character_reference_code { 0 };
|
||||||
|
|
||||||
bool m_blocked { false };
|
bool m_blocked { false };
|
||||||
|
|
||||||
|
Vector<HTMLToken::Position> m_source_positions;
|
||||||
};
|
};
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue