mirror of
				https://github.com/RGBCube/serenity
				synced 2025-10-31 18:42:43 +00:00 
			
		
		
		
	LibWeb: Add position tracking information to HTML tokens
This commit is contained in:
		
							parent
							
								
									fd982f6562
								
							
						
					
					
						commit
						aa7939bc6c
					
				
					 4 changed files with 108 additions and 21 deletions
				
			
		|  | @ -57,6 +57,10 @@ String HTMLToken::to_string() const | ||||||
|         builder.append("' }"); |         builder.append("' }"); | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
|  |     builder.appendff("@{}:{}-{}:{}", | ||||||
|  |         m_start_position.line, m_start_position.column, | ||||||
|  |         m_end_position.line, m_end_position.column); | ||||||
|  | 
 | ||||||
|     return builder.to_string(); |     return builder.to_string(); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -164,12 +164,30 @@ public: | ||||||
| 
 | 
 | ||||||
|     String to_string() const; |     String to_string() const; | ||||||
| 
 | 
 | ||||||
|  |     const auto& start_position() const { return m_start_position; } | ||||||
|  |     const auto& end_position() const { return m_end_position; } | ||||||
|  | 
 | ||||||
|  |     const auto& attributes() const | ||||||
|  |     { | ||||||
|  |         VERIFY(is_start_tag() || is_end_tag()); | ||||||
|  |         return m_tag.attributes; | ||||||
|  |     } | ||||||
|  | 
 | ||||||
| private: | private: | ||||||
|  |     struct Position { | ||||||
|  |         size_t line { 0 }; | ||||||
|  |         size_t column { 0 }; | ||||||
|  |     }; | ||||||
|  | 
 | ||||||
|     struct AttributeBuilder { |     struct AttributeBuilder { | ||||||
|         StringBuilder prefix_builder; |         StringBuilder prefix_builder; | ||||||
|         StringBuilder local_name_builder; |         StringBuilder local_name_builder; | ||||||
|         StringBuilder namespace_builder; |         StringBuilder namespace_builder; | ||||||
|         StringBuilder value_builder; |         StringBuilder value_builder; | ||||||
|  |         Position name_start_position; | ||||||
|  |         Position value_start_position; | ||||||
|  |         Position name_end_position; | ||||||
|  |         Position value_end_position; | ||||||
|     }; |     }; | ||||||
| 
 | 
 | ||||||
|     Type m_type { Type::Invalid }; |     Type m_type { Type::Invalid }; | ||||||
|  | @ -201,6 +219,9 @@ private: | ||||||
|     struct { |     struct { | ||||||
|         StringBuilder data; |         StringBuilder data; | ||||||
|     } m_comment_or_character; |     } m_comment_or_character; | ||||||
|  | 
 | ||||||
|  |     Position m_start_position; | ||||||
|  |     Position m_end_position; | ||||||
| }; | }; | ||||||
| 
 | 
 | ||||||
| } | } | ||||||
|  |  | ||||||
|  | @ -42,13 +42,13 @@ namespace Web::HTML { | ||||||
|         goto _StartOfFunction;          \ |         goto _StartOfFunction;          \ | ||||||
|     } while (0) |     } while (0) | ||||||
| 
 | 
 | ||||||
| #define RECONSUME_IN_RETURN_STATE                   \ | #define RECONSUME_IN_RETURN_STATE                \ | ||||||
|     do {                                            \ |     do {                                         \ | ||||||
|         will_reconsume_in(m_return_state);          \ |         will_reconsume_in(m_return_state);       \ | ||||||
|         m_state = m_return_state;                   \ |         m_state = m_return_state;                \ | ||||||
|         if (current_input_character.has_value())    \ |         if (current_input_character.has_value()) \ | ||||||
|             m_utf8_iterator = m_prev_utf8_iterator; \ |             restore_to(m_prev_utf8_iterator);    \ | ||||||
|         goto _StartOfFunction;                      \ |         goto _StartOfFunction;                   \ | ||||||
|     } while (0) |     } while (0) | ||||||
| 
 | 
 | ||||||
| #define SWITCH_TO_AND_EMIT_CURRENT_TOKEN(new_state) \ | #define SWITCH_TO_AND_EMIT_CURRENT_TOKEN(new_state) \ | ||||||
|  | @ -81,9 +81,9 @@ namespace Web::HTML { | ||||||
|         }                                                                                            \ |         }                                                                                            \ | ||||||
|     } while (0) |     } while (0) | ||||||
| 
 | 
 | ||||||
| #define DONT_CONSUME_NEXT_INPUT_CHARACTER       \ | #define DONT_CONSUME_NEXT_INPUT_CHARACTER \ | ||||||
|     do {                                        \ |     do {                                  \ | ||||||
|         m_utf8_iterator = m_prev_utf8_iterator; \ |         restore_to(m_prev_utf8_iterator); \ | ||||||
|     } while (0) |     } while (0) | ||||||
| 
 | 
 | ||||||
| #define ON(code_point) \ | #define ON(code_point) \ | ||||||
|  | @ -196,12 +196,27 @@ Optional<u32> HTMLTokenizer::next_code_point() | ||||||
| { | { | ||||||
|     if (m_utf8_iterator == m_utf8_view.end()) |     if (m_utf8_iterator == m_utf8_view.end()) | ||||||
|         return {}; |         return {}; | ||||||
|     m_prev_utf8_iterator = m_utf8_iterator; |     skip(1); | ||||||
|     ++m_utf8_iterator; |  | ||||||
|     dbgln_if(TOKENIZER_TRACE_DEBUG, "(Tokenizer) Next code_point: {}", (char)*m_prev_utf8_iterator); |     dbgln_if(TOKENIZER_TRACE_DEBUG, "(Tokenizer) Next code_point: {}", (char)*m_prev_utf8_iterator); | ||||||
|     return *m_prev_utf8_iterator; |     return *m_prev_utf8_iterator; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | void HTMLTokenizer::skip(size_t count) | ||||||
|  | { | ||||||
|  |     m_prev_utf8_iterator = m_utf8_iterator; | ||||||
|  |     m_source_positions.append(m_source_positions.last()); | ||||||
|  |     for (size_t i = 0; i < count; ++i) { | ||||||
|  |         auto code_point = *m_utf8_iterator; | ||||||
|  |         if (code_point == '\n') { | ||||||
|  |             m_source_positions.last().column = 0; | ||||||
|  |             m_source_positions.last().line++; | ||||||
|  |         } else { | ||||||
|  |             m_source_positions.last().column++; | ||||||
|  |         } | ||||||
|  |         ++m_utf8_iterator; | ||||||
|  |     } | ||||||
|  | } | ||||||
|  | 
 | ||||||
| Optional<u32> HTMLTokenizer::peek_code_point(size_t offset) const | Optional<u32> HTMLTokenizer::peek_code_point(size_t offset) const | ||||||
| { | { | ||||||
|     auto it = m_utf8_iterator; |     auto it = m_utf8_iterator; | ||||||
|  | @ -287,35 +302,42 @@ _StartOfFunction: | ||||||
|             { |             { | ||||||
|                 ON_WHITESPACE |                 ON_WHITESPACE | ||||||
|                 { |                 { | ||||||
|  |                     m_current_token.m_end_position = nth_last_position(1); | ||||||
|                     SWITCH_TO(BeforeAttributeName); |                     SWITCH_TO(BeforeAttributeName); | ||||||
|                 } |                 } | ||||||
|                 ON('/') |                 ON('/') | ||||||
|                 { |                 { | ||||||
|  |                     m_current_token.m_end_position = nth_last_position(1); | ||||||
|                     SWITCH_TO(SelfClosingStartTag); |                     SWITCH_TO(SelfClosingStartTag); | ||||||
|                 } |                 } | ||||||
|                 ON('>') |                 ON('>') | ||||||
|                 { |                 { | ||||||
|  |                     m_current_token.m_end_position = nth_last_position(1); | ||||||
|                     SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data); |                     SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data); | ||||||
|                 } |                 } | ||||||
|                 ON_ASCII_UPPER_ALPHA |                 ON_ASCII_UPPER_ALPHA | ||||||
|                 { |                 { | ||||||
|                     m_current_token.m_tag.tag_name.append(tolower(current_input_character.value())); |                     m_current_token.m_tag.tag_name.append(tolower(current_input_character.value())); | ||||||
|  |                     m_current_token.m_end_position = nth_last_position(0); | ||||||
|                     continue; |                     continue; | ||||||
|                 } |                 } | ||||||
|                 ON(0) |                 ON(0) | ||||||
|                 { |                 { | ||||||
|                     log_parse_error(); |                     log_parse_error(); | ||||||
|                     m_current_token.m_tag.tag_name.append_code_point(0xFFFD); |                     m_current_token.m_tag.tag_name.append_code_point(0xFFFD); | ||||||
|  |                     m_current_token.m_end_position = nth_last_position(0); | ||||||
|                     continue; |                     continue; | ||||||
|                 } |                 } | ||||||
|                 ON_EOF |                 ON_EOF | ||||||
|                 { |                 { | ||||||
|                     log_parse_error(); |                     log_parse_error(); | ||||||
|  |                     m_current_token.m_end_position = nth_last_position(1); | ||||||
|                     EMIT_EOF; |                     EMIT_EOF; | ||||||
|                 } |                 } | ||||||
|                 ANYTHING_ELSE |                 ANYTHING_ELSE | ||||||
|                 { |                 { | ||||||
|                     m_current_token.m_tag.tag_name.append_code_point(current_input_character.value()); |                     m_current_token.m_tag.tag_name.append_code_point(current_input_character.value()); | ||||||
|  |                     m_current_token.m_end_position = nth_last_position(0); | ||||||
|                     continue; |                     continue; | ||||||
|                 } |                 } | ||||||
|             } |             } | ||||||
|  | @ -966,6 +988,8 @@ _StartOfFunction: | ||||||
|                 } |                 } | ||||||
|                 ON('/') |                 ON('/') | ||||||
|                 { |                 { | ||||||
|  |                     if (!m_current_token.m_tag.attributes.is_empty()) | ||||||
|  |                         m_current_token.m_tag.attributes.last().name_end_position = nth_last_position(1); | ||||||
|                     RECONSUME_IN(AfterAttributeName); |                     RECONSUME_IN(AfterAttributeName); | ||||||
|                 } |                 } | ||||||
|                 ON('>') |                 ON('>') | ||||||
|  | @ -980,13 +1004,16 @@ _StartOfFunction: | ||||||
|                 { |                 { | ||||||
|                     log_parse_error(); |                     log_parse_error(); | ||||||
|                     auto new_attribute = HTMLToken::AttributeBuilder(); |                     auto new_attribute = HTMLToken::AttributeBuilder(); | ||||||
|  |                     new_attribute.name_start_position = nth_last_position(1); | ||||||
|                     new_attribute.local_name_builder.append_code_point(current_input_character.value()); |                     new_attribute.local_name_builder.append_code_point(current_input_character.value()); | ||||||
|                     m_current_token.m_tag.attributes.append(new_attribute); |                     m_current_token.m_tag.attributes.append(new_attribute); | ||||||
|                     SWITCH_TO(AttributeName); |                     SWITCH_TO(AttributeName); | ||||||
|                 } |                 } | ||||||
|                 ANYTHING_ELSE |                 ANYTHING_ELSE | ||||||
|                 { |                 { | ||||||
|                     m_current_token.m_tag.attributes.append(HTMLToken::AttributeBuilder()); |                     auto new_attribute = HTMLToken::AttributeBuilder(); | ||||||
|  |                     new_attribute.name_start_position = nth_last_position(1); | ||||||
|  |                     m_current_token.m_tag.attributes.append(move(new_attribute)); | ||||||
|                     RECONSUME_IN(AttributeName); |                     RECONSUME_IN(AttributeName); | ||||||
|                 } |                 } | ||||||
|             } |             } | ||||||
|  | @ -1081,6 +1108,7 @@ _StartOfFunction: | ||||||
|                 } |                 } | ||||||
|                 ON('=') |                 ON('=') | ||||||
|                 { |                 { | ||||||
|  |                     m_current_token.m_tag.attributes.last().name_end_position = nth_last_position(1); | ||||||
|                     SWITCH_TO(BeforeAttributeValue); |                     SWITCH_TO(BeforeAttributeValue); | ||||||
|                 } |                 } | ||||||
|                 ON('>') |                 ON('>') | ||||||
|  | @ -1095,6 +1123,7 @@ _StartOfFunction: | ||||||
|                 ANYTHING_ELSE |                 ANYTHING_ELSE | ||||||
|                 { |                 { | ||||||
|                     m_current_token.m_tag.attributes.append(HTMLToken::AttributeBuilder()); |                     m_current_token.m_tag.attributes.append(HTMLToken::AttributeBuilder()); | ||||||
|  |                     m_current_token.m_tag.attributes.last().name_start_position = m_source_positions.last(); | ||||||
|                     RECONSUME_IN(AttributeName); |                     RECONSUME_IN(AttributeName); | ||||||
|                 } |                 } | ||||||
|             } |             } | ||||||
|  | @ -1102,6 +1131,7 @@ _StartOfFunction: | ||||||
| 
 | 
 | ||||||
|             BEGIN_STATE(BeforeAttributeValue) |             BEGIN_STATE(BeforeAttributeValue) | ||||||
|             { |             { | ||||||
|  |                 m_current_token.m_tag.attributes.last().value_start_position = nth_last_position(1); | ||||||
|                 ON_WHITESPACE |                 ON_WHITESPACE | ||||||
|                 { |                 { | ||||||
|                     continue; |                     continue; | ||||||
|  | @ -1190,6 +1220,7 @@ _StartOfFunction: | ||||||
|             { |             { | ||||||
|                 ON_WHITESPACE |                 ON_WHITESPACE | ||||||
|                 { |                 { | ||||||
|  |                     m_current_token.m_tag.attributes.last().value_end_position = nth_last_position(2); | ||||||
|                     SWITCH_TO(BeforeAttributeName); |                     SWITCH_TO(BeforeAttributeName); | ||||||
|                 } |                 } | ||||||
|                 ON('&') |                 ON('&') | ||||||
|  | @ -1199,6 +1230,7 @@ _StartOfFunction: | ||||||
|                 } |                 } | ||||||
|                 ON('>') |                 ON('>') | ||||||
|                 { |                 { | ||||||
|  |                     m_current_token.m_tag.attributes.last().value_end_position = nth_last_position(2); | ||||||
|                     SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data); |                     SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data); | ||||||
|                 } |                 } | ||||||
|                 ON(0) |                 ON(0) | ||||||
|  | @ -1248,6 +1280,7 @@ _StartOfFunction: | ||||||
| 
 | 
 | ||||||
|             BEGIN_STATE(AfterAttributeValueQuoted) |             BEGIN_STATE(AfterAttributeValueQuoted) | ||||||
|             { |             { | ||||||
|  |                 m_current_token.m_tag.attributes.last().value_end_position = nth_last_position(2); | ||||||
|                 ON_WHITESPACE |                 ON_WHITESPACE | ||||||
|                 { |                 { | ||||||
|                     SWITCH_TO(BeforeAttributeName); |                     SWITCH_TO(BeforeAttributeName); | ||||||
|  | @ -1514,10 +1547,7 @@ _StartOfFunction: | ||||||
|                 auto match = HTML::code_points_from_entity(m_decoded_input.substring_view(byte_offset, m_decoded_input.length() - byte_offset - 1)); |                 auto match = HTML::code_points_from_entity(m_decoded_input.substring_view(byte_offset, m_decoded_input.length() - byte_offset - 1)); | ||||||
| 
 | 
 | ||||||
|                 if (match.has_value()) { |                 if (match.has_value()) { | ||||||
|                     for (size_t i = 0; i < match.value().entity.length() - 1; ++i) { |                     skip(match->entity.length() - 1); | ||||||
|                         m_prev_utf8_iterator = m_utf8_iterator; |  | ||||||
|                         ++m_utf8_iterator; |  | ||||||
|                     } |  | ||||||
|                     for (auto ch : match.value().entity) |                     for (auto ch : match.value().entity) | ||||||
|                         m_temporary_buffer.append(ch); |                         m_temporary_buffer.append(ch); | ||||||
| 
 | 
 | ||||||
|  | @ -2571,10 +2601,7 @@ bool HTMLTokenizer::consume_next_if_match(const StringView& string, CaseSensitiv | ||||||
|         if (code_point.value() != (u32)string[i]) |         if (code_point.value() != (u32)string[i]) | ||||||
|             return false; |             return false; | ||||||
|     } |     } | ||||||
|     for (size_t i = 0; i < string.length(); ++i) { |     skip(string.length()); | ||||||
|         m_prev_utf8_iterator = m_utf8_iterator; |  | ||||||
|         ++m_utf8_iterator; |  | ||||||
|     } |  | ||||||
|     return true; |     return true; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | @ -2582,6 +2609,19 @@ void HTMLTokenizer::create_new_token(HTMLToken::Type type) | ||||||
| { | { | ||||||
|     m_current_token = {}; |     m_current_token = {}; | ||||||
|     m_current_token.m_type = type; |     m_current_token.m_type = type; | ||||||
|  |     size_t offset = 0; | ||||||
|  |     switch (type) { | ||||||
|  |     case HTMLToken::Type::StartTag: | ||||||
|  |         offset = 1; | ||||||
|  |         break; | ||||||
|  |     case HTMLToken::Type::EndTag: | ||||||
|  |         offset = 2; | ||||||
|  |         break; | ||||||
|  |     default: | ||||||
|  |         break; | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     m_current_token.m_start_position = nth_last_position(offset); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| HTMLTokenizer::HTMLTokenizer(const StringView& input, const String& encoding) | HTMLTokenizer::HTMLTokenizer(const StringView& input, const String& encoding) | ||||||
|  | @ -2591,6 +2631,7 @@ HTMLTokenizer::HTMLTokenizer(const StringView& input, const String& encoding) | ||||||
|     m_decoded_input = decoder->to_utf8(input); |     m_decoded_input = decoder->to_utf8(input); | ||||||
|     m_utf8_view = Utf8View(m_decoded_input); |     m_utf8_view = Utf8View(m_decoded_input); | ||||||
|     m_utf8_iterator = m_utf8_view.begin(); |     m_utf8_iterator = m_utf8_view.begin(); | ||||||
|  |     m_source_positions.empend(0u, 0u); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| void HTMLTokenizer::will_switch_to([[maybe_unused]] State new_state) | void HTMLTokenizer::will_switch_to([[maybe_unused]] State new_state) | ||||||
|  | @ -2613,6 +2654,7 @@ void HTMLTokenizer::will_emit(HTMLToken& token) | ||||||
| { | { | ||||||
|     if (token.is_start_tag()) |     if (token.is_start_tag()) | ||||||
|         m_last_emitted_start_tag = token; |         m_last_emitted_start_tag = token; | ||||||
|  |     token.m_end_position = m_source_positions.last(); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| bool HTMLTokenizer::current_end_tag_token_is_appropriate() const | bool HTMLTokenizer::current_end_tag_token_is_appropriate() const | ||||||
|  | @ -2628,4 +2670,18 @@ bool HTMLTokenizer::consumed_as_part_of_an_attribute() const | ||||||
|     return m_return_state == State::AttributeValueUnquoted || m_return_state == State::AttributeValueSingleQuoted || m_return_state == State::AttributeValueDoubleQuoted; |     return m_return_state == State::AttributeValueUnquoted || m_return_state == State::AttributeValueSingleQuoted || m_return_state == State::AttributeValueDoubleQuoted; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | void HTMLTokenizer::restore_to(const Utf8CodepointIterator& new_iterator) | ||||||
|  | { | ||||||
|  |     if (new_iterator != m_prev_utf8_iterator) { | ||||||
|  |         auto diff = m_prev_utf8_iterator - new_iterator; | ||||||
|  |         if (diff > 0) { | ||||||
|  |             for (ssize_t i = 0; i < diff; ++i) | ||||||
|  |                 m_source_positions.take_last(); | ||||||
|  |         } else { | ||||||
|  |             // Going forwards...?
 | ||||||
|  |             TODO(); | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  | } | ||||||
|  | 
 | ||||||
| } | } | ||||||
|  |  | ||||||
|  | @ -117,6 +117,7 @@ public: | ||||||
|     String source() const { return m_decoded_input; } |     String source() const { return m_decoded_input; } | ||||||
| 
 | 
 | ||||||
| private: | private: | ||||||
|  |     void skip(size_t count); | ||||||
|     Optional<u32> next_code_point(); |     Optional<u32> next_code_point(); | ||||||
|     Optional<u32> peek_code_point(size_t offset) const; |     Optional<u32> peek_code_point(size_t offset) const; | ||||||
|     bool consume_next_if_match(const StringView&, CaseSensitivity = CaseSensitivity::CaseSensitive); |     bool consume_next_if_match(const StringView&, CaseSensitivity = CaseSensitivity::CaseSensitive); | ||||||
|  | @ -141,6 +142,9 @@ private: | ||||||
| 
 | 
 | ||||||
|     bool consumed_as_part_of_an_attribute() const; |     bool consumed_as_part_of_an_attribute() const; | ||||||
| 
 | 
 | ||||||
|  |     void restore_to(const Utf8CodepointIterator& new_iterator); | ||||||
|  |     auto& nth_last_position(size_t n = 0) { return m_source_positions.at(m_source_positions.size() - 1 - n); } | ||||||
|  | 
 | ||||||
|     State m_state { State::Data }; |     State m_state { State::Data }; | ||||||
|     State m_return_state { State::Data }; |     State m_return_state { State::Data }; | ||||||
| 
 | 
 | ||||||
|  | @ -165,6 +169,8 @@ private: | ||||||
|     u32 m_character_reference_code { 0 }; |     u32 m_character_reference_code { 0 }; | ||||||
| 
 | 
 | ||||||
|     bool m_blocked { false }; |     bool m_blocked { false }; | ||||||
|  | 
 | ||||||
|  |     Vector<HTMLToken::Position> m_source_positions; | ||||||
| }; | }; | ||||||
| 
 | 
 | ||||||
| } | } | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Ali Mohammad Pur
						Ali Mohammad Pur