mirror of
				https://github.com/RGBCube/serenity
				synced 2025-10-26 07:22:06 +00:00 
			
		
		
		
	 ab8432783e
			
		
	
	
		ab8432783e
		
	
	
	
	
		
			
			This is roughly on-spec, although I had to invent a simple "aborted" state for the tokenizer.
		
			
				
	
	
		
			218 lines
		
	
	
	
		
			9.7 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
			
		
		
	
	
			218 lines
		
	
	
	
		
			9.7 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
| /*
 | |
|  * Copyright (c) 2020, Andreas Kling <kling@serenityos.org>
 | |
|  * Copyright (c) 2022, Linus Groh <linusg@serenityos.org>
 | |
|  *
 | |
|  * SPDX-License-Identifier: BSD-2-Clause
 | |
|  */
 | |
| 
 | |
| #pragma once
 | |
| 
 | |
| #include <AK/Queue.h>
 | |
| #include <AK/StringBuilder.h>
 | |
| #include <AK/StringView.h>
 | |
| #include <AK/Types.h>
 | |
| #include <AK/Utf8View.h>
 | |
| #include <LibWeb/Forward.h>
 | |
| #include <LibWeb/HTML/Parser/HTMLToken.h>
 | |
| 
 | |
| namespace Web::HTML {
 | |
| 
 | |
// X-macro listing every tokenizer state exactly once.
// To use it, define __ENUMERATE_TOKENIZER_STATE(state) to whatever should be generated
// per state, expand ENUMERATE_TOKENIZER_STATES, then #undef the helper again.
// The order of the entries determines the order of anything generated from the list,
// so new states must not be inserted without checking the users of this macro.
// NOTE: only block comments may appear inside the list; a line comment would swallow
// the line continuation that follows it.
#define ENUMERATE_TOKENIZER_STATES                                        \
    /* Top-level content modes */                                         \
    __ENUMERATE_TOKENIZER_STATE(Data)                                     \
    __ENUMERATE_TOKENIZER_STATE(RCDATA)                                   \
    __ENUMERATE_TOKENIZER_STATE(RAWTEXT)                                  \
    __ENUMERATE_TOKENIZER_STATE(ScriptData)                               \
    __ENUMERATE_TOKENIZER_STATE(PLAINTEXT)                                \
    /* Tags */                                                            \
    __ENUMERATE_TOKENIZER_STATE(TagOpen)                                  \
    __ENUMERATE_TOKENIZER_STATE(EndTagOpen)                               \
    __ENUMERATE_TOKENIZER_STATE(TagName)                                  \
    /* RCDATA sub-states */                                               \
    __ENUMERATE_TOKENIZER_STATE(RCDATALessThanSign)                       \
    __ENUMERATE_TOKENIZER_STATE(RCDATAEndTagOpen)                         \
    __ENUMERATE_TOKENIZER_STATE(RCDATAEndTagName)                         \
    /* RAWTEXT sub-states */                                              \
    __ENUMERATE_TOKENIZER_STATE(RAWTEXTLessThanSign)                      \
    __ENUMERATE_TOKENIZER_STATE(RAWTEXTEndTagOpen)                        \
    __ENUMERATE_TOKENIZER_STATE(RAWTEXTEndTagName)                        \
    /* Script data sub-states */                                          \
    __ENUMERATE_TOKENIZER_STATE(ScriptDataLessThanSign)                   \
    __ENUMERATE_TOKENIZER_STATE(ScriptDataEndTagOpen)                     \
    __ENUMERATE_TOKENIZER_STATE(ScriptDataEndTagName)                     \
    __ENUMERATE_TOKENIZER_STATE(ScriptDataEscapeStart)                    \
    __ENUMERATE_TOKENIZER_STATE(ScriptDataEscapeStartDash)                \
    __ENUMERATE_TOKENIZER_STATE(ScriptDataEscaped)                        \
    __ENUMERATE_TOKENIZER_STATE(ScriptDataEscapedDash)                    \
    __ENUMERATE_TOKENIZER_STATE(ScriptDataEscapedDashDash)                \
    __ENUMERATE_TOKENIZER_STATE(ScriptDataEscapedLessThanSign)            \
    __ENUMERATE_TOKENIZER_STATE(ScriptDataEscapedEndTagOpen)              \
    __ENUMERATE_TOKENIZER_STATE(ScriptDataEscapedEndTagName)              \
    __ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscapeStart)              \
    __ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscaped)                  \
    __ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscapedDash)              \
    __ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscapedDashDash)          \
    __ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscapedLessThanSign)      \
    __ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscapeEnd)                \
    /* Attributes and self-closing tags */                                \
    __ENUMERATE_TOKENIZER_STATE(BeforeAttributeName)                      \
    __ENUMERATE_TOKENIZER_STATE(AttributeName)                            \
    __ENUMERATE_TOKENIZER_STATE(AfterAttributeName)                       \
    __ENUMERATE_TOKENIZER_STATE(BeforeAttributeValue)                     \
    __ENUMERATE_TOKENIZER_STATE(AttributeValueDoubleQuoted)               \
    __ENUMERATE_TOKENIZER_STATE(AttributeValueSingleQuoted)               \
    __ENUMERATE_TOKENIZER_STATE(AttributeValueUnquoted)                   \
    __ENUMERATE_TOKENIZER_STATE(AfterAttributeValueQuoted)                \
    __ENUMERATE_TOKENIZER_STATE(SelfClosingStartTag)                      \
    /* Comments and markup declarations */                                \
    __ENUMERATE_TOKENIZER_STATE(BogusComment)                             \
    __ENUMERATE_TOKENIZER_STATE(MarkupDeclarationOpen)                    \
    __ENUMERATE_TOKENIZER_STATE(CommentStart)                             \
    __ENUMERATE_TOKENIZER_STATE(CommentStartDash)                         \
    __ENUMERATE_TOKENIZER_STATE(Comment)                                  \
    __ENUMERATE_TOKENIZER_STATE(CommentLessThanSign)                      \
    __ENUMERATE_TOKENIZER_STATE(CommentLessThanSignBang)                  \
    __ENUMERATE_TOKENIZER_STATE(CommentLessThanSignBangDash)              \
    __ENUMERATE_TOKENIZER_STATE(CommentLessThanSignBangDashDash)          \
    __ENUMERATE_TOKENIZER_STATE(CommentEndDash)                           \
    __ENUMERATE_TOKENIZER_STATE(CommentEnd)                               \
    __ENUMERATE_TOKENIZER_STATE(CommentEndBang)                           \
    /* DOCTYPE */                                                         \
    __ENUMERATE_TOKENIZER_STATE(DOCTYPE)                                  \
    __ENUMERATE_TOKENIZER_STATE(BeforeDOCTYPEName)                        \
    __ENUMERATE_TOKENIZER_STATE(DOCTYPEName)                              \
    __ENUMERATE_TOKENIZER_STATE(AfterDOCTYPEName)                         \
    __ENUMERATE_TOKENIZER_STATE(AfterDOCTYPEPublicKeyword)                \
    __ENUMERATE_TOKENIZER_STATE(BeforeDOCTYPEPublicIdentifier)            \
    __ENUMERATE_TOKENIZER_STATE(DOCTYPEPublicIdentifierDoubleQuoted)      \
    __ENUMERATE_TOKENIZER_STATE(DOCTYPEPublicIdentifierSingleQuoted)      \
    __ENUMERATE_TOKENIZER_STATE(AfterDOCTYPEPublicIdentifier)             \
    __ENUMERATE_TOKENIZER_STATE(BetweenDOCTYPEPublicAndSystemIdentifiers) \
    __ENUMERATE_TOKENIZER_STATE(AfterDOCTYPESystemKeyword)                \
    __ENUMERATE_TOKENIZER_STATE(BeforeDOCTYPESystemIdentifier)            \
    __ENUMERATE_TOKENIZER_STATE(DOCTYPESystemIdentifierDoubleQuoted)      \
    __ENUMERATE_TOKENIZER_STATE(DOCTYPESystemIdentifierSingleQuoted)      \
    __ENUMERATE_TOKENIZER_STATE(AfterDOCTYPESystemIdentifier)             \
    __ENUMERATE_TOKENIZER_STATE(BogusDOCTYPE)                             \
    /* CDATA sections */                                                  \
    __ENUMERATE_TOKENIZER_STATE(CDATASection)                             \
    __ENUMERATE_TOKENIZER_STATE(CDATASectionBracket)                      \
    __ENUMERATE_TOKENIZER_STATE(CDATASectionEnd)                          \
    /* Character references */                                            \
    __ENUMERATE_TOKENIZER_STATE(CharacterReference)                       \
    __ENUMERATE_TOKENIZER_STATE(NamedCharacterReference)                  \
    __ENUMERATE_TOKENIZER_STATE(AmbiguousAmpersand)                       \
    __ENUMERATE_TOKENIZER_STATE(NumericCharacterReference)                \
    __ENUMERATE_TOKENIZER_STATE(HexadecimalCharacterReferenceStart)       \
    __ENUMERATE_TOKENIZER_STATE(DecimalCharacterReferenceStart)           \
    __ENUMERATE_TOKENIZER_STATE(HexadecimalCharacterReference)            \
    __ENUMERATE_TOKENIZER_STATE(DecimalCharacterReference)                \
    __ENUMERATE_TOKENIZER_STATE(NumericCharacterReferenceEnd)
 | |
| 
 | |
| class HTMLTokenizer {
 | |
| public:
 | |
|     explicit HTMLTokenizer();
 | |
|     explicit HTMLTokenizer(StringView input, String const& encoding);
 | |
| 
 | |
|     enum class State {
 | |
| #define __ENUMERATE_TOKENIZER_STATE(state) state,
 | |
|         ENUMERATE_TOKENIZER_STATES
 | |
| #undef __ENUMERATE_TOKENIZER_STATE
 | |
|     };
 | |
| 
 | |
|     Optional<HTMLToken> next_token();
 | |
| 
 | |
|     void set_parser(Badge<HTMLParser>, HTMLParser& parser) { m_parser = &parser; }
 | |
| 
 | |
|     void switch_to(Badge<HTMLParser>, State new_state);
 | |
|     void switch_to(State new_state)
 | |
|     {
 | |
|         m_state = new_state;
 | |
|     }
 | |
| 
 | |
|     void set_blocked(bool b) { m_blocked = b; }
 | |
|     bool is_blocked() const { return m_blocked; }
 | |
| 
 | |
|     String source() const { return m_decoded_input; }
 | |
| 
 | |
|     void insert_input_at_insertion_point(String const& input);
 | |
|     void insert_eof();
 | |
|     bool is_eof_inserted();
 | |
| 
 | |
|     bool is_insertion_point_defined() const { return m_insertion_point.defined; }
 | |
|     bool is_insertion_point_reached()
 | |
|     {
 | |
|         return m_insertion_point.defined && m_insertion_point.position >= m_utf8_view.iterator_offset(m_utf8_iterator);
 | |
|     }
 | |
|     void undefine_insertion_point() { m_insertion_point.defined = false; }
 | |
|     void store_insertion_point() { m_old_insertion_point = m_insertion_point; }
 | |
|     void restore_insertion_point() { m_insertion_point = m_old_insertion_point; }
 | |
|     void update_insertion_point()
 | |
|     {
 | |
|         m_insertion_point.defined = true;
 | |
|         m_insertion_point.position = m_utf8_view.iterator_offset(m_utf8_iterator);
 | |
|     }
 | |
| 
 | |
|     // This permanently cuts off the tokenizer input stream.
 | |
|     void abort() { m_aborted = true; }
 | |
| 
 | |
| private:
 | |
|     void skip(size_t count);
 | |
|     Optional<u32> next_code_point();
 | |
|     Optional<u32> peek_code_point(size_t offset) const;
 | |
|     bool consume_next_if_match(StringView, CaseSensitivity = CaseSensitivity::CaseSensitive);
 | |
|     void create_new_token(HTMLToken::Type);
 | |
|     bool current_end_tag_token_is_appropriate() const;
 | |
|     String consume_current_builder();
 | |
| 
 | |
|     static char const* state_name(State state)
 | |
|     {
 | |
|         switch (state) {
 | |
| #define __ENUMERATE_TOKENIZER_STATE(state) \
 | |
|     case State::state:                     \
 | |
|         return #state;
 | |
|             ENUMERATE_TOKENIZER_STATES
 | |
| #undef __ENUMERATE_TOKENIZER_STATE
 | |
|         };
 | |
|         VERIFY_NOT_REACHED();
 | |
|     }
 | |
| 
 | |
|     void will_emit(HTMLToken&);
 | |
|     void will_switch_to(State);
 | |
|     void will_reconsume_in(State);
 | |
| 
 | |
|     bool consumed_as_part_of_an_attribute() const;
 | |
| 
 | |
|     void restore_to(Utf8CodePointIterator const& new_iterator);
 | |
|     HTMLToken::Position nth_last_position(size_t n = 0);
 | |
| 
 | |
|     HTMLParser* m_parser { nullptr };
 | |
| 
 | |
|     State m_state { State::Data };
 | |
|     State m_return_state { State::Data };
 | |
| 
 | |
|     Vector<u32> m_temporary_buffer;
 | |
| 
 | |
|     String m_decoded_input;
 | |
| 
 | |
|     struct InsertionPoint {
 | |
|         size_t position { 0 };
 | |
|         bool defined { false };
 | |
|     };
 | |
|     InsertionPoint m_insertion_point {};
 | |
|     InsertionPoint m_old_insertion_point {};
 | |
| 
 | |
|     Utf8View m_utf8_view;
 | |
|     Utf8CodePointIterator m_utf8_iterator;
 | |
|     Utf8CodePointIterator m_prev_utf8_iterator;
 | |
| 
 | |
|     HTMLToken m_current_token;
 | |
|     StringBuilder m_current_builder;
 | |
| 
 | |
|     Optional<String> m_last_emitted_start_tag_name;
 | |
| 
 | |
|     bool m_explicit_eof_inserted { false };
 | |
|     bool m_has_emitted_eof { false };
 | |
| 
 | |
|     Queue<HTMLToken> m_queued_tokens;
 | |
| 
 | |
|     u32 m_character_reference_code { 0 };
 | |
| 
 | |
|     bool m_blocked { false };
 | |
| 
 | |
|     bool m_aborted { false };
 | |
| 
 | |
|     Vector<HTMLToken::Position> m_source_positions;
 | |
| };
 | |
| 
 | |
| }
 |