mirror of
				https://github.com/RGBCube/serenity
				synced 2025-10-31 15:32:46 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			556 lines
		
	
	
	
		
			17 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
			
		
		
	
	
			556 lines
		
	
	
	
		
			17 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
| /*
 | |
|  * Copyright (c) 2018-2020, Andreas Kling <kling@serenityos.org>
 | |
|  * All rights reserved.
 | |
|  *
 | |
|  * Redistribution and use in source and binary forms, with or without
 | |
|  * modification, are permitted provided that the following conditions are met:
 | |
|  *
 | |
|  * 1. Redistributions of source code must retain the above copyright notice, this
 | |
|  *    list of conditions and the following disclaimer.
 | |
|  *
 | |
|  * 2. Redistributions in binary form must reproduce the above copyright notice,
 | |
|  *    this list of conditions and the following disclaimer in the documentation
 | |
|  *    and/or other materials provided with the distribution.
 | |
|  *
 | |
|  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 | |
|  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 | |
|  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 | |
|  * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 | |
|  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 | |
|  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 | |
|  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 | |
|  * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 | |
|  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 | |
|  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 | |
|  */
 | |
| 
 | |
| #include "CppLexer.h"
 | |
| #include <AK/HashTable.h>
 | |
| #include <AK/String.h>
 | |
| #include <ctype.h>
 | |
| 
 | |
| namespace GUI {
 | |
| 
 | |
| CppLexer::CppLexer(const StringView& input)
 | |
|     : m_input(input)
 | |
| {
 | |
| }
 | |
| 
 | |
| char CppLexer::peek(size_t offset) const
 | |
| {
 | |
|     if ((m_index + offset) >= m_input.length())
 | |
|         return 0;
 | |
|     return m_input[m_index + offset];
 | |
| }
 | |
| 
 | |
| char CppLexer::consume()
 | |
| {
 | |
|     ASSERT(m_index < m_input.length());
 | |
|     char ch = m_input[m_index++];
 | |
|     m_previous_position = m_position;
 | |
|     if (ch == '\n') {
 | |
|         m_position.line++;
 | |
|         m_position.column = 0;
 | |
|     } else {
 | |
|         m_position.column++;
 | |
|     }
 | |
|     return ch;
 | |
| }
 | |
| 
 | |
| static bool is_valid_first_character_of_identifier(char ch)
 | |
| {
 | |
|     return isalpha(ch) || ch == '_' || ch == '$';
 | |
| }
 | |
| 
 | |
| static bool is_valid_nonfirst_character_of_identifier(char ch)
 | |
| {
 | |
|     return is_valid_first_character_of_identifier(ch) || isdigit(ch);
 | |
| }
 | |
| 
 | |
| static bool is_keyword(const StringView& string)
 | |
| {
 | |
|     static HashTable<String> keywords;
 | |
|     if (keywords.is_empty()) {
 | |
|         keywords.set("alignas");
 | |
|         keywords.set("alignof");
 | |
|         keywords.set("and");
 | |
|         keywords.set("and_eq");
 | |
|         keywords.set("asm");
 | |
|         keywords.set("bitand");
 | |
|         keywords.set("bitor");
 | |
|         keywords.set("bool");
 | |
|         keywords.set("break");
 | |
|         keywords.set("case");
 | |
|         keywords.set("catch");
 | |
|         keywords.set("class");
 | |
|         keywords.set("compl");
 | |
|         keywords.set("const");
 | |
|         keywords.set("const_cast");
 | |
|         keywords.set("constexpr");
 | |
|         keywords.set("continue");
 | |
|         keywords.set("decltype");
 | |
|         keywords.set("default");
 | |
|         keywords.set("delete");
 | |
|         keywords.set("do");
 | |
|         keywords.set("dynamic_cast");
 | |
|         keywords.set("else");
 | |
|         keywords.set("enum");
 | |
|         keywords.set("explicit");
 | |
|         keywords.set("export");
 | |
|         keywords.set("extern");
 | |
|         keywords.set("false");
 | |
|         keywords.set("final");
 | |
|         keywords.set("for");
 | |
|         keywords.set("friend");
 | |
|         keywords.set("goto");
 | |
|         keywords.set("if");
 | |
|         keywords.set("inline");
 | |
|         keywords.set("mutable");
 | |
|         keywords.set("namespace");
 | |
|         keywords.set("new");
 | |
|         keywords.set("noexcept");
 | |
|         keywords.set("not");
 | |
|         keywords.set("not_eq");
 | |
|         keywords.set("nullptr");
 | |
|         keywords.set("operator");
 | |
|         keywords.set("or");
 | |
|         keywords.set("or_eq");
 | |
|         keywords.set("override");
 | |
|         keywords.set("private");
 | |
|         keywords.set("protected");
 | |
|         keywords.set("public");
 | |
|         keywords.set("register");
 | |
|         keywords.set("reinterpret_cast");
 | |
|         keywords.set("return");
 | |
|         keywords.set("signed");
 | |
|         keywords.set("sizeof");
 | |
|         keywords.set("static");
 | |
|         keywords.set("static_assert");
 | |
|         keywords.set("static_cast");
 | |
|         keywords.set("struct");
 | |
|         keywords.set("switch");
 | |
|         keywords.set("template");
 | |
|         keywords.set("this");
 | |
|         keywords.set("thread_local");
 | |
|         keywords.set("throw");
 | |
|         keywords.set("true");
 | |
|         keywords.set("try");
 | |
|         keywords.set("typedef");
 | |
|         keywords.set("typeid");
 | |
|         keywords.set("typename");
 | |
|         keywords.set("union");
 | |
|         keywords.set("using");
 | |
|         keywords.set("virtual");
 | |
|         keywords.set("volatile");
 | |
|         keywords.set("while");
 | |
|         keywords.set("xor");
 | |
|         keywords.set("xor_eq");
 | |
|     }
 | |
|     return keywords.contains(string);
 | |
| }
 | |
| 
 | |
| static bool is_known_type(const StringView& string)
 | |
| {
 | |
|     static HashTable<String> types;
 | |
|     if (types.is_empty()) {
 | |
|         types.set("ByteBuffer");
 | |
|         types.set("CircularDeque");
 | |
|         types.set("CircularQueue");
 | |
|         types.set("Deque");
 | |
|         types.set("DoublyLinkedList");
 | |
|         types.set("FileSystemPath");
 | |
|         types.set("FixedArray");
 | |
|         types.set("Function");
 | |
|         types.set("HashMap");
 | |
|         types.set("HashTable");
 | |
|         types.set("IPv4Address");
 | |
|         types.set("InlineLinkedList");
 | |
|         types.set("IntrusiveList");
 | |
|         types.set("JsonArray");
 | |
|         types.set("JsonObject");
 | |
|         types.set("JsonValue");
 | |
|         types.set("MappedFile");
 | |
|         types.set("NetworkOrdered");
 | |
|         types.set("NonnullOwnPtr");
 | |
|         types.set("NonnullOwnPtrVector");
 | |
|         types.set("NonnullRefPtr");
 | |
|         types.set("NonnullRefPtrVector");
 | |
|         types.set("Optional");
 | |
|         types.set("OwnPtr");
 | |
|         types.set("RefPtr");
 | |
|         types.set("Result");
 | |
|         types.set("ScopeGuard");
 | |
|         types.set("SinglyLinkedList");
 | |
|         types.set("String");
 | |
|         types.set("StringBuilder");
 | |
|         types.set("StringImpl");
 | |
|         types.set("StringView");
 | |
|         types.set("Utf8View");
 | |
|         types.set("Vector");
 | |
|         types.set("WeakPtr");
 | |
|         types.set("auto");
 | |
|         types.set("char");
 | |
|         types.set("char16_t");
 | |
|         types.set("char32_t");
 | |
|         types.set("char8_t");
 | |
|         types.set("double");
 | |
|         types.set("float");
 | |
|         types.set("i16");
 | |
|         types.set("i32");
 | |
|         types.set("i64");
 | |
|         types.set("i8");
 | |
|         types.set("int");
 | |
|         types.set("int");
 | |
|         types.set("long");
 | |
|         types.set("short");
 | |
|         types.set("signed");
 | |
|         types.set("u16");
 | |
|         types.set("u32");
 | |
|         types.set("u64");
 | |
|         types.set("u8");
 | |
|         types.set("unsigned");
 | |
|         types.set("void");
 | |
|         types.set("wchar_t");
 | |
|     }
 | |
|     return types.contains(string);
 | |
| }
 | |
| 
 | |
| Vector<CppToken> CppLexer::lex()
 | |
| {
 | |
|     Vector<CppToken> tokens;
 | |
| 
 | |
|     size_t token_start_index = 0;
 | |
|     CppPosition token_start_position;
 | |
| 
 | |
|     auto emit_token = [&](auto type) {
 | |
|         CppToken token;
 | |
|         token.m_type = type;
 | |
|         token.m_start = m_position;
 | |
|         token.m_end = m_position;
 | |
|         tokens.append(token);
 | |
|         consume();
 | |
|     };
 | |
| 
 | |
|     auto begin_token = [&] {
 | |
|         token_start_index = m_index;
 | |
|         token_start_position = m_position;
 | |
|     };
 | |
|     auto commit_token = [&](auto type) {
 | |
|         CppToken token;
 | |
|         token.m_type = type;
 | |
|         token.m_start = token_start_position;
 | |
|         token.m_end = m_previous_position;
 | |
|         tokens.append(token);
 | |
|     };
 | |
| 
 | |
|     auto match_escape_sequence = [&]() -> size_t {
 | |
|         switch (peek(1)) {
 | |
|         case '\'':
 | |
|         case '"':
 | |
|         case '?':
 | |
|         case '\\':
 | |
|         case 'a':
 | |
|         case 'b':
 | |
|         case 'f':
 | |
|         case 'n':
 | |
|         case 'r':
 | |
|         case 't':
 | |
|         case 'v':
 | |
|             return 2;
 | |
|         case '0':
 | |
|         case '1':
 | |
|         case '2':
 | |
|         case '3':
 | |
|         case '4':
 | |
|         case '5':
 | |
|         case '6':
 | |
|         case '7': {
 | |
|             size_t octal_digits = 1;
 | |
|             for (size_t i = 0; i < 2; ++i) {
 | |
|                 char next = peek(2 + i);
 | |
|                 if (next < '0' || next > '7')
 | |
|                     break;
 | |
|                 ++octal_digits;
 | |
|             }
 | |
|             return 1 + octal_digits;
 | |
|         }
 | |
|         case 'x': {
 | |
|             size_t hex_digits = 0;
 | |
|             for (size_t i = 0; i < 2; ++i) {
 | |
|                 if (!isxdigit(peek(2 + i)))
 | |
|                     break;
 | |
|                 ++hex_digits;
 | |
|             }
 | |
|             return 2 + hex_digits;
 | |
|         }
 | |
|         case 'u': {
 | |
|             bool is_unicode = true;
 | |
|             for (size_t i = 0; i < 4; ++i) {
 | |
|                 if (!isxdigit(peek(2 + i))) {
 | |
|                     is_unicode = false;
 | |
|                     break;
 | |
|                 }
 | |
|             }
 | |
|             return is_unicode ? 6 : 0;
 | |
|         }
 | |
|         default:
 | |
|             return 0;
 | |
|         }
 | |
|     };
 | |
| 
 | |
|     while (m_index < m_input.length()) {
 | |
|         auto ch = peek();
 | |
|         if (isspace(ch)) {
 | |
|             begin_token();
 | |
|             while (isspace(peek()))
 | |
|                 consume();
 | |
|             commit_token(CppToken::Type::Whitespace);
 | |
|             continue;
 | |
|         }
 | |
|         if (ch == '(') {
 | |
|             emit_token(CppToken::Type::LeftParen);
 | |
|             continue;
 | |
|         }
 | |
|         if (ch == ')') {
 | |
|             emit_token(CppToken::Type::RightParen);
 | |
|             continue;
 | |
|         }
 | |
|         if (ch == '{') {
 | |
|             emit_token(CppToken::Type::LeftCurly);
 | |
|             continue;
 | |
|         }
 | |
|         if (ch == '}') {
 | |
|             emit_token(CppToken::Type::RightCurly);
 | |
|             continue;
 | |
|         }
 | |
|         if (ch == '[') {
 | |
|             emit_token(CppToken::Type::LeftBracket);
 | |
|             continue;
 | |
|         }
 | |
|         if (ch == ']') {
 | |
|             emit_token(CppToken::Type::RightBracket);
 | |
|             continue;
 | |
|         }
 | |
|         if (ch == ',') {
 | |
|             emit_token(CppToken::Type::Comma);
 | |
|             continue;
 | |
|         }
 | |
|         if (ch == '*') {
 | |
|             emit_token(CppToken::Type::Asterisk);
 | |
|             continue;
 | |
|         }
 | |
|         if (ch == ';') {
 | |
|             emit_token(CppToken::Type::Semicolon);
 | |
|             continue;
 | |
|         }
 | |
|         if (ch == '#') {
 | |
|             begin_token();
 | |
|             consume();
 | |
| 
 | |
|             if (is_valid_first_character_of_identifier(peek()))
 | |
|                 while (peek() && is_valid_nonfirst_character_of_identifier(peek()))
 | |
|                     consume();
 | |
| 
 | |
|             auto directive = StringView(m_input.characters_without_null_termination() + token_start_index, m_index - token_start_index);
 | |
|             if (directive == "#include") {
 | |
|                 commit_token(CppToken::Type::IncludeStatement);
 | |
| 
 | |
|                 begin_token();
 | |
|                 while (isspace(peek()))
 | |
|                     consume();
 | |
|                 commit_token(CppToken::Type::Whitespace);
 | |
| 
 | |
|                 begin_token();
 | |
|                 if (peek() == '<' || peek() == '"') {
 | |
|                     char closing = consume() == '<' ? '>' : '"';
 | |
|                     while (peek() && peek() != closing && peek() != '\n')
 | |
|                         consume();
 | |
| 
 | |
|                     if (peek() && consume() == '\n') {
 | |
|                         commit_token(CppToken::Type::IncludePath);
 | |
|                         continue;
 | |
|                     }
 | |
| 
 | |
|                     commit_token(CppToken::Type::IncludePath);
 | |
|                     begin_token();
 | |
|                 }
 | |
|             }
 | |
| 
 | |
|             while (peek() && peek() != '\n')
 | |
|                 consume();
 | |
| 
 | |
|             commit_token(CppToken::Type::PreprocessorStatement);
 | |
|             continue;
 | |
|         }
 | |
|         if (ch == '/' && peek(1) == '/') {
 | |
|             begin_token();
 | |
|             while (peek() && peek() != '\n')
 | |
|                 consume();
 | |
|             commit_token(CppToken::Type::Comment);
 | |
|             continue;
 | |
|         }
 | |
|         if (ch == '/' && peek(1) == '*') {
 | |
|             begin_token();
 | |
|             consume();
 | |
|             consume();
 | |
|             bool comment_block_ends = false;
 | |
|             while (peek()) {
 | |
|                 if (peek() == '*' && peek(1) == '/') {
 | |
|                     comment_block_ends = true;
 | |
|                     break;
 | |
|                 }
 | |
| 
 | |
|                 consume();
 | |
|             }
 | |
| 
 | |
|             if (comment_block_ends) {
 | |
|                 consume();
 | |
|                 consume();
 | |
|             }
 | |
| 
 | |
|             commit_token(CppToken::Type::Comment);
 | |
|             continue;
 | |
|         }
 | |
|         if (ch == '"') {
 | |
|             begin_token();
 | |
|             consume();
 | |
|             while (peek()) {
 | |
|                 if (peek() == '\\') {
 | |
|                     size_t escape = match_escape_sequence();
 | |
|                     if (escape > 0) {
 | |
|                         commit_token(CppToken::Type::DoubleQuotedString);
 | |
|                         begin_token();
 | |
|                         for (size_t i = 0; i < escape; ++i)
 | |
|                             consume();
 | |
|                         commit_token(CppToken::Type::EscapeSequence);
 | |
|                         begin_token();
 | |
|                         continue;
 | |
|                     }
 | |
|                 }
 | |
| 
 | |
|                 if (consume() == '"')
 | |
|                     break;
 | |
|             }
 | |
|             commit_token(CppToken::Type::DoubleQuotedString);
 | |
|             continue;
 | |
|         }
 | |
|         if (ch == '\'') {
 | |
|             begin_token();
 | |
|             consume();
 | |
|             while (peek()) {
 | |
|                 if (peek() == '\\') {
 | |
|                     size_t escape = match_escape_sequence();
 | |
|                     if (escape > 0) {
 | |
|                         commit_token(CppToken::Type::SingleQuotedString);
 | |
|                         begin_token();
 | |
|                         for (size_t i = 0; i < escape; ++i)
 | |
|                             consume();
 | |
|                         commit_token(CppToken::Type::EscapeSequence);
 | |
|                         begin_token();
 | |
|                         continue;
 | |
|                     }
 | |
|                 }
 | |
| 
 | |
|                 if (consume() == '\'')
 | |
|                     break;
 | |
|             }
 | |
|             commit_token(CppToken::Type::SingleQuotedString);
 | |
|             continue;
 | |
|         }
 | |
|         if (isdigit(ch) || (ch == '.' && isdigit(peek(1)))) {
 | |
|             begin_token();
 | |
|             consume();
 | |
| 
 | |
|             auto type = ch == '.' ? CppToken::Type::Float : CppToken::Type::Integer;
 | |
|             bool is_hex = false;
 | |
|             bool is_binary = false;
 | |
| 
 | |
|             auto match_exponent = [&]() -> size_t {
 | |
|                 char ch = peek();
 | |
|                 if (ch != 'e' && ch != 'E' && ch != 'p' && ch != 'P')
 | |
|                     return 0;
 | |
| 
 | |
|                 type = CppToken::Type::Float;
 | |
|                 size_t length = 1;
 | |
|                 ch = peek(length);
 | |
|                 if (ch == '+' || ch == '-') {
 | |
|                     ++length;
 | |
|                 }
 | |
|                 for (ch = peek(length); isdigit(ch); ch = peek(length)) {
 | |
|                     ++length;
 | |
|                 }
 | |
|                 return length;
 | |
|             };
 | |
| 
 | |
|             auto match_type_literal = [&]() -> size_t {
 | |
|                 size_t length = 0;
 | |
|                 for (;;) {
 | |
|                     char ch = peek(length);
 | |
|                     if ((ch == 'u' || ch == 'U') && type == CppToken::Type::Integer) {
 | |
|                         ++length;
 | |
|                     } else if ((ch == 'f' || ch == 'F') && !is_binary) {
 | |
|                         type = CppToken::Type::Float;
 | |
|                         ++length;
 | |
|                     } else if (ch == 'l' || ch == 'L') {
 | |
|                         ++length;
 | |
|                     } else
 | |
|                         return length;
 | |
|                 }
 | |
|             };
 | |
| 
 | |
|             if (peek() == 'b' || peek() == 'B') {
 | |
|                 consume();
 | |
|                 is_binary = true;
 | |
|                 for (char ch = peek(); ch == '0' || ch == '1' || (ch == '\'' && peek(1) != '\''); ch = peek()) {
 | |
|                     consume();
 | |
|                 }
 | |
|             } else {
 | |
|                 if (peek() == 'x' || peek() == 'X') {
 | |
|                     consume();
 | |
|                     is_hex = true;
 | |
|                 }
 | |
| 
 | |
|                 for (char ch = peek(); (is_hex ? isxdigit(ch) : isdigit(ch)) || (ch == '\'' && peek(1) != '\'') || ch == '.'; ch = peek()) {
 | |
|                     if (ch == '.') {
 | |
|                         if (type == CppToken::Type::Integer) {
 | |
|                             type = CppToken::Type::Float;
 | |
|                         } else
 | |
|                             break;
 | |
|                     };
 | |
|                     consume();
 | |
|                 }
 | |
|             }
 | |
| 
 | |
|             if (!is_binary) {
 | |
|                 size_t length = match_exponent();
 | |
|                 for (size_t i = 0; i < length; ++i)
 | |
|                     consume();
 | |
|             }
 | |
| 
 | |
|             size_t length = match_type_literal();
 | |
|             for (size_t i = 0; i < length; ++i)
 | |
|                 consume();
 | |
| 
 | |
|             commit_token(type);
 | |
|             continue;
 | |
|         }
 | |
|         if (is_valid_first_character_of_identifier(ch)) {
 | |
|             begin_token();
 | |
|             while (peek() && is_valid_nonfirst_character_of_identifier(peek()))
 | |
|                 consume();
 | |
|             auto token_view = StringView(m_input.characters_without_null_termination() + token_start_index, m_index - token_start_index);
 | |
|             if (is_keyword(token_view))
 | |
|                 commit_token(CppToken::Type::Keyword);
 | |
|             else if (is_known_type(token_view))
 | |
|                 commit_token(CppToken::Type::KnownType);
 | |
|             else
 | |
|                 commit_token(CppToken::Type::Identifier);
 | |
|             continue;
 | |
|         }
 | |
|         dbg() << "Unimplemented token character: " << ch;
 | |
|         emit_token(CppToken::Type::Unknown);
 | |
|     }
 | |
|     return tokens;
 | |
| }
 | |
| 
 | |
| }
 | 
