mirror of
				https://github.com/RGBCube/serenity
				synced 2025-10-25 02:02:34 +00:00 
			
		
		
		
	 d476144565
			
		
	
	
		d476144565
		
	
	
	
	
		
			
			Some of the code assumed that chars were always signed, which is not the case on ARM hosts. Also, some of the code tried to use EOF (-1) in a way similar to what fgetc() does; however, instead of storing the characters in an int variable, a char was used. While this seemed to work, it also meant that character 0xFF would be incorrectly seen as an end-of-file. Careful reading of fgetc() reveals that fgetc() stores character data in an int where valid characters are in the range of 0-255 and the EOF value is explicitly outside of that range (usually -1).
		
			
				
	
	
		
			91 lines
		
	
	
	
		
			2.5 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
			
		
		
	
	
			91 lines
		
	
	
	
		
			2.5 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
| /*
 | |
|  * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
 | |
|  *
 | |
|  * SPDX-License-Identifier: BSD-2-Clause
 | |
|  */
 | |
| 
 | |
| #pragma once
 | |
| 
 | |
| #include <AK/Forward.h>
 | |
| #include <AK/StringView.h>
 | |
| 
 | |
| namespace regex {
 | |
| 
 | |
// X-macro listing the regex token kinds, in enumerator order.
// To use it, define __ENUMERATE_REGEX_TOKEN(x) to whatever should be generated
// per entry, expand ENUMERATE_REGEX_TOKENS, then #undef the helper again.
// Every line but the last must end in a backslash so the list stays one macro.
#define ENUMERATE_REGEX_TOKENS              \
    __ENUMERATE_REGEX_TOKEN(Eof)            \
    __ENUMERATE_REGEX_TOKEN(Char)           \
    __ENUMERATE_REGEX_TOKEN(Circumflex)     \
    __ENUMERATE_REGEX_TOKEN(Period)         \
    __ENUMERATE_REGEX_TOKEN(LeftParen)      \
    __ENUMERATE_REGEX_TOKEN(RightParen)     \
    __ENUMERATE_REGEX_TOKEN(LeftCurly)      \
    __ENUMERATE_REGEX_TOKEN(RightCurly)     \
    __ENUMERATE_REGEX_TOKEN(LeftBracket)    \
    __ENUMERATE_REGEX_TOKEN(RightBracket)   \
    __ENUMERATE_REGEX_TOKEN(Asterisk)       \
    __ENUMERATE_REGEX_TOKEN(EscapeSequence) \
    __ENUMERATE_REGEX_TOKEN(Dollar)         \
    __ENUMERATE_REGEX_TOKEN(Pipe)           \
    __ENUMERATE_REGEX_TOKEN(Plus)           \
    __ENUMERATE_REGEX_TOKEN(Comma)          \
    __ENUMERATE_REGEX_TOKEN(Slash)          \
    __ENUMERATE_REGEX_TOKEN(EqualSign)      \
    __ENUMERATE_REGEX_TOKEN(HyphenMinus)    \
    __ENUMERATE_REGEX_TOKEN(Colon)          \
    __ENUMERATE_REGEX_TOKEN(Questionmark)

// Kind of a token. One enumerator is generated per ENUMERATE_REGEX_TOKENS
// entry, so the underlying values start at 0 (Eof) and follow the list order.
enum class TokenType {
#define __ENUMERATE_REGEX_TOKEN(x) x,
    ENUMERATE_REGEX_TOKENS
#undef __ENUMERATE_REGEX_TOKEN
};
 | |
| 
 | |
| class Token {
 | |
| public:
 | |
|     Token() = default;
 | |
|     Token(const TokenType type, const size_t start_position, const StringView value)
 | |
|         : m_type(type)
 | |
|         , m_position(start_position)
 | |
|         , m_value(value)
 | |
|     {
 | |
|     }
 | |
| 
 | |
|     TokenType type() const { return m_type; }
 | |
|     const StringView& value() const { return m_value; }
 | |
|     size_t position() const { return m_position; }
 | |
| 
 | |
|     const char* name() const;
 | |
|     static const char* name(const TokenType);
 | |
| 
 | |
| private:
 | |
|     TokenType m_type { TokenType::Eof };
 | |
|     size_t m_position { 0 };
 | |
|     StringView m_value { nullptr };
 | |
| };
 | |
| 
 | |
| class Lexer {
 | |
| public:
 | |
|     Lexer() = default;
 | |
|     explicit Lexer(const StringView source);
 | |
|     Token next();
 | |
|     void reset();
 | |
|     void back(size_t offset);
 | |
|     void set_source(const StringView source) { m_source = source; }
 | |
|     bool try_skip(char);
 | |
|     char skip();
 | |
|     const auto& source() const { return m_source; }
 | |
| 
 | |
| private:
 | |
|     ALWAYS_INLINE int peek(size_t offset = 0) const;
 | |
|     ALWAYS_INLINE void consume();
 | |
| 
 | |
|     StringView m_source {};
 | |
|     size_t m_position { 0 };
 | |
|     size_t m_previous_position { 0 };
 | |
|     Token m_current_token { TokenType::Eof, 0, StringView(nullptr) };
 | |
|     int m_current_char { 0 };
 | |
| };
 | |
| 
 | |
| }
 | |
| 
 | |
| using regex::Lexer;
 |