1
Fork 0
mirror of https://github.com/RGBCube/serenity synced 2025-07-25 15:17:36 +00:00

LibRegex: Convert regex::Lexer to inherit from GenericLexer

This will allow regex::Lexer users to invoke GenericLexer consumption
methods, such as GenericLexer::consume_escaped_codepoint().

This also allows for de-duplicating common methods between the lexers.
This commit is contained in:
Timothy Flynn 2021-08-18 14:10:08 -04:00 committed by Andreas Kling
parent dd44a5e948
commit 5ff9596678
3 changed files with 28 additions and 62 deletions

View file

@ -31,86 +31,58 @@ char const* Token::name() const
return name(m_type); return name(m_type);
} }
// Side-by-side diff row (old | new).
// Old column: the constructor taking a StringView, which stored it in m_source.
// New column: an out-of-line default constructor that initializes the
// GenericLexer base with StringView { nullptr }.
Lexer::Lexer(StringView const source) Lexer::Lexer()
: m_source(source) : GenericLexer(StringView { nullptr })
{ {
} }
// Old column (removed by this commit): peek(offset) returned EOF when
// m_position + offset was at or past the end of m_source, and otherwise the
// byte at that position converted through unsigned char.
// New column (added): the StringView constructor, whose only work is passing
// `source` to the GenericLexer base.
ALWAYS_INLINE int Lexer::peek(size_t offset) const Lexer::Lexer(StringView const source)
: GenericLexer(source)
{ {
if ((m_position + offset) >= m_source.length())
return EOF;
return (unsigned char)m_source[m_position + offset];
} }
// Moves the lexer position backwards by `offset` characters (old | new columns).
// - An offset of exactly position + 1 is clamped down to position.
// - Any offset still larger than the position trips the VERIFY.
// - An offset of 0 returns without changing anything.
// After rewinding, m_previous_position becomes one less than the new position,
// or 0 when the new position is 0.
// The new column tracks the position in m_index rather than m_position
// (m_index is not declared in the new Lexer class in this diff, so presumably
// it comes from GenericLexer) and no longer refreshes m_current_char.
void Lexer::back(size_t offset) void Lexer::back(size_t offset)
{ {
if (offset == m_position + 1) if (offset == m_index + 1)
offset = m_position; // 'position == 0' occurs twice. offset = m_index; // 'position == 0' occurs twice.
VERIFY(offset <= m_position); VERIFY(offset <= m_index);
if (!offset) if (!offset)
return; return;
m_position -= offset; m_index -= offset;
m_previous_position = (m_position > 0) ? m_position - 1 : 0; m_previous_position = (m_index > 0) ? m_index - 1 : 0;
m_current_char = m_source[m_position];
} }
// Records the current position in m_previous_position, then advances by one
// character (old | new columns).
// Old column: returned void. At or past the end of m_source it set m_position
// to length + 1 and m_current_char to EOF; otherwise it stored the next byte
// in m_current_char and incremented m_position.
// New column: returns the char produced by GenericLexer::consume().
// NOTE(review): the old explicit end-of-input branch has no counterpart on the
// new side; confirm GenericLexer::consume() behaves acceptably when this is
// called at end of input.
ALWAYS_INLINE void Lexer::consume() char Lexer::consume()
{ {
m_previous_position = m_position; m_previous_position = m_index;
return GenericLexer::consume();
if (m_position >= m_source.length()) {
m_position = m_source.length() + 1;
m_current_char = EOF;
return;
}
m_current_char = m_source[m_position++];
} }
// Returns the lexer's bookkeeping to its starting values (old | new columns):
// position 0, current token set to an Eof token at offset 0 with a null
// StringView, and previous position 0.
// The old column additionally cleared m_current_char.
// Neither column assigns the source view here, so the text being lexed is kept.
void Lexer::reset() void Lexer::reset()
{ {
m_position = 0; m_index = 0;
m_current_token = { TokenType::Eof, 0, StringView(nullptr) }; m_current_token = { TokenType::Eof, 0, StringView(nullptr) };
m_current_char = 0;
m_previous_position = 0; m_previous_position = 0;
} }
// Removed by this commit (old column only; there is no new-column counterpart).
// Consumed one character and returned true only when the next character
// equalled `c`; otherwise it returned false without consuming.
// The Parser::try_skip hunk further down shows the caller switching to
// consume_specific(ch) instead.
bool Lexer::try_skip(char c)
{
if (peek() != c)
return false;
consume();
return true;
}
// Removed by this commit (old column only; there is no new-column counterpart).
// Peeked the next character, consumed it, and returned it. The VERIFY that
// the character was not EOF ran only after consume() had already been called.
// The Parser::skip hunk further down shows the caller switching to
// lexer.consume() instead.
char Lexer::skip()
{
auto c = peek();
consume();
VERIFY(c != EOF);
return c;
}
// Produces the next Token from the input (old | new columns). Only the parts
// of this function that the diff hunks show are visible here; the "@ ... @"
// rows mark where unchanged code has been elided.
// The change on the visible rows is a rename: m_position -> m_index and
// m_source -> m_input, plus the end-of-input test near the bottom.
Token Lexer::next() Token Lexer::next()
{ {
size_t token_start_position; size_t token_start_position;
// begin_token: remembers the current position as the start of a multi-character token.
auto begin_token = [&] { auto begin_token = [&] {
token_start_position = m_position; token_start_position = m_index;
}; };
// commit_token: builds a token spanning token_start_position through
// m_previous_position inclusive. The VERIFY's left-hand side reduces
// algebraically to m_previous_position + 1.
auto commit_token = [&](auto type) -> Token& { auto commit_token = [&](auto type) -> Token& {
VERIFY(token_start_position + m_previous_position - token_start_position + 1 <= m_source.length()); VERIFY(token_start_position + m_previous_position - token_start_position + 1 <= m_input.length());
auto substring = m_source.substring_view(token_start_position, m_previous_position - token_start_position + 1); auto substring = m_input.substring_view(token_start_position, m_previous_position - token_start_position + 1);
m_current_token = Token(type, token_start_position, substring); m_current_token = Token(type, token_start_position, substring);
return m_current_token; return m_current_token;
}; };
// emit_token: builds a one-character token at the current position, then consumes that character.
auto emit_token = [&](auto type) -> Token& { auto emit_token = [&](auto type) -> Token& {
m_current_token = Token(type, m_position, m_source.substring_view(m_position, 1)); m_current_token = Token(type, m_index, m_input.substring_view(m_index, 1));
consume(); consume();
return m_current_token; return m_current_token;
}; };
@ -137,7 +109,7 @@ Token Lexer::next()
} }
}; };
// Main loop: note the bound is <=, so the body also runs once with the position equal to the input length.
while (m_position <= m_source.length()) { while (m_index <= m_input.length()) {
auto ch = peek(); auto ch = peek();
if (ch == '(') if (ch == '(')
return emit_token(TokenType::LeftParen); return emit_token(TokenType::LeftParen);
@ -203,13 +175,13 @@ Token Lexer::next()
} }
} }
// End-of-input test: the old column compared against EOF, the new column compares against '\0'.
// NOTE(review): with '\0' as the sentinel, confirm that a literal NUL byte
// inside the pattern cannot be mistaken for end of input.
if (ch == EOF) if (ch == '\0')
break; break;
return emit_token(TokenType::Char); return emit_token(TokenType::Char);
} }
// Falling out of the loop yields an Eof token positioned at the current index.
return Token(TokenType::Eof, m_position, nullptr); return Token(TokenType::Eof, m_index, nullptr);
} }
} }

View file

@ -7,6 +7,7 @@
#pragma once #pragma once
#include <AK/Forward.h> #include <AK/Forward.h>
#include <AK/GenericLexer.h>
#include <AK/StringView.h> #include <AK/StringView.h>
namespace regex { namespace regex {
@ -63,27 +64,20 @@ private:
StringView m_value { nullptr }; StringView m_value { nullptr };
}; };
// Regex lexer class declaration (old | new columns).
// Old column: a standalone class holding its own m_source, m_position and
// m_current_char, with private peek()/consume() and public try_skip()/skip().
// New column: derives publicly from GenericLexer. It drops m_source,
// m_position, m_current_char, peek(), try_skip() and skip(); declares a
// public consume() returning char; and has set_source()/source() operate on
// m_input instead of m_source.
// NOTE(review): consume() is declared here on a class that derives from
// GenericLexer, so it presumably hides a base-class consume(). Base-class
// helpers such as consume_specific() (called from the Parser::try_skip hunk)
// would then not go through this override and would not update
// m_previous_position; confirm that is intended.
class Lexer { class Lexer : public GenericLexer {
public: public:
Lexer() = default; Lexer();
explicit Lexer(StringView const source); explicit Lexer(StringView const source);
Token next(); Token next();
void reset(); void reset();
void back(size_t offset); void back(size_t offset);
void set_source(StringView const source) { m_source = source; } char consume();
bool try_skip(char); void set_source(StringView const source) { m_input = source; }
char skip(); auto const& source() const { return m_input; }
auto const& source() const { return m_source; }
private: private:
ALWAYS_INLINE int peek(size_t offset = 0) const;
ALWAYS_INLINE void consume();
StringView m_source {};
size_t m_position { 0 };
// Position recorded by consume() just before it advances; back() and reset() also rewrite it.
size_t m_previous_position { 0 }; size_t m_previous_position { 0 };
// Most recently produced token; starts as an Eof token at offset 0 with a null StringView.
Token m_current_token { TokenType::Eof, 0, StringView(nullptr) }; Token m_current_token { TokenType::Eof, 0, StringView(nullptr) };
int m_current_char { 0 };
}; };
} }

View file

@ -101,7 +101,7 @@ ALWAYS_INLINE bool Parser::try_skip(StringView str)
size_t potentially_go_back { 0 }; size_t potentially_go_back { 0 };
for (auto ch : str) { for (auto ch : str) {
if (!m_parser_state.lexer.try_skip(ch)) { if (!m_parser_state.lexer.consume_specific(ch)) {
m_parser_state.lexer.back(potentially_go_back); m_parser_state.lexer.back(potentially_go_back);
return false; return false;
} }
@ -129,7 +129,7 @@ ALWAYS_INLINE char Parser::skip()
ch = m_parser_state.current_token.value()[0]; ch = m_parser_state.current_token.value()[0];
} else { } else {
m_parser_state.lexer.back(m_parser_state.current_token.value().length()); m_parser_state.lexer.back(m_parser_state.current_token.value().length());
ch = m_parser_state.lexer.skip(); ch = m_parser_state.lexer.consume();
} }
m_parser_state.current_token = m_parser_state.lexer.next(); m_parser_state.current_token = m_parser_state.lexer.next();