1
Fork 0
mirror of https://github.com/RGBCube/serenity synced 2025-07-27 07:47:35 +00:00

LibSQL: Make lexer and parser more standard SQL compliant

SQL was standardized before consensus on sane language syntax
constructs had evolved. The language is mostly case-insensitive, with
unquoted text converted to upper case. Identifiers can include lower
case characters and other 'special' characters by enclosing the
identifier with double quotes. A double quote is escaped by doubling it.
Likewise, a single quote in a literal string is escaped by doubling it.

All this means that the strategy used in the lexer, where a token's
value is a StringView 'window' on the source string, does not work,
because the value needs to be massaged before being handed to the
parser. Therefore a token now has a String containing its value. Given
the limited lifetime of a token, this is acceptable overhead.

Not doing this means that for example quote removal and double quote
escaping would need to be done in the parser or in AST node
construction, which would spread lexing basically all over the place.
Which would be suboptimal.

There was some impact on the sql utility and SyntaxHighlighter component
which was addressed by storing the token's end position together with
the start position in order to properly highlight it.

Finally, reviewing the tests for parsing numeric literals revealed an
inconsistency in which tokens we accept or reject: `1a` is accepted but
`1e` is rejected. Related to this is the fate of `0x`. Added a FIXME
reminding us to address this.
This commit is contained in:
Jan de Visser 2021-06-21 11:20:09 -04:00 committed by Andreas Kling
parent 4198f7e1af
commit 5c4890411b
9 changed files with 408 additions and 311 deletions

View file

@ -1,5 +1,6 @@
/*
* Copyright (c) 2021, Tim Flynn <trflynn89@pm.me>
* Copyright (c) 2021, Jan de Visser <jan@de-visser.net>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
@ -48,31 +49,36 @@ Token Lexer::next()
{
bool found_invalid_comment = consume_whitespace_and_comments();
size_t value_start = m_position;
size_t value_start_line_number = m_line_number;
size_t value_start_column_number = m_line_column;
auto token_type = TokenType::Invalid;
StringBuilder current_token;
if (is_eof()) {
token_type = found_invalid_comment ? TokenType::Invalid : TokenType::Eof;
} else if (is_numeric_literal_start()) {
token_type = TokenType::NumericLiteral;
if (!consume_numeric_literal())
if (!consume_numeric_literal(current_token))
token_type = TokenType::Invalid;
} else if (is_string_literal_start()) {
token_type = TokenType::StringLiteral;
if (!consume_string_literal())
if (!consume_string_literal(current_token))
token_type = TokenType::Invalid;
} else if (is_quoted_identifier_start()) {
token_type = TokenType::Identifier;
if (!consume_quoted_identifier(current_token))
token_type = TokenType::Invalid;
} else if (is_blob_literal_start()) {
token_type = TokenType::BlobLiteral;
if (!consume_blob_literal())
if (!consume_blob_literal(current_token))
token_type = TokenType::Invalid;
} else if (is_identifier_start()) {
do {
current_token.append((char)toupper(m_current_char));
consume();
} while (is_identifier_middle());
if (auto it = s_keywords.find(m_source.substring_view(value_start - 1, m_position - value_start)); it != s_keywords.end()) {
if (auto it = s_keywords.find(current_token.string_view()); it != s_keywords.end()) {
token_type = it->value;
} else {
token_type = TokenType::Identifier;
@ -83,8 +89,8 @@ Token Lexer::next()
if (auto it = s_two_char_tokens.find(m_source.substring_view(m_position - 1, 2)); it != s_two_char_tokens.end()) {
found_two_char_token = true;
token_type = it->value;
consume();
consume();
consume(&current_token);
consume(&current_token);
}
}
@ -93,30 +99,32 @@ Token Lexer::next()
if (auto it = s_one_char_tokens.find(m_current_char); it != s_one_char_tokens.end()) {
found_one_char_token = true;
token_type = it->value;
consume();
consume(&current_token);
}
}
if (!found_two_char_token && !found_one_char_token) {
token_type = TokenType::Invalid;
consume();
consume(&current_token);
}
}
Token token(token_type, m_source.substring_view(value_start - 1, m_position - value_start), value_start_line_number, value_start_column_number);
Token token(token_type, current_token.build(),
{ value_start_line_number, value_start_column_number },
{ m_line_number, m_line_column });
if constexpr (SQL_DEBUG) {
dbgln("------------------------------");
dbgln("Token: {}", token.name());
dbgln("Value: {}", token.value());
dbgln("Line: {}, Column: {}", token.line_number(), token.line_column());
dbgln("Line: {}, Column: {}", token.start_position().line, token.start_position().column);
dbgln("------------------------------");
}
return token;
}
void Lexer::consume()
void Lexer::consume(StringBuilder* current_token)
{
auto did_reach_eof = [this] {
if (m_position != m_source.length())
@ -128,6 +136,9 @@ void Lexer::consume()
return true;
};
if (current_token)
current_token->append(m_current_char);
if (m_position > m_source.length())
return;
@ -177,91 +188,148 @@ bool Lexer::consume_whitespace_and_comments()
return found_invalid_comment;
}
bool Lexer::consume_numeric_literal()
bool Lexer::consume_numeric_literal(StringBuilder& current_token)
{
// https://sqlite.org/syntax/numeric-literal.html
bool is_valid_numeric_literal = true;
if (m_current_char == '0') {
consume();
consume(&current_token);
if (m_current_char == '.') {
consume();
consume(&current_token);
while (isdigit(m_current_char))
consume();
consume(&current_token);
if (m_current_char == 'e' || m_current_char == 'E')
is_valid_numeric_literal = consume_exponent();
is_valid_numeric_literal = consume_exponent(current_token);
} else if (m_current_char == 'e' || m_current_char == 'E') {
is_valid_numeric_literal = consume_exponent();
is_valid_numeric_literal = consume_exponent(current_token);
} else if (m_current_char == 'x' || m_current_char == 'X') {
is_valid_numeric_literal = consume_hexadecimal_number();
is_valid_numeric_literal = consume_hexadecimal_number(current_token);
} else if (isdigit(m_current_char)) {
do {
consume();
consume(&current_token);
} while (isdigit(m_current_char));
}
} else {
do {
consume();
consume(&current_token);
} while (isdigit(m_current_char));
if (m_current_char == '.') {
consume();
consume(&current_token);
while (isdigit(m_current_char))
consume();
consume(&current_token);
}
if (m_current_char == 'e' || m_current_char == 'E')
is_valid_numeric_literal = consume_exponent();
is_valid_numeric_literal = consume_exponent(current_token);
}
return is_valid_numeric_literal;
}
bool Lexer::consume_string_literal()
bool Lexer::consume_string_literal(StringBuilder& current_token)
{
// https://sqlite.org/lang_expr.html - See "3. Literal Values (Constants)"
bool is_valid_string_literal = true;
// Skip the opening single quote:
consume();
while (!is_eof() && !is_string_literal_end())
consume();
while (!is_eof() && !is_string_literal_end()) {
// If both the current character and the next one are single quotes,
// consume one single quote into the current token, and drop the
// other one on the floor:
if (match('\'', '\''))
consume();
consume(&current_token);
}
if (is_eof())
is_valid_string_literal = false;
// Drop the closing quote on the floor:
consume();
return is_valid_string_literal;
}
bool Lexer::consume_blob_literal()
bool Lexer::consume_quoted_identifier(StringBuilder& current_token)
{
// https://sqlite.org/lang_expr.html - See "3. Literal Values (Constants)"
// I have not found a reference to the syntax for identifiers in the
// SQLite documentation, but PostgreSQL has this:
// https://www.postgresql.org/docs/current/sql-syntax-lexical.html#SQL-SYNTAX-IDENTIFIERS
bool is_valid_identifier = true;
// Skip the opening double quote:
consume();
return consume_string_literal();
while (!is_eof() && !is_quoted_identifier_end()) {
// If both the current character and the next one are double quotes,
// consume one double quote into the current token, and drop the
// other one on the floor:
if (match('"', '"'))
consume();
consume(&current_token);
}
if (is_eof())
is_valid_identifier = false;
// Drop the closing double quote on the floor:
consume();
return is_valid_identifier;
}
bool Lexer::consume_exponent()
bool Lexer::consume_blob_literal(StringBuilder& current_token)
{
// https://sqlite.org/lang_expr.html - See "3. Literal Values (Constants)"
// Skip starting 'X'/'x' character:
consume();
if (m_current_char == '-' || m_current_char == '+')
consume();
if (!isdigit(m_current_char))
if (!consume_string_literal(current_token))
return false;
while (isdigit(m_current_char)) {
consume();
for (auto ix = 0u; ix < current_token.length(); ix++) {
if (!isxdigit(current_token.string_view()[ix]))
return false;
}
return true;
}
bool Lexer::consume_hexadecimal_number()
bool Lexer::consume_exponent(StringBuilder& current_token)
{
consume();
consume(&current_token);
if (m_current_char == '-' || m_current_char == '+')
consume(&current_token);
if (!isdigit(m_current_char))
return false;
// FIXME This code results in the string "1e" being rejected as a
// malformed numeric literal. We do however accept "1a" which
// is inconsistent. We have to decide what we want to do:
// - Be like `SQLite` and reject both "1a" and "1e" because we
// require a space between the two tokens. This is pretty invasive;
// we would have to decide where all spaces are required and fix
// the lexer accordingly.
// - Be like `PostgreSQL` and accept both "1e" and "1a" as two
// separate tokens, and accept "1e3" as a single token. This
// would require pushing back the "e" we lexed here, terminate the
// numeric literal, and re-process the "e" as the first char of
// a new token.
while (isdigit(m_current_char)) {
consume(&current_token);
}
return true;
}
bool Lexer::consume_hexadecimal_number(StringBuilder& current_token)
{
consume(&current_token);
if (!isxdigit(m_current_char))
return false;
while (isxdigit(m_current_char))
consume();
consume(&current_token);
return true;
}
@ -299,6 +367,16 @@ bool Lexer::is_string_literal_end() const
return m_current_char == '\'' && !(m_position < m_source.length() && m_source[m_position] == '\'');
}
bool Lexer::is_quoted_identifier_start() const
{
return m_current_char == '"';
}
bool Lexer::is_quoted_identifier_end() const
{
return m_current_char == '"' && !(m_position < m_source.length() && m_source[m_position] == '"');
}
bool Lexer::is_blob_literal_start() const
{
return match('x', '\'') || match('X', '\'');

View file

@ -1,5 +1,6 @@
/*
* Copyright (c) 2021, Tim Flynn <trflynn89@pm.me>
* Copyright (c) 2021, Jan de Visser <jan@de-visser.net>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
@ -20,14 +21,15 @@ public:
Token next();
private:
void consume();
void consume(StringBuilder* = nullptr);
bool consume_whitespace_and_comments();
bool consume_numeric_literal();
bool consume_string_literal();
bool consume_blob_literal();
bool consume_exponent();
bool consume_hexadecimal_number();
bool consume_numeric_literal(StringBuilder&);
bool consume_string_literal(StringBuilder&);
bool consume_quoted_identifier(StringBuilder&);
bool consume_blob_literal(StringBuilder&);
bool consume_exponent(StringBuilder&);
bool consume_hexadecimal_number(StringBuilder&);
bool match(char a, char b) const;
bool is_identifier_start() const;
@ -35,6 +37,8 @@ private:
bool is_numeric_literal_start() const;
bool is_string_literal_start() const;
bool is_string_literal_end() const;
bool is_quoted_identifier_start() const;
bool is_quoted_identifier_end() const;
bool is_blob_literal_start() const;
bool is_line_comment_start() const;
bool is_block_comment_start() const;

View file

@ -1075,12 +1075,9 @@ void Parser::syntax_error(String message)
m_parser_state.m_errors.append({ move(message), position() });
}
Parser::Position Parser::position() const
SourcePosition Parser::position() const
{
return {
m_parser_state.m_token.line_number(),
m_parser_state.m_token.line_column()
};
return m_parser_state.m_token.start_position();
}
Parser::ParserState::ParserState(Lexer lexer)

View file

@ -21,14 +21,9 @@ constexpr size_t maximum_subquery_depth = 100;
}
class Parser {
struct Position {
size_t line { 0 };
size_t column { 0 };
};
struct Error {
String message;
Position position;
SourcePosition position;
String to_string() const
{
@ -126,7 +121,7 @@ private:
void expected(StringView what);
void syntax_error(String message);
Position position() const;
SourcePosition position() const;
ParserState m_parser_state;
};

View file

@ -47,22 +47,12 @@ void SyntaxHighlighter::rehighlight(Palette const& palette)
Vector<GUI::TextDocumentSpan> spans;
auto append_token = [&](StringView str, Token const& token) {
if (str.is_empty())
auto append_token = [&](Token const& token) {
if (token.value().is_empty())
return;
GUI::TextPosition position { token.line_number() - 1, token.line_column() - 1 };
for (char c : str) {
if (c == '\n') {
position.set_line(position.line() + 1);
position.set_column(0);
} else
position.set_column(position.column() + 1);
}
GUI::TextDocumentSpan span;
span.range.set_start({ token.line_number() - 1, token.line_column() - 1 });
span.range.set_end({ position.line(), position.column() });
span.range.set_start({ token.start_position().line - 1, token.start_position().column - 1 });
span.range.set_end({ token.end_position().line - 1, token.end_position().column - 1 });
auto style = style_for_token_type(palette, token.type());
span.attributes.color = style.color;
span.attributes.bold = style.bold;
@ -78,7 +68,7 @@ void SyntaxHighlighter::rehighlight(Palette const& palette)
for (;;) {
auto token = lexer.next();
append_token(token.value(), token);
append_token(token);
if (token.type() == TokenType::Eof)
break;
}

View file

@ -1,5 +1,6 @@
/*
* Copyright (c) 2021, Tim Flynn <trflynn89@pm.me>
* Copyright (c) 2021, Jan de Visser <jan@de-visser.net>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
@ -7,6 +8,7 @@
#pragma once
#include <AK/HashMap.h>
#include <AK/String.h>
#include <AK/StringView.h>
namespace SQL::AST {
@ -209,13 +211,18 @@ enum class TokenCategory {
Punctuation,
};
struct SourcePosition {
size_t line { 0 };
size_t column { 0 };
};
class Token {
public:
Token(TokenType type, StringView value, size_t line_number, size_t line_column)
Token(TokenType type, String value, SourcePosition start_position, SourcePosition end_position)
: m_type(type)
, m_value(value)
, m_line_number(line_number)
, m_line_column(line_column)
, m_value(move(value))
, m_start_position(start_position)
, m_end_position(end_position)
{
}
@ -226,17 +233,17 @@ public:
TokenType type() const { return m_type; }
TokenCategory category() const { return category(m_type); }
StringView value() const { return m_value; }
String const& value() const { return m_value; }
double double_value() const;
size_t line_number() const { return m_line_number; }
size_t line_column() const { return m_line_column; }
SourcePosition const& start_position() const { return m_start_position; }
SourcePosition const& end_position() const { return m_end_position; }
private:
TokenType m_type;
StringView m_value;
size_t m_line_number;
size_t m_line_column;
String m_value;
SourcePosition m_start_position;
SourcePosition m_end_position;
};
}