1
Fork 0
mirror of https://github.com/RGBCube/serenity synced 2025-05-31 07:58:11 +00:00

JSSpecCompiler+LibXML: Store location for tokens

This commit is contained in:
Dan Klishch 2024-01-16 20:47:08 -05:00 committed by Andrew Kaster
parent d219c91ca9
commit dee4978d67
7 changed files with 78 additions and 50 deletions

View file

@@ -4,16 +4,17 @@
* SPDX-License-Identifier: BSD-2-Clause
*/
#include <AK/GenericLexer.h>
#include <AK/NonnullOwnPtr.h>
#include <LibXML/Parser/Parser.h>
#include "Parser/Lexer.h"
#include "Parser/SpecParser.h"
#include "Parser/XMLUtils.h"
namespace JSSpecCompiler {
namespace {
Optional<Token> consume_number(GenericLexer& lexer, XML::Node const* node)
Optional<Token> consume_number(XML::LineTrackingLexer& lexer, XML::Node const* node, Location& location)
{
u64 start = lexer.tell();
@@ -35,7 +36,7 @@ Optional<Token> consume_number(GenericLexer& lexer, XML::Node const* node)
auto length = lexer.tell() - start;
lexer.retreat(length);
return { Token { TokenType::Number, lexer.consume(length), node } };
return { Token { TokenType::Number, lexer.consume(length), node, move(location) } };
}
bool can_end_word_token(char c)
@@ -44,56 +45,68 @@ bool can_end_word_token(char c)
}
}
ParseErrorOr<void> tokenize_string(XML::Node const* node, StringView view, Vector<Token>& tokens)
ParseErrorOr<void> tokenize_string(SpecificationParsingContext& ctx, XML::Node const* node, StringView view, Vector<Token>& tokens)
{
#define CONSUME_IF_NEXT(view, type) \
if (lexer.next_is(view##sv)) { \
size_t length = __builtin_strlen(view); \
tokens.append({ TokenType::type, lexer.consume(length), node }); \
continue; \
}
static constexpr struct {
StringView text_to_match;
TokenType token_type;
} choices[] = {
{ "-"sv, TokenType::AmbiguousMinus },
{ "}"sv, TokenType::BraceClose },
{ "{"sv, TokenType::BraceOpen },
{ ":"sv, TokenType::Colon },
{ ","sv, TokenType::Comma },
{ "/"sv, TokenType::Division },
{ ". "sv, TokenType::Dot },
{ ".\n"sv, TokenType::Dot },
{ "="sv, TokenType::Equals },
{ "is equal to"sv, TokenType::Equals },
{ "!"sv, TokenType::ExclamationMark },
{ ">"sv, TokenType::Greater },
{ "is"sv, TokenType::Is },
{ "<"sv, TokenType::Less },
{ "."sv, TokenType::MemberAccess },
{ "×"sv, TokenType::Multiplication },
{ "is not equal to"sv, TokenType::NotEquals },
{ "≠"sv, TokenType::NotEquals },
{ ")"sv, TokenType::ParenClose },
{ "("sv, TokenType::ParenOpen },
{ "+"sv, TokenType::Plus },
};
XML::LineTrackingLexer lexer(view, node->offset);
GenericLexer lexer(view);
while (!lexer.is_eof()) {
lexer.ignore_while(is_ascii_space);
if (auto result = consume_number(lexer, node); result.has_value()) {
// FIXME: This is incorrect since we count text offset after XML reference resolution. To do
// this properly, we need support from XML::Parser.
Location token_location = ctx.location_from_xml_offset(lexer.offset_for(lexer.tell()));
if (auto result = consume_number(lexer, node, token_location); result.has_value()) {
tokens.append(result.release_value());
continue;
}
CONSUME_IF_NEXT("(", ParenOpen);
CONSUME_IF_NEXT(")", ParenClose);
CONSUME_IF_NEXT("{", BraceOpen);
CONSUME_IF_NEXT("}", BraceClose);
CONSUME_IF_NEXT(",", Comma);
CONSUME_IF_NEXT(". ", Dot);
CONSUME_IF_NEXT(".\n", Dot);
CONSUME_IF_NEXT(":", Colon);
CONSUME_IF_NEXT(".", MemberAccess);
CONSUME_IF_NEXT("<", Less);
CONSUME_IF_NEXT(">", Greater);
CONSUME_IF_NEXT("is not equal to", NotEquals);
CONSUME_IF_NEXT("≠", NotEquals);
CONSUME_IF_NEXT("is equal to", Equals);
CONSUME_IF_NEXT("=", Equals);
CONSUME_IF_NEXT("+", Plus);
CONSUME_IF_NEXT("-", AmbiguousMinus);
CONSUME_IF_NEXT("×", Multiplication);
CONSUME_IF_NEXT("/", Division);
CONSUME_IF_NEXT("!", ExclamationMark);
CONSUME_IF_NEXT("is", Is);
bool matched = false;
for (auto const& [text_to_match, token_type] : choices) {
if (lexer.consume_specific(text_to_match)) {
tokens.append({ token_type, ""sv, node, move(token_location) });
matched = true;
break;
}
}
if (matched)
continue;
StringView word = lexer.consume_until(can_end_word_token);
if (word.length())
tokens.append({ TokenType::Word, word, node });
tokens.append({ TokenType::Word, word, node, move(token_location) });
}
return {};
#undef CONSUME_IF_NEXT
}
ParseErrorOr<TokenizeTreeResult> tokenize_tree(XML::Node const* node, bool allow_substeps)
ParseErrorOr<TokenizeTreeResult> tokenize_tree(SpecificationParsingContext& ctx, XML::Node const* node, bool allow_substeps)
{
TokenizeTreeResult result;
auto& tokens = result.tokens;
@@ -104,8 +117,10 @@ ParseErrorOr<TokenizeTreeResult> tokenize_tree(XML::Node const* node, bool allow
if (result.substeps != nullptr)
return ParseError::create("Substeps list must be the last non-empty child"sv, child);
Location child_location = ctx.location_from_xml_offset(child->offset);
if (element.name == tag_var) {
tokens.append({ TokenType::Identifier, TRY(get_text_contents(child)), child });
tokens.append({ TokenType::Identifier, TRY(get_text_contents(child)), child, move(child_location) });
return {};
}
@@ -113,24 +128,24 @@ ParseErrorOr<TokenizeTreeResult> tokenize_tree(XML::Node const* node, bool allow
auto element_class = TRY(deprecated_get_attribute_by_name(child, attribute_class));
if (element_class != class_secnum)
return ParseError::create(String::formatted("Expected 'secnum' as a class name of <span>, but found '{}'", element_class), child);
tokens.append({ TokenType::SectionNumber, TRY(get_text_contents(child)), child });
tokens.append({ TokenType::SectionNumber, TRY(get_text_contents(child)), child, move(child_location) });
return {};
}
if (element.name == tag_emu_val) {
auto contents = TRY(get_text_contents(child));
if (contents.length() >= 2 && contents.starts_with('"') && contents.ends_with('"'))
tokens.append({ TokenType::String, contents.substring_view(1, contents.length() - 2), child });
tokens.append({ TokenType::String, contents.substring_view(1, contents.length() - 2), child, move(child_location) });
else if (contents == "undefined")
tokens.append({ TokenType::Undefined, contents, child });
tokens.append({ TokenType::Undefined, contents, child, move(child_location) });
else
tokens.append({ TokenType::Identifier, contents, child });
tokens.append({ TokenType::Identifier, contents, child, move(child_location) });
return {};
}
if (element.name == tag_emu_xref) {
auto contents = TRY(get_text_contents(TRY(get_only_child(child, "a"sv))));
tokens.append({ TokenType::Identifier, contents, child });
tokens.append({ TokenType::Identifier, contents, child, move(child_location) });
return {};
}
@@ -147,7 +162,7 @@ ParseErrorOr<TokenizeTreeResult> tokenize_tree(XML::Node const* node, bool allow
auto view = text.builder.string_view();
if (result.substeps && !contains_empty_text(child))
return ParseError::create("Substeps list must be the last non-empty child"sv, child);
return tokenize_string(child, view, tokens);
return tokenize_string(ctx, child, view, tokens);
},
move(ignore_comments)));
}