mirror of
				https://github.com/RGBCube/serenity
				synced 2025-10-31 14:32:46 +00:00 
			
		
		
		
	JSSpecCompiler: Add functions for splitting node contents into tokens
This commit is contained in:
		
							parent
							
								
									8342361481
								
							
						
					
					
						commit
						9f29e04897
					
				
					 6 changed files with 378 additions and 0 deletions
				
			
		
							
								
								
									
										157
									
								
								Meta/Lagom/Tools/CodeGenerators/JSSpecCompiler/Parser/Lexer.cpp
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										157
									
								
								Meta/Lagom/Tools/CodeGenerators/JSSpecCompiler/Parser/Lexer.cpp
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,157 @@ | |||
| /*
 | ||||
|  * Copyright (c) 2023, Dan Klishch <danilklishch@gmail.com> | ||||
|  * | ||||
|  * SPDX-License-Identifier: BSD-2-Clause | ||||
|  */ | ||||
| 
 | ||||
| #include <AK/GenericLexer.h> | ||||
| #include <AK/NonnullOwnPtr.h> | ||||
| 
 | ||||
| #include "Parser/Lexer.h" | ||||
| #include "Parser/XMLUtils.h" | ||||
| 
 | ||||
| namespace JSSpecCompiler { | ||||
| 
 | ||||
| namespace { | ||||
| Optional<Token> consume_number(GenericLexer& lexer, XML::Node const* node) | ||||
| { | ||||
|     u64 start = lexer.tell(); | ||||
| 
 | ||||
|     if (lexer.next_is('-')) | ||||
|         lexer.consume(1); | ||||
| 
 | ||||
|     if (!lexer.next_is(is_ascii_digit)) { | ||||
|         lexer.retreat(lexer.tell() - start); | ||||
|         return {}; | ||||
|     } | ||||
| 
 | ||||
|     lexer.consume_while(is_ascii_digit); | ||||
| 
 | ||||
|     if (lexer.next_is('.')) { | ||||
|         lexer.consume(1); | ||||
|         if (lexer.consume_while(is_ascii_digit).length() == 0) | ||||
|             lexer.retreat(1); | ||||
|     } | ||||
| 
 | ||||
|     auto length = lexer.tell() - start; | ||||
|     lexer.retreat(length); | ||||
|     return { Token { TokenType::Number, lexer.consume(length), node } }; | ||||
| } | ||||
| 
 | ||||
| bool can_end_word_token(char c) | ||||
| { | ||||
|     return is_ascii_space(c) || ".,"sv.contains(c); | ||||
| } | ||||
| } | ||||
| 
 | ||||
| ParseErrorOr<void> tokenize_string(XML::Node const* node, StringView view, Vector<Token>& tokens) | ||||
| { | ||||
| #define CONSUME_IF_NEXT(view, type)                                      \ | ||||
|     if (lexer.next_is(view##sv)) {                                       \ | ||||
|         size_t length = __builtin_strlen(view);                          \ | ||||
|         tokens.append({ TokenType::type, lexer.consume(length), node }); \ | ||||
|         continue;                                                        \ | ||||
|     } | ||||
| 
 | ||||
|     GenericLexer lexer(view); | ||||
|     while (!lexer.is_eof()) { | ||||
|         lexer.ignore_while(is_ascii_space); | ||||
| 
 | ||||
|         if (auto result = consume_number(lexer, node); result.has_value()) { | ||||
|             tokens.append(result.release_value()); | ||||
|             continue; | ||||
|         } | ||||
| 
 | ||||
|         CONSUME_IF_NEXT("(", ParenOpen); | ||||
|         CONSUME_IF_NEXT(")", ParenClose); | ||||
|         CONSUME_IF_NEXT("{", BraceOpen); | ||||
|         CONSUME_IF_NEXT("}", BraceClose); | ||||
|         CONSUME_IF_NEXT(",", Comma); | ||||
|         CONSUME_IF_NEXT(". ", Dot); | ||||
|         CONSUME_IF_NEXT(".\n", Dot); | ||||
|         CONSUME_IF_NEXT(":", Colon); | ||||
|         CONSUME_IF_NEXT(".", MemberAccess); | ||||
|         CONSUME_IF_NEXT("<", Less); | ||||
|         CONSUME_IF_NEXT(">", Greater); | ||||
|         CONSUME_IF_NEXT("is not equal to", NotEquals); | ||||
|         CONSUME_IF_NEXT("≠", NotEquals); | ||||
|         CONSUME_IF_NEXT("is equal to", Equals); | ||||
|         CONSUME_IF_NEXT("=", Equals); | ||||
|         CONSUME_IF_NEXT("+", Plus); | ||||
|         CONSUME_IF_NEXT("-", AmbiguousMinus); | ||||
|         CONSUME_IF_NEXT("×", Multiplication); | ||||
|         CONSUME_IF_NEXT("/", Division); | ||||
|         CONSUME_IF_NEXT("!", ExclamationMark); | ||||
|         CONSUME_IF_NEXT("is", Is); | ||||
| 
 | ||||
|         StringView word = lexer.consume_until(can_end_word_token); | ||||
|         if (word.length()) | ||||
|             tokens.append({ TokenType::Word, word, node }); | ||||
|     } | ||||
|     return {}; | ||||
| 
 | ||||
| #undef CONSUME_IF_NEXT | ||||
| } | ||||
| 
 | ||||
| ParseErrorOr<TokenizeTreeResult> tokenize_tree(XML::Node const* node, bool allow_substeps) | ||||
| { | ||||
|     TokenizeTreeResult result; | ||||
|     auto& tokens = result.tokens; | ||||
| 
 | ||||
|     for (auto const& child : node->as_element().children) { | ||||
|         TRY(child->content.visit( | ||||
|             [&](XML::Node::Element const& element) -> ParseErrorOr<void> { | ||||
|                 if (result.substeps != nullptr) | ||||
|                     return ParseError::create("Substeps list must be the last non-empty child"sv, child); | ||||
| 
 | ||||
|                 if (element.name == tag_var) { | ||||
|                     tokens.append({ TokenType::Identifier, TRY(get_text_contents(child)), child }); | ||||
|                     return {}; | ||||
|                 } | ||||
| 
 | ||||
|                 if (element.name == tag_span) { | ||||
|                     auto element_class = TRY(get_attribute_by_name(child, attribute_class)); | ||||
|                     if (element_class != class_secnum) | ||||
|                         return ParseError::create(String::formatted("Expected 'secnum' as a class name of <span>, but found '{}'", element_class), child); | ||||
|                     tokens.append({ TokenType::SectionNumber, TRY(get_text_contents(child)), child }); | ||||
|                     return {}; | ||||
|                 } | ||||
| 
 | ||||
|                 if (element.name == tag_emu_val) { | ||||
|                     auto contents = TRY(get_text_contents(child)); | ||||
|                     if (contents.length() >= 2 && contents.starts_with('"') && contents.ends_with('"')) | ||||
|                         tokens.append({ TokenType::String, contents.substring_view(1, contents.length() - 2), child }); | ||||
|                     else if (contents == "undefined") | ||||
|                         tokens.append({ TokenType::Undefined, contents, child }); | ||||
|                     else | ||||
|                         tokens.append({ TokenType::Identifier, contents, child }); | ||||
|                     return {}; | ||||
|                 } | ||||
| 
 | ||||
|                 if (element.name == tag_emu_xref) { | ||||
|                     auto contents = TRY(get_text_contents(TRY(get_only_child(child, "a"sv)))); | ||||
|                     tokens.append({ TokenType::Identifier, contents, child }); | ||||
|                     return {}; | ||||
|                 } | ||||
| 
 | ||||
|                 if (element.name == tag_ol) { | ||||
|                     if (!allow_substeps) | ||||
|                         return ParseError::create("Found nested list but substeps are not allowed"sv, child); | ||||
|                     result.substeps = child; | ||||
|                     return {}; | ||||
|                 } | ||||
| 
 | ||||
|                 return ParseError::create(String::formatted("Unexpected child element with tag {}", element.name), child); | ||||
|             }, | ||||
|             [&](XML::Node::Text const& text) -> ParseErrorOr<void> { | ||||
|                 auto view = text.builder.string_view(); | ||||
|                 if (result.substeps && !contains_empty_text(child)) | ||||
|                     return ParseError::create("Substeps list must be the last non-empty child"sv, child); | ||||
|                 return tokenize_string(child, view, tokens); | ||||
|             }, | ||||
|             move(ignore_comments))); | ||||
|     } | ||||
|     return result; | ||||
| } | ||||
| 
 | ||||
| } | ||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Dan Klishch
						Dan Klishch