mirror of
				https://github.com/RGBCube/serenity
				synced 2025-10-31 09:52:44 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			225 lines
		
	
	
	
		
			8.2 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
			
		
		
	
	
			225 lines
		
	
	
	
		
			8.2 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
| /*
 | |
|  * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
 | |
|  * All rights reserved.
 | |
|  *
 | |
|  * Redistribution and use in source and binary forms, with or without
 | |
|  * modification, are permitted provided that the following conditions are met:
 | |
|  *
 | |
|  * 1. Redistributions of source code must retain the above copyright notice, this
 | |
|  *    list of conditions and the following disclaimer.
 | |
|  *
 | |
|  * 2. Redistributions in binary form must reproduce the above copyright notice,
 | |
|  *    this list of conditions and the following disclaimer in the documentation
 | |
|  *    and/or other materials provided with the distribution.
 | |
|  *
 | |
|  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 | |
|  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 | |
|  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 | |
|  * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 | |
|  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 | |
|  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 | |
|  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 | |
|  * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 | |
|  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 | |
|  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 | |
|  */
 | |
| 
 | |
| #pragma once
 | |
| 
 | |
| #include "RegexByteCode.h"
 | |
| #include "RegexError.h"
 | |
| #include "RegexLexer.h"
 | |
| #include "RegexOptions.h"
 | |
| 
 | |
| #include <AK/Forward.h>
 | |
| #include <AK/StringBuilder.h>
 | |
| #include <AK/Types.h>
 | |
| #include <AK/Vector.h>
 | |
| 
 | |
| namespace regex {
 | |
| 
 | |
| class PosixExtendedParser;
 | |
| class ECMA262Parser;
 | |
| 
 | |
| template<typename T>
 | |
| struct GenericParserTraits {
 | |
|     using OptionsType = T;
 | |
| };
 | |
| 
 | |
| template<typename T>
 | |
| struct ParserTraits : public GenericParserTraits<T> {
 | |
| };
 | |
| 
 | |
| template<>
 | |
| struct ParserTraits<PosixExtendedParser> : public GenericParserTraits<PosixOptions> {
 | |
| };
 | |
| 
 | |
| template<>
 | |
| struct ParserTraits<ECMA262Parser> : public GenericParserTraits<ECMAScriptOptions> {
 | |
| };
 | |
| 
 | |
| class Parser {
 | |
| public:
 | |
|     struct Result {
 | |
|         ByteCode bytecode;
 | |
|         size_t capture_groups_count;
 | |
|         size_t named_capture_groups_count;
 | |
|         size_t match_length_minimum;
 | |
|         Error error;
 | |
|         Token error_token;
 | |
|     };
 | |
| 
 | |
|     explicit Parser(Lexer& lexer)
 | |
|         : m_parser_state(lexer)
 | |
|     {
 | |
|     }
 | |
| 
 | |
|     Parser(Lexer& lexer, AllOptions regex_options)
 | |
|         : m_parser_state(lexer, regex_options)
 | |
|     {
 | |
|     }
 | |
| 
 | |
|     virtual ~Parser() = default;
 | |
| 
 | |
|     Result parse(Optional<AllOptions> regex_options = {});
 | |
|     bool has_error() const { return m_parser_state.error != Error::NoError; }
 | |
|     Error error() const { return m_parser_state.error; }
 | |
| 
 | |
| protected:
 | |
|     virtual bool parse_internal(ByteCode&, size_t& match_length_minimum) = 0;
 | |
| 
 | |
|     ALWAYS_INLINE bool match(TokenType type) const;
 | |
|     ALWAYS_INLINE bool match(char ch) const;
 | |
|     ALWAYS_INLINE bool match_ordinary_characters();
 | |
|     ALWAYS_INLINE Token consume();
 | |
|     ALWAYS_INLINE Token consume(TokenType type, Error error);
 | |
|     ALWAYS_INLINE bool consume(const String&);
 | |
|     ALWAYS_INLINE bool try_skip(StringView);
 | |
|     ALWAYS_INLINE bool lookahead_any(StringView);
 | |
|     ALWAYS_INLINE char skip();
 | |
|     ALWAYS_INLINE void back(size_t = 1);
 | |
|     ALWAYS_INLINE void reset();
 | |
|     ALWAYS_INLINE bool done() const;
 | |
|     ALWAYS_INLINE bool set_error(Error error);
 | |
| 
 | |
|     struct ParserState {
 | |
|         Lexer& lexer;
 | |
|         Token current_token;
 | |
|         Error error = Error::NoError;
 | |
|         Token error_token { TokenType::Eof, 0, StringView(nullptr) };
 | |
|         ByteCode bytecode;
 | |
|         size_t capture_groups_count { 0 };
 | |
|         size_t named_capture_groups_count { 0 };
 | |
|         size_t match_length_minimum { 0 };
 | |
|         AllOptions regex_options;
 | |
|         HashMap<int, size_t> capture_group_minimum_lengths;
 | |
|         HashMap<FlyString, size_t> named_capture_group_minimum_lengths;
 | |
|         HashMap<size_t, FlyString> named_capture_groups;
 | |
| 
 | |
|         explicit ParserState(Lexer& lexer)
 | |
|             : lexer(lexer)
 | |
|             , current_token(lexer.next())
 | |
|         {
 | |
|         }
 | |
|         explicit ParserState(Lexer& lexer, AllOptions regex_options)
 | |
|             : lexer(lexer)
 | |
|             , current_token(lexer.next())
 | |
|             , regex_options(regex_options)
 | |
|         {
 | |
|         }
 | |
|     };
 | |
| 
 | |
|     ParserState m_parser_state;
 | |
| };
 | |
| 
 | |
| class PosixExtendedParser final : public Parser {
 | |
| public:
 | |
|     explicit PosixExtendedParser(Lexer& lexer)
 | |
|         : Parser(lexer)
 | |
|     {
 | |
|     }
 | |
| 
 | |
|     PosixExtendedParser(Lexer& lexer, Optional<typename ParserTraits<PosixExtendedParser>::OptionsType> regex_options)
 | |
|         : Parser(lexer, regex_options.value_or({}))
 | |
|     {
 | |
|     }
 | |
| 
 | |
|     ~PosixExtendedParser() = default;
 | |
| 
 | |
| private:
 | |
|     ALWAYS_INLINE bool match_repetition_symbol();
 | |
| 
 | |
|     bool parse_internal(ByteCode&, size_t&) override;
 | |
| 
 | |
|     bool parse_root(ByteCode&, size_t&);
 | |
|     ALWAYS_INLINE bool parse_sub_expression(ByteCode&, size_t&);
 | |
|     ALWAYS_INLINE bool parse_bracket_expression(ByteCode&, size_t&);
 | |
|     ALWAYS_INLINE bool parse_repetition_symbol(ByteCode&, size_t&);
 | |
| };
 | |
| 
 | |
| class ECMA262Parser final : public Parser {
 | |
| public:
 | |
|     explicit ECMA262Parser(Lexer& lexer)
 | |
|         : Parser(lexer)
 | |
|     {
 | |
|     }
 | |
| 
 | |
|     ECMA262Parser(Lexer& lexer, Optional<typename ParserTraits<ECMA262Parser>::OptionsType> regex_options)
 | |
|         : Parser(lexer, regex_options.value_or({}))
 | |
|     {
 | |
|         m_should_use_browser_extended_grammar = regex_options.has_value() && regex_options->has_flag_set(ECMAScriptFlags::BrowserExtended);
 | |
|     }
 | |
| 
 | |
|     ~ECMA262Parser() = default;
 | |
| 
 | |
| private:
 | |
|     bool parse_internal(ByteCode&, size_t&) override;
 | |
| 
 | |
|     enum class ReadDigitsInitialZeroState {
 | |
|         Allow,
 | |
|         Disallow,
 | |
|     };
 | |
|     StringView read_digits_as_string(ReadDigitsInitialZeroState initial_zero = ReadDigitsInitialZeroState::Allow, bool hex = false, int max_count = -1);
 | |
|     Optional<unsigned> read_digits(ReadDigitsInitialZeroState initial_zero = ReadDigitsInitialZeroState::Allow, bool hex = false, int max_count = -1);
 | |
|     StringView read_capture_group_specifier(bool take_starting_angle_bracket = false);
 | |
| 
 | |
|     bool parse_pattern(ByteCode&, size_t&, bool unicode, bool named);
 | |
|     bool parse_disjunction(ByteCode&, size_t&, bool unicode, bool named);
 | |
|     bool parse_alternative(ByteCode&, size_t&, bool unicode, bool named);
 | |
|     bool parse_term(ByteCode&, size_t&, bool unicode, bool named);
 | |
|     bool parse_assertion(ByteCode&, size_t&, bool unicode, bool named);
 | |
|     bool parse_atom(ByteCode&, size_t&, bool unicode, bool named);
 | |
|     bool parse_quantifier(ByteCode&, size_t&, bool unicode, bool named);
 | |
|     bool parse_atom_escape(ByteCode&, size_t&, bool unicode, bool named);
 | |
|     bool parse_character_class(ByteCode&, size_t&, bool unicode, bool named);
 | |
|     bool parse_capture_group(ByteCode&, size_t&, bool unicode, bool named);
 | |
|     Optional<CharClass> parse_character_class_escape(bool& out_inverse, bool expect_backslash = false);
 | |
|     bool parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>&, bool unicode);
 | |
| 
 | |
|     // Used only by B.1.4, Regular Expression Patterns (Extended for use in browsers)
 | |
|     bool parse_quantifiable_assertion(ByteCode&, size_t&, bool named);
 | |
|     bool parse_extended_atom(ByteCode&, size_t&, bool named);
 | |
|     bool parse_inner_disjunction(ByteCode& bytecode_stack, size_t& length, bool unicode, bool named);
 | |
|     bool parse_invalid_braced_quantifier(); // Note: This function either parses and *fails*, or doesn't parse anything and returns false.
 | |
|     bool parse_legacy_octal_escape_sequence(ByteCode& bytecode_stack, size_t& length);
 | |
|     Optional<u8> parse_legacy_octal_escape();
 | |
| 
 | |
|     size_t ensure_total_number_of_capturing_parenthesis();
 | |
| 
 | |
|     // ECMA-262's flavour of regex is a bit weird in that it allows backrefs to reference "future" captures, and such backrefs
 | |
|     // always match the empty string. So we have to know how many capturing parenthesis there are, but we don't want to always
 | |
|     // parse it twice, so we'll just do so when it's actually needed.
 | |
|     // Most patterns should have no need to ever populate this field.
 | |
|     Optional<size_t> m_total_number_of_capturing_parenthesis;
 | |
| 
 | |
|     // Keep the Annex B. behaviour behind a flag, the users can enable it by passing the `ECMAScriptFlags::BrowserExtended` flag.
 | |
|     bool m_should_use_browser_extended_grammar { false };
 | |
| };
 | |
| 
 | |
| using PosixExtended = PosixExtendedParser;
 | |
| using ECMA262 = ECMA262Parser;
 | |
| 
 | |
| }
 | |
| 
 | |
| using regex::ECMA262;
 | |
| using regex::PosixExtended;
 | 
