diff --git a/Libraries/LibRegex/Forward.h b/Libraries/LibRegex/Forward.h index 10203513a4..7d0397a843 100644 --- a/Libraries/LibRegex/Forward.h +++ b/Libraries/LibRegex/Forward.h @@ -32,6 +32,7 @@ namespace regex { enum class Error : u8; class Lexer; class PosixExtendedParser; +class ECMA262Parser; class ByteCode; class OpCode; @@ -50,6 +51,7 @@ class OpCode_Compare; class RegexStringView; } +using regex::ECMA262Parser; using regex::Error; using regex::Lexer; using regex::PosixExtendedParser; diff --git a/Libraries/LibRegex/RegexByteCode.cpp b/Libraries/LibRegex/RegexByteCode.cpp index a68ee2d728..2887a5eee8 100644 --- a/Libraries/LibRegex/RegexByteCode.cpp +++ b/Libraries/LibRegex/RegexByteCode.cpp @@ -65,6 +65,20 @@ const char* execution_result_name(ExecutionResult result) } } +const char* boundary_check_type_name(BoundaryCheckType ty) +{ + switch (ty) { +#define __ENUMERATE_BOUNDARY_CHECK_TYPE(x) \ + case BoundaryCheckType::x: \ + return #x; + ENUMERATE_BOUNDARY_CHECK_TYPES +#undef __ENUMERATE_BOUNDARY_CHECK_TYPE + default: + ASSERT_NOT_REACHED(); + return ""; + } +} + const char* character_compare_type_name(CharacterCompareType ch_compare_type) { switch (ch_compare_type) { @@ -112,12 +126,27 @@ ALWAYS_INLINE OpCode* ByteCode::get_opcode_by_id(OpCodeId id) const case OpCodeId::CheckEnd: s_opcodes.set(i, make(*const_cast(this))); break; + case OpCodeId::CheckBoundary: + s_opcodes.set(i, make(*const_cast(this))); + break; case OpCodeId::ForkJump: s_opcodes.set(i, make(*const_cast(this))); break; case OpCodeId::ForkStay: s_opcodes.set(i, make(*const_cast(this))); break; + case OpCodeId::FailForks: + s_opcodes.set(i, make(*const_cast(this))); + break; + case OpCodeId::Save: + s_opcodes.set(i, make(*const_cast(this))); + break; + case OpCodeId::Restore: + s_opcodes.set(i, make(*const_cast(this))); + break; + case OpCodeId::GoBack: + s_opcodes.set(i, make(*const_cast(this))); + break; case OpCodeId::CheckBegin: s_opcodes.set(i, make(*const_cast(this))); break; @@ 
-166,6 +195,38 @@ ALWAYS_INLINE ExecutionResult OpCode_Exit::execute(const MatchInput& input, Matc return ExecutionResult::Failed; } +ALWAYS_INLINE ExecutionResult OpCode_Save::execute(const MatchInput& input, MatchState& state, MatchOutput&) const +{ + input.saved_positions.append(state.string_position); + return ExecutionResult::Continue; +} + +ALWAYS_INLINE ExecutionResult OpCode_Restore::execute(const MatchInput& input, MatchState& state, MatchOutput&) const +{ + if (input.saved_positions.is_empty()) + return ExecutionResult::Failed; + + state.string_position = input.saved_positions.take_last(); + return ExecutionResult::Continue; +} + +ALWAYS_INLINE ExecutionResult OpCode_GoBack::execute(const MatchInput&, MatchState& state, MatchOutput&) const +{ + if (count() > state.string_position) + return ExecutionResult::Failed_ExecuteLowPrioForks; + + state.string_position -= count(); + return ExecutionResult::Continue; +} + +ALWAYS_INLINE ExecutionResult OpCode_FailForks::execute(const MatchInput& input, MatchState&, MatchOutput&) const +{ + ASSERT(count() > 0); + + input.fail_counter += count() - 1; + return ExecutionResult::Failed_ExecuteLowPrioForks; +} + ALWAYS_INLINE ExecutionResult OpCode_Jump::execute(const MatchInput&, MatchState& state, MatchOutput&) const { @@ -198,6 +259,40 @@ ALWAYS_INLINE ExecutionResult OpCode_CheckBegin::execute(const MatchInput& input return ExecutionResult::Failed_ExecuteLowPrioForks; } +ALWAYS_INLINE ExecutionResult OpCode_CheckBoundary::execute(const MatchInput& input, MatchState& state, MatchOutput&) const +{ + auto isword = [](auto ch) { return isalnum(ch) || ch == '_'; }; + auto is_word_boundary = [&] { + if (state.string_position == input.view.length()) { + if (state.string_position > 0 && isword(input.view[state.string_position - 1])) + return true; + return false; + } + + if (state.string_position == 0) { + if (isword(input.view[0])) + return true; + + return false; + } + + return !!(isword(input.view[state.string_position]) ^ 
isword(input.view[state.string_position - 1])); + }; + switch (type()) { + case BoundaryCheckType::Word: { + if (is_word_boundary()) + return ExecutionResult::Continue; + return ExecutionResult::Failed_ExecuteLowPrioForks; + } + case BoundaryCheckType::NonWord: { + if (!is_word_boundary()) + return ExecutionResult::Continue; + return ExecutionResult::Failed_ExecuteLowPrioForks; + } + } + ASSERT_NOT_REACHED(); +} + ALWAYS_INLINE ExecutionResult OpCode_CheckEnd::execute(const MatchInput& input, MatchState& state, MatchOutput&) const { if (state.string_position == input.view.length() && (input.regex_options & AllFlags::MatchNotEndOfLine)) @@ -293,9 +388,13 @@ ALWAYS_INLINE ExecutionResult OpCode_SaveRightNamedCaptureGroup::execute(const M return ExecutionResult::Continue; } -ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(const MatchInput& input, MatchState& state, MatchOutput&) const +ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(const MatchInput& input, MatchState& state, MatchOutput& output) const { bool inverse { false }; + bool temporary_inverse { false }; + bool reset_temp_inverse { false }; + + auto current_inversion_state = [&]() -> bool { return temporary_inverse ^ inverse; }; size_t string_position = state.string_position; bool inverse_matched { false }; @@ -305,30 +404,45 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(const MatchInput& input, M if (state.string_position > string_position) break; + if (reset_temp_inverse) { + reset_temp_inverse = false; + temporary_inverse = false; + } else { + reset_temp_inverse = true; + } + auto compare_type = (CharacterCompareType)m_bytecode->at(offset++); if (compare_type == CharacterCompareType::Inverse) inverse = true; - else if (compare_type == CharacterCompareType::Char) { - char ch = m_bytecode->at(offset++); + else if (compare_type == CharacterCompareType::TemporaryInverse) { + // If "TemporaryInverse" is given, negate the current inversion state only for the next opcode. 
+ // it follows that this cannot be the last compare element. + ASSERT(i != arguments_count() - 1); + + temporary_inverse = true; + reset_temp_inverse = false; + + } else if (compare_type == CharacterCompareType::Char) { + u32 ch = m_bytecode->at(offset++); // We want to compare a string that is longer or equal in length to the available string if (input.view.length() - state.string_position < 1) return ExecutionResult::Failed_ExecuteLowPrioForks; - compare_char(input, state, ch, inverse, inverse_matched); + compare_char(input, state, ch, current_inversion_state(), inverse_matched); } else if (compare_type == CharacterCompareType::AnyChar) { // We want to compare a string that is definitely longer than the available string if (input.view.length() - state.string_position < 1) return ExecutionResult::Failed_ExecuteLowPrioForks; - ASSERT(!inverse); + ASSERT(!current_inversion_state()); ++state.string_position; } else if (compare_type == CharacterCompareType::String) { - ASSERT(!inverse); + ASSERT(!current_inversion_state()); char* str = reinterpret_cast(m_bytecode->at(offset++)); auto& length = m_bytecode->at(offset++); @@ -348,7 +462,7 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(const MatchInput& input, M auto character_class = (CharClass)m_bytecode->at(offset++); auto ch = input.view[state.string_position]; - compare_character_class(input, state, character_class, ch, inverse, inverse_matched); + compare_character_class(input, state, character_class, ch, current_inversion_state(), inverse_matched); } else if (compare_type == CharacterCompareType::CharRange) { auto value = (CharRange)m_bytecode->at(offset++); @@ -357,7 +471,40 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(const MatchInput& input, M auto to = value.to; auto ch = input.view[state.string_position]; - compare_character_range(input, state, from, to, ch, inverse, inverse_matched); + compare_character_range(input, state, from, to, ch, current_inversion_state(), inverse_matched); + + } 
else if (compare_type == CharacterCompareType::Reference) { + auto reference_number = (size_t)m_bytecode->at(offset++); + auto& groups = output.capture_group_matches.at(input.match_index); + if (groups.size() <= reference_number) + return ExecutionResult::Failed_ExecuteLowPrioForks; + + auto str = groups.at(reference_number).view; + + // We want to compare a string that is definitely longer than the available string + if (input.view.length() - state.string_position < str.length()) + return ExecutionResult::Failed_ExecuteLowPrioForks; + + if (!compare_string(input, state, str.characters_without_null_termination(), str.length())) + return ExecutionResult::Failed_ExecuteLowPrioForks; + + } else if (compare_type == CharacterCompareType::NamedReference) { + auto ptr = (const char*)m_bytecode->at(offset++); + auto length = (size_t)m_bytecode->at(offset++); + StringView name { ptr, length }; + + auto group = output.named_capture_group_matches.at(input.match_index).get(name); + if (!group.has_value()) + return ExecutionResult::Failed_ExecuteLowPrioForks; + + auto str = group.value().view; + + // We want to compare a string that is definitely longer than the available string + if (input.view.length() - state.string_position < str.length()) + return ExecutionResult::Failed_ExecuteLowPrioForks; + + if (!compare_string(input, state, str.characters_without_null_termination(), str.length())) + return ExecutionResult::Failed_ExecuteLowPrioForks; } else { fprintf(stderr, "Undefined comparison: %i\n", (int)compare_type); @@ -366,7 +513,7 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(const MatchInput& input, M } } - if (inverse && !inverse_matched) + if (current_inversion_state() && !inverse_matched) ++state.string_position; if (string_position == state.string_position || state.string_position > input.view.length()) @@ -502,6 +649,14 @@ ALWAYS_INLINE void OpCode_Compare::compare_character_class(const MatchInput& inp ++state.string_position; } break; + case 
CharClass::Word: + if (isalnum(ch) || ch == '_') { + if (inverse) + inverse_matched = true; + else + ++state.string_position; + } + break; case CharClass::Xdigit: if (isxdigit(ch)) { if (inverse) @@ -550,6 +705,13 @@ const Vector OpCode_Compare::variable_arguments_to_string(Optional view.length() ? 0 : 1).to_string().characters())); + } else if (compare_type == CharacterCompareType::NamedReference) { + auto ptr = (const char*)m_bytecode->at(offset++); + auto length = m_bytecode->at(offset++); + result.empend(String::format("name='%.*s'", length, ptr)); + } else if (compare_type == CharacterCompareType::Reference) { + auto ref = m_bytecode->at(offset++); + result.empend(String::format("number=%lu", ref)); } else if (compare_type == CharacterCompareType::String) { char* str = reinterpret_cast(m_bytecode->at(offset++)); auto& length = m_bytecode->at(offset++); diff --git a/Libraries/LibRegex/RegexByteCode.h b/Libraries/LibRegex/RegexByteCode.h index 7cac259bf5..da3f971bfe 100644 --- a/Libraries/LibRegex/RegexByteCode.h +++ b/Libraries/LibRegex/RegexByteCode.h @@ -29,6 +29,7 @@ #include "RegexMatch.h" #include "RegexOptions.h" +#include #include #include #include @@ -46,33 +47,41 @@ using ByteCodeValueType = u64; __ENUMERATE_OPCODE(Jump) \ __ENUMERATE_OPCODE(ForkJump) \ __ENUMERATE_OPCODE(ForkStay) \ + __ENUMERATE_OPCODE(FailForks) \ __ENUMERATE_OPCODE(SaveLeftCaptureGroup) \ __ENUMERATE_OPCODE(SaveRightCaptureGroup) \ __ENUMERATE_OPCODE(SaveLeftNamedCaptureGroup) \ __ENUMERATE_OPCODE(SaveRightNamedCaptureGroup) \ __ENUMERATE_OPCODE(CheckBegin) \ __ENUMERATE_OPCODE(CheckEnd) \ + __ENUMERATE_OPCODE(CheckBoundary) \ + __ENUMERATE_OPCODE(Save) \ + __ENUMERATE_OPCODE(Restore) \ + __ENUMERATE_OPCODE(GoBack) \ __ENUMERATE_OPCODE(Exit) +// clang-format off enum class OpCodeId : ByteCodeValueType { #define __ENUMERATE_OPCODE(x) x, ENUMERATE_OPCODES #undef __ENUMERATE_OPCODE - First - = Compare, - Last - = Exit, + First = Compare, + Last = Exit, }; +// clang-format on -#define 
ENUMERATE_CHARACTER_COMPARE_TYPES \ - __ENUMERATE_CHARACTER_COMPARE_TYPE(Undefined) \ - __ENUMERATE_CHARACTER_COMPARE_TYPE(Inverse) \ - __ENUMERATE_CHARACTER_COMPARE_TYPE(AnyChar) \ - __ENUMERATE_CHARACTER_COMPARE_TYPE(Char) \ - __ENUMERATE_CHARACTER_COMPARE_TYPE(String) \ - __ENUMERATE_CHARACTER_COMPARE_TYPE(CharClass) \ - __ENUMERATE_CHARACTER_COMPARE_TYPE(CharRange) \ +#define ENUMERATE_CHARACTER_COMPARE_TYPES \ + __ENUMERATE_CHARACTER_COMPARE_TYPE(Undefined) \ + __ENUMERATE_CHARACTER_COMPARE_TYPE(Inverse) \ + __ENUMERATE_CHARACTER_COMPARE_TYPE(TemporaryInverse) \ + __ENUMERATE_CHARACTER_COMPARE_TYPE(AnyChar) \ + __ENUMERATE_CHARACTER_COMPARE_TYPE(Char) \ + __ENUMERATE_CHARACTER_COMPARE_TYPE(String) \ + __ENUMERATE_CHARACTER_COMPARE_TYPE(CharClass) \ + __ENUMERATE_CHARACTER_COMPARE_TYPE(CharRange) \ + __ENUMERATE_CHARACTER_COMPARE_TYPE(Reference) \ + __ENUMERATE_CHARACTER_COMPARE_TYPE(NamedReference) \ __ENUMERATE_CHARACTER_COMPARE_TYPE(RangeExpressionDummy) enum class CharacterCompareType : ByteCodeValueType { @@ -93,6 +102,7 @@ enum class CharacterCompareType : ByteCodeValueType { __ENUMERATE_CHARACTER_CLASS(Blank) \ __ENUMERATE_CHARACTER_CLASS(Graph) \ __ENUMERATE_CHARACTER_CLASS(Punct) \ + __ENUMERATE_CHARACTER_CLASS(Word) \ __ENUMERATE_CHARACTER_CLASS(Xdigit) enum class CharClass : ByteCodeValueType { @@ -101,6 +111,16 @@ enum class CharClass : ByteCodeValueType { #undef __ENUMERATE_CHARACTER_CLASS }; +#define ENUMERATE_BOUNDARY_CHECK_TYPES \ + __ENUMERATE_BOUNDARY_CHECK_TYPE(Word) \ + __ENUMERATE_BOUNDARY_CHECK_TYPE(NonWord) + +enum class BoundaryCheckType : ByteCodeValueType { +#define __ENUMERATE_BOUNDARY_CHECK_TYPE(x) x, + ENUMERATE_BOUNDARY_CHECK_TYPES +#undef __ENUMERATE_BOUNDARY_CHECK_TYPE +}; + struct CharRange { const u32 from; const u32 to; @@ -144,9 +164,10 @@ public: ASSERT(value.type != CharacterCompareType::RangeExpressionDummy); ASSERT(value.type != CharacterCompareType::Undefined); ASSERT(value.type != CharacterCompareType::String); + 
ASSERT(value.type != CharacterCompareType::NamedReference); arguments.append((ByteCodeValueType)value.type); - if (value.type != CharacterCompareType::Inverse && value.type != CharacterCompareType::AnyChar) + if (value.type != CharacterCompareType::Inverse && value.type != CharacterCompareType::AnyChar && value.type != CharacterCompareType::TemporaryInverse) arguments.append(move(value.value)); } @@ -156,6 +177,15 @@ public: append(move(bytecode)); } + void insert_bytecode_check_boundary(BoundaryCheckType type) + { + ByteCode bytecode; + bytecode.empend((ByteCodeValueType)OpCodeId::CheckBoundary); + bytecode.empend((ByteCodeValueType)type); + + append(move(bytecode)); + } + void insert_bytecode_compare_string(StringView view, size_t length) { ByteCode bytecode; @@ -175,6 +205,25 @@ public: append(move(bytecode)); } + void insert_bytecode_compare_named_reference(StringView name, size_t length) + { + ByteCode bytecode; + + bytecode.empend(static_cast(OpCodeId::Compare)); + bytecode.empend(1); // number of arguments + + ByteCode arguments; + + arguments.empend(static_cast(CharacterCompareType::NamedReference)); + arguments.empend(reinterpret_cast(name.characters_without_null_termination())); + arguments.empend(length); + + bytecode.empend(arguments.size()); // size of arguments + bytecode.append(move(arguments)); + + append(move(bytecode)); + } + void insert_bytecode_group_capture_left(size_t capture_groups_count) { empend(static_cast(OpCodeId::SaveLeftCaptureGroup)); @@ -201,6 +250,87 @@ public: empend(name.length()); } + enum class LookAroundType { + LookAhead, + LookBehind, + NegatedLookAhead, + NegatedLookBehind, + }; + void insert_bytecode_lookaround(ByteCode&& lookaround_body, LookAroundType type, size_t match_length = 0) + { + // FIXME: The save stack will grow infinitely with repeated failures + // as we do not discard that on failure (we don't necessarily know how many to pop with the current architecture). 
+ switch (type) { + case LookAroundType::LookAhead: { + // SAVE + // REGEXP BODY + // RESTORE + empend((ByteCodeValueType)OpCodeId::Save); + append(move(lookaround_body)); + empend((ByteCodeValueType)OpCodeId::Restore); + return; + } + case LookAroundType::NegatedLookAhead: { + // JUMP _A + // LABEL _L + // REGEXP BODY + // FAIL 2 + // LABEL _A + // SAVE + // FORKJUMP _L + // RESTORE + auto body_length = lookaround_body.size(); + empend((ByteCodeValueType)OpCodeId::Jump); + empend((ByteCodeValueType)body_length + 2); // JUMP to label _A + append(move(lookaround_body)); + empend((ByteCodeValueType)OpCodeId::FailForks); + empend((ByteCodeValueType)2); // Fail two forks + empend((ByteCodeValueType)OpCodeId::Save); + empend((ByteCodeValueType)OpCodeId::ForkJump); + empend((ByteCodeValueType) - (body_length + 5)); // JUMP to lavel _L + empend((ByteCodeValueType)OpCodeId::Restore); + return; + } + case LookAroundType::LookBehind: + // SAVE + // GOBACK match_length(BODY) + // REGEXP BODY + // RESTORE + empend((ByteCodeValueType)OpCodeId::Save); + empend((ByteCodeValueType)OpCodeId::GoBack); + empend((ByteCodeValueType)match_length); + append(move(lookaround_body)); + empend((ByteCodeValueType)OpCodeId::Restore); + return; + case LookAroundType::NegatedLookBehind: { + // JUMP _A + // LABEL _L + // GOBACK match_length(BODY) + // REGEXP BODY + // FAIL 2 + // LABEL _A + // SAVE + // FORKJUMP _L + // RESTORE + auto body_length = lookaround_body.size(); + empend((ByteCodeValueType)OpCodeId::Jump); + empend((ByteCodeValueType)body_length + 4); // JUMP to label _A + empend((ByteCodeValueType)OpCodeId::GoBack); + empend((ByteCodeValueType)match_length); + append(move(lookaround_body)); + empend((ByteCodeValueType)OpCodeId::FailForks); + empend((ByteCodeValueType)2); // Fail two forks + empend((ByteCodeValueType)OpCodeId::Save); + empend((ByteCodeValueType)OpCodeId::ForkJump); + empend((ByteCodeValueType) - (body_length + 7)); // JUMP to lavel _L + 
empend((ByteCodeValueType)OpCodeId::Restore); + return; + } + } + + ASSERT_NOT_REACHED(); + } + void insert_bytecode_alternation(ByteCode&& left, ByteCode&& right) { @@ -348,6 +478,7 @@ enum class ExecutionResult : u8 { const char* execution_result_name(ExecutionResult result); const char* opcode_id_name(OpCodeId opcode_id); +const char* boundary_check_type_name(BoundaryCheckType); const char* character_compare_type_name(CharacterCompareType result); const char* execution_result_name(ExecutionResult result); @@ -419,6 +550,56 @@ public: const String arguments_string() const override { return ""; } }; +class OpCode_FailForks final : public OpCode { +public: + OpCode_FailForks(ByteCode& bytecode) + : OpCode(bytecode) + { + } + ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override; + ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::FailForks; } + ALWAYS_INLINE size_t size() const override { return 2; } + ALWAYS_INLINE size_t count() const { return argument(0); } + const String arguments_string() const override { return String::formatted("count={}", count()); } +}; + +class OpCode_Save final : public OpCode { +public: + OpCode_Save(ByteCode& bytecode) + : OpCode(bytecode) + { + } + ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override; + ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::Save; } + ALWAYS_INLINE size_t size() const override { return 1; } + const String arguments_string() const override { return ""; } +}; + +class OpCode_Restore final : public OpCode { +public: + OpCode_Restore(ByteCode& bytecode) + : OpCode(bytecode) + { + } + ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override; + ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::Restore; } + ALWAYS_INLINE size_t size() const override { return 1; } + const String arguments_string() const override { 
return ""; } +}; + +class OpCode_GoBack final : public OpCode { +public: + OpCode_GoBack(ByteCode& bytecode) + : OpCode(bytecode) + { + } + ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override; + ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::GoBack; } + ALWAYS_INLINE size_t size() const override { return 2; } + ALWAYS_INLINE size_t count() const { return argument(0); } + const String arguments_string() const override { return String::formatted("count={}", count()); } +}; + class OpCode_Jump final : public OpCode { public: OpCode_Jump(ByteCode& bytecode) @@ -491,6 +672,20 @@ public: const String arguments_string() const override { return ""; } }; +class OpCode_CheckBoundary final : public OpCode { +public: + OpCode_CheckBoundary(ByteCode& bytecode) + : OpCode(bytecode) + { + } + ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override; + ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::CheckBoundary; } + ALWAYS_INLINE size_t size() const override { return 2; } + ALWAYS_INLINE size_t arguments_count() const { return 1; } + ALWAYS_INLINE BoundaryCheckType type() const { return static_cast(argument(0)); } + const String arguments_string() const override { return String::format("kind=%lu (%s)", argument(0), boundary_check_type_name(type())); } +}; + class OpCode_SaveLeftCaptureGroup final : public OpCode { public: OpCode_SaveLeftCaptureGroup(ByteCode& bytecode) diff --git a/Libraries/LibRegex/RegexDebug.h b/Libraries/LibRegex/RegexDebug.h index 2ef0c4c7c3..f49c190c2f 100644 --- a/Libraries/LibRegex/RegexDebug.h +++ b/Libraries/LibRegex/RegexDebug.h @@ -102,6 +102,7 @@ public: { StringBuilder builder; builder.append(execution_result_name(result)); + builder.appendff(", fc: {}, ss: {}", input.fail_counter, input.saved_positions.size()); if (result == ExecutionResult::Succeeded) { builder.appendf(", ip: %lu/%lu, sp: %lu/%lu", 
state.instruction_position, bytecode.size() - 1, state.string_position, input.view.length() - 1); } else if (result == ExecutionResult::Fork_PrioHigh) { diff --git a/Libraries/LibRegex/RegexLexer.cpp b/Libraries/LibRegex/RegexLexer.cpp index 1cac7b1222..d7ad83e11c 100644 --- a/Libraries/LibRegex/RegexLexer.cpp +++ b/Libraries/LibRegex/RegexLexer.cpp @@ -26,6 +26,7 @@ #include "RegexLexer.h" #include +#include #include namespace regex { @@ -89,6 +90,15 @@ void Lexer::reset() m_previous_position = 0; } +bool Lexer::try_skip(char c) +{ + if (peek() != c) + return false; + + consume(); + return true; +} + Token Lexer::next() { size_t token_start_position; @@ -127,7 +137,9 @@ Token Lexer::next() case '\\': return 2; default: - fprintf(stderr, "[LEXER] Found invalid escape sequence: \\%c\n", peek(1)); +#ifdef REGEX_DEBUG + fprintf(stderr, "[LEXER] Found invalid escape sequence: \\%c (the parser will have to deal with this!)\n", peek(1)); +#endif return 0; } }; diff --git a/Libraries/LibRegex/RegexLexer.h b/Libraries/LibRegex/RegexLexer.h index 77a7017990..1d930f3925 100644 --- a/Libraries/LibRegex/RegexLexer.h +++ b/Libraries/LibRegex/RegexLexer.h @@ -91,6 +91,9 @@ public: void reset(); void back(size_t offset); void set_source(const StringView source) { m_source = source; } + bool try_skip(char); + + StringView slice_back(size_t offset) const { return m_source.substring_view(m_position - offset - 1, offset); } private: ALWAYS_INLINE char peek(size_t offset = 0) const; diff --git a/Libraries/LibRegex/RegexMatch.h b/Libraries/LibRegex/RegexMatch.h index d0fadd4ba7..6f2dfacd5f 100644 --- a/Libraries/LibRegex/RegexMatch.h +++ b/Libraries/LibRegex/RegexMatch.h @@ -267,6 +267,9 @@ struct MatchInput { size_t column { 0 }; size_t global_offset { 0 }; // For multiline matching, knowing the offset from start could be important + + mutable size_t fail_counter { 0 }; + mutable Vector saved_positions; }; struct MatchState { diff --git a/Libraries/LibRegex/RegexMatcher.cpp 
b/Libraries/LibRegex/RegexMatcher.cpp index 60bccfd95e..9f1ef25709 100644 --- a/Libraries/LibRegex/RegexMatcher.cpp +++ b/Libraries/LibRegex/RegexMatcher.cpp @@ -264,7 +264,13 @@ Optional Matcher::execute(const MatchInput& input, MatchState& sta s_regex_dbg.print_opcode("VM", *opcode, state, recursion_level, false); #endif - auto result = opcode->execute(input, state, output); + ExecutionResult result; + if (input.fail_counter > 0) { + --input.fail_counter; + result = ExecutionResult::Failed_ExecuteLowPrioForks; + } else { + result = opcode->execute(input, state, output); + } #ifdef REGEX_DEBUG s_regex_dbg.print_result(*opcode, bytecode, input, state, result); @@ -330,4 +336,7 @@ ALWAYS_INLINE Optional Matcher::execute_low_prio_forks(const Match template class Matcher; template class Regex; + +template class Matcher; +template class Regex; } diff --git a/Libraries/LibRegex/RegexParser.cpp b/Libraries/LibRegex/RegexParser.cpp index c97427a18c..485a0902d6 100644 --- a/Libraries/LibRegex/RegexParser.cpp +++ b/Libraries/LibRegex/RegexParser.cpp @@ -28,7 +28,7 @@ #include "RegexDebug.h" #include #include -#include +#include namespace regex { @@ -88,6 +88,26 @@ ALWAYS_INLINE bool Parser::consume(const String& str) return true; } +ALWAYS_INLINE bool Parser::try_skip(StringView str) +{ + if (str.starts_with(m_parser_state.current_token.value())) + str = str.substring_view(m_parser_state.current_token.value().length(), str.length() - m_parser_state.current_token.value().length()); + else + return false; + + size_t potentially_go_back { 0 }; + for (auto ch : str) { + if (!m_parser_state.lexer.try_skip(ch)) { + m_parser_state.lexer.back(potentially_go_back); + return false; + } + ++potentially_go_back; + } + + m_parser_state.current_token = m_parser_state.lexer.next(); + return true; +} + ALWAYS_INLINE void Parser::reset() { m_parser_state.bytecode.clear(); @@ -595,4 +615,762 @@ bool PosixExtendedParser::parse_root(ByteCode& stack, size_t& match_length_minim return 
!has_error(); } +// ============================= +// ECMA262 Parser +// ============================= + +bool ECMA262Parser::parse_internal(ByteCode& stack, size_t& match_length_minimum) +{ + if (m_parser_state.regex_options & AllFlags::Unicode) { + return parse_pattern(stack, match_length_minimum, true, true); + } else { + ByteCode new_stack; + size_t new_match_length = 0; + auto res = parse_pattern(new_stack, new_match_length, false, false); + if (m_parser_state.named_capture_groups_count > 0) { + reset(); + return parse_pattern(stack, match_length_minimum, false, true); + } + + if (!res) + return false; + + stack.append(new_stack); + match_length_minimum = new_match_length; + return res; + } +} + +bool ECMA262Parser::parse_pattern(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool named) +{ + return parse_disjunction(stack, match_length_minimum, unicode, named); +} + +bool ECMA262Parser::parse_disjunction(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool named) +{ + ByteCode left_alternative_stack; + size_t left_alternative_min_length = 0; + auto alt_ok = parse_alternative(left_alternative_stack, left_alternative_min_length, unicode, named); + if (!alt_ok) + return false; + + if (!match(TokenType::Pipe)) { + stack.append(left_alternative_stack); + match_length_minimum = left_alternative_min_length; + return alt_ok; + } + + consume(); + ByteCode right_alternative_stack; + size_t right_alternative_min_length = 0; + auto continuation_ok = parse_disjunction(right_alternative_stack, right_alternative_min_length, unicode, named); + if (!continuation_ok) + return false; + + stack.insert_bytecode_alternation(move(left_alternative_stack), move(right_alternative_stack)); + match_length_minimum = min(left_alternative_min_length, right_alternative_min_length); + return continuation_ok; +} + +bool ECMA262Parser::parse_alternative(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool named) +{ + for (;;) { + if 
(match(TokenType::Eof)) + return true; + + if (parse_term(stack, match_length_minimum, unicode, named)) + continue; + + return !has_error(); + } +} + +bool ECMA262Parser::parse_term(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool named) +{ + if (parse_assertion(stack, match_length_minimum, unicode, named)) + return true; + + ByteCode atom_stack; + size_t minimum_atom_length = 0; + if (!parse_atom(atom_stack, minimum_atom_length, unicode, named)) + return false; + + if (!parse_quantifier(atom_stack, minimum_atom_length, unicode, named)) + return false; + + stack.append(move(atom_stack)); + match_length_minimum += minimum_atom_length; + return true; +} + +bool ECMA262Parser::parse_assertion(ByteCode& stack, [[maybe_unused]] size_t& match_length_minimum, bool unicode, bool named) +{ + if (match(TokenType::Circumflex)) { + consume(); + stack.empend((ByteCodeValueType)OpCodeId::CheckBegin); + return true; + } + + if (match(TokenType::Dollar)) { + consume(); + stack.empend((ByteCodeValueType)OpCodeId::CheckEnd); + return true; + } + + if (try_skip("\\b")) { + stack.insert_bytecode_check_boundary(BoundaryCheckType::Word); + return true; + } + + if (try_skip("\\B")) { + stack.insert_bytecode_check_boundary(BoundaryCheckType::NonWord); + return true; + } + + if (match(TokenType::LeftParen)) { + if (!try_skip("(?")) + return false; + + ByteCode assertion_stack; + size_t length_dummy = 0; + + auto parse_inner_disjunction = [&] { + auto disjunction_ok = parse_disjunction(assertion_stack, length_dummy, unicode, named); + if (!disjunction_ok) + return false; + consume(TokenType::RightParen, Error::MismatchingParen); + return true; + }; + + if (try_skip("=")) { + if (!parse_inner_disjunction()) + return false; + stack.insert_bytecode_lookaround(move(assertion_stack), ByteCode::LookAroundType::LookAhead); + return true; + } + if (try_skip("!")) { + if (!parse_inner_disjunction()) + return false; + stack.insert_bytecode_lookaround(move(assertion_stack), 
ByteCode::LookAroundType::NegatedLookAhead);
+            return true;
+        }
+        if (try_skip("<=")) {
+            if (!parse_inner_disjunction())
+                return false;
+            // FIXME: Somehow ensure that this assertion regexp has a fixed length.
+            stack.insert_bytecode_lookaround(move(assertion_stack), ByteCode::LookAroundType::LookBehind, length_dummy);
+            return true;
+        }
+// NOTE(review): the next line fuses the tail of the preceding function with the signature of
+// read_digits(); the text in between (and the return type) is missing from this listing and
+// has to be restored from the original commit before this patch is usable.
+//
+// read_digits(): consumes consecutive Char tokens and converts the consumed text to a number
+// (hexadecimal when `hex` is set, to_uint() otherwise); the conversion result is returned as-is.
+// Returns an empty value when the current token is not a Char or when `initial_zero` rejects
+// the first character. `follow_policy` ends the scan at the first digit (DisallowDigit) or the
+// first non-digit (DisallowNonDigit); a positive `max_count` caps the number of tokens consumed.
+// NOTE(review): the DisallowDigit check also applies to the very first character, so a leading
+// digit ends the scan before anything is consumed - confirm this is the intended behavior.
+        if (try_skip(" ECMA262Parser::read_digits(ECMA262Parser::ReadDigitsInitialZeroState initial_zero, ECMA262Parser::ReadDigitFollowPolicy follow_policy, bool hex, int max_count)
+{
+    if (!match(TokenType::Char))
+        return {};
+
+    if (initial_zero != ReadDigitsInitialZeroState::Allow) {
+        auto has_initial_zero = m_parser_state.current_token.value() == "0";
+        if (initial_zero == ReadDigitsInitialZeroState::Disallow && has_initial_zero)
+            return {};
+
+        if (initial_zero == ReadDigitsInitialZeroState::Require && !has_initial_zero)
+            return {};
+    }
+
+    int count = 0;
+    size_t offset = 0;
+    while (match(TokenType::Char)) {
+        auto c = m_parser_state.current_token.value();
+        if (follow_policy == ReadDigitFollowPolicy::DisallowDigit) {
+            if (hex && AK::StringUtils::convert_to_uint_from_hex(c).has_value())
+                break;
+            if (!hex && c.to_uint().has_value())
+                break;
+        }
+
+        if (follow_policy == ReadDigitFollowPolicy::DisallowNonDigit) {
+            if (hex && !AK::StringUtils::convert_to_uint_from_hex(c).has_value())
+                break;
+            if (!hex && !c.to_uint().has_value())
+                break;
+        }
+
+        if (max_count > 0 && count >= max_count)
+            break;
+
+        offset += consume().value().length();
+        ++count;
+    }
+
+    // Re-read everything consumed above as one string and convert it in a single step.
+    auto str = m_parser_state.lexer.slice_back(offset);
+    if (hex)
+        return AK::StringUtils::convert_to_uint_from_hex(str);
+
+    return str.to_uint();
+}
+
+// Parses an optional quantifier (`*`, `+`, `?` or `{min[,max]}`), optionally followed by `?`
+// to make it non-greedy, and rewrites `stack` into the repeated form.
+// `match_length_minimum` is reset to 0 for `*` and `?`, and multiplied by `min` for `{min,...}`.
+// Returns true when no quantifier is present or one was applied; returns false after setting
+// an error for malformed brace content, a missing `}` or a `?` following `{...}`.
+// NOTE(review): `{n,}` (comma without an upper bound) is rejected with InvalidBraceContent and
+// `{n}` passes an empty maximum to insert_bytecode_repetition_min_max() - confirm both against
+// how that helper interprets a missing maximum.
+bool ECMA262Parser::parse_quantifier(ByteCode& stack, size_t& match_length_minimum, bool, bool)
+{
+    enum class Repetition {
+        OneOrMore,
+        ZeroOrMore,
+        Optional,
+        Explicit,
+        None,
+    } repetition_mark { Repetition::None };
+
+    bool ungreedy = false;
+    Optional repeat_min, repeat_max;
+
+    if (match(TokenType::Asterisk)) {
+        consume();
+        repetition_mark = Repetition::ZeroOrMore;
+    } else if (match(TokenType::Plus)) {
+        consume();
+        repetition_mark = Repetition::OneOrMore;
+    } else if (match(TokenType::Questionmark)) {
+        consume();
+        repetition_mark = Repetition::Optional;
+    } else if (match(TokenType::LeftCurly)) {
+        consume();
+        repetition_mark = Repetition::Explicit;
+
+        auto low_bound = read_digits();
+
+        if (!low_bound.has_value()) {
+            set_error(Error::InvalidBraceContent);
+            return false;
+        }
+
+        repeat_min = low_bound.value();
+
+        if (match(TokenType::Comma)) {
+            consume();
+            auto high_bound = read_digits();
+            if (!high_bound.has_value()) {
+                set_error(Error::InvalidBraceContent);
+                return false;
+            }
+
+            repeat_max = high_bound.value();
+        }
+
+        if (!match(TokenType::RightCurly)) {
+            set_error(Error::MismatchingBrace);
+            return false;
+        }
+        consume();
+
+        if (repeat_max.has_value() && repeat_min.value() > repeat_max.value()) {
+            // A reversed range such as {3,1} is an error: stop here like every other error path
+            // instead of falling through and emitting bytecode for it.
+            set_error(Error::InvalidBraceContent);
+            return false;
+        }
+    } else {
+        // No quantifier follows the atom; nothing to do.
+        return true;
+    }
+
+    if (match(TokenType::Questionmark)) {
+        if (repetition_mark == Repetition::Explicit) {
+            set_error(Error::InvalidRepetitionMarker);
+            return false;
+        }
+        consume();
+        ungreedy = true;
+    }
+
+    ByteCode new_bytecode;
+    switch (repetition_mark) {
+    case Repetition::OneOrMore:
+        new_bytecode.insert_bytecode_repetition_min_one(stack, !ungreedy);
+        break;
+    case Repetition::ZeroOrMore:
+        new_bytecode.insert_bytecode_repetition_any(stack, !ungreedy);
+        match_length_minimum = 0;
+        break;
+    case Repetition::Optional:
+        new_bytecode.insert_bytecode_repetition_zero_or_one(stack, !ungreedy);
+        match_length_minimum = 0;
+        break;
+    case Repetition::Explicit:
+        new_bytecode.insert_bytecode_repetition_min_max(stack, repeat_min.value(), repeat_max);
+        match_length_minimum *= repeat_min.value();
+        break;
+    case Repetition::None:
+        ASSERT_NOT_REACHED();
+    }
+
+    return true;
+}
+
+// Parses a single atom at the current token, appends its compare bytecode to `stack` and adds
+// the atom's minimum length to `match_length_minimum`. Escapes, bracketed classes and groups
+// are delegated to their dedicated parsers.
+// Returns false without setting an error when the current token is one of the assertion,
+// closing or quantifier tokens listed below; sets InvalidPattern for any other non-Char token.
+bool ECMA262Parser::parse_atom(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool named)
+{
+    if (try_skip("\\")) {
+        // AtomEscape.
+        return parse_atom_escape(stack, match_length_minimum, unicode, named);
+    }
+
+    if (match(TokenType::LeftBracket)) {
+        // Character class.
+        return parse_character_class(stack, match_length_minimum, unicode, named);
+    }
+
+    if (match(TokenType::LeftParen)) {
+        // Non-capturing group, or a capture group.
+        return parse_capture_group(stack, match_length_minimum, unicode, named);
+    }
+
+    if (match(TokenType::Period)) {
+        consume();
+        match_length_minimum += 1;
+        stack.insert_bytecode_compare_values({ { CharacterCompareType::AnyChar, 0 } });
+        return true;
+    }
+
+    if (match(TokenType::Circumflex) || match(TokenType::Dollar) || match(TokenType::RightBracket)
+        || match(TokenType::RightCurly) || match(TokenType::RightParen) || match(TokenType::Pipe)
+        || match(TokenType::Plus) || match(TokenType::Asterisk) || match(TokenType::Questionmark)) {
+
+        return false;
+    }
+
+    if (match(TokenType::Char)) {
+        auto token = consume().value();
+        match_length_minimum += 1;
+        stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)token[0] } });
+        return true;
+    }
+
+    set_error(Error::InvalidPattern);
+    return false;
+}
+
+// Parses what follows a backslash (parse_atom() has already skipped it). Tried in this order:
+// a decimal backreference, the control escapes \f \n \r \t \v, \cX, \0, \xHH, an
+// EscapeSequence token (identity escape), a named backreference \k (only when `named`), and
+// finally a character class escape (\d \D \s \S \w \W).
+// \u and, when `unicode` is set, \p{ / \P{ are not implemented and end in TODO().
+// Returns false after setting an error for an unknown backreference or an unrecognised escape.
+bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool named)
+{
+    if (auto escape = read_digits(ReadDigitsInitialZeroState::Disallow, ReadDigitFollowPolicy::DisallowNonDigit); escape.has_value()) {
+        // Only groups whose minimum length has already been recorded can be referenced.
+        auto maybe_length = m_parser_state.capture_group_minimum_lengths.get(escape.value());
+        if (!maybe_length.has_value()) {
+            set_error(Error::InvalidNumber);
+            return false;
+        }
+        match_length_minimum += maybe_length.value();
+        stack.insert_bytecode_compare_values({ { CharacterCompareType::Reference, (ByteCodeValueType)escape.value() } });
+        return true;
+    }
+
+    // CharacterEscape > ControlEscape
+    if (try_skip("f")) {
+        match_length_minimum += 1;
+        stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'\f' } });
+        return true;
+    }
+
+    if (try_skip("n")) {
+        match_length_minimum += 1;
+        stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'\n' } });
+        return true;
+    }
+
+    if (try_skip("r")) {
+        match_length_minimum += 1;
+        stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'\r' } });
+        return true;
+    }
+
+    if (try_skip("t")) {
+        match_length_minimum += 1;
+        stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'\t' } });
+        return true;
+    }
+
+    if (try_skip("v")) {
+        match_length_minimum += 1;
+        stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'\v' } });
+        return true;
+    }
+
+    // CharacterEscape > ControlLetter
+    if (try_skip("c")) {
+        for (auto c = 'A'; c <= 'z'; ++c) {
+            if (try_skip({ &c, 1 })) {
+                match_length_minimum += 1;
+                // ECMA-262: the control character is the letter's code point modulo 32, so
+                // \ca and \cA both yield 0x01 (masking with 0x3f mapped 'a' to 0x21).
+                stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)(c % 32) } });
+                return true;
+            }
+        }
+    }
+
+    // '\0'
+    if (read_digits(ReadDigitsInitialZeroState::Require, ReadDigitFollowPolicy::DisallowDigit).has_value()) {
+        match_length_minimum += 1;
+        stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)0 } });
+        return true;
+    }
+
+    // HexEscape
+    if (try_skip("x")) {
+        if (auto hex_escape = read_digits(ReadDigitsInitialZeroState::Allow, ReadDigitFollowPolicy::Any, true, 2); hex_escape.has_value()) {
+            match_length_minimum += 1;
+            stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)hex_escape.value() } });
+            return true;
+        }
+    }
+
+    if (try_skip("u")) {
+        // FIXME: Implement this path, unicode escape sequence.
+        TODO();
+    }
+
+    // IdentityEscape
+    if (match(TokenType::EscapeSequence)) {
+        match_length_minimum += 1;
+        auto token = consume().value();
+        // The escaped character is the last character of the token.
+        stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)token[token.length() - 1] } });
+        return true;
+    }
+
+    if (named && try_skip("k")) {
+        auto name = read_capture_group_specifier(true);
+        if (name.is_empty()) {
+            set_error(Error::InvalidNameForCaptureGroup);
+            return false;
+        }
+        auto maybe_length = m_parser_state.named_capture_group_minimum_lengths.get(name);
+        if (!maybe_length.has_value()) {
+            set_error(Error::InvalidNameForCaptureGroup);
+            return false;
+        }
+        match_length_minimum += maybe_length.value();
+
+        stack.insert_bytecode_compare_named_reference(name, name.length());
+        return true;
+    }
+
+    if (unicode) {
+        if (try_skip("p{")) {
+            // FIXME: Implement this path, Unicode property match.
+            TODO();
+        }
+        if (try_skip("P{")) {
+            // FIXME: Implement this path, Unicode property match.
+            TODO();
+        }
+    }
+
+    bool negate = false;
+    auto ch = parse_character_class_escape(negate);
+    if (!ch.has_value()) {
+        set_error(Error::InvalidCharacterClass);
+        return false;
+    }
+
+    Vector compares;
+    if (negate)
+        compares.empend(CharacterCompareType::Inverse, 0);
+    compares.empend(CharacterCompareType::CharClass, (ByteCodeValueType)ch.value());
+    match_length_minimum += 1;
+    stack.insert_bytecode_compare_values(move(compares));
+    return true;
+}
+
+// Maps one of d, D, s, S, w, W at the current position to its CharClass. The upper-case forms
+// set `negate` to true; `negate` is never cleared here. When `expect_backslash` is set, a
+// backslash has to be skipped first. Returns an empty value if nothing matched.
+Optional ECMA262Parser::parse_character_class_escape(bool& negate, bool expect_backslash)
+{
+    if (expect_backslash && !try_skip("\\"))
+        return {};
+
+    // CharacterClassEscape
+    CharClass ch_class;
+    if (try_skip("d")) {
+        ch_class = CharClass::Digit;
+    } else if (try_skip("D")) {
+        ch_class = CharClass::Digit;
+        negate = true;
+    } else if (try_skip("s")) {
+        ch_class = CharClass::Space;
+    } else if (try_skip("S")) {
+        ch_class = CharClass::Space;
+        negate = true;
+    } else if (try_skip("w")) {
+        ch_class = CharClass::Word;
+    } else if (try_skip("W")) {
+        ch_class = CharClass::Word;
+        negate = true;
+    } else {
+        return {};
+    }
+
+    return ch_class;
+}
+
+// Parses a bracketed class `[...]` or `[^...]` and emits one compare instruction for it,
+// adding 1 to `match_length_minimum`.
+// NOTE(review): an empty class (`[]` or `[^]`) returns true without emitting any bytecode and
+// without touching `match_length_minimum` - confirm that this is the intended behavior.
+bool ECMA262Parser::parse_character_class(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool)
+{
+    consume(TokenType::LeftBracket, Error::InvalidPattern);
+
+    Vector compares;
+
+    if (match(TokenType::Circumflex)) {
+        // Negated charclass
+        consume();
+        compares.empend(CharacterCompareType::Inverse, 0);
+    }
+
+    if (match(TokenType::RightBracket)) {
+        consume();
+        return true;
+    }
+
+    if (!parse_nonempty_class_ranges(compares, unicode))
+        return false;
+
+    match_length_minimum += 1;
+    stack.insert_bytecode_compare_values(move(compares));
+    return true;
+}
+
+// One parsed class atom: either a literal code point or a predefined character class;
+// `is_character_class` tells which union member is meaningful.
+struct CharClassRangeElement {
+    union {
+        CharClass character_class;
+        u32 code_point { 0 };
+    };
+
+    bool is_negated { false };
+    bool is_character_class { false };
+};
+
+// Parses the contents of a bracketed class up to and including the closing `]`, appending one
+// compare entry per atom or per `a-b` range to `ranges`.
+// Returns false (after setting InvalidRange) when a range endpoint is a character class or the
+// endpoints are reversed, and false when an atom cannot be read.
+// NOTE(review): a `-` directly before `]` (e.g. `[a-]`) makes the second atom unreadable and
+// returns false without setting an error - confirm whether a literal trailing `-` should be
+// accepted here.
+bool ECMA262Parser::parse_nonempty_class_ranges(Vector& ranges, bool unicode)
+{
+    auto read_class_atom_no_dash = [&]() -> Optional {
+        if (match(TokenType::EscapeSequence)) {
+            auto token = consume().value();
+            return { { .code_point = (u32)token[1], .is_character_class = false } };
+        }
+
+        if (try_skip("\\")) {
+            if (try_skip("f"))
+                return { { .code_point = '\f', .is_character_class = false } };
+            if (try_skip("n"))
+                return { { .code_point = '\n', .is_character_class = false } };
+            if (try_skip("r"))
+                return { { .code_point = '\r', .is_character_class = false } };
+            if (try_skip("t"))
+                return { { .code_point = '\t', .is_character_class = false } };
+            if (try_skip("v"))
+                return { { .code_point = '\v', .is_character_class = false } };
+            if (try_skip("b"))
+                return { { .code_point = '\b', .is_character_class = false } };
+
+            // CharacterEscape > ControlLetter
+            if (try_skip("c")) {
+                for (auto c = 'A'; c <= 'z'; ++c) {
+                    // ECMA-262: code point modulo 32, matching parse_atom_escape().
+                    if (try_skip({ &c, 1 }))
+                        return { { .code_point = (u32)(c % 32), .is_character_class = false } };
+                }
+            }
+
+            // '\0'
+            if (read_digits(ReadDigitsInitialZeroState::Require, ReadDigitFollowPolicy::DisallowDigit).has_value())
+                return { { .code_point = 0, .is_character_class = false } };
+
+            // HexEscape
+            if (try_skip("x")) {
+                if (auto hex_escape = read_digits(ReadDigitsInitialZeroState::Allow, ReadDigitFollowPolicy::Any, true, 2); hex_escape.has_value())
+                    return { { .code_point = hex_escape.value(), .is_character_class = false } };
+            }
+
+            if (try_skip("u")) {
+                // FIXME: Implement this path, unicode escape sequence.
+                TODO();
+            }
+
+            if (unicode) {
+                if (try_skip("-"))
+                    return { { .code_point = '-', .is_character_class = false } };
+            }
+
+            if (try_skip("p{") || try_skip("P{")) {
+                // FIXME: Implement these; unicode properties.
+                TODO();
+            }
+
+            if (try_skip("d"))
+                return { { .character_class = CharClass::Digit, .is_character_class = true } };
+            if (try_skip("s"))
+                return { { .character_class = CharClass::Space, .is_character_class = true } };
+            if (try_skip("w"))
+                return { { .character_class = CharClass::Word, .is_character_class = true } };
+            if (try_skip("D"))
+                return { { .character_class = CharClass::Digit, .is_negated = true, .is_character_class = true } };
+            if (try_skip("S"))
+                return { { .character_class = CharClass::Space, .is_negated = true, .is_character_class = true } };
+            if (try_skip("W"))
+                return { { .character_class = CharClass::Word, .is_negated = true, .is_character_class = true } };
+        }
+
+        if (match(TokenType::RightBracket) || match(TokenType::HyphenMinus))
+            return {};
+
+        auto token = consume(TokenType::Char, Error::InvalidCharacterClass);
+
+        return { { .code_point = (u32)token.value()[0], .is_character_class = false } };
+    };
+    // Like read_class_atom_no_dash, but a `-` at this position is taken as a literal hyphen.
+    auto read_class_atom = [&]() -> Optional {
+        if (match(TokenType::HyphenMinus)) {
+            consume();
+            return { { .code_point = '-', .is_character_class = false } };
+        }
+
+        return read_class_atom_no_dash();
+    };
+
+    while (!match(TokenType::RightBracket)) {
+        auto first_atom = read_class_atom();
+        if (!first_atom.has_value())
+            return false;
+
+        if (match(TokenType::HyphenMinus)) {
+            // `first-second` range.
+            consume();
+            auto second_atom = read_class_atom();
+            if (!second_atom.has_value())
+                return false;
+
+            if (first_atom.value().is_character_class || second_atom.value().is_character_class) {
+                set_error(Error::InvalidRange);
+                return false;
+            }
+
+            if (first_atom.value().code_point > second_atom.value().code_point) {
+                set_error(Error::InvalidRange);
+                return false;
+            }
+
+            ASSERT(!first_atom.value().is_negated);
+            ASSERT(!second_atom.value().is_negated);
+
+            ranges.empend(CharacterCompareType::CharRange, CharRange { first_atom.value().code_point, second_atom.value().code_point });
+            continue;
+        }
+
+        auto atom = first_atom.value();
+
+        if (atom.is_character_class) {
+            if (atom.is_negated)
+                ranges.empend(CharacterCompareType::TemporaryInverse, 0);
+            ranges.empend(CharacterCompareType::CharClass, (ByteCodeValueType)first_atom.value().character_class);
+        } else {
+            ASSERT(!atom.is_negated);
+            ranges.empend(CharacterCompareType::Char, first_atom.value().code_point);
+        }
+    }
+
+    consume(TokenType::RightBracket, Error::MismatchingBracket);
+
+    return true;
+}
+
+// Reads a capture group name: optionally consumes a leading "<" (returning an empty view if it
+// is absent), collects Char tokens up to the next ">" and consumes that ">".
+// Sets InvalidNameForCaptureGroup when the ">" is missing or the name is empty; the (possibly
+// empty) name is returned either way.
+StringView ECMA262Parser::read_capture_group_specifier(bool take_starting_angle_bracket)
+{
+    if (take_starting_angle_bracket && !consume("<"))
+        return {};
+
+    size_t offset = 0;
+    while (match(TokenType::Char)) {
+        auto c = m_parser_state.current_token.value();
+        if (c == ">")
+            break;
+        offset += consume().value().length();
+    }
+
+    auto name = m_parser_state.lexer.slice_back(offset);
+    if (!consume(">") || name.is_empty())
+        set_error(Error::InvalidNameForCaptureGroup);
+
+    return name;
+}
+
+// Parses a parenthesised group starting at `(`: a non-capturing group `(?:...)`, a named
+// capture group (`(?` followed by "<" and a name) or a numbered capture group `(...)`.
+// The group's body is parsed with parse_disjunction() and appended to `stack`; its minimum
+// length is added to `match_length_minimum` and, for capturing groups, recorded in the parser
+// state so later backreferences can look it up.
+// Any other `(?` form sets InvalidCaptureGroup and returns false.
+bool ECMA262Parser::parse_capture_group(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool named)
+{
+    consume(TokenType::LeftParen, Error::InvalidPattern);
+
+    if (match(TokenType::Questionmark)) {
+        // Non-capturing group or group with specifier.
+        consume();
+
+        if (match(TokenType::Colon)) {
+            consume();
+            ByteCode noncapture_group_bytecode;
+            size_t length = 0;
+            if (!parse_disjunction(noncapture_group_bytecode, length, unicode, named))
+                return set_error(Error::InvalidPattern);
+
+            consume(TokenType::RightParen, Error::MismatchingParen);
+
+            stack.append(move(noncapture_group_bytecode));
+            match_length_minimum += length;
+            return true;
+        }
+
+        if (consume("<")) {
+            ++m_parser_state.named_capture_groups_count;
+            auto name = read_capture_group_specifier();
+
+            if (name.is_empty()) {
+                set_error(Error::InvalidNameForCaptureGroup);
+                return false;
+            }
+
+            ByteCode capture_group_bytecode;
+            size_t length = 0;
+            if (!parse_disjunction(capture_group_bytecode, length, unicode, named))
+                return set_error(Error::InvalidPattern);
+
+            consume(TokenType::RightParen, Error::MismatchingParen);
+
+            stack.insert_bytecode_group_capture_left(name);
+            stack.append(move(capture_group_bytecode));
+            stack.insert_bytecode_group_capture_right(name);
+
+            match_length_minimum += length;
+
+            m_parser_state.named_capture_group_minimum_lengths.set(name, length);
+            return true;
+        }
+
+        set_error(Error::InvalidCaptureGroup);
+        return false;
+    }
+
+    // Plain numbered capture group; indices start at 1 (pre-increment of the counter).
+    auto group_index = ++m_parser_state.capture_groups_count;
+    stack.insert_bytecode_group_capture_left(group_index);
+
+    ByteCode capture_group_bytecode;
+    size_t length = 0;
+
+    if (!parse_disjunction(capture_group_bytecode, length, unicode, named))
+        return set_error(Error::InvalidPattern);
+
+    stack.append(move(capture_group_bytecode));
+
+    m_parser_state.capture_group_minimum_lengths.set(group_index, length);
+
+    consume(TokenType::RightParen, Error::MismatchingParen);
+
+    stack.insert_bytecode_group_capture_right(group_index);
+
+    match_length_minimum += length;
+
+    return true;
+}
 }
diff --git a/Libraries/LibRegex/RegexParser.h b/Libraries/LibRegex/RegexParser.h
index 11d5b6fe3d..b9d89262e7 100644
--- a/Libraries/LibRegex/RegexParser.h
+++ b/Libraries/LibRegex/RegexParser.h
@@ -39,6 +39,7 @@
 namespace regex {
 class PosixExtendedParser;
+class ECMA262Parser;
 template
 struct GenericParserTraits {
@@ -53,6 +54,10 @@ template<>
 struct ParserTraits : public GenericParserTraits {
 };
+template<>
+struct ParserTraits : public GenericParserTraits {
+};
+
 class Parser {
 public:
     struct Result {
@@ -88,6 +93,7 @@ protected:
     ALWAYS_INLINE Token consume();
     ALWAYS_INLINE Token consume(TokenType type, Error error);
     ALWAYS_INLINE bool consume(const String&);
+    ALWAYS_INLINE bool try_skip(StringView);
     ALWAYS_INLINE void reset();
     ALWAYS_INLINE bool done() const;
     ALWAYS_INLINE bool set_error(Error error);
@@ -102,6 +108,10 @@ protected:
         size_t named_capture_groups_count { 0 };
         size_t match_length_minimum { 0 };
         AllOptions regex_options;
+        HashMap capture_group_minimum_lengths;
+        HashMap named_capture_group_minimum_lengths;
+        HashMap named_capture_groups;
+
         explicit ParserState(Lexer& lexer)
             : lexer(lexer)
             , current_token(lexer.next())
@@ -144,8 +154,54 @@ private:
     ALWAYS_INLINE bool parse_repetition_symbol(ByteCode&, size_t&);
 };
+class ECMA262Parser final : public Parser {
+public:
+    explicit ECMA262Parser(Lexer& lexer)
+        : Parser(lexer)
+    {
+    }
+
+    ECMA262Parser(Lexer& lexer, Optional::OptionsType> regex_options)
+        : Parser(lexer, regex_options.value_or({}))
+    {
+    }
+
+    ~ECMA262Parser() = default;
+
+private:
+    bool parse_internal(ByteCode&, size_t&) override;
+
+    enum class ReadDigitsInitialZeroState {
+        Allow, // no constraint on the first character
+        Disallow, // read_digits() fails if the first character is "0"
+        Require, // read_digits() fails unless the first character is "0"
+    };
+    enum class ReadDigitFollowPolicy {
+        Any, // scan until a non-Char token or max_count
+        DisallowDigit, // stop scanning at the first digit
+        DisallowNonDigit, // stop scanning at the first non-digit
+    };
+    Optional read_digits(ReadDigitsInitialZeroState initial_zero = ReadDigitsInitialZeroState::Allow, ReadDigitFollowPolicy follow_policy = ReadDigitFollowPolicy::Any, bool hex = false, int max_count = -1);
+    StringView read_capture_group_specifier(bool take_starting_angle_bracket = false);
+
+    bool parse_pattern(ByteCode&, size_t&, bool unicode, bool named);
+    bool parse_disjunction(ByteCode&, size_t&, bool unicode, bool named);
+    bool parse_alternative(ByteCode&, size_t&, bool unicode, bool named);
+    bool parse_term(ByteCode&, size_t&, bool unicode, bool named);
+    bool parse_assertion(ByteCode&, size_t&, bool unicode, bool named);
+    bool parse_atom(ByteCode&, size_t&, bool unicode, bool named);
+    bool parse_quantifier(ByteCode&, size_t&, bool unicode, bool named);
+    bool parse_atom_escape(ByteCode&, size_t&, bool unicode, bool named);
+    bool parse_character_class(ByteCode&, size_t&, bool unicode, bool named);
+    bool parse_capture_group(ByteCode&, size_t&, bool unicode, bool named);
+    Optional parse_character_class_escape(bool& out_inverse, bool expect_backslash = false);
+    bool parse_nonempty_class_ranges(Vector&, bool unicode);
+};
+
 using PosixExtended = PosixExtendedParser;
+using ECMA262 = ECMA262Parser;
 }
+using regex::ECMA262;
 using regex::PosixExtended;
diff --git a/Libraries/LibRegex/Tests/Regex.cpp b/Libraries/LibRegex/Tests/Regex.cpp
index 12487d3590..b4444c06e4 100644
--- a/Libraries/LibRegex/Tests/Regex.cpp
+++ b/Libraries/LibRegex/Tests/Regex.cpp
@@ -472,4 +472,79 @@ TEST_CASE(simple_period_end_benchmark)
     EXPECT_EQ(re.search("hello?", m), true);
 }
+TEST_CASE(ECMA262_parse)
+{
+    constexpr const char* patterns[] {
+        "^hello.$",
+        "^(hello.)$",
+        "^h{0,1}ello.$",
+        "^hello\\W$",
+        "^hell\\w.$",
+        "^hell\\x6f1$", // ^hello1$
+        "^hel(?:l\\w).$",
+        "^hel(?l\\w).$",
+        "^[-a-zA-Z\\w\\s]+$",
+        "\\bhello\\B",
+    };
+
+    for (auto& pattern : patterns) {
+        Regex re(pattern);
+        EXPECT_EQ(re.parser_result.error, Error::NoError);
+#ifdef REGEX_DEBUG
+        dbg() << "\n";
+        RegexDebug regex_dbg(stderr);
+        regex_dbg.print_raw_bytecode(re);
+        regex_dbg.print_header();
+        regex_dbg.print_bytecode(re);
+        dbg() << "\n";
+#endif
+    }
+}
+
+TEST_CASE(ECMA262_match)
+{
+    struct _test {
+        const char* pattern;
+        const char* subject;
+        bool matches { true };
+        ECMAScriptFlags options {};
+    };
+
+    constexpr _test tests[] {
+        { "^hello.$", "hello1" },
+        { "^(hello.)$", "hello1" },
+        { "^h{0,1}ello.$", "ello1" },
+        { "^hello\\W$", "hello!" },
+        { "^hell\\w.$", "hellx!" },
+        { "^hell\\x6f1$", "hello1" },
+        { "^hel(?l.)1$", "hello1" },
+        { "^hel(?l.)1*\\k.$", "hello1lo1" },
+        { "^[-a-z1-3\\s]+$", "hell2 o1" },
+        { .pattern = "\\bhello\\B", .subject = "hello1", .options = ECMAScriptFlags::Global },
+        { "\\b.*\\b", "hello1" },
+        { "[^\\D\\S]{2}", "1 " },
+        { "bar(?=f.)foo", "barfoo" },
+        { "bar(?=foo)bar", "barbar", false },
+        { "bar(?!foo)bar", "barbar", true },
+        { "bar(?!bar)bar", "barbar", false },
+        { "bar.*(?<=foo)", "barbar", false },
+        { "bar.*(? re(test.pattern, test.options);
+#ifdef REGEX_DEBUG
+        dbg() << "\n";
+        RegexDebug regex_dbg(stderr);
+        regex_dbg.print_raw_bytecode(re);
+        regex_dbg.print_header();
+        regex_dbg.print_bytecode(re);
+        dbg() << "\n";
+#endif
+        EXPECT_EQ(re.parser_result.error, Error::NoError);
+        EXPECT_EQ(re.match(test.subject).success, test.matches);
+    }
+}
+
 TEST_MAIN(Regex)