mirror of
https://github.com/RGBCube/serenity
synced 2025-07-27 08:37:46 +00:00
Libraries: Move to Userland/Libraries/
This commit is contained in:
parent
dc28c07fa5
commit
13d7c09125
1857 changed files with 266 additions and 274 deletions
256
Userland/Libraries/LibRegex/C/Regex.cpp
Normal file
256
Userland/Libraries/LibRegex/C/Regex.cpp
Normal file
|
@ -0,0 +1,256 @@
|
|||
/*
|
||||
* Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include <AK/String.h>
|
||||
#include <AK/StringBuilder.h>
|
||||
#include <LibRegex/Regex.h>
|
||||
#include <ctype.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
#ifdef __serenity__
|
||||
# include <regex.h>
|
||||
#else
|
||||
# include <LibC/regex.h>
|
||||
#endif
|
||||
|
||||
// Private state behind a POSIX regex_t; regcomp() allocates one and stores it in regex_t::__data.
// regcomp() brace-initializes this struct positionally, so the member order must not change.
struct internal_regex_t {
|
||||
// Compilation flags as handed to regcomp() (narrowed to u8 on assignment).
u8 cflags;
|
||||
// NOTE(review): no code visible in this file writes this member — confirm whether it is still needed.
u8 eflags;
|
||||
// The compiled expression; remains null when regcomp() returns before compiling.
OwnPtr<Regex<PosixExtended>> re;
|
||||
// Position of the offending token, recorded by regcomp() when parsing fails.
size_t re_pat_errpos;
|
||||
// Parse error recorded by regcomp(); initialized to REG_NOERR.
ReError re_pat_err;
|
||||
// Copy of the pattern text, stored by regcomp() only when parsing fails.
String re_pat;
|
||||
// Capture group count reported by the parser after a successful regcomp().
size_t re_nsub;
|
||||
};
|
||||
|
||||
// Returns the private implementation object stored in a regex_t handle.
// A null handle yields nullptr; otherwise the (possibly null) __data pointer is returned.
static internal_regex_t* impl_from(regex_t* re)
{
    return re ? reinterpret_cast<internal_regex_t*>(re->__data) : nullptr;
}
|
||||
|
||||
// Const-handle flavour of impl_from(): drops constness from the handle, defers to the
// mutable overload, and hands the result back as a pointer-to-const.
static const internal_regex_t* impl_from(const regex_t* re)
{
    auto* mutable_handle = const_cast<regex_t*>(re);
    return impl_from(mutable_handle);
}
|
||||
|
||||
extern "C" {
|
||||
|
||||
// Compiles `pattern` into `reg`.
//
// Returns REG_ESPACE for a null handle, REG_ENOSYS when REG_EXTENDED is not requested,
// the parser's error code when the pattern is malformed, and REG_NOERR on success.
// In every case except the null-handle one, `reg->__data` owns a freshly allocated
// internal_regex_t afterwards, which regfree() releases.
int regcomp(regex_t* reg, const char* pattern, int cflags)
{
    if (!reg)
        return REG_ESPACE;

    // Note that subsequent uses of regcomp() without regfree() _will_ leak memory
    // This could've been prevented if libc provided a reginit() or similar, but it does not.
    reg->__data = new internal_regex_t { 0, 0, {}, 0, ReError::REG_NOERR, {}, 0 };
    auto* impl = impl_from(reg);

    // Only the extended syntax has a parser behind it here.
    const bool wants_extended_syntax = cflags & REG_EXTENDED;
    if (!wants_extended_syntax)
        return REG_ENOSYS;

    impl->cflags = cflags;

    String pattern_str(pattern);
    impl->re = make<Regex<PosixExtended>>(pattern_str, PosixOptions {} | (PosixFlags)cflags | PosixFlags::SkipTrimEmptyMatches);

    auto parse_outcome = impl->re->parser_result;
    if (parse_outcome.error == regex::Error::NoError) {
        impl->re_nsub = parse_outcome.capture_groups_count;
        return REG_NOERR;
    }

    // Keep enough context around for regexec()/regerror() to report the failure later.
    impl->re_pat_errpos = parse_outcome.error_token.position();
    impl->re_pat_err = (ReError)parse_outcome.error;
    impl->re_pat = pattern;

    dbg() << "Have Error: " << (ReError)parse_outcome.error;

    return (ReError)parse_outcome.error;
}
|
||||
|
||||
int regexec(const regex_t* reg, const char* string, size_t nmatch, regmatch_t pmatch[], int eflags)
|
||||
{
|
||||
auto preg = impl_from(reg);
|
||||
|
||||
if (!preg->re || preg->re_pat_err) {
|
||||
if (preg->re_pat_err)
|
||||
return preg->re_pat_err;
|
||||
return REG_BADPAT;
|
||||
}
|
||||
|
||||
RegexResult result;
|
||||
if (eflags & REG_SEARCH)
|
||||
result = preg->re->search(string, PosixOptions {} | (PosixFlags)eflags);
|
||||
else
|
||||
result = preg->re->match(string, PosixOptions {} | (PosixFlags)eflags);
|
||||
|
||||
if (result.success) {
|
||||
auto size = result.matches.size();
|
||||
if (size && nmatch && pmatch) {
|
||||
pmatch[0].rm_cnt = size;
|
||||
|
||||
size_t match_index { 0 };
|
||||
for (size_t i = 0; i < size; ++i) {
|
||||
pmatch[match_index].rm_so = result.matches.at(i).global_offset;
|
||||
pmatch[match_index].rm_eo = pmatch[match_index].rm_so + result.matches.at(i).view.length();
|
||||
if (match_index > 0)
|
||||
pmatch[match_index].rm_cnt = result.capture_group_matches.size();
|
||||
|
||||
++match_index;
|
||||
if (match_index >= nmatch)
|
||||
return REG_NOERR;
|
||||
|
||||
if (i < result.capture_group_matches.size()) {
|
||||
auto capture_groups_size = result.capture_group_matches.at(i).size();
|
||||
for (size_t j = 0; j < preg->re->parser_result.capture_groups_count; ++j) {
|
||||
if (j >= capture_groups_size || !result.capture_group_matches.at(i).at(j).view.length()) {
|
||||
pmatch[match_index].rm_so = -1;
|
||||
pmatch[match_index].rm_eo = -1;
|
||||
pmatch[match_index].rm_cnt = 0;
|
||||
} else {
|
||||
pmatch[match_index].rm_so = result.capture_group_matches.at(i).at(j).global_offset;
|
||||
pmatch[match_index].rm_eo = pmatch[match_index].rm_so + result.capture_group_matches.at(i).at(j).view.length();
|
||||
pmatch[match_index].rm_cnt = 1;
|
||||
}
|
||||
|
||||
++match_index;
|
||||
if (match_index >= nmatch)
|
||||
return REG_NOERR;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (match_index < nmatch) {
|
||||
for (size_t i = match_index; i < nmatch; ++i) {
|
||||
pmatch[i].rm_so = -1;
|
||||
pmatch[i].rm_eo = -1;
|
||||
pmatch[i].rm_cnt = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
return REG_NOERR;
|
||||
} else {
|
||||
if (nmatch && pmatch) {
|
||||
pmatch[0].rm_so = -1;
|
||||
pmatch[0].rm_eo = -1;
|
||||
pmatch[0].rm_cnt = 0;
|
||||
}
|
||||
}
|
||||
|
||||
return REG_NOMATCH;
|
||||
}
|
||||
|
||||
// Translates a ReError code into its human-readable description.
// Codes without an entry below produce a default-constructed String.
inline static String get_error(ReError errcode)
{
    switch (errcode) {
    case REG_NOERR:
        return String("No error");
    case REG_NOMATCH:
        return String("regexec() failed to match.");
    case REG_BADPAT:
        return String("Invalid regular expression.");
    case REG_ECOLLATE:
        return String("Invalid collating element referenced.");
    case REG_ECTYPE:
        return String("Invalid character class type referenced.");
    case REG_EESCAPE:
        return String("Trailing \\ in pattern.");
    case REG_ESUBREG:
        return String("Number in \\digit invalid or in error.");
    case REG_EBRACK:
        return String("[ ] imbalance.");
    case REG_EPAREN:
        return String("\\( \\) or ( ) imbalance.");
    case REG_EBRACE:
        return String("\\{ \\} imbalance.");
    case REG_BADBR:
        return String("Content of \\{ \\} invalid: not a number, number too large, more than two numbers, first larger than second.");
    case REG_ERANGE:
        return String("Invalid endpoint in range expression.");
    case REG_ESPACE:
        return String("Out of memory.");
    case REG_BADRPT:
        return String("?, * or + not preceded by valid regular expression.");
    case REG_ENOSYS:
        return String("The implementation does not support the function.");
    case REG_EMPTY_EXPR:
        return String("Empty expression provided");
    }

    return String();
}
|
||||
|
||||
size_t regerror(int errcode, const regex_t* reg, char* errbuf, size_t errbuf_size)
|
||||
{
|
||||
String error;
|
||||
auto preg = impl_from(reg);
|
||||
|
||||
if (!preg)
|
||||
error = get_error((ReError)errcode);
|
||||
else
|
||||
error = preg->re->error_string(get_error(preg->re_pat_err));
|
||||
|
||||
if (!errbuf_size)
|
||||
return error.length();
|
||||
|
||||
if (!error.copy_characters_to_buffer(errbuf, errbuf_size))
|
||||
return 0;
|
||||
|
||||
return error.length();
|
||||
}
|
||||
|
||||
// Releases the implementation object owned by `reg` and clears the handle's __data pointer.
// Does nothing for a null handle or a handle that holds no implementation object.
void regfree(regex_t* reg)
{
    auto* impl = impl_from(reg);
    if (!impl)
        return;

    delete impl;
    reg->__data = nullptr;
}
|
||||
}
|
10
Userland/Libraries/LibRegex/CMakeLists.txt
Normal file
10
Userland/Libraries/LibRegex/CMakeLists.txt
Normal file
|
@ -0,0 +1,10 @@
|
|||
# Source files that make up LibRegex, relative to this directory.
set(SOURCES
|
||||
C/Regex.cpp
|
||||
RegexByteCode.cpp
|
||||
RegexLexer.cpp
|
||||
RegexMatcher.cpp
|
||||
RegexParser.cpp
|
||||
)
|
||||
|
||||
# NOTE(review): serenity_lib() is a project macro defined elsewhere; presumably it builds
# the LibRegex target from SOURCES with the output name "regex" — confirm.
serenity_lib(LibRegex regex)
|
||||
# LibRegex links against LibC and LibCore.
target_link_libraries(LibRegex LibC LibCore)
|
58
Userland/Libraries/LibRegex/Forward.h
Normal file
58
Userland/Libraries/LibRegex/Forward.h
Normal file
|
@ -0,0 +1,58 @@
|
|||
/*
|
||||
* Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <AK/Types.h>
|
||||
|
||||
// Forward declarations of LibRegex types, so headers can name them without pulling in
// the full definitions.
namespace regex {
|
||||
enum class Error : u8;
|
||||
class Lexer;
|
||||
class PosixExtendedParser;
|
||||
class ECMA262Parser;
|
||||
|
||||
// Bytecode container and opcode classes.
class ByteCode;
|
||||
class OpCode;
|
||||
class OpCode_Exit;
|
||||
class OpCode_Jump;
|
||||
class OpCode_ForkJump;
|
||||
class OpCode_ForkStay;
|
||||
class OpCode_CheckBegin;
|
||||
class OpCode_CheckEnd;
|
||||
class OpCode_SaveLeftCaptureGroup;
|
||||
class OpCode_SaveRightCaptureGroup;
|
||||
class OpCode_SaveLeftNamedCaptureGroup;
|
||||
// NOTE(review): looks like a transposed duplicate of OpCode_SaveLeftNamedCaptureGroup;
// no class of this name appears in the visible sources — confirm before removing.
class OpCode_SaveNamedLeftCaptureGroup;
|
||||
class OpCode_SaveRightNamedCaptureGroup;
|
||||
class OpCode_Compare;
|
||||
class RegexStringView;
|
||||
}
|
||||
|
||||
using regex::ECMA262Parser;
|
||||
using regex::Error;
|
||||
using regex::Lexer;
|
||||
using regex::PosixExtendedParser;
|
||||
using regex::RegexStringView;
|
31
Userland/Libraries/LibRegex/Regex.h
Normal file
31
Userland/Libraries/LibRegex/Regex.h
Normal file
|
@ -0,0 +1,31 @@
|
|||
/*
|
||||
* Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <LibRegex/Forward.h>
|
||||
#include <LibRegex/RegexDebug.h>
|
||||
#include <LibRegex/RegexMatcher.h>
|
749
Userland/Libraries/LibRegex/RegexByteCode.cpp
Normal file
749
Userland/Libraries/LibRegex/RegexByteCode.cpp
Normal file
|
@ -0,0 +1,749 @@
|
|||
/*
|
||||
* Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "RegexByteCode.h"
|
||||
#include "AK/StringBuilder.h"
|
||||
#include "RegexDebug.h"
|
||||
|
||||
#include <ctype.h>
|
||||
|
||||
namespace regex {
|
||||
|
||||
// Returns the enumerator spelling of `opcode_id`.
// ENUMERATE_OPCODES (defined elsewhere) is expected to invoke __ENUMERATE_OPCODE once per
// opcode; each invocation becomes a case that returns the stringified enumerator.
// Values without a case trip ASSERT_NOT_REACHED().
const char* OpCode::name(OpCodeId opcode_id)
{
#define __ENUMERATE_OPCODE(x) \
    case OpCodeId::x:         \
        return #x;

    switch (opcode_id) {
        ENUMERATE_OPCODES
    default:
        ASSERT_NOT_REACHED();
        return "<Unknown>";
    }

#undef __ENUMERATE_OPCODE
}
|
||||
|
||||
const char* OpCode::name() const
|
||||
{
|
||||
return name(opcode_id());
|
||||
}
|
||||
|
||||
// Returns the enumerator spelling of an ExecutionResult value.
// ENUMERATE_EXECUTION_RESULTS (defined elsewhere) is expected to invoke
// __ENUMERATE_EXECUTION_RESULT once per enumerator. Values without a case trip
// ASSERT_NOT_REACHED().
const char* execution_result_name(ExecutionResult result)
{
#define __ENUMERATE_EXECUTION_RESULT(x) \
    case ExecutionResult::x:            \
        return #x;

    switch (result) {
        ENUMERATE_EXECUTION_RESULTS
    default:
        ASSERT_NOT_REACHED();
        return "<Unknown>";
    }

#undef __ENUMERATE_EXECUTION_RESULT
}
|
||||
|
||||
// Returns the enumerator spelling of a BoundaryCheckType value.
// ENUMERATE_BOUNDARY_CHECK_TYPES (defined elsewhere) is expected to invoke
// __ENUMERATE_BOUNDARY_CHECK_TYPE once per enumerator. Values without a case trip
// ASSERT_NOT_REACHED().
const char* boundary_check_type_name(BoundaryCheckType ty)
{
#define __ENUMERATE_BOUNDARY_CHECK_TYPE(x) \
    case BoundaryCheckType::x:             \
        return #x;

    switch (ty) {
        ENUMERATE_BOUNDARY_CHECK_TYPES
    default:
        ASSERT_NOT_REACHED();
        return "<Unknown>";
    }

#undef __ENUMERATE_BOUNDARY_CHECK_TYPE
}
|
||||
|
||||
// Returns the enumerator spelling of a CharacterCompareType value.
// ENUMERATE_CHARACTER_COMPARE_TYPES (defined elsewhere) is expected to invoke
// __ENUMERATE_CHARACTER_COMPARE_TYPE once per enumerator. Values without a case trip
// ASSERT_NOT_REACHED().
const char* character_compare_type_name(CharacterCompareType ch_compare_type)
{
#define __ENUMERATE_CHARACTER_COMPARE_TYPE(x) \
    case CharacterCompareType::x:             \
        return #x;

    switch (ch_compare_type) {
        ENUMERATE_CHARACTER_COMPARE_TYPES
    default:
        ASSERT_NOT_REACHED();
        return "<Unknown>";
    }

#undef __ENUMERATE_CHARACTER_COMPARE_TYPE
}
|
||||
|
||||
// Returns the enumerator spelling of a CharClass value.
// ENUMERATE_CHARACTER_CLASSES (defined elsewhere) is expected to invoke
// __ENUMERATE_CHARACTER_CLASS once per enumerator. Values without a case trip
// ASSERT_NOT_REACHED().
static const char* character_class_name(CharClass ch_class)
{
#define __ENUMERATE_CHARACTER_CLASS(x) \
    case CharClass::x:                 \
        return #x;

    switch (ch_class) {
        ENUMERATE_CHARACTER_CLASSES
    default:
        ASSERT_NOT_REACHED();
        return "<Unknown>";
    }

#undef __ENUMERATE_CHARACTER_CLASS
}
|
||||
|
||||
// Table of opcode instances keyed by the numeric OpCodeId, shared by every ByteCode.
// ByteCode::get_opcode_by_id() fills it the first time it finds it empty.
HashMap<u32, OwnPtr<OpCode>> ByteCode::s_opcodes {};
|
||||
|
||||
// Looks up the shared OpCode instance for `id` and rebinds it to this ByteCode.
//
// The shared table is populated on first use with one instance per id in
// [OpCodeId::First, OpCodeId::Last]. Ids above OpCodeId::Last yield nullptr.
ALWAYS_INLINE OpCode* ByteCode::get_opcode_by_id(OpCodeId id) const
{
    auto& self = *const_cast<ByteCode*>(this);

    if (s_opcodes.size() == 0) {
        // Every handled OpCodeId::X is backed by a class named OpCode_X.
#define REGISTER_OPCODE(x)                        \
    case OpCodeId::x:                             \
        s_opcodes.set(i, make<OpCode_##x>(self)); \
        break;

        for (u32 i = (u32)OpCodeId::First; i <= (u32)OpCodeId::Last; ++i) {
            switch ((OpCodeId)i) {
                REGISTER_OPCODE(Exit)
                REGISTER_OPCODE(Jump)
                REGISTER_OPCODE(Compare)
                REGISTER_OPCODE(CheckEnd)
                REGISTER_OPCODE(CheckBoundary)
                REGISTER_OPCODE(ForkJump)
                REGISTER_OPCODE(ForkStay)
                REGISTER_OPCODE(FailForks)
                REGISTER_OPCODE(Save)
                REGISTER_OPCODE(Restore)
                REGISTER_OPCODE(GoBack)
                REGISTER_OPCODE(CheckBegin)
                REGISTER_OPCODE(SaveLeftCaptureGroup)
                REGISTER_OPCODE(SaveRightCaptureGroup)
                REGISTER_OPCODE(SaveLeftNamedCaptureGroup)
                REGISTER_OPCODE(SaveRightNamedCaptureGroup)
            }
        }

#undef REGISTER_OPCODE
    }

    if (id > OpCodeId::Last)
        return nullptr;

    // The instances are shared, so point the one we hand out at this bytecode.
    auto* shared_instance = const_cast<OpCode*>(s_opcodes.get((u32)id).value());
    return shared_instance->set_bytecode(self);
}
|
||||
|
||||
// Fetches the opcode at the state's instruction position and attaches `state` to it.
// Running past the end of the bytecode yields the Exit opcode; an unknown opcode id
// yields nullptr.
OpCode* ByteCode::get_opcode(MatchState& state) const
{
    const bool ran_off_the_end = state.instruction_position >= size();

    OpCode* fetched = ran_off_the_end
        ? get_opcode_by_id(OpCodeId::Exit)
        : get_opcode_by_id((OpCodeId)at(state.instruction_position));

    if (!fetched)
        return nullptr;

    fetched->set_state(state);
    return fetched;
}
|
||||
|
||||
// Exit: succeeds once the string position has moved beyond the input or the instruction
// position has reached the end of the bytecode; fails otherwise.
ALWAYS_INLINE ExecutionResult OpCode_Exit::execute(const MatchInput& input, MatchState& state, MatchOutput&) const
{
    const bool finished = state.string_position > input.view.length()
        || state.instruction_position >= m_bytecode->size();

    return finished ? ExecutionResult::Succeeded : ExecutionResult::Failed;
}
|
||||
|
||||
// Save: pushes the current string position onto the input's saved-position list.
ALWAYS_INLINE ExecutionResult OpCode_Save::execute(const MatchInput& input, MatchState& state, MatchOutput&) const
{
    auto& saved = input.saved_positions;
    saved.append(state.string_position);

    return ExecutionResult::Continue;
}
|
||||
|
||||
// Restore: pops the most recently saved string position back into the state.
// Fails when no position has been saved.
ALWAYS_INLINE ExecutionResult OpCode_Restore::execute(const MatchInput& input, MatchState& state, MatchOutput&) const
{
    auto& saved = input.saved_positions;

    if (!saved.is_empty()) {
        state.string_position = saved.take_last();
        return ExecutionResult::Continue;
    }

    return ExecutionResult::Failed;
}
|
||||
|
||||
// GoBack: rewinds the string position by count() characters.
// Fails (letting low-priority forks run) when that would move before the start.
ALWAYS_INLINE ExecutionResult OpCode_GoBack::execute(const MatchInput&, MatchState& state, MatchOutput&) const
{
    const auto steps = count();

    if (steps > state.string_position)
        return ExecutionResult::Failed_ExecuteLowPrioForks;

    state.string_position -= steps;
    return ExecutionResult::Continue;
}
|
||||
|
||||
// FailForks: raises the input's fail counter by count() - 1 and fails the current path
// so that low-priority forks get executed. count() must be positive.
ALWAYS_INLINE ExecutionResult OpCode_FailForks::execute(const MatchInput& input, MatchState&, MatchOutput&) const
{
    ASSERT(count() > 0);

    const auto additional_failures = count() - 1;
    input.fail_counter += additional_failures;

    return ExecutionResult::Failed_ExecuteLowPrioForks;
}
|
||||
|
||||
// Jump: moves the instruction position by this opcode's offset().
ALWAYS_INLINE ExecutionResult OpCode_Jump::execute(const MatchInput&, MatchState& state, MatchOutput&) const
{
    auto& instruction_position = state.instruction_position;
    instruction_position += offset();

    return ExecutionResult::Continue;
}
|
||||
|
||||
// ForkJump: records the fork target (current instruction position + size() + offset())
// and requests a high-priority fork.
ALWAYS_INLINE ExecutionResult OpCode_ForkJump::execute(const MatchInput&, MatchState& state, MatchOutput&) const
{
    const auto past_this_opcode = state.instruction_position + size();
    state.fork_at_position = past_this_opcode + offset();

    return ExecutionResult::Fork_PrioHigh;
}
|
||||
|
||||
// ForkStay: records the fork target (current instruction position + size() + offset())
// and requests a low-priority fork.
ALWAYS_INLINE ExecutionResult OpCode_ForkStay::execute(const MatchInput&, MatchState& state, MatchOutput&) const
{
    const auto past_this_opcode = state.instruction_position + size();
    state.fork_at_position = past_this_opcode + offset();

    return ExecutionResult::Fork_PrioLow;
}
|
||||
|
||||
// CheckBegin: start-of-line assertion.
//
// Outcome by (string position, MatchNotBeginOfLine):
//   at position 0, flag clear -> Continue
//   at position 0, flag set   -> fail
//   elsewhere,     flag set   -> Continue
//   elsewhere,     flag clear -> fail
// The Global flag never changes the result: position 0 is fully decided by
// MatchNotBeginOfLine alone.
ALWAYS_INLINE ExecutionResult OpCode_CheckBegin::execute(const MatchInput& input, MatchState& state, MatchOutput&) const
{
    const bool at_start = state.string_position == 0;
    const bool not_begin_of_line = !!(input.regex_options & AllFlags::MatchNotBeginOfLine);

    if (at_start)
        return not_begin_of_line ? ExecutionResult::Failed_ExecuteLowPrioForks : ExecutionResult::Continue;

    return not_begin_of_line ? ExecutionResult::Continue : ExecutionResult::Failed_ExecuteLowPrioForks;
}
|
||||
|
||||
// CheckBoundary: word-boundary assertion.
//
// A word character is alphanumeric or '_'. The current position is a boundary when
//   - it is at the end of the input and the preceding character is a word character,
//   - it is at the start of a non-empty input and the first character is a word character,
//   - or, in between, exactly one of the two neighbouring characters is a word character.
// Word continues on a boundary, NonWord continues off one; the other case fails.
ALWAYS_INLINE ExecutionResult OpCode_CheckBoundary::execute(const MatchInput& input, MatchState& state, MatchOutput&) const
{
    auto is_word_char = [](auto ch) { return isalnum(ch) || ch == '_'; };

    auto at_word_boundary = [&]() -> bool {
        const auto position = state.string_position;

        if (position == input.view.length())
            return position > 0 && is_word_char(input.view[position - 1]);

        if (position == 0)
            return is_word_char(input.view[0]);

        return is_word_char(input.view[position]) != is_word_char(input.view[position - 1]);
    };

    switch (type()) {
    case BoundaryCheckType::Word:
        return at_word_boundary() ? ExecutionResult::Continue : ExecutionResult::Failed_ExecuteLowPrioForks;
    case BoundaryCheckType::NonWord:
        return at_word_boundary() ? ExecutionResult::Failed_ExecuteLowPrioForks : ExecutionResult::Continue;
    }

    ASSERT_NOT_REACHED();
}
|
||||
|
||||
// CheckEnd: end-of-line assertion.
//
// At the end of the input it continues unless MatchNotEndOfLine is set. Anywhere else it
// continues only when MatchNotEndOfLine or MatchNotBeginOfLine is set. Every other
// combination fails the current path.
ALWAYS_INLINE ExecutionResult OpCode_CheckEnd::execute(const MatchInput& input, MatchState& state, MatchOutput&) const
{
    const bool at_end = state.string_position == input.view.length();
    const bool not_end_of_line = !!(input.regex_options & AllFlags::MatchNotEndOfLine);

    if (at_end)
        return not_end_of_line ? ExecutionResult::Failed_ExecuteLowPrioForks : ExecutionResult::Continue;

    const bool not_begin_of_line = !!(input.regex_options & AllFlags::MatchNotBeginOfLine);
    if (not_end_of_line || not_begin_of_line)
        return ExecutionResult::Continue;

    return ExecutionResult::Failed_ExecuteLowPrioForks;
}
|
||||
|
||||
// SaveLeftCaptureGroup: records the current string position as the left edge of capture
// group id() for the current match, growing the output vectors first when they do not
// yet reach the match index / group id.
ALWAYS_INLINE ExecutionResult OpCode_SaveLeftCaptureGroup::execute(const MatchInput& input, MatchState& state, MatchOutput& output) const
{
    // If `index` is not yet a valid slot, reserve room for it and append default entries
    // until the size exceeds the capacity observed right after reserving.
    auto grow_to_hold = [](auto& entries, size_t index) {
        if (index < entries.size())
            return;

        entries.ensure_capacity(index);
        const auto observed_capacity = entries.capacity();
        for (size_t slot = entries.size(); slot <= observed_capacity; ++slot)
            entries.empend();
    };

    grow_to_hold(output.capture_group_matches, input.match_index);

    auto& groups_of_match = output.capture_group_matches.at(input.match_index);
    grow_to_hold(groups_of_match, id());

    groups_of_match.at(id()).left_column = state.string_position;
    return ExecutionResult::Continue;
}
|
||||
|
||||
// SaveRightCaptureGroup: closes capture group id() of the current match at the current
// string position and stores the captured text in its slot.
//
// The capture spans from the slot's left_column to the current string position. Nothing
// is stored when left_column lies before the slot's existing column. With
// StringCopyMatches the text is copied; otherwise the slot refers to the input view.
ALWAYS_INLINE ExecutionResult OpCode_SaveRightCaptureGroup::execute(const MatchInput& input, MatchState& state, MatchOutput& output) const
{
    auto& slot = output.capture_group_matches.at(input.match_index).at(id());
    auto capture_start = slot.left_column;

    if (capture_start < slot.column)
        return ExecutionResult::Continue;

    auto capture_length = state.string_position - capture_start;
    ASSERT(capture_start + capture_length <= input.view.length());

    auto captured = input.view.substring_view(capture_start, capture_length);
    const bool copy_text = !!(input.regex_options & AllFlags::StringCopyMatches);

    if (copy_text)
        slot = { captured.to_string(), input.line, capture_start, input.global_offset + capture_start }; // owns a copy of the text
    else
        slot = { captured, input.line, capture_start, input.global_offset + capture_start }; // refers to the original string

    return ExecutionResult::Continue;
}
|
||||
|
||||
// SaveLeftNamedCaptureGroup: records the current string position as the start column of
// the named capture group name() for the current match, creating the entry if needed and
// growing the per-match list first when it does not yet reach the match index.
ALWAYS_INLINE ExecutionResult OpCode_SaveLeftNamedCaptureGroup::execute(const MatchInput& input, MatchState& state, MatchOutput& output) const
{
    auto& per_match_groups = output.named_capture_group_matches;

    if (input.match_index >= per_match_groups.size()) {
        per_match_groups.ensure_capacity(input.match_index);

        // Append default entries until the size exceeds the capacity observed here.
        const auto observed_capacity = per_match_groups.capacity();
        for (size_t slot = per_match_groups.size(); slot <= observed_capacity; ++slot)
            per_match_groups.empend();
    }

    auto& groups_of_match = per_match_groups.at(input.match_index);
    groups_of_match.ensure(name()).column = state.string_position;

    return ExecutionResult::Continue;
}
|
||||
|
||||
// SaveRightNamedCaptureGroup: closes the named capture group name() of the current match
// at the current string position and stores the captured text under that name.
//
// The capture spans from the column recorded for the group to the current string
// position. With StringCopyMatches the text is copied; otherwise the entry refers to the
// input view. If no entry exists for the name, a diagnostic is written to stderr and
// execution simply continues.
ALWAYS_INLINE ExecutionResult OpCode_SaveRightNamedCaptureGroup::execute(const MatchInput& input, MatchState& state, MatchOutput& output) const
{
    StringView capture_group_name = name();
    auto& map = output.named_capture_group_matches.at(input.match_index);

    if (!map.contains(capture_group_name)) {
        // match_index is used as a container index throughout, so it is printed with the
        // size_t conversion (%zu) rather than %lu.
        fprintf(stderr, "Didn't find corresponding capture group match for name=%s, match_index=%zu\n", capture_group_name.to_string().characters(), input.match_index);
        return ExecutionResult::Continue;
    }

    auto start_position = map.ensure(capture_group_name).column;
    auto length = state.string_position - start_position;

    // Checked once, ahead of both the debug output and the real substring below.
    ASSERT(start_position + length <= input.view.length());

#ifdef REGEX_DEBUG
    dbg() << "Save named capture group with name=" << capture_group_name << " and content: " << input.view.substring_view(start_position, length).to_string();
#endif

    auto view = input.view.substring_view(start_position, length);
    if (input.regex_options & AllFlags::StringCopyMatches) {
        map.set(capture_group_name, { view.to_string(), input.line, start_position, input.global_offset + start_position }); // create a copy of the original string
    } else {
        map.set(capture_group_name, { view, input.line, start_position, input.global_offset + start_position }); // take view to original string
    }

    return ExecutionResult::Continue;
}
|
||||
|
||||
ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(const MatchInput& input, MatchState& state, MatchOutput& output) const
|
||||
{
|
||||
bool inverse { false };
|
||||
bool temporary_inverse { false };
|
||||
bool reset_temp_inverse { false };
|
||||
|
||||
auto current_inversion_state = [&]() -> bool { return temporary_inverse ^ inverse; };
|
||||
|
||||
size_t string_position = state.string_position;
|
||||
bool inverse_matched { false };
|
||||
|
||||
size_t offset { state.instruction_position + 3 };
|
||||
for (size_t i = 0; i < arguments_count(); ++i) {
|
||||
if (state.string_position > string_position)
|
||||
break;
|
||||
|
||||
if (reset_temp_inverse) {
|
||||
reset_temp_inverse = false;
|
||||
temporary_inverse = false;
|
||||
} else {
|
||||
reset_temp_inverse = true;
|
||||
}
|
||||
|
||||
auto compare_type = (CharacterCompareType)m_bytecode->at(offset++);
|
||||
|
||||
if (compare_type == CharacterCompareType::Inverse)
|
||||
inverse = true;
|
||||
|
||||
else if (compare_type == CharacterCompareType::TemporaryInverse) {
|
||||
// If "TemporaryInverse" is given, negate the current inversion state only for the next opcode.
|
||||
// it follows that this cannot be the last compare element.
|
||||
ASSERT(i != arguments_count() - 1);
|
||||
|
||||
temporary_inverse = true;
|
||||
reset_temp_inverse = false;
|
||||
|
||||
} else if (compare_type == CharacterCompareType::Char) {
|
||||
u32 ch = m_bytecode->at(offset++);
|
||||
|
||||
// We want to compare a string that is longer or equal in length to the available string
|
||||
if (input.view.length() - state.string_position < 1)
|
||||
return ExecutionResult::Failed_ExecuteLowPrioForks;
|
||||
|
||||
compare_char(input, state, ch, current_inversion_state(), inverse_matched);
|
||||
|
||||
} else if (compare_type == CharacterCompareType::AnyChar) {
|
||||
// We want to compare a string that is definitely longer than the available string
|
||||
if (input.view.length() - state.string_position < 1)
|
||||
return ExecutionResult::Failed_ExecuteLowPrioForks;
|
||||
|
||||
ASSERT(!current_inversion_state());
|
||||
++state.string_position;
|
||||
|
||||
} else if (compare_type == CharacterCompareType::String) {
|
||||
ASSERT(!current_inversion_state());
|
||||
|
||||
const auto& length = m_bytecode->at(offset++);
|
||||
StringBuilder str_builder;
|
||||
for (size_t i = 0; i < length; ++i)
|
||||
str_builder.append(m_bytecode->at(offset++));
|
||||
|
||||
// We want to compare a string that is definitely longer than the available string
|
||||
if (input.view.length() - state.string_position < length)
|
||||
return ExecutionResult::Failed_ExecuteLowPrioForks;
|
||||
|
||||
if (!compare_string(input, state, str_builder.string_view().characters_without_null_termination(), length))
|
||||
return ExecutionResult::Failed_ExecuteLowPrioForks;
|
||||
|
||||
} else if (compare_type == CharacterCompareType::CharClass) {
|
||||
|
||||
if (input.view.length() - state.string_position < 1)
|
||||
return ExecutionResult::Failed_ExecuteLowPrioForks;
|
||||
|
||||
auto character_class = (CharClass)m_bytecode->at(offset++);
|
||||
auto ch = input.view[state.string_position];
|
||||
|
||||
compare_character_class(input, state, character_class, ch, current_inversion_state(), inverse_matched);
|
||||
|
||||
} else if (compare_type == CharacterCompareType::CharRange) {
|
||||
auto value = (CharRange)m_bytecode->at(offset++);
|
||||
|
||||
auto from = value.from;
|
||||
auto to = value.to;
|
||||
auto ch = input.view[state.string_position];
|
||||
|
||||
compare_character_range(input, state, from, to, ch, current_inversion_state(), inverse_matched);
|
||||
|
||||
} else if (compare_type == CharacterCompareType::Reference) {
|
||||
auto reference_number = (size_t)m_bytecode->at(offset++);
|
||||
auto& groups = output.capture_group_matches.at(input.match_index);
|
||||
if (groups.size() <= reference_number)
|
||||
return ExecutionResult::Failed_ExecuteLowPrioForks;
|
||||
|
||||
auto str = groups.at(reference_number).view;
|
||||
|
||||
// We want to compare a string that is definitely longer than the available string
|
||||
if (input.view.length() - state.string_position < str.length())
|
||||
return ExecutionResult::Failed_ExecuteLowPrioForks;
|
||||
|
||||
if (!compare_string(input, state, str.characters_without_null_termination(), str.length()))
|
||||
return ExecutionResult::Failed_ExecuteLowPrioForks;
|
||||
|
||||
} else if (compare_type == CharacterCompareType::NamedReference) {
|
||||
auto ptr = (const char*)m_bytecode->at(offset++);
|
||||
auto length = (size_t)m_bytecode->at(offset++);
|
||||
StringView name { ptr, length };
|
||||
|
||||
auto group = output.named_capture_group_matches.at(input.match_index).get(name);
|
||||
if (!group.has_value())
|
||||
return ExecutionResult::Failed_ExecuteLowPrioForks;
|
||||
|
||||
auto str = group.value().view;
|
||||
|
||||
// We want to compare a string that is definitely longer than the available string
|
||||
if (input.view.length() - state.string_position < str.length())
|
||||
return ExecutionResult::Failed_ExecuteLowPrioForks;
|
||||
|
||||
if (!compare_string(input, state, str.characters_without_null_termination(), str.length()))
|
||||
return ExecutionResult::Failed_ExecuteLowPrioForks;
|
||||
|
||||
} else {
|
||||
fprintf(stderr, "Undefined comparison: %i\n", (int)compare_type);
|
||||
ASSERT_NOT_REACHED();
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (current_inversion_state() && !inverse_matched)
|
||||
++state.string_position;
|
||||
|
||||
if (string_position == state.string_position || state.string_position > input.view.length())
|
||||
return ExecutionResult::Failed_ExecuteLowPrioForks;
|
||||
|
||||
return ExecutionResult::Continue;
|
||||
}
|
||||
|
||||
// Compares the input character at the current string position against `ch1`.
// With the Insensitive option set, both characters are lowercased first.
// On equality: a non-inverted compare advances the string position by one,
// an inverted compare only records the hit in `inverse_matched`.
// On inequality nothing is modified.
ALWAYS_INLINE void OpCode_Compare::compare_char(const MatchInput& input, MatchState& state, u32 ch1, bool inverse, bool& inverse_matched)
{
    u32 input_ch = input.view[state.string_position];

    if (input.regex_options & AllFlags::Insensitive) {
        ch1 = tolower(ch1);
        input_ch = tolower(input_ch);
    }

    if (ch1 != input_ch)
        return;

    if (inverse) {
        inverse_matched = true;
        return;
    }

    ++state.string_position;
}
|
||||
|
||||
// Compares `length` characters of `str` against the input starting at the
// current string position. Returns true and advances the string position by
// `length` when they are equal (after lowercasing both sides if the
// Insensitive option is set); returns false otherwise.
// Only u8 views are handled: any other view kind always yields false.
// The caller is expected to have verified that `length` characters are available.
ALWAYS_INLINE bool OpCode_Compare::compare_string(const MatchInput& input, MatchState& state, const char* str, size_t length)
{
    if (!input.view.is_u8_view())
        return false;

    StringView expected { str, length };
    StringView actual { &input.view.u8view()[state.string_position], length };

    // Backing storage for the lowercased copies; must outlive the views above.
    String lowered_expected;
    String lowered_actual;
    if (input.regex_options & AllFlags::Insensitive) {
        lowered_expected = expected.to_string().to_lowercase();
        lowered_actual = actual.to_string().to_lowercase();
        expected = lowered_expected.view();
        actual = lowered_actual.view();
    }

    if (!(expected == actual))
        return false;

    state.string_position += length;
    return true;
}
|
||||
|
||||
// Tests `ch` for membership in `character_class`.
// If `ch` is a member: a non-inverted compare advances the string position by
// one, an inverted compare only sets `inverse_matched`. If `ch` is not a
// member (or the class value is unknown) nothing is modified.
// Lower and Upper accept a letter of either case when the Insensitive option is set.
ALWAYS_INLINE void OpCode_Compare::compare_character_class(const MatchInput& input, MatchState& state, CharClass character_class, u32 ch, bool inverse, bool& inverse_matched)
{
    // Shared handling of a class hit, so every class honours `inverse` the same way
    // (Alpha included, which must not consume a character inside an inverted compare).
    auto record_match = [&] {
        if (inverse)
            inverse_matched = true;
        else
            ++state.string_position;
    };

    switch (character_class) {
    case CharClass::Alnum:
        if (isalnum(ch))
            record_match();
        break;
    case CharClass::Alpha:
        if (isalpha(ch))
            record_match();
        break;
    case CharClass::Blank:
        if (ch == ' ' || ch == '\t')
            record_match();
        break;
    case CharClass::Cntrl:
        if (iscntrl(ch))
            record_match();
        break;
    case CharClass::Digit:
        if (isdigit(ch))
            record_match();
        break;
    case CharClass::Graph:
        if (isgraph(ch))
            record_match();
        break;
    case CharClass::Lower:
        if (islower(ch) || ((input.regex_options & AllFlags::Insensitive) && isupper(ch)))
            record_match();
        break;
    case CharClass::Print:
        if (isprint(ch))
            record_match();
        break;
    case CharClass::Punct:
        if (ispunct(ch))
            record_match();
        break;
    case CharClass::Space:
        if (isspace(ch))
            record_match();
        break;
    case CharClass::Upper:
        if (isupper(ch) || ((input.regex_options & AllFlags::Insensitive) && islower(ch)))
            record_match();
        break;
    case CharClass::Word:
        if (isalnum(ch) || ch == '_')
            record_match();
        break;
    case CharClass::Xdigit:
        if (isxdigit(ch))
            record_match();
        break;
    }
}
|
||||
|
||||
// Tests whether `ch` lies in the inclusive range [from, to].
// With the Insensitive option set, `from`, `to` and `ch` are all lowercased first.
// On a hit: a non-inverted compare advances the string position by one,
// an inverted compare only sets `inverse_matched`. Otherwise nothing is modified.
ALWAYS_INLINE void OpCode_Compare::compare_character_range(const MatchInput& input, MatchState& state, u32 from, u32 to, u32 ch, bool inverse, bool& inverse_matched)
{
    if (input.regex_options & AllFlags::Insensitive) {
        from = tolower(from);
        to = tolower(to);
        ch = tolower(ch);
    }

    bool const outside_range = ch < from || ch > to;
    if (outside_range)
        return;

    if (inverse)
        inverse_matched = true;
    else
        ++state.string_position;
}
|
||||
|
||||
// Debug summary of this Compare instruction: its argument count and the
// size of its argument area, as reported by arguments_count()/arguments_size().
const String OpCode_Compare::arguments_string() const
{
    auto const argc = arguments_count();
    auto const args_size = arguments_size();
    return String::format("argc=%lu, args=%lu ", argc, args_size);
}
|
||||
|
||||
// Produces human-readable descriptions of every argument of this Compare
// instruction, for debugging output. Each argument yields a "type=..." entry,
// followed by a value entry for the types that carry one. When `input` is
// given and the current string position is inside it, a "compare against"
// entry showing the relevant slice of the input is added as well.
const Vector<String> OpCode_Compare::variable_arguments_to_string(Optional<MatchInput> input) const
{
    Vector<String> descriptions;

    // Arguments start three cells after the instruction position.
    size_t cursor { state().instruction_position + 3 };
    RegexStringView view = ((input.has_value()) ? input.value().view : nullptr);

    // Start of the displayed input slice: one before the current position, unless already at 0.
    auto const slice_start = state().string_position > 0 ? state().string_position - 1 : state().string_position;

    auto can_show_input = [&] { return !view.is_null() && view.length() > state().string_position; };

    // Adds the single-character "compare against" entry used by Char, CharClass and CharRange.
    auto describe_single_char_slice = [&] {
        if (!can_show_input())
            return;
        descriptions.empend(String::format(
            "compare against: '%s'",
            view.substring_view(slice_start, state().string_position > view.length() ? 0 : 1).to_string().characters()));
    };

    for (size_t argument_index = 0; argument_index < arguments_count(); ++argument_index) {
        auto compare_type = (CharacterCompareType)m_bytecode->at(cursor++);
        descriptions.empend(String::format("type=%lu [%s]", (size_t)compare_type, character_compare_type_name(compare_type)));

        switch (compare_type) {
        case CharacterCompareType::Char: {
            char ch = m_bytecode->at(cursor++);
            descriptions.empend(String::format("value='%c'", ch));
            describe_single_char_slice();
            break;
        }
        case CharacterCompareType::NamedReference: {
            // Stored as the address of the name followed by its length.
            auto name_ptr = (const char*)m_bytecode->at(cursor++);
            auto name_length = m_bytecode->at(cursor++);
            descriptions.empend(String::format("name='%.*s'", (int)name_length, name_ptr));
            break;
        }
        case CharacterCompareType::Reference: {
            auto reference_number = m_bytecode->at(cursor++);
            descriptions.empend(String::formatted("number={}", reference_number));
            break;
        }
        case CharacterCompareType::String: {
            // Stored as a length cell followed by one cell per character.
            auto& length = m_bytecode->at(cursor++);
            StringBuilder literal;
            for (size_t n = 0; n < length; ++n)
                literal.append(m_bytecode->at(cursor++));
            descriptions.empend(String::format("value=\"%.*s\"", (int)length, literal.string_view().characters_without_null_termination()));
            if (can_show_input())
                descriptions.empend(String::format(
                    "compare against: \"%s\"",
                    view.substring_view(slice_start, slice_start + length > view.length() ? 0 : length).to_string().characters()));
            break;
        }
        case CharacterCompareType::CharClass: {
            auto character_class = (CharClass)m_bytecode->at(cursor++);
            descriptions.empend(String::format("ch_class=%lu [%s]", (size_t)character_class, character_class_name(character_class)));
            describe_single_char_slice();
            break;
        }
        case CharacterCompareType::CharRange: {
            auto range = (CharRange)m_bytecode->at(cursor++);
            descriptions.empend(String::format("ch_range='%c'-'%c'", range.from, range.to));
            describe_single_char_slice();
            break;
        }
        default:
            // Remaining types carry no value cell; only the "type=" entry is emitted.
            break;
        }
    }

    return descriptions;
}
|
||||
}
|
837
Userland/Libraries/LibRegex/RegexByteCode.h
Normal file
837
Userland/Libraries/LibRegex/RegexByteCode.h
Normal file
|
@ -0,0 +1,837 @@
|
|||
/*
|
||||
* Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "RegexMatch.h"
|
||||
#include "RegexOptions.h"
|
||||
|
||||
#include <AK/Format.h>
|
||||
#include <AK/Forward.h>
|
||||
#include <AK/HashMap.h>
|
||||
#include <AK/NonnullOwnPtr.h>
|
||||
#include <AK/OwnPtr.h>
|
||||
#include <AK/Traits.h>
|
||||
#include <AK/Types.h>
|
||||
#include <AK/Vector.h>
|
||||
|
||||
namespace regex {
|
||||
|
||||
// Type of a single bytecode cell; also the underlying type of the opcode and argument enums below.
using ByteCodeValueType = u64;
|
||||
|
||||
#define ENUMERATE_OPCODES \
|
||||
__ENUMERATE_OPCODE(Compare) \
|
||||
__ENUMERATE_OPCODE(Jump) \
|
||||
__ENUMERATE_OPCODE(ForkJump) \
|
||||
__ENUMERATE_OPCODE(ForkStay) \
|
||||
__ENUMERATE_OPCODE(FailForks) \
|
||||
__ENUMERATE_OPCODE(SaveLeftCaptureGroup) \
|
||||
__ENUMERATE_OPCODE(SaveRightCaptureGroup) \
|
||||
__ENUMERATE_OPCODE(SaveLeftNamedCaptureGroup) \
|
||||
__ENUMERATE_OPCODE(SaveRightNamedCaptureGroup) \
|
||||
__ENUMERATE_OPCODE(CheckBegin) \
|
||||
__ENUMERATE_OPCODE(CheckEnd) \
|
||||
__ENUMERATE_OPCODE(CheckBoundary) \
|
||||
__ENUMERATE_OPCODE(Save) \
|
||||
__ENUMERATE_OPCODE(Restore) \
|
||||
__ENUMERATE_OPCODE(GoBack) \
|
||||
__ENUMERATE_OPCODE(Exit)
|
||||
|
||||
// clang-format off
|
||||
enum class OpCodeId : ByteCodeValueType {
|
||||
#define __ENUMERATE_OPCODE(x) x,
|
||||
ENUMERATE_OPCODES
|
||||
#undef __ENUMERATE_OPCODE
|
||||
|
||||
First = Compare,
|
||||
Last = Exit,
|
||||
};
|
||||
// clang-format on
|
||||
|
||||
#define ENUMERATE_CHARACTER_COMPARE_TYPES \
|
||||
__ENUMERATE_CHARACTER_COMPARE_TYPE(Undefined) \
|
||||
__ENUMERATE_CHARACTER_COMPARE_TYPE(Inverse) \
|
||||
__ENUMERATE_CHARACTER_COMPARE_TYPE(TemporaryInverse) \
|
||||
__ENUMERATE_CHARACTER_COMPARE_TYPE(AnyChar) \
|
||||
__ENUMERATE_CHARACTER_COMPARE_TYPE(Char) \
|
||||
__ENUMERATE_CHARACTER_COMPARE_TYPE(String) \
|
||||
__ENUMERATE_CHARACTER_COMPARE_TYPE(CharClass) \
|
||||
__ENUMERATE_CHARACTER_COMPARE_TYPE(CharRange) \
|
||||
__ENUMERATE_CHARACTER_COMPARE_TYPE(Reference) \
|
||||
__ENUMERATE_CHARACTER_COMPARE_TYPE(NamedReference) \
|
||||
__ENUMERATE_CHARACTER_COMPARE_TYPE(RangeExpressionDummy)
|
||||
|
||||
enum class CharacterCompareType : ByteCodeValueType {
|
||||
#define __ENUMERATE_CHARACTER_COMPARE_TYPE(x) x,
|
||||
ENUMERATE_CHARACTER_COMPARE_TYPES
|
||||
#undef __ENUMERATE_CHARACTER_COMPARE_TYPE
|
||||
};
|
||||
|
||||
#define ENUMERATE_CHARACTER_CLASSES \
|
||||
__ENUMERATE_CHARACTER_CLASS(Alnum) \
|
||||
__ENUMERATE_CHARACTER_CLASS(Cntrl) \
|
||||
__ENUMERATE_CHARACTER_CLASS(Lower) \
|
||||
__ENUMERATE_CHARACTER_CLASS(Space) \
|
||||
__ENUMERATE_CHARACTER_CLASS(Alpha) \
|
||||
__ENUMERATE_CHARACTER_CLASS(Digit) \
|
||||
__ENUMERATE_CHARACTER_CLASS(Print) \
|
||||
__ENUMERATE_CHARACTER_CLASS(Upper) \
|
||||
__ENUMERATE_CHARACTER_CLASS(Blank) \
|
||||
__ENUMERATE_CHARACTER_CLASS(Graph) \
|
||||
__ENUMERATE_CHARACTER_CLASS(Punct) \
|
||||
__ENUMERATE_CHARACTER_CLASS(Word) \
|
||||
__ENUMERATE_CHARACTER_CLASS(Xdigit)
|
||||
|
||||
enum class CharClass : ByteCodeValueType {
|
||||
#define __ENUMERATE_CHARACTER_CLASS(x) x,
|
||||
ENUMERATE_CHARACTER_CLASSES
|
||||
#undef __ENUMERATE_CHARACTER_CLASS
|
||||
};
|
||||
|
||||
#define ENUMERATE_BOUNDARY_CHECK_TYPES \
|
||||
__ENUMERATE_BOUNDARY_CHECK_TYPE(Word) \
|
||||
__ENUMERATE_BOUNDARY_CHECK_TYPE(NonWord)
|
||||
|
||||
enum class BoundaryCheckType : ByteCodeValueType {
|
||||
#define __ENUMERATE_BOUNDARY_CHECK_TYPE(x) x,
|
||||
ENUMERATE_BOUNDARY_CHECK_TYPES
|
||||
#undef __ENUMERATE_BOUNDARY_CHECK_TYPE
|
||||
};
|
||||
|
||||
struct CharRange {
|
||||
const u32 from;
|
||||
const u32 to;
|
||||
|
||||
CharRange(u64 value)
|
||||
: from(value >> 32)
|
||||
, to(value & 0xffffffff)
|
||||
{
|
||||
}
|
||||
|
||||
CharRange(u32 from, u32 to)
|
||||
: from(from)
|
||||
, to(to)
|
||||
{
|
||||
}
|
||||
|
||||
operator ByteCodeValueType() const { return ((u64)from << 32) | to; }
|
||||
};
|
||||
|
||||
struct CompareTypeAndValuePair {
|
||||
CharacterCompareType type;
|
||||
ByteCodeValueType value;
|
||||
};
|
||||
|
||||
class OpCode;
|
||||
|
||||
class ByteCode : public Vector<ByteCodeValueType> {
|
||||
public:
|
||||
ByteCode() = default;
|
||||
virtual ~ByteCode() = default;
|
||||
|
||||
// Appends one Compare instruction carrying all of `pairs` as its arguments.
// Emitted layout: Compare, argument count, argument area size, argument area.
// Each argument is its type tag, followed by its value unless the type is
// Inverse, AnyChar or TemporaryInverse. String, NamedReference, Undefined and
// RangeExpressionDummy are not accepted here (asserted).
void insert_bytecode_compare_values(Vector<CompareTypeAndValuePair>&& pairs)
{
    ByteCode arguments;
    for (auto& pair : pairs) {
        ASSERT(pair.type != CharacterCompareType::RangeExpressionDummy);
        ASSERT(pair.type != CharacterCompareType::Undefined);
        ASSERT(pair.type != CharacterCompareType::String);
        ASSERT(pair.type != CharacterCompareType::NamedReference);

        arguments.append(static_cast<ByteCodeValueType>(pair.type));

        bool const is_valueless = pair.type == CharacterCompareType::Inverse
            || pair.type == CharacterCompareType::AnyChar
            || pair.type == CharacterCompareType::TemporaryInverse;
        if (!is_valueless)
            arguments.append(move(pair.value));
    }

    ByteCode instruction;
    instruction.empend(static_cast<ByteCodeValueType>(OpCodeId::Compare));
    instruction.empend(pairs.size()); // number of arguments
    instruction.empend(arguments.size()); // size of arguments
    instruction.append(move(arguments));

    append(move(instruction));
}
|
||||
|
||||
// Appends a CheckBoundary instruction: the opcode followed by the boundary type.
void insert_bytecode_check_boundary(BoundaryCheckType type)
{
    ByteCode instruction;
    instruction.empend(static_cast<ByteCodeValueType>(OpCodeId::CheckBoundary));
    instruction.empend(static_cast<ByteCodeValueType>(type));

    append(move(instruction));
}
|
||||
|
||||
// Appends a Compare instruction with a single String argument.
// Emitted layout: Compare, 1, argument area size, String tag, then the
// string as written by insert_string() (length followed by one cell per character).
void insert_bytecode_compare_string(StringView view)
{
    ByteCode arguments;
    arguments.empend(static_cast<ByteCodeValueType>(CharacterCompareType::String));
    arguments.insert_string(view);

    ByteCode instruction;
    instruction.empend(static_cast<ByteCodeValueType>(OpCodeId::Compare));
    instruction.empend(static_cast<u64>(1)); // number of arguments
    instruction.empend(arguments.size()); // size of arguments
    instruction.append(move(arguments));

    append(move(instruction));
}
|
||||
|
||||
// Appends a Compare instruction with a single NamedReference argument.
// Emitted layout: Compare, 1, argument area size, NamedReference tag,
// address of the name's characters, name length.
// NOTE: only the address is stored, the characters are not copied, so the
// storage behind `name` presumably has to stay alive while the bytecode is used.
void insert_bytecode_compare_named_reference(StringView name)
{
    ByteCode arguments;
    arguments.empend(static_cast<ByteCodeValueType>(CharacterCompareType::NamedReference));
    arguments.empend(reinterpret_cast<ByteCodeValueType>(name.characters_without_null_termination()));
    arguments.empend(name.length());

    ByteCode instruction;
    instruction.empend(static_cast<ByteCodeValueType>(OpCodeId::Compare));
    instruction.empend(static_cast<u64>(1)); // number of arguments
    instruction.empend(arguments.size()); // size of arguments
    instruction.append(move(arguments));

    append(move(instruction));
}
|
||||
|
||||
void insert_bytecode_group_capture_left(size_t capture_groups_count)
|
||||
{
|
||||
empend(static_cast<ByteCodeValueType>(OpCodeId::SaveLeftCaptureGroup));
|
||||
empend(capture_groups_count);
|
||||
}
|
||||
|
||||
void insert_bytecode_group_capture_left(const StringView& name)
|
||||
{
|
||||
empend(static_cast<ByteCodeValueType>(OpCodeId::SaveLeftNamedCaptureGroup));
|
||||
empend(reinterpret_cast<ByteCodeValueType>(name.characters_without_null_termination()));
|
||||
empend(name.length());
|
||||
}
|
||||
|
||||
void insert_bytecode_group_capture_right(size_t capture_groups_count)
|
||||
{
|
||||
empend(static_cast<ByteCodeValueType>(OpCodeId::SaveRightCaptureGroup));
|
||||
empend(capture_groups_count);
|
||||
}
|
||||
|
||||
void insert_bytecode_group_capture_right(const StringView& name)
|
||||
{
|
||||
empend(static_cast<ByteCodeValueType>(OpCodeId::SaveRightNamedCaptureGroup));
|
||||
empend(reinterpret_cast<ByteCodeValueType>(name.characters_without_null_termination()));
|
||||
empend(name.length());
|
||||
}
|
||||
|
||||
// Selects which instruction sequence insert_bytecode_lookaround() emits
// around the lookaround body.
enum class LookAroundType {
    LookAhead,
    LookBehind,
    NegatedLookAhead,
    NegatedLookBehind,
};
|
||||
// Appends `lookaround_body` wrapped in the instruction sequence for the given
// lookaround `type`. `match_length` is the operand of the GoBack instruction
// and is only emitted for the two look-behind variants.
void insert_bytecode_lookaround(ByteCode&& lookaround_body, LookAroundType type, size_t match_length = 0)
{
    // FIXME: The save stack will grow infinitely with repeated failures
    //        as we do not discard that on failure (we don't necessarily know how many to pop with the current architecture).
    auto emit_opcode = [this](OpCodeId id) { empend(static_cast<ByteCodeValueType>(id)); };
    auto emit_operand = [this](ByteCodeValueType operand) { empend(operand); };

    switch (type) {
    case LookAroundType::LookAhead:
        // SAVE
        // REGEXP BODY
        // RESTORE
        emit_opcode(OpCodeId::Save);
        append(move(lookaround_body));
        emit_opcode(OpCodeId::Restore);
        return;

    case LookAroundType::NegatedLookAhead: {
        // JUMP _A
        // LABEL _L
        // REGEXP BODY
        // FAIL 2
        // LABEL _A
        // SAVE
        // FORKJUMP _L
        // RESTORE
        auto const body_length = lookaround_body.size();
        emit_opcode(OpCodeId::Jump);
        emit_operand(static_cast<ByteCodeValueType>(body_length) + 2); // JUMP to label _A
        append(move(lookaround_body));
        emit_opcode(OpCodeId::FailForks);
        emit_operand(2); // Fail two forks
        emit_opcode(OpCodeId::Save);
        emit_opcode(OpCodeId::ForkJump);
        emit_operand(static_cast<ByteCodeValueType>(-(body_length + 5))); // JUMP to label _L
        emit_opcode(OpCodeId::Restore);
        return;
    }

    case LookAroundType::LookBehind:
        // SAVE
        // GOBACK match_length(BODY)
        // REGEXP BODY
        // RESTORE
        emit_opcode(OpCodeId::Save);
        emit_opcode(OpCodeId::GoBack);
        emit_operand(static_cast<ByteCodeValueType>(match_length));
        append(move(lookaround_body));
        emit_opcode(OpCodeId::Restore);
        return;

    case LookAroundType::NegatedLookBehind: {
        // JUMP _A
        // LABEL _L
        // GOBACK match_length(BODY)
        // REGEXP BODY
        // FAIL 2
        // LABEL _A
        // SAVE
        // FORKJUMP _L
        // RESTORE
        auto const body_length = lookaround_body.size();
        emit_opcode(OpCodeId::Jump);
        emit_operand(static_cast<ByteCodeValueType>(body_length) + 4); // JUMP to label _A
        emit_opcode(OpCodeId::GoBack);
        emit_operand(static_cast<ByteCodeValueType>(match_length));
        append(move(lookaround_body));
        emit_opcode(OpCodeId::FailForks);
        emit_operand(2); // Fail two forks
        emit_opcode(OpCodeId::Save);
        emit_opcode(OpCodeId::ForkJump);
        emit_operand(static_cast<ByteCodeValueType>(-(body_length + 7))); // JUMP to label _L
        emit_opcode(OpCodeId::Restore);
        return;
    }
    }

    ASSERT_NOT_REACHED();
}
|
||||
|
||||
// Appends the bytecode for the alternation `left | right` to this bytecode:
//
//     FORKJUMP _ALT
//     REGEXP ALT1
//     JUMP  _END
//     LABEL _ALT
//     REGEXP ALT2
//     LABEL _END
//
// Both operands are consumed.
void insert_bytecode_alternation(ByteCode&& left, ByteCode&& right)
{
    empend(static_cast<ByteCodeValueType>(OpCodeId::ForkJump));
    empend(left.size() + 2); // Jump to the _ALT label: over ALT1 and the two cells of the JUMP below

    append(move(left));

    empend(static_cast<ByteCodeValueType>(OpCodeId::Jump));
    empend(right.size()); // Jump to the _END label: over ALT2

    // LABEL _ALT

    append(move(right));

    // LABEL _END
}
|
||||
|
||||
// Replaces `bytecode_to_repeat` with bytecode that repeats it between
// `minimum` and `maximum` times:
//  - `minimum` unconditional copies of the body come first;
//  - with a maximum greater than the minimum, a ForkStay plus
//    (maximum - minimum) further copies follow, each copy being followed by
//    its own ForkStay whose offset covers the copies still to come;
//  - without a maximum, a single ForkJump back over the body is appended.
void insert_bytecode_repetition_min_max(ByteCode& bytecode_to_repeat, size_t minimum, Optional<size_t> maximum)
{
    ByteCode repeated;
    repeated.insert_bytecode_repetition_n(bytecode_to_repeat, minimum);

    if (!maximum.has_value()) {
        // no maximum value set, repeat finding if possible
        repeated.empend(static_cast<ByteCodeValueType>(OpCodeId::ForkJump));
        repeated.empend(-bytecode_to_repeat.size() - 2); // Jump to the last iteration
    } else if (maximum.value() > minimum) {
        auto const optional_count = maximum.value() - minimum;
        // One optional iteration is the body plus the two cells of its trailing ForkStay.
        auto const stride = bytecode_to_repeat.size() + 2;

        repeated.empend(static_cast<ByteCodeValueType>(OpCodeId::ForkStay));
        repeated.empend(optional_count * stride); // Jump to the _END label

        for (auto remaining = optional_count; remaining > 0; --remaining) {
            repeated.append(bytecode_to_repeat);
            repeated.empend(static_cast<ByteCodeValueType>(OpCodeId::ForkStay));
            repeated.empend((remaining - 1) * stride); // Jump to the _END label
        }
    }

    bytecode_to_repeat = move(repeated);
}
|
||||
|
||||
// Appends `n` consecutive copies of `bytecode_to_repeat` to this bytecode.
void insert_bytecode_repetition_n(ByteCode& bytecode_to_repeat, size_t n)
{
    while (n-- > 0)
        append(bytecode_to_repeat);
}
|
||||
|
||||
// Turns `bytecode_to_repeat` into a one-or-more repetition, in place:
//
//     LABEL _START = -bytecode_to_repeat.size()
//     REGEXP
//     FORKSTAY _START  (FORKJUMP -> Greedy)
void insert_bytecode_repetition_min_one(ByteCode& bytecode_to_repeat, bool greedy)
{
    auto const fork_opcode = greedy ? OpCodeId::ForkJump : OpCodeId::ForkStay;
    bytecode_to_repeat.empend(static_cast<ByteCodeValueType>(fork_opcode));

    // size() already includes the fork opcode appended above; the +1 covers this operand cell.
    bytecode_to_repeat.empend(-(bytecode_to_repeat.size() + 1)); // Jump to the _START label
}
|
||||
|
||||
// Replaces `bytecode_to_repeat` with a zero-or-more repetition of itself:
//
//     LABEL _START
//     FORKJUMP _END  (FORKSTAY -> Greedy)
//     REGEXP
//     JUMP  _START
//     LABEL _END
void insert_bytecode_repetition_any(ByteCode& bytecode_to_repeat, bool greedy)
{
    ByteCode looped;

    auto const fork_opcode = greedy ? OpCodeId::ForkStay : OpCodeId::ForkJump;
    looped.empend(static_cast<ByteCodeValueType>(fork_opcode));
    looped.empend(bytecode_to_repeat.size() + 2); // Jump to the _END label

    looped.append(bytecode_to_repeat);

    looped.empend(static_cast<ByteCodeValueType>(OpCodeId::Jump));
    // size() already includes the Jump opcode appended above; the -1 covers this operand cell.
    looped.empend(-looped.size() - 1); // Jump to the _START label
    // LABEL _END = looped.size()

    bytecode_to_repeat = move(looped);
}
|
||||
|
||||
// Replaces `bytecode_to_repeat` with an optional (zero-or-one) version of itself:
//
//     FORKJUMP _END  (FORKSTAY -> Greedy)
//     REGEXP
//     LABEL _END
void insert_bytecode_repetition_zero_or_one(ByteCode& bytecode_to_repeat, bool greedy)
{
    ByteCode optional_body;

    auto const fork_opcode = greedy ? OpCodeId::ForkStay : OpCodeId::ForkJump;
    optional_body.empend(static_cast<ByteCodeValueType>(fork_opcode));
    optional_body.empend(bytecode_to_repeat.size()); // Jump to the _END label

    optional_body.append(bytecode_to_repeat);
    // LABEL _END = optional_body.size()

    bytecode_to_repeat = move(optional_body);
}
|
||||
|
||||
OpCode* get_opcode(MatchState& state) const;
|
||||
|
||||
private:
|
||||
void insert_string(const StringView& view)
|
||||
{
|
||||
empend((ByteCodeValueType)view.length());
|
||||
for (size_t i = 0; i < view.length(); ++i)
|
||||
empend((ByteCodeValueType)view[i]);
|
||||
}
|
||||
|
||||
ALWAYS_INLINE OpCode* get_opcode_by_id(OpCodeId id) const;
|
||||
static HashMap<u32, OwnPtr<OpCode>> s_opcodes;
|
||||
};
|
||||
|
||||
#define ENUMERATE_EXECUTION_RESULTS \
|
||||
__ENUMERATE_EXECUTION_RESULT(Continue) \
|
||||
__ENUMERATE_EXECUTION_RESULT(Fork_PrioHigh) \
|
||||
__ENUMERATE_EXECUTION_RESULT(Fork_PrioLow) \
|
||||
__ENUMERATE_EXECUTION_RESULT(Failed) \
|
||||
__ENUMERATE_EXECUTION_RESULT(Failed_ExecuteLowPrioForks) \
|
||||
__ENUMERATE_EXECUTION_RESULT(Succeeded)
|
||||
|
||||
enum class ExecutionResult : u8 {
|
||||
#define __ENUMERATE_EXECUTION_RESULT(x) x,
|
||||
ENUMERATE_EXECUTION_RESULTS
|
||||
#undef __ENUMERATE_EXECUTION_RESULT
|
||||
};
|
||||
|
||||
const char* execution_result_name(ExecutionResult result);
|
||||
const char* opcode_id_name(OpCodeId opcode_id);
|
||||
const char* boundary_check_type_name(BoundaryCheckType);
|
||||
const char* character_compare_type_name(CharacterCompareType result);
|
||||
const char* execution_result_name(ExecutionResult result);
|
||||
|
||||
class OpCode {
|
||||
public:
|
||||
OpCode(ByteCode& bytecode)
|
||||
: m_bytecode(&bytecode)
|
||||
{
|
||||
}
|
||||
|
||||
virtual ~OpCode() = default;
|
||||
|
||||
virtual OpCodeId opcode_id() const = 0;
|
||||
virtual size_t size() const = 0;
|
||||
virtual ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const = 0;
|
||||
|
||||
ALWAYS_INLINE ByteCodeValueType argument(size_t offset) const
|
||||
{
|
||||
ASSERT(state().instruction_position + offset <= m_bytecode->size());
|
||||
return m_bytecode->at(state().instruction_position + 1 + offset);
|
||||
}
|
||||
|
||||
ALWAYS_INLINE const char* name() const;
|
||||
static const char* name(const OpCodeId);
|
||||
|
||||
ALWAYS_INLINE OpCode* set_state(MatchState& state)
|
||||
{
|
||||
m_state = &state;
|
||||
return this;
|
||||
}
|
||||
|
||||
ALWAYS_INLINE OpCode* set_bytecode(ByteCode& bytecode)
|
||||
{
|
||||
m_bytecode = &bytecode;
|
||||
return this;
|
||||
}
|
||||
|
||||
ALWAYS_INLINE void reset_state() { m_state.clear(); }
|
||||
|
||||
ALWAYS_INLINE const MatchState& state() const
|
||||
{
|
||||
ASSERT(m_state.has_value());
|
||||
return *m_state.value();
|
||||
}
|
||||
|
||||
const String to_string() const
|
||||
{
|
||||
return String::format("[0x%02X] %s", (int)opcode_id(), name(opcode_id()));
|
||||
}
|
||||
|
||||
virtual const String arguments_string() const = 0;
|
||||
|
||||
ALWAYS_INLINE const ByteCode& bytecode() const { return *m_bytecode; }
|
||||
|
||||
protected:
|
||||
ByteCode* m_bytecode;
|
||||
Optional<MatchState*> m_state;
|
||||
};
|
||||
|
||||
class OpCode_Exit final : public OpCode {
|
||||
public:
|
||||
OpCode_Exit(ByteCode& bytecode)
|
||||
: OpCode(bytecode)
|
||||
{
|
||||
}
|
||||
ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
|
||||
ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::Exit; }
|
||||
ALWAYS_INLINE size_t size() const override { return 1; }
|
||||
const String arguments_string() const override { return ""; }
|
||||
};
|
||||
|
||||
class OpCode_FailForks final : public OpCode {
|
||||
public:
|
||||
OpCode_FailForks(ByteCode& bytecode)
|
||||
: OpCode(bytecode)
|
||||
{
|
||||
}
|
||||
ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
|
||||
ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::FailForks; }
|
||||
ALWAYS_INLINE size_t size() const override { return 2; }
|
||||
ALWAYS_INLINE size_t count() const { return argument(0); }
|
||||
const String arguments_string() const override { return String::formatted("count={}", count()); }
|
||||
};
|
||||
|
||||
class OpCode_Save final : public OpCode {
|
||||
public:
|
||||
OpCode_Save(ByteCode& bytecode)
|
||||
: OpCode(bytecode)
|
||||
{
|
||||
}
|
||||
ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
|
||||
ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::Save; }
|
||||
ALWAYS_INLINE size_t size() const override { return 1; }
|
||||
const String arguments_string() const override { return ""; }
|
||||
};
|
||||
|
||||
class OpCode_Restore final : public OpCode {
|
||||
public:
|
||||
OpCode_Restore(ByteCode& bytecode)
|
||||
: OpCode(bytecode)
|
||||
{
|
||||
}
|
||||
ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
|
||||
ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::Restore; }
|
||||
ALWAYS_INLINE size_t size() const override { return 1; }
|
||||
const String arguments_string() const override { return ""; }
|
||||
};
|
||||
|
||||
class OpCode_GoBack final : public OpCode {
|
||||
public:
|
||||
OpCode_GoBack(ByteCode& bytecode)
|
||||
: OpCode(bytecode)
|
||||
{
|
||||
}
|
||||
ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
|
||||
ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::GoBack; }
|
||||
ALWAYS_INLINE size_t size() const override { return 2; }
|
||||
ALWAYS_INLINE size_t count() const { return argument(0); }
|
||||
const String arguments_string() const override { return String::formatted("count={}", count()); }
|
||||
};
|
||||
|
||||
class OpCode_Jump final : public OpCode {
|
||||
public:
|
||||
OpCode_Jump(ByteCode& bytecode)
|
||||
: OpCode(bytecode)
|
||||
{
|
||||
}
|
||||
ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
|
||||
ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::Jump; }
|
||||
ALWAYS_INLINE size_t size() const override { return 2; }
|
||||
ALWAYS_INLINE ssize_t offset() const { return argument(0); }
|
||||
const String arguments_string() const override
|
||||
{
|
||||
return String::format("offset=%zd [&%zu]", offset(), state().instruction_position + size() + offset());
|
||||
}
|
||||
};
|
||||
|
||||
class OpCode_ForkJump final : public OpCode {
|
||||
public:
|
||||
OpCode_ForkJump(ByteCode& bytecode)
|
||||
: OpCode(bytecode)
|
||||
{
|
||||
}
|
||||
ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
|
||||
ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::ForkJump; }
|
||||
ALWAYS_INLINE size_t size() const override { return 2; }
|
||||
ALWAYS_INLINE ssize_t offset() const { return argument(0); }
|
||||
const String arguments_string() const override
|
||||
{
|
||||
return String::format("offset=%zd [&%zu], sp: %zu", offset(), state().instruction_position + size() + offset(), state().string_position);
|
||||
}
|
||||
};
|
||||
|
||||
class OpCode_ForkStay final : public OpCode {
|
||||
public:
|
||||
OpCode_ForkStay(ByteCode& bytecode)
|
||||
: OpCode(bytecode)
|
||||
{
|
||||
}
|
||||
ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
|
||||
ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::ForkStay; }
|
||||
ALWAYS_INLINE size_t size() const override { return 2; }
|
||||
ALWAYS_INLINE ssize_t offset() const { return argument(0); }
|
||||
const String arguments_string() const override
|
||||
{
|
||||
return String::format("offset=%zd [&%zu], sp: %zu", offset(), state().instruction_position + size() + offset(), state().string_position);
|
||||
}
|
||||
};
|
||||
|
||||
class OpCode_CheckBegin final : public OpCode {
|
||||
public:
|
||||
OpCode_CheckBegin(ByteCode& bytecode)
|
||||
: OpCode(bytecode)
|
||||
{
|
||||
}
|
||||
ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
|
||||
ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::CheckBegin; }
|
||||
ALWAYS_INLINE size_t size() const override { return 1; }
|
||||
const String arguments_string() const override { return ""; }
|
||||
};
|
||||
|
||||
class OpCode_CheckEnd final : public OpCode {
|
||||
public:
|
||||
OpCode_CheckEnd(ByteCode& bytecode)
|
||||
: OpCode(bytecode)
|
||||
{
|
||||
}
|
||||
ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
|
||||
ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::CheckEnd; }
|
||||
ALWAYS_INLINE size_t size() const override { return 1; }
|
||||
const String arguments_string() const override { return ""; }
|
||||
};
|
||||
|
||||
class OpCode_CheckBoundary final : public OpCode {
|
||||
public:
|
||||
OpCode_CheckBoundary(ByteCode& bytecode)
|
||||
: OpCode(bytecode)
|
||||
{
|
||||
}
|
||||
ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
|
||||
ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::CheckBoundary; }
|
||||
ALWAYS_INLINE size_t size() const override { return 2; }
|
||||
ALWAYS_INLINE size_t arguments_count() const { return 1; }
|
||||
ALWAYS_INLINE BoundaryCheckType type() const { return static_cast<BoundaryCheckType>(argument(0)); }
|
||||
const String arguments_string() const override { return String::format("kind=%lu (%s)", (long unsigned int)argument(0), boundary_check_type_name(type())); }
|
||||
};
|
||||
|
||||
class OpCode_SaveLeftCaptureGroup final : public OpCode {
|
||||
public:
|
||||
OpCode_SaveLeftCaptureGroup(ByteCode& bytecode)
|
||||
: OpCode(bytecode)
|
||||
{
|
||||
}
|
||||
ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
|
||||
ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::SaveLeftCaptureGroup; }
|
||||
ALWAYS_INLINE size_t size() const override { return 2; }
|
||||
ALWAYS_INLINE size_t id() const { return argument(0); }
|
||||
const String arguments_string() const override { return String::format("id=%lu", id()); }
|
||||
};
|
||||
|
||||
class OpCode_SaveRightCaptureGroup final : public OpCode {
|
||||
public:
|
||||
OpCode_SaveRightCaptureGroup(ByteCode& bytecode)
|
||||
: OpCode(bytecode)
|
||||
{
|
||||
}
|
||||
ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
|
||||
ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::SaveRightCaptureGroup; }
|
||||
ALWAYS_INLINE size_t size() const override { return 2; }
|
||||
ALWAYS_INLINE size_t id() const { return argument(0); }
|
||||
const String arguments_string() const override { return String::format("id=%lu", id()); }
|
||||
};
|
||||
|
||||
class OpCode_SaveLeftNamedCaptureGroup final : public OpCode {
|
||||
public:
|
||||
OpCode_SaveLeftNamedCaptureGroup(ByteCode& bytecode)
|
||||
: OpCode(bytecode)
|
||||
{
|
||||
}
|
||||
ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
|
||||
ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::SaveLeftNamedCaptureGroup; }
|
||||
ALWAYS_INLINE size_t size() const override { return 3; }
|
||||
ALWAYS_INLINE StringView name() const { return { reinterpret_cast<char*>(argument(0)), length() }; }
|
||||
ALWAYS_INLINE size_t length() const { return argument(1); }
|
||||
const String arguments_string() const override
|
||||
{
|
||||
return String::format("name=%s, length=%lu", name().to_string().characters(), length());
|
||||
}
|
||||
};
|
||||
|
||||
class OpCode_SaveRightNamedCaptureGroup final : public OpCode {
|
||||
public:
|
||||
OpCode_SaveRightNamedCaptureGroup(ByteCode& bytecode)
|
||||
: OpCode(bytecode)
|
||||
{
|
||||
}
|
||||
ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
|
||||
ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::SaveRightNamedCaptureGroup; }
|
||||
ALWAYS_INLINE size_t size() const override { return 3; }
|
||||
ALWAYS_INLINE StringView name() const { return { reinterpret_cast<char*>(argument(0)), length() }; }
|
||||
ALWAYS_INLINE size_t length() const { return argument(1); }
|
||||
const String arguments_string() const override
|
||||
{
|
||||
return String::format("name=%s, length=%zu", name().to_string().characters(), length());
|
||||
}
|
||||
};
|
||||
|
||||
class OpCode_Compare final : public OpCode {
|
||||
public:
|
||||
OpCode_Compare(ByteCode& bytecode)
|
||||
: OpCode(bytecode)
|
||||
{
|
||||
}
|
||||
ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
|
||||
ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::Compare; }
|
||||
ALWAYS_INLINE size_t size() const override { return arguments_size() + 3; }
|
||||
ALWAYS_INLINE size_t arguments_count() const { return argument(0); }
|
||||
ALWAYS_INLINE size_t arguments_size() const { return argument(1); }
|
||||
const String arguments_string() const override;
|
||||
const Vector<String> variable_arguments_to_string(Optional<MatchInput> input = {}) const;
|
||||
|
||||
private:
|
||||
ALWAYS_INLINE static void compare_char(const MatchInput& input, MatchState& state, u32 ch1, bool inverse, bool& inverse_matched);
|
||||
ALWAYS_INLINE static bool compare_string(const MatchInput& input, MatchState& state, const char* str, size_t length);
|
||||
ALWAYS_INLINE static void compare_character_class(const MatchInput& input, MatchState& state, CharClass character_class, u32 ch, bool inverse, bool& inverse_matched);
|
||||
ALWAYS_INLINE static void compare_character_range(const MatchInput& input, MatchState& state, u32 from, u32 to, u32 ch, bool inverse, bool& inverse_matched);
|
||||
};
|
||||
|
||||
template<typename T>
|
||||
bool is(const OpCode&);
|
||||
|
||||
template<typename T>
|
||||
ALWAYS_INLINE bool is(const OpCode&)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
ALWAYS_INLINE bool is(const OpCode* opcode)
|
||||
{
|
||||
return is<T>(*opcode);
|
||||
}
|
||||
|
||||
template<>
|
||||
ALWAYS_INLINE bool is<OpCode_ForkStay>(const OpCode& opcode)
|
||||
{
|
||||
return opcode.opcode_id() == OpCodeId::ForkStay;
|
||||
}
|
||||
|
||||
template<>
|
||||
ALWAYS_INLINE bool is<OpCode_Exit>(const OpCode& opcode)
|
||||
{
|
||||
return opcode.opcode_id() == OpCodeId::Exit;
|
||||
}
|
||||
|
||||
template<>
|
||||
ALWAYS_INLINE bool is<OpCode_Compare>(const OpCode& opcode)
|
||||
{
|
||||
return opcode.opcode_id() == OpCodeId::Compare;
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
ALWAYS_INLINE const T& to(const OpCode& opcode)
|
||||
{
|
||||
ASSERT(is<T>(opcode));
|
||||
return static_cast<const T&>(opcode);
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
ALWAYS_INLINE T* to(OpCode* opcode)
|
||||
{
|
||||
ASSERT(is<T>(opcode));
|
||||
return static_cast<T*>(opcode);
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
ALWAYS_INLINE const T* to(const OpCode* opcode)
|
||||
{
|
||||
ASSERT(is<T>(opcode));
|
||||
return static_cast<const T*>(opcode);
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
ALWAYS_INLINE T& to(OpCode& opcode)
|
||||
{
|
||||
ASSERT(is<T>(opcode));
|
||||
return static_cast<T&>(opcode);
|
||||
}
|
||||
|
||||
}
|
154
Userland/Libraries/LibRegex/RegexDebug.h
Normal file
154
Userland/Libraries/LibRegex/RegexDebug.h
Normal file
|
@ -0,0 +1,154 @@
|
|||
/*
|
||||
* Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "AK/StringBuilder.h"
|
||||
#include "LibRegex/RegexMatcher.h"
|
||||
|
||||
//#define REGEX_DEBUG
|
||||
|
||||
#ifdef REGEX_DEBUG
|
||||
|
||||
namespace regex {
|
||||
|
||||
class RegexDebug {
|
||||
public:
|
||||
RegexDebug(FILE* file = stdout)
|
||||
: m_file(file)
|
||||
{
|
||||
}
|
||||
|
||||
virtual ~RegexDebug() = default;
|
||||
|
||||
template<typename T>
|
||||
void print_raw_bytecode(Regex<T>& regex) const
|
||||
{
|
||||
auto& bytecode = regex.parser_result.bytecode;
|
||||
size_t index { 0 };
|
||||
for (auto& value : bytecode) {
|
||||
fprintf(m_file, "OpCode i=%3lu [0x%02X]\n", index, (u32)value);
|
||||
++index;
|
||||
}
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
void print_bytecode(Regex<T>& regex) const
|
||||
{
|
||||
MatchState state;
|
||||
auto& bytecode = regex.parser_result.bytecode;
|
||||
|
||||
for (;;) {
|
||||
auto* opcode = bytecode.get_opcode(state);
|
||||
if (!opcode) {
|
||||
dbgln("Wrong opcode... failed!");
|
||||
return;
|
||||
}
|
||||
|
||||
print_opcode("PrintBytecode", *opcode, state);
|
||||
fprintf(m_file, "%s", m_debug_stripline.characters());
|
||||
|
||||
if (is<OpCode_Exit>(*opcode))
|
||||
break;
|
||||
|
||||
state.instruction_position += opcode->size();
|
||||
}
|
||||
|
||||
fflush(m_file);
|
||||
}
|
||||
|
||||
void print_opcode(const String& system, OpCode& opcode, MatchState& state, size_t recursion = 0, bool newline = true) const
|
||||
{
|
||||
fprintf(m_file, "%-15s | %-5lu | %-9lu | %-35s | %-30s | %-20s%s",
|
||||
system.characters(),
|
||||
state.instruction_position,
|
||||
recursion,
|
||||
opcode.to_string().characters(),
|
||||
opcode.arguments_string().characters(),
|
||||
String::format("ip: %3lu, sp: %3lu", state.instruction_position, state.string_position).characters(),
|
||||
newline ? "\n" : "");
|
||||
|
||||
if (newline && is<OpCode_Compare>(opcode)) {
|
||||
for (auto& line : to<OpCode_Compare>(opcode).variable_arguments_to_string()) {
|
||||
fprintf(m_file, "%-15s | %-5s | %-9s | %-35s | %-30s | %-20s%s", "", "", "", "", line.characters(), "", "\n");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void print_result(const OpCode& opcode, const ByteCode& bytecode, const MatchInput& input, MatchState& state, ExecutionResult result) const
|
||||
{
|
||||
StringBuilder builder;
|
||||
builder.append(execution_result_name(result));
|
||||
builder.appendff(", fc: {}, ss: {}", input.fail_counter, input.saved_positions.size());
|
||||
if (result == ExecutionResult::Succeeded) {
|
||||
builder.appendf(", ip: %lu/%lu, sp: %lu/%lu", state.instruction_position, bytecode.size() - 1, state.string_position, input.view.length() - 1);
|
||||
} else if (result == ExecutionResult::Fork_PrioHigh) {
|
||||
builder.appendf(", next ip: %lu", state.fork_at_position + opcode.size());
|
||||
} else if (result != ExecutionResult::Failed) {
|
||||
builder.appendf(", next ip: %lu", state.instruction_position + opcode.size());
|
||||
}
|
||||
|
||||
fprintf(m_file, " | %-20s\n", builder.to_string().characters());
|
||||
|
||||
if (is<OpCode_Compare>(opcode)) {
|
||||
for (auto& line : to<OpCode_Compare>(opcode).variable_arguments_to_string(input)) {
|
||||
fprintf(m_file, "%-15s | %-5s | %-9s | %-35s | %-30s | %-20s%s", "", "", "", "", line.characters(), "", "\n");
|
||||
}
|
||||
}
|
||||
|
||||
fprintf(m_file, "%s", m_debug_stripline.characters());
|
||||
}
|
||||
|
||||
void print_header()
|
||||
{
|
||||
StringBuilder builder;
|
||||
builder.appendf("%-15s | %-5s | %-9s | %-35s | %-30s | %-20s | %-20s\n", "System", "Index", "Recursion", "OpCode", "Arguments", "State", "Result");
|
||||
auto length = builder.length();
|
||||
for (size_t i = 0; i < length; ++i) {
|
||||
builder.append('=');
|
||||
}
|
||||
|
||||
fprintf(m_file, "%s\n", builder.to_string().characters());
|
||||
fflush(m_file);
|
||||
|
||||
builder.clear();
|
||||
for (size_t i = 0; i < length; ++i) {
|
||||
builder.append('-');
|
||||
}
|
||||
builder.append('\n');
|
||||
m_debug_stripline = builder.to_string();
|
||||
}
|
||||
|
||||
private:
|
||||
String m_debug_stripline;
|
||||
FILE* m_file;
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
using regex::RegexDebug;
|
||||
|
||||
#endif
|
102
Userland/Libraries/LibRegex/RegexError.h
Normal file
102
Userland/Libraries/LibRegex/RegexError.h
Normal file
|
@ -0,0 +1,102 @@
|
|||
/*
|
||||
* Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <AK/String.h>
|
||||
#include <AK/Types.h>
|
||||
#ifdef __serenity__
|
||||
# include <regex.h>
|
||||
#else
|
||||
# include <LibC/regex.h>
|
||||
#endif
|
||||
|
||||
namespace regex {
|
||||
|
||||
// Regex error codes. Each enumerator takes its value from the matching
// __Regex_* constant provided by regex.h.
enum class Error : u8 {
    NoError = __Regex_NoError,
    // Invalid regular expression.
    InvalidPattern = __Regex_InvalidPattern,
    // Invalid collating element referenced.
    InvalidCollationElement = __Regex_InvalidCollationElement,
    // Invalid character class type referenced.
    InvalidCharacterClass = __Regex_InvalidCharacterClass,
    // Trailing \ in pattern.
    InvalidTrailingEscape = __Regex_InvalidTrailingEscape,
    // Number in \digit invalid or in error.
    InvalidNumber = __Regex_InvalidNumber,
    // [ ] imbalance.
    MismatchingBracket = __Regex_MismatchingBracket,
    // ( ) imbalance.
    MismatchingParen = __Regex_MismatchingParen,
    // { } imbalance.
    MismatchingBrace = __Regex_MismatchingBrace,
    // Content of {} invalid: not a number, number too large, more than two numbers, first larger than second.
    InvalidBraceContent = __Regex_InvalidBraceContent,
    // Content of [] invalid.
    InvalidBracketContent = __Regex_InvalidBracketContent,
    // Invalid endpoint in range expression.
    InvalidRange = __Regex_InvalidRange,
    // ?, * or + not preceded by valid regular expression.
    InvalidRepetitionMarker = __Regex_InvalidRepetitionMarker,
    // MaximumRecursion has been reached.
    ReachedMaxRecursion = __Regex_ReachedMaxRecursion,
    // Sub expression has empty content.
    EmptySubExpression = __Regex_EmptySubExpression,
    // Content of capture group is invalid.
    InvalidCaptureGroup = __Regex_InvalidCaptureGroup,
    // Name of capture group is invalid.
    InvalidNameForCaptureGroup = __Regex_InvalidNameForCaptureGroup,
};
|
||||
|
||||
// Returns the human-readable message for `error`. Values that match none of
// the enumerators yield "Undefined error.".
inline String get_error_string(Error error)
{
    // The switch deliberately has no default label, so the fallback text
    // survives only for values outside the enumerator list.
    const char* message = "Undefined error.";

    switch (error) {
    case Error::NoError:
        message = "No error";
        break;
    case Error::InvalidPattern:
        message = "Invalid regular expression.";
        break;
    case Error::InvalidCollationElement:
        message = "Invalid collating element referenced.";
        break;
    case Error::InvalidCharacterClass:
        message = "Invalid character class type referenced.";
        break;
    case Error::InvalidTrailingEscape:
        message = "Trailing \\ in pattern.";
        break;
    case Error::InvalidNumber:
        message = "Number in \\digit invalid or in error.";
        break;
    case Error::MismatchingBracket:
        message = "[ ] imbalance.";
        break;
    case Error::MismatchingParen:
        message = "( ) imbalance.";
        break;
    case Error::MismatchingBrace:
        message = "{ } imbalance.";
        break;
    case Error::InvalidBraceContent:
        message = "Content of {} invalid: not a number, number too large, more than two numbers, first larger than second.";
        break;
    case Error::InvalidBracketContent:
        message = "Content of [] invalid.";
        break;
    case Error::InvalidRange:
        message = "Invalid endpoint in range expression.";
        break;
    case Error::InvalidRepetitionMarker:
        message = "?, * or + not preceded by valid regular expression.";
        break;
    case Error::ReachedMaxRecursion:
        message = "Maximum recursion has been reached.";
        break;
    case Error::EmptySubExpression:
        message = "Sub expression has empty content.";
        break;
    case Error::InvalidCaptureGroup:
        message = "Content of capture group is invalid.";
        break;
    case Error::InvalidNameForCaptureGroup:
        message = "Name of capture group is invalid.";
        break;
    }

    return message;
}
|
||||
}
|
||||
|
||||
using regex::Error;
|
||||
using regex::get_error_string;
|
235
Userland/Libraries/LibRegex/RegexLexer.cpp
Normal file
235
Userland/Libraries/LibRegex/RegexLexer.cpp
Normal file
|
@ -0,0 +1,235 @@
|
|||
/*
|
||||
* Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "RegexLexer.h"
|
||||
#include <AK/Assertions.h>
|
||||
#include <AK/LogStream.h>
|
||||
#include <stdio.h>
|
||||
|
||||
namespace regex {
|
||||
|
||||
const char* Token::name(const TokenType type)
|
||||
{
|
||||
switch (type) {
|
||||
#define __ENUMERATE_REGEX_TOKEN(x) \
|
||||
case TokenType::x: \
|
||||
return #x;
|
||||
ENUMERATE_REGEX_TOKENS
|
||||
#undef __ENUMERATE_REGEX_TOKEN
|
||||
default:
|
||||
ASSERT_NOT_REACHED();
|
||||
return "<Unknown>";
|
||||
}
|
||||
}
|
||||
|
||||
const char* Token::name() const
|
||||
{
|
||||
return name(m_type);
|
||||
}
|
||||
|
||||
// Creates a lexer over `source`. Only the view is stored; every other
// member keeps its in-class default.
Lexer::Lexer(const StringView source)
    : m_source(source)
{
}
|
||||
|
||||
// Returns the character `offset` positions ahead of the current position
// without consuming anything, or EOF (converted to char) when that index
// lies at or beyond the end of the source.
ALWAYS_INLINE char Lexer::peek(size_t offset) const
{
    const auto index = m_position + offset;
    if (index < m_source.length())
        return m_source[index];
    return EOF;
}
|
||||
|
||||
void Lexer::back(size_t offset)
|
||||
{
|
||||
if (offset == m_position + 1)
|
||||
offset = m_position; // 'position == 0' occurs twice.
|
||||
|
||||
ASSERT(offset <= m_position);
|
||||
if (!offset)
|
||||
return;
|
||||
m_position -= offset;
|
||||
m_previous_position = (m_position > 0) ? m_position - 1 : 0;
|
||||
m_current_char = m_source[m_position];
|
||||
}
|
||||
|
||||
// Advances by one character, remembering the position it started from.
// Once the source is exhausted, the position is parked at length() + 1 and
// the current character becomes EOF.
ALWAYS_INLINE void Lexer::consume()
{
    m_previous_position = m_position;

    const auto source_length = m_source.length();
    if (m_position < source_length) {
        m_current_char = m_source[m_position++];
        return;
    }

    m_position = source_length + 1;
    m_current_char = EOF;
}
|
||||
|
||||
void Lexer::reset()
|
||||
{
|
||||
m_position = 0;
|
||||
m_current_token = { TokenType::Eof, 0, StringView(nullptr) };
|
||||
m_current_char = 0;
|
||||
m_previous_position = 0;
|
||||
}
|
||||
|
||||
bool Lexer::try_skip(char c)
|
||||
{
|
||||
if (peek() != c)
|
||||
return false;
|
||||
|
||||
consume();
|
||||
return true;
|
||||
}
|
||||
|
||||
char Lexer::skip()
|
||||
{
|
||||
auto c = peek();
|
||||
consume();
|
||||
return c;
|
||||
}
|
||||
|
||||
// Produces the next token from the source.
//
// Single punctuation characters map to their dedicated token types; a
// backslash followed by one of the recognized special characters becomes a
// two-character EscapeSequence token; anything else is a Char token. At the
// end of the input an Eof token is returned.
Token Lexer::next()
{
    size_t token_start_position;

    // Remembers where a multi-character token begins.
    auto begin_token = [&] {
        token_start_position = m_position;
    };

    // Finishes a multi-character token spanning from the remembered start up
    // to and including the previously consumed position.
    auto commit_token = [&](auto type) -> Token& {
        const auto token_length = m_previous_position - token_start_position + 1;
        ASSERT(token_start_position + token_length <= m_source.length());
        m_current_token = Token(type, token_start_position, m_source.substring_view(token_start_position, token_length));
        return m_current_token;
    };

    // Emits a one-character token at the current position and consumes it.
    auto emit_token = [&](auto type) -> Token& {
        m_current_token = Token(type, m_position, m_source.substring_view(m_position, 1));
        consume();
        return m_current_token;
    };

    // Returns the length (2) of a recognized escape sequence starting at the
    // current backslash, or 0 when the escaped character is not recognized.
    auto match_escape_sequence = [&]() -> size_t {
        switch (peek(1)) {
        case '^':
        case '.':
        case '[':
        case ']':
        case '$':
        case '(':
        case ')':
        case '|':
        case '*':
        case '+':
        case '?':
        case '{':
        case '\\':
            return 2;
        default:
#ifdef REGEX_DEBUG
            fprintf(stderr, "[LEXER] Found invalid escape sequence: \\%c (the parser will have to deal with this!)\n", peek(1));
#endif
            return 0;
        }
    };

    while (m_position <= m_source.length()) {
        const auto ch = peek();

        switch (ch) {
        case '(':
            return emit_token(TokenType::LeftParen);
        case ')':
            return emit_token(TokenType::RightParen);
        case '{':
            return emit_token(TokenType::LeftCurly);
        case '}':
            return emit_token(TokenType::RightCurly);
        case '[':
            return emit_token(TokenType::LeftBracket);
        case ']':
            return emit_token(TokenType::RightBracket);
        case '.':
            return emit_token(TokenType::Period);
        case '*':
            return emit_token(TokenType::Asterisk);
        case '+':
            return emit_token(TokenType::Plus);
        case '$':
            return emit_token(TokenType::Dollar);
        case '^':
            return emit_token(TokenType::Circumflex);
        case '|':
            return emit_token(TokenType::Pipe);
        case '?':
            return emit_token(TokenType::Questionmark);
        case ',':
            return emit_token(TokenType::Comma);
        case '/':
            return emit_token(TokenType::Slash);
        case '=':
            return emit_token(TokenType::EqualSign);
        case ':':
            return emit_token(TokenType::Colon);
        case '-':
            return emit_token(TokenType::HyphenMinus);
        case '\\': {
            const size_t escape_length = match_escape_sequence();
            if (escape_length > 0) {
                begin_token();
                for (size_t i = 0; i < escape_length; ++i)
                    consume();
                return commit_token(TokenType::EscapeSequence);
            }
            // Unrecognized escape: the backslash falls through and is
            // emitted as a plain Char below.
            break;
        }
        default:
            break;
        }

        if (ch == EOF)
            break;

        return emit_token(TokenType::Char);
    }

    return Token(TokenType::Eof, m_position, nullptr);
}
|
||||
|
||||
}
|
110
Userland/Libraries/LibRegex/RegexLexer.h
Normal file
110
Userland/Libraries/LibRegex/RegexLexer.h
Normal file
|
@ -0,0 +1,110 @@
|
|||
/*
|
||||
* Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <AK/Forward.h>
|
||||
#include <AK/StringView.h>
|
||||
|
||||
namespace regex {
|
||||
|
||||
// X-macro listing every token type. Define __ENUMERATE_REGEX_TOKEN(x)
// before expanding this list and #undef it afterwards; TokenType and
// Token::name() are both generated from it.
#define ENUMERATE_REGEX_TOKENS              \
    __ENUMERATE_REGEX_TOKEN(Eof)            \
    __ENUMERATE_REGEX_TOKEN(Char)           \
    __ENUMERATE_REGEX_TOKEN(Circumflex)     \
    __ENUMERATE_REGEX_TOKEN(Period)         \
    __ENUMERATE_REGEX_TOKEN(LeftParen)      \
    __ENUMERATE_REGEX_TOKEN(RightParen)     \
    __ENUMERATE_REGEX_TOKEN(LeftCurly)      \
    __ENUMERATE_REGEX_TOKEN(RightCurly)     \
    __ENUMERATE_REGEX_TOKEN(LeftBracket)    \
    __ENUMERATE_REGEX_TOKEN(RightBracket)   \
    __ENUMERATE_REGEX_TOKEN(Asterisk)       \
    __ENUMERATE_REGEX_TOKEN(EscapeSequence) \
    __ENUMERATE_REGEX_TOKEN(Dollar)         \
    __ENUMERATE_REGEX_TOKEN(Pipe)           \
    __ENUMERATE_REGEX_TOKEN(Plus)           \
    __ENUMERATE_REGEX_TOKEN(Comma)          \
    __ENUMERATE_REGEX_TOKEN(Slash)          \
    __ENUMERATE_REGEX_TOKEN(EqualSign)      \
    __ENUMERATE_REGEX_TOKEN(HyphenMinus)    \
    __ENUMERATE_REGEX_TOKEN(Colon)          \
    __ENUMERATE_REGEX_TOKEN(Questionmark)
|
||||
|
||||
enum class TokenType {
|
||||
#define __ENUMERATE_REGEX_TOKEN(x) x,
|
||||
ENUMERATE_REGEX_TOKENS
|
||||
#undef __ENUMERATE_REGEX_TOKEN
|
||||
};
|
||||
|
||||
class Token {
|
||||
public:
|
||||
Token() = default;
|
||||
Token(const TokenType type, const size_t start_position, const StringView value)
|
||||
: m_type(type)
|
||||
, m_position(start_position)
|
||||
, m_value(value)
|
||||
{
|
||||
}
|
||||
|
||||
TokenType type() const { return m_type; }
|
||||
const StringView& value() const { return m_value; }
|
||||
size_t position() const { return m_position; }
|
||||
|
||||
const char* name() const;
|
||||
static const char* name(const TokenType);
|
||||
|
||||
private:
|
||||
TokenType m_type { TokenType::Eof };
|
||||
size_t m_position { 0 };
|
||||
StringView m_value { nullptr };
|
||||
};
|
||||
|
||||
class Lexer {
|
||||
public:
|
||||
Lexer() = default;
|
||||
explicit Lexer(const StringView source);
|
||||
Token next();
|
||||
void reset();
|
||||
void back(size_t offset);
|
||||
void set_source(const StringView source) { m_source = source; }
|
||||
bool try_skip(char);
|
||||
char skip();
|
||||
|
||||
private:
|
||||
ALWAYS_INLINE char peek(size_t offset = 0) const;
|
||||
ALWAYS_INLINE void consume();
|
||||
|
||||
StringView m_source {};
|
||||
size_t m_position { 0 };
|
||||
size_t m_previous_position { 0 };
|
||||
Token m_current_token { TokenType::Eof, 0, StringView(nullptr) };
|
||||
int m_current_char { 0 };
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
using regex::Lexer;
|
291
Userland/Libraries/LibRegex/RegexMatch.h
Normal file
291
Userland/Libraries/LibRegex/RegexMatch.h
Normal file
|
@ -0,0 +1,291 @@
|
|||
/*
|
||||
* Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "RegexOptions.h"
|
||||
|
||||
#include "AK/FlyString.h"
|
||||
#include "AK/HashMap.h"
|
||||
#include "AK/String.h"
|
||||
#include "AK/StringBuilder.h"
|
||||
#include "AK/StringView.h"
|
||||
#include "AK/Utf32View.h"
|
||||
#include "AK/Vector.h"
|
||||
|
||||
namespace regex {
|
||||
|
||||
class RegexStringView {
|
||||
public:
|
||||
RegexStringView(const char* chars)
|
||||
: m_u8view(chars)
|
||||
{
|
||||
}
|
||||
|
||||
RegexStringView(const String& string)
|
||||
: m_u8view(string)
|
||||
{
|
||||
}
|
||||
|
||||
RegexStringView(const StringView view)
|
||||
: m_u8view(view)
|
||||
{
|
||||
}
|
||||
RegexStringView(const Utf32View view)
|
||||
: m_u32view(view)
|
||||
{
|
||||
}
|
||||
|
||||
bool is_u8_view() const { return m_u8view.has_value(); }
|
||||
bool is_u32_view() const { return m_u32view.has_value(); }
|
||||
|
||||
const StringView& u8view() const
|
||||
{
|
||||
ASSERT(m_u8view.has_value());
|
||||
return m_u8view.value();
|
||||
};
|
||||
|
||||
const Utf32View& u32view() const
|
||||
{
|
||||
ASSERT(m_u32view.has_value());
|
||||
return m_u32view.value();
|
||||
};
|
||||
|
||||
bool is_empty() const
|
||||
{
|
||||
if (is_u8_view())
|
||||
return m_u8view.value().is_empty();
|
||||
else
|
||||
return m_u32view.value().is_empty();
|
||||
}
|
||||
|
||||
bool is_null() const
|
||||
{
|
||||
if (is_u8_view())
|
||||
return m_u8view.value().is_null();
|
||||
else
|
||||
return m_u32view.value().code_points() == nullptr;
|
||||
}
|
||||
|
||||
size_t length() const
|
||||
{
|
||||
if (is_u8_view())
|
||||
return m_u8view.value().length();
|
||||
else
|
||||
return m_u32view.value().length();
|
||||
}
|
||||
|
||||
Vector<RegexStringView> lines() const
|
||||
{
|
||||
if (is_u8_view()) {
|
||||
auto views = u8view().lines(false);
|
||||
Vector<RegexStringView> new_views;
|
||||
for (auto& view : views)
|
||||
new_views.append(move(view));
|
||||
return new_views;
|
||||
}
|
||||
|
||||
// FIXME: line splitting for Utf32View needed
|
||||
Vector<RegexStringView> views;
|
||||
views.append(m_u32view.value());
|
||||
return views;
|
||||
}
|
||||
|
||||
RegexStringView substring_view(size_t offset, size_t length) const
|
||||
{
|
||||
if (is_u8_view()) {
|
||||
return u8view().substring_view(offset, length);
|
||||
}
|
||||
return u32view().substring_view(offset, length);
|
||||
}
|
||||
|
||||
String to_string() const
|
||||
{
|
||||
if (is_u8_view()) {
|
||||
return u8view().to_string();
|
||||
}
|
||||
|
||||
StringBuilder builder;
|
||||
builder.append(u32view());
|
||||
return builder.to_string();
|
||||
}
|
||||
|
||||
u32 operator[](size_t index) const
|
||||
{
|
||||
if (is_u8_view()) {
|
||||
return u8view()[index];
|
||||
}
|
||||
return u32view().code_points()[index];
|
||||
}
|
||||
|
||||
bool operator==(const char* cstring) const
|
||||
{
|
||||
if (is_u8_view())
|
||||
return u8view() == cstring;
|
||||
|
||||
return to_string() == cstring;
|
||||
}
|
||||
|
||||
bool operator!=(const char* cstring) const
|
||||
{
|
||||
return !(*this == cstring);
|
||||
}
|
||||
|
||||
bool operator==(const String& string) const
|
||||
{
|
||||
if (is_u8_view())
|
||||
return u8view() == string;
|
||||
|
||||
return to_string() == string;
|
||||
}
|
||||
|
||||
bool operator==(const StringView& other) const
|
||||
{
|
||||
if (is_u8_view())
|
||||
return u8view() == other;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
bool operator!=(const StringView& other) const
|
||||
{
|
||||
return !(*this == other);
|
||||
}
|
||||
|
||||
bool operator==(const Utf32View& other) const
|
||||
{
|
||||
if (is_u32_view()) {
|
||||
StringBuilder builder;
|
||||
builder.append(other);
|
||||
return to_string() == builder.to_string();
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
bool operator!=(const Utf32View& other) const
|
||||
{
|
||||
return !(*this == other);
|
||||
}
|
||||
|
||||
const char* characters_without_null_termination() const
|
||||
{
|
||||
if (is_u8_view())
|
||||
return u8view().characters_without_null_termination();
|
||||
|
||||
return to_string().characters(); // FIXME: it contains the null termination, does that actually matter?
|
||||
}
|
||||
|
||||
bool starts_with(const StringView& str) const
|
||||
{
|
||||
if (is_u32_view())
|
||||
return false;
|
||||
return u8view().starts_with(str);
|
||||
}
|
||||
|
||||
bool starts_with(const Utf32View& str) const
|
||||
{
|
||||
if (is_u8_view())
|
||||
return false;
|
||||
|
||||
StringBuilder builder;
|
||||
builder.append(str);
|
||||
return to_string().starts_with(builder.to_string());
|
||||
}
|
||||
|
||||
private:
|
||||
Optional<StringView> m_u8view;
|
||||
Optional<Utf32View> m_u32view;
|
||||
};
|
||||
|
||||
class Match final {
|
||||
private:
|
||||
Optional<FlyString> string;
|
||||
|
||||
public:
|
||||
Match() = default;
|
||||
~Match() = default;
|
||||
|
||||
Match(const RegexStringView view_, const size_t line_, const size_t column_, const size_t global_offset_)
|
||||
: view(view_)
|
||||
, line(line_)
|
||||
, column(column_)
|
||||
, global_offset(global_offset_)
|
||||
, left_column(column_)
|
||||
{
|
||||
}
|
||||
|
||||
Match(const String string_, const size_t line_, const size_t column_, const size_t global_offset_)
|
||||
: string(string_)
|
||||
, view(string.value().view())
|
||||
, line(line_)
|
||||
, column(column_)
|
||||
, global_offset(global_offset_)
|
||||
, left_column(column_)
|
||||
{
|
||||
}
|
||||
|
||||
RegexStringView view { nullptr };
|
||||
size_t line { 0 };
|
||||
size_t column { 0 };
|
||||
size_t global_offset { 0 };
|
||||
|
||||
// ugly, as not usable by user, but needed to prevent to create extra vectors that are
|
||||
// able to store the column when the left paren has been found
|
||||
size_t left_column { 0 };
|
||||
};
|
||||
|
||||
struct MatchInput {
|
||||
RegexStringView view { nullptr };
|
||||
AllOptions regex_options {};
|
||||
size_t start_offset { 0 }; // For Stateful matches, saved and restored from Regex::start_offset.
|
||||
|
||||
size_t match_index { 0 };
|
||||
size_t line { 0 };
|
||||
size_t column { 0 };
|
||||
|
||||
size_t global_offset { 0 }; // For multiline matching, knowing the offset from start could be important
|
||||
|
||||
mutable size_t fail_counter { 0 };
|
||||
mutable Vector<size_t> saved_positions;
|
||||
};
|
||||
|
||||
// Interpreter cursor: a position in the subject text, a position in the
// bytecode, and the bytecode position a fork resumes from. All start at zero.
struct MatchState {
    size_t string_position { 0 };
    size_t instruction_position { 0 };
    size_t fork_at_position { 0 };
};
|
||||
|
||||
struct MatchOutput {
|
||||
size_t operations;
|
||||
Vector<Match> matches;
|
||||
Vector<Vector<Match>> capture_group_matches;
|
||||
Vector<HashMap<String, Match>> named_capture_group_matches;
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
using regex::RegexStringView;
|
396
Userland/Libraries/LibRegex/RegexMatcher.cpp
Normal file
396
Userland/Libraries/LibRegex/RegexMatcher.cpp
Normal file
|
@ -0,0 +1,396 @@
|
|||
/*
|
||||
* Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "RegexMatcher.h"
|
||||
#include "RegexDebug.h"
|
||||
#include "RegexParser.h"
|
||||
#include <AK/ScopedValueRollback.h>
|
||||
#include <AK/String.h>
|
||||
#include <AK/StringBuilder.h>
|
||||
|
||||
namespace regex {
|
||||
|
||||
#if defined(REGEX_DEBUG)
// File-local debug printer, only built when REGEX_DEBUG is defined; writes to stderr.
static RegexDebug s_regex_dbg(stderr);
#endif
|
||||
|
||||
template<class Parser>
|
||||
Regex<Parser>::Regex(StringView pattern, typename ParserTraits<Parser>::OptionsType regex_options)
|
||||
{
|
||||
pattern_value = pattern.to_string();
|
||||
regex::Lexer lexer(pattern);
|
||||
|
||||
Parser parser(lexer, regex_options);
|
||||
parser_result = parser.parse();
|
||||
|
||||
if (parser_result.error == regex::Error::NoError)
|
||||
matcher = make<Matcher<Parser>>(*this, regex_options);
|
||||
}
|
||||
|
||||
template<class Parser>
|
||||
typename ParserTraits<Parser>::OptionsType Regex<Parser>::options() const
|
||||
{
|
||||
if (parser_result.error != Error::NoError)
|
||||
return {};
|
||||
|
||||
return matcher->options();
|
||||
}
|
||||
|
||||
template<class Parser>
|
||||
String Regex<Parser>::error_string(Optional<String> message) const
|
||||
{
|
||||
StringBuilder eb;
|
||||
eb.appendf("Error during parsing of regular expression:\n");
|
||||
eb.appendf(" %s\n ", pattern_value.characters());
|
||||
for (size_t i = 0; i < parser_result.error_token.position(); ++i)
|
||||
eb.append(" ");
|
||||
|
||||
eb.appendf("^---- %s", message.value_or(get_error_string(parser_result.error)).characters());
|
||||
return eb.build();
|
||||
}
|
||||
|
||||
template<typename Parser>
|
||||
RegexResult Matcher<Parser>::match(const RegexStringView& view, Optional<typename ParserTraits<Parser>::OptionsType> regex_options) const
|
||||
{
|
||||
AllOptions options = m_regex_options | regex_options.value_or({}).value();
|
||||
|
||||
if (options.has_flag_set(AllFlags::Multiline))
|
||||
return match(view.lines(), regex_options); // FIXME: how do we know, which line ending a line has (1char or 2char)? This is needed to get the correct match offsets from start of string...
|
||||
|
||||
Vector<RegexStringView> views;
|
||||
views.append(view);
|
||||
return match(views, regex_options);
|
||||
}
|
||||
|
||||
template<typename Parser>
|
||||
RegexResult Matcher<Parser>::match(const Vector<RegexStringView> views, Optional<typename ParserTraits<Parser>::OptionsType> regex_options) const
|
||||
{
|
||||
// If the pattern *itself* isn't stateful, reset any changes to start_offset.
|
||||
if (!((AllFlags)m_regex_options.value() & AllFlags::Internal_Stateful))
|
||||
m_pattern.start_offset = 0;
|
||||
|
||||
size_t match_count { 0 };
|
||||
|
||||
MatchInput input;
|
||||
MatchState state;
|
||||
MatchOutput output;
|
||||
|
||||
input.regex_options = m_regex_options | regex_options.value_or({}).value();
|
||||
input.start_offset = m_pattern.start_offset;
|
||||
output.operations = 0;
|
||||
|
||||
if (input.regex_options.has_flag_set(AllFlags::Internal_Stateful))
|
||||
ASSERT(views.size() == 1);
|
||||
|
||||
if (c_match_preallocation_count) {
|
||||
output.matches.ensure_capacity(c_match_preallocation_count);
|
||||
output.capture_group_matches.ensure_capacity(c_match_preallocation_count);
|
||||
output.named_capture_group_matches.ensure_capacity(c_match_preallocation_count);
|
||||
|
||||
auto& capture_groups_count = m_pattern.parser_result.capture_groups_count;
|
||||
auto& named_capture_groups_count = m_pattern.parser_result.named_capture_groups_count;
|
||||
|
||||
for (size_t j = 0; j < c_match_preallocation_count; ++j) {
|
||||
output.matches.empend();
|
||||
output.capture_group_matches.unchecked_append({});
|
||||
output.capture_group_matches.at(j).ensure_capacity(capture_groups_count);
|
||||
for (size_t k = 0; k < capture_groups_count; ++k)
|
||||
output.capture_group_matches.at(j).unchecked_append({});
|
||||
|
||||
output.named_capture_group_matches.unchecked_append({});
|
||||
output.named_capture_group_matches.at(j).ensure_capacity(named_capture_groups_count);
|
||||
}
|
||||
}
|
||||
|
||||
auto append_match = [](auto& input, auto& state, auto& output, auto& start_position) {
|
||||
if (output.matches.size() == input.match_index)
|
||||
output.matches.empend();
|
||||
|
||||
ASSERT(start_position + state.string_position - start_position <= input.view.length());
|
||||
if (input.regex_options.has_flag_set(AllFlags::StringCopyMatches)) {
|
||||
output.matches.at(input.match_index) = { input.view.substring_view(start_position, state.string_position - start_position).to_string(), input.line, start_position, input.global_offset + start_position };
|
||||
} else { // let the view point to the original string ...
|
||||
output.matches.at(input.match_index) = { input.view.substring_view(start_position, state.string_position - start_position), input.line, start_position, input.global_offset + start_position };
|
||||
}
|
||||
};
|
||||
|
||||
#ifdef REGEX_DEBUG
|
||||
s_regex_dbg.print_header();
|
||||
#endif
|
||||
|
||||
bool continue_search = input.regex_options.has_flag_set(AllFlags::Global) || input.regex_options.has_flag_set(AllFlags::Multiline);
|
||||
if (input.regex_options.has_flag_set(AllFlags::Internal_Stateful))
|
||||
continue_search = false;
|
||||
|
||||
for (auto& view : views) {
|
||||
input.view = view;
|
||||
#ifdef REGEX_DEBUG
|
||||
dbg() << "[match] Starting match with view (" << view.length() << "): _" << view.to_string() << "_";
|
||||
#endif
|
||||
|
||||
auto view_length = view.length();
|
||||
size_t view_index = m_pattern.start_offset;
|
||||
state.string_position = view_index;
|
||||
|
||||
if (view_index == view_length && m_pattern.parser_result.match_length_minimum == 0) {
|
||||
// Run the code until it tries to consume something.
|
||||
// This allows non-consuming code to run on empty strings, for instance
|
||||
// e.g. "Exit"
|
||||
MatchOutput temp_output { output };
|
||||
|
||||
input.column = match_count;
|
||||
input.match_index = match_count;
|
||||
|
||||
state.string_position = view_index;
|
||||
state.instruction_position = 0;
|
||||
|
||||
auto success = execute(input, state, temp_output, 0);
|
||||
// This success is acceptable only if it doesn't read anything from the input (input length is 0).
|
||||
if (state.string_position <= view_index) {
|
||||
if (success.value()) {
|
||||
output = move(temp_output);
|
||||
if (!match_count) {
|
||||
// Nothing was *actually* matched, so append an empty match.
|
||||
append_match(input, state, output, view_index);
|
||||
++match_count;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (; view_index < view_length; ++view_index) {
|
||||
auto& match_length_minimum = m_pattern.parser_result.match_length_minimum;
|
||||
// FIXME: More performant would be to know the remaining minimum string
|
||||
// length needed to match from the current position onwards within
|
||||
// the vm. Add new OpCode for MinMatchLengthFromSp with the value of
|
||||
// the remaining string length from the current path. The value though
|
||||
// has to be filled in reverse. That implies a second run over bytecode
|
||||
// after generation has finished.
|
||||
if (match_length_minimum && match_length_minimum > view_length - view_index)
|
||||
break;
|
||||
|
||||
input.column = match_count;
|
||||
input.match_index = match_count;
|
||||
|
||||
state.string_position = view_index;
|
||||
state.instruction_position = 0;
|
||||
|
||||
auto success = execute(input, state, output, 0);
|
||||
if (!success.has_value())
|
||||
return { false, 0, {}, {}, {}, output.operations };
|
||||
|
||||
if (success.value()) {
|
||||
|
||||
if (input.regex_options.has_flag_set(AllFlags::MatchNotEndOfLine) && state.string_position == input.view.length()) {
|
||||
if (!continue_search)
|
||||
break;
|
||||
continue;
|
||||
}
|
||||
if (input.regex_options.has_flag_set(AllFlags::MatchNotBeginOfLine) && view_index == 0) {
|
||||
if (!continue_search)
|
||||
break;
|
||||
continue;
|
||||
}
|
||||
|
||||
#ifdef REGEX_DEBUG
|
||||
dbg() << "state.string_position: " << state.string_position << " view_index: " << view_index;
|
||||
dbg() << "[match] Found a match (length = " << state.string_position - view_index << "): " << input.view.substring_view(view_index, state.string_position - view_index).to_string();
|
||||
#endif
|
||||
++match_count;
|
||||
|
||||
if (continue_search) {
|
||||
append_match(input, state, output, view_index);
|
||||
|
||||
bool has_zero_length = state.string_position == view_index;
|
||||
view_index = state.string_position - (has_zero_length ? 0 : 1);
|
||||
continue;
|
||||
|
||||
} else if (input.regex_options.has_flag_set(AllFlags::Internal_Stateful)) {
|
||||
append_match(input, state, output, view_index);
|
||||
break;
|
||||
|
||||
} else if (state.string_position < view_length) {
|
||||
return { false, 0, {}, {}, {}, output.operations };
|
||||
}
|
||||
|
||||
append_match(input, state, output, view_index);
|
||||
break;
|
||||
}
|
||||
|
||||
if (!continue_search && !input.regex_options.has_flag_set(AllFlags::Internal_Stateful))
|
||||
break;
|
||||
}
|
||||
|
||||
++input.line;
|
||||
input.global_offset += view.length() + 1; // +1 includes the line break character
|
||||
|
||||
if (input.regex_options.has_flag_set(AllFlags::Internal_Stateful))
|
||||
m_pattern.start_offset = state.string_position;
|
||||
}
|
||||
|
||||
MatchOutput output_copy;
|
||||
if (match_count) {
|
||||
auto capture_groups_count = min(output.capture_group_matches.size(), output.matches.size());
|
||||
for (size_t i = 0; i < capture_groups_count; ++i) {
|
||||
if (input.regex_options.has_flag_set(AllFlags::SkipTrimEmptyMatches)) {
|
||||
output_copy.capture_group_matches.append(output.capture_group_matches.at(i));
|
||||
} else {
|
||||
Vector<Match> capture_group_matches;
|
||||
for (size_t j = 0; j < output.capture_group_matches.at(i).size(); ++j) {
|
||||
if (!output.capture_group_matches.at(i).at(j).view.is_null())
|
||||
capture_group_matches.append(output.capture_group_matches.at(i).at(j));
|
||||
}
|
||||
output_copy.capture_group_matches.append(capture_group_matches);
|
||||
}
|
||||
}
|
||||
|
||||
auto named_capture_groups_count = min(output.named_capture_group_matches.size(), output.matches.size());
|
||||
for (size_t i = 0; i < named_capture_groups_count; ++i) {
|
||||
if (output.matches.at(i).view.length())
|
||||
output_copy.named_capture_group_matches.append(output.named_capture_group_matches.at(i));
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < match_count; ++i)
|
||||
output_copy.matches.append(output.matches.at(i));
|
||||
|
||||
} else {
|
||||
output_copy.capture_group_matches.clear_with_capacity();
|
||||
output_copy.named_capture_group_matches.clear_with_capacity();
|
||||
}
|
||||
|
||||
return {
|
||||
match_count ? true : false,
|
||||
match_count,
|
||||
move(output_copy.matches),
|
||||
move(output_copy.capture_group_matches),
|
||||
move(output_copy.named_capture_group_matches),
|
||||
output.operations,
|
||||
m_pattern.parser_result.capture_groups_count,
|
||||
m_pattern.parser_result.named_capture_groups_count,
|
||||
};
|
||||
}
|
||||
|
||||
template<class Parser>
|
||||
Optional<bool> Matcher<Parser>::execute(const MatchInput& input, MatchState& state, MatchOutput& output, size_t recursion_level) const
|
||||
{
|
||||
if (recursion_level > c_max_recursion)
|
||||
return false;
|
||||
|
||||
Vector<MatchState> fork_low_prio_states;
|
||||
MatchState fork_high_prio_state;
|
||||
Optional<bool> success;
|
||||
|
||||
auto& bytecode = m_pattern.parser_result.bytecode;
|
||||
|
||||
for (;;) {
|
||||
++output.operations;
|
||||
auto* opcode = bytecode.get_opcode(state);
|
||||
|
||||
if (!opcode) {
|
||||
dbgln("Wrong opcode... failed!");
|
||||
return {};
|
||||
}
|
||||
|
||||
#ifdef REGEX_DEBUG
|
||||
s_regex_dbg.print_opcode("VM", *opcode, state, recursion_level, false);
|
||||
#endif
|
||||
|
||||
ExecutionResult result;
|
||||
if (input.fail_counter > 0) {
|
||||
--input.fail_counter;
|
||||
result = ExecutionResult::Failed_ExecuteLowPrioForks;
|
||||
} else {
|
||||
result = opcode->execute(input, state, output);
|
||||
}
|
||||
|
||||
#ifdef REGEX_DEBUG
|
||||
s_regex_dbg.print_result(*opcode, bytecode, input, state, result);
|
||||
#endif
|
||||
|
||||
state.instruction_position += opcode->size();
|
||||
|
||||
switch (result) {
|
||||
case ExecutionResult::Fork_PrioLow:
|
||||
fork_low_prio_states.prepend(state);
|
||||
continue;
|
||||
case ExecutionResult::Fork_PrioHigh:
|
||||
fork_high_prio_state = state;
|
||||
fork_high_prio_state.instruction_position = fork_high_prio_state.fork_at_position;
|
||||
success = execute(input, fork_high_prio_state, output, ++recursion_level);
|
||||
if (!success.has_value())
|
||||
return {};
|
||||
|
||||
if (success.value()) {
|
||||
state = fork_high_prio_state;
|
||||
return true;
|
||||
}
|
||||
|
||||
continue;
|
||||
case ExecutionResult::Continue:
|
||||
continue;
|
||||
case ExecutionResult::Succeeded:
|
||||
return true;
|
||||
case ExecutionResult::Failed:
|
||||
return false;
|
||||
case ExecutionResult::Failed_ExecuteLowPrioForks:
|
||||
return execute_low_prio_forks(input, state, output, fork_low_prio_states, recursion_level + 1);
|
||||
}
|
||||
}
|
||||
|
||||
ASSERT_NOT_REACHED();
|
||||
}
|
||||
|
||||
template<class Parser>
|
||||
ALWAYS_INLINE Optional<bool> Matcher<Parser>::execute_low_prio_forks(const MatchInput& input, MatchState& original_state, MatchOutput& output, Vector<MatchState> states, size_t recursion_level) const
|
||||
{
|
||||
for (auto& state : states) {
|
||||
|
||||
state.instruction_position = state.fork_at_position;
|
||||
#ifdef REGEX_DEBUG
|
||||
fprintf(stderr, "Forkstay... ip = %lu, sp = %lu\n", state.instruction_position, state.string_position);
|
||||
#endif
|
||||
auto success = execute(input, state, output, recursion_level);
|
||||
if (!success.has_value())
|
||||
return {};
|
||||
if (success.value()) {
|
||||
#ifdef REGEX_DEBUG
|
||||
fprintf(stderr, "Forkstay succeeded... ip = %lu, sp = %lu\n", state.instruction_position, state.string_position);
|
||||
#endif
|
||||
original_state = state;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
original_state.string_position = 0;
|
||||
return false;
|
||||
}
|
||||
|
||||
template class Matcher<PosixExtendedParser>;
|
||||
template class Regex<PosixExtendedParser>;
|
||||
|
||||
template class Matcher<ECMA262Parser>;
|
||||
template class Regex<ECMA262Parser>;
|
||||
}
|
296
Userland/Libraries/LibRegex/RegexMatcher.h
Normal file
296
Userland/Libraries/LibRegex/RegexMatcher.h
Normal file
|
@ -0,0 +1,296 @@
|
|||
/*
|
||||
* Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "RegexByteCode.h"
|
||||
#include "RegexMatch.h"
|
||||
#include "RegexOptions.h"
|
||||
#include "RegexParser.h"
|
||||
|
||||
#include <AK/Forward.h>
|
||||
#include <AK/HashMap.h>
|
||||
#include <AK/NonnullOwnPtrVector.h>
|
||||
#include <AK/Types.h>
|
||||
#include <AK/Utf32View.h>
|
||||
#include <AK/Vector.h>
|
||||
#include <ctype.h>
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
namespace regex {
|
||||
|
||||
// Deepest recursion level Matcher::execute() accepts before it reports failure.
static constexpr size_t c_max_recursion { 5000 };
// Number of result slots Matcher::match() builds up front; 0 disables that step.
static constexpr size_t c_match_preallocation_count { 0 };
|
||||
|
||||
struct RegexResult final {
|
||||
bool success { false };
|
||||
size_t count { 0 };
|
||||
Vector<Match> matches;
|
||||
Vector<Vector<Match>> capture_group_matches;
|
||||
Vector<HashMap<String, Match>> named_capture_group_matches;
|
||||
size_t n_operations { 0 };
|
||||
size_t n_capture_groups { 0 };
|
||||
size_t n_named_capture_groups { 0 };
|
||||
};
|
||||
|
||||
template<class Parser>
|
||||
class Regex;
|
||||
|
||||
template<class Parser>
|
||||
class Matcher final {
|
||||
|
||||
public:
|
||||
Matcher(const Regex<Parser>& pattern, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
|
||||
: m_pattern(pattern)
|
||||
, m_regex_options(regex_options.value_or({}))
|
||||
{
|
||||
}
|
||||
~Matcher() = default;
|
||||
|
||||
RegexResult match(const RegexStringView&, Optional<typename ParserTraits<Parser>::OptionsType> = {}) const;
|
||||
RegexResult match(const Vector<RegexStringView>, Optional<typename ParserTraits<Parser>::OptionsType> = {}) const;
|
||||
|
||||
typename ParserTraits<Parser>::OptionsType options() const
|
||||
{
|
||||
return m_regex_options;
|
||||
}
|
||||
|
||||
private:
|
||||
Optional<bool> execute(const MatchInput& input, MatchState& state, MatchOutput& output, size_t recursion_level) const;
|
||||
ALWAYS_INLINE Optional<bool> execute_low_prio_forks(const MatchInput& input, MatchState& original_state, MatchOutput& output, Vector<MatchState> states, size_t recursion_level) const;
|
||||
|
||||
const Regex<Parser>& m_pattern;
|
||||
const typename ParserTraits<Parser>::OptionsType m_regex_options;
|
||||
};
|
||||
|
||||
template<class Parser>
|
||||
class Regex final {
|
||||
public:
|
||||
String pattern_value;
|
||||
regex::Parser::Result parser_result;
|
||||
OwnPtr<Matcher<Parser>> matcher { nullptr };
|
||||
mutable size_t start_offset { 0 };
|
||||
|
||||
explicit Regex(StringView pattern, typename ParserTraits<Parser>::OptionsType regex_options = {});
|
||||
~Regex() = default;
|
||||
|
||||
typename ParserTraits<Parser>::OptionsType options() const;
|
||||
void print_bytecode(FILE* f = stdout) const;
|
||||
String error_string(Optional<String> message = {}) const;
|
||||
|
||||
// Matches a single view. Without a matcher, or after a parse error, a
// default-constructed (failed) result is returned.
RegexResult match(const RegexStringView view, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
{
    if (!matcher)
        return {};
    if (parser_result.error != Error::NoError)
        return {};

    return matcher->match(view, regex_options);
}
|
||||
|
||||
// Matches a list of views. Without a matcher, or after a parse error, a
// default-constructed (failed) result is returned.
RegexResult match(const Vector<RegexStringView> views, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
{
    if (!matcher)
        return {};
    if (parser_result.error != Error::NoError)
        return {};

    return matcher->match(views, regex_options);
}
|
||||
|
||||
// Returns `view` with every match replaced by `replacement_pattern`.
//
// In the replacement pattern, "\\" produces a single backslash and "\N"
// (decimal N, counted from 1) inserts the text of capture group N of the
// current match. A reference that cannot be resolved (N is 0, N is larger
// than the number of capture groups, or no such capture was recorded for this
// match) is copied to the output literally as "\N".
//
// Returns an empty String when the pattern failed to parse, and the unchanged
// text when nothing matched.
String replace(const RegexStringView view, const StringView& replacement_pattern, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
{
    if (!matcher || parser_result.error != Error::NoError)
        return {};

    StringBuilder builder;
    // End of the text already copied to the builder. Deliberately not called
    // start_offset, which is also the name of a member of this class.
    size_t copied_until = 0;
    RegexResult result = matcher->match(view, regex_options);
    if (!result.success)
        return view.to_string();

    for (size_t i = 0; i < result.matches.size(); ++i) {
        auto& match = result.matches[i];
        // Copy the unmatched text between the previous match and this one.
        builder.append(view.substring_view(copied_until, match.global_offset - copied_until).to_string());
        copied_until = match.global_offset + match.view.length();

        GenericLexer lexer(replacement_pattern);
        while (!lexer.is_eof()) {
            if (lexer.consume_specific('\\')) {
                if (lexer.consume_specific('\\')) {
                    builder.append('\\');
                    continue;
                }
                auto number = lexer.consume_while(isdigit);
                auto index = number.to_uint();
                // Group numbers start at 1, so 0 must be rejected before the
                // "- 1" below. The size checks keep the lookup inside the
                // capture data actually recorded for this match.
                bool is_usable_reference = index.has_value()
                    && index.value() >= 1
                    && result.n_capture_groups >= index.value()
                    && i < result.capture_group_matches.size()
                    && index.value() <= result.capture_group_matches[i].size();
                if (is_usable_reference) {
                    builder.append(result.capture_group_matches[i][index.value() - 1].view.to_string());
                } else {
                    builder.appendff("\\{}", number);
                }
            } else {
                builder.append(lexer.consume_while([](auto ch) { return ch != '\\'; }));
            }
        }
    }

    // Copy whatever follows the last match.
    builder.append(view.substring_view(copied_until, view.length() - copied_until).to_string());

    return builder.to_string();
}
|
||||
|
||||
// FIXME: replace(const Vector<RegexStringView>, ...)
|
||||
|
||||
// Like match(), but always adds AllFlags::Global to the options passed to the matcher.
// If the caller set both MatchNotBeginOfLine and MatchNotEndOfLine, both are cleared first.
// Yields a default-constructed RegexResult if there is no matcher or the pattern
// did not parse cleanly.
RegexResult search(const RegexStringView view, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
{
    if (!matcher)
        return {};
    if (parser_result.error != Error::NoError)
        return {};

    AllOptions search_options = static_cast<AllOptions>(regex_options.value_or({}));
    if (search_options & AllFlags::MatchNotBeginOfLine) {
        if (search_options & AllFlags::MatchNotEndOfLine) {
            search_options.reset_flag(AllFlags::MatchNotEndOfLine);
            search_options.reset_flag(AllFlags::MatchNotBeginOfLine);
        }
    }
    search_options.set_flag(AllFlags::Global);

    return matcher->match(view, search_options);
}
|
||||
|
||||
// Multi-view variant of search(): always adds AllFlags::Global to the options passed
// to the matcher. If the caller set both MatchNotBeginOfLine and MatchNotEndOfLine,
// both are cleared first. Yields a default-constructed RegexResult if there is no
// matcher or the pattern did not parse cleanly.
RegexResult search(const Vector<RegexStringView> views, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
{
    if (!matcher)
        return {};
    if (parser_result.error != Error::NoError)
        return {};

    AllOptions search_options = static_cast<AllOptions>(regex_options.value_or({}));
    if (search_options & AllFlags::MatchNotBeginOfLine) {
        if (search_options & AllFlags::MatchNotEndOfLine) {
            search_options.reset_flag(AllFlags::MatchNotEndOfLine);
            search_options.reset_flag(AllFlags::MatchNotBeginOfLine);
        }
    }
    search_options.set_flag(AllFlags::Global);

    return matcher->match(views, search_options);
}
|
||||
|
||||
bool match(const RegexStringView view, RegexResult& m, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
|
||||
{
|
||||
m = match(view, regex_options);
|
||||
return m.success;
|
||||
}
|
||||
|
||||
bool match(const Vector<RegexStringView> views, RegexResult& m, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
|
||||
{
|
||||
m = match(views, regex_options);
|
||||
return m.success;
|
||||
}
|
||||
|
||||
bool search(const RegexStringView view, RegexResult& m, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
|
||||
{
|
||||
m = search(view, regex_options);
|
||||
return m.success;
|
||||
}
|
||||
|
||||
bool search(const Vector<RegexStringView> views, RegexResult& m, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
|
||||
{
|
||||
m = search(views, regex_options);
|
||||
return m.success;
|
||||
}
|
||||
|
||||
bool has_match(const RegexStringView view, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
|
||||
{
|
||||
if (!matcher || parser_result.error != Error::NoError)
|
||||
return false;
|
||||
RegexResult result = matcher->match(view, AllOptions { regex_options.value_or({}) } | AllFlags::SkipSubExprResults);
|
||||
return result.success;
|
||||
}
|
||||
|
||||
bool has_match(const Vector<RegexStringView> views, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
|
||||
{
|
||||
if (!matcher || parser_result.error != Error::NoError)
|
||||
return false;
|
||||
RegexResult result = matcher->match(views, AllOptions { regex_options.value_or({}) } | AllFlags::SkipSubExprResults);
|
||||
return result.success;
|
||||
}
|
||||
};
|
||||
|
||||
// free standing functions for match, search and has_match
|
||||
template<class Parser>
|
||||
RegexResult match(const RegexStringView view, Regex<Parser>& pattern, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
|
||||
{
|
||||
return pattern.match(view, regex_options);
|
||||
}
|
||||
|
||||
template<class Parser>
|
||||
RegexResult match(const Vector<RegexStringView> view, Regex<Parser>& pattern, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
|
||||
{
|
||||
return pattern.match(view, regex_options);
|
||||
}
|
||||
|
||||
template<class Parser>
|
||||
bool match(const RegexStringView view, Regex<Parser>& pattern, RegexResult&, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
|
||||
{
|
||||
return pattern.match(view, regex_options);
|
||||
}
|
||||
|
||||
template<class Parser>
|
||||
bool match(const Vector<RegexStringView> view, Regex<Parser>& pattern, RegexResult&, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
|
||||
{
|
||||
return pattern.match(view, regex_options);
|
||||
}
|
||||
|
||||
template<class Parser>
|
||||
RegexResult search(const RegexStringView view, Regex<Parser>& pattern, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
|
||||
{
|
||||
return pattern.search(view, regex_options);
|
||||
}
|
||||
|
||||
template<class Parser>
|
||||
RegexResult search(const Vector<RegexStringView> views, Regex<Parser>& pattern, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
|
||||
{
|
||||
return pattern.search(views, regex_options);
|
||||
}
|
||||
|
||||
template<class Parser>
|
||||
bool search(const RegexStringView view, Regex<Parser>& pattern, RegexResult&, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
|
||||
{
|
||||
return pattern.search(view, regex_options);
|
||||
}
|
||||
|
||||
template<class Parser>
|
||||
bool search(const Vector<RegexStringView> views, Regex<Parser>& pattern, RegexResult&, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
|
||||
{
|
||||
return pattern.search(views, regex_options);
|
||||
}
|
||||
|
||||
template<class Parser>
|
||||
bool has_match(const RegexStringView view, Regex<Parser>& pattern, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
|
||||
{
|
||||
return pattern.has_match(view, regex_options);
|
||||
}
|
||||
|
||||
template<class Parser>
|
||||
bool has_match(const Vector<RegexStringView> views, Regex<Parser>& pattern, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
|
||||
{
|
||||
return pattern.has_match(views, regex_options);
|
||||
}
|
||||
}
|
||||
|
||||
using regex::has_match;
|
||||
using regex::match;
|
||||
using regex::Regex;
|
||||
using regex::RegexResult;
|
161
Userland/Libraries/LibRegex/RegexOptions.h
Normal file
161
Userland/Libraries/LibRegex/RegexOptions.h
Normal file
|
@ -0,0 +1,161 @@
|
|||
/*
|
||||
* Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <AK/Types.h>
|
||||
#include <stdio.h>
|
||||
#ifdef __serenity__
|
||||
# include <regex.h>
|
||||
#else
|
||||
# include <LibC/regex.h>
|
||||
#endif
|
||||
|
||||
namespace regex {
|
||||
|
||||
// Integer type that flag values are cast to whenever their bits are combined or tested;
// also the declared underlying type of PosixFlags and ECMAScriptFlags.
using FlagsUnderlyingType = u16;
|
||||
|
||||
// Union of every flag known to the regex engine. Each enumerator takes its value from
// the __Regex_* constant of the same name (presumably declared by the regex.h header
// included above - confirm). The per-flavour enums PosixFlags and ECMAScriptFlags
// re-export subsets of these values.
enum class AllFlags {
|
||||
Global = __Regex_Global, // All matches (don't return after first match)
|
||||
Insensitive = __Regex_Insensitive, // Case insensitive match (ignores case of [a-zA-Z])
|
||||
Ungreedy = __Regex_Ungreedy, // The match becomes lazy by default. Now a ? following a quantifier makes it greedy
|
||||
Unicode = __Regex_Unicode, // Enable all unicode features and interpret all unicode escape sequences as such
|
||||
Extended = __Regex_Extended, // Ignore whitespace. Spaces and text after a # in the pattern are ignored
|
||||
Extra = __Regex_Extra, // Disallow meaningless escapes. A \ followed by a letter with no special meaning is faulted
|
||||
MatchNotBeginOfLine = __Regex_MatchNotBeginOfLine, // Pattern is not forced to ^ -> search in whole string!
|
||||
MatchNotEndOfLine = __Regex_MatchNotEndOfLine, // Don't force the dollar sign, $, to always match end of the string, instead of end of the line. This option is ignored if the Multiline flag is set
|
||||
SkipSubExprResults = __Regex_SkipSubExprResults, // Do not return sub expressions in the result
|
||||
StringCopyMatches = __Regex_StringCopyMatches, // Explicitly copy results into a newly allocated string instead of a StringView into the original string.
|
||||
SingleLine = __Regex_SingleLine, // Dot matches newline characters
|
||||
Sticky = __Regex_Sticky, // Force the pattern to only match consecutive matches from where the previous match ended.
|
||||
Multiline = __Regex_Multiline, // Handle newline characters. Match each line, one by one.
|
||||
SkipTrimEmptyMatches = __Regex_SkipTrimEmptyMatches, // Do not remove empty capture group results.
|
||||
Internal_Stateful = __Regex_Internal_Stateful, // Make global matches match one result at a time, and further match() calls on the same instance continue where the previous one left off.
|
||||
Last = Internal_Stateful, // Alias of the final flag listed above.
|
||||
};
|
||||
|
||||
// Flags exposed for the POSIX flavour. Every enumerator carries the value of the
// AllFlags enumerator with the same name.
enum class PosixFlags : FlagsUnderlyingType {
    Global = static_cast<FlagsUnderlyingType>(AllFlags::Global),
    Insensitive = static_cast<FlagsUnderlyingType>(AllFlags::Insensitive),
    Ungreedy = static_cast<FlagsUnderlyingType>(AllFlags::Ungreedy),
    Unicode = static_cast<FlagsUnderlyingType>(AllFlags::Unicode),
    Extended = static_cast<FlagsUnderlyingType>(AllFlags::Extended),
    Extra = static_cast<FlagsUnderlyingType>(AllFlags::Extra),
    MatchNotBeginOfLine = static_cast<FlagsUnderlyingType>(AllFlags::MatchNotBeginOfLine),
    MatchNotEndOfLine = static_cast<FlagsUnderlyingType>(AllFlags::MatchNotEndOfLine),
    SkipSubExprResults = static_cast<FlagsUnderlyingType>(AllFlags::SkipSubExprResults),
    SkipTrimEmptyMatches = static_cast<FlagsUnderlyingType>(AllFlags::SkipTrimEmptyMatches),
    Multiline = static_cast<FlagsUnderlyingType>(AllFlags::Multiline),
    StringCopyMatches = static_cast<FlagsUnderlyingType>(AllFlags::StringCopyMatches),
};
|
||||
|
||||
// Flags exposed for the ECMAScript flavour. Apart from Global, every enumerator
// carries the value of the AllFlags enumerator with the same name.
enum class ECMAScriptFlags : FlagsUnderlyingType {
    // Note: ECMAScript "Global" creates a stateful regex, so it also sets Internal_Stateful.
    Global = static_cast<FlagsUnderlyingType>(AllFlags::Global) | static_cast<FlagsUnderlyingType>(AllFlags::Internal_Stateful),
    Insensitive = static_cast<FlagsUnderlyingType>(AllFlags::Insensitive),
    Ungreedy = static_cast<FlagsUnderlyingType>(AllFlags::Ungreedy),
    Unicode = static_cast<FlagsUnderlyingType>(AllFlags::Unicode),
    Extended = static_cast<FlagsUnderlyingType>(AllFlags::Extended),
    Extra = static_cast<FlagsUnderlyingType>(AllFlags::Extra),
    SingleLine = static_cast<FlagsUnderlyingType>(AllFlags::SingleLine),
    Sticky = static_cast<FlagsUnderlyingType>(AllFlags::Sticky),
    Multiline = static_cast<FlagsUnderlyingType>(AllFlags::Multiline),
    StringCopyMatches = static_cast<FlagsUnderlyingType>(AllFlags::StringCopyMatches),
};
|
||||
|
||||
template<class T>
|
||||
class RegexOptions {
|
||||
public:
|
||||
using FlagsType = T;
|
||||
|
||||
RegexOptions() = default;
|
||||
|
||||
RegexOptions(T flags)
|
||||
: m_flags(flags)
|
||||
{
|
||||
}
|
||||
|
||||
template<class U>
|
||||
RegexOptions(RegexOptions<U> other)
|
||||
: m_flags((T) static_cast<FlagsUnderlyingType>(other.value()))
|
||||
{
|
||||
}
|
||||
|
||||
operator bool() const { return !!*this; }
|
||||
bool operator!() const { return (FlagsUnderlyingType)m_flags == 0; }
|
||||
|
||||
RegexOptions<T> operator|(T flag) const { return RegexOptions<T> { (T)((FlagsUnderlyingType)m_flags | (FlagsUnderlyingType)flag) }; }
|
||||
RegexOptions<T> operator&(T flag) const { return RegexOptions<T> { (T)((FlagsUnderlyingType)m_flags & (FlagsUnderlyingType)flag) }; }
|
||||
|
||||
RegexOptions<T>& operator|=(T flag)
|
||||
{
|
||||
m_flags = (T)((FlagsUnderlyingType)m_flags | (FlagsUnderlyingType)flag);
|
||||
return *this;
|
||||
}
|
||||
|
||||
RegexOptions<T>& operator&=(T flag)
|
||||
{
|
||||
m_flags = (T)((FlagsUnderlyingType)m_flags & (FlagsUnderlyingType)flag);
|
||||
return *this;
|
||||
}
|
||||
|
||||
void reset_flags() { m_flags = (T)0; }
|
||||
void reset_flag(T flag) { m_flags = (T)((FlagsUnderlyingType)m_flags & ~(FlagsUnderlyingType)flag); }
|
||||
void set_flag(T flag) { *this |= flag; }
|
||||
bool has_flag_set(T flag) const { return (FlagsUnderlyingType)flag == ((FlagsUnderlyingType)m_flags & (FlagsUnderlyingType)flag); }
|
||||
T value() const { return m_flags; }
|
||||
|
||||
private:
|
||||
T m_flags { 0 };
|
||||
};
|
||||
|
||||
template<class T>
|
||||
inline RegexOptions<T> operator|(T lhs, T rhs)
|
||||
{
|
||||
return RegexOptions<T> { lhs } |= rhs;
|
||||
}
|
||||
|
||||
template<class T>
|
||||
inline RegexOptions<T> operator&(T lhs, T rhs)
|
||||
{
|
||||
return RegexOptions<T> { lhs } &= rhs;
|
||||
}
|
||||
|
||||
template<class T>
|
||||
inline T operator~(T flag)
|
||||
{
|
||||
return (T) ~((FlagsUnderlyingType)flag);
|
||||
}
|
||||
|
||||
// Option-set aliases, one per flag enum declared in this header.
using AllOptions = RegexOptions<AllFlags>;
|
||||
using ECMAScriptOptions = RegexOptions<ECMAScriptFlags>;
|
||||
using PosixOptions = RegexOptions<PosixFlags>;
|
||||
|
||||
}
|
||||
|
||||
using regex::ECMAScriptFlags;
|
||||
using regex::ECMAScriptOptions;
|
||||
using regex::PosixFlags;
|
||||
using regex::PosixOptions;
|
1493
Userland/Libraries/LibRegex/RegexParser.cpp
Normal file
1493
Userland/Libraries/LibRegex/RegexParser.cpp
Normal file
File diff suppressed because it is too large
Load diff
208
Userland/Libraries/LibRegex/RegexParser.h
Normal file
208
Userland/Libraries/LibRegex/RegexParser.h
Normal file
|
@ -0,0 +1,208 @@
|
|||
/*
|
||||
* Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "RegexByteCode.h"
|
||||
#include "RegexError.h"
|
||||
#include "RegexLexer.h"
|
||||
#include "RegexOptions.h"
|
||||
|
||||
#include <AK/Forward.h>
|
||||
#include <AK/StringBuilder.h>
|
||||
#include <AK/Types.h>
|
||||
#include <AK/Vector.h>
|
||||
|
||||
namespace regex {
|
||||
|
||||
// Forward declarations so the ParserTraits specialisations can name the concrete
// parsers before the classes themselves are defined further down in this header.
class PosixExtendedParser;
|
||||
class ECMA262Parser;
|
||||
|
||||
// Traits base that simply publishes its template argument as OptionsType.
template<class T>
struct GenericParserTraits {
    typedef T OptionsType;
};
|
||||
|
||||
template<typename T>
|
||||
struct ParserTraits : public GenericParserTraits<T> {
|
||||
};
|
||||
|
||||
template<>
|
||||
struct ParserTraits<PosixExtendedParser> : public GenericParserTraits<PosixOptions> {
|
||||
};
|
||||
|
||||
template<>
|
||||
struct ParserTraits<ECMA262Parser> : public GenericParserTraits<ECMAScriptOptions> {
|
||||
};
|
||||
|
||||
class Parser {
|
||||
public:
|
||||
struct Result {
|
||||
ByteCode bytecode;
|
||||
size_t capture_groups_count;
|
||||
size_t named_capture_groups_count;
|
||||
size_t match_length_minimum;
|
||||
Error error;
|
||||
Token error_token;
|
||||
};
|
||||
|
||||
explicit Parser(Lexer& lexer)
|
||||
: m_parser_state(lexer)
|
||||
{
|
||||
}
|
||||
|
||||
Parser(Lexer& lexer, AllOptions regex_options)
|
||||
: m_parser_state(lexer, regex_options)
|
||||
{
|
||||
}
|
||||
|
||||
virtual ~Parser() = default;
|
||||
|
||||
Result parse(Optional<AllOptions> regex_options = {});
|
||||
bool has_error() const { return m_parser_state.error != Error::NoError; }
|
||||
Error error() const { return m_parser_state.error; }
|
||||
|
||||
protected:
|
||||
virtual bool parse_internal(ByteCode&, size_t& match_length_minimum) = 0;
|
||||
|
||||
ALWAYS_INLINE bool match(TokenType type) const;
|
||||
ALWAYS_INLINE bool match(char ch) const;
|
||||
ALWAYS_INLINE bool match_ordinary_characters();
|
||||
ALWAYS_INLINE Token consume();
|
||||
ALWAYS_INLINE Token consume(TokenType type, Error error);
|
||||
ALWAYS_INLINE bool consume(const String&);
|
||||
ALWAYS_INLINE bool try_skip(StringView);
|
||||
ALWAYS_INLINE char skip();
|
||||
ALWAYS_INLINE void reset();
|
||||
ALWAYS_INLINE bool done() const;
|
||||
ALWAYS_INLINE bool set_error(Error error);
|
||||
|
||||
struct ParserState {
|
||||
Lexer& lexer;
|
||||
Token current_token;
|
||||
Error error = Error::NoError;
|
||||
Token error_token { TokenType::Eof, 0, StringView(nullptr) };
|
||||
ByteCode bytecode;
|
||||
size_t capture_groups_count { 0 };
|
||||
size_t named_capture_groups_count { 0 };
|
||||
size_t match_length_minimum { 0 };
|
||||
AllOptions regex_options;
|
||||
HashMap<int, size_t> capture_group_minimum_lengths;
|
||||
HashMap<FlyString, size_t> named_capture_group_minimum_lengths;
|
||||
HashMap<size_t, FlyString> named_capture_groups;
|
||||
|
||||
explicit ParserState(Lexer& lexer)
|
||||
: lexer(lexer)
|
||||
, current_token(lexer.next())
|
||||
{
|
||||
}
|
||||
explicit ParserState(Lexer& lexer, AllOptions regex_options)
|
||||
: lexer(lexer)
|
||||
, current_token(lexer.next())
|
||||
, regex_options(regex_options)
|
||||
{
|
||||
}
|
||||
};
|
||||
|
||||
ParserState m_parser_state;
|
||||
};
|
||||
|
||||
class PosixExtendedParser final : public Parser {
|
||||
public:
|
||||
explicit PosixExtendedParser(Lexer& lexer)
|
||||
: Parser(lexer)
|
||||
{
|
||||
}
|
||||
|
||||
PosixExtendedParser(Lexer& lexer, Optional<typename ParserTraits<PosixExtendedParser>::OptionsType> regex_options)
|
||||
: Parser(lexer, regex_options.value_or({}))
|
||||
{
|
||||
}
|
||||
|
||||
~PosixExtendedParser() = default;
|
||||
|
||||
private:
|
||||
ALWAYS_INLINE bool match_repetition_symbol();
|
||||
|
||||
bool parse_internal(ByteCode&, size_t&) override;
|
||||
|
||||
bool parse_root(ByteCode&, size_t&);
|
||||
ALWAYS_INLINE bool parse_sub_expression(ByteCode&, size_t&);
|
||||
ALWAYS_INLINE bool parse_bracket_expression(ByteCode&, size_t&);
|
||||
ALWAYS_INLINE bool parse_repetition_symbol(ByteCode&, size_t&);
|
||||
};
|
||||
|
||||
class ECMA262Parser final : public Parser {
|
||||
public:
|
||||
explicit ECMA262Parser(Lexer& lexer)
|
||||
: Parser(lexer)
|
||||
{
|
||||
}
|
||||
|
||||
ECMA262Parser(Lexer& lexer, Optional<typename ParserTraits<ECMA262Parser>::OptionsType> regex_options)
|
||||
: Parser(lexer, regex_options.value_or({}))
|
||||
{
|
||||
}
|
||||
|
||||
~ECMA262Parser() = default;
|
||||
|
||||
private:
|
||||
bool parse_internal(ByteCode&, size_t&) override;
|
||||
|
||||
enum class ReadDigitsInitialZeroState {
|
||||
Allow,
|
||||
Disallow,
|
||||
Require,
|
||||
};
|
||||
enum class ReadDigitFollowPolicy {
|
||||
Any,
|
||||
DisallowDigit,
|
||||
DisallowNonDigit,
|
||||
};
|
||||
Optional<unsigned> read_digits(ReadDigitsInitialZeroState initial_zero = ReadDigitsInitialZeroState::Allow, ReadDigitFollowPolicy follow_policy = ReadDigitFollowPolicy::Any, bool hex = false, int max_count = -1);
|
||||
StringView read_capture_group_specifier(bool take_starting_angle_bracket = false);
|
||||
|
||||
bool parse_pattern(ByteCode&, size_t&, bool unicode, bool named);
|
||||
bool parse_disjunction(ByteCode&, size_t&, bool unicode, bool named);
|
||||
bool parse_alternative(ByteCode&, size_t&, bool unicode, bool named);
|
||||
bool parse_term(ByteCode&, size_t&, bool unicode, bool named);
|
||||
bool parse_assertion(ByteCode&, size_t&, bool unicode, bool named);
|
||||
bool parse_atom(ByteCode&, size_t&, bool unicode, bool named);
|
||||
bool parse_quantifier(ByteCode&, size_t&, bool unicode, bool named);
|
||||
bool parse_atom_escape(ByteCode&, size_t&, bool unicode, bool named);
|
||||
bool parse_character_class(ByteCode&, size_t&, bool unicode, bool named);
|
||||
bool parse_capture_group(ByteCode&, size_t&, bool unicode, bool named);
|
||||
Optional<CharClass> parse_character_class_escape(bool& out_inverse, bool expect_backslash = false);
|
||||
bool parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>&, bool unicode);
|
||||
};
|
||||
|
||||
// Short names for the concrete parsers, e.g. for use as Regex<PosixExtended>.
using PosixExtended = PosixExtendedParser;
|
||||
using ECMA262 = ECMA262Parser;
|
||||
|
||||
}
|
||||
|
||||
using regex::ECMA262;
|
||||
using regex::PosixExtended;
|
991
Userland/Libraries/LibRegex/Tests/Benchmark.cpp
Normal file
991
Userland/Libraries/LibRegex/Tests/Benchmark.cpp
Normal file
|
@ -0,0 +1,991 @@
|
|||
/*
|
||||
* Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include <AK/TestSuite.h> // import first, to prevent warning of ASSERT* redefinition
|
||||
|
||||
#include <LibRegex/Regex.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#ifndef REGEX_DEBUG
|
||||
|
||||
# define BENCHMARK_LOOP_ITERATIONS 100000
|
||||
|
||||
//# define REGEX_BENCHMARK_OUR
|
||||
# ifndef __serenity__
|
||||
//# define REGEX_BENCHMARK_OTHER
|
||||
# endif
|
||||
|
||||
# if defined(REGEX_BENCHMARK_OTHER)
|
||||
# include <regex>
|
||||
# endif
|
||||
|
||||
# if not(defined(REGEX_BENCHMARK_OUR) && defined(REGEX_BENCHMARK_OUR))
|
||||
BENCHMARK_CASE(dummy_benchmark)
|
||||
{
|
||||
}
|
||||
# endif
|
||||
|
||||
# if defined(REGEX_BENCHMARK_OUR)
// LibRegex: repeatedly match "^.*$" against "Hello World".
BENCHMARK_CASE(catch_all_benchmark)
{
    Regex<PosixExtended> re("^.*$");
    RegexResult m;
    for (size_t iteration = 0; iteration < BENCHMARK_LOOP_ITERATIONS; ++iteration) {
        EXPECT(re.match("Hello World", m));
    }
}
# endif
|
||||
|
||||
# if defined(REGEX_BENCHMARK_OTHER)
// std::regex reference: repeatedly regex_match "^.*$" against "Hello World".
BENCHMARK_CASE(catch_all_benchmark_reference_stdcpp)
{
    std::regex re("^.*$");
    std::cmatch m;
    for (size_t iteration = 0; iteration < BENCHMARK_LOOP_ITERATIONS; ++iteration) {
        EXPECT_EQ(std::regex_match("Hello World", m, re), true);
    }
}
# endif
|
||||
|
||||
# if defined(REGEX_BENCHMARK_OUR)
// LibRegex: pattern anchored at the start ("^hello friends"), matched against three inputs.
BENCHMARK_CASE(simple_start_benchmark)
{
    Regex<PosixExtended> re("^hello friends");
    RegexResult m;
    for (size_t iteration = 0; iteration < BENCHMARK_LOOP_ITERATIONS; ++iteration) {
        EXPECT_EQ(re.match("Hello!", m), false);
        EXPECT_EQ(re.match("hello friends", m), true);
        EXPECT_EQ(re.match("Well, hello friends", m), false);
    }
}
# endif
|
||||
|
||||
# if defined(REGEX_BENCHMARK_OTHER)
// std::regex reference: pattern anchored at the start ("^hello friends"), three inputs.
BENCHMARK_CASE(simple_start_benchmark_reference_stdcpp)
{
    std::regex re("^hello friends");
    std::cmatch m;
    for (size_t iteration = 0; iteration < BENCHMARK_LOOP_ITERATIONS; ++iteration) {
        EXPECT_EQ(std::regex_match("Hello", m, re), false);
        EXPECT_EQ(std::regex_match("hello friends", m, re), true);
        EXPECT_EQ(std::regex_match("Well, hello friends", m, re), false);
    }
}
# endif
|
||||
|
||||
# if defined(REGEX_BENCHMARK_OUR)
// LibRegex: pattern anchored at the end with escaped dots, matched against four inputs.
BENCHMARK_CASE(simple_end_benchmark)
{
    Regex<PosixExtended> re(".*hello\\.\\.\\. there$");
    RegexResult m;
    for (size_t iteration = 0; iteration < BENCHMARK_LOOP_ITERATIONS; ++iteration) {
        EXPECT_EQ(re.match("Hallo", m), false);
        EXPECT_EQ(re.match("I said fyhello... there", m), true);
        EXPECT_EQ(re.match("ahello... therea", m), false);
        EXPECT_EQ(re.match("hello.. there", m), false);
    }
}
# endif
|
||||
|
||||
# if defined(REGEX_BENCHMARK_OTHER)
// std::regex reference: pattern anchored at the end with escaped dots, searched in four inputs.
BENCHMARK_CASE(simple_end_benchmark_reference_stdcpp)
{
    std::regex re(".*hello\\.\\.\\. there$");
    std::cmatch m;
    for (size_t iteration = 0; iteration < BENCHMARK_LOOP_ITERATIONS; ++iteration) {
        EXPECT_EQ(std::regex_search("Hallo", m, re), false);
        EXPECT_EQ(std::regex_search("I said fyhello... there", m, re), true);
        EXPECT_EQ(std::regex_search("ahello... therea", m, re), false);
        EXPECT_EQ(std::regex_search("hello.. there", m, re), false);
    }
}
# endif
|
||||
|
||||
# if defined(REGEX_BENCHMARK_OUR)
// LibRegex: literal followed by a single wildcard ("hello."), matched against four inputs.
BENCHMARK_CASE(simple_period_benchmark)
{
    Regex<PosixExtended> re("hello.");
    RegexResult m;
    for (size_t iteration = 0; iteration < BENCHMARK_LOOP_ITERATIONS; ++iteration) {
        EXPECT_EQ(re.match("Hello1", m), false);
        EXPECT_EQ(re.match("hello1", m), true);
        EXPECT_EQ(re.match("hello2", m), true);
        EXPECT_EQ(re.match("hello?", m), true);
    }
}
# endif
|
||||
|
||||
# if defined(REGEX_BENCHMARK_OTHER)
// std::regex reference: literal followed by a single wildcard ("hello."), four inputs.
BENCHMARK_CASE(simple_period_benchmark_reference_stdcpp)
{
    std::regex re("hello.");
    std::cmatch m;
    for (size_t iteration = 0; iteration < BENCHMARK_LOOP_ITERATIONS; ++iteration) {
        EXPECT_EQ(std::regex_match("Hello1", m, re), false);
        EXPECT_EQ(std::regex_match("hello1", m, re), true);
        EXPECT_EQ(std::regex_match("hello2", m, re), true);
        EXPECT_EQ(std::regex_match("hello?", m, re), true);
    }
}
# endif
|
||||
|
||||
# if defined(REGEX_BENCHMARK_OUR)
// LibRegex: wildcard followed by an end anchor ("hello.$"), searched in four inputs.
BENCHMARK_CASE(simple_period_end_benchmark)
{
    Regex<PosixExtended> re("hello.$");
    RegexResult m;
    for (size_t iteration = 0; iteration < BENCHMARK_LOOP_ITERATIONS; ++iteration) {
        EXPECT_EQ(re.search("Hello1", m), false);
        EXPECT_EQ(re.search("hello1hello1", m), true);
        EXPECT_EQ(re.search("hello2hell", m), false);
        EXPECT_EQ(re.search("hello?", m), true);
    }
}
# endif
|
||||
|
||||
# if defined(REGEX_BENCHMARK_OTHER)
// std::regex reference: wildcard followed by an end anchor ("hello.$"), four inputs.
BENCHMARK_CASE(simple_period_end_benchmark_reference_stdcpp)
{
    std::regex re("hello.$");
    std::cmatch m;
    for (size_t iteration = 0; iteration < BENCHMARK_LOOP_ITERATIONS; ++iteration) {
        EXPECT_EQ(std::regex_search("Hello1", m, re), false);
        EXPECT_EQ(std::regex_search("hello1hello1", m, re), true);
        EXPECT_EQ(std::regex_search("hello2hell", m, re), false);
        EXPECT_EQ(std::regex_search("hello?", m, re), true);
    }
}
# endif
|
||||
|
||||
# if defined(REGEX_BENCHMARK_OUR)
// LibRegex: escaped dot ("hello\\."), matched against two inputs.
BENCHMARK_CASE(simple_escaped_benchmark)
{
    Regex<PosixExtended> re("hello\\.");
    RegexResult m;
    for (size_t iteration = 0; iteration < BENCHMARK_LOOP_ITERATIONS; ++iteration) {
        EXPECT_EQ(re.match("hello", m), false);
        EXPECT_EQ(re.match("hello.", m), true);
    }
}
# endif
|
||||
|
||||
# if defined(REGEX_BENCHMARK_OTHER)
// std::regex reference: escaped dot ("hello\\."), two inputs.
BENCHMARK_CASE(simple_escaped_benchmark_reference_stdcpp)
{
    std::regex re("hello\\.");
    std::cmatch m;
    for (size_t iteration = 0; iteration < BENCHMARK_LOOP_ITERATIONS; ++iteration) {
        EXPECT_EQ(std::regex_match("hello", m, re), false);
        EXPECT_EQ(std::regex_match("hello.", m, re), true);
    }
}
# endif
|
||||
|
||||
# if defined(REGEX_BENCHMARK_OUR)
// LibRegex: several wildcards plus an end anchor (".*hi... there$"), searched in five inputs.
BENCHMARK_CASE(simple_period2_end_benchmark)
{
    Regex<PosixExtended> re(".*hi... there$");
    RegexResult m;
    for (size_t iteration = 0; iteration < BENCHMARK_LOOP_ITERATIONS; ++iteration) {
        EXPECT_EQ(re.search("Hello there", m), false);
        EXPECT_EQ(re.search("I said fyhi... there", m), true);
        EXPECT_EQ(re.search("....hi... ", m), false);
        EXPECT_EQ(re.search("I said fyhihii there", m), true);
        EXPECT_EQ(re.search("I said fyhihi there", m), false);
    }
}
# endif
|
||||
|
||||
# if defined(REGEX_BENCHMARK_OTHER)
// std::regex reference: several wildcards plus an end anchor (".*hi... there$"), five inputs.
BENCHMARK_CASE(simple_period2_end_benchmark_reference_stdcpp)
{
    std::regex re(".*hi... there$");
    std::cmatch m;
    for (size_t iteration = 0; iteration < BENCHMARK_LOOP_ITERATIONS; ++iteration) {
        EXPECT_EQ(std::regex_search("Hello there", m, re), false);
        EXPECT_EQ(std::regex_search("I said fyhi... there", m, re), true);
        EXPECT_EQ(std::regex_search("....hi... ", m, re), false);
        EXPECT_EQ(std::regex_search("I said fyhihii there", m, re), true);
        EXPECT_EQ(std::regex_search("I said fyhihi there", m, re), false);
    }
}
# endif
|
||||
|
||||
# if defined(REGEX_BENCHMARK_OUR)
// LibRegex: one-or-more quantifier ("a+"), searched in four inputs.
BENCHMARK_CASE(simple_plus_benchmark)
{
    Regex<PosixExtended> re("a+");
    RegexResult m;
    for (size_t iteration = 0; iteration < BENCHMARK_LOOP_ITERATIONS; ++iteration) {
        EXPECT_EQ(re.search("b", m), false);
        EXPECT_EQ(re.search("a", m), true);
        EXPECT_EQ(re.search("aaaaaabbbbb", m), true);
        EXPECT_EQ(re.search("aaaaaaaaaaa", m), true);
    }
}
# endif
|
||||
|
||||
# if defined(REGEX_BENCHMARK_OTHER)
// std::regex reference: one-or-more quantifier ("a+"), four inputs.
BENCHMARK_CASE(simple_plus_benchmark_reference_stdcpp)
{
    std::regex re("a+");
    std::cmatch m;
    for (size_t iteration = 0; iteration < BENCHMARK_LOOP_ITERATIONS; ++iteration) {
        EXPECT_EQ(std::regex_search("b", m, re), false);
        EXPECT_EQ(std::regex_search("a", m, re), true);
        EXPECT_EQ(std::regex_search("aaaaaabbbbb", m, re), true);
        EXPECT_EQ(std::regex_search("aaaaaaaaaaa", m, re), true);
    }
}
# endif
|
||||
|
||||
# if defined(REGEX_BENCHMARK_OUR)
|
||||
// Benchmark: searches with the optional-character pattern "da?d".
BENCHMARK_CASE(simple_questionmark_benchmark)
{
    Regex<PosixExtended> pattern("da?d");
    RegexResult result;
    for (size_t round = 0; round < BENCHMARK_LOOP_ITERATIONS; ++round) {
        EXPECT_EQ(pattern.search("a", result), false);
        EXPECT_EQ(pattern.search("daa", result), false);
        EXPECT_EQ(pattern.search("ddddd", result), true);
        EXPECT_EQ(pattern.search("dd", result), true);
        EXPECT_EQ(pattern.search("dad", result), true);
        EXPECT_EQ(pattern.search("dada", result), true);
        EXPECT_EQ(pattern.search("adadaa", result), true);
    }
}
|
||||
# endif
|
||||
|
||||
# if defined(REGEX_BENCHMARK_OTHER)
|
||||
// Reference benchmark: runs the optional-character pattern "da?d" through std::regex_search.
BENCHMARK_CASE(simple_questionmark_benchmark_reference_stdcpp)
{
    std::regex pattern("da?d");
    std::cmatch captures;
    for (size_t round = 0; round < BENCHMARK_LOOP_ITERATIONS; ++round) {
        EXPECT_EQ(std::regex_search("a", captures, pattern), false);
        EXPECT_EQ(std::regex_search("daa", captures, pattern), false);
        EXPECT_EQ(std::regex_search("ddddd", captures, pattern), true);
        EXPECT_EQ(std::regex_search("dd", captures, pattern), true);
        EXPECT_EQ(std::regex_search("dad", captures, pattern), true);
        EXPECT_EQ(std::regex_search("dada", captures, pattern), true);
        EXPECT_EQ(std::regex_search("adadaa", captures, pattern), true);
    }
}
|
||||
# endif
|
||||
|
||||
# if defined(REGEX_BENCHMARK_OUR)
|
||||
// Benchmark: applies the character-class pattern "[[:alpha:]]" to a multi-line,
// INI-style input, once with match() and once with search().
BENCHMARK_CASE(character_class_benchmark)
{
    Regex<PosixExtended> pattern("[[:alpha:]]");
    RegexResult result;
    String input = "[Window]\nOpacity=255\nAudibleBeep=0\n";

    for (size_t round = 0; round < BENCHMARK_LOOP_ITERATIONS; ++round) {
        EXPECT_EQ(pattern.match(input.characters(), result), false);
        EXPECT_EQ(pattern.search(input.characters(), result), true);
    }
}
|
||||
# endif
|
||||
|
||||
# if defined(REGEX_BENCHMARK_OTHER)
|
||||
// Reference benchmark: applies the character-class pattern "[[:alpha:]]" to a
// multi-line, INI-style input via std::regex_match and std::regex_search.
BENCHMARK_CASE(character_class_benchmark_reference_stdcpp)
{
    std::regex pattern("[[:alpha:]]");
    std::cmatch captures;
    String input = "[Window]\nOpacity=255\nAudibleBeep=0\n";

    for (size_t round = 0; round < BENCHMARK_LOOP_ITERATIONS; ++round) {
        EXPECT_EQ(std::regex_match(input.characters(), captures, pattern), false);
        EXPECT_EQ(std::regex_search(input.characters(), captures, pattern), true);
    }
}
|
||||
# endif
|
||||
|
||||
# if defined(REGEX_BENCHMARK_OUR)
|
||||
// Benchmark: full matches against a pattern whose escaped '.' separators are optional.
BENCHMARK_CASE(escaped_char_questionmark_benchmark)
{
    Regex<PosixExtended> pattern("This\\.?And\\.?That");
    RegexResult result;
    for (size_t round = 0; round < BENCHMARK_LOOP_ITERATIONS; ++round) {
        EXPECT_EQ(pattern.match("ThisAndThat", result), true);
        EXPECT_EQ(pattern.match("This.And.That", result), true);
        EXPECT_EQ(pattern.match("This And That", result), false);
        EXPECT_EQ(pattern.match("This..And..That", result), false);
    }
}
|
||||
# endif
|
||||
|
||||
# if defined(REGEX_BENCHMARK_OTHER)
|
||||
// Reference benchmark: std::regex_match against a pattern whose escaped '.' separators are optional.
BENCHMARK_CASE(escaped_char_questionmark_benchmark_reference_stdcpp)
{
    std::regex pattern("This\\.?And\\.?That");
    std::cmatch captures;
    for (size_t round = 0; round < BENCHMARK_LOOP_ITERATIONS; ++round) {
        EXPECT_EQ(std::regex_match("ThisAndThat", captures, pattern), true);
        EXPECT_EQ(std::regex_match("This.And.That", captures, pattern), true);
        EXPECT_EQ(std::regex_match("This And That", captures, pattern), false);
        EXPECT_EQ(std::regex_match("This..And..That", captures, pattern), false);
    }
}
|
||||
# endif
|
||||
|
||||
# if defined(REGEX_BENCHMARK_OUR)
|
||||
// Benchmark: searches include-directive lines with the pattern "regex*".
BENCHMARK_CASE(char_qualifier_asterisk_benchmark)
{
    Regex<PosixExtended> pattern("regex*");
    RegexResult result;
    for (size_t round = 0; round < BENCHMARK_LOOP_ITERATIONS; ++round) {
        EXPECT_EQ(pattern.search("#include <regex.h>", result), true);
        EXPECT_EQ(pattern.search("#include <stdio.h>", result), false);
    }
}
|
||||
# endif
|
||||
|
||||
# if defined(REGEX_BENCHMARK_OTHER)
|
||||
// Reference benchmark: searches include-directive lines with "regex*" via std::regex_search.
BENCHMARK_CASE(char_qualifier_asterisk_benchmark_reference_stdcpp)
{
    std::regex pattern("regex*");
    std::cmatch captures;
    for (size_t round = 0; round < BENCHMARK_LOOP_ITERATIONS; ++round) {
        EXPECT_EQ(std::regex_search("#include <regex.h>", captures, pattern), true);
        EXPECT_EQ(std::regex_search("#include <stdio.h>", captures, pattern), false);
    }
}
|
||||
# endif
|
||||
|
||||
# if defined(REGEX_BENCHMARK_OUR)
|
||||
// Benchmark: full matches with an optional group, "test(hello)?test".
BENCHMARK_CASE(parens_qualifier_questionmark_benchmark)
{
    Regex<PosixExtended> pattern("test(hello)?test");
    RegexResult result;
    for (size_t round = 0; round < BENCHMARK_LOOP_ITERATIONS; ++round) {
        EXPECT_EQ(pattern.match("testtest", result), true);
        EXPECT_EQ(pattern.match("testhellotest", result), true);
        EXPECT_EQ(pattern.match("testasfdtest", result), false);
    }
}
|
||||
# endif
|
||||
|
||||
# if defined(REGEX_BENCHMARK_OTHER)
|
||||
// Reference benchmark: std::regex_match with an optional group, "test(hello)?test".
BENCHMARK_CASE(parens_qualifier_questionmark_benchmark_reference_stdcpp)
{
    std::regex pattern("test(hello)?test");
    std::cmatch captures;
    for (size_t round = 0; round < BENCHMARK_LOOP_ITERATIONS; ++round) {
        EXPECT_EQ(std::regex_match("testtest", captures, pattern), true);
        EXPECT_EQ(std::regex_match("testhellotest", captures, pattern), true);
        EXPECT_EQ(std::regex_match("testasfdtest", captures, pattern), false);
    }
}
|
||||
# endif
|
||||
|
||||
# if defined(REGEX_BENCHMARK_OUR)
|
||||
// Benchmark: match() and search() with a repeated group, "test(hello)*test".
BENCHMARK_CASE(parens_qualifier_asterisk_benchmark)
{
    Regex<PosixExtended> pattern("test(hello)*test");
    RegexResult result;
    for (size_t round = 0; round < BENCHMARK_LOOP_ITERATIONS; ++round) {
        EXPECT_EQ(pattern.match("testtest", result), true);
        EXPECT_EQ(pattern.match("testhellohellotest", result), true);
        EXPECT_EQ(pattern.search("testhellohellotest, testhellotest", result), true);
        EXPECT_EQ(pattern.match("aaaaaaaaaaaaaaaaaaaaaaaaaaabbbbbbbbbbbbbbbbbbbbbbbbbb", result), false);
    }
}
|
||||
# endif
|
||||
|
||||
# if defined(REGEX_BENCHMARK_OTHER)
|
||||
// Reference benchmark: std::regex_match / std::regex_search with a repeated group,
// "test(hello)*test".
BENCHMARK_CASE(parens_qualifier_asterisk_benchmark_reference_stdcpp)
{
    std::regex pattern("test(hello)*test");
    std::cmatch captures;
    for (size_t round = 0; round < BENCHMARK_LOOP_ITERATIONS; ++round) {
        EXPECT_EQ(std::regex_match("testtest", captures, pattern), true);
        EXPECT_EQ(std::regex_match("testhellohellotest", captures, pattern), true);
        EXPECT_EQ(std::regex_search("testhellohellotest, testhellotest", captures, pattern), true);
        EXPECT_EQ(std::regex_match("aaaaaaaaaaaaaaaaaaaaaaaaaaabbbbbbbbbbbbbbbbbbbbbbbbbb", captures, pattern), false);
    }
}
|
||||
# endif
|
||||
|
||||
# if defined(REGEX_BENCHMARK_OUR)
|
||||
// Benchmark: match() and search() with a wildcard group, "test(.*)test".
BENCHMARK_CASE(parens_qualifier_asterisk_2_benchmark)
{
    Regex<PosixExtended> pattern("test(.*)test");
    RegexResult result;
    for (size_t round = 0; round < BENCHMARK_LOOP_ITERATIONS; ++round) {
        EXPECT_EQ(pattern.match("testasdftest", result), true);
        EXPECT_EQ(pattern.match("testasdfasdftest", result), true);
        EXPECT_EQ(pattern.search("testaaaatest, testbbbtest, testtest", result), true);
        EXPECT_EQ(pattern.match("aaaaaaaaaaaaaaaaaaaaaaaaaaabbbbbbbbbbbbbbbbbbbbbbbbbb", result), false);
    }
}
|
||||
# endif
|
||||
|
||||
# if defined(REGEX_BENCHMARK_OTHER)
|
||||
// Reference benchmark: std::regex_match / std::regex_search with a wildcard group,
// "test(.*)test".
BENCHMARK_CASE(parens_qualifier_asterisk_2_benchmark_reference_stdcpp)
{
    std::regex pattern("test(.*)test");
    std::cmatch captures;
    for (size_t round = 0; round < BENCHMARK_LOOP_ITERATIONS; ++round) {
        EXPECT_EQ(std::regex_match("testasdftest", captures, pattern), true);
        EXPECT_EQ(std::regex_match("testasdfasdftest", captures, pattern), true);
        EXPECT_EQ(std::regex_search("testaaaatest, testbbbtest, testtest", captures, pattern), true);
        EXPECT_EQ(std::regex_match("aaaaaaaaaaaaaaaaaaaaaaaaaaabbbbbbbbbbbbbbbbbbbbbbbbbb", captures, pattern), false);
    }
}
|
||||
# endif
|
||||
|
||||
# if defined(REGEX_BENCHMARK_OUR)
|
||||
// Benchmark: match() and search() with three consecutive optional groups,
// "test(a)?(b)?(c)?test".
BENCHMARK_CASE(multi_parens_qualifier_questionmark_benchmark)
{
    Regex<PosixExtended> pattern("test(a)?(b)?(c)?test");
    RegexResult result;
    for (size_t round = 0; round < BENCHMARK_LOOP_ITERATIONS; ++round) {
        EXPECT_EQ(pattern.match("testtest", result), true);
        EXPECT_EQ(pattern.match("testabctest", result), true);
        EXPECT_EQ(pattern.search("testabctest, testactest", result), true);
        EXPECT_EQ(pattern.match("aaaaaaaaaaaaaaaaaaaaaaaaaaabbbbbbbbbbbbbbbbbbbbbbbbbb", result), false);
        EXPECT_EQ(pattern.match("test", result), false);
        EXPECT_EQ(pattern.match("whaaaaat", result), false);
    }
}
|
||||
# endif
|
||||
|
||||
# if defined(REGEX_BENCHMARK_OTHER)
|
||||
// Reference benchmark: std::regex_match / std::regex_search with three consecutive
// optional groups, "test(a)?(b)?(c)?test".
BENCHMARK_CASE(multi_parens_qualifier_questionmark_benchmark_reference_stdcpp)
{
    std::regex pattern("test(a)?(b)?(c)?test");
    std::cmatch captures;
    for (size_t round = 0; round < BENCHMARK_LOOP_ITERATIONS; ++round) {
        EXPECT_EQ(std::regex_match("testtest", captures, pattern), true);
        EXPECT_EQ(std::regex_match("testabctest", captures, pattern), true);
        EXPECT_EQ(std::regex_search("testabctest, testactest", captures, pattern), true);
        EXPECT_EQ(std::regex_match("aaaaaaaaaaaaaaaaaaaaaaaaaaabbbbbbbbbbbbbbbbbbbbbbbbbb", captures, pattern), false);
        EXPECT_EQ(std::regex_match("test", captures, pattern), false);
        EXPECT_EQ(std::regex_match("whaaaaat", captures, pattern), false);
    }
}
|
||||
# endif
|
||||
|
||||
# if defined(REGEX_BENCHMARK_OUR)
|
||||
// Benchmark: full matches against a three-way alternation, "test|hello|friends".
BENCHMARK_CASE(simple_alternative_benchmark)
{
    Regex<PosixExtended> pattern("test|hello|friends");
    RegexResult result;
    for (size_t round = 0; round < BENCHMARK_LOOP_ITERATIONS; ++round) {
        EXPECT_EQ(pattern.match("test", result), true);
        EXPECT_EQ(pattern.match("hello", result), true);
        EXPECT_EQ(pattern.match("friends", result), true);
        EXPECT_EQ(pattern.match("whaaaaat", result), false);
    }
}
|
||||
# endif
|
||||
|
||||
# if defined(REGEX_BENCHMARK_OTHER)
|
||||
// Reference benchmark: std::regex_match against a three-way alternation, "test|hello|friends".
BENCHMARK_CASE(simple_alternative_benchmark_reference_stdcpp)
{
    std::regex pattern("test|hello|friends");
    std::cmatch captures;
    for (size_t round = 0; round < BENCHMARK_LOOP_ITERATIONS; ++round) {
        EXPECT_EQ(std::regex_match("test", captures, pattern), true);
        EXPECT_EQ(std::regex_match("hello", captures, pattern), true);
        EXPECT_EQ(std::regex_match("friends", captures, pattern), true);
        EXPECT_EQ(std::regex_match("whaaaaat", captures, pattern), false);
    }
}
|
||||
# endif
|
||||
|
||||
# if defined(REGEX_BENCHMARK_OUR)
|
||||
// Benchmark: full matches against an alternation whose branches contain optional groups.
BENCHMARK_CASE(alternative_match_groups_benchmark)
{
    Regex<PosixExtended> pattern("test(a)?(b)?|hello ?(dear|my)? friends");
    RegexResult result;
    for (size_t round = 0; round < BENCHMARK_LOOP_ITERATIONS; ++round) {
        EXPECT_EQ(pattern.match("test", result), true);
        EXPECT_EQ(pattern.match("testa", result), true);
        EXPECT_EQ(pattern.match("testb", result), true);
        EXPECT_EQ(pattern.match("hello friends", result), true);
        EXPECT_EQ(pattern.match("hello dear friends", result), true);
        EXPECT_EQ(pattern.match("hello my friends", result), true);
        EXPECT_EQ(pattern.match("testabc", result), false);
        EXPECT_EQ(pattern.match("hello test friends", result), false);
    }
}
|
||||
# endif
|
||||
|
||||
# if defined(REGEX_BENCHMARK_OTHER)
|
||||
// Reference benchmark: std::regex_match against an alternation whose branches contain
// optional groups.
BENCHMARK_CASE(alternative_match_groups_benchmark_reference_stdcpp)
{
    std::regex pattern("test(a)?(b)?|hello ?(dear|my)? friends");
    std::cmatch captures;
    for (size_t round = 0; round < BENCHMARK_LOOP_ITERATIONS; ++round) {
        EXPECT_EQ(std::regex_match("test", captures, pattern), true);
        EXPECT_EQ(std::regex_match("testa", captures, pattern), true);
        EXPECT_EQ(std::regex_match("testb", captures, pattern), true);
        EXPECT_EQ(std::regex_match("hello friends", captures, pattern), true);
        EXPECT_EQ(std::regex_match("hello dear friends", captures, pattern), true);
        EXPECT_EQ(std::regex_match("hello my friends", captures, pattern), true);
        EXPECT_EQ(std::regex_match("testabc", captures, pattern), false);
        EXPECT_EQ(std::regex_match("hello test friends", captures, pattern), false);
    }
}
|
||||
# endif
|
||||
|
||||
# if defined(REGEX_BENCHMARK_OUR)
|
||||
// Benchmark: match() and search() with an exact repetition count, "(hello){3}".
BENCHMARK_CASE(parens_qualifier_exact_benchmark)
{
    Regex<PosixExtended> pattern("(hello){3}");
    RegexResult result;
    for (size_t round = 0; round < BENCHMARK_LOOP_ITERATIONS; ++round) {
        EXPECT_EQ(pattern.match("hello", result), false);
        EXPECT_EQ(pattern.match("hellohellohello", result), true);
        EXPECT_EQ(pattern.search("hellohellohellohello", result), true);
        EXPECT_EQ(pattern.search("test hellohellohello", result), true);
    }
}
|
||||
# endif
|
||||
|
||||
# if defined(REGEX_BENCHMARK_OTHER)
|
||||
// Reference benchmark: std::regex_match / std::regex_search with an exact repetition
// count, "(hello){3}".
BENCHMARK_CASE(parens_qualifier_exact_benchmark_reference_stdcpp)
{
    std::regex pattern("(hello){3}");
    std::cmatch captures;
    for (size_t round = 0; round < BENCHMARK_LOOP_ITERATIONS; ++round) {
        EXPECT_EQ(std::regex_match("hello", captures, pattern), false);
        EXPECT_EQ(std::regex_match("hellohellohello", captures, pattern), true);
        EXPECT_EQ(std::regex_search("hellohellohellohello", captures, pattern), true);
        EXPECT_EQ(std::regex_search("test hellohellohello", captures, pattern), true);
    }
}
|
||||
# endif
|
||||
|
||||
# if defined(REGEX_BENCHMARK_OUR)
|
||||
// Benchmark: match() and search() with a minimum repetition count, "(hello){3,}".
BENCHMARK_CASE(parens_qualifier_minimum_benchmark)
{
    Regex<PosixExtended> pattern("(hello){3,}");
    RegexResult result;
    for (size_t round = 0; round < BENCHMARK_LOOP_ITERATIONS; ++round) {
        EXPECT_EQ(pattern.match("hello", result), false);
        EXPECT_EQ(pattern.match("hellohellohello", result), true);
        EXPECT_EQ(pattern.search("hellohellohellohello", result), true);
        EXPECT_EQ(pattern.search("test hellohellohello", result), true);
        EXPECT_EQ(pattern.search("test hellohellohellohello", result), true);
    }
}
|
||||
# endif
|
||||
|
||||
# if defined(REGEX_BENCHMARK_OTHER)
|
||||
// Reference benchmark: std::regex_match / std::regex_search with a minimum repetition
// count, "(hello){3,}".
BENCHMARK_CASE(parens_qualifier_minimum_benchmark_reference_stdcpp)
{
    std::regex pattern("(hello){3,}");
    std::cmatch captures;
    for (size_t round = 0; round < BENCHMARK_LOOP_ITERATIONS; ++round) {
        EXPECT_EQ(std::regex_match("hello", captures, pattern), false);
        EXPECT_EQ(std::regex_match("hellohellohello", captures, pattern), true);
        EXPECT_EQ(std::regex_search("hellohellohellohello", captures, pattern), true);
        EXPECT_EQ(std::regex_search("test hellohellohello", captures, pattern), true);
        EXPECT_EQ(std::regex_search("test hellohellohellohello", captures, pattern), true);
    }
}
|
||||
# endif
|
||||
|
||||
# if defined(REGEX_BENCHMARK_OUR)
|
||||
// Benchmark: match() and search() with a bounded repetition count, "(hello){2,3}".
// The repeated search on "test hellohellohellohello" is kept as in the original sequence.
BENCHMARK_CASE(parens_qualifier_maximum_benchmark)
{
    Regex<PosixExtended> pattern("(hello){2,3}");
    RegexResult result;
    for (size_t round = 0; round < BENCHMARK_LOOP_ITERATIONS; ++round) {
        EXPECT_EQ(pattern.match("hello", result), false);
        EXPECT_EQ(pattern.match("hellohellohello", result), true);
        EXPECT_EQ(pattern.search("hellohellohellohello", result), true);
        EXPECT_EQ(pattern.search("test hellohellohello", result), true);
        EXPECT_EQ(pattern.search("test hellohellohellohello", result), true);
        EXPECT_EQ(pattern.match("test hellohellohellohello", result), false);
        EXPECT_EQ(pattern.search("test hellohellohellohello", result), true);
    }
}
|
||||
# endif
|
||||
|
||||
# if defined(REGEX_BENCHMARK_OTHER)
|
||||
// Reference benchmark: std::regex_match / std::regex_search with a bounded repetition
// count, "(hello){2,3}". The repeated search on "test hellohellohellohello" is kept
// as in the original sequence.
BENCHMARK_CASE(parens_qualifier_maximum_benchmark_reference_stdcpp)
{
    std::regex pattern("(hello){2,3}");
    std::cmatch captures;
    for (size_t round = 0; round < BENCHMARK_LOOP_ITERATIONS; ++round) {
        EXPECT_EQ(std::regex_match("hello", captures, pattern), false);
        EXPECT_EQ(std::regex_match("hellohellohello", captures, pattern), true);
        EXPECT_EQ(std::regex_search("hellohellohellohello", captures, pattern), true);
        EXPECT_EQ(std::regex_search("test hellohellohello", captures, pattern), true);
        EXPECT_EQ(std::regex_search("test hellohellohellohello", captures, pattern), true);
        EXPECT_EQ(std::regex_match("test hellohellohellohello", captures, pattern), false);
        EXPECT_EQ(std::regex_search("test hellohellohellohello", captures, pattern), true);
    }
}
|
||||
# endif
|
||||
|
||||
# if defined(REGEX_BENCHMARK_OUR)
|
||||
// Benchmark: match() and search() with a bounded single-character repetition, "c{3,30}",
// against runs of 'c' of varying length.
BENCHMARK_CASE(char_qualifier_min_max_benchmark)
{
    Regex<PosixExtended> pattern("c{3,30}");
    RegexResult result;
    for (size_t round = 0; round < BENCHMARK_LOOP_ITERATIONS; ++round) {
        EXPECT_EQ(pattern.match("cc", result), false);
        EXPECT_EQ(pattern.match("ccc", result), true);
        EXPECT_EQ(pattern.match("cccccccccccccccccccccccccccccc", result), true);
        EXPECT_EQ(pattern.match("ccccccccccccccccccccccccccccccc", result), false);
        EXPECT_EQ(pattern.search("ccccccccccccccccccccccccccccccc", result), true);
        EXPECT_EQ(pattern.match("cccccccccccccccccccccccccccccccc", result), false);
    }
}
|
||||
# endif
|
||||
|
||||
# if defined(REGEX_BENCHMARK_OTHER)
|
||||
// Reference benchmark: std::regex_match / std::regex_search with a bounded
// single-character repetition, "c{3,30}", against runs of 'c' of varying length.
BENCHMARK_CASE(char_qualifier_min_max_benchmark_reference_stdcpp)
{
    std::regex pattern("c{3,30}");
    std::cmatch captures;
    for (size_t round = 0; round < BENCHMARK_LOOP_ITERATIONS; ++round) {
        EXPECT_EQ(std::regex_match("cc", captures, pattern), false);
        EXPECT_EQ(std::regex_match("ccc", captures, pattern), true);
        EXPECT_EQ(std::regex_match("cccccccccccccccccccccccccccccc", captures, pattern), true);
        EXPECT_EQ(std::regex_match("ccccccccccccccccccccccccccccccc", captures, pattern), false);
        EXPECT_EQ(std::regex_search("ccccccccccccccccccccccccccccccc", captures, pattern), true);
        EXPECT_EQ(std::regex_match("cccccccccccccccccccccccccccccccc", captures, pattern), false);
    }
}
|
||||
# endif
|
||||
|
||||
# if defined(REGEX_BENCHMARK_OUR)
|
||||
// Benchmark: single-character matches against the bracket expression "[abc]".
BENCHMARK_CASE(simple_bracket_chars_benchmark)
{
    Regex<PosixExtended> pattern("[abc]");
    RegexResult result;
    for (size_t round = 0; round < BENCHMARK_LOOP_ITERATIONS; ++round) {
        EXPECT_EQ(pattern.match("a", result), true);
        EXPECT_EQ(pattern.match("b", result), true);
        EXPECT_EQ(pattern.match("c", result), true);
        EXPECT_EQ(pattern.match("d", result), false);
        EXPECT_EQ(pattern.match("e", result), false);
    }
}
|
||||
# endif
|
||||
|
||||
# if defined(REGEX_BENCHMARK_OTHER)
|
||||
// Reference benchmark: std::regex_match of single characters against "[abc]".
BENCHMARK_CASE(simple_bracket_chars_benchmark_reference_stdcpp)
{
    std::regex pattern("[abc]");
    std::cmatch captures;
    for (size_t round = 0; round < BENCHMARK_LOOP_ITERATIONS; ++round) {
        EXPECT_EQ(std::regex_match("a", captures, pattern), true);
        EXPECT_EQ(std::regex_match("b", captures, pattern), true);
        EXPECT_EQ(std::regex_match("c", captures, pattern), true);
        EXPECT_EQ(std::regex_match("d", captures, pattern), false);
        EXPECT_EQ(std::regex_match("e", captures, pattern), false);
    }
}
|
||||
# endif
|
||||
|
||||
# if defined(REGEX_BENCHMARK_OUR)
|
||||
// Benchmark: single-character matches against the negated bracket expression "[^abc]".
BENCHMARK_CASE(simple_bracket_chars_inverse_benchmark)
{
    Regex<PosixExtended> pattern("[^abc]");
    RegexResult result;
    for (size_t round = 0; round < BENCHMARK_LOOP_ITERATIONS; ++round) {
        EXPECT_EQ(pattern.match("a", result), false);
        EXPECT_EQ(pattern.match("b", result), false);
        EXPECT_EQ(pattern.match("c", result), false);
        EXPECT_EQ(pattern.match("d", result), true);
        EXPECT_EQ(pattern.match("e", result), true);
    }
}
|
||||
# endif
|
||||
|
||||
# if defined(REGEX_BENCHMARK_OTHER)
|
||||
// Reference benchmark: std::regex_match of single characters against "[^abc]".
BENCHMARK_CASE(simple_bracket_chars_inverse_benchmark_reference_stdcpp)
{
    std::regex pattern("[^abc]");
    std::cmatch captures;
    for (size_t round = 0; round < BENCHMARK_LOOP_ITERATIONS; ++round) {
        EXPECT_EQ(std::regex_match("a", captures, pattern), false);
        EXPECT_EQ(std::regex_match("b", captures, pattern), false);
        EXPECT_EQ(std::regex_match("c", captures, pattern), false);
        EXPECT_EQ(std::regex_match("d", captures, pattern), true);
        EXPECT_EQ(std::regex_match("e", captures, pattern), true);
    }
}
|
||||
# endif
|
||||
|
||||
# if defined(REGEX_BENCHMARK_OUR)
|
||||
// Benchmark: single-character matches against the range expression "[a-d]".
BENCHMARK_CASE(simple_bracket_chars_range_benchmark)
{
    Regex<PosixExtended> pattern("[a-d]");
    RegexResult result;
    for (size_t round = 0; round < BENCHMARK_LOOP_ITERATIONS; ++round) {
        EXPECT_EQ(pattern.match("a", result), true);
        EXPECT_EQ(pattern.match("b", result), true);
        EXPECT_EQ(pattern.match("c", result), true);
        EXPECT_EQ(pattern.match("d", result), true);
        EXPECT_EQ(pattern.match("e", result), false);
    }
}
|
||||
# endif
|
||||
|
||||
# if defined(REGEX_BENCHMARK_OTHER)
|
||||
// Reference benchmark: std::regex_match of single characters against "[a-d]".
BENCHMARK_CASE(simple_bracket_chars_range_benchmark_reference_stdcpp)
{
    std::regex pattern("[a-d]");
    std::cmatch captures;
    for (size_t round = 0; round < BENCHMARK_LOOP_ITERATIONS; ++round) {
        EXPECT_EQ(std::regex_match("a", captures, pattern), true);
        EXPECT_EQ(std::regex_match("b", captures, pattern), true);
        EXPECT_EQ(std::regex_match("c", captures, pattern), true);
        EXPECT_EQ(std::regex_match("d", captures, pattern), true);
        EXPECT_EQ(std::regex_match("e", captures, pattern), false);
    }
}
|
||||
# endif
|
||||
|
||||
# if defined(REGEX_BENCHMARK_OUR)
|
||||
// Benchmark: single-character matches against the negated two-range expression
// "[^a-df-z]"; of the letters tried, only "e" is expected to match.
BENCHMARK_CASE(simple_bracket_chars_range_inverse_benchmark)
{
    Regex<PosixExtended> pattern("[^a-df-z]");
    RegexResult result;
    for (size_t round = 0; round < BENCHMARK_LOOP_ITERATIONS; ++round) {
        EXPECT_EQ(pattern.match("a", result), false);
        EXPECT_EQ(pattern.match("b", result), false);
        EXPECT_EQ(pattern.match("c", result), false);
        EXPECT_EQ(pattern.match("d", result), false);
        EXPECT_EQ(pattern.match("e", result), true);
        EXPECT_EQ(pattern.match("k", result), false);
        EXPECT_EQ(pattern.match("z", result), false);
    }
}
|
||||
# endif
|
||||
|
||||
# if defined(REGEX_BENCHMARK_OTHER)
|
||||
// Reference benchmark: std::regex_match of single characters against the negated
// two-range expression "[^a-df-z]"; of the letters tried, only "e" is expected to match.
BENCHMARK_CASE(simple_bracket_chars_range_inverse_benchmark_reference_stdcpp)
{
    std::regex pattern("[^a-df-z]");
    std::cmatch captures;
    for (size_t round = 0; round < BENCHMARK_LOOP_ITERATIONS; ++round) {
        EXPECT_EQ(std::regex_match("a", captures, pattern), false);
        EXPECT_EQ(std::regex_match("b", captures, pattern), false);
        EXPECT_EQ(std::regex_match("c", captures, pattern), false);
        EXPECT_EQ(std::regex_match("d", captures, pattern), false);
        EXPECT_EQ(std::regex_match("e", captures, pattern), true);
        EXPECT_EQ(std::regex_match("k", captures, pattern), false);
        EXPECT_EQ(std::regex_match("z", captures, pattern), false);
    }
}
|
||||
# endif
|
||||
|
||||
# if defined(REGEX_BENCHMARK_OUR)
|
||||
// Benchmark: matches a UUID-shaped pattern built from [[:xdigit:]] groups
// (8-4-4-4-12 hex digits) against a full UUID and a truncated one.
BENCHMARK_CASE(bracket_character_class_uuid_benchmark)
{
    Regex<PosixExtended> pattern("^([[:xdigit:]]{8})-([[:xdigit:]]{4})-([[:xdigit:]]{4})-([[:xdigit:]]{4})-([[:xdigit:]]{12})$");
    RegexResult result;
    for (size_t round = 0; round < BENCHMARK_LOOP_ITERATIONS; ++round) {
        EXPECT_EQ(pattern.match("fb9b62a2-1579-4e3a-afba-76239ccb6583", result), true);
        EXPECT_EQ(pattern.match("fb9b62a2", result), false);
    }
}
|
||||
# endif
|
||||
|
||||
# if defined(REGEX_BENCHMARK_OTHER)
|
||||
// Reference benchmark: std::regex_match of a UUID-shaped pattern built from
// [[:xdigit:]] groups (8-4-4-4-12 hex digits) against a full UUID and a truncated one.
BENCHMARK_CASE(bracket_character_class_uuid_benchmark_reference_stdcpp)
{
    std::regex pattern("^([[:xdigit:]]{8})-([[:xdigit:]]{4})-([[:xdigit:]]{4})-([[:xdigit:]]{4})-([[:xdigit:]]{12})$");
    std::cmatch captures;
    for (size_t round = 0; round < BENCHMARK_LOOP_ITERATIONS; ++round) {
        EXPECT_EQ(std::regex_match("fb9b62a2-1579-4e3a-afba-76239ccb6583", captures, pattern), true);
        EXPECT_EQ(std::regex_match("fb9b62a2", captures, pattern), false);
    }
}
|
||||
# endif
|
||||
|
||||
# if defined(REGEX_BENCHMARK_OUR)
|
||||
// Benchmark: single-character matches against the negated class "[^[:digit:]]".
BENCHMARK_CASE(simple_bracket_character_class_inverse_benchmark)
{
    Regex<PosixExtended> pattern("[^[:digit:]]");
    RegexResult result;
    for (size_t round = 0; round < BENCHMARK_LOOP_ITERATIONS; ++round) {
        EXPECT_EQ(pattern.match("1", result), false);
        EXPECT_EQ(pattern.match("2", result), false);
        EXPECT_EQ(pattern.match("3", result), false);
        EXPECT_EQ(pattern.match("d", result), true);
        EXPECT_EQ(pattern.match("e", result), true);
    }
}
|
||||
# endif
|
||||
|
||||
# if defined(REGEX_BENCHMARK_OTHER)
|
||||
// Reference benchmark: std::regex_match of single characters against "[^[:digit:]]".
BENCHMARK_CASE(simple_bracket_character_class_inverse_benchmark_reference_stdcpp)
{
    std::regex pattern("[^[:digit:]]");
    std::cmatch captures;
    for (size_t round = 0; round < BENCHMARK_LOOP_ITERATIONS; ++round) {
        EXPECT_EQ(std::regex_match("1", captures, pattern), false);
        EXPECT_EQ(std::regex_match("2", captures, pattern), false);
        EXPECT_EQ(std::regex_match("3", captures, pattern), false);
        EXPECT_EQ(std::regex_match("d", captures, pattern), true);
        EXPECT_EQ(std::regex_match("e", captures, pattern), true);
    }
}
|
||||
# endif
|
||||
|
||||
# if defined(REGEX_BENCHMARK_OUR)
|
||||
// Benchmark: matches two well-formed addresses against an e-mail address pattern
// that uses bounded repetitions for the local part, the labels and the TLD.
BENCHMARK_CASE(email_address_benchmark)
{
    Regex<PosixExtended> pattern("^[A-Z0-9a-z._%+-]{1,64}@(?:[A-Za-z0-9-]{1,63}\\.){1,125}[A-Za-z]{2,63}$");
    RegexResult result;
    for (size_t round = 0; round < BENCHMARK_LOOP_ITERATIONS; ++round) {
        EXPECT_EQ(pattern.match("hello.world@domain.tld", result), true);
        EXPECT_EQ(pattern.match("this.is.a.very_long_email_address@world.wide.web", result), true);
    }
}
|
||||
# endif
|
||||
|
||||
# if defined(REGEX_BENCHMARK_OTHER)
|
||||
// Reference benchmark: std::regex_match of two well-formed addresses against an
// e-mail address pattern that uses bounded repetitions for the local part, the
// labels and the TLD.
BENCHMARK_CASE(email_address_benchmark_reference_stdcpp)
{
    std::regex pattern("^[A-Z0-9a-z._%+-]{1,64}@(?:[A-Za-z0-9-]{1,63}\\.){1,125}[A-Za-z]{2,63}$");
    std::cmatch captures;
    for (size_t round = 0; round < BENCHMARK_LOOP_ITERATIONS; ++round) {
        EXPECT_EQ(std::regex_match("hello.world@domain.tld", captures, pattern), true);
        EXPECT_EQ(std::regex_match("this.is.a.very_long_email_address@world.wide.web", captures, pattern), true);
    }
}
|
||||
# endif
|
||||
|
||||
# if defined(REGEX_BENCHMARK_OUR)
|
||||
// Benchmark: match() and search() with the start-anchored pattern "^hello friends"
// compiled with PosixFlags::Insensitive.
BENCHMARK_CASE(simple_ignorecase_benchmark)
{
    Regex<PosixExtended> pattern("^hello friends", PosixFlags::Insensitive);
    RegexResult result;
    for (size_t round = 0; round < BENCHMARK_LOOP_ITERATIONS; ++round) {
        // Differently-cased inputs are expected to match in full.
        EXPECT_EQ(pattern.match("Hello Friends", result), true);
        EXPECT_EQ(pattern.match("hello Friends", result), true);

        // A trailing character defeats match() ...
        EXPECT_EQ(pattern.match("hello Friends!", result), false);
        EXPECT_EQ(pattern.search("hello Friends", result), true);

        // ... and a misspelled input is rejected by both entry points.
        EXPECT_EQ(pattern.match("hell Friends", result), false);
        EXPECT_EQ(pattern.search("hell Friends", result), false);
    }
}
|
||||
# endif
|
||||
|
||||
# if defined(REGEX_BENCHMARK_OTHER)
|
||||
// Reference benchmark: std::regex_match / std::regex_search with the start-anchored
// pattern "^hello friends" compiled with std::regex_constants::icase.
BENCHMARK_CASE(simple_ignorecase_benchmark_reference_stdcpp)
{
    std::regex pattern("^hello friends", std::regex_constants::icase);
    std::cmatch captures;
    for (size_t round = 0; round < BENCHMARK_LOOP_ITERATIONS; ++round) {
        // Differently-cased inputs are expected to match in full.
        EXPECT_EQ(std::regex_match("Hello Friends", captures, pattern), true);
        EXPECT_EQ(std::regex_match("hello Friends", captures, pattern), true);

        // A trailing character defeats regex_match ...
        EXPECT_EQ(std::regex_match("hello Friends!", captures, pattern), false);
        EXPECT_EQ(std::regex_search("hello Friends", captures, pattern), true);

        // ... and a misspelled input is rejected by both entry points.
        EXPECT_EQ(std::regex_match("hell Friends", captures, pattern), false);
        EXPECT_EQ(std::regex_search("hell Friends", captures, pattern), false);
    }
}
|
||||
# endif
|
||||
|
||||
# if defined(REGEX_BENCHMARK_OUR)
|
||||
// Benchmark: drives the C regex API (regcomp / regexec) with the REG_NOTBOL and
// REG_NOTEOL execution flags, alone and combined with REG_SEARCH, against an anchored
// pattern ("^hello friends$") and an unanchored one ("hello friends").
//
// Both patterns are compiled once, case-insensitively and without sub-match reporting
// (REG_ICASE | REG_NOSUB), and both compiled objects are released at the end.
BENCHMARK_CASE(simple_notbol_noteol_benchmark)
{
    String pattern = "^hello friends$";
    String pattern2 = "hello friends";
    regex_t regex, regex2;

    EXPECT_EQ(regcomp(&regex, pattern.characters(), REG_EXTENDED | REG_NOSUB | REG_ICASE), REG_NOERR);
    EXPECT_EQ(regcomp(&regex2, pattern2.characters(), REG_EXTENDED | REG_NOSUB | REG_ICASE), REG_NOERR);

    for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {

        // Anchored pattern, input equal to the pattern text.
        EXPECT_EQ(regexec(&regex, "hello friends", 0, NULL, REG_NOTBOL), REG_NOMATCH);
        EXPECT_EQ(regexec(&regex, "hello friends", 0, NULL, REG_NOTEOL), REG_NOMATCH);
        EXPECT_EQ(regexec(&regex, "hello friends", 0, NULL, REG_NOTBOL | REG_NOTEOL), REG_NOMATCH);

        // REG_NOTBOL with leading (and optionally trailing) extra text.
        EXPECT_EQ(regexec(&regex, "a hello friends b", 0, NULL, REG_NOTBOL), REG_NOMATCH);
        EXPECT_EQ(regexec(&regex, "a hello friends", 0, NULL, REG_NOTBOL), REG_NOMATCH);
        EXPECT_EQ(regexec(&regex, "a hello friends", 0, NULL, REG_NOTBOL | REG_SEARCH), REG_NOERR);
        EXPECT_EQ(regexec(&regex, "a hello friends b", 0, NULL, REG_NOTBOL | REG_SEARCH), REG_NOERR);

        // REG_NOTEOL with trailing (and optionally leading) extra text.
        EXPECT_EQ(regexec(&regex, "a hello friends b", 0, NULL, REG_NOTEOL), REG_NOMATCH);
        EXPECT_EQ(regexec(&regex, "hello friends b", 0, NULL, REG_NOTEOL), REG_NOMATCH);
        EXPECT_EQ(regexec(&regex, "hello friends b", 0, NULL, REG_NOTEOL | REG_SEARCH), REG_NOERR);
        EXPECT_EQ(regexec(&regex, "a hello friends b", 0, NULL, REG_NOTEOL | REG_SEARCH), REG_NOMATCH);

        // Both flags together.
        EXPECT_EQ(regexec(&regex, "a hello friends b", 0, NULL, REG_NOTBOL | REG_NOTEOL), REG_NOMATCH);
        EXPECT_EQ(regexec(&regex, "a hello friends b", 0, NULL, REG_NOTBOL | REG_NOTEOL | REG_SEARCH), REG_NOMATCH);

        // Unanchored pattern.
        EXPECT_EQ(regexec(&regex2, "hello friends", 0, NULL, REG_NOTBOL), REG_NOMATCH);
        EXPECT_EQ(regexec(&regex2, "hello friends", 0, NULL, REG_NOTEOL), REG_NOMATCH);
    }

    // Every successful regcomp() needs a matching regfree(); regex2 was previously leaked.
    regfree(&regex);
    regfree(&regex2);
}
|
||||
# endif
|
||||
|
||||
# if defined(REGEX_BENCHMARK_OTHER)
|
||||
// Reference benchmark: std::regex counterpart of the not-bol / not-eol checks, using
// one regex object per flag combination.
//
// NOTE(review): std::regex_constants::match_not_bol / match_not_eol are match flags
// (match_flag_type), which the standard library expects as the last argument of
// std::regex_match / std::regex_search; here they are handed to the std::regex
// constructor, which takes syntax options. Confirm that this block compiles and
// means what is intended when REGEX_BENCHMARK_OTHER is defined.
BENCHMARK_CASE(simple_notbol_noteol_benchmark_reference_stdcpp)
{
    std::regex anchored_not_bol("^hello friends$", std::regex_constants::match_not_bol);
    std::regex anchored_not_eol("^hello friends$", std::regex_constants::match_not_eol);
    std::regex anchored_not_bol_eol("^hello friends$", std::regex_constants::match_not_bol | std::regex_constants::match_not_eol);
    std::regex plain_not_bol("hello friends", std::regex_constants::match_not_bol);
    std::regex plain_not_eol("hello friends", std::regex_constants::match_not_eol);
    std::cmatch captures;
    for (size_t round = 0; round < BENCHMARK_LOOP_ITERATIONS; ++round) {
        EXPECT_EQ(std::regex_match("hello friends", captures, anchored_not_bol), false);
        EXPECT_EQ(std::regex_match("hello friends", captures, anchored_not_eol), false);
        EXPECT_EQ(std::regex_match("hello friends", captures, anchored_not_bol_eol), false);

        EXPECT_EQ(std::regex_match("a hello friends b", captures, anchored_not_bol), false);
        EXPECT_EQ(std::regex_match("a hello friends", captures, anchored_not_bol), false);
        EXPECT_EQ(std::regex_search("a hello friends", captures, anchored_not_bol), true);
        EXPECT_EQ(std::regex_search("a hello friends b", captures, anchored_not_bol), true);

        EXPECT_EQ(std::regex_match("a hello friends b", captures, anchored_not_eol), false);
        EXPECT_EQ(std::regex_match("hello friends b", captures, anchored_not_eol), false);
        EXPECT_EQ(std::regex_search("hello friends b", captures, anchored_not_eol), true);
        EXPECT_EQ(std::regex_search("a hello friends b", captures, anchored_not_eol), false);

        EXPECT_EQ(std::regex_match("a hello friends b", captures, anchored_not_bol_eol), false);
        EXPECT_EQ(std::regex_search("a hello friends b", captures, anchored_not_bol_eol), false);

        EXPECT_EQ(std::regex_match("hello friends", captures, plain_not_bol), false);
        EXPECT_EQ(std::regex_match("hello friends", captures, plain_not_eol), false);
    }
}
|
||||
# endif
|
||||
|
||||
#endif
|
||||
|
||||
// NOTE(review): TEST_MAIN is defined outside this chunk; presumably it expands to the
// program entry point that runs the cases registered above under the suite name "Regex" — confirm.
TEST_MAIN(Regex)
|
20
Userland/Libraries/LibRegex/Tests/CMakeLists.txt
Normal file
20
Userland/Libraries/LibRegex/Tests/CMakeLists.txt
Normal file
|
@ -0,0 +1,20 @@
|
|||
# Collect every test translation unit in this directory, plus the library sources
# (parent directory and its C/ subdirectory) that are compiled into each test binary.
file(GLOB TEST_SOURCES CONFIGURE_DEPENDS "*.cpp")
file(GLOB REGEX_SOURCES CONFIGURE_DEPENDS "../*.cpp" "../C/*.cpp")

# One executable and one CTest entry per test source file, named after the file.
foreach(source ${TEST_SOURCES})
    get_filename_component(name ${source} NAME_WE)

    add_executable(${name} ${source} ${REGEX_SOURCES})
    target_link_libraries(${name} LagomCore)

    add_test(NAME ${name} COMMAND ${name} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})

    # A test is reported as failed when its output contains "FAIL".
    set_tests_properties(${name} PROPERTIES FAIL_REGULAR_EXPRESSION "FAIL")
endforeach()
|
600
Userland/Libraries/LibRegex/Tests/Regex.cpp
Normal file
600
Userland/Libraries/LibRegex/Tests/Regex.cpp
Normal file
|
@ -0,0 +1,600 @@
|
|||
/*
|
||||
* Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include <AK/TestSuite.h> // import first, to prevent warning of ASSERT* redefinition
|
||||
|
||||
#include <AK/StringBuilder.h>
|
||||
#include <LibRegex/Regex.h>
|
||||
#include <LibRegex/RegexDebug.h>
|
||||
#include <stdio.h>
|
||||
|
||||
// Identity helper: accepts an ECMAScriptOptions value and returns that same value.
// The ECMAScript option test routes a combined flag expression through this call
// before inspecting the resulting option set.
static ECMAScriptOptions match_test_api_options(const ECMAScriptOptions options)
{
    return options;
}
|
||||
|
||||
// Identity helper: accepts a PosixOptions value and returns that same value.
// The POSIX option test routes a combined flag expression through this call
// before inspecting the resulting option set.
static PosixOptions match_test_api_options(const PosixOptions options)
{
    return options;
}
|
||||
|
||||
// Exercises the |=, &=, & and ~ operators on ECMAScriptOptions / ECMAScriptFlags.
TEST_CASE(regex_options_ecmascript)
{
    using Flag = ECMAScriptFlags;

    ECMAScriptOptions options;

    // Reports whether masking the current option set with `flag` leaves anything behind.
    auto is_set = [&options](Flag flag) { return !!(options & flag); };

    // A single flag OR-ed into a default-constructed option set.
    options |= Flag::Global;
    EXPECT(is_set(Flag::Global));
    EXPECT(!is_set(Flag::Insensitive));

    // Three flags combined and passed by value through the helper overload.
    options = match_test_api_options(Flag::Global | Flag::Insensitive | Flag::Sticky);
    EXPECT(is_set(Flag::Global));
    EXPECT(is_set(Flag::Insensitive));
    EXPECT(is_set(Flag::Sticky));
    EXPECT(!is_set(Flag::Unicode));
    EXPECT(!is_set(Flag::Multiline));
    EXPECT(!is_set(Flag::SingleLine));

    // Masking with one flag keeps only that flag.
    options &= Flag::Insensitive;
    EXPECT(!is_set(Flag::Global));
    EXPECT(is_set(Flag::Insensitive));
    EXPECT(!is_set(Flag::Multiline));

    // Sticky was already masked away above, so this second mask leaves nothing set.
    options &= Flag::Sticky;
    EXPECT(!is_set(Flag::Global));
    EXPECT(!is_set(Flag::Insensitive));
    EXPECT(!is_set(Flag::Multiline));
    EXPECT(!is_set(Flag::Sticky));

    // Complement of a single flag: everything but Insensitive is expected to be set.
    options = ~Flag::Insensitive;
    EXPECT(is_set(Flag::Global));
    EXPECT(!is_set(Flag::Insensitive));
    EXPECT(is_set(Flag::Multiline));
    EXPECT(is_set(Flag::Sticky));
}
|
||||
|
||||
// Exercises the |=, &=, & and ~ operators on PosixOptions / PosixFlags.
TEST_CASE(regex_options_posix)
{
    using Flag = PosixFlags;

    PosixOptions options;

    // Reports whether masking the current option set with `flag` leaves anything behind.
    auto is_set = [&options](Flag flag) { return !!(options & flag); };

    // A single flag OR-ed into a default-constructed option set.
    options |= Flag::Global;
    EXPECT(is_set(Flag::Global));
    EXPECT(!is_set(Flag::Insensitive));

    // Three flags combined and passed by value through the helper overload.
    options = match_test_api_options(Flag::Global | Flag::Insensitive | Flag::MatchNotBeginOfLine);
    EXPECT(is_set(Flag::Global));
    EXPECT(is_set(Flag::Insensitive));
    EXPECT(is_set(Flag::MatchNotBeginOfLine));
    EXPECT(!is_set(Flag::Unicode));
    EXPECT(!is_set(Flag::Multiline));

    // Masking with one flag keeps only that flag.
    options &= Flag::Insensitive;
    EXPECT(!is_set(Flag::Global));
    EXPECT(is_set(Flag::Insensitive));
    EXPECT(!is_set(Flag::Multiline));

    // MatchNotBeginOfLine was already masked away, so nothing checked below remains set.
    options &= Flag::MatchNotBeginOfLine;
    EXPECT(!is_set(Flag::Global));
    EXPECT(!is_set(Flag::Insensitive));
    EXPECT(!is_set(Flag::Multiline));

    // Complement of a single flag: everything but Insensitive is expected to be set.
    options = ~Flag::Insensitive;
    EXPECT(is_set(Flag::Global));
    EXPECT(!is_set(Flag::Insensitive));
    EXPECT(is_set(Flag::Multiline));
}
|
||||
|
||||
// Tokenizes a pattern full of special characters and checks the token type
// returned by each successive Lexer::next() call.
TEST_CASE(regex_lexer)
{
    using Type = regex::TokenType;

    Lexer lexer("/[.*+?^${}()|[\\]\\\\]/g");

    // Opening delimiter and the start of the bracket expression.
    EXPECT(lexer.next().type() == Type::Slash);
    EXPECT(lexer.next().type() == Type::LeftBracket);

    // One token per special character inside the brackets.
    EXPECT(lexer.next().type() == Type::Period);
    EXPECT(lexer.next().type() == Type::Asterisk);
    EXPECT(lexer.next().type() == Type::Plus);
    EXPECT(lexer.next().type() == Type::Questionmark);
    EXPECT(lexer.next().type() == Type::Circumflex);
    EXPECT(lexer.next().type() == Type::Dollar);
    EXPECT(lexer.next().type() == Type::LeftCurly);
    EXPECT(lexer.next().type() == Type::RightCurly);
    EXPECT(lexer.next().type() == Type::LeftParen);
    EXPECT(lexer.next().type() == Type::RightParen);
    EXPECT(lexer.next().type() == Type::Pipe);
    EXPECT(lexer.next().type() == Type::LeftBracket);

    // The escaped closing bracket and the escaped backslash are each one escape token.
    EXPECT(lexer.next().type() == Type::EscapeSequence);
    EXPECT(lexer.next().type() == Type::EscapeSequence);

    // Closing bracket, closing delimiter, then the trailing 'g' as an ordinary character.
    EXPECT(lexer.next().type() == Type::RightBracket);
    EXPECT(lexer.next().type() == Type::Slash);
    EXPECT(lexer.next().type() == Type::Char);
}
|
||||
|
||||
// An empty pair of parentheses must make the POSIX extended parser report
// Error::EmptySubExpression.
TEST_CASE(parser_error_parens)
{
    String pattern = "test()test";
    Lexer lexer(pattern);
    PosixExtendedParser parser(lexer);

    parser.parse();

    EXPECT(parser.has_error());
    EXPECT(parser.error() == Error::EmptySubExpression);
}
|
||||
|
||||
// Each repetition marker ('*', '+', '?', '{') placed where nothing precedes it
// to repeat must be rejected with Error::InvalidRepetitionMarker.
TEST_CASE(parser_error_special_characters_used_at_wrong_place)
{
    String pattern;
    Vector<char, 5> repetition_markers = { '*', '+', '?', '{' };
    StringBuilder builder;

    Lexer lexer;
    PosixExtended parser(lexer);

    // Builds the pattern accumulated in `builder`, hands it to the shared lexer,
    // re-runs the parser and checks for the expected error.
    auto expect_invalid_repetition_marker = [&] {
        pattern = builder.build();
        lexer.set_source(pattern);
        parser.parse();
        EXPECT(parser.has_error());
        EXPECT(parser.error() == Error::InvalidRepetitionMarker);
    };

    for (auto& marker : repetition_markers) {
        // Marker as the very first character of the ERE.
        builder.clear();
        builder.append(marker);
        expect_invalid_repetition_marker();

        // Marker directly after a vertical line.
        builder.clear();
        builder.append("a|");
        builder.append(marker);
        expect_invalid_repetition_marker();

        // Marker directly after a circumflex.
        builder.clear();
        builder.append("^");
        builder.append(marker);
        expect_invalid_repetition_marker();

        // Marker directly after a dollar sign.
        builder.clear();
        builder.append("$");
        builder.append(marker);
        expect_invalid_repetition_marker();

        // Marker directly after an opening parenthesis.
        builder.clear();
        builder.append("(");
        builder.append(marker);
        builder.append(")");
        expect_invalid_repetition_marker();
    }
}
|
||||
|
||||
// A vertical line with an empty alternative on either side must be rejected
// with Error::EmptySubExpression.
TEST_CASE(parser_error_vertical_line_used_at_wrong_place)
{
    Lexer lexer;
    PosixExtended parser(lexer);

    // Points the shared lexer at `source`, re-runs the parser and checks for the expected error.
    auto expect_empty_sub_expression = [&](const char* source) {
        lexer.set_source(source);
        parser.parse();
        EXPECT(parser.has_error());
        EXPECT(parser.error() == Error::EmptySubExpression);
    };

    // Vertical line as the first character of the ERE.
    expect_empty_sub_expression("|asdf");

    // Vertical line as the last character of the ERE.
    expect_empty_sub_expression("asdf|");

    // Vertical line directly after an opening parenthesis.
    expect_empty_sub_expression("(|asdf)");

    // Vertical line directly after a closing parenthesis, with nothing following it.
    expect_empty_sub_expression("(asdf)|");
}
|
||||
|
||||
// Matches a whole-line catch-all pattern through the out-parameter overload of match().
TEST_CASE(catch_all_first)
{
    Regex<PosixExtended> catch_all_pattern("^.*$");
    RegexResult outcome;

    // First call only fills `outcome`; its return value is deliberately not inspected here.
    catch_all_pattern.match("Hello World", outcome);
    EXPECT(outcome.count == 1);

    // Second call checks the return value itself.
    EXPECT(catch_all_pattern.match("Hello World", outcome));
}
|
||||
|
||||
// Runs the same catch-all pattern through the member API and through the free
// has_match()/match() functions.
TEST_CASE(catch_all)
{
    Regex<PosixExtended> catch_all_pattern("^.*$", PosixFlags::Global);

    // Member functions on the Regex object.
    EXPECT(catch_all_pattern.has_match("Hello World"));
    EXPECT(catch_all_pattern.match("Hello World").success);
    EXPECT(catch_all_pattern.match("Hello World").count == 1);

    // Free functions taking the subject first and the Regex second.
    EXPECT(has_match("Hello World", catch_all_pattern));

    auto outcome = match("Hello World", catch_all_pattern);
    EXPECT(outcome.success);
    EXPECT(outcome.count == 1);
    EXPECT(outcome.matches.size() == 1);
    EXPECT(outcome.matches.first().view == "Hello World");
}
|
||||
|
||||
// Same catch-all pattern, this time constructed with PosixFlags::Extra.
TEST_CASE(catch_all_again)
{
    Regex<PosixExtended> catch_all_pattern("^.*$", PosixFlags::Extra);

    EXPECT_EQ(has_match("Hello World", catch_all_pattern), true);
}
|
||||
|
||||
// Searches a multi-script UTF-8 subject for an emoji; with the Global flag
// both occurrences are expected to be reported.
TEST_CASE(char_utf8)
{
    Regex<PosixExtended> emoji_pattern("😀");
    RegexResult result;

    result = match("Привет, мир! 😀 γειά σου κόσμος 😀 こんにちは世界", emoji_pattern, PosixFlags::Global);

    EXPECT_EQ(result.success, true);
    EXPECT_EQ(result.count, 2u);
}
|
||||
|
||||
// Multiline catch-all with StringCopyMatches: the subject string goes out of
// scope before the match views are inspected.
TEST_CASE(catch_all_newline)
{
    Regex<PosixExtended> line_pattern("^.*$", PosixFlags::Multiline | PosixFlags::StringCopyMatches);
    RegexResult result;

    {
        // The haystack only lives inside this scope; the views are checked after it is destroyed.
        String haystack = "Hello World\nTest\n1234\n";
        result = match(haystack, line_pattern);
        EXPECT_EQ(result.success, true);
    }

    EXPECT_EQ(result.count, 3u);
    EXPECT_EQ(result.matches.at(0).view, "Hello World");
    EXPECT_EQ(result.matches.at(1).view, "Test");
    EXPECT_EQ(result.matches.at(2).view, "1234");
}
|
||||
|
||||
// Multiline catch-all without StringCopyMatches; the subject string stays
// alive for the whole test while the match views are compared.
TEST_CASE(catch_all_newline_view)
{
    Regex<PosixExtended> line_pattern("^.*$", PosixFlags::Multiline);
    RegexResult result;

    String haystack = "Hello World\nTest\n1234\n";
    result = match(haystack, line_pattern);
    EXPECT_EQ(result.success, true);
    EXPECT_EQ(result.count, 3u);

    // The first line is compared against a view of a separate String object.
    String first_line = "Hello World";
    EXPECT_EQ(result.matches.at(0).view, first_line.view());
    EXPECT_EQ(result.matches.at(1).view, "Test");
    EXPECT_EQ(result.matches.at(2).view, "1234");
}
|
||||
|
||||
// The same pattern and subject matched twice: once with per-call Multiline
// options (one match per line) and once with no options (a single match).
TEST_CASE(catch_all_newline_2)
{
    const char* haystack = "Hello World\nTest\n1234\n";

    Regex<PosixExtended> line_pattern("^.*$");
    RegexResult result;

    // Options supplied at match time rather than at construction.
    result = match(haystack, line_pattern, PosixFlags::Multiline | PosixFlags::StringCopyMatches);
    EXPECT_EQ(result.success, true);
    EXPECT_EQ(result.count, 3u);
    EXPECT_EQ(result.matches.at(0).view, "Hello World");
    EXPECT_EQ(result.matches.at(1).view, "Test");
    EXPECT_EQ(result.matches.at(2).view, "1234");

    // No per-call options: the whole subject, newlines included, is one match.
    result = match(haystack, line_pattern);
    EXPECT_EQ(result.success, true);
    EXPECT_EQ(result.count, 1u);
    EXPECT_EQ(result.matches.at(0).view, "Hello World\nTest\n1234\n");
}
|
||||
|
||||
// Global search for single alphabetic characters with StringCopyMatches enabled.
TEST_CASE(match_all_character_class)
{
    Regex<PosixExtended> alpha_pattern("[[:alpha:]]");
    String haystack = "[Window]\nOpacity=255\nAudibleBeep=0\n";

    RegexResult result = match(haystack, alpha_pattern, PosixFlags::Global | PosixFlags::StringCopyMatches);

    EXPECT_EQ(result.success, true);
    EXPECT_EQ(result.count, 24u);
    EXPECT_EQ(result.matches.at(0).view, "W");
    EXPECT_EQ(result.matches.at(1).view, "i");
    EXPECT_EQ(result.matches.at(2).view, "n");

    // The first match ("W") must not point at the 'W' inside the haystack's own buffer.
    EXPECT(&result.matches.at(0).view.characters_without_null_termination()[0] != &haystack.view().characters_without_null_termination()[1]);
}
|
||||
|
||||
// A repeated character class followed by an end-of-line assertion.
TEST_CASE(match_character_class_with_assertion)
{
    Regex<PosixExtended> alpha_run_pattern("[[:alpha:]]+$");
    String subject = "abcdef";

    RegexResult result = match(subject, alpha_run_pattern);

    EXPECT_EQ(result.success, true);
    EXPECT_EQ(result.count, 1u);
}
|
||||
|
||||
// Walk-through example: a catch-all pattern matched against a two-line subject,
// first with default options and then with PosixFlags::Multiline.
TEST_CASE(example_for_git_commit)
{
    const char* two_lines = "Well, hello friends!\nHello World!";

    Regex<PosixExtended> catch_all_pattern("^.*$");

    // Default options: one match spanning both lines (20 + 1 + 12 = 33 characters).
    auto result = catch_all_pattern.match(two_lines);
    EXPECT(result.success);
    EXPECT(result.count == 1);
    EXPECT(result.matches.at(0).view.starts_with("Well"));
    EXPECT(result.matches.at(0).view.length() == 33);

    EXPECT(catch_all_pattern.has_match("Well,...."));

    // Multiline: each line becomes its own match.
    result = catch_all_pattern.match(two_lines, PosixFlags::Multiline);
    EXPECT(result.success);
    EXPECT(result.count == 2);
    EXPECT(result.matches.at(0).view == "Well, hello friends!");
    EXPECT(result.matches.at(1).view == "Hello World!");
}
|
||||
|
||||
// Bounded repetitions and a repeated capture group, checked against two e-mail addresses.
TEST_CASE(email_address)
{
    Regex<PosixExtended> email_pattern("^[A-Z0-9a-z._%+-]{1,64}@([A-Za-z0-9-]{1,63}\\.){1,125}[A-Za-z]{2,63}$");

    EXPECT(email_pattern.has_match("hello.world@domain.tld"));
    EXPECT(email_pattern.has_match("this.is.a.very_long_email_address@world.wide.web"));
}
|
||||
|
||||
// Searches an INI-style subject for "[section]" and "key=digits" entries and
// checks the matches, their capture groups and the reported line/column.
TEST_CASE(ini_file_entries)
{
    Regex<PosixExtended> ini_pattern("[[:alpha:]]*=([[:digit:]]*)|\\[(.*)\\]");
    RegexResult result;

#ifdef REGEX_DEBUG
    RegexDebug debug_printer(stderr);
    debug_printer.print_raw_bytecode(ini_pattern);
    debug_printer.print_header();
    debug_printer.print_bytecode(ini_pattern);
#endif

    String haystack = "[Window]\nOpacity=255\nAudibleBeep=0\n";
    EXPECT_EQ(ini_pattern.search(haystack.view(), result, PosixFlags::Multiline), true);
    EXPECT_EQ(result.count, 3u);

#ifdef REGEX_DEBUG
    for (auto& found : result.matches)
        fprintf(stderr, "%s\n", found.view.to_string().characters());
#endif

    // Section header on the first line.
    EXPECT_EQ(result.matches.at(0).view, "[Window]");
    EXPECT_EQ(result.capture_group_matches.at(0).at(0).view, "Window");

    // First key/value entry.
    auto& opacity_entry = result.matches.at(1);
    EXPECT_EQ(opacity_entry.view, "Opacity=255");
    EXPECT_EQ(opacity_entry.line, 1u);
    EXPECT_EQ(opacity_entry.column, 0u);

    auto& opacity_value = result.capture_group_matches.at(1).at(0);
    EXPECT_EQ(opacity_value.view, "255");
    EXPECT_EQ(opacity_value.line, 1u);
    EXPECT_EQ(opacity_value.column, 8u);

    // Second key/value entry.
    EXPECT_EQ(result.matches.at(2).view, "AudibleBeep=0");

    auto& beep_value = result.capture_group_matches.at(2).at(0);
    EXPECT_EQ(beep_value.view, "0");
    EXPECT_EQ(beep_value.line, 2u);
    EXPECT_EQ(beep_value.column, 12u);
}
|
||||
|
||||
// Contrasts match() with search() on the same pattern and subject:
// match() is expected to fail here while search() succeeds with one hit.
TEST_CASE(ini_file_entries2)
{
    Regex<PosixExtended> entry_pattern("[[:alpha:]]*=([[:digit:]]*)");
    RegexResult result;

    String haystack = "ViewMode=Icon";

    EXPECT_EQ(entry_pattern.match(haystack.view(), result), false);
    EXPECT_EQ(result.count, 0u);

    EXPECT_EQ(entry_pattern.search(haystack.view(), result), true);
    EXPECT_EQ(result.count, 1u);
}
|
||||
|
||||
// Searches for "key=digits" entries with the digits captured in a group named
// "Test", then reads the captures back by name.
TEST_CASE(named_capture_group)
{
    Regex<PosixExtended> entry_pattern("[[:alpha:]]*=(?<Test>[[:digit:]]*)");
    RegexResult result;

#ifdef REGEX_DEBUG
    RegexDebug debug_printer(stderr);
    debug_printer.print_raw_bytecode(entry_pattern);
    debug_printer.print_header();
    debug_printer.print_bytecode(entry_pattern);
#endif

    String haystack = "[Window]\nOpacity=255\nAudibleBeep=0\n";
    EXPECT_EQ(entry_pattern.search(haystack, result, PosixFlags::Multiline), true);
    EXPECT_EQ(result.count, 2u);

    // First entry and its named capture.
    EXPECT_EQ(result.matches.at(0).view, "Opacity=255");
    EXPECT_EQ(result.named_capture_group_matches.at(0).ensure("Test").view, "255");

    // Second entry and its named capture.
    EXPECT_EQ(result.matches.at(1).view, "AudibleBeep=0");
    EXPECT_EQ(result.named_capture_group_matches.at(1).ensure("Test").view, "0");
}
|
||||
|
||||
// "a*" can match the empty string, so a search over the subject yields many
// zero-length matches plus the single "a".
TEST_CASE(a_star)
{
    Regex<PosixExtended> star_pattern("a*");
    RegexResult result;

#ifdef REGEX_DEBUG
    RegexDebug debug_printer(stderr);
    debug_printer.print_raw_bytecode(star_pattern);
    debug_printer.print_header();
    debug_printer.print_bytecode(star_pattern);
#endif

    String haystack = "[Window]\nOpacity=255\nAudibleBeep=0\n";
    EXPECT_EQ(star_pattern.search(haystack.view(), result, PosixFlags::Multiline), true);
    EXPECT_EQ(result.count, 32u);

    // Empty match at the start, the one real "a" at index 10, empty match at the end.
    EXPECT_EQ(result.matches.at(0).view.length(), 0u);
    EXPECT_EQ(result.matches.at(10).view.length(), 1u);
    EXPECT_EQ(result.matches.at(10).view, "a");
    EXPECT_EQ(result.matches.at(31).view.length(), 0u);
}
|
||||
|
||||
// "hello" followed by exactly one arbitrary character at the end of the subject.
TEST_CASE(simple_period_end_benchmark)
{
    Regex<PosixExtended> hello_pattern("hello.$");
    RegexResult outcome;

    EXPECT_EQ(hello_pattern.search("Hello1", outcome), false);
    EXPECT_EQ(hello_pattern.search("hello1hello1", outcome), true);
    EXPECT_EQ(hello_pattern.search("hello2hell", outcome), false);
    EXPECT_EQ(hello_pattern.search("hello?", outcome), true);
}
|
||||
|
||||
// Table-driven parser test: every pattern is compiled with the ECMA262 parser
// using the flags listed for that row, and the parser's reported error is
// compared with the expected one.
TEST_CASE(ECMA262_parse)
{
    // One table row: the pattern source, the error the parser should report
    // (NoError unless stated) and the flags the pattern is compiled with
    // (none unless stated).
    struct _test {
        const char* pattern;
        regex::Error expected_error { regex::Error::NoError };
        regex::ECMAScriptFlags flags {};
    };

    constexpr _test tests[] {
        { "^hello.$" },
        { "^(hello.)$" },
        { "^h{0,1}ello.$" },
        { "^hello\\W$" },
        { "^hell\\w.$" },
        { "^hell\\x6f1$" }, // ^hello1$
        { "^hel(?:l\\w).$" },
        { "^hel(?<LO>l\\w).$" },
        { "^[-a-zA-Z\\w\\s]+$" },
        { "\\bhello\\B" },
        { "^[\\w+/_-]+[=]{0,2}$" },                        // #4189
        { "^(?:[^<]*(<[\\w\\W]+>)[^>]*$|#([\\w\\-]*)$)" }, // #4189
        { "\\/" },                                         // #4189
        { ",/=-:" },                                       // #4243
        { "\\x" },                                         // Even invalid escapes are allowed if ~unicode.
        { "\\", regex::Error::InvalidTrailingEscape },
        { "(?", regex::Error::InvalidCaptureGroup },
        { "\\u1234", regex::Error::NoError, regex::ECMAScriptFlags::Unicode },
        { "[\\u1234]", regex::Error::NoError, regex::ECMAScriptFlags::Unicode },
        { ",(?", regex::Error::InvalidCaptureGroup }, // #4583
    };

    for (auto& test : tests) {
        // Pass the row's flags through to the regex. Without this the rows that
        // request ECMAScriptFlags::Unicode would be parsed as non-Unicode
        // patterns and the `flags` column would have no effect.
        Regex<ECMA262> re(test.pattern, test.flags);
        EXPECT_EQ(re.parser_result.error, test.expected_error);
#ifdef REGEX_DEBUG
        dbgln("\n");
        RegexDebug regex_dbg(stderr);
        regex_dbg.print_raw_bytecode(re);
        regex_dbg.print_header();
        regex_dbg.print_bytecode(re);
        dbgln("\n");
#endif
    }
}
|
||||
|
||||
// Table-driven matcher test: each pattern must parse without error and its
// match() result against the subject must equal the expected outcome.
TEST_CASE(ECMA262_match)
{
    // One table row: pattern, subject, whether a match is expected (defaults to
    // true) and the flags the pattern is compiled with (defaults to none).
    struct MatchCase {
        const char* pattern;
        const char* subject;
        bool matches { true };
        ECMAScriptFlags options {};
    };

    constexpr MatchCase cases[] {
        { "^hello.$", "hello1" },
        { "^(hello.)$", "hello1" },
        { "^h{0,1}ello.$", "ello1" },
        { "^hello\\W$", "hello!" },
        { "^hell\\w.$", "hellx!" },
        { "^hell\\x6f1$", "hello1" },
        { "^hel(?<LO>l.)1$", "hello1" },
        { "^hel(?<LO>l.)1*\\k<LO>.$", "hello1lo1" },
        { "^[-a-z1-3\\s]+$", "hell2 o1" },
        { .pattern = "\\bhello\\B", .subject = "hello1", .options = ECMAScriptFlags::Global },
        { "\\b.*\\b", "hello1" },
        { "[^\\D\\S]{2}", "1 " },
        { "bar(?=f.)foo", "barfoo" },
        { "bar(?=foo)bar", "barbar", false },
        { "bar(?!foo)bar", "barbar", true },
        { "bar(?!bar)bar", "barbar", false },
        { "bar.*(?<=foo)", "barbar", false },
        { "bar.*(?<!foo)", "barbar", true },
        { "((...)X)+", "fooXbarXbazX", true },
        { "(?:)", "", true },
    };

    for (auto& current : cases) {
        Regex<ECMA262> compiled(current.pattern, current.options);
#ifdef REGEX_DEBUG
        dbgln("\n");
        RegexDebug debug_printer(stderr);
        debug_printer.print_raw_bytecode(compiled);
        debug_printer.print_header();
        debug_printer.print_bytecode(compiled);
        dbgln("\n");
#endif
        // Every pattern in this table is expected to parse cleanly.
        EXPECT_EQ(compiled.parser_result.error, Error::NoError);
        EXPECT_EQ(compiled.match(current.subject).success, current.matches);
    }
}
|
||||
|
||||
// Table-driven replacement test: each pattern must parse without error and
// replace() on the subject must produce the expected string.
TEST_CASE(replace)
{
    // One table row: pattern, replacement template, subject, expected output
    // and the flags the pattern is compiled with (defaults to none).
    struct ReplaceCase {
        const char* pattern;
        const char* replacement;
        const char* subject;
        const char* expected;
        ECMAScriptFlags options {};
    };

    constexpr ReplaceCase cases[] {
        { "foo(.+)", "aaa", "test", "test" },
        { "foo(.+)", "test\\1", "foobar", "testbar" },
        { "foo(.+)", "\\2\\1", "foobar", "\\2bar" },
        { "foo(.+)", "\\\\\\1", "foobar", "\\bar" },
        { "foo(.)", "a\\1", "fooxfooy", "axay", ECMAScriptFlags::Multiline },
    };

    for (auto& current : cases) {
        Regex<ECMA262> compiled(current.pattern, current.options);
#ifdef REGEX_DEBUG
        dbgln("\n");
        RegexDebug debug_printer(stderr);
        debug_printer.print_raw_bytecode(compiled);
        debug_printer.print_header();
        debug_printer.print_bytecode(compiled);
        dbgln("\n");
#endif
        // Every pattern in this table is expected to parse cleanly.
        EXPECT_EQ(compiled.parser_result.error, Error::NoError);
        EXPECT_EQ(compiled.replace(current.subject, current.replacement), current.expected);
    }
}
|
||||
|
||||
// Closes the file by registering the suite under the name "Regex".
// NOTE(review): TEST_MAIN is presumably provided by <AK/TestSuite.h> and presumably emits the program entry point; confirm there.
TEST_MAIN(Regex)
|
1140
Userland/Libraries/LibRegex/Tests/RegexLibC.cpp
Normal file
1140
Userland/Libraries/LibRegex/Tests/RegexLibC.cpp
Normal file
File diff suppressed because it is too large
Load diff
Loading…
Add table
Add a link
Reference in a new issue