1
Fork 0
mirror of https://github.com/RGBCube/serenity synced 2025-07-27 17:17:45 +00:00

Libraries: Move to Userland/Libraries/

This commit is contained in:
Andreas Kling 2021-01-12 12:17:30 +01:00
parent dc28c07fa5
commit 13d7c09125
1857 changed files with 266 additions and 274 deletions

View file

@ -0,0 +1,256 @@
/*
* Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <AK/String.h>
#include <AK/StringBuilder.h>
#include <LibRegex/Regex.h>
#include <ctype.h>
#include <stdio.h>
#include <string.h>
#ifdef __serenity__
# include <regex.h>
#else
# include <LibC/regex.h>
#endif
// Private state that this POSIX shim stores behind regex_t::__data.
// regcomp() brace-initializes it positionally, so the member order below must not change.
struct internal_regex_t {
u8 cflags; // compile flags handed to regcomp(); the int argument is narrowed to 8 bits on assignment
u8 eflags; // zero-initialized by regcomp(); not written anywhere else in the code shown here
OwnPtr<Regex<PosixExtended>> re; // compiled expression; stays null until regcomp() has built one
size_t re_pat_errpos; // position of the offending token when the parser reported an error
ReError re_pat_err; // parser error recorded by regcomp(), REG_NOERR otherwise
String re_pat; // copy of the pattern text, stored only when parsing failed
size_t re_nsub; // capture group count reported by the parser on success
};
// Returns the internal state attached to `re`, or nullptr when `re` itself is null.
// The returned pointer is whatever is stored in re->__data, so it may also be null.
static internal_regex_t* impl_from(regex_t* re)
{
    return re ? reinterpret_cast<internal_regex_t*>(re->__data) : nullptr;
}
// Const flavour of impl_from(): strips the const from `re`, forwards to the mutable
// overload and hands the result back as a pointer-to-const.
static const internal_regex_t* impl_from(const regex_t* re)
{
    auto* mutable_re = const_cast<regex_t*>(re);
    return impl_from(mutable_re);
}
extern "C" {
// Compiles `pattern` into `reg`.
//
// Returns REG_ESPACE when `reg` is null, REG_ENOSYS when REG_EXTENDED is not requested,
// the parser's error code when the pattern does not parse, and REG_NOERR otherwise.
// The internal state is allocated before any flag check, so it exists (and should be
// released with regfree()) even when REG_ENOSYS or a parse error is returned.
int regcomp(regex_t* reg, const char* pattern, int cflags)
{
    if (!reg)
        return REG_ESPACE;

    // Note that subsequent uses of regcomp() without regfree() _will_ leak memory
    // This could've been prevented if libc provided a reginit() or similar, but it does not.
    reg->__data = new internal_regex_t { 0, 0, {}, 0, ReError::REG_NOERR, {}, 0 };
    auto* state = impl_from(reg);

    // Only the extended syntax is implemented by this wrapper.
    if (!(cflags & REG_EXTENDED))
        return REG_ENOSYS;

    state->cflags = cflags;

    String pattern_str(pattern);
    state->re = make<Regex<PosixExtended>>(pattern_str, PosixOptions {} | (PosixFlags)cflags | PosixFlags::SkipTrimEmptyMatches);

    auto parser_result = state->re->parser_result;
    if (parser_result.error == regex::Error::NoError) {
        state->re_nsub = parser_result.capture_groups_count;
        return REG_NOERR;
    }

    // Remember what went wrong so that regexec() and regerror() can report it later.
    auto error_code = (ReError)parser_result.error;
    state->re_pat_errpos = parser_result.error_token.position();
    state->re_pat_err = error_code;
    state->re_pat = pattern;

    dbg() << "Have Error: " << error_code;
    return error_code;
}
// Runs the expression compiled into `reg` against `string`.
//
// Returns REG_NOERR on a match and REG_NOMATCH otherwise. If `reg` carries no usable
// compiled expression, the recorded compile error (or REG_BADPAT) is returned instead.
//
// When `nmatch` and `pmatch` are both non-zero, `pmatch` is filled as a flat list:
// each overall match is followed by one slot per capture group of the pattern, and
// any slots left over up to `nmatch` are reset to -1 / -1 / 0.
int regexec(const regex_t* reg, const char* string, size_t nmatch, regmatch_t pmatch[], int eflags)
{
    auto preg = impl_from(reg);

    // impl_from() yields nullptr for a null regex_t and for one whose state has been
    // released by regfree(); there is nothing to execute in either case.
    if (!preg)
        return REG_BADPAT;

    if (!preg->re || preg->re_pat_err) {
        if (preg->re_pat_err)
            return preg->re_pat_err;
        return REG_BADPAT;
    }

    RegexResult result;
    if (eflags & REG_SEARCH)
        result = preg->re->search(string, PosixOptions {} | (PosixFlags)eflags);
    else
        result = preg->re->match(string, PosixOptions {} | (PosixFlags)eflags);

    if (result.success) {
        auto size = result.matches.size();
        if (size && nmatch && pmatch) {
            pmatch[0].rm_cnt = size;

            size_t match_index { 0 };
            for (size_t i = 0; i < size; ++i) {
                // Slot for the overall match number i.
                pmatch[match_index].rm_so = result.matches.at(i).global_offset;
                pmatch[match_index].rm_eo = pmatch[match_index].rm_so + result.matches.at(i).view.length();
                if (match_index > 0)
                    pmatch[match_index].rm_cnt = result.capture_group_matches.size();

                ++match_index;
                if (match_index >= nmatch)
                    return REG_NOERR;

                if (i < result.capture_group_matches.size()) {
                    auto capture_groups_size = result.capture_group_matches.at(i).size();
                    // One slot per capture group in the pattern; groups that are missing
                    // or empty are reported as unset.
                    for (size_t j = 0; j < preg->re->parser_result.capture_groups_count; ++j) {
                        if (j >= capture_groups_size || !result.capture_group_matches.at(i).at(j).view.length()) {
                            pmatch[match_index].rm_so = -1;
                            pmatch[match_index].rm_eo = -1;
                            pmatch[match_index].rm_cnt = 0;
                        } else {
                            pmatch[match_index].rm_so = result.capture_group_matches.at(i).at(j).global_offset;
                            pmatch[match_index].rm_eo = pmatch[match_index].rm_so + result.capture_group_matches.at(i).at(j).view.length();
                            pmatch[match_index].rm_cnt = 1;
                        }

                        ++match_index;
                        if (match_index >= nmatch)
                            return REG_NOERR;
                    }
                }
            }

            // Reset every slot the caller provided that was not filled above.
            if (match_index < nmatch) {
                for (size_t i = match_index; i < nmatch; ++i) {
                    pmatch[i].rm_so = -1;
                    pmatch[i].rm_eo = -1;
                    pmatch[i].rm_cnt = 0;
                }
            }
        }
        return REG_NOERR;
    } else {
        if (nmatch && pmatch) {
            pmatch[0].rm_so = -1;
            pmatch[0].rm_eo = -1;
            pmatch[0].rm_cnt = 0;
        }
    }

    return REG_NOMATCH;
}
// Translates a ReError code into its human-readable description.
// Codes that are not listed below produce a default-constructed String.
inline static String get_error(ReError errcode)
{
    switch ((ReError)errcode) {
    case REG_NOERR:
        return "No error";
    case REG_NOMATCH:
        return "regexec() failed to match.";
    case REG_BADPAT:
        return "Invalid regular expression.";
    case REG_ECOLLATE:
        return "Invalid collating element referenced.";
    case REG_ECTYPE:
        return "Invalid character class type referenced.";
    case REG_EESCAPE:
        return "Trailing \\ in pattern.";
    case REG_ESUBREG:
        return "Number in \\digit invalid or in error.";
    case REG_EBRACK:
        return "[ ] imbalance.";
    case REG_EPAREN:
        return "\\( \\) or ( ) imbalance.";
    case REG_EBRACE:
        return "\\{ \\} imbalance.";
    case REG_BADBR:
        return "Content of \\{ \\} invalid: not a number, number too large, more than two numbers, first larger than second.";
    case REG_ERANGE:
        return "Invalid endpoint in range expression.";
    case REG_ESPACE:
        return "Out of memory.";
    case REG_BADRPT:
        return "?, * or + not preceded by valid regular expression.";
    case REG_ENOSYS:
        return "The implementation does not support the function.";
    case REG_EMPTY_EXPR:
        return "Empty expression provided";
    }

    return {};
}
// Writes a description of a regex error into `errbuf`.
//
// When `reg` carries a compiled expression, the message is built by that expression's
// error_string() from the error recorded at compile time. Otherwise the generic
// description of `errcode` is used.
//
// Returns the length of the message, or 0 if it could not be copied into `errbuf`.
// With `errbuf_size` == 0 nothing is copied and only the length is returned.
size_t regerror(int errcode, const regex_t* reg, char* errbuf, size_t errbuf_size)
{
    String error;
    auto preg = impl_from(reg);

    // regcomp() can fail before it builds the expression (for example with REG_ENOSYS),
    // which leaves the state allocated but `re` null. Use the plain message for
    // `errcode` in that case instead of dereferencing a null pointer.
    if (!preg || !preg->re)
        error = get_error((ReError)errcode);
    else
        error = preg->re->error_string(get_error(preg->re_pat_err));

    if (!errbuf_size)
        return error.length();

    if (!error.copy_characters_to_buffer(errbuf, errbuf_size))
        return 0;

    return error.length();
}
// Releases the state allocated by regcomp() and clears the handle so that a second
// regfree() on the same object does nothing. Safe to call with a null `reg`.
void regfree(regex_t* reg)
{
    auto* state = impl_from(reg);
    if (!state)
        return;

    delete state;
    reg->__data = nullptr;
}
}

View file

@ -0,0 +1,10 @@
# Translation units that make up LibRegex: the C API wrapper in C/Regex.cpp plus the
# RegexByteCode, RegexLexer, RegexMatcher and RegexParser sources.
set(SOURCES
C/Regex.cpp
RegexByteCode.cpp
RegexLexer.cpp
RegexMatcher.cpp
RegexParser.cpp
)
# serenity_lib() is a project helper defined outside this file; it receives the target
# name (LibRegex) and a second name (regex) - presumably the library's output name.
serenity_lib(LibRegex regex)
# LibRegex links against LibC and LibCore.
target_link_libraries(LibRegex LibC LibCore)

View file

@ -0,0 +1,58 @@
/*
* Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#include <AK/Types.h>
// Forward declarations for LibRegex types, so that headers can refer to them without
// including the full definitions.
namespace regex {
// Error codes; only the underlying type (u8) is fixed here.
enum class Error : u8;
class Lexer;
class PosixExtendedParser;
class ECMA262Parser;
class ByteCode;
// Bytecode instruction base class and individual instructions.
class OpCode;
class OpCode_Exit;
class OpCode_Jump;
class OpCode_ForkJump;
class OpCode_ForkStay;
class OpCode_CheckBegin;
class OpCode_CheckEnd;
class OpCode_SaveLeftCaptureGroup;
class OpCode_SaveRightCaptureGroup;
class OpCode_SaveLeftNamedCaptureGroup;
// NOTE(review): the next name looks like a transposed duplicate of
// OpCode_SaveLeftNamedCaptureGroup - confirm whether anything refers to it.
class OpCode_SaveNamedLeftCaptureGroup;
class OpCode_SaveRightNamedCaptureGroup;
class OpCode_Compare;
class RegexStringView;
}
// Make the most commonly used names available without the regex:: qualifier.
using regex::ECMA262Parser;
using regex::Error;
using regex::Lexer;
using regex::PosixExtendedParser;
using regex::RegexStringView;

View file

@ -0,0 +1,31 @@
/*
* Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#include <LibRegex/Forward.h>
#include <LibRegex/RegexDebug.h>
#include <LibRegex/RegexMatcher.h>

View file

@ -0,0 +1,749 @@
/*
* Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "RegexByteCode.h"
#include "AK/StringBuilder.h"
#include "RegexDebug.h"
#include <ctype.h>
namespace regex {
// Returns the enumerator name of `opcode_id` as a C string.
// The cases are generated by ENUMERATE_OPCODES (defined outside this file), which is
// expected to invoke __ENUMERATE_OPCODE for every opcode; each invocation becomes a
// `case` that returns the stringified enumerator.
const char* OpCode::name(OpCodeId opcode_id)
{
switch (opcode_id) {
#define __ENUMERATE_OPCODE(x) \
case OpCodeId::x: \
return #x;
ENUMERATE_OPCODES
#undef __ENUMERATE_OPCODE
default:
// A value outside the generated cases is treated as a programming error.
ASSERT_NOT_REACHED();
return "<Unknown>";
}
}
// Name of this instruction, looked up through the static overload using its own id.
const char* OpCode::name() const
{
    const auto id = opcode_id();
    return name(id);
}
// Returns the enumerator name of an ExecutionResult as a C string.
// The cases are generated by ENUMERATE_EXECUTION_RESULTS (defined outside this file)
// through the __ENUMERATE_EXECUTION_RESULT helper macro.
const char* execution_result_name(ExecutionResult result)
{
switch (result) {
#define __ENUMERATE_EXECUTION_RESULT(x) \
case ExecutionResult::x: \
return #x;
ENUMERATE_EXECUTION_RESULTS
#undef __ENUMERATE_EXECUTION_RESULT
default:
// A value outside the generated cases is treated as a programming error.
ASSERT_NOT_REACHED();
return "<Unknown>";
}
}
// Returns the enumerator name of a BoundaryCheckType as a C string.
// The cases are generated by ENUMERATE_BOUNDARY_CHECK_TYPES (defined outside this file)
// through the __ENUMERATE_BOUNDARY_CHECK_TYPE helper macro.
const char* boundary_check_type_name(BoundaryCheckType ty)
{
switch (ty) {
#define __ENUMERATE_BOUNDARY_CHECK_TYPE(x) \
case BoundaryCheckType::x: \
return #x;
ENUMERATE_BOUNDARY_CHECK_TYPES
#undef __ENUMERATE_BOUNDARY_CHECK_TYPE
default:
// A value outside the generated cases is treated as a programming error.
ASSERT_NOT_REACHED();
return "<Unknown>";
}
}
// Returns the enumerator name of a CharacterCompareType as a C string.
// The cases are generated by ENUMERATE_CHARACTER_COMPARE_TYPES (defined outside this
// file) through the __ENUMERATE_CHARACTER_COMPARE_TYPE helper macro.
const char* character_compare_type_name(CharacterCompareType ch_compare_type)
{
switch (ch_compare_type) {
#define __ENUMERATE_CHARACTER_COMPARE_TYPE(x) \
case CharacterCompareType::x: \
return #x;
ENUMERATE_CHARACTER_COMPARE_TYPES
#undef __ENUMERATE_CHARACTER_COMPARE_TYPE
default:
// A value outside the generated cases is treated as a programming error.
ASSERT_NOT_REACHED();
return "<Unknown>";
}
}
// Returns the enumerator name of a CharClass as a C string (file-local helper).
// The cases are generated by ENUMERATE_CHARACTER_CLASSES (defined outside this file)
// through the __ENUMERATE_CHARACTER_CLASS helper macro.
static const char* character_class_name(CharClass ch_class)
{
switch (ch_class) {
#define __ENUMERATE_CHARACTER_CLASS(x) \
case CharClass::x: \
return #x;
ENUMERATE_CHARACTER_CLASSES
#undef __ENUMERATE_CHARACTER_CLASS
default:
// A value outside the generated cases is treated as a programming error.
ASSERT_NOT_REACHED();
return "<Unknown>";
}
}
// One OpCode object per opcode id, shared by every ByteCode instance.
HashMap<u32, OwnPtr<OpCode>> ByteCode::s_opcodes {};

// Returns the shared OpCode object for `id`, re-pointed at this ByteCode.
// The table is filled on the first call (while it is still empty). Ids greater than
// OpCodeId::Last yield nullptr.
ALWAYS_INLINE OpCode* ByteCode::get_opcode_by_id(OpCodeId id) const
{
    // The opcode objects keep a mutable reference to their bytecode, hence the cast.
    auto& self = *const_cast<ByteCode*>(this);

    if (s_opcodes.size() == 0) {
        const auto first_id = (u32)OpCodeId::First;
        const auto last_id = (u32)OpCodeId::Last;
        for (u32 raw_id = first_id; raw_id <= last_id; ++raw_id) {
            switch ((OpCodeId)raw_id) {
            case OpCodeId::Exit:
                s_opcodes.set(raw_id, make<OpCode_Exit>(self));
                break;
            case OpCodeId::Jump:
                s_opcodes.set(raw_id, make<OpCode_Jump>(self));
                break;
            case OpCodeId::Compare:
                s_opcodes.set(raw_id, make<OpCode_Compare>(self));
                break;
            case OpCodeId::CheckEnd:
                s_opcodes.set(raw_id, make<OpCode_CheckEnd>(self));
                break;
            case OpCodeId::CheckBoundary:
                s_opcodes.set(raw_id, make<OpCode_CheckBoundary>(self));
                break;
            case OpCodeId::ForkJump:
                s_opcodes.set(raw_id, make<OpCode_ForkJump>(self));
                break;
            case OpCodeId::ForkStay:
                s_opcodes.set(raw_id, make<OpCode_ForkStay>(self));
                break;
            case OpCodeId::FailForks:
                s_opcodes.set(raw_id, make<OpCode_FailForks>(self));
                break;
            case OpCodeId::Save:
                s_opcodes.set(raw_id, make<OpCode_Save>(self));
                break;
            case OpCodeId::Restore:
                s_opcodes.set(raw_id, make<OpCode_Restore>(self));
                break;
            case OpCodeId::GoBack:
                s_opcodes.set(raw_id, make<OpCode_GoBack>(self));
                break;
            case OpCodeId::CheckBegin:
                s_opcodes.set(raw_id, make<OpCode_CheckBegin>(self));
                break;
            case OpCodeId::SaveLeftCaptureGroup:
                s_opcodes.set(raw_id, make<OpCode_SaveLeftCaptureGroup>(self));
                break;
            case OpCodeId::SaveRightCaptureGroup:
                s_opcodes.set(raw_id, make<OpCode_SaveRightCaptureGroup>(self));
                break;
            case OpCodeId::SaveLeftNamedCaptureGroup:
                s_opcodes.set(raw_id, make<OpCode_SaveLeftNamedCaptureGroup>(self));
                break;
            case OpCodeId::SaveRightNamedCaptureGroup:
                s_opcodes.set(raw_id, make<OpCode_SaveRightNamedCaptureGroup>(self));
                break;
            }
        }
    }

    if (id > OpCodeId::Last)
        return nullptr;

    // The cached object may last have been used by another ByteCode; rebind it to this one.
    return const_cast<OpCode*>(s_opcodes.get((u32)id).value())->set_bytecode(self);
}
// Fetches the instruction at state.instruction_position and binds `state` to it.
// Running past the end of the bytecode yields the Exit instruction. The result may be
// nullptr when the id stored in the bytecode is not a known opcode.
OpCode* ByteCode::get_opcode(MatchState& state) const
{
    const bool past_end = state.instruction_position >= size();
    auto* opcode = get_opcode_by_id(past_end ? OpCodeId::Exit : (OpCodeId)at(state.instruction_position));

    if (!opcode)
        return nullptr;

    opcode->set_state(state);
    return opcode;
}
// Succeeds when the string position lies beyond the input or the instruction pointer
// has reached the end of the bytecode; fails otherwise.
ALWAYS_INLINE ExecutionResult OpCode_Exit::execute(const MatchInput& input, MatchState& state, MatchOutput&) const
{
    if (state.string_position > input.view.length())
        return ExecutionResult::Succeeded;

    if (state.instruction_position >= m_bytecode->size())
        return ExecutionResult::Succeeded;

    return ExecutionResult::Failed;
}
// Pushes the current string position onto the input's saved-position stack.
ALWAYS_INLINE ExecutionResult OpCode_Save::execute(const MatchInput& input, MatchState& state, MatchOutput&) const
{
    auto& saved = input.saved_positions;
    saved.append(state.string_position);
    return ExecutionResult::Continue;
}
// Pops the most recently saved string position and makes it current.
// Fails when no position has been saved.
ALWAYS_INLINE ExecutionResult OpCode_Restore::execute(const MatchInput& input, MatchState& state, MatchOutput&) const
{
    auto& saved = input.saved_positions;
    if (!saved.is_empty()) {
        state.string_position = saved.take_last();
        return ExecutionResult::Continue;
    }

    return ExecutionResult::Failed;
}
// Moves the string position back by count() characters.
// Fails (letting low-priority forks run) when that would go before the start.
ALWAYS_INLINE ExecutionResult OpCode_GoBack::execute(const MatchInput&, MatchState& state, MatchOutput&) const
{
    const auto steps = count();
    if (state.string_position < steps)
        return ExecutionResult::Failed_ExecuteLowPrioForks;

    state.string_position -= steps;
    return ExecutionResult::Continue;
}
// Fails the current path and raises the input's fail counter by count() - 1.
// count() must be positive.
ALWAYS_INLINE ExecutionResult OpCode_FailForks::execute(const MatchInput& input, MatchState&, MatchOutput&) const
{
    const auto forks = count();
    ASSERT(forks > 0);

    input.fail_counter += forks - 1;
    return ExecutionResult::Failed_ExecuteLowPrioForks;
}
// Unconditional relative jump: adds offset() to the instruction position.
ALWAYS_INLINE ExecutionResult OpCode_Jump::execute(const MatchInput&, MatchState& state, MatchOutput&) const
{
    state.instruction_position = state.instruction_position + offset();
    return ExecutionResult::Continue;
}
// Records the fork target (offset() past the end of this instruction) and requests a
// high-priority fork.
ALWAYS_INLINE ExecutionResult OpCode_ForkJump::execute(const MatchInput&, MatchState& state, MatchOutput&) const
{
    const auto after_this_instruction = state.instruction_position + size();
    state.fork_at_position = after_this_instruction + offset();
    return ExecutionResult::Fork_PrioHigh;
}
// Records the fork target (offset() past the end of this instruction) and requests a
// low-priority fork.
ALWAYS_INLINE ExecutionResult OpCode_ForkStay::execute(const MatchInput&, MatchState& state, MatchOutput&) const
{
    const auto after_this_instruction = state.instruction_position + size();
    state.fork_at_position = after_this_instruction + offset();
    return ExecutionResult::Fork_PrioLow;
}
// Begin-of-line assertion.
// Without MatchNotBeginOfLine it holds only at string position 0; with the flag set it
// holds only at positions other than 0. The Global flag does not influence the outcome.
ALWAYS_INLINE ExecutionResult OpCode_CheckBegin::execute(const MatchInput& input, MatchState& state, MatchOutput&) const
{
    const bool at_start = state.string_position == 0;
    const bool not_begin_of_line = !!(input.regex_options & AllFlags::MatchNotBeginOfLine);

    // Exactly one of the two has to be true for the assertion to hold.
    if (at_start != not_begin_of_line)
        return ExecutionResult::Continue;

    return ExecutionResult::Failed_ExecuteLowPrioForks;
}
// Word-boundary assertion. A word character is alphanumeric or '_'.
// Word:    continue only when the current position is a word boundary.
// NonWord: continue only when it is not.
ALWAYS_INLINE ExecutionResult OpCode_CheckBoundary::execute(const MatchInput& input, MatchState& state, MatchOutput&) const
{
    auto is_word_char = [](auto ch) { return isalnum(ch) || ch == '_'; };

    auto at_word_boundary = [&]() -> bool {
        const auto position = state.string_position;

        // End of input: a boundary only if the last character is a word character.
        if (position == input.view.length())
            return position > 0 && is_word_char(input.view[position - 1]);

        // Start of input: a boundary only if the first character is a word character.
        if (position == 0)
            return is_word_char(input.view[0]);

        // Elsewhere: a boundary when exactly one neighbour is a word character.
        const bool current_is_word = is_word_char(input.view[position]);
        const bool previous_is_word = is_word_char(input.view[position - 1]);
        return current_is_word != previous_is_word;
    };

    switch (type()) {
    case BoundaryCheckType::Word:
        return at_word_boundary() ? ExecutionResult::Continue : ExecutionResult::Failed_ExecuteLowPrioForks;
    case BoundaryCheckType::NonWord:
        return at_word_boundary() ? ExecutionResult::Failed_ExecuteLowPrioForks : ExecutionResult::Continue;
    }

    ASSERT_NOT_REACHED();
}
// End-of-line assertion.
// At the end of the input it holds unless MatchNotEndOfLine is set. Before the end of
// the input it holds only when MatchNotEndOfLine or MatchNotBeginOfLine is set.
ALWAYS_INLINE ExecutionResult OpCode_CheckEnd::execute(const MatchInput& input, MatchState& state, MatchOutput&) const
{
    const bool at_end = state.string_position == input.view.length();
    const bool not_end_of_line = !!(input.regex_options & AllFlags::MatchNotEndOfLine);

    if (at_end) {
        if (not_end_of_line)
            return ExecutionResult::Failed_ExecuteLowPrioForks;
        return ExecutionResult::Continue;
    }

    const bool not_begin_of_line = !!(input.regex_options & AllFlags::MatchNotBeginOfLine);
    if (not_end_of_line || not_begin_of_line)
        return ExecutionResult::Continue;

    return ExecutionResult::Failed_ExecuteLowPrioForks;
}
// Records where capture group id() starts for the current match.
// Both the per-match list and the per-group list are grown (with default-constructed
// entries) until the required index exists.
ALWAYS_INLINE ExecutionResult OpCode_SaveLeftCaptureGroup::execute(const MatchInput& input, MatchState& state, MatchOutput& output) const
{
    auto& per_match = output.capture_group_matches;
    if (input.match_index >= per_match.size()) {
        per_match.ensure_capacity(input.match_index);
        // Append entries up to and including index `capacity`, as sampled right after reserving.
        const auto capacity = per_match.capacity();
        for (size_t i = per_match.size(); i <= capacity; ++i)
            per_match.empend();
    }

    const auto group_id = id();
    auto& groups = per_match.at(input.match_index);
    if (group_id >= groups.size()) {
        groups.ensure_capacity(group_id);
        const auto capacity = groups.capacity();
        for (size_t i = groups.size(); i <= capacity; ++i)
            groups.empend();
    }

    groups.at(group_id).left_column = state.string_position;
    return ExecutionResult::Continue;
}
// Closes capture group id(): stores the text between the recorded left edge and the
// current string position as the group's match.
// Nothing is stored when the recorded left edge lies before the column of the match
// that is already stored for this group.
ALWAYS_INLINE ExecutionResult OpCode_SaveRightCaptureGroup::execute(const MatchInput& input, MatchState& state, MatchOutput& output) const
{
    auto& match = output.capture_group_matches.at(input.match_index).at(id());

    const auto start_position = match.left_column;
    if (start_position < match.column)
        return ExecutionResult::Continue;

    const auto length = state.string_position - start_position;
    ASSERT(start_position + length <= input.view.length());

    auto view = input.view.substring_view(start_position, length);
    const bool copy_matches = !!(input.regex_options & AllFlags::StringCopyMatches);
    if (copy_matches)
        match = { view.to_string(), input.line, start_position, input.global_offset + start_position }; // create a copy of the original string
    else
        match = { view, input.line, start_position, input.global_offset + start_position }; // take view to original string

    return ExecutionResult::Continue;
}
// Records where the named capture group name() starts for the current match.
// The per-match list is grown with empty maps until the required index exists.
ALWAYS_INLINE ExecutionResult OpCode_SaveLeftNamedCaptureGroup::execute(const MatchInput& input, MatchState& state, MatchOutput& output) const
{
    auto& per_match = output.named_capture_group_matches;
    if (input.match_index >= per_match.size()) {
        per_match.ensure_capacity(input.match_index);
        // Append entries up to and including index `capacity`, as sampled right after reserving.
        const auto capacity = per_match.capacity();
        for (size_t i = per_match.size(); i <= capacity; ++i)
            per_match.empend();
    }

    auto& named_groups = per_match.at(input.match_index);
    named_groups.ensure(name()).column = state.string_position;
    return ExecutionResult::Continue;
}
// Closes the named capture group name(): stores the text between the recorded start
// column and the current string position under that name.
// When no start was recorded for the name, a diagnostic goes to stderr and execution
// simply continues.
ALWAYS_INLINE ExecutionResult OpCode_SaveRightNamedCaptureGroup::execute(const MatchInput& input, MatchState& state, MatchOutput& output) const
{
    StringView capture_group_name = name();
    auto& map = output.named_capture_group_matches.at(input.match_index);

    if (!map.contains(capture_group_name)) {
        fprintf(stderr, "Didn't find corresponding capture group match for name=%s, match_index=%lu\n", capture_group_name.to_string().characters(), input.match_index);
        return ExecutionResult::Continue;
    }

    auto start_position = map.ensure(capture_group_name).column;
    auto length = state.string_position - start_position;

#ifdef REGEX_DEBUG
    ASSERT(start_position + length <= input.view.length());
    dbg() << "Save named capture group with name=" << capture_group_name << " and content: " << input.view.substring_view(start_position, length).to_string();
#endif

    ASSERT(start_position + length <= input.view.length());
    auto view = input.view.substring_view(start_position, length);

    const bool copy_matches = !!(input.regex_options & AllFlags::StringCopyMatches);
    if (copy_matches)
        map.set(capture_group_name, { view.to_string(), input.line, start_position, input.global_offset + start_position }); // create a copy of the original string
    else
        map.set(capture_group_name, { view, input.line, start_position, input.global_offset + start_position }); // take view to original string

    return ExecutionResult::Continue;
}
// Executes one Compare instruction against the input at state.string_position.
//
// The instruction carries arguments_count() compare elements, read from the bytecode
// starting three slots after the instruction position. Elements are tried in order
// until one of them advances the string position. `Inverse` negates every following
// element; `TemporaryInverse` negates only the element right after it.
//
// Returns Continue when the string position advanced and still lies within the input,
// and Failed_ExecuteLowPrioForks otherwise.
ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(const MatchInput& input, MatchState& state, MatchOutput& output) const
{
    bool inverse { false };
    bool temporary_inverse { false };
    bool reset_temp_inverse { false };

    auto current_inversion_state = [&]() -> bool { return temporary_inverse ^ inverse; };

    size_t string_position = state.string_position;
    bool inverse_matched { false };

    size_t offset { state.instruction_position + 3 };
    for (size_t i = 0; i < arguments_count(); ++i) {
        // A previous element already consumed input, so the compare has matched.
        if (state.string_position > string_position)
            break;

        // A temporary inversion stays active for exactly one element after it was set.
        if (reset_temp_inverse) {
            reset_temp_inverse = false;
            temporary_inverse = false;
        } else {
            reset_temp_inverse = true;
        }

        auto compare_type = (CharacterCompareType)m_bytecode->at(offset++);

        if (compare_type == CharacterCompareType::Inverse)
            inverse = true;

        else if (compare_type == CharacterCompareType::TemporaryInverse) {
            // If "TemporaryInverse" is given, negate the current inversion state only for the next opcode.
            // it follows that this cannot be the last compare element.
            ASSERT(i != arguments_count() - 1);

            temporary_inverse = true;
            reset_temp_inverse = false;

        } else if (compare_type == CharacterCompareType::Char) {
            u32 ch = m_bytecode->at(offset++);

            // At least one character of input has to be left to compare against.
            if (input.view.length() - state.string_position < 1)
                return ExecutionResult::Failed_ExecuteLowPrioForks;

            compare_char(input, state, ch, current_inversion_state(), inverse_matched);

        } else if (compare_type == CharacterCompareType::AnyChar) {
            // At least one character of input has to be left to consume.
            if (input.view.length() - state.string_position < 1)
                return ExecutionResult::Failed_ExecuteLowPrioForks;

            ASSERT(!current_inversion_state());
            ++state.string_position;

        } else if (compare_type == CharacterCompareType::String) {
            ASSERT(!current_inversion_state());

            const auto& length = m_bytecode->at(offset++);
            StringBuilder str_builder;
            for (size_t k = 0; k < length; ++k)
                str_builder.append(m_bytecode->at(offset++));

            // The remaining input has to be at least as long as the string to compare.
            if (input.view.length() - state.string_position < length)
                return ExecutionResult::Failed_ExecuteLowPrioForks;

            if (!compare_string(input, state, str_builder.string_view().characters_without_null_termination(), length))
                return ExecutionResult::Failed_ExecuteLowPrioForks;

        } else if (compare_type == CharacterCompareType::CharClass) {
            // At least one character of input has to be left to classify.
            if (input.view.length() - state.string_position < 1)
                return ExecutionResult::Failed_ExecuteLowPrioForks;

            auto character_class = (CharClass)m_bytecode->at(offset++);
            auto ch = input.view[state.string_position];

            compare_character_class(input, state, character_class, ch, current_inversion_state(), inverse_matched);

        } else if (compare_type == CharacterCompareType::CharRange) {
            // Same guard as for Char and CharClass: without it the read of
            // input.view[state.string_position] below goes past the end of the input
            // when the range is tried at the end of the string.
            if (input.view.length() - state.string_position < 1)
                return ExecutionResult::Failed_ExecuteLowPrioForks;

            auto value = (CharRange)m_bytecode->at(offset++);

            auto from = value.from;
            auto to = value.to;
            auto ch = input.view[state.string_position];

            compare_character_range(input, state, from, to, ch, current_inversion_state(), inverse_matched);

        } else if (compare_type == CharacterCompareType::Reference) {
            auto reference_number = (size_t)m_bytecode->at(offset++);
            auto& groups = output.capture_group_matches.at(input.match_index);
            if (groups.size() <= reference_number)
                return ExecutionResult::Failed_ExecuteLowPrioForks;

            auto str = groups.at(reference_number).view;

            // The remaining input has to be at least as long as the referenced capture.
            if (input.view.length() - state.string_position < str.length())
                return ExecutionResult::Failed_ExecuteLowPrioForks;

            if (!compare_string(input, state, str.characters_without_null_termination(), str.length()))
                return ExecutionResult::Failed_ExecuteLowPrioForks;

        } else if (compare_type == CharacterCompareType::NamedReference) {
            // The group name is stored in the bytecode as a pointer followed by its length.
            auto ptr = (const char*)m_bytecode->at(offset++);
            auto length = (size_t)m_bytecode->at(offset++);
            StringView name { ptr, length };

            auto group = output.named_capture_group_matches.at(input.match_index).get(name);
            if (!group.has_value())
                return ExecutionResult::Failed_ExecuteLowPrioForks;

            auto str = group.value().view;

            // The remaining input has to be at least as long as the referenced capture.
            if (input.view.length() - state.string_position < str.length())
                return ExecutionResult::Failed_ExecuteLowPrioForks;

            if (!compare_string(input, state, str.characters_without_null_termination(), str.length()))
                return ExecutionResult::Failed_ExecuteLowPrioForks;

        } else {
            fprintf(stderr, "Undefined comparison: %i\n", (int)compare_type);
            ASSERT_NOT_REACHED();
            break;
        }
    }

    // An inverted compare matches (and consumes one character) when none of its elements matched.
    if (current_inversion_state() && !inverse_matched)
        ++state.string_position;

    if (string_position == state.string_position || state.string_position > input.view.length())
        return ExecutionResult::Failed_ExecuteLowPrioForks;

    return ExecutionResult::Continue;
}
// Compares `ch1` with the input character at the current string position, folding both
// to lower case when the Insensitive flag is set.
// On equality the position advances, or - for an inverted compare - `inverse_matched`
// is set instead. Nothing happens when the characters differ.
ALWAYS_INLINE void OpCode_Compare::compare_char(const MatchInput& input, MatchState& state, u32 ch1, bool inverse, bool& inverse_matched)
{
    u32 ch2 = input.view[state.string_position];

    const bool insensitive = !!(input.regex_options & AllFlags::Insensitive);
    if (insensitive) {
        ch1 = tolower(ch1);
        ch2 = tolower(ch2);
    }

    if (ch1 != ch2)
        return;

    if (inverse)
        inverse_matched = true;
    else
        ++state.string_position;
}
// Compares `length` characters of `str` with the input at the current string position,
// case-insensitively when the Insensitive flag is set.
// Advances the position by `length` and returns true on equality. Returns false on a
// mismatch and whenever the input is not a u8 view.
ALWAYS_INLINE bool OpCode_Compare::compare_string(const MatchInput& input, MatchState& state, const char* str, size_t length)
{
    if (!input.view.is_u8_view())
        return false;

    auto expected = StringView(str, length);
    auto actual = StringView(&input.view.u8view()[state.string_position], length);

    // The lowered copies have to outlive the views that are re-pointed at them.
    String lowered_expected, lowered_actual;
    if (input.regex_options & AllFlags::Insensitive) {
        lowered_expected = expected.to_string().to_lowercase();
        lowered_actual = actual.to_string().to_lowercase();
        expected = lowered_expected.view();
        actual = lowered_actual.view();
    }

    if (!(expected == actual))
        return false;

    state.string_position += length;
    return true;
}
ALWAYS_INLINE void OpCode_Compare::compare_character_class(const MatchInput& input, MatchState& state, CharClass character_class, u32 ch, bool inverse, bool& inverse_matched)
{
switch (character_class) {
case CharClass::Alnum:
if (isalnum(ch)) {
if (inverse)
inverse_matched = true;
else
++state.string_position;
}
break;
case CharClass::Alpha:
if (isalpha(ch))
++state.string_position;
break;
case CharClass::Blank:
if (ch == ' ' || ch == '\t') {
if (inverse)
inverse_matched = true;
else
++state.string_position;
}
break;
case CharClass::Cntrl:
if (iscntrl(ch)) {
if (inverse)
inverse_matched = true;
else
++state.string_position;
}
break;
case CharClass::Digit:
if (isdigit(ch)) {
if (inverse)
inverse_matched = true;
else
++state.string_position;
}
break;
case CharClass::Graph:
if (isgraph(ch)) {
if (inverse)
inverse_matched = true;
else
++state.string_position;
}
break;
case CharClass::Lower:
if (islower(ch) || ((input.regex_options & AllFlags::Insensitive) && isupper(ch))) {
if (inverse)
inverse_matched = true;
else
++state.string_position;
}
break;
case CharClass::Print:
if (isprint(ch)) {
if (inverse)
inverse_matched = true;
else
++state.string_position;
}
break;
case CharClass::Punct:
if (ispunct(ch)) {
if (inverse)
inverse_matched = true;
else
++state.string_position;
}
break;
case CharClass::Space:
if (isspace(ch)) {
if (inverse)
inverse_matched = true;
else
++state.string_position;
}
break;
case CharClass::Upper:
if (isupper(ch) || ((input.regex_options & AllFlags::Insensitive) && islower(ch))) {
if (inverse)
inverse_matched = true;
else
++state.string_position;
}
break;
case CharClass::Word:
if (isalnum(ch) || ch == '_') {
if (inverse)
inverse_matched = true;
else
++state.string_position;
}
break;
case CharClass::Xdigit:
if (isxdigit(ch)) {
if (inverse)
inverse_matched = true;
else
++state.string_position;
}
break;
}
}
// Tests whether `ch` lies in the inclusive range [from, to]; all three values are
// folded to lower case first when the Insensitive flag is set.
// On a hit the string position advances, or - for an inverted compare -
// `inverse_matched` is set instead. Nothing happens on a miss.
ALWAYS_INLINE void OpCode_Compare::compare_character_range(const MatchInput& input, MatchState& state, u32 from, u32 to, u32 ch, bool inverse, bool& inverse_matched)
{
    const bool insensitive = !!(input.regex_options & AllFlags::Insensitive);
    if (insensitive) {
        from = tolower(from);
        to = tolower(to);
        ch = tolower(ch);
    }

    const bool in_range = from <= ch && ch <= to;
    if (!in_range)
        return;

    if (inverse)
        inverse_matched = true;
    else
        ++state.string_position;
}
// Describes the fixed part of a Compare instruction: the number of compare
// arguments and the number of bytecode slots they occupy.
const String OpCode_Compare::arguments_string() const
{
    // Both accessors return size_t, so use the size_t conversion (%zu).
    return String::format("argc=%zu, args=%zu ", arguments_count(), arguments_size());
}
// Produces human-readable lines describing every compare argument of this
// instruction (used for debug output). When `input` is given and the current
// string position lies inside its view, a "compare against" line showing the
// relevant part of the input is added for the argument kinds that consume text.
const Vector<String> OpCode_Compare::variable_arguments_to_string(Optional<MatchInput> input) const
{
    Vector<String> result;

    // The first compare argument follows the opcode id, the argument count and the argument size.
    size_t offset { state().instruction_position + 3 };
    RegexStringView view = ((input.has_value()) ? input.value().view : nullptr);

    for (size_t i = 0; i < arguments_count(); ++i) {
        auto compare_type = (CharacterCompareType)m_bytecode->at(offset++);
        result.empend(String::format("type=%zu [%s]", (size_t)compare_type, character_compare_type_name(compare_type)));

        // The text shown starts one character before the current position (or at 0).
        auto compared_against_string_start_offset = state().string_position > 0 ? state().string_position - 1 : state().string_position;
        const bool can_show_input = !view.is_null() && view.length() > state().string_position;

        // Appends the single input character that a one-character comparison is checked against.
        // can_show_input guarantees string_position < view.length(), so one character is always available.
        auto append_compared_character = [&] {
            if (!can_show_input)
                return;
            result.empend(String::format(
                "compare against: '%s'",
                view.substring_view(compared_against_string_start_offset, 1).to_string().characters()));
        };

        if (compare_type == CharacterCompareType::Char) {
            char ch = m_bytecode->at(offset++);
            result.empend(String::format("value='%c'", ch));
            append_compared_character();
        } else if (compare_type == CharacterCompareType::NamedReference) {
            // Stored as a raw pointer to the name followed by its length.
            auto ptr = (const char*)m_bytecode->at(offset++);
            auto length = m_bytecode->at(offset++);
            result.empend(String::format("name='%.*s'", (int)length, ptr));
        } else if (compare_type == CharacterCompareType::Reference) {
            auto ref = m_bytecode->at(offset++);
            result.empend(String::formatted("number={}", ref));
        } else if (compare_type == CharacterCompareType::String) {
            // Stored as a length followed by one slot per character.
            auto& length = m_bytecode->at(offset++);
            StringBuilder str_builder;
            for (size_t character_index = 0; character_index < length; ++character_index)
                str_builder.append(m_bytecode->at(offset++));
            result.empend(String::format("value=\"%.*s\"", (int)length, str_builder.string_view().characters_without_null_termination()));
            if (can_show_input)
                result.empend(String::format(
                    "compare against: \"%s\"",
                    view.substring_view(compared_against_string_start_offset, compared_against_string_start_offset + length > view.length() ? 0 : length).to_string().characters()));
        } else if (compare_type == CharacterCompareType::CharClass) {
            auto character_class = (CharClass)m_bytecode->at(offset++);
            result.empend(String::format("ch_class=%zu [%s]", (size_t)character_class, character_class_name(character_class)));
            append_compared_character();
        } else if (compare_type == CharacterCompareType::CharRange) {
            auto value = (CharRange)m_bytecode->at(offset++);
            result.empend(String::format("ch_range='%c'-'%c'", value.from, value.to));
            append_compared_character();
        }
        // All remaining kinds carry no value slot, so only the type line is emitted for them.
    }
    return result;
}
}

View file

@ -0,0 +1,837 @@
/*
* Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#include "RegexMatch.h"
#include "RegexOptions.h"
#include <AK/Format.h>
#include <AK/Forward.h>
#include <AK/HashMap.h>
#include <AK/NonnullOwnPtr.h>
#include <AK/OwnPtr.h>
#include <AK/Traits.h>
#include <AK/Types.h>
#include <AK/Vector.h>
namespace regex {
using ByteCodeValueType = u64;
// X-macro listing the name of every bytecode instruction. A user defines
// __ENUMERATE_OPCODE(x) before expanding ENUMERATE_OPCODES and undefines it afterwards.
#define ENUMERATE_OPCODES \
__ENUMERATE_OPCODE(Compare) \
__ENUMERATE_OPCODE(Jump) \
__ENUMERATE_OPCODE(ForkJump) \
__ENUMERATE_OPCODE(ForkStay) \
__ENUMERATE_OPCODE(FailForks) \
__ENUMERATE_OPCODE(SaveLeftCaptureGroup) \
__ENUMERATE_OPCODE(SaveRightCaptureGroup) \
__ENUMERATE_OPCODE(SaveLeftNamedCaptureGroup) \
__ENUMERATE_OPCODE(SaveRightNamedCaptureGroup) \
__ENUMERATE_OPCODE(CheckBegin) \
__ENUMERATE_OPCODE(CheckEnd) \
__ENUMERATE_OPCODE(CheckBoundary) \
__ENUMERATE_OPCODE(Save) \
__ENUMERATE_OPCODE(Restore) \
__ENUMERATE_OPCODE(GoBack) \
__ENUMERATE_OPCODE(Exit)
// clang-format off
// Identifier of each instruction, with ByteCodeValueType as the underlying type.
// The enumerators are generated from ENUMERATE_OPCODES in listing order.
enum class OpCodeId : ByteCodeValueType {
#define __ENUMERATE_OPCODE(x) x,
ENUMERATE_OPCODES
#undef __ENUMERATE_OPCODE
// Aliases for the first and last generated enumerators.
First = Compare,
Last = Exit,
};
// clang-format on
// X-macro listing the kinds of argument a Compare instruction can carry.
// Expand it with __ENUMERATE_CHARACTER_COMPARE_TYPE(x) defined.
#define ENUMERATE_CHARACTER_COMPARE_TYPES \
__ENUMERATE_CHARACTER_COMPARE_TYPE(Undefined) \
__ENUMERATE_CHARACTER_COMPARE_TYPE(Inverse) \
__ENUMERATE_CHARACTER_COMPARE_TYPE(TemporaryInverse) \
__ENUMERATE_CHARACTER_COMPARE_TYPE(AnyChar) \
__ENUMERATE_CHARACTER_COMPARE_TYPE(Char) \
__ENUMERATE_CHARACTER_COMPARE_TYPE(String) \
__ENUMERATE_CHARACTER_COMPARE_TYPE(CharClass) \
__ENUMERATE_CHARACTER_COMPARE_TYPE(CharRange) \
__ENUMERATE_CHARACTER_COMPARE_TYPE(Reference) \
__ENUMERATE_CHARACTER_COMPARE_TYPE(NamedReference) \
__ENUMERATE_CHARACTER_COMPARE_TYPE(RangeExpressionDummy)
// Type tag written in front of each compare argument in the bytecode.
// Enumerators are generated from ENUMERATE_CHARACTER_COMPARE_TYPES in listing order.
enum class CharacterCompareType : ByteCodeValueType {
#define __ENUMERATE_CHARACTER_COMPARE_TYPE(x) x,
ENUMERATE_CHARACTER_COMPARE_TYPES
#undef __ENUMERATE_CHARACTER_COMPARE_TYPE
};
// X-macro listing the supported character class names.
// Expand it with __ENUMERATE_CHARACTER_CLASS(x) defined.
#define ENUMERATE_CHARACTER_CLASSES \
__ENUMERATE_CHARACTER_CLASS(Alnum) \
__ENUMERATE_CHARACTER_CLASS(Cntrl) \
__ENUMERATE_CHARACTER_CLASS(Lower) \
__ENUMERATE_CHARACTER_CLASS(Space) \
__ENUMERATE_CHARACTER_CLASS(Alpha) \
__ENUMERATE_CHARACTER_CLASS(Digit) \
__ENUMERATE_CHARACTER_CLASS(Print) \
__ENUMERATE_CHARACTER_CLASS(Upper) \
__ENUMERATE_CHARACTER_CLASS(Blank) \
__ENUMERATE_CHARACTER_CLASS(Graph) \
__ENUMERATE_CHARACTER_CLASS(Punct) \
__ENUMERATE_CHARACTER_CLASS(Word) \
__ENUMERATE_CHARACTER_CLASS(Xdigit)
// Character class identifiers; enumerators are generated from
// ENUMERATE_CHARACTER_CLASSES in listing order.
enum class CharClass : ByteCodeValueType {
#define __ENUMERATE_CHARACTER_CLASS(x) x,
ENUMERATE_CHARACTER_CLASSES
#undef __ENUMERATE_CHARACTER_CLASS
};
// X-macro listing the boundary check kinds.
// Expand it with __ENUMERATE_BOUNDARY_CHECK_TYPE(x) defined.
#define ENUMERATE_BOUNDARY_CHECK_TYPES \
__ENUMERATE_BOUNDARY_CHECK_TYPE(Word) \
__ENUMERATE_BOUNDARY_CHECK_TYPE(NonWord)
// Argument of the CheckBoundary instruction; enumerators are generated from
// ENUMERATE_BOUNDARY_CHECK_TYPES in listing order.
enum class BoundaryCheckType : ByteCodeValueType {
#define __ENUMERATE_BOUNDARY_CHECK_TYPE(x) x,
ENUMERATE_BOUNDARY_CHECK_TYPES
#undef __ENUMERATE_BOUNDARY_CHECK_TYPE
};
// Inclusive character range [from, to] that packs into a single bytecode value:
// `from` occupies the upper 32 bits, `to` the lower 32 bits.
struct CharRange {
    const u32 from;
    const u32 to;

    // Unpacks a range from its 64-bit bytecode representation.
    CharRange(u64 value)
        : from(static_cast<u32>(value >> 32))
        , to(static_cast<u32>(value & 0xffffffff))
    {
    }

    // Builds a range from its two bounds.
    CharRange(u32 from, u32 to)
        : from(from)
        , to(to)
    {
    }

    // Packs the range back into its 64-bit bytecode representation.
    operator ByteCodeValueType() const
    {
        const u64 upper_half = static_cast<u64>(from) << 32;
        return upper_half | to;
    }
};
// One compare argument as handed to ByteCode::insert_bytecode_compare_values():
// the argument kind plus its raw value. The value is not emitted for the
// Inverse, TemporaryInverse and AnyChar kinds.
struct CompareTypeAndValuePair {
CharacterCompareType type;
ByteCodeValueType value;
};
class OpCode;
class ByteCode : public Vector<ByteCodeValueType> {
public:
ByteCode() = default;
virtual ~ByteCode() = default;
// Appends one Compare instruction built from `pairs`.
// Layout: Compare, argument count, argument size, then for every pair its type
// followed by its value (the value is omitted for kinds that carry none).
void insert_bytecode_compare_values(Vector<CompareTypeAndValuePair>&& pairs)
{
    // Build the variable-length payload first so its size is known for the header.
    ByteCode arguments;
    for (auto& pair : pairs) {
        // These kinds must never be emitted through this function.
        ASSERT(pair.type != CharacterCompareType::RangeExpressionDummy);
        ASSERT(pair.type != CharacterCompareType::Undefined);
        ASSERT(pair.type != CharacterCompareType::String);
        ASSERT(pair.type != CharacterCompareType::NamedReference);

        arguments.append((ByteCodeValueType)pair.type);

        const bool is_valueless = pair.type == CharacterCompareType::Inverse
            || pair.type == CharacterCompareType::AnyChar
            || pair.type == CharacterCompareType::TemporaryInverse;
        if (!is_valueless)
            arguments.append(move(pair.value));
    }

    ByteCode instruction;
    instruction.empend(static_cast<ByteCodeValueType>(OpCodeId::Compare));
    instruction.empend(pairs.size());     // number of arguments
    instruction.empend(arguments.size()); // size of arguments
    instruction.append(move(arguments));

    append(move(instruction));
}
// Appends a CheckBoundary instruction: the opcode id followed by the boundary kind.
void insert_bytecode_check_boundary(BoundaryCheckType type)
{
    empend(static_cast<ByteCodeValueType>(OpCodeId::CheckBoundary));
    empend(static_cast<ByteCodeValueType>(type));
}
// Appends a Compare instruction with a single String argument.
// The payload is the String type tag followed by the length-prefixed characters of `view`.
void insert_bytecode_compare_string(StringView view)
{
    // Payload first, so that its size can be written into the header.
    ByteCode arguments;
    arguments.empend(static_cast<ByteCodeValueType>(CharacterCompareType::String));
    arguments.insert_string(view);

    ByteCode instruction;
    instruction.empend(static_cast<ByteCodeValueType>(OpCodeId::Compare));
    instruction.empend(static_cast<u64>(1)); // number of arguments
    instruction.empend(arguments.size());    // size of arguments
    instruction.append(move(arguments));

    append(move(instruction));
}
// Appends a Compare instruction with a single NamedReference argument.
// The payload is the type tag, the address of the name's characters, and the name's length.
// NOTE(review): only the pointer is stored, not a copy of the characters, so the
// buffer behind `name` presumably has to outlive the bytecode. Confirm with callers.
void insert_bytecode_compare_named_reference(StringView name)
{
    const auto name_address = reinterpret_cast<ByteCodeValueType>(name.characters_without_null_termination());

    ByteCode arguments;
    arguments.empend(static_cast<ByteCodeValueType>(CharacterCompareType::NamedReference));
    arguments.empend(name_address);
    arguments.empend(name.length());

    ByteCode instruction;
    instruction.empend(static_cast<ByteCodeValueType>(OpCodeId::Compare));
    instruction.empend(static_cast<u64>(1)); // number of arguments
    instruction.empend(arguments.size());    // size of arguments
    instruction.append(move(arguments));

    append(move(instruction));
}
// Appends a SaveLeftCaptureGroup instruction whose argument is `capture_groups_count`.
void insert_bytecode_group_capture_left(size_t capture_groups_count)
{
    const auto opcode = static_cast<ByteCodeValueType>(OpCodeId::SaveLeftCaptureGroup);
    empend(opcode);
    empend(capture_groups_count);
}
// Appends a SaveLeftNamedCaptureGroup instruction.
// Its arguments are the address of the name's characters and the name's length.
// NOTE(review): the characters are referenced by pointer, not copied, so the buffer
// behind `name` presumably has to stay alive as long as the bytecode. Confirm.
void insert_bytecode_group_capture_left(const StringView& name)
{
    const auto name_address = reinterpret_cast<ByteCodeValueType>(name.characters_without_null_termination());
    empend(static_cast<ByteCodeValueType>(OpCodeId::SaveLeftNamedCaptureGroup));
    empend(name_address);
    empend(name.length());
}
// Appends a SaveRightCaptureGroup instruction whose argument is `capture_groups_count`.
void insert_bytecode_group_capture_right(size_t capture_groups_count)
{
    const auto opcode = static_cast<ByteCodeValueType>(OpCodeId::SaveRightCaptureGroup);
    empend(opcode);
    empend(capture_groups_count);
}
// Appends a SaveRightNamedCaptureGroup instruction.
// Its arguments are the address of the name's characters and the name's length.
// NOTE(review): the characters are referenced by pointer, not copied, so the buffer
// behind `name` presumably has to stay alive as long as the bytecode. Confirm.
void insert_bytecode_group_capture_right(const StringView& name)
{
    const auto name_address = reinterpret_cast<ByteCodeValueType>(name.characters_without_null_termination());
    empend(static_cast<ByteCodeValueType>(OpCodeId::SaveRightNamedCaptureGroup));
    empend(name_address);
    empend(name.length());
}
// Selects which instruction sequence insert_bytecode_lookaround() emits.
enum class LookAroundType {
LookAhead,
LookBehind,
NegatedLookAhead,
NegatedLookBehind,
};
// Appends the instruction sequence for a lookaround assertion around `lookaround_body`.
// `type` selects the layout (shown as pseudo-assembly in each case below).
// `match_length` is only emitted for the two look-behind variants, as the GoBack argument.
// The jump offsets are computed from the exact number of slots emitted, so the order
// and count of the empend()/append() calls in each case must not change.
void insert_bytecode_lookaround(ByteCode&& lookaround_body, LookAroundType type, size_t match_length = 0)
{
// FIXME: The save stack will grow infinitely with repeated failures
// as we do not discard that on failure (we don't necessarily know how many to pop with the current architecture).
switch (type) {
case LookAroundType::LookAhead: {
// SAVE
// REGEXP BODY
// RESTORE
empend((ByteCodeValueType)OpCodeId::Save);
append(move(lookaround_body));
empend((ByteCodeValueType)OpCodeId::Restore);
return;
}
case LookAroundType::NegatedLookAhead: {
// JUMP _A
// LABEL _L
// REGEXP BODY
// FAIL 2
// LABEL _A
// SAVE
// FORKJUMP _L
// RESTORE
// Capture the size before the body is moved from.
auto body_length = lookaround_body.size();
empend((ByteCodeValueType)OpCodeId::Jump);
empend((ByteCodeValueType)body_length + 2); // JUMP to label _A
append(move(lookaround_body));
empend((ByteCodeValueType)OpCodeId::FailForks);
empend((ByteCodeValueType)2); // Fail two forks
empend((ByteCodeValueType)OpCodeId::Save);
empend((ByteCodeValueType)OpCodeId::ForkJump);
empend((ByteCodeValueType) - (body_length + 5)); // JUMP to label _L
empend((ByteCodeValueType)OpCodeId::Restore);
return;
}
case LookAroundType::LookBehind:
// SAVE
// GOBACK match_length(BODY)
// REGEXP BODY
// RESTORE
empend((ByteCodeValueType)OpCodeId::Save);
empend((ByteCodeValueType)OpCodeId::GoBack);
empend((ByteCodeValueType)match_length);
append(move(lookaround_body));
empend((ByteCodeValueType)OpCodeId::Restore);
return;
case LookAroundType::NegatedLookBehind: {
// JUMP _A
// LABEL _L
// GOBACK match_length(BODY)
// REGEXP BODY
// FAIL 2
// LABEL _A
// SAVE
// FORKJUMP _L
// RESTORE
// Capture the size before the body is moved from.
auto body_length = lookaround_body.size();
empend((ByteCodeValueType)OpCodeId::Jump);
empend((ByteCodeValueType)body_length + 4); // JUMP to label _A
empend((ByteCodeValueType)OpCodeId::GoBack);
empend((ByteCodeValueType)match_length);
append(move(lookaround_body));
empend((ByteCodeValueType)OpCodeId::FailForks);
empend((ByteCodeValueType)2); // Fail two forks
empend((ByteCodeValueType)OpCodeId::Save);
empend((ByteCodeValueType)OpCodeId::ForkJump);
empend((ByteCodeValueType) - (body_length + 7)); // JUMP to label _L
empend((ByteCodeValueType)OpCodeId::Restore);
return;
}
}
// Every enumerator returns above; reaching this point means `type` held an invalid value.
ASSERT_NOT_REACHED();
}
// Appends the bytecode for the alternation `left | right` directly to this bytecode:
//
//   FORKJUMP _ALT
//   REGEXP ALT1
//   JUMP _END
//   LABEL _ALT
//   REGEXP ALT2
//   LABEL _END
void insert_bytecode_alternation(ByteCode&& left, ByteCode&& right)
{
    empend(static_cast<ByteCodeValueType>(OpCodeId::ForkJump));
    empend(left.size() + 2); // Jump to the _ALT label: skip ALT1 and the two-slot JUMP after it

    for (auto& op : left)
        append(move(op));

    empend(static_cast<ByteCodeValueType>(OpCodeId::Jump));
    empend(right.size()); // Jump to the _END label: skip ALT2

    // LABEL _ALT
    for (auto& op : right)
        append(move(op));
    // LABEL _END
}
// Rewrites `bytecode_to_repeat` in place so that it matches between `minimum` and
// `maximum` repetitions of its original content (unbounded when `maximum` is empty).
// The receiver (*this) is not modified; only `bytecode_to_repeat` is.
void insert_bytecode_repetition_min_max(ByteCode& bytecode_to_repeat, size_t minimum, Optional<size_t> maximum)
{
    // The mandatory part: `minimum` back-to-back copies.
    ByteCode repeated;
    repeated.insert_bytecode_repetition_n(bytecode_to_repeat, minimum);

    if (!maximum.has_value()) {
        // no maximum value set, repeat finding if possible
        // NOTE(review): with minimum == 0 nothing precedes this ForkJump, so the
        // offset appears to point before the start of the bytecode. Confirm that
        // callers never pass minimum == 0 without a maximum.
        repeated.empend(static_cast<ByteCodeValueType>(OpCodeId::ForkJump));
        repeated.empend(-bytecode_to_repeat.size() - 2); // Jump to the last iteration
        bytecode_to_repeat = move(repeated);
        return;
    }

    if (maximum.value() > minimum) {
        // Each optional iteration is one copy of the body plus a two-slot ForkStay.
        auto optional_iterations = maximum.value() - minimum;
        auto slots_per_iteration = bytecode_to_repeat.size() + 2;

        repeated.empend(static_cast<ByteCodeValueType>(OpCodeId::ForkStay));
        repeated.empend(optional_iterations * slots_per_iteration); // Jump to the _END label

        for (size_t remaining = optional_iterations; remaining > 0; --remaining) {
            repeated.append(bytecode_to_repeat);
            repeated.empend(static_cast<ByteCodeValueType>(OpCodeId::ForkStay));
            repeated.empend((remaining - 1) * slots_per_iteration); // Jump to the _END label
        }
    }

    bytecode_to_repeat = move(repeated);
}
// Appends `n` consecutive copies of `bytecode_to_repeat` to this bytecode.
void insert_bytecode_repetition_n(ByteCode& bytecode_to_repeat, size_t n)
{
    size_t copies_left = n;
    while (copies_left > 0) {
        append(bytecode_to_repeat);
        --copies_left;
    }
}
// Turns `bytecode_to_repeat` into "one or more repetitions" by appending a fork
// back to its start:
//
//   LABEL _START
//   REGEXP
//   FORKJUMP _START   (greedy; FORKSTAY _START when not greedy)
void insert_bytecode_repetition_min_one(ByteCode& bytecode_to_repeat, bool greedy)
{
    const auto fork_opcode = greedy ? OpCodeId::ForkJump : OpCodeId::ForkStay;
    bytecode_to_repeat.empend(static_cast<ByteCodeValueType>(fork_opcode));

    // size() already includes the fork opcode just added; the +1 accounts for the offset slot itself.
    bytecode_to_repeat.empend(-(bytecode_to_repeat.size() + 1)); // Jump to the _START label
}
// Rewrites `bytecode_to_repeat` in place into "zero or more repetitions":
//
//   LABEL _START
//   FORKSTAY _END   (greedy; FORKJUMP _END when not greedy)
//   REGEXP
//   JUMP _START
//   LABEL _END
void insert_bytecode_repetition_any(ByteCode& bytecode_to_repeat, bool greedy)
{
    const auto fork_opcode = greedy ? OpCodeId::ForkStay : OpCodeId::ForkJump;

    ByteCode looped;
    looped.empend(static_cast<ByteCodeValueType>(fork_opcode));
    looped.empend(bytecode_to_repeat.size() + 2); // Jump to the _END label: skip the body and the two-slot JUMP

    looped.append(bytecode_to_repeat);

    looped.empend(static_cast<ByteCodeValueType>(OpCodeId::Jump));
    // size() includes the Jump opcode just added; the extra 1 accounts for the offset slot itself.
    looped.empend(-looped.size() - 1); // Jump to the _START label

    bytecode_to_repeat = move(looped);
}
// Rewrites `bytecode_to_repeat` in place into "zero or one occurrence":
//
//   FORKSTAY _END   (greedy; FORKJUMP _END when not greedy)
//   REGEXP
//   LABEL _END
void insert_bytecode_repetition_zero_or_one(ByteCode& bytecode_to_repeat, bool greedy)
{
    const auto fork_opcode = greedy ? OpCodeId::ForkStay : OpCodeId::ForkJump;

    ByteCode optional;
    optional.empend(static_cast<ByteCodeValueType>(fork_opcode));
    optional.empend(bytecode_to_repeat.size()); // Jump to the _END label: skip the body

    optional.append(bytecode_to_repeat);

    bytecode_to_repeat = move(optional);
}
OpCode* get_opcode(MatchState& state) const;
private:
void insert_string(const StringView& view)
{
empend((ByteCodeValueType)view.length());
for (size_t i = 0; i < view.length(); ++i)
empend((ByteCodeValueType)view[i]);
}
ALWAYS_INLINE OpCode* get_opcode_by_id(OpCodeId id) const;
static HashMap<u32, OwnPtr<OpCode>> s_opcodes;
};
// X-macro listing the possible outcomes of executing one instruction.
// Expand it with __ENUMERATE_EXECUTION_RESULT(x) defined.
#define ENUMERATE_EXECUTION_RESULTS \
__ENUMERATE_EXECUTION_RESULT(Continue) \
__ENUMERATE_EXECUTION_RESULT(Fork_PrioHigh) \
__ENUMERATE_EXECUTION_RESULT(Fork_PrioLow) \
__ENUMERATE_EXECUTION_RESULT(Failed) \
__ENUMERATE_EXECUTION_RESULT(Failed_ExecuteLowPrioForks) \
__ENUMERATE_EXECUTION_RESULT(Succeeded)
// Return type of OpCode::execute(); enumerators are generated from
// ENUMERATE_EXECUTION_RESULTS in listing order.
enum class ExecutionResult : u8 {
#define __ENUMERATE_EXECUTION_RESULT(x) x,
ENUMERATE_EXECUTION_RESULTS
#undef __ENUMERATE_EXECUTION_RESULT
};
// Name lookups for the enumerations above. They are only declared here;
// each one is declared once.
const char* execution_result_name(ExecutionResult result);
const char* opcode_id_name(OpCodeId opcode_id);
const char* boundary_check_type_name(BoundaryCheckType);
const char* character_compare_type_name(CharacterCompareType result);
// Abstract base of all bytecode instructions.
// An OpCode object does not own any bytecode: it points at a ByteCode and, once
// set_state() has been called, at a MatchState whose instruction_position tells it
// where in that bytecode the instruction lives.
class OpCode {
public:
    OpCode(ByteCode& bytecode)
        : m_bytecode(&bytecode)
    {
    }

    virtual ~OpCode() = default;

    virtual OpCodeId opcode_id() const = 0;

    // Number of bytecode slots the instruction occupies, opcode id included.
    virtual size_t size() const = 0;

    virtual ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const = 0;

    // Returns the argument at index `offset`, i.e. the slot at
    // instruction_position + 1 + offset. Requires a state to be set.
    ALWAYS_INLINE ByteCodeValueType argument(size_t offset) const
    {
        // The slot that is read below must lie inside the bytecode.
        ASSERT(state().instruction_position + 1 + offset < m_bytecode->size());
        return m_bytecode->at(state().instruction_position + 1 + offset);
    }

    ALWAYS_INLINE const char* name() const;
    static const char* name(const OpCodeId);

    // Points this opcode at `state`; returns `this` so calls can be chained.
    ALWAYS_INLINE OpCode* set_state(MatchState& state)
    {
        m_state = &state;
        return this;
    }

    // Points this opcode at `bytecode`; returns `this` so calls can be chained.
    ALWAYS_INLINE OpCode* set_bytecode(ByteCode& bytecode)
    {
        m_bytecode = &bytecode;
        return this;
    }

    ALWAYS_INLINE void reset_state() { m_state.clear(); }

    // The current state; asserts that one has been set.
    ALWAYS_INLINE const MatchState& state() const
    {
        ASSERT(m_state.has_value());
        return *m_state.value();
    }

    // "[0x<id>] <name>" for debug output.
    const String to_string() const
    {
        return String::format("[0x%02X] %s", (int)opcode_id(), name(opcode_id()));
    }

    // Human-readable description of the instruction's arguments for debug output.
    virtual const String arguments_string() const = 0;

    ALWAYS_INLINE const ByteCode& bytecode() const { return *m_bytecode; }

protected:
    ByteCode* m_bytecode;
    Optional<MatchState*> m_state;
};
// The Exit instruction: occupies a single slot (the opcode id) and has no arguments.
// execute() is only declared in this class.
class OpCode_Exit final : public OpCode {
public:
OpCode_Exit(ByteCode& bytecode)
: OpCode(bytecode)
{
}
ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::Exit; }
ALWAYS_INLINE size_t size() const override { return 1; }
const String arguments_string() const override { return ""; }
};
// The FailForks instruction: opcode id plus one argument, a count.
// execute() is only declared in this class.
class OpCode_FailForks final : public OpCode {
public:
OpCode_FailForks(ByteCode& bytecode)
: OpCode(bytecode)
{
}
ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::FailForks; }
ALWAYS_INLINE size_t size() const override { return 2; }
// The count stored as argument 0.
ALWAYS_INLINE size_t count() const { return argument(0); }
const String arguments_string() const override { return String::formatted("count={}", count()); }
};
// The Save instruction: occupies a single slot (the opcode id) and has no arguments.
// execute() is only declared in this class.
class OpCode_Save final : public OpCode {
public:
OpCode_Save(ByteCode& bytecode)
: OpCode(bytecode)
{
}
ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::Save; }
ALWAYS_INLINE size_t size() const override { return 1; }
const String arguments_string() const override { return ""; }
};
// The Restore instruction: occupies a single slot (the opcode id) and has no arguments.
// execute() is only declared in this class.
class OpCode_Restore final : public OpCode {
public:
OpCode_Restore(ByteCode& bytecode)
: OpCode(bytecode)
{
}
ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::Restore; }
ALWAYS_INLINE size_t size() const override { return 1; }
const String arguments_string() const override { return ""; }
};
// The GoBack instruction: opcode id plus one argument, a count.
// execute() is only declared in this class.
class OpCode_GoBack final : public OpCode {
public:
OpCode_GoBack(ByteCode& bytecode)
: OpCode(bytecode)
{
}
ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::GoBack; }
ALWAYS_INLINE size_t size() const override { return 2; }
// The count stored as argument 0.
ALWAYS_INLINE size_t count() const { return argument(0); }
const String arguments_string() const override { return String::formatted("count={}", count()); }
};
// The Jump instruction: opcode id plus one argument, a signed offset.
// execute() is only declared in this class.
class OpCode_Jump final : public OpCode {
public:
OpCode_Jump(ByteCode& bytecode)
: OpCode(bytecode)
{
}
ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::Jump; }
ALWAYS_INLINE size_t size() const override { return 2; }
// Argument 0 converted to a signed value, so offsets stored as negated sizes read back negative.
ALWAYS_INLINE ssize_t offset() const { return argument(0); }
// Prints the offset and the target computed as instruction_position + size() + offset().
const String arguments_string() const override
{
return String::format("offset=%zd [&%zu]", offset(), state().instruction_position + size() + offset());
}
};
// The ForkJump instruction: opcode id plus one argument, a signed offset.
// execute() is only declared in this class.
class OpCode_ForkJump final : public OpCode {
public:
OpCode_ForkJump(ByteCode& bytecode)
: OpCode(bytecode)
{
}
ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::ForkJump; }
ALWAYS_INLINE size_t size() const override { return 2; }
// Argument 0 converted to a signed value, so offsets stored as negated sizes read back negative.
ALWAYS_INLINE ssize_t offset() const { return argument(0); }
// Prints the offset, the target computed as instruction_position + size() + offset(),
// and the current string position.
const String arguments_string() const override
{
return String::format("offset=%zd [&%zu], sp: %zu", offset(), state().instruction_position + size() + offset(), state().string_position);
}
};
// The ForkStay instruction: opcode id plus one argument, a signed offset.
// execute() is only declared in this class.
class OpCode_ForkStay final : public OpCode {
public:
OpCode_ForkStay(ByteCode& bytecode)
: OpCode(bytecode)
{
}
ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::ForkStay; }
ALWAYS_INLINE size_t size() const override { return 2; }
// Argument 0 converted to a signed value, so offsets stored as negated sizes read back negative.
ALWAYS_INLINE ssize_t offset() const { return argument(0); }
// Prints the offset, the target computed as instruction_position + size() + offset(),
// and the current string position.
const String arguments_string() const override
{
return String::format("offset=%zd [&%zu], sp: %zu", offset(), state().instruction_position + size() + offset(), state().string_position);
}
};
// The CheckBegin instruction: occupies a single slot (the opcode id) and has no arguments.
// execute() is only declared in this class.
class OpCode_CheckBegin final : public OpCode {
public:
OpCode_CheckBegin(ByteCode& bytecode)
: OpCode(bytecode)
{
}
ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::CheckBegin; }
ALWAYS_INLINE size_t size() const override { return 1; }
const String arguments_string() const override { return ""; }
};
// The CheckEnd instruction: occupies a single slot (the opcode id) and has no arguments.
// execute() is only declared in this class.
class OpCode_CheckEnd final : public OpCode {
public:
OpCode_CheckEnd(ByteCode& bytecode)
: OpCode(bytecode)
{
}
ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::CheckEnd; }
ALWAYS_INLINE size_t size() const override { return 1; }
const String arguments_string() const override { return ""; }
};
// The CheckBoundary instruction: opcode id plus one argument, the BoundaryCheckType.
// execute() is only declared in this class.
class OpCode_CheckBoundary final : public OpCode {
public:
OpCode_CheckBoundary(ByteCode& bytecode)
: OpCode(bytecode)
{
}
ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::CheckBoundary; }
ALWAYS_INLINE size_t size() const override { return 2; }
ALWAYS_INLINE size_t arguments_count() const { return 1; }
// Argument 0 interpreted as the kind of boundary to check.
ALWAYS_INLINE BoundaryCheckType type() const { return static_cast<BoundaryCheckType>(argument(0)); }
const String arguments_string() const override { return String::format("kind=%lu (%s)", (long unsigned int)argument(0), boundary_check_type_name(type())); }
};
// The SaveLeftCaptureGroup instruction: opcode id plus one argument, the capture group id.
// execute() is only declared in this class.
class OpCode_SaveLeftCaptureGroup final : public OpCode {
public:
    OpCode_SaveLeftCaptureGroup(ByteCode& bytecode)
        : OpCode(bytecode)
    {
    }

    ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
    ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::SaveLeftCaptureGroup; }
    ALWAYS_INLINE size_t size() const override { return 2; }

    // The capture group id stored as argument 0.
    ALWAYS_INLINE size_t id() const { return argument(0); }

    // id() is a size_t, so it is printed with %zu.
    const String arguments_string() const override { return String::format("id=%zu", id()); }
};
// The SaveRightCaptureGroup instruction: opcode id plus one argument, the capture group id.
// execute() is only declared in this class.
class OpCode_SaveRightCaptureGroup final : public OpCode {
public:
    OpCode_SaveRightCaptureGroup(ByteCode& bytecode)
        : OpCode(bytecode)
    {
    }

    ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
    ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::SaveRightCaptureGroup; }
    ALWAYS_INLINE size_t size() const override { return 2; }

    // The capture group id stored as argument 0.
    ALWAYS_INLINE size_t id() const { return argument(0); }

    // id() is a size_t, so it is printed with %zu.
    const String arguments_string() const override { return String::format("id=%zu", id()); }
};
// The SaveLeftNamedCaptureGroup instruction: opcode id plus two arguments,
// the address of the group name's characters and the name's length.
// execute() is only declared in this class.
class OpCode_SaveLeftNamedCaptureGroup final : public OpCode {
public:
    OpCode_SaveLeftNamedCaptureGroup(ByteCode& bytecode)
        : OpCode(bytecode)
    {
    }

    ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
    ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::SaveLeftNamedCaptureGroup; }
    ALWAYS_INLINE size_t size() const override { return 3; }

    // View over the group name: argument 0 is reinterpreted as a pointer to the
    // characters and argument 1 is the length. Nothing is copied.
    ALWAYS_INLINE StringView name() const { return { reinterpret_cast<char*>(argument(0)), length() }; }
    ALWAYS_INLINE size_t length() const { return argument(1); }

    const String arguments_string() const override
    {
        // length() is a size_t, so it is printed with %zu.
        return String::format("name=%s, length=%zu", name().to_string().characters(), length());
    }
};
// The SaveRightNamedCaptureGroup instruction: opcode id plus two arguments,
// the address of the group name's characters and the name's length.
// execute() is only declared in this class.
class OpCode_SaveRightNamedCaptureGroup final : public OpCode {
public:
OpCode_SaveRightNamedCaptureGroup(ByteCode& bytecode)
: OpCode(bytecode)
{
}
ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::SaveRightNamedCaptureGroup; }
ALWAYS_INLINE size_t size() const override { return 3; }
// View over the group name: argument 0 is reinterpreted as a pointer to the
// characters and argument 1 is the length. Nothing is copied.
ALWAYS_INLINE StringView name() const { return { reinterpret_cast<char*>(argument(0)), length() }; }
ALWAYS_INLINE size_t length() const { return argument(1); }
const String arguments_string() const override
{
return String::format("name=%s, length=%zu", name().to_string().characters(), length());
}
};
// The Compare instruction. It has a variable size: the opcode id, the argument
// count, the argument size, and then `arguments_size()` slots of compare arguments.
// execute() and the compare_* helpers are only declared in this class.
class OpCode_Compare final : public OpCode {
public:
OpCode_Compare(ByteCode& bytecode)
: OpCode(bytecode)
{
}
ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::Compare; }
// Three header slots (opcode id, count, size) plus the argument payload.
ALWAYS_INLINE size_t size() const override { return arguments_size() + 3; }
// Number of compare arguments (argument 0).
ALWAYS_INLINE size_t arguments_count() const { return argument(0); }
// Number of slots occupied by the compare arguments (argument 1).
ALWAYS_INLINE size_t arguments_size() const { return argument(1); }
const String arguments_string() const override;
// Debug description of each compare argument; `input` is optional.
const Vector<String> variable_arguments_to_string(Optional<MatchInput> input = {}) const;
private:
ALWAYS_INLINE static void compare_char(const MatchInput& input, MatchState& state, u32 ch1, bool inverse, bool& inverse_matched);
ALWAYS_INLINE static bool compare_string(const MatchInput& input, MatchState& state, const char* str, size_t length);
ALWAYS_INLINE static void compare_character_class(const MatchInput& input, MatchState& state, CharClass character_class, u32 ch, bool inverse, bool& inverse_matched);
ALWAYS_INLINE static void compare_character_range(const MatchInput& input, MatchState& state, u32 from, u32 to, u32 ch, bool inverse, bool& inverse_matched);
};
// is<T>(opcode): tells whether `opcode` is the instruction class T, decided by opcode_id().
// The primary template returns false. Only the specializations below (ForkStay, Exit and
// Compare) can return true, so is<T>() is false for every other opcode class.
template<typename T>
bool is(const OpCode&);
template<typename T>
ALWAYS_INLINE bool is(const OpCode&)
{
return false;
}
// Pointer overload; forwards to the reference version and dereferences `opcode` unconditionally.
template<typename T>
ALWAYS_INLINE bool is(const OpCode* opcode)
{
return is<T>(*opcode);
}
template<>
ALWAYS_INLINE bool is<OpCode_ForkStay>(const OpCode& opcode)
{
return opcode.opcode_id() == OpCodeId::ForkStay;
}
template<>
ALWAYS_INLINE bool is<OpCode_Exit>(const OpCode& opcode)
{
return opcode.opcode_id() == OpCodeId::Exit;
}
template<>
ALWAYS_INLINE bool is<OpCode_Compare>(const OpCode& opcode)
{
return opcode.opcode_id() == OpCodeId::Compare;
}
// to<T>(opcode): checked downcast of an OpCode reference or pointer to the instruction class T.
// Each overload ASSERTs is<T>(opcode) before the static_cast. With the is<T>
// specializations visible in this header, the assertion can only hold for
// OpCode_ForkStay, OpCode_Exit and OpCode_Compare.
template<typename T>
ALWAYS_INLINE const T& to(const OpCode& opcode)
{
ASSERT(is<T>(opcode));
return static_cast<const T&>(opcode);
}
template<typename T>
ALWAYS_INLINE T* to(OpCode* opcode)
{
ASSERT(is<T>(opcode));
return static_cast<T*>(opcode);
}
template<typename T>
ALWAYS_INLINE const T* to(const OpCode* opcode)
{
ASSERT(is<T>(opcode));
return static_cast<const T*>(opcode);
}
template<typename T>
ALWAYS_INLINE T& to(OpCode& opcode)
{
ASSERT(is<T>(opcode));
return static_cast<T&>(opcode);
}
}

View file

@ -0,0 +1,154 @@
/*
* Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#include "AK/StringBuilder.h"
#include "LibRegex/RegexMatcher.h"
//#define REGEX_DEBUG
#ifdef REGEX_DEBUG
namespace regex {
class RegexDebug {
public:
// Creates a debug printer that writes to `file` (stdout by default).
// The stream is only stored, not opened or closed here.
RegexDebug(FILE* file = stdout)
: m_file(file)
{
}
virtual ~RegexDebug() = default;
// Dumps every slot of the regex's bytecode as "OpCode i=<index> [0x<value>]", one per line.
template<typename T>
void print_raw_bytecode(Regex<T>& regex) const
{
    auto& bytecode = regex.parser_result.bytecode;
    size_t index { 0 };
    for (auto& value : bytecode) {
        // `index` is a size_t, so it needs the 'z' length modifier.
        // The value is cast to u32, so only its low 32 bits are printed.
        fprintf(m_file, "OpCode i=%3zu [0x%02X]\n", index, (u32)value);
        ++index;
    }
}
// Walks the regex's bytecode instruction by instruction, starting from a fresh
// MatchState, and prints each one followed by the separator line.
// Stops after the Exit instruction, or early if an opcode cannot be decoded.
template<typename T>
void print_bytecode(Regex<T>& regex) const
{
    MatchState state;
    auto& bytecode = regex.parser_result.bytecode;

    bool reached_exit = false;
    while (!reached_exit) {
        auto* opcode = bytecode.get_opcode(state);
        if (!opcode) {
            // Early return: note that the stream is not flushed on this path.
            dbgln("Wrong opcode... failed!");
            return;
        }

        print_opcode("PrintBytecode", *opcode, state);
        fprintf(m_file, "%s", m_debug_stripline.characters());

        reached_exit = is<OpCode_Exit>(*opcode);
        if (!reached_exit)
            state.instruction_position += opcode->size();
    }

    fflush(m_file);
}
// Prints one table row for `opcode`: the system label, instruction position, recursion
// depth, opcode, its arguments and the current ip/sp.
// With `newline` set, the row is terminated, and for Compare instructions one extra
// row per variable argument line follows.
void print_opcode(const String& system, OpCode& opcode, MatchState& state, size_t recursion = 0, bool newline = true) const
{
    // The positions and the recursion depth are printed as size_t (%zu).
    fprintf(m_file, "%-15s | %-5zu | %-9zu | %-35s | %-30s | %-20s%s",
        system.characters(),
        state.instruction_position,
        recursion,
        opcode.to_string().characters(),
        opcode.arguments_string().characters(),
        String::format("ip: %3zu, sp: %3zu", state.instruction_position, state.string_position).characters(),
        newline ? "\n" : "");

    if (newline && is<OpCode_Compare>(opcode)) {
        for (auto& line : to<OpCode_Compare>(opcode).variable_arguments_to_string()) {
            fprintf(m_file, "%-15s | %-5s | %-9s | %-35s | %-30s | %-20s%s", "", "", "", "", line.characters(), "", "\n");
        }
    }
}
void print_result(const OpCode& opcode, const ByteCode& bytecode, const MatchInput& input, MatchState& state, ExecutionResult result) const
{
StringBuilder builder;
builder.append(execution_result_name(result));
builder.appendff(", fc: {}, ss: {}", input.fail_counter, input.saved_positions.size());
if (result == ExecutionResult::Succeeded) {
builder.appendf(", ip: %lu/%lu, sp: %lu/%lu", state.instruction_position, bytecode.size() - 1, state.string_position, input.view.length() - 1);
} else if (result == ExecutionResult::Fork_PrioHigh) {
builder.appendf(", next ip: %lu", state.fork_at_position + opcode.size());
} else if (result != ExecutionResult::Failed) {
builder.appendf(", next ip: %lu", state.instruction_position + opcode.size());
}
fprintf(m_file, " | %-20s\n", builder.to_string().characters());
if (is<OpCode_Compare>(opcode)) {
for (auto& line : to<OpCode_Compare>(opcode).variable_arguments_to_string(input)) {
fprintf(m_file, "%-15s | %-5s | %-9s | %-35s | %-30s | %-20s%s", "", "", "", "", line.characters(), "", "\n");
}
}
fprintf(m_file, "%s", m_debug_stripline.characters());
}
void print_header()
{
StringBuilder builder;
builder.appendf("%-15s | %-5s | %-9s | %-35s | %-30s | %-20s | %-20s\n", "System", "Index", "Recursion", "OpCode", "Arguments", "State", "Result");
auto length = builder.length();
for (size_t i = 0; i < length; ++i) {
builder.append('=');
}
fprintf(m_file, "%s\n", builder.to_string().characters());
fflush(m_file);
builder.clear();
for (size_t i = 0; i < length; ++i) {
builder.append('-');
}
builder.append('\n');
m_debug_stripline = builder.to_string();
}
private:
String m_debug_stripline;
FILE* m_file;
};
}
using regex::RegexDebug;
#endif

View file

@ -0,0 +1,102 @@
/*
* Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#include <AK/String.h>
#include <AK/Types.h>
#ifdef __serenity__
# include <regex.h>
#else
# include <LibC/regex.h>
#endif
namespace regex {
// Error codes reported for a regular expression pattern. Every enumerator takes
// its value from the identically named __Regex_* constant (presumably declared in
// the regex.h header included above — confirm), and get_error_string() below maps
// each one to a description.
enum class Error : u8 {
NoError = __Regex_NoError,
InvalidPattern = __Regex_InvalidPattern, // Invalid regular expression.
InvalidCollationElement = __Regex_InvalidCollationElement, // Invalid collating element referenced.
InvalidCharacterClass = __Regex_InvalidCharacterClass, // Invalid character class type referenced.
InvalidTrailingEscape = __Regex_InvalidTrailingEscape, // Trailing \ in pattern.
InvalidNumber = __Regex_InvalidNumber, // Number in \digit invalid or in error.
MismatchingBracket = __Regex_MismatchingBracket, // [ ] imbalance.
MismatchingParen = __Regex_MismatchingParen, // ( ) imbalance.
MismatchingBrace = __Regex_MismatchingBrace, // { } imbalance.
InvalidBraceContent = __Regex_InvalidBraceContent, // Content of {} invalid: not a number, number too large, more than two numbers, first larger than second.
InvalidBracketContent = __Regex_InvalidBracketContent, // Content of [] invalid.
InvalidRange = __Regex_InvalidRange, // Invalid endpoint in range expression.
InvalidRepetitionMarker = __Regex_InvalidRepetitionMarker, // ?, * or + not preceded by valid regular expression.
ReachedMaxRecursion = __Regex_ReachedMaxRecursion, // MaximumRecursion has been reached.
EmptySubExpression = __Regex_EmptySubExpression, // Sub expression has empty content.
InvalidCaptureGroup = __Regex_InvalidCaptureGroup, // Content of capture group is invalid.
InvalidNameForCaptureGroup = __Regex_InvalidNameForCaptureGroup, // Name of capture group is invalid.
};
// Translates a regex Error code into its human-readable description.
inline String get_error_string(Error error)
{
    // Values not covered by any case below keep this fallback text.
    const char* description = "Undefined error.";

    switch (error) {
    case Error::NoError:
        description = "No error";
        break;
    case Error::InvalidPattern:
        description = "Invalid regular expression.";
        break;
    case Error::InvalidCollationElement:
        description = "Invalid collating element referenced.";
        break;
    case Error::InvalidCharacterClass:
        description = "Invalid character class type referenced.";
        break;
    case Error::InvalidTrailingEscape:
        description = "Trailing \\ in pattern.";
        break;
    case Error::InvalidNumber:
        description = "Number in \\digit invalid or in error.";
        break;
    case Error::MismatchingBracket:
        description = "[ ] imbalance.";
        break;
    case Error::MismatchingParen:
        description = "( ) imbalance.";
        break;
    case Error::MismatchingBrace:
        description = "{ } imbalance.";
        break;
    case Error::InvalidBraceContent:
        description = "Content of {} invalid: not a number, number too large, more than two numbers, first larger than second.";
        break;
    case Error::InvalidBracketContent:
        description = "Content of [] invalid.";
        break;
    case Error::InvalidRange:
        description = "Invalid endpoint in range expression.";
        break;
    case Error::InvalidRepetitionMarker:
        description = "?, * or + not preceded by valid regular expression.";
        break;
    case Error::ReachedMaxRecursion:
        description = "Maximum recursion has been reached.";
        break;
    case Error::EmptySubExpression:
        description = "Sub expression has empty content.";
        break;
    case Error::InvalidCaptureGroup:
        description = "Content of capture group is invalid.";
        break;
    case Error::InvalidNameForCaptureGroup:
        description = "Name of capture group is invalid.";
        break;
    }

    return description;
}
}
using regex::Error;
using regex::get_error_string;

View file

@ -0,0 +1,235 @@
/*
* Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "RegexLexer.h"
#include <AK/Assertions.h>
#include <AK/LogStream.h>
#include <stdio.h>
namespace regex {
// Returns the enumerator name of `type` as a C string.
// The case labels are generated by expanding ENUMERATE_REGEX_TOKENS with a
// temporary __ENUMERATE_REGEX_TOKEN definition that stringifies each entry.
const char* Token::name(const TokenType type)
{
switch (type) {
#define __ENUMERATE_REGEX_TOKEN(x) \
case TokenType::x: \
return #x;
ENUMERATE_REGEX_TOKENS
#undef __ENUMERATE_REGEX_TOKEN
default:
// A value outside the enumerated list is treated as a programming error.
ASSERT_NOT_REACHED();
return "<Unknown>";
}
}
const char* Token::name() const
{
return name(m_type);
}
Lexer::Lexer(const StringView source)
: m_source(source)
{
}
// Returns the character `offset` positions ahead of the cursor without consuming
// anything, or EOF when that position lies beyond the end of the source.
ALWAYS_INLINE char Lexer::peek(size_t offset) const
{
    const size_t index = m_position + offset;
    if (index < m_source.length())
        return m_source[index];
    return EOF;
}
// Moves the cursor `offset` characters backwards and refreshes the cached
// previous position and current character.
//
// An offset of exactly m_position + 1 is clamped to m_position, any larger offset
// trips the assertion, and an offset of 0 is a no-op.
void Lexer::back(size_t offset)
{
    if (offset == m_position + 1)
        offset = m_position; // 'position == 0' occurs twice.

    ASSERT(offset <= m_position);
    if (!offset)
        return;

    m_position -= offset;
    m_previous_position = (m_position > 0) ? m_position - 1 : 0;

    // consume() parks the cursor at length() + 1 once the input is exhausted, so
    // the cursor may still sit at (or past) the end after stepping back. Do not
    // index the source there; record EOF like consume() does instead.
    m_current_char = (m_position < m_source.length()) ? m_source[m_position] : EOF;
}
// Advances the cursor by one character and caches it in m_current_char.
// At or past the end of the source the cursor is parked at length() + 1 and the
// cached character becomes EOF.
ALWAYS_INLINE void Lexer::consume()
{
    m_previous_position = m_position;

    const auto source_length = m_source.length();
    if (m_position < source_length) {
        m_current_char = m_source[m_position];
        ++m_position;
        return;
    }

    m_position = source_length + 1;
    m_current_char = EOF;
}
void Lexer::reset()
{
m_position = 0;
m_current_token = { TokenType::Eof, 0, StringView(nullptr) };
m_current_char = 0;
m_previous_position = 0;
}
// Consumes the next character only when it equals `c`; reports whether it did.
bool Lexer::try_skip(char c)
{
    const bool matches = peek() == c;
    if (matches)
        consume();
    return matches;
}
// Unconditionally consumes the next character and returns what peek() saw there.
char Lexer::skip()
{
    const char upcoming = peek();
    consume();
    return upcoming;
}
// Produces the next token of the pattern.
//
// Punctuation characters map to their dedicated token types, a backslash followed
// by an escapable character yields a two-character EscapeSequence token, and any
// other character becomes a Char token. An Eof token is returned once the input
// is exhausted.
Token Lexer::next()
{
    // Records a one-character token of `type` at the cursor, then steps past it.
    auto emit_single = [&](TokenType type) -> Token& {
        m_current_token = Token(type, m_position, m_source.substring_view(m_position, 1));
        consume();
        return m_current_token;
    };

    // Length of the escape sequence starting at the cursor, or 0 when the character
    // after the backslash is not one of the recognised escapable characters.
    auto escape_sequence_length = [&]() -> size_t {
        switch (peek(1)) {
        case '^':
        case '.':
        case '[':
        case ']':
        case '$':
        case '(':
        case ')':
        case '|':
        case '*':
        case '+':
        case '?':
        case '{':
        case '\\':
            return 2;
        default:
#ifdef REGEX_DEBUG
            fprintf(stderr, "[LEXER] Found invalid escape sequence: \\%c (the parser will have to deal with this!)\n", peek(1));
#endif
            return 0;
        }
    };

    // The cursor only moves beyond length() after consuming past the end.
    if (m_position > m_source.length())
        return Token(TokenType::Eof, m_position, nullptr);

    const auto ch = peek();
    switch (ch) {
    case '(':
        return emit_single(TokenType::LeftParen);
    case ')':
        return emit_single(TokenType::RightParen);
    case '{':
        return emit_single(TokenType::LeftCurly);
    case '}':
        return emit_single(TokenType::RightCurly);
    case '[':
        return emit_single(TokenType::LeftBracket);
    case ']':
        return emit_single(TokenType::RightBracket);
    case '.':
        return emit_single(TokenType::Period);
    case '*':
        return emit_single(TokenType::Asterisk);
    case '+':
        return emit_single(TokenType::Plus);
    case '$':
        return emit_single(TokenType::Dollar);
    case '^':
        return emit_single(TokenType::Circumflex);
    case '|':
        return emit_single(TokenType::Pipe);
    case '?':
        return emit_single(TokenType::Questionmark);
    case ',':
        return emit_single(TokenType::Comma);
    case '/':
        return emit_single(TokenType::Slash);
    case '=':
        return emit_single(TokenType::EqualSign);
    case ':':
        return emit_single(TokenType::Colon);
    case '-':
        return emit_single(TokenType::HyphenMinus);
    case '\\': {
        const size_t sequence_length = escape_sequence_length();
        if (sequence_length > 0) {
            const size_t token_start = m_position;
            for (size_t consumed = 0; consumed < sequence_length; ++consumed)
                consume();

            // The token spans from its start up to and including the last consumed character.
            ASSERT(token_start + m_previous_position - token_start + 1 <= m_source.length());
            m_current_token = Token(TokenType::EscapeSequence, token_start, m_source.substring_view(token_start, m_previous_position - token_start + 1));
            return m_current_token;
        }
        // Not a recognised escape: the backslash is handled like any other character below.
        break;
    }
    default:
        break;
    }

    if (ch == EOF)
        return Token(TokenType::Eof, m_position, nullptr);

    return emit_single(TokenType::Char);
}
}

View file

@ -0,0 +1,110 @@
/*
* Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#include <AK/Forward.h>
#include <AK/StringView.h>
namespace regex {
// X-macro listing every token kind the lexer can produce. Users define
// __ENUMERATE_REGEX_TOKEN(x) before expanding the list and undefine it afterwards.
#define ENUMERATE_REGEX_TOKENS \
__ENUMERATE_REGEX_TOKEN(Eof) \
__ENUMERATE_REGEX_TOKEN(Char) \
__ENUMERATE_REGEX_TOKEN(Circumflex) \
__ENUMERATE_REGEX_TOKEN(Period) \
__ENUMERATE_REGEX_TOKEN(LeftParen) \
__ENUMERATE_REGEX_TOKEN(RightParen) \
__ENUMERATE_REGEX_TOKEN(LeftCurly) \
__ENUMERATE_REGEX_TOKEN(RightCurly) \
__ENUMERATE_REGEX_TOKEN(LeftBracket) \
__ENUMERATE_REGEX_TOKEN(RightBracket) \
__ENUMERATE_REGEX_TOKEN(Asterisk) \
__ENUMERATE_REGEX_TOKEN(EscapeSequence) \
__ENUMERATE_REGEX_TOKEN(Dollar) \
__ENUMERATE_REGEX_TOKEN(Pipe) \
__ENUMERATE_REGEX_TOKEN(Plus) \
__ENUMERATE_REGEX_TOKEN(Comma) \
__ENUMERATE_REGEX_TOKEN(Slash) \
__ENUMERATE_REGEX_TOKEN(EqualSign) \
__ENUMERATE_REGEX_TOKEN(HyphenMinus) \
__ENUMERATE_REGEX_TOKEN(Colon) \
__ENUMERATE_REGEX_TOKEN(Questionmark)
// One enumerator per entry of ENUMERATE_REGEX_TOKENS, in list order.
enum class TokenType {
#define __ENUMERATE_REGEX_TOKEN(x) x,
ENUMERATE_REGEX_TOKENS
#undef __ENUMERATE_REGEX_TOKEN
};
// A lexed piece of a regex pattern: its kind, where it starts in the source and
// the source text it covers. A default-constructed Token is an Eof token at
// position 0 with a null value.
class Token {
public:
    Token() = default;
    Token(const TokenType type, const size_t start_position, const StringView value)
        : m_type { type }
        , m_position { start_position }
        , m_value { value }
    {
    }

    // Enumerator name of an arbitrary token type / of this token's type.
    static const char* name(const TokenType);
    const char* name() const;

    size_t position() const { return m_position; }
    TokenType type() const { return m_type; }
    const StringView& value() const { return m_value; }

private:
    TokenType m_type { TokenType::Eof };
    size_t m_position { 0 };
    StringView m_value { nullptr };
};
// Splits a regex pattern held in a StringView into Tokens, one next() call at a time.
class Lexer {
public:
Lexer() = default;
explicit Lexer(const StringView source);
// Produces the next token from the source.
Token next();
// Restores the position, current token and current character to their initial values.
void reset();
// Moves the read position `offset` characters backwards.
void back(size_t offset);
// Replaces the source view only; the read position is left as it is.
void set_source(const StringView source) { m_source = source; }
// Consumes the next character only if it equals the argument; returns whether it did.
bool try_skip(char);
// Consumes the next character unconditionally and returns it.
char skip();
private:
// Looks `offset` characters ahead without consuming anything.
// NOTE(review): declared to return char although the definition hands back EOF at
// the end of input — confirm this behaves as intended where char is unsigned.
ALWAYS_INLINE char peek(size_t offset = 0) const;
// Advances the read position by one character.
ALWAYS_INLINE void consume();
StringView m_source {};
size_t m_position { 0 };
size_t m_previous_position { 0 };
Token m_current_token { TokenType::Eof, 0, StringView(nullptr) };
int m_current_char { 0 };
};
}
using regex::Lexer;

View file

@ -0,0 +1,291 @@
/*
* Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#include "RegexOptions.h"
#include "AK/FlyString.h"
#include "AK/HashMap.h"
#include "AK/String.h"
#include "AK/StringBuilder.h"
#include "AK/StringView.h"
#include "AK/Utf32View.h"
#include "AK/Vector.h"
namespace regex {
// Non-owning view over the text a regex operates on. It wraps either a
// byte-oriented StringView or a Utf32View; every constructor sets exactly one of
// the two, and each operation dispatches on which one is present.
class RegexStringView {
public:
    RegexStringView(const char* chars)
        : m_u8view(chars)
    {
    }

    RegexStringView(const String& string)
        : m_u8view(string)
    {
    }

    RegexStringView(const StringView view)
        : m_u8view(view)
    {
    }

    RegexStringView(const Utf32View view)
        : m_u32view(view)
    {
    }

    bool is_u8_view() const { return m_u8view.has_value(); }
    bool is_u32_view() const { return m_u32view.has_value(); }

    // Accessors for the wrapped view; asserting that the requested kind is present.
    const StringView& u8view() const
    {
        ASSERT(m_u8view.has_value());
        return m_u8view.value();
    }

    const Utf32View& u32view() const
    {
        ASSERT(m_u32view.has_value());
        return m_u32view.value();
    }

    bool is_empty() const
    {
        return is_u8_view() ? m_u8view.value().is_empty() : m_u32view.value().is_empty();
    }

    bool is_null() const
    {
        return is_u8_view() ? m_u8view.value().is_null() : (m_u32view.value().code_points() == nullptr);
    }

    size_t length() const
    {
        return is_u8_view() ? m_u8view.value().length() : m_u32view.value().length();
    }

    // Splits the view into lines. A Utf32View is currently returned as one single "line".
    Vector<RegexStringView> lines() const
    {
        Vector<RegexStringView> result;
        if (!is_u8_view()) {
            // FIXME: line splitting for Utf32View needed
            result.append(m_u32view.value());
            return result;
        }

        auto split_lines = u8view().lines(false);
        for (auto& line : split_lines)
            result.append(move(line));
        return result;
    }

    RegexStringView substring_view(size_t offset, size_t length) const
    {
        if (!is_u8_view())
            return u32view().substring_view(offset, length);
        return u8view().substring_view(offset, length);
    }

    // Copies the viewed text into a String.
    String to_string() const
    {
        if (!is_u8_view()) {
            StringBuilder converted;
            converted.append(u32view());
            return converted.to_string();
        }
        return u8view().to_string();
    }

    // Element at `index`: a byte for the StringView case, a code point for the Utf32View case.
    // NOTE(review): the byte is returned through a plain char -> u32 conversion;
    // where char is signed, bytes >= 0x80 would be sign-extended — confirm that
    // callers expect this.
    u32 operator[](size_t index) const
    {
        if (!is_u8_view())
            return u32view().code_points()[index];
        return u8view()[index];
    }

    bool operator==(const char* cstring) const
    {
        return is_u8_view() ? (u8view() == cstring) : (to_string() == cstring);
    }

    bool operator!=(const char* cstring) const
    {
        return !(*this == cstring);
    }

    bool operator==(const String& string) const
    {
        return is_u8_view() ? (u8view() == string) : (to_string() == string);
    }

    // A StringView never compares equal to a wrapped Utf32View.
    bool operator==(const StringView& other) const
    {
        return is_u8_view() && u8view() == other;
    }

    bool operator!=(const StringView& other) const
    {
        return !(*this == other);
    }

    // A Utf32View never compares equal to a wrapped StringView; otherwise both
    // sides are converted to String and compared.
    bool operator==(const Utf32View& other) const
    {
        if (!is_u32_view())
            return false;

        StringBuilder other_text;
        other_text.append(other);
        return to_string() == other_text.to_string();
    }

    bool operator!=(const Utf32View& other) const
    {
        return !(*this == other);
    }

    // NOTE(review): in the Utf32View case the returned pointer comes from a
    // temporary String created by to_string(); it looks like it does not outlive
    // this call — verify before relying on it.
    const char* characters_without_null_termination() const
    {
        if (!is_u8_view())
            return to_string().characters(); // FIXME: it contains the null termination, does that actually matter?
        return u8view().characters_without_null_termination();
    }

    // Prefix tests only succeed when the argument is of the same kind as the wrapped view.
    bool starts_with(const StringView& str) const
    {
        return !is_u32_view() && u8view().starts_with(str);
    }

    bool starts_with(const Utf32View& str) const
    {
        if (is_u8_view())
            return false;

        StringBuilder prefix_text;
        prefix_text.append(str);
        return to_string().starts_with(prefix_text.to_string());
    }

private:
    Optional<StringView> m_u8view;
    Optional<Utf32View> m_u32view;
};
// A single match (or capture group match): the matched text plus where it was found.
class Match final {
private:
// Optional owned copy of the matched text. It is declared before `view` on
// purpose: members are initialized in declaration order, and the String-taking
// constructor below builds `view` from this member.
Optional<FlyString> string;
public:
Match() = default;
~Match() = default;
// Creates a match whose view refers to text owned elsewhere.
Match(const RegexStringView view_, const size_t line_, const size_t column_, const size_t global_offset_)
: view(view_)
, line(line_)
, column(column_)
, global_offset(global_offset_)
, left_column(column_)
{
}
// Creates a match that keeps its own copy of the text; `view` then refers to that copy.
Match(const String string_, const size_t line_, const size_t column_, const size_t global_offset_)
: string(string_)
, view(string.value().view())
, line(line_)
, column(column_)
, global_offset(global_offset_)
, left_column(column_)
{
}
RegexStringView view { nullptr };
size_t line { 0 };
size_t column { 0 };
size_t global_offset { 0 };
// ugly, as not usable by user, but needed to prevent to create extra vectors that are
// able to store the column when the left paren has been found
size_t left_column { 0 };
};
// Per-view input handed to the bytecode VM while matching.
struct MatchInput {
RegexStringView view { nullptr };
AllOptions regex_options {};
size_t start_offset { 0 }; // For Stateful matches, saved and restored from Regex::start_offset.
size_t match_index { 0 };
size_t line { 0 };
size_t column { 0 };
size_t global_offset { 0 }; // For multiline matching, knowing the offset from start could be important
// Declared mutable so they can be changed through the const MatchInput& the VM receives.
mutable size_t fail_counter { 0 };
mutable Vector<size_t> saved_positions;
};
// Cursor of the bytecode VM: where it is in the input string and in the bytecode,
// plus the instruction position a pending fork should continue from.
struct MatchState {
size_t string_position { 0 };
size_t instruction_position { 0 };
size_t fork_at_position { 0 };
};
// Results accumulated while matching: the number of executed VM operations and
// the matches found so far (whole matches, numbered and named capture groups).
struct MatchOutput {
    // Zero-initialized like the counters of the sibling structs, so that a
    // default-constructed MatchOutput never carries an indeterminate count.
    size_t operations { 0 };
    Vector<Match> matches;
    Vector<Vector<Match>> capture_group_matches;
    Vector<HashMap<String, Match>> named_capture_group_matches;
};
}
using regex::RegexStringView;

View file

@ -0,0 +1,396 @@
/*
* Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "RegexMatcher.h"
#include "RegexDebug.h"
#include "RegexParser.h"
#include <AK/ScopedValueRollback.h>
#include <AK/String.h>
#include <AK/StringBuilder.h>
namespace regex {
// File-local debug printer writing to stderr; only present in REGEX_DEBUG builds.
#ifdef REGEX_DEBUG
static RegexDebug s_regex_dbg(stderr);
#endif
// Compiles `pattern`: remembers its text, lexes and parses it, and creates a
// matcher only when parsing reported no error.
template<class Parser>
Regex<Parser>::Regex(StringView pattern, typename ParserTraits<Parser>::OptionsType regex_options)
{
    pattern_value = pattern.to_string();

    regex::Lexer pattern_lexer(pattern);
    Parser pattern_parser(pattern_lexer, regex_options);
    parser_result = pattern_parser.parse();

    const bool parsed_cleanly = parser_result.error == regex::Error::NoError;
    if (!parsed_cleanly)
        return;

    matcher = make<Matcher<Parser>>(*this, regex_options);
}
// Returns the matcher's options, or default-constructed options when the pattern
// failed to parse (in which case the constructor created no matcher).
template<class Parser>
typename ParserTraits<Parser>::OptionsType Regex<Parser>::options() const
{
    if (parser_result.error == Error::NoError)
        return matcher->options();
    return {};
}
// Builds a multi-line parse error report: the pattern, followed by a caret line
// positioned at the offending token and either `message` or the stock description
// of the parser's error code.
template<class Parser>
String Regex<Parser>::error_string(Optional<String> message) const
{
    StringBuilder report;
    report.appendf("Error during parsing of regular expression:\n");
    report.appendf(" %s\n ", pattern_value.characters());

    // Pad with one space per character preceding the error token.
    const size_t caret_column = parser_result.error_token.position();
    for (size_t column = 0; column < caret_column; ++column)
        report.append(" ");

    const String description = message.value_or(get_error_string(parser_result.error));
    report.appendf("^---- %s", description.characters());
    return report.build();
}
// Matches against a single view. With the Multiline flag set the view is split
// into lines first; otherwise it is matched as one piece.
template<typename Parser>
RegexResult Matcher<Parser>::match(const RegexStringView& view, Optional<typename ParserTraits<Parser>::OptionsType> regex_options) const
{
    AllOptions effective_options = m_regex_options | regex_options.value_or({}).value();

    if (!effective_options.has_flag_set(AllFlags::Multiline)) {
        Vector<RegexStringView> single_view;
        single_view.append(view);
        return match(single_view, regex_options);
    }

    // FIXME: how do we know, which line ending a line has (1char or 2char)? This is needed to get the correct match offsets from start of string...
    return match(view.lines(), regex_options);
}
// Matches the pattern against each of `views` in turn (one view per line in
// multiline mode) and collects the results.
//
// `regex_options` are OR-ed onto the matcher's own options for this call.
// Returns a RegexResult holding the success flag, the match count, the matches,
// the (optionally trimmed) capture group matches, the named capture group matches
// and the number of executed VM operations. If the VM reports an execution error
// (empty Optional), an unsuccessful result carrying only the operation count is
// returned.
template<typename Parser>
RegexResult Matcher<Parser>::match(const Vector<RegexStringView> views, Optional<typename ParserTraits<Parser>::OptionsType> regex_options) const
{
    // If the pattern *itself* isn't stateful, reset any changes to start_offset.
    if (!((AllFlags)m_regex_options.value() & AllFlags::Internal_Stateful))
        m_pattern.start_offset = 0;

    size_t match_count { 0 };

    MatchInput input;
    MatchState state;
    MatchOutput output;

    input.regex_options = m_regex_options | regex_options.value_or({}).value();
    input.start_offset = m_pattern.start_offset;
    output.operations = 0;

    if (input.regex_options.has_flag_set(AllFlags::Internal_Stateful))
        ASSERT(views.size() == 1);

    // Pre-populate the output vectors so the VM can write into existing slots.
    if (c_match_preallocation_count) {
        output.matches.ensure_capacity(c_match_preallocation_count);
        output.capture_group_matches.ensure_capacity(c_match_preallocation_count);
        output.named_capture_group_matches.ensure_capacity(c_match_preallocation_count);

        auto& capture_groups_count = m_pattern.parser_result.capture_groups_count;
        auto& named_capture_groups_count = m_pattern.parser_result.named_capture_groups_count;

        for (size_t j = 0; j < c_match_preallocation_count; ++j) {
            output.matches.empend();
            output.capture_group_matches.unchecked_append({});
            output.capture_group_matches.at(j).ensure_capacity(capture_groups_count);
            for (size_t k = 0; k < capture_groups_count; ++k)
                output.capture_group_matches.at(j).unchecked_append({});

            output.named_capture_group_matches.unchecked_append({});
            output.named_capture_group_matches.at(j).ensure_capacity(named_capture_groups_count);
        }
    }

    // Stores the text between start_position and the current string position as
    // match number input.match_index, either as a copy or as a view into the input.
    auto append_match = [](auto& input, auto& state, auto& output, auto& start_position) {
        if (output.matches.size() == input.match_index)
            output.matches.empend();

        ASSERT(start_position + state.string_position - start_position <= input.view.length());
        if (input.regex_options.has_flag_set(AllFlags::StringCopyMatches)) {
            output.matches.at(input.match_index) = { input.view.substring_view(start_position, state.string_position - start_position).to_string(), input.line, start_position, input.global_offset + start_position };
        } else { // let the view point to the original string ...
            output.matches.at(input.match_index) = { input.view.substring_view(start_position, state.string_position - start_position), input.line, start_position, input.global_offset + start_position };
        }
    };

#ifdef REGEX_DEBUG
    s_regex_dbg.print_header();
#endif

    // Global and multiline searches keep looking after a match; stateful ones never do.
    bool continue_search = input.regex_options.has_flag_set(AllFlags::Global) || input.regex_options.has_flag_set(AllFlags::Multiline);
    if (input.regex_options.has_flag_set(AllFlags::Internal_Stateful))
        continue_search = false;

    for (auto& view : views) {
        input.view = view;
#ifdef REGEX_DEBUG
        dbg() << "[match] Starting match with view (" << view.length() << "): _" << view.to_string() << "_";
#endif

        auto view_length = view.length();
        size_t view_index = m_pattern.start_offset;
        state.string_position = view_index;

        if (view_index == view_length && m_pattern.parser_result.match_length_minimum == 0) {
            // Run the code until it tries to consume something.
            // This allows non-consuming code to run on empty strings, for instance
            // e.g. "Exit"
            MatchOutput temp_output { output };

            input.column = match_count;
            input.match_index = match_count;

            state.string_position = view_index;
            state.instruction_position = 0;

            auto success = execute(input, state, temp_output, 0);
            // execute() yields an empty Optional when it cannot decode an opcode.
            // Bail out the same way the main loop below does instead of calling
            // value() on an empty Optional.
            if (!success.has_value())
                return { false, 0, {}, {}, {}, temp_output.operations };

            // This success is acceptable only if it doesn't read anything from the input (input length is 0).
            if (state.string_position <= view_index) {
                if (success.value()) {
                    output = move(temp_output);
                    if (!match_count) {
                        // Nothing was *actually* matched, so append an empty match.
                        append_match(input, state, output, view_index);
                        ++match_count;
                    }
                }
            }
        }

        for (; view_index < view_length; ++view_index) {
            auto& match_length_minimum = m_pattern.parser_result.match_length_minimum;
            // FIXME: More performant would be to know the remaining minimum string
            //        length needed to match from the current position onwards within
            //        the vm. Add new OpCode for MinMatchLengthFromSp with the value of
            //        the remaining string length from the current path. The value though
            //        has to be filled in reverse. That implies a second run over bytecode
            //        after generation has finished.
            if (match_length_minimum && match_length_minimum > view_length - view_index)
                break;

            input.column = match_count;
            input.match_index = match_count;

            state.string_position = view_index;
            state.instruction_position = 0;

            auto success = execute(input, state, output, 0);
            if (!success.has_value())
                return { false, 0, {}, {}, {}, output.operations };

            if (success.value()) {

                if (input.regex_options.has_flag_set(AllFlags::MatchNotEndOfLine) && state.string_position == input.view.length()) {
                    if (!continue_search)
                        break;
                    continue;
                }
                if (input.regex_options.has_flag_set(AllFlags::MatchNotBeginOfLine) && view_index == 0) {
                    if (!continue_search)
                        break;
                    continue;
                }

#ifdef REGEX_DEBUG
                dbg() << "state.string_position: " << state.string_position << " view_index: " << view_index;
                dbg() << "[match] Found a match (length = " << state.string_position - view_index << "): " << input.view.substring_view(view_index, state.string_position - view_index).to_string();
#endif
                ++match_count;

                if (continue_search) {
                    append_match(input, state, output, view_index);

                    // Resume right after the match; the loop's ++view_index supplies
                    // the final step (and moves past a zero-length match).
                    bool has_zero_length = state.string_position == view_index;
                    view_index = state.string_position - (has_zero_length ? 0 : 1);
                    continue;

                } else if (input.regex_options.has_flag_set(AllFlags::Internal_Stateful)) {
                    append_match(input, state, output, view_index);
                    break;

                } else if (state.string_position < view_length) {
                    // A single (non-global) match has to cover the view up to its end.
                    return { false, 0, {}, {}, {}, output.operations };
                }

                append_match(input, state, output, view_index);
                break;
            }

            if (!continue_search && !input.regex_options.has_flag_set(AllFlags::Internal_Stateful))
                break;
        }

        ++input.line;
        input.global_offset += view.length() + 1; // +1 includes the line break character

        if (input.regex_options.has_flag_set(AllFlags::Internal_Stateful))
            m_pattern.start_offset = state.string_position;
    }

    // Copy out only what belongs to actual matches.
    MatchOutput output_copy;
    if (match_count) {
        auto capture_groups_count = min(output.capture_group_matches.size(), output.matches.size());
        for (size_t i = 0; i < capture_groups_count; ++i) {
            if (input.regex_options.has_flag_set(AllFlags::SkipTrimEmptyMatches)) {
                output_copy.capture_group_matches.append(output.capture_group_matches.at(i));
            } else {
                // Drop capture groups whose view is null.
                Vector<Match> capture_group_matches;
                for (size_t j = 0; j < output.capture_group_matches.at(i).size(); ++j) {
                    if (!output.capture_group_matches.at(i).at(j).view.is_null())
                        capture_group_matches.append(output.capture_group_matches.at(i).at(j));
                }
                output_copy.capture_group_matches.append(capture_group_matches);
            }
        }

        auto named_capture_groups_count = min(output.named_capture_group_matches.size(), output.matches.size());
        for (size_t i = 0; i < named_capture_groups_count; ++i) {
            if (output.matches.at(i).view.length())
                output_copy.named_capture_group_matches.append(output.named_capture_group_matches.at(i));
        }

        for (size_t i = 0; i < match_count; ++i)
            output_copy.matches.append(output.matches.at(i));

    } else {
        output_copy.capture_group_matches.clear_with_capacity();
        output_copy.named_capture_group_matches.clear_with_capacity();
    }

    return {
        match_count ? true : false,
        match_count,
        move(output_copy.matches),
        move(output_copy.capture_group_matches),
        move(output_copy.named_capture_group_matches),
        output.operations,
        m_pattern.parser_result.capture_groups_count,
        m_pattern.parser_result.named_capture_groups_count,
    };
}
// Interprets the pattern's bytecode starting at `state` until the attempt succeeds or fails.
// Returns true on a match and false on a mismatch (also when `recursion_level` exceeds
// c_max_recursion). Returns an empty Optional when no opcode can be fetched for the current state.
// `output.operations` is incremented once per loop iteration.
template<class Parser>
Optional<bool> Matcher<Parser>::execute(const MatchInput& input, MatchState& state, MatchOutput& output, size_t recursion_level) const
{
// Attempts that nest too deeply are reported as a plain mismatch.
if (recursion_level > c_max_recursion)
return false;
Vector<MatchState> fork_low_prio_states;
MatchState fork_high_prio_state;
Optional<bool> success;
auto& bytecode = m_pattern.parser_result.bytecode;
for (;;) {
++output.operations;
auto* opcode = bytecode.get_opcode(state);
if (!opcode) {
dbgln("Wrong opcode... failed!");
return {};
}
#ifdef REGEX_DEBUG
s_regex_dbg.print_opcode("VM", *opcode, state, recursion_level, false);
#endif
ExecutionResult result;
// While a fail counter is pending, consume one unit of it instead of executing the opcode
// and go straight to the saved low-priority forks.
// (fail_counter is modified through a const reference, so it is presumably declared mutable.)
if (input.fail_counter > 0) {
--input.fail_counter;
result = ExecutionResult::Failed_ExecuteLowPrioForks;
} else {
result = opcode->execute(input, state, output);
}
#ifdef REGEX_DEBUG
s_regex_dbg.print_result(*opcode, bytecode, input, state, result);
#endif
// Step past the opcode that was just processed, whatever its result was.
state.instruction_position += opcode->size();
switch (result) {
case ExecutionResult::Fork_PrioLow:
// Save the state for later; prepending means the most recently saved fork is tried first.
fork_low_prio_states.prepend(state);
continue;
case ExecutionResult::Fork_PrioHigh:
// Try the forked path immediately on a copy; the copy is adopted only if it matches,
// otherwise execution carries on with the current path.
// NOTE(review): `++recursion_level` also raises this frame's own counter, so each further
// fork from this frame starts one level deeper, unlike the `recursion_level + 1` used for
// the low-priority forks below. Confirm this is intended.
fork_high_prio_state = state;
fork_high_prio_state.instruction_position = fork_high_prio_state.fork_at_position;
success = execute(input, fork_high_prio_state, output, ++recursion_level);
if (!success.has_value())
return {};
if (success.value()) {
state = fork_high_prio_state;
return true;
}
continue;
case ExecutionResult::Continue:
continue;
case ExecutionResult::Succeeded:
return true;
case ExecutionResult::Failed:
return false;
case ExecutionResult::Failed_ExecuteLowPrioForks:
// The current path is dead; fall back to the forks saved in this frame.
return execute_low_prio_forks(input, state, output, fork_low_prio_states, recursion_level + 1);
}
}
ASSERT_NOT_REACHED();
}
// Tries the saved low-priority fork states in order, resuming each one at the instruction
// position recorded for its fork. The first fork that matches is copied into `original_state`
// and true is returned. An empty Optional coming back from execute() is passed straight through.
// When no fork matches, the string position of `original_state` is reset to 0 and false is returned.
template<class Parser>
ALWAYS_INLINE Optional<bool> Matcher<Parser>::execute_low_prio_forks(const MatchInput& input, MatchState& original_state, MatchOutput& output, Vector<MatchState> states, size_t recursion_level) const
{
    for (size_t fork_index = 0; fork_index < states.size(); ++fork_index) {
        auto& fork_state = states[fork_index];
        fork_state.instruction_position = fork_state.fork_at_position;
#ifdef REGEX_DEBUG
        fprintf(stderr, "Forkstay... ip = %lu, sp = %lu\n", fork_state.instruction_position, fork_state.string_position);
#endif
        auto outcome = execute(input, fork_state, output, recursion_level);
        if (!outcome.has_value())
            return {};

        // A failed fork simply hands over to the next saved state.
        if (!outcome.value())
            continue;

#ifdef REGEX_DEBUG
        fprintf(stderr, "Forkstay succeeded... ip = %lu, sp = %lu\n", fork_state.instruction_position, fork_state.string_position);
#endif
        original_state = fork_state;
        return true;
    }

    original_state.string_position = 0;
    return false;
}
template class Matcher<PosixExtendedParser>;
template class Regex<PosixExtendedParser>;
template class Matcher<ECMA262Parser>;
template class Regex<ECMA262Parser>;
}

View file

@ -0,0 +1,296 @@
/*
* Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#include "RegexByteCode.h"
#include "RegexMatch.h"
#include "RegexOptions.h"
#include "RegexParser.h"
#include <AK/Forward.h>
#include <AK/HashMap.h>
#include <AK/NonnullOwnPtrVector.h>
#include <AK/Types.h>
#include <AK/Utf32View.h>
#include <AK/Vector.h>
#include <ctype.h>
#include <stdio.h>
namespace regex {
// Highest recursion_level Matcher::execute() accepts; deeper attempts are reported as a failed match.
static const constexpr size_t c_max_recursion = 5000;
// NOTE(review): presumably the number of match slots reserved up front by the matcher; its use
// is not visible in this header, so confirm against the implementation.
static const constexpr size_t c_match_preallocation_count = 0;
// Value returned by every match/search call. The member order matters: Matcher::match() fills
// this struct positionally with a braced initializer list.
struct RegexResult final {
// True when at least one match was found.
bool success { false };
// Number of matches found.
size_t count { 0 };
// The matches themselves.
Vector<Match> matches;
// Capture group results, one inner vector per match.
Vector<Vector<Match>> capture_group_matches;
// Named capture group results, one map per match.
Vector<HashMap<String, Match>> named_capture_group_matches;
// Number of bytecode operations the matcher executed.
size_t n_operations { 0 };
// Number of capture groups reported by the parser for the pattern.
size_t n_capture_groups { 0 };
// Number of named capture groups reported by the parser for the pattern.
size_t n_named_capture_groups { 0 };
};
template<class Parser>
class Regex;
// Runs a parsed Regex against one or several string views.
// The Matcher stores a reference to the pattern rather than a copy, so the Regex has to outlive it.
template<class Parser>
class Matcher final {
public:
// `regex_options` defaults to an empty option set when not supplied.
Matcher(const Regex<Parser>& pattern, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
: m_pattern(pattern)
, m_regex_options(regex_options.value_or({}))
{
}
~Matcher() = default;
// Match a single view, or a list of views, against the pattern.
RegexResult match(const RegexStringView&, Optional<typename ParserTraits<Parser>::OptionsType> = {}) const;
RegexResult match(const Vector<RegexStringView>, Optional<typename ParserTraits<Parser>::OptionsType> = {}) const;
// Options this matcher was constructed with.
typename ParserTraits<Parser>::OptionsType options() const
{
return m_regex_options;
}
private:
// Bytecode interpreter and its helper for retrying saved low-priority forks.
Optional<bool> execute(const MatchInput& input, MatchState& state, MatchOutput& output, size_t recursion_level) const;
ALWAYS_INLINE Optional<bool> execute_low_prio_forks(const MatchInput& input, MatchState& original_state, MatchOutput& output, Vector<MatchState> states, size_t recursion_level) const;
const Regex<Parser>& m_pattern;
const typename ParserTraits<Parser>::OptionsType m_regex_options;
};
// A compiled regular expression for the grammar implemented by `Parser`.
// Every matching entry point returns an empty/false result when there is no matcher or when
// parsing the pattern reported an error.
template<class Parser>
class Regex final {
public:
    String pattern_value;
    regex::Parser::Result parser_result;
    OwnPtr<Matcher<Parser>> matcher { nullptr };
    // Written by the matcher for stateful (Internal_Stateful) patterns to remember where the
    // previous match stopped; mutable because matching is a const operation.
    mutable size_t start_offset { 0 };

    explicit Regex(StringView pattern, typename ParserTraits<Parser>::OptionsType regex_options = {});
    ~Regex() = default;

    typename ParserTraits<Parser>::OptionsType options() const;
    void print_bytecode(FILE* f = stdout) const;
    String error_string(Optional<String> message = {}) const;

    // Matches a single view against the pattern.
    RegexResult match(const RegexStringView view, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
    {
        if (!matcher || parser_result.error != Error::NoError)
            return {};
        return matcher->match(view, regex_options);
    }

    // Matches a list of views against the pattern.
    RegexResult match(const Vector<RegexStringView> views, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
    {
        if (!matcher || parser_result.error != Error::NoError)
            return {};
        return matcher->match(views, regex_options);
    }

    // Returns `view` with every match replaced by the expansion of `replacement_pattern`.
    // In the replacement pattern `\\` yields a single backslash and `\N` (N >= 1) yields the
    // text of capture group N of the match being replaced. A reference that cannot be resolved
    // (`\0`, a number above the pattern's group count, or a group with no stored result for this
    // match) is copied to the output verbatim.
    // Returns the unmodified input when nothing matches, and an empty String when the pattern
    // is unusable.
    String replace(const RegexStringView view, const StringView& replacement_pattern, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
    {
        if (!matcher || parser_result.error != Error::NoError)
            return {};

        StringBuilder builder;
        // Offset of the first input character not yet copied to the output. Deliberately not
        // called start_offset, which would shadow the member of that name.
        size_t unconsumed_offset = 0;
        RegexResult result = matcher->match(view, regex_options);
        if (!result.success)
            return view.to_string();

        for (size_t i = 0; i < result.matches.size(); ++i) {
            auto& match = result.matches[i];
            // Copy the text between the previous match and this one unchanged.
            builder.append(view.substring_view(unconsumed_offset, match.global_offset - unconsumed_offset).to_string());
            unconsumed_offset = match.global_offset + match.view.length();

            GenericLexer lexer(replacement_pattern);
            while (!lexer.is_eof()) {
                if (!lexer.consume_specific('\\')) {
                    builder.append(lexer.consume_while([](auto ch) { return ch != '\\'; }));
                    continue;
                }
                if (lexer.consume_specific('\\')) {
                    builder.append('\\');
                    continue;
                }

                auto number = lexer.consume_while(isdigit);
                auto index = number.to_uint();
                // Group references are 1-based. Rejecting 0 avoids the unsigned underflow in
                // `index - 1`, and the size checks avoid reading past vectors that may hold
                // fewer entries than n_capture_groups.
                bool is_resolvable = index.has_value()
                    && index.value() >= 1
                    && index.value() <= result.n_capture_groups
                    && i < result.capture_group_matches.size()
                    && index.value() - 1 < result.capture_group_matches[i].size();
                if (is_resolvable)
                    builder.append(result.capture_group_matches[i][index.value() - 1].view.to_string());
                else
                    builder.appendff("\\{}", number);
            }
        }

        // Copy whatever follows the last match.
        builder.append(view.substring_view(unconsumed_offset, view.length() - unconsumed_offset).to_string());
        return builder.to_string();
    }

    // FIXME: replace(const Vector<RegexStringView>, ...)

    // Like match(), but always adds the Global flag. When both MatchNotBeginOfLine and
    // MatchNotEndOfLine are requested, both are cleared before matching.
    RegexResult search(const RegexStringView view, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
    {
        if (!matcher || parser_result.error != Error::NoError)
            return {};

        AllOptions options = (AllOptions)regex_options.value_or({});
        if ((options & AllFlags::MatchNotBeginOfLine) && (options & AllFlags::MatchNotEndOfLine)) {
            options.reset_flag(AllFlags::MatchNotEndOfLine);
            options.reset_flag(AllFlags::MatchNotBeginOfLine);
        }
        options |= AllFlags::Global;
        return matcher->match(view, options);
    }

    // Multi-view form of search(); same flag handling as the single-view form.
    RegexResult search(const Vector<RegexStringView> views, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
    {
        if (!matcher || parser_result.error != Error::NoError)
            return {};

        AllOptions options = (AllOptions)regex_options.value_or({});
        if ((options & AllFlags::MatchNotBeginOfLine) && (options & AllFlags::MatchNotEndOfLine)) {
            options.reset_flag(AllFlags::MatchNotEndOfLine);
            options.reset_flag(AllFlags::MatchNotBeginOfLine);
        }
        options |= AllFlags::Global;
        return matcher->match(views, options);
    }

    // Out-parameter forms: store the full result in `m` and return whether it succeeded.
    bool match(const RegexStringView view, RegexResult& m, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
    {
        m = match(view, regex_options);
        return m.success;
    }

    bool match(const Vector<RegexStringView> views, RegexResult& m, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
    {
        m = match(views, regex_options);
        return m.success;
    }

    bool search(const RegexStringView view, RegexResult& m, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
    {
        m = search(view, regex_options);
        return m.success;
    }

    bool search(const Vector<RegexStringView> views, RegexResult& m, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
    {
        m = search(views, regex_options);
        return m.success;
    }

    // Reports only whether the input matches; SkipSubExprResults is added to the options.
    bool has_match(const RegexStringView view, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
    {
        if (!matcher || parser_result.error != Error::NoError)
            return false;
        RegexResult result = matcher->match(view, AllOptions { regex_options.value_or({}) } | AllFlags::SkipSubExprResults);
        return result.success;
    }

    bool has_match(const Vector<RegexStringView> views, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
    {
        if (!matcher || parser_result.error != Error::NoError)
            return false;
        RegexResult result = matcher->match(views, AllOptions { regex_options.value_or({}) } | AllFlags::SkipSubExprResults);
        return result.success;
    }
};
// free standing functions for match, search and has_match
// Free-standing form of Regex::match() for a single view; returns the full result.
template<class Parser>
RegexResult match(const RegexStringView view, Regex<Parser>& pattern, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
{
    auto result = pattern.match(view, regex_options);
    return result;
}
// Free-standing form of Regex::match() for a list of views; returns the full result.
template<class Parser>
RegexResult match(const Vector<RegexStringView> view, Regex<Parser>& pattern, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
{
    auto result = pattern.match(view, regex_options);
    return result;
}
template<class Parser>
bool match(const RegexStringView view, Regex<Parser>& pattern, RegexResult&, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
{
return pattern.match(view, regex_options);
}
template<class Parser>
bool match(const Vector<RegexStringView> view, Regex<Parser>& pattern, RegexResult&, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
{
return pattern.match(view, regex_options);
}
// Free-standing form of Regex::search() for a single view; returns the full result.
template<class Parser>
RegexResult search(const RegexStringView view, Regex<Parser>& pattern, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
{
    auto result = pattern.search(view, regex_options);
    return result;
}
// Free-standing form of Regex::search() for a list of views; returns the full result.
template<class Parser>
RegexResult search(const Vector<RegexStringView> views, Regex<Parser>& pattern, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
{
    auto result = pattern.search(views, regex_options);
    return result;
}
template<class Parser>
bool search(const RegexStringView view, Regex<Parser>& pattern, RegexResult&, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
{
return pattern.search(view, regex_options);
}
template<class Parser>
bool search(const Vector<RegexStringView> views, Regex<Parser>& pattern, RegexResult&, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
{
return pattern.search(views, regex_options);
}
// Free-standing form of Regex::has_match() for a single view.
template<class Parser>
bool has_match(const RegexStringView view, Regex<Parser>& pattern, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
{
    const bool matched = pattern.has_match(view, regex_options);
    return matched;
}
// Free-standing form of Regex::has_match() for a list of views.
template<class Parser>
bool has_match(const Vector<RegexStringView> views, Regex<Parser>& pattern, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
{
    const bool matched = pattern.has_match(views, regex_options);
    return matched;
}
}
using regex::has_match;
using regex::match;
using regex::Regex;
using regex::RegexResult;

View file

@ -0,0 +1,161 @@
/*
* Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#include <AK/Types.h>
#include <stdio.h>
#ifdef __serenity__
# include <regex.h>
#else
# include <LibC/regex.h>
#endif
namespace regex {
using FlagsUnderlyingType = u16;
// Every flag understood by the regex engine. The numeric values come from the __Regex_*
// constants of the included regex.h.
// NOTE(review): unlike PosixFlags and ECMAScriptFlags this enum has no explicit underlying type,
// yet its values are cast to FlagsUnderlyingType (u16) elsewhere in this header. Confirm that
// every __Regex_* value fits in 16 bits.
enum class AllFlags {
Global = __Regex_Global, // All matches (don't return after first match)
Insensitive = __Regex_Insensitive, // Case insensitive match (ignores case of [a-zA-Z])
Ungreedy = __Regex_Ungreedy, // The match becomes lazy by default. Now a ? following a quantifier makes it greedy
Unicode = __Regex_Unicode, // Enable all unicode features and interpret all unicode escape sequences as such
Extended = __Regex_Extended, // Ignore whitespaces. Spaces and text after a # in the pattern are ignored
Extra = __Regex_Extra, // Disallow meaningless escapes. A \ followed by a letter with no special meaning is faulted
MatchNotBeginOfLine = __Regex_MatchNotBeginOfLine, // Pattern is not forced to ^ -> search in whole string!
MatchNotEndOfLine = __Regex_MatchNotEndOfLine, // Don't Force the dollar sign, $, to always match end of the string, instead of end of the line. This option is ignored if the Multiline-flag is set
SkipSubExprResults = __Regex_SkipSubExprResults, // Do not return sub expressions in the result
StringCopyMatches = __Regex_StringCopyMatches, // Do explicitly copy results into new allocated string instead of StringView to original string.
SingleLine = __Regex_SingleLine, // Dot matches newline characters
Sticky = __Regex_Sticky, // Force the pattern to only match consecutive matches from where the previous match ended.
Multiline = __Regex_Multiline, // Handle newline characters. Match each line, one by one.
SkipTrimEmptyMatches = __Regex_SkipTrimEmptyMatches, // Do not remove empty capture group results.
Internal_Stateful = __Regex_Internal_Stateful, // Make global matches match one result at a time, and further match() calls on the same instance continue where the previous one left off.
Last = Internal_Stateful, // Alias of the last declared flag.
};
// Flags offered by PosixOptions. Each enumerator carries the numeric value of the AllFlags
// enumerator of the same name, cast to FlagsUnderlyingType.
enum class PosixFlags : FlagsUnderlyingType {
Global = (FlagsUnderlyingType)AllFlags::Global,
Insensitive = (FlagsUnderlyingType)AllFlags::Insensitive,
Ungreedy = (FlagsUnderlyingType)AllFlags::Ungreedy,
Unicode = (FlagsUnderlyingType)AllFlags::Unicode,
Extended = (FlagsUnderlyingType)AllFlags::Extended,
Extra = (FlagsUnderlyingType)AllFlags::Extra,
MatchNotBeginOfLine = (FlagsUnderlyingType)AllFlags::MatchNotBeginOfLine,
MatchNotEndOfLine = (FlagsUnderlyingType)AllFlags::MatchNotEndOfLine,
SkipSubExprResults = (FlagsUnderlyingType)AllFlags::SkipSubExprResults,
SkipTrimEmptyMatches = (FlagsUnderlyingType)AllFlags::SkipTrimEmptyMatches,
Multiline = (FlagsUnderlyingType)AllFlags::Multiline,
StringCopyMatches = (FlagsUnderlyingType)AllFlags::StringCopyMatches,
};
// Flags offered by ECMAScriptOptions. Apart from Global, each enumerator carries the numeric
// value of the AllFlags enumerator of the same name, cast to FlagsUnderlyingType.
enum class ECMAScriptFlags : FlagsUnderlyingType {
// Global is the only combined value: it sets both the Global and the Internal_Stateful bit.
Global = (FlagsUnderlyingType)AllFlags::Global | (FlagsUnderlyingType)AllFlags::Internal_Stateful, // Note: ECMAScript "Global" creates a stateful regex.
Insensitive = (FlagsUnderlyingType)AllFlags::Insensitive,
Ungreedy = (FlagsUnderlyingType)AllFlags::Ungreedy,
Unicode = (FlagsUnderlyingType)AllFlags::Unicode,
Extended = (FlagsUnderlyingType)AllFlags::Extended,
Extra = (FlagsUnderlyingType)AllFlags::Extra,
SingleLine = (FlagsUnderlyingType)AllFlags::SingleLine,
Sticky = (FlagsUnderlyingType)AllFlags::Sticky,
Multiline = (FlagsUnderlyingType)AllFlags::Multiline,
StringCopyMatches = (FlagsUnderlyingType)AllFlags::StringCopyMatches,
};
// A set of regex flags stored as one value of the flag enum `T`.
// All bit manipulation is done on the flags' FlagsUnderlyingType representation.
template<class T>
class RegexOptions {
public:
    using FlagsType = T;

    RegexOptions() = default;

    RegexOptions(T flags)
        : m_flags(flags)
    {
    }

    // Converts from an option set over another flag enum by carrying the raw bits over.
    template<class U>
    RegexOptions(RegexOptions<U> other)
        : m_flags(static_cast<T>(static_cast<FlagsUnderlyingType>(other.value())))
    {
    }

    // True when at least one flag is set; operator! is the exact opposite.
    operator bool() const { return bits_of(m_flags) != 0; }
    bool operator!() const { return bits_of(m_flags) == 0; }

    // Non-mutating union / intersection with a single flag value.
    RegexOptions<T> operator|(T flag) const
    {
        return RegexOptions<T> { static_cast<T>(bits_of(m_flags) | bits_of(flag)) };
    }

    RegexOptions<T> operator&(T flag) const
    {
        return RegexOptions<T> { static_cast<T>(bits_of(m_flags) & bits_of(flag)) };
    }

    // In-place union / intersection with a single flag value.
    RegexOptions<T>& operator|=(T flag)
    {
        m_flags = static_cast<T>(bits_of(m_flags) | bits_of(flag));
        return *this;
    }

    RegexOptions<T>& operator&=(T flag)
    {
        m_flags = static_cast<T>(bits_of(m_flags) & bits_of(flag));
        return *this;
    }

    // Clears every flag.
    void reset_flags() { m_flags = static_cast<T>(0); }

    // Clears exactly the bits of `flag`.
    void reset_flag(T flag) { m_flags = static_cast<T>(bits_of(m_flags) & ~bits_of(flag)); }

    void set_flag(T flag) { operator|=(flag); }

    // True only when every bit of `flag` is set.
    bool has_flag_set(T flag) const
    {
        const auto wanted = bits_of(flag);
        return (bits_of(m_flags) & wanted) == wanted;
    }

    T value() const { return m_flags; }

private:
    // Raw bit pattern of a flag value.
    static constexpr FlagsUnderlyingType bits_of(T flag) { return static_cast<FlagsUnderlyingType>(flag); }

    T m_flags { 0 };
};
// Combines two bare flag values into an option set holding the bits of both.
template<class T>
inline RegexOptions<T> operator|(T lhs, T rhs)
{
    RegexOptions<T> combined { lhs };
    combined |= rhs;
    return combined;
}
// Intersects two bare flag values into an option set holding only their common bits.
template<class T>
inline RegexOptions<T> operator&(T lhs, T rhs)
{
    RegexOptions<T> common { lhs };
    common &= rhs;
    return common;
}
template<class T>
inline T operator~(T flag)
{
return (T) ~((FlagsUnderlyingType)flag);
}
using AllOptions = RegexOptions<AllFlags>;
using ECMAScriptOptions = RegexOptions<ECMAScriptFlags>;
using PosixOptions = RegexOptions<PosixFlags>;
}
using regex::ECMAScriptFlags;
using regex::ECMAScriptOptions;
using regex::PosixFlags;
using regex::PosixOptions;

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,208 @@
/*
* Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#include "RegexByteCode.h"
#include "RegexError.h"
#include "RegexLexer.h"
#include "RegexOptions.h"
#include <AK/Forward.h>
#include <AK/StringBuilder.h>
#include <AK/Types.h>
#include <AK/Vector.h>
namespace regex {
class PosixExtendedParser;
class ECMA262Parser;
// Carrier for the option type associated with a parser.
template<typename T>
struct GenericParserTraits {
using OptionsType = T;
};
// Primary template: without a specialization, the parser type itself is used as OptionsType.
template<typename T>
struct ParserTraits : public GenericParserTraits<T> {
};
// PosixExtendedParser is configured with PosixOptions.
template<>
struct ParserTraits<PosixExtendedParser> : public GenericParserTraits<PosixOptions> {
};
// ECMA262Parser is configured with ECMAScriptOptions.
template<>
struct ParserTraits<ECMA262Parser> : public GenericParserTraits<ECMAScriptOptions> {
};
class Parser {
public:
struct Result {
ByteCode bytecode;
size_t capture_groups_count;
size_t named_capture_groups_count;
size_t match_length_minimum;
Error error;
Token error_token;
};
explicit Parser(Lexer& lexer)
: m_parser_state(lexer)
{
}
Parser(Lexer& lexer, AllOptions regex_options)
: m_parser_state(lexer, regex_options)
{
}
virtual ~Parser() = default;
Result parse(Optional<AllOptions> regex_options = {});
bool has_error() const { return m_parser_state.error != Error::NoError; }
Error error() const { return m_parser_state.error; }
protected:
virtual bool parse_internal(ByteCode&, size_t& match_length_minimum) = 0;
ALWAYS_INLINE bool match(TokenType type) const;
ALWAYS_INLINE bool match(char ch) const;
ALWAYS_INLINE bool match_ordinary_characters();
ALWAYS_INLINE Token consume();
ALWAYS_INLINE Token consume(TokenType type, Error error);
ALWAYS_INLINE bool consume(const String&);
ALWAYS_INLINE bool try_skip(StringView);
ALWAYS_INLINE char skip();
ALWAYS_INLINE void reset();
ALWAYS_INLINE bool done() const;
ALWAYS_INLINE bool set_error(Error error);
struct ParserState {
Lexer& lexer;
Token current_token;
Error error = Error::NoError;
Token error_token { TokenType::Eof, 0, StringView(nullptr) };
ByteCode bytecode;
size_t capture_groups_count { 0 };
size_t named_capture_groups_count { 0 };
size_t match_length_minimum { 0 };
AllOptions regex_options;
HashMap<int, size_t> capture_group_minimum_lengths;
HashMap<FlyString, size_t> named_capture_group_minimum_lengths;
HashMap<size_t, FlyString> named_capture_groups;
explicit ParserState(Lexer& lexer)
: lexer(lexer)
, current_token(lexer.next())
{
}
explicit ParserState(Lexer& lexer, AllOptions regex_options)
: lexer(lexer)
, current_token(lexer.next())
, regex_options(regex_options)
{
}
};
ParserState m_parser_state;
};
// Parser selected by Regex<PosixExtended>; configured through PosixOptions.
class PosixExtendedParser final : public Parser {
public:
explicit PosixExtendedParser(Lexer& lexer)
: Parser(lexer)
{
}
// Missing options fall back to an empty option set.
PosixExtendedParser(Lexer& lexer, Optional<typename ParserTraits<PosixExtendedParser>::OptionsType> regex_options)
: Parser(lexer, regex_options.value_or({}))
{
}
~PosixExtendedParser() = default;
private:
// Grammar helpers; declared here, defined out of line.
ALWAYS_INLINE bool match_repetition_symbol();
bool parse_internal(ByteCode&, size_t&) override;
bool parse_root(ByteCode&, size_t&);
ALWAYS_INLINE bool parse_sub_expression(ByteCode&, size_t&);
ALWAYS_INLINE bool parse_bracket_expression(ByteCode&, size_t&);
ALWAYS_INLINE bool parse_repetition_symbol(ByteCode&, size_t&);
};
// Parser selected by Regex<ECMA262>; configured through ECMAScriptOptions.
class ECMA262Parser final : public Parser {
public:
explicit ECMA262Parser(Lexer& lexer)
: Parser(lexer)
{
}
// Missing options fall back to an empty option set.
ECMA262Parser(Lexer& lexer, Optional<typename ParserTraits<ECMA262Parser>::OptionsType> regex_options)
: Parser(lexer, regex_options.value_or({}))
{
}
~ECMA262Parser() = default;
private:
bool parse_internal(ByteCode&, size_t&) override;
// Policies accepted by read_digits(). NOTE(review): judging by the names they govern a leading
// zero and the character following the digits; confirm against the implementation.
enum class ReadDigitsInitialZeroState {
Allow,
Disallow,
Require,
};
enum class ReadDigitFollowPolicy {
Any,
DisallowDigit,
DisallowNonDigit,
};
Optional<unsigned> read_digits(ReadDigitsInitialZeroState initial_zero = ReadDigitsInitialZeroState::Allow, ReadDigitFollowPolicy follow_policy = ReadDigitFollowPolicy::Any, bool hex = false, int max_count = -1);
StringView read_capture_group_specifier(bool take_starting_angle_bracket = false);
// One helper per grammar construct; declared here, defined out of line. Each receives the
// bytecode being built, the running minimum match length, and the `unicode` / `named` switches.
bool parse_pattern(ByteCode&, size_t&, bool unicode, bool named);
bool parse_disjunction(ByteCode&, size_t&, bool unicode, bool named);
bool parse_alternative(ByteCode&, size_t&, bool unicode, bool named);
bool parse_term(ByteCode&, size_t&, bool unicode, bool named);
bool parse_assertion(ByteCode&, size_t&, bool unicode, bool named);
bool parse_atom(ByteCode&, size_t&, bool unicode, bool named);
bool parse_quantifier(ByteCode&, size_t&, bool unicode, bool named);
bool parse_atom_escape(ByteCode&, size_t&, bool unicode, bool named);
bool parse_character_class(ByteCode&, size_t&, bool unicode, bool named);
bool parse_capture_group(ByteCode&, size_t&, bool unicode, bool named);
Optional<CharClass> parse_character_class_escape(bool& out_inverse, bool expect_backslash = false);
bool parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>&, bool unicode);
};
using PosixExtended = PosixExtendedParser;
using ECMA262 = ECMA262Parser;
}
using regex::ECMA262;
using regex::PosixExtended;

View file

@ -0,0 +1,991 @@
/*
* Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <AK/TestSuite.h> // import first, to prevent warning of ASSERT* redefinition
#include <LibRegex/Regex.h>
#include <stdio.h>
#ifndef REGEX_DEBUG
// Number of times every benchmark below repeats its set of match/search calls.
# define BENCHMARK_LOOP_ITERATIONS 100000
// Uncomment to compile the LibRegex benchmarks.
//# define REGEX_BENCHMARK_OUR
# ifndef __serenity__
// Uncomment to compile the std::regex reference benchmarks; this switch sits
// behind the __serenity__ check, so it only applies to non-Serenity builds.
//# define REGEX_BENCHMARK_OTHER
# endif
// <regex> is only needed by the std::regex reference benchmarks.
# if defined(REGEX_BENCHMARK_OTHER)
# include <regex>
# endif
// Placeholder benchmark, compiled only when neither benchmark family
// (REGEX_BENCHMARK_OUR / REGEX_BENCHMARK_OTHER) is enabled; presumably it keeps
// the suite from being empty in that configuration.
// The guard previously tested REGEX_BENCHMARK_OUR twice.
# if !defined(REGEX_BENCHMARK_OUR) && !defined(REGEX_BENCHMARK_OTHER)
BENCHMARK_CASE(dummy_benchmark)
{
}
# endif
# if defined(REGEX_BENCHMARK_OUR)
// LibRegex: repeatedly matches the catch-all pattern "^.*$" against a short string.
BENCHMARK_CASE(catch_all_benchmark)
{
Regex<PosixExtended> re("^.*$");
RegexResult m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT(re.match("Hello World", m));
}
}
# endif
# if defined(REGEX_BENCHMARK_OTHER)
// std::regex reference for catch_all_benchmark: same pattern and input.
BENCHMARK_CASE(catch_all_benchmark_reference_stdcpp)
{
std::regex re("^.*$");
std::cmatch m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(std::regex_match("Hello World", m, re), true);
}
}
# endif
# if defined(REGEX_BENCHMARK_OUR)
// LibRegex: start-anchored literal "^hello friends"; only the exact string is expected to match().
BENCHMARK_CASE(simple_start_benchmark)
{
Regex<PosixExtended> re("^hello friends");
RegexResult m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(re.match("Hello!", m), false);
EXPECT_EQ(re.match("hello friends", m), true);
EXPECT_EQ(re.match("Well, hello friends", m), false);
}
}
# endif
# if defined(REGEX_BENCHMARK_OTHER)
// std::regex reference for simple_start_benchmark.
// NOTE(review): the first input is "Hello" here but "Hello!" in the LibRegex variant.
BENCHMARK_CASE(simple_start_benchmark_reference_stdcpp)
{
std::regex re("^hello friends");
std::cmatch m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(std::regex_match("Hello", m, re), false);
EXPECT_EQ(std::regex_match("hello friends", m, re), true);
EXPECT_EQ(std::regex_match("Well, hello friends", m, re), false);
}
}
# endif
# if defined(REGEX_BENCHMARK_OUR)
// LibRegex: ".*" prefix followed by an end-anchored literal containing escaped dots.
BENCHMARK_CASE(simple_end_benchmark)
{
Regex<PosixExtended> re(".*hello\\.\\.\\. there$");
RegexResult m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(re.match("Hallo", m), false);
EXPECT_EQ(re.match("I said fyhello... there", m), true);
EXPECT_EQ(re.match("ahello... therea", m), false);
EXPECT_EQ(re.match("hello.. there", m), false);
}
}
# endif
# if defined(REGEX_BENCHMARK_OTHER)
// std::regex reference for simple_end_benchmark.
// NOTE(review): this variant calls regex_search() while the LibRegex variant calls match().
BENCHMARK_CASE(simple_end_benchmark_reference_stdcpp)
{
std::regex re(".*hello\\.\\.\\. there$");
std::cmatch m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(std::regex_search("Hallo", m, re), false);
EXPECT_EQ(std::regex_search("I said fyhello... there", m, re), true);
EXPECT_EQ(std::regex_search("ahello... therea", m, re), false);
EXPECT_EQ(std::regex_search("hello.. there", m, re), false);
}
}
# endif
# if defined(REGEX_BENCHMARK_OUR)
// LibRegex: literal followed by the any-character wildcard ("hello."); the match is case-sensitive.
BENCHMARK_CASE(simple_period_benchmark)
{
Regex<PosixExtended> re("hello.");
RegexResult m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(re.match("Hello1", m), false);
EXPECT_EQ(re.match("hello1", m), true);
EXPECT_EQ(re.match("hello2", m), true);
EXPECT_EQ(re.match("hello?", m), true);
}
}
# endif
# if defined(REGEX_BENCHMARK_OTHER)
// std::regex reference for simple_period_benchmark: same pattern, inputs and expectations.
BENCHMARK_CASE(simple_period_benchmark_reference_stdcpp)
{
std::regex re("hello.");
std::cmatch m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(std::regex_match("Hello1", m, re), false);
EXPECT_EQ(std::regex_match("hello1", m, re), true);
EXPECT_EQ(std::regex_match("hello2", m, re), true);
EXPECT_EQ(std::regex_match("hello?", m, re), true);
}
}
# endif
# if defined(REGEX_BENCHMARK_OUR)
// LibRegex: search() for "hello" plus one arbitrary character anchored at the end of the input.
BENCHMARK_CASE(simple_period_end_benchmark)
{
Regex<PosixExtended> re("hello.$");
RegexResult m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(re.search("Hello1", m), false);
EXPECT_EQ(re.search("hello1hello1", m), true);
EXPECT_EQ(re.search("hello2hell", m), false);
EXPECT_EQ(re.search("hello?", m), true);
}
}
# endif
# if defined(REGEX_BENCHMARK_OTHER)
// std::regex reference for simple_period_end_benchmark: same pattern, inputs and expectations.
BENCHMARK_CASE(simple_period_end_benchmark_reference_stdcpp)
{
std::regex re("hello.$");
std::cmatch m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(std::regex_search("Hello1", m, re), false);
EXPECT_EQ(std::regex_search("hello1hello1", m, re), true);
EXPECT_EQ(std::regex_search("hello2hell", m, re), false);
EXPECT_EQ(std::regex_search("hello?", m, re), true);
}
}
# endif
# if defined(REGEX_BENCHMARK_OUR)
// LibRegex: escaped period ("hello\\.") must match a literal '.' only.
BENCHMARK_CASE(simple_escaped_benchmark)
{
Regex<PosixExtended> re("hello\\.");
RegexResult m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(re.match("hello", m), false);
EXPECT_EQ(re.match("hello.", m), true);
}
}
# endif
# if defined(REGEX_BENCHMARK_OTHER)
// std::regex reference for simple_escaped_benchmark: same pattern, inputs and expectations.
BENCHMARK_CASE(simple_escaped_benchmark_reference_stdcpp)
{
std::regex re("hello\\.");
std::cmatch m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(std::regex_match("hello", m, re), false);
EXPECT_EQ(std::regex_match("hello.", m, re), true);
}
}
# endif
# if defined(REGEX_BENCHMARK_OUR)
// LibRegex: ".*" prefix, three unescaped wildcards and an end anchor (".*hi... there$").
// The dots are wildcards here, so "hihii there" is expected to match as well.
BENCHMARK_CASE(simple_period2_end_benchmark)
{
Regex<PosixExtended> re(".*hi... there$");
RegexResult m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(re.search("Hello there", m), false);
EXPECT_EQ(re.search("I said fyhi... there", m), true);
EXPECT_EQ(re.search("....hi... ", m), false);
EXPECT_EQ(re.search("I said fyhihii there", m), true);
EXPECT_EQ(re.search("I said fyhihi there", m), false);
}
}
# endif
# if defined(REGEX_BENCHMARK_OTHER)
// std::regex reference for simple_period2_end_benchmark: same pattern, inputs and expectations.
BENCHMARK_CASE(simple_period2_end_benchmark_reference_stdcpp)
{
std::regex re(".*hi... there$");
std::cmatch m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(std::regex_search("Hello there", m, re), false);
EXPECT_EQ(std::regex_search("I said fyhi... there", m, re), true);
EXPECT_EQ(std::regex_search("....hi... ", m, re), false);
EXPECT_EQ(std::regex_search("I said fyhihii there", m, re), true);
EXPECT_EQ(std::regex_search("I said fyhihi there", m, re), false);
}
}
# endif
# if defined(REGEX_BENCHMARK_OUR)
// LibRegex: one-or-more quantifier ("a+") searched in inputs with zero, one and many 'a's.
BENCHMARK_CASE(simple_plus_benchmark)
{
Regex<PosixExtended> re("a+");
RegexResult m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(re.search("b", m), false);
EXPECT_EQ(re.search("a", m), true);
EXPECT_EQ(re.search("aaaaaabbbbb", m), true);
EXPECT_EQ(re.search("aaaaaaaaaaa", m), true);
}
}
# endif
# if defined(REGEX_BENCHMARK_OTHER)
// std::regex reference for simple_plus_benchmark: same pattern, inputs and expectations.
BENCHMARK_CASE(simple_plus_benchmark_reference_stdcpp)
{
std::regex re("a+");
std::cmatch m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(std::regex_search("b", m, re), false);
EXPECT_EQ(std::regex_search("a", m, re), true);
EXPECT_EQ(std::regex_search("aaaaaabbbbb", m, re), true);
EXPECT_EQ(std::regex_search("aaaaaaaaaaa", m, re), true);
}
}
# endif
# if defined(REGEX_BENCHMARK_OUR)
// LibRegex: optional character ("da?d") searched in inputs containing "dd", "dad" or neither.
BENCHMARK_CASE(simple_questionmark_benchmark)
{
Regex<PosixExtended> re("da?d");
RegexResult m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(re.search("a", m), false);
EXPECT_EQ(re.search("daa", m), false);
EXPECT_EQ(re.search("ddddd", m), true);
EXPECT_EQ(re.search("dd", m), true);
EXPECT_EQ(re.search("dad", m), true);
EXPECT_EQ(re.search("dada", m), true);
EXPECT_EQ(re.search("adadaa", m), true);
}
}
# endif
# if defined(REGEX_BENCHMARK_OTHER)
// std::regex reference for simple_questionmark_benchmark: same pattern, inputs and expectations.
BENCHMARK_CASE(simple_questionmark_benchmark_reference_stdcpp)
{
std::regex re("da?d");
std::cmatch m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(std::regex_search("a", m, re), false);
EXPECT_EQ(std::regex_search("daa", m, re), false);
EXPECT_EQ(std::regex_search("ddddd", m, re), true);
EXPECT_EQ(std::regex_search("dd", m, re), true);
EXPECT_EQ(std::regex_search("dad", m, re), true);
EXPECT_EQ(std::regex_search("dada", m, re), true);
EXPECT_EQ(std::regex_search("adadaa", m, re), true);
}
}
# endif
# if defined(REGEX_BENCHMARK_OUR)
// LibRegex: single [[:alpha:]] class against a multi-line INI-style haystack.
// match() is expected to fail on the whole haystack while search() finds a letter.
BENCHMARK_CASE(character_class_benchmark)
{
Regex<PosixExtended> re("[[:alpha:]]");
RegexResult m;
String haystack = "[Window]\nOpacity=255\nAudibleBeep=0\n";
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(re.match(haystack.characters(), m), false);
EXPECT_EQ(re.search(haystack.characters(), m), true);
}
}
# endif
# if defined(REGEX_BENCHMARK_OTHER)
// std::regex reference for character_class_benchmark: same pattern, haystack and expectations.
BENCHMARK_CASE(character_class_benchmark_reference_stdcpp)
{
std::regex re("[[:alpha:]]");
std::cmatch m;
String haystack = "[Window]\nOpacity=255\nAudibleBeep=0\n";
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(std::regex_match(haystack.characters(), m, re), false);
EXPECT_EQ(std::regex_search(haystack.characters(), m, re), true);
}
}
# endif
# if defined(REGEX_BENCHMARK_OUR)
// LibRegex: optional escaped period ("\\.?") between literals; zero or one '.' is accepted, nothing else.
BENCHMARK_CASE(escaped_char_questionmark_benchmark)
{
Regex<PosixExtended> re("This\\.?And\\.?That");
RegexResult m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(re.match("ThisAndThat", m), true);
EXPECT_EQ(re.match("This.And.That", m), true);
EXPECT_EQ(re.match("This And That", m), false);
EXPECT_EQ(re.match("This..And..That", m), false);
}
}
# endif
# if defined(REGEX_BENCHMARK_OTHER)
// std::regex reference for escaped_char_questionmark_benchmark: same pattern, inputs and expectations.
BENCHMARK_CASE(escaped_char_questionmark_benchmark_reference_stdcpp)
{
std::regex re("This\\.?And\\.?That");
std::cmatch m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(std::regex_match("ThisAndThat", m, re), true);
EXPECT_EQ(std::regex_match("This.And.That", m, re), true);
EXPECT_EQ(std::regex_match("This And That", m, re), false);
EXPECT_EQ(std::regex_match("This..And..That", m, re), false);
}
}
# endif
# if defined(REGEX_BENCHMARK_OUR)
// LibRegex: "regex*" ('*' applies to the trailing 'x') searched in two #include lines.
BENCHMARK_CASE(char_qualifier_asterisk_benchmark)
{
Regex<PosixExtended> re("regex*");
RegexResult m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(re.search("#include <regex.h>", m), true);
EXPECT_EQ(re.search("#include <stdio.h>", m), false);
}
}
# endif
# if defined(REGEX_BENCHMARK_OTHER)
// std::regex reference for char_qualifier_asterisk_benchmark: same pattern, inputs and expectations.
BENCHMARK_CASE(char_qualifier_asterisk_benchmark_reference_stdcpp)
{
std::regex re("regex*");
std::cmatch m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(std::regex_search("#include <regex.h>", m, re), true);
EXPECT_EQ(std::regex_search("#include <stdio.h>", m, re), false);
}
}
# endif
# if defined(REGEX_BENCHMARK_OUR)
// LibRegex: optional group ("(hello)?") between two literals.
BENCHMARK_CASE(parens_qualifier_questionmark_benchmark)
{
Regex<PosixExtended> re("test(hello)?test");
RegexResult m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(re.match("testtest", m), true);
EXPECT_EQ(re.match("testhellotest", m), true);
EXPECT_EQ(re.match("testasfdtest", m), false);
}
}
# endif
# if defined(REGEX_BENCHMARK_OTHER)
// std::regex reference for parens_qualifier_questionmark_benchmark: same pattern, inputs and expectations.
BENCHMARK_CASE(parens_qualifier_questionmark_benchmark_reference_stdcpp)
{
std::regex re("test(hello)?test");
std::cmatch m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(std::regex_match("testtest", m, re), true);
EXPECT_EQ(std::regex_match("testhellotest", m, re), true);
EXPECT_EQ(std::regex_match("testasfdtest", m, re), false);
}
}
# endif
# if defined(REGEX_BENCHMARK_OUR)
// LibRegex: zero-or-more repeated group ("(hello)*") between literals, plus a long non-matching input.
BENCHMARK_CASE(parens_qualifier_asterisk_benchmark)
{
Regex<PosixExtended> re("test(hello)*test");
RegexResult m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(re.match("testtest", m), true);
EXPECT_EQ(re.match("testhellohellotest", m), true);
EXPECT_EQ(re.search("testhellohellotest, testhellotest", m), true);
EXPECT_EQ(re.match("aaaaaaaaaaaaaaaaaaaaaaaaaaabbbbbbbbbbbbbbbbbbbbbbbbbb", m), false);
}
}
# endif
# if defined(REGEX_BENCHMARK_OTHER)
// std::regex reference for parens_qualifier_asterisk_benchmark: same pattern, inputs and expectations.
BENCHMARK_CASE(parens_qualifier_asterisk_benchmark_reference_stdcpp)
{
std::regex re("test(hello)*test");
std::cmatch m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(std::regex_match("testtest", m, re), true);
EXPECT_EQ(std::regex_match("testhellohellotest", m, re), true);
EXPECT_EQ(std::regex_search("testhellohellotest, testhellotest", m, re), true);
EXPECT_EQ(std::regex_match("aaaaaaaaaaaaaaaaaaaaaaaaaaabbbbbbbbbbbbbbbbbbbbbbbbbb", m, re), false);
}
}
# endif
# if defined(REGEX_BENCHMARK_OUR)
// LibRegex: wildcard group ("(.*)") between two literals, plus a long non-matching input.
BENCHMARK_CASE(parens_qualifier_asterisk_2_benchmark)
{
Regex<PosixExtended> re("test(.*)test");
RegexResult m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(re.match("testasdftest", m), true);
EXPECT_EQ(re.match("testasdfasdftest", m), true);
EXPECT_EQ(re.search("testaaaatest, testbbbtest, testtest", m), true);
EXPECT_EQ(re.match("aaaaaaaaaaaaaaaaaaaaaaaaaaabbbbbbbbbbbbbbbbbbbbbbbbbb", m), false);
}
}
# endif
# if defined(REGEX_BENCHMARK_OTHER)
// std::regex reference for parens_qualifier_asterisk_2_benchmark: same pattern, inputs and expectations.
BENCHMARK_CASE(parens_qualifier_asterisk_2_benchmark_reference_stdcpp)
{
std::regex re("test(.*)test");
std::cmatch m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(std::regex_match("testasdftest", m, re), true);
EXPECT_EQ(std::regex_match("testasdfasdftest", m, re), true);
EXPECT_EQ(std::regex_search("testaaaatest, testbbbtest, testtest", m, re), true);
EXPECT_EQ(std::regex_match("aaaaaaaaaaaaaaaaaaaaaaaaaaabbbbbbbbbbbbbbbbbbbbbbbbbb", m, re), false);
}
}
# endif
# if defined(REGEX_BENCHMARK_OUR)
// LibRegex: three consecutive optional groups ("(a)?(b)?(c)?") between two literals.
BENCHMARK_CASE(multi_parens_qualifier_questionmark_benchmark)
{
Regex<PosixExtended> re("test(a)?(b)?(c)?test");
RegexResult m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(re.match("testtest", m), true);
EXPECT_EQ(re.match("testabctest", m), true);
EXPECT_EQ(re.search("testabctest, testactest", m), true);
EXPECT_EQ(re.match("aaaaaaaaaaaaaaaaaaaaaaaaaaabbbbbbbbbbbbbbbbbbbbbbbbbb", m), false);
EXPECT_EQ(re.match("test", m), false);
EXPECT_EQ(re.match("whaaaaat", m), false);
}
}
# endif
# if defined(REGEX_BENCHMARK_OTHER)
// std::regex reference for multi_parens_qualifier_questionmark_benchmark: same pattern, inputs and expectations.
BENCHMARK_CASE(multi_parens_qualifier_questionmark_benchmark_reference_stdcpp)
{
std::regex re("test(a)?(b)?(c)?test");
std::cmatch m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(std::regex_match("testtest", m, re), true);
EXPECT_EQ(std::regex_match("testabctest", m, re), true);
EXPECT_EQ(std::regex_search("testabctest, testactest", m, re), true);
EXPECT_EQ(std::regex_match("aaaaaaaaaaaaaaaaaaaaaaaaaaabbbbbbbbbbbbbbbbbbbbbbbbbb", m, re), false);
EXPECT_EQ(std::regex_match("test", m, re), false);
EXPECT_EQ(std::regex_match("whaaaaat", m, re), false);
}
}
# endif
# if defined(REGEX_BENCHMARK_OUR)
// LibRegex: three-way alternation of literals; each alternative and one non-matching input.
BENCHMARK_CASE(simple_alternative_benchmark)
{
Regex<PosixExtended> re("test|hello|friends");
RegexResult m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(re.match("test", m), true);
EXPECT_EQ(re.match("hello", m), true);
EXPECT_EQ(re.match("friends", m), true);
EXPECT_EQ(re.match("whaaaaat", m), false);
}
}
# endif
# if defined(REGEX_BENCHMARK_OTHER)
// std::regex reference for simple_alternative_benchmark: same pattern, inputs and expectations.
BENCHMARK_CASE(simple_alternative_benchmark_reference_stdcpp)
{
std::regex re("test|hello|friends");
std::cmatch m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(std::regex_match("test", m, re), true);
EXPECT_EQ(std::regex_match("hello", m, re), true);
EXPECT_EQ(std::regex_match("friends", m, re), true);
EXPECT_EQ(std::regex_match("whaaaaat", m, re), false);
}
}
# endif
# if defined(REGEX_BENCHMARK_OUR)
// LibRegex: top-level alternation whose branches contain optional groups and a nested alternation.
BENCHMARK_CASE(alternative_match_groups_benchmark)
{
Regex<PosixExtended> re("test(a)?(b)?|hello ?(dear|my)? friends");
RegexResult m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(re.match("test", m), true);
EXPECT_EQ(re.match("testa", m), true);
EXPECT_EQ(re.match("testb", m), true);
EXPECT_EQ(re.match("hello friends", m), true);
EXPECT_EQ(re.match("hello dear friends", m), true);
EXPECT_EQ(re.match("hello my friends", m), true);
EXPECT_EQ(re.match("testabc", m), false);
EXPECT_EQ(re.match("hello test friends", m), false);
}
}
# endif
# if defined(REGEX_BENCHMARK_OTHER)
// std::regex reference for alternative_match_groups_benchmark: same pattern, inputs and expectations.
BENCHMARK_CASE(alternative_match_groups_benchmark_reference_stdcpp)
{
std::regex re("test(a)?(b)?|hello ?(dear|my)? friends");
std::cmatch m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(std::regex_match("test", m, re), true);
EXPECT_EQ(std::regex_match("testa", m, re), true);
EXPECT_EQ(std::regex_match("testb", m, re), true);
EXPECT_EQ(std::regex_match("hello friends", m, re), true);
EXPECT_EQ(std::regex_match("hello dear friends", m, re), true);
EXPECT_EQ(std::regex_match("hello my friends", m, re), true);
EXPECT_EQ(std::regex_match("testabc", m, re), false);
EXPECT_EQ(std::regex_match("hello test friends", m, re), false);
}
}
# endif
# if defined(REGEX_BENCHMARK_OUR)
// LibRegex: exact repetition count on a group ("(hello){3}").
BENCHMARK_CASE(parens_qualifier_exact_benchmark)
{
Regex<PosixExtended> re("(hello){3}");
RegexResult m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(re.match("hello", m), false);
EXPECT_EQ(re.match("hellohellohello", m), true);
EXPECT_EQ(re.search("hellohellohellohello", m), true);
EXPECT_EQ(re.search("test hellohellohello", m), true);
}
}
# endif
# if defined(REGEX_BENCHMARK_OTHER)
// std::regex reference for parens_qualifier_exact_benchmark: same pattern, inputs and expectations.
BENCHMARK_CASE(parens_qualifier_exact_benchmark_reference_stdcpp)
{
std::regex re("(hello){3}");
std::cmatch m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(std::regex_match("hello", m, re), false);
EXPECT_EQ(std::regex_match("hellohellohello", m, re), true);
EXPECT_EQ(std::regex_search("hellohellohellohello", m, re), true);
EXPECT_EQ(std::regex_search("test hellohellohello", m, re), true);
}
}
# endif
# if defined(REGEX_BENCHMARK_OUR)
// LibRegex: open-ended minimum repetition on a group ("(hello){3,}").
BENCHMARK_CASE(parens_qualifier_minimum_benchmark)
{
Regex<PosixExtended> re("(hello){3,}");
RegexResult m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(re.match("hello", m), false);
EXPECT_EQ(re.match("hellohellohello", m), true);
EXPECT_EQ(re.search("hellohellohellohello", m), true);
EXPECT_EQ(re.search("test hellohellohello", m), true);
EXPECT_EQ(re.search("test hellohellohellohello", m), true);
}
}
# endif
# if defined(REGEX_BENCHMARK_OTHER)
// std::regex reference for parens_qualifier_minimum_benchmark: same pattern, inputs and expectations.
BENCHMARK_CASE(parens_qualifier_minimum_benchmark_reference_stdcpp)
{
std::regex re("(hello){3,}");
std::cmatch m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(std::regex_match("hello", m, re), false);
EXPECT_EQ(std::regex_match("hellohellohello", m, re), true);
EXPECT_EQ(std::regex_search("hellohellohellohello", m, re), true);
EXPECT_EQ(std::regex_search("test hellohellohello", m, re), true);
EXPECT_EQ(std::regex_search("test hellohellohellohello", m, re), true);
}
}
# endif
# if defined(REGEX_BENCHMARK_OUR)
// LibRegex: bounded repetition on a group ("(hello){2,3}").
// The search of "test hellohellohellohello" appears twice in the loop body.
BENCHMARK_CASE(parens_qualifier_maximum_benchmark)
{
Regex<PosixExtended> re("(hello){2,3}");
RegexResult m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(re.match("hello", m), false);
EXPECT_EQ(re.match("hellohellohello", m), true);
EXPECT_EQ(re.search("hellohellohellohello", m), true);
EXPECT_EQ(re.search("test hellohellohello", m), true);
EXPECT_EQ(re.search("test hellohellohellohello", m), true);
EXPECT_EQ(re.match("test hellohellohellohello", m), false);
EXPECT_EQ(re.search("test hellohellohellohello", m), true);
}
}
# endif
# if defined(REGEX_BENCHMARK_OTHER)
// std::regex reference for parens_qualifier_maximum_benchmark: same pattern, inputs and expectations.
BENCHMARK_CASE(parens_qualifier_maximum_benchmark_reference_stdcpp)
{
std::regex re("(hello){2,3}");
std::cmatch m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(std::regex_match("hello", m, re), false);
EXPECT_EQ(std::regex_match("hellohellohello", m, re), true);
EXPECT_EQ(std::regex_search("hellohellohellohello", m, re), true);
EXPECT_EQ(std::regex_search("test hellohellohello", m, re), true);
EXPECT_EQ(std::regex_search("test hellohellohellohello", m, re), true);
EXPECT_EQ(std::regex_match("test hellohellohellohello", m, re), false);
EXPECT_EQ(std::regex_search("test hellohellohellohello", m, re), true);
}
}
# endif
# if defined(REGEX_BENCHMARK_OUR)
// LibRegex: bounded repetition on a single character ("c{3,30}"), probed around both bounds.
BENCHMARK_CASE(char_qualifier_min_max_benchmark)
{
Regex<PosixExtended> re("c{3,30}");
RegexResult m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(re.match("cc", m), false);
EXPECT_EQ(re.match("ccc", m), true);
EXPECT_EQ(re.match("cccccccccccccccccccccccccccccc", m), true);
EXPECT_EQ(re.match("ccccccccccccccccccccccccccccccc", m), false);
EXPECT_EQ(re.search("ccccccccccccccccccccccccccccccc", m), true);
EXPECT_EQ(re.match("cccccccccccccccccccccccccccccccc", m), false);
}
}
# endif
# if defined(REGEX_BENCHMARK_OTHER)
// std::regex reference for char_qualifier_min_max_benchmark: same pattern, inputs and expectations.
BENCHMARK_CASE(char_qualifier_min_max_benchmark_reference_stdcpp)
{
std::regex re("c{3,30}");
std::cmatch m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(std::regex_match("cc", m, re), false);
EXPECT_EQ(std::regex_match("ccc", m, re), true);
EXPECT_EQ(std::regex_match("cccccccccccccccccccccccccccccc", m, re), true);
EXPECT_EQ(std::regex_match("ccccccccccccccccccccccccccccccc", m, re), false);
EXPECT_EQ(std::regex_search("ccccccccccccccccccccccccccccccc", m, re), true);
EXPECT_EQ(std::regex_match("cccccccccccccccccccccccccccccccc", m, re), false);
}
}
# endif
# if defined(REGEX_BENCHMARK_OUR)
// LibRegex: bracket expression listing individual characters ("[abc]").
BENCHMARK_CASE(simple_bracket_chars_benchmark)
{
Regex<PosixExtended> re("[abc]");
RegexResult m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(re.match("a", m), true);
EXPECT_EQ(re.match("b", m), true);
EXPECT_EQ(re.match("c", m), true);
EXPECT_EQ(re.match("d", m), false);
EXPECT_EQ(re.match("e", m), false);
}
}
# endif
# if defined(REGEX_BENCHMARK_OTHER)
// std::regex reference for simple_bracket_chars_benchmark: same pattern, inputs and expectations.
BENCHMARK_CASE(simple_bracket_chars_benchmark_reference_stdcpp)
{
std::regex re("[abc]");
std::cmatch m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(std::regex_match("a", m, re), true);
EXPECT_EQ(std::regex_match("b", m, re), true);
EXPECT_EQ(std::regex_match("c", m, re), true);
EXPECT_EQ(std::regex_match("d", m, re), false);
EXPECT_EQ(std::regex_match("e", m, re), false);
}
}
# endif
# if defined(REGEX_BENCHMARK_OUR)
// LibRegex: negated bracket expression ("[^abc]"); expectations mirror simple_bracket_chars_benchmark.
BENCHMARK_CASE(simple_bracket_chars_inverse_benchmark)
{
Regex<PosixExtended> re("[^abc]");
RegexResult m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(re.match("a", m), false);
EXPECT_EQ(re.match("b", m), false);
EXPECT_EQ(re.match("c", m), false);
EXPECT_EQ(re.match("d", m), true);
EXPECT_EQ(re.match("e", m), true);
}
}
# endif
# if defined(REGEX_BENCHMARK_OTHER)
// std::regex reference for simple_bracket_chars_inverse_benchmark: same pattern, inputs and expectations.
BENCHMARK_CASE(simple_bracket_chars_inverse_benchmark_reference_stdcpp)
{
std::regex re("[^abc]");
std::cmatch m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(std::regex_match("a", m, re), false);
EXPECT_EQ(std::regex_match("b", m, re), false);
EXPECT_EQ(std::regex_match("c", m, re), false);
EXPECT_EQ(std::regex_match("d", m, re), true);
EXPECT_EQ(std::regex_match("e", m, re), true);
}
}
# endif
# if defined(REGEX_BENCHMARK_OUR)
// LibRegex: bracket expression with a character range ("[a-d]"); 'e' lies just outside the range.
BENCHMARK_CASE(simple_bracket_chars_range_benchmark)
{
Regex<PosixExtended> re("[a-d]");
RegexResult m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(re.match("a", m), true);
EXPECT_EQ(re.match("b", m), true);
EXPECT_EQ(re.match("c", m), true);
EXPECT_EQ(re.match("d", m), true);
EXPECT_EQ(re.match("e", m), false);
}
}
# endif
# if defined(REGEX_BENCHMARK_OTHER)
// std::regex reference for simple_bracket_chars_range_benchmark: same pattern, inputs and expectations.
BENCHMARK_CASE(simple_bracket_chars_range_benchmark_reference_stdcpp)
{
std::regex re("[a-d]");
std::cmatch m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(std::regex_match("a", m, re), true);
EXPECT_EQ(std::regex_match("b", m, re), true);
EXPECT_EQ(std::regex_match("c", m, re), true);
EXPECT_EQ(std::regex_match("d", m, re), true);
EXPECT_EQ(std::regex_match("e", m, re), false);
}
}
# endif
# if defined(REGEX_BENCHMARK_OUR)
// LibRegex: negated bracket expression with two ranges ("[^a-df-z]"); of the probed inputs only 'e' is accepted.
BENCHMARK_CASE(simple_bracket_chars_range_inverse_benchmark)
{
Regex<PosixExtended> re("[^a-df-z]");
RegexResult m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(re.match("a", m), false);
EXPECT_EQ(re.match("b", m), false);
EXPECT_EQ(re.match("c", m), false);
EXPECT_EQ(re.match("d", m), false);
EXPECT_EQ(re.match("e", m), true);
EXPECT_EQ(re.match("k", m), false);
EXPECT_EQ(re.match("z", m), false);
}
}
# endif
# if defined(REGEX_BENCHMARK_OTHER)
// std::regex reference for simple_bracket_chars_range_inverse_benchmark: same pattern, inputs and expectations.
BENCHMARK_CASE(simple_bracket_chars_range_inverse_benchmark_reference_stdcpp)
{
std::regex re("[^a-df-z]");
std::cmatch m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(std::regex_match("a", m, re), false);
EXPECT_EQ(std::regex_match("b", m, re), false);
EXPECT_EQ(std::regex_match("c", m, re), false);
EXPECT_EQ(std::regex_match("d", m, re), false);
EXPECT_EQ(std::regex_match("e", m, re), true);
EXPECT_EQ(std::regex_match("k", m, re), false);
EXPECT_EQ(std::regex_match("z", m, re), false);
}
}
# endif
# if defined(REGEX_BENCHMARK_OUR)
// LibRegex: fully anchored UUID-shaped pattern built from five [[:xdigit:]]{n} groups (8-4-4-4-12).
BENCHMARK_CASE(bracket_character_class_uuid_benchmark)
{
Regex<PosixExtended> re("^([[:xdigit:]]{8})-([[:xdigit:]]{4})-([[:xdigit:]]{4})-([[:xdigit:]]{4})-([[:xdigit:]]{12})$");
RegexResult m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(re.match("fb9b62a2-1579-4e3a-afba-76239ccb6583", m), true);
EXPECT_EQ(re.match("fb9b62a2", m), false);
}
}
# endif
# if defined(REGEX_BENCHMARK_OTHER)
// std::regex reference for bracket_character_class_uuid_benchmark: same pattern, inputs and expectations.
BENCHMARK_CASE(bracket_character_class_uuid_benchmark_reference_stdcpp)
{
std::regex re("^([[:xdigit:]]{8})-([[:xdigit:]]{4})-([[:xdigit:]]{4})-([[:xdigit:]]{4})-([[:xdigit:]]{12})$");
std::cmatch m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(std::regex_match("fb9b62a2-1579-4e3a-afba-76239ccb6583", m, re), true);
EXPECT_EQ(std::regex_match("fb9b62a2", m, re), false);
}
}
# endif
# if defined(REGEX_BENCHMARK_OUR)
// LibRegex: negated character class ("[^[:digit:]]"); digits are rejected, letters accepted.
BENCHMARK_CASE(simple_bracket_character_class_inverse_benchmark)
{
Regex<PosixExtended> re("[^[:digit:]]");
RegexResult m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(re.match("1", m), false);
EXPECT_EQ(re.match("2", m), false);
EXPECT_EQ(re.match("3", m), false);
EXPECT_EQ(re.match("d", m), true);
EXPECT_EQ(re.match("e", m), true);
}
}
# endif
# if defined(REGEX_BENCHMARK_OTHER)
// std::regex reference for simple_bracket_character_class_inverse_benchmark: same pattern, inputs and expectations.
BENCHMARK_CASE(simple_bracket_character_class_inverse_benchmark_reference_stdcpp)
{
std::regex re("[^[:digit:]]");
std::cmatch m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(std::regex_match("1", m, re), false);
EXPECT_EQ(std::regex_match("2", m, re), false);
EXPECT_EQ(std::regex_match("3", m, re), false);
EXPECT_EQ(std::regex_match("d", m, re), true);
EXPECT_EQ(std::regex_match("e", m, re), true);
}
}
# endif
# if defined(REGEX_BENCHMARK_OUR)
// LibRegex: anchored e-mail-address pattern combining bracket expressions, bounded
// repetitions and a "(?:...)" group; both inputs are expected to match.
BENCHMARK_CASE(email_address_benchmark)
{
Regex<PosixExtended> re("^[A-Z0-9a-z._%+-]{1,64}@(?:[A-Za-z0-9-]{1,63}\\.){1,125}[A-Za-z]{2,63}$");
RegexResult m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(re.match("hello.world@domain.tld", m), true);
EXPECT_EQ(re.match("this.is.a.very_long_email_address@world.wide.web", m), true);
}
}
# endif
# if defined(REGEX_BENCHMARK_OTHER)
// std::regex reference for email_address_benchmark: same pattern, inputs and expectations.
BENCHMARK_CASE(email_address_benchmark_reference_stdcpp)
{
std::regex re("^[A-Z0-9a-z._%+-]{1,64}@(?:[A-Za-z0-9-]{1,63}\\.){1,125}[A-Za-z]{2,63}$");
std::cmatch m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(std::regex_match("hello.world@domain.tld", m, re), true);
EXPECT_EQ(std::regex_match("this.is.a.very_long_email_address@world.wide.web", m, re), true);
}
}
# endif
# if defined(REGEX_BENCHMARK_OUR)
// LibRegex: start-anchored literal compiled with PosixFlags::Insensitive; inputs differ in letter case.
BENCHMARK_CASE(simple_ignorecase_benchmark)
{
Regex<PosixExtended> re("^hello friends", PosixFlags::Insensitive);
RegexResult m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(re.match("Hello Friends", m), true);
EXPECT_EQ(re.match("hello Friends", m), true);
EXPECT_EQ(re.match("hello Friends!", m), false);
EXPECT_EQ(re.search("hello Friends", m), true);
EXPECT_EQ(re.match("hell Friends", m), false);
EXPECT_EQ(re.search("hell Friends", m), false);
}
}
# endif
# if defined(REGEX_BENCHMARK_OTHER)
// std::regex reference for simple_ignorecase_benchmark, using std::regex_constants::icase.
BENCHMARK_CASE(simple_ignorecase_benchmark_reference_stdcpp)
{
std::regex re("^hello friends", std::regex_constants::icase);
std::cmatch m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(std::regex_match("Hello Friends", m, re), true);
EXPECT_EQ(std::regex_match("hello Friends", m, re), true);
EXPECT_EQ(std::regex_match("hello Friends!", m, re), false);
EXPECT_EQ(std::regex_search("hello Friends", m, re), true);
EXPECT_EQ(std::regex_match("hell Friends", m, re), false);
EXPECT_EQ(std::regex_search("hell Friends", m, re), false);
}
}
# endif
# if defined(REGEX_BENCHMARK_OUR)
// LibRegex through the C API (regcomp/regexec): exercises the REG_NOTBOL and
// REG_NOTEOL execution flags, alone and combined with REG_SEARCH, on an anchored
// pattern ("^hello friends$") and an unanchored one ("hello friends").
BENCHMARK_CASE(simple_notbol_noteol_benchmark)
{
    String pattern = "^hello friends$";
    String pattern2 = "hello friends";
    regex_t regex, regex2;

    EXPECT_EQ(regcomp(&regex, pattern.characters(), REG_EXTENDED | REG_NOSUB | REG_ICASE), REG_NOERR);
    EXPECT_EQ(regcomp(&regex2, pattern2.characters(), REG_EXTENDED | REG_NOSUB | REG_ICASE), REG_NOERR);

    for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
        EXPECT_EQ(regexec(&regex, "hello friends", 0, NULL, REG_NOTBOL), REG_NOMATCH);
        EXPECT_EQ(regexec(&regex, "hello friends", 0, NULL, REG_NOTEOL), REG_NOMATCH);
        EXPECT_EQ(regexec(&regex, "hello friends", 0, NULL, REG_NOTBOL | REG_NOTEOL), REG_NOMATCH);

        EXPECT_EQ(regexec(&regex, "a hello friends b", 0, NULL, REG_NOTBOL), REG_NOMATCH);
        EXPECT_EQ(regexec(&regex, "a hello friends", 0, NULL, REG_NOTBOL), REG_NOMATCH);
        EXPECT_EQ(regexec(&regex, "a hello friends", 0, NULL, REG_NOTBOL | REG_SEARCH), REG_NOERR);
        EXPECT_EQ(regexec(&regex, "a hello friends b", 0, NULL, REG_NOTBOL | REG_SEARCH), REG_NOERR);

        EXPECT_EQ(regexec(&regex, "a hello friends b", 0, NULL, REG_NOTEOL), REG_NOMATCH);
        EXPECT_EQ(regexec(&regex, "hello friends b", 0, NULL, REG_NOTEOL), REG_NOMATCH);
        EXPECT_EQ(regexec(&regex, "hello friends b", 0, NULL, REG_NOTEOL | REG_SEARCH), REG_NOERR);
        EXPECT_EQ(regexec(&regex, "a hello friends b", 0, NULL, REG_NOTEOL | REG_SEARCH), REG_NOMATCH);

        EXPECT_EQ(regexec(&regex, "a hello friends b", 0, NULL, REG_NOTBOL | REG_NOTEOL), REG_NOMATCH);
        EXPECT_EQ(regexec(&regex, "a hello friends b", 0, NULL, REG_NOTBOL | REG_NOTEOL | REG_SEARCH), REG_NOMATCH);

        EXPECT_EQ(regexec(&regex2, "hello friends", 0, NULL, REG_NOTBOL), REG_NOMATCH);
        EXPECT_EQ(regexec(&regex2, "hello friends", 0, NULL, REG_NOTEOL), REG_NOMATCH);
    }

    // Release both compiled patterns; regex2 was previously never freed.
    regfree(&regex);
    regfree(&regex2);
}
# endif
# if defined(REGEX_BENCHMARK_OTHER)
// std::regex reference for simple_notbol_noteol_benchmark.
// NOTE(review): match_not_bol / match_not_eol are passed to the std::regex
// constructor here. In the standard library they are match flags
// (std::regex_constants::match_flag_type) meant for regex_match()/regex_search(),
// not syntax options. Confirm this compiles, and that the expectations still hold,
// once REGEX_BENCHMARK_OTHER is enabled.
BENCHMARK_CASE(simple_notbol_noteol_benchmark_reference_stdcpp)
{
std::regex re1("^hello friends$", std::regex_constants::match_not_bol);
std::regex re2("^hello friends$", std::regex_constants::match_not_eol);
std::regex re3("^hello friends$", std::regex_constants::match_not_bol | std::regex_constants::match_not_eol);
std::regex re4("hello friends", std::regex_constants::match_not_bol);
std::regex re5("hello friends", std::regex_constants::match_not_eol);
std::cmatch m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(std::regex_match("hello friends", m, re1), false);
EXPECT_EQ(std::regex_match("hello friends", m, re2), false);
EXPECT_EQ(std::regex_match("hello friends", m, re3), false);
EXPECT_EQ(std::regex_match("a hello friends b", m, re1), false);
EXPECT_EQ(std::regex_match("a hello friends", m, re1), false);
EXPECT_EQ(std::regex_search("a hello friends", m, re1), true);
EXPECT_EQ(std::regex_search("a hello friends b", m, re1), true);
EXPECT_EQ(std::regex_match("a hello friends b", m, re2), false);
EXPECT_EQ(std::regex_match("hello friends b", m, re2), false);
EXPECT_EQ(std::regex_search("hello friends b", m, re2), true);
EXPECT_EQ(std::regex_search("a hello friends b", m, re2), false);
EXPECT_EQ(std::regex_match("a hello friends b", m, re3), false);
EXPECT_EQ(std::regex_search("a hello friends b", m, re3), false);
EXPECT_EQ(std::regex_match("hello friends", m, re4), false);
EXPECT_EQ(std::regex_match("hello friends", m, re5), false);
}
}
# endif
#endif
// Suite entry point, named "Regex"; the macro presumably comes from AK/TestSuite.h (verify).
TEST_MAIN(Regex)

View file

@ -0,0 +1,20 @@
# Every .cpp file in this directory is built as its own test executable.
file(GLOB TEST_SOURCES CONFIGURE_DEPENDS "*.cpp")
# Library sources from the parent directory and ../C are compiled directly into each test.
file(GLOB REGEX_SOURCES CONFIGURE_DEPENDS "../*.cpp" "../C/*.cpp")
foreach(source ${TEST_SOURCES})
# The executable/test name is the source file name without directory or extension.
get_filename_component(name ${source} NAME_WE)
add_executable(${name} ${source} ${REGEX_SOURCES})
target_link_libraries(${name} LagomCore)
add_test(
NAME ${name}
COMMAND ${name}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
)
# A test is reported as failed when its output matches "FAIL".
set_tests_properties(
${name}
PROPERTIES
FAIL_REGULAR_EXPRESSION
"FAIL"
)
endforeach()

View file

@ -0,0 +1,600 @@
/*
* Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <AK/TestSuite.h> // import first, to prevent warning of ASSERT* redefinition
#include <AK/StringBuilder.h>
#include <LibRegex/Regex.h>
#include <LibRegex/RegexDebug.h>
#include <stdio.h>
// Identity helper: returns the ECMAScriptOptions it receives, unchanged.
// The option tests in this file call it with OR-ed ECMAScriptFlags expressions,
// so the call only compiles if such an expression is accepted as ECMAScriptOptions.
static ECMAScriptOptions match_test_api_options(const ECMAScriptOptions options)
{
return options;
}
// Identity helper for PosixOptions: returns the options it receives, unchanged.
static PosixOptions match_test_api_options(const PosixOptions options)
{
return options;
}
// Exercises the |=, |, &, &= and ~ operators on ECMAScriptOptions / ECMAScriptFlags.
TEST_CASE(regex_options_ecmascript)
{
    ECMAScriptOptions options;

    // Setting a single flag with |=.
    options |= ECMAScriptFlags::Global;
    EXPECT(options & ECMAScriptFlags::Global);
    EXPECT(!(options & ECMAScriptFlags::Insensitive));

    // Passing an OR-ed flag expression through a function taking ECMAScriptOptions.
    options = match_test_api_options(ECMAScriptFlags::Global | ECMAScriptFlags::Insensitive | ECMAScriptFlags::Sticky);
    EXPECT(options & ECMAScriptFlags::Global);
    EXPECT(options & ECMAScriptFlags::Insensitive);
    EXPECT(options & ECMAScriptFlags::Sticky);
    EXPECT(!(options & ECMAScriptFlags::Unicode));
    EXPECT(!(options & ECMAScriptFlags::Multiline));
    EXPECT(!(options & ECMAScriptFlags::SingleLine));

    // Masking down to Insensitive only.
    options &= ECMAScriptFlags::Insensitive;
    EXPECT(!(options & ECMAScriptFlags::Global));
    EXPECT(options & ECMAScriptFlags::Insensitive);
    EXPECT(!(options & ECMAScriptFlags::Multiline));

    // Masking with a flag that is no longer set leaves nothing behind.
    options &= ECMAScriptFlags::Sticky;
    EXPECT(!(options & ECMAScriptFlags::Global));
    EXPECT(!(options & ECMAScriptFlags::Insensitive));
    EXPECT(!(options & ECMAScriptFlags::Multiline));
    EXPECT(!(options & ECMAScriptFlags::Sticky));

    // Complement: the checked flags are set, Insensitive is not.
    options = ~ECMAScriptFlags::Insensitive;
    EXPECT(options & ECMAScriptFlags::Global);
    EXPECT(!(options & ECMAScriptFlags::Insensitive));
    EXPECT(options & ECMAScriptFlags::Multiline);
    EXPECT(options & ECMAScriptFlags::Sticky);
}
// Exercises the |=, |, &, &= and ~ operators on PosixOptions / PosixFlags.
// Mirrors regex_options_ecmascript, with MatchNotBeginOfLine in the role of Sticky.
TEST_CASE(regex_options_posix)
{
    PosixOptions eo;

    // Setting a single flag with |=.
    eo |= PosixFlags::Global;
    EXPECT(eo & PosixFlags::Global);
    EXPECT(!(eo & PosixFlags::Insensitive));

    // Passing an OR-ed flag expression through a function taking PosixOptions.
    eo = match_test_api_options(PosixFlags::Global | PosixFlags::Insensitive | PosixFlags::MatchNotBeginOfLine);
    EXPECT(eo & PosixFlags::Global);
    EXPECT(eo & PosixFlags::Insensitive);
    EXPECT(eo & PosixFlags::MatchNotBeginOfLine);
    EXPECT(!(eo & PosixFlags::Unicode));
    EXPECT(!(eo & PosixFlags::Multiline));

    // Masking down to Insensitive only.
    eo &= PosixFlags::Insensitive;
    EXPECT(!(eo & PosixFlags::Global));
    EXPECT(eo & PosixFlags::Insensitive);
    EXPECT(!(eo & PosixFlags::Multiline));

    // Masking with a flag that is no longer set leaves nothing behind,
    // including the flag used as the mask.
    eo &= PosixFlags::MatchNotBeginOfLine;
    EXPECT(!(eo & PosixFlags::Global));
    EXPECT(!(eo & PosixFlags::Insensitive));
    EXPECT(!(eo & PosixFlags::Multiline));
    EXPECT(!(eo & PosixFlags::MatchNotBeginOfLine));

    // Complement: the checked flags are set, Insensitive is not.
    eo = ~PosixFlags::Insensitive;
    EXPECT(eo & PosixFlags::Global);
    EXPECT(!(eo & PosixFlags::Insensitive));
    EXPECT(eo & PosixFlags::Multiline);
}
// Feeds the lexer a pattern built from regex metacharacters and escapes and
// checks the type of each token it hands out, in order.
TEST_CASE(regex_lexer)
{
    Lexer lexer("/[.*+?^${}()|[\\]\\\\]/g");

    // One entry per call to next(), in the order the tokens are expected.
    const regex::TokenType expected_types[] = {
        regex::TokenType::Slash,
        regex::TokenType::LeftBracket,
        regex::TokenType::Period,
        regex::TokenType::Asterisk,
        regex::TokenType::Plus,
        regex::TokenType::Questionmark,
        regex::TokenType::Circumflex,
        regex::TokenType::Dollar,
        regex::TokenType::LeftCurly,
        regex::TokenType::RightCurly,
        regex::TokenType::LeftParen,
        regex::TokenType::RightParen,
        regex::TokenType::Pipe,
        regex::TokenType::LeftBracket,
        regex::TokenType::EscapeSequence,
        regex::TokenType::EscapeSequence,
        regex::TokenType::RightBracket,
        regex::TokenType::Slash,
        regex::TokenType::Char,
    };

    for (auto expected_type : expected_types)
        EXPECT(lexer.next().type() == expected_type);
}
// An empty pair of parentheses must be reported as Error::EmptySubExpression.
TEST_CASE(parser_error_parens)
{
    String source = "test()test";
    Lexer lexer(source);
    PosixExtendedParser parser(lexer);

    parser.parse();

    EXPECT(parser.has_error());
    EXPECT(parser.error() == Error::EmptySubExpression);
}
// Each of the characters '*', '+', '?' and '{' must yield
// Error::InvalidRepetitionMarker when it appears in the positions tried below.
TEST_CASE(parser_error_special_characters_used_at_wrong_place)
{
    String pattern;
    Vector<char, 5> markers = { '*', '+', '?', '{' };
    StringBuilder builder;

    Lexer lexer;
    PosixExtended parser(lexer);

    // Parses whatever the builder currently holds and expects the repetition-marker error.
    auto expect_invalid_repetition_marker = [&] {
        pattern = builder.build();
        lexer.set_source(pattern);
        parser.parse();
        EXPECT(parser.has_error());
        EXPECT(parser.error() == Error::InvalidRepetitionMarker);
    };

    for (auto& marker : markers) {
        // First in ere
        builder.clear();
        builder.append(marker);
        expect_invalid_repetition_marker();

        // After vertical line
        builder.clear();
        builder.append("a|");
        builder.append(marker);
        expect_invalid_repetition_marker();

        // After circumflex
        builder.clear();
        builder.append("^");
        builder.append(marker);
        expect_invalid_repetition_marker();

        // After dollar
        builder.clear();
        builder.append("$");
        builder.append(marker);
        expect_invalid_repetition_marker();

        // After left parens
        builder.clear();
        builder.append("(");
        builder.append(marker);
        builder.append(")");
        expect_invalid_repetition_marker();
    }
}
// A '|' with nothing on one of its sides must be reported as Error::EmptySubExpression.
TEST_CASE(parser_error_vertical_line_used_at_wrong_place)
{
    Lexer lexer;
    PosixExtended parser(lexer);

    // Parses the given source and expects the empty-sub-expression error.
    auto expect_empty_sub_expression = [&](const char* source) {
        lexer.set_source(source);
        parser.parse();
        EXPECT(parser.has_error());
        EXPECT(parser.error() == Error::EmptySubExpression);
    };

    // First in ere
    expect_empty_sub_expression("|asdf");
    // Last in ere
    expect_empty_sub_expression("asdf|");
    // After left parens
    expect_empty_sub_expression("(|asdf)");
    // Proceed right parens
    expect_empty_sub_expression("(asdf)|");
}
// "^.*$" matched through the member match() overload that fills a RegexResult.
TEST_CASE(catch_all_first)
{
    Regex<PosixExtended> re("^.*$");
    RegexResult match_result;

    // First call: only the filled-in result is inspected.
    re.match("Hello World", match_result);
    EXPECT(match_result.count == 1);

    // Second call: the return value itself signals the match.
    EXPECT(re.match("Hello World", match_result));
}
// "^.*$" with the Global flag, checked through both the member and the free-function API.
TEST_CASE(catch_all)
{
    Regex<PosixExtended> re("^.*$", PosixFlags::Global);

    // Member functions.
    EXPECT(re.has_match("Hello World"));
    EXPECT(re.match("Hello World").success);
    EXPECT(re.match("Hello World").count == 1);

    // Free functions taking the subject first and the regex second.
    EXPECT(has_match("Hello World", re));

    auto outcome = match("Hello World", re);
    EXPECT(outcome.success);
    EXPECT(outcome.count == 1);
    EXPECT(outcome.matches.size() == 1);
    EXPECT(outcome.matches.first().view == "Hello World");
}
// "^.*$" constructed with PosixFlags::Extra still matches a plain string.
TEST_CASE(catch_all_again)
{
    Regex<PosixExtended> catch_all_regex("^.*$", PosixFlags::Extra);

    EXPECT_EQ(has_match("Hello World", catch_all_regex), true);
}
// A multi-byte UTF-8 pattern is searched globally in a multi-script subject;
// the emoji occurs twice in it.
TEST_CASE(char_utf8)
{
    Regex<PosixExtended> re("😀");
    RegexResult result;

    result = match("Привет, мир! 😀 γειά σου κόσμος 😀 こんにちは世界", re, PosixFlags::Global);

    EXPECT_EQ(result.success, true);
    EXPECT_EQ(result.count, 2u);
}
// Multiline "^.*$" yields one match per line.
// The subject string is local to the lambda and is gone by the time the match
// views are compared; the regex is built with StringCopyMatches, which
// presumably makes the result keep its own copies — TODO confirm.
TEST_CASE(catch_all_newline)
{
    Regex<PosixExtended> re("^.*$", PosixFlags::Multiline | PosixFlags::StringCopyMatches);
    RegexResult result;

    auto match_short_lived_subject = [&result, &re]() {
        String subject = "Hello World\nTest\n1234\n";
        result = match(subject, re);
        EXPECT_EQ(result.success, true);
    };
    match_short_lived_subject();

    EXPECT_EQ(result.count, 3u);
    EXPECT_EQ(result.matches.at(0).view, "Hello World");
    EXPECT_EQ(result.matches.at(1).view, "Test");
    EXPECT_EQ(result.matches.at(2).view, "1234");
}
// Multiline "^.*$" without StringCopyMatches; the subject string stays alive
// for the whole test while the match views are compared.
TEST_CASE(catch_all_newline_view)
{
    Regex<PosixExtended> re("^.*$", PosixFlags::Multiline);
    RegexResult result;

    String subject = "Hello World\nTest\n1234\n";
    result = match(subject, re);
    EXPECT_EQ(result.success, true);
    EXPECT_EQ(result.count, 3u);

    // The first line is compared against a StringView, the others against literals.
    String first_line = "Hello World";
    EXPECT_EQ(result.matches.at(0).view, first_line.view());
    EXPECT_EQ(result.matches.at(1).view, "Test");
    EXPECT_EQ(result.matches.at(2).view, "1234");
}
// The same regex matched twice: once with Multiline passed per call (one match
// per line), once without extra flags (a single match spanning the whole subject).
TEST_CASE(catch_all_newline_2)
{
    Regex<PosixExtended> re("^.*$");
    RegexResult result;

    // Flags supplied at match time.
    result = match("Hello World\nTest\n1234\n", re, PosixFlags::Multiline | PosixFlags::StringCopyMatches);
    EXPECT_EQ(result.success, true);
    EXPECT_EQ(result.count, 3u);
    EXPECT_EQ(result.matches.at(0).view, "Hello World");
    EXPECT_EQ(result.matches.at(1).view, "Test");
    EXPECT_EQ(result.matches.at(2).view, "1234");

    // No per-call flags: newlines are part of the single match.
    result = match("Hello World\nTest\n1234\n", re);
    EXPECT_EQ(result.success, true);
    EXPECT_EQ(result.count, 1u);
    EXPECT_EQ(result.matches.at(0).view, "Hello World\nTest\n1234\n");
}
// Global search for single [[:alpha:]] characters in an INI-style subject.
TEST_CASE(match_all_character_class)
{
    Regex<PosixExtended> re("[[:alpha:]]");
    String subject = "[Window]\nOpacity=255\nAudibleBeep=0\n";

    RegexResult result = match(subject, re, PosixFlags::Global | PosixFlags::StringCopyMatches);

    EXPECT_EQ(result.success, true);
    EXPECT_EQ(result.count, 24u);
    EXPECT_EQ(result.matches.at(0).view, "W");
    EXPECT_EQ(result.matches.at(1).view, "i");
    EXPECT_EQ(result.matches.at(2).view, "n");

    // The first match ("W", index 1 of the subject) must not point into the
    // subject's own storage — presumably the effect of StringCopyMatches; TODO confirm.
    EXPECT(&result.matches.at(0).view.characters_without_null_termination()[0] != &subject.view().characters_without_null_termination()[1]);
}
// A character-class repetition followed by the end-of-line assertion matches once.
TEST_CASE(match_character_class_with_assertion)
{
    Regex<PosixExtended> re("[[:alpha:]]+$");
    String subject = "abcdef";

    RegexResult result = match(subject, re);

    EXPECT_EQ(result.success, true);
    EXPECT_EQ(result.count, 1u);
}
// "^.*$" on a two-line subject: one match covering everything by default,
// one match per line when Multiline is passed to match().
TEST_CASE(example_for_git_commit)
{
    Regex<PosixExtended> re("^.*$");

    // Default flags: a single match over all 33 characters, newline included.
    auto whole_text = re.match("Well, hello friends!\nHello World!");
    EXPECT(whole_text.success);
    EXPECT(whole_text.count == 1);
    EXPECT(whole_text.matches.at(0).view.starts_with("Well"));
    EXPECT(whole_text.matches.at(0).view.length() == 33);

    EXPECT(re.has_match("Well,...."));

    // Multiline: each line is its own match.
    auto per_line = re.match("Well, hello friends!\nHello World!", PosixFlags::Multiline);
    EXPECT(per_line.success);
    EXPECT(per_line.count == 2);
    EXPECT(per_line.matches.at(0).view == "Well, hello friends!");
    EXPECT(per_line.matches.at(1).view == "Hello World!");
}
// An e-mail address pattern using bracket expressions and bounded repetitions
// accepts two well-formed addresses.
TEST_CASE(email_address)
{
    Regex<PosixExtended> email_regex("^[A-Z0-9a-z._%+-]{1,64}@([A-Za-z0-9-]{1,63}\\.){1,125}[A-Za-z]{2,63}$");

    EXPECT(email_regex.has_match("hello.world@domain.tld"));
    EXPECT(email_regex.has_match("this.is.a.very_long_email_address@world.wide.web"));
}
// Searches an INI-style text for "[section]" headers and "key=digits" entries
// and checks the matches, their capture groups and the reported line/column.
TEST_CASE(ini_file_entries)
{
    Regex<PosixExtended> re("[[:alpha:]]*=([[:digit:]]*)|\\[(.*)\\]");
    RegexResult result;

#ifdef REGEX_DEBUG
    RegexDebug regex_dbg(stderr);
    regex_dbg.print_raw_bytecode(re);
    regex_dbg.print_header();
    regex_dbg.print_bytecode(re);
#endif

    String ini_text = "[Window]\nOpacity=255\nAudibleBeep=0\n";
    EXPECT_EQ(re.search(ini_text.view(), result, PosixFlags::Multiline), true);
    EXPECT_EQ(result.count, 3u);

#ifdef REGEX_DEBUG
    for (auto& v : result.matches)
        fprintf(stderr, "%s\n", v.view.to_string().characters());
#endif

    // Section header.
    EXPECT_EQ(result.matches.at(0).view, "[Window]");
    EXPECT_EQ(result.capture_group_matches.at(0).at(0).view, "Window");

    // First key/value entry, including position information.
    EXPECT_EQ(result.matches.at(1).view, "Opacity=255");
    EXPECT_EQ(result.matches.at(1).line, 1u);
    EXPECT_EQ(result.matches.at(1).column, 0u);
    EXPECT_EQ(result.capture_group_matches.at(1).at(0).view, "255");
    EXPECT_EQ(result.capture_group_matches.at(1).at(0).line, 1u);
    EXPECT_EQ(result.capture_group_matches.at(1).at(0).column, 8u);

    // Second key/value entry.
    EXPECT_EQ(result.matches.at(2).view, "AudibleBeep=0");
    EXPECT_EQ(result.capture_group_matches.at(2).at(0).view, "0");
    EXPECT_EQ(result.capture_group_matches.at(2).at(0).line, 2u);
    EXPECT_EQ(result.capture_group_matches.at(2).at(0).column, 12u);
}
// For a non-numeric value, match() fails on the whole subject while search()
// still finds one match (the digit group may be empty).
TEST_CASE(ini_file_entries2)
{
    Regex<PosixExtended> re("[[:alpha:]]*=([[:digit:]]*)");
    RegexResult result;
    String entry = "ViewMode=Icon";

    EXPECT_EQ(re.match(entry.view(), result), false);
    EXPECT_EQ(result.count, 0u);

    EXPECT_EQ(re.search(entry.view(), result), true);
    EXPECT_EQ(result.count, 1u);
}
// A named capture group "Test" is looked up by name for every match.
TEST_CASE(named_capture_group)
{
    Regex<PosixExtended> re("[[:alpha:]]*=(?<Test>[[:digit:]]*)");
    RegexResult result;

#ifdef REGEX_DEBUG
    RegexDebug regex_dbg(stderr);
    regex_dbg.print_raw_bytecode(re);
    regex_dbg.print_header();
    regex_dbg.print_bytecode(re);
#endif

    String ini_text = "[Window]\nOpacity=255\nAudibleBeep=0\n";
    EXPECT_EQ(re.search(ini_text, result, PosixFlags::Multiline), true);
    EXPECT_EQ(result.count, 2u);

    EXPECT_EQ(result.matches.at(0).view, "Opacity=255");
    EXPECT_EQ(result.named_capture_group_matches.at(0).ensure("Test").view, "255");

    EXPECT_EQ(result.matches.at(1).view, "AudibleBeep=0");
    EXPECT_EQ(result.named_capture_group_matches.at(1).ensure("Test").view, "0");
}
// "a*" searched over an INI-style text: the expected matches include
// zero-length ones (first and last) as well as a one-character "a".
TEST_CASE(a_star)
{
    Regex<PosixExtended> re("a*");
    RegexResult result;

#ifdef REGEX_DEBUG
    RegexDebug regex_dbg(stderr);
    regex_dbg.print_raw_bytecode(re);
    regex_dbg.print_header();
    regex_dbg.print_bytecode(re);
#endif

    String ini_text = "[Window]\nOpacity=255\nAudibleBeep=0\n";
    EXPECT_EQ(re.search(ini_text.view(), result, PosixFlags::Multiline), true);
    EXPECT_EQ(result.count, 32u);

    EXPECT_EQ(result.matches.at(0).view.length(), 0u);
    EXPECT_EQ(result.matches.at(10).view.length(), 1u);
    EXPECT_EQ(result.matches.at(10).view, "a");
    EXPECT_EQ(result.matches.at(31).view.length(), 0u);
}
// "hello.$" must be found only where "hello" plus exactly one more character
// ends the subject; matching is case-sensitive.
TEST_CASE(simple_period_end_benchmark)
{
    Regex<PosixExtended> re("hello.$");
    RegexResult search_result;

    struct Expectation {
        const char* subject;
        bool found;
    };
    const Expectation expectations[] = {
        { "Hello1", false },
        { "hello1hello1", true },
        { "hello2hell", false },
        { "hello?", true },
    };

    for (auto& expectation : expectations)
        EXPECT_EQ(re.search(expectation.subject, search_result), expectation.found);
}
// Parses a table of ECMA262 patterns and checks the error the parser reports for each.
// Every entry names the pattern, the expected error (NoError by default) and the
// flags the pattern has to be parsed with (none by default).
TEST_CASE(ECMA262_parse)
{
    struct _test {
        const char* pattern;
        regex::Error expected_error { regex::Error::NoError };
        regex::ECMAScriptFlags flags {};
    };

    constexpr _test tests[] {
        { "^hello.$" },
        { "^(hello.)$" },
        { "^h{0,1}ello.$" },
        { "^hello\\W$" },
        { "^hell\\w.$" },
        { "^hell\\x6f1$" }, // ^hello1$
        { "^hel(?:l\\w).$" },
        { "^hel(?<LO>l\\w).$" },
        { "^[-a-zA-Z\\w\\s]+$" },
        { "\\bhello\\B" },
        { "^[\\w+/_-]+[=]{0,2}$" },                        // #4189
        { "^(?:[^<]*(<[\\w\\W]+>)[^>]*$|#([\\w\\-]*)$)" }, // #4189
        { "\\/" },                                         // #4189
        { ",/=-:" },                                       // #4243
        { "\\x" },                                         // Even invalid escapes are allowed if ~unicode.
        { "\\", regex::Error::InvalidTrailingEscape },
        { "(?", regex::Error::InvalidCaptureGroup },
        { "\\u1234", regex::Error::NoError, regex::ECMAScriptFlags::Unicode },
        { "[\\u1234]", regex::Error::NoError, regex::ECMAScriptFlags::Unicode },
        { ",(?", regex::Error::InvalidCaptureGroup }, // #4583
    };

    for (auto& test : tests) {
        // Pass the per-entry flags along; otherwise the entries that request
        // ECMAScriptFlags::Unicode would be parsed without that flag.
        Regex<ECMA262> re(test.pattern, test.flags);
        EXPECT_EQ(re.parser_result.error, test.expected_error);
#ifdef REGEX_DEBUG
        dbgln("\n");
        RegexDebug regex_dbg(stderr);
        regex_dbg.print_raw_bytecode(re);
        regex_dbg.print_header();
        regex_dbg.print_bytecode(re);
        dbgln("\n");
#endif
    }
}
// Table-driven ECMA262 matching: every pattern must parse without error and
// match() on the subject must succeed or fail as listed (success by default).
TEST_CASE(ECMA262_match)
{
    struct MatchExpectation {
        const char* pattern;
        const char* subject;
        bool matches { true };
        ECMAScriptFlags options {};
    };

    constexpr MatchExpectation expectations[] {
        { "^hello.$", "hello1" },
        { "^(hello.)$", "hello1" },
        { "^h{0,1}ello.$", "ello1" },
        { "^hello\\W$", "hello!" },
        { "^hell\\w.$", "hellx!" },
        { "^hell\\x6f1$", "hello1" },
        { "^hel(?<LO>l.)1$", "hello1" },
        { "^hel(?<LO>l.)1*\\k<LO>.$", "hello1lo1" },
        { "^[-a-z1-3\\s]+$", "hell2 o1" },
        { .pattern = "\\bhello\\B", .subject = "hello1", .options = ECMAScriptFlags::Global },
        { "\\b.*\\b", "hello1" },
        { "[^\\D\\S]{2}", "1 " },
        { "bar(?=f.)foo", "barfoo" },
        { "bar(?=foo)bar", "barbar", false },
        { "bar(?!foo)bar", "barbar", true },
        { "bar(?!bar)bar", "barbar", false },
        { "bar.*(?<=foo)", "barbar", false },
        { "bar.*(?<!foo)", "barbar", true },
        { "((...)X)+", "fooXbarXbazX", true },
        { "(?:)", "", true },
    };

    for (auto& expectation : expectations) {
        Regex<ECMA262> re(expectation.pattern, expectation.options);
#ifdef REGEX_DEBUG
        dbgln("\n");
        RegexDebug regex_dbg(stderr);
        regex_dbg.print_raw_bytecode(re);
        regex_dbg.print_header();
        regex_dbg.print_bytecode(re);
        dbgln("\n");
#endif
        // Every pattern in the table is expected to be valid.
        EXPECT_EQ(re.parser_result.error, Error::NoError);
        EXPECT_EQ(re.match(expectation.subject).success, expectation.matches);
    }
}
// Table-driven check of Regex::replace(): each row lists the pattern, the
// replacement text (which may use \N group references), the subject and the
// expected output.
TEST_CASE(replace)
{
    struct ReplaceExpectation {
        const char* pattern;
        const char* replacement;
        const char* subject;
        const char* expected;
        ECMAScriptFlags options {};
    };

    constexpr ReplaceExpectation expectations[] {
        { "foo(.+)", "aaa", "test", "test" },
        { "foo(.+)", "test\\1", "foobar", "testbar" },
        { "foo(.+)", "\\2\\1", "foobar", "\\2bar" },
        { "foo(.+)", "\\\\\\1", "foobar", "\\bar" },
        { "foo(.)", "a\\1", "fooxfooy", "axay", ECMAScriptFlags::Multiline },
    };

    for (auto& expectation : expectations) {
        Regex<ECMA262> re(expectation.pattern, expectation.options);
#ifdef REGEX_DEBUG
        dbgln("\n");
        RegexDebug regex_dbg(stderr);
        regex_dbg.print_raw_bytecode(re);
        regex_dbg.print_header();
        regex_dbg.print_bytecode(re);
        dbgln("\n");
#endif
        // Every pattern in the table is expected to be valid.
        EXPECT_EQ(re.parser_result.error, Error::NoError);
        EXPECT_EQ(re.replace(expectation.subject, expectation.replacement), expectation.expected);
    }
}
// Test-suite entry point for the "Regex" suite.
// NOTE(review): TEST_MAIN presumably comes from AK/TestSuite.h, included above — confirm.
TEST_MAIN(Regex)

File diff suppressed because it is too large Load diff