mirror of
https://github.com/RGBCube/serenity
synced 2025-07-22 22:37:40 +00:00
LibRegex: Use a match table for character classes
Generate a sorted, compressed series of ranges in a match table for character classes, and use a binary search to find the matches. This is about a 3-4x speedup for character class match performance. :^)
This commit is contained in:
parent
478b36c37b
commit
8f722302d9
6 changed files with 232 additions and 35 deletions
|
@ -910,3 +910,21 @@ TEST_CASE(optimizer_atomic_groups)
|
||||||
EXPECT_EQ(result.success, test.get<2>());
|
EXPECT_EQ(result.success, test.get<2>());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Exercises the character class lookup table with a class made up solely of
// individual code points, matched repeatedly against a subject that contains none of them.
TEST_CASE(optimizer_char_class_lut)
{
    constexpr size_t iteration_count = 1'000'000;
    auto const subject = "1635488940000"sv;

    Regex<ECMA262> re(R"([\f\n\r\t\v\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000\ufeff]+$)");

    // Dump the generated bytecode when regex debugging is compiled in.
    if constexpr (REGEX_DEBUG) {
        dbgln("\n");
        RegexDebug regex_dbg(stderr);
        regex_dbg.print_raw_bytecode(re);
        regex_dbg.print_header();
        regex_dbg.print_bytecode(re);
        dbgln("\n");
    }

    // No character of the subject is in the class, so every attempt has to
    // consider _all_ alternatives of the character class before it fails.
    for (auto remaining = iteration_count; remaining > 0; --remaining) {
        auto const result = re.match(subject);
        EXPECT_EQ(result.success, false);
    }
}
|
||||||
|
|
|
@ -9,6 +9,8 @@
|
||||||
#include <AK/Types.h>
|
#include <AK/Types.h>
|
||||||
|
|
||||||
namespace regex {
|
namespace regex {
|
||||||
|
struct CompareTypeAndValuePair;
|
||||||
|
|
||||||
enum class Error : u8;
|
enum class Error : u8;
|
||||||
class Lexer;
|
class Lexer;
|
||||||
class PosixExtendedParser;
|
class PosixExtendedParser;
|
||||||
|
|
|
@ -7,6 +7,7 @@
|
||||||
#include "RegexByteCode.h"
|
#include "RegexByteCode.h"
|
||||||
#include "AK/StringBuilder.h"
|
#include "AK/StringBuilder.h"
|
||||||
#include "RegexDebug.h"
|
#include "RegexDebug.h"
|
||||||
|
#include <AK/BinarySearch.h>
|
||||||
#include <AK/CharacterTypes.h>
|
#include <AK/CharacterTypes.h>
|
||||||
#include <AK/Debug.h>
|
#include <AK/Debug.h>
|
||||||
#include <LibUnicode/CharacterTypes.h>
|
#include <LibUnicode/CharacterTypes.h>
|
||||||
|
@ -491,6 +492,38 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
|
||||||
|
|
||||||
compare_character_class(input, state, character_class, ch, current_inversion_state(), inverse_matched);
|
compare_character_class(input, state, character_class, ch, current_inversion_state(), inverse_matched);
|
||||||
|
|
||||||
|
} else if (compare_type == CharacterCompareType::LookupTable) {
    if (input.view.length() <= state.string_position)
        return ExecutionResult::Failed_ExecuteLowPrioForks;

    // Bytecode layout: a range count followed by that many packed CharRange values.
    auto count = m_bytecode->at(offset++);
    auto range_data = m_bytecode->spans().slice(offset, count);
    offset += count;

    auto ch = input.view.substring_view(state.string_position, 1)[0];

    // Exact lookup of a single code point. The comparator must stay consistent with the
    // order the ranges are stored in, so it never case-folds the range bounds.
    auto find_range = [&](auto needle) {
        return binary_search(range_data, needle, nullptr, [](auto value, CharRange range) {
            if (value > range.to)
                return 1;
            if (value < range.from)
                return -1;
            return 0;
        });
    };

    auto matching_range = find_range(ch);

    // Case-insensitive matching: folding the range bounds inside the comparator would break
    // the ordering the binary search depends on, and comparing a folded needle against
    // unfolded bounds misses e.g. 'B' in [A-Z]. Instead, probe the table again with both
    // ASCII case variants of the code point.
    if (!matching_range && (input.regex_options & AllFlags::Insensitive)) {
        matching_range = find_range(to_ascii_lowercase(ch));
        if (!matching_range)
            matching_range = find_range(to_ascii_uppercase(ch));
    }

    if (matching_range) {
        if (current_inversion_state())
            inverse_matched = true;
        else
            advance_string_position(state, input.view, ch);
    }
|
||||||
|
|
||||||
} else if (compare_type == CharacterCompareType::CharRange) {
|
} else if (compare_type == CharacterCompareType::CharRange) {
|
||||||
if (input.view.length() <= state.string_position)
|
if (input.view.length() <= state.string_position)
|
||||||
return ExecutionResult::Failed_ExecuteLowPrioForks;
|
return ExecutionResult::Failed_ExecuteLowPrioForks;
|
||||||
|
@ -816,6 +849,10 @@ Vector<CompareTypeAndValuePair> OpCode_Compare::flat_compares() const
|
||||||
} else if (compare_type == CharacterCompareType::CharRange) {
|
} else if (compare_type == CharacterCompareType::CharRange) {
|
||||||
auto value = m_bytecode->at(offset++);
|
auto value = m_bytecode->at(offset++);
|
||||||
result.append({ compare_type, value });
|
result.append({ compare_type, value });
|
||||||
|
} else if (compare_type == CharacterCompareType::LookupTable) {
|
||||||
|
auto count = m_bytecode->at(offset++);
|
||||||
|
for (size_t i = 0; i < count; ++i)
|
||||||
|
result.append({ CharacterCompareType::CharRange, m_bytecode->at(offset++) });
|
||||||
} else {
|
} else {
|
||||||
result.append({ compare_type, 0 });
|
result.append({ compare_type, 0 });
|
||||||
}
|
}
|
||||||
|
@ -884,6 +921,16 @@ Vector<String> const OpCode_Compare::variable_arguments_to_string(Optional<Match
|
||||||
result.empend(String::formatted(
|
result.empend(String::formatted(
|
||||||
"compare against: '{}'",
|
"compare against: '{}'",
|
||||||
input.value().view.substring_view(string_start_offset, state().string_position > view.length() ? 0 : 1).to_string()));
|
input.value().view.substring_view(string_start_offset, state().string_position > view.length() ? 0 : 1).to_string()));
|
||||||
|
} else if (compare_type == CharacterCompareType::LookupTable) {
|
||||||
|
auto count = m_bytecode->at(offset++);
|
||||||
|
for (size_t j = 0; j < count; ++j) {
|
||||||
|
auto range = (CharRange)m_bytecode->at(offset++);
|
||||||
|
result.append(String::formatted("{:x}-{:x}", range.from, range.to));
|
||||||
|
}
|
||||||
|
if (!view.is_null() && view.length() > state().string_position)
|
||||||
|
result.empend(String::formatted(
|
||||||
|
"compare against: '{}'",
|
||||||
|
input.value().view.substring_view(string_start_offset, state().string_position > view.length() ? 0 : 1).to_string()));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return result;
|
return result;
|
||||||
|
|
|
@ -75,7 +75,8 @@ enum class OpCodeId : ByteCodeValueType {
|
||||||
__ENUMERATE_CHARACTER_COMPARE_TYPE(GeneralCategory) \
|
__ENUMERATE_CHARACTER_COMPARE_TYPE(GeneralCategory) \
|
||||||
__ENUMERATE_CHARACTER_COMPARE_TYPE(Script) \
|
__ENUMERATE_CHARACTER_COMPARE_TYPE(Script) \
|
||||||
__ENUMERATE_CHARACTER_COMPARE_TYPE(ScriptExtension) \
|
__ENUMERATE_CHARACTER_COMPARE_TYPE(ScriptExtension) \
|
||||||
__ENUMERATE_CHARACTER_COMPARE_TYPE(RangeExpressionDummy)
|
__ENUMERATE_CHARACTER_COMPARE_TYPE(RangeExpressionDummy) \
|
||||||
|
__ENUMERATE_CHARACTER_COMPARE_TYPE(LookupTable)
|
||||||
|
|
||||||
enum class CharacterCompareType : ByteCodeValueType {
|
enum class CharacterCompareType : ByteCodeValueType {
|
||||||
#define __ENUMERATE_CHARACTER_COMPARE_TYPE(x) x,
|
#define __ENUMERATE_CHARACTER_COMPARE_TYPE(x) x,
|
||||||
|
@ -186,26 +187,7 @@ public:
|
||||||
|
|
||||||
void insert_bytecode_compare_values(Vector<CompareTypeAndValuePair>&& pairs)
|
void insert_bytecode_compare_values(Vector<CompareTypeAndValuePair>&& pairs)
|
||||||
{
|
{
|
||||||
ByteCode bytecode;
|
Optimizer::append_character_class(*this, move(pairs));
|
||||||
|
|
||||||
bytecode.empend(static_cast<ByteCodeValueType>(OpCodeId::Compare));
|
|
||||||
bytecode.empend(pairs.size()); // number of arguments
|
|
||||||
|
|
||||||
ByteCode arguments;
|
|
||||||
for (auto& value : pairs) {
|
|
||||||
VERIFY(value.type != CharacterCompareType::RangeExpressionDummy);
|
|
||||||
VERIFY(value.type != CharacterCompareType::Undefined);
|
|
||||||
VERIFY(value.type != CharacterCompareType::String);
|
|
||||||
|
|
||||||
arguments.append((ByteCodeValueType)value.type);
|
|
||||||
if (value.type != CharacterCompareType::Inverse && value.type != CharacterCompareType::AnyChar && value.type != CharacterCompareType::TemporaryInverse)
|
|
||||||
arguments.append(move(value.value));
|
|
||||||
}
|
|
||||||
|
|
||||||
bytecode.empend(arguments.size()); // size of arguments
|
|
||||||
bytecode.extend(move(arguments));
|
|
||||||
|
|
||||||
extend(move(bytecode));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void insert_bytecode_check_boundary(BoundaryCheckType type)
|
void insert_bytecode_check_boundary(BoundaryCheckType type)
|
||||||
|
|
|
@ -7,12 +7,14 @@
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include "Forward.h"
|
#include "Forward.h"
|
||||||
|
#include <AK/Vector.h>
|
||||||
|
|
||||||
namespace regex {
|
namespace regex {
|
||||||
|
|
||||||
class Optimizer {
|
class Optimizer {
|
||||||
public:
|
public:
|
||||||
static void append_alternation(ByteCode& target, ByteCode&& left, ByteCode&& right);
|
static void append_alternation(ByteCode& target, ByteCode&& left, ByteCode&& right);
|
||||||
|
static void append_character_class(ByteCode& target, Vector<CompareTypeAndValuePair>&& pairs);
|
||||||
};
|
};
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -488,6 +488,152 @@ void Optimizer::append_alternation(ByteCode& target, ByteCode&& left, ByteCode&&
|
||||||
// LABEL _END = alternative_bytecode.size
|
// LABEL _END = alternative_bytecode.size
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Result of trying to place one character class entry into a lookup table.
enum class LookupTableInsertionOutcome {
    // The entry (a single character or a range) was stored in the table.
    Successful,
    // The entry was AnyChar.
    ReplaceWithAnyChar,
    // The entry was a TemporaryInverse marker.
    TemporaryInversionNeeded,
    // The entry was an Inverse marker.
    PermanentInversionNeeded,
    // The entry is of a kind that has no table representation
    // (character classes, references, Unicode properties/categories/scripts).
    CannotPlaceInTable,
};
|
||||||
|
// Tries to record one character class entry in `table`.
// Single characters and character ranges are inserted, keyed by the start of their range;
// every other entry kind leaves `table` untouched and is reported back through the
// returned outcome so the caller can decide how to emit it.
static LookupTableInsertionOutcome insert_into_lookup_table(RedBlackTree<ByteCodeValueType, CharRange>& table, CompareTypeAndValuePair pair)
{
    auto outcome = LookupTableInsertionOutcome::Successful;

    switch (pair.type) {
    // Entries that fit into the table.
    case CharacterCompareType::Char: {
        // A single character is stored as a range that starts and ends on it.
        auto const code_point = static_cast<u32>(pair.value);
        table.insert(pair.value, CharRange { code_point, code_point });
        break;
    }
    case CharacterCompareType::CharRange: {
        CharRange range { pair.value };
        table.insert(range.from, range);
        break;
    }

    // Markers the caller has to act upon.
    case CharacterCompareType::Inverse:
        outcome = LookupTableInsertionOutcome::PermanentInversionNeeded;
        break;
    case CharacterCompareType::TemporaryInverse:
        outcome = LookupTableInsertionOutcome::TemporaryInversionNeeded;
        break;
    case CharacterCompareType::AnyChar:
        outcome = LookupTableInsertionOutcome::ReplaceWithAnyChar;
        break;

    // Entries without a table representation.
    case CharacterCompareType::CharClass:
    case CharacterCompareType::Reference:
    case CharacterCompareType::Property:
    case CharacterCompareType::GeneralCategory:
    case CharacterCompareType::Script:
    case CharacterCompareType::ScriptExtension:
        outcome = LookupTableInsertionOutcome::CannotPlaceInTable;
        break;

    // Entry kinds that must never be handed to this function.
    case CharacterCompareType::Undefined:
    case CharacterCompareType::RangeExpressionDummy:
    case CharacterCompareType::String:
    case CharacterCompareType::LookupTable:
        VERIFY_NOT_REACHED();
    }

    return outcome;
}
|
||||||
|
|
||||||
|
// Appends one Compare opcode to `target` that matches the character class described by `pairs`.
//
// A class with at most one entry is emitted as-is. For larger classes, all Char and CharRange
// entries are gathered into tables ordered by range start (one for plain entries, one for
// entries seen while inverted); overlapping or adjacent ranges are merged and every non-empty
// table is emitted as a single LookupTable argument. Entries that cannot be stored in a table
// are emitted individually, in their original order, ahead of the tables.
//
// Emitted layout: Compare, <argument count>, <size of arguments>, <arguments...>.
void Optimizer::append_character_class(ByteCode& target, Vector<CompareTypeAndValuePair>&& pairs)
{
    ByteCode arguments;
    size_t argument_count = 0;

    if (pairs.size() <= 1) {
        for (auto& pair : pairs) {
            // Same sanity checks the multi-entry path enforces through insert_into_lookup_table():
            // these kinds have no valid "type, value" encoding here.
            VERIFY(pair.type != CharacterCompareType::RangeExpressionDummy);
            VERIFY(pair.type != CharacterCompareType::Undefined);
            VERIFY(pair.type != CharacterCompareType::String);
            VERIFY(pair.type != CharacterCompareType::LookupTable);

            arguments.append(to_underlying(pair.type));
            // AnyChar and the inversion markers carry no value.
            if (pair.type != CharacterCompareType::AnyChar && pair.type != CharacterCompareType::TemporaryInverse && pair.type != CharacterCompareType::Inverse)
                arguments.append(pair.value);
            ++argument_count;
        }
    } else {
        RedBlackTree<ByteCodeValueType, CharRange> table;
        RedBlackTree<ByteCodeValueType, CharRange> inverted_table;
        // `current_table` receives insertions; the two pointers trade places whenever the inversion state flips.
        auto* current_table = &table;
        auto* current_inverted_table = &inverted_table;
        bool invert_for_next_iteration = false;
        bool is_currently_inverted = false;

        for (auto& value : pairs) {
            // A TemporaryInverse seen in the previous iteration only covers this entry,
            // so the inversion is undone again once this entry has been handled.
            auto should_invert_after_this_iteration = invert_for_next_iteration;
            invert_for_next_iteration = false;

            auto insertion_result = insert_into_lookup_table(*current_table, value);
            switch (insertion_result) {
            case LookupTableInsertionOutcome::Successful:
                break;
            case LookupTableInsertionOutcome::ReplaceWithAnyChar: {
                // Drop everything collected so far and emit a plain AnyChar instead.
                table.clear();
                inverted_table.clear();
                arguments.append(to_underlying(CharacterCompareType::AnyChar));
                ++argument_count;
                break;
            }
            case LookupTableInsertionOutcome::TemporaryInversionNeeded:
                swap(current_table, current_inverted_table);
                invert_for_next_iteration = true;
                is_currently_inverted = !is_currently_inverted;
                break;
            case LookupTableInsertionOutcome::PermanentInversionNeeded:
                // NOTE(review): the Inverse marker itself is never written to `arguments`; inversion is only
                // expressed through TemporaryInverse prefixes below - confirm the executor treats both alike.
                swap(current_table, current_inverted_table);
                is_currently_inverted = !is_currently_inverted;
                break;
            case LookupTableInsertionOutcome::CannotPlaceInTable:
                // Emit the entry directly, prefixed with TemporaryInverse if it appeared while inverted.
                if (is_currently_inverted) {
                    arguments.append(to_underlying(CharacterCompareType::TemporaryInverse));
                    ++argument_count;
                }
                arguments.append(to_underlying(value.type));
                arguments.append(value.value);
                ++argument_count;
                break;
            }

            if (should_invert_after_this_iteration) {
                swap(current_table, current_inverted_table);
                is_currently_inverted = !is_currently_inverted;
            }
        }

        // Emits `table` as one LookupTable argument: the type, the number of ranges, then the ranges
        // themselves in ascending order with overlapping or directly adjacent ranges merged.
        auto append_table = [&](auto& table) {
            ++argument_count;
            arguments.append(to_underlying(CharacterCompareType::LookupTable));
            // Reserve the slot for the range count; it is only known after merging.
            auto size_index = arguments.size();
            arguments.append(0);
            Optional<CharRange> active_range;
            size_t range_count = 0;
            for (auto& range : table) {
                if (!active_range.has_value()) {
                    active_range = range;
                    continue;
                }

                if (range.from <= active_range->to + 1 && range.to + 1 >= active_range->from) {
                    // Overlapping or touching: grow the pending range.
                    active_range = CharRange { min(range.from, active_range->from), max(range.to, active_range->to) };
                } else {
                    // Disjoint: flush the pending range and start a new one.
                    ++range_count;
                    arguments.append(active_range.release_value());
                    active_range = range;
                }
            }
            if (active_range.has_value()) {
                ++range_count;
                arguments.append(active_range.release_value());
            }
            arguments[size_index] = range_count;
        };

        if (!table.is_empty())
            append_table(table);

        if (!inverted_table.is_empty()) {
            ++argument_count;
            arguments.append(to_underlying(CharacterCompareType::TemporaryInverse));
            append_table(inverted_table);
        }
    }

    target.empend(static_cast<ByteCodeValueType>(OpCodeId::Compare));
    target.empend(argument_count);   // number of arguments
    target.empend(arguments.size()); // size of arguments
    target.extend(move(arguments));
}
|
||||||
|
|
||||||
template void Regex<PosixBasicParser>::run_optimization_passes();
|
template void Regex<PosixBasicParser>::run_optimization_passes();
|
||||||
template void Regex<PosixExtendedParser>::run_optimization_passes();
|
template void Regex<PosixExtendedParser>::run_optimization_passes();
|
||||||
template void Regex<ECMA262Parser>::run_optimization_passes();
|
template void Regex<ECMA262Parser>::run_optimization_passes();
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue