1
Fork 0
mirror of https://github.com/RGBCube/serenity synced 2025-05-31 10:18:11 +00:00

LibRegex: Use a match table for character classes

Generate a sorted, compressed series of ranges in a match table for
character classes, and use a binary search to find the matches.
This is about a 3-4x speedup for character class match performance. :^)
This commit is contained in:
Ali Mohammad Pur 2021-10-03 19:01:25 +03:30 committed by Andreas Kling
parent 478b36c37b
commit 8f722302d9
6 changed files with 232 additions and 35 deletions

View file

@ -488,6 +488,152 @@ void Optimizer::append_alternation(ByteCode& target, ByteCode&& left, ByteCode&&
// LABEL _END = alterantive_bytecode.size
}
enum class LookupTableInsertionOutcome {
Successful,
ReplaceWithAnyChar,
TemporaryInversionNeeded,
PermanentInversionNeeded,
CannotPlaceInTable,
};
static LookupTableInsertionOutcome insert_into_lookup_table(RedBlackTree<ByteCodeValueType, CharRange>& table, CompareTypeAndValuePair pair)
{
switch (pair.type) {
case CharacterCompareType::Inverse:
return LookupTableInsertionOutcome::PermanentInversionNeeded;
case CharacterCompareType::TemporaryInverse:
return LookupTableInsertionOutcome::TemporaryInversionNeeded;
case CharacterCompareType::AnyChar:
return LookupTableInsertionOutcome::ReplaceWithAnyChar;
case CharacterCompareType::CharClass:
return LookupTableInsertionOutcome::CannotPlaceInTable;
case CharacterCompareType::Char:
table.insert(pair.value, { (u32)pair.value, (u32)pair.value });
break;
case CharacterCompareType::CharRange: {
CharRange range { pair.value };
table.insert(range.from, range);
break;
}
case CharacterCompareType::Reference:
case CharacterCompareType::Property:
case CharacterCompareType::GeneralCategory:
case CharacterCompareType::Script:
case CharacterCompareType::ScriptExtension:
return LookupTableInsertionOutcome::CannotPlaceInTable;
case CharacterCompareType::Undefined:
case CharacterCompareType::RangeExpressionDummy:
case CharacterCompareType::String:
case CharacterCompareType::LookupTable:
VERIFY_NOT_REACHED();
}
return LookupTableInsertionOutcome::Successful;
}
void Optimizer::append_character_class(ByteCode& target, Vector<CompareTypeAndValuePair>&& pairs)
{
ByteCode arguments;
size_t argument_count = 0;
if (pairs.size() <= 1) {
for (auto& pair : pairs) {
arguments.append(to_underlying(pair.type));
if (pair.type != CharacterCompareType::AnyChar && pair.type != CharacterCompareType::TemporaryInverse && pair.type != CharacterCompareType::Inverse)
arguments.append(pair.value);
++argument_count;
}
} else {
RedBlackTree<ByteCodeValueType, CharRange> table;
RedBlackTree<ByteCodeValueType, CharRange> inverted_table;
auto* current_table = &table;
auto* current_inverted_table = &inverted_table;
bool invert_for_next_iteration = false;
bool is_currently_inverted = false;
for (auto& value : pairs) {
auto should_invert_after_this_iteration = invert_for_next_iteration;
invert_for_next_iteration = false;
auto insertion_result = insert_into_lookup_table(*current_table, value);
switch (insertion_result) {
case LookupTableInsertionOutcome::Successful:
break;
case LookupTableInsertionOutcome::ReplaceWithAnyChar: {
table.clear();
inverted_table.clear();
arguments.append(to_underlying(CharacterCompareType::AnyChar));
++argument_count;
break;
}
case LookupTableInsertionOutcome::TemporaryInversionNeeded:
swap(current_table, current_inverted_table);
invert_for_next_iteration = true;
is_currently_inverted = !is_currently_inverted;
break;
case LookupTableInsertionOutcome::PermanentInversionNeeded:
swap(current_table, current_inverted_table);
is_currently_inverted = !is_currently_inverted;
break;
case LookupTableInsertionOutcome::CannotPlaceInTable:
if (is_currently_inverted) {
arguments.append(to_underlying(CharacterCompareType::TemporaryInverse));
++argument_count;
}
arguments.append(to_underlying(value.type));
arguments.append(value.value);
++argument_count;
break;
}
if (should_invert_after_this_iteration) {
swap(current_table, current_inverted_table);
is_currently_inverted = !is_currently_inverted;
}
}
auto append_table = [&](auto& table) {
++argument_count;
arguments.append(to_underlying(CharacterCompareType::LookupTable));
auto size_index = arguments.size();
arguments.append(0);
Optional<CharRange> active_range;
size_t range_count = 0;
for (auto& range : table) {
if (!active_range.has_value()) {
active_range = range;
continue;
}
if (range.from <= active_range->to + 1 && range.to + 1 >= active_range->from) {
active_range = CharRange { min(range.from, active_range->from), max(range.to, active_range->to) };
} else {
++range_count;
arguments.append(active_range.release_value());
active_range = range;
}
}
if (active_range.has_value()) {
++range_count;
arguments.append(active_range.release_value());
}
arguments[size_index] = range_count;
};
if (!table.is_empty())
append_table(table);
if (!inverted_table.is_empty()) {
++argument_count;
arguments.append(to_underlying(CharacterCompareType::TemporaryInverse));
append_table(inverted_table);
}
}
target.empend(static_cast<ByteCodeValueType>(OpCodeId::Compare));
target.empend(argument_count); // number of arguments
target.empend(arguments.size()); // size of arguments
target.extend(move(arguments));
}
template void Regex<PosixBasicParser>::run_optimization_passes();
template void Regex<PosixExtendedParser>::run_optimization_passes();
template void Regex<ECMA262Parser>::run_optimization_passes();