1
Fork 0
mirror of https://github.com/RGBCube/serenity synced 2025-05-31 17:58:12 +00:00

LibRegex: Fully interpret the Compare Op when looking for overlaps

We had a really naive and simplistic implementation, which led to
various issues where the optimiser incorrectly rewrote the regex to use
atomic groups; this commit fixes that.
This commit is contained in:
Ali Mohammad Pur 2022-07-04 21:11:15 +04:30 committed by Andreas Kling
parent b6f2ba6917
commit 6e655b7f89
4 changed files with 206 additions and 98 deletions

View file

@ -108,6 +108,182 @@ typename Regex<Parser>::BasicBlockList Regex<Parser>::split_basic_blocks(ByteCod
return block_boundaries;
}
// Conservatively decides whether two flattened compare sequences can accept a common code point.
//
// Both sequences are interpreted entry by entry while tracking their inversion state:
//  - Inverse toggles the inversion of every entry that follows it;
//  - TemporaryInverse toggles the inversion of the next value entry only.
// NOTE(review): the negation handling assumes that entries negated through Inverse reject the union of their
//               values (as in `[^ab]`), while an entry negated through TemporaryInverse contributes its own
//               complement - confirm against the interpreter of the Compare op.
//
// `lhs` is summarised first (ranges and character classes, both plain and negated), then every entry of `rhs` is
// tested against that summary.
//
// Returns true if the sequences overlap *or* if an overlap cannot be ruled out, and false only when they were
// shown to be disjoint. The visible caller refuses the atomic rewrite when this returns true, so answering true
// merely costs an optimisation, whereas a wrong false changes what the regex matches.
static bool has_overlap(Vector<CompareTypeAndValuePair> const& lhs, Vector<CompareTypeAndValuePair> const& rhs)
{
    // Inversion state of the sequence that is currently being interpreted (reset before switching to rhs).
    bool inverse { false };
    bool temporary_inverse { false };
    auto current_inversion_state = [&]() -> bool { return temporary_inverse ^ inverse; };

    // Summary of lhs. Ranges are stored as `from -> to`; a single character c is stored as the range c..c.
    RedBlackTree<u32, u32> lhs_ranges;
    RedBlackTree<u32, u32> lhs_negated_ranges;
    HashTable<CharClass> lhs_char_classes;
    HashTable<CharClass> lhs_negated_char_classes;
    bool lhs_has_ranges { false };
    size_t lhs_char_class_count { 0 }; // Number of distinct entries in lhs_char_classes.
    size_t lhs_negated_entry_count { 0 };
    bool lhs_has_temporarily_negated_entry { false };

    // Summary of the negated part of rhs; it can only be judged once all of rhs has been seen.
    RedBlackTree<u32, u32> rhs_negated_ranges;
    HashTable<CharClass> rhs_negated_char_classes;
    size_t rhs_negated_entry_count { 0 };
    bool rhs_has_temporarily_negated_entry { false };
    size_t lhs_char_classes_negated_by_rhs { 0 }; // Distinct lhs classes that rhs explicitly negates.

    // Records that the current entry is negated, and whether that is due to TemporaryInverse alone.
    auto note_negated_entry = [&](size_t& count, bool& has_temporarily_negated_entry) {
        ++count;
        if (!inverse)
            has_temporarily_negated_entry = true;
    };

    // The negated entries of a sequence describe "everything outside their union" only if they were all negated
    // through Inverse, or if there is just a single one of them. Otherwise we cannot exclude anything based on them.
    auto negation_is_exact = [](size_t count, bool has_temporarily_negated_entry) -> bool {
        return count <= 1 || !has_temporarily_negated_entry;
    };

    // True if any stored range shares at least one code point with start..end.
    auto any_range_intersects = [](RedBlackTree<u32, u32>& ranges, u32 start, u32 end) -> bool {
        for (auto it = ranges.begin(); it != ranges.end(); ++it) {
            if (it.key() <= end && start <= *it)
                return true;
        }
        return false;
    };

    // True if a single stored range contains all of start..end.
    // Coverage by several adjacent ranges is not detected, which only makes the final answer more conservative.
    auto some_range_covers = [](RedBlackTree<u32, u32>& ranges, u32 start, u32 end) -> bool {
        for (auto it = ranges.begin(); it != ranges.end(); ++it) {
            if (it.key() <= start && end <= *it)
                return true;
        }
        return false;
    };

    for (auto const& pair : lhs) {
        switch (pair.type) {
        case CharacterCompareType::Inverse:
            inverse = !inverse;
            continue;
        case CharacterCompareType::TemporaryInverse:
            // Stays in effect until the next value entry has been consumed (see the end of the loop body).
            temporary_inverse = !temporary_inverse;
            continue;
        case CharacterCompareType::AnyChar:
            // Special case: if not inverted, AnyChar is always in the range.
            if (!current_inversion_state())
                return true;
            break;
        case CharacterCompareType::Char:
            if (!current_inversion_state()) {
                lhs_ranges.insert(pair.value, pair.value);
                lhs_has_ranges = true;
            } else {
                lhs_negated_ranges.insert(pair.value, pair.value);
                note_negated_entry(lhs_negated_entry_count, lhs_has_temporarily_negated_entry);
            }
            break;
        case CharacterCompareType::String:
            // FIXME: We just need to look at the last character of this string, but we only have the first character here.
            //        Just bail out to avoid false positives.
            return true;
        case CharacterCompareType::CharClass: {
            auto char_class = static_cast<CharClass>(pair.value);
            if (!current_inversion_state()) {
                if (!lhs_char_classes.contains(char_class)) {
                    lhs_char_classes.set(char_class);
                    ++lhs_char_class_count;
                }
            } else {
                lhs_negated_char_classes.set(char_class);
                note_negated_entry(lhs_negated_entry_count, lhs_has_temporarily_negated_entry);
            }
            break;
        }
        case CharacterCompareType::CharRange: {
            auto range = bit_cast<CharRange>(pair.value);
            if (!current_inversion_state()) {
                lhs_ranges.insert(range.from, range.to);
                lhs_has_ranges = true;
            } else {
                lhs_negated_ranges.insert(range.from, range.to);
                note_negated_entry(lhs_negated_entry_count, lhs_has_temporarily_negated_entry);
            }
            break;
        }
        case CharacterCompareType::LookupTable:
            // We've transformed this into a series of ranges in flat_compares(), so bail out if we see it.
            return true;
        case CharacterCompareType::Reference:
            // We've handled this before coming here.
            break;
        case CharacterCompareType::Property:
        case CharacterCompareType::GeneralCategory:
        case CharacterCompareType::Script:
        case CharacterCompareType::ScriptExtension:
            // FIXME: These are too difficult to handle, so bail out.
            return true;
        case CharacterCompareType::Undefined:
        case CharacterCompareType::RangeExpressionDummy:
            // These do not occur in valid bytecode.
            VERIFY_NOT_REACHED();
        }

        // A value entry has been consumed, so a pending TemporaryInverse no longer applies.
        temporary_inverse = false;
    }

    if constexpr (REGEX_DEBUG) {
        dbgln("lhs ranges:");
        for (auto it = lhs_ranges.begin(); it != lhs_ranges.end(); ++it)
            dbgln("  {}..{}", it.key(), *it);
        dbgln("lhs negated ranges:");
        for (auto it = lhs_negated_ranges.begin(); it != lhs_negated_ranges.end(); ++it)
            dbgln("  {}..{}", it.key(), *it);
    }

    // rhs is an independent sequence: it must not inherit the inversion state lhs ended with.
    inverse = false;
    temporary_inverse = false;

    bool const lhs_negation_is_exact = negation_is_exact(lhs_negated_entry_count, lhs_has_temporarily_negated_entry);

    // Handles an rhs Char/CharRange entry; returns true if it (possibly) overlaps with lhs.
    auto rhs_range_may_overlap = [&](u32 start, u32 end) -> bool {
        if (current_inversion_state()) {
            // Negated entries are judged together once all of rhs has been seen.
            rhs_negated_ranges.insert(start, end);
            note_negated_entry(rhs_negated_entry_count, rhs_has_temporarily_negated_entry);
            return false;
        }
        // We cannot tell which code points a character class contains here, so any class on the other side might match.
        if (lhs_char_class_count != 0)
            return true;
        if (any_range_intersects(lhs_ranges, start, end))
            return true;
        if (lhs_negated_entry_count == 0)
            return false;
        // lhs also accepts whatever lies outside its negated entries; this range is only safe if lhs explicitly rejects all of it.
        return !(lhs_negation_is_exact && some_range_covers(lhs_negated_ranges, start, end));
    };

    // Handles an rhs CharClass entry; returns true if it (possibly) overlaps with lhs.
    auto rhs_char_class_may_overlap = [&](CharClass value) -> bool {
        if (current_inversion_state()) {
            if (!rhs_negated_char_classes.contains(value)) {
                rhs_negated_char_classes.set(value);
                if (lhs_char_classes.contains(value))
                    ++lhs_char_classes_negated_by_rhs;
            }
            note_negated_entry(rhs_negated_entry_count, rhs_has_temporarily_negated_entry);
            return false;
        }
        if (lhs_char_classes.contains(value))
            return true;
        // lhs rejects exactly this class, and has nothing else that could accept one of its members.
        if (lhs_negated_char_classes.contains(value) && lhs_negation_is_exact && !lhs_has_ranges && lhs_char_class_count == 0)
            return false;
        // This char class might match something in the ranges we have, and checking that is far too expensive, so just bail out.
        return true;
    };

    for (auto const& pair : rhs) {
        dbgln_if(REGEX_DEBUG, "check {} ({})...", character_compare_type_name(pair.type), pair.value);

        switch (pair.type) {
        case CharacterCompareType::Inverse:
            inverse = !inverse;
            continue;
        case CharacterCompareType::TemporaryInverse:
            temporary_inverse = !temporary_inverse;
            continue;
        case CharacterCompareType::AnyChar:
            // Special case: if not inverted, AnyChar is always in the range.
            if (!current_inversion_state())
                return true;
            break;
        case CharacterCompareType::Char:
            if (rhs_range_may_overlap(static_cast<u32>(pair.value), static_cast<u32>(pair.value)))
                return true;
            break;
        case CharacterCompareType::String:
            // FIXME: We just need to look at the last character of this string, but we only have the first character here.
            //        Just bail out to avoid false positives.
            return true;
        case CharacterCompareType::CharClass:
            if (rhs_char_class_may_overlap(static_cast<CharClass>(pair.value)))
                return true;
            break;
        case CharacterCompareType::CharRange: {
            auto range = bit_cast<CharRange>(pair.value);
            if (rhs_range_may_overlap(range.from, range.to))
                return true;
            break;
        }
        case CharacterCompareType::LookupTable:
            // We've transformed this into a series of ranges in flat_compares(), so bail out if we see it.
            return true;
        case CharacterCompareType::Reference:
            // We've handled this before coming here.
            break;
        case CharacterCompareType::Property:
        case CharacterCompareType::GeneralCategory:
        case CharacterCompareType::Script:
        case CharacterCompareType::ScriptExtension:
            // FIXME: These are too difficult to handle, so bail out.
            return true;
        case CharacterCompareType::Undefined:
        case CharacterCompareType::RangeExpressionDummy:
            // These do not occur in valid bytecode.
            VERIFY_NOT_REACHED();
        }

        // A value entry has been consumed, so a pending TemporaryInverse no longer applies.
        temporary_inverse = false;
    }

    // None of the plain rhs entries overlap with lhs; what remains is the negated part of rhs (if any).
    if (rhs_negated_entry_count == 0)
        return false;

    // Both sides accept "everything except a few things", which practically always intersects.
    if (lhs_negated_entry_count != 0)
        return true;

    // lhs recorded nothing that could be matched.
    if (!lhs_has_ranges && lhs_char_class_count == 0)
        return false;

    // rhs accepts whatever lies outside its negated entries; the sequences are only disjoint if everything lhs
    // accepts is explicitly rejected by rhs.
    if (!negation_is_exact(rhs_negated_entry_count, rhs_has_temporarily_negated_entry))
        return true;

    if (lhs_char_classes_negated_by_rhs != lhs_char_class_count)
        return true;

    for (auto it = lhs_ranges.begin(); it != lhs_ranges.end(); ++it) {
        if (!some_range_covers(rhs_negated_ranges, it.key(), *it))
            return true;
    }

    return false;
}
enum class AtomicRewritePreconditionResult {
SatisfiedWithProperHeader,
SatisfiedWithEmptyHeader,
@ -179,17 +355,9 @@ static AtomicRewritePreconditionResult block_satisfies_atomic_rewrite_preconditi
}))
return AtomicRewritePreconditionResult::NotSatisfied;
for (auto& repeated_value : repeated_values) {
// FIXME: This is too naive!
if (any_of(repeated_value, [](auto& compare) { return compare.type == CharacterCompareType::AnyChar; }))
return AtomicRewritePreconditionResult::NotSatisfied;
if (any_of(repeated_values, [&](auto& repeated_value) { return has_overlap(compares, repeated_value); }))
return AtomicRewritePreconditionResult::NotSatisfied;
for (auto& repeated_compare : repeated_value) {
// FIXME: This is too naive! it will miss _tons_ of cases since it doesn't check ranges!
if (any_of(compares, [&](auto& compare) { return compare.type == repeated_compare.type && compare.value == repeated_compare.value; }))
return AtomicRewritePreconditionResult::NotSatisfied;
}
}
return AtomicRewritePreconditionResult::SatisfiedWithProperHeader;
}
case OpCodeId::CheckBegin: