From 06573cd46d50ed356681290a092187cbc9c85cae Mon Sep 17 00:00:00 2001 From: Ali Mohammad Pur Date: Fri, 14 Jul 2023 08:13:59 +0330 Subject: [PATCH] LibRegex: Enable the atomic rewrite optimisation for unicode properties --- .../Libraries/LibRegex/RegexOptimizer.cpp | 102 ++++++++++++++++++ 1 file changed, 102 insertions(+) diff --git a/Userland/Libraries/LibRegex/RegexOptimizer.cpp b/Userland/Libraries/LibRegex/RegexOptimizer.cpp index 282f267095..d3e3689a93 100644 --- a/Userland/Libraries/LibRegex/RegexOptimizer.cpp +++ b/Userland/Libraries/LibRegex/RegexOptimizer.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #if REGEX_DEBUG # include # include @@ -124,6 +125,37 @@ static bool has_overlap(Vector const& lhs, Vector lhs_char_classes; HashTable lhs_negated_char_classes; + auto has_any_unicode_property = false; + HashTable lhs_unicode_general_categories; + HashTable lhs_unicode_properties; + HashTable lhs_unicode_scripts; + HashTable lhs_unicode_script_extensions; + HashTable lhs_negated_unicode_general_categories; + HashTable lhs_negated_unicode_properties; + HashTable lhs_negated_unicode_scripts; + HashTable lhs_negated_unicode_script_extensions; + + auto any_unicode_property_matches = [&](u32 code_point) { + if (any_of(lhs_negated_unicode_general_categories, [code_point](auto category) { return Unicode::code_point_has_general_category(code_point, category); })) + return false; + if (any_of(lhs_negated_unicode_properties, [code_point](auto property) { return Unicode::code_point_has_property(code_point, property); })) + return false; + if (any_of(lhs_negated_unicode_scripts, [code_point](auto script) { return Unicode::code_point_has_script(code_point, script); })) + return false; + if (any_of(lhs_negated_unicode_script_extensions, [code_point](auto script) { return Unicode::code_point_has_script_extension(code_point, script); })) + return false; + + if (any_of(lhs_unicode_general_categories, [code_point](auto category) { return Unicode::code_point_has_general_category(code_point, category); })) + return true; + if (any_of(lhs_unicode_properties, [code_point](auto property) { return Unicode::code_point_has_property(code_point, property); })) + return true; + if (any_of(lhs_unicode_scripts, [code_point](auto script) { return Unicode::code_point_has_script(code_point, script); })) + return true; + if (any_of(lhs_unicode_script_extensions, [code_point](auto script) { return Unicode::code_point_has_script_extension(code_point, script); })) + return true; + return false; + }; + auto range_contains = [&](T& value) -> bool { u32 start; u32 end; @@ -136,6 +168,12 @@ static bool has_overlap(Vector const& lhs, Vector const& lhs, Vector(pair.value)); + else + lhs_negated_unicode_properties.set(static_cast(pair.value)); + break; case CharacterCompareType::GeneralCategory: + has_any_unicode_property = true; + if (!current_lhs_inversion_state()) + lhs_unicode_general_categories.set(static_cast(pair.value)); + else + lhs_negated_unicode_general_categories.set(static_cast(pair.value)); + break; case CharacterCompareType::Script: + has_any_unicode_property = true; + if (!current_lhs_inversion_state()) + lhs_unicode_scripts.set(static_cast(pair.value)); + else + lhs_negated_unicode_scripts.set(static_cast(pair.value)); + break; case CharacterCompareType::ScriptExtension: + has_any_unicode_property = true; + if (!current_lhs_inversion_state()) + lhs_unicode_script_extensions.set(static_cast(pair.value)); + else + lhs_negated_unicode_script_extensions.set(static_cast(pair.value)); + break; case CharacterCompareType::And: case CharacterCompareType::Or: case CharacterCompareType::EndAndOr: @@ -275,9 +337,49 @@ static bool has_overlap(Vector const& lhs, Vector(pair.value))) + return true; + if (false == (current_lhs_inversion_state() ^ lhs_negated_unicode_properties.contains(static_cast(pair.value)))) + return true; + } + break; case CharacterCompareType::GeneralCategory: + if (!lhs_ranges.is_empty() || !lhs_negated_ranges.is_empty() || !lhs_char_classes.is_empty() || !lhs_negated_char_classes.is_empty()) + return true; + if (has_any_unicode_property && !lhs_unicode_general_categories.is_empty() && !lhs_negated_unicode_general_categories.is_empty()) { + if (current_lhs_inversion_state() ^ lhs_unicode_general_categories.contains(static_cast(pair.value))) + return true; + if (false == (current_lhs_inversion_state() ^ lhs_negated_unicode_general_categories.contains(static_cast(pair.value)))) + return true; + } + break; case CharacterCompareType::Script: + if (!lhs_ranges.is_empty() || !lhs_negated_ranges.is_empty() || !lhs_char_classes.is_empty() || !lhs_negated_char_classes.is_empty()) + return true; + if (has_any_unicode_property && !lhs_unicode_scripts.is_empty() && !lhs_negated_unicode_scripts.is_empty()) { + if (current_lhs_inversion_state() ^ lhs_unicode_scripts.contains(static_cast(pair.value))) + return true; + if (false == (current_lhs_inversion_state() ^ lhs_negated_unicode_scripts.contains(static_cast(pair.value)))) + return true; + } + break; case CharacterCompareType::ScriptExtension: + if (!lhs_ranges.is_empty() || !lhs_negated_ranges.is_empty() || !lhs_char_classes.is_empty() || !lhs_negated_char_classes.is_empty()) + return true; + if (has_any_unicode_property && !lhs_unicode_script_extensions.is_empty() && !lhs_negated_unicode_script_extensions.is_empty()) { + if (current_lhs_inversion_state() ^ lhs_unicode_script_extensions.contains(static_cast(pair.value))) + return true; + if (false == (current_lhs_inversion_state() ^ lhs_negated_unicode_script_extensions.contains(static_cast(pair.value)))) + return true; + } + break; case CharacterCompareType::And: case CharacterCompareType::Or: case CharacterCompareType::EndAndOr: