LibRegex+LibUnicode: Begin implementing Unicode property escapes

This supports some binary property matching. It does not support any properties not yet parsed by LibUnicode, nor does it support value matching (such as Script_Extensions=Latin).
2025-09-14 15:38:00 +00:00 · 2021-07-29 14:18:51 -04:00 · 2021-07-29 14:18:51 -04:00 · d485cf29d7
commit d485cf29d7
parent f1dd770a8a
11 changed files with 230 additions and 33 deletions
--- a/Meta/Lagom/CMakeLists.txt
+++ b/Meta/Lagom/CMakeLists.txt
@ -319,6 +319,7 @@ if (BUILD_LAGOM)
    file(GLOB LIBREGEX_SOURCES CONFIGURE_DEPENDS "../../Userland/Libraries/LibRegex/*.cpp")
    lagom_lib(Regex regex
        SOURCES ${LIBREGEX_SOURCES} ${LIBREGEX_LIBC_SOURCES}
+        LIBS LagomUnicode
    )

    # Shell
--- a/Tests/LibRegex/Regex.cpp
+++ b/Tests/LibRegex/Regex.cpp
@ -515,6 +515,13 @@ TEST_CASE(ECMA262_parse)
        { "\\u{10ffff", regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
        { "\\u{10ffffx", regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
        { "\\u{110000}", regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
+        { "\\p", regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
+        { "\\p{", regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
+        { "\\p{}", regex::Error::InvalidNameForProperty, ECMAScriptFlags::Unicode },
+        { "\\p{AsCiI}", regex::Error::InvalidNameForProperty, ECMAScriptFlags::Unicode },
+        { "\\p{hello friends}", regex::Error::InvalidNameForProperty, ECMAScriptFlags::Unicode },
+        { "\\p{Prepended_Concatenation_Mark}", regex::Error::InvalidNameForProperty, ECMAScriptFlags::Unicode },
+        { "\\p{ASCII}", regex::Error::NoError, ECMAScriptFlags::Unicode },
    };

    for (auto& test : tests) {
@ -635,6 +642,47 @@ TEST_CASE(ECMA262_unicode_match)
    }
 }

+TEST_CASE(ECMA262_property_match)
+{
+    struct _test {
+        char const* pattern;
+        char const* subject;
+        bool matches { true };
+        ECMAScriptFlags options {};
+    };
+
+    constexpr _test tests[] {
+        { "\\p{ASCII}", "a", false },
+        { "\\p{ASCII}", "p{ASCII}", true },
+        { "\\p{ASCII}", "a", true, ECMAScriptFlags::Unicode },
+        { "\\p{ASCII}", "😀", false, ECMAScriptFlags::Unicode },
+        { "\\p{ASCII_Hex_Digit}", "1", true, ECMAScriptFlags::Unicode },
+        { "\\p{ASCII_Hex_Digit}", "a", true, ECMAScriptFlags::Unicode },
+        { "\\p{ASCII_Hex_Digit}", "x", false, ECMAScriptFlags::Unicode },
+        { "\\p{Any}", "\xcd\xb8", true, ECMAScriptFlags::Unicode },       // U+0378, which is an unassigned code point.
+        { "\\p{Assigned}", "\xcd\xb8", false, ECMAScriptFlags::Unicode }, // U+0378, which is an unassigned code point.
+    };
+
+    for (auto& test : tests) {
+        Regex<ECMA262> re(test.pattern, (ECMAScriptFlags)regex::AllFlags::Global | regex::ECMAScriptFlags::BrowserExtended | test.options);
+
+        auto subject = AK::utf8_to_utf16(test.subject);
+        Utf16View view { subject };
+
+        if constexpr (REGEX_DEBUG) {
+            dbgln("\n");
+            RegexDebug regex_dbg(stderr);
+            regex_dbg.print_raw_bytecode(re);
+            regex_dbg.print_header();
+            regex_dbg.print_bytecode(re);
+            dbgln("\n");
+        }
+
+        EXPECT_EQ(re.parser_result.error, Error::NoError);
+        EXPECT_EQ(re.match(view).success, test.matches);
+    }
+}
+
 TEST_CASE(replace)
 {
    struct _test {
--- a/Userland/Libraries/LibC/regex.h
+++ b/Userland/Libraries/LibC/regex.h
@ -37,6 +37,7 @@ enum __Regex_Error {
    __Regex_EmptySubExpression,         // Sub expression has empty content.
    __Regex_InvalidCaptureGroup,        // Content of capture group is invalid.
    __Regex_InvalidNameForCaptureGroup, // Name of capture group is invalid.
+    __Regex_InvalidNameForProperty,     // Name of property is invalid.
 };

 enum ReError {
--- a/Userland/Libraries/LibRegex/CMakeLists.txt
+++ b/Userland/Libraries/LibRegex/CMakeLists.txt
@ -7,4 +7,4 @@ set(SOURCES
 )

 serenity_lib(LibRegex regex)
-target_link_libraries(LibRegex LibC LibCore)
+target_link_libraries(LibRegex LibC LibCore LibUnicode)
--- a/Userland/Libraries/LibRegex/RegexByteCode.cpp
+++ b/Userland/Libraries/LibRegex/RegexByteCode.cpp
@ -9,6 +9,7 @@
 #include "RegexDebug.h"
 #include <AK/CharacterTypes.h>
 #include <AK/Debug.h>
+#include <LibUnicode/CharacterTypes.h>

 namespace regex {

@ -532,6 +533,10 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
            if (!compare_string(input, state, str, had_zero_length_match))
                return ExecutionResult::Failed_ExecuteLowPrioForks;

+        } else if (compare_type == CharacterCompareType::Property) {
+            auto property = static_cast<Unicode::Property>(m_bytecode->at(offset++));
+            compare_property(input, state, property, current_inversion_state(), inverse_matched);
+
        } else {
            warnln("Undefined comparison: {}", (int)compare_type);
            VERIFY_NOT_REACHED();
@ -721,6 +726,22 @@ ALWAYS_INLINE void OpCode_Compare::compare_character_range(MatchInput const& inp
    }
 }

+ALWAYS_INLINE void OpCode_Compare::compare_property(MatchInput const& input, MatchState& state, Unicode::Property property, bool inverse, bool& inverse_matched)
+{
+    if (state.string_position == input.view.length())
+        return;
+
+    u32 code_point = input.view[state.string_position];
+    bool equal = Unicode::code_point_has_property(code_point, property);
+
+    if (equal) {
+        if (inverse)
+            inverse_matched = true;
+        else
+            ++state.string_position;
+    }
+}
+
 String const OpCode_Compare::arguments_string() const
 {
    return String::formatted("argc={}, args={} ", arguments_count(), arguments_size());
--- a/Userland/Libraries/LibRegex/RegexByteCode.h
+++ b/Userland/Libraries/LibRegex/RegexByteCode.h
@ -18,6 +18,7 @@
 #include <AK/TypeCasts.h>
 #include <AK/Types.h>
 #include <AK/Vector.h>
+#include <LibUnicode/Forward.h>

 namespace regex {

@ -65,6 +66,7 @@ enum class OpCodeId : ByteCodeValueType {
    __ENUMERATE_CHARACTER_COMPARE_TYPE(CharRange)        \
    __ENUMERATE_CHARACTER_COMPARE_TYPE(Reference)        \
    __ENUMERATE_CHARACTER_COMPARE_TYPE(NamedReference)   \
+    __ENUMERATE_CHARACTER_COMPARE_TYPE(Property)         \
    __ENUMERATE_CHARACTER_COMPARE_TYPE(RangeExpressionDummy)

 enum class CharacterCompareType : ByteCodeValueType {
@ -722,6 +724,7 @@ private:
    ALWAYS_INLINE static bool compare_string(MatchInput const& input, MatchState& state, RegexStringView const& str, bool& had_zero_length_match);
    ALWAYS_INLINE static void compare_character_class(MatchInput const& input, MatchState& state, CharClass character_class, u32 ch, bool inverse, bool& inverse_matched);
    ALWAYS_INLINE static void compare_character_range(MatchInput const& input, MatchState& state, u32 from, u32 to, u32 ch, bool inverse, bool& inverse_matched);
+    ALWAYS_INLINE static void compare_property(MatchInput const& input, MatchState& state, Unicode::Property property, bool inverse, bool& inverse_matched);
 };

 template<typename T>
--- a/Userland/Libraries/LibRegex/RegexError.h
+++ b/Userland/Libraries/LibRegex/RegexError.h
@ -34,6 +34,7 @@ enum class Error : u8 {
    EmptySubExpression = __Regex_EmptySubExpression,                 // Sub expression has empty content.
    InvalidCaptureGroup = __Regex_InvalidCaptureGroup,               // Content of capture group is invalid.
    InvalidNameForCaptureGroup = __Regex_InvalidNameForCaptureGroup, // Name of capture group is invalid.
+    InvalidNameForProperty = __Regex_InvalidNameForProperty,         // Name of property is invalid.
 };

 inline String get_error_string(Error error)
@ -73,6 +74,8 @@ inline String get_error_string(Error error)
        return "Content of capture group is invalid.";
    case Error::InvalidNameForCaptureGroup:
        return "Name of capture group is invalid.";
+    case Error::InvalidNameForProperty:
+        return "Name of property is invalid.";
    }
    return "Undefined error.";
 }
--- a/Userland/Libraries/LibRegex/RegexParser.cpp
+++ b/Userland/Libraries/LibRegex/RegexParser.cpp
@ -12,6 +12,7 @@
 #include <AK/StringBuilder.h>
 #include <AK/StringUtils.h>
 #include <AK/Utf16View.h>
+#include <LibUnicode/CharacterTypes.h>

 namespace regex {

@ -1238,12 +1239,12 @@ bool ECMA262Parser::parse_atom(ByteCode& stack, size_t& match_length_minimum, bo

    if (match(TokenType::LeftBracket)) {
        // Character class.
-        return parse_character_class(stack, match_length_minimum, unicode && !m_should_use_browser_extended_grammar, named);
+        return parse_character_class(stack, match_length_minimum, unicode, named);
    }

    if (match(TokenType::LeftParen)) {
        // Non-capturing group, or a capture group.
-        return parse_capture_group(stack, match_length_minimum, unicode && !m_should_use_browser_extended_grammar, named);
+        return parse_capture_group(stack, match_length_minimum, unicode, named);
    }

    if (match(TokenType::Period)) {
@ -1541,13 +1542,14 @@ bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_mini
    }

    if (unicode) {
-        if (try_skip("p{")) {
-            // FIXME: Implement this path, Unicode property match.
-            TODO();
-        }
-        if (try_skip("P{")) {
-            // FIXME: Implement this path, Unicode property match.
-            TODO();
+        Unicode::Property property {};
+        bool negated = false;
+
+        if (parse_unicode_property_escape(property, negated)) {
+            if (negated)
+                stack.insert_bytecode_compare_values({ { CharacterCompareType::Inverse, 0 } });
+            stack.insert_bytecode_compare_values({ { CharacterCompareType::Property, (ByteCodeValueType)(property) } });
+            return true;
        }
    }

@ -1692,10 +1694,12 @@ struct CharClassRangeElement {
    union {
        CharClass character_class;
        u32 code_point { 0 };
+        Unicode::Property property;
    };

    bool is_negated { false };
    bool is_character_class { false };
+    bool is_property_escape { false };
 };

 bool ECMA262Parser::parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>& ranges, bool unicode)
@ -1779,11 +1783,11 @@ bool ECMA262Parser::parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>&
            if (unicode) {
                if (try_skip("-"))
                    return { CharClassRangeElement { .code_point = '-', .is_character_class = false } };
-            }

-            if (try_skip("p{") || try_skip("P{")) {
-                // FIXME: Implement these; unicode properties.
-                TODO();
+                Unicode::Property property {};
+                bool negated = false;
+                if (parse_unicode_property_escape(property, negated))
+                    return { CharClassRangeElement { .property = property, .is_negated = negated, .is_character_class = true, .is_property_escape = true } };
            }

            if (try_skip("d"))
@ -1820,6 +1824,20 @@ bool ECMA262Parser::parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>&
        return read_class_atom_no_dash();
    };

+    auto empend_atom = [&](auto& atom) {
+        if (atom.is_character_class) {
+            if (atom.is_negated)
+                ranges.empend(CompareTypeAndValuePair { CharacterCompareType::TemporaryInverse, 0 });
+            if (atom.is_property_escape)
+                ranges.empend(CompareTypeAndValuePair { CharacterCompareType::Property, (ByteCodeValueType)(atom.property) });
+            else
+                ranges.empend(CompareTypeAndValuePair { CharacterCompareType::CharClass, (ByteCodeValueType)atom.character_class });
+        } else {
+            VERIFY(!atom.is_negated);
+            ranges.empend(CompareTypeAndValuePair { CharacterCompareType::Char, atom.code_point });
+        }
+    };
+
    while (!match(TokenType::RightBracket)) {
        if (match(TokenType::Eof)) {
            set_error(Error::MismatchingBracket);
@ -1848,18 +1866,11 @@ bool ECMA262Parser::parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>&
                        set_error(Error::InvalidRange);
                        return false;
                    }
+
                    // CharacterRangeOrUnion > !Unicode > CharClass
-                    if (first_atom->is_character_class)
-                        ranges.empend(CompareTypeAndValuePair { CharacterCompareType::CharClass, (ByteCodeValueType)first_atom->character_class });
-                    else
-                        ranges.empend(CompareTypeAndValuePair { CharacterCompareType::Char, (ByteCodeValueType)first_atom->code_point });
-
+                    empend_atom(*first_atom);
                    ranges.empend(CompareTypeAndValuePair { CharacterCompareType::Char, (ByteCodeValueType)'-' });
-
-                    if (second_atom->is_character_class)
-                        ranges.empend(CompareTypeAndValuePair { CharacterCompareType::CharClass, (ByteCodeValueType)second_atom->character_class });
-                    else
-                        ranges.empend(CompareTypeAndValuePair { CharacterCompareType::Char, (ByteCodeValueType)second_atom->code_point });
+                    empend_atom(*second_atom);
                    continue;
                } else {
                    set_error(Error::InvalidRange);
@ -1882,15 +1893,7 @@ bool ECMA262Parser::parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>&
    read_as_single_atom:;

        auto atom = first_atom.value();
-
-        if (atom.is_character_class) {
-            if (atom.is_negated)
-                ranges.empend(CompareTypeAndValuePair { CharacterCompareType::TemporaryInverse, 0 });
-            ranges.empend(CompareTypeAndValuePair { CharacterCompareType::CharClass, (ByteCodeValueType)first_atom.value().character_class });
-        } else {
-            VERIFY(!atom.is_negated);
-            ranges.empend(CompareTypeAndValuePair { CharacterCompareType::Char, first_atom.value().code_point });
-        }
+        empend_atom(atom);
    }

    consume(TokenType::RightBracket, Error::MismatchingBracket);
@ -1898,6 +1901,32 @@ bool ECMA262Parser::parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>&
    return true;
 }

+bool ECMA262Parser::parse_unicode_property_escape(Unicode::Property& property, bool& negated)
+{
+    negated = false;
+
+    if (try_skip("p"))
+        negated = false;
+    else if (try_skip("P"))
+        negated = true;
+    else
+        return false;
+
+    auto parsed_property = read_unicode_property_escape();
+    if (!parsed_property.has_value()) {
+        set_error(Error::InvalidNameForProperty);
+        return false;
+    }
+
+    if (!Unicode::is_ecma262_property(*parsed_property)) {
+        set_error(Error::InvalidNameForProperty);
+        return false;
+    }
+
+    property = *parsed_property;
+    return true;
+}
+
 StringView ECMA262Parser::read_capture_group_specifier(bool take_starting_angle_bracket)
 {
    if (take_starting_angle_bracket && !consume("<"))
@ -1919,6 +1948,24 @@ StringView ECMA262Parser::read_capture_group_specifier(bool take_starting_angle_
    return name;
 }

+Optional<Unicode::Property> ECMA262Parser::read_unicode_property_escape()
+{
+    consume(TokenType::LeftCurly, Error::InvalidPattern);
+
+    auto start_token = m_parser_state.current_token;
+    size_t offset = 0;
+    while (match(TokenType::Char)) {
+        if (m_parser_state.current_token.value() == "}")
+            break;
+        offset += consume().value().length();
+    }
+
+    consume(TokenType::RightCurly, Error::InvalidPattern);
+
+    StringView property_name { start_token.value().characters_without_null_termination(), offset };
+    return Unicode::property_from_string(property_name);
+}
+
 bool ECMA262Parser::parse_capture_group(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool named)
 {
    consume(TokenType::LeftParen, Error::InvalidPattern);
--- a/Userland/Libraries/LibRegex/RegexParser.h
+++ b/Userland/Libraries/LibRegex/RegexParser.h
@ -15,6 +15,7 @@
 #include <AK/StringBuilder.h>
 #include <AK/Types.h>
 #include <AK/Vector.h>
+#include <LibUnicode/Forward.h>

 namespace regex {

@ -212,6 +213,7 @@ private:
    StringView read_digits_as_string(ReadDigitsInitialZeroState initial_zero = ReadDigitsInitialZeroState::Allow, bool hex = false, int max_count = -1);
    Optional<unsigned> read_digits(ReadDigitsInitialZeroState initial_zero = ReadDigitsInitialZeroState::Allow, bool hex = false, int max_count = -1);
    StringView read_capture_group_specifier(bool take_starting_angle_bracket = false);
+    Optional<Unicode::Property> read_unicode_property_escape();

    bool parse_pattern(ByteCode&, size_t&, bool unicode, bool named);
    bool parse_disjunction(ByteCode&, size_t&, bool unicode, bool named);
@ -225,6 +227,7 @@ private:
    bool parse_capture_group(ByteCode&, size_t&, bool unicode, bool named);
    Optional<CharClass> parse_character_class_escape(bool& out_inverse, bool expect_backslash = false);
    bool parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>&, bool unicode);
+    bool parse_unicode_property_escape(Unicode::Property& property, bool& negated);

    // Used only by B.1.4, Regular Expression Patterns (Extended for use in browsers)
    bool parse_quantifiable_assertion(ByteCode&, size_t&, bool named);
--- a/Userland/Libraries/LibUnicode/CharacterTypes.cpp
+++ b/Userland/Libraries/LibUnicode/CharacterTypes.cpp
@ -222,4 +222,73 @@ bool code_point_has_property([[maybe_unused]] u32 code_point, [[maybe_unused]] P
 #endif
 }

+bool is_ecma262_property([[maybe_unused]] Property property)
+{
+#if ENABLE_UNICODE_DATA
+    // ECMA-262 only allows a subset of Unicode properties: https://tc39.es/ecma262/#table-binary-unicode-properties
+    // Note: Some of the properties in the above link are not yet parsed by the LibUnicode generator. They are left
+    //       commented out here until they are parsed and can be used.
+    switch (property) {
+    case Unicode::Property::ASCII:
+    case Unicode::Property::ASCII_Hex_Digit:
+    case Unicode::Property::Alphabetic:
+    case Unicode::Property::Any:
+    case Unicode::Property::Assigned:
+    case Unicode::Property::Bidi_Control:
+    // case Unicode::Property::Bidi_Mirrored:
+    case Unicode::Property::Case_Ignorable:
+    case Unicode::Property::Cased:
+    case Unicode::Property::Changes_When_Casefolded:
+    case Unicode::Property::Changes_When_Casemapped:
+    case Unicode::Property::Changes_When_Lowercased:
+    // case Unicode::Property::Changes_When_NFKC_Casefolded:
+    case Unicode::Property::Changes_When_Titlecased:
+    case Unicode::Property::Changes_When_Uppercased:
+    case Unicode::Property::Dash:
+    case Unicode::Property::Default_Ignorable_Code_Point:
+    case Unicode::Property::Deprecated:
+    case Unicode::Property::Diacritic:
+    // case Unicode::Property::Emoji:
+    // case Unicode::Property::Emoji_Component:
+    // case Unicode::Property::Emoji_Modifier:
+    // case Unicode::Property::Emoji_Modifier_Base:
+    // case Unicode::Property::Emoji_Presentation:
+    // case Unicode::Property::Extended_Pictographic:
+    case Unicode::Property::Extender:
+    case Unicode::Property::Grapheme_Base:
+    case Unicode::Property::Grapheme_Extend:
+    case Unicode::Property::Hex_Digit:
+    case Unicode::Property::IDS_Binary_Operator:
+    case Unicode::Property::IDS_Trinary_Operator:
+    case Unicode::Property::ID_Continue:
+    case Unicode::Property::ID_Start:
+    case Unicode::Property::Ideographic:
+    case Unicode::Property::Join_Control:
+    case Unicode::Property::Logical_Order_Exception:
+    case Unicode::Property::Lowercase:
+    case Unicode::Property::Math:
+    case Unicode::Property::Noncharacter_Code_Point:
+    case Unicode::Property::Pattern_Syntax:
+    case Unicode::Property::Pattern_White_Space:
+    case Unicode::Property::Quotation_Mark:
+    case Unicode::Property::Radical:
+    case Unicode::Property::Regional_Indicator:
+    case Unicode::Property::Sentence_Terminal:
+    case Unicode::Property::Soft_Dotted:
+    case Unicode::Property::Terminal_Punctuation:
+    case Unicode::Property::Unified_Ideograph:
+    case Unicode::Property::Uppercase:
+    case Unicode::Property::Variation_Selector:
+    case Unicode::Property::White_Space:
+    case Unicode::Property::XID_Continue:
+    case Unicode::Property::XID_Start:
+        return true;
+    default:
+        return false;
+    }
+#else
+    return false;
+#endif
+}
+
 }
--- a/Userland/Libraries/LibUnicode/CharacterTypes.h
+++ b/Userland/Libraries/LibUnicode/CharacterTypes.h
@ -23,5 +23,6 @@ String to_unicode_uppercase_full(StringView const&);

 Optional<Property> property_from_string(StringView const&);
 bool code_point_has_property(u32 code_point, Property property);
+bool is_ecma262_property(Property);

 }