// Patch summary: adds a Unicode `Script` comparison to LibRegex's ECMA262 property escapes.
//  - Tests/LibRegex/Regex.cpp: six new ECMA262_property_match cases; \p{Script=Latin} and
//    \p{sc=Latin} are expected to match "a" and "A" and to not match "9" (Unicode flag set).
//  - RegexByteCode.h: new CharacterCompareType entry `Script` and a static `compare_script`
//    declaration on the class that already declares compare_property/compare_general_category.
//  - RegexByteCode.cpp: OpCode_Compare::execute gains a `Script` branch that reads one bytecode
//    value as the script and calls compare_script. compare_script returns immediately when
//    state.string_position equals input.view.length(); otherwise it reads a code point from
//    input.view, asks Unicode::code_point_has_script, and on a hit either sets inverse_matched
//    (when `inverse` is true) or calls advance_string_position.
//  - RegexParser.cpp / RegexParser.h: property types "Script" / "sc" are looked up with
//    Unicode::script_from_string; the result is emitted as CharacterCompareType::Script from
//    parse_atom_escape and from parse_nonempty_class_ranges (new `script` / `is_script` members
//    on CharClassRangeElement), and parse_unicode_property_escape accepts it unconditionally.
// NOTE(review): template argument lists look stripped in this copy of the patch (e.g.
//   `static_cast(`, `Vector&`, and the `-`/`+` `using PropertyEscape = Variant;` lines are
//   textually identical) - confirm against the original commit before applying.
// NOTE(review): compare_script bounds-checks state.string_position but indexes input.view with
//   state.string_position_in_code_units - confirm the two counters are meant to differ here.
diff --git a/Tests/LibRegex/Regex.cpp b/Tests/LibRegex/Regex.cpp index af7e55532d..e294002fe7 100644 --- a/Tests/LibRegex/Regex.cpp +++ b/Tests/LibRegex/Regex.cpp @@ -683,6 +683,12 @@ TEST_CASE(ECMA262_property_match) { "\\p{gc=Cased_Letter}", "a", true, ECMAScriptFlags::Unicode }, { "\\p{gc=Cased_Letter}", "A", true, ECMAScriptFlags::Unicode }, { "\\p{gc=Cased_Letter}", "9", false, ECMAScriptFlags::Unicode }, + { "\\p{Script=Latin}", "a", true, ECMAScriptFlags::Unicode }, + { "\\p{Script=Latin}", "A", true, ECMAScriptFlags::Unicode }, + { "\\p{Script=Latin}", "9", false, ECMAScriptFlags::Unicode }, + { "\\p{sc=Latin}", "a", true, ECMAScriptFlags::Unicode }, + { "\\p{sc=Latin}", "A", true, ECMAScriptFlags::Unicode }, + { "\\p{sc=Latin}", "9", false, ECMAScriptFlags::Unicode }, }; for (auto& test : tests) { diff --git a/Userland/Libraries/LibRegex/RegexByteCode.cpp b/Userland/Libraries/LibRegex/RegexByteCode.cpp index 5463aa1def..e9a6deab02 100644 --- a/Userland/Libraries/LibRegex/RegexByteCode.cpp +++ b/Userland/Libraries/LibRegex/RegexByteCode.cpp @@ -569,6 +569,10 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M auto general_category = static_cast(m_bytecode->at(offset++)); compare_general_category(input, state, general_category, current_inversion_state(), inverse_matched); + } else if (compare_type == CharacterCompareType::Script) { + auto script = static_cast(m_bytecode->at(offset++)); + compare_script(input, state, script, current_inversion_state(), inverse_matched); + } else { warnln("Undefined comparison: {}", (int)compare_type); VERIFY_NOT_REACHED(); @@ -790,6 +794,22 @@ ALWAYS_INLINE void OpCode_Compare::compare_general_category(MatchInput const& in } } +ALWAYS_INLINE void OpCode_Compare::compare_script(MatchInput const& input, MatchState& state, Unicode::Script script, bool inverse, bool& inverse_matched) +{ + if (state.string_position == input.view.length()) + return; + + u32 code_point = 
input.view[state.string_position_in_code_units]; + bool equal = Unicode::code_point_has_script(code_point, script); + + if (equal) { + if (inverse) + inverse_matched = true; + else + advance_string_position(state, input.view, code_point); + } +} + String const OpCode_Compare::arguments_string() const { return String::formatted("argc={}, args={} ", arguments_count(), arguments_size()); diff --git a/Userland/Libraries/LibRegex/RegexByteCode.h b/Userland/Libraries/LibRegex/RegexByteCode.h index ccf83e0c19..51c08bc84c 100644 --- a/Userland/Libraries/LibRegex/RegexByteCode.h +++ b/Userland/Libraries/LibRegex/RegexByteCode.h @@ -68,6 +68,7 @@ enum class OpCodeId : ByteCodeValueType { __ENUMERATE_CHARACTER_COMPARE_TYPE(NamedReference) \ __ENUMERATE_CHARACTER_COMPARE_TYPE(Property) \ __ENUMERATE_CHARACTER_COMPARE_TYPE(GeneralCategory) \ + __ENUMERATE_CHARACTER_COMPARE_TYPE(Script) \ __ENUMERATE_CHARACTER_COMPARE_TYPE(RangeExpressionDummy) enum class CharacterCompareType : ByteCodeValueType { @@ -727,6 +728,7 @@ private: ALWAYS_INLINE static void compare_character_range(MatchInput const& input, MatchState& state, u32 from, u32 to, u32 ch, bool inverse, bool& inverse_matched); ALWAYS_INLINE static void compare_property(MatchInput const& input, MatchState& state, Unicode::Property property, bool inverse, bool& inverse_matched); ALWAYS_INLINE static void compare_general_category(MatchInput const& input, MatchState& state, Unicode::GeneralCategory general_category, bool inverse, bool& inverse_matched); + ALWAYS_INLINE static void compare_script(MatchInput const& input, MatchState& state, Unicode::Script script, bool inverse, bool& inverse_matched); }; template diff --git a/Userland/Libraries/LibRegex/RegexParser.cpp b/Userland/Libraries/LibRegex/RegexParser.cpp index 3139afb80f..44c264c619 100644 --- a/Userland/Libraries/LibRegex/RegexParser.cpp +++ b/Userland/Libraries/LibRegex/RegexParser.cpp @@ -1555,6 +1555,9 @@ bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& 
match_length_mini }, [&](Unicode::GeneralCategory general_category) { compares.empend(CompareTypeAndValuePair { CharacterCompareType::GeneralCategory, (ByteCodeValueType)general_category }); + }, + [&](Unicode::Script script) { + compares.empend(CompareTypeAndValuePair { CharacterCompareType::Script, (ByteCodeValueType)script }); }); stack.insert_bytecode_compare_values(move(compares)); match_length_minimum += 1; @@ -1705,12 +1708,14 @@ struct CharClassRangeElement { u32 code_point { 0 }; Unicode::Property property; Unicode::GeneralCategory general_category; + Unicode::Script script; }; bool is_negated { false }; bool is_character_class { false }; bool is_property { false }; bool is_general_category { false }; + bool is_script { false }; }; bool ECMA262Parser::parse_nonempty_class_ranges(Vector& ranges, bool unicode) @@ -1804,6 +1809,9 @@ bool ECMA262Parser::parse_nonempty_class_ranges(Vector& }, [&](Unicode::GeneralCategory general_category) { return CharClassRangeElement { .general_category = general_category, .is_negated = negated, .is_character_class = true, .is_general_category = true }; + }, + [&](Unicode::Script script) { + return CharClassRangeElement { .script = script, .is_negated = negated, .is_character_class = true, .is_script = true }; }); } } @@ -1851,6 +1859,8 @@ bool ECMA262Parser::parse_nonempty_class_ranges(Vector& ranges.empend(CompareTypeAndValuePair { CharacterCompareType::Property, (ByteCodeValueType)(atom.property) }); else if (atom.is_general_category) ranges.empend(CompareTypeAndValuePair { CharacterCompareType::GeneralCategory, (ByteCodeValueType)(atom.general_category) }); + else if (atom.is_script) + ranges.empend(CompareTypeAndValuePair { CharacterCompareType::Script, (ByteCodeValueType)(atom.script) }); else ranges.empend(CompareTypeAndValuePair { CharacterCompareType::CharClass, (ByteCodeValueType)atom.character_class }); } else { @@ -1949,9 +1959,8 @@ bool ECMA262Parser::parse_unicode_property_escape(PropertyEscape& property, bool } 
return true; }, - [](Unicode::GeneralCategory) { - return true; - }); + [](Unicode::GeneralCategory) { return true; }, + [](Unicode::Script) { return true; }); } StringView ECMA262Parser::read_capture_group_specifier(bool take_starting_angle_bracket) @@ -2015,6 +2024,9 @@ Optional ECMA262Parser::read_unicode_property_esc } else if ((property_type == "General_Category"sv) || (property_type == "gc"sv)) { if (auto general_category = Unicode::general_category_from_string(property_name); general_category.has_value()) return { *general_category }; + } else if ((property_type == "Script"sv) || (property_type == "sc"sv)) { + if (auto script = Unicode::script_from_string(property_name); script.has_value()) + return { *script }; } return {}; diff --git a/Userland/Libraries/LibRegex/RegexParser.h b/Userland/Libraries/LibRegex/RegexParser.h index 4693bbe0cf..e07f36627f 100644 --- a/Userland/Libraries/LibRegex/RegexParser.h +++ b/Userland/Libraries/LibRegex/RegexParser.h @@ -214,7 +214,7 @@ private: Optional read_digits(ReadDigitsInitialZeroState initial_zero = ReadDigitsInitialZeroState::Allow, bool hex = false, int max_count = -1); StringView read_capture_group_specifier(bool take_starting_angle_bracket = false); - using PropertyEscape = Variant; + using PropertyEscape = Variant; Optional read_unicode_property_escape(); bool parse_pattern(ByteCode&, size_t&, bool unicode, bool named);