mirror of
https://github.com/RGBCube/serenity
synced 2025-07-25 18:37:35 +00:00
LibRegex: Support property escapes of Unicode script extensions
This commit is contained in:
parent
5edd458420
commit
484ccfadc3
5 changed files with 51 additions and 7 deletions
|
@ -689,6 +689,12 @@ TEST_CASE(ECMA262_property_match)
|
||||||
{ "\\p{sc=Latin}", "a", true, ECMAScriptFlags::Unicode },
|
{ "\\p{sc=Latin}", "a", true, ECMAScriptFlags::Unicode },
|
||||||
{ "\\p{sc=Latin}", "A", true, ECMAScriptFlags::Unicode },
|
{ "\\p{sc=Latin}", "A", true, ECMAScriptFlags::Unicode },
|
||||||
{ "\\p{sc=Latin}", "9", false, ECMAScriptFlags::Unicode },
|
{ "\\p{sc=Latin}", "9", false, ECMAScriptFlags::Unicode },
|
||||||
|
{ "\\p{Script_Extensions=Deva}", "a", false, ECMAScriptFlags::Unicode },
|
||||||
|
{ "\\p{Script_Extensions=Beng}", "\xe1\xb3\x95", true, ECMAScriptFlags::Unicode }, // U+01CD5
|
||||||
|
{ "\\p{Script_Extensions=Deva}", "\xe1\xb3\x95", true, ECMAScriptFlags::Unicode }, // U+01CD5
|
||||||
|
{ "\\p{scx=Deva}", "a", false, ECMAScriptFlags::Unicode },
|
||||||
|
{ "\\p{scx=Beng}", "\xe1\xb3\x95", true, ECMAScriptFlags::Unicode }, // U+01CD5
|
||||||
|
{ "\\p{scx=Deva}", "\xe1\xb3\x95", true, ECMAScriptFlags::Unicode }, // U+01CD5
|
||||||
};
|
};
|
||||||
|
|
||||||
for (auto& test : tests) {
|
for (auto& test : tests) {
|
||||||
|
|
|
@ -573,6 +573,10 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
|
||||||
auto script = static_cast<Unicode::Script>(m_bytecode->at(offset++));
|
auto script = static_cast<Unicode::Script>(m_bytecode->at(offset++));
|
||||||
compare_script(input, state, script, current_inversion_state(), inverse_matched);
|
compare_script(input, state, script, current_inversion_state(), inverse_matched);
|
||||||
|
|
||||||
|
} else if (compare_type == CharacterCompareType::ScriptExtension) {
|
||||||
|
auto script = static_cast<Unicode::Script>(m_bytecode->at(offset++));
|
||||||
|
compare_script_extension(input, state, script, current_inversion_state(), inverse_matched);
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
warnln("Undefined comparison: {}", (int)compare_type);
|
warnln("Undefined comparison: {}", (int)compare_type);
|
||||||
VERIFY_NOT_REACHED();
|
VERIFY_NOT_REACHED();
|
||||||
|
@ -810,6 +814,22 @@ ALWAYS_INLINE void OpCode_Compare::compare_script(MatchInput const& input, Match
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Tests the code point at the current match position with
// Unicode::code_point_has_script_extension() for `script`.
//
// - At the end of the input, nothing happens.
// - If the code point has the script extension and `inverse` is set,
//   `inverse_matched` is set to true and the position is left untouched.
// - If the code point has the script extension and `inverse` is not set,
//   the string position is advanced past the code point.
// - Otherwise neither `state` nor `inverse_matched` is modified.
ALWAYS_INLINE void OpCode_Compare::compare_script_extension(MatchInput const& input, MatchState& state, Unicode::Script script, bool inverse, bool& inverse_matched)
{
    // Nothing left to compare against.
    if (state.string_position == input.view.length())
        return;

    u32 code_point = input.view[state.string_position_in_code_units];

    // A code point without the script extension leaves the state as it was.
    if (!Unicode::code_point_has_script_extension(code_point, script))
        return;

    if (inverse) {
        inverse_matched = true;
        return;
    }

    advance_string_position(state, input.view, code_point);
}
|
||||||
|
|
||||||
String const OpCode_Compare::arguments_string() const
|
String const OpCode_Compare::arguments_string() const
|
||||||
{
|
{
|
||||||
return String::formatted("argc={}, args={} ", arguments_count(), arguments_size());
|
return String::formatted("argc={}, args={} ", arguments_count(), arguments_size());
|
||||||
|
|
|
@ -69,6 +69,7 @@ enum class OpCodeId : ByteCodeValueType {
|
||||||
__ENUMERATE_CHARACTER_COMPARE_TYPE(Property) \
|
__ENUMERATE_CHARACTER_COMPARE_TYPE(Property) \
|
||||||
__ENUMERATE_CHARACTER_COMPARE_TYPE(GeneralCategory) \
|
__ENUMERATE_CHARACTER_COMPARE_TYPE(GeneralCategory) \
|
||||||
__ENUMERATE_CHARACTER_COMPARE_TYPE(Script) \
|
__ENUMERATE_CHARACTER_COMPARE_TYPE(Script) \
|
||||||
|
__ENUMERATE_CHARACTER_COMPARE_TYPE(ScriptExtension) \
|
||||||
__ENUMERATE_CHARACTER_COMPARE_TYPE(RangeExpressionDummy)
|
__ENUMERATE_CHARACTER_COMPARE_TYPE(RangeExpressionDummy)
|
||||||
|
|
||||||
enum class CharacterCompareType : ByteCodeValueType {
|
enum class CharacterCompareType : ByteCodeValueType {
|
||||||
|
@ -729,6 +730,7 @@ private:
|
||||||
ALWAYS_INLINE static void compare_property(MatchInput const& input, MatchState& state, Unicode::Property property, bool inverse, bool& inverse_matched);
|
ALWAYS_INLINE static void compare_property(MatchInput const& input, MatchState& state, Unicode::Property property, bool inverse, bool& inverse_matched);
|
||||||
ALWAYS_INLINE static void compare_general_category(MatchInput const& input, MatchState& state, Unicode::GeneralCategory general_category, bool inverse, bool& inverse_matched);
|
ALWAYS_INLINE static void compare_general_category(MatchInput const& input, MatchState& state, Unicode::GeneralCategory general_category, bool inverse, bool& inverse_matched);
|
||||||
ALWAYS_INLINE static void compare_script(MatchInput const& input, MatchState& state, Unicode::Script script, bool inverse, bool& inverse_matched);
|
ALWAYS_INLINE static void compare_script(MatchInput const& input, MatchState& state, Unicode::Script script, bool inverse, bool& inverse_matched);
|
||||||
|
ALWAYS_INLINE static void compare_script_extension(MatchInput const& input, MatchState& state, Unicode::Script script, bool inverse, bool& inverse_matched);
|
||||||
};
|
};
|
||||||
|
|
||||||
template<typename T>
|
template<typename T>
|
||||||
|
|
|
@ -1556,8 +1556,11 @@ bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_mini
|
||||||
[&](Unicode::GeneralCategory general_category) {
|
[&](Unicode::GeneralCategory general_category) {
|
||||||
compares.empend(CompareTypeAndValuePair { CharacterCompareType::GeneralCategory, (ByteCodeValueType)general_category });
|
compares.empend(CompareTypeAndValuePair { CharacterCompareType::GeneralCategory, (ByteCodeValueType)general_category });
|
||||||
},
|
},
|
||||||
[&](Unicode::Script script) {
|
[&](Script script) {
|
||||||
compares.empend(CompareTypeAndValuePair { CharacterCompareType::Script, (ByteCodeValueType)script });
|
if (script.is_extension)
|
||||||
|
compares.empend(CompareTypeAndValuePair { CharacterCompareType::ScriptExtension, (ByteCodeValueType)script.script });
|
||||||
|
else
|
||||||
|
compares.empend(CompareTypeAndValuePair { CharacterCompareType::Script, (ByteCodeValueType)script.script });
|
||||||
});
|
});
|
||||||
stack.insert_bytecode_compare_values(move(compares));
|
stack.insert_bytecode_compare_values(move(compares));
|
||||||
match_length_minimum += 1;
|
match_length_minimum += 1;
|
||||||
|
@ -1716,6 +1719,7 @@ struct CharClassRangeElement {
|
||||||
bool is_property { false };
|
bool is_property { false };
|
||||||
bool is_general_category { false };
|
bool is_general_category { false };
|
||||||
bool is_script { false };
|
bool is_script { false };
|
||||||
|
bool is_script_extension { false };
|
||||||
};
|
};
|
||||||
|
|
||||||
bool ECMA262Parser::parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>& ranges, bool unicode)
|
bool ECMA262Parser::parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>& ranges, bool unicode)
|
||||||
|
@ -1810,8 +1814,11 @@ bool ECMA262Parser::parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>&
|
||||||
[&](Unicode::GeneralCategory general_category) {
|
[&](Unicode::GeneralCategory general_category) {
|
||||||
return CharClassRangeElement { .general_category = general_category, .is_negated = negated, .is_character_class = true, .is_general_category = true };
|
return CharClassRangeElement { .general_category = general_category, .is_negated = negated, .is_character_class = true, .is_general_category = true };
|
||||||
},
|
},
|
||||||
[&](Unicode::Script script) {
|
[&](Script script) {
|
||||||
return CharClassRangeElement { .script = script, .is_negated = negated, .is_character_class = true, .is_script = true };
|
if (script.is_extension)
|
||||||
|
return CharClassRangeElement { .script = script.script, .is_negated = negated, .is_character_class = true, .is_script_extension = true };
|
||||||
|
else
|
||||||
|
return CharClassRangeElement { .script = script.script, .is_negated = negated, .is_character_class = true, .is_script = true };
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1861,6 +1868,8 @@ bool ECMA262Parser::parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>&
|
||||||
ranges.empend(CompareTypeAndValuePair { CharacterCompareType::GeneralCategory, (ByteCodeValueType)(atom.general_category) });
|
ranges.empend(CompareTypeAndValuePair { CharacterCompareType::GeneralCategory, (ByteCodeValueType)(atom.general_category) });
|
||||||
else if (atom.is_script)
|
else if (atom.is_script)
|
||||||
ranges.empend(CompareTypeAndValuePair { CharacterCompareType::Script, (ByteCodeValueType)(atom.script) });
|
ranges.empend(CompareTypeAndValuePair { CharacterCompareType::Script, (ByteCodeValueType)(atom.script) });
|
||||||
|
else if (atom.is_script_extension)
|
||||||
|
ranges.empend(CompareTypeAndValuePair { CharacterCompareType::ScriptExtension, (ByteCodeValueType)(atom.script) });
|
||||||
else
|
else
|
||||||
ranges.empend(CompareTypeAndValuePair { CharacterCompareType::CharClass, (ByteCodeValueType)atom.character_class });
|
ranges.empend(CompareTypeAndValuePair { CharacterCompareType::CharClass, (ByteCodeValueType)atom.character_class });
|
||||||
} else {
|
} else {
|
||||||
|
@ -1960,7 +1969,7 @@ bool ECMA262Parser::parse_unicode_property_escape(PropertyEscape& property, bool
|
||||||
return true;
|
return true;
|
||||||
},
|
},
|
||||||
[](Unicode::GeneralCategory) { return true; },
|
[](Unicode::GeneralCategory) { return true; },
|
||||||
[](Unicode::Script) { return true; });
|
[](Script) { return true; });
|
||||||
}
|
}
|
||||||
|
|
||||||
StringView ECMA262Parser::read_capture_group_specifier(bool take_starting_angle_bracket)
|
StringView ECMA262Parser::read_capture_group_specifier(bool take_starting_angle_bracket)
|
||||||
|
@ -2026,7 +2035,10 @@ Optional<ECMA262Parser::PropertyEscape> ECMA262Parser::read_unicode_property_esc
|
||||||
return { *general_category };
|
return { *general_category };
|
||||||
} else if ((property_type == "Script"sv) || (property_type == "sc"sv)) {
|
} else if ((property_type == "Script"sv) || (property_type == "sc"sv)) {
|
||||||
if (auto script = Unicode::script_from_string(property_name); script.has_value())
|
if (auto script = Unicode::script_from_string(property_name); script.has_value())
|
||||||
return { *script };
|
return Script { *script, false };
|
||||||
|
} else if ((property_type == "Script_Extensions"sv) || (property_type == "scx"sv)) {
|
||||||
|
if (auto script = Unicode::script_from_string(property_name); script.has_value())
|
||||||
|
return Script { *script, true };
|
||||||
}
|
}
|
||||||
|
|
||||||
return {};
|
return {};
|
||||||
|
|
|
@ -214,7 +214,11 @@ private:
|
||||||
Optional<unsigned> read_digits(ReadDigitsInitialZeroState initial_zero = ReadDigitsInitialZeroState::Allow, bool hex = false, int max_count = -1);
|
Optional<unsigned> read_digits(ReadDigitsInitialZeroState initial_zero = ReadDigitsInitialZeroState::Allow, bool hex = false, int max_count = -1);
|
||||||
StringView read_capture_group_specifier(bool take_starting_angle_bracket = false);
|
StringView read_capture_group_specifier(bool take_starting_angle_bracket = false);
|
||||||
|
|
||||||
using PropertyEscape = Variant<Unicode::Property, Unicode::GeneralCategory, Unicode::Script>;
|
struct Script {
|
||||||
|
Unicode::Script script {};
|
||||||
|
bool is_extension { false };
|
||||||
|
};
|
||||||
|
using PropertyEscape = Variant<Unicode::Property, Unicode::GeneralCategory, Script>;
|
||||||
Optional<PropertyEscape> read_unicode_property_escape();
|
Optional<PropertyEscape> read_unicode_property_escape();
|
||||||
|
|
||||||
bool parse_pattern(ByteCode&, size_t&, bool unicode, bool named);
|
bool parse_pattern(ByteCode&, size_t&, bool unicode, bool named);
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue