1
Fork 0
mirror of https://github.com/RGBCube/serenity synced 2025-07-25 23:37:35 +00:00

LibRegex: Partially implement the ECMAScript unicodeSets proposal

This skips the new string unicode properties additions, along with \q{}.
This commit is contained in:
Ali Mohammad Pur 2022-07-20 23:22:07 +04:30 committed by Linus Groh
parent 7734914909
commit 598dc74a76
9 changed files with 611 additions and 69 deletions

View file

@ -930,16 +930,17 @@ bool PosixExtendedParser::parse_root(ByteCode& stack, size_t& match_length_minim
bool ECMA262Parser::parse_internal(ByteCode& stack, size_t& match_length_minimum)
{
auto unicode = m_parser_state.regex_options.has_flag_set(AllFlags::Unicode);
if (unicode) {
return parse_pattern(stack, match_length_minimum, { .unicode = true, .named = true });
auto unicode_sets = m_parser_state.regex_options.has_flag_set(AllFlags::UnicodeSets);
if (unicode || unicode_sets) {
return parse_pattern(stack, match_length_minimum, { .unicode = true, .named = true, .unicode_sets = unicode_sets });
}
ByteCode new_stack;
size_t new_match_length = 0;
auto res = parse_pattern(new_stack, new_match_length, { .unicode = false, .named = false });
auto res = parse_pattern(new_stack, new_match_length, { .unicode = false, .named = false, .unicode_sets = false });
if (m_parser_state.named_capture_groups_count > 0) {
reset();
return parse_pattern(stack, match_length_minimum, { .unicode = false, .named = true });
return parse_pattern(stack, match_length_minimum, { .unicode = false, .named = true, .unicode_sets = false });
}
if (!res)
@ -1136,7 +1137,7 @@ bool ECMA262Parser::parse_quantifiable_assertion(ByteCode& stack, size_t&, Parse
size_t match_length_minimum = 0;
if (try_skip("="sv)) {
if (!parse_inner_disjunction(assertion_stack, match_length_minimum, { .unicode = false, .named = flags.named }))
if (!parse_inner_disjunction(assertion_stack, match_length_minimum, { .unicode = false, .named = flags.named, .unicode_sets = false }))
return false;
stack.insert_bytecode_lookaround(move(assertion_stack), ByteCode::LookAroundType::LookAhead);
@ -1149,7 +1150,7 @@ bool ECMA262Parser::parse_quantifiable_assertion(ByteCode& stack, size_t&, Parse
exit_capture_group_scope();
}
};
if (!parse_inner_disjunction(assertion_stack, match_length_minimum, { .unicode = false, .named = flags.named }))
if (!parse_inner_disjunction(assertion_stack, match_length_minimum, { .unicode = false, .named = flags.named, .unicode_sets = false }))
return false;
stack.insert_bytecode_lookaround(move(assertion_stack), ByteCode::LookAroundType::NegatedLookAhead);
@ -1756,6 +1757,7 @@ bool ECMA262Parser::parse_character_class(ByteCode& stack, size_t& match_length_
compares.empend(CompareTypeAndValuePair { CharacterCompareType::Inverse, 0 });
}
// ClassContents :: [empty]
if (match(TokenType::RightBracket)) {
consume();
// Should only have at most an 'Inverse'
@ -1764,7 +1766,12 @@ bool ECMA262Parser::parse_character_class(ByteCode& stack, size_t& match_length_
return true;
}
if (!parse_nonempty_class_ranges(compares, flags))
// ClassContents :: [~UnicodeSetsMode] NonemptyClassRanges[?UnicodeMode]
if (!flags.unicode_sets && !parse_nonempty_class_ranges(compares, flags))
return false;
// ClassContents :: [+UnicodeSetsMode] ClassSetExpression
if (flags.unicode_sets && !parse_class_set_expression(compares))
return false;
match_length_minimum += 1;
@ -2029,6 +2036,364 @@ bool ECMA262Parser::parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>&
return true;
}
bool ECMA262Parser::parse_class_set_expression(Vector<CompareTypeAndValuePair>& compares)
{
auto start_position = tell();
// ClassSetExpression :: ClassUnion | ClassIntersection | ClassSubtraction
if (parse_class_subtraction(compares)) {
consume(TokenType::RightBracket, Error::MismatchingBracket);
return true;
}
if (has_error())
return false;
back(tell() - start_position + 1);
if (parse_class_intersection(compares)) {
consume(TokenType::RightBracket, Error::MismatchingBracket);
return true;
}
if (has_error())
return false;
back(tell() - start_position + 1);
if (parse_class_union(compares)) {
consume(TokenType::RightBracket, Error::MismatchingBracket);
return true;
}
return false;
}
bool ECMA262Parser::parse_class_union(Vector<regex::CompareTypeAndValuePair>& compares)
{
auto start_position = tell();
ArmedScopeGuard restore_position { [&] { back(tell() - start_position + 1); } };
auto first = true;
// ClassUnion :: ClassSetRange ClassUnion[opt] | ClassSetOperand ClassUnion[opt]
for (;;) {
if (!parse_class_set_range(compares)) {
if (has_error() || match(TokenType::RightBracket))
break;
if (!parse_class_set_operand(compares)) {
if (first || has_error())
return false;
break;
}
}
first = false;
}
restore_position.disarm();
return !has_error();
}
bool ECMA262Parser::parse_class_intersection(Vector<CompareTypeAndValuePair>& compares)
{
// ClassIntersection :: ClassSetOperand "&&" [lookahead != "&"] ClassSetOperand
// | ClassIntersection "&&" [lookahead != "&"] ClassSetOperand
Vector<CompareTypeAndValuePair> lhs;
Vector<CompareTypeAndValuePair> rhs;
auto start_position = tell();
ArmedScopeGuard restore_position { [&] { back(tell() - start_position + 1); } };
if (!parse_class_set_operand(lhs))
return false;
if (!try_skip("&&"sv))
return false;
compares.append({ CharacterCompareType::And, 0 });
compares.extend(move(lhs));
do {
rhs.clear_with_capacity();
if (!parse_class_set_operand(rhs))
return false;
compares.extend(rhs);
if (try_skip("&&&"sv))
return false;
} while (!has_error() && try_skip("&&"sv));
compares.append({ CharacterCompareType::EndAndOr, 0 });
restore_position.disarm();
return true;
}
bool ECMA262Parser::parse_class_subtraction(Vector<CompareTypeAndValuePair>& compares)
{
// ClassSubtraction :: ClassSetOperand "--" ClassSetOperand | ClassSubtraction "--" ClassSetOperand
Vector<CompareTypeAndValuePair> lhs;
Vector<CompareTypeAndValuePair> rhs;
auto start_position = tell();
ArmedScopeGuard restore_position { [&] { back(tell() - start_position + 1); } };
if (!parse_class_set_operand(lhs))
return false;
if (!try_skip("--"sv))
return false;
compares.append({ CharacterCompareType::And, 0 });
compares.extend(move(lhs));
do {
rhs.clear_with_capacity();
if (!parse_class_set_operand(rhs))
return false;
compares.append({ CharacterCompareType::TemporaryInverse, 0 });
compares.extend(rhs);
} while (!has_error() && try_skip("--"sv));
compares.append({ CharacterCompareType::EndAndOr, 0 });
restore_position.disarm();
return true;
}
bool ECMA262Parser::parse_class_set_range(Vector<CompareTypeAndValuePair>& compares)
{
// ClassSetRange :: ClassSetCharacter "-" ClassSetCharacter
auto start_position = tell();
ArmedScopeGuard restore_position { [&] { back(tell() - start_position + 1); } };
auto lhs = parse_class_set_character();
if (!lhs.has_value())
return false;
if (!match(TokenType::HyphenMinus))
return false;
consume();
auto rhs = parse_class_set_character();
if (!rhs.has_value())
return false;
compares.append({
CharacterCompareType::CharRange,
CharRange { lhs.value(), rhs.value() },
});
restore_position.disarm();
return true;
}
Optional<u32> ECMA262Parser::parse_class_set_character()
{
// ClassSetCharacter :: [lookahead ∉ ClassSetReservedDoublePunctuator] SourceCharacter but not ClassSetSyntaxCharacter
// | "\" CharacterEscape[+UnicodeMode]
// | "\" ClassSetReservedPunctuator
// | "\" b
// ClassSetReservedDoublePunctuator :: one of "&&" "!!" "##" "$$" "%%" "**" "++" ",," ".." "::" ";;" "<<" "==" ">>" "??" "@@" "^^" "``" "~~"
// ClassSetSyntaxCharacter :: one of "(" ")" "{" "}" "[" "]" "/" "-" "\" "|"
// ClassSetReservedPunctuator :: one of "&" "-" "!" "#" "%" "," ":" ";" "<" "=" ">" "@" "`" "~"
constexpr auto class_set_reserved_double_punctuator = Array {
"&&"sv, "!!"sv, "##"sv, "$$"sv, "%%"sv, "**"sv, "++"sv, ",,"sv, ".."sv, "::"sv, ";;"sv, "<<"sv, "=="sv, ">>"sv, "??"sv, "@@"sv, "^^"sv, "``"sv, "~~"sv
};
auto start_position = tell();
ArmedScopeGuard restore { [&] { back(tell() - start_position + 1); } };
if (try_skip("\\"sv)) {
if (done()) {
set_error(Error::InvalidTrailingEscape);
return {};
}
// "\" ClassSetReservedPunctuator
for (auto const& reserved : class_set_reserved_double_punctuator) {
if (try_skip(reserved)) {
// "\" ClassSetReservedPunctuator (ClassSetReservedPunctuator)
back();
restore.disarm();
return reserved[0];
}
}
// "\" b
if (try_skip("b"sv)) {
restore.disarm();
return '\b';
}
// "\" CharacterEscape[+UnicodeMode]
Vector<CompareTypeAndValuePair> compares;
size_t minimum_length = 0;
if (parse_character_escape(compares, minimum_length, { .unicode = true })) {
VERIFY(compares.size() == 1);
auto& compare = compares.first();
VERIFY(compare.type == CharacterCompareType::Char);
restore.disarm();
return compare.value;
}
return {};
}
// [lookahead ∉ ClassSetReservedDoublePunctuator] SourceCharacter but not ClassSetSyntaxCharacter
auto lookahead_matches = any_of(class_set_reserved_double_punctuator, [this](auto& reserved) {
return try_skip(reserved);
});
if (lookahead_matches)
return {};
for (auto character : { "("sv, ")"sv, "{"sv, "}"sv, "["sv, "]"sv, "/"sv, "-"sv, "\\"sv, "|"sv }) {
if (try_skip(character))
return {};
}
restore.disarm();
return skip();
}
bool ECMA262Parser::parse_class_set_operand(Vector<regex::CompareTypeAndValuePair>& compares)
{
auto start_position = tell();
// ClassSetOperand :: ClassSetCharacter | ClassStringDisjunction | NestedClass
if (auto character = parse_class_set_character(); character.has_value()) {
compares.append({ CharacterCompareType::Char, character.value() });
return true;
}
// NestedClass :: "[" [lookahead != "^"] ClassContents[+UnicodeMode +UnicodeSetsMode] "]"
// | "[" "^" ClassContents[+UnicodeMode +UnicodeSetsMode] "]"
// | "\" CharacterClassEscape[+UnicodeMode]
if (parse_nested_class(compares))
return true;
if (has_error())
return false;
auto negated = false;
if (auto ch = parse_character_class_escape(negated, true); ch.has_value()) {
if (negated)
compares.append({ CharacterCompareType::TemporaryInverse, 1 });
compares.append({ CharacterCompareType::CharClass, (ByteCodeValueType)ch.value() });
return true;
}
PropertyEscape property {};
if (parse_unicode_property_escape(property, negated)) {
if (negated)
compares.empend(CompareTypeAndValuePair { CharacterCompareType::Inverse, 0 });
property.visit(
[&](Unicode::Property property) {
compares.empend(CompareTypeAndValuePair { CharacterCompareType::Property, (ByteCodeValueType)property });
},
[&](Unicode::GeneralCategory general_category) {
compares.empend(CompareTypeAndValuePair { CharacterCompareType::GeneralCategory, (ByteCodeValueType)general_category });
},
[&](Script script) {
if (script.is_extension)
compares.empend(CompareTypeAndValuePair { CharacterCompareType::ScriptExtension, (ByteCodeValueType)script.script });
else
compares.empend(CompareTypeAndValuePair { CharacterCompareType::Script, (ByteCodeValueType)script.script });
},
[](Empty&) { VERIFY_NOT_REACHED(); });
return true;
}
if (has_error())
return false;
// ClassStringDisjunction :: "\q{" ClassStringDisjunctionContents "}"
// ClassStringDisjunctionContents :: ClassString | ClassString "|" ClassStringDisjunctionContents
// ClassString :: [empty] | NonEmptyClassString
// NonEmptyClassString :: ClassCharacter NonEmptyClassString[opt]
if (try_skip("\\q{"sv)) {
// FIXME: Implement this :P
return set_error(Error::InvalidCharacterClass);
}
back(tell() - start_position + 1);
return false;
}
bool ECMA262Parser::parse_nested_class(Vector<regex::CompareTypeAndValuePair>& compares)
{
auto start_position = tell();
// NestedClass :: "[" [lookahead ≠ ^ ] ClassContents [+UnicodeMode, +UnicodeSetsMode] "]"
// | "[" "^" ClassContents[+UnicodeMode, +UnicodeSetsMode] "]"
// | "\" CharacterClassEscape[+UnicodeMode]
if (match(TokenType::LeftBracket)) {
consume();
compares.append(CompareTypeAndValuePair { CharacterCompareType::Or, 0 });
if (match(TokenType::Circumflex)) {
// Negated charclass
consume();
compares.empend(CompareTypeAndValuePair { CharacterCompareType::Inverse, 0 });
}
// ClassContents :: [empty]
if (match(TokenType::RightBracket)) {
consume();
// Should only have at most an 'Inverse' (after an 'Or')
VERIFY(compares.size() <= 2);
compares.append(CompareTypeAndValuePair { CharacterCompareType::EndAndOr, 0 });
return true;
}
// ClassContents :: [+UnicodeSetsMode] ClassSetExpression
if (!parse_class_set_expression(compares))
return false;
compares.append(CompareTypeAndValuePair { CharacterCompareType::EndAndOr, 0 });
return true;
}
if (try_skip("\\"sv)) {
auto negated = false;
if (auto char_class = parse_character_class_escape(negated); char_class.has_value()) {
if (negated)
compares.append({ CharacterCompareType::TemporaryInverse, 1 });
compares.append({ CharacterCompareType::CharClass, (ByteCodeValueType)char_class.value() });
return true;
}
PropertyEscape property {};
if (parse_unicode_property_escape(property, negated)) {
if (negated)
compares.empend(CompareTypeAndValuePair { CharacterCompareType::Inverse, 0 });
property.visit(
[&](Unicode::Property property) {
compares.empend(CompareTypeAndValuePair { CharacterCompareType::Property, (ByteCodeValueType)property });
},
[&](Unicode::GeneralCategory general_category) {
compares.empend(CompareTypeAndValuePair { CharacterCompareType::GeneralCategory, (ByteCodeValueType)general_category });
},
[&](Script script) {
if (script.is_extension)
compares.empend(CompareTypeAndValuePair { CharacterCompareType::ScriptExtension, (ByteCodeValueType)script.script });
else
compares.empend(CompareTypeAndValuePair { CharacterCompareType::Script, (ByteCodeValueType)script.script });
},
[](Empty&) { VERIFY_NOT_REACHED(); });
return true;
}
if (has_error())
return false;
}
back(tell() - start_position + 1);
return false;
}
bool ECMA262Parser::parse_unicode_property_escape(PropertyEscape& property, bool& negated)
{
negated = false;