1
Fork 0
mirror of https://github.com/RGBCube/serenity synced 2025-05-31 10:48:11 +00:00

LibRegex+LibUnicode: Begin implementing Unicode property escapes

This supports some binary property matching. It does not support any
properties not yet parsed by LibUnicode, nor does it support value
matching (such as Script_Extensions=Latin).
This commit is contained in:
Timothy Flynn 2021-07-29 14:18:51 -04:00 committed by Linus Groh
parent f1dd770a8a
commit d485cf29d7
11 changed files with 230 additions and 33 deletions

View file

@@ -515,6 +515,13 @@ TEST_CASE(ECMA262_parse)
{ "\\u{10ffff", regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
{ "\\u{10ffffx", regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
{ "\\u{110000}", regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
{ "\\p", regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
{ "\\p{", regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
{ "\\p{}", regex::Error::InvalidNameForProperty, ECMAScriptFlags::Unicode },
{ "\\p{AsCiI}", regex::Error::InvalidNameForProperty, ECMAScriptFlags::Unicode },
{ "\\p{hello friends}", regex::Error::InvalidNameForProperty, ECMAScriptFlags::Unicode },
{ "\\p{Prepended_Concatenation_Mark}", regex::Error::InvalidNameForProperty, ECMAScriptFlags::Unicode },
{ "\\p{ASCII}", regex::Error::NoError, ECMAScriptFlags::Unicode },
};
for (auto& test : tests) {
@@ -635,6 +642,47 @@ TEST_CASE(ECMA262_unicode_match)
}
}
// Exercises \p{...} property escapes: each table row compiles a pattern, matches it
// against a UTF-16 copy of the subject, and compares the outcome with the expectation.
TEST_CASE(ECMA262_property_match)
{
    // One expectation per row. `matches` and `options` fall back to their defaults
    // when a row leaves them out.
    struct PropertyCase {
        char const* pattern;
        char const* subject;
        bool matches { true };
        ECMAScriptFlags options {};
    };

    constexpr PropertyCase cases[] {
        // Rows without the Unicode flag: the expectation is that only the literal
        // text "p{ASCII}" matches, not an arbitrary ASCII character.
        { "\\p{ASCII}", "a", false },
        { "\\p{ASCII}", "p{ASCII}", true },

        // Rows with the Unicode flag: the escape is expected to act as a property test.
        { "\\p{ASCII}", "a", true, ECMAScriptFlags::Unicode },
        { "\\p{ASCII}", "😀", false, ECMAScriptFlags::Unicode },
        { "\\p{ASCII_Hex_Digit}", "1", true, ECMAScriptFlags::Unicode },
        { "\\p{ASCII_Hex_Digit}", "a", true, ECMAScriptFlags::Unicode },
        { "\\p{ASCII_Hex_Digit}", "x", false, ECMAScriptFlags::Unicode },

        // "\xcd\xb8" is the UTF-8 encoding of U+0378, an unassigned code point.
        { "\\p{Any}", "\xcd\xb8", true, ECMAScriptFlags::Unicode },
        { "\\p{Assigned}", "\xcd\xb8", false, ECMAScriptFlags::Unicode },
    };

    for (auto const& test : cases) {
        // Every row is compiled with Global and BrowserExtended, plus any per-row flags.
        auto const flags = static_cast<ECMAScriptFlags>(regex::AllFlags::Global)
            | regex::ECMAScriptFlags::BrowserExtended
            | test.options;
        Regex<ECMA262> re(test.pattern, flags);

        // Subjects are written as UTF-8 literals; matching is done on a UTF-16 view.
        auto utf16_subject = AK::utf8_to_utf16(test.subject);
        Utf16View view { utf16_subject };

        if constexpr (REGEX_DEBUG) {
            // Dump the compiled bytecode to stderr, framed by blank lines.
            dbgln("\n");
            RegexDebug bytecode_dumper(stderr);
            bytecode_dumper.print_raw_bytecode(re);
            bytecode_dumper.print_header();
            bytecode_dumper.print_bytecode(re);
            dbgln("\n");
        }

        // Every pattern in the table must parse cleanly, including the non-Unicode rows.
        EXPECT_EQ(re.parser_result.error, Error::NoError);
        EXPECT_EQ(re.match(view).success, test.matches);
    }
}
TEST_CASE(replace)
{
struct _test {