1
Fork 0
mirror of https://github.com/RGBCube/serenity synced 2025-07-25 13:57:35 +00:00

LibRegex+Everywhere: Make LibRegex more unicode-aware

This commit makes LibRegex (mostly) capable of operating on any of
the three main string view types:
- StringView for raw strings
- Utf8View for UTF-8 encoded strings
- Utf32View for raw Unicode strings

As a result, regular expressions matched against Unicode strings should
now handle UTF-8 properly and no longer stop in the middle of a code point.
A future commit will update LibJS to use the correct type of string view
depending on the flags.
This commit is contained in:
Ali Mohammad Pur 2021-07-18 05:07:01 +04:30 committed by Ali Mohammad Pur
parent e5af15a6e9
commit f364fcec5d
8 changed files with 310 additions and 207 deletions

View file

@@ -249,7 +249,7 @@ TEST_CASE(char_utf8)
Regex<PosixExtended> re("😀");
RegexResult result;
-EXPECT_EQ((result = match("Привет, мир! 😀 γειά σου κόσμος 😀 こんにちは世界", re, PosixFlags::Global)).success, true);
+EXPECT_EQ((result = match(Utf8View { "Привет, мир! 😀 γειά σου κόσμος 😀 こんにちは世界" }, re, PosixFlags::Global)).success, true);
EXPECT_EQ(result.count, 2u);
}
@@ -312,7 +312,6 @@ TEST_CASE(match_all_character_class)
EXPECT_EQ(result.matches.at(0).view, "W");
EXPECT_EQ(result.matches.at(1).view, "i");
EXPECT_EQ(result.matches.at(2).view, "n");
-EXPECT(&result.matches.at(0).view.characters_without_null_termination()[0] != &str.view().characters_without_null_termination()[1]);
}
TEST_CASE(match_character_class_with_assertion)