mirror of
https://github.com/RGBCube/serenity
synced 2025-07-25 22:27:35 +00:00
LibRegex: Support UTF-16 RegexStringView and improve Unicode matching
When the Unicode option is not set, regular expressions should match based on code units; when it is set, they should match based on code points. To do so, the regex parser must combine surrogate pairs when the Unicode option is set. Further, RegexStringView needs to know whether the flag is set in order to return code-point-based vs. code-unit-based string lengths and substrings.
This commit is contained in:
parent
2e45e52993
commit
47f6bb38a1
5 changed files with 167 additions and 21 deletions
|
@@ -84,6 +84,10 @@ RegexResult Matcher<Parser>::match(Vector<RegexStringView> const views, Optional
|
|||
output.operations = 0;
|
||||
size_t lines_to_skip = 0;
|
||||
|
||||
bool unicode = input.regex_options.has_flag_set(AllFlags::Unicode);
|
||||
for (auto& view : views)
|
||||
const_cast<RegexStringView&>(view).set_unicode(unicode);
|
||||
|
||||
if (input.regex_options.has_flag_set(AllFlags::Internal_Stateful)) {
|
||||
if (views.size() > 1 && input.start_offset > views.first().length()) {
|
||||
dbgln_if(REGEX_DEBUG, "Started with start={}, goff={}, skip={}", input.start_offset, input.global_offset, lines_to_skip);
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue