1
Fork 0
mirror of https://github.com/RGBCube/serenity synced 2025-05-31 09:48:11 +00:00

LibRegex: Support UTF-16 RegexStringView and improve Unicode matching

When the Unicode option is not set, regular expressions should match
based on code units; when it is set, they should match based on code
points. To match on code points, the regex parser must combine surrogate
pairs when the Unicode option is set. Further, RegexStringView needs to
know whether the flag is set in order to return code-point-based vs.
code-unit-based string lengths and substrings.
This commit is contained in:
Timothy Flynn 2021-07-20 22:33:00 -04:00 committed by Linus Groh
parent 2e45e52993
commit 47f6bb38a1
5 changed files with 167 additions and 21 deletions

View file

@ -465,12 +465,13 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
return ExecutionResult::Failed_ExecuteLowPrioForks;
Optional<String> str;
Vector<u16> utf16;
Vector<u32> data;
data.ensure_capacity(length);
for (size_t i = offset; i < offset + length; ++i)
data.unchecked_append(m_bytecode->at(i));
auto view = input.view.construct_as_same(data, str);
auto view = input.view.construct_as_same(data, str, utf16);
offset += length;
if (!compare_string(input, state, view, had_zero_length_match))
return ExecutionResult::Failed_ExecuteLowPrioForks;
@ -553,7 +554,8 @@ ALWAYS_INLINE void OpCode_Compare::compare_char(MatchInput const& input, MatchSt
auto input_view = input.view.substring_view(state.string_position, 1);
Optional<String> str;
auto compare_view = input_view.construct_as_same({ &ch1, 1 }, str);
Vector<u16> utf16;
auto compare_view = input_view.construct_as_same({ &ch1, 1 }, str, utf16);
bool equal;
if (input.regex_options & AllFlags::Insensitive)
equal = input_view.equals_ignoring_case(compare_view);