mirror of
https://github.com/RGBCube/serenity
synced 2025-07-24 18:07:35 +00:00
LibRegex: Support non-ASCII case-insensitive character comparisons
Specifically, when the Unicode flag is set, use Unicode-aware case folding to case-insensitively compare code points.
This commit is contained in:
parent
3fbf33bd37
commit
e122039c99
2 changed files with 23 additions and 4 deletions
|
@@ -757,6 +757,18 @@ TEST_CASE(ECMA262_unicode_match)
|
||||||
{ "(?<\\ud835\\udcd1\\ud835\\udcfb\\ud835\\udcf8\\ud835\\udd00\\ud835\\udcf7>brown)"sv, "brown"sv, true, ECMAScriptFlags::Unicode },
|
{ "(?<\\ud835\\udcd1\\ud835\\udcfb\\ud835\\udcf8\\ud835\\udd00\\ud835\\udcf7>brown)"sv, "brown"sv, true, ECMAScriptFlags::Unicode },
|
||||||
{ "^\\s+$"sv, space_and_line_terminators },
|
{ "^\\s+$"sv, space_and_line_terminators },
|
||||||
{ "^\\s+$"sv, space_and_line_terminators, true, ECMAScriptFlags::Unicode },
|
{ "^\\s+$"sv, space_and_line_terminators, true, ECMAScriptFlags::Unicode },
|
||||||
|
{ "[\\u0390]"sv, "\u1fd3"sv, false, ECMAScriptFlags::Unicode },
|
||||||
|
{ "[\\u1fd3]"sv, "\u0390"sv, false, ECMAScriptFlags::Unicode },
|
||||||
|
{ "[\\u0390]"sv, "\u1fd3"sv, true, combine_flags(ECMAScriptFlags::Unicode, ECMAScriptFlags::Insensitive) },
|
||||||
|
{ "[\\u1fd3]"sv, "\u0390"sv, true, combine_flags(ECMAScriptFlags::Unicode, ECMAScriptFlags::Insensitive) },
|
||||||
|
{ "[\\u03b0]"sv, "\u1fe3"sv, false, ECMAScriptFlags::Unicode },
|
||||||
|
{ "[\\u1fe3]"sv, "\u03b0"sv, false, ECMAScriptFlags::Unicode },
|
||||||
|
{ "[\\u03b0]"sv, "\u1fe3"sv, true, combine_flags(ECMAScriptFlags::Unicode, ECMAScriptFlags::Insensitive) },
|
||||||
|
{ "[\\u1fe3]"sv, "\u03b0"sv, true, combine_flags(ECMAScriptFlags::Unicode, ECMAScriptFlags::Insensitive) },
|
||||||
|
{ "[\\ufb05]"sv, "\ufb06"sv, false, ECMAScriptFlags::Unicode },
|
||||||
|
{ "[\\ufb06]"sv, "\ufb05"sv, false, ECMAScriptFlags::Unicode },
|
||||||
|
{ "[\\ufb05]"sv, "\ufb06"sv, true, combine_flags(ECMAScriptFlags::Unicode, ECMAScriptFlags::Insensitive) },
|
||||||
|
{ "[\\ufb06]"sv, "\ufb05"sv, true, combine_flags(ECMAScriptFlags::Unicode, ECMAScriptFlags::Insensitive) },
|
||||||
};
|
};
|
||||||
|
|
||||||
for (auto& test : tests) {
|
for (auto& test : tests) {
|
||||||
|
|
|
@@ -701,12 +701,19 @@ ALWAYS_INLINE void OpCode_Compare::compare_char(MatchInput const& input, MatchSt
|
||||||
return;
|
return;
|
||||||
|
|
||||||
// FIXME: Figure out how to do this if unicode() without performing a substring split first.
|
// FIXME: Figure out how to do this if unicode() without performing a substring split first.
|
||||||
auto input_view = input.view.unicode() ? input.view.substring_view(state.string_position, 1)[0] : input.view.code_unit_at(state.string_position_in_code_units);
|
auto input_view = input.view.unicode()
|
||||||
|
? input.view.substring_view(state.string_position, 1)[0]
|
||||||
|
: input.view.code_unit_at(state.string_position_in_code_units);
|
||||||
|
|
||||||
bool equal;
|
bool equal;
|
||||||
if (input.regex_options & AllFlags::Insensitive)
|
if (input.regex_options & AllFlags::Insensitive) {
|
||||||
equal = to_ascii_lowercase(input_view) == to_ascii_lowercase(ch1); // FIXME: Implement case-insensitive matching for non-ascii characters
|
if (input.view.unicode())
|
||||||
else
|
equal = Unicode::equals_ignoring_case(Utf32View { &input_view, 1 }, Utf32View { &ch1, 1 });
|
||||||
|
else
|
||||||
|
equal = to_ascii_lowercase(input_view) == to_ascii_lowercase(ch1);
|
||||||
|
} else {
|
||||||
equal = input_view == ch1;
|
equal = input_view == ch1;
|
||||||
|
}
|
||||||
|
|
||||||
if (equal) {
|
if (equal) {
|
||||||
if (inverse)
|
if (inverse)
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue