mirror of
https://github.com/RGBCube/serenity
synced 2025-06-29 03:42:07 +00:00
LibRegex: Support non-ASCII whitespace characters when matching \s or \S
ECMA-262 defines \s as: "Return the CharSet containing all characters corresponding to a code point on the right-hand side of the WhiteSpace or LineTerminator productions." The LineTerminator production is simply: U+000A, U+000D, U+2028, or U+2029. Unfortunately, there isn't a Unicode property that covers just those code points, so they are checked explicitly. The WhiteSpace production is: U+0009, U+000B, U+000C, U+FEFF, or any code point with the Space_Separator general category. If the Unicode generators are disabled, this will fall back to matching only ASCII space code points.
This commit is contained in:
parent
54845c4bf2
commit
2212aa2388
2 changed files with 22 additions and 1 deletion
|
@@ -659,6 +659,18 @@ ALWAYS_INLINE bool OpCode_Compare::compare_string(MatchInput const& input, Match
|
|||
|
||||
ALWAYS_INLINE void OpCode_Compare::compare_character_class(MatchInput const& input, MatchState& state, CharClass character_class, u32 ch, bool inverse, bool& inverse_matched)
|
||||
{
|
||||
auto is_space_or_line_terminator = [](u32 code_point) {
|
||||
static auto space_separator = Unicode::general_category_from_string("Space_Separator"sv);
|
||||
if (!space_separator.has_value())
|
||||
return is_ascii_space(code_point);
|
||||
|
||||
if ((code_point == 0x0a) || (code_point == 0x0d) || (code_point == 0x2028) || (code_point == 0x2029))
|
||||
return true;
|
||||
if ((code_point == 0x09) || (code_point == 0x0b) || (code_point == 0x0c) || (code_point == 0xfeff))
|
||||
return true;
|
||||
return Unicode::code_point_has_general_category(code_point, *space_separator);
|
||||
};
|
||||
|
||||
switch (character_class) {
|
||||
case CharClass::Alnum:
|
||||
if (is_ascii_alphanumeric(ch)) {
|
||||
|
@@ -729,7 +741,7 @@ ALWAYS_INLINE void OpCode_Compare::compare_character_class(MatchInput const& inp
|
|||
}
|
||||
break;
|
||||
case CharClass::Space:
|
||||
if (is_ascii_space(ch)) {
|
||||
if (is_space_or_line_terminator(ch)) {
|
||||
if (inverse)
|
||||
inverse_matched = true;
|
||||
else
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue