From 2212aa2388c4f6abae577daa2cbb27f8796939d4 Mon Sep 17 00:00:00 2001 From: Timothy Flynn Date: Sat, 5 Feb 2022 10:43:13 -0500 Subject: [PATCH] LibRegex: Support non-ASCII whitespace characters when matching \s or \S ECMA-262 defines \s as: Return the CharSet containing all characters corresponding to a code point on the right-hand side of the WhiteSpace or LineTerminator productions. The LineTerminator production is simply: U+000A, U+000D, U+2028, or U+2029. Unfortunately there isn't a Unicode property that covers just those code points. The WhiteSpace production is: U+0009, U+000B, U+000C, U+FEFF, or any code point with the Space_Separator general category. If the Unicode generators are disabled, this will fall back to ASCII space code points. --- Tests/LibRegex/Regex.cpp | 9 +++++++++ Userland/Libraries/LibRegex/RegexByteCode.cpp | 14 +++++++++++++- 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/Tests/LibRegex/Regex.cpp b/Tests/LibRegex/Regex.cpp index c7e699c194..7c4dfe80fe 100644 --- a/Tests/LibRegex/Regex.cpp +++ b/Tests/LibRegex/Regex.cpp @@ -706,6 +706,13 @@ TEST_CASE(ECMA262_match) TEST_CASE(ECMA262_unicode_match) { + constexpr auto space_and_line_terminator_code_points = Array { 0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x0020, 0x00A0, 0x1680, 0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, 0x2006, 0x2007, 0x2008, 0x2009, 0x200A, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000, 0xFEFF }; + + StringBuilder builder; + for (u32 code_point : space_and_line_terminator_code_points) + builder.append_code_point(code_point); + auto space_and_line_terminators = builder.build(); + struct _test { StringView pattern; StringView subject; @@ -729,6 +736,8 @@ TEST_CASE(ECMA262_unicode_match) { "(?<𝓑𝓻𝓸𝔀𝓷>brown)"sv, "brown"sv, true, ECMAScriptFlags::Unicode }, { "(?<\\u{1d4d1}\\u{1d4fb}\\u{1d4f8}\\u{1d500}\\u{1d4f7}>brown)"sv, "brown"sv, true, ECMAScriptFlags::Unicode }, { "(?<\\ud835\\udcd1\\ud835\\udcfb\\ud835\\udcf8\\ud835\\udd00\\ud835\\udcf7>brown)"sv, "brown"sv, true, ECMAScriptFlags::Unicode }, + { "^\\s+$"sv, space_and_line_terminators }, + { "^\\s+$"sv, space_and_line_terminators, true, ECMAScriptFlags::Unicode }, }; for (auto& test : tests) { diff --git a/Userland/Libraries/LibRegex/RegexByteCode.cpp b/Userland/Libraries/LibRegex/RegexByteCode.cpp index eb9b19dbc9..4a1021ff77 100644 --- a/Userland/Libraries/LibRegex/RegexByteCode.cpp +++ b/Userland/Libraries/LibRegex/RegexByteCode.cpp @@ -659,6 +659,18 @@ ALWAYS_INLINE bool OpCode_Compare::compare_string(MatchInput const& input, Match ALWAYS_INLINE void OpCode_Compare::compare_character_class(MatchInput const& input, MatchState& state, CharClass character_class, u32 ch, bool inverse, bool& inverse_matched) { + auto is_space_or_line_terminator = [](u32 code_point) { + static auto space_separator = Unicode::general_category_from_string("Space_Separator"sv); + if (!space_separator.has_value()) + return is_ascii_space(code_point); + + if ((code_point == 0x0a) || (code_point == 0x0d) || (code_point == 0x2028) || (code_point == 0x2029)) + return true; + if ((code_point == 0x09) || (code_point == 0x0b) || (code_point == 0x0c) || (code_point == 0xfeff)) + return true; + return Unicode::code_point_has_general_category(code_point, *space_separator); + }; + switch (character_class) { case CharClass::Alnum: if (is_ascii_alphanumeric(ch)) { @@ -729,7 +741,7 @@ ALWAYS_INLINE void OpCode_Compare::compare_character_class(MatchInput const& inp } break; case CharClass::Space: - if (is_ascii_space(ch)) { + if (is_space_or_line_terminator(ch)) { if (inverse) inverse_matched = true; else