From 47f6bb38a1bd3c39324d11b4eec1d8d8993658a2 Mon Sep 17 00:00:00 2001 From: Timothy Flynn Date: Tue, 20 Jul 2021 22:33:00 -0400 Subject: [PATCH] LibRegex: Support UTF-16 RegexStringView and improve Unicode matching When the Unicode option is not set, regular expressions should match based on code units; when it is set, they should match based on code points. To do so, the regex parser must combine surrogate pairs when the Unicode option is set. Further, RegexStringView needs to know if the flag is set in order to return code point vs. code unit based string lengths and substrings. --- Tests/LibRegex/Regex.cpp | 45 +++++++- Userland/Libraries/LibRegex/RegexByteCode.cpp | 6 +- Userland/Libraries/LibRegex/RegexMatch.h | 102 +++++++++++++++--- Userland/Libraries/LibRegex/RegexMatcher.cpp | 4 + Userland/Libraries/LibRegex/RegexParser.cpp | 31 ++++-- 5 files changed, 167 insertions(+), 21 deletions(-) diff --git a/Tests/LibRegex/Regex.cpp b/Tests/LibRegex/Regex.cpp index 1643acd37b..a4731e62d3 100644 --- a/Tests/LibRegex/Regex.cpp +++ b/Tests/LibRegex/Regex.cpp @@ -506,10 +506,14 @@ TEST_CASE(ECMA262_parse) { ",(?", regex::Error::InvalidCaptureGroup }, // #4583 { "{1}", regex::Error::InvalidPattern }, { "{1,2}", regex::Error::InvalidPattern }, + { "\\uxxxx", regex::Error::NoError }, + { "\\uxxxx", regex::Error::InvalidPattern, ECMAScriptFlags::Unicode }, + { "\\ud83d", regex::Error::NoError, ECMAScriptFlags::Unicode }, + { "\\ud83d\\uxxxx", regex::Error::InvalidPattern, ECMAScriptFlags::Unicode }, }; for (auto& test : tests) { - Regex re(test.pattern); + Regex re(test.pattern, test.flags); EXPECT_EQ(re.parser_result.error, test.expected_error); if constexpr (REGEX_DEBUG) { dbgln("\n"); @@ -586,6 +590,45 @@ TEST_CASE(ECMA262_match) } } +TEST_CASE(ECMA262_unicode_match) +{ + struct _test { + char const* pattern; + char const* subject; + bool matches { true }; + ECMAScriptFlags options {}; + }; + _test tests[] { + { "\\ud83d", "😀", true }, + { "\\ud83d", "😀", false, ECMAScriptFlags::Unicode }, + { "\\ude00", "😀", true }, + { "\\ude00", "😀", false, ECMAScriptFlags::Unicode }, + { "\\ud83d\\ude00", "😀", true }, + { "\\ud83d\\ude00", "😀", true, ECMAScriptFlags::Unicode }, + { "\\ud83d\\ud83d", "\xed\xa0\xbd\xed\xa0\xbd", true }, + { "\\ud83d\\ud83d", "\xed\xa0\xbd\xed\xa0\xbd", true, ECMAScriptFlags::Unicode }, + }; + + for (auto& test : tests) { + Regex re(test.pattern, (ECMAScriptFlags)regex::AllFlags::Global | test.options); + + auto subject = AK::utf8_to_utf16(test.subject); + Utf16View view { subject }; + + if constexpr (REGEX_DEBUG) { + dbgln("\n"); + RegexDebug regex_dbg(stderr); + regex_dbg.print_raw_bytecode(re); + regex_dbg.print_header(); + regex_dbg.print_bytecode(re); + dbgln("\n"); + } + + EXPECT_EQ(re.parser_result.error, Error::NoError); + EXPECT_EQ(re.match(view).success, test.matches); + } +} + TEST_CASE(replace) { struct _test { diff --git a/Userland/Libraries/LibRegex/RegexByteCode.cpp b/Userland/Libraries/LibRegex/RegexByteCode.cpp index 677ad0cb0d..f5869acbdb 100644 --- a/Userland/Libraries/LibRegex/RegexByteCode.cpp +++ b/Userland/Libraries/LibRegex/RegexByteCode.cpp @@ -465,12 +465,13 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M return ExecutionResult::Failed_ExecuteLowPrioForks; Optional str; + Vector utf16; Vector data; data.ensure_capacity(length); for (size_t i = offset; i < offset + length; ++i) data.unchecked_append(m_bytecode->at(i)); - auto view = input.view.construct_as_same(data, str); + auto view = input.view.construct_as_same(data, str, utf16); offset += length; if (!compare_string(input, state, view, had_zero_length_match)) return ExecutionResult::Failed_ExecuteLowPrioForks; @@ -553,7 +554,8 @@ ALWAYS_INLINE void OpCode_Compare::compare_char(MatchInput const& input, MatchSt auto input_view = input.view.substring_view(state.string_position, 1); Optional str; - auto compare_view = input_view.construct_as_same({ &ch1, 1 }, str); + Vector utf16; + auto compare_view = input_view.construct_as_same({ &ch1, 1 }, str, utf16); bool equal; if (input.regex_options & AllFlags::Insensitive) equal = input_view.equals_ignoring_case(compare_view); diff --git a/Userland/Libraries/LibRegex/RegexMatch.h b/Userland/Libraries/LibRegex/RegexMatch.h index b58dc5e132..6bc58ad78f 100644 --- a/Userland/Libraries/LibRegex/RegexMatch.h +++ b/Userland/Libraries/LibRegex/RegexMatch.h @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -43,6 +44,11 @@ public: { } + RegexStringView(Utf16View view) + : m_view(view) + { + } + RegexStringView(Utf8View view) : m_view(view) { @@ -58,11 +64,19 @@ public: return m_view.get(); } + Utf16View const& u16_view() const + { + return m_view.get(); + } + Utf8View const& u8_view() const { return m_view.get(); } + bool unicode() const { return m_unicode; } + void set_unicode(bool unicode) { m_unicode = unicode; } + bool is_empty() const { return m_view.visit([](auto& view) { return view.is_empty(); }); @@ -75,12 +89,21 @@ public: size_t length() const { - return m_view.visit([](auto& view) { return view.length(); }); + if (unicode()) { + return m_view.visit( + [](Utf16View const& view) { return view.length_in_code_points(); }, + [](auto const& view) { return view.length(); }); + } + + return m_view.visit( + [](Utf16View const& view) { return view.length_in_code_units(); }, + [](Utf8View const& view) { return view.byte_length(); }, + [](auto const& view) { return view.length(); }); } - RegexStringView construct_as_same(Span data, Optional& optional_string_storage) const + RegexStringView construct_as_same(Span data, Optional& optional_string_storage, Vector& optional_utf16_storage) const { - return m_view.visit( + auto view = m_view.visit( [&](T const&) { StringBuilder builder; for (auto ch : data) @@ -90,7 +113,14 @@ public: }, [&](Utf32View) { return RegexStringView { Utf32View { data.data(), data.size() } }; + }, + [&](Utf16View) { + optional_utf16_storage = AK::utf32_to_utf16(Utf32View { data.data(), data.size() }); + return RegexStringView { Utf16View { optional_utf16_storage } }; }); + + view.set_unicode(unicode()); + return view; } Vector lines() const @@ -118,6 +148,21 @@ public: views.empend(view); return views; }, + [](Utf16View view) { + Vector views; + u16 newline = '\n'; + while (!view.is_empty()) { + auto position = AK::memmem_optional(view.data(), view.length_in_code_units() * sizeof(u16), &newline, sizeof(u16)); + if (!position.has_value()) + break; + auto offset = position.value() / sizeof(u16); + views.empend(view.substring_view(0, offset)); + view = view.substring_view(offset + 1, view.length_in_code_units() - offset - 1); + } + if (!view.is_empty()) + views.empend(view); + return views; + }, [](Utf8View& view) { Vector views; auto it = view.begin(); @@ -147,15 +192,26 @@ public: RegexStringView substring_view(size_t offset, size_t length) const { - return m_view.visit( - [&](auto view) { return RegexStringView { view.substring_view(offset, length) }; }, - [&](Utf8View const& view) { return RegexStringView { view.unicode_substring_view(offset, length) }; }); + if (unicode()) { + auto view = m_view.visit( + [&](auto view) { return RegexStringView { view.substring_view(offset, length) }; }, + [&](Utf16View const& view) { return RegexStringView { view.unicode_substring_view(offset, length) }; }, + [&](Utf8View const& view) { return RegexStringView { view.unicode_substring_view(offset, length) }; }); + + view.set_unicode(unicode()); + return view; + } + + auto view = m_view.visit([&](auto view) { return RegexStringView { view.substring_view(offset, length) }; }); + view.set_unicode(unicode()); + return view; } String to_string() const { return m_view.visit( [](StringView view) { return view.to_string(); }, + [](Utf16View view) { return view.to_utf8(Utf16View::AllowInvalidCodeUnits::Yes); }, [](auto& view) { StringBuilder builder; for (auto it = view.begin(); it != view.end(); ++it) @@ -173,8 +229,8 @@ public: return 256u + ch; return ch; }, - [&](auto view) -> u32 { return view[index]; }, - [&](Utf8View& view) -> u32 { + [&](Utf32View& view) -> u32 { return view[index]; }, + [&](auto& view) -> u32 { size_t i = index; for (auto it = view.begin(); it != view.end(); ++it, --i) { if (i == 0) @@ -188,6 +244,7 @@ public: { return m_view.visit( [&](Utf32View) { return to_string() == cstring; }, + [&](Utf16View) { return to_string() == cstring; }, [&](Utf8View const& view) { return view.as_string() == cstring; }, [&](StringView view) { return view == cstring; }); } @@ -201,6 +258,7 @@ public: { return m_view.visit( [&](Utf32View) { return to_string() == string; }, + [&](Utf16View) { return to_string() == string; }, [&](Utf8View const& view) { return view.as_string() == string; }, [&](StringView view) { return view == string; }); } @@ -209,6 +267,7 @@ public: { return m_view.visit( [&](Utf32View) { return to_string() == string; }, + [&](Utf16View) { return to_string() == string; }, [&](Utf8View const& view) { return view.as_string() == string; }, [&](StringView view) { return view == string; }); } @@ -224,6 +283,7 @@ public: [&](Utf32View view) { return view.length() == other.length() && __builtin_memcmp(view.code_points(), other.code_points(), view.length() * sizeof(u32)) == 0; }, + [&](Utf16View) { return to_string() == RegexStringView { other }.to_string(); }, [&](Utf8View const& view) { return view.as_string() == RegexStringView { other }.to_string(); }, [&](StringView view) { return view == RegexStringView { other }.to_string(); }); } @@ -233,12 +293,25 @@ public: return !(*this == other); } + bool operator==(Utf16View const& other) const + { + return m_view.visit( + [&](Utf32View) { return to_string() == RegexStringView { other }.to_string(); }, + [&](Utf16View const& view) { return view == other; }, + [&](Utf8View const& view) { return view.as_string() == RegexStringView { other }.to_string(); }, + [&](StringView view) { return view == RegexStringView { other }.to_string(); }); + } + + bool operator!=(Utf16View const& other) const + { + return !(*this == other); + } + bool operator==(Utf8View const& other) const { return m_view.visit( - [&](Utf32View) { - return to_string() == other.as_string(); - }, + [&](Utf32View) { return to_string() == other.as_string(); }, + [&](Utf16View) { return to_string() == other.as_string(); }, [&](Utf8View const& view) { return view.as_string() == other.as_string(); }, [&](StringView view) { return other.as_string() == view; }); } @@ -271,6 +344,9 @@ public: [&](Utf32View) -> bool { TODO(); }, + [&](Utf16View) -> bool { + TODO(); + }, [&](Utf8View const& view) { return view.as_string().starts_with(str); }, [&](StringView view) { return view.starts_with(str); }); } @@ -289,6 +365,7 @@ public: } return true; }, + [&](Utf16View) -> bool { TODO(); }, [&](Utf8View const& view) { auto it = view.begin(); for (auto code_point : str) { @@ -304,7 +381,8 @@ public: } private: - Variant m_view; + Variant m_view; + bool m_unicode { false }; }; class Match final { diff --git a/Userland/Libraries/LibRegex/RegexMatcher.cpp b/Userland/Libraries/LibRegex/RegexMatcher.cpp index 60783b25f6..f4a848741a 100644 --- a/Userland/Libraries/LibRegex/RegexMatcher.cpp +++ b/Userland/Libraries/LibRegex/RegexMatcher.cpp @@ -84,6 +84,10 @@ RegexResult Matcher::match(Vector const views, Optional output.operations = 0; size_t lines_to_skip = 0; + bool unicode = input.regex_options.has_flag_set(AllFlags::Unicode); + for (auto& view : views) + const_cast(view).set_unicode(unicode); + if (input.regex_options.has_flag_set(AllFlags::Internal_Stateful)) { if (views.size() > 1 && input.start_offset > views.first().length()) { dbgln_if(REGEX_DEBUG, "Started with start={}, goff={}, skip={}", input.start_offset, input.global_offset, lines_to_skip); diff --git a/Userland/Libraries/LibRegex/RegexParser.cpp b/Userland/Libraries/LibRegex/RegexParser.cpp index 12c62fef78..07885173b6 100644 --- a/Userland/Libraries/LibRegex/RegexParser.cpp +++ b/Userland/Libraries/LibRegex/RegexParser.cpp @@ -10,6 +10,7 @@ #include #include #include +#include namespace regex { @@ -1440,13 +1441,31 @@ bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_mini if (try_skip("u")) { if (auto code_point = read_digits(ReadDigitsInitialZeroState::Allow, true, 4); code_point.has_value()) { - // FIXME: The minimum length depends on the mode - should be utf8-length in u8 mode. + // In Unicode mode, we need to combine surrogate pairs into a single code point. But we also need to be + // rather forgiving if the surrogate pairs are invalid. So if a second code unit follows this code unit, + // but doesn't form a valid surrogate pair, insert bytecode for both code units individually. + Optional low_surrogate; + if (unicode && Utf16View::is_high_surrogate(*code_point) && try_skip("\\u")) { + low_surrogate = read_digits(ReadDigitsInitialZeroState::Allow, true, 4); + if (!low_surrogate.has_value()) { + set_error(Error::InvalidPattern); + return false; + } + + if (Utf16View::is_low_surrogate(*low_surrogate)) { + *code_point = Utf16View::decode_surrogate_pair(*code_point, *low_surrogate); + low_surrogate.clear(); + } + } + match_length_minimum += 1; - StringBuilder builder; - builder.append_code_point(code_point.value()); - // FIXME: This isn't actually correct for ECMAScript. - auto u8_encoded = builder.string_view(); - stack.insert_bytecode_compare_string(u8_encoded); + stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)code_point.value() } }); + + if (low_surrogate.has_value()) { + match_length_minimum += 1; + stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)low_surrogate.value() } }); + } + return true; } else if (!unicode) { // '\u' is allowed in non-unicode mode, just matches 'u'.