From 325eabc770a92e3651e9c5fc74820622eae8ee48 Mon Sep 17 00:00:00 2001 From: Timothy Flynn Date: Mon, 16 Aug 2021 10:28:26 -0400 Subject: [PATCH] LibRegex: Ensure the GoBack operation decrements the code unit index This was missed in commit 27d555bab0d84913599cea3c4a6b0a0ed2a15b66. --- Tests/LibRegex/Regex.cpp | 4 ++++ Userland/Libraries/LibRegex/RegexByteCode.cpp | 15 +++++++++++++-- Userland/Libraries/LibRegex/RegexMatch.h | 16 ++++++++++++++++ 3 files changed, 33 insertions(+), 2 deletions(-) diff --git a/Tests/LibRegex/Regex.cpp b/Tests/LibRegex/Regex.cpp index 7b08d909e1..8cb3a87c9e 100644 --- a/Tests/LibRegex/Regex.cpp +++ b/Tests/LibRegex/Regex.cpp @@ -637,6 +637,8 @@ TEST_CASE(ECMA262_match) { "(a{4}){2}"sv, "aaaaaaaa"sv }, { "(a{4}){2}"sv, "aaaaaabaa"sv, false }, { "\\u{4}"sv, "uuuu" }, + { "(?<=.{3})f"sv, "abcdef"sv, true, (ECMAScriptFlags)regex::AllFlags::Global }, + { "(?<=.{3})f"sv, "abc😀ef"sv, true, (ECMAScriptFlags)regex::AllFlags::Global }, // ECMA262, B.1.4. Regular Expression Pattern extensions for browsers { "{"sv, "{"sv, true, ECMAScriptFlags::BrowserExtended }, { "\\5"sv, "\5"sv, true, ECMAScriptFlags::BrowserExtended }, @@ -694,6 +696,8 @@ TEST_CASE(ECMA262_unicode_match) { "\\u{1f600}"sv, "😀"sv, true, ECMAScriptFlags::Unicode }, { "\\ud83d\\ud83d"sv, "\xed\xa0\xbd\xed\xa0\xbd"sv, true }, { "\\ud83d\\ud83d"sv, "\xed\xa0\xbd\xed\xa0\xbd"sv, true, ECMAScriptFlags::Unicode }, + { "(?<=.{3})f"sv, "abcdef"sv, true, ECMAScriptFlags::Unicode }, + { "(?<=.{3})f"sv, "abc😀ef"sv, true, ECMAScriptFlags::Unicode }, }; for (auto& test : tests) { diff --git a/Userland/Libraries/LibRegex/RegexByteCode.cpp b/Userland/Libraries/LibRegex/RegexByteCode.cpp index 0d411ba34c..00b4c0a299 100644 --- a/Userland/Libraries/LibRegex/RegexByteCode.cpp +++ b/Userland/Libraries/LibRegex/RegexByteCode.cpp @@ -102,6 +102,17 @@ static void advance_string_position(MatchState& state, RegexStringView const& vi } } +static void reverse_string_position(MatchState& state, RegexStringView const& view, size_t amount) +{ + VERIFY(state.string_position >= amount); + state.string_position -= amount; + + if (view.unicode()) + state.string_position_in_code_units = view.code_unit_offset_of(state.string_position); + else + state.string_position_in_code_units -= amount; +} + static void save_string_position(MatchInput const& input, MatchState const& state) { input.saved_positions.append(state.string_position); @@ -226,12 +237,12 @@ ALWAYS_INLINE ExecutionResult OpCode_Restore::execute(MatchInput const& input, M return ExecutionResult::Continue; } -ALWAYS_INLINE ExecutionResult OpCode_GoBack::execute(MatchInput const&, MatchState& state) const +ALWAYS_INLINE ExecutionResult OpCode_GoBack::execute(MatchInput const& input, MatchState& state) const { if (count() > state.string_position) return ExecutionResult::Failed_ExecuteLowPrioForks; - state.string_position -= count(); + reverse_string_position(state, input.view, count()); return ExecutionResult::Continue; } diff --git a/Userland/Libraries/LibRegex/RegexMatch.h b/Userland/Libraries/LibRegex/RegexMatch.h index 0fdb27f147..f11eb860b1 100644 --- a/Userland/Libraries/LibRegex/RegexMatch.h +++ b/Userland/Libraries/LibRegex/RegexMatch.h @@ -274,6 +274,22 @@ public: }); } + size_t code_unit_offset_of(size_t code_point_index) const + { + return m_view.visit( + [&](StringView const& view) -> u32 { + Utf8View utf8_view { view }; + return utf8_view.byte_offset_of(code_point_index); + }, + [&](Utf32View const&) -> u32 { return code_point_index; }, + [&](Utf16View const& view) -> u32 { + return view.code_unit_offset_of(code_point_index); + }, + [&](Utf8View const& view) -> u32 { + return view.byte_offset_of(code_point_index); + }); + } + bool operator==(char const* cstring) const { return m_view.visit(