diff --git a/Userland/Libraries/LibRegex/RegexByteCode.cpp b/Userland/Libraries/LibRegex/RegexByteCode.cpp index 20a65b3d0c..5463aa1def 100644 --- a/Userland/Libraries/LibRegex/RegexByteCode.cpp +++ b/Userland/Libraries/LibRegex/RegexByteCode.cpp @@ -88,6 +88,36 @@ static char const* character_class_name(CharClass ch_class) } } +static void advance_string_position(MatchState& state, RegexStringView const& view, Optional code_point = {}) +{ + ++state.string_position; + + if (view.unicode()) { + if (!code_point.has_value() && (state.string_position_in_code_units < view.length_in_code_units())) + code_point = view[state.string_position_in_code_units]; + if (code_point.has_value()) + state.string_position_in_code_units += view.length_of_code_point(*code_point); + } else { + ++state.string_position_in_code_units; + } +} + +static void save_string_position(MatchInput const& input, MatchState const& state) +{ + input.saved_positions.append(state.string_position); + input.saved_code_unit_positions.append(state.string_position_in_code_units); +} + +static bool restore_string_position(MatchInput const& input, MatchState& state) +{ + if (input.saved_positions.is_empty()) + return false; + + state.string_position = input.saved_positions.take_last(); + state.string_position_in_code_units = input.saved_code_unit_positions.take_last(); + return true; +} + OwnPtr ByteCode::s_opcodes[(size_t)OpCodeId::Last + 1]; bool ByteCode::s_opcodes_initialized { false }; @@ -188,16 +218,14 @@ ALWAYS_INLINE ExecutionResult OpCode_Exit::execute(MatchInput const& input, Matc ALWAYS_INLINE ExecutionResult OpCode_Save::execute(MatchInput const& input, MatchState& state, MatchOutput&) const { - input.saved_positions.append(state.string_position); + save_string_position(input, state); return ExecutionResult::Continue; } ALWAYS_INLINE ExecutionResult OpCode_Restore::execute(MatchInput const& input, MatchState& state, MatchOutput&) const { - if (input.saved_positions.is_empty()) + if (!restore_string_position(input, state)) return ExecutionResult::Failed; - - state.string_position = input.saved_positions.take_last(); return ExecutionResult::Continue; } @@ -254,7 +282,7 @@ ALWAYS_INLINE ExecutionResult OpCode_CheckBoundary::execute(MatchInput const& in auto isword = [](auto ch) { return is_ascii_alphanumeric(ch) || ch == '_'; }; auto is_word_boundary = [&] { if (state.string_position == input.view.length()) { - if (state.string_position > 0 && isword(input.view[state.string_position - 1])) + if (state.string_position > 0 && isword(input.view[state.string_position_in_code_units - 1])) return true; return false; } @@ -266,7 +294,7 @@ ALWAYS_INLINE ExecutionResult OpCode_CheckBoundary::execute(MatchInput const& in return false; } - return !!(isword(input.view[state.string_position]) ^ isword(input.view[state.string_position - 1])); + return !!(isword(input.view[state.string_position_in_code_units]) ^ isword(input.view[state.string_position_in_code_units - 1])); }; switch (type()) { case BoundaryCheckType::Word: { @@ -455,7 +483,7 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M return ExecutionResult::Failed_ExecuteLowPrioForks; VERIFY(!current_inversion_state()); - ++state.string_position; + advance_string_position(state, input.view); } else if (compare_type == CharacterCompareType::String) { VERIFY(!current_inversion_state()); @@ -484,7 +512,7 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M return ExecutionResult::Failed_ExecuteLowPrioForks; auto character_class = (CharClass)m_bytecode->at(offset++); - auto ch = input.view[state.string_position]; + auto ch = input.view[state.string_position_in_code_units]; compare_character_class(input, state, character_class, ch, current_inversion_state(), inverse_matched); @@ -496,7 +524,7 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M auto from = value.from; auto to = value.to; - auto ch = input.view[state.string_position]; + auto ch = input.view[state.string_position_in_code_units]; compare_character_range(input, state, from, to, ch, current_inversion_state(), inverse_matched); @@ -549,7 +577,7 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M } if (current_inversion_state() && !inverse_matched) - ++state.string_position; + advance_string_position(state, input.view); if ((!had_zero_length_match && string_position == state.string_position) || state.string_position > input.view.length()) return ExecutionResult::Failed_ExecuteLowPrioForks; @@ -576,7 +604,7 @@ ALWAYS_INLINE void OpCode_Compare::compare_char(MatchInput const& input, MatchSt if (inverse) inverse_matched = true; else - ++state.string_position; + advance_string_position(state, input.view, ch1); } } @@ -616,19 +644,19 @@ ALWAYS_INLINE void OpCode_Compare::compare_character_class(MatchInput const& inp if (inverse) inverse_matched = true; else - ++state.string_position; + advance_string_position(state, input.view, ch); } break; case CharClass::Alpha: if (is_ascii_alpha(ch)) - ++state.string_position; + advance_string_position(state, input.view, ch); break; case CharClass::Blank: if (is_ascii_blank(ch)) { if (inverse) inverse_matched = true; else - ++state.string_position; + advance_string_position(state, input.view, ch); } break; case CharClass::Cntrl: @@ -636,7 +664,7 @@ ALWAYS_INLINE void OpCode_Compare::compare_character_class(MatchInput const& inp if (inverse) inverse_matched = true; else - ++state.string_position; + advance_string_position(state, input.view, ch); } break; case CharClass::Digit: @@ -644,7 +672,7 @@ ALWAYS_INLINE void OpCode_Compare::compare_character_class(MatchInput const& inp if (inverse) inverse_matched = true; else - ++state.string_position; + advance_string_position(state, input.view, ch); } break; case CharClass::Graph: @@ -652,7 +680,7 @@ ALWAYS_INLINE void OpCode_Compare::compare_character_class(MatchInput const& inp if (inverse) inverse_matched = true; else - ++state.string_position; + advance_string_position(state, input.view, ch); } break; case CharClass::Lower: @@ -660,7 +688,7 @@ ALWAYS_INLINE void OpCode_Compare::compare_character_class(MatchInput const& inp if (inverse) inverse_matched = true; else - ++state.string_position; + advance_string_position(state, input.view, ch); } break; case CharClass::Print: @@ -668,7 +696,7 @@ ALWAYS_INLINE void OpCode_Compare::compare_character_class(MatchInput const& inp if (inverse) inverse_matched = true; else - ++state.string_position; + advance_string_position(state, input.view, ch); } break; case CharClass::Punct: @@ -676,7 +704,7 @@ ALWAYS_INLINE void OpCode_Compare::compare_character_class(MatchInput const& inp if (inverse) inverse_matched = true; else - ++state.string_position; + advance_string_position(state, input.view, ch); } break; case CharClass::Space: @@ -684,7 +712,7 @@ ALWAYS_INLINE void OpCode_Compare::compare_character_class(MatchInput const& inp if (inverse) inverse_matched = true; else - ++state.string_position; + advance_string_position(state, input.view, ch); } break; case CharClass::Upper: @@ -692,7 +720,7 @@ ALWAYS_INLINE void OpCode_Compare::compare_character_class(MatchInput const& inp if (inverse) inverse_matched = true; else - ++state.string_position; + advance_string_position(state, input.view, ch); } break; case CharClass::Word: @@ -700,7 +728,7 @@ ALWAYS_INLINE void OpCode_Compare::compare_character_class(MatchInput const& inp if (inverse) inverse_matched = true; else - ++state.string_position; + advance_string_position(state, input.view, ch); } break; case CharClass::Xdigit: @@ -708,7 +736,7 @@ ALWAYS_INLINE void OpCode_Compare::compare_character_class(MatchInput const& inp if (inverse) inverse_matched = true; else - ++state.string_position; + advance_string_position(state, input.view, ch); } break; } @@ -726,7 +754,7 @@ ALWAYS_INLINE void OpCode_Compare::compare_character_range(MatchInput const& inp if (inverse) inverse_matched = true; else - ++state.string_position; + advance_string_position(state, input.view, ch); } } @@ -735,14 +763,14 @@ ALWAYS_INLINE void OpCode_Compare::compare_property(MatchInput const& input, Mat if (state.string_position == input.view.length()) return; - u32 code_point = input.view[state.string_position]; + u32 code_point = input.view[state.string_position_in_code_units]; bool equal = Unicode::code_point_has_property(code_point, property); if (equal) { if (inverse) inverse_matched = true; else - ++state.string_position; + advance_string_position(state, input.view, code_point); } } @@ -751,14 +779,14 @@ ALWAYS_INLINE void OpCode_Compare::compare_general_category(MatchInput const& in if (state.string_position == input.view.length()) return; - u32 code_point = input.view[state.string_position]; + u32 code_point = input.view[state.string_position_in_code_units]; bool equal = Unicode::code_point_has_general_category(code_point, general_category); if (equal) { if (inverse) inverse_matched = true; else - ++state.string_position; + advance_string_position(state, input.view, code_point); } } diff --git a/Userland/Libraries/LibRegex/RegexMatch.h b/Userland/Libraries/LibRegex/RegexMatch.h index 7147938085..599e68451a 100644 --- a/Userland/Libraries/LibRegex/RegexMatch.h +++ b/Userland/Libraries/LibRegex/RegexMatch.h @@ -95,12 +95,37 @@ public: [](auto const& view) { return view.length(); }); } + return length_in_code_units(); + } + + size_t length_in_code_units() const + { return m_view.visit( [](Utf16View const& view) { return view.length_in_code_units(); }, [](Utf8View const& view) { return view.byte_length(); }, [](auto const& view) { return view.length(); }); } + size_t length_of_code_point(u32 code_point) const + { + return m_view.visit( + [](Utf32View const&) { return 1; }, + [&](Utf16View const&) { + if (code_point < 0x10000) + return 1; + return 2; + }, + [&](auto const&) { + if (code_point <= 0x7f) + return 1; + else if (code_point <= 0x07ff) + return 2; + else if (code_point <= 0xffff) + return 3; + return 4; + }); + } + RegexStringView typed_null_view() { auto view = m_view.visit( @@ -230,6 +255,7 @@ public: }); } + // Note: index must always be the code unit offset to return. u32 operator[](size_t index) const { return m_view.visit( @@ -239,17 +265,12 @@ public: return 256u + ch; return ch; }, - [&](Utf32View& view) -> u32 { return view[index]; }, - [&](Utf16View& view) -> u32 { return view.code_point_at(index); }, - [&](auto& view) -> u32 { - // FIXME: Iterating to the code point is inefficient, particularly for very large - // strings. Implement something like code_point_at to Utf8View. - size_t i = index; - for (auto it = view.begin(); it != view.end(); ++it, --i) { - if (i == 0) - return *it; - } - VERIFY_NOT_REACHED(); + [&](Utf32View const& view) -> u32 { return view[index]; }, + [&](Utf16View const& view) -> u32 { return view.code_point_at(index); }, + [&](Utf8View const& view) -> u32 { + auto it = view.iterator_at_byte_offset(index); + VERIFY(it != view.end()); + return *it; }); } @@ -462,11 +483,13 @@ struct MatchInput { mutable size_t fail_counter { 0 }; mutable Vector saved_positions; + mutable Vector saved_code_unit_positions; }; struct MatchState { size_t string_position_before_match { 0 }; size_t string_position { 0 }; + size_t string_position_in_code_units { 0 }; size_t instruction_position { 0 }; size_t fork_at_position { 0 }; Vector matches; diff --git a/Userland/Libraries/LibRegex/RegexMatcher.cpp b/Userland/Libraries/LibRegex/RegexMatcher.cpp index ffb6b5bf3d..6b6ce96c71 100644 --- a/Userland/Libraries/LibRegex/RegexMatcher.cpp +++ b/Userland/Libraries/LibRegex/RegexMatcher.cpp @@ -198,6 +198,7 @@ RegexResult Matcher::match(Vector const& views, Optiona auto view_length = view.length(); size_t view_index = m_pattern->start_offset; state.string_position = view_index; + state.string_position_in_code_units = view_index; bool succeeded = false; if (view_index == view_length && m_pattern->parser_result.match_length_minimum == 0) { @@ -210,6 +211,7 @@ RegexResult Matcher::match(Vector const& views, Optiona input.match_index = match_count; state.string_position = view_index; + state.string_position_in_code_units = view_index; state.instruction_position = 0; auto success = execute(input, state, temp_output); @@ -241,6 +243,7 @@ RegexResult Matcher::match(Vector const& views, Optiona input.match_index = match_count; state.string_position = view_index; + state.string_position_in_code_units = view_index; state.instruction_position = 0; auto success = execute(input, state, output); @@ -388,7 +391,7 @@ private: Node* previous { nullptr }; }; - UniformBumpAllocator m_allocator; + UniformBumpAllocator m_allocator; Node* m_first { nullptr }; Node* m_last { nullptr }; };