mirror of
https://github.com/RGBCube/serenity
synced 2025-07-25 18:37:35 +00:00
LibRegex: Track string position in both code units and code points
In non-Unicode mode, the existing MatchState::string_position is tracked in code units; in Unicode mode, it is tracked in code points. In order for some RegexStringView operations to be performant, it is useful for the MatchState to have a field to always track the position in code units. This will allow RegexStringView methods (e.g. operator[]) to perform lookups based on code unit offsets, rather than needing to iterate over the entire string to find a code point offset.
This commit is contained in:
parent
dae7674ca9
commit
27d555bab0
3 changed files with 95 additions and 41 deletions
|
@ -88,6 +88,36 @@ static char const* character_class_name(CharClass ch_class)
|
|||
}
|
||||
}
|
||||
|
||||
// Moves the match cursor in `state` forward by one position.
//
// `state.string_position` always grows by exactly one. The code unit offset
// (`state.string_position_in_code_units`) grows by one for non-Unicode views;
// for Unicode views it grows by `view.length_of_code_point()` of the code
// point being stepped over.
//
// `code_point` may be supplied by a caller that has already read it. When it
// is omitted, it is read from `view` at the current code unit offset. If it is
// omitted and that offset is not below `view.length_in_code_units()`, the code
// unit offset is left unchanged.
static void advance_string_position(MatchState& state, RegexStringView const& view, Optional<u32> code_point = {})
{
    ++state.string_position;

    // Non-Unicode views: both counters move in lockstep.
    if (!view.unicode()) {
        ++state.string_position_in_code_units;
        return;
    }

    if (!code_point.has_value()) {
        auto const code_unit_offset = state.string_position_in_code_units;

        // Nothing left to read, so there is no code point to measure.
        if (code_unit_offset >= view.length_in_code_units())
            return;

        code_point = view[code_unit_offset];
    }

    state.string_position_in_code_units += view.length_of_code_point(*code_point);
}
|
||||
|
||||
// Records the current position of `state` so that it can be restored later.
// `state.string_position` is appended to `input.saved_positions` and
// `state.string_position_in_code_units` to `input.saved_code_unit_positions`.
static void save_string_position(MatchInput const& input, MatchState const& state)
{
    auto const position = state.string_position;
    auto const position_in_code_units = state.string_position_in_code_units;

    input.saved_positions.append(position);
    input.saved_code_unit_positions.append(position_in_code_units);
}
|
||||
|
||||
// Pops the most recently saved position pair from `input` back into `state`.
//
// Returns false, leaving `state` untouched, when `input.saved_positions` is
// empty; returns true after both counters have been restored.
//
// NOTE(review): only `saved_positions` is checked for emptiness; this assumes
// `saved_code_unit_positions` always holds the same number of entries.
static bool restore_string_position(MatchInput const& input, MatchState& state)
{
    auto& position_stack = input.saved_positions;
    auto& code_unit_position_stack = input.saved_code_unit_positions;

    if (!position_stack.is_empty()) {
        state.string_position = position_stack.take_last();
        state.string_position_in_code_units = code_unit_position_stack.take_last();
        return true;
    }

    return false;
}
|
||||
|
||||
OwnPtr<OpCode> ByteCode::s_opcodes[(size_t)OpCodeId::Last + 1];
|
||||
bool ByteCode::s_opcodes_initialized { false };
|
||||
|
||||
|
@ -188,16 +218,14 @@ ALWAYS_INLINE ExecutionResult OpCode_Exit::execute(MatchInput const& input, Matc
|
|||
|
||||
ALWAYS_INLINE ExecutionResult OpCode_Save::execute(MatchInput const& input, MatchState& state, MatchOutput&) const
|
||||
{
|
||||
input.saved_positions.append(state.string_position);
|
||||
save_string_position(input, state);
|
||||
return ExecutionResult::Continue;
|
||||
}
|
||||
|
||||
ALWAYS_INLINE ExecutionResult OpCode_Restore::execute(MatchInput const& input, MatchState& state, MatchOutput&) const
|
||||
{
|
||||
if (input.saved_positions.is_empty())
|
||||
if (!restore_string_position(input, state))
|
||||
return ExecutionResult::Failed;
|
||||
|
||||
state.string_position = input.saved_positions.take_last();
|
||||
return ExecutionResult::Continue;
|
||||
}
|
||||
|
||||
|
@ -254,7 +282,7 @@ ALWAYS_INLINE ExecutionResult OpCode_CheckBoundary::execute(MatchInput const& in
|
|||
auto isword = [](auto ch) { return is_ascii_alphanumeric(ch) || ch == '_'; };
|
||||
auto is_word_boundary = [&] {
|
||||
if (state.string_position == input.view.length()) {
|
||||
if (state.string_position > 0 && isword(input.view[state.string_position - 1]))
|
||||
if (state.string_position > 0 && isword(input.view[state.string_position_in_code_units - 1]))
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
@ -266,7 +294,7 @@ ALWAYS_INLINE ExecutionResult OpCode_CheckBoundary::execute(MatchInput const& in
|
|||
return false;
|
||||
}
|
||||
|
||||
return !!(isword(input.view[state.string_position]) ^ isword(input.view[state.string_position - 1]));
|
||||
return !!(isword(input.view[state.string_position_in_code_units]) ^ isword(input.view[state.string_position_in_code_units - 1]));
|
||||
};
|
||||
switch (type()) {
|
||||
case BoundaryCheckType::Word: {
|
||||
|
@ -455,7 +483,7 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
|
|||
return ExecutionResult::Failed_ExecuteLowPrioForks;
|
||||
|
||||
VERIFY(!current_inversion_state());
|
||||
++state.string_position;
|
||||
advance_string_position(state, input.view);
|
||||
|
||||
} else if (compare_type == CharacterCompareType::String) {
|
||||
VERIFY(!current_inversion_state());
|
||||
|
@ -484,7 +512,7 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
|
|||
return ExecutionResult::Failed_ExecuteLowPrioForks;
|
||||
|
||||
auto character_class = (CharClass)m_bytecode->at(offset++);
|
||||
auto ch = input.view[state.string_position];
|
||||
auto ch = input.view[state.string_position_in_code_units];
|
||||
|
||||
compare_character_class(input, state, character_class, ch, current_inversion_state(), inverse_matched);
|
||||
|
||||
|
@ -496,7 +524,7 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
|
|||
|
||||
auto from = value.from;
|
||||
auto to = value.to;
|
||||
auto ch = input.view[state.string_position];
|
||||
auto ch = input.view[state.string_position_in_code_units];
|
||||
|
||||
compare_character_range(input, state, from, to, ch, current_inversion_state(), inverse_matched);
|
||||
|
||||
|
@ -549,7 +577,7 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
|
|||
}
|
||||
|
||||
if (current_inversion_state() && !inverse_matched)
|
||||
++state.string_position;
|
||||
advance_string_position(state, input.view);
|
||||
|
||||
if ((!had_zero_length_match && string_position == state.string_position) || state.string_position > input.view.length())
|
||||
return ExecutionResult::Failed_ExecuteLowPrioForks;
|
||||
|
@ -576,7 +604,7 @@ ALWAYS_INLINE void OpCode_Compare::compare_char(MatchInput const& input, MatchSt
|
|||
if (inverse)
|
||||
inverse_matched = true;
|
||||
else
|
||||
++state.string_position;
|
||||
advance_string_position(state, input.view, ch1);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -616,19 +644,19 @@ ALWAYS_INLINE void OpCode_Compare::compare_character_class(MatchInput const& inp
|
|||
if (inverse)
|
||||
inverse_matched = true;
|
||||
else
|
||||
++state.string_position;
|
||||
advance_string_position(state, input.view, ch);
|
||||
}
|
||||
break;
|
||||
case CharClass::Alpha:
|
||||
if (is_ascii_alpha(ch))
|
||||
++state.string_position;
|
||||
advance_string_position(state, input.view, ch);
|
||||
break;
|
||||
case CharClass::Blank:
|
||||
if (is_ascii_blank(ch)) {
|
||||
if (inverse)
|
||||
inverse_matched = true;
|
||||
else
|
||||
++state.string_position;
|
||||
advance_string_position(state, input.view, ch);
|
||||
}
|
||||
break;
|
||||
case CharClass::Cntrl:
|
||||
|
@ -636,7 +664,7 @@ ALWAYS_INLINE void OpCode_Compare::compare_character_class(MatchInput const& inp
|
|||
if (inverse)
|
||||
inverse_matched = true;
|
||||
else
|
||||
++state.string_position;
|
||||
advance_string_position(state, input.view, ch);
|
||||
}
|
||||
break;
|
||||
case CharClass::Digit:
|
||||
|
@ -644,7 +672,7 @@ ALWAYS_INLINE void OpCode_Compare::compare_character_class(MatchInput const& inp
|
|||
if (inverse)
|
||||
inverse_matched = true;
|
||||
else
|
||||
++state.string_position;
|
||||
advance_string_position(state, input.view, ch);
|
||||
}
|
||||
break;
|
||||
case CharClass::Graph:
|
||||
|
@ -652,7 +680,7 @@ ALWAYS_INLINE void OpCode_Compare::compare_character_class(MatchInput const& inp
|
|||
if (inverse)
|
||||
inverse_matched = true;
|
||||
else
|
||||
++state.string_position;
|
||||
advance_string_position(state, input.view, ch);
|
||||
}
|
||||
break;
|
||||
case CharClass::Lower:
|
||||
|
@ -660,7 +688,7 @@ ALWAYS_INLINE void OpCode_Compare::compare_character_class(MatchInput const& inp
|
|||
if (inverse)
|
||||
inverse_matched = true;
|
||||
else
|
||||
++state.string_position;
|
||||
advance_string_position(state, input.view, ch);
|
||||
}
|
||||
break;
|
||||
case CharClass::Print:
|
||||
|
@ -668,7 +696,7 @@ ALWAYS_INLINE void OpCode_Compare::compare_character_class(MatchInput const& inp
|
|||
if (inverse)
|
||||
inverse_matched = true;
|
||||
else
|
||||
++state.string_position;
|
||||
advance_string_position(state, input.view, ch);
|
||||
}
|
||||
break;
|
||||
case CharClass::Punct:
|
||||
|
@ -676,7 +704,7 @@ ALWAYS_INLINE void OpCode_Compare::compare_character_class(MatchInput const& inp
|
|||
if (inverse)
|
||||
inverse_matched = true;
|
||||
else
|
||||
++state.string_position;
|
||||
advance_string_position(state, input.view, ch);
|
||||
}
|
||||
break;
|
||||
case CharClass::Space:
|
||||
|
@ -684,7 +712,7 @@ ALWAYS_INLINE void OpCode_Compare::compare_character_class(MatchInput const& inp
|
|||
if (inverse)
|
||||
inverse_matched = true;
|
||||
else
|
||||
++state.string_position;
|
||||
advance_string_position(state, input.view, ch);
|
||||
}
|
||||
break;
|
||||
case CharClass::Upper:
|
||||
|
@ -692,7 +720,7 @@ ALWAYS_INLINE void OpCode_Compare::compare_character_class(MatchInput const& inp
|
|||
if (inverse)
|
||||
inverse_matched = true;
|
||||
else
|
||||
++state.string_position;
|
||||
advance_string_position(state, input.view, ch);
|
||||
}
|
||||
break;
|
||||
case CharClass::Word:
|
||||
|
@ -700,7 +728,7 @@ ALWAYS_INLINE void OpCode_Compare::compare_character_class(MatchInput const& inp
|
|||
if (inverse)
|
||||
inverse_matched = true;
|
||||
else
|
||||
++state.string_position;
|
||||
advance_string_position(state, input.view, ch);
|
||||
}
|
||||
break;
|
||||
case CharClass::Xdigit:
|
||||
|
@ -708,7 +736,7 @@ ALWAYS_INLINE void OpCode_Compare::compare_character_class(MatchInput const& inp
|
|||
if (inverse)
|
||||
inverse_matched = true;
|
||||
else
|
||||
++state.string_position;
|
||||
advance_string_position(state, input.view, ch);
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
@ -726,7 +754,7 @@ ALWAYS_INLINE void OpCode_Compare::compare_character_range(MatchInput const& inp
|
|||
if (inverse)
|
||||
inverse_matched = true;
|
||||
else
|
||||
++state.string_position;
|
||||
advance_string_position(state, input.view, ch);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -735,14 +763,14 @@ ALWAYS_INLINE void OpCode_Compare::compare_property(MatchInput const& input, Mat
|
|||
if (state.string_position == input.view.length())
|
||||
return;
|
||||
|
||||
u32 code_point = input.view[state.string_position];
|
||||
u32 code_point = input.view[state.string_position_in_code_units];
|
||||
bool equal = Unicode::code_point_has_property(code_point, property);
|
||||
|
||||
if (equal) {
|
||||
if (inverse)
|
||||
inverse_matched = true;
|
||||
else
|
||||
++state.string_position;
|
||||
advance_string_position(state, input.view, code_point);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -751,14 +779,14 @@ ALWAYS_INLINE void OpCode_Compare::compare_general_category(MatchInput const& in
|
|||
if (state.string_position == input.view.length())
|
||||
return;
|
||||
|
||||
u32 code_point = input.view[state.string_position];
|
||||
u32 code_point = input.view[state.string_position_in_code_units];
|
||||
bool equal = Unicode::code_point_has_general_category(code_point, general_category);
|
||||
|
||||
if (equal) {
|
||||
if (inverse)
|
||||
inverse_matched = true;
|
||||
else
|
||||
++state.string_position;
|
||||
advance_string_position(state, input.view, code_point);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -95,12 +95,37 @@ public:
|
|||
[](auto const& view) { return view.length(); });
|
||||
}
|
||||
|
||||
return length_in_code_units();
|
||||
}
|
||||
|
||||
// Returns the size of the underlying view: `length_in_code_units()` for a
// Utf16View, `byte_length()` for a Utf8View, and `length()` for any other
// view type.
size_t length_in_code_units() const
{
    return m_view.visit(
        [](Utf16View const& utf16_view) { return utf16_view.length_in_code_units(); },
        [](Utf8View const& utf8_view) { return utf8_view.byte_length(); },
        [](auto const& other_view) { return other_view.length(); });
}
|
||||
|
||||
// Returns how many units `code_point` is counted as for the underlying view:
//   - Utf32View: always 1.
//   - Utf16View: 1 below 0x10000, otherwise 2.
//   - any other view type: 1 up to 0x7f, 2 up to 0x07ff, 3 up to 0xffff,
//     otherwise 4 (the UTF-8 encoded-length thresholds).
size_t length_of_code_point(u32 code_point) const
{
    return m_view.visit(
        [](Utf32View const&) { return 1; },
        [&](Utf16View const&) { return code_point < 0x10000 ? 1 : 2; },
        [&](auto const&) {
            // Checked from the largest range down; same result as ascending checks.
            if (code_point > 0xffff)
                return 4;
            if (code_point > 0x07ff)
                return 3;
            if (code_point > 0x7f)
                return 2;
            return 1;
        });
}
|
||||
|
||||
RegexStringView typed_null_view()
|
||||
{
|
||||
auto view = m_view.visit(
|
||||
|
@ -230,6 +255,7 @@ public:
|
|||
});
|
||||
}
|
||||
|
||||
// Note: index must always be the code unit offset to return.
|
||||
u32 operator[](size_t index) const
|
||||
{
|
||||
return m_view.visit(
|
||||
|
@ -239,17 +265,12 @@ public:
|
|||
return 256u + ch;
|
||||
return ch;
|
||||
},
|
||||
[&](Utf32View& view) -> u32 { return view[index]; },
|
||||
[&](Utf16View& view) -> u32 { return view.code_point_at(index); },
|
||||
[&](auto& view) -> u32 {
|
||||
// FIXME: Iterating to the code point is inefficient, particularly for very large
|
||||
// strings. Implement something like code_point_at to Utf8View.
|
||||
size_t i = index;
|
||||
for (auto it = view.begin(); it != view.end(); ++it, --i) {
|
||||
if (i == 0)
|
||||
[&](Utf32View const& view) -> u32 { return view[index]; },
|
||||
[&](Utf16View const& view) -> u32 { return view.code_point_at(index); },
|
||||
[&](Utf8View const& view) -> u32 {
|
||||
auto it = view.iterator_at_byte_offset(index);
|
||||
VERIFY(it != view.end());
|
||||
return *it;
|
||||
}
|
||||
VERIFY_NOT_REACHED();
|
||||
});
|
||||
}
|
||||
|
||||
|
@ -462,11 +483,13 @@ struct MatchInput {
|
|||
|
||||
mutable size_t fail_counter { 0 };
|
||||
mutable Vector<size_t> saved_positions;
|
||||
mutable Vector<size_t> saved_code_unit_positions;
|
||||
};
|
||||
|
||||
struct MatchState {
|
||||
size_t string_position_before_match { 0 };
|
||||
size_t string_position { 0 };
|
||||
size_t string_position_in_code_units { 0 };
|
||||
size_t instruction_position { 0 };
|
||||
size_t fork_at_position { 0 };
|
||||
Vector<Match> matches;
|
||||
|
|
|
@ -198,6 +198,7 @@ RegexResult Matcher<Parser>::match(Vector<RegexStringView> const& views, Optiona
|
|||
auto view_length = view.length();
|
||||
size_t view_index = m_pattern->start_offset;
|
||||
state.string_position = view_index;
|
||||
state.string_position_in_code_units = view_index;
|
||||
bool succeeded = false;
|
||||
|
||||
if (view_index == view_length && m_pattern->parser_result.match_length_minimum == 0) {
|
||||
|
@ -210,6 +211,7 @@ RegexResult Matcher<Parser>::match(Vector<RegexStringView> const& views, Optiona
|
|||
input.match_index = match_count;
|
||||
|
||||
state.string_position = view_index;
|
||||
state.string_position_in_code_units = view_index;
|
||||
state.instruction_position = 0;
|
||||
|
||||
auto success = execute(input, state, temp_output);
|
||||
|
@ -241,6 +243,7 @@ RegexResult Matcher<Parser>::match(Vector<RegexStringView> const& views, Optiona
|
|||
input.match_index = match_count;
|
||||
|
||||
state.string_position = view_index;
|
||||
state.string_position_in_code_units = view_index;
|
||||
state.instruction_position = 0;
|
||||
|
||||
auto success = execute(input, state, output);
|
||||
|
@ -388,7 +391,7 @@ private:
|
|||
Node* previous { nullptr };
|
||||
};
|
||||
|
||||
UniformBumpAllocator<Node, true> m_allocator;
|
||||
UniformBumpAllocator<Node, true, 8 * MiB> m_allocator;
|
||||
Node* m_first { nullptr };
|
||||
Node* m_last { nullptr };
|
||||
};
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue