LibRegex: Track string position in both code units and code points

In non-Unicode mode, the existing MatchState::string_position is tracked in code units; in Unicode mode, it is tracked in code points. In order for some RegexStringView operations to be performant, it is useful for the MatchState to have a field to always track the position in code units. This will allow RegexStringView methods (e.g. operator[]) to perform lookups based on code unit offsets, rather than needing to iterate over the entire string to find a code point offset.
2025-07-27 10:07:44 +00:00 · 2021-08-02 15:06:43 -04:00 · 2021-08-02 15:06:43 -04:00 · 27d555bab0
commit 27d555bab0
parent dae7674ca9
3 changed files with 95 additions and 41 deletions
--- a/Userland/Libraries/LibRegex/RegexMatch.h
+++ b/Userland/Libraries/LibRegex/RegexMatch.h
@ -95,12 +95,37 @@ public:
                [](auto const& view) { return view.length(); });
        }

+        return length_in_code_units();
+    }
+
+    size_t length_in_code_units() const // Length of the held view, measured in that view's code units.
+    {
        return m_view.visit(
            [](Utf16View const& view) { return view.length_in_code_units(); }, // Utf16View: its own code unit count.
            [](Utf8View const& view) { return view.byte_length(); }, // Utf8View: length in bytes.
            [](auto const& view) { return view.length(); }); // Any other view type: whatever length() reports.
    }

+    size_t length_of_code_point(u32 code_point) const // How many code units `code_point` occupies, chosen by the type of view held in m_view.
+    {
+        return m_view.visit(
+            [](Utf32View const&) { return 1; }, // Utf32View: always one unit per code point.
+            [&](Utf16View const&) {
+                if (code_point < 0x10000) // Below 0x10000: a single UTF-16 code unit.
+                    return 1;
+                return 2; // 0x10000 and above: two UTF-16 code units.
+            },
+            [&](auto const&) { // Catch-all for every remaining view type; sizes follow the UTF-8 byte-length ranges.
+                // NOTE(review): this branch also handles any non-UTF-8 view types stored in m_view - confirm UTF-8 sizing is intended for them.
+                if (code_point <= 0x7f)
+                    return 1;
+                else if (code_point <= 0x07ff)
+                    return 2;
+                else if (code_point <= 0xffff)
+                    return 3;
+                return 4; // Everything above 0xffff is reported as four units.
+            });
+    }
+
    RegexStringView typed_null_view()
    {
        auto view = m_view.visit(
@ -230,6 +255,7 @@ public:
            });
    }

+    // Note: index must always be the offset, in code units, of the code point to return.
    u32 operator[](size_t index) const
    {
        return m_view.visit(
@ -239,17 +265,12 @@ public:
                    return 256u + ch;
                return ch;
            },
-            [&](Utf32View& view) -> u32 { return view[index]; },
-            [&](Utf16View& view) -> u32 { return view.code_point_at(index); },
-            [&](auto& view) -> u32 {
-                // FIXME: Iterating to the code point is inefficient, particularly for very large
-                // strings. Implement something like code_point_at to Utf8View.
-                size_t i = index;
-                for (auto it = view.begin(); it != view.end(); ++it, --i) {
-                    if (i == 0)
-                        return *it;
-                }
-                VERIFY_NOT_REACHED();
+            [&](Utf32View const& view) -> u32 { return view[index]; },
+            [&](Utf16View const& view) -> u32 { return view.code_point_at(index); },
+            [&](Utf8View const& view) -> u32 {
+                auto it = view.iterator_at_byte_offset(index);
+                VERIFY(it != view.end());
+                return *it;
            });
    }

@ -462,11 +483,13 @@ struct MatchInput {

    mutable size_t fail_counter { 0 };
    mutable Vector<size_t> saved_positions;
+    mutable Vector<size_t> saved_code_unit_positions;
 };

 struct MatchState {
    size_t string_position_before_match { 0 };
    size_t string_position { 0 };
+    size_t string_position_in_code_units { 0 };
    size_t instruction_position { 0 };
    size_t fork_at_position { 0 };
    Vector<Match> matches;