LibGUI: Use Unicode's text segmentation for traversing word breaks

Note that for traversing words with ctrl+(left or right arrow) on the keyboard, we want to "skip" some boundaries. Consider the text: The ("quick") fox can't jump 32.3 feet, right? Using "|" to denote word break boundaries, we will have: |The| |(|"|quick|"|)| |fox| |can't| |jump| |32.3| |feet|,| |right|?| When starting at "The" and using ctrl+right to move rightward in this text, it is unlikely that users will want to break at every single one of those boundaries. Most text editors, for example, will skip from the end of "The" to the end of "quick", not breaking at the parentheses or opening quotation marks. We try to mimic such desired behavior here. It likely isn't perfect, but we can improve upon it as we find edge cases.
2025-07-26 05:17:34 +00:00 · 2023-02-14 14:10:08 -05:00 · 2023-02-14 14:10:08 -05:00 · 4ab7216827
commit 4ab7216827
parent 5cbf054651
1 changed files with 46 additions and 16 deletions
--- a/Userland/Libraries/LibGUI/TextDocument.cpp
+++ b/Userland/Libraries/LibGUI/TextDocument.cpp
@ -16,6 +16,7 @@
 #include <LibGUI/TextDocument.h>
 #include <LibRegex/Regex.h>
 #include <LibUnicode/CharacterTypes.h>
+#include <LibUnicode/Segmentation.h>

 namespace GUI {

@ -683,6 +684,26 @@ Optional<TextDocumentSpan> TextDocument::first_non_skippable_span_after(TextPosi
    return {};
 }

+static bool should_continue_beyond_word(Utf32View const& view)
+{
+    static auto punctuation = Unicode::general_category_from_string("Punctuation"sv);
+    static auto separator = Unicode::general_category_from_string("Separator"sv);
+
+    if (!punctuation.has_value() || !separator.has_value())
+        return false;
+
+    auto has_any_gc = [&](auto code_point, auto&&... categories) {
+        return (Unicode::code_point_has_general_category(code_point, *categories) || ...);
+    };
+
+    for (auto code_point : view) {
+        if (!has_any_gc(code_point, punctuation, separator))
+            return false;
+    }
+
+    return true;
+}
+
 TextPosition TextDocument::first_word_break_before(TextPosition const& position, bool start_at_column_before) const
 {
    if (position.column() == 0) {
@ -694,20 +715,26 @@ TextPosition TextDocument::first_word_break_before(TextPosition const& position,
    }

    auto target = position;
-    auto line = this->line(target.line());
+    auto const& line = this->line(target.line());
+
    auto modifier = start_at_column_before ? 1 : 0;
    if (target.column() == line.length())
        modifier = 1;

-    while (target.column() > 0 && is_ascii_blank(line.code_points()[target.column() - modifier]))
-        target.set_column(target.column() - 1);
-    auto is_start_alphanumeric = is_ascii_alphanumeric(line.code_points()[target.column() - modifier]);
+    target.set_column(target.column() - modifier);

-    while (target.column() > 0) {
-        auto prev_code_point = line.code_points()[target.column() - 1];
-        if ((is_start_alphanumeric && !is_ascii_alphanumeric(prev_code_point)) || (!is_start_alphanumeric && is_ascii_alphanumeric(prev_code_point)))
+    while (target.column() < line.length()) {
+        if (auto index = Unicode::previous_word_segmentation_boundary(line.view(), target.column()); index.has_value()) {
+            auto view_between_target_and_index = line.view().substring_view(*index, target.column() - *index);
+
+            if (should_continue_beyond_word(view_between_target_and_index)) {
+                target.set_column(*index - 1);
+                continue;
+            }
+
+            target.set_column(*index);
            break;
-        target.set_column(target.column() - 1);
+        }
    }

    return target;
@ -716,7 +743,7 @@ TextPosition TextDocument::first_word_break_before(TextPosition const& position,
 TextPosition TextDocument::first_word_break_after(TextPosition const& position) const
 {
    auto target = position;
-    auto line = this->line(target.line());
+    auto const& line = this->line(target.line());

    if (position.column() >= line.length()) {
        if (position.line() >= this->line_count() - 1) {
@ -725,15 +752,18 @@ TextPosition TextDocument::first_word_break_after(TextPosition const& position)
        return TextPosition(position.line() + 1, 0);
    }

-    while (target.column() < line.length() && is_ascii_space(line.code_points()[target.column()]))
-        target.set_column(target.column() + 1);
-    auto is_start_alphanumeric = is_ascii_alphanumeric(line.code_points()[target.column()]);
-
    while (target.column() < line.length()) {
-        auto next_code_point = line.code_points()[target.column()];
-        if ((is_start_alphanumeric && !is_ascii_alphanumeric(next_code_point)) || (!is_start_alphanumeric && is_ascii_alphanumeric(next_code_point)))
+        if (auto index = Unicode::next_word_segmentation_boundary(line.view(), target.column()); index.has_value()) {
+            auto view_between_target_and_index = line.view().substring_view(target.column(), *index - target.column());
+
+            if (should_continue_beyond_word(view_between_target_and_index)) {
+                target.set_column(*index + 1);
+                continue;
+            }
+
+            target.set_column(*index);
            break;
-        target.set_column(target.column() + 1);
+        }
    }

    return target;