From 6d710eeb431d4fc729e4692ac8db4270183cd039 Mon Sep 17 00:00:00 2001
From: Timothy Flynn <trflynn89@pm.me>
Date: Mon, 16 Jan 2023 09:39:12 -0500
Subject: [PATCH] LibUnicode: Add an overload of word segmentation for UTF-8
 strings

---
 .../Libraries/LibUnicode/CharacterTypes.cpp   | 44 ++++++++++++++++---
 .../Libraries/LibUnicode/CharacterTypes.h     |  2 +
 2 files changed, 40 insertions(+), 6 deletions(-)
diff --git a/Userland/Libraries/LibUnicode/CharacterTypes.cpp b/Userland/Libraries/LibUnicode/CharacterTypes.cpp
index fba41ee30b..4b569ff6f4 100644
--- a/Userland/Libraries/LibUnicode/CharacterTypes.cpp
+++ b/Userland/Libraries/LibUnicode/CharacterTypes.cpp
@@ -227,24 +227,46 @@ Vector<size_t> find_grapheme_segmentation_boundaries([[maybe_unused]] Utf16View
 #endif
 }
 
-Vector<size_t> find_word_segmentation_boundaries([[maybe_unused]] Utf16View const& view)
+template<typename ViewType>
+static Vector<size_t> find_word_segmentation_boundaries_impl([[maybe_unused]] ViewType const& view)
 {
 #if ENABLE_UNICODE_DATA
     using WBP = WordBreakProperty;
     Vector<size_t> boundaries;
 
     // https://www.unicode.org/reports/tr29/#Word_Boundary_Rules
-    if (view.length_in_code_points() == 0)
+    if (view.is_empty())
         return boundaries;
 
     auto has_any_wbp = [](u32 code_point, auto&&... properties) {
         return (code_point_has_word_break_property(code_point, properties) || ...);
     };
 
+    size_t code_unit_length = 0; // Appended as the last boundary (WB2) at the end of this function.
+    size_t code_point_length = 0; // Must exceed 1 for the rule scan below to run.
+
+    if constexpr (requires { view.byte_length(); }) { // Chosen at compile time when the view type has byte_length().
+        code_unit_length = view.byte_length();
+        code_point_length = view.length();
+    } else if constexpr (requires { view.length_in_code_units(); }) { // Otherwise, view types with length_in_code_units().
+        code_unit_length = view.length_in_code_units();
+        code_point_length = view.length_in_code_points();
+    } else {
+        static_assert(DependentFalse<ViewType>); // Neither accessor exists: meant to reject this ViewType at compile time.
+    }
+
+    auto code_unit_offset_of = [&](auto it) { // Maps an iterator into `view` to the offset recorded as a boundary.
+        if constexpr (requires { view.byte_offset_of(it); })
+            return view.byte_offset_of(it);
+        else if constexpr (requires { view.code_unit_offset_of(it); })
+            return view.code_unit_offset_of(it);
+        VERIFY_NOT_REACHED(); // Comes after both constexpr branches; either one, when taken, returns first.
+    };
+
     // WB1
     boundaries.append(0);
 
-    if (view.length_in_code_points() > 1) {
+    if (code_point_length > 1) {
         auto it = view.begin();
         auto code_point = *it;
         u32 next_code_point;
@@ -262,7 +284,7 @@ Vector<size_t> find_word_segmentation_boundaries([[maybe_unused]] Utf16View cons
                 continue;
             // WB3a, WB3b
             if (code_point_is_cr || next_code_point_is_lf || has_any_wbp(next_code_point, WBP::CR, WBP::Newline) || has_any_wbp(code_point, WBP::LF, WBP::Newline)) {
-                boundaries.append(view.code_unit_offset_of(it));
+                boundaries.append(code_unit_offset_of(it));
                 continue;
             }
             // WB3c
@@ -367,18 +389,28 @@ Vector<size_t> find_word_segmentation_boundaries([[maybe_unused]] Utf16View cons
                 continue;
 
             // WB999
-            boundaries.append(view.code_unit_offset_of(it));
+            boundaries.append(code_unit_offset_of(it));
         }
     }
 
     // WB2
-    boundaries.append(view.length_in_code_units());
+    boundaries.append(code_unit_length);
     return boundaries;
 #else
     return {};
 #endif
 }
 
+Vector<size_t> find_word_segmentation_boundaries(Utf8View const& view)
+{
+    return find_word_segmentation_boundaries_impl<Utf8View>(view); // Shared implementation, instantiated for Utf8View.
+}
+
+Vector<size_t> find_word_segmentation_boundaries(Utf16View const& view)
+{
+    return find_word_segmentation_boundaries_impl<Utf16View>(view); // Shared implementation, instantiated for Utf16View.
+}
+
 Vector<size_t> find_sentence_segmentation_boundaries([[maybe_unused]] Utf16View const& view)
 {
 #if ENABLE_UNICODE_DATA
diff --git a/Userland/Libraries/LibUnicode/CharacterTypes.h b/Userland/Libraries/LibUnicode/CharacterTypes.h
index 788d567b91..553a1b7b49 100644
--- a/Userland/Libraries/LibUnicode/CharacterTypes.h
+++ b/Userland/Libraries/LibUnicode/CharacterTypes.h
@@ -11,6 +11,7 @@
 #include <AK/Optional.h>
 #include <AK/Span.h>
 #include <AK/Types.h>
+#include <AK/Vector.h>
 #include <LibUnicode/Forward.h>
 
 namespace Unicode {
@@ -60,6 +61,7 @@ bool code_point_has_word_break_property(u32 code_point, WordBreakProperty proper
 bool code_point_has_sentence_break_property(u32 code_point, SentenceBreakProperty property);
 
 Vector<size_t> find_grapheme_segmentation_boundaries(Utf16View const&);
+Vector<size_t> find_word_segmentation_boundaries(Utf8View const&);
 Vector<size_t> find_word_segmentation_boundaries(Utf16View const&);
 Vector<size_t> find_sentence_segmentation_boundaries(Utf16View const&);