From 6d710eeb431d4fc729e4692ac8db4270183cd039 Mon Sep 17 00:00:00 2001 From: Timothy Flynn Date: Mon, 16 Jan 2023 09:39:12 -0500 Subject: [PATCH] LibUnicode: Add an overload of word segmentation for UTF-8 strings --- .../Libraries/LibUnicode/CharacterTypes.cpp | 44 ++++++++++++++++--- .../Libraries/LibUnicode/CharacterTypes.h | 2 + 2 files changed, 40 insertions(+), 6 deletions(-) diff --git a/Userland/Libraries/LibUnicode/CharacterTypes.cpp b/Userland/Libraries/LibUnicode/CharacterTypes.cpp index fba41ee30b..4b569ff6f4 100644 --- a/Userland/Libraries/LibUnicode/CharacterTypes.cpp +++ b/Userland/Libraries/LibUnicode/CharacterTypes.cpp @@ -227,24 +227,46 @@ Vector find_grapheme_segmentation_boundaries([[maybe_unused]] Utf16View #endif } -Vector find_word_segmentation_boundaries([[maybe_unused]] Utf16View const& view) +template +static Vector find_word_segmentation_boundaries_impl([[maybe_unused]] ViewType const& view) { #if ENABLE_UNICODE_DATA using WBP = WordBreakProperty; Vector boundaries; // https://www.unicode.org/reports/tr29/#Word_Boundary_Rules - if (view.length_in_code_points() == 0) + if (view.is_empty()) return boundaries; auto has_any_wbp = [](u32 code_point, auto&&... properties) { return (code_point_has_word_break_property(code_point, properties) || ...); }; + size_t code_unit_length = 0; + size_t code_point_length = 0; + + if constexpr (requires { view.byte_length(); }) { + code_unit_length = view.byte_length(); + code_point_length = view.length(); + } else if constexpr (requires { view.length_in_code_units(); }) { + code_unit_length = view.length_in_code_units(); + code_point_length = view.length_in_code_points(); + } else { + static_assert(DependentFalse); + } + + auto code_unit_offset_of = [&](auto it) { + if constexpr (requires { view.byte_offset_of(it); }) + return view.byte_offset_of(it); + else if constexpr (requires { view.code_unit_offset_of(it); }) + return view.code_unit_offset_of(it); + VERIFY_NOT_REACHED(); + }; + // WB1 boundaries.append(0); - if (view.length_in_code_points() > 1) { + if (code_point_length > 1) { auto it = view.begin(); auto code_point = *it; u32 next_code_point; @@ -262,7 +284,7 @@ Vector find_word_segmentation_boundaries([[maybe_unused]] Utf16View cons continue; // WB3a, WB3b if (code_point_is_cr || next_code_point_is_lf || has_any_wbp(next_code_point, WBP::CR, WBP::Newline) || has_any_wbp(code_point, WBP::LF, WBP::Newline)) { - boundaries.append(view.code_unit_offset_of(it)); + boundaries.append(code_unit_offset_of(it)); continue; } // WB3c @@ -367,18 +389,28 @@ Vector find_word_segmentation_boundaries([[maybe_unused]] Utf16View cons continue; // WB999 - boundaries.append(view.code_unit_offset_of(it)); + boundaries.append(code_unit_offset_of(it)); } } // WB2 - boundaries.append(view.length_in_code_units()); + boundaries.append(code_unit_length); return boundaries; #else return {}; #endif } +Vector find_word_segmentation_boundaries(Utf8View const& view) +{ + return find_word_segmentation_boundaries_impl(view); +} + +Vector find_word_segmentation_boundaries(Utf16View const& view) +{ + return find_word_segmentation_boundaries_impl(view); +} + Vector find_sentence_segmentation_boundaries([[maybe_unused]] Utf16View const& view) { #if ENABLE_UNICODE_DATA diff --git a/Userland/Libraries/LibUnicode/CharacterTypes.h b/Userland/Libraries/LibUnicode/CharacterTypes.h index 788d567b91..553a1b7b49 100644 --- a/Userland/Libraries/LibUnicode/CharacterTypes.h +++ b/Userland/Libraries/LibUnicode/CharacterTypes.h @@ -11,6 +11,7 @@ #include #include #include +#include #include namespace Unicode { @@ -60,6 +61,7 @@ bool code_point_has_word_break_property(u32 code_point, WordBreakProperty proper bool code_point_has_sentence_break_property(u32 code_point, SentenceBreakProperty property); Vector find_grapheme_segmentation_boundaries(Utf16View const&); +Vector find_word_segmentation_boundaries(Utf8View const&); Vector find_word_segmentation_boundaries(Utf16View const&); Vector find_sentence_segmentation_boundaries(Utf16View const&);