From abe7786a81b93724e39698f1430729742678c324 Mon Sep 17 00:00:00 2001 From: Timothy Flynn Date: Tue, 14 Feb 2023 12:03:35 -0500 Subject: [PATCH] LibUnicode: Allow iterating over text segmentation boundaries This will be useful for e.g. finding the next boundary after a specific index - we can just stop iterating once a condition is satisfied. --- .../Libraries/LibUnicode/Segmentation.cpp | 93 +++++++++---------- Userland/Libraries/LibUnicode/Segmentation.h | 61 ++++++++++-- 2 files changed, 97 insertions(+), 57 deletions(-) diff --git a/Userland/Libraries/LibUnicode/Segmentation.cpp b/Userland/Libraries/LibUnicode/Segmentation.cpp index 2b330653fd..f843f1f5a0 100644 --- a/Userland/Libraries/LibUnicode/Segmentation.cpp +++ b/Userland/Libraries/LibUnicode/Segmentation.cpp @@ -44,22 +44,22 @@ static size_t code_unit_offset_of(ViewType const& view, CodeUnitIterator const& } template -static Vector find_grapheme_segmentation_boundaries_impl([[maybe_unused]] ViewType const& view) +static void for_each_grapheme_segmentation_boundary_impl([[maybe_unused]] ViewType const& view, [[maybe_unused]] SegmentationCallback callback) { #if ENABLE_UNICODE_DATA using GBP = GraphemeBreakProperty; - Vector boundaries; // https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules if (view.is_empty()) - return boundaries; + return; auto has_any_gbp = [](u32 code_point, auto&&... properties) { return (code_point_has_grapheme_break_property(code_point, properties) || ...); }; // GB1 - boundaries.append(0); + if (callback(0) == IterationDecision::Break) + return; if (code_unit_length(view) > 1) { auto it = view.begin(); @@ -79,7 +79,8 @@ static Vector find_grapheme_segmentation_boundaries_impl([[maybe_unused] continue; // GB4, GB5 if (code_point_is_cr || next_code_point_is_lf || has_any_gbp(next_code_point, GBP::CR, GBP::Control) || has_any_gbp(code_point, GBP::LF, GBP::Control)) { - boundaries.append(code_unit_offset_of(view, it)); + if (callback(code_unit_offset_of(view, it)) == IterationDecision::Break) + return; continue; } @@ -124,50 +125,48 @@ static Vector find_grapheme_segmentation_boundaries_impl([[maybe_unused] continue; // GB999 - boundaries.append(code_unit_offset_of(view, it)); + if (callback(code_unit_offset_of(view, it)) == IterationDecision::Break) + return; } } // GB2 - boundaries.append(code_unit_length(view)); - return boundaries; -#else - return {}; + callback(code_unit_length(view)); #endif } -Vector find_grapheme_segmentation_boundaries(Utf8View const& view) +void for_each_grapheme_segmentation_boundary(Utf8View const& view, SegmentationCallback callback) { - return find_grapheme_segmentation_boundaries_impl(view); + for_each_grapheme_segmentation_boundary_impl(view, move(callback)); } -Vector find_grapheme_segmentation_boundaries(Utf16View const& view) +void for_each_grapheme_segmentation_boundary(Utf16View const& view, SegmentationCallback callback) { - return find_grapheme_segmentation_boundaries_impl(view); + for_each_grapheme_segmentation_boundary_impl(view, move(callback)); } -Vector find_grapheme_segmentation_boundaries(Utf32View const& view) +void for_each_grapheme_segmentation_boundary(Utf32View const& view, SegmentationCallback callback) { - return find_grapheme_segmentation_boundaries_impl(view); + for_each_grapheme_segmentation_boundary_impl(view, move(callback)); } template -static Vector find_word_segmentation_boundaries_impl([[maybe_unused]] ViewType const& view) +static void for_each_word_segmentation_boundary_impl([[maybe_unused]] ViewType const& view, [[maybe_unused]] SegmentationCallback callback) { #if ENABLE_UNICODE_DATA using WBP = WordBreakProperty; - Vector boundaries; // https://www.unicode.org/reports/tr29/#Word_Boundary_Rules if (view.is_empty()) - return boundaries; + return; auto has_any_wbp = [](u32 code_point, auto&&... properties) { return (code_point_has_word_break_property(code_point, properties) || ...); }; // WB1 - boundaries.append(0); + if (callback(0) == IterationDecision::Break) + return; if (code_unit_length(view) > 1) { auto it = view.begin(); @@ -187,7 +186,8 @@ static Vector find_word_segmentation_boundaries_impl([[maybe_unused]] Vi continue; // WB3a, WB3b if (code_point_is_cr || next_code_point_is_lf || has_any_wbp(next_code_point, WBP::CR, WBP::Newline) || has_any_wbp(code_point, WBP::LF, WBP::Newline)) { - boundaries.append(code_unit_offset_of(view, it)); + if (callback(code_unit_offset_of(view, it)) == IterationDecision::Break) + return; continue; } // WB3c @@ -292,50 +292,48 @@ static Vector find_word_segmentation_boundaries_impl([[maybe_unused]] Vi continue; // WB999 - boundaries.append(code_unit_offset_of(view, it)); + if (callback(code_unit_offset_of(view, it)) == IterationDecision::Break) + return; } } // WB2 - boundaries.append(code_unit_length(view)); - return boundaries; -#else - return {}; + callback(code_unit_length(view)); #endif } -Vector find_word_segmentation_boundaries(Utf8View const& view) +void for_each_word_segmentation_boundary(Utf8View const& view, SegmentationCallback callback) { - return find_word_segmentation_boundaries_impl(view); + for_each_word_segmentation_boundary_impl(view, move(callback)); } -Vector find_word_segmentation_boundaries(Utf16View const& view) +void for_each_word_segmentation_boundary(Utf16View const& view, SegmentationCallback callback) { - return find_word_segmentation_boundaries_impl(view); + for_each_word_segmentation_boundary_impl(view, move(callback)); } -Vector find_word_segmentation_boundaries(Utf32View const& view) +void for_each_word_segmentation_boundary(Utf32View const& view, SegmentationCallback callback) { - return find_word_segmentation_boundaries_impl(view); + for_each_word_segmentation_boundary_impl(view, move(callback)); } template -static Vector find_sentence_segmentation_boundaries_impl([[maybe_unused]] ViewType const& view) +static void for_each_sentence_segmentation_boundary_impl([[maybe_unused]] ViewType const& view, [[maybe_unused]] SegmentationCallback callback) { #if ENABLE_UNICODE_DATA using SBP = SentenceBreakProperty; - Vector boundaries; // https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules if (view.is_empty()) - return boundaries; + return; auto has_any_sbp = [](u32 code_point, auto&&... properties) { return (code_point_has_sentence_break_property(code_point, properties) || ...); }; // SB1 - boundaries.append(0); + if (callback(0) == IterationDecision::Break) + return; if (code_unit_length(view) > 1) { auto it = view.begin(); @@ -364,7 +362,8 @@ static Vector find_sentence_segmentation_boundaries_impl([[maybe_unused] // SB4 if (code_point_is_para_sep) { - boundaries.append(code_unit_offset_of(view, it)); + if (callback(code_unit_offset_of(view, it)) == IterationDecision::Break) + return; continue; } @@ -422,33 +421,31 @@ static Vector find_sentence_segmentation_boundaries_impl([[maybe_unused] // SB11 if (terminator_sequence_state >= TerminatorSequenceState::Term) - boundaries.append(code_unit_offset_of(view, it)); + if (callback(code_unit_offset_of(view, it)) == IterationDecision::Break) + return; // SB998 } } // SB2 - boundaries.append(code_unit_length(view)); - return boundaries; -#else - return {}; + callback(code_unit_length(view)); #endif } -Vector find_sentence_segmentation_boundaries(Utf8View const& view) +void for_each_sentence_segmentation_boundary(Utf8View const& view, SegmentationCallback callback) { - return find_sentence_segmentation_boundaries_impl(view); + for_each_sentence_segmentation_boundary_impl(view, move(callback)); } -Vector find_sentence_segmentation_boundaries(Utf16View const& view) +void for_each_sentence_segmentation_boundary(Utf16View const& view, SegmentationCallback callback) { - return find_sentence_segmentation_boundaries_impl(view); + for_each_sentence_segmentation_boundary_impl(view, move(callback)); } -Vector find_sentence_segmentation_boundaries(Utf32View const& view) +void for_each_sentence_segmentation_boundary(Utf32View const& view, SegmentationCallback callback) { - return find_sentence_segmentation_boundaries_impl(view); + for_each_sentence_segmentation_boundary_impl(view, move(callback)); } } diff --git a/Userland/Libraries/LibUnicode/Segmentation.h b/Userland/Libraries/LibUnicode/Segmentation.h index af28aaee68..6e0f24f184 100644 --- a/Userland/Libraries/LibUnicode/Segmentation.h +++ b/Userland/Libraries/LibUnicode/Segmentation.h @@ -8,21 +8,64 @@ #pragma once #include +#include +#include #include #include namespace Unicode { -Vector find_grapheme_segmentation_boundaries(Utf8View const&); -Vector find_grapheme_segmentation_boundaries(Utf16View const&); -Vector find_grapheme_segmentation_boundaries(Utf32View const&); +using SegmentationCallback = Function; -Vector find_word_segmentation_boundaries(Utf8View const&); -Vector find_word_segmentation_boundaries(Utf16View const&); -Vector find_word_segmentation_boundaries(Utf32View const&); +void for_each_grapheme_segmentation_boundary(Utf8View const&, SegmentationCallback); +void for_each_grapheme_segmentation_boundary(Utf16View const&, SegmentationCallback); +void for_each_grapheme_segmentation_boundary(Utf32View const&, SegmentationCallback); -Vector find_sentence_segmentation_boundaries(Utf8View const&); -Vector find_sentence_segmentation_boundaries(Utf16View const&); -Vector find_sentence_segmentation_boundaries(Utf32View const&); +template +Vector find_grapheme_segmentation_boundaries(ViewType const& view) +{ + Vector boundaries; + + for_each_grapheme_segmentation_boundary(view, [&](auto boundary) { + boundaries.append(boundary); + return IterationDecision::Continue; + }); + + return boundaries; +} + +void for_each_word_segmentation_boundary(Utf8View const&, SegmentationCallback); +void for_each_word_segmentation_boundary(Utf16View const&, SegmentationCallback); +void for_each_word_segmentation_boundary(Utf32View const&, SegmentationCallback); + +template +Vector find_word_segmentation_boundaries(ViewType const& view) +{ + Vector boundaries; + + for_each_word_segmentation_boundary(view, [&](auto boundary) { + boundaries.append(boundary); + return IterationDecision::Continue; + }); + + return boundaries; +} + +void for_each_sentence_segmentation_boundary(Utf8View const&, SegmentationCallback); +void for_each_sentence_segmentation_boundary(Utf16View const&, SegmentationCallback); +void for_each_sentence_segmentation_boundary(Utf32View const&, SegmentationCallback); + +template +Vector find_sentence_segmentation_boundaries(ViewType const& view) +{ + Vector boundaries; + + for_each_sentence_segmentation_boundary(view, [&](auto boundary) { + boundaries.append(boundary); + return IterationDecision::Continue; + }); + + return boundaries; +} }