1
Fork 0
mirror of https://github.com/RGBCube/serenity synced 2025-07-26 13:17:35 +00:00

LibUnicode: Allow iterating over text segmentation boundaries

This will be useful for e.g. finding the next boundary after a specific
index - we can just stop iterating once a condition is satisfied.
This commit is contained in:
Timothy Flynn 2023-02-14 12:03:35 -05:00 committed by Linus Groh
parent dd4c47456e
commit abe7786a81
2 changed files with 97 additions and 57 deletions

View file

@ -44,22 +44,22 @@ static size_t code_unit_offset_of(ViewType const& view, CodeUnitIterator const&
} }
template<typename ViewType> template<typename ViewType>
static Vector<size_t> find_grapheme_segmentation_boundaries_impl([[maybe_unused]] ViewType const& view) static void for_each_grapheme_segmentation_boundary_impl([[maybe_unused]] ViewType const& view, [[maybe_unused]] SegmentationCallback callback)
{ {
#if ENABLE_UNICODE_DATA #if ENABLE_UNICODE_DATA
using GBP = GraphemeBreakProperty; using GBP = GraphemeBreakProperty;
Vector<size_t> boundaries;
// https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules // https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
if (view.is_empty()) if (view.is_empty())
return boundaries; return;
auto has_any_gbp = [](u32 code_point, auto&&... properties) { auto has_any_gbp = [](u32 code_point, auto&&... properties) {
return (code_point_has_grapheme_break_property(code_point, properties) || ...); return (code_point_has_grapheme_break_property(code_point, properties) || ...);
}; };
// GB1 // GB1
boundaries.append(0); if (callback(0) == IterationDecision::Break)
return;
if (code_unit_length(view) > 1) { if (code_unit_length(view) > 1) {
auto it = view.begin(); auto it = view.begin();
@ -79,7 +79,8 @@ static Vector<size_t> find_grapheme_segmentation_boundaries_impl([[maybe_unused]
continue; continue;
// GB4, GB5 // GB4, GB5
if (code_point_is_cr || next_code_point_is_lf || has_any_gbp(next_code_point, GBP::CR, GBP::Control) || has_any_gbp(code_point, GBP::LF, GBP::Control)) { if (code_point_is_cr || next_code_point_is_lf || has_any_gbp(next_code_point, GBP::CR, GBP::Control) || has_any_gbp(code_point, GBP::LF, GBP::Control)) {
boundaries.append(code_unit_offset_of(view, it)); if (callback(code_unit_offset_of(view, it)) == IterationDecision::Break)
return;
continue; continue;
} }
@ -124,50 +125,48 @@ static Vector<size_t> find_grapheme_segmentation_boundaries_impl([[maybe_unused]
continue; continue;
// GB999 // GB999
boundaries.append(code_unit_offset_of(view, it)); if (callback(code_unit_offset_of(view, it)) == IterationDecision::Break)
return;
} }
} }
// GB2 // GB2
boundaries.append(code_unit_length(view)); callback(code_unit_length(view));
return boundaries;
#else
return {};
#endif #endif
} }
Vector<size_t> find_grapheme_segmentation_boundaries(Utf8View const& view) void for_each_grapheme_segmentation_boundary(Utf8View const& view, SegmentationCallback callback)
{ {
return find_grapheme_segmentation_boundaries_impl(view); for_each_grapheme_segmentation_boundary_impl(view, move(callback));
} }
Vector<size_t> find_grapheme_segmentation_boundaries(Utf16View const& view) void for_each_grapheme_segmentation_boundary(Utf16View const& view, SegmentationCallback callback)
{ {
return find_grapheme_segmentation_boundaries_impl(view); for_each_grapheme_segmentation_boundary_impl(view, move(callback));
} }
Vector<size_t> find_grapheme_segmentation_boundaries(Utf32View const& view) void for_each_grapheme_segmentation_boundary(Utf32View const& view, SegmentationCallback callback)
{ {
return find_grapheme_segmentation_boundaries_impl(view); for_each_grapheme_segmentation_boundary_impl(view, move(callback));
} }
template<typename ViewType> template<typename ViewType>
static Vector<size_t> find_word_segmentation_boundaries_impl([[maybe_unused]] ViewType const& view) static void for_each_word_segmentation_boundary_impl([[maybe_unused]] ViewType const& view, [[maybe_unused]] SegmentationCallback callback)
{ {
#if ENABLE_UNICODE_DATA #if ENABLE_UNICODE_DATA
using WBP = WordBreakProperty; using WBP = WordBreakProperty;
Vector<size_t> boundaries;
// https://www.unicode.org/reports/tr29/#Word_Boundary_Rules // https://www.unicode.org/reports/tr29/#Word_Boundary_Rules
if (view.is_empty()) if (view.is_empty())
return boundaries; return;
auto has_any_wbp = [](u32 code_point, auto&&... properties) { auto has_any_wbp = [](u32 code_point, auto&&... properties) {
return (code_point_has_word_break_property(code_point, properties) || ...); return (code_point_has_word_break_property(code_point, properties) || ...);
}; };
// WB1 // WB1
boundaries.append(0); if (callback(0) == IterationDecision::Break)
return;
if (code_unit_length(view) > 1) { if (code_unit_length(view) > 1) {
auto it = view.begin(); auto it = view.begin();
@ -187,7 +186,8 @@ static Vector<size_t> find_word_segmentation_boundaries_impl([[maybe_unused]] Vi
continue; continue;
// WB3a, WB3b // WB3a, WB3b
if (code_point_is_cr || next_code_point_is_lf || has_any_wbp(next_code_point, WBP::CR, WBP::Newline) || has_any_wbp(code_point, WBP::LF, WBP::Newline)) { if (code_point_is_cr || next_code_point_is_lf || has_any_wbp(next_code_point, WBP::CR, WBP::Newline) || has_any_wbp(code_point, WBP::LF, WBP::Newline)) {
boundaries.append(code_unit_offset_of(view, it)); if (callback(code_unit_offset_of(view, it)) == IterationDecision::Break)
return;
continue; continue;
} }
// WB3c // WB3c
@ -292,50 +292,48 @@ static Vector<size_t> find_word_segmentation_boundaries_impl([[maybe_unused]] Vi
continue; continue;
// WB999 // WB999
boundaries.append(code_unit_offset_of(view, it)); if (callback(code_unit_offset_of(view, it)) == IterationDecision::Break)
return;
} }
} }
// WB2 // WB2
boundaries.append(code_unit_length(view)); callback(code_unit_length(view));
return boundaries;
#else
return {};
#endif #endif
} }
Vector<size_t> find_word_segmentation_boundaries(Utf8View const& view) void for_each_word_segmentation_boundary(Utf8View const& view, SegmentationCallback callback)
{ {
return find_word_segmentation_boundaries_impl(view); for_each_word_segmentation_boundary_impl(view, move(callback));
} }
Vector<size_t> find_word_segmentation_boundaries(Utf16View const& view) void for_each_word_segmentation_boundary(Utf16View const& view, SegmentationCallback callback)
{ {
return find_word_segmentation_boundaries_impl(view); for_each_word_segmentation_boundary_impl(view, move(callback));
} }
Vector<size_t> find_word_segmentation_boundaries(Utf32View const& view) void for_each_word_segmentation_boundary(Utf32View const& view, SegmentationCallback callback)
{ {
return find_word_segmentation_boundaries_impl(view); for_each_word_segmentation_boundary_impl(view, move(callback));
} }
template<typename ViewType> template<typename ViewType>
static Vector<size_t> find_sentence_segmentation_boundaries_impl([[maybe_unused]] ViewType const& view) static void for_each_sentence_segmentation_boundary_impl([[maybe_unused]] ViewType const& view, [[maybe_unused]] SegmentationCallback callback)
{ {
#if ENABLE_UNICODE_DATA #if ENABLE_UNICODE_DATA
using SBP = SentenceBreakProperty; using SBP = SentenceBreakProperty;
Vector<size_t> boundaries;
// https://www.unicode.org/reports/tr29/#Sentence_Boundary_Rules // https://www.unicode.org/reports/tr29/#Sentence_Boundary_Rules
if (view.is_empty()) if (view.is_empty())
return boundaries; return;
auto has_any_sbp = [](u32 code_point, auto&&... properties) { auto has_any_sbp = [](u32 code_point, auto&&... properties) {
return (code_point_has_sentence_break_property(code_point, properties) || ...); return (code_point_has_sentence_break_property(code_point, properties) || ...);
}; };
// SB1 // SB1
boundaries.append(0); if (callback(0) == IterationDecision::Break)
return;
if (code_unit_length(view) > 1) { if (code_unit_length(view) > 1) {
auto it = view.begin(); auto it = view.begin();
@ -364,7 +362,8 @@ static Vector<size_t> find_sentence_segmentation_boundaries_impl([[maybe_unused]
// SB4 // SB4
if (code_point_is_para_sep) { if (code_point_is_para_sep) {
boundaries.append(code_unit_offset_of(view, it)); if (callback(code_unit_offset_of(view, it)) == IterationDecision::Break)
return;
continue; continue;
} }
@ -422,33 +421,31 @@ static Vector<size_t> find_sentence_segmentation_boundaries_impl([[maybe_unused]
// SB11 // SB11
if (terminator_sequence_state >= TerminatorSequenceState::Term) if (terminator_sequence_state >= TerminatorSequenceState::Term)
boundaries.append(code_unit_offset_of(view, it)); if (callback(code_unit_offset_of(view, it)) == IterationDecision::Break)
return;
// SB998 // SB998
} }
} }
// SB2 // SB2
boundaries.append(code_unit_length(view)); callback(code_unit_length(view));
return boundaries;
#else
return {};
#endif #endif
} }
Vector<size_t> find_sentence_segmentation_boundaries(Utf8View const& view) void for_each_sentence_segmentation_boundary(Utf8View const& view, SegmentationCallback callback)
{ {
return find_sentence_segmentation_boundaries_impl(view); for_each_sentence_segmentation_boundary_impl(view, move(callback));
} }
Vector<size_t> find_sentence_segmentation_boundaries(Utf16View const& view) void for_each_sentence_segmentation_boundary(Utf16View const& view, SegmentationCallback callback)
{ {
return find_sentence_segmentation_boundaries_impl(view); for_each_sentence_segmentation_boundary_impl(view, move(callback));
} }
Vector<size_t> find_sentence_segmentation_boundaries(Utf32View const& view) void for_each_sentence_segmentation_boundary(Utf32View const& view, SegmentationCallback callback)
{ {
return find_sentence_segmentation_boundaries_impl(view); for_each_sentence_segmentation_boundary_impl(view, move(callback));
} }
} }

View file

@ -8,21 +8,64 @@
#pragma once #pragma once
#include <AK/Forward.h> #include <AK/Forward.h>
#include <AK/Function.h>
#include <AK/IterationDecision.h>
#include <AK/Types.h> #include <AK/Types.h>
#include <AK/Vector.h> #include <AK/Vector.h>
namespace Unicode { namespace Unicode {
Vector<size_t> find_grapheme_segmentation_boundaries(Utf8View const&); using SegmentationCallback = Function<IterationDecision(size_t)>;
Vector<size_t> find_grapheme_segmentation_boundaries(Utf16View const&);
Vector<size_t> find_grapheme_segmentation_boundaries(Utf32View const&);
Vector<size_t> find_word_segmentation_boundaries(Utf8View const&); void for_each_grapheme_segmentation_boundary(Utf8View const&, SegmentationCallback);
Vector<size_t> find_word_segmentation_boundaries(Utf16View const&); void for_each_grapheme_segmentation_boundary(Utf16View const&, SegmentationCallback);
Vector<size_t> find_word_segmentation_boundaries(Utf32View const&); void for_each_grapheme_segmentation_boundary(Utf32View const&, SegmentationCallback);
Vector<size_t> find_sentence_segmentation_boundaries(Utf8View const&); template<typename ViewType>
Vector<size_t> find_sentence_segmentation_boundaries(Utf16View const&); Vector<size_t> find_grapheme_segmentation_boundaries(ViewType const& view)
Vector<size_t> find_sentence_segmentation_boundaries(Utf32View const&); {
Vector<size_t> boundaries;
for_each_grapheme_segmentation_boundary(view, [&](auto boundary) {
boundaries.append(boundary);
return IterationDecision::Continue;
});
return boundaries;
}
void for_each_word_segmentation_boundary(Utf8View const&, SegmentationCallback);
void for_each_word_segmentation_boundary(Utf16View const&, SegmentationCallback);
void for_each_word_segmentation_boundary(Utf32View const&, SegmentationCallback);
template<typename ViewType>
Vector<size_t> find_word_segmentation_boundaries(ViewType const& view)
{
Vector<size_t> boundaries;
for_each_word_segmentation_boundary(view, [&](auto boundary) {
boundaries.append(boundary);
return IterationDecision::Continue;
});
return boundaries;
}
void for_each_sentence_segmentation_boundary(Utf8View const&, SegmentationCallback);
void for_each_sentence_segmentation_boundary(Utf16View const&, SegmentationCallback);
void for_each_sentence_segmentation_boundary(Utf32View const&, SegmentationCallback);
template<typename ViewType>
Vector<size_t> find_sentence_segmentation_boundaries(ViewType const& view)
{
Vector<size_t> boundaries;
for_each_sentence_segmentation_boundary(view, [&](auto boundary) {
boundaries.append(boundary);
return IterationDecision::Continue;
});
return boundaries;
}
} }