From 58b0eed6a7a9b9396cad1a5a3096d34a10a66795 Mon Sep 17 00:00:00 2001
From: Idan Horowitz
Date: Mon, 31 Jan 2022 18:20:52 +0200
Subject: [PATCH] LibUnicode: Implement grapheme segmentation

---
 .../Libraries/LibUnicode/CharacterTypes.cpp | 102 +++++++++++++++++++
 .../Libraries/LibUnicode/CharacterTypes.h | 2 +
 2 files changed, 104 insertions(+)

diff --git a/Userland/Libraries/LibUnicode/CharacterTypes.cpp b/Userland/Libraries/LibUnicode/CharacterTypes.cpp
index ace627530e..c16b188e88 100644
--- a/Userland/Libraries/LibUnicode/CharacterTypes.cpp
+++ b/Userland/Libraries/LibUnicode/CharacterTypes.cpp
@@ -8,6 +8,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -357,4 +358,105 @@ bool __attribute__((weak)) code_point_has_grapheme_break_property(u32, GraphemeB
 bool __attribute__((weak)) code_point_has_word_break_property(u32, WordBreakProperty) { return {}; }
 bool __attribute__((weak)) code_point_has_sentence_break_property(u32, SentenceBreakProperty) { return {}; }
 
+// Finds the grapheme cluster boundaries of `view` using rules GB1-GB999 of UAX #29.
+// Boundaries are appended as code unit offsets: 0 (GB1), view.code_unit_offset_of(it) for each
+// code point in front of which the rules place a break, then view.length_in_code_units() (GB2).
+// Returns an empty Vector for an empty view, and `{}` when ENABLE_UNICODE_DATA is not set.
+Vector find_grapheme_segmentation_boundaries([[maybe_unused]] Utf16View const& view)
+{
+#if ENABLE_UNICODE_DATA
+    using GBP = GraphemeBreakProperty;
+    Vector boundaries;
+
+    // https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
+    auto const length_in_code_points = view.length_in_code_points();
+    if (length_in_code_points == 0)
+        return boundaries;
+
+    auto has_any_gbp = [](u32 code_point, auto&&... properties) {
+        return (code_point_has_grapheme_break_property(code_point, properties) || ...);
+    };
+
+    // GB1
+    boundaries.append(0);
+
+    if (length_in_code_points > 1) {
+        auto it = view.begin();
+        auto code_point = *it;
+        u32 next_code_point;
+        // Whether the run of Regional_Indicator code points ending at code_point has odd length.
+        auto ri_chain_is_odd = false;
+        // Whether the text ending at code_point matches "Extended_Pictographic Extend*".
+        auto in_emoji_sequence = false;
+
+        // Each iteration decides whether there is a boundary between code_point and next_code_point.
+        for (++it; it != view.end(); ++it, code_point = next_code_point) {
+            next_code_point = *it;
+
+            // Advance the GB11-GB13 look-behind state before testing any rule. Every rule below can
+            // `continue`, and a code point skipped here would leave stale state for later iterations.
+            // zwj_follows_emoji reads in_emoji_sequence while it still describes the text before code_point.
+            auto zwj_follows_emoji = in_emoji_sequence && has_any_gbp(code_point, GBP::ZWJ);
+            in_emoji_sequence = code_point_has_property(code_point, Property::Extended_Pictographic)
+                || (in_emoji_sequence && has_any_gbp(code_point, GBP::Extend));
+
+            auto code_point_is_ri = has_any_gbp(code_point, GBP::Regional_Indicator);
+            ri_chain_is_odd = code_point_is_ri && !ri_chain_is_odd;
+
+            auto code_point_is_cr = has_any_gbp(code_point, GBP::CR);
+            auto next_code_point_is_lf = has_any_gbp(next_code_point, GBP::LF);
+
+            // GB3
+            if (code_point_is_cr && next_code_point_is_lf)
+                continue;
+            // GB4, GB5
+            if (code_point_is_cr || next_code_point_is_lf || has_any_gbp(next_code_point, GBP::CR, GBP::Control) || has_any_gbp(code_point, GBP::LF, GBP::Control)) {
+                boundaries.append(view.code_unit_offset_of(it));
+                continue;
+            }
+
+            auto next_code_point_is_v = has_any_gbp(next_code_point, GBP::V);
+            auto next_code_point_is_t = has_any_gbp(next_code_point, GBP::T);
+
+            // GB6
+            if (has_any_gbp(code_point, GBP::L) && (next_code_point_is_v || has_any_gbp(next_code_point, GBP::L, GBP::LV, GBP::LVT)))
+                continue;
+            // GB7
+            if ((next_code_point_is_v || next_code_point_is_t) && has_any_gbp(code_point, GBP::LV, GBP::V))
+                continue;
+            // GB8
+            if (next_code_point_is_t && has_any_gbp(code_point, GBP::LVT, GBP::T))
+                continue;
+
+            // GB9
+            if (has_any_gbp(next_code_point, GBP::Extend, GBP::ZWJ))
+                continue;
+            // GB9a
+            if (has_any_gbp(next_code_point, GBP::SpacingMark))
+                continue;
+            // GB9b
+            if (has_any_gbp(code_point, GBP::Prepend))
+                continue;
+
+            // GB11
+            if (zwj_follows_emoji && code_point_has_property(next_code_point, Property::Extended_Pictographic))
+                continue;
+
+            // GB12, GB13 (an odd run ending at code_point means it is the first half of a pair)
+            if (ri_chain_is_odd && has_any_gbp(next_code_point, GBP::Regional_Indicator))
+                continue;
+
+            // GB999
+            boundaries.append(view.code_unit_offset_of(it));
+        }
+    }
+
+    // GB2
+    boundaries.append(view.length_in_code_units());
+    return boundaries;
+#else
+    return {};
+#endif
+}
+
 }
diff --git a/Userland/Libraries/LibUnicode/CharacterTypes.h b/Userland/Libraries/LibUnicode/CharacterTypes.h
index def59cd149..5a06697781 100644
--- a/Userland/Libraries/LibUnicode/CharacterTypes.h
+++ b/Userland/Libraries/LibUnicode/CharacterTypes.h
@@ -44,4 +44,6 @@ bool code_point_has_grapheme_break_property(u32 code_point, GraphemeBreakPropert
 bool code_point_has_word_break_property(u32 code_point, WordBreakProperty property);
 bool code_point_has_sentence_break_property(u32 code_point, SentenceBreakProperty property);
 
+Vector find_grapheme_segmentation_boundaries(Utf16View const&);
+
 }