From 2d487e4e4c0f678b3d1af76deb3a9c40c7426c3e Mon Sep 17 00:00:00 2001 From: Timothy Flynn Date: Tue, 14 Feb 2023 11:08:25 -0500 Subject: [PATCH] LibUnicode+LibJS: Move text segmentation algorithms to their own files These algorithms are quite chonky, and more APIs around them are to be added, so let's move them to their own files for a bit of organization. --- .../LibJS/Runtime/Intl/Segmenter.cpp | 2 +- Userland/Libraries/LibUnicode/CMakeLists.txt | 1 + .../Libraries/LibUnicode/CharacterTypes.cpp | 395 +---------------- .../Libraries/LibUnicode/CharacterTypes.h | 7 +- .../Libraries/LibUnicode/Segmentation.cpp | 411 ++++++++++++++++++ Userland/Libraries/LibUnicode/Segmentation.h | 23 + .../Libraries/LibUnicode/UnicodeUtils.cpp | 1 + 7 files changed, 439 insertions(+), 401 deletions(-) create mode 100644 Userland/Libraries/LibUnicode/Segmentation.cpp create mode 100644 Userland/Libraries/LibUnicode/Segmentation.h diff --git a/Userland/Libraries/LibJS/Runtime/Intl/Segmenter.cpp b/Userland/Libraries/LibJS/Runtime/Intl/Segmenter.cpp index dc51db79c3..6b3a52ab1b 100644 --- a/Userland/Libraries/LibJS/Runtime/Intl/Segmenter.cpp +++ b/Userland/Libraries/LibJS/Runtime/Intl/Segmenter.cpp @@ -8,7 +8,7 @@ #include #include #include -#include +#include namespace JS::Intl { diff --git a/Userland/Libraries/LibUnicode/CMakeLists.txt b/Userland/Libraries/LibUnicode/CMakeLists.txt index 27f9201815..54c99ceebb 100644 --- a/Userland/Libraries/LibUnicode/CMakeLists.txt +++ b/Userland/Libraries/LibUnicode/CMakeLists.txt @@ -5,6 +5,7 @@ set(SOURCES CurrencyCode.cpp Emoji.cpp Normalize.cpp + Segmentation.cpp String.cpp UnicodeUtils.cpp ${UNICODE_DATA_SOURCES} diff --git a/Userland/Libraries/LibUnicode/CharacterTypes.cpp b/Userland/Libraries/LibUnicode/CharacterTypes.cpp index 0c489361ce..098f53d8b8 100644 --- a/Userland/Libraries/LibUnicode/CharacterTypes.cpp +++ b/Userland/Libraries/LibUnicode/CharacterTypes.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, Tim Flynn + * Copyright (c) 2021-2023, Tim Flynn * * SPDX-License-Identifier: BSD-2-Clause */ @@ -8,7 +8,6 @@ #include #include #include -#include #include #include #include @@ -151,396 +150,4 @@ bool __attribute__((weak)) code_point_has_grapheme_break_property(u32, GraphemeB bool __attribute__((weak)) code_point_has_word_break_property(u32, WordBreakProperty) { return {}; } bool __attribute__((weak)) code_point_has_sentence_break_property(u32, SentenceBreakProperty) { return {}; } -Vector find_grapheme_segmentation_boundaries([[maybe_unused]] Utf16View const& view) -{ -#if ENABLE_UNICODE_DATA - using GBP = GraphemeBreakProperty; - Vector boundaries; - - // https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules - if (view.length_in_code_points() == 0) - return boundaries; - - auto has_any_gbp = [](u32 code_point, auto&&... properties) { - return (code_point_has_grapheme_break_property(code_point, properties) || ...); - }; - - // GB1 - boundaries.append(0); - - if (view.length_in_code_points() > 1) { - auto it = view.begin(); - auto code_point = *it; - u32 next_code_point; - auto current_ri_chain = 0; - auto in_emoji_sequence = false; - - for (++it; it != view.end(); ++it, code_point = next_code_point) { - next_code_point = *it; - - auto code_point_is_cr = has_any_gbp(code_point, GBP::CR); - auto next_code_point_is_lf = has_any_gbp(next_code_point, GBP::LF); - - // GB3 - if (code_point_is_cr && next_code_point_is_lf) - continue; - // GB4, GB5 - if (code_point_is_cr || next_code_point_is_lf || has_any_gbp(next_code_point, GBP::CR, GBP::Control) || has_any_gbp(code_point, GBP::LF, GBP::Control)) { - boundaries.append(view.code_unit_offset_of(it)); - continue; - } - - auto next_code_point_is_v = has_any_gbp(next_code_point, GBP::V); - auto next_code_point_is_t = has_any_gbp(next_code_point, GBP::T); - - // GB6 - if (has_any_gbp(code_point, GBP::L) && (next_code_point_is_v || has_any_gbp(next_code_point, GBP::L, GBP::LV, GBP::LVT))) - continue; - // GB7 - if ((next_code_point_is_v || next_code_point_is_t) && has_any_gbp(code_point, GBP::LV, GBP::V)) - continue; - // GB8 - if (next_code_point_is_t && has_any_gbp(code_point, GBP::LVT, GBP::T)) - continue; - - auto code_point_is_zwj = has_any_gbp(code_point, GBP::ZWJ); - if (!in_emoji_sequence && code_point_has_property(code_point, Property::Extended_Pictographic)) - in_emoji_sequence = true; - else if (in_emoji_sequence && !has_any_gbp(code_point, GBP::Extend) && !code_point_is_zwj) - in_emoji_sequence = false; - - // GB9 - if (has_any_gbp(next_code_point, GBP::Extend, GBP::ZWJ)) - continue; - // GB9a - if (has_any_gbp(next_code_point, GBP::SpacingMark)) - continue; - // GB9b - if (has_any_gbp(code_point, GBP::Prepend)) - continue; - - // GB11 - if (in_emoji_sequence && code_point_is_zwj && code_point_has_property(next_code_point, Property::Extended_Pictographic)) - continue; - - auto code_point_is_ri = has_any_gbp(code_point, GBP::Regional_Indicator); - current_ri_chain = code_point_is_ri ? current_ri_chain + 1 : 0; - - // GB12, GB13 - if (code_point_is_ri && has_any_gbp(next_code_point, GBP::Regional_Indicator) && current_ri_chain % 2 == 1) - continue; - - // GB999 - boundaries.append(view.code_unit_offset_of(it)); - } - } - - // GB2 - boundaries.append(view.length_in_code_units()); - return boundaries; -#else - return {}; -#endif -} - -template -static Vector find_word_segmentation_boundaries_impl([[maybe_unused]] ViewType const& view) -{ -#if ENABLE_UNICODE_DATA - using WBP = WordBreakProperty; - Vector boundaries; - - // https://www.unicode.org/reports/tr29/#Word_Boundary_Rules - if (view.is_empty()) - return boundaries; - - auto has_any_wbp = [](u32 code_point, auto&&... properties) { - return (code_point_has_word_break_property(code_point, properties) || ...); - }; - - size_t code_unit_length = 0; - size_t code_point_length = 0; - - if constexpr (requires { view.byte_length(); }) { - code_unit_length = view.byte_length(); - code_point_length = view.length(); - } else if constexpr (requires { view.length_in_code_units(); }) { - code_unit_length = view.length_in_code_units(); - code_point_length = view.length_in_code_points(); - } else { - static_assert(DependentFalse); - } - - auto code_unit_offset_of = [&](auto it) { - if constexpr (requires { view.byte_offset_of(it); }) - return view.byte_offset_of(it); - else if constexpr (requires { view.code_unit_offset_of(it); }) - return view.code_unit_offset_of(it); - VERIFY_NOT_REACHED(); - }; - - // WB1 - boundaries.append(0); - - if (code_point_length > 1) { - auto it = view.begin(); - auto code_point = *it; - u32 next_code_point; - Optional previous_code_point; - auto current_ri_chain = 0; - - for (++it; it != view.end(); ++it, previous_code_point = code_point, code_point = next_code_point) { - next_code_point = *it; - - auto code_point_is_cr = has_any_wbp(code_point, WBP::CR); - auto next_code_point_is_lf = has_any_wbp(next_code_point, WBP::LF); - - // WB3 - if (code_point_is_cr && next_code_point_is_lf) - continue; - // WB3a, WB3b - if (code_point_is_cr || next_code_point_is_lf || has_any_wbp(next_code_point, WBP::CR, WBP::Newline) || has_any_wbp(code_point, WBP::LF, WBP::Newline)) { - boundaries.append(code_unit_offset_of(it)); - continue; - } - // WB3c - if (has_any_wbp(code_point, WBP::ZWJ) && code_point_has_property(next_code_point, Property::Extended_Pictographic)) - continue; - // WB3d - if (has_any_wbp(code_point, WBP::WSegSpace) && has_any_wbp(next_code_point, WBP::WSegSpace)) - continue; - - // WB4 - if (has_any_wbp(next_code_point, WBP::Format, WBP::Extend, WBP::ZWJ)) - continue; - - auto code_point_is_hebrew_letter = has_any_wbp(code_point, WBP::Hebrew_Letter); - auto code_point_is_ah_letter = code_point_is_hebrew_letter || has_any_wbp(code_point, WBP::ALetter); - auto next_code_point_is_hebrew_letter = has_any_wbp(next_code_point, WBP::Hebrew_Letter); - auto next_code_point_is_ah_letter = next_code_point_is_hebrew_letter || has_any_wbp(next_code_point, WBP::ALetter); - - // WB5 - if (code_point_is_ah_letter && next_code_point_is_ah_letter) - continue; - - Optional next_next_code_point; - if (it != view.end()) { - auto it_copy = it; - ++it_copy; - if (it_copy != view.end()) - next_next_code_point = *it; - } - bool next_next_code_point_is_hebrew_letter = next_next_code_point.has_value() && has_any_wbp(*next_next_code_point, WBP::Hebrew_Letter); - bool next_next_code_point_is_ah_letter = next_next_code_point_is_hebrew_letter || (next_next_code_point.has_value() && has_any_wbp(*next_next_code_point, WBP::ALetter)); - - auto next_code_point_is_mid_num_let_q = has_any_wbp(next_code_point, WBP::MidNumLet, WBP::Single_Quote); - - // WB6 - if (code_point_is_ah_letter && next_next_code_point_is_ah_letter && (next_code_point_is_mid_num_let_q || has_any_wbp(next_code_point, WBP::MidLetter))) - continue; - - auto code_point_is_mid_num_let_q = has_any_wbp(code_point, WBP::MidNumLet, WBP::Single_Quote); - auto previous_code_point_is_hebrew_letter = previous_code_point.has_value() && has_any_wbp(*previous_code_point, WBP::Hebrew_Letter); - auto previous_code_point_is_ah_letter = previous_code_point_is_hebrew_letter || (previous_code_point.has_value() && has_any_wbp(*previous_code_point, WBP::ALetter)); - - // WB7 - if (previous_code_point_is_ah_letter && next_code_point_is_ah_letter && (code_point_is_mid_num_let_q || has_any_wbp(code_point, WBP::MidLetter))) - continue; - // WB7a - if (code_point_is_hebrew_letter && has_any_wbp(next_code_point, WBP::Single_Quote)) - continue; - // WB7b - if (code_point_is_hebrew_letter && next_next_code_point_is_hebrew_letter && has_any_wbp(next_code_point, WBP::Double_Quote)) - continue; - // WB7c - if (previous_code_point_is_hebrew_letter && next_code_point_is_hebrew_letter && has_any_wbp(code_point, WBP::Double_Quote)) - continue; - - auto code_point_is_numeric = has_any_wbp(code_point, WBP::Numeric); - auto next_code_point_is_numeric = has_any_wbp(next_code_point, WBP::Numeric); - - // WB8 - if (code_point_is_numeric && next_code_point_is_numeric) - continue; - // WB9 - if (code_point_is_ah_letter && next_code_point_is_numeric) - continue; - // WB10 - if (code_point_is_numeric && next_code_point_is_ah_letter) - continue; - - auto previous_code_point_is_numeric = previous_code_point.has_value() && has_any_wbp(code_point, WBP::Numeric); - - // WB11 - if (previous_code_point_is_numeric && next_code_point_is_numeric && (code_point_is_mid_num_let_q || has_any_wbp(code_point, WBP::MidNum))) - continue; - - bool next_next_code_point_is_numeric = next_next_code_point.has_value() && has_any_wbp(*next_next_code_point, WBP::Numeric); - - // WB12 - if (code_point_is_numeric && next_next_code_point_is_numeric && (next_code_point_is_mid_num_let_q || has_any_wbp(next_code_point, WBP::MidNum))) - continue; - - auto code_point_is_katakana = has_any_wbp(code_point, WBP::Katakana); - auto next_code_point_is_katakana = has_any_wbp(next_code_point, WBP::Katakana); - - // WB13 - if (code_point_is_katakana && next_code_point_is_katakana) - continue; - - auto code_point_is_extend_num_let = has_any_wbp(code_point, WBP::ExtendNumLet); - - // WB13a - if ((code_point_is_ah_letter || code_point_is_numeric || code_point_is_katakana || code_point_is_extend_num_let) && has_any_wbp(next_code_point, WBP::ExtendNumLet)) - continue; - // WB13b - if (code_point_is_extend_num_let && (next_code_point_is_ah_letter || next_code_point_is_numeric || next_code_point_is_katakana)) - continue; - - auto code_point_is_ri = has_any_wbp(code_point, WBP::Regional_Indicator); - current_ri_chain = code_point_is_ri ? current_ri_chain + 1 : 0; - - // WB15, WB16 - if (code_point_is_ri && has_any_wbp(next_code_point, WBP::Regional_Indicator) && current_ri_chain % 2 == 1) - continue; - - // WB999 - boundaries.append(code_unit_offset_of(it)); - } - } - - // WB2 - boundaries.append(code_unit_length); - return boundaries; -#else - return {}; -#endif -} - -Vector find_word_segmentation_boundaries(Utf8View const& view) -{ - return find_word_segmentation_boundaries_impl(view); -} - -Vector find_word_segmentation_boundaries(Utf16View const& view) -{ - return find_word_segmentation_boundaries_impl(view); -} - -Vector find_sentence_segmentation_boundaries([[maybe_unused]] Utf16View const& view) -{ -#if ENABLE_UNICODE_DATA - using SBP = SentenceBreakProperty; - Vector boundaries; - - // https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules - if (view.length_in_code_points() == 0) - return boundaries; - - auto has_any_sbp = [](u32 code_point, auto&&... properties) { - return (code_point_has_sentence_break_property(code_point, properties) || ...); - }; - - // SB1 - boundaries.append(0); - - if (view.length_in_code_points() > 1) { - auto it = view.begin(); - auto code_point = *it; - u32 next_code_point; - Optional previous_code_point; - enum class TerminatorSequenceState { - None, - Term, - Close, - Sp - } terminator_sequence_state { TerminatorSequenceState::None }; - auto term_was_a_term = false; - - for (++it; it != view.end(); ++it, previous_code_point = code_point, code_point = next_code_point) { - next_code_point = *it; - - auto code_point_is_cr = has_any_sbp(code_point, SBP::CR); - auto next_code_point_is_lf = has_any_sbp(next_code_point, SBP::LF); - - // SB3 - if (code_point_is_cr && next_code_point_is_lf) - continue; - - auto code_point_is_para_sep = code_point_is_cr || has_any_sbp(code_point, SBP::LF, SBP::Sep); - - // SB4 - if (code_point_is_para_sep) { - boundaries.append(view.code_unit_offset_of(it)); - continue; - } - - // SB5 - if (has_any_sbp(next_code_point, SBP::Format, SBP::Extend)) - continue; - - auto code_point_is_a_term = has_any_sbp(code_point, SBP::ATerm); - - // SB6 - if (code_point_is_a_term && has_any_sbp(next_code_point, SBP::Numeric)) - continue; - // SB7 - if (code_point_is_a_term && previous_code_point.has_value() && has_any_sbp(*previous_code_point, SBP::Upper, SBP::Lower) && has_any_sbp(next_code_point, SBP::Upper)) - continue; - - if (code_point_is_a_term || has_any_sbp(code_point, SBP::STerm)) { - terminator_sequence_state = TerminatorSequenceState::Term; - term_was_a_term = code_point_is_a_term; - } else if (terminator_sequence_state >= TerminatorSequenceState::Term && terminator_sequence_state <= TerminatorSequenceState::Close && has_any_sbp(code_point, SBP::Close)) { - terminator_sequence_state = TerminatorSequenceState::Close; - } else if (terminator_sequence_state >= TerminatorSequenceState::Term && has_any_sbp(code_point, SBP::Sp)) { - terminator_sequence_state = TerminatorSequenceState::Sp; - } else { - terminator_sequence_state = TerminatorSequenceState::None; - } - - // SB8 - if (terminator_sequence_state >= TerminatorSequenceState::Term && term_was_a_term) { - auto it_copy = it; - bool illegal_sequence = false; - for (auto sequence_code_point = *it_copy; it_copy != view.end(); ++it_copy) { - if (has_any_sbp(sequence_code_point, SBP::Close, SBP::SContinue, SBP::Numeric, SBP::Sp, SBP::Format, SBP::Extend)) - continue; - illegal_sequence = has_any_sbp(sequence_code_point, SBP::Lower); - } - if (illegal_sequence) - continue; - } - - // SB8a - if (terminator_sequence_state >= TerminatorSequenceState::Term && (has_any_sbp(next_code_point, SBP::SContinue, SBP::STerm, SBP::ATerm))) - continue; - - auto next_code_point_is_sp = has_any_sbp(next_code_point, SBP::Sp); - auto next_code_point_is_para_sep = has_any_sbp(next_code_point, SBP::Sep, SBP::CR, SBP::LF); - - // SB9 - if (terminator_sequence_state >= TerminatorSequenceState::Term && terminator_sequence_state <= TerminatorSequenceState::Close && (next_code_point_is_sp || next_code_point_is_para_sep || has_any_sbp(next_code_point, SBP::Close))) - continue; - - // SB10 - if (terminator_sequence_state >= TerminatorSequenceState::Term && (next_code_point_is_sp || next_code_point_is_para_sep)) - continue; - - // SB11 - if (terminator_sequence_state >= TerminatorSequenceState::Term) - boundaries.append(view.code_unit_offset_of(it)); - - // SB998 - } - } - - // SB2 - boundaries.append(view.length_in_code_units()); - return boundaries; -#else - return {}; -#endif -} - } diff --git a/Userland/Libraries/LibUnicode/CharacterTypes.h b/Userland/Libraries/LibUnicode/CharacterTypes.h index 0f8b74b130..a56fb28865 100644 --- a/Userland/Libraries/LibUnicode/CharacterTypes.h +++ b/Userland/Libraries/LibUnicode/CharacterTypes.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, Tim Flynn + * Copyright (c) 2021-2023, Tim Flynn * * SPDX-License-Identifier: BSD-2-Clause */ @@ -64,9 +64,4 @@ bool code_point_has_grapheme_break_property(u32 code_point, GraphemeBreakPropert bool code_point_has_word_break_property(u32 code_point, WordBreakProperty property); bool code_point_has_sentence_break_property(u32 code_point, SentenceBreakProperty property); -Vector find_grapheme_segmentation_boundaries(Utf16View const&); -Vector find_word_segmentation_boundaries(Utf8View const&); -Vector find_word_segmentation_boundaries(Utf16View const&); -Vector find_sentence_segmentation_boundaries(Utf16View const&); - } diff --git a/Userland/Libraries/LibUnicode/Segmentation.cpp b/Userland/Libraries/LibUnicode/Segmentation.cpp new file mode 100644 index 0000000000..8ffe083025 --- /dev/null +++ b/Userland/Libraries/LibUnicode/Segmentation.cpp @@ -0,0 +1,411 @@ +/* + * Copyright (c) 2022, Idan Horowitz + * Copyright (c) 2023, Tim Flynn + * + * SPDX-License-Identifier: BSD-2-Clause + */ + +#include +#include +#include +#include + +#if ENABLE_UNICODE_DATA +# include +#endif + +namespace Unicode { + +Vector find_grapheme_segmentation_boundaries([[maybe_unused]] Utf16View const& view) +{ +#if ENABLE_UNICODE_DATA + using GBP = GraphemeBreakProperty; + Vector boundaries; + + // https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules + if (view.length_in_code_points() == 0) + return boundaries; + + auto has_any_gbp = [](u32 code_point, auto&&... properties) { + return (code_point_has_grapheme_break_property(code_point, properties) || ...); + }; + + // GB1 + boundaries.append(0); + + if (view.length_in_code_points() > 1) { + auto it = view.begin(); + auto code_point = *it; + u32 next_code_point; + auto current_ri_chain = 0; + auto in_emoji_sequence = false; + + for (++it; it != view.end(); ++it, code_point = next_code_point) { + next_code_point = *it; + + auto code_point_is_cr = has_any_gbp(code_point, GBP::CR); + auto next_code_point_is_lf = has_any_gbp(next_code_point, GBP::LF); + + // GB3 + if (code_point_is_cr && next_code_point_is_lf) + continue; + // GB4, GB5 + if (code_point_is_cr || next_code_point_is_lf || has_any_gbp(next_code_point, GBP::CR, GBP::Control) || has_any_gbp(code_point, GBP::LF, GBP::Control)) { + boundaries.append(view.code_unit_offset_of(it)); + continue; + } + + auto next_code_point_is_v = has_any_gbp(next_code_point, GBP::V); + auto next_code_point_is_t = has_any_gbp(next_code_point, GBP::T); + + // GB6 + if (has_any_gbp(code_point, GBP::L) && (next_code_point_is_v || has_any_gbp(next_code_point, GBP::L, GBP::LV, GBP::LVT))) + continue; + // GB7 + if ((next_code_point_is_v || next_code_point_is_t) && has_any_gbp(code_point, GBP::LV, GBP::V)) + continue; + // GB8 + if (next_code_point_is_t && has_any_gbp(code_point, GBP::LVT, GBP::T)) + continue; + + auto code_point_is_zwj = has_any_gbp(code_point, GBP::ZWJ); + if (!in_emoji_sequence && code_point_has_property(code_point, Property::Extended_Pictographic)) + in_emoji_sequence = true; + else if (in_emoji_sequence && !has_any_gbp(code_point, GBP::Extend) && !code_point_is_zwj) + in_emoji_sequence = false; + + // GB9 + if (has_any_gbp(next_code_point, GBP::Extend, GBP::ZWJ)) + continue; + // GB9a + if (has_any_gbp(next_code_point, GBP::SpacingMark)) + continue; + // GB9b + if (has_any_gbp(code_point, GBP::Prepend)) + continue; + + // GB11 + if (in_emoji_sequence && code_point_is_zwj && code_point_has_property(next_code_point, Property::Extended_Pictographic)) + continue; + + auto code_point_is_ri = has_any_gbp(code_point, GBP::Regional_Indicator); + current_ri_chain = code_point_is_ri ? current_ri_chain + 1 : 0; + + // GB12, GB13 + if (code_point_is_ri && has_any_gbp(next_code_point, GBP::Regional_Indicator) && current_ri_chain % 2 == 1) + continue; + + // GB999 + boundaries.append(view.code_unit_offset_of(it)); + } + } + + // GB2 + boundaries.append(view.length_in_code_units()); + return boundaries; +#else + return {}; +#endif +} + +template +static Vector find_word_segmentation_boundaries_impl([[maybe_unused]] ViewType const& view) +{ +#if ENABLE_UNICODE_DATA + using WBP = WordBreakProperty; + Vector boundaries; + + // https://www.unicode.org/reports/tr29/#Word_Boundary_Rules + if (view.is_empty()) + return boundaries; + + auto has_any_wbp = [](u32 code_point, auto&&... properties) { + return (code_point_has_word_break_property(code_point, properties) || ...); + }; + + size_t code_unit_length = 0; + size_t code_point_length = 0; + + if constexpr (requires { view.byte_length(); }) { + code_unit_length = view.byte_length(); + code_point_length = view.length(); + } else if constexpr (requires { view.length_in_code_units(); }) { + code_unit_length = view.length_in_code_units(); + code_point_length = view.length_in_code_points(); + } else { + static_assert(DependentFalse); + } + + auto code_unit_offset_of = [&](auto it) { + if constexpr (requires { view.byte_offset_of(it); }) + return view.byte_offset_of(it); + else if constexpr (requires { view.code_unit_offset_of(it); }) + return view.code_unit_offset_of(it); + VERIFY_NOT_REACHED(); + }; + + // WB1 + boundaries.append(0); + + if (code_point_length > 1) { + auto it = view.begin(); + auto code_point = *it; + u32 next_code_point; + Optional previous_code_point; + auto current_ri_chain = 0; + + for (++it; it != view.end(); ++it, previous_code_point = code_point, code_point = next_code_point) { + next_code_point = *it; + + auto code_point_is_cr = has_any_wbp(code_point, WBP::CR); + auto next_code_point_is_lf = has_any_wbp(next_code_point, WBP::LF); + + // WB3 + if (code_point_is_cr && next_code_point_is_lf) + continue; + // WB3a, WB3b + if (code_point_is_cr || next_code_point_is_lf || has_any_wbp(next_code_point, WBP::CR, WBP::Newline) || has_any_wbp(code_point, WBP::LF, WBP::Newline)) { + boundaries.append(code_unit_offset_of(it)); + continue; + } + // WB3c + if (has_any_wbp(code_point, WBP::ZWJ) && code_point_has_property(next_code_point, Property::Extended_Pictographic)) + continue; + // WB3d + if (has_any_wbp(code_point, WBP::WSegSpace) && has_any_wbp(next_code_point, WBP::WSegSpace)) + continue; + + // WB4 + if (has_any_wbp(next_code_point, WBP::Format, WBP::Extend, WBP::ZWJ)) + continue; + + auto code_point_is_hebrew_letter = has_any_wbp(code_point, WBP::Hebrew_Letter); + auto code_point_is_ah_letter = code_point_is_hebrew_letter || has_any_wbp(code_point, WBP::ALetter); + auto next_code_point_is_hebrew_letter = has_any_wbp(next_code_point, WBP::Hebrew_Letter); + auto next_code_point_is_ah_letter = next_code_point_is_hebrew_letter || has_any_wbp(next_code_point, WBP::ALetter); + + // WB5 + if (code_point_is_ah_letter && next_code_point_is_ah_letter) + continue; + + Optional next_next_code_point; + if (it != view.end()) { + auto it_copy = it; + ++it_copy; + if (it_copy != view.end()) + next_next_code_point = *it; + } + bool next_next_code_point_is_hebrew_letter = next_next_code_point.has_value() && has_any_wbp(*next_next_code_point, WBP::Hebrew_Letter); + bool next_next_code_point_is_ah_letter = next_next_code_point_is_hebrew_letter || (next_next_code_point.has_value() && has_any_wbp(*next_next_code_point, WBP::ALetter)); + + auto next_code_point_is_mid_num_let_q = has_any_wbp(next_code_point, WBP::MidNumLet, WBP::Single_Quote); + + // WB6 + if (code_point_is_ah_letter && next_next_code_point_is_ah_letter && (next_code_point_is_mid_num_let_q || has_any_wbp(next_code_point, WBP::MidLetter))) + continue; + + auto code_point_is_mid_num_let_q = has_any_wbp(code_point, WBP::MidNumLet, WBP::Single_Quote); + auto previous_code_point_is_hebrew_letter = previous_code_point.has_value() && has_any_wbp(*previous_code_point, WBP::Hebrew_Letter); + auto previous_code_point_is_ah_letter = previous_code_point_is_hebrew_letter || (previous_code_point.has_value() && has_any_wbp(*previous_code_point, WBP::ALetter)); + + // WB7 + if (previous_code_point_is_ah_letter && next_code_point_is_ah_letter && (code_point_is_mid_num_let_q || has_any_wbp(code_point, WBP::MidLetter))) + continue; + // WB7a + if (code_point_is_hebrew_letter && has_any_wbp(next_code_point, WBP::Single_Quote)) + continue; + // WB7b + if (code_point_is_hebrew_letter && next_next_code_point_is_hebrew_letter && has_any_wbp(next_code_point, WBP::Double_Quote)) + continue; + // WB7c + if (previous_code_point_is_hebrew_letter && next_code_point_is_hebrew_letter && has_any_wbp(code_point, WBP::Double_Quote)) + continue; + + auto code_point_is_numeric = has_any_wbp(code_point, WBP::Numeric); + auto next_code_point_is_numeric = has_any_wbp(next_code_point, WBP::Numeric); + + // WB8 + if (code_point_is_numeric && next_code_point_is_numeric) + continue; + // WB9 + if (code_point_is_ah_letter && next_code_point_is_numeric) + continue; + // WB10 + if (code_point_is_numeric && next_code_point_is_ah_letter) + continue; + + auto previous_code_point_is_numeric = previous_code_point.has_value() && has_any_wbp(code_point, WBP::Numeric); + + // WB11 + if (previous_code_point_is_numeric && next_code_point_is_numeric && (code_point_is_mid_num_let_q || has_any_wbp(code_point, WBP::MidNum))) + continue; + + bool next_next_code_point_is_numeric = next_next_code_point.has_value() && has_any_wbp(*next_next_code_point, WBP::Numeric); + + // WB12 + if (code_point_is_numeric && next_next_code_point_is_numeric && (next_code_point_is_mid_num_let_q || has_any_wbp(next_code_point, WBP::MidNum))) + continue; + + auto code_point_is_katakana = has_any_wbp(code_point, WBP::Katakana); + auto next_code_point_is_katakana = has_any_wbp(next_code_point, WBP::Katakana); + + // WB13 + if (code_point_is_katakana && next_code_point_is_katakana) + continue; + + auto code_point_is_extend_num_let = has_any_wbp(code_point, WBP::ExtendNumLet); + + // WB13a + if ((code_point_is_ah_letter || code_point_is_numeric || code_point_is_katakana || code_point_is_extend_num_let) && has_any_wbp(next_code_point, WBP::ExtendNumLet)) + continue; + // WB13b + if (code_point_is_extend_num_let && (next_code_point_is_ah_letter || next_code_point_is_numeric || next_code_point_is_katakana)) + continue; + + auto code_point_is_ri = has_any_wbp(code_point, WBP::Regional_Indicator); + current_ri_chain = code_point_is_ri ? current_ri_chain + 1 : 0; + + // WB15, WB16 + if (code_point_is_ri && has_any_wbp(next_code_point, WBP::Regional_Indicator) && current_ri_chain % 2 == 1) + continue; + + // WB999 + boundaries.append(code_unit_offset_of(it)); + } + } + + // WB2 + boundaries.append(code_unit_length); + return boundaries; +#else + return {}; +#endif +} + +Vector find_word_segmentation_boundaries(Utf8View const& view) +{ + return find_word_segmentation_boundaries_impl(view); +} + +Vector find_word_segmentation_boundaries(Utf16View const& view) +{ + return find_word_segmentation_boundaries_impl(view); +} + +Vector find_sentence_segmentation_boundaries([[maybe_unused]] Utf16View const& view) +{ +#if ENABLE_UNICODE_DATA + using SBP = SentenceBreakProperty; + Vector boundaries; + + // https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules + if (view.length_in_code_points() == 0) + return boundaries; + + auto has_any_sbp = [](u32 code_point, auto&&... properties) { + return (code_point_has_sentence_break_property(code_point, properties) || ...); + }; + + // SB1 + boundaries.append(0); + + if (view.length_in_code_points() > 1) { + auto it = view.begin(); + auto code_point = *it; + u32 next_code_point; + Optional previous_code_point; + enum class TerminatorSequenceState { + None, + Term, + Close, + Sp + } terminator_sequence_state { TerminatorSequenceState::None }; + auto term_was_a_term = false; + + for (++it; it != view.end(); ++it, previous_code_point = code_point, code_point = next_code_point) { + next_code_point = *it; + + auto code_point_is_cr = has_any_sbp(code_point, SBP::CR); + auto next_code_point_is_lf = has_any_sbp(next_code_point, SBP::LF); + + // SB3 + if (code_point_is_cr && next_code_point_is_lf) + continue; + + auto code_point_is_para_sep = code_point_is_cr || has_any_sbp(code_point, SBP::LF, SBP::Sep); + + // SB4 + if (code_point_is_para_sep) { + boundaries.append(view.code_unit_offset_of(it)); + continue; + } + + // SB5 + if (has_any_sbp(next_code_point, SBP::Format, SBP::Extend)) + continue; + + auto code_point_is_a_term = has_any_sbp(code_point, SBP::ATerm); + + // SB6 + if (code_point_is_a_term && has_any_sbp(next_code_point, SBP::Numeric)) + continue; + // SB7 + if (code_point_is_a_term && previous_code_point.has_value() && has_any_sbp(*previous_code_point, SBP::Upper, SBP::Lower) && has_any_sbp(next_code_point, SBP::Upper)) + continue; + + if (code_point_is_a_term || has_any_sbp(code_point, SBP::STerm)) { + terminator_sequence_state = TerminatorSequenceState::Term; + term_was_a_term = code_point_is_a_term; + } else if (terminator_sequence_state >= TerminatorSequenceState::Term && terminator_sequence_state <= TerminatorSequenceState::Close && has_any_sbp(code_point, SBP::Close)) { + terminator_sequence_state = TerminatorSequenceState::Close; + } else if (terminator_sequence_state >= TerminatorSequenceState::Term && has_any_sbp(code_point, SBP::Sp)) { + terminator_sequence_state = TerminatorSequenceState::Sp; + } else { + terminator_sequence_state = TerminatorSequenceState::None; + } + + // SB8 + if (terminator_sequence_state >= TerminatorSequenceState::Term && term_was_a_term) { + auto it_copy = it; + bool illegal_sequence = false; + for (auto sequence_code_point = *it_copy; it_copy != view.end(); ++it_copy) { + if (has_any_sbp(sequence_code_point, SBP::Close, SBP::SContinue, SBP::Numeric, SBP::Sp, SBP::Format, SBP::Extend)) + continue; + illegal_sequence = has_any_sbp(sequence_code_point, SBP::Lower); + } + if (illegal_sequence) + continue; + } + + // SB8a + if (terminator_sequence_state >= TerminatorSequenceState::Term && (has_any_sbp(next_code_point, SBP::SContinue, SBP::STerm, SBP::ATerm))) + continue; + + auto next_code_point_is_sp = has_any_sbp(next_code_point, SBP::Sp); + auto next_code_point_is_para_sep = has_any_sbp(next_code_point, SBP::Sep, SBP::CR, SBP::LF); + + // SB9 + if (terminator_sequence_state >= TerminatorSequenceState::Term && terminator_sequence_state <= TerminatorSequenceState::Close && (next_code_point_is_sp || next_code_point_is_para_sep || has_any_sbp(next_code_point, SBP::Close))) + continue; + + // SB10 + if (terminator_sequence_state >= TerminatorSequenceState::Term && (next_code_point_is_sp || next_code_point_is_para_sep)) + continue; + + // SB11 + if (terminator_sequence_state >= TerminatorSequenceState::Term) + boundaries.append(view.code_unit_offset_of(it)); + + // SB998 + } + } + + // SB2 + boundaries.append(view.length_in_code_units()); + return boundaries; +#else + return {}; +#endif +} + +} diff --git a/Userland/Libraries/LibUnicode/Segmentation.h b/Userland/Libraries/LibUnicode/Segmentation.h new file mode 100644 index 0000000000..56c3b240c9 --- /dev/null +++ b/Userland/Libraries/LibUnicode/Segmentation.h @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2022, Idan Horowitz + * Copyright (c) 2023, Tim Flynn + * + * SPDX-License-Identifier: BSD-2-Clause + */ + +#pragma once + +#include +#include +#include + +namespace Unicode { + +Vector find_grapheme_segmentation_boundaries(Utf16View const&); + +Vector find_word_segmentation_boundaries(Utf8View const&); +Vector find_word_segmentation_boundaries(Utf16View const&); + +Vector find_sentence_segmentation_boundaries(Utf16View const&); + +} diff --git a/Userland/Libraries/LibUnicode/UnicodeUtils.cpp b/Userland/Libraries/LibUnicode/UnicodeUtils.cpp index fbd3a8a164..439fcfcb54 100644 --- a/Userland/Libraries/LibUnicode/UnicodeUtils.cpp +++ b/Userland/Libraries/LibUnicode/UnicodeUtils.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #if ENABLE_UNICODE_DATA