1
Fork 0
mirror of https://github.com/RGBCube/serenity synced 2025-07-27 06:17:35 +00:00

LibUnicode: Implement sentence segmentation

This commit is contained in:
Idan Horowitz 2022-01-31 18:22:24 +02:00
parent a593a5c8ab
commit 4967bcd4ce
2 changed files with 117 additions and 0 deletions

View file

@ -602,4 +602,120 @@ Vector<size_t> find_word_segmentation_boundaries([[maybe_unused]] Utf16View cons
#endif #endif
} }
// Computes the sentence boundaries of `view` following the sentence boundary rules (SB1 - SB998)
// of Unicode Standard Annex #29.
//
// Returns code unit offsets into `view`. For a non-empty view the result always starts with 0 (SB1)
// and ends with view.length_in_code_units() (SB2). An empty vector is returned for an empty view,
// or when the library is built without ENABLE_UNICODE_DATA.
Vector<size_t> find_sentence_segmentation_boundaries([[maybe_unused]] Utf16View const& view)
{
#if ENABLE_UNICODE_DATA
    using SBP = SentenceBreakProperty;

    Vector<size_t> boundaries;

    // https://www.unicode.org/reports/tr29/#Sentence_Boundary_Rules
    if (view.length_in_code_points() == 0)
        return boundaries;

    // Returns true if the code point has at least one of the given Sentence_Break properties.
    auto has_any_sbp = [](u32 code_point, auto&&... properties) {
        return (code_point_has_sentence_break_property(code_point, properties) || ...);
    };

    // SB1
    boundaries.append(0);

    if (view.length_in_code_points() > 1) {
        // Each iteration decides whether there is a boundary between `code_point` and
        // `next_code_point`; `it` always points at `next_code_point`.
        auto it = view.begin();
        auto code_point = *it;
        u32 next_code_point;
        Optional<u32> previous_code_point;

        // Tracks how far into a "(STerm | ATerm) Close* Sp*" sequence `code_point` is.
        // The rules below compare states with >= and <=, so the declaration order matters.
        enum class TerminatorSequenceState {
            None,
            Term,
            Close,
            Sp
        } terminator_sequence_state { TerminatorSequenceState::None };
        // Whether the terminator that started the current sequence was an ATerm (as opposed to an STerm).
        auto term_was_a_term = false;

        for (++it; it != view.end(); ++it, previous_code_point = code_point, code_point = next_code_point) {
            next_code_point = *it;

            auto code_point_is_cr = has_any_sbp(code_point, SBP::CR);
            auto next_code_point_is_lf = has_any_sbp(next_code_point, SBP::LF);

            // SB3
            if (code_point_is_cr && next_code_point_is_lf)
                continue;

            auto code_point_is_para_sep = code_point_is_cr || has_any_sbp(code_point, SBP::LF, SBP::Sep);

            // SB4
            if (code_point_is_para_sep) {
                boundaries.append(view.code_unit_offset_of(it));
                continue;
            }

            // SB5
            if (has_any_sbp(next_code_point, SBP::Format, SBP::Extend))
                continue;

            auto code_point_is_a_term = has_any_sbp(code_point, SBP::ATerm);

            // SB6
            if (code_point_is_a_term && has_any_sbp(next_code_point, SBP::Numeric))
                continue;

            // SB7
            if (code_point_is_a_term && previous_code_point.has_value() && has_any_sbp(*previous_code_point, SBP::Upper, SBP::Lower) && has_any_sbp(next_code_point, SBP::Upper))
                continue;

            // Advance the terminator sequence state machine with the current code point.
            if (code_point_is_a_term || has_any_sbp(code_point, SBP::STerm)) {
                terminator_sequence_state = TerminatorSequenceState::Term;
                term_was_a_term = code_point_is_a_term;
            } else if (terminator_sequence_state >= TerminatorSequenceState::Term && terminator_sequence_state <= TerminatorSequenceState::Close && has_any_sbp(code_point, SBP::Close)) {
                terminator_sequence_state = TerminatorSequenceState::Close;
            } else if (terminator_sequence_state >= TerminatorSequenceState::Term && has_any_sbp(code_point, SBP::Sp)) {
                terminator_sequence_state = TerminatorSequenceState::Sp;
            } else {
                terminator_sequence_state = TerminatorSequenceState::None;
            }

            // SB8
            if (terminator_sequence_state >= TerminatorSequenceState::Term && term_was_a_term) {
                bool illegal_sequence = false;

                // Look ahead past every code point that may sit between the terminator sequence and a
                // following lowercase letter. The first code point that cannot be skipped decides the
                // outcome, so it has to be re-read on every step and the scan has to stop there.
                // NOTE(review): code points without any Sentence_Break property are not skipped here,
                // although UAX #29 lets them appear in this position - confirm whether that is intended.
                for (auto lookahead = it; lookahead != view.end(); ++lookahead) {
                    auto sequence_code_point = *lookahead;

                    if (has_any_sbp(sequence_code_point, SBP::Close, SBP::SContinue, SBP::Numeric, SBP::Sp, SBP::Format, SBP::Extend))
                        continue;

                    illegal_sequence = has_any_sbp(sequence_code_point, SBP::Lower);
                    break;
                }

                if (illegal_sequence)
                    continue;
            }

            // SB8a
            if (terminator_sequence_state >= TerminatorSequenceState::Term && (has_any_sbp(next_code_point, SBP::SContinue, SBP::STerm, SBP::ATerm)))
                continue;

            auto next_code_point_is_sp = has_any_sbp(next_code_point, SBP::Sp);
            auto next_code_point_is_para_sep = has_any_sbp(next_code_point, SBP::Sep, SBP::CR, SBP::LF);

            // SB9
            if (terminator_sequence_state >= TerminatorSequenceState::Term && terminator_sequence_state <= TerminatorSequenceState::Close && (next_code_point_is_sp || next_code_point_is_para_sep || has_any_sbp(next_code_point, SBP::Close)))
                continue;

            // SB10
            if (terminator_sequence_state >= TerminatorSequenceState::Term && (next_code_point_is_sp || next_code_point_is_para_sep))
                continue;

            // SB11
            if (terminator_sequence_state >= TerminatorSequenceState::Term)
                boundaries.append(view.code_unit_offset_of(it));

            // SB998
        }
    }

    // SB2
    boundaries.append(view.length_in_code_units());
    return boundaries;
#else
    return {};
#endif
}
} }

View file

@ -46,5 +46,6 @@ bool code_point_has_sentence_break_property(u32 code_point, SentenceBreakPropert
Vector<size_t> find_grapheme_segmentation_boundaries(Utf16View const&); Vector<size_t> find_grapheme_segmentation_boundaries(Utf16View const&);
Vector<size_t> find_word_segmentation_boundaries(Utf16View const&); Vector<size_t> find_word_segmentation_boundaries(Utf16View const&);
Vector<size_t> find_sentence_segmentation_boundaries(Utf16View const&);
} }