mirror of
https://github.com/RGBCube/serenity
synced 2025-05-14 09:24:57 +00:00
LibUnicode: Skip over emoji sequences in grapheme boundary segmentation
Emoji sequences in the grapheme segmentation spec are a bit tricky: \p{Extended_Pictographic} Extend* ZWJ × \p{Extended_Pictographic}. Our current strategy of tracking a boolean to indicate if we are in an emoji sequence was causing us to break up emoji made of multiple sub-sequences. For example, in the "family: man, woman, girl, boy" sequence (U+1F468 U+200D U+1F469 U+200D U+1F467 U+200D U+1F466), we would break at indices 0 (correctly) and 6 (incorrectly). Instead of tracking a boolean, it's quite a bit simpler to reason about emoji sequences by just skipping past them entirely. Note that in cases like the above emoji, we skip one sub-sequence at a time.
This commit is contained in:
parent
09d40bfbb2
commit
fa96811a22
1 changed files with 19 additions and 12 deletions
|
@ -64,13 +64,30 @@ static void for_each_grapheme_segmentation_boundary_impl([[maybe_unused]] ViewTy
|
|||
if (code_unit_length(view) > 1) {
|
||||
auto it = view.begin();
|
||||
auto code_point = *it;
|
||||
u32 next_code_point;
|
||||
u32 next_code_point = 0;
|
||||
auto current_ri_chain = 0;
|
||||
auto in_emoji_sequence = false;
|
||||
|
||||
for (++it; it != view.end(); ++it, code_point = next_code_point) {
|
||||
next_code_point = *it;
|
||||
|
||||
// GB11
|
||||
if (code_point_has_property(code_point, Property::Extended_Pictographic) && has_any_gbp(next_code_point, GBP::Extend, GBP::ZWJ)) {
|
||||
auto it_copy = it;
|
||||
|
||||
while (it_copy != view.end() && has_any_gbp(*it_copy, GBP::Extend))
|
||||
++it_copy;
|
||||
|
||||
if (it_copy != view.end() && has_any_gbp(*it_copy, GBP::ZWJ)) {
|
||||
++it_copy;
|
||||
|
||||
if (it_copy != view.end() && code_point_has_property(*it_copy, Property::Extended_Pictographic)) {
|
||||
next_code_point = *it_copy;
|
||||
it = it_copy;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
auto code_point_is_cr = has_any_gbp(code_point, GBP::CR);
|
||||
auto next_code_point_is_lf = has_any_gbp(next_code_point, GBP::LF);
|
||||
|
||||
|
@ -97,12 +114,6 @@ static void for_each_grapheme_segmentation_boundary_impl([[maybe_unused]] ViewTy
|
|||
if (next_code_point_is_t && has_any_gbp(code_point, GBP::LVT, GBP::T))
|
||||
continue;
|
||||
|
||||
auto code_point_is_zwj = has_any_gbp(code_point, GBP::ZWJ);
|
||||
if (!in_emoji_sequence && code_point_has_property(code_point, Property::Extended_Pictographic))
|
||||
in_emoji_sequence = true;
|
||||
else if (in_emoji_sequence && !has_any_gbp(code_point, GBP::Extend) && !code_point_is_zwj)
|
||||
in_emoji_sequence = false;
|
||||
|
||||
// GB9
|
||||
if (has_any_gbp(next_code_point, GBP::Extend, GBP::ZWJ))
|
||||
continue;
|
||||
|
@ -113,10 +124,6 @@ static void for_each_grapheme_segmentation_boundary_impl([[maybe_unused]] ViewTy
|
|||
if (has_any_gbp(code_point, GBP::Prepend))
|
||||
continue;
|
||||
|
||||
// GB11
|
||||
if (in_emoji_sequence && code_point_is_zwj && code_point_has_property(next_code_point, Property::Extended_Pictographic))
|
||||
continue;
|
||||
|
||||
auto code_point_is_ri = has_any_gbp(code_point, GBP::Regional_Indicator);
|
||||
current_ri_chain = code_point_is_ri ? current_ri_chain + 1 : 0;
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue