From acd290d11f3351a342e67e351d38792ffd725286 Mon Sep 17 00:00:00 2001 From: Dean Li Date: Wed, 2 Jun 2021 21:40:47 +0800 Subject: [PATCH] more: fix unicode bug for breakline - Use `unicode_segmentation` and `unicode_width` to determine proper `break_line` position. - Keep track of total_width as suggested by @tertsdiepraam. - Add unittest for ZWJ unicode case Related to #2319. --- Cargo.lock | 2 ++ src/uu/more/Cargo.toml | 2 ++ src/uu/more/src/more.rs | 62 ++++++++++++++++++++++++++++++----------- 3 files changed, 50 insertions(+), 16 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 17fa9e2b7..3778db34c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2204,6 +2204,8 @@ dependencies = [ "nix 0.13.1", "redox_syscall 0.1.57", "redox_termios", + "unicode-segmentation", + "unicode-width", "uucore", "uucore_procs", ] diff --git a/src/uu/more/Cargo.toml b/src/uu/more/Cargo.toml index 9b1a3d7b6..b3b97e6dd 100644 --- a/src/uu/more/Cargo.toml +++ b/src/uu/more/Cargo.toml @@ -20,6 +20,8 @@ uucore = { version = ">=0.0.7", package = "uucore", path = "../../uucore" } uucore_procs = { version = ">=0.0.5", package = "uucore_procs", path = "../../uucore_procs" } crossterm = ">=0.19" atty = "0.2.14" +unicode-width = "0.1.7" +unicode-segmentation = "1.7.1" [target.'cfg(target_os = "redox")'.dependencies] redox_termios = "0.1" diff --git a/src/uu/more/src/more.rs b/src/uu/more/src/more.rs index 482c5491d..2e6771705 100644 --- a/src/uu/more/src/more.rs +++ b/src/uu/more/src/more.rs @@ -29,6 +29,9 @@ use crossterm::{ terminal, }; +use unicode_segmentation::UnicodeSegmentation; +use unicode_width::UnicodeWidthStr; + pub mod options { pub const SILENT: &str = "silent"; pub const LOGICAL: &str = "logical"; @@ -313,23 +316,30 @@ fn break_buff(buff: &str, cols: usize) -> Vec { lines } -fn break_line(mut line: &str, cols: usize) -> Vec { - let breaks = (line.len() / cols).saturating_add(1); - let mut lines = Vec::with_capacity(breaks); - // TODO: Use unicode width instead of the length in 
bytes. - if line.len() < cols { +fn break_line(line: &str, cols: usize) -> Vec { + let width = UnicodeWidthStr::width(line); + let mut lines = Vec::new(); + if width < cols { lines.push(line.to_string()); return lines; } - for _ in 1..=breaks { - let (line1, line2) = line.split_at(cols); - lines.push(line1.to_string()); - if line2.len() < cols { - lines.push(line2.to_string()); - break; + let gr_idx = UnicodeSegmentation::grapheme_indices(line, true); + let mut last_index = 0; + let mut total_width = 0; + for (index, grapheme) in gr_idx { + let width = UnicodeWidthStr::width(grapheme); + total_width += width; + + if total_width > cols { + lines.push(line[last_index..index].to_string()); + last_index = index; + total_width = width; } - line = line2; + } + + if last_index != line.len() { + lines.push(line[last_index..].to_string()); } lines } @@ -363,6 +373,7 @@ fn make_prompt_and_flush(stdout: &mut Stdout, lower_mark: u16, lc: u16) { #[cfg(test)] mod tests { use super::{break_line, calc_range}; + use unicode_width::UnicodeWidthStr; // It is good to test the above functions #[test] @@ -379,11 +390,12 @@ mod tests { } let lines = break_line(&test_string, 80); + let widths: Vec = lines + .iter() + .map(|s| UnicodeWidthStr::width(&s[..])) + .collect(); - assert_eq!( - (80, 80, 40), - (lines[0].len(), lines[1].len(), lines[2].len()) - ); + assert_eq!((80, 80, 40), (widths[0], widths[1], widths[2])); } #[test] @@ -397,4 +409,22 @@ mod tests { assert_eq!(20, lines[0].len()); } + + #[test] + fn test_break_line_zwj() { + let mut test_string = String::with_capacity(1100); + for _ in 0..20 { + test_string.push_str("👩🏻‍🔬"); + } + + let lines = break_line(&test_string, 80); + + let widths: Vec = lines + .iter() + .map(|s| UnicodeWidthStr::width(&s[..])) + .collect(); + + // Each 👩🏻‍🔬 is 6 columns wide, so a line breaks at the largest multiple of 6 that fits in 80 columns: 6 * 13 = 78 + assert_eq!((78, 42), (widths[0], widths[1])); + } }