diff --git a/.github/workflows/GnuTests.yml b/.github/workflows/GnuTests.yml index 5d841d3b2..74df3792d 100644 --- a/.github/workflows/GnuTests.yml +++ b/.github/workflows/GnuTests.yml @@ -166,6 +166,7 @@ jobs: sudo locale-gen --keep-existing sv_SE sudo locale-gen --keep-existing sv_SE.UTF-8 sudo locale-gen --keep-existing en_US + sudo locale-gen --keep-existing en_US.UTF-8 sudo locale-gen --keep-existing ru_RU.KOI8-R sudo update-locale diff --git a/src/uu/fmt/src/linebreak.rs b/src/uu/fmt/src/linebreak.rs index 143228373..9dc7243cb 100644 --- a/src/uu/fmt/src/linebreak.rs +++ b/src/uu/fmt/src/linebreak.rs @@ -236,7 +236,11 @@ fn find_kp_breakpoints<'a, T: Iterator>>( let mut next_active_breaks = vec![]; let stretch = args.opts.width - args.opts.goal; - let minlength = args.opts.goal.max(stretch + 1) - stretch; + let minlength = if args.opts.goal <= 10 { + 1 + } else { + args.opts.goal.max(stretch + 1) - stretch + }; let mut new_linebreaks = vec![]; let mut is_sentence_start = false; let mut least_demerits = 0; @@ -384,11 +388,11 @@ fn build_best_path<'a>(paths: &[LineBreak<'a>], active: &[usize]) -> Vec<(&'a Wo const BAD_INFTY: i64 = 10_000_000; const BAD_INFTY_SQ: i64 = BAD_INFTY * BAD_INFTY; // badness = BAD_MULT * abs(r) ^ 3 -const BAD_MULT: f32 = 100.0; +const BAD_MULT: f32 = 200.0; // DR_MULT is multiplier for delta-R between lines const DR_MULT: f32 = 600.0; // DL_MULT is penalty multiplier for short words at end of line -const DL_MULT: f32 = 300.0; +const DL_MULT: f32 = 10.0; fn compute_demerits(delta_len: isize, stretch: usize, wlen: usize, prev_rat: f32) -> (i64, f32) { // how much stretch are we using? diff --git a/src/uu/fmt/src/parasplit.rs b/src/uu/fmt/src/parasplit.rs index f9da4ad58..dc8d98f6c 100644 --- a/src/uu/fmt/src/parasplit.rs +++ b/src/uu/fmt/src/parasplit.rs @@ -26,6 +26,14 @@ fn char_width(c: char) -> usize { } } +// GNU fmt has a more restrictive definition of whitespace than Unicode. +// It only considers ASCII whitespace characters (space, tab, newline, etc.) +// and excludes many Unicode whitespace characters like non-breaking spaces. +fn is_fmt_whitespace(c: char) -> bool { + // Only ASCII whitespace characters are considered whitespace in GNU fmt + matches!(c, ' ' | '\t' | '\n' | '\r' | '\x0B' | '\x0C') +} + // lines with PSKIP, lacking PREFIX, or which are entirely blank are // NoFormatLines; otherwise, they are FormatLines #[derive(Debug)] @@ -109,7 +117,7 @@ impl FileLines<'_> { for (i, char) in line.char_indices() { if line[i..].starts_with(pfx) { return (true, i); - } else if !char.is_whitespace() { + } else if !is_fmt_whitespace(char) { break; } } @@ -128,7 +136,7 @@ impl FileLines<'_> { prefix_len = indent_len; } - if (os >= prefix_end) && !c.is_whitespace() { + if (os >= prefix_end) && !is_fmt_whitespace(c) { // found first non-whitespace after prefix, this is indent_end indent_end = os; break; @@ -154,7 +162,7 @@ impl Iterator for FileLines<'_> { // emit a blank line // Err(true) indicates that this was a linebreak, // which is important to know when detecting mail headers - if n.chars().all(char::is_whitespace) { + if n.chars().all(is_fmt_whitespace) { return Some(Line::NoFormatLine(String::new(), true)); } @@ -174,7 +182,7 @@ impl Iterator for FileLines<'_> { if pmatch && n[poffset + self.opts.prefix.as_ref().map_or(0, |s| s.len())..] .chars() - .all(char::is_whitespace) + .all(is_fmt_whitespace) { return Some(Line::NoFormatLine(n, false)); } @@ -498,7 +506,7 @@ impl WordSplit<'_> { let mut aftertab = 0; let mut word_start = None; for (os, c) in string.char_indices() { - if !c.is_whitespace() { + if !is_fmt_whitespace(c) { word_start = Some(os); break; } else if c == '\t' { @@ -519,7 +527,7 @@ impl WordSplit<'_> { impl WordSplit<'_> { fn new<'b>(opts: &'b FmtOptions, string: &'b str) -> WordSplit<'b> { // wordsplits *must* start at a non-whitespace character - let trim_string = string.trim_start(); + let trim_string = string.trim_start_matches(is_fmt_whitespace); WordSplit { opts, string: trim_string, @@ -571,7 +579,7 @@ impl<'a> Iterator for WordSplit<'a> { // points to whitespace character OR end of string let mut word_nchars = 0; self.position = match self.string[word_start..].find(|x: char| { - if x.is_whitespace() { + if is_fmt_whitespace(x) { true } else { word_nchars += char_width(x); diff --git a/tests/by-util/test_fmt.rs b/tests/by-util/test_fmt.rs index 8d851d5ce..89a77d286 100644 --- a/tests/by-util/test_fmt.rs +++ b/tests/by-util/test_fmt.rs @@ -2,6 +2,9 @@ // // For the full copyright and license information, please view the LICENSE // file that was distributed with this source code. + +// spell-checker:ignore plass samp + use uutests::new_ucmd; use uutests::util::TestScenario; use uutests::util_name; @@ -303,3 +306,73 @@ fn prefix_equal_skip_prefix_equal_two() { .stdout_is_fixture("prefixed-one-word-per-line_p=_P=2.txt"); } } + +#[test] +fn test_fmt_unicode_whitespace_handling() { + // Character classification fix: Test that Unicode whitespace characters like non-breaking space + // are NOT treated as whitespace by fmt, maintaining GNU fmt compatibility. + // GNU fmt only recognizes ASCII whitespace (space, tab, newline, etc.) and excludes + // Unicode whitespace characters to ensure consistent formatting behavior. + // This prevents regression of the character classification fix + let non_breaking_space = "\u{00A0}"; // U+00A0 NO-BREAK SPACE + let figure_space = "\u{2007}"; // U+2007 FIGURE SPACE + let narrow_no_break_space = "\u{202F}"; // U+202F NARROW NO-BREAK SPACE + + // When fmt splits on width=1, these characters should NOT cause line breaks + // because they should not be considered whitespace + for (name, char) in [ + ("non-breaking space", non_breaking_space), + ("figure space", figure_space), + ("narrow no-break space", narrow_no_break_space), + ] { + let input = format!("={char}="); + let result = new_ucmd!() + .args(&["-s", "-w1"]) + .pipe_in(input.as_bytes()) + .succeeds(); + + // Should be 1 line since the Unicode char is not treated as whitespace + assert_eq!( + result.stdout_str().lines().count(), + 1, + "Failed for {name}: Unicode character should not be treated as whitespace" + ); + } +} + +#[test] +fn test_fmt_knuth_plass_line_breaking() { + // Line breaking algorithm improvements: Test the enhanced Knuth-Plass optimal line breaking + // algorithm that better handles sentence boundaries, word positioning constraints, + // and produces more natural line breaks for complex text formatting. + // This prevents regression of the line breaking algorithm improvements + let input = "@command{fmt} prefers breaking lines at the end of a sentence, and tries to\n\ + avoid line breaks after the first word of a sentence or before the last word\n\ + of a sentence. A @dfn{sentence break} is defined as either the end of a\n\ + paragraph or a word ending in any of @samp{.?!}, followed by two spaces or end\n\ + of line, ignoring any intervening parentheses or quotes. Like @TeX{},\n\ + @command{fmt} reads entire ''paragraphs'' before choosing line breaks; the\n\ + algorithm is a variant of that given by\n\ + Donald E. Knuth and Michael F. Plass\n\ + in ''Breaking Paragraphs Into Lines'',\n\ + @cite{Software---Practice & Experience}\n\ + @b{11}, 11 (November 1981), 1119--1184."; + + let expected = "@command{fmt} prefers breaking lines at the end of a sentence,\n\ + and tries to avoid line breaks after the first word of a sentence\n\ + or before the last word of a sentence. A @dfn{sentence break}\n\ + is defined as either the end of a paragraph or a word ending\n\ + in any of @samp{.?!}, followed by two spaces or end of line,\n\ + ignoring any intervening parentheses or quotes. Like @TeX{},\n\ + @command{fmt} reads entire ''paragraphs'' before choosing line\n\ + breaks; the algorithm is a variant of that given by Donald\n\ + E. Knuth and Michael F. Plass in ''Breaking Paragraphs Into\n\ + Lines'', @cite{Software---Practice & Experience} @b{11}, 11\n\ + (November 1981), 1119--1184.\n"; + + new_ucmd!() + .args(&["-g", "60", "-w", "72"]) + .pipe_in(input) + .succeeds() + .stdout_is(expected); +}