wc: Do a chunked read with proper UTF-8 handling

This brings the results mostly in line with GNU wc and solves nasty behavior with long lines.
2025-07-28 03:27:44 +00:00 · 2021-08-25 13:26:44 +02:00 · 2021-08-25 13:26:44 +02:00 · 6f7d740592
commit 6f7d740592
parent 48437fc49d
8 changed files with 105 additions and 138 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -2046,6 +2046,12 @@ dependencies = [
 "log",
 ]
 [[package]]
 name = "utf-8"
 version = "0.7.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9"
 [[package]]
 name = "utf8-width"
 version = "0.1.5"
@ -3121,6 +3127,8 @@ dependencies = [
 "libc",
 "nix 0.20.0",
 "thiserror",
 "unicode-width",
 "utf-8",
 "uucore",
 "uucore_procs",
 ]
--- a/src/uu/wc/Cargo.toml
+++ b/src/uu/wc/Cargo.toml
@ -20,6 +20,8 @@ uucore = { version=">=0.0.9", package="uucore", path="../../uucore" }
 uucore_procs = { version=">=0.0.6", package="uucore_procs", path="../../uucore_procs" }
 thiserror = "1.0"
 bytecount = "0.6.2"
 utf-8 = "0.7.6"
 unicode-width = "0.1.8"
 [target.'cfg(unix)'.dependencies]
 nix = "0.20"
--- a/src/uu/wc/src/countable.rs
+++ b/src/uu/wc/src/countable.rs
@ -4,7 +4,7 @@
 //! for some common file-like objects. Use the [`WordCountable::lines`]
 //! method to get an iterator over lines of a file-like object.
 use std::fs::File;
-use std::io::{self, BufRead, BufReader, Read, StdinLock};
+use std::io::{BufRead, BufReader, Read, StdinLock};
 #[cfg(unix)]
 use std::os::unix::io::AsRawFd;
@ -12,65 +12,26 @@ use std::os::unix::io::AsRawFd;
 #[cfg(unix)]
 pub trait WordCountable: AsRawFd + Read {
    type Buffered: BufRead;
-    fn lines(self) -> Lines<Self::Buffered>;
+    fn buffered(self) -> Self::Buffered;
 }
 #[cfg(not(unix))]
 pub trait WordCountable: Read {
    type Buffered: BufRead;
-    fn lines(self) -> Lines<Self::Buffered>;
+    fn buffered(self) -> Self::Buffered;
 }
 impl WordCountable for StdinLock<'_> {
    type Buffered = Self;
-    fn lines(self) -> Lines<Self::Buffered>
+    fn buffered(self) -> Self::Buffered {
-    where
+        self
        Self: Sized,
    {
        Lines::new(self)
    }
 }
 impl WordCountable for File {
    type Buffered = BufReader<Self>;
-    fn lines(self) -> Lines<Self::Buffered>
+    fn buffered(self) -> Self::Buffered {
-    where
+        BufReader::new(self)
        Self: Sized,
    {
        Lines::new(BufReader::new(self))
    }
 }
 /// An iterator over the lines of an instance of `BufRead`.
 ///
 /// Similar to [`io::Lines`] but yields each line as a `Vec<u8>` and
 /// includes the newline character (`\n`, the `0xA` byte) that
 /// terminates the line.
 ///
 /// [`io::Lines`]:: io::Lines
 pub struct Lines<B> {
    buf: B,
    line: Vec<u8>,
 }
 impl<B: BufRead> Lines<B> {
    fn new(reader: B) -> Self {
        Lines {
            buf: reader,
            line: Vec::new(),
        }
    }
    pub fn next(&mut self) -> Option<io::Result<&[u8]>> {
        self.line.clear();
        // reading from a TTY seems to raise a condition on, rather than return Some(0) like a file.
        // hence the option wrapped in a result here
        match self.buf.read_until(b'\n', &mut self.line) {
            Ok(0) => None,
            Ok(_n) => Some(Ok(&self.line)),
            Err(e) => Some(Err(e)),
        }
    }
 }
--- a/src/uu/wc/src/wc.rs
+++ b/src/uu/wc/src/wc.rs
@ -13,11 +13,14 @@ mod countable;
 mod word_count;
 use count_fast::{count_bytes_and_lines_fast, count_bytes_fast};
 use countable::WordCountable;
 use unicode_width::UnicodeWidthChar;
 use utf8::{BufReadDecoder, BufReadDecoderError};
 use word_count::{TitledWordCount, WordCount};
 use clap::{crate_version, App, Arg, ArgMatches};
 use thiserror::Error;
 use std::cmp::max;
 use std::fs::{self, File};
 use std::io::{self, ErrorKind, Write};
 use std::path::Path;
@ -224,16 +227,59 @@ fn word_count_from_reader<T: WordCountable>(
        return count_bytes_and_lines_fast(&mut reader);
    }
    // Sum the WordCount for each line. Show a warning for each line
    // that results in an IO error when trying to read it.
    let mut lines = reader.lines();
    let mut total = WordCount::default();
-    while let Some(res) = lines.next() {
+    let mut reader = BufReadDecoder::new(reader.buffered());
-        match res {
+    let mut in_word = false;
-            Ok(line) => total += WordCount::from_line(line),
+    let mut current_len = 0;
-            Err(e) => show_warning!("Error while reading {}: {}", path, e),
+
    while let Some(chunk) = reader.next_strict() {
        match chunk {
            Ok(text) => {
                for ch in text.chars() {
                    if ch.is_whitespace() {
                        in_word = false;
                    } else if ch.is_ascii_control() {
                        // These count as characters but do not affect the word state
                    } else if !in_word {
                        in_word = true;
                        total.words += 1;
                    }
                    match ch {
                        '\n' => {
                            total.max_line_length = max(current_len, total.max_line_length);
                            current_len = 0;
                            total.lines += 1;
                        }
                        // '\x0c' = '\f'
                        '\r' | '\x0c' => {
                            total.max_line_length = max(current_len, total.max_line_length);
                            current_len = 0;
                        }
                        '\t' => {
                            current_len -= current_len % 8;
                            current_len += 8;
                        }
                        _ => {
                            current_len += ch.width().unwrap_or(0);
                        }
                    }
                    total.chars += 1;
                }
                total.bytes += text.len();
            }
            Err(BufReadDecoderError::InvalidByteSequence(bytes)) => {
                // GNU wc treats invalid data as neither word nor char nor whitespace,
                // so no other counters are affected
                total.bytes += bytes.len();
            }
            Err(BufReadDecoderError::Io(e)) => {
                show_warning!("Error while reading {}: {}", path, e);
            }
        }
    }
    total.max_line_length = max(current_len, total.max_line_length);
    Ok(total)
 }
--- a/src/uu/wc/src/word_count.rs
+++ b/src/uu/wc/src/word_count.rs
@ -1,19 +1,5 @@
 use std::cmp::max;
 use std::iter::Sum;
 use std::ops::{Add, AddAssign};
 use std::str::from_utf8;
 const CR: u8 = b'\r';
 const LF: u8 = b'\n';
 const SPACE: u8 = b' ';
 const TAB: u8 = b'\t';
 const SYN: u8 = 0x16_u8;
 const FF: u8 = 0x0C_u8;
 #[inline(always)]
 fn is_word_separator(byte: u8) -> bool {
    byte == SPACE || byte == TAB || byte == CR || byte == SYN || byte == FF
 }
 #[derive(Debug, Default, Copy, Clone)]
 pub struct WordCount {
@ -44,76 +30,10 @@ impl AddAssign for WordCount {
    }
 }
 impl Sum for WordCount {
    fn sum<I>(iter: I) -> WordCount
    where
        I: Iterator<Item = WordCount>,
    {
        iter.fold(WordCount::default(), |acc, x| acc + x)
    }
 }
 impl WordCount {
    /// Count the characters and whitespace-separated words in the given bytes.
    ///
    /// `line` is a slice of bytes that will be decoded as ASCII characters.
    fn ascii_word_and_char_count(line: &[u8]) -> (usize, usize) {
        let word_count = line.split(|&x| is_word_separator(x)).count();
        let char_count = line.iter().filter(|c| c.is_ascii()).count();
        (word_count, char_count)
    }
    /// Create a [`WordCount`] from a sequence of bytes representing a line.
    ///
    /// If the last byte of `line` encodes a newline character (`\n`),
    /// then the [`lines`] field will be set to 1. Otherwise, it will
    /// be set to 0. The [`bytes`] field is simply the length of
    /// `line`.
    ///
    /// If `decode_chars` is `false`, the [`chars`] and [`words`]
    /// fields will be set to 0. If it is `true`, this function will
    /// attempt to decode the bytes first as UTF-8, and failing that,
    /// as ASCII.
    pub fn from_line(line: &[u8]) -> WordCount {
        // GNU 'wc' only counts lines that end in LF as lines
        let lines = (*line.last().unwrap() == LF) as usize;
        let bytes = line.len();
        let (words, chars) = WordCount::word_and_char_count(line);
        // -L is a GNU 'wc' extension so same behavior on LF
        let max_line_length = if chars > 0 { chars - lines } else { 0 };
        WordCount {
            bytes,
            chars,
            lines,
            words,
            max_line_length,
        }
    }
    /// Count the UTF-8 characters and words in the given string slice.
    ///
    /// `s` is a string slice that is assumed to be a UTF-8 string.
    fn utf8_word_and_char_count(s: &str) -> (usize, usize) {
        let word_count = s.split_whitespace().count();
        let char_count = s.chars().count();
        (word_count, char_count)
    }
    pub fn with_title(self, title: Option<&str>) -> TitledWordCount {
        TitledWordCount { title, count: self }
    }
    /// Count the characters and words in the given slice of bytes.
    ///
    /// `line` is a slice of bytes that will be decoded as UTF-8
    /// characters, or if that fails, as ASCII characters.
    fn word_and_char_count(line: &[u8]) -> (usize, usize) {
        // try and convert the bytes to UTF-8 first
        match from_utf8(line) {
            Ok(s) => WordCount::utf8_word_and_char_count(s),
            Err(..) => WordCount::ascii_word_and_char_count(line),
        }
    }
 }
 /// This struct supplements the actual word count with an optional title that is
--- a/tests/by-util/test_wc.rs
+++ b/tests/by-util/test_wc.rs
@ -53,11 +53,16 @@ fn test_utf8() {
        .args(&["-lwmcL"])
        .pipe_in_fixture("UTF_8_test.txt")
        .run()
-        .stdout_is("    300    4969   22781   22213      79\n");
+        .stdout_is("    303    2119   23025   22457      79\n");
-    // GNU returns "    300    2086   22219   22781      79"
+}
-    //
+
-    // TODO: we should fix the word, character, and byte count to
+#[test]
-    // match the behavior of GNU wc
+fn test_utf8_extra() {
    new_ucmd!()
        .arg("-lwmcL")
        .pipe_in_fixture("UTF_8_weirdchars.txt")
        .run()
        .stdout_is("     25      87     513     442      48\n");
 }
 #[test]
--- a/tests/fixtures/wc/UTF_8_test.txt
+++ b/tests/fixtures/wc/UTF_8_test.txt
--- a/tests/fixtures/wc/UTF_8_weirdchars.txt
+++ b/tests/fixtures/wc/UTF_8_weirdchars.txt
@ -0,0 +1,25 @@
 zero-width space inbetween these: xx
 and inbetween two spaces: [  ]
 and at the end of the line: 
 non-breaking space: x x [   ]  
 simple unicode: xµx [ µ ] µ
 wide: xｗx [ ｗ ] ｗ
 simple emoji: x👩x [ 👩 ] 👩
 complex emoji: x👩‍🔬x [ 👩‍🔬 ] 👩‍🔬
 Ｈｅｌｌｏ, ｗｏｒｌｄ!
 line feed: xx [  ] 
 vertical tab: xx [  ] 
 horizontal tab: x	x [ 	 ]
 this should be the longest line:
 1234567	12345678	123456781234567812345678
 Control character: xx [  ]