From 6f7d740592111d5f558a841f6c5d1fdb6c0af03f Mon Sep 17 00:00:00 2001 From: Jan Verbeek Date: Wed, 25 Aug 2021 13:26:44 +0200 Subject: [PATCH] wc: Do a chunked read with proper UTF-8 handling This brings the results mostly in line with GNU wc and solves nasty behavior with long lines. --- Cargo.lock | 8 +++ src/uu/wc/Cargo.toml | 2 + src/uu/wc/src/countable.rs | 53 +++------------- src/uu/wc/src/wc.rs | 60 ++++++++++++++++--- src/uu/wc/src/word_count.rs | 80 ------------------------- tests/by-util/test_wc.rs | 15 +++-- tests/fixtures/wc/UTF_8_test.txt | Bin 22781 -> 23025 bytes tests/fixtures/wc/UTF_8_weirdchars.txt | 25 ++++++++ 8 files changed, 105 insertions(+), 138 deletions(-) create mode 100644 tests/fixtures/wc/UTF_8_weirdchars.txt diff --git a/Cargo.lock b/Cargo.lock index 8a9d0bdff..4d51b124d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2046,6 +2046,12 @@ dependencies = [ "log", ] +[[package]] +name = "utf-8" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" + [[package]] name = "utf8-width" version = "0.1.5" @@ -3121,6 +3127,8 @@ dependencies = [ "libc", "nix 0.20.0", "thiserror", + "unicode-width", + "utf-8", "uucore", "uucore_procs", ] diff --git a/src/uu/wc/Cargo.toml b/src/uu/wc/Cargo.toml index 49735adf7..1fdaa02f7 100644 --- a/src/uu/wc/Cargo.toml +++ b/src/uu/wc/Cargo.toml @@ -20,6 +20,8 @@ uucore = { version=">=0.0.9", package="uucore", path="../../uucore" } uucore_procs = { version=">=0.0.6", package="uucore_procs", path="../../uucore_procs" } thiserror = "1.0" bytecount = "0.6.2" +utf-8 = "0.7.6" +unicode-width = "0.1.8" [target.'cfg(unix)'.dependencies] nix = "0.20" diff --git a/src/uu/wc/src/countable.rs b/src/uu/wc/src/countable.rs index 098c451c7..a14623559 100644 --- a/src/uu/wc/src/countable.rs +++ b/src/uu/wc/src/countable.rs @@ -4,7 +4,7 @@ //! for some common file-like objects. Use the [`WordCountable::lines`] //! method to get an iterator over lines of a file-like object. use std::fs::File; -use std::io::{self, BufRead, BufReader, Read, StdinLock}; +use std::io::{BufRead, BufReader, Read, StdinLock}; #[cfg(unix)] use std::os::unix::io::AsRawFd; @@ -12,65 +12,26 @@ use std::os::unix::io::AsRawFd; #[cfg(unix)] pub trait WordCountable: AsRawFd + Read { type Buffered: BufRead; - fn lines(self) -> Lines; + fn buffered(self) -> Self::Buffered; } #[cfg(not(unix))] pub trait WordCountable: Read { type Buffered: BufRead; - fn lines(self) -> Lines; + fn buffered(self) -> Self::Buffered; } impl WordCountable for StdinLock<'_> { type Buffered = Self; - fn lines(self) -> Lines - where - Self: Sized, - { - Lines::new(self) + fn buffered(self) -> Self::Buffered { + self } } impl WordCountable for File { type Buffered = BufReader; - fn lines(self) -> Lines - where - Self: Sized, - { - Lines::new(BufReader::new(self)) - } -} - -/// An iterator over the lines of an instance of `BufRead`. -/// -/// Similar to [`io::Lines`] but yields each line as a `Vec` and -/// includes the newline character (`\n`, the `0xA` byte) that -/// terminates the line. -/// -/// [`io::Lines`]:: io::Lines -pub struct Lines { - buf: B, - line: Vec, -} - -impl Lines { - fn new(reader: B) -> Self { - Lines { - buf: reader, - line: Vec::new(), - } - } - - pub fn next(&mut self) -> Option> { - self.line.clear(); - - // reading from a TTY seems to raise a condition on, rather than return Some(0) like a file. 
- // hence the option wrapped in a result here - match self.buf.read_until(b'\n', &mut self.line) { - Ok(0) => None, - Ok(_n) => Some(Ok(&self.line)), - Err(e) => Some(Err(e)), - } + fn buffered(self) -> Self::Buffered { + BufReader::new(self) } } diff --git a/src/uu/wc/src/wc.rs b/src/uu/wc/src/wc.rs index 68fd23fb4..5b0ae0a0a 100644 --- a/src/uu/wc/src/wc.rs +++ b/src/uu/wc/src/wc.rs @@ -13,11 +13,14 @@ mod countable; mod word_count; use count_fast::{count_bytes_and_lines_fast, count_bytes_fast}; use countable::WordCountable; +use unicode_width::UnicodeWidthChar; +use utf8::{BufReadDecoder, BufReadDecoderError}; use word_count::{TitledWordCount, WordCount}; use clap::{crate_version, App, Arg, ArgMatches}; use thiserror::Error; +use std::cmp::max; use std::fs::{self, File}; use std::io::{self, ErrorKind, Write}; use std::path::Path; @@ -224,16 +227,59 @@ fn word_count_from_reader( return count_bytes_and_lines_fast(&mut reader); } - // Sum the WordCount for each line. Show a warning for each line - // that results in an IO error when trying to read it. - let mut lines = reader.lines(); let mut total = WordCount::default(); - while let Some(res) = lines.next() { - match res { - Ok(line) => total += WordCount::from_line(line), - Err(e) => show_warning!("Error while reading {}: {}", path, e), + let mut reader = BufReadDecoder::new(reader.buffered()); + let mut in_word = false; + let mut current_len = 0; + + while let Some(chunk) = reader.next_strict() { + match chunk { + Ok(text) => { + for ch in text.chars() { + if ch.is_whitespace() { + in_word = false; + } else if ch.is_ascii_control() { + // These count as characters but do not affect the word state + } else if !in_word { + in_word = true; + total.words += 1; + } + match ch { + '\n' => { + total.max_line_length = max(current_len, total.max_line_length); + current_len = 0; + total.lines += 1; + } + // '\x0c' = '\f' + '\r' | '\x0c' => { + total.max_line_length = max(current_len, total.max_line_length); + current_len = 0; + } + '\t' => { + current_len -= current_len % 8; + current_len += 8; + } + _ => { + current_len += ch.width().unwrap_or(0); + } + } + total.chars += 1; + } + total.bytes += text.len(); + } + Err(BufReadDecoderError::InvalidByteSequence(bytes)) => { + // GNU wc treats invalid data as neither word nor char nor whitespace, + // so no other counters are affected + total.bytes += bytes.len(); + } + Err(BufReadDecoderError::Io(e)) => { + show_warning!("Error while reading {}: {}", path, e); + } } } + + total.max_line_length = max(current_len, total.max_line_length); + Ok(total) } diff --git a/src/uu/wc/src/word_count.rs b/src/uu/wc/src/word_count.rs index 848e64b98..ac9de390c 100644 --- a/src/uu/wc/src/word_count.rs +++ b/src/uu/wc/src/word_count.rs @@ -1,19 +1,5 @@ use std::cmp::max; -use std::iter::Sum; use std::ops::{Add, AddAssign}; -use std::str::from_utf8; - -const CR: u8 = b'\r'; -const LF: u8 = b'\n'; -const SPACE: u8 = b' '; -const TAB: u8 = b'\t'; -const SYN: u8 = 0x16_u8; -const FF: u8 = 0x0C_u8; - -#[inline(always)] -fn is_word_separator(byte: u8) -> bool { - byte == SPACE || byte == TAB || byte == CR || byte == SYN || byte == FF -} #[derive(Debug, Default, Copy, Clone)] pub struct WordCount { @@ -44,76 +30,10 @@ impl AddAssign for WordCount { } } -impl Sum for WordCount { - fn sum(iter: I) -> WordCount - where - I: Iterator, - { - iter.fold(WordCount::default(), |acc, x| acc + x) - } -} - impl WordCount { - /// Count the characters and whitespace-separated words in the given bytes. 
- /// - /// `line` is a slice of bytes that will be decoded as ASCII characters. - fn ascii_word_and_char_count(line: &[u8]) -> (usize, usize) { - let word_count = line.split(|&x| is_word_separator(x)).count(); - let char_count = line.iter().filter(|c| c.is_ascii()).count(); - (word_count, char_count) - } - - /// Create a [`WordCount`] from a sequence of bytes representing a line. - /// - /// If the last byte of `line` encodes a newline character (`\n`), - /// then the [`lines`] field will be set to 1. Otherwise, it will - /// be set to 0. The [`bytes`] field is simply the length of - /// `line`. - /// - /// If `decode_chars` is `false`, the [`chars`] and [`words`] - /// fields will be set to 0. If it is `true`, this function will - /// attempt to decode the bytes first as UTF-8, and failing that, - /// as ASCII. - pub fn from_line(line: &[u8]) -> WordCount { - // GNU 'wc' only counts lines that end in LF as lines - let lines = (*line.last().unwrap() == LF) as usize; - let bytes = line.len(); - let (words, chars) = WordCount::word_and_char_count(line); - // -L is a GNU 'wc' extension so same behavior on LF - let max_line_length = if chars > 0 { chars - lines } else { 0 }; - WordCount { - bytes, - chars, - lines, - words, - max_line_length, - } - } - - /// Count the UTF-8 characters and words in the given string slice. - /// - /// `s` is a string slice that is assumed to be a UTF-8 string. - fn utf8_word_and_char_count(s: &str) -> (usize, usize) { - let word_count = s.split_whitespace().count(); - let char_count = s.chars().count(); - (word_count, char_count) - } - pub fn with_title(self, title: Option<&str>) -> TitledWordCount { TitledWordCount { title, count: self } } - - /// Count the characters and words in the given slice of bytes. - /// - /// `line` is a slice of bytes that will be decoded as UTF-8 - /// characters, or if that fails, as ASCII characters. - fn word_and_char_count(line: &[u8]) -> (usize, usize) { - // try and convert the bytes to UTF-8 first - match from_utf8(line) { - Ok(s) => WordCount::utf8_word_and_char_count(s), - Err(..) 
=> WordCount::ascii_word_and_char_count(line), - } - } } /// This struct supplements the actual word count with an optional title that is diff --git a/tests/by-util/test_wc.rs b/tests/by-util/test_wc.rs index 88c65c997..1c6625ab4 100644 --- a/tests/by-util/test_wc.rs +++ b/tests/by-util/test_wc.rs @@ -53,11 +53,16 @@ fn test_utf8() { .args(&["-lwmcL"]) .pipe_in_fixture("UTF_8_test.txt") .run() - .stdout_is(" 300 4969 22781 22213 79\n"); - // GNU returns " 300 2086 22219 22781 79" - // - // TODO: we should fix the word, character, and byte count to - // match the behavior of GNU wc + .stdout_is(" 303 2119 23025 22457 79\n"); +} + +#[test] +fn test_utf8_extra() { + new_ucmd!() + .arg("-lwmcL") + .pipe_in_fixture("UTF_8_weirdchars.txt") + .run() + .stdout_is(" 25 87 513 442 48\n"); } #[test] diff --git a/tests/fixtures/wc/UTF_8_test.txt b/tests/fixtures/wc/UTF_8_test.txt index a5b5d50e6b61eb9a3b751b3954f83e61bb59db9b..cd0474c82c04fd18c01edf17d80374e9d483709b 100644 GIT binary patch delta 307 zcmeynk@4eZ#tj96j1`j$1Vb1rCI@N@Pu{>QQD4IqU0Pa_nNzHgmtT@k*tdNpfoLrPzkeHWTsgPN$keHmDT2PXhl#{BXP@J!zUZI{^tdNmd z=AWvNl$x5SkeOGUT2zvnqEM2rP+XFklcSKEn4PMi05-5BBUPa&wYW5=q*x&{B{i=k zGdVE_q%yy>NFg(~ASX39HLoPGBr`v6a$>L&W9(!>{@~5dLOU55D<(7QYff$uW}B=n uqQY1)`JlbpDF-d@J@WiaX*|_Agac+AHoHx6cyuPnp`NTHMw3mWb*=vKt}+@F(E7f diff --git a/tests/fixtures/wc/UTF_8_weirdchars.txt b/tests/fixtures/wc/UTF_8_weirdchars.txt new file mode 100644 index 000000000..0c7670f5e --- /dev/null +++ b/tests/fixtures/wc/UTF_8_weirdchars.txt @@ -0,0 +1,25 @@ +zero-width space inbetween these: x​x +and inbetween two spaces: [ ​ ] +and at the end of the line: ​ + +non-breaking space: x x [   ]   + +simple unicode: xµx [ µ ] µ + +wide: xwx [ w ] w + +simple emoji: x👩x [ 👩 ] 👩 + +complex emoji: x👩‍🔬x [ 👩‍🔬 ] 👩‍🔬 + +Hello, world! + +line feed: x x [ ] + +vertical tab: x x [ ] + +horizontal tab: x x [ ] +this should be the longest line: +1234567 12345678 123456781234567812345678 + +Control character: xx [  ]
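
Note (reviewer sketch, not part of the patch): the counting loop added in
word_count_from_reader() can be exercised in isolation. The sketch below uses
the same `utf-8` (BufReadDecoder) and `unicode-width` crates that the patch
adds to Cargo.toml; the in-memory Cursor input is illustrative only, and the
'\r'/'\f'/control-character special cases from the patch are omitted for
brevity.

    use std::io::Cursor;

    use unicode_width::UnicodeWidthChar;
    use utf8::{BufReadDecoder, BufReadDecoderError};

    fn main() {
        let input = Cursor::new("naïve\twords on a line\nsecond line\n".as_bytes());
        let mut decoder = BufReadDecoder::new(input);

        let (mut lines, mut words, mut chars, mut bytes) = (0usize, 0usize, 0usize, 0usize);
        let (mut max_len, mut current_len) = (0usize, 0usize);
        let mut in_word = false;

        // Decode in chunks; only valid UTF-8 reaches the char-based counters.
        while let Some(chunk) = decoder.next_strict() {
            match chunk {
                Ok(text) => {
                    for ch in text.chars() {
                        if ch.is_whitespace() {
                            in_word = false;
                        } else if !in_word {
                            in_word = true;
                            words += 1;
                        }
                        match ch {
                            '\n' => {
                                max_len = max_len.max(current_len);
                                current_len = 0;
                                lines += 1;
                            }
                            // Tabs advance the display column to the next multiple of 8.
                            '\t' => current_len = current_len - current_len % 8 + 8,
                            _ => current_len += ch.width().unwrap_or(0),
                        }
                        chars += 1;
                    }
                    bytes += text.len();
                }
                // Invalid byte sequences only advance the byte count, as in GNU wc.
                Err(BufReadDecoderError::InvalidByteSequence(seq)) => bytes += seq.len(),
                Err(BufReadDecoderError::Io(e)) => {
                    eprintln!("read error: {}", e);
                    break;
                }
            }
        }
        max_len = max_len.max(current_len);

        println!("{} {} {} {} {}", lines, words, chars, bytes, max_len);
    }

For this input the sketch should print "2 7 34 35 23" (lines, words, chars,
bytes, max line length), the same order wc -lwmcL uses.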