From 6f7d740592111d5f558a841f6c5d1fdb6c0af03f Mon Sep 17 00:00:00 2001 From: Jan Verbeek Date: Wed, 25 Aug 2021 13:26:44 +0200 Subject: [PATCH] wc: Do a chunked read with proper UTF-8 handling This brings the results mostly in line with GNU wc and solves nasty behavior with long lines. --- Cargo.lock | 8 +++ src/uu/wc/Cargo.toml | 2 + src/uu/wc/src/countable.rs | 53 +++------------- src/uu/wc/src/wc.rs | 60 ++++++++++++++++--- src/uu/wc/src/word_count.rs | 80 ------------------------- tests/by-util/test_wc.rs | 15 +++-- tests/fixtures/wc/UTF_8_test.txt | Bin 22781 -> 23025 bytes tests/fixtures/wc/UTF_8_weirdchars.txt | 25 ++++++++ 8 files changed, 105 insertions(+), 138 deletions(-) create mode 100644 tests/fixtures/wc/UTF_8_weirdchars.txt diff --git a/Cargo.lock b/Cargo.lock index 8a9d0bdff..4d51b124d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2046,6 +2046,12 @@ dependencies = [ "log", ] +[[package]] +name = "utf-8" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" + [[package]] name = "utf8-width" version = "0.1.5" @@ -3121,6 +3127,8 @@ dependencies = [ "libc", "nix 0.20.0", "thiserror", + "unicode-width", + "utf-8", "uucore", "uucore_procs", ] diff --git a/src/uu/wc/Cargo.toml b/src/uu/wc/Cargo.toml index 49735adf7..1fdaa02f7 100644 --- a/src/uu/wc/Cargo.toml +++ b/src/uu/wc/Cargo.toml @@ -20,6 +20,8 @@ uucore = { version=">=0.0.9", package="uucore", path="../../uucore" } uucore_procs = { version=">=0.0.6", package="uucore_procs", path="../../uucore_procs" } thiserror = "1.0" bytecount = "0.6.2" +utf-8 = "0.7.6" +unicode-width = "0.1.8" [target.'cfg(unix)'.dependencies] nix = "0.20" diff --git a/src/uu/wc/src/countable.rs b/src/uu/wc/src/countable.rs index 098c451c7..a14623559 100644 --- a/src/uu/wc/src/countable.rs +++ b/src/uu/wc/src/countable.rs @@ -4,7 +4,7 @@ //! for some common file-like objects. Use the [`WordCountable::lines`] //! method to get an iterator over lines of a file-like object. use std::fs::File; -use std::io::{self, BufRead, BufReader, Read, StdinLock}; +use std::io::{BufRead, BufReader, Read, StdinLock}; #[cfg(unix)] use std::os::unix::io::AsRawFd; @@ -12,65 +12,26 @@ use std::os::unix::io::AsRawFd; #[cfg(unix)] pub trait WordCountable: AsRawFd + Read { type Buffered: BufRead; - fn lines(self) -> Lines; + fn buffered(self) -> Self::Buffered; } #[cfg(not(unix))] pub trait WordCountable: Read { type Buffered: BufRead; - fn lines(self) -> Lines; + fn buffered(self) -> Self::Buffered; } impl WordCountable for StdinLock<'_> { type Buffered = Self; - fn lines(self) -> Lines - where - Self: Sized, - { - Lines::new(self) + fn buffered(self) -> Self::Buffered { + self } } impl WordCountable for File { type Buffered = BufReader; - fn lines(self) -> Lines - where - Self: Sized, - { - Lines::new(BufReader::new(self)) - } -} - -/// An iterator over the lines of an instance of `BufRead`. -/// -/// Similar to [`io::Lines`] but yields each line as a `Vec` and -/// includes the newline character (`\n`, the `0xA` byte) that -/// terminates the line. -/// -/// [`io::Lines`]:: io::Lines -pub struct Lines { - buf: B, - line: Vec, -} - -impl Lines { - fn new(reader: B) -> Self { - Lines { - buf: reader, - line: Vec::new(), - } - } - - pub fn next(&mut self) -> Option> { - self.line.clear(); - - // reading from a TTY seems to raise a condition on, rather than return Some(0) like a file. 
- // hence the option wrapped in a result here - match self.buf.read_until(b'\n', &mut self.line) { - Ok(0) => None, - Ok(_n) => Some(Ok(&self.line)), - Err(e) => Some(Err(e)), - } + fn buffered(self) -> Self::Buffered { + BufReader::new(self) } } diff --git a/src/uu/wc/src/wc.rs b/src/uu/wc/src/wc.rs index 68fd23fb4..5b0ae0a0a 100644 --- a/src/uu/wc/src/wc.rs +++ b/src/uu/wc/src/wc.rs @@ -13,11 +13,14 @@ mod countable; mod word_count; use count_fast::{count_bytes_and_lines_fast, count_bytes_fast}; use countable::WordCountable; +use unicode_width::UnicodeWidthChar; +use utf8::{BufReadDecoder, BufReadDecoderError}; use word_count::{TitledWordCount, WordCount}; use clap::{crate_version, App, Arg, ArgMatches}; use thiserror::Error; +use std::cmp::max; use std::fs::{self, File}; use std::io::{self, ErrorKind, Write}; use std::path::Path; @@ -224,16 +227,59 @@ fn word_count_from_reader( return count_bytes_and_lines_fast(&mut reader); } - // Sum the WordCount for each line. Show a warning for each line - // that results in an IO error when trying to read it. - let mut lines = reader.lines(); let mut total = WordCount::default(); - while let Some(res) = lines.next() { - match res { - Ok(line) => total += WordCount::from_line(line), - Err(e) => show_warning!("Error while reading {}: {}", path, e), + let mut reader = BufReadDecoder::new(reader.buffered()); + let mut in_word = false; + let mut current_len = 0; + + while let Some(chunk) = reader.next_strict() { + match chunk { + Ok(text) => { + for ch in text.chars() { + if ch.is_whitespace() { + in_word = false; + } else if ch.is_ascii_control() { + // These count as characters but do not affect the word state + } else if !in_word { + in_word = true; + total.words += 1; + } + match ch { + '\n' => { + total.max_line_length = max(current_len, total.max_line_length); + current_len = 0; + total.lines += 1; + } + // '\x0c' = '\f' + '\r' | '\x0c' => { + total.max_line_length = max(current_len, total.max_line_length); + current_len = 0; + } + '\t' => { + current_len -= current_len % 8; + current_len += 8; + } + _ => { + current_len += ch.width().unwrap_or(0); + } + } + total.chars += 1; + } + total.bytes += text.len(); + } + Err(BufReadDecoderError::InvalidByteSequence(bytes)) => { + // GNU wc treats invalid data as neither word nor char nor whitespace, + // so no other counters are affected + total.bytes += bytes.len(); + } + Err(BufReadDecoderError::Io(e)) => { + show_warning!("Error while reading {}: {}", path, e); + } } } + + total.max_line_length = max(current_len, total.max_line_length); + Ok(total) } diff --git a/src/uu/wc/src/word_count.rs b/src/uu/wc/src/word_count.rs index 848e64b98..ac9de390c 100644 --- a/src/uu/wc/src/word_count.rs +++ b/src/uu/wc/src/word_count.rs @@ -1,19 +1,5 @@ use std::cmp::max; -use std::iter::Sum; use std::ops::{Add, AddAssign}; -use std::str::from_utf8; - -const CR: u8 = b'\r'; -const LF: u8 = b'\n'; -const SPACE: u8 = b' '; -const TAB: u8 = b'\t'; -const SYN: u8 = 0x16_u8; -const FF: u8 = 0x0C_u8; - -#[inline(always)] -fn is_word_separator(byte: u8) -> bool { - byte == SPACE || byte == TAB || byte == CR || byte == SYN || byte == FF -} #[derive(Debug, Default, Copy, Clone)] pub struct WordCount { @@ -44,76 +30,10 @@ impl AddAssign for WordCount { } } -impl Sum for WordCount { - fn sum(iter: I) -> WordCount - where - I: Iterator, - { - iter.fold(WordCount::default(), |acc, x| acc + x) - } -} - impl WordCount { - /// Count the characters and whitespace-separated words in the given bytes. 
- /// - /// `line` is a slice of bytes that will be decoded as ASCII characters. - fn ascii_word_and_char_count(line: &[u8]) -> (usize, usize) { - let word_count = line.split(|&x| is_word_separator(x)).count(); - let char_count = line.iter().filter(|c| c.is_ascii()).count(); - (word_count, char_count) - } - - /// Create a [`WordCount`] from a sequence of bytes representing a line. - /// - /// If the last byte of `line` encodes a newline character (`\n`), - /// then the [`lines`] field will be set to 1. Otherwise, it will - /// be set to 0. The [`bytes`] field is simply the length of - /// `line`. - /// - /// If `decode_chars` is `false`, the [`chars`] and [`words`] - /// fields will be set to 0. If it is `true`, this function will - /// attempt to decode the bytes first as UTF-8, and failing that, - /// as ASCII. - pub fn from_line(line: &[u8]) -> WordCount { - // GNU 'wc' only counts lines that end in LF as lines - let lines = (*line.last().unwrap() == LF) as usize; - let bytes = line.len(); - let (words, chars) = WordCount::word_and_char_count(line); - // -L is a GNU 'wc' extension so same behavior on LF - let max_line_length = if chars > 0 { chars - lines } else { 0 }; - WordCount { - bytes, - chars, - lines, - words, - max_line_length, - } - } - - /// Count the UTF-8 characters and words in the given string slice. - /// - /// `s` is a string slice that is assumed to be a UTF-8 string. - fn utf8_word_and_char_count(s: &str) -> (usize, usize) { - let word_count = s.split_whitespace().count(); - let char_count = s.chars().count(); - (word_count, char_count) - } - pub fn with_title(self, title: Option<&str>) -> TitledWordCount { TitledWordCount { title, count: self } } - - /// Count the characters and words in the given slice of bytes. - /// - /// `line` is a slice of bytes that will be decoded as UTF-8 - /// characters, or if that fails, as ASCII characters. - fn word_and_char_count(line: &[u8]) -> (usize, usize) { - // try and convert the bytes to UTF-8 first - match from_utf8(line) { - Ok(s) => WordCount::utf8_word_and_char_count(s), - Err(..) 
=> WordCount::ascii_word_and_char_count(line), - } - } } /// This struct supplements the actual word count with an optional title that is diff --git a/tests/by-util/test_wc.rs b/tests/by-util/test_wc.rs index 88c65c997..1c6625ab4 100644 --- a/tests/by-util/test_wc.rs +++ b/tests/by-util/test_wc.rs @@ -53,11 +53,16 @@ fn test_utf8() { .args(&["-lwmcL"]) .pipe_in_fixture("UTF_8_test.txt") .run() - .stdout_is(" 300 4969 22781 22213 79\n"); - // GNU returns " 300 2086 22219 22781 79" - // - // TODO: we should fix the word, character, and byte count to - // match the behavior of GNU wc + .stdout_is(" 303 2119 23025 22457 79\n"); +} + +#[test] +fn test_utf8_extra() { + new_ucmd!() + .arg("-lwmcL") + .pipe_in_fixture("UTF_8_weirdchars.txt") + .run() + .stdout_is(" 25 87 513 442 48\n"); } #[test] diff --git a/tests/fixtures/wc/UTF_8_test.txt b/tests/fixtures/wc/UTF_8_test.txt index a5b5d50e6b61eb9a3b751b3954f83e61bb59db9b..cd0474c82c04fd18c01edf17d80374e9d483709b 100644 GIT binary patch delta 307 zcmeynk@4eZ#tj96j1`j$1Vb1rCI@N@Pu{>QQD4IqU0Pa_nNzHgmtT@k*tdNpfoLrPzkeHWTsgPN$keHmDT2PXhl#{BXP@J!zUZI{^tdNmd z=AWvNl$x5SkeOGUT2zvnqEM2rP+XFklcSKEn4PMi05-5BBUPa&wYW5=q*x&{B{i=k zGdVE_q%yy>NFg(~ASX39HLoPGBr`v6a$>L&W9(!>{@~5dLOU55D<(7QYff$uW}B=n uqQY1)`JlbpDF-d@J@WiaX*|_Agac+AHoHx6cyuPnp`NTHMw3mWb*=vKt}+@F(E7f diff --git a/tests/fixtures/wc/UTF_8_weirdchars.txt b/tests/fixtures/wc/UTF_8_weirdchars.txt new file mode 100644 index 000000000..0c7670f5e --- /dev/null +++ b/tests/fixtures/wc/UTF_8_weirdchars.txt @@ -0,0 +1,25 @@ +zero-width space inbetween these: x​x +and inbetween two spaces: [ ​ ] +and at the end of the line: ​ + +non-breaking space: x x [   ]   + +simple unicode: xµx [ µ ] µ + +wide: xwx [ w ] w + +simple emoji: x👩x [ 👩 ] 👩 + +complex emoji: x👩‍🔬x [ 👩‍🔬 ] 👩‍🔬 + +Hello, world! + +line feed: x x [ ] + +vertical tab: x x [ ] + +horizontal tab: x x [ ] +this should be the longest line: +1234567 12345678 123456781234567812345678 + +Control character: xx [  ]
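
Note (reviewer sketch, not part of the patch): the counting loop added in
word_count_from_reader() can be exercised in isolation. The sketch below uses
the same `utf-8` (BufReadDecoder) and `unicode-width` crates that the patch
adds to Cargo.toml; the in-memory Cursor input is illustrative only, and the
'\r'/'\f'/control-character special cases from the patch are omitted for
brevity.

    use std::io::Cursor;

    use unicode_width::UnicodeWidthChar;
    use utf8::{BufReadDecoder, BufReadDecoderError};

    fn main() {
        let input = Cursor::new("naïve\twords on a line\nsecond line\n".as_bytes());
        let mut decoder = BufReadDecoder::new(input);

        let (mut lines, mut words, mut chars, mut bytes) = (0usize, 0usize, 0usize, 0usize);
        let (mut max_len, mut current_len) = (0usize, 0usize);
        let mut in_word = false;

        // Decode in chunks; only valid UTF-8 reaches the char-based counters.
        while let Some(chunk) = decoder.next_strict() {
            match chunk {
                Ok(text) => {
                    for ch in text.chars() {
                        if ch.is_whitespace() {
                            in_word = false;
                        } else if !in_word {
                            in_word = true;
                            words += 1;
                        }
                        match ch {
                            '\n' => {
                                max_len = max_len.max(current_len);
                                current_len = 0;
                                lines += 1;
                            }
                            // Tabs advance the display column to the next multiple of 8.
                            '\t' => current_len = current_len - current_len % 8 + 8,
                            _ => current_len += ch.width().unwrap_or(0),
                        }
                        chars += 1;
                    }
                    bytes += text.len();
                }
                // Invalid byte sequences only advance the byte count, as in GNU wc.
                Err(BufReadDecoderError::InvalidByteSequence(seq)) => bytes += seq.len(),
                Err(BufReadDecoderError::Io(e)) => {
                    eprintln!("read error: {}", e);
                    break;
                }
            }
        }
        max_len = max_len.max(current_len);

        println!("{} {} {} {} {}", lines, words, chars, bytes, max_len);
    }

For this input the sketch should print "2 7 34 35 23" (lines, words, chars,
bytes, max line length), the same order wc -lwmcL uses.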