mirror of
https://github.com/RGBCube/uutils-coreutils
synced 2025-07-28 03:27:44 +00:00
wc: Do a chunked read with proper UTF-8 handling
This brings the results mostly in line with GNU wc and solves nasty behavior with long lines.
This commit is contained in:
parent
48437fc49d
commit
6f7d740592
8 changed files with 105 additions and 138 deletions
8
Cargo.lock
generated
8
Cargo.lock
generated
|
@ -2046,6 +2046,12 @@ dependencies = [
|
||||||
"log",
|
"log",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "utf-8"
|
||||||
|
version = "0.7.6"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "utf8-width"
|
name = "utf8-width"
|
||||||
version = "0.1.5"
|
version = "0.1.5"
|
||||||
|
@ -3121,6 +3127,8 @@ dependencies = [
|
||||||
"libc",
|
"libc",
|
||||||
"nix 0.20.0",
|
"nix 0.20.0",
|
||||||
"thiserror",
|
"thiserror",
|
||||||
|
"unicode-width",
|
||||||
|
"utf-8",
|
||||||
"uucore",
|
"uucore",
|
||||||
"uucore_procs",
|
"uucore_procs",
|
||||||
]
|
]
|
||||||
|
|
|
@ -20,6 +20,8 @@ uucore = { version=">=0.0.9", package="uucore", path="../../uucore" }
|
||||||
uucore_procs = { version=">=0.0.6", package="uucore_procs", path="../../uucore_procs" }
|
uucore_procs = { version=">=0.0.6", package="uucore_procs", path="../../uucore_procs" }
|
||||||
thiserror = "1.0"
|
thiserror = "1.0"
|
||||||
bytecount = "0.6.2"
|
bytecount = "0.6.2"
|
||||||
|
utf-8 = "0.7.6"
|
||||||
|
unicode-width = "0.1.8"
|
||||||
|
|
||||||
[target.'cfg(unix)'.dependencies]
|
[target.'cfg(unix)'.dependencies]
|
||||||
nix = "0.20"
|
nix = "0.20"
|
||||||
|
|
|
@ -4,7 +4,7 @@
|
||||||
//! for some common file-like objects. Use the [`WordCountable::lines`]
|
//! for some common file-like objects. Use the [`WordCountable::lines`]
|
||||||
//! method to get an iterator over lines of a file-like object.
|
//! method to get an iterator over lines of a file-like object.
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::io::{self, BufRead, BufReader, Read, StdinLock};
|
use std::io::{BufRead, BufReader, Read, StdinLock};
|
||||||
|
|
||||||
#[cfg(unix)]
|
#[cfg(unix)]
|
||||||
use std::os::unix::io::AsRawFd;
|
use std::os::unix::io::AsRawFd;
|
||||||
|
@ -12,65 +12,26 @@ use std::os::unix::io::AsRawFd;
|
||||||
#[cfg(unix)]
|
#[cfg(unix)]
|
||||||
pub trait WordCountable: AsRawFd + Read {
|
pub trait WordCountable: AsRawFd + Read {
|
||||||
type Buffered: BufRead;
|
type Buffered: BufRead;
|
||||||
fn lines(self) -> Lines<Self::Buffered>;
|
fn buffered(self) -> Self::Buffered;
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(not(unix))]
|
#[cfg(not(unix))]
|
||||||
pub trait WordCountable: Read {
|
pub trait WordCountable: Read {
|
||||||
type Buffered: BufRead;
|
type Buffered: BufRead;
|
||||||
fn lines(self) -> Lines<Self::Buffered>;
|
fn buffered(self) -> Self::Buffered;
|
||||||
}
|
}
|
||||||
|
|
||||||
impl WordCountable for StdinLock<'_> {
|
impl WordCountable for StdinLock<'_> {
|
||||||
type Buffered = Self;
|
type Buffered = Self;
|
||||||
|
|
||||||
fn lines(self) -> Lines<Self::Buffered>
|
fn buffered(self) -> Self::Buffered {
|
||||||
where
|
self
|
||||||
Self: Sized,
|
|
||||||
{
|
|
||||||
Lines::new(self)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
impl WordCountable for File {
|
impl WordCountable for File {
|
||||||
type Buffered = BufReader<Self>;
|
type Buffered = BufReader<Self>;
|
||||||
|
|
||||||
fn lines(self) -> Lines<Self::Buffered>
|
fn buffered(self) -> Self::Buffered {
|
||||||
where
|
BufReader::new(self)
|
||||||
Self: Sized,
|
|
||||||
{
|
|
||||||
Lines::new(BufReader::new(self))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// An iterator over the lines of an instance of `BufRead`.
|
|
||||||
///
|
|
||||||
/// Similar to [`io::Lines`] but yields each line as a `Vec<u8>` and
|
|
||||||
/// includes the newline character (`\n`, the `0xA` byte) that
|
|
||||||
/// terminates the line.
|
|
||||||
///
|
|
||||||
/// [`io::Lines`]:: io::Lines
|
|
||||||
pub struct Lines<B> {
|
|
||||||
buf: B,
|
|
||||||
line: Vec<u8>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<B: BufRead> Lines<B> {
|
|
||||||
fn new(reader: B) -> Self {
|
|
||||||
Lines {
|
|
||||||
buf: reader,
|
|
||||||
line: Vec::new(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn next(&mut self) -> Option<io::Result<&[u8]>> {
|
|
||||||
self.line.clear();
|
|
||||||
|
|
||||||
// reading from a TTY seems to raise a condition on, rather than return Some(0) like a file.
|
|
||||||
// hence the option wrapped in a result here
|
|
||||||
match self.buf.read_until(b'\n', &mut self.line) {
|
|
||||||
Ok(0) => None,
|
|
||||||
Ok(_n) => Some(Ok(&self.line)),
|
|
||||||
Err(e) => Some(Err(e)),
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -13,11 +13,14 @@ mod countable;
|
||||||
mod word_count;
|
mod word_count;
|
||||||
use count_fast::{count_bytes_and_lines_fast, count_bytes_fast};
|
use count_fast::{count_bytes_and_lines_fast, count_bytes_fast};
|
||||||
use countable::WordCountable;
|
use countable::WordCountable;
|
||||||
|
use unicode_width::UnicodeWidthChar;
|
||||||
|
use utf8::{BufReadDecoder, BufReadDecoderError};
|
||||||
use word_count::{TitledWordCount, WordCount};
|
use word_count::{TitledWordCount, WordCount};
|
||||||
|
|
||||||
use clap::{crate_version, App, Arg, ArgMatches};
|
use clap::{crate_version, App, Arg, ArgMatches};
|
||||||
use thiserror::Error;
|
use thiserror::Error;
|
||||||
|
|
||||||
|
use std::cmp::max;
|
||||||
use std::fs::{self, File};
|
use std::fs::{self, File};
|
||||||
use std::io::{self, ErrorKind, Write};
|
use std::io::{self, ErrorKind, Write};
|
||||||
use std::path::Path;
|
use std::path::Path;
|
||||||
|
@ -224,16 +227,59 @@ fn word_count_from_reader<T: WordCountable>(
|
||||||
return count_bytes_and_lines_fast(&mut reader);
|
return count_bytes_and_lines_fast(&mut reader);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Sum the WordCount for each line. Show a warning for each line
|
|
||||||
// that results in an IO error when trying to read it.
|
|
||||||
let mut lines = reader.lines();
|
|
||||||
let mut total = WordCount::default();
|
let mut total = WordCount::default();
|
||||||
while let Some(res) = lines.next() {
|
let mut reader = BufReadDecoder::new(reader.buffered());
|
||||||
match res {
|
let mut in_word = false;
|
||||||
Ok(line) => total += WordCount::from_line(line),
|
let mut current_len = 0;
|
||||||
Err(e) => show_warning!("Error while reading {}: {}", path, e),
|
|
||||||
|
while let Some(chunk) = reader.next_strict() {
|
||||||
|
match chunk {
|
||||||
|
Ok(text) => {
|
||||||
|
for ch in text.chars() {
|
||||||
|
if ch.is_whitespace() {
|
||||||
|
in_word = false;
|
||||||
|
} else if ch.is_ascii_control() {
|
||||||
|
// These count as characters but do not affect the word state
|
||||||
|
} else if !in_word {
|
||||||
|
in_word = true;
|
||||||
|
total.words += 1;
|
||||||
|
}
|
||||||
|
match ch {
|
||||||
|
'\n' => {
|
||||||
|
total.max_line_length = max(current_len, total.max_line_length);
|
||||||
|
current_len = 0;
|
||||||
|
total.lines += 1;
|
||||||
|
}
|
||||||
|
// '\x0c' = '\f'
|
||||||
|
'\r' | '\x0c' => {
|
||||||
|
total.max_line_length = max(current_len, total.max_line_length);
|
||||||
|
current_len = 0;
|
||||||
|
}
|
||||||
|
'\t' => {
|
||||||
|
current_len -= current_len % 8;
|
||||||
|
current_len += 8;
|
||||||
|
}
|
||||||
|
_ => {
|
||||||
|
current_len += ch.width().unwrap_or(0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
total.chars += 1;
|
||||||
|
}
|
||||||
|
total.bytes += text.len();
|
||||||
|
}
|
||||||
|
Err(BufReadDecoderError::InvalidByteSequence(bytes)) => {
|
||||||
|
// GNU wc treats invalid data as neither word nor char nor whitespace,
|
||||||
|
// so no other counters are affected
|
||||||
|
total.bytes += bytes.len();
|
||||||
|
}
|
||||||
|
Err(BufReadDecoderError::Io(e)) => {
|
||||||
|
show_warning!("Error while reading {}: {}", path, e);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
total.max_line_length = max(current_len, total.max_line_length);
|
||||||
|
|
||||||
Ok(total)
|
Ok(total)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1,19 +1,5 @@
|
||||||
use std::cmp::max;
|
use std::cmp::max;
|
||||||
use std::iter::Sum;
|
|
||||||
use std::ops::{Add, AddAssign};
|
use std::ops::{Add, AddAssign};
|
||||||
use std::str::from_utf8;
|
|
||||||
|
|
||||||
const CR: u8 = b'\r';
|
|
||||||
const LF: u8 = b'\n';
|
|
||||||
const SPACE: u8 = b' ';
|
|
||||||
const TAB: u8 = b'\t';
|
|
||||||
const SYN: u8 = 0x16_u8;
|
|
||||||
const FF: u8 = 0x0C_u8;
|
|
||||||
|
|
||||||
#[inline(always)]
|
|
||||||
fn is_word_separator(byte: u8) -> bool {
|
|
||||||
byte == SPACE || byte == TAB || byte == CR || byte == SYN || byte == FF
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug, Default, Copy, Clone)]
|
#[derive(Debug, Default, Copy, Clone)]
|
||||||
pub struct WordCount {
|
pub struct WordCount {
|
||||||
|
@ -44,76 +30,10 @@ impl AddAssign for WordCount {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Sum for WordCount {
|
|
||||||
fn sum<I>(iter: I) -> WordCount
|
|
||||||
where
|
|
||||||
I: Iterator<Item = WordCount>,
|
|
||||||
{
|
|
||||||
iter.fold(WordCount::default(), |acc, x| acc + x)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl WordCount {
|
impl WordCount {
|
||||||
/// Count the characters and whitespace-separated words in the given bytes.
|
|
||||||
///
|
|
||||||
/// `line` is a slice of bytes that will be decoded as ASCII characters.
|
|
||||||
fn ascii_word_and_char_count(line: &[u8]) -> (usize, usize) {
|
|
||||||
let word_count = line.split(|&x| is_word_separator(x)).count();
|
|
||||||
let char_count = line.iter().filter(|c| c.is_ascii()).count();
|
|
||||||
(word_count, char_count)
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Create a [`WordCount`] from a sequence of bytes representing a line.
|
|
||||||
///
|
|
||||||
/// If the last byte of `line` encodes a newline character (`\n`),
|
|
||||||
/// then the [`lines`] field will be set to 1. Otherwise, it will
|
|
||||||
/// be set to 0. The [`bytes`] field is simply the length of
|
|
||||||
/// `line`.
|
|
||||||
///
|
|
||||||
/// If `decode_chars` is `false`, the [`chars`] and [`words`]
|
|
||||||
/// fields will be set to 0. If it is `true`, this function will
|
|
||||||
/// attempt to decode the bytes first as UTF-8, and failing that,
|
|
||||||
/// as ASCII.
|
|
||||||
pub fn from_line(line: &[u8]) -> WordCount {
|
|
||||||
// GNU 'wc' only counts lines that end in LF as lines
|
|
||||||
let lines = (*line.last().unwrap() == LF) as usize;
|
|
||||||
let bytes = line.len();
|
|
||||||
let (words, chars) = WordCount::word_and_char_count(line);
|
|
||||||
// -L is a GNU 'wc' extension so same behavior on LF
|
|
||||||
let max_line_length = if chars > 0 { chars - lines } else { 0 };
|
|
||||||
WordCount {
|
|
||||||
bytes,
|
|
||||||
chars,
|
|
||||||
lines,
|
|
||||||
words,
|
|
||||||
max_line_length,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Count the UTF-8 characters and words in the given string slice.
|
|
||||||
///
|
|
||||||
/// `s` is a string slice that is assumed to be a UTF-8 string.
|
|
||||||
fn utf8_word_and_char_count(s: &str) -> (usize, usize) {
|
|
||||||
let word_count = s.split_whitespace().count();
|
|
||||||
let char_count = s.chars().count();
|
|
||||||
(word_count, char_count)
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn with_title(self, title: Option<&str>) -> TitledWordCount {
|
pub fn with_title(self, title: Option<&str>) -> TitledWordCount {
|
||||||
TitledWordCount { title, count: self }
|
TitledWordCount { title, count: self }
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Count the characters and words in the given slice of bytes.
|
|
||||||
///
|
|
||||||
/// `line` is a slice of bytes that will be decoded as UTF-8
|
|
||||||
/// characters, or if that fails, as ASCII characters.
|
|
||||||
fn word_and_char_count(line: &[u8]) -> (usize, usize) {
|
|
||||||
// try and convert the bytes to UTF-8 first
|
|
||||||
match from_utf8(line) {
|
|
||||||
Ok(s) => WordCount::utf8_word_and_char_count(s),
|
|
||||||
Err(..) => WordCount::ascii_word_and_char_count(line),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// This struct supplements the actual word count with an optional title that is
|
/// This struct supplements the actual word count with an optional title that is
|
||||||
|
|
|
@ -53,11 +53,16 @@ fn test_utf8() {
|
||||||
.args(&["-lwmcL"])
|
.args(&["-lwmcL"])
|
||||||
.pipe_in_fixture("UTF_8_test.txt")
|
.pipe_in_fixture("UTF_8_test.txt")
|
||||||
.run()
|
.run()
|
||||||
.stdout_is(" 300 4969 22781 22213 79\n");
|
.stdout_is(" 303 2119 23025 22457 79\n");
|
||||||
// GNU returns " 300 2086 22219 22781 79"
|
}
|
||||||
//
|
|
||||||
// TODO: we should fix the word, character, and byte count to
|
#[test]
|
||||||
// match the behavior of GNU wc
|
fn test_utf8_extra() {
|
||||||
|
new_ucmd!()
|
||||||
|
.arg("-lwmcL")
|
||||||
|
.pipe_in_fixture("UTF_8_weirdchars.txt")
|
||||||
|
.run()
|
||||||
|
.stdout_is(" 25 87 513 442 48\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
|
|
BIN
tests/fixtures/wc/UTF_8_test.txt
vendored
BIN
tests/fixtures/wc/UTF_8_test.txt
vendored
Binary file not shown.
25
tests/fixtures/wc/UTF_8_weirdchars.txt
vendored
Normal file
25
tests/fixtures/wc/UTF_8_weirdchars.txt
vendored
Normal file
|
@ -0,0 +1,25 @@
|
||||||
|
zero-width space inbetween these: xx
|
||||||
|
and inbetween two spaces: [ ]
|
||||||
|
and at the end of the line:
|
||||||
|
|
||||||
|
non-breaking space: x x [ ]
|
||||||
|
|
||||||
|
simple unicode: xµx [ µ ] µ
|
||||||
|
|
||||||
|
wide: xwx [ w ] w
|
||||||
|
|
||||||
|
simple emoji: x👩x [ 👩 ] 👩
|
||||||
|
|
||||||
|
complex emoji: x👩🔬x [ 👩🔬 ] 👩🔬
|
||||||
|
|
||||||
|
Hello, world!
|
||||||
|
|
||||||
|
line feed: xx [ ]
|
||||||
|
|
||||||
|
vertical tab: xx [ ]
|
||||||
|
|
||||||
|
horizontal tab: x x [ ]
|
||||||
|
this should be the longest line:
|
||||||
|
1234567 12345678 123456781234567812345678
|
||||||
|
|
||||||
|
Control character: xx [ ]
|
Loading…
Add table
Add a link
Reference in a new issue