1
Fork 0
mirror of https://github.com/RGBCube/uutils-coreutils synced 2025-07-28 11:37:44 +00:00

wc: Optimize, improve correctness

- Reuse allocations for read lines
- Increase splice size
- Check if /dev/null was opened correctly
- Do not discard read bytes after I/O error
- Add fast line counting with bytecount
This commit is contained in:
Jan Verbeek 2021-08-25 11:24:00 +02:00 committed by Michael Debertol
parent c756878b20
commit 48437fc49d
6 changed files with 88 additions and 45 deletions

7
Cargo.lock generated
View file

@ -188,6 +188,12 @@ dependencies = [
"utf8-width", "utf8-width",
] ]
[[package]]
name = "bytecount"
version = "0.6.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "72feb31ffc86498dacdbd0fcebb56138e7177a8cc5cea4516031d15ae85a742e"
[[package]] [[package]]
name = "byteorder" name = "byteorder"
version = "1.4.3" version = "1.4.3"
@ -3110,6 +3116,7 @@ dependencies = [
name = "uu_wc" name = "uu_wc"
version = "0.0.7" version = "0.0.7"
dependencies = [ dependencies = [
"bytecount",
"clap", "clap",
"libc", "libc",
"nix 0.20.0", "nix 0.20.0",

View file

@ -19,6 +19,7 @@ clap = { version = "2.33", features = ["wrap_help"] }
uucore = { version=">=0.0.9", package="uucore", path="../../uucore" } uucore = { version=">=0.0.9", package="uucore", path="../../uucore" }
uucore_procs = { version=">=0.0.6", package="uucore_procs", path="../../uucore_procs" } uucore_procs = { version=">=0.0.6", package="uucore_procs", path="../../uucore_procs" }
thiserror = "1.0" thiserror = "1.0"
bytecount = "0.6.2"
[target.'cfg(unix)'.dependencies] [target.'cfg(unix)'.dependencies]
nix = "0.20" nix = "0.20"

View file

@ -1,13 +1,15 @@
use crate::word_count::WordCount;
use super::{WcResult, WordCountable}; use super::{WcResult, WordCountable};
#[cfg(any(target_os = "linux", target_os = "android"))] #[cfg(any(target_os = "linux", target_os = "android"))]
use std::fs::{File, OpenOptions}; use std::fs::{File, OpenOptions};
use std::io::ErrorKind; use std::io::{ErrorKind, Read};
#[cfg(unix)] #[cfg(unix)]
use libc::S_IFREG; use libc::S_IFREG;
#[cfg(unix)] #[cfg(unix)]
use nix::sys::stat::fstat; use nix::sys::stat;
#[cfg(any(target_os = "linux", target_os = "android"))] #[cfg(any(target_os = "linux", target_os = "android"))]
use std::os::unix::io::{AsRawFd, FromRawFd, RawFd}; use std::os::unix::io::{AsRawFd, FromRawFd, RawFd};
@ -18,7 +20,8 @@ use nix::fcntl::{splice, SpliceFFlags};
#[cfg(any(target_os = "linux", target_os = "android"))] #[cfg(any(target_os = "linux", target_os = "android"))]
use nix::unistd::pipe; use nix::unistd::pipe;
const BUF_SIZE: usize = 16384; const BUF_SIZE: usize = 16 * 1024;
const SPLICE_SIZE: usize = 128 * 1024;
/// Splice wrapper which handles short writes /// Splice wrapper which handles short writes
#[cfg(any(target_os = "linux", target_os = "android"))] #[cfg(any(target_os = "linux", target_os = "android"))]
@ -37,15 +40,24 @@ fn splice_exact(read_fd: RawFd, write_fd: RawFd, num_bytes: usize) -> nix::Resul
/// This is a Linux-specific function to count the number of bytes using the /// This is a Linux-specific function to count the number of bytes using the
/// `splice` system call, which is faster than using `read`. /// `splice` system call, which is faster than using `read`.
///
/// On error it returns the number of bytes it did manage to read, since the
/// caller will fall back to a simpler method.
#[inline] #[inline]
#[cfg(any(target_os = "linux", target_os = "android"))] #[cfg(any(target_os = "linux", target_os = "android"))]
fn count_bytes_using_splice(fd: RawFd) -> nix::Result<usize> { fn count_bytes_using_splice(fd: RawFd) -> Result<usize, usize> {
let null_file = OpenOptions::new() let null_file = OpenOptions::new()
.write(true) .write(true)
.open("/dev/null") .open("/dev/null")
.map_err(|_| nix::Error::last())?; .map_err(|_| 0_usize)?;
let null = null_file.as_raw_fd(); let null = null_file.as_raw_fd();
let (pipe_rd, pipe_wr) = pipe()?; let null_rdev = stat::fstat(null).map_err(|_| 0_usize)?.st_rdev;
if (stat::major(null_rdev), stat::minor(null_rdev)) != (1, 3) {
// This is not a proper /dev/null, writing to it is probably bad
// Bit of an edge case, but it has been known to happen
return Err(0);
}
let (pipe_rd, pipe_wr) = pipe().map_err(|_| 0_usize)?;
// Ensure the pipe is closed when the function returns. // Ensure the pipe is closed when the function returns.
// SAFETY: The file descriptors do not have other owners. // SAFETY: The file descriptors do not have other owners.
@ -53,12 +65,16 @@ fn count_bytes_using_splice(fd: RawFd) -> nix::Result<usize> {
let mut byte_count = 0; let mut byte_count = 0;
loop { loop {
let res = splice(fd, None, pipe_wr, None, BUF_SIZE, SpliceFFlags::empty())?; match splice(fd, None, pipe_wr, None, SPLICE_SIZE, SpliceFFlags::empty()) {
if res == 0 { Ok(0) => break,
break; Ok(res) => {
} byte_count += res;
byte_count += res; if splice_exact(pipe_rd, null, res).is_err() {
splice_exact(pipe_rd, null, res)?; return Err(byte_count);
}
}
Err(_) => return Err(byte_count),
};
} }
Ok(byte_count) Ok(byte_count)
@ -73,10 +89,12 @@ fn count_bytes_using_splice(fd: RawFd) -> nix::Result<usize> {
/// other things such as lines and words. /// other things such as lines and words.
#[inline] #[inline]
pub(crate) fn count_bytes_fast<T: WordCountable>(handle: &mut T) -> WcResult<usize> { pub(crate) fn count_bytes_fast<T: WordCountable>(handle: &mut T) -> WcResult<usize> {
let mut byte_count = 0;
#[cfg(unix)] #[cfg(unix)]
{ {
let fd = handle.as_raw_fd(); let fd = handle.as_raw_fd();
if let Ok(stat) = fstat(fd) { if let Ok(stat) = stat::fstat(fd) {
// If the file is regular, then the `st_size` should hold // If the file is regular, then the `st_size` should hold
// the file's size in bytes. // the file's size in bytes.
if (stat.st_mode & S_IFREG) != 0 { if (stat.st_mode & S_IFREG) != 0 {
@ -87,8 +105,9 @@ pub(crate) fn count_bytes_fast<T: WordCountable>(handle: &mut T) -> WcResult<usi
// Else, if we're on Linux and our file is a FIFO pipe // Else, if we're on Linux and our file is a FIFO pipe
// (or stdin), we use splice to count the number of bytes. // (or stdin), we use splice to count the number of bytes.
if (stat.st_mode & S_IFIFO) != 0 { if (stat.st_mode & S_IFIFO) != 0 {
if let Ok(n) = count_bytes_using_splice(fd) { match count_bytes_using_splice(fd) {
return Ok(n); Ok(n) => return Ok(n),
Err(n) => byte_count = n,
} }
} }
} }
@ -97,7 +116,6 @@ pub(crate) fn count_bytes_fast<T: WordCountable>(handle: &mut T) -> WcResult<usi
// Fall back on `read`, but without the overhead of counting words and lines. // Fall back on `read`, but without the overhead of counting words and lines.
let mut buf = [0_u8; BUF_SIZE]; let mut buf = [0_u8; BUF_SIZE];
let mut byte_count = 0;
loop { loop {
match handle.read(&mut buf) { match handle.read(&mut buf) {
Ok(0) => return Ok(byte_count), Ok(0) => return Ok(byte_count),
@ -109,3 +127,19 @@ pub(crate) fn count_bytes_fast<T: WordCountable>(handle: &mut T) -> WcResult<usi
} }
} }
} }
/// Tallies the number of bytes and the number of `\n` bytes available from
/// `handle`, without decoding the stream or splitting it into lines.
///
/// The input is consumed in chunks of `BUF_SIZE` bytes. Only the `bytes` and
/// `lines` fields of the returned [`WordCount`] are updated here; every other
/// field keeps the value produced by `WordCount::default()`.
///
/// # Errors
///
/// A read that fails with [`ErrorKind::Interrupted`] is retried. Any other
/// read error stops the count and is returned, converted with `into()`.
pub(crate) fn count_bytes_and_lines_fast<R: Read>(handle: &mut R) -> WcResult<WordCount> {
    let mut counts = WordCount::default();
    let mut chunk = [0_u8; BUF_SIZE];
    loop {
        let filled = match handle.read(&mut chunk) {
            // A zero-length read marks the end of the input.
            Ok(0) => break,
            Ok(len) => len,
            // The read was interrupted before any data arrived; try again.
            Err(ref err) if err.kind() == ErrorKind::Interrupted => continue,
            Err(err) => return Err(err.into()),
        };
        counts.bytes += filled;
        // Only the part of the buffer that was just filled holds valid data.
        counts.lines += bytecount::count(&chunk[..filled], b'\n');
    }
    Ok(counts)
}

View file

@ -28,7 +28,7 @@ impl WordCountable for StdinLock<'_> {
where where
Self: Sized, Self: Sized,
{ {
Lines { buf: self } Lines::new(self)
} }
} }
impl WordCountable for File { impl WordCountable for File {
@ -38,9 +38,7 @@ impl WordCountable for File {
where where
Self: Sized, Self: Sized,
{ {
Lines { Lines::new(BufReader::new(self))
buf: BufReader::new(self),
}
} }
} }
@ -53,19 +51,25 @@ impl WordCountable for File {
/// [`io::Lines`]:: io::Lines /// [`io::Lines`]:: io::Lines
pub struct Lines<B> { pub struct Lines<B> {
buf: B, buf: B,
line: Vec<u8>,
} }
impl<B: BufRead> Iterator for Lines<B> { impl<B: BufRead> Lines<B> {
type Item = io::Result<Vec<u8>>; fn new(reader: B) -> Self {
Lines {
buf: reader,
line: Vec::new(),
}
}
fn next(&mut self) -> Option<Self::Item> { pub fn next(&mut self) -> Option<io::Result<&[u8]>> {
let mut line = Vec::new(); self.line.clear();
// reading from a TTY seems to raise a condition on, rather than return Some(0) like a file. // reading from a TTY seems to raise a condition on, rather than return Some(0) like a file.
// hence the option wrapped in a result here // hence the option wrapped in a result here
match self.buf.read_until(b'\n', &mut line) { match self.buf.read_until(b'\n', &mut self.line) {
Ok(0) => None, Ok(0) => None,
Ok(_n) => Some(Ok(line)), Ok(_n) => Some(Ok(&self.line)),
Err(e) => Some(Err(e)), Err(e) => Some(Err(e)),
} }
} }

View file

@ -8,10 +8,10 @@
#[macro_use] #[macro_use]
extern crate uucore; extern crate uucore;
mod count_bytes; mod count_fast;
mod countable; mod countable;
mod word_count; mod word_count;
use count_bytes::count_bytes_fast; use count_fast::{count_bytes_and_lines_fast, count_bytes_fast};
use countable::WordCountable; use countable::WordCountable;
use word_count::{TitledWordCount, WordCount}; use word_count::{TitledWordCount, WordCount};
@ -220,19 +220,20 @@ fn word_count_from_reader<T: WordCountable>(
// we do not need to decode the byte stream if we're only counting bytes/newlines // we do not need to decode the byte stream if we're only counting bytes/newlines
let decode_chars = settings.show_chars || settings.show_words || settings.show_max_line_length; let decode_chars = settings.show_chars || settings.show_words || settings.show_max_line_length;
if !decode_chars {
return count_bytes_and_lines_fast(&mut reader);
}
// Sum the WordCount for each line. Show a warning for each line // Sum the WordCount for each line. Show a warning for each line
// that results in an IO error when trying to read it. // that results in an IO error when trying to read it.
let total = reader let mut lines = reader.lines();
.lines() let mut total = WordCount::default();
.filter_map(|res| match res { while let Some(res) = lines.next() {
Ok(line) => Some(line), match res {
Err(e) => { Ok(line) => total += WordCount::from_line(line),
show_warning!("Error while reading {}: {}", path, e); Err(e) => show_warning!("Error while reading {}: {}", path, e),
None }
} }
})
.map(|line| WordCount::from_line(&line, decode_chars))
.sum();
Ok(total) Ok(total)
} }

View file

@ -74,15 +74,11 @@ impl WordCount {
/// fields will be set to 0. If it is `true`, this function will /// fields will be set to 0. If it is `true`, this function will
/// attempt to decode the bytes first as UTF-8, and failing that, /// attempt to decode the bytes first as UTF-8, and failing that,
/// as ASCII. /// as ASCII.
pub fn from_line(line: &[u8], decode_chars: bool) -> WordCount { pub fn from_line(line: &[u8]) -> WordCount {
// GNU 'wc' only counts lines that end in LF as lines // GNU 'wc' only counts lines that end in LF as lines
let lines = (*line.last().unwrap() == LF) as usize; let lines = (*line.last().unwrap() == LF) as usize;
let bytes = line.len(); let bytes = line.len();
let (words, chars) = if decode_chars { let (words, chars) = WordCount::word_and_char_count(line);
WordCount::word_and_char_count(line)
} else {
(0, 0)
};
// -L is a GNU 'wc' extension so same behavior on LF // -L is a GNU 'wc' extension so same behavior on LF
let max_line_length = if chars > 0 { chars - lines } else { 0 }; let max_line_length = if chars > 0 { chars - lines } else { 0 };
WordCount { WordCount {