mirror of
https://github.com/RGBCube/uutils-coreutils
synced 2025-07-28 11:37:44 +00:00
wc: Optimize, improve correctness
- Reuse allocations for read lines
- Increase splice size
- Check if /dev/null was opened correctly
- Do not discard read bytes after I/O error
- Add fast line counting with bytecount
This commit is contained in:
parent
c756878b20
commit
48437fc49d
6 changed files with 88 additions and 45 deletions
7
Cargo.lock
generated
7
Cargo.lock
generated
|
@ -188,6 +188,12 @@ dependencies = [
|
||||||
"utf8-width",
|
"utf8-width",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "bytecount"
|
||||||
|
version = "0.6.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "72feb31ffc86498dacdbd0fcebb56138e7177a8cc5cea4516031d15ae85a742e"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "byteorder"
|
name = "byteorder"
|
||||||
version = "1.4.3"
|
version = "1.4.3"
|
||||||
|
@ -3110,6 +3116,7 @@ dependencies = [
|
||||||
name = "uu_wc"
|
name = "uu_wc"
|
||||||
version = "0.0.7"
|
version = "0.0.7"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
|
"bytecount",
|
||||||
"clap",
|
"clap",
|
||||||
"libc",
|
"libc",
|
||||||
"nix 0.20.0",
|
"nix 0.20.0",
|
||||||
|
|
|
@ -19,6 +19,7 @@ clap = { version = "2.33", features = ["wrap_help"] }
|
||||||
uucore = { version=">=0.0.9", package="uucore", path="../../uucore" }
|
uucore = { version=">=0.0.9", package="uucore", path="../../uucore" }
|
||||||
uucore_procs = { version=">=0.0.6", package="uucore_procs", path="../../uucore_procs" }
|
uucore_procs = { version=">=0.0.6", package="uucore_procs", path="../../uucore_procs" }
|
||||||
thiserror = "1.0"
|
thiserror = "1.0"
|
||||||
|
bytecount = "0.6.2"
|
||||||
|
|
||||||
[target.'cfg(unix)'.dependencies]
|
[target.'cfg(unix)'.dependencies]
|
||||||
nix = "0.20"
|
nix = "0.20"
|
||||||
|
|
|
@ -1,13 +1,15 @@
|
||||||
|
use crate::word_count::WordCount;
|
||||||
|
|
||||||
use super::{WcResult, WordCountable};
|
use super::{WcResult, WordCountable};
|
||||||
|
|
||||||
#[cfg(any(target_os = "linux", target_os = "android"))]
|
#[cfg(any(target_os = "linux", target_os = "android"))]
|
||||||
use std::fs::{File, OpenOptions};
|
use std::fs::{File, OpenOptions};
|
||||||
use std::io::ErrorKind;
|
use std::io::{ErrorKind, Read};
|
||||||
|
|
||||||
#[cfg(unix)]
|
#[cfg(unix)]
|
||||||
use libc::S_IFREG;
|
use libc::S_IFREG;
|
||||||
#[cfg(unix)]
|
#[cfg(unix)]
|
||||||
use nix::sys::stat::fstat;
|
use nix::sys::stat;
|
||||||
#[cfg(any(target_os = "linux", target_os = "android"))]
|
#[cfg(any(target_os = "linux", target_os = "android"))]
|
||||||
use std::os::unix::io::{AsRawFd, FromRawFd, RawFd};
|
use std::os::unix::io::{AsRawFd, FromRawFd, RawFd};
|
||||||
|
|
||||||
|
@ -18,7 +20,8 @@ use nix::fcntl::{splice, SpliceFFlags};
|
||||||
#[cfg(any(target_os = "linux", target_os = "android"))]
|
#[cfg(any(target_os = "linux", target_os = "android"))]
|
||||||
use nix::unistd::pipe;
|
use nix::unistd::pipe;
|
||||||
|
|
||||||
const BUF_SIZE: usize = 16384;
|
const BUF_SIZE: usize = 16 * 1024;
|
||||||
|
const SPLICE_SIZE: usize = 128 * 1024;
|
||||||
|
|
||||||
/// Splice wrapper which handles short writes
|
/// Splice wrapper which handles short writes
|
||||||
#[cfg(any(target_os = "linux", target_os = "android"))]
|
#[cfg(any(target_os = "linux", target_os = "android"))]
|
||||||
|
@ -37,15 +40,24 @@ fn splice_exact(read_fd: RawFd, write_fd: RawFd, num_bytes: usize) -> nix::Resul
|
||||||
|
|
||||||
/// This is a Linux-specific function to count the number of bytes using the
|
/// This is a Linux-specific function to count the number of bytes using the
|
||||||
/// `splice` system call, which is faster than using `read`.
|
/// `splice` system call, which is faster than using `read`.
|
||||||
|
///
|
||||||
|
/// On error it returns the number of bytes it did manage to read, since the
|
||||||
|
/// caller will fall back to a simpler method.
|
||||||
#[inline]
|
#[inline]
|
||||||
#[cfg(any(target_os = "linux", target_os = "android"))]
|
#[cfg(any(target_os = "linux", target_os = "android"))]
|
||||||
fn count_bytes_using_splice(fd: RawFd) -> nix::Result<usize> {
|
fn count_bytes_using_splice(fd: RawFd) -> Result<usize, usize> {
|
||||||
let null_file = OpenOptions::new()
|
let null_file = OpenOptions::new()
|
||||||
.write(true)
|
.write(true)
|
||||||
.open("/dev/null")
|
.open("/dev/null")
|
||||||
.map_err(|_| nix::Error::last())?;
|
.map_err(|_| 0_usize)?;
|
||||||
let null = null_file.as_raw_fd();
|
let null = null_file.as_raw_fd();
|
||||||
let (pipe_rd, pipe_wr) = pipe()?;
|
let null_rdev = stat::fstat(null).map_err(|_| 0_usize)?.st_rdev;
|
||||||
|
if (stat::major(null_rdev), stat::minor(null_rdev)) != (1, 3) {
|
||||||
|
// This is not a proper /dev/null, writing to it is probably bad
|
||||||
|
// Bit of an edge case, but it has been known to happen
|
||||||
|
return Err(0);
|
||||||
|
}
|
||||||
|
let (pipe_rd, pipe_wr) = pipe().map_err(|_| 0_usize)?;
|
||||||
|
|
||||||
// Ensure the pipe is closed when the function returns.
|
// Ensure the pipe is closed when the function returns.
|
||||||
// SAFETY: The file descriptors do not have other owners.
|
// SAFETY: The file descriptors do not have other owners.
|
||||||
|
@ -53,12 +65,16 @@ fn count_bytes_using_splice(fd: RawFd) -> nix::Result<usize> {
|
||||||
|
|
||||||
let mut byte_count = 0;
|
let mut byte_count = 0;
|
||||||
loop {
|
loop {
|
||||||
let res = splice(fd, None, pipe_wr, None, BUF_SIZE, SpliceFFlags::empty())?;
|
match splice(fd, None, pipe_wr, None, SPLICE_SIZE, SpliceFFlags::empty()) {
|
||||||
if res == 0 {
|
Ok(0) => break,
|
||||||
break;
|
Ok(res) => {
|
||||||
}
|
byte_count += res;
|
||||||
byte_count += res;
|
if splice_exact(pipe_rd, null, res).is_err() {
|
||||||
splice_exact(pipe_rd, null, res)?;
|
return Err(byte_count);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(_) => return Err(byte_count),
|
||||||
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(byte_count)
|
Ok(byte_count)
|
||||||
|
@ -73,10 +89,12 @@ fn count_bytes_using_splice(fd: RawFd) -> nix::Result<usize> {
|
||||||
/// other things such as lines and words.
|
/// other things such as lines and words.
|
||||||
#[inline]
|
#[inline]
|
||||||
pub(crate) fn count_bytes_fast<T: WordCountable>(handle: &mut T) -> WcResult<usize> {
|
pub(crate) fn count_bytes_fast<T: WordCountable>(handle: &mut T) -> WcResult<usize> {
|
||||||
|
let mut byte_count = 0;
|
||||||
|
|
||||||
#[cfg(unix)]
|
#[cfg(unix)]
|
||||||
{
|
{
|
||||||
let fd = handle.as_raw_fd();
|
let fd = handle.as_raw_fd();
|
||||||
if let Ok(stat) = fstat(fd) {
|
if let Ok(stat) = stat::fstat(fd) {
|
||||||
// If the file is regular, then the `st_size` should hold
|
// If the file is regular, then the `st_size` should hold
|
||||||
// the file's size in bytes.
|
// the file's size in bytes.
|
||||||
if (stat.st_mode & S_IFREG) != 0 {
|
if (stat.st_mode & S_IFREG) != 0 {
|
||||||
|
@ -87,8 +105,9 @@ pub(crate) fn count_bytes_fast<T: WordCountable>(handle: &mut T) -> WcResult<usi
|
||||||
// Else, if we're on Linux and our file is a FIFO pipe
|
// Else, if we're on Linux and our file is a FIFO pipe
|
||||||
// (or stdin), we use splice to count the number of bytes.
|
// (or stdin), we use splice to count the number of bytes.
|
||||||
if (stat.st_mode & S_IFIFO) != 0 {
|
if (stat.st_mode & S_IFIFO) != 0 {
|
||||||
if let Ok(n) = count_bytes_using_splice(fd) {
|
match count_bytes_using_splice(fd) {
|
||||||
return Ok(n);
|
Ok(n) => return Ok(n),
|
||||||
|
Err(n) => byte_count = n,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -97,7 +116,6 @@ pub(crate) fn count_bytes_fast<T: WordCountable>(handle: &mut T) -> WcResult<usi
|
||||||
|
|
||||||
// Fall back on `read`, but without the overhead of counting words and lines.
|
// Fall back on `read`, but without the overhead of counting words and lines.
|
||||||
let mut buf = [0_u8; BUF_SIZE];
|
let mut buf = [0_u8; BUF_SIZE];
|
||||||
let mut byte_count = 0;
|
|
||||||
loop {
|
loop {
|
||||||
match handle.read(&mut buf) {
|
match handle.read(&mut buf) {
|
||||||
Ok(0) => return Ok(byte_count),
|
Ok(0) => return Ok(byte_count),
|
||||||
|
@ -109,3 +127,19 @@ pub(crate) fn count_bytes_fast<T: WordCountable>(handle: &mut T) -> WcResult<usi
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub(crate) fn count_bytes_and_lines_fast<R: Read>(handle: &mut R) -> WcResult<WordCount> {
|
||||||
|
let mut total = WordCount::default();
|
||||||
|
let mut buf = [0; BUF_SIZE];
|
||||||
|
loop {
|
||||||
|
match handle.read(&mut buf) {
|
||||||
|
Ok(0) => return Ok(total),
|
||||||
|
Ok(n) => {
|
||||||
|
total.bytes += n;
|
||||||
|
total.lines += bytecount::count(&buf[..n], b'\n');
|
||||||
|
}
|
||||||
|
Err(ref e) if e.kind() == ErrorKind::Interrupted => continue,
|
||||||
|
Err(e) => return Err(e.into()),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -28,7 +28,7 @@ impl WordCountable for StdinLock<'_> {
|
||||||
where
|
where
|
||||||
Self: Sized,
|
Self: Sized,
|
||||||
{
|
{
|
||||||
Lines { buf: self }
|
Lines::new(self)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
impl WordCountable for File {
|
impl WordCountable for File {
|
||||||
|
@ -38,9 +38,7 @@ impl WordCountable for File {
|
||||||
where
|
where
|
||||||
Self: Sized,
|
Self: Sized,
|
||||||
{
|
{
|
||||||
Lines {
|
Lines::new(BufReader::new(self))
|
||||||
buf: BufReader::new(self),
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -53,19 +51,25 @@ impl WordCountable for File {
|
||||||
/// [`io::Lines`]:: io::Lines
|
/// [`io::Lines`]:: io::Lines
|
||||||
pub struct Lines<B> {
|
pub struct Lines<B> {
|
||||||
buf: B,
|
buf: B,
|
||||||
|
line: Vec<u8>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<B: BufRead> Iterator for Lines<B> {
|
impl<B: BufRead> Lines<B> {
|
||||||
type Item = io::Result<Vec<u8>>;
|
fn new(reader: B) -> Self {
|
||||||
|
Lines {
|
||||||
|
buf: reader,
|
||||||
|
line: Vec::new(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
fn next(&mut self) -> Option<Self::Item> {
|
pub fn next(&mut self) -> Option<io::Result<&[u8]>> {
|
||||||
let mut line = Vec::new();
|
self.line.clear();
|
||||||
|
|
||||||
// reading from a TTY seems to raise a condition on, rather than return Some(0) like a file.
|
// reading from a TTY seems to raise a condition on, rather than return Some(0) like a file.
|
||||||
// hence the option wrapped in a result here
|
// hence the option wrapped in a result here
|
||||||
match self.buf.read_until(b'\n', &mut line) {
|
match self.buf.read_until(b'\n', &mut self.line) {
|
||||||
Ok(0) => None,
|
Ok(0) => None,
|
||||||
Ok(_n) => Some(Ok(line)),
|
Ok(_n) => Some(Ok(&self.line)),
|
||||||
Err(e) => Some(Err(e)),
|
Err(e) => Some(Err(e)),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -8,10 +8,10 @@
|
||||||
#[macro_use]
|
#[macro_use]
|
||||||
extern crate uucore;
|
extern crate uucore;
|
||||||
|
|
||||||
mod count_bytes;
|
mod count_fast;
|
||||||
mod countable;
|
mod countable;
|
||||||
mod word_count;
|
mod word_count;
|
||||||
use count_bytes::count_bytes_fast;
|
use count_fast::{count_bytes_and_lines_fast, count_bytes_fast};
|
||||||
use countable::WordCountable;
|
use countable::WordCountable;
|
||||||
use word_count::{TitledWordCount, WordCount};
|
use word_count::{TitledWordCount, WordCount};
|
||||||
|
|
||||||
|
@ -220,19 +220,20 @@ fn word_count_from_reader<T: WordCountable>(
|
||||||
// we do not need to decode the byte stream if we're only counting bytes/newlines
|
// we do not need to decode the byte stream if we're only counting bytes/newlines
|
||||||
let decode_chars = settings.show_chars || settings.show_words || settings.show_max_line_length;
|
let decode_chars = settings.show_chars || settings.show_words || settings.show_max_line_length;
|
||||||
|
|
||||||
|
if !decode_chars {
|
||||||
|
return count_bytes_and_lines_fast(&mut reader);
|
||||||
|
}
|
||||||
|
|
||||||
// Sum the WordCount for each line. Show a warning for each line
|
// Sum the WordCount for each line. Show a warning for each line
|
||||||
// that results in an IO error when trying to read it.
|
// that results in an IO error when trying to read it.
|
||||||
let total = reader
|
let mut lines = reader.lines();
|
||||||
.lines()
|
let mut total = WordCount::default();
|
||||||
.filter_map(|res| match res {
|
while let Some(res) = lines.next() {
|
||||||
Ok(line) => Some(line),
|
match res {
|
||||||
Err(e) => {
|
Ok(line) => total += WordCount::from_line(line),
|
||||||
show_warning!("Error while reading {}: {}", path, e);
|
Err(e) => show_warning!("Error while reading {}: {}", path, e),
|
||||||
None
|
}
|
||||||
}
|
}
|
||||||
})
|
|
||||||
.map(|line| WordCount::from_line(&line, decode_chars))
|
|
||||||
.sum();
|
|
||||||
Ok(total)
|
Ok(total)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -74,15 +74,11 @@ impl WordCount {
|
||||||
/// fields will be set to 0. If it is `true`, this function will
|
/// fields will be set to 0. If it is `true`, this function will
|
||||||
/// attempt to decode the bytes first as UTF-8, and failing that,
|
/// attempt to decode the bytes first as UTF-8, and failing that,
|
||||||
/// as ASCII.
|
/// as ASCII.
|
||||||
pub fn from_line(line: &[u8], decode_chars: bool) -> WordCount {
|
pub fn from_line(line: &[u8]) -> WordCount {
|
||||||
// GNU 'wc' only counts lines that end in LF as lines
|
// GNU 'wc' only counts lines that end in LF as lines
|
||||||
let lines = (*line.last().unwrap() == LF) as usize;
|
let lines = (*line.last().unwrap() == LF) as usize;
|
||||||
let bytes = line.len();
|
let bytes = line.len();
|
||||||
let (words, chars) = if decode_chars {
|
let (words, chars) = WordCount::word_and_char_count(line);
|
||||||
WordCount::word_and_char_count(line)
|
|
||||||
} else {
|
|
||||||
(0, 0)
|
|
||||||
};
|
|
||||||
// -L is a GNU 'wc' extension so same behavior on LF
|
// -L is a GNU 'wc' extension so same behavior on LF
|
||||||
let max_line_length = if chars > 0 { chars - lines } else { 0 };
|
let max_line_length = if chars > 0 { chars - lines } else { 0 };
|
||||||
WordCount {
|
WordCount {
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue