1
Fork 0
mirror of https://github.com/RGBCube/uutils-coreutils synced 2025-07-27 11:07:44 +00:00

wc: Perf gains with the bytecount crate.

Issue #7494
Improve performance of the wc app.
 - Use the bytecount::num_chars API to count UTF-8 characters in a file.
 - Enable runtime-dispatch-simd feature in the bytecount crate.
This commit is contained in:
Karl McDowall 2025-03-19 11:56:07 -06:00
parent 2e3da88b78
commit eea6c82305
3 changed files with 6 additions and 13 deletions

View file

@ -26,10 +26,11 @@ output of uutils `cat` into it. Note that GNU `cat` is slower and therefore less
suitable, and that if a file is given as its input directly (as in suitable, and that if a file is given as its input directly (as in
`wc -c < largefile`) the first strategy kicks in. Try `uucat somefile | wc -c`. `wc -c < largefile`) the first strategy kicks in. Try `uucat somefile | wc -c`.
### Counting lines ### Counting lines and UTF-8 characters
In the case of `wc -l` or `wc -cl` the input doesn't have to be decoded. It's If the flags set are a subset of `-clm` then the input doesn't have to be decoded. The
read in chunks and the `bytecount` crate is used to count the newlines. input is read in chunks and the `bytecount` crate is used to count the newlines (`-l` flag)
and/or UTF-8 characters (`-m` flag).
It's useful to vary the line length in the input. GNU wc seems particularly It's useful to vary the line length in the input. GNU wc seems particularly
bad at short lines. bad at short lines.

View file

@ -19,7 +19,7 @@ path = "src/wc.rs"
[dependencies] [dependencies]
clap = { workspace = true } clap = { workspace = true }
uucore = { workspace = true, features = ["pipes", "quoting-style"] } uucore = { workspace = true, features = ["pipes", "quoting-style"] }
bytecount = { workspace = true } bytecount = { workspace = true, features = ["runtime-dispatch-simd"] }
thiserror = { workspace = true } thiserror = { workspace = true }
unicode-width = { workspace = true } unicode-width = { workspace = true }

View file

@ -212,11 +212,6 @@ pub(crate) fn count_bytes_chars_and_lines_fast<
>( >(
handle: &mut R, handle: &mut R,
) -> (WordCount, Option<io::Error>) { ) -> (WordCount, Option<io::Error>) {
/// Mask of the value bits of a continuation byte
const CONT_MASK: u8 = 0b0011_1111u8;
/// Value of the tag bits (tag mask is !CONT_MASK) of a continuation byte
const TAG_CONT_U8: u8 = 0b1000_0000u8;
let mut total = WordCount::default(); let mut total = WordCount::default();
let mut buf = [0; BUF_SIZE]; let mut buf = [0; BUF_SIZE];
loop { loop {
@ -227,10 +222,7 @@ pub(crate) fn count_bytes_chars_and_lines_fast<
total.bytes += n; total.bytes += n;
} }
if COUNT_CHARS { if COUNT_CHARS {
total.chars += buf[..n] total.chars += bytecount::num_chars(&buf[..n]);
.iter()
.filter(|&&byte| (byte & !CONT_MASK) != TAG_CONT_U8)
.count();
} }
if COUNT_LINES { if COUNT_LINES {
total.lines += bytecount::count(&buf[..n], b'\n'); total.lines += bytecount::count(&buf[..n], b'\n');