1
Fork 0
mirror of https://github.com/RGBCube/uutils-coreutils synced 2025-07-27 11:07:44 +00:00

Merge pull request #7495 from karlmcdowall/wc_perf

wc: Perf gains with the bytecount crate.
This commit is contained in:
Sylvestre Ledru 2025-03-20 09:52:15 +01:00 committed by GitHub
commit 187d3e58b5
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 6 additions and 13 deletions

View file

@@ -26,10 +26,11 @@ output of uutils `cat` into it. Note that GNU `cat` is slower and therefore less
 suitable, and that if a file is given as its input directly (as in
 `wc -c < largefile`) the first strategy kicks in. Try `uucat somefile | wc -c`.
-### Counting lines
+### Counting lines and UTF-8 characters
-In the case of `wc -l` or `wc -cl` the input doesn't have to be decoded. It's
-read in chunks and the `bytecount` crate is used to count the newlines.
+If the flags set are a subset of `-clm` then the input doesn't have to be decoded. The
+input is read in chunks and the `bytecount` crate is used to count the newlines (`-l` flag)
+and/or UTF-8 characters (`-m` flag).
 It's useful to vary the line length in the input. GNU wc seems particularly
 bad at short lines.

View file

@@ -19,7 +19,7 @@ path = "src/wc.rs"
 [dependencies]
 clap = { workspace = true }
 uucore = { workspace = true, features = ["pipes", "quoting-style"] }
-bytecount = { workspace = true }
+bytecount = { workspace = true, features = ["runtime-dispatch-simd"] }
 thiserror = { workspace = true }
 unicode-width = { workspace = true }

View file

@@ -212,11 +212,6 @@ pub(crate) fn count_bytes_chars_and_lines_fast<
 >(
 handle: &mut R,
 ) -> (WordCount, Option<io::Error>) {
-/// Mask of the value bits of a continuation byte
-const CONT_MASK: u8 = 0b0011_1111u8;
-/// Value of the tag bits (tag mask is !CONT_MASK) of a continuation byte
-const TAG_CONT_U8: u8 = 0b1000_0000u8;
 let mut total = WordCount::default();
 let mut buf = [0; BUF_SIZE];
 loop {
@@ -227,10 +222,7 @@ pub(crate) fn count_bytes_chars_and_lines_fast<
 total.bytes += n;
 }
 if COUNT_CHARS {
-total.chars += buf[..n]
-.iter()
-.filter(|&&byte| (byte & !CONT_MASK) != TAG_CONT_U8)
-.count();
+total.chars += bytecount::num_chars(&buf[..n]);
 }
 if COUNT_LINES {
 total.lines += bytecount::count(&buf[..n], b'\n');