mirror of
https://github.com/RGBCube/uutils-coreutils
synced 2025-07-27 11:07:44 +00:00
wc: Perf gains with the bytecount crate.
Issue #7494: Improve performance of the wc app. - Use the bytecount::num_chars API to count UTF-8 characters in a file. - Enable the runtime-dispatch-simd feature in the bytecount crate.
This commit is contained in:
parent
2e3da88b78
commit
eea6c82305
3 changed files with 6 additions and 13 deletions
|
@ -26,10 +26,11 @@ output of uutils `cat` into it. Note that GNU `cat` is slower and therefore less
|
||||||
suitable, and that if a file is given as its input directly (as in
|
suitable, and that if a file is given as its input directly (as in
|
||||||
`wc -c < largefile`) the first strategy kicks in. Try `uucat somefile | wc -c`.
|
`wc -c < largefile`) the first strategy kicks in. Try `uucat somefile | wc -c`.
|
||||||
|
|
||||||
### Counting lines
|
### Counting lines and UTF-8 characters
|
||||||
|
|
||||||
In the case of `wc -l` or `wc -cl` the input doesn't have to be decoded. It's
|
If the flags set are a subset of `-clm` then the input doesn't have to be decoded. The
|
||||||
read in chunks and the `bytecount` crate is used to count the newlines.
|
input is read in chunks and the `bytecount` crate is used to count the newlines (`-l` flag)
|
||||||
|
and/or UTF-8 characters (`-m` flag).
|
||||||
|
|
||||||
It's useful to vary the line length in the input. GNU wc seems particularly
|
It's useful to vary the line length in the input. GNU wc seems particularly
|
||||||
bad at short lines.
|
bad at short lines.
|
||||||
|
|
|
@ -19,7 +19,7 @@ path = "src/wc.rs"
|
||||||
[dependencies]
|
[dependencies]
|
||||||
clap = { workspace = true }
|
clap = { workspace = true }
|
||||||
uucore = { workspace = true, features = ["pipes", "quoting-style"] }
|
uucore = { workspace = true, features = ["pipes", "quoting-style"] }
|
||||||
bytecount = { workspace = true }
|
bytecount = { workspace = true, features = ["runtime-dispatch-simd"] }
|
||||||
thiserror = { workspace = true }
|
thiserror = { workspace = true }
|
||||||
unicode-width = { workspace = true }
|
unicode-width = { workspace = true }
|
||||||
|
|
||||||
|
|
|
@ -212,11 +212,6 @@ pub(crate) fn count_bytes_chars_and_lines_fast<
|
||||||
>(
|
>(
|
||||||
handle: &mut R,
|
handle: &mut R,
|
||||||
) -> (WordCount, Option<io::Error>) {
|
) -> (WordCount, Option<io::Error>) {
|
||||||
/// Mask of the value bits of a continuation byte
|
|
||||||
const CONT_MASK: u8 = 0b0011_1111u8;
|
|
||||||
/// Value of the tag bits (tag mask is !CONT_MASK) of a continuation byte
|
|
||||||
const TAG_CONT_U8: u8 = 0b1000_0000u8;
|
|
||||||
|
|
||||||
let mut total = WordCount::default();
|
let mut total = WordCount::default();
|
||||||
let mut buf = [0; BUF_SIZE];
|
let mut buf = [0; BUF_SIZE];
|
||||||
loop {
|
loop {
|
||||||
|
@ -227,10 +222,7 @@ pub(crate) fn count_bytes_chars_and_lines_fast<
|
||||||
total.bytes += n;
|
total.bytes += n;
|
||||||
}
|
}
|
||||||
if COUNT_CHARS {
|
if COUNT_CHARS {
|
||||||
total.chars += buf[..n]
|
total.chars += bytecount::num_chars(&buf[..n]);
|
||||||
.iter()
|
|
||||||
.filter(|&&byte| (byte & !CONT_MASK) != TAG_CONT_U8)
|
|
||||||
.count();
|
|
||||||
}
|
}
|
||||||
if COUNT_LINES {
|
if COUNT_LINES {
|
||||||
total.lines += bytecount::count(&buf[..n], b'\n');
|
total.lines += bytecount::count(&buf[..n], b'\n');
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue