mirror of
https://github.com/RGBCube/uutils-coreutils
synced 2025-07-29 12:07:46 +00:00
Merge pull request #3642 from Garfield96/shuf-find-seps
shuf: improve performance
This commit is contained in:
commit
7d807f3e73
4 changed files with 37 additions and 17 deletions
1
Cargo.lock
generated
1
Cargo.lock
generated
|
@ -2801,6 +2801,7 @@ name = "uu_shuf"
|
||||||
version = "0.0.14"
|
version = "0.0.14"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"clap 3.1.18",
|
"clap 3.1.18",
|
||||||
|
"memchr 2.5.0",
|
||||||
"rand",
|
"rand",
|
||||||
"rand_core",
|
"rand_core",
|
||||||
"uucore",
|
"uucore",
|
||||||
|
|
|
@ -1,26 +1,51 @@
|
||||||
|
<!-- spell-checker:ignore tmpfs -->
|
||||||
|
|
||||||
# Benchmarking shuf
|
# Benchmarking shuf
|
||||||
|
|
||||||
`shuf` is a simple utility, but there are at least two important cases
|
`shuf` is a simple utility, but there are at least two important cases
|
||||||
to benchmark: with and without repetition.
|
to benchmark: with and without repetition.
|
||||||
|
|
||||||
When benchmarking changes, make sure to always build with the `--release` flag.
|
When benchmarking changes, make sure to always build with the `--release` flag.
|
||||||
You can compare with another branch by compiling on that branch and than
|
You can compare with another branch by compiling on that branch and then
|
||||||
renaming the executable from `shuf` to `shuf.old`.
|
renaming the executable from `shuf` to `shuf.old`.
|
||||||
|
|
||||||
|
## Generate sample data
|
||||||
|
|
||||||
|
Sample input can be downloaded with `wget`, for example a large plain-text file from Project Gutenberg:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
wget -O input.txt https://www.gutenberg.org/files/100/100-0.txt
|
||||||
|
```
|
||||||
|
|
||||||
|
To avoid distortions from IO, it is recommended to store input data in tmpfs.
|
||||||
|
|
||||||
## Without repetition
|
## Without repetition
|
||||||
|
|
||||||
By default, `shuf` samples without repetition. To benchmark only the
|
By default, `shuf` samples without repetition.
|
||||||
randomization and not IO, we can pass the `-i` flag with a range of numbers to
|
|
||||||
randomly sample from. An example of a command that works well for testing:
|
To benchmark only the randomization and not IO, we can pass the `-i` flag with
|
||||||
|
a range of numbers to randomly sample from. An example of a command that works
|
||||||
|
well for testing:
|
||||||
|
|
||||||
```shell
|
```shell
|
||||||
hyperfine --warmup 10 "target/release/shuf -i 0-10000000"
|
hyperfine --warmup 10 "target/release/shuf -i 0-10000000"
|
||||||
```
|
```
|
||||||
|
|
||||||
|
To measure the time taken by shuffling an input file, the following command can
|
||||||
|
be used:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
hyperfine --warmup 10 "target/release/shuf input.txt > /dev/null"
|
||||||
|
```
|
||||||
|
|
||||||
|
It is important to discard the output by redirecting it to `/dev/null`, since
|
||||||
|
otherwise, a substantial amount of time is added to write the output to the
|
||||||
|
filesystem.
|
||||||
|
|
||||||
## With repetition
|
## With repetition
|
||||||
|
|
||||||
When repetition is allowed, `shuf` works very differently under the hood, so it
|
When repetition is allowed, `shuf` works very differently under the hood, so it
|
||||||
should be benchmarked separately. In this case we have to pass the `-n` flag or
|
should be benchmarked separately. In this case, we have to pass the `-n` flag or
|
||||||
the command will run forever. An example of a hyperfine command is
|
the command will run forever. An example of a hyperfine command is
|
||||||
|
|
||||||
```shell
|
```shell
|
||||||
|
|
|
@ -16,6 +16,7 @@ path = "src/shuf.rs"
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
clap = { version = "3.1", features = ["wrap_help", "cargo"] }
|
clap = { version = "3.1", features = ["wrap_help", "cargo"] }
|
||||||
|
memchr = "2.5.0"
|
||||||
rand = "0.8"
|
rand = "0.8"
|
||||||
rand_core = "0.6"
|
rand_core = "0.6"
|
||||||
uucore = { version=">=0.0.11", package="uucore", path="../../uucore" }
|
uucore = { version=">=0.0.11", package="uucore", path="../../uucore" }
|
||||||
|
|
|
@ -8,6 +8,7 @@
|
||||||
// spell-checker:ignore (ToDO) cmdline evec seps rvec fdata
|
// spell-checker:ignore (ToDO) cmdline evec seps rvec fdata
|
||||||
|
|
||||||
use clap::{crate_version, Arg, Command, Values};
|
use clap::{crate_version, Arg, Command, Values};
|
||||||
|
use memchr::memchr_iter;
|
||||||
use rand::prelude::SliceRandom;
|
use rand::prelude::SliceRandom;
|
||||||
use rand::RngCore;
|
use rand::RngCore;
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
|
@ -218,20 +219,12 @@ fn find_seps(data: &mut Vec<&[u8]>, sep: u8) {
|
||||||
if data[i].contains(&sep) {
|
if data[i].contains(&sep) {
|
||||||
let this = data.swap_remove(i);
|
let this = data.swap_remove(i);
|
||||||
let mut p = 0;
|
let mut p = 0;
|
||||||
let mut i = 1;
|
for i in memchr_iter(sep, this) {
|
||||||
loop {
|
|
||||||
if i == this.len() {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
if this[i] == sep {
|
|
||||||
data.push(&this[p..i]);
|
data.push(&this[p..i]);
|
||||||
p = i + 1;
|
p = i + 1;
|
||||||
}
|
}
|
||||||
i += 1;
|
|
||||||
}
|
|
||||||
if p < this.len() {
|
if p < this.len() {
|
||||||
data.push(&this[p..i]);
|
data.push(&this[p..]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue