From 19a43bf1b78bc208bc5c6d51c5dc793aa5b50aa0 Mon Sep 17 00:00:00 2001 From: Christian Menges Date: Thu, 16 Jun 2022 13:23:44 +0200 Subject: [PATCH 1/4] shuf: improve performance Use memchr crate to speed up splitting input data by a separator. Signed-off-by: Christian Menges --- Cargo.lock | 1 + src/uu/shuf/Cargo.toml | 1 + src/uu/shuf/src/shuf.rs | 17 +++++------------ 3 files changed, 7 insertions(+), 12 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b191c36ef..e3b1eb945 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2808,6 +2808,7 @@ name = "uu_shuf" version = "0.0.14" dependencies = [ "clap 3.1.18", + "memchr 2.5.0", "rand", "rand_core", "uucore", diff --git a/src/uu/shuf/Cargo.toml b/src/uu/shuf/Cargo.toml index dd4ed18c1..cbf7deb26 100644 --- a/src/uu/shuf/Cargo.toml +++ b/src/uu/shuf/Cargo.toml @@ -16,6 +16,7 @@ path = "src/shuf.rs" [dependencies] clap = { version = "3.1", features = ["wrap_help", "cargo"] } +memchr = "2.5.0" rand = "0.8" rand_core = "0.6" uucore = { version=">=0.0.11", package="uucore", path="../../uucore" } diff --git a/src/uu/shuf/src/shuf.rs b/src/uu/shuf/src/shuf.rs index 6369fc9b5..9c0d08bdd 100644 --- a/src/uu/shuf/src/shuf.rs +++ b/src/uu/shuf/src/shuf.rs @@ -8,6 +8,7 @@ // spell-checker:ignore (ToDO) cmdline evec seps rvec fdata use clap::{crate_version, Arg, Command, Values}; +use memchr::memchr_iter; use rand::prelude::SliceRandom; use rand::RngCore; use std::fs::File; @@ -218,20 +219,12 @@ fn find_seps(data: &mut Vec<&[u8]>, sep: u8) { if data[i].contains(&sep) { let this = data.swap_remove(i); let mut p = 0; - let mut i = 1; - loop { - if i == this.len() { - break; - } - - if this[i] == sep { - data.push(&this[p..i]); - p = i + 1; - } - i += 1; + for i in memchr_iter(sep, this) { + data.push(&this[p..i]); + p = i + 1; } if p < this.len() { - data.push(&this[p..i]); + data.push(&this[p..]); } } } From 35e97fe88a8aa4bc799200ac9cafff0764c18065 Mon Sep 17 00:00:00 2001 From: Christian Menges Date: Fri, 17 Jun 2022 20:57:04 
+0200 Subject: [PATCH 2/4] shuf: update BENCHMARKING.md Signed-off-by: Christian Menges --- src/uu/shuf/BENCHMARKING.md | 33 ++++++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/src/uu/shuf/BENCHMARKING.md b/src/uu/shuf/BENCHMARKING.md index 7607f04b4..cf5ee40e1 100644 --- a/src/uu/shuf/BENCHMARKING.md +++ b/src/uu/shuf/BENCHMARKING.md @@ -4,23 +4,46 @@ benchmark: with and without repetition. When benchmarking changes, make sure to always build with the `--release` flag. -You can compare with another branch by compiling on that branch and than +You can compare with another branch by compiling on that branch and then renaming the executable from `shuf` to `shuf.old`. +## Generate sample data + +Sample input can be generated using `/dev/random`: + +```shell +cat /dev/random | base64 | fold | head -n 50000000 > input.txt +``` + +To avoid distortions from IO, it is recommended to store input data in tmpfs. + ## Without repetition -By default, `shuf` samples without repetition. To benchmark only the -randomization and not IO, we can pass the `-i` flag with a range of numbers to -randomly sample from. An example of a command that works well for testing: +By default, `shuf` samples without repetition. + +To benchmark only the randomization and not IO, we can pass the `-i` flag with +a range of numbers to randomly sample from. An example of a command that works +well for testing: ```shell hyperfine --warmup 10 "target/release/shuf -i 0-10000000" ``` +To measure the time taken by shuffling an input file, the following command can +be used: + +```shell +hyperfine --warmup 10 "target/release/shuf input.txt > /dev/null" +``` + +It is important to discard the output by redirecting it to `/dev/null`, since +otherwise, a substantial amount of time is added to write the output to the +filesystem. + ## With repetition When repetition is allowed, `shuf` works very differently under the hood, so it -should be benchmarked separately. 
In this case we have to pass the `-n` flag or +should be benchmarked separately. In this case, we have to pass the `-n` flag or the command will run forever. An example of a hyperfine command is ```shell From ad05b2f7cbcefb14d4659482c0692f90c7e66c83 Mon Sep 17 00:00:00 2001 From: Sylvestre Ledru Date: Sat, 18 Jun 2022 08:32:40 +0200 Subject: [PATCH 3/4] Ignore word "tmpfs" --- src/uu/shuf/BENCHMARKING.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/uu/shuf/BENCHMARKING.md b/src/uu/shuf/BENCHMARKING.md index cf5ee40e1..a7b841f61 100644 --- a/src/uu/shuf/BENCHMARKING.md +++ b/src/uu/shuf/BENCHMARKING.md @@ -1,3 +1,5 @@ +<!-- spell-checker:ignore tmpfs --> + # Benchmarking shuf `shuf` is a simple utility, but there are at least two important cases From 7c49bf4bd1a16a2806d2efc1919c1442648fb80c Mon Sep 17 00:00:00 2001 From: Sylvestre Ledru Date: Sun, 19 Jun 2022 16:59:44 +0200 Subject: [PATCH 4/4] the cat /dev/random can be too long, use the Shakespeare text --- src/uu/shuf/BENCHMARKING.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/uu/shuf/BENCHMARKING.md b/src/uu/shuf/BENCHMARKING.md index a7b841f61..23f647e9e 100644 --- a/src/uu/shuf/BENCHMARKING.md +++ b/src/uu/shuf/BENCHMARKING.md @@ -14,7 +14,7 @@ renaming the executable from `shuf` to `shuf.old`. Sample input can be generated using `/dev/random`: ```shell -cat /dev/random | base64 | fold | head -n 50000000 > input.txt +wget -O input.txt https://www.gutenberg.org/files/100/100-0.txt ``` To avoid distortions from IO, it is recommended to store input data in tmpfs.