mirror of
https://github.com/RGBCube/uutils-coreutils
synced 2025-07-28 11:37:44 +00:00
Merge pull request #2143 from nbraud/factor/faster/table
factor::table: Implement a batched version w/ improved performance
This commit is contained in:
commit
66cfdb8644
10 changed files with 324 additions and 26 deletions
20
Cargo.lock
generated
20
Cargo.lock
generated
|
@ -43,6 +43,12 @@ dependencies = [
|
|||
"winapi 0.3.9",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "array-init"
|
||||
version = "2.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6945cc5422176fc5e602e590c2878d2c2acd9a4fe20a4baa7c28022521698ec6"
|
||||
|
||||
[[package]]
|
||||
name = "arrayvec"
|
||||
version = "0.4.12"
|
||||
|
@ -279,6 +285,7 @@ dependencies = [
|
|||
"uu_expand",
|
||||
"uu_expr",
|
||||
"uu_factor",
|
||||
"uu_factor_benches",
|
||||
"uu_false",
|
||||
"uu_fmt",
|
||||
"uu_fold",
|
||||
|
@ -2029,17 +2036,26 @@ name = "uu_factor"
|
|||
version = "0.0.6"
|
||||
dependencies = [
|
||||
"coz",
|
||||
"criterion",
|
||||
"num-traits",
|
||||
"paste",
|
||||
"quickcheck",
|
||||
"rand 0.7.3",
|
||||
"rand_chacha",
|
||||
"smallvec",
|
||||
"uucore",
|
||||
"uucore_procs",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "uu_factor_benches"
|
||||
version = "0.0.0"
|
||||
dependencies = [
|
||||
"array-init",
|
||||
"criterion",
|
||||
"rand 0.7.3",
|
||||
"rand_chacha",
|
||||
"uu_factor",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "uu_false"
|
||||
version = "0.0.6"
|
||||
|
|
|
@ -324,6 +324,9 @@ wc = { optional=true, version="0.0.6", package="uu_wc", path="src/uu/wc" }
|
|||
who = { optional=true, version="0.0.6", package="uu_who", path="src/uu/who" }
|
||||
whoami = { optional=true, version="0.0.6", package="uu_whoami", path="src/uu/whoami" }
|
||||
yes = { optional=true, version="0.0.6", package="uu_yes", path="src/uu/yes" }
|
||||
|
||||
factor_benches = { optional = true, version = "0.0.0", package = "uu_factor_benches", path = "tests/benches/factor" }
|
||||
|
||||
#
|
||||
# * pinned transitive dependencies
|
||||
# Not needed for now. Keep as examples:
|
||||
|
|
116
src/uu/factor/BENCHMARKING.md
Normal file
116
src/uu/factor/BENCHMARKING.md
Normal file
|
@ -0,0 +1,116 @@
|
|||
# Benchmarking `factor`
|
||||
|
||||
The benchmarks for `factor` are located under `tests/benches/factor`
|
||||
and can be invoked with `cargo bench` in that directory.
|
||||
|
||||
They are located outside the `uu_factor` crate, as they do not comply
|
||||
with the project's minimum supported Rust version, *i.e.* may require
|
||||
a newer version of `rustc`.
|
||||
|
||||
|
||||
## Microbenchmarking deterministic functions
|
||||
|
||||
We currently use [`criterion`] to benchmark deterministic functions,
|
||||
such as `gcd` and `table::factor`.
|
||||
|
||||
However, µbenchmarks are by nature unstable: not only are they specific to
|
||||
the hardware, operating system version, etc., but they are noisy and affected
|
||||
by other tasks on the system (browser, compile jobs, etc.), which can cause
|
||||
`criterion` to report spurious performance improvements and regressions.
|
||||
|
||||
This can be mitigated by getting as close to [idealised conditions][lemire]
|
||||
as possible:
|
||||
- minimize the amount of computation and I/O running concurrently with the
|
||||
benchmark, *i.e.* close your browser and IM clients, don't compile at the
|
||||
same time, etc. ;
|
||||
- ensure the CPU's [frequency stays constant] during the benchmark ;
|
||||
- [isolate a **physical** core], set it to `nohz_full`, and pin the benchmark
|
||||
to it, so it won't be preempted in the middle of a measurement ;
|
||||
- disable ASLR by running `setarch -R cargo bench`, so we can compare results
|
||||
across multiple executions.
|
||||
|
||||
|
||||
[`criterion`]: https://bheisler.github.io/criterion.rs/book/index.html
|
||||
[lemire]: https://lemire.me/blog/2018/01/16/microbenchmarking-calls-for-idealized-conditions/
|
||||
[isolate a **physical** core]: https://pyperf.readthedocs.io/en/latest/system.html#isolate-cpus-on-linux
|
||||
[frequency stays constant]: XXXTODO
|
||||
|
||||
|
||||
### Guidance for designing µbenchmarks
|
||||
|
||||
*Note:* this guidance is specific to `factor` and takes its application domain
|
||||
into account; do not expect it to generalise to other projects. It is based
|
||||
on Daniel Lemire's [*Microbenchmarking calls for idealized conditions*][lemire],
|
||||
which I recommend reading if you want to add benchmarks to `factor`.
|
||||
|
||||
1. Select a small, self-contained, deterministic component
|
||||
`gcd` and `table::factor` are good examples of such components:
|
||||
- no I/O or access to external data structures ;
|
||||
- no call into other components ;
|
||||
- behaviour is deterministic: no RNG, no concurrency, ... ;
|
||||
- the test's body is *fast* (~100ns for `gcd`, ~10µs for `table::factor`),
|
||||
so each sample takes a very short time, minimizing variability and
|
||||
maximizing the number of samples we can take in a given time.
|
||||
|
||||
2. Benchmarks are immutable (once merged in `uutils`)
|
||||
Modifying a benchmark means previously-collected values cannot meaningfully
|
||||
be compared, silently giving nonsensical results. If you must modify an
|
||||
existing benchmark, rename it.
|
||||
|
||||
3. Test common cases
|
||||
We are interested in overall performance, rather than specific edge-cases;
|
||||
use **reproducibly-randomised inputs**, sampling from either all possible
|
||||
input values or some subset of interest.
|
||||
|
||||
4. Use [`criterion`], `criterion::black_box`, ...
|
||||
`criterion` isn't perfect, but it is also much better than ad-hoc
|
||||
solutions in each benchmark.
|
||||
|
||||
|
||||
## Wishlist
|
||||
|
||||
### Configurable statistical estimators
|
||||
|
||||
`criterion` always uses the arithmetic average as estimator; in µbenchmarks,
|
||||
where the code under test is fully deterministic and the measurements are
|
||||
subject to additive, positive noise, [the minimum is more appropriate][lemire].
|
||||
|
||||
|
||||
### CI & reproducible performance testing
|
||||
|
||||
Measuring performance on real hardware is important, as it relates directly
|
||||
to what users of `factor` experience; however, such measurements are subject
|
||||
to the constraints of the real world, and aren't perfectly reproducible.
|
||||
Moreover, the mitigations for it (described above) aren't achievable in
|
||||
virtualized, multi-tenant environments such as CI.
|
||||
|
||||
Instead, we could run the µbenchmarks in a simulated CPU with [`cachegrind`],
|
||||
measure execution “time” in that model (in CI), and use it to detect and report
|
||||
performance improvements and regressions.
|
||||
|
||||
[`iai`] is an implementation of this idea for Rust.
|
||||
|
||||
[`cachegrind`]: https://www.valgrind.org/docs/manual/cg-manual.html
|
||||
[`iai`]: https://bheisler.github.io/criterion.rs/book/iai/iai.html
|
||||
|
||||
|
||||
### Comparing randomised implementations across multiple inputs
|
||||
|
||||
`factor` is a challenging target for system benchmarks as it combines two
|
||||
characteristics:
|
||||
|
||||
1. integer factoring algorithms are randomised, with large variance in
|
||||
execution time ;
|
||||
|
||||
2. various inputs also have large differences in factoring time, that
|
||||
correspond to no natural, linear ordering of the inputs.
|
||||
|
||||
|
||||
If (1) was untrue (i.e. if execution time wasn't random), we could faithfully
|
||||
compare 2 implementations (2 successive versions, or `uutils` and GNU) using
|
||||
a scatter plot, where each axis corresponds to the perf. of one implementation.
|
||||
|
||||
Similarly, without (2) we could plot numbers on the X axis and their factoring
|
||||
time on the Y axis, using multiple lines for various quantiles. The large
|
||||
differences in factoring times for successive numbers mean that such a plot
|
||||
would be unreadable.
|
|
@ -17,20 +17,15 @@ num-traits = "0.2.13" # used in src/numerics.rs, which is included by build.rs
|
|||
[dependencies]
|
||||
coz = { version = "0.1.3", optional = true }
|
||||
num-traits = "0.2.13" # Needs at least version 0.2.13 for "OverflowingAdd"
|
||||
rand = { version="0.7", features=["small_rng"] }
|
||||
smallvec = { version="0.6.14, < 1.0" }
|
||||
uucore = { version=">=0.0.8", package="uucore", path="../../uucore" }
|
||||
uucore_procs = { version=">=0.0.5", package="uucore_procs", path="../../uucore_procs" }
|
||||
rand = { version = "0.7", features = ["small_rng"] }
|
||||
smallvec = { version = "0.6.14, < 1.0" }
|
||||
uucore = { version = ">=0.0.8", package = "uucore", path = "../../uucore" }
|
||||
uucore_procs = { version = ">=0.0.5", package = "uucore_procs", path = "../../uucore_procs" }
|
||||
|
||||
[dev-dependencies]
|
||||
criterion = "0.3"
|
||||
paste = "0.1.18"
|
||||
quickcheck = "0.9.2"
|
||||
rand_chacha = "0.2.2"
|
||||
|
||||
[[bench]]
|
||||
name = "gcd"
|
||||
harness = false
|
||||
|
||||
[[bin]]
|
||||
name = "factor"
|
||||
|
|
|
@ -13,13 +13,13 @@ use std::error::Error;
|
|||
use std::io::{self, stdin, stdout, BufRead, Write};
|
||||
|
||||
mod factor;
|
||||
pub(crate) use factor::*;
|
||||
pub use factor::*;
|
||||
use uucore::InvalidEncodingHandling;
|
||||
|
||||
mod miller_rabin;
|
||||
pub mod numeric;
|
||||
mod rho;
|
||||
mod table;
|
||||
pub mod table;
|
||||
|
||||
static SYNTAX: &str = "[OPTION] [NUMBER]...";
|
||||
static SUMMARY: &str = "Print the prime factors of the given number(s).
|
||||
|
|
|
@ -161,7 +161,7 @@ pub fn factor(mut n: u64) -> Factors {
|
|||
return factors;
|
||||
}
|
||||
|
||||
let (factors, n) = table::factor(n, factors);
|
||||
table::factor(&mut n, &mut factors);
|
||||
|
||||
#[allow(clippy::let_and_return)]
|
||||
let r = if n < (1 << 32) {
|
||||
|
@ -239,9 +239,13 @@ mod tests {
|
|||
}
|
||||
|
||||
#[cfg(test)]
|
||||
impl quickcheck::Arbitrary for Factors {
|
||||
fn arbitrary<G: quickcheck::Gen>(gen: &mut G) -> Self {
|
||||
use rand::Rng;
|
||||
use rand::{
|
||||
distributions::{Distribution, Standard},
|
||||
Rng,
|
||||
};
|
||||
#[cfg(test)]
|
||||
impl Distribution<Factors> for Standard {
|
||||
fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> Factors {
|
||||
let mut f = Factors::one();
|
||||
let mut g = 1u64;
|
||||
let mut n = u64::MAX;
|
||||
|
@ -252,7 +256,7 @@ impl quickcheck::Arbitrary for Factors {
|
|||
// See Generating Random Factored Numbers, Easily, J. Cryptology (2003)
|
||||
'attempt: loop {
|
||||
while n > 1 {
|
||||
n = gen.gen_range(1, n);
|
||||
n = rng.gen_range(1, n);
|
||||
if miller_rabin::is_prime(n) {
|
||||
if let Some(h) = g.checked_mul(n) {
|
||||
f.push(n);
|
||||
|
@ -269,6 +273,13 @@ impl quickcheck::Arbitrary for Factors {
|
|||
}
|
||||
}
|
||||
|
||||
// quickcheck support: an arbitrary `Factors` is whatever the `Standard`
// distribution yields when driven by quickcheck's own generator.
#[cfg(test)]
impl quickcheck::Arbitrary for Factors {
    fn arbitrary<G: quickcheck::Gen>(g: &mut G) -> Self {
        // Equivalent to `g.gen()`, spelled out through the distribution.
        Standard.sample(g)
    }
}
|
||||
|
||||
#[cfg(test)]
|
||||
impl std::ops::BitXor<Exponent> for Factors {
|
||||
type Output = Self;
|
||||
|
|
|
@ -8,15 +8,13 @@
|
|||
|
||||
// spell-checker: ignore (ToDO) INVS
|
||||
|
||||
use std::num::Wrapping;
|
||||
|
||||
use crate::Factors;
|
||||
|
||||
include!(concat!(env!("OUT_DIR"), "/prime_table.rs"));
|
||||
|
||||
pub(crate) fn factor(mut num: u64, mut factors: Factors) -> (Factors, u64) {
|
||||
pub fn factor(num: &mut u64, factors: &mut Factors) {
|
||||
for &(prime, inv, ceil) in P_INVS_U64 {
|
||||
if num == 1 {
|
||||
if *num == 1 {
|
||||
break;
|
||||
}
|
||||
|
||||
|
@ -27,11 +25,11 @@ pub(crate) fn factor(mut num: u64, mut factors: Factors) -> (Factors, u64) {
|
|||
// for a nice explanation.
|
||||
let mut k = 0;
|
||||
loop {
|
||||
let Wrapping(x) = Wrapping(num) * Wrapping(inv);
|
||||
let x = num.wrapping_mul(inv);
|
||||
|
||||
// While prime divides num
|
||||
if x <= ceil {
|
||||
num = x;
|
||||
*num = x;
|
||||
k += 1;
|
||||
#[cfg(feature = "coz")]
|
||||
coz::progress!("factor found");
|
||||
|
@ -43,6 +41,61 @@ pub(crate) fn factor(mut num: u64, mut factors: Factors) -> (Factors, u64) {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
(factors, num)
|
||||
}
|
||||
|
||||
pub const CHUNK_SIZE: usize = 8;
|
||||
pub fn factor_chunk(n_s: &mut [u64; CHUNK_SIZE], f_s: &mut [Factors; CHUNK_SIZE]) {
|
||||
for &(prime, inv, ceil) in P_INVS_U64 {
|
||||
if n_s[0] == 1 && n_s[1] == 1 && n_s[2] == 1 && n_s[3] == 1 {
|
||||
break;
|
||||
}
|
||||
|
||||
for (num, factors) in n_s.iter_mut().zip(f_s.iter_mut()) {
|
||||
if *num == 1 {
|
||||
continue;
|
||||
}
|
||||
let mut k = 0;
|
||||
loop {
|
||||
let x = num.wrapping_mul(inv);
|
||||
|
||||
// While prime divides num
|
||||
if x <= ceil {
|
||||
*num = x;
|
||||
k += 1;
|
||||
} else {
|
||||
if k > 0 {
|
||||
factors.add(prime, k);
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use crate::Factors;
    use quickcheck::quickcheck;
    use rand::{rngs::SmallRng, Rng, SeedableRng};

    quickcheck! {
        // Property: factoring a chunk in one batched call gives exactly the
        // same cofactors and factor lists as factoring each entry on its own.
        fn chunk_vs_iter(seed: u64) -> () {
            let mut rng = SmallRng::seed_from_u64(seed);

            // Draw order is part of the test's reproducibility:
            // first the numbers, then the pre-existing factorisations.
            let numbers: [u64; CHUNK_SIZE] = rng.gen();
            let initial: [Factors; CHUNK_SIZE] = rng.gen();

            // Reference result: one `factor` call per entry.
            let mut single_n = numbers.clone();
            let mut single_f = initial.clone();
            single_n
                .iter_mut()
                .zip(single_f.iter_mut())
                .for_each(|(n, f)| factor(n, f));

            // Batched result.
            let mut batch_n = numbers;
            let mut batch_f = initial;
            factor_chunk(&mut batch_n, &mut batch_f);

            assert_eq!(single_n, batch_n);
            assert_eq!(single_f, batch_f);
        }
    }
}
|
||||
|
|
26
tests/benches/factor/Cargo.toml
Normal file
26
tests/benches/factor/Cargo.toml
Normal file
|
@ -0,0 +1,26 @@
|
|||
[package]
|
||||
name = "uu_factor_benches"
|
||||
version = "0.0.0"
|
||||
authors = ["nicoo <nicoo@debian.org>"]
|
||||
license = "MIT"
|
||||
description = "Benchmarks for the uu_factor integer factorization tool"
|
||||
homepage = "https://github.com/uutils/coreutils"
|
||||
edition = "2018"
|
||||
|
||||
[dependencies]
|
||||
uu_factor = { path = "../../../src/uu/factor" }
|
||||
|
||||
[dev-dependencies]
|
||||
array-init = "2.0.0"
|
||||
criterion = "0.3"
|
||||
rand = "0.7"
|
||||
rand_chacha = "0.2.2"
|
||||
|
||||
|
||||
[[bench]]
|
||||
name = "gcd"
|
||||
harness = false
|
||||
|
||||
[[bench]]
|
||||
name = "table"
|
||||
harness = false
|
78
tests/benches/factor/benches/table.rs
Normal file
78
tests/benches/factor/benches/table.rs
Normal file
|
@ -0,0 +1,78 @@
|
|||
use array_init::array_init;
|
||||
use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
|
||||
use std::convert::TryInto;
|
||||
use uu_factor::{table::*, Factors};
|
||||
|
||||
fn table(c: &mut Criterion) {
|
||||
#[cfg(target_os = "linux")]
|
||||
check_personality();
|
||||
|
||||
const INPUT_SIZE: usize = 128;
|
||||
assert!(
|
||||
INPUT_SIZE % CHUNK_SIZE == 0,
|
||||
"INPUT_SIZE ({}) is not divisible by CHUNK_SIZE ({})",
|
||||
INPUT_SIZE,
|
||||
CHUNK_SIZE
|
||||
);
|
||||
let inputs = {
|
||||
// Deterministic RNG; use an explicitely-named RNG to guarantee stability
|
||||
use rand::{RngCore, SeedableRng};
|
||||
use rand_chacha::ChaCha8Rng;
|
||||
const SEED: u64 = 0xdead_bebe_ea75_cafe;
|
||||
let mut rng = ChaCha8Rng::seed_from_u64(SEED);
|
||||
|
||||
std::iter::repeat_with(move || array_init::<_, _, INPUT_SIZE>(|_| rng.next_u64()))
|
||||
};
|
||||
|
||||
let mut group = c.benchmark_group("table");
|
||||
group.throughput(Throughput::Elements(INPUT_SIZE as _));
|
||||
for a in inputs.take(10) {
|
||||
let a_str = format!("{:?}", a);
|
||||
group.bench_with_input(BenchmarkId::new("factor_chunk", &a_str), &a, |b, &a| {
|
||||
b.iter(|| {
|
||||
let mut n_s = a.clone();
|
||||
let mut f_s: [_; INPUT_SIZE] = array_init(|_| Factors::one());
|
||||
for (n_s, f_s) in n_s.chunks_mut(CHUNK_SIZE).zip(f_s.chunks_mut(CHUNK_SIZE)) {
|
||||
factor_chunk(n_s.try_into().unwrap(), f_s.try_into().unwrap())
|
||||
}
|
||||
})
|
||||
});
|
||||
group.bench_with_input(BenchmarkId::new("factor", &a_str), &a, |b, &a| {
|
||||
b.iter(|| {
|
||||
let mut n_s = a.clone();
|
||||
let mut f_s: [_; INPUT_SIZE] = array_init(|_| Factors::one());
|
||||
for (n, f) in n_s.iter_mut().zip(f_s.iter_mut()) {
|
||||
factor(n, f)
|
||||
}
|
||||
})
|
||||
});
|
||||
}
|
||||
group.finish()
|
||||
}
|
||||
|
||||
/// Prints a warning on stderr when the process runs with ASLR enabled.
///
/// Reads the hexadecimal personality flags from `/proc/self/personality` and
/// tests the `ADDR_NO_RANDOMIZE` bit; if it is clear, a warning is emitted.
///
/// # Panics
///
/// Panics if the file cannot be read, or if its content (ignoring trailing
/// whitespace) is not a hexadecimal number.
#[cfg(target_os = "linux")]
fn check_personality() {
    use std::fs;
    const ADDR_NO_RANDOMIZE: u64 = 0x0040000;
    const PERSONALITY_PATH: &str = "/proc/self/personality";

    // Build the panic messages lazily, only on the failure path.
    let contents = fs::read_to_string(PERSONALITY_PATH)
        .unwrap_or_else(|e| panic!("Couldn't read '{}': {:?}", PERSONALITY_PATH, e));

    // Tolerate a missing trailing newline instead of panicking on it.
    let p_string = contents.trim_end();

    let personality = u64::from_str_radix(p_string, 16).unwrap_or_else(|e| {
        panic!(
            "Expected a hex value for personality, got '{:?}': {:?}",
            p_string, e
        )
    });
    if personality & ADDR_NO_RANDOMIZE == 0 {
        eprintln!(
            "WARNING: Benchmarking with ASLR enabled (personality is {:x}), results might not be reproducible.",
            personality
        );
    }
}
|
||||
|
||||
criterion_group!(benches, table);
|
||||
criterion_main!(benches);
|
Loading…
Add table
Add a link
Reference in a new issue