1
Fork 0
mirror of https://github.com/RGBCube/uutils-coreutils synced 2025-07-28 11:37:44 +00:00

Merge pull request #2143 from nbraud/factor/faster/table

factor::table: Implement a batched version w/ improved performance
This commit is contained in:
Sylvestre Ledru 2021-05-22 17:18:07 +02:00 committed by GitHub
commit 66cfdb8644
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
10 changed files with 324 additions and 26 deletions

20
Cargo.lock generated
View file

@ -43,6 +43,12 @@ dependencies = [
"winapi 0.3.9",
]
[[package]]
name = "array-init"
version = "2.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6945cc5422176fc5e602e590c2878d2c2acd9a4fe20a4baa7c28022521698ec6"
[[package]]
name = "arrayvec"
version = "0.4.12"
@ -279,6 +285,7 @@ dependencies = [
"uu_expand",
"uu_expr",
"uu_factor",
"uu_factor_benches",
"uu_false",
"uu_fmt",
"uu_fold",
@ -2029,17 +2036,26 @@ name = "uu_factor"
version = "0.0.6"
dependencies = [
"coz",
"criterion",
"num-traits",
"paste",
"quickcheck",
"rand 0.7.3",
"rand_chacha",
"smallvec",
"uucore",
"uucore_procs",
]
[[package]]
name = "uu_factor_benches"
version = "0.0.0"
dependencies = [
"array-init",
"criterion",
"rand 0.7.3",
"rand_chacha",
"uu_factor",
]
[[package]]
name = "uu_false"
version = "0.0.6"

View file

@ -324,6 +324,9 @@ wc = { optional=true, version="0.0.6", package="uu_wc", path="src/uu/wc" }
who = { optional=true, version="0.0.6", package="uu_who", path="src/uu/who" }
whoami = { optional=true, version="0.0.6", package="uu_whoami", path="src/uu/whoami" }
yes = { optional=true, version="0.0.6", package="uu_yes", path="src/uu/yes" }
factor_benches = { optional = true, version = "0.0.0", package = "uu_factor_benches", path = "tests/benches/factor" }
#
# * pinned transitive dependencies
# Not needed for now. Keep as examples:

View file

@ -0,0 +1,116 @@
# Benchmarking `factor`
The benchmarks for `factor` are located under `tests/benches/factor`
and can be invoked with `cargo bench` in that directory.
They are located outside the `uu_factor` crate, as they do not comply
with the project's minimum supported Rust version, *i.e.* may require
a newer version of `rustc`.
## Microbenchmarking deterministic functions
We currently use [`criterion`] to benchmark deterministic functions,
such as `gcd` and `table::factor`.
However, µbenchmarks are by nature unstable: not only are they specific to
the hardware, operating system version, etc., but they are noisy and affected
by other tasks on the system (browser, compile jobs, etc.), which can cause
`criterion` to report spurious performance improvements and regressions.
This can be mitigated by getting as close to [idealised conditions][lemire]
as possible:
- minimize the amount of computation and I/O running concurrently with the
benchmark, *i.e.* close your browser and IM clients, don't compile at the
same time, etc. ;
- ensure the CPU's [frequency stays constant] during the benchmark ;
- [isolate a **physical** core], set it to `nohz_full`, and pin the benchmark
to it, so it won't be preempted in the middle of a measurement ;
- disable ASLR by running `setarch -R cargo bench`, so we can compare results
across multiple executions.
[`criterion`]: https://bheisler.github.io/criterion.rs/book/index.html
[lemire]: https://lemire.me/blog/2018/01/16/microbenchmarking-calls-for-idealized-conditions/
[isolate a **physical** core]: https://pyperf.readthedocs.io/en/latest/system.html#isolate-cpus-on-linux
[frequency stays constant]: XXXTODO
### Guidance for designing µbenchmarks
*Note:* this guidance is specific to `factor` and takes its application domain
into account; do not expect it to generalise to other projects. It is based
on Daniel Lemire's [*Microbenchmarking calls for idealized conditions*][lemire],
which I recommend reading if you want to add benchmarks to `factor`.
1. Select a small, self-contained, deterministic component
`gcd` and `table::factor` are good examples of such components:
- no I/O or access to external data structures ;
- no call into other components ;
- behaviour is deterministic: no RNG, no concurrency, ... ;
- the test's body is *fast* (~100ns for `gcd`, ~10µs for `table::factor`),
so each sample takes a very short time, minimizing variability and
maximizing the number of samples we can take in a given time.
2. Benchmarks are immutable (once merged in `uutils`)
Modifying a benchmark means previously-collected values cannot meaningfully
be compared, silently giving nonsensical results. If you must modify an
existing benchmark, rename it.
3. Test common cases
We are interested in overall performance, rather than specific edge-cases;
use **reproducibly-randomised inputs**, sampling from either all possible
input values or some subset of interest.
4. Use [`criterion`], `criterion::black_box`, ...
`criterion` isn't perfect, but it is much better than ad-hoc
solutions in each benchmark.
## Wishlist
### Configurable statistical estimators
`criterion` always uses the arithmetic average as estimator; in µbenchmarks,
where the code under test is fully deterministic and the measurements are
subject to additive, positive noise, [the minimum is more appropriate][lemire].
### CI & reproducible performance testing
Measuring performance on real hardware is important, as it relates directly
to what users of `factor` experience; however, such measurements are subject
to the constraints of the real-world, and aren't perfectly reproducible.
Moreover, the mitigations for it (described above) aren't achievable in
virtualized, multi-tenant environments such as CI.
Instead, we could run the µbenchmarks in a simulated CPU with [`cachegrind`],
measure execution “time” in that model (in CI), and use it to detect and report
performance improvements and regressions.
[`iai`] is an implementation of this idea for Rust.
[`cachegrind`]: https://www.valgrind.org/docs/manual/cg-manual.html
[`iai`]: https://bheisler.github.io/criterion.rs/book/iai/iai.html
### Comparing randomised implementations across multiple inputs
`factor` is a challenging target for system benchmarks as it combines two
characteristics:
1. integer factoring algorithms are randomised, with large variance in
execution time ;
2. various inputs also have large differences in factoring time, which
correspond to no natural, linear ordering of the inputs.
If (1) were untrue (i.e. if execution time weren't random), we could faithfully
compare 2 implementations (2 successive versions, or `uutils` and GNU) using
a scatter plot, where each axis corresponds to the perf. of one implementation.
Similarly, without (2) we could plot numbers on the X axis and their factoring
time on the Y axis, using multiple lines for various quantiles. The large
differences in factoring times for successive numbers mean that such a plot
would be unreadable.

View file

@ -17,20 +17,15 @@ num-traits = "0.2.13" # used in src/numerics.rs, which is included by build.rs
[dependencies]
coz = { version = "0.1.3", optional = true }
num-traits = "0.2.13" # Needs at least version 0.2.13 for "OverflowingAdd"
rand = { version="0.7", features=["small_rng"] }
smallvec = { version="0.6.14, < 1.0" }
uucore = { version=">=0.0.8", package="uucore", path="../../uucore" }
uucore_procs = { version=">=0.0.5", package="uucore_procs", path="../../uucore_procs" }
rand = { version = "0.7", features = ["small_rng"] }
smallvec = { version = "0.6.14, < 1.0" }
uucore = { version = ">=0.0.8", package = "uucore", path = "../../uucore" }
uucore_procs = { version = ">=0.0.5", package = "uucore_procs", path = "../../uucore_procs" }
[dev-dependencies]
criterion = "0.3"
paste = "0.1.18"
quickcheck = "0.9.2"
rand_chacha = "0.2.2"
[[bench]]
name = "gcd"
harness = false
[[bin]]
name = "factor"

View file

@ -13,13 +13,13 @@ use std::error::Error;
use std::io::{self, stdin, stdout, BufRead, Write};
mod factor;
pub(crate) use factor::*;
pub use factor::*;
use uucore::InvalidEncodingHandling;
mod miller_rabin;
pub mod numeric;
mod rho;
mod table;
pub mod table;
static SYNTAX: &str = "[OPTION] [NUMBER]...";
static SUMMARY: &str = "Print the prime factors of the given number(s).

View file

@ -161,7 +161,7 @@ pub fn factor(mut n: u64) -> Factors {
return factors;
}
let (factors, n) = table::factor(n, factors);
table::factor(&mut n, &mut factors);
#[allow(clippy::let_and_return)]
let r = if n < (1 << 32) {
@ -239,9 +239,13 @@ mod tests {
}
#[cfg(test)]
impl quickcheck::Arbitrary for Factors {
fn arbitrary<G: quickcheck::Gen>(gen: &mut G) -> Self {
use rand::Rng;
use rand::{
distributions::{Distribution, Standard},
Rng,
};
#[cfg(test)]
impl Distribution<Factors> for Standard {
fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> Factors {
let mut f = Factors::one();
let mut g = 1u64;
let mut n = u64::MAX;
@ -252,7 +256,7 @@ impl quickcheck::Arbitrary for Factors {
// See Generating Random Factored Numbers, Easily, J. Cryptology (2003)
'attempt: loop {
while n > 1 {
n = gen.gen_range(1, n);
n = rng.gen_range(1, n);
if miller_rabin::is_prime(n) {
if let Some(h) = g.checked_mul(n) {
f.push(n);
@ -269,6 +273,13 @@ impl quickcheck::Arbitrary for Factors {
}
}
// Lets `quickcheck` produce random `Factors` values by delegating to the
// `Distribution<Factors>` implementation for `Standard`.
#[cfg(test)]
impl quickcheck::Arbitrary for Factors {
    fn arbitrary<G: quickcheck::Gen>(rng: &mut G) -> Self {
        // Same sampling that `Rng::gen` performs, spelled out explicitly.
        Standard.sample(rng)
    }
}
#[cfg(test)]
impl std::ops::BitXor<Exponent> for Factors {
type Output = Self;

View file

@ -8,15 +8,13 @@
// spell-checker: ignore (ToDO) INVS
use std::num::Wrapping;
use crate::Factors;
include!(concat!(env!("OUT_DIR"), "/prime_table.rs"));
pub(crate) fn factor(mut num: u64, mut factors: Factors) -> (Factors, u64) {
pub fn factor(num: &mut u64, factors: &mut Factors) {
for &(prime, inv, ceil) in P_INVS_U64 {
if num == 1 {
if *num == 1 {
break;
}
@ -27,11 +25,11 @@ pub(crate) fn factor(mut num: u64, mut factors: Factors) -> (Factors, u64) {
// for a nice explanation.
let mut k = 0;
loop {
let Wrapping(x) = Wrapping(num) * Wrapping(inv);
let x = num.wrapping_mul(inv);
// While prime divides num
if x <= ceil {
num = x;
*num = x;
k += 1;
#[cfg(feature = "coz")]
coz::progress!("factor found");
@ -43,6 +41,61 @@ pub(crate) fn factor(mut num: u64, mut factors: Factors) -> (Factors, u64) {
}
}
}
(factors, num)
}
pub const CHUNK_SIZE: usize = 8;
/// Batched variant of [`factor`]: processes `CHUNK_SIZE` numbers together.
///
/// The prime table is walked once; for each `(prime, inv, ceil)` entry every
/// number of the chunk is repeatedly replaced by `num.wrapping_mul(inv)` for
/// as long as that product is `<= ceil` (the test used here for "prime
/// divides num"). Each prime found this way is recorded, with its
/// multiplicity, in the `Factors` at the same index.
///
/// * `n_s` - numbers to reduce in place; on return each holds what is left
///   after the table's primes have been divided out.
/// * `f_s` - accumulators, `f_s[i]` receiving the factors found for `n_s[i]`.
pub fn factor_chunk(n_s: &mut [u64; CHUNK_SIZE], f_s: &mut [Factors; CHUNK_SIZE]) {
    for &(prime, inv, ceil) in P_INVS_U64 {
        // Stop early only once *every* number of the chunk is reduced to 1.
        // Looking at just a prefix of the chunk would leave the remaining
        // entries partially factored.
        if n_s.iter().all(|&n| n == 1) {
            break;
        }

        for (num, factors) in n_s.iter_mut().zip(f_s.iter_mut()) {
            // Nothing left to divide out of this entry.
            if *num == 1 {
                continue;
            }

            // Multiplicity of `prime` in `*num`.
            let mut k = 0;
            loop {
                let x = num.wrapping_mul(inv);

                // While prime divides num
                if x <= ceil {
                    *num = x;
                    k += 1;
                } else {
                    if k > 0 {
                        factors.add(prime, k);
                    }
                    break;
                }
            }
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::Factors;
    use quickcheck::quickcheck;
    use rand::{rngs::SmallRng, Rng, SeedableRng};

    quickcheck! {
        // Property: on the same seeded random inputs, the batched
        // `factor_chunk` and the one-at-a-time `factor` must leave identical
        // residual numbers and identical accumulated factors.
        fn chunk_vs_iter(seed: u64) -> () {
            let mut rng = SmallRng::seed_from_u64(seed);
            let mut chunk_nums: [u64; CHUNK_SIZE] = rng.gen();
            let mut chunk_facts: [Factors; CHUNK_SIZE] = rng.gen();

            // Independent copies for the reference (iterative) run.
            let mut iter_nums = chunk_nums;
            let mut iter_facts = chunk_facts.clone();
            iter_nums
                .iter_mut()
                .zip(iter_facts.iter_mut())
                .for_each(|(n, f)| factor(n, f));

            factor_chunk(&mut chunk_nums, &mut chunk_facts);

            assert_eq!(iter_nums, chunk_nums);
            assert_eq!(iter_facts, chunk_facts);
        }
    }
}

View file

@ -0,0 +1,26 @@
[package]
name = "uu_factor_benches"
version = "0.0.0"
authors = ["nicoo <nicoo@debian.org>"]
license = "MIT"
description = "Benchmarks for the uu_factor integer factorization tool"
homepage = "https://github.com/uutils/coreutils"
edition = "2018"
[dependencies]
uu_factor = { path = "../../../src/uu/factor" }
[dev-dependencies]
array-init = "2.0.0"
criterion = "0.3"
rand = "0.7"
rand_chacha = "0.2.2"
[[bench]]
name = "gcd"
harness = false
[[bench]]
name = "table"
harness = false

View file

@ -0,0 +1,78 @@
use array_init::array_init;
use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
use std::convert::TryInto;
use uu_factor::{table::*, Factors};
/// Criterion benchmark comparing the batched `factor_chunk` against the
/// one-number-at-a-time `factor`, both run on the same ten arrays of
/// `INPUT_SIZE` values drawn from a fixed-seed RNG.
fn table(c: &mut Criterion) {
    #[cfg(target_os = "linux")]
    check_personality();

    const INPUT_SIZE: usize = 128;
    // `factor_chunk` only accepts full chunks, so the input must split evenly.
    assert!(
        INPUT_SIZE % CHUNK_SIZE == 0,
        "INPUT_SIZE ({}) is not divisible by CHUNK_SIZE ({})",
        INPUT_SIZE,
        CHUNK_SIZE
    );

    // Endless stream of input arrays.
    let inputs = {
        // Deterministic RNG; use an explicitly-named RNG to guarantee stability
        use rand::{RngCore, SeedableRng};
        use rand_chacha::ChaCha8Rng;

        const SEED: u64 = 0xdead_bebe_ea75_cafe;
        let mut rng = ChaCha8Rng::seed_from_u64(SEED);
        std::iter::repeat_with(move || array_init::<_, _, INPUT_SIZE>(|_| rng.next_u64()))
    };

    let mut group = c.benchmark_group("table");
    group.throughput(Throughput::Elements(INPUT_SIZE as _));

    for numbers in inputs.take(10) {
        // The benchmark id embeds the full input so each run is identifiable.
        let label = format!("{:?}", numbers);

        group.bench_with_input(
            BenchmarkId::new("factor_chunk", &label),
            &numbers,
            |bencher, &numbers| {
                bencher.iter(|| {
                    // Fresh working copies for every iteration.
                    let mut n_s = numbers;
                    let mut f_s: [_; INPUT_SIZE] = array_init(|_| Factors::one());
                    let pairs = n_s.chunks_mut(CHUNK_SIZE).zip(f_s.chunks_mut(CHUNK_SIZE));
                    for (n_chunk, f_chunk) in pairs {
                        factor_chunk(n_chunk.try_into().unwrap(), f_chunk.try_into().unwrap())
                    }
                })
            },
        );

        group.bench_with_input(
            BenchmarkId::new("factor", &label),
            &numbers,
            |bencher, &numbers| {
                bencher.iter(|| {
                    // Fresh working copies for every iteration.
                    let mut n_s = numbers;
                    let mut f_s: [_; INPUT_SIZE] = array_init(|_| Factors::one());
                    for (n, f) in n_s.iter_mut().zip(f_s.iter_mut()) {
                        factor(n, f)
                    }
                })
            },
        );
    }

    group.finish()
}
/// Warns on stderr when the benchmark runs with ASLR enabled.
///
/// Reads `/proc/self/personality`, parses its contents as a hexadecimal
/// number and tests the `ADDR_NO_RANDOMIZE` bit. When the bit is clear, a
/// warning is printed, as results might not be reproducible across runs.
///
/// # Panics
///
/// Panics if the file cannot be read or if its contents are not a
/// hexadecimal value.
#[cfg(target_os = "linux")]
fn check_personality() {
    use std::fs;

    const ADDR_NO_RANDOMIZE: u64 = 0x0040000;
    const PERSONALITY_PATH: &str = "/proc/self/personality";

    // The panic messages are built lazily, only on the failure path.
    let contents = fs::read_to_string(PERSONALITY_PATH)
        .unwrap_or_else(|err| panic!("Couldn't read '{}': {:?}", PERSONALITY_PATH, err));

    // Tolerate a missing (or differently shaped) line terminator instead of
    // requiring exactly one trailing "\n".
    let p_string = contents.trim_end();

    let personality = u64::from_str_radix(p_string, 16).unwrap_or_else(|err| {
        panic!(
            "Expected a hex value for personality, got '{:?}': {:?}",
            p_string, err
        )
    });

    if personality & ADDR_NO_RANDOMIZE == 0 {
        eprintln!(
            "WARNING: Benchmarking with ASLR enabled (personality is {:x}), results might not be reproducible.",
            personality
        );
    }
}
criterion_group!(benches, table);
criterion_main!(benches);