Merge pull request #2143 from nbraud/factor/faster/table

factor::table: Implement a batched version w/ improved performance
2025-07-28 11:37:44 +00:00 · 2021-05-22 17:18:07 +02:00 · 2021-05-22 17:18:07 +02:00 · 66cfdb8644
commit 66cfdb8644
parent 66dd6dbeff a0a103b15e
10 changed files with 324 additions and 26 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -43,6 +43,12 @@ dependencies = [
 "winapi 0.3.9",
 ]

+[[package]]
+name = "array-init"
+version = "2.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6945cc5422176fc5e602e590c2878d2c2acd9a4fe20a4baa7c28022521698ec6"
+
 [[package]]
 name = "arrayvec"
 version = "0.4.12"
@ -279,6 +285,7 @@ dependencies = [
 "uu_expand",
 "uu_expr",
 "uu_factor",
+ "uu_factor_benches",
 "uu_false",
 "uu_fmt",
 "uu_fold",
@ -2029,17 +2036,26 @@ name = "uu_factor"
 version = "0.0.6"
 dependencies = [
 "coz",
- "criterion",
 "num-traits",
 "paste",
 "quickcheck",
 "rand 0.7.3",
- "rand_chacha",
 "smallvec",
 "uucore",
 "uucore_procs",
 ]

+[[package]]
+name = "uu_factor_benches"
+version = "0.0.0"
+dependencies = [
+ "array-init",
+ "criterion",
+ "rand 0.7.3",
+ "rand_chacha",
+ "uu_factor",
+]
+
 [[package]]
 name = "uu_false"
 version = "0.0.6"
--- a/Cargo.toml
+++ b/Cargo.toml
@ -324,6 +324,9 @@ wc       = { optional=true, version="0.0.6", package="uu_wc", path="src/uu/wc" }
 who      = { optional=true, version="0.0.6", package="uu_who", path="src/uu/who" }
 whoami   = { optional=true, version="0.0.6", package="uu_whoami", path="src/uu/whoami" }
 yes      = { optional=true, version="0.0.6", package="uu_yes", path="src/uu/yes" }
+
+factor_benches = { optional = true, version = "0.0.0", package = "uu_factor_benches", path = "tests/benches/factor" }
+
 #
 # * pinned transitive dependencies
 # Not needed for now. Keep as examples:
--- a/src/uu/factor/BENCHMARKING.md
+++ b/src/uu/factor/BENCHMARKING.md
@ -0,0 +1,116 @@
+# Benchmarking `factor`
+
+The benchmarks for `factor` are located under `tests/benches/factor`
+and can be invoked with `cargo bench` in that directory.
+
+They are located outside the `uu_factor` crate, as they do not comply
+with the project's minimum supported Rust version, *i.e.* may require
+a newer version of `rustc`.
+
+
+## Microbenchmarking deterministic functions
+
+We currently use [`criterion`] to benchmark deterministic functions,
+such as `gcd` and `table::factor`.
+
+However, µbenchmarks are by nature unstable: not only are they specific to
+the hardware, operating system version, etc., but they are noisy and affected
+by other tasks on the system (browser, compile jobs, etc.), which can cause
+`criterion` to report spurious performance improvements and regressions.
+
+This can be mitigated by getting as close to [idealised conditions][lemire]
+as possible:
+- minimize the amount of computation and I/O running concurrently to the
+  benchmark, *i.e.* close your browser and IM clients, don't compile at the
+  same time, etc. ;
+- ensure the CPU's [frequency stays constant] during the benchmark ;
+- [isolate a **physical** core], set it to `nohz_full`, and pin the benchmark
+  to it, so it won't be preempted in the middle of a measurement ;
+- disable ASLR by running `setarch -R cargo bench`, so we can compare results
+  across multiple executions.
+
+
+[`criterion`]: https://bheisler.github.io/criterion.rs/book/index.html
+[lemire]: https://lemire.me/blog/2018/01/16/microbenchmarking-calls-for-idealized-conditions/
+[isolate a **physical** core]: https://pyperf.readthedocs.io/en/latest/system.html#isolate-cpus-on-linux
+[frequency stays constant]: XXXTODO
+
+
+### Guidance for designing µbenchmarks
+
+*Note:* this guidance is specific to `factor` and takes its application domain
+into account; do not expect it to generalise to other projects.  It is based
+on Daniel Lemire's [*Microbenchmarking calls for idealized conditions*][lemire],
+which I recommend reading if you want to add benchmarks to `factor`.
+
+1. Select a small, self-contained, deterministic component  
+   `gcd` and `table::factor` are good examples of such components:
+   - no I/O or access to external data structures ;
+   - no call into other components ;
+   - behaviour is deterministic: no RNG, no concurrency, ... ;
+   - the test's body is *fast* (~100ns for `gcd`, ~10µs for `table::factor`),
+     so each sample takes a very short time, minimizing variability and
+     maximizing the number of samples we can take in a given time.
+
+2. Benchmarks are immutable (once merged in `uutils`)  
+   Modifying a benchmark means previously-collected values cannot meaningfully
+   be compared, silently giving nonsensical results.  If you must modify an
+   existing benchmark, rename it.
+
+3. Test common cases  
+   We are interested in overall performance, rather than specific edge-cases;
+   use **reproducibly-randomised inputs**, sampling from either all possible
+   input values or some subset of interest.
+
+4. Use [`criterion`], `criterion::black_box`, ...  
+   `criterion` isn't perfect, but it is also much better than ad-hoc
+   solutions in each benchmark.
+
+
+## Wishlist
+
+### Configurable statistical estimators
+
+`criterion` always uses the arithmetic average as estimator; in µbenchmarks,
+where the code under test is fully deterministic and the measurements are
+subject to additive, positive noise, [the minimum is more appropriate][lemire].
+
+
+### CI & reproducible performance testing
+
+Measuring performance on real hardware is important, as it relates directly
+to what users of `factor` experience; however, such measurements are subject
+to the constraints of the real world, and aren't perfectly reproducible.
+Moreover, the mitigations for it (described above) aren't achievable in
+virtualized, multi-tenant environments such as CI.
+
+Instead, we could run the µbenchmarks in a simulated CPU with [`cachegrind`],
+measure execution “time” in that model (in CI), and use it to detect and report
+performance improvements and regressions.
+
+[`iai`] is an implementation of this idea for Rust.
+
+[`cachegrind`]: https://www.valgrind.org/docs/manual/cg-manual.html
+[`iai`]: https://bheisler.github.io/criterion.rs/book/iai/iai.html
+
+
+### Comparing randomised implementations across multiple inputs
+
+`factor` is a challenging target for system benchmarks as it combines two
+characteristics:
+
+1. integer factoring algorithms are randomised, with large variance in
+   execution time ;
+
+2. various inputs also have large differences in factoring time, which
+   correspond to no natural, linear ordering of the inputs.
+
+
+If (1) were untrue (i.e. if execution time weren't random), we could faithfully
+compare 2 implementations (2 successive versions, or `uutils` and GNU) using
+a scatter plot, where each axis corresponds to the perf. of one implementation.
+
+Similarly, without (2) we could plot numbers on the X axis and their factoring
+time on the Y axis, using multiple lines for various quantiles.  The large
+differences in factoring times for successive numbers mean that such a plot
+would be unreadable.
--- a/src/uu/factor/Cargo.toml
+++ b/src/uu/factor/Cargo.toml
@ -17,20 +17,15 @@ num-traits = "0.2.13" # used in src/numerics.rs, which is included by build.rs
 [dependencies]
 coz = { version = "0.1.3", optional = true }
 num-traits = "0.2.13" # Needs at least version 0.2.13 for "OverflowingAdd"
-rand = { version="0.7", features=["small_rng"] }
-smallvec = { version="0.6.14, < 1.0" }
-uucore = { version=">=0.0.8", package="uucore", path="../../uucore" }
-uucore_procs = { version=">=0.0.5", package="uucore_procs", path="../../uucore_procs" }
+rand = { version = "0.7", features = ["small_rng"] }
+smallvec = { version = "0.6.14, < 1.0" }
+uucore = { version = ">=0.0.8", package = "uucore", path = "../../uucore" }
+uucore_procs = { version = ">=0.0.5", package = "uucore_procs", path = "../../uucore_procs" }

 [dev-dependencies]
-criterion = "0.3"
 paste = "0.1.18"
 quickcheck = "0.9.2"
-rand_chacha = "0.2.2"

-[[bench]]
-name = "gcd"
-harness = false

 [[bin]]
 name = "factor"
--- a/src/uu/factor/src/cli.rs
+++ b/src/uu/factor/src/cli.rs
@ -13,13 +13,13 @@ use std::error::Error;
 use std::io::{self, stdin, stdout, BufRead, Write};

 mod factor;
-pub(crate) use factor::*;
+pub use factor::*;
 use uucore::InvalidEncodingHandling;

 mod miller_rabin;
 pub mod numeric;
 mod rho;
-mod table;
+pub mod table;

 static SYNTAX: &str = "[OPTION] [NUMBER]...";
 static SUMMARY: &str = "Print the prime factors of the given number(s).
--- a/src/uu/factor/src/factor.rs
+++ b/src/uu/factor/src/factor.rs
@ -161,7 +161,7 @@ pub fn factor(mut n: u64) -> Factors {
        return factors;
    }

-    let (factors, n) = table::factor(n, factors);
+    table::factor(&mut n, &mut factors);

    #[allow(clippy::let_and_return)]
    let r = if n < (1 << 32) {
@ -239,9 +239,13 @@ mod tests {
 }

 #[cfg(test)]
-impl quickcheck::Arbitrary for Factors {
-    fn arbitrary<G: quickcheck::Gen>(gen: &mut G) -> Self {
-        use rand::Rng;
+use rand::{
+    distributions::{Distribution, Standard},
+    Rng,
+};
+#[cfg(test)]
+impl Distribution<Factors> for Standard {
+    fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> Factors {
        let mut f = Factors::one();
        let mut g = 1u64;
        let mut n = u64::MAX;
@ -252,7 +256,7 @@ impl quickcheck::Arbitrary for Factors {
        // See Generating Random Factored Numbers, Easily, J. Cryptology (2003)
        'attempt: loop {
            while n > 1 {
-                n = gen.gen_range(1, n);
+                n = rng.gen_range(1, n);
                if miller_rabin::is_prime(n) {
                    if let Some(h) = g.checked_mul(n) {
                        f.push(n);
@ -269,6 +273,13 @@ impl quickcheck::Arbitrary for Factors {
    }
 }

+#[cfg(test)]
+impl quickcheck::Arbitrary for Factors {
+    // Random `Factors` for property tests: the value is drawn through
+    // `Rng::gen`, using quickcheck's generator as the randomness source.
+    fn arbitrary<G: quickcheck::Gen>(source: &mut G) -> Self {
+        source.gen::<Self>()
+    }
+}
+
 #[cfg(test)]
 impl std::ops::BitXor<Exponent> for Factors {
    type Output = Self;
--- a/src/uu/factor/src/table.rs
+++ b/src/uu/factor/src/table.rs
@ -8,15 +8,13 @@

 // spell-checker: ignore (ToDO) INVS

-use std::num::Wrapping;
-
 use crate::Factors;

 include!(concat!(env!("OUT_DIR"), "/prime_table.rs"));

-pub(crate) fn factor(mut num: u64, mut factors: Factors) -> (Factors, u64) {
+pub fn factor(num: &mut u64, factors: &mut Factors) {
    for &(prime, inv, ceil) in P_INVS_U64 {
-        if num == 1 {
+        if *num == 1 {
            break;
        }

@ -27,11 +25,11 @@ pub(crate) fn factor(mut num: u64, mut factors: Factors) -> (Factors, u64) {
        // for a nice explanation.
        let mut k = 0;
        loop {
-            let Wrapping(x) = Wrapping(num) * Wrapping(inv);
+            let x = num.wrapping_mul(inv);

            // While prime divides num
            if x <= ceil {
-                num = x;
+                *num = x;
                k += 1;
                #[cfg(feature = "coz")]
                coz::progress!("factor found");
@ -43,6 +41,61 @@ pub(crate) fn factor(mut num: u64, mut factors: Factors) -> (Factors, u64) {
            }
        }
    }
-
-    (factors, num)
+}
+
+pub const CHUNK_SIZE: usize = 8;
+/// Batched counterpart of [`factor`]: strips the table primes out of
+/// `CHUNK_SIZE` numbers in a single pass over the prime table.
+///
+/// For every entry of `P_INVS_U64`, each `n_s[i]` that is not already `1` is
+/// divided by the entry's prime for as long as that prime divides it, and the
+/// prime is recorded in `f_s[i]` together with the number of divisions.  On
+/// return, `n_s[i]` holds whatever is left of the original number.
+///
+/// Each `(n_s[i], f_s[i])` pair is meant to end up exactly as if [`factor`]
+/// had been called on it alone (this is what the `chunk_vs_iter` test checks).
+pub fn factor_chunk(n_s: &mut [u64; CHUNK_SIZE], f_s: &mut [Factors; CHUNK_SIZE]) {
+    for &(prime, inv, ceil) in P_INVS_U64 {
+        // Stop early only once *every* number of the chunk is fully factored;
+        // looking at a fixed prefix would leave the remaining numbers
+        // partially factored whenever that prefix finishes first.
+        if n_s.iter().all(|&n| n == 1) {
+            break;
+        }
+
+        for (num, factors) in n_s.iter_mut().zip(f_s.iter_mut()) {
+            if *num == 1 {
+                continue;
+            }
+            // NOTE(review): `inv` and `ceil` are precomputed per prime;
+            // presumably the inverse of `prime` modulo 2^64 and
+            // `u64::MAX / prime` -- confirm against the table generator.
+            let mut k = 0;
+            loop {
+                let x = num.wrapping_mul(inv);
+
+                // While prime divides num
+                if x <= ceil {
+                    *num = x;
+                    k += 1;
+                } else {
+                    // Record the prime only if it divided `num` at least once.
+                    if k > 0 {
+                        factors.add(prime, k);
+                    }
+                    break;
+                }
+            }
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::Factors;
+    use quickcheck::quickcheck;
+    use rand::{rngs::SmallRng, Rng, SeedableRng};
+
+    quickcheck! {
+        // Property: factoring a chunk in one batch gives the same remaining
+        // numbers and the same factors as factoring its elements one by one.
+        fn chunk_vs_iter(seed: u64) -> () {
+            let mut rng = SmallRng::seed_from_u64(seed);
+            let numbers: [u64; CHUNK_SIZE] = rng.gen();
+            let partial: [Factors; CHUNK_SIZE] = rng.gen();
+
+            // Reference result: the one-number-at-a-time implementation.
+            let mut expected_n = numbers.clone();
+            let mut expected_f = partial.clone();
+            expected_n
+                .iter_mut()
+                .zip(expected_f.iter_mut())
+                .for_each(|(n, f)| factor(n, f));
+
+            // Result under test: the batched implementation, same inputs.
+            let (mut chunk_n, mut chunk_f) = (numbers, partial);
+            factor_chunk(&mut chunk_n, &mut chunk_f);
+
+            assert_eq!(expected_n, chunk_n);
+            assert_eq!(expected_f, chunk_f);
+        }
+    }
 }
--- a/tests/benches/factor/Cargo.toml
+++ b/tests/benches/factor/Cargo.toml
@ -0,0 +1,26 @@
+[package]
+name = "uu_factor_benches"
+version = "0.0.0"
+authors = ["nicoo <nicoo@debian.org>"]
+license = "MIT"
+description = "Benchmarks for the uu_factor integer factorization tool"
+homepage = "https://github.com/uutils/coreutils"
+edition = "2018"
+
+[dependencies]
+uu_factor = { path = "../../../src/uu/factor" }
+
+[dev-dependencies]
+array-init = "2.0.0"
+criterion = "0.3"
+rand = "0.7"
+rand_chacha = "0.2.2"
+
+
+[[bench]]
+name = "gcd"
+harness = false
+
+[[bench]]
+name = "table"
+harness = false
--- a/tests/benches/factor/benches/gcd.rs
+++ b/tests/benches/factor/benches/gcd.rs
--- a/tests/benches/factor/benches/table.rs
+++ b/tests/benches/factor/benches/table.rs
@ -0,0 +1,78 @@
+use array_init::array_init;
+use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
+use std::convert::TryInto;
+use uu_factor::{table::*, Factors};
+
+fn table(c: &mut Criterion) { // benchmarks `factor_chunk` against per-number `factor` on identical inputs
+    #[cfg(target_os = "linux")]
+    check_personality(); // prints a warning when ASLR is enabled
+
+    const INPUT_SIZE: usize = 128; // how many numbers a single benchmark iteration factors
+    assert!(
+        INPUT_SIZE % CHUNK_SIZE == 0,
+        "INPUT_SIZE ({}) is not divisible by CHUNK_SIZE ({})",
+        INPUT_SIZE,
+        CHUNK_SIZE
+    );
+    let inputs = {
+        // Deterministic RNG; use an explicitly-named RNG to guarantee stability
+        use rand::{RngCore, SeedableRng};
+        use rand_chacha::ChaCha8Rng;
+        const SEED: u64 = 0xdead_bebe_ea75_cafe; // fixed seed, so every run draws the same inputs
+        let mut rng = ChaCha8Rng::seed_from_u64(SEED);
+
+        std::iter::repeat_with(move || array_init::<_, _, INPUT_SIZE>(|_| rng.next_u64()))
+    };
+
+    let mut group = c.benchmark_group("table");
+    group.throughput(Throughput::Elements(INPUT_SIZE as _)); // throughput is counted in factored numbers
+    for a in inputs.take(10) { // ten input arrays, each measured with both implementations
+        let a_str = format!("{:?}", a); // the Debug dump of the whole input is the benchmark parameter
+        group.bench_with_input(BenchmarkId::new("factor_chunk", &a_str), &a, |b, &a| {
+            b.iter(|| {
+                let mut n_s = a.clone(); // fresh copy: factoring overwrites the numbers in place
+                let mut f_s: [_; INPUT_SIZE] = array_init(|_| Factors::one());
+                for (n_s, f_s) in n_s.chunks_mut(CHUNK_SIZE).zip(f_s.chunks_mut(CHUNK_SIZE)) {
+                    factor_chunk(n_s.try_into().unwrap(), f_s.try_into().unwrap())
+                }
+            })
+        });
+        group.bench_with_input(BenchmarkId::new("factor", &a_str), &a, |b, &a| {
+            b.iter(|| {
+                let mut n_s = a.clone(); // same setup as above, so only the factoring call differs
+                let mut f_s: [_; INPUT_SIZE] = array_init(|_| Factors::one());
+                for (n, f) in n_s.iter_mut().zip(f_s.iter_mut()) {
+                    factor(n, f)
+                }
+            })
+        });
+    }
+    group.finish()
+}
+
+/// Warns on stderr when the benchmark runs with ASLR enabled.
+///
+/// Reads `/proc/self/personality`, parses its content as a hexadecimal
+/// bitmask and checks the `ADDR_NO_RANDOMIZE` bit.  When the bit is clear,
+/// a warning is printed, since results might then not be reproducible across
+/// executions.  Nothing is printed when the bit is set.
+///
+/// # Panics
+///
+/// Panics if the file cannot be read, or if its content (ignoring trailing
+/// whitespace) is not a hexadecimal number.
+#[cfg(target_os = "linux")]
+fn check_personality() {
+    use std::fs;
+    const ADDR_NO_RANDOMIZE: u64 = 0x0040000;
+    const PERSONALITY_PATH: &str = "/proc/self/personality";
+
+    // Build the panic messages lazily: they are only needed on failure.
+    let raw = fs::read_to_string(PERSONALITY_PATH)
+        .unwrap_or_else(|err| panic!("Couldn't read '{}': {:?}", PERSONALITY_PATH, err));
+    // Tolerate a missing line terminator instead of panicking on it.
+    let p_string = raw.trim_end();
+
+    let personality = u64::from_str_radix(p_string, 16).unwrap_or_else(|err| {
+        panic!(
+            "Expected a hex value for personality, got '{:?}': {:?}",
+            p_string, err
+        )
+    });
+    if personality & ADDR_NO_RANDOMIZE == 0 {
+        eprintln!(
+            "WARNING: Benchmarking with ASLR enabled (personality is {:x}), results might not be reproducible.",
+            personality
+        );
+    }
+}
+
+criterion_group!(benches, table);
+criterion_main!(benches);