diff --git a/CHANGELOG.md b/CHANGELOG.md
index 343350a..949a196 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -44,6 +44,44 @@ Types of changes
     1.07x faster,
     from 2m44s to 2m33s
 
+- After profiling the code as suggested by
+  [nnethercote's perf-book](https://nnethercote.github.io/perf-book/profiling.html)
+  one critical path of Alejandra was identified and optimized,
+  yielding huge performance boosts:
+
+  - x86_64-unknown-linux-gnu, 2.5x faster,
+    from 0m8.381s to 0m3.410s
+
+  - x86_64-unknown-linux-musl, 2.3x faster,
+    from 0m9.642s to 0m4.134s
+
+  - [On QEMU](https://www.qemu.org/) aarch64-unknown-linux-musl,
+    2.4x faster,
+    from 1m10s to 0m29s
+
+  - [On QEMU](https://www.qemu.org/) armv6l-unknown-linux-musleabihf,
+    1.85x faster,
+    from 7m41s to 4m8.399s
+
+  - [On QEMU](https://www.qemu.org/) armv7l-unknown-linux-musleabihf,
+    1.88x faster,
+    from 5m7s to 2m42.595s
+
+  - [On QEMU](https://www.qemu.org/) i686-unknown-linux-musl,
+    1.65x faster,
+    from 2m33s to 1m32.671s
+
+  In general this is an algorithmic improvement
+  and therefore the following platforms should be faster as well
+  by a similar ratio
+  (not measured):
+
+  - aarch64-apple-darwin
+  - x86_64-apple-darwin
+
+- A `--threads` flag, so you can pick how many formatting threads to spawn.
+  Defaults to the number of logical CPUs in your system.
+
 ## [0.6.0] - 2022-02-25
 
 ### Added
diff --git a/README.md b/README.md
index 60f51d8..ac3897d 100644
--- a/README.md
+++ b/README.md
@@ -207,11 +207,11 @@ Please see: [CHANGELOG.md](./CHANGELOG.md).
 
     | Logical Cores | Seconds |
     | :-----------: | :-----: |
-    |       1       |   35    |
-    |       2       |   18    |
-    |       4       |   10    |
-    |       8       |   10    |
-    |      16       |   10    |
+    |       1       |  15.1   |
+    |       2       |   7.9   |
+    |       4       |   5.4   |
+    |       8       |   4.1   |
+    |      16       |   3.6   |
 
 [^semantic-changes]: The methodology to claim this is:
 
diff --git a/src/alejandra_cli/src/cli.rs b/src/alejandra_cli/src/cli.rs
index f170b9f..ee8522e 100644
--- a/src/alejandra_cli/src/cli.rs
+++ b/src/alejandra_cli/src/cli.rs
@@ -24,7 +24,19 @@ pub(crate) fn parse(args: Vec<String>) -> clap::ArgMatches {
         .arg(
             clap::Arg::new("check")
                 .help("Check if the input is already formatted.")
-                .long("--check"),
+                .long("--check")
+                .short('c'),
+        )
+        .arg(
+            clap::Arg::new("threads")
+                .default_value("0")
+                .help(
+                    "Number of formatting threads to spawn. Defaults to the \
+                     number of logical CPUs.",
+                )
+                .long("--threads")
+                .short('t')
+                .takes_value(true),
         )
         .term_width(80)
         .after_help(indoc::indoc!(
@@ -310,6 +322,13 @@ pub fn main() -> std::io::Result<()> {
     let matches = crate::cli::parse(std::env::args().collect());
 
     let check = matches.is_present("check");
+    let threads = matches.value_of("threads").unwrap();
+    let threads: usize = threads.parse().unwrap();
+
+    rayon::ThreadPoolBuilder::new()
+        .num_threads(threads)
+        .build_global()
+        .unwrap();
 
     let formatted_paths = match matches.values_of("include") {
         Some(include) => {