From 3a5b31a30f2cc5ab933568a3be28165f786240ec Mon Sep 17 00:00:00 2001
From: Jeffrey Finkelstein <jeffrey.finkelstein@protonmail.com>
Date: Mon, 13 Jun 2022 22:00:58 -0400
Subject: [PATCH] dd: correct rendering of SI and IEC byte counts

Adjust the rendering of the concise byte counts in both SI and IEC
units to better match the behavior of GNU dd.

Before this commit,

    $ head -c 1024 /dev/zero | dd > /dev/null
    2+0 records in
    2+0 records out
    1024 bytes (1 KB, 1024 B) copied, 0.0 s, 1.0 MB/s

After this commit,

    $ head -c 1024 /dev/zero | dd > /dev/null
    2+0 records in
    2+0 records out
    1024 bytes (1.0 kB, 1.0 KiB) copied, 0.0 s, 1.0 MB/s

For comparison, GNU dd produces the following:

    $ head -c 1024 /dev/zero | dd > /dev/null
    2+0 records in
    2+0 records out
    1024 bytes (1.0 kB, 1.0 KiB) copied, 0.000332864 s, 3.1 MB/s
---
 Cargo.lock                |  17 -----
 src/uu/dd/Cargo.toml      |   1 -
 src/uu/dd/src/dd.rs       |   2 +
 src/uu/dd/src/numbers.rs  | 157 ++++++++++++++++++++++++++++++++++++++
 src/uu/dd/src/progress.rs |  47 +++++-------
 tests/by-util/test_dd.rs  |  33 ++++++--
 6 files changed, 207 insertions(+), 50 deletions(-)
 create mode 100644 src/uu/dd/src/numbers.rs

diff --git a/Cargo.lock b/Cargo.lock
index 815d1a953..3904277da 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -178,16 +178,6 @@ version = "3.11.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "572f695136211188308f16ad2ca5c851a712c464060ae6974944458eb83880ba"
 
-[[package]]
-name = "byte-unit"
-version = "4.0.17"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "581ad4b3d627b0c09a0ccb2912148f839acaca0b93cf54cbe42b6c674e86079c"
-dependencies = [
- "serde",
- "utf8-width",
-]
-
 [[package]]
 name = "bytecount"
 version = "0.6.3"
@@ -2184,12 +2174,6 @@ version = "0.7.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9"
 
-[[package]]
-name = "utf8-width"
-version = "0.1.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5190c9442dcdaf0ddd50f37420417d219ae5261bbf5db120d0f9bab996c9cba1"
-
 [[package]]
 name = "uu_arch"
 version = "0.0.16"
@@ -2356,7 +2340,6 @@ dependencies = [
 name = "uu_dd"
 version = "0.0.16"
 dependencies = [
- "byte-unit",
  "clap",
  "gcd",
  "libc",
diff --git a/src/uu/dd/Cargo.toml b/src/uu/dd/Cargo.toml
index 871f8d806..89e28e138 100644
--- a/src/uu/dd/Cargo.toml
+++ b/src/uu/dd/Cargo.toml
@@ -15,7 +15,6 @@ edition = "2021"
 path = "src/dd.rs"
 
 [dependencies]
-byte-unit = "4.0"
 clap = { version = "4.0", features = ["wrap_help", "cargo"] }
 gcd = "2.0"
 libc = "0.2"
diff --git a/src/uu/dd/src/dd.rs b/src/uu/dd/src/dd.rs
index 206f4480e..e71ff21b0 100644
--- a/src/uu/dd/src/dd.rs
+++ b/src/uu/dd/src/dd.rs
@@ -21,6 +21,8 @@ use progress::{gen_prog_updater, ProgUpdate, ReadStat, StatusLevel, WriteStat};
 mod blocks;
 use blocks::conv_block_unblock_helper;
 
+mod numbers;
+
 use std::cmp;
 use std::env;
 use std::ffi::OsString;
diff --git a/src/uu/dd/src/numbers.rs b/src/uu/dd/src/numbers.rs
new file mode 100644
index 000000000..1a32180b7
--- /dev/null
+++ b/src/uu/dd/src/numbers.rs
@@ -0,0 +1,157 @@
+//  * This file is part of the uutils coreutils package.
+//  *
+//  * For the full copyright and license information, please view the LICENSE
+//  * file that was distributed with this source code.
+/// Functions for formatting a number as a magnitude and a unit suffix.
+
+/// The first ten powers of 1024.
+const IEC_BASES: [u128; 10] = [
+    1,
+    1_024,
+    1_048_576,
+    1_073_741_824,
+    1_099_511_627_776,
+    1_125_899_906_842_624,
+    1_152_921_504_606_846_976,
+    1_180_591_620_717_411_303_424,
+    1_208_925_819_614_629_174_706_176,
+    1_237_940_039_285_380_274_899_124_224,
+];
+
+const IEC_SUFFIXES: [&str; 9] = ["B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB", "ZiB", "YiB"];
+
+/// The first ten powers of 1000.
+const SI_BASES: [u128; 10] = [
+    1,
+    1_000,
+    1_000_000,
+    1_000_000_000,
+    1_000_000_000_000,
+    1_000_000_000_000_000,
+    1_000_000_000_000_000_000,
+    1_000_000_000_000_000_000_000,
+    1_000_000_000_000_000_000_000_000,
+    1_000_000_000_000_000_000_000_000_000,
+];
+
+const SI_SUFFIXES: [&str; 9] = ["B", "kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB"];
+
+/// A SuffixType determines whether the suffixes are 1000 or 1024 based.
+#[derive(Clone, Copy)]
+pub(crate) enum SuffixType {
+    Iec,
+    Si,
+}
+
+impl SuffixType {
+    fn base_and_suffix(&self, n: u128) -> (u128, &'static str) {
+        let (bases, suffixes) = match self {
+            Self::Iec => (IEC_BASES, IEC_SUFFIXES),
+            Self::Si => (SI_BASES, SI_SUFFIXES),
+        };
+        let mut i = 0;
+        while bases[i + 1] - bases[i] < n && i < suffixes.len() {
+            i += 1;
+        }
+        (bases[i], suffixes[i])
+    }
+}
+
+/// Convert a number into a magnitude and a multi-byte unit suffix.
+///
+/// The returned string has a maximum length of 5 chars, for example: "1.1kB", "999kB", "1MB".
+pub(crate) fn to_magnitude_and_suffix(n: u128, suffix_type: SuffixType) -> String {
+    let (base, suffix) = suffix_type.base_and_suffix(n);
+    // TODO To match dd on my machine, we would need to round like
+    // this:
+    //
+    // 1049 => 1.0 kB
+    // 1050 => 1.0 kB  # why is this different?
+    // 1051 => 1.1 kB
+    // ...
+    // 1149 => 1.1 kB
+    // 1150 => 1.2 kB
+    // ...
+    // 1250 => 1.2 kB
+    // 1251 => 1.3 kB
+    // ..
+    // 10500 => 10 kB
+    // 10501 => 11 kB
+    //
+    let quotient = (n as f64) / (base as f64);
+    if quotient < 10.0 {
+        format!("{:.1} {}", quotient, suffix)
+    } else {
+        format!("{} {}", quotient.round(), suffix)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+
+    use crate::numbers::{to_magnitude_and_suffix, SuffixType};
+
+    #[test]
+    fn test_to_magnitude_and_suffix_powers_of_1024() {
+        assert_eq!(to_magnitude_and_suffix(1024, SuffixType::Iec), "1.0 KiB");
+        assert_eq!(to_magnitude_and_suffix(2048, SuffixType::Iec), "2.0 KiB");
+        assert_eq!(to_magnitude_and_suffix(4096, SuffixType::Iec), "4.0 KiB");
+        assert_eq!(
+            to_magnitude_and_suffix(1024 * 1024, SuffixType::Iec),
+            "1.0 MiB"
+        );
+        assert_eq!(
+            to_magnitude_and_suffix(2 * 1024 * 1024, SuffixType::Iec),
+            "2.0 MiB"
+        );
+        assert_eq!(
+            to_magnitude_and_suffix(1024 * 1024 * 1024, SuffixType::Iec),
+            "1.0 GiB"
+        );
+        assert_eq!(
+            to_magnitude_and_suffix(34 * 1024 * 1024 * 1024, SuffixType::Iec),
+            "34 GiB"
+        );
+    }
+
+    #[test]
+    fn test_to_magnitude_and_suffix_not_powers_of_1024() {
+        assert_eq!(to_magnitude_and_suffix(1, SuffixType::Si), "1.0 B");
+        assert_eq!(to_magnitude_and_suffix(999, SuffixType::Si), "999 B");
+
+        assert_eq!(to_magnitude_and_suffix(1000, SuffixType::Si), "1.0 kB");
+        assert_eq!(to_magnitude_and_suffix(1001, SuffixType::Si), "1.0 kB");
+        assert_eq!(to_magnitude_and_suffix(1023, SuffixType::Si), "1.0 kB");
+        assert_eq!(to_magnitude_and_suffix(1025, SuffixType::Si), "1.0 kB");
+        assert_eq!(to_magnitude_and_suffix(10_001, SuffixType::Si), "10 kB");
+        assert_eq!(to_magnitude_and_suffix(999_000, SuffixType::Si), "999 kB");
+
+        assert_eq!(to_magnitude_and_suffix(999_001, SuffixType::Si), "1.0 MB");
+        assert_eq!(to_magnitude_and_suffix(999_999, SuffixType::Si), "1.0 MB");
+        assert_eq!(to_magnitude_and_suffix(1_000_000, SuffixType::Si), "1.0 MB");
+        assert_eq!(to_magnitude_and_suffix(1_000_001, SuffixType::Si), "1.0 MB");
+        assert_eq!(to_magnitude_and_suffix(1_100_000, SuffixType::Si), "1.1 MB");
+        assert_eq!(to_magnitude_and_suffix(1_100_001, SuffixType::Si), "1.1 MB");
+        assert_eq!(to_magnitude_and_suffix(1_900_000, SuffixType::Si), "1.9 MB");
+        assert_eq!(to_magnitude_and_suffix(1_900_001, SuffixType::Si), "1.9 MB");
+        assert_eq!(to_magnitude_and_suffix(9_900_000, SuffixType::Si), "9.9 MB");
+        assert_eq!(to_magnitude_and_suffix(9_900_001, SuffixType::Si), "9.9 MB");
+        assert_eq!(
+            to_magnitude_and_suffix(999_000_000, SuffixType::Si),
+            "999 MB"
+        );
+
+        assert_eq!(
+            to_magnitude_and_suffix(999_000_001, SuffixType::Si),
+            "1.0 GB"
+        );
+        assert_eq!(
+            to_magnitude_and_suffix(1_000_000_000, SuffixType::Si),
+            "1.0 GB"
+        );
+        assert_eq!(
+            to_magnitude_and_suffix(1_000_000_001, SuffixType::Si),
+            "1.0 GB"
+        );
+    }
+}
diff --git a/src/uu/dd/src/progress.rs b/src/uu/dd/src/progress.rs
index 312d8f096..440b2493c 100644
--- a/src/uu/dd/src/progress.rs
+++ b/src/uu/dd/src/progress.rs
@@ -13,7 +13,7 @@ use std::io::Write;
 use std::sync::mpsc;
 use std::time::Duration;
 
-use byte_unit::Byte;
+use crate::numbers::{to_magnitude_and_suffix, SuffixType};
 
 // On Linux, we register a signal handler that prints progress updates.
 #[cfg(target_os = "linux")]
@@ -129,22 +129,19 @@ impl ProgUpdate {
     /// let mut cursor = Cursor::new(vec![]);
     /// let rewrite = false;
     /// prog_update.write_prog_line(&mut cursor, rewrite).unwrap();
-    /// assert_eq!(cursor.get_ref(), b"0 bytes copied, 1.0 s, 0 B/s\n");
+    /// assert_eq!(cursor.get_ref(), b"0 bytes copied, 1.0 s, 0.0 B/s\n");
     /// ```
     fn write_prog_line(&self, w: &mut impl Write, rewrite: bool) -> std::io::Result<()> {
-        let btotal_metric = Byte::from_bytes(self.write_stat.bytes_total)
-            .get_appropriate_unit(false)
-            .format(0);
-        let btotal_bin = Byte::from_bytes(self.write_stat.bytes_total)
-            .get_appropriate_unit(true)
-            .format(0);
-        let safe_millis = std::cmp::max(1, self.duration.as_millis());
-        let transfer_rate = Byte::from_bytes(1000 * (self.write_stat.bytes_total / safe_millis))
-            .get_appropriate_unit(false)
-            .format(1);
-
+        // The total number of bytes written as a string, in SI and IEC format.
         let btotal = self.write_stat.bytes_total;
+        let btotal_metric = to_magnitude_and_suffix(btotal, SuffixType::Si);
+        let btotal_bin = to_magnitude_and_suffix(btotal, SuffixType::Iec);
+
+        // Compute the throughput (bytes per second) as a string.
         let duration = self.duration.as_secs_f64();
+        let safe_millis = std::cmp::max(1, self.duration.as_millis());
+        let rate = 1000 * (btotal / safe_millis);
+        let transfer_rate = to_magnitude_and_suffix(rate, SuffixType::Si);
 
         // If we are rewriting the progress line, do write a carriage
         // return (`\r`) at the beginning and don't write a newline
@@ -206,7 +203,7 @@ impl ProgUpdate {
     /// let mut iter = cursor.get_ref().split(|v| *v == b'\n');
     /// assert_eq!(iter.next().unwrap(), b"0+0 records in");
     /// assert_eq!(iter.next().unwrap(), b"0+0 records out");
-    /// assert_eq!(iter.next().unwrap(), b"0 bytes copied, 1.0 s, 0 B/s");
+    /// assert_eq!(iter.next().unwrap(), b"0 bytes copied, 1.0 s, 0.0 B/s");
     /// assert_eq!(iter.next().unwrap(), b"");
     /// assert!(iter.next().is_none());
     /// ```
@@ -564,22 +561,20 @@ mod tests {
         //     $ : | dd
         //     0 bytes copied, 7.9151e-05 s, 0.0 kB/s
         //
-        // The throughput still does not match GNU dd. For the ones
-        // that include the concise byte counts, the format is also
-        // not right.
-        assert_eq!(cursor.get_ref(), b"0 bytes copied, 1.0 s, 0 B/s\n");
+        // The throughput still does not match GNU dd.
+        assert_eq!(cursor.get_ref(), b"0 bytes copied, 1.0 s, 0.0 B/s\n");
 
         let prog_update = prog_update_write(999);
         let mut cursor = Cursor::new(vec![]);
         prog_update.write_prog_line(&mut cursor, rewrite).unwrap();
-        assert_eq!(cursor.get_ref(), b"999 bytes copied, 1.0 s, 0 B/s\n");
+        assert_eq!(cursor.get_ref(), b"999 bytes copied, 1.0 s, 0.0 B/s\n");
 
         let prog_update = prog_update_write(1000);
         let mut cursor = Cursor::new(vec![]);
         prog_update.write_prog_line(&mut cursor, rewrite).unwrap();
         assert_eq!(
             cursor.get_ref(),
-            b"1000 bytes (1000 B) copied, 1.0 s, 1000 B/s\n"
+            b"1000 bytes (1.0 kB) copied, 1.0 s, 1.0 kB/s\n"
         );
 
         let prog_update = prog_update_write(1023);
@@ -587,15 +582,15 @@ mod tests {
         prog_update.write_prog_line(&mut cursor, rewrite).unwrap();
         assert_eq!(
             cursor.get_ref(),
-            b"1023 bytes (1 KB) copied, 1.0 s, 1000 B/s\n"
+            b"1023 bytes (1.0 kB) copied, 1.0 s, 1.0 kB/s\n"
         );
 
         let prog_update = prog_update_write(1024);
         let mut cursor = Cursor::new(vec![]);
         prog_update.write_prog_line(&mut cursor, rewrite).unwrap();
         assert_eq!(
-            std::str::from_utf8(cursor.get_ref()).unwrap(),
-            "1024 bytes (1 KB, 1024 B) copied, 1.0 s, 1000 B/s\n"
+            cursor.get_ref(),
+            b"1024 bytes (1.0 kB, 1.0 KiB) copied, 1.0 s, 1.0 kB/s\n"
         );
     }
 
@@ -614,7 +609,7 @@ mod tests {
         let mut iter = cursor.get_ref().split(|v| *v == b'\n');
         assert_eq!(iter.next().unwrap(), b"0+0 records in");
         assert_eq!(iter.next().unwrap(), b"0+0 records out");
-        assert_eq!(iter.next().unwrap(), b"0 bytes copied, 1.0 s, 0 B/s");
+        assert_eq!(iter.next().unwrap(), b"0 bytes copied, 1.0 s, 0.0 B/s");
         assert_eq!(iter.next().unwrap(), b"");
         assert!(iter.next().is_none());
     }
@@ -633,10 +628,10 @@ mod tests {
         prog_update.write_prog_line(&mut cursor, rewrite).unwrap();
         prog_update.write_transfer_stats(&mut cursor, true).unwrap();
         let mut iter = cursor.get_ref().split(|v| *v == b'\n');
-        assert_eq!(iter.next().unwrap(), b"\r0 bytes copied, 1.0 s, 0 B/s");
+        assert_eq!(iter.next().unwrap(), b"\r0 bytes copied, 1.0 s, 0.0 B/s");
         assert_eq!(iter.next().unwrap(), b"0+0 records in");
         assert_eq!(iter.next().unwrap(), b"0+0 records out");
-        assert_eq!(iter.next().unwrap(), b"0 bytes copied, 1.0 s, 0 B/s");
+        assert_eq!(iter.next().unwrap(), b"0 bytes copied, 1.0 s, 0.0 B/s");
         assert_eq!(iter.next().unwrap(), b"");
         assert!(iter.next().is_none());
     }
diff --git a/tests/by-util/test_dd.rs b/tests/by-util/test_dd.rs
index b7309dfce..17cece8e6 100644
--- a/tests/by-util/test_dd.rs
+++ b/tests/by-util/test_dd.rs
@@ -259,7 +259,7 @@ fn test_final_stats_noxfer() {
 fn test_final_stats_unspec() {
     new_ucmd!()
         .run()
-        .stderr_only("0+0 records in\n0+0 records out\n0 bytes copied, 0.0 s, 0 B/s\n")
+        .stderr_only("0+0 records in\n0+0 records out\n0 bytes copied, 0.0 s, 0.0 B/s\n")
         .success();
 }
 
@@ -375,7 +375,7 @@ fn test_null_stats() {
     new_ucmd!()
         .args(&["if=null.txt"])
         .run()
-        .stderr_only("0+0 records in\n0+0 records out\n0 bytes copied, 0.0 s, 0 B/s\n")
+        .stderr_only("0+0 records in\n0+0 records out\n0 bytes copied, 0.0 s, 0.0 B/s\n")
         .success();
 }
 
@@ -1239,22 +1239,43 @@ fn test_bytes_oseek_seek_not_additive() {
 }
 
 #[test]
-fn test_final_stats_si_iec() {
+fn test_final_stats_less_than_one_kb_si() {
     let result = new_ucmd!().pipe_in("0".repeat(999)).succeeds();
     let s = result.stderr_str();
     assert!(s.starts_with("1+1 records in\n1+1 records out\n999 bytes copied,"));
+}
 
+#[test]
+fn test_final_stats_less_than_one_kb_iec() {
     let result = new_ucmd!().pipe_in("0".repeat(1000)).succeeds();
     let s = result.stderr_str();
-    assert!(s.starts_with("1+1 records in\n1+1 records out\n1000 bytes (1000 B) copied,"));
+    assert!(s.starts_with("1+1 records in\n1+1 records out\n1000 bytes (1.0 kB) copied,"));
 
     let result = new_ucmd!().pipe_in("0".repeat(1023)).succeeds();
     let s = result.stderr_str();
-    assert!(s.starts_with("1+1 records in\n1+1 records out\n1023 bytes (1 KB) copied,"));
+    assert!(s.starts_with("1+1 records in\n1+1 records out\n1023 bytes (1.0 kB) copied,"));
+}
 
+#[test]
+fn test_final_stats_more_than_one_kb() {
     let result = new_ucmd!().pipe_in("0".repeat(1024)).succeeds();
     let s = result.stderr_str();
-    assert!(s.starts_with("2+0 records in\n2+0 records out\n1024 bytes (1 KB, 1024 B) copied,"));
+    assert!(s.starts_with("2+0 records in\n2+0 records out\n1024 bytes (1.0 kB, 1.0 KiB) copied,"));
+}
+
+#[test]
+fn test_final_stats_three_char_limit() {
+    let result = new_ucmd!().pipe_in("0".repeat(10_000)).succeeds();
+    let s = result.stderr_str();
+    assert!(
+        s.starts_with("19+1 records in\n19+1 records out\n10000 bytes (10 kB, 9.8 KiB) copied,")
+    );
+
+    let result = new_ucmd!().pipe_in("0".repeat(100_000)).succeeds();
+    let s = result.stderr_str();
+    assert!(
+        s.starts_with("195+1 records in\n195+1 records out\n100000 bytes (100 kB, 98 KiB) copied,")
+    );
 }
 
 #[test]