sort: use "parse_size" from uucore

* make parsing of SIZE argument consistent with GNU's behavior
* add error handling
* add tests
2025-09-15 19:36:16 +00:00 · 2021-06-02 04:16:41 +02:00 · 2021-06-02 04:16:41 +02:00 · 6b8de1dd8b
commit 6b8de1dd8b
parent a900c7421a
2 changed files with 115 additions and 30 deletions
--- a/src/uu/sort/src/sort.rs
+++ b/src/uu/sort/src/sort.rs
@ -43,6 +43,7 @@ use std::ops::Range;
 use std::path::Path;
 use std::path::PathBuf;
 use unicode_width::UnicodeWidthStr;
+use uucore::parse_size::{parse_size, ParseSizeError};
 use uucore::InvalidEncodingHandling;

 static NAME: &str = "sort";
@ -159,32 +160,31 @@ pub struct GlobalSettings {
 }

 impl GlobalSettings {
-    /// Interpret this `&str` as a number with an optional trailing si unit.
-    ///
-    /// If there is no trailing si unit, the implicit unit is K.
-    /// The suffix B causes the number to be interpreted as a byte count.
-    fn parse_byte_count(input: &str) -> usize {
-        const SI_UNITS: &[char] = &['B', 'K', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y'];
+    /// Parse a SIZE string into a number of bytes.
+    /// A size string comprises an integer and an optional unit.
+    /// The unit may be k, K, m, M, g, G, t, T, P, E, Z, Y (powers of 1024), or b which is 1.
+    /// Default is K.
+    fn parse_byte_count(input: &str) -> Result<usize, ParseSizeError> {
+        // GNU sort (8.32)   valid: 1b,        k, K, m, M, g, G, t, T, P, E, Z, Y
+        // GNU sort (8.32) invalid:  b, B, 1B,                         p, e, z, y
+        const ALLOW_LIST: &[char] = &[
+            'b', 'k', 'K', 'm', 'M', 'g', 'G', 't', 'T', 'P', 'E', 'Z', 'Y',
+        ];
+        let mut size_string = input.trim().to_string();

-        let input = input.trim();
-
-        let (num_str, si_unit) =
-            if input.ends_with(|c: char| SI_UNITS.contains(&c.to_ascii_uppercase())) {
-                let mut chars = input.chars();
-                let si_suffix = chars.next_back().unwrap().to_ascii_uppercase();
-                let si_unit = SI_UNITS.iter().position(|&c| c == si_suffix).unwrap();
-                let num_str = chars.as_str();
-                (num_str, si_unit)
+        if size_string.ends_with(|c: char| ALLOW_LIST.contains(&c))
+            || size_string.ends_with(|c: char| c.is_digit(10))
+        {
+            // b 1, K 1024 (default)
+            if size_string.ends_with(|c: char| c.is_digit(10)) {
+                size_string.push('K');
+            } else if size_string.ends_with('b') {
+                size_string.pop();
+            }
+            parse_size(&size_string)
        } else {
-                (input, 1)
-            };
-
-        let num_usize: usize = num_str
-            .trim()
-            .parse()
-            .unwrap_or_else(|e| crash!(1, "failed to parse buffer size `{}`: {}", num_str, e));
-
-        num_usize.saturating_mul(1000usize.saturating_pow(si_unit as u32))
+            Err(ParseSizeError::ParseFailure("invalid suffix".to_string()))
+        }
    }

    fn out_writer(&self) -> BufWriter<Box<dyn Write>> {
@ -1148,7 +1148,11 @@ pub fn uumain(args: impl uucore::Args) -> i32 {

    settings.buffer_size = matches
        .value_of(OPT_BUF_SIZE)
-        .map(GlobalSettings::parse_byte_count)
+        .map(|v| match GlobalSettings::parse_byte_count(v) {
+            Ok(n) => n,
+            Err(ParseSizeError::ParseFailure(_)) => crash!(2, "invalid -S argument '{}'", v),
+            Err(ParseSizeError::SizeTooBig(_)) => crash!(2, "-S argument '{}' too large", v),
+        })
        .unwrap_or(DEFAULT_BUF_SIZE);

    settings.tmp_dir = matches
@ -1640,4 +1644,48 @@ mod tests {
        // How big is a selection? Constant cost all lines pay when we need selections.
        assert_eq!(std::mem::size_of::<Selection>(), 24);
    }
+
+    #[test]
+    fn test_parse_byte_count() {
+        let valid_input = [
+            ("0", 0),
+            ("50K", 50 * 1024),
+            ("50k", 50 * 1024),
+            ("1M", 1024 * 1024),
+            ("100M", 100 * 1024 * 1024),
+            #[cfg(not(target_pointer_width = "32"))]
+            ("1000G", 1000 * 1024 * 1024 * 1024),
+            #[cfg(not(target_pointer_width = "32"))]
+            ("10T", 10 * 1024 * 1024 * 1024 * 1024),
+            ("1b", 1),
+            ("1024b", 1024),
+            ("1024Mb", 1024 * 1024 * 1024), // TODO: This might not be what GNU `sort` does?
+            ("1", 1024),                    // K is default
+            ("50", 50 * 1024),
+            ("K", 1024),
+            ("k", 1024),
+            ("m", 1024 * 1024),
+            #[cfg(not(target_pointer_width = "32"))]
+            ("E", 1024 * 1024 * 1024 * 1024 * 1024 * 1024),
+        ];
+        for (input, expected_output) in &valid_input {
+            assert_eq!(
+                GlobalSettings::parse_byte_count(input),
+                Ok(*expected_output)
+            );
+        }
+
+        // SizeTooBig
+        let invalid_input = ["500E", "1Y"];
+        for input in &invalid_input {
+            #[cfg(not(target_pointer_width = "128"))]
+            assert!(GlobalSettings::parse_byte_count(input).is_err());
+        }
+
+        // ParseFailure
+        let invalid_input = ["nonsense", "1B", "B", "b", "p", "e", "z", "y"];
+        for input in &invalid_input {
+            assert!(GlobalSettings::parse_byte_count(input).is_err());
+        }
+    }
 }
--- a/tests/by-util/test_sort.rs
+++ b/tests/by-util/test_sort.rs
@ -21,9 +21,7 @@ fn test_helper(file_name: &str, possible_args: &[&str]) {

 #[test]
 fn test_buffer_sizes() {
-    let buffer_sizes = [
-        "0", "50K", "50k", "1M", "100M", "1000G", "10T", "500E", "1Y",
-    ];
+    let buffer_sizes = ["0", "50K", "50k", "1M", "100M"];
    for buffer_size in &buffer_sizes {
        new_ucmd!()
            .arg("-n")
@ -32,6 +30,20 @@ fn test_buffer_sizes() {
            .arg("ext_sort.txt")
            .succeeds()
            .stdout_is_fixture("ext_sort.expected");
+
+        #[cfg(not(target_pointer_width = "32"))]
+        {
+            let buffer_sizes = ["1000G", "10T"];
+            for buffer_size in &buffer_sizes {
+                new_ucmd!()
+                    .arg("-n")
+                    .arg("-S")
+                    .arg(buffer_size)
+                    .arg("ext_sort.txt")
+                    .succeeds()
+                    .stdout_is_fixture("ext_sort.expected");
+            }
+        }
    }
 }

@ -43,11 +55,36 @@ fn test_invalid_buffer_size() {
            .arg("-S")
            .arg(invalid_buffer_size)
            .fails()
+            .code_is(2)
            .stderr_only(format!(
-                "sort: failed to parse buffer size `{}`: invalid digit found in string",
+                "sort: invalid -S argument '{}'",
                invalid_buffer_size
            ));
    }
+    #[cfg(not(target_pointer_width = "128"))]
+    new_ucmd!()
+        .arg("-n")
+        .arg("-S")
+        .arg("1Y")
+        .arg("ext_sort.txt")
+        .fails()
+        .code_is(2)
+        .stderr_only("sort: -S argument '1Y' too large");
+
+    #[cfg(target_pointer_width = "32")]
+    {
+        let buffer_sizes = ["1000G", "10T"];
+        for buffer_size in &buffer_sizes {
+            new_ucmd!()
+                .arg("-n")
+                .arg("-S")
+                .arg(buffer_size)
+                .arg("ext_sort.txt")
+                .fails()
+                .code_is(2)
+                .stderr_only(format!("sort: -S argument '{}' too large", buffer_size));
+        }
+    }
 }

 #[test]