diff --git a/Cargo.lock b/Cargo.lock index c950fb58e..07a99069c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1317,6 +1317,12 @@ dependencies = [ "maybe-uninit", ] +[[package]] +name = "static_assertions" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" + [[package]] name = "strsim" version = "0.8.0" @@ -1443,6 +1449,17 @@ dependencies = [ "serde_json", ] +[[package]] +name = "twox-hash" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04f8ab788026715fa63b31960869617cba39117e520eb415b0139543e325ab59" +dependencies = [ + "cfg-if 0.1.10", + "rand 0.7.3", + "static_assertions", +] + [[package]] name = "typenum" version = "1.13.0" @@ -2200,7 +2217,9 @@ version = "0.0.4" dependencies = [ "clap", "itertools 0.8.2", + "rand 0.7.3", "semver", + "twox-hash", "uucore", "uucore_procs", ] diff --git a/src/uu/sort/Cargo.toml b/src/uu/sort/Cargo.toml index 5158f6e52..e50caf53b 100644 --- a/src/uu/sort/Cargo.toml +++ b/src/uu/sort/Cargo.toml @@ -15,7 +15,9 @@ edition = "2018" path = "src/sort.rs" [dependencies] +rand = "0.7" clap = "2.33" +twox-hash = "1.6.0" itertools = "0.8.0" semver = "0.9.0" uucore = { version=">=0.0.7", package="uucore", path="../../uucore", features=["fs"] } diff --git a/src/uu/sort/src/sort.rs b/src/uu/sort/src/sort.rs index 8e79ff947..f8789835a 100644 --- a/src/uu/sort/src/sort.rs +++ b/src/uu/sort/src/sort.rs @@ -1,6 +1,7 @@ // * This file is part of the uutils coreutils package. // * // * (c) Michael Yin +// * (c) Robert Swinford // * // * For the full copyright and license information, please view the LICENSE // * file that was distributed with this source code. 
@@ -12,13 +13,17 @@ extern crate uucore; use clap::{App, Arg}; use itertools::Itertools; +use rand::distributions::Alphanumeric; +use rand::{thread_rng, Rng}; use semver::Version; use std::cmp::Ordering; use std::collections::BinaryHeap; use std::fs::File; +use std::hash::{Hash, Hasher}; use std::io::{stdin, stdout, BufRead, BufReader, BufWriter, Lines, Read, Write}; use std::mem::replace; use std::path::Path; +use twox_hash::XxHash64; use uucore::fs::is_stdin_interactive; // for Iterator::dedup() static NAME: &str = "sort"; @@ -34,16 +39,18 @@ static OPT_DICTIONARY_ORDER: &str = "dictionary-order"; static OPT_MERGE: &str = "merge"; static OPT_CHECK: &str = "check"; static OPT_IGNORE_CASE: &str = "ignore-case"; +static OPT_IGNORE_BLANKS: &str = "ignore-blanks"; static OPT_OUTPUT: &str = "output"; static OPT_REVERSE: &str = "reverse"; static OPT_STABLE: &str = "stable"; static OPT_UNIQUE: &str = "unique"; +static OPT_RANDOM: &str = "random-sort"; static ARG_FILES: &str = "files"; static DECIMAL_PT: char = '.'; static THOUSANDS_SEP: char = ','; - +#[derive(Eq, Ord, PartialEq, PartialOrd)] enum SortMode { Numeric, HumanNumeric, @@ -60,8 +67,10 @@ struct Settings { stable: bool, unique: bool, check: bool, + random: bool, compare_fns: Vec Ordering>, transform_fns: Vec String>, + salt: String, } impl Default for Settings { @@ -74,8 +83,10 @@ impl Default for Settings { stable: false, unique: false, check: false, + random: false, compare_fns: Vec::new(), transform_fns: Vec::new(), + salt: String::new(), } } } @@ -155,17 +166,14 @@ impl<'a> Iterator for FileMerger<'a> { } } } + fn get_usage() -> String { format!( "{0} {1} - Usage: {0} [OPTION]... [FILE]... - Write the sorted concatenation of all FILE(s) to standard output. - Mandatory arguments for long options are mandatory for short options too. 
- With no FILE, or when FILE is -, read standard input.", NAME, VERSION ) @@ -228,6 +236,12 @@ pub fn uumain(args: impl uucore::Args) -> i32 { .long(OPT_IGNORE_CASE) .help("fold lower case to upper case characters"), ) + .arg( + Arg::with_name(OPT_IGNORE_BLANKS) + .short("b") + .long(OPT_IGNORE_BLANKS) + .help("ignore leading blanks when finding sort keys in each line"), + ) .arg( Arg::with_name(OPT_OUTPUT) .short("o") @@ -236,6 +250,12 @@ pub fn uumain(args: impl uucore::Args) -> i32 { .takes_value(true) .value_name("FILENAME"), ) + .arg( + Arg::with_name(OPT_RANDOM) + .short("R") + .long(OPT_RANDOM) + .help("shuffle in random order"), + ) .arg( Arg::with_name(OPT_REVERSE) .short("r") @@ -285,11 +305,20 @@ pub fn uumain(args: impl uucore::Args) -> i32 { settings.transform_fns.push(|s| s.to_uppercase()); } + if matches.is_present(OPT_IGNORE_BLANKS) { + settings.transform_fns.push(|s| s.trim_start().to_string()); + } + settings.outfile = matches.value_of(OPT_OUTPUT).map(String::from); settings.reverse = matches.is_present(OPT_REVERSE); settings.stable = matches.is_present(OPT_STABLE); settings.unique = matches.is_present(OPT_UNIQUE); + if matches.is_present(OPT_RANDOM) { + settings.random = matches.is_present(OPT_RANDOM); + settings.salt = get_rand_string(); + } + //let mut files = matches.free; if files.is_empty() { /* if no file, default to stdin */ @@ -313,10 +342,10 @@ pub fn uumain(args: impl uucore::Args) -> i32 { } } - exec(files, &settings) + exec(files, &mut settings) } -fn exec(files: Vec, settings: &Settings) -> i32 { +fn exec(files: Vec, settings: &mut Settings) -> i32 { let mut lines = Vec::new(); let mut file_merger = FileMerger::new(&settings); @@ -351,6 +380,13 @@ fn exec(files: Vec, settings: &Settings) -> i32 { } else { print_sorted(file_merger, &settings.outfile) } + } else if settings.unique && settings.mode == SortMode::Numeric { + print_sorted( + lines + .iter() + .dedup_by(|a, b| num_sort_dedup(a) == num_sort_dedup(b)), + &settings.outfile, + 
) } else if settings.unique { print_sorted(lines.iter().dedup(), &settings.outfile) } else { @@ -419,7 +455,11 @@ fn compare_by(a: &str, b: &str, settings: &Settings) -> Ordering { }; for compare_fn in &settings.compare_fns { - let cmp = compare_fn(a, b); + let cmp: Ordering = if settings.random { + random_shuffle(a, b, settings.salt.clone()) + } else { + compare_fn(a, b) + }; if cmp != Ordering::Equal { if settings.reverse { return cmp.reverse(); @@ -431,36 +471,60 @@ fn compare_by(a: &str, b: &str, settings: &Settings) -> Ordering { Ordering::Equal } -/// Parse the beginning string into an f64, returning -inf instead of NaN on errors. -fn permissive_f64_parse(a: &str) -> f64 { - // Maybe should be split on non-digit, but then 10e100 won't parse properly. - // On the flip side, this will give NEG_INFINITY for "1,234", which might be OK - // because there's no way to handle both CSV and thousands separators without a new flag. - // GNU sort treats "1,234" as "1" in numeric, so maybe it's fine. - // GNU sort treats "NaN" as non-number in numeric, so it needs special care. - match a.split_whitespace().next() { - None => std::f64::NEG_INFINITY, - Some(sa) => match sa.parse::() { - Ok(a) if a.is_nan() => std::f64::NEG_INFINITY, - Ok(a) => a, - Err(_) => std::f64::NEG_INFINITY, - }, - } -} - fn default_compare(a: &str, b: &str) -> Ordering { a.cmp(b) } -/// Compares two floating point numbers, with errors being assumed to be -inf. -/// Stops coercing at the first whitespace char, so 1e2 will parse as 100 but -/// 1,000 will parse as -inf. 
/// Returns the leading numeric-looking portion of `a`.
///
/// Surrounding whitespace is trimmed first; the result then runs up to (but
/// not including) the first character that is not a numeric character, `-`,
/// `.` or `,`. An input with no such prefix yields the empty string.
///
/// The scan is performed on the trimmed text itself, so leading tabs or
/// other whitespace can no longer hide the stop character, and an interior
/// blank ends the number (`"12 abc"` gives `"12"`, not `"12 "`).
///
/// Note: the slice is cut from the trimmed input, so its length is not a
/// byte offset into `a` when `a` starts with whitespace.
fn get_leading_number(a: &str) -> &str {
    let trimmed = a.trim();
    let end = trimmed
        .find(|c: char| !(c.is_numeric() || c == '-' || c == '.' || c == ','))
        .unwrap_or(trimmed.len());
    &trimmed[..end]
}

/// Produces the key used to drop duplicate lines for `sort -n -u`.
///
/// Matches GNU behavior, see:
/// https://www.gnu.org/software/coreutils/manual/html_node/sort-invocation.html
/// Specifically *not* the same as `sort -n | uniq`.
///
/// Lines that are empty or do not start with a number all share the key
/// `"0"`. Every other line is keyed by its leading number, so negative
/// values (`"-1"`) and values with a bare decimal point (`".5"`) keep their
/// own key instead of being folded into `"0"`.
///
/// Keys are compared as text by the caller, so numerically equal spellings
/// such as `"1"` and `"1.0"` still produce different keys.
fn num_sort_dedup(a: &str) -> &str {
    let number = get_leading_number(a);
    if number.chars().any(char::is_numeric) {
        number
    } else {
        // Empty lines and lines without a leading number are all dumped
        // into the same bucket.
        "0"
    }
}

/// Parses `a` as an f64, returning -inf instead of NaN or on errors.
///
/// `,` is treated as a thousands separator and removed before parsing, so
/// the text accepted by `get_leading_number` (which lets `,` through) can
/// actually be parsed: `"1,234"` is `1234.0`.
fn permissive_f64_parse(a: &str) -> f64 {
    // Only allocate when there is a separator to strip.
    let parsed = if a.contains(',') {
        a.replace(',', "").parse::<f64>()
    } else {
        a.parse::<f64>()
    };

    // GNU sort treats "NaN" as non-number in numeric, so it needs special care.
    match parsed {
        Ok(value) if !value.is_nan() => value,
        _ => std::f64::NEG_INFINITY,
    }
}

// Description of `numeric_compare`, whose definition follows this block:
// Compares two floats, with errors and non-numerics assumed to be -inf.
// Stops coercing at the first non-numeric char.
fn numeric_compare(a: &str, b: &str) -> Ordering { #![allow(clippy::comparison_chain)] - let fa = permissive_f64_parse(a); - let fb = permissive_f64_parse(b); - // f64::cmp isn't implemented because NaN messes with it - // but we sidestep that with permissive_f64_parse so just fake it + + let sa = get_leading_number(a); + let sb = get_leading_number(b); + + let fa = permissive_f64_parse(sa); + let fb = permissive_f64_parse(sb); + + // f64::cmp isn't implemented (due to NaN issues); implement directly instead if fa > fb { Ordering::Greater } else if fa < fb { @@ -471,10 +535,10 @@ fn numeric_compare(a: &str, b: &str) -> Ordering { } fn human_numeric_convert(a: &str) -> f64 { - let int_str: String = a.chars().take_while(|c| c.is_numeric()).collect(); - let suffix = a.chars().find(|c| !c.is_numeric()); - let int_part = int_str.parse::().unwrap_or(-1f64) as f64; - let suffix: f64 = match suffix.unwrap_or('\0') { + let int_str = get_leading_number(a); + let (_, s) = a.split_at(int_str.len()); + let int_part = permissive_f64_parse(int_str); + let suffix: f64 = match s.parse().unwrap_or('\0') { 'K' => 1000f64, 'M' => 1E6, 'G' => 1E9, @@ -501,6 +565,30 @@ fn human_numeric_size_compare(a: &str, b: &str) -> Ordering { } } +fn random_shuffle(a: &str, b: &str, salt: String) -> Ordering { + #![allow(clippy::comparison_chain)] + let salt_slice = salt.as_str(); + + let da = hash(&[a, salt_slice].concat()); + let db = hash(&[b, salt_slice].concat()); + + da.cmp(&db) +} + +fn get_rand_string() -> String { + thread_rng() + .sample_iter(&Alphanumeric) + .take(16) + .map(char::from) + .collect::() +} + +fn hash(t: &T) -> u64 { + let mut s: XxHash64 = Default::default(); + t.hash(&mut s); + s.finish() +} + #[derive(Eq, Ord, PartialEq, PartialOrd)] enum Month { Unknown, @@ -606,3 +694,65 @@ fn open(path: &str) -> Option<(Box, bool)> { } } } + +#[cfg(test)] +mod tests { + + use super::*; + + #[test] + fn test_default_compare() { + let a = "your own"; + let b = "your place"; + + 
assert_eq!(Ordering::Less, default_compare(a, b)); + } + + #[test] + fn test_numeric_compare1() { + let a = "149:7"; + let b = "150:5"; + + assert_eq!(Ordering::Less, numeric_compare(a, b)); + } + + #[test] + fn test_numeric_compare2() { + let a = "-1.02"; + let b = "1"; + + assert_eq!(Ordering::Less, numeric_compare(a, b)); + } + + #[test] + fn test_human_numeric_compare() { + let a = "300K"; + let b = "1M"; + + assert_eq!(Ordering::Less, human_numeric_size_compare(a, b)); + } + + #[test] + fn test_month_compare() { + let a = "JaN"; + let b = "OCt"; + + assert_eq!(Ordering::Less, month_compare(a, b)); + } + #[test] + fn test_version_compare() { + let a = "1.2.3-alpha2"; + let b = "1.4.0"; + + assert_eq!(Ordering::Less, version_compare(a, b)); + } + + #[test] + fn test_random_compare() { + let a = "9"; + let b = "9"; + let c = get_rand_string(); + + assert_eq!(Ordering::Equal, random_shuffle(a, b, c)); + } +} diff --git a/tests/by-util/test_sort.rs b/tests/by-util/test_sort.rs index 9ff1b3522..2bac71def 100644 --- a/tests/by-util/test_sort.rs +++ b/tests/by-util/test_sort.rs @@ -2,22 +2,43 @@ use crate::common::util::*; #[test] fn test_numeric_floats_and_ints() { - test_helper("numeric_floats_and_ints", "-n"); + for numeric_sort_param in vec!["-n", "--numeric-sort"] { + let input = "1.444\n8.013\n1\n-8\n1.04\n-1"; + new_ucmd!() + .arg(numeric_sort_param) + .pipe_in(input) + .succeeds() + .stdout_only("-8\n-1\n1\n1.04\n1.444\n8.013\n"); + } } #[test] fn test_numeric_floats() { - test_helper("numeric_floats", "-n"); + for numeric_sort_param in vec!["-n", "--numeric-sort"] { + let input = "1.444\n8.013\n1.58590\n-8.90880\n1.040000000\n-.05"; + new_ucmd!() + .arg(numeric_sort_param) + .pipe_in(input) + .succeeds() + .stdout_only("-8.90880\n-.05\n1.040000000\n1.444\n1.58590\n8.013\n"); + } } #[test] fn test_numeric_floats_with_nan() { - test_helper("numeric_floats_with_nan", "-n"); + for numeric_sort_param in vec!["-n", "--numeric-sort"] { + let input = 
"1.444\n1.0/0.0\n1.58590\n-8.90880\n1.040000000\n-.05"; + new_ucmd!() + .arg(numeric_sort_param) + .pipe_in(input) + .succeeds() + .stdout_only("-8.90880\n-.05\n1.0/0.0\n1.040000000\n1.444\n1.58590\n"); + } } #[test] fn test_numeric_unfixed_floats() { - test_helper("numeric_unfixed_floats", "-n"); + test_helper("numeric_fixed_floats", "-n"); } #[test] @@ -32,12 +53,26 @@ fn test_numeric_unsorted_ints() { #[test] fn test_human_block_sizes() { - test_helper("human_block_sizes", "-h"); + for human_numeric_sort_param in vec!["-h", "--human-numeric-sort"] { + let input = "8981K\n909991M\n-8T\n21G\n0.8M"; + new_ucmd!() + .arg(human_numeric_sort_param) + .pipe_in(input) + .succeeds() + .stdout_only("-8T\n0.8M\n8981K\n21G\n909991M\n"); + } } #[test] fn test_month_default() { - test_helper("month_default", "-M"); + for month_sort_param in vec!["-M", "--month-sort"] { + let input = "JAn\nMAY\n000may\nJun\nFeb"; + new_ucmd!() + .arg(month_sort_param) + .pipe_in(input) + .succeeds() + .stdout_only("000may\nJAn\nFeb\nMAY\nJun\n"); + } } #[test] @@ -47,12 +82,23 @@ fn test_month_stable() { #[test] fn test_default_unsorted_ints() { - test_helper("default_unsorted_ints", ""); + let input = "9\n1909888\n000\n1\n2"; + new_ucmd!() + .pipe_in(input) + .succeeds() + .stdout_only("000\n1\n1909888\n2\n9\n"); } #[test] fn test_numeric_unique_ints() { - test_helper("numeric_unsorted_ints_unique", "-nu"); + for numeric_unique_sort_param in vec!["-nu"] { + let input = "9\n9\n8\n1\n"; + new_ucmd!() + .arg(numeric_unique_sort_param) + .pipe_in(input) + .succeeds() + .stdout_only("1\n8\n9\n"); + } } #[test]