diff --git a/src/uu/sort/BENCHMARKING.md b/src/uu/sort/BENCHMARKING.md index b20db014d..78c2e2b2d 100644 --- a/src/uu/sort/BENCHMARKING.md +++ b/src/uu/sort/BENCHMARKING.md @@ -9,25 +9,84 @@ list that we should improve / make sure not to regress. Run `cargo build --release` before benchmarking after you make a change! ## Sorting a wordlist -- Get a wordlist, for example with [words](https://en.wikipedia.org/wiki/Words_(Unix)) on Linux. The exact wordlist - doesn't matter for performance comparisons. In this example I'm using `/usr/share/dict/american-english` as the wordlist. -- Shuffle the wordlist by running `sort -R /usr/share/dict/american-english > shuffled_wordlist.txt`. -- Benchmark sorting the wordlist with hyperfine: `hyperfine "target/release/coreutils sort shuffled_wordlist.txt -o output.txt"`. + +- Get a wordlist, for example with [words]() on Linux. The exact wordlist + doesn't matter for performance comparisons. In this example I'm using `/usr/share/dict/american-english` as the wordlist. +- Shuffle the wordlist by running `sort -R /usr/share/dict/american-english > shuffled_wordlist.txt`. +- Benchmark sorting the wordlist with hyperfine: `hyperfine "target/release/coreutils sort shuffled_wordlist.txt -o output.txt"`. ## Sorting a wordlist with ignore_case -- Same wordlist as above -- Benchmark sorting the wordlist ignoring the case with hyperfine: `hyperfine "target/release/coreutils sort shuffled_wordlist.txt -f -o output.txt"`. + +- Same wordlist as above +- Benchmark sorting the wordlist ignoring the case with hyperfine: `hyperfine "target/release/coreutils sort shuffled_wordlist.txt -f -o output.txt"`. ## Sorting numbers -- Generate a list of numbers: `seq 0 100000 | sort -R > shuffled_numbers.txt`. -- Benchmark numeric sorting with hyperfine: `hyperfine "target/release/coreutils sort shuffled_numbers.txt -n -o output.txt"`. + +- Generate a list of numbers: `seq 0 100000 | sort -R > shuffled_numbers.txt`. +- Benchmark numeric sorting with hyperfine: `hyperfine "target/release/coreutils sort shuffled_numbers.txt -n -o output.txt"`. + +## Sorting numbers with -g + +- Same list of numbers as above. +- Benchmark numeric sorting with hyperfine: `hyperfine "target/release/coreutils sort shuffled_numbers.txt -g -o output.txt"`. + +## Sorting numbers with SI prefixes + +- Generate a list of numbers: +
+ Rust script + + ## Cargo.toml + + ```toml + [dependencies] + rand = "0.8.3" + ``` + + ## main.rs + + ```rust + use rand::prelude::*; + fn main() { + let suffixes = ['k', 'K', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y']; + let mut rng = thread_rng(); + for _ in 0..100000 { + println!( + "{}{}", + rng.gen_range(0..1000000), + suffixes.choose(&mut rng).unwrap() + ) + } + } + + ``` + + ## running + + `cargo run > shuffled_numbers_si.txt` + +
+ +- Benchmark numeric sorting with hyperfine: `hyperfine "target/release/coreutils sort shuffled_numbers_si.txt -h -o output.txt"`. ## Stdout and stdin performance + Try to run the above benchmarks by piping the input through stdin (standard input) and redirect the output through stdout (standard output): -- Remove the input file from the arguments and add `cat [inputfile] | ` at the beginning. -- Remove `-o output.txt` and add `> output.txt` at the end. + +- Remove the input file from the arguments and add `cat [inputfile] | ` at the beginning. +- Remove `-o output.txt` and add `> output.txt` at the end. Example: `hyperfine "target/release/coreutils sort shuffled_numbers.txt -n -o output.txt"` becomes `hyperfine "cat shuffled_numbers.txt | target/release/coreutils sort -n > output.txt` -- Check that performance is similar to the original benchmark. \ No newline at end of file + +- Check that performance is similar to the original benchmark. + +## Comparing with GNU sort + +Hyperfine accepts multiple commands to run and will compare them. To compare performance with GNU sort +duplicate the string you passed to hyperfine but remove the `target/release/coreutils` bit from it. + +Example: `hyperfine "target/release/coreutils sort shuffled_numbers_si.txt -h -o output.txt"` becomes +`hyperfine "target/release/coreutils sort shuffled_numbers_si.txt -h -o output.txt" "sort shuffled_numbers_si.txt -h -o output.txt"` +(This assumes GNU sort is installed as `sort`) diff --git a/src/uu/sort/src/numeric_str_cmp.rs b/src/uu/sort/src/numeric_str_cmp.rs new file mode 100644 index 000000000..a50734ebd --- /dev/null +++ b/src/uu/sort/src/numeric_str_cmp.rs @@ -0,0 +1,455 @@ +// * This file is part of the uutils coreutils package. +// * +// * (c) Michael Debertol +// * +// * For the full copyright and license information, please view the LICENSE +// * file that was distributed with this source code. + +//! Fast comparison for strings representing a base 10 number without precision loss. +//! +//! To be able to short-circuit when comparing, [NumInfo] must be passed along with each number +//! to [numeric_str_cmp]. [NumInfo] is generally obtained by calling [NumInfo::parse] and should be cached. +//! It is allowed to arbitrarily modify the exponent afterwards, which is equivalent to shifting the decimal point. +//! +//! More specifically, exponent can be understood so that the original number is in (1..10)*10^exponent. +//! From that follows the constraints of this algorithm: It is able to compare numbers in ±(1*10^[i64::MIN]..10*10^[i64::MAX]). + +use std::{cmp::Ordering, ops::Range}; + +#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)] +enum Sign { + Negative, + Positive, +} + +#[derive(Debug, PartialEq)] +pub struct NumInfo { + exponent: i64, + sign: Sign, +} + +pub struct NumInfoParseSettings { + pub accept_si_units: bool, + pub thousands_separator: Option, + pub decimal_pt: Option, +} + +impl Default for NumInfoParseSettings { + fn default() -> Self { + Self { + accept_si_units: false, + thousands_separator: None, + decimal_pt: Some('.'), + } + } +} + +impl NumInfo { + /// Parse NumInfo for this number. + /// Also returns the range of num that should be passed to numeric_str_cmp later + pub fn parse(num: &str, parse_settings: NumInfoParseSettings) -> (Self, Range) { + let mut exponent = -1; + let mut had_decimal_pt = false; + let mut had_digit = false; + let mut start = None; + let mut sign = Sign::Positive; + + let mut first_char = true; + + for (idx, char) in num.char_indices() { + if first_char && char.is_whitespace() { + continue; + } + + if first_char && char == '-' { + sign = Sign::Negative; + first_char = false; + continue; + } + first_char = false; + + if parse_settings + .thousands_separator + .map_or(false, |c| c == char) + { + continue; + } + + if Self::is_invalid_char(char, &mut had_decimal_pt, &parse_settings) { + let si_unit = if parse_settings.accept_si_units { + match char { + 'K' | 'k' => 3, + 'M' => 6, + 'G' => 9, + 'T' => 12, + 'P' => 15, + 'E' => 18, + 'Z' => 21, + 'Y' => 24, + _ => 0, + } + } else { + 0 + }; + return if let Some(start) = start { + ( + NumInfo { + exponent: exponent + si_unit, + sign, + }, + start..idx, + ) + } else { + ( + NumInfo { + sign: if had_digit { sign } else { Sign::Positive }, + exponent: 0, + }, + 0..0, + ) + }; + } + if Some(char) == parse_settings.decimal_pt { + continue; + } + had_digit = true; + if start.is_none() && char == '0' { + if had_decimal_pt { + // We're parsing a number whose first nonzero digit is after the decimal point. + exponent -= 1; + } else { + // Skip leading zeroes + continue; + } + } + if !had_decimal_pt { + exponent += 1; + } + if start.is_none() && char != '0' { + start = Some(idx); + } + } + if let Some(start) = start { + (NumInfo { exponent, sign }, start..num.len()) + } else { + ( + NumInfo { + sign: if had_digit { sign } else { Sign::Positive }, + exponent: 0, + }, + 0..0, + ) + } + } + + fn is_invalid_char( + c: char, + had_decimal_pt: &mut bool, + parse_settings: &NumInfoParseSettings, + ) -> bool { + if Some(c) == parse_settings.decimal_pt { + if *had_decimal_pt { + // this is a decimal pt but we already had one, so it is invalid + true + } else { + *had_decimal_pt = true; + false + } + } else { + !c.is_ascii_digit() + } + } +} + +/// compare two numbers as strings without parsing them as a number first. This should be more performant and can handle numbers more precisely. +/// NumInfo is needed to provide a fast path for most numbers. +pub fn numeric_str_cmp((a, a_info): (&str, &NumInfo), (b, b_info): (&str, &NumInfo)) -> Ordering { + // check for a difference in the sign + if a_info.sign != b_info.sign { + return a_info.sign.cmp(&b_info.sign); + } + + // check for a difference in the exponent + let ordering = if a_info.exponent != b_info.exponent && !a.is_empty() && !b.is_empty() { + a_info.exponent.cmp(&b_info.exponent) + } else { + // walk the characters from the front until we find a difference + let mut a_chars = a.chars().filter(|c| c.is_ascii_digit()); + let mut b_chars = b.chars().filter(|c| c.is_ascii_digit()); + loop { + let a_next = a_chars.next(); + let b_next = b_chars.next(); + match (a_next, b_next) { + (None, None) => break Ordering::Equal, + (Some(c), None) => { + break if c == '0' && a_chars.all(|c| c == '0') { + Ordering::Equal + } else { + Ordering::Greater + } + } + (None, Some(c)) => { + break if c == '0' && b_chars.all(|c| c == '0') { + Ordering::Equal + } else { + Ordering::Less + } + } + (Some(a_char), Some(b_char)) => { + let ord = a_char.cmp(&b_char); + if ord != Ordering::Equal { + break ord; + } + } + } + } + }; + + if a_info.sign == Sign::Negative { + ordering.reverse() + } else { + ordering + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn parses_exp() { + let n = "1"; + assert_eq!( + NumInfo::parse(n, Default::default()), + ( + NumInfo { + exponent: 0, + sign: Sign::Positive + }, + 0..1 + ) + ); + let n = "100"; + assert_eq!( + NumInfo::parse(n, Default::default()), + ( + NumInfo { + exponent: 2, + sign: Sign::Positive + }, + 0..3 + ) + ); + let n = "1,000"; + assert_eq!( + NumInfo::parse( + n, + NumInfoParseSettings { + thousands_separator: Some(','), + ..Default::default() + } + ), + ( + NumInfo { + exponent: 3, + sign: Sign::Positive + }, + 0..5 + ) + ); + let n = "1,000"; + assert_eq!( + NumInfo::parse(n, Default::default()), + ( + NumInfo { + exponent: 0, + sign: Sign::Positive + }, + 0..1 + ) + ); + let n = "1000.00"; + assert_eq!( + NumInfo::parse(n, Default::default()), + ( + NumInfo { + exponent: 3, + sign: Sign::Positive + }, + 0..7 + ) + ); + } + #[test] + fn parses_negative_exp() { + let n = "0.00005"; + assert_eq!( + NumInfo::parse(n, Default::default()), + ( + NumInfo { + exponent: -5, + sign: Sign::Positive + }, + 6..7 + ) + ); + let n = "00000.00005"; + assert_eq!( + NumInfo::parse(n, Default::default()), + ( + NumInfo { + exponent: -5, + sign: Sign::Positive + }, + 10..11 + ) + ); + } + + #[test] + fn parses_sign() { + let n = "5"; + assert_eq!( + NumInfo::parse(n, Default::default()), + ( + NumInfo { + exponent: 0, + sign: Sign::Positive + }, + 0..1 + ) + ); + let n = "-5"; + assert_eq!( + NumInfo::parse(n, Default::default()), + ( + NumInfo { + exponent: 0, + sign: Sign::Negative + }, + 1..2 + ) + ); + let n = " -5"; + assert_eq!( + NumInfo::parse(n, Default::default()), + ( + NumInfo { + exponent: 0, + sign: Sign::Negative + }, + 5..6 + ) + ); + } + + fn test_helper(a: &str, b: &str, expected: Ordering) { + let (a_info, a_range) = NumInfo::parse(a, Default::default()); + let (b_info, b_range) = NumInfo::parse(b, Default::default()); + let ordering = numeric_str_cmp( + (&a[a_range.to_owned()], &a_info), + (&b[b_range.to_owned()], &b_info), + ); + assert_eq!(ordering, expected); + let ordering = numeric_str_cmp((&b[b_range], &b_info), (&a[a_range], &a_info)); + assert_eq!(ordering, expected.reverse()); + } + #[test] + fn test_single_digit() { + test_helper("1", "2", Ordering::Less); + test_helper("0", "0", Ordering::Equal); + } + #[test] + fn test_minus() { + test_helper("-1", "-2", Ordering::Greater); + test_helper("-0", "-0", Ordering::Equal); + } + #[test] + fn test_different_len() { + test_helper("-20", "-100", Ordering::Greater); + test_helper("10.0", "2.000000", Ordering::Greater); + } + #[test] + fn test_decimal_digits() { + test_helper("20.1", "20.2", Ordering::Less); + test_helper("20.1", "20.15", Ordering::Less); + test_helper("-20.1", "+20.15", Ordering::Less); + test_helper("-20.1", "-20", Ordering::Less); + } + #[test] + fn test_trailing_zeroes() { + test_helper("20.00000", "20.1", Ordering::Less); + test_helper("20.00000", "20.0", Ordering::Equal); + } + #[test] + fn test_invalid_digits() { + test_helper("foo", "bar", Ordering::Equal); + test_helper("20.1", "a", Ordering::Greater); + test_helper("-20.1", "a", Ordering::Less); + test_helper("a", "0.15", Ordering::Less); + } + #[test] + fn test_multiple_decimal_pts() { + test_helper("10.0.0", "50.0.0", Ordering::Less); + test_helper("0.1.", "0.2.0", Ordering::Less); + test_helper("1.1.", "0", Ordering::Greater); + test_helper("1.1.", "-0", Ordering::Greater); + } + #[test] + fn test_leading_decimal_pts() { + test_helper(".0", ".0", Ordering::Equal); + test_helper(".1", ".0", Ordering::Greater); + test_helper(".02", "0", Ordering::Greater); + } + #[test] + fn test_leading_zeroes() { + test_helper("000000.0", ".0", Ordering::Equal); + test_helper("0.1", "0000000000000.0", Ordering::Greater); + test_helper("-01", "-2", Ordering::Greater); + } + + #[test] + fn minus_zero() { + // This matches GNU sort behavior. + test_helper("-0", "0", Ordering::Less); + test_helper("-0x", "0", Ordering::Less); + } + #[test] + fn double_minus() { + test_helper("--1", "0", Ordering::Equal); + } + #[test] + fn single_minus() { + let info = NumInfo::parse("-", Default::default()); + assert_eq!( + info, + ( + NumInfo { + exponent: 0, + sign: Sign::Positive + }, + 0..0 + ) + ); + } + #[test] + fn invalid_with_unit() { + let info = NumInfo::parse( + "-K", + NumInfoParseSettings { + accept_si_units: true, + ..Default::default() + }, + ); + assert_eq!( + info, + ( + NumInfo { + exponent: 0, + sign: Sign::Positive + }, + 0..0 + ) + ); + } +} diff --git a/src/uu/sort/src/sort.rs b/src/uu/sort/src/sort.rs index 8bf6eb1e8..c097861fc 100644 --- a/src/uu/sort/src/sort.rs +++ b/src/uu/sort/src/sort.rs @@ -15,9 +15,12 @@ #[macro_use] extern crate uucore; +mod numeric_str_cmp; + use clap::{App, Arg}; use fnv::FnvHasher; use itertools::Itertools; +use numeric_str_cmp::{numeric_str_cmp, NumInfo, NumInfoParseSettings}; use rand::distributions::Alphanumeric; use rand::{thread_rng, Rng}; use rayon::prelude::*; @@ -162,27 +165,71 @@ impl From<&GlobalSettings> for KeySettings { } /// Represents the string selected by a FieldSelector. -#[derive(Debug)] -enum Selection { +enum SelectionRange { /// If we had to transform this selection, we have to store a new string. String(String), /// If there was no transformation, we can store an index into the line. ByIndex(Range), } +impl SelectionRange { + /// Gets the actual string slice represented by this Selection. + fn get_str<'a>(&'a self, line: &'a str) -> &'a str { + match self { + SelectionRange::String(string) => string.as_str(), + SelectionRange::ByIndex(range) => &line[range.to_owned()], + } + } + + fn shorten(&mut self, new_range: Range) { + match self { + SelectionRange::String(string) => { + string.drain(new_range.end..); + string.drain(..new_range.start); + } + SelectionRange::ByIndex(range) => { + range.end = range.start + new_range.end; + range.start += new_range.start; + } + } + } +} + +enum NumCache { + AsF64(f64), + WithInfo(NumInfo), + None, +} + +impl NumCache { + fn as_f64(&self) -> f64 { + match self { + NumCache::AsF64(n) => *n, + _ => unreachable!(), + } + } + fn as_num_info(&self) -> &NumInfo { + match self { + NumCache::WithInfo(n) => n, + _ => unreachable!(), + } + } +} + +struct Selection { + range: SelectionRange, + num_cache: NumCache, +} + impl Selection { /// Gets the actual string slice represented by this Selection. fn get_str<'a>(&'a self, line: &'a Line) -> &'a str { - match self { - Selection::String(string) => string.as_str(), - Selection::ByIndex(range) => &line.line[range.to_owned()], - } + self.range.get_str(&line.line) } } type Field = Range; -#[derive(Debug)] struct Line { line: String, // The common case is not to specify fields. Let's make this fast. @@ -206,18 +253,38 @@ impl Line { .selectors .iter() .map(|selector| { - if let Some(range) = selector.get_selection(&line, fields.as_deref()) { - if let Some(transformed) = - transform(&line[range.to_owned()], &selector.settings) - { - Selection::String(transformed) + let mut range = + if let Some(range) = selector.get_selection(&line, fields.as_deref()) { + if let Some(transformed) = + transform(&line[range.to_owned()], &selector.settings) + { + SelectionRange::String(transformed) + } else { + SelectionRange::ByIndex(range.start().to_owned()..range.end() + 1) + } } else { - Selection::ByIndex(range.start().to_owned()..range.end() + 1) - } + // If there is no match, match the empty string. + SelectionRange::ByIndex(0..0) + }; + let num_cache = if selector.settings.mode == SortMode::Numeric + || selector.settings.mode == SortMode::HumanNumeric + { + let (info, num_range) = NumInfo::parse( + range.get_str(&line), + NumInfoParseSettings { + accept_si_units: selector.settings.mode == SortMode::HumanNumeric, + thousands_separator: Some(THOUSANDS_SEP), + decimal_pt: Some(DECIMAL_PT), + }, + ); + range.shorten(num_range); + NumCache::WithInfo(info) + } else if selector.settings.mode == SortMode::GeneralNumeric { + NumCache::AsF64(permissive_f64_parse(get_leading_gen(range.get_str(&line)))) } else { - // If there is no match, match the empty string. - Selection::ByIndex(0..0) - } + NumCache::None + }; + Selection { range, num_cache } }) .collect(); Self { line, selections } @@ -923,21 +990,28 @@ fn sort_by(lines: &mut Vec, settings: &GlobalSettings) { fn compare_by(a: &Line, b: &Line, global_settings: &GlobalSettings) -> Ordering { for (idx, selector) in global_settings.selectors.iter().enumerate() { - let a = a.selections[idx].get_str(a); - let b = b.selections[idx].get_str(b); + let a_selection = &a.selections[idx]; + let b_selection = &b.selections[idx]; + let a_str = a_selection.get_str(a); + let b_str = b_selection.get_str(b); let settings = &selector.settings; let cmp: Ordering = if settings.random { - random_shuffle(a, b, global_settings.salt.clone()) + random_shuffle(a_str, b_str, global_settings.salt.clone()) } else { - (match settings.mode { - SortMode::Numeric => numeric_compare, - SortMode::GeneralNumeric => general_numeric_compare, - SortMode::HumanNumeric => human_numeric_size_compare, - SortMode::Month => month_compare, - SortMode::Version => version_compare, - SortMode::Default => default_compare, - })(a, b) + match settings.mode { + SortMode::Numeric | SortMode::HumanNumeric => numeric_str_cmp( + (a_str, a_selection.num_cache.as_num_info()), + (b_str, b_selection.num_cache.as_num_info()), + ), + SortMode::GeneralNumeric => general_numeric_compare( + a_selection.num_cache.as_f64(), + b_selection.num_cache.as_f64(), + ), + SortMode::Month => month_compare(a_str, b_str), + SortMode::Version => version_compare(a_str, b_str), + SortMode::Default => default_compare(a_str, b_str), + } }; if cmp != Ordering::Equal { return if settings.reverse { cmp.reverse() } else { cmp }; @@ -945,7 +1019,6 @@ fn compare_by(a: &Line, b: &Line, global_settings: &GlobalSettings) -> Ordering } // Call "last resort compare" if all selectors returned Equal - let cmp = if global_settings.random || global_settings.stable || global_settings.unique { Ordering::Equal } else { @@ -997,34 +1070,6 @@ fn leading_num_common(a: &str) -> &str { s } -// This function cleans up the initial comparison done by leading_num_common for a numeric compare. -// GNU sort does its numeric comparison through strnumcmp. However, we don't have or -// may not want to use libc. Instead we emulate the GNU sort numeric compare by ignoring -// those leading number lines GNU sort would not recognize. GNU numeric compare would -// not recognize a positive sign or scientific/E notation so we strip those elements here. -fn get_leading_num(a: &str) -> &str { - let mut s = ""; - - let a = leading_num_common(a); - - // GNU numeric sort doesn't recognize '+' or 'e' notation so we strip - for (idx, c) in a.char_indices() { - if c.eq(&'e') || c.eq(&'E') || a.chars().next().unwrap_or('\0').eq(&POSITIVE) { - s = &a[..idx]; - break; - } - // If no further processing needed to be done, return the line as-is to be sorted - s = &a; - } - - // And empty number or non-number lines are to be treated as ‘0’ but only for numeric sort - // All '0'-ed lines will be sorted later, but only amongst themselves, during the so-called 'last resort comparison.' - if s.is_empty() { - s = "0"; - }; - s -} - // This function cleans up the initial comparison done by leading_num_common for a general numeric compare. // In contrast to numeric compare, GNU general numeric/FP sort *should* recognize positive signs and // scientific notation, so we strip those lines only after the end of the following numeric string. @@ -1054,17 +1099,6 @@ fn get_leading_gen(a: &str) -> &str { result } -#[inline(always)] -fn remove_thousands_sep<'a, S: Into>>(input: S) -> Cow<'a, str> { - let input = input.into(); - if input.contains(THOUSANDS_SEP) { - let output = input.replace(THOUSANDS_SEP, ""); - Cow::Owned(output) - } else { - input - } -} - #[inline(always)] fn remove_trailing_dec<'a, S: Into>>(input: S) -> Cow<'a, str> { let input = input.into(); @@ -1093,87 +1127,15 @@ fn permissive_f64_parse(a: &str) -> f64 { } } -fn numeric_compare(a: &str, b: &str) -> Ordering { - #![allow(clippy::comparison_chain)] - - let sa = get_leading_num(a); - let sb = get_leading_num(b); - - // Avoids a string alloc for every line to remove thousands seperators here - // instead of inside the get_leading_num function, which is a HUGE performance benefit - let ta = remove_thousands_sep(sa); - let tb = remove_thousands_sep(sb); - - let fa = permissive_f64_parse(&ta); - let fb = permissive_f64_parse(&tb); - - // f64::cmp isn't implemented (due to NaN issues); implement directly instead - if fa > fb { - Ordering::Greater - } else if fa < fb { - Ordering::Less - } else { - Ordering::Equal - } -} - /// Compares two floats, with errors and non-numerics assumed to be -inf. /// Stops coercing at the first non-numeric char. -fn general_numeric_compare(a: &str, b: &str) -> Ordering { +/// We explicitly need to convert to f64 in this case. +fn general_numeric_compare(a: f64, b: f64) -> Ordering { #![allow(clippy::comparison_chain)] - - let sa = get_leading_gen(a); - let sb = get_leading_gen(b); - - let fa = permissive_f64_parse(&sa); - let fb = permissive_f64_parse(&sb); - // f64::cmp isn't implemented (due to NaN issues); implement directly instead - if fa > fb { + if a > b { Ordering::Greater - } else if fa < fb { - Ordering::Less - } else { - Ordering::Equal - } -} - -// GNU/BSD does not handle converting numbers to an equal scale -// properly. GNU/BSD simply recognize that there is a human scale and sorts -// those numbers ahead of other number inputs. There are perhaps limits -// to the type of behavior we should emulate, and this might be such a limit. -// Properly handling these units seems like a value add to me. And when sorting -// these types of numbers, we rarely care about pure performance. -fn human_numeric_convert(a: &str) -> f64 { - let num_str = get_leading_num(a); - let suffix = a.trim_start_matches(&num_str); - let num_part = permissive_f64_parse(&num_str); - let suffix: f64 = match suffix.parse().unwrap_or('\0') { - // SI Units - 'K' => 1E3, - 'M' => 1E6, - 'G' => 1E9, - 'T' => 1E12, - 'P' => 1E15, - 'E' => 1E18, - 'Z' => 1E21, - 'Y' => 1E24, - _ => 1f64, - }; - num_part * suffix -} - -/// Compare two strings as if they are human readable sizes. -/// AKA 1M > 100k -fn human_numeric_size_compare(a: &str, b: &str) -> Ordering { - #![allow(clippy::comparison_chain)] - let fa = human_numeric_convert(a); - let fb = human_numeric_convert(b); - - // f64::cmp isn't implemented (due to NaN issues); implement directly instead - if fa > fb { - Ordering::Greater - } else if fa < fb { + } else if a < b { Ordering::Less } else { Ordering::Equal @@ -1373,30 +1335,6 @@ mod tests { assert_eq!(Ordering::Less, default_compare(a, b)); } - #[test] - fn test_numeric_compare1() { - let a = "149:7"; - let b = "150:5"; - - assert_eq!(Ordering::Less, numeric_compare(a, b)); - } - - #[test] - fn test_numeric_compare2() { - let a = "-1.02"; - let b = "1"; - - assert_eq!(Ordering::Less, numeric_compare(a, b)); - } - - #[test] - fn test_human_numeric_compare() { - let a = "300K"; - let b = "1M"; - - assert_eq!(Ordering::Less, human_numeric_size_compare(a, b)); - } - #[test] fn test_month_compare() { let a = "JaN"; diff --git a/tests/by-util/test_sort.rs b/tests/by-util/test_sort.rs index c3b8870b9..aacc34eb0 100644 --- a/tests/by-util/test_sort.rs +++ b/tests/by-util/test_sort.rs @@ -38,11 +38,7 @@ fn test_multiple_decimals_general() { #[test] fn test_multiple_decimals_numeric() { - new_ucmd!() - .arg("-n") - .arg("multiple_decimals_numeric.txt") - .succeeds() - .stdout_is("-2028789030\n-896689\n-8.90880\n-1\n-.05\n\n\n\n\n\n\n\n\n000\nCARAvan\n00000001\n1\n1.040000000\n1.444\n1.58590\n8.013\n45\n46.89\n 4567.\n4567.1\n4567.34\n\t\t\t\t\t\t\t\t\t\t4567..457\n\t\t\t\t37800\n\t\t\t\t\t\t45670.89079.098\n\t\t\t\t\t\t45670.89079.1\n576,446.88800000\n576,446.890\n4798908.340000000000\n4798908.45\n4798908.8909800\n"); + test_helper("multiple_decimals_numeric", "-n") } #[test] diff --git a/tests/fixtures/sort/multiple_decimals_numeric.expected b/tests/fixtures/sort/multiple_decimals_numeric.expected new file mode 100644 index 000000000..3ef4d22e8 --- /dev/null +++ b/tests/fixtures/sort/multiple_decimals_numeric.expected @@ -0,0 +1,35 @@ +-2028789030 +-896689 +-8.90880 +-1 +-.05 + + + + + + + + +000 +CARAvan +00000001 +1 +1.040000000 +1.444 +1.58590 +8.013 +45 +46.89 + 4567..457 + 4567. +4567.1 +4567.34 + 37800 + 45670.89079.098 + 45670.89079.1 +576,446.88800000 +576,446.890 +4798908.340000000000 +4798908.45 +4798908.8909800