1
Fork 0
mirror of https://github.com/RGBCube/uutils-coreutils synced 2025-07-28 11:37:44 +00:00

Sort: Various fixes and performance improvements (#2057)

* Various fixes and performance improvements

* fix a typo

Co-authored-by: Michael Debertol <michael.debertol@gmail.com>

Co-authored-by: Sylvestre Ledru <sledru@mozilla.com>
Co-authored-by: Michael Debertol <michael.debertol@gmail.com>
This commit is contained in:
electricboogie 2021-04-10 04:56:20 -05:00 committed by GitHub
parent ee070028e4
commit e5113ad00e
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
8 changed files with 242 additions and 44 deletions

View file

@ -22,6 +22,7 @@ use rand::distributions::Alphanumeric;
use rand::{thread_rng, Rng}; use rand::{thread_rng, Rng};
use rayon::prelude::*; use rayon::prelude::*;
use semver::Version; use semver::Version;
use std::borrow::Cow;
use std::cmp::Ordering; use std::cmp::Ordering;
use std::collections::BinaryHeap; use std::collections::BinaryHeap;
use std::env; use std::env;
@ -262,7 +263,7 @@ pub fn uumain(args: impl uucore::Args) -> i32 {
Arg::with_name(OPT_CHECK_SILENT) Arg::with_name(OPT_CHECK_SILENT)
.short("C") .short("C")
.long(OPT_CHECK_SILENT) .long(OPT_CHECK_SILENT)
.help("exit successfully if the given file is already sorted, and exit with status 1 otherwise. "), .help("exit successfully if the given file is already sorted, and exit with status 1 otherwise."),
) )
.arg( .arg(
Arg::with_name(OPT_IGNORE_CASE) Arg::with_name(OPT_IGNORE_CASE)
@ -353,7 +354,7 @@ pub fn uumain(args: impl uucore::Args) -> i32 {
if let Ok(n) = line { if let Ok(n) = line {
files.push( files.push(
std::str::from_utf8(&n) std::str::from_utf8(&n)
.expect("Could not parse zero terminated string from input.") .expect("Could not parse string from zero terminated input.")
.to_string(), .to_string(),
); );
} }
@ -488,6 +489,8 @@ fn exec(files: Vec<String>, settings: &mut Settings) -> i32 {
} else { } else {
print_sorted(file_merger, &settings) print_sorted(file_merger, &settings)
} }
} else if settings.mode == SortMode::Default && settings.unique {
print_sorted(lines.iter().dedup(), &settings)
} else if settings.mode == SortMode::Month && settings.unique { } else if settings.mode == SortMode::Month && settings.unique {
print_sorted( print_sorted(
lines lines
@ -499,7 +502,7 @@ fn exec(files: Vec<String>, settings: &mut Settings) -> i32 {
print_sorted( print_sorted(
lines lines
.iter() .iter()
.dedup_by(|a, b| get_nums_dedup(a) == get_nums_dedup(b)), .dedup_by(|a, b| get_num_dedup(a, &settings) == get_num_dedup(b, &settings)),
&settings, &settings,
) )
} else { } else {
@ -603,12 +606,13 @@ fn default_compare(a: &str, b: &str) -> Ordering {
#[inline(always)] #[inline(always)]
fn leading_num_common(a: &str) -> &str { fn leading_num_common(a: &str) -> &str {
let mut s = ""; let mut s = "";
// check whether char is numeric, whitespace or decimal point or thousand separator
for (idx, c) in a.char_indices() { for (idx, c) in a.char_indices() {
// check whether char is numeric, whitespace or decimal point or thousand seperator
if !c.is_numeric() if !c.is_numeric()
&& !c.is_whitespace() && !c.is_whitespace()
&& !c.eq(&DECIMAL_PT)
&& !c.eq(&THOUSANDS_SEP) && !c.eq(&THOUSANDS_SEP)
&& !c.eq(&DECIMAL_PT)
// check for e notation // check for e notation
&& !c.eq(&'e') && !c.eq(&'e')
&& !c.eq(&'E') && !c.eq(&'E')
@ -621,7 +625,7 @@ fn leading_num_common(a: &str) -> &str {
break; break;
} }
// If line is not a number line, return the line as is // If line is not a number line, return the line as is
s = a; s = &a;
} }
s s
} }
@ -633,16 +637,17 @@ fn leading_num_common(a: &str) -> &str {
// not recognize a positive sign or scientific/E notation so we strip those elements here. // not recognize a positive sign or scientific/E notation so we strip those elements here.
fn get_leading_num(a: &str) -> &str { fn get_leading_num(a: &str) -> &str {
let mut s = ""; let mut s = "";
let b = leading_num_common(a);
// GNU numeric sort doesn't recognize '+' or 'e' notation so we strip let a = leading_num_common(a);
for (idx, c) in b.char_indices() {
if c.eq(&'e') || c.eq(&'E') || b.chars().nth(0).unwrap_or('\0').eq(&POSITIVE) { // GNU numeric sort doesn't recognize '+' or 'e' notation so we strip trailing chars
s = &b[..idx]; for (idx, c) in a.char_indices() {
if c.eq(&'e') || c.eq(&'E') || a.chars().nth(0).unwrap_or('\0').eq(&POSITIVE) {
s = &a[..idx];
break; break;
} }
// If no further processing needed to be done, return the line as-is to be sorted // If no further processing needed to be done, return the line as-is to be sorted
s = b; s = &a;
} }
// And empty number or non-number lines are to be treated as 0 but only for numeric sort // And empty number or non-number lines are to be treated as 0 but only for numeric sort
@ -657,30 +662,32 @@ fn get_leading_num(a: &str) -> &str {
// In contrast to numeric compare, GNU general numeric/FP sort *should* recognize positive signs and // In contrast to numeric compare, GNU general numeric/FP sort *should* recognize positive signs and
// scientific notation, so we strip those lines only after the end of the following numeric string. // scientific notation, so we strip those lines only after the end of the following numeric string.
// For example, 5e10KFD would be 5e10 or 5x10^10 and +10000HFKJFK would become 10000. // For example, 5e10KFD would be 5e10 or 5x10^10 and +10000HFKJFK would become 10000.
fn get_leading_gen(a: &str) -> String { fn get_leading_gen(a: &str) -> &str {
// Make this iter peekable to see if next char is numeric // Make this iter peekable to see if next char is numeric
let mut p_iter = leading_num_common(a).chars().peekable(); let raw_leading_num = leading_num_common(a);
let mut r = String::new(); let mut p_iter = raw_leading_num.chars().peekable();
let mut result = "";
// Cleanup raw stripped strings // Cleanup raw stripped strings
for c in p_iter.to_owned() { for c in p_iter.to_owned() {
let next_char_numeric = p_iter.peek().unwrap_or(&'\0').is_numeric(); let next_char_numeric = p_iter.peek().unwrap_or(&'\0').is_numeric();
// Only general numeric recognizes e notation and, see block below, the '+' sign // Only general numeric recognizes e notation and the '+' sign
if (c.eq(&'e') && !next_char_numeric) || (c.eq(&'E') && !next_char_numeric) { if (c.eq(&'e') && !next_char_numeric)
r = a.split(c).next().unwrap_or("").to_owned(); || (c.eq(&'E') && !next_char_numeric)
// Only GNU (non-general) numeric recognize thousands seperators, takes only leading #
|| c.eq(&THOUSANDS_SEP)
{
result = a.split(c).next().unwrap_or("");
break; break;
// If positive sign and next char is not numeric, split at postive sign at keep trailing numbers // If positive sign and next char is not numeric, split at postive sign at keep trailing numbers
// There is a more elegant way to do this in Rust 1.45, std::str::strip_prefix // There is a more elegant way to do this in Rust 1.45, std::str::strip_prefix
} else if c.eq(&POSITIVE) && !next_char_numeric { } else if c.eq(&POSITIVE) && !next_char_numeric {
let mut v: Vec<&str> = a.split(c).collect(); result = a.trim().trim_start_matches('+');
let x = v.split_off(1);
r = x.join("");
break; break;
}
// If no further processing needed to be done, return the line as-is to be sorted // If no further processing needed to be done, return the line as-is to be sorted
} else { result = a;
r = a.to_owned();
} }
} result
r
} }
fn get_months_dedup(a: &str) -> String { fn get_months_dedup(a: &str) -> String {
@ -714,10 +721,10 @@ fn get_months_dedup(a: &str) -> String {
} }
} }
// *For all dedups/uniques we must compare leading numbers* // *For all dedups/uniques expect default we must compare leading numbers*
// Also note numeric compare and unique output is specifically *not* the same as a "sort | uniq" // Also note numeric compare and unique output is specifically *not* the same as a "sort | uniq"
// See: https://www.gnu.org/software/coreutils/manual/html_node/sort-invocation.html // See: https://www.gnu.org/software/coreutils/manual/html_node/sort-invocation.html
fn get_nums_dedup(a: &str) -> &str { fn get_num_dedup<'a>(a: &'a str, settings: &&mut Settings) -> &'a str {
// Trim and remove any leading zeros // Trim and remove any leading zeros
let s = a.trim().trim_start_matches('0'); let s = a.trim().trim_start_matches('0');
@ -731,20 +738,50 @@ fn get_nums_dedup(a: &str) -> &str {
"" ""
// Prepare lines for comparison of only the numerical leading numbers // Prepare lines for comparison of only the numerical leading numbers
} else { } else {
get_leading_num(s) let result = match settings.mode {
SortMode::Numeric => get_leading_num(s),
SortMode::GeneralNumeric => get_leading_gen(s),
SortMode::HumanNumeric => get_leading_num(s),
SortMode::Version => get_leading_num(s),
_ => s,
};
result
}
}
#[inline(always)]
fn remove_thousands_sep<'a, S: Into<Cow<'a, str>>>(input: S) -> Cow<'a, str> {
let input = input.into();
if input.contains(THOUSANDS_SEP) {
let output = input.replace(THOUSANDS_SEP, "");
Cow::Owned(output)
} else {
input
}
}
#[inline(always)]
fn remove_trailing_dec<'a, S: Into<Cow<'a, str>>>(input: S) -> Cow<'a, str> {
let input = input.into();
if let Some(s) = input.find(DECIMAL_PT) {
let (leading, trailing) = input.split_at(s);
let output = [leading, ".", trailing.replace(DECIMAL_PT, "").as_str()].concat();
Cow::Owned(output)
} else {
input
} }
} }
/// Parse the beginning string into an f64, returning -inf instead of NaN on errors. /// Parse the beginning string into an f64, returning -inf instead of NaN on errors.
#[inline(always)] #[inline(always)]
fn permissive_f64_parse(a: &str) -> f64 { fn permissive_f64_parse(a: &str) -> f64 {
// Remove thousands seperators
let a = a.replace(THOUSANDS_SEP, "");
// GNU sort treats "NaN" as non-number in numeric, so it needs special care. // GNU sort treats "NaN" as non-number in numeric, so it needs special care.
// *Keep this trim before parse* despite what POSIX may say about -b and -n // *Keep this trim before parse* despite what POSIX may say about -b and -n
// because GNU and BSD both seem to require it to match their behavior // because GNU and BSD both seem to require it to match their behavior
match a.trim().parse::<f64>() { //
// Remove any trailing decimals, ie 4568..890... becomes 4568.890
// Then, we trim whitespace and parse
match remove_trailing_dec(a).trim().parse::<f64>() {
Ok(a) if a.is_nan() => std::f64::NEG_INFINITY, Ok(a) if a.is_nan() => std::f64::NEG_INFINITY,
Ok(a) => a, Ok(a) => a,
Err(_) => std::f64::NEG_INFINITY, Err(_) => std::f64::NEG_INFINITY,
@ -757,8 +794,13 @@ fn numeric_compare(a: &str, b: &str) -> Ordering {
let sa = get_leading_num(a); let sa = get_leading_num(a);
let sb = get_leading_num(b); let sb = get_leading_num(b);
let fa = permissive_f64_parse(sa); // Avoids a string alloc for every line to remove thousands seperators here
let fb = permissive_f64_parse(sb); // instead of inside the get_leading_num function, which is a HUGE performance benefit
let ta = remove_thousands_sep(sa);
let tb = remove_thousands_sep(sb);
let fa = permissive_f64_parse(&ta);
let fb = permissive_f64_parse(&tb);
// f64::cmp isn't implemented (due to NaN issues); implement directly instead // f64::cmp isn't implemented (due to NaN issues); implement directly instead
if fa > fb { if fa > fb {
@ -799,8 +841,8 @@ fn general_numeric_compare(a: &str, b: &str) -> Ordering {
// these types of numbers, we rarely care about pure performance. // these types of numbers, we rarely care about pure performance.
fn human_numeric_convert(a: &str) -> f64 { fn human_numeric_convert(a: &str) -> f64 {
let num_str = get_leading_num(a); let num_str = get_leading_num(a);
let suffix = a.trim_start_matches(num_str); let suffix = a.trim_start_matches(&num_str);
let num_part = permissive_f64_parse(num_str); let num_part = permissive_f64_parse(&num_str);
let suffix: f64 = match suffix.parse().unwrap_or('\0') { let suffix: f64 = match suffix.parse().unwrap_or('\0') {
// SI Units // SI Units
'K' => 1E3, 'K' => 1E3,

BIN
tests/.DS_Store vendored Normal file

Binary file not shown.

View file

@ -1,5 +1,31 @@
use crate::common::util::*; use crate::common::util::*;
fn test_helper(file_name: &str, args: &str) {
new_ucmd!()
.arg(args)
.arg(format!("{}.txt", file_name))
.succeeds()
.stdout_is_fixture(format!("{}.expected", file_name));
}
#[test]
fn test_multiple_decimals_general() {
new_ucmd!()
.arg("-g")
.arg("multiple_decimals_general.txt")
.succeeds()
.stdout_is("\n\n\n\n\n\n\n\nCARAvan\n-2028789030\n-896689\n-8.90880\n-1\n-.05\n000\n00000001\n1\n1.040000000\n1.444\n1.58590\n8.013\n45\n46.89\n576,446.88800000\n576,446.890\n 4567.\n4567.1\n4567.34\n\t\t\t\t\t\t\t\t\t\t4567..457\n\t\t\t\t37800\n\t\t\t\t\t\t45670.89079.098\n\t\t\t\t\t\t45670.89079.1\n4798908.340000000000\n4798908.45\n4798908.8909800\n");
}
#[test]
fn test_multiple_decimals_numeric() {
new_ucmd!()
.arg("-n")
.arg("multiple_decimals_numeric.txt")
.succeeds()
.stdout_is("-2028789030\n-896689\n-8.90880\n-1\n-.05\n\n\n\n\n\n\n\n\n000\nCARAvan\n00000001\n1\n1.040000000\n1.444\n1.58590\n8.013\n45\n46.89\n 4567.\n4567.1\n4567.34\n\t\t\t\t\t\t\t\t\t\t4567..457\n\t\t\t\t37800\n\t\t\t\t\t\t45670.89079.098\n\t\t\t\t\t\t45670.89079.1\n576,446.88800000\n576,446.890\n4798908.340000000000\n4798908.45\n4798908.8909800\n");
}
#[test] #[test]
fn test_check_zero_terminated_failure() { fn test_check_zero_terminated_failure() {
new_ucmd!() new_ucmd!()
@ -44,6 +70,21 @@ fn test_random_shuffle_contains_all_lines() {
assert_eq!(result_sorted, expected); assert_eq!(result_sorted, expected);
} }
#[test]
fn test_random_shuffle_two_runs_not_the_same() {
// check to verify that two random shuffles are not equal; this has the
// potential to fail in the very unlikely event that the random order is the same
// as the starting order, or if both random sorts end up having the same order.
const FILE: &'static str = "default_unsorted_ints.expected";
let (at, _ucmd) = at_and_ucmd!();
let result = new_ucmd!().arg("-R").arg(FILE).run().stdout;
let expected = at.read(FILE);
let unexpected = new_ucmd!().arg("-R").arg(FILE).run().stdout;
assert_ne!(result, expected);
assert_ne!(result, unexpected);
}
#[test] #[test]
fn test_random_shuffle_contains_two_runs_not_the_same() { fn test_random_shuffle_contains_two_runs_not_the_same() {
// check to verify that two random shuffles are not equal; this has the // check to verify that two random shuffles are not equal; this has the
@ -355,11 +396,3 @@ fn test_check_silent() {
.fails() .fails()
.stdout_is(""); .stdout_is("");
} }
fn test_helper(file_name: &str, args: &str) {
new_ucmd!()
.arg(args)
.arg(format!("{}{}", file_name, ".txt"))
.succeeds()
.stdout_is_fixture(format!("{}{}", file_name, ".expected"));
}

BIN
tests/fixtures/.DS_Store vendored Normal file

Binary file not shown.

View file

@ -0,0 +1,20 @@
4798908.8909800
4798908.45
4798908.340000000000
576,446.890
576,446.88800000
37800
4567.
46.89
45
8.013
1.58590
1.444
1.040000000
1
-.05
-1
-8.90880
-896689
-2028789030

View file

@ -0,0 +1,33 @@
-2028789030
-896689
-8.90880
-1
-.05
000
CARAvan
00000001
1
1.040000000
1.444
1.58590
8.013
45
46.89
4567..457
4567.
4567.1
4567.34
37800
576,446.88800000
576,446.890
4798908.340000000000
4798908.45
4798908.8909800

View file

@ -0,0 +1,35 @@
576,446.890
576,446.88800000
4567.1
4567..457
45670.89079.1
45670.89079.098
4567.34
4567.
45
46.89
-1
1
00000001
4798908.340000000000
4798908.45
4798908.8909800
37800
-2028789030
-896689
CARAvan
-8.90880
-.05
1.444
1.58590
1.040000000
8.013
000

View file

@ -0,0 +1,35 @@
576,446.890
576,446.88800000
4567.1
4567..457
45670.89079.1
45670.89079.098
4567.34
4567.
45
46.89
-1
1
00000001
4798908.340000000000
4798908.45
4798908.8909800
37800
-2028789030
-896689
CARAvan
-8.90880
-.05
1.444
1.58590
1.040000000
8.013
000