diff --git a/src/uu/sort/BENCHMARKING.md b/src/uu/sort/BENCHMARKING.md index 355245b07..d3fdd80d4 100644 --- a/src/uu/sort/BENCHMARKING.md +++ b/src/uu/sort/BENCHMARKING.md @@ -24,8 +24,19 @@ Run `cargo build --release` before benchmarking after you make a change! ## Sorting numbers -- Generate a list of numbers: `seq 0 100000 | sort -R > shuffled_numbers.txt`. -- Benchmark numeric sorting with hyperfine: `hyperfine "target/release/coreutils sort shuffled_numbers.txt -n -o output.txt"`. +- Generate a list of numbers: + ``` + shuf -i 1-1000000 -n 1000000 > shuffled_numbers.txt + # or + seq 1 1000000 | sort -R > shuffled_numbers.txt + ``` +- Benchmark numeric sorting with hyperfine: + ``` + hyperfine --warmup 3 \ + '/tmp/gnu-sort -n /tmp/shuffled_numbers.txt' \ + '/tmp/uu_before sort -n /tmp/shuffled_numbers.txt' \ + '/tmp/uu_after sort -n /tmp/shuffled_numbers.txt' + ``` ## Sorting numbers with -g diff --git a/src/uu/sort/src/chunks.rs b/src/uu/sort/src/chunks.rs index 6f0ba97bf..8f423701a 100644 --- a/src/uu/sort/src/chunks.rs +++ b/src/uu/sort/src/chunks.rs @@ -42,6 +42,7 @@ pub struct LineData<'a> { pub selections: Vec<&'a str>, pub num_infos: Vec, pub parsed_floats: Vec, + pub line_num_floats: Vec>, } impl Chunk { @@ -52,6 +53,7 @@ impl Chunk { contents.line_data.selections.clear(); contents.line_data.num_infos.clear(); contents.line_data.parsed_floats.clear(); + contents.line_data.line_num_floats.clear(); let lines = unsafe { // SAFETY: It is safe to (temporarily) transmute to a vector of lines with a longer lifetime, // because the vector is empty. 
@@ -73,6 +75,7 @@ impl Chunk { selections, std::mem::take(&mut contents.line_data.num_infos), std::mem::take(&mut contents.line_data.parsed_floats), + std::mem::take(&mut contents.line_data.line_num_floats), ) }); RecycledChunk { @@ -80,6 +83,7 @@ impl Chunk { selections: recycled_contents.1, num_infos: recycled_contents.2, parsed_floats: recycled_contents.3, + line_num_floats: recycled_contents.4, buffer: self.into_owner(), } } @@ -97,6 +101,7 @@ pub struct RecycledChunk { selections: Vec<&'static str>, num_infos: Vec, parsed_floats: Vec, + line_num_floats: Vec>, buffer: Vec, } @@ -107,6 +112,7 @@ impl RecycledChunk { selections: Vec::new(), num_infos: Vec::new(), parsed_floats: Vec::new(), + line_num_floats: Vec::new(), buffer: vec![0; capacity], } } @@ -149,6 +155,7 @@ pub fn read( selections, num_infos, parsed_floats, + line_num_floats, mut buffer, } = recycled_chunk; if buffer.len() < carry_over.len() { @@ -184,6 +191,7 @@ pub fn read( selections, num_infos, parsed_floats, + line_num_floats, }; parse_lines(read, &mut lines, &mut line_data, separator, settings); Ok(ChunkContents { lines, line_data }) @@ -207,6 +215,7 @@ fn parse_lines<'a>( assert!(line_data.selections.is_empty()); assert!(line_data.num_infos.is_empty()); assert!(line_data.parsed_floats.is_empty()); + assert!(line_data.line_num_floats.is_empty()); let mut token_buffer = vec![]; lines.extend( read.split(separator as char) diff --git a/src/uu/sort/src/sort.rs b/src/uu/sort/src/sort.rs index 31dc81751..87b0fa7b5 100644 --- a/src/uu/sort/src/sort.rs +++ b/src/uu/sort/src/sort.rs @@ -460,6 +460,13 @@ impl<'a> Line<'a> { if settings.precomputed.needs_tokens { tokenize(line, settings.separator, token_buffer); } + if settings.mode == SortMode::Numeric { + // exclude inf, nan, scientific notation + let line_num_float = (!line.contains(char::is_alphabetic)) + .then(|| line.parse::().ok()) + .flatten(); + line_data.line_num_floats.push(line_num_float); + } for (selector, selection) in settings .selectors 
.iter() @@ -1563,6 +1570,24 @@ fn compare_by<'a>( let mut selection_index = 0; let mut num_info_index = 0; let mut parsed_float_index = 0; + + if let (Some(Some(a_f64)), Some(Some(b_f64))) = ( + a_line_data.line_num_floats.get(a.index), + b_line_data.line_num_floats.get(b.index), + ) { + // we don't use total_cmp() because it always sorts -0 before 0 + if let Some(cmp) = a_f64.partial_cmp(b_f64) { + // don't trust `Ordering::Equal` if lines are not fully equal + if cmp != Ordering::Equal || a.line == b.line { + return if global_settings.reverse { + cmp.reverse() + } else { + cmp + }; + } + } + } + for selector in &global_settings.selectors { let (a_str, b_str) = if selector.needs_selection { let selections = (