From fd38fd69e9bf4ef750df4dbb0200abf02cc8c275 Mon Sep 17 00:00:00 2001 From: Mohammad AlSaleh Date: Tue, 25 Mar 2025 05:14:18 +0300 Subject: [PATCH 1/2] sort: immediately compare whole lines if they parse as numbers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Numeric sort can be relatively slow on inputs that are wholly or mostly numbers. This is more clear when comparing with the speed of GeneralNumeric. This change parses whole lines as f64 and stores that info in `LineData`. This is faster than doing the parsing two lines at a time in `compare_by()`. # Benchmarks `shuf -i 1-1000000 -n 1000000 > /tmp/shuffled.txt` % hyperfine --warmup 3 \ '/tmp/gnu-sort -n /tmp/shuffled.txt' '/tmp/before_coreutils sort -n /tmp/shuffled.txt' '/tmp/after_coreutils sort -n /tmp/shuffled.txt' Benchmark 1: /tmp/gnu-sort -n /tmp/shuffled.txt Time (mean ± σ): 198.2 ms ± 5.8 ms [User: 884.6 ms, System: 22.0 ms] Range (min … max): 187.3 ms … 207.4 ms 15 runs Benchmark 2: /tmp/before_coreutils sort -n /tmp/shuffled.txt Time (mean ± σ): 361.3 ms ± 8.7 ms [User: 1898.7 ms, System: 18.9 ms] Range (min … max): 350.4 ms … 375.3 ms 10 runs Benchmark 3: /tmp/after_coreutils sort -n /tmp/shuffled.txt Time (mean ± σ): 175.1 ms ± 6.7 ms [User: 536.8 ms, System: 21.6 ms] Range (min … max): 169.3 ms … 197.0 ms 16 runs Summary /tmp/after_coreutils sort -n /tmp/shuffled.txt ran 1.13 ± 0.05 times faster than /tmp/gnu-sort -n /tmp/shuffled.txt 2.06 ± 0.09 times faster than /tmp/before_coreutils sort -n /tmp/shuffled.txt Signed-off-by: Mohammad AlSaleh --- src/uu/sort/src/chunks.rs | 9 +++++++++ src/uu/sort/src/sort.rs | 25 +++++++++++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/src/uu/sort/src/chunks.rs b/src/uu/sort/src/chunks.rs index 6f0ba97bf..8f423701a 100644 --- a/src/uu/sort/src/chunks.rs +++ b/src/uu/sort/src/chunks.rs @@ -42,6 +42,7 @@ pub struct LineData<'a> { pub selections: Vec<&'a str>, pub num_infos: Vec, pub parsed_floats: Vec, 
+ pub line_num_floats: Vec>, } impl Chunk { @@ -52,6 +53,7 @@ impl Chunk { contents.line_data.selections.clear(); contents.line_data.num_infos.clear(); contents.line_data.parsed_floats.clear(); + contents.line_data.line_num_floats.clear(); let lines = unsafe { // SAFETY: It is safe to (temporarily) transmute to a vector of lines with a longer lifetime, // because the vector is empty. @@ -73,6 +75,7 @@ impl Chunk { selections, std::mem::take(&mut contents.line_data.num_infos), std::mem::take(&mut contents.line_data.parsed_floats), + std::mem::take(&mut contents.line_data.line_num_floats), ) }); RecycledChunk { @@ -80,6 +83,7 @@ impl Chunk { selections: recycled_contents.1, num_infos: recycled_contents.2, parsed_floats: recycled_contents.3, + line_num_floats: recycled_contents.4, buffer: self.into_owner(), } } @@ -97,6 +101,7 @@ pub struct RecycledChunk { selections: Vec<&'static str>, num_infos: Vec, parsed_floats: Vec, + line_num_floats: Vec>, buffer: Vec, } @@ -107,6 +112,7 @@ impl RecycledChunk { selections: Vec::new(), num_infos: Vec::new(), parsed_floats: Vec::new(), + line_num_floats: Vec::new(), buffer: vec![0; capacity], } } @@ -149,6 +155,7 @@ pub fn read( selections, num_infos, parsed_floats, + line_num_floats, mut buffer, } = recycled_chunk; if buffer.len() < carry_over.len() { @@ -184,6 +191,7 @@ pub fn read( selections, num_infos, parsed_floats, + line_num_floats, }; parse_lines(read, &mut lines, &mut line_data, separator, settings); Ok(ChunkContents { lines, line_data }) @@ -207,6 +215,7 @@ fn parse_lines<'a>( assert!(line_data.selections.is_empty()); assert!(line_data.num_infos.is_empty()); assert!(line_data.parsed_floats.is_empty()); + assert!(line_data.line_num_floats.is_empty()); let mut token_buffer = vec![]; lines.extend( read.split(separator as char) diff --git a/src/uu/sort/src/sort.rs b/src/uu/sort/src/sort.rs index 31dc81751..87b0fa7b5 100644 --- a/src/uu/sort/src/sort.rs +++ b/src/uu/sort/src/sort.rs @@ -460,6 +460,13 @@ impl<'a> Line<'a> { 
if settings.precomputed.needs_tokens { tokenize(line, settings.separator, token_buffer); } + if settings.mode == SortMode::Numeric { + // exclude inf, nan, scientific notation + let line_num_float = (!line.contains(char::is_alphabetic)) + .then(|| line.parse::().ok()) + .flatten(); + line_data.line_num_floats.push(line_num_float); + } for (selector, selection) in settings .selectors .iter() @@ -1563,6 +1570,24 @@ fn compare_by<'a>( let mut selection_index = 0; let mut num_info_index = 0; let mut parsed_float_index = 0; + + if let (Some(Some(a_f64)), Some(Some(b_f64))) = ( + a_line_data.line_num_floats.get(a.index), + b_line_data.line_num_floats.get(b.index), + ) { + // we don't use total_cmp() because it always sorts -0 before 0 + if let Some(cmp) = a_f64.partial_cmp(b_f64) { + // don't trust `Ordering::Equal` if lines are not fully equal + if cmp != Ordering::Equal || a.line == b.line { + return if global_settings.reverse { + cmp.reverse() + } else { + cmp + }; + } + } + } + for selector in &global_settings.selectors { let (a_str, b_str) = if selector.needs_selection { let selections = ( From 410da77d43efbb7f7103551b274fd7a06f059f18 Mon Sep 17 00:00:00 2001 From: Mohammad AlSaleh Date: Wed, 26 Mar 2025 11:58:48 +0300 Subject: [PATCH 2/2] sort: expand numeric sort section in BENCHMARKING.md a bit Signed-off-by: Mohammad AlSaleh --- src/uu/sort/BENCHMARKING.md | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/src/uu/sort/BENCHMARKING.md b/src/uu/sort/BENCHMARKING.md index 355245b07..d3fdd80d4 100644 --- a/src/uu/sort/BENCHMARKING.md +++ b/src/uu/sort/BENCHMARKING.md @@ -24,8 +24,19 @@ Run `cargo build --release` before benchmarking after you make a change! ## Sorting numbers -- Generate a list of numbers: `seq 0 100000 | sort -R > shuffled_numbers.txt`. -- Benchmark numeric sorting with hyperfine: `hyperfine "target/release/coreutils sort shuffled_numbers.txt -n -o output.txt"`. 
+- Generate a list of numbers: + ``` + shuf -i 1-1000000 -n 1000000 > /tmp/shuffled_numbers.txt + # or + seq 1 1000000 | sort -R > /tmp/shuffled_numbers.txt + ``` +- Benchmark numeric sorting with hyperfine: + ``` + hyperfine --warmup 3 \ + '/tmp/gnu-sort -n /tmp/shuffled_numbers.txt' \ + '/tmp/uu_before sort -n /tmp/shuffled_numbers.txt' \ + '/tmp/uu_after sort -n /tmp/shuffled_numbers.txt' + ``` ## Sorting numbers with -g