1
Fork 0
mirror of https://github.com/RGBCube/uutils-coreutils synced 2025-07-27 19:17:43 +00:00

Merge pull request #7567 from MoSal/faster_sort_n

sort: immediately compare whole lines if they parse as numbers
This commit is contained in:
Dorian Péron 2025-04-01 12:14:29 +02:00 committed by GitHub
commit fb165850a4
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 47 additions and 2 deletions

View file

@ -24,8 +24,19 @@ Run `cargo build --release` before benchmarking after you make a change!
## Sorting numbers ## Sorting numbers
- Generate a list of numbers: `seq 0 100000 | sort -R > shuffled_numbers.txt`. - Generate a list of numbers:
- Benchmark numeric sorting with hyperfine: `hyperfine "target/release/coreutils sort shuffled_numbers.txt -n -o output.txt"`. ```
shuf -i 1-1000000 -n 1000000 > /tmp/shuffled_numbers.txt
# or
seq 1 1000000 | sort -R > /tmp/shuffled_numbers.txt
```
- Benchmark numeric sorting with hyperfine:
```
hyperfine --warmup 3 \
'/tmp/gnu-sort -n /tmp/shuffled_numbers.txt' \
'/tmp/uu_before sort -n /tmp/shuffled_numbers.txt' \
'/tmp/uu_after sort -n /tmp/shuffled_numbers.txt'
```
## Sorting numbers with -g ## Sorting numbers with -g

View file

@ -42,6 +42,7 @@ pub struct LineData<'a> {
pub selections: Vec<&'a str>, pub selections: Vec<&'a str>,
pub num_infos: Vec<NumInfo>, pub num_infos: Vec<NumInfo>,
pub parsed_floats: Vec<GeneralF64ParseResult>, pub parsed_floats: Vec<GeneralF64ParseResult>,
pub line_num_floats: Vec<Option<f64>>,
} }
impl Chunk { impl Chunk {
@ -52,6 +53,7 @@ impl Chunk {
contents.line_data.selections.clear(); contents.line_data.selections.clear();
contents.line_data.num_infos.clear(); contents.line_data.num_infos.clear();
contents.line_data.parsed_floats.clear(); contents.line_data.parsed_floats.clear();
contents.line_data.line_num_floats.clear();
let lines = unsafe { let lines = unsafe {
// SAFETY: It is safe to (temporarily) transmute to a vector of lines with a longer lifetime, // SAFETY: It is safe to (temporarily) transmute to a vector of lines with a longer lifetime,
// because the vector is empty. // because the vector is empty.
@ -73,6 +75,7 @@ impl Chunk {
selections, selections,
std::mem::take(&mut contents.line_data.num_infos), std::mem::take(&mut contents.line_data.num_infos),
std::mem::take(&mut contents.line_data.parsed_floats), std::mem::take(&mut contents.line_data.parsed_floats),
std::mem::take(&mut contents.line_data.line_num_floats),
) )
}); });
RecycledChunk { RecycledChunk {
@ -80,6 +83,7 @@ impl Chunk {
selections: recycled_contents.1, selections: recycled_contents.1,
num_infos: recycled_contents.2, num_infos: recycled_contents.2,
parsed_floats: recycled_contents.3, parsed_floats: recycled_contents.3,
line_num_floats: recycled_contents.4,
buffer: self.into_owner(), buffer: self.into_owner(),
} }
} }
@ -97,6 +101,7 @@ pub struct RecycledChunk {
selections: Vec<&'static str>, selections: Vec<&'static str>,
num_infos: Vec<NumInfo>, num_infos: Vec<NumInfo>,
parsed_floats: Vec<GeneralF64ParseResult>, parsed_floats: Vec<GeneralF64ParseResult>,
line_num_floats: Vec<Option<f64>>,
buffer: Vec<u8>, buffer: Vec<u8>,
} }
@ -107,6 +112,7 @@ impl RecycledChunk {
selections: Vec::new(), selections: Vec::new(),
num_infos: Vec::new(), num_infos: Vec::new(),
parsed_floats: Vec::new(), parsed_floats: Vec::new(),
line_num_floats: Vec::new(),
buffer: vec![0; capacity], buffer: vec![0; capacity],
} }
} }
@ -149,6 +155,7 @@ pub fn read<T: Read>(
selections, selections,
num_infos, num_infos,
parsed_floats, parsed_floats,
line_num_floats,
mut buffer, mut buffer,
} = recycled_chunk; } = recycled_chunk;
if buffer.len() < carry_over.len() { if buffer.len() < carry_over.len() {
@ -184,6 +191,7 @@ pub fn read<T: Read>(
selections, selections,
num_infos, num_infos,
parsed_floats, parsed_floats,
line_num_floats,
}; };
parse_lines(read, &mut lines, &mut line_data, separator, settings); parse_lines(read, &mut lines, &mut line_data, separator, settings);
Ok(ChunkContents { lines, line_data }) Ok(ChunkContents { lines, line_data })
@ -207,6 +215,7 @@ fn parse_lines<'a>(
assert!(line_data.selections.is_empty()); assert!(line_data.selections.is_empty());
assert!(line_data.num_infos.is_empty()); assert!(line_data.num_infos.is_empty());
assert!(line_data.parsed_floats.is_empty()); assert!(line_data.parsed_floats.is_empty());
assert!(line_data.line_num_floats.is_empty());
let mut token_buffer = vec![]; let mut token_buffer = vec![];
lines.extend( lines.extend(
read.split(separator as char) read.split(separator as char)

View file

@ -460,6 +460,13 @@ impl<'a> Line<'a> {
if settings.precomputed.needs_tokens { if settings.precomputed.needs_tokens {
tokenize(line, settings.separator, token_buffer); tokenize(line, settings.separator, token_buffer);
} }
if settings.mode == SortMode::Numeric {
// exclude inf, nan, scientific notation
let line_num_float = (!line.contains(char::is_alphabetic))
.then(|| line.parse::<f64>().ok())
.flatten();
line_data.line_num_floats.push(line_num_float);
}
for (selector, selection) in settings for (selector, selection) in settings
.selectors .selectors
.iter() .iter()
@ -1563,6 +1570,24 @@ fn compare_by<'a>(
let mut selection_index = 0; let mut selection_index = 0;
let mut num_info_index = 0; let mut num_info_index = 0;
let mut parsed_float_index = 0; let mut parsed_float_index = 0;
if let (Some(Some(a_f64)), Some(Some(b_f64))) = (
a_line_data.line_num_floats.get(a.index),
b_line_data.line_num_floats.get(b.index),
) {
// we don't use total_cmp() because it always sorts -0 before 0
if let Some(cmp) = a_f64.partial_cmp(b_f64) {
// don't trust `Ordering::Equal` if lines are not fully equal
if cmp != Ordering::Equal || a.line == b.line {
return if global_settings.reverse {
cmp.reverse()
} else {
cmp
};
}
}
}
for selector in &global_settings.selectors { for selector in &global_settings.selectors {
let (a_str, b_str) = if selector.needs_selection { let (a_str, b_str) = if selector.needs_selection {
let selections = ( let selections = (