mirror of
https://github.com/RGBCube/uutils-coreutils
synced 2025-07-27 19:17:43 +00:00
Merge pull request #7567 from MoSal/faster_sort_n
sort: immediately compare whole lines if they parse as numbers
This commit is contained in:
commit
fb165850a4
3 changed files with 47 additions and 2 deletions
|
@ -24,8 +24,19 @@ Run `cargo build --release` before benchmarking after you make a change!
|
|||
|
||||
## Sorting numbers
|
||||
|
||||
- Generate a list of numbers: `seq 0 100000 | sort -R > shuffled_numbers.txt`.
|
||||
- Benchmark numeric sorting with hyperfine: `hyperfine "target/release/coreutils sort shuffled_numbers.txt -n -o output.txt"`.
|
||||
- Generate a list of numbers:
|
||||
```
|
||||
shuf -i 1-1000000 -n 1000000 > /tmp/shuffled_numbers.txt
|
||||
# or
|
||||
seq 1 1000000 | sort -R > /tmp/shuffled_numbers.txt
|
||||
```
|
||||
- Benchmark numeric sorting with hyperfine:
|
||||
```
|
||||
hyperfine --warmup 3 \
|
||||
'/tmp/gnu-sort -n /tmp/shuffled_numbers.txt' \
|
||||
'/tmp/uu_before sort -n /tmp/shuffled_numbers.txt' \
|
||||
'/tmp/uu_after sort -n /tmp/shuffled_numbers.txt'
|
||||
```
|
||||
|
||||
## Sorting numbers with -g
|
||||
|
||||
|
|
|
@ -42,6 +42,7 @@ pub struct LineData<'a> {
|
|||
pub selections: Vec<&'a str>,
|
||||
pub num_infos: Vec<NumInfo>,
|
||||
pub parsed_floats: Vec<GeneralF64ParseResult>,
|
||||
pub line_num_floats: Vec<Option<f64>>,
|
||||
}
|
||||
|
||||
impl Chunk {
|
||||
|
@ -52,6 +53,7 @@ impl Chunk {
|
|||
contents.line_data.selections.clear();
|
||||
contents.line_data.num_infos.clear();
|
||||
contents.line_data.parsed_floats.clear();
|
||||
contents.line_data.line_num_floats.clear();
|
||||
let lines = unsafe {
|
||||
// SAFETY: It is safe to (temporarily) transmute to a vector of lines with a longer lifetime,
|
||||
// because the vector is empty.
|
||||
|
@ -73,6 +75,7 @@ impl Chunk {
|
|||
selections,
|
||||
std::mem::take(&mut contents.line_data.num_infos),
|
||||
std::mem::take(&mut contents.line_data.parsed_floats),
|
||||
std::mem::take(&mut contents.line_data.line_num_floats),
|
||||
)
|
||||
});
|
||||
RecycledChunk {
|
||||
|
@ -80,6 +83,7 @@ impl Chunk {
|
|||
selections: recycled_contents.1,
|
||||
num_infos: recycled_contents.2,
|
||||
parsed_floats: recycled_contents.3,
|
||||
line_num_floats: recycled_contents.4,
|
||||
buffer: self.into_owner(),
|
||||
}
|
||||
}
|
||||
|
@ -97,6 +101,7 @@ pub struct RecycledChunk {
|
|||
selections: Vec<&'static str>,
|
||||
num_infos: Vec<NumInfo>,
|
||||
parsed_floats: Vec<GeneralF64ParseResult>,
|
||||
line_num_floats: Vec<Option<f64>>,
|
||||
buffer: Vec<u8>,
|
||||
}
|
||||
|
||||
|
@ -107,6 +112,7 @@ impl RecycledChunk {
|
|||
selections: Vec::new(),
|
||||
num_infos: Vec::new(),
|
||||
parsed_floats: Vec::new(),
|
||||
line_num_floats: Vec::new(),
|
||||
buffer: vec![0; capacity],
|
||||
}
|
||||
}
|
||||
|
@ -149,6 +155,7 @@ pub fn read<T: Read>(
|
|||
selections,
|
||||
num_infos,
|
||||
parsed_floats,
|
||||
line_num_floats,
|
||||
mut buffer,
|
||||
} = recycled_chunk;
|
||||
if buffer.len() < carry_over.len() {
|
||||
|
@ -184,6 +191,7 @@ pub fn read<T: Read>(
|
|||
selections,
|
||||
num_infos,
|
||||
parsed_floats,
|
||||
line_num_floats,
|
||||
};
|
||||
parse_lines(read, &mut lines, &mut line_data, separator, settings);
|
||||
Ok(ChunkContents { lines, line_data })
|
||||
|
@ -207,6 +215,7 @@ fn parse_lines<'a>(
|
|||
assert!(line_data.selections.is_empty());
|
||||
assert!(line_data.num_infos.is_empty());
|
||||
assert!(line_data.parsed_floats.is_empty());
|
||||
assert!(line_data.line_num_floats.is_empty());
|
||||
let mut token_buffer = vec![];
|
||||
lines.extend(
|
||||
read.split(separator as char)
|
||||
|
|
|
@ -460,6 +460,13 @@ impl<'a> Line<'a> {
|
|||
if settings.precomputed.needs_tokens {
|
||||
tokenize(line, settings.separator, token_buffer);
|
||||
}
|
||||
if settings.mode == SortMode::Numeric {
|
||||
// exclude inf, nan, scientific notation
|
||||
let line_num_float = (!line.contains(char::is_alphabetic))
|
||||
.then(|| line.parse::<f64>().ok())
|
||||
.flatten();
|
||||
line_data.line_num_floats.push(line_num_float);
|
||||
}
|
||||
for (selector, selection) in settings
|
||||
.selectors
|
||||
.iter()
|
||||
|
@ -1563,6 +1570,24 @@ fn compare_by<'a>(
|
|||
let mut selection_index = 0;
|
||||
let mut num_info_index = 0;
|
||||
let mut parsed_float_index = 0;
|
||||
|
||||
if let (Some(Some(a_f64)), Some(Some(b_f64))) = (
|
||||
a_line_data.line_num_floats.get(a.index),
|
||||
b_line_data.line_num_floats.get(b.index),
|
||||
) {
|
||||
// we don't use total_cmp() because it always sorts -0 before 0
|
||||
if let Some(cmp) = a_f64.partial_cmp(b_f64) {
|
||||
// don't trust `Ordering::Equal` if lines are not fully equal
|
||||
if cmp != Ordering::Equal || a.line == b.line {
|
||||
return if global_settings.reverse {
|
||||
cmp.reverse()
|
||||
} else {
|
||||
cmp
|
||||
};
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for selector in &global_settings.selectors {
|
||||
let (a_str, b_str) = if selector.needs_selection {
|
||||
let selections = (
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue