mirror of
https://github.com/RGBCube/uutils-coreutils
synced 2025-07-27 19:17:43 +00:00
Merge pull request #7567 from MoSal/faster_sort_n
sort: immediately compare whole lines if they parse as numbers
This commit is contained in:
commit
fb165850a4
3 changed files with 47 additions and 2 deletions
|
@ -24,8 +24,19 @@ Run `cargo build --release` before benchmarking after you make a change!
|
||||||
|
|
||||||
## Sorting numbers
|
## Sorting numbers
|
||||||
|
|
||||||
- Generate a list of numbers: `seq 0 100000 | sort -R > shuffled_numbers.txt`.
|
- Generate a list of numbers:
|
||||||
- Benchmark numeric sorting with hyperfine: `hyperfine "target/release/coreutils sort shuffled_numbers.txt -n -o output.txt"`.
|
```
|
||||||
|
shuf -i 1-1000000 -n 1000000 > /tmp/shuffled_numbers.txt
|
||||||
|
# or
|
||||||
|
seq 1 1000000 | sort -R > /tmp/shuffled_numbers.txt
|
||||||
|
```
|
||||||
|
- Benchmark numeric sorting with hyperfine (here comparing GNU sort with the uutils binaries built before and after your change):
|
||||||
|
```
|
||||||
|
hyperfine --warmup 3 \
|
||||||
|
'/tmp/gnu-sort -n /tmp/shuffled_numbers.txt' \
|
||||||
|
'/tmp/uu_before sort -n /tmp/shuffled_numbers.txt' \
|
||||||
|
'/tmp/uu_after sort -n /tmp/shuffled_numbers.txt'
|
||||||
|
```
|
||||||
|
|
||||||
## Sorting numbers with -g
|
## Sorting numbers with -g
|
||||||
|
|
||||||
|
|
|
@ -42,6 +42,7 @@ pub struct LineData<'a> {
|
||||||
pub selections: Vec<&'a str>,
|
pub selections: Vec<&'a str>,
|
||||||
pub num_infos: Vec<NumInfo>,
|
pub num_infos: Vec<NumInfo>,
|
||||||
pub parsed_floats: Vec<GeneralF64ParseResult>,
|
pub parsed_floats: Vec<GeneralF64ParseResult>,
|
||||||
|
pub line_num_floats: Vec<Option<f64>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Chunk {
|
impl Chunk {
|
||||||
|
@ -52,6 +53,7 @@ impl Chunk {
|
||||||
contents.line_data.selections.clear();
|
contents.line_data.selections.clear();
|
||||||
contents.line_data.num_infos.clear();
|
contents.line_data.num_infos.clear();
|
||||||
contents.line_data.parsed_floats.clear();
|
contents.line_data.parsed_floats.clear();
|
||||||
|
contents.line_data.line_num_floats.clear();
|
||||||
let lines = unsafe {
|
let lines = unsafe {
|
||||||
// SAFETY: It is safe to (temporarily) transmute to a vector of lines with a longer lifetime,
|
// SAFETY: It is safe to (temporarily) transmute to a vector of lines with a longer lifetime,
|
||||||
// because the vector is empty.
|
// because the vector is empty.
|
||||||
|
@ -73,6 +75,7 @@ impl Chunk {
|
||||||
selections,
|
selections,
|
||||||
std::mem::take(&mut contents.line_data.num_infos),
|
std::mem::take(&mut contents.line_data.num_infos),
|
||||||
std::mem::take(&mut contents.line_data.parsed_floats),
|
std::mem::take(&mut contents.line_data.parsed_floats),
|
||||||
|
std::mem::take(&mut contents.line_data.line_num_floats),
|
||||||
)
|
)
|
||||||
});
|
});
|
||||||
RecycledChunk {
|
RecycledChunk {
|
||||||
|
@ -80,6 +83,7 @@ impl Chunk {
|
||||||
selections: recycled_contents.1,
|
selections: recycled_contents.1,
|
||||||
num_infos: recycled_contents.2,
|
num_infos: recycled_contents.2,
|
||||||
parsed_floats: recycled_contents.3,
|
parsed_floats: recycled_contents.3,
|
||||||
|
line_num_floats: recycled_contents.4,
|
||||||
buffer: self.into_owner(),
|
buffer: self.into_owner(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -97,6 +101,7 @@ pub struct RecycledChunk {
|
||||||
selections: Vec<&'static str>,
|
selections: Vec<&'static str>,
|
||||||
num_infos: Vec<NumInfo>,
|
num_infos: Vec<NumInfo>,
|
||||||
parsed_floats: Vec<GeneralF64ParseResult>,
|
parsed_floats: Vec<GeneralF64ParseResult>,
|
||||||
|
line_num_floats: Vec<Option<f64>>,
|
||||||
buffer: Vec<u8>,
|
buffer: Vec<u8>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -107,6 +112,7 @@ impl RecycledChunk {
|
||||||
selections: Vec::new(),
|
selections: Vec::new(),
|
||||||
num_infos: Vec::new(),
|
num_infos: Vec::new(),
|
||||||
parsed_floats: Vec::new(),
|
parsed_floats: Vec::new(),
|
||||||
|
line_num_floats: Vec::new(),
|
||||||
buffer: vec![0; capacity],
|
buffer: vec![0; capacity],
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -149,6 +155,7 @@ pub fn read<T: Read>(
|
||||||
selections,
|
selections,
|
||||||
num_infos,
|
num_infos,
|
||||||
parsed_floats,
|
parsed_floats,
|
||||||
|
line_num_floats,
|
||||||
mut buffer,
|
mut buffer,
|
||||||
} = recycled_chunk;
|
} = recycled_chunk;
|
||||||
if buffer.len() < carry_over.len() {
|
if buffer.len() < carry_over.len() {
|
||||||
|
@ -184,6 +191,7 @@ pub fn read<T: Read>(
|
||||||
selections,
|
selections,
|
||||||
num_infos,
|
num_infos,
|
||||||
parsed_floats,
|
parsed_floats,
|
||||||
|
line_num_floats,
|
||||||
};
|
};
|
||||||
parse_lines(read, &mut lines, &mut line_data, separator, settings);
|
parse_lines(read, &mut lines, &mut line_data, separator, settings);
|
||||||
Ok(ChunkContents { lines, line_data })
|
Ok(ChunkContents { lines, line_data })
|
||||||
|
@ -207,6 +215,7 @@ fn parse_lines<'a>(
|
||||||
assert!(line_data.selections.is_empty());
|
assert!(line_data.selections.is_empty());
|
||||||
assert!(line_data.num_infos.is_empty());
|
assert!(line_data.num_infos.is_empty());
|
||||||
assert!(line_data.parsed_floats.is_empty());
|
assert!(line_data.parsed_floats.is_empty());
|
||||||
|
assert!(line_data.line_num_floats.is_empty());
|
||||||
let mut token_buffer = vec![];
|
let mut token_buffer = vec![];
|
||||||
lines.extend(
|
lines.extend(
|
||||||
read.split(separator as char)
|
read.split(separator as char)
|
||||||
|
|
|
@ -460,6 +460,13 @@ impl<'a> Line<'a> {
|
||||||
if settings.precomputed.needs_tokens {
|
if settings.precomputed.needs_tokens {
|
||||||
tokenize(line, settings.separator, token_buffer);
|
tokenize(line, settings.separator, token_buffer);
|
||||||
}
|
}
|
||||||
|
if settings.mode == SortMode::Numeric {
|
||||||
|
// exclude inf, nan, scientific notation
|
||||||
|
let line_num_float = (!line.contains(char::is_alphabetic))
|
||||||
|
.then(|| line.parse::<f64>().ok())
|
||||||
|
.flatten();
|
||||||
|
line_data.line_num_floats.push(line_num_float);
|
||||||
|
}
|
||||||
for (selector, selection) in settings
|
for (selector, selection) in settings
|
||||||
.selectors
|
.selectors
|
||||||
.iter()
|
.iter()
|
||||||
|
@ -1563,6 +1570,24 @@ fn compare_by<'a>(
|
||||||
let mut selection_index = 0;
|
let mut selection_index = 0;
|
||||||
let mut num_info_index = 0;
|
let mut num_info_index = 0;
|
||||||
let mut parsed_float_index = 0;
|
let mut parsed_float_index = 0;
|
||||||
|
|
||||||
|
if let (Some(Some(a_f64)), Some(Some(b_f64))) = (
|
||||||
|
a_line_data.line_num_floats.get(a.index),
|
||||||
|
b_line_data.line_num_floats.get(b.index),
|
||||||
|
) {
|
||||||
|
// we don't use total_cmp() because it always sorts -0 before 0
|
||||||
|
if let Some(cmp) = a_f64.partial_cmp(b_f64) {
|
||||||
|
// don't trust `Ordering::Equal` if lines are not fully equal
|
||||||
|
if cmp != Ordering::Equal || a.line == b.line {
|
||||||
|
return if global_settings.reverse {
|
||||||
|
cmp.reverse()
|
||||||
|
} else {
|
||||||
|
cmp
|
||||||
|
};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
for selector in &global_settings.selectors {
|
for selector in &global_settings.selectors {
|
||||||
let (a_str, b_str) = if selector.needs_selection {
|
let (a_str, b_str) = if selector.needs_selection {
|
||||||
let selections = (
|
let selections = (
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue