mirror of
https://github.com/RGBCube/uutils-coreutils
synced 2025-07-28 11:37:44 +00:00
sort: immediately compare whole lines if they parse as numbers
Numeric sort can be relatively slow on inputs that are wholly or mostly numbers. This is clearer when comparing with the speed of GeneralNumeric.

This change parses whole lines as f64 and stores that info in `LineData`. This is faster than doing the parsing two lines at a time in `compare_by()`.

# Benchmarks

`shuf -i 1-1000000 -n 1000000 > /tmp/shuffled.txt`

% hyperfine --warmup 3 \
    '/tmp/gnu-sort -n /tmp/shuffled.txt' \
    '/tmp/before_coreutils sort -n /tmp/shuffled.txt' \
    '/tmp/after_coreutils sort -n /tmp/shuffled.txt'

Benchmark 1: /tmp/gnu-sort -n /tmp/shuffled.txt
  Time (mean ± σ): 198.2 ms ± 5.8 ms [User: 884.6 ms, System: 22.0 ms]
  Range (min … max): 187.3 ms … 207.4 ms 15 runs

Benchmark 2: /tmp/before_coreutils sort -n /tmp/shuffled.txt
  Time (mean ± σ): 361.3 ms ± 8.7 ms [User: 1898.7 ms, System: 18.9 ms]
  Range (min … max): 350.4 ms … 375.3 ms 10 runs

Benchmark 3: /tmp/after_coreutils sort -n /tmp/shuffled.txt
  Time (mean ± σ): 175.1 ms ± 6.7 ms [User: 536.8 ms, System: 21.6 ms]
  Range (min … max): 169.3 ms … 197.0 ms 16 runs

Summary
  /tmp/after_coreutils sort -n /tmp/shuffled.txt ran
    1.13 ± 0.05 times faster than /tmp/gnu-sort -n /tmp/shuffled.txt
    2.06 ± 0.09 times faster than /tmp/before_coreutils sort -n /tmp/shuffled.txt

Signed-off-by: Mohammad AlSaleh <CE.Mohammad.AlSaleh@gmail.com>
This commit is contained in:
parent
d33d731804
commit
fd38fd69e9
2 changed files with 34 additions and 0 deletions
|
@ -42,6 +42,7 @@ pub struct LineData<'a> {
|
||||||
pub selections: Vec<&'a str>,
|
pub selections: Vec<&'a str>,
|
||||||
pub num_infos: Vec<NumInfo>,
|
pub num_infos: Vec<NumInfo>,
|
||||||
pub parsed_floats: Vec<GeneralF64ParseResult>,
|
pub parsed_floats: Vec<GeneralF64ParseResult>,
|
||||||
|
pub line_num_floats: Vec<Option<f64>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Chunk {
|
impl Chunk {
|
||||||
|
@ -52,6 +53,7 @@ impl Chunk {
|
||||||
contents.line_data.selections.clear();
|
contents.line_data.selections.clear();
|
||||||
contents.line_data.num_infos.clear();
|
contents.line_data.num_infos.clear();
|
||||||
contents.line_data.parsed_floats.clear();
|
contents.line_data.parsed_floats.clear();
|
||||||
|
contents.line_data.line_num_floats.clear();
|
||||||
let lines = unsafe {
|
let lines = unsafe {
|
||||||
// SAFETY: It is safe to (temporarily) transmute to a vector of lines with a longer lifetime,
|
// SAFETY: It is safe to (temporarily) transmute to a vector of lines with a longer lifetime,
|
||||||
// because the vector is empty.
|
// because the vector is empty.
|
||||||
|
@ -73,6 +75,7 @@ impl Chunk {
|
||||||
selections,
|
selections,
|
||||||
std::mem::take(&mut contents.line_data.num_infos),
|
std::mem::take(&mut contents.line_data.num_infos),
|
||||||
std::mem::take(&mut contents.line_data.parsed_floats),
|
std::mem::take(&mut contents.line_data.parsed_floats),
|
||||||
|
std::mem::take(&mut contents.line_data.line_num_floats),
|
||||||
)
|
)
|
||||||
});
|
});
|
||||||
RecycledChunk {
|
RecycledChunk {
|
||||||
|
@ -80,6 +83,7 @@ impl Chunk {
|
||||||
selections: recycled_contents.1,
|
selections: recycled_contents.1,
|
||||||
num_infos: recycled_contents.2,
|
num_infos: recycled_contents.2,
|
||||||
parsed_floats: recycled_contents.3,
|
parsed_floats: recycled_contents.3,
|
||||||
|
line_num_floats: recycled_contents.4,
|
||||||
buffer: self.into_owner(),
|
buffer: self.into_owner(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -97,6 +101,7 @@ pub struct RecycledChunk {
|
||||||
selections: Vec<&'static str>,
|
selections: Vec<&'static str>,
|
||||||
num_infos: Vec<NumInfo>,
|
num_infos: Vec<NumInfo>,
|
||||||
parsed_floats: Vec<GeneralF64ParseResult>,
|
parsed_floats: Vec<GeneralF64ParseResult>,
|
||||||
|
line_num_floats: Vec<Option<f64>>,
|
||||||
buffer: Vec<u8>,
|
buffer: Vec<u8>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -107,6 +112,7 @@ impl RecycledChunk {
|
||||||
selections: Vec::new(),
|
selections: Vec::new(),
|
||||||
num_infos: Vec::new(),
|
num_infos: Vec::new(),
|
||||||
parsed_floats: Vec::new(),
|
parsed_floats: Vec::new(),
|
||||||
|
line_num_floats: Vec::new(),
|
||||||
buffer: vec![0; capacity],
|
buffer: vec![0; capacity],
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -149,6 +155,7 @@ pub fn read<T: Read>(
|
||||||
selections,
|
selections,
|
||||||
num_infos,
|
num_infos,
|
||||||
parsed_floats,
|
parsed_floats,
|
||||||
|
line_num_floats,
|
||||||
mut buffer,
|
mut buffer,
|
||||||
} = recycled_chunk;
|
} = recycled_chunk;
|
||||||
if buffer.len() < carry_over.len() {
|
if buffer.len() < carry_over.len() {
|
||||||
|
@ -184,6 +191,7 @@ pub fn read<T: Read>(
|
||||||
selections,
|
selections,
|
||||||
num_infos,
|
num_infos,
|
||||||
parsed_floats,
|
parsed_floats,
|
||||||
|
line_num_floats,
|
||||||
};
|
};
|
||||||
parse_lines(read, &mut lines, &mut line_data, separator, settings);
|
parse_lines(read, &mut lines, &mut line_data, separator, settings);
|
||||||
Ok(ChunkContents { lines, line_data })
|
Ok(ChunkContents { lines, line_data })
|
||||||
|
@ -207,6 +215,7 @@ fn parse_lines<'a>(
|
||||||
assert!(line_data.selections.is_empty());
|
assert!(line_data.selections.is_empty());
|
||||||
assert!(line_data.num_infos.is_empty());
|
assert!(line_data.num_infos.is_empty());
|
||||||
assert!(line_data.parsed_floats.is_empty());
|
assert!(line_data.parsed_floats.is_empty());
|
||||||
|
assert!(line_data.line_num_floats.is_empty());
|
||||||
let mut token_buffer = vec![];
|
let mut token_buffer = vec![];
|
||||||
lines.extend(
|
lines.extend(
|
||||||
read.split(separator as char)
|
read.split(separator as char)
|
||||||
|
|
|
@ -460,6 +460,13 @@ impl<'a> Line<'a> {
|
||||||
if settings.precomputed.needs_tokens {
|
if settings.precomputed.needs_tokens {
|
||||||
tokenize(line, settings.separator, token_buffer);
|
tokenize(line, settings.separator, token_buffer);
|
||||||
}
|
}
|
||||||
|
if settings.mode == SortMode::Numeric {
|
||||||
|
// exclude inf, nan, scientific notation
|
||||||
|
let line_num_float = (!line.contains(char::is_alphabetic))
|
||||||
|
.then(|| line.parse::<f64>().ok())
|
||||||
|
.flatten();
|
||||||
|
line_data.line_num_floats.push(line_num_float);
|
||||||
|
}
|
||||||
for (selector, selection) in settings
|
for (selector, selection) in settings
|
||||||
.selectors
|
.selectors
|
||||||
.iter()
|
.iter()
|
||||||
|
@ -1563,6 +1570,24 @@ fn compare_by<'a>(
|
||||||
let mut selection_index = 0;
|
let mut selection_index = 0;
|
||||||
let mut num_info_index = 0;
|
let mut num_info_index = 0;
|
||||||
let mut parsed_float_index = 0;
|
let mut parsed_float_index = 0;
|
||||||
|
|
||||||
|
if let (Some(Some(a_f64)), Some(Some(b_f64))) = (
|
||||||
|
a_line_data.line_num_floats.get(a.index),
|
||||||
|
b_line_data.line_num_floats.get(b.index),
|
||||||
|
) {
|
||||||
|
// we don't use total_cmp() because it always sorts -0 before 0
|
||||||
|
if let Some(cmp) = a_f64.partial_cmp(b_f64) {
|
||||||
|
// don't trust `Ordering::Equal` if lines are not fully equal
|
||||||
|
if cmp != Ordering::Equal || a.line == b.line {
|
||||||
|
return if global_settings.reverse {
|
||||||
|
cmp.reverse()
|
||||||
|
} else {
|
||||||
|
cmp
|
||||||
|
};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
for selector in &global_settings.selectors {
|
for selector in &global_settings.selectors {
|
||||||
let (a_str, b_str) = if selector.needs_selection {
|
let (a_str, b_str) = if selector.needs_selection {
|
||||||
let selections = (
|
let selections = (
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue