mirror of
https://github.com/RGBCube/uutils-coreutils
synced 2025-07-28 03:27:44 +00:00
sort: implement numeric string comparison (#2070)
* sort: implement numeric string comparison This implements -n and -h using a string comparison algorithm instead of parsing each number to a f64 and comparing those. This should result in a moderate performance increase and eliminate loss of precision. * cache parsed f64 numbers For general numeric comparisons we have to parse numbers as f64, as this behavior is explicitly documented by GNU coreutils. We can however cache the parsed value to speed up comparisons. * fix leading zeroes for negative numbers * use more appropriate name for exponent * improvements to the parse function * move checks into main loop and fix thousands separator condition * remove unneeded checks * rustfmt
This commit is contained in:
parent
481d1ee659
commit
4bbbe3a3f2
5 changed files with 667 additions and 184 deletions
|
@ -9,25 +9,84 @@ list that we should improve / make sure not to regress.
|
|||
Run `cargo build --release` before benchmarking after you make a change!
|
||||
|
||||
## Sorting a wordlist
|
||||
- Get a wordlist, for example with [words](https://en.wikipedia.org/wiki/Words_(Unix)) on Linux. The exact wordlist
|
||||
doesn't matter for performance comparisons. In this example I'm using `/usr/share/dict/american-english` as the wordlist.
|
||||
- Shuffle the wordlist by running `sort -R /usr/share/dict/american-english > shuffled_wordlist.txt`.
|
||||
- Benchmark sorting the wordlist with hyperfine: `hyperfine "target/release/coreutils sort shuffled_wordlist.txt -o output.txt"`.
|
||||
|
||||
- Get a wordlist, for example with [words](<https://en.wikipedia.org/wiki/Words_(Unix)>) on Linux. The exact wordlist
|
||||
doesn't matter for performance comparisons. In this example I'm using `/usr/share/dict/american-english` as the wordlist.
|
||||
- Shuffle the wordlist by running `sort -R /usr/share/dict/american-english > shuffled_wordlist.txt`.
|
||||
- Benchmark sorting the wordlist with hyperfine: `hyperfine "target/release/coreutils sort shuffled_wordlist.txt -o output.txt"`.
|
||||
|
||||
## Sorting a wordlist with ignore_case
|
||||
- Same wordlist as above
|
||||
- Benchmark sorting the wordlist ignoring the case with hyperfine: `hyperfine "target/release/coreutils sort shuffled_wordlist.txt -f -o output.txt"`.
|
||||
|
||||
- Same wordlist as above
|
||||
- Benchmark sorting the wordlist ignoring the case with hyperfine: `hyperfine "target/release/coreutils sort shuffled_wordlist.txt -f -o output.txt"`.
|
||||
|
||||
## Sorting numbers
|
||||
- Generate a list of numbers: `seq 0 100000 | sort -R > shuffled_numbers.txt`.
|
||||
- Benchmark numeric sorting with hyperfine: `hyperfine "target/release/coreutils sort shuffled_numbers.txt -n -o output.txt"`.
|
||||
|
||||
- Generate a list of numbers: `seq 0 100000 | sort -R > shuffled_numbers.txt`.
|
||||
- Benchmark numeric sorting with hyperfine: `hyperfine "target/release/coreutils sort shuffled_numbers.txt -n -o output.txt"`.
|
||||
|
||||
## Sorting numbers with -g
|
||||
|
||||
- Same list of numbers as above.
|
||||
- Benchmark numeric sorting with hyperfine: `hyperfine "target/release/coreutils sort shuffled_numbers.txt -g -o output.txt"`.
|
||||
|
||||
## Sorting numbers with SI prefixes
|
||||
|
||||
- Generate a list of numbers:
|
||||
<details>
|
||||
<summary>Rust script</summary>
|
||||
|
||||
## Cargo.toml
|
||||
|
||||
```toml
|
||||
[dependencies]
|
||||
rand = "0.8.3"
|
||||
```
|
||||
|
||||
## main.rs
|
||||
|
||||
```rust
|
||||
use rand::prelude::*;
|
||||
fn main() {
|
||||
let suffixes = ['k', 'K', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y'];
|
||||
let mut rng = thread_rng();
|
||||
for _ in 0..100000 {
|
||||
println!(
|
||||
"{}{}",
|
||||
rng.gen_range(0..1000000),
|
||||
suffixes.choose(&mut rng).unwrap()
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
```
|
||||
|
||||
## running
|
||||
|
||||
`cargo run > shuffled_numbers_si.txt`
|
||||
|
||||
</details>
|
||||
|
||||
- Benchmark numeric sorting with hyperfine: `hyperfine "target/release/coreutils sort shuffled_numbers_si.txt -h -o output.txt"`.
|
||||
|
||||
## Stdout and stdin performance
|
||||
|
||||
Try to run the above benchmarks by piping the input through stdin (standard input) and redirecting the
|
||||
output through stdout (standard output):
|
||||
- Remove the input file from the arguments and add `cat [inputfile] | ` at the beginning.
|
||||
- Remove `-o output.txt` and add `> output.txt` at the end.
|
||||
|
||||
- Remove the input file from the arguments and add `cat [inputfile] | ` at the beginning.
|
||||
- Remove `-o output.txt` and add `> output.txt` at the end.
|
||||
|
||||
Example: `hyperfine "target/release/coreutils sort shuffled_numbers.txt -n -o output.txt"` becomes
|
||||
`hyperfine "cat shuffled_numbers.txt | target/release/coreutils sort -n > output.txt"`
|
||||
- Check that performance is similar to the original benchmark.
|
||||
|
||||
- Check that performance is similar to the original benchmark.
|
||||
|
||||
## Comparing with GNU sort
|
||||
|
||||
Hyperfine accepts multiple commands to run and will compare them. To compare performance with GNU sort,
|
||||
duplicate the string you passed to hyperfine but remove the `target/release/coreutils` bit from it.
|
||||
|
||||
Example: `hyperfine "target/release/coreutils sort shuffled_numbers_si.txt -h -o output.txt"` becomes
|
||||
`hyperfine "target/release/coreutils sort shuffled_numbers_si.txt -h -o output.txt" "sort shuffled_numbers_si.txt -h -o output.txt"`
|
||||
(This assumes GNU sort is installed as `sort`)
|
||||
|
|
455
src/uu/sort/src/numeric_str_cmp.rs
Normal file
455
src/uu/sort/src/numeric_str_cmp.rs
Normal file
|
@ -0,0 +1,455 @@
|
|||
// * This file is part of the uutils coreutils package.
|
||||
// *
|
||||
// * (c) Michael Debertol <michael.debertol..AT..gmail.com>
|
||||
// *
|
||||
// * For the full copyright and license information, please view the LICENSE
|
||||
// * file that was distributed with this source code.
|
||||
|
||||
//! Fast comparison for strings representing a base 10 number without precision loss.
|
||||
//!
|
||||
//! To be able to short-circuit when comparing, [NumInfo] must be passed along with each number
|
||||
//! to [numeric_str_cmp]. [NumInfo] is generally obtained by calling [NumInfo::parse] and should be cached.
|
||||
//! It is allowed to arbitrarily modify the exponent afterwards, which is equivalent to shifting the decimal point.
|
||||
//!
|
||||
//! More specifically, exponent can be understood so that the original number is in (1..10)*10^exponent.
|
||||
//! From that follow the constraints of this algorithm: It is able to compare numbers in ±(1*10^[i64::MIN]..10*10^[i64::MAX]).
|
||||
|
||||
use std::{cmp::Ordering, ops::Range};
|
||||
|
||||
/// Sign of a parsed number.
///
/// The variant order is significant: the derived `Ord` sorts `Negative` before
/// `Positive`, and that ordering is what gets returned when two numbers differ in sign.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
enum Sign {
    Negative,
    Positive,
}
|
||||
|
||||
#[derive(Debug, PartialEq)]
|
||||
pub struct NumInfo {
|
||||
exponent: i64,
|
||||
sign: Sign,
|
||||
}
|
||||
|
||||
/// Options that control how `NumInfo::parse` reads a number.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct NumInfoParseSettings {
    /// When true, the character ending the number may be an SI unit
    /// (`k`, `K`, `M`, `G`, `T`, `P`, `E`, `Z`, `Y`) that scales the exponent.
    pub accept_si_units: bool,
    /// Character that is skipped wherever it appears inside the number.
    pub thousands_separator: Option<char>,
    /// Character that separates the integer part from the fractional part.
    pub decimal_pt: Option<char>,
}

impl Default for NumInfoParseSettings {
    /// No SI units, no thousands separator, and `.` as the decimal point.
    fn default() -> Self {
        Self {
            accept_si_units: false,
            thousands_separator: None,
            decimal_pt: Some('.'),
        }
    }
}
|
||||
|
||||
impl NumInfo {
|
||||
/// Parse NumInfo for this number.
|
||||
/// Also returns the range of num that should be passed to numeric_str_cmp later
|
||||
pub fn parse(num: &str, parse_settings: NumInfoParseSettings) -> (Self, Range<usize>) {
|
||||
let mut exponent = -1;
|
||||
let mut had_decimal_pt = false;
|
||||
let mut had_digit = false;
|
||||
let mut start = None;
|
||||
let mut sign = Sign::Positive;
|
||||
|
||||
let mut first_char = true;
|
||||
|
||||
for (idx, char) in num.char_indices() {
|
||||
if first_char && char.is_whitespace() {
|
||||
continue;
|
||||
}
|
||||
|
||||
if first_char && char == '-' {
|
||||
sign = Sign::Negative;
|
||||
first_char = false;
|
||||
continue;
|
||||
}
|
||||
first_char = false;
|
||||
|
||||
if parse_settings
|
||||
.thousands_separator
|
||||
.map_or(false, |c| c == char)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
if Self::is_invalid_char(char, &mut had_decimal_pt, &parse_settings) {
|
||||
let si_unit = if parse_settings.accept_si_units {
|
||||
match char {
|
||||
'K' | 'k' => 3,
|
||||
'M' => 6,
|
||||
'G' => 9,
|
||||
'T' => 12,
|
||||
'P' => 15,
|
||||
'E' => 18,
|
||||
'Z' => 21,
|
||||
'Y' => 24,
|
||||
_ => 0,
|
||||
}
|
||||
} else {
|
||||
0
|
||||
};
|
||||
return if let Some(start) = start {
|
||||
(
|
||||
NumInfo {
|
||||
exponent: exponent + si_unit,
|
||||
sign,
|
||||
},
|
||||
start..idx,
|
||||
)
|
||||
} else {
|
||||
(
|
||||
NumInfo {
|
||||
sign: if had_digit { sign } else { Sign::Positive },
|
||||
exponent: 0,
|
||||
},
|
||||
0..0,
|
||||
)
|
||||
};
|
||||
}
|
||||
if Some(char) == parse_settings.decimal_pt {
|
||||
continue;
|
||||
}
|
||||
had_digit = true;
|
||||
if start.is_none() && char == '0' {
|
||||
if had_decimal_pt {
|
||||
// We're parsing a number whose first nonzero digit is after the decimal point.
|
||||
exponent -= 1;
|
||||
} else {
|
||||
// Skip leading zeroes
|
||||
continue;
|
||||
}
|
||||
}
|
||||
if !had_decimal_pt {
|
||||
exponent += 1;
|
||||
}
|
||||
if start.is_none() && char != '0' {
|
||||
start = Some(idx);
|
||||
}
|
||||
}
|
||||
if let Some(start) = start {
|
||||
(NumInfo { exponent, sign }, start..num.len())
|
||||
} else {
|
||||
(
|
||||
NumInfo {
|
||||
sign: if had_digit { sign } else { Sign::Positive },
|
||||
exponent: 0,
|
||||
},
|
||||
0..0,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
fn is_invalid_char(
|
||||
c: char,
|
||||
had_decimal_pt: &mut bool,
|
||||
parse_settings: &NumInfoParseSettings,
|
||||
) -> bool {
|
||||
if Some(c) == parse_settings.decimal_pt {
|
||||
if *had_decimal_pt {
|
||||
// this is a decimal pt but we already had one, so it is invalid
|
||||
true
|
||||
} else {
|
||||
*had_decimal_pt = true;
|
||||
false
|
||||
}
|
||||
} else {
|
||||
!c.is_ascii_digit()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// compare two numbers as strings without parsing them as a number first. This should be more performant and can handle numbers more precisely.
|
||||
/// NumInfo is needed to provide a fast path for most numbers.
|
||||
pub fn numeric_str_cmp((a, a_info): (&str, &NumInfo), (b, b_info): (&str, &NumInfo)) -> Ordering {
|
||||
// check for a difference in the sign
|
||||
if a_info.sign != b_info.sign {
|
||||
return a_info.sign.cmp(&b_info.sign);
|
||||
}
|
||||
|
||||
// check for a difference in the exponent
|
||||
let ordering = if a_info.exponent != b_info.exponent && !a.is_empty() && !b.is_empty() {
|
||||
a_info.exponent.cmp(&b_info.exponent)
|
||||
} else {
|
||||
// walk the characters from the front until we find a difference
|
||||
let mut a_chars = a.chars().filter(|c| c.is_ascii_digit());
|
||||
let mut b_chars = b.chars().filter(|c| c.is_ascii_digit());
|
||||
loop {
|
||||
let a_next = a_chars.next();
|
||||
let b_next = b_chars.next();
|
||||
match (a_next, b_next) {
|
||||
(None, None) => break Ordering::Equal,
|
||||
(Some(c), None) => {
|
||||
break if c == '0' && a_chars.all(|c| c == '0') {
|
||||
Ordering::Equal
|
||||
} else {
|
||||
Ordering::Greater
|
||||
}
|
||||
}
|
||||
(None, Some(c)) => {
|
||||
break if c == '0' && b_chars.all(|c| c == '0') {
|
||||
Ordering::Equal
|
||||
} else {
|
||||
Ordering::Less
|
||||
}
|
||||
}
|
||||
(Some(a_char), Some(b_char)) => {
|
||||
let ord = a_char.cmp(&b_char);
|
||||
if ord != Ordering::Equal {
|
||||
break ord;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
if a_info.sign == Sign::Negative {
|
||||
ordering.reverse()
|
||||
} else {
|
||||
ordering
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds the `(NumInfo, range)` pair that `NumInfo::parse` is expected to return.
    fn expected(exponent: i64, sign: Sign, range: Range<usize>) -> (NumInfo, Range<usize>) {
        (NumInfo { exponent, sign }, range)
    }

    /// Parses `n` with the default settings.
    fn parse_default(n: &str) -> (NumInfo, Range<usize>) {
        NumInfo::parse(n, Default::default())
    }

    #[test]
    fn parses_exp() {
        assert_eq!(parse_default("1"), expected(0, Sign::Positive, 0..1));
        assert_eq!(parse_default("100"), expected(2, Sign::Positive, 0..3));
        assert_eq!(
            NumInfo::parse(
                "1,000",
                NumInfoParseSettings {
                    thousands_separator: Some(','),
                    ..Default::default()
                }
            ),
            expected(3, Sign::Positive, 0..5)
        );
        // Without a thousands separator configured, parsing stops at the comma.
        assert_eq!(parse_default("1,000"), expected(0, Sign::Positive, 0..1));
        assert_eq!(parse_default("1000.00"), expected(3, Sign::Positive, 0..7));
    }

    #[test]
    fn parses_negative_exp() {
        assert_eq!(parse_default("0.00005"), expected(-5, Sign::Positive, 6..7));
        assert_eq!(
            parse_default("00000.00005"),
            expected(-5, Sign::Positive, 10..11)
        );
    }

    #[test]
    fn parses_sign() {
        assert_eq!(parse_default("5"), expected(0, Sign::Positive, 0..1));
        assert_eq!(parse_default("-5"), expected(0, Sign::Negative, 1..2));
        // Four leading spaces put the sign at index 4 and the digit at index 5.
        assert_eq!(parse_default("    -5"), expected(0, Sign::Negative, 5..6));
    }

    /// Checks that `a` compared to `b` yields `expected`, and the reverse when swapped.
    fn test_helper(a: &str, b: &str, expected: Ordering) {
        let (a_info, a_range) = NumInfo::parse(a, Default::default());
        let (b_info, b_range) = NumInfo::parse(b, Default::default());
        let ordering = numeric_str_cmp(
            (&a[a_range.clone()], &a_info),
            (&b[b_range.clone()], &b_info),
        );
        assert_eq!(ordering, expected);
        let ordering = numeric_str_cmp((&b[b_range], &b_info), (&a[a_range], &a_info));
        assert_eq!(ordering, expected.reverse());
    }

    #[test]
    fn test_single_digit() {
        test_helper("1", "2", Ordering::Less);
        test_helper("0", "0", Ordering::Equal);
    }

    #[test]
    fn test_minus() {
        test_helper("-1", "-2", Ordering::Greater);
        test_helper("-0", "-0", Ordering::Equal);
    }

    #[test]
    fn test_different_len() {
        test_helper("-20", "-100", Ordering::Greater);
        test_helper("10.0", "2.000000", Ordering::Greater);
    }

    #[test]
    fn test_decimal_digits() {
        test_helper("20.1", "20.2", Ordering::Less);
        test_helper("20.1", "20.15", Ordering::Less);
        test_helper("-20.1", "+20.15", Ordering::Less);
        test_helper("-20.1", "-20", Ordering::Less);
    }

    #[test]
    fn test_trailing_zeroes() {
        test_helper("20.00000", "20.1", Ordering::Less);
        test_helper("20.00000", "20.0", Ordering::Equal);
    }

    #[test]
    fn test_invalid_digits() {
        test_helper("foo", "bar", Ordering::Equal);
        test_helper("20.1", "a", Ordering::Greater);
        test_helper("-20.1", "a", Ordering::Less);
        test_helper("a", "0.15", Ordering::Less);
    }

    #[test]
    fn test_multiple_decimal_pts() {
        test_helper("10.0.0", "50.0.0", Ordering::Less);
        test_helper("0.1.", "0.2.0", Ordering::Less);
        test_helper("1.1.", "0", Ordering::Greater);
        test_helper("1.1.", "-0", Ordering::Greater);
    }

    #[test]
    fn test_leading_decimal_pts() {
        test_helper(".0", ".0", Ordering::Equal);
        test_helper(".1", ".0", Ordering::Greater);
        test_helper(".02", "0", Ordering::Greater);
    }

    #[test]
    fn test_leading_zeroes() {
        test_helper("000000.0", ".0", Ordering::Equal);
        test_helper("0.1", "0000000000000.0", Ordering::Greater);
        test_helper("-01", "-2", Ordering::Greater);
    }

    #[test]
    fn minus_zero() {
        // This matches GNU sort behavior.
        test_helper("-0", "0", Ordering::Less);
        test_helper("-0x", "0", Ordering::Less);
    }

    #[test]
    fn double_minus() {
        test_helper("--1", "0", Ordering::Equal);
    }

    #[test]
    fn single_minus() {
        assert_eq!(parse_default("-"), expected(0, Sign::Positive, 0..0));
    }

    #[test]
    fn invalid_with_unit() {
        let info = NumInfo::parse(
            "-K",
            NumInfoParseSettings {
                accept_si_units: true,
                ..Default::default()
            },
        );
        assert_eq!(info, expected(0, Sign::Positive, 0..0));
    }
}
|
|
@ -15,9 +15,12 @@
|
|||
#[macro_use]
|
||||
extern crate uucore;
|
||||
|
||||
mod numeric_str_cmp;
|
||||
|
||||
use clap::{App, Arg};
|
||||
use fnv::FnvHasher;
|
||||
use itertools::Itertools;
|
||||
use numeric_str_cmp::{numeric_str_cmp, NumInfo, NumInfoParseSettings};
|
||||
use rand::distributions::Alphanumeric;
|
||||
use rand::{thread_rng, Rng};
|
||||
use rayon::prelude::*;
|
||||
|
@ -162,27 +165,71 @@ impl From<&GlobalSettings> for KeySettings {
|
|||
}
|
||||
|
||||
/// Represents the string selected by a FieldSelector.
|
||||
#[derive(Debug)]
|
||||
enum Selection {
|
||||
enum SelectionRange {
|
||||
/// If we had to transform this selection, we have to store a new string.
|
||||
String(String),
|
||||
/// If there was no transformation, we can store an index into the line.
|
||||
ByIndex(Range<usize>),
|
||||
}
|
||||
|
||||
impl SelectionRange {
|
||||
/// Gets the actual string slice represented by this Selection.
|
||||
fn get_str<'a>(&'a self, line: &'a str) -> &'a str {
|
||||
match self {
|
||||
SelectionRange::String(string) => string.as_str(),
|
||||
SelectionRange::ByIndex(range) => &line[range.to_owned()],
|
||||
}
|
||||
}
|
||||
|
||||
fn shorten(&mut self, new_range: Range<usize>) {
|
||||
match self {
|
||||
SelectionRange::String(string) => {
|
||||
string.drain(new_range.end..);
|
||||
string.drain(..new_range.start);
|
||||
}
|
||||
SelectionRange::ByIndex(range) => {
|
||||
range.end = range.start + new_range.end;
|
||||
range.start += new_range.start;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
enum NumCache {
|
||||
AsF64(f64),
|
||||
WithInfo(NumInfo),
|
||||
None,
|
||||
}
|
||||
|
||||
impl NumCache {
|
||||
fn as_f64(&self) -> f64 {
|
||||
match self {
|
||||
NumCache::AsF64(n) => *n,
|
||||
_ => unreachable!(),
|
||||
}
|
||||
}
|
||||
fn as_num_info(&self) -> &NumInfo {
|
||||
match self {
|
||||
NumCache::WithInfo(n) => n,
|
||||
_ => unreachable!(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// The part of a line picked for one sort key, plus cached numeric data for it.
struct Selection {
|
||||
/// Where the selected text lives: an owned string or an index range into the line.
range: SelectionRange,
|
||||
/// Numeric data cached for this selection (see `NumCache`).
num_cache: NumCache,
|
||||
}
|
||||
|
||||
impl Selection {
|
||||
/// Gets the actual string slice represented by this Selection.
|
||||
fn get_str<'a>(&'a self, line: &'a Line) -> &'a str {
|
||||
match self {
|
||||
Selection::String(string) => string.as_str(),
|
||||
Selection::ByIndex(range) => &line.line[range.to_owned()],
|
||||
}
|
||||
self.range.get_str(&line.line)
|
||||
}
|
||||
}
|
||||
|
||||
type Field = Range<usize>;
|
||||
|
||||
#[derive(Debug)]
|
||||
struct Line {
|
||||
line: String,
|
||||
// The common case is not to specify fields. Let's make this fast.
|
||||
|
@ -206,18 +253,38 @@ impl Line {
|
|||
.selectors
|
||||
.iter()
|
||||
.map(|selector| {
|
||||
if let Some(range) = selector.get_selection(&line, fields.as_deref()) {
|
||||
if let Some(transformed) =
|
||||
transform(&line[range.to_owned()], &selector.settings)
|
||||
{
|
||||
Selection::String(transformed)
|
||||
let mut range =
|
||||
if let Some(range) = selector.get_selection(&line, fields.as_deref()) {
|
||||
if let Some(transformed) =
|
||||
transform(&line[range.to_owned()], &selector.settings)
|
||||
{
|
||||
SelectionRange::String(transformed)
|
||||
} else {
|
||||
SelectionRange::ByIndex(range.start().to_owned()..range.end() + 1)
|
||||
}
|
||||
} else {
|
||||
Selection::ByIndex(range.start().to_owned()..range.end() + 1)
|
||||
}
|
||||
// If there is no match, match the empty string.
|
||||
SelectionRange::ByIndex(0..0)
|
||||
};
|
||||
let num_cache = if selector.settings.mode == SortMode::Numeric
|
||||
|| selector.settings.mode == SortMode::HumanNumeric
|
||||
{
|
||||
let (info, num_range) = NumInfo::parse(
|
||||
range.get_str(&line),
|
||||
NumInfoParseSettings {
|
||||
accept_si_units: selector.settings.mode == SortMode::HumanNumeric,
|
||||
thousands_separator: Some(THOUSANDS_SEP),
|
||||
decimal_pt: Some(DECIMAL_PT),
|
||||
},
|
||||
);
|
||||
range.shorten(num_range);
|
||||
NumCache::WithInfo(info)
|
||||
} else if selector.settings.mode == SortMode::GeneralNumeric {
|
||||
NumCache::AsF64(permissive_f64_parse(get_leading_gen(range.get_str(&line))))
|
||||
} else {
|
||||
// If there is no match, match the empty string.
|
||||
Selection::ByIndex(0..0)
|
||||
}
|
||||
NumCache::None
|
||||
};
|
||||
Selection { range, num_cache }
|
||||
})
|
||||
.collect();
|
||||
Self { line, selections }
|
||||
|
@ -923,21 +990,28 @@ fn sort_by(lines: &mut Vec<Line>, settings: &GlobalSettings) {
|
|||
|
||||
fn compare_by(a: &Line, b: &Line, global_settings: &GlobalSettings) -> Ordering {
|
||||
for (idx, selector) in global_settings.selectors.iter().enumerate() {
|
||||
let a = a.selections[idx].get_str(a);
|
||||
let b = b.selections[idx].get_str(b);
|
||||
let a_selection = &a.selections[idx];
|
||||
let b_selection = &b.selections[idx];
|
||||
let a_str = a_selection.get_str(a);
|
||||
let b_str = b_selection.get_str(b);
|
||||
let settings = &selector.settings;
|
||||
|
||||
let cmp: Ordering = if settings.random {
|
||||
random_shuffle(a, b, global_settings.salt.clone())
|
||||
random_shuffle(a_str, b_str, global_settings.salt.clone())
|
||||
} else {
|
||||
(match settings.mode {
|
||||
SortMode::Numeric => numeric_compare,
|
||||
SortMode::GeneralNumeric => general_numeric_compare,
|
||||
SortMode::HumanNumeric => human_numeric_size_compare,
|
||||
SortMode::Month => month_compare,
|
||||
SortMode::Version => version_compare,
|
||||
SortMode::Default => default_compare,
|
||||
})(a, b)
|
||||
match settings.mode {
|
||||
SortMode::Numeric | SortMode::HumanNumeric => numeric_str_cmp(
|
||||
(a_str, a_selection.num_cache.as_num_info()),
|
||||
(b_str, b_selection.num_cache.as_num_info()),
|
||||
),
|
||||
SortMode::GeneralNumeric => general_numeric_compare(
|
||||
a_selection.num_cache.as_f64(),
|
||||
b_selection.num_cache.as_f64(),
|
||||
),
|
||||
SortMode::Month => month_compare(a_str, b_str),
|
||||
SortMode::Version => version_compare(a_str, b_str),
|
||||
SortMode::Default => default_compare(a_str, b_str),
|
||||
}
|
||||
};
|
||||
if cmp != Ordering::Equal {
|
||||
return if settings.reverse { cmp.reverse() } else { cmp };
|
||||
|
@ -945,7 +1019,6 @@ fn compare_by(a: &Line, b: &Line, global_settings: &GlobalSettings) -> Ordering
|
|||
}
|
||||
|
||||
// Call "last resort compare" if all selectors returned Equal
|
||||
|
||||
let cmp = if global_settings.random || global_settings.stable || global_settings.unique {
|
||||
Ordering::Equal
|
||||
} else {
|
||||
|
@ -997,34 +1070,6 @@ fn leading_num_common(a: &str) -> &str {
|
|||
s
|
||||
}
|
||||
|
||||
// This function cleans up the initial comparison done by leading_num_common for a numeric compare.
|
||||
// GNU sort does its numeric comparison through strnumcmp. However, we don't have or
|
||||
// may not want to use libc. Instead we emulate the GNU sort numeric compare by ignoring
|
||||
// those leading number lines GNU sort would not recognize. GNU numeric compare would
|
||||
// not recognize a positive sign or scientific/E notation so we strip those elements here.
|
||||
fn get_leading_num(a: &str) -> &str {
|
||||
let mut s = "";
|
||||
|
||||
let a = leading_num_common(a);
|
||||
|
||||
// GNU numeric sort doesn't recognize '+' or 'e' notation so we strip
|
||||
for (idx, c) in a.char_indices() {
|
||||
if c.eq(&'e') || c.eq(&'E') || a.chars().next().unwrap_or('\0').eq(&POSITIVE) {
|
||||
s = &a[..idx];
|
||||
break;
|
||||
}
|
||||
// If no further processing needed to be done, return the line as-is to be sorted
|
||||
s = &a;
|
||||
}
|
||||
|
||||
// And empty number or non-number lines are to be treated as ‘0’ but only for numeric sort
|
||||
// All '0'-ed lines will be sorted later, but only amongst themselves, during the so-called 'last resort comparison.'
|
||||
if s.is_empty() {
|
||||
s = "0";
|
||||
};
|
||||
s
|
||||
}
|
||||
|
||||
// This function cleans up the initial comparison done by leading_num_common for a general numeric compare.
|
||||
// In contrast to numeric compare, GNU general numeric/FP sort *should* recognize positive signs and
|
||||
// scientific notation, so we strip those lines only after the end of the following numeric string.
|
||||
|
@ -1054,17 +1099,6 @@ fn get_leading_gen(a: &str) -> &str {
|
|||
result
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn remove_thousands_sep<'a, S: Into<Cow<'a, str>>>(input: S) -> Cow<'a, str> {
|
||||
let input = input.into();
|
||||
if input.contains(THOUSANDS_SEP) {
|
||||
let output = input.replace(THOUSANDS_SEP, "");
|
||||
Cow::Owned(output)
|
||||
} else {
|
||||
input
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn remove_trailing_dec<'a, S: Into<Cow<'a, str>>>(input: S) -> Cow<'a, str> {
|
||||
let input = input.into();
|
||||
|
@ -1093,87 +1127,15 @@ fn permissive_f64_parse(a: &str) -> f64 {
|
|||
}
|
||||
}
|
||||
|
||||
fn numeric_compare(a: &str, b: &str) -> Ordering {
|
||||
#![allow(clippy::comparison_chain)]
|
||||
|
||||
let sa = get_leading_num(a);
|
||||
let sb = get_leading_num(b);
|
||||
|
||||
// Avoids a string alloc for every line to remove thousands seperators here
|
||||
// instead of inside the get_leading_num function, which is a HUGE performance benefit
|
||||
let ta = remove_thousands_sep(sa);
|
||||
let tb = remove_thousands_sep(sb);
|
||||
|
||||
let fa = permissive_f64_parse(&ta);
|
||||
let fb = permissive_f64_parse(&tb);
|
||||
|
||||
// f64::cmp isn't implemented (due to NaN issues); implement directly instead
|
||||
if fa > fb {
|
||||
Ordering::Greater
|
||||
} else if fa < fb {
|
||||
Ordering::Less
|
||||
} else {
|
||||
Ordering::Equal
|
||||
}
|
||||
}
|
||||
|
||||
/// Compares two floats, with errors and non-numerics assumed to be -inf.
|
||||
/// Stops coercing at the first non-numeric char.
|
||||
fn general_numeric_compare(a: &str, b: &str) -> Ordering {
|
||||
/// We explicitly need to convert to f64 in this case.
|
||||
fn general_numeric_compare(a: f64, b: f64) -> Ordering {
|
||||
#![allow(clippy::comparison_chain)]
|
||||
|
||||
let sa = get_leading_gen(a);
|
||||
let sb = get_leading_gen(b);
|
||||
|
||||
let fa = permissive_f64_parse(&sa);
|
||||
let fb = permissive_f64_parse(&sb);
|
||||
|
||||
// f64::cmp isn't implemented (due to NaN issues); implement directly instead
|
||||
if fa > fb {
|
||||
if a > b {
|
||||
Ordering::Greater
|
||||
} else if fa < fb {
|
||||
Ordering::Less
|
||||
} else {
|
||||
Ordering::Equal
|
||||
}
|
||||
}
|
||||
|
||||
// GNU/BSD does not handle converting numbers to an equal scale
|
||||
// properly. GNU/BSD simply recognize that there is a human scale and sorts
|
||||
// those numbers ahead of other number inputs. There are perhaps limits
|
||||
// to the type of behavior we should emulate, and this might be such a limit.
|
||||
// Properly handling these units seems like a value add to me. And when sorting
|
||||
// these types of numbers, we rarely care about pure performance.
|
||||
fn human_numeric_convert(a: &str) -> f64 {
|
||||
let num_str = get_leading_num(a);
|
||||
let suffix = a.trim_start_matches(&num_str);
|
||||
let num_part = permissive_f64_parse(&num_str);
|
||||
let suffix: f64 = match suffix.parse().unwrap_or('\0') {
|
||||
// SI Units
|
||||
'K' => 1E3,
|
||||
'M' => 1E6,
|
||||
'G' => 1E9,
|
||||
'T' => 1E12,
|
||||
'P' => 1E15,
|
||||
'E' => 1E18,
|
||||
'Z' => 1E21,
|
||||
'Y' => 1E24,
|
||||
_ => 1f64,
|
||||
};
|
||||
num_part * suffix
|
||||
}
|
||||
|
||||
/// Compare two strings as if they are human readable sizes.
|
||||
/// AKA 1M > 100k
|
||||
fn human_numeric_size_compare(a: &str, b: &str) -> Ordering {
|
||||
#![allow(clippy::comparison_chain)]
|
||||
let fa = human_numeric_convert(a);
|
||||
let fb = human_numeric_convert(b);
|
||||
|
||||
// f64::cmp isn't implemented (due to NaN issues); implement directly instead
|
||||
if fa > fb {
|
||||
Ordering::Greater
|
||||
} else if fa < fb {
|
||||
} else if a < b {
|
||||
Ordering::Less
|
||||
} else {
|
||||
Ordering::Equal
|
||||
|
@ -1373,30 +1335,6 @@ mod tests {
|
|||
assert_eq!(Ordering::Less, default_compare(a, b));
|
||||
}
|
||||
|
||||
// Only the number in front of the ':' takes part in the comparison.
#[test]
fn test_numeric_compare1() {
    assert_eq!(Ordering::Less, numeric_compare("149:7", "150:5"));
}
|
||||
|
||||
// A negative decimal sorts before a positive integer.
#[test]
fn test_numeric_compare2() {
    assert_eq!(Ordering::Less, numeric_compare("-1.02", "1"));
}
|
||||
|
||||
// 300K is smaller than 1M once the units are taken into account.
#[test]
fn test_human_numeric_compare() {
    assert_eq!(Ordering::Less, human_numeric_size_compare("300K", "1M"));
}
|
||||
|
||||
#[test]
|
||||
fn test_month_compare() {
|
||||
let a = "JaN";
|
||||
|
|
|
@ -38,11 +38,7 @@ fn test_multiple_decimals_general() {
|
|||
|
||||
#[test]
|
||||
fn test_multiple_decimals_numeric() {
|
||||
new_ucmd!()
|
||||
.arg("-n")
|
||||
.arg("multiple_decimals_numeric.txt")
|
||||
.succeeds()
|
||||
.stdout_is("-2028789030\n-896689\n-8.90880\n-1\n-.05\n\n\n\n\n\n\n\n\n000\nCARAvan\n00000001\n1\n1.040000000\n1.444\n1.58590\n8.013\n45\n46.89\n 4567.\n4567.1\n4567.34\n\t\t\t\t\t\t\t\t\t\t4567..457\n\t\t\t\t37800\n\t\t\t\t\t\t45670.89079.098\n\t\t\t\t\t\t45670.89079.1\n576,446.88800000\n576,446.890\n4798908.340000000000\n4798908.45\n4798908.8909800\n");
|
||||
test_helper("multiple_decimals_numeric", "-n")
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
|
35
tests/fixtures/sort/multiple_decimals_numeric.expected
vendored
Normal file
35
tests/fixtures/sort/multiple_decimals_numeric.expected
vendored
Normal file
|
@ -0,0 +1,35 @@
|
|||
-2028789030
|
||||
-896689
|
||||
-8.90880
|
||||
-1
|
||||
-.05
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
000
|
||||
CARAvan
|
||||
00000001
|
||||
1
|
||||
1.040000000
|
||||
1.444
|
||||
1.58590
|
||||
8.013
|
||||
45
|
||||
46.89
|
||||
4567..457
|
||||
4567.
|
||||
4567.1
|
||||
4567.34
|
||||
37800
|
||||
45670.89079.098
|
||||
45670.89079.1
|
||||
576,446.88800000
|
||||
576,446.890
|
||||
4798908.340000000000
|
||||
4798908.45
|
||||
4798908.8909800
|
Loading…
Add table
Add a link
Reference in a new issue