mirror of
https://github.com/RGBCube/uutils-coreutils
synced 2025-07-28 03:27:44 +00:00
Merge pull request #8062 from drinkcat/sort-float
sort: Make use of ExtendedBigDecimal in -g sorting, then attempt to recover some performance
This commit is contained in:
commit
9e21259e2d
8 changed files with 163 additions and 41 deletions
1
Cargo.lock
generated
1
Cargo.lock
generated
|
@ -3287,6 +3287,7 @@ dependencies = [
|
||||||
name = "uu_sort"
|
name = "uu_sort"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
|
"bigdecimal",
|
||||||
"binary-heap-plus",
|
"binary-heap-plus",
|
||||||
"clap",
|
"clap",
|
||||||
"compare",
|
"compare",
|
||||||
|
|
|
@ -153,6 +153,15 @@ See also comments under `printf` for formatting precision and differences.
|
||||||
|
|
||||||
`seq` provides `-t`/`--terminator` to set the terminator character.
|
`seq` provides `-t`/`--terminator` to set the terminator character.
|
||||||
|
|
||||||
|
## `sort`
|
||||||
|
|
||||||
|
When sorting with `-g`/`--general-numeric-sort`, arbitrary precision decimal numbers
|
||||||
|
are parsed and compared, unlike GNU coreutils, which uses platform-specific long
|
||||||
|
double floating point numbers.
|
||||||
|
|
||||||
|
Extremely large or small values can still overflow or underflow to infinity or zero,
|
||||||
|
see note in `seq`.
|
||||||
|
|
||||||
## `ls`
|
## `ls`
|
||||||
|
|
||||||
GNU `ls` provides two ways to use a long listing format: `-l` and `--format=long`. We support a
|
GNU `ls` provides two ways to use a long listing format: `-l` and `--format=long`. We support a
|
||||||
|
|
1
fuzz/Cargo.lock
generated
1
fuzz/Cargo.lock
generated
|
@ -1345,6 +1345,7 @@ dependencies = [
|
||||||
name = "uu_sort"
|
name = "uu_sort"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
|
"bigdecimal",
|
||||||
"binary-heap-plus",
|
"binary-heap-plus",
|
||||||
"clap",
|
"clap",
|
||||||
"compare",
|
"compare",
|
||||||
|
|
|
@ -1,3 +1,5 @@
|
||||||
|
# spell-checker:ignore bigdecimal
|
||||||
|
|
||||||
[package]
|
[package]
|
||||||
name = "uu_sort"
|
name = "uu_sort"
|
||||||
description = "sort ~ (uutils) sort input lines"
|
description = "sort ~ (uutils) sort input lines"
|
||||||
|
@ -18,6 +20,7 @@ workspace = true
|
||||||
path = "src/sort.rs"
|
path = "src/sort.rs"
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
|
bigdecimal = { workspace = true }
|
||||||
binary-heap-plus = { workspace = true }
|
binary-heap-plus = { workspace = true }
|
||||||
clap = { workspace = true }
|
clap = { workspace = true }
|
||||||
compare = { workspace = true }
|
compare = { workspace = true }
|
||||||
|
|
|
@ -17,7 +17,9 @@ use memchr::memchr_iter;
|
||||||
use self_cell::self_cell;
|
use self_cell::self_cell;
|
||||||
use uucore::error::{UResult, USimpleError};
|
use uucore::error::{UResult, USimpleError};
|
||||||
|
|
||||||
use crate::{GeneralF64ParseResult, GlobalSettings, Line, SortError, numeric_str_cmp::NumInfo};
|
use crate::{
|
||||||
|
GeneralBigDecimalParseResult, GlobalSettings, Line, SortError, numeric_str_cmp::NumInfo,
|
||||||
|
};
|
||||||
|
|
||||||
self_cell!(
|
self_cell!(
|
||||||
/// The chunk that is passed around between threads.
|
/// The chunk that is passed around between threads.
|
||||||
|
@ -41,7 +43,7 @@ pub struct ChunkContents<'a> {
|
||||||
pub struct LineData<'a> {
|
pub struct LineData<'a> {
|
||||||
pub selections: Vec<&'a str>,
|
pub selections: Vec<&'a str>,
|
||||||
pub num_infos: Vec<NumInfo>,
|
pub num_infos: Vec<NumInfo>,
|
||||||
pub parsed_floats: Vec<GeneralF64ParseResult>,
|
pub parsed_floats: Vec<GeneralBigDecimalParseResult>,
|
||||||
pub line_num_floats: Vec<Option<f64>>,
|
pub line_num_floats: Vec<Option<f64>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -100,7 +102,7 @@ pub struct RecycledChunk {
|
||||||
lines: Vec<Line<'static>>,
|
lines: Vec<Line<'static>>,
|
||||||
selections: Vec<&'static str>,
|
selections: Vec<&'static str>,
|
||||||
num_infos: Vec<NumInfo>,
|
num_infos: Vec<NumInfo>,
|
||||||
parsed_floats: Vec<GeneralF64ParseResult>,
|
parsed_floats: Vec<GeneralBigDecimalParseResult>,
|
||||||
line_num_floats: Vec<Option<f64>>,
|
line_num_floats: Vec<Option<f64>>,
|
||||||
buffer: Vec<u8>,
|
buffer: Vec<u8>,
|
||||||
}
|
}
|
||||||
|
|
|
@ -7,7 +7,7 @@
|
||||||
// https://pubs.opengroup.org/onlinepubs/9699919799/utilities/sort.html
|
// https://pubs.opengroup.org/onlinepubs/9699919799/utilities/sort.html
|
||||||
// https://www.gnu.org/software/coreutils/manual/html_node/sort-invocation.html
|
// https://www.gnu.org/software/coreutils/manual/html_node/sort-invocation.html
|
||||||
|
|
||||||
// spell-checker:ignore (misc) HFKJFK Mbdfhn getrlimit RLIMIT_NOFILE rlim
|
// spell-checker:ignore (misc) HFKJFK Mbdfhn getrlimit RLIMIT_NOFILE rlim bigdecimal extendedbigdecimal
|
||||||
|
|
||||||
mod check;
|
mod check;
|
||||||
mod chunks;
|
mod chunks;
|
||||||
|
@ -17,6 +17,7 @@ mod merge;
|
||||||
mod numeric_str_cmp;
|
mod numeric_str_cmp;
|
||||||
mod tmp_dir;
|
mod tmp_dir;
|
||||||
|
|
||||||
|
use bigdecimal::BigDecimal;
|
||||||
use chunks::LineData;
|
use chunks::LineData;
|
||||||
use clap::builder::ValueParser;
|
use clap::builder::ValueParser;
|
||||||
use clap::{Arg, ArgAction, Command};
|
use clap::{Arg, ArgAction, Command};
|
||||||
|
@ -44,7 +45,9 @@ use unicode_width::UnicodeWidthStr;
|
||||||
use uucore::display::Quotable;
|
use uucore::display::Quotable;
|
||||||
use uucore::error::{FromIo, strip_errno};
|
use uucore::error::{FromIo, strip_errno};
|
||||||
use uucore::error::{UError, UResult, USimpleError, UUsageError, set_exit_code};
|
use uucore::error::{UError, UResult, USimpleError, UUsageError, set_exit_code};
|
||||||
|
use uucore::extendedbigdecimal::ExtendedBigDecimal;
|
||||||
use uucore::line_ending::LineEnding;
|
use uucore::line_ending::LineEnding;
|
||||||
|
use uucore::parser::num_parser::{ExtendedParser, ExtendedParserError};
|
||||||
use uucore::parser::parse_size::{ParseSizeError, Parser};
|
use uucore::parser::parse_size::{ParseSizeError, Parser};
|
||||||
use uucore::parser::shortcut_value_parser::ShortcutValueParser;
|
use uucore::parser::shortcut_value_parser::ShortcutValueParser;
|
||||||
use uucore::version_cmp::version_cmp;
|
use uucore::version_cmp::version_cmp;
|
||||||
|
@ -448,7 +451,7 @@ impl Default for KeySettings {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
enum Selection<'a> {
|
enum Selection<'a> {
|
||||||
AsF64(GeneralF64ParseResult),
|
AsBigDecimal(GeneralBigDecimalParseResult),
|
||||||
WithNumInfo(&'a str, NumInfo),
|
WithNumInfo(&'a str, NumInfo),
|
||||||
Str(&'a str),
|
Str(&'a str),
|
||||||
}
|
}
|
||||||
|
@ -490,7 +493,7 @@ impl<'a> Line<'a> {
|
||||||
.map(|selector| (selector, selector.get_selection(line, token_buffer)))
|
.map(|selector| (selector, selector.get_selection(line, token_buffer)))
|
||||||
{
|
{
|
||||||
match selection {
|
match selection {
|
||||||
Selection::AsF64(parsed_float) => line_data.parsed_floats.push(parsed_float),
|
Selection::AsBigDecimal(parsed_float) => line_data.parsed_floats.push(parsed_float),
|
||||||
Selection::WithNumInfo(str, num_info) => {
|
Selection::WithNumInfo(str, num_info) => {
|
||||||
line_data.num_infos.push(num_info);
|
line_data.num_infos.push(num_info);
|
||||||
line_data.selections.push(str);
|
line_data.selections.push(str);
|
||||||
|
@ -902,8 +905,8 @@ impl FieldSelector {
|
||||||
range = &range[num_range];
|
range = &range[num_range];
|
||||||
Selection::WithNumInfo(range, info)
|
Selection::WithNumInfo(range, info)
|
||||||
} else if self.settings.mode == SortMode::GeneralNumeric {
|
} else if self.settings.mode == SortMode::GeneralNumeric {
|
||||||
// Parse this number as f64, as this is the requirement for general numeric sorting.
|
// Parse this number as BigDecimal, as this is the requirement for general numeric sorting.
|
||||||
Selection::AsF64(general_f64_parse(&range[get_leading_gen(range)]))
|
Selection::AsBigDecimal(general_bd_parse(&range[get_leading_gen(range)]))
|
||||||
} else {
|
} else {
|
||||||
// This is not a numeric sort, so we don't need a NumCache.
|
// This is not a numeric sort, so we don't need a NumCache.
|
||||||
Selection::Str(range)
|
Selection::Str(range)
|
||||||
|
@ -1789,35 +1792,45 @@ fn get_leading_gen(input: &str) -> Range<usize> {
|
||||||
leading_whitespace_len..input.len()
|
leading_whitespace_len..input.len()
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Copy, Clone, PartialEq, PartialOrd, Debug)]
|
#[derive(Clone, PartialEq, PartialOrd, Debug)]
|
||||||
pub enum GeneralF64ParseResult {
|
pub enum GeneralBigDecimalParseResult {
|
||||||
Invalid,
|
Invalid,
|
||||||
NaN,
|
Nan,
|
||||||
NegInfinity,
|
MinusInfinity,
|
||||||
Number(f64),
|
Number(BigDecimal),
|
||||||
Infinity,
|
Infinity,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Parse the beginning string into a GeneralF64ParseResult.
|
/// Parse the beginning string into a GeneralBigDecimalParseResult.
|
||||||
/// Using a GeneralF64ParseResult instead of f64 is necessary to correctly order floats.
|
/// Using a GeneralBigDecimalParseResult instead of ExtendedBigDecimal is necessary to correctly order floats.
|
||||||
#[inline(always)]
|
#[inline(always)]
|
||||||
fn general_f64_parse(a: &str) -> GeneralF64ParseResult {
|
fn general_bd_parse(a: &str) -> GeneralBigDecimalParseResult {
|
||||||
// The actual behavior here relies on Rust's implementation of parsing floating points.
|
// Parse digits, and fold in recoverable errors
|
||||||
// For example "nan", "inf" (ignoring the case) and "infinity" are only parsed to floats starting from 1.53.
|
let ebd = match ExtendedBigDecimal::extended_parse(a) {
|
||||||
// TODO: Once our minimum supported Rust version is 1.53 or above, we should add tests for those cases.
|
Err(ExtendedParserError::NotNumeric) => return GeneralBigDecimalParseResult::Invalid,
|
||||||
match a.parse::<f64>() {
|
Err(ExtendedParserError::PartialMatch(ebd, _))
|
||||||
Ok(a) if a.is_nan() => GeneralF64ParseResult::NaN,
|
| Err(ExtendedParserError::Overflow(ebd))
|
||||||
Ok(a) if a == f64::NEG_INFINITY => GeneralF64ParseResult::NegInfinity,
|
| Err(ExtendedParserError::Underflow(ebd))
|
||||||
Ok(a) if a == f64::INFINITY => GeneralF64ParseResult::Infinity,
|
| Ok(ebd) => ebd,
|
||||||
Ok(a) => GeneralF64ParseResult::Number(a),
|
};
|
||||||
Err(_) => GeneralF64ParseResult::Invalid,
|
|
||||||
|
match ebd {
|
||||||
|
ExtendedBigDecimal::BigDecimal(bd) => GeneralBigDecimalParseResult::Number(bd),
|
||||||
|
ExtendedBigDecimal::Infinity => GeneralBigDecimalParseResult::Infinity,
|
||||||
|
ExtendedBigDecimal::MinusInfinity => GeneralBigDecimalParseResult::MinusInfinity,
|
||||||
|
// Minus zero and zero are equal
|
||||||
|
ExtendedBigDecimal::MinusZero => GeneralBigDecimalParseResult::Number(0.into()),
|
||||||
|
ExtendedBigDecimal::Nan | ExtendedBigDecimal::MinusNan => GeneralBigDecimalParseResult::Nan,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Compares two floats, with errors and non-numerics assumed to be -inf.
|
/// Compares two floats, with errors and non-numerics assumed to be -inf.
|
||||||
/// Stops coercing at the first non-numeric char.
|
/// Stops coercing at the first non-numeric char.
|
||||||
/// We explicitly need to convert to f64 in this case.
|
/// Both operands are already parsed `GeneralBigDecimalParseResult` values and are compared directly, without converting to f64.
|
||||||
fn general_numeric_compare(a: &GeneralF64ParseResult, b: &GeneralF64ParseResult) -> Ordering {
|
fn general_numeric_compare(
|
||||||
|
a: &GeneralBigDecimalParseResult,
|
||||||
|
b: &GeneralBigDecimalParseResult,
|
||||||
|
) -> Ordering {
|
||||||
a.partial_cmp(b).unwrap()
|
a.partial_cmp(b).unwrap()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -67,16 +67,41 @@ impl Base {
|
||||||
&self,
|
&self,
|
||||||
str: &'a str,
|
str: &'a str,
|
||||||
digits: Option<BigUint>,
|
digits: Option<BigUint>,
|
||||||
) -> (Option<BigUint>, u64, &'a str) {
|
) -> (Option<BigUint>, i64, &'a str) {
|
||||||
let mut digits: Option<BigUint> = digits;
|
let mut digits: Option<BigUint> = digits;
|
||||||
let mut count: u64 = 0;
|
let mut count: i64 = 0;
|
||||||
let mut rest = str;
|
let mut rest = str;
|
||||||
|
|
||||||
|
// Doing operations on BigUint is really expensive, so we do as much as we
|
||||||
|
// can on u64, then add them to the BigUint.
|
||||||
|
let mut digits_tmp: u64 = 0;
|
||||||
|
let mut count_tmp: i64 = 0;
|
||||||
|
let mut mul_tmp: u64 = 1;
|
||||||
while let Some(d) = rest.chars().next().and_then(|c| self.digit(c)) {
|
while let Some(d) = rest.chars().next().and_then(|c| self.digit(c)) {
|
||||||
(digits, count) = (
|
(digits_tmp, count_tmp, mul_tmp) = (
|
||||||
Some(digits.unwrap_or_default() * *self as u8 + d),
|
digits_tmp * *self as u64 + d,
|
||||||
count + 1,
|
count_tmp + 1,
|
||||||
|
mul_tmp * *self as u64,
|
||||||
);
|
);
|
||||||
rest = &rest[1..];
|
rest = &rest[1..];
|
||||||
|
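// In base 16, we parse 4 bits at a time, so we can parse 16 digits at most in a u64. We flush after 15 digits so that the multiplier (base^count, up to 16^15 = 2^60) also fits in a u64.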
|
||||||
|
if count_tmp >= 15 {
|
||||||
|
// Accumulate what we have so far
|
||||||
|
(digits, count) = (
|
||||||
|
Some(digits.unwrap_or_default() * mul_tmp + digits_tmp),
|
||||||
|
count + count_tmp,
|
||||||
|
);
|
||||||
|
// Reset state
|
||||||
|
(digits_tmp, count_tmp, mul_tmp) = (0, 0, 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Accumulate the leftovers (if any)
|
||||||
|
if mul_tmp > 1 {
|
||||||
|
(digits, count) = (
|
||||||
|
Some(digits.unwrap_or_default() * mul_tmp + digits_tmp),
|
||||||
|
count + count_tmp,
|
||||||
|
);
|
||||||
}
|
}
|
||||||
(digits, count, rest)
|
(digits, count, rest)
|
||||||
}
|
}
|
||||||
|
@ -265,7 +290,7 @@ impl ExtendedParser for ExtendedBigDecimal {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn parse_digits(base: Base, str: &str, fractional: bool) -> (Option<BigUint>, u64, &str) {
|
fn parse_digits(base: Base, str: &str, fractional: bool) -> (Option<BigUint>, i64, &str) {
|
||||||
// Parse the integral part of the number
|
// Parse the integral part of the number
|
||||||
let (digits, rest) = base.parse_digits(str);
|
let (digits, rest) = base.parse_digits(str);
|
||||||
|
|
||||||
|
@ -447,7 +472,7 @@ fn construct_extended_big_decimal<'a>(
|
||||||
digits: BigUint,
|
digits: BigUint,
|
||||||
negative: bool,
|
negative: bool,
|
||||||
base: Base,
|
base: Base,
|
||||||
scale: u64,
|
scale: i64,
|
||||||
exponent: BigInt,
|
exponent: BigInt,
|
||||||
) -> Result<ExtendedBigDecimal, ExtendedParserError<'a, ExtendedBigDecimal>> {
|
) -> Result<ExtendedBigDecimal, ExtendedParserError<'a, ExtendedBigDecimal>> {
|
||||||
if digits == BigUint::zero() {
|
if digits == BigUint::zero() {
|
||||||
|
@ -465,16 +490,20 @@ fn construct_extended_big_decimal<'a>(
|
||||||
let bd = if scale == 0 && exponent.is_zero() {
|
let bd = if scale == 0 && exponent.is_zero() {
|
||||||
BigDecimal::from_bigint(signed_digits, 0)
|
BigDecimal::from_bigint(signed_digits, 0)
|
||||||
} else if base == Base::Decimal {
|
} else if base == Base::Decimal {
|
||||||
let new_scale = BigInt::from(scale) - exponent;
|
if exponent.is_zero() {
|
||||||
|
// Optimization: Converting scale to Bigint and back is relatively slow.
|
||||||
|
BigDecimal::from_bigint(signed_digits, scale)
|
||||||
|
} else {
|
||||||
|
let new_scale = -exponent + scale;
|
||||||
|
|
||||||
// BigDecimal "only" supports i64 scale.
|
// BigDecimal "only" supports i64 scale.
|
||||||
// Note that new_scale is a negative exponent: large value causes an underflow, small value an overflow.
|
// Note that new_scale is a negative exponent: a large positive value causes an underflow, a large negative value an overflow.
|
||||||
if new_scale > i64::MAX.into() {
|
if let Some(new_scale) = new_scale.to_i64() {
|
||||||
return Err(make_error(false, negative));
|
BigDecimal::from_bigint(signed_digits, new_scale)
|
||||||
} else if new_scale < i64::MIN.into() {
|
} else {
|
||||||
return Err(make_error(true, negative));
|
return Err(make_error(new_scale.is_negative(), negative));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
BigDecimal::from_bigint(signed_digits, new_scale.to_i64().unwrap())
|
|
||||||
} else if base == Base::Hexadecimal {
|
} else if base == Base::Hexadecimal {
|
||||||
// pow "only" supports u32 values, just error out if given more than 2**32 fractional digits.
|
// pow "only" supports u32 values, just error out if given more than 2**32 fractional digits.
|
||||||
if scale > u32::MAX.into() {
|
if scale > u32::MAX.into() {
|
||||||
|
|
|
@ -1503,3 +1503,67 @@ fn test_files0_from_zero_length() {
|
||||||
.fails_with_code(2)
|
.fails_with_code(2)
|
||||||
.stderr_only("sort: -:2: invalid zero-length file name\n");
|
.stderr_only("sort: -:2: invalid zero-length file name\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
// Test for GNU tests/sort/sort-float.sh
|
||||||
|
fn test_g_float() {
|
||||||
|
let input = "0\n-3.3621031431120935063e-4932\n3.3621031431120935063e-4932\n";
|
||||||
|
let output = "-3.3621031431120935063e-4932\n0\n3.3621031431120935063e-4932\n";
|
||||||
|
new_ucmd!()
|
||||||
|
.args(&["-g"])
|
||||||
|
.pipe_in(input)
|
||||||
|
.succeeds()
|
||||||
|
.stdout_is(output);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
// Test misc numbers ("'a" is not interpreted as literal, trailing text is ignored...)
|
||||||
|
fn test_g_misc() {
|
||||||
|
let input = "1\n100\n90\n'a\n85hello\n";
|
||||||
|
let output = "'a\n1\n85hello\n90\n100\n";
|
||||||
|
new_ucmd!()
|
||||||
|
.args(&["-g"])
|
||||||
|
.pipe_in(input)
|
||||||
|
.succeeds()
|
||||||
|
.stdout_is(output);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
// Test numbers with a large number of digits, where only the last digit is different.
|
||||||
|
// We use scientific notation to make sure string sorting does not correctly order them.
|
||||||
|
fn test_g_arbitrary() {
|
||||||
|
let input = [
|
||||||
|
// GNU coreutils doesn't handle those correctly as they don't fit exactly in long double
|
||||||
|
"3",
|
||||||
|
"3.000000000000000000000000000000000000000000000000000000000000000004",
|
||||||
|
"0.3000000000000000000000000000000000000000000000000000000000000000002e1",
|
||||||
|
"0.03000000000000000000000000000000000000000000000000000000000000000003e2",
|
||||||
|
"0.003000000000000000000000000000000000000000000000000000000000000000001e3",
|
||||||
|
// GNU coreutils does handle those correctly though
|
||||||
|
"10",
|
||||||
|
"10.000000000000004",
|
||||||
|
"1.0000000000000002e1",
|
||||||
|
"0.10000000000000003e2",
|
||||||
|
"0.010000000000000001e3",
|
||||||
|
]
|
||||||
|
.join("\n");
|
||||||
|
let output = [
|
||||||
|
"3",
|
||||||
|
"0.003000000000000000000000000000000000000000000000000000000000000000001e3",
|
||||||
|
"0.3000000000000000000000000000000000000000000000000000000000000000002e1",
|
||||||
|
"0.03000000000000000000000000000000000000000000000000000000000000000003e2",
|
||||||
|
"3.000000000000000000000000000000000000000000000000000000000000000004",
|
||||||
|
"10",
|
||||||
|
"0.010000000000000001e3",
|
||||||
|
"1.0000000000000002e1",
|
||||||
|
"0.10000000000000003e2",
|
||||||
|
"10.000000000000004",
|
||||||
|
]
|
||||||
|
.join("\n")
|
||||||
|
+ "\n";
|
||||||
|
new_ucmd!()
|
||||||
|
.args(&["-g"])
|
||||||
|
.pipe_in(input)
|
||||||
|
.succeeds()
|
||||||
|
.stdout_is(output);
|
||||||
|
}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue