diff --git a/Cargo.lock b/Cargo.lock index 810cc2a66..15c880338 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3287,6 +3287,7 @@ dependencies = [ name = "uu_sort" version = "0.1.0" dependencies = [ + "bigdecimal", "binary-heap-plus", "clap", "compare", diff --git a/docs/src/extensions.md b/docs/src/extensions.md index af6119da5..81457c29d 100644 --- a/docs/src/extensions.md +++ b/docs/src/extensions.md @@ -153,6 +153,15 @@ See also comments under `printf` for formatting precision and differences. `seq` provides `-t`/`--terminator` to set the terminator character. +## `sort` + +When sorting with `-g`/`--general-numeric-sort`, arbitrary precision decimal numbers +are parsed and compared, unlike GNU coreutils that uses platform-specific long +double floating point numbers. + +Extremely large or small values can still overflow or underflow to infinity or zero, +see note in `seq`. + ## `ls` GNU `ls` provides two ways to use a long listing format: `-l` and `--format=long`. We support a diff --git a/fuzz/Cargo.lock b/fuzz/Cargo.lock index 765d3c0a9..6190e8ba6 100644 --- a/fuzz/Cargo.lock +++ b/fuzz/Cargo.lock @@ -1345,6 +1345,7 @@ dependencies = [ name = "uu_sort" version = "0.1.0" dependencies = [ + "bigdecimal", "binary-heap-plus", "clap", "compare", diff --git a/src/uu/sort/Cargo.toml b/src/uu/sort/Cargo.toml index 67676d809..80c8ba364 100644 --- a/src/uu/sort/Cargo.toml +++ b/src/uu/sort/Cargo.toml @@ -1,3 +1,5 @@ +# spell-checker:ignore bigdecimal + [package] name = "uu_sort" description = "sort ~ (uutils) sort input lines" @@ -18,6 +20,7 @@ workspace = true path = "src/sort.rs" [dependencies] +bigdecimal = { workspace = true } binary-heap-plus = { workspace = true } clap = { workspace = true } compare = { workspace = true } diff --git a/src/uu/sort/src/chunks.rs b/src/uu/sort/src/chunks.rs index 8f423701a..18b2547dc 100644 --- a/src/uu/sort/src/chunks.rs +++ b/src/uu/sort/src/chunks.rs @@ -17,7 +17,9 @@ use memchr::memchr_iter; use self_cell::self_cell; use uucore::error::{UResult, USimpleError}; -use crate::{GeneralF64ParseResult, GlobalSettings, Line, SortError, numeric_str_cmp::NumInfo}; +use crate::{ + GeneralBigDecimalParseResult, GlobalSettings, Line, SortError, numeric_str_cmp::NumInfo, +}; self_cell!( /// The chunk that is passed around between threads. @@ -41,7 +43,7 @@ pub struct ChunkContents<'a> { pub struct LineData<'a> { pub selections: Vec<&'a str>, pub num_infos: Vec, - pub parsed_floats: Vec, + pub parsed_floats: Vec, pub line_num_floats: Vec>, } @@ -100,7 +102,7 @@ pub struct RecycledChunk { lines: Vec>, selections: Vec<&'static str>, num_infos: Vec, - parsed_floats: Vec, + parsed_floats: Vec, line_num_floats: Vec>, buffer: Vec, } diff --git a/src/uu/sort/src/sort.rs b/src/uu/sort/src/sort.rs index c97d07a6a..faef098d9 100644 --- a/src/uu/sort/src/sort.rs +++ b/src/uu/sort/src/sort.rs @@ -7,7 +7,7 @@ // https://pubs.opengroup.org/onlinepubs/9699919799/utilities/sort.html // https://www.gnu.org/software/coreutils/manual/html_node/sort-invocation.html -// spell-checker:ignore (misc) HFKJFK Mbdfhn getrlimit RLIMIT_NOFILE rlim +// spell-checker:ignore (misc) HFKJFK Mbdfhn getrlimit RLIMIT_NOFILE rlim bigdecimal extendedbigdecimal mod check; mod chunks; @@ -17,6 +17,7 @@ mod merge; mod numeric_str_cmp; mod tmp_dir; +use bigdecimal::BigDecimal; use chunks::LineData; use clap::builder::ValueParser; use clap::{Arg, ArgAction, Command}; @@ -44,7 +45,9 @@ use unicode_width::UnicodeWidthStr; use uucore::display::Quotable; use uucore::error::{FromIo, strip_errno}; use uucore::error::{UError, UResult, USimpleError, UUsageError, set_exit_code}; +use uucore::extendedbigdecimal::ExtendedBigDecimal; use uucore::line_ending::LineEnding; +use uucore::parser::num_parser::{ExtendedParser, ExtendedParserError}; use uucore::parser::parse_size::{ParseSizeError, Parser}; use uucore::parser::shortcut_value_parser::ShortcutValueParser; use uucore::version_cmp::version_cmp; @@ -448,7 +451,7 @@ impl Default for KeySettings { } } enum Selection<'a> { - AsF64(GeneralF64ParseResult), + AsBigDecimal(GeneralBigDecimalParseResult), WithNumInfo(&'a str, NumInfo), Str(&'a str), } @@ -490,7 +493,7 @@ impl<'a> Line<'a> { .map(|selector| (selector, selector.get_selection(line, token_buffer))) { match selection { - Selection::AsF64(parsed_float) => line_data.parsed_floats.push(parsed_float), + Selection::AsBigDecimal(parsed_float) => line_data.parsed_floats.push(parsed_float), Selection::WithNumInfo(str, num_info) => { line_data.num_infos.push(num_info); line_data.selections.push(str); @@ -902,8 +905,8 @@ impl FieldSelector { range = &range[num_range]; Selection::WithNumInfo(range, info) } else if self.settings.mode == SortMode::GeneralNumeric { - // Parse this number as f64, as this is the requirement for general numeric sorting. - Selection::AsF64(general_f64_parse(&range[get_leading_gen(range)])) + // Parse this number as BigDecimal, as this is the requirement for general numeric sorting. + Selection::AsBigDecimal(general_bd_parse(&range[get_leading_gen(range)])) } else { // This is not a numeric sort, so we don't need a NumCache. Selection::Str(range) @@ -1789,35 +1792,45 @@ fn get_leading_gen(input: &str) -> Range { leading_whitespace_len..input.len() } -#[derive(Copy, Clone, PartialEq, PartialOrd, Debug)] -pub enum GeneralF64ParseResult { +#[derive(Clone, PartialEq, PartialOrd, Debug)] +pub enum GeneralBigDecimalParseResult { Invalid, - NaN, - NegInfinity, - Number(f64), + Nan, + MinusInfinity, + Number(BigDecimal), Infinity, } -/// Parse the beginning string into a GeneralF64ParseResult. -/// Using a GeneralF64ParseResult instead of f64 is necessary to correctly order floats. +/// Parse the beginning string into a GeneralBigDecimalParseResult. +/// Using a GeneralBigDecimalParseResult instead of ExtendedBigDecimal is necessary to correctly order floats. #[inline(always)] -fn general_f64_parse(a: &str) -> GeneralF64ParseResult { - // The actual behavior here relies on Rust's implementation of parsing floating points. - // For example "nan", "inf" (ignoring the case) and "infinity" are only parsed to floats starting from 1.53. - // TODO: Once our minimum supported Rust version is 1.53 or above, we should add tests for those cases. - match a.parse::() { - Ok(a) if a.is_nan() => GeneralF64ParseResult::NaN, - Ok(a) if a == f64::NEG_INFINITY => GeneralF64ParseResult::NegInfinity, - Ok(a) if a == f64::INFINITY => GeneralF64ParseResult::Infinity, - Ok(a) => GeneralF64ParseResult::Number(a), - Err(_) => GeneralF64ParseResult::Invalid, +fn general_bd_parse(a: &str) -> GeneralBigDecimalParseResult { + // Parse digits, and fold in recoverable errors + let ebd = match ExtendedBigDecimal::extended_parse(a) { + Err(ExtendedParserError::NotNumeric) => return GeneralBigDecimalParseResult::Invalid, + Err(ExtendedParserError::PartialMatch(ebd, _)) + | Err(ExtendedParserError::Overflow(ebd)) + | Err(ExtendedParserError::Underflow(ebd)) + | Ok(ebd) => ebd, + }; + + match ebd { + ExtendedBigDecimal::BigDecimal(bd) => GeneralBigDecimalParseResult::Number(bd), + ExtendedBigDecimal::Infinity => GeneralBigDecimalParseResult::Infinity, + ExtendedBigDecimal::MinusInfinity => GeneralBigDecimalParseResult::MinusInfinity, + // Minus zero and zero are equal + ExtendedBigDecimal::MinusZero => GeneralBigDecimalParseResult::Number(0.into()), + ExtendedBigDecimal::Nan | ExtendedBigDecimal::MinusNan => GeneralBigDecimalParseResult::Nan, } } /// Compares two floats, with errors and non-numerics assumed to be -inf. /// Stops coercing at the first non-numeric char. /// We explicitly need to convert to f64 in this case. -fn general_numeric_compare(a: &GeneralF64ParseResult, b: &GeneralF64ParseResult) -> Ordering { +fn general_numeric_compare( + a: &GeneralBigDecimalParseResult, + b: &GeneralBigDecimalParseResult, +) -> Ordering { a.partial_cmp(b).unwrap() } diff --git a/src/uucore/src/lib/features/parser/num_parser.rs b/src/uucore/src/lib/features/parser/num_parser.rs index 84aa82bdd..35a8892e8 100644 --- a/src/uucore/src/lib/features/parser/num_parser.rs +++ b/src/uucore/src/lib/features/parser/num_parser.rs @@ -67,16 +67,41 @@ impl Base { &self, str: &'a str, digits: Option, - ) -> (Option, u64, &'a str) { + ) -> (Option, i64, &'a str) { let mut digits: Option = digits; - let mut count: u64 = 0; + let mut count: i64 = 0; let mut rest = str; + + // Doing operations on BigUint is really expensive, so we do as much as we + // can on u64, then add them to the BigUint. + let mut digits_tmp: u64 = 0; + let mut count_tmp: i64 = 0; + let mut mul_tmp: u64 = 1; while let Some(d) = rest.chars().next().and_then(|c| self.digit(c)) { - (digits, count) = ( - Some(digits.unwrap_or_default() * *self as u8 + d), - count + 1, + (digits_tmp, count_tmp, mul_tmp) = ( + digits_tmp * *self as u64 + d, + count_tmp + 1, + mul_tmp * *self as u64, ); rest = &rest[1..]; + // In base 16, we parse 4 bits at a time, so we can parse 16 digits at most in a u64. + if count_tmp >= 15 { + // Accumulate what we have so far + (digits, count) = ( + Some(digits.unwrap_or_default() * mul_tmp + digits_tmp), + count + count_tmp, + ); + // Reset state + (digits_tmp, count_tmp, mul_tmp) = (0, 0, 1); + } + } + + // Accumulate the leftovers (if any) + if mul_tmp > 1 { + (digits, count) = ( + Some(digits.unwrap_or_default() * mul_tmp + digits_tmp), + count + count_tmp, + ); } (digits, count, rest) } @@ -265,7 +290,7 @@ impl ExtendedParser for ExtendedBigDecimal { } } -fn parse_digits(base: Base, str: &str, fractional: bool) -> (Option, u64, &str) { +fn parse_digits(base: Base, str: &str, fractional: bool) -> (Option, i64, &str) { // Parse the integral part of the number let (digits, rest) = base.parse_digits(str); @@ -447,7 +472,7 @@ fn construct_extended_big_decimal<'a>( digits: BigUint, negative: bool, base: Base, - scale: u64, + scale: i64, exponent: BigInt, ) -> Result> { if digits == BigUint::zero() { @@ -465,16 +490,20 @@ fn construct_extended_big_decimal<'a>( let bd = if scale == 0 && exponent.is_zero() { BigDecimal::from_bigint(signed_digits, 0) } else if base == Base::Decimal { - let new_scale = BigInt::from(scale) - exponent; + if exponent.is_zero() { + // Optimization: Converting scale to Bigint and back is relatively slow. + BigDecimal::from_bigint(signed_digits, scale) + } else { + let new_scale = -exponent + scale; - // BigDecimal "only" supports i64 scale. - // Note that new_scale is a negative exponent: large value causes an underflow, small value an overflow. - if new_scale > i64::MAX.into() { - return Err(make_error(false, negative)); - } else if new_scale < i64::MIN.into() { - return Err(make_error(true, negative)); + // BigDecimal "only" supports i64 scale. + // Note that new_scale is a negative exponent: large positive value causes an underflow, large negative values an overflow. + if let Some(new_scale) = new_scale.to_i64() { + BigDecimal::from_bigint(signed_digits, new_scale) + } else { + return Err(make_error(new_scale.is_negative(), negative)); + } } - BigDecimal::from_bigint(signed_digits, new_scale.to_i64().unwrap()) } else if base == Base::Hexadecimal { // pow "only" supports u32 values, just error out if given more than 2**32 fractional digits. if scale > u32::MAX.into() { diff --git a/tests/by-util/test_sort.rs b/tests/by-util/test_sort.rs index 0e20f1c9e..3fcd4e978 100644 --- a/tests/by-util/test_sort.rs +++ b/tests/by-util/test_sort.rs @@ -1503,3 +1503,67 @@ fn test_files0_from_zero_length() { .fails_with_code(2) .stderr_only("sort: -:2: invalid zero-length file name\n"); } + +#[test] +// Test for GNU tests/sort/sort-float.sh +fn test_g_float() { + let input = "0\n-3.3621031431120935063e-4932\n3.3621031431120935063e-4932\n"; + let output = "-3.3621031431120935063e-4932\n0\n3.3621031431120935063e-4932\n"; + new_ucmd!() + .args(&["-g"]) + .pipe_in(input) + .succeeds() + .stdout_is(output); +} + +#[test] +// Test misc numbers ("'a" is not interpreted as literal, trailing text is ignored...) +fn test_g_misc() { + let input = "1\n100\n90\n'a\n85hello\n"; + let output = "'a\n1\n85hello\n90\n100\n"; + new_ucmd!() + .args(&["-g"]) + .pipe_in(input) + .succeeds() + .stdout_is(output); +} + +#[test] +// Test numbers with a large number of digits, where only the last digit is different. +// We use scientific notation to make sure string sorting does not correctly order them. +fn test_g_arbitrary() { + let input = [ + // GNU coreutils doesn't handle those correctly as they don't fit exactly in long double + "3", + "3.000000000000000000000000000000000000000000000000000000000000000004", + "0.3000000000000000000000000000000000000000000000000000000000000000002e1", + "0.03000000000000000000000000000000000000000000000000000000000000000003e2", + "0.003000000000000000000000000000000000000000000000000000000000000000001e3", + // GNU coreutils does handle those correctly though + "10", + "10.000000000000004", + "1.0000000000000002e1", + "0.10000000000000003e2", + "0.010000000000000001e3", + ] + .join("\n"); + let output = [ + "3", + "0.003000000000000000000000000000000000000000000000000000000000000000001e3", + "0.3000000000000000000000000000000000000000000000000000000000000000002e1", + "0.03000000000000000000000000000000000000000000000000000000000000000003e2", + "3.000000000000000000000000000000000000000000000000000000000000000004", + "10", + "0.010000000000000001e3", + "1.0000000000000002e1", + "0.10000000000000003e2", + "10.000000000000004", + ] + .join("\n") + + "\n"; + new_ucmd!() + .args(&["-g"]) + .pipe_in(input) + .succeeds() + .stdout_is(output); +}