1
Fork 0
mirror of https://github.com/RGBCube/uutils-coreutils synced 2025-07-28 03:27:44 +00:00

sort: make GNU test sort-debug-keys pass (#2269)

* sort: disable support for thousand separators

In order to be compatible with GNU, we have to disable thousands
separators. GNU does not enable them for the C locale, either.

Once we add support for locales we can add this feature back.

* sort: delete unused fixtures

* sort: compare -0 and 0 equal

I must have misunderstood this when implementing, but GNU considers
-0, 0, and invalid numbers to be equal.

* sort: strip blanks before applying the char index

* sort: don't crash when key start is after key end

* sort: add "no match" for months at the first non-whitespace char

We should put the "^ no match for key" indicator at the first
non-whitespace character of a field.

* sort: improve support for e notation

* sort: use maches! macros
This commit is contained in:
Michael Debertol 2021-05-28 22:38:29 +02:00 committed by GitHub
parent 9442f26fdb
commit e9656a6c32
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
35 changed files with 191 additions and 231 deletions

View file

@ -174,7 +174,11 @@ impl NumInfo {
pub fn numeric_str_cmp((a, a_info): (&str, &NumInfo), (b, b_info): (&str, &NumInfo)) -> Ordering {
// check for a difference in the sign
if a_info.sign != b_info.sign {
return a_info.sign.cmp(&b_info.sign);
return if a.is_empty() && b.is_empty() {
Ordering::Equal
} else {
a_info.sign.cmp(&b_info.sign)
};
}
// check for a difference in the exponent
@ -419,8 +423,8 @@ mod tests {
#[test]
fn minus_zero() {
// This matches GNU sort behavior.
test_helper("-0", "0", Ordering::Less);
test_helper("-0x", "0", Ordering::Less);
test_helper("-0", "0", Ordering::Equal);
test_helper("-0x", "0", Ordering::Equal);
}
#[test]
fn double_minus() {

View file

@ -99,10 +99,9 @@ static OPT_TMP_DIR: &str = "temporary-directory";
static ARG_FILES: &str = "files";
static DECIMAL_PT: char = '.';
static THOUSANDS_SEP: char = ',';
static NEGATIVE: char = '-';
static POSITIVE: char = '+';
const NEGATIVE: char = '-';
const POSITIVE: char = '+';
// Choosing a higher buffer size does not result in performance improvements
// (at least not on my machine). TODO: In the future, we should also take the amount of
@ -330,8 +329,7 @@ impl<'a> Line<'a> {
&self.line[selection.clone()],
NumInfoParseSettings {
accept_si_units: selector.settings.mode == SortMode::HumanNumeric,
thousands_separator: Some(THOUSANDS_SEP),
decimal_pt: Some(DECIMAL_PT),
..Default::default()
},
);
let initial_selection = selection.clone();
@ -367,16 +365,24 @@ impl<'a> Line<'a> {
SortMode::Month => {
let initial_selection = &self.line[selection.clone()];
let month = if month_parse(initial_selection) == Month::Unknown {
// We failed to parse a month, which is equivalent to matching nothing.
0..0
} else {
// We parsed a month. Match the three first non-whitespace characters, which must be the month we parsed.
let mut chars = initial_selection
let mut month_chars = initial_selection
.char_indices()
.skip_while(|(_, c)| c.is_whitespace());
chars.next().unwrap().0
..chars.nth(2).map_or(initial_selection.len(), |(idx, _)| idx)
let month = if month_parse(initial_selection) == Month::Unknown {
// We failed to parse a month, which is equivalent to matching nothing.
// Add the "no match for key" marker to the first non-whitespace character.
let first_non_whitespace = month_chars.next();
first_non_whitespace.map_or(
initial_selection.len()..initial_selection.len(),
|(idx, _)| idx..idx,
)
} else {
// We parsed a month. Match the first three non-whitespace characters, which must be the month we parsed.
month_chars.next().unwrap().0
..month_chars
.nth(2)
.map_or(initial_selection.len(), |(idx, _)| idx)
};
// Shorten selection to month.
@ -606,8 +612,7 @@ impl FieldSelector {
range,
NumInfoParseSettings {
accept_si_units: self.settings.mode == SortMode::HumanNumeric,
thousands_separator: Some(THOUSANDS_SEP),
decimal_pt: Some(DECIMAL_PT),
..Default::default()
},
);
// Shorten the range to what we need to pass to numeric_str_cmp later.
@ -666,22 +671,21 @@ impl FieldSelector {
} else {
tokens.unwrap()[position.field - 1].start
};
// strip blanks if needed
if position.ignore_blanks {
idx += line[idx..]
.char_indices()
.find(|(_, c)| !c.is_whitespace())
.map_or(line[idx..].len(), |(idx, _)| idx);
}
// apply the character index
idx += line[idx..]
.char_indices()
.nth(position.char - 1)
.map_or(line.len(), |(idx, _)| idx);
.map_or(line[idx..].len(), |(idx, _)| idx);
if idx >= line.len() {
Resolution::TooHigh
} else {
if position.ignore_blanks {
if let Some((not_whitespace, _)) =
line[idx..].char_indices().find(|(_, c)| !c.is_whitespace())
{
idx += not_whitespace;
} else {
return Resolution::TooHigh;
}
}
Resolution::StartOfChar(idx)
}
}
@ -691,8 +695,9 @@ impl FieldSelector {
Resolution::StartOfChar(from) => {
let to = self.to.as_ref().map(|to| resolve_index(line, tokens, &to));
match to {
let mut range = match to {
Some(Resolution::StartOfChar(mut to)) => {
// We need to include the character at `to`.
to += line[to..].chars().next().map_or(1, |c| c.len_utf8());
from..to
}
@ -703,7 +708,11 @@ impl FieldSelector {
// If `to` is before the start of the line, report no match.
// This can happen if the line starts with a separator.
Some(Resolution::TooLow) => 0..0,
};
if range.start > range.end {
range.end = range.start;
}
range
}
Resolution::TooLow | Resolution::EndOfChar(_) => {
unreachable!("This should only happen if the field start index is 0, but that should already have caused an error.")
@ -1202,6 +1211,8 @@ fn compare_by<'a>(a: &Line<'a>, b: &Line<'a>, global_settings: &GlobalSettings)
fn get_leading_gen(input: &str) -> Range<usize> {
let trimmed = input.trim_start();
let leading_whitespace_len = input.len() - trimmed.len();
// check for inf, -inf and nan
for allowed_prefix in &["inf", "-inf", "nan"] {
if trimmed.is_char_boundary(allowed_prefix.len())
&& trimmed[..allowed_prefix.len()].eq_ignore_ascii_case(allowed_prefix)
@ -1210,11 +1221,11 @@ fn get_leading_gen(input: &str) -> Range<usize> {
}
}
// Make this iter peekable to see if next char is numeric
let mut char_indices = trimmed.char_indices().peekable();
let mut char_indices = itertools::peek_nth(trimmed.char_indices());
let first = char_indices.peek();
if first.map_or(false, |&(_, c)| c == NEGATIVE || c == POSITIVE) {
if matches!(first, Some((_, NEGATIVE)) | Some((_, POSITIVE))) {
char_indices.next();
}
@ -1224,17 +1235,30 @@ fn get_leading_gen(input: &str) -> Range<usize> {
if c.is_ascii_digit() {
continue;
}
if c == DECIMAL_PT && !had_decimal_pt {
if c == DECIMAL_PT && !had_decimal_pt && !had_e_notation {
had_decimal_pt = true;
continue;
}
let next_char_numeric = char_indices
.peek()
.map_or(false, |(_, c)| c.is_ascii_digit());
if (c == 'e' || c == 'E') && !had_e_notation && next_char_numeric {
if (c == 'e' || c == 'E') && !had_e_notation {
// we can only consume the 'e' if what follow is either a digit, or a sign followed by a digit.
if let Some(&(_, next_char)) = char_indices.peek() {
if (next_char == '+' || next_char == '-')
&& matches!(
char_indices.peek_nth(2),
Some((_, c)) if c.is_ascii_digit()
)
{
// Consume the sign. The following digits will be consumed by the main loop.
char_indices.next();
had_e_notation = true;
continue;
}
if next_char.is_ascii_digit() {
had_e_notation = true;
continue;
}
}
}
return leading_whitespace_len..(leading_whitespace_len + idx);
}
leading_whitespace_len..input.len()

View file

@ -526,6 +526,11 @@ fn test_keys_with_options_blanks_start() {
}
}
#[test]
fn test_keys_blanks_with_char_idx() {
test_helper("keys_blanks", &["-k 1.2b"])
}
#[test]
fn test_keys_with_options_blanks_end() {
let input = "a b
@ -574,6 +579,13 @@ aaaa
.stdout_only(input);
}
#[test]
fn test_keys_negative_size_match() {
// If the end of a field is before its start, we should not crash.
// Debug output should report "no match for key" at the start position (i.e. the later position).
test_helper("keys_negative_size", &["-k 3,1"]);
}
#[test]
fn test_zero_terminated() {
test_helper("zero-terminated", &["-z"]);

View file

@ -6,8 +6,15 @@
-12e-5555.5
0b10 // binary not supported
0x10 // hexadecimal not supported, but it should be
55e-20
55e-20.10
5.5.5.5
10E
64e+
99e-
1000EDKLD
10000K78
+100000
@ -15,5 +22,7 @@
100E6
100E6
10e10e10e10
13e+10
45e+10.5
50e10
50e10

View file

@ -22,12 +22,33 @@
^ no match for key
^ no match for key
> -12e-5555.5
_________
______________
0b10 // binary not supported
_
____________________________
0x10 // hexadecimal not supported, but it should be
_
___________________________________________________
55e-20
______
______
55e-20.10
______
_________
5.5.5.5
___
_______
10E
__
___
64e+
__
____
99e-
__
____
1000EDKLD
____
_________
@ -49,6 +70,12 @@ _____
10e10e10e10
_____
___________
13e+10
______
______
45e+10.5
______
________
50e10
_____
_____

View file

@ -4,16 +4,25 @@
+100000
10000K78
0x10 // hexadecimal not supported, but it should be
10E
0b10 // binary not supported
64e+
99e-
45e+10.5
1000EDKLD
13e+10
100E6
50e10
-12e-5555.5
+100000
10e10e10e10
5.5.5.5
55e-20
55e-20.10

View file

@ -0,0 +1,3 @@
cab
abc
bca

View file

@ -0,0 +1,9 @@
>cab
__
____
>abc
__
____
>bca
__
____

3
tests/fixtures/sort/keys_blanks.txt vendored Normal file
View file

@ -0,0 +1,3 @@
abc
cab
bca

View file

@ -0,0 +1 @@
a b c

View file

@ -0,0 +1,3 @@
a b c
^ no match for key
_____

View file

@ -0,0 +1 @@
a b c

View file

@ -21,10 +21,10 @@ CARAvan
8.013
45
46.89
4567.
37800
576,446.88800000
576,446.890
4567.
37800
4798908.340000000000
4798908.45
4798908.8909800

View file

@ -67,18 +67,18 @@ __
46.89
_____
_____
576,446.88800000
___
________________
576,446.890
___
___________
4567.
_____
____________________
>>>>37800
_____
_________
576,446.88800000
________________
________________
576,446.890
___________
___________
4798908.340000000000
____________________
____________________

View file

@ -1,30 +0,0 @@
4798908.8909800
4798908.45
4798908.340000000000
576,446.890
576,446.88800000
37800
4567.
46.89
45
8.013
1.58590
1.444
1.040000000
1
00000001
CARAvan
000
-.05
-1
-8.90880
-896689
-2028789030

View file

@ -1,30 +0,0 @@
4798908.8909800
4798908.45
4798908.340000000000
576,446.890
576,446.88800000
37800
4567.
46.89
45
8.013
1.58590
1.444
1.040000000
1
00000001
CARAvan
000
-.05
-1
-8.90880
-896689
-2028789030

View file

@ -1,30 +0,0 @@
576,446.890
576,446.88800000
4567.
45
46.89
-1
1
00000001
4798908.340000000000
4798908.45
4798908.8909800
37800
-2028789030
-896689
CARAvan
-8.90880
-.05
1.444
1.58590
1.040000000
8.013
000

View file

@ -8,11 +8,14 @@
-0
CARAvan
-0
000
-0
1
00000001
1.040000000
@ -21,10 +24,10 @@ CARAvan
8.013
45
46.89
576,446.890
576,446.88800000
4567.
37800
576,446.88800000
576,446.890
4798908.340000000000
4798908.45
4798908.8909800

View file

@ -18,8 +18,12 @@ ____
^ no match for key
^ no match for key
-0
__
CARAvan
^ no match for key
-0
__
^ no match for key
@ -28,6 +32,8 @@ CARAvan
^ no match for key
000
___
-0
__
1
_
00000001
@ -44,14 +50,14 @@ _____
__
46.89
_____
576,446.890
___
576,446.88800000
___
4567.
_____
>>>>37800
_____
576,446.88800000
________________
576,446.890
___________
4798908.340000000000
____________________
4798908.45

View file

@ -17,7 +17,9 @@
-2028789030
-896689
-0
CARAvan
-0
-8.90880
-.05
@ -28,3 +30,4 @@ CARAvan
8.013
000
-0

View file

@ -11,10 +11,9 @@
8.013
45
46.89
576,446.890
4567.
37800
576,446.88800000
576,446.890
4798908.340000000000
4798908.45
4798908.8909800

View file

@ -24,14 +24,12 @@ _____
__
46.89
_____
576,446.890
___
4567.
_____
>>>>37800
_____
576,446.88800000
________________
576,446.890
___________
4798908.340000000000
____________________
4798908.45

View file

@ -1,10 +1,9 @@
4798908.8909800
4798908.45
4798908.340000000000
576,446.890
576,446.88800000
37800
4567.
576,446.890
46.89
45
8.013

View file

@ -4,14 +4,12 @@ _______________
__________
4798908.340000000000
____________________
576,446.890
___________
576,446.88800000
________________
>>>>37800
_____
4567.
_____
576,446.890
___
46.89
_____
45

View file

@ -1,20 +0,0 @@
4798908.8909800
4798908.45
4798908.340000000000
576,446.890
576,446.88800000
37800
4567.
46.89
45
8.013
1.58590
1.444
1.040000000
1
-.05
-1
-8.90880
-896689
-2028789030

View file

@ -1,20 +0,0 @@
4798908.8909800
4798908.45
4798908.340000000000
576,446.890
576,446.88800000
37800
4567.
46.89
45
8.013
1.58590
1.444
1.040000000
1
-.05
-1
-8.90880
-896689
-2028789030

View file

@ -1,30 +0,0 @@
576,446.890
576,446.88800000
4567.
45
46.89
-1
1
00000001
4798908.340000000000
4798908.45
4798908.8909800
37800
-2028789030
-896689
CARAvan
-8.90880
-.05
1.444
1.58590
1.040000000
8.013
000

View file

@ -1,3 +1,4 @@
asdf
N/A Ut enim ad minim veniam, quis
Jan Lorem ipsum dolor sit amet
mar laboris nisi ut aliquip ex ea

View file

@ -1,3 +1,6 @@
asdf
^ no match for key
_____
N/A Ut enim ad minim veniam, quis
^ no match for key
_________________________________

View file

@ -8,3 +8,4 @@ mar laboris nisi ut aliquip ex ea
Jul 2 these three lines
Jul 1 should remain 2,1,3
Jul 3 if --stable is provided
asdf

View file

@ -1,4 +1,5 @@
N/A Ut enim ad minim veniam, quis
asdf
Jan Lorem ipsum dolor sit amet
mar laboris nisi ut aliquip ex ea
May sed do eiusmod tempor incididunt

View file

@ -1,5 +1,7 @@
N/A Ut enim ad minim veniam, quis
^ no match for key
asdf
^ no match for key
Jan Lorem ipsum dolor sit amet
___
mar laboris nisi ut aliquip ex ea

View file

@ -8,3 +8,4 @@ mar laboris nisi ut aliquip ex ea
Jul 2 these three lines
Jul 1 should remain 2,1,3
Jul 3 if --stable is provided
asdf

View file

@ -21,6 +21,8 @@ CARAvan
8.013
45
46.89
576,446.88800000
576,446.890
4567..457
4567.
4567.1
@ -28,8 +30,6 @@ CARAvan
37800
45670.89079.098
45670.89079.1
576,446.88800000
576,446.890
4798908.340000000000
4798908.45
4798908.8909800

View file

@ -67,6 +67,12 @@ __
46.89
_____
_____
576,446.88800000
___
________________
576,446.890
___
___________
>>>>>>>>>>4567..457
_____
___________________
@ -88,12 +94,6 @@ _____________________
>>>>>>45670.89079.1
___________
___________________
576,446.88800000
________________
________________
576,446.890
___________
___________
4798908.340000000000
____________________
____________________