1
Fork 0
mirror of https://github.com/RGBCube/uutils-coreutils synced 2025-07-31 04:57:45 +00:00

sort: fix tokenization for trailing separators

Trailing separators were included at the end of the last token, but they
should not be.

This changes tokenize_with_separator as suggested by @cbjadwani.
This commit is contained in:
Michael Debertol 2021-04-21 17:56:59 +02:00
parent fb2ae04b8f
commit 8a05148d7b

View file

@@ -351,20 +351,18 @@ fn tokenize_default(line: &str) -> Vec<Field> {
 /// Split between separators. These separators are not included in fields.
 fn tokenize_with_separator(line: &str, separator: char) -> Vec<Field> {
-    let mut tokens = vec![0..0];
-    let mut previous_was_separator = false;
-    for (idx, char) in line.char_indices() {
-        if previous_was_separator {
-            tokens.push(idx..0);
-        }
-        if char == separator {
-            tokens.last_mut().unwrap().end = idx;
-            previous_was_separator = true;
-        } else {
-            previous_was_separator = false;
-        }
-    }
-    tokens.last_mut().unwrap().end = line.len();
+    let mut tokens = vec![];
+    let separator_indices =
+        line.char_indices()
+            .filter_map(|(i, c)| if c == separator { Some(i) } else { None });
+    let mut start = 0;
+    for sep_idx in separator_indices {
+        tokens.push(start..sep_idx);
+        start = sep_idx + 1;
+    }
+    if start < line.len() {
+        tokens.push(start..line.len());
+    }
     tokens
 }
@@ -1383,4 +1381,14 @@ mod tests {
             vec![0..0, 1..1, 2..2, 3..9, 10..18,]
         );
     }
+    #[test]
+    fn test_tokenize_fields_trailing_custom_separator() {
+        let line = "a";
+        assert_eq!(tokenize(line, Some('a')), vec![0..0]);
+        let line = "aa";
+        assert_eq!(tokenize(line, Some('a')), vec![0..0, 1..1]);
+        let line = "..a..a";
+        assert_eq!(tokenize(line, Some('a')), vec![0..2, 3..5]);
+    }
 }