mirror of
https://github.com/RGBCube/uutils-coreutils
synced 2025-07-28 11:37:44 +00:00
sort: implement -k and -t support (#1996)
* sort: implement basic -k and -t support This allows to specify keys after the -k flag and a custom field separator using -t. Support for options for specific keys is still missing, and the -b flag is not passed down correctly. * sort: implement support for key options * remove unstable feature use * don't pipe in input when we expect a failure * only tokenize when needed, remove a clone() * improve comments * fix clippy lints * re-add test * buffer writes to stdout * fix ignore_non_printing and make the test fail in case it is broken :) * move attribute to the right position * add more tests * add my name to the copyright section * disallow dead code * move a comment * re-add a loc * use smallvec for a perf improvement in the common case * add BENCHMARKING.md * add ignore_case to benchmarks
This commit is contained in:
parent
e1221ef3f8
commit
49c9d8c901
5 changed files with 742 additions and 141 deletions
11
Cargo.lock
generated
11
Cargo.lock
generated
|
@ -1,5 +1,7 @@
|
||||||
# This file is automatically @generated by Cargo.
|
# This file is automatically @generated by Cargo.
|
||||||
# It is not intended for manual editing.
|
# It is not intended for manual editing.
|
||||||
|
version = 3
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "advapi32-sys"
|
name = "advapi32-sys"
|
||||||
version = "0.2.0"
|
version = "0.2.0"
|
||||||
|
@ -1362,6 +1364,12 @@ dependencies = [
|
||||||
"maybe-uninit",
|
"maybe-uninit",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "smallvec"
|
||||||
|
version = "1.6.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "fe0f37c9e8f3c5a4a66ad655a93c74daac4ad00c441533bf5c6e7990bb42604e"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "strsim"
|
name = "strsim"
|
||||||
version = "0.8.0"
|
version = "0.8.0"
|
||||||
|
@ -1816,7 +1824,7 @@ dependencies = [
|
||||||
"quickcheck",
|
"quickcheck",
|
||||||
"rand 0.7.3",
|
"rand 0.7.3",
|
||||||
"rand_chacha",
|
"rand_chacha",
|
||||||
"smallvec",
|
"smallvec 0.6.14",
|
||||||
"uucore",
|
"uucore",
|
||||||
"uucore_procs",
|
"uucore_procs",
|
||||||
]
|
]
|
||||||
|
@ -2289,6 +2297,7 @@ dependencies = [
|
||||||
"rand 0.7.3",
|
"rand 0.7.3",
|
||||||
"rayon",
|
"rayon",
|
||||||
"semver",
|
"semver",
|
||||||
|
"smallvec 1.6.1",
|
||||||
"uucore",
|
"uucore",
|
||||||
"uucore_procs",
|
"uucore_procs",
|
||||||
]
|
]
|
||||||
|
|
33
src/uu/sort/BENCHMARKING.md
Normal file
33
src/uu/sort/BENCHMARKING.md
Normal file
|
@ -0,0 +1,33 @@
|
||||||
|
# Benchmarking sort
|
||||||
|
|
||||||
|
Most of the time when sorting is spent comparing lines. The comparison functions however differ based
|
||||||
|
on which arguments are passed to `sort`, therefore it is important to always benchmark multiple scenarios.
|
||||||
|
This is an overwiew over what was benchmarked, and if you make changes to `sort`, you are encouraged to check
|
||||||
|
how performance was affected for the workloads listed below. Feel free to add other workloads to the
|
||||||
|
list that we should improve / make sure not to regress.
|
||||||
|
|
||||||
|
Run `cargo build --release` before benchmarking after you make a change!
|
||||||
|
|
||||||
|
## Sorting a wordlist
|
||||||
|
- Get a wordlist, for example with [words](https://en.wikipedia.org/wiki/Words_(Unix)) on Linux. The exact wordlist
|
||||||
|
doesn't matter for performance comparisons. In this example I'm using `/usr/share/dict/american-english` as the wordlist.
|
||||||
|
- Shuffle the wordlist by running `sort -R /usr/share/dict/american-english > shuffled_wordlist.txt`.
|
||||||
|
- Benchmark sorting the wordlist with hyperfine: `hyperfine "target/release/coreutils sort shuffled_wordlist.txt -o output.txt"`.
|
||||||
|
|
||||||
|
## Sorting a wordlist with ignore_case
|
||||||
|
- Same wordlist as above
|
||||||
|
- Benchmark sorting the wordlist ignoring the case with hyperfine: `hyperfine "target/release/coreutils sort shuffled_wordlist.txt -f -o output.txt"`.
|
||||||
|
|
||||||
|
## Sorting numbers
|
||||||
|
- Generate a list of numbers: `seq 0 100000 | sort -R > shuffled_numbers.txt`.
|
||||||
|
- Benchmark numeric sorting with hyperfine: `hyperfine "target/release/coreutils sort shuffled_numbers.txt -n -o output.txt"`.
|
||||||
|
|
||||||
|
## Stdout and stdin performance
|
||||||
|
Try to run the above benchmarks by piping the input through stdin (standard input) and redirect the
|
||||||
|
output through stdout (standard output):
|
||||||
|
- Remove the input file from the arguments and add `cat [inputfile] | ` at the beginning.
|
||||||
|
- Remove `-o output.txt` and add `> output.txt` at the end.
|
||||||
|
|
||||||
|
Example: `hyperfine "target/release/coreutils sort shuffled_numbers.txt -n -o output.txt"` becomes
|
||||||
|
`hyperfine "cat shuffled_numbers.txt | target/release/coreutils sort -n > output.txt`
|
||||||
|
- Check that performance is similar to the original benchmark.
|
|
@ -21,6 +21,7 @@ clap = "2.33"
|
||||||
fnv = "1.0.7"
|
fnv = "1.0.7"
|
||||||
itertools = "0.8.0"
|
itertools = "0.8.0"
|
||||||
semver = "0.9.0"
|
semver = "0.9.0"
|
||||||
|
smallvec = "1.6.1"
|
||||||
uucore = { version=">=0.0.8", package="uucore", path="../../uucore", features=["fs"] }
|
uucore = { version=">=0.0.8", package="uucore", path="../../uucore", features=["fs"] }
|
||||||
uucore_procs = { version=">=0.0.5", package="uucore_procs", path="../../uucore_procs" }
|
uucore_procs = { version=">=0.0.5", package="uucore_procs", path="../../uucore_procs" }
|
||||||
|
|
||||||
|
|
|
@ -2,10 +2,10 @@
|
||||||
// *
|
// *
|
||||||
// * (c) Michael Yin <mikeyin@mikeyin.org>
|
// * (c) Michael Yin <mikeyin@mikeyin.org>
|
||||||
// * (c) Robert Swinford <robert.swinford..AT..gmail.com>
|
// * (c) Robert Swinford <robert.swinford..AT..gmail.com>
|
||||||
|
// * (c) Michael Debertol <michael.debertol..AT..gmail.com>
|
||||||
// *
|
// *
|
||||||
// * For the full copyright and license information, please view the LICENSE
|
// * For the full copyright and license information, please view the LICENSE
|
||||||
// * file that was distributed with this source code.
|
// * file that was distributed with this source code.
|
||||||
#![allow(dead_code)]
|
|
||||||
|
|
||||||
// Although these links don't always seem to describe reality, check out the POSIX and GNU specs:
|
// Although these links don't always seem to describe reality, check out the POSIX and GNU specs:
|
||||||
// https://pubs.opengroup.org/onlinepubs/9699919799/utilities/sort.html
|
// https://pubs.opengroup.org/onlinepubs/9699919799/utilities/sort.html
|
||||||
|
@ -22,6 +22,7 @@ use rand::distributions::Alphanumeric;
|
||||||
use rand::{thread_rng, Rng};
|
use rand::{thread_rng, Rng};
|
||||||
use rayon::prelude::*;
|
use rayon::prelude::*;
|
||||||
use semver::Version;
|
use semver::Version;
|
||||||
|
use smallvec::SmallVec;
|
||||||
use std::borrow::Cow;
|
use std::borrow::Cow;
|
||||||
use std::cmp::Ordering;
|
use std::cmp::Ordering;
|
||||||
use std::collections::BinaryHeap;
|
use std::collections::BinaryHeap;
|
||||||
|
@ -30,6 +31,7 @@ use std::fs::File;
|
||||||
use std::hash::{Hash, Hasher};
|
use std::hash::{Hash, Hasher};
|
||||||
use std::io::{stdin, stdout, BufRead, BufReader, BufWriter, Lines, Read, Write};
|
use std::io::{stdin, stdout, BufRead, BufReader, BufWriter, Lines, Read, Write};
|
||||||
use std::mem::replace;
|
use std::mem::replace;
|
||||||
|
use std::ops::{Range, RangeInclusive};
|
||||||
use std::path::Path;
|
use std::path::Path;
|
||||||
use uucore::fs::is_stdin_interactive; // for Iterator::dedup()
|
use uucore::fs::is_stdin_interactive; // for Iterator::dedup()
|
||||||
|
|
||||||
|
@ -37,6 +39,16 @@ static NAME: &str = "sort";
|
||||||
static ABOUT: &str = "Display sorted concatenation of all FILE(s).";
|
static ABOUT: &str = "Display sorted concatenation of all FILE(s).";
|
||||||
static VERSION: &str = env!("CARGO_PKG_VERSION");
|
static VERSION: &str = env!("CARGO_PKG_VERSION");
|
||||||
|
|
||||||
|
const LONG_HELP_KEYS: &str = "The key format is FIELD[.CHAR][OPTIONS][,FIELD[.CHAR]][OPTIONS].
|
||||||
|
|
||||||
|
Fields by default are separated by the first whitespace after a non-whitespace character. Use -t to specify a custom separator.
|
||||||
|
In the default case, whitespace is appended at the beginning of each field. Custom separators however are not included in fields.
|
||||||
|
|
||||||
|
FIELD and CHAR both start at 1 (i.e. they are 1-indexed). If there is no end specified after a comma, the end will be the end of the line.
|
||||||
|
If CHAR is set 0, it means the end of the field. CHAR defaults to 1 for the start position and to 0 for the end position.
|
||||||
|
|
||||||
|
Valid options are: MbdfhnRrV. They override the global options for this key.";
|
||||||
|
|
||||||
static OPT_HUMAN_NUMERIC_SORT: &str = "human-numeric-sort";
|
static OPT_HUMAN_NUMERIC_SORT: &str = "human-numeric-sort";
|
||||||
static OPT_MONTH_SORT: &str = "month-sort";
|
static OPT_MONTH_SORT: &str = "month-sort";
|
||||||
static OPT_NUMERIC_SORT: &str = "numeric-sort";
|
static OPT_NUMERIC_SORT: &str = "numeric-sort";
|
||||||
|
@ -54,6 +66,8 @@ static OPT_OUTPUT: &str = "output";
|
||||||
static OPT_REVERSE: &str = "reverse";
|
static OPT_REVERSE: &str = "reverse";
|
||||||
static OPT_STABLE: &str = "stable";
|
static OPT_STABLE: &str = "stable";
|
||||||
static OPT_UNIQUE: &str = "unique";
|
static OPT_UNIQUE: &str = "unique";
|
||||||
|
static OPT_KEY: &str = "key";
|
||||||
|
static OPT_SEPARATOR: &str = "field-separator";
|
||||||
static OPT_RANDOM: &str = "random-sort";
|
static OPT_RANDOM: &str = "random-sort";
|
||||||
static OPT_ZERO_TERMINATED: &str = "zero-terminated";
|
static OPT_ZERO_TERMINATED: &str = "zero-terminated";
|
||||||
static OPT_PARALLEL: &str = "parallel";
|
static OPT_PARALLEL: &str = "parallel";
|
||||||
|
@ -63,10 +77,11 @@ static ARG_FILES: &str = "files";
|
||||||
|
|
||||||
static DECIMAL_PT: char = '.';
|
static DECIMAL_PT: char = '.';
|
||||||
static THOUSANDS_SEP: char = ',';
|
static THOUSANDS_SEP: char = ',';
|
||||||
|
|
||||||
static NEGATIVE: char = '-';
|
static NEGATIVE: char = '-';
|
||||||
static POSITIVE: char = '+';
|
static POSITIVE: char = '+';
|
||||||
|
|
||||||
#[derive(Eq, Ord, PartialEq, PartialOrd)]
|
#[derive(Eq, Ord, PartialEq, PartialOrd, Clone)]
|
||||||
enum SortMode {
|
enum SortMode {
|
||||||
Numeric,
|
Numeric,
|
||||||
HumanNumeric,
|
HumanNumeric,
|
||||||
|
@ -76,8 +91,12 @@ enum SortMode {
|
||||||
Default,
|
Default,
|
||||||
}
|
}
|
||||||
|
|
||||||
struct Settings {
|
struct GlobalSettings {
|
||||||
mode: SortMode,
|
mode: SortMode,
|
||||||
|
ignore_blanks: bool,
|
||||||
|
ignore_case: bool,
|
||||||
|
dictionary_order: bool,
|
||||||
|
ignore_non_printing: bool,
|
||||||
merge: bool,
|
merge: bool,
|
||||||
reverse: bool,
|
reverse: bool,
|
||||||
outfile: Option<String>,
|
outfile: Option<String>,
|
||||||
|
@ -86,17 +105,21 @@ struct Settings {
|
||||||
check: bool,
|
check: bool,
|
||||||
check_silent: bool,
|
check_silent: bool,
|
||||||
random: bool,
|
random: bool,
|
||||||
compare_fn: fn(&str, &str) -> Ordering,
|
|
||||||
transform_fns: Vec<fn(&str) -> String>,
|
|
||||||
threads: String,
|
|
||||||
salt: String,
|
salt: String,
|
||||||
|
selectors: Vec<FieldSelector>,
|
||||||
|
separator: Option<char>,
|
||||||
|
threads: String,
|
||||||
zero_terminated: bool,
|
zero_terminated: bool,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Default for Settings {
|
impl Default for GlobalSettings {
|
||||||
fn default() -> Settings {
|
fn default() -> GlobalSettings {
|
||||||
Settings {
|
GlobalSettings {
|
||||||
mode: SortMode::Default,
|
mode: SortMode::Default,
|
||||||
|
ignore_blanks: false,
|
||||||
|
ignore_case: false,
|
||||||
|
dictionary_order: false,
|
||||||
|
ignore_non_printing: false,
|
||||||
merge: false,
|
merge: false,
|
||||||
reverse: false,
|
reverse: false,
|
||||||
outfile: None,
|
outfile: None,
|
||||||
|
@ -105,19 +128,330 @@ impl Default for Settings {
|
||||||
check: false,
|
check: false,
|
||||||
check_silent: false,
|
check_silent: false,
|
||||||
random: false,
|
random: false,
|
||||||
compare_fn: default_compare,
|
|
||||||
transform_fns: Vec::new(),
|
|
||||||
threads: String::new(),
|
|
||||||
salt: String::new(),
|
salt: String::new(),
|
||||||
|
selectors: vec![],
|
||||||
|
separator: None,
|
||||||
|
threads: String::new(),
|
||||||
zero_terminated: false,
|
zero_terminated: false,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
struct KeySettings {
|
||||||
|
mode: SortMode,
|
||||||
|
ignore_blanks: bool,
|
||||||
|
ignore_case: bool,
|
||||||
|
dictionary_order: bool,
|
||||||
|
ignore_non_printing: bool,
|
||||||
|
random: bool,
|
||||||
|
reverse: bool,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl From<&GlobalSettings> for KeySettings {
|
||||||
|
fn from(settings: &GlobalSettings) -> Self {
|
||||||
|
Self {
|
||||||
|
mode: settings.mode.clone(),
|
||||||
|
ignore_blanks: settings.ignore_blanks,
|
||||||
|
ignore_case: settings.ignore_case,
|
||||||
|
ignore_non_printing: settings.ignore_non_printing,
|
||||||
|
random: settings.random,
|
||||||
|
reverse: settings.reverse,
|
||||||
|
dictionary_order: settings.dictionary_order,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Represents the string selected by a FieldSelector.
|
||||||
|
enum Selection {
|
||||||
|
/// If we had to transform this selection, we have to store a new string.
|
||||||
|
String(String),
|
||||||
|
/// If there was no transformation, we can store an index into the line.
|
||||||
|
ByIndex(Range<usize>),
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Selection {
|
||||||
|
/// Gets the actual string slice represented by this Selection.
|
||||||
|
fn get_str<'a>(&'a self, line: &'a Line) -> &'a str {
|
||||||
|
match self {
|
||||||
|
Selection::String(string) => string.as_str(),
|
||||||
|
Selection::ByIndex(range) => &line.line[range.to_owned()],
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
type Field = Range<usize>;
|
||||||
|
|
||||||
|
struct Line {
|
||||||
|
line: String,
|
||||||
|
// The common case is not to specify fields. Let's make this fast.
|
||||||
|
selections: SmallVec<[Selection; 1]>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Line {
|
||||||
|
fn new(line: String, settings: &GlobalSettings) -> Self {
|
||||||
|
let fields = if settings
|
||||||
|
.selectors
|
||||||
|
.iter()
|
||||||
|
.any(|selector| selector.needs_tokens())
|
||||||
|
{
|
||||||
|
// Only tokenize if we will need tokens.
|
||||||
|
Some(tokenize(&line, settings.separator))
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
};
|
||||||
|
|
||||||
|
let selections = settings
|
||||||
|
.selectors
|
||||||
|
.iter()
|
||||||
|
.map(|selector| {
|
||||||
|
if let Some(range) = selector.get_selection(&line, fields.as_deref()) {
|
||||||
|
if let Some(transformed) =
|
||||||
|
transform(&line[range.to_owned()], &selector.settings)
|
||||||
|
{
|
||||||
|
Selection::String(transformed)
|
||||||
|
} else {
|
||||||
|
Selection::ByIndex(range.start().to_owned()..range.end() + 1)
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// If there is no match, match the empty string.
|
||||||
|
Selection::ByIndex(0..0)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.collect();
|
||||||
|
Self { line, selections }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Transform this line. Returns None if there's no need to transform.
|
||||||
|
fn transform(line: &str, settings: &KeySettings) -> Option<String> {
|
||||||
|
let mut transformed = None;
|
||||||
|
if settings.ignore_case {
|
||||||
|
transformed = Some(line.to_uppercase());
|
||||||
|
}
|
||||||
|
if settings.ignore_blanks {
|
||||||
|
transformed = Some(
|
||||||
|
transformed
|
||||||
|
.as_deref()
|
||||||
|
.unwrap_or(line)
|
||||||
|
.trim_start()
|
||||||
|
.to_string(),
|
||||||
|
);
|
||||||
|
}
|
||||||
|
if settings.dictionary_order {
|
||||||
|
transformed = Some(remove_nondictionary_chars(
|
||||||
|
transformed.as_deref().unwrap_or(line),
|
||||||
|
));
|
||||||
|
}
|
||||||
|
if settings.ignore_non_printing {
|
||||||
|
transformed = Some(remove_nonprinting_chars(
|
||||||
|
transformed.as_deref().unwrap_or(line),
|
||||||
|
));
|
||||||
|
}
|
||||||
|
transformed
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Tokenize a line into fields.
|
||||||
|
fn tokenize(line: &str, separator: Option<char>) -> Vec<Field> {
|
||||||
|
if let Some(separator) = separator {
|
||||||
|
tokenize_with_separator(line, separator)
|
||||||
|
} else {
|
||||||
|
tokenize_default(line)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// By default fields are separated by the first whitespace after non-whitespace.
|
||||||
|
/// Whitespace is included in fields at the start.
|
||||||
|
fn tokenize_default(line: &str) -> Vec<Field> {
|
||||||
|
let mut tokens = vec![0..0];
|
||||||
|
// pretend that there was whitespace in front of the line
|
||||||
|
let mut previous_was_whitespace = true;
|
||||||
|
for (idx, char) in line.char_indices() {
|
||||||
|
if char.is_whitespace() {
|
||||||
|
if !previous_was_whitespace {
|
||||||
|
tokens.last_mut().unwrap().end = idx;
|
||||||
|
tokens.push(idx..0);
|
||||||
|
}
|
||||||
|
previous_was_whitespace = true;
|
||||||
|
} else {
|
||||||
|
previous_was_whitespace = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
tokens.last_mut().unwrap().end = line.len();
|
||||||
|
tokens
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Split between separators. These separators are not included in fields.
|
||||||
|
fn tokenize_with_separator(line: &str, separator: char) -> Vec<Field> {
|
||||||
|
let mut tokens = vec![0..0];
|
||||||
|
let mut previous_was_separator = false;
|
||||||
|
for (idx, char) in line.char_indices() {
|
||||||
|
if previous_was_separator {
|
||||||
|
tokens.push(idx..0);
|
||||||
|
}
|
||||||
|
if char == separator {
|
||||||
|
tokens.last_mut().unwrap().end = idx;
|
||||||
|
previous_was_separator = true;
|
||||||
|
} else {
|
||||||
|
previous_was_separator = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
tokens.last_mut().unwrap().end = line.len();
|
||||||
|
tokens
|
||||||
|
}
|
||||||
|
|
||||||
|
struct KeyPosition {
|
||||||
|
/// 1-indexed, 0 is invalid.
|
||||||
|
field: usize,
|
||||||
|
/// 1-indexed, 0 is end of field.
|
||||||
|
char: usize,
|
||||||
|
ignore_blanks: bool,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl KeyPosition {
|
||||||
|
fn parse(key: &str, default_char_index: usize, settings: &mut KeySettings) -> Self {
|
||||||
|
let mut field_and_char = key.split('.');
|
||||||
|
let mut field = field_and_char
|
||||||
|
.next()
|
||||||
|
.unwrap_or_else(|| crash!(1, "invalid key `{}`", key));
|
||||||
|
let mut char = field_and_char.next();
|
||||||
|
|
||||||
|
// If there is a char index, we expect options to appear after it. Otherwise we expect them after the field index.
|
||||||
|
let value_with_options = char.as_mut().unwrap_or(&mut field);
|
||||||
|
|
||||||
|
let mut ignore_blanks = settings.ignore_blanks;
|
||||||
|
if let Some(options_start) = value_with_options.chars().position(char::is_alphabetic) {
|
||||||
|
for option in value_with_options[options_start..].chars() {
|
||||||
|
// valid options: MbdfghinRrV
|
||||||
|
match option {
|
||||||
|
'M' => settings.mode = SortMode::Month,
|
||||||
|
'b' => ignore_blanks = true,
|
||||||
|
'd' => settings.dictionary_order = true,
|
||||||
|
'f' => settings.ignore_case = true,
|
||||||
|
'g' => settings.mode = SortMode::GeneralNumeric,
|
||||||
|
'h' => settings.mode = SortMode::HumanNumeric,
|
||||||
|
'i' => settings.ignore_non_printing = true,
|
||||||
|
'n' => settings.mode = SortMode::Numeric,
|
||||||
|
'R' => settings.random = true,
|
||||||
|
'r' => settings.reverse = true,
|
||||||
|
'V' => settings.mode = SortMode::Version,
|
||||||
|
c => {
|
||||||
|
crash!(1, "invalid option for key: `{}`", c)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Strip away option characters from the original value so we can parse it later
|
||||||
|
*value_with_options = &value_with_options[..options_start];
|
||||||
|
}
|
||||||
|
|
||||||
|
let field = field
|
||||||
|
.parse()
|
||||||
|
.unwrap_or_else(|e| crash!(1, "failed to parse field index for key `{}`: {}", key, e));
|
||||||
|
if field == 0 {
|
||||||
|
crash!(1, "field index was 0");
|
||||||
|
}
|
||||||
|
let char = char.map_or(default_char_index, |char| {
|
||||||
|
char.parse().unwrap_or_else(|e| {
|
||||||
|
crash!(
|
||||||
|
1,
|
||||||
|
"failed to parse character index for key `{}`: {}",
|
||||||
|
key,
|
||||||
|
e
|
||||||
|
)
|
||||||
|
})
|
||||||
|
});
|
||||||
|
Self {
|
||||||
|
field,
|
||||||
|
char,
|
||||||
|
ignore_blanks,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
struct FieldSelector {
|
||||||
|
from: KeyPosition,
|
||||||
|
to: Option<KeyPosition>,
|
||||||
|
settings: KeySettings,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl FieldSelector {
|
||||||
|
fn needs_tokens(&self) -> bool {
|
||||||
|
self.from.field != 1 || self.from.char == 0 || self.to.is_some()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Look up the slice that corresponds to this selector for the given line.
|
||||||
|
/// If needs_fields returned false, fields may be None.
|
||||||
|
fn get_selection<'a>(
|
||||||
|
&self,
|
||||||
|
line: &'a str,
|
||||||
|
tokens: Option<&[Field]>,
|
||||||
|
) -> Option<RangeInclusive<usize>> {
|
||||||
|
enum ResolutionErr {
|
||||||
|
TooLow,
|
||||||
|
TooHigh,
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get the index for this line given the KeyPosition
|
||||||
|
fn resolve_index(
|
||||||
|
line: &str,
|
||||||
|
tokens: Option<&[Field]>,
|
||||||
|
position: &KeyPosition,
|
||||||
|
) -> Result<usize, ResolutionErr> {
|
||||||
|
if tokens.map_or(false, |fields| fields.len() < position.field) {
|
||||||
|
Err(ResolutionErr::TooHigh)
|
||||||
|
} else if position.char == 0 {
|
||||||
|
let end = tokens.unwrap()[position.field - 1].end;
|
||||||
|
if end == 0 {
|
||||||
|
Err(ResolutionErr::TooLow)
|
||||||
|
} else {
|
||||||
|
Ok(end - 1)
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
let mut idx = if position.field == 1 {
|
||||||
|
// The first field always starts at 0.
|
||||||
|
// We don't need tokens for this case.
|
||||||
|
0
|
||||||
|
} else {
|
||||||
|
tokens.unwrap()[position.field - 1].start
|
||||||
|
} + position.char
|
||||||
|
- 1;
|
||||||
|
if idx >= line.len() {
|
||||||
|
Err(ResolutionErr::TooHigh)
|
||||||
|
} else {
|
||||||
|
if position.ignore_blanks {
|
||||||
|
if let Some(not_whitespace) =
|
||||||
|
line[idx..].chars().position(|c| !c.is_whitespace())
|
||||||
|
{
|
||||||
|
idx += not_whitespace;
|
||||||
|
} else {
|
||||||
|
return Err(ResolutionErr::TooHigh);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(idx)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Ok(from) = resolve_index(line, tokens, &self.from) {
|
||||||
|
let to = self.to.as_ref().map(|to| resolve_index(line, tokens, &to));
|
||||||
|
match to {
|
||||||
|
Some(Ok(to)) => Some(from..=to),
|
||||||
|
// If `to` was not given or the match would be after the end of the line,
|
||||||
|
// match everything until the end of the line.
|
||||||
|
None | Some(Err(ResolutionErr::TooHigh)) => Some(from..=line.len() - 1),
|
||||||
|
// If `to` is before the start of the line, report no match.
|
||||||
|
// This can happen if the line starts with a separator.
|
||||||
|
Some(Err(ResolutionErr::TooLow)) => None,
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
struct MergeableFile<'a> {
|
struct MergeableFile<'a> {
|
||||||
lines: Lines<BufReader<Box<dyn Read>>>,
|
lines: Lines<BufReader<Box<dyn Read>>>,
|
||||||
current_line: String,
|
current_line: Line,
|
||||||
settings: &'a Settings,
|
settings: &'a GlobalSettings,
|
||||||
}
|
}
|
||||||
|
|
||||||
// BinaryHeap depends on `Ord`. Note that we want to pop smallest items
|
// BinaryHeap depends on `Ord`. Note that we want to pop smallest items
|
||||||
|
@ -125,7 +459,7 @@ struct MergeableFile<'a> {
|
||||||
// trick it into the right order by calling reverse() here.
|
// trick it into the right order by calling reverse() here.
|
||||||
impl<'a> Ord for MergeableFile<'a> {
|
impl<'a> Ord for MergeableFile<'a> {
|
||||||
fn cmp(&self, other: &MergeableFile) -> Ordering {
|
fn cmp(&self, other: &MergeableFile) -> Ordering {
|
||||||
compare_by(&self.current_line, &other.current_line, &self.settings).reverse()
|
compare_by(&self.current_line, &other.current_line, self.settings).reverse()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -137,7 +471,7 @@ impl<'a> PartialOrd for MergeableFile<'a> {
|
||||||
|
|
||||||
impl<'a> PartialEq for MergeableFile<'a> {
|
impl<'a> PartialEq for MergeableFile<'a> {
|
||||||
fn eq(&self, other: &MergeableFile) -> bool {
|
fn eq(&self, other: &MergeableFile) -> bool {
|
||||||
Ordering::Equal == compare_by(&self.current_line, &other.current_line, &self.settings)
|
Ordering::Equal == compare_by(&self.current_line, &other.current_line, self.settings)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -145,11 +479,11 @@ impl<'a> Eq for MergeableFile<'a> {}
|
||||||
|
|
||||||
struct FileMerger<'a> {
|
struct FileMerger<'a> {
|
||||||
heap: BinaryHeap<MergeableFile<'a>>,
|
heap: BinaryHeap<MergeableFile<'a>>,
|
||||||
settings: &'a Settings,
|
settings: &'a GlobalSettings,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> FileMerger<'a> {
|
impl<'a> FileMerger<'a> {
|
||||||
fn new(settings: &'a Settings) -> FileMerger<'a> {
|
fn new(settings: &'a GlobalSettings) -> FileMerger<'a> {
|
||||||
FileMerger {
|
FileMerger {
|
||||||
heap: BinaryHeap::new(),
|
heap: BinaryHeap::new(),
|
||||||
settings,
|
settings,
|
||||||
|
@ -159,7 +493,7 @@ impl<'a> FileMerger<'a> {
|
||||||
if let Some(Ok(next_line)) = lines.next() {
|
if let Some(Ok(next_line)) = lines.next() {
|
||||||
let mergeable_file = MergeableFile {
|
let mergeable_file = MergeableFile {
|
||||||
lines,
|
lines,
|
||||||
current_line: next_line,
|
current_line: Line::new(next_line, &self.settings),
|
||||||
settings: &self.settings,
|
settings: &self.settings,
|
||||||
};
|
};
|
||||||
self.heap.push(mergeable_file);
|
self.heap.push(mergeable_file);
|
||||||
|
@ -174,14 +508,17 @@ impl<'a> Iterator for FileMerger<'a> {
|
||||||
Some(mut current) => {
|
Some(mut current) => {
|
||||||
match current.lines.next() {
|
match current.lines.next() {
|
||||||
Some(Ok(next_line)) => {
|
Some(Ok(next_line)) => {
|
||||||
let ret = replace(&mut current.current_line, next_line);
|
let ret = replace(
|
||||||
|
&mut current.current_line,
|
||||||
|
Line::new(next_line, &self.settings),
|
||||||
|
);
|
||||||
self.heap.push(current);
|
self.heap.push(current);
|
||||||
Some(ret)
|
Some(ret.line)
|
||||||
}
|
}
|
||||||
_ => {
|
_ => {
|
||||||
// Don't put it back in the heap (it's empty/erroring)
|
// Don't put it back in the heap (it's empty/erroring)
|
||||||
// but its first line is still valid.
|
// but its first line is still valid.
|
||||||
Some(current.current_line)
|
Some(current.current_line.line)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -205,7 +542,7 @@ With no FILE, or when FILE is -, read standard input.",
|
||||||
pub fn uumain(args: impl uucore::Args) -> i32 {
|
pub fn uumain(args: impl uucore::Args) -> i32 {
|
||||||
let args = args.collect_str();
|
let args = args.collect_str();
|
||||||
let usage = get_usage();
|
let usage = get_usage();
|
||||||
let mut settings: Settings = Default::default();
|
let mut settings: GlobalSettings = Default::default();
|
||||||
|
|
||||||
let matches = App::new(executable!())
|
let matches = App::new(executable!())
|
||||||
.version(VERSION)
|
.version(VERSION)
|
||||||
|
@ -316,7 +653,21 @@ pub fn uumain(args: impl uucore::Args) -> i32 {
|
||||||
.help("output only the first of an equal run"),
|
.help("output only the first of an equal run"),
|
||||||
)
|
)
|
||||||
.arg(
|
.arg(
|
||||||
Arg::with_name(OPT_ZERO_TERMINATED)
|
Arg::with_name(OPT_KEY)
|
||||||
|
.short("k")
|
||||||
|
.long(OPT_KEY)
|
||||||
|
.help("sort by a key")
|
||||||
|
.long_help(LONG_HELP_KEYS)
|
||||||
|
.multiple(true)
|
||||||
|
.takes_value(true),
|
||||||
|
)
|
||||||
|
.arg(
|
||||||
|
Arg::with_name(OPT_SEPARATOR)
|
||||||
|
.short("t")
|
||||||
|
.long(OPT_SEPARATOR)
|
||||||
|
.help("custom separator for -k")
|
||||||
|
.takes_value(true))
|
||||||
|
.arg(Arg::with_name(OPT_ZERO_TERMINATED)
|
||||||
.short("z")
|
.short("z")
|
||||||
.long(OPT_ZERO_TERMINATED)
|
.long(OPT_ZERO_TERMINATED)
|
||||||
.help("line delimiter is NUL, not newline"),
|
.help("line delimiter is NUL, not newline"),
|
||||||
|
@ -350,14 +701,12 @@ pub fn uumain(args: impl uucore::Args) -> i32 {
|
||||||
for path in &files0_from {
|
for path in &files0_from {
|
||||||
let (reader, _) = open(path.as_str()).expect("Could not read from file specified.");
|
let (reader, _) = open(path.as_str()).expect("Could not read from file specified.");
|
||||||
let buf_reader = BufReader::new(reader);
|
let buf_reader = BufReader::new(reader);
|
||||||
for line in buf_reader.split(b'\0') {
|
for line in buf_reader.split(b'\0').flatten() {
|
||||||
if let Ok(n) = line {
|
files.push(
|
||||||
files.push(
|
std::str::from_utf8(&line)
|
||||||
std::str::from_utf8(&n)
|
.expect("Could not parse string from zero terminated input.")
|
||||||
.expect("Could not parse string from zero terminated input.")
|
.to_string(),
|
||||||
.to_string(),
|
);
|
||||||
);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
files
|
files
|
||||||
|
@ -382,21 +731,17 @@ pub fn uumain(args: impl uucore::Args) -> i32 {
|
||||||
SortMode::Default
|
SortMode::Default
|
||||||
};
|
};
|
||||||
|
|
||||||
|
settings.dictionary_order = matches.is_present(OPT_DICTIONARY_ORDER);
|
||||||
|
settings.ignore_non_printing = matches.is_present(OPT_IGNORE_NONPRINTING);
|
||||||
if matches.is_present(OPT_PARALLEL) {
|
if matches.is_present(OPT_PARALLEL) {
|
||||||
// "0" is default - threads = num of cores
|
// "0" is default - threads = num of cores
|
||||||
settings.threads = matches
|
settings.threads = matches
|
||||||
.value_of(OPT_PARALLEL)
|
.value_of(OPT_PARALLEL)
|
||||||
.map(String::from)
|
.map(String::from)
|
||||||
.unwrap_or("0".to_string());
|
.unwrap_or_else(|| "0".to_string());
|
||||||
env::set_var("RAYON_NUM_THREADS", &settings.threads);
|
env::set_var("RAYON_NUM_THREADS", &settings.threads);
|
||||||
}
|
}
|
||||||
|
|
||||||
if matches.is_present(OPT_DICTIONARY_ORDER) {
|
|
||||||
settings.transform_fns.push(remove_nondictionary_chars);
|
|
||||||
} else if matches.is_present(OPT_IGNORE_NONPRINTING) {
|
|
||||||
settings.transform_fns.push(remove_nonprinting_chars);
|
|
||||||
}
|
|
||||||
|
|
||||||
settings.zero_terminated = matches.is_present(OPT_ZERO_TERMINATED);
|
settings.zero_terminated = matches.is_present(OPT_ZERO_TERMINATED);
|
||||||
settings.merge = matches.is_present(OPT_MERGE);
|
settings.merge = matches.is_present(OPT_MERGE);
|
||||||
|
|
||||||
|
@ -406,13 +751,9 @@ pub fn uumain(args: impl uucore::Args) -> i32 {
|
||||||
settings.check = true;
|
settings.check = true;
|
||||||
};
|
};
|
||||||
|
|
||||||
if matches.is_present(OPT_IGNORE_CASE) {
|
settings.ignore_case = matches.is_present(OPT_IGNORE_CASE);
|
||||||
settings.transform_fns.push(|s| s.to_uppercase());
|
|
||||||
}
|
|
||||||
|
|
||||||
if matches.is_present(OPT_IGNORE_BLANKS) {
|
settings.ignore_blanks = matches.is_present(OPT_IGNORE_BLANKS);
|
||||||
settings.transform_fns.push(|s| s.trim_start().to_string());
|
|
||||||
}
|
|
||||||
|
|
||||||
settings.outfile = matches.value_of(OPT_OUTPUT).map(String::from);
|
settings.outfile = matches.value_of(OPT_OUTPUT).map(String::from);
|
||||||
settings.reverse = matches.is_present(OPT_REVERSE);
|
settings.reverse = matches.is_present(OPT_REVERSE);
|
||||||
|
@ -424,27 +765,64 @@ pub fn uumain(args: impl uucore::Args) -> i32 {
|
||||||
settings.salt = get_rand_string();
|
settings.salt = get_rand_string();
|
||||||
}
|
}
|
||||||
|
|
||||||
//let mut files = matches.free;
|
|
||||||
if files.is_empty() {
|
if files.is_empty() {
|
||||||
/* if no file, default to stdin */
|
/* if no file, default to stdin */
|
||||||
files.push("-".to_owned());
|
files.push("-".to_owned());
|
||||||
} else if settings.check && files.len() != 1 {
|
} else if settings.check && files.len() != 1 {
|
||||||
crash!(1, "sort: extra operand `{}' not allowed with -c", files[1])
|
crash!(1, "extra operand `{}' not allowed with -c", files[1])
|
||||||
}
|
}
|
||||||
|
|
||||||
settings.compare_fn = match settings.mode {
|
if let Some(arg) = matches.args.get(OPT_SEPARATOR) {
|
||||||
SortMode::Numeric => numeric_compare,
|
let separator = arg.vals[0].to_string_lossy();
|
||||||
SortMode::GeneralNumeric => general_numeric_compare,
|
let separator = separator;
|
||||||
SortMode::HumanNumeric => human_numeric_size_compare,
|
if separator.len() != 1 {
|
||||||
SortMode::Month => month_compare,
|
crash!(1, "separator must be exactly one character long");
|
||||||
SortMode::Version => version_compare,
|
}
|
||||||
SortMode::Default => default_compare,
|
settings.separator = Some(separator.chars().next().unwrap())
|
||||||
};
|
}
|
||||||
|
|
||||||
exec(files, &mut settings)
|
if matches.is_present(OPT_KEY) {
|
||||||
|
for key in &matches.args[OPT_KEY].vals {
|
||||||
|
let key = key.to_string_lossy();
|
||||||
|
let mut from_to = key.split(',');
|
||||||
|
let mut key_settings = KeySettings::from(&settings);
|
||||||
|
let from = KeyPosition::parse(
|
||||||
|
from_to
|
||||||
|
.next()
|
||||||
|
.unwrap_or_else(|| crash!(1, "invalid key `{}`", key)),
|
||||||
|
1,
|
||||||
|
&mut key_settings,
|
||||||
|
);
|
||||||
|
let to = from_to
|
||||||
|
.next()
|
||||||
|
.map(|to| KeyPosition::parse(to, 0, &mut key_settings));
|
||||||
|
let field_selector = FieldSelector {
|
||||||
|
from,
|
||||||
|
to,
|
||||||
|
settings: key_settings,
|
||||||
|
};
|
||||||
|
settings.selectors.push(field_selector);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if !settings.stable || !matches.is_present(OPT_KEY) {
|
||||||
|
// add a default selector matching the whole line
|
||||||
|
let key_settings = KeySettings::from(&settings);
|
||||||
|
settings.selectors.push(FieldSelector {
|
||||||
|
from: KeyPosition {
|
||||||
|
field: 1,
|
||||||
|
char: 1,
|
||||||
|
ignore_blanks: key_settings.ignore_blanks,
|
||||||
|
},
|
||||||
|
to: None,
|
||||||
|
settings: key_settings,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
exec(files, &settings)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn exec(files: Vec<String>, settings: &mut Settings) -> i32 {
|
fn exec(files: Vec<String>, settings: &GlobalSettings) -> i32 {
|
||||||
let mut lines = Vec::new();
|
let mut lines = Vec::new();
|
||||||
let mut file_merger = FileMerger::new(&settings);
|
let mut file_merger = FileMerger::new(&settings);
|
||||||
|
|
||||||
|
@ -459,26 +837,27 @@ fn exec(files: Vec<String>, settings: &mut Settings) -> i32 {
|
||||||
if settings.merge {
|
if settings.merge {
|
||||||
file_merger.push_file(buf_reader.lines());
|
file_merger.push_file(buf_reader.lines());
|
||||||
} else if settings.zero_terminated {
|
} else if settings.zero_terminated {
|
||||||
for line in buf_reader.split(b'\0') {
|
for line in buf_reader.split(b'\0').flatten() {
|
||||||
if let Ok(n) = line {
|
lines.push(Line::new(
|
||||||
lines.push(
|
std::str::from_utf8(&line)
|
||||||
std::str::from_utf8(&n)
|
.expect("Could not parse string from zero terminated input.")
|
||||||
.expect("Could not parse string from zero terminated input.")
|
.to_string(),
|
||||||
.to_string(),
|
&settings,
|
||||||
);
|
));
|
||||||
}
|
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
for line in buf_reader.lines() {
|
for line in buf_reader.lines() {
|
||||||
if let Ok(n) = line {
|
if let Ok(n) = line {
|
||||||
lines.push(n);
|
lines.push(Line::new(n, &settings));
|
||||||
|
} else {
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if settings.check {
|
if settings.check {
|
||||||
return exec_check_file(lines, &settings);
|
return exec_check_file(&lines, &settings);
|
||||||
} else {
|
} else {
|
||||||
sort_by(&mut lines, &settings);
|
sort_by(&mut lines, &settings);
|
||||||
}
|
}
|
||||||
|
@ -490,29 +869,31 @@ fn exec(files: Vec<String>, settings: &mut Settings) -> i32 {
|
||||||
print_sorted(file_merger, &settings)
|
print_sorted(file_merger, &settings)
|
||||||
}
|
}
|
||||||
} else if settings.mode == SortMode::Default && settings.unique {
|
} else if settings.mode == SortMode::Default && settings.unique {
|
||||||
print_sorted(lines.iter().dedup(), &settings)
|
print_sorted(lines.into_iter().map(|line| line.line).dedup(), &settings)
|
||||||
} else if settings.mode == SortMode::Month && settings.unique {
|
} else if settings.mode == SortMode::Month && settings.unique {
|
||||||
print_sorted(
|
print_sorted(
|
||||||
lines
|
lines
|
||||||
.iter()
|
.into_iter()
|
||||||
|
.map(|line| line.line)
|
||||||
.dedup_by(|a, b| get_months_dedup(a) == get_months_dedup(b)),
|
.dedup_by(|a, b| get_months_dedup(a) == get_months_dedup(b)),
|
||||||
&settings,
|
&settings,
|
||||||
)
|
)
|
||||||
} else if settings.unique {
|
} else if settings.unique {
|
||||||
print_sorted(
|
print_sorted(
|
||||||
lines
|
lines
|
||||||
.iter()
|
.into_iter()
|
||||||
.dedup_by(|a, b| get_num_dedup(a, &settings) == get_num_dedup(b, &settings)),
|
.map(|line| line.line)
|
||||||
|
.dedup_by(|a, b| get_num_dedup(a, settings) == get_num_dedup(b, settings)),
|
||||||
&settings,
|
&settings,
|
||||||
)
|
)
|
||||||
} else {
|
} else {
|
||||||
print_sorted(lines.iter(), &settings)
|
print_sorted(lines.into_iter().map(|line| line.line), &settings)
|
||||||
}
|
}
|
||||||
|
|
||||||
0
|
0
|
||||||
}
|
}
|
||||||
|
|
||||||
fn exec_check_file(unwrapped_lines: Vec<String>, settings: &Settings) -> i32 {
|
fn exec_check_file(unwrapped_lines: &[Line], settings: &GlobalSettings) -> i32 {
|
||||||
// errors yields the line before each disorder,
|
// errors yields the line before each disorder,
|
||||||
// plus the last line (quirk of .coalesce())
|
// plus the last line (quirk of .coalesce())
|
||||||
let mut errors =
|
let mut errors =
|
||||||
|
@ -544,51 +925,45 @@ fn exec_check_file(unwrapped_lines: Vec<String>, settings: &Settings) -> i32 {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[inline(always)]
|
fn sort_by(lines: &mut Vec<Line>, settings: &GlobalSettings) {
|
||||||
fn transform(line: &str, settings: &Settings) -> String {
|
|
||||||
let mut transformed = line.to_owned();
|
|
||||||
for transform_fn in &settings.transform_fns {
|
|
||||||
transformed = transform_fn(&transformed);
|
|
||||||
}
|
|
||||||
|
|
||||||
transformed
|
|
||||||
}
|
|
||||||
|
|
||||||
#[inline(always)]
|
|
||||||
fn sort_by(lines: &mut Vec<String>, settings: &Settings) {
|
|
||||||
lines.par_sort_by(|a, b| compare_by(a, b, &settings))
|
lines.par_sort_by(|a, b| compare_by(a, b, &settings))
|
||||||
}
|
}
|
||||||
|
|
||||||
fn compare_by(a: &str, b: &str, settings: &Settings) -> Ordering {
|
fn compare_by(a: &Line, b: &Line, global_settings: &GlobalSettings) -> Ordering {
|
||||||
let (a_transformed, b_transformed): (String, String);
|
for (idx, selector) in global_settings.selectors.iter().enumerate() {
|
||||||
let (a, b) = if !settings.transform_fns.is_empty() {
|
let a = a.selections[idx].get_str(a);
|
||||||
a_transformed = transform(&a, &settings);
|
let b = b.selections[idx].get_str(b);
|
||||||
b_transformed = transform(&b, &settings);
|
let settings = &selector.settings;
|
||||||
(a_transformed.as_str(), b_transformed.as_str())
|
|
||||||
} else {
|
|
||||||
(a, b)
|
|
||||||
};
|
|
||||||
|
|
||||||
// 1st Compare
|
let cmp: Ordering = if settings.random {
|
||||||
let mut cmp: Ordering = if settings.random {
|
random_shuffle(a, b, global_settings.salt.clone())
|
||||||
random_shuffle(a, b, settings.salt.clone())
|
|
||||||
} else {
|
|
||||||
(settings.compare_fn)(a, b)
|
|
||||||
};
|
|
||||||
|
|
||||||
// Call "last resort compare" on any equal
|
|
||||||
if cmp == Ordering::Equal {
|
|
||||||
if settings.random || settings.stable || settings.unique {
|
|
||||||
cmp = Ordering::Equal
|
|
||||||
} else {
|
} else {
|
||||||
cmp = default_compare(a, b)
|
(match settings.mode {
|
||||||
|
SortMode::Numeric => numeric_compare,
|
||||||
|
SortMode::GeneralNumeric => general_numeric_compare,
|
||||||
|
SortMode::HumanNumeric => human_numeric_size_compare,
|
||||||
|
SortMode::Month => month_compare,
|
||||||
|
SortMode::Version => version_compare,
|
||||||
|
SortMode::Default => default_compare,
|
||||||
|
})(a, b)
|
||||||
};
|
};
|
||||||
|
if cmp != Ordering::Equal {
|
||||||
|
return if settings.reverse { cmp.reverse() } else { cmp };
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Call "last resort compare" if all selectors returned Equal
|
||||||
|
|
||||||
|
let cmp = if global_settings.random || global_settings.stable || global_settings.unique {
|
||||||
|
Ordering::Equal
|
||||||
|
} else {
|
||||||
|
default_compare(&a.line, &b.line)
|
||||||
};
|
};
|
||||||
|
|
||||||
if settings.reverse {
|
if global_settings.reverse {
|
||||||
return cmp.reverse();
|
cmp.reverse()
|
||||||
} else {
|
} else {
|
||||||
return cmp;
|
cmp
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -617,8 +992,8 @@ fn leading_num_common(a: &str) -> &str {
|
||||||
&& !c.eq(&'e')
|
&& !c.eq(&'e')
|
||||||
&& !c.eq(&'E')
|
&& !c.eq(&'E')
|
||||||
// check whether first char is + or -
|
// check whether first char is + or -
|
||||||
&& !a.chars().nth(0).unwrap_or('\0').eq(&POSITIVE)
|
&& !a.chars().next().unwrap_or('\0').eq(&POSITIVE)
|
||||||
&& !a.chars().nth(0).unwrap_or('\0').eq(&NEGATIVE)
|
&& !a.chars().next().unwrap_or('\0').eq(&NEGATIVE)
|
||||||
{
|
{
|
||||||
// Strip string of non-numeric trailing chars
|
// Strip string of non-numeric trailing chars
|
||||||
s = &a[..idx];
|
s = &a[..idx];
|
||||||
|
@ -640,9 +1015,9 @@ fn get_leading_num(a: &str) -> &str {
|
||||||
|
|
||||||
let a = leading_num_common(a);
|
let a = leading_num_common(a);
|
||||||
|
|
||||||
// GNU numeric sort doesn't recognize '+' or 'e' notation so we strip trailing chars
|
// GNU numeric sort doesn't recognize '+' or 'e' notation so we strip
|
||||||
for (idx, c) in a.char_indices() {
|
for (idx, c) in a.char_indices() {
|
||||||
if c.eq(&'e') || c.eq(&'E') || a.chars().nth(0).unwrap_or('\0').eq(&POSITIVE) {
|
if c.eq(&'e') || c.eq(&'E') || a.chars().next().unwrap_or('\0').eq(&POSITIVE) {
|
||||||
s = &a[..idx];
|
s = &a[..idx];
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
@ -670,12 +1045,9 @@ fn get_leading_gen(a: &str) -> &str {
|
||||||
// Cleanup raw stripped strings
|
// Cleanup raw stripped strings
|
||||||
for c in p_iter.to_owned() {
|
for c in p_iter.to_owned() {
|
||||||
let next_char_numeric = p_iter.peek().unwrap_or(&'\0').is_numeric();
|
let next_char_numeric = p_iter.peek().unwrap_or(&'\0').is_numeric();
|
||||||
// Only general numeric recognizes e notation and the '+' sign
|
// Only general numeric recognizes e notation and, see block below, the '+' sign
|
||||||
if (c.eq(&'e') && !next_char_numeric)
|
// Only GNU (non-general) numeric recognize thousands seperators, takes only leading #
|
||||||
|| (c.eq(&'E') && !next_char_numeric)
|
if (c.eq(&'e') || c.eq(&'E')) && !next_char_numeric || c.eq(&THOUSANDS_SEP) {
|
||||||
// Only GNU (non-general) numeric recognize thousands seperators, takes only leading #
|
|
||||||
|| c.eq(&THOUSANDS_SEP)
|
|
||||||
{
|
|
||||||
result = a.split(c).next().unwrap_or("");
|
result = a.split(c).next().unwrap_or("");
|
||||||
break;
|
break;
|
||||||
// If positive sign and next char is not numeric, split at postive sign at keep trailing numbers
|
// If positive sign and next char is not numeric, split at postive sign at keep trailing numbers
|
||||||
|
@ -724,19 +1096,17 @@ fn get_months_dedup(a: &str) -> String {
|
||||||
// *For all dedups/uniques expect default we must compare leading numbers*
|
// *For all dedups/uniques expect default we must compare leading numbers*
|
||||||
// Also note numeric compare and unique output is specifically *not* the same as a "sort | uniq"
|
// Also note numeric compare and unique output is specifically *not* the same as a "sort | uniq"
|
||||||
// See: https://www.gnu.org/software/coreutils/manual/html_node/sort-invocation.html
|
// See: https://www.gnu.org/software/coreutils/manual/html_node/sort-invocation.html
|
||||||
fn get_num_dedup<'a>(a: &'a str, settings: &&mut Settings) -> &'a str {
|
fn get_num_dedup<'a>(a: &'a str, settings: &GlobalSettings) -> &'a str {
|
||||||
// Trim and remove any leading zeros
|
// Trim and remove any leading zeros
|
||||||
let s = a.trim().trim_start_matches('0');
|
let s = a.trim().trim_start_matches('0');
|
||||||
|
|
||||||
// Get first char
|
// Get first char
|
||||||
let c = s.chars().nth(0).unwrap_or('\0');
|
let c = s.chars().next().unwrap_or('\0');
|
||||||
|
|
||||||
// Empty lines and non-number lines are treated as the same for dedup
|
// Empty lines and non-number lines are treated as the same for dedup
|
||||||
if s.is_empty() {
|
|
||||||
""
|
|
||||||
} else if !c.eq(&NEGATIVE) && !c.is_numeric() {
|
|
||||||
""
|
|
||||||
// Prepare lines for comparison of only the numerical leading numbers
|
// Prepare lines for comparison of only the numerical leading numbers
|
||||||
|
if s.is_empty() || (!c.eq(&NEGATIVE) && !c.is_numeric()) {
|
||||||
|
""
|
||||||
} else {
|
} else {
|
||||||
let result = match settings.mode {
|
let result = match settings.mode {
|
||||||
SortMode::Numeric => get_leading_num(s),
|
SortMode::Numeric => get_leading_num(s),
|
||||||
|
@ -794,7 +1164,7 @@ fn numeric_compare(a: &str, b: &str) -> Ordering {
|
||||||
let sa = get_leading_num(a);
|
let sa = get_leading_num(a);
|
||||||
let sb = get_leading_num(b);
|
let sb = get_leading_num(b);
|
||||||
|
|
||||||
// Avoids a string alloc for every line to remove thousands seperators here
|
// Avoids a string alloc for every line to remove thousands seperators here
|
||||||
// instead of inside the get_leading_num function, which is a HUGE performance benefit
|
// instead of inside the get_leading_num function, which is a HUGE performance benefit
|
||||||
let ta = remove_thousands_sep(sa);
|
let ta = remove_thousands_sep(sa);
|
||||||
let tb = remove_thousands_sep(sb);
|
let tb = remove_thousands_sep(sb);
|
||||||
|
@ -944,6 +1314,7 @@ fn month_parse(line: &str) -> Month {
|
||||||
}
|
}
|
||||||
|
|
||||||
fn month_compare(a: &str, b: &str) -> Ordering {
|
fn month_compare(a: &str, b: &str) -> Ordering {
|
||||||
|
#![allow(clippy::comparison_chain)]
|
||||||
let ma = month_parse(a);
|
let ma = month_parse(a);
|
||||||
let mb = month_parse(b);
|
let mb = month_parse(b);
|
||||||
|
|
||||||
|
@ -986,32 +1357,29 @@ fn remove_nonprinting_chars(s: &str) -> String {
|
||||||
.collect::<String>()
|
.collect::<String>()
|
||||||
}
|
}
|
||||||
|
|
||||||
fn print_sorted<S, T: Iterator<Item = S>>(iter: T, settings: &Settings)
|
fn print_sorted<T: Iterator<Item = String>>(iter: T, settings: &GlobalSettings) {
|
||||||
where
|
|
||||||
S: std::fmt::Display,
|
|
||||||
{
|
|
||||||
let mut file: Box<dyn Write> = match settings.outfile {
|
let mut file: Box<dyn Write> = match settings.outfile {
|
||||||
Some(ref filename) => match File::create(Path::new(&filename)) {
|
Some(ref filename) => match File::create(Path::new(&filename)) {
|
||||||
Ok(f) => Box::new(BufWriter::new(f)) as Box<dyn Write>,
|
Ok(f) => Box::new(BufWriter::new(f)) as Box<dyn Write>,
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
show_error!("sort: {0}: {1}", filename, e.to_string());
|
show_error!("{0}: {1}", filename, e.to_string());
|
||||||
panic!("Could not open output file");
|
panic!("Could not open output file");
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
None => Box::new(stdout()) as Box<dyn Write>,
|
None => Box::new(BufWriter::new(stdout())) as Box<dyn Write>,
|
||||||
};
|
};
|
||||||
|
|
||||||
if settings.zero_terminated {
|
if settings.zero_terminated {
|
||||||
for line in iter {
|
for line in iter {
|
||||||
let str = format!("{}\0", line);
|
crash_if_err!(1, file.write_all(line.as_bytes()));
|
||||||
crash_if_err!(1, file.write_all(str.as_bytes()));
|
crash_if_err!(1, file.write_all("\0".as_bytes()));
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
for line in iter {
|
for line in iter {
|
||||||
let str = format!("{}\n", line);
|
crash_if_err!(1, file.write_all(line.as_bytes()));
|
||||||
crash_if_err!(1, file.write_all(str.as_bytes()));
|
crash_if_err!(1, file.write_all("\n".as_bytes()));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
crash_if_err!(1, file.flush());
|
||||||
}
|
}
|
||||||
|
|
||||||
// from cat.rs
|
// from cat.rs
|
||||||
|
@ -1024,7 +1392,7 @@ fn open(path: &str) -> Option<(Box<dyn Read>, bool)> {
|
||||||
match File::open(Path::new(path)) {
|
match File::open(Path::new(path)) {
|
||||||
Ok(f) => Some((Box::new(f) as Box<dyn Read>, false)),
|
Ok(f) => Some((Box::new(f) as Box<dyn Read>, false)),
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
show_error!("sort: {0}: {1}", path, e.to_string());
|
show_error!("{0}: {1}", path, e.to_string());
|
||||||
None
|
None
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1097,4 +1465,34 @@ mod tests {
|
||||||
|
|
||||||
assert_eq!(Ordering::Less, version_compare(a, b));
|
assert_eq!(Ordering::Less, version_compare(a, b));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_random_compare() {
|
||||||
|
let a = "9";
|
||||||
|
let b = "9";
|
||||||
|
let c = get_rand_string();
|
||||||
|
|
||||||
|
assert_eq!(Ordering::Equal, random_shuffle(a, b, c));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_tokenize_fields() {
|
||||||
|
let line = "foo bar b x";
|
||||||
|
assert_eq!(tokenize(line, None), vec![0..3, 3..7, 7..9, 9..14,],);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_tokenize_fields_leading_whitespace() {
|
||||||
|
let line = " foo bar b x";
|
||||||
|
assert_eq!(tokenize(line, None), vec![0..7, 7..11, 11..13, 13..18,]);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_tokenize_fields_custom_separator() {
|
||||||
|
let line = "aaa foo bar b x";
|
||||||
|
assert_eq!(
|
||||||
|
tokenize(line, Some('a')),
|
||||||
|
vec![0..0, 1..1, 2..2, 3..9, 10..18,]
|
||||||
|
);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -185,10 +185,10 @@ fn test_dictionary_order2() {
|
||||||
fn test_non_printing_chars() {
|
fn test_non_printing_chars() {
|
||||||
for non_printing_chars_param in vec!["-i"] {
|
for non_printing_chars_param in vec!["-i"] {
|
||||||
new_ucmd!()
|
new_ucmd!()
|
||||||
.pipe_in("a👦🏻aa b\naaaa b")
|
.pipe_in("a👦🏻aa\naaaa")
|
||||||
.arg(non_printing_chars_param)
|
.arg(non_printing_chars_param)
|
||||||
.succeeds()
|
.succeeds()
|
||||||
.stdout_only("aaaa b\na👦🏻aa b\n");
|
.stdout_only("a👦🏻aa\naaaa\n");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -307,6 +307,166 @@ fn test_numeric_unique_ints2() {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_keys_open_ended() {
|
||||||
|
let input = "aa bb cc\ndd aa ff\ngg aa cc\n";
|
||||||
|
new_ucmd!()
|
||||||
|
.args(&["-k", "2.2"])
|
||||||
|
.pipe_in(input)
|
||||||
|
.succeeds()
|
||||||
|
.stdout_only("gg aa cc\ndd aa ff\naa bb cc\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_keys_closed_range() {
|
||||||
|
let input = "aa bb cc\ndd aa ff\ngg aa cc\n";
|
||||||
|
new_ucmd!()
|
||||||
|
.args(&["-k", "2.2,2.2"])
|
||||||
|
.pipe_in(input)
|
||||||
|
.succeeds()
|
||||||
|
.stdout_only("dd aa ff\ngg aa cc\naa bb cc\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_keys_multiple_ranges() {
|
||||||
|
let input = "aa bb cc\ndd aa ff\ngg aa cc\n";
|
||||||
|
new_ucmd!()
|
||||||
|
.args(&["-k", "2,2", "-k", "3,3"])
|
||||||
|
.pipe_in(input)
|
||||||
|
.succeeds()
|
||||||
|
.stdout_only("gg aa cc\ndd aa ff\naa bb cc\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_keys_no_field_match() {
|
||||||
|
let input = "aa aa aa aa\naa bb cc\ndd aa ff\n";
|
||||||
|
new_ucmd!()
|
||||||
|
.args(&["-k", "4,4"])
|
||||||
|
.pipe_in(input)
|
||||||
|
.succeeds()
|
||||||
|
.stdout_only("aa bb cc\ndd aa ff\naa aa aa aa\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_keys_no_char_match() {
|
||||||
|
let input = "aaa\nba\nc\n";
|
||||||
|
new_ucmd!()
|
||||||
|
.args(&["-k", "1.2"])
|
||||||
|
.pipe_in(input)
|
||||||
|
.succeeds()
|
||||||
|
.stdout_only("c\nba\naaa\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_keys_custom_separator() {
|
||||||
|
let input = "aaxbbxcc\nddxaaxff\nggxaaxcc\n";
|
||||||
|
new_ucmd!()
|
||||||
|
.args(&["-k", "2.2,2.2", "-t", "x"])
|
||||||
|
.pipe_in(input)
|
||||||
|
.succeeds()
|
||||||
|
.stdout_only("ddxaaxff\nggxaaxcc\naaxbbxcc\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_keys_invalid_field() {
|
||||||
|
new_ucmd!()
|
||||||
|
.args(&["-k", "1."])
|
||||||
|
.fails()
|
||||||
|
.stderr_only("sort: error: failed to parse character index for key `1.`: cannot parse integer from empty string");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_keys_invalid_field_option() {
|
||||||
|
new_ucmd!()
|
||||||
|
.args(&["-k", "1.1x"])
|
||||||
|
.fails()
|
||||||
|
.stderr_only("sort: error: invalid option for key: `x`");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_keys_invalid_field_zero() {
|
||||||
|
new_ucmd!()
|
||||||
|
.args(&["-k", "0.1"])
|
||||||
|
.fails()
|
||||||
|
.stderr_only("sort: error: field index was 0");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_keys_with_options() {
|
||||||
|
let input = "aa 3 cc\ndd 1 ff\ngg 2 cc\n";
|
||||||
|
for param in &[
|
||||||
|
&["-k", "2,2n"][..],
|
||||||
|
&["-k", "2n,2"][..],
|
||||||
|
&["-k", "2,2", "-n"][..],
|
||||||
|
] {
|
||||||
|
new_ucmd!()
|
||||||
|
.args(param)
|
||||||
|
.pipe_in(input)
|
||||||
|
.succeeds()
|
||||||
|
.stdout_only("dd 1 ff\ngg 2 cc\naa 3 cc\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_keys_with_options_blanks_start() {
|
||||||
|
let input = "aa 3 cc\ndd 1 ff\ngg 2 cc\n";
|
||||||
|
for param in &[&["-k", "2b,2"][..], &["-k", "2,2", "-b"][..]] {
|
||||||
|
new_ucmd!()
|
||||||
|
.args(param)
|
||||||
|
.pipe_in(input)
|
||||||
|
.succeeds()
|
||||||
|
.stdout_only("dd 1 ff\ngg 2 cc\naa 3 cc\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_keys_with_options_blanks_end() {
|
||||||
|
let input = "a b
|
||||||
|
a b
|
||||||
|
a b
|
||||||
|
";
|
||||||
|
new_ucmd!()
|
||||||
|
.args(&["-k", "1,2.1b", "-s"])
|
||||||
|
.pipe_in(input)
|
||||||
|
.succeeds()
|
||||||
|
.stdout_only(
|
||||||
|
"a b
|
||||||
|
a b
|
||||||
|
a b
|
||||||
|
",
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_keys_stable() {
|
||||||
|
let input = "a b
|
||||||
|
a b
|
||||||
|
a b
|
||||||
|
";
|
||||||
|
new_ucmd!()
|
||||||
|
.args(&["-k", "1,2.1", "-s"])
|
||||||
|
.pipe_in(input)
|
||||||
|
.succeeds()
|
||||||
|
.stdout_only(
|
||||||
|
"a b
|
||||||
|
a b
|
||||||
|
a b
|
||||||
|
",
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_keys_empty_match() {
|
||||||
|
let input = "a a a a
|
||||||
|
aaaa
|
||||||
|
";
|
||||||
|
new_ucmd!()
|
||||||
|
.args(&["-k", "1,1", "-t", "a"])
|
||||||
|
.pipe_in(input)
|
||||||
|
.succeeds()
|
||||||
|
.stdout_only(input);
|
||||||
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_zero_terminated() {
|
fn test_zero_terminated() {
|
||||||
test_helper("zero-terminated", "-z");
|
test_helper("zero-terminated", "-z");
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue