From 49c9d8c9018eeeb2ae68a43f3763e087c44b1653 Mon Sep 17 00:00:00 2001
From: Michael Debertol <michael.debertol@gmail.com>
Date: Sat, 10 Apr 2021 14:54:58 +0200
Subject: [PATCH] sort: implement -k and -t support (#1996)

* sort: implement basic -k and -t support

This allows to specify keys after the -k flag and a custom field
separator using -t.

Support for options for specific keys is still missing, and the -b flag
is not passed down correctly.

* sort: implement support for key options

* remove unstable feature use

* don't pipe in input when we expect a failure

* only tokenize when needed, remove a clone()

* improve comments

* fix clippy lints

* re-add test

* buffer writes to stdout

* fix ignore_non_printing

and make the test fail in case it is broken :)

* move attribute to the right position

* add more tests

* add my name to the copyright section

* disallow dead code

* move a comment

* re-add a loc

* use smallvec for a perf improvement in the common case

* add BENCHMARKING.md

* add ignore_case to benchmarks
---
 Cargo.lock                  |  11 +-
 src/uu/sort/BENCHMARKING.md |  33 ++
 src/uu/sort/Cargo.toml      |   1 +
 src/uu/sort/src/sort.rs     | 674 ++++++++++++++++++++++++++++--------
 tests/by-util/test_sort.rs  | 164 ++++++++-
 5 files changed, 742 insertions(+), 141 deletions(-)
 create mode 100644 src/uu/sort/BENCHMARKING.md

diff --git a/Cargo.lock b/Cargo.lock
index 615c1a961..d2b2fa32e 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1,5 +1,7 @@
 # This file is automatically @generated by Cargo.
 # It is not intended for manual editing.
+version = 3
+
 [[package]]
 name = "advapi32-sys"
 version = "0.2.0"
@@ -1362,6 +1364,12 @@ dependencies = [
  "maybe-uninit",
 ]
 
+[[package]]
+name = "smallvec"
+version = "1.6.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fe0f37c9e8f3c5a4a66ad655a93c74daac4ad00c441533bf5c6e7990bb42604e"
+
 [[package]]
 name = "strsim"
 version = "0.8.0"
@@ -1816,7 +1824,7 @@ dependencies = [
  "quickcheck",
  "rand 0.7.3",
  "rand_chacha",
- "smallvec",
+ "smallvec 0.6.14",
  "uucore",
  "uucore_procs",
 ]
@@ -2289,6 +2297,7 @@ dependencies = [
  "rand 0.7.3",
  "rayon",
  "semver",
+ "smallvec 1.6.1",
  "uucore",
  "uucore_procs",
 ]
diff --git a/src/uu/sort/BENCHMARKING.md b/src/uu/sort/BENCHMARKING.md
new file mode 100644
index 000000000..b20db014d
--- /dev/null
+++ b/src/uu/sort/BENCHMARKING.md
@@ -0,0 +1,33 @@
+# Benchmarking sort
+
+Most of the time when sorting is spent comparing lines. The comparison functions however differ based
+on which arguments are passed to `sort`, therefore it is important to always benchmark multiple scenarios.
+This is an overwiew over what was benchmarked, and if you make changes to `sort`, you are encouraged to check
+how performance was affected for the workloads listed below. Feel free to add other workloads to the
+list that we should improve / make sure not to regress.
+
+Run `cargo build --release` before benchmarking after you make a change!
+
+## Sorting a wordlist
+- Get a wordlist, for example with [words](https://en.wikipedia.org/wiki/Words_(Unix)) on Linux. The exact wordlist
+  doesn't matter for performance comparisons. In this example I'm using `/usr/share/dict/american-english` as the wordlist.
+- Shuffle the wordlist by running `sort -R /usr/share/dict/american-english > shuffled_wordlist.txt`.
+- Benchmark sorting the wordlist with hyperfine: `hyperfine "target/release/coreutils sort shuffled_wordlist.txt -o output.txt"`.
+
+## Sorting a wordlist with ignore_case
+- Same wordlist as above
+- Benchmark sorting the wordlist ignoring the case with hyperfine: `hyperfine "target/release/coreutils sort shuffled_wordlist.txt -f -o output.txt"`.
+
+## Sorting numbers
+- Generate a list of numbers: `seq 0 100000 | sort -R > shuffled_numbers.txt`.
+- Benchmark numeric sorting with hyperfine: `hyperfine "target/release/coreutils sort shuffled_numbers.txt -n -o output.txt"`.
+
+## Stdout and stdin performance
+Try to run the above benchmarks by piping the input through stdin (standard input) and redirect the
+output through stdout (standard output):
+- Remove the input file from the arguments and add `cat [inputfile] | ` at the beginning.
+- Remove `-o output.txt` and add `> output.txt` at the end.
+
+Example: `hyperfine "target/release/coreutils sort shuffled_numbers.txt -n -o output.txt"` becomes
+`hyperfine "cat shuffled_numbers.txt | target/release/coreutils sort -n > output.txt`
+- Check that performance is similar to the original benchmark.
\ No newline at end of file
diff --git a/src/uu/sort/Cargo.toml b/src/uu/sort/Cargo.toml
index 814e4bbba..96e88ebc9 100644
--- a/src/uu/sort/Cargo.toml
+++ b/src/uu/sort/Cargo.toml
@@ -21,6 +21,7 @@ clap = "2.33"
 fnv = "1.0.7"
 itertools = "0.8.0"
 semver = "0.9.0"
+smallvec = "1.6.1"
 uucore = { version=">=0.0.8", package="uucore", path="../../uucore", features=["fs"] }
 uucore_procs = { version=">=0.0.5", package="uucore_procs", path="../../uucore_procs" }
 
diff --git a/src/uu/sort/src/sort.rs b/src/uu/sort/src/sort.rs
index 4aa3fbed2..4f669f578 100644
--- a/src/uu/sort/src/sort.rs
+++ b/src/uu/sort/src/sort.rs
@@ -2,10 +2,10 @@
 //  *
 //  * (c) Michael Yin <mikeyin@mikeyin.org>
 //  * (c) Robert Swinford <robert.swinford..AT..gmail.com>
+//  * (c) Michael Debertol <michael.debertol..AT..gmail.com>
 //  *
 //  * For the full copyright and license information, please view the LICENSE
 //  * file that was distributed with this source code.
-#![allow(dead_code)]
 
 // Although these links don't always seem to describe reality, check out the POSIX and GNU specs:
 // https://pubs.opengroup.org/onlinepubs/9699919799/utilities/sort.html
@@ -22,6 +22,7 @@ use rand::distributions::Alphanumeric;
 use rand::{thread_rng, Rng};
 use rayon::prelude::*;
 use semver::Version;
+use smallvec::SmallVec;
 use std::borrow::Cow;
 use std::cmp::Ordering;
 use std::collections::BinaryHeap;
@@ -30,6 +31,7 @@ use std::fs::File;
 use std::hash::{Hash, Hasher};
 use std::io::{stdin, stdout, BufRead, BufReader, BufWriter, Lines, Read, Write};
 use std::mem::replace;
+use std::ops::{Range, RangeInclusive};
 use std::path::Path;
 use uucore::fs::is_stdin_interactive; // for Iterator::dedup()
 
@@ -37,6 +39,16 @@ static NAME: &str = "sort";
 static ABOUT: &str = "Display sorted concatenation of all FILE(s).";
 static VERSION: &str = env!("CARGO_PKG_VERSION");
 
+const LONG_HELP_KEYS: &str = "The key format is FIELD[.CHAR][OPTIONS][,FIELD[.CHAR]][OPTIONS].
+
+Fields by default are separated by the first whitespace after a non-whitespace character. Use -t to specify a custom separator.
+In the default case, whitespace is appended at the beginning of each field. Custom separators however are not included in fields.
+
+FIELD and CHAR both start at 1 (i.e. they are 1-indexed). If there is no end specified after a comma, the end will be the end of the line.
+If CHAR is set 0, it means the end of the field. CHAR defaults to 1 for the start position and to 0 for the end position.
+
+Valid options are: MbdfhnRrV. They override the global options for this key.";
+
 static OPT_HUMAN_NUMERIC_SORT: &str = "human-numeric-sort";
 static OPT_MONTH_SORT: &str = "month-sort";
 static OPT_NUMERIC_SORT: &str = "numeric-sort";
@@ -54,6 +66,8 @@ static OPT_OUTPUT: &str = "output";
 static OPT_REVERSE: &str = "reverse";
 static OPT_STABLE: &str = "stable";
 static OPT_UNIQUE: &str = "unique";
+static OPT_KEY: &str = "key";
+static OPT_SEPARATOR: &str = "field-separator";
 static OPT_RANDOM: &str = "random-sort";
 static OPT_ZERO_TERMINATED: &str = "zero-terminated";
 static OPT_PARALLEL: &str = "parallel";
@@ -63,10 +77,11 @@ static ARG_FILES: &str = "files";
 
 static DECIMAL_PT: char = '.';
 static THOUSANDS_SEP: char = ',';
+
 static NEGATIVE: char = '-';
 static POSITIVE: char = '+';
 
-#[derive(Eq, Ord, PartialEq, PartialOrd)]
+#[derive(Eq, Ord, PartialEq, PartialOrd, Clone)]
 enum SortMode {
     Numeric,
     HumanNumeric,
@@ -76,8 +91,12 @@ enum SortMode {
     Default,
 }
 
-struct Settings {
+struct GlobalSettings {
     mode: SortMode,
+    ignore_blanks: bool,
+    ignore_case: bool,
+    dictionary_order: bool,
+    ignore_non_printing: bool,
     merge: bool,
     reverse: bool,
     outfile: Option<String>,
@@ -86,17 +105,21 @@ struct Settings {
     check: bool,
     check_silent: bool,
     random: bool,
-    compare_fn: fn(&str, &str) -> Ordering,
-    transform_fns: Vec<fn(&str) -> String>,
-    threads: String,
     salt: String,
+    selectors: Vec<FieldSelector>,
+    separator: Option<char>,
+    threads: String,
     zero_terminated: bool,
 }
 
-impl Default for Settings {
-    fn default() -> Settings {
-        Settings {
+impl Default for GlobalSettings {
+    fn default() -> GlobalSettings {
+        GlobalSettings {
             mode: SortMode::Default,
+            ignore_blanks: false,
+            ignore_case: false,
+            dictionary_order: false,
+            ignore_non_printing: false,
             merge: false,
             reverse: false,
             outfile: None,
@@ -105,19 +128,330 @@ impl Default for Settings {
             check: false,
             check_silent: false,
             random: false,
-            compare_fn: default_compare,
-            transform_fns: Vec::new(),
-            threads: String::new(),
             salt: String::new(),
+            selectors: vec![],
+            separator: None,
+            threads: String::new(),
             zero_terminated: false,
         }
     }
 }
 
+struct KeySettings {
+    mode: SortMode,
+    ignore_blanks: bool,
+    ignore_case: bool,
+    dictionary_order: bool,
+    ignore_non_printing: bool,
+    random: bool,
+    reverse: bool,
+}
+
+impl From<&GlobalSettings> for KeySettings {
+    fn from(settings: &GlobalSettings) -> Self {
+        Self {
+            mode: settings.mode.clone(),
+            ignore_blanks: settings.ignore_blanks,
+            ignore_case: settings.ignore_case,
+            ignore_non_printing: settings.ignore_non_printing,
+            random: settings.random,
+            reverse: settings.reverse,
+            dictionary_order: settings.dictionary_order,
+        }
+    }
+}
+
+/// Represents the string selected by a FieldSelector.
+enum Selection {
+    /// If we had to transform this selection, we have to store a new string.
+    String(String),
+    /// If there was no transformation, we can store an index into the line.
+    ByIndex(Range<usize>),
+}
+
+impl Selection {
+    /// Gets the actual string slice represented by this Selection.
+    fn get_str<'a>(&'a self, line: &'a Line) -> &'a str {
+        match self {
+            Selection::String(string) => string.as_str(),
+            Selection::ByIndex(range) => &line.line[range.to_owned()],
+        }
+    }
+}
+
+type Field = Range<usize>;
+
+struct Line {
+    line: String,
+    // The common case is not to specify fields. Let's make this fast.
+    selections: SmallVec<[Selection; 1]>,
+}
+
+impl Line {
+    fn new(line: String, settings: &GlobalSettings) -> Self {
+        let fields = if settings
+            .selectors
+            .iter()
+            .any(|selector| selector.needs_tokens())
+        {
+            // Only tokenize if we will need tokens.
+            Some(tokenize(&line, settings.separator))
+        } else {
+            None
+        };
+
+        let selections = settings
+            .selectors
+            .iter()
+            .map(|selector| {
+                if let Some(range) = selector.get_selection(&line, fields.as_deref()) {
+                    if let Some(transformed) =
+                        transform(&line[range.to_owned()], &selector.settings)
+                    {
+                        Selection::String(transformed)
+                    } else {
+                        Selection::ByIndex(range.start().to_owned()..range.end() + 1)
+                    }
+                } else {
+                    // If there is no match, match the empty string.
+                    Selection::ByIndex(0..0)
+                }
+            })
+            .collect();
+        Self { line, selections }
+    }
+}
+
+/// Transform this line. Returns None if there's no need to transform.
+fn transform(line: &str, settings: &KeySettings) -> Option<String> {
+    let mut transformed = None;
+    if settings.ignore_case {
+        transformed = Some(line.to_uppercase());
+    }
+    if settings.ignore_blanks {
+        transformed = Some(
+            transformed
+                .as_deref()
+                .unwrap_or(line)
+                .trim_start()
+                .to_string(),
+        );
+    }
+    if settings.dictionary_order {
+        transformed = Some(remove_nondictionary_chars(
+            transformed.as_deref().unwrap_or(line),
+        ));
+    }
+    if settings.ignore_non_printing {
+        transformed = Some(remove_nonprinting_chars(
+            transformed.as_deref().unwrap_or(line),
+        ));
+    }
+    transformed
+}
+
+/// Tokenize a line into fields.
+fn tokenize(line: &str, separator: Option<char>) -> Vec<Field> {
+    if let Some(separator) = separator {
+        tokenize_with_separator(line, separator)
+    } else {
+        tokenize_default(line)
+    }
+}
+
+/// By default fields are separated by the first whitespace after non-whitespace.
+/// Whitespace is included in fields at the start.
+fn tokenize_default(line: &str) -> Vec<Field> {
+    let mut tokens = vec![0..0];
+    // pretend that there was whitespace in front of the line
+    let mut previous_was_whitespace = true;
+    for (idx, char) in line.char_indices() {
+        if char.is_whitespace() {
+            if !previous_was_whitespace {
+                tokens.last_mut().unwrap().end = idx;
+                tokens.push(idx..0);
+            }
+            previous_was_whitespace = true;
+        } else {
+            previous_was_whitespace = false;
+        }
+    }
+    tokens.last_mut().unwrap().end = line.len();
+    tokens
+}
+
+/// Split between separators. These separators are not included in fields.
+fn tokenize_with_separator(line: &str, separator: char) -> Vec<Field> {
+    let mut tokens = vec![0..0];
+    let mut previous_was_separator = false;
+    for (idx, char) in line.char_indices() {
+        if previous_was_separator {
+            tokens.push(idx..0);
+        }
+        if char == separator {
+            tokens.last_mut().unwrap().end = idx;
+            previous_was_separator = true;
+        } else {
+            previous_was_separator = false;
+        }
+    }
+    tokens.last_mut().unwrap().end = line.len();
+    tokens
+}
+
+struct KeyPosition {
+    /// 1-indexed, 0 is invalid.
+    field: usize,
+    /// 1-indexed, 0 is end of field.
+    char: usize,
+    ignore_blanks: bool,
+}
+
+impl KeyPosition {
+    fn parse(key: &str, default_char_index: usize, settings: &mut KeySettings) -> Self {
+        let mut field_and_char = key.split('.');
+        let mut field = field_and_char
+            .next()
+            .unwrap_or_else(|| crash!(1, "invalid key `{}`", key));
+        let mut char = field_and_char.next();
+
+        // If there is a char index, we expect options to appear after it. Otherwise we expect them after the field index.
+        let value_with_options = char.as_mut().unwrap_or(&mut field);
+
+        let mut ignore_blanks = settings.ignore_blanks;
+        if let Some(options_start) = value_with_options.chars().position(char::is_alphabetic) {
+            for option in value_with_options[options_start..].chars() {
+                // valid options: MbdfghinRrV
+                match option {
+                    'M' => settings.mode = SortMode::Month,
+                    'b' => ignore_blanks = true,
+                    'd' => settings.dictionary_order = true,
+                    'f' => settings.ignore_case = true,
+                    'g' => settings.mode = SortMode::GeneralNumeric,
+                    'h' => settings.mode = SortMode::HumanNumeric,
+                    'i' => settings.ignore_non_printing = true,
+                    'n' => settings.mode = SortMode::Numeric,
+                    'R' => settings.random = true,
+                    'r' => settings.reverse = true,
+                    'V' => settings.mode = SortMode::Version,
+                    c => {
+                        crash!(1, "invalid option for key: `{}`", c)
+                    }
+                }
+            }
+            // Strip away option characters from the original value so we can parse it later
+            *value_with_options = &value_with_options[..options_start];
+        }
+
+        let field = field
+            .parse()
+            .unwrap_or_else(|e| crash!(1, "failed to parse field index for key `{}`: {}", key, e));
+        if field == 0 {
+            crash!(1, "field index was 0");
+        }
+        let char = char.map_or(default_char_index, |char| {
+            char.parse().unwrap_or_else(|e| {
+                crash!(
+                    1,
+                    "failed to parse character index for key `{}`: {}",
+                    key,
+                    e
+                )
+            })
+        });
+        Self {
+            field,
+            char,
+            ignore_blanks,
+        }
+    }
+}
+
+struct FieldSelector {
+    from: KeyPosition,
+    to: Option<KeyPosition>,
+    settings: KeySettings,
+}
+
+impl FieldSelector {
+    fn needs_tokens(&self) -> bool {
+        self.from.field != 1 || self.from.char == 0 || self.to.is_some()
+    }
+
+    /// Look up the slice that corresponds to this selector for the given line.
+    /// If needs_fields returned false, fields may be None.
+    fn get_selection<'a>(
+        &self,
+        line: &'a str,
+        tokens: Option<&[Field]>,
+    ) -> Option<RangeInclusive<usize>> {
+        enum ResolutionErr {
+            TooLow,
+            TooHigh,
+        }
+
+        // Get the index for this line given the KeyPosition
+        fn resolve_index(
+            line: &str,
+            tokens: Option<&[Field]>,
+            position: &KeyPosition,
+        ) -> Result<usize, ResolutionErr> {
+            if tokens.map_or(false, |fields| fields.len() < position.field) {
+                Err(ResolutionErr::TooHigh)
+            } else if position.char == 0 {
+                let end = tokens.unwrap()[position.field - 1].end;
+                if end == 0 {
+                    Err(ResolutionErr::TooLow)
+                } else {
+                    Ok(end - 1)
+                }
+            } else {
+                let mut idx = if position.field == 1 {
+                    // The first field always starts at 0.
+                    // We don't need tokens for this case.
+                    0
+                } else {
+                    tokens.unwrap()[position.field - 1].start
+                } + position.char
+                    - 1;
+                if idx >= line.len() {
+                    Err(ResolutionErr::TooHigh)
+                } else {
+                    if position.ignore_blanks {
+                        if let Some(not_whitespace) =
+                            line[idx..].chars().position(|c| !c.is_whitespace())
+                        {
+                            idx += not_whitespace;
+                        } else {
+                            return Err(ResolutionErr::TooHigh);
+                        }
+                    }
+                    Ok(idx)
+                }
+            }
+        }
+
+        if let Ok(from) = resolve_index(line, tokens, &self.from) {
+            let to = self.to.as_ref().map(|to| resolve_index(line, tokens, &to));
+            match to {
+                Some(Ok(to)) => Some(from..=to),
+                // If `to` was not given or the match would be after the end of the line,
+                // match everything until the end of the line.
+                None | Some(Err(ResolutionErr::TooHigh)) => Some(from..=line.len() - 1),
+                // If `to` is before the start of the line, report no match.
+                // This can happen if the line starts with a separator.
+                Some(Err(ResolutionErr::TooLow)) => None,
+            }
+        } else {
+            None
+        }
+    }
+}
+
 struct MergeableFile<'a> {
     lines: Lines<BufReader<Box<dyn Read>>>,
-    current_line: String,
-    settings: &'a Settings,
+    current_line: Line,
+    settings: &'a GlobalSettings,
 }
 
 // BinaryHeap depends on `Ord`. Note that we want to pop smallest items
@@ -125,7 +459,7 @@ struct MergeableFile<'a> {
 // trick it into the right order by calling reverse() here.
 impl<'a> Ord for MergeableFile<'a> {
     fn cmp(&self, other: &MergeableFile) -> Ordering {
-        compare_by(&self.current_line, &other.current_line, &self.settings).reverse()
+        compare_by(&self.current_line, &other.current_line, self.settings).reverse()
     }
 }
 
@@ -137,7 +471,7 @@ impl<'a> PartialOrd for MergeableFile<'a> {
 
 impl<'a> PartialEq for MergeableFile<'a> {
     fn eq(&self, other: &MergeableFile) -> bool {
-        Ordering::Equal == compare_by(&self.current_line, &other.current_line, &self.settings)
+        Ordering::Equal == compare_by(&self.current_line, &other.current_line, self.settings)
     }
 }
 
@@ -145,11 +479,11 @@ impl<'a> Eq for MergeableFile<'a> {}
 
 struct FileMerger<'a> {
     heap: BinaryHeap<MergeableFile<'a>>,
-    settings: &'a Settings,
+    settings: &'a GlobalSettings,
 }
 
 impl<'a> FileMerger<'a> {
-    fn new(settings: &'a Settings) -> FileMerger<'a> {
+    fn new(settings: &'a GlobalSettings) -> FileMerger<'a> {
         FileMerger {
             heap: BinaryHeap::new(),
             settings,
@@ -159,7 +493,7 @@ impl<'a> FileMerger<'a> {
         if let Some(Ok(next_line)) = lines.next() {
             let mergeable_file = MergeableFile {
                 lines,
-                current_line: next_line,
+                current_line: Line::new(next_line, &self.settings),
                 settings: &self.settings,
             };
             self.heap.push(mergeable_file);
@@ -174,14 +508,17 @@ impl<'a> Iterator for FileMerger<'a> {
             Some(mut current) => {
                 match current.lines.next() {
                     Some(Ok(next_line)) => {
-                        let ret = replace(&mut current.current_line, next_line);
+                        let ret = replace(
+                            &mut current.current_line,
+                            Line::new(next_line, &self.settings),
+                        );
                         self.heap.push(current);
-                        Some(ret)
+                        Some(ret.line)
                     }
                     _ => {
                         // Don't put it back in the heap (it's empty/erroring)
                         // but its first line is still valid.
-                        Some(current.current_line)
+                        Some(current.current_line.line)
                     }
                 }
             }
@@ -205,7 +542,7 @@ With no FILE, or when FILE is -, read standard input.",
 pub fn uumain(args: impl uucore::Args) -> i32 {
     let args = args.collect_str();
     let usage = get_usage();
-    let mut settings: Settings = Default::default();
+    let mut settings: GlobalSettings = Default::default();
 
     let matches = App::new(executable!())
         .version(VERSION)
@@ -316,7 +653,21 @@ pub fn uumain(args: impl uucore::Args) -> i32 {
                 .help("output only the first of an equal run"),
         )
         .arg(
-            Arg::with_name(OPT_ZERO_TERMINATED)
+            Arg::with_name(OPT_KEY)
+                .short("k")
+                .long(OPT_KEY)
+                .help("sort by a key")
+                .long_help(LONG_HELP_KEYS)
+                .multiple(true)
+                .takes_value(true),
+        )
+        .arg(
+            Arg::with_name(OPT_SEPARATOR)
+                .short("t")
+                .long(OPT_SEPARATOR)
+                .help("custom separator for -k")
+                .takes_value(true))
+            .arg(Arg::with_name(OPT_ZERO_TERMINATED)
                 .short("z")
                 .long(OPT_ZERO_TERMINATED)
                 .help("line delimiter is NUL, not newline"),
@@ -350,14 +701,12 @@ pub fn uumain(args: impl uucore::Args) -> i32 {
         for path in &files0_from {
             let (reader, _) = open(path.as_str()).expect("Could not read from file specified.");
             let buf_reader = BufReader::new(reader);
-            for line in buf_reader.split(b'\0') {
-                if let Ok(n) = line {
-                    files.push(
-                        std::str::from_utf8(&n)
-                            .expect("Could not parse string from zero terminated input.")
-                            .to_string(),
-                    );
-                }
+            for line in buf_reader.split(b'\0').flatten() {
+                files.push(
+                    std::str::from_utf8(&line)
+                        .expect("Could not parse string from zero terminated input.")
+                        .to_string(),
+                );
             }
         }
         files
@@ -382,21 +731,17 @@ pub fn uumain(args: impl uucore::Args) -> i32 {
         SortMode::Default
     };
 
+    settings.dictionary_order = matches.is_present(OPT_DICTIONARY_ORDER);
+    settings.ignore_non_printing = matches.is_present(OPT_IGNORE_NONPRINTING);
     if matches.is_present(OPT_PARALLEL) {
         // "0" is default - threads = num of cores
         settings.threads = matches
             .value_of(OPT_PARALLEL)
             .map(String::from)
-            .unwrap_or("0".to_string());
+            .unwrap_or_else(|| "0".to_string());
         env::set_var("RAYON_NUM_THREADS", &settings.threads);
     }
 
-    if matches.is_present(OPT_DICTIONARY_ORDER) {
-        settings.transform_fns.push(remove_nondictionary_chars);
-    } else if matches.is_present(OPT_IGNORE_NONPRINTING) {
-        settings.transform_fns.push(remove_nonprinting_chars);
-    }
-
     settings.zero_terminated = matches.is_present(OPT_ZERO_TERMINATED);
     settings.merge = matches.is_present(OPT_MERGE);
 
@@ -406,13 +751,9 @@ pub fn uumain(args: impl uucore::Args) -> i32 {
         settings.check = true;
     };
 
-    if matches.is_present(OPT_IGNORE_CASE) {
-        settings.transform_fns.push(|s| s.to_uppercase());
-    }
+    settings.ignore_case = matches.is_present(OPT_IGNORE_CASE);
 
-    if matches.is_present(OPT_IGNORE_BLANKS) {
-        settings.transform_fns.push(|s| s.trim_start().to_string());
-    }
+    settings.ignore_blanks = matches.is_present(OPT_IGNORE_BLANKS);
 
     settings.outfile = matches.value_of(OPT_OUTPUT).map(String::from);
     settings.reverse = matches.is_present(OPT_REVERSE);
@@ -424,27 +765,64 @@ pub fn uumain(args: impl uucore::Args) -> i32 {
         settings.salt = get_rand_string();
     }
 
-    //let mut files = matches.free;
     if files.is_empty() {
         /* if no file, default to stdin */
         files.push("-".to_owned());
     } else if settings.check && files.len() != 1 {
-        crash!(1, "sort: extra operand `{}' not allowed with -c", files[1])
+        crash!(1, "extra operand `{}' not allowed with -c", files[1])
     }
 
-    settings.compare_fn = match settings.mode {
-        SortMode::Numeric => numeric_compare,
-        SortMode::GeneralNumeric => general_numeric_compare,
-        SortMode::HumanNumeric => human_numeric_size_compare,
-        SortMode::Month => month_compare,
-        SortMode::Version => version_compare,
-        SortMode::Default => default_compare,
-    };
+    if let Some(arg) = matches.args.get(OPT_SEPARATOR) {
+        let separator = arg.vals[0].to_string_lossy();
+        let separator = separator;
+        if separator.len() != 1 {
+            crash!(1, "separator must be exactly one character long");
+        }
+        settings.separator = Some(separator.chars().next().unwrap())
+    }
 
-    exec(files, &mut settings)
+    if matches.is_present(OPT_KEY) {
+        for key in &matches.args[OPT_KEY].vals {
+            let key = key.to_string_lossy();
+            let mut from_to = key.split(',');
+            let mut key_settings = KeySettings::from(&settings);
+            let from = KeyPosition::parse(
+                from_to
+                    .next()
+                    .unwrap_or_else(|| crash!(1, "invalid key `{}`", key)),
+                1,
+                &mut key_settings,
+            );
+            let to = from_to
+                .next()
+                .map(|to| KeyPosition::parse(to, 0, &mut key_settings));
+            let field_selector = FieldSelector {
+                from,
+                to,
+                settings: key_settings,
+            };
+            settings.selectors.push(field_selector);
+        }
+    }
+
+    if !settings.stable || !matches.is_present(OPT_KEY) {
+        // add a default selector matching the whole line
+        let key_settings = KeySettings::from(&settings);
+        settings.selectors.push(FieldSelector {
+            from: KeyPosition {
+                field: 1,
+                char: 1,
+                ignore_blanks: key_settings.ignore_blanks,
+            },
+            to: None,
+            settings: key_settings,
+        });
+    }
+
+    exec(files, &settings)
 }
 
-fn exec(files: Vec<String>, settings: &mut Settings) -> i32 {
+fn exec(files: Vec<String>, settings: &GlobalSettings) -> i32 {
     let mut lines = Vec::new();
     let mut file_merger = FileMerger::new(&settings);
 
@@ -459,26 +837,27 @@ fn exec(files: Vec<String>, settings: &mut Settings) -> i32 {
         if settings.merge {
             file_merger.push_file(buf_reader.lines());
         } else if settings.zero_terminated {
-            for line in buf_reader.split(b'\0') {
-                if let Ok(n) = line {
-                    lines.push(
-                        std::str::from_utf8(&n)
-                            .expect("Could not parse string from zero terminated input.")
-                            .to_string(),
-                    );
-                }
+            for line in buf_reader.split(b'\0').flatten() {
+                lines.push(Line::new(
+                    std::str::from_utf8(&line)
+                        .expect("Could not parse string from zero terminated input.")
+                        .to_string(),
+                    &settings,
+                ));
             }
         } else {
             for line in buf_reader.lines() {
                 if let Ok(n) = line {
-                    lines.push(n);
+                    lines.push(Line::new(n, &settings));
+                } else {
+                    break;
                 }
             }
         }
     }
 
     if settings.check {
-        return exec_check_file(lines, &settings);
+        return exec_check_file(&lines, &settings);
     } else {
         sort_by(&mut lines, &settings);
     }
@@ -490,29 +869,31 @@ fn exec(files: Vec<String>, settings: &mut Settings) -> i32 {
             print_sorted(file_merger, &settings)
         }
     } else if settings.mode == SortMode::Default && settings.unique {
-        print_sorted(lines.iter().dedup(), &settings)
+        print_sorted(lines.into_iter().map(|line| line.line).dedup(), &settings)
     } else if settings.mode == SortMode::Month && settings.unique {
         print_sorted(
             lines
-                .iter()
+                .into_iter()
+                .map(|line| line.line)
                 .dedup_by(|a, b| get_months_dedup(a) == get_months_dedup(b)),
             &settings,
         )
     } else if settings.unique {
         print_sorted(
             lines
-                .iter()
-                .dedup_by(|a, b| get_num_dedup(a, &settings) == get_num_dedup(b, &settings)),
+                .into_iter()
+                .map(|line| line.line)
+                .dedup_by(|a, b| get_num_dedup(a, settings) == get_num_dedup(b, settings)),
             &settings,
         )
     } else {
-        print_sorted(lines.iter(), &settings)
+        print_sorted(lines.into_iter().map(|line| line.line), &settings)
     }
 
     0
 }
 
-fn exec_check_file(unwrapped_lines: Vec<String>, settings: &Settings) -> i32 {
+fn exec_check_file(unwrapped_lines: &[Line], settings: &GlobalSettings) -> i32 {
     // errors yields the line before each disorder,
     // plus the last line (quirk of .coalesce())
     let mut errors =
@@ -544,51 +925,45 @@ fn exec_check_file(unwrapped_lines: Vec<String>, settings: &Settings) -> i32 {
     }
 }
 
-#[inline(always)]
-fn transform(line: &str, settings: &Settings) -> String {
-    let mut transformed = line.to_owned();
-    for transform_fn in &settings.transform_fns {
-        transformed = transform_fn(&transformed);
-    }
-
-    transformed
-}
-
-#[inline(always)]
-fn sort_by(lines: &mut Vec<String>, settings: &Settings) {
+fn sort_by(lines: &mut Vec<Line>, settings: &GlobalSettings) {
     lines.par_sort_by(|a, b| compare_by(a, b, &settings))
 }
 
-fn compare_by(a: &str, b: &str, settings: &Settings) -> Ordering {
-    let (a_transformed, b_transformed): (String, String);
-    let (a, b) = if !settings.transform_fns.is_empty() {
-        a_transformed = transform(&a, &settings);
-        b_transformed = transform(&b, &settings);
-        (a_transformed.as_str(), b_transformed.as_str())
-    } else {
-        (a, b)
-    };
+fn compare_by(a: &Line, b: &Line, global_settings: &GlobalSettings) -> Ordering {
+    for (idx, selector) in global_settings.selectors.iter().enumerate() {
+        let a = a.selections[idx].get_str(a);
+        let b = b.selections[idx].get_str(b);
+        let settings = &selector.settings;
 
-    // 1st Compare
-    let mut cmp: Ordering = if settings.random {
-        random_shuffle(a, b, settings.salt.clone())
-    } else {
-        (settings.compare_fn)(a, b)
-    };
-
-    // Call "last resort compare" on any equal
-    if cmp == Ordering::Equal {
-        if settings.random || settings.stable || settings.unique {
-            cmp = Ordering::Equal
+        let cmp: Ordering = if settings.random {
+            random_shuffle(a, b, global_settings.salt.clone())
         } else {
-            cmp = default_compare(a, b)
+            (match settings.mode {
+                SortMode::Numeric => numeric_compare,
+                SortMode::GeneralNumeric => general_numeric_compare,
+                SortMode::HumanNumeric => human_numeric_size_compare,
+                SortMode::Month => month_compare,
+                SortMode::Version => version_compare,
+                SortMode::Default => default_compare,
+            })(a, b)
         };
+        if cmp != Ordering::Equal {
+            return if settings.reverse { cmp.reverse() } else { cmp };
+        }
+    }
+
+    // Call "last resort compare" if all selectors returned Equal
+
+    let cmp = if global_settings.random || global_settings.stable || global_settings.unique {
+        Ordering::Equal
+    } else {
+        default_compare(&a.line, &b.line)
     };
 
-    if settings.reverse {
-        return cmp.reverse();
+    if global_settings.reverse {
+        cmp.reverse()
     } else {
-        return cmp;
+        cmp
     }
 }
 
@@ -617,8 +992,8 @@ fn leading_num_common(a: &str) -> &str {
             && !c.eq(&'e')
             && !c.eq(&'E')
             // check whether first char is + or - 
-            && !a.chars().nth(0).unwrap_or('\0').eq(&POSITIVE)
-            && !a.chars().nth(0).unwrap_or('\0').eq(&NEGATIVE)
+            && !a.chars().next().unwrap_or('\0').eq(&POSITIVE)
+            && !a.chars().next().unwrap_or('\0').eq(&NEGATIVE)
         {
             // Strip string of non-numeric trailing chars
             s = &a[..idx];
@@ -640,9 +1015,9 @@ fn get_leading_num(a: &str) -> &str {
 
     let a = leading_num_common(a);
 
-    // GNU numeric sort doesn't recognize '+' or 'e' notation so we strip trailing chars
+    // GNU numeric sort doesn't recognize '+' or 'e' notation so we strip
     for (idx, c) in a.char_indices() {
-        if c.eq(&'e') || c.eq(&'E') || a.chars().nth(0).unwrap_or('\0').eq(&POSITIVE) {
+        if c.eq(&'e') || c.eq(&'E') || a.chars().next().unwrap_or('\0').eq(&POSITIVE) {
             s = &a[..idx];
             break;
         }
@@ -670,12 +1045,9 @@ fn get_leading_gen(a: &str) -> &str {
     // Cleanup raw stripped strings
     for c in p_iter.to_owned() {
         let next_char_numeric = p_iter.peek().unwrap_or(&'\0').is_numeric();
-        // Only general numeric recognizes e notation and the '+' sign
-        if (c.eq(&'e') && !next_char_numeric)
-            || (c.eq(&'E') && !next_char_numeric)
-            // Only GNU (non-general) numeric recognize thousands seperators, takes only leading #
-            || c.eq(&THOUSANDS_SEP)
-        {
+        // Only general numeric recognizes e notation and, see block below, the '+' sign
+        // Only GNU (non-general) numeric recognize thousands seperators, takes only leading #
+        if (c.eq(&'e') || c.eq(&'E')) && !next_char_numeric || c.eq(&THOUSANDS_SEP) {
             result = a.split(c).next().unwrap_or("");
             break;
         // If positive sign and next char is not numeric, split at postive sign at keep trailing numbers
@@ -724,19 +1096,17 @@ fn get_months_dedup(a: &str) -> String {
 // *For all dedups/uniques expect default we must compare leading numbers*
 // Also note numeric compare and unique output is specifically *not* the same as a "sort | uniq"
 // See: https://www.gnu.org/software/coreutils/manual/html_node/sort-invocation.html
-fn get_num_dedup<'a>(a: &'a str, settings: &&mut Settings) -> &'a str {
+fn get_num_dedup<'a>(a: &'a str, settings: &GlobalSettings) -> &'a str {
     // Trim and remove any leading zeros
     let s = a.trim().trim_start_matches('0');
 
     // Get first char
-    let c = s.chars().nth(0).unwrap_or('\0');
+    let c = s.chars().next().unwrap_or('\0');
 
     // Empty lines and non-number lines are treated as the same for dedup
-    if s.is_empty() {
-        ""
-    } else if !c.eq(&NEGATIVE) && !c.is_numeric() {
-        ""
     // Prepare lines for comparison of only the numerical leading numbers
+    if s.is_empty() || (!c.eq(&NEGATIVE) && !c.is_numeric()) {
+        ""
     } else {
         let result = match settings.mode {
             SortMode::Numeric => get_leading_num(s),
@@ -794,7 +1164,7 @@ fn numeric_compare(a: &str, b: &str) -> Ordering {
     let sa = get_leading_num(a);
     let sb = get_leading_num(b);
 
-    // Avoids a string alloc for every line to remove thousands seperators here 
+    // Avoids a string alloc for every line to remove thousands seperators here
     // instead of inside the get_leading_num function, which is a HUGE performance benefit
     let ta = remove_thousands_sep(sa);
     let tb = remove_thousands_sep(sb);
@@ -944,6 +1314,7 @@ fn month_parse(line: &str) -> Month {
 }
 
 fn month_compare(a: &str, b: &str) -> Ordering {
+    #![allow(clippy::comparison_chain)]
     let ma = month_parse(a);
     let mb = month_parse(b);
 
@@ -986,32 +1357,29 @@ fn remove_nonprinting_chars(s: &str) -> String {
         .collect::<String>()
 }
 
-fn print_sorted<S, T: Iterator<Item = S>>(iter: T, settings: &Settings)
-where
-    S: std::fmt::Display,
-{
+fn print_sorted<T: Iterator<Item = String>>(iter: T, settings: &GlobalSettings) {
     let mut file: Box<dyn Write> = match settings.outfile {
         Some(ref filename) => match File::create(Path::new(&filename)) {
             Ok(f) => Box::new(BufWriter::new(f)) as Box<dyn Write>,
             Err(e) => {
-                show_error!("sort: {0}: {1}", filename, e.to_string());
+                show_error!("{0}: {1}", filename, e.to_string());
                 panic!("Could not open output file");
             }
         },
-        None => Box::new(stdout()) as Box<dyn Write>,
+        None => Box::new(BufWriter::new(stdout())) as Box<dyn Write>,
     };
-
     if settings.zero_terminated {
         for line in iter {
-            let str = format!("{}\0", line);
-            crash_if_err!(1, file.write_all(str.as_bytes()));
+            crash_if_err!(1, file.write_all(line.as_bytes()));
+            crash_if_err!(1, file.write_all("\0".as_bytes()));
         }
     } else {
         for line in iter {
-            let str = format!("{}\n", line);
-            crash_if_err!(1, file.write_all(str.as_bytes()));
+            crash_if_err!(1, file.write_all(line.as_bytes()));
+            crash_if_err!(1, file.write_all("\n".as_bytes()));
         }
     }
+    crash_if_err!(1, file.flush());
 }
 
 // from cat.rs
@@ -1024,7 +1392,7 @@ fn open(path: &str) -> Option<(Box<dyn Read>, bool)> {
     match File::open(Path::new(path)) {
         Ok(f) => Some((Box::new(f) as Box<dyn Read>, false)),
         Err(e) => {
-            show_error!("sort: {0}: {1}", path, e.to_string());
+            show_error!("{0}: {1}", path, e.to_string());
             None
         }
     }
@@ -1097,4 +1465,34 @@ mod tests {
 
         assert_eq!(Ordering::Less, version_compare(a, b));
     }
+
+    #[test]
+    fn test_random_compare() {
+        let a = "9";
+        let b = "9";
+        let c = get_rand_string();
+
+        assert_eq!(Ordering::Equal, random_shuffle(a, b, c));
+    }
+
+    #[test]
+    fn test_tokenize_fields() {
+        let line = "foo bar b    x";
+        assert_eq!(tokenize(line, None), vec![0..3, 3..7, 7..9, 9..14,],);
+    }
+
+    #[test]
+    fn test_tokenize_fields_leading_whitespace() {
+        let line = "    foo bar b    x";
+        assert_eq!(tokenize(line, None), vec![0..7, 7..11, 11..13, 13..18,]);
+    }
+
+    #[test]
+    fn test_tokenize_fields_custom_separator() {
+        let line = "aaa foo bar b    x";
+        assert_eq!(
+            tokenize(line, Some('a')),
+            vec![0..0, 1..1, 2..2, 3..9, 10..18,]
+        );
+    }
 }
diff --git a/tests/by-util/test_sort.rs b/tests/by-util/test_sort.rs
index 6455d837b..668e783ae 100644
--- a/tests/by-util/test_sort.rs
+++ b/tests/by-util/test_sort.rs
@@ -185,10 +185,10 @@ fn test_dictionary_order2() {
 fn test_non_printing_chars() {
     for non_printing_chars_param in vec!["-i"] {
         new_ucmd!()
-            .pipe_in("a👦🏻aa	b\naaaa	b")
+            .pipe_in("a👦🏻aa\naaaa")
             .arg(non_printing_chars_param)
             .succeeds()
-            .stdout_only("aaaa	b\na👦🏻aa	b\n");
+            .stdout_only("a👦🏻aa\naaaa\n");
     }
 }
 
@@ -307,6 +307,166 @@ fn test_numeric_unique_ints2() {
     }
 }
 
+#[test]
+fn test_keys_open_ended() {
+    let input = "aa bb cc\ndd aa ff\ngg aa cc\n";
+    new_ucmd!()
+        .args(&["-k", "2.2"])
+        .pipe_in(input)
+        .succeeds()
+        .stdout_only("gg aa cc\ndd aa ff\naa bb cc\n");
+}
+
+#[test]
+fn test_keys_closed_range() {
+    let input = "aa bb cc\ndd aa ff\ngg aa cc\n";
+    new_ucmd!()
+        .args(&["-k", "2.2,2.2"])
+        .pipe_in(input)
+        .succeeds()
+        .stdout_only("dd aa ff\ngg aa cc\naa bb cc\n");
+}
+
+#[test]
+fn test_keys_multiple_ranges() {
+    let input = "aa bb cc\ndd aa ff\ngg aa cc\n";
+    new_ucmd!()
+        .args(&["-k", "2,2", "-k", "3,3"])
+        .pipe_in(input)
+        .succeeds()
+        .stdout_only("gg aa cc\ndd aa ff\naa bb cc\n");
+}
+
+#[test]
+fn test_keys_no_field_match() {
+    let input = "aa aa aa aa\naa bb cc\ndd aa ff\n";
+    new_ucmd!()
+        .args(&["-k", "4,4"])
+        .pipe_in(input)
+        .succeeds()
+        .stdout_only("aa bb cc\ndd aa ff\naa aa aa aa\n");
+}
+
+#[test]
+fn test_keys_no_char_match() {
+    let input = "aaa\nba\nc\n";
+    new_ucmd!()
+        .args(&["-k", "1.2"])
+        .pipe_in(input)
+        .succeeds()
+        .stdout_only("c\nba\naaa\n");
+}
+
+#[test]
+fn test_keys_custom_separator() {
+    let input = "aaxbbxcc\nddxaaxff\nggxaaxcc\n";
+    new_ucmd!()
+        .args(&["-k", "2.2,2.2", "-t", "x"])
+        .pipe_in(input)
+        .succeeds()
+        .stdout_only("ddxaaxff\nggxaaxcc\naaxbbxcc\n");
+}
+
+#[test]
+fn test_keys_invalid_field() {
+    new_ucmd!()
+        .args(&["-k", "1."])
+        .fails()
+        .stderr_only("sort: error: failed to parse character index for key `1.`: cannot parse integer from empty string");
+}
+
+#[test]
+fn test_keys_invalid_field_option() {
+    new_ucmd!()
+        .args(&["-k", "1.1x"])
+        .fails()
+        .stderr_only("sort: error: invalid option for key: `x`");
+}
+
+#[test]
+fn test_keys_invalid_field_zero() {
+    new_ucmd!()
+        .args(&["-k", "0.1"])
+        .fails()
+        .stderr_only("sort: error: field index was 0");
+}
+
+#[test]
+fn test_keys_with_options() {
+    let input = "aa 3 cc\ndd 1 ff\ngg 2 cc\n";
+    for param in &[
+        &["-k", "2,2n"][..],
+        &["-k", "2n,2"][..],
+        &["-k", "2,2", "-n"][..],
+    ] {
+        new_ucmd!()
+            .args(param)
+            .pipe_in(input)
+            .succeeds()
+            .stdout_only("dd 1 ff\ngg 2 cc\naa 3 cc\n");
+    }
+}
+
+#[test]
+fn test_keys_with_options_blanks_start() {
+    let input = "aa   3 cc\ndd  1 ff\ngg         2 cc\n";
+    for param in &[&["-k", "2b,2"][..], &["-k", "2,2", "-b"][..]] {
+        new_ucmd!()
+            .args(param)
+            .pipe_in(input)
+            .succeeds()
+            .stdout_only("dd  1 ff\ngg         2 cc\naa   3 cc\n");
+    }
+}
+
+#[test]
+fn test_keys_with_options_blanks_end() {
+    let input = "a  b
+a b
+a   b
+";
+    new_ucmd!()
+        .args(&["-k", "1,2.1b", "-s"])
+        .pipe_in(input)
+        .succeeds()
+        .stdout_only(
+            "a   b
+a  b
+a b
+",
+        );
+}
+
+#[test]
+fn test_keys_stable() {
+    let input = "a  b
+a b
+a   b
+";
+    new_ucmd!()
+        .args(&["-k", "1,2.1", "-s"])
+        .pipe_in(input)
+        .succeeds()
+        .stdout_only(
+            "a  b
+a b
+a   b
+",
+        );
+}
+
+#[test]
+fn test_keys_empty_match() {
+    let input = "a a a a
+aaaa
+";
+    new_ucmd!()
+        .args(&["-k", "1,1", "-t", "a"])
+        .pipe_in(input)
+        .succeeds()
+        .stdout_only(input);
+}
+
 #[test]
 fn test_zero_terminated() {
     test_helper("zero-terminated", "-z");