Merge pull request #2130 from electricboogie/master

sort: implement --buffer-size and --temporary-directory (external sort)
2025-07-28 03:27:44 +00:00 · 2021-04-28 09:21:14 +02:00 · 2021-04-28 09:21:14 +02:00 · a37e3181a2
commit a37e3181a2
parent 33139817a2 ec19bb72d5
9 changed files with 40489 additions and 18 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -1353,6 +1353,9 @@ name = "serde"
 version = "1.0.125"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "558dc50e1a5a5fa7112ca2ce4effcb321b0300c0d4ccf0776a9f60cd89031171"
+dependencies = [
+ "serde_derive",
+]

 [[package]]
 name = "serde_cbor"
@ -1431,6 +1434,9 @@ name = "smallvec"
 version = "1.6.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "fe0f37c9e8f3c5a4a66ad655a93c74daac4ad00c441533bf5c6e7990bb42604e"
+dependencies = [
+ "serde",
+]

 [[package]]
 name = "strsim"
@ -2363,7 +2369,10 @@ dependencies = [
 "rand 0.7.3",
 "rayon",
 "semver",
+ "serde",
+ "serde_json",
 "smallvec 1.6.1",
+ "tempdir",
 "unicode-width",
 "uucore",
 "uucore_procs",
--- a/src/uu/sort/Cargo.toml
+++ b/src/uu/sort/Cargo.toml
@ -15,16 +15,19 @@ edition = "2018"
 path = "src/sort.rs"

 [dependencies]
+serde_json = { version = "1.0.64", default-features = false, features = ["alloc"] }
+serde = { version = "1.0", features = ["derive"] }
 rayon = "1.5"
 rand = "0.7"
 clap = "2.33"
 fnv = "1.0.7"
 itertools = "0.10.0"
 semver = "0.9.0"
-smallvec = "1.6.1"
+smallvec = { version="1.6.1", features=["serde"] } 
 unicode-width = "0.1.8"
 uucore = { version=">=0.0.8", package="uucore", path="../../uucore", features=["fs"] }
 uucore_procs = { version=">=0.0.5", package="uucore_procs", path="../../uucore_procs" }
+tempdir = "0.3.7"

 [[bin]]
 name = "sort"
--- a/src/uu/sort/src/external_sort/LICENSE
+++ b/src/uu/sort/src/external_sort/LICENSE
@ -0,0 +1,19 @@
+Copyright 2018 Battelle Memorial Institute
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
--- a/src/uu/sort/src/external_sort/mod.rs
+++ b/src/uu/sort/src/external_sort/mod.rs
@ -0,0 +1,295 @@
+use std::clone::Clone;
+use std::cmp::Ordering::Less;
+use std::collections::VecDeque;
+use std::error::Error;
+use std::fs::{File, OpenOptions};
+use std::io::SeekFrom::Start;
+use std::io::{BufRead, BufReader, BufWriter, Seek, Write};
+use std::marker::PhantomData;
+use std::path::PathBuf;
+
+use serde::de::DeserializeOwned;
+use serde::Serialize;
+use serde_json;
+use tempdir::TempDir;
+
+use super::{GlobalSettings, Line};
+
+/// Trait for types that can be used by
+/// [ExternalSorter](struct.ExternalSorter.html). Must be sortable, cloneable,
+/// serializable, and able to report on its size
+pub trait ExternallySortable: Clone + Serialize + DeserializeOwned {
+    /// Get the size, in bytes, of this object (used to constrain the buffer
+    /// used in the external sort).
+    fn get_size(&self) -> u64;
+}
+
+/// Iterator that provides sorted `Line`s
+pub struct ExtSortedIterator<Line> {
+    buffers: Vec<VecDeque<Line>>,
+    chunk_offsets: Vec<u64>,
+    max_per_chunk: u64,
+    chunks: u64,
+    tmp_dir: TempDir,
+    settings: GlobalSettings,
+    failed: bool,
+}
+
+impl Iterator for ExtSortedIterator<Line>
+where
+    Line: ExternallySortable,
+{
+    type Item = Result<Line, Box<dyn Error>>;
+
+    /// # Errors
+    ///
+    /// This method can fail due to issues reading intermediate sorted chunks
+    /// from disk, or due to serde deserialization issues
+    fn next(&mut self) -> Option<Self::Item> {
+        if self.failed {
+            return None;
+        }
+        // fill up any empty buffers
+        let mut empty = true;
+        for chunk_num in 0..self.chunks {
+            if self.buffers[chunk_num as usize].is_empty() {
+                let mut f = match File::open(self.tmp_dir.path().join(chunk_num.to_string())) {
+                    Ok(f) => f,
+                    Err(e) => {
+                        self.failed = true;
+                        return Some(Err(Box::new(e)));
+                    }
+                };
+                match f.seek(Start(self.chunk_offsets[chunk_num as usize])) {
+                    Ok(_) => (),
+                    Err(e) => {
+                        self.failed = true;
+                        return Some(Err(Box::new(e)));
+                    }
+                }
+                let bytes_read =
+                    match fill_buff(&mut self.buffers[chunk_num as usize], f, self.max_per_chunk) {
+                        Ok(bytes_read) => bytes_read,
+                        Err(e) => {
+                            self.failed = true;
+                            return Some(Err(e));
+                        }
+                    };
+                self.chunk_offsets[chunk_num as usize] += bytes_read;
+                if !self.buffers[chunk_num as usize].is_empty() {
+                    empty = false;
+                }
+            } else {
+                empty = false;
+            }
+        }
+        if empty {
+            return None;
+        }
+
+        // find the next record to write
+        // check is_empty() before unwrap()ing
+        let mut idx = 0;
+        for chunk_num in 0..self.chunks as usize {
+            if !self.buffers[chunk_num].is_empty() {
+                if self.buffers[idx].is_empty()
+                    || (super::compare_by)(
+                        self.buffers[chunk_num].front().unwrap(),
+                        self.buffers[idx].front().unwrap(),
+                        &self.settings,
+                    ) == Less
+                {
+                    idx = chunk_num;
+                }
+            }
+        }
+
+        // unwrap due to checks above
+        let r = self.buffers[idx].pop_front().unwrap();
+        Some(Ok(r))
+    }
+}
+
+/// Perform an external sort on an unsorted stream of incoming data
+pub struct ExternalSorter<Line>
+where
+    Line: ExternallySortable,
+{
+    tmp_dir: Option<PathBuf>,
+    buffer_bytes: u64,
+    phantom: PhantomData<Line>,
+    settings: GlobalSettings,
+}
+
+impl ExternalSorter<Line>
+where
+    Line: ExternallySortable,
+{
+    /// Create a new `ExternalSorter` with a specified memory buffer and
+    /// temporary directory
+    pub fn new(
+        buffer_bytes: u64,
+        tmp_dir: Option<PathBuf>,
+        settings: GlobalSettings,
+    ) -> ExternalSorter<Line> {
+        ExternalSorter {
+            buffer_bytes,
+            tmp_dir,
+            phantom: PhantomData,
+            settings,
+        }
+    }
+
+    /// Sort the lines provided by `unsorted`, ordered according to the sorter's
+    /// settings, and return an iterator over the sorted result
+    ///
+    /// # Errors
+    ///
+    /// This method can fail due to issues writing intermediate sorted chunks
+    /// to disk, or due to serde serialization issues
+    pub fn sort_by<I>(
+        &self,
+        unsorted: I,
+        settings: GlobalSettings,
+    ) -> Result<ExtSortedIterator<Line>, Box<dyn Error>>
+    where
+        I: Iterator<Item = Line>,
+    {
+        let tmp_dir = match self.tmp_dir {
+            Some(ref p) => TempDir::new_in(p, "uutils_sort")?,
+            None => TempDir::new("uutils_sort")?,
+        };
+        // creating the thing we need to return first due to the fact that we need to
+        // borrow tmp_dir and move it out
+        let mut iter = ExtSortedIterator {
+            buffers: Vec::new(),
+            chunk_offsets: Vec::new(),
+            max_per_chunk: 0,
+            chunks: 0,
+            tmp_dir,
+            settings,
+            failed: false,
+        };
+
+        {
+            let mut total_read = 0;
+            let mut chunk = Vec::new();
+            // Initial buffer is specified by user
+            let mut adjusted_buffer_size = self.buffer_bytes;
+            let (iter_size, _) = unsorted.size_hint();
+
+            // make the initial chunks on disk
+            for seq in unsorted {
+                let seq_size = seq.get_size();
+                total_read += seq_size;
+
+                // GNU minimum is 16 * (sizeof struct + 2), but GNU uses about
+                // 1/10 the memory that we do.  And GNU even says in the code it may
+                // not work on small buffer sizes.
+                //
+                // The following seems to work pretty well, and has about the same max
+                // RSS as lower minimum values.
+                //
+                let minimum_buffer_size: u64 = iter_size as u64 * seq_size / 8;
+
+                adjusted_buffer_size =
+                    // Grow buffer size for a struct/Line larger than buffer
+                    if adjusted_buffer_size < seq_size {
+                        seq_size
+                    } else if adjusted_buffer_size < minimum_buffer_size {
+                        minimum_buffer_size
+                    } else {
+                        adjusted_buffer_size
+                    };
+                chunk.push(seq);
+
+                if total_read >= adjusted_buffer_size {
+                    super::sort_by(&mut chunk, &self.settings);
+                    self.write_chunk(
+                        &iter.tmp_dir.path().join(iter.chunks.to_string()),
+                        &mut chunk,
+                    )?;
+                    chunk.clear();
+                    total_read = 0;
+                    iter.chunks += 1;
+                }
+            }
+            // write the last chunk
+            if chunk.len() > 0 {
+                super::sort_by(&mut chunk, &self.settings);
+                self.write_chunk(
+                    &iter.tmp_dir.path().join(iter.chunks.to_string()),
+                    &mut chunk,
+                )?;
+                iter.chunks += 1;
+            }
+
+            // initialize buffers for each chunk
+            //
+            // Having a right sized buffer for each chunk for smallish values seems silly to me?
+            //
+            // We will have to have the entire iter in memory sometime right?
+            // Set minimum to the size of the writer buffer, ~8K
+            //
+            const MINIMUM_READBACK_BUFFER: u64 = 8200;
+            let right_sized_buffer = adjusted_buffer_size
+                .checked_div(iter.chunks)
+                .unwrap_or(adjusted_buffer_size);
+            iter.max_per_chunk = if right_sized_buffer > MINIMUM_READBACK_BUFFER {
+                right_sized_buffer
+            } else {
+                MINIMUM_READBACK_BUFFER
+            };
+            iter.buffers = vec![VecDeque::new(); iter.chunks as usize];
+            iter.chunk_offsets = vec![0 as u64; iter.chunks as usize];
+            for chunk_num in 0..iter.chunks {
+                let offset = fill_buff(
+                    &mut iter.buffers[chunk_num as usize],
+                    File::open(iter.tmp_dir.path().join(chunk_num.to_string()))?,
+                    iter.max_per_chunk,
+                )?;
+                iter.chunk_offsets[chunk_num as usize] = offset;
+            }
+        }
+
+        Ok(iter)
+    }
+
+    fn write_chunk(&self, file: &PathBuf, chunk: &mut Vec<Line>) -> Result<(), Box<dyn Error>> {
+        let new_file = OpenOptions::new().create(true).append(true).open(file)?;
+        let mut buf_write = Box::new(BufWriter::new(new_file)) as Box<dyn Write>;
+        for s in chunk {
+            let mut serialized = serde_json::to_string(&s).expect("JSON write error: ");
+            serialized.push_str("\n");
+            buf_write.write(serialized.as_bytes())?;
+        }
+        buf_write.flush()?;
+
+        Ok(())
+    }
+}
+
+fn fill_buff<Line>(
+    vec: &mut VecDeque<Line>,
+    file: File,
+    max_bytes: u64,
+) -> Result<u64, Box<dyn Error>>
+where
+    Line: ExternallySortable,
+{
+    let mut total_read = 0;
+    let mut bytes_read = 0;
+    for line in BufReader::new(file).lines() {
+        let line_s = line?;
+        bytes_read += line_s.len() + 1;
+        // Deserialization is where failures usually show up; invalid JSON panics here
+        let deserialized: Line = serde_json::from_str(&line_s).expect("JSON read error: ");
+        total_read += deserialized.get_size();
+        vec.push_back(deserialized);
+        if total_read > max_bytes {
+            break;
+        }
+    }
+
+    Ok(bytes_read as u64)
+}
--- a/src/uu/sort/src/numeric_str_cmp.rs
+++ b/src/uu/sort/src/numeric_str_cmp.rs
@ -14,20 +14,21 @@
 //! More specifically, exponent can be understood so that the original number is in (1..10)*10^exponent.
 //! From that follows the constraints of this algorithm: It is able to compare numbers in ±(1*10^[i64::MIN]..10*10^[i64::MAX]).

+use serde::{Deserialize, Serialize};
 use std::{cmp::Ordering, ops::Range};

-#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)]
+#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize, Clone)]
 enum Sign {
    Negative,
    Positive,
 }

-#[derive(Debug, PartialEq)]
+#[derive(Debug, PartialEq, Serialize, Deserialize, Clone)]
 pub struct NumInfo {
    exponent: i64,
    sign: Sign,
 }
-
+#[derive(Debug, PartialEq, Serialize, Deserialize, Clone)]
 pub struct NumInfoParseSettings {
    pub accept_si_units: bool,
    pub thousands_separator: Option<char>,
--- a/src/uu/sort/src/sort.rs
+++ b/src/uu/sort/src/sort.rs
@ -15,9 +15,11 @@
 #[macro_use]
 extern crate uucore;

+mod external_sort;
 mod numeric_str_cmp;

 use clap::{App, Arg};
+use external_sort::{ExternalSorter, ExternallySortable};
 use fnv::FnvHasher;
 use itertools::Itertools;
 use numeric_str_cmp::{numeric_str_cmp, NumInfo, NumInfoParseSettings};
@ -25,6 +27,7 @@ use rand::distributions::Alphanumeric;
 use rand::{thread_rng, Rng};
 use rayon::prelude::*;
 use semver::Version;
+use serde::{Deserialize, Serialize};
 use smallvec::SmallVec;
 use std::cmp::Ordering;
 use std::collections::BinaryHeap;
@ -35,6 +38,7 @@ use std::io::{stdin, stdout, BufRead, BufReader, BufWriter, Lines, Read, Write};
 use std::mem::replace;
 use std::ops::Range;
 use std::path::Path;
+use std::path::PathBuf;
 use unicode_width::UnicodeWidthStr;
 use uucore::fs::is_stdin_interactive; // for Iterator::dedup()
 use uucore::InvalidEncodingHandling;
@ -77,6 +81,8 @@ static OPT_RANDOM: &str = "random-sort";
 static OPT_ZERO_TERMINATED: &str = "zero-terminated";
 static OPT_PARALLEL: &str = "parallel";
 static OPT_FILES0_FROM: &str = "files0-from";
+static OPT_BUF_SIZE: &str = "buffer-size";
+static OPT_TMP_DIR: &str = "temporary-directory";

 static ARG_FILES: &str = "files";

@ -86,6 +92,8 @@ static THOUSANDS_SEP: char = ',';
 static NEGATIVE: char = '-';
 static POSITIVE: char = '+';

+static DEFAULT_BUF_SIZE: usize = std::usize::MAX;
+
 #[derive(Eq, Ord, PartialEq, PartialOrd, Clone, Copy)]
 enum SortMode {
    Numeric,
@ -95,7 +103,7 @@ enum SortMode {
    Version,
    Default,
 }
-
+#[derive(Clone)]
 struct GlobalSettings {
    mode: SortMode,
    debug: bool,
@ -116,6 +124,31 @@ struct GlobalSettings {
    separator: Option<char>,
    threads: String,
    zero_terminated: bool,
+    buffer_size: usize,
+    tmp_dir: PathBuf,
+    ext_sort: bool,
+}
+
+impl GlobalSettings {
+    // Converts a human-readable size given on the command line (e.g. "50K").
+    // TODO: this could probably reuse the parsing in numeric_str_cmp.
+    fn human_numeric_convert(a: &str) -> usize {
+        let num_str = &a[get_leading_gen(a)];
+        let (_, suf_str) = a.split_at(num_str.len());
+        let num_usize = num_str
+            .parse::<usize>()
+            .expect("Error parsing buffer size: ");
+        let suf_usize: usize = match suf_str.to_uppercase().as_str() {
+            // SI Units
+            "B" => 1usize,
+            "K" => 1000usize,
+            "M" => 1000000usize,
+            "G" => 1000000000usize,
+            // GNU regards empty human numeric values as K by default
+            _ => 1000usize,
+        };
+        num_usize * suf_usize
+    }
 }

 impl Default for GlobalSettings {
@ -140,10 +173,13 @@ impl Default for GlobalSettings {
            separator: None,
            threads: String::new(),
            zero_terminated: false,
+            buffer_size: DEFAULT_BUF_SIZE,
+            tmp_dir: PathBuf::new(),
+            ext_sort: false,
        }
    }
 }
-
+#[derive(Clone)]
 struct KeySettings {
    mode: SortMode,
    ignore_blanks: bool,
@ -168,6 +204,7 @@ impl From<&GlobalSettings> for KeySettings {
    }
 }

+#[derive(Debug, Serialize, Deserialize, Clone)]
 /// Represents the string selected by a FieldSelector.
 enum SelectionRange {
    /// If we had to transform this selection, we have to store a new string.
@ -199,6 +236,7 @@ impl SelectionRange {
    }
 }

+#[derive(Serialize, Deserialize, Clone)]
 enum NumCache {
    AsF64(GeneralF64ParseResult),
    WithInfo(NumInfo),
@ -219,7 +257,7 @@ impl NumCache {
        }
    }
 }
-
+#[derive(Serialize, Deserialize, Clone)]
 struct Selection {
    range: SelectionRange,
    num_cache: NumCache,
@ -234,12 +272,20 @@ impl Selection {

 type Field = Range<usize>;

+#[derive(Serialize, Deserialize, Clone)]
 struct Line {
    line: String,
    // The common case is not to specify fields. Let's make this fast.
    selections: SmallVec<[Selection; 1]>,
 }

+impl ExternallySortable for Line {
+    fn get_size(&self) -> u64 {
+        // Currently 96 bytes, but that could change, so we get that size here
+        std::mem::size_of::<Line>() as u64
+    }
+}
+
 impl Line {
    fn new(line: String, settings: &GlobalSettings) -> Self {
        let fields = if settings
@ -489,6 +535,7 @@ fn tokenize_with_separator(line: &str, separator: char) -> Vec<Field> {
    tokens
 }

+#[derive(Clone)]
 struct KeyPosition {
    /// 1-indexed, 0 is invalid.
    field: usize,
@ -578,7 +625,7 @@ impl KeyPosition {
        }
    }
 }
-
+#[derive(Clone)]
 struct FieldSelector {
    from: KeyPosition,
    to: Option<KeyPosition>,
@ -912,6 +959,22 @@ pub fn uumain(args: impl uucore::Args) -> i32 {
                .takes_value(true)
                .value_name("NUM_THREADS"),
        )
+        .arg(
+            Arg::with_name(OPT_BUF_SIZE)
+                .short("S")
+                .long(OPT_BUF_SIZE)
+                .help("sets the maximum SIZE of each segment in number of sorted items")
+                .takes_value(true)
+                .value_name("SIZE"),
+        )
+        .arg(
+            Arg::with_name(OPT_TMP_DIR)
+                .short("T")
+                .long(OPT_TMP_DIR)
+                .help("use DIR for temporaries, not $TMPDIR or /tmp")
+                .takes_value(true)
+                .value_name("DIR"),
+        )
        .arg(
            Arg::with_name(OPT_FILES0_FROM)
                .long(OPT_FILES0_FROM)
@ -982,6 +1045,29 @@ pub fn uumain(args: impl uucore::Args) -> i32 {
        env::set_var("RAYON_NUM_THREADS", &settings.threads);
    }

+    if matches.is_present(OPT_BUF_SIZE) {
+        settings.buffer_size = {
+            let input = matches
+                .value_of(OPT_BUF_SIZE)
+                .map(String::from)
+                .unwrap_or(format!("{}", DEFAULT_BUF_SIZE));
+
+            GlobalSettings::human_numeric_convert(&input)
+        };
+        settings.ext_sort = true;
+    }
+
+    if matches.is_present(OPT_TMP_DIR) {
+        let result = matches
+            .value_of(OPT_TMP_DIR)
+            .map(String::from)
+            .unwrap_or(format!("{}", env::temp_dir().display()));
+        settings.tmp_dir = PathBuf::from(result);
+        settings.ext_sort = true;
+    } else {
+        settings.tmp_dir = env::temp_dir();
+    }
+
    settings.zero_terminated = matches.is_present(OPT_ZERO_TERMINATED);
    settings.merge = matches.is_present(OPT_MERGE);

@ -1066,10 +1152,10 @@ pub fn uumain(args: impl uucore::Args) -> i32 {
        });
    }

-    exec(files, &settings)
+    exec(files, settings)
 }

-fn exec(files: Vec<String>, settings: &GlobalSettings) -> i32 {
+fn exec(files: Vec<String>, settings: GlobalSettings) -> i32 {
    let mut lines = Vec::new();
    let mut file_merger = FileMerger::new(&settings);

@ -1105,6 +1191,13 @@ fn exec(files: Vec<String>, settings: &GlobalSettings) -> i32 {

    if settings.check {
        return exec_check_file(&lines, &settings);
+    }
+
+    // Only use the external sorter when we need to.
+    // It is probably faster not to create
+    // an owned value on each run
+    if settings.ext_sort {
+        lines = ext_sort_by(lines, settings.clone());
    } else {
        sort_by(&mut lines, &settings);
    }
@ -1112,7 +1205,7 @@ fn exec(files: Vec<String>, settings: &GlobalSettings) -> i32 {
    if settings.merge {
        if settings.unique {
            print_sorted(
-                file_merger.dedup_by(|a, b| compare_by(a, b, settings) == Ordering::Equal),
+                file_merger.dedup_by(|a, b| compare_by(a, b, &settings) == Ordering::Equal),
                &settings,
            )
        } else {
@ -1122,7 +1215,7 @@ fn exec(files: Vec<String>, settings: &GlobalSettings) -> i32 {
        print_sorted(
            lines
                .into_iter()
-                .dedup_by(|a, b| compare_by(a, b, settings) == Ordering::Equal),
+                .dedup_by(|a, b| compare_by(a, b, &settings) == Ordering::Equal),
            &settings,
        )
    } else {
@ -1164,11 +1257,25 @@ fn exec_check_file(unwrapped_lines: &[Line], settings: &GlobalSettings) -> i32 {
    }
 }

-fn sort_by(lines: &mut Vec<Line>, settings: &GlobalSettings) {
+fn ext_sort_by(unsorted: Vec<Line>, settings: GlobalSettings) -> Vec<Line> {
+    let external_sorter = ExternalSorter::new(
+        settings.buffer_size as u64,
+        Some(settings.tmp_dir.clone()),
+        settings.clone(),
+    );
+    let iter = external_sorter
+        .sort_by(unsorted.into_iter(), settings.clone())
+        .unwrap()
+        .map(|x| x.unwrap())
+        .collect::<Vec<Line>>();
+    iter
+}
+
+fn sort_by(unsorted: &mut Vec<Line>, settings: &GlobalSettings) {
    if settings.stable || settings.unique {
-        lines.par_sort_by(|a, b| compare_by(a, b, &settings))
+        unsorted.par_sort_by(|a, b| compare_by(a, b, &settings))
    } else {
-        lines.par_sort_unstable_by(|a, b| compare_by(a, b, &settings))
+        unsorted.par_sort_unstable_by(|a, b| compare_by(a, b, &settings))
    }
 }

@ -1189,8 +1296,8 @@ fn compare_by(a: &Line, b: &Line, global_settings: &GlobalSettings) -> Ordering
                    (b_str, b_selection.num_cache.as_num_info()),
                ),
                SortMode::GeneralNumeric => general_numeric_compare(
-                    a_selection.num_cache.as_f64(),
-                    b_selection.num_cache.as_f64(),
+                    general_f64_parse(&a_str[get_leading_gen(a_str)]),
+                    general_f64_parse(&b_str[get_leading_gen(b_str)]),
                ),
                SortMode::Month => month_compare(a_str, b_str),
                SortMode::Version => version_compare(a_str, b_str),
@ -1268,7 +1375,7 @@ fn get_leading_gen(input: &str) -> Range<usize> {
    leading_whitespace_len..input.len()
 }

-#[derive(Copy, Clone, PartialEq, PartialOrd)]
+#[derive(Serialize, Deserialize, Copy, Clone, PartialEq, PartialOrd)]
 enum GeneralF64ParseResult {
    Invalid,
    NaN,
--- a/tests/by-util/test_sort.rs
+++ b/tests/by-util/test_sort.rs
@ -15,6 +15,31 @@ fn test_helper(file_name: &str, args: &str) {
        .stdout_is_fixture(format!("{}.expected.debug", file_name));
 }

+// FYI, the initialization size of our Line struct is 96 bytes.
+//  
+// At very small buffer sizes, with that overhead we are certainly going
+// to overrun our buffer far too quickly because of these excess
+// bytes for the struct.
+//
+// For instance, seq 0..20000 > ...text = 108894 bytes
+// But overhead is 1920000 + 108894 = 2028894 bytes
+//
+// Or kjvbible-random.txt = 4332506 bytes, but minimum size of its 
+// 99817 lines in memory * 96 bytes = 9582432 bytes
+//
+// Here, we test 108894 bytes with a 50K buffer
+//
+#[test]
+fn test_larger_than_specified_segment() {
+    new_ucmd!()
+        .arg("-n")
+        .arg("-S")
+        .arg("50K")
+        .arg("ext_sort.txt")
+        .succeeds()
+        .stdout_is_fixture(format!("{}", "ext_sort.expected"));
+}
+
 #[test]
 fn test_months_whitespace() {
    test_helper("months-whitespace", "-M");
@ -34,6 +59,18 @@ fn test_human_numeric_whitespace() {
    test_helper("human-numeric-whitespace", "-h");
 }

+// This tests a case where serde often fails when reading back JSON
+// if it finds a null value
+#[test]
+fn test_extsort_as64_bailout() {
+    new_ucmd!()
+        .arg("-g")
+        .arg("-S 5K")
+        .arg("multiple_decimals_general.txt")
+        .succeeds()
+        .stdout_is_fixture("multiple_decimals_general.expected");
+}
+
 #[test]
 fn test_multiple_decimals_general() {
    test_helper("multiple_decimals_general", "-g")
--- a/tests/fixtures/sort/ext_sort.expected
+++ b/tests/fixtures/sort/ext_sort.expected
--- a/tests/fixtures/sort/ext_sort.txt
+++ b/tests/fixtures/sort/ext_sort.txt