From e6c195a675eb9043ad7ee1668e784de2387bf21b Mon Sep 17 00:00:00 2001 From: electricboogie <32370782+electricboogie@users.noreply.github.com> Date: Mon, 12 Apr 2021 14:24:22 -0500 Subject: [PATCH] ExtSort --- Cargo.lock | 139 +++++++++++++++++++++++++++++++++++++++- src/uu/sort/Cargo.toml | 6 +- src/uu/sort/src/sort.rs | 96 +++++++++++++++++++++++---- 3 files changed, 223 insertions(+), 18 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d45e41c16..052d6de40 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -119,12 +119,40 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "560c32574a12a89ecd91f5e742165893f86e3ab98d21f8ea548658eb9eef5f40" +[[package]] +name = "bytecount" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72feb31ffc86498dacdbd0fcebb56138e7177a8cc5cea4516031d15ae85a742e" + [[package]] name = "byteorder" version = "1.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" +[[package]] +name = "cargo-platform" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0226944a63d1bf35a3b5f948dd7c59e263db83695c9e8bffc4037de02e30f1d7" +dependencies = [ + "serde", +] + +[[package]] +name = "cargo_metadata" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7714a157da7991e23d90686b9524b9e12e0407a108647f52e9328f4b3d51ac7f" +dependencies = [ + "cargo-platform", + "semver 0.11.0", + "semver-parser 0.10.2", + "serde", + "serde_json", +] + [[package]] name = "cast" version = "0.2.3" @@ -560,6 +588,26 @@ dependencies = [ "regex", ] +[[package]] +name = "error-chain" +version = "0.12.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d2f06b9cac1506ece98fe3231e3cc9c4410ec3d5b1f24ae1c8946f0742cdefc" +dependencies = [ + "version_check", +] + +[[package]] +name = "extsort" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ffc5bb6fbca3c5ce6a51f6857eab8c35c898b2fbcb62ff1b728243dd19ec0c9f" +dependencies = [ + "rayon", + "skeptic", + "tempfile", +] + [[package]] name = "fake-simd" version = "0.1.2" @@ -944,6 +992,15 @@ dependencies = [ "proc-macro-hack", ] +[[package]] +name = "pest" +version = "2.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "10f4872ae94d7b90ae48754df22fd42ad52ce740b8f370b03da4835417403e53" +dependencies = [ + "ucd-trie", +] + [[package]] name = "pkg-config" version = "0.3.19" @@ -1009,6 +1066,17 @@ dependencies = [ "unicode-xid 0.2.1", ] +[[package]] +name = "pulldown-cmark" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ffade02495f22453cd593159ea2f59827aae7f53fa8323f756799b670881dcf8" +dependencies = [ + "bitflags", + "memchr 2.3.4", + "unicase", +] + [[package]] name = "quick-error" version = "1.2.3" @@ -1245,7 +1313,7 @@ version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "138e3e0acb6c9fb258b19b67cb8abd63c00679d2851805ea151465464fe9030a" dependencies = [ - "semver", + "semver 0.9.0", ] [[package]] @@ -1275,7 +1343,17 @@ version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1d7eb9ef2c18661902cc47e535f9bc51b78acd254da71d375c2f6720d9a40403" dependencies = [ - "semver-parser", + "semver-parser 0.7.0", +] + +[[package]] +name = "semver" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f301af10236f6df4160f7c3f04eec6dbc70ace82d23326abad5edee88801c6b6" +dependencies = [ + "semver-parser 0.10.2", + "serde", ] [[package]] @@ -1284,11 +1362,23 @@ version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3" +[[package]] +name = "semver-parser" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00b0bef5b7f9e0df16536d3961cfb6e84331c065b4066afb39768d0e319411f7" +dependencies = [ + "pest", +] + [[package]] name = "serde" version = "1.0.125" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "558dc50e1a5a5fa7112ca2ce4effcb321b0300c0d4ccf0776a9f60cd89031171" +dependencies = [ + "serde_derive", +] [[package]] name = "serde_cbor" @@ -1353,6 +1443,21 @@ dependencies = [ "generic-array", ] +[[package]] +name = "skeptic" +version = "0.13.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "188b810342d98f23f0bb875045299f34187b559370b041eb11520c905370a888" +dependencies = [ + "bytecount", + "cargo_metadata", + "error-chain", + "glob 0.3.0", + "pulldown-cmark", + "tempfile", + "walkdir", +] + [[package]] name = "smallvec" version = "0.6.14" @@ -1367,6 +1472,9 @@ name = "smallvec" version = "1.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fe0f37c9e8f3c5a4a66ad655a93c74daac4ad00c441533bf5c6e7990bb42604e" +dependencies = [ + "serde", +] [[package]] name = "strsim" @@ -1528,6 +1636,21 @@ version = "1.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "879f6906492a7cd215bfa4cf595b600146ccfac0c79bcbd1f3000162af5e8b06" +[[package]] +name = "ucd-trie" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56dee185309b50d1f11bfedef0fe6d036842e3fb77413abef29f8f8d1c5d4c1c" + +[[package]] +name = "unicase" +version = "2.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50f37be617794602aabbeee0be4f259dc1778fabe05e2d67ee8f79326d5cb4f6" +dependencies = [ + "version_check", +] + [[package]] name = "unicode-segmentation" version = "1.7.1" @@ -2289,12 +2412,16 @@ dependencies = [ name = "uu_sort" version = "0.0.6" dependencies = [ + "byteorder", "clap", + "extsort", "fnv", "itertools 0.10.0", "rand 0.7.3", "rayon", - "semver", + "semver 0.9.0", + "serde", + "serde_json", "smallvec 1.6.1", "uucore", "uucore_procs", @@ -2604,6 +2731,12 @@ version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191" +[[package]] +name = "version_check" +version = "0.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fecdca9a5291cc2b8dcf7dc02453fee791a280f3743cb0905f8822ae463b3fe" + [[package]] name = "void" version = "1.0.2" diff --git a/src/uu/sort/Cargo.toml b/src/uu/sort/Cargo.toml index 6a9976278..8ad0a681f 100644 --- a/src/uu/sort/Cargo.toml +++ b/src/uu/sort/Cargo.toml @@ -15,13 +15,17 @@ edition = "2018" path = "src/sort.rs" [dependencies] +byteorder = "1.4.3" +extsort = "0.4.2" +serde_json = { version = "1.0", default-features = false, features = ["alloc"] } +serde = { version = "1.0", features = ["derive"] } rayon = "1.5" rand = "0.7" clap = "2.33" fnv = "1.0.7" itertools = "0.10.0" semver = "0.9.0" -smallvec = "1.6.1" +smallvec = { version = "1.6.1", features = ["serde"] } uucore = { version=">=0.0.8", package="uucore", path="../../uucore", features=["fs"] } uucore_procs = { version=">=0.0.5", package="uucore_procs", path="../../uucore_procs" } diff --git a/src/uu/sort/src/sort.rs b/src/uu/sort/src/sort.rs index 8bf6eb1e8..cb07f60b7 100644 --- a/src/uu/sort/src/sort.rs +++ b/src/uu/sort/src/sort.rs @@ -20,7 +20,6 @@ use fnv::FnvHasher; use itertools::Itertools; use rand::distributions::Alphanumeric; use rand::{thread_rng, Rng}; -use rayon::prelude::*; use semver::Version; use smallvec::SmallVec; use std::borrow::Cow; @@ -34,6 +33,14 @@ use std::mem::replace; use std::ops::{Range, RangeInclusive}; use std::path::Path; use uucore::fs::is_stdin_interactive; // for Iterator::dedup() +use extsort::*; +use std::str; +use serde::{Serialize, Deserialize}; +use std::ffi::OsString; +use std::usize; +use std::path::PathBuf; +use std::string::*; +use serde_json::Result; static NAME: &str = "sort"; static ABOUT: &str = "Display sorted concatenation of all FILE(s)."; @@ -72,6 +79,8 @@ static OPT_RANDOM: &str = "random-sort"; static OPT_ZERO_TERMINATED: &str = "zero-terminated"; static OPT_PARALLEL: &str = "parallel"; static OPT_FILES0_FROM: &str = "files0-from"; +static OPT_BUF_SIZE: &str = "buffer-size"; +static OPT_TMP_DIR: &str = "temporary-directory"; static ARG_FILES: &str = "files"; @@ -110,6 +119,8 @@ struct GlobalSettings { separator: Option, threads: String, zero_terminated: bool, + buffer_size: usize, + tmp_dir: PathBuf, } impl Default for GlobalSettings { @@ -133,6 +144,8 @@ impl Default for GlobalSettings { separator: None, threads: String::new(), zero_terminated: false, + buffer_size: 10000000usize, + tmp_dir: PathBuf::from(r"/tmp"), } } } @@ -162,7 +175,7 @@ impl From<&GlobalSettings> for KeySettings { } /// Represents the string selected by a FieldSelector. -#[derive(Debug)] +#[derive(Debug, Serialize, Deserialize, Clone)] enum Selection { /// If we had to transform this selection, we have to store a new string. String(String), @@ -182,13 +195,29 @@ impl Selection { type Field = Range; -#[derive(Debug)] +#[derive(Serialize, Deserialize, Debug, Clone)] struct Line { line: String, // The common case is not to specify fields. Let's make this fast. selections: SmallVec<[Selection; 1]>, } +impl Sortable for Line { + fn encode(&self, write: &mut W) { + let line = Line { line: self.line.clone(), selections: self.selections.clone() } ; + let serialized = serde_json::to_string(&line).unwrap(); + write.write_all(serialized.as_bytes()).unwrap(); + } + + fn decode(read: &mut R) -> Option { + let mut buf = String::new(); + read.read_to_string(&mut buf).ok(); + let line: Option = buf; + println!("deserialized = {:?}", line); + line + } +} + impl Line { fn new(line: String, settings: &GlobalSettings) -> Self { let fields = if settings @@ -681,6 +710,20 @@ pub fn uumain(args: impl uucore::Args) -> i32 { .takes_value(true) .value_name("NUM_THREADS"), ) + .arg( + Arg::with_name(OPT_BUF_SIZE) + .long(OPT_BUF_SIZE) + .help("sets the maximum SIZE of each segment in number of sorted items") + .takes_value(true) + .value_name("SIZE"), + ) + .arg( + Arg::with_name(OPT_TMP_DIR) + .long(OPT_TMP_DIR) + .help("use DIR for temporaries, not $TMPDIR or /tmp") + .takes_value(true) + .value_name("DIR"), + ) .arg( Arg::with_name(OPT_FILES0_FROM) .long(OPT_FILES0_FROM) @@ -744,6 +787,32 @@ pub fn uumain(args: impl uucore::Args) -> i32 { env::set_var("RAYON_NUM_THREADS", &settings.threads); } + if matches.is_present(OPT_BUF_SIZE) { + // 10000 is the default extsort buffer, but it's too small + settings.buffer_size = matches + .value_of(OPT_BUF_SIZE) + .map(String::from) + .unwrap_or( format! ( "{}", 10000000usize ) ) + .parse::() + .unwrap_or(10000000usize); + } + + if matches.is_present(OPT_TMP_DIR) { + let result = matches + .value_of(OPT_TMP_DIR) + .map(String::from) + .unwrap_or("/tmp".to_owned() ); + settings.tmp_dir = PathBuf::from(format!(r"{}", result)); + } else { + for (key, value) in env::vars_os() { + if key == OsString::from("TMPDIR") { + settings.tmp_dir = PathBuf::from(format!(r"{}", value.into_string().unwrap_or("/tmp".to_owned()))); + break + } + settings.tmp_dir = PathBuf::from(r"/tmp"); + } + } + settings.zero_terminated = matches.is_present(OPT_ZERO_TERMINATED); settings.merge = matches.is_present(OPT_MERGE); @@ -860,9 +929,9 @@ fn exec(files: Vec, settings: &GlobalSettings) -> i32 { if settings.check { return exec_check_file(&lines, &settings); - } else { - sort_by(&mut lines, &settings); } + + lines = sort_by(lines, &settings); if settings.merge { if settings.unique { @@ -917,8 +986,9 @@ fn exec_check_file(unwrapped_lines: &[Line], settings: &GlobalSettings) -> i32 { } } -fn sort_by(lines: &mut Vec, settings: &GlobalSettings) { - lines.par_sort_by(|a, b| compare_by(a, b, &settings)) +fn sort_by(lines: Vec, settings: &GlobalSettings) -> Vec { + let sorter = ExternalSorter::new().with_segment_size(settings.buffer_size).with_sort_dir(settings.tmp_dir.clone()).with_parallel_sort(); + sorter.sort_by(lines.into_iter(), |a, b| compare_by(a, b, &settings)).unwrap().collect() } fn compare_by(a: &Line, b: &Line, global_settings: &GlobalSettings) -> Ordering { @@ -1004,7 +1074,6 @@ fn leading_num_common(a: &str) -> &str { // not recognize a positive sign or scientific/E notation so we strip those elements here. fn get_leading_num(a: &str) -> &str { let mut s = ""; - let a = leading_num_common(a); // GNU numeric sort doesn't recognize '+' or 'e' notation so we strip @@ -1019,9 +1088,7 @@ fn get_leading_num(a: &str) -> &str { // And empty number or non-number lines are to be treated as ‘0’ but only for numeric sort // All '0'-ed lines will be sorted later, but only amongst themselves, during the so-called 'last resort comparison.' - if s.is_empty() { - s = "0"; - }; + if s.is_empty() { s = "0"; }; s } @@ -1087,8 +1154,8 @@ fn permissive_f64_parse(a: &str) -> f64 { // Remove any trailing decimals, ie 4568..890... becomes 4568.890 // Then, we trim whitespace and parse match remove_trailing_dec(a).trim().parse::() { - Ok(a) if a.is_nan() => std::f64::NEG_INFINITY, - Ok(a) => a, + Ok(val) if val.is_nan() => std::f64::NEG_INFINITY, + Ok(val) => val, Err(_) => std::f64::NEG_INFINITY, } } @@ -1107,7 +1174,6 @@ fn numeric_compare(a: &str, b: &str) -> Ordering { let fa = permissive_f64_parse(&ta); let fb = permissive_f64_parse(&tb); - // f64::cmp isn't implemented (due to NaN issues); implement directly instead if fa > fb { Ordering::Greater } else if fa < fb { @@ -1150,6 +1216,7 @@ fn human_numeric_convert(a: &str) -> f64 { let num_part = permissive_f64_parse(&num_str); let suffix: f64 = match suffix.parse().unwrap_or('\0') { // SI Units + 'b' => 1f64, 'K' => 1E3, 'M' => 1E6, 'G' => 1E9, @@ -1262,6 +1329,7 @@ fn month_compare(a: &str, b: &str) -> Ordering { } } +#[inline(always)] fn version_parse(a: &str) -> Version { let result = Version::parse(a);