mirror of
https://github.com/RGBCube/uutils-coreutils
synced 2025-07-31 04:57:45 +00:00
ExtSort
This commit is contained in:
parent
c6021e10c2
commit
e6c195a675
3 changed files with 223 additions and 18 deletions
139
Cargo.lock
generated
139
Cargo.lock
generated
|
@ -119,12 +119,40 @@ version = "0.2.0"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "560c32574a12a89ecd91f5e742165893f86e3ab98d21f8ea548658eb9eef5f40"
|
||||
|
||||
[[package]]
|
||||
name = "bytecount"
|
||||
version = "0.6.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "72feb31ffc86498dacdbd0fcebb56138e7177a8cc5cea4516031d15ae85a742e"
|
||||
|
||||
[[package]]
|
||||
name = "byteorder"
|
||||
version = "1.4.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610"
|
||||
|
||||
[[package]]
|
||||
name = "cargo-platform"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0226944a63d1bf35a3b5f948dd7c59e263db83695c9e8bffc4037de02e30f1d7"
|
||||
dependencies = [
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cargo_metadata"
|
||||
version = "0.12.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7714a157da7991e23d90686b9524b9e12e0407a108647f52e9328f4b3d51ac7f"
|
||||
dependencies = [
|
||||
"cargo-platform",
|
||||
"semver 0.11.0",
|
||||
"semver-parser 0.10.2",
|
||||
"serde",
|
||||
"serde_json",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cast"
|
||||
version = "0.2.3"
|
||||
|
@ -560,6 +588,26 @@ dependencies = [
|
|||
"regex",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "error-chain"
|
||||
version = "0.12.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2d2f06b9cac1506ece98fe3231e3cc9c4410ec3d5b1f24ae1c8946f0742cdefc"
|
||||
dependencies = [
|
||||
"version_check",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "extsort"
|
||||
version = "0.4.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ffc5bb6fbca3c5ce6a51f6857eab8c35c898b2fbcb62ff1b728243dd19ec0c9f"
|
||||
dependencies = [
|
||||
"rayon",
|
||||
"skeptic",
|
||||
"tempfile",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "fake-simd"
|
||||
version = "0.1.2"
|
||||
|
@ -944,6 +992,15 @@ dependencies = [
|
|||
"proc-macro-hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pest"
|
||||
version = "2.1.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "10f4872ae94d7b90ae48754df22fd42ad52ce740b8f370b03da4835417403e53"
|
||||
dependencies = [
|
||||
"ucd-trie",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pkg-config"
|
||||
version = "0.3.19"
|
||||
|
@ -1009,6 +1066,17 @@ dependencies = [
|
|||
"unicode-xid 0.2.1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pulldown-cmark"
|
||||
version = "0.8.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ffade02495f22453cd593159ea2f59827aae7f53fa8323f756799b670881dcf8"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"memchr 2.3.4",
|
||||
"unicase",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quick-error"
|
||||
version = "1.2.3"
|
||||
|
@ -1245,7 +1313,7 @@ version = "0.2.3"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "138e3e0acb6c9fb258b19b67cb8abd63c00679d2851805ea151465464fe9030a"
|
||||
dependencies = [
|
||||
"semver",
|
||||
"semver 0.9.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
@ -1275,7 +1343,17 @@ version = "0.9.0"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1d7eb9ef2c18661902cc47e535f9bc51b78acd254da71d375c2f6720d9a40403"
|
||||
dependencies = [
|
||||
"semver-parser",
|
||||
"semver-parser 0.7.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "semver"
|
||||
version = "0.11.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f301af10236f6df4160f7c3f04eec6dbc70ace82d23326abad5edee88801c6b6"
|
||||
dependencies = [
|
||||
"semver-parser 0.10.2",
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
@ -1284,11 +1362,23 @@ version = "0.7.0"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3"
|
||||
|
||||
[[package]]
|
||||
name = "semver-parser"
|
||||
version = "0.10.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "00b0bef5b7f9e0df16536d3961cfb6e84331c065b4066afb39768d0e319411f7"
|
||||
dependencies = [
|
||||
"pest",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "serde"
|
||||
version = "1.0.125"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "558dc50e1a5a5fa7112ca2ce4effcb321b0300c0d4ccf0776a9f60cd89031171"
|
||||
dependencies = [
|
||||
"serde_derive",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "serde_cbor"
|
||||
|
@ -1353,6 +1443,21 @@ dependencies = [
|
|||
"generic-array",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "skeptic"
|
||||
version = "0.13.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "188b810342d98f23f0bb875045299f34187b559370b041eb11520c905370a888"
|
||||
dependencies = [
|
||||
"bytecount",
|
||||
"cargo_metadata",
|
||||
"error-chain",
|
||||
"glob 0.3.0",
|
||||
"pulldown-cmark",
|
||||
"tempfile",
|
||||
"walkdir",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "smallvec"
|
||||
version = "0.6.14"
|
||||
|
@ -1367,6 +1472,9 @@ name = "smallvec"
|
|||
version = "1.6.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fe0f37c9e8f3c5a4a66ad655a93c74daac4ad00c441533bf5c6e7990bb42604e"
|
||||
dependencies = [
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "strsim"
|
||||
|
@ -1528,6 +1636,21 @@ version = "1.13.0"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "879f6906492a7cd215bfa4cf595b600146ccfac0c79bcbd1f3000162af5e8b06"
|
||||
|
||||
[[package]]
|
||||
name = "ucd-trie"
|
||||
version = "0.1.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "56dee185309b50d1f11bfedef0fe6d036842e3fb77413abef29f8f8d1c5d4c1c"
|
||||
|
||||
[[package]]
|
||||
name = "unicase"
|
||||
version = "2.6.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "50f37be617794602aabbeee0be4f259dc1778fabe05e2d67ee8f79326d5cb4f6"
|
||||
dependencies = [
|
||||
"version_check",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "unicode-segmentation"
|
||||
version = "1.7.1"
|
||||
|
@ -2289,12 +2412,16 @@ dependencies = [
|
|||
name = "uu_sort"
|
||||
version = "0.0.6"
|
||||
dependencies = [
|
||||
"byteorder",
|
||||
"clap",
|
||||
"extsort",
|
||||
"fnv",
|
||||
"itertools 0.10.0",
|
||||
"rand 0.7.3",
|
||||
"rayon",
|
||||
"semver",
|
||||
"semver 0.9.0",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"smallvec 1.6.1",
|
||||
"uucore",
|
||||
"uucore_procs",
|
||||
|
@ -2604,6 +2731,12 @@ version = "0.8.2"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191"
|
||||
|
||||
[[package]]
|
||||
name = "version_check"
|
||||
version = "0.9.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5fecdca9a5291cc2b8dcf7dc02453fee791a280f3743cb0905f8822ae463b3fe"
|
||||
|
||||
[[package]]
|
||||
name = "void"
|
||||
version = "1.0.2"
|
||||
|
|
|
@ -15,13 +15,17 @@ edition = "2018"
|
|||
path = "src/sort.rs"
|
||||
|
||||
[dependencies]
|
||||
byteorder = "1.4.3"
|
||||
extsort = "0.4.2"
|
||||
serde_json = { version = "1.0", default-features = false, features = ["alloc"] }
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
rayon = "1.5"
|
||||
rand = "0.7"
|
||||
clap = "2.33"
|
||||
fnv = "1.0.7"
|
||||
itertools = "0.10.0"
|
||||
semver = "0.9.0"
|
||||
smallvec = "1.6.1"
|
||||
smallvec = { version = "1.6.1", features = ["serde"] }
|
||||
uucore = { version=">=0.0.8", package="uucore", path="../../uucore", features=["fs"] }
|
||||
uucore_procs = { version=">=0.0.5", package="uucore_procs", path="../../uucore_procs" }
|
||||
|
||||
|
|
|
@ -20,7 +20,6 @@ use fnv::FnvHasher;
|
|||
use itertools::Itertools;
|
||||
use rand::distributions::Alphanumeric;
|
||||
use rand::{thread_rng, Rng};
|
||||
use rayon::prelude::*;
|
||||
use semver::Version;
|
||||
use smallvec::SmallVec;
|
||||
use std::borrow::Cow;
|
||||
|
@ -34,6 +33,14 @@ use std::mem::replace;
|
|||
use std::ops::{Range, RangeInclusive};
|
||||
use std::path::Path;
|
||||
use uucore::fs::is_stdin_interactive; // for Iterator::dedup()
|
||||
use extsort::*;
|
||||
use std::str;
|
||||
use serde::{Serialize, Deserialize};
|
||||
use std::ffi::OsString;
|
||||
use std::usize;
|
||||
use std::path::PathBuf;
|
||||
use std::string::*;
|
||||
use serde_json::Result;
|
||||
|
||||
static NAME: &str = "sort";
|
||||
static ABOUT: &str = "Display sorted concatenation of all FILE(s).";
|
||||
|
@ -72,6 +79,8 @@ static OPT_RANDOM: &str = "random-sort";
|
|||
static OPT_ZERO_TERMINATED: &str = "zero-terminated";
|
||||
static OPT_PARALLEL: &str = "parallel";
|
||||
static OPT_FILES0_FROM: &str = "files0-from";
|
||||
static OPT_BUF_SIZE: &str = "buffer-size";
|
||||
static OPT_TMP_DIR: &str = "temporary-directory";
|
||||
|
||||
static ARG_FILES: &str = "files";
|
||||
|
||||
|
@ -110,6 +119,8 @@ struct GlobalSettings {
|
|||
separator: Option<char>,
|
||||
threads: String,
|
||||
zero_terminated: bool,
|
||||
buffer_size: usize,
|
||||
tmp_dir: PathBuf,
|
||||
}
|
||||
|
||||
impl Default for GlobalSettings {
|
||||
|
@ -133,6 +144,8 @@ impl Default for GlobalSettings {
|
|||
separator: None,
|
||||
threads: String::new(),
|
||||
zero_terminated: false,
|
||||
buffer_size: 10000000usize,
|
||||
tmp_dir: PathBuf::from(r"/tmp"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -162,7 +175,7 @@ impl From<&GlobalSettings> for KeySettings {
|
|||
}
|
||||
|
||||
/// Represents the string selected by a FieldSelector.
|
||||
#[derive(Debug)]
|
||||
#[derive(Debug, Serialize, Deserialize, Clone)]
|
||||
enum Selection {
|
||||
/// If we had to transform this selection, we have to store a new string.
|
||||
String(String),
|
||||
|
@ -182,13 +195,29 @@ impl Selection {
|
|||
|
||||
type Field = Range<usize>;
|
||||
|
||||
#[derive(Debug)]
|
||||
#[derive(Serialize, Deserialize, Debug, Clone)]
|
||||
struct Line {
|
||||
line: String,
|
||||
// The common case is not to specify fields. Let's make this fast.
|
||||
selections: SmallVec<[Selection; 1]>,
|
||||
}
|
||||
|
||||
impl Sortable for Line {
|
||||
fn encode<W: Write>(&self, write: &mut W) {
|
||||
let line = Line { line: self.line.clone(), selections: self.selections.clone() } ;
|
||||
let serialized = serde_json::to_string(&line).unwrap();
|
||||
write.write_all(serialized.as_bytes()).unwrap();
|
||||
}
|
||||
|
||||
fn decode<R: Read>(read: &mut R) -> Option<Line> {
|
||||
let mut buf = String::new();
|
||||
read.read_to_string(&mut buf).ok();
|
||||
let line: Option<Line> = buf;
|
||||
println!("deserialized = {:?}", line);
|
||||
line
|
||||
}
|
||||
}
|
||||
|
||||
impl Line {
|
||||
fn new(line: String, settings: &GlobalSettings) -> Self {
|
||||
let fields = if settings
|
||||
|
@ -681,6 +710,20 @@ pub fn uumain(args: impl uucore::Args) -> i32 {
|
|||
.takes_value(true)
|
||||
.value_name("NUM_THREADS"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name(OPT_BUF_SIZE)
|
||||
.long(OPT_BUF_SIZE)
|
||||
.help("sets the maximum SIZE of each segment in number of sorted items")
|
||||
.takes_value(true)
|
||||
.value_name("SIZE"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name(OPT_TMP_DIR)
|
||||
.long(OPT_TMP_DIR)
|
||||
.help("use DIR for temporaries, not $TMPDIR or /tmp")
|
||||
.takes_value(true)
|
||||
.value_name("DIR"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name(OPT_FILES0_FROM)
|
||||
.long(OPT_FILES0_FROM)
|
||||
|
@ -744,6 +787,32 @@ pub fn uumain(args: impl uucore::Args) -> i32 {
|
|||
env::set_var("RAYON_NUM_THREADS", &settings.threads);
|
||||
}
|
||||
|
||||
if matches.is_present(OPT_BUF_SIZE) {
|
||||
// 10000 is the default extsort buffer, but it's too small
|
||||
settings.buffer_size = matches
|
||||
.value_of(OPT_BUF_SIZE)
|
||||
.map(String::from)
|
||||
.unwrap_or( format! ( "{}", 10000000usize ) )
|
||||
.parse::<usize>()
|
||||
.unwrap_or(10000000usize);
|
||||
}
|
||||
|
||||
if matches.is_present(OPT_TMP_DIR) {
|
||||
let result = matches
|
||||
.value_of(OPT_TMP_DIR)
|
||||
.map(String::from)
|
||||
.unwrap_or("/tmp".to_owned() );
|
||||
settings.tmp_dir = PathBuf::from(format!(r"{}", result));
|
||||
} else {
|
||||
for (key, value) in env::vars_os() {
|
||||
if key == OsString::from("TMPDIR") {
|
||||
settings.tmp_dir = PathBuf::from(format!(r"{}", value.into_string().unwrap_or("/tmp".to_owned())));
|
||||
break
|
||||
}
|
||||
settings.tmp_dir = PathBuf::from(r"/tmp");
|
||||
}
|
||||
}
|
||||
|
||||
settings.zero_terminated = matches.is_present(OPT_ZERO_TERMINATED);
|
||||
settings.merge = matches.is_present(OPT_MERGE);
|
||||
|
||||
|
@ -860,9 +929,9 @@ fn exec(files: Vec<String>, settings: &GlobalSettings) -> i32 {
|
|||
|
||||
if settings.check {
|
||||
return exec_check_file(&lines, &settings);
|
||||
} else {
|
||||
sort_by(&mut lines, &settings);
|
||||
}
|
||||
|
||||
lines = sort_by(lines, &settings);
|
||||
|
||||
if settings.merge {
|
||||
if settings.unique {
|
||||
|
@ -917,8 +986,9 @@ fn exec_check_file(unwrapped_lines: &[Line], settings: &GlobalSettings) -> i32 {
|
|||
}
|
||||
}
|
||||
|
||||
fn sort_by(lines: &mut Vec<Line>, settings: &GlobalSettings) {
|
||||
lines.par_sort_by(|a, b| compare_by(a, b, &settings))
|
||||
fn sort_by(lines: Vec<Line>, settings: &GlobalSettings) -> Vec<Line> {
|
||||
let sorter = ExternalSorter::new().with_segment_size(settings.buffer_size).with_sort_dir(settings.tmp_dir.clone()).with_parallel_sort();
|
||||
sorter.sort_by(lines.into_iter(), |a, b| compare_by(a, b, &settings)).unwrap().collect()
|
||||
}
|
||||
|
||||
fn compare_by(a: &Line, b: &Line, global_settings: &GlobalSettings) -> Ordering {
|
||||
|
@ -1004,7 +1074,6 @@ fn leading_num_common(a: &str) -> &str {
|
|||
// not recognize a positive sign or scientific/E notation so we strip those elements here.
|
||||
fn get_leading_num(a: &str) -> &str {
|
||||
let mut s = "";
|
||||
|
||||
let a = leading_num_common(a);
|
||||
|
||||
// GNU numeric sort doesn't recognize '+' or 'e' notation so we strip
|
||||
|
@ -1019,9 +1088,7 @@ fn get_leading_num(a: &str) -> &str {
|
|||
|
||||
// An empty number or non-number line is to be treated as ‘0’, but only for numeric sort
|
||||
// All '0'-ed lines will be sorted later, but only amongst themselves, during the so-called 'last resort comparison.'
|
||||
if s.is_empty() {
|
||||
s = "0";
|
||||
};
|
||||
if s.is_empty() { s = "0"; };
|
||||
s
|
||||
}
|
||||
|
||||
|
@ -1087,8 +1154,8 @@ fn permissive_f64_parse(a: &str) -> f64 {
|
|||
// Remove any trailing decimals, ie 4568..890... becomes 4568.890
|
||||
// Then, we trim whitespace and parse
|
||||
match remove_trailing_dec(a).trim().parse::<f64>() {
|
||||
Ok(a) if a.is_nan() => std::f64::NEG_INFINITY,
|
||||
Ok(a) => a,
|
||||
Ok(val) if val.is_nan() => std::f64::NEG_INFINITY,
|
||||
Ok(val) => val,
|
||||
Err(_) => std::f64::NEG_INFINITY,
|
||||
}
|
||||
}
|
||||
|
@ -1107,7 +1174,6 @@ fn numeric_compare(a: &str, b: &str) -> Ordering {
|
|||
let fa = permissive_f64_parse(&ta);
|
||||
let fb = permissive_f64_parse(&tb);
|
||||
|
||||
// f64::cmp isn't implemented (due to NaN issues); implement directly instead
|
||||
if fa > fb {
|
||||
Ordering::Greater
|
||||
} else if fa < fb {
|
||||
|
@ -1150,6 +1216,7 @@ fn human_numeric_convert(a: &str) -> f64 {
|
|||
let num_part = permissive_f64_parse(&num_str);
|
||||
let suffix: f64 = match suffix.parse().unwrap_or('\0') {
|
||||
// SI Units
|
||||
'b' => 1f64,
|
||||
'K' => 1E3,
|
||||
'M' => 1E6,
|
||||
'G' => 1E9,
|
||||
|
@ -1262,6 +1329,7 @@ fn month_compare(a: &str, b: &str) -> Ordering {
|
|||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn version_parse(a: &str) -> Version {
|
||||
let result = Version::parse(a);
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue