1
Fork 0
mirror of https://github.com/RGBCube/uutils-coreutils synced 2025-07-31 04:57:45 +00:00
This commit is contained in:
electricboogie 2021-04-12 14:24:22 -05:00
parent c6021e10c2
commit e6c195a675
3 changed files with 223 additions and 18 deletions

139
Cargo.lock generated
View file

@ -119,12 +119,40 @@ version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "560c32574a12a89ecd91f5e742165893f86e3ab98d21f8ea548658eb9eef5f40"
[[package]]
name = "bytecount"
version = "0.6.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "72feb31ffc86498dacdbd0fcebb56138e7177a8cc5cea4516031d15ae85a742e"
[[package]]
name = "byteorder"
version = "1.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610"
[[package]]
name = "cargo-platform"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0226944a63d1bf35a3b5f948dd7c59e263db83695c9e8bffc4037de02e30f1d7"
dependencies = [
"serde",
]
[[package]]
name = "cargo_metadata"
version = "0.12.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7714a157da7991e23d90686b9524b9e12e0407a108647f52e9328f4b3d51ac7f"
dependencies = [
"cargo-platform",
"semver 0.11.0",
"semver-parser 0.10.2",
"serde",
"serde_json",
]
[[package]]
name = "cast"
version = "0.2.3"
@ -560,6 +588,26 @@ dependencies = [
"regex",
]
[[package]]
name = "error-chain"
version = "0.12.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2d2f06b9cac1506ece98fe3231e3cc9c4410ec3d5b1f24ae1c8946f0742cdefc"
dependencies = [
"version_check",
]
[[package]]
name = "extsort"
version = "0.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ffc5bb6fbca3c5ce6a51f6857eab8c35c898b2fbcb62ff1b728243dd19ec0c9f"
dependencies = [
"rayon",
"skeptic",
"tempfile",
]
[[package]]
name = "fake-simd"
version = "0.1.2"
@ -944,6 +992,15 @@ dependencies = [
"proc-macro-hack",
]
[[package]]
name = "pest"
version = "2.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "10f4872ae94d7b90ae48754df22fd42ad52ce740b8f370b03da4835417403e53"
dependencies = [
"ucd-trie",
]
[[package]]
name = "pkg-config"
version = "0.3.19"
@ -1009,6 +1066,17 @@ dependencies = [
"unicode-xid 0.2.1",
]
[[package]]
name = "pulldown-cmark"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ffade02495f22453cd593159ea2f59827aae7f53fa8323f756799b670881dcf8"
dependencies = [
"bitflags",
"memchr 2.3.4",
"unicase",
]
[[package]]
name = "quick-error"
version = "1.2.3"
@ -1245,7 +1313,7 @@ version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "138e3e0acb6c9fb258b19b67cb8abd63c00679d2851805ea151465464fe9030a"
dependencies = [
"semver",
"semver 0.9.0",
]
[[package]]
@ -1275,7 +1343,17 @@ version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1d7eb9ef2c18661902cc47e535f9bc51b78acd254da71d375c2f6720d9a40403"
dependencies = [
"semver-parser",
"semver-parser 0.7.0",
]
[[package]]
name = "semver"
version = "0.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f301af10236f6df4160f7c3f04eec6dbc70ace82d23326abad5edee88801c6b6"
dependencies = [
"semver-parser 0.10.2",
"serde",
]
[[package]]
@ -1284,11 +1362,23 @@ version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3"
[[package]]
name = "semver-parser"
version = "0.10.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "00b0bef5b7f9e0df16536d3961cfb6e84331c065b4066afb39768d0e319411f7"
dependencies = [
"pest",
]
[[package]]
name = "serde"
version = "1.0.125"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "558dc50e1a5a5fa7112ca2ce4effcb321b0300c0d4ccf0776a9f60cd89031171"
dependencies = [
"serde_derive",
]
[[package]]
name = "serde_cbor"
@ -1353,6 +1443,21 @@ dependencies = [
"generic-array",
]
[[package]]
name = "skeptic"
version = "0.13.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "188b810342d98f23f0bb875045299f34187b559370b041eb11520c905370a888"
dependencies = [
"bytecount",
"cargo_metadata",
"error-chain",
"glob 0.3.0",
"pulldown-cmark",
"tempfile",
"walkdir",
]
[[package]]
name = "smallvec"
version = "0.6.14"
@ -1367,6 +1472,9 @@ name = "smallvec"
version = "1.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fe0f37c9e8f3c5a4a66ad655a93c74daac4ad00c441533bf5c6e7990bb42604e"
dependencies = [
"serde",
]
[[package]]
name = "strsim"
@ -1528,6 +1636,21 @@ version = "1.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "879f6906492a7cd215bfa4cf595b600146ccfac0c79bcbd1f3000162af5e8b06"
[[package]]
name = "ucd-trie"
version = "0.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "56dee185309b50d1f11bfedef0fe6d036842e3fb77413abef29f8f8d1c5d4c1c"
[[package]]
name = "unicase"
version = "2.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "50f37be617794602aabbeee0be4f259dc1778fabe05e2d67ee8f79326d5cb4f6"
dependencies = [
"version_check",
]
[[package]]
name = "unicode-segmentation"
version = "1.7.1"
@ -2289,12 +2412,16 @@ dependencies = [
name = "uu_sort"
version = "0.0.6"
dependencies = [
"byteorder",
"clap",
"extsort",
"fnv",
"itertools 0.10.0",
"rand 0.7.3",
"rayon",
"semver",
"semver 0.9.0",
"serde",
"serde_json",
"smallvec 1.6.1",
"uucore",
"uucore_procs",
@ -2604,6 +2731,12 @@ version = "0.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191"
[[package]]
name = "version_check"
version = "0.9.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5fecdca9a5291cc2b8dcf7dc02453fee791a280f3743cb0905f8822ae463b3fe"
[[package]]
name = "void"
version = "1.0.2"

View file

@ -15,13 +15,17 @@ edition = "2018"
path = "src/sort.rs"
[dependencies]
byteorder = "1.4.3"
extsort = "0.4.2"
serde_json = { version = "1.0", default-features = false, features = ["alloc"] }
serde = { version = "1.0", features = ["derive"] }
rayon = "1.5"
rand = "0.7"
clap = "2.33"
fnv = "1.0.7"
itertools = "0.10.0"
semver = "0.9.0"
smallvec = "1.6.1"
smallvec = { version = "1.6.1", features = ["serde"] }
uucore = { version=">=0.0.8", package="uucore", path="../../uucore", features=["fs"] }
uucore_procs = { version=">=0.0.5", package="uucore_procs", path="../../uucore_procs" }

View file

@ -20,7 +20,6 @@ use fnv::FnvHasher;
use itertools::Itertools;
use rand::distributions::Alphanumeric;
use rand::{thread_rng, Rng};
use rayon::prelude::*;
use semver::Version;
use smallvec::SmallVec;
use std::borrow::Cow;
@ -34,6 +33,14 @@ use std::mem::replace;
use std::ops::{Range, RangeInclusive};
use std::path::Path;
use uucore::fs::is_stdin_interactive; // for Iterator::dedup()
use extsort::*;
use std::str;
use serde::{Serialize, Deserialize};
use std::ffi::OsString;
use std::usize;
use std::path::PathBuf;
use std::string::*;
use serde_json::Result;
static NAME: &str = "sort";
static ABOUT: &str = "Display sorted concatenation of all FILE(s).";
@ -72,6 +79,8 @@ static OPT_RANDOM: &str = "random-sort";
static OPT_ZERO_TERMINATED: &str = "zero-terminated";
static OPT_PARALLEL: &str = "parallel";
static OPT_FILES0_FROM: &str = "files0-from";
static OPT_BUF_SIZE: &str = "buffer-size";
static OPT_TMP_DIR: &str = "temporary-directory";
static ARG_FILES: &str = "files";
@ -110,6 +119,8 @@ struct GlobalSettings {
separator: Option<char>,
threads: String,
zero_terminated: bool,
buffer_size: usize,
tmp_dir: PathBuf,
}
impl Default for GlobalSettings {
@ -133,6 +144,8 @@ impl Default for GlobalSettings {
separator: None,
threads: String::new(),
zero_terminated: false,
buffer_size: 10000000usize,
tmp_dir: PathBuf::from(r"/tmp"),
}
}
}
@ -162,7 +175,7 @@ impl From<&GlobalSettings> for KeySettings {
}
/// Represents the string selected by a FieldSelector.
#[derive(Debug)]
#[derive(Debug, Serialize, Deserialize, Clone)]
enum Selection {
/// If we had to transform this selection, we have to store a new string.
String(String),
@ -182,13 +195,29 @@ impl Selection {
type Field = Range<usize>;
#[derive(Debug)]
#[derive(Serialize, Deserialize, Debug, Clone)]
/// One line of input (`line`) together with the `Selection`s stored alongside it.
struct Line {
line: String,
// The common case is not to specify fields. Let's make this fast.
// NOTE(review): the inline capacity of 1 presumably keeps the single-selection
// case off the heap — confirm against how `Line::new` fills this.
selections: SmallVec<[Selection; 1]>,
}
impl Sortable for Line {
    /// Serializes this line as a single JSON document followed by a `\n`
    /// record terminator, and writes it to `write`.
    ///
    /// # Panics
    ///
    /// Panics if serialization or the write fails; this signature offers no
    /// way to report an error to the caller.
    fn encode<W: Write>(&self, write: &mut W) {
        // Serialize `self` directly instead of cloning every field into a
        // temporary `Line` first.
        let mut serialized =
            serde_json::to_string(self).expect("failed to serialize line for external sort");
        // JSON string escaping never leaves a raw newline inside the document,
        // so a bare '\n' can only be the record terminator.
        serialized.push('\n');
        write
            .write_all(serialized.as_bytes())
            .expect("failed to write line to external sort segment");
    }

    /// Reads exactly one `\n`-terminated JSON record produced by `encode`
    /// and deserializes it.
    ///
    /// Returns `None` when no bytes are left, when reading fails, or when the
    /// record is not a valid serialized `Line`.
    fn decode<R: Read>(read: &mut R) -> Option<Line> {
        // Consume only up to the terminator so the reader stays positioned at
        // the start of the following record.
        // NOTE(review): assumes the sorter calls `decode` once per record on
        // the same reader, and that the reader is buffered (this reads one
        // byte at a time) — confirm against the extsort implementation.
        let mut record = Vec::new();
        for byte in read.by_ref().bytes() {
            match byte.ok()? {
                b'\n' => break,
                other => record.push(other),
            }
        }
        if record.is_empty() {
            return None;
        }
        serde_json::from_slice(&record).ok()
    }
}
impl Line {
fn new(line: String, settings: &GlobalSettings) -> Self {
let fields = if settings
@ -681,6 +710,20 @@ pub fn uumain(args: impl uucore::Args) -> i32 {
.takes_value(true)
.value_name("NUM_THREADS"),
)
.arg(
Arg::with_name(OPT_BUF_SIZE)
.long(OPT_BUF_SIZE)
.help("sets the maximum SIZE of each segment in number of sorted items")
.takes_value(true)
.value_name("SIZE"),
)
.arg(
Arg::with_name(OPT_TMP_DIR)
.long(OPT_TMP_DIR)
.help("use DIR for temporaries, not $TMPDIR or /tmp")
.takes_value(true)
.value_name("DIR"),
)
.arg(
Arg::with_name(OPT_FILES0_FROM)
.long(OPT_FILES0_FROM)
@ -744,6 +787,32 @@ pub fn uumain(args: impl uucore::Args) -> i32 {
env::set_var("RAYON_NUM_THREADS", &settings.threads);
}
if matches.is_present(OPT_BUF_SIZE) {
// 10000 is the default extsort buffer, but it's too small
settings.buffer_size = matches
.value_of(OPT_BUF_SIZE)
.map(String::from)
.unwrap_or( format! ( "{}", 10000000usize ) )
.parse::<usize>()
.unwrap_or(10000000usize);
}
if matches.is_present(OPT_TMP_DIR) {
let result = matches
.value_of(OPT_TMP_DIR)
.map(String::from)
.unwrap_or("/tmp".to_owned() );
settings.tmp_dir = PathBuf::from(format!(r"{}", result));
} else {
for (key, value) in env::vars_os() {
if key == OsString::from("TMPDIR") {
settings.tmp_dir = PathBuf::from(format!(r"{}", value.into_string().unwrap_or("/tmp".to_owned())));
break
}
settings.tmp_dir = PathBuf::from(r"/tmp");
}
}
settings.zero_terminated = matches.is_present(OPT_ZERO_TERMINATED);
settings.merge = matches.is_present(OPT_MERGE);
@ -860,9 +929,9 @@ fn exec(files: Vec<String>, settings: &GlobalSettings) -> i32 {
if settings.check {
return exec_check_file(&lines, &settings);
} else {
sort_by(&mut lines, &settings);
}
lines = sort_by(lines, &settings);
if settings.merge {
if settings.unique {
@ -917,8 +986,9 @@ fn exec_check_file(unwrapped_lines: &[Line], settings: &GlobalSettings) -> i32 {
}
}
fn sort_by(lines: &mut Vec<Line>, settings: &GlobalSettings) {
lines.par_sort_by(|a, b| compare_by(a, b, &settings))
/// Sorts `lines` through an `ExternalSorter`, ordering them with `compare_by`.
///
/// The sorter uses `settings.buffer_size` as its segment size and
/// `settings.tmp_dir` as its sort directory, with parallel sort enabled.
///
/// # Panics
///
/// Panics if the sorter reports an error.
fn sort_by(lines: Vec<Line>, settings: &GlobalSettings) -> Vec<Line> {
    let external_sorter = ExternalSorter::new()
        .with_segment_size(settings.buffer_size)
        .with_sort_dir(settings.tmp_dir.clone())
        .with_parallel_sort();
    let sorted = external_sorter
        .sort_by(lines.into_iter(), |a, b| compare_by(a, b, settings))
        .unwrap();
    sorted.collect()
}
fn compare_by(a: &Line, b: &Line, global_settings: &GlobalSettings) -> Ordering {
@ -1004,7 +1074,6 @@ fn leading_num_common(a: &str) -> &str {
// not recognize a positive sign or scientific/E notation so we strip those elements here.
fn get_leading_num(a: &str) -> &str {
let mut s = "";
let a = leading_num_common(a);
// GNU numeric sort doesn't recognize '+' or 'e' notation so we strip
@ -1019,9 +1088,7 @@ fn get_leading_num(a: &str) -> &str {
// And empty number or non-number lines are to be treated as 0 but only for numeric sort
// All '0'-ed lines will be sorted later, but only amongst themselves, during the so-called 'last resort comparison.'
if s.is_empty() {
s = "0";
};
if s.is_empty() { s = "0"; };
s
}
@ -1087,8 +1154,8 @@ fn permissive_f64_parse(a: &str) -> f64 {
// Remove any trailing decimals, ie 4568..890... becomes 4568.890
// Then, we trim whitespace and parse
match remove_trailing_dec(a).trim().parse::<f64>() {
Ok(a) if a.is_nan() => std::f64::NEG_INFINITY,
Ok(a) => a,
Ok(val) if val.is_nan() => std::f64::NEG_INFINITY,
Ok(val) => val,
Err(_) => std::f64::NEG_INFINITY,
}
}
@ -1107,7 +1174,6 @@ fn numeric_compare(a: &str, b: &str) -> Ordering {
let fa = permissive_f64_parse(&ta);
let fb = permissive_f64_parse(&tb);
// f64::cmp isn't implemented (due to NaN issues); implement directly instead
if fa > fb {
Ordering::Greater
} else if fa < fb {
@ -1150,6 +1216,7 @@ fn human_numeric_convert(a: &str) -> f64 {
let num_part = permissive_f64_parse(&num_str);
let suffix: f64 = match suffix.parse().unwrap_or('\0') {
// SI Units
'b' => 1f64,
'K' => 1E3,
'M' => 1E6,
'G' => 1E9,
@ -1262,6 +1329,7 @@ fn month_compare(a: &str, b: &str) -> Ordering {
}
}
#[inline(always)]
fn version_parse(a: &str) -> Version {
let result = Version::parse(a);