mirror of
https://github.com/RGBCube/uutils-coreutils
synced 2025-07-28 03:27:44 +00:00
Merge pull request #2165 from miDeb/sort-optimize-line
sort: optimize the line struct
This commit is contained in:
commit
7c51fb4946
3 changed files with 82 additions and 55 deletions
9
Cargo.lock
generated
9
Cargo.lock
generated
|
@ -1445,12 +1445,6 @@ dependencies = [
|
||||||
"maybe-uninit",
|
"maybe-uninit",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "smallvec"
|
|
||||||
version = "1.6.1"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "fe0f37c9e8f3c5a4a66ad655a93c74daac4ad00c441533bf5c6e7990bb42604e"
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "strsim"
|
name = "strsim"
|
||||||
version = "0.8.0"
|
version = "0.8.0"
|
||||||
|
@ -1912,7 +1906,7 @@ dependencies = [
|
||||||
"quickcheck",
|
"quickcheck",
|
||||||
"rand 0.7.3",
|
"rand 0.7.3",
|
||||||
"rand_chacha",
|
"rand_chacha",
|
||||||
"smallvec 0.6.14",
|
"smallvec",
|
||||||
"uucore",
|
"uucore",
|
||||||
"uucore_procs",
|
"uucore_procs",
|
||||||
]
|
]
|
||||||
|
@ -2392,7 +2386,6 @@ dependencies = [
|
||||||
"rand 0.7.3",
|
"rand 0.7.3",
|
||||||
"rayon",
|
"rayon",
|
||||||
"semver",
|
"semver",
|
||||||
"smallvec 1.6.1",
|
|
||||||
"tempdir",
|
"tempdir",
|
||||||
"unicode-width",
|
"unicode-width",
|
||||||
"uucore",
|
"uucore",
|
||||||
|
|
|
@ -21,7 +21,6 @@ clap = "2.33"
|
||||||
fnv = "1.0.7"
|
fnv = "1.0.7"
|
||||||
itertools = "0.10.0"
|
itertools = "0.10.0"
|
||||||
semver = "0.9.0"
|
semver = "0.9.0"
|
||||||
smallvec = "1.6.1"
|
|
||||||
unicode-width = "0.1.8"
|
unicode-width = "0.1.8"
|
||||||
uucore = { version=">=0.0.8", package="uucore", path="../../uucore", features=["fs"] }
|
uucore = { version=">=0.0.8", package="uucore", path="../../uucore", features=["fs"] }
|
||||||
uucore_procs = { version=">=0.0.5", package="uucore_procs", path="../../uucore_procs" }
|
uucore_procs = { version=">=0.0.5", package="uucore_procs", path="../../uucore_procs" }
|
||||||
|
|
|
@ -29,7 +29,6 @@ use rand::distributions::Alphanumeric;
|
||||||
use rand::{thread_rng, Rng};
|
use rand::{thread_rng, Rng};
|
||||||
use rayon::prelude::*;
|
use rayon::prelude::*;
|
||||||
use semver::Version;
|
use semver::Version;
|
||||||
use smallvec::SmallVec;
|
|
||||||
use std::cmp::Ordering;
|
use std::cmp::Ordering;
|
||||||
use std::collections::BinaryHeap;
|
use std::collections::BinaryHeap;
|
||||||
use std::env;
|
use std::env;
|
||||||
|
@ -232,7 +231,6 @@ impl SelectionRange {
|
||||||
enum NumCache {
|
enum NumCache {
|
||||||
AsF64(GeneralF64ParseResult),
|
AsF64(GeneralF64ParseResult),
|
||||||
WithInfo(NumInfo),
|
WithInfo(NumInfo),
|
||||||
None,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl NumCache {
|
impl NumCache {
|
||||||
|
@ -253,7 +251,7 @@ impl NumCache {
|
||||||
#[derive(Clone)]
|
#[derive(Clone)]
|
||||||
struct Selection {
|
struct Selection {
|
||||||
range: SelectionRange,
|
range: SelectionRange,
|
||||||
num_cache: NumCache,
|
num_cache: Option<Box<NumCache>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Selection {
|
impl Selection {
|
||||||
|
@ -267,15 +265,17 @@ type Field = Range<usize>;
|
||||||
|
|
||||||
#[derive(Clone)]
|
#[derive(Clone)]
|
||||||
pub struct Line {
|
pub struct Line {
|
||||||
line: String,
|
line: Box<str>,
|
||||||
// The common case is not to specify fields. Let's make this fast.
|
// The common case is not to specify fields. Let's make this fast.
|
||||||
selections: SmallVec<[Selection; 1]>,
|
first_selection: Selection,
|
||||||
|
other_selections: Box<[Selection]>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Line {
|
impl Line {
|
||||||
|
/// Estimate the number of bytes that this Line is occupying
|
||||||
pub fn estimate_size(&self) -> usize {
|
pub fn estimate_size(&self) -> usize {
|
||||||
self.line.capacity()
|
self.line.len()
|
||||||
+ self.selections.capacity() * std::mem::size_of::<Selection>()
|
+ self.other_selections.len() * std::mem::size_of::<Selection>()
|
||||||
+ std::mem::size_of::<Self>()
|
+ std::mem::size_of::<Self>()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -291,35 +291,22 @@ impl Line {
|
||||||
None
|
None
|
||||||
};
|
};
|
||||||
|
|
||||||
let selections: SmallVec<[Selection; 1]> = settings
|
let mut selectors = settings.selectors.iter();
|
||||||
.selectors
|
|
||||||
.iter()
|
let first_selection = selectors
|
||||||
.map(|selector| {
|
.next()
|
||||||
let mut range =
|
.unwrap()
|
||||||
SelectionRange::new(selector.get_selection(&line, fields.as_deref()));
|
.get_selection(&line, fields.as_deref());
|
||||||
let num_cache = if selector.settings.mode == SortMode::Numeric
|
|
||||||
|| selector.settings.mode == SortMode::HumanNumeric
|
let other_selections: Vec<Selection> = selectors
|
||||||
{
|
.map(|selector| selector.get_selection(&line, fields.as_deref()))
|
||||||
let (info, num_range) = NumInfo::parse(
|
|
||||||
range.get_str(&line),
|
|
||||||
NumInfoParseSettings {
|
|
||||||
accept_si_units: selector.settings.mode == SortMode::HumanNumeric,
|
|
||||||
thousands_separator: Some(THOUSANDS_SEP),
|
|
||||||
decimal_pt: Some(DECIMAL_PT),
|
|
||||||
},
|
|
||||||
);
|
|
||||||
range.shorten(num_range);
|
|
||||||
NumCache::WithInfo(info)
|
|
||||||
} else if selector.settings.mode == SortMode::GeneralNumeric {
|
|
||||||
let str = range.get_str(&line);
|
|
||||||
NumCache::AsF64(general_f64_parse(&str[get_leading_gen(str)]))
|
|
||||||
} else {
|
|
||||||
NumCache::None
|
|
||||||
};
|
|
||||||
Selection { range, num_cache }
|
|
||||||
})
|
|
||||||
.collect();
|
.collect();
|
||||||
Self { line, selections }
|
|
||||||
|
Self {
|
||||||
|
line: line.into_boxed_str(),
|
||||||
|
first_selection,
|
||||||
|
other_selections: other_selections.into_boxed_slice(),
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Writes indicators for the selections this line matched. The original line content is NOT expected
|
/// Writes indicators for the selections this line matched. The original line content is NOT expected
|
||||||
|
@ -338,7 +325,7 @@ impl Line {
|
||||||
|
|
||||||
let fields = tokenize(&self.line, settings.separator);
|
let fields = tokenize(&self.line, settings.separator);
|
||||||
for selector in settings.selectors.iter() {
|
for selector in settings.selectors.iter() {
|
||||||
let mut selection = selector.get_selection(&self.line, Some(&fields));
|
let mut selection = selector.get_range(&self.line, Some(&fields));
|
||||||
match selector.settings.mode {
|
match selector.settings.mode {
|
||||||
SortMode::Numeric | SortMode::HumanNumeric => {
|
SortMode::Numeric | SortMode::HumanNumeric => {
|
||||||
// find out which range is used for numeric comparisons
|
// find out which range is used for numeric comparisons
|
||||||
|
@ -595,9 +582,41 @@ impl FieldSelector {
|
||||||
self.from.field != 1 || self.from.char == 0 || self.to.is_some()
|
self.from.field != 1 || self.from.char == 0 || self.to.is_some()
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Look up the slice that corresponds to this selector for the given line.
|
/// Get the selection that corresponds to this selector for the line.
|
||||||
/// If needs_fields returned false, fields may be None.
|
/// If needs_fields returned false, tokens may be None.
|
||||||
fn get_selection<'a>(&self, line: &'a str, tokens: Option<&[Field]>) -> Range<usize> {
|
fn get_selection(&self, line: &str, tokens: Option<&[Field]>) -> Selection {
|
||||||
|
let mut range = SelectionRange::new(self.get_range(&line, tokens));
|
||||||
|
let num_cache = if self.settings.mode == SortMode::Numeric
|
||||||
|
|| self.settings.mode == SortMode::HumanNumeric
|
||||||
|
{
|
||||||
|
// Parse NumInfo for this number.
|
||||||
|
let (info, num_range) = NumInfo::parse(
|
||||||
|
range.get_str(&line),
|
||||||
|
NumInfoParseSettings {
|
||||||
|
accept_si_units: self.settings.mode == SortMode::HumanNumeric,
|
||||||
|
thousands_separator: Some(THOUSANDS_SEP),
|
||||||
|
decimal_pt: Some(DECIMAL_PT),
|
||||||
|
},
|
||||||
|
);
|
||||||
|
// Shorten the range to what we need to pass to numeric_str_cmp later.
|
||||||
|
range.shorten(num_range);
|
||||||
|
Some(Box::new(NumCache::WithInfo(info)))
|
||||||
|
} else if self.settings.mode == SortMode::GeneralNumeric {
|
||||||
|
// Parse this number as f64, as this is the requirement for general numeric sorting.
|
||||||
|
let str = range.get_str(&line);
|
||||||
|
Some(Box::new(NumCache::AsF64(general_f64_parse(
|
||||||
|
&str[get_leading_gen(str)],
|
||||||
|
))))
|
||||||
|
} else {
|
||||||
|
// This is not a numeric sort, so we don't need a NumCache.
|
||||||
|
None
|
||||||
|
};
|
||||||
|
Selection { range, num_cache }
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Look up the range in the line that corresponds to this selector.
|
||||||
|
/// If needs_fields returned false, tokens may be None.
|
||||||
|
fn get_range<'a>(&self, line: &'a str, tokens: Option<&[Field]>) -> Range<usize> {
|
||||||
enum Resolution {
|
enum Resolution {
|
||||||
// The start index of the resolved character, inclusive
|
// The start index of the resolved character, inclusive
|
||||||
StartOfChar(usize),
|
StartOfChar(usize),
|
||||||
|
@ -1238,8 +1257,11 @@ fn sort_by(unsorted: &mut Vec<Line>, settings: &GlobalSettings) {
|
||||||
|
|
||||||
fn compare_by(a: &Line, b: &Line, global_settings: &GlobalSettings) -> Ordering {
|
fn compare_by(a: &Line, b: &Line, global_settings: &GlobalSettings) -> Ordering {
|
||||||
for (idx, selector) in global_settings.selectors.iter().enumerate() {
|
for (idx, selector) in global_settings.selectors.iter().enumerate() {
|
||||||
let a_selection = &a.selections[idx];
|
let (a_selection, b_selection) = if idx == 0 {
|
||||||
let b_selection = &b.selections[idx];
|
(&a.first_selection, &b.first_selection)
|
||||||
|
} else {
|
||||||
|
(&a.other_selections[idx - 1], &b.other_selections[idx - 1])
|
||||||
|
};
|
||||||
let a_str = a_selection.get_str(a);
|
let a_str = a_selection.get_str(a);
|
||||||
let b_str = b_selection.get_str(b);
|
let b_str = b_selection.get_str(b);
|
||||||
let settings = &selector.settings;
|
let settings = &selector.settings;
|
||||||
|
@ -1249,12 +1271,12 @@ fn compare_by(a: &Line, b: &Line, global_settings: &GlobalSettings) -> Ordering
|
||||||
} else {
|
} else {
|
||||||
match settings.mode {
|
match settings.mode {
|
||||||
SortMode::Numeric | SortMode::HumanNumeric => numeric_str_cmp(
|
SortMode::Numeric | SortMode::HumanNumeric => numeric_str_cmp(
|
||||||
(a_str, a_selection.num_cache.as_num_info()),
|
(a_str, a_selection.num_cache.as_ref().unwrap().as_num_info()),
|
||||||
(b_str, b_selection.num_cache.as_num_info()),
|
(b_str, b_selection.num_cache.as_ref().unwrap().as_num_info()),
|
||||||
),
|
),
|
||||||
SortMode::GeneralNumeric => general_numeric_compare(
|
SortMode::GeneralNumeric => general_numeric_compare(
|
||||||
a_selection.num_cache.as_f64(),
|
a_selection.num_cache.as_ref().unwrap().as_f64(),
|
||||||
b_selection.num_cache.as_f64(),
|
b_selection.num_cache.as_ref().unwrap().as_f64(),
|
||||||
),
|
),
|
||||||
SortMode::Month => month_compare(a_str, b_str),
|
SortMode::Month => month_compare(a_str, b_str),
|
||||||
SortMode::Version => version_compare(a_str, b_str),
|
SortMode::Version => version_compare(a_str, b_str),
|
||||||
|
@ -1340,7 +1362,8 @@ enum GeneralF64ParseResult {
|
||||||
Infinity,
|
Infinity,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Parse the beginning string into an f64, returning -inf instead of NaN on errors.
|
/// Parse the beginning string into a GeneralF64ParseResult.
|
||||||
|
/// Using a GeneralF64ParseResult instead of f64 is necessary to correctly order floats.
|
||||||
#[inline(always)]
|
#[inline(always)]
|
||||||
fn general_f64_parse(a: &str) -> GeneralF64ParseResult {
|
fn general_f64_parse(a: &str) -> GeneralF64ParseResult {
|
||||||
// The actual behavior here relies on Rust's implementation of parsing floating points.
|
// The actual behavior here relies on Rust's implementation of parsing floating points.
|
||||||
|
@ -1593,4 +1616,16 @@ mod tests {
|
||||||
let line = "..a..a";
|
let line = "..a..a";
|
||||||
assert_eq!(tokenize(line, Some('a')), vec![0..2, 3..5]);
|
assert_eq!(tokenize(line, Some('a')), vec![0..2, 3..5]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
#[cfg(target_pointer_width = "64")]
|
||||||
|
fn test_line_size() {
|
||||||
|
// We should make sure to not regress the size of the Line struct because
|
||||||
|
// it is unconditional overhead for every line we sort.
|
||||||
|
assert_eq!(std::mem::size_of::<Line>(), 56);
|
||||||
|
// These are the fields of Line:
|
||||||
|
assert_eq!(std::mem::size_of::<Box<str>>(), 16);
|
||||||
|
assert_eq!(std::mem::size_of::<Selection>(), 24);
|
||||||
|
assert_eq!(std::mem::size_of::<Box<[Selection]>>(), 16);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue