1
Fork 0
mirror of https://github.com/RGBCube/uutils-coreutils synced 2025-07-28 03:27:44 +00:00

Merge pull request #2165 from miDeb/sort-optimize-line

sort: optimize the line struct
This commit is contained in:
Sylvestre Ledru 2021-05-09 18:41:39 +02:00 committed by GitHub
commit 7c51fb4946
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 82 additions and 55 deletions

9
Cargo.lock generated
View file

@ -1445,12 +1445,6 @@ dependencies = [
"maybe-uninit", "maybe-uninit",
] ]
[[package]]
name = "smallvec"
version = "1.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fe0f37c9e8f3c5a4a66ad655a93c74daac4ad00c441533bf5c6e7990bb42604e"
[[package]] [[package]]
name = "strsim" name = "strsim"
version = "0.8.0" version = "0.8.0"
@ -1912,7 +1906,7 @@ dependencies = [
"quickcheck", "quickcheck",
"rand 0.7.3", "rand 0.7.3",
"rand_chacha", "rand_chacha",
"smallvec 0.6.14", "smallvec",
"uucore", "uucore",
"uucore_procs", "uucore_procs",
] ]
@ -2392,7 +2386,6 @@ dependencies = [
"rand 0.7.3", "rand 0.7.3",
"rayon", "rayon",
"semver", "semver",
"smallvec 1.6.1",
"tempdir", "tempdir",
"unicode-width", "unicode-width",
"uucore", "uucore",

View file

@ -21,7 +21,6 @@ clap = "2.33"
fnv = "1.0.7" fnv = "1.0.7"
itertools = "0.10.0" itertools = "0.10.0"
semver = "0.9.0" semver = "0.9.0"
smallvec = "1.6.1"
unicode-width = "0.1.8" unicode-width = "0.1.8"
uucore = { version=">=0.0.8", package="uucore", path="../../uucore", features=["fs"] } uucore = { version=">=0.0.8", package="uucore", path="../../uucore", features=["fs"] }
uucore_procs = { version=">=0.0.5", package="uucore_procs", path="../../uucore_procs" } uucore_procs = { version=">=0.0.5", package="uucore_procs", path="../../uucore_procs" }

View file

@ -29,7 +29,6 @@ use rand::distributions::Alphanumeric;
use rand::{thread_rng, Rng}; use rand::{thread_rng, Rng};
use rayon::prelude::*; use rayon::prelude::*;
use semver::Version; use semver::Version;
use smallvec::SmallVec;
use std::cmp::Ordering; use std::cmp::Ordering;
use std::collections::BinaryHeap; use std::collections::BinaryHeap;
use std::env; use std::env;
@ -232,7 +231,6 @@ impl SelectionRange {
enum NumCache { enum NumCache {
AsF64(GeneralF64ParseResult), AsF64(GeneralF64ParseResult),
WithInfo(NumInfo), WithInfo(NumInfo),
None,
} }
impl NumCache { impl NumCache {
@ -253,7 +251,7 @@ impl NumCache {
#[derive(Clone)] #[derive(Clone)]
struct Selection { struct Selection {
range: SelectionRange, range: SelectionRange,
num_cache: NumCache, num_cache: Option<Box<NumCache>>,
} }
impl Selection { impl Selection {
@ -267,15 +265,17 @@ type Field = Range<usize>;
#[derive(Clone)] #[derive(Clone)]
pub struct Line { pub struct Line {
line: String, line: Box<str>,
// The common case is not to specify fields. Let's make this fast. // The common case is not to specify fields. Let's make this fast.
selections: SmallVec<[Selection; 1]>, first_selection: Selection,
other_selections: Box<[Selection]>,
} }
impl Line { impl Line {
/// Estimate the number of bytes that this Line is occupying
pub fn estimate_size(&self) -> usize { pub fn estimate_size(&self) -> usize {
self.line.capacity() self.line.len()
+ self.selections.capacity() * std::mem::size_of::<Selection>() + self.other_selections.len() * std::mem::size_of::<Selection>()
+ std::mem::size_of::<Self>() + std::mem::size_of::<Self>()
} }
@ -291,35 +291,22 @@ impl Line {
None None
}; };
let selections: SmallVec<[Selection; 1]> = settings let mut selectors = settings.selectors.iter();
.selectors
.iter() let first_selection = selectors
.map(|selector| { .next()
let mut range = .unwrap()
SelectionRange::new(selector.get_selection(&line, fields.as_deref())); .get_selection(&line, fields.as_deref());
let num_cache = if selector.settings.mode == SortMode::Numeric
|| selector.settings.mode == SortMode::HumanNumeric let other_selections: Vec<Selection> = selectors
{ .map(|selector| selector.get_selection(&line, fields.as_deref()))
let (info, num_range) = NumInfo::parse(
range.get_str(&line),
NumInfoParseSettings {
accept_si_units: selector.settings.mode == SortMode::HumanNumeric,
thousands_separator: Some(THOUSANDS_SEP),
decimal_pt: Some(DECIMAL_PT),
},
);
range.shorten(num_range);
NumCache::WithInfo(info)
} else if selector.settings.mode == SortMode::GeneralNumeric {
let str = range.get_str(&line);
NumCache::AsF64(general_f64_parse(&str[get_leading_gen(str)]))
} else {
NumCache::None
};
Selection { range, num_cache }
})
.collect(); .collect();
Self { line, selections }
Self {
line: line.into_boxed_str(),
first_selection,
other_selections: other_selections.into_boxed_slice(),
}
} }
/// Writes indicators for the selections this line matched. The original line content is NOT expected /// Writes indicators for the selections this line matched. The original line content is NOT expected
@ -338,7 +325,7 @@ impl Line {
let fields = tokenize(&self.line, settings.separator); let fields = tokenize(&self.line, settings.separator);
for selector in settings.selectors.iter() { for selector in settings.selectors.iter() {
let mut selection = selector.get_selection(&self.line, Some(&fields)); let mut selection = selector.get_range(&self.line, Some(&fields));
match selector.settings.mode { match selector.settings.mode {
SortMode::Numeric | SortMode::HumanNumeric => { SortMode::Numeric | SortMode::HumanNumeric => {
// find out which range is used for numeric comparisons // find out which range is used for numeric comparisons
@ -595,9 +582,41 @@ impl FieldSelector {
self.from.field != 1 || self.from.char == 0 || self.to.is_some() self.from.field != 1 || self.from.char == 0 || self.to.is_some()
} }
/// Look up the slice that corresponds to this selector for the given line. /// Get the selection that corresponds to this selector for the line.
/// If needs_fields returned false, fields may be None. /// If needs_fields returned false, tokens may be None.
fn get_selection<'a>(&self, line: &'a str, tokens: Option<&[Field]>) -> Range<usize> { fn get_selection(&self, line: &str, tokens: Option<&[Field]>) -> Selection {
let mut range = SelectionRange::new(self.get_range(&line, tokens));
let num_cache = if self.settings.mode == SortMode::Numeric
|| self.settings.mode == SortMode::HumanNumeric
{
// Parse NumInfo for this number.
let (info, num_range) = NumInfo::parse(
range.get_str(&line),
NumInfoParseSettings {
accept_si_units: self.settings.mode == SortMode::HumanNumeric,
thousands_separator: Some(THOUSANDS_SEP),
decimal_pt: Some(DECIMAL_PT),
},
);
// Shorten the range to what we need to pass to numeric_str_cmp later.
range.shorten(num_range);
Some(Box::new(NumCache::WithInfo(info)))
} else if self.settings.mode == SortMode::GeneralNumeric {
// Parse this number as f64, as this is the requirement for general numeric sorting.
let str = range.get_str(&line);
Some(Box::new(NumCache::AsF64(general_f64_parse(
&str[get_leading_gen(str)],
))))
} else {
// This is not a numeric sort, so we don't need a NumCache.
None
};
Selection { range, num_cache }
}
/// Look up the range in the line that corresponds to this selector.
/// If needs_fields returned false, tokens may be None.
fn get_range<'a>(&self, line: &'a str, tokens: Option<&[Field]>) -> Range<usize> {
enum Resolution { enum Resolution {
// The start index of the resolved character, inclusive // The start index of the resolved character, inclusive
StartOfChar(usize), StartOfChar(usize),
@ -1238,8 +1257,11 @@ fn sort_by(unsorted: &mut Vec<Line>, settings: &GlobalSettings) {
fn compare_by(a: &Line, b: &Line, global_settings: &GlobalSettings) -> Ordering { fn compare_by(a: &Line, b: &Line, global_settings: &GlobalSettings) -> Ordering {
for (idx, selector) in global_settings.selectors.iter().enumerate() { for (idx, selector) in global_settings.selectors.iter().enumerate() {
let a_selection = &a.selections[idx]; let (a_selection, b_selection) = if idx == 0 {
let b_selection = &b.selections[idx]; (&a.first_selection, &b.first_selection)
} else {
(&a.other_selections[idx - 1], &b.other_selections[idx - 1])
};
let a_str = a_selection.get_str(a); let a_str = a_selection.get_str(a);
let b_str = b_selection.get_str(b); let b_str = b_selection.get_str(b);
let settings = &selector.settings; let settings = &selector.settings;
@ -1249,12 +1271,12 @@ fn compare_by(a: &Line, b: &Line, global_settings: &GlobalSettings) -> Ordering
} else { } else {
match settings.mode { match settings.mode {
SortMode::Numeric | SortMode::HumanNumeric => numeric_str_cmp( SortMode::Numeric | SortMode::HumanNumeric => numeric_str_cmp(
(a_str, a_selection.num_cache.as_num_info()), (a_str, a_selection.num_cache.as_ref().unwrap().as_num_info()),
(b_str, b_selection.num_cache.as_num_info()), (b_str, b_selection.num_cache.as_ref().unwrap().as_num_info()),
), ),
SortMode::GeneralNumeric => general_numeric_compare( SortMode::GeneralNumeric => general_numeric_compare(
a_selection.num_cache.as_f64(), a_selection.num_cache.as_ref().unwrap().as_f64(),
b_selection.num_cache.as_f64(), b_selection.num_cache.as_ref().unwrap().as_f64(),
), ),
SortMode::Month => month_compare(a_str, b_str), SortMode::Month => month_compare(a_str, b_str),
SortMode::Version => version_compare(a_str, b_str), SortMode::Version => version_compare(a_str, b_str),
@ -1340,7 +1362,8 @@ enum GeneralF64ParseResult {
Infinity, Infinity,
} }
/// Parse the beginning string into an f64, returning -inf instead of NaN on errors. /// Parse the beginning string into a GeneralF64ParseResult.
/// Using a GeneralF64ParseResult instead of f64 is necessary to correctly order floats.
#[inline(always)] #[inline(always)]
fn general_f64_parse(a: &str) -> GeneralF64ParseResult { fn general_f64_parse(a: &str) -> GeneralF64ParseResult {
// The actual behavior here relies on Rust's implementation of parsing floating points. // The actual behavior here relies on Rust's implementation of parsing floating points.
@ -1593,4 +1616,16 @@ mod tests {
let line = "..a..a"; let line = "..a..a";
assert_eq!(tokenize(line, Some('a')), vec![0..2, 3..5]); assert_eq!(tokenize(line, Some('a')), vec![0..2, 3..5]);
} }
#[test]
#[cfg(target_pointer_width = "64")]
fn test_line_size() {
// We should make sure to not regress the size of the Line struct because
// it is unconditional overhead for every line we sort.
assert_eq!(std::mem::size_of::<Line>(), 56);
// These are the fields of Line:
assert_eq!(std::mem::size_of::<Box<str>>(), 16);
assert_eq!(std::mem::size_of::<Selection>(), 24);
assert_eq!(std::mem::size_of::<Box<[Selection]>>(), 16);
}
} }