From c38373946a6a24afa02050a9dc41d88bc2afcdd7 Mon Sep 17 00:00:00 2001 From: Michael Debertol Date: Fri, 7 May 2021 21:49:44 +0200 Subject: [PATCH 1/3] sort: optimize the Line struct --- Cargo.lock | 9 +-- src/uu/sort/Cargo.toml | 1 - src/uu/sort/src/sort.rs | 118 +++++++++++++++++++++++++--------------- 3 files changed, 74 insertions(+), 54 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 2362342d4..13441d4fe 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1445,12 +1445,6 @@ dependencies = [ "maybe-uninit", ] -[[package]] -name = "smallvec" -version = "1.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fe0f37c9e8f3c5a4a66ad655a93c74daac4ad00c441533bf5c6e7990bb42604e" - [[package]] name = "strsim" version = "0.8.0" @@ -1912,7 +1906,7 @@ dependencies = [ "quickcheck", "rand 0.7.3", "rand_chacha", - "smallvec 0.6.14", + "smallvec", "uucore", "uucore_procs", ] @@ -2392,7 +2386,6 @@ dependencies = [ "rand 0.7.3", "rayon", "semver", - "smallvec 1.6.1", "tempdir", "unicode-width", "uucore", diff --git a/src/uu/sort/Cargo.toml b/src/uu/sort/Cargo.toml index 3784ccbb0..5221f1f4e 100644 --- a/src/uu/sort/Cargo.toml +++ b/src/uu/sort/Cargo.toml @@ -21,7 +21,6 @@ clap = "2.33" fnv = "1.0.7" itertools = "0.10.0" semver = "0.9.0" -smallvec = "1.6.1" unicode-width = "0.1.8" uucore = { version=">=0.0.8", package="uucore", path="../../uucore", features=["fs"] } uucore_procs = { version=">=0.0.5", package="uucore_procs", path="../../uucore_procs" } diff --git a/src/uu/sort/src/sort.rs b/src/uu/sort/src/sort.rs index d8978cb2b..71d912f33 100644 --- a/src/uu/sort/src/sort.rs +++ b/src/uu/sort/src/sort.rs @@ -20,8 +20,8 @@ mod external_sort; mod numeric_str_cmp; use clap::{App, Arg}; -use external_sort::ext_sort; use custom_str_cmp::custom_str_cmp; +use external_sort::ext_sort; use fnv::FnvHasher; use itertools::Itertools; use numeric_str_cmp::{numeric_str_cmp, NumInfo, NumInfoParseSettings}; @@ -29,7 +29,6 @@ use rand::distributions::Alphanumeric; use rand::{thread_rng, Rng}; use rayon::prelude::*; use semver::Version; -use smallvec::SmallVec; use std::cmp::Ordering; use std::collections::BinaryHeap; use std::env; @@ -231,7 +230,6 @@ impl SelectionRange { enum NumCache { AsF64(GeneralF64ParseResult), WithInfo(NumInfo), - None, } impl NumCache { @@ -252,7 +250,7 @@ impl NumCache { #[derive(Clone)] struct Selection { range: SelectionRange, - num_cache: NumCache, + num_cache: Option>, } impl Selection { @@ -266,15 +264,17 @@ type Field = Range; #[derive(Clone)] pub struct Line { - line: String, + line: Box, // The common case is not to specify fields. Let's make this fast. - selections: SmallVec<[Selection; 1]>, + first_selection: Selection, + other_selections: Box<[Selection]>, } impl Line { + /// Estimate the number of bytes that this Line is occupying pub fn estimate_size(&self) -> usize { - self.line.capacity() - + self.selections.capacity() * std::mem::size_of::() + self.line.len() + + self.other_selections.len() * std::mem::size_of::() + std::mem::size_of::() } @@ -290,35 +290,22 @@ impl Line { None }; - let selections: SmallVec<[Selection; 1]> = settings - .selectors - .iter() - .map(|selector| { - let mut range = - SelectionRange::new(selector.get_selection(&line, fields.as_deref())); - let num_cache = if selector.settings.mode == SortMode::Numeric - || selector.settings.mode == SortMode::HumanNumeric - { - let (info, num_range) = NumInfo::parse( - range.get_str(&line), - NumInfoParseSettings { - accept_si_units: selector.settings.mode == SortMode::HumanNumeric, - thousands_separator: Some(THOUSANDS_SEP), - decimal_pt: Some(DECIMAL_PT), - }, - ); - range.shorten(num_range); - NumCache::WithInfo(info) - } else if selector.settings.mode == SortMode::GeneralNumeric { - let str = range.get_str(&line); - NumCache::AsF64(general_f64_parse(&str[get_leading_gen(str)])) - } else { - NumCache::None - }; - Selection { range, num_cache } - }) + let mut selectors = settings.selectors.iter(); + + let first_selection = selectors + .next() + .unwrap() + .get_selection(&line, fields.as_deref()); + + let other_selections: Vec = selectors + .map(|selector| selector.get_selection(&line, fields.as_deref())) .collect(); - Self { line, selections } + + Self { + line: line.into_boxed_str(), + first_selection, + other_selections: other_selections.into_boxed_slice(), + } } /// Writes indicators for the selections this line matched. The original line content is NOT expected @@ -337,7 +324,7 @@ impl Line { let fields = tokenize(&self.line, settings.separator); for selector in settings.selectors.iter() { - let mut selection = selector.get_selection(&self.line, Some(&fields)); + let mut selection = selector.get_range(&self.line, Some(&fields)); match selector.settings.mode { SortMode::Numeric | SortMode::HumanNumeric => { // find out which range is used for numeric comparisons @@ -594,9 +581,35 @@ impl FieldSelector { self.from.field != 1 || self.from.char == 0 || self.to.is_some() } + fn get_selection(&self, line: &str, fields: Option<&[Field]>) -> Selection { + let mut range = SelectionRange::new(self.get_range(&line, fields)); + let num_cache = if self.settings.mode == SortMode::Numeric + || self.settings.mode == SortMode::HumanNumeric + { + let (info, num_range) = NumInfo::parse( + range.get_str(&line), + NumInfoParseSettings { + accept_si_units: self.settings.mode == SortMode::HumanNumeric, + thousands_separator: Some(THOUSANDS_SEP), + decimal_pt: Some(DECIMAL_PT), + }, + ); + range.shorten(num_range); + Some(Box::new(NumCache::WithInfo(info))) + } else if self.settings.mode == SortMode::GeneralNumeric { + let str = range.get_str(&line); + Some(Box::new(NumCache::AsF64(general_f64_parse( + &str[get_leading_gen(str)], + )))) + } else { + None + }; + Selection { range, num_cache } + } + /// Look up the slice that corresponds to this selector for the given line. - /// If needs_fields returned false, fields may be None. - fn get_selection<'a>(&self, line: &'a str, tokens: Option<&[Field]>) -> Range { + /// If needs_fields returned false, tokens may be None. + fn get_range<'a>(&self, line: &'a str, tokens: Option<&[Field]>) -> Range { enum Resolution { // The start index of the resolved character, inclusive StartOfChar(usize), @@ -1237,8 +1250,11 @@ fn sort_by(unsorted: &mut Vec, settings: &GlobalSettings) { fn compare_by(a: &Line, b: &Line, global_settings: &GlobalSettings) -> Ordering { for (idx, selector) in global_settings.selectors.iter().enumerate() { - let a_selection = &a.selections[idx]; - let b_selection = &b.selections[idx]; + let (a_selection, b_selection) = if idx == 0 { + (&a.first_selection, &b.first_selection) + } else { + (&a.other_selections[idx - 1], &b.other_selections[idx - 1]) + }; let a_str = a_selection.get_str(a); let b_str = b_selection.get_str(b); let settings = &selector.settings; @@ -1248,12 +1264,12 @@ fn compare_by(a: &Line, b: &Line, global_settings: &GlobalSettings) -> Ordering } else { match settings.mode { SortMode::Numeric | SortMode::HumanNumeric => numeric_str_cmp( - (a_str, a_selection.num_cache.as_num_info()), - (b_str, b_selection.num_cache.as_num_info()), + (a_str, a_selection.num_cache.as_ref().unwrap().as_num_info()), + (b_str, b_selection.num_cache.as_ref().unwrap().as_num_info()), ), SortMode::GeneralNumeric => general_numeric_compare( - a_selection.num_cache.as_f64(), - b_selection.num_cache.as_f64(), + a_selection.num_cache.as_ref().unwrap().as_f64(), + b_selection.num_cache.as_ref().unwrap().as_f64(), ), SortMode::Month => month_compare(a_str, b_str), SortMode::Version => version_compare(a_str, b_str), @@ -1591,4 +1607,16 @@ mod tests { let line = "..a..a"; assert_eq!(tokenize(line, Some('a')), vec![0..2, 3..5]); } + + #[test] + #[cfg(target_pointer_width = "64")] + fn test_line_size() { + // We should make sure to not regress the size of the Line struct because + // it is unconditional overhead for every line we sort. + assert_eq!(std::mem::size_of::(), 56); + // These are the fields of Line: + assert_eq!(std::mem::size_of::>(), 16); + assert_eq!(std::mem::size_of::(), 24); + assert_eq!(std::mem::size_of::>(), 16); + } } From 8c9faa16b94c9ef9064b9a2e9d521046619bcc66 Mon Sep 17 00:00:00 2001 From: Michael Debertol Date: Fri, 7 May 2021 21:50:33 +0200 Subject: [PATCH 2/3] sort: improve memory usage for extsort --- src/uu/sort/src/external_sort/mod.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/uu/sort/src/external_sort/mod.rs b/src/uu/sort/src/external_sort/mod.rs index 725b17bbd..662250e1d 100644 --- a/src/uu/sort/src/external_sort/mod.rs +++ b/src/uu/sort/src/external_sort/mod.rs @@ -113,7 +113,7 @@ pub fn ext_sort( chunk.push(seq); - if total_read >= settings.buffer_size { + if total_read + chunk.len() * std::mem::size_of::() >= settings.buffer_size { super::sort_by(&mut chunk, &settings); write_chunk( settings, @@ -136,6 +136,9 @@ pub fn ext_sort( iter.chunks += 1; } + // We manually drop here to not go over our memory limit when we allocate below. + drop(chunk); + // initialize buffers for each chunk // // Having a right sized buffer for each chunk for smallish values seems silly to me? From d686f7e48f929c7fccfade292dce0a1dd2ff8eea Mon Sep 17 00:00:00 2001 From: Michael Debertol Date: Sat, 8 May 2021 22:31:53 +0200 Subject: [PATCH 3/3] sort: improve comments --- src/uu/sort/src/sort.rs | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/src/uu/sort/src/sort.rs b/src/uu/sort/src/sort.rs index 02e54baf8..776f71058 100644 --- a/src/uu/sort/src/sort.rs +++ b/src/uu/sort/src/sort.rs @@ -582,11 +582,14 @@ impl FieldSelector { self.from.field != 1 || self.from.char == 0 || self.to.is_some() } - fn get_selection(&self, line: &str, fields: Option<&[Field]>) -> Selection { - let mut range = SelectionRange::new(self.get_range(&line, fields)); + /// Get the selection that corresponds to this selector for the line. + /// If needs_fields returned false, tokens may be None. + fn get_selection(&self, line: &str, tokens: Option<&[Field]>) -> Selection { + let mut range = SelectionRange::new(self.get_range(&line, tokens)); let num_cache = if self.settings.mode == SortMode::Numeric || self.settings.mode == SortMode::HumanNumeric { + // Parse NumInfo for this number. let (info, num_range) = NumInfo::parse( range.get_str(&line), NumInfoParseSettings { @@ -595,20 +598,23 @@ impl FieldSelector { decimal_pt: Some(DECIMAL_PT), }, ); + // Shorten the range to what we need to pass to numeric_str_cmp later. range.shorten(num_range); Some(Box::new(NumCache::WithInfo(info))) } else if self.settings.mode == SortMode::GeneralNumeric { + // Parse this number as f64, as this is the requirement for general numeric sorting. let str = range.get_str(&line); Some(Box::new(NumCache::AsF64(general_f64_parse( &str[get_leading_gen(str)], )))) } else { + // This is not a numeric sort, so we don't need a NumCache. None }; Selection { range, num_cache } } - /// Look up the slice that corresponds to this selector for the given line. + /// Look up the range in the line that corresponds to this selector. /// If needs_fields returned false, tokens may be None. fn get_range<'a>(&self, line: &'a str, tokens: Option<&[Field]>) -> Range { enum Resolution { @@ -1356,7 +1362,8 @@ enum GeneralF64ParseResult { Infinity, } -/// Parse the beginning string into an f64, returning -inf instead of NaN on errors. +/// Parse the beginning string into a GeneralF64ParseResult. +/// Using a GeneralF64ParseResult instead of f64 is necessary to correctly order floats. #[inline(always)] fn general_f64_parse(a: &str) -> GeneralF64ParseResult { // The actual behavior here relies on Rust's implementation of parsing floating points.