From 9f45431bf041053268bd4e76c10189432122a637 Mon Sep 17 00:00:00 2001 From: Michael Debertol Date: Thu, 29 Apr 2021 18:02:06 +0200 Subject: [PATCH 1/3] sort: add some custom string comparisons This removes the need to allocate a new string for each line when used with -f, -d or -i. Instead, a custom string comparison algorithm takes care of these cases. The resulting performance improvement is about 20% per flag (i.e. there is a 60% improvement when combining all three flags) As a side-effect, the size of the Line struct was reduced from 96 to 80 bytes, reducing the overhead for each line. --- src/uu/sort/src/custom_str_cmp.rs | 64 +++++++++++++++++ src/uu/sort/src/sort.rs | 111 ++++++------------------------ 2 files changed, 86 insertions(+), 89 deletions(-) create mode 100644 src/uu/sort/src/custom_str_cmp.rs diff --git a/src/uu/sort/src/custom_str_cmp.rs b/src/uu/sort/src/custom_str_cmp.rs new file mode 100644 index 000000000..a087a9fc2 --- /dev/null +++ b/src/uu/sort/src/custom_str_cmp.rs @@ -0,0 +1,64 @@ +// * This file is part of the uutils coreutils package. +// * +// * (c) Michael Debertol +// * +// * For the full copyright and license information, please view the LICENSE +// * file that was distributed with this source code. + +//! Custom string comparisons. +//! +//! The goal is to compare strings without transforming them first (i.e. not allocating new strings) + +use std::cmp::Ordering; + +fn filter_char(c: char, ignore_non_printing: bool, ignore_non_dictionary: bool) -> bool { + if ignore_non_dictionary && !(c.is_ascii_alphanumeric() || c.is_ascii_whitespace()) { + return false; + } + if ignore_non_printing && (c.is_ascii_control() || !c.is_ascii()) { + return false; + } + true +} + +fn cmp_chars(a: char, b: char, ignore_case: bool) -> Ordering { + if ignore_case { + a.to_ascii_uppercase().cmp(&b.to_ascii_uppercase()) + } else { + a.cmp(&b) + } +} + +pub fn custom_str_cmp( + a: &str, + b: &str, + ignore_non_printing: bool, + ignore_non_dictionary: bool, + ignore_case: bool, +) -> Ordering { + if !(ignore_case || ignore_non_dictionary || ignore_non_printing) { + // There are no custom settings. Fall back to the default strcmp, which is faster. + return a.cmp(&b); + } + let mut a_chars = a + .chars() + .filter(|&c| filter_char(c, ignore_non_printing, ignore_non_dictionary)); + let mut b_chars = b + .chars() + .filter(|&c| filter_char(c, ignore_non_printing, ignore_non_dictionary)); + loop { + let a_char = a_chars.next(); + let b_char = b_chars.next(); + match (a_char, b_char) { + (None, None) => return Ordering::Equal, + (Some(_), None) => return Ordering::Greater, + (None, Some(_)) => return Ordering::Less, + (Some(a_char), Some(b_char)) => { + let ordering = cmp_chars(a_char, b_char, ignore_case); + if ordering != Ordering::Equal { + return ordering; + } + } + } + } +} diff --git a/src/uu/sort/src/sort.rs b/src/uu/sort/src/sort.rs index 18d9304fa..a18850e9d 100644 --- a/src/uu/sort/src/sort.rs +++ b/src/uu/sort/src/sort.rs @@ -15,10 +15,12 @@ #[macro_use] extern crate uucore; +mod custom_str_cmp; mod external_sort; mod numeric_str_cmp; use clap::{App, Arg}; +use custom_str_cmp::custom_str_cmp; use external_sort::{ExternalSorter, ExternallySortable}; use fnv::FnvHasher; use itertools::Itertools; @@ -206,33 +208,23 @@ impl From<&GlobalSettings> for KeySettings { #[derive(Debug, Serialize, Deserialize, Clone)] /// Represents the string selected by a FieldSelector. -enum SelectionRange { - /// If we had to transform this selection, we have to store a new string. - String(String), - /// If there was no transformation, we can store an index into the line. - ByIndex(Range), +struct SelectionRange { + range: Range, } impl SelectionRange { + fn new(range: Range) -> Self { + Self { range } + } + /// Gets the actual string slice represented by this Selection. - fn get_str<'a>(&'a self, line: &'a str) -> &'a str { - match self { - SelectionRange::String(string) => string.as_str(), - SelectionRange::ByIndex(range) => &line[range.to_owned()], - } + fn get_str<'a>(&self, line: &'a str) -> &'a str { + &line[self.range.to_owned()] } fn shorten(&mut self, new_range: Range) { - match self { - SelectionRange::String(string) => { - string.drain(new_range.end..); - string.drain(..new_range.start); - } - SelectionRange::ByIndex(range) => { - range.end = range.start + new_range.end; - range.start += new_range.start; - } - } + self.range.end = self.range.start + new_range.end; + self.range.start += new_range.start; } } @@ -303,14 +295,8 @@ impl Line { .selectors .iter() .map(|selector| { - let range = selector.get_selection(&line, fields.as_deref()); - let mut range = if let Some(transformed) = - transform(&line[range.to_owned()], &selector.settings) - { - SelectionRange::String(transformed) - } else { - SelectionRange::ByIndex(range) - }; + let mut range = + SelectionRange::new(selector.get_selection(&line, fields.as_deref())); let num_cache = if selector.settings.mode == SortMode::Numeric || selector.settings.mode == SortMode::HumanNumeric { @@ -460,34 +446,6 @@ impl Line { } } -/// Transform this line. Returns None if there's no need to transform. -fn transform(line: &str, settings: &KeySettings) -> Option { - let mut transformed = None; - if settings.ignore_case { - transformed = Some(line.to_uppercase()); - } - if settings.ignore_blanks { - transformed = Some( - transformed - .as_deref() - .unwrap_or(line) - .trim_start() - .to_string(), - ); - } - if settings.dictionary_order { - transformed = Some(remove_nondictionary_chars( - transformed.as_deref().unwrap_or(line), - )); - } - if settings.ignore_non_printing { - transformed = Some(remove_nonprinting_chars( - transformed.as_deref().unwrap_or(line), - )); - } - transformed -} - /// Tokenize a line into fields. fn tokenize(line: &str, separator: Option) -> Vec { if let Some(separator) = separator { @@ -1301,7 +1259,13 @@ fn compare_by(a: &Line, b: &Line, global_settings: &GlobalSettings) -> Ordering ), SortMode::Month => month_compare(a_str, b_str), SortMode::Version => version_compare(a_str, b_str), - SortMode::Default => default_compare(a_str, b_str), + SortMode::Default => custom_str_cmp( + a_str, + b_str, + settings.ignore_non_printing, + settings.dictionary_order, + settings.ignore_case, + ), } }; if cmp != Ordering::Equal { @@ -1313,7 +1277,7 @@ fn compare_by(a: &Line, b: &Line, global_settings: &GlobalSettings) -> Ordering let cmp = if global_settings.random || global_settings.stable || global_settings.unique { Ordering::Equal } else { - default_compare(&a.line, &b.line) + a.line.cmp(&b.line) }; if global_settings.reverse { @@ -1323,13 +1287,6 @@ fn compare_by(a: &Line, b: &Line, global_settings: &GlobalSettings) -> Ordering } } -// Test output against BSDs and GNU with their locale -// env var set to lc_ctype=utf-8 to enjoy the exact same output. -#[inline(always)] -fn default_compare(a: &str, b: &str) -> Ordering { - a.cmp(b) -} - // This function cleans up the initial comparison done by leading_num_common for a general numeric compare. // In contrast to numeric compare, GNU general numeric/FP sort *should* recognize positive signs and // scientific notation, so we strip those lines only after the end of the following numeric string. @@ -1516,22 +1473,6 @@ fn version_compare(a: &str, b: &str) -> Ordering { } } -fn remove_nondictionary_chars(s: &str) -> String { - // According to GNU, dictionary chars are those of ASCII - // and a blank is a space or a tab - s.chars() - .filter(|c| c.is_ascii_alphanumeric() || c.is_ascii_whitespace()) - .collect::() -} - -fn remove_nonprinting_chars(s: &str) -> String { - // However, GNU says nonprinting chars are more permissive. - // All of ASCII except control chars ie, escape, newline - s.chars() - .filter(|c| c.is_ascii() && !c.is_ascii_control()) - .collect::() -} - fn print_sorted>(iter: T, settings: &GlobalSettings) { let mut file: Box = match settings.outfile { Some(ref filename) => match File::create(Path::new(&filename)) { @@ -1598,14 +1539,6 @@ mod tests { assert_eq!(Ordering::Equal, random_shuffle(a, b, c)); } - #[test] - fn test_default_compare() { - let a = "your own"; - let b = "your place"; - - assert_eq!(Ordering::Less, default_compare(a, b)); - } - #[test] fn test_month_compare() { let a = "JaN"; From a4813c26468963b7053de57fcddb0c95fa5b23fa Mon Sep 17 00:00:00 2001 From: Michael Debertol Date: Thu, 29 Apr 2021 18:03:00 +0200 Subject: [PATCH 2/3] sort: actually use the f64 cache This was probably reverted accidentally. --- src/uu/sort/src/sort.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/uu/sort/src/sort.rs b/src/uu/sort/src/sort.rs index a18850e9d..6768c82c1 100644 --- a/src/uu/sort/src/sort.rs +++ b/src/uu/sort/src/sort.rs @@ -1254,8 +1254,8 @@ fn compare_by(a: &Line, b: &Line, global_settings: &GlobalSettings) -> Ordering (b_str, b_selection.num_cache.as_num_info()), ), SortMode::GeneralNumeric => general_numeric_compare( - general_f64_parse(&a_str[get_leading_gen(a_str)]), - general_f64_parse(&b_str[get_leading_gen(b_str)]), + a_selection.num_cache.as_f64(), + b_selection.num_cache.as_f64(), ), SortMode::Month => month_compare(a_str, b_str), SortMode::Version => version_compare(a_str, b_str), From fecbf3dc856a37db6cf8d7a7a4ca15953b6b3f9d Mon Sep 17 00:00:00 2001 From: Michael Debertol Date: Thu, 29 Apr 2021 18:03:12 +0200 Subject: [PATCH 3/3] sort: remove an unneeded clone() --- src/uu/sort/src/sort.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/uu/sort/src/sort.rs b/src/uu/sort/src/sort.rs index 6768c82c1..c82524796 100644 --- a/src/uu/sort/src/sort.rs +++ b/src/uu/sort/src/sort.rs @@ -1222,7 +1222,7 @@ fn ext_sort_by(unsorted: Vec, settings: GlobalSettings) -> Vec { settings.clone(), ); let iter = external_sorter - .sort_by(unsorted.into_iter(), settings.clone()) + .sort_by(unsorted.into_iter(), settings) .unwrap() .map(|x| x.unwrap()) .collect::>();