1
Fork 0
mirror of https://github.com/RGBCube/uutils-coreutils synced 2025-07-27 19:17:43 +00:00

Merge pull request #2144 from miDeb/sort-no-transforms

sort: add some custom string comparisons
This commit is contained in:
Sylvestre Ledru 2021-05-02 18:04:27 +02:00 committed by GitHub
commit 9b7e7bbbc6
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 89 additions and 92 deletions

View file

@ -0,0 +1,64 @@
// * This file is part of the uutils coreutils package.
// *
// * (c) Michael Debertol <michael.debertol..AT..gmail.com>
// *
// * For the full copyright and license information, please view the LICENSE
// * file that was distributed with this source code.
//! Custom string comparisons.
//!
//! The goal is to compare strings without transforming them first (i.e., without allocating new strings).
use std::cmp::Ordering;
/// Decides whether `c` takes part in a comparison.
///
/// * `ignore_non_printing` - when set, ASCII control characters and all
///   non-ASCII characters are rejected.
/// * `ignore_non_dictionary` - when set, everything except ASCII
///   alphanumerics and ASCII whitespace is rejected.
///
/// Returns `true` if the character should be kept, `false` if it should be
/// skipped. With both flags unset every character is kept.
fn filter_char(c: char, ignore_non_printing: bool, ignore_non_dictionary: bool) -> bool {
    let is_dictionary = c.is_ascii_alphanumeric() || c.is_ascii_whitespace();
    let is_printing = c.is_ascii() && !c.is_ascii_control();
    // A character survives when every *enabled* filter accepts it.
    (!ignore_non_dictionary || is_dictionary) && (!ignore_non_printing || is_printing)
}
/// Compares two characters, optionally ignoring ASCII case.
///
/// When `ignore_case` is set, both characters are converted with
/// `to_ascii_uppercase` before comparing, so only ASCII letters are folded;
/// all other characters compare by their unmodified value.
fn cmp_chars(a: char, b: char, ignore_case: bool) -> Ordering {
    let fold = |c: char| {
        if ignore_case {
            c.to_ascii_uppercase()
        } else {
            c
        }
    };
    fold(a).cmp(&fold(b))
}
/// Compares `a` and `b` character by character without building transformed
/// copies of either string.
///
/// * `ignore_non_printing` / `ignore_non_dictionary` - forwarded to
///   `filter_char` to decide which characters are skipped on both sides.
/// * `ignore_case` - forwarded to `cmp_chars` for each pair of kept characters.
///
/// The result is the ordering of the first pair of kept characters that
/// differ. If one string runs out of kept characters first it sorts as
/// `Less`; if both run out together the strings are `Equal`.
pub fn custom_str_cmp(
    a: &str,
    b: &str,
    ignore_non_printing: bool,
    ignore_non_dictionary: bool,
    ignore_case: bool,
) -> Ordering {
    let any_custom_setting = ignore_case || ignore_non_dictionary || ignore_non_printing;
    if !any_custom_setting {
        // Nothing to filter or fold: the plain string comparison is faster.
        return a.cmp(b);
    }

    // One predicate shared by both sides so they are filtered identically.
    let keep = |c: &char| filter_char(*c, ignore_non_printing, ignore_non_dictionary);
    let mut lhs = a.chars().filter(keep);
    let mut rhs = b.chars().filter(keep);

    loop {
        let ordering = match (lhs.next(), rhs.next()) {
            (Some(x), Some(y)) => cmp_chars(x, y, ignore_case),
            (Some(_), None) => Ordering::Greater,
            (None, Some(_)) => Ordering::Less,
            (None, None) => return Ordering::Equal,
        };
        // Equal pairs carry no information; keep scanning.
        if ordering != Ordering::Equal {
            return ordering;
        }
    }
}

View file

@ -15,10 +15,12 @@
#[macro_use] #[macro_use]
extern crate uucore; extern crate uucore;
mod custom_str_cmp;
mod external_sort; mod external_sort;
mod numeric_str_cmp; mod numeric_str_cmp;
use clap::{App, Arg}; use clap::{App, Arg};
use custom_str_cmp::custom_str_cmp;
use external_sort::{ExternalSorter, ExternallySortable}; use external_sort::{ExternalSorter, ExternallySortable};
use fnv::FnvHasher; use fnv::FnvHasher;
use itertools::Itertools; use itertools::Itertools;
@ -206,33 +208,23 @@ impl From<&GlobalSettings> for KeySettings {
#[derive(Debug, Serialize, Deserialize, Clone)] #[derive(Debug, Serialize, Deserialize, Clone)]
/// Represents the string selected by a FieldSelector. /// Represents the string selected by a FieldSelector.
enum SelectionRange { struct SelectionRange {
/// If we had to transform this selection, we have to store a new string. range: Range<usize>,
String(String),
/// If there was no transformation, we can store an index into the line.
ByIndex(Range<usize>),
} }
impl SelectionRange { impl SelectionRange {
fn new(range: Range<usize>) -> Self {
Self { range }
}
/// Gets the actual string slice represented by this Selection. /// Gets the actual string slice represented by this Selection.
fn get_str<'a>(&'a self, line: &'a str) -> &'a str { fn get_str<'a>(&self, line: &'a str) -> &'a str {
match self { &line[self.range.to_owned()]
SelectionRange::String(string) => string.as_str(),
SelectionRange::ByIndex(range) => &line[range.to_owned()],
}
} }
fn shorten(&mut self, new_range: Range<usize>) { fn shorten(&mut self, new_range: Range<usize>) {
match self { self.range.end = self.range.start + new_range.end;
SelectionRange::String(string) => { self.range.start += new_range.start;
string.drain(new_range.end..);
string.drain(..new_range.start);
}
SelectionRange::ByIndex(range) => {
range.end = range.start + new_range.end;
range.start += new_range.start;
}
}
} }
} }
@ -303,14 +295,8 @@ impl Line {
.selectors .selectors
.iter() .iter()
.map(|selector| { .map(|selector| {
let range = selector.get_selection(&line, fields.as_deref()); let mut range =
let mut range = if let Some(transformed) = SelectionRange::new(selector.get_selection(&line, fields.as_deref()));
transform(&line[range.to_owned()], &selector.settings)
{
SelectionRange::String(transformed)
} else {
SelectionRange::ByIndex(range)
};
let num_cache = if selector.settings.mode == SortMode::Numeric let num_cache = if selector.settings.mode == SortMode::Numeric
|| selector.settings.mode == SortMode::HumanNumeric || selector.settings.mode == SortMode::HumanNumeric
{ {
@ -460,34 +446,6 @@ impl Line {
} }
} }
/// Transform this line. Returns None if there's no need to transform.
fn transform(line: &str, settings: &KeySettings) -> Option<String> {
let mut transformed = None;
if settings.ignore_case {
transformed = Some(line.to_uppercase());
}
if settings.ignore_blanks {
transformed = Some(
transformed
.as_deref()
.unwrap_or(line)
.trim_start()
.to_string(),
);
}
if settings.dictionary_order {
transformed = Some(remove_nondictionary_chars(
transformed.as_deref().unwrap_or(line),
));
}
if settings.ignore_non_printing {
transformed = Some(remove_nonprinting_chars(
transformed.as_deref().unwrap_or(line),
));
}
transformed
}
/// Tokenize a line into fields. /// Tokenize a line into fields.
fn tokenize(line: &str, separator: Option<char>) -> Vec<Field> { fn tokenize(line: &str, separator: Option<char>) -> Vec<Field> {
if let Some(separator) = separator { if let Some(separator) = separator {
@ -1264,7 +1222,7 @@ fn ext_sort_by(unsorted: Vec<Line>, settings: GlobalSettings) -> Vec<Line> {
settings.clone(), settings.clone(),
); );
let iter = external_sorter let iter = external_sorter
.sort_by(unsorted.into_iter(), settings.clone()) .sort_by(unsorted.into_iter(), settings)
.unwrap() .unwrap()
.map(|x| x.unwrap()) .map(|x| x.unwrap())
.collect::<Vec<Line>>(); .collect::<Vec<Line>>();
@ -1296,12 +1254,18 @@ fn compare_by(a: &Line, b: &Line, global_settings: &GlobalSettings) -> Ordering
(b_str, b_selection.num_cache.as_num_info()), (b_str, b_selection.num_cache.as_num_info()),
), ),
SortMode::GeneralNumeric => general_numeric_compare( SortMode::GeneralNumeric => general_numeric_compare(
general_f64_parse(&a_str[get_leading_gen(a_str)]), a_selection.num_cache.as_f64(),
general_f64_parse(&b_str[get_leading_gen(b_str)]), b_selection.num_cache.as_f64(),
), ),
SortMode::Month => month_compare(a_str, b_str), SortMode::Month => month_compare(a_str, b_str),
SortMode::Version => version_compare(a_str, b_str), SortMode::Version => version_compare(a_str, b_str),
SortMode::Default => default_compare(a_str, b_str), SortMode::Default => custom_str_cmp(
a_str,
b_str,
settings.ignore_non_printing,
settings.dictionary_order,
settings.ignore_case,
),
} }
}; };
if cmp != Ordering::Equal { if cmp != Ordering::Equal {
@ -1313,7 +1277,7 @@ fn compare_by(a: &Line, b: &Line, global_settings: &GlobalSettings) -> Ordering
let cmp = if global_settings.random || global_settings.stable || global_settings.unique { let cmp = if global_settings.random || global_settings.stable || global_settings.unique {
Ordering::Equal Ordering::Equal
} else { } else {
default_compare(&a.line, &b.line) a.line.cmp(&b.line)
}; };
if global_settings.reverse { if global_settings.reverse {
@ -1323,13 +1287,6 @@ fn compare_by(a: &Line, b: &Line, global_settings: &GlobalSettings) -> Ordering
} }
} }
// Test output against BSDs and GNU with their locale
// env var set to lc_ctype=utf-8 to enjoy the exact same output.
#[inline(always)]
fn default_compare(a: &str, b: &str) -> Ordering {
a.cmp(b)
}
// This function cleans up the initial comparison done by leading_num_common for a general numeric compare. // This function cleans up the initial comparison done by leading_num_common for a general numeric compare.
// In contrast to numeric compare, GNU general numeric/FP sort *should* recognize positive signs and // In contrast to numeric compare, GNU general numeric/FP sort *should* recognize positive signs and
// scientific notation, so we strip those lines only after the end of the following numeric string. // scientific notation, so we strip those lines only after the end of the following numeric string.
@ -1516,22 +1473,6 @@ fn version_compare(a: &str, b: &str) -> Ordering {
} }
} }
fn remove_nondictionary_chars(s: &str) -> String {
// According to GNU, dictionary chars are those of ASCII
// and a blank is a space or a tab
s.chars()
.filter(|c| c.is_ascii_alphanumeric() || c.is_ascii_whitespace())
.collect::<String>()
}
fn remove_nonprinting_chars(s: &str) -> String {
// However, GNU says nonprinting chars are more permissive.
// All of ASCII except control chars ie, escape, newline
s.chars()
.filter(|c| c.is_ascii() && !c.is_ascii_control())
.collect::<String>()
}
fn print_sorted<T: Iterator<Item = Line>>(iter: T, settings: &GlobalSettings) { fn print_sorted<T: Iterator<Item = Line>>(iter: T, settings: &GlobalSettings) {
let mut file: Box<dyn Write> = match settings.outfile { let mut file: Box<dyn Write> = match settings.outfile {
Some(ref filename) => match File::create(Path::new(&filename)) { Some(ref filename) => match File::create(Path::new(&filename)) {
@ -1598,14 +1539,6 @@ mod tests {
assert_eq!(Ordering::Equal, random_shuffle(a, b, c)); assert_eq!(Ordering::Equal, random_shuffle(a, b, c));
} }
#[test]
fn test_default_compare() {
let a = "your own";
let b = "your place";
assert_eq!(Ordering::Less, default_compare(a, b));
}
#[test] #[test]
fn test_month_compare() { fn test_month_compare() {
let a = "JaN"; let a = "JaN";