mirror of
https://github.com/RGBCube/uutils-coreutils
synced 2025-07-27 19:17:43 +00:00
Merge pull request #2144 from miDeb/sort-no-transforms
sort: add some custom string comparisons
This commit is contained in:
commit
9b7e7bbbc6
2 changed files with 89 additions and 92 deletions
64
src/uu/sort/src/custom_str_cmp.rs
Normal file
64
src/uu/sort/src/custom_str_cmp.rs
Normal file
|
@ -0,0 +1,64 @@
|
||||||
|
// * This file is part of the uutils coreutils package.
|
||||||
|
// *
|
||||||
|
// * (c) Michael Debertol <michael.debertol..AT..gmail.com>
|
||||||
|
// *
|
||||||
|
// * For the full copyright and license information, please view the LICENSE
|
||||||
|
// * file that was distributed with this source code.
|
||||||
|
|
||||||
|
//! Custom string comparisons.
|
||||||
|
//!
|
||||||
|
//! The goal is to compare strings without transforming them first (i.e. not allocating new strings)
|
||||||
|
|
||||||
|
use std::cmp::Ordering;
|
||||||
|
|
||||||
|
/// Decides whether `c` takes part in a comparison.
///
/// Returns `true` when the character is kept and `false` when it is skipped.
///
/// * `ignore_non_printing` - skip ASCII control characters and every
///   non-ASCII character.
/// * `ignore_non_dictionary` - skip everything that is neither ASCII
///   alphanumeric nor ASCII whitespace.
///
/// When both flags are set, a character has to satisfy both rules to be kept.
fn filter_char(c: char, ignore_non_printing: bool, ignore_non_dictionary: bool) -> bool {
    // A rule that is switched off accepts every character.
    let passes_dictionary_rule =
        !ignore_non_dictionary || c.is_ascii_alphanumeric() || c.is_ascii_whitespace();
    let passes_printing_rule = !ignore_non_printing || (c.is_ascii() && !c.is_ascii_control());

    passes_dictionary_rule && passes_printing_rule
}
|
||||||
|
|
||||||
|
/// Orders two characters, optionally treating ASCII letters case-insensitively.
///
/// With `ignore_case` set, both characters are mapped through
/// `char::to_ascii_uppercase` before being compared, so only ASCII letters are
/// folded; every other character is compared unchanged.
fn cmp_chars(a: char, b: char, ignore_case: bool) -> Ordering {
    let (lhs, rhs) = if ignore_case {
        (a.to_ascii_uppercase(), b.to_ascii_uppercase())
    } else {
        (a, b)
    };
    lhs.cmp(&rhs)
}
|
||||||
|
|
||||||
|
pub fn custom_str_cmp(
|
||||||
|
a: &str,
|
||||||
|
b: &str,
|
||||||
|
ignore_non_printing: bool,
|
||||||
|
ignore_non_dictionary: bool,
|
||||||
|
ignore_case: bool,
|
||||||
|
) -> Ordering {
|
||||||
|
if !(ignore_case || ignore_non_dictionary || ignore_non_printing) {
|
||||||
|
// There are no custom settings. Fall back to the default strcmp, which is faster.
|
||||||
|
return a.cmp(&b);
|
||||||
|
}
|
||||||
|
let mut a_chars = a
|
||||||
|
.chars()
|
||||||
|
.filter(|&c| filter_char(c, ignore_non_printing, ignore_non_dictionary));
|
||||||
|
let mut b_chars = b
|
||||||
|
.chars()
|
||||||
|
.filter(|&c| filter_char(c, ignore_non_printing, ignore_non_dictionary));
|
||||||
|
loop {
|
||||||
|
let a_char = a_chars.next();
|
||||||
|
let b_char = b_chars.next();
|
||||||
|
match (a_char, b_char) {
|
||||||
|
(None, None) => return Ordering::Equal,
|
||||||
|
(Some(_), None) => return Ordering::Greater,
|
||||||
|
(None, Some(_)) => return Ordering::Less,
|
||||||
|
(Some(a_char), Some(b_char)) => {
|
||||||
|
let ordering = cmp_chars(a_char, b_char, ignore_case);
|
||||||
|
if ordering != Ordering::Equal {
|
||||||
|
return ordering;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -15,10 +15,12 @@
|
||||||
#[macro_use]
|
#[macro_use]
|
||||||
extern crate uucore;
|
extern crate uucore;
|
||||||
|
|
||||||
|
mod custom_str_cmp;
|
||||||
mod external_sort;
|
mod external_sort;
|
||||||
mod numeric_str_cmp;
|
mod numeric_str_cmp;
|
||||||
|
|
||||||
use clap::{App, Arg};
|
use clap::{App, Arg};
|
||||||
|
use custom_str_cmp::custom_str_cmp;
|
||||||
use external_sort::{ExternalSorter, ExternallySortable};
|
use external_sort::{ExternalSorter, ExternallySortable};
|
||||||
use fnv::FnvHasher;
|
use fnv::FnvHasher;
|
||||||
use itertools::Itertools;
|
use itertools::Itertools;
|
||||||
|
@ -206,33 +208,23 @@ impl From<&GlobalSettings> for KeySettings {
|
||||||
|
|
||||||
#[derive(Debug, Serialize, Deserialize, Clone)]
|
#[derive(Debug, Serialize, Deserialize, Clone)]
|
||||||
/// Represents the string selected by a FieldSelector.
|
/// Represents the string selected by a FieldSelector.
|
||||||
enum SelectionRange {
|
struct SelectionRange {
|
||||||
/// If we had to transform this selection, we have to store a new string.
|
range: Range<usize>,
|
||||||
String(String),
|
|
||||||
/// If there was no transformation, we can store an index into the line.
|
|
||||||
ByIndex(Range<usize>),
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl SelectionRange {
|
impl SelectionRange {
|
||||||
|
fn new(range: Range<usize>) -> Self {
|
||||||
|
Self { range }
|
||||||
|
}
|
||||||
|
|
||||||
/// Gets the actual string slice represented by this Selection.
|
/// Gets the actual string slice represented by this Selection.
|
||||||
fn get_str<'a>(&'a self, line: &'a str) -> &'a str {
|
fn get_str<'a>(&self, line: &'a str) -> &'a str {
|
||||||
match self {
|
&line[self.range.to_owned()]
|
||||||
SelectionRange::String(string) => string.as_str(),
|
|
||||||
SelectionRange::ByIndex(range) => &line[range.to_owned()],
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn shorten(&mut self, new_range: Range<usize>) {
|
fn shorten(&mut self, new_range: Range<usize>) {
|
||||||
match self {
|
self.range.end = self.range.start + new_range.end;
|
||||||
SelectionRange::String(string) => {
|
self.range.start += new_range.start;
|
||||||
string.drain(new_range.end..);
|
|
||||||
string.drain(..new_range.start);
|
|
||||||
}
|
|
||||||
SelectionRange::ByIndex(range) => {
|
|
||||||
range.end = range.start + new_range.end;
|
|
||||||
range.start += new_range.start;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -303,14 +295,8 @@ impl Line {
|
||||||
.selectors
|
.selectors
|
||||||
.iter()
|
.iter()
|
||||||
.map(|selector| {
|
.map(|selector| {
|
||||||
let range = selector.get_selection(&line, fields.as_deref());
|
let mut range =
|
||||||
let mut range = if let Some(transformed) =
|
SelectionRange::new(selector.get_selection(&line, fields.as_deref()));
|
||||||
transform(&line[range.to_owned()], &selector.settings)
|
|
||||||
{
|
|
||||||
SelectionRange::String(transformed)
|
|
||||||
} else {
|
|
||||||
SelectionRange::ByIndex(range)
|
|
||||||
};
|
|
||||||
let num_cache = if selector.settings.mode == SortMode::Numeric
|
let num_cache = if selector.settings.mode == SortMode::Numeric
|
||||||
|| selector.settings.mode == SortMode::HumanNumeric
|
|| selector.settings.mode == SortMode::HumanNumeric
|
||||||
{
|
{
|
||||||
|
@ -460,34 +446,6 @@ impl Line {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Transform this line. Returns None if there's no need to transform.
|
|
||||||
fn transform(line: &str, settings: &KeySettings) -> Option<String> {
|
|
||||||
let mut transformed = None;
|
|
||||||
if settings.ignore_case {
|
|
||||||
transformed = Some(line.to_uppercase());
|
|
||||||
}
|
|
||||||
if settings.ignore_blanks {
|
|
||||||
transformed = Some(
|
|
||||||
transformed
|
|
||||||
.as_deref()
|
|
||||||
.unwrap_or(line)
|
|
||||||
.trim_start()
|
|
||||||
.to_string(),
|
|
||||||
);
|
|
||||||
}
|
|
||||||
if settings.dictionary_order {
|
|
||||||
transformed = Some(remove_nondictionary_chars(
|
|
||||||
transformed.as_deref().unwrap_or(line),
|
|
||||||
));
|
|
||||||
}
|
|
||||||
if settings.ignore_non_printing {
|
|
||||||
transformed = Some(remove_nonprinting_chars(
|
|
||||||
transformed.as_deref().unwrap_or(line),
|
|
||||||
));
|
|
||||||
}
|
|
||||||
transformed
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Tokenize a line into fields.
|
/// Tokenize a line into fields.
|
||||||
fn tokenize(line: &str, separator: Option<char>) -> Vec<Field> {
|
fn tokenize(line: &str, separator: Option<char>) -> Vec<Field> {
|
||||||
if let Some(separator) = separator {
|
if let Some(separator) = separator {
|
||||||
|
@ -1264,7 +1222,7 @@ fn ext_sort_by(unsorted: Vec<Line>, settings: GlobalSettings) -> Vec<Line> {
|
||||||
settings.clone(),
|
settings.clone(),
|
||||||
);
|
);
|
||||||
let iter = external_sorter
|
let iter = external_sorter
|
||||||
.sort_by(unsorted.into_iter(), settings.clone())
|
.sort_by(unsorted.into_iter(), settings)
|
||||||
.unwrap()
|
.unwrap()
|
||||||
.map(|x| x.unwrap())
|
.map(|x| x.unwrap())
|
||||||
.collect::<Vec<Line>>();
|
.collect::<Vec<Line>>();
|
||||||
|
@ -1296,12 +1254,18 @@ fn compare_by(a: &Line, b: &Line, global_settings: &GlobalSettings) -> Ordering
|
||||||
(b_str, b_selection.num_cache.as_num_info()),
|
(b_str, b_selection.num_cache.as_num_info()),
|
||||||
),
|
),
|
||||||
SortMode::GeneralNumeric => general_numeric_compare(
|
SortMode::GeneralNumeric => general_numeric_compare(
|
||||||
general_f64_parse(&a_str[get_leading_gen(a_str)]),
|
a_selection.num_cache.as_f64(),
|
||||||
general_f64_parse(&b_str[get_leading_gen(b_str)]),
|
b_selection.num_cache.as_f64(),
|
||||||
),
|
),
|
||||||
SortMode::Month => month_compare(a_str, b_str),
|
SortMode::Month => month_compare(a_str, b_str),
|
||||||
SortMode::Version => version_compare(a_str, b_str),
|
SortMode::Version => version_compare(a_str, b_str),
|
||||||
SortMode::Default => default_compare(a_str, b_str),
|
SortMode::Default => custom_str_cmp(
|
||||||
|
a_str,
|
||||||
|
b_str,
|
||||||
|
settings.ignore_non_printing,
|
||||||
|
settings.dictionary_order,
|
||||||
|
settings.ignore_case,
|
||||||
|
),
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
if cmp != Ordering::Equal {
|
if cmp != Ordering::Equal {
|
||||||
|
@ -1313,7 +1277,7 @@ fn compare_by(a: &Line, b: &Line, global_settings: &GlobalSettings) -> Ordering
|
||||||
let cmp = if global_settings.random || global_settings.stable || global_settings.unique {
|
let cmp = if global_settings.random || global_settings.stable || global_settings.unique {
|
||||||
Ordering::Equal
|
Ordering::Equal
|
||||||
} else {
|
} else {
|
||||||
default_compare(&a.line, &b.line)
|
a.line.cmp(&b.line)
|
||||||
};
|
};
|
||||||
|
|
||||||
if global_settings.reverse {
|
if global_settings.reverse {
|
||||||
|
@ -1323,13 +1287,6 @@ fn compare_by(a: &Line, b: &Line, global_settings: &GlobalSettings) -> Ordering
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Test output against BSDs and GNU with their locale
|
|
||||||
// env var set to lc_ctype=utf-8 to enjoy the exact same output.
|
|
||||||
#[inline(always)]
|
|
||||||
fn default_compare(a: &str, b: &str) -> Ordering {
|
|
||||||
a.cmp(b)
|
|
||||||
}
|
|
||||||
|
|
||||||
// This function cleans up the initial comparison done by leading_num_common for a general numeric compare.
|
// This function cleans up the initial comparison done by leading_num_common for a general numeric compare.
|
||||||
// In contrast to numeric compare, GNU general numeric/FP sort *should* recognize positive signs and
|
// In contrast to numeric compare, GNU general numeric/FP sort *should* recognize positive signs and
|
||||||
// scientific notation, so we strip those lines only after the end of the following numeric string.
|
// scientific notation, so we strip those lines only after the end of the following numeric string.
|
||||||
|
@ -1516,22 +1473,6 @@ fn version_compare(a: &str, b: &str) -> Ordering {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn remove_nondictionary_chars(s: &str) -> String {
|
|
||||||
// According to GNU, dictionary chars are those of ASCII
|
|
||||||
// and a blank is a space or a tab
|
|
||||||
s.chars()
|
|
||||||
.filter(|c| c.is_ascii_alphanumeric() || c.is_ascii_whitespace())
|
|
||||||
.collect::<String>()
|
|
||||||
}
|
|
||||||
|
|
||||||
fn remove_nonprinting_chars(s: &str) -> String {
|
|
||||||
// However, GNU says nonprinting chars are more permissive.
|
|
||||||
// All of ASCII except control chars ie, escape, newline
|
|
||||||
s.chars()
|
|
||||||
.filter(|c| c.is_ascii() && !c.is_ascii_control())
|
|
||||||
.collect::<String>()
|
|
||||||
}
|
|
||||||
|
|
||||||
fn print_sorted<T: Iterator<Item = Line>>(iter: T, settings: &GlobalSettings) {
|
fn print_sorted<T: Iterator<Item = Line>>(iter: T, settings: &GlobalSettings) {
|
||||||
let mut file: Box<dyn Write> = match settings.outfile {
|
let mut file: Box<dyn Write> = match settings.outfile {
|
||||||
Some(ref filename) => match File::create(Path::new(&filename)) {
|
Some(ref filename) => match File::create(Path::new(&filename)) {
|
||||||
|
@ -1598,14 +1539,6 @@ mod tests {
|
||||||
assert_eq!(Ordering::Equal, random_shuffle(a, b, c));
|
assert_eq!(Ordering::Equal, random_shuffle(a, b, c));
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_default_compare() {
|
|
||||||
let a = "your own";
|
|
||||||
let b = "your place";
|
|
||||||
|
|
||||||
assert_eq!(Ordering::Less, default_compare(a, b));
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_month_compare() {
|
fn test_month_compare() {
|
||||||
let a = "JaN";
|
let a = "JaN";
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue