mirror of
https://github.com/RGBCube/uutils-coreutils
synced 2025-07-27 19:17:43 +00:00
Merge pull request #2144 from miDeb/sort-no-transforms
sort: add some custom string comparisons
This commit is contained in:
commit
9b7e7bbbc6
2 changed files with 89 additions and 92 deletions
64
src/uu/sort/src/custom_str_cmp.rs
Normal file
64
src/uu/sort/src/custom_str_cmp.rs
Normal file
|
@ -0,0 +1,64 @@
|
|||
// * This file is part of the uutils coreutils package.
|
||||
// *
|
||||
// * (c) Michael Debertol <michael.debertol..AT..gmail.com>
|
||||
// *
|
||||
// * For the full copyright and license information, please view the LICENSE
|
||||
// * file that was distributed with this source code.
|
||||
|
||||
//! Custom string comparisons.
|
||||
//!
|
||||
//! The goal is to compare strings without transforming them first (i.e. not allocating new strings)
|
||||
|
||||
use std::cmp::Ordering;
|
||||
|
||||
/// Decides whether `c` takes part in a comparison.
///
/// Returns `true` when the character should be kept and `false` when it
/// should be skipped.
///
/// * `ignore_non_dictionary` - when set, only ASCII alphanumeric characters and
///   ASCII whitespace are kept.
/// * `ignore_non_printing` - when set, ASCII control characters and all
///   non-ASCII characters are skipped.
///
/// With both flags unset every character is kept.
fn filter_char(c: char, ignore_non_printing: bool, ignore_non_dictionary: bool) -> bool {
    // Each rule only applies when its flag is enabled; a disabled rule accepts everything.
    let passes_dictionary_rule =
        !ignore_non_dictionary || c.is_ascii_alphanumeric() || c.is_ascii_whitespace();
    let passes_printing_rule = !ignore_non_printing || (c.is_ascii() && !c.is_ascii_control());

    passes_dictionary_rule && passes_printing_rule
}
|
||||
|
||||
/// Orders two characters, optionally ignoring ASCII case.
///
/// When `ignore_case` is set, both characters are converted with
/// `to_ascii_uppercase` before being compared, so only ASCII letters are
/// folded; any other character is compared unchanged.
fn cmp_chars(a: char, b: char, ignore_case: bool) -> Ordering {
    let (lhs, rhs) = if ignore_case {
        (a.to_ascii_uppercase(), b.to_ascii_uppercase())
    } else {
        (a, b)
    };
    lhs.cmp(&rhs)
}
|
||||
|
||||
pub fn custom_str_cmp(
|
||||
a: &str,
|
||||
b: &str,
|
||||
ignore_non_printing: bool,
|
||||
ignore_non_dictionary: bool,
|
||||
ignore_case: bool,
|
||||
) -> Ordering {
|
||||
if !(ignore_case || ignore_non_dictionary || ignore_non_printing) {
|
||||
// There are no custom settings. Fall back to the default strcmp, which is faster.
|
||||
return a.cmp(&b);
|
||||
}
|
||||
let mut a_chars = a
|
||||
.chars()
|
||||
.filter(|&c| filter_char(c, ignore_non_printing, ignore_non_dictionary));
|
||||
let mut b_chars = b
|
||||
.chars()
|
||||
.filter(|&c| filter_char(c, ignore_non_printing, ignore_non_dictionary));
|
||||
loop {
|
||||
let a_char = a_chars.next();
|
||||
let b_char = b_chars.next();
|
||||
match (a_char, b_char) {
|
||||
(None, None) => return Ordering::Equal,
|
||||
(Some(_), None) => return Ordering::Greater,
|
||||
(None, Some(_)) => return Ordering::Less,
|
||||
(Some(a_char), Some(b_char)) => {
|
||||
let ordering = cmp_chars(a_char, b_char, ignore_case);
|
||||
if ordering != Ordering::Equal {
|
||||
return ordering;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -15,10 +15,12 @@
|
|||
#[macro_use]
|
||||
extern crate uucore;
|
||||
|
||||
mod custom_str_cmp;
|
||||
mod external_sort;
|
||||
mod numeric_str_cmp;
|
||||
|
||||
use clap::{App, Arg};
|
||||
use custom_str_cmp::custom_str_cmp;
|
||||
use external_sort::{ExternalSorter, ExternallySortable};
|
||||
use fnv::FnvHasher;
|
||||
use itertools::Itertools;
|
||||
|
@ -206,33 +208,23 @@ impl From<&GlobalSettings> for KeySettings {
|
|||
|
||||
#[derive(Debug, Serialize, Deserialize, Clone)]
|
||||
/// Represents the string selected by a FieldSelector.
|
||||
enum SelectionRange {
|
||||
/// If we had to transform this selection, we have to store a new string.
|
||||
String(String),
|
||||
/// If there was no transformation, we can store an index into the line.
|
||||
ByIndex(Range<usize>),
|
||||
struct SelectionRange {
|
||||
range: Range<usize>,
|
||||
}
|
||||
|
||||
impl SelectionRange {
|
||||
fn new(range: Range<usize>) -> Self {
|
||||
Self { range }
|
||||
}
|
||||
|
||||
/// Gets the actual string slice represented by this Selection.
|
||||
fn get_str<'a>(&'a self, line: &'a str) -> &'a str {
|
||||
match self {
|
||||
SelectionRange::String(string) => string.as_str(),
|
||||
SelectionRange::ByIndex(range) => &line[range.to_owned()],
|
||||
}
|
||||
fn get_str<'a>(&self, line: &'a str) -> &'a str {
|
||||
&line[self.range.to_owned()]
|
||||
}
|
||||
|
||||
fn shorten(&mut self, new_range: Range<usize>) {
|
||||
match self {
|
||||
SelectionRange::String(string) => {
|
||||
string.drain(new_range.end..);
|
||||
string.drain(..new_range.start);
|
||||
}
|
||||
SelectionRange::ByIndex(range) => {
|
||||
range.end = range.start + new_range.end;
|
||||
range.start += new_range.start;
|
||||
}
|
||||
}
|
||||
self.range.end = self.range.start + new_range.end;
|
||||
self.range.start += new_range.start;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -303,14 +295,8 @@ impl Line {
|
|||
.selectors
|
||||
.iter()
|
||||
.map(|selector| {
|
||||
let range = selector.get_selection(&line, fields.as_deref());
|
||||
let mut range = if let Some(transformed) =
|
||||
transform(&line[range.to_owned()], &selector.settings)
|
||||
{
|
||||
SelectionRange::String(transformed)
|
||||
} else {
|
||||
SelectionRange::ByIndex(range)
|
||||
};
|
||||
let mut range =
|
||||
SelectionRange::new(selector.get_selection(&line, fields.as_deref()));
|
||||
let num_cache = if selector.settings.mode == SortMode::Numeric
|
||||
|| selector.settings.mode == SortMode::HumanNumeric
|
||||
{
|
||||
|
@ -460,34 +446,6 @@ impl Line {
|
|||
}
|
||||
}
|
||||
|
||||
/// Transform this line. Returns None if there's no need to transform.
|
||||
fn transform(line: &str, settings: &KeySettings) -> Option<String> {
|
||||
let mut transformed = None;
|
||||
if settings.ignore_case {
|
||||
transformed = Some(line.to_uppercase());
|
||||
}
|
||||
if settings.ignore_blanks {
|
||||
transformed = Some(
|
||||
transformed
|
||||
.as_deref()
|
||||
.unwrap_or(line)
|
||||
.trim_start()
|
||||
.to_string(),
|
||||
);
|
||||
}
|
||||
if settings.dictionary_order {
|
||||
transformed = Some(remove_nondictionary_chars(
|
||||
transformed.as_deref().unwrap_or(line),
|
||||
));
|
||||
}
|
||||
if settings.ignore_non_printing {
|
||||
transformed = Some(remove_nonprinting_chars(
|
||||
transformed.as_deref().unwrap_or(line),
|
||||
));
|
||||
}
|
||||
transformed
|
||||
}
|
||||
|
||||
/// Tokenize a line into fields.
|
||||
fn tokenize(line: &str, separator: Option<char>) -> Vec<Field> {
|
||||
if let Some(separator) = separator {
|
||||
|
@ -1264,7 +1222,7 @@ fn ext_sort_by(unsorted: Vec<Line>, settings: GlobalSettings) -> Vec<Line> {
|
|||
settings.clone(),
|
||||
);
|
||||
let iter = external_sorter
|
||||
.sort_by(unsorted.into_iter(), settings.clone())
|
||||
.sort_by(unsorted.into_iter(), settings)
|
||||
.unwrap()
|
||||
.map(|x| x.unwrap())
|
||||
.collect::<Vec<Line>>();
|
||||
|
@ -1296,12 +1254,18 @@ fn compare_by(a: &Line, b: &Line, global_settings: &GlobalSettings) -> Ordering
|
|||
(b_str, b_selection.num_cache.as_num_info()),
|
||||
),
|
||||
SortMode::GeneralNumeric => general_numeric_compare(
|
||||
general_f64_parse(&a_str[get_leading_gen(a_str)]),
|
||||
general_f64_parse(&b_str[get_leading_gen(b_str)]),
|
||||
a_selection.num_cache.as_f64(),
|
||||
b_selection.num_cache.as_f64(),
|
||||
),
|
||||
SortMode::Month => month_compare(a_str, b_str),
|
||||
SortMode::Version => version_compare(a_str, b_str),
|
||||
SortMode::Default => default_compare(a_str, b_str),
|
||||
SortMode::Default => custom_str_cmp(
|
||||
a_str,
|
||||
b_str,
|
||||
settings.ignore_non_printing,
|
||||
settings.dictionary_order,
|
||||
settings.ignore_case,
|
||||
),
|
||||
}
|
||||
};
|
||||
if cmp != Ordering::Equal {
|
||||
|
@ -1313,7 +1277,7 @@ fn compare_by(a: &Line, b: &Line, global_settings: &GlobalSettings) -> Ordering
|
|||
let cmp = if global_settings.random || global_settings.stable || global_settings.unique {
|
||||
Ordering::Equal
|
||||
} else {
|
||||
default_compare(&a.line, &b.line)
|
||||
a.line.cmp(&b.line)
|
||||
};
|
||||
|
||||
if global_settings.reverse {
|
||||
|
@ -1323,13 +1287,6 @@ fn compare_by(a: &Line, b: &Line, global_settings: &GlobalSettings) -> Ordering
|
|||
}
|
||||
}
|
||||
|
||||
// Plain comparison of the two string slices via `Ord`.
// When checking results against the BSD and GNU implementations, run them with
// their locale env var set to lc_ctype=utf-8 to get the exact same output.
#[inline(always)]
fn default_compare(a: &str, b: &str) -> Ordering {
    Ord::cmp(a, b)
}
|
||||
|
||||
// This function cleans up the initial comparison done by leading_num_common for a general numeric compare.
|
||||
// In contrast to numeric compare, GNU general numeric/FP sort *should* recognize positive signs and
|
||||
// scientific notation, so we strip those lines only after the end of the following numeric string.
|
||||
|
@ -1516,22 +1473,6 @@ fn version_compare(a: &str, b: &str) -> Ordering {
|
|||
}
|
||||
}
|
||||
|
||||
/// Returns a copy of `s` containing only its dictionary characters.
///
/// A character is kept when it is ASCII alphanumeric or ASCII whitespace.
/// Note that this keeps everything `char::is_ascii_whitespace` accepts, which
/// is more than just space and tab.
fn remove_nondictionary_chars(s: &str) -> String {
    let mut kept = String::new();
    for ch in s.chars() {
        if ch.is_ascii_alphanumeric() || ch.is_ascii_whitespace() {
            kept.push(ch);
        }
    }
    kept
}
|
||||
|
||||
/// Returns a copy of `s` without its non-printing characters.
///
/// Every ASCII character that is not a control character (escape, newline,
/// tab, ...) is kept; control characters and all non-ASCII characters are
/// dropped.
fn remove_nonprinting_chars(s: &str) -> String {
    let mut kept = String::new();
    for ch in s.chars() {
        if !ch.is_ascii() || ch.is_ascii_control() {
            continue;
        }
        kept.push(ch);
    }
    kept
}
|
||||
|
||||
fn print_sorted<T: Iterator<Item = Line>>(iter: T, settings: &GlobalSettings) {
|
||||
let mut file: Box<dyn Write> = match settings.outfile {
|
||||
Some(ref filename) => match File::create(Path::new(&filename)) {
|
||||
|
@ -1598,14 +1539,6 @@ mod tests {
|
|||
assert_eq!(Ordering::Equal, random_shuffle(a, b, c));
|
||||
}
|
||||
|
||||
// The strings share the prefix "your " and first differ at 'o' vs 'p',
// so the first one must sort before the second.
#[test]
fn test_default_compare() {
    let (a, b) = ("your own", "your place");

    assert_eq!(Ordering::Less, default_compare(a, b));
}
|
||||
|
||||
#[test]
|
||||
fn test_month_compare() {
|
||||
let a = "JaN";
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue