1
Fork 0
mirror of https://github.com/RGBCube/uutils-coreutils synced 2025-07-28 11:37:44 +00:00

join: avoid extra allocations when using -i

This commit is contained in:
Justin Tracey 2024-09-25 01:40:44 -04:00 committed by Ben Wiederhake
parent 4f79c0b69f
commit 7c3a9380f1
2 changed files with 38 additions and 4 deletions

View file

@ -55,7 +55,7 @@ The following options can have a non-trivial impact on performance:
- `-a`/`-v` if one of the two files has significantly more lines than the other
- `-j`/`-1`/`-2` cause work to be done to grab the appropriate field
- `-i` adds a call to `to_ascii_lowercase()` that adds some time for allocating and dropping memory for the lowercase key
- `-i` uses our custom code for case-insensitive text comparisons
- `--nocheck-order` causes some calls of `Input::compare` to be skipped
The content of the files being joined has a very significant impact on the performance.

View file

@ -288,6 +288,40 @@ impl<'a, Sep: Separator> Repr<'a, Sep> {
}
}
/// Borrowed byte slice that compares and orders without regard to ASCII case.
///
/// Bytes outside the ASCII letter range are compared by their raw value, so
/// only `A-Z` / `a-z` are folded together. No allocation is performed: the
/// comparison walks both slices in place.
#[derive(Eq)]
struct CaseInsensitiveSlice<'a> {
    v: &'a [u8],
}

impl Ord for CaseInsensitiveSlice<'_> {
    /// Lexicographic ordering over the ASCII-lowercased bytes of both slices.
    fn cmp(&self, other: &Self) -> Ordering {
        // Fold each byte pair to lowercase and stop at the first pair that
        // still differs; that pair alone decides the ordering.
        let first_mismatch = self
            .v
            .iter()
            .zip(other.v.iter())
            .map(|(lhs, rhs)| (lhs.to_ascii_lowercase(), rhs.to_ascii_lowercase()))
            .find(|(lhs, rhs)| lhs != rhs);

        match first_mismatch {
            Some((lhs, rhs)) => lhs.cmp(&rhs),
            // The common prefix matched completely, so the shorter slice
            // (if any) sorts first and equal lengths mean equal slices.
            None => self.v.len().cmp(&other.v.len()),
        }
    }
}

impl PartialOrd for CaseInsensitiveSlice<'_> {
    /// Always `Some`, delegating to the total order defined by [`Ord::cmp`].
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(Ord::cmp(self, other))
    }
}

impl PartialEq for CaseInsensitiveSlice<'_> {
    /// Two slices are equal when they have the same length and every byte
    /// pair matches ignoring ASCII case.
    fn eq(&self, other: &Self) -> bool {
        self.v.len() == other.v.len()
            && self
                .v
                .iter()
                .zip(other.v.iter())
                .all(|(lhs, rhs)| lhs.eq_ignore_ascii_case(rhs))
    }
}
/// Input processing parameters.
struct Input<Sep: Separator> {
separator: Sep,
@ -307,9 +341,9 @@ impl<Sep: Separator> Input<Sep> {
fn compare(&self, field1: Option<&[u8]>, field2: Option<&[u8]>) -> Ordering {
if let (Some(field1), Some(field2)) = (field1, field2) {
if self.ignore_case {
field1
.to_ascii_lowercase()
.cmp(&field2.to_ascii_lowercase())
let field1 = CaseInsensitiveSlice { v: field1 };
let field2 = CaseInsensitiveSlice { v: field2 };
field1.cmp(&field2)
} else {
field1.cmp(field2)
}