mirror of
https://github.com/RGBCube/uutils-coreutils
synced 2025-07-28 11:37:44 +00:00
uniq: fix multibyte input
Should fix tests/uniq/uniq.pl
This commit is contained in:
parent
805754b4f8
commit
1e23a3fa8d
2 changed files with 44 additions and 33 deletions
|
@ -154,43 +154,44 @@ impl Uniq {
|
||||||
|
|
||||||
fn cmp_key<F>(&self, line: &[u8], mut closure: F) -> bool
|
fn cmp_key<F>(&self, line: &[u8], mut closure: F) -> bool
|
||||||
where
|
where
|
||||||
F: FnMut(&mut dyn Iterator<Item = u8>) -> bool,
|
F: FnMut(&mut dyn Iterator<Item = char>) -> bool,
|
||||||
{
|
{
|
||||||
let fields_to_check = self.skip_fields(line);
|
let fields_to_check = self.skip_fields(line);
|
||||||
let len = fields_to_check.len();
|
|
||||||
let slice_start = self.slice_start.unwrap_or(0);
|
|
||||||
let slice_stop = self.slice_stop.unwrap_or(len);
|
|
||||||
if len > 0 {
|
|
||||||
// fast path: avoid doing any work if there is no need to skip or map to lower-case
|
|
||||||
if !self.ignore_case && slice_start == 0 && slice_stop == len {
|
|
||||||
return closure(&mut fields_to_check.iter().copied());
|
|
||||||
}
|
|
||||||
|
|
||||||
// fast path: avoid skipping
|
// Skip self.slice_start bytes (if -s was used).
|
||||||
if self.ignore_case && slice_start == 0 && slice_stop == len {
|
// self.slice_start is how many characters to skip, but historically
|
||||||
return closure(&mut fields_to_check.iter().map(|u| u.to_ascii_lowercase()));
|
// uniq’s `-s N` means “skip N *bytes*,” so do that literally:
|
||||||
}
|
let skip_bytes = self.slice_start.unwrap_or(0);
|
||||||
|
let fields_to_check = if skip_bytes < fields_to_check.len() {
|
||||||
// fast path: we can avoid mapping chars to lower-case, if we don't want to ignore the case
|
&fields_to_check[skip_bytes..]
|
||||||
if !self.ignore_case {
|
|
||||||
return closure(
|
|
||||||
&mut fields_to_check
|
|
||||||
.iter()
|
|
||||||
.skip(slice_start)
|
|
||||||
.take(slice_stop)
|
|
||||||
.copied(),
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
closure(
|
|
||||||
&mut fields_to_check
|
|
||||||
.iter()
|
|
||||||
.skip(slice_start)
|
|
||||||
.take(slice_stop)
|
|
||||||
.map(|u| u.to_ascii_lowercase()),
|
|
||||||
)
|
|
||||||
} else {
|
} else {
|
||||||
closure(&mut fields_to_check.iter().copied())
|
// If skipping beyond end-of-line, leftover is empty => effectively ""
|
||||||
|
&[]
|
||||||
|
};
|
||||||
|
|
||||||
|
// Convert the leftover bytes to UTF-8 for character-based -w
|
||||||
|
// If invalid UTF-8, just compare them as individual bytes (fallback).
|
||||||
|
let string_after_skip = match std::str::from_utf8(fields_to_check) {
|
||||||
|
Ok(s) => s,
|
||||||
|
Err(_) => {
|
||||||
|
// Fallback: if invalid UTF-8, treat them as single-byte “chars”
|
||||||
|
return closure(&mut fields_to_check.iter().map(|&b| b as char));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
let total_chars = string_after_skip.chars().count();
|
||||||
|
|
||||||
|
// `-w N` => Compare no more than N characters
|
||||||
|
let slice_stop = self.slice_stop.unwrap_or(total_chars);
|
||||||
|
let slice_start = slice_stop.min(total_chars);
|
||||||
|
|
||||||
|
let mut iter = string_after_skip.chars().take(slice_start);
|
||||||
|
|
||||||
|
if self.ignore_case {
|
||||||
|
// We can do ASCII-lowercase or full Unicode-lowercase. For minimal changes, do ASCII:
|
||||||
|
closure(&mut iter.map(|c| c.to_ascii_lowercase()))
|
||||||
|
} else {
|
||||||
|
closure(&mut iter)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1172,3 +1172,13 @@ fn gnu_tests() {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// With `-w1` only the first character of each line is compared.
/// "à" and "á" start with the same UTF-8 byte, so a byte-wise comparison
/// would treat the two lines as duplicates; both lines must be kept.
#[test]
fn test_stdin_w1_multibyte() {
    new_ucmd!()
        .args(&["-w1"])
        .pipe_in("à\ná\n")
        .run()
        .stdout_is("à\ná\n");
}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue