mirror of
https://github.com/RGBCube/uutils-coreutils
synced 2025-07-28 11:37:44 +00:00
uniq: fix multibyte input
Should fix tests/uniq/uniq.pl
This commit is contained in:
parent
805754b4f8
commit
1e23a3fa8d
2 changed files with 44 additions and 33 deletions
|
@ -154,43 +154,44 @@ impl Uniq {
|
|||
|
||||
fn cmp_key<F>(&self, line: &[u8], mut closure: F) -> bool
|
||||
where
|
||||
F: FnMut(&mut dyn Iterator<Item = u8>) -> bool,
|
||||
F: FnMut(&mut dyn Iterator<Item = char>) -> bool,
|
||||
{
|
||||
let fields_to_check = self.skip_fields(line);
|
||||
let len = fields_to_check.len();
|
||||
let slice_start = self.slice_start.unwrap_or(0);
|
||||
let slice_stop = self.slice_stop.unwrap_or(len);
|
||||
if len > 0 {
|
||||
// fast path: avoid doing any work if there is no need to skip or map to lower-case
|
||||
if !self.ignore_case && slice_start == 0 && slice_stop == len {
|
||||
return closure(&mut fields_to_check.iter().copied());
|
||||
}
|
||||
|
||||
// fast path: avoid skipping
|
||||
if self.ignore_case && slice_start == 0 && slice_stop == len {
|
||||
return closure(&mut fields_to_check.iter().map(|u| u.to_ascii_lowercase()));
|
||||
}
|
||||
|
||||
// fast path: we can avoid mapping chars to lower-case, if we don't want to ignore the case
|
||||
if !self.ignore_case {
|
||||
return closure(
|
||||
&mut fields_to_check
|
||||
.iter()
|
||||
.skip(slice_start)
|
||||
.take(slice_stop)
|
||||
.copied(),
|
||||
);
|
||||
}
|
||||
|
||||
closure(
|
||||
&mut fields_to_check
|
||||
.iter()
|
||||
.skip(slice_start)
|
||||
.take(slice_stop)
|
||||
.map(|u| u.to_ascii_lowercase()),
|
||||
)
|
||||
// Skip self.slice_start bytes (if -s was used).
|
||||
// self.slice_start is how many characters to skip, but historically
|
||||
// uniq’s `-s N` means “skip N *bytes*,” so do that literally:
|
||||
let skip_bytes = self.slice_start.unwrap_or(0);
|
||||
let fields_to_check = if skip_bytes < fields_to_check.len() {
|
||||
&fields_to_check[skip_bytes..]
|
||||
} else {
|
||||
closure(&mut fields_to_check.iter().copied())
|
||||
// If skipping beyond end-of-line, leftover is empty => effectively ""
|
||||
&[]
|
||||
};
|
||||
|
||||
// Convert the leftover bytes to UTF-8 for character-based -w
|
||||
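// If invalid UTF-8, fall back to comparing the raw bytes one-to-one as chars. NOTE(review): that fallback returns early, so neither the `-w` limit nor `-i` case folding is applied to invalid UTF-8 input.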
|
||||
let string_after_skip = match std::str::from_utf8(fields_to_check) {
|
||||
Ok(s) => s,
|
||||
Err(_) => {
|
||||
// Fallback: if invalid UTF-8, treat them as single-byte “chars”
|
||||
return closure(&mut fields_to_check.iter().map(|&b| b as char));
|
||||
}
|
||||
};
|
||||
|
||||
let total_chars = string_after_skip.chars().count();
|
||||
|
||||
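// `-w N` => compare no more than N characters. NOTE(review): the count, clamped to the number of chars available, is stored in the misleadingly named `slice_start` below and is what `take` receives.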
|
||||
let slice_stop = self.slice_stop.unwrap_or(total_chars);
|
||||
let slice_start = slice_stop.min(total_chars);
|
||||
|
||||
let mut iter = string_after_skip.chars().take(slice_start);
|
||||
|
||||
if self.ignore_case {
|
||||
// Case folding is ASCII-only (kept for minimal changes): non-ASCII letters such as 'À' and 'à' are left untouched and therefore still compare as different under -i.
|
||||
closure(&mut iter.map(|c| c.to_ascii_lowercase()))
|
||||
} else {
|
||||
closure(&mut iter)
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -1172,3 +1172,13 @@ fn gnu_tests() {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Regression test for `-w1` with multibyte input: only the first *character*
// of each line is compared, not the first byte.
#[test]
|
||||
fn test_stdin_w1_multibyte() {
|
||||
// 'à' (U+00E0) and 'á' (U+00E1) are different characters, but their UTF-8
// encodings (C3 A0 / C3 A1) share the same first byte. A byte-based `-w1`
// would see the two lines as duplicates and drop the second one.
let input = "à\ná\n";
|
||||
new_ucmd!()
|
||||
.args(&["-w1"])
|
||||
.pipe_in(input)
|
||||
.run()
|
||||
// Both lines must survive, since their first characters differ.
.stdout_is("à\ná\n");
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue