diff --git a/src/uu/uniq/src/uniq.rs b/src/uu/uniq/src/uniq.rs index afbda2829..a1809f0f0 100644 --- a/src/uu/uniq/src/uniq.rs +++ b/src/uu/uniq/src/uniq.rs @@ -10,7 +10,7 @@ extern crate uucore; use clap::{App, Arg, ArgMatches}; use std::fs::File; -use std::io::{stdin, stdout, BufRead, BufReader, BufWriter, Read, Write}; +use std::io::{stdin, stdout, BufRead, BufReader, BufWriter, Read, Result, Write}; use std::path::Path; use std::str::FromStr; @@ -61,8 +61,7 @@ impl Uniq { let delimiters = &self.delimiters; let line_terminator = self.get_line_terminator(); - for io_line in reader.split(line_terminator) { - let line = String::from_utf8(crash_if_err!(1, io_line)).unwrap(); + for line in reader.split(line_terminator).map(get_line_string) { if !lines.is_empty() && self.cmp_keys(&lines[0], &line) { let print_delimiter = delimiters == &Delimiters::Prepend || (delimiters == &Delimiters::Separate && first_line_printed); @@ -80,22 +79,19 @@ impl Uniq { fn skip_fields<'a>(&self, line: &'a str) -> &'a str { if let Some(skip_fields) = self.skip_fields { - if line.split_whitespace().count() > skip_fields { - let mut field = 0; - let mut i = 0; - while field < skip_fields && i < line.len() { - while i < line.len() && line.chars().nth(i).unwrap().is_whitespace() { - i += 1; - } - while i < line.len() && !line.chars().nth(i).unwrap().is_whitespace() { - i += 1; - } - field += 1; + let mut i = 0; + let mut char_indices = line.char_indices(); + for _ in 0..skip_fields { + if char_indices.find(|(_, c)| !c.is_whitespace()) == None { + return ""; + } + match char_indices.find(|(_, c)| c.is_whitespace()) { + None => return "", + + Some((next_field_i, _)) => i = next_field_i, } - &line[i..] - } else { - "" } + &line[i..] } else { line } @@ -199,6 +195,11 @@ impl Uniq { } } +fn get_line_string(io_line: Result>) -> String { + let line_bytes = crash_if_err!(1, io_line); + crash_if_err!(1, String::from_utf8(line_bytes)) +} + fn opt_parsed(opt_name: &str, matches: &ArgMatches) -> Option { matches.value_of(opt_name).map(|arg_str| { let opt_val: Option = arg_str.parse().ok(); diff --git a/tests/by-util/test_uniq.rs b/tests/by-util/test_uniq.rs index 53d57b49c..22e67540e 100644 --- a/tests/by-util/test_uniq.rs +++ b/tests/by-util/test_uniq.rs @@ -138,3 +138,12 @@ fn test_stdin_zero_terminated() { .run() .stdout_is_fixture("sorted-zero-terminated.expected"); } + +#[test] +fn test_invalid_utf8() { + new_ucmd!() + .arg("not-utf8-sequence.txt") + .run() + .failure() + .stderr_only("uniq: error: invalid utf-8 sequence of 1 bytes from index 0"); +} diff --git a/tests/fixtures/uniq/not-utf8-sequence.txt b/tests/fixtures/uniq/not-utf8-sequence.txt new file mode 100644 index 000000000..78c146ffd --- /dev/null +++ b/tests/fixtures/uniq/not-utf8-sequence.txt @@ -0,0 +1,2 @@ +Next line contains two bytes - 0xCC and 0xCD - which are not a valid utf-8 sequence +ÌÍ \ No newline at end of file diff --git a/tests/fixtures/uniq/skip-2-fields.expected b/tests/fixtures/uniq/skip-2-fields.expected index 8cffc2ebc..b971c2b2b 100644 --- a/tests/fixtures/uniq/skip-2-fields.expected +++ b/tests/fixtures/uniq/skip-2-fields.expected @@ -1,2 +1,2 @@ - aaa aa a + aaa ⟪⟫ a aa a diff --git a/tests/fixtures/uniq/skip-fields.txt b/tests/fixtures/uniq/skip-fields.txt index f84e377a0..4ec2744c6 100644 --- a/tests/fixtures/uniq/skip-fields.txt +++ b/tests/fixtures/uniq/skip-fields.txt @@ -1,4 +1,4 @@ - aaa aa a + aaa ⟪⟫ a ZZZ aa a ZZZ aa a ZZZ bb a