From ce4342d12e630c08c5a0db7da05a1b84f862882f Mon Sep 17 00:00:00 2001 From: Chirag Jadwani Date: Mon, 15 Mar 2021 14:08:14 +0530 Subject: [PATCH 1/2] uniq: Fix panic on invalid utf-8 input --- src/uu/uniq/src/uniq.rs | 10 +++++++--- tests/by-util/test_uniq.rs | 9 +++++++++ tests/fixtures/uniq/not-utf8-sequence.txt | 2 ++ 3 files changed, 18 insertions(+), 3 deletions(-) create mode 100644 tests/fixtures/uniq/not-utf8-sequence.txt diff --git a/src/uu/uniq/src/uniq.rs b/src/uu/uniq/src/uniq.rs index afbda2829..98260bb8f 100644 --- a/src/uu/uniq/src/uniq.rs +++ b/src/uu/uniq/src/uniq.rs @@ -10,7 +10,7 @@ extern crate uucore; use clap::{App, Arg, ArgMatches}; use std::fs::File; -use std::io::{stdin, stdout, BufRead, BufReader, BufWriter, Read, Write}; +use std::io::{stdin, stdout, BufRead, BufReader, BufWriter, Read, Result, Write}; use std::path::Path; use std::str::FromStr; @@ -61,8 +61,7 @@ impl Uniq { let delimiters = &self.delimiters; let line_terminator = self.get_line_terminator(); - for io_line in reader.split(line_terminator) { - let line = String::from_utf8(crash_if_err!(1, io_line)).unwrap(); + for line in reader.split(line_terminator).map(get_line_string) { if !lines.is_empty() && self.cmp_keys(&lines[0], &line) { let print_delimiter = delimiters == &Delimiters::Prepend || (delimiters == &Delimiters::Separate && first_line_printed); @@ -199,6 +198,11 @@ impl Uniq { } } +fn get_line_string(io_line: Result>) -> String { + let line_bytes = crash_if_err!(1, io_line); + crash_if_err!(1, String::from_utf8(line_bytes)) +} + fn opt_parsed(opt_name: &str, matches: &ArgMatches) -> Option { matches.value_of(opt_name).map(|arg_str| { let opt_val: Option = arg_str.parse().ok(); diff --git a/tests/by-util/test_uniq.rs b/tests/by-util/test_uniq.rs index 53d57b49c..22e67540e 100644 --- a/tests/by-util/test_uniq.rs +++ b/tests/by-util/test_uniq.rs @@ -138,3 +138,12 @@ fn test_stdin_zero_terminated() { .run() .stdout_is_fixture("sorted-zero-terminated.expected"); } + +#[test] +fn test_invalid_utf8() { + new_ucmd!() + .arg("not-utf8-sequence.txt") + .run() + .failure() + .stderr_only("uniq: error: invalid utf-8 sequence of 1 bytes from index 0"); +} diff --git a/tests/fixtures/uniq/not-utf8-sequence.txt b/tests/fixtures/uniq/not-utf8-sequence.txt new file mode 100644 index 000000000..78c146ffd --- /dev/null +++ b/tests/fixtures/uniq/not-utf8-sequence.txt @@ -0,0 +1,2 @@ +Next line contains two bytes - 0xCC and 0xCD - which are not a valid utf-8 sequence +ÌÍ \ No newline at end of file From 116e253cc0391e18e145170dcff222f07d2b00a2 Mon Sep 17 00:00:00 2001 From: Chirag Jadwani Date: Mon, 15 Mar 2021 18:11:33 +0530 Subject: [PATCH 2/2] uniq: Fix skip fields Current implementation of the skip fields logic does not handle multibyte code points correctly. It assumes each code point (`char`) is one byte. If the skipped part of the input line has any multibyte code points then this can cause fields not being skipped correctly (field start index is calculated to be before it actually starts). --- src/uu/uniq/src/uniq.rs | 25 ++++++++++------------ tests/fixtures/uniq/skip-2-fields.expected | 2 +- tests/fixtures/uniq/skip-fields.txt | 2 +- 3 files changed, 13 insertions(+), 16 deletions(-) diff --git a/src/uu/uniq/src/uniq.rs b/src/uu/uniq/src/uniq.rs index 98260bb8f..a1809f0f0 100644 --- a/src/uu/uniq/src/uniq.rs +++ b/src/uu/uniq/src/uniq.rs @@ -79,22 +79,19 @@ impl Uniq { fn skip_fields<'a>(&self, line: &'a str) -> &'a str { if let Some(skip_fields) = self.skip_fields { - if line.split_whitespace().count() > skip_fields { - let mut field = 0; - let mut i = 0; - while field < skip_fields && i < line.len() { - while i < line.len() && line.chars().nth(i).unwrap().is_whitespace() { - i += 1; - } - while i < line.len() && !line.chars().nth(i).unwrap().is_whitespace() { - i += 1; - } - field += 1; + let mut i = 0; + let mut char_indices = line.char_indices(); + for _ in 0..skip_fields { + if char_indices.find(|(_, c)| !c.is_whitespace()) == None { + return ""; + } + match char_indices.find(|(_, c)| c.is_whitespace()) { + None => return "", + + Some((next_field_i, _)) => i = next_field_i, } - &line[i..] - } else { - "" } + &line[i..] } else { line } diff --git a/tests/fixtures/uniq/skip-2-fields.expected b/tests/fixtures/uniq/skip-2-fields.expected index 8cffc2ebc..b971c2b2b 100644 --- a/tests/fixtures/uniq/skip-2-fields.expected +++ b/tests/fixtures/uniq/skip-2-fields.expected @@ -1,2 +1,2 @@ - aaa aa a + aaa ⟪⟫ a aa a diff --git a/tests/fixtures/uniq/skip-fields.txt b/tests/fixtures/uniq/skip-fields.txt index f84e377a0..4ec2744c6 100644 --- a/tests/fixtures/uniq/skip-fields.txt +++ b/tests/fixtures/uniq/skip-fields.txt @@ -1,4 +1,4 @@ - aaa aa a + aaa ⟪⟫ a ZZZ aa a ZZZ aa a ZZZ bb a