From ce4342d12e630c08c5a0db7da05a1b84f862882f Mon Sep 17 00:00:00 2001
From: Chirag Jadwani <chirag.jadwani@gmail.com>
Date: Mon, 15 Mar 2021 14:08:14 +0530
Subject: [PATCH 1/2] uniq: Fix panic on invalid utf-8 input

---
 src/uu/uniq/src/uniq.rs                   | 10 +++++++---
 tests/by-util/test_uniq.rs                |  9 +++++++++
 tests/fixtures/uniq/not-utf8-sequence.txt |  2 ++
 3 files changed, 18 insertions(+), 3 deletions(-)
 create mode 100644 tests/fixtures/uniq/not-utf8-sequence.txt
diff --git a/src/uu/uniq/src/uniq.rs b/src/uu/uniq/src/uniq.rs
index afbda2829..98260bb8f 100644
--- a/src/uu/uniq/src/uniq.rs
+++ b/src/uu/uniq/src/uniq.rs
@@ -10,7 +10,7 @@ extern crate uucore;
 
 use clap::{App, Arg, ArgMatches};
 use std::fs::File;
-use std::io::{stdin, stdout, BufRead, BufReader, BufWriter, Read, Write};
+use std::io::{stdin, stdout, BufRead, BufReader, BufWriter, Read, Result, Write};
 use std::path::Path;
 use std::str::FromStr;
 
@@ -61,8 +61,7 @@ impl Uniq {
         let delimiters = &self.delimiters;
         let line_terminator = self.get_line_terminator();
 
-        for io_line in reader.split(line_terminator) {
-            let line = String::from_utf8(crash_if_err!(1, io_line)).unwrap();
+        for line in reader.split(line_terminator).map(get_line_string) {
             if !lines.is_empty() && self.cmp_keys(&lines[0], &line) {
                 let print_delimiter = delimiters == &Delimiters::Prepend
                     || (delimiters == &Delimiters::Separate && first_line_printed);
@@ -199,6 +198,11 @@ impl Uniq {
     }
 }
 
+fn get_line_string(io_line: Result<Vec<u8>>) -> String {
+    let line_bytes = crash_if_err!(1, io_line);
+    crash_if_err!(1, String::from_utf8(line_bytes))
+}
+
 fn opt_parsed<T: FromStr>(opt_name: &str, matches: &ArgMatches) -> Option<T> {
     matches.value_of(opt_name).map(|arg_str| {
         let opt_val: Option<T> = arg_str.parse().ok();
diff --git a/tests/by-util/test_uniq.rs b/tests/by-util/test_uniq.rs
index 53d57b49c..22e67540e 100644
--- a/tests/by-util/test_uniq.rs
+++ b/tests/by-util/test_uniq.rs
@@ -138,3 +138,12 @@ fn test_stdin_zero_terminated() {
         .run()
         .stdout_is_fixture("sorted-zero-terminated.expected");
 }
+
+#[test]
+fn test_invalid_utf8() {
+    new_ucmd!()
+        .arg("not-utf8-sequence.txt")
+        .run()
+        .failure()
+        .stderr_only("uniq: error: invalid utf-8 sequence of 1 bytes from index 0");
+}
diff --git a/tests/fixtures/uniq/not-utf8-sequence.txt b/tests/fixtures/uniq/not-utf8-sequence.txt
new file mode 100644
index 000000000..78c146ffd
--- /dev/null
+++ b/tests/fixtures/uniq/not-utf8-sequence.txt
@@ -0,0 +1,2 @@
+Next line contains two bytes - 0xCC and 0xCD - which are not a valid utf-8 sequence
+ﾌﾍ
\ No newline at end of file

From 116e253cc0391e18e145170dcff222f07d2b00a2 Mon Sep 17 00:00:00 2001
From: Chirag Jadwani <chirag.jadwani@gmail.com>
Date: Mon, 15 Mar 2021 18:11:33 +0530
Subject: [PATCH 2/2] uniq: Fix skip fields

Current implementation of the skip fields logic does not handle
multibyte code points correctly. It assumes each code point (`char`) is
one byte. If the skipped part of the input line has any multibyte code
points then this can cause fields not being skipped correctly (field
start index is calculated to be before it actually starts).
---
 src/uu/uniq/src/uniq.rs                    | 25 ++++++++++------------
 tests/fixtures/uniq/skip-2-fields.expected |  2 +-
 tests/fixtures/uniq/skip-fields.txt        |  2 +-
 3 files changed, 13 insertions(+), 16 deletions(-)

diff --git a/src/uu/uniq/src/uniq.rs b/src/uu/uniq/src/uniq.rs
index 98260bb8f..a1809f0f0 100644
--- a/src/uu/uniq/src/uniq.rs
+++ b/src/uu/uniq/src/uniq.rs
@@ -79,22 +79,19 @@ impl Uniq {
 
     fn skip_fields<'a>(&self, line: &'a str) -> &'a str {
         if let Some(skip_fields) = self.skip_fields {
-            if line.split_whitespace().count() > skip_fields {
-                let mut field = 0;
-                let mut i = 0;
-                while field < skip_fields && i < line.len() {
-                    while i < line.len() && line.chars().nth(i).unwrap().is_whitespace() {
-                        i += 1;
-                    }
-                    while i < line.len() && !line.chars().nth(i).unwrap().is_whitespace() {
-                        i += 1;
-                    }
-                    field += 1;
+            let mut i = 0;
+            let mut char_indices = line.char_indices();
+            for _ in 0..skip_fields {
+                if char_indices.find(|(_, c)| !c.is_whitespace()) == None {
+                    return "";
+                }
+                match char_indices.find(|(_, c)| c.is_whitespace()) {
+                    None => return "",
+
+                    Some((next_field_i, _)) => i = next_field_i,
                 }
-                &line[i..]
-            } else {
-                ""
             }
+            &line[i..]
         } else {
             line
         }
diff --git a/tests/fixtures/uniq/skip-2-fields.expected b/tests/fixtures/uniq/skip-2-fields.expected
index 8cffc2ebc..b971c2b2b 100644
--- a/tests/fixtures/uniq/skip-2-fields.expected
+++ b/tests/fixtures/uniq/skip-2-fields.expected
@@ -1,2 +1,2 @@
-	  aaa  aa  a
+	  aaa  筺ｪ筺ｫ  a
 aa  a
diff --git a/tests/fixtures/uniq/skip-fields.txt b/tests/fixtures/uniq/skip-fields.txt
index f84e377a0..4ec2744c6 100644
--- a/tests/fixtures/uniq/skip-fields.txt
+++ b/tests/fixtures/uniq/skip-fields.txt
@@ -1,4 +1,4 @@
-	  aaa  aa  a
+	  aaa  筺ｪ筺ｫ  a
 	  ZZZ     aa  a
 ZZZ     aa  a
 	  ZZZ     bb  a