From 83f96ec29d6e063f0777bbc26f45d6ccf4eb5f1a Mon Sep 17 00:00:00 2001
From: Jeffrey Finkelstein <jeffrey.finkelstein@protonmail.com>
Date: Mon, 24 Jan 2022 21:18:59 -0500
Subject: [PATCH] tail: don't error when following non-UTF-8 data

Fix a bug where `tail -f` would terminate with an error due to failing
to parse a UTF-8 string from a sequence of bytes read from the
followed file. This commit replaces the call to `BufRead::read_line()`
with a call to `BufRead::read_until()` so that any sequence of bytes
regardless of encoding can be read.

Fixes #1050.
---
 src/uu/tail/src/tail.rs    | 10 +++++++---
 tests/by-util/test_tail.rs | 28 ++++++++++++++++++++++++++++
 tests/common/util.rs       |  9 ++++++++-
 3 files changed, 43 insertions(+), 4 deletions(-)

diff --git a/src/uu/tail/src/tail.rs b/src/uu/tail/src/tail.rs
index cf1f5c3c5..67b1741e6 100644
--- a/src/uu/tail/src/tail.rs
+++ b/src/uu/tail/src/tail.rs
@@ -345,6 +345,7 @@ pub fn uu_app<'a>() -> App<'a> {
         )
 }
 
+/// Continually check for new data in the given readers, writing any to stdout.
 fn follow<T: BufRead>(readers: &mut [(T, &String)], settings: &Settings) -> UResult<()> {
     if readers.is_empty() || !settings.follow {
         return Ok(());
@@ -353,6 +354,7 @@ fn follow<T: BufRead>(readers: &mut [(T, &String)], settings: &Settings) -> URes
     let mut last = readers.len() - 1;
     let mut read_some = false;
     let mut process = platform::ProcessChecker::new(settings.pid);
+    let mut stdout = stdout();
 
     loop {
         sleep(Duration::new(0, settings.sleep_msec * 1000));
@@ -363,8 +365,8 @@ fn follow<T: BufRead>(readers: &mut [(T, &String)], settings: &Settings) -> URes
         for (i, (reader, filename)) in readers.iter_mut().enumerate() {
             // Print all new content since the last pass
             loop {
-                let mut datum = String::new();
-                match reader.read_line(&mut datum) {
+                let mut datum = vec![];
+                match reader.read_until(b'\n', &mut datum) {
                     Ok(0) => break,
                     Ok(_) => {
                         read_some = true;
@@ -372,7 +374,9 @@ fn follow<T: BufRead>(readers: &mut [(T, &String)], settings: &Settings) -> URes
                             println!("\n==> {} <==", filename);
                             last = i;
                         }
-                        print!("{}", datum);
+                        stdout
+                            .write_all(&datum)
+                            .map_err_context(|| String::from("write error"))?;
                     }
                     Err(err) => return Err(USimpleError::new(1, err.to_string())),
                 }
diff --git a/tests/by-util/test_tail.rs b/tests/by-util/test_tail.rs
index f4d932e79..40a229d3a 100644
--- a/tests/by-util/test_tail.rs
+++ b/tests/by-util/test_tail.rs
@@ -77,6 +77,34 @@ fn test_follow() {
     child.kill().unwrap();
 }
 
+/// Test for following when bytes are written that are not valid UTF-8.
+#[test]
+fn test_follow_non_utf8_bytes() {
+    // Tail the test file and start following it.
+    let (at, mut ucmd) = at_and_ucmd!();
+    let mut child = ucmd.arg("-f").arg(FOOBAR_TXT).run_no_wait();
+    let expected = at.read("foobar_single_default.expected");
+    assert_eq!(read_size(&mut child, expected.len()), expected);
+
+    // Now append some bytes that are not valid UTF-8.
+    //
+    // The binary integer "10000000" is *not* a valid UTF-8 encoding
+    // of a character: https://en.wikipedia.org/wiki/UTF-8#Encoding
+    //
+    // We also write the newline character because our implementation
+    // of `tail` is attempting to read a line of input, so the
+    // presence of a newline character will force the `follow()`
+    // function to conclude reading input bytes and start writing them
+    // to output. The newline character is not fundamental to this
+    // test, it is just a requirement of the current implementation.
+    let expected = [0b10000000, b'\n'];
+    at.append_bytes(FOOBAR_TXT, &expected);
+    let actual = read_size_bytes(&mut child, expected.len());
+    assert_eq!(actual, expected.to_vec());
+
+    child.kill().unwrap();
+}
+
 #[test]
 fn test_follow_multiple() {
     let (at, mut ucmd) = at_and_ucmd!();
diff --git a/tests/common/util.rs b/tests/common/util.rs
index 5df0b753f..79fe4d5d7 100644
--- a/tests/common/util.rs
+++ b/tests/common/util.rs
@@ -1117,6 +1117,13 @@ impl UCommand {
 /// Wrapper for `child.stdout.read_exact()`.
 /// Careful, this blocks indefinitely if `size` bytes is never reached.
 pub fn read_size(child: &mut Child, size: usize) -> String {
+    String::from_utf8(read_size_bytes(child, size)).unwrap()
+}
+
+/// Read the specified number of bytes from the stdout of the child process.
+///
+/// Careful, this blocks indefinitely if `size` bytes is never reached.
+pub fn read_size_bytes(child: &mut Child, size: usize) -> Vec<u8> {
     let mut output = Vec::new();
     output.resize(size, 0);
     sleep(Duration::from_secs(1));
@@ -1126,7 +1133,7 @@ pub fn read_size(child: &mut Child, size: usize) -> String {
         .unwrap()
         .read_exact(output.as_mut_slice())
         .unwrap();
-    String::from_utf8(output).unwrap()
+    output
 }
 
 pub fn vec_of_size(n: usize) -> Vec<u8> {