From 0e689e78aa5395c278b349716c982c41f9b5b3d2 Mon Sep 17 00:00:00 2001
From: Jeffrey Finkelstein <jeffrey.finkelstein@protonmail.com>
Date: Sun, 22 Aug 2021 16:21:08 -0400
Subject: [PATCH] tac: support multi-char separator with overlap

Fix a bug in `tac` where multi-character line separators would cause
incorrect behavior when there was overlap between candidate matches in
the input string. This commit adds a dependency on `memchr` in order to
use the `memchr::memmem::rfind_iter()` function to scan for
non-overlapping instances of the specified line separator characters,
scanning from right to left.

Fixes #2580.
---
 Cargo.lock                |  1 +
 src/uu/tac/Cargo.toml     |  1 +
 src/uu/tac/src/tac.rs     | 53 +++++++++++++++++-----------
 tests/by-util/test_tac.rs | 74 ++++++++++++++++++++++++++++++++++++++-
 4 files changed, 107 insertions(+), 22 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 2a5b46153..29be47dfb 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2937,6 +2937,7 @@ name = "uu_tac"
 version = "0.0.7"
 dependencies = [
  "clap",
+ "memchr 2.4.0",
  "uucore",
  "uucore_procs",
 ]
diff --git a/src/uu/tac/Cargo.toml b/src/uu/tac/Cargo.toml
index 60e5d29ec..3ba1497a0 100644
--- a/src/uu/tac/Cargo.toml
+++ b/src/uu/tac/Cargo.toml
@@ -15,6 +15,7 @@ edition = "2018"
 path = "src/tac.rs"
 
 [dependencies]
+memchr = "2"
 clap = { version = "2.33", features = ["wrap_help"] }
 uucore = { version=">=0.0.9", package="uucore", path="../../uucore" }
 uucore_procs = { version=">=0.0.6", package="uucore_procs", path="../../uucore_procs" }
diff --git a/src/uu/tac/src/tac.rs b/src/uu/tac/src/tac.rs
index 4e1b38687..0568f1601 100644
--- a/src/uu/tac/src/tac.rs
+++ b/src/uu/tac/src/tac.rs
@@ -5,12 +5,13 @@
 //  * For the full copyright and license information, please view the LICENSE
 //  * file that was distributed with this source code.
 
-// spell-checker:ignore (ToDO) sbytes slen dlen
+// spell-checker:ignore (ToDO) sbytes slen dlen memmem
 
 #[macro_use]
 extern crate uucore;
 
 use clap::{crate_version, App, Arg};
+use memchr::memmem;
 use std::io::{stdin, stdout, BufReader, Read, Write};
 use std::{fs::File, path::Path};
 use uucore::InvalidEncodingHandling;
@@ -80,20 +81,33 @@ pub fn uu_app() -> App<'static, 'static> {
         .arg(Arg::with_name(options::FILE).hidden(true).multiple(true))
 }
 
+/// Write lines from `data` to stdout in reverse.
+///
+/// This function writes to [`stdout`] each line appearing in `data`,
+/// starting with the last line and ending with the first line. The
+/// `separator` parameter defines what characters to use as a line
+/// separator.
+///
+/// If `before` is `false`, then this function assumes that the
+/// `separator` appears at the end of each line, as in `"abc\ndef\n"`.
+/// If `before` is `true`, then this function assumes that the
+/// `separator` appears at the beginning of each line, as in
+/// `"/abc/def"`.
 fn buffer_tac(data: &[u8], before: bool, separator: &str) -> std::io::Result<()> {
     let mut out = stdout();
 
-    // Convert the line separator to a byte sequence.
-    let sbytes = separator.as_bytes();
-    let slen = sbytes.len();
+    // The number of bytes in the line separator.
+    let slen = separator.as_bytes().len();
 
-    // If there are more characters in the separator than in the data,
-    // we can't possibly split the data on the separator. Write the
-    // entire buffer to stdout.
-    let dlen = data.len();
-    if dlen < slen {
-        return out.write_all(data);
-    }
+    // The index of the start of the next line in the `data`.
+    //
+    // As we scan through the `data` from right to left, we update this
+    // variable each time we find a new line.
+    //
+    // If `before` is `true`, then each line starts immediately before
+    // the line separator. Otherwise, each line starts immediately after
+    // the line separator.
+    let mut following_line_start = data.len();
 
     // Iterate over each byte in the buffer in reverse. When we find a
     // line separator, write the line to stdout.
@@ -101,16 +115,13 @@ fn buffer_tac(data: &[u8], before: bool, separator: &str) -> std::io::Result<()>
     // The `before` flag controls whether the line separator appears at
     // the end of the line (as in "abc\ndef\n") or at the beginning of
     // the line (as in "/abc/def").
-    let mut following_line_start = data.len();
-    for i in (0..dlen - slen + 1).rev() {
-        if &data[i..i + slen] == sbytes {
-            if before {
-                out.write_all(&data[i..following_line_start])?;
-                following_line_start = i;
-            } else {
-                out.write_all(&data[i + slen..following_line_start])?;
-                following_line_start = i + slen;
-            }
+    for i in memmem::rfind_iter(data, separator) {
+        if before {
+            out.write_all(&data[i..following_line_start])?;
+            following_line_start = i;
+        } else {
+            out.write_all(&data[i + slen..following_line_start])?;
+            following_line_start = i + slen;
         }
     }
 
diff --git a/tests/by-util/test_tac.rs b/tests/by-util/test_tac.rs
index c6e32fef0..202f76d66 100644
--- a/tests/by-util/test_tac.rs
+++ b/tests/by-util/test_tac.rs
@@ -1,4 +1,4 @@
-// spell-checker:ignore axxbxx bxxaxx
+// spell-checker:ignore axxbxx bxxaxx axxx axxxx xxaxx xxax xxxxa
 use crate::common::util::*;
 
 #[test]
@@ -125,6 +125,78 @@ fn test_multi_char_separator() {
         .stdout_is("bxxaxx");
 }
 
+#[test]
+fn test_multi_char_separator_overlap() {
+    // The right-most pair of "x" characters in the input is treated as
+    // the only line separator. That is, "axxx" is interpreted as having
+    // one line comprising the string "ax" followed by the line
+    // separator "xx".
+    new_ucmd!()
+        .args(&["-s", "xx"])
+        .pipe_in("axxx")
+        .succeeds()
+        .stdout_is("axxx");
+
+    // Each non-overlapping pair of "x" characters in the input is
+    // treated as a line separator. That is, "axxxx" is interpreted as
+    // having two lines:
+    //
+    // * the second line is the empty string "" followed by the line
+    //   separator "xx",
+    // * the first line is the string "a" followed by the line separator
+    //   "xx".
+    //
+    // The lines are printed in reverse, resulting in "xx" followed by
+    // "axx".
+    new_ucmd!()
+        .args(&["-s", "xx"])
+        .pipe_in("axxxx")
+        .succeeds()
+        .stdout_is("xxaxx");
+}
+
+#[test]
+fn test_multi_char_separator_overlap_before() {
+    // With the "-b" option, the line separator is assumed to be at the
+    // beginning of the line. In this case, That is, "axxx" is
+    // interpreted as having two lines:
+    //
+    // * the second line is the empty string "" preceded by the line
+    //   separator "xx",
+    // * the first line is the string "ax" preceded by no line
+    //   separator, since there are no more characters preceding it.
+    //
+    // The lines are printed in reverse, resulting in "xx" followed by
+    // "ax".
+    new_ucmd!()
+        .args(&["-b", "-s", "xx"])
+        .pipe_in("axxx")
+        .succeeds()
+        .stdout_is("xxax");
+
+    // With the "-b" option, the line separator is assumed to be at the
+    // beginning of the line. Each non-overlapping pair of "x"
+    // characters in the input is treated as a line separator. That is,
+    // "axxxx" is interpreted as having three lines:
+    //
+    // * the third line is the empty string "" preceded by the line
+    //   separator "xx" (the last two "x" characters in the input
+    //   string),
+    // * the second line is the empty string "" preceded by the line
+    //   separator "xx" (the first two "x" characters in the input
+    //   string),
+    // * the first line is the string "a" preceded by no line separator,
+    //   since there are no more characters preceding it.
+    //
+    // The lines are printed in reverse, resulting in "xx" followed by
+    // "xx" followed by "a".
+    new_ucmd!()
+        .args(&["-b", "-s", "xx"])
+        .pipe_in("axxxx")
+        .succeeds()
+        .stdout_is("xxxxa");
+}
+
 #[test]
 fn test_null_separator() {
     new_ucmd!()