From 0e689e78aa5395c278b349716c982c41f9b5b3d2 Mon Sep 17 00:00:00 2001 From: Jeffrey Finkelstein Date: Sun, 22 Aug 2021 16:21:08 -0400 Subject: [PATCH] tac: support multi-char separator with overlap Fix a bug in `tac` where multi-character line separators would cause incorrect behavior when there was overlap between candidate matches in the input string. This commit adds a dependency on `memchr` in order to use the `memchr::memmem::rfind_iter()` function to scan for non-overlapping instances of the specified line separator characters, scanning from right to left. Fixes #2580. --- Cargo.lock | 1 + src/uu/tac/Cargo.toml | 1 + src/uu/tac/src/tac.rs | 53 +++++++++++++++++----------- tests/by-util/test_tac.rs | 74 ++++++++++++++++++++++++++++++++++++++- 4 files changed, 107 insertions(+), 22 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 2a5b46153..29be47dfb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2937,6 +2937,7 @@ name = "uu_tac" version = "0.0.7" dependencies = [ "clap", + "memchr 2.4.0", "uucore", "uucore_procs", ] diff --git a/src/uu/tac/Cargo.toml b/src/uu/tac/Cargo.toml index 60e5d29ec..3ba1497a0 100644 --- a/src/uu/tac/Cargo.toml +++ b/src/uu/tac/Cargo.toml @@ -15,6 +15,7 @@ edition = "2018" path = "src/tac.rs" [dependencies] +memchr = "2" clap = { version = "2.33", features = ["wrap_help"] } uucore = { version=">=0.0.9", package="uucore", path="../../uucore" } uucore_procs = { version=">=0.0.6", package="uucore_procs", path="../../uucore_procs" } diff --git a/src/uu/tac/src/tac.rs b/src/uu/tac/src/tac.rs index 4e1b38687..0568f1601 100644 --- a/src/uu/tac/src/tac.rs +++ b/src/uu/tac/src/tac.rs @@ -5,12 +5,13 @@ // * For the full copyright and license information, please view the LICENSE // * file that was distributed with this source code. -// spell-checker:ignore (ToDO) sbytes slen dlen +// spell-checker:ignore (ToDO) sbytes slen dlen memmem #[macro_use] extern crate uucore; use clap::{crate_version, App, Arg}; +use memchr::memmem; use std::io::{stdin, stdout, BufReader, Read, Write}; use std::{fs::File, path::Path}; use uucore::InvalidEncodingHandling; @@ -80,20 +81,33 @@ pub fn uu_app() -> App<'static, 'static> { .arg(Arg::with_name(options::FILE).hidden(true).multiple(true)) } +/// Write lines from `data` to stdout in reverse. +/// +/// This function writes to [`stdout`] each line appearing in `data`, +/// starting with the last line and ending with the first line. The +/// `separator` parameter defines what characters to use as a line +/// separator. +/// +/// If `before` is `false`, then this function assumes that the +/// `separator` appears at the end of each line, as in `"abc\ndef\n"`. +/// If `before` is `true`, then this function assumes that the +/// `separator` appears at the beginning of each line, as in +/// `"/abc/def"`. fn buffer_tac(data: &[u8], before: bool, separator: &str) -> std::io::Result<()> { let mut out = stdout(); - // Convert the line separator to a byte sequence. - let sbytes = separator.as_bytes(); - let slen = sbytes.len(); + // The number of bytes in the line separator. + let slen = separator.as_bytes().len(); - // If there are more characters in the separator than in the data, - // we can't possibly split the data on the separator. Write the - // entire buffer to stdout. - let dlen = data.len(); - if dlen < slen { - return out.write_all(data); - } + // The index of the start of the next line in the `data`. + // + // As we scan through the `data` from right to left, we update this + // variable each time we find a new line. + // + // If `before` is `true`, then each line starts immediately before + // the line separator. Otherwise, each line starts immediately after + // the line separator. + let mut following_line_start = data.len(); // Iterate over each byte in the buffer in reverse. When we find a // line separator, write the line to stdout. @@ -101,16 +115,13 @@ fn buffer_tac(data: &[u8], before: bool, separator: &str) -> std::io::Result<()> // The `before` flag controls whether the line separator appears at // the end of the line (as in "abc\ndef\n") or at the beginning of // the line (as in "/abc/def"). - let mut following_line_start = data.len(); - for i in (0..dlen - slen + 1).rev() { - if &data[i..i + slen] == sbytes { - if before { - out.write_all(&data[i..following_line_start])?; - following_line_start = i; - } else { - out.write_all(&data[i + slen..following_line_start])?; - following_line_start = i + slen; - } + for i in memmem::rfind_iter(data, separator) { + if before { + out.write_all(&data[i..following_line_start])?; + following_line_start = i; + } else { + out.write_all(&data[i + slen..following_line_start])?; + following_line_start = i + slen; } } diff --git a/tests/by-util/test_tac.rs b/tests/by-util/test_tac.rs index c6e32fef0..202f76d66 100644 --- a/tests/by-util/test_tac.rs +++ b/tests/by-util/test_tac.rs @@ -1,4 +1,4 @@ -// spell-checker:ignore axxbxx bxxaxx +// spell-checker:ignore axxbxx bxxaxx axxx axxxx xxaxx xxax xxxxa use crate::common::util::*; #[test] @@ -125,6 +125,78 @@ fn test_multi_char_separator() { .stdout_is("bxxaxx"); } +#[test] +fn test_multi_char_separator_overlap() { + // The right-most pair of "x" characters in the input is treated as + // the only line separator. That is, "axxx" is interpreted as having + // one line comprising the string "ax" followed by the line + // separator "xx". + new_ucmd!() + .args(&["-s", "xx"]) + .pipe_in("axxx") + .succeeds() + .stdout_is("axxx"); + + // Each non-overlapping pair of "x" characters in the input is + // treated as a line separator. That is, "axxxx" is interpreted as + // having two lines: + // + // * the second line is the empty string "" followed by the line + // separator "xx", + // * the first line is the string "a" followed by the line separator + // "xx". + // + // The lines are printed in reverse, resulting in "xx" followed by + // "axx". + new_ucmd!() + .args(&["-s", "xx"]) + .pipe_in("axxxx") + .succeeds() + .stdout_is("xxaxx"); +} + +#[test] +fn test_multi_char_separator_overlap_before() { + // With the "-b" option, the line separator is assumed to be at the + // beginning of the line. In this case, That is, "axxx" is + // interpreted as having two lines: + // + // * the second line is the empty string "" preceded by the line + // separator "xx", + // * the first line is the string "ax" preceded by no line + // separator, since there are no more characters preceding it. + // + // The lines are printed in reverse, resulting in "xx" followed by + // "ax". + new_ucmd!() + .args(&["-b", "-s", "xx"]) + .pipe_in("axxx") + .succeeds() + .stdout_is("xxax"); + + // With the "-b" option, the line separator is assumed to be at the + // beginning of the line. Each non-overlapping pair of "x" + // characters in the input is treated as a line separator. That is, + // "axxxx" is interpreted as having three lines: + // + // * the third line is the empty string "" preceded by the line + // separator "xx" (the last two "x" characters in the input + // string), + // * the second line is the empty string "" preceded by the line + // separator "xx" (the first two "x" characters in the input + // string), + // * the first line is the string "a" preceded by no line separator, + // since there are no more characters preceding it. + // + // The lines are printed in reverse, resulting in "xx" followed by + // "xx" followed by "a". + new_ucmd!() + .args(&["-b", "-s", "xx"]) + .pipe_in("axxxx") + .succeeds() + .stdout_is("xxxxa"); +} + #[test] fn test_null_separator() { new_ucmd!()