mirror of
https://github.com/RGBCube/uutils-coreutils
synced 2025-07-28 03:27:44 +00:00
tac: support multi-char separator with overlap
Fix a bug in `tac` where multi-character line separators would cause incorrect behavior when there was overlap between candidate matches in the input string. This commit adds a dependency on `memchr` in order to use the `memchr::memmem::rfind_iter()` function to scan for non-overlapping instances of the specified line separator characters, scanning from right to left. Fixes #2580.
This commit is contained in:
parent
c77115ab51
commit
0e689e78aa
4 changed files with 107 additions and 22 deletions
1
Cargo.lock
generated
1
Cargo.lock
generated
|
@ -2937,6 +2937,7 @@ name = "uu_tac"
|
|||
version = "0.0.7"
|
||||
dependencies = [
|
||||
"clap",
|
||||
"memchr 2.4.0",
|
||||
"uucore",
|
||||
"uucore_procs",
|
||||
]
|
||||
|
|
|
@ -15,6 +15,7 @@ edition = "2018"
|
|||
path = "src/tac.rs"
|
||||
|
||||
[dependencies]
|
||||
memchr = "2"
|
||||
clap = { version = "2.33", features = ["wrap_help"] }
|
||||
uucore = { version=">=0.0.9", package="uucore", path="../../uucore" }
|
||||
uucore_procs = { version=">=0.0.6", package="uucore_procs", path="../../uucore_procs" }
|
||||
|
|
|
@ -5,12 +5,13 @@
|
|||
// * For the full copyright and license information, please view the LICENSE
|
||||
// * file that was distributed with this source code.
|
||||
|
||||
// spell-checker:ignore (ToDO) sbytes slen dlen
|
||||
// spell-checker:ignore (ToDO) sbytes slen dlen memmem
|
||||
|
||||
#[macro_use]
|
||||
extern crate uucore;
|
||||
|
||||
use clap::{crate_version, App, Arg};
|
||||
use memchr::memmem;
|
||||
use std::io::{stdin, stdout, BufReader, Read, Write};
|
||||
use std::{fs::File, path::Path};
|
||||
use uucore::InvalidEncodingHandling;
|
||||
|
@ -80,20 +81,33 @@ pub fn uu_app() -> App<'static, 'static> {
|
|||
.arg(Arg::with_name(options::FILE).hidden(true).multiple(true))
|
||||
}
|
||||
|
||||
/// Write lines from `data` to stdout in reverse.
|
||||
///
|
||||
/// This function writes to [`stdout`] each line appearing in `data`,
|
||||
/// starting with the last line and ending with the first line. The
|
||||
/// `separator` parameter defines what characters to use as a line
|
||||
/// separator.
|
||||
///
|
||||
/// If `before` is `false`, then this function assumes that the
|
||||
/// `separator` appears at the end of each line, as in `"abc\ndef\n"`.
|
||||
/// If `before` is `true`, then this function assumes that the
|
||||
/// `separator` appears at the beginning of each line, as in
|
||||
/// `"/abc/def"`.
|
||||
fn buffer_tac(data: &[u8], before: bool, separator: &str) -> std::io::Result<()> {
|
||||
let mut out = stdout();
|
||||
|
||||
// Convert the line separator to a byte sequence.
|
||||
let sbytes = separator.as_bytes();
|
||||
let slen = sbytes.len();
|
||||
// The number of bytes in the line separator.
|
||||
let slen = separator.as_bytes().len();
|
||||
|
||||
// If there are more characters in the separator than in the data,
|
||||
// we can't possibly split the data on the separator. Write the
|
||||
// entire buffer to stdout.
|
||||
let dlen = data.len();
|
||||
if dlen < slen {
|
||||
return out.write_all(data);
|
||||
}
|
||||
// The index of the start of the next line in the `data`.
|
||||
//
|
||||
// As we scan through the `data` from right to left, we update this
|
||||
// variable each time we find a new line.
|
||||
//
|
||||
// If `before` is `true`, then each line starts immediately before
|
||||
// the line separator. Otherwise, each line starts immediately after
|
||||
// the line separator.
|
||||
let mut following_line_start = data.len();
|
||||
|
||||
// Iterate over each byte in the buffer in reverse. When we find a
|
||||
// line separator, write the line to stdout.
|
||||
|
@ -101,16 +115,13 @@ fn buffer_tac(data: &[u8], before: bool, separator: &str) -> std::io::Result<()>
|
|||
// The `before` flag controls whether the line separator appears at
|
||||
// the end of the line (as in "abc\ndef\n") or at the beginning of
|
||||
// the line (as in "/abc/def").
|
||||
let mut following_line_start = data.len();
|
||||
for i in (0..dlen - slen + 1).rev() {
|
||||
if &data[i..i + slen] == sbytes {
|
||||
if before {
|
||||
out.write_all(&data[i..following_line_start])?;
|
||||
following_line_start = i;
|
||||
} else {
|
||||
out.write_all(&data[i + slen..following_line_start])?;
|
||||
following_line_start = i + slen;
|
||||
}
|
||||
for i in memmem::rfind_iter(data, separator) {
|
||||
if before {
|
||||
out.write_all(&data[i..following_line_start])?;
|
||||
following_line_start = i;
|
||||
} else {
|
||||
out.write_all(&data[i + slen..following_line_start])?;
|
||||
following_line_start = i + slen;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
// spell-checker:ignore axxbxx bxxaxx
|
||||
// spell-checker:ignore axxbxx bxxaxx axxx axxxx xxaxx xxax xxxxa
|
||||
use crate::common::util::*;
|
||||
|
||||
#[test]
|
||||
|
@ -125,6 +125,78 @@ fn test_multi_char_separator() {
|
|||
.stdout_is("bxxaxx");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_multi_char_separator_overlap() {
|
||||
// The right-most pair of "x" characters in the input is treated as
|
||||
// the only line separator. That is, "axxx" is interpreted as having
|
||||
// one line comprising the string "ax" followed by the line
|
||||
// separator "xx".
|
||||
new_ucmd!()
|
||||
.args(&["-s", "xx"])
|
||||
.pipe_in("axxx")
|
||||
.succeeds()
|
||||
.stdout_is("axxx");
|
||||
|
||||
// Each non-overlapping pair of "x" characters in the input is
|
||||
// treated as a line separator. That is, "axxxx" is interpreted as
|
||||
// having two lines:
|
||||
//
|
||||
// * the second line is the empty string "" followed by the line
|
||||
// separator "xx",
|
||||
// * the first line is the string "a" followed by the line separator
|
||||
// "xx".
|
||||
//
|
||||
// The lines are printed in reverse, resulting in "xx" followed by
|
||||
// "axx".
|
||||
new_ucmd!()
|
||||
.args(&["-s", "xx"])
|
||||
.pipe_in("axxxx")
|
||||
.succeeds()
|
||||
.stdout_is("xxaxx");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_multi_char_separator_overlap_before() {
|
||||
// With the "-b" option, the line separator is assumed to be at the
|
||||
// beginning of the line. In this case, That is, "axxx" is
|
||||
// interpreted as having two lines:
|
||||
//
|
||||
// * the second line is the empty string "" preceded by the line
|
||||
// separator "xx",
|
||||
// * the first line is the string "ax" preceded by no line
|
||||
// separator, since there are no more characters preceding it.
|
||||
//
|
||||
// The lines are printed in reverse, resulting in "xx" followed by
|
||||
// "ax".
|
||||
new_ucmd!()
|
||||
.args(&["-b", "-s", "xx"])
|
||||
.pipe_in("axxx")
|
||||
.succeeds()
|
||||
.stdout_is("xxax");
|
||||
|
||||
// With the "-b" option, the line separator is assumed to be at the
|
||||
// beginning of the line. Each non-overlapping pair of "x"
|
||||
// characters in the input is treated as a line separator. That is,
|
||||
// "axxxx" is interpreted as having three lines:
|
||||
//
|
||||
// * the third line is the empty string "" preceded by the line
|
||||
// separator "xx" (the last two "x" characters in the input
|
||||
// string),
|
||||
// * the second line is the empty string "" preceded by the line
|
||||
// separator "xx" (the first two "x" characters in the input
|
||||
// string),
|
||||
// * the first line is the string "a" preceded by no line separator,
|
||||
// since there are no more characters preceding it.
|
||||
//
|
||||
// The lines are printed in reverse, resulting in "xx" followed by
|
||||
// "xx" followed by "a".
|
||||
new_ucmd!()
|
||||
.args(&["-b", "-s", "xx"])
|
||||
.pipe_in("axxxx")
|
||||
.succeeds()
|
||||
.stdout_is("xxxxa");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_null_separator() {
|
||||
new_ucmd!()
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue