1
Fork 0
mirror of https://github.com/RGBCube/uutils-coreutils synced 2025-07-28 11:37:44 +00:00

paste: permit the delimiter list to be empty (#6714)

* paste: permit the delimiter list to be empty

Also: refactored the delimiter processing logic

* Extract duplicated code into function

* Address PR comments. Improve code structure.

* Fix additional paste bugs

* Fix additional paste bugs

* Simplify backslash delimiter validation

* Fix Clippy violations
This commit is contained in:
Andrew Liebenow 2024-10-10 08:36:30 -05:00 committed by GitHub
parent 7c3a9380f1
commit c41c601b45
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 426 additions and 101 deletions

View file

@ -3,13 +3,14 @@
// For the full copyright and license information, please view the LICENSE // For the full copyright and license information, please view the LICENSE
// file that was distributed with this source code. // file that was distributed with this source code.
// spell-checker:ignore (ToDO) delim
use clap::{crate_version, Arg, ArgAction, Command}; use clap::{crate_version, Arg, ArgAction, Command};
use std::cell::{OnceCell, RefCell};
use std::fs::File; use std::fs::File;
use std::io::{stdin, stdout, BufRead, BufReader, Read, Write}; use std::io::{stdin, stdout, BufRead, BufReader, Stdin, Write};
use std::path::Path; use std::iter::Cycle;
use uucore::error::{FromIo, UResult, USimpleError}; use std::rc::Rc;
use std::slice::Iter;
use uucore::error::{UResult, USimpleError};
use uucore::line_ending::LineEnding; use uucore::line_ending::LineEnding;
use uucore::{format_usage, help_about, help_usage}; use uucore::{format_usage, help_about, help_usage};
@ -23,18 +24,6 @@ mod options {
pub const ZERO_TERMINATED: &str = "zero-terminated"; pub const ZERO_TERMINATED: &str = "zero-terminated";
} }
// Wraps BufReader and stdin
fn read_until<R: Read>(
reader: Option<&mut BufReader<R>>,
byte: u8,
buf: &mut Vec<u8>,
) -> std::io::Result<usize> {
match reader {
Some(reader) => reader.read_until(byte, buf),
None => stdin().lock().read_until(byte, buf),
}
}
#[uucore::main] #[uucore::main]
pub fn uumain(args: impl uucore::Args) -> UResult<()> { pub fn uumain(args: impl uucore::Args) -> UResult<()> {
let matches = uu_app().try_get_matches_from(args)?; let matches = uu_app().try_get_matches_from(args)?;
@ -96,120 +85,292 @@ fn paste(
delimiters: &str, delimiters: &str,
line_ending: LineEnding, line_ending: LineEnding,
) -> UResult<()> { ) -> UResult<()> {
let mut files = Vec::with_capacity(filenames.len()); let unescaped_and_encoded_delimiters = parse_delimiters(delimiters)?;
for name in filenames {
let file = if name == "-" { let stdin_once_cell = OnceCell::<Rc<RefCell<Stdin>>>::new();
None
} else { let mut input_source_vec = Vec::with_capacity(filenames.len());
let path = Path::new(&name);
let r = File::open(path).map_err_context(String::new)?; for filename in filenames {
Some(BufReader::new(r)) let input_source = match filename.as_str() {
"-" => InputSource::StandardInput(
stdin_once_cell
.get_or_init(|| Rc::new(RefCell::new(stdin())))
.clone(),
),
st => {
let file = File::open(st)?;
InputSource::File(BufReader::new(file))
}
}; };
files.push(file);
input_source_vec.push(input_source);
} }
if delimiters.ends_with('\\') && !delimiters.ends_with("\\\\") { let mut stdout = stdout().lock();
return Err(USimpleError::new(
1,
format!("delimiter list ends with an unescaped backslash: {delimiters}"),
));
}
let delimiters: Vec<char> = unescape(delimiters).chars().collect(); let line_ending_byte = u8::from(line_ending);
let mut delim_count = 0; let line_ending_byte_array_ref = &[line_ending_byte];
let mut delim_length = 1;
let stdout = stdout(); let input_source_vec_len = input_source_vec.len();
let mut stdout = stdout.lock();
let mut delimiter_state = DelimiterState::new(&unescaped_and_encoded_delimiters);
let mut output = Vec::new(); let mut output = Vec::new();
if serial { if serial {
for file in &mut files { for input_source in &mut input_source_vec {
output.clear(); output.clear();
loop { loop {
match read_until(file.as_mut(), line_ending as u8, &mut output) { match input_source.read_until(line_ending_byte, &mut output)? {
Ok(0) => break, 0 => break,
Ok(_) => { _ => {
if output.ends_with(&[line_ending as u8]) { remove_trailing_line_ending_byte(line_ending_byte, &mut output);
output.pop();
}
// a buffer of length four is large enough to encode any char
let mut buffer = [0; 4];
let ch =
delimiters[delim_count % delimiters.len()].encode_utf8(&mut buffer);
delim_length = ch.len();
for byte in buffer.iter().take(delim_length) { delimiter_state.write_delimiter(&mut output);
output.push(*byte);
}
} }
Err(e) => return Err(e.map_err_context(String::new)),
} }
delim_count += 1;
} }
// remove final delimiter
output.truncate(output.len() - delim_length);
write!( delimiter_state.remove_trailing_delimiter(&mut output);
stdout,
"{}{}", stdout.write_all(&output)?;
String::from_utf8_lossy(&output), stdout.write_all(line_ending_byte_array_ref)?;
line_ending
)?;
} }
} else { } else {
let mut eof = vec![false; files.len()]; let mut eof = vec![false; input_source_vec_len];
loop { loop {
output.clear(); output.clear();
let mut eof_count = 0; let mut eof_count = 0;
for (i, file) in files.iter_mut().enumerate() {
for (i, input_source) in input_source_vec.iter_mut().enumerate() {
if eof[i] { if eof[i] {
eof_count += 1; eof_count += 1;
} else { } else {
match read_until(file.as_mut(), line_ending as u8, &mut output) { match input_source.read_until(line_ending_byte, &mut output)? {
Ok(0) => { 0 => {
eof[i] = true; eof[i] = true;
eof_count += 1; eof_count += 1;
} }
Ok(_) => { _ => {
if output.ends_with(&[line_ending as u8]) { remove_trailing_line_ending_byte(line_ending_byte, &mut output);
output.pop();
}
} }
Err(e) => return Err(e.map_err_context(String::new)),
} }
} }
// a buffer of length four is large enough to encode any char
let mut buffer = [0; 4];
let ch = delimiters[delim_count % delimiters.len()].encode_utf8(&mut buffer);
delim_length = ch.len();
for byte in buffer.iter().take(delim_length) { delimiter_state.write_delimiter(&mut output);
output.push(*byte);
}
delim_count += 1;
} }
if files.len() == eof_count {
if eof_count == input_source_vec_len {
break; break;
} }
// Remove final delimiter
output.truncate(output.len() - delim_length);
write!( delimiter_state.remove_trailing_delimiter(&mut output);
stdout,
"{}{}", stdout.write_all(&output)?;
String::from_utf8_lossy(&output), stdout.write_all(line_ending_byte_array_ref)?;
line_ending
)?; // Quote:
delim_count = 0; // When the -s option is not specified:
// [...]
// The delimiter shall be reset to the first element of list after each file operand is processed.
// https://pubs.opengroup.org/onlinepubs/9799919799/utilities/paste.html
delimiter_state.reset_to_first_delimiter();
} }
} }
Ok(()) Ok(())
} }
// Unescape all special characters fn parse_delimiters(delimiters: &str) -> UResult<Box<[Box<[u8]>]>> {
fn unescape(s: &str) -> String { /// A single backslash char
s.replace("\\n", "\n") const BACKSLASH: char = '\\';
.replace("\\t", "\t")
.replace("\\\\", "\\") fn add_one_byte_single_char_delimiter(vec: &mut Vec<Box<[u8]>>, byte: u8) {
vec.push(Box::new([byte]));
}
// a buffer of length four is large enough to encode any char
let mut buffer = [0; 4];
let mut add_single_char_delimiter = |vec: &mut Vec<Box<[u8]>>, ch: char| {
let delimiter_encoded = ch.encode_utf8(&mut buffer);
vec.push(Box::from(delimiter_encoded.as_bytes()));
};
let mut vec = Vec::<Box<[u8]>>::with_capacity(delimiters.len());
let mut chars = delimiters.chars();
// Unescape all special characters
while let Some(char) = chars.next() {
match char {
BACKSLASH => match chars.next() {
// "Empty string (not a null character)"
// https://pubs.opengroup.org/onlinepubs/9799919799/utilities/paste.html
Some('0') => {
vec.push(Box::<[u8; 0]>::new([]));
}
// "\\" to "\" (U+005C)
Some(BACKSLASH) => {
add_one_byte_single_char_delimiter(&mut vec, b'\\');
}
// "\n" to U+000A
Some('n') => {
add_one_byte_single_char_delimiter(&mut vec, b'\n');
}
// "\t" to U+0009
Some('t') => {
add_one_byte_single_char_delimiter(&mut vec, b'\t');
}
Some(other_char) => {
// "If any other characters follow the <backslash>, the results are unspecified."
// https://pubs.opengroup.org/onlinepubs/9799919799/utilities/paste.html
// However, other implementations remove the backslash
// See "test_posix_unspecified_delimiter"
add_single_char_delimiter(&mut vec, other_char);
}
None => {
return Err(USimpleError::new(
1,
format!("delimiter list ends with an unescaped backslash: {delimiters}"),
));
}
},
non_backslash_char => {
add_single_char_delimiter(&mut vec, non_backslash_char);
}
}
}
Ok(vec.into_boxed_slice())
}
/// Removes a single trailing `line_ending_byte` from `output`, if there is one.
///
/// `BufRead::read_until` appends the terminator it stopped at, so a line that was just read
/// normally ends with `line_ending_byte`. The last line of an input may lack it, and `output`
/// may be empty; in both cases `output` is left untouched. At most one byte is removed.
fn remove_trailing_line_ending_byte(line_ending_byte: u8, output: &mut Vec<u8>) {
    // Compare first, then pop: the mutation is a plain statement rather than a side effect
    // buried inside an assertion.
    if output.last() == Some(&line_ending_byte) {
        output.pop();
    }
}
/// Bookkeeping for the delimiter list while output lines are being assembled.
///
/// The delimiters are borrowed as already unescaped, already encoded byte strings.
enum DelimiterState<'a> {
    /// Nothing is ever written: the list is empty, or it is the single empty delimiter.
    NoDelimiters,
    /// Exactly one non-empty delimiter, used every time.
    OneDelimiter(&'a [u8]),
    /// Two or more delimiters, used in rotation.
    MultipleDelimiters {
        /// The delimiter most recently appended by `write_delimiter` (the first one until then).
        current_delimiter: &'a [u8],
        /// The complete list, kept so that the rotation can be restarted.
        delimiters: &'a [Box<[u8]>],
        /// Endless rotation over `delimiters`.
        delimiters_iterator: Cycle<Iter<'a, Box<[u8]>>>,
    },
}

impl<'a> DelimiterState<'a> {
    /// Picks the variant that matches the number of delimiters supplied.
    fn new(unescaped_and_encoded_delimiters: &'a [Box<[u8]>]) -> DelimiterState<'a> {
        let all = unescaped_and_encoded_delimiters;

        match all.split_first() {
            None => Self::NoDelimiters,
            // -d '\0' is equivalent to -d ''
            Some((sole, [])) if sole.is_empty() => Self::NoDelimiters,
            Some((sole, [])) => Self::OneDelimiter(sole),
            Some((first, _)) => Self::MultipleDelimiters {
                current_delimiter: first,
                delimiters: all,
                delimiters_iterator: all.iter().cycle(),
            },
        }
    }

    /// Restarts the rotation at the first delimiter.
    ///
    /// Intended for returning to the start of the list once a file has been processed, and only
    /// when the "serial" option is disabled. Does nothing unless there are multiple delimiters.
    fn reset_to_first_delimiter(&mut self) {
        match self {
            Self::MultipleDelimiters {
                delimiters,
                delimiters_iterator,
                ..
            } => {
                *delimiters_iterator = delimiters.iter().cycle();
            }
            Self::NoDelimiters | Self::OneDelimiter(_) => {}
        }
    }

    /// Drops the trailing delimiter from `output`.
    ///
    /// Does nothing when there are no delimiters, or when the delimiter in question is empty.
    fn remove_trailing_delimiter(&mut self, output: &mut Vec<u8>) {
        let trailing: &[u8] = match self {
            Self::NoDelimiters => return,
            Self::OneDelimiter(only_delimiter) => *only_delimiter,
            Self::MultipleDelimiters {
                current_delimiter, ..
            } => *current_delimiter,
        };

        // An empty delimiter (written as "\0" in the list) leaves nothing to remove
        if trailing.is_empty() {
            return;
        }

        match output.len().checked_sub(trailing.len()) {
            Some(kept_len) => output.truncate(kept_len),
            // Reachable: `output` is shorter than the delimiter when nothing was written to it,
            // in which case it is expected to be empty and is left alone
            None => assert!(output.is_empty()),
        }
    }

    /// Appends the next delimiter to `output`, advancing the rotation if there is one.
    ///
    /// Does nothing when there are no delimiters.
    fn write_delimiter(&mut self, output: &mut Vec<u8>) {
        match self {
            Self::NoDelimiters => {}
            Self::OneDelimiter(only_delimiter) => output.extend_from_slice(only_delimiter),
            Self::MultipleDelimiters {
                current_delimiter,
                delimiters_iterator,
                ..
            } => {
                // A cycle over a non-empty slice never runs out, so this cannot be `None`
                let upcoming = delimiters_iterator.next().unwrap();

                output.extend_from_slice(upcoming);

                // Remembered so that `remove_trailing_delimiter` knows how many bytes to drop
                *current_delimiter = upcoming;
            }
        }
    }
}
/// One input operand of `paste`: either a named file or standard input.
enum InputSource {
/// A named file, read through a `BufReader`.
File(BufReader<File>),
/// Standard input. The handle sits behind `Rc<RefCell<..>>` so that several operands can hold
/// the same `Stdin`.
StandardInput(Rc<RefCell<Stdin>>),
}
impl InputSource {
/// Reads from this source into `buf` by calling `read_until(byte, buf)` on the underlying
/// reader, and returns the count that call reports.
///
/// I/O errors are propagated with `?`. A failed `try_borrow` on the shared standard input cell
/// is turned into a `USimpleError` with code 1 carrying the borrow error's message.
fn read_until(&mut self, byte: u8, buf: &mut Vec<u8>) -> UResult<usize> {
let us = match self {
Self::File(bu) => bu.read_until(byte, buf)?,
// The lock on standard input is a temporary that lasts only for this one read.
Self::StandardInput(rc) => rc
.try_borrow()
.map_err(|bo| USimpleError::new(1, format!("{bo}")))?
.lock()
.read_until(byte, buf)?,
};
Ok(us)
}
} }

View file

@ -2,6 +2,9 @@
// //
// For the full copyright and license information, please view the LICENSE // For the full copyright and license information, please view the LICENSE
// file that was distributed with this source code. // file that was distributed with this source code.
// spell-checker:ignore bsdutils toybox
use crate::common::util::TestScenario; use crate::common::util::TestScenario;
struct TestData<'b> { struct TestData<'b> {
@ -11,7 +14,7 @@ struct TestData<'b> {
out: &'b str, out: &'b str,
} }
static EXAMPLE_DATA: &[TestData] = &[ const EXAMPLE_DATA: &[TestData] = &[
// Ensure that paste properly handles files lacking a final newline. // Ensure that paste properly handles files lacking a final newline.
TestData { TestData {
name: "no-nl-1", name: "no-nl-1",
@ -172,7 +175,7 @@ fn test_delimiter_list_ending_with_escaped_backslash() {
at.write(&file, one_in); at.write(&file, one_in);
ins.push(file); ins.push(file);
} }
ucmd.args(&[d, "\\\\"]) ucmd.args(&[d, r"\\"])
.args(&ins) .args(&ins)
.succeeds() .succeeds()
.stdout_is("a\\b\n"); .stdout_is("a\\b\n");
@ -183,13 +186,174 @@ fn test_delimiter_list_ending_with_escaped_backslash() {
fn test_delimiter_list_ending_with_unescaped_backslash() { fn test_delimiter_list_ending_with_unescaped_backslash() {
for d in ["-d", "--delimiters"] { for d in ["-d", "--delimiters"] {
new_ucmd!() new_ucmd!()
.args(&[d, "\\"]) .args(&[d, r"\"])
.fails() .fails()
.stderr_contains("delimiter list ends with an unescaped backslash: \\"); .stderr_contains(r"delimiter list ends with an unescaped backslash: \");
new_ucmd!() new_ucmd!()
.args(&[d, "_\\"]) .args(&[d, r"\\\"])
.fails() .fails()
.stderr_contains("delimiter list ends with an unescaped backslash: _\\"); .stderr_contains(r"delimiter list ends with an unescaped backslash: \\\");
new_ucmd!()
.args(&[d, r"_\"])
.fails()
.stderr_contains(r"delimiter list ends with an unescaped backslash: _\");
}
}
// An empty delimiter list is accepted; in serial mode the lines are then joined with nothing
// between them.
#[test]
fn test_delimiter_list_empty() {
    const INPUT: &str = "A ALPHA 1 _\nB BRAVO 2 _\nC CHARLIE 3 _\n";
    const EXPECTED: &str = "A ALPHA 1 _B BRAVO 2 _C CHARLIE 3 _\n";

    for flag in ["-d", "--delimiters"] {
        new_ucmd!()
            .args(&[flag, "", "-s"])
            .pipe_in(INPUT)
            .succeeds()
            .stdout_only(EXPECTED);
    }
}
// Was panicking (usize subtraction that would have resulted in a negative number)
// Not observable in release builds, since integer overflow checking is not enabled
//
// All of standard input is consumed by the first `-` operand. The second and third `-` operands
// are already at end-of-file, so nothing is read for them, yet in serial mode each operand still
// ends with a line ending: the expected output is the joined line followed by two empty lines.
#[test]
fn test_delimiter_truncation() {
    for option_style in ["-d", "--delimiters"] {
        new_ucmd!()
            .args(&[option_style, "!@#", "-s", "-", "-", "-"])
            .pipe_in("FIRST\nSECOND\nTHIRD\nFOURTH\nABCDEFG\n")
            .succeeds()
            .stdout_only("FIRST!SECOND@THIRD#FOURTH!ABCDEFG\n\n\n");
    }
}
// Input that is not valid UTF-8 has to come out byte-for-byte unchanged.
#[test]
fn test_non_utf8_input() {
    // 0xC0 is not valid UTF-8
    let raw_bytes: &[u8] = b"Non-UTF-8 test: \xC0\x00\xC0.\n";

    new_ucmd!()
        .pipe_in(raw_bytes)
        .succeeds()
        .stdout_only_bytes(raw_bytes);
}
// Three backslashes: the first two form an escaped backslash, which leaves the third one
// unescaped at the end of the list.
#[test]
fn test_three_trailing_backslashes_delimiter() {
    let delimiter_list = r"\".repeat(3);
    let expected_suffix =
        format!(": delimiter list ends with an unescaped backslash: {delimiter_list}\n");

    for flag in ["-d", "--delimiters"] {
        new_ucmd!()
            .args(&[flag, &delimiter_list])
            .fails()
            .no_stdout()
            .stderr_str_check(|stderr| stderr.ends_with(&expected_suffix));
    }
}
// "If any other characters follow the <backslash>, the results are unspecified."
// https://pubs.opengroup.org/onlinepubs/9799919799/utilities/paste.html
// However, other implementations remove the backslash
#[test]
fn test_posix_unspecified_delimiter() {
    const INPUT: &str = "1\n2\n3\n4\n";
    const EXPECTED: &str = "1z2z3z4\n";

    for flag in ["-d", "--delimiters"] {
        new_ucmd!()
            .args(&[flag, r"\z", "-s"])
            .pipe_in(INPUT)
            .succeeds()
            .stdout_only(EXPECTED);
    }
}
// "Empty string (not a null character)"
// https://pubs.opengroup.org/onlinepubs/9799919799/utilities/paste.html
#[test]
fn test_backslash_zero_delimiter() {
    // The list is: empty, "z", empty. It repeats from the start after the third delimiter.
    const DELIMITER_LIST: &str = r"\0z\0";
    const INPUT: &str = "1\n2\n3\n4\n5\n6\n";
    const EXPECTED: &str = "12z345z6\n";

    for flag in ["-d", "--delimiters"] {
        new_ucmd!()
            .args(&[flag, DELIMITER_LIST, "-s"])
            .pipe_in(INPUT)
            .succeeds()
            .stdout_only(EXPECTED);
    }
}
// As of 2024-10-09, only bsdutils (https://github.com/dcantrell/bsdutils, derived from FreeBSD) and toybox handle
// multibyte delimiter characters in the way a user would likely expect. BusyBox and GNU Core Utilities do not.
//
// The delimiter list below is "!ß@". The expected output shows "ß" used as one whole delimiter
// (between "2" and "3", and again between "5" and "6"), with the list repeating after "@".
#[test]
fn test_multi_byte_delimiter() {
// Exercise both spellings of the option.
for option_style in ["-d", "--delimiters"] {
new_ucmd!()
.args(&[option_style, "!ß@", "-s"])
.pipe_in(
"\
1
2
3
4
5
6
",
)
.succeeds()
.stdout_only(
"\
1!2ß3@4!5ß6
",
);
} }
} }