paste: permit the delimiter list to be empty (#6714)

* paste: permit the delimiter list to be empty. Also: refactored the delimiter processing logic * Extract duplicated code into function * Address PR comments. Improve code structure. * Fix additional paste bugs * Fix additional paste bugs * Simplify backslash delimiter validation * Fix Clippy violations
2025-09-14 19:16:17 +00:00 · 2024-10-10 08:36:30 -05:00 · 2024-10-10 08:36:30 -05:00 · c41c601b45
commit c41c601b45
parent 7c3a9380f1
2 changed files with 426 additions and 101 deletions
--- a/src/uu/paste/src/paste.rs
+++ b/src/uu/paste/src/paste.rs
@ -3,13 +3,14 @@
 // For the full copyright and license information, please view the LICENSE
 // file that was distributed with this source code.
 // spell-checker:ignore (ToDO) delim
 use clap::{crate_version, Arg, ArgAction, Command};
 use std::cell::{OnceCell, RefCell};
 use std::fs::File;
-use std::io::{stdin, stdout, BufRead, BufReader, Read, Write};
+use std::io::{stdin, stdout, BufRead, BufReader, Stdin, Write};
-use std::path::Path;
+use std::iter::Cycle;
-use uucore::error::{FromIo, UResult, USimpleError};
+use std::rc::Rc;
 use std::slice::Iter;
 use uucore::error::{UResult, USimpleError};
 use uucore::line_ending::LineEnding;
 use uucore::{format_usage, help_about, help_usage};
@ -23,18 +24,6 @@ mod options {
    pub const ZERO_TERMINATED: &str = "zero-terminated";
 }
 // Reads up to and including `byte` into `buf`, from `reader` when one is
 // supplied and from locked standard input otherwise.
 // Returns the number of bytes appended, as reported by `BufRead::read_until`.
 fn read_until<R: Read>(
    reader: Option<&mut BufReader<R>>,
    byte: u8,
    buf: &mut Vec<u8>,
 ) -> std::io::Result<usize> {
    if let Some(buffered_reader) = reader {
        return buffered_reader.read_until(byte, buf);
    }

    stdin().lock().read_until(byte, buf)
 }
 #[uucore::main]
 pub fn uumain(args: impl uucore::Args) -> UResult<()> {
    let matches = uu_app().try_get_matches_from(args)?;
@ -96,120 +85,292 @@ fn paste(
    delimiters: &str,
    line_ending: LineEnding,
 ) -> UResult<()> {
-    let mut files = Vec::with_capacity(filenames.len());
+    let unescaped_and_encoded_delimiters = parse_delimiters(delimiters)?;
-    for name in filenames {
+
-        let file = if name == "-" {
+    let stdin_once_cell = OnceCell::<Rc<RefCell<Stdin>>>::new();
-            None
+
-        } else {
+    let mut input_source_vec = Vec::with_capacity(filenames.len());
-            let path = Path::new(&name);
+
-            let r = File::open(path).map_err_context(String::new)?;
+    for filename in filenames {
-            Some(BufReader::new(r))
+        let input_source = match filename.as_str() {
            "-" => InputSource::StandardInput(
                stdin_once_cell
                    .get_or_init(|| Rc::new(RefCell::new(stdin())))
                    .clone(),
            ),
            st => {
                let file = File::open(st)?;
                InputSource::File(BufReader::new(file))
            }
        };
-        files.push(file);
+
        input_source_vec.push(input_source);
    }
-    if delimiters.ends_with('\\') && !delimiters.ends_with("\\\\") {
+    let mut stdout = stdout().lock();
        return Err(USimpleError::new(
            1,
            format!("delimiter list ends with an unescaped backslash: {delimiters}"),
        ));
    }
-    let delimiters: Vec<char> = unescape(delimiters).chars().collect();
+    let line_ending_byte = u8::from(line_ending);
-    let mut delim_count = 0;
+    let line_ending_byte_array_ref = &[line_ending_byte];
-    let mut delim_length = 1;
+
-    let stdout = stdout();
+    let input_source_vec_len = input_source_vec.len();
-    let mut stdout = stdout.lock();
+
    let mut delimiter_state = DelimiterState::new(&unescaped_and_encoded_delimiters);
    let mut output = Vec::new();
    if serial {
-        for file in &mut files {
+        for input_source in &mut input_source_vec {
            output.clear();
            loop {
-                match read_until(file.as_mut(), line_ending as u8, &mut output) {
+                match input_source.read_until(line_ending_byte, &mut output)? {
-                    Ok(0) => break,
+                    0 => break,
-                    Ok(_) => {
+                    _ => {
-                        if output.ends_with(&[line_ending as u8]) {
+                        remove_trailing_line_ending_byte(line_ending_byte, &mut output);
                            output.pop();
                        }
                        // a buffer of length four is large enough to encode any char
                        let mut buffer = [0; 4];
                        let ch =
                            delimiters[delim_count % delimiters.len()].encode_utf8(&mut buffer);
                        delim_length = ch.len();
-                        for byte in buffer.iter().take(delim_length) {
+                        delimiter_state.write_delimiter(&mut output);
                            output.push(*byte);
                        }
                    }
                    Err(e) => return Err(e.map_err_context(String::new)),
                }
                delim_count += 1;
            }
            // remove final delimiter
            output.truncate(output.len() - delim_length);
-            write!(
+            delimiter_state.remove_trailing_delimiter(&mut output);
-                stdout,
+
-                "{}{}",
+            stdout.write_all(&output)?;
-                String::from_utf8_lossy(&output),
+            stdout.write_all(line_ending_byte_array_ref)?;
                line_ending
            )?;
        }
    } else {
-        let mut eof = vec![false; files.len()];
+        let mut eof = vec![false; input_source_vec_len];
        loop {
            output.clear();
            let mut eof_count = 0;
-            for (i, file) in files.iter_mut().enumerate() {
+
            for (i, input_source) in input_source_vec.iter_mut().enumerate() {
                if eof[i] {
                    eof_count += 1;
                } else {
-                    match read_until(file.as_mut(), line_ending as u8, &mut output) {
+                    match input_source.read_until(line_ending_byte, &mut output)? {
-                        Ok(0) => {
+                        0 => {
                            eof[i] = true;
                            eof_count += 1;
                        }
-                        Ok(_) => {
+                        _ => {
-                            if output.ends_with(&[line_ending as u8]) {
+                            remove_trailing_line_ending_byte(line_ending_byte, &mut output);
                                output.pop();
                            }
                        }
                        Err(e) => return Err(e.map_err_context(String::new)),
                    }
                }
                // a buffer of length four is large enough to encode any char
                let mut buffer = [0; 4];
                let ch = delimiters[delim_count % delimiters.len()].encode_utf8(&mut buffer);
                delim_length = ch.len();
-                for byte in buffer.iter().take(delim_length) {
+                delimiter_state.write_delimiter(&mut output);
                    output.push(*byte);
                }
                delim_count += 1;
            }
-            if files.len() == eof_count {
+
            if eof_count == input_source_vec_len {
                break;
            }
            // Remove final delimiter
            output.truncate(output.len() - delim_length);
-            write!(
+            delimiter_state.remove_trailing_delimiter(&mut output);
-                stdout,
+
-                "{}{}",
+            stdout.write_all(&output)?;
-                String::from_utf8_lossy(&output),
+            stdout.write_all(line_ending_byte_array_ref)?;
-                line_ending
+
-            )?;
+            // Quote:
-            delim_count = 0;
+            //     When the -s option is not specified:
            //     [...]
            //     The delimiter shall be reset to the first element of list after each file operand is processed.
            // https://pubs.opengroup.org/onlinepubs/9799919799/utilities/paste.html
            delimiter_state.reset_to_first_delimiter();
        }
    }
    Ok(())
 }
-// Unescape all special characters
+fn parse_delimiters(delimiters: &str) -> UResult<Box<[Box<[u8]>]>> {
-fn unescape(s: &str) -> String {
+    /// A single backslash char
-    s.replace("\\n", "\n")
+    const BACKSLASH: char = '\\';
-        .replace("\\t", "\t")
+
-        .replace("\\\\", "\\")
+    fn add_one_byte_single_char_delimiter(vec: &mut Vec<Box<[u8]>>, byte: u8) {
        vec.push(Box::new([byte]));
    }
    // a buffer of length four is large enough to encode any char
    let mut buffer = [0; 4];
    // Appends `ch`, UTF-8 encoded, as one delimiter entry; `buffer` is reused as scratch space
    let mut add_single_char_delimiter = |vec: &mut Vec<Box<[u8]>>, ch: char| {
        let delimiter_encoded = ch.encode_utf8(&mut buffer);
        vec.push(Box::from(delimiter_encoded.as_bytes()));
    };
    // Every entry consumes at least one byte of input, so the byte length of
    // `delimiters` is an upper bound on the number of entries
    let mut vec = Vec::<Box<[u8]>>::with_capacity(delimiters.len());
    let mut chars = delimiters.chars();
    // Unescape all special characters
    // (`while let` rather than `for`: an escape sequence pulls its second char from the same iterator)
    while let Some(char) = chars.next() {
        match char {
            BACKSLASH => match chars.next() {
                // "Empty string (not a null character)"
                // https://pubs.opengroup.org/onlinepubs/9799919799/utilities/paste.html
                Some('0') => {
                    vec.push(Box::<[u8; 0]>::new([]));
                }
                // "\\" to "\" (U+005C)
                Some(BACKSLASH) => {
                    add_one_byte_single_char_delimiter(&mut vec, b'\\');
                }
                // "\n" to U+000A
                Some('n') => {
                    add_one_byte_single_char_delimiter(&mut vec, b'\n');
                }
                // "\t" to U+0009
                Some('t') => {
                    add_one_byte_single_char_delimiter(&mut vec, b'\t');
                }
                Some(other_char) => {
                    // "If any other characters follow the <backslash>, the results are unspecified."
                    // https://pubs.opengroup.org/onlinepubs/9799919799/utilities/paste.html
                    // However, other implementations remove the backslash
                    // See "test_posix_unspecified_delimiter"
                    add_single_char_delimiter(&mut vec, other_char);
                }
                // A backslash with nothing after it: reject the whole list
                None => {
                    return Err(USimpleError::new(
                        1,
                        format!("delimiter list ends with an unescaped backslash: {delimiters}"),
                    ));
                }
            },
            non_backslash_char => {
                add_single_char_delimiter(&mut vec, non_backslash_char);
            }
        }
    }
    // The list is complete; hand it back as a boxed slice
    Ok(vec.into_boxed_slice())
 }
 /// Drops the final byte of `output` when that byte equals `line_ending_byte`.
 ///
 /// At most one byte is removed. An empty buffer, or one ending in any other
 /// byte, is left untouched.
 fn remove_trailing_line_ending_byte(line_ending_byte: u8, output: &mut Vec<u8>) {
    if output.ends_with(&[line_ending_byte]) {
        output.pop();
    }
 }
 /// Tracks which entry of the parsed delimiter list is written next.
 enum DelimiterState<'a> {
    /// The list is empty, or it consists of one empty (`\0`) delimiter.
    NoDelimiters,
    /// Exactly one non-empty delimiter: the same bytes are written every time.
    OneDelimiter(&'a [u8]),
    /// Two or more delimiters, used in rotation.
    MultipleDelimiters {
        /// The delimiter most recently written (initially the first entry).
        current_delimiter: &'a [u8],
        /// The whole list, kept so that the rotation can be restarted.
        delimiters: &'a [Box<[u8]>],
        /// Endless iterator over `delimiters`.
        delimiters_iterator: Cycle<Iter<'a, Box<[u8]>>>,
    },
 }

 impl<'a> DelimiterState<'a> {
    /// Picks the variant that matches the number of parsed delimiters.
    fn new(unescaped_and_encoded_delimiters: &'a [Box<[u8]>]) -> DelimiterState<'a> {
        match unescaped_and_encoded_delimiters.split_first() {
            None => Self::NoDelimiters,
            // -d '\0' is equivalent to -d ''
            Some((sole_delimiter, [])) if sole_delimiter.is_empty() => Self::NoDelimiters,
            Some((sole_delimiter, [])) => Self::OneDelimiter(sole_delimiter),
            Some((first_delimiter, _)) => Self::MultipleDelimiters {
                current_delimiter: first_delimiter,
                delimiters: unescaped_and_encoded_delimiters,
                delimiters_iterator: unescaped_and_encoded_delimiters.iter().cycle(),
            },
        }
    }

    /// Restarts the rotation at the first delimiter of the list.
    ///
    /// Intended for non-serial mode, once an output line has been completed.
    /// Does nothing unless there are multiple delimiters.
    fn reset_to_first_delimiter(&mut self) {
        match self {
            Self::MultipleDelimiters {
                delimiters,
                delimiters_iterator,
                ..
            } => {
                *delimiters_iterator = delimiters.iter().cycle();
            }
            Self::NoDelimiters | Self::OneDelimiter(_) => {}
        }
    }

    /// Cuts the most recently written delimiter off the end of `output`.
    ///
    /// Does nothing when there are no delimiters or when the current
    /// delimiter is the empty "\0" delimiter.
    fn remove_trailing_delimiter(&mut self, output: &mut Vec<u8>) {
        let delimiter_length = match self {
            Self::NoDelimiters => return,
            Self::OneDelimiter(only_delimiter) => only_delimiter.len(),
            Self::MultipleDelimiters {
                current_delimiter, ..
            } => current_delimiter.len(),
        };

        if delimiter_length == 0 {
            // The current delimiter is a "\0" delimiter: nothing was appended
            return;
        }

        let output_len = output.len();

        match output_len.checked_sub(delimiter_length) {
            Some(output_without_delimiter_length) => {
                output.truncate(output_without_delimiter_length);
            }
            None => {
                // Reachable when nothing was written at all (e.g. an empty input);
                // `output` must then be empty
                assert!(output_len == 0);
            }
        }
    }

    /// Appends the next delimiter to `output` and records it as the current one.
    ///
    /// Does nothing when there are no delimiters.
    fn write_delimiter(&mut self, output: &mut Vec<u8>) {
        match self {
            Self::NoDelimiters => {}
            Self::OneDelimiter(only_delimiter) => {
                output.extend_from_slice(only_delimiter);
            }
            Self::MultipleDelimiters {
                current_delimiter,
                delimiters_iterator,
                ..
            } => {
                // Unwrap because `delimiters_iterator` cycles over a non-empty slice
                let next_delimiter = delimiters_iterator.next().unwrap();

                *current_delimiter = next_delimiter;
                output.extend_from_slice(current_delimiter);
            }
        }
    }
 }
 /// One input operand: either a buffered file or the shared standard input handle.
 enum InputSource {
    /// A regular file, read through its own `BufReader`.
    File(BufReader<File>),
    /// Standard input; the `Rc` lets several operands refer to the same handle.
    StandardInput(Rc<RefCell<Stdin>>),
 }

 impl InputSource {
    /// Reads up to and including `byte` into `buf` and returns the number of
    /// bytes appended.
    ///
    /// Fails if the underlying read fails, or if the shared standard input
    /// handle cannot be borrowed.
    fn read_until(&mut self, byte: u8, buf: &mut Vec<u8>) -> UResult<usize> {
        match self {
            Self::File(buffered_file) => {
                let bytes_read = buffered_file.read_until(byte, buf)?;

                Ok(bytes_read)
            }
            Self::StandardInput(shared_stdin) => {
                let stdin_handle = shared_stdin
                    .try_borrow()
                    .map_err(|borrow_error| USimpleError::new(1, borrow_error.to_string()))?;

                let bytes_read = stdin_handle.lock().read_until(byte, buf)?;

                Ok(bytes_read)
            }
        }
    }
 }
--- a/tests/by-util/test_paste.rs
+++ b/tests/by-util/test_paste.rs
@ -2,6 +2,9 @@
 //
 // For the full copyright and license information, please view the LICENSE
 // file that was distributed with this source code.
 // spell-checker:ignore bsdutils toybox
 use crate::common::util::TestScenario;
 struct TestData<'b> {
@ -11,7 +14,7 @@ struct TestData<'b> {
    out: &'b str,
 }
-static EXAMPLE_DATA: &[TestData] = &[
+const EXAMPLE_DATA: &[TestData] = &[
    // Ensure that paste properly handles files lacking a final newline.
    TestData {
        name: "no-nl-1",
@ -172,7 +175,7 @@ fn test_delimiter_list_ending_with_escaped_backslash() {
            at.write(&file, one_in);
            ins.push(file);
        }
-        ucmd.args(&[d, "\\\\"])
+        ucmd.args(&[d, r"\\"])
            .args(&ins)
            .succeeds()
            .stdout_is("a\\b\n");
@ -183,13 +186,174 @@ fn test_delimiter_list_ending_with_escaped_backslash() {
 fn test_delimiter_list_ending_with_unescaped_backslash() {
    for d in ["-d", "--delimiters"] {
        new_ucmd!()
-            .args(&[d, "\\"])
+            .args(&[d, r"\"])
            .fails()
-            .stderr_contains("delimiter list ends with an unescaped backslash: \\");
+            .stderr_contains(r"delimiter list ends with an unescaped backslash: \");
        new_ucmd!()
-            .args(&[d, "_\\"])
+            .args(&[d, r"\\\"])
            .fails()
-            .stderr_contains("delimiter list ends with an unescaped backslash: _\\");
+            .stderr_contains(r"delimiter list ends with an unescaped backslash: \\\");
        new_ucmd!()
            .args(&[d, r"_\"])
            .fails()
            .stderr_contains(r"delimiter list ends with an unescaped backslash: _\");
    }
 }
 // An empty delimiter list is accepted: in serial mode the input lines are
 // joined with nothing between them.
 #[test]
 fn test_delimiter_list_empty() {
    // Exercise both the short and the long spelling of the option
    for option_style in ["-d", "--delimiters"] {
        new_ucmd!()
            .args(&[option_style, "", "-s"])
            .pipe_in(
                "\
 A ALPHA 1 _
 B BRAVO 2 _
 C CHARLIE 3 _
 ",
            )
            .succeeds()
            .stdout_only(
                "\
 A ALPHA 1 _B BRAVO 2 _C CHARLIE 3 _
 ",
            );
    }
 }
 // Was panicking (usize subtraction that would have resulted in a negative number)
 // Not observable in release builds, since integer overflow checking is not enabled
 // Regression test: serial mode, a three-delimiter list, and standard input given three times.
 #[test]
 fn test_delimiter_truncation() {
    // Exercise both the short and the long spelling of the option
    for option_style in ["-d", "--delimiters"] {
        new_ucmd!()
            .args(&[option_style, "!@#", "-s", "-", "-", "-"])
            .pipe_in(
                "\
 FIRST
 SECOND
 THIRD
 FOURTH
 ABCDEFG
 ",
            )
            .succeeds()
            .stdout_only(
                "\
 FIRST!SECOND@THIRD#FOURTH!ABCDEFG
 ",
            );
    }
 }
 // Bytes that are not valid UTF-8 must be copied to the output unchanged.
 #[test]
 fn test_non_utf8_input() {
    // 0xC0 is not valid UTF-8
    let non_utf8_bytes: &[u8] = b"Non-UTF-8 test: \xC0\x00\xC0.\n";

    let mut command = new_ucmd!();

    command
        .pipe_in(non_utf8_bytes)
        .succeeds()
        .stdout_only_bytes(non_utf8_bytes);
 }
 // A list of three backslashes is one escaped backslash followed by a dangling
 // one, so it must be rejected with the "unescaped backslash" error.
 #[test]
 fn test_three_trailing_backslashes_delimiter() {
    let three_backslashes_string = r"\".repeat(3);

    // The message is expected at the very end of stderr, newline included
    let expected_stderr_suffix = format!(
        ": delimiter list ends with an unescaped backslash: {three_backslashes_string}\n"
    );

    for option_style in ["-d", "--delimiters"] {
        new_ucmd!()
            .args(&[option_style, three_backslashes_string.as_str()])
            .fails()
            .no_stdout()
            .stderr_str_check(|stderr| stderr.ends_with(&expected_stderr_suffix));
    }
 }
 // "If any other characters follow the <backslash>, the results are unspecified."
 // https://pubs.opengroup.org/onlinepubs/9799919799/utilities/paste.html
 // However, other implementations remove the backslash
 // Here `\z` is therefore expected to act like a plain `z` delimiter.
 #[test]
 fn test_posix_unspecified_delimiter() {
    // Exercise both the short and the long spelling of the option
    for option_style in ["-d", "--delimiters"] {
        new_ucmd!()
            .args(&[option_style, r"\z", "-s"])
            .pipe_in(
                "\
 1
 2
 3
 4
 ",
            )
            .succeeds()
            .stdout_only(
                "\
 1z2z3z4
 ",
            );
    }
 }
 // "Empty string (not a null character)"
 // https://pubs.opengroup.org/onlinepubs/9799919799/utilities/paste.html
 // The list `\0z\0` cycles through: empty, `z`, empty, and then starts over.
 #[test]
 fn test_backslash_zero_delimiter() {
    // Exercise both the short and the long spelling of the option
    for option_style in ["-d", "--delimiters"] {
        new_ucmd!()
            .args(&[option_style, r"\0z\0", "-s"])
            .pipe_in(
                "\
 1
 2
 3
 4
 5
 6
 ",
            )
            .succeeds()
            .stdout_only(
                "\
 12z345z6
 ",
            );
    }
 }
 // As of 2024-10-09, only bsdutils (https://github.com/dcantrell/bsdutils, derived from FreeBSD) and toybox handle
 // multibyte delimiter characters in the way a user would likely expect. BusyBox and GNU Core Utilities do not.
 // `ß` takes two bytes in UTF-8; the expected output shows it emitted whole, as a single delimiter.
 #[test]
 fn test_multi_byte_delimiter() {
    // Exercise both the short and the long spelling of the option
    for option_style in ["-d", "--delimiters"] {
        new_ucmd!()
            .args(&[option_style, "!ß@", "-s"])
            .pipe_in(
                "\
 1
 2
 3
 4
 5
 6
 ",
            )
            .succeeds()
            .stdout_only(
                "\
 1!2ß3@4!5ß6
 ",
            );
    }
 }