split: implement --line-bytes option

Implement the `--line-bytes` option to `split`. In this mode, the program tries to write as many lines of the input as possible to each chunk of output without exceeding a specified byte limit. The new `LineBytesChunkWriter` struct represents this functionality.
2025-09-14 02:57:57 +00:00 · 2022-01-01 23:53:29 -05:00 · 2022-01-01 23:53:29 -05:00 · 77d92883c7
commit 77d92883c7
parent 2e8945ba7f
3 changed files with 218 additions and 3 deletions
--- a/src/uu/split/src/split.rs
+++ b/src/uu/split/src/split.rs
@ -5,7 +5,7 @@
 //  * For the full copyright and license information, please view the LICENSE
 //  * file that was distributed with this source code.
-// spell-checker:ignore (ToDO) PREFIXaa
+// spell-checker:ignore (ToDO) PREFIXaa nbbbb ncccc
 mod filenames;
 mod number;
@ -760,6 +760,187 @@ impl<'a> Write for LineChunkWriter<'a> {
    }
 }
 /// Write lines to sequential output files, limiting each file by bytes.
 ///
 /// This struct maintains an underlying writer representing the
 /// current chunk of the output. On each call to [`Write::write`], it
 /// writes as many lines as possible to the current chunk without
 /// exceeding the specified byte limit. If a single line has more
 /// bytes than the limit, an entire chunk is filled with the leading
 /// bytes of that line and the remainder of the line is handled as if
 /// it were its own distinct line. As many new underlying writers are
 /// created as needed to write all the data in the input buffer.
 struct LineBytesChunkWriter<'a> {
    /// Parameters for creating the underlying writer for each new chunk.
    settings: &'a Settings,
    /// The maximum number of bytes allowed for a single chunk of output.
    chunk_size: u64,
    /// Running total of number of chunks that have been completed.
    ///
    /// Incremented each time a writer for a new chunk is started.
    num_chunks_written: usize,
    /// Remaining capacity in number of bytes in the current chunk.
    ///
    /// This number starts at `chunk_size` and decreases as bytes are
    /// written. It is also forced to zero when the next line does not
    /// fit in a chunk that already holds data. Once it is zero, a
    /// writer for a new chunk is initialized and this number gets
    /// reset to `chunk_size`.
    num_bytes_remaining_in_current_chunk: usize,
    /// The underlying writer for the current chunk.
    ///
    /// When the current chunk is full, or the next line does not fit
    /// in its remaining capacity, a new writer is initialized and
    /// assigned to this field.
    inner: BufWriter<Box<dyn Write>>,
    /// Iterator that yields filenames for each chunk.
    filename_iterator: FilenameIterator<'a>,
 }
 impl<'a> LineBytesChunkWriter<'a> {
    /// Build a writer whose first chunk is already open.
    ///
    /// The filename for the first chunk is taken from a fresh
    /// [`FilenameIterator`] configured from `settings`; if that
    /// iterator yields nothing, `None` is returned. When
    /// `settings.verbose` is set, the name of the created file is
    /// printed to standard output.
    ///
    /// # Panics
    ///
    /// Panics if `chunk_size` cannot be represented as a `usize`.
    fn new(chunk_size: u64, settings: &'a Settings) -> Option<LineBytesChunkWriter<'a>> {
        let mut names = FilenameIterator::new(
            &settings.prefix,
            &settings.additional_suffix,
            settings.suffix_length,
            settings.suffix_type,
        );
        let first_name = names.next()?;
        if settings.verbose {
            println!("creating file {}", first_name.quote());
        }
        // Open the first chunk before converting the capacity so that
        // side effects happen in the same order as they always have.
        let first_chunk = platform::instantiate_current_writer(&settings.filter, &first_name);
        let capacity: usize = chunk_size.try_into().unwrap();
        Some(Self {
            settings,
            chunk_size,
            num_chunks_written: 0,
            num_bytes_remaining_in_current_chunk: capacity,
            inner: first_chunk,
            filename_iterator: names,
        })
    }
 }
 impl<'a> LineBytesChunkWriter<'a> {
    /// Replace `inner` with a writer for the next output file and
    /// reset the remaining capacity to `chunk_size`.
    ///
    /// Returns an error of kind [`ErrorKind::Other`] when the filename
    /// iterator has no more names to offer.
    fn start_next_chunk(&mut self) -> std::io::Result<()> {
        self.num_chunks_written += 1;
        let filename = self.filename_iterator.next().ok_or_else(|| {
            std::io::Error::new(ErrorKind::Other, "output file suffixes exhausted")
        })?;
        if self.settings.verbose {
            println!("creating file {}", filename.quote());
        }
        self.inner = platform::instantiate_current_writer(&self.settings.filter, &filename);
        self.num_bytes_remaining_in_current_chunk = self.chunk_size.try_into().unwrap();
        Ok(())
    }

    /// Write `bytes` to the current chunk and deduct the number of
    /// bytes actually accepted from the remaining capacity.
    ///
    /// The caller must not pass more bytes than the remaining capacity.
    fn write_to_current_chunk(&mut self, bytes: &[u8]) -> std::io::Result<usize> {
        let num_bytes_written = self.inner.write(bytes)?;
        self.num_bytes_remaining_in_current_chunk -= num_bytes_written;
        Ok(num_bytes_written)
    }
 }
 impl<'a> Write for LineBytesChunkWriter<'a> {
    /// Write as many lines to a chunk as possible without
    /// exceeding the byte limit. If a single line has more bytes
    /// than the limit, then fill an entire single chunk with those
    /// bytes and handle the remainder of the line as if it were
    /// its own distinct line.
    ///
    /// For example: if the `chunk_size` is 8 and the input is:
    ///
    /// ```text
    /// aaaaaaaaa\nbbbb\ncccc\ndd\nee\n
    /// ```
    ///
    /// then the output gets broken into chunks like this:
    ///
    /// ```text
    /// chunk 0    chunk 1    chunk 2    chunk 3
    ///
    /// 0            1             2
    /// 01234567  89 01234   56789 012   345 6
    /// |------|  |-------|  |--------|  |---|
    /// aaaaaaaa  a\nbbbb\n  cccc\ndd\n  ee\n
    /// ```
    ///
    /// Returns the total number of bytes written across all of the
    /// underlying chunk writers touched by this call.
    ///
    /// # Errors
    ///
    /// Propagates any error from the underlying writer, and returns an
    /// error of kind [`ErrorKind::Other`] if a new chunk is needed but
    /// no more output filenames are available.
    fn write(&mut self, mut buf: &[u8]) -> std::io::Result<usize> {
        // The total number of bytes written during the loop below.
        //
        // It is necessary to keep this running total because we may
        // be making multiple calls to `write()` on multiple different
        // underlying writers and we want the final reported number of
        // bytes written to reflect the total number of bytes written
        // to all of the underlying writers.
        let mut total_bytes_written = 0;
        // Loop until we have written all bytes in the input buffer
        // (or an IO error occurs).
        loop {
            // Check for completion *before* deciding whether to open a
            // new chunk. Otherwise, input that ends exactly on a chunk
            // boundary would create a trailing empty output file.
            if buf.is_empty() {
                return Ok(total_bytes_written);
            }
            // If we have filled the current chunk with bytes, then
            // start a new chunk and initialize its corresponding
            // writer.
            if self.num_bytes_remaining_in_current_chunk == 0 {
                self.start_next_chunk()?;
            }
            let remaining = self.num_bytes_remaining_in_current_chunk;
            // Find the first newline character in the buffer.
            let num_bytes_written = match memchr::memchr(b'\n', buf) {
                // If there is no newline character, then write as
                // many bytes as we can and then move on to the next
                // chunk if necessary. The buffer may hold fewer bytes
                // than the chunk has room for, so the slice end must
                // be clamped to the buffer length to avoid an
                // out-of-bounds panic.
                None => {
                    let end = remaining.min(buf.len());
                    self.write_to_current_chunk(&buf[..end])?
                }
                // If there is a newline character and the line
                // (including the newline character) will fit in the
                // current chunk, then write the entire line and
                // continue to the next iteration. (See chunk 1 in the
                // example comment above.)
                Some(i) if i < remaining => self.write_to_current_chunk(&buf[..=i])?,
                // If there is a newline character, the line
                // (including the newline character) will not fit in
                // the current chunk, *and* nothing has been written
                // to the current chunk, then write as many bytes as
                // we can and continue to the next iteration. (See
                // chunk 0 in the example comment above.) The newline
                // lies at or beyond index `remaining`, so the slice
                // below is always in bounds.
                Some(_) if remaining == self.chunk_size.try_into().unwrap() => {
                    self.write_to_current_chunk(&buf[..remaining])?
                }
                // If there is a newline character, the line
                // (including the newline character) will not fit in
                // the current chunk, and the current chunk already
                // holds data, then signal to the next iteration that
                // a new chunk needs to be created and try writing
                // the line there.
                Some(_) => {
                    self.num_bytes_remaining_in_current_chunk = 0;
                    0
                }
            };
            total_bytes_written += num_bytes_written;
            buf = &buf[num_bytes_written..];
        }
    }
    /// Flush the writer for the current chunk.
    fn flush(&mut self) -> std::io::Result<()> {
        self.inner.flush()
    }
 }
 /// Split a file into a specific number of chunks by byte.
 ///
 /// This function always creates one output file for each chunk, even
@ -1027,7 +1208,7 @@ fn split(settings: &Settings) -> UResult<()> {
                },
            }
        }
-        Strategy::Bytes(chunk_size) | Strategy::LineBytes(chunk_size) => {
+        Strategy::Bytes(chunk_size) => {
            let mut writer = ByteChunkWriter::new(chunk_size, settings)
                .ok_or_else(|| USimpleError::new(1, "output file suffixes exhausted"))?;
            match std::io::copy(&mut reader, &mut writer) {
@ -1046,6 +1227,25 @@ fn split(settings: &Settings) -> UResult<()> {
                },
            }
        }
        Strategy::LineBytes(chunk_size) => {
            let mut writer = LineBytesChunkWriter::new(chunk_size, settings)
                .ok_or_else(|| USimpleError::new(1, "output file suffixes exhausted"))?;
            match std::io::copy(&mut reader, &mut writer) {
                Ok(_) => Ok(()),
                Err(e) => match e.kind() {
                    // TODO Since the writer object controls the creation of
                    // new files, we need to rely on the `std::io::Result`
                    // returned by its `write()` method to communicate any
                    // errors to this calling scope. If a new file cannot be
                    // created because we have exceeded the number of
                    // allowable filenames, we use `ErrorKind::Other` to
                    // indicate that. A special error message needs to be
                    // printed in that case.
                    ErrorKind::Other => Err(USimpleError::new(1, "output file suffixes exhausted")),
                    _ => Err(uio_error!(e, "input/output error")),
                },
            }
        }
    }
 }
--- a/tests/by-util/test_split.rs
+++ b/tests/by-util/test_split.rs
@ -2,7 +2,7 @@
 //  *
 //  * For the full copyright and license information, please view the LICENSE
 //  * file that was distributed with this source code.
-// spell-checker:ignore xzaaa sixhundredfiftyonebytes ninetyonebytes threebytes asciilowercase fghij klmno pqrst uvwxyz fivelines twohundredfortyonebytes onehundredlines
+// spell-checker:ignore xzaaa sixhundredfiftyonebytes ninetyonebytes threebytes asciilowercase fghij klmno pqrst uvwxyz fivelines twohundredfortyonebytes onehundredlines nbbbb
 extern crate rand;
 extern crate regex;
@ -595,3 +595,13 @@ fn test_lines_kth() {
        .succeeds()
        .stdout_only("20\n21\n22\n23\n24\n25\n26\n27\n28\n29\n");
 }
 /// Splitting `letters.txt` with `-C 8` must pack whole lines into
 /// chunks of at most eight bytes, breaking only the over-long first
 /// line across two files.
 #[test]
 fn test_line_bytes() {
    let (at, mut ucmd) = at_and_ucmd!();
    ucmd.args(&["-C", "8", "letters.txt"]).succeeds();
    // Each pair is (output file name, expected file contents).
    let expected_chunks = [
        ("xaa", "aaaaaaaa"),
        ("xab", "a\nbbbb\n"),
        ("xac", "cccc\ndd\n"),
        ("xad", "ee\n"),
    ];
    for &(name, contents) in &expected_chunks {
        assert_eq!(at.read(name), contents);
    }
 }
--- a/tests/fixtures/split/letters.txt
+++ b/tests/fixtures/split/letters.txt
@ -0,0 +1,5 @@
 aaaaaaaaa
 bbbb
 cccc
 dd
 ee