From 77d92883c75f37d4f2c5f22c133b029d62835182 Mon Sep 17 00:00:00 2001
From: Jeffrey Finkelstein
Date: Sat, 1 Jan 2022 23:53:29 -0500
Subject: [PATCH] split: implement --line-bytes option

Implement the `--line-bytes` option to `split`. In this mode, the
program tries to write as many lines of the input as possible to each
chunk of output without exceeding a specified byte limit. The new
`LineBytesChunkWriter` struct represents this functionality.
---
 src/uu/split/src/split.rs        | 204 ++++++++++++++++++++++++++++++-
 tests/by-util/test_split.rs      |  12 +-
 tests/fixtures/split/letters.txt |   5 +
 3 files changed, 218 insertions(+), 3 deletions(-)
 create mode 100644 tests/fixtures/split/letters.txt

diff --git a/src/uu/split/src/split.rs b/src/uu/split/src/split.rs
index 090d89d4e..73abc966b 100644
--- a/src/uu/split/src/split.rs
+++ b/src/uu/split/src/split.rs
@@ -5,7 +5,7 @@
 // * For the full copyright and license information, please view the LICENSE
 // * file that was distributed with this source code.
 
-// spell-checker:ignore (ToDO) PREFIXaa
+// spell-checker:ignore (ToDO) PREFIXaa nbbbb ncccc
 
 mod filenames;
 mod number;
@@ -760,6 +760,187 @@ impl<'a> Write for LineChunkWriter<'a> {
     }
 }
 
+/// Write lines to each sequential output file, limited by bytes.
+///
+/// This struct maintains an underlying writer representing the
+/// current chunk of the output. On each call to [`write`], it writes
+/// as many lines as possible to the current chunk without exceeding
+/// the specified byte limit. If a single line has more bytes than the
+/// limit, then fill an entire single chunk with those bytes and
+/// handle the remainder of the line as if it were its own distinct
+/// line. As many new underlying writers are created as needed to
+/// write all the data in the input buffer.
+struct LineBytesChunkWriter<'a> {
+    /// Parameters for creating the underlying writer for each new chunk.
+    settings: &'a Settings,
+
+    /// The maximum number of bytes allowed for a single chunk of output.
+    chunk_size: u64,
+
+    /// Running total of number of chunks that have been completed.
+    num_chunks_written: usize,
+
+    /// Remaining capacity in number of bytes in the current chunk.
+    ///
+    /// This number starts at `chunk_size` and decreases as lines are
+    /// written. Once it reaches zero, a writer for a new chunk is
+    /// initialized and this number gets reset to `chunk_size`.
+    num_bytes_remaining_in_current_chunk: usize,
+
+    /// The underlying writer for the current chunk.
+    ///
+    /// Once the number of bytes written to this writer exceeds
+    /// `chunk_size`, a new writer is initialized and assigned to this
+    /// field.
+    inner: BufWriter<Box<dyn Write>>,
+
+    /// Iterator that yields filenames for each chunk.
+    filename_iterator: FilenameIterator<'a>,
+}
+
+impl<'a> LineBytesChunkWriter<'a> {
+    fn new(chunk_size: u64, settings: &'a Settings) -> Option<LineBytesChunkWriter<'a>> {
+        let mut filename_iterator = FilenameIterator::new(
+            &settings.prefix,
+            &settings.additional_suffix,
+            settings.suffix_length,
+            settings.suffix_type,
+        );
+        let filename = filename_iterator.next()?;
+        if settings.verbose {
+            println!("creating file {}", filename.quote());
+        }
+        let inner = platform::instantiate_current_writer(&settings.filter, &filename);
+        Some(LineBytesChunkWriter {
+            settings,
+            chunk_size,
+            num_bytes_remaining_in_current_chunk: chunk_size.try_into().unwrap(),
+            num_chunks_written: 0,
+            inner,
+            filename_iterator,
+        })
+    }
+}
+
+impl<'a> Write for LineBytesChunkWriter<'a> {
+    /// Write as many lines to a chunk as possible without
+    /// exceeding the byte limit. If a single line has more bytes
+    /// than the limit, then fill an entire single chunk with those
+    /// bytes and handle the remainder of the line as if it were
+    /// its own distinct line.
+    ///
+    /// For example: if the `chunk_size` is 8 and the input is:
+    ///
+    /// ```text
+    /// aaaaaaaaa\nbbbb\ncccc\ndd\nee\n
+    /// ```
+    ///
+    /// then the output gets broken into chunks like this:
+    ///
+    /// ```text
+    /// chunk 0   chunk 1    chunk 2     chunk 3
+    ///
+    /// 0            1             2
+    /// 01234567  89 01234   56789 012   345 6
+    /// |------|  |-------|  |--------|  |---|
+    /// aaaaaaaa  a\nbbbb\n  cccc\ndd\n  ee\n
+    /// ```
+    fn write(&mut self, mut buf: &[u8]) -> std::io::Result<usize> {
+        // The total number of bytes written during the loop below.
+        //
+        // It is necessary to keep this running total because we may
+        // be making multiple calls to `write()` on multiple different
+        // underlying writers and we want the final reported number of
+        // bytes written to reflect the total number of bytes written
+        // to all of the underlying writers.
+        let mut total_bytes_written = 0;
+
+        // Loop until we have written all bytes in the input buffer
+        // (or an IO error occurs).
+        loop {
+            // If we have filled the current chunk with bytes, then
+            // start a new chunk and initialize its corresponding
+            // writer.
+            if self.num_bytes_remaining_in_current_chunk == 0 {
+                self.num_chunks_written += 1;
+                let filename = self.filename_iterator.next().ok_or_else(|| {
+                    std::io::Error::new(ErrorKind::Other, "output file suffixes exhausted")
+                })?;
+                if self.settings.verbose {
+                    println!("creating file {}", filename.quote());
+                }
+                self.inner = platform::instantiate_current_writer(&self.settings.filter, &filename);
+                self.num_bytes_remaining_in_current_chunk = self.chunk_size.try_into().unwrap();
+            }
+
+            // Find the first newline character in the buffer.
+            match memchr::memchr(b'\n', buf) {
+                // If there is no newline character and the buffer is
+                // empty, then we are done writing.
+                None if buf.is_empty() => {
+                    return Ok(total_bytes_written);
+                }
+
+                // If there is no newline character and the buffer is not
+                // empty, then write as many bytes as fit (never more than
+                // `buf.len()`) and move on to the next chunk if necessary.
+                None => {
+                    let end = self.num_bytes_remaining_in_current_chunk.min(buf.len());
+                    let num_bytes_written = self.inner.write(&buf[..end])?;
+                    self.num_bytes_remaining_in_current_chunk -= num_bytes_written;
+                    total_bytes_written += num_bytes_written;
+                    buf = &buf[num_bytes_written..];
+                }
+
+                // If there is a newline character and the line
+                // (including the newline character) will fit in the
+                // current chunk, then write the entire line and
+                // continue to the next iteration. (See chunk 1 in the
+                // example comment above.)
+                Some(i) if i < self.num_bytes_remaining_in_current_chunk => {
+                    let num_bytes_written = self.inner.write(&buf[..i + 1])?;
+                    self.num_bytes_remaining_in_current_chunk -= num_bytes_written;
+                    total_bytes_written += num_bytes_written;
+                    buf = &buf[num_bytes_written..];
+                }
+
+                // If there is a newline character, the line
+                // (including the newline character) will not fit in
+                // the current chunk, *and* no other lines have been
+                // written to the current chunk, then write as many
+                // bytes as we can and continue to the next
+                // iteration. (See chunk 0 in the example comment
+                // above.)
+                Some(_)
+                    if self.num_bytes_remaining_in_current_chunk
+                        == self.chunk_size.try_into().unwrap() =>
+                {
+                    let end = self.num_bytes_remaining_in_current_chunk;
+                    let num_bytes_written = self.inner.write(&buf[..end])?;
+                    self.num_bytes_remaining_in_current_chunk -= num_bytes_written;
+                    total_bytes_written += num_bytes_written;
+                    buf = &buf[num_bytes_written..];
+                }
+
+                // If there is a newline character, the line
+                // (including the newline character) will not fit in
+                // the current chunk, and at least one other line has
+                // been written to the current chunk, then signal to
+                // the next iteration that a new chunk needs to be
+                // created and continue to the next iteration of the
+                // loop to try writing the line there.
+                Some(_) => {
+                    self.num_bytes_remaining_in_current_chunk = 0;
+                }
+            }
+        }
+    }
+
+    fn flush(&mut self) -> std::io::Result<()> {
+        self.inner.flush()
+    }
+}
+
 /// Split a file into a specific number of chunks by byte.
 ///
 /// This function always creates one output file for each chunk, even
@@ -1027,7 +1208,7 @@ fn split(settings: &Settings) -> UResult<()> {
                 },
             }
         }
-        Strategy::Bytes(chunk_size) | Strategy::LineBytes(chunk_size) => {
+        Strategy::Bytes(chunk_size) => {
             let mut writer = ByteChunkWriter::new(chunk_size, settings)
                 .ok_or_else(|| USimpleError::new(1, "output file suffixes exhausted"))?;
             match std::io::copy(&mut reader, &mut writer) {
@@ -1046,6 +1227,25 @@ fn split(settings: &Settings) -> UResult<()> {
                 },
             }
         }
+        Strategy::LineBytes(chunk_size) => {
+            let mut writer = LineBytesChunkWriter::new(chunk_size, settings)
+                .ok_or_else(|| USimpleError::new(1, "output file suffixes exhausted"))?;
+            match std::io::copy(&mut reader, &mut writer) {
+                Ok(_) => Ok(()),
+                Err(e) => match e.kind() {
+                    // TODO Since the writer object controls the creation of
+                    // new files, we need to rely on the `std::io::Result`
+                    // returned by its `write()` method to communicate any
+                    // errors to this calling scope. If a new file cannot be
+                    // created because we have exceeded the number of
+                    // allowable filenames, we use `ErrorKind::Other` to
+                    // indicate that. A special error message needs to be
+                    // printed in that case.
+                    ErrorKind::Other => Err(USimpleError::new(1, "output file suffixes exhausted")),
+                    _ => Err(uio_error!(e, "input/output error")),
+                },
+            }
+        }
     }
 }
 
diff --git a/tests/by-util/test_split.rs b/tests/by-util/test_split.rs
index 08431a8f6..642cb7c68 100644
--- a/tests/by-util/test_split.rs
+++ b/tests/by-util/test_split.rs
@@ -2,7 +2,7 @@
 // *
 // * For the full copyright and license information, please view the LICENSE
 // * file that was distributed with this source code.
-// spell-checker:ignore xzaaa sixhundredfiftyonebytes ninetyonebytes threebytes asciilowercase fghij klmno pqrst uvwxyz fivelines twohundredfortyonebytes onehundredlines
+// spell-checker:ignore xzaaa sixhundredfiftyonebytes ninetyonebytes threebytes asciilowercase fghij klmno pqrst uvwxyz fivelines twohundredfortyonebytes onehundredlines nbbbb
 extern crate rand;
 extern crate regex;
 
@@ -595,3 +595,13 @@ fn test_lines_kth() {
         .succeeds()
         .stdout_only("20\n21\n22\n23\n24\n25\n26\n27\n28\n29\n");
 }
+
+#[test]
+fn test_line_bytes() {
+    let (at, mut ucmd) = at_and_ucmd!();
+    ucmd.args(&["-C", "8", "letters.txt"]).succeeds();
+    assert_eq!(at.read("xaa"), "aaaaaaaa");
+    assert_eq!(at.read("xab"), "a\nbbbb\n");
+    assert_eq!(at.read("xac"), "cccc\ndd\n");
+    assert_eq!(at.read("xad"), "ee\n");
+}
diff --git a/tests/fixtures/split/letters.txt b/tests/fixtures/split/letters.txt
new file mode 100644
index 000000000..03e1003e3
--- /dev/null
+++ b/tests/fixtures/split/letters.txt
@@ -0,0 +1,5 @@
+aaaaaaaaa
+bbbb
+cccc
+dd
+ee