1
Fork 0
mirror of https://github.com/RGBCube/uutils-coreutils synced 2025-07-28 11:37:44 +00:00

split: implement --line-bytes option

Implement the `--line-bytes` option to `split`. In this mode, the
program tries to write as many lines of the input as possible to each
chunk of output without exceeding a specified byte limit. The new
`LineBytesChunkWriter` struct represents this functionality.
This commit is contained in:
Jeffrey Finkelstein 2022-01-01 23:53:29 -05:00
parent 2e8945ba7f
commit 77d92883c7
3 changed files with 218 additions and 3 deletions

View file

@ -5,7 +5,7 @@
// * For the full copyright and license information, please view the LICENSE // * For the full copyright and license information, please view the LICENSE
// * file that was distributed with this source code. // * file that was distributed with this source code.
// spell-checker:ignore (ToDO) PREFIXaa // spell-checker:ignore (ToDO) PREFIXaa nbbbb ncccc
mod filenames; mod filenames;
mod number; mod number;
@ -760,6 +760,187 @@ impl<'a> Write for LineChunkWriter<'a> {
} }
} }
/// Writer that splits its input into sequential output chunks by
/// line, subject to a per-chunk byte limit.
///
/// The struct owns the underlying writer for the chunk currently
/// being filled. Each call to [`write`] packs as many whole lines as
/// fit into the current chunk without going over the byte limit and
/// opens further chunks as needed until the input buffer has been
/// consumed. A line that is longer than the limit fills one chunk
/// completely; whatever is left of that line is then treated as if it
/// were a separate line.
struct LineBytesChunkWriter<'a> {
    /// Settings consulted whenever the writer for a new chunk is
    /// created.
    settings: &'a Settings,

    /// Upper bound, in bytes, on the size of a single output chunk.
    chunk_size: u64,

    /// Number of chunks that have been completed so far.
    num_chunks_written: usize,

    /// Number of bytes that may still be written to the current chunk.
    ///
    /// The value begins at `chunk_size` and shrinks as data is
    /// written. When it hits zero, a writer for the next chunk is
    /// created and the value is reset to `chunk_size`.
    num_bytes_remaining_in_current_chunk: usize,

    /// Underlying writer for the current chunk.
    ///
    /// It is replaced by a freshly created writer each time a new
    /// chunk is started.
    inner: BufWriter<Box<dyn Write>>,

    /// Source of the filename for each successive chunk.
    filename_iterator: FilenameIterator<'a>,
}
impl<'a> LineBytesChunkWriter<'a> {
    /// Build a writer whose first chunk is already open.
    ///
    /// The filename iterator is configured from `settings` (prefix,
    /// additional suffix, suffix length and suffix type). Returns
    /// `None` when that iterator cannot produce a name for the first
    /// chunk. When `settings.verbose` is set, the name of the first
    /// file is announced on standard output.
    ///
    /// # Panics
    ///
    /// Panics if `chunk_size` cannot be represented as a `usize`.
    fn new(chunk_size: u64, settings: &'a Settings) -> Option<LineBytesChunkWriter<'a>> {
        let mut names = FilenameIterator::new(
            &settings.prefix,
            &settings.additional_suffix,
            settings.suffix_length,
            settings.suffix_type,
        );

        // Without a name for the very first chunk there is nothing to
        // write to, so give up immediately.
        let first_name = names.next()?;
        if settings.verbose {
            println!("creating file {}", first_name.quote());
        }

        let writer = platform::instantiate_current_writer(&settings.filter, &first_name);
        // The first chunk starts out with its full capacity available.
        let capacity: usize = chunk_size.try_into().unwrap();

        Some(LineBytesChunkWriter {
            settings,
            chunk_size,
            num_chunks_written: 0,
            num_bytes_remaining_in_current_chunk: capacity,
            inner: writer,
            filename_iterator: names,
        })
    }
}
impl<'a> Write for LineBytesChunkWriter<'a> {
    /// Write as many lines to a chunk as possible without exceeding
    /// the byte limit.
    ///
    /// If a single line has more bytes than the limit, an entire chunk
    /// is filled with the start of that line and the remainder of the
    /// line is handled as if it were its own distinct line.
    ///
    /// For example, if `chunk_size` is 8 and the input is
    ///
    /// ```text
    /// aaaaaaaaa\nbbbb\ncccc\ndd\nee\n
    /// ```
    ///
    /// then the output is broken into chunks like this:
    ///
    /// ```text
    /// chunk 0: aaaaaaaa      (8 bytes, start of the over-long line)
    /// chunk 1: a\nbbbb\n     (7 bytes, "cccc\n" would not fit)
    /// chunk 2: cccc\ndd\n    (8 bytes)
    /// chunk 3: ee\n          (3 bytes)
    /// ```
    ///
    /// Returns the total number of bytes handed to the underlying
    /// writers during this call.
    ///
    /// # Errors
    ///
    /// Returns an error of kind [`ErrorKind::Other`] when a new chunk
    /// is needed but no further output filename is available, and
    /// propagates any error reported by the underlying writer.
    fn write(&mut self, mut buf: &[u8]) -> std::io::Result<usize> {
        // Running total across all underlying writers touched during
        // this call; a single call may span several chunks.
        let mut total_bytes_written = 0;

        loop {
            // Stop as soon as the input buffer is consumed. This check
            // must come *before* opening a new chunk: otherwise input
            // that ends exactly on a chunk boundary would create an
            // empty trailing output file.
            if buf.is_empty() {
                return Ok(total_bytes_written);
            }

            // The current chunk is full (or was abandoned because the
            // next line does not fit): start a new chunk and
            // initialize its writer.
            if self.num_bytes_remaining_in_current_chunk == 0 {
                self.num_chunks_written += 1;
                let filename = self.filename_iterator.next().ok_or_else(|| {
                    std::io::Error::new(ErrorKind::Other, "output file suffixes exhausted")
                })?;
                if self.settings.verbose {
                    println!("creating file {}", filename.quote());
                }
                self.inner = platform::instantiate_current_writer(&self.settings.filter, &filename);
                self.num_bytes_remaining_in_current_chunk = self.chunk_size.try_into().unwrap();
            }

            let remaining = self.num_bytes_remaining_in_current_chunk;

            // Decide how many bytes from the front of `buf` go into
            // the current chunk.
            let end = match memchr::memchr(b'\n', buf) {
                // No newline in the buffer: write as many bytes as
                // the chunk can take, but never more than the buffer
                // holds (slicing past the end of `buf` would panic).
                None => remaining.min(buf.len()),
                // The line, including its newline character, fits in
                // the current chunk: write the entire line. (See
                // chunk 1 in the example above.)
                Some(i) if i < remaining => i + 1,
                // The line does not fit and nothing has been written
                // to the current chunk yet: fill the chunk with the
                // start of the line. (See chunk 0 in the example
                // above.) The newline lies at or beyond `remaining`,
                // so the buffer is long enough for this slice.
                Some(_) if remaining == self.chunk_size.try_into().unwrap() => remaining,
                // The line does not fit and the current chunk already
                // holds data: mark the chunk as full so that the next
                // iteration opens a new chunk and retries the line
                // there.
                Some(_) => {
                    self.num_bytes_remaining_in_current_chunk = 0;
                    continue;
                }
            };

            // The underlying writer may accept fewer bytes than
            // offered; account only for what was actually written and
            // let the next iteration deal with the rest.
            let num_bytes_written = self.inner.write(&buf[..end])?;
            self.num_bytes_remaining_in_current_chunk -= num_bytes_written;
            total_bytes_written += num_bytes_written;
            buf = &buf[num_bytes_written..];
        }
    }

    /// Flush the underlying writer of the current chunk.
    fn flush(&mut self) -> std::io::Result<()> {
        self.inner.flush()
    }
}
/// Split a file into a specific number of chunks by byte. /// Split a file into a specific number of chunks by byte.
/// ///
/// This function always creates one output file for each chunk, even /// This function always creates one output file for each chunk, even
@ -1027,7 +1208,7 @@ fn split(settings: &Settings) -> UResult<()> {
}, },
} }
} }
Strategy::Bytes(chunk_size) | Strategy::LineBytes(chunk_size) => { Strategy::Bytes(chunk_size) => {
let mut writer = ByteChunkWriter::new(chunk_size, settings) let mut writer = ByteChunkWriter::new(chunk_size, settings)
.ok_or_else(|| USimpleError::new(1, "output file suffixes exhausted"))?; .ok_or_else(|| USimpleError::new(1, "output file suffixes exhausted"))?;
match std::io::copy(&mut reader, &mut writer) { match std::io::copy(&mut reader, &mut writer) {
@ -1046,6 +1227,25 @@ fn split(settings: &Settings) -> UResult<()> {
}, },
} }
} }
Strategy::LineBytes(chunk_size) => {
let mut writer = LineBytesChunkWriter::new(chunk_size, settings)
.ok_or_else(|| USimpleError::new(1, "output file suffixes exhausted"))?;
match std::io::copy(&mut reader, &mut writer) {
Ok(_) => Ok(()),
Err(e) => match e.kind() {
// TODO Since the writer object controls the creation of
// new files, we need to rely on the `std::io::Result`
// returned by its `write()` method to communicate any
// errors to this calling scope. If a new file cannot be
// created because we have exceeded the number of
// allowable filenames, we use `ErrorKind::Other` to
// indicate that. A special error message needs to be
// printed in that case.
ErrorKind::Other => Err(USimpleError::new(1, "output file suffixes exhausted")),
_ => Err(uio_error!(e, "input/output error")),
},
}
}
} }
} }

View file

@ -2,7 +2,7 @@
// * // *
// * For the full copyright and license information, please view the LICENSE // * For the full copyright and license information, please view the LICENSE
// * file that was distributed with this source code. // * file that was distributed with this source code.
// spell-checker:ignore xzaaa sixhundredfiftyonebytes ninetyonebytes threebytes asciilowercase fghij klmno pqrst uvwxyz fivelines twohundredfortyonebytes onehundredlines // spell-checker:ignore xzaaa sixhundredfiftyonebytes ninetyonebytes threebytes asciilowercase fghij klmno pqrst uvwxyz fivelines twohundredfortyonebytes onehundredlines nbbbb
extern crate rand; extern crate rand;
extern crate regex; extern crate regex;
@ -595,3 +595,13 @@ fn test_lines_kth() {
.succeeds() .succeeds()
.stdout_only("20\n21\n22\n23\n24\n25\n26\n27\n28\n29\n"); .stdout_only("20\n21\n22\n23\n24\n25\n26\n27\n28\n29\n");
} }
/// `-C 8` on the `letters.txt` fixture must yield four output files:
/// the over-long first line is cut at eight bytes and the remaining
/// lines are packed into chunks of at most eight bytes each.
#[test]
fn test_line_bytes() {
    let (at, mut ucmd) = at_and_ucmd!();
    ucmd.args(&["-C", "8", "letters.txt"]).succeeds();

    // Output file name paired with the exact contents it must hold.
    let expected_chunks = [
        ("xaa", "aaaaaaaa"),
        ("xab", "a\nbbbb\n"),
        ("xac", "cccc\ndd\n"),
        ("xad", "ee\n"),
    ];
    for &(name, contents) in expected_chunks.iter() {
        assert_eq!(at.read(name), contents);
    }
}

5
tests/fixtures/split/letters.txt vendored Normal file
View file

@ -0,0 +1,5 @@
aaaaaaaaa
bbbb
cccc
dd
ee