mirror of
https://github.com/RGBCube/uutils-coreutils
synced 2025-07-28 03:27:44 +00:00
split: implement --line-bytes option
Implement the `--line-bytes` option to `split`. In this mode, the program tries to write as many lines of the input as possible to each chunk of output without exceeding a specified byte limit. The new `LineBytesChunkWriter` struct represents this functionality.
This commit is contained in:
parent
2e8945ba7f
commit
77d92883c7
3 changed files with 218 additions and 3 deletions
|
@ -5,7 +5,7 @@
|
|||
// * For the full copyright and license information, please view the LICENSE
|
||||
// * file that was distributed with this source code.
|
||||
|
||||
// spell-checker:ignore (ToDO) PREFIXaa
|
||||
// spell-checker:ignore (ToDO) PREFIXaa nbbbb ncccc
|
||||
|
||||
mod filenames;
|
||||
mod number;
|
||||
|
@ -760,6 +760,187 @@ impl<'a> Write for LineChunkWriter<'a> {
|
|||
}
|
||||
}
|
||||
|
||||
/// Write lines to each sequential output files, limited by bytes.
|
||||
///
|
||||
/// This struct maintains an underlying writer representing the
|
||||
/// current chunk of the output. On each call to [`write`], it writes
|
||||
/// as many lines as possible to the current chunk without exceeding
|
||||
/// the specified byte limit. If a single line has more bytes than the
|
||||
/// limit, then fill an entire single chunk with those bytes and
|
||||
/// handle the remainder of the line as if it were its own distinct
|
||||
/// line. As many new underlying writers are created as needed to
|
||||
/// write all the data in the input buffer.
|
||||
struct LineBytesChunkWriter<'a> {
|
||||
/// Parameters for creating the underlying writer for each new chunk.
|
||||
settings: &'a Settings,
|
||||
|
||||
/// The maximum number of bytes allowed for a single chunk of output.
|
||||
chunk_size: u64,
|
||||
|
||||
/// Running total of number of chunks that have been completed.
|
||||
num_chunks_written: usize,
|
||||
|
||||
/// Remaining capacity in number of bytes in the current chunk.
|
||||
///
|
||||
/// This number starts at `chunk_size` and decreases as lines are
|
||||
/// written. Once it reaches zero, a writer for a new chunk is
|
||||
/// initialized and this number gets reset to `chunk_size`.
|
||||
num_bytes_remaining_in_current_chunk: usize,
|
||||
|
||||
/// The underlying writer for the current chunk.
|
||||
///
|
||||
/// Once the number of bytes written to this writer exceeds
|
||||
/// `chunk_size`, a new writer is initialized and assigned to this
|
||||
/// field.
|
||||
inner: BufWriter<Box<dyn Write>>,
|
||||
|
||||
/// Iterator that yields filenames for each chunk.
|
||||
filename_iterator: FilenameIterator<'a>,
|
||||
}
|
||||
|
||||
impl<'a> LineBytesChunkWriter<'a> {
|
||||
fn new(chunk_size: u64, settings: &'a Settings) -> Option<LineBytesChunkWriter<'a>> {
|
||||
let mut filename_iterator = FilenameIterator::new(
|
||||
&settings.prefix,
|
||||
&settings.additional_suffix,
|
||||
settings.suffix_length,
|
||||
settings.suffix_type,
|
||||
);
|
||||
let filename = filename_iterator.next()?;
|
||||
if settings.verbose {
|
||||
println!("creating file {}", filename.quote());
|
||||
}
|
||||
let inner = platform::instantiate_current_writer(&settings.filter, &filename);
|
||||
Some(LineBytesChunkWriter {
|
||||
settings,
|
||||
chunk_size,
|
||||
num_bytes_remaining_in_current_chunk: chunk_size.try_into().unwrap(),
|
||||
num_chunks_written: 0,
|
||||
inner,
|
||||
filename_iterator,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Write for LineBytesChunkWriter<'a> {
|
||||
/// Write as many lines to a chunk as possible without
|
||||
/// exceeding the byte limit. If a single line has more bytes
|
||||
/// than the limit, then fill an entire single chunk with those
|
||||
/// bytes and handle the remainder of the line as if it were
|
||||
/// its own distinct line.
|
||||
///
|
||||
/// For example: if the `chunk_size` is 8 and the input is:
|
||||
///
|
||||
/// ```text
|
||||
/// aaaaaaaaa\nbbbb\ncccc\ndd\nee\n
|
||||
/// ```
|
||||
///
|
||||
/// then the output gets broken into chunks like this:
|
||||
///
|
||||
/// ```text
|
||||
/// chunk 0 chunk 1 chunk 2 chunk 3
|
||||
///
|
||||
/// 0 1 2
|
||||
/// 01234567 89 01234 56789 012 345 6
|
||||
/// |------| |-------| |--------| |---|
|
||||
/// aaaaaaaa a\nbbbb\n cccc\ndd\n ee\n
|
||||
/// ```
|
||||
fn write(&mut self, mut buf: &[u8]) -> std::io::Result<usize> {
|
||||
// The total number of bytes written during the loop below.
|
||||
//
|
||||
// It is necessary to keep this running total because we may
|
||||
// be making multiple calls to `write()` on multiple different
|
||||
// underlying writers and we want the final reported number of
|
||||
// bytes written to reflect the total number of bytes written
|
||||
// to all of the underlying writers.
|
||||
let mut total_bytes_written = 0;
|
||||
|
||||
// Loop until we have written all bytes in the input buffer
|
||||
// (or an IO error occurs).
|
||||
loop {
|
||||
// If we have filled the current chunk with bytes, then
|
||||
// start a new chunk and initialize its corresponding
|
||||
// writer.
|
||||
if self.num_bytes_remaining_in_current_chunk == 0 {
|
||||
self.num_chunks_written += 1;
|
||||
let filename = self.filename_iterator.next().ok_or_else(|| {
|
||||
std::io::Error::new(ErrorKind::Other, "output file suffixes exhausted")
|
||||
})?;
|
||||
if self.settings.verbose {
|
||||
println!("creating file {}", filename.quote());
|
||||
}
|
||||
self.inner = platform::instantiate_current_writer(&self.settings.filter, &filename);
|
||||
self.num_bytes_remaining_in_current_chunk = self.chunk_size.try_into().unwrap();
|
||||
}
|
||||
|
||||
// Find the first newline character in the buffer.
|
||||
match memchr::memchr(b'\n', buf) {
|
||||
// If there is no newline character and the buffer is
|
||||
// empty, then we are done writing.
|
||||
None if buf.is_empty() => {
|
||||
return Ok(total_bytes_written);
|
||||
}
|
||||
|
||||
// If there is no newline character and the buffer is
|
||||
// not empty, then write as many bytes as we can and
|
||||
// then move on to the next chunk if necessary.
|
||||
None => {
|
||||
let end = self.num_bytes_remaining_in_current_chunk;
|
||||
let num_bytes_written = self.inner.write(&buf[..end])?;
|
||||
self.num_bytes_remaining_in_current_chunk -= num_bytes_written;
|
||||
total_bytes_written += num_bytes_written;
|
||||
buf = &buf[num_bytes_written..];
|
||||
}
|
||||
|
||||
// If there is a newline character and the line
|
||||
// (including the newline character) will fit in the
|
||||
// current chunk, then write the entire line and
|
||||
// continue to the next iteration. (See chunk 1 in the
|
||||
// example comment above.)
|
||||
Some(i) if i < self.num_bytes_remaining_in_current_chunk => {
|
||||
let num_bytes_written = self.inner.write(&buf[..i + 1])?;
|
||||
self.num_bytes_remaining_in_current_chunk -= num_bytes_written;
|
||||
total_bytes_written += num_bytes_written;
|
||||
buf = &buf[num_bytes_written..];
|
||||
}
|
||||
|
||||
// If there is a newline character, the line
|
||||
// (including the newline character) will not fit in
|
||||
// the current chunk, *and* no other lines have been
|
||||
// written to the current chunk, then write as many
|
||||
// bytes as we can and continue to the next
|
||||
// iteration. (See chunk 0 in the example comment
|
||||
// above.)
|
||||
Some(_)
|
||||
if self.num_bytes_remaining_in_current_chunk
|
||||
== self.chunk_size.try_into().unwrap() =>
|
||||
{
|
||||
let end = self.num_bytes_remaining_in_current_chunk;
|
||||
let num_bytes_written = self.inner.write(&buf[..end])?;
|
||||
self.num_bytes_remaining_in_current_chunk -= num_bytes_written;
|
||||
total_bytes_written += num_bytes_written;
|
||||
buf = &buf[num_bytes_written..];
|
||||
}
|
||||
|
||||
// If there is a newline character, the line
|
||||
// (including the newline character) will not fit in
|
||||
// the current chunk, and at least one other line has
|
||||
// been written to the current chunk, then signal to
|
||||
// the next iteration that a new chunk needs to be
|
||||
// created and continue to the next iteration of the
|
||||
// loop to try writing the line there.
|
||||
Some(_) => {
|
||||
self.num_bytes_remaining_in_current_chunk = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn flush(&mut self) -> std::io::Result<()> {
|
||||
self.inner.flush()
|
||||
}
|
||||
}
|
||||
|
||||
/// Split a file into a specific number of chunks by byte.
|
||||
///
|
||||
/// This function always creates one output file for each chunk, even
|
||||
|
@ -1027,7 +1208,7 @@ fn split(settings: &Settings) -> UResult<()> {
|
|||
},
|
||||
}
|
||||
}
|
||||
Strategy::Bytes(chunk_size) | Strategy::LineBytes(chunk_size) => {
|
||||
Strategy::Bytes(chunk_size) => {
|
||||
let mut writer = ByteChunkWriter::new(chunk_size, settings)
|
||||
.ok_or_else(|| USimpleError::new(1, "output file suffixes exhausted"))?;
|
||||
match std::io::copy(&mut reader, &mut writer) {
|
||||
|
@ -1046,6 +1227,25 @@ fn split(settings: &Settings) -> UResult<()> {
|
|||
},
|
||||
}
|
||||
}
|
||||
Strategy::LineBytes(chunk_size) => {
|
||||
let mut writer = LineBytesChunkWriter::new(chunk_size, settings)
|
||||
.ok_or_else(|| USimpleError::new(1, "output file suffixes exhausted"))?;
|
||||
match std::io::copy(&mut reader, &mut writer) {
|
||||
Ok(_) => Ok(()),
|
||||
Err(e) => match e.kind() {
|
||||
// TODO Since the writer object controls the creation of
|
||||
// new files, we need to rely on the `std::io::Result`
|
||||
// returned by its `write()` method to communicate any
|
||||
// errors to this calling scope. If a new file cannot be
|
||||
// created because we have exceeded the number of
|
||||
// allowable filenames, we use `ErrorKind::Other` to
|
||||
// indicate that. A special error message needs to be
|
||||
// printed in that case.
|
||||
ErrorKind::Other => Err(USimpleError::new(1, "output file suffixes exhausted")),
|
||||
_ => Err(uio_error!(e, "input/output error")),
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
// *
|
||||
// * For the full copyright and license information, please view the LICENSE
|
||||
// * file that was distributed with this source code.
|
||||
// spell-checker:ignore xzaaa sixhundredfiftyonebytes ninetyonebytes threebytes asciilowercase fghij klmno pqrst uvwxyz fivelines twohundredfortyonebytes onehundredlines
|
||||
// spell-checker:ignore xzaaa sixhundredfiftyonebytes ninetyonebytes threebytes asciilowercase fghij klmno pqrst uvwxyz fivelines twohundredfortyonebytes onehundredlines nbbbb
|
||||
extern crate rand;
|
||||
extern crate regex;
|
||||
|
||||
|
@ -595,3 +595,13 @@ fn test_lines_kth() {
|
|||
.succeeds()
|
||||
.stdout_only("20\n21\n22\n23\n24\n25\n26\n27\n28\n29\n");
|
||||
}
|
||||
|
||||
/// `-C 8` on `letters.txt` packs whole lines into 8-byte chunks and
/// breaks the over-long first line across two output files.
#[test]
fn test_line_bytes() {
    let (at, mut ucmd) = at_and_ucmd!();
    ucmd.args(&["-C", "8", "letters.txt"]).succeeds();

    // Output file name paired with the exact contents expected in it.
    let expected_chunks = [
        ("xaa", "aaaaaaaa"),
        ("xab", "a\nbbbb\n"),
        ("xac", "cccc\ndd\n"),
        ("xad", "ee\n"),
    ];
    for &(name, contents) in &expected_chunks {
        assert_eq!(at.read(name), contents);
    }
}
|
||||
|
|
5
tests/fixtures/split/letters.txt
vendored
Normal file
5
tests/fixtures/split/letters.txt
vendored
Normal file
|
@ -0,0 +1,5 @@
|
|||
aaaaaaaaa
|
||||
bbbb
|
||||
cccc
|
||||
dd
|
||||
ee
|
Loading…
Add table
Add a link
Reference in a new issue