mirror of
https://github.com/RGBCube/uutils-coreutils
synced 2025-07-28 11:37:44 +00:00
split: implement --line-bytes option
Implement the `--line-bytes` option to `split`. In this mode, the program tries to write as many lines of the input as possible to each chunk of output without exceeding a specified byte limit. The new `LineBytesChunkWriter` struct represents this functionality.
This commit is contained in:
parent
2e8945ba7f
commit
77d92883c7
3 changed files with 218 additions and 3 deletions
|
@ -5,7 +5,7 @@
|
||||||
// * For the full copyright and license information, please view the LICENSE
|
// * For the full copyright and license information, please view the LICENSE
|
||||||
// * file that was distributed with this source code.
|
// * file that was distributed with this source code.
|
||||||
|
|
||||||
// spell-checker:ignore (ToDO) PREFIXaa
|
// spell-checker:ignore (ToDO) PREFIXaa nbbbb ncccc
|
||||||
|
|
||||||
mod filenames;
|
mod filenames;
|
||||||
mod number;
|
mod number;
|
||||||
|
@ -760,6 +760,187 @@ impl<'a> Write for LineChunkWriter<'a> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Write lines to each sequential output files, limited by bytes.
|
||||||
|
///
|
||||||
|
/// This struct maintains an underlying writer representing the
|
||||||
|
/// current chunk of the output. On each call to [`write`], it writes
|
||||||
|
/// as many lines as possible to the current chunk without exceeding
|
||||||
|
/// the specified byte limit. If a single line has more bytes than the
|
||||||
|
/// limit, then fill an entire single chunk with those bytes and
|
||||||
|
/// handle the remainder of the line as if it were its own distinct
|
||||||
|
/// line. As many new underlying writers are created as needed to
|
||||||
|
/// write all the data in the input buffer.
|
||||||
|
struct LineBytesChunkWriter<'a> {
|
||||||
|
/// Parameters for creating the underlying writer for each new chunk.
|
||||||
|
settings: &'a Settings,
|
||||||
|
|
||||||
|
/// The maximum number of bytes allowed for a single chunk of output.
|
||||||
|
chunk_size: u64,
|
||||||
|
|
||||||
|
/// Running total of number of chunks that have been completed.
|
||||||
|
num_chunks_written: usize,
|
||||||
|
|
||||||
|
/// Remaining capacity in number of bytes in the current chunk.
|
||||||
|
///
|
||||||
|
/// This number starts at `chunk_size` and decreases as lines are
|
||||||
|
/// written. Once it reaches zero, a writer for a new chunk is
|
||||||
|
/// initialized and this number gets reset to `chunk_size`.
|
||||||
|
num_bytes_remaining_in_current_chunk: usize,
|
||||||
|
|
||||||
|
/// The underlying writer for the current chunk.
|
||||||
|
///
|
||||||
|
/// Once the number of bytes written to this writer exceeds
|
||||||
|
/// `chunk_size`, a new writer is initialized and assigned to this
|
||||||
|
/// field.
|
||||||
|
inner: BufWriter<Box<dyn Write>>,
|
||||||
|
|
||||||
|
/// Iterator that yields filenames for each chunk.
|
||||||
|
filename_iterator: FilenameIterator<'a>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a> LineBytesChunkWriter<'a> {
|
||||||
|
fn new(chunk_size: u64, settings: &'a Settings) -> Option<LineBytesChunkWriter<'a>> {
|
||||||
|
let mut filename_iterator = FilenameIterator::new(
|
||||||
|
&settings.prefix,
|
||||||
|
&settings.additional_suffix,
|
||||||
|
settings.suffix_length,
|
||||||
|
settings.suffix_type,
|
||||||
|
);
|
||||||
|
let filename = filename_iterator.next()?;
|
||||||
|
if settings.verbose {
|
||||||
|
println!("creating file {}", filename.quote());
|
||||||
|
}
|
||||||
|
let inner = platform::instantiate_current_writer(&settings.filter, &filename);
|
||||||
|
Some(LineBytesChunkWriter {
|
||||||
|
settings,
|
||||||
|
chunk_size,
|
||||||
|
num_bytes_remaining_in_current_chunk: chunk_size.try_into().unwrap(),
|
||||||
|
num_chunks_written: 0,
|
||||||
|
inner,
|
||||||
|
filename_iterator,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a> Write for LineBytesChunkWriter<'a> {
|
||||||
|
/// Write as many lines to a chunk as possible without
|
||||||
|
/// exceeding the byte limit. If a single line has more bytes
|
||||||
|
/// than the limit, then fill an entire single chunk with those
|
||||||
|
/// bytes and handle the remainder of the line as if it were
|
||||||
|
/// its own distinct line.
|
||||||
|
///
|
||||||
|
/// For example: if the `chunk_size` is 8 and the input is:
|
||||||
|
///
|
||||||
|
/// ```text
|
||||||
|
/// aaaaaaaaa\nbbbb\ncccc\ndd\nee\n
|
||||||
|
/// ```
|
||||||
|
///
|
||||||
|
/// then the output gets broken into chunks like this:
|
||||||
|
///
|
||||||
|
/// ```text
|
||||||
|
/// chunk 0 chunk 1 chunk 2 chunk 3
|
||||||
|
///
|
||||||
|
/// 0 1 2
|
||||||
|
/// 01234567 89 01234 56789 012 345 6
|
||||||
|
/// |------| |-------| |--------| |---|
|
||||||
|
/// aaaaaaaa a\nbbbb\n cccc\ndd\n ee\n
|
||||||
|
/// ```
|
||||||
|
fn write(&mut self, mut buf: &[u8]) -> std::io::Result<usize> {
|
||||||
|
// The total number of bytes written during the loop below.
|
||||||
|
//
|
||||||
|
// It is necessary to keep this running total because we may
|
||||||
|
// be making multiple calls to `write()` on multiple different
|
||||||
|
// underlying writers and we want the final reported number of
|
||||||
|
// bytes written to reflect the total number of bytes written
|
||||||
|
// to all of the underlying writers.
|
||||||
|
let mut total_bytes_written = 0;
|
||||||
|
|
||||||
|
// Loop until we have written all bytes in the input buffer
|
||||||
|
// (or an IO error occurs).
|
||||||
|
loop {
|
||||||
|
// If we have filled the current chunk with bytes, then
|
||||||
|
// start a new chunk and initialize its corresponding
|
||||||
|
// writer.
|
||||||
|
if self.num_bytes_remaining_in_current_chunk == 0 {
|
||||||
|
self.num_chunks_written += 1;
|
||||||
|
let filename = self.filename_iterator.next().ok_or_else(|| {
|
||||||
|
std::io::Error::new(ErrorKind::Other, "output file suffixes exhausted")
|
||||||
|
})?;
|
||||||
|
if self.settings.verbose {
|
||||||
|
println!("creating file {}", filename.quote());
|
||||||
|
}
|
||||||
|
self.inner = platform::instantiate_current_writer(&self.settings.filter, &filename);
|
||||||
|
self.num_bytes_remaining_in_current_chunk = self.chunk_size.try_into().unwrap();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Find the first newline character in the buffer.
|
||||||
|
match memchr::memchr(b'\n', buf) {
|
||||||
|
// If there is no newline character and the buffer is
|
||||||
|
// empty, then we are done writing.
|
||||||
|
None if buf.is_empty() => {
|
||||||
|
return Ok(total_bytes_written);
|
||||||
|
}
|
||||||
|
|
||||||
|
// If there is no newline character and the buffer is
|
||||||
|
// not empty, then write as many bytes as we can and
|
||||||
|
// then move on to the next chunk if necessary.
|
||||||
|
None => {
|
||||||
|
let end = self.num_bytes_remaining_in_current_chunk;
|
||||||
|
let num_bytes_written = self.inner.write(&buf[..end])?;
|
||||||
|
self.num_bytes_remaining_in_current_chunk -= num_bytes_written;
|
||||||
|
total_bytes_written += num_bytes_written;
|
||||||
|
buf = &buf[num_bytes_written..];
|
||||||
|
}
|
||||||
|
|
||||||
|
// If there is a newline character and the line
|
||||||
|
// (including the newline character) will fit in the
|
||||||
|
// current chunk, then write the entire line and
|
||||||
|
// continue to the next iteration. (See chunk 1 in the
|
||||||
|
// example comment above.)
|
||||||
|
Some(i) if i < self.num_bytes_remaining_in_current_chunk => {
|
||||||
|
let num_bytes_written = self.inner.write(&buf[..i + 1])?;
|
||||||
|
self.num_bytes_remaining_in_current_chunk -= num_bytes_written;
|
||||||
|
total_bytes_written += num_bytes_written;
|
||||||
|
buf = &buf[num_bytes_written..];
|
||||||
|
}
|
||||||
|
|
||||||
|
// If there is a newline character, the line
|
||||||
|
// (including the newline character) will not fit in
|
||||||
|
// the current chunk, *and* no other lines have been
|
||||||
|
// written to the current chunk, then write as many
|
||||||
|
// bytes as we can and continue to the next
|
||||||
|
// iteration. (See chunk 0 in the example comment
|
||||||
|
// above.)
|
||||||
|
Some(_)
|
||||||
|
if self.num_bytes_remaining_in_current_chunk
|
||||||
|
== self.chunk_size.try_into().unwrap() =>
|
||||||
|
{
|
||||||
|
let end = self.num_bytes_remaining_in_current_chunk;
|
||||||
|
let num_bytes_written = self.inner.write(&buf[..end])?;
|
||||||
|
self.num_bytes_remaining_in_current_chunk -= num_bytes_written;
|
||||||
|
total_bytes_written += num_bytes_written;
|
||||||
|
buf = &buf[num_bytes_written..];
|
||||||
|
}
|
||||||
|
|
||||||
|
// If there is a newline character, the line
|
||||||
|
// (including the newline character) will not fit in
|
||||||
|
// the current chunk, and at least one other line has
|
||||||
|
// been written to the current chunk, then signal to
|
||||||
|
// the next iteration that a new chunk needs to be
|
||||||
|
// created and continue to the next iteration of the
|
||||||
|
// loop to try writing the line there.
|
||||||
|
Some(_) => {
|
||||||
|
self.num_bytes_remaining_in_current_chunk = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn flush(&mut self) -> std::io::Result<()> {
|
||||||
|
self.inner.flush()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// Split a file into a specific number of chunks by byte.
|
/// Split a file into a specific number of chunks by byte.
|
||||||
///
|
///
|
||||||
/// This function always creates one output file for each chunk, even
|
/// This function always creates one output file for each chunk, even
|
||||||
|
@ -1027,7 +1208,7 @@ fn split(settings: &Settings) -> UResult<()> {
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Strategy::Bytes(chunk_size) | Strategy::LineBytes(chunk_size) => {
|
Strategy::Bytes(chunk_size) => {
|
||||||
let mut writer = ByteChunkWriter::new(chunk_size, settings)
|
let mut writer = ByteChunkWriter::new(chunk_size, settings)
|
||||||
.ok_or_else(|| USimpleError::new(1, "output file suffixes exhausted"))?;
|
.ok_or_else(|| USimpleError::new(1, "output file suffixes exhausted"))?;
|
||||||
match std::io::copy(&mut reader, &mut writer) {
|
match std::io::copy(&mut reader, &mut writer) {
|
||||||
|
@ -1046,6 +1227,25 @@ fn split(settings: &Settings) -> UResult<()> {
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Strategy::LineBytes(chunk_size) => {
|
||||||
|
let mut writer = LineBytesChunkWriter::new(chunk_size, settings)
|
||||||
|
.ok_or_else(|| USimpleError::new(1, "output file suffixes exhausted"))?;
|
||||||
|
match std::io::copy(&mut reader, &mut writer) {
|
||||||
|
Ok(_) => Ok(()),
|
||||||
|
Err(e) => match e.kind() {
|
||||||
|
// TODO Since the writer object controls the creation of
|
||||||
|
// new files, we need to rely on the `std::io::Result`
|
||||||
|
// returned by its `write()` method to communicate any
|
||||||
|
// errors to this calling scope. If a new file cannot be
|
||||||
|
// created because we have exceeded the number of
|
||||||
|
// allowable filenames, we use `ErrorKind::Other` to
|
||||||
|
// indicate that. A special error message needs to be
|
||||||
|
// printed in that case.
|
||||||
|
ErrorKind::Other => Err(USimpleError::new(1, "output file suffixes exhausted")),
|
||||||
|
_ => Err(uio_error!(e, "input/output error")),
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -2,7 +2,7 @@
|
||||||
// *
|
// *
|
||||||
// * For the full copyright and license information, please view the LICENSE
|
// * For the full copyright and license information, please view the LICENSE
|
||||||
// * file that was distributed with this source code.
|
// * file that was distributed with this source code.
|
||||||
// spell-checker:ignore xzaaa sixhundredfiftyonebytes ninetyonebytes threebytes asciilowercase fghij klmno pqrst uvwxyz fivelines twohundredfortyonebytes onehundredlines
|
// spell-checker:ignore xzaaa sixhundredfiftyonebytes ninetyonebytes threebytes asciilowercase fghij klmno pqrst uvwxyz fivelines twohundredfortyonebytes onehundredlines nbbbb
|
||||||
extern crate rand;
|
extern crate rand;
|
||||||
extern crate regex;
|
extern crate regex;
|
||||||
|
|
||||||
|
@ -595,3 +595,13 @@ fn test_lines_kth() {
|
||||||
.succeeds()
|
.succeeds()
|
||||||
.stdout_only("20\n21\n22\n23\n24\n25\n26\n27\n28\n29\n");
|
.stdout_only("20\n21\n22\n23\n24\n25\n26\n27\n28\n29\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// With `-C 8`, each output file receives as many whole lines as fit
/// in eight bytes, and an over-long line is cut at the limit.
#[test]
fn test_line_bytes() {
    let (at, mut ucmd) = at_and_ucmd!();
    ucmd.args(&["-C", "8", "letters.txt"]).succeeds();

    // Output file name paired with the exact contents it must hold.
    let expected_chunks = [
        ("xaa", "aaaaaaaa"),
        ("xab", "a\nbbbb\n"),
        ("xac", "cccc\ndd\n"),
        ("xad", "ee\n"),
    ];
    for &(name, contents) in &expected_chunks {
        assert_eq!(at.read(name), contents);
    }
}
|
||||||
|
|
5
tests/fixtures/split/letters.txt
vendored
Normal file
5
tests/fixtures/split/letters.txt
vendored
Normal file
|
@ -0,0 +1,5 @@
|
||||||
|
aaaaaaaaa
|
||||||
|
bbbb
|
||||||
|
cccc
|
||||||
|
dd
|
||||||
|
ee
|
Loading…
Add table
Add a link
Reference in a new issue