1
Fork 0
mirror of https://github.com/RGBCube/uutils-coreutils synced 2025-07-28 19:47:45 +00:00

Fix a bug in split where chunking would be skipped when the chunk size (#3800)

* Fix a bug in split where chunking would be skipped when the chunk size
happened to be an exact multiple of the buffer size used to read the
input stream.

The issue here was that the file was being split byte-wise in chunks of 1G.
The input stream was being read in chunks of 8KB, which evenly divides
the chunk size. Because the check to allocate the next output chunk was
previously done at the bottom of the loop, it would never occur: the
current input chunk had already been fully consumed at that point. By moving
the check to the top of the loop (but still late enough that we know we have
bytes to write) we resolve this issue.

This scenario is unfortunately hard to write a test for, since we don't
explicitly control the input chunk size.

Fixes https://github.com/uutils/coreutils/issues/3790
This commit is contained in:
Owen Anderson 2022-08-16 02:02:52 -07:00 committed by GitHub
parent 5ecabb8467
commit 9fad6fde35
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 31 additions and 21 deletions

View file

@ -618,6 +618,21 @@ impl<'a> Write for ByteChunkWriter<'a> {
return Ok(carryover_bytes_written);
}
if self.num_bytes_remaining_in_current_chunk == 0 {
// Increment the chunk number, reset the number of bytes remaining, and instantiate the new underlying writer.
self.num_chunks_written += 1;
self.num_bytes_remaining_in_current_chunk = self.chunk_size;
// Allocate the new file, since at this point we know there are bytes to be written to it.
let filename = self.filename_iterator.next().ok_or_else(|| {
std::io::Error::new(ErrorKind::Other, "output file suffixes exhausted")
})?;
if self.settings.verbose {
println!("creating file {}", filename.quote());
}
self.inner = self.settings.instantiate_current_writer(&filename)?;
}
// If the capacity of this chunk is greater than the number of
// bytes in `buf`, then write all the bytes in `buf`. Otherwise,
// write enough bytes to fill the current chunk, then increment
@ -635,38 +650,18 @@ impl<'a> Write for ByteChunkWriter<'a> {
// n, which is already usize.
let i = self.num_bytes_remaining_in_current_chunk as usize;
let num_bytes_written = self.inner.write(&buf[..i])?;
self.num_bytes_remaining_in_current_chunk -= num_bytes_written as u64;
// It's possible that the underlying writer did not
// write all the bytes.
if num_bytes_written < i {
self.num_bytes_remaining_in_current_chunk -= num_bytes_written as u64;
return Ok(carryover_bytes_written + num_bytes_written);
} else {
// Move the window to look at only the remaining bytes.
buf = &buf[i..];
// Increment the chunk number, reset the number of
// bytes remaining, and instantiate the new
// underlying writer.
self.num_chunks_written += 1;
self.num_bytes_remaining_in_current_chunk = self.chunk_size;
// Remember for the next iteration that we wrote these bytes.
carryover_bytes_written += num_bytes_written;
// Only create the writer for the next chunk if
// there are any remaining bytes to write. This
// check prevents us from creating a new empty
// file.
if !buf.is_empty() {
let filename = self.filename_iterator.next().ok_or_else(|| {
std::io::Error::new(ErrorKind::Other, "output file suffixes exhausted")
})?;
if self.settings.verbose {
println!("creating file {}", filename.quote());
}
self.inner = self.settings.instantiate_current_writer(&filename)?;
}
}
}
}

View file

@ -683,3 +683,18 @@ fn test_guard_input() {
.stderr_only("split: 'xaa' would overwrite input; aborting");
assert_eq!(at.read("xaa"), "1\n2\n3\n");
}
/// Regression test: split a 16 * 1024 byte file with `-b 8K` and check that
/// exactly two output files of 8 * 1024 bytes each are produced, and that
/// the collated output equals the bytes of the input file.
#[test]
fn test_multiple_of_input_chunk() {
    let (at, mut ucmd) = at_and_ucmd!();
    let input = "multiple_of_input_chunk";
    RandomFile::new(&at, input).add_bytes(16 * 1024);
    ucmd.args(&["-b", "8K", input, "b"]).succeeds();

    // Match the output files: prefix `b` followed by two alphabetic characters.
    let pieces = Glob::new(&at, ".", r"b[[:alpha:]][[:alpha:]]$");
    assert_eq!(pieces.count(), 2);

    // Every piece must be exactly one full chunk long.
    let expected_len = 8 * 1024;
    for piece in pieces.collect() {
        assert_eq!(pieces.directory.metadata(&piece).len(), expected_len);
    }

    assert_eq!(pieces.collate(), at.read_bytes(input));
}