
split: pass GNU tests/b-chunk.sh (#5475)

---------

Co-authored-by: Terts Diepraam <terts.diepraam@gmail.com>
Co-authored-by: Daniel Hofstetter <daniel.hofstetter@42dh.com>
Co-authored-by: Brandon Elam Barker <brandon.barker@gmail.com>
Co-authored-by: Kostiantyn Hryshchuk <statheres@gmail.com>
Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>
Yury Zhytkou 2023-11-17 11:19:10 -05:00 committed by GitHub
parent a7e5af4770
commit eb00c195c6
2 changed files with 446 additions and 375 deletions


@@ -18,11 +18,12 @@ use std::ffi::OsString;
use std::fmt; use std::fmt;
use std::fs::{metadata, File}; use std::fs::{metadata, File};
use std::io; use std::io;
use std::io::{stdin, BufRead, BufReader, BufWriter, ErrorKind, Read, Write}; use std::io::{stdin, BufRead, BufReader, BufWriter, ErrorKind, Read, Seek, SeekFrom, Write};
use std::path::Path; use std::path::Path;
use std::u64; use std::u64;
use uucore::display::Quotable; use uucore::display::Quotable;
use uucore::error::{FromIo, UIoError, UResult, USimpleError, UUsageError}; use uucore::error::{FromIo, UIoError, UResult, USimpleError, UUsageError};
use uucore::parse_size::parse_size_u64;
use uucore::uio_error; use uucore::uio_error;
use uucore::{format_usage, help_about, help_section, help_usage}; use uucore::{format_usage, help_about, help_section, help_usage};
@@ -40,11 +41,20 @@ static OPT_HEX_SUFFIXES_SHORT: &str = "-x";
static OPT_SUFFIX_LENGTH: &str = "suffix-length"; static OPT_SUFFIX_LENGTH: &str = "suffix-length";
static OPT_VERBOSE: &str = "verbose"; static OPT_VERBOSE: &str = "verbose";
static OPT_SEPARATOR: &str = "separator"; static OPT_SEPARATOR: &str = "separator";
// The ---io and ---io-blksize parameters are consumed and ignored.
// They are included only to make the GNU coreutils tests pass.
static OPT_IO: &str = "-io";
static OPT_IO_BLKSIZE: &str = "-io-blksize";
static OPT_ELIDE_EMPTY_FILES: &str = "elide-empty-files"; static OPT_ELIDE_EMPTY_FILES: &str = "elide-empty-files";
static OPT_IO_BLKSIZE: &str = "-io-blksize";
// Cap the ---io-blksize value.
// On 64-bit systems the maximum value is the same as GNU's
// and is equivalent to the `i32::MAX >> 20 << 20` operation.
// On 32-bit systems, however, even though that value fits within `u32` and `i32`,
// it causes rust-lang `library/alloc/src/raw_vec.rs` to panic with a 'capacity overflow' error,
// possibly due to how `std::io::BufReader` handles its internal buffers,
// so a much smaller value is used there.
static OPT_IO_BLKSIZE_MAX: usize = if usize::BITS >= 64 {
2_146_435_072
} else {
1_000_000_000
};
static ARG_INPUT: &str = "input"; static ARG_INPUT: &str = "input";
static ARG_PREFIX: &str = "prefix"; static ARG_PREFIX: &str = "prefix";
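As a quick sanity check of the 64-bit cap introduced above: the constant is simply `i32::MAX` rounded down to a 1 MiB boundary (2047 MiB). A purely illustrative test, not part of this commit, would be:

#[test]
fn illustrative_io_blksize_cap_value() {
    // 2047 MiB == (i32::MAX >> 20) << 20 == 2_146_435_072 bytes
    assert_eq!((i32::MAX >> 20 << 20) as usize, 2_146_435_072);
    assert_eq!(2047 * 1024 * 1024, 2_146_435_072_usize);
}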
@@ -311,7 +321,6 @@ pub fn uu_app() -> Command {
.arg( .arg(
Arg::new(OPT_NUMERIC_SUFFIXES) Arg::new(OPT_NUMERIC_SUFFIXES)
.long(OPT_NUMERIC_SUFFIXES) .long(OPT_NUMERIC_SUFFIXES)
.alias("numeric")
.require_equals(true) .require_equals(true)
.num_args(0..=1) .num_args(0..=1)
.overrides_with_all([ .overrides_with_all([
@@ -338,7 +347,6 @@ pub fn uu_app() -> Command {
.arg( .arg(
Arg::new(OPT_HEX_SUFFIXES) Arg::new(OPT_HEX_SUFFIXES)
.long(OPT_HEX_SUFFIXES) .long(OPT_HEX_SUFFIXES)
.alias("hex")
.require_equals(true) .require_equals(true)
.num_args(0..=1) .num_args(0..=1)
.overrides_with_all([ .overrides_with_all([
@@ -373,12 +381,6 @@ pub fn uu_app() -> Command {
.action(ArgAction::Append) .action(ArgAction::Append)
.help("use SEP instead of newline as the record separator; '\\0' (zero) specifies the NUL character"), .help("use SEP instead of newline as the record separator; '\\0' (zero) specifies the NUL character"),
) )
.arg(
Arg::new(OPT_IO)
.long("io")
.alias(OPT_IO)
.hide(true),
)
.arg( .arg(
Arg::new(OPT_IO_BLKSIZE) Arg::new(OPT_IO_BLKSIZE)
.long("io-blksize") .long("io-blksize")
@@ -419,6 +421,7 @@ struct Settings {
/// chunks. If this is `false`, then empty files will not be /// chunks. If this is `false`, then empty files will not be
/// created. /// created.
elide_empty_files: bool, elide_empty_files: bool,
io_blksize: Option<usize>,
} }
/// An error when parsing settings from command-line arguments. /// An error when parsing settings from command-line arguments.
@@ -441,6 +444,9 @@ enum SettingsError {
/// r/K/N /// r/K/N
FilterWithKthChunkNumber, FilterWithKthChunkNumber,
/// Invalid IO block size
InvalidIOBlockSize(String),
/// The `--filter` option is not supported on Windows. /// The `--filter` option is not supported on Windows.
#[cfg(windows)] #[cfg(windows)]
NotSupported, NotSupported,
@@ -471,6 +477,7 @@ impl fmt::Display for SettingsError {
Self::FilterWithKthChunkNumber => { Self::FilterWithKthChunkNumber => {
write!(f, "--filter does not process a chunk extracted to stdout") write!(f, "--filter does not process a chunk extracted to stdout")
} }
Self::InvalidIOBlockSize(s) => write!(f, "invalid IO block size: {}", s.quote()),
#[cfg(windows)] #[cfg(windows)]
Self::NotSupported => write!( Self::NotSupported => write!(
f, f,
@@ -499,12 +506,29 @@ impl Settings {
match first.as_str() { match first.as_str() {
"\\0" => b'\0', "\\0" => b'\0',
s if s.as_bytes().len() == 1 => s.as_bytes()[0], s if s.as_bytes().len() == 1 => s.as_bytes()[0],
s => return Err(SettingsError::MultiCharacterSeparator(s.to_owned())), s => return Err(SettingsError::MultiCharacterSeparator(s.to_string())),
} }
} }
None => b'\n', None => b'\n',
}; };
let io_blksize: Option<usize> = if let Some(s) = matches.get_one::<String>(OPT_IO_BLKSIZE) {
match parse_size_u64(s) {
Ok(n) => {
let n: usize = n
.try_into()
.map_err(|_| SettingsError::InvalidIOBlockSize(s.to_string()))?;
if n > OPT_IO_BLKSIZE_MAX {
return Err(SettingsError::InvalidIOBlockSize(s.to_string()));
}
Some(n)
}
_ => return Err(SettingsError::InvalidIOBlockSize(s.to_string())),
}
} else {
None
};
let result = Self { let result = Self {
prefix: matches.get_one::<String>(ARG_PREFIX).unwrap().clone(), prefix: matches.get_one::<String>(ARG_PREFIX).unwrap().clone(),
suffix, suffix,
@@ -514,6 +538,7 @@ impl Settings {
verbose: matches.value_source(OPT_VERBOSE) == Some(ValueSource::CommandLine), verbose: matches.value_source(OPT_VERBOSE) == Some(ValueSource::CommandLine),
separator, separator,
elide_empty_files: matches.get_flag(OPT_ELIDE_EMPTY_FILES), elide_empty_files: matches.get_flag(OPT_ELIDE_EMPTY_FILES),
io_blksize,
}; };
#[cfg(windows)] #[cfg(windows)]
@@ -591,6 +616,93 @@ fn custom_write_all<T: Write>(
} }
} }
/// Get the size of the input in bytes.
/// Used only for the subset of `--number=CHUNKS` strategies that need
/// to determine the input size upfront in order to know how many bytes
/// to write into each of the N files/chunks:
/// * N split into N files based on size of input
/// * K/N output Kth of N to stdout
/// * l/N split into N files without splitting lines/records
/// * l/K/N output Kth of N to stdout without splitting lines/records
///
/// For most files the size is determined either by reading the entire file content into a buffer
/// or by the `len()` function of [`std::fs::metadata`].
///
/// However, for some files whose filesystem metadata size does not match
/// their actual content size, we need to attempt to find the end of the file
/// with a direct `seek()` on [`std::fs::File`].
///
/// For a STDIN stream, read into a buffer up to a limit;
/// if the input stream does not reach EOF before that, return an error
/// (e.g. "infinite" input as in `cat /dev/zero | split ...`, `yes | split ...`, etc.).
///
/// Note: `buf` might end up holding either part or all of the input content.
fn get_input_size<R>(
input: &String,
reader: &mut R,
buf: &mut Vec<u8>,
io_blksize: &Option<usize>,
) -> std::io::Result<u64>
where
R: BufRead,
{
// Set read limit to io_blksize if specified
// Otherwise to OPT_IO_BLKSIZE_MAX
let read_limit = io_blksize.unwrap_or(OPT_IO_BLKSIZE_MAX) as u64;
// Try to read into buffer up to a limit
let num_bytes = reader
.by_ref()
.take(read_limit)
.read_to_end(buf)
.map(|n| n as u64)?;
if num_bytes < read_limit {
// Finite file or STDIN stream that fits entirely
// into a buffer within the limit
// Note: files like /dev/null or similar,
// empty STDIN stream,
// and files with true file size 0
// will also fit here
Ok(num_bytes)
} else if input == "-" {
// STDIN stream that did not fit all content into a buffer
// Most likely continuous/infinite input stream
return Err(io::Error::new(
ErrorKind::Other,
format!("{}: cannot determine input size", input),
));
} else {
// Could be that file size is larger than set read limit
// Get the file size from filesystem metadata
let metadata = metadata(input)?;
let metadata_size = metadata.len();
if num_bytes <= metadata_size {
Ok(metadata_size)
} else {
// Could be a file from locations like /dev, /sys, /proc or similar
// which report filesystem metadata size that does not match
// their actual content size
// Attempt direct `seek()` for the end of a file
let mut tmp_fd = File::open(Path::new(input))?;
let end = tmp_fd.seek(SeekFrom::End(0))?;
if end > 0 {
Ok(end)
} else {
// Edge case of either an "infinite" file (e.g. /dev/zero)
// or some other "special" non-standard file type
// Give up and return an error
// TODO It might be possible to do more here
// to address all possible file types and edge cases
return Err(io::Error::new(
ErrorKind::Other,
format!("{}: cannot determine file size", input),
));
}
}
}
}
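A minimal usage sketch of the helper above (illustrative only, not part of this commit; the path is hypothetical and the imports are the ones already at the top of this file):

fn example_input_size() -> std::io::Result<u64> {
    let input = String::from("some_input.txt"); // hypothetical file name
    let mut reader = BufReader::new(File::open(Path::new(&input))?);
    // `buf` may end up holding part or all of the input; the chunking
    // strategies chain it back in front of `reader` so nothing is lost.
    let mut buf = Vec::new();
    get_input_size(&input, &mut reader, &mut buf, &None)
}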
/// Write a certain number of bytes to one file, then move on to another one. /// Write a certain number of bytes to one file, then move on to another one.
/// ///
/// This struct maintains an underlying writer representing the /// This struct maintains an underlying writer representing the
@@ -1018,155 +1130,110 @@ impl<'a> Write for LineBytesChunkWriter<'a> {
} }
} }
/// Split a file into a specific number of chunks by byte. /// Split a file or STDIN into a specific number of chunks by byte.
/// If in Kth chunk of N mode - print the k-th chunk to STDOUT.
/// ///
/// This function always creates one output file for each chunk, even /// When file size cannot be evenly divided into the number of chunks of the same size,
/// the first X chunks are 1 byte longer than the rest,
/// where X is the remainder of (file size % number of chunks)
///
/// In Kth chunk of N mode - writes to stdout the contents of the chunk identified by `kth_chunk`
///
/// In N chunks mode - this function always creates one output file for each chunk, even
/// if there is an error reading or writing one of the chunks or if /// if there is an error reading or writing one of the chunks or if
/// the input file is truncated. However, if the `filter` option is /// the input file is truncated. However, if the `--filter` option is
/// being used, then no files are created. /// being used, then files will only be created if `$FILE` variable was used
/// in filter command,
/// i.e. `split -n 10 --filter='head -c1 > $FILE' in`
/// ///
/// # Errors /// # Errors
/// ///
/// This function returns an error if there is a problem reading from /// This function returns an error if there is a problem reading from
/// `reader` or writing to one of the output files. /// `reader` or writing to one of the output files or stdout.
///
/// # See also
///
/// * [`n_chunks_by_line`], which splits its input into a specific number of chunks by line.
/// ///
/// Implements `--number=CHUNKS` /// Implements `--number=CHUNKS`
/// Where CHUNKS /// Where CHUNKS
/// * N /// * N
fn split_into_n_chunks_by_byte<R>( /// * K/N
fn n_chunks_by_byte<R>(
settings: &Settings, settings: &Settings,
reader: &mut R, reader: &mut R,
num_chunks: u64, num_chunks: u64,
kth_chunk: Option<u64>,
) -> UResult<()> ) -> UResult<()>
where where
R: Read, R: BufRead,
{ {
// Get the size of the input file in bytes and compute the number // Get the size of the input in bytes
// of bytes per chunk. let initial_buf = &mut Vec::new();
// let mut num_bytes = get_input_size(&settings.input, reader, initial_buf, &settings.io_blksize)?;
// If the requested number of chunks exceeds the number of bytes let mut reader = initial_buf.chain(reader);
// in the file *and* the `elide_empty_files` parameter is enabled,
// then behave as if the number of chunks was set to the number of
// bytes in the file. This ensures that we don't write empty
// files. Otherwise, just write the `num_chunks - num_bytes` empty
// files.
let metadata = metadata(&settings.input).map_err(|_| {
USimpleError::new(1, format!("{}: cannot determine file size", settings.input))
})?;
let num_bytes = metadata.len(); // If input file is empty and we would not have determined the Kth chunk
let will_have_empty_files = settings.elide_empty_files && num_chunks > num_bytes; // in the Kth chunk of N chunk mode, then terminate immediately.
let (num_chunks, chunk_size) = if will_have_empty_files { // This happens on `split -n 3/10 /dev/null`, for example.
let num_chunks = num_bytes; if kth_chunk.is_some() && num_bytes == 0 {
let chunk_size = 1; return Ok(());
(num_chunks, chunk_size) }
// If the requested number of chunks exceeds the number of bytes
// in the input:
// * in Kth chunk of N mode - just write empty byte string to stdout
// NOTE: the `elide_empty_files` parameter is ignored here
// as we do not generate any files
// and instead writing to stdout
// * In N chunks mode - if the `elide_empty_files` parameter is enabled,
// then behave as if the number of chunks was set to the number of
// bytes in the file. This ensures that we don't write empty
// files. Otherwise, just write the `num_chunks - num_bytes` empty files.
let num_chunks = if kth_chunk.is_none() && settings.elide_empty_files && num_chunks > num_bytes
{
num_bytes
} else { } else {
let chunk_size = (num_bytes / (num_chunks)).max(1); num_chunks
(num_chunks, chunk_size)
}; };
// If we would have written zero chunks of output, then terminate // If we would have written zero chunks of output, then terminate
// immediately. This happens on `split -e -n 3 /dev/null`, for // immediately. This happens on `split -e -n 3 /dev/null`, for
// example. // example.
if num_chunks == 0 || num_bytes == 0 { if num_chunks == 0 {
return Ok(()); return Ok(());
} }
let num_chunks: usize = num_chunks // In Kth chunk of N mode - we will write to stdout instead of to a file.
.try_into() let mut stdout_writer = std::io::stdout().lock();
.map_err(|_| USimpleError::new(1, "Number of chunks too big"))?; // In N chunks mode - we will write to `num_chunks` files
// This object is responsible for creating the filename for each chunk.
let mut filename_iterator = FilenameIterator::new(&settings.prefix, &settings.suffix)?;
// Create one writer for each chunk. This will create each
// of the underlying files (if not in `--filter` mode).
let mut writers = vec![]; let mut writers = vec![];
for _ in 0..num_chunks {
let filename = filename_iterator // Calculate chunk size base and modulo reminder
.next() // to be used in calculating chunk_size later on
.ok_or_else(|| USimpleError::new(1, "output file suffixes exhausted"))?; let chunk_size_base = num_bytes / num_chunks;
let writer = settings.instantiate_current_writer(filename.as_str())?; let chunk_size_reminder = num_bytes % num_chunks;
writers.push(writer);
// If in N chunks mode
// Create one writer for each chunk.
// This will create each of the underlying files
// or stdin pipes to child shell/command processes if in `--filter` mode
if kth_chunk.is_none() {
// This object is responsible for creating the filename for each chunk.
let mut filename_iterator = FilenameIterator::new(&settings.prefix, &settings.suffix)
.map_err(|e| io::Error::new(ErrorKind::Other, format!("{e}")))?;
for _ in 0..num_chunks {
let filename = filename_iterator
.next()
.ok_or_else(|| USimpleError::new(1, "output file suffixes exhausted"))?;
let writer = settings.instantiate_current_writer(filename.as_str())?;
writers.push(writer);
}
} }
// Write `chunk_size` bytes from the reader into each writer for i in 1_u64..=num_chunks {
// except the last. let chunk_size = chunk_size_base + (chunk_size_reminder > i - 1) as u64;
// let buf = &mut Vec::new();
// The last writer gets all remaining bytes so that if the number
// of bytes in the input file was not evenly divisible by
// `num_chunks`, we don't leave any bytes behind.
for writer in writers.iter_mut().take(num_chunks - 1) {
match io::copy(&mut reader.by_ref().take(chunk_size), writer) {
Ok(_) => continue,
Err(e) if ignorable_io_error(&e, settings) => continue,
Err(e) => return Err(uio_error!(e, "input/output error")),
};
}
// Write all the remaining bytes to the last chunk.
let i = num_chunks - 1;
let last_chunk_size = num_bytes - (chunk_size * (num_chunks as u64 - 1));
match io::copy(&mut reader.by_ref().take(last_chunk_size), &mut writers[i]) {
Ok(_) => Ok(()),
Err(e) if ignorable_io_error(&e, settings) => Ok(()),
Err(e) => Err(uio_error!(e, "input/output error")),
}
}
/// Print the k-th chunk of a file to stdout, splitting by byte.
///
/// This function is like [`split_into_n_chunks_by_byte`], but instead
/// of writing each chunk to its own file, it only writes to stdout
/// the contents of the chunk identified by `chunk_number`
///
/// # Errors
///
/// This function returns an error if there is a problem reading from
/// `reader` or writing to stdout.
///
/// Implements `--number=CHUNKS`
/// Where CHUNKS
/// * K/N
fn kth_chunks_by_byte<R>(
settings: &Settings,
reader: &mut R,
chunk_number: u64,
num_chunks: u64,
) -> UResult<()>
where
R: BufRead,
{
// Get the size of the input file in bytes and compute the number
// of bytes per chunk.
//
// If the requested number of chunks exceeds the number of bytes
// in the file - just write empty byte string to stdout
// NOTE: the `elide_empty_files` parameter is ignored here
// as we do not generate any files
// and instead writing to stdout
let metadata = metadata(&settings.input).map_err(|_| {
USimpleError::new(1, format!("{}: cannot determine file size", settings.input))
})?;
let num_bytes = metadata.len();
// If input file is empty and we would have written zero chunks of output,
// then terminate immediately.
// This happens on `split -e -n 3 /dev/null`, for example.
if num_bytes == 0 {
return Ok(());
}
// Write to stdout instead of to a file.
let stdout = std::io::stdout();
let mut writer = stdout.lock();
let chunk_size = (num_bytes / (num_chunks)).max(1);
let mut num_bytes: usize = num_bytes.try_into().unwrap();
let mut i = 1;
loop {
let buf: &mut Vec<u8> = &mut vec![];
if num_bytes > 0 { if num_bytes > 0 {
// Read `chunk_size` bytes from the reader into `buf` // Read `chunk_size` bytes from the reader into `buf`
// except the last. // except the last.
@@ -1176,15 +1243,17 @@ where
// `num_chunks`, we don't leave any bytes behind. // `num_chunks`, we don't leave any bytes behind.
let limit = { let limit = {
if i == num_chunks { if i == num_chunks {
num_bytes.try_into().unwrap() num_bytes
} else { } else {
chunk_size chunk_size
} }
}; };
let n_bytes_read = reader.by_ref().take(limit).read_to_end(buf); let n_bytes_read = reader.by_ref().take(limit).read_to_end(buf);
match n_bytes_read { match n_bytes_read {
Ok(n_bytes) => { Ok(n_bytes) => {
num_bytes -= n_bytes; num_bytes -= n_bytes as u64;
} }
Err(error) => { Err(error) => {
return Err(USimpleError::new( return Err(USimpleError::new(
@@ -1193,11 +1262,20 @@ where
)); ));
} }
} }
if i == chunk_number {
writer.write_all(buf)?; match kth_chunk {
break; Some(chunk_number) => {
if i == chunk_number {
stdout_writer.write_all(buf)?;
break;
}
}
None => {
let idx = (i - 1) as usize;
let writer = writers.get_mut(idx).unwrap();
writer.write_all(buf)?;
}
} }
i += 1;
} else { } else {
break; break;
} }
@@ -1205,12 +1283,17 @@ where
Ok(()) Ok(())
} }
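To make the chunk sizing above concrete: with `chunk_size_base = num_bytes / num_chunks` and a remainder of `num_bytes % num_chunks`, the first `remainder` chunks are one byte longer than the rest. An illustrative stand-alone sketch (not part of this commit), e.g. 11 bytes into 3 chunks giving sizes 4, 4 and 3:

fn example_chunk_sizes(num_bytes: u64, num_chunks: u64) -> Vec<u64> {
    let base = num_bytes / num_chunks;
    let remainder = num_bytes % num_chunks;
    // The first `remainder` chunks each receive one extra byte.
    (1..=num_chunks).map(|i| base + (remainder > i - 1) as u64).collect()
}
// example_chunk_sizes(11, 3) yields [4, 4, 3], which sums back to 11.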
/// Split a file into a specific number of chunks by line. /// Split a file or STDIN into a specific number of chunks by line.
/// If in Kth chunk of N mode - print the k-th chunk to STDOUT.
/// ///
/// This function always creates one output file for each chunk, even /// In Kth chunk of N mode - writes to stdout the contents of the chunk identified by `kth_chunk`
///
/// In N chunks mode - this function always creates one output file for each chunk, even
/// if there is an error reading or writing one of the chunks or if /// if there is an error reading or writing one of the chunks or if
/// the input file is truncated. However, if the `filter` option is /// the input file is truncated. However, if the `--filter` option is
/// being used, then no files are created. /// being used, then files will only be created if `$FILE` variable was used
/// in filter command,
/// i.e. `split -n l/10 --filter='head -c1 > $FILE' in`
/// ///
/// # Errors /// # Errors
/// ///
@@ -1219,119 +1302,82 @@ where
/// ///
/// # See also /// # See also
/// ///
/// * [`kth_chunk_by_line`], which splits its input in the same way, /// * [`n_chunks_by_byte`], which splits its input into a specific number of chunks by byte.
/// but writes only one specified chunk to stdout.
/// ///
/// Implements `--number=CHUNKS` /// Implements `--number=CHUNKS`
/// Where CHUNKS /// Where CHUNKS
/// * l/N /// * l/N
fn split_into_n_chunks_by_line<R>( /// * l/K/N
fn n_chunks_by_line<R>(
settings: &Settings, settings: &Settings,
reader: &mut R, reader: &mut R,
num_chunks: u64, num_chunks: u64,
kth_chunk: Option<u64>,
) -> UResult<()> ) -> UResult<()>
where where
R: BufRead, R: BufRead,
{ {
// Get the size of the input file in bytes and compute the number // Get the size of the input in bytes and compute the number
// of bytes per chunk. // of bytes per chunk.
let metadata = metadata(&settings.input).map_err(|_| { let initial_buf = &mut Vec::new();
USimpleError::new(1, format!("{}: cannot determine file size", settings.input)) let num_bytes = get_input_size(&settings.input, reader, initial_buf, &settings.io_blksize)?;
})?; let reader = initial_buf.chain(reader);
let num_bytes = metadata.len();
let chunk_size = (num_bytes / num_chunks) as usize; let chunk_size = (num_bytes / num_chunks) as usize;
// This object is responsible for creating the filename for each chunk. // If input file is empty and we would not have determined the Kth chunk
let mut filename_iterator = FilenameIterator::new(&settings.prefix, &settings.suffix)?; // in the Kth chunk of N chunk mode, then terminate immediately.
// This happens on `split -n l/3/10 /dev/null`, for example.
// Create one writer for each chunk. This will create each if kth_chunk.is_some() && num_bytes == 0 {
// of the underlying files (if not in `--filter` mode). return Ok(());
let mut writers = vec![];
for _ in 0..num_chunks {
let filename = filename_iterator
.next()
.ok_or_else(|| USimpleError::new(1, "output file suffixes exhausted"))?;
let writer = settings.instantiate_current_writer(filename.as_str())?;
writers.push(writer);
} }
let mut num_bytes_remaining_in_current_chunk = chunk_size; // In Kth chunk of N mode - we will write to stdout instead of to a file.
let mut i = 0; let mut stdout_writer = std::io::stdout().lock();
let sep = settings.separator; // In N chunks mode - we will write to `num_chunks` files
for line_result in reader.split(sep) { let mut writers = vec![];
let line = line_result.unwrap();
let maybe_writer = writers.get_mut(i);
let writer = maybe_writer.unwrap();
let bytes = line.as_slice();
custom_write_all(bytes, writer, settings)?;
custom_write_all(&[sep], writer, settings)?;
// Add one byte for the separator character. // If in N chunks mode
let num_bytes = bytes.len() + 1; // Create one writer for each chunk.
if num_bytes > num_bytes_remaining_in_current_chunk { // This will create each of the underlying files
num_bytes_remaining_in_current_chunk = chunk_size; // or stdin pipes to child shell/command processes if in `--filter` mode
i += 1; if kth_chunk.is_none() {
} else { // This object is responsible for creating the filename for each chunk.
num_bytes_remaining_in_current_chunk -= num_bytes; let mut filename_iterator = FilenameIterator::new(&settings.prefix, &settings.suffix)
.map_err(|e| io::Error::new(ErrorKind::Other, format!("{e}")))?;
for _ in 0..num_chunks {
let filename = filename_iterator
.next()
.ok_or_else(|| USimpleError::new(1, "output file suffixes exhausted"))?;
let writer = settings.instantiate_current_writer(filename.as_str())?;
writers.push(writer);
} }
} }
Ok(())
}
/// Print the k-th chunk of a file, splitting by line.
///
/// This function is like [`split_into_n_chunks_by_line`], but instead
/// of writing each chunk to its own file, it only writes to stdout
/// the contents of the chunk identified by `chunk_number`.
///
/// # Errors
///
/// This function returns an error if there is a problem reading from
/// `reader` or writing to one of the output files.
///
/// # See also
///
/// * [`split_into_n_chunks_by_line`], which splits its input in the
/// same way, but writes each chunk to its own file.
///
/// Implements `--number=CHUNKS`
/// Where CHUNKS
/// * l/K/N
fn kth_chunk_by_line<R>(
settings: &Settings,
reader: &mut R,
chunk_number: u64,
num_chunks: u64,
) -> UResult<()>
where
R: BufRead,
{
// Get the size of the input file in bytes and compute the number
// of bytes per chunk.
let metadata = metadata(&settings.input).map_err(|_| {
USimpleError::new(1, format!("{}: cannot determine file size", settings.input))
})?;
let num_bytes = metadata.len();
let chunk_size = (num_bytes / num_chunks) as usize;
// Write to stdout instead of to a file.
let stdout = std::io::stdout();
let mut writer = stdout.lock();
let mut num_bytes_remaining_in_current_chunk = chunk_size; let mut num_bytes_remaining_in_current_chunk = chunk_size;
let mut i = 1; let mut i = 1;
let sep = settings.separator; let sep = settings.separator;
for line_result in reader.split(sep) { for line_result in reader.split(sep) {
let line = line_result?; // add separator back in at the end of the line
let mut line = line_result?;
line.push(sep);
let bytes = line.as_slice(); let bytes = line.as_slice();
if i == chunk_number {
writer.write_all(bytes)?; match kth_chunk {
writer.write_all(&[sep])?; Some(chunk_number) => {
if i == chunk_number {
stdout_writer.write_all(bytes)?;
}
}
None => {
let idx = (i - 1) as usize;
let maybe_writer = writers.get_mut(idx);
let writer = maybe_writer.unwrap();
custom_write_all(bytes, writer, settings)?;
}
} }
// Add one byte for the separator character. let num_bytes = bytes.len();
let num_bytes = bytes.len() + 1;
if num_bytes >= num_bytes_remaining_in_current_chunk { if num_bytes >= num_bytes_remaining_in_current_chunk {
num_bytes_remaining_in_current_chunk = chunk_size; num_bytes_remaining_in_current_chunk = chunk_size;
i += 1; i += 1;
@@ -1339,72 +1385,8 @@ where
num_bytes_remaining_in_current_chunk -= num_bytes; num_bytes_remaining_in_current_chunk -= num_bytes;
} }
if i > chunk_number { if let Some(chunk_number) = kth_chunk {
break; if i > chunk_number {
}
}
Ok(())
}
/// Split a file into a specific number of chunks by line, but
/// assign lines via round-robin
///
/// This function always creates one output file for each chunk, even
/// if there is an error reading or writing one of the chunks or if
/// the input file is truncated. However, if the `filter` option is
/// being used, then no files are created.
///
/// # Errors
///
/// This function returns an error if there is a problem reading from
/// `reader` or writing to one of the output files.
///
/// # See also
///
/// * [`split_into_n_chunks_by_line`], which splits its input in the same way,
/// but without round robin distribution.
///
/// Implements `--number=CHUNKS`
/// Where CHUNKS
/// * r/N
fn split_into_n_chunks_by_line_round_robin<R>(
settings: &Settings,
reader: &mut R,
num_chunks: u64,
) -> UResult<()>
where
R: BufRead,
{
// This object is responsible for creating the filename for each chunk.
let mut filename_iterator = FilenameIterator::new(&settings.prefix, &settings.suffix)
.map_err(|e| io::Error::new(ErrorKind::Other, format!("{e}")))?;
// Create one writer for each chunk. This will create each
// of the underlying files (if not in `--filter` mode).
let mut writers = vec![];
for _ in 0..num_chunks {
let filename = filename_iterator
.next()
.ok_or_else(|| io::Error::new(ErrorKind::Other, "output file suffixes exhausted"))?;
let writer = settings.instantiate_current_writer(filename.as_str())?;
writers.push(writer);
}
let num_chunks: usize = num_chunks.try_into().unwrap();
let sep = settings.separator;
let mut closed_writers = 0;
for (i, line_result) in reader.split(sep).enumerate() {
let maybe_writer = writers.get_mut(i % num_chunks);
let writer = maybe_writer.unwrap();
let mut line = line_result.unwrap();
line.push(sep);
let bytes = line.as_slice();
let writer_stdin_open = custom_write_all(bytes, writer, settings)?;
if !writer_stdin_open {
closed_writers += 1;
if closed_writers == num_chunks {
// all writers are closed - stop reading
break; break;
} }
} }
@@ -1413,14 +1395,17 @@ where
Ok(()) Ok(())
} }
/// Print the k-th chunk of a file, splitting by line, but /// Split a file or STDIN into a specific number of chunks by line, but
/// assign lines via round-robin to the specified number of output /// assign lines via round-robin
/// chunks, but output only the *k*th chunk.
/// ///
/// This function is like [`kth_chunk_by_line`], as it only writes to stdout and /// In Kth chunk of N mode - writes to stdout the contents of the chunk identified by `kth_chunk`
/// prints out only *k*th chunk ///
/// It is also like [`split_into_n_chunks_by_line_round_robin`], as it is assigning chunks /// In N chunks mode - this function always creates one output file for each chunk, even
/// using round robin distribution /// if there is an error reading or writing one of the chunks or if
/// the input file is truncated. However, if the `--filter` option is
/// being used, then files will only be created if `$FILE` variable was used
/// in filter command,
/// i.e. `split -n r/10 --filter='head -c1 > $FILE' in`
/// ///
/// # Errors /// # Errors
/// ///
@@ -1429,46 +1414,83 @@ where
/// ///
/// # See also /// # See also
/// ///
/// * [`split_into_n_chunks_by_line_round_robin`], which splits its input in the /// * [`n_chunks_by_line`], which splits its input into a specific number of chunks by line.
/// same way, but writes each chunk to its own file.
/// ///
/// Implements `--number=CHUNKS` /// Implements `--number=CHUNKS`
/// Where CHUNKS /// Where CHUNKS
/// * r/N
/// * r/K/N /// * r/K/N
fn kth_chunk_by_line_round_robin<R>( fn n_chunks_by_line_round_robin<R>(
settings: &Settings, settings: &Settings,
reader: &mut R, reader: &mut R,
chunk_number: u64,
num_chunks: u64, num_chunks: u64,
kth_chunk: Option<u64>,
) -> UResult<()> ) -> UResult<()>
where where
R: BufRead, R: BufRead,
{ {
// Write to stdout instead of to a file. // In Kth chunk of N mode - we will write to stdout instead of to a file.
let stdout = std::io::stdout(); let mut stdout_writer = std::io::stdout().lock();
let mut writer = stdout.lock(); // In N chunks mode - we will write to `num_chunks` files
let mut writers = vec![];
let num_chunks: usize = num_chunks.try_into().unwrap(); // If in N chunks mode
let chunk_number: usize = chunk_number.try_into().unwrap(); // Create one writer for each chunk.
let sep = settings.separator; // This will create each of the underlying files
// The chunk number is given as a 1-indexed number, but it // or stdin pipes to child shell/command processes if in `--filter` mode
// is a little easier to deal with a 0-indexed number if kth_chunk.is_none() {
// since `.enumerate()` returns index `i` starting with 0 // This object is responsible for creating the filename for each chunk.
let chunk_number = chunk_number - 1; let mut filename_iterator = FilenameIterator::new(&settings.prefix, &settings.suffix)
for (i, line_result) in reader.split(sep).enumerate() { .map_err(|e| io::Error::new(ErrorKind::Other, format!("{e}")))?;
let line = line_result?; for _ in 0..num_chunks {
let bytes = line.as_slice(); let filename = filename_iterator
if (i % num_chunks) == chunk_number { .next()
writer.write_all(bytes)?; .ok_or_else(|| USimpleError::new(1, "output file suffixes exhausted"))?;
writer.write_all(&[sep])?; let writer = settings.instantiate_current_writer(filename.as_str())?;
writers.push(writer);
} }
} }
let num_chunks: usize = num_chunks.try_into().unwrap();
let sep = settings.separator;
let mut closed_writers = 0;
for (i, line_result) in reader.split(sep).enumerate() {
// add separator back in at the end of the line
let mut line = line_result?;
line.push(sep);
let bytes = line.as_slice();
match kth_chunk {
Some(chunk_number) => {
// The `.enumerate()` method returns index `i` starting with 0,
// but chunk number is given as a 1-indexed number,
// so compare to `chunk_number - 1`
if (i % num_chunks) == (chunk_number - 1) as usize {
stdout_writer.write_all(bytes)?;
}
}
None => {
let maybe_writer = writers.get_mut(i % num_chunks);
let writer = maybe_writer.unwrap();
let writer_stdin_open = custom_write_all(bytes, writer, settings)?;
if !writer_stdin_open {
closed_writers += 1;
if closed_writers == num_chunks {
// all writers are closed - stop reading
break;
}
}
}
}
}
Ok(()) Ok(())
} }
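The round-robin distribution above can be pictured with a small stand-alone sketch (illustrative only, not part of this commit): record index `i` (0-based) always lands in chunk `i % num_chunks`, so five records split as `r/2` end up as ["1", "3", "5"] and ["2", "4"]:

fn example_round_robin(records: &[&str], num_chunks: usize) -> Vec<Vec<String>> {
    let mut chunks = vec![Vec::new(); num_chunks];
    for (i, record) in records.iter().enumerate() {
        chunks[i % num_chunks].push((*record).to_string());
    }
    chunks
}
// example_round_robin(&["1", "2", "3", "4", "5"], 2)
// yields [["1", "3", "5"], ["2", "4"]].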
#[allow(clippy::cognitive_complexity)] #[allow(clippy::cognitive_complexity)]
fn split(settings: &Settings) -> UResult<()> { fn split(settings: &Settings) -> UResult<()> {
let mut reader = BufReader::new(if settings.input == "-" { let r_box = if settings.input == "-" {
Box::new(stdin()) as Box<dyn Read> Box::new(stdin()) as Box<dyn Read>
} else { } else {
let r = File::open(Path::new(&settings.input)).map_err_context(|| { let r = File::open(Path::new(&settings.input)).map_err_context(|| {
@@ -1478,26 +1500,33 @@ fn split(settings: &Settings) -> UResult<()> {
) )
})?; })?;
Box::new(r) as Box<dyn Read> Box::new(r) as Box<dyn Read>
}); };
let mut reader = if let Some(c) = settings.io_blksize {
BufReader::with_capacity(c, r_box)
} else {
BufReader::new(r_box)
};
match settings.strategy { match settings.strategy {
Strategy::Number(NumberType::Bytes(num_chunks)) => { Strategy::Number(NumberType::Bytes(num_chunks)) => {
split_into_n_chunks_by_byte(settings, &mut reader, num_chunks) // split_into_n_chunks_by_byte(settings, &mut reader, num_chunks)
n_chunks_by_byte(settings, &mut reader, num_chunks, None)
} }
Strategy::Number(NumberType::KthBytes(chunk_number, num_chunks)) => { Strategy::Number(NumberType::KthBytes(chunk_number, num_chunks)) => {
kth_chunks_by_byte(settings, &mut reader, chunk_number, num_chunks) // kth_chunks_by_byte(settings, &mut reader, chunk_number, num_chunks)
n_chunks_by_byte(settings, &mut reader, num_chunks, Some(chunk_number))
} }
Strategy::Number(NumberType::Lines(num_chunks)) => { Strategy::Number(NumberType::Lines(num_chunks)) => {
split_into_n_chunks_by_line(settings, &mut reader, num_chunks) n_chunks_by_line(settings, &mut reader, num_chunks, None)
} }
Strategy::Number(NumberType::KthLines(chunk_number, num_chunks)) => { Strategy::Number(NumberType::KthLines(chunk_number, num_chunks)) => {
kth_chunk_by_line(settings, &mut reader, chunk_number, num_chunks) n_chunks_by_line(settings, &mut reader, num_chunks, Some(chunk_number))
} }
Strategy::Number(NumberType::RoundRobin(num_chunks)) => { Strategy::Number(NumberType::RoundRobin(num_chunks)) => {
split_into_n_chunks_by_line_round_robin(settings, &mut reader, num_chunks) n_chunks_by_line_round_robin(settings, &mut reader, num_chunks, None)
} }
Strategy::Number(NumberType::KthRoundRobin(chunk_number, num_chunks)) => { Strategy::Number(NumberType::KthRoundRobin(chunk_number, num_chunks)) => {
kth_chunk_by_line_round_robin(settings, &mut reader, chunk_number, num_chunks) n_chunks_by_line_round_robin(settings, &mut reader, num_chunks, Some(chunk_number))
} }
Strategy::Lines(chunk_size) => { Strategy::Lines(chunk_size) => {
let mut writer = LineChunkWriter::new(chunk_size, settings)?; let mut writer = LineChunkWriter::new(chunk_size, settings)?;


@@ -2,7 +2,7 @@
// //
// For the full copyright and license information, please view the LICENSE // For the full copyright and license information, please view the LICENSE
// file that was distributed with this source code. // file that was distributed with this source code.
// spell-checker:ignore xzaaa sixhundredfiftyonebytes ninetyonebytes threebytes asciilowercase fghij klmno pqrst uvwxyz fivelines twohundredfortyonebytes onehundredlines nbbbb dxen ncccc // spell-checker:ignore xzaaa sixhundredfiftyonebytes ninetyonebytes threebytes asciilowercase ghijkl mnopq rstuv wxyz fivelines twohundredfortyonebytes onehundredlines nbbbb dxen ncccc
use crate::common::util::{AtPath, TestScenario}; use crate::common::util::{AtPath, TestScenario};
use rand::{thread_rng, Rng, SeedableRng}; use rand::{thread_rng, Rng, SeedableRng};
@@ -704,54 +704,41 @@ fn test_split_overflow_bytes_size() {
assert_eq!(glob.collate(), at.read_bytes(name)); assert_eq!(glob.collate(), at.read_bytes(name));
} }
#[test]
#[cfg(target_pointer_width = "32")]
fn test_split_chunks_num_chunks_oversized_32() {
let scene = TestScenario::new(util_name!());
let at = &scene.fixtures;
at.touch("file");
scene
.ucmd()
.args(&["--number", "5000000000", "sixhundredfiftyonebytes.txt"])
.fails()
.code_is(1)
.stderr_only("split: Number of chunks too big\n");
}
#[test] #[test]
fn test_split_stdin_num_chunks() { fn test_split_stdin_num_chunks() {
new_ucmd!() let (at, mut ucmd) = at_and_ucmd!();
.args(&["--number=1"]) ucmd.args(&["--number=1"]).pipe_in("").succeeds();
.fails() assert_eq!(file_read(&at, "xaa"), "");
.code_is(1) assert!(!at.plus("xab").exists());
.stderr_only("split: -: cannot determine file size\n");
} }
#[test] #[test]
fn test_split_stdin_num_kth_chunk() { fn test_split_stdin_num_kth_chunk() {
new_ucmd!() new_ucmd!()
.args(&["--number=1/2"]) .args(&["--number=1/2"])
.fails() .pipe_in("1\n2\n3\n4\n5\n")
.code_is(1) .succeeds()
.stderr_only("split: -: cannot determine file size\n"); .stdout_only("1\n2\n3");
} }
#[test] #[test]
fn test_split_stdin_num_line_chunks() { fn test_split_stdin_num_line_chunks() {
new_ucmd!() let (at, mut ucmd) = at_and_ucmd!();
.args(&["--number=l/2"]) ucmd.args(&["--number=l/2"])
.fails() .pipe_in("1\n2\n3\n4\n5\n")
.code_is(1) .succeeds();
.stderr_only("split: -: cannot determine file size\n"); assert_eq!(file_read(&at, "xaa"), "1\n2\n3\n");
assert_eq!(file_read(&at, "xab"), "4\n5\n");
assert!(!at.plus("xac").exists());
} }
#[test] #[test]
fn test_split_stdin_num_kth_line_chunk() { fn test_split_stdin_num_kth_line_chunk() {
new_ucmd!() new_ucmd!()
.args(&["--number=l/2/5"]) .args(&["--number=l/2/5"])
.fails() .pipe_in("1\n2\n3\n4\n5\n")
.code_is(1) .succeeds()
.stderr_only("split: -: cannot determine file size\n"); .stdout_only("2\n");
} }
fn file_read(at: &AtPath, filename: &str) -> String { fn file_read(at: &AtPath, filename: &str) -> String {
@@ -912,6 +899,14 @@ fn test_suffixes_exhausted() {
.stderr_only("split: output file suffixes exhausted\n"); .stderr_only("split: output file suffixes exhausted\n");
} }
#[test]
fn test_suffix_length_req() {
new_ucmd!()
.args(&["-n", "100", "-a", "1", "asciilowercase.txt"])
.fails()
.stderr_only("split: the suffix length needs to be at least 2\n");
}
#[test] #[test]
fn test_verbose() { fn test_verbose() {
new_ucmd!() new_ucmd!()
@@ -937,11 +932,11 @@ fn test_number_n() {
s s
}; };
ucmd.args(&["-n", "5", "asciilowercase.txt"]).succeeds(); ucmd.args(&["-n", "5", "asciilowercase.txt"]).succeeds();
assert_eq!(file_read("xaa"), "abcde"); assert_eq!(file_read("xaa"), "abcdef");
assert_eq!(file_read("xab"), "fghij"); assert_eq!(file_read("xab"), "ghijkl");
assert_eq!(file_read("xac"), "klmno"); assert_eq!(file_read("xac"), "mnopq");
assert_eq!(file_read("xad"), "pqrst"); assert_eq!(file_read("xad"), "rstuv");
assert_eq!(file_read("xae"), "uvwxyz\n"); assert_eq!(file_read("xae"), "wxyz\n");
#[cfg(unix)] #[cfg(unix)]
new_ucmd!() new_ucmd!()
.args(&["--number=100", "/dev/null"]) .args(&["--number=100", "/dev/null"])
@@ -954,11 +949,11 @@ fn test_number_kth_of_n() {
new_ucmd!() new_ucmd!()
.args(&["--number=3/5", "asciilowercase.txt"]) .args(&["--number=3/5", "asciilowercase.txt"])
.succeeds() .succeeds()
.stdout_only("klmno"); .stdout_only("mnopq");
new_ucmd!() new_ucmd!()
.args(&["--number=5/5", "asciilowercase.txt"]) .args(&["--number=5/5", "asciilowercase.txt"])
.succeeds() .succeeds()
.stdout_only("uvwxyz\n"); .stdout_only("wxyz\n");
new_ucmd!() new_ucmd!()
.args(&["-e", "--number=99/100", "asciilowercase.txt"]) .args(&["-e", "--number=99/100", "asciilowercase.txt"])
.succeeds() .succeeds()
@@ -1046,11 +1041,11 @@ fn test_split_number_with_io_blksize() {
}; };
ucmd.args(&["-n", "5", "asciilowercase.txt", "---io-blksize", "1024"]) ucmd.args(&["-n", "5", "asciilowercase.txt", "---io-blksize", "1024"])
.succeeds(); .succeeds();
assert_eq!(file_read("xaa"), "abcde"); assert_eq!(file_read("xaa"), "abcdef");
assert_eq!(file_read("xab"), "fghij"); assert_eq!(file_read("xab"), "ghijkl");
assert_eq!(file_read("xac"), "klmno"); assert_eq!(file_read("xac"), "mnopq");
assert_eq!(file_read("xad"), "pqrst"); assert_eq!(file_read("xad"), "rstuv");
assert_eq!(file_read("xae"), "uvwxyz\n"); assert_eq!(file_read("xae"), "wxyz\n");
} }
#[test] #[test]
@@ -1065,6 +1060,32 @@ fn test_split_default_with_io_blksize() {
assert_eq!(glob.collate(), at.read_bytes(name)); assert_eq!(glob.collate(), at.read_bytes(name));
} }
#[test]
fn test_split_invalid_io_blksize() {
new_ucmd!()
.args(&["---io-blksize=XYZ", "threebytes.txt"])
.fails()
.stderr_only("split: invalid IO block size: 'XYZ'\n");
new_ucmd!()
.args(&["---io-blksize=5000000000", "threebytes.txt"])
.fails()
.stderr_only("split: invalid IO block size: '5000000000'\n");
#[cfg(target_pointer_width = "32")]
new_ucmd!()
.args(&["---io-blksize=2146435072", "threebytes.txt"])
.fails()
.stderr_only("split: invalid IO block size: '2146435072'\n");
}
#[test]
fn test_split_number_oversized_stdin() {
new_ucmd!()
.args(&["--number=3", "---io-blksize=600"])
.pipe_in_fixture("sixhundredfiftyonebytes.txt")
.fails()
.stderr_only("split: -: cannot determine input size\n");
}
#[test] #[test]
fn test_invalid_suffix_length() { fn test_invalid_suffix_length() {
new_ucmd!() new_ucmd!()
@@ -1157,6 +1178,18 @@ fn test_elide_dev_null() {
assert!(!at.plus("xac").exists()); assert!(!at.plus("xac").exists());
} }
#[test]
#[cfg(unix)]
fn test_dev_zero() {
let (at, mut ucmd) = at_and_ucmd!();
ucmd.args(&["-n", "3", "/dev/zero"])
.fails()
.stderr_only("split: /dev/zero: cannot determine file size\n");
assert!(!at.plus("xaa").exists());
assert!(!at.plus("xab").exists());
assert!(!at.plus("xac").exists());
}
#[test] #[test]
fn test_lines() { fn test_lines() {
let (at, mut ucmd) = at_and_ucmd!(); let (at, mut ucmd) = at_and_ucmd!();
@@ -1182,6 +1215,15 @@ fn test_lines_kth() {
.stdout_only("20\n21\n22\n23\n24\n25\n26\n27\n28\n29\n"); .stdout_only("20\n21\n22\n23\n24\n25\n26\n27\n28\n29\n");
} }
#[test]
#[cfg(unix)]
fn test_lines_kth_dev_null() {
new_ucmd!()
.args(&["-n", "l/3/10", "/dev/null"])
.succeeds()
.stdout_only("");
}
#[test] #[test]
fn test_line_bytes() { fn test_line_bytes() {
let (at, mut ucmd) = at_and_ucmd!(); let (at, mut ucmd) = at_and_ucmd!();
@@ -1321,7 +1363,7 @@ fn test_numeric_suffix() {
} }
#[test] #[test]
fn test_numeric_suffix_alias() { fn test_numeric_suffix_inferred() {
let (at, mut ucmd) = at_and_ucmd!(); let (at, mut ucmd) = at_and_ucmd!();
ucmd.args(&["-n", "4", "--numeric=9", "threebytes.txt"]) ucmd.args(&["-n", "4", "--numeric=9", "threebytes.txt"])
.succeeds() .succeeds()