mirror of
https://github.com/RGBCube/uutils-coreutils
synced 2025-07-28 19:47:45 +00:00
split: add support for "-n l/NUM" option to split
Add support for `split -n l/NUM`. Previously, `split` only supported `-n NUM`, which splits a file into `NUM` chunks by byte. The `-n l/NUM` strategy splits a file into `NUM` chunks without splitting lines across chunks.
This commit is contained in:
parent
92d461247e
commit
dbbee573ab
2 changed files with 88 additions and 1 deletions
|
@ -19,7 +19,7 @@ use std::env;
|
|||
use std::fmt;
|
||||
use std::fs::{metadata, File};
|
||||
use std::io;
|
||||
use std::io::{stdin, BufReader, BufWriter, ErrorKind, Read, Write};
|
||||
use std::io::{stdin, BufRead, BufReader, BufWriter, ErrorKind, Read, Write};
|
||||
use std::path::Path;
|
||||
use uucore::display::Quotable;
|
||||
use uucore::error::{FromIo, UIoError, UResult, USimpleError, UUsageError};
|
||||
|
@ -845,6 +845,73 @@ where
|
|||
.map_err_context(|| "I/O error".to_string())
|
||||
}
|
||||
|
||||
/// Split a file into a specific number of chunks by line.
|
||||
///
|
||||
/// This function always creates one output file for each chunk, even
|
||||
/// if there is an error reading or writing one of the chunks or if
|
||||
/// the input file is truncated. However, if the `filter` option is
|
||||
/// being used, then no files are created.
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// This function returns an error if there is a problem reading from
|
||||
/// `reader` or writing to one of the output files.
|
||||
fn split_into_n_chunks_by_line<R>(
|
||||
settings: &Settings,
|
||||
reader: &mut R,
|
||||
num_chunks: u64,
|
||||
) -> UResult<()>
|
||||
where
|
||||
R: BufRead,
|
||||
{
|
||||
// Get the size of the input file in bytes and compute the number
|
||||
// of bytes per chunk.
|
||||
let metadata = metadata(&settings.input).unwrap();
|
||||
let num_bytes = metadata.len();
|
||||
let chunk_size = (num_bytes / (num_chunks as u64)) as usize;
|
||||
|
||||
// This object is responsible for creating the filename for each chunk.
|
||||
let mut filename_iterator = FilenameIterator::new(
|
||||
&settings.prefix,
|
||||
&settings.additional_suffix,
|
||||
settings.suffix_length,
|
||||
settings.suffix_type,
|
||||
);
|
||||
|
||||
// Create one writer for each chunk. This will create each
|
||||
// of the underlying files (if not in `--filter` mode).
|
||||
let mut writers = vec![];
|
||||
for _ in 0..num_chunks {
|
||||
let filename = filename_iterator
|
||||
.next()
|
||||
.ok_or_else(|| USimpleError::new(1, "output file suffixes exhausted"))?;
|
||||
let writer = platform::instantiate_current_writer(&settings.filter, filename.as_str());
|
||||
writers.push(writer);
|
||||
}
|
||||
|
||||
let mut num_bytes_remaining_in_current_chunk = chunk_size;
|
||||
let mut i = 0;
|
||||
for line_result in reader.lines() {
|
||||
let line = line_result.unwrap();
|
||||
let maybe_writer = writers.get_mut(i);
|
||||
let writer = maybe_writer.unwrap();
|
||||
let bytes = line.as_bytes();
|
||||
writer.write_all(bytes)?;
|
||||
writer.write_all(b"\n")?;
|
||||
|
||||
// Add one byte for the newline character.
|
||||
let num_bytes = bytes.len() + 1;
|
||||
if num_bytes > num_bytes_remaining_in_current_chunk {
|
||||
num_bytes_remaining_in_current_chunk = chunk_size;
|
||||
i += 1;
|
||||
} else {
|
||||
num_bytes_remaining_in_current_chunk -= num_bytes;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn split(settings: &Settings) -> UResult<()> {
|
||||
let mut reader = BufReader::new(if settings.input == "-" {
|
||||
Box::new(stdin()) as Box<dyn Read>
|
||||
|
@ -862,6 +929,9 @@ fn split(settings: &Settings) -> UResult<()> {
|
|||
Strategy::Number(NumberType::Bytes(num_chunks)) => {
|
||||
split_into_n_chunks_by_byte(settings, &mut reader, num_chunks)
|
||||
}
|
||||
Strategy::Number(NumberType::Lines(num_chunks)) => {
|
||||
split_into_n_chunks_by_line(settings, &mut reader, num_chunks)
|
||||
}
|
||||
Strategy::Number(_) => Err(USimpleError::new(1, "-n mode not yet fully implemented")),
|
||||
Strategy::Lines(chunk_size) => {
|
||||
let mut writer = LineChunkWriter::new(chunk_size, settings)
|
||||
|
|
|
@ -545,3 +545,20 @@ fn test_elide_empty_files() {
|
|||
assert_eq!(at.read("xac"), "c");
|
||||
assert!(!at.plus("xad").exists());
|
||||
}
|
||||
|
||||
// `split -n l/2` must distribute whole lines over two output files
// and never cut a line in half.
#[test]
fn test_lines() {
    let (at, mut ucmd) = at_and_ucmd!();

    // Read the complete contents of an output file into a `String`.
    let contents_of = |name| {
        let mut text = String::new();
        at.open(name).read_to_string(&mut text).unwrap();
        text
    };

    // Ask for two line-based chunks of the five-line fixture.
    ucmd.args(&["-n", "l/2", "fivelines.txt"]).succeeds();

    // The first chunk holds the first three lines, the second the rest.
    assert_eq!(contents_of("xaa"), "1\n2\n3\n");
    assert_eq!(contents_of("xab"), "4\n5\n");
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue