Add `split -n CHUNKS` (split the input into a fixed number of chunks by byte)
and make `FilenameFactory` borrow its prefix and suffix instead of owning them.

Review changes relative to the first revision of this patch, all in
src/uu/split/src/split.rs:
  * `split_into_n_chunks_by_byte` rejects `num_chunks == 0` instead of
    panicking on a division by zero.
  * A failed `metadata()` lookup is returned as an error instead of being
    unwrapped.
  * The `match` on `settings.strategy` names `Strategy::Number(_)` explicitly
    instead of using a catch-all `_` arm.
Because the post-image of that file changed, the blob id on its `index` line
is informational only.

diff --git a/src/uu/split/src/filenames.rs b/src/uu/split/src/filenames.rs
index da72e090e..36488e7e4 100644
--- a/src/uu/split/src/filenames.rs
+++ b/src/uu/split/src/filenames.rs
@@ -355,23 +355,23 @@ fn num_prefix(i: usize) -> String {
 /// assert_eq!(factory.make(650).unwrap(), "zaaa");
 /// assert_eq!(factory.make(6551).unwrap(), "zaab");
 /// ```
-pub struct FilenameFactory {
-    additional_suffix: String,
-    prefix: String,
+pub struct FilenameFactory<'a> {
+    prefix: &'a str,
+    additional_suffix: &'a str,
     suffix_length: usize,
     use_numeric_suffix: bool,
 }
 
-impl FilenameFactory {
+impl<'a> FilenameFactory<'a> {
     /// Create a new instance of this struct.
     ///
     /// For an explanation of the parameters, see the struct documentation.
     pub fn new(
-        prefix: String,
-        additional_suffix: String,
+        prefix: &'a str,
+        additional_suffix: &'a str,
         suffix_length: usize,
         use_numeric_suffix: bool,
-    ) -> FilenameFactory {
+    ) -> FilenameFactory<'a> {
         FilenameFactory {
             prefix,
             additional_suffix,
@@ -392,8 +392,8 @@ impl FilenameFactory {
     /// ```rust,ignore
     /// use crate::filenames::FilenameFactory;
     ///
-    /// let prefix = String::new();
-    /// let suffix = String::new();
+    /// let prefix = "";
+    /// let suffix = "";
     /// let width = 1;
     /// let use_numeric_suffix = true;
     /// let factory = FilenameFactory::new(prefix, suffix, width, use_numeric_suffix);
@@ -401,15 +401,16 @@ impl FilenameFactory {
     /// assert_eq!(factory.make(10), None);
     /// ```
     pub fn make(&self, i: usize) -> Option<String> {
-        let prefix = self.prefix.clone();
-        let suffix1 = match (self.use_numeric_suffix, self.suffix_length) {
+        let suffix = match (self.use_numeric_suffix, self.suffix_length) {
             (true, 0) => Some(num_prefix(i)),
             (false, 0) => str_prefix(i),
             (true, width) => num_prefix_fixed_width(i, width),
             (false, width) => str_prefix_fixed_width(i, width),
         }?;
-        let suffix2 = &self.additional_suffix;
-        Some(prefix + &suffix1 + suffix2)
+        Some(format!(
+            "{}{}{}",
+            self.prefix, suffix, self.additional_suffix
+        ))
     }
 }
 
@@ -513,7 +514,7 @@ mod tests {
 
     #[test]
     fn test_alphabetic_suffix() {
-        let factory = FilenameFactory::new("123".to_string(), "789".to_string(), 3, false);
+        let factory = FilenameFactory::new("123", "789", 3, false);
         assert_eq!(factory.make(0).unwrap(), "123aaa789");
         assert_eq!(factory.make(1).unwrap(), "123aab789");
         assert_eq!(factory.make(28).unwrap(), "123abc789");
@@ -521,7 +522,7 @@ mod tests {
 
     #[test]
     fn test_numeric_suffix() {
-        let factory = FilenameFactory::new("abc".to_string(), "xyz".to_string(), 3, true);
+        let factory = FilenameFactory::new("abc", "xyz", 3, true);
         assert_eq!(factory.make(0).unwrap(), "abc000xyz");
         assert_eq!(factory.make(1).unwrap(), "abc001xyz");
         assert_eq!(factory.make(123).unwrap(), "abc123xyz");
diff --git a/src/uu/split/src/split.rs b/src/uu/split/src/split.rs
index 0270d4282..7d06b170f 100644
--- a/src/uu/split/src/split.rs
+++ b/src/uu/split/src/split.rs
@@ -14,8 +14,7 @@ use crate::filenames::FilenameFactory;
 use clap::{crate_version, App, AppSettings, Arg, ArgMatches};
 use std::convert::TryFrom;
 use std::env;
-use std::fs::remove_file;
-use std::fs::File;
+use std::fs::{metadata, remove_file, File};
 use std::io::{stdin, BufRead, BufReader, BufWriter, Read, Write};
 use std::path::Path;
 use uucore::display::Quotable;
@@ -27,6 +26,7 @@ static OPT_LINE_BYTES: &str = "line-bytes";
 static OPT_LINES: &str = "lines";
 static OPT_ADDITIONAL_SUFFIX: &str = "additional-suffix";
 static OPT_FILTER: &str = "filter";
+static OPT_NUMBER: &str = "number";
 static OPT_NUMERIC_SUFFIXES: &str = "numeric-suffixes";
 static OPT_SUFFIX_LENGTH: &str = "suffix-length";
 static OPT_DEFAULT_SUFFIX_LENGTH: &str = "0";
@@ -132,6 +132,13 @@ pub fn uu_app<'a>() -> App<'a> {
                 .default_value("1000")
                 .help("put NUMBER lines/records per output file"),
         )
+        .arg(
+            Arg::new(OPT_NUMBER)
+                .short('n')
+                .long(OPT_NUMBER)
+                .takes_value(true)
+                .help("generate CHUNKS output files; see explanation below"),
+        )
         // rest of the arguments
         .arg(
             Arg::new(OPT_ADDITIONAL_SUFFIX)
@@ -194,6 +201,9 @@ enum Strategy {
     /// Each chunk has as many lines as possible without exceeding the
     /// specified number of bytes.
     LineBytes(usize),
+
+    /// Split the file into this many chunks.
+    Number(usize),
 }
 
 impl Strategy {
@@ -208,26 +218,34 @@ impl Strategy {
             matches.occurrences_of(OPT_LINES),
             matches.occurrences_of(OPT_BYTES),
             matches.occurrences_of(OPT_LINE_BYTES),
+            matches.occurrences_of(OPT_NUMBER),
         ) {
-            (0, 0, 0) => Ok(Strategy::Lines(1000)),
-            (1, 0, 0) => {
+            (0, 0, 0, 0) => Ok(Strategy::Lines(1000)),
+            (1, 0, 0, 0) => {
                 let s = matches.value_of(OPT_LINES).unwrap();
                 let n = parse_size(s)
                     .map_err(|e| USimpleError::new(1, format!("invalid number of lines: {}", e)))?;
                 Ok(Strategy::Lines(n))
             }
-            (0, 1, 0) => {
+            (0, 1, 0, 0) => {
                 let s = matches.value_of(OPT_BYTES).unwrap();
                 let n = parse_size(s)
                     .map_err(|e| USimpleError::new(1, format!("invalid number of bytes: {}", e)))?;
                 Ok(Strategy::Bytes(n))
             }
-            (0, 0, 1) => {
+            (0, 0, 1, 0) => {
                 let s = matches.value_of(OPT_LINE_BYTES).unwrap();
                 let n = parse_size(s)
                     .map_err(|e| USimpleError::new(1, format!("invalid number of bytes: {}", e)))?;
                 Ok(Strategy::LineBytes(n))
             }
+            (0, 0, 0, 1) => {
+                let s = matches.value_of(OPT_NUMBER).unwrap();
+                let n = s.parse::<usize>().map_err(|e| {
+                    USimpleError::new(1, format!("invalid number of chunks: {}", e))
+                })?;
+                Ok(Strategy::Number(n))
+            }
             _ => Err(UUsageError::new(1, "cannot split in more than one way")),
         }
     }
@@ -344,6 +362,98 @@ impl Splitter for ByteSplitter {
     }
 }
 
+/// Split a file into a specific number of chunks by byte.
+///
+/// Once `num_chunks` and the input size have been validated, this
+/// function creates one output file for each chunk, even if there is
+/// an error reading or writing one of the chunks or if the input file
+/// is truncated. However, if the `filter` option is being used, then
+/// no files are created.
+///
+/// # Errors
+///
+/// This function returns an error if `num_chunks` is zero, if the
+/// size of the input cannot be determined (for example, when the
+/// input is not a path that `std::fs::metadata` can inspect), or if
+/// there is a problem reading from `reader` or writing to one of the
+/// output files.
+fn split_into_n_chunks_by_byte<R>(
+    settings: &Settings,
+    reader: &mut R,
+    num_chunks: usize,
+) -> UResult<()>
+where
+    R: Read,
+{
+    // Zero chunks would divide by zero when computing the chunk size
+    // and underflow `num_chunks - 1` below, so reject it up front.
+    if num_chunks == 0 {
+        return Err(USimpleError::new(1, "invalid number of chunks: 0"));
+    }
+
+    // Get the size of the input file in bytes and compute the number
+    // of bytes per chunk. Looking up the metadata can fail (the path
+    // may not name a file on disk), so report that as an error
+    // instead of panicking.
+    let metadata = metadata(&settings.input).map_err_context(|| {
+        format!("{}: cannot determine file size", settings.input.quote())
+    })?;
+    let num_bytes = metadata.len();
+    let chunk_size = (num_bytes / (num_chunks as u64)) as usize;
+
+    // This object is responsible for creating the filename for each chunk.
+    let filename_factory = FilenameFactory::new(
+        &settings.prefix,
+        &settings.additional_suffix,
+        settings.suffix_length,
+        settings.numeric_suffix,
+    );
+
+    // Create one writer for each chunk. This will create each
+    // of the underlying files (if not in `--filter` mode).
+    let mut writers = vec![];
+    for i in 0..num_chunks {
+        let filename = filename_factory
+            .make(i)
+            .ok_or_else(|| USimpleError::new(1, "output file suffixes exhausted"))?;
+        let writer = platform::instantiate_current_writer(&settings.filter, filename.as_str());
+        writers.push(writer);
+    }
+
+    // This block evaluates to an object of type `std::io::Result<()>`.
+    {
+        // Write `chunk_size` bytes from the reader into each writer
+        // except the last.
+        //
+        // Re-use the buffer to avoid re-allocating a `Vec<u8>` on each
+        // iteration. The contents will be completely overwritten each
+        // time we call `read_exact()`.
+        //
+        // The last writer gets all remaining bytes so that if the number
+        // of bytes in the input file was not evenly divisible by
+        // `num_chunks`, we don't leave any bytes behind.
+        let mut buf = vec![0u8; chunk_size];
+        for writer in writers.iter_mut().take(num_chunks - 1) {
+            reader.read_exact(&mut buf)?;
+            writer.write_all(&buf)?;
+        }
+
+        // Write all the remaining bytes to the last chunk.
+        //
+        // To do this, we resize our buffer to have the necessary number
+        // of bytes.
+        let i = num_chunks - 1;
+        let last_chunk_size = num_bytes as usize - (chunk_size * (num_chunks - 1));
+        buf.resize(last_chunk_size, 0);
+
+        reader.read_exact(&mut buf)?;
+        writers[i].write_all(&buf)?;
+
+        Ok(())
+    }
+    .map_err_context(|| "I/O error".to_string())
+}
+
 fn split(settings: Settings) -> UResult<()> {
     let mut reader = BufReader::new(if settings.input == "-" {
         Box::new(stdin()) as Box<dyn Read>
@@ -357,17 +467,22 @@ fn split(settings: Settings) -> UResult<()> {
         Box::new(r) as Box<dyn Read>
     });
 
+    if let Strategy::Number(num_chunks) = settings.strategy {
+        return split_into_n_chunks_by_byte(&settings, &mut reader, num_chunks);
+    }
+
     let mut splitter: Box<dyn Splitter> = match settings.strategy {
         Strategy::Lines(chunk_size) => Box::new(LineSplitter::new(chunk_size)),
         Strategy::Bytes(chunk_size) | Strategy::LineBytes(chunk_size) => {
             Box::new(ByteSplitter::new(chunk_size))
         }
+        Strategy::Number(_) => unreachable!("handled by the early return above"),
     };
 
     // This object is responsible for creating the filename for each chunk.
     let filename_factory = FilenameFactory::new(
-        settings.prefix,
-        settings.additional_suffix,
+        &settings.prefix,
+        &settings.additional_suffix,
         settings.suffix_length,
         settings.numeric_suffix,
     );
diff --git a/tests/by-util/test_split.rs b/tests/by-util/test_split.rs
index d55e13644..2005c0235 100644
--- a/tests/by-util/test_split.rs
+++ b/tests/by-util/test_split.rs
@@ -2,7 +2,7 @@
 // *
 // * For the full copyright and license information, please view the LICENSE
 // * file that was distributed with this source code.
-// spell-checker:ignore xzaaa sixhundredfiftyonebytes ninetyonebytes asciilowercase
+// spell-checker:ignore xzaaa sixhundredfiftyonebytes ninetyonebytes asciilowercase fghij klmno pqrst uvwxyz
 extern crate rand;
 extern crate regex;
 
@@ -12,11 +12,10 @@ use crate::common::util::*;
 use rand::SeedableRng;
 #[cfg(not(windows))]
 use std::env;
-use std::io::Write;
 use std::path::Path;
 use std::{
     fs::{read_dir, File},
-    io::{BufWriter, Read},
+    io::{BufWriter, Read, Write},
 };
 
 fn random_chars(n: usize) -> String {
@@ -425,3 +424,19 @@ creating file 'xaf'
 ",
     );
 }
+
+#[test]
+fn test_number() {
+    let (at, mut ucmd) = at_and_ucmd!();
+    let file_read = |f| {
+        let mut s = String::new();
+        at.open(f).read_to_string(&mut s).unwrap();
+        s
+    };
+    ucmd.args(&["-n", "5", "asciilowercase.txt"]).succeeds();
+    assert_eq!(file_read("xaa"), "abcde");
+    assert_eq!(file_read("xab"), "fghij");
+    assert_eq!(file_read("xac"), "klmno");
+    assert_eq!(file_read("xad"), "pqrst");
+    assert_eq!(file_read("xae"), "uvwxyz");
+}
diff --git a/tests/fixtures/split/asciilowercase.txt b/tests/fixtures/split/asciilowercase.txt
index b0883f382..e85d5b452 100644
--- a/tests/fixtures/split/asciilowercase.txt
+++ b/tests/fixtures/split/asciilowercase.txt
@@ -1 +1 @@
-abcdefghijklmnopqrstuvwxyz
+abcdefghijklmnopqrstuvwxyz
\ No newline at end of file