diff --git a/src/uu/split/src/split.rs b/src/uu/split/src/split.rs index 445c1f205..128ef73c6 100644 --- a/src/uu/split/src/split.rs +++ b/src/uu/split/src/split.rs @@ -13,11 +13,11 @@ extern crate uucore; mod platform; use clap::{App, Arg}; -use std::char; use std::env; use std::fs::File; -use std::io::{stdin, stdout, BufRead, BufReader, BufWriter, Read, Write}; +use std::io::{stdin, BufRead, BufReader, BufWriter, Read, Write}; use std::path::Path; +use std::{char, fs::remove_file}; static NAME: &str = "split"; static VERSION: &str = env!("CARGO_PKG_VERSION"); @@ -213,50 +213,65 @@ struct Settings { verbose: bool, } -struct SplitControl { - current_line: String, // Don't touch - request_new_file: bool, // Splitter implementation requests new file -} - trait Splitter { - // Consume the current_line and return the consumed string - fn consume(&mut self, _: &mut SplitControl) -> String; + // Consume as much as possible from `reader` so as to saturate `writer`. + // Equivalent to finishing one of the part files. Returns the number of + // bytes that have been moved. + fn consume( + &mut self, + reader: &mut BufReader>, + writer: &mut BufWriter>, + ) -> usize; } struct LineSplitter { - saved_lines_to_write: usize, - lines_to_write: usize, + lines_per_split: usize, } impl LineSplitter { fn new(settings: &Settings) -> LineSplitter { - let n = match settings.strategy_param.parse() { - Ok(a) => a, - Err(e) => crash!(1, "invalid number of lines: {}", e), - }; LineSplitter { - saved_lines_to_write: n, - lines_to_write: n, + lines_per_split: settings + .strategy_param + .parse() + .unwrap_or_else(|e| crash!(1, "invalid number of lines: {}", e)), } } } impl Splitter for LineSplitter { - fn consume(&mut self, control: &mut SplitControl) -> String { - self.lines_to_write -= 1; - if self.lines_to_write == 0 { - self.lines_to_write = self.saved_lines_to_write; - control.request_new_file = true; + fn consume( + &mut self, + reader: &mut BufReader>, + writer: &mut BufWriter>, + ) -> usize { + let mut bytes_consumed = 0usize; + let mut buffer = String::with_capacity(1024); + for _ in 0..self.lines_per_split { + let bytes_read = reader + .read_line(&mut buffer) + .unwrap_or_else(|_| crash!(1, "error reading bytes from input file")); + // If we ever read 0 bytes then we know we've hit EOF. + if bytes_read == 0 { + return bytes_consumed; + } + + writer + .write_all(buffer.as_bytes()) + .unwrap_or_else(|_| crash!(1, "error writing bytes to output file")); + // Empty out the String buffer since `read_line` appends instead of + // replaces. + buffer.clear(); + + bytes_consumed += bytes_read; } - control.current_line.clone() + + bytes_consumed } } struct ByteSplitter { - saved_bytes_to_write: usize, - bytes_to_write: usize, - break_on_line_end: bool, - require_whole_line: bool, + bytes_per_split: usize, } impl ByteSplitter { @@ -294,36 +309,44 @@ impl ByteSplitter { // Try to parse the actual numeral. let n = &settings.strategy_param[0..(settings.strategy_param.len() - suffix.len())] .parse::() - .unwrap_or_else(|_| crash!(1, "invalid number of bytes")); + .unwrap_or_else(|e| crash!(1, "invalid number of bytes: {}", e)); ByteSplitter { - saved_bytes_to_write: n * multiplier, - bytes_to_write: n * multiplier, - break_on_line_end: settings.strategy == "b", - require_whole_line: false, + bytes_per_split: n * multiplier, } } } impl Splitter for ByteSplitter { - fn consume(&mut self, control: &mut SplitControl) -> String { - let line = control.current_line.clone(); - let n = std::cmp::min(line.chars().count(), self.bytes_to_write); - if self.require_whole_line && n < line.chars().count() { - self.bytes_to_write = self.saved_bytes_to_write; - control.request_new_file = true; - self.require_whole_line = false; - return "".to_owned(); + fn consume( + &mut self, + reader: &mut BufReader>, + writer: &mut BufWriter>, + ) -> usize { + // We buffer reads and writes. We proceed until `bytes_consumed` is + // equal to `self.bytes_per_split` or we reach EOF. + let mut bytes_consumed = 0usize; + const BUFFER_SIZE: usize = 1024; + let mut buffer = [0u8; BUFFER_SIZE]; + while bytes_consumed < self.bytes_per_split { + // Don't overshoot `self.bytes_per_split`! + let bytes_desired = std::cmp::min(BUFFER_SIZE, self.bytes_per_split - bytes_consumed); + let bytes_read = reader + .read(&mut buffer[0..bytes_desired]) + .unwrap_or_else(|_| crash!(1, "error reading bytes from input file")); + // If we ever read 0 bytes then we know we've hit EOF. + if bytes_read == 0 { + return bytes_consumed; + } + + writer + .write_all(&buffer[0..bytes_read]) + .unwrap_or_else(|_| crash!(1, "error writing bytes to output file")); + + bytes_consumed += bytes_read; } - self.bytes_to_write -= n; - if n == 0 { - self.bytes_to_write = self.saved_bytes_to_write; - control.request_new_file = true; - } - if self.break_on_line_end && n == line.chars().count() { - self.require_whole_line = self.break_on_line_end; - } - line[..n].to_owned() + + bytes_consumed } } @@ -363,14 +386,13 @@ fn split(settings: &Settings) -> i32 { let mut reader = BufReader::new(if settings.input == "-" { Box::new(stdin()) as Box } else { - let r = match File::open(Path::new(&settings.input)) { - Ok(a) => a, - Err(_) => crash!( + let r = File::open(Path::new(&settings.input)).unwrap_or_else(|_| { + crash!( 1, "cannot open '{}' for reading: No such file or directory", settings.input - ), - }; + ) + }); Box::new(r) as Box }); @@ -380,48 +402,39 @@ fn split(settings: &Settings) -> i32 { a => crash!(1, "strategy {} not supported", a), }; - let mut control = SplitControl { - current_line: "".to_owned(), // Request new line - request_new_file: true, // Request new file - }; - - let mut writer = BufWriter::new(Box::new(stdout()) as Box); let mut fileno = 0; loop { - if control.current_line.chars().count() == 0 { - match reader.read_line(&mut control.current_line) { - Ok(0) | Err(_) => break, - _ => {} + // Get a new part file set up, and construct `writer` for it. + let mut filename = settings.prefix.clone(); + filename.push_str( + if settings.numeric_suffix { + num_prefix(fileno, settings.suffix_length) + } else { + str_prefix(fileno, settings.suffix_length) } - } - if control.request_new_file { - let mut filename = settings.prefix.clone(); - filename.push_str( - if settings.numeric_suffix { - num_prefix(fileno, settings.suffix_length) - } else { - str_prefix(fileno, settings.suffix_length) - } - .as_ref(), - ); - filename.push_str(settings.additional_suffix.as_ref()); + .as_ref(), + ); + filename.push_str(settings.additional_suffix.as_ref()); + let mut writer = platform::instantiate_current_writer(&settings.filter, filename.as_str()); - crash_if_err!(1, writer.flush()); - fileno += 1; - writer = platform::instantiate_current_writer(&settings.filter, filename.as_str()); - control.request_new_file = false; - if settings.verbose { - println!("creating file '{}'", filename); + let bytes_consumed = splitter.consume(&mut reader, &mut writer); + writer + .flush() + .unwrap_or_else(|e| crash!(1, "error flushing to output file: {}", e)); + + // If we didn't write anything we should clean up the empty file, and + // break from the loop. + if bytes_consumed == 0 { + // The output file is only ever created if filter's aren't used. + // Complicated, I know... + if settings.filter.is_none() { + remove_file(filename) + .unwrap_or_else(|e| crash!(1, "error removing empty file: {}", e)); } + break; } - let consumed = splitter.consume(&mut control); - crash_if_err!(1, writer.write_all(consumed.as_bytes())); - - let advance = consumed.chars().count(); - let clone = control.current_line.clone(); - let sl = clone; - control.current_line = sl[advance..sl.chars().count()].to_owned(); + fileno += 1; } 0 } diff --git a/tests/by-util/test_split.rs b/tests/by-util/test_split.rs index 521cbbe9a..37856f419 100644 --- a/tests/by-util/test_split.rs +++ b/tests/by-util/test_split.rs @@ -4,11 +4,15 @@ extern crate regex; use self::rand::{thread_rng, Rng}; use self::regex::Regex; use crate::common::util::*; +use rand::SeedableRng; #[cfg(not(windows))] use std::env; -use std::fs::{read_dir, File}; use std::io::Write; use std::path::Path; +use std::{ + fs::{read_dir, File}, + io::BufWriter, +}; fn random_chars(n: usize) -> String { thread_rng() @@ -58,7 +62,7 @@ impl Glob { files.sort(); let mut data: Vec = vec![]; for name in &files { - data.extend(self.directory.read(name).into_bytes()); + data.extend(self.directory.read_bytes(name)); } data } @@ -81,20 +85,30 @@ impl RandomFile { } fn add_bytes(&mut self, bytes: usize) { - let chunk_size: usize = if bytes >= 1024 { 1024 } else { bytes }; - let mut n = bytes; - while n > chunk_size { - let _ = write!(self.inner, "{}", random_chars(chunk_size)); - n -= chunk_size; + // Note that just writing random characters isn't enough to cover all + // cases. We need truly random bytes. + let mut writer = BufWriter::new(&self.inner); + + // Seed the rng so as to avoid spurious test failures. + let mut rng = rand::rngs::StdRng::seed_from_u64(123); + let mut buffer = [0; 1024]; + let mut remaining_size = bytes; + + while remaining_size > 0 { + let to_write = std::cmp::min(remaining_size, buffer.len()); + let buf = &mut buffer[..to_write]; + rng.fill(buf); + writer.write(buf).unwrap(); + + remaining_size -= to_write; } - let _ = write!(self.inner, "{}", random_chars(n)); } /// Add n lines each of size `RandomFile::LINESIZE` fn add_lines(&mut self, lines: usize) { let mut n = lines; while n > 0 { - let _ = writeln!(self.inner, "{}", random_chars(RandomFile::LINESIZE)); + writeln!(self.inner, "{}", random_chars(RandomFile::LINESIZE)).unwrap(); n -= 1; } } @@ -104,18 +118,18 @@ impl RandomFile { fn test_split_default() { let (at, mut ucmd) = at_and_ucmd!(); let name = "split_default"; - let glob = Glob::new(&at, ".", r"x[[:alpha:]][[:alpha:]]$"); RandomFile::new(&at, name).add_lines(2000); ucmd.args(&[name]).succeeds(); + + let glob = Glob::new(&at, ".", r"x[[:alpha:]][[:alpha:]]$"); assert_eq!(glob.count(), 2); - assert_eq!(glob.collate(), at.read(name).into_bytes()); + assert_eq!(glob.collate(), at.read_bytes(name)); } #[test] fn test_split_numeric_prefixed_chunks_by_bytes() { let (at, mut ucmd) = at_and_ucmd!(); let name = "split_num_prefixed_chunks_by_bytes"; - let glob = Glob::new(&at, ".", r"a\d\d$"); RandomFile::new(&at, name).add_bytes(10000); ucmd.args(&[ "-d", // --numeric-suffixes @@ -123,52 +137,86 @@ fn test_split_numeric_prefixed_chunks_by_bytes() { "1000", name, "a", ]) .succeeds(); + + let glob = Glob::new(&at, ".", r"a\d\d$"); assert_eq!(glob.count(), 10); - assert_eq!(glob.collate(), at.read(name).into_bytes()); + for filename in glob.collect() { + assert_eq!(glob.directory.metadata(&filename).len(), 1000); + } + assert_eq!(glob.collate(), at.read_bytes(name)); } #[test] fn test_split_str_prefixed_chunks_by_bytes() { let (at, mut ucmd) = at_and_ucmd!(); let name = "split_str_prefixed_chunks_by_bytes"; - let glob = Glob::new(&at, ".", r"b[[:alpha:]][[:alpha:]]$"); RandomFile::new(&at, name).add_bytes(10000); + // Important that this is less than 1024 since that's our internal buffer + // size. Good to test that we don't overshoot. ucmd.args(&["-b", "1000", name, "b"]).succeeds(); + + let glob = Glob::new(&at, ".", r"b[[:alpha:]][[:alpha:]]$"); assert_eq!(glob.count(), 10); - assert_eq!(glob.collate(), at.read(name).into_bytes()); + for filename in glob.collect() { + assert_eq!(glob.directory.metadata(&filename).len(), 1000); + } + assert_eq!(glob.collate(), at.read_bytes(name)); +} + +// This is designed to test what happens when the desired part size is not a +// multiple of the buffer size and we hopefully don't overshoot the desired part +// size. +#[test] +fn test_split_bytes_prime_part_size() { + let (at, mut ucmd) = at_and_ucmd!(); + let name = "test_split_bytes_prime_part_size"; + RandomFile::new(&at, name).add_bytes(10000); + // 1753 is prime and greater than the buffer size, 1024. + ucmd.args(&["-b", "1753", name, "b"]).succeeds(); + + let glob = Glob::new(&at, ".", r"b[[:alpha:]][[:alpha:]]$"); + assert_eq!(glob.count(), 6); + for i in 0..5 { + assert_eq!(glob.directory.metadata(&glob.collect()[i]).len(), 1753); + } + assert_eq!(glob.directory.metadata(&glob.collect()[5]).len(), 1235); + assert_eq!(glob.collate(), at.read_bytes(name)); } #[test] fn test_split_num_prefixed_chunks_by_lines() { let (at, mut ucmd) = at_and_ucmd!(); let name = "split_num_prefixed_chunks_by_lines"; - let glob = Glob::new(&at, ".", r"c\d\d$"); RandomFile::new(&at, name).add_lines(10000); ucmd.args(&["-d", "-l", "1000", name, "c"]).succeeds(); + + let glob = Glob::new(&at, ".", r"c\d\d$"); assert_eq!(glob.count(), 10); - assert_eq!(glob.collate(), at.read(name).into_bytes()); + assert_eq!(glob.collate(), at.read_bytes(name)); } #[test] fn test_split_str_prefixed_chunks_by_lines() { let (at, mut ucmd) = at_and_ucmd!(); let name = "split_str_prefixed_chunks_by_lines"; - let glob = Glob::new(&at, ".", r"d[[:alpha:]][[:alpha:]]$"); RandomFile::new(&at, name).add_lines(10000); ucmd.args(&["-l", "1000", name, "d"]).succeeds(); + + let glob = Glob::new(&at, ".", r"d[[:alpha:]][[:alpha:]]$"); assert_eq!(glob.count(), 10); - assert_eq!(glob.collate(), at.read(name).into_bytes()); + assert_eq!(glob.collate(), at.read_bytes(name)); } #[test] fn test_split_additional_suffix() { let (at, mut ucmd) = at_and_ucmd!(); let name = "split_additional_suffix"; - let glob = Glob::new(&at, ".", r"x[[:alpha:]][[:alpha:]].txt$"); RandomFile::new(&at, name).add_lines(2000); ucmd.args(&["--additional-suffix", ".txt", name]).succeeds(); + + let glob = Glob::new(&at, ".", r"x[[:alpha:]][[:alpha:]].txt$"); assert_eq!(glob.count(), 2); - assert_eq!(glob.collate(), at.read(name).into_bytes()); + assert_eq!(glob.collate(), at.read_bytes(name)); } // note: the test_filter* tests below are unix-only @@ -182,15 +230,16 @@ fn test_filter() { // like `test_split_default()` but run a command before writing let (at, mut ucmd) = at_and_ucmd!(); let name = "filtered"; - let glob = Glob::new(&at, ".", r"x[[:alpha:]][[:alpha:]]$"); let n_lines = 3; RandomFile::new(&at, name).add_lines(n_lines); // change all characters to 'i' ucmd.args(&["--filter=sed s/./i/g > $FILE", name]) .succeeds(); + // assert all characters are 'i' / no character is not 'i' // (assert that command succeded) + let glob = Glob::new(&at, ".", r"x[[:alpha:]][[:alpha:]]$"); assert!( glob.collate().iter().find(|&&c| { // is not i @@ -209,7 +258,6 @@ fn test_filter_with_env_var_set() { // implemented like `test_split_default()` but run a command before writing let (at, mut ucmd) = at_and_ucmd!(); let name = "filtered"; - let glob = Glob::new(&at, ".", r"x[[:alpha:]][[:alpha:]]$"); let n_lines = 3; RandomFile::new(&at, name).add_lines(n_lines); @@ -217,7 +265,9 @@ fn test_filter_with_env_var_set() { env::set_var("FILE", &env_var_value); ucmd.args(&[format!("--filter={}", "cat > $FILE").as_str(), name]) .succeeds(); - assert_eq!(glob.collate(), at.read(name).into_bytes()); + + let glob = Glob::new(&at, ".", r"x[[:alpha:]][[:alpha:]]$"); + assert_eq!(glob.collate(), at.read_bytes(name)); assert!(env::var("FILE").unwrap_or("var was unset".to_owned()) == env_var_value); }