1
Fork 0
mirror of https://github.com/RGBCube/uutils-coreutils synced 2025-07-28 03:27:44 +00:00

split: use ByteChunkWriter and LineChunkWriter

Replace `ByteSplitter` and `LineSplitter` with `ByteChunkWriter` and
`LineChunkWriter` respectively. This results in a more maintainable
design and an increase in the speed of splitting by lines.
This commit is contained in:
Jeffrey Finkelstein 2021-12-30 20:11:03 -05:00
parent ca7af808d5
commit 1d7e1b8732
3 changed files with 65 additions and 59 deletions

View file

@ -16,13 +16,14 @@ use clap::{crate_version, App, AppSettings, Arg, ArgMatches};
use std::convert::TryFrom; use std::convert::TryFrom;
use std::env; use std::env;
use std::fmt; use std::fmt;
use std::fs::{metadata, remove_file, File}; use std::fs::{metadata, File};
use std::io::{stdin, BufRead, BufReader, BufWriter, ErrorKind, Read, Write}; use std::io::{stdin, BufRead, BufReader, BufWriter, ErrorKind, Read, Write};
use std::num::ParseIntError; use std::num::ParseIntError;
use std::path::Path; use std::path::Path;
use uucore::display::Quotable; use uucore::display::Quotable;
use uucore::error::{FromIo, UResult, USimpleError, UUsageError}; use uucore::error::{FromIo, UIoError, UResult, USimpleError, UUsageError};
use uucore::parse_size::{parse_size, ParseSizeError}; use uucore::parse_size::{parse_size, ParseSizeError};
use uucore::uio_error;
static OPT_BYTES: &str = "bytes"; static OPT_BYTES: &str = "bytes";
static OPT_LINE_BYTES: &str = "line-bytes"; static OPT_LINE_BYTES: &str = "line-bytes";
@ -739,65 +740,47 @@ fn split(settings: &Settings) -> UResult<()> {
Box::new(r) as Box<dyn Read> Box::new(r) as Box<dyn Read>
}); });
if let Strategy::Number(num_chunks) = settings.strategy { match settings.strategy {
return split_into_n_chunks_by_byte(settings, &mut reader, num_chunks); Strategy::Number(num_chunks) => {
split_into_n_chunks_by_byte(settings, &mut reader, num_chunks)
} }
Strategy::Lines(chunk_size) => {
let mut splitter: Box<dyn Splitter> = match settings.strategy { let mut writer = LineChunkWriter::new(chunk_size, settings)
Strategy::Lines(chunk_size) => Box::new(LineSplitter::new(chunk_size)),
Strategy::Bytes(chunk_size) | Strategy::LineBytes(chunk_size) => {
Box::new(ByteSplitter::new(chunk_size))
}
_ => unreachable!(),
};
// This object is responsible for creating the filename for each chunk.
let mut filename_iterator = FilenameIterator::new(
&settings.prefix,
&settings.additional_suffix,
settings.suffix_length,
settings.numeric_suffix,
);
loop {
// Get a new part file set up, and construct `writer` for it.
let filename = filename_iterator
.next()
.ok_or_else(|| USimpleError::new(1, "output file suffixes exhausted"))?; .ok_or_else(|| USimpleError::new(1, "output file suffixes exhausted"))?;
let mut writer = platform::instantiate_current_writer(&settings.filter, filename.as_str()); match std::io::copy(&mut reader, &mut writer) {
Ok(_) => Ok(()),
let bytes_consumed = splitter Err(e) => match e.kind() {
.consume(&mut reader, &mut writer) // TODO Since the writer object controls the creation of
.map_err_context(|| "input/output error".to_string())?; // new files, we need to rely on the `std::io::Result`
writer // returned by its `write()` method to communicate any
.flush() // errors to this calling scope. If a new file cannot be
.map_err_context(|| "error flushing to output file".to_string())?; // created because we have exceeded the number of
// allowable filenames, we use `ErrorKind::Other` to
// If we didn't write anything we should clean up the empty file, and // indicate that. A special error message needs to be
// break from the loop. // printed in that case.
if bytes_consumed == 0 { ErrorKind::Other => Err(USimpleError::new(1, "output file suffixes exhausted")),
// The output file is only ever created if --filter isn't used. _ => Err(uio_error!(e, "input/output error")),
// Complicated, I know... },
if settings.filter.is_none() { }
remove_file(filename) }
.map_err_context(|| "error removing empty file".to_string())?; Strategy::Bytes(chunk_size) | Strategy::LineBytes(chunk_size) => {
} let mut writer = ByteChunkWriter::new(chunk_size, settings)
break; .ok_or_else(|| USimpleError::new(1, "output file suffixes exhausted"))?;
} match std::io::copy(&mut reader, &mut writer) {
Ok(_) => Ok(()),
// TODO It is silly to have the "creating file" message here Err(e) => match e.kind() {
// after the file has been already created. However, because // TODO Since the writer object controls the creation of
// of the way the main loop has been written, an extra file // new files, we need to rely on the `std::io::Result`
// gets created and then deleted in the last iteration of the // returned by its `write()` method to communicate any
// loop. So we need to make sure we are not in that case when // errors to this calling scope. If a new file cannot be
// printing this message. // created because we have exceeded the number of
// // allowable filenames, we use `ErrorKind::Other` to
// This is only here temporarily while we make some // indicate that. A special error message needs to be
// improvements to the architecture of the main loop in this // printed in that case.
// function. In the future, it will move to a more appropriate ErrorKind::Other => Err(USimpleError::new(1, "output file suffixes exhausted")),
// place---at the point where the file is actually created. _ => Err(uio_error!(e, "input/output error")),
if settings.verbose { },
println!("creating file {}", filename.quote()); }
} }
} }
Ok(())
} }

View file

@ -2,7 +2,7 @@
// * // *
// * For the full copyright and license information, please view the LICENSE // * For the full copyright and license information, please view the LICENSE
// * file that was distributed with this source code. // * file that was distributed with this source code.
// spell-checker:ignore xzaaa sixhundredfiftyonebytes ninetyonebytes asciilowercase fghij klmno pqrst uvwxyz // spell-checker:ignore xzaaa sixhundredfiftyonebytes ninetyonebytes asciilowercase fghij klmno pqrst uvwxyz fivelines
extern crate rand; extern crate rand;
extern crate regex; extern crate regex;
@ -449,3 +449,21 @@ fn test_invalid_suffix_length() {
.no_stdout() .no_stdout()
.stderr_contains("invalid suffix length: 'xyz'"); .stderr_contains("invalid suffix length: 'xyz'");
} }
/// Splitting by lines must keep each line's trailing newline in the output
/// chunks.
///
/// Runs `split -l 2 fivelines.txt` and checks the contents of the three
/// resulting chunk files `xaa`, `xab` and `xac`.
#[test]
fn test_include_newlines() {
    let (at, mut ucmd) = at_and_ucmd!();
    ucmd.args(&["-l", "2", "fivelines.txt"]).succeeds();

    // Each entry pairs an output chunk with the exact text it must contain;
    // the last chunk holds the single leftover line.
    let expected_chunks = [("xaa", "1\n2\n"), ("xab", "3\n4\n"), ("xac", "5\n")];

    for &(chunk_name, expected) in &expected_chunks {
        let mut contents = String::new();
        at.open(chunk_name).read_to_string(&mut contents).unwrap();
        assert_eq!(contents, expected);
    }
}

5
tests/fixtures/split/fivelines.txt vendored Normal file
View file

@ -0,0 +1,5 @@
1
2
3
4
5