split: use ByteChunkWriter and LineChunkWriter

Replace `ByteSplitter` and `LineSplitter` with `ByteChunkWriter` and `LineChunkWriter` respectively. This results in a more maintainable design and an increase in the speed of splitting by lines.
2025-09-14 11:07:59 +00:00 · 2021-12-30 20:11:03 -05:00 · 2021-12-30 20:11:03 -05:00 · 1d7e1b8732
commit 1d7e1b8732
parent ca7af808d5
3 changed files with 65 additions and 59 deletions
--- a/src/uu/split/src/split.rs
+++ b/src/uu/split/src/split.rs
@ -16,13 +16,14 @@ use clap::{crate_version, App, AppSettings, Arg, ArgMatches};
 use std::convert::TryFrom;
 use std::env;
 use std::fmt;
-use std::fs::{metadata, remove_file, File};
+use std::fs::{metadata, File};
 use std::io::{stdin, BufRead, BufReader, BufWriter, ErrorKind, Read, Write};
 use std::num::ParseIntError;
 use std::path::Path;
 use uucore::display::Quotable;
-use uucore::error::{FromIo, UResult, USimpleError, UUsageError};
+use uucore::error::{FromIo, UIoError, UResult, USimpleError, UUsageError};
 use uucore::parse_size::{parse_size, ParseSizeError};
 use uucore::uio_error;
 static OPT_BYTES: &str = "bytes";
 static OPT_LINE_BYTES: &str = "line-bytes";
@ -739,65 +740,47 @@ fn split(settings: &Settings) -> UResult<()> {
        Box::new(r) as Box<dyn Read>
    });
-    if let Strategy::Number(num_chunks) = settings.strategy {
+    match settings.strategy {
-        return split_into_n_chunks_by_byte(settings, &mut reader, num_chunks);
+        Strategy::Number(num_chunks) => {
-    }
+            split_into_n_chunks_by_byte(settings, &mut reader, num_chunks)
    let mut splitter: Box<dyn Splitter> = match settings.strategy {
        Strategy::Lines(chunk_size) => Box::new(LineSplitter::new(chunk_size)),
        Strategy::Bytes(chunk_size) | Strategy::LineBytes(chunk_size) => {
            Box::new(ByteSplitter::new(chunk_size))
        }
-        _ => unreachable!(),
+        Strategy::Lines(chunk_size) => {
-    };
+            let mut writer = LineChunkWriter::new(chunk_size, settings)
-
+                .ok_or_else(|| USimpleError::new(1, "output file suffixes exhausted"))?;
-    // This object is responsible for creating the filename for each chunk.
+            match std::io::copy(&mut reader, &mut writer) {
-    let mut filename_iterator = FilenameIterator::new(
+                Ok(_) => Ok(()),
-        &settings.prefix,
+                Err(e) => match e.kind() {
-        &settings.additional_suffix,
+                    // TODO Since the writer object controls the creation of
-        settings.suffix_length,
+                    // new files, we need to rely on the `std::io::Result`
-        settings.numeric_suffix,
+                    // returned by its `write()` method to communicate any
-    );
+                    // errors to this calling scope. If a new file cannot be
-    loop {
+                    // created because we have exceeded the number of
-        // Get a new part file set up, and construct `writer` for it.
+                    // allowable filenames, we use `ErrorKind::Other` to
-        let filename = filename_iterator
+                    // indicate that. A special error message needs to be
-            .next()
+                    // printed in that case.
-            .ok_or_else(|| USimpleError::new(1, "output file suffixes exhausted"))?;
+                    ErrorKind::Other => Err(USimpleError::new(1, "output file suffixes exhausted")),
-        let mut writer = platform::instantiate_current_writer(&settings.filter, filename.as_str());
+                    _ => Err(uio_error!(e, "input/output error")),
-
+                },
        let bytes_consumed = splitter
            .consume(&mut reader, &mut writer)
            .map_err_context(|| "input/output error".to_string())?;
        writer
            .flush()
            .map_err_context(|| "error flushing to output file".to_string())?;
        // If we didn't write anything we should clean up the empty file, and
        // break from the loop.
        if bytes_consumed == 0 {
            // The output file is only ever created if --filter isn't used.
            // Complicated, I know...
            if settings.filter.is_none() {
                remove_file(filename)
                    .map_err_context(|| "error removing empty file".to_string())?;
            }
            break;
        }
-
+        Strategy::Bytes(chunk_size) | Strategy::LineBytes(chunk_size) => {
-        // TODO It is silly to have the "creating file" message here
+            let mut writer = ByteChunkWriter::new(chunk_size, settings)
-        // after the file has been already created. However, because
+                .ok_or_else(|| USimpleError::new(1, "output file suffixes exhausted"))?;
-        // of the way the main loop has been written, an extra file
+            match std::io::copy(&mut reader, &mut writer) {
-        // gets created and then deleted in the last iteration of the
+                Ok(_) => Ok(()),
-        // loop. So we need to make sure we are not in that case when
+                Err(e) => match e.kind() {
-        // printing this message.
+                    // TODO Since the writer object controls the creation of
-        //
+                    // new files, we need to rely on the `std::io::Result`
-        // This is only here temporarily while we make some
+                    // returned by its `write()` method to communicate any
-        // improvements to the architecture of the main loop in this
+                    // errors to this calling scope. If a new file cannot be
-        // function. In the future, it will move to a more appropriate
+                    // created because we have exceeded the number of
-        // place---at the point where the file is actually created.
+                    // allowable filenames, we use `ErrorKind::Other` to
-        if settings.verbose {
+                    // indicate that. A special error message needs to be
-            println!("creating file {}", filename.quote());
+                    // printed in that case.
                    ErrorKind::Other => Err(USimpleError::new(1, "output file suffixes exhausted")),
                    _ => Err(uio_error!(e, "input/output error")),
                },
            }
        }
    }
    Ok(())
 }
--- a/tests/by-util/test_split.rs
+++ b/tests/by-util/test_split.rs
@ -2,7 +2,7 @@
 //  *
 //  * For the full copyright and license information, please view the LICENSE
 //  * file that was distributed with this source code.
-// spell-checker:ignore xzaaa sixhundredfiftyonebytes ninetyonebytes asciilowercase fghij klmno pqrst uvwxyz
+// spell-checker:ignore xzaaa sixhundredfiftyonebytes ninetyonebytes asciilowercase fghij klmno pqrst uvwxyz fivelines
 extern crate rand;
 extern crate regex;
@ -449,3 +449,21 @@ fn test_invalid_suffix_length() {
        .no_stdout()
        .stderr_contains("invalid suffix length: 'xyz'");
 }
 // Checks that splitting by lines (`-l 2`) keeps each line's trailing newline
 // in the output chunks. The fixture `fivelines.txt` holds the lines 1 through
 // 5, so the expected result is three files: two full chunks of two lines and
 // a final chunk with the single remaining line.
 #[test]
 fn test_include_newlines() {
    let (at, mut ucmd) = at_and_ucmd!();
    // Run the command with two lines per chunk; it must exit successfully.
    ucmd.args(&["-l", "2", "fivelines.txt"]).succeeds();
    // First chunk: lines 1 and 2, each still terminated by "\n".
    let mut s = String::new();
    at.open("xaa").read_to_string(&mut s).unwrap();
    assert_eq!(s, "1\n2\n");
    // Second chunk: lines 3 and 4.
    let mut s = String::new();
    at.open("xab").read_to_string(&mut s).unwrap();
    assert_eq!(s, "3\n4\n");
    // Final chunk: the single leftover line, newline included.
    let mut s = String::new();
    at.open("xac").read_to_string(&mut s).unwrap();
    assert_eq!(s, "5\n");
 }
--- a/tests/fixtures/split/fivelines.txt
+++ b/tests/fixtures/split/fivelines.txt
@ -0,0 +1,5 @@
 1
 2
 3
 4
 5