diff --git a/src/uu/split/split.md b/src/uu/split/split.md index d3a481fd3..836e3a0c6 100644 --- a/src/uu/split/split.md +++ b/src/uu/split/split.md @@ -11,3 +11,16 @@ Create output files containing consecutive or interleaved sections of input ## After Help Output fixed-size pieces of INPUT to PREFIXaa, PREFIXab, ...; default size is 1000, and default PREFIX is 'x'. With no INPUT, or when INPUT is -, read standard input. + +The SIZE argument is an integer and optional unit (example: 10K is 10*1024). +Units are K,M,G,T,P,E,Z,Y,R,Q (powers of 1024) or KB,MB,... (powers of 1000). +Binary prefixes can be used, too: KiB=K, MiB=M, and so on. + +CHUNKS may be: + +- N split into N files based on size of input +- K/N output Kth of N to stdout +- l/N split into N files without splitting lines/records +- l/K/N output Kth of N to stdout without splitting lines/records +- r/N like 'l' but use round robin distribution +- r/K/N likewise but only output Kth of N to stdout diff --git a/src/uu/split/src/split.rs b/src/uu/split/src/split.rs index a61c0e812..756248539 100644 --- a/src/uu/split/src/split.rs +++ b/src/uu/split/src/split.rs @@ -11,7 +11,7 @@ mod platform; use crate::filenames::FilenameIterator; use crate::filenames::SuffixType; -use clap::{crate_version, parser::ValueSource, Arg, ArgAction, ArgMatches, Command}; +use clap::{crate_version, parser::ValueSource, Arg, ArgAction, ArgMatches, Command, ValueHint}; use std::env; use std::ffi::OsString; use std::fmt; @@ -39,6 +39,7 @@ static OPT_HEX_SUFFIXES_SHORT: &str = "-x"; static OPT_SUFFIX_LENGTH: &str = "suffix-length"; static OPT_DEFAULT_SUFFIX_LENGTH: &str = "0"; static OPT_VERBOSE: &str = "verbose"; +static OPT_SEPARATOR: &str = "separator"; //The ---io and ---io-blksize parameters are consumed and ignored. //The parameter is included to make GNU coreutils tests pass. static OPT_IO: &str = "-io"; @@ -55,7 +56,6 @@ const AFTER_HELP: &str = help_section!("after help", "split.md"); #[uucore::main] pub fn uumain(args: impl uucore::Args) -> UResult<()> { let (args, obs_lines) = handle_obsolete(args); - let matches = uu_app().try_get_matches_from(args)?; match Settings::from(&matches, &obs_lines) { @@ -145,6 +145,7 @@ fn should_extract_obs_lines( && !slice.starts_with("-C") && !slice.starts_with("-l") && !slice.starts_with("-n") + && !slice.starts_with("-t") } /// Helper function to [`filter_args`] @@ -208,13 +209,18 @@ fn handle_preceding_options( || &slice[2..] == OPT_ADDITIONAL_SUFFIX || &slice[2..] == OPT_FILTER || &slice[2..] == OPT_NUMBER - || &slice[2..] == OPT_SUFFIX_LENGTH; + || &slice[2..] == OPT_SUFFIX_LENGTH + || &slice[2..] == OPT_SEPARATOR; } // capture if current slice is a preceding short option that requires value and does not have value in the same slice (value separated by whitespace) // following slice should be treaded as value for this option // even if it starts with '-' (which would be treated as hyphen prefixed value) - *preceding_short_opt_req_value = - slice == "-b" || slice == "-C" || slice == "-l" || slice == "-n" || slice == "-a"; + *preceding_short_opt_req_value = slice == "-b" + || slice == "-C" + || slice == "-l" + || slice == "-n" + || slice == "-a" + || slice == "-t"; // slice is a value // reset preceding option flags if !slice.starts_with('-') { @@ -278,7 +284,7 @@ pub fn uu_app() -> Command { .long(OPT_FILTER) .allow_hyphen_values(true) .value_name("COMMAND") - .value_hint(clap::ValueHint::CommandName) + .value_hint(ValueHint::CommandName) .help( "write to shell COMMAND; file name is $FILE (Currently not implemented for Windows)", ), @@ -293,7 +299,7 @@ pub fn uu_app() -> Command { .arg( Arg::new(OPT_NUMERIC_SUFFIXES_SHORT) .short('d') - .action(clap::ArgAction::SetTrue) + .action(ArgAction::SetTrue) .overrides_with_all([ OPT_NUMERIC_SUFFIXES, OPT_NUMERIC_SUFFIXES_SHORT, @@ -314,12 +320,13 @@ pub fn uu_app() -> Command { OPT_HEX_SUFFIXES, OPT_HEX_SUFFIXES_SHORT ]) + .value_name("FROM") .help("same as -d, but allow setting the start value"), ) .arg( Arg::new(OPT_HEX_SUFFIXES_SHORT) .short('x') - .action(clap::ArgAction::SetTrue) + .action(ArgAction::SetTrue) .overrides_with_all([ OPT_NUMERIC_SUFFIXES, OPT_NUMERIC_SUFFIXES_SHORT, @@ -340,6 +347,7 @@ pub fn uu_app() -> Command { OPT_HEX_SUFFIXES, OPT_HEX_SUFFIXES_SHORT ]) + .value_name("FROM") .help("same as -x, but allow setting the start value"), ) .arg( @@ -357,6 +365,15 @@ pub fn uu_app() -> Command { .help("print a diagnostic just before each output file is opened") .action(ArgAction::SetTrue), ) + .arg( + Arg::new(OPT_SEPARATOR) + .short('t') + .long(OPT_SEPARATOR) + .allow_hyphen_values(true) + .value_name("SEP") + .action(ArgAction::Append) + .help("use SEP instead of newline as the record separator; '\0' (zero) specifies the NUL character"), + ) .arg( Arg::new(OPT_IO) .long("io") @@ -372,7 +389,7 @@ pub fn uu_app() -> Command { .arg( Arg::new(ARG_INPUT) .default_value("-") - .value_hint(clap::ValueHint::FilePath), + .value_hint(ValueHint::FilePath), ) .arg( Arg::new(ARG_PREFIX) @@ -696,6 +713,7 @@ struct Settings { filter: Option, strategy: Strategy, verbose: bool, + separator: u8, /// Whether to *not* produce empty files when using `-n`. /// @@ -722,6 +740,12 @@ enum SettingsError { /// Suffix is not large enough to split into specified chunks SuffixTooSmall(usize), + /// Multi-character (Invalid) separator + MultiCharacterSeparator(String), + + /// Multiple different separator characters + MultipleSeparatorCharacters, + /// The `--filter` option is not supported on Windows. #[cfg(windows)] NotSupported, @@ -743,6 +767,12 @@ impl fmt::Display for SettingsError { Self::Strategy(e) => e.fmt(f), Self::SuffixNotParsable(s) => write!(f, "invalid suffix length: {}", s.quote()), Self::SuffixTooSmall(i) => write!(f, "the suffix length needs to be at least {i}"), + Self::MultiCharacterSeparator(s) => { + write!(f, "multi-character separator {}", s.quote()) + } + Self::MultipleSeparatorCharacters => { + write!(f, "multiple separator characters specified") + } Self::SuffixContainsSeparator(s) => write!( f, "invalid suffix {}, contains directory separator", @@ -783,6 +813,26 @@ impl Settings { } } } + + // Make sure that separator is only one UTF8 character (if specified) + // defaults to '\n' - newline character + // If the same separator (the same value) was used multiple times - `split` should NOT fail + // If the separator was used multiple times but with different values (not all values are the same) - `split` should fail + let separator = match matches.get_many::(OPT_SEPARATOR) { + Some(mut sep_values) => { + let first = sep_values.next().unwrap(); // it is safe to just unwrap here since Clap should not return empty ValuesRef<'_,String> in the option from get_many() call + if !sep_values.all(|s| s == first) { + return Err(SettingsError::MultipleSeparatorCharacters); + } + match first.as_str() { + "\\0" => b'\0', + s if s.as_bytes().len() == 1 => s.as_bytes()[0], + s => return Err(SettingsError::MultiCharacterSeparator(s.to_owned())), + } + } + None => b'\n', + }; + let result = Self { suffix_length: suffix_length_str .parse() @@ -791,6 +841,7 @@ impl Settings { suffix_start, additional_suffix, verbose: matches.value_source("verbose") == Some(ValueSource::CommandLine), + separator, strategy, input: matches.get_one::(ARG_INPUT).unwrap().to_owned(), prefix: matches.get_one::(ARG_PREFIX).unwrap().to_owned(), @@ -1019,7 +1070,8 @@ impl<'a> Write for LineChunkWriter<'a> { // corresponds to the current chunk number. let mut prev = 0; let mut total_bytes_written = 0; - for i in memchr::memchr_iter(b'\n', buf) { + let sep = self.settings.separator; + for i in memchr::memchr_iter(sep, buf) { // If we have exceeded the number of lines to write in the // current chunk, then start a new chunk and its // corresponding writer. @@ -1036,8 +1088,8 @@ impl<'a> Write for LineChunkWriter<'a> { } // Write the line, starting from *after* the previous - // newline character and ending *after* the current - // newline character. + // separator character and ending *after* the current + // separator character. let n = self.inner.write(&buf[prev..i + 1])?; total_bytes_written += n; prev = i + 1; @@ -1175,21 +1227,22 @@ impl<'a> Write for LineBytesChunkWriter<'a> { self.num_bytes_remaining_in_current_chunk = self.chunk_size.try_into().unwrap(); } - // Find the first newline character in the buffer. - match memchr::memchr(b'\n', buf) { - // If there is no newline character and the buffer is + // Find the first separator (default - newline character) in the buffer. + let sep = self.settings.separator; + match memchr::memchr(sep, buf) { + // If there is no separator character and the buffer is // not empty, then write as many bytes as we can and // then move on to the next chunk if necessary. None => { let end = self.num_bytes_remaining_in_current_chunk; // This is ugly but here to match GNU behavior. If the input - // doesn't end with a \n, pretend that it does for handling + // doesn't end with a separator, pretend that it does for handling // the second to last segment chunk. See `line-bytes.sh`. if end == buf.len() && self.num_bytes_remaining_in_current_chunk < self.chunk_size.try_into().unwrap() - && buf[buf.len() - 1] != b'\n' + && buf[buf.len() - 1] != sep { self.num_bytes_remaining_in_current_chunk = 0; } else { @@ -1200,8 +1253,8 @@ impl<'a> Write for LineBytesChunkWriter<'a> { } } - // If there is a newline character and the line - // (including the newline character) will fit in the + // If there is a separator character and the line + // (including the separator character) will fit in the // current chunk, then write the entire line and // continue to the next iteration. (See chunk 1 in the // example comment above.) @@ -1212,8 +1265,8 @@ impl<'a> Write for LineBytesChunkWriter<'a> { buf = &buf[num_bytes_written..]; } - // If there is a newline character, the line - // (including the newline character) will not fit in + // If there is a separator character, the line + // (including the separator character) will not fit in // the current chunk, *and* no other lines have been // written to the current chunk, then write as many // bytes as we can and continue to the next @@ -1230,8 +1283,8 @@ impl<'a> Write for LineBytesChunkWriter<'a> { buf = &buf[num_bytes_written..]; } - // If there is a newline character, the line - // (including the newline character) will not fit in + // If there is a separator character, the line + // (including the separator character) will not fit in // the current chunk, and at least one other line has // been written to the current chunk, then signal to // the next iteration that a new chunk needs to be @@ -1489,15 +1542,16 @@ where let mut num_bytes_remaining_in_current_chunk = chunk_size; let mut i = 0; - for line_result in reader.lines() { + let sep = settings.separator; + for line_result in reader.split(sep) { let line = line_result.unwrap(); let maybe_writer = writers.get_mut(i); let writer = maybe_writer.unwrap(); - let bytes = line.as_bytes(); + let bytes = line.as_slice(); writer.write_all(bytes)?; - writer.write_all(b"\n")?; + writer.write_all(&[sep])?; - // Add one byte for the newline character. + // Add one byte for the separator character. let num_bytes = bytes.len() + 1; if num_bytes > num_bytes_remaining_in_current_chunk { num_bytes_remaining_in_current_chunk = chunk_size; @@ -1546,15 +1600,16 @@ where let mut num_bytes_remaining_in_current_chunk = chunk_size; let mut i = 0; - for line_result in reader.lines() { + let sep = settings.separator; + for line_result in reader.split(sep) { let line = line_result?; - let bytes = line.as_bytes(); + let bytes = line.as_slice(); if i == chunk_number { writer.write_all(bytes)?; - writer.write_all(b"\n")?; + writer.write_all(&[sep])?; } - // Add one byte for the newline character. + // Add one byte for the separator character. let num_bytes = bytes.len() + 1; if num_bytes >= num_bytes_remaining_in_current_chunk { num_bytes_remaining_in_current_chunk = chunk_size; @@ -1601,13 +1656,14 @@ where } let num_chunks: usize = num_chunks.try_into().unwrap(); - for (i, line_result) in reader.lines().enumerate() { + let sep = settings.separator; + for (i, line_result) in reader.split(sep).enumerate() { let line = line_result.unwrap(); let maybe_writer = writers.get_mut(i % num_chunks); let writer = maybe_writer.unwrap(); - let bytes = line.as_bytes(); + let bytes = line.as_slice(); writer.write_all(bytes)?; - writer.write_all(b"\n")?; + writer.write_all(&[sep])?; } Ok(()) @@ -1632,7 +1688,7 @@ where /// * [`split_into_n_chunks_by_line_round_robin`], which splits its input in the /// same way, but writes each chunk to its own file. fn kth_chunk_by_line_round_robin( - _settings: &Settings, + settings: &Settings, reader: &mut R, chunk_number: u64, num_chunks: u64, @@ -1646,12 +1702,13 @@ where let num_chunks: usize = num_chunks.try_into().unwrap(); let chunk_number: usize = chunk_number.try_into().unwrap(); - for (i, line_result) in reader.lines().enumerate() { + let sep = settings.separator; + for (i, line_result) in reader.split(sep).enumerate() { let line = line_result?; - let bytes = line.as_bytes(); + let bytes = line.as_slice(); if (i % num_chunks) == chunk_number { writer.write_all(bytes)?; - writer.write_all(b"\n")?; + writer.write_all(&[sep])?; } } Ok(()) diff --git a/tests/by-util/test_split.rs b/tests/by-util/test_split.rs index 7c6d271e6..6fc3a3706 100644 --- a/tests/by-util/test_split.rs +++ b/tests/by-util/test_split.rs @@ -1483,3 +1483,242 @@ fn test_split_non_utf8_argument_windows() { .fails() .stderr_contains("error: invalid UTF-8 was detected in one or more arguments"); } + +// Test '--separator' / '-t' option following GNU tests example +// test separators: '\n' , '\0' , ';' +// test with '--lines=2' , '--line-bytes=4' , '--number=l/3' , '--number=r/3' , '--number=l/1/3' , '--number=r/1/3' +#[test] +fn test_split_separator_nl_lines() { + let (at, mut ucmd) = at_and_ucmd!(); + ucmd.args(&["--lines=2", "-t", "\n"]) + .pipe_in("1\n2\n3\n4\n5\n") + .succeeds(); + + assert_eq!(file_read(&at, "xaa"), "1\n2\n"); + assert_eq!(file_read(&at, "xab"), "3\n4\n"); + assert_eq!(file_read(&at, "xac"), "5\n"); + assert!(!at.plus("xad").exists()); +} + +#[test] +fn test_split_separator_nl_line_bytes() { + let (at, mut ucmd) = at_and_ucmd!(); + ucmd.args(&["--line-bytes=4", "-t", "\n"]) + .pipe_in("1\n2\n3\n4\n5\n") + .succeeds(); + + assert_eq!(file_read(&at, "xaa"), "1\n2\n"); + assert_eq!(file_read(&at, "xab"), "3\n4\n"); + assert_eq!(file_read(&at, "xac"), "5\n"); + assert!(!at.plus("xad").exists()); +} + +#[test] +fn test_split_separator_nl_number_l() { + let (at, mut ucmd) = at_and_ucmd!(); + ucmd.args(&["--number=l/3", "--separator=\n", "fivelines.txt"]) + .succeeds(); + + assert_eq!(file_read(&at, "xaa"), "1\n2\n"); + assert_eq!(file_read(&at, "xab"), "3\n4\n"); + assert_eq!(file_read(&at, "xac"), "5\n"); + assert!(!at.plus("xad").exists()); +} + +#[test] +fn test_split_separator_nl_number_r() { + let (at, mut ucmd) = at_and_ucmd!(); + ucmd.args(&["--number=r/3", "--separator", "\n", "fivelines.txt"]) + .succeeds(); + + assert_eq!(file_read(&at, "xaa"), "1\n4\n"); + assert_eq!(file_read(&at, "xab"), "2\n5\n"); + assert_eq!(file_read(&at, "xac"), "3\n"); + assert!(!at.plus("xad").exists()); +} + +#[test] +fn test_split_separator_nul_lines() { + let (at, mut ucmd) = at_and_ucmd!(); + ucmd.args(&["--lines=2", "-t", "\\0", "separator_nul.txt"]) + .succeeds(); + + assert_eq!(file_read(&at, "xaa"), "1\02\0"); + assert_eq!(file_read(&at, "xab"), "3\04\0"); + assert_eq!(file_read(&at, "xac"), "5\0"); + assert!(!at.plus("xad").exists()); +} + +#[test] +fn test_split_separator_nul_line_bytes() { + let (at, mut ucmd) = at_and_ucmd!(); + ucmd.args(&["--line-bytes=4", "-t", "\\0", "separator_nul.txt"]) + .succeeds(); + + assert_eq!(file_read(&at, "xaa"), "1\02\0"); + assert_eq!(file_read(&at, "xab"), "3\04\0"); + assert_eq!(file_read(&at, "xac"), "5\0"); + assert!(!at.plus("xad").exists()); +} + +#[test] +fn test_split_separator_nul_number_l() { + let (at, mut ucmd) = at_and_ucmd!(); + ucmd.args(&["--number=l/3", "--separator=\\0", "separator_nul.txt"]) + .succeeds(); + + assert_eq!(file_read(&at, "xaa"), "1\02\0"); + assert_eq!(file_read(&at, "xab"), "3\04\0"); + assert_eq!(file_read(&at, "xac"), "5\0"); + assert!(!at.plus("xad").exists()); +} + +#[test] +fn test_split_separator_nul_number_r() { + let (at, mut ucmd) = at_and_ucmd!(); + ucmd.args(&["--number=r/3", "--separator=\\0", "separator_nul.txt"]) + .succeeds(); + + assert_eq!(file_read(&at, "xaa"), "1\04\0"); + assert_eq!(file_read(&at, "xab"), "2\05\0"); + assert_eq!(file_read(&at, "xac"), "3\0"); + assert!(!at.plus("xad").exists()); +} + +#[test] +fn test_split_separator_semicolon_lines() { + let (at, mut ucmd) = at_and_ucmd!(); + ucmd.args(&["--lines=2", "-t", ";", "separator_semicolon.txt"]) + .succeeds(); + + assert_eq!(file_read(&at, "xaa"), "1;2;"); + assert_eq!(file_read(&at, "xab"), "3;4;"); + assert_eq!(file_read(&at, "xac"), "5;"); + assert!(!at.plus("xad").exists()); +} + +#[test] +fn test_split_separator_semicolon_line_bytes() { + let (at, mut ucmd) = at_and_ucmd!(); + ucmd.args(&["--line-bytes=4", "-t", ";", "separator_semicolon.txt"]) + .succeeds(); + + assert_eq!(file_read(&at, "xaa"), "1;2;"); + assert_eq!(file_read(&at, "xab"), "3;4;"); + assert_eq!(file_read(&at, "xac"), "5;"); + assert!(!at.plus("xad").exists()); +} + +#[test] +fn test_split_separator_semicolon_number_l() { + let (at, mut ucmd) = at_and_ucmd!(); + ucmd.args(&["--number=l/3", "--separator=;", "separator_semicolon.txt"]) + .succeeds(); + + assert_eq!(file_read(&at, "xaa"), "1;2;"); + assert_eq!(file_read(&at, "xab"), "3;4;"); + assert_eq!(file_read(&at, "xac"), "5;"); + assert!(!at.plus("xad").exists()); +} + +#[test] +fn test_split_separator_semicolon_number_r() { + let (at, mut ucmd) = at_and_ucmd!(); + ucmd.args(&["--number=r/3", "--separator=;", "separator_semicolon.txt"]) + .succeeds(); + + assert_eq!(file_read(&at, "xaa"), "1;4;"); + assert_eq!(file_read(&at, "xab"), "2;5;"); + assert_eq!(file_read(&at, "xac"), "3;"); + assert!(!at.plus("xad").exists()); +} + +#[test] +fn test_split_separator_semicolon_number_kth_l() { + new_ucmd!() + .args(&[ + "--number=l/1/3", + "--separator", + ";", + "separator_semicolon.txt", + ]) + .succeeds() + .stdout_only("1;2;"); +} + +#[test] +fn test_split_separator_semicolon_number_kth_r() { + new_ucmd!() + .args(&[ + "--number=r/1/3", + "--separator", + ";", + "separator_semicolon.txt", + ]) + .succeeds() + .stdout_only("1;4;"); +} + +// Test error edge cases for separator option +#[test] +fn test_split_separator_no_value() { + new_ucmd!() + .args(&["-t"]) + .ignore_stdin_write_error() + .pipe_in("a\n") + .fails() + .stderr_contains( + "error: a value is required for '--separator ' but none was supplied", + ); +} + +#[test] +fn test_split_separator_invalid_usage() { + let scene = TestScenario::new(util_name!()); + scene + .ucmd() + .args(&["--separator=xx"]) + .ignore_stdin_write_error() + .pipe_in("a\n") + .fails() + .no_stdout() + .stderr_contains("split: multi-character separator 'xx'"); + scene + .ucmd() + .args(&["-ta", "-tb"]) + .ignore_stdin_write_error() + .pipe_in("a\n") + .fails() + .no_stdout() + .stderr_contains("split: multiple separator characters specified"); + scene + .ucmd() + .args(&["-t'\n'", "-tb"]) + .ignore_stdin_write_error() + .pipe_in("a\n") + .fails() + .no_stdout() + .stderr_contains("split: multiple separator characters specified"); +} + +// Test using same separator multiple times +#[test] +fn test_split_separator_same_multiple() { + let scene = TestScenario::new(util_name!()); + scene + .ucmd() + .args(&["--separator=:", "--separator=:", "fivelines.txt"]) + .succeeds(); + scene + .ucmd() + .args(&["-t:", "--separator=:", "fivelines.txt"]) + .succeeds(); + scene + .ucmd() + .args(&["-t", ":", "-t", ":", "fivelines.txt"]) + .succeeds(); + scene + .ucmd() + .args(&["-t:", "-t:", "-t,", "fivelines.txt"]) + .fails(); +} diff --git a/tests/fixtures/split/separator_nul.txt b/tests/fixtures/split/separator_nul.txt new file mode 100644 index 000000000..c4c49609d Binary files /dev/null and b/tests/fixtures/split/separator_nul.txt differ diff --git a/tests/fixtures/split/separator_semicolon.txt b/tests/fixtures/split/separator_semicolon.txt new file mode 100644 index 000000000..a8396d8ee --- /dev/null +++ b/tests/fixtures/split/separator_semicolon.txt @@ -0,0 +1 @@ +1;2;3;4;5; \ No newline at end of file