mirror of
https://github.com/RGBCube/uutils-coreutils
synced 2025-07-27 19:17:43 +00:00
split: implementing separator option (#5331)
* split: implementing separator option * split: separator option - handle multiple update * split: style * split: separator tests * split: separator tests - stdin in ci/cd * split: tests - ci/cd stdin errors * split: refactor based on feedback * split: improve test coverage * split: fix broken pipe error in tests with stdin * split: fix for handle_multiple_separator_options * split: comments * split: refactor separator code * split: changes based on feedback * split: changes based on feedback
This commit is contained in:
parent
9f6a720582
commit
c5a0aa92f8
5 changed files with 348 additions and 38 deletions
|
@ -11,3 +11,16 @@ Create output files containing consecutive or interleaved sections of input
|
||||||
## After Help
|
## After Help
|
||||||
|
|
||||||
Output fixed-size pieces of INPUT to PREFIXaa, PREFIXab, ...; default size is 1000, and default PREFIX is 'x'. With no INPUT, or when INPUT is -, read standard input.
|
Output fixed-size pieces of INPUT to PREFIXaa, PREFIXab, ...; default size is 1000, and default PREFIX is 'x'. With no INPUT, or when INPUT is -, read standard input.
|
||||||
|
|
||||||
|
The SIZE argument is an integer and optional unit (example: 10K is 10*1024).
|
||||||
|
Units are K,M,G,T,P,E,Z,Y,R,Q (powers of 1024) or KB,MB,... (powers of 1000).
|
||||||
|
Binary prefixes can be used, too: KiB=K, MiB=M, and so on.
|
||||||
|
|
||||||
|
CHUNKS may be:
|
||||||
|
|
||||||
|
- N split into N files based on size of input
|
||||||
|
- K/N output Kth of N to stdout
|
||||||
|
- l/N split into N files without splitting lines/records
|
||||||
|
- l/K/N output Kth of N to stdout without splitting lines/records
|
||||||
|
- r/N like 'l' but use round robin distribution
|
||||||
|
- r/K/N likewise but only output Kth of N to stdout
|
||||||
|
|
|
@ -11,7 +11,7 @@ mod platform;
|
||||||
|
|
||||||
use crate::filenames::FilenameIterator;
|
use crate::filenames::FilenameIterator;
|
||||||
use crate::filenames::SuffixType;
|
use crate::filenames::SuffixType;
|
||||||
use clap::{crate_version, parser::ValueSource, Arg, ArgAction, ArgMatches, Command};
|
use clap::{crate_version, parser::ValueSource, Arg, ArgAction, ArgMatches, Command, ValueHint};
|
||||||
use std::env;
|
use std::env;
|
||||||
use std::ffi::OsString;
|
use std::ffi::OsString;
|
||||||
use std::fmt;
|
use std::fmt;
|
||||||
|
@ -39,6 +39,7 @@ static OPT_HEX_SUFFIXES_SHORT: &str = "-x";
|
||||||
static OPT_SUFFIX_LENGTH: &str = "suffix-length";
|
static OPT_SUFFIX_LENGTH: &str = "suffix-length";
|
||||||
static OPT_DEFAULT_SUFFIX_LENGTH: &str = "0";
|
static OPT_DEFAULT_SUFFIX_LENGTH: &str = "0";
|
||||||
static OPT_VERBOSE: &str = "verbose";
|
static OPT_VERBOSE: &str = "verbose";
|
||||||
|
static OPT_SEPARATOR: &str = "separator";
|
||||||
//The ---io and ---io-blksize parameters are consumed and ignored.
|
//The ---io and ---io-blksize parameters are consumed and ignored.
|
||||||
//The parameter is included to make GNU coreutils tests pass.
|
//The parameter is included to make GNU coreutils tests pass.
|
||||||
static OPT_IO: &str = "-io";
|
static OPT_IO: &str = "-io";
|
||||||
|
@ -55,7 +56,6 @@ const AFTER_HELP: &str = help_section!("after help", "split.md");
|
||||||
#[uucore::main]
|
#[uucore::main]
|
||||||
pub fn uumain(args: impl uucore::Args) -> UResult<()> {
|
pub fn uumain(args: impl uucore::Args) -> UResult<()> {
|
||||||
let (args, obs_lines) = handle_obsolete(args);
|
let (args, obs_lines) = handle_obsolete(args);
|
||||||
|
|
||||||
let matches = uu_app().try_get_matches_from(args)?;
|
let matches = uu_app().try_get_matches_from(args)?;
|
||||||
|
|
||||||
match Settings::from(&matches, &obs_lines) {
|
match Settings::from(&matches, &obs_lines) {
|
||||||
|
@ -145,6 +145,7 @@ fn should_extract_obs_lines(
|
||||||
&& !slice.starts_with("-C")
|
&& !slice.starts_with("-C")
|
||||||
&& !slice.starts_with("-l")
|
&& !slice.starts_with("-l")
|
||||||
&& !slice.starts_with("-n")
|
&& !slice.starts_with("-n")
|
||||||
|
&& !slice.starts_with("-t")
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Helper function to [`filter_args`]
|
/// Helper function to [`filter_args`]
|
||||||
|
@ -208,13 +209,18 @@ fn handle_preceding_options(
|
||||||
|| &slice[2..] == OPT_ADDITIONAL_SUFFIX
|
|| &slice[2..] == OPT_ADDITIONAL_SUFFIX
|
||||||
|| &slice[2..] == OPT_FILTER
|
|| &slice[2..] == OPT_FILTER
|
||||||
|| &slice[2..] == OPT_NUMBER
|
|| &slice[2..] == OPT_NUMBER
|
||||||
|| &slice[2..] == OPT_SUFFIX_LENGTH;
|
|| &slice[2..] == OPT_SUFFIX_LENGTH
|
||||||
|
|| &slice[2..] == OPT_SEPARATOR;
|
||||||
}
|
}
|
||||||
// capture if current slice is a preceding short option that requires value and does not have value in the same slice (value separated by whitespace)
|
// capture if current slice is a preceding short option that requires value and does not have value in the same slice (value separated by whitespace)
|
||||||
// following slice should be treated as the value for this option
|
// following slice should be treated as the value for this option
|
||||||
// even if it starts with '-' (which would be treated as hyphen prefixed value)
|
// even if it starts with '-' (which would be treated as hyphen prefixed value)
|
||||||
*preceding_short_opt_req_value =
|
*preceding_short_opt_req_value = slice == "-b"
|
||||||
slice == "-b" || slice == "-C" || slice == "-l" || slice == "-n" || slice == "-a";
|
|| slice == "-C"
|
||||||
|
|| slice == "-l"
|
||||||
|
|| slice == "-n"
|
||||||
|
|| slice == "-a"
|
||||||
|
|| slice == "-t";
|
||||||
// slice is a value
|
// slice is a value
|
||||||
// reset preceding option flags
|
// reset preceding option flags
|
||||||
if !slice.starts_with('-') {
|
if !slice.starts_with('-') {
|
||||||
|
@ -278,7 +284,7 @@ pub fn uu_app() -> Command {
|
||||||
.long(OPT_FILTER)
|
.long(OPT_FILTER)
|
||||||
.allow_hyphen_values(true)
|
.allow_hyphen_values(true)
|
||||||
.value_name("COMMAND")
|
.value_name("COMMAND")
|
||||||
.value_hint(clap::ValueHint::CommandName)
|
.value_hint(ValueHint::CommandName)
|
||||||
.help(
|
.help(
|
||||||
"write to shell COMMAND; file name is $FILE (Currently not implemented for Windows)",
|
"write to shell COMMAND; file name is $FILE (Currently not implemented for Windows)",
|
||||||
),
|
),
|
||||||
|
@ -293,7 +299,7 @@ pub fn uu_app() -> Command {
|
||||||
.arg(
|
.arg(
|
||||||
Arg::new(OPT_NUMERIC_SUFFIXES_SHORT)
|
Arg::new(OPT_NUMERIC_SUFFIXES_SHORT)
|
||||||
.short('d')
|
.short('d')
|
||||||
.action(clap::ArgAction::SetTrue)
|
.action(ArgAction::SetTrue)
|
||||||
.overrides_with_all([
|
.overrides_with_all([
|
||||||
OPT_NUMERIC_SUFFIXES,
|
OPT_NUMERIC_SUFFIXES,
|
||||||
OPT_NUMERIC_SUFFIXES_SHORT,
|
OPT_NUMERIC_SUFFIXES_SHORT,
|
||||||
|
@ -314,12 +320,13 @@ pub fn uu_app() -> Command {
|
||||||
OPT_HEX_SUFFIXES,
|
OPT_HEX_SUFFIXES,
|
||||||
OPT_HEX_SUFFIXES_SHORT
|
OPT_HEX_SUFFIXES_SHORT
|
||||||
])
|
])
|
||||||
|
.value_name("FROM")
|
||||||
.help("same as -d, but allow setting the start value"),
|
.help("same as -d, but allow setting the start value"),
|
||||||
)
|
)
|
||||||
.arg(
|
.arg(
|
||||||
Arg::new(OPT_HEX_SUFFIXES_SHORT)
|
Arg::new(OPT_HEX_SUFFIXES_SHORT)
|
||||||
.short('x')
|
.short('x')
|
||||||
.action(clap::ArgAction::SetTrue)
|
.action(ArgAction::SetTrue)
|
||||||
.overrides_with_all([
|
.overrides_with_all([
|
||||||
OPT_NUMERIC_SUFFIXES,
|
OPT_NUMERIC_SUFFIXES,
|
||||||
OPT_NUMERIC_SUFFIXES_SHORT,
|
OPT_NUMERIC_SUFFIXES_SHORT,
|
||||||
|
@ -340,6 +347,7 @@ pub fn uu_app() -> Command {
|
||||||
OPT_HEX_SUFFIXES,
|
OPT_HEX_SUFFIXES,
|
||||||
OPT_HEX_SUFFIXES_SHORT
|
OPT_HEX_SUFFIXES_SHORT
|
||||||
])
|
])
|
||||||
|
.value_name("FROM")
|
||||||
.help("same as -x, but allow setting the start value"),
|
.help("same as -x, but allow setting the start value"),
|
||||||
)
|
)
|
||||||
.arg(
|
.arg(
|
||||||
|
@ -357,6 +365,15 @@ pub fn uu_app() -> Command {
|
||||||
.help("print a diagnostic just before each output file is opened")
|
.help("print a diagnostic just before each output file is opened")
|
||||||
.action(ArgAction::SetTrue),
|
.action(ArgAction::SetTrue),
|
||||||
)
|
)
|
||||||
|
.arg(
|
||||||
|
Arg::new(OPT_SEPARATOR)
|
||||||
|
.short('t')
|
||||||
|
.long(OPT_SEPARATOR)
|
||||||
|
.allow_hyphen_values(true)
|
||||||
|
.value_name("SEP")
|
||||||
|
.action(ArgAction::Append)
|
||||||
|
.help("use SEP instead of newline as the record separator; '\0' (zero) specifies the NUL character"),
|
||||||
|
)
|
||||||
.arg(
|
.arg(
|
||||||
Arg::new(OPT_IO)
|
Arg::new(OPT_IO)
|
||||||
.long("io")
|
.long("io")
|
||||||
|
@ -372,7 +389,7 @@ pub fn uu_app() -> Command {
|
||||||
.arg(
|
.arg(
|
||||||
Arg::new(ARG_INPUT)
|
Arg::new(ARG_INPUT)
|
||||||
.default_value("-")
|
.default_value("-")
|
||||||
.value_hint(clap::ValueHint::FilePath),
|
.value_hint(ValueHint::FilePath),
|
||||||
)
|
)
|
||||||
.arg(
|
.arg(
|
||||||
Arg::new(ARG_PREFIX)
|
Arg::new(ARG_PREFIX)
|
||||||
|
@ -696,6 +713,7 @@ struct Settings {
|
||||||
filter: Option<String>,
|
filter: Option<String>,
|
||||||
strategy: Strategy,
|
strategy: Strategy,
|
||||||
verbose: bool,
|
verbose: bool,
|
||||||
|
separator: u8,
|
||||||
|
|
||||||
/// Whether to *not* produce empty files when using `-n`.
|
/// Whether to *not* produce empty files when using `-n`.
|
||||||
///
|
///
|
||||||
|
@ -722,6 +740,12 @@ enum SettingsError {
|
||||||
/// Suffix is not large enough to split into specified chunks
|
/// Suffix is not large enough to split into specified chunks
|
||||||
SuffixTooSmall(usize),
|
SuffixTooSmall(usize),
|
||||||
|
|
||||||
|
/// Multi-character (Invalid) separator
|
||||||
|
MultiCharacterSeparator(String),
|
||||||
|
|
||||||
|
/// Multiple different separator characters
|
||||||
|
MultipleSeparatorCharacters,
|
||||||
|
|
||||||
/// The `--filter` option is not supported on Windows.
|
/// The `--filter` option is not supported on Windows.
|
||||||
#[cfg(windows)]
|
#[cfg(windows)]
|
||||||
NotSupported,
|
NotSupported,
|
||||||
|
@ -743,6 +767,12 @@ impl fmt::Display for SettingsError {
|
||||||
Self::Strategy(e) => e.fmt(f),
|
Self::Strategy(e) => e.fmt(f),
|
||||||
Self::SuffixNotParsable(s) => write!(f, "invalid suffix length: {}", s.quote()),
|
Self::SuffixNotParsable(s) => write!(f, "invalid suffix length: {}", s.quote()),
|
||||||
Self::SuffixTooSmall(i) => write!(f, "the suffix length needs to be at least {i}"),
|
Self::SuffixTooSmall(i) => write!(f, "the suffix length needs to be at least {i}"),
|
||||||
|
Self::MultiCharacterSeparator(s) => {
|
||||||
|
write!(f, "multi-character separator {}", s.quote())
|
||||||
|
}
|
||||||
|
Self::MultipleSeparatorCharacters => {
|
||||||
|
write!(f, "multiple separator characters specified")
|
||||||
|
}
|
||||||
Self::SuffixContainsSeparator(s) => write!(
|
Self::SuffixContainsSeparator(s) => write!(
|
||||||
f,
|
f,
|
||||||
"invalid suffix {}, contains directory separator",
|
"invalid suffix {}, contains directory separator",
|
||||||
|
@ -783,6 +813,26 @@ impl Settings {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Make sure that the separator is exactly one byte long, i.e. a single one-byte UTF-8 character (if specified)
|
||||||
|
// defaults to '\n' - newline character
|
||||||
|
// If the same separator (the same value) was used multiple times - `split` should NOT fail
|
||||||
|
// If the separator was used multiple times but with different values (not all values are the same) - `split` should fail
|
||||||
|
let separator = match matches.get_many::<String>(OPT_SEPARATOR) {
|
||||||
|
Some(mut sep_values) => {
|
||||||
|
let first = sep_values.next().unwrap(); // it is safe to just unwrap here since Clap should not return empty ValuesRef<'_,String> in the option from get_many() call
|
||||||
|
if !sep_values.all(|s| s == first) {
|
||||||
|
return Err(SettingsError::MultipleSeparatorCharacters);
|
||||||
|
}
|
||||||
|
match first.as_str() {
|
||||||
|
"\\0" => b'\0',
|
||||||
|
s if s.as_bytes().len() == 1 => s.as_bytes()[0],
|
||||||
|
s => return Err(SettingsError::MultiCharacterSeparator(s.to_owned())),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
None => b'\n',
|
||||||
|
};
|
||||||
|
|
||||||
let result = Self {
|
let result = Self {
|
||||||
suffix_length: suffix_length_str
|
suffix_length: suffix_length_str
|
||||||
.parse()
|
.parse()
|
||||||
|
@ -791,6 +841,7 @@ impl Settings {
|
||||||
suffix_start,
|
suffix_start,
|
||||||
additional_suffix,
|
additional_suffix,
|
||||||
verbose: matches.value_source("verbose") == Some(ValueSource::CommandLine),
|
verbose: matches.value_source("verbose") == Some(ValueSource::CommandLine),
|
||||||
|
separator,
|
||||||
strategy,
|
strategy,
|
||||||
input: matches.get_one::<String>(ARG_INPUT).unwrap().to_owned(),
|
input: matches.get_one::<String>(ARG_INPUT).unwrap().to_owned(),
|
||||||
prefix: matches.get_one::<String>(ARG_PREFIX).unwrap().to_owned(),
|
prefix: matches.get_one::<String>(ARG_PREFIX).unwrap().to_owned(),
|
||||||
|
@ -1019,7 +1070,8 @@ impl<'a> Write for LineChunkWriter<'a> {
|
||||||
// corresponds to the current chunk number.
|
// corresponds to the current chunk number.
|
||||||
let mut prev = 0;
|
let mut prev = 0;
|
||||||
let mut total_bytes_written = 0;
|
let mut total_bytes_written = 0;
|
||||||
for i in memchr::memchr_iter(b'\n', buf) {
|
let sep = self.settings.separator;
|
||||||
|
for i in memchr::memchr_iter(sep, buf) {
|
||||||
// If we have exceeded the number of lines to write in the
|
// If we have exceeded the number of lines to write in the
|
||||||
// current chunk, then start a new chunk and its
|
// current chunk, then start a new chunk and its
|
||||||
// corresponding writer.
|
// corresponding writer.
|
||||||
|
@ -1036,8 +1088,8 @@ impl<'a> Write for LineChunkWriter<'a> {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Write the line, starting from *after* the previous
|
// Write the line, starting from *after* the previous
|
||||||
// newline character and ending *after* the current
|
// separator character and ending *after* the current
|
||||||
// newline character.
|
// separator character.
|
||||||
let n = self.inner.write(&buf[prev..i + 1])?;
|
let n = self.inner.write(&buf[prev..i + 1])?;
|
||||||
total_bytes_written += n;
|
total_bytes_written += n;
|
||||||
prev = i + 1;
|
prev = i + 1;
|
||||||
|
@ -1175,21 +1227,22 @@ impl<'a> Write for LineBytesChunkWriter<'a> {
|
||||||
self.num_bytes_remaining_in_current_chunk = self.chunk_size.try_into().unwrap();
|
self.num_bytes_remaining_in_current_chunk = self.chunk_size.try_into().unwrap();
|
||||||
}
|
}
|
||||||
|
|
||||||
// Find the first newline character in the buffer.
|
// Find the first separator (default - newline character) in the buffer.
|
||||||
match memchr::memchr(b'\n', buf) {
|
let sep = self.settings.separator;
|
||||||
// If there is no newline character and the buffer is
|
match memchr::memchr(sep, buf) {
|
||||||
|
// If there is no separator character and the buffer is
|
||||||
// not empty, then write as many bytes as we can and
|
// not empty, then write as many bytes as we can and
|
||||||
// then move on to the next chunk if necessary.
|
// then move on to the next chunk if necessary.
|
||||||
None => {
|
None => {
|
||||||
let end = self.num_bytes_remaining_in_current_chunk;
|
let end = self.num_bytes_remaining_in_current_chunk;
|
||||||
|
|
||||||
// This is ugly but here to match GNU behavior. If the input
|
// This is ugly but here to match GNU behavior. If the input
|
||||||
// doesn't end with a \n, pretend that it does for handling
|
// doesn't end with a separator, pretend that it does for handling
|
||||||
// the second to last segment chunk. See `line-bytes.sh`.
|
// the second to last segment chunk. See `line-bytes.sh`.
|
||||||
if end == buf.len()
|
if end == buf.len()
|
||||||
&& self.num_bytes_remaining_in_current_chunk
|
&& self.num_bytes_remaining_in_current_chunk
|
||||||
< self.chunk_size.try_into().unwrap()
|
< self.chunk_size.try_into().unwrap()
|
||||||
&& buf[buf.len() - 1] != b'\n'
|
&& buf[buf.len() - 1] != sep
|
||||||
{
|
{
|
||||||
self.num_bytes_remaining_in_current_chunk = 0;
|
self.num_bytes_remaining_in_current_chunk = 0;
|
||||||
} else {
|
} else {
|
||||||
|
@ -1200,8 +1253,8 @@ impl<'a> Write for LineBytesChunkWriter<'a> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// If there is a newline character and the line
|
// If there is a separator character and the line
|
||||||
// (including the newline character) will fit in the
|
// (including the separator character) will fit in the
|
||||||
// current chunk, then write the entire line and
|
// current chunk, then write the entire line and
|
||||||
// continue to the next iteration. (See chunk 1 in the
|
// continue to the next iteration. (See chunk 1 in the
|
||||||
// example comment above.)
|
// example comment above.)
|
||||||
|
@ -1212,8 +1265,8 @@ impl<'a> Write for LineBytesChunkWriter<'a> {
|
||||||
buf = &buf[num_bytes_written..];
|
buf = &buf[num_bytes_written..];
|
||||||
}
|
}
|
||||||
|
|
||||||
// If there is a newline character, the line
|
// If there is a separator character, the line
|
||||||
// (including the newline character) will not fit in
|
// (including the separator character) will not fit in
|
||||||
// the current chunk, *and* no other lines have been
|
// the current chunk, *and* no other lines have been
|
||||||
// written to the current chunk, then write as many
|
// written to the current chunk, then write as many
|
||||||
// bytes as we can and continue to the next
|
// bytes as we can and continue to the next
|
||||||
|
@ -1230,8 +1283,8 @@ impl<'a> Write for LineBytesChunkWriter<'a> {
|
||||||
buf = &buf[num_bytes_written..];
|
buf = &buf[num_bytes_written..];
|
||||||
}
|
}
|
||||||
|
|
||||||
// If there is a newline character, the line
|
// If there is a separator character, the line
|
||||||
// (including the newline character) will not fit in
|
// (including the separator character) will not fit in
|
||||||
// the current chunk, and at least one other line has
|
// the current chunk, and at least one other line has
|
||||||
// been written to the current chunk, then signal to
|
// been written to the current chunk, then signal to
|
||||||
// the next iteration that a new chunk needs to be
|
// the next iteration that a new chunk needs to be
|
||||||
|
@ -1489,15 +1542,16 @@ where
|
||||||
|
|
||||||
let mut num_bytes_remaining_in_current_chunk = chunk_size;
|
let mut num_bytes_remaining_in_current_chunk = chunk_size;
|
||||||
let mut i = 0;
|
let mut i = 0;
|
||||||
for line_result in reader.lines() {
|
let sep = settings.separator;
|
||||||
|
for line_result in reader.split(sep) {
|
||||||
let line = line_result.unwrap();
|
let line = line_result.unwrap();
|
||||||
let maybe_writer = writers.get_mut(i);
|
let maybe_writer = writers.get_mut(i);
|
||||||
let writer = maybe_writer.unwrap();
|
let writer = maybe_writer.unwrap();
|
||||||
let bytes = line.as_bytes();
|
let bytes = line.as_slice();
|
||||||
writer.write_all(bytes)?;
|
writer.write_all(bytes)?;
|
||||||
writer.write_all(b"\n")?;
|
writer.write_all(&[sep])?;
|
||||||
|
|
||||||
// Add one byte for the newline character.
|
// Add one byte for the separator character.
|
||||||
let num_bytes = bytes.len() + 1;
|
let num_bytes = bytes.len() + 1;
|
||||||
if num_bytes > num_bytes_remaining_in_current_chunk {
|
if num_bytes > num_bytes_remaining_in_current_chunk {
|
||||||
num_bytes_remaining_in_current_chunk = chunk_size;
|
num_bytes_remaining_in_current_chunk = chunk_size;
|
||||||
|
@ -1546,15 +1600,16 @@ where
|
||||||
|
|
||||||
let mut num_bytes_remaining_in_current_chunk = chunk_size;
|
let mut num_bytes_remaining_in_current_chunk = chunk_size;
|
||||||
let mut i = 0;
|
let mut i = 0;
|
||||||
for line_result in reader.lines() {
|
let sep = settings.separator;
|
||||||
|
for line_result in reader.split(sep) {
|
||||||
let line = line_result?;
|
let line = line_result?;
|
||||||
let bytes = line.as_bytes();
|
let bytes = line.as_slice();
|
||||||
if i == chunk_number {
|
if i == chunk_number {
|
||||||
writer.write_all(bytes)?;
|
writer.write_all(bytes)?;
|
||||||
writer.write_all(b"\n")?;
|
writer.write_all(&[sep])?;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Add one byte for the newline character.
|
// Add one byte for the separator character.
|
||||||
let num_bytes = bytes.len() + 1;
|
let num_bytes = bytes.len() + 1;
|
||||||
if num_bytes >= num_bytes_remaining_in_current_chunk {
|
if num_bytes >= num_bytes_remaining_in_current_chunk {
|
||||||
num_bytes_remaining_in_current_chunk = chunk_size;
|
num_bytes_remaining_in_current_chunk = chunk_size;
|
||||||
|
@ -1601,13 +1656,14 @@ where
|
||||||
}
|
}
|
||||||
|
|
||||||
let num_chunks: usize = num_chunks.try_into().unwrap();
|
let num_chunks: usize = num_chunks.try_into().unwrap();
|
||||||
for (i, line_result) in reader.lines().enumerate() {
|
let sep = settings.separator;
|
||||||
|
for (i, line_result) in reader.split(sep).enumerate() {
|
||||||
let line = line_result.unwrap();
|
let line = line_result.unwrap();
|
||||||
let maybe_writer = writers.get_mut(i % num_chunks);
|
let maybe_writer = writers.get_mut(i % num_chunks);
|
||||||
let writer = maybe_writer.unwrap();
|
let writer = maybe_writer.unwrap();
|
||||||
let bytes = line.as_bytes();
|
let bytes = line.as_slice();
|
||||||
writer.write_all(bytes)?;
|
writer.write_all(bytes)?;
|
||||||
writer.write_all(b"\n")?;
|
writer.write_all(&[sep])?;
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
|
@ -1632,7 +1688,7 @@ where
|
||||||
/// * [`split_into_n_chunks_by_line_round_robin`], which splits its input in the
|
/// * [`split_into_n_chunks_by_line_round_robin`], which splits its input in the
|
||||||
/// same way, but writes each chunk to its own file.
|
/// same way, but writes each chunk to its own file.
|
||||||
fn kth_chunk_by_line_round_robin<R>(
|
fn kth_chunk_by_line_round_robin<R>(
|
||||||
_settings: &Settings,
|
settings: &Settings,
|
||||||
reader: &mut R,
|
reader: &mut R,
|
||||||
chunk_number: u64,
|
chunk_number: u64,
|
||||||
num_chunks: u64,
|
num_chunks: u64,
|
||||||
|
@ -1646,12 +1702,13 @@ where
|
||||||
|
|
||||||
let num_chunks: usize = num_chunks.try_into().unwrap();
|
let num_chunks: usize = num_chunks.try_into().unwrap();
|
||||||
let chunk_number: usize = chunk_number.try_into().unwrap();
|
let chunk_number: usize = chunk_number.try_into().unwrap();
|
||||||
for (i, line_result) in reader.lines().enumerate() {
|
let sep = settings.separator;
|
||||||
|
for (i, line_result) in reader.split(sep).enumerate() {
|
||||||
let line = line_result?;
|
let line = line_result?;
|
||||||
let bytes = line.as_bytes();
|
let bytes = line.as_slice();
|
||||||
if (i % num_chunks) == chunk_number {
|
if (i % num_chunks) == chunk_number {
|
||||||
writer.write_all(bytes)?;
|
writer.write_all(bytes)?;
|
||||||
writer.write_all(b"\n")?;
|
writer.write_all(&[sep])?;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
|
|
|
@ -1483,3 +1483,242 @@ fn test_split_non_utf8_argument_windows() {
|
||||||
.fails()
|
.fails()
|
||||||
.stderr_contains("error: invalid UTF-8 was detected in one or more arguments");
|
.stderr_contains("error: invalid UTF-8 was detected in one or more arguments");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Test '--separator' / '-t' option following GNU tests example
|
||||||
|
// test separators: '\n' , '\0' , ';'
|
||||||
|
// test with '--lines=2' , '--line-bytes=4' , '--number=l/3' , '--number=r/3' , '--number=l/1/3' , '--number=r/1/3'
|
||||||
|
#[test]
|
||||||
|
fn test_split_separator_nl_lines() {
|
||||||
|
let (at, mut ucmd) = at_and_ucmd!();
|
||||||
|
ucmd.args(&["--lines=2", "-t", "\n"])
|
||||||
|
.pipe_in("1\n2\n3\n4\n5\n")
|
||||||
|
.succeeds();
|
||||||
|
|
||||||
|
assert_eq!(file_read(&at, "xaa"), "1\n2\n");
|
||||||
|
assert_eq!(file_read(&at, "xab"), "3\n4\n");
|
||||||
|
assert_eq!(file_read(&at, "xac"), "5\n");
|
||||||
|
assert!(!at.plus("xad").exists());
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_split_separator_nl_line_bytes() {
|
||||||
|
let (at, mut ucmd) = at_and_ucmd!();
|
||||||
|
ucmd.args(&["--line-bytes=4", "-t", "\n"])
|
||||||
|
.pipe_in("1\n2\n3\n4\n5\n")
|
||||||
|
.succeeds();
|
||||||
|
|
||||||
|
assert_eq!(file_read(&at, "xaa"), "1\n2\n");
|
||||||
|
assert_eq!(file_read(&at, "xab"), "3\n4\n");
|
||||||
|
assert_eq!(file_read(&at, "xac"), "5\n");
|
||||||
|
assert!(!at.plus("xad").exists());
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_split_separator_nl_number_l() {
|
||||||
|
let (at, mut ucmd) = at_and_ucmd!();
|
||||||
|
ucmd.args(&["--number=l/3", "--separator=\n", "fivelines.txt"])
|
||||||
|
.succeeds();
|
||||||
|
|
||||||
|
assert_eq!(file_read(&at, "xaa"), "1\n2\n");
|
||||||
|
assert_eq!(file_read(&at, "xab"), "3\n4\n");
|
||||||
|
assert_eq!(file_read(&at, "xac"), "5\n");
|
||||||
|
assert!(!at.plus("xad").exists());
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_split_separator_nl_number_r() {
|
||||||
|
let (at, mut ucmd) = at_and_ucmd!();
|
||||||
|
ucmd.args(&["--number=r/3", "--separator", "\n", "fivelines.txt"])
|
||||||
|
.succeeds();
|
||||||
|
|
||||||
|
assert_eq!(file_read(&at, "xaa"), "1\n4\n");
|
||||||
|
assert_eq!(file_read(&at, "xab"), "2\n5\n");
|
||||||
|
assert_eq!(file_read(&at, "xac"), "3\n");
|
||||||
|
assert!(!at.plus("xad").exists());
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_split_separator_nul_lines() {
|
||||||
|
let (at, mut ucmd) = at_and_ucmd!();
|
||||||
|
ucmd.args(&["--lines=2", "-t", "\\0", "separator_nul.txt"])
|
||||||
|
.succeeds();
|
||||||
|
|
||||||
|
assert_eq!(file_read(&at, "xaa"), "1\02\0");
|
||||||
|
assert_eq!(file_read(&at, "xab"), "3\04\0");
|
||||||
|
assert_eq!(file_read(&at, "xac"), "5\0");
|
||||||
|
assert!(!at.plus("xad").exists());
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_split_separator_nul_line_bytes() {
|
||||||
|
let (at, mut ucmd) = at_and_ucmd!();
|
||||||
|
ucmd.args(&["--line-bytes=4", "-t", "\\0", "separator_nul.txt"])
|
||||||
|
.succeeds();
|
||||||
|
|
||||||
|
assert_eq!(file_read(&at, "xaa"), "1\02\0");
|
||||||
|
assert_eq!(file_read(&at, "xab"), "3\04\0");
|
||||||
|
assert_eq!(file_read(&at, "xac"), "5\0");
|
||||||
|
assert!(!at.plus("xad").exists());
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_split_separator_nul_number_l() {
|
||||||
|
let (at, mut ucmd) = at_and_ucmd!();
|
||||||
|
ucmd.args(&["--number=l/3", "--separator=\\0", "separator_nul.txt"])
|
||||||
|
.succeeds();
|
||||||
|
|
||||||
|
assert_eq!(file_read(&at, "xaa"), "1\02\0");
|
||||||
|
assert_eq!(file_read(&at, "xab"), "3\04\0");
|
||||||
|
assert_eq!(file_read(&at, "xac"), "5\0");
|
||||||
|
assert!(!at.plus("xad").exists());
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_split_separator_nul_number_r() {
|
||||||
|
let (at, mut ucmd) = at_and_ucmd!();
|
||||||
|
ucmd.args(&["--number=r/3", "--separator=\\0", "separator_nul.txt"])
|
||||||
|
.succeeds();
|
||||||
|
|
||||||
|
assert_eq!(file_read(&at, "xaa"), "1\04\0");
|
||||||
|
assert_eq!(file_read(&at, "xab"), "2\05\0");
|
||||||
|
assert_eq!(file_read(&at, "xac"), "3\0");
|
||||||
|
assert!(!at.plus("xad").exists());
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_split_separator_semicolon_lines() {
|
||||||
|
let (at, mut ucmd) = at_and_ucmd!();
|
||||||
|
ucmd.args(&["--lines=2", "-t", ";", "separator_semicolon.txt"])
|
||||||
|
.succeeds();
|
||||||
|
|
||||||
|
assert_eq!(file_read(&at, "xaa"), "1;2;");
|
||||||
|
assert_eq!(file_read(&at, "xab"), "3;4;");
|
||||||
|
assert_eq!(file_read(&at, "xac"), "5;");
|
||||||
|
assert!(!at.plus("xad").exists());
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_split_separator_semicolon_line_bytes() {
|
||||||
|
let (at, mut ucmd) = at_and_ucmd!();
|
||||||
|
ucmd.args(&["--line-bytes=4", "-t", ";", "separator_semicolon.txt"])
|
||||||
|
.succeeds();
|
||||||
|
|
||||||
|
assert_eq!(file_read(&at, "xaa"), "1;2;");
|
||||||
|
assert_eq!(file_read(&at, "xab"), "3;4;");
|
||||||
|
assert_eq!(file_read(&at, "xac"), "5;");
|
||||||
|
assert!(!at.plus("xad").exists());
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_split_separator_semicolon_number_l() {
|
||||||
|
let (at, mut ucmd) = at_and_ucmd!();
|
||||||
|
ucmd.args(&["--number=l/3", "--separator=;", "separator_semicolon.txt"])
|
||||||
|
.succeeds();
|
||||||
|
|
||||||
|
assert_eq!(file_read(&at, "xaa"), "1;2;");
|
||||||
|
assert_eq!(file_read(&at, "xab"), "3;4;");
|
||||||
|
assert_eq!(file_read(&at, "xac"), "5;");
|
||||||
|
assert!(!at.plus("xad").exists());
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_split_separator_semicolon_number_r() {
|
||||||
|
let (at, mut ucmd) = at_and_ucmd!();
|
||||||
|
ucmd.args(&["--number=r/3", "--separator=;", "separator_semicolon.txt"])
|
||||||
|
.succeeds();
|
||||||
|
|
||||||
|
assert_eq!(file_read(&at, "xaa"), "1;4;");
|
||||||
|
assert_eq!(file_read(&at, "xab"), "2;5;");
|
||||||
|
assert_eq!(file_read(&at, "xac"), "3;");
|
||||||
|
assert!(!at.plus("xad").exists());
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_split_separator_semicolon_number_kth_l() {
|
||||||
|
new_ucmd!()
|
||||||
|
.args(&[
|
||||||
|
"--number=l/1/3",
|
||||||
|
"--separator",
|
||||||
|
";",
|
||||||
|
"separator_semicolon.txt",
|
||||||
|
])
|
||||||
|
.succeeds()
|
||||||
|
.stdout_only("1;2;");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_split_separator_semicolon_number_kth_r() {
|
||||||
|
new_ucmd!()
|
||||||
|
.args(&[
|
||||||
|
"--number=r/1/3",
|
||||||
|
"--separator",
|
||||||
|
";",
|
||||||
|
"separator_semicolon.txt",
|
||||||
|
])
|
||||||
|
.succeeds()
|
||||||
|
.stdout_only("1;4;");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test error edge cases for separator option
|
||||||
|
#[test]
|
||||||
|
fn test_split_separator_no_value() {
|
||||||
|
new_ucmd!()
|
||||||
|
.args(&["-t"])
|
||||||
|
.ignore_stdin_write_error()
|
||||||
|
.pipe_in("a\n")
|
||||||
|
.fails()
|
||||||
|
.stderr_contains(
|
||||||
|
"error: a value is required for '--separator <SEP>' but none was supplied",
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_split_separator_invalid_usage() {
|
||||||
|
let scene = TestScenario::new(util_name!());
|
||||||
|
scene
|
||||||
|
.ucmd()
|
||||||
|
.args(&["--separator=xx"])
|
||||||
|
.ignore_stdin_write_error()
|
||||||
|
.pipe_in("a\n")
|
||||||
|
.fails()
|
||||||
|
.no_stdout()
|
||||||
|
.stderr_contains("split: multi-character separator 'xx'");
|
||||||
|
scene
|
||||||
|
.ucmd()
|
||||||
|
.args(&["-ta", "-tb"])
|
||||||
|
.ignore_stdin_write_error()
|
||||||
|
.pipe_in("a\n")
|
||||||
|
.fails()
|
||||||
|
.no_stdout()
|
||||||
|
.stderr_contains("split: multiple separator characters specified");
|
||||||
|
scene
|
||||||
|
.ucmd()
|
||||||
|
.args(&["-t'\n'", "-tb"])
|
||||||
|
.ignore_stdin_write_error()
|
||||||
|
.pipe_in("a\n")
|
||||||
|
.fails()
|
||||||
|
.no_stdout()
|
||||||
|
.stderr_contains("split: multiple separator characters specified");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test using same separator multiple times
|
||||||
|
#[test]
|
||||||
|
fn test_split_separator_same_multiple() {
|
||||||
|
let scene = TestScenario::new(util_name!());
|
||||||
|
scene
|
||||||
|
.ucmd()
|
||||||
|
.args(&["--separator=:", "--separator=:", "fivelines.txt"])
|
||||||
|
.succeeds();
|
||||||
|
scene
|
||||||
|
.ucmd()
|
||||||
|
.args(&["-t:", "--separator=:", "fivelines.txt"])
|
||||||
|
.succeeds();
|
||||||
|
scene
|
||||||
|
.ucmd()
|
||||||
|
.args(&["-t", ":", "-t", ":", "fivelines.txt"])
|
||||||
|
.succeeds();
|
||||||
|
scene
|
||||||
|
.ucmd()
|
||||||
|
.args(&["-t:", "-t:", "-t,", "fivelines.txt"])
|
||||||
|
.fails();
|
||||||
|
}
|
||||||
|
|
BIN
tests/fixtures/split/separator_nul.txt
vendored
Normal file
BIN
tests/fixtures/split/separator_nul.txt
vendored
Normal file
Binary file not shown.
1
tests/fixtures/split/separator_semicolon.txt
vendored
Normal file
1
tests/fixtures/split/separator_semicolon.txt
vendored
Normal file
|
@ -0,0 +1 @@
|
||||||
|
1;2;3;4;5;
|
Loading…
Add table
Add a link
Reference in a new issue