diff --git a/src/uu/cut/src/cut.rs b/src/uu/cut/src/cut.rs
index 0cc1ec339..a562d2b85 100644
--- a/src/uu/cut/src/cut.rs
+++ b/src/uu/cut/src/cut.rs
@@ -16,14 +16,16 @@ use uucore::display::Quotable;
 use uucore::error::{FromIo, UResult, USimpleError};
 
 use self::searcher::Searcher;
+use self::whitespace_searcher::WhitespaceSearcher;
 use uucore::ranges::Range;
 use uucore::{format_usage, show, show_error, show_if_err};
 
 mod searcher;
+mod whitespace_searcher;
 
 static NAME: &str = "cut";
 static USAGE: &str =
-    "{} [-d] [-s] [-z] [--output-delimiter] ((-f|-b|-c) {{sequence}}) {{sourcefile}}+";
+    "{} [-d|-w] [-s] [-z] [--output-delimiter] ((-f|-b|-c) {{sequence}}) {{sourcefile}}+";
 static ABOUT: &str =
     "Prints specified byte or field columns from each line of stdin or the input files";
 static LONG_HELP: &str = "
@@ -85,6 +87,10 @@ static LONG_HELP: &str = "
         --delimiter (-d) option. Setting the delimiter is optional.
         If not set, a default delimiter of Tab will be used.
 
+        If the -w option is provided, fields will be separated by any number
+        of whitespace characters (Space and Tab). The output delimiter will
+        be a Tab unless explicitly specified. Only one of -d or -w option can be specified.
+
     Optionally Filter based on delimiter
         If the --only-delimited (-s) flag is provided, only lines which
         contain the delimiter will be printed
@@ -115,6 +121,7 @@ struct FieldOptions {
     delimiter: String, // one char long, String because of UTF8 representation
     out_delimiter: Option<String>,
     only_delimited: bool,
+    whitespace_delimited: bool,
     zero_terminated: bool,
 }
 
@@ -256,9 +263,98 @@ fn cut_fields_delimiter(
     Ok(())
 }
 
+fn cut_fields_whitespace<R: Read>(
+    reader: R,
+    ranges: &[Range],
+    only_delimited: bool,
+    newline_char: u8,
+    out_delim: &str,
+) -> UResult<()> {
+    let mut buf_in = BufReader::new(reader);
+    let mut out = stdout_writer();
+
+    let result = buf_in.for_byte_record_with_terminator(newline_char, |line| {
+        let mut fields_pos = 1;
+        let mut low_idx = 0;
+        let mut delim_search = WhitespaceSearcher::new(line).peekable();
+        let mut print_delim = false;
+
+        if delim_search.peek().is_none() {
+            if !only_delimited {
+                out.write_all(line)?;
+                if line[line.len() - 1] != newline_char {
+                    out.write_all(&[newline_char])?;
+                }
+            }
+
+            return Ok(true);
+        }
+
+        for &Range { low, high } in ranges {
+            if low - fields_pos > 0 {
+                low_idx = match delim_search.nth(low - fields_pos - 1) {
+                    Some((_, last)) => last,
+                    None => break,
+                };
+            }
+
+            for _ in 0..=high - low {
+                if print_delim {
+                    out.write_all(out_delim.as_bytes())?;
+                } else {
+                    print_delim = true;
+                }
+
+                match delim_search.next() {
+                    Some((first, last)) => {
+                        let segment = &line[low_idx..first];
+
+                        out.write_all(segment)?;
+
+                        low_idx = last;
+                        fields_pos = high + 1;
+                    }
+                    None => {
+                        let segment = &line[low_idx..];
+
+                        out.write_all(segment)?;
+
+                        if line[line.len() - 1] == newline_char {
+                            return Ok(true);
+                        }
+                        break;
+                    }
+                }
+            }
+        }
+
+        out.write_all(&[newline_char])?;
+        Ok(true)
+    });
+
+    if let Err(e) = result {
+        return Err(USimpleError::new(1, e.to_string()));
+    }
+
+    Ok(())
+}
+
 #[allow(clippy::cognitive_complexity)]
 fn cut_fields<R: Read>(reader: R, ranges: &[Range], opts: &FieldOptions) -> UResult<()> {
     let newline_char = if opts.zero_terminated { b'\0' } else { b'\n' };
+
+    if opts.whitespace_delimited {
+        return cut_fields_whitespace(
+            reader,
+            ranges,
+            opts.only_delimited,
+            newline_char,
+            match opts.out_delimiter {
+                Some(ref delim) => delim,
+                None => "\t",
+            },
+        );
+    }
     if let Some(ref o_delim) = opts.out_delimiter {
         return cut_fields_delimiter(
             reader,
@@ -387,6 +483,7 @@ mod options {
     pub const ZERO_TERMINATED: &str = "zero-terminated";
     pub const ONLY_DELIMITED: &str = "only-delimited";
     pub const OUTPUT_DELIMITER: &str = "output-delimiter";
+    pub const WHITESPACE_DELIMITED: &str = "whitespace-delimited";
     pub const COMPLEMENT: &str = "complement";
     pub const FILE: &str = "file";
 }
@@ -449,37 +546,44 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
                 };
 
                 let only_delimited = matches.get_flag(options::ONLY_DELIMITED);
+                let whitespace_delimited = matches.get_flag(options::WHITESPACE_DELIMITED);
                 let zero_terminated = matches.get_flag(options::ZERO_TERMINATED);
 
                 match matches.get_one::<String>(options::DELIMITER).map(|s| s.as_str()) {
                     Some(mut delim) => {
-                        // GNU's `cut` supports `-d=` to set the delimiter to `=`.
-                        // Clap parsing is limited in this situation, see:
-                        // https://github.com/uutils/coreutils/issues/2424#issuecomment-863825242
-                        if delimiter_is_equal {
-                            delim = "=";
-                        } else if delim == "''" {
-                            // treat `''` as empty delimiter
-                            delim = "";
-                        }
-                        if delim.chars().count() > 1 {
-                            Err("invalid input: The '--delimiter' ('-d') option expects empty or 1 character long, but was provided a value 2 characters or longer".into())
-                        } else {
-                            let delim = if delim.is_empty() {
-                                "\0".to_owned()
+                        // `-d` and `-w` both choose the input field delimiter, so combining them is rejected.
+                        if whitespace_delimited {
+                            Err("invalid input: Only one of --delimiter (-d) or -w option can be specified".into())
+                        } else {
+                            // GNU's `cut` supports `-d=` to set the delimiter to `=`.
+                            // Clap parsing is limited in this situation, see:
+                            // https://github.com/uutils/coreutils/issues/2424#issuecomment-863825242
+                            if delimiter_is_equal {
+                                delim = "=";
+                            } else if delim == "''" {
+                                // treat `''` as empty delimiter
+                                delim = "";
+                            }
+                            if delim.chars().count() > 1 {
+                                Err("invalid input: The '--delimiter' ('-d') option expects empty or 1 character long, but was provided a value 2 characters or longer".into())
                             } else {
-                                delim.to_owned()
-                            };
+                                let delim = if delim.is_empty() {
+                                    "\0".to_owned()
+                                } else {
+                                    delim.to_owned()
+                                };
 
-                            Ok(Mode::Fields(
-                                ranges,
-                                FieldOptions {
-                                    delimiter: delim,
-                                    out_delimiter: out_delim,
-                                    only_delimited,
-                                    zero_terminated,
-                                },
-                            ))
+                                Ok(Mode::Fields(
+                                    ranges,
+                                    FieldOptions {
+                                        delimiter: delim,
+                                        out_delimiter: out_delim,
+                                        only_delimited,
+                                        whitespace_delimited,
+                                        zero_terminated,
+                                    },
+                                ))
+                            }
                         }
                     }
                     None => Ok(Mode::Fields(
@@ -488,6 +592,7 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
                             delimiter: "\t".to_owned(),
                             out_delimiter: out_delim,
                             only_delimited,
+                            whitespace_delimited,
                             zero_terminated,
                         },
                     )),
@@ -508,6 +613,11 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
             {
                 Err("invalid input: The '--delimiter' ('-d') option only usable if printing a sequence of fields".into())
             }
+            Mode::Bytes(_, _) | Mode::Characters(_, _)
+                if matches.get_flag(options::WHITESPACE_DELIMITED) =>
+            {
+                Err("invalid input: The '-w' option only usable if printing a sequence of fields".into())
+            }
             Mode::Bytes(_, _) | Mode::Characters(_, _)
                 if matches.get_flag(options::ONLY_DELIMITED) =>
             {
@@ -563,6 +673,13 @@ pub fn uu_app() -> Command {
                 .help("specify the delimiter character that separates fields in the input source. Defaults to Tab.")
                 .value_name("DELIM"),
         )
+        .arg(
+            Arg::new(options::WHITESPACE_DELIMITED)
+                .short('w')
+                .help("Use any number of whitespace (Space, Tab) to separate fields in the input source.")
+                .value_name("WHITESPACE")
+                .action(ArgAction::SetTrue),
+        )
         .arg(
             Arg::new(options::FIELDS)
                 .short('f')
diff --git a/src/uu/cut/src/whitespace_searcher.rs b/src/uu/cut/src/whitespace_searcher.rs
new file mode 100644
index 000000000..d1aa2a057
--- /dev/null
+++ b/src/uu/cut/src/whitespace_searcher.rs
@@ -0,0 +1,96 @@
+// This file is part of the uutils coreutils package.
+//
+// (c) Rolf Morel
+//
+// For the full copyright and license information, please view the LICENSE
+// file that was distributed with this source code.
+
+use memchr::memchr2;
+
+/// Iterator over the runs of blanks (spaces and tabs) found in a byte slice.
+///
+/// Each item is a `(start, end)` pair of offsets into the original haystack:
+/// `start` is the first blank of a run and `end` is one past its last blank.
+pub struct WhitespaceSearcher<'a> {
+    haystack: &'a [u8],
+    position: usize,
+}
+
+impl<'a> WhitespaceSearcher<'a> {
+    /// Creates a searcher positioned at the start of `haystack`.
+    pub fn new(haystack: &'a [u8]) -> Self {
+        Self {
+            haystack,
+            position: 0,
+        }
+    }
+}
+
+impl<'a> Iterator for WhitespaceSearcher<'a> {
+    type Item = (usize, usize);
+
+    fn next(&mut self) -> Option<Self::Item> {
+        let match_idx = memchr2(b' ', b'\t', self.haystack)?;
+        // A run of consecutive blanks counts as a single delimiter.
+        let run_len = self.haystack[match_idx..]
+            .iter()
+            .take_while(|&&byte| byte == b' ' || byte == b'\t')
+            .count();
+        let skip = match_idx + run_len;
+        let match_pos = self.position + match_idx;
+        self.haystack = &self.haystack[skip..];
+        self.position += skip;
+        Some((match_pos, self.position))
+    }
+}
+
+#[cfg(test)]
+mod tests {
+
+    use super::*;
+
+    #[test]
+    fn test_space() {
+        let iter = WhitespaceSearcher::new(" . . ".as_bytes());
+        let items: Vec<(usize, usize)> = iter.collect();
+        assert_eq!(vec![(0, 1), (2, 3), (4, 5)], items);
+    }
+
+    #[test]
+    fn test_tab() {
+        let iter = WhitespaceSearcher::new("\t.\t.\t".as_bytes());
+        let items: Vec<(usize, usize)> = iter.collect();
+        assert_eq!(vec![(0, 1), (2, 3), (4, 5)], items);
+    }
+
+    #[test]
+    fn test_empty() {
+        let iter = WhitespaceSearcher::new("".as_bytes());
+        let items: Vec<(usize, usize)> = iter.collect();
+        assert_eq!(vec![] as Vec<(usize, usize)>, items);
+    }
+
+    fn test_multispace(line: &[u8], expected: &[(usize, usize)]) {
+        let iter = WhitespaceSearcher::new(line);
+        let items: Vec<(usize, usize)> = iter.collect();
+        assert_eq!(expected, items);
+    }
+
+    #[test]
+    fn test_multispace_normal() {
+        test_multispace(
+            "...  ... \t...\t ... \t ...".as_bytes(),
+            &[(3, 5), (8, 10), (13, 15), (18, 21)],
+        );
+    }
+
+    #[test]
+    fn test_multispace_begin() {
+        test_multispace(" \t\t...".as_bytes(), &[(0, 3)]);
+    }
+
+    #[test]
+    fn test_multispace_end() {
+        test_multispace("...\t  ".as_bytes(), &[(3, 6)]);
+    }
+}
diff --git a/tests/by-util/test_cut.rs b/tests/by-util/test_cut.rs
index bcdd9eaf0..f3930a633 100644
--- a/tests/by-util/test_cut.rs
+++ b/tests/by-util/test_cut.rs
@@ -81,6 +81,16 @@ fn test_field_sequence() {
     }
 }
 
+#[test]
+fn test_whitespace_delimited() {
+    for param in ["-w"] {
+        new_ucmd!()
+            .args(&[param, "-f", COMPLEX_SEQUENCE.sequence, INPUT])
+            .succeeds()
+            .stdout_only_fixture("whitespace_delimited.expected");
+    }
+}
+
 #[test]
 fn test_specify_delimiter() {
     for param in ["-d", "--delimiter", "--del"] {
diff --git a/tests/fixtures/cut/whitespace_delimited.expected b/tests/fixtures/cut/whitespace_delimited.expected
new file mode 100644
index 000000000..cb064b7d2
--- /dev/null
+++ b/tests/fixtures/cut/whitespace_delimited.expected
@@ -0,0 +1,5 @@
+foo:bar:baz:qux:quux
+one:two:three:four:five:six:seven
+alpha:beta:gamma:delta:epsilon:zeta:eta:theta:iota:kappa:lambda:mu
+the	quick	fox	over	the	dog
+sally	sells	down	the	seashore	are	the	seashells	sally	sells