From f6a0abaee315b441698b756c8f1e00dbbed1bea6 Mon Sep 17 00:00:00 2001 From: TechHara Date: Sat, 10 Dec 2022 21:47:37 -0500 Subject: [PATCH 01/21] add whitespace delimiter option --- src/uu/cut/src/cut.rs | 167 +++++++++++++++--- src/uu/cut/src/whitespace_searcher.rs | 96 ++++++++++ tests/by-util/test_cut.rs | 10 ++ .../cut/whitespace_delimited.expected | 5 + 4 files changed, 253 insertions(+), 25 deletions(-) create mode 100644 src/uu/cut/src/whitespace_searcher.rs create mode 100644 tests/fixtures/cut/whitespace_delimited.expected diff --git a/src/uu/cut/src/cut.rs b/src/uu/cut/src/cut.rs index 0cc1ec339..a562d2b85 100644 --- a/src/uu/cut/src/cut.rs +++ b/src/uu/cut/src/cut.rs @@ -16,14 +16,16 @@ use uucore::display::Quotable; use uucore::error::{FromIo, UResult, USimpleError}; use self::searcher::Searcher; +use self::whitespace_searcher::WhitespaceSearcher; use uucore::ranges::Range; use uucore::{format_usage, show, show_error, show_if_err}; mod searcher; +mod whitespace_searcher; static NAME: &str = "cut"; static USAGE: &str = - "{} [-d] [-s] [-z] [--output-delimiter] ((-f|-b|-c) {{sequence}}) {{sourcefile}}+"; + "{} [-d|-w] [-s] [-z] [--output-delimiter] ((-f|-b|-c) {{sequence}}) {{sourcefile}}+"; static ABOUT: &str = "Prints specified byte or field columns from each line of stdin or the input files"; static LONG_HELP: &str = " @@ -85,6 +87,10 @@ static LONG_HELP: &str = " --delimiter (-d) option. Setting the delimiter is optional. If not set, a default delimiter of Tab will be used. + If the -w option is provided, fields will be separated by any number + of whitespace characters (Space and Tab). The output delimiter will + be a Tab unless explicitly specified. Only one of -d or -w option can be specified. + Optionally Filter based on delimiter If the --only-delimited (-s) flag is provided, only lines which contain the delimiter will be printed @@ -115,6 +121,7 @@ struct FieldOptions { delimiter: String, // one char long, String because of UTF8 representation out_delimiter: Option, only_delimited: bool, + whitespace_delimited: bool, zero_terminated: bool, } @@ -256,9 +263,98 @@ fn cut_fields_delimiter( Ok(()) } +fn cut_fields_whitespace( + reader: R, + ranges: &[Range], + only_delimited: bool, + newline_char: u8, + out_delim: &str, +) -> UResult<()> { + let mut buf_in = BufReader::new(reader); + let mut out = stdout_writer(); + + let result = buf_in.for_byte_record_with_terminator(newline_char, |line| { + let mut fields_pos = 1; + let mut low_idx = 0; + let mut delim_search = WhitespaceSearcher::new(line).peekable(); + let mut print_delim = false; + + if delim_search.peek().is_none() { + if !only_delimited { + out.write_all(line)?; + if line[line.len() - 1] != newline_char { + out.write_all(&[newline_char])?; + } + } + + return Ok(true); + } + + for &Range { low, high } in ranges { + if low - fields_pos > 0 { + low_idx = match delim_search.nth(low - fields_pos - 1) { + Some((_, last)) => last, + None => break, + }; + } + + for _ in 0..=high - low { + if print_delim { + out.write_all(out_delim.as_bytes())?; + } else { + print_delim = true; + } + + match delim_search.next() { + Some((first, last)) => { + let segment = &line[low_idx..first]; + + out.write_all(segment)?; + + low_idx = last; + fields_pos = high + 1; + } + None => { + let segment = &line[low_idx..]; + + out.write_all(segment)?; + + if line[line.len() - 1] == newline_char { + return Ok(true); + } + break; + } + } + } + } + + out.write_all(&[newline_char])?; + Ok(true) + }); + + if let Err(e) = result { + return Err(USimpleError::new(1, e.to_string())); + } + + Ok(()) +} + #[allow(clippy::cognitive_complexity)] fn cut_fields(reader: R, ranges: &[Range], opts: &FieldOptions) -> UResult<()> { let newline_char = if opts.zero_terminated { b'\0' } else { b'\n' }; + + if opts.whitespace_delimited { + return cut_fields_whitespace( + reader, + ranges, + opts.only_delimited, + newline_char, + match opts.out_delimiter { + Some(ref delim) => delim, + _ => "\t", + } + ); + } if let Some(ref o_delim) = opts.out_delimiter { return cut_fields_delimiter( reader, @@ -387,6 +483,7 @@ mod options { pub const ZERO_TERMINATED: &str = "zero-terminated"; pub const ONLY_DELIMITED: &str = "only-delimited"; pub const OUTPUT_DELIMITER: &str = "output-delimiter"; + pub const WHITESPACE_DELIMITED: &str = "whitespace-delimited"; pub const COMPLEMENT: &str = "complement"; pub const FILE: &str = "file"; } @@ -449,37 +546,44 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> { }; let only_delimited = matches.get_flag(options::ONLY_DELIMITED); + let whitespace_delimited = matches.get_flag(options::WHITESPACE_DELIMITED); let zero_terminated = matches.get_flag(options::ZERO_TERMINATED); match matches.get_one::(options::DELIMITER).map(|s| s.as_str()) { Some(mut delim) => { - // GNU's `cut` supports `-d=` to set the delimiter to `=`. - // Clap parsing is limited in this situation, see: - // https://github.com/uutils/coreutils/issues/2424#issuecomment-863825242 - if delimiter_is_equal { - delim = "="; - } else if delim == "''" { - // treat `''` as empty delimiter - delim = ""; + if whitespace_delimited { + Err("invalid input: Only one of --delimiter (-d) or -w option can be specified".into()) } - if delim.chars().count() > 1 { - Err("invalid input: The '--delimiter' ('-d') option expects empty or 1 character long, but was provided a value 2 characters or longer".into()) - } else { - let delim = if delim.is_empty() { - "\0".to_owned() + else { + // GNU's `cut` supports `-d=` to set the delimiter to `=`. + // Clap parsing is limited in this situation, see: + // https://github.com/uutils/coreutils/issues/2424#issuecomment-863825242 + if delimiter_is_equal { + delim = "="; + } else if delim == "''" { + // treat `''` as empty delimiter + delim = ""; + } + if delim.chars().count() > 1 { + Err("invalid input: The '--delimiter' ('-d') option expects empty or 1 character long, but was provided a value 2 characters or longer".into()) } else { - delim.to_owned() - }; + let delim = if delim.is_empty() { + "\0".to_owned() + } else { + delim.to_owned() + }; - Ok(Mode::Fields( - ranges, - FieldOptions { - delimiter: delim, - out_delimiter: out_delim, - only_delimited, - zero_terminated, - }, - )) + Ok(Mode::Fields( + ranges, + FieldOptions { + delimiter: delim, + out_delimiter: out_delim, + only_delimited, + whitespace_delimited, + zero_terminated, + }, + )) + } } } None => Ok(Mode::Fields( @@ -488,6 +592,7 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> { delimiter: "\t".to_owned(), out_delimiter: out_delim, only_delimited, + whitespace_delimited, zero_terminated, }, )), @@ -508,6 +613,11 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> { { Err("invalid input: The '--delimiter' ('-d') option only usable if printing a sequence of fields".into()) } + Mode::Bytes(_, _) | Mode::Characters(_, _) + if matches.contains_id(options::WHITESPACE_DELIMITED) => + { + Err("invalid input: The '-w' option only usable if printing a sequence of fields".into()) + } Mode::Bytes(_, _) | Mode::Characters(_, _) if matches.get_flag(options::ONLY_DELIMITED) => { @@ -563,6 +673,13 @@ pub fn uu_app() -> Command { .help("specify the delimiter character that separates fields in the input source. Defaults to Tab.") .value_name("DELIM"), ) + .arg( + Arg::new(options::WHITESPACE_DELIMITED) + .short('w') + .help("Use any number of whitespace (Space, Tab) to separate fields in the input source.") + .value_name("WHITESPACE") + .action(ArgAction::SetTrue), + ) .arg( Arg::new(options::FIELDS) .short('f') diff --git a/src/uu/cut/src/whitespace_searcher.rs b/src/uu/cut/src/whitespace_searcher.rs new file mode 100644 index 000000000..d1aa2a057 --- /dev/null +++ b/src/uu/cut/src/whitespace_searcher.rs @@ -0,0 +1,96 @@ +// This file is part of the uutils coreutils package. +// +// (c) Rolf Morel +// +// For the full copyright and license information, please view the LICENSE +// file that was distributed with this source code. + +use memchr::memchr2; + +pub struct WhitespaceSearcher<'a> { + haystack: &'a [u8], + position: usize, +} + +impl<'a> WhitespaceSearcher<'a> { + pub fn new(haystack: &'a [u8]) -> WhitespaceSearcher<'a> { + WhitespaceSearcher { + haystack, + position: 0, + } + } +} + +impl<'a> Iterator for WhitespaceSearcher<'a> { + type Item = (usize, usize); + + fn next(&mut self) -> Option { + loop { + if let Some(match_idx) = memchr2(b' ', b'\t', self.haystack) { + let mut skip = match_idx + 1; + while skip < self.haystack.len() + && (self.haystack[skip] == b' ' || self.haystack[skip] == b'\t') + { + skip += 1; + } + let match_pos = self.position + match_idx; + self.haystack = &self.haystack[skip..]; + self.position += skip; + return Some((match_pos, self.position)); + } else { + return None; + } + } + } +} + +#[cfg(test)] +mod tests { + + use super::*; + + #[test] + fn test_space() { + let iter = WhitespaceSearcher::new(" . . ".as_bytes()); + let items: Vec<(usize, usize)> = iter.collect(); + assert_eq!(vec![(0, 1), (2, 3), (4, 5)], items); + } + + #[test] + fn test_tab() { + let iter = WhitespaceSearcher::new("\t.\t.\t".as_bytes()); + let items: Vec<(usize, usize)> = iter.collect(); + assert_eq!(vec![(0, 1), (2, 3), (4, 5)], items); + } + + #[test] + fn test_empty() { + let iter = WhitespaceSearcher::new("".as_bytes()); + let items: Vec<(usize, usize)> = iter.collect(); + assert_eq!(vec![] as Vec<(usize, usize)>, items); + } + + fn test_multispace(line: &[u8], expected: &[(usize, usize)]) { + let iter = WhitespaceSearcher::new(line); + let items: Vec<(usize, usize)> = iter.collect(); + assert_eq!(expected, items); + } + + #[test] + fn test_multispace_normal() { + test_multispace( + "... ... \t...\t ... \t ...".as_bytes(), + &[(3, 5), (8, 10), (13, 15), (18, 21)], + ); + } + + #[test] + fn test_multispace_begin() { + test_multispace(" \t\t...".as_bytes(), &[(0, 3)]); + } + + #[test] + fn test_multispace_end() { + test_multispace("...\t ".as_bytes(), &[(3, 6)]); + } +} diff --git a/tests/by-util/test_cut.rs b/tests/by-util/test_cut.rs index bcdd9eaf0..f3930a633 100644 --- a/tests/by-util/test_cut.rs +++ b/tests/by-util/test_cut.rs @@ -81,6 +81,16 @@ fn test_field_sequence() { } } +#[test] +fn test_whitespace_delimited() { + for param in ["-w"] { + new_ucmd!() + .args(&[param, "-f", COMPLEX_SEQUENCE.sequence, INPUT]) + .succeeds() + .stdout_only_fixture("whitespace_delimited.expected"); + } +} + #[test] fn test_specify_delimiter() { for param in ["-d", "--delimiter", "--del"] { diff --git a/tests/fixtures/cut/whitespace_delimited.expected b/tests/fixtures/cut/whitespace_delimited.expected new file mode 100644 index 000000000..cb064b7d2 --- /dev/null +++ b/tests/fixtures/cut/whitespace_delimited.expected @@ -0,0 +1,5 @@ +foo:bar:baz:qux:quux +one:two:three:four:five:six:seven +alpha:beta:gamma:delta:epsilon:zeta:eta:theta:iota:kappa:lambda:mu +the quick fox over the dog +sally sells down the seashore are the seashells sally sells From 1cc3f331e79987b6846f72faa81822b4ab779112 Mon Sep 17 00:00:00 2001 From: TechHara Date: Mon, 12 Dec 2022 22:29:31 -0500 Subject: [PATCH 02/21] format --- src/uu/cut/src/cut.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/uu/cut/src/cut.rs b/src/uu/cut/src/cut.rs index a562d2b85..afe751a57 100644 --- a/src/uu/cut/src/cut.rs +++ b/src/uu/cut/src/cut.rs @@ -352,7 +352,7 @@ fn cut_fields(reader: R, ranges: &[Range], opts: &FieldOptions) -> URes match opts.out_delimiter { Some(ref delim) => delim, _ => "\t", - } + }, ); } if let Some(ref o_delim) = opts.out_delimiter { From 21cf0b41ab7a971a2076cbcf98dda40465e627c6 Mon Sep 17 00:00:00 2001 From: TechHara Date: Tue, 13 Dec 2022 00:11:01 -0500 Subject: [PATCH 03/21] add clippy --- src/uu/cut/src/cut.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/uu/cut/src/cut.rs b/src/uu/cut/src/cut.rs index afe751a57..8198d2384 100644 --- a/src/uu/cut/src/cut.rs +++ b/src/uu/cut/src/cut.rs @@ -263,6 +263,7 @@ fn cut_fields_delimiter( Ok(()) } +#[allow(clippy::cognitive_complexity)] fn cut_fields_whitespace( reader: R, ranges: &[Range], From 50c8bd4c6be9bbd8f3fcccd08f44bc42355da701 Mon Sep 17 00:00:00 2001 From: TechHara Date: Tue, 13 Dec 2022 00:48:17 -0500 Subject: [PATCH 04/21] fix --- src/uu/cut/src/cut.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/uu/cut/src/cut.rs b/src/uu/cut/src/cut.rs index 8198d2384..9699e36f0 100644 --- a/src/uu/cut/src/cut.rs +++ b/src/uu/cut/src/cut.rs @@ -615,7 +615,7 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> { Err("invalid input: The '--delimiter' ('-d') option only usable if printing a sequence of fields".into()) } Mode::Bytes(_, _) | Mode::Characters(_, _) - if matches.contains_id(options::WHITESPACE_DELIMITED) => + if matches.get_flag(options::WHITESPACE_DELIMITED) => { Err("invalid input: The '-w' option only usable if printing a sequence of fields".into()) } From f334ae314983c17909088b81fe38eb027c38787b Mon Sep 17 00:00:00 2001 From: TechHara Date: Tue, 13 Dec 2022 01:10:48 -0500 Subject: [PATCH 05/21] use match guard to minimize --- src/uu/cut/src/cut.rs | 59 +++++++++++++++++++++---------------------- 1 file changed, 29 insertions(+), 30 deletions(-) diff --git a/src/uu/cut/src/cut.rs b/src/uu/cut/src/cut.rs index 9699e36f0..fd1056199 100644 --- a/src/uu/cut/src/cut.rs +++ b/src/uu/cut/src/cut.rs @@ -551,42 +551,41 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> { let zero_terminated = matches.get_flag(options::ZERO_TERMINATED); match matches.get_one::(options::DELIMITER).map(|s| s.as_str()) { - Some(mut delim) => { - if whitespace_delimited { + Some(_) if whitespace_delimited => { Err("invalid input: Only one of --delimiter (-d) or -w option can be specified".into()) } - else { - // GNU's `cut` supports `-d=` to set the delimiter to `=`. - // Clap parsing is limited in this situation, see: - // https://github.com/uutils/coreutils/issues/2424#issuecomment-863825242 - if delimiter_is_equal { - delim = "="; - } else if delim == "''" { - // treat `''` as empty delimiter - delim = ""; - } - if delim.chars().count() > 1 { - Err("invalid input: The '--delimiter' ('-d') option expects empty or 1 character long, but was provided a value 2 characters or longer".into()) + Some(mut delim) => { + // GNU's `cut` supports `-d=` to set the delimiter to `=`. + // Clap parsing is limited in this situation, see: + // https://github.com/uutils/coreutils/issues/2424#issuecomment-863825242 + if delimiter_is_equal { + delim = "="; + } else if delim == "''" { + // treat `''` as empty delimiter + delim = ""; + } + if delim.chars().count() > 1 { + Err("invalid input: The '--delimiter' ('-d') option expects empty or 1 character long, but was provided a value 2 characters or longer".into()) + } else { + let delim = if delim.is_empty() { + "\0".to_owned() } else { - let delim = if delim.is_empty() { - "\0".to_owned() - } else { - delim.to_owned() - }; + delim.to_owned() + }; - Ok(Mode::Fields( - ranges, - FieldOptions { - delimiter: delim, - out_delimiter: out_delim, - only_delimited, - whitespace_delimited, - zero_terminated, - }, - )) - } + Ok(Mode::Fields( + ranges, + FieldOptions { + delimiter: delim, + out_delimiter: out_delim, + only_delimited, + whitespace_delimited, + zero_terminated, + }, + )) } } + None => Ok(Mode::Fields( ranges, FieldOptions { From 188d84b154bd363d7d29e8c10b822f7bf718d648 Mon Sep 17 00:00:00 2001 From: TechHara Date: Tue, 13 Dec 2022 01:14:14 -0500 Subject: [PATCH 06/21] minimize unnecessary code change --- src/uu/cut/src/cut.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/uu/cut/src/cut.rs b/src/uu/cut/src/cut.rs index fd1056199..efcebd8c6 100644 --- a/src/uu/cut/src/cut.rs +++ b/src/uu/cut/src/cut.rs @@ -585,7 +585,6 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> { )) } } - None => Ok(Mode::Fields( ranges, FieldOptions { From 866a6d25c8b84219738bad647ef5b4af4f404c8e Mon Sep 17 00:00:00 2001 From: TechHara Date: Tue, 13 Dec 2022 11:18:07 -0500 Subject: [PATCH 07/21] fix clippy complaint --- src/uu/cut/src/whitespace_searcher.rs | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/src/uu/cut/src/whitespace_searcher.rs b/src/uu/cut/src/whitespace_searcher.rs index d1aa2a057..eafd786e1 100644 --- a/src/uu/cut/src/whitespace_searcher.rs +++ b/src/uu/cut/src/whitespace_searcher.rs @@ -25,21 +25,19 @@ impl<'a> Iterator for WhitespaceSearcher<'a> { type Item = (usize, usize); fn next(&mut self) -> Option { - loop { - if let Some(match_idx) = memchr2(b' ', b'\t', self.haystack) { - let mut skip = match_idx + 1; - while skip < self.haystack.len() - && (self.haystack[skip] == b' ' || self.haystack[skip] == b'\t') - { - skip += 1; - } - let match_pos = self.position + match_idx; - self.haystack = &self.haystack[skip..]; - self.position += skip; - return Some((match_pos, self.position)); - } else { - return None; + if let Some(match_idx) = memchr2(b' ', b'\t', self.haystack) { + let mut skip = match_idx + 1; + while skip < self.haystack.len() + && (self.haystack[skip] == b' ' || self.haystack[skip] == b'\t') + { + skip += 1; } + let match_pos = self.position + match_idx; + self.haystack = &self.haystack[skip..]; + self.position += skip; + return Some((match_pos, self.position)); + } else { + return None; } } } From 17c48e13f4a8e854013a28b8d619eb081d08e7d8 Mon Sep 17 00:00:00 2001 From: TechHara Date: Tue, 13 Dec 2022 11:30:58 -0500 Subject: [PATCH 08/21] specify that cut -w is freebsd extension --- docs/src/extensions.md | 4 ++++ src/uu/cut/src/cut.rs | 3 ++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/docs/src/extensions.md b/docs/src/extensions.md index f24518281..275690a33 100644 --- a/docs/src/extensions.md +++ b/docs/src/extensions.md @@ -29,3 +29,7 @@ We provide a simple implementation of `more`, which is not part of GNU coreutils. We do not aim for full compatibility with the `more` utility from `util-linux`. Features from more modern pagers (like `less` and `bat`) are therefore welcomed. + +## `cut` + +`cut` can separate fields by whitespace (Space and Tab) with `-w` flag. This feature is adopted from [FreeBSD](https://www.freebsd.org/cgi/man.cgi?cut). \ No newline at end of file diff --git a/src/uu/cut/src/cut.rs b/src/uu/cut/src/cut.rs index efcebd8c6..5f0534795 100644 --- a/src/uu/cut/src/cut.rs +++ b/src/uu/cut/src/cut.rs @@ -90,6 +90,7 @@ static LONG_HELP: &str = " If the -w option is provided, fields will be separated by any number of whitespace characters (Space and Tab). The output delimiter will be a Tab unless explicitly specified. Only one of -d or -w option can be specified. + This is an extension adopted from FreeBSD. Optionally Filter based on delimiter If the --only-delimited (-s) flag is provided, only lines which @@ -675,7 +676,7 @@ pub fn uu_app() -> Command { .arg( Arg::new(options::WHITESPACE_DELIMITED) .short('w') - .help("Use any number of whitespace (Space, Tab) to separate fields in the input source.") + .help("Use any number of whitespace (Space, Tab) to separate fields in the input source (FreeBSD extension).") .value_name("WHITESPACE") .action(ArgAction::SetTrue), ) From 11e64958d671a791dc80d8f0ec6774ac07f96432 Mon Sep 17 00:00:00 2001 From: TechHara Date: Tue, 13 Dec 2022 11:40:35 -0500 Subject: [PATCH 09/21] add test cases to cover cut -w failing options --- tests/by-util/test_cut.rs | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/tests/by-util/test_cut.rs b/tests/by-util/test_cut.rs index f3930a633..dcf4a524f 100644 --- a/tests/by-util/test_cut.rs +++ b/tests/by-util/test_cut.rs @@ -91,6 +91,30 @@ fn test_whitespace_delimited() { } } +#[test] +fn test_whitespace_with_explicit_delimiter() { + new_ucmd!() + .args(&["-w", "-f", COMPLEX_SEQUENCE.sequence, "-d:"]) + .fails() + .code_is(1); +} + +#[test] +fn test_whitespace_with_byte() { + new_ucmd!() + .args(&["-w", "-b", COMPLEX_SEQUENCE.sequence]) + .fails() + .code_is(1); +} + +#[test] +fn test_whitespace_with_char() { + new_ucmd!() + .args(&["-c", COMPLEX_SEQUENCE.sequence, "-w"]) + .fails() + .code_is(1); +} + #[test] fn test_specify_delimiter() { for param in ["-d", "--delimiter", "--del"] { From 5867a47f2113654a1890d784c662839e66f7fe9f Mon Sep 17 00:00:00 2001 From: TechHara Date: Tue, 13 Dec 2022 11:49:26 -0500 Subject: [PATCH 10/21] fix clippy complaint --- src/uu/cut/src/whitespace_searcher.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/uu/cut/src/whitespace_searcher.rs b/src/uu/cut/src/whitespace_searcher.rs index eafd786e1..e35b03bca 100644 --- a/src/uu/cut/src/whitespace_searcher.rs +++ b/src/uu/cut/src/whitespace_searcher.rs @@ -35,9 +35,9 @@ impl<'a> Iterator for WhitespaceSearcher<'a> { let match_pos = self.position + match_idx; self.haystack = &self.haystack[skip..]; self.position += skip; - return Some((match_pos, self.position)); + Some((match_pos, self.position)) } else { - return None; + None } } } From db829321fca1c1c11567fdee3e658427612f92dc Mon Sep 17 00:00:00 2001 From: TechHara Date: Tue, 13 Dec 2022 11:51:05 -0500 Subject: [PATCH 11/21] remove clippy complex --- src/uu/cut/src/cut.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/uu/cut/src/cut.rs b/src/uu/cut/src/cut.rs index 5f0534795..7e4991179 100644 --- a/src/uu/cut/src/cut.rs +++ b/src/uu/cut/src/cut.rs @@ -264,7 +264,6 @@ fn cut_fields_delimiter( Ok(()) } -#[allow(clippy::cognitive_complexity)] fn cut_fields_whitespace( reader: R, ranges: &[Range], From 9798ae79877d1aa5e76519c4db9fd0aca9390d1a Mon Sep 17 00:00:00 2001 From: TechHara Date: Tue, 13 Dec 2022 20:34:01 -0500 Subject: [PATCH 12/21] fix clippy complaint --- tests/by-util/test_cut.rs | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/tests/by-util/test_cut.rs b/tests/by-util/test_cut.rs index dcf4a524f..bad609758 100644 --- a/tests/by-util/test_cut.rs +++ b/tests/by-util/test_cut.rs @@ -83,12 +83,10 @@ fn test_field_sequence() { #[test] fn test_whitespace_delimited() { - for param in ["-w"] { - new_ucmd!() - .args(&[param, "-f", COMPLEX_SEQUENCE.sequence, INPUT]) - .succeeds() - .stdout_only_fixture("whitespace_delimited.expected"); - } + new_ucmd!() + .args(&["-w", "-f", COMPLEX_SEQUENCE.sequence, INPUT]) + .succeeds() + .stdout_only_fixture("whitespace_delimited.expected"); } #[test] From 27dfe63f4c83beba9f4bd74820fd2b9a4f5c3268 Mon Sep 17 00:00:00 2001 From: TechHara Date: Tue, 13 Dec 2022 20:57:46 -0500 Subject: [PATCH 13/21] add comments on the function logic --- src/uu/cut/src/cut.rs | 5 ++++- src/uu/cut/src/whitespace_searcher.rs | 3 +++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/src/uu/cut/src/cut.rs b/src/uu/cut/src/cut.rs index 7e4991179..2645a2dee 100644 --- a/src/uu/cut/src/cut.rs +++ b/src/uu/cut/src/cut.rs @@ -290,7 +290,10 @@ fn cut_fields_whitespace( return Ok(true); } - + // The logic is identical to `cut_fields_delimiter` function above, which uses + // `Searcher` that iterates over and returns the first position of the delimiter character. + // The main difference is that `WhitespaceSearcher` returns a pair of the first and last + // delimiter character positions, since each delimiter sequence length can vary. for &Range { low, high } in ranges { if low - fields_pos > 0 { low_idx = match delim_search.nth(low - fields_pos - 1) { diff --git a/src/uu/cut/src/whitespace_searcher.rs b/src/uu/cut/src/whitespace_searcher.rs index e35b03bca..b17910c7a 100644 --- a/src/uu/cut/src/whitespace_searcher.rs +++ b/src/uu/cut/src/whitespace_searcher.rs @@ -24,6 +24,9 @@ impl<'a> WhitespaceSearcher<'a> { impl<'a> Iterator for WhitespaceSearcher<'a> { type Item = (usize, usize); + // Iterate over sequences of consecutive whitespace (space and/or tab) characters. + // Returns (first, last) positions of each sequence, where `first` is inclusive and `last` is exclusive. + // The delimiter sequence byte-length is equal to `last - first` fn next(&mut self) -> Option { if let Some(match_idx) = memchr2(b' ', b'\t', self.haystack) { let mut skip = match_idx + 1; From 76f818da0591bf7dd72bcda92df415443910f81f Mon Sep 17 00:00:00 2001 From: TechHara Date: Tue, 13 Dec 2022 21:02:16 -0500 Subject: [PATCH 14/21] fix whitespace --- src/uu/cut/src/cut.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/uu/cut/src/cut.rs b/src/uu/cut/src/cut.rs index 2645a2dee..cc8a348bb 100644 --- a/src/uu/cut/src/cut.rs +++ b/src/uu/cut/src/cut.rs @@ -88,7 +88,7 @@ static LONG_HELP: &str = " If not set, a default delimiter of Tab will be used. If the -w option is provided, fields will be separated by any number - of whitespace characters (Space and Tab). The output delimiter will + of whitespace characters (Space and Tab). The output delimiter will be a Tab unless explicitly specified. Only one of -d or -w option can be specified. This is an extension adopted from FreeBSD. From 4e2cfdb8cea063b1eff68c60561f679f8f6fe46c Mon Sep 17 00:00:00 2001 From: TechHara Date: Wed, 14 Dec 2022 04:07:12 -0500 Subject: [PATCH 15/21] add more comments --- src/uu/cut/src/cut.rs | 6 ++++++ src/uu/cut/src/whitespace_searcher.rs | 4 ++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/uu/cut/src/cut.rs b/src/uu/cut/src/cut.rs index cc8a348bb..3ecf2f34b 100644 --- a/src/uu/cut/src/cut.rs +++ b/src/uu/cut/src/cut.rs @@ -296,13 +296,17 @@ fn cut_fields_whitespace( // delimiter character positions, since each delimiter sequence length can vary. for &Range { low, high } in ranges { if low - fields_pos > 0 { + // current field is not in the range, so jump to the field corresponding to the + // beginning of the range if any low_idx = match delim_search.nth(low - fields_pos - 1) { Some((_, last)) => last, None => break, }; } + // at this point, current field is the first in the range for _ in 0..=high - low { + // skip printing delimiter if this is the first matching field for this line if print_delim { out.write_all(out_delim.as_bytes())?; } else { @@ -310,6 +314,7 @@ fn cut_fields_whitespace( } match delim_search.next() { + // print the current field up to the next whitespace Some((first, last)) => { let segment = &line[low_idx..first]; @@ -319,6 +324,7 @@ fn cut_fields_whitespace( fields_pos = high + 1; } None => { + // this is the last field in the line, so print the rest let segment = &line[low_idx..]; out.write_all(segment)?; diff --git a/src/uu/cut/src/whitespace_searcher.rs b/src/uu/cut/src/whitespace_searcher.rs index b17910c7a..392b951f4 100644 --- a/src/uu/cut/src/whitespace_searcher.rs +++ b/src/uu/cut/src/whitespace_searcher.rs @@ -25,8 +25,8 @@ impl<'a> Iterator for WhitespaceSearcher<'a> { type Item = (usize, usize); // Iterate over sequences of consecutive whitespace (space and/or tab) characters. - // Returns (first, last) positions of each sequence, where `first` is inclusive and `last` is exclusive. - // The delimiter sequence byte-length is equal to `last - first` + // Returns (first, last) positions of each sequence, where `haystack[first..last]` + // corresponds to the delimiter. fn next(&mut self) -> Option { if let Some(match_idx) = memchr2(b' ', b'\t', self.haystack) { let mut skip = match_idx + 1; From df8ce0c9996c25a02e4508e8fbefe920d73d85c9 Mon Sep 17 00:00:00 2001 From: TechHara Date: Sun, 18 Dec 2022 09:06:47 -0500 Subject: [PATCH 16/21] ignore multispace cspell --- src/uu/cut/src/whitespace_searcher.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/uu/cut/src/whitespace_searcher.rs b/src/uu/cut/src/whitespace_searcher.rs index 392b951f4..08e2ff3fb 100644 --- a/src/uu/cut/src/whitespace_searcher.rs +++ b/src/uu/cut/src/whitespace_searcher.rs @@ -5,6 +5,8 @@ // For the full copyright and license information, please view the LICENSE // file that was distributed with this source code. +// cSpell:ignore multispace + use memchr::memchr2; pub struct WhitespaceSearcher<'a> { From f2a7175144675085a43affa50c26ac4196ff61bc Mon Sep 17 00:00:00 2001 From: TechHara Date: Tue, 20 Dec 2022 20:37:31 -0500 Subject: [PATCH 17/21] enum Delimiter; misc changes --- src/uu/cut/src/cut.rs | 178 ++++++++++++++------------ src/uu/cut/src/whitespace_searcher.rs | 4 +- 2 files changed, 94 insertions(+), 88 deletions(-) diff --git a/src/uu/cut/src/cut.rs b/src/uu/cut/src/cut.rs index 3ecf2f34b..92a62970c 100644 --- a/src/uu/cut/src/cut.rs +++ b/src/uu/cut/src/cut.rs @@ -118,11 +118,15 @@ struct Options { zero_terminated: bool, } +enum Delimiter { + Whitespace, + String(String), // one char long, String because of UTF8 representation +} + struct FieldOptions { - delimiter: String, // one char long, String because of UTF8 representation + delimiter: Delimiter, out_delimiter: Option, only_delimited: bool, - whitespace_delimited: bool, zero_terminated: bool, } @@ -352,95 +356,98 @@ fn cut_fields_whitespace( #[allow(clippy::cognitive_complexity)] fn cut_fields(reader: R, ranges: &[Range], opts: &FieldOptions) -> UResult<()> { let newline_char = if opts.zero_terminated { b'\0' } else { b'\n' }; - - if opts.whitespace_delimited { - return cut_fields_whitespace( - reader, - ranges, - opts.only_delimited, - newline_char, - match opts.out_delimiter { - Some(ref delim) => delim, - _ => "\t", - }, - ); - } - if let Some(ref o_delim) = opts.out_delimiter { - return cut_fields_delimiter( - reader, - ranges, - &opts.delimiter, - opts.only_delimited, - newline_char, - o_delim, - ); - } - - let mut buf_in = BufReader::new(reader); - let mut out = stdout_writer(); - let delim_len = opts.delimiter.len(); - - let result = buf_in.for_byte_record_with_terminator(newline_char, |line| { - let mut fields_pos = 1; - let mut low_idx = 0; - let mut delim_search = Searcher::new(line, opts.delimiter.as_bytes()).peekable(); - let mut print_delim = false; - - if delim_search.peek().is_none() { - if !opts.only_delimited { - out.write_all(line)?; - if line[line.len() - 1] != newline_char { - out.write_all(&[newline_char])?; - } - } - - return Ok(true); + match opts.delimiter { + Delimiter::Whitespace => { + return cut_fields_whitespace( + reader, + ranges, + opts.only_delimited, + newline_char, + match opts.out_delimiter { + Some(ref delim) => delim, + _ => "\t", + }, + ) } - - for &Range { low, high } in ranges { - if low - fields_pos > 0 { - if let Some(delim_pos) = delim_search.nth(low - fields_pos - 1) { - low_idx = if print_delim { - delim_pos - } else { - delim_pos + delim_len - } - } else { - break; - } + Delimiter::String(ref delimiter) => { + if let Some(ref o_delim) = opts.out_delimiter { + return cut_fields_delimiter( + reader, + ranges, + &delimiter, + opts.only_delimited, + newline_char, + o_delim, + ); } - match delim_search.nth(high - low) { - Some(high_idx) => { - let segment = &line[low_idx..high_idx]; + let mut buf_in = BufReader::new(reader); + let mut out = stdout_writer(); + let delim_len = delimiter.len(); - out.write_all(segment)?; + let result = buf_in.for_byte_record_with_terminator(newline_char, |line| { + let mut fields_pos = 1; + let mut low_idx = 0; + let mut delim_search = Searcher::new(line, delimiter.as_bytes()).peekable(); + let mut print_delim = false; - print_delim = true; - low_idx = high_idx; - fields_pos = high + 1; - } - None => { - let segment = &line[low_idx..line.len()]; - - out.write_all(segment)?; - - if line[line.len() - 1] == newline_char { - return Ok(true); + if delim_search.peek().is_none() { + if !opts.only_delimited { + out.write_all(line)?; + if line[line.len() - 1] != newline_char { + out.write_all(&[newline_char])?; + } } - break; + + return Ok(true); } + + for &Range { low, high } in ranges { + if low - fields_pos > 0 { + if let Some(delim_pos) = delim_search.nth(low - fields_pos - 1) { + low_idx = if print_delim { + delim_pos + } else { + delim_pos + delim_len + } + } else { + break; + } + } + + match delim_search.nth(high - low) { + Some(high_idx) => { + let segment = &line[low_idx..high_idx]; + + out.write_all(segment)?; + + print_delim = true; + low_idx = high_idx; + fields_pos = high + 1; + } + None => { + let segment = &line[low_idx..line.len()]; + + out.write_all(segment)?; + + if line[line.len() - 1] == newline_char { + return Ok(true); + } + break; + } + } + } + out.write_all(&[newline_char])?; + Ok(true) + }); + + if let Err(e) = result { + return Err(USimpleError::new(1, e.to_string())); } + + Ok(()) } - out.write_all(&[newline_char])?; - Ok(true) - }); - - if let Err(e) = result { - return Err(USimpleError::new(1, e.to_string())); } - - Ok(()) } fn cut_files(mut filenames: Vec, mode: &Mode) { @@ -585,10 +592,9 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> { Ok(Mode::Fields( ranges, FieldOptions { - delimiter: delim, + delimiter: Delimiter::String(delim), out_delimiter: out_delim, only_delimited, - whitespace_delimited, zero_terminated, }, )) @@ -597,10 +603,12 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> { None => Ok(Mode::Fields( ranges, FieldOptions { - delimiter: "\t".to_owned(), + delimiter: match whitespace_delimited { + true => Delimiter::Whitespace, + false => Delimiter::String("\t".to_owned()), + }, out_delimiter: out_delim, only_delimited, - whitespace_delimited, zero_terminated, }, )), diff --git a/src/uu/cut/src/whitespace_searcher.rs b/src/uu/cut/src/whitespace_searcher.rs index 08e2ff3fb..8fc8ca5c6 100644 --- a/src/uu/cut/src/whitespace_searcher.rs +++ b/src/uu/cut/src/whitespace_searcher.rs @@ -1,11 +1,9 @@ // This file is part of the uutils coreutils package. // -// (c) Rolf Morel -// // For the full copyright and license information, please view the LICENSE // file that was distributed with this source code. -// cSpell:ignore multispace +// spell-checker:ignore multispace use memchr::memchr2; From a549589682ea03f3749498aa5bf0ec64d6a92f7d Mon Sep 17 00:00:00 2001 From: TechHara Date: Wed, 21 Dec 2022 08:01:31 -0500 Subject: [PATCH 18/21] fix clippy warnings --- src/uu/cut/src/cut.rs | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/src/uu/cut/src/cut.rs b/src/uu/cut/src/cut.rs index 92a62970c..236169f49 100644 --- a/src/uu/cut/src/cut.rs +++ b/src/uu/cut/src/cut.rs @@ -357,24 +357,22 @@ fn cut_fields_whitespace( fn cut_fields(reader: R, ranges: &[Range], opts: &FieldOptions) -> UResult<()> { let newline_char = if opts.zero_terminated { b'\0' } else { b'\n' }; match opts.delimiter { - Delimiter::Whitespace => { - return cut_fields_whitespace( - reader, - ranges, - opts.only_delimited, - newline_char, - match opts.out_delimiter { - Some(ref delim) => delim, - _ => "\t", - }, - ) - } + Delimiter::Whitespace => cut_fields_whitespace( + reader, + ranges, + opts.only_delimited, + newline_char, + match opts.out_delimiter { + Some(ref delim) => delim, + _ => "\t", + }, + ), Delimiter::String(ref delimiter) => { if let Some(ref o_delim) = opts.out_delimiter { return cut_fields_delimiter( reader, ranges, - &delimiter, + delimiter, opts.only_delimited, newline_char, o_delim, From a53dcba6d3aa97e7e3dfdd4d883aa92b8ad43666 Mon Sep 17 00:00:00 2001 From: TechHara Date: Thu, 22 Dec 2022 21:38:47 -0500 Subject: [PATCH 19/21] mark fixme item -- use char for delimiter --- src/uu/cut/src/cut.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/uu/cut/src/cut.rs b/src/uu/cut/src/cut.rs index 236169f49..2688cc858 100644 --- a/src/uu/cut/src/cut.rs +++ b/src/uu/cut/src/cut.rs @@ -120,7 +120,7 @@ struct Options { enum Delimiter { Whitespace, - String(String), // one char long, String because of UTF8 representation + String(String), // FIXME: use char? } struct FieldOptions { From 20761fe4224496cf38a926de707576deeacbe563 Mon Sep 17 00:00:00 2001 From: TechHara Date: Fri, 23 Dec 2022 02:18:16 -0500 Subject: [PATCH 20/21] simplify --- src/uu/cut/src/cut.rs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/uu/cut/src/cut.rs b/src/uu/cut/src/cut.rs index 2688cc858..fd414355f 100644 --- a/src/uu/cut/src/cut.rs +++ b/src/uu/cut/src/cut.rs @@ -362,10 +362,7 @@ fn cut_fields(reader: R, ranges: &[Range], opts: &FieldOptions) -> URes ranges, opts.only_delimited, newline_char, - match opts.out_delimiter { - Some(ref delim) => delim, - _ => "\t", - }, + opts.out_delimiter.as_deref().unwrap_or("\t") ), Delimiter::String(ref delimiter) => { if let Some(ref o_delim) = opts.out_delimiter { From 276b115c02f98cd7300726be1c4ff442d69935a1 Mon Sep 17 00:00:00 2001 From: TechHara Date: Fri, 23 Dec 2022 09:56:21 -0500 Subject: [PATCH 21/21] remove clippy allow; fmt --- src/uu/cut/src/cut.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/uu/cut/src/cut.rs b/src/uu/cut/src/cut.rs index fd414355f..a9a261882 100644 --- a/src/uu/cut/src/cut.rs +++ b/src/uu/cut/src/cut.rs @@ -353,7 +353,6 @@ fn cut_fields_whitespace( Ok(()) } -#[allow(clippy::cognitive_complexity)] fn cut_fields(reader: R, ranges: &[Range], opts: &FieldOptions) -> UResult<()> { let newline_char = if opts.zero_terminated { b'\0' } else { b'\n' }; match opts.delimiter { @@ -362,7 +361,7 @@ fn cut_fields(reader: R, ranges: &[Range], opts: &FieldOptions) -> URes ranges, opts.only_delimited, newline_char, - opts.out_delimiter.as_deref().unwrap_or("\t") + opts.out_delimiter.as_deref().unwrap_or("\t"), ), Delimiter::String(ref delimiter) => { if let Some(ref o_delim) = opts.out_delimiter {