add whitespace delimiter option

2025-09-14 11:07:59 +00:00 · 2022-12-10 21:47:37 -05:00 · 2022-12-10 21:47:37 -05:00 · f6a0abaee3
commit f6a0abaee3
parent 01153a701f
4 changed files with 253 additions and 25 deletions
--- a/src/uu/cut/src/cut.rs
+++ b/src/uu/cut/src/cut.rs
@ -16,14 +16,16 @@ use uucore::display::Quotable;
 use uucore::error::{FromIo, UResult, USimpleError};
 use self::searcher::Searcher;
 use self::whitespace_searcher::WhitespaceSearcher;
 use uucore::ranges::Range;
 use uucore::{format_usage, show, show_error, show_if_err};
 mod searcher;
 mod whitespace_searcher;
 static NAME: &str = "cut";
 static USAGE: &str =
-    "{} [-d] [-s] [-z] [--output-delimiter] ((-f|-b|-c) {{sequence}}) {{sourcefile}}+";
+    "{} [-d|-w] [-s] [-z] [--output-delimiter] ((-f|-b|-c) {{sequence}}) {{sourcefile}}+";
 static ABOUT: &str =
    "Prints specified byte or field columns from each line of stdin or the input files";
 static LONG_HELP: &str = "
@ -85,6 +87,10 @@ static LONG_HELP: &str = "
        --delimiter (-d) option. Setting the delimiter is optional.
        If not set, a default delimiter of Tab will be used.
        If the -w option is provided, fields will be separated by any number
        of  whitespace characters (Space and Tab). The output delimiter will
        be a Tab unless explicitly specified. Only one of -d or -w option can be specified.
    Optionally Filter based on delimiter
        If the --only-delimited (-s) flag is provided, only lines which
        contain the delimiter will be printed
@ -115,6 +121,7 @@ struct FieldOptions {
    delimiter: String, // one char long, String because of UTF8 representation
    out_delimiter: Option<String>,
    only_delimited: bool,
    whitespace_delimited: bool,
    zero_terminated: bool,
 }
@ -256,9 +263,98 @@ fn cut_fields_delimiter<R: Read>(
    Ok(())
 }
 /// Cuts fields out of every record of `reader`, where fields are separated by
 /// the whitespace runs reported by `WhitespaceSearcher`.
 ///
 /// * `ranges` - 1-based, inclusive field ranges to print
 ///   (NOTE(review): presumably sorted ascending and non-overlapping — confirm
 ///   against the caller).
 /// * `only_delimited` - when set, records without any whitespace run are dropped
 ///   instead of being echoed unchanged.
 /// * `newline_char` - record terminator for both input and output.
 /// * `out_delim` - written between consecutive selected fields.
 ///
 /// Any I/O error from reading or writing is reported as a `USimpleError`
 /// with exit code 1.
 fn cut_fields_whitespace<R: Read>(
    reader: R,
    ranges: &[Range],
    only_delimited: bool,
    newline_char: u8,
    out_delim: &str,
 ) -> UResult<()> {
    let mut input = BufReader::new(reader);
    let mut out = stdout_writer();
    let terminator = [newline_char];

    let outcome = input.for_byte_record_with_terminator(newline_char, |record| {
        // Evaluated lazily so the last byte is only inspected on the paths that need it.
        let ends_with_terminator = || record[record.len() - 1] == newline_char;

        let mut gaps = WhitespaceSearcher::new(record).peekable();

        // A record without any whitespace run is either echoed as-is or suppressed.
        if gaps.peek().is_none() {
            if only_delimited {
                return Ok(true);
            }
            out.write_all(record)?;
            if !ends_with_terminator() {
                out.write_all(&terminator)?;
            }
            return Ok(true);
        }

        // Number of the field that starts at `field_start`.
        let mut next_field = 1;
        // Byte offset at which the next field to be emitted begins.
        let mut field_start = 0;
        // Becomes true once the first selected field slot has been reached.
        let mut need_delim = false;

        for &Range { low, high } in ranges {
            // Skip the fields between the previous range and this one.
            let to_skip = low - next_field;
            if to_skip > 0 {
                match gaps.nth(to_skip - 1) {
                    Some((_, gap_end)) => field_start = gap_end,
                    None => break,
                }
            }

            for _ in 0..=high - low {
                if need_delim {
                    out.write_all(out_delim.as_bytes())?;
                }
                need_delim = true;

                if let Some((gap_start, gap_end)) = gaps.next() {
                    out.write_all(&record[field_start..gap_start])?;
                    field_start = gap_end;
                    next_field = high + 1;
                } else {
                    // Last field of the record: it runs to the end of the slice,
                    // which may already carry the terminator.
                    out.write_all(&record[field_start..])?;
                    if ends_with_terminator() {
                        return Ok(true);
                    }
                    break;
                }
            }
        }

        out.write_all(&terminator)?;
        Ok(true)
    });

    match outcome {
        Ok(_) => Ok(()),
        Err(e) => Err(USimpleError::new(1, e.to_string())),
    }
 }
 #[allow(clippy::cognitive_complexity)]
 fn cut_fields<R: Read>(reader: R, ranges: &[Range], opts: &FieldOptions) -> UResult<()> {
    let newline_char = if opts.zero_terminated { b'\0' } else { b'\n' };
    if opts.whitespace_delimited {
        return cut_fields_whitespace(
            reader,
            ranges,
            opts.only_delimited,
            newline_char,
            match opts.out_delimiter {
                Some(ref delim) => delim,
                _ => "\t",
            }
        );
    }
    if let Some(ref o_delim) = opts.out_delimiter {
        return cut_fields_delimiter(
            reader,
@ -387,6 +483,7 @@ mod options {
    pub const ZERO_TERMINATED: &str = "zero-terminated";
    pub const ONLY_DELIMITED: &str = "only-delimited";
    pub const OUTPUT_DELIMITER: &str = "output-delimiter";
    pub const WHITESPACE_DELIMITED: &str = "whitespace-delimited";
    pub const COMPLEMENT: &str = "complement";
    pub const FILE: &str = "file";
 }
@ -449,37 +546,44 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
                };
                let only_delimited = matches.get_flag(options::ONLY_DELIMITED);
                let whitespace_delimited = matches.get_flag(options::WHITESPACE_DELIMITED);
                let zero_terminated = matches.get_flag(options::ZERO_TERMINATED);
                match matches.get_one::<String>(options::DELIMITER).map(|s| s.as_str()) {
                    Some(mut delim) => {
-                        // GNU's `cut` supports `-d=` to set the delimiter to `=`.
+                        if whitespace_delimited {
-                        // Clap parsing is limited in this situation, see:
+                            Err("invalid input: Only one of --delimiter (-d) or -w option can be specified".into())
                        // https://github.com/uutils/coreutils/issues/2424#issuecomment-863825242
                        if delimiter_is_equal {
                            delim = "=";
                        } else if delim == "''" {
                            // treat `''` as empty delimiter
                            delim = "";
                        }
-                        if delim.chars().count() > 1 {
+                        else {
-                            Err("invalid input: The '--delimiter' ('-d') option expects empty or 1 character long, but was provided a value 2 characters or longer".into())
+                            // GNU's `cut` supports `-d=` to set the delimiter to `=`.
-                        } else {
+                            // Clap parsing is limited in this situation, see:
-                            let delim = if delim.is_empty() {
+                            // https://github.com/uutils/coreutils/issues/2424#issuecomment-863825242
-                                "\0".to_owned()
+                            if delimiter_is_equal {
                                delim = "=";
                            } else if delim == "''" {
                                // treat `''` as empty delimiter
                                delim = "";
                            }
                            if delim.chars().count() > 1 {
                                Err("invalid input: The '--delimiter' ('-d') option expects empty or 1 character long, but was provided a value 2 characters or longer".into())
                            } else {
-                                delim.to_owned()
+                                let delim = if delim.is_empty() {
-                            };
+                                    "\0".to_owned()
                                } else {
                                    delim.to_owned()
                                };
-                            Ok(Mode::Fields(
+                                Ok(Mode::Fields(
-                                ranges,
+                                    ranges,
-                                FieldOptions {
+                                    FieldOptions {
-                                    delimiter: delim,
+                                        delimiter: delim,
-                                    out_delimiter: out_delim,
+                                        out_delimiter: out_delim,
-                                    only_delimited,
+                                        only_delimited,
-                                    zero_terminated,
+                                        whitespace_delimited,
-                                },
+                                        zero_terminated,
-                            ))
+                                    },
                                ))
                            }
                        }
                    }
                    None => Ok(Mode::Fields(
@ -488,6 +592,7 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
                            delimiter: "\t".to_owned(),
                            out_delimiter: out_delim,
                            only_delimited,
                            whitespace_delimited,
                            zero_terminated,
                        },
                    )),
@ -508,6 +613,11 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
            {
                Err("invalid input: The '--delimiter' ('-d') option only usable if printing a sequence of fields".into())
            }
            Mode::Bytes(_, _) | Mode::Characters(_, _)
                // `-w` is an `ArgAction::SetTrue` flag, which always carries an
                // implicit default of `false`; `contains_id` would therefore be
                // true for every invocation. Read the flag's value instead.
                if matches.get_flag(options::WHITESPACE_DELIMITED) =>
            {
                Err("invalid input: The '-w' option only usable if printing a sequence of fields".into())
            }
            Mode::Bytes(_, _) | Mode::Characters(_, _)
                if matches.get_flag(options::ONLY_DELIMITED) =>
            {
@ -563,6 +673,13 @@ pub fn uu_app() -> Command {
                .help("specify the delimiter character that separates fields in the input source. Defaults to Tab.")
                .value_name("DELIM"),
        )
        .arg(
            Arg::new(options::WHITESPACE_DELIMITED)
                .short('w')
                .help("Use any number of whitespace (Space, Tab) to separate fields in the input source.")
                .value_name("WHITESPACE")
                .action(ArgAction::SetTrue),
        )
        .arg(
            Arg::new(options::FIELDS)
                .short('f')
--- a/src/uu/cut/src/whitespace_searcher.rs
+++ b/src/uu/cut/src/whitespace_searcher.rs
@ -0,0 +1,96 @@
 // This file is part of the uutils coreutils package.
 //
 // (c) Rolf Morel <rolfmorel@gmail.com>
 //
 // For the full copyright and license information, please view the LICENSE
 // file that was distributed with this source code.
 use memchr::memchr2;
 /// Cursor over a byte slice used to locate runs of spaces and tabs.
 pub struct WhitespaceSearcher<'a> {
    /// The part of the input that has not been scanned yet.
    haystack: &'a [u8],
    /// Offset of `haystack` relative to the start of the original input.
    position: usize,
 }

 impl<'a> WhitespaceSearcher<'a> {
    /// Creates a searcher positioned at the beginning of `haystack`.
    pub fn new(haystack: &'a [u8]) -> Self {
        Self {
            position: 0,
            haystack,
        }
    }
 }
 impl<'a> Iterator for WhitespaceSearcher<'a> {
    /// `(start, end)` byte offsets into the original input: `start` is the
    /// first byte of a run of spaces/tabs, `end` is the offset just past it.
    type Item = (usize, usize);

    /// Returns the next run of consecutive spaces and/or tabs, or `None`
    /// when the remaining input contains no space or tab.
    fn next(&mut self) -> Option<Self::Item> {
        // Offset of the first whitespace byte within the unscanned remainder.
        let match_idx = memchr2(b' ', b'\t', self.haystack)?;

        // Extend over every directly following space/tab so that the whole
        // run counts as one delimiter (at least one byte long).
        let run_len = self.haystack[match_idx..]
            .iter()
            .take_while(|&&byte| byte == b' ' || byte == b'\t')
            .count();
        let skip = match_idx + run_len;

        let match_pos = self.position + match_idx;
        self.haystack = &self.haystack[skip..];
        self.position += skip;
        Some((match_pos, self.position))
    }
 }
 #[cfg(test)]
 mod tests {
    use super::*;

    /// Runs the searcher over `line` and compares every yielded
    /// `(start, end)` pair against `expected`.
    fn assert_runs(line: &[u8], expected: &[(usize, usize)]) {
        let found: Vec<(usize, usize)> = WhitespaceSearcher::new(line).collect();
        assert_eq!(expected, found);
    }

    #[test]
    fn test_space() {
        assert_runs(b" . . ", &[(0, 1), (2, 3), (4, 5)]);
    }

    #[test]
    fn test_tab() {
        assert_runs(b"\t.\t.\t", &[(0, 1), (2, 3), (4, 5)]);
    }

    #[test]
    fn test_empty() {
        assert_runs(b"", &[]);
    }

    #[test]
    fn test_multispace_normal() {
        // Mixed runs of spaces and tabs each collapse into a single match.
        assert_runs(
            b"...  ... \t...\t ... \t ...",
            &[(3, 5), (8, 10), (13, 15), (18, 21)],
        );
    }

    #[test]
    fn test_multispace_begin() {
        assert_runs(b" \t\t...", &[(0, 3)]);
    }

    #[test]
    fn test_multispace_end() {
        assert_runs(b"...\t  ", &[(3, 6)]);
    }
 }
--- a/tests/by-util/test_cut.rs
+++ b/tests/by-util/test_cut.rs
@ -81,6 +81,16 @@ fn test_field_sequence() {
    }
 }
 #[test]
 fn test_whitespace_delimited() {
    // Only the short `-w` spelling is exercised here.
    let args = ["-w", "-f", COMPLEX_SEQUENCE.sequence, INPUT];
    new_ucmd!()
        .args(&args)
        .succeeds()
        .stdout_only_fixture("whitespace_delimited.expected");
 }
 #[test]
 fn test_specify_delimiter() {
    for param in ["-d", "--delimiter", "--del"] {
--- a/tests/fixtures/cut/whitespace_delimited.expected
+++ b/tests/fixtures/cut/whitespace_delimited.expected
@ -0,0 +1,5 @@
 foo:bar:baz:qux:quux
 one:two:three:four:five:six:seven
 alpha:beta:gamma:delta:epsilon:zeta:eta:theta:iota:kappa:lambda:mu
 the	quick	fox	over	the	dog
 sally	sells	down	the	seashore	are	the	seashells	sally	sells