1
Fork 0
mirror of https://github.com/RGBCube/uutils-coreutils synced 2025-07-28 03:27:44 +00:00

add whitespace delimiter option

This commit is contained in:
TechHara 2022-12-10 21:47:37 -05:00
parent 01153a701f
commit f6a0abaee3
4 changed files with 253 additions and 25 deletions

View file

@ -16,14 +16,16 @@ use uucore::display::Quotable;
use uucore::error::{FromIo, UResult, USimpleError}; use uucore::error::{FromIo, UResult, USimpleError};
use self::searcher::Searcher; use self::searcher::Searcher;
use self::whitespace_searcher::WhitespaceSearcher;
use uucore::ranges::Range; use uucore::ranges::Range;
use uucore::{format_usage, show, show_error, show_if_err}; use uucore::{format_usage, show, show_error, show_if_err};
mod searcher; mod searcher;
mod whitespace_searcher;
static NAME: &str = "cut"; static NAME: &str = "cut";
static USAGE: &str = static USAGE: &str =
"{} [-d] [-s] [-z] [--output-delimiter] ((-f|-b|-c) {{sequence}}) {{sourcefile}}+"; "{} [-d|-w] [-s] [-z] [--output-delimiter] ((-f|-b|-c) {{sequence}}) {{sourcefile}}+";
static ABOUT: &str = static ABOUT: &str =
"Prints specified byte or field columns from each line of stdin or the input files"; "Prints specified byte or field columns from each line of stdin or the input files";
static LONG_HELP: &str = " static LONG_HELP: &str = "
@ -85,6 +87,10 @@ static LONG_HELP: &str = "
--delimiter (-d) option. Setting the delimiter is optional. --delimiter (-d) option. Setting the delimiter is optional.
If not set, a default delimiter of Tab will be used. If not set, a default delimiter of Tab will be used.
If the -w option is provided, fields will be separated by any number
of whitespace characters (Space and Tab). The output delimiter will
be a Tab unless explicitly specified. Only one of -d or -w option can be specified.
Optionally Filter based on delimiter Optionally Filter based on delimiter
If the --only-delimited (-s) flag is provided, only lines which If the --only-delimited (-s) flag is provided, only lines which
contain the delimiter will be printed contain the delimiter will be printed
@ -115,6 +121,7 @@ struct FieldOptions {
delimiter: String, // one char long, String because of UTF8 representation delimiter: String, // one char long, String because of UTF8 representation
out_delimiter: Option<String>, out_delimiter: Option<String>,
only_delimited: bool, only_delimited: bool,
whitespace_delimited: bool,
zero_terminated: bool, zero_terminated: bool,
} }
@ -256,9 +263,98 @@ fn cut_fields_delimiter<R: Read>(
Ok(()) Ok(())
} }
/// Prints the selected whitespace-separated fields of every record read from `reader`.
///
/// Records are split on `newline_char`; inside a record, fields are separated by the
/// whitespace runs reported by `WhitespaceSearcher`.
///
/// * `ranges` - 1-based, inclusive field ranges to print, visited in the given order.
/// * `only_delimited` - when `true`, records that contain no whitespace run are dropped
///   instead of being echoed unchanged.
/// * `newline_char` - record terminator, also written after each printed record.
/// * `out_delim` - text written between two printed fields.
///
/// Any I/O error is converted into a `USimpleError` with exit code 1.
fn cut_fields_whitespace<R: Read>(
    reader: R,
    ranges: &[Range],
    only_delimited: bool,
    newline_char: u8,
    out_delim: &str,
) -> UResult<()> {
    let mut input = BufReader::new(reader);
    let mut out = stdout_writer();

    input
        .for_byte_record_with_terminator(newline_char, |record| {
            let mut separators = WhitespaceSearcher::new(record).peekable();

            // No whitespace run at all: echo the record as-is (adding a terminator
            // if it lacks one) unless only delimited records were requested.
            if separators.peek().is_none() {
                if !only_delimited {
                    out.write_all(record)?;
                    if record[record.len() - 1] != newline_char {
                        out.write_all(&[newline_char])?;
                    }
                }
                return Ok(true);
            }

            // Number of the field that begins at byte offset `field_start`.
            let mut next_field = 1;
            let mut field_start = 0;
            // Becomes true after the first field so later ones get `out_delim` in front.
            let mut wrote_field = false;

            for &Range { low, high } in ranges {
                // Skip over the separators of the fields lying before this range.
                let to_skip = low - next_field;
                if to_skip > 0 {
                    field_start = match separators.nth(to_skip - 1) {
                        Some((_, run_end)) => run_end,
                        None => break,
                    };
                }

                for _ in 0..=high - low {
                    if wrote_field {
                        out.write_all(out_delim.as_bytes())?;
                    }
                    wrote_field = true;

                    if let Some((run_start, run_end)) = separators.next() {
                        out.write_all(&record[field_start..run_start])?;
                        field_start = run_end;
                        next_field = high + 1;
                    } else {
                        // Last field: emit the rest of the record. That tail already
                        // carries the terminator when the record has one, so stop here.
                        out.write_all(&record[field_start..])?;
                        if record[record.len() - 1] == newline_char {
                            return Ok(true);
                        }
                        break;
                    }
                }
            }

            out.write_all(&[newline_char])?;
            Ok(true)
        })
        .map_err(|e| USimpleError::new(1, e.to_string()))
}
#[allow(clippy::cognitive_complexity)] #[allow(clippy::cognitive_complexity)]
fn cut_fields<R: Read>(reader: R, ranges: &[Range], opts: &FieldOptions) -> UResult<()> { fn cut_fields<R: Read>(reader: R, ranges: &[Range], opts: &FieldOptions) -> UResult<()> {
let newline_char = if opts.zero_terminated { b'\0' } else { b'\n' }; let newline_char = if opts.zero_terminated { b'\0' } else { b'\n' };
if opts.whitespace_delimited {
return cut_fields_whitespace(
reader,
ranges,
opts.only_delimited,
newline_char,
match opts.out_delimiter {
Some(ref delim) => delim,
_ => "\t",
}
);
}
if let Some(ref o_delim) = opts.out_delimiter { if let Some(ref o_delim) = opts.out_delimiter {
return cut_fields_delimiter( return cut_fields_delimiter(
reader, reader,
@ -387,6 +483,7 @@ mod options {
pub const ZERO_TERMINATED: &str = "zero-terminated"; pub const ZERO_TERMINATED: &str = "zero-terminated";
pub const ONLY_DELIMITED: &str = "only-delimited"; pub const ONLY_DELIMITED: &str = "only-delimited";
pub const OUTPUT_DELIMITER: &str = "output-delimiter"; pub const OUTPUT_DELIMITER: &str = "output-delimiter";
pub const WHITESPACE_DELIMITED: &str = "whitespace-delimited";
pub const COMPLEMENT: &str = "complement"; pub const COMPLEMENT: &str = "complement";
pub const FILE: &str = "file"; pub const FILE: &str = "file";
} }
@ -449,37 +546,44 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
}; };
let only_delimited = matches.get_flag(options::ONLY_DELIMITED); let only_delimited = matches.get_flag(options::ONLY_DELIMITED);
let whitespace_delimited = matches.get_flag(options::WHITESPACE_DELIMITED);
let zero_terminated = matches.get_flag(options::ZERO_TERMINATED); let zero_terminated = matches.get_flag(options::ZERO_TERMINATED);
match matches.get_one::<String>(options::DELIMITER).map(|s| s.as_str()) { match matches.get_one::<String>(options::DELIMITER).map(|s| s.as_str()) {
Some(mut delim) => { Some(mut delim) => {
// GNU's `cut` supports `-d=` to set the delimiter to `=`. if whitespace_delimited {
// Clap parsing is limited in this situation, see: Err("invalid input: Only one of --delimiter (-d) or -w option can be specified".into())
// https://github.com/uutils/coreutils/issues/2424#issuecomment-863825242
if delimiter_is_equal {
delim = "=";
} else if delim == "''" {
// treat `''` as empty delimiter
delim = "";
} }
if delim.chars().count() > 1 { else {
Err("invalid input: The '--delimiter' ('-d') option expects empty or 1 character long, but was provided a value 2 characters or longer".into()) // GNU's `cut` supports `-d=` to set the delimiter to `=`.
} else { // Clap parsing is limited in this situation, see:
let delim = if delim.is_empty() { // https://github.com/uutils/coreutils/issues/2424#issuecomment-863825242
"\0".to_owned() if delimiter_is_equal {
delim = "=";
} else if delim == "''" {
// treat `''` as empty delimiter
delim = "";
}
if delim.chars().count() > 1 {
Err("invalid input: The '--delimiter' ('-d') option expects empty or 1 character long, but was provided a value 2 characters or longer".into())
} else { } else {
delim.to_owned() let delim = if delim.is_empty() {
}; "\0".to_owned()
} else {
delim.to_owned()
};
Ok(Mode::Fields( Ok(Mode::Fields(
ranges, ranges,
FieldOptions { FieldOptions {
delimiter: delim, delimiter: delim,
out_delimiter: out_delim, out_delimiter: out_delim,
only_delimited, only_delimited,
zero_terminated, whitespace_delimited,
}, zero_terminated,
)) },
))
}
} }
} }
None => Ok(Mode::Fields( None => Ok(Mode::Fields(
@ -488,6 +592,7 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
delimiter: "\t".to_owned(), delimiter: "\t".to_owned(),
out_delimiter: out_delim, out_delimiter: out_delim,
only_delimited, only_delimited,
whitespace_delimited,
zero_terminated, zero_terminated,
}, },
)), )),
@ -508,6 +613,11 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
{ {
Err("invalid input: The '--delimiter' ('-d') option only usable if printing a sequence of fields".into()) Err("invalid input: The '--delimiter' ('-d') option only usable if printing a sequence of fields".into())
} }
// `-w` is declared with `ArgAction::SetTrue`, so clap always records a value for
// it (defaulting to `false`). `contains_id` would therefore match even when `-w`
// was never passed; `get_flag` reads the actual boolean instead.
Mode::Bytes(_, _) | Mode::Characters(_, _)
    if matches.get_flag(options::WHITESPACE_DELIMITED) =>
{
    Err("invalid input: The '-w' option only usable if printing a sequence of fields".into())
}
Mode::Bytes(_, _) | Mode::Characters(_, _) Mode::Bytes(_, _) | Mode::Characters(_, _)
if matches.get_flag(options::ONLY_DELIMITED) => if matches.get_flag(options::ONLY_DELIMITED) =>
{ {
@ -563,6 +673,13 @@ pub fn uu_app() -> Command {
.help("specify the delimiter character that separates fields in the input source. Defaults to Tab.") .help("specify the delimiter character that separates fields in the input source. Defaults to Tab.")
.value_name("DELIM"), .value_name("DELIM"),
) )
.arg(
Arg::new(options::WHITESPACE_DELIMITED)
.short('w')
.help("Use any number of whitespace (Space, Tab) to separate fields in the input source.")
.value_name("WHITESPACE")
.action(ArgAction::SetTrue),
)
.arg( .arg(
Arg::new(options::FIELDS) Arg::new(options::FIELDS)
.short('f') .short('f')

View file

@ -0,0 +1,96 @@
// This file is part of the uutils coreutils package.
//
// (c) Rolf Morel <rolfmorel@gmail.com>
//
// For the full copyright and license information, please view the LICENSE
// file that was distributed with this source code.
use memchr::memchr2;
/// Iterator over the runs of whitespace (space or tab bytes) inside a byte slice.
///
/// Every item is a pair `(start, end)` of byte offsets into the slice originally
/// passed to [`WhitespaceSearcher::new`]: `start` is the index of the first
/// whitespace byte of the run and `end` is the index just past its last one.
pub struct WhitespaceSearcher<'a> {
    /// Part of the input that has not been searched yet.
    haystack: &'a [u8],
    /// Offset of `haystack` within the original input.
    position: usize,
}

impl<'a> WhitespaceSearcher<'a> {
    /// Creates a searcher positioned at the beginning of `haystack`.
    pub fn new(haystack: &'a [u8]) -> WhitespaceSearcher<'a> {
        WhitespaceSearcher {
            haystack,
            position: 0,
        }
    }
}

impl<'a> Iterator for WhitespaceSearcher<'a> {
    type Item = (usize, usize);

    /// Returns the next whitespace run, or `None` once no space or tab is left.
    fn next(&mut self) -> Option<Self::Item> {
        let match_idx = memchr2(b' ', b'\t', self.haystack)?;

        // The byte at `match_idx` is whitespace, so the run is at least one byte long.
        let run_len = self.haystack[match_idx..]
            .iter()
            .take_while(|&&byte| byte == b' ' || byte == b'\t')
            .count();
        let skip = match_idx + run_len;

        let match_pos = self.position + match_idx;
        self.haystack = &self.haystack[skip..];
        self.position += skip;
        Some((match_pos, self.position))
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that searching `line` yields exactly the `expected` whitespace runs.
    fn test_multispace(line: &[u8], expected: &[(usize, usize)]) {
        let items: Vec<(usize, usize)> = WhitespaceSearcher::new(line).collect();
        assert_eq!(items, expected);
    }

    #[test]
    fn test_space() {
        test_multispace(" . . ".as_bytes(), &[(0, 1), (2, 3), (4, 5)]);
    }

    #[test]
    fn test_tab() {
        test_multispace("\t.\t.\t".as_bytes(), &[(0, 1), (2, 3), (4, 5)]);
    }

    #[test]
    fn test_empty() {
        test_multispace("".as_bytes(), &[]);
    }

    #[test]
    fn test_multispace_normal() {
        // Runs, in order: two spaces, space+tab, tab+space, space+tab+space.
        test_multispace(
            "...  ... \t...\t ... \t ...".as_bytes(),
            &[(3, 5), (8, 10), (13, 15), (18, 21)],
        );
    }

    #[test]
    fn test_multispace_begin() {
        // Leading run: space, tab, tab.
        test_multispace(" \t\t...".as_bytes(), &[(0, 3)]);
    }

    #[test]
    fn test_multispace_end() {
        // Trailing run: tab followed by two spaces.
        test_multispace("...\t  ".as_bytes(), &[(3, 6)]);
    }
}

View file

@ -81,6 +81,16 @@ fn test_field_sequence() {
} }
} }
/// Runs `cut -w -f <COMPLEX_SEQUENCE> <INPUT>` and compares stdout with the
/// `whitespace_delimited.expected` fixture.
#[test]
fn test_whitespace_delimited() {
    let args = ["-w", "-f", COMPLEX_SEQUENCE.sequence, INPUT];
    new_ucmd!()
        .args(&args)
        .succeeds()
        .stdout_only_fixture("whitespace_delimited.expected");
}
#[test] #[test]
fn test_specify_delimiter() { fn test_specify_delimiter() {
for param in ["-d", "--delimiter", "--del"] { for param in ["-d", "--delimiter", "--del"] {

View file

@ -0,0 +1,5 @@
foo:bar:baz:qux:quux
one:two:three:four:five:six:seven
alpha:beta:gamma:delta:epsilon:zeta:eta:theta:iota:kappa:lambda:mu
the quick fox over the dog
sally sells down the seashore are the seashells sally sells