mirror of
https://github.com/RGBCube/uutils-coreutils
synced 2025-07-28 11:37:44 +00:00
Merge pull request #4232 from TechHara/whitespace
cut: add whitespace option for separating fields
This commit is contained in:
commit
36f3507bed
5 changed files with 310 additions and 45 deletions
|
@ -33,3 +33,7 @@ We provide a simple implementation of `more`, which is not part of GNU
|
||||||
coreutils. We do not aim for full compatibility with the `more` utility from
|
coreutils. We do not aim for full compatibility with the `more` utility from
|
||||||
`util-linux`. Features from more modern pagers (like `less` and `bat`) are
|
`util-linux`. Features from more modern pagers (like `less` and `bat`) are
|
||||||
therefore welcomed.
|
therefore welcomed.
|
||||||
|
|
||||||
|
## `cut`
|
||||||
|
|
||||||
|
`cut` can separate fields by whitespace (Space and Tab) with the `-w` flag. This feature is adopted from [FreeBSD](https://www.freebsd.org/cgi/man.cgi?cut).
|
|
@ -16,14 +16,16 @@ use uucore::display::Quotable;
|
||||||
use uucore::error::{FromIo, UResult, USimpleError};
|
use uucore::error::{FromIo, UResult, USimpleError};
|
||||||
|
|
||||||
use self::searcher::Searcher;
|
use self::searcher::Searcher;
|
||||||
|
use self::whitespace_searcher::WhitespaceSearcher;
|
||||||
use uucore::ranges::Range;
|
use uucore::ranges::Range;
|
||||||
use uucore::{format_usage, show, show_error, show_if_err};
|
use uucore::{format_usage, show, show_error, show_if_err};
|
||||||
|
|
||||||
mod searcher;
|
mod searcher;
|
||||||
|
mod whitespace_searcher;
|
||||||
|
|
||||||
static NAME: &str = "cut";
|
static NAME: &str = "cut";
|
||||||
static USAGE: &str =
|
static USAGE: &str =
|
||||||
"{} [-d] [-s] [-z] [--output-delimiter] ((-f|-b|-c) {{sequence}}) {{sourcefile}}+";
|
"{} [-d|-w] [-s] [-z] [--output-delimiter] ((-f|-b|-c) {{sequence}}) {{sourcefile}}+";
|
||||||
static ABOUT: &str =
|
static ABOUT: &str =
|
||||||
"Prints specified byte or field columns from each line of stdin or the input files";
|
"Prints specified byte or field columns from each line of stdin or the input files";
|
||||||
static LONG_HELP: &str = "
|
static LONG_HELP: &str = "
|
||||||
|
@ -85,6 +87,11 @@ static LONG_HELP: &str = "
|
||||||
--delimiter (-d) option. Setting the delimiter is optional.
|
--delimiter (-d) option. Setting the delimiter is optional.
|
||||||
If not set, a default delimiter of Tab will be used.
|
If not set, a default delimiter of Tab will be used.
|
||||||
|
|
||||||
|
If the -w option is provided, fields will be separated by any number
|
||||||
|
of whitespace characters (Space and Tab). The output delimiter will
|
||||||
|
be a Tab unless explicitly specified. Only one of -d or -w option can be specified.
|
||||||
|
This is an extension adopted from FreeBSD.
|
||||||
|
|
||||||
Optionally Filter based on delimiter
|
Optionally Filter based on delimiter
|
||||||
If the --only-delimited (-s) flag is provided, only lines which
|
If the --only-delimited (-s) flag is provided, only lines which
|
||||||
contain the delimiter will be printed
|
contain the delimiter will be printed
|
||||||
|
@ -111,8 +118,13 @@ struct Options {
|
||||||
zero_terminated: bool,
|
zero_terminated: bool,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// How the input of a field-mode `cut` is split into fields.
enum Delimiter {
    /// Fields are separated by whitespace (the `-w` flag).
    Whitespace,
    /// Fields are separated by the given delimiter string.
    // FIXME: use char?
    String(String),
}
|
||||||
|
|
||||||
struct FieldOptions {
|
struct FieldOptions {
|
||||||
delimiter: String, // one char long, String because of UTF8 representation
|
delimiter: Delimiter,
|
||||||
out_delimiter: Option<String>,
|
out_delimiter: Option<String>,
|
||||||
only_delimited: bool,
|
only_delimited: bool,
|
||||||
zero_terminated: bool,
|
zero_terminated: bool,
|
||||||
|
@ -256,14 +268,107 @@ fn cut_fields_delimiter<R: Read>(
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
#[allow(clippy::cognitive_complexity)]
|
fn cut_fields_whitespace<R: Read>(
|
||||||
|
reader: R,
|
||||||
|
ranges: &[Range],
|
||||||
|
only_delimited: bool,
|
||||||
|
newline_char: u8,
|
||||||
|
out_delim: &str,
|
||||||
|
) -> UResult<()> {
|
||||||
|
let mut buf_in = BufReader::new(reader);
|
||||||
|
let mut out = stdout_writer();
|
||||||
|
|
||||||
|
let result = buf_in.for_byte_record_with_terminator(newline_char, |line| {
|
||||||
|
let mut fields_pos = 1;
|
||||||
|
let mut low_idx = 0;
|
||||||
|
let mut delim_search = WhitespaceSearcher::new(line).peekable();
|
||||||
|
let mut print_delim = false;
|
||||||
|
|
||||||
|
if delim_search.peek().is_none() {
|
||||||
|
if !only_delimited {
|
||||||
|
out.write_all(line)?;
|
||||||
|
if line[line.len() - 1] != newline_char {
|
||||||
|
out.write_all(&[newline_char])?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return Ok(true);
|
||||||
|
}
|
||||||
|
// The logic is identical to `cut_fields_delimiter` function above, which uses
|
||||||
|
// `Searcher` that iterates over and returns the first position of the delimiter character.
|
||||||
|
// The main difference is that `WhitespaceSearcher` returns a pair of the first and last
|
||||||
|
// delimiter character positions, since each delimiter sequence length can vary.
|
||||||
|
for &Range { low, high } in ranges {
|
||||||
|
if low - fields_pos > 0 {
|
||||||
|
// current field is not in the range, so jump to the field corresponding to the
|
||||||
|
// beginning of the range if any
|
||||||
|
low_idx = match delim_search.nth(low - fields_pos - 1) {
|
||||||
|
Some((_, last)) => last,
|
||||||
|
None => break,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
// at this point, current field is the first in the range
|
||||||
|
for _ in 0..=high - low {
|
||||||
|
// skip printing delimiter if this is the first matching field for this line
|
||||||
|
if print_delim {
|
||||||
|
out.write_all(out_delim.as_bytes())?;
|
||||||
|
} else {
|
||||||
|
print_delim = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
match delim_search.next() {
|
||||||
|
// print the current field up to the next whitespace
|
||||||
|
Some((first, last)) => {
|
||||||
|
let segment = &line[low_idx..first];
|
||||||
|
|
||||||
|
out.write_all(segment)?;
|
||||||
|
|
||||||
|
low_idx = last;
|
||||||
|
fields_pos = high + 1;
|
||||||
|
}
|
||||||
|
None => {
|
||||||
|
// this is the last field in the line, so print the rest
|
||||||
|
let segment = &line[low_idx..];
|
||||||
|
|
||||||
|
out.write_all(segment)?;
|
||||||
|
|
||||||
|
if line[line.len() - 1] == newline_char {
|
||||||
|
return Ok(true);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
out.write_all(&[newline_char])?;
|
||||||
|
Ok(true)
|
||||||
|
});
|
||||||
|
|
||||||
|
if let Err(e) = result {
|
||||||
|
return Err(USimpleError::new(1, e.to_string()));
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
fn cut_fields<R: Read>(reader: R, ranges: &[Range], opts: &FieldOptions) -> UResult<()> {
|
fn cut_fields<R: Read>(reader: R, ranges: &[Range], opts: &FieldOptions) -> UResult<()> {
|
||||||
let newline_char = if opts.zero_terminated { b'\0' } else { b'\n' };
|
let newline_char = if opts.zero_terminated { b'\0' } else { b'\n' };
|
||||||
|
match opts.delimiter {
|
||||||
|
Delimiter::Whitespace => cut_fields_whitespace(
|
||||||
|
reader,
|
||||||
|
ranges,
|
||||||
|
opts.only_delimited,
|
||||||
|
newline_char,
|
||||||
|
opts.out_delimiter.as_deref().unwrap_or("\t"),
|
||||||
|
),
|
||||||
|
Delimiter::String(ref delimiter) => {
|
||||||
if let Some(ref o_delim) = opts.out_delimiter {
|
if let Some(ref o_delim) = opts.out_delimiter {
|
||||||
return cut_fields_delimiter(
|
return cut_fields_delimiter(
|
||||||
reader,
|
reader,
|
||||||
ranges,
|
ranges,
|
||||||
&opts.delimiter,
|
delimiter,
|
||||||
opts.only_delimited,
|
opts.only_delimited,
|
||||||
newline_char,
|
newline_char,
|
||||||
o_delim,
|
o_delim,
|
||||||
|
@ -272,12 +377,12 @@ fn cut_fields<R: Read>(reader: R, ranges: &[Range], opts: &FieldOptions) -> URes
|
||||||
|
|
||||||
let mut buf_in = BufReader::new(reader);
|
let mut buf_in = BufReader::new(reader);
|
||||||
let mut out = stdout_writer();
|
let mut out = stdout_writer();
|
||||||
let delim_len = opts.delimiter.len();
|
let delim_len = delimiter.len();
|
||||||
|
|
||||||
let result = buf_in.for_byte_record_with_terminator(newline_char, |line| {
|
let result = buf_in.for_byte_record_with_terminator(newline_char, |line| {
|
||||||
let mut fields_pos = 1;
|
let mut fields_pos = 1;
|
||||||
let mut low_idx = 0;
|
let mut low_idx = 0;
|
||||||
let mut delim_search = Searcher::new(line, opts.delimiter.as_bytes()).peekable();
|
let mut delim_search = Searcher::new(line, delimiter.as_bytes()).peekable();
|
||||||
let mut print_delim = false;
|
let mut print_delim = false;
|
||||||
|
|
||||||
if delim_search.peek().is_none() {
|
if delim_search.peek().is_none() {
|
||||||
|
@ -336,6 +441,8 @@ fn cut_fields<R: Read>(reader: R, ranges: &[Range], opts: &FieldOptions) -> URes
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
fn cut_files(mut filenames: Vec<String>, mode: &Mode) {
|
fn cut_files(mut filenames: Vec<String>, mode: &Mode) {
|
||||||
let mut stdin_read = false;
|
let mut stdin_read = false;
|
||||||
|
@ -387,6 +494,7 @@ mod options {
|
||||||
pub const ZERO_TERMINATED: &str = "zero-terminated";
|
pub const ZERO_TERMINATED: &str = "zero-terminated";
|
||||||
pub const ONLY_DELIMITED: &str = "only-delimited";
|
pub const ONLY_DELIMITED: &str = "only-delimited";
|
||||||
pub const OUTPUT_DELIMITER: &str = "output-delimiter";
|
pub const OUTPUT_DELIMITER: &str = "output-delimiter";
|
||||||
|
pub const WHITESPACE_DELIMITED: &str = "whitespace-delimited";
|
||||||
pub const COMPLEMENT: &str = "complement";
|
pub const COMPLEMENT: &str = "complement";
|
||||||
pub const FILE: &str = "file";
|
pub const FILE: &str = "file";
|
||||||
}
|
}
|
||||||
|
@ -449,9 +557,13 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
|
||||||
};
|
};
|
||||||
|
|
||||||
let only_delimited = matches.get_flag(options::ONLY_DELIMITED);
|
let only_delimited = matches.get_flag(options::ONLY_DELIMITED);
|
||||||
|
let whitespace_delimited = matches.get_flag(options::WHITESPACE_DELIMITED);
|
||||||
let zero_terminated = matches.get_flag(options::ZERO_TERMINATED);
|
let zero_terminated = matches.get_flag(options::ZERO_TERMINATED);
|
||||||
|
|
||||||
match matches.get_one::<String>(options::DELIMITER).map(|s| s.as_str()) {
|
match matches.get_one::<String>(options::DELIMITER).map(|s| s.as_str()) {
|
||||||
|
Some(_) if whitespace_delimited => {
|
||||||
|
Err("invalid input: Only one of --delimiter (-d) or -w option can be specified".into())
|
||||||
|
}
|
||||||
Some(mut delim) => {
|
Some(mut delim) => {
|
||||||
// GNU's `cut` supports `-d=` to set the delimiter to `=`.
|
// GNU's `cut` supports `-d=` to set the delimiter to `=`.
|
||||||
// Clap parsing is limited in this situation, see:
|
// Clap parsing is limited in this situation, see:
|
||||||
|
@ -474,7 +586,7 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
|
||||||
Ok(Mode::Fields(
|
Ok(Mode::Fields(
|
||||||
ranges,
|
ranges,
|
||||||
FieldOptions {
|
FieldOptions {
|
||||||
delimiter: delim,
|
delimiter: Delimiter::String(delim),
|
||||||
out_delimiter: out_delim,
|
out_delimiter: out_delim,
|
||||||
only_delimited,
|
only_delimited,
|
||||||
zero_terminated,
|
zero_terminated,
|
||||||
|
@ -485,7 +597,10 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
|
||||||
None => Ok(Mode::Fields(
|
None => Ok(Mode::Fields(
|
||||||
ranges,
|
ranges,
|
||||||
FieldOptions {
|
FieldOptions {
|
||||||
delimiter: "\t".to_owned(),
|
delimiter: match whitespace_delimited {
|
||||||
|
true => Delimiter::Whitespace,
|
||||||
|
false => Delimiter::String("\t".to_owned()),
|
||||||
|
},
|
||||||
out_delimiter: out_delim,
|
out_delimiter: out_delim,
|
||||||
only_delimited,
|
only_delimited,
|
||||||
zero_terminated,
|
zero_terminated,
|
||||||
|
@ -508,6 +623,11 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
|
||||||
{
|
{
|
||||||
Err("invalid input: The '--delimiter' ('-d') option only usable if printing a sequence of fields".into())
|
Err("invalid input: The '--delimiter' ('-d') option only usable if printing a sequence of fields".into())
|
||||||
}
|
}
|
||||||
|
Mode::Bytes(_, _) | Mode::Characters(_, _)
|
||||||
|
if matches.get_flag(options::WHITESPACE_DELIMITED) =>
|
||||||
|
{
|
||||||
|
Err("invalid input: The '-w' option only usable if printing a sequence of fields".into())
|
||||||
|
}
|
||||||
Mode::Bytes(_, _) | Mode::Characters(_, _)
|
Mode::Bytes(_, _) | Mode::Characters(_, _)
|
||||||
if matches.get_flag(options::ONLY_DELIMITED) =>
|
if matches.get_flag(options::ONLY_DELIMITED) =>
|
||||||
{
|
{
|
||||||
|
@ -563,6 +683,13 @@ pub fn uu_app() -> Command {
|
||||||
.help("specify the delimiter character that separates fields in the input source. Defaults to Tab.")
|
.help("specify the delimiter character that separates fields in the input source. Defaults to Tab.")
|
||||||
.value_name("DELIM"),
|
.value_name("DELIM"),
|
||||||
)
|
)
|
||||||
|
.arg(
|
||||||
|
Arg::new(options::WHITESPACE_DELIMITED)
|
||||||
|
.short('w')
|
||||||
|
.help("Use any number of whitespace (Space, Tab) to separate fields in the input source (FreeBSD extension).")
|
||||||
|
.value_name("WHITESPACE")
|
||||||
|
.action(ArgAction::SetTrue),
|
||||||
|
)
|
||||||
.arg(
|
.arg(
|
||||||
Arg::new(options::FIELDS)
|
Arg::new(options::FIELDS)
|
||||||
.short('f')
|
.short('f')
|
||||||
|
|
97
src/uu/cut/src/whitespace_searcher.rs
Normal file
97
src/uu/cut/src/whitespace_searcher.rs
Normal file
|
@ -0,0 +1,97 @@
|
||||||
|
// This file is part of the uutils coreutils package.
|
||||||
|
//
|
||||||
|
// For the full copyright and license information, please view the LICENSE
|
||||||
|
// file that was distributed with this source code.
|
||||||
|
|
||||||
|
// spell-checker:ignore multispace
|
||||||
|
|
||||||
|
use memchr::memchr2;
|
||||||
|
|
||||||
|
pub struct WhitespaceSearcher<'a> {
|
||||||
|
haystack: &'a [u8],
|
||||||
|
position: usize,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a> WhitespaceSearcher<'a> {
|
||||||
|
pub fn new(haystack: &'a [u8]) -> WhitespaceSearcher<'a> {
|
||||||
|
WhitespaceSearcher {
|
||||||
|
haystack,
|
||||||
|
position: 0,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a> Iterator for WhitespaceSearcher<'a> {
|
||||||
|
type Item = (usize, usize);
|
||||||
|
|
||||||
|
// Iterate over sequences of consecutive whitespace (space and/or tab) characters.
|
||||||
|
// Returns (first, last) positions of each sequence, where `haystack[first..last]`
|
||||||
|
// corresponds to the delimiter.
|
||||||
|
fn next(&mut self) -> Option<Self::Item> {
|
||||||
|
if let Some(match_idx) = memchr2(b' ', b'\t', self.haystack) {
|
||||||
|
let mut skip = match_idx + 1;
|
||||||
|
while skip < self.haystack.len()
|
||||||
|
&& (self.haystack[skip] == b' ' || self.haystack[skip] == b'\t')
|
||||||
|
{
|
||||||
|
skip += 1;
|
||||||
|
}
|
||||||
|
let match_pos = self.position + match_idx;
|
||||||
|
self.haystack = &self.haystack[skip..];
|
||||||
|
self.position += skip;
|
||||||
|
Some((match_pos, self.position))
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {

    use super::*;

    /// Single spaces between and around fields each form their own one-byte run.
    #[test]
    fn test_space() {
        let iter = WhitespaceSearcher::new(" . . ".as_bytes());
        let items: Vec<(usize, usize)> = iter.collect();
        assert_eq!(vec![(0, 1), (2, 3), (4, 5)], items);
    }

    /// Tabs are treated exactly like spaces.
    #[test]
    fn test_tab() {
        let iter = WhitespaceSearcher::new("\t.\t.\t".as_bytes());
        let items: Vec<(usize, usize)> = iter.collect();
        assert_eq!(vec![(0, 1), (2, 3), (4, 5)], items);
    }

    /// An empty input yields no runs at all.
    #[test]
    fn test_empty() {
        let iter = WhitespaceSearcher::new("".as_bytes());
        let items: Vec<(usize, usize)> = iter.collect();
        assert_eq!(vec![] as Vec<(usize, usize)>, items);
    }

    /// Collects every run found in `line` and compares the spans with `expected`.
    fn test_multispace(line: &[u8], expected: &[(usize, usize)]) {
        let iter = WhitespaceSearcher::new(line);
        let items: Vec<(usize, usize)> = iter.collect();
        assert_eq!(expected, items);
    }

    /// Runs of mixed spaces and tabs between fields are reported as single spans.
    #[test]
    fn test_multispace_normal() {
        // The first gap is two spaces; they are written as `\x20` escapes so that the
        // byte count the expected offsets rely on cannot be lost to whitespace collapsing.
        test_multispace(
            "...\x20\x20... \t...\t ... \t ...".as_bytes(),
            &[(3, 5), (8, 10), (13, 15), (18, 21)],
        );
    }

    /// A run at the very start of the input begins at offset 0.
    #[test]
    fn test_multispace_begin() {
        test_multispace(" \t\t...".as_bytes(), &[(0, 3)]);
    }

    /// A run at the very end of the input extends to the input length.
    #[test]
    fn test_multispace_end() {
        // Tab followed by two spaces: three bytes of whitespace, hence the span (3, 6).
        test_multispace("...\t\x20\x20".as_bytes(), &[(3, 6)]);
    }
}
|
|
@ -81,6 +81,38 @@ fn test_field_sequence() {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// `-w -f <sequence>` on the shared input file must reproduce the recorded fixture.
#[test]
fn test_whitespace_delimited() {
    let args = ["-w", "-f", COMPLEX_SEQUENCE.sequence, INPUT];
    new_ucmd!()
        .args(&args)
        .succeeds()
        .stdout_only_fixture("whitespace_delimited.expected");
}
|
||||||
|
|
||||||
|
/// Combining `-w` with an explicit `-d` delimiter must fail with exit code 1.
#[test]
fn test_whitespace_with_explicit_delimiter() {
    let args = ["-w", "-f", COMPLEX_SEQUENCE.sequence, "-d:"];
    new_ucmd!().args(&args).fails().code_is(1);
}
|
||||||
|
|
||||||
|
/// `-w` together with byte mode (`-b`) must fail with exit code 1.
#[test]
fn test_whitespace_with_byte() {
    let args = ["-w", "-b", COMPLEX_SEQUENCE.sequence];
    new_ucmd!().args(&args).fails().code_is(1);
}
|
||||||
|
|
||||||
|
/// `-w` together with character mode (`-c`) must fail with exit code 1, even when `-w` comes last.
#[test]
fn test_whitespace_with_char() {
    let args = ["-c", COMPLEX_SEQUENCE.sequence, "-w"];
    new_ucmd!().args(&args).fails().code_is(1);
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_specify_delimiter() {
|
fn test_specify_delimiter() {
|
||||||
for param in ["-d", "--delimiter", "--del"] {
|
for param in ["-d", "--delimiter", "--del"] {
|
||||||
|
|
5
tests/fixtures/cut/whitespace_delimited.expected
vendored
Normal file
5
tests/fixtures/cut/whitespace_delimited.expected
vendored
Normal file
|
@ -0,0 +1,5 @@
|
||||||
|
foo:bar:baz:qux:quux
|
||||||
|
one:two:three:four:five:six:seven
|
||||||
|
alpha:beta:gamma:delta:epsilon:zeta:eta:theta:iota:kappa:lambda:mu
|
||||||
|
the quick fox over the dog
|
||||||
|
sally sells down the seashore are the seashells sally sells
|
Loading…
Add table
Add a link
Reference in a new issue