mirror of
https://github.com/RGBCube/uutils-coreutils
synced 2025-07-28 19:47:45 +00:00
cut: implement zero-terminated option
No changes were made to char mode, because removal of the specialized char-mode behavior is pending.
This commit is contained in:
parent
3618d9df94
commit
5aaff02195
3 changed files with 50 additions and 27 deletions
|
@ -33,12 +33,14 @@ pub mod Bytes {
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
pub struct ByteReader<R> where R: Read {
|
pub struct ByteReader<R> where R: Read {
|
||||||
inner: BufReader<R>,
|
inner: BufReader<R>,
|
||||||
|
newline_char: u8,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<R: Read> ByteReader<R> {
|
impl<R: Read> ByteReader<R> {
|
||||||
pub fn new(read: R) -> ByteReader<R> {
|
pub fn new(read: R, newline_char: u8) -> ByteReader<R> {
|
||||||
ByteReader {
|
ByteReader {
|
||||||
inner: BufReader::with_capacity(4096, read),
|
inner: BufReader::with_capacity(4096, read),
|
||||||
|
newline_char: newline_char
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -63,6 +65,7 @@ impl<R: Read> ByteReader<R> {
|
||||||
pub fn consume_line(&mut self) -> usize {
|
pub fn consume_line(&mut self) -> usize {
|
||||||
let mut bytes_consumed = 0;
|
let mut bytes_consumed = 0;
|
||||||
let mut consume_val;
|
let mut consume_val;
|
||||||
|
let newline_char = self.newline_char;
|
||||||
|
|
||||||
loop {
|
loop {
|
||||||
{ // need filled_buf to go out of scope
|
{ // need filled_buf to go out of scope
|
||||||
|
@ -77,7 +80,7 @@ impl<R: Read> ByteReader<R> {
|
||||||
Err(e) => crash!(1, "read error: {}", e),
|
Err(e) => crash!(1, "read error: {}", e),
|
||||||
};
|
};
|
||||||
|
|
||||||
if let Some(idx) = filled_buf.iter().position(|byte| *byte == b'\n') {
|
if let Some(idx) = filled_buf.iter().position(|byte| *byte == newline_char) {
|
||||||
consume_val = idx + 1;
|
consume_val = idx + 1;
|
||||||
bytes_consumed += consume_val;
|
bytes_consumed += consume_val;
|
||||||
break;
|
break;
|
||||||
|
@ -105,6 +108,7 @@ impl<R: Read> self::Bytes::Select for ByteReader<R> {
|
||||||
|
|
||||||
use self::Bytes::Selected::*;
|
use self::Bytes::Selected::*;
|
||||||
|
|
||||||
|
let newline_char = self.newline_char;
|
||||||
let (res, consume_val) = {
|
let (res, consume_val) = {
|
||||||
let buffer = match self.fill_buf() {
|
let buffer = match self.fill_buf() {
|
||||||
Err(e) => crash!(1, "read error: {}", e),
|
Err(e) => crash!(1, "read error: {}", e),
|
||||||
|
@ -118,13 +122,13 @@ impl<R: Read> self::Bytes::Select for ByteReader<R> {
|
||||||
// segments check if the byte after bytes is a newline
|
// segments check if the byte after bytes is a newline
|
||||||
let buf_slice = &buffer[0..bytes + 1];
|
let buf_slice = &buffer[0..bytes + 1];
|
||||||
|
|
||||||
match buf_slice.iter().position(|byte| *byte == b'\n') {
|
match buf_slice.iter().position(|byte| *byte == newline_char) {
|
||||||
Some(idx) => (SRes::Newl, idx+1),
|
Some(idx) => (SRes::Newl, idx+1),
|
||||||
None => (SRes::Comp, bytes),
|
None => (SRes::Comp, bytes),
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
_ => {
|
_ => {
|
||||||
match buffer.iter().position(|byte| *byte == b'\n') {
|
match buffer.iter().position(|byte| *byte == newline_char) {
|
||||||
Some(idx) => (SRes::Newl, idx+1),
|
Some(idx) => (SRes::Newl, idx+1),
|
||||||
None => (SRes::Part, buffer.len()),
|
None => (SRes::Part, buffer.len()),
|
||||||
}
|
}
|
||||||
|
|
|
@ -30,12 +30,14 @@ static VERSION: &'static str = env!("CARGO_PKG_VERSION");
|
||||||
|
|
||||||
struct Options {
|
struct Options {
|
||||||
out_delim: Option<String>,
|
out_delim: Option<String>,
|
||||||
|
zero_terminated: bool,
|
||||||
}
|
}
|
||||||
|
|
||||||
struct FieldOptions {
|
struct FieldOptions {
|
||||||
delimiter: String, // one char long, String because of UTF8 representation
|
delimiter: String, // one char long, String because of UTF8 representation
|
||||||
out_delimeter: Option<String>,
|
out_delimeter: Option<String>,
|
||||||
only_delimited: bool,
|
only_delimited: bool,
|
||||||
|
zero_terminated: bool,
|
||||||
}
|
}
|
||||||
|
|
||||||
enum Mode {
|
enum Mode {
|
||||||
|
@ -56,7 +58,9 @@ fn cut_bytes<R: Read>(reader: R, ranges: &[Range], opts: &Options) -> i32 {
|
||||||
use buffer::Bytes::Select;
|
use buffer::Bytes::Select;
|
||||||
use buffer::Bytes::Selected::*;
|
use buffer::Bytes::Selected::*;
|
||||||
|
|
||||||
let mut buf_read = buffer::ByteReader::new(reader);
|
let newline_char =
|
||||||
|
if opts.zero_terminated { b'\0' } else { b'\n' };
|
||||||
|
let mut buf_read = buffer::ByteReader::new(reader, newline_char);
|
||||||
let mut out = stdout();
|
let mut out = stdout();
|
||||||
|
|
||||||
'newline: loop {
|
'newline: loop {
|
||||||
|
@ -69,7 +73,7 @@ fn cut_bytes<R: Read>(reader: R, ranges: &[Range], opts: &Options) -> i32 {
|
||||||
loop {
|
loop {
|
||||||
match buf_read.select(low - cur_pos, None::<&mut Stdout>) {
|
match buf_read.select(low - cur_pos, None::<&mut Stdout>) {
|
||||||
NewlineFound => {
|
NewlineFound => {
|
||||||
pipe_crash_if_err!(1, out.write_all(&[b'\n']));
|
pipe_crash_if_err!(1, out.write_all(&[newline_char]));
|
||||||
continue 'newline
|
continue 'newline
|
||||||
}
|
}
|
||||||
Complete(len) => {
|
Complete(len) => {
|
||||||
|
@ -79,7 +83,7 @@ fn cut_bytes<R: Read>(reader: R, ranges: &[Range], opts: &Options) -> i32 {
|
||||||
Partial(len) => cur_pos += len,
|
Partial(len) => cur_pos += len,
|
||||||
EndOfFile => {
|
EndOfFile => {
|
||||||
if orig_pos != cur_pos {
|
if orig_pos != cur_pos {
|
||||||
pipe_crash_if_err!(1, out.write_all(&[b'\n']));
|
pipe_crash_if_err!(1, out.write_all(&[newline_char]));
|
||||||
}
|
}
|
||||||
|
|
||||||
break 'newline
|
break 'newline
|
||||||
|
@ -108,7 +112,7 @@ fn cut_bytes<R: Read>(reader: R, ranges: &[Range], opts: &Options) -> i32 {
|
||||||
}
|
}
|
||||||
EndOfFile => {
|
EndOfFile => {
|
||||||
if cur_pos != low || low == high {
|
if cur_pos != low || low == high {
|
||||||
pipe_crash_if_err!(1, out.write_all(&[b'\n']));
|
pipe_crash_if_err!(1, out.write_all(&[newline_char]));
|
||||||
}
|
}
|
||||||
|
|
||||||
break 'newline
|
break 'newline
|
||||||
|
@ -118,7 +122,7 @@ fn cut_bytes<R: Read>(reader: R, ranges: &[Range], opts: &Options) -> i32 {
|
||||||
}
|
}
|
||||||
|
|
||||||
buf_read.consume_line();
|
buf_read.consume_line();
|
||||||
pipe_crash_if_err!(1, out.write_all(&[b'\n']));
|
pipe_crash_if_err!(1, out.write_all(&[newline_char]));
|
||||||
}
|
}
|
||||||
|
|
||||||
0
|
0
|
||||||
|
@ -194,14 +198,14 @@ fn cut_characters<R: Read>(reader: R, ranges: &[Range], opts: &Options) -> i32 {
|
||||||
0
|
0
|
||||||
}
|
}
|
||||||
|
|
||||||
fn cut_fields_delimiter<R: Read>(reader: R, ranges: &[Range], delim: &str, only_delimited: bool, out_delim: &str) -> i32 {
|
fn cut_fields_delimiter<R: Read>(reader: R, ranges: &[Range], delim: &str, only_delimited: bool, newline_char: u8, out_delim: &str) -> i32 {
|
||||||
let mut buf_in = BufReader::new(reader);
|
let mut buf_in = BufReader::new(reader);
|
||||||
let mut out = stdout();
|
let mut out = stdout();
|
||||||
let mut buffer = Vec::new();
|
let mut buffer = Vec::new();
|
||||||
|
|
||||||
'newline: loop {
|
'newline: loop {
|
||||||
buffer.clear();
|
buffer.clear();
|
||||||
match buf_in.read_until(b'\n', &mut buffer) {
|
match buf_in.read_until(newline_char, &mut buffer) {
|
||||||
Ok(n) if n == 0 => break,
|
Ok(n) if n == 0 => break,
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
if buffer.is_empty() {
|
if buffer.is_empty() {
|
||||||
|
@ -220,8 +224,8 @@ fn cut_fields_delimiter<R: Read>(reader: R, ranges: &[Range], delim: &str, only_
|
||||||
if delim_search.peek().is_none() {
|
if delim_search.peek().is_none() {
|
||||||
if ! only_delimited {
|
if ! only_delimited {
|
||||||
pipe_crash_if_err!(1, out.write_all(line));
|
pipe_crash_if_err!(1, out.write_all(line));
|
||||||
if line[line.len() - 1] != b'\n' {
|
if line[line.len() - 1] != newline_char {
|
||||||
pipe_crash_if_err!(1, out.write_all(&[b'\n']));
|
pipe_crash_if_err!(1, out.write_all(&[newline_char]));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -257,7 +261,7 @@ fn cut_fields_delimiter<R: Read>(reader: R, ranges: &[Range], delim: &str, only_
|
||||||
|
|
||||||
pipe_crash_if_err!(1, out.write_all(segment));
|
pipe_crash_if_err!(1, out.write_all(segment));
|
||||||
|
|
||||||
if line[line.len() - 1] == b'\n' {
|
if line[line.len() - 1] == newline_char {
|
||||||
continue 'newline
|
continue 'newline
|
||||||
}
|
}
|
||||||
break
|
break
|
||||||
|
@ -266,17 +270,19 @@ fn cut_fields_delimiter<R: Read>(reader: R, ranges: &[Range], delim: &str, only_
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pipe_crash_if_err!(1, out.write_all(&[b'\n']));
|
pipe_crash_if_err!(1, out.write_all(&[newline_char]));
|
||||||
}
|
}
|
||||||
|
|
||||||
0
|
0
|
||||||
}
|
}
|
||||||
|
|
||||||
fn cut_fields<R: Read>(reader: R, ranges: &[Range], opts: &FieldOptions) -> i32 {
|
fn cut_fields<R: Read>(reader: R, ranges: &[Range], opts: &FieldOptions) -> i32 {
|
||||||
|
let newline_char =
|
||||||
|
if opts.zero_terminated { b'\0' } else { b'\n' };
|
||||||
match opts.out_delimeter {
|
match opts.out_delimeter {
|
||||||
Some(ref o_delim) => {
|
Some(ref o_delim) => {
|
||||||
return cut_fields_delimiter(reader, ranges, &opts.delimiter,
|
return cut_fields_delimiter(reader, ranges, &opts.delimiter,
|
||||||
opts.only_delimited, o_delim);
|
opts.only_delimited, newline_char, o_delim);
|
||||||
}
|
}
|
||||||
None => ()
|
None => ()
|
||||||
}
|
}
|
||||||
|
@ -287,7 +293,7 @@ fn cut_fields<R: Read>(reader: R, ranges: &[Range], opts: &FieldOptions) -> i32
|
||||||
|
|
||||||
'newline: loop {
|
'newline: loop {
|
||||||
buffer.clear();
|
buffer.clear();
|
||||||
match buf_in.read_until(b'\n', &mut buffer) {
|
match buf_in.read_until(newline_char, &mut buffer) {
|
||||||
Ok(n) if n == 0 => break,
|
Ok(n) if n == 0 => break,
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
if buffer.is_empty() {
|
if buffer.is_empty() {
|
||||||
|
@ -306,8 +312,8 @@ fn cut_fields<R: Read>(reader: R, ranges: &[Range], opts: &FieldOptions) -> i32
|
||||||
if delim_search.peek().is_none() {
|
if delim_search.peek().is_none() {
|
||||||
if ! opts.only_delimited {
|
if ! opts.only_delimited {
|
||||||
pipe_crash_if_err!(1, out.write_all(line));
|
pipe_crash_if_err!(1, out.write_all(line));
|
||||||
if line[line.len() - 1] != b'\n' {
|
if line[line.len() - 1] != newline_char {
|
||||||
pipe_crash_if_err!(1, out.write_all(&[b'\n']));
|
pipe_crash_if_err!(1, out.write_all(&[newline_char]));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -343,7 +349,7 @@ fn cut_fields<R: Read>(reader: R, ranges: &[Range], opts: &FieldOptions) -> i32
|
||||||
|
|
||||||
pipe_crash_if_err!(1, out.write_all(segment));
|
pipe_crash_if_err!(1, out.write_all(segment));
|
||||||
|
|
||||||
if line[line.len() - 1] == b'\n' {
|
if line[line.len() - 1] == newline_char {
|
||||||
continue 'newline
|
continue 'newline
|
||||||
}
|
}
|
||||||
break
|
break
|
||||||
|
@ -351,7 +357,7 @@ fn cut_fields<R: Read>(reader: R, ranges: &[Range], opts: &FieldOptions) -> i32
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pipe_crash_if_err!(1, out.write_all(&[b'\n']));
|
pipe_crash_if_err!(1, out.write_all(&[newline_char]));
|
||||||
}
|
}
|
||||||
|
|
||||||
0
|
0
|
||||||
|
@ -411,6 +417,7 @@ pub fn uumain(args: Vec<String>) -> i32 {
|
||||||
opts.optflag("n", "", "legacy option - has no effect.");
|
opts.optflag("n", "", "legacy option - has no effect.");
|
||||||
opts.optflag("", "complement", "invert the filter - instead of displaying only the filtered columns, display all but those columns");
|
opts.optflag("", "complement", "invert the filter - instead of displaying only the filtered columns, display all but those columns");
|
||||||
opts.optflag("s", "only-delimited", "in field mode, only print lines which contain the delimiter");
|
opts.optflag("s", "only-delimited", "in field mode, only print lines which contain the delimiter");
|
||||||
|
opts.optflag("z", "zero-terminated", "instead of filtering columns based on line, filter columns based on \\0 (NULL character)");
|
||||||
opts.optopt("", "output-delimiter", "in field mode, replace the delimiter in output lines with this option's argument", "new delimiter");
|
opts.optopt("", "output-delimiter", "in field mode, replace the delimiter in output lines with this option's argument", "new delimiter");
|
||||||
let usage = opts.usage("Prints specified byte or field columns from each line of stdin or the input files");
|
let usage = opts.usage("Prints specified byte or field columns from each line of stdin or the input files");
|
||||||
opts.help(format!("
|
opts.help(format!("
|
||||||
|
@ -489,6 +496,17 @@ pub fn uumain(args: Vec<String>) -> i32 {
|
||||||
it will replace the delimiter character in each line printed. This is
|
it will replace the delimiter character in each line printed. This is
|
||||||
useful for transforming tabular data - e.g. to convert a CSV to a
|
useful for transforming tabular data - e.g. to convert a CSV to a
|
||||||
TSV (tab-separated file)
|
TSV (tab-separated file)
|
||||||
|
|
||||||
|
Line endings
|
||||||
|
|
||||||
|
When the --zero-terminated (-z) option is used, cut sees \\0 (null) as the
|
||||||
|
'line ending' character (both for the purposes of reading lines and
|
||||||
|
separating printed lines) instead of \\n (newline). This is useful for
|
||||||
|
tabular data where some of the cells may contain newlines
|
||||||
|
|
||||||
|
echo 'ab\\0cd' | cut -z -c 1
|
||||||
|
will result in 'a\\0c\\0'
|
||||||
|
|
||||||
", NAME, VERSION, usage));
|
", NAME, VERSION, usage));
|
||||||
let matches = opts.parse(args);
|
let matches = opts.parse(args);
|
||||||
|
|
||||||
|
@ -499,11 +517,11 @@ pub fn uumain(args: Vec<String>) -> i32 {
|
||||||
matches.opt_str("fields")) {
|
matches.opt_str("fields")) {
|
||||||
(Some(byte_ranges), None, None) => {
|
(Some(byte_ranges), None, None) => {
|
||||||
list_to_ranges(&byte_ranges[..], complement)
|
list_to_ranges(&byte_ranges[..], complement)
|
||||||
.map(|ranges| Mode::Bytes(ranges, Options { out_delim: matches.opt_str("output-delimiter") }))
|
.map(|ranges| Mode::Bytes(ranges, Options { out_delim: matches.opt_str("output-delimiter"), zero_terminated : matches.opt_present("zero-terminated") }))
|
||||||
}
|
}
|
||||||
(None, Some(char_ranges), None) => {
|
(None, Some(char_ranges), None) => {
|
||||||
list_to_ranges(&char_ranges[..], complement)
|
list_to_ranges(&char_ranges[..], complement)
|
||||||
.map(|ranges| Mode::Characters(ranges, Options { out_delim: matches.opt_str("output-delimiter") }))
|
.map(|ranges| Mode::Characters(ranges, Options { out_delim: matches.opt_str("output-delimiter"), zero_terminated : matches.opt_present("zero-terminated") }))
|
||||||
}
|
}
|
||||||
(None, None, Some(field_ranges)) => {
|
(None, None, Some(field_ranges)) => {
|
||||||
list_to_ranges(&field_ranges[..], complement).and_then(|ranges|
|
list_to_ranges(&field_ranges[..], complement).and_then(|ranges|
|
||||||
|
@ -520,6 +538,7 @@ pub fn uumain(args: Vec<String>) -> i32 {
|
||||||
};
|
};
|
||||||
|
|
||||||
let only_delimited = matches.opt_present("only-delimited");
|
let only_delimited = matches.opt_present("only-delimited");
|
||||||
|
let zero_terminated = matches.opt_present("zero-terminated");
|
||||||
|
|
||||||
match matches.opt_str("delimiter") {
|
match matches.opt_str("delimiter") {
|
||||||
Some(delim) => {
|
Some(delim) => {
|
||||||
|
@ -536,7 +555,8 @@ pub fn uumain(args: Vec<String>) -> i32 {
|
||||||
FieldOptions {
|
FieldOptions {
|
||||||
delimiter: delim,
|
delimiter: delim,
|
||||||
out_delimeter: out_delim,
|
out_delimeter: out_delim,
|
||||||
only_delimited: only_delimited
|
only_delimited: only_delimited,
|
||||||
|
zero_terminated: zero_terminated
|
||||||
}))
|
}))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -544,7 +564,8 @@ pub fn uumain(args: Vec<String>) -> i32 {
|
||||||
FieldOptions {
|
FieldOptions {
|
||||||
delimiter: "\t".to_owned(),
|
delimiter: "\t".to_owned(),
|
||||||
out_delimeter: out_delim,
|
out_delimeter: out_delim,
|
||||||
only_delimited: only_delimited
|
only_delimited: only_delimited,
|
||||||
|
zero_terminated: zero_terminated
|
||||||
}))
|
}))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -78,7 +78,6 @@ fn test_complement() {
|
||||||
.succeeds().stdout_only("9\n8\n7\n");
|
.succeeds().stdout_only("9\n8\n7\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg_attr(not(feature="test_unimplemented"),ignore)]
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_zero_terminated() {
|
fn test_zero_terminated() {
|
||||||
new_ucmd().args(&["-d_","-z", "-f", "1"])
|
new_ucmd().args(&["-d_","-z", "-f", "1"])
|
||||||
|
@ -95,7 +94,6 @@ fn test_only_delimited() {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg_attr(not(feature="test_unimplemented"),ignore)]
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_zero_terminated_only_delimited() {
|
fn test_zero_terminated_only_delimited() {
|
||||||
new_ucmd().args(&["-d_","-z", "-s", "-f", "1"])
|
new_ucmd().args(&["-d_","-z", "-s", "-f", "1"])
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue