diff --git a/src/cut/buffer.rs b/src/cut/buffer.rs index a194b6dd5..d490c001a 100644 --- a/src/cut/buffer.rs +++ b/src/cut/buffer.rs @@ -33,12 +33,14 @@ pub mod Bytes { #[derive(Debug)] pub struct ByteReader where R: Read { inner: BufReader, + newline_char: u8, } impl ByteReader { - pub fn new(read: R) -> ByteReader { + pub fn new(read: R, newline_char: u8) -> ByteReader { ByteReader { inner: BufReader::with_capacity(4096, read), + newline_char: newline_char } } } @@ -63,6 +65,7 @@ impl ByteReader { pub fn consume_line(&mut self) -> usize { let mut bytes_consumed = 0; let mut consume_val; + let newline_char = self.newline_char; loop { { // need filled_buf to go out of scope @@ -77,7 +80,7 @@ impl ByteReader { Err(e) => crash!(1, "read error: {}", e), }; - if let Some(idx) = filled_buf.iter().position(|byte| *byte == b'\n') { + if let Some(idx) = filled_buf.iter().position(|byte| *byte == newline_char) { consume_val = idx + 1; bytes_consumed += consume_val; break; @@ -105,6 +108,7 @@ impl self::Bytes::Select for ByteReader { use self::Bytes::Selected::*; + let newline_char = self.newline_char; let (res, consume_val) = { let buffer = match self.fill_buf() { Err(e) => crash!(1, "read error: {}", e), @@ -118,13 +122,13 @@ impl self::Bytes::Select for ByteReader { // segments check if the byte after bytes is a newline let buf_slice = &buffer[0..bytes + 1]; - match buf_slice.iter().position(|byte| *byte == b'\n') { + match buf_slice.iter().position(|byte| *byte == newline_char) { Some(idx) => (SRes::Newl, idx+1), None => (SRes::Comp, bytes), } }, _ => { - match buffer.iter().position(|byte| *byte == b'\n') { + match buffer.iter().position(|byte| *byte == newline_char) { Some(idx) => (SRes::Newl, idx+1), None => (SRes::Part, buffer.len()), } diff --git a/src/cut/cut.rs b/src/cut/cut.rs index 5a51ee957..f61c7b90d 100644 --- a/src/cut/cut.rs +++ b/src/cut/cut.rs @@ -30,12 +30,14 @@ static VERSION: &'static str = env!("CARGO_PKG_VERSION"); struct Options { out_delim: Option, + zero_terminated: bool, } struct FieldOptions { delimiter: String, // one char long, String because of UTF8 representation out_delimeter: Option, only_delimited: bool, + zero_terminated: bool, } enum Mode { @@ -56,7 +58,9 @@ fn cut_bytes(reader: R, ranges: &[Range], opts: &Options) -> i32 { use buffer::Bytes::Select; use buffer::Bytes::Selected::*; - let mut buf_read = buffer::ByteReader::new(reader); + let newline_char = + if opts.zero_terminated { b'\0' } else { b'\n' }; + let mut buf_read = buffer::ByteReader::new(reader, newline_char); let mut out = stdout(); 'newline: loop { @@ -69,7 +73,7 @@ fn cut_bytes(reader: R, ranges: &[Range], opts: &Options) -> i32 { loop { match buf_read.select(low - cur_pos, None::<&mut Stdout>) { NewlineFound => { - pipe_crash_if_err!(1, out.write_all(&[b'\n'])); + pipe_crash_if_err!(1, out.write_all(&[newline_char])); continue 'newline } Complete(len) => { @@ -79,7 +83,7 @@ fn cut_bytes(reader: R, ranges: &[Range], opts: &Options) -> i32 { Partial(len) => cur_pos += len, EndOfFile => { if orig_pos != cur_pos { - pipe_crash_if_err!(1, out.write_all(&[b'\n'])); + pipe_crash_if_err!(1, out.write_all(&[newline_char])); } break 'newline @@ -108,7 +112,7 @@ fn cut_bytes(reader: R, ranges: &[Range], opts: &Options) -> i32 { } EndOfFile => { if cur_pos != low || low == high { - pipe_crash_if_err!(1, out.write_all(&[b'\n'])); + pipe_crash_if_err!(1, out.write_all(&[newline_char])); } break 'newline @@ -118,7 +122,7 @@ fn cut_bytes(reader: R, ranges: &[Range], opts: &Options) -> i32 { } buf_read.consume_line(); - pipe_crash_if_err!(1, out.write_all(&[b'\n'])); + pipe_crash_if_err!(1, out.write_all(&[newline_char])); } 0 @@ -194,14 +198,14 @@ fn cut_characters(reader: R, ranges: &[Range], opts: &Options) -> i32 { 0 } -fn cut_fields_delimiter(reader: R, ranges: &[Range], delim: &str, only_delimited: bool, out_delim: &str) -> i32 { +fn cut_fields_delimiter(reader: R, ranges: &[Range], delim: &str, only_delimited: bool, newline_char: u8, out_delim: &str) -> i32 { let mut buf_in = BufReader::new(reader); let mut out = stdout(); let mut buffer = Vec::new(); 'newline: loop { buffer.clear(); - match buf_in.read_until(b'\n', &mut buffer) { + match buf_in.read_until(newline_char, &mut buffer) { Ok(n) if n == 0 => break, Err(e) => { if buffer.is_empty() { @@ -220,8 +224,8 @@ fn cut_fields_delimiter(reader: R, ranges: &[Range], delim: &str, only_ if delim_search.peek().is_none() { if ! only_delimited { pipe_crash_if_err!(1, out.write_all(line)); - if line[line.len() - 1] != b'\n' { - pipe_crash_if_err!(1, out.write_all(&[b'\n'])); + if line[line.len() - 1] != newline_char { + pipe_crash_if_err!(1, out.write_all(&[newline_char])); } } @@ -257,7 +261,7 @@ fn cut_fields_delimiter(reader: R, ranges: &[Range], delim: &str, only_ pipe_crash_if_err!(1, out.write_all(segment)); - if line[line.len() - 1] == b'\n' { + if line[line.len() - 1] == newline_char { continue 'newline } break @@ -266,17 +270,19 @@ fn cut_fields_delimiter(reader: R, ranges: &[Range], delim: &str, only_ } } - pipe_crash_if_err!(1, out.write_all(&[b'\n'])); + pipe_crash_if_err!(1, out.write_all(&[newline_char])); } 0 } fn cut_fields(reader: R, ranges: &[Range], opts: &FieldOptions) -> i32 { + let newline_char = + if opts.zero_terminated { b'\0' } else { b'\n' }; match opts.out_delimeter { Some(ref o_delim) => { return cut_fields_delimiter(reader, ranges, &opts.delimiter, - opts.only_delimited, o_delim); + opts.only_delimited, newline_char, o_delim); } None => () } @@ -287,7 +293,7 @@ fn cut_fields(reader: R, ranges: &[Range], opts: &FieldOptions) -> i32 'newline: loop { buffer.clear(); - match buf_in.read_until(b'\n', &mut buffer) { + match buf_in.read_until(newline_char, &mut buffer) { Ok(n) if n == 0 => break, Err(e) => { if buffer.is_empty() { @@ -306,8 +312,8 @@ fn cut_fields(reader: R, ranges: &[Range], opts: &FieldOptions) -> i32 if delim_search.peek().is_none() { if ! opts.only_delimited { pipe_crash_if_err!(1, out.write_all(line)); - if line[line.len() - 1] != b'\n' { - pipe_crash_if_err!(1, out.write_all(&[b'\n'])); + if line[line.len() - 1] != newline_char { + pipe_crash_if_err!(1, out.write_all(&[newline_char])); } } @@ -343,7 +349,7 @@ fn cut_fields(reader: R, ranges: &[Range], opts: &FieldOptions) -> i32 pipe_crash_if_err!(1, out.write_all(segment)); - if line[line.len() - 1] == b'\n' { + if line[line.len() - 1] == newline_char { continue 'newline } break @@ -351,7 +357,7 @@ fn cut_fields(reader: R, ranges: &[Range], opts: &FieldOptions) -> i32 } } - pipe_crash_if_err!(1, out.write_all(&[b'\n'])); + pipe_crash_if_err!(1, out.write_all(&[newline_char])); } 0 @@ -411,6 +417,7 @@ pub fn uumain(args: Vec) -> i32 { opts.optflag("n", "", "legacy option - has no effect."); opts.optflag("", "complement", "invert the filter - instead of displaying only the filtered columns, display all but those columns"); opts.optflag("s", "only-delimited", "in field mode, only print lines which contain the delimiter"); + opts.optflag("z", "zero-terminated", "instead of filtering columns based on line, filter columns based on \\0 (NULL character)"); opts.optopt("", "output-delimiter", "in field mode, replace the delimiter in output lines with this option's argument", "new delimiter"); let usage = opts.usage("Prints specified byte or field columns from each line of stdin or the input files"); opts.help(format!(" @@ -489,6 +496,17 @@ pub fn uumain(args: Vec) -> i32 { it will replace the delimiter character in each line printed. This is useful for transforming tabular data - e.g. to convert a CSV to a TSV (tab-separated file) + + Line endings + + When the --zero-terminated (-z) option is used, cut sees \\0 (null) as the + 'line ending' character (both for the purposes of reading lines and + separating printed lines) instead of \\n (newline). This is useful for + tabular data where some of the cells may contain newlines + + echo 'ab\\0cd' | cut -z -c 1 + will result in 'a\\0c\\0' + ", NAME, VERSION, usage)); let matches = opts.parse(args); @@ -499,11 +517,11 @@ pub fn uumain(args: Vec) -> i32 { matches.opt_str("fields")) { (Some(byte_ranges), None, None) => { list_to_ranges(&byte_ranges[..], complement) - .map(|ranges| Mode::Bytes(ranges, Options { out_delim: matches.opt_str("output-delimiter") })) + .map(|ranges| Mode::Bytes(ranges, Options { out_delim: matches.opt_str("output-delimiter"), zero_terminated : matches.opt_present("zero-terminated") })) } (None, Some(char_ranges), None) => { list_to_ranges(&char_ranges[..], complement) - .map(|ranges| Mode::Characters(ranges, Options { out_delim: matches.opt_str("output-delimiter") })) + .map(|ranges| Mode::Characters(ranges, Options { out_delim: matches.opt_str("output-delimiter"), zero_terminated : matches.opt_present("zero-terminated") })) } (None, None, Some(field_ranges)) => { list_to_ranges(&field_ranges[..], complement).and_then(|ranges| @@ -520,6 +538,7 @@ pub fn uumain(args: Vec) -> i32 { }; let only_delimited = matches.opt_present("only-delimited"); + let zero_terminated = matches.opt_present("zero-terminated"); match matches.opt_str("delimiter") { Some(delim) => { @@ -536,7 +555,8 @@ pub fn uumain(args: Vec) -> i32 { FieldOptions { delimiter: delim, out_delimeter: out_delim, - only_delimited: only_delimited + only_delimited: only_delimited, + zero_terminated: zero_terminated })) } } @@ -544,7 +564,8 @@ pub fn uumain(args: Vec) -> i32 { FieldOptions { delimiter: "\t".to_owned(), out_delimeter: out_delim, - only_delimited: only_delimited + only_delimited: only_delimited, + zero_terminated: zero_terminated })) } } diff --git a/tests/test_cut.rs b/tests/test_cut.rs index 03b1d1c69..0d85ceb7c 100644 --- a/tests/test_cut.rs +++ b/tests/test_cut.rs @@ -78,7 +78,6 @@ fn test_complement() { .succeeds().stdout_only("9\n8\n7\n"); } -#[cfg_attr(not(feature="test_unimplemented"),ignore)] #[test] fn test_zero_terminated() { new_ucmd().args(&["-d_","-z", "-f", "1"]) @@ -95,7 +94,6 @@ fn test_only_delimited() { } } -#[cfg_attr(not(feature="test_unimplemented"),ignore)] #[test] fn test_zero_terminated_only_delimited() { new_ucmd().args(&["-d_","-z", "-s", "-f", "1"])