1
Fork 0
mirror of https://github.com/RGBCube/uutils-coreutils synced 2025-07-28 19:47:45 +00:00

cut: implement zero-terminated option

Character mode is left unchanged, because removal of its
specialized behavior is still pending.
This commit is contained in:
Nathan Ross 2016-08-08 20:15:53 -04:00
parent 3618d9df94
commit 5aaff02195
3 changed files with 50 additions and 27 deletions

View file

@ -33,12 +33,14 @@ pub mod Bytes {
#[derive(Debug)] #[derive(Debug)]
pub struct ByteReader<R> where R: Read { pub struct ByteReader<R> where R: Read {
inner: BufReader<R>, inner: BufReader<R>,
newline_char: u8,
} }
impl<R: Read> ByteReader<R> { impl<R: Read> ByteReader<R> {
pub fn new(read: R) -> ByteReader<R> { pub fn new(read: R, newline_char: u8) -> ByteReader<R> {
ByteReader { ByteReader {
inner: BufReader::with_capacity(4096, read), inner: BufReader::with_capacity(4096, read),
newline_char: newline_char
} }
} }
} }
@ -63,6 +65,7 @@ impl<R: Read> ByteReader<R> {
pub fn consume_line(&mut self) -> usize { pub fn consume_line(&mut self) -> usize {
let mut bytes_consumed = 0; let mut bytes_consumed = 0;
let mut consume_val; let mut consume_val;
let newline_char = self.newline_char;
loop { loop {
{ // need filled_buf to go out of scope { // need filled_buf to go out of scope
@ -77,7 +80,7 @@ impl<R: Read> ByteReader<R> {
Err(e) => crash!(1, "read error: {}", e), Err(e) => crash!(1, "read error: {}", e),
}; };
if let Some(idx) = filled_buf.iter().position(|byte| *byte == b'\n') { if let Some(idx) = filled_buf.iter().position(|byte| *byte == newline_char) {
consume_val = idx + 1; consume_val = idx + 1;
bytes_consumed += consume_val; bytes_consumed += consume_val;
break; break;
@ -105,6 +108,7 @@ impl<R: Read> self::Bytes::Select for ByteReader<R> {
use self::Bytes::Selected::*; use self::Bytes::Selected::*;
let newline_char = self.newline_char;
let (res, consume_val) = { let (res, consume_val) = {
let buffer = match self.fill_buf() { let buffer = match self.fill_buf() {
Err(e) => crash!(1, "read error: {}", e), Err(e) => crash!(1, "read error: {}", e),
@ -118,13 +122,13 @@ impl<R: Read> self::Bytes::Select for ByteReader<R> {
// segments check if the byte after bytes is a newline // segments check if the byte after bytes is a newline
let buf_slice = &buffer[0..bytes + 1]; let buf_slice = &buffer[0..bytes + 1];
match buf_slice.iter().position(|byte| *byte == b'\n') { match buf_slice.iter().position(|byte| *byte == newline_char) {
Some(idx) => (SRes::Newl, idx+1), Some(idx) => (SRes::Newl, idx+1),
None => (SRes::Comp, bytes), None => (SRes::Comp, bytes),
} }
}, },
_ => { _ => {
match buffer.iter().position(|byte| *byte == b'\n') { match buffer.iter().position(|byte| *byte == newline_char) {
Some(idx) => (SRes::Newl, idx+1), Some(idx) => (SRes::Newl, idx+1),
None => (SRes::Part, buffer.len()), None => (SRes::Part, buffer.len()),
} }

View file

@ -30,12 +30,14 @@ static VERSION: &'static str = env!("CARGO_PKG_VERSION");
struct Options { struct Options {
out_delim: Option<String>, out_delim: Option<String>,
zero_terminated: bool,
} }
struct FieldOptions { struct FieldOptions {
delimiter: String, // one char long, String because of UTF8 representation delimiter: String, // one char long, String because of UTF8 representation
out_delimeter: Option<String>, out_delimeter: Option<String>,
only_delimited: bool, only_delimited: bool,
zero_terminated: bool,
} }
enum Mode { enum Mode {
@ -56,7 +58,9 @@ fn cut_bytes<R: Read>(reader: R, ranges: &[Range], opts: &Options) -> i32 {
use buffer::Bytes::Select; use buffer::Bytes::Select;
use buffer::Bytes::Selected::*; use buffer::Bytes::Selected::*;
let mut buf_read = buffer::ByteReader::new(reader); let newline_char =
if opts.zero_terminated { b'\0' } else { b'\n' };
let mut buf_read = buffer::ByteReader::new(reader, newline_char);
let mut out = stdout(); let mut out = stdout();
'newline: loop { 'newline: loop {
@ -69,7 +73,7 @@ fn cut_bytes<R: Read>(reader: R, ranges: &[Range], opts: &Options) -> i32 {
loop { loop {
match buf_read.select(low - cur_pos, None::<&mut Stdout>) { match buf_read.select(low - cur_pos, None::<&mut Stdout>) {
NewlineFound => { NewlineFound => {
pipe_crash_if_err!(1, out.write_all(&[b'\n'])); pipe_crash_if_err!(1, out.write_all(&[newline_char]));
continue 'newline continue 'newline
} }
Complete(len) => { Complete(len) => {
@ -79,7 +83,7 @@ fn cut_bytes<R: Read>(reader: R, ranges: &[Range], opts: &Options) -> i32 {
Partial(len) => cur_pos += len, Partial(len) => cur_pos += len,
EndOfFile => { EndOfFile => {
if orig_pos != cur_pos { if orig_pos != cur_pos {
pipe_crash_if_err!(1, out.write_all(&[b'\n'])); pipe_crash_if_err!(1, out.write_all(&[newline_char]));
} }
break 'newline break 'newline
@ -108,7 +112,7 @@ fn cut_bytes<R: Read>(reader: R, ranges: &[Range], opts: &Options) -> i32 {
} }
EndOfFile => { EndOfFile => {
if cur_pos != low || low == high { if cur_pos != low || low == high {
pipe_crash_if_err!(1, out.write_all(&[b'\n'])); pipe_crash_if_err!(1, out.write_all(&[newline_char]));
} }
break 'newline break 'newline
@ -118,7 +122,7 @@ fn cut_bytes<R: Read>(reader: R, ranges: &[Range], opts: &Options) -> i32 {
} }
buf_read.consume_line(); buf_read.consume_line();
pipe_crash_if_err!(1, out.write_all(&[b'\n'])); pipe_crash_if_err!(1, out.write_all(&[newline_char]));
} }
0 0
@ -194,14 +198,14 @@ fn cut_characters<R: Read>(reader: R, ranges: &[Range], opts: &Options) -> i32 {
0 0
} }
fn cut_fields_delimiter<R: Read>(reader: R, ranges: &[Range], delim: &str, only_delimited: bool, out_delim: &str) -> i32 { fn cut_fields_delimiter<R: Read>(reader: R, ranges: &[Range], delim: &str, only_delimited: bool, newline_char: u8, out_delim: &str) -> i32 {
let mut buf_in = BufReader::new(reader); let mut buf_in = BufReader::new(reader);
let mut out = stdout(); let mut out = stdout();
let mut buffer = Vec::new(); let mut buffer = Vec::new();
'newline: loop { 'newline: loop {
buffer.clear(); buffer.clear();
match buf_in.read_until(b'\n', &mut buffer) { match buf_in.read_until(newline_char, &mut buffer) {
Ok(n) if n == 0 => break, Ok(n) if n == 0 => break,
Err(e) => { Err(e) => {
if buffer.is_empty() { if buffer.is_empty() {
@ -220,8 +224,8 @@ fn cut_fields_delimiter<R: Read>(reader: R, ranges: &[Range], delim: &str, only_
if delim_search.peek().is_none() { if delim_search.peek().is_none() {
if ! only_delimited { if ! only_delimited {
pipe_crash_if_err!(1, out.write_all(line)); pipe_crash_if_err!(1, out.write_all(line));
if line[line.len() - 1] != b'\n' { if line[line.len() - 1] != newline_char {
pipe_crash_if_err!(1, out.write_all(&[b'\n'])); pipe_crash_if_err!(1, out.write_all(&[newline_char]));
} }
} }
@ -257,7 +261,7 @@ fn cut_fields_delimiter<R: Read>(reader: R, ranges: &[Range], delim: &str, only_
pipe_crash_if_err!(1, out.write_all(segment)); pipe_crash_if_err!(1, out.write_all(segment));
if line[line.len() - 1] == b'\n' { if line[line.len() - 1] == newline_char {
continue 'newline continue 'newline
} }
break break
@ -266,17 +270,19 @@ fn cut_fields_delimiter<R: Read>(reader: R, ranges: &[Range], delim: &str, only_
} }
} }
pipe_crash_if_err!(1, out.write_all(&[b'\n'])); pipe_crash_if_err!(1, out.write_all(&[newline_char]));
} }
0 0
} }
fn cut_fields<R: Read>(reader: R, ranges: &[Range], opts: &FieldOptions) -> i32 { fn cut_fields<R: Read>(reader: R, ranges: &[Range], opts: &FieldOptions) -> i32 {
let newline_char =
if opts.zero_terminated { b'\0' } else { b'\n' };
match opts.out_delimeter { match opts.out_delimeter {
Some(ref o_delim) => { Some(ref o_delim) => {
return cut_fields_delimiter(reader, ranges, &opts.delimiter, return cut_fields_delimiter(reader, ranges, &opts.delimiter,
opts.only_delimited, o_delim); opts.only_delimited, newline_char, o_delim);
} }
None => () None => ()
} }
@ -287,7 +293,7 @@ fn cut_fields<R: Read>(reader: R, ranges: &[Range], opts: &FieldOptions) -> i32
'newline: loop { 'newline: loop {
buffer.clear(); buffer.clear();
match buf_in.read_until(b'\n', &mut buffer) { match buf_in.read_until(newline_char, &mut buffer) {
Ok(n) if n == 0 => break, Ok(n) if n == 0 => break,
Err(e) => { Err(e) => {
if buffer.is_empty() { if buffer.is_empty() {
@ -306,8 +312,8 @@ fn cut_fields<R: Read>(reader: R, ranges: &[Range], opts: &FieldOptions) -> i32
if delim_search.peek().is_none() { if delim_search.peek().is_none() {
if ! opts.only_delimited { if ! opts.only_delimited {
pipe_crash_if_err!(1, out.write_all(line)); pipe_crash_if_err!(1, out.write_all(line));
if line[line.len() - 1] != b'\n' { if line[line.len() - 1] != newline_char {
pipe_crash_if_err!(1, out.write_all(&[b'\n'])); pipe_crash_if_err!(1, out.write_all(&[newline_char]));
} }
} }
@ -343,7 +349,7 @@ fn cut_fields<R: Read>(reader: R, ranges: &[Range], opts: &FieldOptions) -> i32
pipe_crash_if_err!(1, out.write_all(segment)); pipe_crash_if_err!(1, out.write_all(segment));
if line[line.len() - 1] == b'\n' { if line[line.len() - 1] == newline_char {
continue 'newline continue 'newline
} }
break break
@ -351,7 +357,7 @@ fn cut_fields<R: Read>(reader: R, ranges: &[Range], opts: &FieldOptions) -> i32
} }
} }
pipe_crash_if_err!(1, out.write_all(&[b'\n'])); pipe_crash_if_err!(1, out.write_all(&[newline_char]));
} }
0 0
@ -411,6 +417,7 @@ pub fn uumain(args: Vec<String>) -> i32 {
opts.optflag("n", "", "legacy option - has no effect."); opts.optflag("n", "", "legacy option - has no effect.");
opts.optflag("", "complement", "invert the filter - instead of displaying only the filtered columns, display all but those columns"); opts.optflag("", "complement", "invert the filter - instead of displaying only the filtered columns, display all but those columns");
opts.optflag("s", "only-delimited", "in field mode, only print lines which contain the delimiter"); opts.optflag("s", "only-delimited", "in field mode, only print lines which contain the delimiter");
opts.optflag("z", "zero-terminated", "instead of filtering columns based on line, filter columns based on \\0 (NULL character)");
opts.optopt("", "output-delimiter", "in field mode, replace the delimiter in output lines with this option's argument", "new delimiter"); opts.optopt("", "output-delimiter", "in field mode, replace the delimiter in output lines with this option's argument", "new delimiter");
let usage = opts.usage("Prints specified byte or field columns from each line of stdin or the input files"); let usage = opts.usage("Prints specified byte or field columns from each line of stdin or the input files");
opts.help(format!(" opts.help(format!("
@ -489,6 +496,17 @@ pub fn uumain(args: Vec<String>) -> i32 {
it will replace the delimiter character in each line printed. This is it will replace the delimiter character in each line printed. This is
useful for transforming tabular data - e.g. to convert a CSV to a useful for transforming tabular data - e.g. to convert a CSV to a
TSV (tab-separated file) TSV (tab-separated file)
Line endings
When the --zero-terminated (-z) option is used, cut sees \\0 (null) as the
'line ending' character (both for the purposes of reading lines and
separating printed lines) instead of \\n (newline). This is useful for
tabular data where some of the cells may contain newlines
echo 'ab\\0cd' | cut -z -c 1
will result in 'a\\0c\\0'
", NAME, VERSION, usage)); ", NAME, VERSION, usage));
let matches = opts.parse(args); let matches = opts.parse(args);
@ -499,11 +517,11 @@ pub fn uumain(args: Vec<String>) -> i32 {
matches.opt_str("fields")) { matches.opt_str("fields")) {
(Some(byte_ranges), None, None) => { (Some(byte_ranges), None, None) => {
list_to_ranges(&byte_ranges[..], complement) list_to_ranges(&byte_ranges[..], complement)
.map(|ranges| Mode::Bytes(ranges, Options { out_delim: matches.opt_str("output-delimiter") })) .map(|ranges| Mode::Bytes(ranges, Options { out_delim: matches.opt_str("output-delimiter"), zero_terminated : matches.opt_present("zero-terminated") }))
} }
(None, Some(char_ranges), None) => { (None, Some(char_ranges), None) => {
list_to_ranges(&char_ranges[..], complement) list_to_ranges(&char_ranges[..], complement)
.map(|ranges| Mode::Characters(ranges, Options { out_delim: matches.opt_str("output-delimiter") })) .map(|ranges| Mode::Characters(ranges, Options { out_delim: matches.opt_str("output-delimiter"), zero_terminated : matches.opt_present("zero-terminated") }))
} }
(None, None, Some(field_ranges)) => { (None, None, Some(field_ranges)) => {
list_to_ranges(&field_ranges[..], complement).and_then(|ranges| list_to_ranges(&field_ranges[..], complement).and_then(|ranges|
@ -520,6 +538,7 @@ pub fn uumain(args: Vec<String>) -> i32 {
}; };
let only_delimited = matches.opt_present("only-delimited"); let only_delimited = matches.opt_present("only-delimited");
let zero_terminated = matches.opt_present("zero-terminated");
match matches.opt_str("delimiter") { match matches.opt_str("delimiter") {
Some(delim) => { Some(delim) => {
@ -536,7 +555,8 @@ pub fn uumain(args: Vec<String>) -> i32 {
FieldOptions { FieldOptions {
delimiter: delim, delimiter: delim,
out_delimeter: out_delim, out_delimeter: out_delim,
only_delimited: only_delimited only_delimited: only_delimited,
zero_terminated: zero_terminated
})) }))
} }
} }
@ -544,7 +564,8 @@ pub fn uumain(args: Vec<String>) -> i32 {
FieldOptions { FieldOptions {
delimiter: "\t".to_owned(), delimiter: "\t".to_owned(),
out_delimeter: out_delim, out_delimeter: out_delim,
only_delimited: only_delimited only_delimited: only_delimited,
zero_terminated: zero_terminated
})) }))
} }
} }

View file

@ -78,7 +78,6 @@ fn test_complement() {
.succeeds().stdout_only("9\n8\n7\n"); .succeeds().stdout_only("9\n8\n7\n");
} }
#[cfg_attr(not(feature="test_unimplemented"),ignore)]
#[test] #[test]
fn test_zero_terminated() { fn test_zero_terminated() {
new_ucmd().args(&["-d_","-z", "-f", "1"]) new_ucmd().args(&["-d_","-z", "-f", "1"])
@ -95,7 +94,6 @@ fn test_only_delimited() {
} }
} }
#[cfg_attr(not(feature="test_unimplemented"),ignore)]
#[test] #[test]
fn test_zero_terminated_only_delimited() { fn test_zero_terminated_only_delimited() {
new_ucmd().args(&["-d_","-z", "-s", "-f", "1"]) new_ucmd().args(&["-d_","-z", "-s", "-f", "1"])