cut: implement zero-terminated option

Char mode is left unchanged because the removal of its specialized behavior is still pending.
2025-07-28 19:47:45 +00:00 · 2016-08-08 20:15:53 -04:00 · 2016-08-08 20:15:53 -04:00 · 5aaff02195
commit 5aaff02195
parent 3618d9df94
3 changed files with 50 additions and 27 deletions
--- a/src/cut/buffer.rs
+++ b/src/cut/buffer.rs
@ -33,12 +33,14 @@ pub mod Bytes {
 #[derive(Debug)]
 pub struct ByteReader<R> where R: Read {
    inner: BufReader<R>,
    newline_char: u8,
 }
 impl<R: Read> ByteReader<R> {
-    pub fn new(read: R) -> ByteReader<R> {
+    pub fn new(read: R, newline_char: u8) -> ByteReader<R> {
        ByteReader {
            inner: BufReader::with_capacity(4096, read),
            newline_char: newline_char
        }
    }
 }
@ -63,6 +65,7 @@ impl<R: Read> ByteReader<R> {
    pub fn consume_line(&mut self) -> usize {
        let mut bytes_consumed = 0;
        let mut consume_val;
        let newline_char = self.newline_char;
        loop {
            { // need filled_buf to go out of scope
@ -77,7 +80,7 @@ impl<R: Read> ByteReader<R> {
                    Err(e) => crash!(1, "read error: {}", e),
                };
-                if let Some(idx) = filled_buf.iter().position(|byte| *byte == b'\n') {
+                if let Some(idx) = filled_buf.iter().position(|byte| *byte == newline_char) {
                    consume_val = idx + 1;
                    bytes_consumed += consume_val;
                    break;
@ -105,6 +108,7 @@ impl<R: Read> self::Bytes::Select for ByteReader<R> {
        use self::Bytes::Selected::*;
        let newline_char = self.newline_char;
        let (res, consume_val) = {
            let buffer = match self.fill_buf() {
                Err(e) => crash!(1, "read error: {}", e),
@ -118,13 +122,13 @@ impl<R: Read> self::Bytes::Select for ByteReader<R> {
                    // segments check if the byte after bytes is a newline
                    let buf_slice = &buffer[0..bytes + 1];
-                    match buf_slice.iter().position(|byte| *byte == b'\n') {
+                    match buf_slice.iter().position(|byte| *byte == newline_char) {
                        Some(idx) => (SRes::Newl, idx+1),
                        None => (SRes::Comp, bytes),
                    }
                },
                _ => {
-                    match buffer.iter().position(|byte| *byte == b'\n') {
+                    match buffer.iter().position(|byte| *byte == newline_char) {
                        Some(idx) => (SRes::Newl, idx+1),
                        None => (SRes::Part, buffer.len()),
                    }
--- a/src/cut/cut.rs
+++ b/src/cut/cut.rs
@ -30,12 +30,14 @@ static VERSION: &'static str = env!("CARGO_PKG_VERSION");
 struct Options {
    out_delim: Option<String>,
    zero_terminated: bool,
 }
 struct FieldOptions {
    delimiter: String,  // one char long, String because of UTF8 representation
    out_delimeter: Option<String>,
    only_delimited: bool,
    zero_terminated: bool,
 }
 enum Mode {
@ -56,7 +58,9 @@ fn cut_bytes<R: Read>(reader: R, ranges: &[Range], opts: &Options) -> i32 {
    use buffer::Bytes::Select;
    use buffer::Bytes::Selected::*;
-    let mut buf_read = buffer::ByteReader::new(reader);
+    let newline_char = 
        if opts.zero_terminated { b'\0' } else { b'\n' };
    let mut buf_read = buffer::ByteReader::new(reader, newline_char);
    let mut out = stdout();
    'newline: loop {
@ -69,7 +73,7 @@ fn cut_bytes<R: Read>(reader: R, ranges: &[Range], opts: &Options) -> i32 {
            loop {
                match buf_read.select(low - cur_pos, None::<&mut Stdout>) {
                    NewlineFound => {
-                        pipe_crash_if_err!(1, out.write_all(&[b'\n']));
+                        pipe_crash_if_err!(1, out.write_all(&[newline_char]));
                        continue 'newline
                    }
                    Complete(len) => {
@ -79,7 +83,7 @@ fn cut_bytes<R: Read>(reader: R, ranges: &[Range], opts: &Options) -> i32 {
                    Partial(len) => cur_pos += len,
                    EndOfFile => {
                        if orig_pos != cur_pos {
-                            pipe_crash_if_err!(1, out.write_all(&[b'\n']));
+                            pipe_crash_if_err!(1, out.write_all(&[newline_char]));
                        }
                        break 'newline
@ -108,7 +112,7 @@ fn cut_bytes<R: Read>(reader: R, ranges: &[Range], opts: &Options) -> i32 {
                    }
                    EndOfFile => {
                        if cur_pos != low || low == high {
-                            pipe_crash_if_err!(1, out.write_all(&[b'\n']));
+                            pipe_crash_if_err!(1, out.write_all(&[newline_char]));
                        }
                        break 'newline
@ -118,7 +122,7 @@ fn cut_bytes<R: Read>(reader: R, ranges: &[Range], opts: &Options) -> i32 {
        }
        buf_read.consume_line();
-        pipe_crash_if_err!(1, out.write_all(&[b'\n']));
+        pipe_crash_if_err!(1, out.write_all(&[newline_char]));
    }
    0
@ -194,14 +198,14 @@ fn cut_characters<R: Read>(reader: R, ranges: &[Range], opts: &Options) -> i32 {
    0
 }
-fn cut_fields_delimiter<R: Read>(reader: R, ranges: &[Range], delim: &str, only_delimited: bool, out_delim: &str) -> i32 {
+fn cut_fields_delimiter<R: Read>(reader: R, ranges: &[Range], delim: &str, only_delimited: bool, newline_char: u8, out_delim: &str) -> i32 {
    let mut buf_in = BufReader::new(reader);
    let mut out = stdout();
    let mut buffer = Vec::new();
    'newline: loop {
        buffer.clear();
-        match buf_in.read_until(b'\n', &mut buffer) {
+        match buf_in.read_until(newline_char, &mut buffer) {
            Ok(n) if n == 0 => break,
            Err(e) => {
                if buffer.is_empty() {
@ -220,8 +224,8 @@ fn cut_fields_delimiter<R: Read>(reader: R, ranges: &[Range], delim: &str, only_
        if delim_search.peek().is_none() {
            if ! only_delimited {
                pipe_crash_if_err!(1, out.write_all(line));
-                if line[line.len() - 1] != b'\n' {
+                if line[line.len() - 1] != newline_char {
-                    pipe_crash_if_err!(1, out.write_all(&[b'\n']));
+                    pipe_crash_if_err!(1, out.write_all(&[newline_char]));
                }
            }
@ -257,7 +261,7 @@ fn cut_fields_delimiter<R: Read>(reader: R, ranges: &[Range], delim: &str, only_
                        pipe_crash_if_err!(1, out.write_all(segment));
-                        if line[line.len() - 1] == b'\n' {
+                        if line[line.len() - 1] == newline_char {
                            continue 'newline
                        }
                        break
@ -266,17 +270,19 @@ fn cut_fields_delimiter<R: Read>(reader: R, ranges: &[Range], delim: &str, only_
            }
        }
-        pipe_crash_if_err!(1, out.write_all(&[b'\n']));
+        pipe_crash_if_err!(1, out.write_all(&[newline_char]));
    }
    0
 }
 fn cut_fields<R: Read>(reader: R, ranges: &[Range], opts: &FieldOptions) -> i32 {
    let newline_char = 
        if opts.zero_terminated { b'\0' } else { b'\n' };
    match opts.out_delimeter {
        Some(ref o_delim) => {
            return cut_fields_delimiter(reader, ranges, &opts.delimiter,
-                                        opts.only_delimited, o_delim);
+                                        opts.only_delimited, newline_char, o_delim);
        }
        None => ()
    }
@ -287,7 +293,7 @@ fn cut_fields<R: Read>(reader: R, ranges: &[Range], opts: &FieldOptions) -> i32
    'newline: loop {
        buffer.clear();
-        match buf_in.read_until(b'\n', &mut buffer) {
+        match buf_in.read_until(newline_char, &mut buffer) {
            Ok(n) if n == 0 => break,
            Err(e) => {
                if buffer.is_empty() {
@ -306,8 +312,8 @@ fn cut_fields<R: Read>(reader: R, ranges: &[Range], opts: &FieldOptions) -> i32
        if delim_search.peek().is_none() {
            if ! opts.only_delimited {
                pipe_crash_if_err!(1, out.write_all(line));
-                if line[line.len() - 1] != b'\n' {
+                if line[line.len() - 1] != newline_char {
-                    pipe_crash_if_err!(1, out.write_all(&[b'\n']));
+                    pipe_crash_if_err!(1, out.write_all(&[newline_char]));
                }
            }
@ -343,7 +349,7 @@ fn cut_fields<R: Read>(reader: R, ranges: &[Range], opts: &FieldOptions) -> i32
                    pipe_crash_if_err!(1, out.write_all(segment));
-                    if line[line.len() - 1] == b'\n' {
+                    if line[line.len() - 1] == newline_char {
                        continue 'newline
                    }
                    break
@ -351,7 +357,7 @@ fn cut_fields<R: Read>(reader: R, ranges: &[Range], opts: &FieldOptions) -> i32
            }
        }
-        pipe_crash_if_err!(1, out.write_all(&[b'\n']));
+        pipe_crash_if_err!(1, out.write_all(&[newline_char]));
    }
    0
@ -411,6 +417,7 @@ pub fn uumain(args: Vec<String>) -> i32 {
    opts.optflag("n", "", "legacy option - has no effect.");
    opts.optflag("", "complement", "invert the filter - instead of displaying only the filtered columns, display all but those columns");
    opts.optflag("s", "only-delimited", "in field mode, only print lines which contain the delimiter");
    opts.optflag("z", "zero-terminated", "instead of filtering columns based on line, filter columns based on \\0 (NULL character)");
    opts.optopt("", "output-delimiter", "in field mode, replace the delimiter in output lines with this option's argument", "new delimiter");
    let usage = opts.usage("Prints specified byte or field columns from each line of stdin or the input files");
    opts.help(format!("
@ -489,6 +496,17 @@ pub fn uumain(args: Vec<String>) -> i32 {
        it will replace the delimiter character in each line printed. This is
        useful for transforming tabular data - e.g. to convert a CSV to a 
        TSV (tab-separated file)
 Line endings
    When the --zero-terminated (-z) option is used, cut sees \\0 (null) as the
    'line ending' character (both for the purposes of reading lines and 
    separating printed lines) instead of \\n (newline). This is useful for
    tabular data where some of the cells may contain newlines
    echo 'ab\\0cd' | cut -z -c 1
    will result in 'a\\0c\\0'
 ", NAME, VERSION, usage));
    let matches = opts.parse(args);
@ -499,11 +517,11 @@ pub fn uumain(args: Vec<String>) -> i32 {
                            matches.opt_str("fields")) {
        (Some(byte_ranges), None, None) => {
            list_to_ranges(&byte_ranges[..], complement)
-                .map(|ranges| Mode::Bytes(ranges, Options { out_delim: matches.opt_str("output-delimiter") }))
+                .map(|ranges| Mode::Bytes(ranges, Options { out_delim: matches.opt_str("output-delimiter"), zero_terminated : matches.opt_present("zero-terminated") }))
        }
        (None, Some(char_ranges), None) => {
            list_to_ranges(&char_ranges[..], complement)
-                .map(|ranges| Mode::Characters(ranges, Options { out_delim: matches.opt_str("output-delimiter") }))
+                .map(|ranges| Mode::Characters(ranges, Options { out_delim: matches.opt_str("output-delimiter"), zero_terminated : matches.opt_present("zero-terminated") }))
        }
        (None, None, Some(field_ranges)) => {
            list_to_ranges(&field_ranges[..], complement).and_then(|ranges|
@ -520,6 +538,7 @@ pub fn uumain(args: Vec<String>) -> i32 {
                    };
                    let only_delimited = matches.opt_present("only-delimited");
                    let zero_terminated = matches.opt_present("zero-terminated");
                    match matches.opt_str("delimiter") {
                        Some(delim) => {
@ -536,7 +555,8 @@ pub fn uumain(args: Vec<String>) -> i32 {
                                          FieldOptions {
                                              delimiter: delim,
                                              out_delimeter: out_delim,
-                                              only_delimited: only_delimited
+                                              only_delimited: only_delimited,
                                              zero_terminated: zero_terminated
                                          }))
                            }
                        }
@ -544,7 +564,8 @@ pub fn uumain(args: Vec<String>) -> i32 {
                                          FieldOptions {
                                              delimiter: "\t".to_owned(),
                                              out_delimeter: out_delim,
-                                              only_delimited: only_delimited
+                                              only_delimited: only_delimited,
                                              zero_terminated: zero_terminated
                                          }))
                    }
                }
--- a/tests/test_cut.rs
+++ b/tests/test_cut.rs
@ -78,7 +78,6 @@ fn test_complement() {
        .succeeds().stdout_only("9\n8\n7\n");
 }
 #[cfg_attr(not(feature="test_unimplemented"),ignore)]
 #[test]
 fn test_zero_terminated() {
    new_ucmd().args(&["-d_","-z", "-f", "1"])
@ -95,7 +94,6 @@ fn test_only_delimited() {
    }
 }
 #[cfg_attr(not(feature="test_unimplemented"),ignore)]
 #[test]
 fn test_zero_terminated_only_delimited() {
    new_ucmd().args(&["-d_","-z", "-s", "-f", "1"])