fix/rewrite unexpand and its tests

This is a reworked version of unexpand. I did this for two main reasons: 1. The previous version of unexpand had issues correctly computing tabstops when the `-a` flag was supplied. 2. The previous version assumed the input was UTF-8; this version works with non-UTF-8 inputs. In addition, this version has a new flag, -U, which forces unexpand to treat input as 8-bit ASCII rather than interpreting it as UTF-8. This might be handy in some cases.
2025-08-02 14:07:46 +00:00 · 2015-04-28 17:33:31 -04:00 · 2015-04-28 17:33:31 -04:00 · feee266b20
commit feee266b20
parent ec4e3a60e4
3 changed files with 196 additions and 127 deletions
--- a/src/unexpand/deps.mk
+++ b/src/unexpand/deps.mk
@ -0,0 +1 @@
 DEPLIBS += unicode-width
--- a/src/unexpand/unexpand.rs
+++ b/src/unexpand/unexpand.rs
@ -1,10 +1,12 @@
 #![crate_name = "unexpand"]
-#![feature(collections, core, old_io, old_path, rustc_private)]
+#![feature(rustc_private, unicode)]
 /*
 * This file is part of the uutils coreutils package.
 *
 * (c) Virgile Andreani <virgile.andreani@anbuco.fr>
 * (c) kwantam <kwantam@gmail.com>
 *     20150428 updated to work with both UTF-8 and non-UTF-8 encodings
 *
 * For the full copyright and license information, please view the LICENSE
 * file that was distributed with this source code.
@ -12,8 +14,14 @@
 extern crate getopts;
 extern crate libc;
 extern crate rustc_unicode;
 extern crate unicode_width;
-use std::old_io as io;
+use std::fs::File;
 use std::io::{stdin, stdout, BufRead, BufReader, BufWriter, Read, Stdout, Write};
 use std::str::from_utf8;
 use rustc_unicode::str::utf8_char_width;
 use unicode_width::UnicodeWidthChar;
 #[path = "../common/util.rs"]
 #[macro_use]
@ -25,7 +33,7 @@ static VERSION: &'static str = "0.0.1";
 static DEFAULT_TABSTOP: usize = 8;
 fn tabstops_parse(s: String) -> Vec<usize> {
-    let words = s.as_slice().split(',').collect::<Vec<&str>>();
+    let words = s.split(',').collect::<Vec<&str>>();
    let nums = words.into_iter()
        .map(|sn| sn.parse()
@ -49,7 +57,8 @@ fn tabstops_parse(s: String) -> Vec<usize> {
 struct Options {
    files: Vec<String>,
    tabstops: Vec<usize>,
-    aflag: bool
+    aflag: bool,
    uflag: bool,
 }
 impl Options {
@ -61,6 +70,7 @@ impl Options {
        let aflag = (matches.opt_present("all") || matches.opt_present("tabs"))
                    && !matches.opt_present("first-only");
        let uflag = !matches.opt_present("U");
        let files =
            if matches.free.is_empty() {
@ -69,7 +79,7 @@ impl Options {
                matches.free
            };
-        Options { files: files, tabstops: tabstops, aflag: aflag }
+        Options { files: files, tabstops: tabstops, aflag: aflag, uflag: uflag }
    }
 }
@ -79,20 +89,21 @@ pub fn uumain(args: Vec<String>) -> i32 {
        getopts::optflag("", "first-only", "convert only leading sequences of blanks (overrides -a)"),
        getopts::optopt("t", "tabs", "have tabs N characters apart instead of 8 (enables -a)", "N"),
        getopts::optopt("t", "tabs", "use comma separated LIST of tab positions (enables -a)", "LIST"),
        getopts::optflag("U", "no-utf8", "interpret input file as 8-bit ASCII rather than UTF-8"),
        getopts::optflag("h", "help", "display this help and exit"),
        getopts::optflag("V", "version", "output version information and exit"),
    ];
-    let matches = match getopts::getopts(args.tail(), &opts) {
+    let matches = match getopts::getopts(&args[1..], &opts) {
        Ok(m) => m,
        Err(f) => crash!(1, "{}", f)
    };
    if matches.opt_present("help") {
        println!("Usage: {} [OPTION]... [FILE]...", NAME);
-        io::print(getopts::usage(
+        println!("{}", getopts::usage(
            "Convert blanks in each FILE to tabs, writing to standard output.\n\
-            With no FILE, or when FILE is -, read standard input.", &opts).as_slice());
+            With no FILE, or when FILE is -, read standard input.", &opts));
        return 0;
    }
@ -106,121 +117,175 @@ pub fn uumain(args: Vec<String>) -> i32 {
    return 0;
 }
-fn open(path: String) -> io::BufferedReader<Box<Reader+'static>> {
+fn open(path: String) -> BufReader<Box<Read+'static>> {
    let mut file_buf;
-    if path.as_slice() == "-" {
+    if path == "-" {
-        io::BufferedReader::new(Box::new(io::stdio::stdin_raw()) as Box<Reader>)
+        BufReader::new(Box::new(stdin()) as Box<Read>)
    } else {
-        file_buf = match io::File::open(&Path::new(path.as_slice())) {
+        file_buf = match File::open(&path[..]) {
            Ok(a) => a,
-            _ => crash!(1, "{}: {}\n", path, "No such file or directory")
+            Err(e) => crash!(1, "{}: {}", &path[..], e),
        };
-        io::BufferedReader::new(Box::new(file_buf) as Box<Reader>)
+        BufReader::new(Box::new(file_buf) as Box<Read>)
    }
 }
-fn is_tabstop(tabstops: &[usize], col: usize) -> bool {
+fn next_tabstop(tabstops: &[usize], col: usize) -> Option<usize> {
-    match tabstops {
+    if tabstops.len() == 1 {
-        [tabstop] => col % tabstop == 0,
+        Some(tabstops[0] - col % tabstops[0])
-        tabstops => tabstops.binary_search_by(|&e| e.cmp(&col)).is_ok()
+    } else {
-    }
+        // find next larger tab
-}
+        match tabstops.iter().skip_while(|&&t| t <= col).next() {
-
+            Some(t) => Some(t - col),
-fn to_next_stop(tabstops: &[usize], col: usize) -> Option<usize> {
+            None => None,   // if there isn't one in the list, tab becomes a single space
    match tabstops {
        [tabstop] => Some(tabstop - col % tabstop),
        tabstops => tabstops.iter().skip_while(|&t| *t <= col).next()
            .map(|&tabstop| tabstop - col % tabstop)
    }
 }
 fn unexpandspan(mut output: &mut io::LineBufferedWriter<io::stdio::StdWriter>,
                tabstops: &[usize], nspaces: usize, col: usize, init: bool) {
    let mut cur = col - nspaces;
    if nspaces > 1 || init {
        loop {
            match to_next_stop(tabstops, cur) {
                Some(to_next) if cur + to_next <= col => {
                        safe_write!(&mut output, "{}", '\t');
                        cur += to_next;
                    }
                _ => break
            }
        }
    }
-    safe_write!(&mut output, "{:1$}", "", col - cur);
+}
 // Emit the blanks that cover columns `scol..col`, using as many tabs as fit
 // and padding the remainder with single spaces.
 //
 // `output`   - buffered stdout writer that receives the bytes
 // `tabstops` - tab stop configuration, passed through to `next_tabstop`
 // `scol`     - column already printed up to (start of the pending blank span)
 // `col`      - column the output must reach
 //
 // NOTE(review): write failures are handled by the `safe_unwrap!` macro, which
 // is defined outside this view - confirm its behavior there.
 fn write_tabs(mut output: &mut BufWriter<Stdout>, tabstops: &[usize], mut scol: usize, col: usize) {
    // `nts` is the distance from `scol` to the next tab stop; the loop ends
    // when `next_tabstop` reports there is none
    while let Some(nts) = next_tabstop(tabstops, scol) {
        // a tab here would overshoot the target column, so stop emitting tabs
        if col < scol + nts {
            break;
        }
        safe_unwrap!(output.write_all("\t".as_bytes()));
        scol += nts;
    }
    // fill whatever is left between the last tab stop reached and `col` with spaces
    while col > scol {
        safe_unwrap!(output.write_all(" ".as_bytes()));
        scol += 1;
    }
 }
 // Classification assigned to each input character while scanning a line in
 // `unexpand`; the previous character's type is also remembered there
 // (`pctype`) when deciding whether buffered blanks become tabs.
 #[derive(PartialEq, Eq, Debug)]
 enum CharType {
    // '\x08' (0x08)
    Backspace,
    // ' ' (0x20)
    Space,
    // '\t' (0x09)
    Tab,
    // any other character, including bytes that are not valid UTF-8
    Other,
 }
 fn unexpand(options: Options) {
-    let mut output = io::stdout();
+    use self::CharType::*;
-    let ts = options.tabstops.as_slice();
+
    let mut output = BufWriter::new(stdout());
    let ts = &options.tabstops[..];
    let mut buf = Vec::new();
    let lastcol = if ts.len() > 1 {
        *ts.last().unwrap()
    } else {
        0
    };
    for file in options.files.into_iter() {
-        let mut col = 0;
+        let mut fh = open(file);
-        let mut nspaces = 0;
+
-        let mut init = true;
+        while match fh.read_until('\n' as u8, &mut buf) {
-        for c in open(file).chars() {
+            Ok(s) => s > 0,
-            match c {
+            Err(_) => buf.len() > 0,
-                Ok(' ') => {
+        } {
-                    if init || options.aflag {
+            let mut byte = 0;       // offset into the buffer
-                        nspaces += 1;
+            let mut col = 0;        // the current column
            let mut scol = 0;       // the start col for the current span, i.e., the already-printed width
            let mut init = true;    // are we at the start of the line?
            let mut pctype = Other;
            while byte < buf.len() {
                // when we have a finite number of columns, never convert past the last column
                if lastcol > 0 && col >= lastcol {
                    if (pctype != Tab && col > scol + 1) || 
                       (col > scol && (init || pctype == Tab)) {
                        write_tabs(&mut output, ts, scol, col);
                    } else if col > scol {
                        safe_unwrap!(output.write_all(" ".as_bytes()));
                    }
                    scol = col;
                    safe_unwrap!(output.write_all(&buf[byte..]));
                    break;
                }
                let (ctype, cwidth, nbytes) = if options.uflag {
                    let nbytes = utf8_char_width(buf[byte]);
                    // figure out how big the next char is, if it's UTF-8
                    if byte + nbytes > buf.len() {
                        // make sure we don't overrun the buffer because of invalid UTF-8
                        (Other, 1, 1)
                    } else if let Ok(t) = from_utf8(&buf[byte..byte+nbytes]) {
                        // Now that we think it's UTF-8, figure out what kind of char it is
                        match t.chars().next() {
                            Some(' ') => (Space, 0, 1),
                            Some('\t') => (Tab, 0, 1),
                            Some('\x08') => (Backspace, 0, 1),
                            Some(c) => (Other, UnicodeWidthChar::width(c).unwrap_or(0), nbytes),
                            None => {   // invalid char snuck past the utf8_validation_iterator somehow???
                                (Other, 1, 1)
                            },
                        }
                    } else {
-                        nspaces = 0;
+                        // otherwise, it's not valid
-                        safe_write!(&mut output, "{}", ' ');
+                        (Other, 1, 1)       // implicit assumption: non-UTF8 char has display width 1
                    }
-                    col += 1;
+                } else {
-                }
+                    (match buf[byte] {      // always take exactly 1 byte in strict ASCII mode
-                Ok('\t') if nspaces > 0 => {
+                        0x20 => Space,
-                    if is_tabstop(ts, col) {
+                        0x09 => Tab,
-                        nspaces = 0;
+                        0x08 => Backspace,
-                        col += 1;
+                        _ => Other,
-                        safe_write!(&mut output, "{}", '\t');
+                    }, 1, 1)
-                    }
+                };
-                    match to_next_stop(ts, col) {
+
-                        Some(to_next) => {
+                // now figure out how many columns this char takes up, and maybe print it
-                            nspaces += to_next;
+                let tabs_buffered = init || options.aflag;
-                            col += to_next;
+                match ctype {
                    Space | Tab => {    // compute next col, but only write space or tab chars if not buffering
                        col += if ctype == Space {
                            1
                        } else {
                            next_tabstop(ts, col).unwrap_or(1)
                        };
                        if !tabs_buffered {
                            safe_unwrap!(output.write_all(&buf[byte..byte+nbytes]));
                            scol = col;             // now printed up to this column
                        }
-                        None => {
+                    },
-                            col += 1;
+                    Other | Backspace => {  // always 
-                            unexpandspan(&mut output, ts, nspaces, col, init);
+                        // never turn a single space before a non-blank into a tab
-                            nspaces = 0;
+                        // unless it's at the start of the line
-                            safe_write!(&mut output, "{}", '\t');
+                        if (tabs_buffered && pctype != Tab && col > scol + 1) || 
                           (col > scol && (init || (tabs_buffered && pctype == Tab))) {
                            write_tabs(&mut output, ts, scol, col);
                        } else if col > scol {
                            safe_unwrap!(output.write_all(" ".as_bytes()));
                        }
-                    }
+                        init = false;
                        col = if ctype == Other {   // use computed width
                            col + cwidth
                        } else if col > 0 {         // Backspace case, but only if col > 0
                            col - 1
                        } else {
                            0
                        };
                        safe_unwrap!(output.write_all(&buf[byte..byte+nbytes]));
                        scol = col;                 // we've now printed up to this column
                    },
                }
-                Ok('\x08') => { // '\b'
+
-                    if init || options.aflag {
+                byte += nbytes; // move on to next char
-                        unexpandspan(&mut output, ts, nspaces, col, init)
+                pctype = ctype; // save the previous type
                    }
                    nspaces = 0;
                    if col > 0 { col -= 1; }
                    init = false;
                    safe_write!(&mut output, "{}", '\x08');
                }
                Ok('\n') => {
                    if init || options.aflag {
                        unexpandspan(&mut output, ts, nspaces, col, init)
                    }
                    nspaces = 0;
                    col = 0;
                    init = true;
                    safe_write!(&mut output, "{}", '\n');
                }
                Ok(c) => {
                    if init || options.aflag {
                        unexpandspan(&mut output, ts, nspaces, col, init)
                    }
                    nspaces = 0;
                    col += 1;
                    init = false;
                    safe_write!(&mut output, "{}", c);
                }
                Err(_) => break
            }
-        }
+
-        if init || options.aflag {
+            // write out anything remaining
-            unexpandspan(&mut output, ts, nspaces, col, init)
+            if col > scol + 1 || (init && col > scol) {
                write_tabs(&mut output, ts, scol, col);
            } else if col > scol {
                safe_unwrap!(output.write_all(" ".as_bytes()));
            }
            buf.truncate(0);    // clear out the buffer
        }
    }
 }
--- a/test/unexpand.rs
+++ b/test/unexpand.rs
@ -1,74 +1,76 @@
-#![allow(unstable)]
+use std::io::Write;
-
+use std::process::{Command, Stdio};
 use std::old_io::process::Command;
 static PROGNAME: &'static str = "./unexpand";
 fn run(input: &str, args: &[&'static str]) -> Vec<u8> {
-    let mut process = Command::new(PROGNAME).args(args).spawn().unwrap();
+    let mut process = Command::new(PROGNAME)
                                   .args(args)
                                   .stdin(Stdio::piped())
                                   .stdout(Stdio::piped())
                                   .spawn()
                                   .unwrap_or_else(|e| panic!("{}", e));
-    process.stdin.take().unwrap().write_str(input).unwrap();
+    process.stdin.take().unwrap_or_else(|| panic!("Could not take child process stdin"))
        .write_all(input.as_bytes()).unwrap_or_else(|e| panic!("{}", e));
-    let po = match process.wait_with_output() {
+    let po = process.wait_with_output().unwrap_or_else(|e| panic!("{}", e));
-        Ok(p) => p,
+    po.stdout
        Err(err) => panic!("{}", err),
    };
    po.output
 }
 #[test]
 fn unexpand_init_0() {
    let out = run(" 1\n  2\n   3\n    4\n", &["-t4"]);
-    assert_eq!(out.as_slice(), b" 1\n  2\n   3\n\t4\n");
+    assert_eq!(&out[..], b" 1\n  2\n   3\n\t4\n" as &[u8]);
 }
 #[test]
 fn unexpand_init_1() {
    let out = run("     5\n      6\n       7\n        8\n", &["-t4"]);
-    assert_eq!(out.as_slice(), b"\t 5\n\t  6\n\t   7\n\t\t8\n");
+    assert_eq!(&out[..], b"\t 5\n\t  6\n\t   7\n\t\t8\n" as &[u8]);
 }
 #[test]
 fn unexpand_init_list_0() {
    let out = run(" 1\n  2\n   3\n    4\n", &["-t2,4"]);
-    assert_eq!(out.as_slice(), b" 1\n\t2\n\t 3\n\t\t4\n");
+    assert_eq!(&out[..], b" 1\n\t2\n\t 3\n\t\t4\n" as &[u8]);
 }
 #[test]
 fn unexpand_init_list_1() {
    // Once the list is exhausted, spaces are not converted anymore
    let out = run("     5\n      6\n       7\n        8\n", &["-t2,4"]);
-    assert_eq!(out.as_slice(), b"\t\t 5\n\t\t  6\n\t\t   7\n\t\t    8\n");
+    assert_eq!(&out[..], b"\t\t 5\n\t\t  6\n\t\t   7\n\t\t    8\n" as &[u8]);
 }
 #[test]
 fn unexpand_aflag_0() {
-    let out = run("e     E\nf      F\ng       G\nh        H\n", &[]);
+    let out = run("e     E\nf      F\ng       G\nh        H\n", &["--"]);
-    assert_eq!(out.as_slice(), b"e     E\nf      F\ng       G\nh        H\n");
+    assert_eq!(&out[..], b"e     E\nf      F\ng       G\nh        H\n" as &[u8]);
 }
 #[test]
 fn unexpand_aflag_1() {
    let out = run("e     E\nf      F\ng       G\nh        H\n", &["-a"]);
-    assert_eq!(out.as_slice(), b"e     E\nf      F\ng\tG\nh\t H\n");
+    assert_eq!(&out[..], b"e     E\nf      F\ng\tG\nh\t H\n" as &[u8]);
 }
 #[test]
 fn unexpand_aflag_2() {
    let out = run("e     E\nf      F\ng       G\nh        H\n", &["-t8"]);
-    assert_eq!(out.as_slice(), b"e     E\nf      F\ng\tG\nh\t H\n");
+    assert_eq!(&out[..], b"e     E\nf      F\ng\tG\nh\t H\n" as &[u8]);
 }
 #[test]
 fn unexpand_first_only_0() {
    let out = run("        A     B", &["-t3"]);
-    assert_eq!(out.as_slice(), b"\t\t  A\t  B");
+    assert_eq!(&out[..], b"\t\t  A\t  B" as &[u8]);
 }
 #[test]
 fn unexpand_first_only_1() {
    let out = run("        A     B", &["-t3", "--first-only"]);
-    assert_eq!(out.as_slice(), b"\t\t  A     B");
+    assert_eq!(&out[..], b"\t\t  A     B" as &[u8]);
 }
 #[test]
@ -76,20 +78,20 @@ fn unexpand_trailing_space_0() { // evil
    // Individual spaces before fields starting with non blanks should not be
    // converted, unless they are at the beginning of the line.
    let out = run("123 \t1\n123 1\n123 \n123 ", &["-t4"]);
-    assert_eq!(out.as_slice(), b"123\t\t1\n123 1\n123 \n123 ");
+    assert_eq!(&out[..], b"123\t\t1\n123 1\n123 \n123 " as &[u8]);
 }
 #[test]
 fn unexpand_trailing_space_1() { // super evil
    let out = run(" abc d e  f  g ", &["-t1"]);
-    assert_eq!(out.as_slice(), b"\tabc d e\t\tf\t\tg ");
+    assert_eq!(&out[..], b"\tabc d e\t\tf\t\tg " as &[u8]);
 }
 #[test]
 fn unexpand_spaces_follow_tabs_0() {
    // The two first spaces can be included into the first tab.
    let out = run("  \t\t   A", &[]);
-    assert_eq!(out.as_slice(), b"\t\t   A");
+    assert_eq!(&out[..], b"\t\t   A" as &[u8]);
 }
 #[test]
@ -100,6 +102,7 @@ fn unexpand_spaces_follow_tabs_1() { // evil
    //      ' ' -> '\t'         // third tabstop (5)
    // '  B \t' -> '  B \t'     // after the list is exhausted, nothing must change
    let out = run("a \t   B \t", &["-t1,4,5"]);
-    assert_eq!(out.as_slice(), b"a\t\t  B \t");
+    assert_eq!(&out[..], b"a\t\t  B \t" as &[u8]);
 }