fix/rewrite unexpand and its tests

This is a reworked version of unexpand. I did this for three main reasons: 1. The previous version of unexpand did not compute tabstops correctly when the `-a` flag was supplied. 2. The previous version assumed the input was UTF-8; this version also works with non-UTF-8 inputs. 3. This version adds a new flag, -U, which forces unexpand to treat input as 8-bit ASCII rather than interpreting it as UTF-8. This might be handy in some cases.
2025-08-02 05:57:46 +00:00 · 2015-04-28 17:33:31 -04:00 · 2015-04-28 17:33:31 -04:00 · feee266b20
commit feee266b20
parent ec4e3a60e4
3 changed files with 196 additions and 127 deletions
--- a/src/unexpand/deps.mk
+++ b/src/unexpand/deps.mk
@ -0,0 +1 @@
+DEPLIBS += unicode-width
--- a/src/unexpand/unexpand.rs
+++ b/src/unexpand/unexpand.rs
@ -1,10 +1,12 @@
 #![crate_name = "unexpand"]
-#![feature(collections, core, old_io, old_path, rustc_private)]
+#![feature(rustc_private, unicode)]

 /*
 * This file is part of the uutils coreutils package.
 *
 * (c) Virgile Andreani <virgile.andreani@anbuco.fr>
+ * (c) kwantam <kwantam@gmail.com>
+ *     20150428 updated to work with both UTF-8 and non-UTF-8 encodings
 *
 * For the full copyright and license information, please view the LICENSE
 * file that was distributed with this source code.
@ -12,8 +14,14 @@

 extern crate getopts;
 extern crate libc;
+extern crate rustc_unicode;
+extern crate unicode_width;

-use std::old_io as io;
+use std::fs::File;
+use std::io::{stdin, stdout, BufRead, BufReader, BufWriter, Read, Stdout, Write};
+use std::str::from_utf8;
+use rustc_unicode::str::utf8_char_width;
+use unicode_width::UnicodeWidthChar;

 #[path = "../common/util.rs"]
 #[macro_use]
@ -25,7 +33,7 @@ static VERSION: &'static str = "0.0.1";
 static DEFAULT_TABSTOP: usize = 8;

 fn tabstops_parse(s: String) -> Vec<usize> {
-    let words = s.as_slice().split(',').collect::<Vec<&str>>();
+    let words = s.split(',').collect::<Vec<&str>>();

    let nums = words.into_iter()
        .map(|sn| sn.parse()
@ -49,7 +57,8 @@ fn tabstops_parse(s: String) -> Vec<usize> {
 struct Options {
    files: Vec<String>,
    tabstops: Vec<usize>,
-    aflag: bool
+    aflag: bool,
+    uflag: bool,
 }

 impl Options {
@ -61,6 +70,7 @@ impl Options {

        let aflag = (matches.opt_present("all") || matches.opt_present("tabs"))
                    && !matches.opt_present("first-only");
+        let uflag = !matches.opt_present("U");

        let files =
            if matches.free.is_empty() {
@ -69,7 +79,7 @@ impl Options {
                matches.free
            };

-        Options { files: files, tabstops: tabstops, aflag: aflag }
+        Options { files: files, tabstops: tabstops, aflag: aflag, uflag: uflag }
    }
 }

@ -79,20 +89,21 @@ pub fn uumain(args: Vec<String>) -> i32 {
        getopts::optflag("", "first-only", "convert only leading sequences of blanks (overrides -a)"),
        getopts::optopt("t", "tabs", "have tabs N characters apart instead of 8 (enables -a)", "N"),
        getopts::optopt("t", "tabs", "use comma separated LIST of tab positions (enables -a)", "LIST"),
+        getopts::optflag("U", "no-utf8", "interpret input file as 8-bit ASCII rather than UTF-8"),
        getopts::optflag("h", "help", "display this help and exit"),
        getopts::optflag("V", "version", "output version information and exit"),
    ];

-    let matches = match getopts::getopts(args.tail(), &opts) {
+    let matches = match getopts::getopts(&args[1..], &opts) {
        Ok(m) => m,
        Err(f) => crash!(1, "{}", f)
    };

    if matches.opt_present("help") {
        println!("Usage: {} [OPTION]... [FILE]...", NAME);
-        io::print(getopts::usage(
+        println!("{}", getopts::usage(
            "Convert blanks in each FILE to tabs, writing to standard output.\n\
-            With no FILE, or when FILE is -, read standard input.", &opts).as_slice());
+            With no FILE, or when FILE is -, read standard input.", &opts));
        return 0;
    }

@ -106,121 +117,175 @@ pub fn uumain(args: Vec<String>) -> i32 {
    return 0;
 }

-fn open(path: String) -> io::BufferedReader<Box<Reader+'static>> {
+fn open(path: String) -> BufReader<Box<Read+'static>> {
    let mut file_buf;
-    if path.as_slice() == "-" {
-        io::BufferedReader::new(Box::new(io::stdio::stdin_raw()) as Box<Reader>)
+    if path == "-" {
+        BufReader::new(Box::new(stdin()) as Box<Read>)
    } else {
-        file_buf = match io::File::open(&Path::new(path.as_slice())) {
+        file_buf = match File::open(&path[..]) {
            Ok(a) => a,
-            _ => crash!(1, "{}: {}\n", path, "No such file or directory")
+            Err(e) => crash!(1, "{}: {}", &path[..], e),
        };
-        io::BufferedReader::new(Box::new(file_buf) as Box<Reader>)
+        BufReader::new(Box::new(file_buf) as Box<Read>)
    }
 }

-fn is_tabstop(tabstops: &[usize], col: usize) -> bool {
-    match tabstops {
-        [tabstop] => col % tabstop == 0,
-        tabstops => tabstops.binary_search_by(|&e| e.cmp(&col)).is_ok()
+fn next_tabstop(tabstops: &[usize], col: usize) -> Option<usize> {
+    if tabstops.len() == 1 {
+        Some(tabstops[0] - col % tabstops[0])
+    } else {
+        // find next larger tab
+        match tabstops.iter().skip_while(|&&t| t <= col).next() {
+            Some(t) => Some(t - col),
+            None => None,   // if there isn't one in the list, tab becomes a single space
+        }
    }
 }

-fn to_next_stop(tabstops: &[usize], col: usize) -> Option<usize> {
-    match tabstops {
-        [tabstop] => Some(tabstop - col % tabstop),
-        tabstops => tabstops.iter().skip_while(|&t| *t <= col).next()
-            .map(|&tabstop| tabstop - col % tabstop)
+fn write_tabs(mut output: &mut BufWriter<Stdout>, tabstops: &[usize], mut scol: usize, col: usize) {
+    while let Some(nts) = next_tabstop(tabstops, scol) {
+        if col < scol + nts {
+            break;
+        }
+
+        safe_unwrap!(output.write_all("\t".as_bytes()));
+        scol += nts;
+    }
+
+    while col > scol {
+        safe_unwrap!(output.write_all(" ".as_bytes()));
+        scol += 1;
    }
 }

-fn unexpandspan(mut output: &mut io::LineBufferedWriter<io::stdio::StdWriter>,
-                tabstops: &[usize], nspaces: usize, col: usize, init: bool) {
-    let mut cur = col - nspaces;
-    if nspaces > 1 || init {
-        loop {
-            match to_next_stop(tabstops, cur) {
-                Some(to_next) if cur + to_next <= col => {
-                        safe_write!(&mut output, "{}", '\t');
-                        cur += to_next;
-                    }
-                _ => break
-            }
-        }
-    }
-    safe_write!(&mut output, "{:1$}", "", col - cur);
+#[derive(PartialEq, Eq, Debug)]
+enum CharType {
+    Backspace,
+    Space,
+    Tab,
+    Other,
 }

 fn unexpand(options: Options) {
-    let mut output = io::stdout();
-    let ts = options.tabstops.as_slice();
+    use self::CharType::*;
+
+    let mut output = BufWriter::new(stdout());
+    let ts = &options.tabstops[..];
+    let mut buf = Vec::new();
+    let lastcol = if ts.len() > 1 {
+        *ts.last().unwrap()
+    } else {
+        0
+    };

    for file in options.files.into_iter() {
-        let mut col = 0;
-        let mut nspaces = 0;
-        let mut init = true;
-        for c in open(file).chars() {
-            match c {
-                Ok(' ') => {
-                    if init || options.aflag {
-                        nspaces += 1;
+        let mut fh = open(file);
+
+        while match fh.read_until('\n' as u8, &mut buf) {
+            Ok(s) => s > 0,
+            Err(_) => buf.len() > 0,
+        } {
+            let mut byte = 0;       // offset into the buffer
+            let mut col = 0;        // the current column
+            let mut scol = 0;       // the start col for the current span, i.e., the already-printed width
+            let mut init = true;    // are we at the start of the line?
+            let mut pctype = Other;
+
+            while byte < buf.len() {
+                // when we have a finite number of columns, never convert past the last column
+                if lastcol > 0 && col >= lastcol {
+                    if (pctype != Tab && col > scol + 1) || 
+                       (col > scol && (init || pctype == Tab)) {
+                        write_tabs(&mut output, ts, scol, col);
+                    } else if col > scol {
+                        safe_unwrap!(output.write_all(" ".as_bytes()));
+                    }
+                    scol = col;
+
+                    safe_unwrap!(output.write_all(&buf[byte..]));
+                    break;
+                }
+
+                let (ctype, cwidth, nbytes) = if options.uflag {
+                    let nbytes = utf8_char_width(buf[byte]);
+
+                    // figure out how big the next char is, if it's UTF-8
+                    if byte + nbytes > buf.len() {
+                        // make sure we don't overrun the buffer because of invalid UTF-8
+                        (Other, 1, 1)
+                    } else if let Ok(t) = from_utf8(&buf[byte..byte+nbytes]) {
+                        // Now that we think it's UTF-8, figure out what kind of char it is
+                        match t.chars().next() {
+                            Some(' ') => (Space, 0, 1),
+                            Some('\t') => (Tab, 0, 1),
+                            Some('\x08') => (Backspace, 0, 1),
+                            Some(c) => (Other, UnicodeWidthChar::width(c).unwrap_or(0), nbytes),
+                            None => {   // invalid char snuck past the utf8_validation_iterator somehow???
+                                (Other, 1, 1)
+                            },
+                        }
                    } else {
-                        nspaces = 0;
-                        safe_write!(&mut output, "{}", ' ');
+                        // otherwise, it's not valid
+                        (Other, 1, 1)       // implicit assumption: non-UTF8 char has display width 1
                    }
-                    col += 1;
+                } else {
+                    (match buf[byte] {      // always take exactly 1 byte in strict ASCII mode
+                        0x20 => Space,
+                        0x09 => Tab,
+                        0x08 => Backspace,
+                        _ => Other,
+                    }, 1, 1)
+                };
+
+                // now figure out how many columns this char takes up, and maybe print it
+                let tabs_buffered = init || options.aflag;
+                match ctype {
+                    Space | Tab => {    // compute next col, but only write space or tab chars if not buffering
+                        col += if ctype == Space {
+                            1
+                        } else {
+                            next_tabstop(ts, col).unwrap_or(1)
+                        };
+
+                        if !tabs_buffered {
+                            safe_unwrap!(output.write_all(&buf[byte..byte+nbytes]));
+                            scol = col;             // now printed up to this column
                        }
-                Ok('\t') if nspaces > 0 => {
-                    if is_tabstop(ts, col) {
-                        nspaces = 0;
-                        col += 1;
-                        safe_write!(&mut output, "{}", '\t');
+                    },
+                    Other | Backspace => {  // always 
+                        // never turn a single space before a non-blank into a tab
+                        // unless it's at the start of the line
+                        if (tabs_buffered && pctype != Tab && col > scol + 1) || 
+                           (col > scol && (init || (tabs_buffered && pctype == Tab))) {
+                            write_tabs(&mut output, ts, scol, col);
+                        } else if col > scol {
+                            safe_unwrap!(output.write_all(" ".as_bytes()));
                        }
-                    match to_next_stop(ts, col) {
-                        Some(to_next) => {
-                            nspaces += to_next;
-                            col += to_next;
-                        }
-                        None => {
-                            col += 1;
-                            unexpandspan(&mut output, ts, nspaces, col, init);
-                            nspaces = 0;
-                            safe_write!(&mut output, "{}", '\t');
-                        }
-                    }
-                }
-                Ok('\x08') => { // '\b'
-                    if init || options.aflag {
-                        unexpandspan(&mut output, ts, nspaces, col, init)
-                    }
-                    nspaces = 0;
-                    if col > 0 { col -= 1; }
                        init = false;
-                    safe_write!(&mut output, "{}", '\x08');
+                        col = if ctype == Other {   // use computed width
+                            col + cwidth
+                        } else if col > 0 {         // Backspace case, but only if col > 0
+                            col - 1
+                        } else {
+                            0
+                        };
+                        safe_unwrap!(output.write_all(&buf[byte..byte+nbytes]));
+                        scol = col;                 // we've now printed up to this column
+                    },
                }
-                Ok('\n') => {
-                    if init || options.aflag {
-                        unexpandspan(&mut output, ts, nspaces, col, init)
+
+                byte += nbytes; // move on to next char
+                pctype = ctype; // save the previous type
            }
-                    nspaces = 0;
-                    col = 0;
-                    init = true;
-                    safe_write!(&mut output, "{}", '\n');
+
+            // write out anything remaining
+            if col > scol + 1 || (init && col > scol) {
+                write_tabs(&mut output, ts, scol, col);
+            } else if col > scol {
+                safe_unwrap!(output.write_all(" ".as_bytes()));
            }
-                Ok(c) => {
-                    if init || options.aflag {
-                        unexpandspan(&mut output, ts, nspaces, col, init)
-                    }
-                    nspaces = 0;
-                    col += 1;
-                    init = false;
-                    safe_write!(&mut output, "{}", c);
-                }
-                Err(_) => break
-            }
-        }
-        if init || options.aflag {
-            unexpandspan(&mut output, ts, nspaces, col, init)
+
+            buf.truncate(0);    // clear out the buffer
        }
    }
 }
--- a/test/unexpand.rs
+++ b/test/unexpand.rs
@ -1,74 +1,76 @@
-#![allow(unstable)]
-
-use std::old_io::process::Command;
+use std::io::Write;
+use std::process::{Command, Stdio};

 static PROGNAME: &'static str = "./unexpand";

 fn run(input: &str, args: &[&'static str]) -> Vec<u8> {
-    let mut process = Command::new(PROGNAME).args(args).spawn().unwrap();
+    let mut process = Command::new(PROGNAME)
+                                   .args(args)
+                                   .stdin(Stdio::piped())
+                                   .stdout(Stdio::piped())
+                                   .spawn()
+                                   .unwrap_or_else(|e| panic!("{}", e));

-    process.stdin.take().unwrap().write_str(input).unwrap();
+    process.stdin.take().unwrap_or_else(|| panic!("Could not take child process stdin"))
+        .write_all(input.as_bytes()).unwrap_or_else(|e| panic!("{}", e));

-    let po = match process.wait_with_output() {
-        Ok(p) => p,
-        Err(err) => panic!("{}", err),
-    };
-    po.output
+    let po = process.wait_with_output().unwrap_or_else(|e| panic!("{}", e));
+    po.stdout
 }

 #[test]
 fn unexpand_init_0() {
    let out = run(" 1\n  2\n   3\n    4\n", &["-t4"]);
-    assert_eq!(out.as_slice(), b" 1\n  2\n   3\n\t4\n");
+    assert_eq!(&out[..], b" 1\n  2\n   3\n\t4\n" as &[u8]);
 }

 #[test]
 fn unexpand_init_1() {
    let out = run("     5\n      6\n       7\n        8\n", &["-t4"]);
-    assert_eq!(out.as_slice(), b"\t 5\n\t  6\n\t   7\n\t\t8\n");
+    assert_eq!(&out[..], b"\t 5\n\t  6\n\t   7\n\t\t8\n" as &[u8]);
 }

 #[test]
 fn unexpand_init_list_0() {
    let out = run(" 1\n  2\n   3\n    4\n", &["-t2,4"]);
-    assert_eq!(out.as_slice(), b" 1\n\t2\n\t 3\n\t\t4\n");
+    assert_eq!(&out[..], b" 1\n\t2\n\t 3\n\t\t4\n" as &[u8]);
 }

 #[test]
 fn unexpand_init_list_1() {
    // Once the list is exhausted, spaces are not converted anymore
    let out = run("     5\n      6\n       7\n        8\n", &["-t2,4"]);
-    assert_eq!(out.as_slice(), b"\t\t 5\n\t\t  6\n\t\t   7\n\t\t    8\n");
+    assert_eq!(&out[..], b"\t\t 5\n\t\t  6\n\t\t   7\n\t\t    8\n" as &[u8]);
 }

 #[test]
 fn unexpand_aflag_0() {
-    let out = run("e     E\nf      F\ng       G\nh        H\n", &[]);
-    assert_eq!(out.as_slice(), b"e     E\nf      F\ng       G\nh        H\n");
+    let out = run("e     E\nf      F\ng       G\nh        H\n", &["--"]);
+    assert_eq!(&out[..], b"e     E\nf      F\ng       G\nh        H\n" as &[u8]);
 }

 #[test]
 fn unexpand_aflag_1() {
    let out = run("e     E\nf      F\ng       G\nh        H\n", &["-a"]);
-    assert_eq!(out.as_slice(), b"e     E\nf      F\ng\tG\nh\t H\n");
+    assert_eq!(&out[..], b"e     E\nf      F\ng\tG\nh\t H\n" as &[u8]);
 }

 #[test]
 fn unexpand_aflag_2() {
    let out = run("e     E\nf      F\ng       G\nh        H\n", &["-t8"]);
-    assert_eq!(out.as_slice(), b"e     E\nf      F\ng\tG\nh\t H\n");
+    assert_eq!(&out[..], b"e     E\nf      F\ng\tG\nh\t H\n" as &[u8]);
 }

 #[test]
 fn unexpand_first_only_0() {
    let out = run("        A     B", &["-t3"]);
-    assert_eq!(out.as_slice(), b"\t\t  A\t  B");
+    assert_eq!(&out[..], b"\t\t  A\t  B" as &[u8]);
 }

 #[test]
 fn unexpand_first_only_1() {
    let out = run("        A     B", &["-t3", "--first-only"]);
-    assert_eq!(out.as_slice(), b"\t\t  A     B");
+    assert_eq!(&out[..], b"\t\t  A     B" as &[u8]);
 }

 #[test]
@ -76,20 +78,20 @@ fn unexpand_trailing_space_0() { // evil
    // Individual spaces before fields starting with non blanks should not be
    // converted, unless they are at the beginning of the line.
    let out = run("123 \t1\n123 1\n123 \n123 ", &["-t4"]);
-    assert_eq!(out.as_slice(), b"123\t\t1\n123 1\n123 \n123 ");
+    assert_eq!(&out[..], b"123\t\t1\n123 1\n123 \n123 " as &[u8]);
 }

 #[test]
 fn unexpand_trailing_space_1() { // super evil
    let out = run(" abc d e  f  g ", &["-t1"]);
-    assert_eq!(out.as_slice(), b"\tabc d e\t\tf\t\tg ");
+    assert_eq!(&out[..], b"\tabc d e\t\tf\t\tg " as &[u8]);
 }

 #[test]
 fn unexpand_spaces_follow_tabs_0() {
    // The two first spaces can be included into the first tab.
    let out = run("  \t\t   A", &[]);
-    assert_eq!(out.as_slice(), b"\t\t   A");
+    assert_eq!(&out[..], b"\t\t   A" as &[u8]);
 }

 #[test]
@ -100,6 +102,7 @@ fn unexpand_spaces_follow_tabs_1() { // evil
    //      ' ' -> '\t'         // third tabstop (5)
    // '  B \t' -> '  B \t'     // after the list is exhausted, nothing must change
    let out = run("a \t   B \t", &["-t1,4,5"]);
-    assert_eq!(out.as_slice(), b"a\t\t  B \t");
+    assert_eq!(&out[..], b"a\t\t  B \t" as &[u8]);
 }

+