fix/rewrite expand

This is a reworked version of expand, done for two main reasons: (1) The previous version assumed the input was UTF-8; this version works with both UTF-8 and non-UTF-8 input. (2) This version adds a new flag, -U (--no-utf8), which forces expand to treat the input as a stream of single-byte ("8-bit ASCII") characters rather than interpreting it as UTF-8. This can be handy in some cases, e.g. for files that are not UTF-8 encoded.
2025-08-02 14:07:46 +00:00 · 2015-04-28 19:12:28 -04:00 · 2015-04-28 19:12:28 -04:00 · 8e2788bd39
commit 8e2788bd39
parent feee266b20
2 changed files with 127 additions and 52 deletions
--- a/src/expand/deps.mk
+++ b/src/expand/deps.mk
@ -0,0 +1 @@
 DEPLIBS += unicode-width
--- a/src/expand/expand.rs
+++ b/src/expand/expand.rs
@ -1,22 +1,28 @@
 #![crate_name = "expand"]
-#![feature(collections, core, old_io, old_path, rustc_private)]
+#![feature(rustc_private, unicode)]
 /*
 * This file is part of the uutils coreutils package.
 *
 * (c) Virgile Andreani <virgile.andreani@anbuco.fr>
 * (c) kwantam <kwantam@gmail.com>
 *     20150428 updated to work with both UTF-8 and non-UTF-8 encodings
 *
 * For the full copyright and license information, please view the LICENSE
 * file that was distributed with this source code.
 */
 #![feature(box_syntax)]
 extern crate getopts;
 extern crate libc;
 extern crate rustc_unicode;
 extern crate unicode_width;
-use std::old_io as io;
+use std::fs::File;
-use std::str::StrExt;
+use std::io::{stdin, stdout, BufRead, BufReader, BufWriter, Read, Write};
 use std::iter::repeat;
 use std::str::from_utf8;
 use rustc_unicode::str::utf8_char_width;
 use unicode_width::UnicodeWidthChar;
 #[path = "../common/util.rs"]
 #[macro_use]
@ -28,7 +34,7 @@ static VERSION: &'static str = "0.0.1";
 static DEFAULT_TABSTOP: usize = 8;
 fn tabstops_parse(s: String) -> Vec<usize> {
-    let words = s.as_slice().split(',').collect::<Vec<&str>>();
+    let words = s.split(',').collect::<Vec<&str>>();
    let nums = words.into_iter()
        .map(|sn| sn.parse::<usize>()
@ -52,7 +58,9 @@ fn tabstops_parse(s: String) -> Vec<usize> {
 struct Options {
    files: Vec<String>,
    tabstops: Vec<usize>,
-    iflag: bool
+    tspaces: String,
    iflag: bool,
    uflag: bool,
 }
 impl Options {
@ -63,6 +71,16 @@ impl Options {
        };
        let iflag = matches.opt_present("i");
        let uflag = !matches.opt_present("U");
        // avoid allocations when dumping out long sequences of spaces
        // by precomputing the longest string of spaces we will ever need
        let nspaces = tabstops.iter().scan(0, |pr,&it| {
            let ret = Some(it - *pr);
            *pr = it;
            ret
        }).max().unwrap();  // length of tabstops is guaranteed >= 1
        let tspaces = repeat(' ').take(nspaces).collect();
        let files =
            if matches.free.is_empty() {
@ -71,7 +89,7 @@ impl Options {
                matches.free
            };
-        Options { files: files, tabstops: tabstops, iflag: iflag }
+        Options { files: files, tabstops: tabstops, tspaces: tspaces, iflag: iflag, uflag: uflag }
    }
 }
@ -80,20 +98,21 @@ pub fn uumain(args: Vec<String>) -> i32 {
        getopts::optflag("i", "initial", "do not convert tabs after non blanks"),
        getopts::optopt("t", "tabs", "have tabs NUMBER characters apart, not 8", "NUMBER"),
        getopts::optopt("t", "tabs", "use comma separated list of explicit tab positions", "LIST"),
        getopts::optflag("U", "no-utf8", "interpret input file as 8-bit ASCII rather than UTF-8"),
        getopts::optflag("h", "help", "display this help and exit"),
        getopts::optflag("V", "version", "output version information and exit"),
    ];
-    let matches = match getopts::getopts(args.tail(), &opts) {
+    let matches = match getopts::getopts(&args[1..], &opts) {
        Ok(m) => m,
        Err(f) => crash!(1, "{}", f)
    };
    if matches.opt_present("help") {
        println!("Usage: {} [OPTION]... [FILE]...", NAME);
-        io::print(getopts::usage(
+        println!("{}", getopts::usage(
            "Convert tabs in each FILE to spaces, writing to standard output.\n\
-            With no FILE, or when FILE is -, read standard input.", &opts).as_slice());
+            With no FILE, or when FILE is -, read standard input.", &opts));
        return 0;
    }
@ -107,64 +126,119 @@ pub fn uumain(args: Vec<String>) -> i32 {
    return 0;
 }
-fn open(path: String) -> io::BufferedReader<Box<Reader+'static>> {
+fn open(path: String) -> BufReader<Box<Read+'static>> {
    let mut file_buf;
-    if path.as_slice() == "-" {
+    if path == "-" {
-        io::BufferedReader::new(box io::stdio::stdin_raw() as Box<Reader>)
+        BufReader::new(Box::new(stdin()) as Box<Read>)
    } else {
-        file_buf = match io::File::open(&Path::new(path.as_slice())) {
+        file_buf = match File::open(&path[..]) {
            Ok(a) => a,
-            _ => crash!(1, "{}: {}\n", path, "No such file or directory")
+            Err(e) => crash!(1, "{}: {}\n", &path[..], e),
        };
-        io::BufferedReader::new(box file_buf as Box<Reader>)
+        BufReader::new(Box::new(file_buf) as Box<Read>)
    }
 }
-fn to_next_stop(tabstops: &[usize], col: usize) -> usize {
+fn next_tabstop(tabstops: &[usize], col: usize) -> usize {
-    match tabstops.as_slice() {
+    if tabstops.len() == 1 {
-        [tabstop] => tabstop - col % tabstop,
+        tabstops[0] - col % tabstops[0]
-        tabstops => match tabstops.iter().skip_while(|&t| *t <= col).next() {
+    } else {
-            Some(&tabstop) => tabstop - col % tabstop,
+        match tabstops.iter().skip_while(|&&t| t <= col).next() {
-            None => 1
+            Some(t) => t - col,
            None => 1,
        }
    }
 }
 #[derive(PartialEq, Eq, Debug)]
 enum CharType {
    Backspace,
    Tab,
    Other,
 }
 fn expand(options: Options) {
-    let mut output = io::stdout();
+    use self::CharType::*;
    let mut output = BufWriter::new(stdout());
    let ts = options.tabstops.as_ref();
    let mut buf = Vec::new();
    for file in options.files.into_iter() {
-        let mut col = 0;
+        let mut fh = open(file);
-        let mut init = true;
+
-        for c in open(file).chars() {
+        while match fh.read_until('\n' as u8, &mut buf) {
-            match c {
+            Ok(s) => s > 0,
-                Ok('\t') if init || !options.iflag => {
+            Err(_) => buf.len() > 0,
-                    let nb_spaces = to_next_stop(options.tabstops.as_slice(), col);
+        } {
-                    col += nb_spaces;
+            let mut col = 0;
-                    safe_write!(&mut output, "{:1$}", "", nb_spaces);
+            let mut byte = 0;
-                }
+            let mut init = true;
-                Ok('\x08') => {
+
-                    if col > 0 {
+            while byte < buf.len() {
-                        col -= 1;
+                let (ctype, cwidth, nbytes) = if options.uflag {
                    let nbytes = utf8_char_width(buf[byte]);
                    if byte + nbytes > buf.len() {
                        // don't overrun buffer because of invalid UTF-8
                        (Other, 1, 1)
                    } else if let Ok(t) = from_utf8(&buf[byte..byte+nbytes]) {
                        match t.chars().next() {
                            Some('\t') => (Tab, 0, nbytes),
                            Some('\x08') => (Backspace, 0, nbytes),
                            Some(c) => (Other, UnicodeWidthChar::width(c).unwrap_or(0), nbytes),
                            None => {   // no valid char at start of t, so take 1 byte
                                (Other, 1, 1)
                            },
                        }
                    } else {
                        (Other, 1, 1)   // implicit assumption: non-UTF-8 char is 1 col wide
                    }
-                    init = false;
+                } else {
-                    safe_write!(&mut output, "{}", '\x08');
+                    (match buf[byte] {   // always take exactly 1 byte in strict ASCII mode
                        0x09 => Tab,
                        0x08 => Backspace,
                        _ => Other,
                    }, 1, 1)
                };
                // figure out how many columns this char takes up
                match ctype {
                    Tab => {
                        // figure out how many spaces to the next tabstop
                        let nts = next_tabstop(ts, col);
                        col += nts;
                        // now dump out either spaces if we're expanding, or a literal tab if we're not
                        if init || !options.iflag {
                            safe_unwrap!(output.write_all(&options.tspaces[..nts].as_bytes()));
                        } else {
                            safe_unwrap!(output.write_all(&buf[byte..byte+nbytes]));
                        }
                    },
                    _ => {
                        col = if ctype == Other {
                            col + cwidth
                        } else if col > 0 {
                            col - 1
                        } else {
                            0
                        };
                        // if we're writing anything other than a space, then we're
                        // done with the line's leading spaces
                        if buf[byte] != 0x20 {
                            init = false;
                        }
                        safe_unwrap!(output.write_all(&buf[byte..byte+nbytes]));
                    },
                }
-                Ok('\n') =>  {
+
-                    col = 0;
+                byte += nbytes; // advance the pointer
                    init = true;
                    safe_write!(&mut output, "{}", '\n');
                }
                Ok(c) => {
                    col += 1;
                    if c != ' ' {
                        init = false;
                    }
                    safe_write!(&mut output, "{}", c);
                }
                Err(_) => break
            }
            buf.truncate(0);    // clear the buffer
        }
    }
 }