diff --git a/src/unexpand/deps.mk b/src/unexpand/deps.mk new file mode 100644 index 000000000..fb8005c0c --- /dev/null +++ b/src/unexpand/deps.mk @@ -0,0 +1 @@ +DEPLIBS += unicode-width diff --git a/src/unexpand/unexpand.rs b/src/unexpand/unexpand.rs index 4f05dc04b..3761c3477 100644 --- a/src/unexpand/unexpand.rs +++ b/src/unexpand/unexpand.rs @@ -1,10 +1,12 @@ #![crate_name = "unexpand"] -#![feature(collections, core, old_io, old_path, rustc_private)] +#![feature(rustc_private, unicode)] /* * This file is part of the uutils coreutils package. * * (c) Virgile Andreani + * (c) kwantam + * 20150428 updated to work with both UTF-8 and non-UTF-8 encodings * * For the full copyright and license information, please view the LICENSE * file that was distributed with this source code. @@ -12,8 +14,14 @@ extern crate getopts; extern crate libc; +extern crate rustc_unicode; +extern crate unicode_width; -use std::old_io as io; +use std::fs::File; +use std::io::{stdin, stdout, BufRead, BufReader, BufWriter, Read, Stdout, Write}; +use std::str::from_utf8; +use rustc_unicode::str::utf8_char_width; +use unicode_width::UnicodeWidthChar; #[path = "../common/util.rs"] #[macro_use] @@ -25,7 +33,7 @@ static VERSION: &'static str = "0.0.1"; static DEFAULT_TABSTOP: usize = 8; fn tabstops_parse(s: String) -> Vec { - let words = s.as_slice().split(',').collect::>(); + let words = s.split(',').collect::>(); let nums = words.into_iter() .map(|sn| sn.parse() @@ -49,7 +57,8 @@ fn tabstops_parse(s: String) -> Vec { struct Options { files: Vec, tabstops: Vec, - aflag: bool + aflag: bool, + uflag: bool, } impl Options { @@ -61,6 +70,7 @@ impl Options { let aflag = (matches.opt_present("all") || matches.opt_present("tabs")) && !matches.opt_present("first-only"); + let uflag = !matches.opt_present("U"); let files = if matches.free.is_empty() { @@ -69,7 +79,7 @@ impl Options { matches.free }; - Options { files: files, tabstops: tabstops, aflag: aflag } + Options { files: files, tabstops: tabstops, aflag: aflag, uflag: uflag } } } @@ -79,20 +89,21 @@ pub fn uumain(args: Vec) -> i32 { getopts::optflag("", "first-only", "convert only leading sequences of blanks (overrides -a)"), getopts::optopt("t", "tabs", "have tabs N characters apart instead of 8 (enables -a)", "N"), getopts::optopt("t", "tabs", "use comma separated LIST of tab positions (enables -a)", "LIST"), + getopts::optflag("U", "no-utf8", "interpret input file as 8-bit ASCII rather than UTF-8"), getopts::optflag("h", "help", "display this help and exit"), getopts::optflag("V", "version", "output version information and exit"), ]; - let matches = match getopts::getopts(args.tail(), &opts) { + let matches = match getopts::getopts(&args[1..], &opts) { Ok(m) => m, Err(f) => crash!(1, "{}", f) }; if matches.opt_present("help") { println!("Usage: {} [OPTION]... [FILE]...", NAME); - io::print(getopts::usage( + println!("{}", getopts::usage( "Convert blanks in each FILE to tabs, writing to standard output.\n\ - With no FILE, or when FILE is -, read standard input.", &opts).as_slice()); + With no FILE, or when FILE is -, read standard input.", &opts)); return 0; } @@ -106,121 +117,175 @@ pub fn uumain(args: Vec) -> i32 { return 0; } -fn open(path: String) -> io::BufferedReader> { +fn open(path: String) -> BufReader> { let mut file_buf; - if path.as_slice() == "-" { - io::BufferedReader::new(Box::new(io::stdio::stdin_raw()) as Box) + if path == "-" { + BufReader::new(Box::new(stdin()) as Box) } else { - file_buf = match io::File::open(&Path::new(path.as_slice())) { + file_buf = match File::open(&path[..]) { Ok(a) => a, - _ => crash!(1, "{}: {}\n", path, "No such file or directory") + Err(e) => crash!(1, "{}: {}", &path[..], e), }; - io::BufferedReader::new(Box::new(file_buf) as Box) + BufReader::new(Box::new(file_buf) as Box) } } -fn is_tabstop(tabstops: &[usize], col: usize) -> bool { - match tabstops { - [tabstop] => col % tabstop == 0, - tabstops => tabstops.binary_search_by(|&e| e.cmp(&col)).is_ok() - } -} - -fn to_next_stop(tabstops: &[usize], col: usize) -> Option { - match tabstops { - [tabstop] => Some(tabstop - col % tabstop), - tabstops => tabstops.iter().skip_while(|&t| *t <= col).next() - .map(|&tabstop| tabstop - col % tabstop) - } -} - -fn unexpandspan(mut output: &mut io::LineBufferedWriter, - tabstops: &[usize], nspaces: usize, col: usize, init: bool) { - let mut cur = col - nspaces; - if nspaces > 1 || init { - loop { - match to_next_stop(tabstops, cur) { - Some(to_next) if cur + to_next <= col => { - safe_write!(&mut output, "{}", '\t'); - cur += to_next; - } - _ => break - } +fn next_tabstop(tabstops: &[usize], col: usize) -> Option { + if tabstops.len() == 1 { + Some(tabstops[0] - col % tabstops[0]) + } else { + // find next larger tab + match tabstops.iter().skip_while(|&&t| t <= col).next() { + Some(t) => Some(t - col), + None => None, // if there isn't one in the list, tab becomes a single space } } - safe_write!(&mut output, "{:1$}", "", col - cur); +} + +fn write_tabs(mut output: &mut BufWriter, tabstops: &[usize], mut scol: usize, col: usize) { + while let Some(nts) = next_tabstop(tabstops, scol) { + if col < scol + nts { + break; + } + + safe_unwrap!(output.write_all("\t".as_bytes())); + scol += nts; + } + + while col > scol { + safe_unwrap!(output.write_all(" ".as_bytes())); + scol += 1; + } +} + +#[derive(PartialEq, Eq, Debug)] +enum CharType { + Backspace, + Space, + Tab, + Other, } fn unexpand(options: Options) { - let mut output = io::stdout(); - let ts = options.tabstops.as_slice(); + use self::CharType::*; + + let mut output = BufWriter::new(stdout()); + let ts = &options.tabstops[..]; + let mut buf = Vec::new(); + let lastcol = if ts.len() > 1 { + *ts.last().unwrap() + } else { + 0 + }; for file in options.files.into_iter() { - let mut col = 0; - let mut nspaces = 0; - let mut init = true; - for c in open(file).chars() { - match c { - Ok(' ') => { - if init || options.aflag { - nspaces += 1; + let mut fh = open(file); + + while match fh.read_until('\n' as u8, &mut buf) { + Ok(s) => s > 0, + Err(_) => buf.len() > 0, + } { + let mut byte = 0; // offset into the buffer + let mut col = 0; // the current column + let mut scol = 0; // the start col for the current span, i.e., the already-printed width + let mut init = true; // are we at the start of the line? + let mut pctype = Other; + + while byte < buf.len() { + // when we have a finite number of columns, never convert past the last column + if lastcol > 0 && col >= lastcol { + if (pctype != Tab && col > scol + 1) || + (col > scol && (init || pctype == Tab)) { + write_tabs(&mut output, ts, scol, col); + } else if col > scol { + safe_unwrap!(output.write_all(" ".as_bytes())); + } + scol = col; + + safe_unwrap!(output.write_all(&buf[byte..])); + break; + } + + let (ctype, cwidth, nbytes) = if options.uflag { + let nbytes = utf8_char_width(buf[byte]); + + // figure out how big the next char is, if it's UTF-8 + if byte + nbytes > buf.len() { + // make sure we don't overrun the buffer because of invalid UTF-8 + (Other, 1, 1) + } else if let Ok(t) = from_utf8(&buf[byte..byte+nbytes]) { + // Now that we think it's UTF-8, figure out what kind of char it is + match t.chars().next() { + Some(' ') => (Space, 0, 1), + Some('\t') => (Tab, 0, 1), + Some('\x08') => (Backspace, 0, 1), + Some(c) => (Other, UnicodeWidthChar::width(c).unwrap_or(0), nbytes), + None => { // invalid char snuck past the utf8_validation_iterator somehow??? + (Other, 1, 1) + }, + } } else { - nspaces = 0; - safe_write!(&mut output, "{}", ' '); + // otherwise, it's not valid + (Other, 1, 1) // implicit assumption: non-UTF8 char has display width 1 } - col += 1; - } - Ok('\t') if nspaces > 0 => { - if is_tabstop(ts, col) { - nspaces = 0; - col += 1; - safe_write!(&mut output, "{}", '\t'); - } - match to_next_stop(ts, col) { - Some(to_next) => { - nspaces += to_next; - col += to_next; + } else { + (match buf[byte] { // always take exactly 1 byte in strict ASCII mode + 0x20 => Space, + 0x09 => Tab, + 0x08 => Backspace, + _ => Other, + }, 1, 1) + }; + + // now figure out how many columns this char takes up, and maybe print it + let tabs_buffered = init || options.aflag; + match ctype { + Space | Tab => { // compute next col, but only write space or tab chars if not buffering + col += if ctype == Space { + 1 + } else { + next_tabstop(ts, col).unwrap_or(1) + }; + + if !tabs_buffered { + safe_unwrap!(output.write_all(&buf[byte..byte+nbytes])); + scol = col; // now printed up to this column } - None => { - col += 1; - unexpandspan(&mut output, ts, nspaces, col, init); - nspaces = 0; - safe_write!(&mut output, "{}", '\t'); + }, + Other | Backspace => { // always + // never turn a single space before a non-blank into a tab + // unless it's at the start of the line + if (tabs_buffered && pctype != Tab && col > scol + 1) || + (col > scol && (init || (tabs_buffered && pctype == Tab))) { + write_tabs(&mut output, ts, scol, col); + } else if col > scol { + safe_unwrap!(output.write_all(" ".as_bytes())); } - } + init = false; + col = if ctype == Other { // use computed width + col + cwidth + } else if col > 0 { // Backspace case, but only if col > 0 + col - 1 + } else { + 0 + }; + safe_unwrap!(output.write_all(&buf[byte..byte+nbytes])); + scol = col; // we've now printed up to this column + }, } - Ok('\x08') => { // '\b' - if init || options.aflag { - unexpandspan(&mut output, ts, nspaces, col, init) - } - nspaces = 0; - if col > 0 { col -= 1; } - init = false; - safe_write!(&mut output, "{}", '\x08'); - } - Ok('\n') => { - if init || options.aflag { - unexpandspan(&mut output, ts, nspaces, col, init) - } - nspaces = 0; - col = 0; - init = true; - safe_write!(&mut output, "{}", '\n'); - } - Ok(c) => { - if init || options.aflag { - unexpandspan(&mut output, ts, nspaces, col, init) - } - nspaces = 0; - col += 1; - init = false; - safe_write!(&mut output, "{}", c); - } - Err(_) => break + + byte += nbytes; // move on to next char + pctype = ctype; // save the previous type } - } - if init || options.aflag { - unexpandspan(&mut output, ts, nspaces, col, init) + + // write out anything remaining + if col > scol + 1 || (init && col > scol) { + write_tabs(&mut output, ts, scol, col); + } else if col > scol { + safe_unwrap!(output.write_all(" ".as_bytes())); + } + + buf.truncate(0); // clear out the buffer } } } diff --git a/test/unexpand.rs b/test/unexpand.rs index 92377fad2..8776a6fe2 100644 --- a/test/unexpand.rs +++ b/test/unexpand.rs @@ -1,74 +1,76 @@ -#![allow(unstable)] - -use std::old_io::process::Command; +use std::io::Write; +use std::process::{Command, Stdio}; static PROGNAME: &'static str = "./unexpand"; fn run(input: &str, args: &[&'static str]) -> Vec { - let mut process = Command::new(PROGNAME).args(args).spawn().unwrap(); + let mut process = Command::new(PROGNAME) + .args(args) + .stdin(Stdio::piped()) + .stdout(Stdio::piped()) + .spawn() + .unwrap_or_else(|e| panic!("{}", e)); - process.stdin.take().unwrap().write_str(input).unwrap(); + process.stdin.take().unwrap_or_else(|| panic!("Could not take child process stdin")) + .write_all(input.as_bytes()).unwrap_or_else(|e| panic!("{}", e)); - let po = match process.wait_with_output() { - Ok(p) => p, - Err(err) => panic!("{}", err), - }; - po.output + let po = process.wait_with_output().unwrap_or_else(|e| panic!("{}", e)); + po.stdout } #[test] fn unexpand_init_0() { let out = run(" 1\n 2\n 3\n 4\n", &["-t4"]); - assert_eq!(out.as_slice(), b" 1\n 2\n 3\n\t4\n"); + assert_eq!(&out[..], b" 1\n 2\n 3\n\t4\n" as &[u8]); } #[test] fn unexpand_init_1() { let out = run(" 5\n 6\n 7\n 8\n", &["-t4"]); - assert_eq!(out.as_slice(), b"\t 5\n\t 6\n\t 7\n\t\t8\n"); + assert_eq!(&out[..], b"\t 5\n\t 6\n\t 7\n\t\t8\n" as &[u8]); } #[test] fn unexpand_init_list_0() { let out = run(" 1\n 2\n 3\n 4\n", &["-t2,4"]); - assert_eq!(out.as_slice(), b" 1\n\t2\n\t 3\n\t\t4\n"); + assert_eq!(&out[..], b" 1\n\t2\n\t 3\n\t\t4\n" as &[u8]); } #[test] fn unexpand_init_list_1() { // Once the list is exhausted, spaces are not converted anymore let out = run(" 5\n 6\n 7\n 8\n", &["-t2,4"]); - assert_eq!(out.as_slice(), b"\t\t 5\n\t\t 6\n\t\t 7\n\t\t 8\n"); + assert_eq!(&out[..], b"\t\t 5\n\t\t 6\n\t\t 7\n\t\t 8\n" as &[u8]); } #[test] fn unexpand_aflag_0() { - let out = run("e E\nf F\ng G\nh H\n", &[]); - assert_eq!(out.as_slice(), b"e E\nf F\ng G\nh H\n"); + let out = run("e E\nf F\ng G\nh H\n", &["--"]); + assert_eq!(&out[..], b"e E\nf F\ng G\nh H\n" as &[u8]); } #[test] fn unexpand_aflag_1() { let out = run("e E\nf F\ng G\nh H\n", &["-a"]); - assert_eq!(out.as_slice(), b"e E\nf F\ng\tG\nh\t H\n"); + assert_eq!(&out[..], b"e E\nf F\ng\tG\nh\t H\n" as &[u8]); } #[test] fn unexpand_aflag_2() { let out = run("e E\nf F\ng G\nh H\n", &["-t8"]); - assert_eq!(out.as_slice(), b"e E\nf F\ng\tG\nh\t H\n"); + assert_eq!(&out[..], b"e E\nf F\ng\tG\nh\t H\n" as &[u8]); } #[test] fn unexpand_first_only_0() { let out = run(" A B", &["-t3"]); - assert_eq!(out.as_slice(), b"\t\t A\t B"); + assert_eq!(&out[..], b"\t\t A\t B" as &[u8]); } #[test] fn unexpand_first_only_1() { let out = run(" A B", &["-t3", "--first-only"]); - assert_eq!(out.as_slice(), b"\t\t A B"); + assert_eq!(&out[..], b"\t\t A B" as &[u8]); } #[test] @@ -76,20 +78,20 @@ fn unexpand_trailing_space_0() { // evil // Individual spaces before fields starting with non blanks should not be // converted, unless they are at the beginning of the line. let out = run("123 \t1\n123 1\n123 \n123 ", &["-t4"]); - assert_eq!(out.as_slice(), b"123\t\t1\n123 1\n123 \n123 "); + assert_eq!(&out[..], b"123\t\t1\n123 1\n123 \n123 " as &[u8]); } #[test] fn unexpand_trailing_space_1() { // super evil let out = run(" abc d e f g ", &["-t1"]); - assert_eq!(out.as_slice(), b"\tabc d e\t\tf\t\tg "); + assert_eq!(&out[..], b"\tabc d e\t\tf\t\tg " as &[u8]); } #[test] fn unexpand_spaces_follow_tabs_0() { // The two first spaces can be included into the first tab. let out = run(" \t\t A", &[]); - assert_eq!(out.as_slice(), b"\t\t A"); + assert_eq!(&out[..], b"\t\t A" as &[u8]); } #[test] @@ -100,6 +102,7 @@ fn unexpand_spaces_follow_tabs_1() { // evil // ' ' -> '\t' // third tabstop (5) // ' B \t' -> ' B \t' // after the list is exhausted, nothing must change let out = run("a \t B \t", &["-t1,4,5"]); - assert_eq!(out.as_slice(), b"a\t\t B \t"); + assert_eq!(&out[..], b"a\t\t B \t" as &[u8]); } +