1
Fork 0
mirror of https://github.com/RGBCube/uutils-coreutils synced 2025-08-01 21:47:46 +00:00

fix/rewrite unexpand and its tests

This is a reworked version of unexpand. I did this for three main
reasons:

1. The previous version of unexpand had issues correctly computing
   tabstops when the `-a` flag was supplied.

2. The previous version assumed the input was UTF-8. This version works
   with non-UTF-8 inputs.

3. This version has a new flag, -U, which forces unexpand to
   treat input as 8-bit ASCII rather than interpreting it
   as UTF-8. This might be handy in some cases.
This commit is contained in:
kwantam 2015-04-28 17:33:31 -04:00
parent ec4e3a60e4
commit feee266b20
3 changed files with 196 additions and 127 deletions

1
src/unexpand/deps.mk Normal file
View file

@ -0,0 +1 @@
DEPLIBS += unicode-width

View file

@ -1,10 +1,12 @@
#![crate_name = "unexpand"]
#![feature(collections, core, old_io, old_path, rustc_private)]
#![feature(rustc_private, unicode)]
/*
* This file is part of the uutils coreutils package.
*
* (c) Virgile Andreani <virgile.andreani@anbuco.fr>
* (c) kwantam <kwantam@gmail.com>
* 20150428 updated to work with both UTF-8 and non-UTF-8 encodings
*
* For the full copyright and license information, please view the LICENSE
* file that was distributed with this source code.
@ -12,8 +14,14 @@
extern crate getopts;
extern crate libc;
extern crate rustc_unicode;
extern crate unicode_width;
use std::old_io as io;
use std::fs::File;
use std::io::{stdin, stdout, BufRead, BufReader, BufWriter, Read, Stdout, Write};
use std::str::from_utf8;
use rustc_unicode::str::utf8_char_width;
use unicode_width::UnicodeWidthChar;
#[path = "../common/util.rs"]
#[macro_use]
@ -25,7 +33,7 @@ static VERSION: &'static str = "0.0.1";
static DEFAULT_TABSTOP: usize = 8;
fn tabstops_parse(s: String) -> Vec<usize> {
let words = s.as_slice().split(',').collect::<Vec<&str>>();
let words = s.split(',').collect::<Vec<&str>>();
let nums = words.into_iter()
.map(|sn| sn.parse()
@ -49,7 +57,8 @@ fn tabstops_parse(s: String) -> Vec<usize> {
struct Options {
files: Vec<String>,
tabstops: Vec<usize>,
aflag: bool
aflag: bool,
uflag: bool,
}
impl Options {
@ -61,6 +70,7 @@ impl Options {
let aflag = (matches.opt_present("all") || matches.opt_present("tabs"))
&& !matches.opt_present("first-only");
let uflag = !matches.opt_present("U");
let files =
if matches.free.is_empty() {
@ -69,7 +79,7 @@ impl Options {
matches.free
};
Options { files: files, tabstops: tabstops, aflag: aflag }
Options { files: files, tabstops: tabstops, aflag: aflag, uflag: uflag }
}
}
@ -79,20 +89,21 @@ pub fn uumain(args: Vec<String>) -> i32 {
getopts::optflag("", "first-only", "convert only leading sequences of blanks (overrides -a)"),
getopts::optopt("t", "tabs", "have tabs N characters apart instead of 8 (enables -a)", "N"),
getopts::optopt("t", "tabs", "use comma separated LIST of tab positions (enables -a)", "LIST"),
getopts::optflag("U", "no-utf8", "interpret input file as 8-bit ASCII rather than UTF-8"),
getopts::optflag("h", "help", "display this help and exit"),
getopts::optflag("V", "version", "output version information and exit"),
];
let matches = match getopts::getopts(args.tail(), &opts) {
let matches = match getopts::getopts(&args[1..], &opts) {
Ok(m) => m,
Err(f) => crash!(1, "{}", f)
};
if matches.opt_present("help") {
println!("Usage: {} [OPTION]... [FILE]...", NAME);
io::print(getopts::usage(
println!("{}", getopts::usage(
"Convert blanks in each FILE to tabs, writing to standard output.\n\
With no FILE, or when FILE is -, read standard input.", &opts).as_slice());
With no FILE, or when FILE is -, read standard input.", &opts));
return 0;
}
@ -106,121 +117,175 @@ pub fn uumain(args: Vec<String>) -> i32 {
return 0;
}
fn open(path: String) -> io::BufferedReader<Box<Reader+'static>> {
fn open(path: String) -> BufReader<Box<Read+'static>> {
let mut file_buf;
if path.as_slice() == "-" {
io::BufferedReader::new(Box::new(io::stdio::stdin_raw()) as Box<Reader>)
if path == "-" {
BufReader::new(Box::new(stdin()) as Box<Read>)
} else {
file_buf = match io::File::open(&Path::new(path.as_slice())) {
file_buf = match File::open(&path[..]) {
Ok(a) => a,
_ => crash!(1, "{}: {}\n", path, "No such file or directory")
Err(e) => crash!(1, "{}: {}", &path[..], e),
};
io::BufferedReader::new(Box::new(file_buf) as Box<Reader>)
BufReader::new(Box::new(file_buf) as Box<Read>)
}
}
/// Reports whether column `col` falls exactly on a tab stop.
///
/// A single-element `tabstops` is treated as a repeating tab width, so every
/// multiple of it is a stop. Any other slice is treated as an explicit list of
/// stop columns and is searched with a binary search, so it is expected to be
/// sorted in ascending order.
fn is_tabstop(tabstops: &[usize], col: usize) -> bool {
    if let &[width] = tabstops {
        // One value: stops repeat every `width` columns.
        return col % width == 0;
    }
    // Explicit list (or empty slice): `col` must be one of the listed stops.
    tabstops.binary_search(&col).is_ok()
}
/// Returns the number of columns from `col` to the next tab stop.
///
/// A single-element `tabstops` is a repeating tab width, so a next stop always
/// exists. For any other slice the first listed stop strictly greater than
/// `col` is used; `None` is returned when no such stop exists.
fn to_next_stop(tabstops: &[usize], col: usize) -> Option<usize> {
    if let &[width] = tabstops {
        return Some(width - col % width);
    }
    tabstops
        .iter()
        .find(|&&stop| stop > col)
        .map(|&stop| stop - col % stop)
}
fn unexpandspan(mut output: &mut io::LineBufferedWriter<io::stdio::StdWriter>,
tabstops: &[usize], nspaces: usize, col: usize, init: bool) {
let mut cur = col - nspaces;
if nspaces > 1 || init {
loop {
match to_next_stop(tabstops, cur) {
Some(to_next) if cur + to_next <= col => {
safe_write!(&mut output, "{}", '\t');
cur += to_next;
}
_ => break
}
/// Returns how many columns separate `col` from the next tab stop.
///
/// With exactly one entry, `tabstops[0]` is a repeating tab width and the
/// result is always `Some`. Otherwise the slice is scanned from the front for
/// the first stop strictly greater than `col`. `None` means the list has no
/// such stop; the caller in `unexpand` then advances by a single column.
fn next_tabstop(tabstops: &[usize], col: usize) -> Option<usize> {
    match tabstops {
        &[width] => Some(width - col % width),
        // `stop > col` is guaranteed by the search, so the subtraction cannot underflow.
        stops => stops.iter().find(|&&stop| stop > col).map(|&stop| stop - col),
    }
}
safe_write!(&mut output, "{:1$}", "", col - cur);
}
/// Writes the blank span covering columns `scol..col` to `output`, using as
/// many tabs as fit and padding the remainder with spaces.
///
/// Starting at `scol`, a tab is written for each tab stop (as reported by
/// `next_tabstop`) that does not lie beyond `col`. Once the next stop would
/// overshoot `col`, or no further stop exists, the remaining columns are
/// written as single spaces. Write failures are handled by `safe_unwrap!`.
fn write_tabs(mut output: &mut BufWriter<Stdout>, tabstops: &[usize], mut scol: usize, col: usize) {
    // Tabs first: keep advancing while a whole tab still fits before `col`.
    loop {
        match next_tabstop(tabstops, scol) {
            Some(step) if scol + step <= col => {
                safe_unwrap!(output.write_all(b"\t"));
                scol += step;
            }
            _ => break,
        }
    }
    // Then one space per column that is still uncovered.
    for _ in scol..col {
        safe_unwrap!(output.write_all(b" "));
    }
}
/// Classification that `unexpand` assigns to each input character (or to each
/// byte when UTF-8 interpretation is turned off).
#[derive(PartialEq, Eq, Debug)]
enum CharType {
/// `'\x08'` (byte 0x08); moves the tracked column back by one when it is above zero.
Backspace,
/// `' '` (byte 0x20); advances the tracked column by one.
Space,
/// `'\t'` (byte 0x09); advances the tracked column to the next tab stop.
Tab,
/// Any other character or byte, including byte sequences that are not valid UTF-8.
Other,
}
fn unexpand(options: Options) {
let mut output = io::stdout();
let ts = options.tabstops.as_slice();
use self::CharType::*;
let mut output = BufWriter::new(stdout());
let ts = &options.tabstops[..];
let mut buf = Vec::new();
let lastcol = if ts.len() > 1 {
*ts.last().unwrap()
} else {
0
};
for file in options.files.into_iter() {
let mut col = 0;
let mut nspaces = 0;
let mut init = true;
for c in open(file).chars() {
match c {
Ok(' ') => {
if init || options.aflag {
nspaces += 1;
let mut fh = open(file);
while match fh.read_until('\n' as u8, &mut buf) {
Ok(s) => s > 0,
Err(_) => buf.len() > 0,
} {
let mut byte = 0; // offset into the buffer
let mut col = 0; // the current column
let mut scol = 0; // the start col for the current span, i.e., the already-printed width
let mut init = true; // are we at the start of the line?
let mut pctype = Other;
while byte < buf.len() {
// when we have a finite number of columns, never convert past the last column
if lastcol > 0 && col >= lastcol {
if (pctype != Tab && col > scol + 1) ||
(col > scol && (init || pctype == Tab)) {
write_tabs(&mut output, ts, scol, col);
} else if col > scol {
safe_unwrap!(output.write_all(" ".as_bytes()));
}
scol = col;
safe_unwrap!(output.write_all(&buf[byte..]));
break;
}
let (ctype, cwidth, nbytes) = if options.uflag {
let nbytes = utf8_char_width(buf[byte]);
// figure out how big the next char is, if it's UTF-8
if byte + nbytes > buf.len() {
// make sure we don't overrun the buffer because of invalid UTF-8
(Other, 1, 1)
} else if let Ok(t) = from_utf8(&buf[byte..byte+nbytes]) {
// Now that we think it's UTF-8, figure out what kind of char it is
match t.chars().next() {
Some(' ') => (Space, 0, 1),
Some('\t') => (Tab, 0, 1),
Some('\x08') => (Backspace, 0, 1),
Some(c) => (Other, UnicodeWidthChar::width(c).unwrap_or(0), nbytes),
None => { // invalid char snuck past the utf8_validation_iterator somehow???
(Other, 1, 1)
},
}
} else {
nspaces = 0;
safe_write!(&mut output, "{}", ' ');
// otherwise, it's not valid
(Other, 1, 1) // implicit assumption: non-UTF8 char has display width 1
}
col += 1;
}
Ok('\t') if nspaces > 0 => {
if is_tabstop(ts, col) {
nspaces = 0;
col += 1;
safe_write!(&mut output, "{}", '\t');
}
match to_next_stop(ts, col) {
Some(to_next) => {
nspaces += to_next;
col += to_next;
} else {
(match buf[byte] { // always take exactly 1 byte in strict ASCII mode
0x20 => Space,
0x09 => Tab,
0x08 => Backspace,
_ => Other,
}, 1, 1)
};
// now figure out how many columns this char takes up, and maybe print it
let tabs_buffered = init || options.aflag;
match ctype {
Space | Tab => { // compute next col, but only write space or tab chars if not buffering
col += if ctype == Space {
1
} else {
next_tabstop(ts, col).unwrap_or(1)
};
if !tabs_buffered {
safe_unwrap!(output.write_all(&buf[byte..byte+nbytes]));
scol = col; // now printed up to this column
}
None => {
col += 1;
unexpandspan(&mut output, ts, nspaces, col, init);
nspaces = 0;
safe_write!(&mut output, "{}", '\t');
},
Other | Backspace => { // always
// never turn a single space before a non-blank into a tab
// unless it's at the start of the line
if (tabs_buffered && pctype != Tab && col > scol + 1) ||
(col > scol && (init || (tabs_buffered && pctype == Tab))) {
write_tabs(&mut output, ts, scol, col);
} else if col > scol {
safe_unwrap!(output.write_all(" ".as_bytes()));
}
}
init = false;
col = if ctype == Other { // use computed width
col + cwidth
} else if col > 0 { // Backspace case, but only if col > 0
col - 1
} else {
0
};
safe_unwrap!(output.write_all(&buf[byte..byte+nbytes]));
scol = col; // we've now printed up to this column
},
}
Ok('\x08') => { // '\b'
if init || options.aflag {
unexpandspan(&mut output, ts, nspaces, col, init)
}
nspaces = 0;
if col > 0 { col -= 1; }
init = false;
safe_write!(&mut output, "{}", '\x08');
}
Ok('\n') => {
if init || options.aflag {
unexpandspan(&mut output, ts, nspaces, col, init)
}
nspaces = 0;
col = 0;
init = true;
safe_write!(&mut output, "{}", '\n');
}
Ok(c) => {
if init || options.aflag {
unexpandspan(&mut output, ts, nspaces, col, init)
}
nspaces = 0;
col += 1;
init = false;
safe_write!(&mut output, "{}", c);
}
Err(_) => break
byte += nbytes; // move on to next char
pctype = ctype; // save the previous type
}
}
if init || options.aflag {
unexpandspan(&mut output, ts, nspaces, col, init)
// write out anything remaining
if col > scol + 1 || (init && col > scol) {
write_tabs(&mut output, ts, scol, col);
} else if col > scol {
safe_unwrap!(output.write_all(" ".as_bytes()));
}
buf.truncate(0); // clear out the buffer
}
}
}

View file

@ -1,74 +1,76 @@
#![allow(unstable)]
use std::old_io::process::Command;
use std::io::Write;
use std::process::{Command, Stdio};
static PROGNAME: &'static str = "./unexpand";
fn run(input: &str, args: &[&'static str]) -> Vec<u8> {
let mut process = Command::new(PROGNAME).args(args).spawn().unwrap();
let mut process = Command::new(PROGNAME)
.args(args)
.stdin(Stdio::piped())
.stdout(Stdio::piped())
.spawn()
.unwrap_or_else(|e| panic!("{}", e));
process.stdin.take().unwrap().write_str(input).unwrap();
process.stdin.take().unwrap_or_else(|| panic!("Could not take child process stdin"))
.write_all(input.as_bytes()).unwrap_or_else(|e| panic!("{}", e));
let po = match process.wait_with_output() {
Ok(p) => p,
Err(err) => panic!("{}", err),
};
po.output
let po = process.wait_with_output().unwrap_or_else(|e| panic!("{}", e));
po.stdout
}
// Leading blanks with `-t4`: pipes four short lines through the binary via
// `run` and compares the captured stdout byte-for-byte.
// NOTE(review): two forms of the same assertion (`out.as_slice()` and
// `&out[..]`) are present, and runs of spaces inside the literals may have
// been collapsed in extraction; verify against the repository.
#[test]
fn unexpand_init_0() {
let out = run(" 1\n 2\n 3\n 4\n", &["-t4"]);
assert_eq!(out.as_slice(), b" 1\n 2\n 3\n\t4\n");
assert_eq!(&out[..], b" 1\n 2\n 3\n\t4\n" as &[u8]);
}
// Leading blanks with `-t4`, second data set (lines ending in 5..8): the
// expected output starts every line with at least one tab.
// NOTE(review): two forms of the same assertion are present, and runs of
// spaces inside the literals may have been collapsed in extraction; verify
// against the repository.
#[test]
fn unexpand_init_1() {
let out = run(" 5\n 6\n 7\n 8\n", &["-t4"]);
assert_eq!(out.as_slice(), b"\t 5\n\t 6\n\t 7\n\t\t8\n");
assert_eq!(&out[..], b"\t 5\n\t 6\n\t 7\n\t\t8\n" as &[u8]);
}
// Leading blanks with an explicit tab-stop list (`-t2,4`) instead of a single
// repeating width.
// NOTE(review): two forms of the same assertion are present, and runs of
// spaces inside the literals may have been collapsed in extraction; verify
// against the repository.
#[test]
fn unexpand_init_list_0() {
let out = run(" 1\n 2\n 3\n 4\n", &["-t2,4"]);
assert_eq!(out.as_slice(), b" 1\n\t2\n\t 3\n\t\t4\n");
assert_eq!(&out[..], b" 1\n\t2\n\t 3\n\t\t4\n" as &[u8]);
}
// Explicit tab-stop list (`-t2,4`) with input that extends past the last
// listed stop; every expected line starts with two tabs followed by a space.
// NOTE(review): two forms of the same assertion are present, and runs of
// spaces inside the literals may have been collapsed in extraction; verify
// against the repository.
#[test]
fn unexpand_init_list_1() {
// Once the list is exhausted, spaces are not converted anymore
let out = run(" 5\n 6\n 7\n 8\n", &["-t2,4"]);
assert_eq!(out.as_slice(), b"\t\t 5\n\t\t 6\n\t\t 7\n\t\t 8\n");
assert_eq!(&out[..], b"\t\t 5\n\t\t 6\n\t\t 7\n\t\t 8\n" as &[u8]);
}
// No `-a`/`-t` option is passed: the expected output is identical to the
// input, i.e. blanks between words are left alone.
// NOTE(review): `out` is bound twice (once with `&[]`, once with `&["--"]`)
// and two forms of the assertion are present; runs of spaces inside the
// literals may have been collapsed in extraction. Verify against the
// repository.
#[test]
fn unexpand_aflag_0() {
let out = run("e E\nf F\ng G\nh H\n", &[]);
assert_eq!(out.as_slice(), b"e E\nf F\ng G\nh H\n");
let out = run("e E\nf F\ng G\nh H\n", &["--"]);
assert_eq!(&out[..], b"e E\nf F\ng G\nh H\n" as &[u8]);
}
// With `-a`, blanks between words are also candidates for conversion: the
// expected output contains tabs after `g` and `h`.
// NOTE(review): two forms of the same assertion are present, and runs of
// spaces inside the literals may have been collapsed in extraction; verify
// against the repository.
#[test]
fn unexpand_aflag_1() {
let out = run("e E\nf F\ng G\nh H\n", &["-a"]);
assert_eq!(out.as_slice(), b"e E\nf F\ng\tG\nh\t H\n");
assert_eq!(&out[..], b"e E\nf F\ng\tG\nh\t H\n" as &[u8]);
}
// `-t8` alone is expected to give the same result as `-a`, since `-t`
// enables `-a` according to the option help text.
// NOTE(review): two forms of the same assertion are present, and runs of
// spaces inside the literals may have been collapsed in extraction; verify
// against the repository.
#[test]
fn unexpand_aflag_2() {
let out = run("e E\nf F\ng G\nh H\n", &["-t8"]);
assert_eq!(out.as_slice(), b"e E\nf F\ng\tG\nh\t H\n");
assert_eq!(&out[..], b"e E\nf F\ng\tG\nh\t H\n" as &[u8]);
}
// Baseline for the `--first-only` tests: with `-t3` only, the blanks between
// `A` and `B` are converted as well (expected output has a tab before ` B`).
// NOTE(review): two forms of the same assertion are present, and runs of
// spaces inside the literals may have been collapsed in extraction; verify
// against the repository.
#[test]
fn unexpand_first_only_0() {
let out = run(" A B", &["-t3"]);
assert_eq!(out.as_slice(), b"\t\t A\t B");
assert_eq!(&out[..], b"\t\t A\t B" as &[u8]);
}
// `--first-only` overrides the `-a` implied by `-t3`: only the leading blanks
// become tabs, and the blanks between `A` and `B` stay as they are.
// NOTE(review): two forms of the same assertion are present, and runs of
// spaces inside the literals may have been collapsed in extraction; verify
// against the repository.
#[test]
fn unexpand_first_only_1() {
let out = run(" A B", &["-t3", "--first-only"]);
assert_eq!(out.as_slice(), b"\t\t A B");
assert_eq!(&out[..], b"\t\t A B" as &[u8]);
}
#[test]
@ -76,20 +78,20 @@ fn unexpand_trailing_space_0() { // evil
// Individual spaces before fields starting with non blanks should not be
// converted, unless they are at the beginning of the line.
let out = run("123 \t1\n123 1\n123 \n123 ", &["-t4"]);
assert_eq!(out.as_slice(), b"123\t\t1\n123 1\n123 \n123 ");
assert_eq!(&out[..], b"123\t\t1\n123 1\n123 \n123 " as &[u8]);
}
// Tab width 1 (`-t1`) on a line mixing words and blank runs; the input has no
// trailing newline and the expected output keeps a trailing space.
// NOTE(review): two forms of the same assertion are present, and runs of
// spaces inside the literals may have been collapsed in extraction; verify
// against the repository.
#[test]
fn unexpand_trailing_space_1() { // super evil
let out = run(" abc d e f g ", &["-t1"]);
assert_eq!(out.as_slice(), b"\tabc d e\t\tf\t\tg ");
assert_eq!(&out[..], b"\tabc d e\t\tf\t\tg " as &[u8]);
}
// Default options (no arguments): leading blanks made of spaces followed by
// tabs are expected to collapse to two tabs plus the remaining space.
// NOTE(review): two forms of the same assertion are present, and runs of
// spaces inside the literals may have been collapsed in extraction; verify
// against the repository.
#[test]
fn unexpand_spaces_follow_tabs_0() {
// The two first spaces can be included into the first tab.
let out = run(" \t\t A", &[]);
assert_eq!(out.as_slice(), b"\t\t A");
assert_eq!(&out[..], b"\t\t A" as &[u8]);
}
#[test]
@ -100,6 +102,7 @@ fn unexpand_spaces_follow_tabs_1() { // evil
// ' ' -> '\t' // third tabstop (5)
// ' B \t' -> ' B \t' // after the list is exhausted, nothing must change
let out = run("a \t B \t", &["-t1,4,5"]);
assert_eq!(out.as_slice(), b"a\t\t B \t");
assert_eq!(&out[..], b"a\t\t B \t" as &[u8]);
}