mirror of
https://github.com/RGBCube/uutils-coreutils
synced 2025-08-02 05:57:46 +00:00
fix/rewrite unexpand
and its tests
This is a reworked version of unexpand. I did this for two main reasons: 1. The previous version of unexpand had issues correctly computing tabstops when the `-a` flag was supplied. 2. The previous version assumed the input was UTF-8; this version also works with non-UTF-8 inputs. In addition, this version has a new flag, -U, which forces unexpand to treat input as 8-bit ASCII rather than interpreting it as UTF-8. This might be handy in some cases.
This commit is contained in:
parent
ec4e3a60e4
commit
feee266b20
3 changed files with 196 additions and 127 deletions
1
src/unexpand/deps.mk
Normal file
1
src/unexpand/deps.mk
Normal file
|
@ -0,0 +1 @@
|
|||
DEPLIBS += unicode-width
|
|
@ -1,10 +1,12 @@
|
|||
#![crate_name = "unexpand"]
|
||||
#![feature(collections, core, old_io, old_path, rustc_private)]
|
||||
#![feature(rustc_private, unicode)]
|
||||
|
||||
/*
|
||||
* This file is part of the uutils coreutils package.
|
||||
*
|
||||
* (c) Virgile Andreani <virgile.andreani@anbuco.fr>
|
||||
* (c) kwantam <kwantam@gmail.com>
|
||||
* 20150428 updated to work with both UTF-8 and non-UTF-8 encodings
|
||||
*
|
||||
* For the full copyright and license information, please view the LICENSE
|
||||
* file that was distributed with this source code.
|
||||
|
@ -12,8 +14,14 @@
|
|||
|
||||
extern crate getopts;
|
||||
extern crate libc;
|
||||
extern crate rustc_unicode;
|
||||
extern crate unicode_width;
|
||||
|
||||
use std::old_io as io;
|
||||
use std::fs::File;
|
||||
use std::io::{stdin, stdout, BufRead, BufReader, BufWriter, Read, Stdout, Write};
|
||||
use std::str::from_utf8;
|
||||
use rustc_unicode::str::utf8_char_width;
|
||||
use unicode_width::UnicodeWidthChar;
|
||||
|
||||
#[path = "../common/util.rs"]
|
||||
#[macro_use]
|
||||
|
@ -25,7 +33,7 @@ static VERSION: &'static str = "0.0.1";
|
|||
static DEFAULT_TABSTOP: usize = 8;
|
||||
|
||||
fn tabstops_parse(s: String) -> Vec<usize> {
|
||||
let words = s.as_slice().split(',').collect::<Vec<&str>>();
|
||||
let words = s.split(',').collect::<Vec<&str>>();
|
||||
|
||||
let nums = words.into_iter()
|
||||
.map(|sn| sn.parse()
|
||||
|
@ -49,7 +57,8 @@ fn tabstops_parse(s: String) -> Vec<usize> {
|
|||
struct Options {
|
||||
files: Vec<String>,
|
||||
tabstops: Vec<usize>,
|
||||
aflag: bool
|
||||
aflag: bool,
|
||||
uflag: bool,
|
||||
}
|
||||
|
||||
impl Options {
|
||||
|
@ -61,6 +70,7 @@ impl Options {
|
|||
|
||||
let aflag = (matches.opt_present("all") || matches.opt_present("tabs"))
|
||||
&& !matches.opt_present("first-only");
|
||||
let uflag = !matches.opt_present("U");
|
||||
|
||||
let files =
|
||||
if matches.free.is_empty() {
|
||||
|
@ -69,7 +79,7 @@ impl Options {
|
|||
matches.free
|
||||
};
|
||||
|
||||
Options { files: files, tabstops: tabstops, aflag: aflag }
|
||||
Options { files: files, tabstops: tabstops, aflag: aflag, uflag: uflag }
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -79,20 +89,21 @@ pub fn uumain(args: Vec<String>) -> i32 {
|
|||
getopts::optflag("", "first-only", "convert only leading sequences of blanks (overrides -a)"),
|
||||
getopts::optopt("t", "tabs", "have tabs N characters apart instead of 8 (enables -a)", "N"),
|
||||
getopts::optopt("t", "tabs", "use comma separated LIST of tab positions (enables -a)", "LIST"),
|
||||
getopts::optflag("U", "no-utf8", "interpret input file as 8-bit ASCII rather than UTF-8"),
|
||||
getopts::optflag("h", "help", "display this help and exit"),
|
||||
getopts::optflag("V", "version", "output version information and exit"),
|
||||
];
|
||||
|
||||
let matches = match getopts::getopts(args.tail(), &opts) {
|
||||
let matches = match getopts::getopts(&args[1..], &opts) {
|
||||
Ok(m) => m,
|
||||
Err(f) => crash!(1, "{}", f)
|
||||
};
|
||||
|
||||
if matches.opt_present("help") {
|
||||
println!("Usage: {} [OPTION]... [FILE]...", NAME);
|
||||
io::print(getopts::usage(
|
||||
println!("{}", getopts::usage(
|
||||
"Convert blanks in each FILE to tabs, writing to standard output.\n\
|
||||
With no FILE, or when FILE is -, read standard input.", &opts).as_slice());
|
||||
With no FILE, or when FILE is -, read standard input.", &opts));
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -106,121 +117,175 @@ pub fn uumain(args: Vec<String>) -> i32 {
|
|||
return 0;
|
||||
}
|
||||
|
||||
fn open(path: String) -> io::BufferedReader<Box<Reader+'static>> {
|
||||
fn open(path: String) -> BufReader<Box<Read+'static>> {
|
||||
let mut file_buf;
|
||||
if path.as_slice() == "-" {
|
||||
io::BufferedReader::new(Box::new(io::stdio::stdin_raw()) as Box<Reader>)
|
||||
if path == "-" {
|
||||
BufReader::new(Box::new(stdin()) as Box<Read>)
|
||||
} else {
|
||||
file_buf = match io::File::open(&Path::new(path.as_slice())) {
|
||||
file_buf = match File::open(&path[..]) {
|
||||
Ok(a) => a,
|
||||
_ => crash!(1, "{}: {}\n", path, "No such file or directory")
|
||||
Err(e) => crash!(1, "{}: {}", &path[..], e),
|
||||
};
|
||||
io::BufferedReader::new(Box::new(file_buf) as Box<Reader>)
|
||||
BufReader::new(Box::new(file_buf) as Box<Read>)
|
||||
}
|
||||
}
|
||||
|
||||
fn is_tabstop(tabstops: &[usize], col: usize) -> bool {
|
||||
match tabstops {
|
||||
[tabstop] => col % tabstop == 0,
|
||||
tabstops => tabstops.binary_search_by(|&e| e.cmp(&col)).is_ok()
|
||||
/// Returns how many columns separate `col` from the next tab stop.
///
/// * With exactly one entry, `tabstops[0]` is treated as a repeating tab
///   width, so the result is the distance to the next multiple of it.
/// * Otherwise the entries are treated as explicit column positions and the
///   first entry strictly greater than `col` is used.
///
/// Returns `None` when there is no further tab stop: the list is empty, every
/// listed position is at or before `col`, or the single tab width is zero
/// (which would otherwise panic on the modulo). Callers treat `None` as
/// "a tab advances a single column".
fn next_tabstop(tabstops: &[usize], col: usize) -> Option<usize> {
    if tabstops.len() == 1 {
        let width = tabstops[0];
        if width == 0 {
            // a zero-width repeating stop can never be reached; avoid `col % 0`
            None
        } else {
            Some(width - col % width)
        }
    } else {
        // find the next larger tab position, if the list has one
        tabstops.iter()
                .find(|&&t| t > col)
                .map(|&t| t - col)
    }
}
|
||||
|
||||
fn to_next_stop(tabstops: &[usize], col: usize) -> Option<usize> {
|
||||
match tabstops {
|
||||
[tabstop] => Some(tabstop - col % tabstop),
|
||||
tabstops => tabstops.iter().skip_while(|&t| *t <= col).next()
|
||||
.map(|&tabstop| tabstop - col % tabstop)
|
||||
fn write_tabs(mut output: &mut BufWriter<Stdout>, tabstops: &[usize], mut scol: usize, col: usize) {
|
||||
while let Some(nts) = next_tabstop(tabstops, scol) {
|
||||
if col < scol + nts {
|
||||
break;
|
||||
}
|
||||
|
||||
safe_unwrap!(output.write_all("\t".as_bytes()));
|
||||
scol += nts;
|
||||
}
|
||||
|
||||
while col > scol {
|
||||
safe_unwrap!(output.write_all(" ".as_bytes()));
|
||||
scol += 1;
|
||||
}
|
||||
}
|
||||
|
||||
fn unexpandspan(mut output: &mut io::LineBufferedWriter<io::stdio::StdWriter>,
|
||||
tabstops: &[usize], nspaces: usize, col: usize, init: bool) {
|
||||
let mut cur = col - nspaces;
|
||||
if nspaces > 1 || init {
|
||||
loop {
|
||||
match to_next_stop(tabstops, cur) {
|
||||
Some(to_next) if cur + to_next <= col => {
|
||||
safe_write!(&mut output, "{}", '\t');
|
||||
cur += to_next;
|
||||
}
|
||||
_ => break
|
||||
}
|
||||
}
|
||||
}
|
||||
safe_write!(&mut output, "{:1$}", "", col - cur);
|
||||
/// Classification of one input character, used to decide how it affects the
/// current column and whether it may be folded into a tab.
#[derive(Debug, PartialEq, Eq)]
enum CharType {
    /// The backspace character (`'\x08'`).
    Backspace,
    /// A plain space (`' '`, byte `0x20`).
    Space,
    /// A horizontal tab (`'\t'`, byte `0x09`).
    Tab,
    /// Any character that is none of the above.
    Other,
}
|
||||
|
||||
fn unexpand(options: Options) {
|
||||
let mut output = io::stdout();
|
||||
let ts = options.tabstops.as_slice();
|
||||
use self::CharType::*;
|
||||
|
||||
let mut output = BufWriter::new(stdout());
|
||||
let ts = &options.tabstops[..];
|
||||
let mut buf = Vec::new();
|
||||
let lastcol = if ts.len() > 1 {
|
||||
*ts.last().unwrap()
|
||||
} else {
|
||||
0
|
||||
};
|
||||
|
||||
for file in options.files.into_iter() {
|
||||
let mut col = 0;
|
||||
let mut nspaces = 0;
|
||||
let mut init = true;
|
||||
for c in open(file).chars() {
|
||||
match c {
|
||||
Ok(' ') => {
|
||||
if init || options.aflag {
|
||||
nspaces += 1;
|
||||
let mut fh = open(file);
|
||||
|
||||
while match fh.read_until('\n' as u8, &mut buf) {
|
||||
Ok(s) => s > 0,
|
||||
Err(_) => buf.len() > 0,
|
||||
} {
|
||||
let mut byte = 0; // offset into the buffer
|
||||
let mut col = 0; // the current column
|
||||
let mut scol = 0; // the start col for the current span, i.e., the already-printed width
|
||||
let mut init = true; // are we at the start of the line?
|
||||
let mut pctype = Other;
|
||||
|
||||
while byte < buf.len() {
|
||||
// when we have a finite number of columns, never convert past the last column
|
||||
if lastcol > 0 && col >= lastcol {
|
||||
if (pctype != Tab && col > scol + 1) ||
|
||||
(col > scol && (init || pctype == Tab)) {
|
||||
write_tabs(&mut output, ts, scol, col);
|
||||
} else if col > scol {
|
||||
safe_unwrap!(output.write_all(" ".as_bytes()));
|
||||
}
|
||||
scol = col;
|
||||
|
||||
safe_unwrap!(output.write_all(&buf[byte..]));
|
||||
break;
|
||||
}
|
||||
|
||||
let (ctype, cwidth, nbytes) = if options.uflag {
|
||||
let nbytes = utf8_char_width(buf[byte]);
|
||||
|
||||
// figure out how big the next char is, if it's UTF-8
|
||||
if byte + nbytes > buf.len() {
|
||||
// make sure we don't overrun the buffer because of invalid UTF-8
|
||||
(Other, 1, 1)
|
||||
} else if let Ok(t) = from_utf8(&buf[byte..byte+nbytes]) {
|
||||
// Now that we think it's UTF-8, figure out what kind of char it is
|
||||
match t.chars().next() {
|
||||
Some(' ') => (Space, 0, 1),
|
||||
Some('\t') => (Tab, 0, 1),
|
||||
Some('\x08') => (Backspace, 0, 1),
|
||||
Some(c) => (Other, UnicodeWidthChar::width(c).unwrap_or(0), nbytes),
|
||||
None => { // invalid char snuck past the utf8_validation_iterator somehow???
|
||||
(Other, 1, 1)
|
||||
},
|
||||
}
|
||||
} else {
|
||||
nspaces = 0;
|
||||
safe_write!(&mut output, "{}", ' ');
|
||||
// otherwise, it's not valid
|
||||
(Other, 1, 1) // implicit assumption: non-UTF8 char has display width 1
|
||||
}
|
||||
col += 1;
|
||||
} else {
|
||||
(match buf[byte] { // always take exactly 1 byte in strict ASCII mode
|
||||
0x20 => Space,
|
||||
0x09 => Tab,
|
||||
0x08 => Backspace,
|
||||
_ => Other,
|
||||
}, 1, 1)
|
||||
};
|
||||
|
||||
// now figure out how many columns this char takes up, and maybe print it
|
||||
let tabs_buffered = init || options.aflag;
|
||||
match ctype {
|
||||
Space | Tab => { // compute next col, but only write space or tab chars if not buffering
|
||||
col += if ctype == Space {
|
||||
1
|
||||
} else {
|
||||
next_tabstop(ts, col).unwrap_or(1)
|
||||
};
|
||||
|
||||
if !tabs_buffered {
|
||||
safe_unwrap!(output.write_all(&buf[byte..byte+nbytes]));
|
||||
scol = col; // now printed up to this column
|
||||
}
|
||||
Ok('\t') if nspaces > 0 => {
|
||||
if is_tabstop(ts, col) {
|
||||
nspaces = 0;
|
||||
col += 1;
|
||||
safe_write!(&mut output, "{}", '\t');
|
||||
},
|
||||
Other | Backspace => { // always
|
||||
// never turn a single space before a non-blank into a tab
|
||||
// unless it's at the start of the line
|
||||
if (tabs_buffered && pctype != Tab && col > scol + 1) ||
|
||||
(col > scol && (init || (tabs_buffered && pctype == Tab))) {
|
||||
write_tabs(&mut output, ts, scol, col);
|
||||
} else if col > scol {
|
||||
safe_unwrap!(output.write_all(" ".as_bytes()));
|
||||
}
|
||||
match to_next_stop(ts, col) {
|
||||
Some(to_next) => {
|
||||
nspaces += to_next;
|
||||
col += to_next;
|
||||
}
|
||||
None => {
|
||||
col += 1;
|
||||
unexpandspan(&mut output, ts, nspaces, col, init);
|
||||
nspaces = 0;
|
||||
safe_write!(&mut output, "{}", '\t');
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok('\x08') => { // '\b'
|
||||
if init || options.aflag {
|
||||
unexpandspan(&mut output, ts, nspaces, col, init)
|
||||
}
|
||||
nspaces = 0;
|
||||
if col > 0 { col -= 1; }
|
||||
init = false;
|
||||
safe_write!(&mut output, "{}", '\x08');
|
||||
col = if ctype == Other { // use computed width
|
||||
col + cwidth
|
||||
} else if col > 0 { // Backspace case, but only if col > 0
|
||||
col - 1
|
||||
} else {
|
||||
0
|
||||
};
|
||||
safe_unwrap!(output.write_all(&buf[byte..byte+nbytes]));
|
||||
scol = col; // we've now printed up to this column
|
||||
},
|
||||
}
|
||||
Ok('\n') => {
|
||||
if init || options.aflag {
|
||||
unexpandspan(&mut output, ts, nspaces, col, init)
|
||||
|
||||
byte += nbytes; // move on to next char
|
||||
pctype = ctype; // save the previous type
|
||||
}
|
||||
nspaces = 0;
|
||||
col = 0;
|
||||
init = true;
|
||||
safe_write!(&mut output, "{}", '\n');
|
||||
|
||||
// write out anything remaining
|
||||
if col > scol + 1 || (init && col > scol) {
|
||||
write_tabs(&mut output, ts, scol, col);
|
||||
} else if col > scol {
|
||||
safe_unwrap!(output.write_all(" ".as_bytes()));
|
||||
}
|
||||
Ok(c) => {
|
||||
if init || options.aflag {
|
||||
unexpandspan(&mut output, ts, nspaces, col, init)
|
||||
}
|
||||
nspaces = 0;
|
||||
col += 1;
|
||||
init = false;
|
||||
safe_write!(&mut output, "{}", c);
|
||||
}
|
||||
Err(_) => break
|
||||
}
|
||||
}
|
||||
if init || options.aflag {
|
||||
unexpandspan(&mut output, ts, nspaces, col, init)
|
||||
|
||||
buf.truncate(0); // clear out the buffer
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,74 +1,76 @@
|
|||
#![allow(unstable)]
|
||||
|
||||
use std::old_io::process::Command;
|
||||
use std::io::Write;
|
||||
use std::process::{Command, Stdio};
|
||||
|
||||
static PROGNAME: &'static str = "./unexpand";
|
||||
|
||||
fn run(input: &str, args: &[&'static str]) -> Vec<u8> {
|
||||
let mut process = Command::new(PROGNAME).args(args).spawn().unwrap();
|
||||
let mut process = Command::new(PROGNAME)
|
||||
.args(args)
|
||||
.stdin(Stdio::piped())
|
||||
.stdout(Stdio::piped())
|
||||
.spawn()
|
||||
.unwrap_or_else(|e| panic!("{}", e));
|
||||
|
||||
process.stdin.take().unwrap().write_str(input).unwrap();
|
||||
process.stdin.take().unwrap_or_else(|| panic!("Could not take child process stdin"))
|
||||
.write_all(input.as_bytes()).unwrap_or_else(|e| panic!("{}", e));
|
||||
|
||||
let po = match process.wait_with_output() {
|
||||
Ok(p) => p,
|
||||
Err(err) => panic!("{}", err),
|
||||
};
|
||||
po.output
|
||||
let po = process.wait_with_output().unwrap_or_else(|e| panic!("{}", e));
|
||||
po.stdout
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn unexpand_init_0() {
|
||||
let out = run(" 1\n 2\n 3\n 4\n", &["-t4"]);
|
||||
assert_eq!(out.as_slice(), b" 1\n 2\n 3\n\t4\n");
|
||||
assert_eq!(&out[..], b" 1\n 2\n 3\n\t4\n" as &[u8]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn unexpand_init_1() {
|
||||
let out = run(" 5\n 6\n 7\n 8\n", &["-t4"]);
|
||||
assert_eq!(out.as_slice(), b"\t 5\n\t 6\n\t 7\n\t\t8\n");
|
||||
assert_eq!(&out[..], b"\t 5\n\t 6\n\t 7\n\t\t8\n" as &[u8]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn unexpand_init_list_0() {
|
||||
let out = run(" 1\n 2\n 3\n 4\n", &["-t2,4"]);
|
||||
assert_eq!(out.as_slice(), b" 1\n\t2\n\t 3\n\t\t4\n");
|
||||
assert_eq!(&out[..], b" 1\n\t2\n\t 3\n\t\t4\n" as &[u8]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn unexpand_init_list_1() {
|
||||
// Once the list is exhausted, spaces are not converted anymore
|
||||
let out = run(" 5\n 6\n 7\n 8\n", &["-t2,4"]);
|
||||
assert_eq!(out.as_slice(), b"\t\t 5\n\t\t 6\n\t\t 7\n\t\t 8\n");
|
||||
assert_eq!(&out[..], b"\t\t 5\n\t\t 6\n\t\t 7\n\t\t 8\n" as &[u8]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn unexpand_aflag_0() {
|
||||
let out = run("e E\nf F\ng G\nh H\n", &[]);
|
||||
assert_eq!(out.as_slice(), b"e E\nf F\ng G\nh H\n");
|
||||
let out = run("e E\nf F\ng G\nh H\n", &["--"]);
|
||||
assert_eq!(&out[..], b"e E\nf F\ng G\nh H\n" as &[u8]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn unexpand_aflag_1() {
|
||||
let out = run("e E\nf F\ng G\nh H\n", &["-a"]);
|
||||
assert_eq!(out.as_slice(), b"e E\nf F\ng\tG\nh\t H\n");
|
||||
assert_eq!(&out[..], b"e E\nf F\ng\tG\nh\t H\n" as &[u8]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn unexpand_aflag_2() {
|
||||
let out = run("e E\nf F\ng G\nh H\n", &["-t8"]);
|
||||
assert_eq!(out.as_slice(), b"e E\nf F\ng\tG\nh\t H\n");
|
||||
assert_eq!(&out[..], b"e E\nf F\ng\tG\nh\t H\n" as &[u8]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn unexpand_first_only_0() {
|
||||
let out = run(" A B", &["-t3"]);
|
||||
assert_eq!(out.as_slice(), b"\t\t A\t B");
|
||||
assert_eq!(&out[..], b"\t\t A\t B" as &[u8]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn unexpand_first_only_1() {
|
||||
let out = run(" A B", &["-t3", "--first-only"]);
|
||||
assert_eq!(out.as_slice(), b"\t\t A B");
|
||||
assert_eq!(&out[..], b"\t\t A B" as &[u8]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
@ -76,20 +78,20 @@ fn unexpand_trailing_space_0() { // evil
|
|||
// Individual spaces before fields starting with non blanks should not be
|
||||
// converted, unless they are at the beginning of the line.
|
||||
let out = run("123 \t1\n123 1\n123 \n123 ", &["-t4"]);
|
||||
assert_eq!(out.as_slice(), b"123\t\t1\n123 1\n123 \n123 ");
|
||||
assert_eq!(&out[..], b"123\t\t1\n123 1\n123 \n123 " as &[u8]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn unexpand_trailing_space_1() { // super evil
|
||||
let out = run(" abc d e f g ", &["-t1"]);
|
||||
assert_eq!(out.as_slice(), b"\tabc d e\t\tf\t\tg ");
|
||||
assert_eq!(&out[..], b"\tabc d e\t\tf\t\tg " as &[u8]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn unexpand_spaces_follow_tabs_0() {
|
||||
// The two first spaces can be included into the first tab.
|
||||
let out = run(" \t\t A", &[]);
|
||||
assert_eq!(out.as_slice(), b"\t\t A");
|
||||
assert_eq!(&out[..], b"\t\t A" as &[u8]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
@ -100,6 +102,7 @@ fn unexpand_spaces_follow_tabs_1() { // evil
|
|||
// ' ' -> '\t' // third tabstop (5)
|
||||
// ' B \t' -> ' B \t' // after the list is exhausted, nothing must change
|
||||
let out = run("a \t B \t", &["-t1,4,5"]);
|
||||
assert_eq!(out.as_slice(), b"a\t\t B \t");
|
||||
assert_eq!(&out[..], b"a\t\t B \t" as &[u8]);
|
||||
}
|
||||
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue