mirror of
https://github.com/RGBCube/uutils-coreutils
synced 2025-08-02 14:07:46 +00:00
fix tr
and its test
In addition, this commit substantially reduces the number of allocations that tr does when building the substitution tables.
This commit is contained in:
parent
8e2788bd39
commit
91827a594a
3 changed files with 174 additions and 111 deletions
117
src/tr/expand.rs
Normal file
117
src/tr/expand.rs
Normal file
|
@ -0,0 +1,117 @@
|
||||||
|
/*
|
||||||
|
* This file is part of the uutils coreutils package.
|
||||||
|
*
|
||||||
|
* (c) Michael Gehring <mg@ebfe.org>
|
||||||
|
* (c) kwantam <kwantam@gmail.com>
|
||||||
|
* 20150428 created `expand` module to eliminate most allocs during setup
|
||||||
|
*
|
||||||
|
* For the full copyright and license information, please view the LICENSE
|
||||||
|
* file that was distributed with this source code.
|
||||||
|
*/
|
||||||
|
|
||||||
|
use std::char::from_u32;
|
||||||
|
use std::cmp::min;
|
||||||
|
use std::iter::Peekable;
|
||||||
|
use std::ops::Range;
|
||||||
|
|
||||||
|
/// Maps the character that follows a backslash to the character it denotes.
///
/// The single-letter escapes `a`, `b`, `f`, `v`, `n`, `r` and `t` are
/// translated to the corresponding control characters. Every other character
/// is returned unchanged, so an escaped backslash yields a backslash and an
/// escaped dash yields a dash.
#[inline]
fn unescape_char(c: char) -> char {
    match c {
        'a' => '\x07', // bell
        'b' => '\x08', // backspace
        'f' => '\x0c', // form feed
        'v' => '\x0b', // vertical tab
        'n' => '\n',
        'r' => '\r',
        't' => '\t',
        _ => c,
    }
}
|
||||||
|
|
||||||
|
struct Unescape<'a> {
|
||||||
|
string: &'a str,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a> Iterator for Unescape<'a> {
|
||||||
|
type Item = char;
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn size_hint(&self) -> (usize, Option<usize>) {
|
||||||
|
let slen = self.string.len();
|
||||||
|
(min(slen, 1), None)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn next(&mut self) -> Option<Self::Item> {
|
||||||
|
if self.string.len() == 0 {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
|
||||||
|
// is the next character an escape?
|
||||||
|
let (ret, idx) = match self.string.chars().next().unwrap() {
|
||||||
|
'\\' if self.string.len() > 1 => {
|
||||||
|
// yes---it's \ and it's not the last char in a string
|
||||||
|
// we know that \ is 1 byte long so we can index into the string safely
|
||||||
|
let c = self.string[1..].chars().next().unwrap();
|
||||||
|
(Some(unescape_char(c)), 1 + c.len_utf8())
|
||||||
|
},
|
||||||
|
c => (Some(c), c.len_utf8()), // not an escape char
|
||||||
|
};
|
||||||
|
|
||||||
|
self.string = &self.string[idx..]; // advance the pointer to the next char
|
||||||
|
ret
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct ExpandSet<'a> {
|
||||||
|
range: Range<u32>,
|
||||||
|
unesc: Peekable<Unescape<'a>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a> Iterator for ExpandSet<'a> {
|
||||||
|
type Item = char;
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn size_hint(&self) -> (usize, Option<usize>) {
|
||||||
|
self.unesc.size_hint()
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn next(&mut self) -> Option<Self::Item> {
|
||||||
|
// while the Range has elements, try to return chars from it
|
||||||
|
// but make sure that they actually turn out to be Chars!
|
||||||
|
while let Some(n) = self.range.next() {
|
||||||
|
match from_u32(n) {
|
||||||
|
Some(c) => return Some(c),
|
||||||
|
_ => (),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Some(first) = self.unesc.next() {
|
||||||
|
// peek ahead
|
||||||
|
if self.unesc.peek() == Some(&'-') && match self.unesc.size_hint() {
|
||||||
|
(x, _) if x > 1 => true, // there's a range here; record it in our internal Range struct
|
||||||
|
_ => false,
|
||||||
|
} {
|
||||||
|
self.unesc.next(); // this is the '-'
|
||||||
|
let last = self.unesc.next().unwrap(); // this is the end of the range
|
||||||
|
|
||||||
|
self.range = first as u32 + 1 .. last as u32 + 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
return Some(first); // in any case, return the next char
|
||||||
|
}
|
||||||
|
|
||||||
|
None
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a> ExpandSet<'a> {
|
||||||
|
#[inline]
|
||||||
|
pub fn new(s: &'a str) -> ExpandSet<'a> {
|
||||||
|
ExpandSet {
|
||||||
|
range: 0 .. 0,
|
||||||
|
unesc: Unescape { string: s }.peekable(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
136
src/tr/tr.rs
136
src/tr/tr.rs
|
@ -1,98 +1,40 @@
|
||||||
#![crate_name = "tr"]
|
#![crate_name = "tr"]
|
||||||
#![feature(collections, core, old_io, rustc_private)]
|
#![feature(io, rustc_private)]
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* This file is part of the uutils coreutils package.
|
* This file is part of the uutils coreutils package.
|
||||||
*
|
*
|
||||||
* (c) Michael Gehring <mg@ebfe.org>
|
* (c) Michael Gehring <mg@ebfe.org>
|
||||||
|
* (c) kwantam <kwantam@gmail.com>
|
||||||
|
* 20150428 created `expand` module to eliminate most allocs during setup
|
||||||
*
|
*
|
||||||
* For the full copyright and license information, please view the LICENSE
|
* For the full copyright and license information, please view the LICENSE
|
||||||
* file that was distributed with this source code.
|
* file that was distributed with this source code.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
extern crate collections;
|
|
||||||
extern crate getopts;
|
extern crate getopts;
|
||||||
|
|
||||||
use getopts::OptGroup;
|
use getopts::OptGroup;
|
||||||
use std::char::from_u32;
|
|
||||||
use std::collections::{BitSet, VecMap};
|
use std::collections::{BitSet, VecMap};
|
||||||
use std::old_io::{BufferedReader, print};
|
use std::io::{stdin, stdout, BufReader, Read, Write};
|
||||||
use std::old_io::stdio::{stdin_raw, stdout};
|
use expand::ExpandSet;
|
||||||
use std::iter::FromIterator;
|
|
||||||
use std::vec::Vec;
|
|
||||||
|
|
||||||
#[path="../common/util.rs"]
|
#[path="../common/util.rs"]
|
||||||
#[macro_use]
|
#[macro_use]
|
||||||
mod util;
|
mod util;
|
||||||
|
|
||||||
|
mod expand;
|
||||||
|
|
||||||
static NAME : &'static str = "tr";
|
static NAME : &'static str = "tr";
|
||||||
static VERSION : &'static str = "1.0.0";
|
static VERSION : &'static str = "1.0.0";
|
||||||
|
const BUFFER_LEN: usize = 1024;
|
||||||
|
|
||||||
#[inline]
|
fn delete<'a>(set: ExpandSet<'a>, complement: bool) {
|
||||||
fn unescape_char(c: char) -> char {
|
|
||||||
match c {
|
|
||||||
'a' => 0x07u8 as char,
|
|
||||||
'b' => 0x08u8 as char,
|
|
||||||
'f' => 0x0cu8 as char,
|
|
||||||
'v' => 0x0bu8 as char,
|
|
||||||
'n' => '\n',
|
|
||||||
'r' => '\r',
|
|
||||||
't' => '\t',
|
|
||||||
_ => c,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[inline]
|
|
||||||
fn unescape(v: Vec<char>) -> Vec<char> {
|
|
||||||
let mut out = Vec::new();
|
|
||||||
let mut input = v.as_slice();
|
|
||||||
loop {
|
|
||||||
input = match input {
|
|
||||||
['\\', e, rest..] => {
|
|
||||||
out.push(unescape_char(e));
|
|
||||||
rest
|
|
||||||
}
|
|
||||||
[c, rest..] => {
|
|
||||||
out.push(c);
|
|
||||||
rest
|
|
||||||
}
|
|
||||||
[] => break
|
|
||||||
}
|
|
||||||
}
|
|
||||||
out
|
|
||||||
}
|
|
||||||
|
|
||||||
#[inline]
|
|
||||||
fn expand_range(from: char, to: char) -> Vec<char> {
|
|
||||||
range(from as u32, to as u32 + 1).map(|c| from_u32(c).unwrap()).collect()
|
|
||||||
}
|
|
||||||
|
|
||||||
fn expand_set(s: &str) -> Vec<char> {
|
|
||||||
let mut set = Vec::<char>::new();
|
|
||||||
let unesc = unescape(FromIterator::from_iter(s.chars()));
|
|
||||||
let mut input = unesc.as_slice();
|
|
||||||
|
|
||||||
loop {
|
|
||||||
input = match input {
|
|
||||||
[f, '-', t, rest..] => {
|
|
||||||
set.push_all(expand_range(f, t).as_slice());
|
|
||||||
rest
|
|
||||||
}
|
|
||||||
[c, rest..] => {
|
|
||||||
set.push(c);
|
|
||||||
rest
|
|
||||||
}
|
|
||||||
[] => break
|
|
||||||
};
|
|
||||||
}
|
|
||||||
set
|
|
||||||
}
|
|
||||||
|
|
||||||
fn delete(set: Vec<char>, complement: bool) {
|
|
||||||
let mut bset = BitSet::new();
|
let mut bset = BitSet::new();
|
||||||
let mut out = stdout();
|
let mut stdout = stdout();
|
||||||
|
let mut buf = String::with_capacity(BUFFER_LEN + 4);
|
||||||
|
|
||||||
for &c in set.iter() {
|
for c in set {
|
||||||
bset.insert(c as usize);
|
bset.insert(c as usize);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -104,42 +46,44 @@ fn delete(set: Vec<char>, complement: bool) {
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
for c in BufferedReader::new(stdin_raw()).chars() {
|
for c in BufReader::new(stdin()).chars() {
|
||||||
match c {
|
match c {
|
||||||
Ok(c) if is_allowed(c) => out.write_char(c).unwrap(),
|
Ok(c) if is_allowed(c) => buf.push(c),
|
||||||
Ok(_) => (),
|
Ok(_) => (),
|
||||||
Err(err) => panic!("{}", err),
|
Err(err) => panic!("{}", err),
|
||||||
};
|
};
|
||||||
|
if buf.len() >= BUFFER_LEN {
|
||||||
|
safe_unwrap!(stdout.write_all(&buf[..].as_bytes()));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if buf.len() > 0 {
|
||||||
|
safe_unwrap!(stdout.write_all(&buf[..].as_bytes()));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn tr(set1: &[char], set2: &[char]) {
|
fn tr<'a>(set1: ExpandSet<'a>, mut set2: ExpandSet<'a>) {
|
||||||
const BUFFER_LEN: usize = 1024;
|
|
||||||
|
|
||||||
let mut map = VecMap::new();
|
let mut map = VecMap::new();
|
||||||
let mut stdout = stdout();
|
let mut stdout = stdout();
|
||||||
let mut outbuffer = String::with_capacity(BUFFER_LEN);
|
let mut buf = String::with_capacity(BUFFER_LEN + 4);
|
||||||
|
|
||||||
let set2_len = set2.len();
|
let mut s2_prev = '_';
|
||||||
for i in range(0, set1.len()) {
|
for i in set1 {
|
||||||
if i >= set2_len {
|
s2_prev = set2.next().unwrap_or(s2_prev);
|
||||||
map.insert(set1[i] as usize, set2[set2_len - 1]);
|
|
||||||
} else {
|
map.insert(i as usize, s2_prev);
|
||||||
map.insert(set1[i] as usize, set2[i]);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
for c in BufferedReader::new(stdin_raw()).chars() {
|
for c in BufReader::new(stdin()).chars() {
|
||||||
match c {
|
match c {
|
||||||
Ok(inc) => {
|
Ok(inc) => {
|
||||||
let trc = match map.get(&(inc as usize)) {
|
let trc = match map.get(&(inc as usize)) {
|
||||||
Some(t) => *t,
|
Some(t) => *t,
|
||||||
None => inc,
|
None => inc,
|
||||||
};
|
};
|
||||||
outbuffer.push(trc);
|
buf.push(trc);
|
||||||
if outbuffer.len() >= BUFFER_LEN {
|
if buf.len() >= BUFFER_LEN {
|
||||||
stdout.write_str(outbuffer.as_slice()).unwrap();
|
safe_unwrap!(stdout.write_all(&buf[..].as_bytes()));
|
||||||
outbuffer.clear();
|
buf.truncate(0);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Err(err) => {
|
Err(err) => {
|
||||||
|
@ -147,8 +91,8 @@ fn tr(set1: &[char], set2: &[char]) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if outbuffer.len() > 0 {
|
if buf.len() > 0 {
|
||||||
stdout.write_str(outbuffer.as_slice()).unwrap();
|
safe_unwrap!(stdout.write_all(&buf[..].as_bytes()));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -158,7 +102,7 @@ fn usage(opts: &[OptGroup]) {
|
||||||
println!("Usage:");
|
println!("Usage:");
|
||||||
println!(" {} [OPTIONS] SET1 [SET2]", NAME);
|
println!(" {} [OPTIONS] SET1 [SET2]", NAME);
|
||||||
println!("");
|
println!("");
|
||||||
print(getopts::usage("Translate or delete characters.", opts).as_slice());
|
println!("{}", getopts::usage("Translate or delete characters.", opts));
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn uumain(args: Vec<String>) -> i32 {
|
pub fn uumain(args: Vec<String>) -> i32 {
|
||||||
|
@ -170,7 +114,7 @@ pub fn uumain(args: Vec<String>) -> i32 {
|
||||||
getopts::optflag("V", "version", "output version information and exit"),
|
getopts::optflag("V", "version", "output version information and exit"),
|
||||||
];
|
];
|
||||||
|
|
||||||
let matches = match getopts::getopts(args.tail(), &opts) {
|
let matches = match getopts::getopts(&args[1..], &opts) {
|
||||||
Ok(m) => m,
|
Ok(m) => m,
|
||||||
Err(err) => {
|
Err(err) => {
|
||||||
show_error!("{}", err);
|
show_error!("{}", err);
|
||||||
|
@ -203,12 +147,12 @@ pub fn uumain(args: Vec<String>) -> i32 {
|
||||||
}
|
}
|
||||||
|
|
||||||
if dflag {
|
if dflag {
|
||||||
let set1 = expand_set(sets[0].as_slice());
|
let set1 = ExpandSet::new(sets[0].as_ref());
|
||||||
delete(set1, cflag);
|
delete(set1, cflag);
|
||||||
} else {
|
} else {
|
||||||
let set1 = expand_set(sets[0].as_slice());
|
let set1 = ExpandSet::new(sets[0].as_ref());
|
||||||
let set2 = expand_set(sets[1].as_slice());
|
let set2 = ExpandSet::new(sets[1].as_ref());
|
||||||
tr(set1.as_slice(), set2.as_slice());
|
tr(set1, set2);
|
||||||
}
|
}
|
||||||
|
|
||||||
0
|
0
|
||||||
|
|
32
test/tr.rs
32
test/tr.rs
|
@ -1,49 +1,51 @@
|
||||||
#![allow(unstable)]
|
use std::io::Write;
|
||||||
|
use std::process::{Command, Stdio};
|
||||||
use std::old_io::process::Command;
|
|
||||||
|
|
||||||
static PROGNAME: &'static str = "./tr";
|
static PROGNAME: &'static str = "./tr";
|
||||||
|
|
||||||
fn run(input: &str, args: &[&'static str]) -> Vec<u8> {
|
fn run(input: &str, args: &[&'static str]) -> Vec<u8> {
|
||||||
let mut process = Command::new(PROGNAME).args(args).spawn().unwrap();
|
let mut process = Command::new(PROGNAME)
|
||||||
|
.args(args)
|
||||||
|
.stdin(Stdio::piped())
|
||||||
|
.stdout(Stdio::piped())
|
||||||
|
.spawn()
|
||||||
|
.unwrap_or_else(|e| panic!("{}", e));
|
||||||
|
|
||||||
process.stdin.take().unwrap().write_str(input).unwrap();
|
process.stdin.take().unwrap_or_else(|| panic!("Could not take child process stdin"))
|
||||||
|
.write_all(input.as_bytes()).unwrap_or_else(|e| panic!("{}", e));
|
||||||
|
|
||||||
let po = match process.wait_with_output() {
|
let po = process.wait_with_output().unwrap_or_else(|e| panic!("{}", e));
|
||||||
Ok(p) => p,
|
po.stdout
|
||||||
Err(err) => panic!("{}", err),
|
|
||||||
};
|
|
||||||
po.output
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_toupper() {
|
fn test_toupper() {
|
||||||
let out = run("!abcd!", &["a-z", "A-Z"]);
|
let out = run("!abcd!", &["a-z", "A-Z"]);
|
||||||
assert_eq!(out.as_slice(), b"!ABCD!");
|
assert_eq!(&out[..], b"!ABCD!");
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_small_set2() {
|
fn test_small_set2() {
|
||||||
let out = run("@0123456789", &["0-9", "X"]);
|
let out = run("@0123456789", &["0-9", "X"]);
|
||||||
assert_eq!(out.as_slice(), b"@XXXXXXXXXX");
|
assert_eq!(&out[..], b"@XXXXXXXXXX");
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_unicode() {
|
fn test_unicode() {
|
||||||
let out = run("(,°□°), ┬─┬", &[", ┬─┬", "╯︵┻━┻"]);
|
let out = run("(,°□°), ┬─┬", &[", ┬─┬", "╯︵┻━┻"]);
|
||||||
assert_eq!(out.as_slice(), "(╯°□°)╯︵┻━┻".as_bytes());
|
assert_eq!(&out[..], "(╯°□°)╯︵┻━┻".as_bytes());
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_delete() {
|
fn test_delete() {
|
||||||
let out = run("aBcD", &["-d", "a-z"]);
|
let out = run("aBcD", &["-d", "a-z"]);
|
||||||
assert_eq!(out.as_slice(), b"BD");
|
assert_eq!(&out[..], b"BD");
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_delete_complement() {
|
fn test_delete_complement() {
|
||||||
let out = run("aBcD", &["-d", "-c", "a-z"]);
|
let out = run("aBcD", &["-d", "-c", "a-z"]);
|
||||||
assert_eq!(out.as_slice(), b"ac");
|
assert_eq!(&out[..], b"ac");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue